├── .Renviron
├── .github
│   └── workflows
│       └── snakemake-run.yaml
├── LICENSE
├── NEWS
├── README.md
├── Snakefile
├── config.yaml
├── envs
│   ├── environment.yaml
│   └── environment_R.yaml
├── example_data
│   ├── FASTQ
│   │   ├── SRR1039508_R1.fastq.gz
│   │   ├── SRR1039508_R2.fastq.gz
│   │   ├── SRR1039509_R1.fastq.gz
│   │   ├── SRR1039509_R2.fastq.gz
│   │   ├── SRR1039512_R1.fastq.gz
│   │   ├── SRR1039512_R2.fastq.gz
│   │   ├── SRR1039513_R1.fastq.gz
│   │   └── SRR1039513_R2.fastq.gz
│   ├── README.md
│   ├── metadata.txt
│   └── reference
│       ├── Ensembl.GRCh38.93
│       │   ├── Homo_sapiens.GRCh38.93.1.1.10M.gtf
│       │   ├── Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz
│       │   ├── Homo_sapiens.GRCh38.dna.chromosome.1.1.10M.fa
│       │   └── Homo_sapiens.GRCh38.ncrna.1.1.10M.fa.gz
│       └── Gencode28
│           ├── GRCh38.primary_assembly.genome.1.1.10M.fa
│           ├── gencode.v28.annotation.1.1.10M.gtf
│           └── gencode.v28.transcripts.1.1.10M.fa.gz
├── img
│   ├── ARMOR.png
│   ├── ARMOR.svg
│   ├── benchmark_summary.png
│   ├── dag_nice.svg
│   ├── dag_nice5.png
│   ├── draw.io_run_modes_ARMOR.xml
│   ├── run_modes_ARMOR.png
│   ├── software_management.png
│   └── software_management.svg
├── scripts
│   ├── DRIMSeq_dtu.Rmd
│   ├── check_input.R
│   ├── custom_iSEE_panels.R
│   ├── edgeR_dge.Rmd
│   ├── generate_linkedtxome.R
│   ├── generate_report.R
│   ├── install_pkgs.R
│   ├── list_packages.R
│   ├── prepare_shiny.Rmd
│   ├── run_render.R
│   └── run_tximeta.R
└── version
/.Renviron:
--------------------------------------------------------------------------------
1 | R_LIBS_USER=""
2 |
--------------------------------------------------------------------------------
/.github/workflows/snakemake-run.yaml:
--------------------------------------------------------------------------------
1 | on:
2 |   push:
3 |   pull_request:
4 |     branches:
5 |       - master
6 |   schedule:
7 |     - cron: '0 9 * * 5'
8 |
9 | name: snakemake-run
10 |
11 | jobs:
12 |   snakemake-run:
13 |     defaults:
14 |       run:
15 |         shell: bash -l {0}
16 |     name: run snakemake
17 |     runs-on: ${{ matrix.os }}
18 |     strategy:
19 |       fail-fast: false
20 |       matrix:
21 |         os: [macos-13, ubuntu-latest]
22 |         include:
23 |           - os: macos-13
24 |             rversion: '4.4'
25 |           - os: ubuntu-latest
26 |             rversion: '4.4'
27 |     steps:
28 |       - name: Check out repository
29 |         uses: actions/checkout@v2
30 |
31 |       - name: Install R (macOS)
32 |         uses: r-lib/actions/setup-r@v2
33 |         if: runner.os == 'macOS'
34 |         with:
35 |           r-version: ${{ matrix.rversion }}
36 |
37 |       - name: Check where R is installed
38 |         if: runner.os == 'macOS'
39 |         run: |
40 |           which R
41 |           Rscript -e 'print(.libPaths())'
42 |
43 |       - name: Set up workflow R for macOS
44 |         if: runner.os == 'macOS'
45 |         run: |
46 |           sed -i .bak 's/useCondaR: True/useCondaR: False/' config.yaml
47 |           mkdir -p $HOME/Rlib
48 |           echo "R_LIBS_USER=${HOME}/Rlib" > .Renviron
49 |           cat .Renviron
50 |           Rscript -e "install.packages('BiocManager'); BiocManager::install('GenomeInfoDbData')"
51 |
52 |       - name: Set up conda
53 |         uses: conda-incubator/setup-miniconda@v3
54 |         with:
55 |           auto-update-conda: true
56 |           channels: bioconda,conda-forge,nodefaults
57 |           auto-activate-base: true
58 |           miniforge-version: latest
59 |
60 |       - name: Install system dependencies (Linux)
61 |         if: runner.os == 'Linux'
62 |         run: |
63 |           sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable --yes
64 |           sudo apt-get --yes --force-yes update -qq && \
65 |           sudo apt-get -y install libcairo2-dev libv8-dev \
66 |           libgdal-dev libgeos-dev libgeos++-dev libproj-dev libudunits2-dev \
67 |           libcurl4-openssl-dev libharfbuzz-dev libfribidi-dev libglpk-dev \
68 |           libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev libgit2-dev \
69 |           libxml2-dev libuv1 libuv1-dev
70 |
71 |       - name: Prepare for running workflow
72 |         env:
73 |           BIOCONDUCTOR_USE_CONTAINER_REPOSITORY: true
74 |         run: |
75 |           mkdir -p $HOME/.R
76 |           echo -e 'MAKEFLAGS = -j8' > $HOME/.R/Makevars
77 |           echo 'options(Ncpus = 8)' > $HOME/.Rprofile
78 |           echo 'Sys.setenv(BIOCONDUCTOR_USE_CONTAINER_REPOSITORY=TRUE)' >> $HOME/.Rprofile
79 |
80 |       - name: Install Snakemake
81 |         run: |
82 |           conda create -c conda-forge -c bioconda -n snakemake snakemake
83 |
84 |       - name: Run Snakemake
85 |         env:
86 |           BIOCONDUCTOR_USE_CONTAINER_REPOSITORY: true
87 |         run: |
88 |           conda activate snakemake
89 |           snakemake --use-conda --cores 4
90 |
91 |       - name: Upload artifact
92 |         uses: actions/upload-artifact@v4
93 |         if: failure()
94 |         with:
95 |           name: all_rout
96 |           path: example_data/output/Rout/*.Rout
97 |
--------------------------------------------------------------------------------
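The job above installs Snakemake into a dedicated conda environment and then calls `snakemake --use-conda --cores 4`. Below is a minimal sketch for reproducing that step locally, assuming conda is installed and the `snakemake` environment from the "Install Snakemake" step already exists; `conda run` is used instead of `conda activate`, since activation requires an interactive (login) shell, and a dry run (`-n`) is added first to preview the planned jobs.

```
# Sketch: reproduce the CI "Run Snakemake" step locally. Assumes conda is on
# PATH and an environment named "snakemake" exists (see the workflow above).
import subprocess

def run(cmd):
    """Run a command inside the 'snakemake' conda environment, failing loudly."""
    full = ["conda", "run", "-n", "snakemake"] + cmd
    print("+", " ".join(full))
    subprocess.run(full, check=True)

run(["snakemake", "-n", "--use-conda", "--cores", "4"])  # dry run: list planned jobs
run(["snakemake", "--use-conda", "--cores", "4"])        # full run, as in the CI job
```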
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Charlotte Soneson
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | v1.5.10
2 | ======
3 | o Update R to v4.4.1
4 |
5 | v1.5.9
6 | ======
7 | o Add r-mass and r-matrix to conda R environment
8 |
9 | v1.5.8
10 | ======
11 | o Update to R 4.3.2
12 |
13 | v1.5.7
14 | ======
15 | o Swap deprecated msigdbr function to get supported species
16 | o Convert entrezid column to a vector if required (to allow saving to csv)
17 | o Update to R 4.1.0
18 |
19 | v1.5.6
20 | ======
21 | o Remove tx_ids column from text file exported by edgeR (addressing https://github.com/csoneson/ARMOR/issues/109)
22 |
23 | v1.5.5
24 | ======
25 | o Adapt to the new output format of limma::plotMDS
26 | o Adapt pandoc-citeproc check to work with pandoc 2.11
27 |
28 | v1.5.4
29 | ======
30 | o Specify tbb version 2020.2 based on issue with salmon in conda (https://twitter.com/dpryan79/status/1368116490801717251)
31 |
32 | v1.5.3
33 | ======
34 | o Update syntax based on suggestions from snakemake --lint
35 | o Update software versions
36 | o Use built-in cpm() function from edgeR
37 |
38 | v1.5.2
39 | ======
40 | o Fix pandoc version to 2.10 (thanks @carissableker, https://github.com/csoneson/ARMOR/pull/101)
41 |
42 | v1.5.1
43 | ======
44 | o Specify channel in environment_R.yaml
45 |
46 | v1.5.0
47 | ======
48 | o Transition to R 4.0
49 | o Use convenience function from tximeta to create DGEList for DE
50 |
51 | v1.4.0
52 | ======
53 | o Add possibility to provide additional arguments to STAR and Salmon
54 |
55 | v1.3.2
56 | ======
57 | o Fix small bug in edgeR script, triggered when no gene sets were provided
58 |
59 | v1.3.1
60 | ======
61 | o Adjust code for the latest version 7.0.1 of the msigdbr R package
62 | o Fix bug to make sure that camera is not run with an empty gene set list
63 |
64 | v1.3.0
65 | ======
66 | o Change CPM calculations in edgeR_dge.Rmd to account for average transcript length offsets, using the approach from csaw::calculateCPM()
67 |
68 | v1.2.3
69 | ======
70 | o Rename rules for clarity
71 | o Add benchmarks directive
72 |
73 | v1.2.2
74 | ======
75 | o Add sticker to README
76 |
77 | v1.2.1
78 | ======
79 | o Change deprecated pandas.read_table to pandas.read_csv
80 |
81 | v1.2.0
82 | ======
83 | o Add possibility to use multiple cores for DRIMSeq and R package installation
84 |
85 | v1.1.0
86 | ======
87 | o Extend checks of inputs
88 |
89 | v1.0.0
90 | ======
91 | o Initial version
92 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ARMOR workflow
2 | [](https://github.com/csoneson/ARMOR/actions)
3 |
4 | **ARMOR** (**A**utomated **R**eproducible **MO**dular **R**NA-seq) is a [Snakemake workflow](https://snakemake.readthedocs.io/en/stable/index.html) aimed at performing a typical RNA-seq analysis in a reproducible, automated, and partially contained manner. It is implemented such that alternative or similar analyses can be added or removed.
5 |
6 | ARMOR consists of a `Snakefile`, a [`conda`](https://conda.io/docs/) environment file (`envs/environment.yaml`), a configuration file (`config.yaml`) and a set of `R` scripts to perform quality control, preprocessing and differential expression analysis of RNA-seq data. The output can be combined with the [`iSEE`](https://bioconductor.org/packages/iSEE/) `R` package to generate a `shiny` application for browsing and sharing the results.
7 |
8 | By default, the pipeline performs all the steps shown in the [diagram](img/dag_nice5.png) below. However, you can turn off any combination of the light-colored steps (e.g. `STAR` alignment or `DRIMSeq` analysis) in the `config.yaml` file.
9 |
10 | *Advanced use*: If you prefer other software for one of the outlined steps (e.g. `DESeq2` over `edgeR`, or `kallisto` over `Salmon`), you can use the software of your choice, provided you have your own script(s), by changing a few lines in the `Snakefile`. If you think your "custom rule" might be of use to a broader audience, let us know by opening an issue.
11 |
12 |
13 | ## Using the ARMOR workflow
14 |
15 | Assuming that Snakemake and conda are installed (and your system has the necessary libraries to compile R packages), you can run the workflow on the included test dataset with the following commands:
16 |
17 | ```
18 | git clone https://github.com/csoneson/ARMOR.git
19 | cd ARMOR && snakemake --use-conda
20 | ```
21 |
22 | To use the ARMOR workflow on your own data, follow the steps outlined in the [wiki](https://github.com/csoneson/ARMOR/wiki).
23 |
24 | ## Workflow graph
25 | 
26 | Blue circles are rules run in `R`; orange circles are rules run as shell commands. Dashed lines and light-colored circles denote optional rules, controlled via `config.yaml`.
27 |
28 | ## Contributors
29 | Current contributors include:
30 |
31 | - [Ruizhu Huang](https://github.com/fionarhuang)
32 | - [Katharina Hembach](https://github.com/khembach)
33 | - [Stephany Orjuela](https://github.com/sorjuela)
34 | - [Mark D. Robinson](https://github.com/markrobinsonuzh)
35 | - [Charlotte Soneson](https://github.com/csoneson)
36 |
--------------------------------------------------------------------------------
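The README notes that any combination of the optional (light-colored) steps can be switched off in `config.yaml`. A minimal sketch of doing this programmatically, assuming PyYAML is installed; note that `yaml.safe_dump` drops the comments in the file, so for a one-off change the `sed` one-liner used in the CI workflow (or a manual edit) is often preferable.

```
# Sketch: switch off optional ARMOR steps without editing config.yaml by hand.
# Assumes PyYAML is available; the keys are the ones documented in config.yaml.
import yaml

with open("config.yaml") as fh:
    cfg = yaml.safe_load(fh)

cfg["run_STAR"] = False     # skip STAR alignment (and bigWig generation)
cfg["run_DRIMSeq"] = False  # skip the DRIMSeq differential transcript usage step

with open("config.yaml", "w") as fh:
    yaml.safe_dump(cfg, fh, sort_keys=False)  # caution: comments are not preserved
```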
/Snakefile:
--------------------------------------------------------------------------------
1 | ## Configuration file
2 | import os
3 | if len(config) == 0:
4 | if os.path.isfile("./config.yaml"):
5 | configfile: "./config.yaml"
6 | else:
7 | sys.exit("".join(["Make sure there is a config.yaml file in ", os.getcwd(),
8 | " or specify one with the --configfile commandline parameter."]))
9 |
10 | ## Make sure that all expected variables from the config file are in the config dictionary
11 | configvars = ['annotation', 'organism', 'build', 'release', 'txome', 'genome', 'gtf', 'salmonindex', 'salmonk', 'STARindex', 'readlength', 'fldMean', 'fldSD', 'metatxt', 'design', 'contrast', 'genesets', 'ncores', 'FASTQ', 'fqext1', 'fqext2', 'fqsuffix', 'output', 'useCondaR', 'Rbin', 'run_trimming', 'run_STAR', 'run_DRIMSeq', 'run_camera']
12 | for k in configvars:
13 | if k not in config:
14 | config[k] = None
15 |
16 | ## If any of the file paths is missing, replace it with ""
17 | def sanitizefile(str):
18 | if str is None:
19 | str = ''
20 | return str
21 |
22 | config['txome'] = sanitizefile(config['txome'])
23 | config['gtf'] = sanitizefile(config['gtf'])
24 | config['genome'] = sanitizefile(config['genome'])
25 | config['STARindex'] = sanitizefile(config['STARindex'])
26 | config['salmonindex'] = sanitizefile(config['salmonindex'])
27 | config['metatxt'] = sanitizefile(config['metatxt'])
28 |
29 | ## Read metadata
30 | if not os.path.isfile(config["metatxt"]):
31 | sys.exit("".join(["Metadata file ", config["metatxt"], " does not exist."]))
32 |
33 | import pandas as pd
34 | samples = pd.read_csv(config["metatxt"], sep='\t')
35 |
36 | if not set(['names','type']).issubset(samples.columns):
37 | sys.exit("".join(["Make sure 'names' and 'type' are columns in ", config["metatxt"]]))
38 |
39 |
40 | ## Sanitize provided input and output directories
41 | import re
42 | def getpath(str):
43 | if str in ['', '.', './']:
44 | return ''
45 | if str.startswith('./'):
46 | regex = re.compile('^\./?')
47 | str = regex.sub('', str)
48 | if not str.endswith('/'):
49 | str += '/'
50 | return str
51 |
52 | outputdir = getpath(config["output"])
53 | FASTQdir = getpath(config["FASTQ"])
54 |
55 | ## Define the conda environment for all rules using R
56 | if config["useCondaR"] == True:
57 | Renv = "envs/environment_R.yaml"
58 | else:
59 | Renv = "envs/environment.yaml"
60 |
61 | ## Define the R binary
62 | Rbin = config["Rbin"]
63 |
64 | ## ------------------------------------------------------------------------------------ ##
65 | ## Target definitions
66 | ## ------------------------------------------------------------------------------------ ##
67 | ## Run all analyses
68 | rule all:
69 | input:
70 | os.path.join(outputdir, "MultiQC", "multiqc_report.html"),
71 | os.path.join(outputdir, "outputR", "shiny_sce.rds")
72 |
73 | rule setup:
74 | input:
75 | os.path.join(outputdir, "Rout", "pkginstall_state.txt"),
76 | os.path.join(outputdir, "Rout", "softwareversions.done")
77 |
78 | ## Install R packages
79 | rule pkginstall:
80 | input:
81 | script = "scripts/install_pkgs.R"
82 | output:
83 | os.path.join(outputdir, "Rout", "pkginstall_state.txt")
84 | params:
85 | flag = config["annotation"],
86 | ncores = config["ncores"],
87 | organism = config["organism"],
88 | Rbin = Rbin
89 | priority:
90 | 50
91 | conda:
92 | Renv
93 | log:
94 | os.path.join(outputdir, "Rout", "install_pkgs.Rout")
95 | benchmark:
96 | os.path.join(outputdir, "benchmarks", "install_pkgs.txt")
97 | shell:
98 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args outtxt='{output}' ncores='{params.ncores}' annotation='{params.flag}' organism='{params.organism}'" {input.script} {log}'''
99 |
100 | ## FastQC on original (untrimmed) files
101 | rule runfastqc:
102 | input:
103 | expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext1"]), "_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()),
104 | expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext2"]), "_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()),
105 | expand(os.path.join(outputdir, "FastQC", "{sample}_fastqc.zip"), sample = samples.names[samples.type == 'SE'].values.tolist())
106 |
107 | ## Trimming and FastQC on trimmed files
108 | rule runtrimming:
109 | input:
110 | expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext1"]), "_val_1_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()),
111 | expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext2"]), "_val_2_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()),
112 | expand(os.path.join(outputdir, "FastQC", "{sample}_trimmed_fastqc.zip"), sample = samples.names[samples.type == 'SE'].values.tolist())
113 |
114 | ## Salmon quantification
115 | rule runsalmonquant:
116 | input:
117 | expand(os.path.join(outputdir, "salmon", "{sample}", "quant.sf"), sample = samples.names.values.tolist())
118 |
119 | ## STAR alignment
120 | rule runstar:
121 | input:
122 | expand(os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam.bai"), sample = samples.names.values.tolist()),
123 | expand(os.path.join(outputdir, "STARbigwig", "{sample}_Aligned.sortedByCoord.out.bw"), sample = samples.names.values.tolist())
124 |
125 | ## List all the packages that were used by the R analyses
126 | rule listpackages:
127 | log:
128 | os.path.join(outputdir, "Rout", "list_packages.Rout")
129 | params:
130 | Routdir = os.path.join(outputdir, "Rout"),
131 | outtxt = os.path.join(outputdir, "R_package_versions.txt"),
132 | script = "scripts/list_packages.R",
133 | Rbin = Rbin
134 | conda:
135 | Renv
136 | shell:
137 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args Routdir='{params.Routdir}' outtxt='{params.outtxt}'" {params.script} {log}'''
138 |
139 | ## Print the versions of all software packages
140 | rule softwareversions:
141 | output:
142 | touch(os.path.join(outputdir, "Rout", "softwareversions.done"))
143 | log:
144 | os.path.join(outputdir, "logs", "softversions.log")
145 | conda:
146 | "envs/environment.yaml"
147 | shell:
148 | "echo -n 'ARMOR version ' && cat version; "
149 | "salmon --version; trim_galore --version; "
150 | "echo -n 'cutadapt ' && cutadapt --version; "
151 | "fastqc --version; STAR --version; samtools --version; multiqc --version; "
152 | "bedtools --version"
153 |
154 | ## ------------------------------------------------------------------------------------ ##
155 | ## Reference preparation
156 | ## ------------------------------------------------------------------------------------ ##
157 | ## Generate Salmon index from merged cDNA and ncRNA files
158 | rule salmonindex:
159 | input:
160 | txome = config["txome"]
161 | output:
162 | os.path.join(config["salmonindex"], "versionInfo.json")
163 | log:
164 | os.path.join(outputdir, "logs", "salmon_index.log")
165 | benchmark:
166 | os.path.join(outputdir, "benchmarks", "salmon_index.txt")
167 | params:
168 | salmonoutdir = lambda wildcards, output: os.path.dirname(output[0]), ## dirname of first output
169 | anno = config["annotation"],
170 | salmonextraparams = config["additional_salmon_index"]
171 | conda:
172 | "envs/environment.yaml"
173 | shell:
174 | """
175 | if [ {params.anno} == "Gencode" ]; then
176 | echo 'Salmon version:\n' > {log}; salmon --version >> {log};
177 | salmon index -t {input.txome} -i {params.salmonoutdir} --gencode {params.salmonextraparams}
178 |
179 | else
180 | echo 'Salmon version:\n' > {log}; salmon --version >> {log};
181 | salmon index -t {input.txome} -i {params.salmonoutdir} {params.salmonextraparams}
182 | fi
183 | """
184 |
185 | ## Generate linkedtxome mapping
186 | rule linkedtxome:
187 | input:
188 | txome = config["txome"],
189 | gtf = config["gtf"],
190 | salmonidx = os.path.join(config["salmonindex"], "versionInfo.json"),
191 | script = "scripts/generate_linkedtxome.R",
192 | install = os.path.join(outputdir, "Rout", "pkginstall_state.txt")
193 | log:
194 | os.path.join(outputdir, "Rout", "generate_linkedtxome.Rout")
195 | benchmark:
196 | os.path.join(outputdir, "benchmarks", "generate_linkedtxome.txt")
197 | output:
198 | "".join([config["salmonindex"], ".json"])
199 | params:
200 | flag = config["annotation"],
201 | organism = config["organism"],
202 | release = str(config["release"]),
203 | build = config["build"],
204 | Rbin = Rbin
205 | conda:
206 | Renv
207 | shell:
208 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args transcriptfasta='{input.txome}' salmonidx='{input.salmonidx}' gtf='{input.gtf}' annotation='{params.flag}' organism='{params.organism}' release='{params.release}' build='{params.build}' output='{output}'" {input.script} {log}'''
209 |
210 | ## Generate STAR index
211 | rule starindex:
212 | input:
213 | genome = config["genome"],
214 | gtf = config["gtf"]
215 | output:
216 | os.path.join(config["STARindex"], "SA"),
217 | os.path.join(config["STARindex"], "chrNameLength.txt")
218 | log:
219 | os.path.join(outputdir, "logs", "STAR_index.log")
220 | benchmark:
221 | os.path.join(outputdir, "benchmarks", "STAR_index.txt")
222 | params:
223 | STARindex = lambda wildcards, output: os.path.dirname(output[0]), ## dirname of first output
224 | readlength = config["readlength"],
225 | starextraparams = config["additional_star_index"]
226 | conda:
227 | "envs/environment.yaml"
228 | threads:
229 | config["ncores"]
230 | shell:
231 | "echo 'STAR version:\n' > {log}; STAR --version >> {log}; "
232 | "STAR --runMode genomeGenerate --runThreadN {threads} --genomeDir {params.STARindex} "
233 | "--genomeFastaFiles {input.genome} --sjdbGTFfile {input.gtf} --sjdbOverhang {params.readlength} "
234 | "{params.starextraparams}"
235 |
236 | ## ------------------------------------------------------------------------------------ ##
237 | ## Quality control
238 | ## ------------------------------------------------------------------------------------ ##
239 | ## FastQC, original reads
240 | rule fastqc:
241 | input:
242 | fastq = os.path.join(FASTQdir, "".join(["{sample}.", str(config["fqsuffix"]), ".gz"]))
243 | output:
244 | os.path.join(outputdir, "FastQC", "{sample}_fastqc.zip")
245 | params:
246 | FastQC = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output
247 | log:
248 | os.path.join(outputdir, "logs", "fastqc_{sample}.log")
249 | benchmark:
250 | os.path.join(outputdir, "benchmarks", "fastqc_{sample}.txt")
251 | conda:
252 | "envs/environment.yaml"
253 | threads:
254 | config["ncores"]
255 | shell:
256 | "echo 'FastQC version:\n' > {log}; fastqc --version >> {log}; "
257 | "fastqc -o {params.FastQC} -t {threads} {input.fastq}"
258 |
259 | ## FastQC, trimmed reads
260 | rule fastqctrimmed:
261 | input:
262 | fastq = os.path.join(outputdir, "FASTQtrimmed", "{sample}.fq.gz")
263 | output:
264 | os.path.join(outputdir, "FastQC", "{sample}_fastqc.zip")
265 | params:
266 | FastQC = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output
267 | log:
268 | os.path.join(outputdir, "logs", "fastqc_trimmed_{sample}.log")
269 | benchmark:
270 | os.path.join(outputdir, "benchmarks", "fastqc_trimmed_{sample}.txt")
271 | conda:
272 | "envs/environment.yaml"
273 | threads:
274 | config["ncores"]
275 | shell:
276 | "echo 'FastQC version:\n' > {log}; fastqc --version >> {log}; "
277 | "fastqc -o {params.FastQC} -t {threads} {input.fastq}"
278 |
279 |
280 |
281 | # The config.yaml file determines which steps should be performed
282 | def multiqc_input(wildcards):
283 | input = []
284 | input.extend(expand(os.path.join(outputdir, "FastQC", "{sample}_fastqc.zip"), sample = samples.names[samples.type == 'SE'].values.tolist()))
285 | input.extend(expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext1"]), "_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()))
286 | input.extend(expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext2"]), "_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()))
287 | input.extend(expand(os.path.join(outputdir, "salmon", "{sample}", "quant.sf"), sample = samples.names.values.tolist()))
288 | if config["run_trimming"]:
289 | input.extend(expand(os.path.join(outputdir, "FASTQtrimmed", "{sample}_trimmed.fq.gz"), sample = samples.names[samples.type == 'SE'].values.tolist()))
290 | input.extend(expand(os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext1"]), "_val_1.fq.gz"])), sample = samples.names[samples.type == 'PE'].values.tolist()))
291 | input.extend(expand(os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext2"]), "_val_2.fq.gz"])), sample = samples.names[samples.type == 'PE'].values.tolist()))
292 | input.extend(expand(os.path.join(outputdir, "FastQC", "{sample}_trimmed_fastqc.zip"), sample = samples.names[samples.type == 'SE'].values.tolist()))
293 | input.extend(expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext1"]), "_val_1_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()))
294 | input.extend(expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext2"]), "_val_2_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()))
295 | if config["run_STAR"]:
296 | input.extend(expand(os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam.bai"), sample = samples.names.values.tolist()))
297 | return input
298 |
299 | ## Determine the input directories for MultiQC depending on the config file
300 | def multiqc_params(wildcards):
301 | param = [os.path.join(outputdir, "FastQC"),
302 | os.path.join(outputdir, "salmon")]
303 | if config["run_trimming"]:
304 | param.append(os.path.join(outputdir, "FASTQtrimmed"))
305 | if config["run_STAR"]:
306 | param.append(os.path.join(outputdir, "STAR"))
307 | return param
308 |
309 | ## MultiQC
310 | rule multiqc:
311 | input:
312 | multiqc_input
313 | output:
314 | os.path.join(outputdir, "MultiQC", "multiqc_report.html")
315 | params:
316 | inputdirs = multiqc_params,
317 | MultiQCdir = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output
318 | log:
319 | os.path.join(outputdir, "logs", "multiqc.log")
320 | benchmark:
321 | os.path.join(outputdir, "benchmarks", "multiqc.txt")
322 | conda:
323 | "envs/environment.yaml"
324 | shell:
325 | "echo 'MultiQC version:\n' > {log}; multiqc --version >> {log}; "
326 | "multiqc {params.inputdirs} -f -o {params.MultiQCdir}"
327 |
328 |
329 | ## ------------------------------------------------------------------------------------ ##
330 | ## Adapter trimming
331 | ## ------------------------------------------------------------------------------------ ##
332 | # TrimGalore!
333 | rule trimgaloreSE:
334 | input:
335 | fastq = os.path.join(FASTQdir, "".join(["{sample}.", str(config["fqsuffix"]), ".gz"]))
336 | output:
337 | os.path.join(outputdir, "FASTQtrimmed", "{sample}_trimmed.fq.gz")
338 | params:
339 | FASTQtrimmeddir = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output
340 | log:
341 | os.path.join(outputdir, "logs", "trimgalore_{sample}.log")
342 | benchmark:
343 | os.path.join(outputdir, "benchmarks", "trimgalore_{sample}.txt")
344 | conda:
345 | "envs/environment.yaml"
346 | shell:
347 | "echo 'TrimGalore! version:\n' > {log}; trim_galore --version >> {log}; "
348 | "trim_galore -q 20 --phred33 --length 20 -o {params.FASTQtrimmeddir} --path_to_cutadapt cutadapt {input.fastq}"
349 |
350 | rule trimgalorePE:
351 | input:
352 | fastq1 = os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext1"]), ".", str(config["fqsuffix"]), ".gz"])),
353 | fastq2 = os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext2"]), ".", str(config["fqsuffix"]), ".gz"]))
354 | output:
355 | os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext1"]), "_val_1.fq.gz"])),
356 | os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext2"]), "_val_2.fq.gz"]))
357 | params:
358 | FASTQtrimmeddir = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output
359 | log:
360 | os.path.join(outputdir, "logs", "trimgalore_{sample}.log")
361 | benchmark:
362 | os.path.join(outputdir, "benchmarks", "trimgalore_{sample}.txt")
363 | conda:
364 | "envs/environment.yaml"
365 | shell:
366 | "echo 'TrimGalore! version:\n' > {log}; trim_galore --version >> {log}; "
367 | "trim_galore -q 20 --phred33 --length 20 -o {params.FASTQtrimmeddir} --path_to_cutadapt cutadapt "
368 | "--paired {input.fastq1} {input.fastq2}"
369 |
370 | ## ------------------------------------------------------------------------------------ ##
371 | ## Salmon abundance estimation
372 | ## ------------------------------------------------------------------------------------ ##
373 | # Estimate abundances with Salmon
374 | rule salmonSE:
375 | input:
376 | index = os.path.join(config["salmonindex"], "versionInfo.json"),
377 | fastq = os.path.join(outputdir, "FASTQtrimmed", "{sample}_trimmed.fq.gz") if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}.", str(config["fqsuffix"]), ".gz"]))
378 | output:
379 | os.path.join(outputdir, "salmon", "{sample}", "quant.sf")
380 | log:
381 | os.path.join(outputdir, "logs", "salmon_{sample}.log")
382 | benchmark:
383 | os.path.join(outputdir, "benchmarks", "salmon_{sample}.txt")
384 | threads:
385 | config["ncores"]
386 | params:
387 | salmonindex = lambda wildcards, input: os.path.dirname(input['index']), ## dirname of index input
388 | salmondir = lambda wildcards, output: os.path.dirname(os.path.dirname(output[0])), ## dirname of first output
389 | salmonextraparams = config["additional_salmon_quant"]
390 | conda:
391 | "envs/environment.yaml"
392 | shell:
393 | "echo 'Salmon version:\n' > {log}; salmon --version >> {log}; "
394 | "salmon quant -i {params.salmonindex} -l A -r {input.fastq} "
395 | "-o {params.salmondir}/{wildcards.sample} -p {threads} {params.salmonextraparams}"
396 |
397 | rule salmonPE:
398 | input:
399 | index = os.path.join(config["salmonindex"], "versionInfo.json"),
400 | fastq1 = os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext1"]), "_val_1.fq.gz"])) if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext1"]), ".", str(config["fqsuffix"]), ".gz"])),
401 | fastq2 = os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext2"]), "_val_2.fq.gz"])) if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext2"]), ".", str(config["fqsuffix"]), ".gz"]))
402 | output:
403 | os.path.join(outputdir, "salmon", "{sample}", "quant.sf")
404 | log:
405 | os.path.join(outputdir, "logs", "salmon_{sample}.log")
406 | benchmark:
407 | os.path.join(outputdir, "benchmarks", "salmon_{sample}.txt")
408 | threads:
409 | config["ncores"]
410 | params:
411 | salmonindex = lambda wildcards, input: os.path.dirname(input['index']), ## dirname of index input
412 | salmondir = lambda wildcards, output: os.path.dirname(os.path.dirname(output[0])), ## dirname of first output
413 | salmonextraparams = config["additional_salmon_quant"]
414 | conda:
415 | "envs/environment.yaml"
416 | shell:
417 | "echo 'Salmon version:\n' > {log}; salmon --version >> {log}; "
418 | "salmon quant -i {params.salmonindex} -l A -1 {input.fastq1} -2 {input.fastq2} "
419 | "-o {params.salmondir}/{wildcards.sample} -p {threads} {params.salmonextraparams}"
420 |
421 | ## ------------------------------------------------------------------------------------ ##
422 | ## STAR mapping
423 | ## ------------------------------------------------------------------------------------ ##
424 | ## Genome mapping with STAR
425 | rule starSE:
426 | input:
427 | index = os.path.join(config["STARindex"], "SA"),
428 | fastq = os.path.join(outputdir, "FASTQtrimmed", "{sample}_trimmed.fq.gz") if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}.", str(config["fqsuffix"]), ".gz"]))
429 | output:
430 | os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam")
431 | threads:
432 | config["ncores"]
433 | log:
434 | os.path.join(outputdir, "logs", "STAR_{sample}.log")
435 | benchmark:
436 | os.path.join(outputdir, "benchmarks", "STAR_{sample}.txt")
437 | params:
438 | STARindex = lambda wildcards, input: os.path.dirname(input['index']), ## dirname of index input
439 | STARdir = lambda wildcards, output: os.path.dirname(os.path.dirname(output[0])), ## dirname of first output
440 | starextraparams = config["additional_star_align"]
441 | conda:
442 | "envs/environment.yaml"
443 | shell:
444 | "echo 'STAR version:\n' > {log}; STAR --version >> {log}; "
445 | "STAR --genomeDir {params.STARindex} --readFilesIn {input.fastq} "
446 | "--runThreadN {threads} --outFileNamePrefix {params.STARdir}/{wildcards.sample}/{wildcards.sample}_ "
447 | "--outSAMtype BAM SortedByCoordinate --readFilesCommand gunzip -c "
448 | "{params.starextraparams}"
449 |
450 | rule starPE:
451 | input:
452 | index = os.path.join(config["STARindex"], "SA"),
453 | fastq1 = os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext1"]), "_val_1.fq.gz"])) if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext1"]), ".", str(config["fqsuffix"]), ".gz"])),
454 | fastq2 = os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext2"]), "_val_2.fq.gz"])) if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext2"]), ".", str(config["fqsuffix"]), ".gz"]))
455 | output:
456 | os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam")
457 | threads:
458 | config["ncores"]
459 | log:
460 | os.path.join(outputdir, "logs", "STAR_{sample}.log")
461 | benchmark:
462 | os.path.join(outputdir, "benchmarks", "STAR_{sample}.txt")
463 | params:
464 | STARindex = lambda wildcards, input: os.path.dirname(input['index']), ## dirname of index input
465 | STARdir = lambda wildcards, output: os.path.dirname(os.path.dirname(output[0])), ## dirname of first output
466 | starextraparams = config["additional_star_align"]
467 | conda:
468 | "envs/environment.yaml"
469 | shell:
470 | "echo 'STAR version:\n' > {log}; STAR --version >> {log}; "
471 | "STAR --genomeDir {params.STARindex} --readFilesIn {input.fastq1} {input.fastq2} "
472 | "--runThreadN {threads} --outFileNamePrefix {params.STARdir}/{wildcards.sample}/{wildcards.sample}_ "
473 | "--outSAMtype BAM SortedByCoordinate --readFilesCommand gunzip -c "
474 | "{params.starextraparams}"
475 |
476 | ## Index bam files
477 | rule bamindex:
478 | input:
479 | bam = os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam")
480 | output:
481 | os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam.bai")
482 | log:
483 | os.path.join(outputdir, "logs", "samtools_index_{sample}.log")
484 | benchmark:
485 | os.path.join(outputdir, "benchmarks", "samtools_index_{sample}.txt")
486 | conda:
487 | "envs/environment.yaml"
488 | shell:
489 | "echo 'samtools version:\n' > {log}; samtools --version >> {log}; "
490 | "samtools index {input.bam}"
491 |
492 | ## Convert BAM files to bigWig
493 | rule bigwig:
494 | input:
495 | bam = os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam"),
496 | chrl = os.path.join(config["STARindex"], "chrNameLength.txt")
497 | output:
498 | os.path.join(outputdir, "STARbigwig", "{sample}_Aligned.sortedByCoord.out.bw")
499 | params:
500 | STARbigwigdir = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output
501 | log:
502 | os.path.join(outputdir, "logs", "bigwig_{sample}.log")
503 | benchmark:
504 | os.path.join(outputdir, "benchmarks", "bigwig_{sample}.txt")
505 | conda:
506 | "envs/environment.yaml"
507 | shell:
508 | "echo 'bedtools version:\n' > {log}; bedtools --version >> {log}; "
509 | "bedtools genomecov -split -ibam {input.bam} -bg | LC_COLLATE=C sort -k1,1 -k2,2n > "
510 | "{params.STARbigwigdir}/{wildcards.sample}_Aligned.sortedByCoord.out.bedGraph; "
511 | "bedGraphToBigWig {params.STARbigwigdir}/{wildcards.sample}_Aligned.sortedByCoord.out.bedGraph "
512 | "{input.chrl} {output}; rm -f {params.STARbigwigdir}/{wildcards.sample}_Aligned.sortedByCoord.out.bedGraph"
513 |
514 | ## ------------------------------------------------------------------------------------ ##
515 | ## Transcript quantification
516 | ## ------------------------------------------------------------------------------------ ##
517 | ## tximeta
518 | rule tximeta:
519 | input:
520 | os.path.join(outputdir, "Rout", "pkginstall_state.txt"),
521 | expand(os.path.join(outputdir, "salmon", "{sample}", "quant.sf"), sample = samples.names.values.tolist()),
522 | metatxt = config["metatxt"],
523 | salmonidx = os.path.join(config["salmonindex"], "versionInfo.json"),
524 | json = "".join([config["salmonindex"], ".json"]),
525 | script = "scripts/run_tximeta.R"
526 | output:
527 | os.path.join(outputdir, "outputR", "tximeta_se.rds")
528 | log:
529 | os.path.join(outputdir, "Rout", "tximeta_se.Rout")
530 | benchmark:
531 | os.path.join(outputdir, "benchmarks", "tximeta_se.txt")
532 | params:
533 | salmondir = lambda wildcards, input: os.path.dirname(os.path.dirname(input[1])), ## dirname of second output
534 | flag = config["annotation"],
535 | organism = config["organism"],
536 | Rbin = Rbin
537 | conda:
538 | Renv
539 | shell:
540 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args salmondir='{params.salmondir}' json='{input.json}' metafile='{input.metatxt}' outrds='{output}' annotation='{params.flag}' organism='{params.organism}'" {input.script} {log}'''
541 |
542 | ## ------------------------------------------------------------------------------------ ##
543 | ## Input variable check
544 | ## ------------------------------------------------------------------------------------ ##
545 | def geneset_param(wildcards):
546 | if config["run_camera"]:
547 | gs = config["genesets"].replace(" ", "") if config["genesets"] is not None else "NOTDEFINED"
548 | return "".join(["genesets='", gs, "'"])
549 | else:
550 | return ""
551 |
552 |
553 | ## check design matrix and contrasts
554 | rule checkinputs:
555 | input:
556 | "config.yaml",
557 | script = "scripts/check_input.R"
558 | output:
559 | os.path.join(outputdir, "Rout", "check_input.txt")
560 | log:
561 | os.path.join(outputdir, "Rout", "check_input.Rout")
562 | benchmark:
563 | os.path.join(outputdir, "benchmarks", "check_input.txt")
564 | params:
565 | gtf = config["gtf"],
566 | genome = config["genome"],
567 | txome = config["txome"],
568 | fastqdir = config["FASTQ"],
569 | metatxt = config["metatxt"],
570 | design = config["design"].replace(" ", "") if config["design"] is not None else "NOTDEFINED",
571 | contrast = config["contrast"].replace(" ", "") if config["contrast"] is not None else "NOTDEFINED",
572 | annotation = config["annotation"].replace(" ", "") if config["annotation"] is not None else "NOTDEFINED",
573 | genesets = geneset_param,
574 | fqsuffix = str(config["fqsuffix"]),
575 | fqext1 = str(config["fqext1"]),
576 | fqext2 = str(config["fqext2"]),
577 | run_camera = str(config["run_camera"]),
578 | organism = config["organism"],
579 | Rbin = Rbin
580 | conda:
581 | Renv
582 | shell:
583 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args metafile='{params.metatxt}' design='{params.design}' contrast='{params.contrast}' outFile='{output}' gtf='{params.gtf}' genome='{params.genome}' fastqdir='{params.fastqdir}' fqsuffix='{params.fqsuffix}' fqext1='{params.fqext1}' fqext2='{params.fqext2}' txome='{params.txome}' run_camera='{params.run_camera}' organism='{params.organism}' {params.genesets} annotation='{params.annotation}'" {input.script} {log};
584 | cat {output}
585 | '''
586 |
587 |
588 | ## ------------------------------------------------------------------------------------ ##
589 | ## Differential expression
590 | ## ------------------------------------------------------------------------------------ ##
591 | rule edgeR:
592 | input:
593 | os.path.join(outputdir, "Rout", "pkginstall_state.txt"),
594 | rds = os.path.join(outputdir, "outputR", "tximeta_se.rds"),
595 | script = "scripts/run_render.R",
596 | template = "scripts/edgeR_dge.Rmd"
597 | output:
598 | html = os.path.join(outputdir, "outputR", "edgeR_dge.html"),
599 | rds = os.path.join(outputdir, "outputR", "edgeR_dge.rds")
600 | params:
601 | directory = lambda wildcards, input: os.path.dirname(input['rds']), ## dirname of rds input
602 | organism = config["organism"],
603 | design = config["design"].replace(" ", "") if config["design"] is not None else "",
604 | contrast = config["contrast"].replace(" ", "") if config["contrast"] is not None else "",
605 | genesets = geneset_param,
606 | Rbin = Rbin
607 | log:
608 | os.path.join(outputdir, "Rout", "run_dge_edgeR.Rout")
609 | benchmark:
610 | os.path.join(outputdir, "benchmarks", "run_dge_edgeR.txt")
611 | conda:
612 | Renv
613 | shell:
614 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args se='{input.rds}' organism='{params.organism}' design='{params.design}' contrast='{params.contrast}' {params.genesets} rmdtemplate='{input.template}' outputdir='{params.directory}' outputfile='edgeR_dge.html'" {input.script} {log}'''
615 |
616 | ## ------------------------------------------------------------------------------------ ##
617 | ## Differential transcript usage
618 | ## ------------------------------------------------------------------------------------ ##
619 | ## DRIMSeq
620 | rule DRIMSeq:
621 | input:
622 | os.path.join(outputdir, "Rout", "pkginstall_state.txt"),
623 | rds = os.path.join(outputdir, "outputR", "edgeR_dge.rds"),
624 | script = "scripts/run_render.R",
625 | template = "scripts/DRIMSeq_dtu.Rmd"
626 | output:
627 | html = os.path.join(outputdir, "outputR", "DRIMSeq_dtu.html"),
628 | rds = os.path.join(outputdir, "outputR", "DRIMSeq_dtu.rds")
629 | params:
630 | directory = lambda wildcards, input: os.path.dirname(input['rds']), ## dirname of rds input
631 | organism = config["organism"],
632 | ncores = config["ncores"],
633 | design = config["design"].replace(" ", "") if config["design"] is not None else "",
634 | contrast = config["contrast"].replace(" ", "") if config["contrast"] is not None else "",
635 | Rbin = Rbin
636 | log:
637 | os.path.join(outputdir, "Rout", "run_dtu_drimseq.Rout")
638 | benchmark:
639 | os.path.join(outputdir, "benchmarks", "run_dtu_drimseq.txt")
640 | conda:
641 | Renv
642 | threads:
643 | config["ncores"]
644 | shell:
645 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args se='{input.rds}' design='{params.design}' contrast='{params.contrast}' ncores='{params.ncores}' rmdtemplate='{input.template}' outputdir='{params.directory}' outputfile='DRIMSeq_dtu.html'" {input.script} {log}'''
646 |
647 | ## ------------------------------------------------------------------------------------ ##
648 | ## shiny app
649 | ## ------------------------------------------------------------------------------------ ##
650 | def shiny_input(wildcards):
651 | input = [os.path.join(outputdir, "Rout", "pkginstall_state.txt")]
652 | if config["run_STAR"]:
653 | input.extend(expand(os.path.join(outputdir, "STARbigwig", "{sample}_Aligned.sortedByCoord.out.bw"), sample = samples.names.values.tolist()))
654 | return input
655 |
656 | def shiny_params(wildcards):
657 | param = ["".join(["outputdir='", outputdir, "outputR'"])]
658 | if config["run_STAR"]:
659 | param.append("".join(["bigwigdir='", outputdir, "STARbigwig'"]))
660 | return param
661 |
662 | ## shiny
663 | rule shiny:
664 | input:
665 | shiny_input,
666 | rds = os.path.join(outputdir, "outputR", "DRIMSeq_dtu.rds") if config["run_DRIMSeq"] else os.path.join(outputdir, "outputR", "edgeR_dge.rds"),
667 | script = "scripts/run_render.R",
668 | gtf = config["gtf"],
669 | template = "scripts/prepare_shiny.Rmd"
670 | output:
671 | html = os.path.join(outputdir, "outputR", "prepare_shiny.html"),
672 | rds = os.path.join(outputdir, "outputR", "shiny_sce.rds")
673 | params:
674 | p = shiny_params,
675 | Rbin = Rbin
676 | log:
677 | os.path.join(outputdir, "Rout", "prepare_shiny.Rout")
678 | benchmark:
679 | os.path.join(outputdir, "benchmarks", "prepare_shiny.txt")
680 | conda:
681 | Renv
682 | shell:
683 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args se='{input.rds}' gtffile='{input.gtf}' rmdtemplate='{input.template}' outputfile='prepare_shiny.html' {params.p}" {input.script} {log}'''
684 |
685 | ## ------------------------------------------------------------------------------------ ##
686 | ## Success and failure messages
687 | ## ------------------------------------------------------------------------------------ ##
688 | onsuccess:
689 | print("Success! The Snakemake workflow is completed.")
690 |
691 | onerror:
692 | print("Error! The Snakemake workflow aborted.")
693 |
--------------------------------------------------------------------------------
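Before launching the workflow on a new dataset, it can help to run the same sanity checks the Snakefile performs, outside of Snakemake. A minimal standalone sketch, assuming PyYAML and pandas are importable; it mirrors the `getpath` helper and the `names`/`type` column check from the Snakefile above.

```
# Sketch: reproduce the Snakefile's config/metadata sanity checks as a
# standalone script for debugging. Assumes PyYAML and pandas are installed.
import os
import re
import sys

import pandas as pd
import yaml

def getpath(path):
    """Normalise a directory path the same way the Snakefile does."""
    if path in ["", ".", "./"]:
        return ""
    path = re.sub(r"^\./?", "", path)
    return path if path.endswith("/") else path + "/"

with open("config.yaml") as fh:
    config = yaml.safe_load(fh)

metatxt = config.get("metatxt") or ""
if not os.path.isfile(metatxt):
    sys.exit(f"Metadata file {metatxt!r} does not exist.")

samples = pd.read_csv(metatxt, sep="\t")
if not {"names", "type"}.issubset(samples.columns):
    sys.exit(f"Make sure 'names' and 'type' are columns in {metatxt}")

print("output dir:", getpath(config["output"]))
print("FASTQ dir: ", getpath(config["FASTQ"]))
print("PE samples:", samples.names[samples.type == "PE"].tolist())
print("SE samples:", samples.names[samples.type == "SE"].tolist())
```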
/config.yaml:
--------------------------------------------------------------------------------
1 | ## Important note:
2 | ## All paths defined in this configuration file must be either absolute or relative to the
3 | ## location of the Snakefile!
4 |
5 | ## Reference annotation details
6 | ##--------------------------------------------------------------------------------------------
7 | ## Specify "Ensembl" or "Gencode" depending on your choice
8 | annotation: Ensembl
9 |
10 | organism: Homo_sapiens # separate with underscore
11 | build: GRCh38
12 | release: 93
13 | ##--------------------------------------------------------------------------------------------
14 |
15 |
16 | ## Paths to existing reference files
17 | ##--------------------------------------------------------------------------------------------
18 | txome: example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz
19 | genome: example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.dna.chromosome.1.1.10M.fa
20 | gtf: example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.93.1.1.10M.gtf
21 | ##--------------------------------------------------------------------------------------------
22 |
23 |
24 | ## Paths to indexes that will be generated by the workflow
25 | ##--------------------------------------------------------------------------------------------
26 | salmonindex: example_data/reference/SalmonIndex/Homo_sapiens.GRCh38.93.sidx
27 | #salmonk: 31
28 | STARindex: example_data/reference/STARIndex/Homo_sapiens.GRCh38.93.STAR.idx
29 | ##--------------------------------------------------------------------------------------------
30 |
31 | ## Additional STAR parameters
32 | ## Here, you can specify any optional parameters for the index building and/or alignment
33 | ## with STAR. The following arguments are automatically populated and should NOT be
34 | ## specified here:
35 | ## Indexing: runMode, runThreadN, genomeDir, genomeFastaFiles, sjdbGTFfile, sjdbOverhang
36 | ## Alignment: runMode, genomeDir, readFilesIn, runThreadN, outFileNamePrefix, outSAMtype, readFilesCommand
37 | ##--------------------------------------------------------------------------------------------
38 | ## Add or remove parameters inside the ""
39 | additional_star_index: ""
40 | additional_star_align: ""
41 |
42 | ## Additional Salmon parameters
43 | ## Here, you can specify any optional parameters for the index building and/or
44 | ## abundance quantification with Salmon. The following arguments are automatically populated
45 | ## based on the arguments specified elsewhere, and should NOT be specified here:
46 | ## Indexing: transcriptome input file, index directory, gencode flag
47 | ## Quantification: library type, fastq files, index directory, output directory, number of cores
48 | ##--------------------------------------------------------------------------------------------
49 | ## Add or remove parameters inside the ""
50 | additional_salmon_index: "-k 31"
51 |
52 | ## Add or remove parameters inside the ""
53 | ## We specify the mean and standard deviation of the fragment length distribution, for use with Salmon.
54 | ## This is important to specify for single-end reads.
55 | ## For paired-end reads, these values will define the prior, which is then updated
56 | ## based on the observed fragment lengths.
57 | additional_salmon_quant: "--seqBias --gcBias --fldMean 250 --fldSD 25"
58 | ##--------------------------------------------------------------------------------------------
59 |
60 |
61 | ## Information about the experiment
62 | ##--------------------------------------------------------------------------------------------
63 | readlength: 63
64 |
65 | ## Path to metadata text file. This file must contain at least the following columns:
66 | ## names: the sample identifiers = the names of the FASTQ files (excluding the _R1/R2.fastq.gz part)
67 | ## type: either SE or PE, indicating whether the sample was analyzed
68 | ## via single-end or paired-end sequencing.
69 | metatxt: example_data/metadata.txt
70 |
71 | ## Variables used for model fitting
72 | ## design: design formula for use with edgeR, camera and DRIMSeq. Must be a string
73 | ## of the form "~ <predictors>" (e.g., "~ 0 + celline")
74 | ## contrast: (comma-separated if multiple) list of contrasts to estimate in edgeR_dge.Rmd
75 | design: "~ 0 + celline"
76 | contrast: cellineN61311-cellineN052611,cellineN052611-cellineN61311
77 |
78 | ## Gene sets used for gene set analysis with camera
79 | ## Comma-separated list of gene set categories to test with camera.
80 | ## Must be a subset of H,C1,C2,C3,C4,C5,C6,C7
81 | ## Only required if the variable "run_camera" is True (see below).
82 | genesets: H,C5
83 |
84 | ## The maximal number of cores to use for FastQC, STAR, Salmon and DRIMSeq.
85 | ## Note that the actual number of cores available to Snakemake is determined by
86 | ## the --cores argument when it is invoked.
87 | ncores: 1
88 | ##---------------------------------------------------------------------------------------------
89 |
90 |
91 | ## Path to a folder containing gzipped fastq files, and the file suffix (typically, either fastq or fq).
92 | ## If you have paired-end fastq files, you also need to define the extension distinguishing the two read files.
93 | ## More precisely, ARMOR assumes that paired-end fastq files are named
94 | ## <sample_name>_<fqext1>.<fqsuffix>.gz and <sample_name>_<fqext2>.<fqsuffix>.gz.
95 | ## Single-end fastq files are supposed to be named
96 | ## <sample_name>.<fqsuffix>.gz.
97 | ##---------------------------------------------------------------------------------------------
98 | FASTQ: example_data/FASTQ
99 | fqext1: R1
100 | fqext2: R2
101 | fqsuffix: fastq
102 | ##---------------------------------------------------------------------------------------------
103 |
104 |
105 | ## Path to a folder that will store the output generated by the workflow.
106 | ## Additional subfolders of this folder will be generated by the workflow.
107 | ## To put output in the current directory, set output to ".".
108 | ##---------------------------------------------------------------------------------------------
109 | output: example_data/output
110 | ##---------------------------------------------------------------------------------------------
111 |
112 | ## R setup
113 | ##---------------------------------------------------------------------------------------------
114 | ## Specify "True" if R should be installed in a conda environment or "False" if you want to use
115 | ## your own R installation (then you have to set the path to your library in the .Renviron file)
116 | useCondaR: True
117 | Rbin: R
118 | ##---------------------------------------------------------------------------------------------
119 |
120 | ## Conditional conda rules
121 | ##---------------------------------------------------------------------------------------------
122 | ## Should read trimming, STAR mapping, DRIMSeq analysis and gene set analysis be performed? Set
123 | ## to False if the step is not required.
124 | run_trimming: True
125 | run_STAR: True
126 | run_DRIMSeq: True
127 | run_camera: True
128 | ##---------------------------------------------------------------------------------------------
129 |
--------------------------------------------------------------------------------
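A minimal sketch for checking a `config.yaml` against the keys the workflow expects, assuming PyYAML is available. The key list is copied from the `configvars` check in the Snakefile; keys that are absent are set to `None` by the Snakefile itself, so a missing entry reported here is informational rather than fatal, whereas a missing reference file will break the corresponding rules.

```
# Sketch: check config.yaml for the keys listed in the Snakefile's configvars,
# and verify that the reference files it points to exist. Assumes PyYAML.
import os
import yaml

CONFIGVARS = [
    "annotation", "organism", "build", "release", "txome", "genome", "gtf",
    "salmonindex", "salmonk", "STARindex", "readlength", "fldMean", "fldSD",
    "metatxt", "design", "contrast", "genesets", "ncores", "FASTQ", "fqext1",
    "fqext2", "fqsuffix", "output", "useCondaR", "Rbin", "run_trimming",
    "run_STAR", "run_DRIMSeq", "run_camera",
]

with open("config.yaml") as fh:
    cfg = yaml.safe_load(fh)

# Keys not set in config.yaml are filled with None by the Snakefile, so this
# listing is informational rather than an error.
missing = [k for k in CONFIGVARS if k not in cfg]
print("keys not set in config.yaml:", missing or "none")

for key in ["txome", "genome", "gtf", "metatxt"]:
    path = cfg.get(key)
    status = "ok" if path and os.path.exists(path) else "MISSING"
    print(f"{key:8s} {status}: {path}")
```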
/envs/environment.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - bioconda
3 | - conda-forge
4 | dependencies:
5 | - samtools=1.11
6 | - salmon=1.4.0
7 | - star=2.7.7a
8 | - fastqc=0.11.9
9 | - multiqc=1.9
10 | - trim-galore=0.6.6
11 | - cutadapt=3.2
12 | - bedtools=2.30.0
13 | - ucsc-bedgraphtobigwig=377
14 | - pandoc=2.11
15 | - tbb=2020.2
16 |
--------------------------------------------------------------------------------
/envs/environment_R.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | dependencies:
4 | - r-base=4.4.1
5 | - pandoc=3.4
6 | - r-curl=5.2.1
7 | - r-rsqlite=2.3.7
8 | - r-xml2=1.3.6
9 | - r-httpuv=1.6.15
10 | - r-mass=7.3_60.0.1
11 | - r-matrix=1.6_5
12 |
--------------------------------------------------------------------------------
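The Snakefile selects `envs/environment_R.yaml` for the R-based rules when `useCondaR` is True and falls back to `envs/environment.yaml` otherwise. A minimal sketch that makes this choice visible and lists the pinned dependencies of both environments, assuming PyYAML is available.

```
# Sketch: show which conda environment the R rules will use, and the pinned
# dependencies declared in both environment files. Assumes PyYAML.
import yaml

with open("config.yaml") as fh:
    use_conda_r = yaml.safe_load(fh).get("useCondaR", False)

renv = "envs/environment_R.yaml" if use_conda_r else "envs/environment.yaml"
print("R rules will use:", renv)

for envfile in ["envs/environment.yaml", "envs/environment_R.yaml"]:
    with open(envfile) as fh:
        env = yaml.safe_load(fh)
    print(envfile, "->", ", ".join(env["dependencies"]))
```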
/example_data/FASTQ/SRR1039508_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039508_R1.fastq.gz
--------------------------------------------------------------------------------
/example_data/FASTQ/SRR1039508_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039508_R2.fastq.gz
--------------------------------------------------------------------------------
/example_data/FASTQ/SRR1039509_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039509_R1.fastq.gz
--------------------------------------------------------------------------------
/example_data/FASTQ/SRR1039509_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039509_R2.fastq.gz
--------------------------------------------------------------------------------
/example_data/FASTQ/SRR1039512_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039512_R1.fastq.gz
--------------------------------------------------------------------------------
/example_data/FASTQ/SRR1039512_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039512_R2.fastq.gz
--------------------------------------------------------------------------------
/example_data/FASTQ/SRR1039513_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039513_R1.fastq.gz
--------------------------------------------------------------------------------
/example_data/FASTQ/SRR1039513_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039513_R2.fastq.gz
--------------------------------------------------------------------------------
/example_data/README.md:
--------------------------------------------------------------------------------
1 | ## A small RNA-seq example data set
2 |
3 | This directory contains a small RNA-seq example data set that may be suitable, e.g., for teaching or testing purposes. The original data files come from the study
4 |
5 | > Himes BE, Jiang X, Wagner P, Hu R, Wang Q, Klanderman B, Whitaker RM, Duan Q, Lasky-Su J, Nikolos C, Jester W, Johnson M, Panettieri Jr RA, Tantisira KG, Weiss ST, Lu Q: [RNA-Seq Transcriptome Profiling Identifies CRISPLD2 as a Glucocorticoid Responsive Gene that Modulates Cytokine Function in Airway Smooth Muscle Cells.](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0099625) PLoS ONE 9(6): e99625. https://doi.org/10.1371/journal.pone.0099625 (2014).
6 |
7 | (GEO accession number [GSE52778](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE52778)), in which RNA-Seq was used to characterize the human airway smooth muscle transcriptome at baseline and under asthma treatment conditions. The example data set includes four of the samples from this study (SRR1039508, SRR1039509, SRR1039512 and SRR1039513), representing Dexamethasone-treated and untreated samples from two cell lines (N61311 and N052611). The FASTQ files have been subsetted to include only reads aligning within the first 10M bases of chromosome 1 (of the GRCh38 reference genome).
8 |
9 | In addition to the FASTQ files with the reads, we provide reference annotation files from two sources: Ensembl (release GRCh38.93) and Gencode (v28). For each annotation source, we include a fasta file with the genome sequence, a gtf file with the corresponding gene annotation, and one or more fasta files with transcript sequences. All files are subsetted to include only features from the first 10M bases of chromosome 1.
10 |
11 | ### Gencode reference files
12 | The Gencode reference files were downloaded from [https://www.gencodegenes.org/releases/current.html](https://www.gencodegenes.org/releases/current.html).
13 |
14 | - reference/Gencode28/GRCh38.primary_assembly.genome.1.1.10M.fa (genome sequence)
15 | - reference/Gencode28/gencode.v28.transcripts.1.1.10M.fa.gz (transcript sequences)
16 | - reference/Gencode28/gencode.v28.annotation.1.1.10M.gtf (gene annotation)
17 |
18 | ### Ensembl reference files
19 | The Ensembl reference files were downloaded from [https://www.ensembl.org/info/data/ftp/index.html](https://www.ensembl.org/info/data/ftp/index.html).
20 |
21 | - reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.dna.chromosome.1.1.10M.fa (genome sequence)
22 | - reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz (cDNA transcript sequences)
23 | - reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.ncrna.1.1.10M.fa.gz (ncRNA transcript sequences)
24 | - reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.93.1.1.10M.gtf (gene annotation)
25 |
--------------------------------------------------------------------------------
/example_data/metadata.txt:
--------------------------------------------------------------------------------
1 | names type celline treatment
2 | SRR1039508 PE N61311 Untreated
3 | SRR1039509 PE N61311 Dexamethasone
4 | SRR1039512 PE N052611 Untreated
5 | SRR1039513 PE N052611 Dexamethasone
6 |
--------------------------------------------------------------------------------
/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz
--------------------------------------------------------------------------------
/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.ncrna.1.1.10M.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.ncrna.1.1.10M.fa.gz
--------------------------------------------------------------------------------
/example_data/reference/Gencode28/gencode.v28.transcripts.1.1.10M.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/reference/Gencode28/gencode.v28.transcripts.1.1.10M.fa.gz
--------------------------------------------------------------------------------
/img/ARMOR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/img/ARMOR.png
--------------------------------------------------------------------------------
/img/ARMOR.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/img/benchmark_summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/img/benchmark_summary.png
--------------------------------------------------------------------------------
/img/dag_nice5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/img/dag_nice5.png
--------------------------------------------------------------------------------
/img/draw.io_run_modes_ARMOR.xml:
--------------------------------------------------------------------------------
1 | 7R1rd6I4+9f0nPf9YA935WNtp7vdbadz7MzuzqeeqFF5i+Byaev8+jeBRIgERAmKbebSQoCQPPc8l3ChXy/ffwvAavHgT6F7oSnT9wv95kLTVN0eoF+4ZZ22DEwzbZgHzpTclDU8Ob8gaVRIa+xMYcjcGPm+GzkrtnHiex6cREwbCAL/jb1t5rvsW1dgDgsNTxPgFlv/dqbRgrSqipJd+B0680VE50cujMHkZR74sUfed6Hps+RPenkJaF/k/nABpv5brkn/cqFfB74fpUfL92voYthSsKXP3ZZc3Yw7gF5U54Ff+mN89WPQ/36n9b5+d1/+ePjjtWelvbwCN4Z0GpaL+htOnVc86GhNAGX9G+ORDiP4HvWA68y9C/0K3eHCWZRdRUdz8jvpZRxst6ABJl3TVo15i4agtMKHnh+hX8O3hRPBpxWY4LY3RIGobREtXXSm4u43OHiMI9fxIGmfguDlET3lRJgqlUvF3LwrDzACw1cYoFnlmggAf4P+EkbBGt1Crpq6lj6ypkRsGmnDW452bILxRY5sbJ2QLCHX+abvDGXogGCNj8G7Hy9/Xgfxn/50ejeCd30QukpvMwmJwoNQqPW1AgYR+xcxOBi0hUGFg8GzgqlhsjDtK0Wu0HlcoVEp2wSomgqBbdmaZYKpolqDnsoDaEq6Mx9NkscT+EIvTLQTZgnVWr0XWWIE/42dAC4RpMIcM6SdlnHDm7N0AUZBHkP4kSdyk4LOJwvHnd6DtR9jyIQRQiA9Gy78wPmF7gf0YXQ5iIgm1Szcm+O6177rB8krdajgv8yTT7hH8q4A4nl+owhXN033IIzoeHzXBavQGbuUdpYgmDve0I8if0luolO7ZV9PtKA+JCLmZoJeBAP8GkyScEqnQRVi+sKlMyHHLhhDd7ihYdq15ydQDKPAf4G59ynJn80VqsUpmG/B0nExYf4FgynwAIV+Cj9VI+e8DgWwhmqprMagXTDihsMaqmW3wRrl2qI2axgsayDLjEIua0xgaNvlWuXQt7G9hCvgcXvJJGCPDA93FszH/9GwfYrgjWCnaGp/c6wq2n+LL0JW5xTkWD19IRlE0mepHBAxz5+wXM7UEz9Y17Oih+UfwlR5CUL5DCsKb36PjQP9xshaRoRIjRJF5CPumLkJZy+c6RR6CedHIALjjbxb+Q6WoWiI5hD9wzhQLs0L8wYjwxyq2Tn6h28PEIN6aOjASVgRIlH1BsNsynkmLVcIu9mWWgVWPS6l9wllUqpNy0w6LoUdgQtTkagI5K2ZM79cA0SagviZPea8M3mOz8178tmxBnx0LLcxAh6gueuK5hgpby7pm8NSnCaJ6EMQDZZYIXjjcJWKDvUS/YhDpH2QWh2lg/keICEnaeLT0oSGaWI0dogzYnQc+bw9e71q9nsPpRFZFZmgOfSPDoF6Uz33iUljW4SxTe3dXca2KcADVxyufpYrYsG2OPRe0UrzFv1yAt/DDi7GLq+UrvnVMLtgrWf2dWziz6O2pt6Ki6DSdJKuAfHSSqWP7BRXAnzb/IBBY4mFgFJFSrTBj6NVHOXoabwvjWUeX6WEJHLkJ8DdOrDZSITOiUSoGgdZVlvIMmQsQsYiuhmLsPpF5tB0TuhTM9riDnMXd8jgQmuWgwwuHGJBVAr5RvEFLucJiC9wR9yvDjEI8hd9hqXTWYUxeLcewY2p7BitAL9vmcPpvCj5AP/5LXBDKMHGm3NjUHQK+VuO8gbsqw5yLKsZmYCzeOy7wpms2m3kox+jZ/R+gPVPfX7enkexqUwI1Gmpi2ZpuggyXXjeWp7pYmktmS5que3yicyOU3pspSPzg/Cyqtb0AOit+TIHrXgAagcHT+sBCNdhBJcXuaj3cZYOu+zxreacjJAK+PRMa5yaact59lQx/tOy8eWIqOHjcvE+LNycfz/2kpG70j4MRKebx+j5/m749Pzj6ctoq5vjLxUPIc5PRnPHwcnxZFJxXd/wvJwBb0fPrjMOKj0RW8TfouTrmsKTSW3dMNTqxlfNtqI8AiohP02qSH9w6lQRrV8ACpzOIc3NQBS88Oe+B9wvWeuQBVsORPDdif7JHf/Et1ya5OzmnTyRnKzpiYfm8U/+JPcUPs0eS87oc/+DUbQmWQwgRuofEcRmtPe+v9qFttCPgwmsomRSDRmBYA6ruJSSPIZcJRkE0AWR8wqZcYjnQLu5s5Kb/7Ozlwq1rmQ2g6bZW2qd+ClrDSKxkicBBBHEuFngnzmPZG6VIt6JuDUgZQYm7APXiKYciN6rfIVvxbGT7As83vwkej0PLPHB1ejhcZQ0zDBQzOv03gOcrnXCGIdYZ5Vo3J5uEj0CE0Txu5ElPnW3KbKIfGBmQBDUfdDrGPRB7G2gDpBUXIdOeAjISyZadzV0mCwRjk4PvMAl+i9N031N0+abc6jFAB53cw510NqeNWYrhs4lIkzG2MEJjlXmTl2zBT/0DQYOmj3OHWVNpZ6C3ttn7SVL13dYTMnZdp+Hm0eaVtM80jplHdFhV1hHdMMwKkJUAYvnSiHYujpQRaqD2iLbFCeykUkUh7C3nb66Xy2kDA41FeyFnazoDlW7ClOMttwNmiFSru8jnan8txn5P7C0Svm/W64brFzXLLU1uV66DchumS5cqJNHv2F6zwhO77P1A4UAYzod8lRGS4WOTLrtV1lH6ZQLHV0FAVjnbiP8WDpggzrW6Hs0vXJc2/frCrOlIjpIR5BxyAa4DZjGaskYUkyGGexqVmjZ88NjtQbeoH5db5DdKXtHL99vs7Yit6uMl5hZ7j8ALwauu877N3Juj83KH9sr2CYopqOgWcbNlXdS0lWqR7GqxKRDTD38AK3WSnam1IdYJzoT4F6R5giTFVXs3xMa6xlilKpK0xZpjRQtfcopVcPgKFWtLa+wLiDdsR7NqAATzQ2cgdhN3UFKHCIIZ1RT4ZT4bGTCarABpyaYFztojUqM5pEeWWcq60wFMQdrJ9q8hQm32q21jDVD7nspuDT1q18qDeQaf7+8AiG7Ux61etSQm+ZcyMrPg1LChJWN7ayF3Ks3WVwnc/u7INpPXl1ntLMlx5lk8p+qIGe/bGlFFuR0imlPX0ZnlDsWO5mhLEtyRKjdj10eUSjJOQw8p5uDLCGSJUSyhOgDlRDJupqO2Ft1K6BbK6yhn/37nKukZRZXdhDqQQKv0J9FbwjXXTa45MKpW4y8+f7f6RZOzaPrn6ZCTlUG/QK6jryb8qAAFVkiR2FTt0SO0nxHkqLouFsrkTvLjO4WSmrKQjFSuzWVlMXvQvf1S7MoK3llNiK+88m3URWhsvJUcm+3OBOe41mS/mwbLJJVmoa6I/9ZVKawWZ7EUnvhUC8TMKlYvOOY9krO9N8zZ5SVfh8rGZDlfpuzPNVNDuuLyAa0/naB9/jz9kGJf3+x7l5fnL8WAr4QIpMBZTKgIOZgkwF1gxPh5n5aV8RHJ7jccZ7JSx3OBdzhCG02T/mZikNM2Eq10MnPYHNHrJZrsjMM3shsxY5nK+4K18pQfc1QvfxORe3orQhQdAr5MpVWBpc6YLvU/aqwiEx
avu1ynusMwWaH/E6F5OXGvMxNsOUxs4g4MZ+ZP3XCh/xOhWTa/ZmWl6V1XKYtr2WRWfEd8gDIpbb8UIXMMu8uzcksc/mhilYUnkyo74alxvtQBddX0lacxy5g87gbR27tladpdCPhQzeO3NoQ2NZ35UkdvnFkZeCss5tH6v2tvLe6m0cafXbrLbPm5pEHZGHxIStgK6Ga+7GNsW33g92CrTTtSu7KVsw1oTIsL9cEJWKh08DH+MtoC+mExYM/hfiO/wM=
--------------------------------------------------------------------------------
/img/run_modes_ARMOR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/img/run_modes_ARMOR.png
--------------------------------------------------------------------------------
/img/software_management.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/img/software_management.png
--------------------------------------------------------------------------------
/img/software_management.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/scripts/DRIMSeq_dtu.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "DRIMSeq DTU"
3 | author: ""
4 | date: "`r Sys.Date()`"
5 | output:
6 | html_document:
7 | toc: true
8 | toc_float: true
9 | theme: yeti
10 | highlight: tango
11 | code_folding: show
12 | keep_md: true
13 | references:
14 | - id: Nowicka2016DRIMSeq
15 | title: DRIMSeq- a Dirichlet-multinomial framework for multivariate count outcomes in genomics
16 | author:
17 | - family: Nowicka
18 | given: Malgorzata
19 | - family: Robinson
20 | given: Mark D
21 | container-title: F1000Research
22 | volume: 5
23 | page: 1356
24 | type: article-journal
25 | URL: https://f1000research.com/articles/5-1356/v2
26 | issued:
27 | year: 2016
28 | ---
29 |
30 | ```{r DRIMSeq-setup, include=FALSE}
31 | knitr::opts_chunk$set(echo = TRUE, dev = c("png", "pdf"))
32 | ```
33 |
34 | # Introduction
35 |
36 | This script performs differential transcript usage analysis with DRIMSeq
37 | [@Nowicka2016DRIMSeq], based on abundance estimates from Salmon. It supports
38 | testing one or more contrasts. For more detailed information about each step,
39 | we refer to the [DRIMSeq vignette](http://bioconductor.org/packages/release/bioc/vignettes/DRIMSeq/inst/doc/DRIMSeq.pdf).
40 |
41 | # Load packages
42 |
43 | ```{r DRIMSeq-load-pkg}
44 | suppressPackageStartupMessages({
45 | library(dplyr)
46 | library(tximport)
47 | library(tximeta)
48 | library(SingleCellExperiment)
49 | library(edgeR)
50 | library(DRIMSeq)
51 | library(ggplot2)
52 | })
53 | ```
54 |
55 | # Load `SummarizedExperiment` object
56 |
57 | We load the `SummarizedExperiment` objects prepared using `tximeta`, containing
58 | gene- and transcript-level counts and feature lengths. In this report, we will
59 | use the transcript-level quantifications.
60 |
61 | ```{r DRIMSeq-print-se}
62 | sg <- se$sg
63 | st <- se$st
64 | st
65 | ```
66 |
67 | # Plot total number of reads per sample
68 |
69 | ```{r DRIMSeq-plot-totalcount}
70 | ggplot(data.frame(totCount = colSums(assay(sg, "counts")),
71 | sample = colnames(assay(sg, "counts")),
72 | stringsAsFactors = FALSE),
73 | aes(x = sample, y = totCount)) + geom_bar(stat = "identity") +
74 | theme_bw() + xlab("") + ylab("Total read count") +
75 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
76 | ```
77 |
78 | # Create dmDSdata object
79 |
80 | To create a `dmDSdata` object, which is the container used by `DRIMSeq` to store
81 | feature counts and metadata, we need a `data.frame` containing information about
82 | the samples (`metadata`) and a `data.frame` with counts (`counts`). The
83 | `dmDSdata` object is used to create a data summary plot.
84 |
85 | ```{r DRIMSeq-dmDSdata}
86 | print(contrast)
87 | print(design)
88 |
89 | counts <- data.frame(feature_id = rownames(st),
90 | gene_id = unlist(rowData(st)$gene_id),
91 | assay(st, "counts"),
92 | row.names = NULL,
93 | check.names = FALSE)
94 |
95 | metadata <- data.frame(colData(st))
96 | metadata <- metadata %>%
97 | dplyr::rename(sample_id = names)
98 |
99 | d <- dmDSdata(counts = counts, samples = metadata)
100 | plotData(d)
101 | ```
102 |
103 | # Filter
104 |
105 | Genes with low expression levels are filtered out to ensure that the
106 | observed transcript ratios are reliable. A single gene may have many
107 | transcripts, and lowly expressed individual transcripts are removed via the
108 | `min_samps_feature_expr` and `min_feature_expr` arguments.
109 |
110 | ```{r DRIMSeq-data-filter}
111 | d <- dmFilter(d, min_samps_gene_expr = 3, min_samps_feature_expr = 3,
112 | min_gene_expr = 10, min_feature_expr = 5)
113 | plotData(d)
114 | ```
115 |
116 | # Define design
117 |
118 | Here, we specify the design matrix used for the Dirichlet-multinomial model in a later step.
119 |
120 | ```{r DRIMSeq-define-design}
121 | print(samples(d))
122 | (des <- model.matrix(as.formula(design), data = samples(d)))
123 | ```
124 |
125 | # Calculate precision
126 |
127 | Computationally, it is more convenient to first estimate the precision before
128 | fitting the Dirichlet-multinomial model to the data. The precision parameters are
129 | estimated using the Cox-Reid adjusted profile likelihood. By default, $10\%$ of
130 | the genes (randomly selected) are used to estimate the initial value (the common
131 | precision). To get reproducible results, a random seed is used.
132 |
133 | To inspect the behavior of the precision estimates, they are plotted against the
134 | mean gene expression. Typically, precision increases for genes with higher mean
135 | expression in RNA-seq data.
136 |
137 | ```{r DRIMSeq-calculate-precision}
138 | set.seed(123)
139 | if(ncores > 1) {
140 | bpps <- BiocParallel::MulticoreParam(min(parallel::detectCores(),ncores))
141 | } else {
142 | bpps <- BiocParallel::SerialParam()
143 | }
144 |
145 | d <- dmPrecision(d, design = des, add_uniform = TRUE, BPPARAM = bpps)
146 | plotPrecision(d)
147 | ```
148 |
149 | # Fit model
150 |
151 | At the gene level, the maximum likelihood is used to estimate the coefficients
152 | of the Dirichlet-multinomial (DM) regression and the fitted transcript
153 | proportions in each sample. At the transcript level, beta-binomial regression is
154 | applied to each transcript separately.
155 |
156 | ```{r DRIMSeq-fit-model}
157 | d <- dmFit(d, design = des, verbose = 1, add_uniform = TRUE)
158 | ```
159 |
160 | # Define contrasts
161 |
162 | The contrasts define the comparisons to be performed between the specified groups.
163 |
164 | ```{r DRIMSeq-define-contrasts}
165 | print(contrast)
166 | (contrasts <- as.data.frame(makeContrasts(contrasts = contrast, levels = des)))
167 | ```
168 |
169 | # Perform tests
170 |
171 | The test can be performed on the gene level (`level <- 'gene'`) or the
172 | transcript level (`level <- 'feature'`) using the likelihood ratio test. The
173 | results are stored as `DRIMSeq_res` and `DRIMSeq_feature_res` for the gene and
174 | the transcript level, respectively.
175 |
176 | ```{r DRIMSeq-result-genes, warning = FALSE}
177 | level <- "gene"
178 | signif3 <- function(x) signif(x, digits = 3)
179 | DRIMSeq_fits <- lapply(contrasts, function(cm) {
180 | dr <- dmTest(d, contrast = cm, verbose = 1)
181 | print(plotPValues(dr, level = level))
182 | dr
183 | })
184 |
185 | DRIMSeq_res <- lapply(DRIMSeq_fits, function(dr) {
186 | results(dr, level = level) %>%
187 | dplyr::mutate(mlog10PValue = -log10(pvalue)) %>%
188 | dplyr::mutate_at(vars(one_of(c("lr", "df", "pvalue",
189 | "adj_pvalue", "mlog10PValue"))),
190 | list(signif3))
191 | })
192 | ```
193 |
194 | ```{r DRIMSeq-result-transcripts, warning = FALSE}
195 | level <- "feature"
196 | DRIMSeq_feature_fits <- lapply(contrasts, function(cm) {
197 | dr <- dmTest(d, contrast = cm, verbose = 1)
198 | print(plotPValues(dr, level = level))
199 | dr
200 | })
201 |
202 | DRIMSeq_feature_res <- lapply(DRIMSeq_feature_fits, function(dr) {
203 | results(dr, level = level) %>%
204 | dplyr::mutate(mlog10PValue = -log10(pvalue)) %>%
205 | dplyr::mutate_at(vars(one_of(c("lr", "df", "pvalue",
206 | "adj_pvalue", "mlog10PValue"))),
207 | list(signif3))
208 | })
209 | ```
210 |
211 | # Write results to text files
212 |
213 | The gene-level results are exported to text files.
214 |
215 | ```{r DRIMSeq-save-result}
216 | if (is(DRIMSeq_res, "data.frame")) {
217 | write.table(DRIMSeq_res %>% dplyr::arrange(pvalue),
218 | file = "DRIMSeq_dtu_results.txt",
219 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
220 | } else {
221 | for (nm in names(DRIMSeq_res)) {
222 | write.table(DRIMSeq_res[[nm]] %>% dplyr::arrange(pvalue),
223 | file = paste0("DRIMSeq_dtu_results_", nm, ".txt"),
224 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
225 | }
226 | }
227 | ```
228 |
229 | # Output results as `SingleCellExperiment` object
230 |
231 | Here, we store the results on the gene level together with the original data.
232 | The result table `DRIMSeq_res` is appended to the `rowData` of the original
233 | gene-level `SummarizedExperiment` object `sg`. For genes that were filtered out,
234 | `NA` values are used in the results. The updated `sg` can be fed to the R
235 | package `iSEE` to perform more exploratory and visual analysis.
236 |
237 | ```{r DRIMSeq-se-gene}
238 | ## add rows (NA) for genes that are filtered out (if any)
239 | DRIMSeq_resA <- lapply(seq_along(DRIMSeq_res), FUN = function(x) {
240 |
241 | # all genes
242 | geneA <- rowData(sg)$gene_id
243 |
244 | # genes that are not filtered out
245 | resX <- DRIMSeq_res[[x]]
246 |
247 | # other characteristics that have been calculated
248 | mexp <- mean_expression(DRIMSeq_fits[[x]]) %>%
249 | dplyr::arrange(match(gene_id, resX$gene_id)) %>%
250 | dplyr::select(-gene_id)
251 | prec <- genewise_precision(DRIMSeq_fits[[x]]) %>%
252 | dplyr::arrange(match(gene_id, resX$gene_id)) %>%
253 | dplyr::select(-gene_id)
254 |
255 | resX <- resX %>%
256 | dplyr::bind_cols(mexp) %>%
257 | dplyr::bind_cols(prec) %>%
258 | dplyr::mutate(contrast = names(DRIMSeq_res)[[x]])
259 | resX$common_precision <- common_precision(DRIMSeq_fits[[x]])
260 | rownames(resX) <- resX$gene_id
261 |
262 | # genes that are filtered out
263 | geneO <- setdiff(geneA, resX$gene_id)
264 |
265 | # results for all genes
266 | if (length(geneO) > 0) {
267 | # create a data frame with values NA as the results of the genes that
268 | # are filtered out
269 | matO <- matrix(NA, nrow = length(geneO),
270 | ncol = ncol(resX),
271 | dimnames = list(geneO,
272 | colnames(resX)))
273 | resO <- data.frame(matO)
274 | resO$gene_id <- geneO
275 |
276 | # combine the result tables
277 | resA <- resO %>%
278 | dplyr::bind_rows(resX) %>%
279 | dplyr::arrange(match(gene_id, geneA)) %>%
280 | dplyr::mutate(contrast = names(DRIMSeq_res)[[x]])
281 |
282 | } else {
283 | resA <- resX %>%
284 | dplyr::arrange(match(gene_id, geneA)) %>%
285 | dplyr::mutate(contrast = names(DRIMSeq_res)[[x]])
286 | }
287 |
288 | # round numeric columns to 3 significant digits
289 | resA <- resA %>%
290 | dplyr::mutate_if(is.numeric, signif3)
291 |
292 | # use gene column as rownames
293 | rownames(resA) <- resA$gene_id
294 |
295 | # convert to DataFrame
296 | resA <- S4Vectors::DataFrame(resA)
297 | return(resA)
298 | })
299 | names(DRIMSeq_resA) <- names(DRIMSeq_res)
300 |
301 | ## Put the result tables in rowData
302 | for (i in seq_along(DRIMSeq_resA)) {
303 | nam <- names(DRIMSeq_resA)[i]
304 | namI <- paste("DRIMSeq:", nam, sep = "")
305 | stopifnot(all(rowData(sg)$gene_id == rownames(DRIMSeq_resA[[i]])))
306 | rowData(sg)[[namI]] <- DRIMSeq_resA[[i]]
307 | }
308 | ```
309 |
310 | Here, we store the results on the transcript level together with the original
311 | data. The result table `DRIMSeq_feature_res` is appended to the `rowData` of the
312 | original transcript-level `SummarizedExperiment` object `st`. For transcripts
313 | that were filtered out, `NA` values are used in the results. The updated `st`
314 | can be fed to the R package `iSEE` to perform more exploratory and visual
315 | analysis.
316 |
317 | ```{r DRIMSeq-se-tx}
318 | ## add rows (NA) for transcripts that are filtered out (if any)
319 | DRIMSeq_resB <- lapply(seq_along(DRIMSeq_feature_res), FUN = function(x) {
320 |
321 |   # all transcripts
322 | txA <- rowData(st)$tx_id
323 |
324 |   # transcripts that are not filtered out
325 | resX <- DRIMSeq_feature_res[[x]]
326 |
327 | prop <- proportions(DRIMSeq_feature_fits[[x]]) %>%
328 | dplyr::arrange(match(feature_id, resX$feature_id)) %>%
329 | dplyr::select(-c(gene_id, feature_id))
330 | colnames(prop) <- paste("proportion", colnames(prop), sep = "_")
331 |
332 | coef <- coefficients(DRIMSeq_feature_fits[[x]]) %>%
333 | dplyr::arrange(match(feature_id, resX$feature_id)) %>%
334 | dplyr::select(-c(gene_id, feature_id))
335 | colnames(coef) <- paste("coef", colnames(coef), sep = "_")
336 |
337 | resX <- resX %>%
338 | dplyr::bind_cols(prop) %>%
339 | dplyr::bind_cols(coef) %>%
340 | dplyr::mutate(contrast = names(DRIMSeq_res)[[x]])
341 |
342 | rownames(resX) <- resX$feature_id
343 |
344 |   # transcripts that are filtered out
345 | txO <- setdiff(txA, resX$feature_id)
346 |
347 |   # results for all transcripts
348 | if (length(txO) > 0) {
349 |     # create a data frame with NA values as the results of the transcripts
350 |     # that are filtered out
351 | matO <- matrix(NA, nrow = length(txO),
352 | ncol = ncol(resX),
353 | dimnames = list(txO,
354 | colnames(resX)))
355 | resO <- data.frame(matO)
356 | resO$feature_id <- txO
357 |
358 | # combine the result tables
359 | resA <- resO %>%
360 | dplyr::bind_rows(resX) %>%
361 | dplyr::arrange(match(feature_id, txA)) %>%
362 | dplyr::mutate(contrast = names(DRIMSeq_feature_res)[[x]])
363 |
364 | } else {
365 | resA <- resX %>%
366 | dplyr::arrange(match(feature_id, txA)) %>%
367 | dplyr::mutate(contrast = names(DRIMSeq_feature_res)[[x]])
368 | }
369 |
370 | # round numeric columns to 3 significant digits
371 | resA <- resA %>%
372 | dplyr::mutate_if(is.numeric, signif3)
373 |
374 |   # use the feature_id column as rownames
375 | rownames(resA) <- resA$feature_id
376 |
377 | # convert to DataFrame
378 | resA <- S4Vectors::DataFrame(resA)
379 | return(resA)
380 | })
381 | names(DRIMSeq_resB) <- names(DRIMSeq_feature_res)
382 |
383 | ## Put the result tables in rowData
384 | for (i in seq_along(DRIMSeq_resB)) {
385 | nam <- names(DRIMSeq_resB)[i]
386 | namI <- paste("DRIMSeq:", nam, sep = "")
387 | stopifnot(all(rowData(st)$tx_id == rownames(DRIMSeq_resB[[i]])))
388 | rowData(st)[[namI]] <- DRIMSeq_resB[[i]]
389 | }
390 | ```
391 |
392 | The output is saved as a list.
393 |
394 | ```{r DRIMSeq-save-se}
395 | analysis_se <- list(sg = sg, st = st)
396 | saveRDS(analysis_se, file = "DRIMSeq_dtu.rds")
397 | ```
398 |
399 | # Session info
400 |
401 | The analyses above were performed with the following package versions:
402 |
403 | ```{r DRIMSeq-session-info}
404 | sessionInfo()
405 | date()
406 | ```
407 |
408 | # References
409 |
--------------------------------------------------------------------------------
/scripts/check_input.R:
--------------------------------------------------------------------------------
1 |
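  | ## Arguments are passed on the command line as R assignments and evaluated
  | ## below; a hypothetical invocation (placeholder values, most arguments omitted):
  | ##   Rscript check_input.R metafile='metadata.txt' design='~0+celline+treatment' \
  | ##     contrast='cellineN61311-cellineN052611' outFile='check_input.txt' ...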
2 | args <- (commandArgs(trailingOnly = TRUE))
3 | for (i in seq_len(length(args))) {
4 | eval(parse(text = args[[i]]))
5 | }
6 |
7 |
8 | ## ----------------The input arguments------------------------------------------
9 | if (exists("metafile")) {
10 | print(metafile)
11 | } else {
12 | metafile <- NULL
13 | }
14 |
15 | if (exists("organism")) {
16 | print(organism)
17 | } else {
18 | organism <- NULL
19 | }
20 |
21 | if (exists("annotation")) {
22 | print(annotation)
23 | } else {
24 | annotation <- NULL
25 | }
26 |
27 | if (exists("genesets")) {
28 | print(genesets)
29 | } else {
30 | genesets <- NULL
31 | }
32 |
33 | if (exists("outFile")) {
34 | print(outFile)
35 | } else {
36 | outFile <- NULL
37 | }
38 |
39 | if (exists("gtf")) {
40 | print(gtf)
41 | } else {
42 | gtf <- NULL
43 | }
44 |
45 |
46 | if (exists("genome")) {
47 | print(genome)
48 | } else {
49 | genome <- NULL
50 | }
51 |
52 | if (exists("fastqdir")) {
53 | print(fastqdir)
54 | } else {
55 | fastqdir <- NULL
56 | }
57 |
58 | if (exists("fqsuffix")) {
59 | print(fqsuffix)
60 | } else {
61 | fqsuffix <- NULL
62 | }
63 |
64 | if (exists("fqext1")) {
65 | print(fqext1)
66 | } else {
67 | fqext1 <- NULL
68 | }
69 |
70 | if (exists("fqext2")) {
71 | print(fqext2)
72 | } else {
73 | fqext2 <- NULL
74 | }
75 | if (exists("txome")) {
76 | print(txome)
77 | } else {
78 | txome <- NULL
79 | }
80 |
81 | if (exists("run_camera")) {
82 | print(run_camera)
83 | } else {
84 | run_camera <- "DUMMY"
85 | }
86 |
87 |
88 | ## Read metadata
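  | ## (tab-separated, with at least the columns 'names' and 'type', where 'type'
  | ##  is PE or SE -- see example_data/metadata.txt for a minimal example)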
89 | msg0 <- try({
90 | if(!file.exists(metafile)) {
91 |     stop("The metafile, ", metafile, ", does not exist.\n")
92 | } else {
93 | metadata <- read.delim(metafile, header = TRUE, as.is = TRUE, sep = "\t");
94 | if(!all(c("names","type") %in% colnames(metadata)))
95 | stop(paste0("ERROR: 'names' and 'type' columns must exist in ", metafile))
96 | rownames(metadata) <- metadata$names;
97 | utype <- unique(metadata$type);
98 | if (length(utype) == 1 & any(utype %in% c("PE", "SE"))) {
99 | type <- metadata$type
100 | } else{
101 | stop("ERROR: 'type' column in the metadata file must be PE or SE. \n")
102 | }
103 | }
104 | }, silent = TRUE)
105 |
106 |
107 | msg1 <- try({
108 | if (utype == "SE") {
109 | pt <- paste0(metadata$names, ".", fqsuffix, ".gz")
110 | } else {
111 | pt1 <- paste0(metadata$names, "_", fqext1, ".", fqsuffix, ".gz")
112 | pt2 <- paste0(metadata$names, "_", fqext2, ".", fqsuffix, ".gz")
113 | pt <- c(pt1, pt2)
114 | }
115 | lf <- file.path(fastqdir, pt)
116 | fe <- file.exists(lf)
117 | if (any(!fe)) {
118 | stop(paste0("ERROR: ", paste(lf[!fe], collapse=" "), " are/is not available.\n"))
119 | }
120 | }, silent = TRUE)
121 |
122 | print(lf)
123 | print(fe)
124 |
125 | msg2 <- try({
126 | fe <- file.exists(genome)
127 | if (!fe) {
128 | stop(paste0("ERROR: The 'genome' file, ", genome, ", doesn't exist.\n"))
129 | }
130 | }, silent = TRUE)
131 |
132 | msg3 <- try({
133 | fe <- file.exists(gtf)
134 | if (!fe) {
135 | stop(paste0("ERROR: The 'gtf' file, ", gtf, ", doesn't exist.\n"))
136 | }
137 | }, silent = TRUE)
138 |
139 | msg4 <- try({
140 | fe <- file.exists(txome)
141 | if (!fe) {
142 | stop(paste0("ERROR: The 'txome' file, ", txome, ", doesn't exist.\n"))
143 | }
144 | }, silent = TRUE)
145 |
146 | msg5 <- try({
147 | if (run_camera == "True")
148 | if (require("msigdbr")) {
149 | if (!(gsub("_"," ",organism) %in% msigdbr::msigdbr_show_species()))
150 | stop(paste0("ERROR: '", gsub("_"," ",organism), "' not found in 'msigdbr::msigdbr_show_species()' database; fix the organism or set 'run_camera: False'"))
151 | } else {
152 | stop("Cannot check 'organism': msigdbr package not available; run 'snakemake [--use-conda] setup' before 'snakemake [--use-conda] checkinputs'")
153 | }
154 | }, silent = TRUE)
155 |
156 | msg6 <- try({
157 | if (exists("design")) {
158 | print(design)
159 | } else {
160 | stop("ERROR: no 'design' specified; please specify one in the config file")
161 | }
162 | }, silent = TRUE)
163 |
164 |
165 | msg7 <- try({
166 | if (exists("contrast")) {
167 | contrast <- strsplit(gsub(" ","",contrast), ",")[[1]]
168 | print(contrast)
169 | } else {
170 | stop("ERROR: no 'contrast' specified; please specify one in the config file")
171 | }
172 | }, silent = TRUE)
173 |
174 | msg12 <- try({
175 | if( !(annotation %in% c("Ensembl","Gencode")) )
176 | stop(paste0("ERROR: 'annotation' needs to be (exactly) 'Gencode' or 'Ensembl'; currently: ", annotation))
177 | }, silent = TRUE)
178 |
179 | msg13 <- try({
180 | if (!is.character(design) || length(design) != 1) {
181 | stop("ERROR: 'design' must be a character scalar")
182 | }
183 | }, silent = TRUE)
184 |
185 | msg14 <- try({
186 | if (substr(gsub(" ", "", design), 1, 1) != "~") {
187 | stop("ERROR: the first character of 'design' must be ~")
188 | }
189 | }, silent = TRUE)
190 |
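  | ## Check that every term in the design refers to a metadata column. For a
  | ## hypothetical design '~0 + celline + treatment', the strsplit() below yields
  | ## c('', '0', 'celline', 'treatment'), and each element must be '', '0', '1',
  | ## or a column name of the metadata.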
191 | msg15 <- try({
192 | terms <- strsplit(gsub(" ", "", design), "\\~|\\+|\\:|\\*|\\^|\\-")[[1]]
193 | pres <- terms %in% c("", "0", "1", colnames(metadata))
194 | if (any(!pres))
195 | stop(paste0("ERROR: the following terms in the design are not available in the metadata: ", terms[!pres]))
196 | }, silent = TRUE)
197 |
198 | msg16 <- try({
199 | if (exists("genesets") && run_camera == "True") {
200 | genesets_split <- strsplit(genesets, ",")[[1]]
201 | if( !all(genesets_split %in% c("H",paste0("C",1:7))) )
202 | stop(paste0("ERROR: 'genesets' must be a subset of H,C1,C2,C3,C4,C5,C6,C7; currently ", genesets))
203 | }
204 | }, silent = TRUE)
205 |
206 | ## Define design matrix
207 | msg8 <- try({
208 | des <- model.matrix(as.formula(design), data = metadata)
209 | }, silent = TRUE)
210 | if(is(msg8, "try-error"))
211 | msg8 <- try({
212 | stop("ERROR in 'design' value: ", design)
213 | }, silent=TRUE)
214 |
215 |
216 | # Define contrasts
217 | msg9 <- try({
218 | have_edgeR <<- FALSE
219 | if (require("edgeR")) {
220 | have_edgeR <<- TRUE
221 | contrasts <- as.data.frame(makeContrasts(contrasts = contrast,
222 | levels = des))
223 | } else {
224 | stop("Cannot check 'contrast', since the edgeR package is not available; run 'snakemake [--use-conda] setup' before 'snakemake [--use-conda] checkinputs'")
225 | }
226 | }, silent = TRUE)
227 | if(is(msg9, "try-error") && have_edgeR)
228 | msg9 <- try({
229 | stop("ERROR in specified 'contrast' (n.b., could be due to invalid 'design' specified): ", paste0(contrast, collapse=","))
230 | }, silent=TRUE)
231 |
232 | msgL <- list(msg0, msg1, msg2, msg3, msg4, msg5, msg6, msg13, msg14, msg15, msg7, msg8, msg9, msg12, msg16)
233 | isError <- sapply(msgL, FUN = function(x) {is(x, "try-error")})
234 | msg <- msgL[isError]
235 | print(msg)
236 |
237 | if (length(msg) > 0) {
238 | for(i in seq_len(length(msg))) {
239 | m <- trimws(gsub("Error in try({ :", "", msg[[i]], fixed=TRUE))
240 | capture.output(writeLines(m), file = outFile, append = !(i==1))
241 | }
242 | stars <- paste(strrep("*", 84), "\n", strrep("*", 84), sep="")
243 |   xmsg <- paste("check the error message above and fix the config.yaml or one of its components.", sep="")
244 | capture.output(writeLines(stars), file = outFile, append = TRUE)
245 | capture.output(writeLines(xmsg), file = outFile, append = TRUE)
246 | capture.output(writeLines(stars), file = outFile, append = TRUE)
247 | } else {
248 | mylist <- list("Design matrix" = des, "Contrasts matrix" = contrasts)
249 | capture.output(mylist, file = outFile)
250 | stars <- paste(strrep("*", 19), "\n", strrep("*", 19), sep="")
251 | xmsg <- paste("No errors detected.", sep="")
252 | capture.output(writeLines(stars), file = outFile, append = TRUE)
253 | capture.output(writeLines(xmsg), file = outFile, append = TRUE)
254 | capture.output(writeLines(stars), file = outFile, append = TRUE)
255 | }
256 |
257 |
258 |
259 |
--------------------------------------------------------------------------------
/scripts/custom_iSEE_panels.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(ggplot2)
3 | library(rtracklayer)
4 | library(iSEE)
5 | })
6 |
7 | options(ucscChromosomeNames = FALSE)
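  | ## Import a GTF file and reduce it to an exon-level GRanges with the column
  | ## names (transcript, gene, exon, gene_name) expected by the Gviz-based panel below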
8 | prepareGtf <- function(gtf) {
9 | gtf <- rtracklayer::import(gtf)
10 |
11 | ## Set appropriate column names
12 | idx <- match(c("transcript_id", "gene_id", "exon_id"),
13 | colnames(S4Vectors::mcols(gtf)))
14 | colnames(S4Vectors::mcols(gtf))[idx] <- c("transcript", "gene", "exon")
15 | if (!("gene_name" %in% colnames(S4Vectors::mcols(gtf)))) {
16 | gtf$gene_name <- gtf$gene
17 | }
18 |
19 | ## Keep only exons
20 | gtf <- BiocGenerics::subset(gtf, type == "exon")
21 |
22 | ## Strip version numbers from gene and transcript IDs if they exist
23 | gtf$transcript <- gsub("\\.[0-9]+$", "", gtf$transcript)
24 | gtf$gene <- gsub("\\.[0-9]+$", "", gtf$gene)
25 |
26 | gtf
27 | }
28 |
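  | ## Custom iSEE data panel: plots bigWig coverage tracks and/or gene models with
  | ## Gviz for a selected gene or region. bigwig_files, bigwig_names and
  | ## bigwig_condition are comma-separated strings (see the commented example at
  | ## the bottom of this file).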
29 | customGviz <- function(se, rows, columns, bigwig_files="", bigwig_names="",
30 | bigwig_condition="", granges="",
31 | chr="", start="", end="", showgene="") {
32 | options(ucscChromosomeNames = FALSE)
33 |
34 | ## ---------------------------------------------------------------------- ##
35 | ## Pre-flight checks
36 | ## ---------------------------------------------------------------------- ##
37 | ## Must have at least one of bigwig_files and granges
38 | if (bigwig_files == "" && granges == "") {
39 | return(NULL)
40 | }
41 |
42 | ## If no names are given, assign names to bigwig files
43 | if (bigwig_files != "" && bigwig_names == "") {
44 | bigwig_names <- paste(paste0("S", seq_along(strsplit(bigwig_files, ",")[[1]])),
45 | collapse = ",")
46 | }
47 |
48 | ## If granges file does not exist, don't show annotation
49 | if (!file.exists(granges)) {
50 | granges <- ""
51 | }
52 |
53 | ## If granges file does not exist, the viewing region must be set
54 | if (granges == "" && (chr == "" || start == "" || end == "")) {
55 | return(NULL)
56 | }
57 |
58 | ## Convert start and end positions to numeric values
59 | if (start != "") {
60 | start <- as.numeric(start)
61 | }
62 | if (end != "") {
63 | end <- as.numeric(end)
64 | }
65 |
66 | ## ---------------------------------------------------------------------- ##
67 | ## Prepare the annotation
68 | ## ---------------------------------------------------------------------- ##
69 | if (granges != "") {
70 | ## Read the GRanges object
71 | if (caching$granges == granges && !is.null(caching$gr0)) {
72 | gr0 <- caching$gr0
73 | } else {
74 | caching$gr0 <- readRDS(granges)
75 | caching$granges <- granges
76 | gr0 <- caching$gr0
77 | }
78 |
79 | ## Subset the GRanges object depending on the input
80 | ## If rows has length 1, overwrite any provided showgene
81 | if (length(rows) == 1) {
82 | showgene <- rows
83 | }
84 |
85 | ## Strip version number from the gene of interest if it exists
86 | showgene <- gsub("\\.[0-9]+$", "", showgene)
87 |
88 | if (showgene == "" && (chr == "" || is.na(start) || is.na(end))) {
89 | return(NULL)
90 | }
91 |
92 | ## If a gene has been defined (either via rows or via showgene), set the
93 | ## viewing range accordingly
94 | if (showgene != "") {
95 | gr <- BiocGenerics::subset(gr0, tolower(gene) == tolower(showgene) |
96 | tolower(gene_name) == tolower(showgene))
97 | ## Select only one gene if there are many with the same name
98 | gr <- BiocGenerics::subset(gr, gene == gene[1])
99 | chr <- unique(GenomeInfoDb::seqnames(gr))
100 | start <- min(BiocGenerics::start(gr))
101 | end <- max(BiocGenerics::end(gr))
102 | } else {
103 | gr <- gr0[IRanges::overlapsAny(
104 | gr0,
105 | GenomicRanges::GRanges(seqnames = chr,
106 | ranges = IRanges::IRanges(start = start,
107 | end = end),
108 | strand = "*")), ]
109 | }
110 |
111 | ## Other features in the region
112 | gro <- gr0[IRanges::overlapsAny(
113 | gr0,
114 | GenomicRanges::GRanges(seqnames = chr,
115 | ranges = IRanges::IRanges(start = start,
116 | end = end),
117 | strand = "*"))]
118 | gro <- gro[!(S4Vectors::`%in%`(gro, gr))]
119 |
120 | grtr <- Gviz::GeneRegionTrack(gr, showId = TRUE, col = NULL, fill = "gray80",
121 | name = "Genes", col.title = "black")
122 | grtr2 <- Gviz::GeneRegionTrack(gro, showId = TRUE, col = "black", fill = "white",
123 | name = "", col.title = "black")
124 | } else {
125 | gr <- gro <- grtr <- grtr2 <- NULL
126 | }
127 |
128 | ## ---------------------------------------------------------------------- ##
129 | ## Set title and viewing range
130 | ## ---------------------------------------------------------------------- ##
131 | ## Define the title for the plot
132 | if (showgene != "" && !is.null(gr)) {
133 | if (all(gr$gene == gr$gene_name)) {
134 | plot_title <- unique(gr$gene)
135 | } else {
136 | plot_title <- unique(paste0(gr$gene, " (", gr$gene_name, ")"))
137 | }
138 | } else {
139 | plot_title <- paste0(chr, ":", start, "-", end)
140 | }
141 |
142 | ## Set min and max coord for the plot (add some padding to each side)
143 | minCoord <- start - 0.15*(end - start)
144 | maxCoord <- end + 0.05*(end - start)
145 |
146 | ## ---------------------------------------------------------------------- ##
147 | ## Prepare bigWig files
148 | ## ---------------------------------------------------------------------- ##
149 | ## Reformat bigWig file paths and names (provided to the function as
150 | ## character strings)
151 | if (bigwig_files != "") {
152 | bigwig_files <- strsplit(bigwig_files, ",")[[1]]
153 | bigwig_names <- strsplit(bigwig_names, ",")[[1]]
154 | if (bigwig_condition != "") {
155 | bigwig_condition <- strsplit(bigwig_condition, ",")[[1]]
156 | names(bigwig_condition) <- bigwig_names
157 | }
158 | names(bigwig_files) <- bigwig_names
159 |
160 | ## ---------------------------------------------------------------------- ##
161 | ## Define colors if bigwig_condition is provided
162 | ## ---------------------------------------------------------------------- ##
163 | ## Define colors for coverage tracks
164 | color_list <- rep(c("#DC050C", "#7BAFDE", "#B17BA6", "#F1932D", "#F7EE55",
165 | "#90C987", "#777777", "#E8601C", "#1965B0", "#882E72",
166 | "#F6C141", "#4EB265", "#CAEDAB"),
167 | ceiling(length(unique(bigwig_condition))/13))
168 |
169 | if (length(bigwig_condition) > 1 || bigwig_condition != "") {
170 | usecol <- color_list[match(bigwig_condition,
171 | unique(bigwig_condition))]
172 | } else {
173 | usecol <- rep("gray", length(bigwig_files))
174 | }
175 | names(usecol) <- bigwig_names
176 |
177 | ## ------------------------------------------------------------------ ##
178 | ## Show only selected sample(s)
179 | ## ------------------------------------------------------------------ ##
180 | ## If columns is specified, subset bigwig files
181 | if (!is.null(columns)) {
182 | bigwig_files <- bigwig_files[columns]
183 | bigwig_condition <- bigwig_condition[columns]
184 | usecol <- usecol[columns]
185 | }
186 |
187 | ## ------------------------------------------------------------------ ##
188 | ## Prepare final plot
189 | ## ------------------------------------------------------------------ ##
190 | ## Set up coverage tracks
191 | tracks <- lapply(seq_along(bigwig_files), function(i) {
192 | assign(paste0("covtr", i),
193 | Gviz::DataTrack(range = bigwig_files[i],
194 | type = "histogram",
195 | name = names(bigwig_files)[i],
196 | col.title = "black",
197 | fill = usecol[i],
198 | col = usecol[i],
199 | col.histogram = usecol[i],
200 | fill.histogram = usecol[i]))
201 | })
202 | } else {
203 | tracks <- NULL
204 | }
205 |
206 | ## Add genome axis track
207 | tracks <- c(tracks, Gviz::GenomeAxisTrack(), grtr, grtr2)
208 |
209 | ## Plot tracks
210 | Gviz::plotTracks(tracks, chromosome = chr, from = minCoord,
211 | to = maxCoord, main = plot_title,
212 | transcriptAnnotation = "transcript",
213 | min.width = 0, min.distance = 0, collapse = FALSE)
214 | }
215 |
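  | ## Custom iSEE data panel: volcano plot (logFC vs -log10 p-value) built from
  | ## the per-contrast result columns stored in rowData(se), faceted by contrast.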
216 | customVolcano <- function(se, rows, columns, contrasts) {
217 | contrasts <- strsplit(contrasts, ",")[[1]]
218 | tmp <- do.call(plyr::rbind.fill, lapply(contrasts, function(w) {
219 | x <- data.frame(rowData(se)[, grep(paste0("^", w, ":"),
220 | colnames(rowData(se))),
221 | drop = FALSE], check.names = FALSE)
222 | colnames(x) <- gsub(paste0("^", w, ":"), "", colnames(x))
223 | x$contrast <- w
224 | x$feature <- rownames(x)
225 | x
226 | }))
227 | ggplot(tmp, aes(x = logFC, y = mlog10PValue)) +
228 | geom_point(alpha = 0.3) + facet_grid(~ contrast) +
229 | theme_bw() + ylab("-log10(PValue)")
230 | }
231 |
232 | # Set up a cache for the GRanges object
233 | caching <- new.env()
234 |
235 | # gtf <- prepareGtf("example_data/reference/Homo_sapiens.GRCh38.93.1.1.10M.gtf")
236 | # saveRDS(gtf, file = "example_data/reference/Homo_sapiens.GRCh38.93.1.1.10M.granges.rds")
237 | #
238 | # cdp <- customDataPlotDefaults(sce, 2)
239 | # cdp$Function <- c("customGviz", "customVolcano")
240 | # cdp$Arguments <- c("bigwig_files example_data/output/STARbigwig/SRR1039508_Aligned.sortedByCoord.out.bw,example_data/output/STARbigwig/SRR1039509_Aligned.sortedByCoord.out.bw,example_data/output/STARbigwig/SRR1039512_Aligned.sortedByCoord.out.bw,example_data/output/STARbigwig/SRR1039513_Aligned.sortedByCoord.out.bw\nbigwig_names SRR1039508,SRR1039509,SRR1039512,SRR1039513\nbigwig_condition Untreated,Dexamethasone,Untreated,Dexamethasone\ngranges example_data/reference/Homo_sapiens.GRCh38.93.1.1.10M.granges.rds\nchr 1\nstart 6.1e6\nend 6.2e6\nshowgene DDX11L1",
241 | # "contrasts cellineN61311-cellineN052611")
242 | #
243 | # iSEE(sce,
244 | # customDataArgs = cdp,
245 | # customDataFun = list(customGviz = customGviz,
246 | # customVolcano = customVolcano))
247 |
248 |
--------------------------------------------------------------------------------
/scripts/edgeR_dge.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "edgeR DGE"
3 | author: ""
4 | date: "`r Sys.Date()`"
5 | output:
6 | html_document:
7 | toc: true
8 | toc_float: true
9 | theme: yeti
10 | highlight: tango
11 | code_folding: show
12 | keep_md: true
13 | references:
14 | - id: Robinson2010edgeR
15 | title: edgeR-a Bioconductor package for differential expression analysis of digital gene expression data
16 | author:
17 | - family: Robinson
18 | given: Mark D
19 | - family: McCarthy
20 | given: Davis J
21 | - family: Smyth
22 | given: Gordon K
23 | container-title: Bioinformatics
24 | volume: 26
25 | page: 139-140
26 | type: article-journal
27 | URL: https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btp616
28 | issued:
29 | year: 2010
30 | - id: Robinson2010TMM
31 | title: A scaling normalization method for differential expression analysis of RNA-seq data
32 | author:
33 | - family: Robinson
34 | given: Mark D
35 | - family: Oshlack
36 | given: Alicia
37 | container-title: Genome Biology
38 | volume: 11
39 | page: R25
40 | type: article-journal
41 | URL: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25
42 | issued:
43 | year: 2010
44 | - id: Soneson2016tximport
45 | title: Differential analyses for RNA-seq- transcript-level estimates improve gene-level inferences
46 | author:
47 | - family: Soneson
48 | given: Charlotte
49 | - family: Love
50 | given: Michael I
51 | - family: Robinson
52 | given: Mark D
53 | container-title: F1000Research
54 | volume: 4
55 | page: 1521
56 | type: article-journal
57 | URL: https://f1000research.com/articles/4-1521/v2
58 | issued:
59 | year: 2016
60 | - id: Wu2012camera
61 | title: Camera- a competitive gene set test accounting for inter-gene correlation
62 | author:
63 | - family: Wu
64 | given: Di
65 | - family: Smyth
66 | given: Gordon K
67 | container-title: Nucleic Acids Research
68 | volume: 40
69 | page: e133
70 | type: article-journal
71 | issued:
72 | year: 2012
73 | ---
74 |
75 | ```{r edgeR-setup, include=FALSE}
76 | knitr::opts_chunk$set(echo = TRUE, dev = c("png", "pdf"))
77 | ```
78 |
79 | # Introduction
80 |
81 | Here, we perform differential gene expression analysis with edgeR
82 | [@Robinson2010edgeR] followed by gene set analysis with camera [@Wu2012camera],
83 | based on abundance estimates from Salmon. For more detailed information about each
84 | step, please refer to the
85 | [edgeR user guide](https://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf).
86 |
87 | # Load packages
88 |
89 | ```{r edgeR-load-pkg}
90 | suppressPackageStartupMessages({
91 | library(dplyr)
92 | library(tximport)
93 | library(tximeta)
94 | library(SingleCellExperiment)
95 | library(edgeR)
96 | library(ggplot2)
97 | library(msigdbr)
98 | })
99 | ```
100 |
101 | # Load `SummarizedExperiment` object
102 |
103 | We load the `SummarizedExperiment` objects prepared using `tximeta`, containing
104 | gene- and transcript-level counts and feature lengths. In this report, we will
105 | use the gene-level quantifications.
106 |
107 | ```{r edgeR-print-se}
108 | ## List of SummarizedExperiment objects (gene/transcript level)
109 | se
110 |
111 | ## Get gene-level SummarizedExperiment object
112 | sg <- se$sg
113 | metadata <- colData(sg)
114 |
115 | sg
116 | ```
117 |
118 | # Plot total number of reads per sample
119 |
120 | ```{r edgeR-plot-totalcount}
121 | ggplot(data.frame(totCount = colSums(assay(sg, "counts")),
122 | sample = colnames(assay(sg, "counts")),
123 | stringsAsFactors = FALSE),
124 | aes(x = sample, y = totCount)) + geom_bar(stat = "identity") +
125 | theme_bw() + xlab("") + ylab("Total read count") +
126 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
127 | ```
128 |
129 | # Create DGEList and include average transcript length offsets
130 |
131 | A `DGEList` is the main object `edgeR` requires to perform the DGE analysis. It
132 | is designed to store read counts and associated information. After creating this
133 | object, we add offsets, which are average transcript length correction terms
134 | [@Soneson2016tximport],
135 | and scale them so they are consistent with library sizes (sequencing depth for
136 | each sample).
137 |
138 | Then we calculate normalization factors to scale the raw library sizes and
139 | minimize the log-fold changes between the samples for most genes. Here the
140 | trimmed mean of M-values between each pair of samples (TMM) is used by default
141 | [@Robinson2010TMM].
142 |
143 | Finally we add gene annotation information.
144 |
145 | ```{r edgeR-dge-generate}
146 | dge0 <- tximeta::makeDGEList(sg)
147 |
148 | dge0$genes <- as.data.frame(rowRanges(sg))
149 | ```
150 |
151 | # Calculate logCPMs and add as an assay
152 |
153 | We calculate log-counts per million (CPMs) because they are useful descriptive
154 | measures for the expression level of a gene. Note, however, that the normalized
155 | values are not used for the differential expression analysis. By default, the
156 | normalized library sizes are used in the computation.
157 |
158 | We add the logCPMs to one of the fields (or assay) of the first gene-level
159 | `SummarizedExperiment` object `sg`. At the end of the analysis, we will use this
160 | object again to export the results of all the genes we started with.
161 |
162 | ```{r edgeR-add-logcpm}
163 | logcpms <- edgeR::cpm(dge0, offset = dge0$offset, log = TRUE,
164 | prior.count = 2)
165 | dimnames(logcpms) <- dimnames(dge0$counts)
166 |
167 | stopifnot(all(rownames(logcpms) == rownames(sg)),
168 | all(colnames(logcpms) == colnames(sg)))
169 | assay(sg, "logcpm") <- logcpms
170 | ```
171 |
172 | # Define design
173 |
174 | Next, we specify the design matrix of the experiment, defining which sample
175 | annotations will be taken into account in the statistical modeling.
176 |
177 | ```{r edgeR-define-design}
178 | stopifnot(all(colnames(dge0) == metadata$names))
179 |
180 | print(metadata)
181 | print(design)
182 |
183 | (des <- model.matrix(as.formula(design), data = metadata))
184 | ```
185 |
186 | # Filter out lowly expressed genes
187 |
188 | Next we determine which genes have sufficiently large counts to be retained in
189 | the statistical analysis, and remove the rest. After removing genes, we
190 | recalculate the normalization factors.
191 |
192 | ```{r edgeR-filter-genes}
193 | dim(dge0)
194 | keep <- edgeR::filterByExpr(dge0, design = des)
195 | dge <- dge0[keep, ]
196 | dim(dge)
197 | ```
198 |
199 | # Estimate dispersion and fit QL model
200 |
201 | We model the count data using a quasi-likelihood (QL) negative binomial (NB)
202 | generalized log-linear model, which accounts for gene-specific variability from
203 | both biological and technical sources. Before fitting the model, we estimate
204 | the NB dispersion (overall biological variability across all genes), and the QL
205 | dispersion (gene-specific) using the `estimateDisp()` function.
206 |
207 | It is also good practice to look at the relationship between the biological
208 | coefficient of variation (NB dispersion) and the gene abundance (in logCPMs).
209 |
210 | ```{r edgeR-estimate-disp}
211 | ## Estimate dispersion and fit model
212 | dge <- estimateDisp(dge, design = des)
213 | qlfit <- glmQLFit(dge, design = des)
214 |
215 | ## Plot dispersions
216 | plotBCV(dge)
217 | ```
218 |
219 | # Define contrasts
220 |
221 | Before testing for differences in gene expression, we define the contrasts
222 | we wish to test for. Here we represent the contrasts as a numeric matrix:
223 |
224 | ```{r edgeR-define-contrasts}
225 | print(contrast)
226 | (contrasts <- as.data.frame(makeContrasts(contrasts = contrast, levels = des)))
227 | ```
228 |
229 | # Perform DGE tests
230 |
231 | Now we perform genewise tests for every contrast defined above, and save the
232 | results for every contrast.
233 |
234 | ```{r edgeR-perform-tests}
235 | signif3 <- function(x) signif(x, digits = 3)
236 | edgeR_res <- lapply(contrasts, function(cm) {
237 | qlf <- glmQLFTest(qlfit, contrast = cm)
238 | tt <- topTags(qlf, n = Inf, sort.by = "none")$table
239 | tt %>%
240 | dplyr::mutate(mlog10PValue = -log10(PValue)) %>%
241 | dplyr::mutate_at(vars(one_of(c("logFC", "logCPM", "F",
242 | "PValue", "FDR", "mlog10PValue"))),
243 | list(signif3))
244 | })
245 | ```
246 |
247 | # Make MA plots
248 |
249 | We can visualize the test results by plotting the logCPM (average) vs the logFC,
250 | and coloring genes with an adjusted p-value below 0.05 (or another specified
251 | FDR threshold). A plot is drawn for every contrast.
252 |
253 | ```{r edgeR-ma-plots}
254 | if (is(edgeR_res, "data.frame")) {
255 | print(ggplot(edgeR_res, aes(x = logCPM, y = logFC, color = FDR <= 0.05)) +
256 | geom_point() + theme_bw() +
257 | scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")))
258 | } else {
259 | for (nm in names(edgeR_res)) {
260 | print(ggplot(edgeR_res[[nm]], aes(x = logCPM, y = logFC, color = FDR <= 0.05)) +
261 | geom_point() + theme_bw() +
262 | scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
263 | ggtitle(nm))
264 | }
265 | }
266 | ```
267 |
268 | # Write DGE results to text files
269 |
270 | We export the results into text files that can be opened using any text editor.
271 |
272 | ```{r edgeR-save-results}
273 | ## Write results to text files and make MA plots
274 | if (is(edgeR_res, "data.frame")) {
275 | write.table(edgeR_res %>% dplyr::arrange(PValue) %>%
276 | dplyr::select(-dplyr::any_of("tx_ids")),
277 | file = "edgeR_dge_results.txt",
278 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
279 | } else {
280 | for (nm in names(edgeR_res)) {
281 | write.table(edgeR_res[[nm]] %>% dplyr::arrange(PValue) %>%
282 | dplyr::select(-dplyr::any_of("tx_ids")),
283 | file = paste0("edgeR_dge_results_", nm, ".txt"),
284 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
285 | }
286 | }
287 | ```
288 |
289 | # Output DGE results as list of `SingleCellExperiment` objects
290 |
291 | Here, we store the analysis results with the original data. The results are
292 | appended on the `rowData` of the original gene-level `SummarizedExperiment`
293 | object `sg`. For genes that were filtered out, `NA` values are used in the
294 | result columns. The updated `sg` could be fed to the R package `iSEE` to
295 | perform more exploratory and visual analysis.
296 |
297 | ```{r edgeR-se}
298 | ## add rows (NA) for genes that are filtered out (if any)
299 | edgeR_resA <- lapply(seq_along(edgeR_res), FUN = function(x) {
300 |
301 | ## All genes
302 | geneA <- rowData(sg)$gene_id
303 |
304 | ## Genes that are not filtered out
305 | resX <- edgeR_res[[x]]
306 | resX <- resX %>%
307 | dplyr::select(c("gene_id", "gene_name", "logFC", "logCPM",
308 | "F", "FDR", "PValue", "mlog10PValue"))
309 | rownames(resX) <- resX$gene_id
310 |
311 | ## Genes that are filtered out
312 | geneO <- setdiff(geneA, resX$gene_id)
313 |
314 | ## results for all genes
315 | if (length(geneO) > 0) {
316 | ## create a data frame with values NA as the results of the genes that
317 | ## are filtered out
318 | matO <- matrix(NA, nrow = length(geneO),
319 | ncol = ncol(resX),
320 | dimnames = list(geneO,
321 | colnames(resX)))
322 | resO <- data.frame(matO)
323 | resO$gene_id <- geneO
324 | resO$gene_name <- rowData(sg)$gene_name[match(geneO, rowData(sg)$gene_id)]
325 |
326 | ## Combine the result tables
327 | resA <- resO %>%
328 | dplyr::bind_rows(resX) %>%
329 | dplyr::arrange(match(gene_id, geneA)) %>%
330 | dplyr::mutate(contrast = names(edgeR_res)[[x]])
331 | } else {
332 | resA <- resX %>%
333 | dplyr::arrange(match(gene_id, geneA)) %>%
334 | dplyr::mutate(contrast = names(edgeR_res)[[x]])
335 | }
336 |
337 | ## Use gene column as rownames
338 | rownames(resA) <- paste(resA$gene_id, resA$gene_name, sep = "__")
339 |
340 | ## convert to DataFrame
341 | resA <- S4Vectors::DataFrame(resA)
342 | return(resA)
343 | })
344 | names(edgeR_resA) <- names(edgeR_res)
345 |
346 | ## Put the result tables in rowData
347 | for (i in seq_along(edgeR_resA)) {
348 | nam <- names(edgeR_resA)[i]
349 | namI <- paste("edgeR:", nam, sep = "")
350 | stopifnot(all(rownames(sg) == rownames(edgeR_resA[[i]])))
351 | rowData(sg)[[namI]] <- edgeR_resA[[i]]
352 | }
353 | ```
354 |
355 | The output is saved as a list. Compared to the input data `se`, the element `sg`
356 | is updated and `st` stays the same.
357 |
358 | ```{r edgeR-save-se}
359 | analysis_se <- list(sg = sg, st = se$st)
360 | saveRDS(analysis_se, file = "edgeR_dge.rds")
361 | ```
362 |
363 |
364 | ```{r check-gene_names-column, eval = !is.null(genesets), include = FALSE}
365 | if(!("gene_name" %in% colnames(rowData(sg)))) {
366 | genesets <- NULL
367 | }
368 | ```
369 |
370 | ```{r camera-text1, echo = FALSE, results = 'asis', eval = !is.null(genesets)}
371 | cat("# Load gene sets
372 |
373 | We will use `camera` to perform an enrichment analysis for a collection of
374 | gene sets from the [mSigDB](http://software.broadinstitute.org/gsea/msigdb),
375 | packaged in the `msigdbr` R package. Here, we load the gene set definitions
376 | and select which ones to include in the analysis.")
377 | ```
378 |
379 | ```{r camera-load-genesets, eval = !is.null(genesets), include = !is.null(genesets)}
380 | ## Retrieve gene sets and combine in a tibble
381 | m_df <- bind_rows(lapply(genesets,
382 | function(x) msigdbr(species = organism, category = x)))
383 | ```
384 |
385 | ```{r camera-text2, echo= FALSE, results = 'asis', eval = !is.null(genesets)}
386 | cat("# Perform tests
387 |
388 | Next, we perform the gene set analysis. We consider only gene sets where the
389 | number of genes shared with the data set is not too small and not too large.
390 | `camera` is a competitive gene set test that accounts for correlations among
391 | the genes within a gene set.")
392 | ```
393 |
394 | ```{r camera-filter-gene-sets, eval = !is.null(genesets), include = !is.null(genesets)}
395 | minSize <- 3
396 | maxSize <- 500
397 |
398 | ## Get index for genes in each gene set in the DGEList
399 | indexList <- limma::ids2indices(
400 | gene.sets = lapply(split(m_df, f = m_df$gs_name), function(w) w$gene_symbol),
401 | identifiers = dge$genes$gene_name,
402 | remove.empty = TRUE
403 | )
404 |
405 | ## Filter out too small or too large gene sets
406 | gsSizes <- vapply(indexList, length, 0)
407 | indexList <- indexList[gsSizes >= minSize & gsSizes <= maxSize]
408 | ```
409 |
410 | ```{r camera-check-indexList-length, eval = !is.null(genesets), include = FALSE}
411 | ## Check if the index list is empty after filtering
412 | if (length(indexList) == 0){
413 | genesets <- NULL
414 | empty <- TRUE
415 | } else {
416 | empty <- FALSE
417 | }
418 | ```
419 |
420 | ```{r camera-print-empty-list-message, echo = FALSE, results = 'asis', eval = !is.null(genesets) && empty}
421 | cat("**NOTE:**
422 | The index list is empty after filtering and `camera` cannot be run. Either try
423 | different gene categories, try different filtering parameters or disable the
424 | gene set analysis in the `config.yaml` file by setting `run_camera: False`.")
425 | ```
426 |
427 |
428 |
429 | ```{r camera-perform-tests, eval = !is.null(genesets), include = !is.null(genesets)}
430 | camera_res <- lapply(contrasts, function(cm) {
431 | camera(dge, index = indexList, design = des, contrast = cm,
432 | inter.gene.cor = NA)
433 | })
434 | ```
435 |
436 |
437 | ```{r camera-text3, echo = FALSE, results = 'asis', eval = !is.null(genesets)}
438 | cat("# Write gene set analysis results to text files
439 |
440 | The results from `camera` are written to a separate text file for each tested
441 | contrast.")
442 | ```
443 |
444 | ```{r camera-save-results, eval = !is.null(genesets), include = !is.null(genesets)}
445 | ## Write results to text files
446 | if (is(camera_res, "data.frame")) {
447 | write.table(camera_res %>% tibble::rownames_to_column("GeneSet") %>%
448 | dplyr::arrange(PValue),
449 | file = "camera_dge_results.txt",
450 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
451 | } else {
452 | for (nm in names(camera_res)) {
453 | write.table(camera_res[[nm]] %>%
454 | tibble::rownames_to_column("GeneSet") %>%
455 | dplyr::arrange(PValue),
456 | file = paste0("camera_dge_results_", nm, ".txt"),
457 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
458 | }
459 | }
460 | ```
461 |
462 | ```{r camera-text4, echo = FALSE, results = 'asis', eval = !is.null(genesets)}
463 | cat("The `camera` output, as well as the used gene sets, are saved to a file.")
464 | ```
465 |
466 | ```{r camera-save-se, eval = !is.null(genesets), include = !is.null(genesets)}
467 | geneSets <- lapply(indexList, function(i) dge$genes$gene_name[i])
468 | saveRDS(list(cameraRes = camera_res,
469 | geneSets = geneSets), file = "camera_gsa.rds")
470 | ```
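In a later session the saved object can be read back for further exploration; a small sketch, assuming (as above) one result table per contrast (not evaluated):

```{r camera-reload-example, eval = FALSE}
## Not run: reload the camera results and the gene set definitions
gsa <- readRDS("camera_gsa.rds")
names(gsa)                ## "cameraRes" "geneSets"
head(gsa$cameraRes[[1]])  ## results for the first contrast
head(gsa$geneSets[[1]])   ## gene symbols in the first gene set
```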
471 |
472 | # Session info
473 |
474 | The analyses above were performed with the following package versions:
475 |
476 | ```{r edgeR-session-info}
477 | sessionInfo()
478 | date()
479 | ```
480 |
481 | # References
482 |
483 |
--------------------------------------------------------------------------------
/scripts/generate_linkedtxome.R:
--------------------------------------------------------------------------------
1 | args <- (commandArgs(trailingOnly = TRUE))
2 | for (i in seq_len(length(args))) {
3 |   eval(parse(text = args[[i]]))  ## each argument is an R assignment, e.g. output='file.json'
4 | }
5 |
6 | suppressPackageStartupMessages({
7 | library(tximeta)
8 | })
9 |
10 | print(transcriptfasta)
11 | print(salmonidx)
12 | print(gtf)
13 |
14 | print(annotation)
15 | ss <- strsplit(organism, "_")[[1]]  ## e.g. "Homo_sapiens" -> c("Homo", "sapiens")
16 | organism <- paste(ss[1], ss[2])     ## join genus and species: "Homo sapiens"
17 | print(organism)
18 | print(release)
19 | print(build)
20 | print(output)
21 |
22 | makeLinkedTxome(indexDir = dirname(salmonidx),
23 | source = annotation,
24 | organism = organism,
25 | release = release,
26 | genome = build,
27 | fasta = transcriptfasta,
28 | gtf = gtf,
29 | write = TRUE,
30 | jsonFile = output)
31 |
32 | sessionInfo()
33 | date()
34 |
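As with the other helper scripts, arguments are supplied on the command line as R assignments that the `eval(parse(...))` loop at the top evaluates. A sketch of an invocation, with purely illustrative paths and values:

```r
## Not run: example invocation (all paths and values are hypothetical)
## Rscript scripts/generate_linkedtxome.R \
##   "transcriptfasta='reference/transcripts.fa.gz'" \
##   "salmonidx='salmon_index/info.json'" \
##   "gtf='reference/annotation.gtf'" \
##   "annotation='Ensembl'" "organism='Homo_sapiens'" \
##   "release='93'" "build='GRCh38'" "output='linkedtxome.json'"
```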
--------------------------------------------------------------------------------
/scripts/generate_report.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(rmarkdown)
3 | })
4 |
5 | .checkPandoc <- function(ignorePandoc) {
6 | ## Initialize output to TRUE
7 | doRender <- TRUE
8 |
9 | ## First check whether pandoc is available
10 | if (!rmarkdown::pandoc_available()) {
11 | doRender <- FALSE
12 | ## If pandoc is not available, either give a warning or an error,
13 | ## depending on the value of ignorePandoc
14 | if (ignorePandoc) {
15 | ## If ignorePandoc is TRUE, just give a warning
16 | warning("pandoc is not available! ",
17 | "The final report will not be generated.",
18 | immediate. = TRUE)
19 | } else {
20 | ## If ignorePandoc is FALSE, stop
21 | stop("pandoc is not available!")
22 | }
23 | } else {
24 | ## If pandoc is available, check for pandoc-citeproc
25 | ## Only do this if the pandoc version is <2.11, since
26 | ## pandoc-citeproc is not included (or needed) in v2.11 and later.
27 | if (!rmarkdown::pandoc_available(version = "2.11")) {
28 | ## TRUE if the available pandoc version is not 2.11 or newer
29 | ## pandoc-citeproc should be found in the path, or in the
30 | ## same folder as the pandoc executable
31 | if (Sys.which("pandoc-citeproc") == "" &&
32 | !file.exists(file.path(dirname(rmarkdown::pandoc_exec()),
33 | "pandoc-citeproc"))) {
34 | doRender <- FALSE
35 | ## pandoc-citeproc is required, but not found
36 | if (ignorePandoc) {
37 | ## If ignorePandoc is TRUE, just give a warning
38 | warning("pandoc-citeproc is not available! ",
39 | "The final report will not be generated.",
40 | immediate. = TRUE)
41 | } else {
42 | ## If ignorePandoc is FALSE, stop
43 | stop("pandoc-citeproc is not available!")
44 | }
45 | }
46 | }
47 | }
48 | return(doRender)
49 | }
50 |
51 | #' Generate report
52 | #'
53 | #' Generate a report based on a Rmarkdown template file.
54 | #'
55 | #' @param se,gtffile,bigwigdir,genesets,organism,design,contrast,ncores Arguments that
56 | #' are passed to the provided Rmarkdown template
57 | #' @param rmdTemplate Path to a .Rmd template file.
58 | #' @param outputFile File name of the output report. The file name extension
59 | #' must be either \code{.html} or \code{.pdf}, and consistent with the value
60 | #' of \code{outputFormat}.
61 | #' @param outputDir Path to the output directory where the report will be
62 | #' generated.
63 | #' @param outputFormat The format of the output report. Either
64 | #' \code{"html_document"} or \code{"pdf_document"}. The file name extension of
65 | #' \code{outputFile} must be consistent with this choice.
66 | #' @param showCode Logical, whether to display the R code in the report.
67 | #' @param forceOverwrite Logical, whether to force overwrite an existing report
68 | #' with the same name in the output directory.
69 | #' @param knitrProgress Logical, whether to display the progress of \code{knitr}
70 | #' when generating the report.
71 | #' @param quiet Logical, whether to show progress messages.
72 | #' @param ignorePandoc Logical, determines what to do if \code{pandoc} or
73 | #' \code{pandoc-citeproc} cannot be found (see \code{.checkPandoc} above for
74 | #' the exact checks that are performed). If \code{ignorePandoc} is
75 | #' TRUE, only a warning is given. The figures will be generated, but not the
76 | #' final report. If \code{ignorePandoc} is FALSE (default), the execution
77 | #' stops immediately.
78 | #' @param ... Other arguments that will be passed to \code{rmarkdown::render}.
79 | #'
80 | #' @author Charlotte Soneson
81 | #'
82 | #' @details When the function is called, an .Rmd template file will be copied
83 | #' into the output directory, and \code{rmarkdown::render} will be called to
84 | #' generate the final report. If there is already a .Rmd file with the same
85 | #' name in the output directory, the function will raise an error and stop, to
86 | #' avoid overwriting the existing file. The reason for this behaviour is that
87 | #' the copied template in the output directory will be deleted once the report
88 | #' is generated.
89 | #'
90 | #' @export
91 | #'
92 | #' @importFrom rmarkdown render
93 | #' @importFrom tools file_ext file_path_sans_ext
94 | #' @importFrom methods is
95 | #' @import dplyr
96 | #'
97 | #' @return Generates a summary report in the \code{outputDir} directory, and
98 | #' returns (invisibly) the name of the generated report.
99 | #'
100 |
101 | generateReport <- function(se, gtffile = NULL, organism = NULL,
102 | bigwigdir = NULL, design = NULL, genesets = NULL,
103 | contrast = NULL, rmdTemplate, outputFile,
104 | ncores = NULL,
105 | outputDir = "./", outputFormat = NULL,
106 | showCode = FALSE, forceOverwrite = FALSE,
107 | knitrProgress = FALSE, quiet = FALSE,
108 | ignorePandoc = FALSE, ...) {
109 | ## This function was inspired by code from Nicholas Hamilton, provided at
110 | ## http://stackoverflow.com/questions/37097535/generate-report-in-r
111 |
112 | ## If possible, set output format based on the extension of outputFile, if
113 | ## the output format is not provided
114 | if (is.null(outputFormat)) {
115 | if (tools::file_ext(outputFile) == "pdf") {
116 | outputFormat <- "pdf_document"
117 | } else {
118 | outputFormat <- "html_document"
119 | }
120 | }
121 |
122 | ## Check if pandoc and pandoc-citeproc are available
123 | .checkPandoc(ignorePandoc)
124 |
125 | ## ---------------------------------------------------------------------- ##
126 | ## --------------------- Check input arguments -------------------------- ##
127 | ## ---------------------------------------------------------------------- ##
128 |
129 | ## ------------------------ outputFormat -------------------------------- ##
130 | ## Raise an error if outputFormat is not one of the allowed
131 | if (!(outputFormat %in% c("pdf_document", "html_document"))) {
132 | stop("The provided outputFormat is currently not supported. Please ",
133 | "use either 'html_document' or 'pdf_document'.", call. = FALSE)
134 | }
135 |
136 | ## Raise an error if the output format and file name extension don't match
137 | if (outputFormat != paste0(tools::file_ext(outputFile), "_document")) {
138 | stop(paste0("File name extension of outputFile doesn't agree with the ",
139 | "outputFormat, should be .",
140 | gsub("_document$", "", outputFormat)), call. = FALSE)
141 | }
142 |
143 | ## ----------------------- input directory ------------------------------ ##
144 | ## se must be a character string of length 1, and point to an existing rds file
145 | if (!is(se, "character") || length(se) != 1) {
146 | stop("se must be a character string")
147 | }
148 | if (!file.exists(se)) {
149 | stop("The indicated se object does not exist")
150 | }
151 | se <- readRDS(se)
152 |
153 | ## organism
154 | if (!is.null(organism)) {
155 | if (!is(organism, "character") || length(organism) != 1) {
156 | stop("organism must be a character string")
157 | }
158 | organism <- gsub("_", " ", organism)
159 | if (!is.null(genesets) &&
160 | !organism %in% msigdbr::msigdbr_species()$species_name) {
161 | stop("organism must be one of the organisms listed in ",
162 | "msigdbr::msigdbr_show_species()")
163 | }
164 | }
165 |
166 | ## design
167 | if (!is.null(design)) {
168 | if (!is(design, "character") || length(design) != 1) {
169 | stop("design must be a character string")
170 | }
171 | }
172 |
173 | ## contrasts
174 | if (!is.null(contrast)) {
175 | if (!is(contrast, "character")) {
176 | stop("contrast must be a character string")
177 | }
178 | }
179 |
180 | ## ncores
181 | if (!is.null(ncores)) {
182 | if (!is(ncores, "numeric")) {
183 | stop("ncores must be numeric")
184 | }
185 | }
186 |
187 | ## genesets
188 | if (!is.null(genesets)) {
189 | if (!is(genesets, "character")) {
190 | stop("genesets must be a character string")
191 | }
192 | }
193 |
194 | ## gtffile
195 | if (!is.null(gtffile)) {
196 | if (!is(gtffile, "character") || length(gtffile) != 1) {
197 | stop("gtffile must be a character string")
198 | }
199 | if (!file.exists(gtffile)) {
200 | stop("The indicated gtffile does not exist")
201 | }
202 | genemodels <- rtracklayer::import(gtffile)
203 | }
204 |
205 | ## bigwigdir
206 | if (!is.null(bigwigdir)) {
207 | if (!is(bigwigdir, "character") || length(bigwigdir) != 1) {
208 | stop("bigwigdir must be a character string")
209 | }
210 | if (!file.exists(bigwigdir)) {
211 | stop("The indicated bigwigdir does not exist")
212 | }
213 | }
214 |
215 | ## ------------------------- output files ------------------------------- ##
216 | outputReport <- file.path(outputDir, basename(outputFile))
217 | outputRmd <- file.path(
218 | outputDir,
219 | paste0(tools::file_path_sans_ext(basename(outputFile)), ".Rmd"))
220 |
221 | ## Report
222 | if (file.exists(outputReport)) {
223 | if (!forceOverwrite) {
224 | stop("The file ", outputReport,
225 | " already exists. Please remove or rename the file, provide ",
226 | "another value of outputFile, or set forceOverwrite = TRUE.",
227 | call. = FALSE)
228 | } else {
229 | if (!quiet) {
230 | warning("The file ", outputReport,
231 | " already exists and will be overwritten, since ",
232 | "forceOverwrite = TRUE.", immediate. = TRUE,
233 | call. = FALSE)
234 | }
235 | }
236 | }
237 |
238 | ## ------------------------- Rmd template ------------------------------- ##
239 | ## Path to the template file
240 | templateFile <- rmdTemplate
241 | if (file.exists(templateFile)) {
242 | if (file.exists(outputRmd)) {
243 | if (!forceOverwrite) {
244 | stop("There is already an .Rmd file ", outputRmd,
245 | ". Please remove or rename this file, or choose another ",
246 | "outputFile name.", call. = FALSE)
247 | } else {
248 | warning("There is already an .Rmd file ", outputRmd,
249 | ". That file will be renamed with a suffix '_conflicting'",
250 |                 ", the current date and a random number. If you did not ",
251 | "explicitly create this file, it can be removed.",
252 | call. = FALSE)
253 | file.rename(from = outputRmd,
254 | to = paste0(outputRmd, "_conflicting_", Sys.Date(), "_",
255 | round(1e6*runif(1))))
256 | }
257 | }
258 | file.copy(from = templateFile, to = outputRmd, overwrite = FALSE)
259 | } else {
260 | stop("The Rmd template file ", templateFile, " does not exist.",
261 | call. = FALSE)
262 | }
263 |
264 | ## ---------------------------------------------------------------------- ##
265 | ## ----------------------- Process the arguments ------------------------ ##
266 | ## ---------------------------------------------------------------------- ##
267 |
268 | args <- list(...)
269 | args$input <- outputRmd
270 | args$output_format <- outputFormat
271 | args$output_file <- outputFile
272 | args$quiet <- !knitrProgress
273 |
274 | ## ---------------------------------------------------------------------- ##
275 | ## ------------------------ Render the report --------------------------- ##
276 | ## ---------------------------------------------------------------------- ##
277 |
278 | outputFile <- do.call("render", args = args)
279 |
280 | ## ---------------------------------------------------------------------- ##
281 | ## --------------------- Remove temporary file -------------------------- ##
282 | ## ---------------------------------------------------------------------- ##
283 |
284 | file.remove(outputRmd)
285 |
286 | invisible(outputFile)
287 | }
288 |
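A minimal usage sketch (file names, design and contrast values are hypothetical; in the workflow the function is called via `scripts/run_render.R`):

```r
## Not run: example call with illustrative arguments
source("scripts/generate_report.R")
generateReport(se = "output/outputR/tximeta_se.rds",
               organism = "Homo_sapiens",
               design = "~0 + group",
               contrast = "groupA-groupB",
               genesets = c("H", "C2"),
               rmdTemplate = "scripts/edgeR_dge.Rmd",
               outputFile = "edgeR_dge.html",
               outputDir = "output/outputR",
               forceOverwrite = TRUE)
```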
--------------------------------------------------------------------------------
/scripts/install_pkgs.R:
--------------------------------------------------------------------------------
1 | args <- (commandArgs(trailingOnly = TRUE))
2 | for (i in seq_len(length(args))) {
3 | eval(parse(text = args[[i]]))
4 | }
5 |
6 | print(outtxt)
7 | print(annotation)
8 | print(organism)
9 | print(ncores)
10 |
11 | (mirror <- getOption("repos"))  ## print the repository mirror in use
12 |
13 | ## Function to install packages that are not installed
14 | usePackage <- function(pkgs) {
15 |
16 | ## Install BiocManager package
17 | isBiocM <- "BiocManager" %in% installed.packages()[, 1]
18 | if (!isBiocM) {
19 | install.packages("BiocManager", repos = "http://cran.rstudio.com/",
20 | lib = .libPaths()[1])
21 | }
22 |
23 | ## Check that Bioc is new enough
24 | if (BiocManager::version() < '3.12') {
25 | stop("Bioconductor release 3.12 or newer is required ",
26 | "for this version of ARMOR.")
27 | }
28 |
29 | ## Install the other packages
30 | isInstalled <- pkgs %in% installed.packages(lib.loc = .libPaths()[1])[, 1]
31 | BiocManager::install(pkgs[!isInstalled],
32 | update = FALSE, dependencies = TRUE,
33 | lib = .libPaths()[1], Ncpus = as.integer(ncores))
34 |
35 | pkg.load <- lapply(pkgs, FUN = function(x) {
36 | x[!(x %in% installed.packages(.libPaths()[1])[, "Package"])]
37 | })
38 |
39 | if (length(unlist(pkg.load)) == 0) {
40 | cat("All required packages are installed \n")
41 | } else {
42 | cat(unlist(pkg.load), ": failed to install")
43 | }
44 |
45 | ## Test whether packages could be loaded successfully
46 | suppressPackageStartupMessages(
47 | lapply(pkgs, library, character.only = TRUE)
48 | )
49 |
50 | sink(outtxt)
51 | cat("packages loaded successfully: \n",
52 | pkgs[pkgs %in% loadedNamespaces()])
53 | sink()
54 | }
55 |
56 |
57 | paths <- .libPaths()
58 | print(paths)
59 |
60 | ## Install packages
61 | pkgs.use <- c("dplyr", "ggplot2", "tidyr", "remotes", "limma", "edgeR",
62 | "S4Vectors", "DRIMSeq", "SingleCellExperiment", "tximeta",
63 | "msigdbr", "rmarkdown")
64 |
65 |
66 | if (annotation == "Gencode") {
67 | if (organism == "Homo_sapiens") {
68 |     pkgs.extra <- "org.Hs.eg.db"  ## organism annotation package (human)
69 |   } else {
70 |     pkgs.extra <- "org.Mm.eg.db"  ## otherwise assume mouse
71 | }
72 | pkgs.use <- c(pkgs.use, pkgs.extra)
73 | }
74 |
75 |
76 | usePackage(pkgs = pkgs.use)
77 |
78 |
79 | ## Session info
80 | sessionInfo()
81 | date()
82 |
83 |
--------------------------------------------------------------------------------
/scripts/list_packages.R:
--------------------------------------------------------------------------------
1 | args <- (commandArgs(trailingOnly = TRUE))
2 | for (i in seq_len(length(args))) {
3 | eval(parse(text = args[[i]]))
4 | }
5 |
6 | ## List the R version and all packages used in the analyses together with the
7 | ## version, by parsing the files in the "Routdir" directory. The results are
8 | ## written to the "outtxt" text file.
9 |
10 | print(Routdir)
11 | print(outtxt)
12 |
13 | lf <- list.files(Routdir)
14 | all_packages <- c()
15 | for (f in lf) {
16 | x <- readLines(paste0(Routdir, "/", f))
17 | idx1 <- which(x == "> sessionInfo()")
18 | idx2 <- which(x == "other attached packages:")
19 | idx3 <- which(x == "loaded via a namespace (and not attached):")
20 | if (length(idx1) != 0 & length(idx2) != 0 & length(idx3) != 0) {
21 | all_packages <-
22 | unique(c(all_packages, x[idx1 + 1],
23 | do.call(c, lapply((idx2 + 1):(idx3 - 2), function(i) {
24 | grep("\\[", setdiff(setdiff(strsplit(x[i], " ")[[1]], " "), ""),
25 | value = TRUE, invert = TRUE)
26 | }))))
27 | }
28 | }
29 | write.table(sort(all_packages), file = outtxt,
30 | row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")
31 |
--------------------------------------------------------------------------------
/scripts/prepare_shiny.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Shiny preparation"
3 | author: ""
4 | date: "`r Sys.Date()`"
5 | output:
6 | html_document:
7 | toc: true
8 | toc_float: true
9 | theme: yeti
10 | highlight: tango
11 | code_folding: show
12 | keep_md: true
13 | editor_options:
14 | chunk_output_type: console
15 | ---
16 |
17 | ```{r shiny-setup, include=FALSE}
18 | knitr::opts_chunk$set(echo = TRUE, dev = c("png", "pdf"))
19 | ```
20 |
21 | # Introduction
22 |
23 | This script prepares the data objects used by the shiny app.
24 |
25 | # Load packages
26 |
27 | ```{r shiny-load-pkg}
28 | suppressPackageStartupMessages({
29 | library(tibble)
30 | library(dplyr)
31 | library(tidyr)
32 | library(limma)
33 | library(edgeR)
34 | library(reshape2)
35 | library(SingleCellExperiment)
36 | library(S4Vectors)
37 | })
38 | ```
39 |
40 | # Load data
41 |
42 | ```{r shiny-load-data}
43 | options(ucscChromosomeNames = FALSE)
44 | sg <- se$sg  ## gene-level object; 'se' is the list read from the rds file by generateReport()
45 | st <- se$st  ## transcript-level object
46 | ```
47 |
48 | # Gene models
49 |
50 | ```{r shiny-gene-model}
51 | create_genemodels <- function(genemodels) {
52 | idx <- match(c("transcript_id", "gene_id", "exon_id"),
53 | colnames(mcols(genemodels)))
54 | colnames(mcols(genemodels))[idx] <- c("transcript", "gene", "exon")
55 | mcols(genemodels)$symbol <- mcols(genemodels)$transcript
56 | subset(genemodels, type == "exon")
57 | }
58 |
59 | if (!is.null(gtffile)) {
60 | genemodels <- create_genemodels(genemodels)
61 | } else {
62 | genemodels <- NULL
63 | }
64 | ```
65 |
66 | # Vector with bigWig file names
67 |
68 | ```{r shiny-bigwig}
69 | if (!is.null(bigwigdir)) {
70 | bwfiles <- normalizePath(list.files(bigwigdir, pattern = "\\.bw$",
71 | full.names = TRUE))
72 | names(bwfiles) <- gsub("_Aligned.sortedByCoord.out.bw", "", basename(bwfiles))
73 | } else {
74 | bwfiles <- NA
75 | }
76 | ```
77 |
78 | # edgeR - gene-level MDS
79 |
80 | ```{r shiny-MDS}
81 | logcpms <- assay(sg, "logcpm")
82 | mds <- limma::plotMDS(logcpms, top = 500, labels = NULL, pch = NULL,
83 | cex = 1, dim.plot = c(1, 2), ndim = min(7, ncol(logcpms) - 1),
84 | gene.selection = "common",
85 | xlab = NULL, ylab = NULL, plot = FALSE)
86 | if (!is.null(mds$cmdscale.out)) {
87 | ## Bioc 3.12 and earlier
88 | mds <- mds$cmdscale.out
89 | colnames(mds) <- paste0("MDS", seq_len(ncol(mds)))
90 | mds <- as.data.frame(mds) %>% tibble::rownames_to_column(var = "names")
91 | } else {
92 | mds <- data.frame(names = colnames(logcpms),
93 | MDS1 = mds$x,
94 | MDS2 = mds$y)
95 | }
96 | mds <- mds %>%
97 | dplyr::full_join(data.frame(colData(sg)), by = "names")
98 | ```
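As a quick sanity check of the coordinates computed above, the first two MDS dimensions can be plotted with the sample names as labels (a sketch, not evaluated):

```{r shiny-mds-plot-example, eval = FALSE}
## Not run: plot the first two MDS dimensions, labelled by sample name
plot(mds$MDS1, mds$MDS2, pch = 19, xlab = "MDS1", ylab = "MDS2")
text(mds$MDS1, mds$MDS2, labels = mds$names, pos = 3, cex = 0.7)
```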
99 |
100 | # SingleCellExperiment on gene level
101 |
102 | The `rowData` of `sce_gene` includes the gene information and the result tables
103 | from `edgeR` and `DRIMSeq`. Each result table is stored as a column, and the
104 | column name is composed of `edgeR:` or `DRIMSeq:` followed by the name of the
105 | contrast used.
106 |
107 | The `colData` of `sce_gene` stores the sample information, the bigWig file names
108 | and the condition information.
109 |
110 | The multidimensional scaling (MDS) coordinates are stored in `reducedDims`.
111 |
112 | ```{r shiny-sce-gene}
113 | nam <- colData(sg)$names
114 |
115 | ## low dimensional representation
116 | reducedData <- mds %>%
117 | dplyr::arrange(match(names, nam)) %>%
118 | as.data.frame() %>%
119 | dplyr::mutate(namestmp = names) %>%
120 | tibble::column_to_rownames("namestmp") %>%
121 | dplyr::select(-one_of(colnames(colData(sg))))
122 | reducedData <- as.matrix(reducedData)
123 |
124 | ## column data
125 | colData(sg)$bwFiles <- bwfiles[nam]
126 |
127 | sce_gene <- SingleCellExperiment(assays = assays(sg),
128 | rowData = rowData(sg),
129 | colData = colData(sg),
130 | metadata = list(geneModels = genemodels),
131 | reducedDims = SimpleList(MDS = reducedData))
132 | ```
133 |
134 | # SingleCellExperiment on transcript level
135 |
136 | The `rowData` of `sce_tx` includes the information of genes and transcripts,
137 | and the result table on the transcript level from `DRIMSeq`.
138 |
139 | The `colData` of `sce_tx` stores the sample information, the bigWig file names
140 | and condition information.
141 |
142 | ```{r shiny-sce-tx}
143 | nam <- colData(st)$names
144 |
145 | ## column data
146 | colData(st)$bwFiles <- bwfiles[nam]
147 |
148 | sce_tx <- SingleCellExperiment(assays = assays(st),
149 | rowData = rowData(st),
150 | colData = colData(st),
151 | metadata = list(geneModels = genemodels))
152 | ```
153 |
154 | # Output results
155 |
156 | ```{r shiny-save-sce}
157 | saveRDS(list(sce_tx = sce_tx,
158 | sce_gene = sce_gene),
159 | file = "shiny_sce.rds")
160 | ```
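When the shiny app (or any later session) loads this file, the slots described above can be accessed along these lines (a sketch, not evaluated):

```{r shiny-reload-example, eval = FALSE}
## Not run: reload the saved objects and inspect the slots described above
sce <- readRDS("shiny_sce.rds")
names(sce)                           ## "sce_tx" "sce_gene"
reducedDimNames(sce$sce_gene)        ## "MDS"
head(colData(sce$sce_gene)$bwFiles)  ## bigWig file paths (or NA)
head(names(rowData(sce$sce_gene)))   ## includes the edgeR:/DRIMSeq: result columns
```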
161 |
162 | # Session info
163 |
164 | The analyses above were performed with the following package versions:
165 |
166 | ```{r shiny-session-info}
167 | sessionInfo()
168 | date()
169 | ```
170 |
171 |
--------------------------------------------------------------------------------
/scripts/run_render.R:
--------------------------------------------------------------------------------
1 | args <- (commandArgs(trailingOnly = TRUE))
2 | for (i in seq_len(length(args))) {
3 | eval(parse(text = args[[i]]))
4 | }
5 |
6 | ## Mandatory arguments
7 | print(se)
8 | print(rmdtemplate)
9 | print(outputdir)
10 | print(outputfile)
11 |
12 | ## Arguments that are only used for some of the reports
13 | if (exists("organism")) {
14 | print(organism)
15 | } else {
16 | organism <- NULL
17 | }
18 |
19 | if (exists("design")) {
20 | print(design)
21 | } else {
22 | design <- NULL
23 | }
24 |
25 | if (exists("contrast")) {
26 |   contrast <- strsplit(gsub(" ", "", contrast), ",")[[1]]  ## comma-separated contrasts
27 | print(contrast)
28 | } else {
29 | contrast <- NULL
30 | }
31 |
32 | if (exists("genesets")) {
33 |   genesets <- strsplit(gsub(" ", "", genesets), ",")[[1]]  ## comma-separated MSigDB categories
34 | print(genesets)
35 | } else {
36 | genesets <- NULL
37 | }
38 |
39 | if (exists("gtffile")) {
40 | print(gtffile)
41 | } else {
42 | gtffile <- NULL
43 | }
44 |
45 | if (exists("ncores")) {
46 | ncores <- as.numeric(ncores)
47 |   if (is.na(ncores))
48 | ncores <- 1
49 | print(ncores)
50 | } else {
51 | ncores <- 1
52 | }
53 |
54 | if (exists("bigwigdir")) {
55 | bigwigdir <- normalizePath(bigwigdir)
56 | print(bigwigdir)
57 | } else {
58 | bigwigdir <- NULL
59 | }
60 |
61 | source("scripts/generate_report.R")
62 |
63 | generateReport(se = se, organism = organism, gtffile = gtffile,
64 | contrast = contrast, design = design, genesets = genesets,
65 | bigwigdir = bigwigdir, rmdTemplate = rmdtemplate,
66 | outputDir = outputdir, outputFile = outputfile, ncores = ncores,
67 | forceOverwrite = TRUE, showCode = TRUE)
68 |
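The script is meant to be driven from the command line, with each argument given as an R assignment; `contrast` and `genesets` are comma-separated strings that are split above. A sketch with hypothetical values:

```r
## Not run: example invocation (all paths and values are hypothetical)
## Rscript scripts/run_render.R \
##   "se='output/outputR/tximeta_se.rds'" "rmdtemplate='scripts/edgeR_dge.Rmd'" \
##   "outputdir='output/outputR'" "outputfile='edgeR_dge.html'" \
##   "organism='Homo_sapiens'" "design='~0+group'" \
##   "contrast='groupA-groupB,groupA-groupC'" "genesets='H,C2'" \
##   "gtffile='reference/annotation.gtf'" "bigwigdir='output/STARbigwig'" \
##   "ncores='4'"
```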
--------------------------------------------------------------------------------
/scripts/run_tximeta.R:
--------------------------------------------------------------------------------
1 | args <- (commandArgs(trailingOnly = TRUE))
2 | for (i in seq_len(length(args))) {
3 | eval(parse(text = args[[i]]))
4 | }
5 |
6 | suppressPackageStartupMessages({
7 | library(dplyr)
8 | library(tximport)
9 | library(tximeta)
10 | library(SingleCellExperiment)
11 | })
12 |
13 | print(salmondir)
14 | print(json)
15 | print(metafile)
16 | print(outrds)
17 | print(annotation)
18 | print(organism)
19 |
20 | ## Load json linkedTxome
21 | loadLinkedTxome(json)
22 |
23 | ## Read metadata
24 | metadata <- read.delim(metafile, header = TRUE, as.is = TRUE, sep = "\t")
25 |
26 | ## List Salmon directories
27 | salmonfiles <- paste0(salmondir, "/", metadata$names, "/quant.sf")
28 | names(salmonfiles) <- metadata$names
29 |
30 | ## Add file column to metadata and import annotated abundances
31 | ## In transcript level
32 | coldata <- cbind(metadata, files = salmonfiles, stringsAsFactors = FALSE)
33 | st <- tximeta::tximeta(coldata)
34 |
35 | ## Summarize to gene level
36 | sg <- summarizeToGene(st)
37 |
38 | ## If the 'entrezid' column exists and is a list, convert to a vector
39 | if ("entrezid" %in% colnames(rowData(sg)) &&
40 | is(rowData(sg)$entrezid, "list")) {
41 | if (any(vapply(rowData(sg)$entrezid, length, 1) > 1)) {
42 | warning("Some elements of rowData(sg)$entrezid consisted of ",
43 | "more than one object. Only the first one is retained.")
44 | }
45 | rowData(sg)$entrezid <- vapply(
46 | rowData(sg)$entrezid,
47 | function(w) w[[1]],
48 | as(NA, class(rowData(sg)$entrezid[[1]]))
49 | )
50 | }
51 |
52 | ## Add gene_names for Gencode reference
53 | if (annotation == "Gencode") {
54 |   if (organism == "Homo_sapiens") {
55 | library(org.Hs.eg.db)
56 | } else {
57 | library(org.Mm.eg.db)
58 | }
59 | sg <- tximeta::addIds(sg, "SYMBOL", gene = TRUE)
60 | rowData(sg)$gene_name <- rowData(sg)$SYMBOL
61 | }
62 |
63 | ## If rowData(st)$gene_id is a CharacterList, convert it to character to allow
64 | ## the joining below
65 | if (is(rowData(st)$gene_id, "CharacterList")) {
66 | if (any(vapply(rowData(st)$gene_id, length, 1) > 1)) {
67 | warning("Some elements of rowData(st)$gene_id consisted of more than one",
68 | "object. Only the first one is retained.")
69 | }
70 | rowData(st)$gene_id <- vapply(rowData(st)$gene_id, function(w) w[[1]], "")
71 | }
72 |
73 | ## If rowData(st)$tx_id is of class integer, replace it with the tx_name
74 | ## column
75 | if (is(rowData(st)$tx_id, "integer")) {
76 | rowData(st)$tx_id <- rowData(st)$tx_name
77 | }
78 |
79 | ## Add gene information, e.g. gene_name, entrezid, ... (if provided) to
80 | ## transcript-level SE
81 | rowData(st) <- rowData(st) %>%
82 | data.frame() %>%
83 | dplyr::left_join(data.frame(rowData(sg))) %>%
84 | DataFrame()
85 |
86 | ## Change the row names in sg to have geneID__geneSymbol
87 | rownames(sg) <- paste(rowData(sg)$gene_id, rowData(sg)$gene_name, sep = "__")
88 |
89 | ## Coerce the objects from SummarizedExperiment to SingleCellExperiment
90 | st <- as(st, "SingleCellExperiment")
91 | sg <- as(sg, "SingleCellExperiment")
92 |
93 | saveRDS(list(st = st, sg = sg), file = outrds)
94 |
95 | sessionInfo()
96 | date()
97 |
98 |
99 |
--------------------------------------------------------------------------------
/version:
--------------------------------------------------------------------------------
1 | 1.5.10 (2024-09-21)
2 |
--------------------------------------------------------------------------------