├── .Renviron ├── .github └── workflows │ └── snakemake-run.yaml ├── LICENSE ├── NEWS ├── README.md ├── Snakefile ├── config.yaml ├── envs ├── environment.yaml └── environment_R.yaml ├── example_data ├── FASTQ │ ├── SRR1039508_R1.fastq.gz │ ├── SRR1039508_R2.fastq.gz │ ├── SRR1039509_R1.fastq.gz │ ├── SRR1039509_R2.fastq.gz │ ├── SRR1039512_R1.fastq.gz │ ├── SRR1039512_R2.fastq.gz │ ├── SRR1039513_R1.fastq.gz │ └── SRR1039513_R2.fastq.gz ├── README.md ├── metadata.txt └── reference │ ├── Ensembl.GRCh38.93 │ ├── Homo_sapiens.GRCh38.93.1.1.10M.gtf │ ├── Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz │ ├── Homo_sapiens.GRCh38.dna.chromosome.1.1.10M.fa │ └── Homo_sapiens.GRCh38.ncrna.1.1.10M.fa.gz │ └── Gencode28 │ ├── GRCh38.primary_assembly.genome.1.1.10M.fa │ ├── gencode.v28.annotation.1.1.10M.gtf │ └── gencode.v28.transcripts.1.1.10M.fa.gz ├── img ├── ARMOR.png ├── ARMOR.svg ├── benchmark_summary.png ├── dag_nice.svg ├── dag_nice5.png ├── draw.io_run_modes_ARMOR.xml ├── run_modes_ARMOR.png ├── software_management.png └── software_management.svg ├── scripts ├── DRIMSeq_dtu.Rmd ├── check_input.R ├── custom_iSEE_panels.R ├── edgeR_dge.Rmd ├── generate_linkedtxome.R ├── generate_report.R ├── install_pkgs.R ├── list_packages.R ├── prepare_shiny.Rmd ├── run_render.R └── run_tximeta.R └── version /.Renviron: -------------------------------------------------------------------------------- 1 | R_LIBS_USER="" 2 | -------------------------------------------------------------------------------- /.github/workflows/snakemake-run.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | pull_request: 4 | branches: 5 | - master 6 | schedule: 7 | - cron: '0 9 * * 5' 8 | 9 | name: snakemake-run 10 | 11 | jobs: 12 | snakemake-run: 13 | defaults: 14 | run: 15 | shell: bash -l {0} 16 | name: run snakemake 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | os: [macos-13, ubuntu-latest] 22 | include: 23 | - os: macos-13 24 | rversion: '4.4' 25 | - os: ubuntu-latest 26 | rversion: '4.4' 27 | steps: 28 | - name: Check out repository 29 | uses: actions/checkout@v2 30 | 31 | - name: Install R (macOS) 32 | uses: r-lib/actions/setup-r@v2 33 | if: runner.os == 'macOS' 34 | with: 35 | r-version: ${{ matrix.rversion }} 36 | 37 | - name: Check where R is installed 38 | if: runner.os == 'macOS' 39 | run: | 40 | which R 41 | Rscript -e 'print(.libPaths())' 42 | 43 | - name: Set up workflow R for macOS 44 | if: runner.os == 'macOS' 45 | run: | 46 | sed -i .bak 's/useCondaR: True/useCondaR: False/' config.yaml 47 | mkdir -p $HOME/Rlib 48 | echo "R_LIBS_USER=${HOME}/Rlib" > .Renviron 49 | cat .Renviron 50 | Rscript -e "install.packages('BiocManager'); BiocManager::install('GenomeInfoDbData')" 51 | 52 | - name: Set up conda 53 | uses: conda-incubator/setup-miniconda@v3 54 | with: 55 | auto-update-conda: true 56 | channels: bioconda,conda-forge,nodefaults 57 | auto-activate-base: true 58 | miniforge-version: latest 59 | 60 | - name: Install system dependencies (Linux) 61 | if: runner.os == 'Linux' 62 | run: | 63 | sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable --yes 64 | sudo apt-get --yes --force-yes update -qq && \ 65 | sudo apt-get -y install libcairo2-dev libv8-dev \ 66 | libgdal-dev libgeos-dev libgeos++-dev libproj-dev libudunits2-dev \ 67 | libcurl4-openssl-dev libharfbuzz-dev libfribidi-dev libglpk-dev \ 68 | libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev libgit2-dev \ 69 | libxml2-dev libuv1 libuv1-dev 70 | 71 | - name: 
Prepare for running workflow 72 | env: 73 | BIOCONDUCTOR_USE_CONTAINER_REPOSITORY: true 74 | run: | 75 | mkdir -p $HOME/.R 76 | echo -e 'MAKEFLAGS = -j8' > $HOME/.R/Makevars 77 | echo 'options(Ncpus = 8)' > $HOME/.Rprofile 78 | echo 'Sys.setenv(BIOCONDUCTOR_USE_CONTAINER_REPOSITORY=TRUE)' >> $HOME/.Rprofile 79 | 80 | - name: Install Snakemake 81 | run: | 82 | conda create -c conda-forge -c bioconda -n snakemake snakemake 83 | 84 | - name: Run Snakemake 85 | env: 86 | BIOCONDUCTOR_USE_CONTAINER_REPOSITORY: true 87 | run: | 88 | conda activate snakemake 89 | snakemake --use-conda --cores 4 90 | 91 | - name: Upload artifact 92 | uses: actions/upload-artifact@v4 93 | if: failure() 94 | with: 95 | name: all_rout 96 | path: example_data/output/Rout/*.Rout 97 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Charlotte Soneson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | v1.5.10 2 | ====== 3 | o Update R to v4.4.1 4 | 5 | v1.5.9 6 | ====== 7 | o Add r-mass and r-matrix to conda R environment 8 | 9 | v1.5.8 10 | ====== 11 | o Update to R 4.3.2 12 | 13 | v1.5.7 14 | ====== 15 | o Swap deprecated msigdbr function to get supported species 16 | o Convert entrezid column to a vector if required (to allow saving to csv) 17 | o Update to R 4.1.0 18 | 19 | v1.5.6 20 | ====== 21 | o Remove tx_ids column from text file exported by edgeR (addressing https://github.com/csoneson/ARMOR/issues/109) 22 | 23 | v1.5.5 24 | ====== 25 | o Adapt to the new output format of limma::plotMDS 26 | o Adapt pandoc-citeproc check to work with pandoc 2.11 27 | 28 | v1.5.4 29 | ====== 30 | o Specify tbb version 2020.2 based on issue with salmon in conda (https://twitter.com/dpryan79/status/1368116490801717251) 31 | 32 | v1.5.3 33 | ====== 34 | o Update syntax based on suggestions from snakemake --lint 35 | o Update software versions 36 | o Use built-in cpm() function from edgeR 37 | 38 | v1.5.2 39 | ====== 40 | o Fix pandoc version to 2.10 (thanks @carissableker, https://github.com/csoneson/ARMOR/pull/101) 41 | 42 | v1.5.1 43 | ====== 44 | o Specify channel in environment_R.yaml 45 | 46 | v1.5.0 47 | ====== 48 | o Transition to R 4.0 49 | o Use convenience function from tximeta to create DGEList for DE 50 | 51 | v1.4.0 52 | ====== 53 | o Add possibility to provide additional arguments to STAR and Salmon 54 | 55 | v1.3.2 56 | ====== 57 | o Fix small bug in edgeR script, triggered when no gene sets were provided 58 | 59 | v1.3.1 60 | ====== 61 | o Adjust code for the latest version 7.0.1 of the msigdbr R package 62 | o Fix bug to make sure that camera is not run with an empty gene set list 63 | 64 | v1.3.0 65 | ====== 66 | o Change CPM calculations in edgeR_dge.Rmd to account for average transcript length offsets, using the approach from csaw::calculateCPM() 67 | 68 | v1.2.3 69 | ====== 70 | o Rename rules for clarity 71 | o Add benchmarks directive 72 | 73 | v1.2.2 74 | ====== 75 | o Add sticker to README 76 | 77 | v1.2.1 78 | ====== 79 | o Change deprecated pandas.read_table to pandas.read_csv 80 | 81 | v1.2.0 82 | ====== 83 | o Add possibility to use multiple cores for DRIMSeq and R package installation 84 | 85 | v1.1.0 86 | ====== 87 | o Extend checks of inputs 88 | 89 | v1.0.0 90 | ====== 91 | o Initial version 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ARMOR workflow 2 | [![snakemake-run](https://github.com/csoneson/ARMOR/actions/workflows/snakemake-run.yaml/badge.svg)](https://github.com/csoneson/ARMOR/actions) 3 | 4 | **ARMOR** (**A**utomated **R**eproducible **MO**dular **R**NA-seq) is a [Snakemake workflow](https://snakemake.readthedocs.io/en/stable/index.html), aimed at performing a typical RNA-seq workflow in a reproducible, automated, and partially contained manner. It is implemented such that alternative or similar analyses can be added or removed. 5 | 6 | ARMOR consists of a `Snakefile`, a [`conda`](https://conda.io/docs/) environment file (`envs/environment.yaml`), a configuration file (`config.yaml`) and a set of `R` scripts to perform quality control, preprocessing and differential expression analysis of RNA-seq data. The output can be combined with the [`iSEE`](https://bioconductor.org/packages/iSEE/) `R` package to generate a `shiny` application for browsing and sharing the results.
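The sample annotation referenced from `config.yaml` (the `metatxt` entry, e.g. `example_data/metadata.txt`) is a tab-separated file that must contain at least the columns `names` and `type` (SE or PE). As a minimal sketch (not part of the workflow itself), the following mirrors the consistency check performed at the top of the `Snakefile`:

```
import pandas as pd

# Read the sample metadata; replace the path with your own metadata file
samples = pd.read_csv("example_data/metadata.txt", sep="\t")

# The workflow requires at least the 'names' and 'type' columns
if not set(["names", "type"]).issubset(samples.columns):
    raise SystemExit("The metadata file needs 'names' and 'type' columns")

print(samples[["names", "type"]])
```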
7 | 8 | By default, the pipeline performs all the steps shown in the [diagram](img/dag_nice5.png) below. However, you can turn off any combination of the light-colored steps (e.g. `STAR` alignment or `DRIMSeq` analysis) in the `config.yaml` file. 9 | 10 | *Advanced use*: If you prefer other software to run one of the outlined steps (e.g. `DESeq2` over `edgeR`, or `kallisto` over `Salmon`), you can use the software of your preference, provided you supply your own script(s) and change the corresponding lines within the `Snakefile`. If you think your "custom rule" might be of use to a broader audience, let us know by opening an issue. 11 | 12 | 13 | ## Using the ARMOR workflow 14 | 15 | Assuming that snakemake and conda are installed (and your system has the necessary libraries to compile R packages), you can use the following commands on a test dataset: 16 | 17 | ``` 18 | git clone https://github.com/csoneson/ARMOR.git 19 | cd ARMOR && snakemake --use-conda 20 | ``` 21 | 22 | To use the ARMOR workflow on your own data, follow the steps outlined in the [wiki](https://github.com/csoneson/ARMOR/wiki). 23 | 24 | ## Workflow graph 25 | ![DAG](img/dag_nice5.png) 26 | Blue circles are rules run in `R`; orange circles are rules that call software as shell commands. Dashed lines and light-colored circles are optional rules, controlled in `config.yaml`. 27 | 28 | ## Contributors 29 | Current contributors include: 30 | 31 | - [Ruizhu Huang](https://github.com/fionarhuang) 32 | - [Katharina Hembach](https://github.com/khembach) 33 | - [Stephany Orjuela](https://github.com/sorjuela) 34 | - [Mark D. Robinson](https://github.com/markrobinsonuzh) 35 | - [Charlotte Soneson](https://github.com/csoneson) 36 | -------------------------------------------------------------------------------- /Snakefile: -------------------------------------------------------------------------------- 1 | ## Configuration file 2 | import os 3 | if len(config) == 0: 4 | if os.path.isfile("./config.yaml"): 5 | configfile: "./config.yaml" 6 | else: 7 | sys.exit("".join(["Make sure there is a config.yaml file in ", os.getcwd(), 8 | " or specify one with the --configfile commandline parameter."])) 9 | 10 | ## Make sure that all expected variables from the config file are in the config dictionary 11 | configvars = ['annotation', 'organism', 'build', 'release', 'txome', 'genome', 'gtf', 'salmonindex', 'salmonk', 'STARindex', 'readlength', 'fldMean', 'fldSD', 'metatxt', 'design', 'contrast', 'genesets', 'ncores', 'FASTQ', 'fqext1', 'fqext2', 'fqsuffix', 'output', 'useCondaR', 'Rbin', 'run_trimming', 'run_STAR', 'run_DRIMSeq', 'run_camera'] 12 | for k in configvars: 13 | if k not in config: 14 | config[k] = None 15 | 16 | ## If any of the file paths is missing, replace it with "" 17 | def sanitizefile(str): 18 | if str is None: 19 | str = '' 20 | return str 21 | 22 | config['txome'] = sanitizefile(config['txome']) 23 | config['gtf'] = sanitizefile(config['gtf']) 24 | config['genome'] = sanitizefile(config['genome']) 25 | config['STARindex'] = sanitizefile(config['STARindex']) 26 | config['salmonindex'] = sanitizefile(config['salmonindex']) 27 | config['metatxt'] = sanitizefile(config['metatxt']) 28 | 29 | ## Read metadata 30 | if not os.path.isfile(config["metatxt"]): 31 | sys.exit("".join(["Metadata file ", config["metatxt"], " does not exist."])) 32 | 33 | import pandas as pd 34 | samples =
pd.read_csv(config["metatxt"], sep='\t') 35 | 36 | if not set(['names','type']).issubset(samples.columns): 37 | sys.exit("".join(["Make sure 'names' and 'type' are columns in ", config["metatxt"]])) 38 | 39 | 40 | ## Sanitize provided input and output directories 41 | import re 42 | def getpath(str): 43 | if str in ['', '.', './']: 44 | return '' 45 | if str.startswith('./'): 46 | regex = re.compile('^\./?') 47 | str = regex.sub('', str) 48 | if not str.endswith('/'): 49 | str += '/' 50 | return str 51 | 52 | outputdir = getpath(config["output"]) 53 | FASTQdir = getpath(config["FASTQ"]) 54 | 55 | ## Define the conda environment for all rules using R 56 | if config["useCondaR"] == True: 57 | Renv = "envs/environment_R.yaml" 58 | else: 59 | Renv = "envs/environment.yaml" 60 | 61 | ## Define the R binary 62 | Rbin = config["Rbin"] 63 | 64 | ## ------------------------------------------------------------------------------------ ## 65 | ## Target definitions 66 | ## ------------------------------------------------------------------------------------ ## 67 | ## Run all analyses 68 | rule all: 69 | input: 70 | os.path.join(outputdir, "MultiQC", "multiqc_report.html"), 71 | os.path.join(outputdir, "outputR", "shiny_sce.rds") 72 | 73 | rule setup: 74 | input: 75 | os.path.join(outputdir, "Rout", "pkginstall_state.txt"), 76 | os.path.join(outputdir, "Rout", "softwareversions.done") 77 | 78 | ## Install R packages 79 | rule pkginstall: 80 | input: 81 | script = "scripts/install_pkgs.R" 82 | output: 83 | os.path.join(outputdir, "Rout", "pkginstall_state.txt") 84 | params: 85 | flag = config["annotation"], 86 | ncores = config["ncores"], 87 | organism = config["organism"], 88 | Rbin = Rbin 89 | priority: 90 | 50 91 | conda: 92 | Renv 93 | log: 94 | os.path.join(outputdir, "Rout", "install_pkgs.Rout") 95 | benchmark: 96 | os.path.join(outputdir, "benchmarks", "install_pkgs.txt") 97 | shell: 98 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args outtxt='{output}' ncores='{params.ncores}' annotation='{params.flag}' organism='{params.organism}'" {input.script} {log}''' 99 | 100 | ## FastQC on original (untrimmed) files 101 | rule runfastqc: 102 | input: 103 | expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext1"]), "_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()), 104 | expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext2"]), "_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()), 105 | expand(os.path.join(outputdir, "FastQC", "{sample}_fastqc.zip"), sample = samples.names[samples.type == 'SE'].values.tolist()) 106 | 107 | ## Trimming and FastQC on trimmed files 108 | rule runtrimming: 109 | input: 110 | expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext1"]), "_val_1_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()), 111 | expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext2"]), "_val_2_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist()), 112 | expand(os.path.join(outputdir, "FastQC", "{sample}_trimmed_fastqc.zip"), sample = samples.names[samples.type == 'SE'].values.tolist()) 113 | 114 | ## Salmon quantification 115 | rule runsalmonquant: 116 | input: 117 | expand(os.path.join(outputdir, "salmon", "{sample}", "quant.sf"), sample = samples.names.values.tolist()) 118 | 119 | ## STAR alignment 120 | rule runstar: 121 | input: 122 | expand(os.path.join(outputdir, "STAR", 
"{sample}", "{sample}_Aligned.sortedByCoord.out.bam.bai"), sample = samples.names.values.tolist()), 123 | expand(os.path.join(outputdir, "STARbigwig", "{sample}_Aligned.sortedByCoord.out.bw"), sample = samples.names.values.tolist()) 124 | 125 | ## List all the packages that were used by the R analyses 126 | rule listpackages: 127 | log: 128 | os.path.join(outputdir, "Rout", "list_packages.Rout") 129 | params: 130 | Routdir = os.path.join(outputdir, "Rout"), 131 | outtxt = os.path.join(outputdir, "R_package_versions.txt"), 132 | script = "scripts/list_packages.R", 133 | Rbin = Rbin 134 | conda: 135 | Renv 136 | shell: 137 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args Routdir='{params.Routdir}' outtxt='{params.outtxt}'" {params.script} {log}''' 138 | 139 | ## Print the versions of all software packages 140 | rule softwareversions: 141 | output: 142 | touch(os.path.join(outputdir, "Rout", "softwareversions.done")) 143 | log: 144 | os.path.join(outputdir, "logs", "softversions.log") 145 | conda: 146 | "envs/environment.yaml" 147 | shell: 148 | "echo -n 'ARMOR version ' && cat version; " 149 | "salmon --version; trim_galore --version; " 150 | "echo -n 'cutadapt ' && cutadapt --version; " 151 | "fastqc --version; STAR --version; samtools --version; multiqc --version; " 152 | "bedtools --version" 153 | 154 | ## ------------------------------------------------------------------------------------ ## 155 | ## Reference preparation 156 | ## ------------------------------------------------------------------------------------ ## 157 | ## Generate Salmon index from merged cDNA and ncRNA files 158 | rule salmonindex: 159 | input: 160 | txome = config["txome"] 161 | output: 162 | os.path.join(config["salmonindex"], "versionInfo.json") 163 | log: 164 | os.path.join(outputdir, "logs", "salmon_index.log") 165 | benchmark: 166 | os.path.join(outputdir, "benchmarks", "salmon_index.txt") 167 | params: 168 | salmonoutdir = lambda wildcards, output: os.path.dirname(output[0]), ## dirname of first output 169 | anno = config["annotation"], 170 | salmonextraparams = config["additional_salmon_index"] 171 | conda: 172 | "envs/environment.yaml" 173 | shell: 174 | """ 175 | if [ {params.anno} == "Gencode" ]; then 176 | echo 'Salmon version:\n' > {log}; salmon --version >> {log}; 177 | salmon index -t {input.txome} -i {params.salmonoutdir} --gencode {params.salmonextraparams} 178 | 179 | else 180 | echo 'Salmon version:\n' > {log}; salmon --version >> {log}; 181 | salmon index -t {input.txome} -i {params.salmonoutdir} {params.salmonextraparams} 182 | fi 183 | """ 184 | 185 | ## Generate linkedtxome mapping 186 | rule linkedtxome: 187 | input: 188 | txome = config["txome"], 189 | gtf = config["gtf"], 190 | salmonidx = os.path.join(config["salmonindex"], "versionInfo.json"), 191 | script = "scripts/generate_linkedtxome.R", 192 | install = os.path.join(outputdir, "Rout", "pkginstall_state.txt") 193 | log: 194 | os.path.join(outputdir, "Rout", "generate_linkedtxome.Rout") 195 | benchmark: 196 | os.path.join(outputdir, "benchmarks", "generate_linkedtxome.txt") 197 | output: 198 | "".join([config["salmonindex"], ".json"]) 199 | params: 200 | flag = config["annotation"], 201 | organism = config["organism"], 202 | release = str(config["release"]), 203 | build = config["build"], 204 | Rbin = Rbin 205 | conda: 206 | Renv 207 | shell: 208 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args transcriptfasta='{input.txome}' salmonidx='{input.salmonidx}' gtf='{input.gtf}' annotation='{params.flag}' 
organism='{params.organism}' release='{params.release}' build='{params.build}' output='{output}'" {input.script} {log}''' 209 | 210 | ## Generate STAR index 211 | rule starindex: 212 | input: 213 | genome = config["genome"], 214 | gtf = config["gtf"] 215 | output: 216 | os.path.join(config["STARindex"], "SA"), 217 | os.path.join(config["STARindex"], "chrNameLength.txt") 218 | log: 219 | os.path.join(outputdir, "logs", "STAR_index.log") 220 | benchmark: 221 | os.path.join(outputdir, "benchmarks", "STAR_index.txt") 222 | params: 223 | STARindex = lambda wildcards, output: os.path.dirname(output[0]), ## dirname of first output 224 | readlength = config["readlength"], 225 | starextraparams = config["additional_star_index"] 226 | conda: 227 | "envs/environment.yaml" 228 | threads: 229 | config["ncores"] 230 | shell: 231 | "echo 'STAR version:\n' > {log}; STAR --version >> {log}; " 232 | "STAR --runMode genomeGenerate --runThreadN {threads} --genomeDir {params.STARindex} " 233 | "--genomeFastaFiles {input.genome} --sjdbGTFfile {input.gtf} --sjdbOverhang {params.readlength} " 234 | "{params.starextraparams}" 235 | 236 | ## ------------------------------------------------------------------------------------ ## 237 | ## Quality control 238 | ## ------------------------------------------------------------------------------------ ## 239 | ## FastQC, original reads 240 | rule fastqc: 241 | input: 242 | fastq = os.path.join(FASTQdir, "".join(["{sample}.", str(config["fqsuffix"]), ".gz"])) 243 | output: 244 | os.path.join(outputdir, "FastQC", "{sample}_fastqc.zip") 245 | params: 246 | FastQC = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output 247 | log: 248 | os.path.join(outputdir, "logs", "fastqc_{sample}.log") 249 | benchmark: 250 | os.path.join(outputdir, "benchmarks", "fastqc_{sample}.txt") 251 | conda: 252 | "envs/environment.yaml" 253 | threads: 254 | config["ncores"] 255 | shell: 256 | "echo 'FastQC version:\n' > {log}; fastqc --version >> {log}; " 257 | "fastqc -o {params.FastQC} -t {threads} {input.fastq}" 258 | 259 | ## FastQC, trimmed reads 260 | rule fastqctrimmed: 261 | input: 262 | fastq = os.path.join(outputdir, "FASTQtrimmed", "{sample}.fq.gz") 263 | output: 264 | os.path.join(outputdir, "FastQC", "{sample}_fastqc.zip") 265 | params: 266 | FastQC = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output 267 | log: 268 | os.path.join(outputdir, "logs", "fastqc_trimmed_{sample}.log") 269 | benchmark: 270 | os.path.join(outputdir, "benchmarks", "fastqc_trimmed_{sample}.txt") 271 | conda: 272 | "envs/environment.yaml" 273 | threads: 274 | config["ncores"] 275 | shell: 276 | "echo 'FastQC version:\n' > {log}; fastqc --version >> {log}; " 277 | "fastqc -o {params.FastQC} -t {threads} {input.fastq}" 278 | 279 | 280 | 281 | # The config.yaml files determines which steps should be performed 282 | def multiqc_input(wildcards): 283 | input = [] 284 | input.extend(expand(os.path.join(outputdir, "FastQC", "{sample}_fastqc.zip"), sample = samples.names[samples.type == 'SE'].values.tolist())) 285 | input.extend(expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext1"]), "_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist())) 286 | input.extend(expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext2"]), "_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist())) 287 | input.extend(expand(os.path.join(outputdir, "salmon", "{sample}", "quant.sf"), sample 
= samples.names.values.tolist())) 288 | if config["run_trimming"]: 289 | input.extend(expand(os.path.join(outputdir, "FASTQtrimmed", "{sample}_trimmed.fq.gz"), sample = samples.names[samples.type == 'SE'].values.tolist())) 290 | input.extend(expand(os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext1"]), "_val_1.fq.gz"])), sample = samples.names[samples.type == 'PE'].values.tolist())) 291 | input.extend(expand(os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext2"]), "_val_2.fq.gz"])), sample = samples.names[samples.type == 'PE'].values.tolist())) 292 | input.extend(expand(os.path.join(outputdir, "FastQC", "{sample}_trimmed_fastqc.zip"), sample = samples.names[samples.type == 'SE'].values.tolist())) 293 | input.extend(expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext1"]), "_val_1_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist())) 294 | input.extend(expand(os.path.join(outputdir, "FastQC", "".join(["{sample}_", str(config["fqext2"]), "_val_2_fastqc.zip"])), sample = samples.names[samples.type == 'PE'].values.tolist())) 295 | if config["run_STAR"]: 296 | input.extend(expand(os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam.bai"), sample = samples.names.values.tolist())) 297 | return input 298 | 299 | ## Determine the input directories for MultiQC depending on the config file 300 | def multiqc_params(wildcards): 301 | param = [os.path.join(outputdir, "FastQC"), 302 | os.path.join(outputdir, "salmon")] 303 | if config["run_trimming"]: 304 | param.append(os.path.join(outputdir, "FASTQtrimmed")) 305 | if config["run_STAR"]: 306 | param.append(os.path.join(outputdir, "STAR")) 307 | return param 308 | 309 | ## MultiQC 310 | rule multiqc: 311 | input: 312 | multiqc_input 313 | output: 314 | os.path.join(outputdir, "MultiQC", "multiqc_report.html") 315 | params: 316 | inputdirs = multiqc_params, 317 | MultiQCdir = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output 318 | log: 319 | os.path.join(outputdir, "logs", "multiqc.log") 320 | benchmark: 321 | os.path.join(outputdir, "benchmarks", "multiqc.txt") 322 | conda: 323 | "envs/environment.yaml" 324 | shell: 325 | "echo 'MultiQC version:\n' > {log}; multiqc --version >> {log}; " 326 | "multiqc {params.inputdirs} -f -o {params.MultiQCdir}" 327 | 328 | 329 | ## ------------------------------------------------------------------------------------ ## 330 | ## Adapter trimming 331 | ## ------------------------------------------------------------------------------------ ## 332 | # TrimGalore! 333 | rule trimgaloreSE: 334 | input: 335 | fastq = os.path.join(FASTQdir, "".join(["{sample}.", str(config["fqsuffix"]), ".gz"])) 336 | output: 337 | os.path.join(outputdir, "FASTQtrimmed", "{sample}_trimmed.fq.gz") 338 | params: 339 | FASTQtrimmeddir = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output 340 | log: 341 | os.path.join(outputdir, "logs", "trimgalore_{sample}.log") 342 | benchmark: 343 | os.path.join(outputdir, "benchmarks", "trimgalore_{sample}.txt") 344 | conda: 345 | "envs/environment.yaml" 346 | shell: 347 | "echo 'TrimGalore! 
version:\n' > {log}; trim_galore --version >> {log}; " 348 | "trim_galore -q 20 --phred33 --length 20 -o {params.FASTQtrimmeddir} --path_to_cutadapt cutadapt {input.fastq}" 349 | 350 | rule trimgalorePE: 351 | input: 352 | fastq1 = os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext1"]), ".", str(config["fqsuffix"]), ".gz"])), 353 | fastq2 = os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext2"]), ".", str(config["fqsuffix"]), ".gz"])) 354 | output: 355 | os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext1"]), "_val_1.fq.gz"])), 356 | os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext2"]), "_val_2.fq.gz"])) 357 | params: 358 | FASTQtrimmeddir = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output 359 | log: 360 | os.path.join(outputdir, "logs", "trimgalore_{sample}.log") 361 | benchmark: 362 | os.path.join(outputdir, "benchmarks", "trimgalore_{sample}.txt") 363 | conda: 364 | "envs/environment.yaml" 365 | shell: 366 | "echo 'TrimGalore! version:\n' > {log}; trim_galore --version >> {log}; " 367 | "trim_galore -q 20 --phred33 --length 20 -o {params.FASTQtrimmeddir} --path_to_cutadapt cutadapt " 368 | "--paired {input.fastq1} {input.fastq2}" 369 | 370 | ## ------------------------------------------------------------------------------------ ## 371 | ## Salmon abundance estimation 372 | ## ------------------------------------------------------------------------------------ ## 373 | # Estimate abundances with Salmon 374 | rule salmonSE: 375 | input: 376 | index = os.path.join(config["salmonindex"], "versionInfo.json"), 377 | fastq = os.path.join(outputdir, "FASTQtrimmed", "{sample}_trimmed.fq.gz") if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}.", str(config["fqsuffix"]), ".gz"])) 378 | output: 379 | os.path.join(outputdir, "salmon", "{sample}", "quant.sf") 380 | log: 381 | os.path.join(outputdir, "logs", "salmon_{sample}.log") 382 | benchmark: 383 | os.path.join(outputdir, "benchmarks", "salmon_{sample}.txt") 384 | threads: 385 | config["ncores"] 386 | params: 387 | salmonindex = lambda wildcards, input: os.path.dirname(input['index']), ## dirname of index input 388 | salmondir = lambda wildcards, output: os.path.dirname(os.path.dirname(output[0])), ## dirname of first output 389 | salmonextraparams = config["additional_salmon_quant"] 390 | conda: 391 | "envs/environment.yaml" 392 | shell: 393 | "echo 'Salmon version:\n' > {log}; salmon --version >> {log}; " 394 | "salmon quant -i {params.salmonindex} -l A -r {input.fastq} " 395 | "-o {params.salmondir}/{wildcards.sample} -p {threads} {params.salmonextraparams}" 396 | 397 | rule salmonPE: 398 | input: 399 | index = os.path.join(config["salmonindex"], "versionInfo.json"), 400 | fastq1 = os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext1"]), "_val_1.fq.gz"])) if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext1"]), ".", str(config["fqsuffix"]), ".gz"])), 401 | fastq2 = os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext2"]), "_val_2.fq.gz"])) if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext2"]), ".", str(config["fqsuffix"]), ".gz"])) 402 | output: 403 | os.path.join(outputdir, "salmon", "{sample}", "quant.sf") 404 | log: 405 | os.path.join(outputdir, "logs", "salmon_{sample}.log") 406 | benchmark: 407 | os.path.join(outputdir, "benchmarks", "salmon_{sample}.txt") 408 | 
threads: 409 | config["ncores"] 410 | params: 411 | salmonindex = lambda wildcards, input: os.path.dirname(input['index']), ## dirname of index input 412 | salmondir = lambda wildcards, output: os.path.dirname(os.path.dirname(output[0])), ## dirname of first output 413 | salmonextraparams = config["additional_salmon_quant"] 414 | conda: 415 | "envs/environment.yaml" 416 | shell: 417 | "echo 'Salmon version:\n' > {log}; salmon --version >> {log}; " 418 | "salmon quant -i {params.salmonindex} -l A -1 {input.fastq1} -2 {input.fastq2} " 419 | "-o {params.salmondir}/{wildcards.sample} -p {threads} {params.salmonextraparams}" 420 | 421 | ## ------------------------------------------------------------------------------------ ## 422 | ## STAR mapping 423 | ## ------------------------------------------------------------------------------------ ## 424 | ## Genome mapping with STAR 425 | rule starSE: 426 | input: 427 | index = os.path.join(config["STARindex"], "SA"), 428 | fastq = os.path.join(outputdir, "FASTQtrimmed", "{sample}_trimmed.fq.gz") if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}.", str(config["fqsuffix"]), ".gz"])) 429 | output: 430 | os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam") 431 | threads: 432 | config["ncores"] 433 | log: 434 | os.path.join(outputdir, "logs", "STAR_{sample}.log") 435 | benchmark: 436 | os.path.join(outputdir, "benchmarks", "STAR_{sample}.txt") 437 | params: 438 | STARindex = lambda wildcards, input: os.path.dirname(input['index']), ## dirname of index input 439 | STARdir = lambda wildcards, output: os.path.dirname(os.path.dirname(output[0])), ## dirname of first output 440 | starextraparams = config["additional_star_align"] 441 | conda: 442 | "envs/environment.yaml" 443 | shell: 444 | "echo 'STAR version:\n' > {log}; STAR --version >> {log}; " 445 | "STAR --genomeDir {params.STARindex} --readFilesIn {input.fastq} " 446 | "--runThreadN {threads} --outFileNamePrefix {params.STARdir}/{wildcards.sample}/{wildcards.sample}_ " 447 | "--outSAMtype BAM SortedByCoordinate --readFilesCommand gunzip -c " 448 | "{params.starextraparams}" 449 | 450 | rule starPE: 451 | input: 452 | index = os.path.join(config["STARindex"], "SA"), 453 | fastq1 = os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext1"]), "_val_1.fq.gz"])) if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext1"]), ".", str(config["fqsuffix"]), ".gz"])), 454 | fastq2 = os.path.join(outputdir, "FASTQtrimmed", "".join(["{sample}_", str(config["fqext2"]), "_val_2.fq.gz"])) if config["run_trimming"] else os.path.join(FASTQdir, "".join(["{sample}_", str(config["fqext2"]), ".", str(config["fqsuffix"]), ".gz"])) 455 | output: 456 | os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam") 457 | threads: 458 | config["ncores"] 459 | log: 460 | os.path.join(outputdir, "logs", "STAR_{sample}.log") 461 | benchmark: 462 | os.path.join(outputdir, "benchmarks", "STAR_{sample}.txt") 463 | params: 464 | STARindex = lambda wildcards, input: os.path.dirname(input['index']), ## dirname of index input 465 | STARdir = lambda wildcards, output: os.path.dirname(os.path.dirname(output[0])), ## dirname of first output 466 | starextraparams = config["additional_star_align"] 467 | conda: 468 | "envs/environment.yaml" 469 | shell: 470 | "echo 'STAR version:\n' > {log}; STAR --version >> {log}; " 471 | "STAR --genomeDir {params.STARindex} --readFilesIn {input.fastq1} {input.fastq2} " 
472 | "--runThreadN {threads} --outFileNamePrefix {params.STARdir}/{wildcards.sample}/{wildcards.sample}_ " 473 | "--outSAMtype BAM SortedByCoordinate --readFilesCommand gunzip -c " 474 | "{params.starextraparams}" 475 | 476 | ## Index bam files 477 | rule bamindex: 478 | input: 479 | bam = os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam") 480 | output: 481 | os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam.bai") 482 | log: 483 | os.path.join(outputdir, "logs", "samtools_index_{sample}.log") 484 | benchmark: 485 | os.path.join(outputdir, "benchmarks", "samtools_index_{sample}.txt") 486 | conda: 487 | "envs/environment.yaml" 488 | shell: 489 | "echo 'samtools version:\n' > {log}; samtools --version >> {log}; " 490 | "samtools index {input.bam}" 491 | 492 | ## Convert BAM files to bigWig 493 | rule bigwig: 494 | input: 495 | bam = os.path.join(outputdir, "STAR", "{sample}", "{sample}_Aligned.sortedByCoord.out.bam"), 496 | chrl = os.path.join(config["STARindex"], "chrNameLength.txt") 497 | output: 498 | os.path.join(outputdir, "STARbigwig", "{sample}_Aligned.sortedByCoord.out.bw") 499 | params: 500 | STARbigwigdir = lambda wildcards, output: os.path.dirname(output[0]) ## dirname of first output 501 | log: 502 | os.path.join(outputdir, "logs", "bigwig_{sample}.log") 503 | benchmark: 504 | os.path.join(outputdir, "benchmarks", "bigwig_{sample}.txt") 505 | conda: 506 | "envs/environment.yaml" 507 | shell: 508 | "echo 'bedtools version:\n' > {log}; bedtools --version >> {log}; " 509 | "bedtools genomecov -split -ibam {input.bam} -bg | LC_COLLATE=C sort -k1,1 -k2,2n > " 510 | "{params.STARbigwigdir}/{wildcards.sample}_Aligned.sortedByCoord.out.bedGraph; " 511 | "bedGraphToBigWig {params.STARbigwigdir}/{wildcards.sample}_Aligned.sortedByCoord.out.bedGraph " 512 | "{input.chrl} {output}; rm -f {params.STARbigwigdir}/{wildcards.sample}_Aligned.sortedByCoord.out.bedGraph" 513 | 514 | ## ------------------------------------------------------------------------------------ ## 515 | ## Transcript quantification 516 | ## ------------------------------------------------------------------------------------ ## 517 | ## tximeta 518 | rule tximeta: 519 | input: 520 | os.path.join(outputdir, "Rout", "pkginstall_state.txt"), 521 | expand(os.path.join(outputdir, "salmon", "{sample}", "quant.sf"), sample = samples.names.values.tolist()), 522 | metatxt = config["metatxt"], 523 | salmonidx = os.path.join(config["salmonindex"], "versionInfo.json"), 524 | json = "".join([config["salmonindex"], ".json"]), 525 | script = "scripts/run_tximeta.R" 526 | output: 527 | os.path.join(outputdir, "outputR", "tximeta_se.rds") 528 | log: 529 | os.path.join(outputdir, "Rout", "tximeta_se.Rout") 530 | benchmark: 531 | os.path.join(outputdir, "benchmarks", "tximeta_se.txt") 532 | params: 533 | salmondir = lambda wildcards, input: os.path.dirname(os.path.dirname(input[1])), ## dirname of second output 534 | flag = config["annotation"], 535 | organism = config["organism"], 536 | Rbin = Rbin 537 | conda: 538 | Renv 539 | shell: 540 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args salmondir='{params.salmondir}' json='{input.json}' metafile='{input.metatxt}' outrds='{output}' annotation='{params.flag}' organism='{params.organism}'" {input.script} {log}''' 541 | 542 | ## ------------------------------------------------------------------------------------ ## 543 | ## Input variable check 544 | ## 
------------------------------------------------------------------------------------ ## 545 | def geneset_param(wildcards): 546 | if config["run_camera"]: 547 | gs = config["genesets"].replace(" ", "") if config["genesets"] is not None else "NOTDEFINED" 548 | return "".join(["genesets='", gs, "'"]) 549 | else: 550 | return "" 551 | 552 | 553 | ## check design matrix and contrasts 554 | rule checkinputs: 555 | input: 556 | "config.yaml", 557 | script = "scripts/check_input.R" 558 | output: 559 | os.path.join(outputdir, "Rout", "check_input.txt") 560 | log: 561 | os.path.join(outputdir, "Rout", "check_input.Rout") 562 | benchmark: 563 | os.path.join(outputdir, "benchmarks", "check_input.txt") 564 | params: 565 | gtf = config["gtf"], 566 | genome = config["genome"], 567 | txome = config["txome"], 568 | fastqdir = config["FASTQ"], 569 | metatxt = config["metatxt"], 570 | design = config["design"].replace(" ", "") if config["design"] is not None else "NOTDEFINED", 571 | contrast = config["contrast"].replace(" ", "") if config["contrast"] is not None else "NOTDEFINED", 572 | annotation = config["annotation"].replace(" ", "") if config["annotation"] is not None else "NOTDEFINED", 573 | genesets = geneset_param, 574 | fqsuffix = str(config["fqsuffix"]), 575 | fqext1 = str(config["fqext1"]), 576 | fqext2 = str(config["fqext2"]), 577 | run_camera = str(config["run_camera"]), 578 | organism = config["organism"], 579 | Rbin = Rbin 580 | conda: 581 | Renv 582 | shell: 583 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args metafile='{params.metatxt}' design='{params.design}' contrast='{params.contrast}' outFile='{output}' gtf='{params.gtf}' genome='{params.genome}' fastqdir='{params.fastqdir}' fqsuffix='{params.fqsuffix}' fqext1='{params.fqext1}' fqext2='{params.fqext2}' txome='{params.txome}' run_camera='{params.run_camera}' organism='{params.organism}' {params.genesets} annotation='{params.annotation}'" {input.script} {log}; 584 | cat {output} 585 | ''' 586 | 587 | 588 | ## ------------------------------------------------------------------------------------ ## 589 | ## Differential expression 590 | ## ------------------------------------------------------------------------------------ ## 591 | rule edgeR: 592 | input: 593 | os.path.join(outputdir, "Rout", "pkginstall_state.txt"), 594 | rds = os.path.join(outputdir, "outputR", "tximeta_se.rds"), 595 | script = "scripts/run_render.R", 596 | template = "scripts/edgeR_dge.Rmd" 597 | output: 598 | html = os.path.join(outputdir, "outputR", "edgeR_dge.html"), 599 | rds = os.path.join(outputdir, "outputR", "edgeR_dge.rds") 600 | params: 601 | directory = lambda wildcards, input: os.path.dirname(input['rds']), ## dirname of rds input 602 | organism = config["organism"], 603 | design = config["design"].replace(" ", "") if config["design"] is not None else "", 604 | contrast = config["contrast"].replace(" ", "") if config["contrast"] is not None else "", 605 | genesets = geneset_param, 606 | Rbin = Rbin 607 | log: 608 | os.path.join(outputdir, "Rout", "run_dge_edgeR.Rout") 609 | benchmark: 610 | os.path.join(outputdir, "benchmarks", "run_dge_edgeR.txt") 611 | conda: 612 | Renv 613 | shell: 614 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args se='{input.rds}' organism='{params.organism}' design='{params.design}' contrast='{params.contrast}' {params.genesets} rmdtemplate='{input.template}' outputdir='{params.directory}' outputfile='edgeR_dge.html'" {input.script} {log}''' 615 | 616 | ## 
------------------------------------------------------------------------------------ ## 617 | ## Differential transcript usage 618 | ## ------------------------------------------------------------------------------------ ## 619 | ## DRIMSeq 620 | rule DRIMSeq: 621 | input: 622 | os.path.join(outputdir, "Rout", "pkginstall_state.txt"), 623 | rds = os.path.join(outputdir, "outputR", "edgeR_dge.rds"), 624 | script = "scripts/run_render.R", 625 | template = "scripts/DRIMSeq_dtu.Rmd" 626 | output: 627 | html = os.path.join(outputdir, "outputR", "DRIMSeq_dtu.html"), 628 | rds = os.path.join(outputdir, "outputR", "DRIMSeq_dtu.rds") 629 | params: 630 | directory = lambda wildcards, input: os.path.dirname(input['rds']), ## dirname of rds input 631 | organism = config["organism"], 632 | ncores = config["ncores"], 633 | design = config["design"].replace(" ", "") if config["design"] is not None else "", 634 | contrast = config["contrast"].replace(" ", "") if config["contrast"] is not None else "", 635 | Rbin = Rbin 636 | log: 637 | os.path.join(outputdir, "Rout", "run_dtu_drimseq.Rout") 638 | benchmark: 639 | os.path.join(outputdir, "benchmarks", "run_dtu_drimseq.txt") 640 | conda: 641 | Renv 642 | threads: 643 | config["ncores"] 644 | shell: 645 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args se='{input.rds}' design='{params.design}' contrast='{params.contrast}' ncores='{params.ncores}' rmdtemplate='{input.template}' outputdir='{params.directory}' outputfile='DRIMSeq_dtu.html'" {input.script} {log}''' 646 | 647 | ## ------------------------------------------------------------------------------------ ## 648 | ## shiny app 649 | ## ------------------------------------------------------------------------------------ ## 650 | def shiny_input(wildcards): 651 | input = [os.path.join(outputdir, "Rout", "pkginstall_state.txt")] 652 | if config["run_STAR"]: 653 | input.extend(expand(os.path.join(outputdir, "STARbigwig", "{sample}_Aligned.sortedByCoord.out.bw"), sample = samples.names.values.tolist())) 654 | return input 655 | 656 | def shiny_params(wildcards): 657 | param = ["".join(["outputdir='", outputdir, "outputR'"])] 658 | if config["run_STAR"]: 659 | param.append("".join(["bigwigdir='", outputdir, "STARbigwig'"])) 660 | return param 661 | 662 | ## shiny 663 | rule shiny: 664 | input: 665 | shiny_input, 666 | rds = os.path.join(outputdir, "outputR", "DRIMSeq_dtu.rds") if config["run_DRIMSeq"] else os.path.join(outputdir, "outputR", "edgeR_dge.rds"), 667 | script = "scripts/run_render.R", 668 | gtf = config["gtf"], 669 | template = "scripts/prepare_shiny.Rmd" 670 | output: 671 | html = os.path.join(outputdir, "outputR", "prepare_shiny.html"), 672 | rds = os.path.join(outputdir, "outputR", "shiny_sce.rds") 673 | params: 674 | p = shiny_params, 675 | Rbin = Rbin 676 | log: 677 | os.path.join(outputdir, "Rout", "prepare_shiny.Rout") 678 | benchmark: 679 | os.path.join(outputdir, "benchmarks", "prepare_shiny.txt") 680 | conda: 681 | Renv 682 | shell: 683 | '''{params.Rbin} CMD BATCH --no-restore --no-save "--args se='{input.rds}' gtffile='{input.gtf}' rmdtemplate='{input.template}' outputfile='prepare_shiny.html' {params.p}" {input.script} {log}''' 684 | 685 | ## ------------------------------------------------------------------------------------ ## 686 | ## Success and failure messages 687 | ## ------------------------------------------------------------------------------------ ## 688 | onsuccess: 689 | print("Success! 
The Snakemake workflow is completed.") 690 | 691 | onerror: 692 | print("Error! The Snakemake workflow aborted.") 693 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | ## Important note: 2 | ## All paths defined in this configuration file must be either absolute or relative to the 3 | ## location of the Snakefile! 4 | 5 | ## Reference annotation details 6 | ##-------------------------------------------------------------------------------------------- 7 | ## Specify "Ensembl" or "Gencode" depending on your choice 8 | annotation: Ensembl 9 | 10 | organism: Homo_sapiens # separate with underscore 11 | build: GRCh38 12 | release: 93 13 | ##-------------------------------------------------------------------------------------------- 14 | 15 | 16 | ## Paths to existing reference files 17 | ##-------------------------------------------------------------------------------------------- 18 | txome: example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz 19 | genome: example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.dna.chromosome.1.1.10M.fa 20 | gtf: example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.93.1.1.10M.gtf 21 | ##-------------------------------------------------------------------------------------------- 22 | 23 | 24 | ## Paths to indexes that will be generated by the workflow 25 | ##-------------------------------------------------------------------------------------------- 26 | salmonindex: example_data/reference/SalmonIndex/Homo_sapiens.GRCh38.93.sidx 27 | #salmonk: 31 28 | STARindex: example_data/reference/STARIndex/Homo_sapiens.GRCh38.93.STAR.idx 29 | ##-------------------------------------------------------------------------------------------- 30 | 31 | ## Additional STAR parameters 32 | ## Here, you can specify any optional parameters for the index building and/or alignment 33 | ## with STAR. The following arguments are automatically populated and should NOT be 34 | ## specified here: 35 | ## Indexing: runMode, runThreadN, genomeDir, genomeFastaFiles, sjdbGTFfile, sjdbOverhang 36 | ## Alignment: runMode, genomeDir, readFilesIn, runThreadN, outFileNamePrefix, outSAMtype, readFilesCommand 37 | ##-------------------------------------------------------------------------------------------- 38 | ## Add or remove parameters inside the "" 39 | additional_star_index: "" 40 | additional_star_align: "" 41 | 42 | ## Additional Salmon parameters 43 | ## Here, you can specify any optional parameters for the index building and/or 44 | ## abundance quantification with Salmon. The following arguments are automatically populated 45 | ## based on the arguments specified elsewhere, and should NOT be specified here: 46 | ## Indexing: transcriptome input file, index directory, gencode flag 47 | ## Quantification: library type, fastq files, index directory, output directory, number of cores 48 | ##-------------------------------------------------------------------------------------------- 49 | ## Add or remove parameters inside the "" 50 | additional_salmon_index: "-k 31" 51 | 52 | ## Add or remove parameters inside the "" 53 | ## We specify the mean and standard deviation of the fragment length distribution, for use with Salmon. 54 | ## This is important to specify for single-end reads. 55 | ## For paired-end reads, these values will define the prior, which is then updated 56 | ## based on the observed fragment lengths. 
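## For example (illustrative values only, not taken from this repository), a single-end
## library with fragments of roughly 200 bp could use:
## additional_salmon_quant: "--seqBias --fldMean 200 --fldSD 20"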
57 | additional_salmon_quant: "--seqBias --gcBias --fldMean 250 --fldSD 25" 58 | ##-------------------------------------------------------------------------------------------- 59 | 60 | 61 | ## Information about the experiment 62 | ##-------------------------------------------------------------------------------------------- 63 | readlength: 63 64 | 65 | ## Path to metadata text file. This file must contain at least the following columns: 66 | ## names: the sample identifiers = the names of the FASTQ files (excluding the _R1/R2.fastq.gz part) 67 | ## type: either SE or PE, indicating whether the sample was analyzed 68 | ## via single-end or paired-end sequencing. 69 | metatxt: example_data/metadata.txt 70 | 71 | ## Variables used for model fitting 72 | ## design: design formula for use with edgeR, camera and DRIMSeq. Must be a string 73 | ## of the form "~ <predictors>" 74 | ## contrast: (comma-separated if multiple) list of contrasts to estimate in edgeR_dge.Rmd 75 | design: "~ 0 + celline" 76 | contrast: cellineN61311-cellineN052611,cellineN052611-cellineN61311 77 | 78 | ## Gene sets used for gene set analysis with camera 79 | ## Comma-separated list of gene set categories to test with camera. 80 | ## Must be a subset of H,C1,C2,C3,C4,C5,C6,C7 81 | ## Only required if the variable "run_camera" is True (see below). 82 | genesets: H,C5 83 | 84 | ## The maximal number of cores to use for FastQC, STAR, Salmon and DRIMSeq. 85 | ## Note that the actual number of cores available to Snakemake is determined by 86 | ## the --cores argument when it is invoked. 87 | ncores: 1 88 | ##--------------------------------------------------------------------------------------------- 89 | 90 | 91 | ## Path to a folder containing gzipped fastq files, and the file suffix (typically, either fastq or fq). 92 | ## If you have paired-end fastq files, you also need to define the extension distinguishing the two read files. 93 | ## More precisely, ARMOR assumes that paired-end fastq files are named 94 | ## <sample>_<fqext1>.<fqsuffix>.gz and <sample>_<fqext2>.<fqsuffix>.gz. 95 | ## Single-end fastq files are supposed to be named 96 | ## <sample>.<fqsuffix>.gz. 97 | ##--------------------------------------------------------------------------------------------- 98 | FASTQ: example_data/FASTQ 99 | fqext1: R1 100 | fqext2: R2 101 | fqsuffix: fastq 102 | ##--------------------------------------------------------------------------------------------- 103 | 104 | 105 | ## Path to a folder that will store the output generated by the workflow. 106 | ## Additional subfolders of this folder will be generated by the workflow. 107 | ## To put output in the current directory, set output to ".".
108 | ##--------------------------------------------------------------------------------------------- 109 | output: example_data/output 110 | ##--------------------------------------------------------------------------------------------- 111 | 112 | ## R setup 113 | ##--------------------------------------------------------------------------------------------- 114 | ## Specify "True" if R should be installed in a conda environment or "False" if you want to use 115 | ## your own R installation (then you have to set the path to your library in the .Renviron file) 116 | useCondaR: True 117 | Rbin: R 118 | ##--------------------------------------------------------------------------------------------- 119 | 120 | ## Conditional conda rules 121 | ##--------------------------------------------------------------------------------------------- 122 | ## Should read trimming, STAR mapping, DRIMSeq analysis and gene set analysis be performed? Set 123 | ## to False if the step is not required. 124 | run_trimming: True 125 | run_STAR: True 126 | run_DRIMSeq: True 127 | run_camera: True 128 | ##--------------------------------------------------------------------------------------------- 129 | -------------------------------------------------------------------------------- /envs/environment.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | dependencies: 5 | - samtools=1.11 6 | - salmon=1.4.0 7 | - star=2.7.7a 8 | - fastqc=0.11.9 9 | - multiqc=1.9 10 | - trim-galore=0.6.6 11 | - cutadapt=3.2 12 | - bedtools=2.30.0 13 | - ucsc-bedgraphtobigwig=377 14 | - pandoc=2.11 15 | - tbb=2020.2 16 | -------------------------------------------------------------------------------- /envs/environment_R.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - r-base=4.4.1 5 | - pandoc=3.4 6 | - r-curl=5.2.1 7 | - r-rsqlite=2.3.7 8 | - r-xml2=1.3.6 9 | - r-httpuv=1.6.15 10 | - r-mass=7.3_60.0.1 11 | - r-matrix=1.6_5 12 | -------------------------------------------------------------------------------- /example_data/FASTQ/SRR1039508_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039508_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/FASTQ/SRR1039508_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039508_R2.fastq.gz -------------------------------------------------------------------------------- /example_data/FASTQ/SRR1039509_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039509_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/FASTQ/SRR1039509_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039509_R2.fastq.gz -------------------------------------------------------------------------------- 
/example_data/FASTQ/SRR1039512_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039512_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/FASTQ/SRR1039512_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039512_R2.fastq.gz -------------------------------------------------------------------------------- /example_data/FASTQ/SRR1039513_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039513_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/FASTQ/SRR1039513_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/FASTQ/SRR1039513_R2.fastq.gz -------------------------------------------------------------------------------- /example_data/README.md: -------------------------------------------------------------------------------- 1 | ## A small RNA-seq example data set 2 | 3 | This repository contains a small RNA-seq example data set that may be suitable, e.g., for teaching or testing purposes. The original data files come from the study 4 | 5 | > Himes BE, Jiang X, Wagner P, Hu R, Wang Q, Klanderman B, Whitaker RM, Duan Q, Lasky-Su J, Nikolos C, Jester W, Johnson M, Panettieri Jr RA, Tantisira KG, Weiss ST, Lu Q: [RNA-Seq Transcriptome Profiling Identifies CRISPLD2 as a Glucocorticoid Responsive Gene that Modulates Cytokine Function in Airway Smooth Muscle Cells.](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0099625) PLoS ONE 9(6): e99625. https://doi.org/10.1371/journal.pone.0099625 (2014). 6 | 7 | (GEO accession number [GSE52778](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE52778)), in which RNA-Seq was used to characterize the human airway smooth muscle transcriptome at baseline and under asthma treatment conditions. This example data set includes four of the samples from this data set (SRR1039508, SRR1039509, SRR1039512 and SRR1039513), representing Dexamethasone treated and untreated samples from two cell lines (N61311 and N052611). The FASTQ files have been subsetted to include only reads aligning within the first 10M bases of chromosome 1 (of the GRCh38 reference genome). 8 | 9 | In addition to the FASTQ files with the reads, we provide reference annotation files from two sources: Ensembl (release GRCh38.93) and Gencode (v28). For each annotation source, we include a fasta file with the genome sequence, a gtf file with the corresponding gene annotation, and one or more fasta files with transcript sequences. All files are subsetted to include only features from the first 10M bases of chromosome 1. 10 | 11 | ### Gencode reference files 12 | The Gencode reference files were downloaded from [https://www.gencodegenes.org/releases/current.html](https://www.gencodegenes.org/releases/current.html). 
13 | 14 | - reference/Gencode28/GRCh38.primary_assembly.genome.1.1.10M.fa (genome sequence) 15 | - reference/Gencode28/gencode.v28.transcripts.1.1.10M.fa.gz (transcript sequences) 16 | - reference/Gencode28/gencode.v28.annotation.1.1.10M.gtf (gene annotation) 17 | 18 | ### Ensembl reference files 19 | The Ensembl reference files were downloaded from [https://www.ensembl.org/info/data/ftp/index.html](https://www.ensembl.org/info/data/ftp/index.html). 20 | 21 | - reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.dna.chromosome.1.1.10M.fa (genome sequence) 22 | - reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz (cDNA transcript sequences) 23 | - reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.ncrna.1.1.10M.fa.gz (ncRNA transcript sequences) 24 | - reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.93.1.1.10M.gtf (gene annotation) 25 | -------------------------------------------------------------------------------- /example_data/metadata.txt: -------------------------------------------------------------------------------- 1 | names type celline treatment 2 | SRR1039508 PE N61311 Untreated 3 | SRR1039509 PE N61311 Dexamethasone 4 | SRR1039512 PE N052611 Untreated 5 | SRR1039513 PE N052611 Dexamethasone 6 | -------------------------------------------------------------------------------- /example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz -------------------------------------------------------------------------------- /example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.ncrna.1.1.10M.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.ncrna.1.1.10M.fa.gz -------------------------------------------------------------------------------- /example_data/reference/Gencode28/gencode.v28.transcripts.1.1.10M.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/example_data/reference/Gencode28/gencode.v28.transcripts.1.1.10M.fa.gz -------------------------------------------------------------------------------- /img/ARMOR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/img/ARMOR.png -------------------------------------------------------------------------------- /img/ARMOR.svg: -------------------------------------------------------------------------------- 1 | [SVG graphic: the "ARMOR" logo wordmark; vector source omitted] -------------------------------------------------------------------------------- /img/benchmark_summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/img/benchmark_summary.png -------------------------------------------------------------------------------- /img/dag_nice5.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/img/dag_nice5.png -------------------------------------------------------------------------------- /img/draw.io_run_modes_ARMOR.xml: -------------------------------------------------------------------------------- 1 | [draw.io source for the ARMOR run-modes diagram (img/run_modes_ARMOR.png); compressed XML payload omitted] -------------------------------------------------------------------------------- /img/run_modes_ARMOR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/img/run_modes_ARMOR.png -------------------------------------------------------------------------------- /img/software_management.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csoneson/ARMOR/dc6b902493d4d75ed702426dddcb0f9babd8784a/img/software_management.png -------------------------------------------------------------------------------- /img/software_management.svg: -------------------------------------------------------------------------------- 1 | [SVG decision-tree figure on software management: "Do you want to use conda?" and "Do you want to use a system R installation?" lead to 1a. Using conda to install software (including R), 1b. Using conda to install software + system R, 2. Manually creating a conda environment + system R, or 3. Installing software manually; vector source omitted] -------------------------------------------------------------------------------- /scripts/DRIMSeq_dtu.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "DRIMSeq DTU" 3 | author: "" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | theme: yeti 10 | highlight: tango 11 | code_folding: show 12 | keep_md: true 13 | references: 14 | - id: Nowicka2016DRIMSeq 15 | title: DRIMSeq- a Dirichlet-multinomial framework for multivariate count outcomes in genomics 16 | author: 17 | - family: Nowicka 18 | given: Malgorzata 19 | - family: Robinson 20 | given: Mark D 21 | container-title: F1000Research 22 | volume: 5 23 | page: 1356 24 | type: article-journal 25 | URL: https://f1000research.com/articles/5-1356/v2 26 | issued: 27 | year: 2016 28 | --- 29 | 30 | ```{r DRIMSeq-setup, include=FALSE} 31 | knitr::opts_chunk$set(echo = TRUE, dev = c("png", "pdf")) 32 | ``` 33 | 34 | # Introduction 35 | 36 | This script performs differential transcript usage analysis with DRIMSeq 37 | [@Nowicka2016DRIMSeq], based on abundance estimates from Salmon. It supports 38 | testing one or more contrasts.
For more detailed information of every step, 39 | we refer to the [DRIMSeq vignette](http://bioconductor.org/packages/release/bioc/vignettes/DRIMSeq/inst/doc/DRIMSeq.pdf). 40 | 41 | # Load packages 42 | 43 | ```{r DRIMSeq-load-pkg} 44 | suppressPackageStartupMessages({ 45 | library(dplyr) 46 | library(tximport) 47 | library(tximeta) 48 | library(SingleCellExperiment) 49 | library(edgeR) 50 | library(DRIMSeq) 51 | library(ggplot2) 52 | }) 53 | ``` 54 | 55 | # Load `SummarizedExperiment` object 56 | 57 | We load the `SummarizedExperiment` objects prepared using `tximeta`, containing 58 | gene- and transcript-level counts and feature lengths. In this report, we will 59 | use the transcript-level quantifications. 60 | 61 | ```{r DRIMSeq-print-se} 62 | sg <- se$sg 63 | st <- se$st 64 | st 65 | ``` 66 | 67 | # Plot total number of reads per sample 68 | 69 | ```{r DRIMSeq-plot-totalcount} 70 | ggplot(data.frame(totCount = colSums(assay(sg, "counts")), 71 | sample = colnames(assay(sg, "counts")), 72 | stringsAsFactors = FALSE), 73 | aes(x = sample, y = totCount)) + geom_bar(stat = "identity") + 74 | theme_bw() + xlab("") + ylab("Total read count") + 75 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 76 | ``` 77 | 78 | # Create dmDSdata object 79 | 80 | To create a `dmDSdata` object, which is the container used by `DRIMSeq` to store 81 | feature counts and metadata, we need a `data.frame` containing information about 82 | the samples (`metadata`) and a `data.frame` with counts (`counts`). The 83 | `dmDSdata` object is used to create a data summary plot. 84 | 85 | ```{r DRIMSeq-dmDSdata} 86 | print(contrast) 87 | print(design) 88 | 89 | counts <- data.frame(feature_id = rownames(st), 90 | gene_id = unlist(rowData(st)$gene_id), 91 | assay(st, "counts"), 92 | row.names = NULL, 93 | check.names = FALSE) 94 | 95 | metadata <- data.frame(colData(st)) 96 | metadata <- metadata %>% 97 | dplyr::rename(sample_id = names) 98 | 99 | d <- dmDSdata(counts = counts, samples = metadata) 100 | plotData(d) 101 | ``` 102 | 103 | # Filter 104 | 105 | The genes with low expression levels are filtered out to ensure that the 106 | observed transcript ratios are reliable. A single gene may have many 107 | transcripts, and lowly expressed individual transcripts are removed using 108 | `min_samps_feature_expr`. 109 | 110 | ```{r DRIMSeq-data-filter} 111 | d <- dmFilter(d, min_samps_gene_expr = 3, min_samps_feature_expr = 3, 112 | min_gene_expr = 10, min_feature_expr = 5) 113 | plotData(d) 114 | ``` 115 | 116 | # Define design. 117 | 118 | Here, we specify the design matrix used for the Dirichlet-multinomial model in the later step. 119 | 120 | ```{r DRIMSeq-define-design} 121 | print(samples(d)) 122 | (des <- model.matrix(as.formula(design), data = samples(d))) 123 | ``` 124 | 125 | # Calculate precision 126 | 127 | Computationally, it is more convenient to first estimate the precision before 128 | you fit a Dirichlet-multinomial model to the data. The precision parameters are 129 | estimated using the Cox-Reid adjusted profile likelihood. By default, $10\%$ of 130 | the genes (randomly selected) are used to estimate the initial value (the common 131 | precision). To get reproducible results, a random seed is used. 132 | 133 | To inspect the behavior of the precision estimates, they are plotted against the 134 | mean gene expression. Typically, precision increases for genes with higher mean 135 | expression in RNA-seq data. 
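Note that `DRIMSeq` works with precision rather than dispersion; for the Dirichlet-multinomial model the two are related by dispersion = 1/(1 + precision), so larger precision values correspond to less variable transcript proportions within a gene. A minimal sketch of this conversion (the value below is made up; the actual estimates are computed in the next chunk):

```{r DRIMSeq-precision-dispersion-sketch, eval = FALSE}
## Illustration only (hypothetical value, not a workflow result):
## convert a Dirichlet-multinomial precision estimate into a dispersion.
precision <- 30
dispersion <- 1 / (1 + precision)
dispersion  # ~0.032; smaller values indicate tighter transcript proportions
```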
136 | 137 | ```{r DRIMSeq-calculate-precision} 138 | set.seed(123) 139 | if(ncores > 1) { 140 | bpps <- BiocParallel::MulticoreParam(min(parallel::detectCores(),ncores)) 141 | } else { 142 | bpps <- BiocParallel::SerialParam() 143 | } 144 | 145 | d <- dmPrecision(d, design = des, add_uniform = TRUE, BPPARAM = bpps) 146 | plotPrecision(d) 147 | ``` 148 | 149 | # Fit model 150 | 151 | At the gene level, the maximum likelihood is used to estimate the coefficients 152 | of the Dirichlet-multinomial (DM) regression and the fitted transcript 153 | proportions in each sample. At the transcript level, beta-binomial regression is 154 | applied to each transcript separately. 155 | 156 | ```{r DRIMSeq-fit-model} 157 | d <- dmFit(d, design = des, verbose = 1, add_uniform = TRUE) 158 | ``` 159 | 160 | # Define contrasts. 161 | 162 | The contrasts are defined to do comparisons between specified groups. 163 | 164 | ```{r DRIMSeq-define-contrasts} 165 | print(contrast) 166 | (contrasts <- as.data.frame(makeContrasts(contrasts = contrast, levels = des))) 167 | ``` 168 | 169 | # Perform tests 170 | 171 | The test can be performed on the gene level (`level <- 'gene'`) or the 172 | transcript level (`level <- 'feature'`) using the likelihood ratio test. The 173 | results are stored as `DRIMSeq_res` and `DRIMSeq_feature_res` for the gene and 174 | the transcript level, respectively. 175 | 176 | ```{r DRIMSeq-result-genes, warning = FALSE} 177 | level <- "gene" 178 | signif3 <- function(x) signif(x, digits = 3) 179 | DRIMSeq_fits <- lapply(contrasts, function(cm) { 180 | dr <- dmTest(d, contrast = cm, verbose = 1) 181 | print(plotPValues(dr, level = level)) 182 | dr 183 | }) 184 | 185 | DRIMSeq_res <- lapply(DRIMSeq_fits, function(dr) { 186 | results(dr, level = level) %>% 187 | dplyr::mutate(mlog10PValue = -log10(pvalue)) %>% 188 | dplyr::mutate_at(vars(one_of(c("lr", "df", "pvalue", 189 | "adj_pvalue", "mlog10PValue"))), 190 | list(signif3)) 191 | }) 192 | ``` 193 | 194 | ```{r DRIMSeq-result-transcripts, warning = FALSE} 195 | level <- "feature" 196 | DRIMSeq_feature_fits <- lapply(contrasts, function(cm) { 197 | dr <- dmTest(d, contrast = cm, verbose = 1) 198 | print(plotPValues(dr, level = level)) 199 | dr 200 | }) 201 | 202 | DRIMSeq_feature_res <- lapply(DRIMSeq_feature_fits, function(dr) { 203 | results(dr, level = level) %>% 204 | dplyr::mutate(mlog10PValue = -log10(pvalue)) %>% 205 | dplyr::mutate_at(vars(one_of(c("lr", "df", "pvalue", 206 | "adj_pvalue", "mlog10PValue"))), 207 | list(signif3)) 208 | }) 209 | ``` 210 | 211 | # Write results to text files 212 | 213 | The gene-level results are exported to text files. 214 | 215 | ```{r DRIMSeq-save-result} 216 | if (class(DRIMSeq_res) == "data.frame") { 217 | write.table(DRIMSeq_res %>% dplyr::arrange(pvalue), 218 | file = "DRIMSeq_dtu_results.txt", 219 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) 220 | } else { 221 | for (nm in names(DRIMSeq_res)) { 222 | write.table(DRIMSeq_res[[nm]] %>% dplyr::arrange(pvalue), 223 | file = paste0("DRIMSeq_dtu_results_", nm, ".txt"), 224 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) 225 | } 226 | } 227 | ``` 228 | 229 | # Output results as `SingleCellExperiment` object 230 | 231 | Here, we store the results on the gene level together with the original data. 232 | The result table `DRIMSeq_res` is appended to the `rowData` of the original 233 | gene-level `SummarizedExperiment` object `sg`. For genes that were filtered out, 234 | `NA` values are used in the results. 
The updated `sg` can be fed to the R 235 | package `iSEE` to perform more exploratory and visual analysis. 236 | 237 | ```{r DRIMSeq-se-gene} 238 | ## add rows (NA) for genes that are filtered out (if any) 239 | DRIMSeq_resA <- lapply(seq_along(DRIMSeq_res), FUN = function(x) { 240 | 241 | # all genes 242 | geneA <- rowData(sg)$gene_id 243 | 244 | # genes that are not filtered out 245 | resX <- DRIMSeq_res[[x]] 246 | 247 | # other characteristics that have been calculated 248 | mexp <- mean_expression(DRIMSeq_fits[[x]]) %>% 249 | dplyr::arrange(match(gene_id, resX$gene_id)) %>% 250 | dplyr::select(-gene_id) 251 | prec <- genewise_precision(DRIMSeq_fits[[x]]) %>% 252 | dplyr::arrange(match(gene_id, resX$gene_id)) %>% 253 | dplyr::select(-gene_id) 254 | 255 | resX <- resX %>% 256 | dplyr::bind_cols(mexp) %>% 257 | dplyr::bind_cols(prec) %>% 258 | dplyr::mutate(contrast = names(DRIMSeq_res)[[x]]) 259 | resX$common_precision <- common_precision(DRIMSeq_fits[[x]]) 260 | rownames(resX) <- resX$gene_id 261 | 262 | # genes that are filtered out 263 | geneO <- setdiff(geneA, resX$gene_id) 264 | 265 | # results for all genes 266 | if (length(geneO) > 0) { 267 | # create a data frame with values NA as the results of the genes that 268 | # are filtered out 269 | matO <- matrix(NA, nrow = length(geneO), 270 | ncol = ncol(resX), 271 | dimnames = list(geneO, 272 | colnames(resX))) 273 | resO <- data.frame(matO) 274 | resO$gene_id <- geneO 275 | 276 | # combine the result tables 277 | resA <- resO %>% 278 | dplyr::bind_rows(resX) %>% 279 | dplyr::arrange(match(gene_id, geneA)) %>% 280 | dplyr::mutate(contrast = names(DRIMSeq_res)[[x]]) 281 | 282 | } else { 283 | resA <- resX %>% 284 | dplyr::arrange(match(gene_id, geneA)) %>% 285 | dplyr::mutate(contrast = names(DRIMSeq_res)[[x]]) 286 | } 287 | 288 | # round numeric columns to 3 significant digits 289 | resA <- resA %>% 290 | dplyr::mutate_if(is.numeric, signif3) 291 | 292 | # use gene column as rownames 293 | rownames(resA) <- resA$gene_id 294 | 295 | # convert to DataFrame 296 | resA <- S4Vectors::DataFrame(resA) 297 | return(resA) 298 | }) 299 | names(DRIMSeq_resA) <- names(DRIMSeq_res) 300 | 301 | ## Put the result tables in rowData 302 | for (i in seq_along(DRIMSeq_resA)) { 303 | nam <- names(DRIMSeq_resA)[i] 304 | namI <- paste("DRIMSeq:", nam, sep = "") 305 | stopifnot(all(rowData(sg)$gene_id == rownames(DRIMSeq_resA[[i]]))) 306 | rowData(sg)[[namI]] <- DRIMSeq_resA[[i]] 307 | } 308 | ``` 309 | 310 | Here, we store the results on the transcript-level together with the original 311 | data. The result table `DRIMSeq_feature_res` is appended to the `rowData` of the 312 | original transcript-level `SummarizedExperiment` object `st`. For transcripts 313 | that were filtered out, `NA` values are used in the results. The updated `st` 314 | can be fed to the R package `iSEE` to perform more exploratory and visual 315 | analysis. 
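As a pointer for downstream use, a sketch (not evaluated here) of how the per-contrast table could be pulled back out of the `rowData` once the chunk below has appended it; the contrast name is only an example, and the real names come from the `contrast` entry in the config file:

```{r DRIMSeq-extract-tx-results-sketch, eval = FALSE}
## Not run: retrieve the transcript-level results for one (hypothetical) contrast
res_tx <- as.data.frame(rowData(st)[["DRIMSeq:cellineN61311-cellineN052611"]])
head(res_tx[order(res_tx$pvalue),
            c("gene_id", "feature_id", "lr", "df", "pvalue", "adj_pvalue")])
```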
316 | 317 | ```{r DRIMSeq-se-tx} 318 | ## add rows (NA) for genes that are filtered out (if any) 319 | DRIMSeq_resB <- lapply(seq_along(DRIMSeq_feature_res), FUN = function(x) { 320 | 321 | # all genes 322 | txA <- rowData(st)$tx_id 323 | 324 | # genes that are not filtered out 325 | resX <- DRIMSeq_feature_res[[x]] 326 | 327 | prop <- proportions(DRIMSeq_feature_fits[[x]]) %>% 328 | dplyr::arrange(match(feature_id, resX$feature_id)) %>% 329 | dplyr::select(-c(gene_id, feature_id)) 330 | colnames(prop) <- paste("proportion", colnames(prop), sep = "_") 331 | 332 | coef <- coefficients(DRIMSeq_feature_fits[[x]]) %>% 333 | dplyr::arrange(match(feature_id, resX$feature_id)) %>% 334 | dplyr::select(-c(gene_id, feature_id)) 335 | colnames(coef) <- paste("coef", colnames(coef), sep = "_") 336 | 337 | resX <- resX %>% 338 | dplyr::bind_cols(prop) %>% 339 | dplyr::bind_cols(coef) %>% 340 | dplyr::mutate(contrast = names(DRIMSeq_res)[[x]]) 341 | 342 | rownames(resX) <- resX$feature_id 343 | 344 | # genes that are filtered out 345 | txO <- setdiff(txA, resX$feature_id) 346 | 347 | # results for all genes 348 | if (length(txO) > 0) { 349 | # create a data frame with values NA as the results of the genes that 350 | # are filtered out 351 | matO <- matrix(NA, nrow = length(txO), 352 | ncol = ncol(resX), 353 | dimnames = list(txO, 354 | colnames(resX))) 355 | resO <- data.frame(matO) 356 | resO$feature_id <- txO 357 | 358 | # combine the result tables 359 | resA <- resO %>% 360 | dplyr::bind_rows(resX) %>% 361 | dplyr::arrange(match(feature_id, txA)) %>% 362 | dplyr::mutate(contrast = names(DRIMSeq_feature_res)[[x]]) 363 | 364 | } else { 365 | resA <- resX %>% 366 | dplyr::arrange(match(feature_id, txA)) %>% 367 | dplyr::mutate(contrast = names(DRIMSeq_feature_res)[[x]]) 368 | } 369 | 370 | # round numeric columns to 3 significant digits 371 | resA <- resA %>% 372 | dplyr::mutate_if(is.numeric, signif3) 373 | 374 | # use gene column as rownames 375 | rownames(resA) <- resA$feature_id 376 | 377 | # convert to DataFrame 378 | resA <- S4Vectors::DataFrame(resA) 379 | return(resA) 380 | }) 381 | names(DRIMSeq_resB) <- names(DRIMSeq_feature_res) 382 | 383 | ## Put the result tables in rowData 384 | for (i in seq_along(DRIMSeq_resB)) { 385 | nam <- names(DRIMSeq_resB)[i] 386 | namI <- paste("DRIMSeq:", nam, sep = "") 387 | stopifnot(all(rowData(st)$tx_id == rownames(DRIMSeq_resB[[i]]))) 388 | rowData(st)[[namI]] <- DRIMSeq_resB[[i]] 389 | } 390 | ``` 391 | 392 | The output is saved as a list. 
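For reference, a minimal sketch (not evaluated) of how the file written below could be loaded again in a later R session:

```{r DRIMSeq-reload-sketch, eval = FALSE}
## Not run: reload the saved gene- and transcript-level objects
analysis_se <- readRDS("DRIMSeq_dtu.rds")
names(analysis_se)  # "sg" "st"
```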
393 | 394 | ```{r DRIMSeq-save-se} 395 | analysis_se <- list(sg = sg, st = st) 396 | saveRDS(analysis_se, file = "DRIMSeq_dtu.rds") 397 | ``` 398 | 399 | # Session info 400 | 401 | The analyses above were performed with the following package versions: 402 | 403 | ```{r DRIMSeq-session-info} 404 | sessionInfo() 405 | date() 406 | ``` 407 | 408 | # References 409 | -------------------------------------------------------------------------------- /scripts/check_input.R: -------------------------------------------------------------------------------- 1 | 2 | args <- (commandArgs(trailingOnly = TRUE)) 3 | for (i in seq_len(length(args))) { 4 | eval(parse(text = args[[i]])) 5 | } 6 | 7 | 8 | ## ----------------The input arguments------------------------------------------ 9 | if (exists("metafile")) { 10 | print(metafile) 11 | } else { 12 | metafile <- NULL 13 | } 14 | 15 | if (exists("organism")) { 16 | print(organism) 17 | } else { 18 | organism <- NULL 19 | } 20 | 21 | if (exists("annotation")) { 22 | print(annotation) 23 | } else { 24 | annotation <- NULL 25 | } 26 | 27 | if (exists("genesets")) { 28 | print(genesets) 29 | } else { 30 | genesets <- NULL 31 | } 32 | 33 | if (exists("outFile")) { 34 | print(outFile) 35 | } else { 36 | outFile <- NULL 37 | } 38 | 39 | if (exists("gtf")) { 40 | print(gtf) 41 | } else { 42 | gtf <- NULL 43 | } 44 | 45 | 46 | if (exists("genome")) { 47 | print(genome) 48 | } else { 49 | genome <- NULL 50 | } 51 | 52 | if (exists("fastqdir")) { 53 | print(fastqdir) 54 | } else { 55 | fastqdir <- NULL 56 | } 57 | 58 | if (exists("fqsuffix")) { 59 | print(fqsuffix) 60 | } else { 61 | fqsuffix <- NULL 62 | } 63 | 64 | if (exists("fqext1")) { 65 | print(fqext1) 66 | } else { 67 | fqext1 <- NULL 68 | } 69 | 70 | if (exists("fqext2")) { 71 | print(fqext2) 72 | } else { 73 | fqext2 <- NULL 74 | } 75 | if (exists("txome")) { 76 | print(txome) 77 | } else { 78 | txome <- NULL 79 | } 80 | 81 | if (exists("run_camera")) { 82 | print(run_camera) 83 | } else { 84 | run_camera <- "DUMMY" 85 | } 86 | 87 | 88 | ## Read metadata 89 | msg0 <- try({ 90 | if(!file.exists(metafile)) { 91 | error("The metafile, ", metafile, ", does not exist.\n") 92 | } else { 93 | metadata <- read.delim(metafile, header = TRUE, as.is = TRUE, sep = "\t"); 94 | if(!all(c("names","type") %in% colnames(metadata))) 95 | stop(paste0("ERROR: 'names' and 'type' columns must exist in ", metafile)) 96 | rownames(metadata) <- metadata$names; 97 | utype <- unique(metadata$type); 98 | if (length(utype) == 1 & any(utype %in% c("PE", "SE"))) { 99 | type <- metadata$type 100 | } else{ 101 | stop("ERROR: 'type' column in the metadata file must be PE or SE. 
\n") 102 | } 103 | } 104 | }, silent = TRUE) 105 | 106 | 107 | msg1 <- try({ 108 | if (utype == "SE") { 109 | pt <- paste0(metadata$names, ".", fqsuffix, ".gz") 110 | } else { 111 | pt1 <- paste0(metadata$names, "_", fqext1, ".", fqsuffix, ".gz") 112 | pt2 <- paste0(metadata$names, "_", fqext2, ".", fqsuffix, ".gz") 113 | pt <- c(pt1, pt2) 114 | } 115 | lf <- file.path(fastqdir, pt) 116 | fe <- file.exists(lf) 117 | if (any(!fe)) { 118 | stop(paste0("ERROR: ", paste(lf[!fe], collapse=" "), " are/is not available.\n")) 119 | } 120 | }, silent = TRUE) 121 | 122 | print(lf) 123 | print(fe) 124 | 125 | msg2 <- try({ 126 | fe <- file.exists(genome) 127 | if (!fe) { 128 | stop(paste0("ERROR: The 'genome' file, ", genome, ", doesn't exist.\n")) 129 | } 130 | }, silent = TRUE) 131 | 132 | msg3 <- try({ 133 | fe <- file.exists(gtf) 134 | if (!fe) { 135 | stop(paste0("ERROR: The 'gtf' file, ", gtf, ", doesn't exist.\n")) 136 | } 137 | }, silent = TRUE) 138 | 139 | msg4 <- try({ 140 | fe <- file.exists(txome) 141 | if (!fe) { 142 | stop(paste0("ERROR: The 'txome' file, ", txome, ", doesn't exist.\n")) 143 | } 144 | }, silent = TRUE) 145 | 146 | msg5 <- try({ 147 | if (run_camera == "True") 148 | if (require("msigdbr")) { 149 | if (!(gsub("_"," ",organism) %in% msigdbr::msigdbr_show_species())) 150 | stop(paste0("ERROR: '", gsub("_"," ",organism), "' not found in 'msigdbr::msigdbr_show_species()' database; fix the organism or set 'run_camera: False'")) 151 | } else { 152 | stop("Cannot check 'organism': msigdbr package not available; run 'snakemake [--use-conda] setup' before 'snakemake [--use-conda] checkinputs'") 153 | } 154 | }, silent = TRUE) 155 | 156 | msg6 <- try({ 157 | if (exists("design")) { 158 | print(design) 159 | } else { 160 | stop("ERROR: no 'design' specified; please specify one in the config file") 161 | } 162 | }, silent = TRUE) 163 | 164 | 165 | msg7 <- try({ 166 | if (exists("contrast")) { 167 | contrast <- strsplit(gsub(" ","",contrast), ",")[[1]] 168 | print(contrast) 169 | } else { 170 | stop("ERROR: no 'contrast' specified; please specify one in the config file") 171 | } 172 | }, silent = TRUE) 173 | 174 | msg12 <- try({ 175 | if( !(annotation %in% c("Ensembl","Gencode")) ) 176 | stop(paste0("ERROR: 'annotation' needs to be (exactly) 'Gencode' or 'Ensembl'; currently: ", annotation)) 177 | }, silent = TRUE) 178 | 179 | msg13 <- try({ 180 | if (!is.character(design) || length(design) != 1) { 181 | stop("ERROR: 'design' must be a character scalar") 182 | } 183 | }, silent = TRUE) 184 | 185 | msg14 <- try({ 186 | if (substr(gsub(" ", "", design), 1, 1) != "~") { 187 | stop("ERROR: the first character of 'design' must be ~") 188 | } 189 | }, silent = TRUE) 190 | 191 | msg15 <- try({ 192 | terms <- strsplit(gsub(" ", "", design), "\\~|\\+|\\:|\\*|\\^|\\-")[[1]] 193 | pres <- terms %in% c("", "0", "1", colnames(metadata)) 194 | if (any(!pres)) 195 | stop(paste0("ERROR: the following terms in the design are not available in the metadata: ", terms[!pres])) 196 | }, silent = TRUE) 197 | 198 | msg16 <- try({ 199 | if (exists("genesets") && run_camera == "True") { 200 | genesets_split <- strsplit(genesets, ",")[[1]] 201 | if( !all(genesets_split %in% c("H",paste0("C",1:7))) ) 202 | stop(paste0("ERROR: 'genesets' must be a subset of H,C1,C2,C3,C4,C5,C6,C7; currently ", genesets)) 203 | } 204 | }, silent = TRUE) 205 | 206 | ## Define design matrix 207 | msg8 <- try({ 208 | des <- model.matrix(as.formula(design), data = metadata) 209 | }, silent = TRUE) 210 | if(is(msg8, "try-error")) 211 | 
msg8 <- try({ 212 | stop("ERROR in 'design' value: ", design) 213 | }, silent=TRUE) 214 | 215 | 216 | # Define contrasts 217 | msg9 <- try({ 218 | have_edgeR <<- FALSE 219 | if (require("edgeR")) { 220 | have_edgeR <<- TRUE 221 | contrasts <- as.data.frame(makeContrasts(contrasts = contrast, 222 | levels = des)) 223 | } else { 224 | stop("Cannot check 'contrast', since the edgeR package is not available; run 'snakemake [--use-conda] setup' before 'snakemake [--use-conda] checkinputs'") 225 | } 226 | }, silent = TRUE) 227 | if(is(msg9, "try-error") && have_edgeR) 228 | msg9 <- try({ 229 | stop("ERROR in specified 'contrast' (n.b., could be due to invalid 'design' specified): ", paste0(contrast, collapse=",")) 230 | }, silent=TRUE) 231 | 232 | msgL <- list(msg0, msg1, msg2, msg3, msg4, msg5, msg6, msg13, msg14, msg15, msg7, msg8, msg9, msg12, msg16) 233 | isError <- sapply(msgL, FUN = function(x) {is(x, "try-error")}) 234 | msg <- msgL[isError] 235 | print(msg) 236 | 237 | if (length(msg) > 0) { 238 | for(i in seq_len(length(msg))) { 239 | m <- trimws(gsub("Error in try({ :", "", msg[[i]], fixed=TRUE)) 240 | capture.output(writeLines(m), file = outFile, append = !(i==1)) 241 | } 242 | stars <- paste(strrep("*", 84), "\n", strrep("*", 84), sep="") 243 | xmsg <- paste("check for the error message above and fix the config.yaml or one of it's components.", sep="") 244 | capture.output(writeLines(stars), file = outFile, append = TRUE) 245 | capture.output(writeLines(xmsg), file = outFile, append = TRUE) 246 | capture.output(writeLines(stars), file = outFile, append = TRUE) 247 | } else { 248 | mylist <- list("Design matrix" = des, "Contrasts matrix" = contrasts) 249 | capture.output(mylist, file = outFile) 250 | stars <- paste(strrep("*", 19), "\n", strrep("*", 19), sep="") 251 | xmsg <- paste("No errors detected.", sep="") 252 | capture.output(writeLines(stars), file = outFile, append = TRUE) 253 | capture.output(writeLines(xmsg), file = outFile, append = TRUE) 254 | capture.output(writeLines(stars), file = outFile, append = TRUE) 255 | } 256 | 257 | 258 | 259 | -------------------------------------------------------------------------------- /scripts/custom_iSEE_panels.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(ggplot2) 3 | library(rtracklayer) 4 | library(iSEE) 5 | }) 6 | 7 | options(ucscChromosomeNames = FALSE) 8 | prepareGtf <- function(gtf) { 9 | gtf <- rtracklayer::import(gtf) 10 | 11 | ## Set appropriate column names 12 | idx <- match(c("transcript_id", "gene_id", "exon_id"), 13 | colnames(S4Vectors::mcols(gtf))) 14 | colnames(S4Vectors::mcols(gtf))[idx] <- c("transcript", "gene", "exon") 15 | if (!("gene_name" %in% colnames(S4Vectors::mcols(gtf)))) { 16 | gtf$gene_name <- gtf$gene 17 | } 18 | 19 | ## Keep only exons 20 | gtf <- BiocGenerics::subset(gtf, type == "exon") 21 | 22 | ## Strip version numbers from gene and transcript IDs if they exist 23 | gtf$transcript <- gsub("\\.[0-9]+$", "", gtf$transcript) 24 | gtf$gene <- gsub("\\.[0-9]+$", "", gtf$gene) 25 | 26 | gtf 27 | } 28 | 29 | customGviz <- function(se, rows, columns, bigwig_files="", bigwig_names="", 30 | bigwig_condition="", granges="", 31 | chr="", start="", end="", showgene="") { 32 | options(ucscChromosomeNames = FALSE) 33 | 34 | ## ---------------------------------------------------------------------- ## 35 | ## Pre-flight checks 36 | ## ---------------------------------------------------------------------- ## 37 | ## Must have at 
least one of bigwig_files and granges 38 | if (bigwig_files == "" && granges == "") { 39 | return(NULL) 40 | } 41 | 42 | ## If no names are given, assign names to bigwig files 43 | if (bigwig_files != "" && bigwig_names == "") { 44 | bigwig_names <- paste(paste0("S", seq_along(strsplit(bigwig_files, ",")[[1]])), 45 | collapse = ",") 46 | } 47 | 48 | ## If granges file does not exist, don't show annotation 49 | if (!file.exists(granges)) { 50 | granges <- "" 51 | } 52 | 53 | ## If granges file does not exist, the viewing region must be set 54 | if (granges == "" && (chr == "" || start == "" || end == "")) { 55 | return(NULL) 56 | } 57 | 58 | ## Convert start and end positions to numeric values 59 | if (start != "") { 60 | start <- as.numeric(start) 61 | } 62 | if (end != "") { 63 | end <- as.numeric(end) 64 | } 65 | 66 | ## ---------------------------------------------------------------------- ## 67 | ## Prepare the annotation 68 | ## ---------------------------------------------------------------------- ## 69 | if (granges != "") { 70 | ## Read the GRanges object 71 | if (caching$granges == granges && !is.null(caching$gr0)) { 72 | gr0 <- caching$gr0 73 | } else { 74 | caching$gr0 <- readRDS(granges) 75 | caching$granges <- granges 76 | gr0 <- caching$gr0 77 | } 78 | 79 | ## Subset the GRanges object depending on the input 80 | ## If rows has length 1, overwrite any provided showgene 81 | if (length(rows) == 1) { 82 | showgene <- rows 83 | } 84 | 85 | ## Strip version number from the gene of interest if it exists 86 | showgene <- gsub("\\.[0-9]+$", "", showgene) 87 | 88 | if (showgene == "" && (chr == "" || is.na(start) || is.na(end))) { 89 | return(NULL) 90 | } 91 | 92 | ## If a gene has been defined (either via rows or via showgene), set the 93 | ## viewing range accordingly 94 | if (showgene != "") { 95 | gr <- BiocGenerics::subset(gr0, tolower(gene) == tolower(showgene) | 96 | tolower(gene_name) == tolower(showgene)) 97 | ## Select only one gene if there are many with the same name 98 | gr <- BiocGenerics::subset(gr, gene == gene[1]) 99 | chr <- unique(GenomeInfoDb::seqnames(gr)) 100 | start <- min(BiocGenerics::start(gr)) 101 | end <- max(BiocGenerics::end(gr)) 102 | } else { 103 | gr <- gr0[IRanges::overlapsAny( 104 | gr0, 105 | GenomicRanges::GRanges(seqnames = chr, 106 | ranges = IRanges::IRanges(start = start, 107 | end = end), 108 | strand = "*")), ] 109 | } 110 | 111 | ## Other features in the region 112 | gro <- gr0[IRanges::overlapsAny( 113 | gr0, 114 | GenomicRanges::GRanges(seqnames = chr, 115 | ranges = IRanges::IRanges(start = start, 116 | end = end), 117 | strand = "*"))] 118 | gro <- gro[!(S4Vectors::`%in%`(gro, gr))] 119 | 120 | grtr <- Gviz::GeneRegionTrack(gr, showId = TRUE, col = NULL, fill = "gray80", 121 | name = "Genes", col.title = "black") 122 | grtr2 <- Gviz::GeneRegionTrack(gro, showId = TRUE, col = "black", fill = "white", 123 | name = "", col.title = "black") 124 | } else { 125 | gr <- gro <- grtr <- grtr2 <- NULL 126 | } 127 | 128 | ## ---------------------------------------------------------------------- ## 129 | ## Set title and viewing range 130 | ## ---------------------------------------------------------------------- ## 131 | ## Define the title for the plot 132 | if (showgene != "" && !is.null(gr)) { 133 | if (all(gr$gene == gr$gene_name)) { 134 | plot_title <- unique(gr$gene) 135 | } else { 136 | plot_title <- unique(paste0(gr$gene, " (", gr$gene_name, ")")) 137 | } 138 | } else { 139 | plot_title <- paste0(chr, ":", start, "-", end) 140 | } 141 | 
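  ## For example (illustrative values only, matching the commented iSEE example
  ## at the bottom of this file): with start = 6.1e6 and end = 6.2e6, the padded
  ## range computed below is 6,085,000-6,205,000, i.e. 15% of the window width
  ## is added on the left and 5% on the right.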
142 | ## Set min and max coord for the plot (add some padding to each side) 143 | minCoord <- start - 0.15*(end - start) 144 | maxCoord <- end + 0.05*(end - start) 145 | 146 | ## ---------------------------------------------------------------------- ## 147 | ## Prepare bigWig files 148 | ## ---------------------------------------------------------------------- ## 149 | ## Reformat bigWig file paths and names (provided to the function as 150 | ## character strings) 151 | if (bigwig_files != "") { 152 | bigwig_files <- strsplit(bigwig_files, ",")[[1]] 153 | bigwig_names <- strsplit(bigwig_names, ",")[[1]] 154 | if (bigwig_condition != "") { 155 | bigwig_condition <- strsplit(bigwig_condition, ",")[[1]] 156 | names(bigwig_condition) <- bigwig_names 157 | } 158 | names(bigwig_files) <- bigwig_names 159 | 160 | ## ---------------------------------------------------------------------- ## 161 | ## Define colors if bigwig_condition is provided 162 | ## ---------------------------------------------------------------------- ## 163 | ## Define colors for coverage tracks 164 | color_list <- rep(c("#DC050C", "#7BAFDE", "#B17BA6", "#F1932D", "#F7EE55", 165 | "#90C987", "#777777", "#E8601C", "#1965B0", "#882E72", 166 | "#F6C141", "#4EB265", "#CAEDAB"), 167 | ceiling(length(unique(bigwig_condition))/13)) 168 | 169 | if (length(bigwig_condition) > 1 || bigwig_condition != "") { 170 | usecol <- color_list[match(bigwig_condition, 171 | unique(bigwig_condition))] 172 | } else { 173 | usecol <- rep("gray", length(bigwig_files)) 174 | } 175 | names(usecol) <- bigwig_names 176 | 177 | ## ------------------------------------------------------------------ ## 178 | ## Show only selected sample(s) 179 | ## ------------------------------------------------------------------ ## 180 | ## If columns is specified, subset bigwig files 181 | if (!is.null(columns)) { 182 | bigwig_files <- bigwig_files[columns] 183 | bigwig_condition <- bigwig_condition[columns] 184 | usecol <- usecol[columns] 185 | } 186 | 187 | ## ------------------------------------------------------------------ ## 188 | ## Prepare final plot 189 | ## ------------------------------------------------------------------ ## 190 | ## Set up coverage tracks 191 | tracks <- lapply(seq_along(bigwig_files), function(i) { 192 | assign(paste0("covtr", i), 193 | Gviz::DataTrack(range = bigwig_files[i], 194 | type = "histogram", 195 | name = names(bigwig_files)[i], 196 | col.title = "black", 197 | fill = usecol[i], 198 | col = usecol[i], 199 | col.histogram = usecol[i], 200 | fill.histogram = usecol[i])) 201 | }) 202 | } else { 203 | tracks <- NULL 204 | } 205 | 206 | ## Add genome axis track 207 | tracks <- c(tracks, Gviz::GenomeAxisTrack(), grtr, grtr2) 208 | 209 | ## Plot tracks 210 | Gviz::plotTracks(tracks, chromosome = chr, from = minCoord, 211 | to = maxCoord, main = plot_title, 212 | transcriptAnnotation = "transcript", 213 | min.width = 0, min.distance = 0, collapse = FALSE) 214 | } 215 | 216 | customVolcano <- function(se, rows, columns, contrasts) { 217 | contrasts <- strsplit(contrasts, ",")[[1]] 218 | tmp <- do.call(plyr::rbind.fill, lapply(contrasts, function(w) { 219 | x <- data.frame(rowData(se)[, grep(paste0("^", w, ":"), 220 | colnames(rowData(se))), 221 | drop = FALSE], check.names = FALSE) 222 | colnames(x) <- gsub(paste0("^", w, ":"), "", colnames(x)) 223 | x$contrast <- w 224 | x$feature <- rownames(x) 225 | x 226 | })) 227 | ggplot(tmp, aes(x = logFC, y = mlog10PValue)) + 228 | geom_point(alpha = 0.3) + facet_grid(~ contrast) + 229 | theme_bw() + 
ylab("-log10(PValue)") 230 | } 231 | 232 | # Set up a cache for the GRanges object 233 | caching <- new.env() 234 | 235 | # gtf <- prepareGtf("example_data/reference/Homo_sapiens.GRCh38.93.1.1.10M.gtf") 236 | # saveRDS(gtf, file = "example_data/reference/Homo_sapiens.GRCh38.93.1.1.10M.granges.rds") 237 | # 238 | # cdp <- customDataPlotDefaults(sce, 2) 239 | # cdp$Function <- c("customGviz", "customVolcano") 240 | # cdp$Arguments <- c("bigwig_files example_data/output/STARbigwig/SRR1039508_Aligned.sortedByCoord.out.bw,example_data/output/STARbigwig/SRR1039509_Aligned.sortedByCoord.out.bw,example_data/output/STARbigwig/SRR1039512_Aligned.sortedByCoord.out.bw,example_data/output/STARbigwig/SRR1039513_Aligned.sortedByCoord.out.bw\nbigwig_names SRR1039508,SRR1039509,SRR1039512,SRR1039513\nbigwig_condition Untreated,Dexamethasone,Untreated,Dexamethasone\ngranges example_data/reference/Homo_sapiens.GRCh38.93.1.1.10M.granges.rds\nchr 1\nstart 6.1e6\nend 6.2e6\nshowgene DDX11L1", 241 | # "contrasts cellineN61311-cellineN052611") 242 | # 243 | # iSEE(sce, 244 | # customDataArgs = cdp, 245 | # customDataFun = list(customGviz = customGviz, 246 | # customVolcano = customVolcano)) 247 | 248 | -------------------------------------------------------------------------------- /scripts/edgeR_dge.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "edgeR DGE" 3 | author: "" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | theme: yeti 10 | highlight: tango 11 | code_folding: show 12 | keep_md: true 13 | references: 14 | - id: Robinson2010edgeR 15 | title: edgeR-a Bioconductor package for differential expression analysis of digital gene expression data 16 | author: 17 | - family: Robinson 18 | given: Mark D 19 | - family: McCarthy 20 | given: Davis J 21 | - family: Smyth 22 | given: Gordon K 23 | container-title: Bioinformatics 24 | volume: 26 25 | page: 139-140 26 | type: article-journal 27 | URL: https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btp616 28 | issued: 29 | year: 2010 30 | - id: Robinson2010TMM 31 | title: A scaling normalization method for differential expression analysis of RNA-seq data 32 | author: 33 | - family: Robinson 34 | given: Mark D 35 | - family: Oshlack 36 | given: Alicia 37 | container-title: Genome Biology 38 | volume: 11 39 | page: R25 40 | type: article-journal 41 | URL: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25 42 | issued: 43 | year: 2010 44 | - id: Soneson2016tximport 45 | title: Differential analyses for RNA-seq- transcript-level estimates improve gene-level inferences 46 | author: 47 | - family: Soneson 48 | given: Charlotte 49 | - family: Love 50 | given: Michael I 51 | - family: Robinson 52 | given: Mark D 53 | container-title: F1000Research 54 | volume: 4 55 | page: 1521 56 | type: article-journal 57 | URL: https://f1000research.com/articles/4-1521/v2 58 | issued: 59 | year: 2016 60 | - id: Wu2012camera 61 | title: Camera- a competitive gene set test accounting for inter-gene correlation 62 | author: 63 | - family: Wu 64 | given: Di 65 | - family: Smyth 66 | given: Gordon K 67 | container-title: Nucleic Acids Research 68 | volume: 40 69 | page: e133 70 | type: article-journal 71 | issued: 72 | year: 2012 73 | --- 74 | 75 | ```{r edgeR-setup, include=FALSE} 76 | knitr::opts_chunk$set(echo = TRUE, dev = c("png", "pdf")) 77 | ``` 78 | 79 | # Introduction 80 | 81 | Here, we perform differential gene expression 
analysis with edgeR 82 | [@Robinson2010edgeR] followed by gene set analysis with camera [@Wu2012camera], 83 | based on abundance estimates from Salmon. For more detailed information of each 84 | step, please refer to the 85 | [edgeR user guide](https://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf). 86 | 87 | # Load packages 88 | 89 | ```{r edgeR-load-pkg} 90 | suppressPackageStartupMessages({ 91 | library(dplyr) 92 | library(tximport) 93 | library(tximeta) 94 | library(SingleCellExperiment) 95 | library(edgeR) 96 | library(ggplot2) 97 | library(msigdbr) 98 | }) 99 | ``` 100 | 101 | # Load `SummarizedExperiment` object 102 | 103 | We load the `SummarizedExperiment` objects prepared using `tximeta`, containing 104 | gene- and transcript-level counts and feature lengths. In this report, we will 105 | use the gene-level quantifications. 106 | 107 | ```{r edgeR-print-se} 108 | ## List of SummarizedExperiment objects (gene/transcript level) 109 | se 110 | 111 | ## Get gene-level SummarizedExperiment object 112 | sg <- se$sg 113 | metadata <- colData(sg) 114 | 115 | sg 116 | ``` 117 | 118 | # Plot total number of reads per sample 119 | 120 | ```{r edgeR-plot-totalcount} 121 | ggplot(data.frame(totCount = colSums(assay(sg, "counts")), 122 | sample = colnames(assay(sg, "counts")), 123 | stringsAsFactors = FALSE), 124 | aes(x = sample, y = totCount)) + geom_bar(stat = "identity") + 125 | theme_bw() + xlab("") + ylab("Total read count") + 126 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 127 | ``` 128 | 129 | # Create DGEList and include average transcript length offsets 130 | 131 | A `DGEList` is the main object `edgeR` requires to perform the DGE analysis. It 132 | is designed to store read counts and associated information. After creating this 133 | object, we add offsets, which are average transcript length correction terms 134 | [@Soneson2016tximport], 135 | and scale them so they are consistent with library sizes (sequencing depth for 136 | each sample). 137 | 138 | Then we calculate normalization factors to scale the raw library sizes and 139 | minimize the log-fold changes between the samples for most genes. Here the 140 | trimmed mean of M-values between each pair of samples (TMM) is used by default 141 | [@Robinson2010TMM]. 142 | 143 | Finally we add gene annotation information. 144 | 145 | ```{r edgeR-dge-generate} 146 | dge0 <- tximeta::makeDGEList(sg) 147 | 148 | dge0$genes <- as.data.frame(rowRanges(sg)) 149 | ``` 150 | 151 | # Calculate logCPMs and add as an assay 152 | 153 | We calculate log-counts per million (CPMs) because they are useful descriptive 154 | measures for the expression level of a gene. Note, however, that the normalized 155 | values are not used for the differential expression analysis. By default, the 156 | normalized library sizes are used in the computation. 157 | 158 | We add the logCPMs to one of the fields (or assay) of the first gene-level 159 | `SummarizedExperiment` object `sg`. At the end of the analysis, we will use this 160 | object again to export the results of all the genes we started with. 161 | 162 | ```{r edgeR-add-logcpm} 163 | logcpms <- edgeR::cpm(dge0, offset = dge0$offset, log = TRUE, 164 | prior.count = 2) 165 | dimnames(logcpms) <- dimnames(dge0$counts) 166 | 167 | stopifnot(all(rownames(logcpms) == rownames(sg)), 168 | all(colnames(logcpms) == colnames(sg))) 169 | assay(sg, "logcpm") <- logcpms 170 | ``` 171 | 172 | # Define design. 
173 | 174 | Next, we specify the design matrix of the experiment, defining which sample 175 | annotations will be taken into account in the statistical modeling. 176 | 177 | ```{r edgeR-define-design} 178 | stopifnot(all(colnames(dge0) == metadata$names)) 179 | 180 | print(metadata) 181 | print(design) 182 | 183 | (des <- model.matrix(as.formula(design), data = metadata)) 184 | ``` 185 | 186 | # Filter out lowly expressed genes 187 | 188 | Next, we determine which genes have sufficiently large counts to be retained in 189 | the statistical analysis, and remove the rest. After removing genes, we 190 | recalculate the normalization factors. 191 | 192 | ```{r edgeR-filter-genes} 193 | dim(dge0) 194 | keep <- edgeR::filterByExpr(dge0, design = des) 195 | dge <- dge0[keep, ] 196 | dim(dge) 197 | ``` 198 | 199 | # Estimate dispersion and fit QL model 200 | 201 | We model the count data using a quasi-likelihood (QL) negative binomial (NB) 202 | generalized log-linear model, which accounts for gene-specific variability from 203 | both biological and technical sources. Before fitting the model, we estimate 204 | the NB dispersion (overall biological variability across all genes), and the QL 205 | dispersion (gene-specific) using the `estimateDisp()` function. 206 | 207 | It is also good practice to look at the relationship between the biological 208 | coefficient of variation (NB dispersion) and the gene abundance (in logCPMs). 209 | 210 | ```{r edgeR-estimate-disp} 211 | ## Estimate dispersion and fit model 212 | dge <- estimateDisp(dge, design = des) 213 | qlfit <- glmQLFit(dge, design = des) 214 | 215 | ## Plot dispersions 216 | plotBCV(dge) 217 | ``` 218 | 219 | # Define contrasts 220 | 221 | Before testing for differences in gene expression, we define the contrasts 222 | we wish to test for. Here, we represent the contrasts as a numeric matrix: 223 | 224 | ```{r edgeR-define-contrasts} 225 | print(contrast) 226 | (contrasts <- as.data.frame(makeContrasts(contrasts = contrast, levels = des))) 227 | ``` 228 | 229 | # Perform DGE tests 230 | 231 | Now we perform genewise tests for every contrast defined above, and save the 232 | results for every contrast. 233 | 234 | ```{r edgeR-perform-tests} 235 | signif3 <- function(x) signif(x, digits = 3) 236 | edgeR_res <- lapply(contrasts, function(cm) { 237 | qlf <- glmQLFTest(qlfit, contrast = cm) 238 | tt <- topTags(qlf, n = Inf, sort.by = "none")$table 239 | tt %>% 240 | dplyr::mutate(mlog10PValue = -log10(PValue)) %>% 241 | dplyr::mutate_at(vars(one_of(c("logFC", "logCPM", "F", 242 | "PValue", "FDR", "mlog10PValue"))), 243 | list(signif3)) 244 | }) 245 | ``` 246 | 247 | # Make MA plots 248 | 249 | We can visualize the test results by plotting the logCPM (average) vs the logFC, 250 | and coloring genes with an adjusted p-value below 0.05 (or another specified 251 | FDR threshold). A plot is drawn for every contrast.
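Before looking at the plots, it can be useful to tabulate how many genes pass the FDR threshold in each contrast; a small sketch (assuming the usual case where `edgeR_res` is a named list of result tables):

```{r edgeR-count-de-sketch, eval = FALSE}
## Not run: number of genes with FDR <= 0.05 in each contrast
vapply(edgeR_res, function(tt) sum(tt$FDR <= 0.05, na.rm = TRUE), numeric(1))
```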
252 | 253 | ```{r edgeR-ma-plots} 254 | if (is(edgeR_res, "data.frame")) { 255 | print(ggplot(edgeR_res, aes(x = logCPM, y = logFC, color = FDR <= 0.05)) + 256 | geom_point() + theme_bw() + 257 | scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black"))) 258 | } else { 259 | for (nm in names(edgeR_res)) { 260 | print(ggplot(edgeR_res[[nm]], aes(x = logCPM, y = logFC, color = FDR <= 0.05)) + 261 | geom_point() + theme_bw() + 262 | scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) + 263 | ggtitle(nm)) 264 | } 265 | } 266 | ``` 267 | 268 | # Write DGE results to text files 269 | 270 | We export the results into text files that can be opened using any text editor. 271 | 272 | ```{r edgeR-save-results} 273 | ## Write results to text files and make MA plots 274 | if (is(edgeR_res, "data.frame")) { 275 | write.table(edgeR_res %>% dplyr::arrange(PValue) %>% 276 | dplyr::select(-dplyr::any_of("tx_ids")), 277 | file = "edgeR_dge_results.txt", 278 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) 279 | } else { 280 | for (nm in names(edgeR_res)) { 281 | write.table(edgeR_res[[nm]] %>% dplyr::arrange(PValue) %>% 282 | dplyr::select(-dplyr::any_of("tx_ids")), 283 | file = paste0("edgeR_dge_results_", nm, ".txt"), 284 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) 285 | } 286 | } 287 | ``` 288 | 289 | # Output DGE results as list of `SingleCellExperiment` objects 290 | 291 | Here, we store the analysis results with the original data. The results are 292 | appended on the `rowData` of the original gene-level `SummarizedExperiment` 293 | object `sg`. For genes that were filtered out, `NA` values are used in the 294 | result columns. The updated `sg` could be fed to the R package `iSEE` to 295 | perform more exploratory and visual analysis. 
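A sketch (not evaluated) of how one contrast's table could later be extracted from the `rowData`; the contrast name is hypothetical, and the `NA` rows correspond to genes removed by the expression filter:

```{r edgeR-extract-results-sketch, eval = FALSE}
## Not run: pull one (hypothetical) contrast's results back out of rowData(sg)
res <- as.data.frame(rowData(sg)[["edgeR:cellineN61311-cellineN052611"]])
res <- res[!is.na(res$PValue), ]  # drop genes filtered out by filterByExpr
head(res[order(res$FDR), c("gene_name", "logFC", "logCPM", "PValue", "FDR")])
```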
296 | 297 | ```{r edgeR-se} 298 | ## add rows (NA) for genes that are filtered out (if any) 299 | edgeR_resA <- lapply(seq_along(edgeR_res), FUN = function(x) { 300 | 301 | ## All genes 302 | geneA <- rowData(sg)$gene_id 303 | 304 | ## Genes that are not filtered out 305 | resX <- edgeR_res[[x]] 306 | resX <- resX %>% 307 | dplyr::select(c("gene_id", "gene_name", "logFC", "logCPM", 308 | "F", "FDR", "PValue", "mlog10PValue")) 309 | rownames(resX) <- resX$gene_id 310 | 311 | ## Genes that are filtered out 312 | geneO <- setdiff(geneA, resX$gene_id) 313 | 314 | ## results for all genes 315 | if (length(geneO) > 0) { 316 | ## create a data frame with values NA as the results of the genes that 317 | ## are filtered out 318 | matO <- matrix(NA, nrow = length(geneO), 319 | ncol = ncol(resX), 320 | dimnames = list(geneO, 321 | colnames(resX))) 322 | resO <- data.frame(matO) 323 | resO$gene_id <- geneO 324 | resO$gene_name <- rowData(sg)$gene_name[match(geneO, rowData(sg)$gene_id)] 325 | 326 | ## Combine the result tables 327 | resA <- resO %>% 328 | dplyr::bind_rows(resX) %>% 329 | dplyr::arrange(match(gene_id, geneA)) %>% 330 | dplyr::mutate(contrast = names(edgeR_res)[[x]]) 331 | } else { 332 | resA <- resX %>% 333 | dplyr::arrange(match(gene_id, geneA)) %>% 334 | dplyr::mutate(contrast = names(edgeR_res)[[x]]) 335 | } 336 | 337 | ## Use gene column as rownames 338 | rownames(resA) <- paste(resA$gene_id, resA$gene_name, sep = "__") 339 | 340 | ## convert to DataFrame 341 | resA <- S4Vectors::DataFrame(resA) 342 | return(resA) 343 | }) 344 | names(edgeR_resA) <- names(edgeR_res) 345 | 346 | ## Put the result tables in rowData 347 | for (i in seq_along(edgeR_resA)) { 348 | nam <- names(edgeR_resA)[i] 349 | namI <- paste("edgeR:", nam, sep = "") 350 | stopifnot(all(rownames(sg) == rownames(edgeR_resA[[i]]))) 351 | rowData(sg)[[namI]] <- edgeR_resA[[i]] 352 | } 353 | ``` 354 | 355 | The output is saved as a list. Compared to the input data `se`, the element `sg` 356 | is updated and `st` stays the same. 357 | 358 | ```{r edgeR-save-se} 359 | analysis_se <- list(sg = sg, st = se$st) 360 | saveRDS(analysis_se, file = "edgeR_dge.rds") 361 | ``` 362 | 363 | 364 | ```{r check-gene_names-column, eval = !is.null(genesets), include = FALSE} 365 | if(!("gene_name" %in% colnames(rowData(sg)))) { 366 | genesets <- NULL 367 | } 368 | ``` 369 | 370 | ```{r camera-text1, echo = FALSE, results = 'asis', eval = !is.null(genesets)} 371 | cat("# Load gene sets 372 | 373 | We will use `camera` to perform an enrichment analysis for a collection of 374 | gene sets from the [mSigDB](http://software.broadinstitute.org/gsea/msigdb), 375 | packaged in the `msigdbr` R package. Here, we load the gene set definitions 376 | and select which ones to include in the analysis.") 377 | ``` 378 | 379 | ```{r camera-load-genesets, eval = !is.null(genesets), include = !is.null(genesets)} 380 | ## Retrieve gene sets and combine in a tibble 381 | m_df <- bind_rows(lapply(genesets, 382 | function(x) msigdbr(species = organism, category = x))) 383 | ``` 384 | 385 | ```{r camera-text2, echo= FALSE, results = 'asis', eval = !is.null(genesets)} 386 | cat("# Perform tests 387 | 388 | Next, we perform the gene set analysis. We consider only gene sets where the 389 | number of genes shared with the data set is not too small and not too large. 
390 | `camera` is a competitive gene set test that accounts for correlations among 391 | the genes within a gene set.") 392 | ``` 393 | 394 | ```{r camera-filter-gene-sets, eval = !is.null(genesets), include = !is.null(genesets)} 395 | minSize <- 3 396 | maxSize <- 500 397 | 398 | ## Get index for genes in each gene set in the DGEList 399 | indexList <- limma::ids2indices( 400 | gene.sets = lapply(split(m_df, f = m_df$gs_name), function(w) w$gene_symbol), 401 | identifiers = dge$genes$gene_name, 402 | remove.empty = TRUE 403 | ) 404 | 405 | ## Filter out too small or too large gene sets 406 | gsSizes <- vapply(indexList, length, 0) 407 | indexList <- indexList[gsSizes >= minSize & gsSizes <= maxSize] 408 | ``` 409 | 410 | ```{r camera-check-indexList-length, eval = !is.null(genesets), include = FALSE} 411 | ## Check if the index list is empty after filtering 412 | if (length(indexList) == 0){ 413 | genesets <- NULL 414 | empty <- TRUE 415 | } else { 416 | empty <- FALSE 417 | } 418 | ``` 419 | 420 | ```{r camera-print-empty-list-message, echo = FALSE, results = 'asis', eval = !is.null(genesets) && empty} 421 | cat("**NOTE:** 422 | The index list is empty after filtering and `camera` cannot be run. Either try 423 | different gene categories, try different filtering parameters or disable the 424 | gene set analysis in the `config.yaml` file by setting `run_camera: False`.") 425 | ``` 426 | 427 | 428 | 429 | ```{r camera-perform-tests, eval = !is.null(genesets), include = !is.null(genesets)} 430 | camera_res <- lapply(contrasts, function(cm) { 431 | camera(dge, index = indexList, design = des, contrast = cm, 432 | inter.gene.cor = NA) 433 | }) 434 | ``` 435 | 436 | 437 | ```{r camera-text3, echo = FALSE, results = 'asis', eval = !is.null(genesets)} 438 | cat("# Write gene set analysis results to text files 439 | 440 | The results from `camera` are written to a separate text file for each tested 441 | contrast.") 442 | ``` 443 | 444 | ```{r camera-save-results, eval = !is.null(genesets), include = !is.null(genesets)} 445 | ## Write results to text files 446 | if (is(camera_res, "data.frame")) { 447 | write.table(camera_res %>% tibble::rownames_to_column("GeneSet") %>% 448 | dplyr::arrange(PValue), 449 | file = "camera_dge_results.txt", 450 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) 451 | } else { 452 | for (nm in names(camera_res)) { 453 | write.table(camera_res[[nm]] %>% 454 | tibble::rownames_to_column("GeneSet") %>% 455 | dplyr::arrange(PValue), 456 | file = paste0("camera_dge_results_", nm, ".txt"), 457 | sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) 458 | } 459 | } 460 | ``` 461 | 462 | ```{r camera-text4, echo = FALSE, results = 'asis', eval = !is.null(genesets)} 463 | cat("The `camera` output, as well as the used gene sets, are saved to a file.") 464 | ``` 465 | 466 | ```{r camera-save-se, eval = !is.null(genesets), include = !is.null(genesets)} 467 | geneSets <- lapply(indexList, function(i) dge$genes$gene_name[i]) 468 | saveRDS(list(cameraRes = camera_res, 469 | geneSets = geneSets), file = "camera_gsa.rds") 470 | ``` 471 | 472 | # Session info 473 | 474 | The analyses above were performed with the following package versions: 475 | 476 | ```{r edgeR-session-info} 477 | sessionInfo() 478 | date() 479 | ``` 480 | 481 | # References 482 | 483 | -------------------------------------------------------------------------------- /scripts/generate_linkedtxome.R: -------------------------------------------------------------------------------- 1 | args <- 
(commandArgs(trailingOnly = TRUE)) 2 | for (i in seq_len(length(args))) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(tximeta) 8 | }) 9 | 10 | print(transcriptfasta) 11 | print(salmonidx) 12 | print(gtf) 13 | 14 | print(annotation) 15 | ss <- strsplit(organism, "_")[[1]] 16 | organism <- paste(paste(ss[1], ss[2])) 17 | print(organism) 18 | print(release) 19 | print(build) 20 | print(output) 21 | 22 | makeLinkedTxome(indexDir = dirname(salmonidx), 23 | source = annotation, 24 | organism = organism, 25 | release = release, 26 | genome = build, 27 | fasta = transcriptfasta, 28 | gtf = gtf, 29 | write = TRUE, 30 | jsonFile = output) 31 | 32 | sessionInfo() 33 | date() 34 | -------------------------------------------------------------------------------- /scripts/generate_report.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(rmarkdown) 3 | }) 4 | 5 | .checkPandoc <- function(ignorePandoc) { 6 | ## Initialize output to TRUE 7 | doRender <- TRUE 8 | 9 | ## First check whether pandoc is available 10 | if (!rmarkdown::pandoc_available()) { 11 | doRender <- FALSE 12 | ## If pandoc is not available, either give a warning or an error, 13 | ## depending on the value of ignorePandoc 14 | if (ignorePandoc) { 15 | ## If ignorePandoc is TRUE, just give a warning 16 | warning("pandoc is not available! ", 17 | "The final report will not be generated.", 18 | immediate. = TRUE) 19 | } else { 20 | ## If ignorePandoc is FALSE, stop 21 | stop("pandoc is not available!") 22 | } 23 | } else { 24 | ## If pandoc is available, check for pandoc-citeproc 25 | ## Only do this if the pandoc version is <2.11, since 26 | ## pandoc-citeproc is not included (or needed) in v2.11 and later. 27 | if (!rmarkdown::pandoc_available(version = "2.11")) { 28 | ## TRUE if the available pandoc version is not 2.11 or newer 29 | ## pandoc-citeproc should be found in the path, or in the 30 | ## same folder as the pandoc executable 31 | if (Sys.which("pandoc-citeproc") == "" && 32 | !file.exists(file.path(dirname(rmarkdown::pandoc_exec()), 33 | "pandoc-citeproc"))) { 34 | doRender <- FALSE 35 | ## pandoc-citeproc is required, but not found 36 | if (ignorePandoc) { 37 | ## If ignorePandoc is TRUE, just give a warning 38 | warning("pandoc-citeproc is not available! ", 39 | "The final report will not be generated.", 40 | immediate. = TRUE) 41 | } else { 42 | ## If ignorePandoc is FALSE, stop 43 | stop("pandoc-citeproc is not available!") 44 | } 45 | } 46 | } 47 | } 48 | return(doRender) 49 | } 50 | 51 | #' Generate report 52 | #' 53 | #' Generate a report based on a Rmarkdown template file. 54 | #' 55 | #' @param se,gtffile,bigwigdir,genesets,organism,design,contrast Arguments that 56 | #' are passed to the provided Rmarkdown template 57 | #' @param rmdTemplate Path to a .Rmd template file. 58 | #' @param outputFile File name of the output report. The file name extension 59 | #' must be either \code{.html} or \code{.pdf}, and consistent with the value 60 | #' of \code{outputFormat}. 61 | #' @param outputDir Path to the output directory where the report will be 62 | #' generated. 63 | #' @param outputFormat The format of the output report. Either 64 | #' \code{"html_document"} or \code{"pdf_document"}. The file name extension of 65 | #' \code{outputFile} must be consistent with this choice. 66 | #' @param showCode Logical, whether to display the R code in the report. 
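#' @param ncores Numeric, the number of cores available to the analyses
#'   performed in the Rmd template.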
67 | #' @param forceOverwrite Logical, whether to force overwrite an existing report 68 | #' with the same name in the output directory. 69 | #' @param knitrProgress Logical, whether to display the progress of \code{knitr} 70 | #' when generating the report. 71 | #' @param quiet Logical, whether to show progress messages. 72 | #' @param ignorePandoc Logical, determines what to do if \code{pandoc} or 73 | #' \code{pandoc-citeproc} is missing (if \code{Sys.which("pandoc")} or 74 | #' \code{Sys.which("pandoc-citeproc")} returns ""). If \code{ignorePandoc} is 75 | #' TRUE, only a warning is given. The figures will be generated, but not the 76 | #' final report. If \code{ignorePandoc} is FALSE (default), the execution 77 | #' stops immediately. 78 | #' @param ... Other arguments that will be passed to \code{rmarkdown::render}. 79 | #' 80 | #' @author Charlotte Soneson 81 | #' 82 | #' @details When the function is called, an .Rmd template file will be copied 83 | #' into the output directory, and \code{rmarkdown::render} will be called to 84 | #' generate the final report. If there is already a .Rmd file with the same 85 | #' name in the output directory, the function will raise an error and stop, to 86 | #' avoid overwriting the existing file. The reason for this behaviour is that 87 | #' the copied template in the output directory will be deleted once the report 88 | #' is generated. 89 | #' 90 | #' @export 91 | #' 92 | #' @importFrom rmarkdown render 93 | #' @importFrom tools file_ext file_path_sans_ext 94 | #' @importFrom methods is 95 | #' @import dplyr 96 | #' 97 | #' @return Generates a summary report in the \code{outputDir} directory, and 98 | #' returns (invisibly) the name of the generated report. 99 | #' 100 | 101 | generateReport <- function(se, gtffile = NULL, organism = NULL, 102 | bigwigdir = NULL, design = NULL, genesets = NULL, 103 | contrast = NULL, rmdTemplate, outputFile, 104 | ncores = NULL, 105 | outputDir = "./", outputFormat = NULL, 106 | showCode = FALSE, forceOverwrite = FALSE, 107 | knitrProgress = FALSE, quiet = FALSE, 108 | ignorePandoc = FALSE, ...) { 109 | ## This function was inspired by code from Nicholas Hamilton, provided at 110 | ## http://stackoverflow.com/questions/37097535/generate-report-in-r 111 | 112 | ## If possible, set output format based on the extension of outputFile, if 113 | ## the output format is not provided 114 | if (is.null(outputFormat)) { 115 | if (tools::file_ext(outputFile) == "pdf") { 116 | outputFormat <- "pdf_document" 117 | } else { 118 | outputFormat <- "html_document" 119 | } 120 | } 121 | 122 | ## Check if pandoc and pandoc-citeproc are available 123 | .checkPandoc(ignorePandoc) 124 | 125 | ## ---------------------------------------------------------------------- ## 126 | ## --------------------- Check input arguments -------------------------- ## 127 | ## ---------------------------------------------------------------------- ## 128 | 129 | ## ------------------------ outputFormat -------------------------------- ## 130 | ## Raise an error if outputFormat is not one of the allowed 131 | if (!(outputFormat %in% c("pdf_document", "html_document"))) { 132 | stop("The provided outputFormat is currently not supported. Please ", 133 | "use either 'html_document' or 'pdf_document'.", call. 
= FALSE) 134 | } 135 | 136 | ## Raise an error if the output format and file name extension don't match 137 | if (outputFormat != paste0(tools::file_ext(outputFile), "_document")) { 138 | stop(paste0("File name extension of outputFile doesn't agree with the ", 139 | "outputFormat, should be .", 140 | gsub("_document$", "", outputFormat)), call. = FALSE) 141 | } 142 | 143 | ## ----------------------- input directory ------------------------------ ## 144 | ## se must be a character string of length 1, and point to an existing rds file 145 | if (!is(se, "character") || length(se) != 1) { 146 | stop("se must be a character string") 147 | } 148 | if (!file.exists(se)) { 149 | stop("The indicated se object does not exist") 150 | } 151 | se <- readRDS(se) 152 | 153 | ## organism 154 | if (!is.null(organism)) { 155 | if (!is(organism, "character") || length(organism) != 1) { 156 | stop("organism must be a character string") 157 | } 158 | organism <- gsub("_", " ", organism) 159 | if (!is.null(genesets) && 160 | !organism %in% msigdbr::msigdbr_species()$species_name) { 161 | stop("organism must be one of the organisms listed in ", 162 | "msigdbr::msigdbr_show_species()") 163 | } 164 | } 165 | 166 | ## design 167 | if (!is.null(design)) { 168 | if (!is(design, "character") || length(design) != 1) { 169 | stop("design must be a character string") 170 | } 171 | } 172 | 173 | ## contrasts 174 | if (!is.null(contrast)) { 175 | if (!is(contrast, "character")) { 176 | stop("contrast must be a character string") 177 | } 178 | } 179 | 180 | ## ncores 181 | if (!is.null(ncores)) { 182 | if (!is(ncores, "numeric")) { 183 | stop("ncores must be numeric") 184 | } 185 | } 186 | 187 | ## genesets 188 | if (!is.null(genesets)) { 189 | if (!is(genesets, "character")) { 190 | stop("genesets must be a character string") 191 | } 192 | } 193 | 194 | ## gtffile 195 | if (!is.null(gtffile)) { 196 | if (!is(gtffile, "character") || length(gtffile) != 1) { 197 | stop("gtffile must be a character string") 198 | } 199 | if (!file.exists(gtffile)) { 200 | stop("The indicated gtffile does not exist") 201 | } 202 | genemodels <- rtracklayer::import(gtffile) 203 | } 204 | 205 | ## bigwigdir 206 | if (!is.null(bigwigdir)) { 207 | if (!is(bigwigdir, "character") || length(bigwigdir) != 1) { 208 | stop("bigwigdir must be a character string") 209 | } 210 | if (!file.exists(bigwigdir)) { 211 | stop("The indicated bigwigdir does not exist") 212 | } 213 | } 214 | 215 | ## ------------------------- output files ------------------------------- ## 216 | outputReport <- file.path(outputDir, basename(outputFile)) 217 | outputRmd <- file.path( 218 | outputDir, 219 | paste0(tools::file_path_sans_ext(basename(outputFile)), ".Rmd")) 220 | 221 | ## Report 222 | if (file.exists(outputReport)) { 223 | if (!forceOverwrite) { 224 | stop("The file ", outputReport, 225 | " already exists. Please remove or rename the file, provide ", 226 | "another value of outputFile, or set forceOverwrite = TRUE.", 227 | call. = FALSE) 228 | } else { 229 | if (!quiet) { 230 | warning("The file ", outputReport, 231 | " already exists and will be overwritten, since ", 232 | "forceOverwrite = TRUE.", immediate. = TRUE, 233 | call. 
= FALSE) 234 | } 235 | } 236 | } 237 | 238 | ## ------------------------- Rmd template ------------------------------- ## 239 | ## Path to the template file 240 | templateFile <- rmdTemplate 241 | if (file.exists(templateFile)) { 242 | if (file.exists(outputRmd)) { 243 | if (!forceOverwrite) { 244 | stop("There is already an .Rmd file ", outputRmd, 245 | ". Please remove or rename this file, or choose another ", 246 | "outputFile name.", call. = FALSE) 247 | } else { 248 | warning("There is already an .Rmd file ", outputRmd, 249 | ". That file will be renamed with a suffix '_conflicting'", 250 | ", a time stamp and a random sequence. If you did not ", 251 | "explicitly create this file, it can be removed.", 252 | call. = FALSE) 253 | file.rename(from = outputRmd, 254 | to = paste0(outputRmd, "_conflicting_", Sys.Date(), "_", 255 | round(1e6*runif(1)))) 256 | } 257 | } 258 | file.copy(from = templateFile, to = outputRmd, overwrite = FALSE) 259 | } else { 260 | stop("The Rmd template file ", templateFile, " does not exist.", 261 | call. = FALSE) 262 | } 263 | 264 | ## ---------------------------------------------------------------------- ## 265 | ## ----------------------- Process the arguments ------------------------ ## 266 | ## ---------------------------------------------------------------------- ## 267 | 268 | args <- list(...) 269 | args$input <- outputRmd 270 | args$output_format <- outputFormat 271 | args$output_file <- outputFile 272 | args$quiet <- !knitrProgress 273 | 274 | ## ---------------------------------------------------------------------- ## 275 | ## ------------------------ Render the report --------------------------- ## 276 | ## ---------------------------------------------------------------------- ## 277 | 278 | outputFile <- do.call("render", args = args) 279 | 280 | ## ---------------------------------------------------------------------- ## 281 | ## --------------------- Remove temporary file -------------------------- ## 282 | ## ---------------------------------------------------------------------- ## 283 | 284 | file.remove(outputRmd) 285 | 286 | invisible(outputFile) 287 | } 288 | -------------------------------------------------------------------------------- /scripts/install_pkgs.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in seq_len(length(args))) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | print(outtxt) 7 | print(annotation) 8 | print(organism) 9 | print(ncores) 10 | 11 | (mirror <- getOption("repos")) 12 | 13 | ## Function to install packages that are not installed 14 | usePackage <- function(pkgs) { 15 | 16 | ## Install BiocManager package 17 | isBiocM <- "BiocManager" %in% installed.packages()[, 1] 18 | if (!isBiocM) { 19 | install.packages("BiocManager", repos = "http://cran.rstudio.com/", 20 | lib = .libPaths()[1]) 21 | } 22 | 23 | ## Check that Bioc is new enough 24 | if (BiocManager::version() < '3.12') { 25 | stop("Bioconductor release 3.12 or newer is required ", 26 | "for this version of ARMOR.") 27 | } 28 | 29 | ## Install the other packages 30 | isInstalled <- pkgs %in% installed.packages(lib.loc = .libPaths()[1])[, 1] 31 | BiocManager::install(pkgs[!isInstalled], 32 | update = FALSE, dependencies = TRUE, 33 | lib = .libPaths()[1], Ncpus = as.integer(ncores)) 34 | 35 | pkg.load <- lapply(pkgs, FUN = function(x) { 36 | x[!(x %in% installed.packages(.libPaths()[1])[, "Package"])] 37 | }) 38 | 39 | if (length(unlist(pkg.load)) == 0) { 40 | 
cat("All required packages are installed \n") 41 | } else { 42 | cat(unlist(pkg.load), ": failed to install") 43 | } 44 | 45 | ## Test whether packages could be loaded successfully 46 | suppressPackageStartupMessages( 47 | lapply(pkgs, library, character.only = TRUE) 48 | ) 49 | 50 | sink(outtxt) 51 | cat("packages loaded successfully: \n", 52 | pkgs[pkgs %in% loadedNamespaces()]) 53 | sink() 54 | } 55 | 56 | 57 | paths <- .libPaths() 58 | print(paths) 59 | 60 | ## Install packages 61 | pkgs.use <- c("dplyr", "ggplot2", "tidyr", "remotes", "limma", "edgeR", 62 | "S4Vectors", "DRIMSeq", "SingleCellExperiment", "tximeta", 63 | "msigdbr", "rmarkdown") 64 | 65 | 66 | if (annotation == "Gencode") { 67 | if (organism == "Homo_sapiens") { 68 | pkgs.extra = "org.Hs.eg.db" 69 | } else { 70 | pkgs.extra = "org.Mm.eg.db" 71 | } 72 | pkgs.use <- c(pkgs.use, pkgs.extra) 73 | } 74 | 75 | 76 | usePackage(pkgs = pkgs.use) 77 | 78 | 79 | ## Session info 80 | sessionInfo() 81 | date() 82 | 83 | -------------------------------------------------------------------------------- /scripts/list_packages.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in seq_len(length(args))) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | ## List the R version and all packages used in the analyses together with the 7 | ## version, by parsing the files in the "Routdir" directory. The results are 8 | ## written to the "outtxt" text file. 9 | 10 | print(Routdir) 11 | print(outtxt) 12 | 13 | lf <- list.files(Routdir) 14 | all_packages <- c() 15 | for (f in lf) { 16 | x <- readLines(paste0(Routdir, "/", f)) 17 | idx1 <- which(x == "> sessionInfo()") 18 | idx2 <- which(x == "other attached packages:") 19 | idx3 <- which(x == "loaded via a namespace (and not attached):") 20 | if (length(idx1) != 0 & length(idx2) != 0 & length(idx3) != 0) { 21 | all_packages <- 22 | unique(c(all_packages, x[idx1 + 1], 23 | do.call(c, lapply((idx2 + 1):(idx3 - 2), function(i) { 24 | grep("\\[", setdiff(setdiff(strsplit(x[i], " ")[[1]], " "), ""), 25 | value = TRUE, invert = TRUE) 26 | })))) 27 | } 28 | } 29 | write.table(sort(all_packages), file = outtxt, 30 | row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t") 31 | -------------------------------------------------------------------------------- /scripts/prepare_shiny.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Shiny preparation" 3 | author: "" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | theme: yeti 10 | highlight: tango 11 | code_folding: show 12 | keep_md: true 13 | editor_options: 14 | chunk_output_type: console 15 | --- 16 | 17 | ```{r shiny-setup, include=FALSE} 18 | knitr::opts_chunk$set(echo = TRUE, dev = c("png", "pdf")) 19 | ``` 20 | 21 | # Introduction 22 | 23 | This script prepares the data to be used for shiny app. 
24 | 25 | # Load packages 26 | 27 | ```{r shiny-load-pkg} 28 | suppressPackageStartupMessages({ 29 | library(tibble) 30 | library(dplyr) 31 | library(tidyr) 32 | library(limma) 33 | library(edgeR) 34 | library(reshape2) 35 | library(SingleCellExperiment) 36 | library(S4Vectors) 37 | }) 38 | ``` 39 | 40 | # Load data 41 | 42 | ```{r shiny-load-data} 43 | options(ucscChromosomeNames = FALSE) 44 | sg <- se$sg 45 | st <- se$st 46 | ``` 47 | 48 | # Gene models 49 | 50 | ```{r shiny-gene-model} 51 | create_genemodels <- function(genemodels) { 52 | idx <- match(c("transcript_id", "gene_id", "exon_id"), 53 | colnames(mcols(genemodels))) 54 | colnames(mcols(genemodels))[idx] <- c("transcript", "gene", "exon") 55 | mcols(genemodels)$symbol <- mcols(genemodels)$transcript 56 | subset(genemodels, type == "exon") 57 | } 58 | 59 | if (!is.null(gtffile)) { 60 | genemodels <- create_genemodels(genemodels) 61 | } else { 62 | genemodels <- NULL 63 | } 64 | ``` 65 | 66 | # Vector with bigWig file names 67 | 68 | ```{r shiny-bigwig} 69 | if (!is.null(bigwigdir)) { 70 | bwfiles <- normalizePath(list.files(bigwigdir, pattern = "\\.bw$", 71 | full.names = TRUE)) 72 | names(bwfiles) <- gsub("_Aligned.sortedByCoord.out.bw", "", basename(bwfiles)) 73 | } else { 74 | bwfiles <- NA 75 | } 76 | ``` 77 | 78 | # edgeR - gene-level MDS 79 | 80 | ```{r shiny-MDS} 81 | logcpms <- assay(sg, "logcpm") 82 | mds <- limma::plotMDS(logcpms, top = 500, labels = NULL, pch = NULL, 83 | cex = 1, dim.plot = c(1, 2), ndim = min(7, ncol(logcpms) - 1), 84 | gene.selection = "common", 85 | xlab = NULL, ylab = NULL, plot = FALSE) 86 | if (!is.null(mds$cmdscale.out)) { 87 | ## Bioc 3.12 and earlier 88 | mds <- mds$cmdscale.out 89 | colnames(mds) <- paste0("MDS", seq_len(ncol(mds))) 90 | mds <- as.data.frame(mds) %>% tibble::rownames_to_column(var = "names") 91 | } else { 92 | mds <- data.frame(names = colnames(logcpms), 93 | MDS1 = mds$x, 94 | MDS2 = mds$y) 95 | } 96 | mds <- mds %>% 97 | dplyr::full_join(data.frame(colData(sg)), by = "names") 98 | ``` 99 | 100 | # SingleCellExperiment on gene level 101 | 102 | The `rowData` of `sce_gene` includes the gene information and the result tables 103 | from `edgeR` and `DRIMSeq`. Each result table is stored as a column, and the 104 | column name is composed by `edgeR:` or `DRIMSeq:` and the name of the contrast 105 | used. 106 | 107 | The `colData` of `sce_gene` stores the sample information, the bigWig file names 108 | and condition information 109 | 110 | The multidimensional scale data is stored in `reducedDims`. 111 | 112 | ```{r shiny-sce-gene} 113 | nam <- colData(sg)$names 114 | 115 | ## low dimensional representation 116 | reducedData <- mds %>% 117 | dplyr::arrange(match(names, nam)) %>% 118 | as.data.frame() %>% 119 | dplyr::mutate(namestmp = names) %>% 120 | tibble::column_to_rownames("namestmp") %>% 121 | dplyr::select(-one_of(colnames(colData(sg)))) 122 | reducedData <- as.matrix(reducedData) 123 | 124 | ## column data 125 | colData(sg)$bwFiles <- bwfiles[nam] 126 | 127 | sce_gene <- SingleCellExperiment(assays = assays(sg), 128 | rowData = rowData(sg), 129 | colData = colData(sg), 130 | metadata = list(geneModels = genemodels), 131 | reducedDims = SimpleList(MDS = reducedData)) 132 | ``` 133 | 134 | # SingleCellExperiment on transcript level 135 | 136 | The `rowData` of `sce_tx` includes the information of genes and transcripts, 137 | and the result table on the transcript level from `DRIMSeq`. 
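Assuming the transcript-level result columns follow the same `DRIMSeq:<contrast>` naming convention described above for the gene level, they can be listed directly from the `rowData` (an illustrative, non-evaluated sketch):

```{r shiny-sce-tx-results-sketch, eval = FALSE}
## Illustrative only: list any DRIMSeq result columns attached to the
## transcript-level rowData (column naming convention assumed, see text above)
grep("^DRIMSeq:", colnames(rowData(st)), value = TRUE)
```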
138 | 139 | The `colData` of `sce_tx` stores the sample information, the bigWig file names 140 | and condition information. 141 | 142 | ```{r shiny-sce-tx} 143 | nam <- colData(st)$names 144 | 145 | ## column data 146 | colData(st)$bwFiles <- bwfiles[nam] 147 | 148 | sce_tx <- SingleCellExperiment(assays = assays(st), 149 | rowData = rowData(st), 150 | colData = colData(st), 151 | metadata = list(geneModels = genemodels)) 152 | ``` 153 | 154 | # Output results 155 | 156 | ```{r shiny-save-sce} 157 | saveRDS(list(sce_tx = sce_tx, 158 | sce_gene = sce_gene), 159 | file = "shiny_sce.rds") 160 | ``` 161 | 162 | # Session info 163 | 164 | The analyses above were performed with the following package versions: 165 | 166 | ```{r shiny-session-info} 167 | sessionInfo() 168 | date() 169 | ``` 170 | 171 | -------------------------------------------------------------------------------- /scripts/run_render.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in seq_len(length(args))) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | ## Mandatory arguments 7 | print(se) 8 | print(rmdtemplate) 9 | print(outputdir) 10 | print(outputfile) 11 | 12 | ## Arguments that are only used for some of the reports 13 | if (exists("organism")) { 14 | print(organism) 15 | } else { 16 | organism <- NULL 17 | } 18 | 19 | if (exists("design")) { 20 | print(design) 21 | } else { 22 | design <- NULL 23 | } 24 | 25 | if (exists("contrast")) { 26 | contrast <- strsplit(gsub(" ","",contrast), ",")[[1]] 27 | print(contrast) 28 | } else { 29 | contrast <- NULL 30 | } 31 | 32 | if (exists("genesets")) { 33 | genesets <- strsplit(gsub(" ","",genesets), ",")[[1]] 34 | print(genesets) 35 | } else { 36 | genesets <- NULL 37 | } 38 | 39 | if (exists("gtffile")) { 40 | print(gtffile) 41 | } else { 42 | gtffile <- NULL 43 | } 44 | 45 | if (exists("ncores")) { 46 | ncores <- as.numeric(ncores) 47 | if(is.na(ncores)) 48 | ncores <- 1 49 | print(ncores) 50 | } else { 51 | ncores <- 1 52 | } 53 | 54 | if (exists("bigwigdir")) { 55 | bigwigdir <- normalizePath(bigwigdir) 56 | print(bigwigdir) 57 | } else { 58 | bigwigdir <- NULL 59 | } 60 | 61 | source("scripts/generate_report.R") 62 | 63 | generateReport(se = se, organism = organism, gtffile = gtffile, 64 | contrast = contrast, design = design, genesets = genesets, 65 | bigwigdir = bigwigdir, rmdTemplate = rmdtemplate, 66 | outputDir = outputdir, outputFile = outputfile, ncores = ncores, 67 | forceOverwrite = TRUE, showCode = TRUE) 68 | -------------------------------------------------------------------------------- /scripts/run_tximeta.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in seq_len(length(args))) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(tximport) 9 | library(tximeta) 10 | library(SingleCellExperiment) 11 | }) 12 | 13 | print(salmondir) 14 | print(json) 15 | print(metafile) 16 | print(outrds) 17 | print(annotation) 18 | print(organism) 19 | 20 | ## Load json linkedTxome 21 | loadLinkedTxome(json) 22 | 23 | ## Read metadata 24 | metadata <- read.delim(metafile, header = TRUE, as.is = TRUE, sep = "\t") 25 | 26 | ## List Salmon directories 27 | salmonfiles <- paste0(salmondir, "/", metadata$names, "/quant.sf") 28 | names(salmonfiles) <- metadata$names 29 | 30 | ## Add file column to metadata and import annotated abundances 31 | ## 
In transcript level 32 | coldata <- cbind(metadata, files = salmonfiles, stringsAsFactors = FALSE) 33 | st <- tximeta::tximeta(coldata) 34 | 35 | ## Summarize to gene level 36 | sg <- summarizeToGene(st) 37 | 38 | ## If the 'entrezid' column exists and is a list, convert to a vector 39 | if ("entrezid" %in% colnames(rowData(sg)) && 40 | is(rowData(sg)$entrezid, "list")) { 41 | if (any(vapply(rowData(sg)$entrezid, length, 1) > 1)) { 42 | warning("Some elements of rowData(sg)$entrezid consisted of ", 43 | "more than one object. Only the first one is retained.") 44 | } 45 | rowData(sg)$entrezid <- vapply( 46 | rowData(sg)$entrezid, 47 | function(w) w[[1]], 48 | as(NA, class(rowData(sg)$entrezid[[1]])) 49 | ) 50 | } 51 | 52 | ## Add gene_names for Gencode reference 53 | if(annotation == "Gencode") { 54 | if(organism == "Homo_sapiens") { 55 | library(org.Hs.eg.db) 56 | } else { 57 | library(org.Mm.eg.db) 58 | } 59 | sg <- tximeta::addIds(sg, "SYMBOL", gene = TRUE) 60 | rowData(sg)$gene_name <- rowData(sg)$SYMBOL 61 | } 62 | 63 | ## If rowData(st)$gene_id is a CharacterList, convert it to character to allow 64 | ## the joining below 65 | if (is(rowData(st)$gene_id, "CharacterList")) { 66 | if (any(vapply(rowData(st)$gene_id, length, 1) > 1)) { 67 | warning("Some elements of rowData(st)$gene_id consisted of more than one", 68 | "object. Only the first one is retained.") 69 | } 70 | rowData(st)$gene_id <- vapply(rowData(st)$gene_id, function(w) w[[1]], "") 71 | } 72 | 73 | ## If rowData(st)$tx_id is of class integer, replace it with the tx_name 74 | ## column 75 | if (is(rowData(st)$tx_id, "integer")) { 76 | rowData(st)$tx_id <- rowData(st)$tx_name 77 | } 78 | 79 | ## Add gene information, e.g. gene_name, entrezid, ... (if provided) to 80 | ## transcript-level SE 81 | rowData(st) <- rowData(st) %>% 82 | data.frame() %>% 83 | dplyr::left_join(data.frame(rowData(sg))) %>% 84 | DataFrame() 85 | 86 | ## Change the row names in sg to have geneID__geneSymbol 87 | rownames(sg) <- paste(rowData(sg)$gene_id, rowData(sg)$gene_name, sep = "__") 88 | 89 | # Coerce the object from SummarizedExperiment to SingleCellExperiment 90 | st <- as(st, "SingleCellExperiment") 91 | sg <- as(sg, "SingleCellExperiment") 92 | 93 | saveRDS(list(st = st, sg = sg), file = outrds) 94 | 95 | sessionInfo() 96 | date() 97 | 98 | 99 | -------------------------------------------------------------------------------- /version: -------------------------------------------------------------------------------- 1 | 1.5.10 (2024-09-21) 2 | --------------------------------------------------------------------------------
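The R scripts above (`generate_linkedtxome.R`, `install_pkgs.R`, `list_packages.R`, `run_render.R`, `run_tximeta.R`) share the same argument-passing convention: each trailing command-line argument is a snippet of R code, typically of the form `name='value'`, which is parsed and evaluated to define a variable in the session. A minimal sketch of the idiom (the invocation and paths are illustrative only; within the workflow the argument strings are assembled by the Snakemake rules):

```r
## Sketch of the argument convention shared by the scripts in scripts/.
## Illustrative invocation (not the workflow's actual command line):
##   Rscript scripts/list_packages.R "Routdir='output/Rout'" "outtxt='packages.txt'"
args <- commandArgs(trailingOnly = TRUE)
for (i in seq_len(length(args))) {
  eval(parse(text = args[[i]]))  # defines, e.g., Routdir and outtxt
}
```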