├── LICENSE ├── Makefile ├── README.md ├── Rscripts ├── extract_observed_junctions.R ├── filter_fastq.R ├── generate_tx2gene.R ├── get_nbr_reads.R ├── get_nbr_reads_NA12878public.R ├── get_polyA_tail_lengths.R ├── get_reads_aligning_to_genome_but_not_txome.R ├── gff2gtf.R ├── list_packages.R ├── merge_promoter_bed_files.R ├── plot_abundances.R ├── plot_abundances_template.Rmd ├── plot_compare_flair_annotation.R ├── plot_distance_primary_supplementary_alignments.R ├── plot_eq_class_counts.R ├── plot_qc.R ├── summarize_abundances.R ├── summarize_abundances_illumina.R └── summarize_star_sjdbs.R ├── datasets_to_include.mk ├── manuscript_results_figures └── Rscripts │ ├── plot_alignment.R │ ├── plot_compare_gffcompare_sqanti.R │ ├── plot_consistency_between_replicates.R │ ├── plot_flair_round2_summary.R │ ├── plot_flair_summary.R │ ├── plot_gc_content_genomenotxome.R │ ├── plot_illumina_nanopore_lengths_all_datasets.R │ ├── plot_nbr_assigned_reads.R │ ├── plot_nbr_assigned_reads_biotype.R │ ├── plot_nbr_detected_features.R │ ├── plot_nbr_junctions_per_read_illumina.R │ ├── plot_nbr_reads.R │ ├── plot_nbr_tx_in_eqclass.R │ ├── plot_observed_junctions.R │ ├── plot_palindromes.R │ ├── plot_polya_tail_estimates.R │ ├── plot_primary_supplementary.R │ ├── plot_rseqc.R │ ├── plot_salmon_variants_all_datasets.R │ ├── plot_sqanti_round2_summary.R │ ├── plot_sqanti_summary.R │ ├── plot_txcov_sirv.R │ ├── plot_txcov_subsampling.R │ ├── plot_txlength_vs_basecoverage.R │ └── remap_sample_names.R └── reference └── chromhmm_promoters └── hg38_promoters.bed /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Charlotte Soneson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## A comprehensive examination of Nanopore native RNA sequencing for characterisation of complex transcriptomes 2 | 3 | This repository contains the code used to perform the analysis and generate the figures in the paper 4 | 5 | * C Soneson, Y Yao, A Bratus-Neuenschwander, A Patrignani, MD Robinson, S Hussain: A comprehensive examination of Nanopore native RNA sequencing for characterization of complex transcriptomes 6 | 7 | The raw data has been uploaded to ArrayExpress, with accession numbers E-MTAB-7757 and E-MTAB-7778. 8 | -------------------------------------------------------------------------------- /Rscripts/extract_observed_junctions.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | print(refgtf) 7 | print(bam) 8 | print(paired) 9 | print(outrds) 10 | 11 | suppressPackageStartupMessages({ 12 | library(GenomicAlignments) 13 | library(SGSeq) 14 | library(GenomicFeatures) 15 | }) 16 | 17 | ## Extract annotated junctions 18 | txdb <- GenomicFeatures::makeTxDbFromGFF(refgtf) 19 | txf <- SGSeq::convertToTxFeatures(txdb) 20 | txf <- txf[SGSeq::type(txf) == "J"] 21 | start(txf) <- start(txf) + 1 22 | end(txf) <- end(txf) - 1 23 | 24 | ## Get junctions from alignments 25 | if (!paired) { 26 | x <- GenomicAlignments::readGAlignments(bam, use.names = TRUE) 27 | } else { 28 | x <- GenomicAlignments::readGAlignmentPairs(bam, use.names = TRUE) 29 | } 30 | juncSum <- GenomicAlignments::summarizeJunctions(x) 31 | 32 | ## To compare with later: number of exact matches of observed junctions among 33 | ## the annotated ones. 34 | table(ranges(juncSum) %in% ranges(txf)) 35 | 36 | ## Find closest annotated junction to each observed junction 37 | txf_seqnames_char <- as.character(seqnames(txf)) 38 | juncsum_seqnames_char <- as.character(seqnames(juncSum)) 39 | txf_starts <- start(txf) 40 | txf_ends <- end(txf) 41 | juncsum_starts <- start(juncSum) 42 | juncsum_ends <- end(juncSum) 43 | juncdists <- do.call(rbind, lapply(seq_along(juncSum), function(i) { 44 | if (i%%1000 == 0) print(i) 45 | idx <- which(txf_seqnames_char == juncsum_seqnames_char[i]) 46 | dists <- abs(juncsum_starts[i] - txf_starts[idx]) + 47 | abs(juncsum_ends[i] - txf_ends[idx]) 48 | tryCatch({ 49 | data.frame(juncSumIdx = i, whichMin = idx[which.min(dists)], minDist = min(dists)) 50 | }, error = function(e) { 51 | data.frame(juncSumIdx = i, whichMin = NA_integer_, minDist = Inf) 52 | }) 53 | })) 54 | 55 | rtmp <- ranges(txf) 56 | l <- length(rtmp) 57 | rtmp <- c(rtmp, IRanges(start = 0, end = 0)) 58 | jtmp <- juncdists 59 | jtmp$whichMin[is.na(jtmp$whichMin)] <- l + 1 60 | mcols(juncSum)$closestRefJunc <- rtmp[jtmp$whichMin] 61 | mcols(juncSum)$distToClosestRefJunc <- juncdists$minDist 62 | 63 | saveRDS(juncSum, file = outrds) 64 | date() 65 | sessionInfo() 66 | -------------------------------------------------------------------------------- /Rscripts/filter_fastq.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | ## Remove reads that don't have _t or _c in the end of their identifier 7 | 8 | print(fastqin) 9 | print(fastqout) 10 | 11 | suppressPackageStartupMessages(library(ShortRead)) 12 | 13 | x <- readFastq(fastqin) 14 | nm <- as.character(x@id) 15 | keep <- grep("_t |_c ", nm) 16 | 17 | x <- x[keep] 18 | 19 | writeFastq(x, file = fastqout, mode = "w", compress = TRUE) 20 | 21 | sessionInfo() 22 | date() 23 | -------------------------------------------------------------------------------- /Rscripts/generate_tx2gene.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | ## Parse a transcript fasta file and optionally a gtf file from Ensembl and 7 | ## extract information about the transcripts. 8 | 9 | suppressPackageStartupMessages({ 10 | library(Biostrings) 11 | library(rtracklayer) 12 | }) 13 | 14 | print(transcriptome) 15 | print(gtf) 16 | print(outrds) 17 | 18 | ## Read input files 19 | transcriptome <- readDNAStringSet(transcriptome) 20 | if (!is.null(gtf)) gtf0 <- import(gtf) 21 | 22 | ## Extract information from fasta file identifiers 23 | tx2gene <- data.frame(t(sapply(as.character(names(transcriptome)), function(nm) { 24 | a <- strsplit(nm, " ")[[1]] 25 | tx <- a[1] 26 | gene <- gsub("^gene:", "", a[grep("^gene:", a)]) 27 | symbol <- gsub("^gene_symbol:", "", a[grep("^gene_symbol:", a)]) 28 | gene_biotype <- gsub("^gene_biotype:", "", a[grep("^gene_biotype:", a)]) 29 | tx_biotype <- gsub("^transcript_biotype:", "", a[grep("^transcript_biotype:", a)]) 30 | position <- gsub("chromosome:|scaffold:", "", a[grep("^chromosome:|^scaffold:", a)]) 31 | c(tx = ifelse(length(tx) != 0, tx, NA), 32 | gene = ifelse(length(gene) != 0, gene, NA), 33 | symbol = ifelse(length(symbol) != 0, symbol, NA), 34 | gene_biotype = ifelse(length(gene_biotype) != 0, gene_biotype, NA), 35 | tx_biotype = ifelse(length(tx_biotype) != 0, tx_biotype, NA), 36 | chromosome = ifelse(length(position) != 0, strsplit(position, ":")[[1]][2], NA), 37 | start = ifelse(length(position) != 0, strsplit(position, ":")[[1]][3], NA), 38 | end = ifelse(length(position) != 0, strsplit(position, ":")[[1]][4], NA), 39 | strand = ifelse(length(position) != 0, strsplit(position, ":")[[1]][5], NA)) 40 | })), stringsAsFactors = FALSE) 41 | rownames(tx2gene) <- NULL 42 | 43 | ## Add information about transcript length 44 | tx2gene$txlength <- width(transcriptome) 45 | 46 | if (!is.null(gtf)) { 47 | ## Add information from gtf file 48 | gtf <- subset(gtf0, type == "transcript") 49 | gtf$transcript_id_with_version <- paste0(gtf$transcript_id, ".", gtf$transcript_version) 50 | ## Keep only transcripts that are not present in the fasta file 51 | gtf <- subset(gtf, !(transcript_id_with_version %in% tx2gene$tx)) 52 | gtfout <- data.frame(tx = gtf$transcript_id_with_version, 53 | gene = paste0(gtf$gene_id, ".", gtf$gene_version), 54 | symbol = gtf$gene_name, 55 | gene_biotype = gtf$gene_biotype, 56 | tx_biotype = gtf$transcript_biotype, 57 | chromosome = seqnames(gtf), 58 | start = start(gtf), 59 | end = end(gtf), 60 | strand = strand(gtf), 61 | stringsAsFactors = FALSE) 62 | 63 | ## Get transcript length by adding up exon lengths for each transcript 64 | gtfe <- subset(gtf0, type == "exon" & 65 | paste0(transcript_id, ".", transcript_version) %in% gtfout$tx) 66 | gtfout$txlength <- sapply(gtfout$tx, function(m) { 67 | sum(width(subset(gtfe, paste0(transcript_id, ".", transcript_version) == m))) 68 | }) 69 | 70 | ## Merge information from fasta and gtf 71 | stopifnot(all(colnames(tx2gene) == colnames(gtfout))) 72 | tx2gene <- rbind(tx2gene, gtfout) 73 | } 74 | 75 | saveRDS(tx2gene, file = outrds) 76 | 77 | sessionInfo() 78 | date() 79 | -------------------------------------------------------------------------------- /Rscripts/get_nbr_reads.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | ## Catalog the number of nanopore reads 7 | 8 | print(fastqdir) 9 | print(genomebamdir) 10 | print(txomebamdir) 11 | print(txomebamdir_p0.99) 12 | print(outrds) 13 | 14 | suppressPackageStartupMessages({ 15 | library(ShortRead) 16 | library(GenomicAlignments) 17 | library(dplyr) 18 | library(ggplot2) 19 | library(data.table) 20 | }) 21 | 22 | ## ========================================================================== ## 23 | ## Help functions 24 | ## ========================================================================== ## 25 | readBam <- function(bamfile) { 26 | bam <- readGAlignments(bamfile, use.names = TRUE, 27 | param = ScanBamParam(tag = c("NM"), 28 | what = c("qname","flag", "rname", 29 | "pos", "mapq"))) 30 | ops <- GenomicAlignments::CIGAR_OPS 31 | wdths <- GenomicAlignments::explodeCigarOpLengths(cigar(bam), ops = ops) 32 | keep.ops <- GenomicAlignments::explodeCigarOps(cigar(bam), ops = ops) 33 | explodedcigars <- IRanges::CharacterList(relist(paste0(unlist(wdths), 34 | unlist(keep.ops)), wdths)) 35 | for (opts in setdiff(GenomicAlignments::CIGAR_OPS, "=")) { 36 | mcols(bam)[[paste0("nbr", opts)]] <- 37 | sapply(explodedcigars, function(cg) sum(as.numeric(gsub(paste0(opts, "$"), "", cg)), na.rm = TRUE)) 38 | } 39 | mcols(bam)$readLength <- rowSums(as.matrix(mcols(bam)[, c("nbrS", "nbrH", "nbrM", "nbrI")])) 40 | bam 41 | } 42 | 43 | makeReadDf <- function(bam) { 44 | tmp <- data.frame(bam %>% setNames(NULL), stringsAsFactors = FALSE) %>% 45 | dplyr::rename(read = qname, 46 | nbrJunctions = njunc) %>% 47 | dplyr::select(-cigar) %>% 48 | dplyr::mutate(alignedLength = nbrM + nbrI) ## equivalent to readLength-nbrS-nbrH 49 | 50 | tmp2 <- as.data.frame(table(names(subset(bam, flag %in% c(0, 16))))) 51 | if (nrow(tmp2) == 0) tmp2 <- data.frame(Var1 = tmp$read[1], Freq = 0) 52 | tmp <- tmp %>% 53 | dplyr::left_join(tmp2 %>% dplyr::rename(read = Var1, nbrPrimaryAlignments = Freq)) 54 | 55 | tmp3 <- as.data.frame(table(names(subset(bam, flag %in% c(256, 272))))) 56 | if (nrow(tmp3) == 0) tmp3 <- data.frame(Var1 = tmp$read[1], Freq = 0) 57 | tmp <- tmp %>% 58 | dplyr::left_join(tmp3 %>% dplyr::rename(read = Var1, nbrSecondaryAlignments = Freq)) 59 | 60 | tmp4 <- as.data.frame(table(names(subset(bam, flag %in% c(2048, 2064))))) 61 | if (nrow(tmp4) == 0) tmp4 <- data.frame(Var1 = tmp$read[1], Freq = 0) 62 | tmp <- tmp %>% 63 | dplyr::left_join(tmp4 %>% dplyr::rename(read = Var1, nbrSupplementaryAlignments = Freq)) 64 | 65 | tmp %>% dplyr::mutate(nbrSecondaryAlignments = replace(nbrSecondaryAlignments, 66 | is.na(nbrSecondaryAlignments), 0), 67 | nbrSupplementaryAlignments = replace(nbrSupplementaryAlignments, 68 | is.na(nbrSupplementaryAlignments), 0)) 69 | } 70 | 71 | makeSummaryList <- function(bam) { 72 | list(nAlignments = length(bam), 73 | nAlignedReads = length(unique(names(bam))), 74 | nPrimaryAlignments = length(subset(bam, flag %in% c(0, 16))), 75 | nReadsWithPrimaryAlignments = length(unique(names(subset(bam, flag %in% c(0, 16))))), 76 | nSecondaryAlignments = length(subset(bam, flag %in% c(256, 272))), 77 | nReadsWithSecondaryAlignments = length(unique(names(subset(bam, flag %in% c(256, 272))))), 78 | nSupplementaryAlignments = length(subset(bam, flag %in% c(2048, 2064))), 79 | nReadsWithSupplementaryAlignments = length(unique(names(subset(bam, flag %in% c(2048, 2064))))) 80 | ) 81 | } 82 | 83 | ## ========================================================================== ## 84 | ## Read individual FASTQ files and get number of reads and read lengths 85 | fastqfiles <- list.files(fastqdir, pattern = "(FASTQ|fastq)\\.gz$", full.names = TRUE) 86 | names(fastqfiles) <- gsub("\\.(FASTQ|fastq).gz", "", basename(fastqfiles)) 87 | fastqfiles 88 | fastqs <- lapply(fastqfiles, function(f) { 89 | fastq <- fread(paste0("zcat ", f), sep = "\n", header = FALSE)$V1 90 | seq_idxs <- seq(2, length(fastq), by = 4) 91 | list(nReads = length(seq_idxs), 92 | reads = do.call(rbind, lapply(seq_idxs, function(i) { 93 | data.frame(read = gsub("^@", "", strsplit(fastq[i - 1], " ")[[1]][1]), 94 | readLength = nchar(fastq[i]), 95 | aveBaseQuality = mean(as.numeric(charToRaw(fastq[i + 2])) - 33), 96 | stringsAsFactors = FALSE) 97 | }))) 98 | }) 99 | 100 | ## BAM files, genome alignment 101 | bamfiles <- list.files(genomebamdir, pattern = "_minimap_genome_s.bam$", 102 | recursive = TRUE, full.names = TRUE) 103 | names(bamfiles) <- gsub("_minimap_genome_s.bam", "", basename(bamfiles)) 104 | bamfiles 105 | genomebams <- lapply(bamfiles, function(f) { 106 | bam <- readBam(f) 107 | tmp <- makeReadDf(bam) 108 | c(makeSummaryList(bam), 109 | list(allAlignments = tmp)) 110 | }) 111 | for (n in names(genomebams)) { 112 | genomebams[[n]][["allAlignments"]]$sample <- n 113 | } 114 | 115 | ## BAM files, transcriptome alignment 116 | bamfilestx <- list.files(txomebamdir, pattern = "_minimap_txome.bam$", 117 | recursive = TRUE, full.names = TRUE) 118 | names(bamfilestx) <- gsub("_minimap_txome.bam", "", basename(bamfilestx)) 119 | bamfilestx 120 | txomebams <- lapply(bamfilestx, function(f) { 121 | bam <- readBam(f) 122 | tmp <- makeReadDf(bam) 123 | 124 | ## Read transcript lengths 125 | bamheader <- scanBamHeader(f)[[1]]$targets 126 | txlengths <- data.frame(rname = names(bamheader), 127 | txLength = bamheader, 128 | stringsAsFactors = FALSE) 129 | 130 | c(makeSummaryList(bam), 131 | list(allAlignments = tmp %>% dplyr::left_join(txlengths, by = "rname"), 132 | nTxCoveredByPrimary = length(unique(seqnames(subset(bam, flag %in% c(0, 16))))))) 133 | }) 134 | for (n in names(txomebams)) { 135 | txomebams[[n]][["allAlignments"]]$sample <- n 136 | } 137 | 138 | ## BAM files, transcriptome alignment, -p 0.99 139 | bamfilestx_p0.99 <- list.files(txomebamdir_p0.99, pattern = "_minimap_txome_p0.99.bam$", 140 | recursive = TRUE, full.names = TRUE) 141 | names(bamfilestx_p0.99) <- gsub("_minimap_txome_p0.99.bam", "", 142 | basename(bamfilestx_p0.99)) 143 | bamfilestx_p0.99 144 | txomebams_p0.99 <- lapply(bamfilestx_p0.99, function(f) { 145 | bam <- readBam(f) 146 | tmp <- makeReadDf(bam) 147 | 148 | ## Read transcript lengths 149 | bamheader <- scanBamHeader(f)[[1]]$targets 150 | txlengths <- data.frame(rname = names(bamheader), 151 | txLength = bamheader, 152 | stringsAsFactors = FALSE) 153 | 154 | c(makeSummaryList(bam), 155 | list(allAlignments = tmp %>% dplyr::left_join(txlengths, by = "rname"), 156 | nTxCoveredByPrimary = length(unique(seqnames(subset(bam, flag %in% c(0, 16))))))) 157 | }) 158 | for (n in names(txomebams_p0.99)) { 159 | txomebams_p0.99[[n]][["allAlignments"]]$sample <- n 160 | } 161 | 162 | nReadTable <- 163 | data.frame(sample = names(fastqs), 164 | nReads = sapply(fastqs, function(w) w$nReads), 165 | stringsAsFactors = FALSE) %>% 166 | dplyr::full_join(data.frame( 167 | sample = names(genomebams), 168 | nAlignedReadsGenome = sapply(genomebams, function(w) w$nAlignedReads), 169 | nAlignmentsGenome = sapply(genomebams, function(w) w$nAlignments), 170 | nPrimaryAlignmentsGenome = sapply(genomebams, function(w) w$nPrimaryAlignments), 171 | nReadsWithPrimaryAlignmentsGenome = sapply(genomebams, function(w) w$nReadsWithPrimaryAlignments), 172 | nSecondaryAlignmentsGenome = sapply(genomebams, function(w) w$nSecondaryAlignments), 173 | nReadsWithSecondaryAlignmentsGenome = sapply(genomebams, function(w) w$nReadsWithSecondaryAlignments), 174 | nSupplementaryAlignmentsGenome = sapply(genomebams, function(w) w$nSupplementaryAlignments), 175 | nReadsWithSupplementaryAlignmentsGenome = sapply(genomebams, function(w) w$nReadsWithSupplementaryAlignments), 176 | stringsAsFactors = FALSE)) %>% 177 | dplyr::full_join(data.frame( 178 | sample = names(txomebams), 179 | nAlignedReadsTxome = sapply(txomebams, function(w) w$nAlignedReads), 180 | nAlignmentsTxome = sapply(txomebams, function(w) w$nAlignments), 181 | nTxCoveredByPrimary = sapply(txomebams, function(w) w$nTxCoveredByPrimary), 182 | nPrimaryAlignmentsTxome = sapply(txomebams, function(w) w$nPrimaryAlignments), 183 | nReadsWithPrimaryAlignmentsTxome = sapply(txomebams, function(w) w$nReadsWithPrimaryAlignments), 184 | nSecondaryAlignmentsTxome = sapply(txomebams, function(w) w$nSecondaryAlignments), 185 | nReadsWithSecondaryAlignmentsTxome = sapply(txomebams, function(w) w$nReadsWithSecondaryAlignments), 186 | nSupplementaryAlignmentsTxome = sapply(txomebams, function(w) w$nSupplementaryAlignments), 187 | nReadsWithSupplementaryAlignmentsTxome = sapply(txomebams, function(w) w$nReadsWithSupplementaryAlignments), 188 | stringsAsFactors = FALSE)) %>% 189 | dplyr::full_join(data.frame( 190 | sample = names(txomebams_p0.99), 191 | nAlignedReadsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nAlignedReads), 192 | nAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nAlignments), 193 | nTxCoveredByPrimary_p0.99 = sapply(txomebams_p0.99, function(w) w$nTxCoveredByPrimary), 194 | nPrimaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nPrimaryAlignments), 195 | nReadsWithPrimaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nReadsWithPrimaryAlignments), 196 | nSecondaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nSecondaryAlignments), 197 | nReadsWithSecondaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nReadsWithSecondaryAlignments), 198 | nSupplementaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nSupplementaryAlignments), 199 | nReadsWithSupplementaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nReadsWithSupplementaryAlignments), 200 | stringsAsFactors = FALSE)) 201 | 202 | write.table(nReadTable, file = gsub("rds$", "txt", outrds), 203 | row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t") 204 | 205 | saveRDS(list(fastqs = fastqs, genomebams = genomebams, txomebams = txomebams, 206 | txomebams_p0.99 = txomebams_p0.99, nReadTable = nReadTable), 207 | file = outrds) 208 | sessionInfo() 209 | date() 210 | -------------------------------------------------------------------------------- /Rscripts/get_nbr_reads_NA12878public.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | ## Catalog the number of nanopore reads 7 | 8 | print(txomebamdir_p0.99) 9 | print(outrds) 10 | 11 | suppressPackageStartupMessages({ 12 | library(ShortRead) 13 | library(GenomicAlignments) 14 | library(dplyr) 15 | library(ggplot2) 16 | library(data.table) 17 | }) 18 | 19 | ## ========================================================================== ## 20 | ## Help functions 21 | ## ========================================================================== ## 22 | readBam <- function(bamfile) { 23 | bam <- readGAlignments(bamfile, use.names = TRUE, 24 | param = ScanBamParam(tag = c("NM"), 25 | what = c("qname", "flag", "rname", 26 | "pos", "mapq"))) 27 | mcols(bam)$tmp <- seq_len(length(bam)) %/% 5e5 28 | 29 | bamout <- do.call(c, lapply(unique(mcols(bam)$tmp), function(i) { 30 | tmp <- bam[mcols(bam)$tmp == i] 31 | ops <- GenomicAlignments::CIGAR_OPS 32 | wdths <- GenomicAlignments::explodeCigarOpLengths(cigar(tmp), ops = ops) 33 | keep.ops <- GenomicAlignments::explodeCigarOps(cigar(tmp), ops = ops) 34 | explodedcigars <- IRanges::CharacterList(relist(paste0(unlist(wdths), 35 | unlist(keep.ops)), wdths)) 36 | for (opts in setdiff(GenomicAlignments::CIGAR_OPS, "=")) { 37 | mcols(tmp)[[paste0("nbr", opts)]] <- 38 | sapply(explodedcigars, function(cg) sum(as.numeric(gsub(paste0(opts, "$"), "", cg)), na.rm = TRUE)) 39 | } 40 | mcols(tmp)$readLength <- rowSums(as.matrix(mcols(tmp)[, c("nbrS", "nbrH", "nbrM", "nbrI")])) 41 | tmp 42 | })) 43 | 44 | bamout 45 | } 46 | 47 | makeReadDf <- function(bam) { 48 | tmp <- data.frame(bam %>% setNames(NULL), stringsAsFactors = FALSE) %>% 49 | dplyr::rename(read = qname, 50 | nbrJunctions = njunc) %>% 51 | dplyr::select(-cigar) %>% 52 | dplyr::mutate(alignedLength = nbrM + nbrI) ## equivalent to readLength-nbrS-nbrH 53 | 54 | tmp2 <- as.data.frame(table(names(subset(bam, flag %in% c(0, 16))))) 55 | if (nrow(tmp2) == 0) tmp2 <- data.frame(Var1 = tmp$read[1], Freq = 0) 56 | tmp <- tmp %>% 57 | dplyr::left_join(tmp2 %>% dplyr::rename(read = Var1, nbrPrimaryAlignments = Freq)) 58 | 59 | tmp3 <- as.data.frame(table(names(subset(bam, flag %in% c(256, 272))))) 60 | if (nrow(tmp3) == 0) tmp3 <- data.frame(Var1 = tmp$read[1], Freq = 0) 61 | tmp <- tmp %>% 62 | dplyr::left_join(tmp3 %>% dplyr::rename(read = Var1, nbrSecondaryAlignments = Freq)) 63 | 64 | tmp %>% dplyr::mutate(nbrSecondaryAlignments = replace(nbrSecondaryAlignments, 65 | is.na(nbrSecondaryAlignments), 0)) 66 | } 67 | 68 | makeSummaryList <- function(bam) { 69 | list(nAlignments = length(bam), 70 | nAlignedReads = length(unique(names(bam))), 71 | nPrimaryAlignments = length(subset(bam, flag %in% c(0, 16))), 72 | nReadsWithPrimaryAlignments = length(unique(names(subset(bam, flag %in% c(0, 16))))), 73 | nSecondaryAlignments = length(subset(bam, flag %in% c(256, 272))), 74 | nReadsWithSecondaryAlignments = length(unique(names(subset(bam, flag %in% c(256, 272)))))) 75 | } 76 | 77 | ## ========================================================================== ## 78 | ## BAM files, transcriptome alignment, -p 0.99 79 | bamfilestx_p0.99 <- list.files(txomebamdir_p0.99, 80 | pattern = "_minimap_txome_p0.99.bam$", 81 | recursive = TRUE, full.names = TRUE) 82 | names(bamfilestx_p0.99) <- gsub("_minimap_txome_p0.99.bam", "", 83 | basename(bamfilestx_p0.99)) 84 | bamfilestx_p0.99 85 | txomebams_p0.99 <- lapply(bamfilestx_p0.99, function(f) { 86 | bam <- readBam(f) 87 | tmp <- makeReadDf(bam) 88 | 89 | ## Read transcript lengths 90 | bamheader <- scanBamHeader(f)[[1]]$targets 91 | txlengths <- data.frame(rname = names(bamheader), 92 | txLength = bamheader, 93 | stringsAsFactors = FALSE) 94 | 95 | c(makeSummaryList(bam), 96 | list(allAlignments = tmp %>% dplyr::left_join(txlengths, by = "rname"))) 97 | }) 98 | for (n in names(txomebams_p0.99)) { 99 | txomebams_p0.99[[n]][["allAlignments"]]$sample <- n 100 | } 101 | 102 | saveRDS(list(txomebams_p0.99 = txomebams_p0.99), 103 | file = outrds) 104 | sessionInfo() 105 | date() 106 | -------------------------------------------------------------------------------- /Rscripts/get_polyA_tail_lengths.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | ## Estimate the length of the polyA tail 7 | 8 | print(fast5dir) 9 | print(ncores) 10 | print(outdir) 11 | print(csvfilename) 12 | 13 | suppressPackageStartupMessages({ 14 | library(tailfindr) 15 | }) 16 | 17 | df <- find_tails(fast5_dir = fast5dir, 18 | save_dir = outdir, 19 | csv_filename = csvfilename, 20 | num_cores = ncores) 21 | 22 | saveRDS(df, file = paste0(outdir, "/", csvfilename, ".rds")) 23 | date() 24 | sessionInfo() 25 | 26 | -------------------------------------------------------------------------------- /Rscripts/get_reads_aligning_to_genome_but_not_txome.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | print(nbrreadsrds) 7 | print(outdir) 8 | 9 | ## Extract read names for reads mapping only to genome (not transcriptome) 10 | 11 | rd <- readRDS(nbrreadsrds) 12 | rdg <- rd$genomebams 13 | rdt_p0.99 <- rd$txomebams_p0.99 14 | 15 | stopifnot(names(rdg) %in% names(rdt_p0.99), 16 | names(rdt_p0.99) %in% names(rdg)) 17 | 18 | for (nm in names(rdg)) { 19 | readnames <- setdiff(rdg[[nm]]$allAlignments$read, 20 | rdt_p0.99[[nm]]$allAlignments$read) 21 | write.table(readnames, 22 | file = paste0(outdir, "/", nm, 23 | "_reads_aligning_to_genome_but_not_txome.txt"), 24 | row.names = FALSE, col.names = FALSE, sep = "\t", quote = FALSE) 25 | } 26 | 27 | saveRDS(NULL, file = paste0(outdir, "/reads_aligning_to_genome_but_not_txome.rds")) 28 | date() 29 | sessionInfo() 30 | -------------------------------------------------------------------------------- /Rscripts/gff2gtf.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | print(gff) 7 | print(gtf) 8 | 9 | suppressPackageStartupMessages({ 10 | library(rtracklayer) 11 | library(readr) 12 | library(withr) 13 | }) 14 | 15 | ## Filter out exons that can not be handled by gffcompare 16 | x <- readr::read_tsv(gff, col_names = FALSE, col_types = "cccddcccc") 17 | dim(x) 18 | message("Excluding the following lines:") 19 | x[x$X5 - x$X4 >= 30000, ] 20 | x <- x[x$X5 - x$X4 < 30000, ] 21 | dim(x) 22 | withr::with_options(c(scipen = 100), 23 | write.table(x, file = gsub("\\.gff$", ".fixed.gff", gff), 24 | quote = FALSE, sep = "\t", row.names = FALSE, 25 | col.names = FALSE)) 26 | 27 | x <- rtracklayer::import(gsub("\\.gff$", ".fixed.gff", gff)) 28 | x$transcript_id <- as.character(x$group) 29 | x$group <- NULL 30 | rtracklayer::export(x, gtf) 31 | 32 | date() 33 | sessionInfo() -------------------------------------------------------------------------------- /Rscripts/list_packages.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | routdirs <- strsplit(routdirs, ",")[[1]] 7 | 8 | print(routdirs) 9 | print(outtxt) 10 | 11 | all_packages <- c() 12 | 13 | for (rd in routdirs) { 14 | lf <- list.files(rd, full.names = TRUE) 15 | for (f in lf) { 16 | x <- readLines(f) 17 | idx1 <- which(x == "other attached packages:") 18 | idx2 <- which(x == "loaded via a namespace (and not attached):") 19 | if (length(idx1) != 0 & length(idx2) != 0) { 20 | all_packages <- 21 | unique(c(all_packages, 22 | do.call(c, lapply((idx1 + 1):(idx2 - 2), function(i) { 23 | grep("\\[", setdiff(setdiff(strsplit(x[i], " ")[[1]], " "), ""), 24 | value = TRUE, invert = TRUE) 25 | })))) 26 | } 27 | } 28 | } 29 | write.table(sort(all_packages), file = outtxt, 30 | row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t") -------------------------------------------------------------------------------- /Rscripts/merge_promoter_bed_files.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(rtracklayer) 8 | library(GenomicRanges) 9 | library(GenomeInfoDb) 10 | }) 11 | 12 | bedsin <- strsplit(bedsin, ",")[[1]] 13 | 14 | print(bedsin) 15 | print(bedout) 16 | 17 | ## Read all input bed files and merge into one GRanges object 18 | suppressWarnings(bedall <- do.call(c, lapply(bedsin, function(b) rtracklayer::import(b)))) 19 | 20 | ## Reduce the ranges in the merged GRanges object 21 | bedrr <- GenomicRanges::reduce(bedall) 22 | 23 | ## Change the seqlevels style to Ensembl 24 | GenomeInfoDb::seqlevelsStyle(bedrr) <- "Ensembl" 25 | 26 | ## Write reduced promoter regions to bed file 27 | rtracklayer::export(bedrr, con = bedout, format = "bed") 28 | 29 | date() 30 | sessionInfo() -------------------------------------------------------------------------------- /Rscripts/plot_abundances.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(ggplot2) 8 | library(tibble) 9 | library(dplyr) 10 | library(tidyr) 11 | library(GGally) 12 | library(rmarkdown) 13 | }) 14 | 15 | print(abundances) 16 | print(rmdtemplate) 17 | print(outhtml) 18 | 19 | abundances <- readRDS(abundances) 20 | 21 | plot_abundance_report <- function(abundances, output_file, output_dir = "./", 22 | output_format = "html_document", 23 | rmd_template = NULL, 24 | knitr_progress = FALSE, ...){ 25 | output_report <- file.path(output_dir, basename(output_file)) 26 | output_rmd <- file.path(output_dir, 27 | paste0(tools::file_path_sans_ext(basename(output_file)), 28 | ".Rmd")) 29 | template_file <- rmd_template 30 | file.copy(from = template_file, to = output_rmd, overwrite = TRUE) 31 | 32 | args <- list(...) 33 | args$input <- output_rmd 34 | args$output_format <- output_format 35 | args$output_file <- output_file 36 | args$quiet <- !knitr_progress 37 | 38 | output_file <- do.call("render", args = args) 39 | invisible(output_file) 40 | } 41 | 42 | plot_abundance_report(abundances = abundances, 43 | output_file = basename(outhtml), 44 | output_dir = dirname(outhtml), 45 | rmd_template = rmdtemplate) 46 | 47 | sessionInfo() 48 | date() 49 | -------------------------------------------------------------------------------- /Rscripts/plot_abundances_template.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "" 3 | author: "" 4 | date: "" 5 | output: html_document 6 | editor_options: 7 | chunk_output_type: console 8 | --- 9 | 10 | # Correlation among replicates 11 | 12 | ```{r} 13 | suppressPackageStartupMessages({ 14 | library(ggplot2) 15 | library(tibble) 16 | library(dplyr) 17 | library(tidyr) 18 | library(GGally) 19 | }) 20 | ``` 21 | 22 | ## Transcript-level 23 | 24 | ```{r txlevel, fig.width = 8, fig.height = 8} 25 | txab <- abundances$tx_abundances 26 | 27 | quantmethods <- unique(paste0( 28 | sapply(strsplit(colnames(txab), "__"), .subset, 2), "__", 29 | sapply(strsplit(colnames(txab), "__"), .subset, 3) 30 | )) 31 | 32 | txsum <- lapply(quantmethods, function(qm) { 33 | tmp <- txab[, grep(paste0(qm, "$"), colnames(txab))] 34 | colnames(tmp) <- gsub(paste0("__", qm), "", colnames(tmp)) 35 | print(ggpairs(tmp, 36 | lower = list(continuous = wrap("points", alpha = 0.3, size = 0.25))) + 37 | theme_bw() + ggtitle(qm)) 38 | print(ggpairs(log10(tmp + 1), 39 | lower = list(continuous = wrap("points", alpha = 0.3, size = 0.25))) + 40 | theme_bw() + ggtitle(paste0(qm, ", log10(. + 1)"))) 41 | cor_srpk <- cor(tmp[, grep("srpk|sprk|Srpk|Sprk", colnames(tmp)), drop = FALSE]) 42 | cor_wt <- cor(tmp[, grep("wt|WT", colnames(tmp)), drop = FALSE]) 43 | cor_srpk_log <- cor(log10(tmp[, grep("srpk|sprk|Srpk|Sprk", colnames(tmp)), drop = FALSE] + 1)) 44 | cor_wt_log <- cor(log10(tmp[, grep("wt|WT", colnames(tmp)), drop = FALSE] + 1)) 45 | data.frame(method = qm, 46 | cor_srpk_lin = mean(cor_srpk[upper.tri(cor_srpk)]), 47 | cor_wt_lin = mean(cor_wt[upper.tri(cor_wt)]), 48 | cor_srpk_log = mean(cor_srpk_log[upper.tri(cor_srpk_log)]), 49 | cor_wt_log = mean(cor_wt_log[upper.tri(cor_wt_log)]), 50 | stringsAsFactors = FALSE) 51 | }) 52 | txsum <- do.call(rbind, txsum) %>% 53 | tidyr::gather(group, correlation, -method) %>% 54 | tidyr::separate(group, into = c("cor", "group", "transformation")) 55 | ``` 56 | 57 | ```{r txlevelsum, fig.width = 8, fig.height = 8} 58 | levels <- txsum %>% dplyr::group_by(method) %>% 59 | dplyr::summarize(meancorr = mean(correlation)) %>% 60 | dplyr::arrange(desc(meancorr)) %>% dplyr::pull(method) 61 | ggplot(txsum %>% dplyr::mutate(method = factor(method, levels = levels)), 62 | aes(x = method, y = correlation, fill = method)) + 63 | geom_bar(stat = "identity") + theme_bw() + facet_grid(transformation ~ group) + 64 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 65 | legend.position = "none") + 66 | xlab("") 67 | ``` 68 | 69 | ## Gene-level 70 | 71 | ```{r genelevel, fig.width = 8, fig.height = 8} 72 | geneab <- abundances$gene_abundances 73 | 74 | quantmethods <- unique(paste0( 75 | sapply(strsplit(colnames(geneab), "__"), .subset, 2), "__", 76 | sapply(strsplit(colnames(geneab), "__"), .subset, 3) 77 | )) 78 | 79 | genesum <- lapply(quantmethods, function(qm) { 80 | tmp <- geneab[, grep(paste0(qm, "$"), colnames(geneab))] 81 | colnames(tmp) <- gsub(paste0("__", qm), "", colnames(tmp)) 82 | print(ggpairs(tmp, 83 | lower = list(continuous = wrap("points", alpha = 0.3, size = 0.25))) + 84 | theme_bw() + ggtitle(qm)) 85 | print(ggpairs(log10(tmp + 1), 86 | lower = list(continuous = wrap("points", alpha = 0.3, size = 0.25))) + 87 | theme_bw() + ggtitle(paste0(qm, ", log10(. + 1)"))) 88 | cor_srpk <- cor(tmp[, grep("srpk|sprk|Srpk|Sprk", colnames(tmp)), drop = FALSE]) 89 | cor_wt <- cor(tmp[, grep("wt|WT", colnames(tmp)), drop = FALSE]) 90 | cor_srpk_log <- cor(log10(tmp[, grep("srpk|sprk|Srpk|Sprk", colnames(tmp)), drop = FALSE] + 1)) 91 | cor_wt_log <- cor(log10(tmp[, grep("wt|WT", colnames(tmp)), drop = FALSE] + 1)) 92 | data.frame(method = qm, 93 | cor_srpk_lin = mean(cor_srpk[upper.tri(cor_srpk)]), 94 | cor_wt_lin = mean(cor_wt[upper.tri(cor_wt)]), 95 | cor_srpk_log = mean(cor_srpk_log[upper.tri(cor_srpk_log)]), 96 | cor_wt_log = mean(cor_wt_log[upper.tri(cor_wt_log)]), 97 | stringsAsFactors = FALSE) 98 | }) 99 | genesum <- do.call(rbind, genesum) %>% 100 | tidyr::gather(group, correlation, -method) %>% 101 | tidyr::separate(group, into = c("cor", "group", "transformation")) 102 | ``` 103 | 104 | ```{r genelevelsum, fig.width = 8, fig.height = 8} 105 | levels <- genesum %>% dplyr::group_by(method) %>% 106 | dplyr::summarize(meancorr = mean(correlation)) %>% 107 | dplyr::arrange(desc(meancorr)) %>% dplyr::pull(method) 108 | ggplot(genesum %>% dplyr::mutate(method = factor(method, levels = levels)), 109 | aes(x = method, y = correlation, fill = method)) + 110 | geom_bar(stat = "identity") + theme_bw() + facet_grid(transformation ~ group) + 111 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 112 | legend.position = "none") + 113 | xlab("") 114 | ``` 115 | 116 | ## Session info 117 | 118 | ```{r} 119 | date() 120 | sessionInfo() 121 | ``` 122 | 123 | -------------------------------------------------------------------------------- /Rscripts/plot_compare_flair_annotation.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | print(annotatedGtf) 7 | print(flairGtf) 8 | print(dataSet) 9 | print(outDir) 10 | print(seed) 11 | print(nPlots) 12 | print(flairTmap) ## gffcompare classification 13 | print(sqanticlass) ## SQANTI classification 14 | print(outrds) 15 | 16 | suppressPackageStartupMessages({ 17 | library(S4Vectors) 18 | library(rtracklayer) 19 | library(Gviz) 20 | library(dplyr) 21 | }) 22 | 23 | ## Read gtf files 24 | annotated <- rtracklayer::import(annotatedGtf) 25 | idx <- match(c("transcript_id", "gene_id", "exon_id"), 26 | colnames(mcols(annotated))) 27 | colnames(mcols(annotated))[idx] <- c("transcript", "gene", "exon") 28 | S4Vectors::mcols(annotated)$symbol <- mcols(annotated)$transcript 29 | annotated <- subset(annotated, type == "exon") 30 | 31 | flair <- rtracklayer::import(flairGtf) 32 | colnames(mcols(flair))[match("transcript_id", colnames(mcols(flair)))] <- 33 | "transcript" 34 | S4Vectors::mcols(flair)$symbol <- mcols(flair)$transcript 35 | flair <- subset(flair, type == "exon") 36 | 37 | ## Read tmap file to sample transcript pairs from and sample nPlots pairs 38 | ## Also read SQANTI classification info to add on top 39 | tmap <- read.delim(flairTmap, header = TRUE, as.is = TRUE) 40 | if (sqanticlass != "") { 41 | sqanti <- read.delim(sqanticlass, header = TRUE, as.is = TRUE) 42 | } 43 | if (sqanticlass != "") { 44 | tmap <- tmap %>% dplyr::left_join( 45 | sqanti %>% dplyr::select(isoform, associated_transcript, structural_category), 46 | by = c("qry_id" = "isoform") 47 | ) %>% 48 | dplyr::filter(ref_id == associated_transcript | 49 | !(class_code %in% c("c", "="))) 50 | } else { 51 | tmap$structural_category <- "" 52 | } 53 | set.seed(seed) 54 | tmap <- tmap[sample(seq_len(nrow(tmap)), size = nPlots, replace = FALSE), ] 55 | 56 | ## Define help function 57 | compareFlairAnnotatedTx <- function(dataSet, annotatedGtf, flairGtf, 58 | annotatedTx, flairTx, classCode, 59 | structCat, outDir) { 60 | options(ucscChromosomeNames = FALSE) 61 | 62 | ## Subset gtf files to selected transcripts 63 | atmp <- subset(annotatedGtf, transcript == annotatedTx) 64 | ftmp <- subset(flairGtf, transcript == flairTx) 65 | 66 | ## Get width of each transcript 67 | awd <- sum(width(atmp)) 68 | fwd <- sum(width(ftmp)) 69 | 70 | show_chr <- unique(c(seqnames(atmp), seqnames(ftmp))) 71 | stopifnot(length(show_chr) == 1) 72 | 73 | min_coord <- min( 74 | min(start(atmp)) - 0.2*(max(end(atmp)) - min(start(atmp))), 75 | min(start(ftmp)) - 0.2*(max(end(ftmp)) - min(start(ftmp))) 76 | ) 77 | max_coord <- max( 78 | max(end(atmp)) + 0.05*(max(end(atmp)) - min(start(atmp))), 79 | max(end(ftmp)) + 0.05*(max(end(ftmp)) - min(start(ftmp))) 80 | ) 81 | 82 | agrtr <- Gviz::GeneRegionTrack(atmp, showId = FALSE, col = NULL, 83 | fill = "blue", name = "annotated", 84 | background.title = "transparent", 85 | col.title = "black", min.height = 15) 86 | fgrtr <- Gviz::GeneRegionTrack(ftmp, showId = FALSE, col = NULL, 87 | fill = "orange", name = "flair", 88 | background.title = "transparent", 89 | col.title = "black", min.height = 15) 90 | 91 | gtr <- Gviz::GenomeAxisTrack() 92 | 93 | tracks <- c(gtr, agrtr, fgrtr) 94 | 95 | pdf(paste0(outDir, "/", dataSet, "__flair_comparison__", 96 | gsub(":", "_", annotatedTx), "__", 97 | gsub(":", "_", flairTx), "__", classCode, "__", 98 | structCat, ".pdf"), 99 | height = 3, width = 10) 100 | Gviz::plotTracks(tracks, chromosome = show_chr, 101 | from = min_coord, to = max_coord, 102 | main = paste0("annotated: ", annotatedTx, " (", awd, 103 | " nt); flair: ", flairTx, " (", fwd, 104 | " nt); class code: ", classCode, 105 | "; structural category: ", structCat), 106 | cex.main = 0.7, 107 | min.width = 0, min.distance = 0, collapse = FALSE) 108 | dev.off() 109 | } 110 | 111 | options(ucscChromosomeNames = FALSE) 112 | for (i in seq_len(nrow(tmap))) { 113 | compareFlairAnnotatedTx(dataSet = dataSet, annotatedGtf = annotated, 114 | flairGtf = flair, 115 | annotatedTx = tmap$ref_id[i], 116 | flairTx = tmap$qry_id[i], 117 | classCode = tmap$class_code[i], 118 | structCat = tmap$structural_category[i], 119 | outDir = outDir) 120 | } 121 | 122 | saveRDS(tmap, file = outrds) 123 | date() 124 | sessionInfo() 125 | -------------------------------------------------------------------------------- /Rscripts/plot_distance_primary_supplementary_alignments.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | ## Find the distance between supplementary alignments and the corresponding 7 | ## primary alignment 8 | 9 | print(genomebamdir) 10 | print(outrds) 11 | 12 | suppressPackageStartupMessages({ 13 | library(GenomicRanges) 14 | library(GenomicAlignments) 15 | library(dplyr) 16 | library(ggplot2) 17 | }) 18 | 19 | bamfiles <- list.files(genomebamdir, pattern = "_minimap_genome_s.bam$", 20 | recursive = TRUE, full.names = TRUE) 21 | names(bamfiles) <- gsub("_minimap_genome_s.bam", "", basename(bamfiles)) 22 | bamfiles 23 | 24 | primary_supplementary_distances <- lapply(bamfiles, function(bam) { 25 | bam <- readGAlignments(bam, use.names = TRUE, 26 | param = ScanBamParam(tag=c("NM"), 27 | what=c("qname","flag", "rname", 28 | "pos", "mapq"))) 29 | bam <- subset(bam, flag %in% c(0, 16, 2048, 2064)) 30 | 31 | ops <- GenomicAlignments::CIGAR_OPS 32 | wdths <- GenomicAlignments::explodeCigarOpLengths(cigar(bam), ops = ops) 33 | keep.ops <- GenomicAlignments::explodeCigarOps(cigar(bam), ops = ops) 34 | explodedcigars <- IRanges::CharacterList(relist(paste0(unlist(wdths), 35 | unlist(keep.ops)), wdths)) 36 | for (opts in setdiff(GenomicAlignments::CIGAR_OPS, "=")) { 37 | mcols(bam)[[paste0("nbr", opts)]] <- 38 | sapply(explodedcigars, function(cg) sum(as.numeric(gsub(paste0(opts, "$"), "", cg)), na.rm = TRUE)) 39 | } 40 | mcols(bam)$readLength <- rowSums(as.matrix(mcols(bam)[, c("nbrS", "nbrH", "nbrM", "nbrI")])) 41 | 42 | supplementary_alignments <- subset(bam, flag %in% c(2048, 2064)) 43 | primary_alignments <- subset(bam, flag %in% c(0, 16)) 44 | primary_alignments <- primary_alignments[names(primary_alignments) %in% 45 | names(supplementary_alignments)] 46 | 47 | do.call(rbind, lapply(seq_along(supplementary_alignments), function(i) { 48 | nm <- names(supplementary_alignments)[i] 49 | primary <- as(primary_alignments[nm], "GRanges") 50 | supplementary <- as(supplementary_alignments[i], "GRanges") 51 | if (all(seqnames(primary) == seqnames(supplementary))) { 52 | ## Same chromosome 53 | if (all(strand(primary) == strand(supplementary))) { 54 | strands <- "same chromosome, same strand" 55 | } else { 56 | strands <- "same chromosome, different strand" 57 | } 58 | distn <- GenomicRanges::distance(primary, supplementary, ignore.strand = TRUE) 59 | ovlap <- sum(width(GenomicRanges::intersect(as(primary_alignments[nm], "GRangesList"), 60 | as(supplementary_alignments[i], "GRangesList"), 61 | ignore.strand = TRUE))) 62 | } else { 63 | strands <- "different chromosomes" 64 | distn <- NA 65 | ovlap <- NA 66 | } 67 | nbrMPrimary <- mcols(primary)[["nbrM"]] 68 | nbrMSupplementary <- mcols(supplementary)[["nbrM"]] 69 | nbrMSuppVsPrim <- nbrMSupplementary/nbrMPrimary 70 | data.frame(read = nm, strands = strands, distn = distn, ovlap = ovlap, 71 | nbrMPrimary = nbrMPrimary, nbrMSupplementary = nbrMSupplementary, 72 | nbrMSuppVsPrim = nbrMSuppVsPrim, stringsAsFactors = FALSE) 73 | })) 74 | }) 75 | 76 | for (nm in names(primary_supplementary_distances)) { 77 | primary_supplementary_distances[[nm]]$sample <- nm 78 | } 79 | 80 | primary_supplementary_distances <- do.call(rbind, primary_supplementary_distances) 81 | 82 | pdf(gsub("rds$", "pdf", outrds), width = 8, height = 7) 83 | ggplot(primary_supplementary_distances %>% 84 | dplyr::mutate(dist0 = "") %>% 85 | dplyr::mutate(dist0 = replace(dist0, distn == 0, ", distance = 0"), 86 | dist0 = replace(dist0, distn > 0, ", distance > 0")) %>% 87 | dplyr::group_by(sample, strands, dist0) %>% dplyr::tally() %>% 88 | dplyr::mutate(strands_dist0 = paste0(strands, dist0)), 89 | aes(x = sample, y = n, fill = strands_dist0)) + 90 | theme_bw() + geom_bar(stat = "identity") + 91 | theme(legend.position = "bottom", 92 | axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 93 | ylab("Number of primary/supplementary alignment pairs") + xlab("") + 94 | guides(fill = guide_legend(nrow = 5, byrow = TRUE)) + 95 | scale_fill_manual(values = c("#DC050C", "#1965B0", "#4EB265", "#777777", "#B17BA6"), 96 | name = "") 97 | dev.off() 98 | 99 | saveRDS(primary_supplementary_distances, file = outrds) 100 | date() 101 | sessionInfo() 102 | 103 | 104 | -------------------------------------------------------------------------------- /Rscripts/plot_eq_class_counts.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | ## Plot number of reads per equivalence class, as function of number of 7 | ## transcripts per equivalence class 8 | 9 | suppressPackageStartupMessages({ 10 | library(dplyr) 11 | library(ggplot2) 12 | }) 13 | 14 | print(salmondir) 15 | print(outrds) 16 | 17 | salmondirs <- list.files(salmondir, full.names = TRUE) 18 | salmonfiles <- paste0(salmondirs, "/aux_info/eq_classes.txt") 19 | names(salmonfiles) <- basename(salmondirs) 20 | salmonfiles <- salmonfiles[file.exists(salmonfiles)] 21 | salmonfiles 22 | 23 | eqcl <- lapply(salmonfiles, function(f) { 24 | x <- readLines(f) 25 | n_tr <- as.numeric(x[1]) ## Total number of transcripts 26 | n_eq <- as.numeric(x[2]) ## Total number of equivalence classes 27 | tx_id <- x[3:(n_tr + 2)] ## Transcript IDs 28 | quants <- x[(n_tr + 3):length(x)] ## Characteristics of equivalence classes 29 | 30 | ## Split equivalence class characteristics. Each element of the list corresponds 31 | ## to one equivalence class, and lists its number of transcripts, the 32 | ## transcripts IDs and the total number of reads 33 | do.call(rbind, lapply(quants, function(w) { 34 | tmp = strsplit(w, "\\\t")[[1]] 35 | nbr_tx = as.numeric(tmp[1]) 36 | data.frame(nbr_tx = nbr_tx, 37 | tx_ids = paste0(tx_id[as.numeric(tmp[2:(1 + nbr_tx)]) + 1], collapse = ","), 38 | count = as.numeric(tmp[length(tmp)]), 39 | stringsAsFactors = FALSE) 40 | })) 41 | }) 42 | 43 | for (nm in names(eqcl)) { 44 | eqcl[[nm]]$sample <- nm 45 | } 46 | 47 | eqcl <- do.call(rbind, eqcl) 48 | 49 | pdf(gsub("rds$", "pdf", outrds)) 50 | ggplot(eqcl, aes(x = nbr_tx, y = count)) + geom_point(size = 0.5, alpha = 0.25) + 51 | facet_wrap(~ sample) + theme_bw() + scale_y_log10() + 52 | xlab("Number of transcripts in equivalence class") + 53 | ylab("Number of reads assigned to equivalence class") 54 | dev.off() 55 | 56 | saveRDS(NULL, file = outrds) 57 | 58 | sessionInfo() -------------------------------------------------------------------------------- /Rscripts/plot_qc.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | print(bamfile) 7 | print(fastqfile) 8 | print(outrds) 9 | 10 | suppressPackageStartupMessages({ 11 | library(GenomicAlignments) 12 | library(data.table) 13 | library(ggplot2) 14 | }) 15 | 16 | sbp <- ScanBamParam(tag = "NM") 17 | 18 | ## Read bam file and corresponding fastq file 19 | bam <- readGAlignments(bamfile, use.names = TRUE, param = sbp) 20 | fastq <- fread(paste0("gunzip -c ", fastqfile), sep = "\n", header = FALSE) 21 | 22 | ## Split rows in fastq file 23 | id_idxs <- seq(1, nrow(fastq), by = 4) ## read ids 24 | q_idxs <- seq(4, nrow(fastq), by = 4) ## quality scores 25 | seq_idxs <- seq(2, nrow(fastq), by = 4) ## sequence 26 | 27 | ## Calculate average quality per read 28 | mean_q <- sapply(fastq$V1[q_idxs], function(u) mean(as.numeric(charToRaw(u)) - 33)) 29 | names(mean_q) <- gsub("@", "", sapply(strsplit(fastq$V1[id_idxs]," "), .subset, 1)) 30 | 31 | ## Get read lengths 32 | seqlen <- nchar(fastq$V1[seq_idxs]) 33 | 34 | ## Number of soft clipped bases 35 | clip_length <- sapply(explodeCigarOpLengths(cigar(bam), ops = "S"), sum) 36 | 37 | pdf(gsub("rds$", "pdf", outrds), w = 10, h = 10) 38 | ## mapped length vs number of mismatches 39 | smoothScatter(log10(qwidth(bam) - clip_length), log10(mcols(bam)$NM), 40 | xlab = "mapped length (log10)", ylab = "number of mismatches (log10)") 41 | 42 | ## percent mismatches 43 | pct_mm <- mcols(bam)$NM/(qwidth(bam) - clip_length) 44 | hist(pct_mm[pct_mm >= 0 & pct_mm <= 1], 500, main = "percent mismatches") 45 | 46 | ## read length 47 | hist(log10(seqlen), 500, main = "read length (log10)") 48 | 49 | ## mapped length 50 | hist(log10(qwidth(bam) - clip_length), 500, main = "mapped length (log10)") 51 | 52 | ## read length vs average quality score 53 | smoothScatter(log10(seqlen), mean_q, xlab = "read length (log10)", 54 | ylab = "average quality score") 55 | 56 | df <- data.frame(sequence_length = seqlen, mean_quality = mean_q) 57 | 58 | p <- ggplot(df) + geom_hex(aes(sequence_length, mean_quality), bins = 100) + 59 | scale_x_log10() + 60 | scale_fill_gradientn("", colours = rev(rainbow(10, end = 4/6))) + 61 | theme(text = element_text(size=20)) 62 | print(p) 63 | 64 | ## average quality score 65 | hist(mean_q, 100, main = "average quality score") 66 | 67 | ## average quality score vs percent mismatches 68 | m <- match(names(bam), names(mean_q)) 69 | smoothScatter(mean_q[m], pct_mm, pch = ".", ylim = c(0, 0.4), 70 | xlab = "average quality score", 71 | ylab = "percent mismatches") 72 | 73 | df <- data.frame(mean_quality = mean_q[m], percent_mismatches=pct_mm*100) 74 | 75 | p <- ggplot(df) + geom_hex(aes(mean_quality, percent_mismatches), bins = 100) + 76 | scale_x_log10() + 77 | scale_fill_gradientn("", colours = rev(rainbow(10, end = 4/6))) + 78 | theme(text = element_text(size=20)) 79 | print(p) 80 | 81 | dev.off() 82 | 83 | saveRDS(NULL, outrds) 84 | sessionInfo() 85 | date() 86 | -------------------------------------------------------------------------------- /Rscripts/summarize_abundances.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | ## Summarize abundances on transcript- and gene-level 7 | 8 | suppressPackageStartupMessages({ 9 | library(tximport) 10 | library(dplyr) 11 | }) 12 | 13 | print(topdir) 14 | print(tx2gene) 15 | print(outrds) 16 | 17 | get_file_listing <- function(topdir, subdir, filename) { 18 | files <- list.files(paste0(topdir, "/", subdir), full.names = TRUE) 19 | nm <- basename(files) 20 | files <- paste0(files, "/", filename) 21 | names(files) <- nm 22 | files 23 | } 24 | 25 | import_salmon_files <- function(files, name, tx2gene) { 26 | txi <- tximport(files, type = "salmon", txOut = TRUE) 27 | txig <- summarizeToGene(txi, tx2gene = tx2gene[, c("tx", "gene")]) 28 | txab <- 29 | dplyr::full_join( 30 | txi$counts %>% as.data.frame() %>% 31 | setNames(paste0(names(.), "__count__", name)) %>% 32 | tibble::rownames_to_column("tx"), 33 | txi$abundance %>% as.data.frame() %>% 34 | setNames(paste0(names(.), "__tpm__", name)) %>% 35 | tibble::rownames_to_column("tx"), 36 | by = "tx" 37 | ) 38 | geneab <- 39 | dplyr::full_join( 40 | txig$counts %>% as.data.frame() %>% 41 | setNames(paste0(names(.), "__count__", name)) %>% 42 | tibble::rownames_to_column("gene"), 43 | txig$abundance %>% as.data.frame() %>% 44 | setNames(paste0(names(.), "__tpm__", name)) %>% 45 | tibble::rownames_to_column("gene"), 46 | by = "gene" 47 | ) 48 | list(txab = txab, geneab = geneab) 49 | } 50 | 51 | ## Read tx2gene 52 | tx2gene <- readRDS(tx2gene) 53 | 54 | ## Initialize gene and transcript abundance lists 55 | gene_abundances <- list() 56 | tx_abundances <- list() 57 | 58 | ## Parse the subdirectories of topdir to see which abundances are available 59 | (dirs <- list.files(topdir, full.names = FALSE)) 60 | 61 | ## Salmon31 62 | if ("salmon31" %in% dirs) { 63 | message("Salmon31") 64 | files <- get_file_listing(topdir, "salmon31", "quant.sf") 65 | print(files) 66 | abds <- import_salmon_files(files, "salmon31", tx2gene) 67 | tx_abundances[["salmon31"]] <- abds$txab 68 | gene_abundances[["salmon31"]] <- abds$geneab 69 | } 70 | 71 | ## Salmonminimap2 72 | if ("salmonminimap2" %in% dirs) { 73 | message("salmonminimap2") 74 | files <- get_file_listing(topdir, "salmonminimap2", "quant.sf") 75 | print(files) 76 | abds <- import_salmon_files(files, "salmonminimap2", tx2gene) 77 | tx_abundances[["salmonminimap2"]] <- abds$txab 78 | gene_abundances[["salmonminimap2"]] <- abds$geneab 79 | } 80 | 81 | ## Salmonminimap2_p0.99 82 | if ("salmonminimap2_p0.99" %in% dirs) { 83 | message("salmonminimap2_p0.99") 84 | files <- get_file_listing(topdir, "salmonminimap2_p0.99", "quant.sf") 85 | print(files) 86 | abds <- import_salmon_files(files, "salmonminimap2_p0.99", tx2gene) 87 | tx_abundances[["salmonminimap2_p0.99"]] <- abds$txab 88 | gene_abundances[["salmonminimap2_p0.99"]] <- abds$geneab 89 | } 90 | 91 | ## wub 92 | if ("wubminimap2" %in% dirs) { 93 | message("wub") 94 | files <- get_file_listing(topdir, "wubminimap2", "bam_count_reads.tsv") 95 | print(files) 96 | tx_abundances[["wubminimap2"]] <- 97 | Reduce(function(...) dplyr::full_join(..., by = "tx"), 98 | lapply(names(files), function(nm) { 99 | read.delim(files[nm], header = TRUE, as.is = TRUE) %>% 100 | dplyr::select(Reference, Count) %>% 101 | setNames(c("tx", paste0(nm, "__count__wubminimap2"))) 102 | }) 103 | ) 104 | gene_abundances[["wubminimap2"]] <- 105 | dplyr::left_join(tx_abundances[["wubminimap2"]], 106 | tx2gene[, c("tx", "gene")]) %>% 107 | dplyr::select(-tx) %>% 108 | dplyr::group_by(gene) %>% dplyr::summarise_all(funs(sum)) %>% 109 | as.data.frame() 110 | } 111 | 112 | ## featureCounts (primary) 113 | if ("featurecountsminimap2_primary" %in% dirs) { 114 | message("featureCounts (primary)") 115 | files <- get_file_listing(topdir, "featurecountsminimap2_primary", "featurecountsminimap2.txt") 116 | print(files) 117 | gene_abundances[["featurecountsminimap2primary"]] <- 118 | Reduce(function(...) dplyr::full_join(..., by = "gene"), 119 | lapply(names(files), function(nm) { 120 | read.delim(files[nm], skip = 1, header = TRUE, as.is = TRUE) %>% 121 | dplyr::select(-Chr, -Start, -End, -Strand, -Length) %>% 122 | setNames(c("gene", paste0(nm, "__count__featurecountsminimap2primary"))) 123 | }) 124 | ) 125 | } 126 | 127 | ## Remove version numbers from all genes and transcripts 128 | tx_abundances <- lapply(tx_abundances, function(l) { 129 | l %>% dplyr::mutate(tx = gsub("\\.[0-9]+$", "", tx)) 130 | }) 131 | gene_abundances <- lapply(gene_abundances, function(l) { 132 | l %>% dplyr::mutate(gene = gsub("\\.[0-9]+$", "", gene)) 133 | }) 134 | 135 | ## Merge all abundance estimates 136 | tx_abundances <- Reduce(function(...) dplyr::full_join(..., by = "tx"), 137 | tx_abundances) 138 | gene_abundances <- Reduce(function(...) dplyr::full_join(..., by = "gene"), 139 | gene_abundances) 140 | 141 | ## Set transcript/gene names as row names and replace NAs with zeros 142 | rownames(tx_abundances) <- tx_abundances$tx 143 | tx_abundances$tx <- NULL 144 | rownames(gene_abundances) <- gene_abundances$gene 145 | gene_abundances$gene <- NULL 146 | 147 | tx_abundances[is.na(tx_abundances)] <- 0 148 | gene_abundances[is.na(gene_abundances)] <- 0 149 | 150 | saveRDS(list(tx_abundances = tx_abundances, 151 | gene_abundances = gene_abundances), 152 | file = outrds) 153 | 154 | date() 155 | sessionInfo() 156 | 157 | -------------------------------------------------------------------------------- /Rscripts/summarize_abundances_illumina.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | ## Summarize abundances on transcript- and gene-level 7 | 8 | suppressPackageStartupMessages({ 9 | library(tximport) 10 | library(dplyr) 11 | library(rtracklayer) 12 | }) 13 | 14 | print(topdir) 15 | print(tx2gene) 16 | print(outrds) 17 | 18 | get_file_listing <- function(topdir, subdir, filename) { 19 | files <- list.files(paste0(topdir, "/", subdir), full.names = TRUE) 20 | nm <- basename(files) 21 | files <- paste0(files, "/", filename) 22 | names(files) <- nm 23 | files 24 | } 25 | 26 | import_salmon_files <- function(files, name, tx2gene) { 27 | txi <- tximport(files, type = "salmon", txOut = TRUE) 28 | txig <- summarizeToGene(txi, tx2gene = tx2gene[, c("tx", "gene")]) 29 | txab <- 30 | dplyr::full_join( 31 | txi$counts %>% as.data.frame() %>% 32 | setNames(paste0(names(.), "__count__", name)) %>% 33 | tibble::rownames_to_column("tx"), 34 | txi$abundance %>% as.data.frame() %>% 35 | setNames(paste0(names(.), "__tpm__", name)) %>% 36 | tibble::rownames_to_column("tx"), 37 | by = "tx" 38 | ) 39 | geneab <- 40 | dplyr::full_join( 41 | txig$counts %>% as.data.frame() %>% 42 | setNames(paste0(names(.), "__count__", name)) %>% 43 | tibble::rownames_to_column("gene"), 44 | txig$abundance %>% as.data.frame() %>% 45 | setNames(paste0(names(.), "__tpm__", name)) %>% 46 | tibble::rownames_to_column("gene"), 47 | by = "gene" 48 | ) 49 | list(txab = txab, geneab = geneab) 50 | } 51 | 52 | ## Read tx2gene 53 | tx2gene <- readRDS(tx2gene) 54 | 55 | ## Initialize gene and transcript abundance lists 56 | gene_abundances <- list() 57 | tx_abundances <- list() 58 | 59 | ## Parse the subdirectories of topdir to see which abundances are available 60 | (dirs <- list.files(topdir, full.names = FALSE)) 61 | 62 | ## Salmon 63 | if ("salmon31" %in% dirs) { 64 | message("Salmon") 65 | (files <- get_file_listing(topdir, "salmon31", "quant.sf")) 66 | abds <- import_salmon_files(files, "salmon", tx2gene) 67 | tx_abundances[["salmon"]] <- abds$txab 68 | gene_abundances[["salmon"]] <- abds$geneab 69 | } 70 | 71 | ## StringTie 72 | if ("stringtie" %in% dirs) { 73 | message("StringTie") 74 | files <- list.files(paste0(topdir, "/stringtie"), full.names = TRUE) 75 | nm <- basename(files) 76 | files <- paste0(files, "/", nm, "_stringtie.gtf") 77 | names(files) <- nm 78 | print(files) 79 | txi <- 80 | Reduce(function(...) dplyr::full_join(..., by = c("tx", "gene")), 81 | lapply(names(files), function(nm) { 82 | message(nm) 83 | f <- rtracklayer::import(files[nm]) 84 | f <- subset(f, type == "transcript") 85 | data.frame(tx = f$transcript_id, 86 | gene = f$gene_id, 87 | tpm = as.numeric(as.character(f$TPM)), 88 | stringsAsFactors = FALSE) %>% 89 | setNames(c("tx", "gene", paste0(nm, "__tpm__StringTie"))) 90 | })) 91 | tx_abundances[["stringtie"]] <- txi %>% dplyr::select(-gene) 92 | gene_abundances[["stringtie"]] <- txi %>% dplyr::select(-tx) %>% 93 | dplyr::group_by(gene) %>% dplyr::summarize_all(funs(sum)) %>% as.data.frame() 94 | } 95 | 96 | ## Remove version numbers from all genes and transcripts 97 | tx_abundances <- lapply(tx_abundances, function(l) { 98 | l %>% dplyr::mutate(tx = gsub("\\.[0-9]+$", "", tx)) 99 | }) 100 | gene_abundances <- lapply(gene_abundances, function(l) { 101 | l %>% dplyr::mutate(gene = gsub("\\.[0-9]+$", "", gene)) 102 | }) 103 | 104 | ## Merge all abundance estimates 105 | tx_abundances <- Reduce(function(...) dplyr::full_join(..., by = "tx"), 106 | tx_abundances) 107 | gene_abundances <- Reduce(function(...) dplyr::full_join(..., by = "gene"), 108 | gene_abundances) 109 | 110 | ## Set transcript/gene names as row names and replace NAs with zeros 111 | rownames(tx_abundances) <- tx_abundances$tx 112 | tx_abundances$tx <- NULL 113 | rownames(gene_abundances) <- gene_abundances$gene 114 | gene_abundances$gene <- NULL 115 | 116 | tx_abundances[is.na(tx_abundances)] <- 0 117 | gene_abundances[is.na(gene_abundances)] <- 0 118 | 119 | saveRDS(list(tx_abundances = tx_abundances, 120 | gene_abundances = gene_abundances), 121 | file = outrds) 122 | 123 | date() 124 | sessionInfo() 125 | 126 | -------------------------------------------------------------------------------- /Rscripts/summarize_star_sjdbs.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | sjdbfiles <- strsplit(sjdbfiles, ",")[[1]] 7 | 8 | print(sjdbfiles) 9 | print(min_unique) 10 | print(outtxt) 11 | 12 | suppressPackageStartupMessages({ 13 | library(dplyr) 14 | }) 15 | 16 | x <- do.call(dplyr::bind_rows, lapply(sjdbfiles, function(f) { 17 | read.delim(f, header = FALSE, as.is = TRUE) %>% 18 | setNames(c("chromosome", "start", "end", "strand", "motif", 19 | "annotated", "nbr_unique", "nbr_mm", "max_overhang")) 20 | })) %>% 21 | dplyr::group_by(chromosome, start, end, strand) %>% 22 | dplyr::summarize(motif = motif[1], 23 | annotated = annotated[1], 24 | nbr_unique = sum(nbr_unique), 25 | nbr_mm = sum(nbr_mm), 26 | max_overhang = max(max_overhang)) %>% 27 | dplyr::filter(nbr_unique >= min_unique) %>% 28 | dplyr::select(chromosome, start, end, strand, motif, 29 | annotated, nbr_unique, nbr_mm, max_overhang) 30 | 31 | write.table(x, file = outtxt, col.names = FALSE, row.names = FALSE, 32 | quote = FALSE, sep = "\t") 33 | 34 | date() 35 | sessionInfo() 36 | -------------------------------------------------------------------------------- /datasets_to_include.mk: -------------------------------------------------------------------------------- 1 | null := 2 | space := $(null) # 3 | comma := , 4 | 5 | ## Define the data sets to include in the plots 6 | datasetsms := DCS108 HEK293RNA RNA001 pilot 7 | datasetsmsc := $(subst $(space),$(comma),$(datasetsms)) 8 | 9 | conditionsms := wt 10 | conditionsmsc := $(subst $(space),$(comma),$(conditionsms)) -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_compare_gffcompare_sqanti.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(rtracklayer) 8 | library(dplyr) 9 | library(GenomicFeatures) 10 | library(ggplot2) 11 | library(grDevices) 12 | library(cowplot) 13 | library(grid) 14 | library(pheatmap) 15 | library(viridis) 16 | }) 17 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 18 | 19 | datasets <- strsplit(datasets, ",")[[1]] 20 | names(datasets) <- datasets 21 | conditions <- strsplit(conditions, ",")[[1]] 22 | 23 | if (all(c("wt", "srpk") %in% conditions)) { 24 | flaircond <- "_all" 25 | } else if ("wt" %in% conditions) { 26 | flaircond <- "_WT" 27 | } else { 28 | stop("Unknown conditions") 29 | } 30 | 31 | print(datasets) 32 | print(conditions) 33 | print(flaircond) 34 | print(outrds) 35 | 36 | ## Read class codes from gffcompare 37 | dfcc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 38 | tmap <- read.delim(paste0( 39 | ds, "/flair_round2/", ds, flaircond, "/", ds, flaircond, 40 | "_minimap_genome_s_primary_flair_collapse.isoforms.gffcompare.", 41 | ds, flaircond, "_minimap_genome_s_primary_flair_collapse.isoforms.gtf.tmap"), 42 | header = TRUE, as.is = TRUE) %>% 43 | dplyr::select(qry_id, class_code, ref_id) %>% 44 | dplyr::mutate(dataset = remapds[ds]) %>% 45 | dplyr::mutate(class_code = as.character(class_code), 46 | ref_id = as.character(ref_id)) 47 | if (ds %in% c("RNA001", "DCS108", "pilot")) { 48 | tmap2 <- read.delim(paste0( 49 | ds, "/flair_round2_ilmnjunc/", ds, flaircond, "/", ds, flaircond, 50 | "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.gffcompare.", 51 | ds, flaircond, "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.", 52 | "gtf.tmap"), 53 | header = TRUE, as.is = TRUE) %>% 54 | dplyr::select(qry_id, class_code, ref_id) %>% 55 | dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc")) %>% 56 | dplyr::mutate(class_code = as.character(class_code), 57 | ref_id = as.character(ref_id)) 58 | tmap <- dplyr::bind_rows(tmap, tmap2) 59 | } 60 | tmap 61 | })) %>% 62 | dplyr::mutate( 63 | class_code = replace(class_code, class_code == "=", 64 | "complete, exact match of intron chain (=)"), 65 | class_code = replace(class_code, class_code == "c", 66 | "contained in reference (intron compatible) (c)"), 67 | class_code = replace(class_code, class_code == "k", 68 | "containment of reference (k)"), 69 | class_code = replace(class_code, class_code == "m", 70 | "retained intron(s), full intron chain overlap (m)"), 71 | class_code = replace(class_code, class_code == "n", 72 | "retained intron(s), partial or no intron chain match (n)"), 73 | class_code = replace(class_code, class_code == "j", 74 | "multi-exon with at least one junction match (j)"), 75 | class_code = replace(class_code, class_code == "e", 76 | "single exon transfrag partially covering an intron (e)"), 77 | class_code = replace(class_code, class_code == "o", 78 | "other same strand overlap with reference exons (o)"), 79 | class_code = replace(class_code, class_code == "s", 80 | "intron matching on opposite strand (s)"), 81 | class_code = replace(class_code, class_code == "x", 82 | "exonic overlap on opposite strand (x)"), 83 | class_code = replace(class_code, class_code == "i", 84 | "fully contained within a reference intron (i)"), 85 | class_code = replace(class_code, class_code == "y", 86 | "contains a reference within its introns (y)"), 87 | class_code = replace(class_code, class_code == "p", 88 | "possible polymerase run-on (no overlap) (p)"), 89 | class_code = replace(class_code, class_code == "r", "repeat (r)"), 90 | class_code = replace(class_code, class_code == "u", "other (u)")) %>% 91 | dplyr::mutate(class_code = factor(class_code, levels = c( 92 | "other (u)", 93 | "repeat (r)", 94 | "possible polymerase run-on (no overlap) (p)", 95 | "contains a reference within its introns (y)", 96 | "fully contained within a reference intron (i)", 97 | "exonic overlap on opposite strand (x)", 98 | "intron matching on opposite strand (s)", 99 | "other same strand overlap with reference exons (o)", 100 | "single exon transfrag partially covering an intron (e)", 101 | "multi-exon with at least one junction match (j)", 102 | "retained intron(s), partial or no intron chain match (n)", 103 | "retained intron(s), full intron chain overlap (m)", 104 | "containment of reference (k)", 105 | "contained in reference (intron compatible) (c)", 106 | "complete, exact match of intron chain (=)" 107 | ))) 108 | 109 | dfsc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 110 | tmap <- read.delim(paste0( 111 | ds, "/sqanti_round2/", ds, flaircond, "/", ds, flaircond, 112 | "_classification.txt"), 113 | header = TRUE, as.is = TRUE) %>% 114 | dplyr::select(isoform, structural_category, associated_transcript) %>% 115 | dplyr::mutate(dataset = remapds[ds]) %>% 116 | dplyr::mutate(structural_category = as.character(structural_category), 117 | associated_transcript = as.character(associated_transcript)) 118 | if (ds %in% c("RNA001", "DCS108", "pilot")) { 119 | tmap2 <- read.delim(paste0( 120 | ds, "/sqanti_round2_ilmnjunc/", ds, flaircond, "/", ds, flaircond, 121 | "_ilmnjunc_classification.txt"), 122 | header = TRUE, as.is = TRUE) %>% 123 | dplyr::select(isoform, structural_category, associated_transcript) %>% 124 | dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc")) %>% 125 | dplyr::mutate(structural_category = as.character(structural_category), 126 | associated_transcript = as.character(associated_transcript)) 127 | tmap <- dplyr::bind_rows(tmap, tmap2) 128 | } 129 | tmap 130 | })) %>% 131 | dplyr::mutate(structural_category = factor(structural_category, levels = c( 132 | "intergenic", 133 | "genic_intron", 134 | "antisense", 135 | "novel_not_in_catalog", 136 | "novel_in_catalog", 137 | "genic", 138 | "fusion", 139 | "incomplete-splice_match", 140 | "full-splice_match" 141 | ))) 142 | 143 | dfboth <- dplyr::full_join(dfcc, dfsc, by = c("qry_id" = "isoform", "dataset")) 144 | dfbothsub <- dfboth %>% dplyr::filter(dataset == "ONT-RNA001-HAP_ILMNjunc") 145 | 146 | tb <- table(dfboth$class_code, dfboth$structural_category) 147 | tbsub <- table(dfbothsub$class_code, dfbothsub$structural_category) 148 | 149 | sqrt_breaks <- function(xs, n = 10) { 150 | breaks <- (seq(0, (max(xs) + 1)^(1/3), length.out = n))^3 151 | breaks[!duplicated(breaks)] 152 | } 153 | mat_breaks <- sqrt_breaks(tb, n = 101) 154 | mat_breaks_sub <- sqrt_breaks(tbsub, n = 101) 155 | 156 | png(gsub("rds$", "png", outrds), height = 8, width = 8, unit = "in", res = 400) 157 | pheatmap::pheatmap(tb, cluster_rows = FALSE, cluster_cols = FALSE, 158 | color = inferno(length(mat_breaks) - 1), 159 | breaks = mat_breaks) 160 | dev.off() 161 | 162 | png(gsub("\\.rds$", "_sub.png", outrds), height = 8, width = 8, unit = "in", res = 400) 163 | pheatmap::pheatmap(tbsub, cluster_rows = FALSE, cluster_cols = FALSE, 164 | color = inferno(length(mat_breaks_sub) - 1), 165 | breaks = mat_breaks_sub, main = "ONT-RNA001-HAP_ILMNjunc") 166 | dev.off() 167 | 168 | saveRDS(NULL, file = outrds) 169 | date() 170 | sessionInfo() 171 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_consistency_between_replicates.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(ggplot2) 9 | library(stringr) 10 | library(ggbeeswarm) 11 | }) 12 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 13 | 14 | datasets <- strsplit(datasets, ",")[[1]] 15 | datasets <- c(datasets, "Illumina") 16 | names(datasets) <- datasets 17 | conditions <- strsplit(conditions, ",")[[1]] 18 | 19 | print(tx2gene) 20 | print(datasets) 21 | print(conditions) 22 | print(outrds) 23 | 24 | ## Define base colors 25 | muted <- c("#DC050C","#E8601C","#7BAFDE","#1965B0","#B17BA6", 26 | "#882E72","#F1932D","#F6C141","#F7EE55","#4EB265", 27 | "#90C987","#CAEDAB","#777777") 28 | colfun <- grDevices::colorRampPalette(muted) 29 | 30 | abundances <- lapply(datasets, function(ds) { 31 | readRDS(paste0(ds, "/output/", ds, "_all_abundances.rds")) 32 | }) 33 | 34 | ## Keep only the desired abundance measures 35 | abundances <- lapply(abundances, function(w) { 36 | w$tx_abundances <- 37 | w$tx_abundances[, grep("count__salmon31|count__salmonminimap2_p0.99|count__wubminimap2|tpm__salmon$|tpm__StringTie", 38 | colnames(w$tx_abundances))] 39 | w$gene_abundances <- 40 | w$gene_abundances[, grep("count__salmon31|count__salmonminimap2_p0.99|count__wubminimap2|count__featurecountsminimap2primary|tpm__salmon$|tpm__StringTie", 41 | colnames(w$gene_abundances))] 42 | w 43 | }) 44 | 45 | tx2gene <- readRDS(tx2gene) 46 | ## Remove version information from tx2gene 47 | tx2gene <- tx2gene %>% dplyr::mutate(tx = gsub("\\.[0-9]+$", "", tx), 48 | gene = gsub("\\.[0-9]+$", "", gene)) 49 | 50 | ## Transcript-level 51 | txlevel <- Reduce(function(...) dplyr::full_join(..., by = "tx"), 52 | lapply(abundances, function(ab) { 53 | ab$tx_abundances %>% tibble::rownames_to_column("tx") 54 | })) %>% 55 | tibble::column_to_rownames("tx") 56 | 57 | ## Gene-level 58 | genelevel <- Reduce(function(...) dplyr::full_join(..., by = "gene"), 59 | lapply(abundances, function(ab) { 60 | ab$gene_abundances %>% tibble::rownames_to_column("gene") 61 | })) %>% 62 | tibble::column_to_rownames("gene") 63 | 64 | cortx <- list() 65 | corgene <- list() 66 | 67 | ## Help functions 68 | calcCor <- function(abundancemat, method) { 69 | cor(abundancemat, 70 | method = method, use = "pairwise.complete.obs") %>% 71 | as.data.frame() %>% tibble::rownames_to_column("sample1") %>% 72 | tidyr::gather(key = "sample2", value = "correlation", -sample1) %>% 73 | tidyr::separate(sample1, into = c("sample1", "type1", "method1"), sep = "__") %>% 74 | tidyr::separate(sample2, into = c("sample2", "type2", "method2"), sep = "__") %>% 75 | tidyr::unite(col = method1, type1, method1, sep = "__") %>% 76 | tidyr::unite(col = method2, type2, method2, sep = "__") %>% 77 | dplyr::mutate(sample1 = remap[sample1], 78 | sample2 = remap[sample2]) %>% 79 | dplyr::mutate(dataset1 = sapply(strsplit(sample1, "_"), .subset, 1), 80 | dataset2 = sapply(strsplit(sample2, "_"), .subset, 1), 81 | condition1 = sapply(strsplit(sample1, "_"), .subset, 2), 82 | condition2 = sapply(strsplit(sample2, "_"), .subset, 2)) %>% 83 | dplyr::filter(condition1 %in% conditions & condition2 %in% conditions) %>% 84 | dplyr::mutate(dtype = ifelse( 85 | dataset1 == "Illumina" & dataset2 == "Illumina", "Illumina", 86 | ifelse((dataset1 == "Illumina" & dataset2 != "Illumina") | 87 | (dataset1 != "Illumina" & dataset2 == "Illumina"), 88 | "Illumina-Nanopore", "Nanopore"))) %>% 89 | dplyr::mutate(combination = ifelse(dataset1 == dataset2, "same dataset", 90 | "different datasets")) 91 | } 92 | 93 | plotCor <- function(cordf, ylab, title) { 94 | tmp <- cordf %>% 95 | dplyr::filter(dataset1 != "ONT-RNA001-HEK" & dataset2 != "ONT-RNA001-HEK") %>% 96 | dplyr::mutate(method1 = gsub("featurecounts", "fC", 97 | gsub("count__", "", method1)), 98 | method2 = gsub("featurecounts", "fC", 99 | gsub("count__", "", method2))) %>% 100 | dplyr::filter(gsub("tpm__", "", gsub("31", "", method1)) == 101 | gsub("tpm__", "", gsub("31", "", method2)) & 102 | sample1 != sample2 & 103 | condition1 == condition2 & 104 | (dataset1 < dataset2 | (dataset1 == dataset2 & 105 | sample1 < sample2))) %>% 106 | dplyr::mutate( 107 | method1 = replace( 108 | method1, dtype == "Illumina-Nanopore", 109 | gsub("tpm__", "", gsub("31", "", 110 | method1[dtype == "Illumina-Nanopore"]))), 111 | method2 = replace( 112 | method2, dtype == "Illumina-Nanopore", 113 | gsub("tpm__", "", gsub("31", "", 114 | method2[dtype == "Illumina-Nanopore"]))) 115 | ) 116 | ncol <- length(unique(interaction(tmp$dataset1, tmp$dataset2))) 117 | ggplot(tmp, 118 | aes(x = method1, y = correlation, color = interaction(dataset1, dataset2), 119 | shape = combination)) + 120 | geom_quasirandom(size = 3, alpha = 0.6) + theme_bw() + 121 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, 122 | hjust = 1, size = 12)) + 123 | xlab("") + ylab(ylab) + 124 | scale_color_manual(values = colfun(ncol), name = "") + 125 | expand_limits(y = c(0, 1)) + scale_shape_discrete(name = "") + 126 | facet_grid(~ dtype, scales = "free_x", space = "free_x") + 127 | ggtitle(title) 128 | } 129 | 130 | plotCorBetween <- function(cordf, ylab, title) { 131 | ggplot(cordf %>% 132 | dplyr::mutate(method1 = gsub("featurecounts", "fC", 133 | gsub("count__", "", method1)), 134 | method2 = gsub("featurecounts", "fC", 135 | gsub("count__", "", method2))) %>% 136 | dplyr::filter(method1 < method2 & sample1 == sample2) %>% 137 | dplyr::mutate(dataset1 = factor(dataset1, levels = 138 | ds_order[ds_order %in% dataset1])), 139 | aes(x = interaction(method1, method2), y = correlation, color = dataset1)) + 140 | geom_quasirandom(size = 4, alpha = 0.8) + theme_bw() + 141 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 142 | xlab("") + ylab(ylab) + ggtitle(title) + 143 | scale_color_manual(values = ds_colors, 144 | name = "") + expand_limits(y = c(0, 1)) + 145 | facet_grid(~ dtype, scales = "free_x", space = "free_x") 146 | } 147 | 148 | for (m in c("Pearson", "Spearman")) { 149 | cortx[[m]] <- calcCor(sqrt(txlevel), method = tolower(m)) 150 | corgene[[m]] <- calcCor(sqrt(genelevel), method = tolower(m)) 151 | 152 | ptx <- plotCor(cordf = cortx[[m]], 153 | ylab = paste0(m, " correlation between\nreplicate pairs, ", 154 | "sqrt(abundances)"), 155 | title = "Transcript") 156 | pg <- plotCor(cordf = corgene[[m]], 157 | ylab = paste0(m, " correlation between\nreplicate pairs, ", 158 | "sqrt(abundances)"), 159 | title = "Gene") 160 | 161 | png(gsub("\\.rds", paste0("_", m, ".png"), outrds), 162 | width = 18, height = 6, unit = "in", res = 400) 163 | print(cowplot::plot_grid( 164 | ptx + theme(legend.position = "none"), 165 | pg + theme(legend.position = "none"), 166 | cowplot::get_legend(pg), 167 | nrow = 1, rel_widths = c(1, 1.15, 0.5), align = "h", axis = "t", 168 | labels = c("A", "B", "") 169 | )) 170 | dev.off() 171 | 172 | ptxb <- plotCorBetween(cordf = cortx[[m]], 173 | ylab = paste0(m, " correlation between\nmethod pairs, ", 174 | "sqrt(abundances)"), 175 | title = "Transcript") 176 | pgb <- plotCorBetween(cordf = corgene[[m]], 177 | ylab = paste0(m, " correlation between\nmethod pairs, ", 178 | "sqrt(abundances)"), 179 | title = "Gene") 180 | 181 | png(gsub("\\.rds", paste0("_betweenmethods_", m, ".png"), outrds), 182 | width = 13, height = 6, unit = "in", res = 400) 183 | print(cowplot::plot_grid( 184 | cowplot::plot_grid( 185 | ptxb + theme(legend.position = "none"), 186 | pgb + theme(legend.position = "none"), 187 | nrow = 1, rel_widths = c(1, 1), labels = c("A", "B"), 188 | align = "h", axis = "tb"), 189 | cowplot::get_legend(pgb), nrow = 1, rel_widths = c(2, 0.4), 190 | labels = c("", ""), align = "h", axis = "t" 191 | )) 192 | dev.off() 193 | } 194 | 195 | saveRDS(NULL, file = outrds) 196 | date() 197 | sessionInfo() 198 | 199 | 200 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_flair_round2_summary.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(rtracklayer) 8 | library(dplyr) 9 | library(GenomicFeatures) 10 | library(ggplot2) 11 | library(grDevices) 12 | library(cowplot) 13 | library(grid) 14 | }) 15 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 16 | 17 | datasets <- strsplit(datasets, ",")[[1]] 18 | datasets <- c(datasets, "Illumina") 19 | names(datasets) <- datasets 20 | conditions <- strsplit(conditions, ",")[[1]] 21 | 22 | if (all(c("wt", "srpk") %in% conditions)) { 23 | flaircond <- "_all" 24 | } else if ("wt" %in% conditions) { 25 | flaircond <- "_WT" 26 | } else { 27 | stop("Unknown conditions") 28 | } 29 | 30 | print(datasets) 31 | print(conditions) 32 | print(flaircond) 33 | print(outrds) 34 | 35 | muted <- c("#DC050C", "#E8601C", "#7BAFDE", "#1965B0", "#B17BA6", 36 | "#882E72", "#F1932D", "#F6C141", "#F7EE55", "#4EB265", 37 | "#CAEDAB", "#777777") 38 | colfun <- grDevices::colorRampPalette(muted) 39 | 40 | ## Read Illumina abundance estimates 41 | ilmn_tx_abundances <- 42 | readRDS("Illumina/output/Illumina_all_abundances.rds")$tx_abundances 43 | ilmn_samples <- sample_annotation %>% 44 | dplyr::filter(dataset == "Illumina" & condition %in% conditions) %>% 45 | dplyr::pull("sample_orig") 46 | print(ilmn_samples) 47 | ave_ilmn_tx_tpm <- 48 | data.frame(transcript_id = rownames(ilmn_tx_abundances), 49 | ave_tx_tpm = rowMeans( 50 | ilmn_tx_abundances[, paste0(ilmn_samples, "__tpm__salmon")] 51 | ), 52 | stringsAsFactors = FALSE) 53 | 54 | ## ========================================================================== ## 55 | ## Plot overlap of detected transcripts (class code = or c) 56 | df0 <- do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) { 57 | dplyr::bind_rows( 58 | read.delim(paste0( 59 | ds, "/flair_round2_ilmnjunc/", ds, flaircond, "/", ds, flaircond, 60 | "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.gffcompare.", 61 | ds, flaircond, 62 | "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.gtf.tmap"), 63 | header = TRUE, as.is = TRUE) %>% 64 | dplyr::filter(class_code %in% c("c", "=")) %>% 65 | dplyr::select(ref_id) %>% 66 | dplyr::rename(transcript_id = ref_id) %>% 67 | dplyr::distinct() %>% 68 | dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc"), 69 | identified = TRUE), 70 | read.delim(paste0( 71 | ds, "/flair_round2/", ds, flaircond, "/", ds, flaircond, 72 | "_minimap_genome_s_primary_flair_collapse.isoforms.gffcompare.", 73 | ds, flaircond, 74 | "_minimap_genome_s_primary_flair_collapse.isoforms.gtf.tmap"), 75 | header = TRUE, as.is = TRUE) %>% 76 | dplyr::filter(class_code %in% c("c", "=")) %>% 77 | dplyr::select(ref_id) %>% 78 | dplyr::rename(transcript_id = ref_id) %>% 79 | dplyr::distinct() %>% 80 | dplyr::mutate(dataset = remapds[ds], 81 | identified = TRUE) 82 | ) 83 | })) %>% 84 | dplyr::filter(dataset != "ONT-RNA001-HEK_ILMNjunc") ## don't use HAP junctions for HEK 85 | dettx <- unique(df0$transcript_id) ## all detected ref. transcripts 86 | detmat <- matrix(0, nrow = length(dettx), ncol = length(unique(df0$dataset))) 87 | rownames(detmat) <- dettx 88 | colnames(detmat) <- unique(df0$dataset) 89 | detmat[as.matrix(df0 %>% dplyr::select(transcript_id, dataset) %>% 90 | dplyr::mutate(transcript_id = as.character(transcript_id), 91 | dataset = as.character(dataset)))] <- 1 92 | ordr <- ds_order[ds_order %in% colnames(detmat)] 93 | ordr <- rep(ordr, each = 2) 94 | suffx <- rep(c("", "_ILMNjunc"), length(ordr)/2) 95 | ordr <- paste0(ordr, suffx) 96 | ordr <- setdiff(ordr, "ONT-RNA001-HEK_ILMNjunc") 97 | stopifnot(ordr %in% colnames(detmat), 98 | colnames(detmat) %in% ordr) 99 | detmat <- detmat[, ordr] 100 | png(gsub("\\.rds$", "_shared_transcripts.png", outrds), 101 | width = 10, height = 6, unit = "in", res = 400) 102 | UpSetR::upset(data.frame(detmat, check.names = FALSE), order.by = "freq", 103 | decreasing = TRUE, keep.order = TRUE, nsets = length(ordr), 104 | mainbar.y.label = "Number of shared annotated transcripts", 105 | sets.x.label = "Number of identified\nannotated transcripts", 106 | sets.bar.color = ds_colors[gsub("_ILMNjunc", "", colnames(detmat))], 107 | sets = colnames(detmat), mb.ratio = c(0.55, 0.45)) 108 | grid::grid.edit("arrange", name = "arrange2") 109 | upsetplot <- grid::grid.grab() 110 | dev.off() 111 | 112 | ## ========================================================================== ## 113 | ## Plot abundance of identified/unidentified transcripts 114 | ## (identified = class code = or c) 115 | df1 <- df0 %>% 116 | dplyr::full_join( 117 | do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) { 118 | dplyr::bind_rows(ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = remapds[ds]), 119 | ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = paste0(remapds[ds], 120 | "_ILMNjunc"))) %>% 121 | dplyr::filter(dataset != "ONT-RNA001-HEK_ILMNjunc") 122 | })), 123 | by = c("transcript_id", "dataset") 124 | ) %>% 125 | dplyr::mutate(identified = replace(identified, is.na(identified), FALSE)) 126 | 127 | png(gsub("\\.rds$", "_illumina_abundance_by_identification.png", outrds), 128 | width = 8, height = 4, unit = "in", res = 400) 129 | ggplot(df1, aes(x = identified, y = ave_tx_tpm + 1)) + 130 | geom_boxplot() + facet_wrap(~ dataset, nrow = 1) + theme_bw() + 131 | scale_y_log10() + xlab("Transcript identified by FLAIR") + 132 | ylab("Average abundance across Illumina samples (TPM + 1)") 133 | dev.off() 134 | 135 | ## Merge with upset plot 136 | df2 <- df1 %>% dplyr::filter(grepl("_ILMNjunc$|RNA001-HEK", dataset)) %>% 137 | dplyr::select(-dataset) %>% 138 | dplyr::group_by(transcript_id, ave_tx_tpm) %>% 139 | dplyr::summarize(identified = any(identified)) 140 | p1 <- ggplot(df2, aes(x = identified, y = ave_tx_tpm + 1)) + 141 | geom_boxplot() + theme_bw() + 142 | scale_y_log10() + xlab("Transcript identified by FLAIR") + 143 | ylab("Average abundance across Illumina samples (TPM + 1)") 144 | png(gsub("\\.rds$", "_illumina_abundance_plus_shared_tx.png", outrds), 145 | width = 12, height = 6, unit = "in", res = 400) 146 | print(cowplot::plot_grid( 147 | upsetplot, 148 | p1, 149 | rel_widths = c(4, 1.5), nrow = 1, labels = c("A", "B") 150 | )) 151 | dev.off() 152 | 153 | ## ========================================================================== ## 154 | ## Get number of exons and read length per flair transcript and plot for 155 | ## different class codes 156 | ds_order_both <- rep(ds_order, each = 2) 157 | suffx <- rep(c("", "_ILMNjunc"), length(ds_order_both)/2) 158 | ds_order_both <- paste0(ds_order_both, suffx) 159 | nbrexons <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 160 | tmp <- as.data.frame(rtracklayer::import( 161 | paste0(ds, "/", ifelse(ds == "Illumina", "stringtie_assembly", 162 | "flair_round2"), 163 | "/", ds, flaircond, "/", ds, flaircond, 164 | ifelse(ds == "Illumina", "_stringtie_assembly.", 165 | "_minimap_genome_s_primary_flair_collapse.isoforms."), 166 | "gffcompare.annotated.gtf"))) 167 | tmap <- dplyr::full_join( 168 | tmp %>% 169 | dplyr::filter(type == "exon") %>% 170 | dplyr::group_by(transcript_id) %>% 171 | dplyr::summarize(nbr_exons = length(type), 172 | txLength = sum(width)), 173 | tmp %>% 174 | dplyr::filter(type == "transcript") %>% 175 | dplyr::select(transcript_id, class_code), 176 | by = "transcript_id" 177 | ) %>% dplyr::mutate(dataset = remapds[ds]) 178 | if (ds %in% c("RNA001", "DCS108", "pilot")) { 179 | tmp2 <- as.data.frame(rtracklayer::import( 180 | paste0(ds, "/flair_round2_ilmnjunc/", ds, flaircond, "/", ds, flaircond, 181 | "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.", 182 | "gffcompare.annotated.gtf"))) 183 | tmap2 <- dplyr::full_join( 184 | tmp2 %>% 185 | dplyr::filter(type == "exon") %>% 186 | dplyr::group_by(transcript_id) %>% 187 | dplyr::summarize(nbr_exons = length(type), 188 | txLength = sum(width)), 189 | tmp2 %>% 190 | dplyr::filter(type == "transcript") %>% 191 | dplyr::select(transcript_id, class_code), 192 | by = "transcript_id" 193 | ) %>% dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc")) 194 | tmap <- dplyr::bind_rows(tmap, tmap2) 195 | } 196 | tmap 197 | })) %>% 198 | dplyr::mutate(nbr_exons = replace(nbr_exons, nbr_exons > 2, ">2")) %>% 199 | dplyr::mutate(dataset = factor(dataset, levels = ds_order_both[ds_order_both %in% dataset])) 200 | 201 | 202 | ## ========================================================================== ## 203 | ## Plot distribution of class codes 204 | dfcc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 205 | tmap <- read.delim(paste0( 206 | ds, "/", ifelse(ds == "Illumina", "stringtie_assembly", "flair_round2"), 207 | "/", ds, flaircond, "/", ds, flaircond, 208 | ifelse(ds == "Illumina", "_stringtie_assembly.gffcompare.", 209 | "_minimap_genome_s_primary_flair_collapse.isoforms.gffcompare."), 210 | ds, flaircond, ifelse(ds == "Illumina", "_stringtie_assembly.", 211 | "_minimap_genome_s_primary_flair_collapse.isoforms."), 212 | "gtf.tmap"), 213 | header = TRUE, as.is = TRUE) 214 | t1 <- as.data.frame(table(tmap$class_code)) %>% 215 | dplyr::mutate(dataset = remapds[ds]) %>% 216 | dplyr::rename(class_code = Var1) %>% 217 | dplyr::mutate(class_code = as.character(class_code)) 218 | if (ds %in% c("RNA001", "DCS108", "pilot")) { 219 | tmap2 <- read.delim(paste0( 220 | ds, "/flair_round2_ilmnjunc/", ds, flaircond, "/", ds, flaircond, 221 | "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.gffcompare.", 222 | ds, flaircond, "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.", 223 | "gtf.tmap"), 224 | header = TRUE, as.is = TRUE) 225 | t1 <- dplyr::bind_rows( 226 | t1, as.data.frame(table(tmap2$class_code)) %>% 227 | dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc")) %>% 228 | dplyr::rename(class_code = Var1) %>% 229 | dplyr::mutate(class_code = as.character(class_code))) 230 | } 231 | t1 232 | })) %>% 233 | dplyr::mutate( 234 | class_code = replace(class_code, class_code == "=", 235 | "complete, exact match of intron chain (=)"), 236 | class_code = replace(class_code, class_code == "c", 237 | "contained in reference (intron compatible) (c)"), 238 | class_code = replace(class_code, class_code == "k", 239 | "containment of reference (k)"), 240 | class_code = replace(class_code, class_code == "m", 241 | "retained intron(s), full intron chain overlap (m)"), 242 | class_code = replace(class_code, class_code == "n", 243 | "retained intron(s), partial or no intron chain match (n)"), 244 | class_code = replace(class_code, class_code == "j", 245 | "multi-exon with at least one junction match (j)"), 246 | class_code = replace(class_code, class_code == "e", 247 | "single exon transfrag partially covering an intron (e)"), 248 | class_code = replace(class_code, class_code == "o", 249 | "other same strand overlap with reference exons (o)"), 250 | class_code = replace(class_code, class_code == "s", 251 | "intron matching on opposite strand (s)"), 252 | class_code = replace(class_code, class_code == "x", 253 | "exonic overlap on opposite strand (x)"), 254 | class_code = replace(class_code, class_code == "i", 255 | "fully contained within a reference intron (i)"), 256 | class_code = replace(class_code, class_code == "y", 257 | "contains a reference within its introns (y)"), 258 | class_code = replace(class_code, class_code == "p", 259 | "possible polymerase run-on (no overlap) (p)"), 260 | class_code = replace(class_code, class_code == "r", "repeat (r)"), 261 | class_code = replace(class_code, class_code == "u", "other (u)")) %>% 262 | dplyr::mutate(class_code = factor(class_code, levels = c( 263 | "other (u)", 264 | "repeat (r)", 265 | "possible polymerase run-on (no overlap) (p)", 266 | "contains a reference within its introns (y)", 267 | "fully contained within a reference intron (i)", 268 | "exonic overlap on opposite strand (x)", 269 | "intron matching on opposite strand (s)", 270 | "other same strand overlap with reference exons (o)", 271 | "single exon transfrag partially covering an intron (e)", 272 | "multi-exon with at least one junction match (j)", 273 | "retained intron(s), partial or no intron chain match (n)", 274 | "retained intron(s), full intron chain overlap (m)", 275 | "containment of reference (k)", 276 | "contained in reference (intron compatible) (c)", 277 | "complete, exact match of intron chain (=)" 278 | ))) %>% 279 | dplyr::mutate(dataset = factor(dataset, levels = ds_order_both[ds_order_both %in% dataset])) 280 | 281 | p0 <- ggplot(dfcc, aes(x = dataset, y = Freq)) + 282 | geom_bar(stat = "identity", position = "fill", aes(fill = class_code)) + 283 | theme_bw() + 284 | scale_fill_manual(values = structure(colfun(15), names = levels(dfcc$class_code)), 285 | name = "Overlap type (class code)") + 286 | ylab("Fraction of transcripts") + xlab("") + 287 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 288 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 289 | geom_text(data = dfcc %>% dplyr::group_by(dataset) %>% 290 | dplyr::summarize(nbrTx = sum(Freq)), 291 | aes(x = dataset, y = 1, label = nbrTx), 292 | vjust = 0, nudge_y = 0.02, size = 3) 293 | 294 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_classcode_full.png", outrds), 295 | width = 11, height = 16, unit = "in", res = 400) 296 | p1 <- ggplot(nbrexons %>% 297 | dplyr::group_by(dataset, class_code, nbr_exons) %>% 298 | dplyr::tally() %>% 299 | dplyr::ungroup() %>% 300 | dplyr::mutate(nbr_exons = factor(nbr_exons, 301 | levels = c(">2", "2", "1"))), 302 | aes(x = class_code, y = n, fill = nbr_exons)) + 303 | geom_bar(stat = "identity", position = "fill") + theme_bw() + 304 | facet_wrap(~ dataset, nrow = 2) + 305 | xlab("Class code") + ylab("Fraction of transcripts") + 306 | scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"), 307 | name = "Number of\nexons") + 308 | theme(legend.position = "bottom") + 309 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) 310 | p2 <- ggplot(nbrexons, 311 | aes(x = class_code, y = txLength)) + 312 | geom_boxplot() + theme_bw() + 313 | facet_wrap(~ dataset, nrow = 2) + 314 | scale_y_log10() + 315 | xlab("Class code") + ylab("Inferred transcript length") 316 | cowplot::plot_grid(p0, p1, p2, ncol = 1, labels = c("A", "B", "C"), 317 | rel_heights = c(1, 1.1, 1), align = "v", axis = "l") 318 | dev.off() 319 | 320 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_classcode.png", outrds), 321 | width = 9, height = 12, unit = "in", res = 400) 322 | p1 <- ggplot(nbrexons %>% 323 | dplyr::filter(dataset %in% c("ONT-RNA001-HAP_ILMNjunc", "Illumina")) %>% 324 | dplyr::group_by(dataset, class_code, nbr_exons) %>% 325 | dplyr::tally() %>% 326 | dplyr::ungroup() %>% 327 | dplyr::mutate(nbr_exons = factor(nbr_exons, 328 | levels = c(">2", "2", "1"))), 329 | aes(x = class_code, y = n, fill = nbr_exons)) + 330 | geom_bar(stat = "identity", position = "fill") + theme_bw() + 331 | facet_wrap(~ dataset, nrow = 1) + 332 | xlab("Class code") + ylab("Fraction of transcripts") + 333 | scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"), 334 | name = "Number of\nexons") + 335 | theme(legend.position = "bottom") + 336 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) 337 | p2 <- ggplot(nbrexons %>% 338 | dplyr::filter(dataset %in% c("ONT-RNA001-HAP_ILMNjunc", "Illumina")), 339 | aes(x = class_code, y = txLength)) + 340 | geom_boxplot() + theme_bw() + 341 | facet_wrap(~ dataset, nrow = 1) + 342 | scale_y_log10() + 343 | xlab("Class code") + ylab("Inferred transcript length") 344 | cowplot::plot_grid(p0, p1, p2, 345 | ncol = 1, labels = c("A", "B", "C"), 346 | rel_heights = c(1.9, 1.28, 1.1), align = "v", axis = "l") 347 | dev.off() 348 | 349 | saveRDS(NULL, file = outrds) 350 | date() 351 | sessionInfo() -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_flair_summary.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(rtracklayer) 8 | library(dplyr) 9 | library(GenomicFeatures) 10 | library(ggplot2) 11 | library(grDevices) 12 | library(cowplot) 13 | library(grid) 14 | }) 15 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 16 | 17 | datasets <- strsplit(datasets, ",")[[1]] 18 | datasets <- c(datasets, "Illumina") 19 | names(datasets) <- datasets 20 | conditions <- strsplit(conditions, ",")[[1]] 21 | 22 | if (all(c("wt", "srpk") %in% conditions)) { 23 | flaircond <- "_all" 24 | } else if ("wt" %in% conditions) { 25 | flaircond <- "_WT" 26 | } else { 27 | stop("Unknown conditions") 28 | } 29 | 30 | print(datasets) 31 | print(conditions) 32 | print(flaircond) 33 | print(outrds) 34 | 35 | muted <- c("#DC050C", "#E8601C", "#7BAFDE", "#1965B0", "#B17BA6", 36 | "#882E72", "#F1932D", "#F6C141", "#F7EE55", "#4EB265", 37 | "#CAEDAB", "#777777") 38 | colfun <- grDevices::colorRampPalette(muted) 39 | 40 | ## Read Illumina abundance estimates 41 | ilmn_tx_abundances <- 42 | readRDS("Illumina/output/Illumina_all_abundances.rds")$tx_abundances 43 | ilmn_samples <- sample_annotation %>% 44 | dplyr::filter(dataset == "Illumina" & condition %in% conditions) %>% 45 | dplyr::pull("sample_orig") 46 | ave_ilmn_tx_tpm <- 47 | data.frame(transcript_id = rownames(ilmn_tx_abundances), 48 | ave_tx_tpm = rowMeans( 49 | ilmn_tx_abundances[, paste0(ilmn_samples, "__tpm__salmon")] 50 | ), 51 | stringsAsFactors = FALSE) 52 | 53 | ## ========================================================================== ## 54 | ## Plot overlap of detected transcripts (class code = or c) 55 | df0 <- do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) { 56 | read.delim(paste0( 57 | ds, "/", ifelse(ds == "Illumina", "stringtie_assembly", "flair"), 58 | "/", ds, flaircond, "/", ds, flaircond, 59 | ifelse(ds == "Illumina", "_stringtie_assembly.gffcompare.", 60 | "_minimap_genome_s_primary_flair_collapse.isoforms.gffcompare."), 61 | ds, flaircond, 62 | ifelse(ds == "Illumina", "_stringtie_assembly.gtf.tmap", 63 | "_minimap_genome_s_primary_flair_collapse.isoforms.gtf.tmap")), 64 | header = TRUE, as.is = TRUE) %>% 65 | dplyr::filter(class_code %in% c("c", "=")) %>% 66 | dplyr::select(ref_id) %>% 67 | dplyr::rename(transcript_id = ref_id) %>% 68 | dplyr::distinct() %>% 69 | dplyr::mutate(dataset = remapds[ds], 70 | identified = TRUE) 71 | })) 72 | dettx <- unique(df0$transcript_id) 73 | detmat <- matrix(0, nrow = length(dettx), ncol = length(unique(df0$dataset))) 74 | rownames(detmat) <- dettx 75 | colnames(detmat) <- unique(df0$dataset) 76 | detmat[as.matrix(df0 %>% dplyr::select(transcript_id, dataset))] <- 1 77 | png(gsub("\\.rds$", "_shared_transcripts.png", outrds), 78 | width = 10, height = 6, unit = "in", res = 400) 79 | UpSetR::upset(data.frame(detmat, check.names = FALSE), order.by = "freq") 80 | grid::grid.edit("arrange", name = "arrange2") 81 | upsetplot <- grid::grid.grab() 82 | dev.off() 83 | 84 | ## ========================================================================== ## 85 | ## Plot abundance of identified/unidentified transcripts 86 | ## (identified = class code = or c) 87 | df1 <- df0 %>% 88 | dplyr::full_join( 89 | do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) { 90 | ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = remapds[ds]) 91 | })), 92 | by = c("transcript_id", "dataset") 93 | ) %>% 94 | dplyr::mutate(identified = replace(identified, is.na(identified), FALSE)) 95 | 96 | png(gsub("\\.rds$", "_illumina_abundance_by_identification.png", outrds), 97 | width = 8, height = 4, unit = "in", res = 400) 98 | ggplot(df1, aes(x = identified, y = ave_tx_tpm + 1)) + 99 | geom_boxplot() + facet_wrap(~ dataset, nrow = 1) + theme_bw() + 100 | scale_y_log10() + xlab("Transcript identified by flair") + 101 | ylab("Average abundance across Illumina samples (TPM + 1)") 102 | dev.off() 103 | 104 | ## Merge with upset plot 105 | df2 <- df1 %>% dplyr::select(-dataset) %>% 106 | dplyr::group_by(transcript_id, ave_tx_tpm) %>% 107 | dplyr::summarize(identified = any(identified)) 108 | p1 <- ggplot(df2, aes(x = identified, y = ave_tx_tpm + 1)) + 109 | geom_boxplot() + theme_bw() + 110 | scale_y_log10() + xlab("Transcript identified by flair") + 111 | ylab("Average abundance across Illumina samples (TPM + 1)") 112 | png(gsub("\\.rds$", "_illumina_abundance_plus_shared_tx.png", outrds), 113 | width = 12, height = 6, unit = "in", res = 400) 114 | print(cowplot::plot_grid( 115 | upsetplot, 116 | p1, 117 | rel_widths = c(4, 1.5), nrow = 1, labels = c("A", "B") 118 | )) 119 | dev.off() 120 | 121 | ## ========================================================================== ## 122 | ## Get number of exons and read length per flair transcript and plot for 123 | ## different class codes 124 | nbrexons <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 125 | tmp <- as.data.frame(rtracklayer::import( 126 | paste0(ds, "/", ifelse(ds == "Illumina", "stringtie_assembly", "flair"), 127 | "/", ds, flaircond, "/", ds, flaircond, 128 | ifelse(ds == "Illumina", "_stringtie_assembly.", 129 | "_minimap_genome_s_primary_flair_collapse.isoforms."), 130 | "gffcompare.annotated.gtf"))) 131 | dplyr::full_join( 132 | tmp %>% 133 | dplyr::filter(type == "exon") %>% 134 | dplyr::group_by(transcript_id) %>% 135 | dplyr::summarize(nbr_exons = length(type), 136 | txLength = sum(width)), 137 | tmp %>% 138 | dplyr::filter(type == "transcript") %>% 139 | dplyr::select(transcript_id, class_code), 140 | by = "transcript_id" 141 | ) %>% dplyr::mutate(dataset = remapds[ds]) 142 | })) %>% 143 | dplyr::mutate(nbr_exons = replace(nbr_exons, nbr_exons > 2, ">2")) 144 | 145 | ## ========================================================================== ## 146 | ## Plot distribution of class codes 147 | dfcc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 148 | tmap <- read.delim(paste0( 149 | ds, "/", ifelse(ds == "Illumina", "stringtie_assembly", "flair"), 150 | "/", ds, flaircond, "/", ds, flaircond, 151 | ifelse(ds == "Illumina", "_stringtie_assembly.gffcompare.", 152 | "_minimap_genome_s_primary_flair_collapse.isoforms.gffcompare."), 153 | ds, flaircond, ifelse(ds == "Illumina", "_stringtie_assembly.", 154 | "_minimap_genome_s_primary_flair_collapse.isoforms."), 155 | "gtf.tmap"), 156 | header = TRUE, as.is = TRUE) 157 | as.data.frame(table(tmap$class_code)) %>% 158 | dplyr::mutate(dataset = remapds[ds]) %>% 159 | dplyr::rename(class_code = Var1) %>% 160 | dplyr::mutate(class_code = as.character(class_code)) 161 | })) %>% 162 | dplyr::mutate( 163 | class_code = replace(class_code, class_code == "=", 164 | "complete, exact match of intron chain (=)"), 165 | class_code = replace(class_code, class_code == "c", 166 | "contained in reference (intron compatible) (c)"), 167 | class_code = replace(class_code, class_code == "k", 168 | "containment of reference (k)"), 169 | class_code = replace(class_code, class_code == "m", 170 | "retained intron(s), full intron chain overlap (m)"), 171 | class_code = replace(class_code, class_code == "n", 172 | "retained intron(s), partial or no intron chain match (n)"), 173 | class_code = replace(class_code, class_code == "j", 174 | "multi-exon with at least one junction match (j)"), 175 | class_code = replace(class_code, class_code == "e", 176 | "single exon transfrag partially covering an intron (e)"), 177 | class_code = replace(class_code, class_code == "o", 178 | "other same strand overlap with reference exons (o)"), 179 | class_code = replace(class_code, class_code == "s", 180 | "intron matching on opposite strand (s)"), 181 | class_code = replace(class_code, class_code == "x", 182 | "exonic overlap on opposite strand (x)"), 183 | class_code = replace(class_code, class_code == "i", 184 | "fully contained within a reference intron (i)"), 185 | class_code = replace(class_code, class_code == "y", 186 | "contains a reference within its introns (y)"), 187 | class_code = replace(class_code, class_code == "p", 188 | "possible polymerase run-on (no overlap) (p)"), 189 | class_code = replace(class_code, class_code == "r", "repeat (r)"), 190 | class_code = replace(class_code, class_code == "u", "other (u)")) %>% 191 | dplyr::mutate(class_code = factor(class_code, levels = c( 192 | "other (u)", 193 | "repeat (r)", 194 | "possible polymerase run-on (no overlap) (p)", 195 | "contains a reference within its introns (y)", 196 | "fully contained within a reference intron (i)", 197 | "exonic overlap on opposite strand (x)", 198 | "intron matching on opposite strand (s)", 199 | "other same strand overlap with reference exons (o)", 200 | "single exon transfrag partially covering an intron (e)", 201 | "multi-exon with at least one junction match (j)", 202 | "retained intron(s), partial or no intron chain match (n)", 203 | "retained intron(s), full intron chain overlap (m)", 204 | "containment of reference (k)", 205 | "contained in reference (intron compatible) (c)", 206 | "complete, exact match of intron chain (=)" 207 | ))) 208 | 209 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_classcode.png", outrds), 210 | width = 11, height = 11, unit = "in", res = 400) 211 | p0 <- ggplot(dfcc, aes(x = dataset, y = Freq)) + 212 | geom_bar(stat = "identity", position = "fill", aes(fill = class_code)) + theme_bw() + 213 | scale_fill_manual(values = structure(colfun(15), names = levels(dfcc$class_code)), 214 | name = "Overlap type (class code)") + 215 | ylab("Fraction of transcripts") + xlab("") + 216 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 217 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 218 | geom_text(data = dfcc %>% dplyr::group_by(dataset) %>% 219 | dplyr::summarize(nbrTx = sum(Freq)), 220 | aes(x = dataset, y = 1, label = nbrTx), 221 | vjust = 0, nudge_y = 0.02, size = 3) 222 | p1 <- ggplot(nbrexons %>% 223 | dplyr::group_by(dataset, class_code, nbr_exons) %>% 224 | dplyr::tally() %>% 225 | dplyr::ungroup() %>% 226 | dplyr::mutate(nbr_exons = factor(nbr_exons, 227 | levels = c(">2", "2", "1"))), 228 | aes(x = class_code, y = n, fill = nbr_exons)) + 229 | geom_bar(stat = "identity", position = "fill") + theme_bw() + 230 | facet_wrap(~ dataset, nrow = 1) + 231 | xlab("Class code") + ylab("Fraction of transcripts") + 232 | scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"), 233 | name = "Number of\nexons") + 234 | theme(legend.position = "bottom") + 235 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) 236 | p2 <- ggplot(nbrexons, 237 | aes(x = class_code, y = txLength)) + 238 | geom_boxplot() + theme_bw() + 239 | facet_wrap(~ dataset, nrow = 1) + 240 | scale_y_log10() + 241 | xlab("Class code") + ylab("Inferred transcript length") 242 | cowplot::plot_grid(p0, p1, p2, ncol = 1, labels = c("A", "B", "C"), 243 | rel_heights = c(1.7, 1.08, 0.9), align = "v", axis = "l") 244 | dev.off() 245 | 246 | 247 | saveRDS(NULL, file = outrds) 248 | date() 249 | sessionInfo() -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_gc_content_genomenotxome.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(ggplot2) 9 | library(cowplot) 10 | library(stringr) 11 | library(Biostrings) 12 | library(data.table) 13 | }) 14 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 15 | 16 | datasets <- strsplit(datasets, ",")[[1]] 17 | names(datasets) <- datasets 18 | conditions <- strsplit(conditions, ",")[[1]] 19 | 20 | print(datasets) 21 | print(conditions) 22 | print(outrds) 23 | 24 | fastqs <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 25 | fastqdir <- paste0(ds, "/FASTQ", 26 | ifelse(ds %in% c("RNA001", "HEK293RNA"), "dna", "")) 27 | fastqfiles <- list.files(fastqdir, pattern = "(FASTQ|fastq)\\.gz$", 28 | full.names = TRUE) 29 | names(fastqfiles) <- gsub("\\.(FASTQ|fastq).gz", "", basename(fastqfiles)) 30 | fastqfiles <- fastqfiles[!grepl("_orig", names(fastqfiles))] 31 | fastqfiles <- fastqfiles[names(fastqfiles) %in% 32 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 33 | fastqfiles 34 | 35 | genomenotxomedir <- paste0(ds, "/FASTQgenomenotxome") 36 | textfiles <- list.files(genomenotxomedir, pattern = "_reads_aligning_to_genome_but_not_txome.txt", 37 | full.names = TRUE) 38 | names(textfiles) <- gsub("_reads_aligning_to_genome_but_not_txome.txt", "", basename(textfiles)) 39 | textfiles <- textfiles[!grepl("_orig", names(textfiles))] 40 | textfiles <- textfiles[names(textfiles) %in% 41 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 42 | textfiles 43 | 44 | stopifnot(names(fastqfiles) %in% names(textfiles), 45 | names(textfiles) %in% names(fastqfiles)) 46 | textfiles <- textfiles[match(names(fastqfiles), names(textfiles))] 47 | stopifnot(names(fastqfiles) == names(textfiles)) 48 | 49 | do.call(dplyr::bind_rows, lapply(names(fastqfiles), function(nm) { 50 | f <- fastqfiles[nm] 51 | fastq <- fread(paste0("zcat ", f), sep = "\n", header = FALSE)$V1 52 | seq_idxs <- seq(2, length(fastq), by = 4) 53 | reads <- Biostrings::DNAStringSet(x = fastq[seq_idxs]) 54 | names(reads) <- sapply(strsplit(fastq[seq_idxs - 1], " "), .subset, 1) 55 | gc <- letterFrequency(reads, "GC", as.prob = TRUE) 56 | gnt <- read.delim(textfiles[nm], header = FALSE, as.is = TRUE)$V1 57 | as.data.frame(gc) %>% dplyr::mutate(read = names(reads), 58 | sample = remap[nm], 59 | dataset = remapds[ds]) %>% 60 | dplyr::mutate(genomenotxome = gsub("^@", "", read) %in% gnt) 61 | })) 62 | })) 63 | 64 | fastqs <- fastqs %>% 65 | dplyr::mutate(readcat = c("Other reads", "Reads mapping to genome but not transcriptome")[genomenotxome + 1]) 66 | print(table(fastqs$sample, fastqs$readcat)) 67 | 68 | png(gsub("rds$", "png", outrds), width = 6, height = 3, unit = "in", res = 400) 69 | ggplot(fastqs, aes(x = `G|C`, group = interaction(sample, readcat), color = readcat)) + 70 | geom_line(stat = "density", size = 1.5) + 71 | theme_bw() + 72 | scale_color_manual(values = c("red", "blue"), name = "") + 73 | xlab("GC content") + ylab("Density") + 74 | theme(legend.position = "bottom") 75 | dev.off() 76 | 77 | saveRDS(NULL, file = outrds) 78 | date() 79 | sessionInfo() 80 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_illumina_nanopore_lengths_all_datasets.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(tximport) 8 | library(ggplot2) 9 | library(dplyr) 10 | library(ggridges) 11 | library(tibble) 12 | }) 13 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 14 | 15 | datasets <- strsplit(datasets, ",")[[1]] 16 | names(datasets) <- datasets 17 | conditions <- strsplit(conditions, ",")[[1]] 18 | 19 | print(datasets) 20 | print(conditions) 21 | print(tx2gene) 22 | print(outrds) 23 | 24 | tx2gene <- readRDS(tx2gene) 25 | 26 | ## Read estimated abundances Illumina 27 | files <- list.files("Illumina/salmon31", pattern = "quant.sf", 28 | recursive = TRUE, full.names = TRUE) 29 | names(files) <- basename(dirname(files)) 30 | files <- files[names(files) %in% 31 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 32 | salmon <- tximport(files = files, type = "salmon", txOut = TRUE) 33 | 34 | salmon$length <- cbind(salmon$length, 35 | txLength = tx2gene$txlength[match(rownames(salmon$length), tx2gene$tx)]) 36 | stopifnot(all(rownames(salmon$length) == rownames(salmon$abundance))) 37 | 38 | txlengths_illumina <- 39 | do.call(dplyr::bind_rows, lapply(colnames(salmon$abundance), function(nm) { 40 | data.frame(tx_read_length = rep(salmon$length[, "txLength"], 41 | round(salmon$abundance[, nm] * 10)), 42 | sample = remap[nm], 43 | dtype = "Illumina transcript lengths", 44 | dataset = "Illumina", 45 | stringsAsFactors = FALSE) 46 | })) 47 | 48 | ## Nanopore read lengths 49 | readInfo <- do.call(rbind, lapply(datasets, function(ds) { 50 | rd <- readRDS(paste0(ds, "/output/", ds, "_nbr_reads.rds")) 51 | rdf <- rd$fastqs 52 | rdf <- rdf[!grepl("_orig", names(rdf))] 53 | rdf <- rdf[names(rdf) %in% 54 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 55 | do.call(dplyr::bind_rows, lapply(names(rdf), function(nm) { 56 | rdf[[nm]]$reads %>% dplyr::select(read, readLength) %>% 57 | dplyr::mutate(sample = remap[nm], dataset = remapds[ds]) %>% 58 | dplyr::left_join( 59 | rd$genomebams[[nm]]$allAlignments %>% 60 | dplyr::filter(flag %in% c(0, 16)) %>% 61 | dplyr::select(read, nbrSupplementaryAlignments) %>% 62 | dplyr::mutate(alignedGenome = TRUE), 63 | by = "read" 64 | ) %>% 65 | dplyr::select(-read) %>% 66 | dplyr::mutate(alignedGenome = replace(alignedGenome, is.na(alignedGenome), 67 | FALSE)) 68 | })) 69 | })) %>% 70 | dplyr::rename(tx_read_length = readLength) %>% 71 | dplyr::mutate(dtype = "Nanopore read lengths") 72 | 73 | readInfo <- readInfo %>% 74 | dplyr::mutate(category = "Unaligned") %>% 75 | dplyr::mutate(category = replace( 76 | category, alignedGenome & nbrSupplementaryAlignments == 0, 77 | "Aligned, without supplementary alignments")) %>% 78 | dplyr::mutate(category = replace( 79 | category, alignedGenome & nbrSupplementaryAlignments > 0, 80 | "Aligned, with supplementary alignment(s)")) 81 | 82 | png(gsub("\\.rds$", "_nanoporeread_density_byalignment.png", outrds), 83 | width = 7, height = 5, unit = "in", res = 400) 84 | print(ggplot(readInfo %>% 85 | dplyr::mutate(dataset = factor(dataset, levels = 86 | ds_order[ds_order %in% dataset])), 87 | aes(x = tx_read_length, color = category, group = category)) + 88 | geom_line(stat = "density", size = 1.25) + theme_bw() + 89 | scale_color_manual(values = c("Unaligned" = "lightblue", 90 | "Aligned, without supplementary alignments" = "#E8601C", 91 | "Aligned, with supplementary alignment(s)" = "#7BAFDE"), 92 | name = "") + 93 | scale_x_log10() + xlab("Read length") + 94 | theme(legend.position = "bottom") + 95 | facet_wrap(~ dataset)) 96 | dev.off() 97 | 98 | ## Plot distribution of transcript lengths from Illumina (weighted by 99 | ## abundance), overlay distribution of Nanopore read lengths 100 | plotdf <- dplyr::bind_rows(txlengths_illumina, readInfo %>% dplyr::select(-category)) %>% 101 | dplyr::mutate(dataset = factor(dataset, levels = ds_order[ds_order %in% dataset])) 102 | ## Only aligned reads 103 | plotdfaligned <- dplyr::bind_rows(txlengths_illumina, 104 | readInfo %>% dplyr::filter(category != "Unaligned") %>% 105 | dplyr::select(-category)) %>% 106 | dplyr::mutate(dataset = factor(dataset, levels = ds_order[ds_order %in% dataset])) 107 | 108 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_density_byds.png", outrds), 109 | width = 7, height = 5, unit = "in", res = 400) 110 | dbyds <- ggplot(plotdf, 111 | aes(x = tx_read_length, color = dataset, group = dataset)) + 112 | geom_line(stat = "density", size = 1.25) + theme_bw() + 113 | scale_x_log10() + xlab("Transcript/read length") + 114 | scale_color_manual(values = ds_colors, name = "") 115 | print(dbyds) 116 | dev.off() 117 | 118 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_density_byds_onlyaligned.png", outrds), 119 | width = 7, height = 5, unit = "in", res = 400) 120 | dbydsaligned <- ggplot(plotdfaligned, 121 | aes(x = tx_read_length, color = dataset, group = dataset)) + 122 | geom_line(stat = "density", size = 1.25) + theme_bw() + 123 | scale_x_log10() + xlab("Transcript/read length") + 124 | scale_color_manual(values = ds_colors, name = "") 125 | print(dbydsaligned) 126 | dev.off() 127 | 128 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_density_linear_byds.png", outrds), 129 | width = 7, height = 5, unit = "in", res = 400) 130 | dlinearbyds <- ggplot(plotdf, 131 | aes(x = tx_read_length, color = dataset, group = dataset)) + 132 | geom_line(stat = "density", size = 1.25) + theme_bw() + 133 | xlab("Transcript/read length") + xlim(0, 1.5e4) + 134 | scale_color_manual(values = ds_colors, name = "") 135 | print(dlinearbyds) 136 | dev.off() 137 | 138 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_density_linear_byds_onlyaligned.png", outrds), 139 | width = 7, height = 5, unit = "in", res = 400) 140 | dlinearbydsaligned <- ggplot(plotdfaligned, 141 | aes(x = tx_read_length, color = dataset, group = dataset)) + 142 | geom_line(stat = "density", size = 1.25) + theme_bw() + 143 | xlab("Transcript/read length") + xlim(0, 1.5e4) + 144 | scale_color_manual(values = ds_colors, name = "") 145 | print(dlinearbydsaligned) 146 | dev.off() 147 | 148 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_violin_byds.png", outrds), 149 | width = 7, height = 5, unit = "in", res = 400) 150 | dviolinbyds <- ggplot(plotdf, 151 | aes(x = dataset, y = tx_read_length, 152 | fill = dataset)) + 153 | geom_violin(alpha = 0.5) + theme_bw() + 154 | scale_y_log10() + ylab("Transcript/read length") + xlab("") + 155 | scale_fill_manual(values = ds_colors, name = "") + 156 | theme(legend.position = "none", 157 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 158 | print(dviolinbyds) 159 | dev.off() 160 | 161 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_violin_byds_onlyaligned.png", outrds), 162 | width = 7, height = 5, unit = "in", res = 400) 163 | dviolinbydsaligned <- ggplot(plotdfaligned, 164 | aes(x = dataset, y = tx_read_length, 165 | fill = dataset)) + 166 | geom_violin(alpha = 0.5) + theme_bw() + 167 | scale_y_log10() + ylab("Transcript/read length") + xlab("") + 168 | scale_fill_manual(values = ds_colors, name = "") + 169 | theme(legend.position = "none", 170 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 171 | print(dviolinbydsaligned) 172 | dev.off() 173 | 174 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_violin_linear_byds.png", outrds), 175 | width = 7, height = 5, unit = "in", res = 400) 176 | dviolinlinearbyds <- ggplot(plotdf, 177 | aes(x = dataset, y = tx_read_length, 178 | fill = dataset)) + 179 | geom_violin(alpha = 0.5) + theme_bw() + ylim(0, 1.5e4) + 180 | ylab("Transcript/read length") + xlab("") + 181 | scale_fill_manual(values = ds_colors, name = "") + 182 | theme(legend.position = "none", 183 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 184 | print(dviolinlinearbyds) 185 | dev.off() 186 | 187 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_violin_linear_byds_onlyaligned.png", outrds), 188 | width = 7, height = 5, unit = "in", res = 400) 189 | dviolinlinearbydsaligned <- ggplot(plotdfaligned, 190 | aes(x = dataset, y = tx_read_length, 191 | fill = dataset)) + 192 | geom_violin(alpha = 0.5) + theme_bw() + ylim(0, 1.5e4) + 193 | ylab("Transcript/read length") + xlab("") + 194 | scale_fill_manual(values = ds_colors, name = "") + 195 | theme(legend.position = "none", 196 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 197 | print(dviolinlinearbydsaligned) 198 | dev.off() 199 | 200 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_density_bysample.png", outrds), 201 | width = 7, height = 5, unit = "in", res = 400) 202 | ggplot(plotdf, 203 | aes(x = tx_read_length, color = dataset, group = sample)) + 204 | geom_line(stat = "density", size = 1) + theme_bw() + 205 | scale_x_log10() + xlab("Transcript/read length") + 206 | scale_color_manual(values = ds_colors, name = "") 207 | dev.off() 208 | 209 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_ridge_byds.png", outrds), 210 | width = 7, height = 7, unit = "in", res = 400) 211 | ggplot(plotdf, 212 | aes(y = dataset, x = tx_read_length, fill = dataset, 213 | color = dataset)) + 214 | geom_density_ridges(scale = 1.5) + theme_bw() + 215 | scale_x_log10() + xlab("Transcript/read length") + xlab("") + 216 | scale_fill_manual(values = ds_colors, name = "") + 217 | scale_color_manual(values = ds_colors, name = "") 218 | dev.off() 219 | 220 | plots <- list( 221 | illuminatx_nanoporeread_density_byds = dbyds, 222 | illuminatx_nanoporeread_density_linear_byds = dlinearbyds, 223 | illuminatx_nanoporeread_violin_byds = dviolinbyds, 224 | illuminatx_nanoporeread_violin_linear_byds = dviolinlinearbyds, 225 | illuminatx_nanoporeread_density_byds_aligned = dbydsaligned, 226 | illuminatx_nanoporeread_density_linear_byds_aligned = dlinearbydsaligned, 227 | illuminatx_nanoporeread_violin_byds_aligned = dviolinbydsaligned, 228 | illuminatx_nanoporeread_violin_linear_byds_aligned = dviolinlinearbydsaligned 229 | ) 230 | 231 | saveRDS(plots, file = outrds) 232 | date() 233 | sessionInfo() -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_nbr_assigned_reads.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(ggplot2) 9 | library(cowplot) 10 | library(stringr) 11 | }) 12 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 13 | 14 | datasets <- strsplit(datasets, ",")[[1]] 15 | names(datasets) <- datasets 16 | conditions <- strsplit(conditions, ",")[[1]] 17 | 18 | print(datasets) 19 | print(conditions) 20 | print(outrds) 21 | 22 | ## Read info 23 | readInfo <- lapply(datasets, function(ds) { 24 | rtbl <- read.delim(paste0(ds, "/output/", ds, "_nbr_reads.txt"), 25 | header = TRUE, as.is = TRUE) 26 | rtbl[!grepl("_orig", rtbl$sample), ] %>% 27 | dplyr::mutate(dataset = remapds[ds]) %>% 28 | dplyr::mutate(sample = remap[sample]) 29 | }) 30 | 31 | ## Number of reads 32 | nReads <- do.call(dplyr::bind_rows, readInfo) %>% 33 | dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, condition), 34 | by = c("sample" = "sample_remap")) %>% 35 | dplyr::filter(condition %in% conditions) %>% 36 | dplyr::select(-condition) 37 | 38 | ## Abundance info 39 | abundanceInfo <- lapply(datasets, function(ds) { 40 | readRDS(paste0(ds, "/output/", ds, "_all_abundances.rds")) 41 | }) 42 | 43 | ## Keep only the desired abundance measures 44 | abundanceInfo <- lapply(abundanceInfo, function(w) { 45 | w$gene_abundances <- 46 | w$gene_abundances[, grep("count__salmon31|count__salmonminimap2_p0.99|count__wubminimap2|count__featurecountsminimap2primary", 47 | colnames(w$gene_abundances))] 48 | w 49 | }) 50 | 51 | genelevel <- Reduce(function(...) dplyr::full_join(..., by = "gene"), 52 | lapply(abundanceInfo, function(ab) { 53 | ab$gene_abundances %>% tibble::rownames_to_column("gene") 54 | })) %>% 55 | tibble::column_to_rownames("gene") 56 | totCounts <- 57 | data.frame(totCount = colSums(genelevel), method = colnames(genelevel), 58 | stringsAsFactors = FALSE) %>% 59 | tidyr::separate(method, into = c("sample", "type", "method"), sep = "__") %>% 60 | dplyr::mutate(sample = remap[sample]) %>% 61 | dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, condition), 62 | by = c("sample" = "sample_remap")) %>% 63 | dplyr::filter(condition %in% conditions) %>% 64 | dplyr::select(sample, method, totCount) %>% 65 | tidyr::spread(method, totCount) 66 | 67 | colvec <- c(`Total number of reads` = "#777777", 68 | `Number of reads with a primary alignment to the genome` = "#E8601C", 69 | `Number of reads with a primary alignment to the transcriptome` = "#90C987", 70 | `Number of assigned reads, salmonminimap2_p0.99` = "#1965B0", 71 | `Number of assigned reads, fCminimap2primary` = "#882E72", 72 | `Number of assigned reads, salmon31` = "#DC050C", 73 | `Number of assigned reads, wubminimap2` = "#55A1B1") 74 | 75 | nReadsData <- nReads %>% 76 | dplyr::select(sample, dataset, nReads, nAlignedReadsGenome, 77 | nAlignedReadsTxome) %>% 78 | dplyr::full_join(totCounts, by = "sample") %>% 79 | tidyr::gather(rtype, nReads, -sample, -dataset) %>% 80 | dplyr::mutate( 81 | rtype = replace(rtype, rtype == "nReads", 82 | "Total number of reads"), 83 | rtype = replace(rtype, rtype == "nAlignedReadsGenome", 84 | "Number of reads with a primary alignment to the genome"), 85 | rtype = replace(rtype, rtype == "nAlignedReadsTxome", 86 | "Number of reads with a primary alignment to the transcriptome"), 87 | rtype = replace(rtype, rtype == "featurecountsminimap2primary", 88 | "Number of assigned reads, fCminimap2primary"), 89 | rtype = replace(rtype, rtype == "salmon31", 90 | "Number of assigned reads, salmon31"), 91 | rtype = replace(rtype, rtype == "salmonminimap2_p0.99", 92 | "Number of assigned reads, salmonminimap2_p0.99"), 93 | rtype = replace(rtype, rtype == "wubminimap2", 94 | "Number of assigned reads, wubminimap2") 95 | ) %>% 96 | dplyr::mutate( 97 | rtype = factor(rtype, levels = names(colvec)), 98 | dataset = factor(dataset, levels = ds_order[ds_order %in% dataset]) 99 | ) 100 | 101 | png(gsub("\\.rds$", ".png", outrds), width = 16, height = 6, unit = "in", res = 400) 102 | print(ggplot(nReadsData %>% 103 | dplyr::mutate(nReads = nReads/1e6) %>% 104 | dplyr::mutate(sample = removeDatasetFromSample(sample, dataset)), 105 | aes(x = sample, y = nReads, fill = rtype)) + 106 | geom_bar(stat = "identity", position = "dodge") + theme_bw() + 107 | scale_y_continuous(expand = c(0, 0, 0.05, 0), 108 | limits = c(0, max(nReadsData$nReads)/1e6)) + 109 | facet_grid(~ dataset, scales = "free_x", space = "free_x") + 110 | scale_fill_manual(values = colvec, name = "") + 111 | theme(legend.position = "bottom", 112 | strip.text = element_text(size = 8)) + 113 | guides(fill = guide_legend(nrow = 3, byrow = TRUE)) + 114 | xlab("") + ylab("Number of reads (Mio.)") + 115 | stat_summary(data = nReadsData %>% 116 | dplyr::mutate(nReads = nReads/1e6) %>% 117 | dplyr::mutate(sample = removeDatasetFromSample(sample, dataset)) %>% 118 | dplyr::group_by(sample, dataset) %>% 119 | dplyr::mutate( 120 | nReads = round(nReads/nReads[rtype == "Total number of reads"]*100)), 121 | fun.data = function(x) { 122 | return(c(y = ifelse(x == 100, -100000, 1e-4), label = x))}, 123 | geom = "text", alpha = 1, size = 2, vjust = -1, 124 | position = position_dodge(width = 0.9))) + 125 | ylim(0, NA) 126 | 127 | dev.off() 128 | 129 | saveRDS(NULL, file = outrds) 130 | date() 131 | sessionInfo() 132 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_nbr_assigned_reads_biotype.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(ggplot2) 9 | library(cowplot) 10 | library(stringr) 11 | }) 12 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 13 | 14 | datasets <- strsplit(datasets, ",")[[1]] 15 | datasets <- c(datasets, "Illumina") 16 | names(datasets) <- datasets 17 | conditions <- strsplit(conditions, ",")[[1]] 18 | 19 | muted <- c("#DC050C","#E8601C","#7BAFDE","#1965B0","#B17BA6", 20 | "#882E72","#F1932D","#F6C141","#F7EE55","#4EB265", 21 | "#CAEDAB","#777777") 22 | colfun <- grDevices::colorRampPalette(muted) 23 | 24 | print(tx2gene) 25 | print(datasets) 26 | print(conditions) 27 | print(outrds) 28 | 29 | tx2gene <- readRDS(tx2gene) 30 | 31 | ## Abundance info 32 | abundanceInfo <- lapply(datasets, function(ds) { 33 | readRDS(paste0(ds, "/output/", ds, "_all_abundances.rds")) 34 | }) 35 | 36 | ## Keep only the desired abundance measures 37 | abundanceInfo <- lapply(abundanceInfo, function(w) { 38 | w$gene_abundances <- 39 | w$gene_abundances[, grep("count__salmon31|count__salmonminimap2_p0.99|count__wubminimap2|count__featurecountsminimap2primary|tpm__salmon$|tpm__StringTie", 40 | colnames(w$gene_abundances))] 41 | w 42 | }) 43 | 44 | genelevel <- Reduce(function(...) dplyr::full_join(..., by = "gene"), 45 | lapply(abundanceInfo, function(ab) { 46 | ab$gene_abundances %>% tibble::rownames_to_column("gene") 47 | })) %>% 48 | dplyr::left_join(tx2gene %>% dplyr::select(gene, gene_biotype) %>% 49 | dplyr::mutate(gene = gsub("\\.[0-9]+$", "", gene)) %>% 50 | dplyr::distinct(), by = "gene") 51 | 52 | 53 | tmp <- genelevel %>% dplyr::select(-gene) %>% dplyr::group_by(gene_biotype) %>% 54 | dplyr::summarize_at(vars(dplyr::matches("__count__|__tpm__")), sum) %>% 55 | dplyr::mutate_at(vars(dplyr::matches("__count__|__tpm__")), funs(./sum(.))) %>% 56 | tidyr::gather(method, proportion, -gene_biotype) %>% 57 | tidyr::separate(method, into = c("sample", "aType", "method"), sep = "__") %>% 58 | dplyr::mutate(method = gsub("featurecounts", "fC", method)) %>% 59 | dplyr::mutate(sample = remap[sample]) %>% 60 | dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, dataset, condition), 61 | by = c("sample" = "sample_remap")) %>% 62 | dplyr::filter(condition %in% conditions) %>% 63 | dplyr::group_by(gene_biotype) %>% 64 | dplyr::mutate(maxFrac = max(proportion)) %>% 65 | dplyr::ungroup() %>% 66 | dplyr::mutate(dataset2 = factor(dataset, levels = 67 | ds_order[ds_order %in% dataset])) 68 | 69 | methodcols <- c(salmon = "#882E72", StringTie = "#882E72", 70 | salmonminimap2_p0.99 = "#4EB265", 71 | fCminimap2primary = "#4EB265", salmon31 = "#4EB265", 72 | wubminimap2 = "#4EB265") 73 | 74 | png(gsub("\\.rds$", "_biotype_prop_average.png", outrds), width = 10, 75 | height = 15, unit = "in", res = 400) 76 | p1 <- ggplot(tmp %>% 77 | dplyr::mutate(dataset = replace(dataset, dataset != "Illumina", 78 | "Nanopore")) %>% 79 | dplyr::group_by(dataset, gene_biotype, method) %>% 80 | dplyr::summarize(proportion = mean(proportion), 81 | maxFrac = max(maxFrac)) %>% 82 | dplyr::filter(maxFrac >= 0.001) %>% 83 | dplyr::mutate(method = factor( 84 | method, levels = c("fCminimap2primary", 85 | "salmonminimap2_p0.99", "salmon31", 86 | "wubminimap2", "salmon", 87 | "StringTie"))), 88 | aes(x = method, y = proportion, color = method, fill = method)) + 89 | geom_bar(stat = "identity") + 90 | theme_bw() + facet_wrap(~ gene_biotype, scales = "free_y") + 91 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 92 | legend.position = "none", 93 | strip.text = element_text(size = 7)) + 94 | xlab("") + ylab("Average abundance proportion") + 95 | scale_color_manual(values = methodcols, name = "") + 96 | scale_fill_manual(values = methodcols, name = "") 97 | 98 | g0 <- 99 | ggplot(tmp %>% dplyr::filter(dataset != "Illumina") %>% 100 | dplyr::mutate( 101 | gene_biotype = replace( 102 | gene_biotype, 103 | !(gene_biotype %in% c("lincRNA", "Mt_rRNA", 104 | "processed_pseudogene", 105 | "protein_coding", 106 | "antisense_RNA", 107 | "Mt_tRNA", 108 | "transcribed_processed_pseudogene")), 109 | "other")) %>% 110 | dplyr::arrange(dataset2, sample) %>% 111 | dplyr::mutate(sample = factor(sample, levels = unique(sample))), 112 | aes(x = method, y = proportion, group = gene_biotype, fill = gene_biotype)) + 113 | geom_bar(stat = "identity", position = "fill") + 114 | facet_wrap(~ sample, ncol = 4) + theme_bw() + 115 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), 116 | legend.position = "bottom") + 117 | xlab("") + ylab("Abundance proportion") + 118 | scale_fill_manual(values = colfun(8), name = "") 119 | 120 | g1 <- 121 | ggplot(tmp %>% dplyr::filter(dataset == "Illumina") %>% 122 | dplyr::mutate( 123 | gene_biotype = replace( 124 | gene_biotype, 125 | !(gene_biotype %in% c("lincRNA", "Mt_rRNA", 126 | "processed_pseudogene", 127 | "protein_coding", 128 | "antisense_RNA", 129 | "Mt_tRNA", 130 | "transcribed_processed_pseudogene")), 131 | "other")), 132 | aes(x = method, y = proportion, group = gene_biotype, fill = gene_biotype)) + 133 | geom_bar(stat = "identity", position = "fill") + 134 | facet_wrap(~ sample, ncol = 4) + theme_bw() + 135 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), 136 | legend.position = "bottom") + 137 | xlab("") + ylab("Abundance proportion") + 138 | scale_fill_manual(values = colfun(8), name = "") 139 | 140 | cowplot::plot_grid( 141 | p1, 142 | cowplot::plot_grid(g0 + theme(legend.position = "none"), g1, ncol = 1, 143 | rel_heights = c(1, 0.45)), 144 | ncol = 1, labels = c("A", "B"), rel_heights = c(0.8, 1)) 145 | dev.off() 146 | 147 | saveRDS(NULL, file = outrds) 148 | date() 149 | sessionInfo() 150 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_nbr_junctions_per_read_illumina.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(GenomicAlignments) 8 | library(dplyr) 9 | library(ggplot2) 10 | library(cowplot) 11 | library(stringr) 12 | }) 13 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 14 | 15 | conditions <- strsplit(conditions, ",")[[1]] 16 | 17 | print(conditions) 18 | print(genomebamdir) 19 | print(outrds) 20 | 21 | readBam <- function(bamfile) { 22 | bf <- BamFile(bamfile, yieldSize = 5e6) 23 | bam <- readGAlignments(bf, use.names = TRUE, 24 | param = ScanBamParam(tag = c("NM"), 25 | what = c("qname", "flag", "rname", 26 | "pos", "mapq"))) 27 | ops <- GenomicAlignments::CIGAR_OPS 28 | wdths <- GenomicAlignments::explodeCigarOpLengths(cigar(bam), ops = ops) 29 | keep.ops <- GenomicAlignments::explodeCigarOps(cigar(bam), ops = ops) 30 | explodedcigars <- IRanges::CharacterList(relist(paste0(unlist(wdths), 31 | unlist(keep.ops)), wdths)) 32 | for (opts in setdiff(GenomicAlignments::CIGAR_OPS, "=")) { 33 | mcols(bam)[[paste0("nbr", opts)]] <- 34 | sapply(explodedcigars, function(cg) sum(as.numeric(gsub(paste0(opts, "$"), "", cg)), na.rm = TRUE)) 35 | } 36 | mcols(bam)$readLength <- rowSums(as.matrix(mcols(bam)[, c("nbrS", "nbrH", "nbrM", "nbrI")])) 37 | bam 38 | } 39 | 40 | makeReadDf <- function(bam) { 41 | tmp <- data.frame(bam %>% setNames(NULL), stringsAsFactors = FALSE) %>% 42 | dplyr::rename(read = qname, 43 | nbrJunctions = njunc) %>% 44 | dplyr::select(-cigar) %>% 45 | dplyr::filter(flag %in% c(83, 99, 339, 355)) %>% ## mapped in proper pair, first in pair 46 | dplyr::mutate(alignedLength = nbrM + nbrI) ## equivalent to readLength-nbrS-nbrH 47 | 48 | tmp2 <- as.data.frame(table(names(subset(bam, flag %in% c(83, 99))))) 49 | if (nrow(tmp2) == 0) tmp2 <- data.frame(Var1 = tmp$read[1], Freq = 0) 50 | tmp <- tmp %>% 51 | dplyr::left_join(tmp2 %>% dplyr::rename(read = Var1, nbrPrimaryAlignments = Freq)) 52 | 53 | tmp3 <- as.data.frame(table(names(subset(bam, flag %in% c(339, 355))))) 54 | if (nrow(tmp3) == 0) tmp3 <- data.frame(Var1 = tmp$read[1], Freq = 0) 55 | tmp <- tmp %>% 56 | dplyr::left_join(tmp3 %>% dplyr::rename(read = Var1, nbrSecondaryAlignments = Freq)) 57 | 58 | tmp %>% dplyr::mutate(nbrSecondaryAlignments = replace(nbrSecondaryAlignments, 59 | is.na(nbrSecondaryAlignments), 0)) 60 | } 61 | 62 | bamfiles <- list.files(genomebamdir, pattern = "_Aligned.sortedByCoord.out.bam$", 63 | recursive = TRUE, full.names = TRUE) 64 | names(bamfiles) <- gsub("_Aligned.sortedByCoord.out.bam", "", basename(bamfiles)) 65 | bamfiles <- bamfiles[names(bamfiles) %in% 66 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 67 | bamfiles 68 | genomebams <- lapply(bamfiles, function(f) { 69 | bam <- readBam(f) 70 | tmp <- makeReadDf(bam) 71 | list(allAlignments = tmp) 72 | }) 73 | for (n in names(genomebams)) { 74 | genomebams[[n]][["allAlignments"]]$sample <- n 75 | } 76 | 77 | genomebams <- do.call(dplyr::bind_rows, lapply(names(genomebams), function(nm) { 78 | genomebams[[nm]]$allAlignments %>% 79 | dplyr::mutate(sample = remap[nm], dataset = "Illumina", 80 | rtype = "Reads aligning to the genome") %>% 81 | dplyr::mutate(fracM = nbrM/readLength, 82 | fracS = nbrS/readLength, 83 | fracI = nbrI/readLength) 84 | })) 85 | 86 | djunc <- genomebams %>% dplyr::filter(flag %in% c(83, 99)) %>% 87 | dplyr::mutate(nbrJunctions = replace(nbrJunctions, nbrJunctions > 20, ">20")) %>% 88 | dplyr::mutate(nbrJunctions = factor(nbrJunctions, 89 | levels = c(as.character(0:20), ">20"))) %>% 90 | dplyr::group_by(sample, dataset, nbrJunctions) %>% 91 | dplyr::tally() %>% 92 | dplyr::group_by(sample) %>% 93 | dplyr::mutate(n = n/sum(n)) 94 | 95 | png(gsub("\\.rds$", "_njunc_distribution_bysample.png", outrds), 96 | width = 12, height = 8.5, unit = "in", res = 400) 97 | print(ggplot(djunc, 98 | aes(x = nbrJunctions, y = n, fill = dataset, color = dataset)) + 99 | geom_bar(stat = "identity", position = "dodge") + 100 | theme_bw() + facet_wrap(~ sample) + 101 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), 102 | legend.position = "none") + 103 | scale_color_manual(values = ds_colors, name = "") + 104 | scale_fill_manual(values = ds_colors, name = "") + 105 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 106 | xlab("Number of spanned junctions") + ylab("Fraction of reads")) 107 | dev.off() 108 | 109 | png(gsub("\\.rds$", "_njunc_distribution.png", outrds), 110 | width = 8, height = 6, unit = "in", res = 400) 111 | print(ggplot(djunc, aes(x = nbrJunctions, y = n), fill = "#B3B3B3", color = "#B3B3B3") + 112 | geom_bar(stat = "identity", position = "dodge") + 113 | theme_bw() + 114 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), 115 | legend.position = "none") + 116 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 117 | xlab("Number of spanned junctions") + ylab("Fraction of reads")) 118 | dev.off() 119 | 120 | saveRDS(djunc, file = outrds) 121 | date() 122 | sessionInfo() 123 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_nbr_reads.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(plyr) 8 | library(dplyr) 9 | library(ggplot2) 10 | library(cowplot) 11 | library(stringr) 12 | }) 13 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 14 | 15 | datasets <- strsplit(datasets, ",")[[1]] 16 | names(datasets) <- datasets 17 | conditions <- strsplit(conditions, ",")[[1]] 18 | 19 | print(datasets) 20 | print(conditions) 21 | print(outrds) 22 | 23 | ## ========================================================================== ## 24 | ## Total number of reads 25 | ## ========================================================================== ## 26 | ## Read information about the number of reads 27 | reads <- lapply(datasets, function(ds) { 28 | read.delim(paste0(ds, "/output/", ds, "_nbr_reads.txt"), 29 | header = TRUE, as.is = TRUE) %>% 30 | dplyr::filter(!grepl("_orig", sample)) %>% 31 | dplyr::mutate(sample = remap[sample], 32 | dataset = remapds[ds]) %>% 33 | dplyr::select(sample, dataset, nReads) 34 | }) 35 | 36 | ## Put everything together 37 | reads <- do.call(dplyr::bind_rows, reads) %>% 38 | dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, condition), 39 | by = c("sample" = "sample_remap")) %>% 40 | dplyr::filter(condition %in% conditions) %>% 41 | dplyr::mutate( 42 | dataset = factor(dataset, levels = ds_order[ds_order %in% dataset]) 43 | ) 44 | 45 | ## Write total number of reads to text file 46 | write.table(reads, file = gsub("\\.rds$", "_nbrreads.txt", outrds), 47 | quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t") 48 | 49 | ## Plot total number of reads 50 | ptot <- ggplot( 51 | reads %>% dplyr::mutate(sample = removeDatasetFromSample(sample, dataset)), 52 | aes(x = sample, y = nReads/1e6, fill = dataset)) + 53 | geom_bar(stat = "identity", position = "dodge") + 54 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 55 | facet_grid(~ dataset, scales = "free_x", space = "free_x") + theme_bw() + 56 | scale_fill_manual(values = ds_colors, name = "") + 57 | theme(legend.position = "none", 58 | strip.text = element_text(size = 8)) + 59 | xlab("") + ylab("Total number of reads (Mio.)") 60 | 61 | ## ========================================================================== ## 62 | ## Read length 63 | ## ========================================================================== ## 64 | ## Read information about read lengths 65 | readInfo <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 66 | rd <- readRDS(paste0(ds, "/output/", ds, "_nbr_reads.rds")) 67 | rdf <- rd$fastqs 68 | rdf <- rdf[!grepl("_orig", names(rdf))] 69 | rdf <- rdf[names(rdf) %in% 70 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 71 | do.call(dplyr::bind_rows, lapply(names(rdf), function(nm) { 72 | rdf[[nm]]$reads %>% dplyr::select(readLength, aveBaseQuality) %>% 73 | dplyr::mutate(sample = remap[nm], dataset = remapds[ds]) 74 | })) 75 | })) %>% 76 | dplyr::mutate( 77 | dataset = factor(dataset, levels = ds_order[ds_order %in% dataset]) 78 | ) 79 | 80 | ## Write summary information about read lengths to text file 81 | write.table( 82 | readInfo %>% dplyr::group_by(sample) %>% 83 | dplyr::summarize(mean_readlength = mean(readLength), 84 | median_readlength = median(readLength), 85 | min_readlength = min(readLength), 86 | max_readlength = max(readLength), 87 | nbr_reads_longer_than_15000 = length(which(readLength > 1.5e4))), 88 | file = gsub("\\.rds$", "_readlengths.txt", outrds), 89 | quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t" 90 | ) 91 | 92 | ## -------------------------------------------------------------------------- ## 93 | ## Plot read lengths 94 | plength <- ggplot(readInfo %>% 95 | dplyr::mutate(replicate = sapply(strsplit(sample, "_"), 96 | .subset, 3)), 97 | aes(x = readLength, group = sample, 98 | color = dataset, alpha = as.factor(replicate))) + 99 | geom_line(stat = "density", size = 1) + scale_x_log10() + 100 | facet_wrap(~ dataset, nrow = 2, scales = "free_y") + theme_bw() + 101 | xlab("Read length") + ylab("Density") + 102 | scale_color_manual(values = ds_colors, name = "") + 103 | scale_alpha_manual(values = c(1, 0.4, 0.76, 0.52, 0.88, 0.64)) + 104 | theme(legend.position = "none") 105 | 106 | ## Plot read lengths, sqrt-transformed 107 | plengthsqrt <- ggplot(readInfo %>% 108 | dplyr::mutate(replicate = sapply(strsplit(sample, "_"), 109 | .subset, 3)), 110 | aes(x = readLength, group = sample, 111 | color = dataset, alpha = as.factor(replicate))) + 112 | geom_line(stat = "density", size = 1) + scale_x_sqrt(limits = c(0, 1.5e4)) + 113 | facet_wrap(~ dataset, nrow = 2, scales = "free_y") + theme_bw() + 114 | xlab("Read length") + ylab("Density") + 115 | scale_color_manual(values = ds_colors, name = "") + 116 | scale_alpha_manual(values = c(1, 0.4, 0.76, 0.52, 0.88, 0.64)) + 117 | theme(legend.position = "none") 118 | 119 | ## Same plot but with x-axis on linear scale, and cut at 1.5e4 120 | plengthlin <- ggplot(readInfo %>% 121 | dplyr::mutate(replicate = sapply(strsplit(sample, "_"), 122 | .subset, 3)), 123 | aes(x = readLength, group = sample, 124 | color = dataset, alpha = as.factor(replicate))) + 125 | geom_line(stat = "density", size = 1) + 126 | xlim(0, 1.5e4) + 127 | facet_wrap(~ dataset, nrow = 2, scales = "free_y") + theme_bw() + 128 | xlab("Read length") + ylab("Density") + 129 | scale_color_manual(values = ds_colors, name = "") + 130 | scale_alpha_manual(values = c(1, 0.4, 0.76, 0.52, 0.88, 0.64)) + 131 | theme(legend.position = "none") 132 | 133 | ## -------------------------------------------------------------------------- ## 134 | ## Plot mean quality distribution 135 | pqual <- ggplot(readInfo %>% 136 | dplyr::mutate(replicate = sapply(strsplit(sample, "_"), 137 | .subset, 3)), 138 | aes(x = aveBaseQuality, group = sample, 139 | color = dataset, alpha = as.factor(replicate))) + 140 | geom_line(stat = "density", size = 1) + 141 | facet_wrap(~ dataset, nrow = 2, scales = "free_y") + theme_bw() + 142 | xlab("Average base quality per read") + ylab("Density") + 143 | scale_color_manual(values = ds_colors, name = "") + 144 | scale_alpha_manual(values = c(1, 0.4, 0.76, 0.52, 0.88, 0.64)) + 145 | theme(legend.position = "none") 146 | 147 | ## -------------------------------------------------------------------------- ## 148 | ## Plot read length vs mean quality, for each sample 149 | plengthqual <- 150 | ggplot(readInfo %>% dplyr::arrange(dataset, sample) %>% 151 | dplyr::mutate(sample = factor(sample, levels = unique(sample))), 152 | aes(x = readLength, y = aveBaseQuality)) + 153 | geom_hex(bins = 100, aes(fill = stat(density))) + scale_x_log10() + 154 | scale_fill_gradient(name = "", low = "bisque2", high = "darkblue") + 155 | facet_wrap(~ sample) + theme_bw() + 156 | xlab("Read length") + ylab("Average base quality") 157 | 158 | ## Same plot but with x-axis on linear scale, and cut at 1.5e4 159 | plengthquallin <- 160 | ggplot(readInfo %>% dplyr::arrange(dataset, sample) %>% 161 | dplyr::mutate(sample = factor(sample, levels = unique(sample))), 162 | aes(x = readLength, y = aveBaseQuality)) + 163 | geom_hex(bins = 100, aes(fill = stat(density))) + 164 | xlim(0, 1.5e4) + 165 | scale_fill_gradient(name = "", low = "bisque2", high = "darkblue") + 166 | facet_wrap(~ sample) + theme_bw() + 167 | xlab("Read length") + ylab("Average base quality") 168 | 169 | ## Plot read length vs mean quality, for each sample, sqrt-transformed 170 | plengthqualsqrt <- 171 | ggplot(readInfo %>% dplyr::arrange(dataset, sample) %>% 172 | dplyr::mutate(sample = factor(sample, levels = unique(sample))), 173 | aes(x = readLength, y = aveBaseQuality)) + 174 | geom_hex(bins = 100, aes(fill = stat(density))) + scale_x_sqrt(limits = c(0, 1.5e4)) + 175 | scale_fill_gradient(name = "", low = "bisque2", high = "darkblue") + 176 | facet_wrap(~ sample) + theme_bw() + 177 | xlab("Read length") + ylab("Average base quality") 178 | 179 | ## -------------------------------------------------------------------------- ## 180 | ## Make final plots 181 | png(gsub("\\.rds$", "_nbrreads.png", outrds), width = 12, 182 | height = 8, unit = "in", res = 400) 183 | print(cowplot::plot_grid( 184 | ptot, 185 | cowplot::plot_grid(plength, pqual, nrow = 1, 186 | rel_widths = c(1, 1), labels = c("B", "C")), 187 | ncol = 1, rel_heights = c(1, 1), labels = c("A", "")) 188 | ) 189 | dev.off() 190 | 191 | png(gsub("\\.rds$", "_nbrreads_sqrt.png", outrds), width = 12, 192 | height = 8, unit = "in", res = 400) 193 | print(cowplot::plot_grid( 194 | ptot, 195 | cowplot::plot_grid(plengthsqrt, pqual, nrow = 1, 196 | rel_widths = c(1, 1), labels = c("B", "C")), 197 | ncol = 1, rel_heights = c(1, 1), labels = c("A", "")) 198 | ) 199 | dev.off() 200 | 201 | png(gsub("\\.rds$", "_nbrreads_linear.png", outrds), width = 12, 202 | height = 8, unit = "in", res = 400) 203 | print(cowplot::plot_grid( 204 | ptot, 205 | cowplot::plot_grid(plengthlin, pqual, nrow = 1, 206 | rel_widths = c(1, 1), labels = c("B", "C")), 207 | ncol = 1, rel_heights = c(1, 1), labels = c("A", "")) 208 | ) 209 | dev.off() 210 | 211 | png(gsub("\\.rds$", "_readlength_vs_quality.png", outrds), width = 11, 212 | height = 11, unit = "in", res = 400) 213 | print(plengthqual) 214 | dev.off() 215 | 216 | png(gsub("\\.rds$", "_readlength_vs_quality_sqrt.png", outrds), width = 11, 217 | height = 11, unit = "in", res = 400) 218 | print(plengthqualsqrt) 219 | dev.off() 220 | 221 | png(gsub("\\.rds$", "_readlength_vs_quality_linear.png", outrds), width = 11, 222 | height = 11, unit = "in", res = 400) 223 | print(plengthquallin) 224 | dev.off() 225 | 226 | ## -------------------------------------------------------------------------- ## 227 | saveRDS(NULL, file = outrds) 228 | date() 229 | sessionInfo() 230 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_nbr_tx_in_eqclass.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(ggplot2) 9 | library(stringr) 10 | }) 11 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 12 | 13 | datasets <- strsplit(datasets, ",")[[1]] 14 | datasets <- c(datasets, "Illumina") 15 | conditions <- strsplit(conditions, ",")[[1]] 16 | 17 | print(datasets) 18 | print(conditions) 19 | print(outrds) 20 | 21 | salmonfiles <- unlist(lapply(datasets, function(ds) { 22 | lf <- list.files(file.path(ds, "salmon31"), pattern = "eq_classes.txt", 23 | full.names = TRUE, recursive = TRUE) 24 | names(lf) <- remap[basename(gsub("/aux_info/eq_classes.txt$", "", lf))] 25 | lf 26 | })) 27 | salmonfiles <- salmonfiles[names(salmonfiles) %in% 28 | sample_annotation$sample_remap[sample_annotation$condition %in% conditions]] 29 | salmonfiles 30 | 31 | eqcl <- lapply(salmonfiles, function(f) { 32 | x <- readLines(f) 33 | n_tr <- as.numeric(x[1]) ## Total number of transcripts 34 | n_eq <- as.numeric(x[2]) ## Total number of equivalence classes 35 | tx_id <- x[3:(n_tr + 2)] ## Transcript IDs 36 | quants <- x[(n_tr + 3):length(x)] ## Characteristics of equivalence classes 37 | 38 | ## Split equivalence class characteristics. Each element of the list corresponds 39 | ## to one equivalence class, and lists its number of transcripts, the 40 | ## transcripts IDs and the total number of reads 41 | do.call(dplyr::bind_rows, lapply(quants, function(w) { 42 | tmp = strsplit(w, "\\\t")[[1]] 43 | nbr_tx = as.numeric(tmp[1]) 44 | data.frame(nbr_tx = rep(nbr_tx, as.numeric(tmp[length(tmp)])), 45 | stringsAsFactors = FALSE) 46 | })) 47 | }) 48 | 49 | for (nm in names(eqcl)) { 50 | eqcl[[nm]]$sample <- nm 51 | eqcl[[nm]]$dataset <- strsplit(nm, "_")[[1]][1] 52 | } 53 | 54 | eqcl <- do.call(dplyr::bind_rows, eqcl) %>% 55 | dplyr::mutate(dataset = factor(dataset, levels = ds_order[ds_order %in% dataset])) 56 | 57 | png(gsub("\\.rds$", ".png", outrds), height = 12, width = 16, 58 | unit = "in", res = 400) 59 | p1 <- ggplot(eqcl %>% 60 | dplyr::mutate(sample = removeDatasetFromSample(sample, dataset)), 61 | aes(x = sample, y = nbr_tx, fill = dataset)) + 62 | geom_boxplot() + theme_bw() + xlab("") + 63 | ylab("Number of transcripts in equivalence class") + 64 | theme(legend.position = "none") + 65 | facet_grid(~ dataset, scales = "free_x", space = "free_x") + 66 | scale_fill_manual(values = ds_colors, name = "") 67 | p2 <- p1 + 68 | stat_summary(fun.y = mean, geom = "point", shape = 18, 69 | size = 4, color = "black", fill = "black") + 70 | coord_cartesian(ylim = c(0, 15)) 71 | cowplot::plot_grid(p1 + ggtitle("Full range"), 72 | p2 + ggtitle("Zoomed in"), 73 | ncol = 1, labels = c("A", "B"), rel_heights = c(1, 1)) 74 | dev.off() 75 | 76 | ## By dataset 77 | p1ds <- ggplot(eqcl, 78 | aes(x = dataset, y = nbr_tx, fill = dataset)) + 79 | geom_boxplot() + theme_bw() + xlab("") + 80 | ylab("Number of transcripts in equivalence class") + 81 | theme(legend.position = "none", 82 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 83 | scale_fill_manual(values = ds_colors, name = "") 84 | p2ds <- p1ds + 85 | stat_summary(fun.y = mean, geom = "point", shape = 18, 86 | size = 4, color = "black", fill = "black") + 87 | coord_cartesian(ylim = c(0, 15)) 88 | 89 | 90 | saveRDS(list(ptxeqfull = p1 + ggtitle("Full range"), 91 | ptxeqzoom = p2 + ggtitle("Zoomed in"), 92 | ptxeqfullds = p1ds + ggtitle("Full range"), 93 | ptxeqzoomds = p2ds + ggtitle("Zoomed in")), file = outrds) 94 | date() 95 | sessionInfo() -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_palindromes.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(ggplot2) 9 | library(cowplot) 10 | library(stringr) 11 | library(Biostrings) 12 | library(data.table) 13 | library(parallel) 14 | library(BiocParallel) 15 | }) 16 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 17 | 18 | datasets <- strsplit(datasets, ",")[[1]] 19 | names(datasets) <- datasets 20 | conditions <- strsplit(conditions, ",")[[1]] 21 | 22 | print(datasets) 23 | print(conditions) 24 | print(nreads) 25 | print(ncores) 26 | print(outrds) 27 | 28 | ## Information about the reads 29 | readInfo <- lapply(datasets, function(ds) { 30 | rd <- readRDS(paste0(ds, "/output/", ds, "_nbr_reads.rds")) 31 | 32 | rdf <- rd$fastqs 33 | rdf <- rdf[!grepl("_orig", names(rdf))] 34 | rdf <- rdf[names(rdf) %in% 35 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 36 | rdf <- do.call(dplyr::bind_rows, lapply(names(rdf), function(nm) { 37 | rdf[[nm]]$reads %>% 38 | dplyr::mutate(sample = remap[nm], dataset = remapds[ds], rtype = "All reads") 39 | })) 40 | 41 | rdg <- rd$genomebams 42 | rdg <- rdg[!grepl("_orig", names(rdg))] 43 | rdg <- rdg[names(rdg) %in% 44 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 45 | rdg <- do.call(dplyr::bind_rows, lapply(names(rdg), function(nm) { 46 | rdg[[nm]]$allAlignments %>% 47 | dplyr::mutate(sample = remap[nm], dataset = remapds[ds], 48 | rtype = "Reads aligning to the genome") %>% 49 | dplyr::mutate(fracM = nbrM/readLength, 50 | fracS = nbrS/readLength, 51 | fracI = nbrI/readLength, 52 | accuracy = (nbrM - NM + nbrI + nbrD)/(nbrM + nbrI + nbrD)) 53 | })) 54 | 55 | list(rdf = rdf, rdg = rdg) 56 | }) 57 | 58 | ## All reads 59 | allReads <- do.call(dplyr::bind_rows, lapply(readInfo, function(x) x$rdf)) 60 | 61 | ## Genome alignments 62 | gAlign <- do.call(dplyr::bind_rows, lapply(readInfo, function(x) x$rdg)) 63 | 64 | ## Add information about genome alignments to the data frame with all reads 65 | allReads <- allReads %>% 66 | dplyr::left_join(gAlign %>% dplyr::filter(flag %in% c(0, 16)) %>% 67 | dplyr::select(read, sample, dataset, NM, nbrM, nbrS, 68 | nbrD, nbrI, alignedLength, 69 | nbrSecondaryAlignments, 70 | nbrSupplementaryAlignments, fracM, fracS), 71 | by = c("sample", "dataset", "read")) %>% 72 | dplyr::mutate(aligned = ifelse(is.na(alignedLength), "Not aligned", 73 | "Aligned")) 74 | 75 | ## Get palindrome information for a subset of reads for each sample 76 | fastqs <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 77 | fastqdir <- paste0(ds, "/FASTQ", 78 | ifelse(ds %in% c("RNA001", "HEK293RNA"), "dna", "")) 79 | fastqfiles <- list.files(fastqdir, pattern = "(FASTQ|fastq)\\.gz$", 80 | full.names = TRUE) 81 | names(fastqfiles) <- gsub("\\.(FASTQ|fastq).gz", "", basename(fastqfiles)) 82 | fastqfiles <- fastqfiles[!grepl("_orig", names(fastqfiles))] 83 | fastqfiles <- fastqfiles[names(fastqfiles) %in% 84 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 85 | fastqfiles 86 | 87 | do.call(dplyr::bind_rows, lapply(names(fastqfiles), function(nm) { 88 | f <- fastqfiles[nm] 89 | fastq <- fread(paste0("zcat ", f), sep = "\n", header = FALSE)$V1 90 | seq_idxs <- seq(2, length(fastq), by = 4) 91 | reads <- Biostrings::DNAStringSet(x = fastq[seq_idxs]) 92 | names(reads) <- sapply(strsplit(fastq[seq_idxs - 1], " "), .subset, 1) 93 | print(nm) 94 | print(table(width(reads) < 30000)) 95 | reads <- reads[width(reads) < 30000] 96 | set.seed(1) 97 | reads <- reads[sample(seq_along(reads), nreads, replace = FALSE)] 98 | readlengths <- width(reads) 99 | readnames <- names(reads) 100 | do.call(dplyr::bind_rows, bplapply(seq_along(reads), function(i) { 101 | fp <- Biostrings::findPalindromes(reads[[i]], max.looplength = readlengths[i], 102 | min.armlength = 10) 103 | armlength <- Biostrings::palindromeArmLength(fp) 104 | totlength <- width(fp) 105 | w <- which.max(armlength) 106 | 107 | ## Get the total length of the read that is covered by palindromic sequence 108 | dnatmp <- reduce(IRanges(start = start(fp), 109 | width = armlength)) 110 | totalPalindromeLength <- 2 * sum(width(dnatmp)) 111 | tryCatch({ 112 | data.frame( 113 | read = gsub("^@", "", readnames[i]), 114 | readLength = readlengths[i], 115 | maxArmLengthPalindrome = armlength[w], 116 | totalPalindromeLength = totlength[w], 117 | longestPalindromeArm = Biostrings::palindromeLeftArm(fp[w]), 118 | stringsAsFactors = FALSE 119 | )}, error = function(e) { 120 | data.frame(read = gsub("^@", "", readnames[i]), 121 | readLength = readlengths[i], 122 | maxArmLengthPalindrome = NA, 123 | totalPalindromeLength = NA, 124 | longestPalindromeArm = "", 125 | stringsAsFactors = FALSE) 126 | }) 127 | }, BPPARAM = MulticoreParam(workers = ncores))) %>% 128 | dplyr::mutate(sample = remap[nm], 129 | dataset = remapds[ds]) 130 | })) 131 | })) 132 | 133 | df <- fastqs %>% 134 | dplyr::left_join(allReads, 135 | by = c("dataset", "sample", "read", "readLength")) %>% 136 | dplyr::mutate( 137 | maxArmLengthPalindrome = replace(maxArmLengthPalindrome, 138 | is.na(maxArmLengthPalindrome), 0), 139 | totalPalindromeLength = replace(totalPalindromeLength, 140 | is.na(totalPalindromeLength), 0)) 141 | 142 | ## Read information about primary/supplementary alignments 143 | dfprimsupp <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 144 | readRDS(paste0(ds, "/output/", ds, 145 | "_primary_supplementary_alignments_distances.rds")) %>% 146 | dplyr::mutate(sample = remap[sample]) %>% 147 | dplyr::mutate(dataset = remapds[ds]) 148 | })) %>% 149 | dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, condition), 150 | by = c("sample" = "sample_remap")) %>% 151 | dplyr::filter(condition %in% conditions) %>% 152 | dplyr::select(-condition) %>% 153 | dplyr::mutate(dist0 = "") %>% 154 | dplyr::mutate(dist0 = replace(dist0, distn == 0, ", overlapping"), 155 | dist0 = replace(dist0, distn > 0, ", non-overlapping")) %>% 156 | dplyr::mutate(strands_dist0 = paste0(strands, dist0)) 157 | 158 | df <- df %>% 159 | dplyr::left_join(dfprimsupp, 160 | by = c("dataset", "sample", "read")) %>% 161 | dplyr::mutate(strands_dist0 = replace(strands_dist0, is.na(strands_dist0), 162 | "no supplementary alignment")) %>% 163 | dplyr::mutate(relMaxArmLengthPalindrome = maxArmLengthPalindrome/readLength) 164 | 165 | catcols <- c("no supplementary alignment" = "lightyellow", 166 | "different chromosomes" = "#E8601C", 167 | "same chromosome, different strand, non-overlapping" = "#7BAFDE", 168 | "same chromosome, different strand, overlapping" = "#90C987", 169 | "same chromosome, same strand, non-overlapping" = "#777777", 170 | "same chromosome, same strand, overlapping" = "#B17BA6") 171 | 172 | df <- df %>% 173 | dplyr::mutate(strands_dist0 = factor(strands_dist0, 174 | levels = names(catcols))) 175 | 176 | png(gsub("\\.rds$", "_aligned_vs_relpalindromelength.png", outrds), 177 | width = 10, height = 5, unit = "in", res = 400) 178 | ggplot(df, aes(x = aligned, y = relMaxArmLengthPalindrome)) + 179 | geom_violin() + geom_boxplot(width = 0.1) + 180 | theme_bw() + facet_wrap(~ dataset) + 181 | xlab("") + ylab("Longest palindrome arm length/read length") + 182 | coord_cartesian(ylim = c(0, max(df$relMaxArmLengthPalindrome) + 0.005)) + 183 | stat_summary(data = df %>% 184 | dplyr::group_by(dataset, aligned) %>% 185 | dplyr::summarize(relMaxArmLengthPalindrome = 186 | length(relMaxArmLengthPalindrome)), 187 | fun.data = function(x) {return(c(y = max(df$relMaxArmLengthPalindrome), 188 | label = x))}, 189 | geom = "text", alpha = 1, size = 2.5, vjust = -1) 190 | dev.off() 191 | 192 | g1 <- ggplot(df, aes(x = aligned, y = maxArmLengthPalindrome)) + 193 | geom_violin() + 194 | theme_bw() + facet_wrap(~ dataset, nrow = 1) + 195 | xlab("") + ylab("Longest palindrome arm length") + 196 | coord_cartesian(ylim = c(0, max(df$maxArmLengthPalindrome) * 1.04)) + 197 | stat_summary(data = df %>% 198 | dplyr::group_by(dataset, aligned) %>% 199 | dplyr::summarize(maxArmLengthPalindrome = 200 | length(maxArmLengthPalindrome)), 201 | fun.data = function(x) {return(c(y = max(df$maxArmLengthPalindrome), 202 | label = x))}, 203 | geom = "text", alpha = 1, size = 2.5, vjust = -1) 204 | 205 | df$hasSuppAlignment <- df$nbrSupplementaryAlignments > 0 206 | df0 <- df %>% dplyr::filter(aligned == "Aligned") 207 | g2 <- ggplot(df0, 208 | aes(x = hasSuppAlignment, y = maxArmLengthPalindrome)) + 209 | geom_violin() + 210 | theme_bw() + facet_wrap(~ dataset, nrow = 1) + 211 | xlab("Read has supplementary alignment") + 212 | ylab("Longest palindrome arm length") + 213 | coord_cartesian(ylim = c(0, max(df0$maxArmLengthPalindrome) * 1.04)) + 214 | stat_summary(data = df0 %>% 215 | dplyr::group_by(dataset, hasSuppAlignment) %>% 216 | dplyr::summarize(maxArmLengthPalindrome = 217 | length(maxArmLengthPalindrome)), 218 | fun.data = function(x) {return(c(y = max(df0$maxArmLengthPalindrome), 219 | label = x))}, 220 | geom = "text", alpha = 1, size = 2.5, vjust = -1) 221 | 222 | df0 <- df %>% dplyr::filter(aligned == "Aligned") %>% 223 | dplyr::filter(dataset %in% c("ONT-DCS108-HAP", "ONT-DCS108-NSK007")) 224 | g3 <- ggplot(df0, 225 | aes(x = strands_dist0, y = maxArmLengthPalindrome)) + 226 | geom_violin(aes(fill = strands_dist0), alpha = 0.5) + 227 | theme_bw() + facet_wrap(~ dataset) + 228 | xlab("") + ylab("Longest palindrome arm length") + 229 | scale_fill_manual(values = catcols, name = "") + 230 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 231 | legend.position = "none") + 232 | coord_cartesian(ylim = c(0, max(df0$maxArmLengthPalindrome) * 1.04)) + 233 | stat_summary(data = df0 %>% 234 | dplyr::group_by(dataset, strands_dist0) %>% 235 | dplyr::summarize(maxArmLengthPalindrome = 236 | length(maxArmLengthPalindrome)), 237 | fun.data = function(x) {return(c(y = max(df0$maxArmLengthPalindrome), 238 | label = x))}, 239 | geom = "text", alpha = 1, size = 2.5, vjust = -1) 240 | 241 | g4 <- ggplot(df0, 242 | aes(x = strands_dist0, y = relMaxArmLengthPalindrome)) + 243 | geom_violin(aes(fill = strands_dist0), alpha = 0.5) + 244 | theme_bw() + facet_wrap(~ dataset) + 245 | xlab("") + ylab("Longest palindrome arm length/read length") + 246 | scale_fill_manual(values = catcols, name = "") + 247 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 248 | legend.position = "none") + 249 | coord_cartesian(ylim = c(0, max(df0$relMaxArmLengthPalindrome) * 1.04)) + 250 | stat_summary(data = df0 %>% 251 | dplyr::group_by(dataset, strands_dist0) %>% 252 | dplyr::summarize(relMaxArmLengthPalindrome = 253 | length(relMaxArmLengthPalindrome)), 254 | fun.data = function(x) {return(c(y = max(df0$relMaxArmLengthPalindrome), 255 | label = x))}, 256 | geom = "text", alpha = 1, size = 2.5, vjust = -1) 257 | 258 | png(gsub("\\.rds$", "_summary.png", outrds), 259 | width = 8, height = 13, unit = "in", res = 400) 260 | cowplot::plot_grid( 261 | g1, 262 | g2, 263 | cowplot::plot_grid(g3, g4, nrow = 1, labels = c("C", "D"), 264 | align = "h", axis = "bt"), 265 | ncol = 1, labels = c("A", "B", ""), 266 | rel_heights = c(1, 1, 1.7) 267 | ) 268 | dev.off() 269 | 270 | png(gsub("\\.rds$", "_summary_sub.png", outrds), 271 | width = 8, height = 6, unit = "in", res = 400) 272 | cowplot::plot_grid( 273 | g1, 274 | g2, 275 | ncol = 1, labels = c("A", "B"), 276 | rel_heights = c(1, 1) 277 | ) 278 | dev.off() 279 | 280 | png(gsub("\\.rds$", "_summary_sub2.png", outrds), 281 | width = 8, height = 3, unit = "in", res = 400) 282 | print(g2) 283 | dev.off() 284 | 285 | df$spacing <- df$totalPalindromeLength - 2*df$maxArmLengthPalindrome 286 | df0 <- df %>% dplyr::filter(aligned == "Aligned") %>% 287 | dplyr::filter(dataset %in% c("ONT-DCS108-HAP", "ONT-DCS108-NSK007")) 288 | png(gsub("\\.rds$", "_stranddist_vs_spacing.png", outrds), 289 | width = 4, height = 7, unit = "in", res = 400) 290 | ggplot(df0, 291 | aes(x = strands_dist0, y = spacing)) + 292 | geom_violin(aes(fill = strands_dist0), alpha = 0.5) + 293 | theme_bw() + facet_wrap(~ dataset) + 294 | xlab("") + ylab("Distance between palindrome arms") + 295 | scale_fill_manual(values = catcols, name = "") + 296 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 297 | legend.position = "none") + 298 | coord_cartesian(ylim = c(0, max(df0$spacing) + 50)) + 299 | stat_summary(data = df0 %>% 300 | dplyr::group_by(dataset, strands_dist0) %>% 301 | dplyr::summarize(spacing = length(spacing)), 302 | fun.data = function(x) {return(c(y = max(df0$spacing), 303 | label = x))}, 304 | geom = "text", alpha = 1, size = 2.5, vjust = -1) 305 | dev.off() 306 | 307 | saveRDS(df, file = outrds) 308 | date() 309 | sessionInfo() 310 | 311 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_polya_tail_estimates.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(ggplot2) 9 | library(tidyr) 10 | library(ggbeeswarm) 11 | }) 12 | 13 | print(tx2gene) 14 | print(outrds) 15 | 16 | ## -------------------------------------------------------------------------- ## 17 | ## Read tx2gene and extract some categories 18 | tx2gene <- readRDS(tx2gene) 19 | tx2gene <- tx2gene %>% 20 | dplyr::mutate(tx_biotype = replace(tx_biotype, tx_biotype == "protein_coding" & 21 | chromosome == "MT", "Mt_protein_coding")) %>% 22 | dplyr::mutate(tx_biotype = replace(tx_biotype, grepl("^RPL[0-9]|^RPS[0-9]", 23 | symbol), "ribosomal_protein")) 24 | 25 | ## -------------------------------------------------------------------------- ## 26 | ## Read polyA tail length estimates 27 | tailfindr <- read.delim("RNA001/tailfindr/wt_1_RNA001_tailfindr.csv", 28 | header = TRUE, as.is = TRUE, sep = ",") 29 | nanopolish <- read.delim(paste0("RNA001/nanopolish/wt_1_RNA001/", 30 | "pipeline-polya-ng/tails/filtered_tails.tsv"), 31 | header = TRUE, as.is = TRUE) 32 | 33 | table(nanopolish$polya_length < 1000) 34 | 35 | ## -------------------------------------------------------------------------- ## 36 | ## Add transcript annotation to nanopolish table 37 | nanopolish <- nanopolish %>% 38 | dplyr::left_join(tx2gene, by = c("contig" = "tx")) %>% 39 | dplyr::filter(polya_length < 1000) 40 | 41 | ## -------------------------------------------------------------------------- ## 42 | ## Merge nanopolish and tailfindr estimates 43 | tails <- dplyr::inner_join( 44 | tailfindr %>% dplyr::select(read_id, tail_length) %>% 45 | dplyr::rename(readname = read_id, tailfindr = tail_length), 46 | nanopolish %>% dplyr::select(readname, polya_length) %>% 47 | dplyr::rename(Nanopolish = polya_length), 48 | by = "readname" 49 | ) %>% 50 | dplyr::filter(!is.na(tailfindr) & !is.na(Nanopolish)) 51 | 52 | ## -------------------------------------------------------------------------- ## 53 | ## Summary plot 54 | png(gsub("\\.rds$", "_summary.png", outrds), 55 | width = 7, height = 9, unit = "in", res = 400) 56 | cowplot::plot_grid( 57 | cowplot::plot_grid( 58 | ggplot(tails %>% 59 | tidyr::gather(key = "method", value = "polya_length", -readname), 60 | aes(x = method, y = polya_length)) + 61 | geom_boxplot() + theme_bw() + 62 | xlab("") + ylab("polyA tail length estimate"), 63 | ggplot(tails, aes(x = Nanopolish, y = tailfindr)) + 64 | geom_abline(slope = 1, intercept = 0) + 65 | geom_point(alpha = 0.25) + theme_bw() + 66 | xlab("Nanopolish polyA tail length estimate") + 67 | ylab("tailfindr polyA tail length estimate"), 68 | rel_widths = c(0.5, 1), nrow = 1, labels = c("A", "B"), 69 | align = "h", axis = "bt"), 70 | ggplot(nanopolish %>% dplyr::group_by(tx_biotype) %>% 71 | dplyr::mutate(nreads = length(readname)) %>% 72 | dplyr::filter(nreads >= 10), 73 | aes(x = tx_biotype, y = polya_length)) + 74 | geom_boxplot() + theme_bw() + 75 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 76 | xlab("Transcript biotype") + 77 | ylab("Nanopolish polyA tail length estimate"), 78 | rel_heights = c(1, 1.5), ncol = 1, labels = c("", "C") 79 | ) 80 | dev.off() 81 | 82 | ## -------------------------------------------------------------------------- ## 83 | ## Calculate median polyA tail length estimate for reads assigned to each 84 | ## transcript. Plot these values for genes with many annotated transcripts. 85 | np <- nanopolish %>% dplyr::group_by(contig, symbol) %>% 86 | dplyr::summarize(median_polya_length = median(polya_length)) %>% 87 | dplyr::ungroup() %>% 88 | dplyr::group_by(symbol) %>% 89 | dplyr::mutate(ntx = length(contig)) %>% 90 | dplyr::filter(ntx > 1) 91 | 92 | png(gsub("\\.rds$", "_polya_length_divergence_within_gene.png", outrds), 93 | height = 4, width = 7, unit = "in", res = 400) 94 | ggplot(np %>% dplyr::filter(ntx >= 7), 95 | aes(x = symbol, y = median_polya_length, color = symbol)) + 96 | geom_beeswarm(size = 2.5, alpha = 0.7) + theme_bw() + 97 | theme(legend.position = "none", 98 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 99 | ylab("Median polyA tail length estimate\nfor reads assigned to transcript") + 100 | xlab("Gene") 101 | dev.off() 102 | ## -------------------------------------------------------------------------- ## 103 | 104 | saveRDS(NULL, file = outrds) 105 | date() 106 | sessionInfo() 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_primary_supplementary.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(ggplot2) 9 | library(stringr) 10 | library(tximport) 11 | }) 12 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 13 | 14 | datasets <- strsplit(datasets, ",")[[1]] 15 | names(datasets) <- datasets 16 | conditions <- strsplit(conditions, ",")[[1]] 17 | 18 | print(datasets) 19 | print(conditions) 20 | print(outrds) 21 | 22 | ## Read data 23 | dfprimsupp <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 24 | readRDS(paste0(ds, "/output/", ds, 25 | "_primary_supplementary_alignments_distances.rds")) %>% 26 | dplyr::mutate(sample = remap[sample]) %>% 27 | dplyr::mutate(dataset = remapds[ds]) 28 | })) %>% 29 | dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, condition), 30 | by = c("sample" = "sample_remap")) %>% 31 | dplyr::filter(condition %in% conditions) %>% 32 | dplyr::select(-condition) %>% 33 | dplyr::mutate(dataset = factor(dataset, levels = ds_order[ds_order %in% dataset])) 34 | 35 | g0 <- ggplot(dfprimsupp %>% 36 | dplyr::mutate(sample = removeDatasetFromSample(sample, dataset)) %>% 37 | dplyr::mutate(dist0 = "") %>% 38 | dplyr::mutate(dist0 = replace(dist0, distn == 0, ", overlapping"), 39 | dist0 = replace(dist0, distn > 0, ", non-overlapping")) %>% 40 | dplyr::group_by(sample, dataset, strands, dist0) %>% dplyr::tally() %>% 41 | dplyr::mutate(strands_dist0 = paste0(strands, dist0)), 42 | aes(x = sample, y = n, fill = strands_dist0)) + 43 | facet_grid(~ dataset, scales = "free_x", space = "free_x") + 44 | theme_bw() + 45 | theme(legend.position = "bottom", 46 | strip.text = element_text(size = 8)) + 47 | xlab("") + 48 | guides(fill = guide_legend(nrow = 2, byrow = TRUE)) + 49 | scale_fill_manual(values = c( 50 | "different chromosomes" = "#E8601C", 51 | "same chromosome, different strand, non-overlapping" = "#7BAFDE", 52 | "same chromosome, different strand, overlapping" = "#90C987", 53 | "same chromosome, same strand, non-overlapping" = "#777777", 54 | "same chromosome, same strand, overlapping" = "#B17BA6" 55 | ), name = "") 56 | 57 | ## By dataset 58 | g0ds <- ggplot(dfprimsupp %>% 59 | dplyr::mutate(dist0 = "") %>% 60 | dplyr::mutate(dist0 = replace(dist0, distn == 0, ", overlapping"), 61 | dist0 = replace(dist0, distn > 0, ", non-overlapping")) %>% 62 | dplyr::group_by(dataset, strands, dist0) %>% dplyr::tally() %>% 63 | dplyr::mutate(strands_dist0 = paste0(strands, dist0)), 64 | aes(x = dataset, y = n, fill = strands_dist0)) + 65 | theme_bw() + 66 | theme(legend.position = "bottom", 67 | strip.text = element_text(size = 8), 68 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 69 | xlab("") + 70 | guides(fill = guide_legend(nrow = 5, byrow = TRUE)) + 71 | scale_fill_manual(values = c( 72 | "different chromosomes" = "#E8601C", 73 | "same chromosome, different strand, non-overlapping" = "#7BAFDE", 74 | "same chromosome, different strand, overlapping" = "#90C987", 75 | "same chromosome, same strand, non-overlapping" = "#777777", 76 | "same chromosome, same strand, overlapping" = "#B17BA6" 77 | ), name = "") 78 | 79 | 80 | png(gsub("\\.rds$", "_nbr_primary_supplementary_pairs.png", outrds), 81 | width = 10, height = 6, unit = "in", res = 400) 82 | print(g0 + geom_bar(stat = "identity") + 83 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 84 | ylab("Number of primary/supplementary\nalignment pairs")) 85 | dev.off() 86 | 87 | png(gsub("\\.rds$", "_frac_primary_supplementary_pairs.png", outrds), 88 | width = 10, height = 6, unit = "in", res = 400) 89 | print(g0 + geom_bar(stat = "identity", position = "fill") + 90 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 91 | ylab("Fraction of primary/supplementary\nalignment pairs")) 92 | dev.off() 93 | 94 | plots <- list( 95 | primary_supplementary_fraction = 96 | g0 + geom_bar(stat = "identity", position = "fill") + 97 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 98 | ylab("Fraction of primary/supplementary\nalignment pairs"), 99 | primary_supplementary_fraction_byds = 100 | g0ds + geom_bar(stat = "identity", position = "fill") + 101 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 102 | ylab("Fraction of primary/supplementary\nalignment pairs") 103 | ) 104 | 105 | saveRDS(plots, file = outrds) 106 | date() 107 | sessionInfo() 108 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_rseqc.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(ggplot2) 9 | library(cowplot) 10 | library(tidyr) 11 | }) 12 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 13 | source("RSeQC/RSeQC_geneBody_coverage.geneBodyCoverage.r") 14 | 15 | datasets <- strsplit(datasets, ",")[[1]] 16 | datasets <- c(datasets, "Illumina") 17 | conditions <- strsplit(conditions, ",")[[1]] 18 | 19 | print(datasets) 20 | print(conditions) 21 | print(outrds) 22 | 23 | df <- data.frame(sample = gsub("0918\\.A_", "0918\\.A-", gsub("^V", "", rowLabel)), 24 | data.frame(data_matrix) %>% setNames(paste0("x", seq_len(100))), 25 | stringsAsFactors = FALSE) %>% 26 | tidyr::gather(key = "x", value = "y", -sample) %>% 27 | dplyr::mutate(x = as.numeric(gsub("x", "", x)), 28 | sample = remap[gsub("_minimap_genome_s|_Aligned.sortedByCoord.out", 29 | "", sample)]) %>% 30 | dplyr::left_join(sample_annotation %>% 31 | dplyr::select(sample_remap, condition, dataset), 32 | by = c("sample" = "sample_remap")) %>% 33 | dplyr::filter(dataset %in% remapds[datasets] & condition %in% conditions) %>% 34 | dplyr::mutate( 35 | dataset = factor(dataset, levels = ds_order[ds_order %in% dataset]) 36 | ) 37 | 38 | png(gsub("\\.rds$", "_gene_body_coverage.png", outrds), width = 6, height = 3.5, 39 | unit = "in", res = 400) 40 | ggplot(df, aes(x = x, y = y, group = sample, color = dataset)) + 41 | geom_line() + theme_bw() + 42 | scale_color_manual(values = ds_colors, name = "") + 43 | guides(color = guide_legend(override.aes = list(size = 2))) + 44 | xlab("Gene body percentile (5' -> 3')") + ylab("Coverage") 45 | dev.off() 46 | 47 | saveRDS(NULL, file = outrds) 48 | date() 49 | sessionInfo() 50 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_salmon_variants_all_datasets.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(ggplot2) 9 | library(stringr) 10 | library(tximport) 11 | library(ggbeeswarm) 12 | library(cowplot) 13 | }) 14 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 15 | 16 | datasets <- strsplit(datasets, ",")[[1]] 17 | datasets <- setdiff(datasets, "HEK293RNA") 18 | conditions <- strsplit(conditions, ",")[[1]] 19 | 20 | print(datasets) 21 | print(conditions) 22 | print(tx2gene) 23 | print(outrds) 24 | 25 | tx2gene <- readRDS(tx2gene) 26 | 27 | ## Read quantifications 28 | salmonfiles_illumina <- list.files("Illumina/salmon31", pattern = "quant.sf", 29 | recursive = TRUE, full.names = TRUE) 30 | names(salmonfiles_illumina) <- paste0("Illumina__", 31 | basename(gsub("/quant.sf", "", 32 | salmonfiles_illumina))) 33 | salmonfiles_illumina <- 34 | salmonfiles_illumina[names(salmonfiles_illumina) %in% 35 | paste0("Illumina__", 36 | sample_annotation$sample_orig[sample_annotation$condition %in% 37 | conditions])] 38 | salmonfiles_illumina 39 | 40 | txi_illumina <- tximport(salmonfiles_illumina, type = "salmon", txOut = TRUE) 41 | txig_illumina <- summarizeToGene(txi_illumina, tx2gene = tx2gene[, c("tx", "gene")]) 42 | 43 | ## ========================================================================== ## 44 | ## Salmon 45 | ## ========================================================================== ## 46 | salmonfiles_nanopore <- unlist(lapply(datasets, function(ds) { 47 | salmon31 <- list.files(file.path(ds, "salmon31"), pattern = "quant.sf", 48 | recursive = TRUE, full.names = TRUE) 49 | names(salmon31) <- paste0("salmon31__", basename(gsub("/quant.sf", "", 50 | salmon31))) 51 | 52 | salmon31noclip <- list.files(file.path(ds, "salmon31_aligned_noclippedbases"), 53 | pattern = "quant.sf", recursive = TRUE, 54 | full.names = TRUE) 55 | names(salmon31noclip) <- paste0("salmon31noclip__", basename(gsub("/quant.sf", "", 56 | salmon31noclip))) 57 | 58 | salmonminimap2_p0.8 <- list.files(file.path(ds, "salmonminimap2"), 59 | pattern = "quant.sf", recursive = TRUE, 60 | full.names = TRUE) 61 | names(salmonminimap2_p0.8) <- paste0("salmonminimap2_p0.8__", 62 | basename(gsub("/quant.sf", "", 63 | salmonminimap2_p0.8))) 64 | 65 | salmonminimap2_p0.99 <- list.files(file.path(ds, "salmonminimap2_p0.99"), 66 | pattern = "quant.sf", recursive = TRUE, 67 | full.names = TRUE) 68 | names(salmonminimap2_p0.99) <- paste0("salmonminimap2_p0.99__", 69 | basename(gsub("/quant.sf", "", 70 | salmonminimap2_p0.99))) 71 | 72 | c(salmon31, salmon31noclip, salmonminimap2_p0.8, salmonminimap2_p0.99) 73 | })) 74 | 75 | salmonfiles_nanopore <- 76 | salmonfiles_nanopore[gsub("salmon31__|salmon31noclip__|salmonminimap2_p0.8__|salmonminimap2_p0.99__", "", names(salmonfiles_nanopore)) %in% 77 | sample_annotation$sample_orig[sample_annotation$condition %in% 78 | conditions]] 79 | salmonfiles_nanopore 80 | 81 | txi_nanopore <- tximport(salmonfiles_nanopore, type = "salmon", txOut = TRUE) 82 | txig_nanopore <- summarizeToGene(txi_nanopore, tx2gene = tx2gene[, c("tx", "gene")]) 83 | 84 | txi <- as.data.frame(txi_nanopore$counts) %>% 85 | tibble::rownames_to_column("tx") %>% 86 | dplyr::full_join(as.data.frame(txi_illumina$abundance) %>% 87 | tibble::rownames_to_column("tx"), by = "tx") %>% 88 | as.data.frame() %>% 89 | tibble::column_to_rownames("tx") 90 | dim(txi) 91 | txig <- as.data.frame(txig_nanopore$counts) %>% 92 | tibble::rownames_to_column("gene") %>% 93 | dplyr::full_join(as.data.frame(txig_illumina$abundance) %>% 94 | tibble::rownames_to_column("gene"), by = "gene") %>% 95 | as.data.frame() %>% 96 | tibble::column_to_rownames("gene") 97 | dim(txig) 98 | 99 | fixCorDf <- function(cordf) { 100 | cordf %>% 101 | tibble::rownames_to_column("sample1") %>% 102 | tidyr::gather(key = "sample2", value = "correlation", -sample1) %>% 103 | tidyr::separate(sample1, into = c("method1", "sample1"), sep = "__") %>% 104 | tidyr::separate(sample2, into = c("method2", "sample2"), sep = "__") %>% 105 | dplyr::mutate(sample1 = remap[sample1], 106 | sample2 = remap[sample2]) %>% 107 | dplyr::mutate(condition1 = stringr::str_extract(sample1, "wt|srpk"), 108 | condition2 = stringr::str_extract(sample2, "wt|srpk")) %>% 109 | dplyr::filter(method1 == "Illumina" & method2 != "Illumina" & 110 | sample1 != sample2 & condition1 == condition2) %>% 111 | dplyr::mutate(dataset2 = sapply(strsplit(sample2, "_"), 112 | .subset, 1)) %>% 113 | dplyr::mutate(dataset2 = factor(dataset2, levels = ds_order[ds_order %in% dataset2])) 114 | } 115 | 116 | spearman_tx <- fixCorDf(as.data.frame(cor(sqrt(txi), method = "spearman"))) 117 | 118 | spearman_gene <- fixCorDf(as.data.frame(cor(sqrt(txig), method = "spearman"))) 119 | 120 | pearson_tx <- fixCorDf(as.data.frame(cor(sqrt(txi), method = "pearson"))) 121 | 122 | pearson_gene <- fixCorDf(as.data.frame(cor(sqrt(txig), method = "pearson"))) 123 | 124 | png(gsub("\\.rds$", "_salmon_spearman.png", outrds), width = 9, height = 5, 125 | unit = "in", res = 400) 126 | p1 <- ggplot(spearman_tx, aes(x = method2, y = correlation, color = dataset2)) + 127 | geom_quasirandom(size = 3, alpha = 0.6) + 128 | theme_bw() + ggtitle("Transcript") + 129 | ylab("Spearman correlation with\nIllumina (sqrt abundance)") + xlab("") + 130 | coord_cartesian(ylim = c(0, 1)) + 131 | scale_color_manual(values = ds_colors, name = "") + 132 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 133 | p2 <- ggplot(spearman_gene, aes(x = method2, y = correlation, color = dataset2)) + 134 | geom_quasirandom(size = 3, alpha = 0.6) + 135 | theme_bw() + ggtitle("Gene") + 136 | ylab("Spearman correlation with\nIllumina (sqrt abundance)") + xlab("") + 137 | coord_cartesian(ylim = c(0, 1)) + 138 | scale_color_manual(values = ds_colors, name = "") + 139 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 140 | legend.position = "bottom") 141 | print(cowplot::plot_grid( 142 | cowplot::plot_grid( 143 | p1 + theme(legend.position = "none"), 144 | p2 + theme(legend.position = "none"), 145 | labels = c("A", "B"), rel_widths = c(1, 1), nrow = 1 146 | ), 147 | cowplot::get_legend(p2), 148 | rel_heights = c(1, 0.2), ncol = 1) 149 | ) 150 | dev.off() 151 | 152 | png(gsub("\\.rds$", "_salmon_pearson.png", outrds), width = 9, height = 5, 153 | unit = "in", res = 400) 154 | p1 <- ggplot(pearson_tx, aes(x = method2, y = correlation, color = dataset2)) + 155 | geom_quasirandom(size = 3, alpha = 0.6) + 156 | theme_bw() + ggtitle("Transcript") + 157 | ylab("Pearson correlation with\nIllumina (sqrt abundance)") + xlab("") + 158 | coord_cartesian(ylim = c(0, 1)) + 159 | scale_color_manual(values = ds_colors, name = "") + 160 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 161 | p2 <- ggplot(pearson_gene, aes(x = method2, y = correlation, color = dataset2)) + 162 | geom_quasirandom(size = 3, alpha = 0.6) + 163 | theme_bw() + ggtitle("Gene") + 164 | ylab("Pearson correlation with\nIllumina (sqrt abundance)") + xlab("") + 165 | coord_cartesian(ylim = c(0, 1)) + 166 | scale_color_manual(values = ds_colors, name = "") + 167 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 168 | legend.position = "bottom") 169 | print(cowplot::plot_grid( 170 | cowplot::plot_grid( 171 | p1 + theme(legend.position = "none"), 172 | p2 + theme(legend.position = "none"), 173 | labels = c("A", "B"), rel_widths = c(1, 1), nrow = 1 174 | ), 175 | cowplot::get_legend(p2), 176 | rel_heights = c(1, 0.2), ncol = 1) 177 | ) 178 | dev.off() 179 | 180 | saveRDS(NULL, file = outrds) 181 | date() 182 | sessionInfo() 183 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_sqanti_round2_summary.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(rtracklayer) 8 | library(dplyr) 9 | library(GenomicFeatures) 10 | library(ggplot2) 11 | library(grDevices) 12 | library(cowplot) 13 | library(grid) 14 | }) 15 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 16 | 17 | datasets <- strsplit(datasets, ",")[[1]] 18 | datasets <- c(datasets, "Illumina") 19 | names(datasets) <- datasets 20 | conditions <- strsplit(conditions, ",")[[1]] 21 | 22 | if (all(c("wt", "srpk") %in% conditions)) { 23 | sqanticond <- "_all" 24 | } else if ("wt" %in% conditions) { 25 | sqanticond <- "_WT" 26 | } else { 27 | stop("Unknown conditions") 28 | } 29 | 30 | print(datasets) 31 | print(conditions) 32 | print(sqanticond) 33 | print(outrds) 34 | 35 | muted <- c("#DC050C", "#E8601C", "#7BAFDE", "#1965B0", "#B17BA6", 36 | "#882E72", "#F1932D", "#F6C141", "#F7EE55", "#4EB265", 37 | "#CAEDAB", "#777777") 38 | colfun <- grDevices::colorRampPalette(muted) 39 | 40 | ## Read Illumina abundance estimates 41 | ilmn_tx_abundances <- 42 | readRDS("Illumina/output/Illumina_all_abundances.rds")$tx_abundances 43 | ilmn_samples <- sample_annotation %>% 44 | dplyr::filter(dataset == "Illumina" & condition %in% conditions) %>% 45 | dplyr::pull("sample_orig") 46 | print(ilmn_samples) 47 | ave_ilmn_tx_tpm <- 48 | data.frame(transcript_id = rownames(ilmn_tx_abundances), 49 | ave_tx_tpm = rowMeans( 50 | ilmn_tx_abundances[, paste0(ilmn_samples, "__tpm__salmon")] 51 | ), 52 | stringsAsFactors = FALSE) 53 | 54 | ## ========================================================================== ## 55 | ## Plot overlap of detected transcripts 56 | df0 <- do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) { 57 | dplyr::bind_rows( 58 | read.delim(paste0( 59 | ds, "/sqanti_round2_ilmnjunc/", ds, sqanticond, "/", ds, sqanticond, 60 | "_ilmnjunc_classification.txt"), 61 | header = TRUE, as.is = TRUE) %>% 62 | dplyr::filter(structural_category %in% c("full-splice_match", 63 | "incomplete-splice_match")) %>% 64 | dplyr::select(associated_transcript) %>% 65 | dplyr::rename(transcript_id = associated_transcript) %>% 66 | dplyr::distinct() %>% 67 | dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc"), 68 | identified = TRUE), 69 | read.delim(paste0( 70 | ds, "/sqanti_round2/", ds, sqanticond, "/", ds, sqanticond, 71 | "_classification.txt"), 72 | header = TRUE, as.is = TRUE) %>% 73 | dplyr::filter(structural_category %in% c("full-splice_match", 74 | "incomplete-splice_match")) %>% 75 | dplyr::select(associated_transcript) %>% 76 | dplyr::rename(transcript_id = associated_transcript) %>% 77 | dplyr::distinct() %>% 78 | dplyr::mutate(dataset = remapds[ds], 79 | identified = TRUE) 80 | ) 81 | })) %>% 82 | dplyr::filter(dataset != "ONT-RNA001-HEK_ILMNjunc") ## don't use HAP junctions for HEK 83 | dettx <- unique(df0$transcript_id) 84 | detmat <- matrix(0, nrow = length(dettx), ncol = length(unique(df0$dataset))) 85 | rownames(detmat) <- dettx 86 | colnames(detmat) <- unique(df0$dataset) 87 | detmat[as.matrix(df0 %>% dplyr::select(transcript_id, dataset) %>% 88 | dplyr::mutate(transcript_id = as.character(transcript_id), 89 | dataset = as.character(dataset)))] <- 1 90 | ordr <- ds_order[ds_order %in% colnames(detmat)] 91 | ordr <- rep(ordr, each = 2) 92 | suffx <- rep(c("", "_ILMNjunc"), length(ordr)/2) 93 | ordr <- paste0(ordr, suffx) 94 | ordr <- setdiff(ordr, "ONT-RNA001-HEK_ILMNjunc") 95 | stopifnot(ordr %in% colnames(detmat), 96 | colnames(detmat) %in% ordr) 97 | detmat <- detmat[, ordr] 98 | png(gsub("\\.rds$", "_shared_transcripts.png", outrds), 99 | width = 10, height = 6, unit = "in", res = 400) 100 | UpSetR::upset(data.frame(detmat, check.names = FALSE), order.by = "freq", 101 | decreasing = TRUE, keep.order = TRUE, nsets = length(ordr), 102 | mainbar.y.label = "Number of shared annotated transcripts", 103 | sets.x.label = "Number of identified\nannotated transcripts", 104 | sets.bar.color = ds_colors[gsub("_ILMNjunc", "", colnames(detmat))], 105 | sets = colnames(detmat), mb.ratio = c(0.55, 0.45)) 106 | grid::grid.edit("arrange", name = "arrange2") 107 | upsetplot <- grid::grid.grab() 108 | dev.off() 109 | 110 | ## ========================================================================== ## 111 | ## Plot abundance of identified/unidentified transcripts 112 | df1 <- df0 %>% 113 | dplyr::full_join( 114 | do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) { 115 | dplyr::bind_rows(ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = remapds[ds]), 116 | ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = paste0(remapds[ds], 117 | "_ILMNjunc"))) %>% 118 | dplyr::filter(dataset != "ONT-RNA001-HEK_ILMNjunc") 119 | })), 120 | by = c("transcript_id", "dataset") 121 | ) %>% 122 | dplyr::mutate(identified = replace(identified, is.na(identified), FALSE)) 123 | 124 | png(gsub("\\.rds$", "_illumina_abundance_by_identification.png", outrds), 125 | width = 8, height = 4, unit = "in", res = 400) 126 | ggplot(df1, aes(x = identified, y = ave_tx_tpm + 1)) + 127 | geom_boxplot() + facet_wrap(~ dataset, nrow = 1) + theme_bw() + 128 | scale_y_log10() + xlab("Transcript identified by FLAIR") + 129 | ylab("Average abundance across Illumina samples (TPM + 1)") 130 | dev.off() 131 | 132 | ## Merge with upset plot 133 | df2 <- df1 %>% dplyr::filter(grepl("_ILMNjunc$|RNA001-HEK", dataset)) %>% 134 | dplyr::select(-dataset) %>% 135 | dplyr::group_by(transcript_id, ave_tx_tpm) %>% 136 | dplyr::summarize(identified = any(identified)) 137 | p1 <- ggplot(df2, aes(x = identified, y = ave_tx_tpm + 1)) + 138 | geom_boxplot() + theme_bw() + 139 | scale_y_log10() + xlab("Transcript identified by FLAIR") + 140 | ylab("Average abundance across Illumina samples (TPM + 1)") 141 | png(gsub("\\.rds$", "_illumina_abundance_plus_shared_tx.png", outrds), 142 | width = 14, height = 6, unit = "in", res = 400) 143 | print(cowplot::plot_grid( 144 | upsetplot, 145 | p1, 146 | rel_widths = c(5, 1.5), nrow = 1, labels = c("A", "B") 147 | )) 148 | dev.off() 149 | 150 | ## ========================================================================== ## 151 | ## Get number of exons and read length per flair transcript and plot for 152 | ## different structural categories 153 | ds_order_both <- rep(ds_order, each = 2) 154 | suffx <- rep(c("", "_ILMNjunc"), length(ds_order_both)/2) 155 | ds_order_both <- paste0(ds_order_both, suffx) 156 | nbrexons <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 157 | tmap <- read.delim(paste0( 158 | ds, "/sqanti", ifelse(ds == "Illumina", "/", "_round2/"), 159 | ds, sqanticond, "/", ds, sqanticond, 160 | "_classification.txt"), 161 | header = TRUE, as.is = TRUE) %>% 162 | dplyr::mutate(dataset = remapds[ds]) 163 | if (ds %in% c("RNA001", "DCS108", "pilot")) { 164 | tmap2 <- read.delim(paste0( 165 | ds, "/sqanti_round2_ilmnjunc/", ds, sqanticond, "/", ds, sqanticond, 166 | "_ilmnjunc_classification.txt"), 167 | header = TRUE, as.is = TRUE) %>% 168 | dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc")) 169 | tmap <- dplyr::bind_rows(tmap, tmap2) 170 | } 171 | tmap 172 | })) %>% 173 | dplyr::mutate(exons = replace(exons, exons > 2, ">2")) %>% 174 | dplyr::mutate(dataset = factor(dataset, levels = ds_order_both[ds_order_both %in% dataset])) 175 | 176 | ## ========================================================================== ## 177 | ## Plot distribution of structural categories 178 | dfcc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 179 | tmap <- read.delim(paste0( 180 | ds, "/sqanti", ifelse(ds == "Illumina", "/", "_round2/"), 181 | ds, sqanticond, "/", ds, sqanticond, 182 | "_classification.txt"), 183 | header = TRUE, as.is = TRUE) 184 | t1 <- as.data.frame(table(tmap$structural_category)) %>% 185 | dplyr::mutate(dataset = remapds[ds]) %>% 186 | dplyr::rename(structural_category = Var1) %>% 187 | dplyr::mutate(structural_category = as.character(structural_category)) 188 | if (ds %in% c("RNA001", "DCS108", "pilot")) { 189 | tmap2 <- read.delim(paste0( 190 | ds, "/sqanti_round2_ilmnjunc/", ds, sqanticond, "/", ds, sqanticond, 191 | "_ilmnjunc_classification.txt"), 192 | header = TRUE, as.is = TRUE) 193 | t1 <- dplyr::bind_rows( 194 | t1, as.data.frame(table(tmap2$structural_category)) %>% 195 | dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc")) %>% 196 | dplyr::rename(structural_category = Var1) %>% 197 | dplyr::mutate(structural_category = as.character(structural_category))) 198 | } 199 | t1 200 | })) %>% 201 | dplyr::mutate(structural_category = factor(structural_category, levels = c( 202 | "novel_not_in_catalog", 203 | "novel_in_catalog", 204 | "antisense", 205 | "intergenic", 206 | "genic", 207 | "genic_intron", 208 | "fusion", 209 | "incomplete-splice_match", 210 | "full-splice_match" 211 | ))) %>% 212 | dplyr::mutate(dataset = factor(dataset, levels = ds_order_both[ds_order_both %in% dataset])) 213 | 214 | p0 <- ggplot(dfcc, aes(x = dataset, y = Freq)) + 215 | geom_bar(stat = "identity", position = "fill", 216 | aes(fill = structural_category)) + theme_bw() + 217 | scale_fill_manual(values = structure(colfun(15), 218 | names = levels(dfcc$structural_category)), 219 | name = "Overlap type (structural category)") + 220 | ylab("Fraction of transcripts") + xlab("") + 221 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 222 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 223 | geom_text(data = dfcc %>% dplyr::group_by(dataset) %>% 224 | dplyr::summarize(nbrTx = sum(Freq)), 225 | aes(x = dataset, y = 1, label = nbrTx), 226 | vjust = 0, nudge_y = 0.02, size = 3) 227 | 228 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_structuralcategory_full.png", outrds), 229 | width = 11, height = 16, unit = "in", res = 400) 230 | p1 <- ggplot(nbrexons %>% 231 | dplyr::group_by(dataset, structural_category, exons) %>% 232 | dplyr::tally() %>% 233 | dplyr::ungroup() %>% 234 | dplyr::mutate(exons = factor(exons, 235 | levels = c(">2", "2", "1"))), 236 | aes(x = structural_category, y = n, fill = exons)) + 237 | geom_bar(stat = "identity", position = "fill") + theme_bw() + 238 | facet_wrap(~ dataset, nrow = 2) + 239 | xlab("Structural category") + ylab("Fraction of transcripts") + 240 | scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"), 241 | name = "Number of\nexons") + 242 | theme(legend.position = "bottom", 243 | axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 244 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) 245 | p2 <- ggplot(nbrexons, 246 | aes(x = structural_category, y = length)) + 247 | geom_boxplot() + theme_bw() + 248 | facet_wrap(~ dataset, nrow = 2) + 249 | scale_y_log10() + 250 | theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 251 | xlab("Structural category") + ylab("Inferred transcript length") 252 | cowplot::plot_grid(p0, p1, p2, ncol = 1, labels = c("A", "B", "C"), 253 | rel_heights = c(1, 1.1, 1), align = "v", axis = "l") 254 | dev.off() 255 | 256 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_structuralcategory.png", outrds), 257 | width = 7, height = 12, unit = "in", res = 400) 258 | p1 <- ggplot(nbrexons %>% 259 | dplyr::filter(dataset %in% c("ONT-RNA001-HAP_ILMNjunc", "Illumina")) %>% 260 | dplyr::group_by(dataset, structural_category, exons) %>% 261 | dplyr::tally() %>% 262 | dplyr::ungroup() %>% 263 | dplyr::mutate(exons = factor(exons, 264 | levels = c(">2", "2", "1"))), 265 | aes(x = structural_category, y = n, fill = exons)) + 266 | geom_bar(stat = "identity", position = "fill") + theme_bw() + 267 | facet_wrap(~ dataset, nrow = 1) + 268 | xlab("Structural category") + ylab("Fraction of transcripts") + 269 | scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"), 270 | name = "Number of\nexons") + 271 | theme(legend.position = "bottom", 272 | axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 273 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) 274 | p2 <- ggplot(nbrexons %>% 275 | dplyr::filter(dataset %in% c("ONT-RNA001-HAP_ILMNjunc", "Illumina")), 276 | aes(x = structural_category, y = length)) + 277 | geom_boxplot() + theme_bw() + 278 | facet_wrap(~ dataset, nrow = 1) + 279 | scale_y_log10() + 280 | theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 281 | xlab("Structural category") + ylab("Inferred transcript length") 282 | cowplot::plot_grid(p0, p1, p2, 283 | ncol = 1, labels = c("A", "B", "C"), 284 | rel_heights = c(1.35, 1.28, 1.1), align = "v", axis = "l") 285 | dev.off() 286 | 287 | saveRDS(NULL, file = outrds) 288 | date() 289 | sessionInfo() -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_sqanti_summary.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(rtracklayer) 8 | library(dplyr) 9 | library(GenomicFeatures) 10 | library(ggplot2) 11 | library(grDevices) 12 | library(cowplot) 13 | library(grid) 14 | }) 15 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 16 | 17 | datasets <- strsplit(datasets, ",")[[1]] 18 | datasets <- c(datasets, "Illumina") 19 | names(datasets) <- datasets 20 | conditions <- strsplit(conditions, ",")[[1]] 21 | 22 | if (all(c("wt", "srpk") %in% conditions)) { 23 | sqanticond <- "_all" 24 | } else if ("wt" %in% conditions) { 25 | sqanticond <- "_WT" 26 | } else { 27 | stop("Unknown conditions") 28 | } 29 | 30 | print(datasets) 31 | print(conditions) 32 | print(sqanticond) 33 | print(outrds) 34 | 35 | muted <- c("#DC050C", "#E8601C", "#7BAFDE", "#1965B0", "#B17BA6", 36 | "#882E72", "#F1932D", "#F6C141", "#F7EE55", "#4EB265", 37 | "#CAEDAB", "#777777") 38 | colfun <- grDevices::colorRampPalette(muted) 39 | 40 | ## Read Illumina abundance estimates 41 | ilmn_tx_abundances <- 42 | readRDS("Illumina/output/Illumina_all_abundances.rds")$tx_abundances 43 | ilmn_samples <- sample_annotation %>% 44 | dplyr::filter(dataset == "Illumina" & condition %in% conditions) %>% 45 | dplyr::pull("sample_orig") 46 | ave_ilmn_tx_tpm <- 47 | data.frame(transcript_id = rownames(ilmn_tx_abundances), 48 | ave_tx_tpm = rowMeans( 49 | ilmn_tx_abundances[, paste0(ilmn_samples, "__tpm__salmon")] 50 | ), 51 | stringsAsFactors = FALSE) 52 | 53 | ## ========================================================================== ## 54 | ## Plot overlap of detected transcripts 55 | df0 <- do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) { 56 | read.delim(paste0( 57 | ds, "/sqanti/", ds, sqanticond, "/", ds, sqanticond, 58 | "_classification.txt"), 59 | header = TRUE, as.is = TRUE) %>% 60 | dplyr::filter(structural_category %in% c("full-splice_match", 61 | "incomplete-splice_match")) %>% 62 | dplyr::select(associated_transcript) %>% 63 | dplyr::rename(transcript_id = associated_transcript) %>% 64 | dplyr::distinct() %>% 65 | dplyr::mutate(dataset = remapds[ds], 66 | identified = TRUE) 67 | })) 68 | dettx <- unique(df0$transcript_id) 69 | detmat <- matrix(0, nrow = length(dettx), ncol = length(unique(df0$dataset))) 70 | rownames(detmat) <- dettx 71 | colnames(detmat) <- unique(df0$dataset) 72 | detmat[as.matrix(df0 %>% dplyr::select(transcript_id, dataset))] <- 1 73 | png(gsub("\\.rds$", "_shared_transcripts.png", outrds), 74 | width = 10, height = 6, unit = "in", res = 400) 75 | UpSetR::upset(data.frame(detmat, check.names = FALSE), order.by = "freq") 76 | grid::grid.edit("arrange", name = "arrange2") 77 | upsetplot <- grid::grid.grab() 78 | dev.off() 79 | 80 | ## ========================================================================== ## 81 | ## Plot abundance of identified/unidentified transcripts 82 | df1 <- df0 %>% 83 | dplyr::full_join( 84 | do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) { 85 | ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = remapds[ds]) 86 | })), 87 | by = c("transcript_id", "dataset") 88 | ) %>% 89 | dplyr::mutate(identified = replace(identified, is.na(identified), FALSE)) 90 | 91 | png(gsub("\\.rds$", "_illumina_abundance_by_identification.png", outrds), 92 | width = 8, height = 4, unit = "in", res = 400) 93 | ggplot(df1, aes(x = identified, y = ave_tx_tpm + 1)) + 94 | geom_boxplot() + facet_wrap(~ dataset, nrow = 1) + theme_bw() + 95 | scale_y_log10() + xlab("Transcript identified by flair") + 96 | ylab("Average abundance across Illumina samples (TPM + 1)") 97 | dev.off() 98 | 99 | ## Merge with upset plot 100 | df2 <- df1 %>% dplyr::select(-dataset) %>% 101 | dplyr::group_by(transcript_id, ave_tx_tpm) %>% 102 | dplyr::summarize(identified = any(identified)) 103 | p1 <- ggplot(df2, aes(x = identified, y = ave_tx_tpm + 1)) + 104 | geom_boxplot() + theme_bw() + 105 | scale_y_log10() + xlab("Transcript identified by flair") + 106 | ylab("Average abundance across Illumina samples (TPM + 1)") 107 | png(gsub("\\.rds$", "_illumina_abundance_plus_shared_tx.png", outrds), 108 | width = 12, height = 6, unit = "in", res = 400) 109 | print(cowplot::plot_grid( 110 | upsetplot, 111 | p1, 112 | rel_widths = c(4, 1.5), nrow = 1, labels = c("A", "B") 113 | )) 114 | dev.off() 115 | 116 | ## ========================================================================== ## 117 | ## Get number of exons and read length per flair transcript and plot for 118 | ## different structural categories 119 | nbrexons <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 120 | read.delim(paste0( 121 | ds, "/sqanti/", ds, sqanticond, "/", ds, sqanticond, "_classification.txt"), 122 | header = TRUE, as.is = TRUE) %>% 123 | dplyr::mutate(dataset = remapds[ds]) 124 | })) %>% 125 | dplyr::mutate(exons = replace(exons, exons > 2, ">2")) 126 | 127 | ## ========================================================================== ## 128 | ## Plot distribution of structural categories 129 | dfcc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 130 | tmap <- read.delim(paste0( 131 | ds, "/sqanti/", ds, sqanticond, "/", ds, sqanticond, "_classification.txt"), 132 | header = TRUE, as.is = TRUE) 133 | as.data.frame(table(tmap$structural_category)) %>% 134 | dplyr::mutate(dataset = remapds[ds]) %>% 135 | dplyr::rename(structural_category = Var1) %>% 136 | dplyr::mutate(structural_category = as.character(structural_category)) 137 | })) %>% 138 | dplyr::mutate(structural_category = factor(structural_category, levels = c( 139 | "novel_not_in_catalog", 140 | "novel_in_catalog", 141 | "antisense", 142 | "intergenic", 143 | "genic", 144 | "genic_intron", 145 | "fusion", 146 | "incomplete-splice_match", 147 | "full-splice_match" 148 | ))) 149 | 150 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_structuralcategory.png", outrds), 151 | width = 11, height = 11, unit = "in", res = 400) 152 | p0 <- ggplot(dfcc, aes(x = dataset, y = Freq)) + 153 | geom_bar(stat = "identity", position = "fill", 154 | aes(fill = structural_category)) + theme_bw() + 155 | scale_fill_manual(values = structure(colfun(15), 156 | names = levels(dfcc$structural_category)), 157 | name = "Overlap type (structural category)") + 158 | ylab("Fraction of transcripts") + xlab("") + 159 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 160 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 161 | geom_text(data = dfcc %>% dplyr::group_by(dataset) %>% 162 | dplyr::summarize(nbrTx = sum(Freq)), 163 | aes(x = dataset, y = 1, label = nbrTx), 164 | vjust = 0, nudge_y = 0.02, size = 3) 165 | p1 <- ggplot(nbrexons %>% 166 | dplyr::group_by(dataset, structural_category, exons) %>% 167 | dplyr::tally() %>% 168 | dplyr::ungroup() %>% 169 | dplyr::mutate(exons = factor(exons, 170 | levels = c(">2", "2", "1"))), 171 | aes(x = structural_category, y = n, fill = exons)) + 172 | geom_bar(stat = "identity", position = "fill") + theme_bw() + 173 | facet_wrap(~ dataset, nrow = 1) + 174 | xlab("Structural category") + ylab("Fraction of transcripts") + 175 | scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"), 176 | name = "Number of\nexons") + 177 | theme(legend.position = "bottom", 178 | axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 179 | scale_y_continuous(expand = c(0, 0, 0.05, 0)) 180 | p2 <- ggplot(nbrexons, 181 | aes(x = structural_category, y = length)) + 182 | geom_boxplot() + theme_bw() + 183 | facet_wrap(~ dataset, nrow = 1) + 184 | scale_y_log10() + 185 | theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 186 | xlab("Structural category") + ylab("Inferred transcript length") 187 | cowplot::plot_grid(p0, p1, p2, ncol = 1, labels = c("A", "B", "C"), 188 | rel_heights = c(1.7, 1.08, 0.9), align = "v", axis = "l") 189 | dev.off() 190 | 191 | png(gsub("\\.rds$", "_dist_to_tss_tts_by_structuralcategory.png", outrds), 192 | width = 11, height = 11, unit = "in", res = 400) 193 | p0 <- ggplot(nbrexons %>% dplyr::filter(!is.na(diff_to_TSS)) %>% 194 | droplevels(), 195 | aes(x = structural_category, y = diff_to_TSS)) + 196 | geom_violin() + theme_bw() + 197 | facet_wrap(~ dataset, nrow = 1) + 198 | theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 199 | xlab("Structural category") + ylab("Distance to TSS") 200 | p1 <- ggplot(nbrexons %>% dplyr::filter(!is.na(diff_to_TTS)) %>% 201 | droplevels(), 202 | aes(x = structural_category, y = diff_to_TTS)) + 203 | geom_violin() + theme_bw() + 204 | facet_wrap(~ dataset, nrow = 1) + 205 | theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 206 | xlab("Structural category") + ylab("Distance to TTS") 207 | p2 <- ggplot(nbrexons, 208 | aes(x = structural_category, y = perc_A_downstream_TTS)) + 209 | geom_violin() + theme_bw() + 210 | facet_wrap(~ dataset, nrow = 1) + 211 | theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 212 | xlab("Structural category") + ylab("Percentage A\ndownstream of TTS") 213 | cowplot::plot_grid(p0, p1, p2, ncol = 1, labels = c("A", "B", "C"), 214 | rel_heights = c(1, 1, 1), align = "v", axis = "l") 215 | dev.off() 216 | 217 | 218 | saveRDS(NULL, file = outrds) 219 | date() 220 | sessionInfo() -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_txcov_sirv.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(dplyr) 8 | library(ggplot2) 9 | library(stringr) 10 | library(cowplot) 11 | library(GenomicAlignments) 12 | }) 13 | 14 | labels <- strsplit(labels, ",")[[1]] 15 | 16 | print(dataset) 17 | print(labels) 18 | print(title) 19 | print(outrds) 20 | 21 | ################################################################################ 22 | ## Number of covered transcripts, covered fraction 23 | ################################################################################ 24 | dfct_primary <- do.call(dplyr::bind_rows, lapply(dataset, function(ds) { 25 | rd <- readRDS(paste0(ds, "/output/", ds, "_nbr_reads.rds")) 26 | rdt <- rd$txomebams_p0.99 27 | do.call(dplyr::bind_rows, lapply(names(rdt), function(nm) { 28 | rdt[[nm]]$allAlignments %>% dplyr::filter(flag %in% c(0, 16)) %>% 29 | dplyr::mutate(sample = nm) %>% 30 | dplyr::mutate(dataset = ds) 31 | })) 32 | })) 33 | 34 | dfct_longest <- do.call(dplyr::bind_rows, lapply(dataset, function(ds) { 35 | rd <- readRDS(paste0(ds, "/output/", ds, "_nbr_reads.rds")) 36 | rdt <- rd$txomebams_p0.99 37 | do.call(dplyr::bind_rows, lapply(names(rdt), function(nm) { 38 | rdt[[nm]]$allAlignments %>% 39 | dplyr::select(read, flag, rname, nbrM, nbrS, nbrH, nbrD, nbrI, 40 | txLength, alignedLength) %>% 41 | dplyr::filter(flag %in% c(0, 16, 256, 272)) %>% 42 | dplyr::group_by(read) %>% 43 | dplyr::filter(alignedLength/max(alignedLength) > 0.9) %>% 44 | dplyr::arrange(desc((nbrM + nbrD)/txLength)) %>% 45 | dplyr::slice(1) %>% 46 | dplyr::ungroup() %>% 47 | dplyr::mutate(sample = nm) %>% 48 | dplyr::mutate(dataset = ds) 49 | })) 50 | })) 51 | 52 | plotCovFractionAll <- function(plotdf, maxLength) { 53 | ggplot(plotdf %>% dplyr::mutate(covFraction = (nbrM + nbrD)/txLength), 54 | aes(x = "All transcripts", y = covFraction)) + geom_violin() + 55 | scale_y_continuous(limits = c(0, NA)) + 56 | theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 57 | xlab("") + ylab("Fraction of transcript covered per alignment") + 58 | stat_summary(data = plotdf %>% 59 | dplyr::summarize(covFraction = length(txLength)), 60 | fun.data = function(x) {return(c(y = 1, label = x))}, 61 | geom = "text", alpha = 1, size = 2.5, vjust = -1) 62 | } 63 | 64 | plotCovFractionStrat <- function(plotdf, maxLength) { 65 | ggplot(plotdf %>% 66 | dplyr::mutate(txLengthGroup = Hmisc::cut2(txLength, 67 | cuts = c(0, 500, 1000, 1500, 68 | 2000, maxLength))) %>% 69 | dplyr::mutate(covFraction = (nbrM + nbrD)/txLength), 70 | aes(x = txLengthGroup, y = covFraction)) + 71 | geom_violin() + scale_y_continuous(limits = c(0, NA)) + 72 | theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 73 | xlab("Transcript length") + ylab("Fraction of transcript covered per alignment") + 74 | stat_summary(data = plotdf %>% 75 | dplyr::mutate(txLengthGroup = Hmisc::cut2(txLength, 76 | cuts = c(0, 500, 1000, 1500, 77 | 2000, maxLength))) %>% 78 | dplyr::group_by(txLengthGroup) %>% 79 | dplyr::summarize(covFraction = length(txLength)), 80 | fun.data = function(x) {return(c(y = 1, label = x))}, 81 | geom = "text", alpha = 1, size = 2.5, vjust = -1) 82 | } 83 | 84 | png(gsub("\\.rds$", "_coverage_fraction_of_transcripts_primary.png", outrds), 85 | width = 5, height = 5, unit = "in", res = 400) 86 | plot_title <- cowplot::ggdraw() + cowplot::draw_label(title, fontface = 'bold') 87 | p0 <- cowplot::plot_grid( 88 | plotCovFractionAll(dfct_primary, max(dfct_primary$txLength)), 89 | plotCovFractionStrat(dfct_primary, max(dfct_primary$txLength)), 90 | rel_widths = c(2, 5), labels = labels 91 | ) 92 | print(cowplot::plot_grid(plot_title, p0, rel_heights = c(0.075, 1), labels = "", ncol = 1)) 93 | dev.off() 94 | 95 | png(gsub("\\.rds$", "_coverage_fraction_of_transcripts_longest.png", outrds), 96 | width = 5, height = 5, unit = "in", res = 400) 97 | plot_title <- cowplot::ggdraw() + cowplot::draw_label(title, fontface = 'bold') 98 | p0 <- cowplot::plot_grid( 99 | plotCovFractionAll(dfct_longest, max(dfct_longest$txLength)), 100 | plotCovFractionStrat(dfct_longest, max(dfct_longest$txLength)), 101 | rel_widths = c(2, 5), labels = labels 102 | ) 103 | print(cowplot::plot_grid(plot_title, p0, rel_heights = c(0.075, 1), labels = "", ncol = 1)) 104 | dev.off() 105 | 106 | saveRDS(list(covfraclongest = plotCovFractionStrat(dfct_longest, 107 | max(dfct_longest$txLength))), 108 | file = outrds) 109 | date() 110 | sessionInfo() 111 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/plot_txlength_vs_basecoverage.R: -------------------------------------------------------------------------------- 1 | args <- (commandArgs(trailingOnly = TRUE)) 2 | for (i in 1:length(args)) { 3 | eval(parse(text = args[[i]])) 4 | } 5 | 6 | suppressPackageStartupMessages({ 7 | library(bamsignals) 8 | library(dplyr) 9 | library(ggplot2) 10 | library(tximport) 11 | library(tibble) 12 | }) 13 | source("manuscript_results_figures/Rscripts/remap_sample_names.R") 14 | 15 | datasets <- strsplit(datasets, ",")[[1]] 16 | datasets <- c(datasets, "Illumina") 17 | conditions <- strsplit(conditions, ",")[[1]] 18 | 19 | print(datasets) 20 | print(conditions) 21 | print(tx2gene) 22 | print(gtf) 23 | print(outrds) 24 | 25 | tx2gene <- readRDS(tx2gene) 26 | genes <- rtracklayer::import(gtf) 27 | genes <- subset(genes, type == "exon") 28 | 29 | ## Get coverages for all bases in all transcripts, for all samples 30 | baseCoverages <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) { 31 | print(paste0(ds, ":")) 32 | if (ds == "Illumina") { 33 | files <- list.files(paste0(ds, "/STAR"), 34 | pattern = "_Aligned.sortedByCoord.out.bam$", 35 | recursive = TRUE, full.names = TRUE) 36 | names(files) <- gsub("_Aligned.sortedByCoord.out.bam", 37 | "", basename(files)) 38 | files <- files[names(files) %in% 39 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 40 | } else { 41 | files <- list.files(paste0(ds, "/minimap2genome"), 42 | pattern = "_minimap_genome_s.bam$", 43 | recursive = TRUE, full.names = TRUE) 44 | names(files) <- gsub("_minimap_genome_s.bam", 45 | "", basename(files)) 46 | files <- files[names(files) %in% 47 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 48 | } 49 | do.call(dplyr::bind_rows, lapply(names(files), function(nm) { 50 | print(paste0(" ", nm)) 51 | sigs <- bamCoverage(files[nm], genes, verbose = FALSE) 52 | as.data.frame(genes) %>% 53 | dplyr::mutate(totCov = sapply(sigs, sum), 54 | totNBases = sapply(sigs, length), 55 | totNonzero = sapply(sigs, function(w) sum(w > 0))) %>% 56 | dplyr::group_by(transcript_id) %>% 57 | dplyr::summarize(txLength = sum(width), 58 | totCov = sum(totCov), 59 | totNBases = sum(totNBases), 60 | totNonzero = sum(totNonzero), 61 | txBiotype = unique(transcript_biotype), 62 | gene = unique(gene_id), 63 | geneName = unique(gene_name), 64 | geneBiotype = unique(gene_biotype)) %>% 65 | dplyr::mutate(dataset = remapds[ds], 66 | sample = remap[nm]) 67 | })) 68 | })) 69 | 70 | ## Add expression (TPM) in Illumina samples 71 | files <- list.files("Illumina/salmon31", pattern = "quant.sf", 72 | recursive = TRUE, full.names = TRUE) 73 | names(files) <- basename(dirname(files)) 74 | files <- files[names(files) %in% 75 | sample_annotation$sample_orig[sample_annotation$condition %in% conditions]] 76 | salmon <- tximport(files = files, type = "salmon", txOut = TRUE) 77 | 78 | tpm <- as.data.frame(salmon$abundance) %>% 79 | tibble::rownames_to_column("transcript_id") %>% 80 | dplyr::mutate(transcript_id = gsub("\\.[0-9]+$", "", transcript_id)) %>% 81 | tidyr::gather(key = sample, value = TPM, -transcript_id) %>% 82 | dplyr::mutate(sample = remap[sample], 83 | dataset = "Illumina") 84 | 85 | df <- baseCoverages %>% 86 | dplyr::left_join(tpm, by = c("sample", "dataset", "transcript_id")) 87 | 88 | ## Plot transcript length vs number of covered bases for Illumina data, 89 | ## at different expression cutoffs 90 | plots <- list() 91 | png(gsub("\\.rds$", paste0("_txlength_vs_numcovbases_illumina_minTPM1.png"), outrds), 92 | width = 7, height = 7, unit = "in", res = 400) 93 | df0 <- df %>% dplyr::filter(dataset == "Illumina") %>% 94 | dplyr::group_by(transcript_id) %>% 95 | dplyr::summarize(aveTPM = mean(TPM), 96 | aveNbrNonzero = mean(totNonzero), 97 | txLength = mean(txLength)) %>% 98 | dplyr::filter(aveTPM > 1) 99 | plots[[paste0("txlength_vs_basecoverage_minTPM1")]] <- 100 | ggplot(df0, aes(x = txLength, y = aveNbrNonzero)) + 101 | geom_abline(slope = 1, intercept = 0) + 102 | geom_hex(bins = 100, aes(fill = stat(density))) + 103 | scale_fill_gradient(name = "", low = "bisque2", high = "darkblue") + 104 | theme_bw() + 105 | xlab("Transcript length") + 106 | ylab("Average number of covered bases in Illumina samples") + 107 | ggtitle(paste0("Transcripts with average TPM > 1")) 108 | print(plots[[paste0("txlength_vs_basecoverage_minTPM1")]]) 109 | dev.off() 110 | 111 | saveRDS(plots, file = outrds) 112 | date() 113 | sessionInfo() 114 | 115 | -------------------------------------------------------------------------------- /manuscript_results_figures/Rscripts/remap_sample_names.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(dplyr) 3 | library(tidyr) 4 | }) 5 | 6 | ## Define remapping of sample names 7 | remap <- c(Srpk_1 = "ONT-NSK007-HAP_srpk_1", 8 | Srpk_2 = "ONT-NSK007-HAP_srpk_2", 9 | wt1 = "ONT-NSK007-HAP_wt_1", 10 | wt2 = "ONT-NSK007-HAP_wt_2", 11 | wt_4_FGCZ = "ONT-PCS108-HAP_wt_4", 12 | `20171207_1645_p2557_4017_2_ALLREADS.pass` = "ONT-PCS108-HAP_wt_4", 13 | srpk_1_DCS108 = "ONT-DCS108-HAP_srpk_1", 14 | srpk_2_DCS108 = "ONT-DCS108-HAP_srpk_2", 15 | wt_1_DCS108 = "ONT-DCS108-HAP_wt_1", 16 | wt_2_DCS108 = "ONT-DCS108-HAP_wt_2", 17 | srpk_1_RNA001 = "ONT-RNA001-HAP_srpk_1", 18 | srpk_2_RNA001 = "ONT-RNA001-HAP_srpk_2", 19 | srpk_3_RNA001 = "ONT-RNA001-HAP_srpk_3", 20 | srpk_4_RNA001 = "ONT-RNA001-HAP_srpk_4", 21 | srpk_5_RNA001 = "ONT-RNA001-HAP_srpk_5", 22 | srpk_6_RNA001 = "ONT-RNA001-HAP_srpk_6", 23 | wt_1_RNA001 = "ONT-RNA001-HAP_wt_1", 24 | wt_2_RNA001 = "ONT-RNA001-HAP_wt_2", 25 | wt_3_RNA001 = "ONT-RNA001-HAP_wt_3", 26 | wt_4_RNA001 = "ONT-RNA001-HAP_wt_4", 27 | wt_5_RNA001 = "ONT-RNA001-HAP_wt_5", 28 | wt_6_RNA001 = "ONT-RNA001-HAP_wt_6", 29 | SS2_srpk_1 = "TempSwitch-HAP_srpk_1", 30 | SS2_srpk_2 = "TempSwitch-HAP_srpk_2", 31 | SS2_wt_1 = "TempSwitch-HAP_wt_1", 32 | SS2_wt_2 = "TempSwitch-HAP_wt_2", 33 | `20170918.A-dSprk1_1` = "Illumina_srpk_1", 34 | `20170918.A-dSprk1_2` = "Illumina_srpk_2", 35 | `20170918.A-dSprk1_3` = "Illumina_srpk_3", 36 | `20170918.A-dSprk1_4` = "Illumina_srpk_4", 37 | `20170918.A-WT_1` = "Illumina_wt_1", 38 | `20170918.A-WT_2` = "Illumina_wt_2", 39 | `20170918.A-WT_3` = "Illumina_wt_3", 40 | `20170918.A-WT_4` = "Illumina_wt_4", 41 | HEK_1 = "ONT-RNA001-HEK_wt_1", 42 | HEK_2 = "ONT-RNA001-HEK_wt_2", 43 | HEK_3 = "ONT-RNA001-HEK_wt_3", 44 | HEK_4 = "ONT-RNA001-HEK_wt_4", 45 | HEK_5 = "ONT-RNA001-HEK_wt_5", 46 | `20181026_0911_p2557_4959_1.pass` = "ONT-PCS109-HAP_wt_4", 47 | wt4 = "ONT-SQK-PCS109-HAP_wt_4", 48 | p2557_5265_1 = "ONT-SQK-PCS109-HAP_wt_1", 49 | p2557_5265_2 = "ONT-SQK-PCS109-HAP_wt_2", 50 | p2557_5265_5 = "ONT-SQK-PCS109-HAP_srpk_1", 51 | p2557_5265_6 = "ONT-SQK-PCS109-HAP_srpk_2") 52 | 53 | ## Create a sample annotation table 54 | sample_annotation <- data.frame( 55 | sample_orig = names(remap), 56 | sample_remap = remap, 57 | stringsAsFactors = FALSE 58 | ) %>% 59 | tidyr::separate(sample_remap, into = c("dataset", "condition", "sample_nbr"), 60 | sep = "_", remove = FALSE) 61 | 62 | ## Define remapping of data set names 63 | remapds <- c(pilot = "ONT-NSK007-HAP", 64 | NSK007 = "TempSwitch-HAP", 65 | DCS108 = "ONT-DCS108-HAP", 66 | RNA001 = "ONT-RNA001-HAP", 67 | HEK293RNA = "ONT-RNA001-HEK", 68 | FGCZ = "ONT-PCS108-HAP", 69 | Illumina = "Illumina", 70 | FGCZ_PCS109 = "ONT-PCS109-HAP", 71 | FGCZ_SQK_PCS109 = "ONT-SQK-PCS109-HAP", 72 | FGCZ_PCS109_GridION = "ONT-SQK-PCS109-HAP") 73 | 74 | ## Define color schemes 75 | ds_colors <- c(`ONT-DCS108-HAP`= "#66C2A5", `ONT-NSK007-HAP` = "#FC8D62", 76 | `ONT-RNA001-HEK` = "#8DA0CB", `TempSwitch-HAP` = "darkgrey", 77 | `ONT-PCS108-HAP` = "#A6D854", `ONT-RNA001-HAP` = "#FFD92F", 78 | Illumina = "#B3B3B3", `ONT-PCS109-HAP` = "#E78AC3", 79 | `ONT-SQK-PCS109-HAP` = "darkblue") 80 | 81 | ds_order <- c("Illumina", "ONT-NSK007-HAP", "ONT-DCS108-HAP", "ONT-RNA001-HAP", 82 | "ONT-RNA001-HEK") 83 | 84 | ## Help function to remove the dataset name from the sample name 85 | removeDatasetFromSample <- function(sample, dataset) { 86 | sapply(seq_along(sample), function(i) gsub(paste0(dataset[i], "_"), "", sample[i])) 87 | } 88 | --------------------------------------------------------------------------------