├── LICENSE
├── Makefile
├── README.md
├── Rscripts
    ├── extract_observed_junctions.R
    ├── filter_fastq.R
    ├── generate_tx2gene.R
    ├── get_nbr_reads.R
    ├── get_nbr_reads_NA12878public.R
    ├── get_polyA_tail_lengths.R
    ├── get_reads_aligning_to_genome_but_not_txome.R
    ├── gff2gtf.R
    ├── list_packages.R
    ├── merge_promoter_bed_files.R
    ├── plot_abundances.R
    ├── plot_abundances_template.Rmd
    ├── plot_compare_flair_annotation.R
    ├── plot_distance_primary_supplementary_alignments.R
    ├── plot_eq_class_counts.R
    ├── plot_qc.R
    ├── summarize_abundances.R
    ├── summarize_abundances_illumina.R
    └── summarize_star_sjdbs.R
├── datasets_to_include.mk
├── manuscript_results_figures
    └── Rscripts
    │   ├── plot_alignment.R
    │   ├── plot_compare_gffcompare_sqanti.R
    │   ├── plot_consistency_between_replicates.R
    │   ├── plot_flair_round2_summary.R
    │   ├── plot_flair_summary.R
    │   ├── plot_gc_content_genomenotxome.R
    │   ├── plot_illumina_nanopore_lengths_all_datasets.R
    │   ├── plot_nbr_assigned_reads.R
    │   ├── plot_nbr_assigned_reads_biotype.R
    │   ├── plot_nbr_detected_features.R
    │   ├── plot_nbr_junctions_per_read_illumina.R
    │   ├── plot_nbr_reads.R
    │   ├── plot_nbr_tx_in_eqclass.R
    │   ├── plot_observed_junctions.R
    │   ├── plot_palindromes.R
    │   ├── plot_polya_tail_estimates.R
    │   ├── plot_primary_supplementary.R
    │   ├── plot_rseqc.R
    │   ├── plot_salmon_variants_all_datasets.R
    │   ├── plot_sqanti_round2_summary.R
    │   ├── plot_sqanti_summary.R
    │   ├── plot_txcov_sirv.R
    │   ├── plot_txcov_subsampling.R
    │   ├── plot_txlength_vs_basecoverage.R
    │   └── remap_sample_names.R
└── reference
    └── chromhmm_promoters
        └── hg38_promoters.bed


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Charlotte Soneson
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## A comprehensive examination of Nanopore native RNA sequencing for characterisation of complex transcriptomes
2 | 
3 | This repository contains the code used to perform the analysis and generate the figures in the paper
4 | 
5 | * C Soneson, Y Yao, A Bratus-Neuenschwander, A Patrignani, MD Robinson, S Hussain: A comprehensive examination of Nanopore native RNA sequencing for characterization of complex transcriptomes
6 | 
7 | The raw data has been uploaded to ArrayExpress, with accession numbers E-MTAB-7757 and E-MTAB-7778. 
8 | 


--------------------------------------------------------------------------------
/Rscripts/extract_observed_junctions.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | print(refgtf)
 7 | print(bam)
 8 | print(paired)
 9 | print(outrds)
10 | 
11 | suppressPackageStartupMessages({
12 |   library(GenomicAlignments)
13 |   library(SGSeq)
14 |   library(GenomicFeatures)
15 | })
16 | 
17 | ## Extract annotated junctions
18 | txdb <- GenomicFeatures::makeTxDbFromGFF(refgtf)
19 | txf <- SGSeq::convertToTxFeatures(txdb)
20 | txf <- txf[SGSeq::type(txf) == "J"]
21 | start(txf) <- start(txf) + 1
22 | end(txf) <- end(txf) - 1
23 | 
24 | ## Get junctions from alignments
25 | if (!paired) {
26 |   x <- GenomicAlignments::readGAlignments(bam, use.names = TRUE)
27 | } else {
28 |   x <- GenomicAlignments::readGAlignmentPairs(bam, use.names = TRUE)
29 | }
30 | juncSum <- GenomicAlignments::summarizeJunctions(x)
31 | 
32 | ## To compare with later: number of exact matches of observed junctions among
33 | ## the annotated ones.
34 | table(ranges(juncSum) %in% ranges(txf))
35 | 
36 | ## Find closest annotated junction to each observed junction
37 | txf_seqnames_char <- as.character(seqnames(txf))
38 | juncsum_seqnames_char <- as.character(seqnames(juncSum))
39 | txf_starts <- start(txf)
40 | txf_ends <- end(txf)
41 | juncsum_starts <- start(juncSum)
42 | juncsum_ends <- end(juncSum)
43 | juncdists <- do.call(rbind, lapply(seq_along(juncSum), function(i) {
44 |   if (i%%1000 == 0) print(i)
45 |   idx <- which(txf_seqnames_char == juncsum_seqnames_char[i])
46 |   dists <- abs(juncsum_starts[i] - txf_starts[idx]) + 
47 |     abs(juncsum_ends[i] - txf_ends[idx])
48 |   tryCatch({
49 |     data.frame(juncSumIdx = i, whichMin = idx[which.min(dists)], minDist = min(dists))
50 |   }, error = function(e) {
51 |     data.frame(juncSumIdx = i, whichMin = NA_integer_, minDist = Inf)
52 |   })
53 | }))
54 | 
55 | rtmp <- ranges(txf)
56 | l <- length(rtmp)
57 | rtmp <- c(rtmp, IRanges(start = 0, end = 0))
58 | jtmp <- juncdists
59 | jtmp$whichMin[is.na(jtmp$whichMin)] <- l + 1
60 | mcols(juncSum)$closestRefJunc <- rtmp[jtmp$whichMin]
61 | mcols(juncSum)$distToClosestRefJunc <- juncdists$minDist
62 | 
63 | saveRDS(juncSum, file = outrds)
64 | date()
65 | sessionInfo()
66 | 


--------------------------------------------------------------------------------
/Rscripts/filter_fastq.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | ## Remove reads that don't have _t or _c in the end of their identifier
 7 | 
 8 | print(fastqin)
 9 | print(fastqout)
10 | 
11 | suppressPackageStartupMessages(library(ShortRead))
12 | 
13 | x <- readFastq(fastqin)
14 | nm <- as.character(x@id)
15 | keep <- grep("_t |_c ", nm)
16 | 
17 | x <- x[keep]
18 | 
19 | writeFastq(x, file = fastqout, mode = "w", compress = TRUE)
20 | 
21 | sessionInfo()
22 | date()
23 | 


--------------------------------------------------------------------------------
/Rscripts/generate_tx2gene.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | ## Parse a transcript fasta file and optionally a gtf file from Ensembl and
 7 | ## extract information about the transcripts.
 8 | 
 9 | suppressPackageStartupMessages({
10 |   library(Biostrings)
11 |   library(rtracklayer)
12 | })
13 | 
14 | print(transcriptome)
15 | print(gtf)
16 | print(outrds)
17 | 
18 | ## Read input files
19 | transcriptome <- readDNAStringSet(transcriptome)
20 | if (!is.null(gtf)) gtf0 <- import(gtf)
21 | 
22 | ## Extract information from fasta file identifiers
23 | tx2gene <- data.frame(t(sapply(as.character(names(transcriptome)), function(nm) {
24 |   a <- strsplit(nm, " ")[[1]]
25 |   tx <- a[1]
26 |   gene <- gsub("^gene:", "", a[grep("^gene:", a)])
27 |   symbol <- gsub("^gene_symbol:", "", a[grep("^gene_symbol:", a)])
28 |   gene_biotype <- gsub("^gene_biotype:", "", a[grep("^gene_biotype:", a)])
29 |   tx_biotype <- gsub("^transcript_biotype:", "", a[grep("^transcript_biotype:", a)])
30 |   position <- gsub("chromosome:|scaffold:", "", a[grep("^chromosome:|^scaffold:", a)])
31 |   c(tx = ifelse(length(tx) != 0, tx, NA), 
32 |     gene = ifelse(length(gene) != 0, gene, NA),
33 |     symbol = ifelse(length(symbol) != 0, symbol, NA),
34 |     gene_biotype = ifelse(length(gene_biotype) != 0, gene_biotype, NA),
35 |     tx_biotype = ifelse(length(tx_biotype) != 0, tx_biotype, NA),
36 |     chromosome = ifelse(length(position) != 0, strsplit(position, ":")[[1]][2], NA),
37 |     start = ifelse(length(position) != 0, strsplit(position, ":")[[1]][3], NA),
38 |     end = ifelse(length(position) != 0, strsplit(position, ":")[[1]][4], NA),
39 |     strand = ifelse(length(position) != 0, strsplit(position, ":")[[1]][5], NA))
40 | })), stringsAsFactors = FALSE)
41 | rownames(tx2gene) <- NULL
42 | 
43 | ## Add information about transcript length
44 | tx2gene$txlength <- width(transcriptome)
45 | 
46 | if (!is.null(gtf)) {
47 |   ## Add information from gtf file
48 |   gtf <- subset(gtf0, type == "transcript")
49 |   gtf$transcript_id_with_version <- paste0(gtf$transcript_id, ".", gtf$transcript_version)
50 |   ## Keep only transcripts that are not present in the fasta file
51 |   gtf <- subset(gtf, !(transcript_id_with_version %in% tx2gene$tx))
52 |   gtfout <- data.frame(tx = gtf$transcript_id_with_version, 
53 |                        gene = paste0(gtf$gene_id, ".", gtf$gene_version),
54 |                        symbol = gtf$gene_name,
55 |                        gene_biotype = gtf$gene_biotype,
56 |                        tx_biotype = gtf$transcript_biotype,
57 |                        chromosome = seqnames(gtf),
58 |                        start = start(gtf),
59 |                        end = end(gtf),
60 |                        strand = strand(gtf),
61 |                        stringsAsFactors = FALSE)
62 |   
63 |   ## Get transcript length by adding up exon lengths for each transcript
64 |   gtfe <- subset(gtf0, type == "exon" & 
65 |                    paste0(transcript_id, ".", transcript_version) %in% gtfout$tx)
66 |   gtfout$txlength <- sapply(gtfout$tx, function(m) {
67 |     sum(width(subset(gtfe, paste0(transcript_id, ".", transcript_version) == m)))
68 |   })
69 |   
70 |   ## Merge information from fasta and gtf
71 |   stopifnot(all(colnames(tx2gene) == colnames(gtfout)))
72 |   tx2gene <- rbind(tx2gene, gtfout)
73 | }
74 | 
75 | saveRDS(tx2gene, file = outrds)
76 | 
77 | sessionInfo()
78 | date()
79 | 


--------------------------------------------------------------------------------
/Rscripts/get_nbr_reads.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | ## Catalog the number of nanopore reads
  7 | 
  8 | print(fastqdir)
  9 | print(genomebamdir)
 10 | print(txomebamdir)
 11 | print(txomebamdir_p0.99)
 12 | print(outrds)
 13 | 
 14 | suppressPackageStartupMessages({
 15 |   library(ShortRead)
 16 |   library(GenomicAlignments)
 17 |   library(dplyr)
 18 |   library(ggplot2)
 19 |   library(data.table)
 20 | })
 21 | 
 22 | ## ========================================================================== ##
 23 | ## Help functions
 24 | ## ========================================================================== ##
 25 | readBam <- function(bamfile) {
 26 |   bam <- readGAlignments(bamfile, use.names = TRUE,
 27 |                          param = ScanBamParam(tag = c("NM"),
 28 |                                               what = c("qname","flag", "rname", 
 29 |                                                        "pos", "mapq")))
 30 |   ops <- GenomicAlignments::CIGAR_OPS
 31 |   wdths <- GenomicAlignments::explodeCigarOpLengths(cigar(bam), ops = ops)
 32 |   keep.ops <- GenomicAlignments::explodeCigarOps(cigar(bam), ops = ops)
 33 |   explodedcigars <- IRanges::CharacterList(relist(paste0(unlist(wdths), 
 34 |                                                          unlist(keep.ops)), wdths))
 35 |   for (opts in setdiff(GenomicAlignments::CIGAR_OPS, "=")) {
 36 |     mcols(bam)[[paste0("nbr", opts)]] <- 
 37 |       sapply(explodedcigars, function(cg) sum(as.numeric(gsub(paste0(opts, "$"), "", cg)), na.rm = TRUE))
 38 |   }
 39 |   mcols(bam)$readLength <- rowSums(as.matrix(mcols(bam)[, c("nbrS", "nbrH", "nbrM", "nbrI")]))
 40 |   bam
 41 | }
 42 | 
 43 | makeReadDf <- function(bam) {
 44 |   tmp <- data.frame(bam %>% setNames(NULL), stringsAsFactors = FALSE) %>%
 45 |     dplyr::rename(read = qname,
 46 |                   nbrJunctions = njunc) %>%
 47 |     dplyr::select(-cigar) %>%
 48 |     dplyr::mutate(alignedLength = nbrM + nbrI) ## equivalent to readLength-nbrS-nbrH
 49 |   
 50 |   tmp2 <- as.data.frame(table(names(subset(bam, flag %in% c(0, 16)))))
 51 |   if (nrow(tmp2) == 0) tmp2 <- data.frame(Var1 = tmp$read[1], Freq = 0)
 52 |   tmp <- tmp %>% 
 53 |     dplyr::left_join(tmp2 %>% dplyr::rename(read = Var1, nbrPrimaryAlignments = Freq))
 54 | 
 55 |   tmp3 <- as.data.frame(table(names(subset(bam, flag %in% c(256, 272)))))
 56 |   if (nrow(tmp3) == 0) tmp3 <- data.frame(Var1 = tmp$read[1], Freq = 0)
 57 |   tmp <- tmp %>% 
 58 |     dplyr::left_join(tmp3 %>% dplyr::rename(read = Var1, nbrSecondaryAlignments = Freq))
 59 | 
 60 |   tmp4 <- as.data.frame(table(names(subset(bam, flag %in% c(2048, 2064)))))
 61 |   if (nrow(tmp4) == 0) tmp4 <- data.frame(Var1 = tmp$read[1], Freq = 0)
 62 |   tmp <- tmp %>% 
 63 |     dplyr::left_join(tmp4 %>% dplyr::rename(read = Var1, nbrSupplementaryAlignments = Freq))
 64 |   
 65 |   tmp %>% dplyr::mutate(nbrSecondaryAlignments = replace(nbrSecondaryAlignments, 
 66 |                                                          is.na(nbrSecondaryAlignments), 0),
 67 |                         nbrSupplementaryAlignments = replace(nbrSupplementaryAlignments, 
 68 |                                                              is.na(nbrSupplementaryAlignments), 0))
 69 | }
 70 | 
 71 | makeSummaryList <- function(bam) {
 72 |   list(nAlignments = length(bam), 
 73 |        nAlignedReads = length(unique(names(bam))),
 74 |        nPrimaryAlignments = length(subset(bam, flag %in% c(0, 16))), 
 75 |        nReadsWithPrimaryAlignments = length(unique(names(subset(bam, flag %in% c(0, 16))))),
 76 |        nSecondaryAlignments = length(subset(bam, flag %in% c(256, 272))),
 77 |        nReadsWithSecondaryAlignments = length(unique(names(subset(bam, flag %in% c(256, 272))))),
 78 |        nSupplementaryAlignments = length(subset(bam, flag %in% c(2048, 2064))),
 79 |        nReadsWithSupplementaryAlignments = length(unique(names(subset(bam, flag %in% c(2048, 2064)))))
 80 |   )
 81 | }
 82 | 
 83 | ## ========================================================================== ##
 84 | ## Read individual FASTQ files and get number of reads and read lengths
 85 | fastqfiles <- list.files(fastqdir, pattern = "(FASTQ|fastq)\\.gz$", full.names = TRUE)
 86 | names(fastqfiles) <- gsub("\\.(FASTQ|fastq).gz", "", basename(fastqfiles))
 87 | fastqfiles
 88 | fastqs <- lapply(fastqfiles, function(f) {
 89 |   fastq <- fread(paste0("zcat ", f), sep = "\n", header = FALSE)$V1
 90 |   seq_idxs <- seq(2, length(fastq), by = 4)
 91 |   list(nReads = length(seq_idxs), 
 92 |        reads = do.call(rbind, lapply(seq_idxs, function(i) {
 93 |          data.frame(read = gsub("^@", "", strsplit(fastq[i - 1], " ")[[1]][1]),
 94 |                     readLength = nchar(fastq[i]),
 95 |                     aveBaseQuality = mean(as.numeric(charToRaw(fastq[i + 2])) - 33),
 96 |                     stringsAsFactors = FALSE)
 97 |        })))
 98 | })
 99 | 
100 | ## BAM files, genome alignment
101 | bamfiles <- list.files(genomebamdir, pattern = "_minimap_genome_s.bam$", 
102 |                        recursive = TRUE, full.names = TRUE)
103 | names(bamfiles) <- gsub("_minimap_genome_s.bam", "", basename(bamfiles))
104 | bamfiles
105 | genomebams <- lapply(bamfiles, function(f) {
106 |   bam <- readBam(f)
107 |   tmp <- makeReadDf(bam)
108 |   c(makeSummaryList(bam), 
109 |     list(allAlignments = tmp))
110 | })
111 | for (n in names(genomebams)) {
112 |   genomebams[[n]][["allAlignments"]]$sample <- n
113 | }
114 | 
115 | ## BAM files, transcriptome alignment
116 | bamfilestx <- list.files(txomebamdir, pattern = "_minimap_txome.bam$", 
117 |                          recursive = TRUE, full.names = TRUE)
118 | names(bamfilestx) <- gsub("_minimap_txome.bam", "", basename(bamfilestx))
119 | bamfilestx
120 | txomebams <- lapply(bamfilestx, function(f) {
121 |   bam <- readBam(f)
122 |   tmp <- makeReadDf(bam)
123 |   
124 |   ## Read transcript lengths
125 |   bamheader <- scanBamHeader(f)[[1]]$targets
126 |   txlengths <- data.frame(rname = names(bamheader),
127 |                           txLength = bamheader,
128 |                           stringsAsFactors = FALSE)
129 |   
130 |   c(makeSummaryList(bam), 
131 |     list(allAlignments = tmp %>% dplyr::left_join(txlengths, by = "rname"),
132 |          nTxCoveredByPrimary = length(unique(seqnames(subset(bam, flag %in% c(0, 16)))))))
133 | })
134 | for (n in names(txomebams)) {
135 |   txomebams[[n]][["allAlignments"]]$sample <- n
136 | }
137 | 
138 | ## BAM files, transcriptome alignment, -p 0.99
139 | bamfilestx_p0.99 <- list.files(txomebamdir_p0.99, pattern = "_minimap_txome_p0.99.bam$", 
140 |                                recursive = TRUE, full.names = TRUE)
141 | names(bamfilestx_p0.99) <- gsub("_minimap_txome_p0.99.bam", "", 
142 |                                 basename(bamfilestx_p0.99))
143 | bamfilestx_p0.99
144 | txomebams_p0.99 <- lapply(bamfilestx_p0.99, function(f) {
145 |   bam <- readBam(f)
146 |   tmp <- makeReadDf(bam)
147 |   
148 |   ## Read transcript lengths
149 |   bamheader <- scanBamHeader(f)[[1]]$targets
150 |   txlengths <- data.frame(rname = names(bamheader),
151 |                           txLength = bamheader,
152 |                           stringsAsFactors = FALSE)
153 |   
154 |   c(makeSummaryList(bam),
155 |     list(allAlignments = tmp %>% dplyr::left_join(txlengths, by = "rname"),
156 |          nTxCoveredByPrimary = length(unique(seqnames(subset(bam, flag %in% c(0, 16)))))))
157 | })
158 | for (n in names(txomebams_p0.99)) {
159 |   txomebams_p0.99[[n]][["allAlignments"]]$sample <- n
160 | }
161 | 
162 | nReadTable <- 
163 |   data.frame(sample = names(fastqs), 
164 |              nReads = sapply(fastqs, function(w) w$nReads),
165 |              stringsAsFactors = FALSE) %>%
166 |   dplyr::full_join(data.frame(
167 |     sample = names(genomebams), 
168 |     nAlignedReadsGenome = sapply(genomebams, function(w) w$nAlignedReads),
169 |     nAlignmentsGenome = sapply(genomebams, function(w) w$nAlignments),
170 |     nPrimaryAlignmentsGenome = sapply(genomebams, function(w) w$nPrimaryAlignments),
171 |     nReadsWithPrimaryAlignmentsGenome = sapply(genomebams, function(w) w$nReadsWithPrimaryAlignments),
172 |     nSecondaryAlignmentsGenome = sapply(genomebams, function(w) w$nSecondaryAlignments),
173 |     nReadsWithSecondaryAlignmentsGenome = sapply(genomebams, function(w) w$nReadsWithSecondaryAlignments),
174 |     nSupplementaryAlignmentsGenome = sapply(genomebams, function(w) w$nSupplementaryAlignments),
175 |     nReadsWithSupplementaryAlignmentsGenome = sapply(genomebams, function(w) w$nReadsWithSupplementaryAlignments),
176 |     stringsAsFactors = FALSE)) %>%
177 |   dplyr::full_join(data.frame(
178 |     sample = names(txomebams), 
179 |     nAlignedReadsTxome = sapply(txomebams, function(w) w$nAlignedReads),
180 |     nAlignmentsTxome = sapply(txomebams, function(w) w$nAlignments),
181 |     nTxCoveredByPrimary = sapply(txomebams, function(w) w$nTxCoveredByPrimary),
182 |     nPrimaryAlignmentsTxome = sapply(txomebams, function(w) w$nPrimaryAlignments),
183 |     nReadsWithPrimaryAlignmentsTxome = sapply(txomebams, function(w) w$nReadsWithPrimaryAlignments),
184 |     nSecondaryAlignmentsTxome = sapply(txomebams, function(w) w$nSecondaryAlignments),
185 |     nReadsWithSecondaryAlignmentsTxome = sapply(txomebams, function(w) w$nReadsWithSecondaryAlignments),
186 |     nSupplementaryAlignmentsTxome = sapply(txomebams, function(w) w$nSupplementaryAlignments),
187 |     nReadsWithSupplementaryAlignmentsTxome = sapply(txomebams, function(w) w$nReadsWithSupplementaryAlignments),
188 |     stringsAsFactors = FALSE)) %>%
189 |   dplyr::full_join(data.frame(
190 |     sample = names(txomebams_p0.99), 
191 |     nAlignedReadsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nAlignedReads),
192 |     nAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nAlignments),
193 |     nTxCoveredByPrimary_p0.99 = sapply(txomebams_p0.99, function(w) w$nTxCoveredByPrimary),
194 |     nPrimaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nPrimaryAlignments),
195 |     nReadsWithPrimaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nReadsWithPrimaryAlignments),
196 |     nSecondaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nSecondaryAlignments),
197 |     nReadsWithSecondaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nReadsWithSecondaryAlignments),
198 |     nSupplementaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nSupplementaryAlignments),
199 |     nReadsWithSupplementaryAlignmentsTxome_p0.99 = sapply(txomebams_p0.99, function(w) w$nReadsWithSupplementaryAlignments),
200 |     stringsAsFactors = FALSE))
201 | 
202 | write.table(nReadTable, file = gsub("rds$", "txt", outrds), 
203 |             row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
204 | 
205 | saveRDS(list(fastqs = fastqs, genomebams = genomebams, txomebams = txomebams,
206 |              txomebams_p0.99 = txomebams_p0.99, nReadTable = nReadTable),
207 |         file = outrds)
208 | sessionInfo()
209 | date()
210 | 


--------------------------------------------------------------------------------
/Rscripts/get_nbr_reads_NA12878public.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | ## Catalog the number of nanopore reads
  7 | 
  8 | print(txomebamdir_p0.99)
  9 | print(outrds)
 10 | 
 11 | suppressPackageStartupMessages({
 12 |   library(ShortRead)
 13 |   library(GenomicAlignments)
 14 |   library(dplyr)
 15 |   library(ggplot2)
 16 |   library(data.table)
 17 | })
 18 | 
 19 | ## ========================================================================== ##
 20 | ## Help functions
 21 | ## ========================================================================== ##
 22 | readBam <- function(bamfile) {
 23 |   bam <- readGAlignments(bamfile, use.names = TRUE,
 24 |                          param = ScanBamParam(tag = c("NM"),
 25 |                                               what = c("qname", "flag", "rname", 
 26 |                                                        "pos", "mapq")))
 27 |   mcols(bam)$tmp <- seq_len(length(bam)) %/% 5e5
 28 |   
 29 |   bamout <- do.call(c, lapply(unique(mcols(bam)$tmp), function(i) {
 30 |     tmp <- bam[mcols(bam)$tmp == i]
 31 |     ops <- GenomicAlignments::CIGAR_OPS
 32 |     wdths <- GenomicAlignments::explodeCigarOpLengths(cigar(tmp), ops = ops)
 33 |     keep.ops <- GenomicAlignments::explodeCigarOps(cigar(tmp), ops = ops)
 34 |     explodedcigars <- IRanges::CharacterList(relist(paste0(unlist(wdths), 
 35 |                                                            unlist(keep.ops)), wdths))
 36 |     for (opts in setdiff(GenomicAlignments::CIGAR_OPS, "=")) {
 37 |       mcols(tmp)[[paste0("nbr", opts)]] <- 
 38 |         sapply(explodedcigars, function(cg) sum(as.numeric(gsub(paste0(opts, "$"), "", cg)), na.rm = TRUE))
 39 |     }
 40 |     mcols(tmp)$readLength <- rowSums(as.matrix(mcols(tmp)[, c("nbrS", "nbrH", "nbrM", "nbrI")]))
 41 |     tmp
 42 |   }))
 43 |   
 44 |   bamout
 45 | }
 46 | 
 47 | makeReadDf <- function(bam) {
 48 |   tmp <- data.frame(bam %>% setNames(NULL), stringsAsFactors = FALSE) %>%
 49 |     dplyr::rename(read = qname,
 50 |                   nbrJunctions = njunc) %>%
 51 |     dplyr::select(-cigar) %>%
 52 |     dplyr::mutate(alignedLength = nbrM + nbrI) ## equivalent to readLength-nbrS-nbrH
 53 |   
 54 |   tmp2 <- as.data.frame(table(names(subset(bam, flag %in% c(0, 16)))))
 55 |   if (nrow(tmp2) == 0) tmp2 <- data.frame(Var1 = tmp$read[1], Freq = 0)
 56 |   tmp <- tmp %>% 
 57 |     dplyr::left_join(tmp2 %>% dplyr::rename(read = Var1, nbrPrimaryAlignments = Freq))
 58 |   
 59 |   tmp3 <- as.data.frame(table(names(subset(bam, flag %in% c(256, 272)))))
 60 |   if (nrow(tmp3) == 0) tmp3 <- data.frame(Var1 = tmp$read[1], Freq = 0)
 61 |   tmp <- tmp %>% 
 62 |     dplyr::left_join(tmp3 %>% dplyr::rename(read = Var1, nbrSecondaryAlignments = Freq))
 63 |   
 64 |   tmp %>% dplyr::mutate(nbrSecondaryAlignments = replace(nbrSecondaryAlignments, 
 65 |                                                          is.na(nbrSecondaryAlignments), 0))
 66 | }
 67 | 
 68 | makeSummaryList <- function(bam) {
 69 |   list(nAlignments = length(bam), 
 70 |        nAlignedReads = length(unique(names(bam))),
 71 |        nPrimaryAlignments = length(subset(bam, flag %in% c(0, 16))), 
 72 |        nReadsWithPrimaryAlignments = length(unique(names(subset(bam, flag %in% c(0, 16))))),
 73 |        nSecondaryAlignments = length(subset(bam, flag %in% c(256, 272))),
 74 |        nReadsWithSecondaryAlignments = length(unique(names(subset(bam, flag %in% c(256, 272))))))
 75 | }
 76 | 
 77 | ## ========================================================================== ##
 78 | ## BAM files, transcriptome alignment, -p 0.99
 79 | bamfilestx_p0.99 <- list.files(txomebamdir_p0.99, 
 80 |                                pattern = "_minimap_txome_p0.99.bam$", 
 81 |                                recursive = TRUE, full.names = TRUE)
 82 | names(bamfilestx_p0.99) <- gsub("_minimap_txome_p0.99.bam", "", 
 83 |                                 basename(bamfilestx_p0.99))
 84 | bamfilestx_p0.99
 85 | txomebams_p0.99 <- lapply(bamfilestx_p0.99, function(f) {
 86 |   bam <- readBam(f)
 87 |   tmp <- makeReadDf(bam)
 88 |   
 89 |   ## Read transcript lengths
 90 |   bamheader <- scanBamHeader(f)[[1]]$targets
 91 |   txlengths <- data.frame(rname = names(bamheader),
 92 |                           txLength = bamheader,
 93 |                           stringsAsFactors = FALSE)
 94 |   
 95 |   c(makeSummaryList(bam),
 96 |     list(allAlignments = tmp %>% dplyr::left_join(txlengths, by = "rname")))
 97 | })
 98 | for (n in names(txomebams_p0.99)) {
 99 |   txomebams_p0.99[[n]][["allAlignments"]]$sample <- n
100 | }
101 | 
102 | saveRDS(list(txomebams_p0.99 = txomebams_p0.99),
103 |         file = outrds)
104 | sessionInfo()
105 | date()
106 | 


--------------------------------------------------------------------------------
/Rscripts/get_polyA_tail_lengths.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | ## Estimate the length of the polyA tail
 7 | 
 8 | print(fast5dir)
 9 | print(ncores)
10 | print(outdir)
11 | print(csvfilename)
12 | 
13 | suppressPackageStartupMessages({
14 |   library(tailfindr)
15 | })
16 | 
17 | df <- find_tails(fast5_dir = fast5dir,
18 |                  save_dir = outdir,
19 |                  csv_filename = csvfilename,
20 |                  num_cores = ncores)
21 | 
22 | saveRDS(df, file = paste0(outdir, "/", csvfilename, ".rds"))
23 | date()
24 | sessionInfo()
25 | 
26 | 


--------------------------------------------------------------------------------
/Rscripts/get_reads_aligning_to_genome_but_not_txome.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | print(nbrreadsrds)
 7 | print(outdir)
 8 | 
 9 | ## Extract read names for reads mapping only to genome (not transcriptome)
10 | 
11 | rd <- readRDS(nbrreadsrds)
12 | rdg <- rd$genomebams
13 | rdt_p0.99 <- rd$txomebams_p0.99
14 | 
15 | stopifnot(names(rdg) %in% names(rdt_p0.99),
16 |           names(rdt_p0.99) %in% names(rdg))
17 | 
18 | for (nm in names(rdg)) {
19 |   readnames <- setdiff(rdg[[nm]]$allAlignments$read,
20 |                        rdt_p0.99[[nm]]$allAlignments$read)
21 |   write.table(readnames, 
22 |               file = paste0(outdir, "/", nm, 
23 |                             "_reads_aligning_to_genome_but_not_txome.txt"),
24 |               row.names = FALSE, col.names = FALSE, sep = "\t", quote = FALSE)
25 | }
26 | 
27 | saveRDS(NULL, file = paste0(outdir, "/reads_aligning_to_genome_but_not_txome.rds"))
28 | date()
29 | sessionInfo()
30 | 


--------------------------------------------------------------------------------
/Rscripts/gff2gtf.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | print(gff)
 7 | print(gtf)
 8 | 
 9 | suppressPackageStartupMessages({
10 |   library(rtracklayer)
11 |   library(readr)
12 |   library(withr)
13 | })
14 | 
15 | ## Filter out exons that can not be handled by gffcompare
16 | x <- readr::read_tsv(gff, col_names = FALSE, col_types = "cccddcccc")
17 | dim(x)
18 | message("Excluding the following lines:")
19 | x[x$X5 - x$X4 >= 30000, ]
20 | x <- x[x$X5 - x$X4 < 30000, ]
21 | dim(x)
22 | withr::with_options(c(scipen = 100), 
23 |                     write.table(x, file = gsub("\\.gff$", ".fixed.gff", gff), 
24 |                                 quote = FALSE, sep = "\t", row.names = FALSE, 
25 |                                 col.names = FALSE))
26 | 
27 | x <- rtracklayer::import(gsub("\\.gff$", ".fixed.gff", gff))
28 | x$transcript_id <- as.character(x$group)
29 | x$group <- NULL
30 | rtracklayer::export(x, gtf)
31 | 
32 | date()
33 | sessionInfo()


--------------------------------------------------------------------------------
/Rscripts/list_packages.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | routdirs <- strsplit(routdirs, ",")[[1]]
 7 | 
 8 | print(routdirs)
 9 | print(outtxt)
10 | 
11 | all_packages <- c()
12 | 
13 | for (rd in routdirs) {
14 |   lf <- list.files(rd, full.names = TRUE)
15 |   for (f in lf) {
16 |     x <- readLines(f)
17 |     idx1 <- which(x == "other attached packages:")
18 |     idx2 <- which(x == "loaded via a namespace (and not attached):")
19 |     if (length(idx1) != 0 & length(idx2) != 0) {
20 |       all_packages <- 
21 |         unique(c(all_packages, 
22 |                  do.call(c, lapply((idx1 + 1):(idx2 - 2), function(i) {
23 |                    grep("\\[", setdiff(setdiff(strsplit(x[i], " ")[[1]], " "), ""), 
24 |                         value = TRUE, invert = TRUE)
25 |                  }))))
26 |     }
27 |   }
28 | }
29 | write.table(sort(all_packages), file = outtxt, 
30 |             row.names = FALSE, col.names = FALSE, quote = FALSE, sep = "\t")


--------------------------------------------------------------------------------
/Rscripts/merge_promoter_bed_files.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | suppressPackageStartupMessages({
 7 |   library(rtracklayer)
 8 |   library(GenomicRanges)
 9 |   library(GenomeInfoDb)
10 | })
11 | 
12 | bedsin <- strsplit(bedsin, ",")[[1]]
13 | 
14 | print(bedsin)
15 | print(bedout)
16 | 
17 | ## Read all input bed files and merge into one GRanges object
18 | suppressWarnings(bedall <- do.call(c, lapply(bedsin, function(b) rtracklayer::import(b))))
19 | 
20 | ## Reduce the ranges in the merged GRanges object
21 | bedrr <- GenomicRanges::reduce(bedall)
22 | 
23 | ## Change the seqlevels style to Ensembl
24 | GenomeInfoDb::seqlevelsStyle(bedrr) <- "Ensembl"
25 | 
26 | ## Write reduced promoter regions to bed file
27 | rtracklayer::export(bedrr, con = bedout, format = "bed")
28 | 
29 | date()
30 | sessionInfo()


--------------------------------------------------------------------------------
/Rscripts/plot_abundances.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | suppressPackageStartupMessages({
 7 |   library(ggplot2)
 8 |   library(tibble)
 9 |   library(dplyr)
10 |   library(tidyr)
11 |   library(GGally)
12 |   library(rmarkdown)
13 | })
14 | 
15 | print(abundances)
16 | print(rmdtemplate)
17 | print(outhtml)
18 | 
19 | abundances <- readRDS(abundances)
20 | 
21 | plot_abundance_report <- function(abundances, output_file, output_dir = "./",
22 |                                   output_format = "html_document", 
23 |                                   rmd_template = NULL,
24 |                                   knitr_progress = FALSE, ...){
25 |   output_report <- file.path(output_dir, basename(output_file))
26 |   output_rmd <- file.path(output_dir,
27 |                           paste0(tools::file_path_sans_ext(basename(output_file)),
28 |                                  ".Rmd"))
29 |   template_file <- rmd_template
30 |   file.copy(from = template_file, to = output_rmd, overwrite = TRUE)
31 |   
32 |   args <- list(...)
33 |   args$input <- output_rmd
34 |   args$output_format <- output_format
35 |   args$output_file <- output_file
36 |   args$quiet <- !knitr_progress
37 | 
38 |   output_file <- do.call("render", args = args)
39 |   invisible(output_file)
40 | }
41 | 
42 | plot_abundance_report(abundances = abundances, 
43 |                       output_file = basename(outhtml), 
44 |                       output_dir = dirname(outhtml),
45 |                       rmd_template = rmdtemplate)
46 | 
47 | sessionInfo()
48 | date()
49 | 


--------------------------------------------------------------------------------
/Rscripts/plot_abundances_template.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: ""
  3 | author: ""
  4 | date: ""
  5 | output: html_document
  6 | editor_options: 
  7 |   chunk_output_type: console
  8 | ---
  9 | 
 10 | # Correlation among replicates
 11 | 
 12 | ```{r}
 13 | suppressPackageStartupMessages({
 14 |   library(ggplot2)
 15 |   library(tibble)
 16 |   library(dplyr)
 17 |   library(tidyr)
 18 |   library(GGally)
 19 | })
 20 | ```
 21 | 
 22 | ## Transcript-level
 23 | 
 24 | ```{r txlevel, fig.width = 8, fig.height = 8}
 25 | txab <- abundances$tx_abundances
 26 | 
 27 | quantmethods <- unique(paste0(
 28 |   sapply(strsplit(colnames(txab), "__"), .subset, 2), "__", 
 29 |   sapply(strsplit(colnames(txab), "__"), .subset, 3)
 30 | ))
 31 | 
 32 | txsum <- lapply(quantmethods, function(qm) {
 33 |   tmp <- txab[, grep(paste0(qm, "$"), colnames(txab))]
 34 |   colnames(tmp) <- gsub(paste0("__", qm), "", colnames(tmp))
 35 |   print(ggpairs(tmp,
 36 |                 lower = list(continuous = wrap("points", alpha = 0.3, size = 0.25))) +
 37 |           theme_bw() + ggtitle(qm))
 38 |   print(ggpairs(log10(tmp + 1),
 39 |                 lower = list(continuous = wrap("points", alpha = 0.3, size = 0.25))) +
 40 |           theme_bw() + ggtitle(paste0(qm, ", log10(. + 1)")))
 41 |   cor_srpk <- cor(tmp[, grep("srpk|sprk|Srpk|Sprk", colnames(tmp)), drop = FALSE])
 42 |   cor_wt <- cor(tmp[, grep("wt|WT", colnames(tmp)), drop = FALSE])
 43 |   cor_srpk_log <- cor(log10(tmp[, grep("srpk|sprk|Srpk|Sprk", colnames(tmp)), drop = FALSE] + 1))
 44 |   cor_wt_log <- cor(log10(tmp[, grep("wt|WT", colnames(tmp)), drop = FALSE] + 1))
 45 |   data.frame(method = qm, 
 46 |              cor_srpk_lin = mean(cor_srpk[upper.tri(cor_srpk)]),
 47 |              cor_wt_lin = mean(cor_wt[upper.tri(cor_wt)]),
 48 |              cor_srpk_log = mean(cor_srpk_log[upper.tri(cor_srpk_log)]),
 49 |              cor_wt_log = mean(cor_wt_log[upper.tri(cor_wt_log)]),
 50 |              stringsAsFactors = FALSE)
 51 | })
 52 | txsum <- do.call(rbind, txsum) %>% 
 53 |   tidyr::gather(group, correlation, -method) %>%
 54 |   tidyr::separate(group, into = c("cor", "group", "transformation"))
 55 | ```
 56 | 
 57 | ```{r txlevelsum, fig.width = 8, fig.height = 8}
 58 | levels <- txsum %>% dplyr::group_by(method) %>% 
 59 |   dplyr::summarize(meancorr = mean(correlation)) %>%
 60 |   dplyr::arrange(desc(meancorr)) %>% dplyr::pull(method)
 61 | ggplot(txsum %>% dplyr::mutate(method = factor(method, levels = levels)), 
 62 |        aes(x = method, y = correlation, fill = method)) + 
 63 |   geom_bar(stat = "identity") + theme_bw() + facet_grid(transformation ~ group) + 
 64 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
 65 |         legend.position = "none") + 
 66 |   xlab("")
 67 | ```
 68 | 
 69 | ## Gene-level 
 70 | 
 71 | ```{r genelevel, fig.width = 8, fig.height = 8}
 72 | geneab <- abundances$gene_abundances
 73 | 
 74 | quantmethods <- unique(paste0(
 75 |   sapply(strsplit(colnames(geneab), "__"), .subset, 2), "__", 
 76 |   sapply(strsplit(colnames(geneab), "__"), .subset, 3)
 77 | ))
 78 | 
 79 | genesum <- lapply(quantmethods, function(qm) {
 80 |   tmp <- geneab[, grep(paste0(qm, "$"), colnames(geneab))]
 81 |   colnames(tmp) <- gsub(paste0("__", qm), "", colnames(tmp))
 82 |   print(ggpairs(tmp,
 83 |                 lower = list(continuous = wrap("points", alpha = 0.3, size = 0.25))) + 
 84 |           theme_bw() + ggtitle(qm))
 85 |   print(ggpairs(log10(tmp + 1),
 86 |                 lower = list(continuous = wrap("points", alpha = 0.3, size = 0.25))) + 
 87 |           theme_bw() + ggtitle(paste0(qm, ", log10(. + 1)")))
 88 |   cor_srpk <- cor(tmp[, grep("srpk|sprk|Srpk|Sprk", colnames(tmp)), drop = FALSE])
 89 |   cor_wt <- cor(tmp[, grep("wt|WT", colnames(tmp)), drop = FALSE])
 90 |   cor_srpk_log <- cor(log10(tmp[, grep("srpk|sprk|Srpk|Sprk", colnames(tmp)), drop = FALSE] + 1))
 91 |   cor_wt_log <- cor(log10(tmp[, grep("wt|WT", colnames(tmp)), drop = FALSE] + 1))
 92 |   data.frame(method = qm, 
 93 |              cor_srpk_lin = mean(cor_srpk[upper.tri(cor_srpk)]),
 94 |              cor_wt_lin = mean(cor_wt[upper.tri(cor_wt)]),
 95 |              cor_srpk_log = mean(cor_srpk_log[upper.tri(cor_srpk_log)]),
 96 |              cor_wt_log = mean(cor_wt_log[upper.tri(cor_wt_log)]),
 97 |              stringsAsFactors = FALSE)
 98 | })
 99 | genesum <- do.call(rbind, genesum) %>% 
100 |   tidyr::gather(group, correlation, -method) %>%
101 |   tidyr::separate(group, into = c("cor", "group", "transformation"))
102 | ```
103 | 
104 | ```{r genelevelsum, fig.width = 8, fig.height = 8}
105 | levels <- genesum %>% dplyr::group_by(method) %>% 
106 |   dplyr::summarize(meancorr = mean(correlation)) %>%
107 |   dplyr::arrange(desc(meancorr)) %>% dplyr::pull(method)
108 | ggplot(genesum %>% dplyr::mutate(method = factor(method, levels = levels)), 
109 |        aes(x = method, y = correlation, fill = method)) + 
110 |   geom_bar(stat = "identity") + theme_bw() + facet_grid(transformation ~ group) + 
111 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
112 |         legend.position = "none") + 
113 |   xlab("")
114 | ```
115 | 
116 | ## Session info
117 | 
118 | ```{r}
119 | date()
120 | sessionInfo()
121 | ```
122 | 
123 | 


--------------------------------------------------------------------------------
/Rscripts/plot_compare_flair_annotation.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | print(annotatedGtf)
  7 | print(flairGtf)
  8 | print(dataSet)
  9 | print(outDir)
 10 | print(seed)
 11 | print(nPlots)
 12 | print(flairTmap)  ## gffcompare classification
 13 | print(sqanticlass)  ## SQANTI classification
 14 | print(outrds)
 15 | 
 16 | suppressPackageStartupMessages({
 17 |   library(S4Vectors)
 18 |   library(rtracklayer)
 19 |   library(Gviz)
 20 |   library(dplyr)
 21 | })
 22 | 
 23 | ## Read gtf files
 24 | annotated <- rtracklayer::import(annotatedGtf)
 25 | idx <- match(c("transcript_id", "gene_id", "exon_id"), 
 26 |              colnames(mcols(annotated)))
 27 | colnames(mcols(annotated))[idx] <- c("transcript", "gene", "exon")
 28 | S4Vectors::mcols(annotated)$symbol <- mcols(annotated)$transcript
 29 | annotated <- subset(annotated, type == "exon")
 30 | 
 31 | flair <- rtracklayer::import(flairGtf)
 32 | colnames(mcols(flair))[match("transcript_id", colnames(mcols(flair)))] <- 
 33 |   "transcript"
 34 | S4Vectors::mcols(flair)$symbol <- mcols(flair)$transcript
 35 | flair <- subset(flair, type == "exon")
 36 | 
 37 | ## Read tmap file to sample transcript pairs from and sample nPlots pairs
 38 | ## Also read SQANTI classification info to add on top
 39 | tmap <- read.delim(flairTmap, header = TRUE, as.is = TRUE)
 40 | if (sqanticlass != "") {
 41 |   sqanti <- read.delim(sqanticlass, header = TRUE, as.is = TRUE)
 42 | }
 43 | if (sqanticlass != "") {
 44 |   tmap <- tmap %>% dplyr::left_join(
 45 |     sqanti %>% dplyr::select(isoform, associated_transcript, structural_category),
 46 |     by = c("qry_id" = "isoform")
 47 |   ) %>%
 48 |     dplyr::filter(ref_id == associated_transcript | 
 49 |                     !(class_code %in% c("c", "=")))
 50 | } else {
 51 |   tmap$structural_category <- ""
 52 | }
 53 | set.seed(seed)
 54 | tmap <- tmap[sample(seq_len(nrow(tmap)), size = nPlots, replace = FALSE), ]
 55 | 
 56 | ## Define help function
 57 | compareFlairAnnotatedTx <- function(dataSet, annotatedGtf, flairGtf, 
 58 |                                     annotatedTx, flairTx, classCode, 
 59 |                                     structCat, outDir) {
 60 |   options(ucscChromosomeNames = FALSE)
 61 |   
 62 |   ## Subset gtf files to selected transcripts
 63 |   atmp <- subset(annotatedGtf, transcript == annotatedTx)
 64 |   ftmp <- subset(flairGtf, transcript == flairTx)
 65 |   
 66 |   ## Get width of each transcript
 67 |   awd <- sum(width(atmp))
 68 |   fwd <- sum(width(ftmp))
 69 |   
 70 |   show_chr <- unique(c(seqnames(atmp), seqnames(ftmp)))
 71 |   stopifnot(length(show_chr) == 1)
 72 |   
 73 |   min_coord <- min(
 74 |     min(start(atmp)) - 0.2*(max(end(atmp)) - min(start(atmp))),
 75 |     min(start(ftmp)) - 0.2*(max(end(ftmp)) - min(start(ftmp)))
 76 |   )
 77 |   max_coord <- max(
 78 |     max(end(atmp)) + 0.05*(max(end(atmp)) - min(start(atmp))),
 79 |     max(end(ftmp)) + 0.05*(max(end(ftmp)) - min(start(ftmp)))
 80 |   )
 81 |   
 82 |   agrtr <- Gviz::GeneRegionTrack(atmp, showId = FALSE, col = NULL,
 83 |                                  fill = "blue", name = "annotated",
 84 |                                  background.title = "transparent",
 85 |                                  col.title = "black", min.height = 15)
 86 |   fgrtr <- Gviz::GeneRegionTrack(ftmp, showId = FALSE, col = NULL,
 87 |                                  fill = "orange", name = "flair", 
 88 |                                  background.title = "transparent",
 89 |                                  col.title = "black", min.height = 15)
 90 |   
 91 |   gtr <- Gviz::GenomeAxisTrack()
 92 |   
 93 |   tracks <- c(gtr, agrtr, fgrtr)
 94 |   
 95 |   pdf(paste0(outDir, "/", dataSet, "__flair_comparison__", 
 96 |              gsub(":", "_", annotatedTx), "__", 
 97 |              gsub(":", "_", flairTx), "__", classCode, "__", 
 98 |              structCat, ".pdf"),
 99 |       height = 3, width = 10)
100 |   Gviz::plotTracks(tracks, chromosome = show_chr,
101 |                    from = min_coord, to = max_coord, 
102 |                    main = paste0("annotated: ", annotatedTx, " (", awd,
103 |                                  " nt); flair: ", flairTx, " (", fwd, 
104 |                                  " nt); class code: ", classCode, 
105 |                                  "; structural category: ", structCat),
106 |                    cex.main = 0.7,
107 |                    min.width = 0, min.distance = 0, collapse = FALSE)
108 |   dev.off()
109 | }
110 | 
111 | options(ucscChromosomeNames = FALSE)
112 | for (i in seq_len(nrow(tmap))) {
113 |   compareFlairAnnotatedTx(dataSet = dataSet, annotatedGtf = annotated, 
114 |                           flairGtf = flair, 
115 |                           annotatedTx = tmap$ref_id[i], 
116 |                           flairTx = tmap$qry_id[i], 
117 |                           classCode = tmap$class_code[i],
118 |                           structCat = tmap$structural_category[i], 
119 |                           outDir = outDir)
120 | }
121 | 
122 | saveRDS(tmap, file = outrds)
123 | date()
124 | sessionInfo()
125 | 


--------------------------------------------------------------------------------
/Rscripts/plot_distance_primary_supplementary_alignments.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | ## Find the distance between supplementary alignments and the corresponding
  7 | ## primary alignment
  8 | 
  9 | print(genomebamdir)
 10 | print(outrds)
 11 | 
 12 | suppressPackageStartupMessages({
 13 |   library(GenomicRanges)
 14 |   library(GenomicAlignments)
 15 |   library(dplyr)
 16 |   library(ggplot2)
 17 | })
 18 | 
 19 | bamfiles <- list.files(genomebamdir, pattern = "_minimap_genome_s.bam$", 
 20 |                        recursive = TRUE, full.names = TRUE)
 21 | names(bamfiles) <- gsub("_minimap_genome_s.bam", "", basename(bamfiles))
 22 | bamfiles
 23 | 
 24 | primary_supplementary_distances <- lapply(bamfiles, function(bam) {
 25 |   bam <- readGAlignments(bam, use.names = TRUE, 
 26 |                          param = ScanBamParam(tag=c("NM"),
 27 |                                               what=c("qname","flag", "rname", 
 28 |                                                      "pos", "mapq")))
 29 |   bam <- subset(bam, flag %in% c(0, 16, 2048, 2064))
 30 |   
 31 |   ops <- GenomicAlignments::CIGAR_OPS
 32 |   wdths <- GenomicAlignments::explodeCigarOpLengths(cigar(bam), ops = ops)
 33 |   keep.ops <- GenomicAlignments::explodeCigarOps(cigar(bam), ops = ops)
 34 |   explodedcigars <- IRanges::CharacterList(relist(paste0(unlist(wdths), 
 35 |                                                          unlist(keep.ops)), wdths))
 36 |   for (opts in setdiff(GenomicAlignments::CIGAR_OPS, "=")) {
 37 |     mcols(bam)[[paste0("nbr", opts)]] <- 
 38 |       sapply(explodedcigars, function(cg) sum(as.numeric(gsub(paste0(opts, "$"), "", cg)), na.rm = TRUE))
 39 |   }
 40 |   mcols(bam)$readLength <- rowSums(as.matrix(mcols(bam)[, c("nbrS", "nbrH", "nbrM", "nbrI")]))
 41 |   
 42 |   supplementary_alignments <- subset(bam, flag %in% c(2048, 2064))
 43 |   primary_alignments <- subset(bam, flag %in% c(0, 16))
 44 |   primary_alignments <- primary_alignments[names(primary_alignments) %in% 
 45 |                                              names(supplementary_alignments)]
 46 |   
 47 |   do.call(rbind, lapply(seq_along(supplementary_alignments), function(i) {
 48 |     nm <- names(supplementary_alignments)[i]
 49 |     primary <- as(primary_alignments[nm], "GRanges")
 50 |     supplementary <- as(supplementary_alignments[i], "GRanges")
 51 |     if (all(seqnames(primary) == seqnames(supplementary))) {
 52 |       ## Same chromosome
 53 |       if (all(strand(primary) == strand(supplementary))) {
 54 |         strands <- "same chromosome, same strand"
 55 |       } else {
 56 |         strands <- "same chromosome, different strand"
 57 |       }
 58 |       distn <- GenomicRanges::distance(primary, supplementary, ignore.strand = TRUE)
 59 |       ovlap <- sum(width(GenomicRanges::intersect(as(primary_alignments[nm], "GRangesList"),
 60 |                                                   as(supplementary_alignments[i], "GRangesList"),
 61 |                                                   ignore.strand = TRUE)))
 62 |     } else {
 63 |       strands <- "different chromosomes"
 64 |       distn <- NA
 65 |       ovlap <- NA
 66 |     }
 67 |     nbrMPrimary <- mcols(primary)[["nbrM"]]
 68 |     nbrMSupplementary <- mcols(supplementary)[["nbrM"]]
 69 |     nbrMSuppVsPrim <- nbrMSupplementary/nbrMPrimary
 70 |     data.frame(read = nm, strands = strands, distn = distn, ovlap = ovlap, 
 71 |                nbrMPrimary = nbrMPrimary, nbrMSupplementary = nbrMSupplementary,
 72 |                nbrMSuppVsPrim = nbrMSuppVsPrim, stringsAsFactors = FALSE)
 73 |   }))
 74 | })
 75 | 
 76 | for (nm in names(primary_supplementary_distances)) {
 77 |   primary_supplementary_distances[[nm]]$sample <- nm
 78 | }
 79 | 
 80 | primary_supplementary_distances <- do.call(rbind, primary_supplementary_distances)
 81 | 
 82 | pdf(gsub("rds$", "pdf", outrds), width = 8, height = 7)
 83 | ggplot(primary_supplementary_distances %>% 
 84 |          dplyr::mutate(dist0 = "") %>%
 85 |          dplyr::mutate(dist0 = replace(dist0, distn == 0, ", distance = 0"),
 86 |                        dist0 = replace(dist0, distn > 0, ", distance > 0")) %>% 
 87 |          dplyr::group_by(sample, strands, dist0) %>% dplyr::tally() %>%
 88 |          dplyr::mutate(strands_dist0 = paste0(strands, dist0)),
 89 |        aes(x = sample, y = n, fill = strands_dist0)) + 
 90 |   theme_bw() + geom_bar(stat = "identity") + 
 91 |   theme(legend.position = "bottom",
 92 |         axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
 93 |   ylab("Number of primary/supplementary alignment pairs") + xlab("") + 
 94 |   guides(fill = guide_legend(nrow = 5, byrow = TRUE)) + 
 95 |   scale_fill_manual(values = c("#DC050C", "#1965B0", "#4EB265", "#777777", "#B17BA6"),
 96 |                     name = "")
 97 | dev.off()
 98 | 
 99 | saveRDS(primary_supplementary_distances, file = outrds)
100 | date()
101 | sessionInfo()
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/Rscripts/plot_eq_class_counts.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | ## Plot number of reads per equivalence class, as function of number of
 7 | ## transcripts per equivalence class
 8 | 
 9 | suppressPackageStartupMessages({
10 |   library(dplyr)
11 |   library(ggplot2)
12 | })
13 | 
14 | print(salmondir)
15 | print(outrds)
16 | 
17 | salmondirs <- list.files(salmondir, full.names = TRUE)
18 | salmonfiles <- paste0(salmondirs, "/aux_info/eq_classes.txt")
19 | names(salmonfiles) <- basename(salmondirs)
20 | salmonfiles <- salmonfiles[file.exists(salmonfiles)]
21 | salmonfiles
22 | 
23 | eqcl <- lapply(salmonfiles, function(f) {
24 |   x <- readLines(f)
25 |   n_tr <- as.numeric(x[1])  ## Total number of transcripts
26 |   n_eq <- as.numeric(x[2])  ## Total number of equivalence classes
27 |   tx_id <- x[3:(n_tr + 2)]  ## Transcript IDs
28 |   quants <- x[(n_tr + 3):length(x)]  ## Characteristics of equivalence classes
29 |   
30 |   ## Split equivalence class characteristics. Each element of the list corresponds
31 |   ## to one equivalence class, and lists its number of transcripts, the
32 |   ## transcripts IDs and the total number of reads
33 |   do.call(rbind, lapply(quants, function(w) {
34 |     tmp = strsplit(w, "\\\t")[[1]]
35 |     nbr_tx = as.numeric(tmp[1])
36 |     data.frame(nbr_tx = nbr_tx,
37 |                tx_ids = paste0(tx_id[as.numeric(tmp[2:(1 + nbr_tx)]) + 1], collapse = ","),
38 |                count = as.numeric(tmp[length(tmp)]),
39 |                stringsAsFactors = FALSE)
40 |   }))
41 | })
42 | 
43 | for (nm in names(eqcl)) {
44 |   eqcl[[nm]]$sample <- nm
45 | }
46 | 
47 | eqcl <- do.call(rbind, eqcl)
48 | 
49 | pdf(gsub("rds$", "pdf", outrds))
50 | ggplot(eqcl, aes(x = nbr_tx, y = count)) + geom_point(size = 0.5, alpha = 0.25) + 
51 |   facet_wrap(~ sample) + theme_bw() + scale_y_log10() + 
52 |   xlab("Number of transcripts in equivalence class") + 
53 |   ylab("Number of reads assigned to equivalence class")
54 | dev.off()
55 | 
56 | saveRDS(NULL, file = outrds)
57 | 
58 | sessionInfo()


--------------------------------------------------------------------------------
/Rscripts/plot_qc.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | print(bamfile)
 7 | print(fastqfile)
 8 | print(outrds)
 9 | 
10 | suppressPackageStartupMessages({
11 |   library(GenomicAlignments)
12 |   library(data.table)
13 |   library(ggplot2)
14 | })
15 | 
16 | sbp <- ScanBamParam(tag = "NM")
17 | 
18 | ## Read bam file and corresponding fastq file
19 | bam <- readGAlignments(bamfile, use.names = TRUE, param = sbp)
20 | fastq <- fread(paste0("gunzip -c ", fastqfile), sep = "\n", header = FALSE)
21 | 
22 | ## Split rows in fastq file
23 | id_idxs <- seq(1, nrow(fastq), by = 4)   ## read ids
24 | q_idxs <- seq(4, nrow(fastq), by = 4)   ## quality scores
25 | seq_idxs <- seq(2, nrow(fastq), by = 4)   ## sequence
26 | 
27 | ## Calculate average quality per read
28 | mean_q <- sapply(fastq$V1[q_idxs], function(u) mean(as.numeric(charToRaw(u)) - 33))
29 | names(mean_q) <- gsub("@", "", sapply(strsplit(fastq$V1[id_idxs]," "), .subset, 1))
30 | 
31 | ## Get read lengths
32 | seqlen <- nchar(fastq$V1[seq_idxs])
33 | 
34 | ## Number of soft clipped bases
35 | clip_length <- sapply(explodeCigarOpLengths(cigar(bam), ops = "S"), sum)
36 | 
37 | pdf(gsub("rds$", "pdf", outrds), w = 10, h = 10)
38 | ## mapped length vs number of mismatches
39 | smoothScatter(log10(qwidth(bam) - clip_length), log10(mcols(bam)$NM), 
40 |                xlab = "mapped length (log10)", ylab = "number of mismatches (log10)")
41 | 
42 | ## percent mismatches
43 | pct_mm <- mcols(bam)$NM/(qwidth(bam) - clip_length)
44 | hist(pct_mm[pct_mm >= 0 & pct_mm <= 1], 500, main = "percent mismatches")
45 | 
46 | ## read length
47 | hist(log10(seqlen), 500, main = "read length (log10)")
48 | 
49 | ## mapped length
50 | hist(log10(qwidth(bam) - clip_length), 500, main = "mapped length (log10)")
51 | 
52 | ## read length vs average quality score
53 | smoothScatter(log10(seqlen), mean_q, xlab = "read length (log10)", 
54 |               ylab = "average quality score")
55 | 
56 | df <- data.frame(sequence_length = seqlen, mean_quality = mean_q)
57 | 
58 | p <- ggplot(df) + geom_hex(aes(sequence_length, mean_quality), bins = 100) +
59 |          scale_x_log10() +
60 |          scale_fill_gradientn("", colours = rev(rainbow(10, end = 4/6))) +
61 |          theme(text = element_text(size=20))
62 | print(p)
63 | 
64 | ## average quality score
65 | hist(mean_q, 100, main = "average quality score")
66 | 
67 | ## average quality score vs percent mismatches
68 | m <- match(names(bam), names(mean_q))
69 | smoothScatter(mean_q[m], pct_mm, pch = ".", ylim = c(0, 0.4), 
70 |               xlab = "average quality score", 
71 |               ylab = "percent mismatches")
72 | 
73 | df <- data.frame(mean_quality = mean_q[m], percent_mismatches=pct_mm*100)
74 | 
75 | p <- ggplot(df) + geom_hex(aes(mean_quality, percent_mismatches), bins = 100) +
76 |          scale_x_log10() +
77 |          scale_fill_gradientn("", colours = rev(rainbow(10, end = 4/6))) +
78 |          theme(text = element_text(size=20))
79 | print(p)
80 | 
81 | dev.off()
82 | 
83 | saveRDS(NULL, outrds)
84 | sessionInfo()
85 | date()
86 | 


--------------------------------------------------------------------------------
/Rscripts/summarize_abundances.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | ## Summarize abundances on transcript- and gene-level
  7 | 
  8 | suppressPackageStartupMessages({
  9 |   library(tximport)
 10 |   library(dplyr)
 11 | })
 12 | 
 13 | print(topdir)
 14 | print(tx2gene)
 15 | print(outrds)
 16 | 
 17 | get_file_listing <- function(topdir, subdir, filename) {
 18 |   files <- list.files(paste0(topdir, "/", subdir), full.names = TRUE)
 19 |   nm <- basename(files)
 20 |   files <- paste0(files, "/", filename)
 21 |   names(files) <- nm
 22 |   files
 23 | }
 24 | 
 25 | import_salmon_files <- function(files, name, tx2gene) {
 26 |   txi <- tximport(files, type = "salmon", txOut = TRUE)
 27 |   txig <- summarizeToGene(txi, tx2gene = tx2gene[, c("tx", "gene")])
 28 |   txab <- 
 29 |     dplyr::full_join(
 30 |       txi$counts %>% as.data.frame() %>% 
 31 |         setNames(paste0(names(.), "__count__", name)) %>%
 32 |         tibble::rownames_to_column("tx"),
 33 |       txi$abundance %>% as.data.frame() %>%
 34 |         setNames(paste0(names(.), "__tpm__", name)) %>%
 35 |         tibble::rownames_to_column("tx"),
 36 |       by = "tx"
 37 |     )
 38 |   geneab <- 
 39 |     dplyr::full_join(
 40 |       txig$counts %>% as.data.frame() %>% 
 41 |         setNames(paste0(names(.), "__count__", name)) %>%
 42 |         tibble::rownames_to_column("gene"),
 43 |       txig$abundance %>% as.data.frame() %>%
 44 |         setNames(paste0(names(.), "__tpm__", name)) %>%
 45 |         tibble::rownames_to_column("gene"),
 46 |       by = "gene"
 47 |     )
 48 |   list(txab = txab, geneab = geneab)
 49 | }
 50 | 
 51 | ## Read tx2gene
 52 | tx2gene <- readRDS(tx2gene)
 53 | 
 54 | ## Initialize gene and transcript abundance lists
 55 | gene_abundances <- list()
 56 | tx_abundances <- list()
 57 | 
 58 | ## Parse the subdirectories of topdir to see which abundances are available
 59 | (dirs <- list.files(topdir, full.names = FALSE))
 60 | 
 61 | ## Salmon31
 62 | if ("salmon31" %in% dirs) {
 63 |   message("Salmon31")
 64 |   files <- get_file_listing(topdir, "salmon31", "quant.sf")
 65 |   print(files)
 66 |   abds <- import_salmon_files(files, "salmon31", tx2gene)
 67 |   tx_abundances[["salmon31"]] <- abds$txab
 68 |   gene_abundances[["salmon31"]] <- abds$geneab
 69 | }
 70 | 
 71 | ## Salmonminimap2
 72 | if ("salmonminimap2" %in% dirs) {
 73 |   message("salmonminimap2")
 74 |   files <- get_file_listing(topdir, "salmonminimap2", "quant.sf")
 75 |   print(files)
 76 |   abds <- import_salmon_files(files, "salmonminimap2", tx2gene)
 77 |   tx_abundances[["salmonminimap2"]] <- abds$txab
 78 |   gene_abundances[["salmonminimap2"]] <- abds$geneab
 79 | }
 80 | 
 81 | ## Salmonminimap2_p0.99
 82 | if ("salmonminimap2_p0.99" %in% dirs) {
 83 |   message("salmonminimap2_p0.99")
 84 |   files <- get_file_listing(topdir, "salmonminimap2_p0.99", "quant.sf")
 85 |   print(files)
 86 |   abds <- import_salmon_files(files, "salmonminimap2_p0.99", tx2gene)
 87 |   tx_abundances[["salmonminimap2_p0.99"]] <- abds$txab
 88 |   gene_abundances[["salmonminimap2_p0.99"]] <- abds$geneab
 89 | }
 90 | 
 91 | ## wub 
 92 | if ("wubminimap2" %in% dirs) {
 93 |   message("wub")
 94 |   files <- get_file_listing(topdir, "wubminimap2", "bam_count_reads.tsv")
 95 |   print(files)
 96 |   tx_abundances[["wubminimap2"]] <- 
 97 |     Reduce(function(...) dplyr::full_join(..., by = "tx"),
 98 |            lapply(names(files), function(nm) {
 99 |              read.delim(files[nm], header = TRUE, as.is = TRUE) %>% 
100 |                dplyr::select(Reference, Count) %>%
101 |                setNames(c("tx", paste0(nm, "__count__wubminimap2")))
102 |            })
103 |     )
104 |   gene_abundances[["wubminimap2"]] <- 
105 |     dplyr::left_join(tx_abundances[["wubminimap2"]],
106 |                      tx2gene[, c("tx", "gene")]) %>% 
107 |     dplyr::select(-tx) %>%
108 |     dplyr::group_by(gene) %>% dplyr::summarise_all(funs(sum)) %>%
109 |     as.data.frame()
110 | }
111 | 
112 | ## featureCounts (primary) 
113 | if ("featurecountsminimap2_primary" %in% dirs) {
114 |   message("featureCounts (primary)")
115 |   files <- get_file_listing(topdir, "featurecountsminimap2_primary", "featurecountsminimap2.txt")
116 |   print(files)
117 |   gene_abundances[["featurecountsminimap2primary"]] <- 
118 |     Reduce(function(...) dplyr::full_join(..., by = "gene"),
119 |            lapply(names(files), function(nm) {
120 |              read.delim(files[nm], skip = 1, header = TRUE, as.is = TRUE) %>% 
121 |                dplyr::select(-Chr, -Start, -End, -Strand, -Length) %>%
122 |                setNames(c("gene", paste0(nm, "__count__featurecountsminimap2primary")))
123 |            })
124 |     )
125 | }
126 | 
127 | ## Remove version numbers from all genes and transcripts
128 | tx_abundances <- lapply(tx_abundances, function(l) {
129 |   l %>% dplyr::mutate(tx = gsub("\\.[0-9]+$", "", tx))
130 | })
131 | gene_abundances <- lapply(gene_abundances, function(l) {
132 |   l %>% dplyr::mutate(gene = gsub("\\.[0-9]+$", "", gene))
133 | })
134 | 
135 | ## Merge all abundance estimates
136 | tx_abundances <- Reduce(function(...) dplyr::full_join(..., by = "tx"),
137 |                         tx_abundances)
138 | gene_abundances <- Reduce(function(...) dplyr::full_join(..., by = "gene"),
139 |                           gene_abundances)
140 | 
141 | ## Set transcript/gene names as row names and replace NAs with zeros
142 | rownames(tx_abundances) <- tx_abundances$tx
143 | tx_abundances$tx <- NULL
144 | rownames(gene_abundances) <- gene_abundances$gene
145 | gene_abundances$gene <- NULL
146 | 
147 | tx_abundances[is.na(tx_abundances)] <- 0
148 | gene_abundances[is.na(gene_abundances)] <- 0
149 | 
150 | saveRDS(list(tx_abundances = tx_abundances, 
151 |              gene_abundances = gene_abundances),
152 |         file = outrds)
153 | 
154 | date()
155 | sessionInfo()
156 | 
157 | 


--------------------------------------------------------------------------------
/Rscripts/summarize_abundances_illumina.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | ## Summarize abundances on transcript- and gene-level
  7 | 
  8 | suppressPackageStartupMessages({
  9 |   library(tximport)
 10 |   library(dplyr)
 11 |   library(rtracklayer)
 12 | })
 13 | 
 14 | print(topdir)
 15 | print(tx2gene)
 16 | print(outrds)
 17 | 
 18 | get_file_listing <- function(topdir, subdir, filename) {
 19 |   files <- list.files(paste0(topdir, "/", subdir), full.names = TRUE)
 20 |   nm <- basename(files)
 21 |   files <- paste0(files, "/", filename)
 22 |   names(files) <- nm
 23 |   files
 24 | }
 25 | 
 26 | import_salmon_files <- function(files, name, tx2gene) {
 27 |   txi <- tximport(files, type = "salmon", txOut = TRUE)
 28 |   txig <- summarizeToGene(txi, tx2gene = tx2gene[, c("tx", "gene")])
 29 |   txab <- 
 30 |     dplyr::full_join(
 31 |       txi$counts %>% as.data.frame() %>% 
 32 |         setNames(paste0(names(.), "__count__", name)) %>%
 33 |         tibble::rownames_to_column("tx"),
 34 |       txi$abundance %>% as.data.frame() %>%
 35 |         setNames(paste0(names(.), "__tpm__", name)) %>%
 36 |         tibble::rownames_to_column("tx"),
 37 |       by = "tx"
 38 |     )
 39 |   geneab <- 
 40 |     dplyr::full_join(
 41 |       txig$counts %>% as.data.frame() %>% 
 42 |         setNames(paste0(names(.), "__count__", name)) %>%
 43 |         tibble::rownames_to_column("gene"),
 44 |       txig$abundance %>% as.data.frame() %>%
 45 |         setNames(paste0(names(.), "__tpm__", name)) %>%
 46 |         tibble::rownames_to_column("gene"),
 47 |       by = "gene"
 48 |     )
 49 |   list(txab = txab, geneab = geneab)
 50 | }
 51 | 
 52 | ## Read tx2gene
 53 | tx2gene <- readRDS(tx2gene)
 54 | 
 55 | ## Initialize gene and transcript abundance lists
 56 | gene_abundances <- list()
 57 | tx_abundances <- list()
 58 | 
 59 | ## Parse the subdirectories of topdir to see which abundances are available
 60 | (dirs <- list.files(topdir, full.names = FALSE))
 61 | 
 62 | ## Salmon
 63 | if ("salmon31" %in% dirs) {
 64 |   message("Salmon")
 65 |   (files <- get_file_listing(topdir, "salmon31", "quant.sf"))
 66 |   abds <- import_salmon_files(files, "salmon", tx2gene)
 67 |   tx_abundances[["salmon"]] <- abds$txab
 68 |   gene_abundances[["salmon"]] <- abds$geneab
 69 | }
 70 | 
 71 | ## StringTie
 72 | if ("stringtie" %in% dirs) {
 73 |   message("StringTie")
 74 |   files <- list.files(paste0(topdir, "/stringtie"), full.names = TRUE)
 75 |   nm <- basename(files)
 76 |   files <- paste0(files, "/", nm, "_stringtie.gtf")
 77 |   names(files) <- nm
 78 |   print(files)
 79 |   txi <- 
 80 |     Reduce(function(...) dplyr::full_join(..., by = c("tx", "gene")),
 81 |            lapply(names(files), function(nm) {
 82 |              message(nm)
 83 |              f <- rtracklayer::import(files[nm])
 84 |              f <- subset(f, type == "transcript")
 85 |              data.frame(tx = f$transcript_id, 
 86 |                         gene = f$gene_id,
 87 |                         tpm = as.numeric(as.character(f$TPM)),
 88 |                         stringsAsFactors = FALSE) %>%
 89 |                setNames(c("tx", "gene", paste0(nm, "__tpm__StringTie")))
 90 |            }))
 91 |   tx_abundances[["stringtie"]] <- txi %>% dplyr::select(-gene)
 92 |   gene_abundances[["stringtie"]] <- txi %>% dplyr::select(-tx) %>%
 93 |     dplyr::group_by(gene) %>% dplyr::summarize_all(funs(sum)) %>% as.data.frame()
 94 | }
 95 | 
 96 | ## Remove version numbers from all genes and transcripts
 97 | tx_abundances <- lapply(tx_abundances, function(l) {
 98 |   l %>% dplyr::mutate(tx = gsub("\\.[0-9]+$", "", tx))
 99 | })
100 | gene_abundances <- lapply(gene_abundances, function(l) {
101 |   l %>% dplyr::mutate(gene = gsub("\\.[0-9]+$", "", gene))
102 | })
103 | 
104 | ## Merge all abundance estimates
105 | tx_abundances <- Reduce(function(...) dplyr::full_join(..., by = "tx"),
106 |                         tx_abundances)
107 | gene_abundances <- Reduce(function(...) dplyr::full_join(..., by = "gene"),
108 |                           gene_abundances)
109 | 
110 | ## Set transcript/gene names as row names and replace NAs with zeros
111 | rownames(tx_abundances) <- tx_abundances$tx
112 | tx_abundances$tx <- NULL
113 | rownames(gene_abundances) <- gene_abundances$gene
114 | gene_abundances$gene <- NULL
115 | 
116 | tx_abundances[is.na(tx_abundances)] <- 0
117 | gene_abundances[is.na(gene_abundances)] <- 0
118 | 
119 | saveRDS(list(tx_abundances = tx_abundances, 
120 |              gene_abundances = gene_abundances),
121 |         file = outrds)
122 | 
123 | date()
124 | sessionInfo()
125 | 
126 | 


--------------------------------------------------------------------------------
/Rscripts/summarize_star_sjdbs.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | sjdbfiles <- strsplit(sjdbfiles, ",")[[1]]
 7 | 
 8 | print(sjdbfiles)
 9 | print(min_unique)
10 | print(outtxt)
11 | 
12 | suppressPackageStartupMessages({
13 |   library(dplyr)
14 | })
15 | 
16 | x <- do.call(dplyr::bind_rows, lapply(sjdbfiles, function(f) {
17 |   read.delim(f, header = FALSE, as.is = TRUE) %>%
18 |     setNames(c("chromosome", "start", "end", "strand", "motif", 
19 |                "annotated", "nbr_unique", "nbr_mm", "max_overhang"))
20 | })) %>%
21 |   dplyr::group_by(chromosome, start, end, strand) %>%
22 |   dplyr::summarize(motif = motif[1], 
23 |                    annotated = annotated[1],
24 |                    nbr_unique = sum(nbr_unique),
25 |                    nbr_mm = sum(nbr_mm),
26 |                    max_overhang = max(max_overhang)) %>%
27 |   dplyr::filter(nbr_unique >= min_unique) %>%
28 |   dplyr::select(chromosome, start, end, strand, motif, 
29 |                 annotated, nbr_unique, nbr_mm, max_overhang)
30 | 
31 | write.table(x, file = outtxt, col.names = FALSE, row.names = FALSE,
32 |             quote = FALSE, sep = "\t")
33 | 
34 | date()
35 | sessionInfo()
36 | 


--------------------------------------------------------------------------------
/datasets_to_include.mk:
--------------------------------------------------------------------------------
 1 | null  :=
 2 | space := $(null) #
 3 | comma := ,
 4 | 
 5 | ## Define the data sets to include in the plots
 6 | datasetsms := DCS108 HEK293RNA RNA001 pilot
 7 | datasetsmsc := $(subst $(space),$(comma),$(datasetsms))
 8 | 
 9 | conditionsms := wt
10 | conditionsmsc := $(subst $(space),$(comma),$(conditionsms))


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_compare_gffcompare_sqanti.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(rtracklayer)
  8 |   library(dplyr)
  9 |   library(GenomicFeatures)
 10 |   library(ggplot2)
 11 |   library(grDevices)
 12 |   library(cowplot)
 13 |   library(grid)
 14 |   library(pheatmap)
 15 |   library(viridis)
 16 | })
 17 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 18 | 
 19 | datasets <- strsplit(datasets, ",")[[1]]
 20 | names(datasets) <- datasets
 21 | conditions <- strsplit(conditions, ",")[[1]]
 22 | 
 23 | if (all(c("wt", "srpk") %in% conditions)) {
 24 |   flaircond <- "_all"
 25 | } else if ("wt" %in% conditions) {
 26 |   flaircond <- "_WT"
 27 | } else {
 28 |   stop("Unknown conditions")
 29 | }
 30 | 
 31 | print(datasets)
 32 | print(conditions)
 33 | print(flaircond)
 34 | print(outrds)
 35 | 
 36 | ## Read class codes from gffcompare
 37 | dfcc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
 38 |   tmap <- read.delim(paste0(
 39 |     ds, "/flair_round2/", ds, flaircond, "/", ds, flaircond, 
 40 |     "_minimap_genome_s_primary_flair_collapse.isoforms.gffcompare.", 
 41 |     ds, flaircond, "_minimap_genome_s_primary_flair_collapse.isoforms.gtf.tmap"), 
 42 |     header = TRUE, as.is = TRUE) %>%
 43 |     dplyr::select(qry_id, class_code, ref_id) %>%
 44 |     dplyr::mutate(dataset = remapds[ds]) %>%
 45 |     dplyr::mutate(class_code = as.character(class_code),
 46 |                   ref_id = as.character(ref_id))
 47 |   if (ds %in% c("RNA001", "DCS108", "pilot")) {
 48 |     tmap2 <- read.delim(paste0(
 49 |       ds, "/flair_round2_ilmnjunc/", ds, flaircond, "/", ds, flaircond, 
 50 |       "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.gffcompare.", 
 51 |       ds, flaircond, "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.",
 52 |       "gtf.tmap"), 
 53 |       header = TRUE, as.is = TRUE) %>%
 54 |       dplyr::select(qry_id, class_code, ref_id) %>%
 55 |       dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc")) %>%
 56 |       dplyr::mutate(class_code = as.character(class_code),
 57 |                     ref_id = as.character(ref_id))
 58 |     tmap <- dplyr::bind_rows(tmap, tmap2)
 59 |   }
 60 |   tmap
 61 | })) %>%
 62 |   dplyr::mutate(
 63 |     class_code = replace(class_code, class_code == "=", 
 64 |                          "complete, exact match of intron chain (=)"),
 65 |     class_code = replace(class_code, class_code == "c", 
 66 |                          "contained in reference (intron compatible) (c)"),
 67 |     class_code = replace(class_code, class_code == "k", 
 68 |                          "containment of reference (k)"),
 69 |     class_code = replace(class_code, class_code == "m",
 70 |                          "retained intron(s), full intron chain overlap (m)"),
 71 |     class_code = replace(class_code, class_code == "n", 
 72 |                          "retained intron(s), partial or no intron chain match (n)"),
 73 |     class_code = replace(class_code, class_code == "j", 
 74 |                          "multi-exon with at least one junction match (j)"),
 75 |     class_code = replace(class_code, class_code == "e", 
 76 |                          "single exon transfrag partially covering an intron (e)"),
 77 |     class_code = replace(class_code, class_code == "o", 
 78 |                          "other same strand overlap with reference exons (o)"),
 79 |     class_code = replace(class_code, class_code == "s", 
 80 |                          "intron matching on opposite strand (s)"),
 81 |     class_code = replace(class_code, class_code == "x", 
 82 |                          "exonic overlap on opposite strand (x)"),
 83 |     class_code = replace(class_code, class_code == "i", 
 84 |                          "fully contained within a reference intron (i)"),
 85 |     class_code = replace(class_code, class_code == "y", 
 86 |                          "contains a reference within its introns (y)"),
 87 |     class_code = replace(class_code, class_code == "p", 
 88 |                          "possible polymerase run-on (no overlap) (p)"),
 89 |     class_code = replace(class_code, class_code == "r", "repeat (r)"),
 90 |     class_code = replace(class_code, class_code == "u", "other (u)")) %>%
 91 |   dplyr::mutate(class_code = factor(class_code, levels = c(
 92 |     "other (u)",
 93 |     "repeat (r)",
 94 |     "possible polymerase run-on (no overlap) (p)",
 95 |     "contains a reference within its introns (y)",
 96 |     "fully contained within a reference intron (i)",
 97 |     "exonic overlap on opposite strand (x)",
 98 |     "intron matching on opposite strand (s)",
 99 |     "other same strand overlap with reference exons (o)",
100 |     "single exon transfrag partially covering an intron (e)",
101 |     "multi-exon with at least one junction match (j)",
102 |     "retained intron(s), partial or no intron chain match (n)",
103 |     "retained intron(s), full intron chain overlap (m)",
104 |     "containment of reference (k)",
105 |     "contained in reference (intron compatible) (c)",
106 |     "complete, exact match of intron chain (=)"
107 |   )))
108 | 
109 | dfsc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
110 |   tmap <- read.delim(paste0(
111 |     ds, "/sqanti_round2/", ds, flaircond, "/", ds, flaircond, 
112 |     "_classification.txt"), 
113 |     header = TRUE, as.is = TRUE) %>%
114 |     dplyr::select(isoform, structural_category, associated_transcript) %>%
115 |     dplyr::mutate(dataset = remapds[ds]) %>%
116 |     dplyr::mutate(structural_category = as.character(structural_category),
117 |                   associated_transcript = as.character(associated_transcript))
118 |   if (ds %in% c("RNA001", "DCS108", "pilot")) {
119 |     tmap2 <- read.delim(paste0(
120 |       ds, "/sqanti_round2_ilmnjunc/", ds, flaircond, "/", ds, flaircond, 
121 |       "_ilmnjunc_classification.txt"), 
122 |       header = TRUE, as.is = TRUE) %>%
123 |       dplyr::select(isoform, structural_category, associated_transcript) %>%
124 |       dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc")) %>%
125 |       dplyr::mutate(structural_category = as.character(structural_category),
126 |                     associated_transcript = as.character(associated_transcript))
127 |     tmap <- dplyr::bind_rows(tmap, tmap2)
128 |   }
129 |   tmap
130 | })) %>%
131 |   dplyr::mutate(structural_category = factor(structural_category, levels = c(
132 |     "intergenic",
133 |     "genic_intron",
134 |     "antisense",
135 |     "novel_not_in_catalog",
136 |     "novel_in_catalog",
137 |     "genic",
138 |     "fusion",
139 |     "incomplete-splice_match",
140 |     "full-splice_match"
141 |   )))
142 | 
143 | dfboth <- dplyr::full_join(dfcc, dfsc, by = c("qry_id" = "isoform", "dataset"))
144 | dfbothsub <- dfboth %>% dplyr::filter(dataset == "ONT-RNA001-HAP_ILMNjunc")
145 | 
146 | tb <- table(dfboth$class_code, dfboth$structural_category)
147 | tbsub <- table(dfbothsub$class_code, dfbothsub$structural_category)
148 | 
149 | sqrt_breaks <- function(xs, n = 10) {
150 |   breaks <- (seq(0, (max(xs) + 1)^(1/3), length.out = n))^3
151 |   breaks[!duplicated(breaks)]
152 | }
153 | mat_breaks <- sqrt_breaks(tb, n = 101)
154 | mat_breaks_sub <- sqrt_breaks(tbsub, n = 101)
155 | 
156 | png(gsub("rds$", "png", outrds), height = 8, width = 8, unit = "in", res = 400)
157 | pheatmap::pheatmap(tb, cluster_rows = FALSE, cluster_cols = FALSE,
158 |                    color = inferno(length(mat_breaks) - 1),
159 |                    breaks = mat_breaks)
160 | dev.off()
161 | 
162 | png(gsub("\\.rds$", "_sub.png", outrds), height = 8, width = 8, unit = "in", res = 400)
163 | pheatmap::pheatmap(tbsub, cluster_rows = FALSE, cluster_cols = FALSE,
164 |                    color = inferno(length(mat_breaks_sub) - 1),
165 |                    breaks = mat_breaks_sub, main = "ONT-RNA001-HAP_ILMNjunc")
166 | dev.off()
167 | 
168 | saveRDS(NULL, file = outrds)
169 | date()
170 | sessionInfo()
171 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_consistency_between_replicates.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(dplyr)
  8 |   library(ggplot2)
  9 |   library(stringr)
 10 |   library(ggbeeswarm)
 11 | })
 12 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 13 | 
 14 | datasets <- strsplit(datasets, ",")[[1]]
 15 | datasets <- c(datasets, "Illumina")
 16 | names(datasets) <- datasets
 17 | conditions <- strsplit(conditions, ",")[[1]]
 18 | 
 19 | print(tx2gene)
 20 | print(datasets)
 21 | print(conditions)
 22 | print(outrds)
 23 | 
 24 | ## Define base colors
 25 | muted <- c("#DC050C","#E8601C","#7BAFDE","#1965B0","#B17BA6",
 26 |            "#882E72","#F1932D","#F6C141","#F7EE55","#4EB265",
 27 |            "#90C987","#CAEDAB","#777777")
 28 | colfun <- grDevices::colorRampPalette(muted)
 29 | 
 30 | abundances <- lapply(datasets, function(ds) {
 31 |   readRDS(paste0(ds, "/output/", ds, "_all_abundances.rds"))
 32 | })
 33 | 
 34 | ## Keep only the desired abundance measures
 35 | abundances <- lapply(abundances, function(w) {
 36 |   w$tx_abundances <- 
 37 |     w$tx_abundances[, grep("count__salmon31|count__salmonminimap2_p0.99|count__wubminimap2|tpm__salmon$|tpm__StringTie", 
 38 |                            colnames(w$tx_abundances))]
 39 |   w$gene_abundances <- 
 40 |     w$gene_abundances[, grep("count__salmon31|count__salmonminimap2_p0.99|count__wubminimap2|count__featurecountsminimap2primary|tpm__salmon$|tpm__StringTie", 
 41 |                              colnames(w$gene_abundances))]
 42 |   w
 43 | })
 44 | 
 45 | tx2gene <- readRDS(tx2gene)
 46 | ## Remove version information from tx2gene
 47 | tx2gene <- tx2gene %>% dplyr::mutate(tx = gsub("\\.[0-9]+$", "", tx),
 48 |                                      gene = gsub("\\.[0-9]+$", "", gene))
 49 | 
 50 | ## Transcript-level
 51 | txlevel <- Reduce(function(...) dplyr::full_join(..., by = "tx"), 
 52 |                   lapply(abundances, function(ab) {
 53 |                     ab$tx_abundances %>% tibble::rownames_to_column("tx")
 54 |                   })) %>%
 55 |   tibble::column_to_rownames("tx")
 56 | 
 57 | ## Gene-level
 58 | genelevel <- Reduce(function(...) dplyr::full_join(..., by = "gene"), 
 59 |                   lapply(abundances, function(ab) {
 60 |                     ab$gene_abundances %>% tibble::rownames_to_column("gene")
 61 |                   })) %>%
 62 |   tibble::column_to_rownames("gene")
 63 | 
 64 | cortx <- list()
 65 | corgene <- list()
 66 | 
 67 | ## Help functions
 68 | calcCor <- function(abundancemat, method) {
 69 |   cor(abundancemat,
 70 |       method = method, use = "pairwise.complete.obs") %>%
 71 |     as.data.frame() %>% tibble::rownames_to_column("sample1") %>%
 72 |     tidyr::gather(key = "sample2", value = "correlation", -sample1) %>%
 73 |     tidyr::separate(sample1, into = c("sample1", "type1", "method1"), sep = "__") %>%
 74 |     tidyr::separate(sample2, into = c("sample2", "type2", "method2"), sep = "__") %>%
 75 |     tidyr::unite(col = method1, type1, method1, sep = "__") %>%
 76 |     tidyr::unite(col = method2, type2, method2, sep = "__") %>%
 77 |     dplyr::mutate(sample1 = remap[sample1], 
 78 |                   sample2 = remap[sample2]) %>%
 79 |     dplyr::mutate(dataset1 = sapply(strsplit(sample1, "_"), .subset, 1),
 80 |                   dataset2 = sapply(strsplit(sample2, "_"), .subset, 1),
 81 |                   condition1 = sapply(strsplit(sample1, "_"), .subset, 2),
 82 |                   condition2 = sapply(strsplit(sample2, "_"), .subset, 2)) %>% 
 83 |     dplyr::filter(condition1 %in% conditions & condition2 %in% conditions) %>%
 84 |     dplyr::mutate(dtype = ifelse(
 85 |       dataset1 == "Illumina" & dataset2 == "Illumina", "Illumina", 
 86 |       ifelse((dataset1 == "Illumina" & dataset2 != "Illumina") | 
 87 |                (dataset1 != "Illumina" & dataset2 == "Illumina"),
 88 |              "Illumina-Nanopore", "Nanopore"))) %>%
 89 |     dplyr::mutate(combination = ifelse(dataset1 == dataset2, "same dataset", 
 90 |                                        "different datasets"))
 91 | }
 92 | 
 93 | plotCor <- function(cordf, ylab, title) {
 94 |   tmp <- cordf %>% 
 95 |     dplyr::filter(dataset1 != "ONT-RNA001-HEK" & dataset2 != "ONT-RNA001-HEK") %>%
 96 |     dplyr::mutate(method1 = gsub("featurecounts", "fC", 
 97 |                                  gsub("count__", "", method1)),
 98 |                   method2 = gsub("featurecounts", "fC", 
 99 |                                  gsub("count__", "", method2))) %>%
100 |     dplyr::filter(gsub("tpm__", "", gsub("31", "", method1)) == 
101 |                     gsub("tpm__", "", gsub("31", "", method2)) & 
102 |                     sample1 != sample2 & 
103 |                     condition1 == condition2 & 
104 |                     (dataset1 < dataset2 | (dataset1 == dataset2 & 
105 |                                               sample1 < sample2))) %>%
106 |     dplyr::mutate(
107 |       method1 = replace(
108 |         method1, dtype == "Illumina-Nanopore", 
109 |         gsub("tpm__", "", gsub("31", "", 
110 |                                method1[dtype == "Illumina-Nanopore"]))),
111 |       method2 = replace(
112 |         method2, dtype == "Illumina-Nanopore", 
113 |         gsub("tpm__", "", gsub("31", "", 
114 |                                method2[dtype == "Illumina-Nanopore"])))
115 |     )
116 |   ncol <- length(unique(interaction(tmp$dataset1, tmp$dataset2)))
117 |   ggplot(tmp, 
118 |          aes(x = method1, y = correlation, color = interaction(dataset1, dataset2),
119 |              shape = combination)) + 
120 |     geom_quasirandom(size = 3, alpha = 0.6) + theme_bw() + 
121 |     theme(axis.text.x = element_text(angle = 90, vjust = 0.5,
122 |                                      hjust = 1, size = 12)) + 
123 |     xlab("") + ylab(ylab) + 
124 |     scale_color_manual(values = colfun(ncol), name = "") + 
125 |     expand_limits(y = c(0, 1)) + scale_shape_discrete(name = "") + 
126 |     facet_grid(~ dtype, scales = "free_x", space = "free_x") + 
127 |     ggtitle(title)
128 | }
129 | 
130 | plotCorBetween <- function(cordf, ylab, title) {
131 |   ggplot(cordf %>% 
132 |            dplyr::mutate(method1 = gsub("featurecounts", "fC", 
133 |                                         gsub("count__", "", method1)),
134 |                          method2 = gsub("featurecounts", "fC", 
135 |                                         gsub("count__", "", method2))) %>%
136 |            dplyr::filter(method1 < method2 & sample1 == sample2) %>%
137 |            dplyr::mutate(dataset1 = factor(dataset1, levels = 
138 |                                              ds_order[ds_order %in% dataset1])), 
139 |          aes(x = interaction(method1, method2), y = correlation, color = dataset1)) + 
140 |     geom_quasirandom(size = 4, alpha = 0.8) + theme_bw() + 
141 |     theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
142 |     xlab("") + ylab(ylab) + ggtitle(title) + 
143 |     scale_color_manual(values = ds_colors, 
144 |                        name = "") + expand_limits(y = c(0, 1)) + 
145 |     facet_grid(~ dtype, scales = "free_x", space = "free_x")
146 | }
147 | 
148 | for (m in c("Pearson", "Spearman")) {
149 |   cortx[[m]] <- calcCor(sqrt(txlevel), method = tolower(m))
150 |   corgene[[m]] <- calcCor(sqrt(genelevel), method = tolower(m))
151 | 
152 |   ptx <- plotCor(cordf = cortx[[m]], 
153 |                  ylab = paste0(m, " correlation between\nreplicate pairs, ", 
154 |                                "sqrt(abundances)"),
155 |                  title = "Transcript")
156 |   pg <- plotCor(cordf = corgene[[m]],
157 |                 ylab = paste0(m, " correlation between\nreplicate pairs, ", 
158 |                               "sqrt(abundances)"),
159 |                 title = "Gene")
160 |   
161 |   png(gsub("\\.rds", paste0("_", m, ".png"), outrds), 
162 |       width = 18, height = 6, unit = "in", res = 400)
163 |   print(cowplot::plot_grid(
164 |     ptx + theme(legend.position = "none"),
165 |     pg + theme(legend.position = "none"),
166 |     cowplot::get_legend(pg),
167 |     nrow = 1, rel_widths = c(1, 1.15, 0.5), align = "h", axis = "t",
168 |     labels = c("A", "B", "")
169 |   ))
170 |   dev.off()
171 |   
172 |   ptxb <- plotCorBetween(cordf = cortx[[m]],
173 |                          ylab = paste0(m, " correlation between\nmethod pairs, ", 
174 |                                        "sqrt(abundances)"),
175 |                          title = "Transcript")
176 |   pgb <- plotCorBetween(cordf = corgene[[m]],
177 |                         ylab = paste0(m, " correlation between\nmethod pairs, ", 
178 |                                       "sqrt(abundances)"),
179 |                         title = "Gene")
180 |   
181 |   png(gsub("\\.rds", paste0("_betweenmethods_", m, ".png"), outrds), 
182 |       width = 13, height = 6, unit = "in", res = 400)
183 |   print(cowplot::plot_grid(
184 |     cowplot::plot_grid(
185 |       ptxb + theme(legend.position = "none"),
186 |       pgb + theme(legend.position = "none"),
187 |       nrow = 1, rel_widths = c(1, 1), labels = c("A", "B"),
188 |       align = "h", axis = "tb"),
189 |     cowplot::get_legend(pgb), nrow = 1, rel_widths = c(2, 0.4),
190 |     labels = c("", ""), align = "h", axis = "t"
191 |   ))
192 |   dev.off()
193 | }
194 | 
195 | saveRDS(NULL, file = outrds)
196 | date()
197 | sessionInfo()
198 | 
199 | 
200 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_flair_round2_summary.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(rtracklayer)
  8 |   library(dplyr)
  9 |   library(GenomicFeatures)
 10 |   library(ggplot2)
 11 |   library(grDevices)
 12 |   library(cowplot)
 13 |   library(grid)
 14 | })
 15 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 16 | 
 17 | datasets <- strsplit(datasets, ",")[[1]]
 18 | datasets <- c(datasets, "Illumina")
 19 | names(datasets) <- datasets
 20 | conditions <- strsplit(conditions, ",")[[1]]
 21 | 
 22 | if (all(c("wt", "srpk") %in% conditions)) {
 23 |   flaircond <- "_all"
 24 | } else if ("wt" %in% conditions) {
 25 |   flaircond <- "_WT"
 26 | } else {
 27 |   stop("Unknown conditions")
 28 | }
 29 | 
 30 | print(datasets)
 31 | print(conditions)
 32 | print(flaircond)
 33 | print(outrds)
 34 | 
 35 | muted <- c("#DC050C", "#E8601C", "#7BAFDE", "#1965B0", "#B17BA6",
 36 |            "#882E72", "#F1932D", "#F6C141", "#F7EE55", "#4EB265",
 37 |            "#CAEDAB", "#777777")
 38 | colfun <- grDevices::colorRampPalette(muted)
 39 | 
 40 | ## Read Illumina abundance estimates
 41 | ilmn_tx_abundances <- 
 42 |   readRDS("Illumina/output/Illumina_all_abundances.rds")$tx_abundances
 43 | ilmn_samples <- sample_annotation %>% 
 44 |   dplyr::filter(dataset == "Illumina" & condition %in% conditions) %>% 
 45 |   dplyr::pull("sample_orig")
 46 | print(ilmn_samples)
 47 | ave_ilmn_tx_tpm <- 
 48 |   data.frame(transcript_id = rownames(ilmn_tx_abundances), 
 49 |              ave_tx_tpm = rowMeans(
 50 |                ilmn_tx_abundances[, paste0(ilmn_samples, "__tpm__salmon")]
 51 |              ),
 52 |              stringsAsFactors = FALSE)
 53 | 
 54 | ## ========================================================================== ##
 55 | ## Plot overlap of detected transcripts (class code = or c)
 56 | df0 <- do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) {
 57 |   dplyr::bind_rows(
 58 |     read.delim(paste0(
 59 |       ds, "/flair_round2_ilmnjunc/", ds, flaircond, "/", ds, flaircond, 
 60 |       "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.gffcompare.", 
 61 |       ds, flaircond, 
 62 |       "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.gtf.tmap"), 
 63 |       header = TRUE, as.is = TRUE) %>%
 64 |       dplyr::filter(class_code %in% c("c", "=")) %>%
 65 |       dplyr::select(ref_id) %>%
 66 |       dplyr::rename(transcript_id = ref_id) %>%
 67 |       dplyr::distinct() %>%
 68 |       dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc"),
 69 |                     identified = TRUE),
 70 |     read.delim(paste0(
 71 |       ds, "/flair_round2/", ds, flaircond, "/", ds, flaircond, 
 72 |       "_minimap_genome_s_primary_flair_collapse.isoforms.gffcompare.", 
 73 |       ds, flaircond, 
 74 |       "_minimap_genome_s_primary_flair_collapse.isoforms.gtf.tmap"), 
 75 |       header = TRUE, as.is = TRUE) %>%
 76 |       dplyr::filter(class_code %in% c("c", "=")) %>%
 77 |       dplyr::select(ref_id) %>%
 78 |       dplyr::rename(transcript_id = ref_id) %>%
 79 |       dplyr::distinct() %>%
 80 |       dplyr::mutate(dataset = remapds[ds],
 81 |                     identified = TRUE)
 82 |   )
 83 | })) %>%
 84 |   dplyr::filter(dataset != "ONT-RNA001-HEK_ILMNjunc")  ## don't use HAP junctions for HEK
 85 | dettx <- unique(df0$transcript_id) ## all detected ref. transcripts
 86 | detmat <- matrix(0, nrow = length(dettx), ncol = length(unique(df0$dataset)))
 87 | rownames(detmat) <- dettx
 88 | colnames(detmat) <- unique(df0$dataset)
 89 | detmat[as.matrix(df0 %>% dplyr::select(transcript_id, dataset) %>%
 90 |                    dplyr::mutate(transcript_id = as.character(transcript_id),
 91 |                                  dataset = as.character(dataset)))] <- 1
 92 | ordr <- ds_order[ds_order %in% colnames(detmat)]
 93 | ordr <- rep(ordr, each = 2)
 94 | suffx <- rep(c("", "_ILMNjunc"), length(ordr)/2)
 95 | ordr <- paste0(ordr, suffx)
 96 | ordr <- setdiff(ordr, "ONT-RNA001-HEK_ILMNjunc")
 97 | stopifnot(ordr %in% colnames(detmat),
 98 |           colnames(detmat) %in% ordr)
 99 | detmat <- detmat[, ordr]
100 | png(gsub("\\.rds$", "_shared_transcripts.png", outrds), 
101 |     width = 10, height = 6, unit = "in", res = 400)
102 | UpSetR::upset(data.frame(detmat, check.names = FALSE), order.by = "freq",
103 |               decreasing = TRUE, keep.order = TRUE, nsets = length(ordr), 
104 |               mainbar.y.label = "Number of shared annotated transcripts",
105 |               sets.x.label = "Number of identified\nannotated transcripts",
106 |               sets.bar.color = ds_colors[gsub("_ILMNjunc", "", colnames(detmat))],
107 |               sets = colnames(detmat), mb.ratio = c(0.55, 0.45))
108 | grid::grid.edit("arrange", name = "arrange2")
109 | upsetplot <- grid::grid.grab()
110 | dev.off()
111 | 
112 | ## ========================================================================== ##
113 | ## Plot abundance of identified/unidentified transcripts 
114 | ## (identified = class code = or c)
115 | df1 <- df0 %>% 
116 |   dplyr::full_join(
117 |     do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) {
118 |       dplyr::bind_rows(ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = remapds[ds]),
119 |                        ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = paste0(remapds[ds],
120 |                                                                           "_ILMNjunc"))) %>%
121 |         dplyr::filter(dataset != "ONT-RNA001-HEK_ILMNjunc")
122 |     })), 
123 |     by = c("transcript_id", "dataset")
124 |   ) %>%
125 |   dplyr::mutate(identified = replace(identified, is.na(identified), FALSE))
126 | 
127 | png(gsub("\\.rds$", "_illumina_abundance_by_identification.png", outrds),
128 |     width = 8, height = 4, unit = "in", res = 400)
129 | ggplot(df1, aes(x = identified, y = ave_tx_tpm + 1)) + 
130 |   geom_boxplot() + facet_wrap(~ dataset, nrow = 1) + theme_bw() + 
131 |   scale_y_log10() + xlab("Transcript identified by FLAIR") + 
132 |   ylab("Average abundance across Illumina samples (TPM + 1)")
133 | dev.off()
134 | 
135 | ## Merge with upset plot
136 | df2 <- df1 %>% dplyr::filter(grepl("_ILMNjunc$|RNA001-HEK", dataset)) %>%
137 |   dplyr::select(-dataset) %>%
138 |   dplyr::group_by(transcript_id, ave_tx_tpm) %>%
139 |   dplyr::summarize(identified = any(identified))
140 | p1 <- ggplot(df2, aes(x = identified, y = ave_tx_tpm + 1)) + 
141 |   geom_boxplot() + theme_bw() + 
142 |   scale_y_log10() + xlab("Transcript identified by FLAIR") + 
143 |   ylab("Average abundance across Illumina samples (TPM + 1)")
144 | png(gsub("\\.rds$", "_illumina_abundance_plus_shared_tx.png", outrds),
145 |     width = 12, height = 6, unit = "in", res = 400)
146 | print(cowplot::plot_grid(
147 |   upsetplot,
148 |   p1, 
149 |   rel_widths = c(4, 1.5), nrow = 1, labels = c("A", "B")
150 | ))
151 | dev.off()
152 | 
153 | ## ========================================================================== ##
154 | ## Get number of exons and read length per flair transcript and plot for
155 | ## different class codes
156 | ds_order_both <- rep(ds_order, each = 2)
157 | suffx <- rep(c("", "_ILMNjunc"), length(ds_order_both)/2)
158 | ds_order_both <- paste0(ds_order_both, suffx)
159 | nbrexons <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
160 |   tmp <- as.data.frame(rtracklayer::import(
161 |     paste0(ds, "/", ifelse(ds == "Illumina", "stringtie_assembly", 
162 |                            "flair_round2"),
163 |            "/", ds, flaircond, "/", ds, flaircond, 
164 |            ifelse(ds == "Illumina", "_stringtie_assembly.", 
165 |                   "_minimap_genome_s_primary_flair_collapse.isoforms."), 
166 |            "gffcompare.annotated.gtf")))
167 |   tmap <- dplyr::full_join(
168 |     tmp %>% 
169 |       dplyr::filter(type == "exon") %>%
170 |       dplyr::group_by(transcript_id) %>%
171 |       dplyr::summarize(nbr_exons = length(type),
172 |                        txLength = sum(width)),
173 |     tmp %>%
174 |       dplyr::filter(type == "transcript") %>%
175 |       dplyr::select(transcript_id, class_code),
176 |     by = "transcript_id"
177 |   ) %>% dplyr::mutate(dataset = remapds[ds])
178 |   if (ds %in% c("RNA001", "DCS108", "pilot")) {
179 |     tmp2 <- as.data.frame(rtracklayer::import(
180 |       paste0(ds, "/flair_round2_ilmnjunc/", ds, flaircond, "/", ds, flaircond, 
181 |              "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.", 
182 |              "gffcompare.annotated.gtf")))
183 |     tmap2 <- dplyr::full_join(
184 |       tmp2 %>% 
185 |         dplyr::filter(type == "exon") %>%
186 |         dplyr::group_by(transcript_id) %>%
187 |         dplyr::summarize(nbr_exons = length(type),
188 |                          txLength = sum(width)),
189 |       tmp2 %>%
190 |         dplyr::filter(type == "transcript") %>%
191 |         dplyr::select(transcript_id, class_code),
192 |       by = "transcript_id"
193 |     ) %>% dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc"))
194 |     tmap <- dplyr::bind_rows(tmap, tmap2)
195 |   }
196 |   tmap
197 | })) %>%
198 |   dplyr::mutate(nbr_exons = replace(nbr_exons, nbr_exons > 2, ">2")) %>%
199 |   dplyr::mutate(dataset = factor(dataset, levels = ds_order_both[ds_order_both %in% dataset]))
200 | 
201 | 
202 | ## ========================================================================== ##
203 | ## Plot distribution of class codes
204 | dfcc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
205 |   tmap <- read.delim(paste0(
206 |     ds, "/", ifelse(ds == "Illumina", "stringtie_assembly", "flair_round2"), 
207 |     "/", ds, flaircond, "/", ds, flaircond, 
208 |     ifelse(ds == "Illumina", "_stringtie_assembly.gffcompare.", 
209 |            "_minimap_genome_s_primary_flair_collapse.isoforms.gffcompare."), 
210 |     ds, flaircond, ifelse(ds == "Illumina", "_stringtie_assembly.",
211 |                           "_minimap_genome_s_primary_flair_collapse.isoforms."),
212 |     "gtf.tmap"), 
213 |     header = TRUE, as.is = TRUE)
214 |   t1 <- as.data.frame(table(tmap$class_code)) %>% 
215 |     dplyr::mutate(dataset = remapds[ds]) %>%
216 |     dplyr::rename(class_code = Var1) %>%
217 |     dplyr::mutate(class_code = as.character(class_code))
218 |   if (ds %in% c("RNA001", "DCS108", "pilot")) {
219 |     tmap2 <- read.delim(paste0(
220 |       ds, "/flair_round2_ilmnjunc/", ds, flaircond, "/", ds, flaircond, 
221 |       "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.gffcompare.", 
222 |       ds, flaircond, "_minimap_genome_s_primary_flair_collapse_ilmnjunc.isoforms.",
223 |       "gtf.tmap"), 
224 |       header = TRUE, as.is = TRUE)
225 |     t1 <- dplyr::bind_rows(
226 |       t1, as.data.frame(table(tmap2$class_code)) %>% 
227 |         dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc")) %>%
228 |         dplyr::rename(class_code = Var1) %>%
229 |         dplyr::mutate(class_code = as.character(class_code)))
230 |   }
231 |   t1
232 | })) %>%
233 |   dplyr::mutate(
234 |     class_code = replace(class_code, class_code == "=", 
235 |                          "complete, exact match of intron chain (=)"),
236 |     class_code = replace(class_code, class_code == "c", 
237 |                          "contained in reference (intron compatible) (c)"),
238 |     class_code = replace(class_code, class_code == "k", 
239 |                          "containment of reference (k)"),
240 |     class_code = replace(class_code, class_code == "m",
241 |                          "retained intron(s), full intron chain overlap (m)"),
242 |     class_code = replace(class_code, class_code == "n", 
243 |                          "retained intron(s), partial or no intron chain match (n)"),
244 |     class_code = replace(class_code, class_code == "j", 
245 |                          "multi-exon with at least one junction match (j)"),
246 |     class_code = replace(class_code, class_code == "e", 
247 |                          "single exon transfrag partially covering an intron (e)"),
248 |     class_code = replace(class_code, class_code == "o", 
249 |                          "other same strand overlap with reference exons (o)"),
250 |     class_code = replace(class_code, class_code == "s", 
251 |                          "intron matching on opposite strand (s)"),
252 |     class_code = replace(class_code, class_code == "x", 
253 |                          "exonic overlap on opposite strand (x)"),
254 |     class_code = replace(class_code, class_code == "i", 
255 |                          "fully contained within a reference intron (i)"),
256 |     class_code = replace(class_code, class_code == "y", 
257 |                          "contains a reference within its introns (y)"),
258 |     class_code = replace(class_code, class_code == "p", 
259 |                          "possible polymerase run-on (no overlap) (p)"),
260 |     class_code = replace(class_code, class_code == "r", "repeat (r)"),
261 |     class_code = replace(class_code, class_code == "u", "other (u)")) %>%
262 |   dplyr::mutate(class_code = factor(class_code, levels = c(
263 |     "other (u)",
264 |     "repeat (r)",
265 |     "possible polymerase run-on (no overlap) (p)",
266 |     "contains a reference within its introns (y)",
267 |     "fully contained within a reference intron (i)",
268 |     "exonic overlap on opposite strand (x)",
269 |     "intron matching on opposite strand (s)",
270 |     "other same strand overlap with reference exons (o)",
271 |     "single exon transfrag partially covering an intron (e)",
272 |     "multi-exon with at least one junction match (j)",
273 |     "retained intron(s), partial or no intron chain match (n)",
274 |     "retained intron(s), full intron chain overlap (m)",
275 |     "containment of reference (k)",
276 |     "contained in reference (intron compatible) (c)",
277 |     "complete, exact match of intron chain (=)"
278 |   ))) %>%
279 |   dplyr::mutate(dataset = factor(dataset, levels = ds_order_both[ds_order_both %in% dataset]))
280 | 
281 | p0 <- ggplot(dfcc, aes(x = dataset, y = Freq)) + 
282 |   geom_bar(stat = "identity", position = "fill", aes(fill = class_code)) + 
283 |   theme_bw() + 
284 |   scale_fill_manual(values = structure(colfun(15), names = levels(dfcc$class_code)),
285 |                     name = "Overlap type (class code)") + 
286 |   ylab("Fraction of transcripts") + xlab("") + 
287 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
288 |   scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
289 |   geom_text(data = dfcc %>% dplyr::group_by(dataset) %>%
290 |               dplyr::summarize(nbrTx = sum(Freq)),
291 |             aes(x = dataset, y = 1, label = nbrTx), 
292 |             vjust = 0, nudge_y = 0.02, size = 3)
293 | 
294 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_classcode_full.png", outrds), 
295 |     width = 11, height = 16, unit = "in", res = 400)
296 | p1 <- ggplot(nbrexons %>%
297 |                dplyr::group_by(dataset, class_code, nbr_exons) %>%
298 |                dplyr::tally() %>%
299 |                dplyr::ungroup() %>%
300 |                dplyr::mutate(nbr_exons = factor(nbr_exons, 
301 |                                                 levels = c(">2", "2", "1"))), 
302 |              aes(x = class_code, y = n, fill = nbr_exons)) + 
303 |   geom_bar(stat = "identity", position = "fill") + theme_bw() + 
304 |   facet_wrap(~ dataset, nrow = 2) +
305 |   xlab("Class code") + ylab("Fraction of transcripts") + 
306 |   scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"),
307 |                     name = "Number of\nexons") + 
308 |   theme(legend.position = "bottom") + 
309 |   scale_y_continuous(expand = c(0, 0, 0.05, 0))
310 | p2 <- ggplot(nbrexons, 
311 |              aes(x = class_code, y = txLength)) + 
312 |   geom_boxplot() + theme_bw() + 
313 |   facet_wrap(~ dataset, nrow = 2) +
314 |   scale_y_log10() + 
315 |   xlab("Class code") + ylab("Inferred transcript length")
316 | cowplot::plot_grid(p0, p1, p2, ncol = 1, labels = c("A", "B", "C"), 
317 |                    rel_heights = c(1, 1.1, 1), align = "v", axis = "l")
318 | dev.off()
319 | 
320 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_classcode.png", outrds), 
321 |     width = 9, height = 12, unit = "in", res = 400)
322 | p1 <- ggplot(nbrexons %>%
323 |                dplyr::filter(dataset %in% c("ONT-RNA001-HAP_ILMNjunc", "Illumina")) %>%
324 |                dplyr::group_by(dataset, class_code, nbr_exons) %>%
325 |                dplyr::tally() %>%
326 |                dplyr::ungroup() %>%
327 |                dplyr::mutate(nbr_exons = factor(nbr_exons, 
328 |                                                 levels = c(">2", "2", "1"))), 
329 |              aes(x = class_code, y = n, fill = nbr_exons)) + 
330 |   geom_bar(stat = "identity", position = "fill") + theme_bw() + 
331 |   facet_wrap(~ dataset, nrow = 1) +
332 |   xlab("Class code") + ylab("Fraction of transcripts") + 
333 |   scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"),
334 |                     name = "Number of\nexons") + 
335 |   theme(legend.position = "bottom") + 
336 |   scale_y_continuous(expand = c(0, 0, 0.05, 0))
337 | p2 <- ggplot(nbrexons %>%
338 |                dplyr::filter(dataset %in% c("ONT-RNA001-HAP_ILMNjunc", "Illumina")), 
339 |              aes(x = class_code, y = txLength)) + 
340 |   geom_boxplot() + theme_bw() + 
341 |   facet_wrap(~ dataset, nrow = 1) +
342 |   scale_y_log10() + 
343 |   xlab("Class code") + ylab("Inferred transcript length")
344 | cowplot::plot_grid(p0, p1, p2,
345 |                    ncol = 1, labels = c("A", "B", "C"), 
346 |                    rel_heights = c(1.9, 1.28, 1.1), align = "v", axis = "l")
347 | dev.off()
348 | 
349 | saveRDS(NULL, file = outrds)
350 | date()
351 | sessionInfo()


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_flair_summary.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(rtracklayer)
  8 |   library(dplyr)
  9 |   library(GenomicFeatures)
 10 |   library(ggplot2)
 11 |   library(grDevices)
 12 |   library(cowplot)
 13 |   library(grid)
 14 | })
 15 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 16 | 
 17 | datasets <- strsplit(datasets, ",")[[1]]
 18 | datasets <- c(datasets, "Illumina")
 19 | names(datasets) <- datasets
 20 | conditions <- strsplit(conditions, ",")[[1]]
 21 | 
 22 | if (all(c("wt", "srpk") %in% conditions)) {
 23 |   flaircond <- "_all"
 24 | } else if ("wt" %in% conditions) {
 25 |   flaircond <- "_WT"
 26 | } else {
 27 |   stop("Unknown conditions")
 28 | }
 29 | 
 30 | print(datasets)
 31 | print(conditions)
 32 | print(flaircond)
 33 | print(outrds)
 34 | 
 35 | muted <- c("#DC050C", "#E8601C", "#7BAFDE", "#1965B0", "#B17BA6",
 36 |            "#882E72", "#F1932D", "#F6C141", "#F7EE55", "#4EB265",
 37 |            "#CAEDAB", "#777777")
 38 | colfun <- grDevices::colorRampPalette(muted)
 39 | 
 40 | ## Read Illumina abundance estimates
 41 | ilmn_tx_abundances <- 
 42 |   readRDS("Illumina/output/Illumina_all_abundances.rds")$tx_abundances
 43 | ilmn_samples <- sample_annotation %>% 
 44 |   dplyr::filter(dataset == "Illumina" & condition %in% conditions) %>% 
 45 |   dplyr::pull("sample_orig")
 46 | ave_ilmn_tx_tpm <- 
 47 |   data.frame(transcript_id = rownames(ilmn_tx_abundances), 
 48 |              ave_tx_tpm = rowMeans(
 49 |                ilmn_tx_abundances[, paste0(ilmn_samples, "__tpm__salmon")]
 50 |              ),
 51 |              stringsAsFactors = FALSE)
 52 | 
 53 | ## ========================================================================== ##
 54 | ## Plot overlap of detected transcripts (class code = or c)
 55 | df0 <- do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) {
 56 |   read.delim(paste0(
 57 |     ds, "/", ifelse(ds == "Illumina", "stringtie_assembly", "flair"),
 58 |     "/", ds, flaircond, "/", ds, flaircond, 
 59 |     ifelse(ds == "Illumina", "_stringtie_assembly.gffcompare.",
 60 |            "_minimap_genome_s_primary_flair_collapse.isoforms.gffcompare."), 
 61 |     ds, flaircond, 
 62 |     ifelse(ds == "Illumina", "_stringtie_assembly.gtf.tmap", 
 63 |            "_minimap_genome_s_primary_flair_collapse.isoforms.gtf.tmap")), 
 64 |     header = TRUE, as.is = TRUE) %>%
 65 |     dplyr::filter(class_code %in% c("c", "=")) %>%
 66 |     dplyr::select(ref_id) %>%
 67 |     dplyr::rename(transcript_id = ref_id) %>%
 68 |     dplyr::distinct() %>%
 69 |     dplyr::mutate(dataset = remapds[ds],
 70 |                   identified = TRUE)
 71 | }))
 72 | dettx <- unique(df0$transcript_id)
 73 | detmat <- matrix(0, nrow = length(dettx), ncol = length(unique(df0$dataset)))
 74 | rownames(detmat) <- dettx
 75 | colnames(detmat) <- unique(df0$dataset)
 76 | detmat[as.matrix(df0 %>% dplyr::select(transcript_id, dataset))] <- 1
 77 | png(gsub("\\.rds$", "_shared_transcripts.png", outrds), 
 78 |     width = 10, height = 6, unit = "in", res = 400)
 79 | UpSetR::upset(data.frame(detmat, check.names = FALSE), order.by = "freq")
 80 | grid::grid.edit("arrange", name = "arrange2")
 81 | upsetplot <- grid::grid.grab()
 82 | dev.off()
 83 | 
 84 | ## ========================================================================== ##
 85 | ## Plot abundance of identified/unidentified transcripts 
 86 | ## (identified = class code = or c)
 87 | df1 <- df0 %>% 
 88 |   dplyr::full_join(
 89 |     do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) {
 90 |       ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = remapds[ds])
 91 |     })), 
 92 |     by = c("transcript_id", "dataset")
 93 |   ) %>%
 94 |   dplyr::mutate(identified = replace(identified, is.na(identified), FALSE))
 95 | 
 96 | png(gsub("\\.rds$", "_illumina_abundance_by_identification.png", outrds),
 97 |     width = 8, height = 4, unit = "in", res = 400)
 98 | ggplot(df1, aes(x = identified, y = ave_tx_tpm + 1)) + 
 99 |   geom_boxplot() + facet_wrap(~ dataset, nrow = 1) + theme_bw() + 
100 |   scale_y_log10() + xlab("Transcript identified by flair") + 
101 |   ylab("Average abundance across Illumina samples (TPM + 1)")
102 | dev.off()
103 | 
104 | ## Merge with upset plot
105 | df2 <- df1 %>% dplyr::select(-dataset) %>%
106 |   dplyr::group_by(transcript_id, ave_tx_tpm) %>%
107 |   dplyr::summarize(identified = any(identified))
108 | p1 <- ggplot(df2, aes(x = identified, y = ave_tx_tpm + 1)) + 
109 |   geom_boxplot() + theme_bw() + 
110 |   scale_y_log10() + xlab("Transcript identified by flair") + 
111 |   ylab("Average abundance across Illumina samples (TPM + 1)")
112 | png(gsub("\\.rds$", "_illumina_abundance_plus_shared_tx.png", outrds),
113 |     width = 12, height = 6, unit = "in", res = 400)
114 | print(cowplot::plot_grid(
115 |   upsetplot,
116 |   p1, 
117 |   rel_widths = c(4, 1.5), nrow = 1, labels = c("A", "B")
118 | ))
119 | dev.off()
120 | 
121 | ## ========================================================================== ##
122 | ## Get number of exons and read length per flair transcript and plot for
123 | ## different class codes
124 | nbrexons <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
125 |   tmp <- as.data.frame(rtracklayer::import(
126 |     paste0(ds, "/", ifelse(ds == "Illumina", "stringtie_assembly", "flair"),
127 |            "/", ds, flaircond, "/", ds, flaircond, 
128 |            ifelse(ds == "Illumina", "_stringtie_assembly.", 
129 |                   "_minimap_genome_s_primary_flair_collapse.isoforms."), 
130 |            "gffcompare.annotated.gtf")))
131 |   dplyr::full_join(
132 |     tmp %>% 
133 |       dplyr::filter(type == "exon") %>%
134 |       dplyr::group_by(transcript_id) %>%
135 |       dplyr::summarize(nbr_exons = length(type),
136 |                        txLength = sum(width)),
137 |     tmp %>%
138 |       dplyr::filter(type == "transcript") %>%
139 |       dplyr::select(transcript_id, class_code),
140 |     by = "transcript_id"
141 |   ) %>% dplyr::mutate(dataset = remapds[ds])
142 | })) %>%
143 |   dplyr::mutate(nbr_exons = replace(nbr_exons, nbr_exons > 2, ">2"))
144 | 
145 | ## ========================================================================== ##
146 | ## Plot distribution of class codes
147 | dfcc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
148 |   tmap <- read.delim(paste0(
149 |     ds, "/", ifelse(ds == "Illumina", "stringtie_assembly", "flair"), 
150 |     "/", ds, flaircond, "/", ds, flaircond, 
151 |     ifelse(ds == "Illumina", "_stringtie_assembly.gffcompare.", 
152 |            "_minimap_genome_s_primary_flair_collapse.isoforms.gffcompare."), 
153 |     ds, flaircond, ifelse(ds == "Illumina", "_stringtie_assembly.",
154 |                           "_minimap_genome_s_primary_flair_collapse.isoforms."),
155 |     "gtf.tmap"), 
156 |     header = TRUE, as.is = TRUE)
157 |   as.data.frame(table(tmap$class_code)) %>% 
158 |     dplyr::mutate(dataset = remapds[ds]) %>%
159 |     dplyr::rename(class_code = Var1) %>%
160 |     dplyr::mutate(class_code = as.character(class_code))
161 | })) %>%
162 |   dplyr::mutate(
163 |     class_code = replace(class_code, class_code == "=", 
164 |                          "complete, exact match of intron chain (=)"),
165 |     class_code = replace(class_code, class_code == "c", 
166 |                          "contained in reference (intron compatible) (c)"),
167 |     class_code = replace(class_code, class_code == "k", 
168 |                          "containment of reference (k)"),
169 |     class_code = replace(class_code, class_code == "m",
170 |                          "retained intron(s), full intron chain overlap (m)"),
171 |     class_code = replace(class_code, class_code == "n", 
172 |                          "retained intron(s), partial or no intron chain match (n)"),
173 |     class_code = replace(class_code, class_code == "j", 
174 |                          "multi-exon with at least one junction match (j)"),
175 |     class_code = replace(class_code, class_code == "e", 
176 |                          "single exon transfrag partially covering an intron (e)"),
177 |     class_code = replace(class_code, class_code == "o", 
178 |                          "other same strand overlap with reference exons (o)"),
179 |     class_code = replace(class_code, class_code == "s", 
180 |                          "intron matching on opposite strand (s)"),
181 |     class_code = replace(class_code, class_code == "x", 
182 |                          "exonic overlap on opposite strand (x)"),
183 |     class_code = replace(class_code, class_code == "i", 
184 |                          "fully contained within a reference intron (i)"),
185 |     class_code = replace(class_code, class_code == "y", 
186 |                          "contains a reference within its introns (y)"),
187 |     class_code = replace(class_code, class_code == "p", 
188 |                          "possible polymerase run-on (no overlap) (p)"),
189 |     class_code = replace(class_code, class_code == "r", "repeat (r)"),
190 |     class_code = replace(class_code, class_code == "u", "other (u)")) %>%
191 |   dplyr::mutate(class_code = factor(class_code, levels = c(
192 |     "other (u)",
193 |     "repeat (r)",
194 |     "possible polymerase run-on (no overlap) (p)",
195 |     "contains a reference within its introns (y)",
196 |     "fully contained within a reference intron (i)",
197 |     "exonic overlap on opposite strand (x)",
198 |     "intron matching on opposite strand (s)",
199 |     "other same strand overlap with reference exons (o)",
200 |     "single exon transfrag partially covering an intron (e)",
201 |     "multi-exon with at least one junction match (j)",
202 |     "retained intron(s), partial or no intron chain match (n)",
203 |     "retained intron(s), full intron chain overlap (m)",
204 |     "containment of reference (k)",
205 |     "contained in reference (intron compatible) (c)",
206 |     "complete, exact match of intron chain (=)"
207 |   )))
208 | 
209 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_classcode.png", outrds), 
210 |     width = 11, height = 11, unit = "in", res = 400)
211 | p0 <- ggplot(dfcc, aes(x = dataset, y = Freq)) + 
212 |   geom_bar(stat = "identity", position = "fill", aes(fill = class_code)) + theme_bw() + 
213 |   scale_fill_manual(values = structure(colfun(15), names = levels(dfcc$class_code)),
214 |                     name = "Overlap type (class code)") + 
215 |   ylab("Fraction of transcripts") + xlab("") + 
216 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
217 |   scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
218 |   geom_text(data = dfcc %>% dplyr::group_by(dataset) %>%
219 |               dplyr::summarize(nbrTx = sum(Freq)),
220 |             aes(x = dataset, y = 1, label = nbrTx), 
221 |             vjust = 0, nudge_y = 0.02, size = 3)
222 | p1 <- ggplot(nbrexons %>%
223 |                dplyr::group_by(dataset, class_code, nbr_exons) %>%
224 |                dplyr::tally() %>%
225 |                dplyr::ungroup() %>%
226 |                dplyr::mutate(nbr_exons = factor(nbr_exons, 
227 |                                                 levels = c(">2", "2", "1"))), 
228 |              aes(x = class_code, y = n, fill = nbr_exons)) + 
229 |   geom_bar(stat = "identity", position = "fill") + theme_bw() + 
230 |   facet_wrap(~ dataset, nrow = 1) +
231 |   xlab("Class code") + ylab("Fraction of transcripts") + 
232 |   scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"),
233 |                     name = "Number of\nexons") + 
234 |   theme(legend.position = "bottom") + 
235 |   scale_y_continuous(expand = c(0, 0, 0.05, 0))
236 | p2 <- ggplot(nbrexons, 
237 |              aes(x = class_code, y = txLength)) + 
238 |   geom_boxplot() + theme_bw() + 
239 |   facet_wrap(~ dataset, nrow = 1) +
240 |   scale_y_log10() + 
241 |   xlab("Class code") + ylab("Inferred transcript length")
242 | cowplot::plot_grid(p0, p1, p2, ncol = 1, labels = c("A", "B", "C"), 
243 |                    rel_heights = c(1.7, 1.08, 0.9), align = "v", axis = "l")
244 | dev.off()
245 | 
246 | 
247 | saveRDS(NULL, file = outrds)
248 | date()
249 | sessionInfo()


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_gc_content_genomenotxome.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | suppressPackageStartupMessages({
 7 |   library(dplyr)
 8 |   library(ggplot2)
 9 |   library(cowplot)
10 |   library(stringr)
11 |   library(Biostrings)
12 |   library(data.table)
13 | })
14 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
15 | 
16 | datasets <- strsplit(datasets, ",")[[1]]
17 | names(datasets) <- datasets
18 | conditions <- strsplit(conditions, ",")[[1]]
19 | 
20 | print(datasets)
21 | print(conditions)
22 | print(outrds)
23 | 
24 | fastqs <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
25 |   fastqdir <- paste0(ds, "/FASTQ", 
26 |                      ifelse(ds %in% c("RNA001", "HEK293RNA"), "dna", ""))
27 |   fastqfiles <- list.files(fastqdir, pattern = "(FASTQ|fastq)\\.gz$", 
28 |                            full.names = TRUE)
29 |   names(fastqfiles) <- gsub("\\.(FASTQ|fastq).gz", "", basename(fastqfiles))
30 |   fastqfiles <- fastqfiles[!grepl("_orig", names(fastqfiles))]
31 |   fastqfiles <- fastqfiles[names(fastqfiles) %in% 
32 |                              sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
33 |   fastqfiles
34 |   
35 |   genomenotxomedir <- paste0(ds, "/FASTQgenomenotxome")
36 |   textfiles <- list.files(genomenotxomedir, pattern = "_reads_aligning_to_genome_but_not_txome.txt",
37 |                           full.names = TRUE)
38 |   names(textfiles) <- gsub("_reads_aligning_to_genome_but_not_txome.txt", "", basename(textfiles))
39 |   textfiles <- textfiles[!grepl("_orig", names(textfiles))]
40 |   textfiles <- textfiles[names(textfiles) %in% 
41 |                            sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
42 |   textfiles
43 |   
44 |   stopifnot(names(fastqfiles) %in% names(textfiles),
45 |             names(textfiles) %in% names(fastqfiles))
46 |   textfiles <- textfiles[match(names(fastqfiles), names(textfiles))]
47 |   stopifnot(names(fastqfiles) == names(textfiles))
48 |   
49 |   do.call(dplyr::bind_rows, lapply(names(fastqfiles), function(nm) {
50 |     f <- fastqfiles[nm]
51 |     fastq <- fread(paste0("zcat ", f), sep = "\n", header = FALSE)$V1
52 |     seq_idxs <- seq(2, length(fastq), by = 4)
53 |     reads <- Biostrings::DNAStringSet(x = fastq[seq_idxs])
54 |     names(reads) <- sapply(strsplit(fastq[seq_idxs - 1], " "), .subset, 1)
55 |     gc <- letterFrequency(reads, "GC", as.prob = TRUE)
56 |     gnt <- read.delim(textfiles[nm], header = FALSE, as.is = TRUE)$V1
57 |     as.data.frame(gc) %>% dplyr::mutate(read = names(reads), 
58 |                                         sample = remap[nm],
59 |                                         dataset = remapds[ds]) %>%
60 |       dplyr::mutate(genomenotxome = gsub("^@", "", read) %in% gnt)
61 |   }))
62 | }))
63 | 
64 | fastqs <- fastqs %>% 
65 |   dplyr::mutate(readcat = c("Other reads", "Reads mapping to genome but not transcriptome")[genomenotxome + 1])
66 | print(table(fastqs$sample, fastqs$readcat))
67 | 
68 | png(gsub("rds$", "png", outrds), width = 6, height = 3, unit = "in", res = 400)
69 | ggplot(fastqs, aes(x = `G|C`, group = interaction(sample, readcat), color = readcat)) + 
70 |   geom_line(stat = "density", size = 1.5) + 
71 |   theme_bw() + 
72 |   scale_color_manual(values = c("red", "blue"), name = "") + 
73 |   xlab("GC content") + ylab("Density") + 
74 |   theme(legend.position = "bottom")
75 | dev.off()
76 | 
77 | saveRDS(NULL, file = outrds)
78 | date()
79 | sessionInfo()
80 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_illumina_nanopore_lengths_all_datasets.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(tximport)
  8 |   library(ggplot2)
  9 |   library(dplyr)
 10 |   library(ggridges)
 11 |   library(tibble)
 12 | })
 13 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 14 | 
 15 | datasets <- strsplit(datasets, ",")[[1]]
 16 | names(datasets) <- datasets
 17 | conditions <- strsplit(conditions, ",")[[1]]
 18 | 
 19 | print(datasets)
 20 | print(conditions)
 21 | print(tx2gene)
 22 | print(outrds)
 23 | 
 24 | tx2gene <- readRDS(tx2gene)
 25 | 
 26 | ## Read estimated abundances Illumina
 27 | files <- list.files("Illumina/salmon31", pattern = "quant.sf", 
 28 |                     recursive = TRUE, full.names = TRUE)
 29 | names(files) <- basename(dirname(files))
 30 | files <- files[names(files) %in% 
 31 |                  sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
 32 | salmon <- tximport(files = files, type = "salmon", txOut = TRUE)
 33 | 
 34 | salmon$length <- cbind(salmon$length, 
 35 |                        txLength = tx2gene$txlength[match(rownames(salmon$length), tx2gene$tx)])
 36 | stopifnot(all(rownames(salmon$length) == rownames(salmon$abundance)))
 37 | 
 38 | txlengths_illumina <- 
 39 |   do.call(dplyr::bind_rows, lapply(colnames(salmon$abundance), function(nm) {
 40 |     data.frame(tx_read_length = rep(salmon$length[, "txLength"], 
 41 |                                     round(salmon$abundance[, nm] * 10)),
 42 |                sample = remap[nm],
 43 |                dtype = "Illumina transcript lengths",
 44 |                dataset = "Illumina",
 45 |                stringsAsFactors = FALSE)
 46 |   }))
 47 | 
 48 | ## Nanopore read lengths
 49 | readInfo <- do.call(rbind, lapply(datasets, function(ds) {
 50 |   rd <- readRDS(paste0(ds, "/output/", ds, "_nbr_reads.rds"))
 51 |   rdf <- rd$fastqs
 52 |   rdf <- rdf[!grepl("_orig", names(rdf))]
 53 |   rdf <- rdf[names(rdf) %in% 
 54 |                sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
 55 |   do.call(dplyr::bind_rows, lapply(names(rdf), function(nm) {
 56 |     rdf[[nm]]$reads %>% dplyr::select(read, readLength) %>%
 57 |       dplyr::mutate(sample = remap[nm], dataset = remapds[ds]) %>%
 58 |       dplyr::left_join(
 59 |         rd$genomebams[[nm]]$allAlignments %>%
 60 |           dplyr::filter(flag %in% c(0, 16)) %>%
 61 |           dplyr::select(read, nbrSupplementaryAlignments) %>%
 62 |           dplyr::mutate(alignedGenome = TRUE),
 63 |         by = "read"
 64 |       ) %>%
 65 |       dplyr::select(-read) %>%
 66 |       dplyr::mutate(alignedGenome = replace(alignedGenome, is.na(alignedGenome), 
 67 |                                             FALSE))
 68 |   }))
 69 | })) %>%
 70 |   dplyr::rename(tx_read_length = readLength) %>%
 71 |   dplyr::mutate(dtype = "Nanopore read lengths")
 72 | 
 73 | readInfo <- readInfo %>%
 74 |   dplyr::mutate(category = "Unaligned") %>%
 75 |   dplyr::mutate(category = replace(
 76 |     category, alignedGenome & nbrSupplementaryAlignments == 0, 
 77 |     "Aligned, without supplementary alignments")) %>%
 78 |   dplyr::mutate(category = replace(
 79 |     category, alignedGenome & nbrSupplementaryAlignments > 0,
 80 |     "Aligned, with supplementary alignment(s)"))
 81 | 
 82 | png(gsub("\\.rds$", "_nanoporeread_density_byalignment.png", outrds), 
 83 |     width = 7, height = 5, unit = "in", res = 400)
 84 | print(ggplot(readInfo %>%
 85 |                dplyr::mutate(dataset = factor(dataset, levels = 
 86 |                                                 ds_order[ds_order %in% dataset])),
 87 |              aes(x = tx_read_length, color = category, group = category)) + 
 88 |         geom_line(stat = "density", size = 1.25) + theme_bw() + 
 89 |         scale_color_manual(values = c("Unaligned" = "lightblue", 
 90 |                                       "Aligned, without supplementary alignments" = "#E8601C",
 91 |                                       "Aligned, with supplementary alignment(s)" = "#7BAFDE"), 
 92 |                            name = "") + 
 93 |         scale_x_log10() + xlab("Read length") + 
 94 |         theme(legend.position = "bottom") + 
 95 |         facet_wrap(~ dataset))
 96 | dev.off()
 97 | 
 98 | ## Plot distribution of transcript lengths from Illumina (weighted by
 99 | ## abundance), overlay distribution of Nanopore read lengths
100 | plotdf <- dplyr::bind_rows(txlengths_illumina, readInfo %>% dplyr::select(-category)) %>%
101 |   dplyr::mutate(dataset = factor(dataset, levels = ds_order[ds_order %in% dataset]))
102 | ## Only aligned reads
103 | plotdfaligned <- dplyr::bind_rows(txlengths_illumina, 
104 |                                   readInfo %>% dplyr::filter(category != "Unaligned") %>%
105 |                                     dplyr::select(-category)) %>%
106 |   dplyr::mutate(dataset = factor(dataset, levels = ds_order[ds_order %in% dataset]))
107 | 
108 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_density_byds.png", outrds), 
109 |     width = 7, height = 5, unit = "in", res = 400)
110 | dbyds <- ggplot(plotdf,
111 |                 aes(x = tx_read_length, color = dataset, group = dataset)) + 
112 |   geom_line(stat = "density", size = 1.25) + theme_bw() + 
113 |   scale_x_log10() + xlab("Transcript/read length") + 
114 |   scale_color_manual(values = ds_colors, name = "")
115 | print(dbyds)
116 | dev.off()
117 | 
118 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_density_byds_onlyaligned.png", outrds), 
119 |     width = 7, height = 5, unit = "in", res = 400)
120 | dbydsaligned <- ggplot(plotdfaligned,
121 |                        aes(x = tx_read_length, color = dataset, group = dataset)) + 
122 |   geom_line(stat = "density", size = 1.25) + theme_bw() + 
123 |   scale_x_log10() + xlab("Transcript/read length") + 
124 |   scale_color_manual(values = ds_colors, name = "")
125 | print(dbydsaligned)
126 | dev.off()
127 | 
128 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_density_linear_byds.png", outrds), 
129 |     width = 7, height = 5, unit = "in", res = 400)
130 | dlinearbyds <- ggplot(plotdf,
131 |                       aes(x = tx_read_length, color = dataset, group = dataset)) + 
132 |   geom_line(stat = "density", size = 1.25) + theme_bw() + 
133 |   xlab("Transcript/read length") + xlim(0, 1.5e4) + 
134 |   scale_color_manual(values = ds_colors, name = "")
135 | print(dlinearbyds)
136 | dev.off()
137 | 
138 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_density_linear_byds_onlyaligned.png", outrds), 
139 |     width = 7, height = 5, unit = "in", res = 400)
140 | dlinearbydsaligned <- ggplot(plotdfaligned,
141 |                              aes(x = tx_read_length, color = dataset, group = dataset)) + 
142 |   geom_line(stat = "density", size = 1.25) + theme_bw() + 
143 |   xlab("Transcript/read length") + xlim(0, 1.5e4) + 
144 |   scale_color_manual(values = ds_colors, name = "")
145 | print(dlinearbydsaligned)
146 | dev.off()
147 | 
148 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_violin_byds.png", outrds), 
149 |     width = 7, height = 5, unit = "in", res = 400)
150 | dviolinbyds <- ggplot(plotdf,
151 |                       aes(x = dataset, y = tx_read_length,
152 |                           fill = dataset)) + 
153 |   geom_violin(alpha = 0.5) + theme_bw() + 
154 |   scale_y_log10() + ylab("Transcript/read length") + xlab("") + 
155 |   scale_fill_manual(values = ds_colors, name = "") + 
156 |   theme(legend.position = "none", 
157 |         axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
158 | print(dviolinbyds)
159 | dev.off()
160 | 
161 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_violin_byds_onlyaligned.png", outrds), 
162 |     width = 7, height = 5, unit = "in", res = 400)
163 | dviolinbydsaligned <- ggplot(plotdfaligned,
164 |                              aes(x = dataset, y = tx_read_length,
165 |                                  fill = dataset)) + 
166 |   geom_violin(alpha = 0.5) + theme_bw() + 
167 |   scale_y_log10() + ylab("Transcript/read length") + xlab("") + 
168 |   scale_fill_manual(values = ds_colors, name = "") + 
169 |   theme(legend.position = "none", 
170 |         axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
171 | print(dviolinbydsaligned)
172 | dev.off()
173 | 
174 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_violin_linear_byds.png", outrds), 
175 |     width = 7, height = 5, unit = "in", res = 400)
176 | dviolinlinearbyds <- ggplot(plotdf,
177 |                             aes(x = dataset, y = tx_read_length,
178 |                                 fill = dataset)) + 
179 |   geom_violin(alpha = 0.5) + theme_bw() + ylim(0, 1.5e4) + 
180 |   ylab("Transcript/read length") + xlab("") + 
181 |   scale_fill_manual(values = ds_colors, name = "") + 
182 |   theme(legend.position = "none", 
183 |         axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
184 | print(dviolinlinearbyds)
185 | dev.off()
186 | 
187 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_violin_linear_byds_onlyaligned.png", outrds), 
188 |     width = 7, height = 5, unit = "in", res = 400)
189 | dviolinlinearbydsaligned <- ggplot(plotdfaligned,
190 |                                    aes(x = dataset, y = tx_read_length,
191 |                                        fill = dataset)) + 
192 |   geom_violin(alpha = 0.5) + theme_bw() + ylim(0, 1.5e4) + 
193 |   ylab("Transcript/read length") + xlab("") + 
194 |   scale_fill_manual(values = ds_colors, name = "") + 
195 |   theme(legend.position = "none", 
196 |         axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
197 | print(dviolinlinearbydsaligned)
198 | dev.off()
199 | 
200 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_density_bysample.png", outrds), 
201 |     width = 7, height = 5, unit = "in", res = 400)
202 | ggplot(plotdf,
203 |        aes(x = tx_read_length, color = dataset, group = sample)) + 
204 |   geom_line(stat = "density", size = 1) + theme_bw() + 
205 |   scale_x_log10() + xlab("Transcript/read length") + 
206 |   scale_color_manual(values = ds_colors, name = "")
207 | dev.off()
208 | 
209 | png(gsub("\\.rds$", "_illuminatx_nanoporeread_ridge_byds.png", outrds), 
210 |     width = 7, height = 7, unit = "in", res = 400)
211 | ggplot(plotdf,
212 |        aes(y = dataset, x = tx_read_length, fill = dataset, 
213 |            color = dataset)) + 
214 |   geom_density_ridges(scale = 1.5) + theme_bw() + 
215 |   scale_x_log10() + xlab("Transcript/read length") + xlab("") + 
216 |   scale_fill_manual(values = ds_colors, name = "") + 
217 |   scale_color_manual(values = ds_colors, name = "")
218 | dev.off()
219 | 
220 | plots <- list(
221 |   illuminatx_nanoporeread_density_byds = dbyds,
222 |   illuminatx_nanoporeread_density_linear_byds = dlinearbyds,
223 |   illuminatx_nanoporeread_violin_byds = dviolinbyds,
224 |   illuminatx_nanoporeread_violin_linear_byds = dviolinlinearbyds,
225 |   illuminatx_nanoporeread_density_byds_aligned = dbydsaligned,
226 |   illuminatx_nanoporeread_density_linear_byds_aligned = dlinearbydsaligned,
227 |   illuminatx_nanoporeread_violin_byds_aligned = dviolinbydsaligned,
228 |   illuminatx_nanoporeread_violin_linear_byds_aligned = dviolinlinearbydsaligned
229 | )
230 | 
231 | saveRDS(plots, file = outrds)
232 | date()
233 | sessionInfo()


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_nbr_assigned_reads.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(dplyr)
  8 |   library(ggplot2)
  9 |   library(cowplot)
 10 |   library(stringr)
 11 | })
 12 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 13 | 
 14 | datasets <- strsplit(datasets, ",")[[1]]
 15 | names(datasets) <- datasets
 16 | conditions <- strsplit(conditions, ",")[[1]]
 17 | 
 18 | print(datasets)
 19 | print(conditions)
 20 | print(outrds)
 21 | 
 22 | ## Read info
 23 | readInfo <- lapply(datasets, function(ds) {
 24 |   rtbl <- read.delim(paste0(ds, "/output/", ds, "_nbr_reads.txt"), 
 25 |                      header = TRUE, as.is = TRUE)
 26 |   rtbl[!grepl("_orig", rtbl$sample), ] %>%
 27 |     dplyr::mutate(dataset = remapds[ds]) %>%
 28 |     dplyr::mutate(sample = remap[sample])
 29 | })
 30 | 
 31 | ## Number of reads
 32 | nReads <- do.call(dplyr::bind_rows, readInfo) %>%
 33 |   dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, condition),
 34 |                    by = c("sample" = "sample_remap")) %>%
 35 |   dplyr::filter(condition %in% conditions) %>%
 36 |   dplyr::select(-condition)
 37 | 
 38 | ## Abundance info
 39 | abundanceInfo <- lapply(datasets, function(ds) {
 40 |   readRDS(paste0(ds, "/output/", ds, "_all_abundances.rds"))
 41 | })
 42 | 
 43 | ## Keep only the desired abundance measures
 44 | abundanceInfo <- lapply(abundanceInfo, function(w) {
 45 |   w$gene_abundances <- 
 46 |     w$gene_abundances[, grep("count__salmon31|count__salmonminimap2_p0.99|count__wubminimap2|count__featurecountsminimap2primary", 
 47 |                              colnames(w$gene_abundances))]
 48 |   w
 49 | })
 50 | 
 51 | genelevel <- Reduce(function(...) dplyr::full_join(..., by = "gene"), 
 52 |                     lapply(abundanceInfo, function(ab) {
 53 |                       ab$gene_abundances %>% tibble::rownames_to_column("gene")
 54 |                     })) %>%
 55 |   tibble::column_to_rownames("gene")
 56 | totCounts <- 
 57 |   data.frame(totCount = colSums(genelevel), method = colnames(genelevel), 
 58 |              stringsAsFactors = FALSE) %>%
 59 |   tidyr::separate(method, into = c("sample", "type", "method"), sep = "__") %>%
 60 |   dplyr::mutate(sample = remap[sample]) %>% 
 61 |   dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, condition), 
 62 |                    by = c("sample" = "sample_remap")) %>%
 63 |   dplyr::filter(condition %in% conditions) %>% 
 64 |   dplyr::select(sample, method, totCount) %>%
 65 |   tidyr::spread(method, totCount)
 66 | 
 67 | colvec <- c(`Total number of reads` = "#777777",
 68 |             `Number of reads with a primary alignment to the genome` = "#E8601C",
 69 |             `Number of reads with a primary alignment to the transcriptome` = "#90C987",
 70 |             `Number of assigned reads, salmonminimap2_p0.99` = "#1965B0",
 71 |             `Number of assigned reads, fCminimap2primary` = "#882E72",
 72 |             `Number of assigned reads, salmon31` = "#DC050C",
 73 |             `Number of assigned reads, wubminimap2` = "#55A1B1")
 74 | 
 75 | nReadsData <- nReads %>% 
 76 |   dplyr::select(sample, dataset, nReads, nAlignedReadsGenome,
 77 |                 nAlignedReadsTxome) %>%
 78 |   dplyr::full_join(totCounts, by = "sample") %>%
 79 |   tidyr::gather(rtype, nReads, -sample, -dataset) %>%
 80 |   dplyr::mutate(
 81 |     rtype = replace(rtype, rtype == "nReads",
 82 |                     "Total number of reads"),
 83 |     rtype = replace(rtype, rtype == "nAlignedReadsGenome",
 84 |                     "Number of reads with a primary alignment to the genome"),
 85 |     rtype = replace(rtype, rtype == "nAlignedReadsTxome",
 86 |                     "Number of reads with a primary alignment to the transcriptome"),
 87 |     rtype = replace(rtype, rtype == "featurecountsminimap2primary",
 88 |                     "Number of assigned reads, fCminimap2primary"),
 89 |     rtype = replace(rtype, rtype == "salmon31",
 90 |                     "Number of assigned reads, salmon31"),
 91 |     rtype = replace(rtype, rtype == "salmonminimap2_p0.99",
 92 |                     "Number of assigned reads, salmonminimap2_p0.99"),
 93 |     rtype = replace(rtype, rtype == "wubminimap2",
 94 |                     "Number of assigned reads, wubminimap2")
 95 |   ) %>%
 96 |   dplyr::mutate(
 97 |     rtype = factor(rtype, levels = names(colvec)),
 98 |     dataset = factor(dataset, levels = ds_order[ds_order %in% dataset])
 99 |   )
100 | 
101 | png(gsub("\\.rds$", ".png", outrds), width = 16, height = 6, unit = "in", res = 400)
102 | print(ggplot(nReadsData %>% 
103 |                dplyr::mutate(nReads = nReads/1e6) %>%
104 |                dplyr::mutate(sample = removeDatasetFromSample(sample, dataset)),
105 |              aes(x = sample, y = nReads, fill = rtype)) + 
106 |         geom_bar(stat = "identity", position = "dodge") + theme_bw() + 
107 |         scale_y_continuous(expand = c(0, 0, 0.05, 0), 
108 |                            limits = c(0, max(nReadsData$nReads)/1e6)) + 
109 |         facet_grid(~ dataset, scales = "free_x", space = "free_x") + 
110 |         scale_fill_manual(values = colvec, name = "") +
111 |         theme(legend.position = "bottom",
112 |               strip.text = element_text(size = 8)) + 
113 |         guides(fill = guide_legend(nrow = 3, byrow = TRUE)) + 
114 |         xlab("") + ylab("Number of reads (Mio.)") + 
115 |         stat_summary(data = nReadsData %>% 
116 |                        dplyr::mutate(nReads = nReads/1e6) %>%
117 |                        dplyr::mutate(sample = removeDatasetFromSample(sample, dataset)) %>%
118 |                        dplyr::group_by(sample, dataset) %>% 
119 |                        dplyr::mutate(
120 |                          nReads = round(nReads/nReads[rtype == "Total number of reads"]*100)), 
121 |                      fun.data = function(x) {
122 |                        return(c(y = ifelse(x == 100, -100000, 1e-4), label = x))}, 
123 |                      geom = "text", alpha = 1, size = 2, vjust = -1, 
124 |                      position = position_dodge(width = 0.9))) + 
125 |   ylim(0, NA)
126 | 
127 | dev.off()
128 | 
129 | saveRDS(NULL, file = outrds)
130 | date()
131 | sessionInfo()
132 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_nbr_assigned_reads_biotype.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(dplyr)
  8 |   library(ggplot2)
  9 |   library(cowplot)
 10 |   library(stringr)
 11 | })
 12 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 13 | 
 14 | datasets <- strsplit(datasets, ",")[[1]]
 15 | datasets <- c(datasets, "Illumina")
 16 | names(datasets) <- datasets
 17 | conditions <- strsplit(conditions, ",")[[1]]
 18 | 
 19 | muted <- c("#DC050C","#E8601C","#7BAFDE","#1965B0","#B17BA6",
 20 |            "#882E72","#F1932D","#F6C141","#F7EE55","#4EB265",
 21 |            "#CAEDAB","#777777")
 22 | colfun <- grDevices::colorRampPalette(muted)
 23 | 
 24 | print(tx2gene)
 25 | print(datasets)
 26 | print(conditions)
 27 | print(outrds)
 28 | 
 29 | tx2gene <- readRDS(tx2gene)
 30 | 
 31 | ## Abundance info
 32 | abundanceInfo <- lapply(datasets, function(ds) {
 33 |   readRDS(paste0(ds, "/output/", ds, "_all_abundances.rds"))
 34 | })
 35 | 
 36 | ## Keep only the desired abundance measures
 37 | abundanceInfo <- lapply(abundanceInfo, function(w) {
 38 |   w$gene_abundances <- 
 39 |     w$gene_abundances[, grep("count__salmon31|count__salmonminimap2_p0.99|count__wubminimap2|count__featurecountsminimap2primary|tpm__salmon$|tpm__StringTie", 
 40 |                              colnames(w$gene_abundances))]
 41 |   w
 42 | })
 43 | 
 44 | genelevel <- Reduce(function(...) dplyr::full_join(..., by = "gene"), 
 45 |                     lapply(abundanceInfo, function(ab) {
 46 |                       ab$gene_abundances %>% tibble::rownames_to_column("gene")
 47 |                     })) %>%
 48 |   dplyr::left_join(tx2gene %>% dplyr::select(gene, gene_biotype) %>%
 49 |                      dplyr::mutate(gene = gsub("\\.[0-9]+$", "", gene)) %>%
 50 |                      dplyr::distinct(), by = "gene")
 51 | 
 52 | 
 53 | tmp <- genelevel %>% dplyr::select(-gene) %>% dplyr::group_by(gene_biotype) %>% 
 54 |   dplyr::summarize_at(vars(dplyr::matches("__count__|__tpm__")), sum) %>%
 55 |   dplyr::mutate_at(vars(dplyr::matches("__count__|__tpm__")), funs(./sum(.))) %>%
 56 |   tidyr::gather(method, proportion, -gene_biotype) %>%
 57 |   tidyr::separate(method, into = c("sample", "aType", "method"), sep = "__") %>%
 58 |   dplyr::mutate(method = gsub("featurecounts", "fC", method)) %>% 
 59 |   dplyr::mutate(sample = remap[sample]) %>%
 60 |   dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, dataset, condition),
 61 |                    by = c("sample" = "sample_remap")) %>%
 62 |   dplyr::filter(condition %in% conditions) %>%
 63 |   dplyr::group_by(gene_biotype) %>% 
 64 |   dplyr::mutate(maxFrac = max(proportion)) %>%
 65 |   dplyr::ungroup() %>% 
 66 |   dplyr::mutate(dataset2 = factor(dataset, levels = 
 67 |                                     ds_order[ds_order %in% dataset]))
 68 | 
 69 | methodcols <- c(salmon = "#882E72", StringTie = "#882E72", 
 70 |                 salmonminimap2_p0.99 = "#4EB265",
 71 |                 fCminimap2primary = "#4EB265", salmon31 = "#4EB265",
 72 |                 wubminimap2 = "#4EB265")
 73 | 
 74 | png(gsub("\\.rds$", "_biotype_prop_average.png", outrds), width = 10, 
 75 |     height = 15, unit = "in", res = 400)
 76 | p1 <- ggplot(tmp %>% 
 77 |                dplyr::mutate(dataset = replace(dataset, dataset != "Illumina", 
 78 |                                                "Nanopore")) %>% 
 79 |                dplyr::group_by(dataset, gene_biotype, method) %>%
 80 |                dplyr::summarize(proportion = mean(proportion),
 81 |                                 maxFrac = max(maxFrac)) %>%
 82 |                dplyr::filter(maxFrac >= 0.001) %>%
 83 |                dplyr::mutate(method = factor(
 84 |                  method, levels = c("fCminimap2primary", 
 85 |                                     "salmonminimap2_p0.99", "salmon31", 
 86 |                                     "wubminimap2", "salmon", 
 87 |                                     "StringTie"))),
 88 |              aes(x = method, y = proportion, color = method, fill = method)) + 
 89 |   geom_bar(stat = "identity") +
 90 |   theme_bw() + facet_wrap(~ gene_biotype, scales = "free_y") + 
 91 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
 92 |         legend.position = "none",
 93 |         strip.text = element_text(size = 7)) + 
 94 |   xlab("") + ylab("Average abundance proportion") + 
 95 |   scale_color_manual(values = methodcols, name = "") + 
 96 |   scale_fill_manual(values = methodcols, name = "")
 97 | 
 98 | g0 <- 
 99 |   ggplot(tmp %>% dplyr::filter(dataset != "Illumina") %>%
100 |            dplyr::mutate(
101 |              gene_biotype = replace(
102 |                gene_biotype, 
103 |                !(gene_biotype %in% c("lincRNA", "Mt_rRNA",
104 |                                      "processed_pseudogene", 
105 |                                      "protein_coding",
106 |                                      "antisense_RNA",
107 |                                      "Mt_tRNA",
108 |                                      "transcribed_processed_pseudogene")),
109 |                "other")) %>%
110 |            dplyr::arrange(dataset2, sample) %>%
111 |            dplyr::mutate(sample = factor(sample, levels = unique(sample))),
112 |          aes(x = method, y = proportion, group = gene_biotype, fill = gene_biotype)) + 
113 |   geom_bar(stat = "identity", position = "fill") + 
114 |   facet_wrap(~ sample, ncol = 4) + theme_bw() + 
115 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
116 |         legend.position = "bottom") + 
117 |   xlab("") + ylab("Abundance proportion") + 
118 |   scale_fill_manual(values = colfun(8), name = "")
119 | 
120 | g1 <- 
121 |   ggplot(tmp %>% dplyr::filter(dataset == "Illumina") %>%
122 |            dplyr::mutate(
123 |              gene_biotype = replace(
124 |                gene_biotype, 
125 |                !(gene_biotype %in% c("lincRNA", "Mt_rRNA",
126 |                                      "processed_pseudogene", 
127 |                                      "protein_coding",
128 |                                      "antisense_RNA",
129 |                                      "Mt_tRNA",
130 |                                      "transcribed_processed_pseudogene")),
131 |                "other")),
132 |          aes(x = method, y = proportion, group = gene_biotype, fill = gene_biotype)) + 
133 |   geom_bar(stat = "identity", position = "fill") + 
134 |   facet_wrap(~ sample, ncol = 4) + theme_bw() + 
135 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
136 |         legend.position = "bottom") + 
137 |   xlab("") + ylab("Abundance proportion") + 
138 |   scale_fill_manual(values = colfun(8), name = "")
139 | 
140 | cowplot::plot_grid(
141 |   p1,
142 |   cowplot::plot_grid(g0 + theme(legend.position = "none"), g1, ncol = 1,
143 |                      rel_heights = c(1, 0.45)),
144 |   ncol = 1, labels = c("A", "B"), rel_heights = c(0.8, 1))
145 | dev.off()
146 | 
147 | saveRDS(NULL, file = outrds)
148 | date()
149 | sessionInfo()
150 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_nbr_junctions_per_read_illumina.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(GenomicAlignments)
  8 |   library(dplyr)
  9 |   library(ggplot2)
 10 |   library(cowplot)
 11 |   library(stringr)
 12 | })
 13 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 14 | 
 15 | conditions <- strsplit(conditions, ",")[[1]]
 16 | 
 17 | print(conditions)
 18 | print(genomebamdir)
 19 | print(outrds)
 20 | 
 21 | readBam <- function(bamfile) {
 22 |   bf <- BamFile(bamfile, yieldSize = 5e6)
 23 |   bam <- readGAlignments(bf, use.names = TRUE,
 24 |                          param = ScanBamParam(tag = c("NM"),
 25 |                                               what = c("qname", "flag", "rname", 
 26 |                                                        "pos", "mapq")))
 27 |   ops <- GenomicAlignments::CIGAR_OPS
 28 |   wdths <- GenomicAlignments::explodeCigarOpLengths(cigar(bam), ops = ops)
 29 |   keep.ops <- GenomicAlignments::explodeCigarOps(cigar(bam), ops = ops)
 30 |   explodedcigars <- IRanges::CharacterList(relist(paste0(unlist(wdths), 
 31 |                                                          unlist(keep.ops)), wdths))
 32 |   for (opts in setdiff(GenomicAlignments::CIGAR_OPS, "=")) {
 33 |     mcols(bam)[[paste0("nbr", opts)]] <- 
 34 |       sapply(explodedcigars, function(cg) sum(as.numeric(gsub(paste0(opts, "$"), "", cg)), na.rm = TRUE))
 35 |   }
 36 |   mcols(bam)$readLength <- rowSums(as.matrix(mcols(bam)[, c("nbrS", "nbrH", "nbrM", "nbrI")]))
 37 |   bam
 38 | }
 39 | 
 40 | makeReadDf <- function(bam) {
 41 |   tmp <- data.frame(bam %>% setNames(NULL), stringsAsFactors = FALSE) %>%
 42 |     dplyr::rename(read = qname,
 43 |                   nbrJunctions = njunc) %>%
 44 |     dplyr::select(-cigar) %>%
 45 |     dplyr::filter(flag %in% c(83, 99, 339, 355)) %>% ## mapped in proper pair, first in pair
 46 |     dplyr::mutate(alignedLength = nbrM + nbrI) ## equivalent to readLength-nbrS-nbrH
 47 |   
 48 |   tmp2 <- as.data.frame(table(names(subset(bam, flag %in% c(83, 99)))))
 49 |   if (nrow(tmp2) == 0) tmp2 <- data.frame(Var1 = tmp$read[1], Freq = 0)
 50 |   tmp <- tmp %>% 
 51 |     dplyr::left_join(tmp2 %>% dplyr::rename(read = Var1, nbrPrimaryAlignments = Freq))
 52 |   
 53 |   tmp3 <- as.data.frame(table(names(subset(bam, flag %in% c(339, 355)))))
 54 |   if (nrow(tmp3) == 0) tmp3 <- data.frame(Var1 = tmp$read[1], Freq = 0)
 55 |   tmp <- tmp %>% 
 56 |     dplyr::left_join(tmp3 %>% dplyr::rename(read = Var1, nbrSecondaryAlignments = Freq))
 57 |   
 58 |   tmp %>% dplyr::mutate(nbrSecondaryAlignments = replace(nbrSecondaryAlignments, 
 59 |                                                          is.na(nbrSecondaryAlignments), 0))
 60 | }
 61 | 
 62 | bamfiles <- list.files(genomebamdir, pattern = "_Aligned.sortedByCoord.out.bam$", 
 63 |                        recursive = TRUE, full.names = TRUE)
 64 | names(bamfiles) <- gsub("_Aligned.sortedByCoord.out.bam", "", basename(bamfiles))
 65 | bamfiles <- bamfiles[names(bamfiles) %in% 
 66 |                        sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
 67 | bamfiles
 68 | genomebams <- lapply(bamfiles, function(f) {
 69 |   bam <- readBam(f)
 70 |   tmp <- makeReadDf(bam)
 71 |   list(allAlignments = tmp)
 72 | })
 73 | for (n in names(genomebams)) {
 74 |   genomebams[[n]][["allAlignments"]]$sample <- n
 75 | }
 76 | 
 77 | genomebams <- do.call(dplyr::bind_rows, lapply(names(genomebams), function(nm) {
 78 |   genomebams[[nm]]$allAlignments %>% 
 79 |     dplyr::mutate(sample = remap[nm], dataset = "Illumina", 
 80 |                   rtype = "Reads aligning to the genome") %>%
 81 |     dplyr::mutate(fracM = nbrM/readLength,
 82 |                   fracS = nbrS/readLength,
 83 |                   fracI = nbrI/readLength)
 84 | }))
 85 | 
 86 | djunc <- genomebams %>% dplyr::filter(flag %in% c(83, 99)) %>%
 87 |   dplyr::mutate(nbrJunctions = replace(nbrJunctions, nbrJunctions > 20, ">20")) %>%
 88 |   dplyr::mutate(nbrJunctions = factor(nbrJunctions, 
 89 |                                       levels = c(as.character(0:20), ">20"))) %>%
 90 |   dplyr::group_by(sample, dataset, nbrJunctions) %>%
 91 |   dplyr::tally() %>%
 92 |   dplyr::group_by(sample) %>%
 93 |   dplyr::mutate(n = n/sum(n))
 94 | 
 95 | png(gsub("\\.rds$", "_njunc_distribution_bysample.png", outrds),
 96 |     width = 12, height = 8.5, unit = "in", res = 400)
 97 | print(ggplot(djunc,
 98 |              aes(x = nbrJunctions, y = n, fill = dataset, color = dataset)) +
 99 |         geom_bar(stat = "identity", position = "dodge") +
100 |         theme_bw() + facet_wrap(~ sample) + 
101 |         theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), 
102 |               legend.position = "none") +
103 |         scale_color_manual(values = ds_colors, name = "") + 
104 |         scale_fill_manual(values = ds_colors, name = "") + 
105 |         scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
106 |         xlab("Number of spanned junctions") + ylab("Fraction of reads"))
107 | dev.off()
108 | 
109 | png(gsub("\\.rds$", "_njunc_distribution.png", outrds),
110 |     width = 8, height = 6, unit = "in", res = 400)
111 | print(ggplot(djunc, aes(x = nbrJunctions, y = n), fill = "#B3B3B3", color = "#B3B3B3") +
112 |         geom_bar(stat = "identity", position = "dodge") +
113 |         theme_bw() + 
114 |         theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), 
115 |               legend.position = "none") +
116 |         scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
117 |         xlab("Number of spanned junctions") + ylab("Fraction of reads"))
118 | dev.off()
119 | 
120 | saveRDS(djunc, file = outrds)
121 | date()
122 | sessionInfo()
123 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_nbr_reads.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(plyr)
  8 |   library(dplyr)
  9 |   library(ggplot2)
 10 |   library(cowplot)
 11 |   library(stringr)
 12 | })
 13 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 14 | 
 15 | datasets <- strsplit(datasets, ",")[[1]]
 16 | names(datasets) <- datasets
 17 | conditions <- strsplit(conditions, ",")[[1]]
 18 | 
 19 | print(datasets)
 20 | print(conditions)
 21 | print(outrds)
 22 | 
 23 | ## ========================================================================== ##
 24 | ## Total number of reads
 25 | ## ========================================================================== ##
 26 | ## Read information about the number of reads
 27 | reads <- lapply(datasets, function(ds) {
 28 |   read.delim(paste0(ds, "/output/", ds, "_nbr_reads.txt"),
 29 |              header = TRUE, as.is = TRUE) %>%
 30 |     dplyr::filter(!grepl("_orig", sample)) %>%
 31 |     dplyr::mutate(sample = remap[sample],
 32 |                   dataset = remapds[ds]) %>%
 33 |     dplyr::select(sample, dataset, nReads)
 34 | })
 35 | 
 36 | ## Put everything together
 37 | reads <- do.call(dplyr::bind_rows, reads) %>% 
 38 |   dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, condition),
 39 |                    by = c("sample" = "sample_remap")) %>%
 40 |   dplyr::filter(condition %in% conditions) %>% 
 41 |   dplyr::mutate(
 42 |     dataset = factor(dataset, levels = ds_order[ds_order %in% dataset])
 43 |   )
 44 | 
 45 | ## Write total number of reads to text file
 46 | write.table(reads, file = gsub("\\.rds$", "_nbrreads.txt", outrds),
 47 |             quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
 48 | 
 49 | ## Plot total number of reads
 50 | ptot <- ggplot(
 51 |   reads %>% dplyr::mutate(sample = removeDatasetFromSample(sample, dataset)), 
 52 |   aes(x = sample, y = nReads/1e6, fill = dataset)) + 
 53 |   geom_bar(stat = "identity", position = "dodge") + 
 54 |   scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
 55 |   facet_grid(~ dataset, scales = "free_x", space = "free_x") + theme_bw() + 
 56 |   scale_fill_manual(values = ds_colors, name = "") + 
 57 |   theme(legend.position = "none",
 58 |         strip.text = element_text(size = 8)) + 
 59 |   xlab("") + ylab("Total number of reads (Mio.)")
 60 | 
 61 | ## ========================================================================== ##
 62 | ## Read length
 63 | ## ========================================================================== ##
 64 | ## Read information about read lengths
 65 | readInfo <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
 66 |   rd <- readRDS(paste0(ds, "/output/", ds, "_nbr_reads.rds"))
 67 |   rdf <- rd$fastqs
 68 |   rdf <- rdf[!grepl("_orig", names(rdf))]
 69 |   rdf <- rdf[names(rdf) %in% 
 70 |                sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
 71 |   do.call(dplyr::bind_rows, lapply(names(rdf), function(nm) {
 72 |     rdf[[nm]]$reads %>% dplyr::select(readLength, aveBaseQuality) %>%
 73 |       dplyr::mutate(sample = remap[nm], dataset = remapds[ds])
 74 |   }))
 75 | })) %>% 
 76 |   dplyr::mutate(
 77 |     dataset = factor(dataset, levels = ds_order[ds_order %in% dataset])
 78 |   )
 79 | 
 80 | ## Write summary information about read lengths to text file
 81 | write.table(
 82 |   readInfo %>% dplyr::group_by(sample) %>%
 83 |     dplyr::summarize(mean_readlength = mean(readLength),
 84 |                      median_readlength = median(readLength),
 85 |                      min_readlength = min(readLength),
 86 |                      max_readlength = max(readLength),
 87 |                      nbr_reads_longer_than_15000 = length(which(readLength > 1.5e4))),
 88 |   file = gsub("\\.rds$", "_readlengths.txt", outrds),
 89 |   quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t"
 90 | )
 91 | 
 92 | ## -------------------------------------------------------------------------- ##
 93 | ## Plot read lengths
 94 | plength <- ggplot(readInfo %>% 
 95 |                     dplyr::mutate(replicate = sapply(strsplit(sample, "_"), 
 96 |                                                      .subset, 3)), 
 97 |                   aes(x = readLength, group = sample, 
 98 |                       color = dataset, alpha = as.factor(replicate))) + 
 99 |   geom_line(stat = "density", size = 1) + scale_x_log10() + 
100 |   facet_wrap(~ dataset, nrow = 2, scales = "free_y") + theme_bw() + 
101 |   xlab("Read length") + ylab("Density") + 
102 |   scale_color_manual(values = ds_colors, name = "") + 
103 |   scale_alpha_manual(values = c(1, 0.4, 0.76, 0.52, 0.88, 0.64)) + 
104 |   theme(legend.position = "none")
105 | 
106 | ## Plot read lengths, sqrt-transformed
107 | plengthsqrt <- ggplot(readInfo %>% 
108 |                         dplyr::mutate(replicate = sapply(strsplit(sample, "_"), 
109 |                                                          .subset, 3)), 
110 |                       aes(x = readLength, group = sample, 
111 |                           color = dataset, alpha = as.factor(replicate))) + 
112 |   geom_line(stat = "density", size = 1) + scale_x_sqrt(limits = c(0, 1.5e4)) + 
113 |   facet_wrap(~ dataset, nrow = 2, scales = "free_y") + theme_bw() + 
114 |   xlab("Read length") + ylab("Density") + 
115 |   scale_color_manual(values = ds_colors, name = "") + 
116 |   scale_alpha_manual(values = c(1, 0.4, 0.76, 0.52, 0.88, 0.64)) + 
117 |   theme(legend.position = "none")
118 | 
119 | ## Same plot but with x-axis on linear scale, and cut at 1.5e4
120 | plengthlin <- ggplot(readInfo %>% 
121 |                        dplyr::mutate(replicate = sapply(strsplit(sample, "_"), 
122 |                                                         .subset, 3)), 
123 |                      aes(x = readLength, group = sample, 
124 |                          color = dataset, alpha = as.factor(replicate))) + 
125 |   geom_line(stat = "density", size = 1) + 
126 |   xlim(0, 1.5e4) + 
127 |   facet_wrap(~ dataset, nrow = 2, scales = "free_y") + theme_bw() + 
128 |   xlab("Read length") + ylab("Density") + 
129 |   scale_color_manual(values = ds_colors, name = "") + 
130 |   scale_alpha_manual(values = c(1, 0.4, 0.76, 0.52, 0.88, 0.64)) + 
131 |   theme(legend.position = "none")
132 | 
133 | ## -------------------------------------------------------------------------- ##
134 | ## Plot mean quality distribution
135 | pqual <- ggplot(readInfo %>% 
136 |                   dplyr::mutate(replicate = sapply(strsplit(sample, "_"), 
137 |                                                    .subset, 3)), 
138 |                 aes(x = aveBaseQuality, group = sample, 
139 |                     color = dataset, alpha = as.factor(replicate))) + 
140 |   geom_line(stat = "density", size = 1) + 
141 |   facet_wrap(~ dataset, nrow = 2, scales = "free_y") + theme_bw() + 
142 |   xlab("Average base quality per read") + ylab("Density") + 
143 |   scale_color_manual(values = ds_colors, name = "") + 
144 |   scale_alpha_manual(values = c(1, 0.4, 0.76, 0.52, 0.88, 0.64)) + 
145 |   theme(legend.position = "none")
146 | 
147 | ## -------------------------------------------------------------------------- ##
148 | ## Plot read length vs mean quality, for each sample
149 | plengthqual <- 
150 |   ggplot(readInfo %>% dplyr::arrange(dataset, sample) %>%
151 |            dplyr::mutate(sample = factor(sample, levels = unique(sample))), 
152 |          aes(x = readLength, y = aveBaseQuality)) + 
153 |   geom_hex(bins = 100, aes(fill = stat(density))) + scale_x_log10() + 
154 |   scale_fill_gradient(name = "", low = "bisque2", high = "darkblue") + 
155 |   facet_wrap(~ sample) + theme_bw() + 
156 |   xlab("Read length") + ylab("Average base quality")
157 | 
158 | ## Same plot but with x-axis on linear scale, and cut at 1.5e4
159 | plengthquallin <- 
160 |   ggplot(readInfo %>% dplyr::arrange(dataset, sample) %>%
161 |            dplyr::mutate(sample = factor(sample, levels = unique(sample))), 
162 |          aes(x = readLength, y = aveBaseQuality)) + 
163 |   geom_hex(bins = 100, aes(fill = stat(density))) + 
164 |   xlim(0, 1.5e4) + 
165 |   scale_fill_gradient(name = "", low = "bisque2", high = "darkblue") + 
166 |   facet_wrap(~ sample) + theme_bw() + 
167 |   xlab("Read length") + ylab("Average base quality")
168 | 
169 | ## Plot read length vs mean quality, for each sample, sqrt-transformed
170 | plengthqualsqrt <- 
171 |   ggplot(readInfo %>% dplyr::arrange(dataset, sample) %>%
172 |            dplyr::mutate(sample = factor(sample, levels = unique(sample))), 
173 |          aes(x = readLength, y = aveBaseQuality)) + 
174 |   geom_hex(bins = 100, aes(fill = stat(density))) + scale_x_sqrt(limits = c(0, 1.5e4)) + 
175 |   scale_fill_gradient(name = "", low = "bisque2", high = "darkblue") + 
176 |   facet_wrap(~ sample) + theme_bw() + 
177 |   xlab("Read length") + ylab("Average base quality")
178 | 
179 | ## -------------------------------------------------------------------------- ##
180 | ## Make final plots
181 | png(gsub("\\.rds$", "_nbrreads.png", outrds), width = 12, 
182 |     height = 8, unit = "in", res = 400)
183 | print(cowplot::plot_grid(
184 |   ptot,
185 |   cowplot::plot_grid(plength, pqual, nrow = 1, 
186 |                      rel_widths = c(1, 1), labels = c("B", "C")),
187 |   ncol = 1, rel_heights = c(1, 1), labels = c("A", ""))
188 | )
189 | dev.off()
190 | 
191 | png(gsub("\\.rds$", "_nbrreads_sqrt.png", outrds), width = 12, 
192 |     height = 8, unit = "in", res = 400)
193 | print(cowplot::plot_grid(
194 |   ptot,
195 |   cowplot::plot_grid(plengthsqrt, pqual, nrow = 1, 
196 |                      rel_widths = c(1, 1), labels = c("B", "C")),
197 |   ncol = 1, rel_heights = c(1, 1), labels = c("A", ""))
198 | )
199 | dev.off()
200 | 
201 | png(gsub("\\.rds$", "_nbrreads_linear.png", outrds), width = 12, 
202 |     height = 8, unit = "in", res = 400)
203 | print(cowplot::plot_grid(
204 |   ptot,
205 |   cowplot::plot_grid(plengthlin, pqual, nrow = 1, 
206 |                      rel_widths = c(1, 1), labels = c("B", "C")),
207 |   ncol = 1, rel_heights = c(1, 1), labels = c("A", ""))
208 | )
209 | dev.off()
210 | 
211 | png(gsub("\\.rds$", "_readlength_vs_quality.png", outrds), width = 11, 
212 |     height = 11, unit = "in", res = 400)
213 | print(plengthqual)
214 | dev.off()
215 | 
216 | png(gsub("\\.rds$", "_readlength_vs_quality_sqrt.png", outrds), width = 11, 
217 |     height = 11, unit = "in", res = 400)
218 | print(plengthqualsqrt)
219 | dev.off()
220 | 
221 | png(gsub("\\.rds$", "_readlength_vs_quality_linear.png", outrds), width = 11, 
222 |     height = 11, unit = "in", res = 400)
223 | print(plengthquallin)
224 | dev.off()
225 | 
226 | ## -------------------------------------------------------------------------- ##
227 | saveRDS(NULL, file = outrds)
228 | date()
229 | sessionInfo()
230 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_nbr_tx_in_eqclass.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | suppressPackageStartupMessages({
 7 |   library(dplyr)
 8 |   library(ggplot2)
 9 |   library(stringr)
10 | })
11 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
12 | 
13 | datasets <- strsplit(datasets, ",")[[1]]
14 | datasets <- c(datasets, "Illumina")
15 | conditions <- strsplit(conditions, ",")[[1]]
16 | 
17 | print(datasets)
18 | print(conditions)
19 | print(outrds)
20 | 
21 | salmonfiles <- unlist(lapply(datasets, function(ds) {
22 |   lf <- list.files(file.path(ds, "salmon31"), pattern = "eq_classes.txt",
23 |                    full.names = TRUE, recursive = TRUE)
24 |   names(lf) <- remap[basename(gsub("/aux_info/eq_classes.txt$", "", lf))]
25 |   lf
26 | }))
27 | salmonfiles <- salmonfiles[names(salmonfiles) %in% 
28 |                              sample_annotation$sample_remap[sample_annotation$condition %in% conditions]]
29 | salmonfiles
30 | 
31 | eqcl <- lapply(salmonfiles, function(f) {
32 |   x <- readLines(f)
33 |   n_tr <- as.numeric(x[1])  ## Total number of transcripts
34 |   n_eq <- as.numeric(x[2])  ## Total number of equivalence classes
35 |   tx_id <- x[3:(n_tr + 2)]  ## Transcript IDs
36 |   quants <- x[(n_tr + 3):length(x)]  ## Characteristics of equivalence classes
37 |   
38 |   ## Split equivalence class characteristics. Each element of the list corresponds
39 |   ## to one equivalence class, and lists its number of transcripts, the
40 |   ## transcripts IDs and the total number of reads
41 |   do.call(dplyr::bind_rows, lapply(quants, function(w) {
42 |     tmp = strsplit(w, "\\\t")[[1]]
43 |     nbr_tx = as.numeric(tmp[1])
44 |     data.frame(nbr_tx = rep(nbr_tx, as.numeric(tmp[length(tmp)])),
45 |                stringsAsFactors = FALSE)
46 |   }))
47 | })
48 | 
49 | for (nm in names(eqcl)) {
50 |   eqcl[[nm]]$sample <- nm
51 |   eqcl[[nm]]$dataset <- strsplit(nm, "_")[[1]][1]
52 | }
53 | 
54 | eqcl <- do.call(dplyr::bind_rows, eqcl) %>%
55 |   dplyr::mutate(dataset = factor(dataset, levels = ds_order[ds_order %in% dataset]))
56 | 
57 | png(gsub("\\.rds$", ".png", outrds), height = 12, width = 16, 
58 |     unit = "in", res = 400)
59 | p1 <- ggplot(eqcl %>% 
60 |                dplyr::mutate(sample = removeDatasetFromSample(sample, dataset)), 
61 |              aes(x = sample, y = nbr_tx, fill = dataset)) + 
62 |   geom_boxplot() + theme_bw() + xlab("") + 
63 |   ylab("Number of transcripts in equivalence class") + 
64 |   theme(legend.position = "none") + 
65 |   facet_grid(~ dataset, scales = "free_x", space = "free_x") + 
66 |   scale_fill_manual(values = ds_colors, name = "")
67 | p2 <- p1 + 
68 |   stat_summary(fun.y = mean, geom = "point", shape = 18, 
69 |                size = 4, color = "black", fill = "black") +
70 |   coord_cartesian(ylim = c(0, 15))
71 | cowplot::plot_grid(p1 + ggtitle("Full range"), 
72 |                    p2 + ggtitle("Zoomed in"), 
73 |                    ncol = 1, labels = c("A", "B"), rel_heights = c(1, 1))
74 | dev.off()
75 | 
76 | ## By dataset
77 | p1ds <- ggplot(eqcl, 
78 |                aes(x = dataset, y = nbr_tx, fill = dataset)) + 
79 |   geom_boxplot() + theme_bw() + xlab("") + 
80 |   ylab("Number of transcripts in equivalence class") + 
81 |   theme(legend.position = "none",
82 |         axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
83 |   scale_fill_manual(values = ds_colors, name = "")
84 | p2ds <- p1ds + 
85 |   stat_summary(fun.y = mean, geom = "point", shape = 18, 
86 |                size = 4, color = "black", fill = "black") +
87 |   coord_cartesian(ylim = c(0, 15))
88 | 
89 | 
90 | saveRDS(list(ptxeqfull = p1 + ggtitle("Full range"), 
91 |              ptxeqzoom = p2 + ggtitle("Zoomed in"),
92 |              ptxeqfullds = p1ds + ggtitle("Full range"), 
93 |              ptxeqzoomds = p2ds + ggtitle("Zoomed in")), file = outrds)
94 | date()
95 | sessionInfo()


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_palindromes.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(dplyr)
  8 |   library(ggplot2)
  9 |   library(cowplot)
 10 |   library(stringr)
 11 |   library(Biostrings)
 12 |   library(data.table)
 13 |   library(parallel)
 14 |   library(BiocParallel)
 15 | })
 16 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 17 | 
 18 | datasets <- strsplit(datasets, ",")[[1]]
 19 | names(datasets) <- datasets
 20 | conditions <- strsplit(conditions, ",")[[1]]
 21 | 
 22 | print(datasets)
 23 | print(conditions)
 24 | print(nreads)
 25 | print(ncores)
 26 | print(outrds)
 27 | 
 28 | ## Information about the reads
 29 | readInfo <- lapply(datasets, function(ds) {
 30 |   rd <- readRDS(paste0(ds, "/output/", ds, "_nbr_reads.rds"))
 31 |   
 32 |   rdf <- rd$fastqs
 33 |   rdf <- rdf[!grepl("_orig", names(rdf))]
 34 |   rdf <- rdf[names(rdf) %in% 
 35 |                sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
 36 |   rdf <- do.call(dplyr::bind_rows, lapply(names(rdf), function(nm) {
 37 |     rdf[[nm]]$reads %>% 
 38 |       dplyr::mutate(sample = remap[nm], dataset = remapds[ds], rtype = "All reads")
 39 |   }))
 40 |   
 41 |   rdg <- rd$genomebams
 42 |   rdg <- rdg[!grepl("_orig", names(rdg))]
 43 |   rdg <- rdg[names(rdg) %in% 
 44 |                sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
 45 |   rdg <- do.call(dplyr::bind_rows, lapply(names(rdg), function(nm) {
 46 |     rdg[[nm]]$allAlignments %>% 
 47 |       dplyr::mutate(sample = remap[nm], dataset = remapds[ds], 
 48 |                     rtype = "Reads aligning to the genome") %>%
 49 |       dplyr::mutate(fracM = nbrM/readLength,
 50 |                     fracS = nbrS/readLength,
 51 |                     fracI = nbrI/readLength,
 52 |                     accuracy = (nbrM - NM + nbrI + nbrD)/(nbrM + nbrI + nbrD))
 53 |   }))
 54 |   
 55 |   list(rdf = rdf, rdg = rdg)
 56 | })
 57 | 
 58 | ## All reads
 59 | allReads <- do.call(dplyr::bind_rows, lapply(readInfo, function(x) x$rdf))
 60 | 
 61 | ## Genome alignments
 62 | gAlign <- do.call(dplyr::bind_rows, lapply(readInfo, function(x) x$rdg))
 63 | 
 64 | ## Add information about genome alignments to the data frame with all reads
 65 | allReads <- allReads %>%
 66 |   dplyr::left_join(gAlign %>% dplyr::filter(flag %in% c(0, 16)) %>%
 67 |                      dplyr::select(read, sample, dataset, NM, nbrM, nbrS, 
 68 |                                    nbrD, nbrI, alignedLength, 
 69 |                                    nbrSecondaryAlignments, 
 70 |                                    nbrSupplementaryAlignments, fracM, fracS),
 71 |                    by = c("sample", "dataset", "read")) %>%
 72 |   dplyr::mutate(aligned = ifelse(is.na(alignedLength), "Not aligned", 
 73 |                                  "Aligned"))
 74 | 
 75 | ## Get palindrome information for a subset of reads for each sample
 76 | fastqs <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
 77 |   fastqdir <- paste0(ds, "/FASTQ", 
 78 |                      ifelse(ds %in% c("RNA001", "HEK293RNA"), "dna", ""))
 79 |   fastqfiles <- list.files(fastqdir, pattern = "(FASTQ|fastq)\\.gz$", 
 80 |                            full.names = TRUE)
 81 |   names(fastqfiles) <- gsub("\\.(FASTQ|fastq).gz", "", basename(fastqfiles))
 82 |   fastqfiles <- fastqfiles[!grepl("_orig", names(fastqfiles))]
 83 |   fastqfiles <- fastqfiles[names(fastqfiles) %in% 
 84 |                              sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
 85 |   fastqfiles
 86 |   
 87 |   do.call(dplyr::bind_rows, lapply(names(fastqfiles), function(nm) {
 88 |     f <- fastqfiles[nm]
 89 |     fastq <- fread(paste0("zcat ", f), sep = "\n", header = FALSE)$V1
 90 |     seq_idxs <- seq(2, length(fastq), by = 4)
 91 |     reads <- Biostrings::DNAStringSet(x = fastq[seq_idxs])
 92 |     names(reads) <- sapply(strsplit(fastq[seq_idxs - 1], " "), .subset, 1)
 93 |     print(nm)
 94 |     print(table(width(reads) < 30000))
 95 |     reads <- reads[width(reads) < 30000]
 96 |     set.seed(1)
 97 |     reads <- reads[sample(seq_along(reads), nreads, replace = FALSE)]
 98 |     readlengths <- width(reads)
 99 |     readnames <- names(reads)
100 |     do.call(dplyr::bind_rows, bplapply(seq_along(reads), function(i) {
101 |       fp <- Biostrings::findPalindromes(reads[[i]], max.looplength = readlengths[i], 
102 |                                         min.armlength = 10)
103 |       armlength <- Biostrings::palindromeArmLength(fp)
104 |       totlength <- width(fp)
105 |       w <- which.max(armlength)
106 |       
107 |       ## Get the total length of the read that is covered by palindromic sequence
108 |       dnatmp <- reduce(IRanges(start = start(fp), 
109 |                                width = armlength))
110 |       totalPalindromeLength <- 2 * sum(width(dnatmp))
111 |       tryCatch({
112 |         data.frame(
113 |           read = gsub("^@", "", readnames[i]),
114 |           readLength = readlengths[i],
115 |           maxArmLengthPalindrome = armlength[w],
116 |           totalPalindromeLength = totlength[w],
117 |           longestPalindromeArm = Biostrings::palindromeLeftArm(fp[w]),
118 |           stringsAsFactors = FALSE
119 |         )}, error = function(e) {
120 |           data.frame(read = gsub("^@", "", readnames[i]),
121 |                      readLength = readlengths[i],
122 |                      maxArmLengthPalindrome = NA,
123 |                      totalPalindromeLength = NA, 
124 |                      longestPalindromeArm = "",
125 |                      stringsAsFactors = FALSE)
126 |         })
127 |     }, BPPARAM = MulticoreParam(workers = ncores))) %>% 
128 |       dplyr::mutate(sample = remap[nm], 
129 |                     dataset = remapds[ds])
130 |   }))
131 | }))
132 | 
133 | df <- fastqs %>%
134 |   dplyr::left_join(allReads, 
135 |                    by = c("dataset", "sample", "read", "readLength")) %>%
136 |   dplyr::mutate(
137 |     maxArmLengthPalindrome = replace(maxArmLengthPalindrome, 
138 |                                      is.na(maxArmLengthPalindrome), 0),
139 |     totalPalindromeLength = replace(totalPalindromeLength, 
140 |                                     is.na(totalPalindromeLength), 0))
141 | 
142 | ## Read information about primary/supplementary alignments
143 | dfprimsupp <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
144 |   readRDS(paste0(ds, "/output/", ds, 
145 |                  "_primary_supplementary_alignments_distances.rds")) %>% 
146 |     dplyr::mutate(sample = remap[sample]) %>% 
147 |     dplyr::mutate(dataset = remapds[ds])
148 | })) %>%
149 |   dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, condition),
150 |                    by = c("sample" = "sample_remap")) %>%
151 |   dplyr::filter(condition %in% conditions) %>%
152 |   dplyr::select(-condition) %>% 
153 |   dplyr::mutate(dist0 = "") %>%
154 |   dplyr::mutate(dist0 = replace(dist0, distn == 0, ", overlapping"),
155 |                 dist0 = replace(dist0, distn > 0, ", non-overlapping")) %>%
156 |   dplyr::mutate(strands_dist0 = paste0(strands, dist0))
157 | 
158 | df <- df %>%
159 |   dplyr::left_join(dfprimsupp, 
160 |                    by = c("dataset", "sample", "read")) %>%
161 |   dplyr::mutate(strands_dist0 = replace(strands_dist0, is.na(strands_dist0),
162 |                                         "no supplementary alignment")) %>%
163 |   dplyr::mutate(relMaxArmLengthPalindrome = maxArmLengthPalindrome/readLength)
164 | 
165 | catcols <- c("no supplementary alignment" = "lightyellow", 
166 |              "different chromosomes" = "#E8601C", 
167 |              "same chromosome, different strand, non-overlapping" = "#7BAFDE",
168 |              "same chromosome, different strand, overlapping" = "#90C987", 
169 |              "same chromosome, same strand, non-overlapping" = "#777777", 
170 |              "same chromosome, same strand, overlapping" = "#B17BA6")
171 | 
172 | df <- df %>%
173 |   dplyr::mutate(strands_dist0 = factor(strands_dist0, 
174 |                                        levels = names(catcols)))
175 | 
176 | png(gsub("\\.rds$", "_aligned_vs_relpalindromelength.png", outrds),
177 |     width = 10, height = 5, unit = "in", res = 400)
178 | ggplot(df, aes(x = aligned, y = relMaxArmLengthPalindrome)) + 
179 |   geom_violin() + geom_boxplot(width = 0.1) + 
180 |   theme_bw() + facet_wrap(~ dataset) + 
181 |   xlab("") + ylab("Longest palindrome arm length/read length") + 
182 |   coord_cartesian(ylim = c(0, max(df$relMaxArmLengthPalindrome) + 0.005)) + 
183 |   stat_summary(data = df %>%
184 |                  dplyr::group_by(dataset, aligned) %>%
185 |                  dplyr::summarize(relMaxArmLengthPalindrome = 
186 |                                     length(relMaxArmLengthPalindrome)), 
187 |                fun.data = function(x) {return(c(y = max(df$relMaxArmLengthPalindrome),
188 |                                                 label = x))}, 
189 |                geom = "text", alpha = 1, size = 2.5, vjust = -1)
190 | dev.off()
191 | 
192 | g1 <- ggplot(df, aes(x = aligned, y = maxArmLengthPalindrome)) + 
193 |   geom_violin() + 
194 |   theme_bw() + facet_wrap(~ dataset, nrow = 1) + 
195 |   xlab("") + ylab("Longest palindrome arm length") + 
196 |   coord_cartesian(ylim = c(0, max(df$maxArmLengthPalindrome) * 1.04)) + 
197 |   stat_summary(data = df %>%
198 |                  dplyr::group_by(dataset, aligned) %>%
199 |                  dplyr::summarize(maxArmLengthPalindrome = 
200 |                                     length(maxArmLengthPalindrome)), 
201 |                fun.data = function(x) {return(c(y = max(df$maxArmLengthPalindrome),
202 |                                                 label = x))}, 
203 |                geom = "text", alpha = 1, size = 2.5, vjust = -1)
204 | 
205 | df$hasSuppAlignment <- df$nbrSupplementaryAlignments > 0
206 | df0 <- df %>% dplyr::filter(aligned == "Aligned")
207 | g2 <- ggplot(df0, 
208 |              aes(x = hasSuppAlignment, y = maxArmLengthPalindrome)) + 
209 |   geom_violin() + 
210 |   theme_bw() + facet_wrap(~ dataset, nrow = 1) + 
211 |   xlab("Read has supplementary alignment") + 
212 |   ylab("Longest palindrome arm length") + 
213 |   coord_cartesian(ylim = c(0, max(df0$maxArmLengthPalindrome) * 1.04)) + 
214 |   stat_summary(data = df0 %>%
215 |                  dplyr::group_by(dataset, hasSuppAlignment) %>%
216 |                  dplyr::summarize(maxArmLengthPalindrome = 
217 |                                     length(maxArmLengthPalindrome)), 
218 |                fun.data = function(x) {return(c(y = max(df0$maxArmLengthPalindrome),
219 |                                                 label = x))}, 
220 |                geom = "text", alpha = 1, size = 2.5, vjust = -1)
221 | 
222 | df0 <- df %>% dplyr::filter(aligned == "Aligned") %>%
223 |   dplyr::filter(dataset %in% c("ONT-DCS108-HAP", "ONT-DCS108-NSK007"))
224 | g3 <- ggplot(df0,
225 |              aes(x = strands_dist0, y = maxArmLengthPalindrome)) + 
226 |     geom_violin(aes(fill = strands_dist0), alpha = 0.5) + 
227 |     theme_bw() + facet_wrap(~ dataset) + 
228 |     xlab("") + ylab("Longest palindrome arm length") + 
229 |     scale_fill_manual(values = catcols, name = "") + 
230 |     theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
231 |           legend.position = "none") + 
232 |     coord_cartesian(ylim = c(0, max(df0$maxArmLengthPalindrome) * 1.04)) + 
233 |     stat_summary(data = df0 %>%
234 |                    dplyr::group_by(dataset, strands_dist0) %>%
235 |                    dplyr::summarize(maxArmLengthPalindrome = 
236 |                                       length(maxArmLengthPalindrome)), 
237 |                  fun.data = function(x) {return(c(y = max(df0$maxArmLengthPalindrome),
238 |                                                   label = x))}, 
239 |                  geom = "text", alpha = 1, size = 2.5, vjust = -1)
240 | 
241 | g4 <- ggplot(df0,
242 |              aes(x = strands_dist0, y = relMaxArmLengthPalindrome)) + 
243 |     geom_violin(aes(fill = strands_dist0), alpha = 0.5) + 
244 |     theme_bw() + facet_wrap(~ dataset) + 
245 |     xlab("") + ylab("Longest palindrome arm length/read length") + 
246 |     scale_fill_manual(values = catcols, name = "") + 
247 |     theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
248 |           legend.position = "none") + 
249 |     coord_cartesian(ylim = c(0, max(df0$relMaxArmLengthPalindrome) * 1.04)) + 
250 |     stat_summary(data = df0 %>%
251 |                    dplyr::group_by(dataset, strands_dist0) %>%
252 |                    dplyr::summarize(relMaxArmLengthPalindrome = 
253 |                                       length(relMaxArmLengthPalindrome)), 
254 |                  fun.data = function(x) {return(c(y = max(df0$relMaxArmLengthPalindrome),
255 |                                                   label = x))}, 
256 |                  geom = "text", alpha = 1, size = 2.5, vjust = -1)
257 | 
258 | png(gsub("\\.rds$", "_summary.png", outrds),
259 |     width = 8, height = 13, unit = "in", res = 400)
260 | cowplot::plot_grid(
261 |   g1, 
262 |   g2,
263 |   cowplot::plot_grid(g3, g4, nrow = 1, labels = c("C", "D"),
264 |                      align = "h", axis = "bt"),
265 |   ncol = 1, labels = c("A", "B", ""), 
266 |   rel_heights = c(1, 1, 1.7)
267 | )
268 | dev.off()
269 | 
270 | png(gsub("\\.rds$", "_summary_sub.png", outrds),
271 |     width = 8, height = 6, unit = "in", res = 400)
272 | cowplot::plot_grid(
273 |   g1, 
274 |   g2,
275 |   ncol = 1, labels = c("A", "B"), 
276 |   rel_heights = c(1, 1)
277 | )
278 | dev.off()
279 | 
280 | png(gsub("\\.rds$", "_summary_sub2.png", outrds),
281 |     width = 8, height = 3, unit = "in", res = 400)
282 | print(g2)
283 | dev.off()
284 | 
285 | df$spacing <- df$totalPalindromeLength - 2*df$maxArmLengthPalindrome
286 | df0 <- df %>% dplyr::filter(aligned == "Aligned") %>%
287 |   dplyr::filter(dataset %in% c("ONT-DCS108-HAP", "ONT-DCS108-NSK007"))
288 | png(gsub("\\.rds$", "_stranddist_vs_spacing.png", outrds),
289 |     width = 4, height = 7, unit = "in", res = 400)
290 | ggplot(df0,
291 |        aes(x = strands_dist0, y = spacing)) + 
292 |   geom_violin(aes(fill = strands_dist0), alpha = 0.5) + 
293 |   theme_bw() + facet_wrap(~ dataset) + 
294 |   xlab("") + ylab("Distance between palindrome arms") + 
295 |   scale_fill_manual(values = catcols, name = "") + 
296 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
297 |         legend.position = "none") + 
298 |   coord_cartesian(ylim = c(0, max(df0$spacing) + 50)) + 
299 |   stat_summary(data = df0 %>%
300 |                  dplyr::group_by(dataset, strands_dist0) %>%
301 |                  dplyr::summarize(spacing = length(spacing)), 
302 |                fun.data = function(x) {return(c(y = max(df0$spacing),
303 |                                                 label = x))}, 
304 |                geom = "text", alpha = 1, size = 2.5, vjust = -1)
305 | dev.off()
306 | 
307 | saveRDS(df, file = outrds)
308 | date()
309 | sessionInfo()
310 | 
311 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_polya_tail_estimates.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(dplyr)
  8 |   library(ggplot2)
  9 |   library(tidyr)
 10 |   library(ggbeeswarm)
 11 | })
 12 | 
 13 | print(tx2gene)
 14 | print(outrds)
 15 | 
 16 | ## -------------------------------------------------------------------------- ##
 17 | ## Read tx2gene and extract some categories
 18 | tx2gene <- readRDS(tx2gene)
 19 | tx2gene <- tx2gene %>%
 20 |   dplyr::mutate(tx_biotype = replace(tx_biotype, tx_biotype == "protein_coding" & 
 21 |                                        chromosome == "MT", "Mt_protein_coding")) %>%
 22 |   dplyr::mutate(tx_biotype = replace(tx_biotype, grepl("^RPL[0-9]|^RPS[0-9]", 
 23 |                                                        symbol), "ribosomal_protein"))
 24 | 
 25 | ## -------------------------------------------------------------------------- ##
 26 | ## Read polyA tail length estimates
 27 | tailfindr <- read.delim("RNA001/tailfindr/wt_1_RNA001_tailfindr.csv",
 28 |                         header = TRUE, as.is = TRUE, sep = ",")
 29 | nanopolish <- read.delim(paste0("RNA001/nanopolish/wt_1_RNA001/", 
 30 |                                 "pipeline-polya-ng/tails/filtered_tails.tsv"), 
 31 |                          header = TRUE, as.is = TRUE)
 32 | 
 33 | table(nanopolish$polya_length < 1000)
 34 | 
 35 | ## -------------------------------------------------------------------------- ##
 36 | ## Add transcript annotation to nanopolish table
 37 | nanopolish <- nanopolish %>%
 38 |   dplyr::left_join(tx2gene, by = c("contig" = "tx")) %>%
 39 |   dplyr::filter(polya_length < 1000)
 40 | 
 41 | ## -------------------------------------------------------------------------- ##
 42 | ## Merge nanopolish and tailfindr estimates
 43 | tails <- dplyr::inner_join(
 44 |   tailfindr %>% dplyr::select(read_id, tail_length) %>%
 45 |     dplyr::rename(readname = read_id, tailfindr = tail_length),
 46 |   nanopolish %>% dplyr::select(readname, polya_length) %>%
 47 |     dplyr::rename(Nanopolish = polya_length),
 48 |   by = "readname"
 49 | ) %>%
 50 |   dplyr::filter(!is.na(tailfindr) & !is.na(Nanopolish))
 51 | 
 52 | ## -------------------------------------------------------------------------- ##
 53 | ## Summary plot
 54 | png(gsub("\\.rds$", "_summary.png", outrds),
 55 |     width = 7, height = 9, unit = "in", res = 400)
 56 | cowplot::plot_grid(
 57 |   cowplot::plot_grid(
 58 |     ggplot(tails %>% 
 59 |              tidyr::gather(key = "method", value = "polya_length", -readname), 
 60 |            aes(x = method, y = polya_length)) + 
 61 |       geom_boxplot() + theme_bw() + 
 62 |       xlab("") + ylab("polyA tail length estimate"),
 63 |     ggplot(tails, aes(x = Nanopolish, y = tailfindr)) + 
 64 |       geom_abline(slope = 1, intercept = 0) + 
 65 |       geom_point(alpha = 0.25) + theme_bw() + 
 66 |       xlab("Nanopolish polyA tail length estimate") + 
 67 |       ylab("tailfindr polyA tail length estimate"),
 68 |     rel_widths = c(0.5, 1), nrow = 1, labels = c("A", "B"),
 69 |     align = "h", axis = "bt"),
 70 |   ggplot(nanopolish %>% dplyr::group_by(tx_biotype) %>%
 71 |            dplyr::mutate(nreads = length(readname)) %>%
 72 |            dplyr::filter(nreads >= 10), 
 73 |          aes(x = tx_biotype, y = polya_length)) + 
 74 |     geom_boxplot() + theme_bw() + 
 75 |     theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
 76 |     xlab("Transcript biotype") + 
 77 |     ylab("Nanopolish polyA tail length estimate"),
 78 |   rel_heights = c(1, 1.5), ncol = 1, labels = c("", "C")
 79 | )
 80 | dev.off()
 81 | 
 82 | ## -------------------------------------------------------------------------- ##
 83 | ## Calculate median polyA tail length estimate for reads assigned to each
 84 | ## transcript. Plot these values for genes with many annotated transcripts. 
 85 | np <- nanopolish %>% dplyr::group_by(contig, symbol) %>%
 86 |   dplyr::summarize(median_polya_length = median(polya_length)) %>%
 87 |   dplyr::ungroup() %>%
 88 |   dplyr::group_by(symbol) %>%
 89 |   dplyr::mutate(ntx = length(contig)) %>%
 90 |   dplyr::filter(ntx > 1)
 91 | 
 92 | png(gsub("\\.rds$", "_polya_length_divergence_within_gene.png", outrds),
 93 |     height = 4, width = 7, unit = "in", res = 400)
 94 | ggplot(np %>% dplyr::filter(ntx >= 7), 
 95 |        aes(x = symbol, y = median_polya_length, color = symbol)) + 
 96 |   geom_beeswarm(size = 2.5, alpha = 0.7) + theme_bw() + 
 97 |   theme(legend.position = "none",
 98 |         axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
 99 |   ylab("Median polyA tail length estimate\nfor reads assigned to transcript") + 
100 |   xlab("Gene")
101 | dev.off()
102 | ## -------------------------------------------------------------------------- ##
103 | 
104 | saveRDS(NULL, file = outrds)
105 | date()
106 | sessionInfo()
107 | 
108 | 
109 | 
110 | 
111 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_primary_supplementary.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(dplyr)
  8 |   library(ggplot2)
  9 |   library(stringr)
 10 |   library(tximport)
 11 | })
 12 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 13 | 
 14 | datasets <- strsplit(datasets, ",")[[1]]
 15 | names(datasets) <- datasets
 16 | conditions <- strsplit(conditions, ",")[[1]]
 17 | 
 18 | print(datasets)
 19 | print(conditions)
 20 | print(outrds)
 21 | 
 22 | ## Read data
 23 | dfprimsupp <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
 24 |   readRDS(paste0(ds, "/output/", ds, 
 25 |                  "_primary_supplementary_alignments_distances.rds")) %>% 
 26 |     dplyr::mutate(sample = remap[sample]) %>% 
 27 |     dplyr::mutate(dataset = remapds[ds])
 28 | })) %>%
 29 |   dplyr::left_join(sample_annotation %>% dplyr::select(sample_remap, condition),
 30 |                    by = c("sample" = "sample_remap")) %>%
 31 |   dplyr::filter(condition %in% conditions) %>%
 32 |   dplyr::select(-condition) %>%
 33 |   dplyr::mutate(dataset = factor(dataset, levels = ds_order[ds_order %in% dataset]))
 34 | 
 35 | g0 <- ggplot(dfprimsupp %>% 
 36 |                dplyr::mutate(sample = removeDatasetFromSample(sample, dataset)) %>%
 37 |                dplyr::mutate(dist0 = "") %>%
 38 |                dplyr::mutate(dist0 = replace(dist0, distn == 0, ", overlapping"),
 39 |                              dist0 = replace(dist0, distn > 0, ", non-overlapping")) %>% 
 40 |                dplyr::group_by(sample, dataset, strands, dist0) %>% dplyr::tally() %>%
 41 |                dplyr::mutate(strands_dist0 = paste0(strands, dist0)),
 42 |              aes(x = sample, y = n, fill = strands_dist0)) + 
 43 |   facet_grid(~ dataset, scales = "free_x", space = "free_x") + 
 44 |   theme_bw() + 
 45 |   theme(legend.position = "bottom",
 46 |         strip.text = element_text(size = 8)) + 
 47 |   xlab("") + 
 48 |   guides(fill = guide_legend(nrow = 2, byrow = TRUE)) + 
 49 |   scale_fill_manual(values = c(
 50 |     "different chromosomes" = "#E8601C",
 51 |     "same chromosome, different strand, non-overlapping" = "#7BAFDE", 
 52 |     "same chromosome, different strand, overlapping" = "#90C987", 
 53 |     "same chromosome, same strand, non-overlapping" = "#777777", 
 54 |     "same chromosome, same strand, overlapping" = "#B17BA6"
 55 |   ), name = "")
 56 | 
 57 | ## By dataset
 58 | g0ds <- ggplot(dfprimsupp %>% 
 59 |                dplyr::mutate(dist0 = "") %>%
 60 |                dplyr::mutate(dist0 = replace(dist0, distn == 0, ", overlapping"),
 61 |                              dist0 = replace(dist0, distn > 0, ", non-overlapping")) %>% 
 62 |                dplyr::group_by(dataset, strands, dist0) %>% dplyr::tally() %>%
 63 |                dplyr::mutate(strands_dist0 = paste0(strands, dist0)),
 64 |              aes(x = dataset, y = n, fill = strands_dist0)) + 
 65 |   theme_bw() + 
 66 |   theme(legend.position = "bottom",
 67 |         strip.text = element_text(size = 8),
 68 |         axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
 69 |   xlab("") + 
 70 |   guides(fill = guide_legend(nrow = 5, byrow = TRUE)) + 
 71 |   scale_fill_manual(values = c(
 72 |     "different chromosomes" = "#E8601C",
 73 |     "same chromosome, different strand, non-overlapping" = "#7BAFDE", 
 74 |     "same chromosome, different strand, overlapping" = "#90C987", 
 75 |     "same chromosome, same strand, non-overlapping" = "#777777", 
 76 |     "same chromosome, same strand, overlapping" = "#B17BA6"
 77 |   ), name = "")
 78 | 
 79 | 
 80 | png(gsub("\\.rds$", "_nbr_primary_supplementary_pairs.png", outrds),
 81 |     width = 10, height = 6, unit = "in", res = 400)
 82 | print(g0 + geom_bar(stat = "identity") + 
 83 |         scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
 84 |         ylab("Number of primary/supplementary\nalignment pairs"))
 85 | dev.off()
 86 | 
 87 | png(gsub("\\.rds$", "_frac_primary_supplementary_pairs.png", outrds),
 88 |     width = 10, height = 6, unit = "in", res = 400)
 89 | print(g0 + geom_bar(stat = "identity", position = "fill") + 
 90 |         scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
 91 |         ylab("Fraction of primary/supplementary\nalignment pairs"))
 92 | dev.off()
 93 | 
 94 | plots <- list(
 95 |   primary_supplementary_fraction = 
 96 |     g0 + geom_bar(stat = "identity", position = "fill") + 
 97 |     scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
 98 |     ylab("Fraction of primary/supplementary\nalignment pairs"),
 99 |   primary_supplementary_fraction_byds = 
100 |     g0ds + geom_bar(stat = "identity", position = "fill") + 
101 |     scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
102 |     ylab("Fraction of primary/supplementary\nalignment pairs")
103 | )
104 | 
105 | saveRDS(plots, file = outrds)
106 | date()
107 | sessionInfo()
108 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_rseqc.R:
--------------------------------------------------------------------------------
 1 | args <- (commandArgs(trailingOnly = TRUE))
 2 | for (i in 1:length(args)) {
 3 |   eval(parse(text = args[[i]]))
 4 | }
 5 | 
 6 | suppressPackageStartupMessages({
 7 |   library(dplyr)
 8 |   library(ggplot2)
 9 |   library(cowplot)
10 |   library(tidyr)
11 | })
12 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
13 | source("RSeQC/RSeQC_geneBody_coverage.geneBodyCoverage.r")
14 | 
15 | datasets <- strsplit(datasets, ",")[[1]]
16 | datasets <- c(datasets, "Illumina")
17 | conditions <- strsplit(conditions, ",")[[1]]
18 | 
19 | print(datasets)
20 | print(conditions)
21 | print(outrds)
22 | 
23 | df <- data.frame(sample = gsub("0918\\.A_", "0918\\.A-", gsub("^V", "", rowLabel)), 
24 |                  data.frame(data_matrix) %>% setNames(paste0("x", seq_len(100))),
25 |                  stringsAsFactors = FALSE) %>%
26 |   tidyr::gather(key = "x", value = "y", -sample) %>%
27 |   dplyr::mutate(x = as.numeric(gsub("x", "", x)),
28 |                 sample = remap[gsub("_minimap_genome_s|_Aligned.sortedByCoord.out",
29 |                                     "", sample)]) %>%
30 |   dplyr::left_join(sample_annotation %>% 
31 |                      dplyr::select(sample_remap, condition, dataset), 
32 |                    by = c("sample" = "sample_remap")) %>%
33 |   dplyr::filter(dataset %in% remapds[datasets] & condition %in% conditions) %>% 
34 |   dplyr::mutate(
35 |     dataset = factor(dataset, levels = ds_order[ds_order %in% dataset])
36 |   )
37 | 
38 | png(gsub("\\.rds$", "_gene_body_coverage.png", outrds), width = 6, height = 3.5,
39 |     unit = "in", res = 400)
40 | ggplot(df, aes(x = x, y = y, group = sample, color = dataset)) + 
41 |   geom_line() + theme_bw() + 
42 |   scale_color_manual(values = ds_colors, name = "") + 
43 |   guides(color = guide_legend(override.aes = list(size = 2))) + 
44 |   xlab("Gene body percentile (5' -> 3')") + ylab("Coverage")
45 | dev.off()
46 | 
47 | saveRDS(NULL, file = outrds)
48 | date()
49 | sessionInfo()
50 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_salmon_variants_all_datasets.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(dplyr)
  8 |   library(ggplot2)
  9 |   library(stringr)
 10 |   library(tximport)
 11 |   library(ggbeeswarm)
 12 |   library(cowplot)
 13 | })
 14 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 15 | 
 16 | datasets <- strsplit(datasets, ",")[[1]]
 17 | datasets <- setdiff(datasets, "HEK293RNA")
 18 | conditions <- strsplit(conditions, ",")[[1]]
 19 | 
 20 | print(datasets)
 21 | print(conditions)
 22 | print(tx2gene)
 23 | print(outrds)
 24 | 
 25 | tx2gene <- readRDS(tx2gene)
 26 | 
 27 | ## Read quantifications
 28 | salmonfiles_illumina <- list.files("Illumina/salmon31", pattern = "quant.sf", 
 29 |                                    recursive = TRUE, full.names = TRUE)
 30 | names(salmonfiles_illumina) <- paste0("Illumina__", 
 31 |                                       basename(gsub("/quant.sf", "", 
 32 |                                                     salmonfiles_illumina)))
 33 | salmonfiles_illumina <- 
 34 |   salmonfiles_illumina[names(salmonfiles_illumina) %in%
 35 |                          paste0("Illumina__",
 36 |                                 sample_annotation$sample_orig[sample_annotation$condition %in% 
 37 |                                                                 conditions])]
 38 | salmonfiles_illumina
 39 | 
 40 | txi_illumina <- tximport(salmonfiles_illumina, type = "salmon", txOut = TRUE)
 41 | txig_illumina <- summarizeToGene(txi_illumina, tx2gene = tx2gene[, c("tx", "gene")])
 42 | 
 43 | ## ========================================================================== ##
 44 | ## Salmon
 45 | ## ========================================================================== ##
 46 | salmonfiles_nanopore <- unlist(lapply(datasets, function(ds) {
 47 |   salmon31 <- list.files(file.path(ds, "salmon31"), pattern = "quant.sf",
 48 |                          recursive = TRUE, full.names = TRUE)
 49 |   names(salmon31) <- paste0("salmon31__", basename(gsub("/quant.sf", "", 
 50 |                                                         salmon31)))
 51 |   
 52 |   salmon31noclip <- list.files(file.path(ds, "salmon31_aligned_noclippedbases"),
 53 |                                pattern = "quant.sf", recursive = TRUE, 
 54 |                                full.names = TRUE)
 55 |   names(salmon31noclip) <- paste0("salmon31noclip__", basename(gsub("/quant.sf", "",
 56 |                                                                     salmon31noclip)))
 57 |   
 58 |   salmonminimap2_p0.8 <- list.files(file.path(ds, "salmonminimap2"), 
 59 |                                               pattern = "quant.sf", recursive = TRUE,
 60 |                                               full.names = TRUE)
 61 |   names(salmonminimap2_p0.8) <- paste0("salmonminimap2_p0.8__",
 62 |                                        basename(gsub("/quant.sf", "", 
 63 |                                                      salmonminimap2_p0.8)))
 64 |   
 65 |   salmonminimap2_p0.99 <- list.files(file.path(ds, "salmonminimap2_p0.99"), 
 66 |                                                pattern = "quant.sf", recursive = TRUE,
 67 |                                                full.names = TRUE)
 68 |   names(salmonminimap2_p0.99) <- paste0("salmonminimap2_p0.99__",
 69 |                                         basename(gsub("/quant.sf", "", 
 70 |                                                       salmonminimap2_p0.99)))
 71 |   
 72 |   c(salmon31, salmon31noclip, salmonminimap2_p0.8, salmonminimap2_p0.99)
 73 | }))
 74 | 
 75 | salmonfiles_nanopore <- 
 76 |   salmonfiles_nanopore[gsub("salmon31__|salmon31noclip__|salmonminimap2_p0.8__|salmonminimap2_p0.99__", "", names(salmonfiles_nanopore)) %in%
 77 |                          sample_annotation$sample_orig[sample_annotation$condition %in% 
 78 |                                                          conditions]]
 79 | salmonfiles_nanopore
 80 | 
 81 | txi_nanopore <- tximport(salmonfiles_nanopore, type = "salmon", txOut = TRUE)
 82 | txig_nanopore <- summarizeToGene(txi_nanopore, tx2gene = tx2gene[, c("tx", "gene")])
 83 | 
 84 | txi <- as.data.frame(txi_nanopore$counts) %>%
 85 |   tibble::rownames_to_column("tx") %>%
 86 |   dplyr::full_join(as.data.frame(txi_illumina$abundance) %>% 
 87 |                      tibble::rownames_to_column("tx"), by = "tx") %>%
 88 |   as.data.frame() %>%
 89 |   tibble::column_to_rownames("tx")
 90 | dim(txi)
 91 | txig <- as.data.frame(txig_nanopore$counts) %>%
 92 |   tibble::rownames_to_column("gene") %>%
 93 |   dplyr::full_join(as.data.frame(txig_illumina$abundance) %>% 
 94 |                      tibble::rownames_to_column("gene"), by = "gene") %>%
 95 |   as.data.frame() %>%
 96 |   tibble::column_to_rownames("gene")
 97 | dim(txig)
 98 | 
 99 | fixCorDf <- function(cordf) {
100 |   cordf %>%
101 |     tibble::rownames_to_column("sample1") %>%
102 |     tidyr::gather(key = "sample2", value = "correlation", -sample1) %>%
103 |     tidyr::separate(sample1, into = c("method1", "sample1"), sep = "__") %>%
104 |     tidyr::separate(sample2, into = c("method2", "sample2"), sep = "__") %>%
105 |     dplyr::mutate(sample1 = remap[sample1],
106 |                   sample2 = remap[sample2]) %>%
107 |     dplyr::mutate(condition1 = stringr::str_extract(sample1, "wt|srpk"),
108 |                   condition2 = stringr::str_extract(sample2, "wt|srpk")) %>%
109 |     dplyr::filter(method1 == "Illumina" & method2 != "Illumina" & 
110 |                     sample1 != sample2 & condition1 == condition2) %>%
111 |     dplyr::mutate(dataset2 = sapply(strsplit(sample2, "_"), 
112 |                                     .subset, 1)) %>%
113 |     dplyr::mutate(dataset2 = factor(dataset2, levels = ds_order[ds_order %in% dataset2]))
114 | }
115 | 
116 | spearman_tx <- fixCorDf(as.data.frame(cor(sqrt(txi), method = "spearman")))
117 | 
118 | spearman_gene <- fixCorDf(as.data.frame(cor(sqrt(txig), method = "spearman")))
119 | 
120 | pearson_tx <- fixCorDf(as.data.frame(cor(sqrt(txi), method = "pearson")))
121 | 
122 | pearson_gene <- fixCorDf(as.data.frame(cor(sqrt(txig), method = "pearson")))
123 | 
124 | png(gsub("\\.rds$", "_salmon_spearman.png", outrds), width = 9, height = 5, 
125 |     unit = "in", res = 400)
126 | p1 <- ggplot(spearman_tx, aes(x = method2, y = correlation, color = dataset2)) + 
127 |   geom_quasirandom(size = 3, alpha = 0.6) + 
128 |   theme_bw() + ggtitle("Transcript") + 
129 |   ylab("Spearman correlation with\nIllumina (sqrt abundance)") + xlab("") + 
130 |   coord_cartesian(ylim = c(0, 1)) + 
131 |   scale_color_manual(values = ds_colors, name = "") + 
132 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
133 | p2 <- ggplot(spearman_gene, aes(x = method2, y = correlation, color = dataset2)) + 
134 |   geom_quasirandom(size = 3, alpha = 0.6) + 
135 |   theme_bw() + ggtitle("Gene") + 
136 |   ylab("Spearman correlation with\nIllumina (sqrt abundance)") + xlab("") + 
137 |   coord_cartesian(ylim = c(0, 1)) + 
138 |   scale_color_manual(values = ds_colors, name = "") + 
139 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
140 |         legend.position = "bottom")
141 | print(cowplot::plot_grid(
142 |   cowplot::plot_grid(
143 |     p1 + theme(legend.position = "none"), 
144 |     p2 + theme(legend.position = "none"),
145 |     labels = c("A", "B"), rel_widths = c(1, 1), nrow = 1
146 |   ),
147 |   cowplot::get_legend(p2),
148 |   rel_heights = c(1, 0.2), ncol = 1)
149 | )
150 | dev.off()
151 | 
152 | png(gsub("\\.rds$", "_salmon_pearson.png", outrds), width = 9, height = 5, 
153 |     unit = "in", res = 400)
154 | p1 <- ggplot(pearson_tx, aes(x = method2, y = correlation, color = dataset2)) + 
155 |   geom_quasirandom(size = 3, alpha = 0.6) + 
156 |   theme_bw() + ggtitle("Transcript") + 
157 |   ylab("Pearson correlation with\nIllumina (sqrt abundance)") + xlab("") + 
158 |   coord_cartesian(ylim = c(0, 1)) + 
159 |   scale_color_manual(values = ds_colors, name = "") + 
160 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
161 | p2 <- ggplot(pearson_gene, aes(x = method2, y = correlation, color = dataset2)) + 
162 |   geom_quasirandom(size = 3, alpha = 0.6) + 
163 |   theme_bw() + ggtitle("Gene") + 
164 |   ylab("Pearson correlation with\nIllumina (sqrt abundance)") + xlab("") + 
165 |   coord_cartesian(ylim = c(0, 1)) + 
166 |   scale_color_manual(values = ds_colors, name = "") + 
167 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
168 |         legend.position = "bottom")
169 | print(cowplot::plot_grid(
170 |   cowplot::plot_grid(
171 |     p1 + theme(legend.position = "none"), 
172 |     p2 + theme(legend.position = "none"),
173 |     labels = c("A", "B"), rel_widths = c(1, 1), nrow = 1
174 |   ),
175 |   cowplot::get_legend(p2),
176 |   rel_heights = c(1, 0.2), ncol = 1)
177 | )
178 | dev.off()
179 | 
180 | saveRDS(NULL, file = outrds)
181 | date()
182 | sessionInfo()
183 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_sqanti_round2_summary.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(rtracklayer)
  8 |   library(dplyr)
  9 |   library(GenomicFeatures)
 10 |   library(ggplot2)
 11 |   library(grDevices)
 12 |   library(cowplot)
 13 |   library(grid)
 14 | })
 15 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 16 | 
 17 | datasets <- strsplit(datasets, ",")[[1]]
 18 | datasets <- c(datasets, "Illumina")
 19 | names(datasets) <- datasets
 20 | conditions <- strsplit(conditions, ",")[[1]]
 21 | 
 22 | if (all(c("wt", "srpk") %in% conditions)) {
 23 |   sqanticond <- "_all"
 24 | } else if ("wt" %in% conditions) {
 25 |   sqanticond <- "_WT"
 26 | } else {
 27 |   stop("Unknown conditions")
 28 | }
 29 | 
 30 | print(datasets)
 31 | print(conditions)
 32 | print(sqanticond)
 33 | print(outrds)
 34 | 
 35 | muted <- c("#DC050C", "#E8601C", "#7BAFDE", "#1965B0", "#B17BA6",
 36 |            "#882E72", "#F1932D", "#F6C141", "#F7EE55", "#4EB265",
 37 |            "#CAEDAB", "#777777")
 38 | colfun <- grDevices::colorRampPalette(muted)
 39 | 
 40 | ## Read Illumina abundance estimates
 41 | ilmn_tx_abundances <- 
 42 |   readRDS("Illumina/output/Illumina_all_abundances.rds")$tx_abundances
 43 | ilmn_samples <- sample_annotation %>% 
 44 |   dplyr::filter(dataset == "Illumina" & condition %in% conditions) %>% 
 45 |   dplyr::pull("sample_orig")
 46 | print(ilmn_samples)
 47 | ave_ilmn_tx_tpm <- 
 48 |   data.frame(transcript_id = rownames(ilmn_tx_abundances), 
 49 |              ave_tx_tpm = rowMeans(
 50 |                ilmn_tx_abundances[, paste0(ilmn_samples, "__tpm__salmon")]
 51 |              ),
 52 |              stringsAsFactors = FALSE)
 53 | 
 54 | ## ========================================================================== ##
 55 | ## Plot overlap of detected transcripts
 56 | df0 <- do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) {
 57 |   dplyr::bind_rows(
 58 |     read.delim(paste0(
 59 |       ds, "/sqanti_round2_ilmnjunc/", ds, sqanticond, "/", ds, sqanticond, 
 60 |       "_ilmnjunc_classification.txt"), 
 61 |       header = TRUE, as.is = TRUE) %>%
 62 |       dplyr::filter(structural_category %in% c("full-splice_match", 
 63 |                                                "incomplete-splice_match")) %>%
 64 |       dplyr::select(associated_transcript) %>%
 65 |       dplyr::rename(transcript_id = associated_transcript) %>%
 66 |       dplyr::distinct() %>%
 67 |       dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc"),
 68 |                     identified = TRUE),
 69 |     read.delim(paste0(
 70 |       ds, "/sqanti_round2/", ds, sqanticond, "/", ds, sqanticond, 
 71 |       "_classification.txt"), 
 72 |       header = TRUE, as.is = TRUE) %>%
 73 |       dplyr::filter(structural_category %in% c("full-splice_match", 
 74 |                                                "incomplete-splice_match")) %>%
 75 |       dplyr::select(associated_transcript) %>%
 76 |       dplyr::rename(transcript_id = associated_transcript) %>%
 77 |       dplyr::distinct() %>%
 78 |       dplyr::mutate(dataset = remapds[ds],
 79 |                     identified = TRUE)
 80 |   )
 81 | })) %>%
 82 |   dplyr::filter(dataset != "ONT-RNA001-HEK_ILMNjunc")  ## don't use HAP junctions for HEK
 83 | dettx <- unique(df0$transcript_id)
 84 | detmat <- matrix(0, nrow = length(dettx), ncol = length(unique(df0$dataset)))
 85 | rownames(detmat) <- dettx
 86 | colnames(detmat) <- unique(df0$dataset)
 87 | detmat[as.matrix(df0 %>% dplyr::select(transcript_id, dataset) %>%
 88 |                    dplyr::mutate(transcript_id = as.character(transcript_id),
 89 |                                  dataset = as.character(dataset)))] <- 1
 90 | ordr <- ds_order[ds_order %in% colnames(detmat)]
 91 | ordr <- rep(ordr, each = 2)
 92 | suffx <- rep(c("", "_ILMNjunc"), length(ordr)/2)
 93 | ordr <- paste0(ordr, suffx)
 94 | ordr <- setdiff(ordr, "ONT-RNA001-HEK_ILMNjunc")
 95 | stopifnot(ordr %in% colnames(detmat),
 96 |           colnames(detmat) %in% ordr)
 97 | detmat <- detmat[, ordr]
 98 | png(gsub("\\.rds$", "_shared_transcripts.png", outrds), 
 99 |     width = 10, height = 6, unit = "in", res = 400)
100 | UpSetR::upset(data.frame(detmat, check.names = FALSE), order.by = "freq",
101 |               decreasing = TRUE, keep.order = TRUE, nsets = length(ordr), 
102 |               mainbar.y.label = "Number of shared annotated transcripts",
103 |               sets.x.label = "Number of identified\nannotated transcripts",
104 |               sets.bar.color = ds_colors[gsub("_ILMNjunc", "", colnames(detmat))],
105 |               sets = colnames(detmat), mb.ratio = c(0.55, 0.45))
106 | grid::grid.edit("arrange", name = "arrange2")
107 | upsetplot <- grid::grid.grab()
108 | dev.off()
109 | 
110 | ## ========================================================================== ##
111 | ## Plot abundance of identified/unidentified transcripts 
112 | df1 <- df0 %>% 
113 |   dplyr::full_join(
114 |     do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) {
115 |       dplyr::bind_rows(ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = remapds[ds]),
116 |                        ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = paste0(remapds[ds],
117 |                                                                           "_ILMNjunc"))) %>%
118 |         dplyr::filter(dataset != "ONT-RNA001-HEK_ILMNjunc")
119 |     })), 
120 |     by = c("transcript_id", "dataset")
121 |   ) %>%
122 |   dplyr::mutate(identified = replace(identified, is.na(identified), FALSE))
123 | 
124 | png(gsub("\\.rds$", "_illumina_abundance_by_identification.png", outrds),
125 |     width = 8, height = 4, unit = "in", res = 400)
126 | ggplot(df1, aes(x = identified, y = ave_tx_tpm + 1)) + 
127 |   geom_boxplot() + facet_wrap(~ dataset, nrow = 1) + theme_bw() + 
128 |   scale_y_log10() + xlab("Transcript identified by FLAIR") + 
129 |   ylab("Average abundance across Illumina samples (TPM + 1)")
130 | dev.off()
131 | 
132 | ## Merge with upset plot
133 | df2 <- df1 %>% dplyr::filter(grepl("_ILMNjunc$|RNA001-HEK", dataset)) %>%
134 |   dplyr::select(-dataset) %>%
135 |   dplyr::group_by(transcript_id, ave_tx_tpm) %>%
136 |   dplyr::summarize(identified = any(identified))
137 | p1 <- ggplot(df2, aes(x = identified, y = ave_tx_tpm + 1)) + 
138 |   geom_boxplot() + theme_bw() + 
139 |   scale_y_log10() + xlab("Transcript identified by FLAIR") + 
140 |   ylab("Average abundance across Illumina samples (TPM + 1)")
141 | png(gsub("\\.rds$", "_illumina_abundance_plus_shared_tx.png", outrds),
142 |     width = 14, height = 6, unit = "in", res = 400)
143 | print(cowplot::plot_grid(
144 |   upsetplot,
145 |   p1, 
146 |   rel_widths = c(5, 1.5), nrow = 1, labels = c("A", "B")
147 | ))
148 | dev.off()
149 | 
150 | ## ========================================================================== ##
151 | ## Get number of exons and read length per flair transcript and plot for
152 | ## different structural categories
153 | ds_order_both <- rep(ds_order, each = 2)
154 | suffx <- rep(c("", "_ILMNjunc"), length(ds_order_both)/2)
155 | ds_order_both <- paste0(ds_order_both, suffx)
156 | nbrexons <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
157 |   tmap <- read.delim(paste0(
158 |     ds, "/sqanti", ifelse(ds == "Illumina", "/", "_round2/"),
159 |     ds, sqanticond, "/", ds, sqanticond, 
160 |     "_classification.txt"), 
161 |     header = TRUE, as.is = TRUE) %>%
162 |     dplyr::mutate(dataset = remapds[ds])
163 |   if (ds %in% c("RNA001", "DCS108", "pilot")) {
164 |     tmap2 <- read.delim(paste0(
165 |       ds, "/sqanti_round2_ilmnjunc/", ds, sqanticond, "/", ds, sqanticond, 
166 |       "_ilmnjunc_classification.txt"), 
167 |       header = TRUE, as.is = TRUE) %>%
168 |       dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc"))
169 |     tmap <- dplyr::bind_rows(tmap, tmap2)
170 |   }
171 |   tmap
172 | })) %>%
173 |   dplyr::mutate(exons = replace(exons, exons > 2, ">2")) %>%
174 |   dplyr::mutate(dataset = factor(dataset, levels = ds_order_both[ds_order_both %in% dataset]))
175 | 
176 | ## ========================================================================== ##
177 | ## Plot distribution of structural categories
178 | dfcc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
179 |   tmap <- read.delim(paste0(
180 |     ds, "/sqanti", ifelse(ds == "Illumina", "/", "_round2/"),
181 |     ds, sqanticond, "/", ds, sqanticond, 
182 |     "_classification.txt"), 
183 |     header = TRUE, as.is = TRUE)
184 |   t1 <- as.data.frame(table(tmap$structural_category)) %>% 
185 |     dplyr::mutate(dataset = remapds[ds]) %>%
186 |     dplyr::rename(structural_category = Var1) %>%
187 |     dplyr::mutate(structural_category = as.character(structural_category))
188 |   if (ds %in% c("RNA001", "DCS108", "pilot")) {
189 |     tmap2 <- read.delim(paste0(
190 |       ds, "/sqanti_round2_ilmnjunc/", ds, sqanticond, "/", ds, sqanticond, 
191 |       "_ilmnjunc_classification.txt"), 
192 |       header = TRUE, as.is = TRUE)
193 |     t1 <- dplyr::bind_rows(
194 |       t1, as.data.frame(table(tmap2$structural_category)) %>% 
195 |       dplyr::mutate(dataset = paste0(remapds[ds], "_ILMNjunc")) %>%
196 |       dplyr::rename(structural_category = Var1) %>%
197 |       dplyr::mutate(structural_category = as.character(structural_category)))
198 |   }
199 |   t1
200 | })) %>%
201 |   dplyr::mutate(structural_category = factor(structural_category, levels = c(
202 |     "novel_not_in_catalog",
203 |     "novel_in_catalog",
204 |     "antisense",
205 |     "intergenic",
206 |     "genic",
207 |     "genic_intron",
208 |     "fusion",
209 |     "incomplete-splice_match",
210 |     "full-splice_match"
211 |   ))) %>%
212 |   dplyr::mutate(dataset = factor(dataset, levels = ds_order_both[ds_order_both %in% dataset]))
213 | 
214 | p0 <- ggplot(dfcc, aes(x = dataset, y = Freq)) + 
215 |   geom_bar(stat = "identity", position = "fill", 
216 |            aes(fill = structural_category)) + theme_bw() + 
217 |   scale_fill_manual(values = structure(colfun(15), 
218 |                                        names = levels(dfcc$structural_category)),
219 |                     name = "Overlap type (structural category)") + 
220 |   ylab("Fraction of transcripts") + xlab("") + 
221 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
222 |   scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
223 |   geom_text(data = dfcc %>% dplyr::group_by(dataset) %>%
224 |               dplyr::summarize(nbrTx = sum(Freq)),
225 |             aes(x = dataset, y = 1, label = nbrTx), 
226 |             vjust = 0, nudge_y = 0.02, size = 3)
227 | 
228 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_structuralcategory_full.png", outrds), 
229 |     width = 11, height = 16, unit = "in", res = 400)
230 | p1 <- ggplot(nbrexons %>%
231 |                dplyr::group_by(dataset, structural_category, exons) %>%
232 |                dplyr::tally() %>%
233 |                dplyr::ungroup() %>%
234 |                dplyr::mutate(exons = factor(exons, 
235 |                                             levels = c(">2", "2", "1"))), 
236 |              aes(x = structural_category, y = n, fill = exons)) + 
237 |   geom_bar(stat = "identity", position = "fill") + theme_bw() + 
238 |   facet_wrap(~ dataset, nrow = 2) +
239 |   xlab("Structural category") + ylab("Fraction of transcripts") + 
240 |   scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"),
241 |                     name = "Number of\nexons") + 
242 |   theme(legend.position = "bottom",
243 |         axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
244 |   scale_y_continuous(expand = c(0, 0, 0.05, 0))
245 | p2 <- ggplot(nbrexons, 
246 |              aes(x = structural_category, y = length)) + 
247 |   geom_boxplot() + theme_bw() + 
248 |   facet_wrap(~ dataset, nrow = 2) +
249 |   scale_y_log10() + 
250 |   theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
251 |   xlab("Structural category") + ylab("Inferred transcript length")
252 | cowplot::plot_grid(p0, p1, p2, ncol = 1, labels = c("A", "B", "C"), 
253 |                    rel_heights = c(1, 1.1, 1), align = "v", axis = "l")
254 | dev.off()
255 | 
256 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_structuralcategory.png", outrds), 
257 |     width = 7, height = 12, unit = "in", res = 400)
258 | p1 <- ggplot(nbrexons %>%
259 |                dplyr::filter(dataset %in% c("ONT-RNA001-HAP_ILMNjunc", "Illumina")) %>%
260 |                dplyr::group_by(dataset, structural_category, exons) %>%
261 |                dplyr::tally() %>%
262 |                dplyr::ungroup() %>%
263 |                dplyr::mutate(exons = factor(exons, 
264 |                                             levels = c(">2", "2", "1"))), 
265 |              aes(x = structural_category, y = n, fill = exons)) + 
266 |   geom_bar(stat = "identity", position = "fill") + theme_bw() + 
267 |   facet_wrap(~ dataset, nrow = 1) +
268 |   xlab("Structural category") + ylab("Fraction of transcripts") + 
269 |   scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"),
270 |                     name = "Number of\nexons") + 
271 |   theme(legend.position = "bottom",
272 |         axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
273 |   scale_y_continuous(expand = c(0, 0, 0.05, 0))
274 | p2 <- ggplot(nbrexons %>%
275 |                dplyr::filter(dataset %in% c("ONT-RNA001-HAP_ILMNjunc", "Illumina")), 
276 |              aes(x = structural_category, y = length)) + 
277 |   geom_boxplot() + theme_bw() + 
278 |   facet_wrap(~ dataset, nrow = 1) +
279 |   scale_y_log10() + 
280 |   theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
281 |   xlab("Structural category") + ylab("Inferred transcript length")
282 | cowplot::plot_grid(p0, p1, p2,
283 |                    ncol = 1, labels = c("A", "B", "C"), 
284 |                    rel_heights = c(1.35, 1.28, 1.1), align = "v", axis = "l")
285 | dev.off()
286 | 
287 | saveRDS(NULL, file = outrds)
288 | date()
289 | sessionInfo()


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_sqanti_summary.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(rtracklayer)
  8 |   library(dplyr)
  9 |   library(GenomicFeatures)
 10 |   library(ggplot2)
 11 |   library(grDevices)
 12 |   library(cowplot)
 13 |   library(grid)
 14 | })
 15 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 16 | 
 17 | datasets <- strsplit(datasets, ",")[[1]]
 18 | datasets <- c(datasets, "Illumina")
 19 | names(datasets) <- datasets
 20 | conditions <- strsplit(conditions, ",")[[1]]
 21 | 
 22 | if (all(c("wt", "srpk") %in% conditions)) {
 23 |   sqanticond <- "_all"
 24 | } else if ("wt" %in% conditions) {
 25 |   sqanticond <- "_WT"
 26 | } else {
 27 |   stop("Unknown conditions")
 28 | }
 29 | 
 30 | print(datasets)
 31 | print(conditions)
 32 | print(sqanticond)
 33 | print(outrds)
 34 | 
 35 | muted <- c("#DC050C", "#E8601C", "#7BAFDE", "#1965B0", "#B17BA6",
 36 |            "#882E72", "#F1932D", "#F6C141", "#F7EE55", "#4EB265",
 37 |            "#CAEDAB", "#777777")
 38 | colfun <- grDevices::colorRampPalette(muted)
 39 | 
 40 | ## Read Illumina abundance estimates
 41 | ilmn_tx_abundances <- 
 42 |   readRDS("Illumina/output/Illumina_all_abundances.rds")$tx_abundances
 43 | ilmn_samples <- sample_annotation %>% 
 44 |   dplyr::filter(dataset == "Illumina" & condition %in% conditions) %>% 
 45 |   dplyr::pull("sample_orig")
 46 | ave_ilmn_tx_tpm <- 
 47 |   data.frame(transcript_id = rownames(ilmn_tx_abundances), 
 48 |              ave_tx_tpm = rowMeans(
 49 |                ilmn_tx_abundances[, paste0(ilmn_samples, "__tpm__salmon")]
 50 |              ),
 51 |              stringsAsFactors = FALSE)
 52 | 
 53 | ## ========================================================================== ##
 54 | ## Plot overlap of detected transcripts
 55 | df0 <- do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) {
 56 |   read.delim(paste0(
 57 |     ds, "/sqanti/", ds, sqanticond, "/", ds, sqanticond, 
 58 |     "_classification.txt"), 
 59 |     header = TRUE, as.is = TRUE) %>%
 60 |     dplyr::filter(structural_category %in% c("full-splice_match", 
 61 |                                              "incomplete-splice_match")) %>%
 62 |     dplyr::select(associated_transcript) %>%
 63 |     dplyr::rename(transcript_id = associated_transcript) %>%
 64 |     dplyr::distinct() %>%
 65 |     dplyr::mutate(dataset = remapds[ds],
 66 |                   identified = TRUE)
 67 | }))
 68 | dettx <- unique(df0$transcript_id)
 69 | detmat <- matrix(0, nrow = length(dettx), ncol = length(unique(df0$dataset)))
 70 | rownames(detmat) <- dettx
 71 | colnames(detmat) <- unique(df0$dataset)
 72 | detmat[as.matrix(df0 %>% dplyr::select(transcript_id, dataset))] <- 1
 73 | png(gsub("\\.rds$", "_shared_transcripts.png", outrds), 
 74 |     width = 10, height = 6, unit = "in", res = 400)
 75 | UpSetR::upset(data.frame(detmat, check.names = FALSE), order.by = "freq")
 76 | grid::grid.edit("arrange", name = "arrange2")
 77 | upsetplot <- grid::grid.grab()
 78 | dev.off()
 79 | 
 80 | ## ========================================================================== ##
 81 | ## Plot abundance of identified/unidentified transcripts 
 82 | df1 <- df0 %>% 
 83 |   dplyr::full_join(
 84 |     do.call(dplyr::bind_rows, lapply(setdiff(datasets, "Illumina"), function(ds) {
 85 |       ave_ilmn_tx_tpm %>% dplyr::mutate(dataset = remapds[ds])
 86 |     })), 
 87 |     by = c("transcript_id", "dataset")
 88 |   ) %>%
 89 |   dplyr::mutate(identified = replace(identified, is.na(identified), FALSE))
 90 | 
 91 | png(gsub("\\.rds$", "_illumina_abundance_by_identification.png", outrds),
 92 |     width = 8, height = 4, unit = "in", res = 400)
 93 | ggplot(df1, aes(x = identified, y = ave_tx_tpm + 1)) + 
 94 |   geom_boxplot() + facet_wrap(~ dataset, nrow = 1) + theme_bw() + 
 95 |   scale_y_log10() + xlab("Transcript identified by flair") + 
 96 |   ylab("Average abundance across Illumina samples (TPM + 1)")
 97 | dev.off()
 98 | 
 99 | ## Merge with upset plot
100 | df2 <- df1 %>% dplyr::select(-dataset) %>%
101 |   dplyr::group_by(transcript_id, ave_tx_tpm) %>%
102 |   dplyr::summarize(identified = any(identified))
103 | p1 <- ggplot(df2, aes(x = identified, y = ave_tx_tpm + 1)) + 
104 |   geom_boxplot() + theme_bw() + 
105 |   scale_y_log10() + xlab("Transcript identified by flair") + 
106 |   ylab("Average abundance across Illumina samples (TPM + 1)")
107 | png(gsub("\\.rds$", "_illumina_abundance_plus_shared_tx.png", outrds),
108 |     width = 12, height = 6, unit = "in", res = 400)
109 | print(cowplot::plot_grid(
110 |   upsetplot,
111 |   p1, 
112 |   rel_widths = c(4, 1.5), nrow = 1, labels = c("A", "B")
113 | ))
114 | dev.off()
115 | 
116 | ## ========================================================================== ##
117 | ## Get number of exons and read length per flair transcript and plot for
118 | ## different structural categories
119 | nbrexons <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
120 |   read.delim(paste0(
121 |     ds, "/sqanti/", ds, sqanticond, "/", ds, sqanticond, "_classification.txt"), 
122 |     header = TRUE, as.is = TRUE) %>%
123 |     dplyr::mutate(dataset = remapds[ds])
124 | })) %>%
125 |   dplyr::mutate(exons = replace(exons, exons > 2, ">2"))
126 | 
127 | ## ========================================================================== ##
128 | ## Plot distribution of structural categories
129 | dfcc <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
130 |   tmap <- read.delim(paste0(
131 |     ds, "/sqanti/", ds, sqanticond, "/", ds, sqanticond, "_classification.txt"), 
132 |     header = TRUE, as.is = TRUE)
133 |   as.data.frame(table(tmap$structural_category)) %>% 
134 |     dplyr::mutate(dataset = remapds[ds]) %>%
135 |     dplyr::rename(structural_category = Var1) %>%
136 |     dplyr::mutate(structural_category = as.character(structural_category))
137 | })) %>%
138 |   dplyr::mutate(structural_category = factor(structural_category, levels = c(
139 |     "novel_not_in_catalog",
140 |     "novel_in_catalog",
141 |     "antisense",
142 |     "intergenic",
143 |     "genic",
144 |     "genic_intron",
145 |     "fusion",
146 |     "incomplete-splice_match",
147 |     "full-splice_match"
148 |   )))
149 | 
150 | png(gsub("\\.rds$", "_nbr_exons_read_length_by_structuralcategory.png", outrds), 
151 |     width = 11, height = 11, unit = "in", res = 400)
152 | p0 <- ggplot(dfcc, aes(x = dataset, y = Freq)) + 
153 |   geom_bar(stat = "identity", position = "fill", 
154 |            aes(fill = structural_category)) + theme_bw() + 
155 |   scale_fill_manual(values = structure(colfun(15), 
156 |                                        names = levels(dfcc$structural_category)),
157 |                     name = "Overlap type (structural category)") + 
158 |   ylab("Fraction of transcripts") + xlab("") + 
159 |   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
160 |   scale_y_continuous(expand = c(0, 0, 0.05, 0)) + 
161 |   geom_text(data = dfcc %>% dplyr::group_by(dataset) %>%
162 |               dplyr::summarize(nbrTx = sum(Freq)),
163 |             aes(x = dataset, y = 1, label = nbrTx), 
164 |             vjust = 0, nudge_y = 0.02, size = 3)
165 | p1 <- ggplot(nbrexons %>%
166 |                dplyr::group_by(dataset, structural_category, exons) %>%
167 |                dplyr::tally() %>%
168 |                dplyr::ungroup() %>%
169 |                dplyr::mutate(exons = factor(exons, 
170 |                                             levels = c(">2", "2", "1"))), 
171 |              aes(x = structural_category, y = n, fill = exons)) + 
172 |   geom_bar(stat = "identity", position = "fill") + theme_bw() + 
173 |   facet_wrap(~ dataset, nrow = 1) +
174 |   xlab("Structural category") + ylab("Fraction of transcripts") + 
175 |   scale_fill_manual(values = c("1" = "red", "2" = "blue", ">2" = "grey"),
176 |                     name = "Number of\nexons") + 
177 |   theme(legend.position = "bottom",
178 |         axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
179 |   scale_y_continuous(expand = c(0, 0, 0.05, 0))
180 | p2 <- ggplot(nbrexons, 
181 |              aes(x = structural_category, y = length)) + 
182 |   geom_boxplot() + theme_bw() + 
183 |   facet_wrap(~ dataset, nrow = 1) +
184 |   scale_y_log10() + 
185 |   theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
186 |   xlab("Structural category") + ylab("Inferred transcript length")
187 | cowplot::plot_grid(p0, p1, p2, ncol = 1, labels = c("A", "B", "C"), 
188 |                    rel_heights = c(1.7, 1.08, 0.9), align = "v", axis = "l")
189 | dev.off()
190 | 
191 | png(gsub("\\.rds$", "_dist_to_tss_tts_by_structuralcategory.png", outrds), 
192 |     width = 11, height = 11, unit = "in", res = 400)
193 | p0 <- ggplot(nbrexons %>% dplyr::filter(!is.na(diff_to_TSS)) %>%
194 |                droplevels(), 
195 |              aes(x = structural_category, y = diff_to_TSS)) + 
196 |   geom_violin() + theme_bw() + 
197 |   facet_wrap(~ dataset, nrow = 1) +
198 |   theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
199 |   xlab("Structural category") + ylab("Distance to TSS")
200 | p1 <- ggplot(nbrexons %>% dplyr::filter(!is.na(diff_to_TTS)) %>%
201 |                droplevels(), 
202 |              aes(x = structural_category, y = diff_to_TTS)) + 
203 |   geom_violin() + theme_bw() + 
204 |   facet_wrap(~ dataset, nrow = 1) +
205 |   theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
206 |   xlab("Structural category") + ylab("Distance to TTS")
207 | p2 <- ggplot(nbrexons, 
208 |              aes(x = structural_category, y = perc_A_downstream_TTS)) + 
209 |   geom_violin() + theme_bw() + 
210 |   facet_wrap(~ dataset, nrow = 1) +
211 |   theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
212 |   xlab("Structural category") + ylab("Percentage A\ndownstream of TTS")
213 | cowplot::plot_grid(p0, p1, p2, ncol = 1, labels = c("A", "B", "C"), 
214 |                    rel_heights = c(1, 1, 1), align = "v", axis = "l")
215 | dev.off()
216 | 
217 | 
218 | saveRDS(NULL, file = outrds)
219 | date()
220 | sessionInfo()


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_txcov_sirv.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(dplyr)
  8 |   library(ggplot2)
  9 |   library(stringr)
 10 |   library(cowplot)
 11 |   library(GenomicAlignments)
 12 | })
 13 | 
 14 | labels <- strsplit(labels, ",")[[1]]
 15 | 
 16 | print(dataset)
 17 | print(labels)
 18 | print(title)
 19 | print(outrds)
 20 | 
 21 | ################################################################################
 22 | ## Number of covered transcripts, covered fraction
 23 | ################################################################################
 24 | dfct_primary <- do.call(dplyr::bind_rows, lapply(dataset, function(ds) {
 25 |   rd <- readRDS(paste0(ds, "/output/", ds, "_nbr_reads.rds"))
 26 |   rdt <- rd$txomebams_p0.99
 27 |   do.call(dplyr::bind_rows, lapply(names(rdt), function(nm) {
 28 |     rdt[[nm]]$allAlignments %>% dplyr::filter(flag %in% c(0, 16)) %>% 
 29 |       dplyr::mutate(sample = nm) %>% 
 30 |       dplyr::mutate(dataset = ds)
 31 |   }))
 32 | }))
 33 | 
 34 | dfct_longest <- do.call(dplyr::bind_rows, lapply(dataset, function(ds) {
 35 |   rd <- readRDS(paste0(ds, "/output/", ds, "_nbr_reads.rds"))
 36 |   rdt <- rd$txomebams_p0.99
 37 |   do.call(dplyr::bind_rows, lapply(names(rdt), function(nm) {
 38 |     rdt[[nm]]$allAlignments %>% 
 39 |       dplyr::select(read, flag, rname, nbrM, nbrS, nbrH, nbrD, nbrI, 
 40 |                     txLength, alignedLength) %>%
 41 |       dplyr::filter(flag %in% c(0, 16, 256, 272)) %>% 
 42 |       dplyr::group_by(read) %>%
 43 |       dplyr::filter(alignedLength/max(alignedLength) > 0.9) %>%
 44 |       dplyr::arrange(desc((nbrM + nbrD)/txLength)) %>%
 45 |       dplyr::slice(1) %>% 
 46 |       dplyr::ungroup() %>% 
 47 |       dplyr::mutate(sample = nm) %>% 
 48 |       dplyr::mutate(dataset = ds)
 49 |   }))
 50 | }))
 51 | 
 52 | plotCovFractionAll <- function(plotdf, maxLength) {
 53 |   ggplot(plotdf %>% dplyr::mutate(covFraction = (nbrM + nbrD)/txLength),
 54 |          aes(x = "All transcripts", y = covFraction)) + geom_violin() + 
 55 |     scale_y_continuous(limits = c(0, NA)) + 
 56 |     theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
 57 |     xlab("") + ylab("Fraction of transcript covered per alignment") + 
 58 |     stat_summary(data = plotdf %>% 
 59 |                    dplyr::summarize(covFraction = length(txLength)), 
 60 |                  fun.data = function(x) {return(c(y = 1, label = x))}, 
 61 |                  geom = "text", alpha = 1, size = 2.5, vjust = -1)
 62 | }
 63 | 
 64 | plotCovFractionStrat <- function(plotdf, maxLength) {
 65 |   ggplot(plotdf %>% 
 66 |            dplyr::mutate(txLengthGroup = Hmisc::cut2(txLength, 
 67 |                                                      cuts = c(0, 500, 1000, 1500, 
 68 |                                                               2000, maxLength))) %>%
 69 |            dplyr::mutate(covFraction = (nbrM + nbrD)/txLength), 
 70 |          aes(x = txLengthGroup, y = covFraction)) + 
 71 |     geom_violin() + scale_y_continuous(limits = c(0, NA)) + 
 72 |     theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
 73 |     xlab("Transcript length") + ylab("Fraction of transcript covered per alignment") + 
 74 |     stat_summary(data = plotdf %>% 
 75 |                    dplyr::mutate(txLengthGroup = Hmisc::cut2(txLength, 
 76 |                                                              cuts = c(0, 500, 1000, 1500, 
 77 |                                                                       2000, maxLength))) %>%
 78 |                    dplyr::group_by(txLengthGroup) %>%
 79 |                    dplyr::summarize(covFraction = length(txLength)), 
 80 |                  fun.data = function(x) {return(c(y = 1, label = x))}, 
 81 |                  geom = "text", alpha = 1, size = 2.5, vjust = -1)
 82 | }
 83 | 
 84 | png(gsub("\\.rds$", "_coverage_fraction_of_transcripts_primary.png", outrds),
 85 |     width = 5, height = 5, unit = "in", res = 400)
 86 | plot_title <- cowplot::ggdraw() + cowplot::draw_label(title, fontface = 'bold')
 87 | p0 <- cowplot::plot_grid(
 88 |   plotCovFractionAll(dfct_primary, max(dfct_primary$txLength)),
 89 |   plotCovFractionStrat(dfct_primary, max(dfct_primary$txLength)),
 90 |   rel_widths = c(2, 5), labels = labels
 91 | )
 92 | print(cowplot::plot_grid(plot_title, p0, rel_heights = c(0.075, 1), labels = "", ncol = 1))
 93 | dev.off()
 94 | 
 95 | png(gsub("\\.rds$", "_coverage_fraction_of_transcripts_longest.png", outrds),
 96 |     width = 5, height = 5, unit = "in", res = 400)
 97 | plot_title <- cowplot::ggdraw() + cowplot::draw_label(title, fontface = 'bold')
 98 | p0 <- cowplot::plot_grid(
 99 |   plotCovFractionAll(dfct_longest, max(dfct_longest$txLength)),
100 |   plotCovFractionStrat(dfct_longest, max(dfct_longest$txLength)),
101 |   rel_widths = c(2, 5), labels = labels
102 | )
103 | print(cowplot::plot_grid(plot_title, p0, rel_heights = c(0.075, 1), labels = "", ncol = 1))
104 | dev.off()
105 | 
106 | saveRDS(list(covfraclongest = plotCovFractionStrat(dfct_longest, 
107 |                                                    max(dfct_longest$txLength))), 
108 |         file = outrds)
109 | date()
110 | sessionInfo()
111 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/plot_txlength_vs_basecoverage.R:
--------------------------------------------------------------------------------
  1 | args <- (commandArgs(trailingOnly = TRUE))
  2 | for (i in 1:length(args)) {
  3 |   eval(parse(text = args[[i]]))
  4 | }
  5 | 
  6 | suppressPackageStartupMessages({
  7 |   library(bamsignals)
  8 |   library(dplyr)
  9 |   library(ggplot2)
 10 |   library(tximport)
 11 |   library(tibble)
 12 | })
 13 | source("manuscript_results_figures/Rscripts/remap_sample_names.R")
 14 | 
 15 | datasets <- strsplit(datasets, ",")[[1]]
 16 | datasets <- c(datasets, "Illumina")
 17 | conditions <- strsplit(conditions, ",")[[1]]
 18 | 
 19 | print(datasets)
 20 | print(conditions)
 21 | print(tx2gene)
 22 | print(gtf)
 23 | print(outrds)
 24 | 
 25 | tx2gene <- readRDS(tx2gene)
 26 | genes <- rtracklayer::import(gtf)
 27 | genes <- subset(genes, type == "exon")
 28 | 
 29 | ## Get coverages for all bases in all transcripts, for all samples
 30 | baseCoverages <- do.call(dplyr::bind_rows, lapply(datasets, function(ds) {
 31 |   print(paste0(ds, ":"))
 32 |   if (ds == "Illumina") {
 33 |     files <- list.files(paste0(ds, "/STAR"), 
 34 |                         pattern = "_Aligned.sortedByCoord.out.bam$", 
 35 |                         recursive = TRUE, full.names = TRUE)
 36 |     names(files) <- gsub("_Aligned.sortedByCoord.out.bam", 
 37 |                          "", basename(files))
 38 |     files <- files[names(files) %in% 
 39 |                      sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
 40 |   } else {
 41 |     files <- list.files(paste0(ds, "/minimap2genome"), 
 42 |                         pattern = "_minimap_genome_s.bam$", 
 43 |                         recursive = TRUE, full.names = TRUE)
 44 |     names(files) <- gsub("_minimap_genome_s.bam", 
 45 |                          "", basename(files))
 46 |     files <- files[names(files) %in% 
 47 |                      sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
 48 |   }
 49 |   do.call(dplyr::bind_rows, lapply(names(files), function(nm) {
 50 |     print(paste0("   ", nm))
 51 |     sigs <- bamCoverage(files[nm], genes, verbose = FALSE)
 52 |     as.data.frame(genes) %>% 
 53 |       dplyr::mutate(totCov = sapply(sigs, sum),
 54 |                     totNBases = sapply(sigs, length),
 55 |                     totNonzero = sapply(sigs, function(w) sum(w > 0))) %>%
 56 |       dplyr::group_by(transcript_id) %>%
 57 |       dplyr::summarize(txLength = sum(width),
 58 |                        totCov = sum(totCov), 
 59 |                        totNBases = sum(totNBases),
 60 |                        totNonzero = sum(totNonzero),
 61 |                        txBiotype = unique(transcript_biotype),
 62 |                        gene = unique(gene_id),
 63 |                        geneName = unique(gene_name),
 64 |                        geneBiotype = unique(gene_biotype)) %>%
 65 |       dplyr::mutate(dataset = remapds[ds],
 66 |                     sample = remap[nm])
 67 |   }))
 68 | }))
 69 | 
 70 | ## Add expression (TPM) in Illumina samples
 71 | files <- list.files("Illumina/salmon31", pattern = "quant.sf", 
 72 |                     recursive = TRUE, full.names = TRUE)
 73 | names(files) <- basename(dirname(files))
 74 | files <- files[names(files) %in% 
 75 |                  sample_annotation$sample_orig[sample_annotation$condition %in% conditions]]
 76 | salmon <- tximport(files = files, type = "salmon", txOut = TRUE)
 77 | 
 78 | tpm <- as.data.frame(salmon$abundance) %>%
 79 |   tibble::rownames_to_column("transcript_id") %>%
 80 |   dplyr::mutate(transcript_id = gsub("\\.[0-9]+$", "", transcript_id)) %>%
 81 |   tidyr::gather(key = sample, value = TPM, -transcript_id) %>%
 82 |   dplyr::mutate(sample = remap[sample],
 83 |                 dataset = "Illumina")
 84 | 
 85 | df <- baseCoverages %>% 
 86 |   dplyr::left_join(tpm, by = c("sample", "dataset", "transcript_id"))
 87 | 
 88 | ## Plot transcript length vs number of covered bases for Illumina data,
 89 | ## at different expression cutoffs
 90 | plots <- list()
 91 | png(gsub("\\.rds$", paste0("_txlength_vs_numcovbases_illumina_minTPM1.png"), outrds),
 92 |     width = 7, height = 7, unit = "in", res = 400)
 93 | df0 <- df %>% dplyr::filter(dataset == "Illumina") %>%
 94 |   dplyr::group_by(transcript_id) %>%
 95 |   dplyr::summarize(aveTPM = mean(TPM),
 96 |                    aveNbrNonzero = mean(totNonzero),
 97 |                    txLength = mean(txLength)) %>%
 98 |   dplyr::filter(aveTPM > 1)
 99 | plots[[paste0("txlength_vs_basecoverage_minTPM1")]] <- 
100 |   ggplot(df0, aes(x = txLength, y = aveNbrNonzero)) + 
101 |   geom_abline(slope = 1, intercept = 0) + 
102 |   geom_hex(bins = 100, aes(fill = stat(density))) + 
103 |   scale_fill_gradient(name = "", low = "bisque2", high = "darkblue") + 
104 |   theme_bw() + 
105 |   xlab("Transcript length") + 
106 |   ylab("Average number of covered bases in Illumina samples") + 
107 |   ggtitle(paste0("Transcripts with average TPM > 1"))
108 | print(plots[[paste0("txlength_vs_basecoverage_minTPM1")]])
109 | dev.off()
110 | 
111 | saveRDS(plots, file = outrds)
112 | date()
113 | sessionInfo()
114 | 
115 | 


--------------------------------------------------------------------------------
/manuscript_results_figures/Rscripts/remap_sample_names.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(dplyr)
 3 |   library(tidyr)
 4 | })
 5 | 
 6 | ## Define remapping of sample names
 7 | remap <- c(Srpk_1 = "ONT-NSK007-HAP_srpk_1", 
 8 |            Srpk_2 = "ONT-NSK007-HAP_srpk_2", 
 9 |            wt1 = "ONT-NSK007-HAP_wt_1", 
10 |            wt2 = "ONT-NSK007-HAP_wt_2",
11 |            wt_4_FGCZ = "ONT-PCS108-HAP_wt_4", 
12 |            `20171207_1645_p2557_4017_2_ALLREADS.pass` = "ONT-PCS108-HAP_wt_4",
13 |            srpk_1_DCS108 = "ONT-DCS108-HAP_srpk_1", 
14 |            srpk_2_DCS108 = "ONT-DCS108-HAP_srpk_2",
15 |            wt_1_DCS108 = "ONT-DCS108-HAP_wt_1", 
16 |            wt_2_DCS108 = "ONT-DCS108-HAP_wt_2",
17 |            srpk_1_RNA001 = "ONT-RNA001-HAP_srpk_1", 
18 |            srpk_2_RNA001 = "ONT-RNA001-HAP_srpk_2",
19 |            srpk_3_RNA001 = "ONT-RNA001-HAP_srpk_3", 
20 |            srpk_4_RNA001 = "ONT-RNA001-HAP_srpk_4",
21 |            srpk_5_RNA001 = "ONT-RNA001-HAP_srpk_5", 
22 |            srpk_6_RNA001 = "ONT-RNA001-HAP_srpk_6",
23 |            wt_1_RNA001 = "ONT-RNA001-HAP_wt_1", 
24 |            wt_2_RNA001 = "ONT-RNA001-HAP_wt_2",
25 |            wt_3_RNA001 = "ONT-RNA001-HAP_wt_3", 
26 |            wt_4_RNA001 = "ONT-RNA001-HAP_wt_4",
27 |            wt_5_RNA001 = "ONT-RNA001-HAP_wt_5", 
28 |            wt_6_RNA001 = "ONT-RNA001-HAP_wt_6",
29 |            SS2_srpk_1 = "TempSwitch-HAP_srpk_1", 
30 |            SS2_srpk_2 = "TempSwitch-HAP_srpk_2",
31 |            SS2_wt_1 = "TempSwitch-HAP_wt_1", 
32 |            SS2_wt_2 = "TempSwitch-HAP_wt_2",
33 |            `20170918.A-dSprk1_1` = "Illumina_srpk_1",
34 |            `20170918.A-dSprk1_2` = "Illumina_srpk_2",
35 |            `20170918.A-dSprk1_3` = "Illumina_srpk_3",
36 |            `20170918.A-dSprk1_4` = "Illumina_srpk_4",
37 |            `20170918.A-WT_1` = "Illumina_wt_1",
38 |            `20170918.A-WT_2` = "Illumina_wt_2",
39 |            `20170918.A-WT_3` = "Illumina_wt_3", 
40 |            `20170918.A-WT_4` = "Illumina_wt_4",
41 |            HEK_1 = "ONT-RNA001-HEK_wt_1", 
42 |            HEK_2 = "ONT-RNA001-HEK_wt_2", 
43 |            HEK_3 = "ONT-RNA001-HEK_wt_3",
44 |            HEK_4 = "ONT-RNA001-HEK_wt_4", 
45 |            HEK_5 = "ONT-RNA001-HEK_wt_5",
46 |            `20181026_0911_p2557_4959_1.pass` = "ONT-PCS109-HAP_wt_4",
47 |            wt4 = "ONT-SQK-PCS109-HAP_wt_4",
48 |            p2557_5265_1 = "ONT-SQK-PCS109-HAP_wt_1",
49 |            p2557_5265_2 = "ONT-SQK-PCS109-HAP_wt_2",
50 |            p2557_5265_5 = "ONT-SQK-PCS109-HAP_srpk_1",
51 |            p2557_5265_6 = "ONT-SQK-PCS109-HAP_srpk_2")
52 | 
53 | ## Create a sample annotation table
54 | sample_annotation <- data.frame(
55 |   sample_orig = names(remap),
56 |   sample_remap = remap,
57 |   stringsAsFactors = FALSE
58 | ) %>%
59 |   tidyr::separate(sample_remap, into = c("dataset", "condition", "sample_nbr"),
60 |                   sep = "_", remove = FALSE)
61 | 
62 | ## Define remapping of data set names
63 | remapds <- c(pilot = "ONT-NSK007-HAP",
64 |              NSK007 = "TempSwitch-HAP",
65 |              DCS108 = "ONT-DCS108-HAP",
66 |              RNA001 = "ONT-RNA001-HAP",
67 |              HEK293RNA = "ONT-RNA001-HEK",
68 |              FGCZ = "ONT-PCS108-HAP", 
69 |              Illumina = "Illumina",
70 |              FGCZ_PCS109 = "ONT-PCS109-HAP",
71 |              FGCZ_SQK_PCS109 = "ONT-SQK-PCS109-HAP",
72 |              FGCZ_PCS109_GridION = "ONT-SQK-PCS109-HAP")
73 | 
74 | ## Define color schemes
75 | ds_colors <- c(`ONT-DCS108-HAP`= "#66C2A5", `ONT-NSK007-HAP` = "#FC8D62", 
76 |                `ONT-RNA001-HEK` = "#8DA0CB", `TempSwitch-HAP` = "darkgrey", 
77 |                `ONT-PCS108-HAP` = "#A6D854", `ONT-RNA001-HAP` = "#FFD92F", 
78 |                Illumina = "#B3B3B3", `ONT-PCS109-HAP` = "#E78AC3",
79 |                `ONT-SQK-PCS109-HAP` = "darkblue")
80 | 
81 | ds_order <- c("Illumina", "ONT-NSK007-HAP", "ONT-DCS108-HAP", "ONT-RNA001-HAP", 
82 |               "ONT-RNA001-HEK")
83 | 
84 | ## Help function to remove the dataset name from the sample name
85 | removeDatasetFromSample <- function(sample, dataset) {
86 |   sapply(seq_along(sample), function(i) gsub(paste0(dataset[i], "_"), "", sample[i]))
87 | }
88 | 


--------------------------------------------------------------------------------