├── .gitignore ├── R ├── overlap_classification.R ├── .DS_Store ├── sample_read.R ├── write_csv.R ├── return_exons.R ├── write_bed.R ├── write_gtf.R ├── both_pseudo.R ├── exon_overlap.R ├── pseudo_overlap.R ├── PremrnaAnnotationGenerator.R ├── LoadGtf.R ├── readthrough_or_premature_min.R ├── readthrough_or_premature_plus.R ├── readthrough_or_premature.R ├── IdentifyOverlappers.R ├── GenerateGeneLocationBed.R ├── GenerateExtensionCandidates.R ├── IsolateIntergenicReads.R ├── OptimizedAnnotationAssembler.R └── OverlapResolutions.R ├── LICENSE ├── .DS_Store ├── inst ├── test_gene_replacement.csv ├── .DS_Store └── extdata │ ├── test_gene_replacement.csv │ ├── .DS_Store │ ├── #test_bam.bam │ ├── test_bam.bam │ ├── test_index.bam.bai │ └── #test_index.bam.bai ├── .Rbuildignore ├── ReferenceEnhancer-Manual2023.pdf ├── NAMESPACE ├── ReferenceEnhancer.Rproj ├── man ├── PremrnaAnnotationGenerator.Rd ├── LoadGtf.Rd ├── IdentifyOverlappers.Rd ├── GenerateExtensionCandidates.Rd ├── GenerateGeneLocationBed.Rd ├── IsolateIntergenicReads.Rd ├── OptimizedAnnotationAssembler.Rd └── OverlapResolutions.Rd ├── LICENSE.md ├── DESCRIPTION ├── README.Rmd ├── README.Rmd.orig └── .Rhistory /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | -------------------------------------------------------------------------------- /R/overlap_classification.R: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2022 2 | COPYRIGHT HOLDER: Allan-Hermann Pool 3 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/.DS_Store 
-------------------------------------------------------------------------------- /R/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/R/.DS_Store -------------------------------------------------------------------------------- /inst/test_gene_replacement.csv: -------------------------------------------------------------------------------- 1 | old_name,new_name 2 | Sox17,Sox17-Sox17a 3 | 4 | -------------------------------------------------------------------------------- /inst/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/inst/.DS_Store -------------------------------------------------------------------------------- /inst/extdata/test_gene_replacement.csv: -------------------------------------------------------------------------------- 1 | old_name,new_name 2 | Sox17,Sox17-Sox17a 3 | 4 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^ReferenceEnhancer\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^README\.Rmd$ 5 | -------------------------------------------------------------------------------- /inst/extdata/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/inst/extdata/.DS_Store -------------------------------------------------------------------------------- /inst/extdata/#test_bam.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/inst/extdata/#test_bam.bam -------------------------------------------------------------------------------- /inst/extdata/test_bam.bam: 
# File: R/sample_read.R
# Read a CSV file with readr and return the parsed table.
# Example input:
#   system.file("extdata", "mouse_sample.csv", package = "ReferenceEnhancer")
sample_read <- function(path) {
  readr::read_csv(path)
}

# File: R/write_csv.R
# Write `output_data` as a CSV file named `file_name`, resolved against the
# current working directory (".").
write_csv <- function(output_data, file_name) {
  destination <- file.path(".", file_name)
  write.csv(output_data, destination)
}

# File: R/return_exons.R
# Keep only the rows of `gene_name` (a data frame of annotation entries for one
# gene) whose `type` is "exon" and return their `start` and `end` columns as a
# two-column data frame.
return_exons <- function(gene_name) {
  exon_rows <- subset(gene_name, type == "exon")
  data.frame(exon_rows["start"], exon_rows["end"])
}

# File: R/write_bed.R
# Export `output_data` in BED format to `file_name`, resolved against the
# current working directory (".").
write_bed <- function(output_data, file_name) {
  destination <- file.path(".", file_name)
  rtracklayer::export.bed(output_data, con = destination)
}

# File: R/write_gtf.R
# Export `output_data` in GTF format to `file_name`; the name is handed to
# rtracklayer unchanged.
write_gtf <- function(output_data, file_name) {
  rtracklayer::export(output_data, file_name, format = "gtf")
}

# File: R/both_pseudo.R
# Report whether the pattern matches are spread over more than one name:
# counts the elements of `key` and of `overlapping` that match `gene_pattern`
# and returns TRUE when the combined count exceeds one. Returns FALSE when no
# `gene_pattern` is supplied.
both_pseudo <- function(key, overlapping, gene_pattern) {
  if (missing(gene_pattern)) {
    return(FALSE)
  }

  hits_in_key <- sum(stringr::str_detect(key, gene_pattern))
  hits_in_overlapping <- sum(stringr::str_detect(overlapping, gene_pattern))
  (hits_in_key + hits_in_overlapping) > 1
}
# File: R/exon_overlap.R
# Test whether any exon of gene A overlaps any exon of gene B.
#
# Both arguments are two-column tables (column 1 = start, column 2 = end), as
# produced by return_exons(). Each exon is treated as covering the positions
# start .. end - 1, which is the convention the original base-by-base
# implementation used, so two exons that only touch at one exon's end
# coordinate are NOT reported as overlapping.
#
# The comparison is done with interval arithmetic instead of enumerating every
# position: two such intervals share a position exactly when the larger start
# is below the smaller end. This also means an exon with start == end covers
# no position and simply never overlaps (the enumerating version raised a
# "wrong sign in 'by' argument" error for it).
#
# Returns TRUE or FALSE; FALSE when either table has no rows.
exon_overlap <- function(gene_A_exons, gene_B_exons) {
  if (nrow(gene_A_exons) == 0 || nrow(gene_B_exons) == 0) {
    return(FALSE)
  }

  a_start <- gene_A_exons[, 1]
  a_end <- gene_A_exons[, 2]
  b_start <- gene_B_exons[, 1]
  b_end <- gene_B_exons[, 2]

  # One cell per (exon of A, exon of B) pair.
  shared_start <- outer(a_start, b_start, pmax)
  shared_end <- outer(a_end, b_end, pmin)

  any(shared_start < shared_end)
}

# File: R/pseudo_overlap.R
# Decide how a gene pair relates to the genes matching `gene_pattern`.
#
# Returns:
#   "empty"     - no `gene_pattern` was supplied, or neither `key` nor
#                 `overlapping` matches it;
#   "exonic"    - at least one of the names matches, but exon_overlap() finds
#                 no overlap between the two exon tables;
#   key         - the exons overlap and `key` matches the pattern;
#   overlapping - the exons overlap and only `overlapping` matches the pattern.
pseudo_overlap <- function(key, overlapping, gene_A_exons, gene_B_exons, gene_pattern) {
  if (missing(gene_pattern)) {
    return("empty")
  }

  key_matches <- sum(stringr::str_detect(key, gene_pattern)) > 0
  overlapping_matches <- sum(stringr::str_detect(overlapping, gene_pattern)) > 0

  if (!key_matches && !overlapping_matches) {
    return("empty")
  }

  if (!exon_overlap(gene_A_exons, gene_B_exons)) {
    return("exonic")
  }

  # `key` takes precedence when both names match the pattern.
  if (key_matches) key else overlapping
}
\item{genome_annotation}{ENSEMBL/10x Genomics default genome annotation file (.gtf).} 11 | } 12 | \value{ 13 | Generates a basic pre-mRNA reference and saves in working directory as premrna.gtf 14 | } 15 | \description{ 16 | It supplements original normal gene annotation entries by 17 | traditional pre-mRNA entries where transcripts have been redefined as exons 18 | and map in the --include-introns mode to retrieve most of available intronic reads. 19 | } 20 | \examples{ 21 | genome_annotation <- LoadGtf("test_genes.gtf") 22 | PremrnaAnnotationGenerator(genome_annotation) 23 | } 24 | -------------------------------------------------------------------------------- /man/LoadGtf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/LoadGtf.R 3 | \name{LoadGtf} 4 | \alias{LoadGtf} 5 | \title{LoadGtf} 6 | \usage{ 7 | LoadGtf(unoptimized_annotation_path) 8 | } 9 | \arguments{ 10 | \item{unoptimized_annotation_path}{Path to the unoptimized genome annotation GTF file.} 11 | } 12 | \value{ 13 | Resulting object contains the genome annotation entries from the genome annotation GTF file. 14 | } 15 | \description{ 16 | Use to import the Ensembl/10x Genomics default genome annotation 17 | or other desired genome annotation file in GTF format for optimization for scRNA-seq 18 | analysis. Note: This file can be downloaded from 10x Genomics provided reference 19 | transcriptome "gene" folder at 20 | "https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest" 21 | or Ensembl.org if you wish to customize more. 
22 | } 23 | \examples{ 24 | LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 25 | } 26 | -------------------------------------------------------------------------------- /man/IdentifyOverlappers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IdentifyOverlappers.R 3 | \name{IdentifyOverlappers} 4 | \alias{IdentifyOverlappers} 5 | \title{IdentifyOverlappers} 6 | \usage{ 7 | IdentifyOverlappers(genome_annotation) 8 | } 9 | \arguments{ 10 | \item{genome_annotation}{Unoptimized genome annotation file in GTF. Could be 11 | obtained from Ensembl, Refseq, 10x Genomics or elsewhere.} 12 | } 13 | \value{ 14 | Rank-ordered gene list of same-strand overlapping genes (“overlapping_gene_list.csv”). 15 | } 16 | \description{ 17 | Identifies all same-strand overlapping genes based on the unoptimized 18 | genome annotation file in GTF, rank-orders them according to the number of gene 19 | overlaps. Saves the list of overlapping genes in working directory as “overlapping_gene_list.csv”. 20 | } 21 | \examples{ 22 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 23 | IdentifyOverlappers(genome_annotation = genome_annotation) 24 | } 25 | -------------------------------------------------------------------------------- /man/GenerateExtensionCandidates.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/GenerateExtensionCandidates.R 3 | \name{GenerateExtensionCandidates} 4 | \alias{GenerateExtensionCandidates} 5 | \title{GenerateExtensionCandidates} 6 | \usage{ 7 | GenerateExtensionCandidates(bedtools_loc = NULL) 8 | } 9 | \arguments{ 10 | \item{bedtools_loc}{Optional. 
Location of bedtools in file system.} 11 | } 12 | \value{ 13 | Rank ordered list of gene extension candidates saved to working directory 14 | as “gene_extension_candidates.csv”. 15 | } 16 | \description{ 17 | Identifies candidate genes for extension with excess 3' intergenic 18 | reads and creates a rank ordered list of genes as a function of 3' intergenic 19 | read mapping within 10kb of known gene end. You can use this as a prioritized 20 | gene list for gene extension to examine in Integrated Genomics Viewer. 21 | 22 | Note: It runs partially in Bash/Linux terminal. Make sure bedtools is installed 23 | and provide a path in the function if you get an error message. 24 | } 25 | \examples{ 26 | GenerateExtensionCandidates(bedtools_loc = NULL) 27 | } 28 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2022 Allan-Hermann Pool 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# File: R/PremrnaAnnotationGenerator.R

#' @title PremrnaAnnotationGenerator
#'
#' @description Builds traditional pre-mRNA annotation entries, in which every
#' transcript of the input annotation is redefined as an exon. These entries are
#' meant to supplement the original gene annotation entries and to be mapped in
#' the --include-introns mode to retrieve most of the available intronic reads.
#'
#' @param genome_annotation Genome annotation data frame, as returned by
#' LoadGtf() for an ENSEMBL/10x Genomics default genome annotation file (.gtf).
#'
#' @return Called for its side effect: generates a basic pre-mRNA reference and
#' saves it in the working directory as premrna.gtf. The confirmation message is
#' printed and returned.
#' @export
#'
#' @examples
#' genome_annotation <- LoadGtf("test_genes.gtf")
#' PremrnaAnnotationGenerator(genome_annotation)
PremrnaAnnotationGenerator <- function(genome_annotation) {
  # which() drops rows whose type is NA instead of turning them into all-NA rows.
  transcript_rows <- which(genome_annotation$type == "transcript")
  premrna_df <- genome_annotation[transcript_rows, , drop = FALSE]

  exon_label <- rep("exon", nrow(premrna_df))
  # The annotation's feature kind lives in the `type` column (the column the
  # filter above reads), so this is the column that has to change for the
  # transcripts to be redefined as exons. Previously only an extra `feature`
  # column was added and `type` stayed "transcript".
  premrna_df$type <- exon_label
  # Kept so that the extra "feature" column of earlier outputs is still present.
  premrna_df$feature <- exon_label

  premrna_df <- GenomicRanges::makeGRangesFromDataFrame(premrna_df, keep.extra.columns = TRUE)
  write_gtf(premrna_df, "premrna.gtf")
  print("Pre-mRNA reference has been saved in working directory as premrna.gtf")
}

# File: R/LoadGtf.R

#' @title LoadGtf
#'
#' @description Use to import the Ensembl/10x Genomics default genome annotation
#' or other desired genome annotation file in GTF format for optimization for scRNA-seq
#' analysis. Note: This file can be downloaded from 10x Genomics provided reference
#' transcriptome "gene" folder at
#' "https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest"
#' or Ensembl.org if you wish to customize more.
#'
#' @param unoptimized_annotation_path Path to the unoptimized genome annotation
#' GTF file. The special value "test_genes.gtf" selects the example annotation
#' bundled with the package when it is available.
#'
#' @return A data frame containing the genome annotation entries from the genome
#' annotation GTF file.
#' @export
#'
#' @examples
#' LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
LoadGtf <- function(unoptimized_annotation_path) {
  # Access test data
  if (identical(unoptimized_annotation_path, "test_genes.gtf")) {
    bundled_path <- system.file("extdata", "test_genes.gtf", package = "ReferenceEnhancer")
    if (nzchar(bundled_path)) {
      unoptimized_annotation_path <- bundled_path
    } else if (!file.exists(unoptimized_annotation_path)) {
      # system.file() returns "" when the file is not installed; fail with a
      # clear message instead of handing an empty path to the importer.
      stop(
        "Example annotation 'test_genes.gtf' was not found in the installed ",
        "ReferenceEnhancer package or in the working directory.",
        call. = FALSE
      )
    }
  }

  # Import the original exonic genome annotation file
  genome_annotation <- rtracklayer::import(con = unoptimized_annotation_path, format = "gtf")
  as.data.frame(genome_annotation)
}
Regular genome annotations and transcriptomic references generated based on them come with several problems causing discarded sequencing data from final gene expression estimates (outlined in detail in https://www.biorxiv.org/content/10.1101/2022.04.26.489449v1). These include read loss stemming from gene overlaps, sequencing reads mapping to 3’ unannotated exons as well as introns. ReferenceEnhancer enables fixing these issues and assembling optimized genome annotations that circumvent these problems and recover the discarded gene expression data. 8 | URL: https://github.com/PoolLab/ReferenceEnhancer 9 | License: Artistic-2.0 10 | Encoding: UTF-8 11 | Roxygen: list(markdown = TRUE) 12 | RoxygenNote: 7.2.3 13 | Imports: 14 | gdata, 15 | GenomicAlignments, 16 | GenomicRanges, 17 | IRanges, 18 | readr, 19 | Rsamtools, 20 | rtracklayer, 21 | stringr 22 | -------------------------------------------------------------------------------- /man/GenerateGeneLocationBed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/GenerateGeneLocationBed.R 3 | \name{GenerateGeneLocationBed} 4 | \alias{GenerateGeneLocationBed} 5 | \title{GenerateGeneLocationBed} 6 | \usage{ 7 | GenerateGeneLocationBed(genome_annotation, bedops_loc = NULL) 8 | } 9 | \arguments{ 10 | \item{genome_annotation}{Genome annotation DataFrame loaded with LoadGtf() 11 | function in this package.} 12 | 13 | \item{bedops_loc}{Optional. Location of BEDOPS in file system.} 14 | } 15 | \value{ 16 | Saves “gene_ranges.bed” in working directory. 17 | } 18 | \description{ 19 | Makes a bed file with gene boundaries, which is required for 20 | assigning intergenic reads to a specific gene and discovering genes with large 21 | amounts of intergenic reads near its 3’ gene end. 
# File: R/readthrough_or_premature_min.R
# Order two genes into upstream/downstream and classify the pair with
# readthrough_or_premature(). In this variant (presumably the minus-strand
# one, judging by the name) the gene whose "gene" entry has the larger end
# coordinate is upstream; when the ends are equal, gene B is upstream only if
# its start is larger than gene A's.
#
# gene_A / gene_B are annotation data frames with `type`, `start` and `end`
# columns. gene_A_exons / gene_B_exons are accepted but not used.
# Returns whatever readthrough_or_premature() returns.
readthrough_or_premature_min <- function(name_A, gene_A, name_B, gene_B, gene_A_exons, gene_B_exons) {
  locus_A <- gene_A[gene_A$type == "gene", ]
  start_A <- locus_A[, "start"]
  end_A <- locus_A[, "end"]
  locus_B <- gene_B[gene_B$type == "gene", ]
  start_B <- locus_B[, "start"]
  end_B <- locus_B[, "end"]

  if (end_B > end_A) {
    b_is_upstream <- TRUE
  } else if (end_A > end_B) {
    b_is_upstream <- FALSE
  } else {
    b_is_upstream <- start_B > start_A
  }

  if (b_is_upstream) {
    upstream_name <- name_B
    upstream <- gene_B
    downstream_name <- name_A
    downstream <- gene_A
  } else {
    upstream_name <- name_A
    upstream <- gene_A
    downstream_name <- name_B
    downstream <- gene_B
  }

  # Transcript coordinates as list(starts, ends) for each gene.
  upstream_rows <- upstream[upstream$type == "transcript", ]
  downstream_rows <- downstream[downstream$type == "transcript", ]

  readthrough_or_premature(
    upstream_name,
    downstream_name,
    list(upstream_rows[, "start"], upstream_rows[, "end"]),
    list(downstream_rows[, "start"], downstream_rows[, "end"])
  )
}

# File: R/readthrough_or_premature_plus.R
# Order two genes into upstream/downstream and classify the pair with
# readthrough_or_premature(). In this variant (presumably the plus-strand one,
# judging by the name) the gene whose "gene" entry has the smaller start
# coordinate is upstream; when the starts are equal, gene A is upstream only
# if its end is smaller than gene B's.
#
# gene_A / gene_B are annotation data frames with `type`, `start` and `end`
# columns. gene_A_exons / gene_B_exons are accepted but not used.
# Returns whatever readthrough_or_premature() returns.
readthrough_or_premature_plus <- function(name_A, gene_A, name_B, gene_B, gene_A_exons, gene_B_exons) {
  locus_A <- gene_A[gene_A$type == "gene", ]
  start_A <- locus_A[, "start"]
  end_A <- locus_A[, "end"]
  locus_B <- gene_B[gene_B$type == "gene", ]
  start_B <- locus_B[, "start"]
  end_B <- locus_B[, "end"]

  if (start_A < start_B) {
    a_is_upstream <- TRUE
  } else if (start_B < start_A) {
    a_is_upstream <- FALSE
  } else {
    a_is_upstream <- end_A < end_B
  }

  if (a_is_upstream) {
    upstream_name <- name_A
    upstream <- gene_A
    downstream_name <- name_B
    downstream <- gene_B
  } else {
    upstream_name <- name_B
    upstream <- gene_B
    downstream_name <- name_A
    downstream <- gene_A
  }

  # Transcript coordinates as list(starts, ends) for each gene.
  upstream_rows <- upstream[upstream$type == "transcript", ]
  downstream_rows <- downstream[downstream$type == "transcript", ]

  readthrough_or_premature(
    upstream_name,
    downstream_name,
    list(upstream_rows[, "start"], upstream_rows[, "end"]),
    list(downstream_rows[, "start"], downstream_rows[, "end"])
  )
}
# File: R/readthrough_or_premature.R
# Classify an upstream/downstream gene pair from the way their transcripts
# overlap.
#
# Arguments:
#   upstream_name, downstream_name  names passed through to the result.
#   upstream_trx, downstream_trx    list(starts, ends) of each gene's
#                                   transcripts: element 1 holds the start
#                                   coordinates, element 2 the matching ends.
#
# A transcript is treated as covering start .. end - 1 (the convention of the
# original base-by-base implementation); overlaps are found with interval
# arithmetic rather than by enumerating every position of every transcript.
#
# With max_u = the largest number of downstream transcripts overlapped by any
# single upstream transcript, and max_d the same seen from the downstream gene:
#   max_u > max_d                         -> "readthrough"
#   max_d > max_u                         -> "premature"
#   tie, one gene has exactly one
#     transcript and the other several    -> "manual"
#   tie with max_u == max_d == 1          -> "manual"
#   any other tie                         -> "readthrough" if the upstream gene
#                                            has more transcripts, otherwise
#                                            "premature"
# A gene without any transcript yields "manual".
#
# Returns list(upstream_name, downstream_name, <label>).
readthrough_or_premature <- function(upstream_name, downstream_name, upstream_trx, downstream_trx) {
  verdict <- function(label) {
    list(upstream_name, downstream_name, label)
  }

  up_start <- upstream_trx[[1]]
  up_end <- upstream_trx[[2]]
  down_start <- downstream_trx[[1]]
  down_end <- downstream_trx[[2]]

  n_up <- length(up_start)
  n_down <- length(down_start)

  # Nothing to compare: leave the pair for manual curation.
  if (n_up == 0 || n_down == 0) {
    return(verdict("manual"))
  }

  # overlaps[u, d] is TRUE when upstream transcript u and downstream
  # transcript d share at least one position.
  overlaps <- outer(up_start, down_start, pmax) < outer(up_end, down_end, pmin)
  max_u <- max(rowSums(overlaps))
  max_d <- max(colSums(overlaps))

  if (max_u > max_d) {
    return(verdict("readthrough"))
  }
  if (max_d > max_u) {
    return(verdict("premature"))
  }

  # From here on max_u == max_d.
  # The transcript counts must be taken from the coordinate vectors: the
  # length of the enclosing list(starts, ends) is always 2, which made this
  # branch unreachable before.
  if (min(n_up, n_down) == 1 && max(n_up, n_down) != 1) {
    return(verdict("manual"))
  }
  if (max_u == 1 && max_d == 1) {
    return(verdict("manual"))
  }
  if (n_up > n_down) {
    return(verdict("readthrough"))
  }
  verdict("premature")
}
The false exonic reads can be recognized 31 | and captured as proper intergenic reads by extracting two kinds of reads 32 | (RE=I and RE=E & AN=1){ 31 | overlapper[i] = TRUE 32 | number_of_overlaps[i] = a-1 33 | conflict_genes = gene_names[as.logical(GenomicRanges::countOverlaps(genes_df, genes_df[i]))] 34 | conflict_genes = setdiff(conflict_genes, gene_names[i]) 35 | overlapping_genes[i] = paste(conflict_genes, collapse = ', ') 36 | } 37 | } 38 | 39 | overlapping_gene_list = as.data.frame(cbind(gene_names, number_of_overlaps, overlapping_genes))[overlapper,] 40 | colnames(overlapping_gene_list) = c("gene", "number_of_gene_overlaps", "overlapping_genes") 41 | overlapping_gene_list$number_of_gene_overlaps = as.integer(overlapping_gene_list$number_of_gene_overlaps) 42 | 43 | o = order(overlapping_gene_list$number_of_gene_overlaps, decreasing = TRUE) # Rank order genes by the number of gene overlaps 44 | overlapping_gene_list = overlapping_gene_list[o,] 45 | 46 | if(dim(overlapping_gene_list)[1] > 0){ 47 | row.names(overlapping_gene_list) = 1:nrow(overlapping_gene_list) 48 | } 49 | 50 | overlapping_gene_list["automatic_classification"] <- "" 51 | overlapping_gene_list["final_classification"] <- "" 52 | overlapping_gene_list["transcripts_for_deletion"] <- "" 53 | overlapping_gene_list["comments"] <- "" 54 | 55 | write_csv(overlapping_gene_list, 'overlapping_gene_list.csv') 56 | print("A list of overlapping genes has been saved in your working directory (overlapping_gene_list.csv) for manual curation.") 57 | return(overlapping_gene_list) 58 | 59 | } 60 | -------------------------------------------------------------------------------- /R/GenerateGeneLocationBed.R: -------------------------------------------------------------------------------- 1 | #' @title GenerateGeneLocationBed 2 | #' 3 | #' @description Makes a bed file with gene boundaries, which is required for 4 | #' assigning intergenic reads to a specific gene and discovering genes with large 5 | #' amounts of 
#' intergenic reads near its 3’ gene end.
#'
#' Note 1: This step is partially run in Linux Terminal in Bash and requires BEDOPS
#' (https://bedops.readthedocs.io/en/latest/). Make sure BEDOPS is installed and
#' provide a path to BEDOPS in the function if you get an error message.
#'
#' Note 2: In Linux terminal, navigate to folder with the genome annotation of interest.
#' The annotation file should be named "genes.gtf" per 10x Genomics convention.
#'
#' @param genome_annotation Genome annotation DataFrame loaded with LoadGtf()
#' function in this package.
#' @param bedops_loc Optional. Directory containing the BEDOPS executables
#' (a path to the gtf2bed executable itself is also accepted). It is appended
#' to PATH for the duration of the call only.
#'
#' @return Saves “gene_ranges.bed” in working directory.
#' @export
#'
#' @examples
#' genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
#' GenerateGeneLocationBed(
#' genome_annotation = genome_annotation,
#' bedops_loc = NULL)
GenerateGeneLocationBed <- function(genome_annotation, bedops_loc = NULL) {
  ## Make BEDOPS reachable. A user-supplied location must actually be added to
  ## PATH (previously it was ignored); PATH is restored when the function exits.
  if (!is.null(bedops_loc)) {
    if (!dir.exists(bedops_loc) && file.exists(bedops_loc)) {
      bedops_loc <- dirname(bedops_loc) # path to the executable was given
    }
    old_path <- Sys.getenv("PATH")
    on.exit(Sys.setenv(PATH = old_path), add = TRUE)
    Sys.setenv(PATH = paste(old_path, bedops_loc, sep = .Platform$path.sep))
  }
  if (!nzchar(Sys.which("gtf2bed"))) {
    stop("Didn't find bedops. Please install bedops or provide a path to bedops.",
         call. = FALSE)
  }

  ## Extract all "gene" entries of the genome annotation and export them as GTF.
  ## which() drops rows whose type is NA instead of turning them into NA rows.
  gene_rows <- which(genome_annotation$type == "gene")
  gene_ranges_gr <- GenomicRanges::makeGRangesFromDataFrame(
    genome_annotation[gene_rows, ],
    keep.extra.columns = TRUE
  )
  rtracklayer::export(gene_ranges_gr, "gene_ranges.gtf", format = "gtf")

  ## Intermediate files are removed even if a later step fails.
  on.exit(unlink(c("gene_ranges.gtf", "gene_ranges1.gtf")), add = TRUE)

  ## Append a transcript_id attribute to records lacking one so that gtf2bed
  ## accepts the file (runs in the system shell).
  system('awk \'{ if ($0 ~ "transcript_id") print $0; else print $0" transcript_id \"\";"; }\' gene_ranges.gtf > gene_ranges1.gtf')

  ## Create a bed file with gene boundaries.
  status <- system("gtf2bed < gene_ranges1.gtf > gene_ranges.bed")
  if (!identical(status, 0L)) {
    stop("gtf2bed failed while creating gene_ranges.bed (exit status ", status, ").",
         call. = FALSE)
  }

  ## Replace the attribute column (column 10) with the gene name only.
  gene_ranges <- read.table("gene_ranges.bed", sep = "\t")
  if (nrow(gene_ranges) > 0) {
    gene_ranges[[10]] <- stringr::str_match(
      gene_ranges[[10]],
      "gene_name\\s*(.*?)\\s*;"
    )[, 2]
  }

  ## Save outcome
  write.table(gene_ranges, "gene_ranges.bed", sep = "\t", row.names = FALSE,
              col.names = FALSE, quote = FALSE)
  print("Gene ranges file (gene_ranges.bed) has been saved in working directory.")
}

--------------------------------------------------------------------------------
/R/GenerateExtensionCandidates.R:
--------------------------------------------------------------------------------
#' @title GenerateExtensionCandidates
#'
#' @description Identifies candidate genes for extension with excess 3' intergenic
#' reads and creates a rank ordered list of genes as a function of 3' intergenic
#' read mapping within 10kb of known gene end. You can use this as a prioritized
#' gene list for gene extension to examine in Integrative Genomics Viewer.
#'
#' Note: It runs partially in Bash/Linux terminal. Make sure bedtools is installed
#' and provide a path in the function if you get an error message.
#'
#' @param bedtools_loc Optional. Location of bedtools in file system.
#' @param max_distance Optional. Maximum distance (in bases) past the known
#' gene end within which an intergenic read is counted. Defaults to 10000.
#' @param min_reads Optional. A gene is reported only if it has more than this
#' many intergenic reads inside the window. Defaults to 10.
#'
#' @return Rank ordered list of gene extension candidates saved to working directory
#' as “gene_extension_candidates.csv”.
#' @export
#'
#' @examples
#' GenerateExtensionCandidates(bedtools_loc = NULL)
GenerateExtensionCandidates <- function(bedtools_loc = NULL,
                                        max_distance = 10000,
                                        min_reads = 10) {
  ## Make bedtools reachable. A supplied location is appended to PATH for the
  ## duration of the call; a path to the executable itself is reduced to its
  ## directory, since PATH entries must be directories.
  if (!is.null(bedtools_loc)) {
    if (!dir.exists(bedtools_loc) && file.exists(bedtools_loc)) {
      bedtools_loc <- dirname(bedtools_loc)
    }
    old_path <- Sys.getenv("PATH")
    on.exit(Sys.setenv(PATH = old_path), add = TRUE)
    Sys.setenv(PATH = paste(old_path, bedtools_loc, sep = .Platform$path.sep))
  }
  if (!nzchar(Sys.which("bedtools"))) {
    stop("Didn't find bedtools. Please install bedtools or provide a path to bedtools.",
         call. = FALSE)
  }

  ## bedtools closest requires position-sorted input (runs in the system shell).
  system("sort -k 1,1 -k2,2n gene_ranges.bed > gene_ranges1.bed")
  system("sort -k 1,1 -k2,2n intergenic_reads.bed > intergenic_reads1.bed")

  ## Resulting file contains sequencing reads with distance data from closest
  ## 3' gene identity and end.
  system("bedtools closest -a intergenic_reads1.bed -b gene_ranges1.bed -s -D a -fu > results.txt")

  closest <- read.table("results.txt", sep = "\t")

  ## Retain only reads inside the window past the known gene end. V22 holds the
  ## gene name and V23 the signed distance reported by bedtools.
  in_window <- which(closest$V23 > -max_distance & closest$V23 < 0)
  closest <- closest[in_window, ]

  ## Plot histogram of intergenic reads as a function of distance from 3' gene
  ## ends (hist() fails on empty input, so skip it when nothing is left).
  if (nrow(closest) > 0) {
    hist(closest$V23)
  }

  ## Number of intergenic reads per gene, ranked from highest to lowest and
  ## thresholded on the amount of intergenic read loading.
  read_counts <- table(closest$V22)
  read_counts <- read_counts[order(read_counts, decreasing = TRUE)]
  read_counts <- read_counts[read_counts > min_reads]

  ## Built column by column so that an empty result still yields a valid table
  ## with the same column names as before (Var1, Freq).
  n_candidates <- length(read_counts)
  candidates <- data.frame(
    Var1 = as.character(names(read_counts)),
    Freq = as.integer(read_counts),
    update_start = rep("", n_candidates),
    update_end = rep("", n_candidates),
    stringsAsFactors = FALSE
  )

  write_csv(candidates, "gene_extension_candidates.csv")
  print("A rank ordered list of gene extension candidates has been saved to working directory as gene_extension_candidates.csv")
}

--------------------------------------------------------------------------------
/man/OptimizedAnnotationAssembler.Rd:
--------------------------------------------------------------------------------
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/OptimizedAnnotationAssembler.R
\name{OptimizedAnnotationAssembler}
\alias{OptimizedAnnotationAssembler}
\title{OptimizedAnnotationAssembler}
\usage{
OptimizedAnnotationAssembler(
unoptimized_annotation_path,
gene_overlaps,
gene_extension,
gene_replacement
)
}
\arguments{
\item{unoptimized_annotation_path}{path to unoptimized genome annotation file in GTF.}

\item{gene_overlaps}{overlapping genes list generated with IdentifyOverlappers function.}

\item{gene_extension}{list of gene
extension candidates generated with GenerateExtensionCandidates function.} 20 | 21 | \item{gene_replacement}{manually generated list of gene names to be replaced in .csv format. Column names: old_name, new_name. Optional.} 22 | } 23 | \value{ 24 | Single-cell RNA-seq optimized genome annotation that can be used to 25 | generate the transcriptomic reference (e.g. with cellranger mkref or 26 | STAR --runMode genomeGenerate pipelines) for mapping single-cell sequencing data. 27 | } 28 | \description{ 29 | OptimizedAnnotationAssembler generates the scRNA-seq optimized genome annotation. 30 | The resulting optimized genome annotation can be used to generate the transcriptomic 31 | reference for mapping single-cell sequencing data (e.g. with cellranger mkref 32 | or STAR --runMode genomeGenerate). Note that completing this step is time intensive 33 | and can sometimes take 12-24 hours depending on the length of the annotation 34 | to be optimized. 35 | This function goes through the following steps: 36 | 0. Load data and libraries: 37 | \itemize{ 38 | \item genome annotation file to be optimized in GTF. 39 | \item "overlapping_gene_list.csv" file specifying how to resolve gene overlap 40 | derived issues. "Delete" entries in $final_classification field mark genes 41 | for deletion. Transcript names in $transcripts_for_deletion mark specific 42 | transcripts for deletion. 43 | \item "gene_extension_candidates.csv" specifying updated gene boundaries for 44 | incorporating intergenic reads. 45 | \item "rename_genes.csv" specifying gene names to be replaced and new names 46 | (under $old_names and $new_names fields, respectively). 47 | } 48 | \enumerate{ 49 | \item Resolve "self-overlapping" gene (duplicate gene_ids) derived issues. 50 | Required for making references compatible with multiome workflows. 51 | \item Creates pre-mRNA genome annotation from input genome annotation. 
This step 52 | extracts all transcript entries from the genome annotation and defines them 53 | as full length exons with new transcript IDs and corresponding transcripts. 54 | This allows to capture many intronically mapped reads that otherwise get discarded. 55 | \item Gene deletion step: Deletes all annotation entries for genes destined for 56 | deletion (has "Delete" entry in $final_classification field of 57 | "overlapping_gene_list.csv". 58 | \item Transcript deletion step: Deletes all transcripts destined for deletion 59 | (transcript names listed in the "transcripts_for_deletion" column in 60 | "overlapping_gene_list.csv". 61 | \item Gene coordinate adjustment step: Replaces the left most or right most 62 | coordinate of the first exon of a gene in genome annotation if there is a 63 | coordinate in columns $new_left or $new_right in the 64 | "gene_extension_candidates.csv". 65 | \item Adds pre-mRNA reads to all genes not in the gene overlap list. 66 | \item Renames genes to avoid discarding expression data with near perfect terminal 67 | exon overlap. 68 | \item Saves the optimized genome annotation in a new GTF file. 69 | } 70 | } 71 | \examples{ 72 | OptimizedAnnotationAssembler( 73 | unoptimized_annotation_path = "test_genes.gtf", 74 | gene_overlaps = "test_overlapping_gene_list.csv", 75 | gene_extension = "./gene_extension_candidates.csv", 76 | gene_replacement = "test_gene_replacement.csv") 77 | } 78 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | # ReferenceEnhancer 2 | 3 | The goal of ReferenceEnhancer is to generate a scRNA-seq optimized transcriptomic reference. 4 | 5 | Generating a scRNA-seq optimized transcriptomic reference requires optimizing the genome annotation ("xxx.gtf") file that transcriptomic references are based on. 
6 | 7 | The following three aspects of genome annotations need to be optimized: A) Resolving gene overlap derived read loss; B) Recovering intergenic reads from 3' un-annotated exons; and C) Recovering intronic reads. 8 | 9 | After optimizing and assembling the genome annotation, you can use "cellranger mkref" pipeline to assemble the optimized transcriptomic reference for mapping sequencing read data and compiling gene-cell matrices with the "cellranger count" (or other) pipeline. 10 | 11 | ## Installation 12 | 13 | You can install the development version of ReferenceEnhancer as follows: 14 | 15 | ``` r 16 | install.packages("devtools") 17 | require(devtools) 18 | install_github("PoolLab/ReferenceEnhancer") 19 | ``` 20 | 21 | ## Example 22 | 23 | # This is a sample workflow of the package: 24 | 25 | This is the basic workflow for optimizing a genome annotation for single-cell RNA-seq work using ReferenceEnhancer: 26 | 27 | 1. Load ReferenceEnhancer and import ENSEMBL/10x Genomics default genome annotation file (GTF). 28 | 29 | This file can be downloaded from 10x Genomics provided reference transcriptome "gene" folder at "https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest" or Ensembl.org if you wish to customize more. 30 | 31 | For testing, we have provided a sample file. 32 | 33 | library(ReferenceEnhancer) 34 | 35 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 36 | 37 | 2. Identify all overlapping genes based on the ENSEMBL/10x Genomics default genome annotation file (GTF), rank-order them according to the number of gene overlaps. 38 | 39 | Prioritize this gene list for manual curation focusing on exonically overlapping genes. The function saves the list of overlapping genes in working directory as overlapping_gene_list.csv. 40 | 41 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 42 | 43 | 3. Generate recommended actions for overlapping genes based on original genome annotation .gtf file and a list of overlapping genes. 
44 | 45 | The function updates overlapping_gene_list.csv file with added recommendations. 46 | 47 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 48 | 49 | 4. Extract intergenic reads from Cell Ranger aligned bam file. The function saves extracted intergenic reads in working directory as intergenic_reads.bed. 50 | 51 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 52 | 53 | 5. Generate gene boundaries in order to assign intergenic reads to a specific gene. The function saves the result in working directory as gene_ranges.bed. 54 | 55 | Note: This step runs partially in bash/linux terminal. Before this step, make sure that bedops (https://bedops.readthedocs.io/en/latest/) has been installed to your computer. 56 | 57 | GenerateGeneLocationBed(genome_annotation = genome_annotation, bedops_loc = NULL) 58 | 59 | 6. Identify candidate genes for extension with excess 3' intergenic reads and create a rank ordered list of genes as a function of 3' intergenic read mapping within 10kb of known gene end. A rank ordered list of gene extension candidates is saved in working directory as gene_extension_candidates.csv. 60 | 61 | Note: This step runs partially in bash/linux terminal. Before this step, make sure that bedtools (https://bedtools.readthedocs.io/en/latest/content/installation.html) has been installed to your computer and that it has been added to the path in your R environment. 62 | 63 | GenerateExtensionCandidates(bedtools_loc = NULL) 64 | 65 | 7. Create the final optimized annotation file. The function saves the result in working directory as optimized_reference.gtf. 
66 | 67 | OptimizedAnnotationAssembler(unoptimized_annotation_path = "test_genes.gtf", gene_overlaps = "test_overlapping_gene_list.csv", gene_extension = "gene_extension_candidates.csv", gene_replacement = "test_gene_replacement.csv") 68 | -------------------------------------------------------------------------------- /README.Rmd.orig: -------------------------------------------------------------------------------- 1 | # ReferenceEnhancer 2 | 3 | The goal of ReferenceEnhancer is to generate a scRNA-seq optimized transcriptomic reference. 4 | 5 | Generating a scRNA-seq optimized transcriptomic reference requires optimizing the genome annotation ("xxx.gtf") file that transcriptomic references are based on. 6 | 7 | The following three aspects of genome annotations need to be optimized: A) Resolving gene overlap derived read loss; B) Recovering intergenic reads from 3' un-annotated exons; and C) Recovering intronic reads. 8 | 9 | After optimizing and assembling the genome annotation, you can use "cellranger mkref" pipeline to assemble the optimized transcriptomic reference for mapping sequencing read data and compiling gene-cell matrices with the "cellranger count" (or other) pipeline. 10 | 11 | ## Installation 12 | 13 | <<<<<<< HEAD 14 | You can install ReferenceEnhancer like so: 15 | ======= 16 | You can install the development version of ReferenceEnhancer as follows: 17 | >>>>>>> 67ca0da0bac1c232f772d39eff8f1d99ac69b6d9 18 | 19 | ``` r 20 | install.packages("devtools") 21 | require(devtools) 22 | install_github("PoolLab/ReferenceEnhancer") 23 | ``` 24 | 25 | ## Example 26 | 27 | <<<<<<< HEAD 28 | This is a sample workflow of the package: 29 | ======= 30 | This is the basic workflow for optimizing a genome annotation for single-cell RNA-seq work using ReferenceEnhancer: 31 | >>>>>>> 67ca0da0bac1c232f772d39eff8f1d99ac69b6d9 32 | 33 | 1. Load ReferenceEnhancer and import ENSEMBL/10x Genomics default genome annotation file (GTF). 
34 | 35 | This file can be downloaded from 10x Genomics provided reference transcriptome "gene" folder at "https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest" or Ensembl.org if wish to customize more. 36 | 37 | For testing, we have provided a sample file. 38 | ```{r example} 39 | library(ReferenceEnhancer) 40 | genome_annotation <- LoadGtf("test_genes.gtf") 41 | ``` 42 | 43 | 2. Identify all overlapping genes based on the ENSEMBL/10x Genomics default genome annotation file (GTF), rank-order them according to the number of gene overlaps. 44 | 45 | Prioritize this gene list for manual curation focusing on exonically overlapping genes. 46 | The function saves the list of overlapping genes in working directory as overlapping_gene_list.csv. 47 | ```{r example} 48 | gene_overlaps <- IdentifyOverlappers(genome_annotation) 49 | ``` 50 | 51 | 3. Generate recommended actions for overlapping genes based on original genome annotation .gtf file and a list of overlapping genes. 52 | 53 | The function updates overlapping_gene_list.csv file with added recommendations. 54 | ```{r example} 55 | OverlapResolutions(genome_annotation, gene_overlaps) 56 | ``` 57 | 58 | 4. Extract intergenic reads from Cell Ranger aligned bam file. 59 | The function saves extracted intergenic reads in working directory as intergenic_reads.bed. 60 | ```{r example} 61 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai") 62 | ``` 63 | 64 | 5. Generate gene boundaries in order to assign intergenic reads to a specific gene. The function save resulting in working directory as gene_ranges.bed. 65 | 66 | Note: This step runs partially in bash/linux terminal. Before this step, make sure that bedops (https://bedops.readthedocs.io/en/latest/) has been installed to your computer. 67 | ```{r example} 68 | GenerateGeneLocationBed(genome_annotation) 69 | ``` 70 | 71 | 6. 
Identify candidate genes for extension with excess 3' intergenic reads and create a rank ordered list of genes as a function of 3' intergenic read mapping within 10kb of known gene end. A rank ordered list of gene extension candidates is saved in working directory as gene_extension_candidates.csv. 72 | 73 | Note: This step runs partially in bash/linux terminal. Before this step, make sure that bedtools (https://bedtools.readthedocs.io/en/latest/content/installation.html) has been installed to your computer and that it has been added to to the path in your R environment. 74 | ```{r example} 75 | GenerateExtensionCandidates() 76 | ``` 77 | 78 | 7. Create the final optimized annotation file. The function saves the result in working directory as optimized_reference.gtf. 79 | ```{r example} 80 | OptimizedAnnotationAssembler("test_genes.gtf", "premrna.gtf", "overlapping_gene_list.csv", "gene_extension_candidates.csv", "test_gene_replacement.csv") 81 | ``` 82 | -------------------------------------------------------------------------------- /R/IsolateIntergenicReads.R: -------------------------------------------------------------------------------- 1 | #' @title IsolateIntergenicReads 2 | #' 3 | #' @description Intergenic reads are extracted from Cell Ranger aligned bam file. 4 | #' Use a scRNA-seq dataset of interest that has been aligned to the unoptimized 5 | #' genome reference with the Cell Ranger count pipeline. Intergenic reads can be 6 | #' identified by two features: their read identity tag RE = "I" (for intergenic) 7 | #' OR their RE=E (for exonic) with AN = . The latter reads are in fact 8 | #' intergenic reads since Cell Ranger wrongly classifies reads mapping antisense 9 | #' to an exon as exonic (i.e. RE="E"). The false exonic reads can be recognized 10 | #' and captured as proper intergenic reads by extracting two kinds of reads 11 | #' (RE=I and RE=E & AN= classify for “Manual inspection” 48 | b. 
If gene’s exons do not overlap with any other genes’ exons --> classify as “Keep as is” 49 | c. Assign recommended action for overlapping genes: 50 | i. If nested gene does not overlap with any other gene --> classify as “Keep as is” 51 | ii. If nested gene overlaps with more than one gene --> classify for “Manual inspection” 52 | \item If gene overlaps with only one other gene, test whether gene is non-protein 53 | coding/pseudogene (“Gm” and “…Rik” gene models in mice; “AC…” and “AL…” gene models in humans) 54 | a. If both overlapping genes are non-protein coding/pseudogenes --> classify for 55 | “Manual inspection” 56 | b. If only one gene in the overlapping gene pair is non-protein coding/pseudogene, 57 | test if genes have overlapping exons: 58 | i. In case no overlapping exons --> classify both genes as “Keep as is” 59 | ii. In case exons overlap --> mark non-protein coding/pseudogene for 60 | deletion (“Delete”). 61 | c. If both genes are well supported genes: 62 | i. If their exons don’t overlap --> mark both genes as “Keep as is” 63 | ii. If their exons do overlap, determine the number of opposing gene’s 64 | exonic overlap for each exon of each gene and find the exon with most 65 | overlaps for both upstream and downstream gene to determine appropriate 66 | course of action: 67 | 1. If downstream gene’s exon has more overlaps than its upstream 68 | counterpart, classify downstream gene as “Premature transcript deletion” 69 | and upstream gene as “Keep as is” 70 | 2. If upstream gene’s exon has more overlaps than its downstream 71 | counterpart, classify upstream gene as “Readthrough transcript deletion” 72 | and downstream gene as “Keep as is” 73 | 3. 
Otherwise classify both for “Manual inspection” 74 | The resulting recommendations can be used in the manual curation step, where 75 | all genes that are not classified in the “Keep as is” category should directly 76 | be scrutinized in the Ensembl genome browser (ensembl.org, with the correct 77 | genome builds) and/or cross-referenced to the respective Refseq genome annotation 78 | within the Integrative Genomics Viewer (IGV 2.11.9). 79 | } 80 | } 81 | \examples{ 82 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 83 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 84 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("^Gm", "Rik$")) 85 | 86 | # Note: The example treats genes starting with “Gm…” and ending with “…Rik” as 87 | # pseudogenes. Additional patterns for recognizing candidate pseudo- or low-quality 88 | # genes can be defined with regular expressions for matching gene names 89 | # with a given pattern. See vignette("regular-expressions") in the stringr package 90 | # for details. 91 | } 92 | -------------------------------------------------------------------------------- /R/OptimizedAnnotationAssembler.R: -------------------------------------------------------------------------------- 1 | #' @title OptimizedAnnotationAssembler 2 | #' 3 | #' @description 4 | #' OptimizedAnnotationAssembler generates the scRNA-seq optimized genome annotation. 5 | #' The resulting optimized genome annotation can be used to generate the transcriptomic 6 | #' reference for mapping single-cell sequencing data (e.g. with cellranger mkref 7 | #' or STAR --runMode genomeGenerate). Note that completing this step is time intensive 8 | #' and can sometimes take 12-24 hours depending on the length of the annotation 9 | #' to be optimized. 10 | #' This function goes through the following steps: 11 | #' 0. 
Load data and libraries: 12 | #' - genome annotation file to be optimized in GTF. 13 | #' - "overlapping_gene_list.csv" file specifying how to resolve gene overlap 14 | #' derived issues. "Delete" entries in $final_classification field mark genes 15 | #' for deletion. Transcript names in $transcripts_for_deletion mark specific 16 | #' transcripts for deletion. 17 | #' - "gene_extension_candidates.csv" specifying updated gene boundaries for 18 | #' incorporating intergenic reads. 19 | #' - "rename_genes.csv" specifying gene names to be replaced and new names 20 | #' (under $old_name and $new_name fields, respectively). 21 | #' 1. Resolve "self-overlapping" gene (duplicate gene_ids) derived issues. 22 | #' Required for making references compatible with multiome workflows. 23 | #' 2. Creates pre-mRNA genome annotation from input genome annotation. This step 24 | #' extracts all transcript entries from the genome annotation and defines them 25 | #' as full length exons with new transcript IDs and corresponding transcripts. 26 | #' This allows capturing many intronically mapped reads that otherwise get discarded. 27 | #' 3. Gene deletion step: Deletes all annotation entries for genes destined for 28 | #' deletion (has "Delete" entry in $final_classification field of 29 | #' "overlapping_gene_list.csv"). 30 | #' 4. Transcript deletion step: Deletes all transcripts destined for deletion 31 | #' (transcript names listed in the "transcripts_for_deletion" column in 32 | #' "overlapping_gene_list.csv"). 33 | #' 5. Gene coordinate adjustment step: Replaces the start coordinate of the 34 | #' first exon or the end coordinate of the last exon of a gene in genome annotation if there is a 35 | #' coordinate in columns $update_start or $update_end in the 36 | #' "gene_extension_candidates.csv". 37 | #' 6. Adds pre-mRNA reads to all genes not in the gene overlap list. 38 | #' 7. Renames genes to avoid discarding expression data with near perfect terminal 39 | #' exon overlap. 40 | #' 8. 
#' Saves the optimized genome annotation in a new GTF file.
#'
#' @param unoptimized_annotation_path path to unoptimized genome annotation file in GTF.
#' @param gene_overlaps path to the overlapping genes list (.csv) generated with
#' IdentifyOverlappers function and curated manually. The gene names are read
#' from a column called "genes" (or "gene").
#' @param gene_extension path to the list of gene extension candidates (.csv)
#' generated with GenerateExtensionCandidates function, with new coordinates
#' entered in the update_start / update_end columns. The gene names are read
#' from a column called "genes" (or "gene" / "Var1").
#' @param gene_replacement manually generated list of gene names to be replaced in .csv format. Column names: old_name, new_name. Optional.
#'
#' @return Single-cell RNA-seq optimized genome annotation that can be used to
#' generate the transcriptomic reference (e.g. with cellranger mkref or
#' STAR --runMode genomeGenerate pipelines) for mapping single-cell sequencing data.
#' The annotation is written to "optimized_reference.gtf" in the working directory.
#' @export
#'
#' @examples
#' OptimizedAnnotationAssembler(
#' unoptimized_annotation_path = "test_genes.gtf",
#' gene_overlaps = "test_overlapping_gene_list.csv",
#' gene_extension = "./gene_extension_candidates.csv",
#' gene_replacement = "test_gene_replacement.csv")
OptimizedAnnotationAssembler <- function(unoptimized_annotation_path, gene_overlaps, gene_extension, gene_replacement) {

  ## Returns the gene-name column of a curation table. The tables written by
  ## this package do not agree on its name, so several spellings are accepted.
  gene_column <- function(df, source_label) {
    hit <- intersect(c("genes", "gene", "Var1"), colnames(df))
    if (length(hit) == 0) {
      stop("No gene name column ('genes' or 'gene') found in ", source_label, ".",
           call. = FALSE)
    }
    as.character(df[[hit[1]]])
  }

  ## Overwrites one coordinate column of the first (or last) exon row of each
  ## listed gene. Genes that have no exon row left are reported and skipped.
  move_exon_boundary <- function(annotation, genes, coords, column, use_last) {
    exon_row <- !is.na(annotation$type) & annotation$type == "exon"
    for (k in seq_along(genes)) {
      rows <- which(exon_row & annotation$gene_name == genes[k])
      if (length(rows) == 0) {
        warning("Gene '", genes[k], "' has no exon entry in the annotation; ",
                "its boundary was not updated.", call. = FALSE)
        next
      }
      target <- if (use_last) rows[length(rows)] else rows[1]
      annotation[target, column] <- coords[k]
    }
    annotation
  }

  if (identical(gene_overlaps, "test_overlapping_gene_list.csv")) {
    gene_overlaps <- system.file("extdata", "test_overlapping_gene_list.csv", package = "ReferenceEnhancer")
  }

  unoptimized_df <- LoadGtf(unoptimized_annotation_path)
  overlap_df <- read.csv(gene_overlaps, header = TRUE)
  overlap_genes <- gene_column(overlap_df, "gene_overlaps")

  new_df <- unoptimized_df

  #### 1. Create premRNA genome annotation from input gtf that defines transcripts as exons ####
  ###############################################################################################
  transcripts_df <- unoptimized_df[which(unoptimized_df$type == "transcript"), ]
  exons_df <- transcripts_df # pre-mRNA exons: one full-length exon per transcript
  exons_df$type <- rep("exon", nrow(exons_df))

  premrna_df <- gdata::interleave(transcripts_df, exons_df) # transcript, exon, transcript, exon, ...

  ## Give new transcript_ids to everything in the pre-mRNA gtf.
  ## NOTE(review): each row gets its own running number, so a pre-mRNA exon does
  ## not share the transcript_id of the transcript row before it. Kept as in the
  ## original implementation; confirm this is intended.
  premrna_df$transcript_id <- as.character(seq_len(nrow(premrna_df)))

  rm(unoptimized_df)

  #### 2. Delete select genes ####
  #################################
  genes_to_delete <- overlap_genes[which(overlap_df$final_classification == "Delete")]
  new_df <- new_df[!new_df$gene_name %in% genes_to_delete, ]

  #### 3. Delete select transcripts ####
  #######################################
  ## Cells may hold several comma separated transcript names. Blank cells are
  ## read as NA when the whole column is empty, so NA is filtered explicitly.
  deletion_cells <- as.character(overlap_df$transcripts_for_deletion)
  deletion_cells <- deletion_cells[!is.na(deletion_cells) & nzchar(deletion_cells)]
  transcripts_to_delete <- trimws(as.character(
    unlist(strsplit(deletion_cells, ",", fixed = TRUE), use.names = FALSE)
  ))
  transcripts_to_delete <- transcripts_to_delete[nzchar(transcripts_to_delete)]

  new_df <- new_df[!new_df$transcript_name %in% transcripts_to_delete, ]

  #### 4. Adjust gene coordinates ####
  #####################################
  boundary_fix <- read.csv(gene_extension, header = TRUE)
  boundary_genes <- gene_column(boundary_fix, "gene_extension")

  ## Non-numeric entries become NA and are skipped like blank ones.
  new_starts <- as.numeric(as.character(boundary_fix$update_start))
  new_ends <- as.numeric(as.character(boundary_fix$update_end))
  has_start <- !is.na(new_starts)
  has_end <- !is.na(new_ends)

  ## Columns 2 and 3 of the annotation hold the start and end coordinates.
  new_df <- move_exon_boundary(new_df, boundary_genes[has_start],
                               new_starts[has_start], column = 2, use_last = FALSE)
  new_df <- move_exon_boundary(new_df, boundary_genes[has_end],
                               new_ends[has_end], column = 3, use_last = TRUE)

  #### 5. Add pre-mRNA transcripts to genes not in the gene overlap list ####
  ############################################################################
  # Explanation: Cellranger --include-introns mode unfortunately does not pick up on many intronic reads. Those can be picked up by adding the pre-mRNA transcripts to the respective genes as exons with new transcript_id values.

  ## Genes to modify: everything that is not part of the gene overlap list.
  genes_to_append <- setdiff(unique(new_df$gene_name), overlap_genes)
  genes_to_append <- genes_to_append[!is.na(genes_to_append)]

  ## Reformat the gtf dataframes such that premrna entries can be added to the
  ## original entries, compiling a hybrid reference for capturing intronic reads.
  final_colnames <- intersect(colnames(new_df), colnames(premrna_df))
  new_df <- new_df[, final_colnames, drop = FALSE]
  premrna_df <- premrna_df[, final_colnames, drop = FALSE]

  ## Insert the pre-mRNA entries of each gene right after that gene's last row.
  ## This is done in one pass with a sort key (last row index + 0.5) instead of
  ## rebuilding the table once per gene, and it no longer skips the last gene.
  insert <- premrna_df[premrna_df$gene_name %in% genes_to_append, , drop = FALSE]
  n_rows <- nrow(new_df)
  last_row_of_gene <- n_rows + 1L - match(insert$gene_name, rev(new_df$gene_name))
  new_df <- rbind(new_df, insert)
  new_df <- new_df[order(c(seq_len(n_rows), last_row_of_gene + 0.5)), , drop = FALSE]
  rownames(new_df) <- NULL

  #### 6. Rename desired genes ####
  #################################
  # Rename desired genes (example from mouse genome): "Cers1"==>"Cers1_Gdf1" // "Chtf8" ==> "Chtf8_Derpc" // "Insl3" ==> "Insl3_Jak3" // "Pcdhga1" ==> "Pcdhg_all" // "Pcdha1" ==> "Pcdha_all" // "Ugt1a10" ==> "Ugt1a_all" // "4933427D14Rik" ==> "4933427D14Rik_Gm43951" // "Mkks" ==> "Mkks_plus"
  if (!missing(gene_replacement) && !is.null(gene_replacement)) {
    if (identical(gene_replacement, "test_gene_replacement.csv")) {
      gene_replacement <- system.file("extdata", "test_gene_replacement.csv", package = "ReferenceEnhancer")
    }

    replacement_df <- read.csv(gene_replacement, header = TRUE)
    old_names <- as.character(replacement_df[, "old_name"])
    new_names <- as.character(replacement_df[, "new_name"])

    new_df$gene_name <- as.character(new_df$gene_name)
    new_df$transcript_name <- as.character(new_df$transcript_name)

    ## Only entries of the exact gene are renamed; the old name is treated as a
    ## literal string. Substring/regex replacement would also rewrite unrelated
    ## genes that merely contain the old name (e.g. "Pcdha1" inside "Pcdha10").
    for (k in seq_along(old_names)) {
      hit <- which(new_df$gene_name == old_names[k])
      new_df$transcript_name[hit] <- stringr::str_replace_all(
        new_df$transcript_name[hit], stringr::fixed(old_names[k]), new_names[k]
      )
      new_df$gene_name[hit] <- new_names[k]
    }
  }

  #### 7. Export object to gtf file ####
  ######################################
  new_gtf <- GenomicRanges::makeGRangesFromDataFrame(new_df, keep.extra.columns = TRUE)

  write_gtf(new_gtf, "optimized_reference.gtf")
  print("Optimized annotation reference has been saved in working directory as optimized_reference.gtf")
}

--------------------------------------------------------------------------------
/R/OverlapResolutions.R:
--------------------------------------------------------------------------------
#' @title OverlapResolutions
#'
#' @description Based on original genome annotation GTF file and a list of
#' overlapping genes, generates recommended actions for overlapping genes.
#' This is an optional step that can help with decision making during the manual
#' curation step.
7 | #' 8 | #' Gene overlaps can be resolved by one of several strategies including 9 | #' (i) leaving overlapping gene annotations unchanged if their exons don’t directly overlap, 10 | #' (ii) deleting offending readthrough transcripts from upstream genes, 11 | #' (iii) deleting offending premature gene transcripts from downstream genes, 12 | #' (iv) deleting pseudogenes and non-protein coding genes with poor support and 13 | #' no read mapping that obscure well established protein coding genes or 14 | #' (v) for extensively overlapping genes deleting one and renaming the other to 15 | #' capture otherwise discarded reads. As well annotated genomes contain several 16 | #' thousand same-strand overlapping genes and properly resolving gene overlaps 17 | #' often requires manual inspection of the locus to determine best course of action, 18 | #' prioritization of genes for manual curation is often desirable. To this end, 19 | #' OverlapResolutions function classifies genes to prioritize for direct inspection. 20 | #' The following algorithm is used to classify genes for appropriate curation: 21 | #' 1. If gene overlaps with multiple genes: 22 | #' a. If gene’s exons overlap with another gene’s exons --> classify for “Manual inspection” 23 | #' b. If gene’s exons do not overlap with any other genes’ exons --> classify as “Keep as is” 24 | #' c. Assign recommended action for overlapping genes: 25 | #' i. If nested gene does not overlap with any other gene --> classify as “Keep as is” 26 | #' ii. If nested gene overlaps with more than one gene --> classify for “Manual inspection” 27 | #'2. If gene overlaps with only one other gene, test whether gene is non-protein 28 | #'coding/pseudogene (“Gm” and “…Rik” gene models in mice; “AC…” and “AL…” gene models in humans) 29 | #' a. If both overlapping genes are non-protein coding/pseudogenes --> classify for 30 | #' “Manual inspection” 31 | #' b. 
If only one gene in the overlapping gene pair is non-protein coding/pseudogene, 32 | #' test if genes have overlapping exons: 33 | #' i. In case no overlapping exons --> classify both genes as “Keep as is” 34 | #' ii. In case exons overlap --> mark non-protein coding/pseudogene for 35 | #' deletion (“Delete”). 36 | #' c. If both genes are well supported genes: 37 | #' i. If their exons don’t overlap --> mark both genes as “Keep as is” 38 | #' ii. If their exons do overlap, determine the number of opposing gene’s 39 | #' exonic overlap for each exon of each gene and find the exon with most 40 | #' overlaps for both upstream and downstream gene to determine appropriate 41 | #' course of action: 42 | #' 1. If downstream gene’s exon has more overlaps than its upstream 43 | #' counterpart, classify downstream gene as “Premature transcript deletion” 44 | #' and upstream gene as “Keep as is” 45 | #' 2. If upstream gene’s exon has more overlaps than its downstream 46 | #' counterpart, classify upstream gene as “Readthrough transcript deletion” 47 | #' and downstream gene as “Keep as is” 48 | #' 3. Otherwise classify both for “Manual inspection” 49 | #'The resulting recommendations can be used in the manual curation step, where 50 | #'all genes that are not classified in the “Keep as is” category should directly 51 | #'be scrutinized in the Ensembl genome browser (ensembl.org, with the correct 52 | #'genome builds) and/or cross-referenced to the respective RefSeq genome annotation 53 | #'within the Integrative Genomics Viewer (IGV 2.11.9). 54 | #' 55 | #' @param genome_annotation Unoptimized genome annotation (e.g. Ensembl/10x Genomics) 56 | #' default genome annotation GTF file. This should be a dataframe created with the 57 | #' LoadGtf() function in this package. 58 | #' @param overlap_data A list of overlapping genes generated by IdentifyOverlappers. 
59 | #' @param gene_pattern The pattern in gene names that is unique for pseudo- or 60 | #' other low quality or low interest genes. Patterns for recognizing candidate 61 | #' pseudo- or low quality genes can be defined with regular expressions for matching 62 | #' gene names with a given pattern. See vignette (regular-expressions) in the stringr 63 | #' package for details or examples below. 64 | #' 65 | #' @return Generates “overlapping_gene_list.csv” with added recommendations for 66 | #' resolving gene overlaps in the “automatic_classification” column. 67 | #' @export 68 | #' 69 | #' @examples 70 | #' genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 71 | #' gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 72 | #' OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("^Gm", "Rik$")) 73 | #' 74 | #' # Note: The example treats genes starting with “Gm…” and ending with “…Rik” as 75 | #' # pseudogenes. Additional patterns for recognizing candidate pseudo- or low-quality 76 | #' # genes can be defined with regular expressions for matching gene names 77 | #' # with a given pattern. See vignette(regular-expressions) in the stringr package 78 | #' # for details. 
OverlapResolutions <- function(genome_annotation, overlap_data, gene_pattern){
  # Adds an 'automatic_classification' recommendation to every gene in
  # `overlap_data` and hands the table to write_csv() as
  # "overlapping_gene_list.csv".
  #
  # genome_annotation: data frame with (at least) 'gene_name' and 'strand'
  #                    columns; rows are passed on to return_exons().
  # overlap_data:      data frame with 'gene', 'number_of_gene_overlaps' and
  #                    'overlapping_genes' (partners separated by ", ").
  # gene_pattern:      forwarded unchanged to both_pseudo()/pseudo_overlap().
  #
  # Returns whatever write_csv() returns (it is the last call).
  class_col <- 'automatic_classification'
  count_col <- 'number_of_gene_overlaps'
  partner_col <- 'overlapping_genes'

  # Fail early with a readable message instead of an "undefined columns" error
  # from deep inside the loop.
  missing_cols <- setdiff(c('gene', count_col, partner_col), colnames(overlap_data))
  if (length(missing_cols) > 0) {
    stop("overlap_data is missing required column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # Rows of the annotation that belong to one gene. `%in%` never yields NA, so
  # annotation entries without a gene_name cannot leak in as all-NA rows.
  annotation_rows <- function(name) {
    genome_annotation[genome_annotation[['gene_name']] %in% name, , drop = FALSE]
  }

  # Labels for (result[[1]], result[[2]]) keyed by the verdict in result[[3]]
  # of the readthrough_or_premature_* helpers.
  verdict_labels <- list(
    readthrough = c('Readthrough transcript deletion', 'Keep as is'),
    premature = c('Keep as is', 'Premature transcript deletion'),
    manual = c('Manual inspection', 'Manual inspection')
  )

  # Keep the first row of every gene, in order of first appearance. Unlike a
  # 1:length() loop this also works for an empty table.
  overlap_data <- overlap_data[!duplicated(overlap_data[['gene']]), , drop = FALSE]
  rownames(overlap_data) <- overlap_data[['gene']]
  overlap_data[[class_col]] <- rep(NA, nrow(overlap_data))

  for (key in rownames(overlap_data)) {

    # A gene may already have been classified as the partner of an earlier key.
    if (!is.na(overlap_data[key, class_col])) {
      next
    }

    gene_A <- annotation_rows(key)
    n_overlaps <- overlap_data[key, count_col]

    if (n_overlaps > 1) {
      # --- Gene overlaps several partners ---------------------------------
      partners <- strsplit(as.character(overlap_data[key, partner_col]), ", ")[[1]]
      gene_A_exons <- return_exons(gene_A)  # invariant across partners

      for (item in partners) {
        gene_B_exons <- return_exons(annotation_rows(item))

        if (exon_overlap(gene_A_exons, gene_B_exons) == TRUE) {
          # Any exonic clash sends both genes to manual inspection, overriding
          # an earlier 'Keep as is' on the key.
          overlap_data[item, class_col] <- 'Manual inspection'
          overlap_data[key, class_col] <- 'Manual inspection'
        }
        else if (is.na(overlap_data[key, class_col])) {
          # First partner without exonic clash: key is provisionally fine; the
          # partner is fine only if the key is its sole overlap.
          overlap_data[key, class_col] <- 'Keep as is'
          if (overlap_data[item, count_col] > 1) {
            overlap_data[item, class_col] <- 'Manual inspection'
          }
          else {
            overlap_data[item, class_col] <- 'Keep as is'
          }
        }
      }
    }
    else if (n_overlaps == 1) {
      # --- Gene overlaps exactly one partner ------------------------------
      # as.character() guards against a factor column, which would otherwise
      # index rows by integer code.
      overlapping <- as.character(overlap_data[key, partner_col][[1]])
      gene_B <- annotation_rows(overlapping)
      strand <- gene_A[1, 'strand']

      gene_A_exons <- return_exons(gene_A)
      gene_B_exons <- return_exons(gene_B)

      if (both_pseudo(key, overlapping, gene_pattern) == TRUE) {
        overlap_data[key, class_col] <- 'Manual inspection'
        overlap_data[overlapping, class_col] <- 'Manual inspection'
      }
      else {
        # Evaluate the pseudogene check once instead of once per branch.
        pseudo_result <- pseudo_overlap(key, overlapping, gene_A_exons, gene_B_exons, gene_pattern)

        if (pseudo_result == key) {
          overlap_data[key, class_col] <- 'Delete'
          overlap_data[overlapping, class_col] <- 'Keep as is'
        }
        else if (pseudo_result == overlapping) {
          overlap_data[key, class_col] <- 'Keep as is'
          overlap_data[overlapping, class_col] <- 'Delete'
        }
        else if (pseudo_result == 'exonic') {
          overlap_data[key, class_col] <- 'Keep as is'
          overlap_data[overlapping, class_col] <- 'Keep as is'
        }
        else {
          exons_clash <- exon_overlap(gene_A_exons, gene_B_exons)

          if (exons_clash == TRUE) {
            # Strand decides which helper orders the pair upstream/downstream.
            # Any other strand value leaves the pair unclassified, as before.
            resolver <- NULL
            if (strand == '+') {
              resolver <- readthrough_or_premature_plus
            }
            else if (strand == '-') {
              resolver <- readthrough_or_premature_min
            }

            if (!is.null(resolver)) {
              result <- resolver(key, gene_A, overlapping, gene_B, gene_A_exons, gene_B_exons)
              labels <- verdict_labels[[as.character(result[[3]])]]

              # Unknown verdicts leave the pair unclassified.
              if (!is.null(labels)) {
                overlap_data[result[[1]], class_col] <- labels[1]
                overlap_data[result[[2]], class_col] <- labels[2]
              }
            }
          }
          else if (exons_clash == FALSE) {
            overlap_data[key, class_col] <- 'Keep as is'
            overlap_data[overlapping, class_col] <- 'Keep as is'
          }
        }
      }
    }
  }

  print("Overlapping genes list (overlapping_gene_list.csv) has been updated with recommended action categories and the file has been saved in your working directory")
  write_csv(overlap_data, "overlapping_gene_list.csv")

}
GenerateExtensionCandidates() 6 | OptimizedAnnotationAssembler("test_genes.gtf", “test_overlapping_gene_list.csv", "gene_extension_candidates.csv", "test_gene_replacement.csv") 7 | OptimizedAnnotationAssembler("test_genes.gtf", "test_overlapping_gene_list.csv", "gene_extension_candidates.csv", "test_gene_replacement.csv") 8 | git push 9 | git config pull.ff only 10 | library(ReferenceEnhancer) 11 | library(ReferenceEnhancer) 12 | library(referenceenhancer) 13 | getwd() 14 | setwd("/Users/helen/Desktop") 15 | setwd("/Users/helen/Desktop/Overlap") 16 | genome_annotation <- LoadGtf("test_genes.gtf") 17 | ?IdentifyOverlappers 18 | library(ReferenceEnhancer) 19 | getwd() 20 | install.packages("devtools") 21 | require(devtools) 22 | library(ReferenceEnhancer) 23 | install.packages("gdata") 24 | install.packages("readr") 25 | ?IdentifyOverlappers 26 | library(ReferenceEnhancer) 27 | genome_annotation <- LoadGtf("test_genes.gtf") 28 | library(ReferenceEnhancer) 29 | ?IdentifyOverlappers 30 | ?ReferenceEnhancer 31 | ??ReferenceEnhancer 32 | library(ReferenceEnhancer) 33 | pwd 34 | getwd() 35 | genome_annotation <- LoadGtf("test_genes.gtf") 36 | genome_annotation <- LoadGtf(genes_gtf_path) 37 | library(ReferenceEnhancer) 38 | devtools::document() 39 | library(ReferenceEnhancer) 40 | genome_annotation <- LoadGtf("test_genes.gtf") 41 | BiocManager::install("rtracklayer") 42 | genome_annotation <- LoadGtf("test_genes.gtf") 43 | gene_overlaps <- IdentifyOverlappers(genome_annotation) 44 | OverlapResolutions(genome_annotation, gene_overlaps, c("Rik$", "^Gm")) 45 | devtools::document() 46 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai") 47 | ?LoadGtf 48 | ?OptimizedAnnotationAssembler 49 | ??OptimizedAnnotationAssembler 50 | library(ReferenceEnhancer) 51 | getwd() 52 | OptimizedAnnotationAssembler() 53 | library(ReferenceEnhancer) 54 | ?OptimizedAnnotationAssembler 55 | library(ReferenceEnhancer) 56 | library(ReferenceEnhancer) 57 | library(ReferenceEnhancer) 58 | 
?OptimizedAnnotationAssembler 59 | library(ReferenceEnhancer) 60 | ?OptimizedAnnotationAssembler 61 | test = LoadGtf("test_genes.gtf") 62 | test 63 | dim(test) 64 | exonic_gtf <- system.file("extdata", "test_genes.gtf", package = "ReferenceEnhancer") 65 | genome_annotation <- LoadGtf("test_genes.gtf") 66 | gene_overlaps <- IdentifyOverlappers(genome_annotation) 67 | gene_extension <- GenerateExtensionCandidates() 68 | OptimizedAnnotationAssembler(exonic_gtf, gene_overlaps, gene_extension, gene_replacement) 69 | library(ReferenceEnhancer) 70 | exonic_gtf <- system.file("extdata", "test_genes.gtf", package = "ReferenceEnhancer") 71 | genome_annotation <- LoadGtf("test_genes.gtf") 72 | genome_annotation <- LoadGtf("test_genes.gtf") 73 | gene_extension <- GenerateExtensionCandidates() 74 | read.table("results.txt", sep = "\t") 75 | library(ReferenceEnhancer) 76 | summary_data 77 | exonic_gtf <- system.file("extdata", "test_genes.gtf", package = "ReferenceEnhancer") 78 | genome_annotation <- LoadGtf("test_genes.gtf") 79 | genome_annotation <- LoadGtf("test_genes.gtf") 80 | gene_extension <- GenerateExtensionCandidates() 81 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv") 82 | pwd 83 | getwd() 84 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv") 85 | OptimizedAnnotationAssembler("test_genes.gtf", "overlapping_gene_list.csv", " gene_extension_candidates.csv", "rename_genes.csv") 86 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv") 87 | library(ReferenceEnhancer) 88 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv") 89 | library(ReferenceEnhancer) 90 | OptimizedAnnotationAssembler("./test_genes.gtf", 
"./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv") 91 | library(ReferenceEnhancer) 92 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv") 93 | gene_overlaps 94 | library(ReferenceEnhancer) 95 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv") 96 | length(transcripts_to_delete) 97 | library(ReferenceEnhancer) 98 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv") 99 | getwd() 100 | library(ReferenceEnhancer) 101 | genome_annotation <- LoadGtf("test_genes.gtf") 102 | gene_overlaps <- IdentifyOverlappers(genome_annotation) 103 | OverlapResolutions(genome_annotation, gene_overlaps, c("^Gm", "Rik$")) 104 | IsolateIntergenicReads("./input.bam", "./input.bam.bai") 105 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai") 106 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai") 107 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai") 108 | library(ReferenceEnhancer) 109 | genome_annotation <- LoadGtf("test_genes.gtf") 110 | gene_overlaps <- IdentifyOverlappers(genome_annotation) 111 | OverlapResolutions(genome_annotation, gene_overlaps, c("Rik$", "^Gm")) 112 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai") 113 | library(ReferenceEnhancer) 114 | genome_annotation <- LoadGtf("test_genes.gtf") 115 | gene_overlaps <- IdentifyOverlappers(genome_annotation) 116 | OverlapResolutions(genome_annotation, gene_overlaps, c("Rik$", "^Gm")) 117 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai") 118 | GenerateGeneLocationBed(genome_annotation) 119 | GenerateExtensionCandidates() 120 | OptimizedAnnotationAssembler("test_genes.gtf", "test_overlapping_gene_list.csv", "gene_extension_candidates.csv", "test_gene_replacement.csv") 121 | 
GenerateGeneLocationBed(genome_annotation, "/usr/bin/bedops") 122 | GenerateGeneLocationBed(genome_annotation) 123 | library(ReferenceEnhancer) 124 | GenerateExtensionCandidates() 125 | library(ReferenceEnhancer) 126 | library(ReferenceEnhancer) 127 | library(ReferenceEnhancer) 128 | unoptimized_gtf <- "test_genes.gtf" 129 | gene_overlaps <- "overlapping_gene_list.csv" 130 | gene_extension <- "gene_extension_candidates.csv" 131 | gene_replacement <- "test_gene_replacement.csv" 132 | OptimizedAnnotationAssembler("test_genes.gtf", "overlapping_gene_list.csv", " gene_extension_candidates.csv", "rename_genes.csv") 133 | unoptimized_gtf <- "test_genes.gtf" 134 | gene_overlaps <- "test_overlapping_gene_list.csv" 135 | gene_extension <- "./gene_extension_candidates.csv" 136 | gene_replacement <- "test_gene_replacement.csv" 137 | OptimizedAnnotationAssembler(unoptimized_gtf, gene_overlaps, gene_extension, gene_replacement) 138 | library(ReferenceEnhancer) 139 | ?LoadGtf 140 | ?LoadGtf 141 | library(ReferenceEnhancer) 142 | ?LoadGtf 143 | ?LoadGtf 144 | library(ReferenceEnhancer) 145 | ?LoadGtf 146 | library(ReferenceEnhancer) 147 | ?LoadGtf 148 | library(ReferenceEnhancer) 149 | ?LoadGtf 150 | library(ReferenceEnhancer) 151 | ?LoadGtf 152 | library(ReferenceEnhancer) 153 | ?LoadGtf 154 | library(ReferenceEnhancer) 155 | ?LoadGtf 156 | library(ReferenceEnhancer) 157 | ?LoadGtf 158 | genome_annotation <- LoadGtf("test_genes.gtf") 159 | ?IdentifyOverlappers 160 | gene_overlaps <- IdentifyOverlappers(genome_annotation) 161 | ?OverlapResolutions 162 | library(ReferenceEnhancer) 163 | ?OverlapResolutions 164 | library(ReferenceEnhancer) 165 | ?OverlapResolutions 166 | genome_annotation <- LoadGtf("test_genes.gtf") 167 | gene_overlaps <- IdentifyOverlappers(genome_annotation) 168 | OverlapResolutions(genome_annotation, gene_overlaps, c("Rik$", "^Gm")) 169 | ?IsolateIntergenicReads 170 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai", barcode_length = 26) 171 | 
?GenerateGeneLocationBed 172 | ?GenerateExtensionCandidates 173 | ?OptimizedAnnotationAssembler 174 | library(ReferenceEnhancer) 175 | ?LoadGtf 176 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 177 | ?IdentifyOverlappers 178 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 179 | ?OverlapResolutions 180 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 181 | ?IsolateIntergenicReads 182 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 183 | ?GenerateGeneLocationBed 184 | GenerateGeneLocationBed(genome_annotation = genome_annotation) 185 | ?GenerateExtensionCandidates 186 | old_path <- Sys.getenv("PATH") 187 | Sys.setenv(PATH = paste(old_path, "/Applications/bedtools2/bin", sep = ":")) 188 | GenerateExtensionCandidates() 189 | Sys.getenv("PATH") 190 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 191 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 192 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 193 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 194 | GenerateGeneLocationBed(genome_annotation = genome_annotation) 195 | GenerateExtensionCandidates() 196 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 197 | library(ReferenceEnhancer) 198 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 199 | library(ReferenceEnhancer) 200 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 201 | library(ReferenceEnhancer) 202 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 203 | library(ReferenceEnhancer) 204 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 205 | library(ReferenceEnhancer) 206 
| GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 207 | library(ReferenceEnhancer) 208 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 209 | library(ReferenceEnhancer) 210 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 211 | library(ReferenceEnhancer) 212 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 213 | library(ReferenceEnhancer) 214 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 215 | library(ReferenceEnhancer) 216 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 217 | library(ReferenceEnhancer) 218 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 219 | library(ReferenceEnhancer) 220 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 221 | genome_annotation 222 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 223 | getwd() 224 | setwd("/System/Applications") 225 | library(ReferenceEnhancer) 226 | getwd() 227 | library(ReferenceEnhancer) 228 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 229 | LoadGtf("test_genes.gtf") 230 | ?LoadGtf 231 | library(ReferenceEnhancer) 232 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 233 | ?LoadGtf 234 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 235 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 236 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 237 | GenerateGeneLocationBed(genome_annotation = genome_annotation) 238 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 239 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 240 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 241 | OverlapResolutions(genome_annotation = 
genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 242 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 243 | library(ReferenceEnhancer) 244 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 245 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai") 246 | library(ReferenceEnhancer) 247 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai") 248 | library(ReferenceEnhancer) 249 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai") 250 | library(ReferenceEnhancer) 251 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai") 252 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 253 | library(ReferenceEnhancer) 254 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 255 | GenerateGeneLocationBed(genome_annotation = genome_annotation) 256 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 257 | old_path <- Sys.getenv("PATH") 258 | Sys.getenv("PATH") 259 | library(ReferenceEnhancer) 260 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 261 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 262 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 263 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 264 | GenerateGeneLocationBed(genome_annotation = genome_annotation) 265 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 266 | old_path <- Sys.getenv("PATH") 267 | Sys.setenv(PATH = 
paste(old_path, "/Applications/bedtools2/bin", sep = ":")) 268 | GenerateExtensionCandidates() 269 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 270 | library(ReferenceEnhancer) 271 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 272 | GenerateExtensionCandidates() 273 | Sys.getenv("PATH") 274 | ?GenerateExtensionCandidates 275 | library(ReferenceEnhancer) 276 | ?GenerateExtensionCandidates 277 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 278 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 279 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 280 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 281 | GenerateGeneLocationBed(genome_annotation = genome_annotation) 282 | Sys.getenv("PATH") 283 | GenerateExtensionCandidates() 284 | library(ReferenceEnhancer) 285 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 286 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 287 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 288 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 289 | GenerateGeneLocationBed(genome_annotation = genome_annotation) 290 | Sys.getenv("PATH") 291 | GenerateExtensionCandidates() 292 | Sys.getenv("PATH") 293 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 294 | Sys.getenv("PATH") 295 | Sys.getenv("PATH") 296 | whereis bedtools 297 | library(ReferenceEnhancer) 298 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 299 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 300 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data 
= gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 301 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 302 | GenerateGeneLocationBed(genome_annotation = genome_annotation) 303 | GenerateExtensionCandidates() 304 | Sys.getenv("PATH") 305 | GenerateExtensionCandidates("/opt/homebrew/bin/bedtools") 306 | Sys.getenv("PATH") 307 | system("whereis bedtools") 308 | library(ReferenceEnhancer) 309 | GenerateExtensionCandidates() 310 | GenerateExtensionCandidates() 311 | library(ReferenceEnhancer) 312 | library(ReferenceEnhancer) 313 | GenerateExtensionCandidates() 314 | library(ReferenceEnhancer) 315 | GenerateExtensionCandidates() 316 | library(ReferenceEnhancer) 317 | GenerateExtensionCandidates() 318 | library(ReferenceEnhancer) 319 | GenerateExtensionCandidates() 320 | library(ReferenceEnhancer) 321 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 322 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 323 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 324 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 325 | GenerateGeneLocationBed(genome_annotation = genome_annotation) 326 | GenerateExtensionCandidates() 327 | library(ReferenceEnhancer) 328 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 329 | Sys.getenv("PATH") 330 | system("whereis bedtools") 331 | system("whereis bedtools") 332 | system("whereis bedtools") == None 333 | system("whereis bedtools") == [] 334 | if(system("whereis bedtools")){print("YAY")} 335 | if(system("whereis bedtools")){print("YAY")}else{"NO"} 336 | library(ReferenceEnhancer) 337 | GenerateExtensionCandidates() 338 | Sys.getenv("PATH") 339 | GenerateExtensionCandidates() 340 | Sys.getenv("PATH") 341 | system("whereis bedtools") 342 | 
GenerateExtensionCandidates() 343 | Sys.getenv("PATH") 344 | system("whereis bedtools") 345 | system("whereis bedtools") 346 | a=system("whereis bedtools") 347 | class(a) 348 | a 349 | a 350 | a=system(print "homo") 351 | a=system("homo") 352 | a=system(""homo"") 353 | a=system("print homo") 354 | a=system("print 'homo'") 355 | a=system("echo homo") 356 | a 357 | system("echo homo")[1] 358 | a=system("echo homo")[1] 359 | a 360 | class(system("echo homo")) 361 | as.character((system("echo homo")) 362 | ) 363 | row.names((system("echo homo"))) 364 | row.names(system("whereis bedtools")) 365 | a=row.names(system("whereis bedtools")) 366 | dim(a) 367 | class(a) 368 | a 369 | row.names(system("whereis bedtools")) 370 | system("whereis bedtools") 371 | unlist(strsplit(system("whereis bedtools", intern = TRUE),":"))[2] 372 | unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2] 373 | library(ReferenceEnhancer) 374 | GenerateExtensionCandidates() 375 | system("whereis bedtools") 376 | Sys.getenv("PATH") 377 | system("whereis bedtools", intern = TRUE) 378 | if(system("whereis bedtools", intern = TRUE)){print("YAY")} 379 | length(system("whereis bedtools")) 380 | length(system("whereis bedtools", intern = TRUE)) 381 | length(system("whereis bedtools")) 382 | length(system("whereis bedtools", intern = TRUE)) 383 | unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2] 384 | if(unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2]){print("YAY")} 385 | if(is.na(unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2])){print("OHNO")} 386 | if(is.na(unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2])){ 387 | print("Didn't find bedtools. Please install bedtools.") 388 | } 389 | else{ 390 | if(is.na(unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2])){ 391 | print("Didn't find bedtools. 
Please install bedtools.") 392 | } 393 | else{ 394 | library(ReferenceEnhancer) 395 | GenerateExtensionCandidates() 396 | library(ReferenceEnhancer) 397 | GenerateExtensionCandidates() 398 | Sys.getenv("PATH") 399 | GenerateExtensionCandidates() 400 | Sys.getenv("PATH") 401 | system("whereis bedtools") 402 | system("whereis bedtools") 403 | Sys.getenv("PATH") 404 | library(ReferenceEnhancer) 405 | Sys.getenv("PATH") 406 | GenerateExtensionCandidates() 407 | old_path <- Sys.getenv("PATH") 408 | bedtools_loc = "/Users/helen/Downloads/bedtools2/bin" 409 | Sys.setenv(PATH = paste(old_path, bedtools_loc, sep = ":")) 410 | Sys.getenv("PATH") 411 | GenerateExtensionCandidates() 412 | ?GenerateExtensionCandidates 413 | library(ReferenceEnhancer) 414 | GenerateExtensionCandidates() 415 | library(ReferenceEnhancer) 416 | GenerateExtensionCandidates() 417 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin") 418 | library(ReferenceEnhancer) 419 | library(ReferenceEnhancer) 420 | GenerateExtensionCandidates() 421 | library(ReferenceEnhancer) 422 | GenerateExtensionCandidates() 423 | GenerateExtensionCandidates("/Users/helen/Downloads/bedtools2/bin") 424 | ?GenerateGeneLocationBed 425 | system("whereis bedops") 426 | bedops in Sys.getenv("PATH") 427 | system.file(bedops) 428 | a = Sys.getenv("PATH", intern = TRUE) 429 | a = intern = Sys.getenv("PATH") 430 | a = Sys.getenv("PATH") 431 | a 432 | bedops in a 433 | class(a) 434 | grepl( needle, haystack, fixed = TRUE) 435 | grepl(a, "bedops", fixed = TRUE) 436 | a 437 | grepl(a, "bedtools", fixed = TRUE) 438 | grepl(a, "bedtools") 439 | type(a) 440 | class(a) 441 | [1] 442 | length(a) 443 | length(a[1]) 444 | unlist(strsplit(a,sep=":")) 445 | a 446 | dim(a) 447 | ?Sys.getenv 448 | Sys.getenv(c("R_HOME", "R_PAPERSIZE", "R_PRINTCMD", "HOST")) 449 | dim(Sys.getenv(c("R_HOME", "R_PAPERSIZE", "R_PRINTCMD", "HOST"))) 450 | class(Sys.getenv(c("R_HOME", "R_PAPERSIZE", "R_PRINTCMD", "HOST"))) 451 | s <- Sys.getenv() 452 | s 453 
| names(s) 454 | head(s, 12) 455 | a = Sys.getenv("PATH") 456 | names(a) 457 | head(s, 12) 458 | head(s, 1) 459 | s 460 | head(s, 1) 461 | head(s, 1)[1] 462 | head(s, Path) 463 | Sys.getenv("PATH") 464 | Sys.getenv("PATH") 465 | system("whereis bedops") 466 | library(ReferenceEnhancer) 467 | GenerateGeneLocationBed() 468 | ?GenerateGeneLocationBed 469 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 470 | GenerateGeneLocationBed(genome_annotation) 471 | Sys.getenv("PATH") 472 | library(ReferenceEnhancer) 473 | GenerateGeneLocationBed(genome_annotation) 474 | library(ReferenceEnhancer) 475 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 476 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 477 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 478 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 479 | GenerateGeneLocationBed(genome_annotation = genome_annotation, bedops_loc = NULL) 480 | GenerateExtensionCandidates(bedtools_loc = NULL) 481 | Sys.getenv("PATH") 482 | library(ReferenceEnhancer) 483 | GenerateExtensionCandidates(bedtools_loc = NULL) 484 | library(ReferenceEnhancer) 485 | GenerateExtensionCandidates() 486 | GenerateExtensionCandidates(bedtools_loc = NULL) 487 | library(ReferenceEnhancer) 488 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 489 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 490 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 491 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 492 | GenerateGeneLocationBed(genome_annotation = genome_annotation, bedops_loc = NULL) 493 | GenerateGeneLocationBed(genome_annotation 
= genome_annotation) 494 | GenerateExtensionCandidates(bedtools_loc = NULL) 495 | library(ReferenceEnhancer) 496 | library(ReferenceEnhancer) 497 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 498 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 499 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 500 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 501 | GenerateGeneLocationBed(genome_annotation = genome_annotation, bedops_loc = NULL) 502 | GenerateExtensionCandidates(bedtools_loc = NULL) 503 | GenerateExtensionCandidates(bedtools_loc = "/Users/helen/Downloads/bedtools2/bin") 504 | library(ReferenceEnhancer) 505 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf") 506 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation) 507 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm")) 508 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26) 509 | GenerateGeneLocationBed(genome_annotation = genome_annotation, bedops_loc = NULL) 510 | GenerateExtensionCandidates(bedtools_loc = NULL) 511 | OptimizedAnnotationAssembler("test_genes.gtf", "test_overlapping_gene_list.csv", "gene_extension_candidates.csv", "test_gene_replacement.csv") 512 | library(ReferenceEnhancer) 513 | --------------------------------------------------------------------------------