├── .gitignore
├── R
    ├── overlap_classification.R
    ├── .DS_Store
    ├── sample_read.R
    ├── write_csv.R
    ├── return_exons.R
    ├── write_bed.R
    ├── write_gtf.R
    ├── both_pseudo.R
    ├── exon_overlap.R
    ├── pseudo_overlap.R
    ├── PremrnaAnnotationGenerator.R
    ├── LoadGtf.R
    ├── readthrough_or_premature_min.R
    ├── readthrough_or_premature_plus.R
    ├── readthrough_or_premature.R
    ├── IdentifyOverlappers.R
    ├── GenerateGeneLocationBed.R
    ├── GenerateExtensionCandidates.R
    ├── IsolateIntergenicReads.R
    ├── OptimizedAnnotationAssembler.R
    └── OverlapResolutions.R
├── LICENSE
├── .DS_Store
├── inst
    ├── test_gene_replacement.csv
    ├── .DS_Store
    └── extdata
    │   ├── test_gene_replacement.csv
    │   ├── .DS_Store
    │   ├── #test_bam.bam
    │   ├── test_bam.bam
    │   ├── test_index.bam.bai
    │   └── #test_index.bam.bai
├── .Rbuildignore
├── ReferenceEnhancer-Manual2023.pdf
├── NAMESPACE
├── ReferenceEnhancer.Rproj
├── man
    ├── PremrnaAnnotationGenerator.Rd
    ├── LoadGtf.Rd
    ├── IdentifyOverlappers.Rd
    ├── GenerateExtensionCandidates.Rd
    ├── GenerateGeneLocationBed.Rd
    ├── IsolateIntergenicReads.Rd
    ├── OptimizedAnnotationAssembler.Rd
    └── OverlapResolutions.Rd
├── LICENSE.md
├── DESCRIPTION
├── README.Rmd
├── README.Rmd.orig
└── .Rhistory


/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | 


--------------------------------------------------------------------------------
/R/overlap_classification.R:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2022
2 | COPYRIGHT HOLDER: Allan-Hermann Pool
3 | 


--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/.DS_Store


--------------------------------------------------------------------------------
/R/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/R/.DS_Store


--------------------------------------------------------------------------------
/inst/test_gene_replacement.csv:
--------------------------------------------------------------------------------
1 | old_name,new_name
2 | Sox17,Sox17-Sox17a
3 | 
4 | 


--------------------------------------------------------------------------------
/inst/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/inst/.DS_Store


--------------------------------------------------------------------------------
/inst/extdata/test_gene_replacement.csv:
--------------------------------------------------------------------------------
1 | old_name,new_name
2 | Sox17,Sox17-Sox17a
3 | 
4 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^ReferenceEnhancer\.Rproj$
2 | ^\.Rproj\.user$
3 | ^LICENSE\.md$
4 | ^README\.Rmd$
5 | 


--------------------------------------------------------------------------------
/inst/extdata/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/inst/extdata/.DS_Store


--------------------------------------------------------------------------------
/inst/extdata/#test_bam.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/inst/extdata/#test_bam.bam


--------------------------------------------------------------------------------
/inst/extdata/test_bam.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/inst/extdata/test_bam.bam


--------------------------------------------------------------------------------
/inst/extdata/test_index.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/inst/extdata/test_index.bam.bai


--------------------------------------------------------------------------------
/ReferenceEnhancer-Manual2023.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/ReferenceEnhancer-Manual2023.pdf


--------------------------------------------------------------------------------
/inst/extdata/#test_index.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PoolLab/ReferenceEnhancer/HEAD/inst/extdata/#test_index.bam.bai


--------------------------------------------------------------------------------
/R/sample_read.R:
--------------------------------------------------------------------------------
1 | # Read the CSV file at `path` with readr::read_csv() and return its result.
2 | # Example of a path that can be passed in:
3 | #   system.file("extdata", "mouse_sample.csv", package = "ReferenceEnhancer")
4 | sample_read <- function(path) {
5 |   csv_contents <- readr::read_csv(path)
6 |   csv_contents
7 | }
8 | 


--------------------------------------------------------------------------------
/R/write_csv.R:
--------------------------------------------------------------------------------
1 | # Write `output_data` with write.csv() to `file_name`, resolved against the
2 | # current working directory (".").
3 | write_csv <- function(output_data, file_name) {
4 |   target_file <- file.path(".", file_name)
5 |   write.csv(output_data, target_file)
6 | }
7 | 


--------------------------------------------------------------------------------
/R/return_exons.R:
--------------------------------------------------------------------------------
1 | # Keep the rows of `gene_name` whose `type` is "exon" and return their
2 | # `start` and `end` columns as a two-column data frame.
3 | return_exons <- function(gene_name) {
4 |   exon_rows <- subset(gene_name, type == "exon")
5 |   start_col <- exon_rows["start"]
6 |   end_col <- exon_rows["end"]
7 |   data.frame(start_col, end_col)
8 | }
9 | 


--------------------------------------------------------------------------------
/R/write_bed.R:
--------------------------------------------------------------------------------
1 | # Export `output_data` through rtracklayer::export.bed() to `file_name`,
2 | # resolved against the current working directory (".").
3 | write_bed <- function(output_data, file_name) {
4 |   target_file <- file.path(".", file_name)
5 |   rtracklayer::export.bed(output_data, con = target_file)
6 | }
7 | 


--------------------------------------------------------------------------------
/R/write_gtf.R:
--------------------------------------------------------------------------------
1 | # Export `output_data` through rtracklayer::export() with format "gtf".
2 | # `file_name` is handed to export() exactly as given; no directory is
3 | # prepended here.
4 | write_gtf <- function(output_data, file_name) {
5 |   rtracklayer::export(object = output_data, con = file_name, format = "gtf")
6 | }
7 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(GenerateExtensionCandidates)
 4 | export(GenerateGeneLocationBed)
 5 | export(IdentifyOverlappers)
 6 | export(IsolateIntergenicReads)
 7 | export(LoadGtf)
 8 | export(OptimizedAnnotationAssembler)
 9 | export(OverlapResolutions)
10 | export(PremrnaAnnotationGenerator)
11 | 


--------------------------------------------------------------------------------
/R/both_pseudo.R:
--------------------------------------------------------------------------------
 1 | # Report whether `key` and `overlapping` together produce more than one
 2 | # match of `gene_pattern` (matches counted with stringr::str_detect()).
 3 | # Returns FALSE when `gene_pattern` is not supplied.
 4 | both_pseudo <- function(key, overlapping, gene_pattern) {
 5 |   # Guard clause: no pattern supplied, so there is nothing to match against.
 6 |   if (missing(gene_pattern)) {
 7 |     return(FALSE)
 8 |   }
 9 | 
10 |   key_hits <- sum(stringr::str_detect(key, gene_pattern))
11 |   overlapping_hits <- sum(stringr::str_detect(overlapping, gene_pattern))
12 |   total_hits <- key_hits + overlapping_hits
13 |   total_hits > 1
14 | }
15 | 


--------------------------------------------------------------------------------
/ReferenceEnhancer.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | LineEndingConversion: Posix
18 | 
19 | BuildType: Package
20 | PackageUseDevtools: Yes
21 | PackageInstallArgs: --no-multiarch --with-keep.source
22 | PackageRoxygenize: rd,collate,namespace
23 | 


--------------------------------------------------------------------------------
/R/exon_overlap.R:
--------------------------------------------------------------------------------
 1 | # Report whether any exon of gene A shares at least one position with any
 2 | # exon of gene B.
 3 | #
 4 | # gene_A_exons, gene_B_exons: tables (data frame or matrix) with one row per
 5 | #   exon; column 1 holds the start and column 2 the end coordinate.
 6 | # Returns TRUE as soon as an overlapping pair is found, FALSE otherwise, and
 7 | # FALSE when either table has no rows.
 8 | #
 9 | # An exon covers the positions start .. end - 1. The overlap is decided from
10 | # these bounds rather than by expanding every position with seq() and
11 | # intersect(), so long exons are cheap and an exon with start == end does not
12 | # raise "wrong sign in 'by' argument".
13 | # NOTE(review): an exon with end <= start is treated as covering only its
14 | # start position -- confirm this is the intended handling.
15 | exon_overlap <- function(gene_A_exons, gene_B_exons) {
16 |   if (nrow(gene_A_exons) == 0 || nrow(gene_B_exons) == 0) {
17 |     return(FALSE)
18 |   }
19 | 
20 |   a_first <- gene_A_exons[, 1]
21 |   a_last <- pmax(gene_A_exons[, 2] - 1, a_first)
22 |   b_first <- gene_B_exons[, 1]
23 |   b_last <- pmax(gene_B_exons[, 2] - 1, b_first)
24 | 
25 |   for (i in seq_along(a_first)) {
26 |     # Two closed intervals intersect when the larger start does not exceed
27 |     # the smaller end; exon i of A is compared with all exons of B at once.
28 |     if (any(pmax(a_first[i], b_first) <= pmin(a_last[i], b_last))) {
29 |       return(TRUE)
30 |     }
31 |   }
32 |   FALSE
33 | }
34 | 


--------------------------------------------------------------------------------
/R/pseudo_overlap.R:
--------------------------------------------------------------------------------
 1 | # Classify an overlapping gene pair with respect to `gene_pattern`.
 2 | #
 3 | # Returns
 4 | #   "empty"      when `gene_pattern` is not supplied, or when neither `key`
 5 | #                nor `overlapping` matches it;
 6 | #   "exonic"     when at least one name matches but exon_overlap() is not
 7 | #                TRUE for the two exon tables;
 8 | #   key          when exon_overlap() is TRUE and `key` matches the pattern;
 9 | #   overlapping  when exon_overlap() is TRUE and only `overlapping` matches.
10 | pseudo_overlap <- function(key, overlapping, gene_A_exons, gene_B_exons, gene_pattern) {
11 |   if (missing(gene_pattern)) {
12 |     return("empty")
13 |   }
14 | 
15 |   key_matches <- sum(stringr::str_detect(key, gene_pattern)) > 0
16 |   overlapping_matches <- sum(stringr::str_detect(overlapping, gene_pattern)) > 0
17 | 
18 |   # Neither gene name matches the pattern.
19 |   if (!(key_matches | overlapping_matches)) {
20 |     return("empty")
21 |   }
22 | 
23 |   # A name matches, but the exons of the two genes do not overlap.
24 |   if (!(exon_overlap(gene_A_exons, gene_B_exons) == TRUE)) {
25 |     return("exonic")
26 |   }
27 | 
28 |   # Exons overlap: report whichever name matched, preferring `key`.
29 |   if (key_matches) key else overlapping
30 | }
31 | 


--------------------------------------------------------------------------------
/man/PremrnaAnnotationGenerator.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/PremrnaAnnotationGenerator.R
 3 | \name{PremrnaAnnotationGenerator}
 4 | \alias{PremrnaAnnotationGenerator}
 5 | \title{PremrnaAnnotationGenerator}
 6 | \usage{
 7 | PremrnaAnnotationGenerator(genome_annotation)
 8 | }
 9 | \arguments{
10 | \item{genome_annotation}{ENSEMBL/10x Genomics default genome annotation file (.gtf).}
11 | }
12 | \value{
13 | Generates a basic pre-mRNA reference and saves in working directory as premrna.gtf
14 | }
15 | \description{
16 | It supplements original normal gene annotation entries by
17 | traditional pre-mRNA entries where transcripts have been redefined as exons
18 | and map in the --include-introns mode to retrieve most of available intronic reads.
19 | }
20 | \examples{
21 | genome_annotation <- LoadGtf("test_genes.gtf")
22 | PremrnaAnnotationGenerator(genome_annotation)
23 | }
24 | 


--------------------------------------------------------------------------------
/man/LoadGtf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/LoadGtf.R
 3 | \name{LoadGtf}
 4 | \alias{LoadGtf}
 5 | \title{LoadGtf}
 6 | \usage{
 7 | LoadGtf(unoptimized_annotation_path)
 8 | }
 9 | \arguments{
10 | \item{unoptimized_annotation_path}{Path to the unoptimized genome annotation GTF file.}
11 | }
12 | \value{
13 | Resulting object contains the genome annotation entries from the genome annotation GTF file.
14 | }
15 | \description{
16 | Use to import the Ensembl/10x Genomics default genome annotation
17 | or other desired genome annotation file in GTF format for optimization for scRNA-seq
18 | analysis. Note: This file can be downloaded from 10x Genomics provided reference
19 | transcriptome "gene" folder at
20 | "https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest"
21 | or from Ensembl.org if you wish to customize it further.
22 | }
23 | \examples{
24 | LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
25 | }
26 | 


--------------------------------------------------------------------------------
/man/IdentifyOverlappers.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/IdentifyOverlappers.R
 3 | \name{IdentifyOverlappers}
 4 | \alias{IdentifyOverlappers}
 5 | \title{IdentifyOverlappers}
 6 | \usage{
 7 | IdentifyOverlappers(genome_annotation)
 8 | }
 9 | \arguments{
10 | \item{genome_annotation}{Unoptimized genome annotation file in GTF. Could be
11 | obtained from Ensembl, Refseq, 10x Genomics or elsewhere.}
12 | }
13 | \value{
14 | Rank-ordered gene list of same-strand overlapping genes (“overlapping_gene_list.csv”).
15 | }
16 | \description{
17 | Identifies all same-strand overlapping genes based on the unoptimized
18 | genome annotation file in GTF, rank-orders them according to the number of gene
19 | overlaps. Saves the list of overlapping genes in working directory as “overlapping_gene_list.csv”.
20 | }
21 | \examples{
22 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
23 | IdentifyOverlappers(genome_annotation = genome_annotation)
24 | }
25 | 


--------------------------------------------------------------------------------
/man/GenerateExtensionCandidates.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/GenerateExtensionCandidates.R
 3 | \name{GenerateExtensionCandidates}
 4 | \alias{GenerateExtensionCandidates}
 5 | \title{GenerateExtensionCandidates}
 6 | \usage{
 7 | GenerateExtensionCandidates(bedtools_loc = NULL)
 8 | }
 9 | \arguments{
10 | \item{bedtools_loc}{Optional. Location of bedtools in file system.}
11 | }
12 | \value{
13 | Rank ordered list of gene extension candidates saved to working directory
14 | as “gene_extension_candidates.csv”.
15 | }
16 | \description{
17 | Identifies candidate genes for extension with excess 3' intergenic
18 | reads and creates a rank ordered list of genes as a function of 3' intergenic
19 | read mapping within 10kb of known gene end. You can use this as a prioritized
20 | gene list for gene extension to examine in Integrated Genomics Viewer.
21 | 
22 | Note: It runs partially in Bash/Linux terminal. Make sure bedtools is installed
23 | and provide a path in the function if you get an error message.
24 | }
25 | \examples{
26 | GenerateExtensionCandidates(bedtools_loc = NULL)
27 | }
28 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2022 Allan-Hermann Pool
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/R/PremrnaAnnotationGenerator.R:
--------------------------------------------------------------------------------
 1 | #' @title PremrnaAnnotationGenerator
 2 | #'
 3 | #' @description Builds a basic pre-mRNA annotation from the "transcript" entries
 4 | #' of a genome annotation. Every transcript entry is redefined as an exon, so
 5 | #' that mapping in the --include-introns mode can retrieve most of the available
 6 | #' intronic reads.
 7 | #'
 8 | #' @param genome_annotation Genome annotation data frame loaded with LoadGtf()
 9 | #' from an ENSEMBL/10x Genomics default genome annotation file (.gtf).
10 | #'
11 | #' @return Called for its side effect: generates a basic pre-mRNA reference and
12 | #' saves it in the working directory as premrna.gtf.
13 | #' @export
14 | #'
15 | #' @examples
16 | #' genome_annotation <- LoadGtf("test_genes.gtf")
17 | #' PremrnaAnnotationGenerator(genome_annotation)
18 | PremrnaAnnotationGenerator <- function(genome_annotation){
19 | 
20 |   # Extract all "transcript" entries in the genome annotation to a new variable
21 |   premrna_df <- genome_annotation[genome_annotation$type == "transcript", ]
22 |   exon_label <- rep("exon", nrow(premrna_df))
23 | 
24 |   # Relabel the entries as exons. `type` is the column the entries were
25 |   # selected on above; writing only to a separate `feature` column leaves
26 |   # every exported entry typed as "transcript".
27 |   # NOTE(review): rtracklayer's GTF export is understood to take the feature
28 |   # field from `type` -- confirm against the rtracklayer documentation.
29 |   premrna_df$type <- exon_label
30 |   # `feature` is still filled so the exported attributes stay as they were.
31 |   premrna_df$feature <- exon_label
32 | 
33 |   premrna_df <- GenomicRanges::makeGRangesFromDataFrame(premrna_df, keep.extra.columns = TRUE)
34 |   write_gtf(premrna_df, "premrna.gtf")
35 |   print("Pre-mRNA reference has been saved in working directory as premrna.gtf")
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/R/LoadGtf.R:
--------------------------------------------------------------------------------
 1 | #' @title LoadGtf
 2 | #'
 3 | #' @description Use to import the Ensembl/10x Genomics default genome annotation
 4 | #' or other desired genome annotation file in GTF format for optimization for scRNA-seq
 5 | #' analysis. Note: This file can be downloaded from 10x Genomics provided reference
 6 | #' transcriptome "gene" folder at
 7 | #' "https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest"
 8 | #' or from Ensembl.org if you wish to customize it further.
 9 | #'
10 | #' @param unoptimized_annotation_path Path to the unoptimized genome annotation GTF file.
11 | #' The special value "test_genes.gtf" selects the test annotation looked up in the
12 | #' "extdata" folder of the installed ReferenceEnhancer package.
13 | #'
14 | #' @return Data frame containing the genome annotation entries from the GTF file.
15 | #' @export
16 | #'
17 | #' @examples
18 | #' LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
19 | LoadGtf <- function(unoptimized_annotation_path){
20 | 
21 |   # Access test data. identical() yields a single TRUE/FALSE for any input,
22 |   # whereas `==` breaks the if() for NULL, NA or longer vectors.
23 |   if (identical(unoptimized_annotation_path, "test_genes.gtf")) {
24 |     bundled_path <- system.file("extdata", "test_genes.gtf", package = "ReferenceEnhancer")
25 | 
26 |     # system.file() returns "" when nothing is found; fail here with a clear
27 |     # message instead of passing an empty path on to the importer.
28 |     if (!nzchar(bundled_path)) {
29 |       stop("Bundled test annotation 'test_genes.gtf' was not found in the installed ",
30 |            "ReferenceEnhancer package.", call. = FALSE)
31 |     }
32 |     unoptimized_annotation_path <- bundled_path
33 |   }
34 | 
35 |   # Import the original exonic genome annotation file and return it as a data frame
36 |   genome_annotation <- rtracklayer::import(con = unoptimized_annotation_path, format = "gtf")
37 |   as.data.frame(genome_annotation)
38 | }
39 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: ReferenceEnhancer
 2 | Title: Package For Optimizing And Assembling Genome Annotations For 3’ Single-Cell RNA-Sequencing Analysis
 3 | Version: 0.9
 4 | Authors@R: c(
 5 |     person("Helen", "Poldsam", , "helen.poldsam@utsouthwestern.edu", role = c("aut", "cre")),
 6 |     person("Allan-Hermann", "Pool", , "allan-hermann.pool@utsouthwestern.edu", role = c("aut", "cre")))
 7 | Description: ReferenceEnhancer contains a set of tools for optimizing genome annotations for droplet based 3’ single-cell RNA-sequencing (10x Genomics, Dropseq etc.) data analysis. Regular genome annotations and transcriptomic references generated based on them come with several problems causing discarded sequencing data from final gene expression estimates (outlined in detail in https://www.biorxiv.org/content/10.1101/2022.04.26.489449v1). These include read loss stemming from gene overlaps, sequencing reads mapping to 3’ unannotated exons as well as introns. ReferenceEnhancer enables fixing these issues and assembling optimized genome annotations that circumvent these problems and recover the discarded gene expression data.
 8 | URL: https://github.com/PoolLab/ReferenceEnhancer 
 9 | License: Artistic-2.0
10 | Encoding: UTF-8
11 | Roxygen: list(markdown = TRUE)
12 | RoxygenNote: 7.2.3
13 | Imports: 
14 |     gdata,
15 |     GenomicAlignments,
16 |     GenomicRanges,
17 |     IRanges,
18 |     readr,
19 |     Rsamtools,
20 |     rtracklayer,
21 |     stringr
22 | 


--------------------------------------------------------------------------------
/man/GenerateGeneLocationBed.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/GenerateGeneLocationBed.R
 3 | \name{GenerateGeneLocationBed}
 4 | \alias{GenerateGeneLocationBed}
 5 | \title{GenerateGeneLocationBed}
 6 | \usage{
 7 | GenerateGeneLocationBed(genome_annotation, bedops_loc = NULL)
 8 | }
 9 | \arguments{
10 | \item{genome_annotation}{Genome annotation DataFrame loaded with LoadGtf()
11 | function in this package.}
12 | 
13 | \item{bedops_loc}{Optional. Location of BEDOPS in file system.}
14 | }
15 | \value{
16 | Saves “gene_ranges.bed” in working directory.
17 | }
18 | \description{
19 | Makes a bed file with gene boundaries, which is required for
20 | assigning intergenic reads to a specific gene and discovering genes with large
21 | amounts of intergenic reads near its 3’ gene end.
22 | 
23 | Note 1: This step is partially run in Linux Terminal in Bash and requires BEDOPS
24 | (https://bedops.readthedocs.io/en/latest/). Make sure BEDOPS is installed and
25 | provide a path to BEDOPS in the function if you get an error message.
26 | 
27 | Note 2: In Linux terminal, navigate to folder with the genome annotation of interest.
28 | The annotation file should be named "genes.gtf" per 10x Genomics convention.
29 | }
30 | \examples{
31 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
32 | GenerateGeneLocationBed(
33 | genome_annotation = genome_annotation,
34 | bedops_loc = NULL)
35 | }
36 | 


--------------------------------------------------------------------------------
/R/readthrough_or_premature_min.R:
--------------------------------------------------------------------------------
 1 | # Order two overlapping genes into upstream/downstream and classify the pair
 2 | # with readthrough_or_premature().
 3 | #
 4 | # Ordering rule implemented here: the gene whose "gene" row has the larger
 5 | # `end` is upstream; when the ends are equal, gene B is upstream if its
 6 | # `start` is larger, otherwise gene A.
 7 | # NOTE(review): presumably the minus-strand counterpart of
 8 | # readthrough_or_premature_plus() -- confirm against the caller.
 9 | #
10 | # name_A, name_B: gene names passed through to the result.
11 | # gene_A, gene_B: annotation rows of each gene with `type`, `start`, `end`.
12 | # gene_A_exons, gene_B_exons: accepted but not used by this function.
13 | # Returns the value of readthrough_or_premature().
14 | readthrough_or_premature_min <- function(name_A, gene_A, name_B, gene_B, gene_A_exons, gene_B_exons){
15 |   span_A <- gene_A[gene_A$type == "gene", ]
16 |   start_A <- span_A[, "start"]
17 |   end_A <- span_A[, "end"]
18 |   span_B <- gene_B[gene_B$type == "gene", ]
19 |   start_B <- span_B[, "start"]
20 |   end_B <- span_B[, "end"]
21 | 
22 |   # Ends decide first; starts only break a tie between equal ends.
23 |   b_is_upstream <- if (end_B > end_A) {
24 |     TRUE
25 |   } else if (end_A > end_B) {
26 |     FALSE
27 |   } else {
28 |     start_B > start_A
29 |   }
30 | 
31 |   if (b_is_upstream) {
32 |     upstream_name <- name_B
33 |     upstream <- gene_B
34 |     downstream_name <- name_A
35 |     downstream <- gene_A
36 |   } else {
37 |     upstream_name <- name_A
38 |     upstream <- gene_A
39 |     downstream_name <- name_B
40 |     downstream <- gene_B
41 |   }
42 | 
43 |   # Transcript starts and ends of each gene, packed as list(starts, ends).
44 |   upstream_rows <- upstream[upstream$type == "transcript", ]
45 |   downstream_rows <- downstream[downstream$type == "transcript", ]
46 |   upstream_trx <- list(upstream_rows[, "start"], upstream_rows[, "end"])
47 |   downstream_trx <- list(downstream_rows[, "start"], downstream_rows[, "end"])
48 | 
49 |   readthrough_or_premature(upstream_name, downstream_name, upstream_trx, downstream_trx)
50 | }
51 | 


--------------------------------------------------------------------------------
/R/readthrough_or_premature_plus.R:
--------------------------------------------------------------------------------
 1 | # Order two overlapping genes into upstream/downstream and classify the pair
 2 | # with readthrough_or_premature().
 3 | #
 4 | # Ordering rule implemented here: the gene whose "gene" row has the smaller
 5 | # `start` is upstream; when the starts are equal, gene A is upstream if its
 6 | # `end` is smaller, otherwise gene B.
 7 | # NOTE(review): presumably the plus-strand counterpart of
 8 | # readthrough_or_premature_min() -- confirm against the caller.
 9 | #
10 | # name_A, name_B: gene names passed through to the result.
11 | # gene_A, gene_B: annotation rows of each gene with `type`, `start`, `end`.
12 | # gene_A_exons, gene_B_exons: accepted but not used by this function.
13 | # Returns the value of readthrough_or_premature().
14 | readthrough_or_premature_plus <- function(name_A, gene_A, name_B, gene_B, gene_A_exons, gene_B_exons){
15 |   span_A <- gene_A[gene_A$type == "gene", ]
16 |   start_A <- span_A[, "start"]
17 |   end_A <- span_A[, "end"]
18 |   span_B <- gene_B[gene_B$type == "gene", ]
19 |   start_B <- span_B[, "start"]
20 |   end_B <- span_B[, "end"]
21 | 
22 |   # Starts decide first; ends only break a tie between equal starts.
23 |   a_is_upstream <- if (start_A < start_B) {
24 |     TRUE
25 |   } else if (start_B < start_A) {
26 |     FALSE
27 |   } else {
28 |     end_A < end_B
29 |   }
30 | 
31 |   if (a_is_upstream) {
32 |     upstream_name <- name_A
33 |     upstream <- gene_A
34 |     downstream_name <- name_B
35 |     downstream <- gene_B
36 |   } else {
37 |     upstream_name <- name_B
38 |     upstream <- gene_B
39 |     downstream_name <- name_A
40 |     downstream <- gene_A
41 |   }
42 | 
43 |   # Transcript starts and ends of each gene, packed as list(starts, ends).
44 |   upstream_rows <- upstream[upstream$type == "transcript", ]
45 |   downstream_rows <- downstream[downstream$type == "transcript", ]
46 |   upstream_trx <- list(upstream_rows[, "start"], upstream_rows[, "end"])
47 |   downstream_trx <- list(downstream_rows[, "start"], downstream_rows[, "end"])
48 | 
49 |   readthrough_or_premature(upstream_name, downstream_name, upstream_trx, downstream_trx)
50 | }
51 | 


--------------------------------------------------------------------------------
/R/readthrough_or_premature.R:
--------------------------------------------------------------------------------
 1 | # Classify a pair of overlapping genes as "readthrough", "premature" or
 2 | # "manual" from the way their transcripts overlap.
 3 | #
 4 | # upstream_name, downstream_name: gene names, returned unchanged.
 5 | # upstream_trx, downstream_trx: two-element lists; element 1 holds the
 6 | #   transcript starts and element 2 the transcript ends of the gene.
 7 | # Returns list(upstream_name, downstream_name, classification).
 8 | #
 9 | # A transcript covers the positions start .. end - 1. Overlaps are decided
10 | # from these bounds rather than by expanding every position with seq() and
11 | # intersect(), so long transcripts cost no more than short ones and a gene
12 | # without transcripts does not raise an error.
13 | # NOTE(review): a transcript with end <= start is treated as covering only
14 | # its start position -- confirm this is the intended handling.
15 | readthrough_or_premature <- function(upstream_name, downstream_name, upstream_trx, downstream_trx){
16 |   u_first <- upstream_trx[[1]]
17 |   u_last <- pmax(upstream_trx[[2]] - 1, u_first)
18 |   d_first <- downstream_trx[[1]]
19 |   d_last <- pmax(downstream_trx[[2]] - 1, d_first)
20 |   n_up <- length(u_first)
21 |   n_down <- length(d_first)
22 | 
23 |   # overlaps[i, j] is TRUE when upstream transcript i and downstream
24 |   # transcript j share at least one position.
25 |   overlaps <- outer(
26 |     seq_len(n_up), seq_len(n_down),
27 |     function(i, j) pmax(u_first[i], d_first[j]) <= pmin(u_last[i], d_last[j])
28 |   )
29 | 
30 |   # Largest number of partner transcripts overlapped by a single transcript
31 |   # of each gene; max(0, ...) keeps the value 0 when a gene has none.
32 |   max_u <- max(0, rowSums(overlaps))
33 |   max_d <- max(0, colSums(overlaps))
34 | 
35 |   # One gene has exactly one transcript and the other has several. This must
36 |   # use the transcript counts: length(upstream_trx) is the length of the
37 |   # two-element start/end list, i.e. always 2, so a test on it never fires.
38 |   one_vs_many <- min(n_up, n_down) == 1 && max(n_up, n_down) != 1
39 | 
40 |   if (max_u > max_d) {
41 |     classification <- "readthrough"
42 |   } else if (max_d > max_u) {
43 |     classification <- "premature"
44 |   } else if (one_vs_many) {
45 |     classification <- "manual"
46 |   } else if (max_u == 1 && max_d == 1) {
47 |     classification <- "manual"
48 |   } else if (n_up > n_down) {
49 |     # Equal overlap counts: the gene with more transcripts decides.
50 |     classification <- "readthrough"
51 |   } else {
52 |     classification <- "premature"
53 |   }
54 | 
55 |   list(upstream_name, downstream_name, classification)
56 | }
57 | 


--------------------------------------------------------------------------------
/man/IsolateIntergenicReads.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/IsolateIntergenicReads.R
 3 | \name{IsolateIntergenicReads}
 4 | \alias{IsolateIntergenicReads}
 5 | \title{IsolateIntergenicReads}
 6 | \usage{
 7 | IsolateIntergenicReads(bam_file_name, index_file_name, barcode_length = NULL)
 8 | }
 9 | \arguments{
10 | \item{bam_file_name}{Path to Cell Ranger generated bam file (run Cell Ranger
11 | count pipeline on sequencing data of interest and aligning it to the unoptimized
12 | transcriptomic reference).}
13 | 
14 | \item{index_file_name}{Path to Cell Ranger generated bam.bai file (run Cell Ranger
15 | count pipeline on sequencing data of interest and aligning it to the unoptimized
16 | transcriptomic reference).}
17 | 
18 | \item{barcode_length}{Optional. Specifies the length of barcode needed. If not specified, defaults to 26.}
19 | }
20 | \value{
21 | Saves extracted intergenic reads as a separate file (“intergenic_reads.bed”)
22 | }
23 | \description{
24 | Intergenic reads are extracted from Cell Ranger aligned bam file.
25 | Use a scRNA-seq dataset of interest that has been aligned to the unoptimized
26 | genome reference with the Cell Ranger count pipeline. Intergenic reads can be
27 | identified by two features: their read identity tag RE = "I" (for intergenic)
28 | OR their RE=E (for exonic) with AN = <some gene>. The latter reads are in fact
29 | intergenic reads since Cell Ranger wrongly classifies reads mapping antisense
30 | to an exon as exonic (i.e. RE="E"). The false exonic reads can be recognized
31 | and captured as proper intergenic reads by extracting two kinds of reads
32 | (RE=I and RE=E & AN=<something else than NA). Also, removing duplicates command
33 | in GenomicAlignments package does not work for intergenic (nor for intronic) reads.
34 | Duplicate and corrupt read removal has to be done manually (i.e. make sure cellular
35 | and molecular barcodes have specified lengths and duplicate barcodes removed).
36 | 
37 | Note that bam files can often be many tens of gigabytes and thus this step is
38 | highly memory intensive.
39 | }
40 | \examples{
41 | IsolateIntergenicReads(
42 | bam_file_name = "test_bam.bam",
43 | index_file_name = "test_index.bam.bai")
44 | }
45 | 


--------------------------------------------------------------------------------
/R/IdentifyOverlappers.R:
--------------------------------------------------------------------------------
 1 | #' @title IdentifyOverlappers
 2 | #'
 3 | #' @description Identifies all same-strand overlapping genes based on the unoptimized
 4 | #' genome annotation and rank-orders them according to the number of gene
 5 | #' overlaps. Saves the list of overlapping genes in the working directory as
 6 | #' “overlapping_gene_list.csv”.
 7 | #'
 8 | #' @param genome_annotation Unoptimized genome annotation (GTF) loaded as a data
 9 | #' frame, e.g. with LoadGtf(). The annotation itself could be obtained from
10 | #' Ensembl, Refseq, 10x Genomics or elsewhere. The "type" and "gene_name"
11 | #' columns are used, and only the first 13 columns are retained.
12 | #'
13 | #' @return Data frame of overlapping genes, rank-ordered by the number of gene
14 | #' overlaps, with the columns gene, number_of_gene_overlaps and overlapping_genes
15 | #' plus the empty curation columns automatic_classification,
16 | #' final_classification, transcripts_for_deletion and comments. The same table
17 | #' is written to “overlapping_gene_list.csv”.
18 | #' @export
19 | #'
20 | #' @examples
21 | #' genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
22 | #' IdentifyOverlappers(genome_annotation = genome_annotation)
23 | IdentifyOverlappers <- function(genome_annotation){
24 | 
25 |   # Extract all "gene" entries in the genome annotation to a new variable
26 |   genes_df <- genome_annotation[genome_annotation$type == "gene", 1:13]
27 |   row.names(genes_df) <- seq_len(nrow(genes_df)) # seq_len() avoids the 1:0 trap when there are no gene entries
28 |   gene_names <- genes_df$gene_name
29 |   genes_gr <- GenomicRanges::makeGRangesFromDataFrame(genes_df, keep.extra.columns = TRUE) # convert into granges object
30 | 
31 |   n_genes <- length(gene_names)
32 |   overlapper <- rep(FALSE, n_genes)
33 |   number_of_overlaps <- rep(0L, n_genes)
34 |   overlapping_genes <- rep("", n_genes)
35 | 
36 |   for (i in seq_along(gene_names)) {
37 |     # Computed once per gene: TRUE for every gene range that overlaps gene i.
38 |     # Gene i always overlaps itself, hence the "- 1" below.
39 |     hits <- GenomicRanges::countOverlaps(genes_gr, genes_gr[i]) > 0
40 |     n_hits <- sum(hits)
41 |     if (n_hits > 1) {
42 |       overlapper[i] <- TRUE
43 |       number_of_overlaps[i] <- n_hits - 1L
44 |       conflict_genes <- setdiff(gene_names[hits], gene_names[i])
45 |       overlapping_genes[i] <- paste(conflict_genes, collapse = ', ')
46 |     }
47 |   }
48 | 
49 |   # Build the table directly so the overlap counts stay integer (no detour through a character matrix)
50 |   overlapping_gene_list <- data.frame(
51 |     gene = gene_names,
52 |     number_of_gene_overlaps = number_of_overlaps,
53 |     overlapping_genes = overlapping_genes,
54 |     stringsAsFactors = FALSE
55 |   )[overlapper, ]
56 | 
57 |   # Rank order genes by the number of gene overlaps
58 |   o <- order(overlapping_gene_list$number_of_gene_overlaps, decreasing = TRUE)
59 |   overlapping_gene_list <- overlapping_gene_list[o, ]
60 | 
61 |   if (nrow(overlapping_gene_list) > 0) {
62 |     row.names(overlapping_gene_list) <- seq_len(nrow(overlapping_gene_list))
63 |   }
64 | 
65 |   # Empty columns to be filled in during (semi-)manual curation.
66 |   # rep() keeps the assignment valid when no overlapping genes were found (0 rows).
67 |   empty_column <- rep("", nrow(overlapping_gene_list))
68 |   overlapping_gene_list[["automatic_classification"]] <- empty_column
69 |   overlapping_gene_list[["final_classification"]] <- empty_column
70 |   overlapping_gene_list[["transcripts_for_deletion"]] <- empty_column
71 |   overlapping_gene_list[["comments"]] <- empty_column
72 | 
73 |   write_csv(overlapping_gene_list, 'overlapping_gene_list.csv')
74 |   print("A list of overlapping genes has been saved in your working directory (overlapping_gene_list.csv) for manual curation.")
75 |   return(overlapping_gene_list)
76 | 
77 | }
78 | 


--------------------------------------------------------------------------------
/R/GenerateGeneLocationBed.R:
--------------------------------------------------------------------------------
 1 | #' @title GenerateGeneLocationBed
 2 | #'
 3 | #' @description Makes a bed file with gene boundaries, which is required for
 4 | #' assigning intergenic reads to a specific gene and discovering genes with large
 5 | #' amounts of intergenic reads near their 3’ gene end.
 6 | #'
 7 | #' Note 1: This step is partially run in Linux Terminal in Bash and requires BEDOPS
 8 | #' (https://bedops.readthedocs.io/en/latest/). Make sure BEDOPS is installed and
 9 | #' provide a path to BEDOPS in the function if you get an error message.
10 | #'
11 | #' Note 2: In Linux terminal, navigate to folder with the genome annotation of interest.
12 | #' The annotation file should be named "genes.gtf" per 10x Genomics convention.
13 | #'
14 | #' Note 3: The intermediate files "gene_ranges.gtf" and "gene_ranges1.gtf" are
15 | #' written to the working directory and removed again once the bed file is made.
16 | #'
17 | #' @param genome_annotation Genome annotation DataFrame loaded with LoadGtf()
18 | #' function in this package.
19 | #' @param bedops_loc Optional. Location of BEDOPS in file system. When given, it
20 | #' is appended to the PATH environment variable of the R session so that
21 | #' gtf2bed can be found.
22 | #'
23 | #' @return Saves “gene_ranges.bed” in working directory.
24 | #' @export
25 | #'
26 | #' @examples
27 | #' genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
28 | #' GenerateGeneLocationBed(
29 | #' genome_annotation = genome_annotation,
30 | #' bedops_loc = NULL)
31 | GenerateGeneLocationBed <- function(genome_annotation, bedops_loc = NULL){
32 |   # Extract all "gene" entries in the genome annotation and export them as a temporary GTF file
33 |   gene_entries <- genome_annotation[genome_annotation$type == "gene", ]
34 |   gene_entries <- GenomicRanges::makeGRangesFromDataFrame(gene_entries, keep.extra.columns = TRUE)
35 |   rtracklayer::export(gene_entries, "gene_ranges.gtf", format = "gtf")
36 | 
37 |   ## Append an empty transcript_id attribute to every line lacking one, in preparation for the gtf2bed conversion below (through terminal)
38 |   system('awk \'{ if ($0 ~ "transcript_id") print $0; else print $0" transcript_id \"\";"; }\' gene_ranges.gtf > gene_ranges1.gtf')
39 | 
40 |   ## Make sure bedops can be found
41 |   if (is.null(bedops_loc)) {
42 |     # No location given: only report when bedops cannot be located.
43 |     if (is.na(unlist(strsplit(system("whereis bedops", intern = TRUE), ": "))[2])) {
44 |       print("Didn't find bedops. Please install bedops or provide a path to bedops.")
45 |     }
46 |   } else {
47 |     # A location was given: add it to PATH. Previously this branch was attached to
48 |     # the inner check, so a user-supplied location was silently ignored.
49 |     old_path <- Sys.getenv("PATH")
50 |     Sys.setenv(PATH = paste(old_path, bedops_loc, sep = ":"))
51 |   }
52 | 
53 |   system('gtf2bed < gene_ranges1.gtf > gene_ranges.bed') # Creates a bed file with gene boundaries
54 | 
55 |   ## The following code in R replaces final column with gene name. Make sure you navigate to same folder in R.
56 | 
57 |   gene_ranges <- read.table("gene_ranges.bed", sep = "\t")
58 | 
59 |   if (nrow(gene_ranges) > 0) {
60 |     # Column 10 holds the attribute string; keep only what follows "gene_name" up to the next ";"
61 |     attributes_column <- as.character(gene_ranges[, 10])
62 |     gene_ranges[, 10] <- stringr::str_match(attributes_column, "gene_name\\s*(.*?)\\s*;")[, 2]
63 |   }
64 | 
65 |   ## Remove the intermediate GTF files
66 |   file.remove("./gene_ranges.gtf")
67 |   file.remove("./gene_ranges1.gtf")
68 | 
69 |   ## Save outcome
70 |   write.table(gene_ranges, "gene_ranges.bed", sep="\t",row.names=FALSE, col.names=FALSE, quote = FALSE)
71 |   print("Gene ranges file (gene_ranges.bed) has been saved in working directory.")
72 | 
73 | }
74 | 


--------------------------------------------------------------------------------
/R/GenerateExtensionCandidates.R:
--------------------------------------------------------------------------------
 1 | #' @title GenerateExtensionCandidates
 2 | #'
 3 | #' @description Identifies candidate genes for extension with excess 3' intergenic
 4 | #' reads and creates a rank ordered list of genes as a function of 3' intergenic
 5 | #' read mapping within 10kb of known gene end. You can use this as a prioritized
 6 | #' gene list for gene extension to examine in the Integrative Genomics Viewer (IGV).
 7 | #'
 8 | #' Reads the files “gene_ranges.bed” and “intergenic_reads.bed” from the working
 9 | #' directory and leaves the intermediate files “gene_ranges1.bed”,
10 | #' “intergenic_reads1.bed” and “results.txt” there. Also plots a histogram of
11 | #' the retained read distances.
12 | #'
13 | #' Note: It runs partially in Bash/Linux terminal. Make sure bedtools is installed
14 | #' and provide a path in the function if you get an error message.
15 | #'
16 | #' @param bedtools_loc Optional. Location of bedtools in file system. It is
17 | #' appended to the PATH environment variable of the R session.
18 | #'
19 | #' @return Rank ordered list of gene extension candidates saved to working directory
20 | #' as “gene_extension_candidates.csv”.
21 | #' @export
22 | #'
23 | #' @examples
24 | #' GenerateExtensionCandidates(bedtools_loc = NULL)
25 | GenerateExtensionCandidates <- function(bedtools_loc = NULL){
26 | 
27 |   ## In bash/linux terminal: sort both bed files by chromosome and start coordinate
28 | 
29 |   system("sort -k 1,1 -k2,2n gene_ranges.bed > gene_ranges1.bed")
30 |   system("sort -k 1,1 -k2,2n intergenic_reads.bed > intergenic_reads1.bed")
31 | 
32 |   # Checks and adds bedtools to path
33 |   if (is.null(bedtools_loc)) {
34 |     located <- unlist(strsplit(system("whereis bedtools", intern = TRUE), ": "))[2]
35 |     if (length(located) == 0 || is.na(located)) {
36 |       print("Didn't find bedtools. Please install bedtools or provide a path to bedtools.")
37 |     } else {
38 |       # whereis may list several space separated paths (binary, man page, ...).
39 |       # PATH needs the directory holding the binary, i.e. the first one listed.
40 |       bedtools_loc <- dirname(unlist(strsplit(located, " "))[1])
41 |     }
42 |   }
43 |   if (!is.null(bedtools_loc)) {
44 |     old_path <- Sys.getenv("PATH")
45 |     Sys.setenv(PATH = paste(old_path, bedtools_loc, sep = ":"))
46 |   }
47 | 
48 | 
49 |   system("bedtools closest -a intergenic_reads1.bed -b gene_ranges1.bed -s -D a -fu > results.txt") # resulting file contains sequencing reads with distance data from closest 3' gene identity and end
50 | 
51 |   ## In R: Save a rank ordered list of genes with highest-to-lowest number of intergenic reads within 10kb of its known gene end.
52 | 
53 |   summary_data <- read.table("results.txt", sep = "\t")
54 | 
55 |   # V23 is used as the read-to-gene distance and V22 as the gene name
56 |   # NOTE(review): presumably 12 read columns + 10 gene columns + distance; confirm against the bed files.
57 |   # Retain only sequencing reads within 10kb of known gene ends. Change to more or less stringent as desired.
58 |   within_10kb <- summary_data$V23 > -10000 & summary_data$V23 < 0
59 |   summary_data <- summary_data[within_10kb, ]
60 | 
61 |   hist(summary_data$V23) # plot histogram of intergenic sequencing reads as a function of distance from 3' gene ends.
62 | 
63 |   reads_per_gene <- table(summary_data$V22) # Summarizes # of intergenic reads within 10kb of known gene ends for each gene.
64 |   reads_per_gene <- reads_per_gene[order(reads_per_gene, decreasing = TRUE)] # Rank order the gene list
65 |   reads_per_gene <- reads_per_gene[reads_per_gene > 10] # Threshold gene list: keep genes with more than 10 intergenic reads.
66 |   summary_data_genes <- data.frame(reads_per_gene)
67 | 
68 |   # Empty columns for the manually curated new gene boundaries.
69 |   # rep() keeps the assignment valid when no gene passes the threshold (0 rows).
70 |   summary_data_genes[["update_start"]] <- rep("", nrow(summary_data_genes))
71 |   summary_data_genes[["update_end"]] <- rep("", nrow(summary_data_genes))
72 | 
73 | 
74 |   write_csv(summary_data_genes, "gene_extension_candidates.csv")
75 |   print("A rank ordered list of gene extension candidates has been saved to working directory as gene_extension_candidates.csv")
76 | 
77 | }
78 | 


--------------------------------------------------------------------------------
/man/OptimizedAnnotationAssembler.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/OptimizedAnnotationAssembler.R
 3 | \name{OptimizedAnnotationAssembler}
 4 | \alias{OptimizedAnnotationAssembler}
 5 | \title{OptimizedAnnotationAssembler}
 6 | \usage{
 7 | OptimizedAnnotationAssembler(
 8 |   unoptimized_annotation_path,
 9 |   gene_overlaps,
10 |   gene_extension,
11 |   gene_replacement
12 | )
13 | }
14 | \arguments{
15 | \item{unoptimized_annotation_path}{path to unoptimized genome annotation file in GTF.}
16 | 
17 | \item{gene_overlaps}{overlapping genes list generated with IdentifyOverlappers function.}
18 | 
19 | \item{gene_extension}{list of gene extension candidates generated with GenerateExtensionCandidates function.}
20 | 
21 | \item{gene_replacement}{manually generated list of gene names to be replaced in .csv format. Column names: old_name, new_name. Optional.}
22 | }
23 | \value{
24 | Single-cell RNA-seq optimized genome annotation that can be used to
25 | generate the transcriptomic reference (e.g. with cellranger mkref or
26 | STAR --runMode genomeGenerate pipelines) for mapping single-cell sequencing data.
27 | }
28 | \description{
29 | OptimizedAnnotationAssembler generates the scRNA-seq optimized genome annotation.
30 | The resulting optimized genome annotation can be used to generate the transcriptomic
31 | reference for mapping single-cell sequencing data (e.g. with cellranger mkref
32 | or STAR --runMode genomeGenerate). Note that completing this step is time intensive
33 | and can sometimes take 12-24 hours depending on the length of the annotation
34 | to be optimized.
35 | This function goes through the following steps:
36 | 0. Load data and libraries:
37 | \itemize{
38 | \item genome annotation file to be optimized in GTF.
39 | \item "overlapping_gene_list.csv" file specifying how to resolve gene overlap
40 | derived issues. "Delete" entries in $final_classification field mark genes
41 | for deletion. Transcript names in $transcripts_for_deletion mark specific
42 | transcripts for deletion.
43 | \item "gene_extension_candidates.csv" specifying updated gene boundaries for
44 | incorporating intergenic reads.
45 | \item "rename_genes.csv" specifying gene names to be replaced and new names
46 | (under $old_names and $new_names fields, respectively).
47 | }
48 | \enumerate{
49 | \item Resolve "self-overlapping" gene (duplicate gene_ids) derived issues.
50 | Required for making references compatible with multiome workflows.
51 | \item Creates pre-mRNA genome annotation from input genome annotation. This step
52 | extracts all transcript entries from the genome annotation and defines them
53 | as full length exons with new transcript IDs and corresponding transcripts.
54 | This allows to capture many intronically mapped reads that otherwise get discarded.
55 | \item Gene deletion step: Deletes all annotation entries for genes destined for
56 | deletion (has "Delete" entry in $final_classification field of
57 | "overlapping_gene_list.csv").
58 | \item Transcript deletion step: Deletes all transcripts destined for deletion
59 | (transcript names listed in the "transcripts_for_deletion" column in
60 | "overlapping_gene_list.csv").
61 | \item Gene coordinate adjustment step: Replaces the left most or right most
62 | coordinate of the first exon of a gene in genome annotation if there is a
63 | coordinate in columns $new_left or $new_right in the
64 | "gene_extension_candidates.csv".
65 | \item Adds pre-mRNA reads to all genes not in the gene overlap list.
66 | \item Renames genes to avoid discarding expression data with near perfect terminal
67 | exon overlap.
68 | \item Saves the optimized genome annotation in a new GTF file.
69 | }
70 | }
71 | \examples{
72 | OptimizedAnnotationAssembler(
73 | unoptimized_annotation_path = "test_genes.gtf",
74 | gene_overlaps = "test_overlapping_gene_list.csv",
75 | gene_extension = "./gene_extension_candidates.csv",
76 | gene_replacement = "test_gene_replacement.csv")
77 | }
78 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
 1 | # ReferenceEnhancer
 2 | 
 3 | The goal of ReferenceEnhancer is to generate a scRNA-seq optimized transcriptomic reference.
 4 | 
 5 | Generating a scRNA-seq optimized transcriptomic reference requires optimizing the genome annotation ("xxx.gtf") file that transcriptomic references are based on.
 6 | 
 7 | The following three aspects of genome annotations need to be optimized: A) Resolving gene overlap derived read loss; B) Recovering intergenic reads from 3' un-annotated exons; and C) Recovering intronic reads.
 8 | 
 9 | After optimizing and assembling the genome annotation, you can use "cellranger mkref" pipeline to assemble the optimized transcriptomic reference for mapping sequencing read data and compiling gene-cell matrices with the "cellranger count" (or other) pipeline.
10 | 
11 | ## Installation
12 | 
13 | You can install the development version of ReferenceEnhancer as follows:
14 | 
15 | ``` r
16 | install.packages("devtools") 
17 | require(devtools) 
18 | install_github("PoolLab/ReferenceEnhancer")
19 | ```
20 | 
21 | ## Example
22 | 
23 | # This is a sample workflow of the package:
24 | 
25 | This is the basic workflow for optimizing a genome annotation for single-cell RNA-seq work using ReferenceEnhancer:
26 | 
27 | 1.  Load ReferenceEnhancer and import ENSEMBL/10x Genomics default genome annotation file (GTF).
28 | 
29 | This file can be downloaded from the 10x Genomics provided reference transcriptome "gene" folder at "<https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest>" or from Ensembl.org if you wish to customize it further.
30 | 
31 | For testing, we have provided a sample file.
32 | 
33 | library(ReferenceEnhancer)
34 | 
35 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
36 | 
37 | 2.  Identify all overlapping genes based on the ENSEMBL/10x Genomics default genome annotation file (GTF), rank-order them according to the number of gene overlaps.
38 | 
39 | Prioritize this gene list for manual curation focusing on exonically overlapping genes. The function saves the list of overlapping genes in working directory as overlapping_gene_list.csv.
40 | 
41 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
42 | 
43 | 3.  Generate recommended actions for overlapping genes based on original genome annotation .gtf file and a list of overlapping genes.
44 | 
45 | The function updates overlapping_gene_list.csv file with added recommendations.
46 | 
47 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
48 | 
49 | 4.  Extract intergenic reads from Cell Ranger aligned bam file. The function saves extracted intergenic reads in working directory as intergenic_reads.bed.
50 | 
51 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
52 | 
53 | 5.  Generate gene boundaries in order to assign intergenic reads to a specific gene. The function saves the result in the working directory as gene_ranges.bed.
54 | 
55 | Note: This step runs partially in bash/linux terminal. Before this step, make sure that bedops (<https://bedops.readthedocs.io/en/latest/>) has been installed to your computer.
56 | 
57 | GenerateGeneLocationBed(genome_annotation = genome_annotation, bedops_loc = NULL)
58 | 
59 | 6.  Identify candidate genes for extension with excess 3' intergenic reads and create a rank ordered list of genes as a function of 3' intergenic read mapping within 10kb of known gene end. A rank ordered list of gene extension candidates is saved in working directory as gene_extension_candidates.csv.
60 | 
61 | Note: This step runs partially in bash/linux terminal. Before this step, make sure that bedtools (<https://bedtools.readthedocs.io/en/latest/content/installation.html>) has been installed on your computer and that it has been added to the path in your R environment.
62 | 
63 | GenerateExtensionCandidates(bedtools_loc = NULL)
64 | 
65 | 7.  Create the final optimized annotation file. The function saves the result in working directory as optimized_reference.gtf.
66 | 
67 | OptimizedAnnotationAssembler(unoptimized_annotation_path = "test_genes.gtf", gene_overlaps = "test_overlapping_gene_list.csv", gene_extension = "gene_extension_candidates.csv", gene_replacement = "test_gene_replacement.csv")
68 | 


--------------------------------------------------------------------------------
/README.Rmd.orig:
--------------------------------------------------------------------------------
 1 | # ReferenceEnhancer
 2 | 
 3 | The goal of ReferenceEnhancer is to generate a scRNA-seq optimized transcriptomic reference.
 4 | 
 5 | Generating a scRNA-seq optimized transcriptomic reference requires optimizing the genome annotation ("xxx.gtf") file that transcriptomic references are based on.
 6 | 
 7 | The following three aspects of genome annotations need to be optimized: A) Resolving gene overlap derived read loss; B) Recovering intergenic reads from 3' un-annotated exons; and C) Recovering intronic reads.
 8 | 
 9 | After optimizing and assembling the genome annotation, you can use "cellranger mkref" pipeline to assemble the optimized transcriptomic reference for mapping sequencing read data and compiling gene-cell matrices with the "cellranger count" (or other) pipeline.
10 | 
11 | ## Installation
12 | 
13 | <<<<<<< HEAD
14 | You can install ReferenceEnhancer like so:
15 | =======
16 | You can install the development version of ReferenceEnhancer as follows:
17 | >>>>>>> 67ca0da0bac1c232f772d39eff8f1d99ac69b6d9
18 | 
19 | ``` r
20 | install.packages("devtools")
21 | require(devtools)
22 | install_github("PoolLab/ReferenceEnhancer")
23 | ```
24 | 
25 | ## Example
26 | 
27 | <<<<<<< HEAD
28 | This is a sample workflow of the package:
29 | =======
30 | This is the basic workflow for optimizing a genome annotation for single-cell RNA-seq work using ReferenceEnhancer:
31 | >>>>>>> 67ca0da0bac1c232f772d39eff8f1d99ac69b6d9
32 | 
33 | 1. Load ReferenceEnhancer and import ENSEMBL/10x Genomics default genome annotation file (GTF). 
34 | 
35 | This file can be downloaded from the 10x Genomics provided reference transcriptome "gene" folder at "https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest" or from Ensembl.org if you wish to customize it further.
36 | 
37 | For testing, we have provided a sample file.
38 | ```{r example}
39 | library(ReferenceEnhancer)
40 | genome_annotation <- LoadGtf("test_genes.gtf")
41 | ```
42 | 
43 | 2. Identify all overlapping genes based on the ENSEMBL/10x Genomics default genome annotation file (GTF), rank-order them according to the number of gene overlaps.
44 | 
45 | Prioritize this gene list for manual curation focusing on exonically overlapping genes.
46 | The function saves the list of overlapping genes in working directory as overlapping_gene_list.csv.
47 | ```{r example}
48 | gene_overlaps <- IdentifyOverlappers(genome_annotation)
49 | ```
50 | 
51 | 3. Generate recommended actions for overlapping genes based on original genome annotation .gtf file and a list of overlapping genes.
52 | 
53 | The function updates overlapping_gene_list.csv file with added recommendations.
54 | ```{r example}
55 | OverlapResolutions(genome_annotation, gene_overlaps)
56 | ```
57 | 
58 | 4. Extract intergenic reads from Cell Ranger aligned bam file.
59 | The function saves extracted intergenic reads in working directory as intergenic_reads.bed.
60 | ```{r example}
61 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai")
62 | ```
63 | 
64 | 5. Generate gene boundaries in order to assign intergenic reads to a specific gene. The function saves the result in the working directory as gene_ranges.bed.
65 | 
66 | Note: This step runs partially in bash/linux terminal. Before this step, make sure that bedops (https://bedops.readthedocs.io/en/latest/) has been installed to your computer.
67 | ```{r example}
68 | GenerateGeneLocationBed(genome_annotation)
69 | ```
70 | 
71 | 6. Identify candidate genes for extension with excess 3' intergenic reads and create a rank ordered list of genes as a function of 3' intergenic read mapping within 10kb of known gene end. A rank ordered list of gene extension candidates is saved in working directory as gene_extension_candidates.csv.
72 | 
73 | Note: This step runs partially in bash/linux terminal. Before this step, make sure that bedtools (https://bedtools.readthedocs.io/en/latest/content/installation.html) has been installed on your computer and that it has been added to the path in your R environment.
74 | ```{r example}
75 | GenerateExtensionCandidates()
76 | ```
77 | 
78 | 7. Create the final optimized annotation file. The function saves the result in working directory as optimized_reference.gtf.
79 | ```{r example}
80 | OptimizedAnnotationAssembler("test_genes.gtf", "premrna.gtf", "overlapping_gene_list.csv", "gene_extension_candidates.csv", "test_gene_replacement.csv")
81 | ```
82 | 


--------------------------------------------------------------------------------
/R/IsolateIntergenicReads.R:
--------------------------------------------------------------------------------
 1 | #' @title IsolateIntergenicReads
 2 | #'
 3 | #' @description Intergenic reads are extracted from Cell Ranger aligned bam file.
 4 | #' Use a scRNA-seq dataset of interest that has been aligned to the unoptimized
 5 | #' genome reference with the Cell Ranger count pipeline. Intergenic reads can be
 6 | #' identified by two features: their read identity tag RE = "I" (for intergenic)
 7 | #' OR their RE = "E" (for exonic) with the AN tag set to some gene. The latter
 8 | #' reads are in fact intergenic reads since Cell Ranger wrongly classifies reads
 9 | #' mapping antisense to an exon as exonic (i.e. RE = "E"). The false exonic reads
10 | #' can be recognized and captured as proper intergenic reads by extracting two
11 | #' kinds of reads (RE = "I", and RE = "E" with AN set to anything other than NA).
12 | #' Also, the duplicate removal command in the GenomicAlignments package does not
13 | #' work for intergenic (nor for intronic) reads. Duplicate and corrupt read
14 | #' removal has to be done manually (i.e. make sure cellular and molecular
15 | #' barcodes have specified lengths and duplicate barcodes removed).
16 | #'
17 | #' Note that bam files can often be many tens of gigabytes and thus this step is
18 | #' highly memory intensive.
19 | #'
20 | #' @param bam_file_name Path to Cell Ranger generated bam file (run Cell Ranger
21 | #' count pipeline on sequencing data of interest and aligning it to the unoptimized
22 | #' transcriptomic reference).
23 | #' @param index_file_name Path to Cell Ranger generated bam.bai file (run Cell Ranger
24 | #' count pipeline on sequencing data of interest and aligning it to the unoptimized
25 | #' transcriptomic reference).
26 | #'
27 | #' @param barcode_length Optional. Required combined length of the cell barcode
28 | #' and the molecular barcode of a read. If not specified, defaults to 26.
29 | #'
30 | #' @return Saves extracted intergenic reads as a separate file (“intergenic_reads.bed”)
31 | #' @export
32 | #'
33 | #' @examples
34 | #' IsolateIntergenicReads(
35 | #' bam_file_name = "test_bam.bam",
36 | #' index_file_name = "test_index.bam.bai")
37 | IsolateIntergenicReads <- function(bam_file_name, index_file_name, barcode_length = NULL){
38 | 
39 |   bamfile <- bam_file_name
40 |   indexfile <- index_file_name
41 | 
42 |   # The two test file names are resolved to the example data shipped with the package.
43 |   # isTRUE() + && keep the condition a single TRUE/FALSE even for NA or longer inputs.
44 |   if (isTRUE(bamfile == "test_bam.bam") && isTRUE(indexfile == "test_index.bam.bai")) {
45 |     bamfile <- system.file("extdata", "test_bam.bam", package = "ReferenceEnhancer")
46 |     indexfile <- system.file("extdata", "test_index.bam.bai", package = "ReferenceEnhancer")
47 |   }
48 | 
49 |   # Load non-duplicate primary alignments tagged as intergenic ("I") or exonic ("E")
50 |   read_filter <- Rsamtools::ScanBamParam(
51 |     flag = Rsamtools::scanBamFlag(isDuplicate = FALSE, isSecondaryAlignment = FALSE),
52 |     tag = c("GN", "RE", "CB", "UB", "AN"),
53 |     what = "flag",
54 |     tagFilter = list("RE"=c("I", "E"))
55 |   )
56 |   seq_data <- GenomicAlignments::readGAlignments(bamfile, index=indexfile, param = read_filter)
57 |   seq_data <- data.frame(seq_data)
58 | 
59 |   ## Keep only intergenic reads by removing all true exonic reads (i.e. remove exonic reads that lack antisense gene mapping: AN tag =NA)
60 |   intergenic_reads <- seq_data$RE=="I"
61 |   false_exonic_reads <- !is.na(seq_data$AN)
62 |   seq_data <- seq_data[intergenic_reads | false_exonic_reads, ] # remaining dataframe contains only intergenic reads. Note, that we are assuming that all false exonic reads are intergenic, which slightly overestimates intergenic read count. This is since some will likely also end up being intronic.
63 | 
64 |   ## Remove all duplicate reads and reads with corrupt barcodes (by default: keep reads with 16 nucleotide cellular barcodes and 10 nucleotide molecular barcodes). Note that duplicate removal is required since Cell Ranger does not automatically flag duplicates for intronically and intergenically classified reads.
65 |   seq_data$CB <- stringr::str_sub(seq_data$CB, end=-3) # Remove last two elements of the cell barcode. This is an artifact ("-1") added by Cell Ranger software.
66 |   seq_data$barcodes <- paste0(seq_data$CB, seq_data$UB) # Assemble the cell barcode / molecular barcode list. Each read included in the gene_cell matrix will have a unique index comprised of the two.
67 | 
68 |   if (is.null(barcode_length)) {
69 |     barcode_length <- 26
70 |   }
71 |   intact_barcode <- nchar(seq_data$barcodes) == barcode_length # logical vector for selecting reads with non-corrupt barcodes
72 | 
73 |   seq_data <- seq_data[intact_barcode, ] # exclude all reads that don't have an intact full cellular and molecular barcodes
74 |   seq_data <- seq_data[!duplicated(seq_data$barcodes), ] # exclude all duplicated intergenic reads
75 | 
76 |   ## Save extracted intergenic reads as a separate file
77 |   gr_seq_data <- GenomicRanges::makeGRangesFromDataFrame(seq_data) # coerce to granges object as that makes it possible to save it as a bedfile that bedtools can parse
78 | 
79 |   ga_seq_data <- as(gr_seq_data, "GAlignments") # Coerces GRanges object into a GAlignments object, that can be saved as a bed file. Required for bedtools to link reads to closest 3' gene end.
80 |   m <- rtracklayer::asBED(ga_seq_data) # converts GAlignments object into the bed format
81 |   write_bed(m, "intergenic_reads.bed")
82 |   print("Extracted intergenic reads have been saved to your working directory as intergenic_reads.bed.")
83 | 
84 | }
85 | 


--------------------------------------------------------------------------------
/man/OverlapResolutions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/OverlapResolutions.R
 3 | \name{OverlapResolutions}
 4 | \alias{OverlapResolutions}
 5 | \title{OverlapResolutions}
 6 | \usage{
 7 | OverlapResolutions(genome_annotation, overlap_data, gene_pattern)
 8 | }
 9 | \arguments{
10 | \item{genome_annotation}{Unoptimized genome annotation (e.g. Ensembl/10x Genomics)
11 | default genome annotation GTF file. This should be a dataframe created with the
12 | LoadGtf() function in this package.}
13 | 
14 | \item{overlap_data}{A list of overlapping genes generated by IdentifyOverlappers.}
15 | 
16 | \item{gene_pattern}{The pattern in gene names that is unique for pseudo- or
17 | other low quality or low interest genes. Patterns for recognizing candidate
18 | pseudo- or low quality genes can be defined with regular expressions for matching
19 | gene names with a given pattern. See vignette (regular-expressions) in the stringr
20 | package for details or examples below.}
21 | }
22 | \value{
23 | Generates “overlapping_gene_list.csv” with added recommendations for
24 | resolving gene overlaps in the “automatic_classification” column.
25 | }
26 | \description{
27 | Based on original genome annotation GTF file and a list of
28 | overlapping genes, generates recommended actions for overlapping genes.
29 | This is an optional step that can help with decision making during the manual
30 | curation step.
31 | 
32 | Gene overlaps can be resolved by one of several strategies including
33 | (i) leaving overlapping gene annotations unchanged if their exons don’t directly overlap,
34 | (ii) deleting offending readthrough transcripts from upstream genes,
35 | (iii) deleting offending premature gene transcripts from downstream genes,
36 | (iv) deleting pseudogenes and non-protein coding genes with poor support and
37 | no read mapping that obscure well established protein coding genes or
38 | (v) for extensively overlapping genes deleting one and renaming the other to
39 | capture otherwise discarded reads. As well annotated genomes contain several
40 | thousand same-strand overlapping genes and properly resolving gene overlaps
41 | often requires manual inspection of the locus to determine best course of action,
42 | prioritization of genes for manual curation is often desirable. To this end,
43 | OverlapResolutions function classifies genes to prioritize for direct inspection.
44 | The following algorithm is used to classify genes for appropriate curation:
45 | \enumerate{
46 | \item If gene overlaps with multiple genes:
47 | a.	If gene’s exons overlap with another gene’s exons --> classify for “Manual inspection”
48 | b.	If gene’s exons do not overlap with any other genes’ exons --> classify as “Keep as is”
49 | c.	Assign recommended action for overlapping genes:
50 | i.	If nested gene does not overlap with any other gene --> classify as “Keep as is”
51 | ii.	If nested gene overlaps with more than one gene --> classify for “Manual inspection”
52 | \item If gene overlaps with only one other gene, test whether gene is non-protein
53 | coding/pseudogene (“Gm” and “…Rik” gene models in mice; “AC…” and “AL…” gene models in humans)
54 | a.	If both overlapping genes are non-protein coding/pseudogenes --> classify for
55 | “Manual inspection”
56 | b.	If only one gene in the overlapping gene pair is non-protein coding/pseudogene,
57 | test if genes have overlapping exons:
58 | i.	In case no overlapping exons --> classify both genes as “Keep as is”
59 | ii.	In case exons overlap --> mark non-protein coding/pseudogene for
60 | deletion (“Delete”).
61 | c.	If both genes are well supported genes:
62 | i.	If their exons don’t overlap --> mark both genes as “Keep as is”
63 | ii.	If their exons do overlap, determine the number of opposing gene’s
64 | exonic overlap for each exon of each gene and find the exon with most
65 | overlaps for both upstream and downstream gene to determine appropriate
66 | course of action:
67 | 1.	If downstream gene’s exon has more overlaps than its upstream
68 | counterpart, classify downstream gene as “Premature transcript deletion”
69 | and upstream gene as “Keep as is”
70 | 2.	If upstream gene’s exon has more overlaps than its downstream
71 | counterpart, classify upstream gene as “Readthrough transcript deletion”
72 | and downstream gene as “Keep as is”
73 | 3.	Otherwise classify both for “Manual inspection”
74 | The resulting recommendations can be used in the manual curation step, where
75 | all genes that are not classified in the “Keep as is” category should directly
76 | be scrutinized in the Ensembl genome browser (ensembl.org, with the correct
77 | genome builds) and/or cross-referenced to the respective RefSeq genome annotation
78 | within the Integrative Genomics Viewer (IGV 2.11.9).
79 | }
80 | }
81 | \examples{
82 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
83 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
84 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("^Gm", "Rik$"))
85 | 
86 | # Note: The example treats genes starting with “Gm…” and ending with “…Rik” as
87 | # pseudogenes. Additional patterns for recognizing candidate pseudo- or low-quality
88 | # genes can be defined with regular expressions for matching gene names
89 | # with a given pattern. See vignette(regular-expressions) in the stringr package
90 | # for details.
91 | }
92 | 


--------------------------------------------------------------------------------
/R/OptimizedAnnotationAssembler.R:
--------------------------------------------------------------------------------
  1 | #' @title  OptimizedAnnotationAssembler
  2 | #'
  3 | #' @description
  4 | #' OptimizedAnnotationAssembler generates the scRNA-seq optimized genome annotation.
  5 | #' The resulting optimized genome annotation can be used to generate the transcriptomic
  6 | #' reference for mapping single-cell sequencing data (e.g. with cellranger mkref
  7 | #' or STAR --runMode genomeGenerate). Note that completing this step is time intensive
  8 | #' and can sometimes take 12-24 hours depending on the length of the annotation
  9 | #' to be optimized.
 10 | #' This function goes through the following steps:
 11 | #' 0. Load data and libraries:
 12 | #'    - genome annotation file to be optimized in GTF.
 13 | #'    - "overlapping_gene_list.csv" file specifying how to resolve gene overlap
 14 | #'    derived issues. "Delete" entries in $final_classification field mark genes
 15 | #'    for deletion. Transcript names in $transcripts_for_deletion mark specific
 16 | #'    transcripts for deletion.
 17 | #'    - "gene_extension_candidates.csv" specifying updated gene boundaries for
 18 | #'    incorporating intergenic reads.
 19 | #'    - "rename_genes.csv" specifying gene names to be replaced and new names
 20 | #'    (under $old_name and $new_name fields, respectively).
 21 | #'  1. Resolve "self-overlapping" gene (duplicate gene_ids) derived issues.
 22 | #'  Required for making references compatible with multiome workflows.
 23 | #'  2. Creates pre-mRNA genome annotation from input genome annotation. This step
 24 | #'  extracts all transcript entries from the genome annotation and defines them
 25 | #'  as full length exons with new transcript IDs and corresponding transcripts.
 26 | #'  This makes it possible to capture many intronically mapped reads that would otherwise be discarded.
 27 | #'  3. Gene deletion step: Deletes all annotation entries for genes destined for
 28 | #'  deletion (genes that have a "Delete" entry in the $final_classification field of
 29 | #'  "overlapping_gene_list.csv").
 30 | #'  4. Transcript deletion step: Deletes all transcripts destined for deletion
 31 | #'  (transcript names listed in the "transcripts_for_deletion" column in
 32 | #'  "overlapping_gene_list.csv"; multiple names are separated by ", ").
 33 | #'  5. Gene coordinate adjustment step: Replaces the start coordinate of the
 34 | #'  first exon and/or the end coordinate of the last exon of a gene in the genome
 35 | #'  annotation if there is a coordinate in columns $update_start or $update_end
 36 | #'  of "gene_extension_candidates.csv", respectively.
 37 | #'  6. Adds pre-mRNA reads to all genes not in the gene overlap list.
 38 | #'  7. Renames genes to avoid discarding expression data with near perfect terminal
 39 | #'  exon overlap.
 40 | #'  8. Saves the optimized genome annotation in a new GTF file.
 41 | #'
 42 | #' @param unoptimized_annotation_path path to unoptimized genome annotation file in GTF.
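 43 | #' @param gene_overlaps path to the overlapping genes list in .csv format (generated with the IdentifyOverlappers function) containing the $genes, $final_classification and $transcripts_for_deletion columns.
 44 | #' @param gene_extension path to the list of gene extension candidates in .csv format (generated with the GenerateExtensionCandidates function) containing the $genes, $update_start and $update_end columns.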
 45 | #' @param gene_replacement manually generated list of gene names to be replaced in .csv format. Column names: old_name, new_name. Optional.
 46 | #'
 47 | #' @return Single-cell RNA-seq optimized genome annotation that can be used to
 48 | #' generate the transcriptomic reference (e.g. with cellranger mkref or
 49 | #' STAR --runMode genomeGenerate pipelines) for mapping single-cell sequencing data.
 50 | #' @export
 51 | #'
 52 | #' @examples
 53 | #' OptimizedAnnotationAssembler(
 54 | #' unoptimized_annotation_path = "test_genes.gtf",
 55 | #' gene_overlaps = "test_overlapping_gene_list.csv",
 56 | #' gene_extension = "./gene_extension_candidates.csv",
 57 | #' gene_replacement = "test_gene_replacement.csv")
 58 | OptimizedAnnotationAssembler <- function(unoptimized_annotation_path, gene_overlaps, gene_extension, gene_replacement){
 59 | 
 60 |   if(gene_overlaps == "test_overlapping_gene_list.csv"){
 61 |     gene_overlaps <- system.file("extdata", "test_overlapping_gene_list.csv", package = "ReferenceEnhancer")
 62 |   }
 63 | 
 64 | 
 65 |   unoptimized_df <- LoadGtf(unoptimized_annotation_path)
 66 | 
 67 |   overlap_df = read.csv(gene_overlaps, header=T)
 68 | 
 69 |   new_df = unoptimized_df
 70 | 
 71 | 
 72 |   ####  1. Create premRNA genome annotation from input gtf that defines transcripts as exons ####
 73 |   ###############################################################################################
 74 |   transcripts_df = unoptimized_df[unoptimized_df$type == "transcript",]
 75 |   exons_df = transcripts_df # Create new dataframe to contain premrna exons
 76 |   exons_df$type = rep("exon", nrow(exons_df)) # rename "type" from transcripts to exon
 77 | 
 78 |   premrna_df = gdata::interleave(transcripts_df, exons_df) # interleave transript entries with exon entries
 79 |   premrna_df$transcript_id = gsub("000000", "100000", premrna_df$transcript_id)
 80 |   premrna_df$transcript_id = gsub("000001", "110001", premrna_df$transcript_id)
 81 |   premrna_df$transcript_id = gsub("000002", "110002", premrna_df$transcript_id)
 82 |   premrna_df$transcript_id = gsub("000003", "110003", premrna_df$transcript_id)
 83 |   premrna_df$transcript_id = gsub("000004", "110004", premrna_df$transcript_id)
 84 |   premrna_df$transcript_id = gsub("000005", "110005", premrna_df$transcript_id)
 85 |   premrna_df$transcript_id = gsub("000006", "110006", premrna_df$transcript_id)
 86 |   premrna_df$transcript_id = gsub("000007", "110007", premrna_df$transcript_id)
 87 |   premrna_df$transcript_id = gsub("000008", "110008", premrna_df$transcript_id)
 88 |   premrna_df$transcript_id = gsub("000009", "110009", premrna_df$transcript_id)
 89 | 
 90 |   rm(unoptimized_df)
 91 | 
 92 |   ####  2. Delete select genes ####
 93 |   #################################
 94 |   genes_to_delete = overlap_df$genes[overlap_df$final_classification == "Delete"]
 95 |   new_df = new_df[!new_df$gene_name %in% genes_to_delete,]
 96 | 
 97 |   ####  3. Delete select transcripts ####
 98 |   #######################################
 99 |   transcripts_to_delete = overlap_df$transcripts_for_deletion
100 |   transcripts_to_delete <- transcripts_to_delete[transcripts_to_delete!=""]
101 | 
102 |   transcripts_to_delete_final = transcripts_to_delete[!stringr::str_detect(transcripts_to_delete, ", ")]
103 | 
104 |   if(length(transcripts_to_delete) != 0){
105 |     for (i in 1:length(transcripts_to_delete)){
106 |       a = transcripts_to_delete[i]
107 |       if (stringr::str_detect(a, ", ")){
108 |         split_elements <- unlist(stringr::str_split(a, ", "))
109 |         transcripts_to_delete_final = c(transcripts_to_delete_final, split_elements)
110 |       }
111 |     }
112 |   }
113 | 
114 |   transcripts_to_delete = transcripts_to_delete_final
115 | 
116 |   new_df = new_df[!new_df$transcript_name %in% transcripts_to_delete,]
117 | 
118 |   ####  4. Adjust gene coordinates ####
119 |   #####################################
120 |   boundary_fix = read.csv(gene_extension, header=T)
121 | 
122 |   left_genes = as.data.frame(cbind(boundary_fix$genes[!is.na(boundary_fix$update_start)], boundary_fix$update_start[!is.na(boundary_fix$update_start)]))
123 | 
124 |   colnames(left_genes) = c("genes", "update_start")
125 |   left_genes$update_start = as.numeric(left_genes$update_start)
126 |   right_genes = as.data.frame(cbind(boundary_fix$genes[!is.na(boundary_fix$update_end)], boundary_fix$update_end[!is.na(boundary_fix$update_end)]))
127 |   colnames(right_genes) = c("genes", "update_end")
128 |   right_genes$update_end = as.numeric(right_genes$update_end)
129 | 
130 |   left_exon_difs = rep(0, length(left_genes)) # for troubleshooting
131 |   right_exon_difs = rep(0, length(right_genes))
132 | 
133 |   for (i in 1:dim(left_genes)[1]){
134 |     gene_entries = which(new_df$gene_name == left_genes[i, 1])
135 |     type_entries = new_df$type[gene_entries]
136 |     first_gene_exon = head(gene_entries[type_entries == "exon"], 1)
137 |     new_df[first_gene_exon, 2] = left_genes[i, 2]
138 | 
139 |     if(identical(new_df[first_gene_exon, 3], integer(0)) & identical(new_df[first_gene_exon, 2], integer(0))){
140 | 
141 |     }
142 |     else{
143 |       left_exon_difs[i] = new_df[first_gene_exon, 3] - new_df[first_gene_exon, 2]
144 |     }
145 | 
146 | 
147 |   }
148 | 
149 |   for (i in 1:dim(right_genes)[1]){
150 |     gene_entries = which(new_df$gene_name == right_genes[i, 1])
151 |     type_entries = new_df$type[gene_entries]
152 |     last_gene_exon = tail(gene_entries[type_entries == "exon"], 1)
153 |     new_df[last_gene_exon, 3] = right_genes[i, 2]
154 | 
155 |     if(identical(new_df[last_gene_exon, 3], integer(0)) & identical(new_df[last_gene_exon, 2], integer(0))){
156 | 
157 |     }
158 |     else{
159 |       right_exon_difs[i] = new_df[last_gene_exon, 3] - new_df[last_gene_exon, 2]
160 |     }
161 | 
162 |   }
163 | 
164 |   #### 5. Add pre-mRNA transcripts to genes not in the gene overlap list ####
165 |   ############################################################################
166 |   # Explanation: Cellranger --include-introns mode unfortunately does not pick up on many intronic reads (unclear why despite lengthy correspondence with their support). I can pick those up however if I add the pre-mRNA transcripts to respective genes as exons with new transcript_id values.
167 | 
168 |   ## Genes to modify
169 | 
170 |   #overlap_df$genes # genes to exclude from premrna reference appending
171 | 
172 |   genes_to_append = unique(new_df$gene_name)
173 |   genes_to_append = setdiff(genes_to_append, overlap_df$genes)
174 | 
175 |   ## Give new transcript_ids to everything in the pre-mRNA gtf
176 | 
177 |   for (i in 1:dim(premrna_df)[1]){
178 |     premrna_df$transcript_id[i] = as.character(i)
179 |   }
180 | 
181 |   ## Reformat the gtf dataframes such that we can add premrna entries to the original unoptimized entries and thus compile a hybrid reference for capturing intronic reads
182 | 
183 |   final_colnames = intersect(colnames(new_df), colnames(premrna_df))
184 | 
185 |   new_df = new_df[, final_colnames]
186 |   premrna_df = premrna_df[, final_colnames]
187 | 
188 |   ## Append premrna transcript to the end of the gene
189 | 
190 |   genes_to_append = genes_to_append[1:(length(genes_to_append)-1)]
191 | 
192 |   for (i in genes_to_append){
193 |     insert = premrna_df[premrna_df$gene_name %in% i,]
194 |     first_section = new_df[0:tail(which(new_df$gene_name == i), 1),]
195 |     last_section = new_df[(tail(which(new_df$gene_name == i), 1)+1):dim(new_df)[1],]
196 |     new_df = rbind(first_section, insert, last_section)
197 |   }
198 | 
199 |   #### 6. Rename desired genes ####
200 |   #################################
201 |   # Rename desired genes (example from mouse genome): "Cers1"==>"Cers1_Gdf1" // "Chtf8" ==> "Chtf8_Derpc" // "Insl3" ==> "Insl3_Jak3" // "Pcdhga1" ==> "Pcdhg_all" // "Pcdha1" ==> "Pcdha_all" // "Ugt1a10" ==> "Ugt1a_all" // "4933427D14Rik" ==> "4933427D14Rik_Gm43951" // "Mkks" ==> "Mkks_plus"
202 |   if(missing(gene_replacement)){
203 | 
204 |   }
205 |   else{
206 |     if(gene_replacement == "test_gene_replacement.csv"){
207 |       gene_replacement <- system.file("extdata", "test_gene_replacement.csv", package = "ReferenceEnhancer")
208 |           }
209 | 
210 |     gene_replacement <- read.csv(gene_replacement, header=T)
211 | 
212 |     old_names <- gene_replacement[,'old_name']
213 |     new_names <- gene_replacement[,'new_name']
214 | 
215 | 
216 |     for (i in 1:length(old_names)){
217 |       new_df$transcript_name = stringr::str_replace_all(new_df$transcript_name, old_names[i], new_names[i])
218 |       new_df$gene_name = stringr::str_replace_all(new_df$gene_name, old_names[i], new_names[i])
219 |     }
220 |   }
221 | 
222 |   #### 7. Export object to gtf file ####
223 |   ######################################
224 |   new_gtf = GenomicRanges::makeGRangesFromDataFrame(new_df, keep.extra.columns=TRUE)
225 | 
226 |   write_gtf(new_gtf, "optimized_reference.gtf")
227 |   print("Optimized annotation reference has been saved in working directory as optimized_reference.gtf")
228 | 
229 | }
230 | 


--------------------------------------------------------------------------------
/R/OverlapResolutions.R:
--------------------------------------------------------------------------------
  1 | #' @title  OverlapResolutions
  2 | #'
  3 | #' @description Based on original genome annotation GTF file and a list of
  4 | #' overlapping genes, generates recommended actions for overlapping genes.
  5 | #' This is an optional step that can help with decision making during the manual
  6 | #' curation step.
  7 | #'
  8 | #' Gene overlaps can be resolved by one of several strategies including
  9 | #' (i) leaving overlapping gene annotations unchanged if their exons don’t directly overlap,
 10 | #' (ii) deleting offending readthrough transcripts from upstream genes,
 11 | #' (iii) deleting offending premature gene transcripts from downstream genes,
 12 | #' (iv) deleting pseudogenes and non-protein coding genes with poor support and
 13 | #' no read mapping that obscure well established protein coding genes or
 14 | #' (v) for extensively overlapping genes deleting one and renaming the other to
 15 | #' capture otherwise discarded reads. As well annotated genomes contain several
 16 | #' thousand same-strand overlapping genes and properly resolving gene overlaps
 17 | #' often requires manual inspection of the locus to determine best course of action,
 18 | #' prioritization of genes for manual curation is often desirable. To this end,
 19 | #' OverlapResolutions function classifies genes to prioritize for direct inspection.
 20 | #' The following algorithm is used to classify genes for appropriate curation:
 21 | #' 1.	If gene overlaps with multiple genes:
 22 | #'    a.	If gene’s exons overlap with another gene’s exons --> classify for “Manual inspection”
 23 | #'    b.	If gene’s exons do not overlap with any other genes’ exons --> classify as “Keep as is”
 24 | #'    c.	Assign recommended action for overlapping genes:
 25 | #'        i.	If nested gene does not overlap with any other gene --> classify as “Keep as is”
 26 | #'        ii.	If nested gene overlaps with more than one gene --> classify for “Manual inspection”
 27 | #'2.	If gene overlaps with only one other gene, test whether gene is non-protein
 28 | #'coding/pseudogene (“Gm” and “…Rik” gene models in mice; “AC…” and “AL…” gene models in humans)
 29 | #'    a.	If both overlapping genes are non-protein coding/pseudogenes --> classify for
 30 | #'    “Manual inspection”
 31 | #'    b.	If only one gene in the overlapping gene pair is non-protein coding/pseudogene,
 32 | #'    test if genes have overlapping exons:
 33 | #'        i.	In case no overlapping exons --> classify both genes as “Keep as is”
 34 | #'        ii.	In case exons overlap --> mark non-protein coding/pseudogene for
 35 | #'        deletion (“Delete”).
 36 | #'    c.	If both genes are well supported genes:
 37 | #'        i.	If their exons don’t overlap --> mark both genes as “Keep as is”
 38 | #'        ii.	If their exons do overlap, determine the number of opposing gene’s
 39 | #'        exonic overlap for each exon of each gene and find the exon with most
 40 | #'        overlaps for both upstream and downstream gene to determine appropriate
 41 | #'        course of action:
 42 | #'            1.	If downstream gene’s exon has more overlaps than its upstream
 43 | #'            counterpart, classify downstream gene as “Premature transcript deletion”
 44 | #'            and upstream gene as “Keep as is”
 45 | #'            2.	If upstream gene’s exon has more overlaps than its downstream
 46 | #'            counterpart, classify upstream gene as “Readthrough transcript deletion”
 47 | #'            and downstream gene as “Keep as is”
 48 | #'            3.	Otherwise classify both for “Manual inspection”
 49 | #'The resulting recommendations can be used in the manual curation step, where
 50 | #'all genes that are not classified in the “Keep as is” category should directly
 51 | #'be scrutinized in the Ensembl genome browser (ensembl.org, with the correct
 52 | #'genome builds) and/or cross-referenced to the respective RefSeq genome annotation
 53 | #'within the Integrative Genomics Viewer (IGV 2.11.9).
 54 | #'
 55 | #' @param genome_annotation Unoptimized genome annotation (e.g. Ensembl/10x Genomics)
 56 | #' default genome annotation GTF file. This should be a dataframe created with the
 57 | #' LoadGtf() function in this package.
 58 | #' @param overlap_data A list of overlapping genes generated by IdentifyOverlappers.
 59 | #' @param gene_pattern The pattern in gene names that is unique for pseudo- or
 60 | #' other low quality or low interest genes. Patterns for recognizing candidate
 61 | #' pseudo- or low quality genes can be defined with regular expressions for matching
 62 | #' gene names with a given pattern. See vignette (regular-expressions) in the stringr
 63 | #' package for details or examples below.
 64 | #'
 65 | #' @return Generates “overlapping_gene_list.csv” with added recommendations for
 66 | #' resolving gene overlaps in the “automatic_classification” column.
 67 | #' @export
 68 | #'
 69 | #' @examples
 70 | #' genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
 71 | #' gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
 72 | #' OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("^Gm", "Rik$"))
 73 | #'
 74 | #' # Note: The example treats genes starting with “Gm…” and ending with “…Rik” as
 75 | #' # pseudogenes. Additional patterns for recognizing candidate pseudo- or low-quality
 76 | #' # genes can be defined with regular expressions for matching gene names
 77 | #' # with a given pattern. See vignette(regular-expressions) in the stringr package
 78 | #' # for details.
 79 | OverlapResolutions <- function(genome_annotation, overlap_data, gene_pattern){
 80 |   gene_list <- unique(overlap_data$gene)
 81 |   gene_address <- rep(0, length(gene_list))
 82 |   for (i in 1:length(gene_list)){
 83 |     gene_address[i] <- which(overlap_data$gene==gene_list[i])[1]
 84 |   }
 85 | 
 86 |   overlap_data <- overlap_data[gene_address,]
 87 | 
 88 |   rownames(overlap_data) <- overlap_data[,'gene']
 89 | 
 90 |   overlap_data['automatic_classification'] <- NA
 91 | 
 92 |   for(key in (rownames(overlap_data))){
 93 | 
 94 |     # Check that the gene is not classified already
 95 |     if(is.na(overlap_data[key,'automatic_classification'])){
 96 |       gene_A <- subset(genome_annotation, gene_name == key)
 97 | 
 98 |       if(overlap_data[key,'number_of_gene_overlaps'] > 1){
 99 |         overlaps <- as.list(strsplit(overlap_data[key,'overlapping_genes'], ", "))
100 | 
101 |         for(item in overlaps[[1]]){
102 |           gene_B = genome_annotation[genome_annotation['gene_name'] == item,]
103 | 
104 |           gene_A_exons = return_exons(gene_A)
105 |           gene_B_exons = return_exons(gene_B)
106 | 
107 |           if(exon_overlap(gene_A_exons, gene_B_exons) == TRUE){
108 | 
109 |             overlap_data[item, 'automatic_classification'] = 'Manual inspection'
110 | 
111 |             if(is.na(overlap_data[key,'automatic_classification']) | overlap_data[key,'automatic_classification'] != 'Manual inspection'){
112 |               overlap_data[key,'automatic_classification'] = 'Manual inspection'
113 |             }
114 |           }
115 |           else{
116 | 
117 |             if(is.na(overlap_data[key,'automatic_classification'])){
118 |               overlap_data[key,'automatic_classification'] = 'Keep as is'
119 | 
120 |               if(overlap_data[item,'number_of_gene_overlaps'] > 1){
121 |                 overlap_data[item,'automatic_classification'] = 'Manual inspection'
122 |               }
123 |               else{
124 |                 overlap_data[item,'automatic_classification'] = 'Keep as is'
125 |               }
126 |             }
127 |           }
128 |         }
129 |       }
130 | 
131 |       if(overlap_data[key,'number_of_gene_overlaps'] == 1){
132 |         overlapping <- overlap_data[key,'overlapping_genes'][[1]]
133 |         gene_B <- subset(genome_annotation, gene_name == overlapping)
134 |         strand <- gene_A[1,'strand']
135 | 
136 |         gene_A_exons = return_exons(gene_A)
137 |         gene_B_exons = return_exons(gene_B)
138 | 
139 |         # Check if both - key and overlapping gene - are pseudogenes
140 |         if(both_pseudo(key, overlapping, gene_pattern) == TRUE){
141 |           overlap_data[key, 'automatic_classification'] = 'Manual inspection'
142 |           overlap_data[overlapping[[1]], 'automatic_classification'] = 'Manual inspection'
143 |         }
144 | 
145 |         # Check for pseudogene
146 |         else if(pseudo_overlap(key, overlapping, gene_A_exons, gene_B_exons, gene_pattern) == key){
147 |           overlap_data[key, 'automatic_classification'] = 'Delete'
148 |           overlap_data[overlapping[[1]], 'automatic_classification'] = 'Keep as is'
149 |         }
150 | 
151 |         else if(pseudo_overlap(key, overlapping, gene_A_exons, gene_B_exons, gene_pattern) == overlapping){
152 |           overlap_data[key, 'automatic_classification'] = 'Keep as is'
153 |           overlap_data[overlapping[[1]], 'automatic_classification'] = 'Delete'
154 |         }
155 | 
156 |         else if(pseudo_overlap(key, overlapping, gene_A_exons, gene_B_exons, gene_pattern) == 'exonic'){
157 |           overlap_data[key, 'automatic_classification'] = 'Keep as is'
158 |           overlap_data[overlapping[[1]], 'automatic_classification'] = 'Keep as is'
159 |         }
160 | 
161 |         # Check for readthrough
162 |         else if(exon_overlap(gene_A_exons, gene_B_exons) == TRUE){
163 |           if(strand == '+'){
164 |             name_A = key
165 |             name_B = overlapping
166 |             result = readthrough_or_premature_plus(name_A, gene_A, name_B, gene_B, gene_A_exons, gene_B_exons)
167 | 
168 |             if(result[[3]] == 'readthrough'){
169 |               overlap_data[result[[1]],'automatic_classification'] = 'Readthrough transcript deletion'
170 |               overlap_data[result[[2]],'automatic_classification'] = 'Keep as is'
171 |             }
172 |             else if(result[[3]] == 'premature'){
173 |               overlap_data[result[[1]],'automatic_classification'] = 'Keep as is'
174 |               overlap_data[result[[2]],'automatic_classification'] = 'Premature transcript deletion'
175 |             }
176 |             else if(result[[3]] == 'manual'){
177 |               overlap_data[result[[1]],'automatic_classification'] = 'Manual inspection'
178 |               overlap_data[result[[2]],'automatic_classification'] = 'Manual inspection'
179 |             }
180 |           }
181 | 
182 |           else if(strand == '-'){
183 |             name_A = key
184 |             name_B = overlapping
185 |             result = readthrough_or_premature_min(name_A, gene_A, name_B, gene_B, gene_A_exons, gene_B_exons)
186 | 
187 |             if(result[[3]] == 'readthrough'){
188 |               overlap_data[result[[1]],'automatic_classification'] = 'Readthrough transcript deletion'
189 |               overlap_data[result[[2]],'automatic_classification'] = 'Keep as is'
190 |             }
191 |             else if(result[[3]] == 'premature'){
192 |               overlap_data[result[[1]],'automatic_classification'] = 'Keep as is'
193 |               overlap_data[result[[2]],'automatic_classification'] = 'Premature transcript deletion'
194 |             }
195 |             else if(result[[3]] == 'manual'){
196 |               overlap_data[result[[1]],'automatic_classification'] = 'Manual inspection'
197 |               overlap_data[result[[2]],'automatic_classification'] = 'Manual inspection'
198 |             }
199 |           }
200 |         }
201 | 
202 |         else if(exon_overlap(gene_A_exons, gene_B_exons) == FALSE){
203 |           overlap_data[key,'automatic_classification'] = 'Keep as is'
204 |           overlap_data[overlapping,'automatic_classification'] = 'Keep as is'
205 |         }
206 |       }
207 |     }
208 |   }
209 | 
210 |   print("Overlapping genes list (overlapping_gene_list.csv) has been updated with recommended action categories and the file has been saved in your working directory")
211 |   write_csv(overlap_data, "overlapping_gene_list.csv")
212 | 
213 | }
214 | 


--------------------------------------------------------------------------------
/.Rhistory:
--------------------------------------------------------------------------------
  1 | gene_overlaps <- IdentifyOverlappers(genome_annotation)
  2 | OverlapResolutions(genome_annotation, gene_overlaps, c("Rik$", "^Gm"))
  3 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai")
  4 | GenerateGeneLocationBed(genome_annotation)
  5 | GenerateExtensionCandidates()
  6 | OptimizedAnnotationAssembler("test_genes.gtf", “test_overlapping_gene_list.csv", "gene_extension_candidates.csv", "test_gene_replacement.csv")
  7 | OptimizedAnnotationAssembler("test_genes.gtf", "test_overlapping_gene_list.csv", "gene_extension_candidates.csv", "test_gene_replacement.csv")
  8 | git push
  9 | git config pull.ff only
 10 | library(ReferenceEnhancer)
 11 | library(ReferenceEnhancer)
 12 | library(referenceenhancer)
 13 | getwd()
 14 | setwd("/Users/helen/Desktop")
 15 | setwd("/Users/helen/Desktop/Overlap")
 16 | genome_annotation <- LoadGtf("test_genes.gtf")
 17 | ?IdentifyOverlappers
 18 | library(ReferenceEnhancer)
 19 | getwd()
 20 | install.packages("devtools")
 21 | require(devtools)
 22 | library(ReferenceEnhancer)
 23 | install.packages("gdata")
 24 | install.packages("readr")
 25 | ?IdentifyOverlappers
 26 | library(ReferenceEnhancer)
 27 | genome_annotation <- LoadGtf("test_genes.gtf")
 28 | library(ReferenceEnhancer)
 29 | ?IdentifyOverlappers
 30 | ?ReferenceEnhancer
 31 | ??ReferenceEnhancer
 32 | library(ReferenceEnhancer)
 33 | pwd
 34 | getwd()
 35 | genome_annotation <- LoadGtf("test_genes.gtf")
 36 | genome_annotation <- LoadGtf(genes_gtf_path)
 37 | library(ReferenceEnhancer)
 38 | devtools::document()
 39 | library(ReferenceEnhancer)
 40 | genome_annotation <- LoadGtf("test_genes.gtf")
 41 | BiocManager::install("rtracklayer")
 42 | genome_annotation <- LoadGtf("test_genes.gtf")
 43 | gene_overlaps <- IdentifyOverlappers(genome_annotation)
 44 | OverlapResolutions(genome_annotation, gene_overlaps, c("Rik$", "^Gm"))
 45 | devtools::document()
 46 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai")
 47 | ?LoadGtf
 48 | ?OptimizedAnnotationAssembler
 49 | ??OptimizedAnnotationAssembler
 50 | library(ReferenceEnhancer)
 51 | getwd()
 52 | OptimizedAnnotationAssembler()
 53 | library(ReferenceEnhancer)
 54 | ?OptimizedAnnotationAssembler
 55 | library(ReferenceEnhancer)
 56 | library(ReferenceEnhancer)
 57 | library(ReferenceEnhancer)
 58 | ?OptimizedAnnotationAssembler
 59 | library(ReferenceEnhancer)
 60 | ?OptimizedAnnotationAssembler
 61 | test = LoadGtf("test_genes.gtf")
 62 | test
 63 | dim(test)
 64 | exonic_gtf <- system.file("extdata", "test_genes.gtf", package = "ReferenceEnhancer")
 65 | genome_annotation <- LoadGtf("test_genes.gtf")
 66 | gene_overlaps <- IdentifyOverlappers(genome_annotation)
 67 | gene_extension <- GenerateExtensionCandidates()
 68 | OptimizedAnnotationAssembler(exonic_gtf, gene_overlaps, gene_extension, gene_replacement)
 69 | library(ReferenceEnhancer)
 70 | exonic_gtf <- system.file("extdata", "test_genes.gtf", package = "ReferenceEnhancer")
 71 | genome_annotation <- LoadGtf("test_genes.gtf")
 72 | genome_annotation <- LoadGtf("test_genes.gtf")
 73 | gene_extension <- GenerateExtensionCandidates()
 74 | read.table("results.txt", sep = "\t")
 75 | library(ReferenceEnhancer)
 76 | summary_data
 77 | exonic_gtf <- system.file("extdata", "test_genes.gtf", package = "ReferenceEnhancer")
 78 | genome_annotation <- LoadGtf("test_genes.gtf")
 79 | genome_annotation <- LoadGtf("test_genes.gtf")
 80 | gene_extension <- GenerateExtensionCandidates()
 81 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv")
 82 | pwd
 83 | getwd()
 84 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv")
 85 | OptimizedAnnotationAssembler("test_genes.gtf", "overlapping_gene_list.csv", " gene_extension_candidates.csv", "rename_genes.csv")
 86 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv")
 87 | library(ReferenceEnhancer)
 88 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv")
 89 | library(ReferenceEnhancer)
 90 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv")
 91 | library(ReferenceEnhancer)
 92 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv")
 93 | gene_overlaps
 94 | library(ReferenceEnhancer)
 95 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv")
 96 | length(transcripts_to_delete)
 97 | library(ReferenceEnhancer)
 98 | OptimizedAnnotationAssembler("./test_genes.gtf", "./overlapping_gene_list.csv", " gene_extension_candidates.csv", "./rename_genes.csv")
 99 | getwd()
100 | library(ReferenceEnhancer)
101 | genome_annotation <- LoadGtf("test_genes.gtf")
102 | gene_overlaps <- IdentifyOverlappers(genome_annotation)
103 | OverlapResolutions(genome_annotation, gene_overlaps, c("^Gm", "Rik$"))
104 | IsolateIntergenicReads("./input.bam", "./input.bam.bai")
105 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai")
106 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai")
107 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai")
108 | library(ReferenceEnhancer)
109 | genome_annotation <- LoadGtf("test_genes.gtf")
110 | gene_overlaps <- IdentifyOverlappers(genome_annotation)
111 | OverlapResolutions(genome_annotation, gene_overlaps, c("Rik$", "^Gm"))
112 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai")
113 | library(ReferenceEnhancer)
114 | genome_annotation <- LoadGtf("test_genes.gtf")
115 | gene_overlaps <- IdentifyOverlappers(genome_annotation)
116 | OverlapResolutions(genome_annotation, gene_overlaps, c("Rik$", "^Gm"))
117 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai")
118 | GenerateGeneLocationBed(genome_annotation)
119 | GenerateExtensionCandidates()
120 | OptimizedAnnotationAssembler("test_genes.gtf", "test_overlapping_gene_list.csv", "gene_extension_candidates.csv", "test_gene_replacement.csv")
121 | GenerateGeneLocationBed(genome_annotation, "/usr/bin/bedops")
122 | GenerateGeneLocationBed(genome_annotation)
123 | library(ReferenceEnhancer)
124 | GenerateExtensionCandidates()
125 | library(ReferenceEnhancer)
126 | library(ReferenceEnhancer)
127 | library(ReferenceEnhancer)
128 | unoptimized_gtf <- "test_genes.gtf"
129 | gene_overlaps <- "overlapping_gene_list.csv"
130 | gene_extension <- "gene_extension_candidates.csv"
131 | gene_replacement <- "test_gene_replacement.csv"
132 | OptimizedAnnotationAssembler("test_genes.gtf", "overlapping_gene_list.csv", " gene_extension_candidates.csv", "rename_genes.csv")
133 | unoptimized_gtf <- "test_genes.gtf"
134 | gene_overlaps <- "test_overlapping_gene_list.csv"
135 | gene_extension <- "./gene_extension_candidates.csv"
136 | gene_replacement <- "test_gene_replacement.csv"
137 | OptimizedAnnotationAssembler(unoptimized_gtf, gene_overlaps, gene_extension, gene_replacement)
138 | library(ReferenceEnhancer)
139 | ?LoadGtf
140 | ?LoadGtf
141 | library(ReferenceEnhancer)
142 | ?LoadGtf
143 | ?LoadGtf
144 | library(ReferenceEnhancer)
145 | ?LoadGtf
146 | library(ReferenceEnhancer)
147 | ?LoadGtf
148 | library(ReferenceEnhancer)
149 | ?LoadGtf
150 | library(ReferenceEnhancer)
151 | ?LoadGtf
152 | library(ReferenceEnhancer)
153 | ?LoadGtf
154 | library(ReferenceEnhancer)
155 | ?LoadGtf
156 | library(ReferenceEnhancer)
157 | ?LoadGtf
158 | genome_annotation <- LoadGtf("test_genes.gtf")
159 | ?IdentifyOverlappers
160 | gene_overlaps <- IdentifyOverlappers(genome_annotation)
161 | ?OverlapResolutions
162 | library(ReferenceEnhancer)
163 | ?OverlapResolutions
164 | library(ReferenceEnhancer)
165 | ?OverlapResolutions
166 | genome_annotation <- LoadGtf("test_genes.gtf")
167 | gene_overlaps <- IdentifyOverlappers(genome_annotation)
168 | OverlapResolutions(genome_annotation, gene_overlaps, c("Rik$", "^Gm"))
169 | ?IsolateIntergenicReads
170 | IsolateIntergenicReads("test_bam.bam", "test_index.bam.bai", barcode_length = 26)
171 | ?GenerateGeneLocationBed
172 | ?GenerateExtensionCandidates
173 | ?OptimizedAnnotationAssembler
174 | library(ReferenceEnhancer)
175 | ?LoadGtf
176 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
177 | ?IdentifyOverlappers
178 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
179 | ?OverlapResolutions
180 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
181 | ?IsolateIntergenicReads
182 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
183 | ?GenerateGeneLocationBed
184 | GenerateGeneLocationBed(genome_annotation = genome_annotation)
185 | ?GenerateExtensionCandidates
186 | old_path <- Sys.getenv("PATH")
187 | Sys.setenv(PATH = paste(old_path, "/Applications/bedtools2/bin", sep = ":"))
188 | GenerateExtensionCandidates()
189 | Sys.getenv("PATH")
190 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
191 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
192 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
193 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
194 | GenerateGeneLocationBed(genome_annotation = genome_annotation)
195 | GenerateExtensionCandidates()
196 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
197 | library(ReferenceEnhancer)
198 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
199 | library(ReferenceEnhancer)
200 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
201 | library(ReferenceEnhancer)
202 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
203 | library(ReferenceEnhancer)
204 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
205 | library(ReferenceEnhancer)
206 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
207 | library(ReferenceEnhancer)
208 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
209 | library(ReferenceEnhancer)
210 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
211 | library(ReferenceEnhancer)
212 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
213 | library(ReferenceEnhancer)
214 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
215 | library(ReferenceEnhancer)
216 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
217 | library(ReferenceEnhancer)
218 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
219 | library(ReferenceEnhancer)
220 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
221 | genome_annotation
222 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
223 | getwd()
224 | setwd("/System/Applications")
225 | library(ReferenceEnhancer)
226 | getwd()
227 | library(ReferenceEnhancer)
228 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
229 | LoadGtf("test_genes.gtf")
230 | ?LoadGtf
231 | library(ReferenceEnhancer)
232 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
233 | ?LoadGtf
234 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
235 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
236 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
237 | GenerateGeneLocationBed(genome_annotation = genome_annotation)
238 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
239 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
240 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
241 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
242 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
243 | library(ReferenceEnhancer)
244 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
245 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai")
246 | library(ReferenceEnhancer)
247 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai")
248 | library(ReferenceEnhancer)
249 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai")
250 | library(ReferenceEnhancer)
251 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai")
252 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
253 | library(ReferenceEnhancer)
254 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
255 | GenerateGeneLocationBed(genome_annotation = genome_annotation)
256 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
257 | old_path <- Sys.getenv("PATH")
258 | Sys.getenv("PATH")
259 | library(ReferenceEnhancer)
260 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
261 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
262 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
263 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
264 | GenerateGeneLocationBed(genome_annotation = genome_annotation)
265 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
266 | old_path <- Sys.getenv("PATH")
267 | Sys.setenv(PATH = paste(old_path, "/Applications/bedtools2/bin", sep = ":"))
268 | GenerateExtensionCandidates()
269 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
270 | library(ReferenceEnhancer)
271 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
272 | GenerateExtensionCandidates()
273 | Sys.getenv("PATH")
274 | ?GenerateExtensionCandidates
275 | library(ReferenceEnhancer)
276 | ?GenerateExtensionCandidates
277 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
278 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
279 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
280 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
281 | GenerateGeneLocationBed(genome_annotation = genome_annotation)
282 | Sys.getenv("PATH")
283 | GenerateExtensionCandidates()
284 | library(ReferenceEnhancer)
285 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
286 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
287 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
288 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
289 | GenerateGeneLocationBed(genome_annotation = genome_annotation)
290 | Sys.getenv("PATH")
291 | GenerateExtensionCandidates()
292 | Sys.getenv("PATH")
293 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
294 | Sys.getenv("PATH")
295 | Sys.getenv("PATH")
296 | whereis bedtools
297 | library(ReferenceEnhancer)
298 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
299 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
300 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
301 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
302 | GenerateGeneLocationBed(genome_annotation = genome_annotation)
303 | GenerateExtensionCandidates()
304 | Sys.getenv("PATH")
305 | GenerateExtensionCandidates("/opt/homebrew/bin/bedtools")
306 | Sys.getenv("PATH")
307 | system("whereis bedtools")
308 | library(ReferenceEnhancer)
309 | GenerateExtensionCandidates()
310 | GenerateExtensionCandidates()
311 | library(ReferenceEnhancer)
312 | library(ReferenceEnhancer)
313 | GenerateExtensionCandidates()
314 | library(ReferenceEnhancer)
315 | GenerateExtensionCandidates()
316 | library(ReferenceEnhancer)
317 | GenerateExtensionCandidates()
318 | library(ReferenceEnhancer)
319 | GenerateExtensionCandidates()
320 | library(ReferenceEnhancer)
321 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
322 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
323 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
324 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
325 | GenerateGeneLocationBed(genome_annotation = genome_annotation)
326 | GenerateExtensionCandidates()
327 | library(ReferenceEnhancer)
328 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
329 | Sys.getenv("PATH")
330 | system("whereis bedtools")
331 | system("whereis bedtools")
332 | system("whereis bedtools") == None
333 | system("whereis bedtools") == []
334 | if(system("whereis bedtools")){print("YAY")}
335 | if(system("whereis bedtools")){print("YAY")}else{"NO"}
336 | library(ReferenceEnhancer)
337 | GenerateExtensionCandidates()
338 | Sys.getenv("PATH")
339 | GenerateExtensionCandidates()
340 | Sys.getenv("PATH")
341 | system("whereis bedtools")
342 | GenerateExtensionCandidates()
343 | Sys.getenv("PATH")
344 | system("whereis bedtools")
345 | system("whereis bedtools")
346 | a=system("whereis bedtools")
347 | class(a)
348 | a
349 | a
350 | a=system(print "homo")
351 | a=system("homo")
352 | a=system(""homo"")
353 | a=system("print homo")
354 | a=system("print 'homo'")
355 | a=system("echo homo")
356 | a
357 | system("echo homo")[1]
358 | a=system("echo homo")[1]
359 | a
360 | class(system("echo homo"))
361 | as.character((system("echo homo"))
362 | )
363 | row.names((system("echo homo")))
364 | row.names(system("whereis bedtools"))
365 | a=row.names(system("whereis bedtools"))
366 | dim(a)
367 | class(a)
368 | a
369 | row.names(system("whereis bedtools"))
370 | system("whereis bedtools")
371 | unlist(strsplit(system("whereis bedtools", intern = TRUE),":"))[2]
372 | unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2]
373 | library(ReferenceEnhancer)
374 | GenerateExtensionCandidates()
375 | system("whereis bedtools")
376 | Sys.getenv("PATH")
377 | system("whereis bedtools", intern = TRUE)
378 | if(system("whereis bedtools", intern = TRUE)){print("YAY")}
379 | length(system("whereis bedtools"))
380 | length(system("whereis bedtools", intern = TRUE))
381 | length(system("whereis bedtools"))
382 | length(system("whereis bedtools", intern = TRUE))
383 | unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2]
384 | if(unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2]){print("YAY")}
385 | if(is.na(unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2])){print("OHNO")}
386 | if(is.na(unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2])){
387 | print("Didn't find bedtools. Please install bedtools.")
388 | }
389 | else{
390 | if(is.na(unlist(strsplit(system("whereis bedtools", intern = TRUE),": "))[2])){
391 | print("Didn't find bedtools. Please install bedtools.")
392 | }
393 | else{
394 | library(ReferenceEnhancer)
395 | GenerateExtensionCandidates()
396 | library(ReferenceEnhancer)
397 | GenerateExtensionCandidates()
398 | Sys.getenv("PATH")
399 | GenerateExtensionCandidates()
400 | Sys.getenv("PATH")
401 | system("whereis bedtools")
402 | system("whereis bedtools")
403 | Sys.getenv("PATH")
404 | library(ReferenceEnhancer)
405 | Sys.getenv("PATH")
406 | GenerateExtensionCandidates()
407 | old_path <- Sys.getenv("PATH")
408 | bedtools_loc = "/Users/helen/Downloads/bedtools2/bin"
409 | Sys.setenv(PATH = paste(old_path, bedtools_loc, sep = ":"))
410 | Sys.getenv("PATH")
411 | GenerateExtensionCandidates()
412 | ?GenerateExtensionCandidates
413 | library(ReferenceEnhancer)
414 | GenerateExtensionCandidates()
415 | library(ReferenceEnhancer)
416 | GenerateExtensionCandidates()
417 | GenerateExtensionCandidates(bedtools_loc = "/opt/bedtools2/bin")
418 | library(ReferenceEnhancer)
419 | library(ReferenceEnhancer)
420 | GenerateExtensionCandidates()
421 | library(ReferenceEnhancer)
422 | GenerateExtensionCandidates()
423 | GenerateExtensionCandidates("/Users/helen/Downloads/bedtools2/bin")
424 | ?GenerateGeneLocationBed
425 | system("whereis bedops")
426 | bedops in Sys.getenv("PATH")
427 | system.file(bedops)
428 | a = Sys.getenv("PATH", intern = TRUE)
429 | a = intern = Sys.getenv("PATH")
430 | a = Sys.getenv("PATH")
431 | a
432 | bedops in a
433 | class(a)
434 | grepl( needle, haystack, fixed = TRUE)
435 | grepl(a, "bedops", fixed = TRUE)
436 | a
437 | grepl(a, "bedtools", fixed = TRUE)
438 | grepl(a, "bedtools")
439 | type(a)
440 | class(a)
441 | [1]
442 | length(a)
443 | length(a[1])
444 | unlist(strsplit(a,sep=":"))
445 | a
446 | dim(a)
447 | ?Sys.getenv
448 | Sys.getenv(c("R_HOME", "R_PAPERSIZE", "R_PRINTCMD", "HOST"))
449 | dim(Sys.getenv(c("R_HOME", "R_PAPERSIZE", "R_PRINTCMD", "HOST")))
450 | class(Sys.getenv(c("R_HOME", "R_PAPERSIZE", "R_PRINTCMD", "HOST")))
451 | s <- Sys.getenv()
452 | s
453 | names(s)
454 | head(s, 12)
455 | a = Sys.getenv("PATH")
456 | names(a)
457 | head(s, 12)
458 | head(s, 1)
459 | s
460 | head(s, 1)
461 | head(s, 1)[1]
462 | head(s, Path)
463 | Sys.getenv("PATH")
464 | Sys.getenv("PATH")
465 | system("whereis bedops")
466 | library(ReferenceEnhancer)
467 | GenerateGeneLocationBed()
468 | ?GenerateGeneLocationBed
469 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
470 | GenerateGeneLocationBed(genome_annotation)
471 | Sys.getenv("PATH")
472 | library(ReferenceEnhancer)
473 | GenerateGeneLocationBed(genome_annotation)
474 | library(ReferenceEnhancer)
475 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
476 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
477 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
478 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
479 | GenerateGeneLocationBed(genome_annotation = genome_annotation, bedops_loc = NULL)
480 | GenerateExtensionCandidates(bedtools_loc = NULL)
481 | Sys.getenv("PATH")
482 | library(ReferenceEnhancer)
483 | GenerateExtensionCandidates(bedtools_loc = NULL)
484 | library(ReferenceEnhancer)
485 | GenerateExtensionCandidates()
486 | GenerateExtensionCandidates(bedtools_loc = NULL)
487 | library(ReferenceEnhancer)
488 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
489 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
490 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
491 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
492 | GenerateGeneLocationBed(genome_annotation = genome_annotation, bedops_loc = NULL)
493 | GenerateGeneLocationBed(genome_annotation = genome_annotation)
494 | GenerateExtensionCandidates(bedtools_loc = NULL)
495 | library(ReferenceEnhancer)
496 | library(ReferenceEnhancer)
497 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
498 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
499 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
500 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
501 | GenerateGeneLocationBed(genome_annotation = genome_annotation, bedops_loc = NULL)
502 | GenerateExtensionCandidates(bedtools_loc = NULL)
503 | GenerateExtensionCandidates(bedtools_loc = "/Users/helen/Downloads/bedtools2/bin")
504 | library(ReferenceEnhancer)
505 | genome_annotation <- LoadGtf(unoptimized_annotation_path = "test_genes.gtf")
506 | gene_overlaps <- IdentifyOverlappers(genome_annotation = genome_annotation)
507 | OverlapResolutions(genome_annotation = genome_annotation, overlap_data = gene_overlaps, gene_pattern = c("Rik$", "^Gm"))
508 | IsolateIntergenicReads(bam_file_name = "test_bam.bam", index_file_name = "test_index.bam.bai", barcode_length = 26)
509 | GenerateGeneLocationBed(genome_annotation = genome_annotation, bedops_loc = NULL)
510 | GenerateExtensionCandidates(bedtools_loc = NULL)
511 | OptimizedAnnotationAssembler("test_genes.gtf", "test_overlapping_gene_list.csv", "gene_extension_candidates.csv", "test_gene_replacement.csv")
512 | library(ReferenceEnhancer)
513 | 


--------------------------------------------------------------------------------