├── .BBSoptions ├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── NAMESPACE ├── NEWS ├── R ├── FeatureDb-class.R ├── TxDb-SELECT-helpers.R ├── TxDb-class.R ├── TxDb-schema.R ├── coordinate-mapping-methods.R ├── coverageByTranscript.R ├── exonicParts.R ├── extendExonsIntoIntrons.R ├── extractTranscriptSeqs.R ├── extractUpstreamSeqs.R ├── features.R ├── getPromoterSeq-methods.R ├── id2name.R ├── makeFeatureDbFromUCSC.R ├── makeTxDb.R ├── makeTxDbFromBiomart.R ├── makeTxDbFromEnsembl.R ├── makeTxDbFromGFF.R ├── makeTxDbFromGRanges.R ├── makeTxDbFromUCSC.R ├── makeTxDbPackage.R ├── mapIdsToRanges.R ├── nearest-methods.R ├── proteinToGenome.R ├── select-methods.R ├── tRNAs.R ├── transcriptLengths.R ├── transcriptLocs2refLocs.R ├── transcripts.R ├── transcriptsBy.R ├── transcriptsByOverlaps.R ├── utils.R └── zzz.R ├── README.md ├── TODO ├── inst ├── CITATION ├── extdata │ ├── Biomart_Ensembl_sample.sqlite │ ├── FeatureDb.sqlite │ ├── ITAG4.1_gene_models.subset.gff │ ├── cD.exByEdge-SG-Vig.Rda │ ├── cD.exsByGenes-SG-Vig.Rda │ ├── events.Rda │ ├── hg19_knownGene_sample.sqlite │ └── sample_ranges.rds ├── script │ └── README └── unitTests │ ├── test_TxDb_seqinfo.R │ ├── test_coordinate-mapping-methods.R │ ├── test_exonicParts.R │ ├── test_getPromoterSeq-methods.R │ ├── test_makeIdsForUniqueDataFrameRows.R │ ├── test_mapIdsToRanges.R │ ├── test_nearest-methods.R │ ├── test_select-methods.R │ ├── test_transcriptLengths.R │ ├── test_transcripts.R │ ├── test_transcriptsBy.R │ └── test_transcriptsByOverlaps.R ├── man ├── FeatureDb-class.Rd ├── TxDb-class.Rd ├── as-format-methods.Rd ├── coordinate-mapping-methods.Rd ├── coverageByTranscript.Rd ├── exonicParts.Rd ├── extendExonsIntoIntrons.Rd ├── extractTranscriptSeqs.Rd ├── extractUpstreamSeqs.Rd ├── features.Rd ├── getPromoterSeq-methods.Rd ├── id2name.Rd ├── makeFeatureDbFromUCSC.Rd ├── makeTxDb.Rd ├── makeTxDbFromBiomart.Rd ├── makeTxDbFromEnsembl.Rd ├── makeTxDbFromGFF.Rd ├── makeTxDbFromGRanges.Rd ├── makeTxDbFromUCSC.Rd 
├── makeTxDbPackage.Rd ├── mapIdsToRanges.Rd ├── mapRangesToIds.Rd ├── nearest-methods.Rd ├── proteinToGenome.Rd ├── select-methods.Rd ├── tRNAs.Rd ├── transcriptLengths.Rd ├── transcriptLocs2refLocs.Rd ├── transcripts.Rd ├── transcriptsBy.Rd └── transcriptsByOverlaps.Rd ├── tests └── run_unitTests.R └── vignettes └── GenomicFeatures.Rmd /.BBSoptions: -------------------------------------------------------------------------------- 1 | RunLongTests: TRUE 2 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | .BBSoptions 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.Rhistory 2 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: GenomicFeatures 2 | Title: Query the gene models of a given organism/assembly 3 | Description: Extract the genomic locations of genes, transcripts, exons, 4 | introns, and CDS, for the gene models stored in a TxDb object. 5 | A TxDb object is a small database that contains the gene models of 6 | a given organism/assembly. Bioconductor provides a small collection 7 | of TxDb objects in the form of ready-to-install TxDb packages for 8 | the most commonly studied organisms. Additionally, the user can 9 | easily make a TxDb object (or package) for the organism/assembly 10 | of their choice by using the tools from the txdbmaker package. 
11 | biocViews: Genetics, Infrastructure, Annotation, Sequencing, 12 | GenomeAnnotation 13 | URL: https://bioconductor.org/packages/GenomicFeatures 14 | BugReports: https://github.com/Bioconductor/GenomicFeatures/issues 15 | Version: 1.61.3 16 | License: Artistic-2.0 17 | Encoding: UTF-8 18 | Authors@R: c( 19 | person("M.", "Carlson", role="aut"), 20 | person("H.", "Pagès", role=c("aut", "cre"), 21 | email="hpages.on.github@gmail.com"), 22 | person("P.", "Aboyoun", role="aut"), 23 | person("S.", "Falcon", role="aut"), 24 | person("M.", "Morgan", role="aut"), 25 | person("D.", "Sarkar", role="aut"), 26 | person("M.", "Lawrence", role="aut"), 27 | person("V.", "Obenchain", role="aut"), 28 | person("S.", "Arora", role="ctb"), 29 | person("J.", "MacDonald", role="ctb"), 30 | person("M.", "Ramos", role="ctb"), 31 | person("S.", "Saini", role="ctb"), 32 | person("P.", "Shannon", role="ctb"), 33 | person("L.", "Shepherd", role="ctb"), 34 | person("D.", "Tenenbaum", role="ctb"), 35 | person("D.", "Van Twisk", role="ctb")) 36 | Depends: BiocGenerics (>= 0.51.2), S4Vectors (>= 0.17.29), 37 | IRanges (>= 2.37.1), GenomeInfoDb (>= 1.35.8), 38 | GenomicRanges (>= 1.55.2), AnnotationDbi (>= 1.41.4) 39 | Imports: methods, utils, stats, DBI, XVector, Biostrings, rtracklayer 40 | Suggests: txdbmaker, org.Mm.eg.db, org.Hs.eg.db, 41 | BSgenome, BSgenome.Hsapiens.UCSC.hg19 (>= 1.3.17), 42 | BSgenome.Celegans.UCSC.ce11, 43 | BSgenome.Dmelanogaster.UCSC.dm3 (>= 1.3.17), 44 | FDb.UCSC.tRNAs, 45 | TxDb.Hsapiens.UCSC.hg19.knownGene, 46 | TxDb.Celegans.UCSC.ce11.ensGene, 47 | TxDb.Dmelanogaster.UCSC.dm3.ensGene (>= 2.7.1), 48 | TxDb.Mmusculus.UCSC.mm10.knownGene (>= 3.4.7), 49 | TxDb.Hsapiens.UCSC.hg19.lincRNAsTranscripts, 50 | TxDb.Hsapiens.UCSC.hg38.knownGene (>= 3.4.6), 51 | SNPlocs.Hsapiens.dbSNP144.GRCh38, 52 | Rsamtools, pasillaBamSubset (>= 0.0.5), GenomicAlignments (>= 1.15.7), 53 | ensembldb, AnnotationFilter, 54 | RUnit, BiocStyle, knitr, markdown 55 | VignetteBuilder: knitr 56 | 
Collate: utils.R 57 | TxDb-schema.R 58 | TxDb-SELECT-helpers.R 59 | TxDb-class.R FeatureDb-class.R 60 | mapIdsToRanges.R 61 | id2name.R 62 | transcripts.R 63 | transcriptsBy.R 64 | transcriptsByOverlaps.R 65 | transcriptLengths.R 66 | exonicParts.R 67 | extendExonsIntoIntrons.R 68 | features.R 69 | tRNAs.R 70 | extractTranscriptSeqs.R 71 | extractUpstreamSeqs.R 72 | getPromoterSeq-methods.R 73 | select-methods.R 74 | nearest-methods.R 75 | transcriptLocs2refLocs.R 76 | coordinate-mapping-methods.R 77 | proteinToGenome.R 78 | coverageByTranscript.R 79 | makeTxDb.R 80 | makeTxDbFromUCSC.R 81 | makeTxDbFromBiomart.R 82 | makeTxDbFromEnsembl.R 83 | makeTxDbFromGRanges.R 84 | makeTxDbFromGFF.R 85 | makeFeatureDbFromUCSC.R 86 | makeTxDbPackage.R 87 | zzz.R 88 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | import(methods) 2 | importFrom(stats, setNames) 3 | importFrom(utils, as.person) 4 | 5 | importMethodsFrom(DBI, dbGetQuery, dbListTables, dbListFields) 6 | 7 | import(AnnotationDbi) 8 | import(BiocGenerics) 9 | import(S4Vectors) 10 | import(IRanges) 11 | import(GenomeInfoDb) 12 | import(XVector) 13 | import(GenomicRanges) 14 | 15 | importClassesFrom(Biostrings, DNAString, DNAStringSet, MaskedDNAString) 16 | importFrom(Biostrings, DNAStringSet, reverseComplement, getSeq) 17 | 18 | importFrom(rtracklayer, asBED, asGFF) 19 | 20 | 21 | exportClasses(TxDb, FeatureDb) 22 | 23 | export( 24 | ## id2name.R: 25 | id2name, 26 | 27 | ## transcripts.R: 28 | transcripts, exons, cds, genes, 29 | 30 | ## transcriptsBy.R: 31 | transcriptsBy, 32 | exonsBy, 33 | cdsBy, 34 | intronsByTranscript, 35 | fiveUTRsByTranscript, 36 | threeUTRsByTranscript, 37 | 38 | ## transcriptsByOverlaps.R: 39 | transcriptsByOverlaps, 40 | exonsByOverlaps, 41 | cdsByOverlaps, 42 | 43 | ## transcriptLengths.R: 44 | transcriptLengths, 45 | 46 | ## exonicParts.R: 47 | 
tidyTranscripts, tidyExons, tidyIntrons, 48 | exonicParts, intronicParts, 49 | 50 | ## extendExonsIntoIntrons.R: 51 | extendExonsIntoIntrons, 52 | 53 | ## features.R: 54 | features, 55 | 56 | ## tRNAs.R: 57 | microRNAs, 58 | tRNAs, 59 | 60 | ## extractTranscriptSeqs.R: 61 | extractTranscriptSeqs, 62 | 63 | ## extractUpstreamSeqs.R: 64 | extractUpstreamSeqs, 65 | 66 | ## getPromoterSeq-methods.R: 67 | getPromoterSeq, getTerminatorSeq, 68 | 69 | ## transcriptLocs2refLocs.R: 70 | transcriptLocs2refLocs, 71 | transcriptWidths, 72 | 73 | ## coordinate-mapping-methods.R: 74 | mapToTranscripts, pmapToTranscripts, 75 | mapFromTranscripts, pmapFromTranscripts, 76 | 77 | ## proteinToGenome.R: 78 | proteinToGenome, 79 | 80 | ## coverageByTranscript.R: 81 | coverageByTranscript, 82 | pcoverageByTranscript 83 | ) 84 | 85 | exportMethods( 86 | organism, 87 | show, 88 | as.list, 89 | seqlevels0, "seqlevels<-", seqinfo, 90 | transcripts, exons, cds, genes, 91 | promoters, terminators, 92 | transcriptsByOverlaps, 93 | exonsByOverlaps, 94 | cdsByOverlaps, 95 | transcriptsBy, 96 | exonsBy, 97 | cdsBy, 98 | intronsByTranscript, 99 | fiveUTRsByTranscript, 100 | threeUTRsByTranscript, 101 | tRNAs, 102 | extractTranscriptSeqs, 103 | extractUpstreamSeqs, 104 | getPromoterSeq, getTerminatorSeq, 105 | isActiveSeq, 106 | "isActiveSeq<-", 107 | asBED, asGFF, 108 | distance, 109 | mapToTranscripts, pmapToTranscripts, 110 | mapFromTranscripts, pmapFromTranscripts, 111 | mapIdsToRanges, mapRangesToIds, 112 | proteinToGenome 113 | ) 114 | 115 | 116 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 117 | ### Stuff that has moved to txdbmaker 118 | ### 119 | 120 | export( 121 | ## makeTxDb.R: 122 | makeTxDb, 123 | 124 | ## makeTxDbFromUCSC.R: 125 | supportedUCSCtables, 126 | browseUCSCtrack, 127 | makeTxDbFromUCSC, 128 | 129 | ## makeTxDbFromBiomart.R: 130 | getChromInfoFromBiomart, 131 | makeTxDbFromBiomart, 132 | 133 | ## makeTxDbFromEnsembl.R: 134 | 
makeTxDbFromEnsembl, 135 | 136 | ## makeTxDbFromGRanges.R: 137 | makeTxDbFromGRanges, 138 | 139 | ## makeTxDbFromGFF.R: 140 | makeTxDbFromGFF, 141 | 142 | ## makeFeatureDbFromUCSC.R: 143 | supportedUCSCFeatureDbTracks, 144 | supportedUCSCFeatureDbTables, 145 | UCSCFeatureDbTableSchema, 146 | makeFeatureDbFromUCSC, 147 | 148 | ## makeTxDbPackage.R: 149 | supportedMiRBaseBuildValues, 150 | makePackageName, 151 | makeTxDbPackage, 152 | makeTxDbPackageFromUCSC, 153 | makeFDbPackageFromUCSC, 154 | makeTxDbPackageFromBiomart 155 | ) 156 | 157 | -------------------------------------------------------------------------------- /R/FeatureDb-class.R: -------------------------------------------------------------------------------- 1 | ### ========================================================================= 2 | ### FeatureDb objects 3 | ### ------------------------------------------------------------------------- 4 | 5 | 6 | ## This is to try and tidy up before setRefClass() 7 | gc() 8 | 9 | .FeatureDb <- 10 | setRefClass("FeatureDb", contains="AnnotationDb") 11 | 12 | 13 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 14 | ### A low-level accessor (not exported). 15 | ### 16 | 17 | ## featuredbConn <- function(featuredb) featuredb$conn 18 | 19 | 20 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 21 | ### Validity of a FeatureDb object. 
###

### Check that the feature table of the db behind 'conn' has all the columns
### in 'colnames'. Delegates to AnnotationDbi:::.valid.colnames() and returns
### whatever it returns.
.validate.colnames <- function(conn, colnames)
{
    ## The name of the feature table is not known in advance. However the
    ## db is expected to contain only two tables, one of them always being
    ## "metadata", so the feature table is the one that is left.
    all_tables <- dbListTables(conn)
    feature_table <- all_tables[!(all_tables %in% "metadata")]
    AnnotationDbi:::.valid.colnames(conn, feature_table, colnames)
}

### Return NULL if the feature table has the mandatory columns, otherwise
### the message produced by .validate.colnames().
.valid.feature.table <- function(conn)
{
    ## Only the columns that we are demanding get checked.
    mandatory_cols <- c("chrom", "strand","chromStart","chromEnd")
    .validate.colnames(conn, mandatory_cols)
}


### Validity method for FeatureDb objects: combines the problems found in
### the metadata table with those found in the feature table.
.valid.FeatureDb <- function(x)
{
    conn <- dbconn(x)
    metadata_msg <- AnnotationDbi:::.valid.metadata.table(conn, "Db type",
                                                          "FeatureDb")
    feature_msg <- .valid.feature.table(conn)
    c(metadata_msg, feature_msg)
}


setValidity2("FeatureDb", .valid.FeatureDb)




### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Low-level constructor (not exported).
###

### Wrap DB connection 'conn' in a new FeatureDb object.
FeatureDb <- function(conn)
{
    .FeatureDb$new(conn=conn)
}

--------------------------------------------------------------------------------
/R/TxDb-SELECT-helpers.R:
--------------------------------------------------------------------------------
### =========================================================================
### Helpers for SELECT'ing stuff from a TxDb object
### -------------------------------------------------------------------------
###
### Nothing in this file is exported.
###


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Low-level helpers (schema agnostic) for building SQL queries
###

### Qualify column names with their table ("<table>.<column>"). Vectorized
### thru paste().
.as_qualified <- function(tables, columns) paste(tables, columns, sep=".")

### 'joins' must be a character vector of odd length where table names (odd
### positions) alternate with ON conditions (even positions). Return the
### table names only.
.tables_in_joins <- function(joins)
{
    joins_len <- length(joins)
    stopifnot(joins_len %% 2L == 1L)
    joins[seq(1L, joins_len, by=2L)]
}

### Turn 'joins' (see .tables_in_joins() above for the expected layout) into
### the pieces of a FROM clause: the first table followed by one
### "<type> JOIN <table> ON (<condition>)" string per join.
### 'join_type' will be recycled to the nb of joins (= length(joins) %/% 2).
.build_SQL_FROM <- function(joins, join_type="INNER")
{
    joins_len <- length(joins)
    stopifnot(joins_len %% 2L == 1L)
    SQL <- joins[[1L]]
    if (joins_len == 1L)
        return(SQL)
    njoin <- joins_len %/% 2L
    stopifnot(length(join_type) == 1L || length(join_type) == njoin)
    ON_idx <- 2L * seq_len(njoin)
    ON <- joins[ON_idx]
    Rtables <- joins[ON_idx + 1L]
    c(SQL, paste0(join_type, " JOIN ", Rtables, " ON (", ON, ")"))
}

### Same as .build_SQL_FROM() except that (1) all the joins are INNER joins
### with the exception of the last one, which is of type 'cds_join_type'
### if its right table is "cds", and (2) the result is collapsed into a
### single string.
.build_SQL_FROM_splicing <- function(joins, cds_join_type="LEFT")
{
    joins_len <- length(joins)
    stopifnot(joins_len %% 2L == 1L)
    SQL <- joins[[1L]]
    if (joins_len == 1L)
        return(SQL)
    njoin <- joins_len %/% 2L
    join_type <- rep.int("INNER", njoin)
    if (joins[[joins_len]] == "cds")
        join_type[[njoin]] <- cds_join_type
    paste0(.build_SQL_FROM(joins, join_type), collapse=" ")
}

### 'filter' must be a named list where the names are column names and the
### list elements the allowed values for the corresponding column. Return
### a single string made of "(<column> IN (<values>))" conditions AND'ed
### together, or the empty string if 'filter' is empty.
### Numeric values are injected as-is. Any other value is turned into an SQL
### string literal.
.build_SQL_WHERE <- function(filter)
{
    if (length(filter) == 0L)
        return("")
    conditions <- vapply(seq_along(filter),
        function(i) {
            fi <- filter[[i]]
            if (!is.numeric(fi)) {
                ## A single quote embedded in an SQL string literal must be
                ## doubled, otherwise the value terminates the literal
                ## prematurely (malformed or injectable SQL).
                fi <- paste0("'", gsub("'", "''", fi, fixed=TRUE), "'")
            }
            fi <- paste0("(", paste0(fi, collapse=","), ")")
            fi <- paste0(names(filter)[i], " IN ", fi)
            paste0("(", fi, ")")
        },
        character(1)
    )
    paste0(conditions, collapse=" AND ")
}

### Return the SELECT statement as a character vector of SQL fragments (the
### fragments are NOT collapsed into a single string).
### 'joins' is passed to .build_SQL_FROM() so it can also be a single string
### containing a ready-to-use FROM clause.
.build_SQL_SELECT <- function(columns, joins, distinct=FALSE,
                              filter=list(), orderby=character(0))
{
    SQL <- "SELECT"
    if (distinct)
        SQL <- c(SQL, "DISTINCT")
    SQL <- c(SQL, paste0(columns, collapse=", "),
             "FROM", .build_SQL_FROM(joins))
    if (length(filter) != 0L)
        SQL <- c(SQL, "WHERE", .build_SQL_WHERE(filter))
    if (length(orderby) != 0L)
        SQL <- c(SQL, "ORDER BY", paste0(orderby, collapse=", "))
    SQL
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .TXDB_join_tables() and .TXDB_join_splicing_Rtables()
###

### Return the 'joins' vector (tables alternating with ON conditions) needed
### to join 'tables' together. If 'tables' contains a single table (after
### removing duplicates), that table is returned as-is. Otherwise all the
### tables must be in 'join_order' below. The "splicing" table is
### automatically added when "exon" or "cds" is involved because these
### tables can only be reached thru it.
.TXDB_join_tables <- function(tables)
{
    tables <- unique(tables)
    if (length(tables) == 1L)
        return(tables)
    if (any(tables %in% c("exon", "cds")))
        tables <- c(tables, "splicing")
    join_order <- c("transcript", "splicing", "exon", "cds", "gene")
    ## Fail early and explicitly on tables that we don't know how to join,
    ## rather than silently dropping them and failing later with an obscure
    ## error.
    unjoinable <- setdiff(tables, join_order)
    if (length(unjoinable) != 0L)
        stop("don't know how to join table(s): ",
             paste0(unjoinable, collapse=", "))
    ## Order tables & remove duplicates.
    tables <- intersect(join_order, tables)
    ntables <- length(tables)
    joins <- character(2L * ntables - 1L)
    ON_idx <- 2L * seq_len(ntables - 1L)
    ON <- vapply(tables[-1L],
        function(Rtable) {
            if (Rtable == "exon") {
                USING <- "_exon_id"
                Ltable <- "splicing"
            } else if (Rtable == "cds") {
                USING <- "_cds_id"
                Ltable <- "splicing"
            } else {
                ## "splicing" and "gene" are joined to the first table.
                USING <- "_tx_id"
                Ltable <- tables[[1L]]
            }
            Lcolumn <- .as_qualified(Ltable, USING)
            Rcolumn <- .as_qualified(Rtable, USING)
            paste(Lcolumn, Rcolumn, sep="=")
        },
        character(1),
        USE.NAMES=FALSE
    )
    joins[ON_idx] <- ON
    joins[c(1L, ON_idx + 1L)] <- tables
    joins
}

### Return the 'joins' vector needed to join the "splicing" table with the
### tables in 'tables'. All of them must belong to TXDB_SPLICING_BUNDLE.
.TXDB_join_splicing_Rtables <- function(tables=character(0))
{
    if (!all(tables %in% TXDB_SPLICING_BUNDLE))
        stop("all tables must be in TXDB_SPLICING_BUNDLE")
    tables <- c("splicing", tables)
    ## Order tables & remove duplicates.
    tables <- intersect(TXDB_SPLICING_BUNDLE, tables)
    if (length(tables) == 1L)
        return(tables)
    joins <- character(2L * length(tables) - 1L)
    ON_idx <- 2L * seq_len(length(tables) - 1L)
    Rtables <- tables[-1L]
    USING <- TXDB_SPLICING_JOIN_USING[Rtables]
    Lcolumns <- .as_qualified("splicing", USING)
    Rcolumns <- .as_qualified(Rtables, USING)
    ON <- paste(Lcolumns, Rcolumns, sep="=")
    joins[ON_idx] <- ON
    joins[c(1L, ON_idx + 1L)] <- tables
    joins
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The 2 flexible helpers for SELECT'ing stuff from a TxDb object:
###   - TxDb_SELECT_from_INNER_JOIN()
###   - TxDb_SELECT_from_splicing_bundle()
### They should satisfy the needs of most extractors defined in the package.
###

### The columns in 'columns' + those involved thru 'filter' and 'orderby' are
### collected and their corresponding tables are INNER JOIN'ed.
### The tables of the 'orderby' columns don't contribute to the joins: they
### must already be involved thru 'table', 'columns', or 'filter'.
### Return the result of queryAnnotationDb().
TxDb_SELECT_from_INNER_JOIN <- function(txdb, table, columns, filter=list(),
                                        orderby=character(0))
{
    schema_version <- TxDb_schema_version(txdb)
    tables <- TXDB_column2table(columns, from_table=table,
                                schema_version=schema_version)
    where_columns <- names(filter)
    where_tables <- TXDB_column2table(where_columns, from_table=table,
                                      schema_version=schema_version)
    joins <- .TXDB_join_tables(c(table, tables, where_tables))
    orderby_tables <- TXDB_column2table(orderby, from_table=table,
                                        schema_version=schema_version)
    stopifnot(all(orderby_tables %in% .tables_in_joins(joins)))
    use_joins <- length(joins) > 1L
    if (use_joins) {
        ## Column names need to be qualified only when more than one table
        ## is involved.
        columns <- .as_qualified(tables, columns)
        names(filter) <- .as_qualified(where_tables, where_columns)
        orderby <- .as_qualified(orderby_tables, orderby)
    }
    ## .build_SQL_SELECT() uses INNER joins.
    SQL <- .build_SQL_SELECT(columns, joins, distinct=use_joins,
                             filter=filter, orderby=orderby)
    queryAnnotationDb(txdb, SQL)
}

### Can only involve columns (thru 'columns', 'filter', and 'orderby') that
### belong to the tables in TXDB_SPLICING_BUNDLE at the moment.
### 'cds_join_type' is the type of join used for the "cds" table (see
### .build_SQL_FROM_splicing() above).
### Return the result of queryAnnotationDb().
TxDb_SELECT_from_splicing_bundle <- function(txdb, columns,
                                             filter=list(),
                                             orderby=character(0),
                                             cds_join_type="LEFT")
{
    schema_version <- TxDb_schema_version(txdb)
    tables <- TXDB_column2table(columns, from_table="splicing",
                                schema_version=schema_version)
    where_columns <- names(filter)
    where_tables <- TXDB_column2table(where_columns, from_table="splicing",
                                      schema_version=schema_version)
    orderby_tables <- TXDB_column2table(orderby, from_table="splicing",
                                        schema_version=schema_version)
    joins <- .TXDB_join_splicing_Rtables(c(tables, where_tables,
                                           orderby_tables))
    use_joins <- length(joins) > 1L
    if (use_joins) {
        columns <- .as_qualified(tables, columns)
        names(filter) <- .as_qualified(where_tables, where_columns)
        orderby <- .as_qualified(orderby_tables, orderby)
    }
    ## 'from' is a single string so .build_SQL_SELECT() will use it as-is.
    from <- .build_SQL_FROM_splicing(joins, cds_join_type=cds_join_type)
    SQL <- .build_SQL_SELECT(columns, from, distinct=FALSE,
                             filter=filter, orderby=orderby)
    queryAnnotationDb(txdb, SQL)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Convenience wrappers to the above flexible helpers for SELECT'ing stuff
### from a given TxDb table
###

### Select all the columns of the "chrominfo" table.
TxDb_SELECT_from_chrominfo <- function(txdb, filter=list(),
                                       orderby="_chrom_id")
{
    schema_version <- TxDb_schema_version(txdb)
    columns <- TXDB_table_columns("chrominfo", schema_version=schema_version)
    TxDb_SELECT_from_INNER_JOIN(txdb, "chrominfo", columns,
                                filter=filter, orderby=orderby)
}


### Select all the columns of the "transcript" table that exist in the
### schema version of 'txdb'.
TxDb_SELECT_from_transcript <- function(txdb, filter=list(),
                                        orderby="_tx_id")
{
    tx_columns <- TXDB_table_columns("transcript",
                                     schema_version=TxDb_schema_version(txdb))
    TxDb_SELECT_from_INNER_JOIN(txdb, "transcript", tx_columns,
                                filter=filter, orderby=orderby)
}

### Select rows from the virtual table obtained by joining the "splicing",
### "exon", and "cds" tables together. For the "cds" table, only the id,
### name, start, and end columns are selected.
TxDb_SELECT_from_splicings <- function(txdb, filter=list(),
                                       orderby=c("_tx_id", "exon_rank"),
                                       cds_join_type="LEFT")
{
    schema_version <- TxDb_schema_version(txdb)
    columns_of <- function(table)
        TXDB_table_columns(table, schema_version=schema_version)
    cds_columns <- columns_of("cds")[c("id", "name", "start", "end")]
    ## Columns shared between tables (e.g. "_exon_id") must show up only
    ## once in the selection.
    bundle_columns <- unique(c(columns_of("splicing"),
                               columns_of("exon"),
                               cds_columns))
    TxDb_SELECT_from_splicing_bundle(txdb, bundle_columns,
                                     filter=filter, orderby=orderby,
                                     cds_join_type=cds_join_type)
}

### Select all the columns of the "gene" table.
TxDb_SELECT_from_gene <- function(txdb, filter=list(),
                                  orderby=c("_tx_id", "gene_id"))
{
    gene_columns <- TXDB_table_columns("gene",
                                       schema_version=TxDb_schema_version(txdb))
    TxDb_SELECT_from_INNER_JOIN(txdb, "gene", gene_columns,
                                filter=filter, orderby=orderby)
}

--------------------------------------------------------------------------------
/R/TxDb-schema.R:
--------------------------------------------------------------------------------
### =========================================================================
### TxDb schema
### -------------------------------------------------------------------------
###
### Nothing in this file is exported.
###
### 7 tables:
###   - chrominfo
###   - transcript
###   - exon
###   - cds
###   - splicing
###   - gene
###   - metadata (not described here)


### Not exported.
DB_TYPE_NAME <- "Db type"
DB_TYPE_VALUE <- "TxDb"     # same as the name of the TxDb class
DB_SCHEMA_VERSION <- "1.2"  # DON'T FORGET TO BUMP THIS WHEN YOU CHANGE THE
                            # SCHEMA

### Return the *effective* schema version, that is, the DBSCHEMAVERSION
### value found in the metadata of the db, as a numeric_version object.
### 'txdb' can be a TxDb object or anything else, in which case it is used
### directly as the DB connection.
TxDb_schema_version <- function(txdb)
{
    if (is(txdb, "TxDb")) {
        conn <- dbconn(txdb)
    } else {
        conn <- txdb
    }
    numeric_version(AnnotationDbi:::.getMetaValue(conn, "DBSCHEMAVERSION"))
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Table columns
###

### 'chrominfo' table

### Column definitions (SQL type + constraints) named by column.
TXDB_CHROMINFO_COLDEFS <- c(
    `_chrom_id`="INTEGER PRIMARY KEY",
    chrom="TEXT UNIQUE NOT NULL",
    length="INTEGER NULL",
    is_circular="INTEGER NULL"
)

TXDB_CHROMINFO_COLUMNS <- names(TXDB_CHROMINFO_COLDEFS)

### 'transcript', 'exon', and 'cds' tables (a.k.a. "feature tables")

### Column definitions named by column *tag*. The tags get turned into
### actual column names elsewhere.
TXDB_FEATURE_COLDEFS <- c(
    id="INTEGER PRIMARY KEY",
    name="TEXT NULL",
    type="TEXT NULL",
    chrom="TEXT NOT NULL",
    strand="TEXT NOT NULL",
    start="INTEGER NOT NULL",
    end="INTEGER NOT NULL"
)

### Tables "transcript", "exon", and "cds" must at least have columns with
### the core column tags.
TXDB_CORE_COLTAGS <- c("id", "chrom", "strand", "start", "end")
TXDB_ALL_COLTAGS <- names(TXDB_FEATURE_COLDEFS)
## The "exon" and "cds" tables have no "type" column.
TXDB_EXON_OR_CDS_COLTAGS <- setdiff(TXDB_ALL_COLTAGS, "type")

### Turn column tags into actual column names by prefixing them with
### "<prefix>_". The column for the "id" tag, which must be the first tag,
### gets an additional leading underscore (e.g. "_tx_id").
### Return the column names in a character vector named by the tags.
.make_feature_columns <- function(prefix, tags)
{
    id_pos <- match("id", tags)
    stopifnot(identical(id_pos, 1L))
    underscore <- c("_", rep.int("", length(tags) - 1L))
    fmt <- paste0(underscore, "%s_", tags)
    setNames(sprintf(fmt, prefix), tags)
}

TXDB_TRANSCRIPT_COLUMNS <- .make_feature_columns("tx", TXDB_ALL_COLTAGS)
TXDB_EXON_COLUMNS <- .make_feature_columns("exon", TXDB_EXON_OR_CDS_COLTAGS)
TXDB_CDS_COLUMNS <- .make_feature_columns("cds", TXDB_EXON_OR_CDS_COLTAGS)

### 'splicing' table

### Column definitions (SQL type + constraints) named by column.
TXDB_SPLICING_COLDEFS <- c(
    `_tx_id`="INTEGER NOT NULL",
    exon_rank="INTEGER NOT NULL",
    `_exon_id`="INTEGER NOT NULL",
    `_cds_id`="INTEGER NULL",
    cds_phase="INTEGER NULL"
)

TXDB_SPLICING_COLUMNS <- names(TXDB_SPLICING_COLDEFS)

### 'gene' table

### Column definitions (SQL type + constraints) named by column.
TXDB_GENE_COLDEFS <- c(
    gene_id="TEXT NOT NULL",
    `_tx_id`="INTEGER NOT NULL"
)

TXDB_GENE_COLUMNS <- names(TXDB_GENE_COLDEFS)


### Order of tables matters! "transcript" must be before "splicing" and "gene",
### and "exon" and "cds" must be before "splicing". See TXDB_column2table()
### below why.
TXDB_COLUMNS <- list(
    chrominfo=TXDB_CHROMINFO_COLUMNS,
    transcript=TXDB_TRANSCRIPT_COLUMNS,
    exon=TXDB_EXON_COLUMNS,
    cds=TXDB_CDS_COLUMNS,
    splicing=TXDB_SPLICING_COLUMNS,
    gene=TXDB_GENE_COLUMNS
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Build CREATE TABLE statements
###

### 'coldefs' must be a character vector of column definitions named by
### column, and 'constraints' a character vector of table constraints (or
### NULL). Return the CREATE TABLE statement as a single string with one
### column definition or constraint per line.
.build_SQL_CREATE_TABLE <- function(table, coldefs, constraints=NULL)
{
    entries <- c(paste(names(coldefs), coldefs), constraints)
    entries <- paste("\n ", entries, collapse=",")
    sprintf("CREATE TABLE %s (%s\n)", table, entries)
}

build_SQL_CREATE_chrominfo_table <- function()
{
    .build_SQL_CREATE_TABLE("chrominfo", TXDB_CHROMINFO_COLDEFS)
}

### 'table' must be the name of one of the "feature tables" ("transcript",
### "exon", or "cds"). The chrom column of the table references the
### 'chrominfo' table.
build_SQL_CREATE_feature_table <- function(table)
{
    table_columns <- TXDB_COLUMNS[[table]]
    ## 'table_columns' is named by column tag, which is how the column
    ## definitions are looked up.
    table_coldefs <- setNames(TXDB_FEATURE_COLDEFS[names(table_columns)],
                              table_columns)
    chrom_fkey <- paste0("FOREIGN KEY (", table_columns[["chrom"]],
                         ") REFERENCES chrominfo (chrom)")
    .build_SQL_CREATE_TABLE(table, table_coldefs, chrom_fkey)
}

build_SQL_CREATE_splicing_table <- function()
{
    id_prefixes <- c("tx", "exon", "cds")
    referenced_tables <- c("transcript", "exon", "cds")
    ## One foreign key per "feature table".
    foreign_keys <- sprintf("FOREIGN KEY (_%s_id) REFERENCES %s",
                            id_prefixes, referenced_tables)
    .build_SQL_CREATE_TABLE("splicing", TXDB_SPLICING_COLDEFS,
                            c("UNIQUE (_tx_id, exon_rank)", foreign_keys))
}

build_SQL_CREATE_gene_table <- function()
{
    gene_constraints <- c("UNIQUE (gene_id, _tx_id)",
                          "FOREIGN KEY (_tx_id) REFERENCES transcript")
    .build_SQL_CREATE_TABLE("gene", TXDB_GENE_COLDEFS, gene_constraints)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Relationship between the 'splicing' table and the "feature tables"
###
### The 'splicing' table is the glue between the "feature tables".
###

### The "splicing right tables" can be bundled to the "splicing" table with
### a LEFT JOIN using the TXDB_SPLICING_JOIN_USING columns.
TXDB_SPLICING_RTABLES <- c("transcript", "exon", "cds")
TXDB_SPLICING_JOIN_USING <- setNames(c("_tx_id", "_exon_id", "_cds_id"),
                                     TXDB_SPLICING_RTABLES)
TXDB_SPLICING_BUNDLE <- c("splicing", TXDB_SPLICING_RTABLES)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Helper functions
###

### Names of the tables described in TXDB_COLUMNS.
TXDB_tables <- function() names(TXDB_COLUMNS)

### Return the columns of table 'table'. If 'schema_version' is not NA, the
### columns that don't exist in that version of the schema are removed:
### "tx_type" was added in 1.1 and "cds_phase" in 1.2.
TXDB_table_columns <- function(table, schema_version=NA)
{
    columns <- TXDB_COLUMNS[[table]]
    if (is.na(schema_version))
        return(columns)
    is_older_than <- function(version)
        schema_version < numeric_version(version)
    if (table == "transcript" && is_older_than("1.1"))
        columns <- columns[columns != "tx_type"]
    if (table == "splicing" && is_older_than("1.2"))
        columns <- columns[columns != "cds_phase"]
    columns
}

### When the same column belongs to more than one table (e.g. "_tx_id",
### "_exon_id", or "_cds_id"), then the table for which the column is a
### primary key is chosen by default. This behavior can be changed by passing
### the name of a table to 'from_table' in which case the priority is given to
### that table.
### Map each column in 'columns' to the first table in TXDB_tables() that
### has it (in the specified schema version). Columns that also belong to
### 'from_table' (if not NA) are mapped to 'from_table' instead.
### Return a character vector parallel to 'columns'. An error is raised if
### a column cannot be found in any table.
TXDB_column2table <- function(columns, from_table=NA, schema_version=NA)
{
    if (length(columns) == 0L)
        return(character(0))
    find_table <- function(column) {
        for (candidate in TXDB_tables()) {
            candidate_columns <- TXDB_table_columns(candidate,
                                         schema_version=schema_version)
            if (column %in% candidate_columns)
                return(candidate)
        }
        in_schema <- ""
        if (!is.na(schema_version))
            in_schema <- paste0(" in db schema ",
                                as.character(schema_version))
        stop(column, ": no such column", in_schema)
    }
    tables <- vapply(columns, find_table, character(1))
    if (is.na(from_table))
        return(tables)
    ## Give priority to 'from_table' for the columns that it contains.
    from_table_columns <- TXDB_table_columns(from_table,
                                             schema_version=schema_version)
    tables[columns %in% from_table_columns] <- from_table
    tables
}

--------------------------------------------------------------------------------
/R/exonicParts.R:
--------------------------------------------------------------------------------
### =========================================================================
### Extraction of exonic and intronic parts
### -------------------------------------------------------------------------
###
### For all functions in this file, 'txdb' must be a TxDb object or any
### object that supports transcripts() and exonsBy() (e.g. EnsDb object).
###


### Works on whatever 'x' can be used as a splitting factor in splitAsList().
### TODO: Rename and move to a more appropriate place (IRanges?)
.rank_in_group <- function(x)
{
    ## Positions in 'x' grouped by value of 'x'.
    pos_by_group <- splitAsList(seq_along(x), x)
    pos <- unlist(pos_by_group, use.names=FALSE)
    ## 'ranks' is first computed in group order (1, 2, ... within each
    ## group), then put back in the order of 'x'.
    ranks <- sequence(lengths(pos_by_group))
    ranks[pos] <- ranks
    ranks
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### 3 helper functions used internally by exonicParts() and intronicParts()
###

### Return a GRanges object with 1 range per transcript and metadata columns
### tx_id, tx_name, and gene_id.
### If 'drop.geneless' is FALSE (the default) then the transcripts are
### returned in the same order as with transcripts(), which is expected
### to be by transcript id (tx_id). Otherwise they are ordered first by
### gene id (gene_id), then by transcript id.
tidyTranscripts <- function(txdb, drop.geneless=FALSE)
{
    tx <- transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id"))
    mcols(tx)$gene_id <- as.character(mcols(tx)$gene_id)
    if (!drop.geneless)
        return(tx)
    tx_mcols <- mcols(tx)
    ## With 'na.last=NA', order() leaves out the elements with an NA.
    keep_idx <- order(tx_mcols$gene_id, tx_mcols$tx_id, na.last=NA)
    tx[keep_idx]
}

### Return a GRangesList object parallel to 'tx_ids'. The supplied 'tx_ids'
### must be a subset of 'mcols(transcripts(txdb))$tx_id'.
.exons_by_txids <- function(txdb, tx_ids)
{
    if (anyDuplicated(tx_ids))
        stop(wmsg("\"transcripts\" method for ", class(txdb), " objects ",
                  "seems broken, sorry"))
    ex_by_tx <- exonsBy(txdb, by="tx")
    tx_ids <- as.character(tx_ids)
    ## Nothing to do if the exons are already grouped in the requested
    ## order.
    if (identical(tx_ids, names(ex_by_tx)))
        return(ex_by_tx)
    tx2group <- match(tx_ids, names(ex_by_tx))
    if (anyNA(tx2group))
        stop(wmsg("\"exonsBy\" method for ", class(txdb), " objects ",
                  "seems broken, sorry"))
    ex_by_tx[tx2group]
}

### Return a GRanges object with 1 range per exon and metadata columns
### tx_id, tx_name, gene_id, exon_id, exon_name, and exon_rank.
### If 'drop.geneless' is FALSE (the default) then the exons are ordered first
### by transcript id (tx_id), then by exon rank (exon_rank). Otherwise they
### are ordered first by gene id (gene_id), then by transcript id, and then
### by exon rank.
### The transcript-level metadata columns are placed before the exon-level
### ones.
tidyExons <- function(txdb, drop.geneless=FALSE)
{
    tidy_tx <- tidyTranscripts(txdb, drop.geneless=drop.geneless)
    exons_grl <- .exons_by_txids(txdb, mcols(tidy_tx)$tx_id)

    ## Index of the parent transcript, repeated once per exon.
    tx_idx <- rep(seq_along(tidy_tx), lengths(exons_grl))
    tx_mcols <- mcols(tidy_tx)[tx_idx, , drop=FALSE]

    exons_gr <- unlist(exons_grl, use.names=FALSE)
    mcols(exons_gr) <- cbind(tx_mcols, mcols(exons_gr))
    exons_gr
}

### Return a GRanges object with 1 range per intron and metadata columns
### tx_id, tx_name, and gene_id.
### If 'drop.geneless' is FALSE (the default) then the introns are ordered
### by transcript id (tx_id). Otherwise they are ordered first by gene id
### (gene_id), then by transcript id.
### The introns of a transcript are obtained by removing its exons from its
### range with psetdiff().
tidyIntrons <- function(txdb, drop.geneless=FALSE)
{
    tidy_tx <- tidyTranscripts(txdb, drop.geneless=drop.geneless)
    exons_grl <- .exons_by_txids(txdb, mcols(tidy_tx)$tx_id)
    introns_grl <- psetdiff(tidy_tx, exons_grl)

    ## Index of the parent transcript, repeated once per intron.
    tx_idx <- rep(seq_along(tidy_tx), lengths(introns_grl))

    introns_gr <- unlist(introns_grl, use.names=FALSE)
    mcols(introns_gr) <- mcols(tidy_tx)[tx_idx, , drop=FALSE]
    introns_gr
}

### Break the ranges in 'x' into disjoint parts. Each metadata column of 'x'
### is turned into a list-like column holding, for each part, the unique
### non-NA values found via the "revmap" produced by disjoin().
### If 'linked.to.single.gene.only' is TRUE, only the parts with exactly one
### gene id are kept, 'gene_id' is turned into a character vector, and the
### 'extra_mcol' metadata column (rank of the part within its gene) is added.
.break_in_parts <- function(x, linked.to.single.gene.only=FALSE,
                               extra_mcol="exonic_part")
{
    parts <- disjoin(x, with.revmap=TRUE)
    revmap <- mcols(parts)$revmap

    collapse_col <- function(col) {
        vals_by_part <- unique(extractList(col, revmap))
        vals_by_part[!is.na(vals_by_part)]
    }
    mcols(parts) <- DataFrame(lapply(mcols(x), collapse_col))

    if (!linked.to.single.gene.only)
        return(parts)

    has_single_gene <- elementNROWS(mcols(parts)$gene_id) == 1L
    parts <- parts[which(has_single_gene)]
    parts_gene_id <- as.character(mcols(parts)$gene_id)
    mcols(parts)$gene_id <- parts_gene_id
    ## The "exonic_part" or "intronic_part" metadata column is added for
    ## compatibility with old disjointExons().
    mcols(parts)[[extra_mcol]] <- .rank_in_group(parts_gene_id)
    parts
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### exonicParts() and intronicParts()
###
### exonicParts() is a replacement for the old disjointExons() function, with
### the following differences/improvements:
###
###   1. Argument 'linked.to.single.gene.only' in exonicParts() replaces
###      argument 'aggregateGenes' in disjointExons(), but has the opposite
###      meaning, that is:
###          exonicParts(txdb, linked.to.single.gene.only=TRUE)
###      returns the same exonic parts as:
###          disjointExons(txdb, aggregateGenes=FALSE)
###
###   2. Unlike disjointExons(txdb, aggregateGenes=TRUE),
###      exonicParts(txdb, linked.to.single.gene.only=FALSE)
###      does NOT discard exon parts that are not linked to a gene.
###
###   3. exonicParts() is almost 2x more efficient than disjointExons().
###
###   4. exonicParts() works out-of-the-box on any TxDb-like object that
###      supports the transcripts() and exonsBy() extractors, e.g. on an
###      EnsDb object.
###
### Note that disjointExons() was deprecated in BioC 3.13, then defunct in
### BioC 3.15, and finally removed from BioC 3.17.

### Return a disjoint and strictly sorted GRanges object with 1 range per
### exonic part and with metadata columns tx_id, tx_name, gene_id, exon_id,
### exon_name, and exon_rank.
### When 'linked.to.single.gene.only' is TRUE, the geneless transcripts are
### dropped upfront and only the parts linked to exactly one gene are kept
### (see .break_in_parts() for the extra "exonic_part" metadata column).
exonicParts <- function(txdb, linked.to.single.gene.only=FALSE)
{
    single_gene_only <- linked.to.single.gene.only
    if (!isTRUEorFALSE(single_gene_only))
        stop("'linked.to.single.gene.only' must be TRUE or FALSE")
    tidy_ex <- tidyExons(txdb, drop.geneless=single_gene_only)
    .break_in_parts(tidy_ex, single_gene_only, extra_mcol="exonic_part")
}

### Return a disjoint and strictly sorted GRanges object with 1 range per
### intronic part and with metadata columns tx_id, tx_name, and gene_id.
### 'linked.to.single.gene.only' has the same meaning as for exonicParts(),
### except that the extra metadata column is named "intronic_part".
intronicParts <- function(txdb, linked.to.single.gene.only=FALSE)
{
    single_gene_only <- linked.to.single.gene.only
    if (!isTRUEorFALSE(single_gene_only))
        stop("'linked.to.single.gene.only' must be TRUE or FALSE")
    tidy_introns <- tidyIntrons(txdb, drop.geneless=single_gene_only)
    .break_in_parts(tidy_introns, single_gene_only,
                    extra_mcol="intronic_part")
}

--------------------------------------------------------------------------------
/R/extendExonsIntoIntrons.R:
--------------------------------------------------------------------------------
### =========================================================================
### Extend exons by a given number of bases into their adjacent introns
### -------------------------------------------------------------------------
###

### 'ex_by_tx' must be a GRangesList object and 'extent' a single number
### (coerced to integer). Only the list elements with at least 2 exons are
### modified: their first exon is widened by 'extent' with its start fixed,
### their last exon is widened by 'extent' with its end fixed, and each of
### their other exons is widened by '2*extent' around its center.
### Returns the modified GRangesList object.
extendExonsIntoIntrons <- function(ex_by_tx, extent=2)
{
    if (!is(ex_by_tx, "GRangesList"))
        stop(wmsg("'ex_by_tx' must be a GRangesList object"))
    if (!isSingleNumber(extent))
        stop(wmsg("'extent' must be a single number"))
    if (!is.integer(extent))
        extent <- as.integer(extent)

    ## Widen all the ranges in GRangesList object 'grl' by 'by', keeping
    ## the 'fix' anchor in place, and preserve the shape of 'grl'.
    widen <- function(grl, by, fix) {
        flat <- unlist(grl, use.names=FALSE)
        flat <- resize(flat, width(flat) + by, fix=fix, use.names=FALSE)
        relist(flat, grl)
    }

    multi_ex_idx <- which(lengths(ex_by_tx) >= 2L)
    multi_ex <- ex_by_tx[multi_ex_idx]

    first_ex <- widen(heads(multi_ex, n=1L), extent, "start")
    last_ex <- widen(tails(multi_ex, n=1L), extent, "end")
    ## Intermediate exons: everything but the first and last exons.
    mid_ex <- widen(tails(heads(multi_ex, n=-1L), n=-1L),
                    2L*extent, "center")

    ## Put exons back together.
    ex_by_tx[multi_ex_idx] <- pc(first_ex, mid_ex, last_ex)
    ex_by_tx
}

--------------------------------------------------------------------------------
/R/extractTranscriptSeqs.R:
--------------------------------------------------------------------------------
### =========================================================================
### extractTranscriptSeqs() and related tools
### -------------------------------------------------------------------------


### Turn the user-supplied 'strand' into a strand vector parallel to
### 'unlist(transcripts)'. 'strand' can be a list-like object with the same
### shape as 'transcripts', or an atomic vector, factor, or Rle object that
### gets recycled along 'transcripts' and then repeated for each range of
### the corresponding transcript.
.unlist_strand <- function(strand, transcripts)
{
    is_list_like <- is.list(strand) || is(strand, "List")
    if (is_list_like) {
        same_shape <- identical(unname(elementNROWS(strand)),
                                unname(elementNROWS(transcripts)))
        if (!same_shape)
            stop(wmsg("when 'strand' is a list-like object, it must have ",
                      "the same \"shape\" as 'transcripts' (i.e. same length ",
                      "plus 'strand[[i]]' must have the same length as ",
                      "'transcripts[[i]]' for all 'i')"))
        return(strand(unlist(strand, use.names=FALSE)))
    }
    is_supported <- is.vector(strand) || is.factor(strand) ||
                    is(strand, "Rle")
    if (!is_supported)
        stop(wmsg("'strand' must be an atomic vector, a factor, ",
                  "an Rle object, or a list-like object"))
    tx_strand <- S4Vectors:::V_recycle(strand(strand), transcripts,
                                       "strand", "transcripts")
    rep.int(tx_strand, elementNROWS(transcripts))
}

setGeneric("extractTranscriptSeqs", signature="x",
    function(x, transcripts, ...) standardGeneric("extractTranscriptSeqs")
)

### 'transcripts' must be an IntegerRangesList object where each list
### element contains the exon ranges of a transcript. The exon sequences are
### extracted from 'x', reverse-complemented for the exons on the "-" strand,
### and pasted together transcript by transcript.
setMethod("extractTranscriptSeqs", "DNAString",
    function(x, transcripts, strand="+")
    {
        if (!is(transcripts, "IntegerRangesList"))
            stop(wmsg("when 'x' is a DNAString object, ",
                      "'transcripts' must be an IntegerRangesList object"))
        exon_strand <- .unlist_strand(strand, transcripts)
        if (!all(exon_strand %in% c("+", "-")))
            stop(wmsg("'strand' can only contain \"+\" and/or \"-\" values. ",
                      "\"*\" is not allowed."))
        exon_seqs <- extractList(x, unlist(transcripts, use.names=FALSE))
        minus_idx <- which(exon_strand == "-")
        exon_seqs[minus_idx] <- reverseComplement(exon_seqs[minus_idx])
        unstrsplit(relist(exon_seqs, transcripts))
    }
)

### Check for transcripts that have exons located on more than one
### chromosome.
### Fails if any is found. The error message shows the names of up to 2
### offending transcripts when 'tx1' has names.
.check_exon_chrom <- function(tx1)
{
    nrun_per_tx <- elementNROWS(runLength(seqnames(tx1)))
    bad_idx <- which(nrun_per_tx != 1L)
    if (length(bad_idx) == 0L)
        return()
    tx1_names <- names(tx1)
    examples <- ""
    if (!is.null(tx1_names)) {
        shown_idx <- head(bad_idx, n=2L)
        examples <- paste0(tx1_names[shown_idx], collapse=", ")
        if (length(bad_idx) > length(shown_idx))
            examples <- paste0("e.g. ", examples, ", etc...")
        examples <- paste0(" (", examples, ")")
    }
    stop(wmsg("Some transcripts", examples, " have exons located on ",
              "more than one chromosome. This is not supported yet."))
}

### Check the "exon_rank" inner metadata column if present. When 'transcripts'
### contains CDS parts (instead of exons) grouped by transcript, some of the
### lowest or/and highest exon ranks can be missing.
### So the only requirement is that, within each transcript, the ranks are
### >= 1 and form a run of increasing consecutive integers.
.check_exon_rank <- function(tx1)
{
    exon_rank <- mcols(tx1@unlistData)$exon_rank
    if (is.null(exon_rank))
        return()
    ## Common prefix of all the error/warning messages below.
    what <- "\"exon_rank\" inner metadata column in GRangesList "
    if (!is.numeric(exon_rank))
        stop(wmsg(what, "object 'transcripts' is not numeric"))
    if (!is.integer(exon_rank)) {
        warning(wmsg(what, "object 'transcripts' is not integer"))
        exon_rank <- as.integer(exon_rank)
    }
    if (anyNA(exon_rank))
        stop(wmsg(what, "object 'transcripts' contains NAs"))

    partitioning <- PartitioningByEnd(tx1)
    ## Using Views() + viewMins() is equivalent to:
    ##   min(relist(exon_rank, partitioning))
    ## but much faster!
    min_rank <- viewMins(Views(exon_rank, partitioning))
    if (any(min_rank < 1L))
        stop(wmsg(what, "object 'transcripts' contains ranks < 1"))
    ## What the ranks should look like if they were consecutive within
    ## each transcript, starting at the smallest rank of the transcript.
    expected_rank <- sequence(elementNROWS(partitioning), from=min_rank)
    if (!identical(expected_rank, unname(exon_rank)))
        stop(wmsg(what, "object 'transcripts' does not contain increasing ",
                  "consecutive ranks for some transcripts"))
}

### TODO: Incorporate this fast path to "unlist" method for XStringSet objects.
### Unlist XStringSet object 'x' into a single sequence. The fast path is
### currently switched off (see below) so this is just
### 'unlist(x, use.names=FALSE)'.
.fast_XStringSet_unlist <- function(x)
{
    # Disabling the fast path for now. Until I understand why using it
    # causes extractTranscriptSeqs(Hsapiens, TxDb.Hsapiens.UCSC.hg18.knownGene)
    # to use more memory (319.7 Mb) than when NOT using it (288.9 Mb).
    if (FALSE) {
        x_len <- length(x)
        if (x_len != 0L && length(x@pool) == 1L) {
            x_ranges <- x@ranges
            x_start <- start(x_ranges)
            x_end <- end(x_ranges)
            if (identical(x_end[-x_len] + 1L, x_start[-1L])) {
                ## The ranges are adjacent. We can unlist() without copying
                ## the sequence data!
                cat("using fast path (", x_len, ") ...\n")
                ans_class <- elementType(x)
                ans_shared <- x@pool[[1L]]
                ans_offset <- x_start[1L] - 1L
                ans_length <- x_end[x_len] - ans_offset
                ans <- new2(ans_class, shared=ans_shared,
                                       offset=ans_offset,
                                       length=ans_length,
                                       check = FALSE)
                return(ans)
            }
        }
    }
    unlist(x, use.names=FALSE)
}

### Extract the sequences of 'ranges' on sequence 'seqname' of 'x' with
### getSeq(), and concatenate them into a single sequence.
.extract_and_combine <- function(x, seqname, ranges)
{
    seqs <- getSeq(x, GRanges(seqname, ranges))
    ## For "getSeq" methods (like the method for GmapGenome objects) that
    ## return a character vector.
    if (is.character(seqs))
        seqs <- DNAStringSet(seqs)
    .fast_XStringSet_unlist(seqs)
}

### Extract the sequences of the transcripts in 'transcripts' (GRangesList)
### that are located on 'seqlevel'. The actual extraction is delegated to
### the extractTranscriptSeqs() method selected for the sequence obtained
### from 'x'.
.extractTranscriptSeqsFromOneSeq <- function(seqlevel, x, transcripts)
{
    ## Keep only the transcripts located on 'seqlevel'.
    seqlevels(transcripts, pruning.mode="coarse") <- seqlevel
    strand <- strand(transcripts)
    transcripts <- ranges(transcripts)
    if (seqlevel %in% seqlevels(x)) {
        ## We try to load the less stuff possible i.e. only the nucleotides
        ## that participate in at least one exon.
        exons <- unlist(transcripts, use.names=FALSE)
        ranges_to_load <- reduce(exons, with.inframe.attrib=TRUE)
        x <- .extract_and_combine(x, seqlevel, ranges_to_load)
        ## The "inframe" attribute holds the exon ranges expressed relative
        ## to the sequence that was just loaded.
        exons <- attr(ranges_to_load, "inframe")
        transcripts <- relist(exons, transcripts)
    } else {
        ## Why do we need this?
        regex <- paste0("^", seqlevel, "$")
        x <- getSeq(x, regex)
    }
    extractTranscriptSeqs(x, transcripts, strand=strand)
}

### Default "extractTranscriptSeqs" method. 'transcripts' must be a
### GRangesList object or any object for which 'exonsBy(transcripts, by="tx")'
### works (the additional arguments are passed to exonsBy() and are allowed
### only in that case). Returns a DNAStringSet object parallel to the
### GRangesList object, with an empty sequence for the transcripts that have
### no exons.
.extractTranscriptSeqs_default <- function(x, transcripts, ...)
{
    if (is(transcripts, "GRangesList")) {
        if (length(list(...)) != 0L)
            stop(wmsg("additional arguments are allowed only when ",
                      "'transcripts' is not a GRangesList object"))
    } else {
        transcripts <- try(exonsBy(transcripts, by="tx", ...),
                           silent=TRUE)
        if (is(transcripts, "try-error"))
            stop(wmsg("failed to extract the exon ranges from 'transcripts' ",
                      "with exonsBy(transcripts, by=\"tx\", ...)"))
    }
    ## 'tx1' contains the non-empty transcripts only.
    idx1 <- which(elementNROWS(transcripts) != 0L)
    tx1 <- transcripts[idx1]
    .check_exon_chrom(tx1)
    .check_exon_rank(tx1)

    tx1_seqlevels_in_use <- seqlevelsInUse(tx1)
    x_seqlevels <- seqlevels(x)
    ok <- tx1_seqlevels_in_use %in% x_seqlevels
    if (!all(ok)) {
        if (all(!ok))
            stop(wmsg("the transcripts in 'transcripts' are on chromosomes ",
                      "that are not in 'x'"))
        seqlevel_not_in_x <- tx1_seqlevels_in_use[!ok][[1L]]
        stop(wmsg("some transcripts in 'transcripts' are on chromosomes ",
                  "that are not in 'x' (e.g. some transcripts are on ",
                  "chromosome \"", seqlevel_not_in_x, "\" but this ",
                  "chromosome is not in 'x')"))
    }
    seqlevels(tx1) <- tx1_seqlevels_in_use
    ## 'seqnames1' is just an ordinary factor (not Rle) parallel to 'tx1'.
    seqnames1 <- unlist(runValue(seqnames(tx1)), use.names=FALSE)
    ## One DNAStringSet per chromosome in use.
    dnaset_list <- lapply(levels(seqnames1),
                          .extractTranscriptSeqsFromOneSeq, x, tx1)
    ans <- rep.int(DNAStringSet(""), length(transcripts))
    names(ans) <- names(transcripts)
    ans[idx1] <- unsplit_list_of_XVectorList("DNAStringSet",
                                             dnaset_list,
                                             seqnames1)
    ans
}
setMethod("extractTranscriptSeqs", "ANY", .extractTranscriptSeqs_default)

--------------------------------------------------------------------------------
/R/extractUpstreamSeqs.R:
--------------------------------------------------------------------------------
### =========================================================================
### extractUpstreamSeqs()
### -------------------------------------------------------------------------


### Dispatch is on the 2nd argument!
setGeneric("extractUpstreamSeqs", signature="genes",
    function(x, genes, width=1000, ...) standardGeneric("extractUpstreamSeqs")
)

### Will work on any object 'x' for which seqinfo() and getSeq() are defined
### e.g. BSgenome, FaFile, TwoBitFile, etc...
### Returns the sequences of the (trimmed) flanking regions of 'genes' with
### metadata columns gene_seqnames, gene_strand, and gene_TSS.
setMethod("extractUpstreamSeqs", "GenomicRanges",
    function(x, genes, width=1000)
    {
        seqinfo(genes) <- merge(seqinfo(genes), seqinfo(x))
        upstream <- trim(suppressWarnings(flank(genes, width=width)))
        ans <- getSeq(x, upstream)

        ## Add metadata columns to 'ans'.
        gene_seqnames <- seqnames(genes)
        gene_strand <- strand(genes)
        ## The TSS is the start of the gene, except for the genes on the
        ## minus strand where it is the end.
        idx1 <- which(gene_strand != "-")
        idx2 <- which(gene_strand == "-")
        gene_TSS <- integer(length(genes))
        gene_TSS[idx1] <- start(genes)[idx1]
        gene_TSS[idx2] <- end(genes)[idx2]
        ans_mcols <- DataFrame(gene_seqnames=gene_seqnames,
                               gene_strand=gene_strand,
                               gene_TSS=gene_TSS)
        mcols(ans) <- ans_mcols
        ans
    }
)

### 'genes' is a TxDb object. Its genes are extracted with genes(), sorted,
### and passed to the method for GenomicRanges objects after dropping the
### genes located on the seqlevels listed in 'exclude.seqlevels' (if any).
setMethod("extractUpstreamSeqs", "TxDb",
    function(x, genes, width=1000, exclude.seqlevels=NULL)
    {
        genes <- sort(genes(genes))
        ## 'genes' is now a GRanges object.
        if (!is.null(exclude.seqlevels)) {
            if (!is.character(exclude.seqlevels))
                stop("'exclude.seqlevels' must be NULL or a character vector")
            if (anyNA(match(exclude.seqlevels, seqlevels(genes))))
                stop("'exclude.seqlevels' contains invalid seqlevels")
            ## We use setdiff() rather than negative subscripts because
            ## 'seqlevels(genes)[-idx]' selects nothing (instead of
            ## everything) when 'idx' is empty i.e. when 'exclude.seqlevels'
            ## is character(0).
            seqlevels(genes, pruning.mode="coarse") <-
                setdiff(seqlevels(genes), exclude.seqlevels)
        }
        callGeneric(x, genes, width=width)
    }
)

### 'genes' is assumed to contain transcripts grouped by gene e.g. as returned
### by transcriptsBy(..., by="gene").
setMethod("extractUpstreamSeqs", "GRangesList",
    function(x, genes, width=1000)
    {
        stop("NOT READY YET, SORRY!")
    }
)

--------------------------------------------------------------------------------
/R/features.R:
--------------------------------------------------------------------------------
## Very simple extractor to just return what is in our Feature.Db objects

## business end of things

## Return the full content of table 'tableName' (as returned by
## dbEasyQuery()).
.extractDataCols <- function(conn, tableName){
    SQL <- paste0("SELECT * FROM ", tableName, ";")
    dbEasyQuery(conn, SQL)
}


## Return the features stored in 'db' as a GRanges object. The "name",
## "chrom", "strand", "chromStart", and "chromEnd" columns of the feature
## table are used to make the ranges, and all the other columns are added
## as metadata columns. The metadata of 'db' is stored in the metadata of
## the returned object.
.extractFeaturesAsGRanges <- function(db)
{
    ## 1st figure out what table is not the metadata table.
    conn <- dbconn(db)  ## featuredbconn(db)
    tableNames <- dbListTables(conn)
    tableName <- tableNames[!tableNames %in% "metadata"]

    ## Then learn what the columns are in that table and assign to otherCols
    colNames <- dbListFields(conn, tableName)
    reserved <- c("name", "chrom", "strand", "chromStart", "chromEnd")
    colNames <- colNames[!colNames %in% reserved]

    ## Extract the data
    df <- .extractDataCols(conn, tableName)

    ## Make & return the Object
    md <- metadata(db)
    genome <- md[md$name == "Genome", 'value']
    ## Subsetting with no matching row produces a zero-length vector, not
    ## NULL, so we test the length (which also covers the NULL case).
    if (length(genome) == 0L)
        genome <- NA_character_
    ans <-
      GRanges(seqnames = df$chrom,
              ranges = IRanges(df$chromStart, df$chromEnd, names=df$name),
              ## "." in the "strand" column means unstranded.
              strand = sub('\\.', '*', df$strand),
              df[colNames],
              seqinfo = Seqinfo(unique(df$chrom), genome=genome))
    metadata(ans)[[1]] <- DataFrame(metadata(db))
    ans
}

setGeneric("features", signature="x",
           function(x) standardGeneric("features"))

setMethod("features", "FeatureDb",
          function(x) .extractFeaturesAsGRanges(x))

## test code:
## library(GenomicFeatures)
## fdb <- loadDb("FeatureDb.sqlite")
## features(fdb)

--------------------------------------------------------------------------------
/R/getPromoterSeq-methods.R:
--------------------------------------------------------------------------------
### =========================================================================
### getPromoterSeq() and getTerminatorSeq()
### -------------------------------------------------------------------------

### Original author: Paul Shannon

### NOTE (H. Pagès, Jan 22, 2024): Interface is inconsistent with
### extractTranscriptSeqs() or extractUpstreamSeqs().
### TODO: Implement extractPromoterSeqs() and extractTerminatorSeqs() and
### model them after extractTranscriptSeqs() or extractUpstreamSeqs().
### Then deprecate getPromoterSeq() and getTerminatorSeq() in favor of
### extractPromoterSeqs() and extractTerminatorSeqs().

setGeneric("getPromoterSeq", signature="query",
    function(query, subject, upstream=2000, downstream=200)
        standardGeneric("getPromoterSeq"))

setGeneric("getTerminatorSeq", signature="query",
    function(query, subject, upstream=2000, downstream=200)
        standardGeneric("getTerminatorSeq"))

### Workhorse behind the "getPromoterSeq" and "getTerminatorSeq" methods for
### GRanges objects. 'FUN' (promoters or terminators) is used to compute the
### regions of interest from 'query', and their sequences are extracted from
### 'subject' with getSeq(). The result carries the metadata columns of
### 'query' plus a "geneID" column set to the names of 'query' (or to NA if
### 'query' has no names).
.GRanges_getPromoterSeq <- function(query, subject, FUN, upstream, downstream)
{
    stopifnot(is(query, "GRanges"))
    promoter.granges <- FUN(query, upstream, downstream)
    result <- getSeq(subject, promoter.granges)
    md <- mcols(query)
    geneIDs <- names(query)  # often NULL
    if (is.null(geneIDs))
        geneIDs <- rep(NA_character_, length(query))
    md$geneID <- geneIDs
    mcols(result) <- md
    result
}

setMethod("getPromoterSeq", "GRanges",
    function(query, subject, upstream=2000, downstream=200)
        .GRanges_getPromoterSeq(query, subject, promoters,
                                upstream, downstream)
)

setMethod("getTerminatorSeq", "GRanges",
    function(query, subject, upstream=2000, downstream=200)
        .GRanges_getPromoterSeq(query, subject, terminators,
                                upstream, downstream)
)

### Workhorse behind the "getPromoterSeq" and "getTerminatorSeq" methods for
### GRangesList objects. Same as .GRanges_getPromoterSeq() but operates on
### the unlisted 'query' and returns the sequences grouped like 'query'.
### The "geneID" metadata column is set to the name of the list element
### each range belongs to (or to NA if 'query' has no names).
.GRangesList_getPromoterSeq <-
    function(query, subject, FUN, upstream, downstream)
{
    stopifnot(is(query, "GRangesList"))
    unlisted_query <- unlist(query, use.names=FALSE)  # GRanges object
    promoter.granges <- FUN(unlisted_query, upstream, downstream)
    result <- getSeq(subject, promoter.granges)
    md <- mcols(unlisted_query)
    geneIDs <- names(query)
    if (is.null(geneIDs)) {
        ## No names on 'query': use NAs, like .GRanges_getPromoterSeq()
        ## does, instead of passing NULL to rep() below (which is expected
        ## to fail when 'query' has 2 or more list elements).
        geneIDs <- rep(NA_character_, length(unlisted_query))
    } else {
        ## Repeat each name once per range in the list element.
        geneIDs <- rep(geneIDs, elementNROWS(query))
    }
    md$geneID <- geneIDs
    mcols(result) <- md
    relist(result, query)
}

setMethod("getPromoterSeq", "GRangesList",
    function(query, subject, upstream=2000, downstream=200)
        .GRangesList_getPromoterSeq(query, subject, promoters,
                                    upstream, downstream)
)

setMethod("getTerminatorSeq", "GRangesList",
    function(query, subject, upstream=2000, downstream=200)
        .GRangesList_getPromoterSeq(query, subject, terminators,
                                    upstream, downstream)
)

--------------------------------------------------------------------------------
/R/id2name.R:
--------------------------------------------------------------------------------
### =========================================================================
### Map internal ids to external names for a given feature type.
### -------------------------------------------------------------------------


### Return the external names of the features of type 'feature.type' stored
### in 'txdb', as a vector named with the internal ids (as character).
id2name <- function(txdb, feature.type=c("tx", "exon", "cds"))
{
    if (!is(txdb, "TxDb"))
        stop("'txdb' must be a TxDb object")
    feature.type <- match.arg(feature.type)
    table <- switch(feature.type, tx="transcript", exon="exon", cds="cds")
    ## 'columns[1L]' is the id column and 'columns[2L]' the name column.
    columns <- TXDB_table_columns(table)[c("id", "name")]
    df <- TxDb_SELECT_from_INNER_JOIN(txdb, table, columns)
    ans <- df[[columns[2L]]]
    names(ans) <- as.character(df[[columns[1L]]])
    ans
}

--------------------------------------------------------------------------------
/R/makeFeatureDbFromUCSC.R:
--------------------------------------------------------------------------------
### =========================================================================
### makeFeatureDbFromUCSC()
### -------------------------------------------------------------------------

### Everything in this file has moved to txdbmaker!

### The functions below only forward their arguments to the function of the
### same name via call_fun_in_txdbmaker().

supportedUCSCFeatureDbTracks <- function(genome)
{
    call_fun_in_txdbmaker("supportedUCSCFeatureDbTracks", genome=genome)
}

supportedUCSCFeatureDbTables <- function(...)
{
    call_fun_in_txdbmaker("supportedUCSCFeatureDbTables", ...)
}

UCSCFeatureDbTableSchema <- function(...)
{
    call_fun_in_txdbmaker("UCSCFeatureDbTableSchema", ...)
}

makeFeatureDbFromUCSC <- function(...)
{
    call_fun_in_txdbmaker("makeFeatureDbFromUCSC", ...)
}

--------------------------------------------------------------------------------
/R/makeTxDb.R:
--------------------------------------------------------------------------------
### =========================================================================
### Making TxDb objects
### -------------------------------------------------------------------------

### Everything in this file has moved to txdbmaker!

makeTxDb <- function(...)
{
    call_fun_in_txdbmaker("makeTxDb", ...)
}

--------------------------------------------------------------------------------
/R/makeTxDbFromBiomart.R:
--------------------------------------------------------------------------------
### =========================================================================
### makeTxDbFromBiomart()
### -------------------------------------------------------------------------

### Everything in this file has moved to txdbmaker!

getChromInfoFromBiomart <- function(...)
{
    call_fun_in_txdbmaker("getChromInfoFromBiomart", ...)
}


makeTxDbFromBiomart <- function(...)
{
    call_fun_in_txdbmaker("makeTxDbFromBiomart", ...)
}

--------------------------------------------------------------------------------
/R/makeTxDbFromEnsembl.R:
--------------------------------------------------------------------------------
### =========================================================================
### makeTxDbFromEnsembl()
### -------------------------------------------------------------------------

### Everything in this file has moved to txdbmaker!

makeTxDbFromEnsembl <- function(...)
{
    call_fun_in_txdbmaker("makeTxDbFromEnsembl", ...)
}

--------------------------------------------------------------------------------
/R/makeTxDbFromGFF.R:
--------------------------------------------------------------------------------
### =========================================================================
### makeTxDbFromGFF()
### -------------------------------------------------------------------------

### Everything in this file has moved to txdbmaker!

makeTxDbFromGFF <- function(...)
{
    call_fun_in_txdbmaker("makeTxDbFromGFF", ...)
}

--------------------------------------------------------------------------------
/R/makeTxDbFromGRanges.R:
--------------------------------------------------------------------------------
### =========================================================================
### makeTxDbFromGRanges()
### -------------------------------------------------------------------------

### Everything in this file has moved to txdbmaker!

makeTxDbFromGRanges <- function(...)
{
    call_fun_in_txdbmaker("makeTxDbFromGRanges", ...)
}

--------------------------------------------------------------------------------
/R/makeTxDbFromUCSC.R:
--------------------------------------------------------------------------------
### =========================================================================
### makeTxDbFromUCSC()
### -------------------------------------------------------------------------

### Everything in this file has moved to txdbmaker!

supportedUCSCtables <- function(...)
{
    call_fun_in_txdbmaker("supportedUCSCtables", ...)
}

browseUCSCtrack <- function(...)
{
    call_fun_in_txdbmaker("browseUCSCtrack", ...)
}

makeTxDbFromUCSC <- function(...)
{
    call_fun_in_txdbmaker("makeTxDbFromUCSC", ...)
}

--------------------------------------------------------------------------------
/R/makeTxDbPackage.R:
--------------------------------------------------------------------------------
### Everything in this file has moved to txdbmaker!

supportedMiRBaseBuildValues <- function()
{
    call_fun_in_txdbmaker("supportedMiRBaseBuildValues")
}

makePackageName <- function(...)
{
    call_fun_in_txdbmaker("makePackageName", ...)
}

makeTxDbPackage <- function(...)
{
    call_fun_in_txdbmaker("makeTxDbPackage", ...)
}

makeTxDbPackageFromUCSC <- function(...)
{
    call_fun_in_txdbmaker("makeTxDbPackageFromUCSC", ...)
}

makeFDbPackageFromUCSC <- function(...)
{
    call_fun_in_txdbmaker("makeFDbPackageFromUCSC", ...)
}

makeTxDbPackageFromBiomart <- function(...)
{
    call_fun_in_txdbmaker("makeTxDbPackageFromBiomart", ...)
}

--------------------------------------------------------------------------------
/R/mapIdsToRanges.R:
--------------------------------------------------------------------------------
setGeneric("mapIdsToRanges", signature="x",
    function(x, ...) standardGeneric("mapIdsToRanges")
)

## 'keys' must be a named list. Its name is used as the filter column and
## its first element as the ids to look up. The features of type 'type' that
## match the ids are extracted from 'x' and returned in a list-like object
## with one element per id in 'keys[[1]]'.
setMethod("mapIdsToRanges", "TxDb",
          function(x,
                   keys,
                   type = c("cds", "exon", "tx", "gene"),
                   columns = NULL)
{
    .assert(is.list(keys) && .is.named(keys),
            "'keys' must be a named list")

    .assert(is.null(columns) || is.character(columns),
            "'columns' must be 'NULL' or a character vector")

    type <- match.arg(type)

    ## Extractor to use for the requested type of feature.
    fun <- switch(type,
                  cds = cds,
                  exon = exons,
                  tx = transcripts,
                  gene = genes)

    res <- fun(x, keys, columns = unique(c(names(keys), columns)))
    matches <- match(mcols(res)[[names(keys)]], keys[[1]])
    ranges <- rep(res, lengths(matches))

    f <- factor(keys[[1]][unlist(matches, use.names = FALSE)],
                levels = unique(keys[[1]]))
    splitAsList(ranges, f, drop = FALSE)[keys[[1]]]
})

setGeneric("mapRangesToIds", signature="x",
    function(x, ...) standardGeneric("mapRangesToIds")
)

## Find the features of type 'type' in 'x' that overlap with 'ranges' (the
## extra arguments are passed to findOverlaps()). The hits are split by the
## name of the range in 'ranges' they belong to, and the metadata columns of
## the overlapping features are returned for each group.
setMethod("mapRangesToIds", "TxDb",
          function(x,
                   ranges,
                   type = c("cds", "exon", "tx", "gene"),
                   columns = NULL,
                   ...)
{
    type <- match.arg(type)
    .assert(is(ranges, "Vector"),
            "'ranges' must be a 'Vector'")
    .assert(is.null(columns) || is.character(columns),
            "'columns' must be 'NULL' or a character vector")

    fun <- switch(type,
                  cds = cds,
                  exon = exons,
                  tx = transcripts,
                  gene = genes)

    all <-
        if (is.null(columns)) {
            fun(x)
        } else {
            fun(x, columns = columns)
        }

    hits <- findOverlaps(ranges, all, ...)
    lapply(split(all[subjectHits(hits)], names(ranges)[queryHits(hits)]), mcols)
})

## Fail with 'message' (and no call in the error) if 'x' is not TRUE.
.assert <- function(x, message) {
    if(!x) {
        stop(message, call. = FALSE)
    }
}

## TRUE if 'x' has names and none of them is NA or the empty string.
.is.named <- function(x) {
    nm <- names(x)
    !is.null(nm) && all(!is.na(nm) & nzchar(nm))
}

--------------------------------------------------------------------------------
/R/nearest-methods.R:
--------------------------------------------------------------------------------
### =========================================================================
### nearest (and related) methods
### -------------------------------------------------------------------------


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### distance
###

### Compute the distance between each range in 'x' and the feature in 'y'
### identified by the corresponding element of 'id' (a character vector
### parallel to 'x'). 'type' is the type of feature that 'id' refers to.
### Returns an integer vector parallel to 'x', with NAs (and a warning) for
### the ids not found in 'y'.
setMethod("distance", c("GenomicRanges", "TxDb"),
    function(x, y, ignore.strand=FALSE, ..., id,
             type=c("gene", "tx", "exon", "cds"))
    {
        if (!identical(length(x), length(id)))
            stop("length(id) must equal length(x)")
        if (!is.character(id))
            stop("'id' must be a character")
        ## Resolve 'type' to a single string. Without this, leaving 'type'
        ## to its default makes the 'if' condition below a vector of
        ## length 4.
        type <- match.arg(type)

        if (type == "gene") {
            .extractByGeneID(x, y, ignore.strand, id)
        } else {
            rng <- switch(type,
                       tx=transcripts(y, "tx_id", filter=list(tx_id=id)),
                       exon=exons(y, "exon_id", filter=list(exon_id=id)),
                       cds=cds(y, "cds_id", filter=list(cds_id=id)))
            f <- factor(mcols(rng)[,])
            missing <- !id %in% levels(f)
            if (any(missing))
                ## 'collapse' (not 'sep') is what joins the ids into a
                ## single string.
                warning(paste0("id(s): '", paste(unique(id[missing]),
                               collapse=","), "' were not found in 'y'"))
            ## rep out ranges according to 'id'
            rng <- rng[match(id[!missing], levels(f))]
            ans <- rep(NA_integer_, length(x))
            ans[!missing] <- distance(x[!missing], rng,
                                      ignore.strand=ignore.strand)
            stopifnot(length(ans) == length(id))
            ans
        }
    }
)

### Same as the "distance" method above but for gene ids. The region of a
### gene is the range of its transcripts. NA is returned (with a warning)
### for the ids not found in 'y' and for the genes that cannot be collapsed
### to a single region.
.extractByGeneID <- function(x, y, ignore.strand, id)
{
    tx <- transcriptsBy(y, "gene")
    missing <- !id %in% names(tx)
    if (any(missing))
        warning(paste0("id(s): '", paste(unique(id[missing]), collapse=","),
                       "' were not found in 'y'"))

    group <- range(tx[names(tx) %in% id], ignore.strand=ignore.strand)
    multiRange <- lengths(group) > 1L
    if (any(multiRange)) {
        warning(paste0("id(s): '", paste(unique(names(multiRange)[multiRange]),
                       collapse=','),
                       "' could not be collapsed to a single gene region"))
        group <- group[!multiRange]
    }

    valid <- (!id %in% names(multiRange)[multiRange]) & !missing
    ## rep out ranges according to 'id'
    rng <- unlist(group, use.names=FALSE)
    rng <- rng[match(id[valid], names(group))]
    ans <- rep(NA_integer_, length(x))
    ans[valid] <- distance(x[valid], rng, ignore.strand=ignore.strand)
    stopifnot(length(ans) == length(id))
    ans
}

--------------------------------------------------------------------------------
/R/proteinToGenome.R:
--------------------------------------------------------------------------------
### =========================================================================
### proteinToGenome()
### -------------------------------------------------------------------------


### Dispatch is on 2nd argument!
setGeneric("proteinToGenome", signature="db",
    function(x, db, ...) standardGeneric("proteinToGenome")
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Make fancy error or warning messages
###

### Return a single string reporting the names in 'x_names[bad_idx]'.
### 'what' describes what is wrong with them (singular form). At most
### 'max.show' names are shown. If 'max.show' is 0, no names are shown and
### the message starts with a lower case letter.
.make_bad_names_msg <- function(x_names, bad_idx, what="invalid name",
                                max.show=5L)
{
    nbad <- length(bad_idx)
    if (max.show == 0L) {
        msg <- c("the names on 'x' contain ", nbad, " ", what)
        if (nbad != 1L)
            msg <- c(msg, "s")
        return(paste(msg, collapse=""))
    }
    if (nbad == 1L) {
        msg <- c("The names on 'x' contain ", what)
    } else {
        msg <- c("The names on 'x' contain ", nbad, " ", what, "s")
        if (nbad > max.show) {
            if (max.show == 1L) {
                msg <- c(msg, " (showing the first one only)")
            } else {
                msg <- c(msg, " (showing the first ", max.show, " only)")
            }
            bad_idx <- head(bad_idx, n=max.show)
        }
    }
    bad_names <- x_names[bad_idx]
    paste0(paste(msg, collapse=""), ": ", paste(bad_names, collapse=", "))
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### proteinToGenome() method for GRangesList objects
###

### 'cumwidths' is expected to contain cumulated widths expressed in
### nucleotides. Return an IRanges object parallel to 'cumwidths' where the
### positions are expressed in amino acids (3 nucleotides per amino acid).
### An amino acid that spans 2 consecutive ranges belongs to both of them.
.make_protein_ranges_from_cumwidths <- function(cumwidths)
{
    len <- length(cumwidths)
    ## Each range starts at the amino acid that contains the nucleotide
    ## following the end of the previous range.
    protein_start <- c(1L, cumwidths[-len] %/% 3L + 1L)
    protein_end <- (2L + cumwidths) %/% 3L
    IRanges(protein_start, protein_end)
}

### Trims first range in 'gr' on its 5' side by 'trim1' nucleotides,
### and last range in 'gr' on its 3' side by 'trim2' nucleotides.
### Other than that, everything else is preserved (length, names, metadata
### columns).
.trim_first_and_last_ranges <- function(gr, trim1=0L, trim2=0L)
{
    ## 'trim1' shortens the first range on its 5' side and 'trim2' shortens
    ## the last range on its 3' side. Which coordinate (start or end) moves
    ## depends on the strand of the range being trimmed.
    stopifnot(is(gr, "GRanges"),
              isSingleInteger(trim1),
              isSingleInteger(trim2))

    rgs <- ranges(gr)
    last <- length(rgs)
    stopifnot(last >= 1L)

    new_start <- start(rgs)
    new_end <- end(rgs)
    strands <- S4Vectors:::decodeRle(strand(gr))
    first_is_plus <- strands[[1L]] == "+"
    last_is_plus <- strands[[last]] == "+"

    ## 5' side of the first range: its left side on "+", its right side
    ## otherwise.
    if (first_is_plus) {
        new_start[[1L]] <- new_start[[1L]] + trim1
    } else {
        new_end[[1L]] <- new_end[[1L]] - trim1
    }

    ## 3' side of the last range: its right side on "+", its left side
    ## otherwise.
    if (last_is_plus) {
        new_end[[last]] <- new_end[[last]] - trim2
    } else {
        new_start[[last]] <- new_start[[last]] + trim2
    }

    ## A range can be trimmed down to width 0 but not further.
    if (any(new_start > new_end + 1L))
        stop(wmsg("invalid trimming"))

    ranges(gr) <- update_ranges(rgs, start=new_start, end=new_end)
    gr
}

### Maps the [protein_start,protein_end] range of amino acids to the CDS
### parts that encode it.
###   protein_start, protein_end: single numbers, with
###       'protein_start <= protein_end'. They are positions counted in
###       amino acids along the protein associated with 'cds'.
###   cds: a GRanges object with the CDS parts of one transcript (at least
###       one part, and no empty part).
### Returns the CDS parts touched by the range, with the first and last ones
### trimmed to the exact nucleotides, and with metadata columns
### 'protein_start' and 'protein_end' added.
.map_protein_to_cds <- function(protein_start, protein_end, cds)
{
    stopifnot(isSingleNumber(protein_start),
              isSingleNumber(protein_end),
              protein_start <= protein_end,
              is(cds, "GRanges"))
    nparts <- length(cds)
    part_widths <- width(cds)
    stopifnot(nparts >= 1L, all(part_widths >= 1L))
    protein_start <- as.integer(protein_start)
    protein_end <- as.integer(protein_end)
    part_cumwidths <- cumsum(part_widths)

    ## Annotate each CDS part with the amino acid positions that it touches.
    aa_ranges <- .make_protein_ranges_from_cumwidths(part_cumwidths)
    aa_cols <- DataFrame(protein_start=start(aa_ranges),
                         protein_end=end(aa_ranges))
    mcols(cds) <- cbind(mcols(cds), aa_cols)

    ## 0-based offsets, in nucleotides from the beginning of the CDS, of
    ## the first and last nucleotides of the requested range.
    nt_first0 <- 3L * (protein_start - 1L)
    nt_last0 <- 3L * protein_end - 1L

    ## Indices of the CDS parts that contain these 2 nucleotides. The index
    ## of the last part is capped to the number of parts.
    touched <- 1L + findInterval(c(nt_first0, nt_last0), part_cumwidths)
    idx1 <- touched[[1L]]
    idx2 <- min(touched[[2L]], nparts)
    ans <- cds[idx1:idx2]

    ## Nucleotides to drop on the 5' side of the first extracted part and
    ## on the 3' side of the last one.
    trim1 <- nt_first0 - sum(head(part_widths, idx1 - 1L))
    trim2 <- part_cumwidths[[idx2]] - nt_last0 - 1L
    ans <- .trim_first_and_last_ranges(ans, trim1, trim2)

    ## The trimmed parts now start and end exactly on the requested amino
    ## acids, so reflect that in the metadata columns.
    ans_mcols <- mcols(ans)
    ans_mcols[1L, "protein_start"] <- protein_start
    ans_mcols[nrow(ans_mcols), "protein_end"] <- protein_end
    mcols(ans) <- ans_mcols
    ans
}

### Returns a named GRangesList object parallel to 'x' (names on 'x' are
### propagated).
setMethod("proteinToGenome", "GRangesList",
    function(x, db)
    {
        ## 'x' holds protein-relative ranges named with transcript names.
        ## 'db' holds the CDS parts of the coding transcripts and is also
        ## named with transcript names.
        if (!is(x, "IRanges"))
            stop(wmsg("'x' must be an IRanges object or derivative"))
        x_names <- names(x)
        if (is.null(x_names))
            stop(wmsg("'x' must have names"))
        coding_tx_names <- names(db)
        if (is.null(coding_tx_names))
            stop(wmsg("'db' must have names when it's a GRangesList object"))

        ## Names on 'x' with no entry in 'db' are reported as non-coding
        ## transcripts and are mapped to an empty GRanges object.
        is_coding <- x_names %in% coding_tx_names
        if (!all(is_coding)) {
            msg <- .make_bad_names_msg(x_names, which(!is_coding),
                                       what="non-coding transcript name")
            warning(wmsg(msg), immediate.=TRUE)
        }

        x_start <- start(x)
        x_end <- end(x)
        map_one_range <- function(i) {
            if (!is_coding[[i]])
                return(GRanges())
            .map_protein_to_cds(x_start[[i]], x_end[[i]], db[[x_names[[i]]]])
        }
        ## TODO: Replace this inefficient lapply-based implementation with
        ## something better.
        ans <- lapply(seq_along(x), map_one_range)
        names(ans) <- x_names
        GRangesList(ans)
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Default proteinToGenome() method
###

### 'db' must be a TxDb object or any object that supports transcripts()
### (e.g. EnsDb object).
.check_supplied_tx_names <- function(supplied_tx_names, db)
{
    ## Checks that the names on 'x' are usable transcript names: they must
    ## be present, must not contain NAs or empty strings, and must all be
    ## transcript names found in 'db'. Raises an error otherwise.
    if (is.null(supplied_tx_names))
        stop(wmsg("'x' must have names and they must be transcript names"))
    stopifnot(is.character(supplied_tx_names))
    is_na_or_empty <- supplied_tx_names %in% c(NA_character_, "")
    if (any(is_na_or_empty))
        stop(wmsg("the names on 'x' cannot contain NAs or empty strings"))

    known_tx_names <- mcols(transcripts(db, columns="tx_name"))$tx_name
    is_unknown <- !(supplied_tx_names %in% known_tx_names)

    ## When none of the names is known, tell the user how to find the
    ## valid names instead of listing the bad ones.
    if (all(is_unknown)) {
        hint <- wmsg("The names on 'x' must be transcript names present in ",
                     "the supplied ", class(db), " object. Note that the ",
                     "transcript names in this object can be obtained/seen ",
                     "with:")
        stop(hint,
             "\n tx <- transcripts(db, columns=\"tx_name\")",
             "\n mcols(tx)$tx_name")
    }
    if (any(is_unknown)) {
        msg <- .make_bad_names_msg(supplied_tx_names, which(is_unknown),
                                   what="invalid transcript name")
        stop(wmsg(msg))
    }
}

### 'db' must be a TxDb object or any object that supports cdsBy()
### (e.g. EnsDb object).
### Returns the CDS parts grouped by transcript. For an EnsDb object the
### extraction is restricted to the transcripts in 'tx_names'; for any
### other object the CDS parts of all the transcripts are returned.
.extract_cds_by_tx <- function(db, tx_names)
{
    stopifnot(is.character(tx_names))
    if (is(db, "EnsDb")) {
        ## Should never happen in practice because if 'db' is an EnsDb
        ## object then the ensembldb package should be loaded already, and
        ## ensembldb depends on AnnotationFilter.
        if (!requireNamespace("AnnotationFilter", quietly=TRUE))
            stop(wmsg("Couldn't load the AnnotationFilter package. ",
                      "The AnnotationFilter package is needed when ",
                      "calling proteinToGenome() on an EnsDb object. ",
                      "Please install it."))
        tx_filter <- AnnotationFilter::TxIdFilter(tx_names)
        cdsBy(db, by="tx", filter=tx_filter)
    } else {
        cdsBy(db, by="tx", use.names=TRUE)
    }
}

### 'db' must be a TxDb object or any object that supports transcripts()
### and cdsBy() (e.g. EnsDb object).
### Returns a named GRangesList object parallel to 'x' (names on 'x' are
### propagated).
.default_proteinToGenome <- function(x, db)
{
    if (!is(x, "IRanges"))
        stop(wmsg("'x' must be an IRanges object or derivative"))
    x_names <- names(x)
    .check_supplied_tx_names(x_names, db)
    ## Extract the CDS parts then delegate to the method for GRangesList
    ## objects.
    cds_by_tx <- .extract_cds_by_tx(db, x_names)
    proteinToGenome(x, cds_by_tx)
}

setMethod("proteinToGenome", "ANY", .default_proteinToGenome)


--------------------------------------------------------------------------------
/R/tRNAs.R:
--------------------------------------------------------------------------------
### =========================================================================
### Extractors for features in other databases
### -------------------------------------------------------------------------
###
### This is for extractors that do NOT point to the TxDb proper.
### Such extractors can point to other databases OR they can
### point to other FeatureDbs within the same package.

### Defunct.
microRNAs <- function(x) .Defunct()

### Replaces the seqinfo of 'ans' with the seqinfo of 'txdb', then restricts
### the seqlevels of 'ans' to the sequences that are active in 'txdb'.
.syncSeqlevel <- function(txdb, ans){
    isActSeq <- .isActiveSeq(txdb)
    n2oNames <- levels(seqnames(ans))
    n2o <- match(seqnames(seqinfo(txdb)), n2oNames)
    seqinfo(ans, new2old=n2o) <- seqinfo(txdb)
    seqlevels(ans, pruning.mode="coarse") <- names(isActSeq)[isActSeq]
    ans
}

## main function
### Returns the features of the FeatureDb object in the FDb.UCSC.tRNAs
### package that corresponds to the TxDb object 'x', with seqlevels synced
### with 'x'. The name of the FeatureDb object is derived from the package
### name that txdbmaker::makePackageName() computes for 'x'.
.tRNAs <- function(x) {
    if (!requireNamespace("txdbmaker", quietly=TRUE))
        stop("Could not load package txdbmaker. Is it installed?\n\n ",
             wmsg("Note that the tRNAs() method for TxDb objects requires ",
                  "the txdbmaker package. Please install it with:"),
             "\n\n BiocManager::install(\"txdbmaker\")")

    fdbpkg <- "FDb.UCSC.tRNAs"
    fdbenv <- loadNamespace(fdbpkg)
    ## get the current package name
    pkgName <- txdbmaker::makePackageName(x)
    ## from here we know what the name of the FeatureDb object MUST look
    ## like: same as the package name, except that it starts with "FDb"
    ## and that its 5th dot-separated field is "tRNAs"
    fdbName <- sub("TxDb","FDb",pkgName)
    fdbName <- unlist(strsplit(fdbName,"\\."))
    fdbName[5] <- "tRNAs"
    fdbString <- paste(fdbName,collapse=".")
    if (!exists(fdbString, envir=fdbenv)) {
        stop("there is no tRNA data available for this organism/source")
    } else {
        fdb <- get(fdbString, fdbenv)
        ans <- features(fdb)
    }
    ## Now check active seqs and set the seqlevels
    .syncSeqlevel(x, ans)
}

setGeneric("tRNAs", function(x) standardGeneric("tRNAs"))

setMethod("tRNAs", "TxDb", .tRNAs)


## Test code for new TXTYPE support (BC vs new code)
## library(TxDb.Hsapiens.BioMart.ensembl.GRCh38);txdb2= TxDb.Hsapiens.BioMart.ensembl.GRCh38;transcripts(txdb2, columns='TXTYPE')
## exons(txdb2, columns='TXTYPE')
## And this one works now
## library(TxDb.Hsapiens.UCSC.hg19.knownGene);txdb = TxDb.Hsapiens.UCSC.hg19.knownGene;transcripts(txdb, columns='TXTYPE')
## But this still fails (argh):
## exons(txdb, columns='TXTYPE')


--------------------------------------------------------------------------------
/R/transcriptLengths.R:
--------------------------------------------------------------------------------
### =========================================================================
### transcriptLengths()
### -------------------------------------------------------------------------


### Returns the positions in 'tx_id' of the names in 'rglist_names'. It is
### an internal error if there are no names or if a name is not in 'tx_id'.
.match_and_check <- function(rglist_names, tx_id)
{
    if (is.null(rglist_names))
        stop(wmsg("internal error in transcriptLengths(): ",
                  "no names on 'rglist'"))
    m <- match(rglist_names, tx_id)
    if (any(is.na(m)))
        stop(wmsg("internal error in transcriptLengths(): ",
                  "some 'rglist' names cannot be mapped to 'tx_id'"))
    m
}

### 'rglist' must be a named IntegerRangesList or GRangesList.
### 'tx_id' must be a character vector.
### Returns an integer vector parallel to 'tx_id' with the number of ranges
### per transcript (0 for the transcripts that are not in 'rglist').
.eltNROWS <- function(rglist, tx_id)
{
    ans <- integer(length(tx_id))
    m <- .match_and_check(names(rglist), tx_id)
    ans[m] <- elementNROWS(rglist)
    ans
}

### Same as .eltNROWS() but returns the sum of the widths of the ranges of
### each transcript.
.sum_width <- function(rglist, tx_id)
{
    ans <- integer(length(tx_id))
    m <- .match_and_check(names(rglist), tx_id)
    ans[m] <- sum(width(rglist))
    ans
}

### The returned data frame has 1 row per transcript returned by
### 'transcripts(txdb)' and in the same order.
### Its columns are 'tx_id', 'tx_name', 'gene_id', 'nexon', and 'tx_len',
### plus 'cds_len', 'utr5_len', and 'utr3_len' when the corresponding
### 'with.*' argument is TRUE. The extra arguments in '...' are passed to
### all the extractors called below.
### NOTES:
### - The function only accepts a TxDb object for now. We'll make it
###   a generic function when we need to support other types of input.
### - The function could probably be made much faster by querying the
###   TxDb object directly in SQL instead of calling exonsBy(), cdsBy(),
###   fiveUTRsByTranscript(), and threeUTRsByTranscript() successively.
transcriptLengths <- function(txdb, with.cds_len=FALSE,
                              with.utr5_len=FALSE, with.utr3_len=FALSE,
                              ...)
{
    if (!isTRUEorFALSE(with.cds_len))
        stop("'with.cds_len' must be TRUE or FALSE")
    if (!isTRUEorFALSE(with.utr5_len))
        stop("'with.utr5_len' must be TRUE or FALSE")
    ## This check must be on 'with.utr3_len', not on 'with.cds_len' again,
    ## otherwise an invalid 'with.utr3_len' goes unnoticed.
    if (!isTRUEorFALSE(with.utr3_len))
        stop("'with.utr3_len' must be TRUE or FALSE")
    tx <- transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id"),...)
    ans <- mcols(tx)
    ans$gene_id <- as.character(ans$gene_id)
    tx_id <- as.character(ans$tx_id)  # because match() will want a character

    rg_by_tx <- exonsBy(txdb, by="tx", ...)
    ans$nexon <- .eltNROWS(rg_by_tx, tx_id)
    ans$tx_len <- .sum_width(rg_by_tx, tx_id)
    if (with.cds_len) {
        rg_by_tx <- cdsBy(txdb, by="tx", ...)
        ans$cds_len <- .sum_width(rg_by_tx, tx_id)
    }
    if (with.utr5_len) {
        rg_by_tx <- fiveUTRsByTranscript(txdb, ...)
        ans$utr5_len <- .sum_width(rg_by_tx, tx_id)
    }
    if (with.utr3_len) {
        rg_by_tx <- threeUTRsByTranscript(txdb, ...)
        ans$utr3_len <- .sum_width(rg_by_tx, tx_id)
    }
    as.data.frame(ans)
}


--------------------------------------------------------------------------------
/R/transcriptLocs2refLocs.R:
--------------------------------------------------------------------------------
### =========================================================================
### transcriptLocs2refLocs()
### -------------------------------------------------------------------------


### Normalizes the 'exonStarts' or 'exonEnds' argument to an ordinary list.
### 'argname' is the name of the argument to use in the error message.
.normargExonStartsOrEnds <- function(exonStarts, argname)
{
    if (is.list(exonStarts))
        return(exonStarts)
    if (is(exonStarts, "IntegerList"))
        return(as.list(exonStarts))
    if (is.character(exonStarts))
        return(toListOfIntegerVectors(exonStarts))
    stop("'", argname, "' must be a list of integer vectors, ",
         "an IntegerList object,\n or a character vector where ",
         "each element is a comma-separated list of\n integers")
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### transcriptLocs2refLocs()
###

### Checks and normalizes its arguments, then delegates to
### GenomicRanges:::unsafe.transcriptLocs2refLocs().
### 'tlocs', 'exonStarts', 'exonEnds', and 'strand' must have the same
### length. 'exonStarts' can also be an IntegerRangesList object, in which
### case it provides both the starts and the ends, and 'exonEnds' must not
### be specified.
transcriptLocs2refLocs <- function(tlocs, exonStarts=list(), exonEnds=list(),
                                   strand=character(0),
                                   decreasing.rank.on.minus.strand=FALSE,
                                   error.if.out.of.bounds=TRUE)
{
    if (!is.list(tlocs)) {
        if (!is(tlocs, "IntegerList"))
            stop("'tlocs' must be a list of integer vectors ",
                 "or an IntegerList object")
        tlocs <- as.list(tlocs)
    }
    if (is(exonStarts, "IntegerRangesList")) {
        if (!identical(exonEnds, list()))
            stop("'exonEnds' cannot be specified ",
                 "when 'exonStarts' is a IntegerRangesList object")
        exonEnds <- end(exonStarts)
        exonStarts <- start(exonStarts)
    }
    exonStarts <- .normargExonStartsOrEnds(exonStarts, "exonStarts")
    exonEnds <- .normargExonStartsOrEnds(exonEnds, "exonEnds")
    if (is.factor(strand))
        strand <- as.vector(strand)
    if (!is.character(strand))
        stop("'strand' must be a character vector")
    if (length(tlocs) != length(strand)
     || length(exonStarts) != length(strand)
     || length(exonEnds) != length(strand))
        stop("'tlocs', 'exonStarts', 'exonEnds' and 'strand' ",
             "must have the same length")
    if (!isTRUEorFALSE(decreasing.rank.on.minus.strand))
        stop("'decreasing.rank.on.minus.strand' must be TRUE or FALSE")
    GenomicRanges:::unsafe.transcriptLocs2refLocs(tlocs,
                            exonStarts, exonEnds, strand,
                            decreasing.rank.on.minus.strand,
                            error.if.out.of.bounds)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### transcriptWidths()
###

### Checks and normalizes its arguments (same rules as for
### transcriptLocs2refLocs() above), then delegates to
### GenomicRanges:::unsafe.transcriptWidths().
transcriptWidths <- function(exonStarts=list(), exonEnds=list())
{
    if (is(exonStarts, "IntegerRangesList")) {
        if (!identical(exonEnds, list()))
            stop("'exonEnds' cannot be specified ",
                 "when 'exonStarts' is a IntegerRangesList object")
        exonEnds <- end(exonStarts)
        exonStarts <- start(exonStarts)
    }
    exonStarts <- .normargExonStartsOrEnds(exonStarts, "exonStarts")
    exonEnds <- .normargExonStartsOrEnds(exonEnds, "exonEnds")
    if (length(exonStarts) != length(exonEnds))
        stop("'exonStarts', 'exonEnds' must have the same length")
    GenomicRanges:::unsafe.transcriptWidths(exonStarts, exonEnds)
}


--------------------------------------------------------------------------------
/R/transcriptsByOverlaps.R:
--------------------------------------------------------------------------------
###
### The 3 methods below extract the transcripts, exons, or CDS parts from a
### TxDb object (with the metadata columns specified via 'columns'), then
### subset them with subsetByOverlaps(). 'ranges', 'maxgap', 'minoverlap',
### and 'type' are passed to subsetByOverlaps().
###

setGeneric("transcriptsByOverlaps", signature="x",
    function(x, ranges, maxgap = -1L, minoverlap = 0L,
             type = c("any", "start", "end"), ...)
        standardGeneric("transcriptsByOverlaps")
)

setMethod("transcriptsByOverlaps", "TxDb",
    function(x, ranges, maxgap = -1L, minoverlap = 0L,
             type = c("any", "start", "end"),
             columns = c("tx_id", "tx_name"))
        subsetByOverlaps(transcripts(x, columns = columns), ranges,
                         maxgap = maxgap, minoverlap = minoverlap,
                         type = match.arg(type))
)

setGeneric("exonsByOverlaps", signature="x",
    function(x, ranges, maxgap = -1L, minoverlap = 0L,
             type = c("any", "start", "end"), ...)
        standardGeneric("exonsByOverlaps")
)

setMethod("exonsByOverlaps", "TxDb",
    function(x, ranges, maxgap = -1L, minoverlap = 0L,
             type = c("any", "start", "end"),
             columns = "exon_id")
        subsetByOverlaps(exons(x, columns = columns), ranges,
                         maxgap = maxgap, minoverlap = minoverlap,
                         type = match.arg(type))
)

setGeneric("cdsByOverlaps", signature="x",
    function(x, ranges, maxgap = -1L, minoverlap = 0L,
             type = c("any", "start", "end"), ...)
        standardGeneric("cdsByOverlaps")
)

setMethod("cdsByOverlaps", "TxDb",
    function(x, ranges, maxgap = -1L, minoverlap = 0L,
             type = c("any", "start", "end"),
             columns = "cds_id")
        subsetByOverlaps(cds(x, columns = columns), ranges,
                         maxgap = maxgap, minoverlap = minoverlap,
                         type = match.arg(type))
)


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
### =========================================================================
### Miscellaneous low-level utils
### -------------------------------------------------------------------------
###
### Unless stated otherwise, nothing in this file is exported.
###


### Signals that function 'fun' (a single string) is defunct in
### GenomicFeatures and must now be called from the txdbmaker package.
### The arguments in '...' are ignored.
call_fun_in_txdbmaker <- function(fun, ...)
{
    msg <- c(fun, "() has moved from GenomicFeatures to the txdbmaker ",
             "package, and is formally defunct in GenomicFeatures ",
             ">= 1.61.1. Please call txdbmaker::", fun, "() to get rid ",
             "of this error.")
    .Defunct(msg=wmsg(msg))
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### DB related.
###
### Most of this stuff was copy/pasted from AnnotationDbi (trying to avoid
### depending on AnnotationDbi for now).
###

### Environment for storing run-time objects
RTobjs <- new.env(hash=TRUE, parent=emptyenv())

assign("debugSQL", FALSE, envir=RTobjs)

### Toggles the "debugSQL" flag stored in 'RTobjs' and returns its new value.
debugSQL <- function()
{
    debugSQL <- !get("debugSQL", envir=RTobjs)
    assign("debugSQL", debugSQL, envir=RTobjs)
    debugSQL
}


### Runs the 'SQL' query on 'conn' with dbGetQuery(). When the "debugSQL"
### flag is on, the query and the time it took are also displayed.
### If 'j0' is NA, the result of dbGetQuery() is returned as-is. Otherwise
### its column 'j0' is returned.
### Use dbEasyQuery(conn, SQL, 1) instead of dbEasyQuery(conn, SQL)[[1]],
### it's much safer!
dbEasyQuery <- function(conn, SQL, j0=NA)
{
    if (get("debugSQL", envir=RTobjs)) {
        if (!is.character(SQL) || length(SQL) != 1L || is.na(SQL))
            stop("[debugSQL] 'SQL' must be a single string")
        cat("[debugSQL] SQL query: ", SQL, "\n", sep="")
        st <- system.time(data0 <- dbGetQuery(conn, SQL))
        cat("[debugSQL]      time: ", st["user.self"], " seconds\n", sep="")
    } else {
        data0 <- dbGetQuery(conn, SQL)
    }
    if (is.na(j0))
        return(data0)
    ## Needed to deal properly with data frame with 0 column ("NULL data
    ## frames with 0 rows") returned by RSQLite when the result of a SELECT
    ## query has 0 row
    if (nrow(data0) == 0L)
        character(0)
    else
        data0[[j0]]
}

### TODO: Put this in AnnotationDbi.
queryAnnotationDb <- function(annotationdb, sql)
{
    ## 'sql' can be a character vector with the lines of the query. They
    ## are joined with newlines before the query is sent to the connection
    ## of 'annotationdb'.
    query <- paste(sql, collapse="\n")
    conn <- dbconn(annotationdb)
    AnnotationDbi:::dbEasyQuery(conn, query)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Data frame related.
###
### TODO: Find a better home for these low-level data.frame utils.
###

### Not data.frame specific. Would also work on any matrix-like object.
### Returns a logical vector parallel to 'colnames' that indicates which of
### the supplied names are column names of 'x'.
has_col <- function(x, colnames)
{
    match(colnames, base::colnames(x), nomatch=0L) > 0L
}

### 'col2class' must be a named character vector that maps column names to
### the name of a constructor function (e.g. "integer" or "character").
### Returns a data.frame with 0 rows and 1 column per element in 'col2class'.
makeZeroRowDataFrame <- function(col2class)
{
    ok <- is.character(col2class) && !is.null(names(col2class))
    if (!ok)
        stop("'col2class' must be a named character vector")
    ## Calling a constructor with no arguments produces an empty vector of
    ## the corresponding type.
    make_empty_col <- function(ctor_name) {
        ctor <- get(ctor_name)
        ctor()
    }
    empty_cols <- lapply(col2class, make_empty_col)
    as.data.frame(empty_cols, stringsAsFactors=FALSE)
}

### Sets the class of (all or some of) the columns of a data.frame.
### Typical use:
###     x <- setDataFrameColClass(x, c(colA="integer", colB="factor"))
### Note that if 'x' has more than one "colA" col, then *all* of them are
### coerced to integer.
setDataFrameColClass <- function(x, col2class, drop.extra.cols=FALSE)
{
    ## 'col2class' must be a named character vector that maps column names
    ## of 'x' to the class to coerce the column to. With
    ## 'drop.extra.cols=TRUE', the columns that are not in 'col2class' are
    ## dropped. Otherwise they are kept unmodified.
    if (!is.data.frame(x))
        stop("'x' must be a data.frame")
    if (!is.character(col2class) || is.null(names(col2class)))
        stop("'col2class' must be a named character vector")
    if (!all(names(col2class) %in% colnames(x)))
        stop("'col2class' has invalid names")
    if (!isTRUEorFALSE(drop.extra.cols))
        stop("'drop.extra.cols' must be TRUE or FALSE")
    if (drop.extra.cols) {
        col_idx <- which(colnames(x) %in% names(col2class))
    } else {
        col_idx <- seq_len(ncol(x))
    }
    tmp <- lapply(col_idx,
                  function(j)
                  {
                      col <- x[[j]]
                      colname <- colnames(x)[j]
                      if (!(colname %in% names(col2class)))
                          return(col)
                      class <- col2class[[colname]]
                      ## Use the as.<class>() function if there is one,
                      ## and fall back on as() otherwise.
                      FUNname <- paste("as", class, sep=".")
                      if (exists(FUNname) && is.function(FUN <- get(FUNname)))
                          return(FUN(col))
                      as(col, class)
                  })
    names(tmp) <- colnames(x)[col_idx]
    return(data.frame(tmp, check.names=FALSE, stringsAsFactors=FALSE))
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### ID assignment and/or reassignment.
###

### Returns the vector of ids such that 'unique(x)[ids, ]' is identical
### to 'x' (in the same way that 'levels(f)[f]' is identical to
### 'as.vector(f)' when 'f' is a character factor).
### This unambiguously defines 'ids'. In particular, it's not Locale
### specific, despite the fact that the current implementation uses a
### sorting approach.
### TODO: Remove! (not used anymore)
makeIdsForUniqueDataFrameRows <- function(x)
{
    if (!is.data.frame(x))
        stop("'x' must be a data.frame")
    ## The columns must be passed to order() as unnamed arguments,
    ## otherwise a column named "decreasing", "method", or "na.last" is
    ## taken as an option of order() instead of as a sorting key.
    x_order <- do.call(order, unname(as.list(x)))
    x_dups <- duplicated(x)
    ## First we make "provisory" ids. Those ids *are* Locale specific.
    prov_ids <- integer(nrow(x))
    prov_ids[x_order] <- cumsum(!x_dups[x_order])
    ## Convert the "provisory" ids into the final ids. The final ids are
    ## *not* Locale specific anymore.
    as.integer(factor(prov_ids, levels=unique(prov_ids)))
}


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
.test <- function() BiocGenerics:::testPackage("GenomicFeatures")


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[](https://bioconductor.org/)

**GenomicFeatures** is an R/Bioconductor package for querying the gene models of a given organism/assembly.

See https://bioconductor.org/packages/GenomicFeatures for more information including how to install the release version of the package (please refrain from installing directly from GitHub).


--------------------------------------------------------------------------------
/TODO:
--------------------------------------------------------------------------------
o Fix handling of the 'filter' arg (transcripts(), etc...) when the txdb has
  user seqlevels on.

o Add 'filter' arg to transcriptsBy(), exonsBy(), cdsBy(),
  fiveUTRByTranscript(), and threeUTRByTranscript(). exonsBy(), cdsBy(), and
  *UTRByTranscript() should at least support filtering by gene or transcript
  id.

o Too many helper functions are defined and used internally to query the
  db:
    - dbEasyQuery() is defined in AnnotationDbi and GenomicFeatures with
      different definitions.
    - AnnotationDbi:::dbQuery() is the same as GenomicFeatures:::dbEasyQuery()
    - queryAnnotationDb()
  Clean this mess!
16 | 17 | o DB schema change: Replace tx_chrom, exon_chrom, and cds_chrom columns with 18 | _tx_chrom_id, _exon_chrom_id, and _cds_chrom_id. 19 | 20 | o Add the following indices to the db schema: 21 | CREATE INDEX splicing_tx_id ON splicing (_tx_id); 22 | CREATE INDEX splicing_exon_id ON splicing (_exon_id); 23 | CREATE INDEX splicing_cds_id ON splicing (_cds_id); 24 | CREATE INDEX gene_tx_id ON gene (_tx_id); 25 | 26 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citEntry(entry="article", 2 | title = "Software for Computing and Annotating Genomic Ranges", 3 | author = personList( as.person("Michael Lawrence" ), 4 | as.person("Wolfgang Huber" ), 5 | as.person("Herv\\'e Pag\\`es" ), 6 | as.person("Patrick Aboyoun" ), 7 | as.person("Marc Carlson" ), 8 | as.person("Robert Gentleman" ), 9 | as.person("Martin Morgan" ), 10 | as.person("Vincent Carey" )), 11 | year = 2013, 12 | journal = "{PLoS} Computational Biology", 13 | volume = "9", 14 | issue = "8", 15 | doi = "10.1371/journal.pcbi.1003118", 16 | url = "http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003118", 17 | textVersion = 18 | "Lawrence M, Huber W, Pag\\`es H, Aboyoun P, Carlson M, et al. (2013) Software for Computing and Annotating Genomic Ranges. PLoS Comput Biol 9(8): e1003118. 
doi:10.1371/journal.pcbi.1003118" ) 19 | -------------------------------------------------------------------------------- /inst/extdata/Biomart_Ensembl_sample.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/Biomart_Ensembl_sample.sqlite -------------------------------------------------------------------------------- /inst/extdata/FeatureDb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/FeatureDb.sqlite -------------------------------------------------------------------------------- /inst/extdata/ITAG4.1_gene_models.subset.gff: -------------------------------------------------------------------------------- 1 | ##gff-version 3 2 | ##sequence-regionSL4.0ch00 1 9643250 3 | ##sequence-regionSL4.0ch01 1 90863682 4 | ##sequence-regionSL4.0ch02 1 53473368 5 | ##sequence-regionSL4.0ch03 1 65298490 6 | ##sequence-regionSL4.0ch04 1 64459972 7 | ##sequence-regionSL4.0ch05 1 65269487 8 | ##sequence-regionSL4.0ch06 1 47258699 9 | ##sequence-regionSL4.0ch07 1 67883646 10 | ##sequence-regionSL4.0ch08 1 63995357 11 | ##sequence-regionSL4.0ch09 1 68513564 12 | ##sequence-regionSL4.0ch10 1 64792705 13 | ##sequence-regionSL4.0ch11 1 54379777 14 | ##sequence-regionSL4.0ch12 1 66688036 15 | SL4.0ch00 maker_ITAG gene 2030916 2032369 . + . ID=gene:Solyc00g025400.2;Alias=Solyc00g025400;Name=Solyc00g025400.2;length=1453 16 | SL4.0ch00 maker_ITAG mRNA 2030916 2032369 . + . ID=mRNA:Solyc00g025400.2.1;Parent=gene:Solyc00g025400.2;Name=Solyc00g025400.2.1;Note=Unknown protein;_AED=1.00;_QI=250|0|0|0|0|0|2|0|156;_eAED=1.00 17 | SL4.0ch00 maker_ITAG five_prime_UTR 2030916 2031165 . + . 
ID=five_prime_UTR:Solyc00g025400.2.1.0;Parent=mRNA:Solyc00g025400.2.1 18 | SL4.0ch00 maker_ITAG exon 2030916 2031456 . + . ID=exon:Solyc00g025400.2.1.1;Parent=mRNA:Solyc00g025400.2.1 19 | SL4.0ch00 maker_ITAG CDS 2031166 2031456 . + 0 ID=CDS:Solyc00g025400.2.1.1;Parent=mRNA:Solyc00g025400.2.1 20 | SL4.0ch00 maker_ITAG exon 2032190 2032369 . + . ID=exon:Solyc00g025400.2.1.2;Parent=mRNA:Solyc00g025400.2.1 21 | SL4.0ch00 maker_ITAG CDS 2032190 2032369 . + 0 ID=CDS:Solyc00g025400.2.1.2;Parent=mRNA:Solyc00g025400.2.1 22 | ### 23 | SL4.0ch00 maker gene 2062209 2063021 . - . ID=gene:Solyc00g500104.2;Name=Solyc00g500104.2 24 | SL4.0ch00 maker mRNA 2062209 2063021 . - . ID=mRNA:Solyc00g500104.2.1;Parent=gene:Solyc00g500104.2;Name=Solyc00g500104.2.1;_aed=0.27;_eaed=0.27;_qi=0|-1|0|1|-1|1|1|150|220;Note=RNA-dependent RNA polymerase (AHRD V3.11 *-* tr|A0A2Z6JIR4|A0A2Z6JIR4_9VIRU);Dbxref=InterPro:IPR008686,Pfam:PF05919 25 | SL4.0ch00 maker three_prime_UTR 2062209 2062358 . - . ID=three_prime_UTR:Solyc00g500104.2.1.0;Parent=mRNA:Solyc00g500104.2.1 26 | SL4.0ch00 maker exon 2062209 2063021 . - . ID=exon:Solyc00g500104.2.1.1;Parent=mRNA:Solyc00g500104.2.1 27 | SL4.0ch00 maker CDS 2062359 2063021 . - 0 ID=CDS:Solyc00g500104.2.1.1;Parent=mRNA:Solyc00g500104.2.1 28 | ### 29 | SL4.0ch00 maker_ITAG gene 2081475 2081793 . - . ID=gene:Solyc00g142160.1;Alias=Solyc00g142160;Name=Solyc00g142160.1;length=318 30 | SL4.0ch00 maker_ITAG mRNA 2081475 2081793 . - . ID=mRNA:Solyc00g142160.1.1;Parent=gene:Solyc00g142160.1;Name=Solyc00g142160.1.1;Note=RNA-dependent RNA polymerase (AHRD V3.3 *-* A0A2Z6JIR4_9VIRU);_AED=0.64;_QI=0|-1|0|1|-1|0|1|43|91;_eAED=0.64 31 | SL4.0ch00 maker_ITAG three_prime_UTR 2081475 2081517 . - . ID=three_prime_UTR:Solyc00g142160.1.1.0;Parent=mRNA:Solyc00g142160.1.1 32 | SL4.0ch00 maker_ITAG exon 2081475 2081793 . - . ID=exon:Solyc00g142160.1.1.1;Parent=mRNA:Solyc00g142160.1.1 33 | SL4.0ch00 maker_ITAG CDS 2081518 2081793 . 
- 0 ID=CDS:Solyc00g142160.1.1.1;Parent=mRNA:Solyc00g142160.1.1 34 | ### 35 | SL4.0ch00 maker_ITAG gene 2081817 2083225 . - . ID=gene:Solyc00g142170.2;Alias=Solyc00g142170;Name=Solyc00g142170.2;length=1408 36 | SL4.0ch00 maker_ITAG mRNA 2081817 2083225 . - . ID=mRNA:Solyc00g142170.2.1;Parent=gene:Solyc00g142170.2;Name=Solyc00g142170.2.1;Note=RNA-dependent RNA polymerase (AHRD V3.3 *-* A0A2Z6JIR4_9VIRU);Dbxref=InterPro:IPR008686,Pfam:PF05919;_AED=0.38;_QI=89|0|0|0.33|0|0|3|39|305;_eAED=0.43 37 | SL4.0ch00 maker_ITAG three_prime_UTR 2081817 2081855 . - . ID=three_prime_UTR:Solyc00g142170.2.1.0;Parent=mRNA:Solyc00g142170.2.1 38 | SL4.0ch00 maker_ITAG exon 2081817 2082335 . - . ID=exon:Solyc00g142170.2.1.1;Parent=mRNA:Solyc00g142170.2.1 39 | SL4.0ch00 maker_ITAG CDS 2081856 2082335 . - 0 ID=CDS:Solyc00g142170.2.1.1;Parent=mRNA:Solyc00g142170.2.1 40 | SL4.0ch00 maker_ITAG exon 2082546 2082748 . - . ID=exon:Solyc00g142170.2.1.2;Parent=mRNA:Solyc00g142170.2.1 41 | SL4.0ch00 maker_ITAG CDS 2082546 2082748 . - 2 ID=CDS:Solyc00g142170.2.1.2;Parent=mRNA:Solyc00g142170.2.1 42 | SL4.0ch00 maker_ITAG CDS 2082902 2083136 . - 0 ID=CDS:Solyc00g142170.2.1.3;Parent=mRNA:Solyc00g142170.2.1 43 | SL4.0ch00 maker_ITAG exon 2082902 2083225 . - . ID=exon:Solyc00g142170.2.1.3;Parent=mRNA:Solyc00g142170.2.1 44 | SL4.0ch00 maker_ITAG five_prime_UTR 2083137 2083225 . - . ID=five_prime_UTR:Solyc00g142170.2.1.0;Parent=mRNA:Solyc00g142170.2.1 45 | ### 46 | SL4.0ch00 maker_ITAG gene 2136189 2136452 . - . ID=gene:Solyc00g500120.1;Alias=Solyc00g500120;Name=Solyc00g500120.1;length=263 47 | SL4.0ch00 maker_ITAG mRNA 2136189 2136452 . - . ID=mRNA:Solyc00g500120.1.1;Parent=gene:Solyc00g500120.1;Name=Solyc00g500120.1.1;Note=Unknown protein;_AED=0.46;_QI=0|-1|0|1|-1|0|1|0|87;_eAED=1.00 48 | SL4.0ch00 maker_ITAG exon 2136189 2136452 . - . ID=exon:Solyc00g500120.1.1.1;Parent=mRNA:Solyc00g500120.1.1 49 | SL4.0ch00 maker_ITAG CDS 2136189 2136452 . 
- 0 ID=CDS:Solyc00g500120.1.1.1;Parent=mRNA:Solyc00g500120.1.1 50 | ### 51 | SL4.0ch00 maker_ITAG gene 2139483 2140481 . - . ID=gene:Solyc00g500121.1;Alias=Solyc00g500121;Name=Solyc00g500121.1;length=998 52 | SL4.0ch00 maker_ITAG mRNA 2139483 2140481 . - . ID=mRNA:Solyc00g500121.1.1;Parent=gene:Solyc00g500121.1;Name=Solyc00g500121.1.1;Note=Unknown protein;_AED=0.38;_QI=558|1|0.5|1|0|0|2|0|86;_eAED=0.38 53 | SL4.0ch00 maker_ITAG exon 2139483 2139659 . - . ID=exon:Solyc00g500121.1.1.1;Parent=mRNA:Solyc00g500121.1.1 54 | SL4.0ch00 maker_ITAG CDS 2139483 2139659 . - 0 ID=CDS:Solyc00g500121.1.1.1;Parent=mRNA:Solyc00g500121.1.1 55 | SL4.0ch00 maker_ITAG CDS 2139840 2139923 . - 0 ID=CDS:Solyc00g500121.1.1.2;Parent=mRNA:Solyc00g500121.1.1 56 | SL4.0ch00 maker_ITAG exon 2139840 2140481 . - . ID=exon:Solyc00g500121.1.1.2;Parent=mRNA:Solyc00g500121.1.1 57 | SL4.0ch00 maker_ITAG five_prime_UTR 2139924 2140481 . - . ID=five_prime_UTR:Solyc00g500121.1.1.0;Parent=mRNA:Solyc00g500121.1.1 58 | ### 59 | SL4.0ch00 maker_ITAG gene 2187524 2191157 . - . ID=gene:Solyc00g500122.1;Alias=Solyc00g500122;Name=Solyc00g500122.1;length=3633 60 | SL4.0ch00 maker_ITAG mRNA 2187524 2191157 . - . ID=mRNA:Solyc00g500122.1.1;Parent=gene:Solyc00g500122.1;Name=Solyc00g500122.1.1;Note=Unknown protein;_AED=0.40;_QI=0|0|0.33|1|0|0|3|433|122;_eAED=0.51 61 | SL4.0ch00 maker_ITAG three_prime_UTR 2187524 2187956 . - . ID=three_prime_UTR:Solyc00g500122.1.1.0;Parent=mRNA:Solyc00g500122.1.1 62 | SL4.0ch00 maker_ITAG exon 2187524 2188176 . - . ID=exon:Solyc00g500122.1.1.1;Parent=mRNA:Solyc00g500122.1.1 63 | SL4.0ch00 maker_ITAG CDS 2187957 2188176 . - 1 ID=CDS:Solyc00g500122.1.1.1;Parent=mRNA:Solyc00g500122.1.1 64 | SL4.0ch00 maker_ITAG exon 2190928 2190992 . - . ID=exon:Solyc00g500122.1.1.2;Parent=mRNA:Solyc00g500122.1.1 65 | SL4.0ch00 maker_ITAG CDS 2190928 2190992 . - 0 ID=CDS:Solyc00g500122.1.1.2;Parent=mRNA:Solyc00g500122.1.1 66 | SL4.0ch00 maker_ITAG exon 2191074 2191157 . - . 
ID=exon:Solyc00g500122.1.1.3;Parent=mRNA:Solyc00g500122.1.1 67 | SL4.0ch00 maker_ITAG CDS 2191074 2191157 . - 0 ID=CDS:Solyc00g500122.1.1.3;Parent=mRNA:Solyc00g500122.1.1 68 | ### 69 | SL4.0ch00 maker gene 2329909 2332560 . - . ID=gene:Solyc00g160010.1;Name=Solyc00g160010.1 70 | SL4.0ch00 maker mRNA 2329909 2332560 . - . ID=mRNA:Solyc00g160010.1.1;Parent=gene:Solyc00g160010.1;Name=Solyc00g160010.1.1;_aed=0.77;_eaed=0.77;_qi=0|0|0|0.25|1|1|4|0|150;_merge_warning=1;Note=zinc finger CCCH domain-containing protein 30-like (AHRD V3.11 *-* XP_025884875.1) 71 | SL4.0ch00 maker exon 2329909 2329963 . - . ID=exon:Solyc00g160010.1.1.1;Parent=mRNA:Solyc00g160010.1.1 72 | SL4.0ch00 maker CDS 2329909 2329963 . - 1 ID=CDS:Solyc00g160010.1.1.1;Parent=mRNA:Solyc00g160010.1.1 73 | SL4.0ch00 maker exon 2330621 2330687 . - . ID=exon:Solyc00g160010.1.1.2;Parent=mRNA:Solyc00g160010.1.1 74 | SL4.0ch00 maker CDS 2330621 2330687 . - 2 ID=CDS:Solyc00g160010.1.1.2;Parent=mRNA:Solyc00g160010.1.1 75 | SL4.0ch00 maker exon 2330792 2330966 . - . ID=exon:Solyc00g160010.1.1.3;Parent=mRNA:Solyc00g160010.1.1 76 | SL4.0ch00 maker CDS 2330792 2330966 . - 0 ID=CDS:Solyc00g160010.1.1.3;Parent=mRNA:Solyc00g160010.1.1 77 | SL4.0ch00 maker exon 2332405 2332560 . - . ID=exon:Solyc00g160010.1.1.4;Parent=mRNA:Solyc00g160010.1.1 78 | SL4.0ch00 maker CDS 2332405 2332560 . - 0 ID=CDS:Solyc00g160010.1.1.4;Parent=mRNA:Solyc00g160010.1.1 79 | ### 80 | SL4.0ch00 maker gene 2377393 2378166 . - . ID=gene:Solyc00g160330.1;Name=Solyc00g160330.1 81 | SL4.0ch00 maker mRNA 2377393 2378166 . - . ID=mRNA:Solyc00g160330.1.1;Parent=gene:Solyc00g160330.1;Name=Solyc00g160330.1.1;_aed=0.58;_eaed=0.59;_qi=0|0|0|0.5|0|0.5|2|0|82;Note=zinc finger CCCH domain-containing protein 30-like (AHRD V3.11 *-* XP_025884875.1) 82 | SL4.0ch00 maker exon 2377393 2377434 . - . ID=exon:Solyc00g160330.1.1.1;Parent=mRNA:Solyc00g160330.1.1 83 | SL4.0ch00 maker CDS 2377393 2377434 . 
- 0 ID=CDS:Solyc00g160330.1.1.1;Parent=mRNA:Solyc00g160330.1.1 84 | SL4.0ch00 maker exon 2377960 2378166 . - . ID=exon:Solyc00g160330.1.1.2;Parent=mRNA:Solyc00g160330.1.1 85 | SL4.0ch00 maker CDS 2377960 2378166 . - 0 ID=CDS:Solyc00g160330.1.1.2;Parent=mRNA:Solyc00g160330.1.1 86 | ### 87 | SL4.0ch00 maker_ITAG gene 2379604 2380807 . - . ID=gene:Solyc00g007330.1;Alias=Solyc00g007330;Name=Solyc00g007330.1;length=1203 88 | SL4.0ch00 maker_ITAG mRNA 2379604 2380807 . - . ID=mRNA:Solyc00g007330.1.1;Parent=gene:Solyc00g007330.1;Name=Solyc00g007330.1.1;Note=Zinc finger transcription factor 1;_AED=1.00;_QI=373|0|0|0|0|0|2|0|171;_eAED=1.00 89 | SL4.0ch00 maker_ITAG CDS 2379604 2380119 . - 0 ID=CDS:Solyc00g007330.1.1.1;Parent=mRNA:Solyc00g007330.1.1 90 | SL4.0ch00 maker_ITAG exon 2379604 2380324 . - . ID=exon:Solyc00g007330.1.1.1;Parent=mRNA:Solyc00g007330.1.1 91 | SL4.0ch00 maker_ITAG five_prime_UTR 2380120 2380324 . - . ID=five_prime_UTR:Solyc00g007330.1.1.0;Parent=mRNA:Solyc00g007330.1.1 92 | SL4.0ch00 maker_ITAG exon 2380640 2380807 . - . ID=exon:Solyc00g007330.1.1.2;Parent=mRNA:Solyc00g007330.1.1 93 | SL4.0ch00 maker_ITAG five_prime_UTR 2380640 2380807 . - . 
ID=five_prime_UTR:Solyc00g007330.1.1.1;Parent=mRNA:Solyc00g007330.1.1 94 | ### 95 | -------------------------------------------------------------------------------- /inst/extdata/cD.exByEdge-SG-Vig.Rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/cD.exByEdge-SG-Vig.Rda -------------------------------------------------------------------------------- /inst/extdata/cD.exsByGenes-SG-Vig.Rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/cD.exsByGenes-SG-Vig.Rda -------------------------------------------------------------------------------- /inst/extdata/events.Rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/events.Rda -------------------------------------------------------------------------------- /inst/extdata/hg19_knownGene_sample.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/hg19_knownGene_sample.sqlite -------------------------------------------------------------------------------- /inst/extdata/sample_ranges.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/sample_ranges.rds -------------------------------------------------------------------------------- /inst/script/README: -------------------------------------------------------------------------------- 1 | Scripts make_tRNAFDb.R and make_TxDbs.R are 
now in the txdbmaker package! 2 | 3 | -------------------------------------------------------------------------------- /inst/unitTests/test_TxDb_seqinfo.R: -------------------------------------------------------------------------------- 1 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 2 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 3 | 4 | test_rename_seqlevels <- function() 5 | { 6 | txdb <- restoreSeqlevels(txdb) 7 | new_seqlevels <- as.character(seq_along(seqlevels(txdb))) 8 | seqlevels(txdb) <- new_seqlevels 9 | checkIdentical(new_seqlevels, seqlevels(txdb)) 10 | } 11 | 12 | test_restrict_seqlevels <- function() 13 | { 14 | ## This should work 15 | txdb <- restoreSeqlevels(txdb) 16 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5") 17 | checkEquals(length(seqinfo(txdb)), 1) 18 | 19 | ## This should work 20 | txdb <- restoreSeqlevels(txdb) 21 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5", chr6="6", chr4="4") 22 | checkTrue(length(seqinfo(txdb)) == 3) 23 | checkIdentical(c("5", "6", "4"), seqlevels(txdb)) 24 | checkTrue(seqlengths(txdb)[2] == min(seqlengths(txdb))) 25 | checkTrue(seqlengths(txdb)[3] == max(seqlengths(txdb))) 26 | 27 | ## And this should NOT work 28 | txdb <- restoreSeqlevels(txdb) 29 | checkException(seqlevels(txdb, pruning.mode="coarse") <- c(foo="2")) 30 | } 31 | 32 | test_seqinfo_setter <- function() 33 | { 34 | txdb <- restoreSeqlevels(txdb) 35 | new_seqinfo <- seqinfo(txdb) 36 | seqnames(new_seqinfo) <- paste0("NEW_", seqnames(new_seqinfo)) 37 | seqinfo(txdb, new2old=seq_along(new_seqinfo)) <- new_seqinfo 38 | checkIdentical(new_seqinfo, seqinfo(txdb)) 39 | 40 | txdb <- restoreSeqlevels(txdb) 41 | new_seqinfo <- seqinfo(txdb) 42 | seqlengths(new_seqinfo) <- 5 * seqlengths(new_seqinfo) 43 | checkException(seqinfo(txdb) <- new_seqinfo) 44 | 45 | txdb <- restoreSeqlevels(txdb) 46 | new_seqinfo <- seqinfo(txdb) 47 | isCircular(new_seqinfo) <- rep(TRUE, length(new_seqinfo)) 48 | checkException(seqinfo(txdb) <- new_seqinfo) 49 | 50 | 
txdb <- restoreSeqlevels(txdb) 51 | new_seqinfo <- seqinfo(txdb) 52 | genome(new_seqinfo) <- "foo" 53 | seqinfo(txdb) <- new_seqinfo 54 | checkIdentical(new_seqinfo, seqinfo(txdb)) 55 | } 56 | 57 | test_transcripts_accessor <- function() 58 | { 59 | txdb <- restoreSeqlevels(txdb) 60 | txs1 <- transcripts(txdb) 61 | seqlevels(txs1, pruning.mode="coarse") <- c(chr5="5") 62 | ## Then change seqlevels for txdb 63 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5") 64 | txs2 <- transcripts(txdb) 65 | checkIdentical(txs1, txs2) 66 | } 67 | 68 | test_exons_accessor <- function() 69 | { 70 | txdb <- restoreSeqlevels(txdb) 71 | exs1 <- exons(txdb) 72 | seqlevels(exs1, pruning.mode="coarse") <- c(chr5="5") 73 | ## Then change seqlevels for txdb 74 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5") 75 | exs2 <- exons(txdb) 76 | checkIdentical(exs1, exs2) 77 | } 78 | 79 | test_cds_accessor <- function() 80 | { 81 | txdb <- restoreSeqlevels(txdb) 82 | cds1 <- cds(txdb) 83 | seqlevels(cds1, pruning.mode="coarse") <- c(chr5="5") 84 | ## Then change seqlevels for txdb 85 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5") 86 | cds2 <- cds(txdb) 87 | checkIdentical(cds1, cds2) 88 | } 89 | 90 | test_promoters_accessor <- function() 91 | { 92 | txdb <- restoreSeqlevels(txdb) 93 | prm1 <- promoters(txdb) 94 | seqlevels(prm1, pruning.mode="coarse") <- c(chr5="5") 95 | ## Then change seqlevels for txdb 96 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5") 97 | prm2 <- promoters(txdb) 98 | checkIdentical(prm1, prm2) 99 | 100 | txdb <- restoreSeqlevels(txdb) 101 | trmn1 <- terminators(txdb) 102 | seqlevels(trmn1, pruning.mode="coarse") <- c(chr5="5") 103 | ## Then change seqlevels for txdb 104 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5") 105 | trmn2 <- terminators(txdb) 106 | checkIdentical(trmn1, trmn2) 107 | } 108 | 109 | test_transcriptsBy_accessors <- function() 110 | { 111 | ## This one is a "fun" one. 
112 | ## There are issues because some genes are annotated as being on 113 | ## TWO different chromosomes. Such genes are filtered for txs3, 114 | ## but NOT for txs4... Hmmmm. 115 | txdb <- restoreSeqlevels(txdb) 116 | txs3 <- transcriptsBy(txdb, by="gene") 117 | seqlevels(txs3, pruning.mode="coarse") <- c(chr5="5") 118 | ## Then change seqlevels for txdb 119 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5") 120 | txs4 <- transcriptsBy(txdb, by="gene") 121 | ## checkIdentical(txs3, txs4) ## TROUBLE!! 122 | 123 | } 124 | 125 | 126 | ## What to do about this? The reason for the difference is because of order of operations. txs3 gets all the ranges and then removes any that are not kosher (this is correct), txs4 OTOH gets only ranges from chr5 (efficient!), but then fails to filter out things that have hybrid seqnames (as they were pre-filtered). I think I have to make the query less efficient to fix this, but I want to discuss it with Herve 1st to get a 2nd opinion. 127 | -------------------------------------------------------------------------------- /inst/unitTests/test_exonicParts.R: -------------------------------------------------------------------------------- 1 | ### 2 | 3 | test_exonicParts <- function() 4 | { 5 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite", 6 | package="GenomicFeatures")) 7 | 8 | exonic_parts <- exonicParts(txdb, linked.to.single.gene.only=TRUE) 9 | checkIdentical(length(exonic_parts), 653L) 10 | expected_mcolnames <- c("tx_id", "tx_name", "gene_id", "exon_id", 11 | "exon_name", "exon_rank", "exonic_part") 12 | checkIdentical(expected_mcolnames, names(mcols(exonic_parts))) 13 | checkTrue(is.character(mcols(exonic_parts)$gene_id)) 14 | checkTrue(is(mcols(exonic_parts)$tx_name, "CharacterList")) 15 | checkTrue(is.integer(mcols(exonic_parts)$exonic_part)) 16 | 17 | exonic_parts <- exonicParts(txdb, linked.to.single.gene.only=FALSE) 18 | checkIdentical(length(exonic_parts), 660L) 19 | expected_mcolnames <- 
head(expected_mcolnames, n=-1L) 20 | checkIdentical(expected_mcolnames, names(mcols(exonic_parts))) 21 | } 22 | 23 | -------------------------------------------------------------------------------- /inst/unitTests/test_getPromoterSeq-methods.R: -------------------------------------------------------------------------------- 1 | library(BSgenome.Hsapiens.UCSC.hg19) 2 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 3 | library(BSgenome.Dmelanogaster.UCSC.dm3) 4 | library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) 5 | library(Rsamtools) 6 | library(pasillaBamSubset) 7 | 8 | e2f3 <- "1871" # human gene on the plus strand, chr6 9 | grb2 <- "2885" # human gene on the minus strand, chr17 10 | 11 | # a note on method: when the promoter sequence is 20 bases or more in length, 12 | # UCSC BLAT will find these sequences, and a quick visual inspection of the 13 | # accompanying genome browser view, at the right level of zoom, will 14 | # confirm that the per-transcript sequence is indeed correct. 15 | # there are a few tests of shorter sequences below as well, which 16 | # I checked in the genome browser, but this required a little more effort 17 | # than the length-20 BLAT approach. 
18 | 19 | test_GRangesListBSgenomeHumanGetPromoterSeq <- function() { 20 | txdb <- restoreSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene) ## safety net 21 | genes <- c(e2f3, grb2) 22 | tx_by_gene <- transcriptsBy(txdb, by="gene")[genes] 23 | checkIdentical(names(tx_by_gene), genes) 24 | transcript_count <- length(unlist(tx_by_gene, use.names=FALSE)) 25 | 26 | promoter_seqs <- getPromoterSeq(tx_by_gene, Hsapiens, 27 | upstream=10, downstream=0) 28 | checkTrue(validObject(promoter_seqs)) 29 | checkTrue(is(promoter_seqs, "DNAStringSetList")) 30 | checkEquals(length(promoter_seqs), 2) 31 | checkIdentical(names(promoter_seqs), genes) 32 | checkIdentical(width(unlist(promoter_seqs, use.names=FALSE)), 33 | rep.int(10L, transcript_count)) 34 | 35 | terminator_seqs <- getTerminatorSeq(tx_by_gene, Hsapiens, 36 | upstream=10, downstream=0) 37 | checkTrue(validObject(terminator_seqs)) 38 | checkTrue(is(terminator_seqs, "DNAStringSetList")) 39 | checkEquals(length(terminator_seqs), 2) 40 | checkIdentical(names(terminator_seqs), genes) 41 | checkIdentical(width(unlist(terminator_seqs, use.names=FALSE)), 42 | rep.int(10L, transcript_count)) 43 | } 44 | 45 | test_GRangesListBSgenomeFlyGetPromoterSeq <- function() { 46 | # two neighboring genes near beginning of chr3R, on opposite strands 47 | # gene_id flybase_id symbol 48 | # 40524 FBgn0037215 CG12582 49 | # 40526 FBgn0037217 CG14636 50 | # in 2012, UCSC reported 4 total transcripts for these two genes 51 | # in 2013, 6. there should be as many promoter_seqs as there 52 | # are transcripts, and they should each be of width 53 | # upstream + downstream. 
it is risky to check for specific 54 | # sequence in the promoter_seqs since the annotation and sequence 55 | # may change 56 | 57 | txdb <- restoreSeqlevels(TxDb.Dmelanogaster.UCSC.dm3.ensGene) ## safety net 58 | genes <- c("FBgn0037215", "FBgn0037217") 59 | tx_by_gene <- transcriptsBy(txdb, by="gene")[genes] 60 | checkIdentical(names(tx_by_gene), genes) 61 | transcript_count <- length(unlist(tx_by_gene, use.names=FALSE)) 62 | 63 | promoter_seqs <- getPromoterSeq(tx_by_gene, Dmelanogaster, 64 | upstream=10, downstream=10) 65 | checkTrue(validObject(promoter_seqs)) 66 | checkTrue(is(promoter_seqs, "DNAStringSetList")) 67 | checkEquals(length(promoter_seqs), 2) 68 | checkIdentical(names(promoter_seqs), genes) 69 | checkIdentical(width(unlist(promoter_seqs, use.names=FALSE)), 70 | rep.int(20L, transcript_count)) 71 | 72 | terminator_seqs <- getTerminatorSeq(tx_by_gene, Dmelanogaster, 73 | upstream=10, downstream=10) 74 | checkTrue(validObject(terminator_seqs)) 75 | checkTrue(is(terminator_seqs, "DNAStringSetList")) 76 | checkEquals(length(terminator_seqs), 2) 77 | checkIdentical(names(terminator_seqs), genes) 78 | checkIdentical(width(unlist(terminator_seqs, use.names=FALSE)), 79 | rep.int(20L, transcript_count)) 80 | } 81 | 82 | test_GRangesListFastaFlyGetPromoterSeq <- function() { 83 | # two neighboring genes on chr4, on opposite strands 84 | # gene_id flybase_id symbol chr 85 | # 43766 FBgn0025740 plexB 4 86 | # 43769 FBgn0085432 pan 4 87 | 88 | txdb <- restoreSeqlevels(TxDb.Dmelanogaster.UCSC.dm3.ensGene) ## safety net 89 | genes <- c("FBgn0025740", "FBgn0085432") 90 | tx_by_gene <- transcriptsBy(txdb, by="gene")[genes] 91 | checkIdentical(names(tx_by_gene), genes) 92 | transcript_count <- length(unlist(tx_by_gene, use.names=FALSE)) 93 | fa_file <- FaFile(dm3_chr4()) 94 | 95 | promoter_seqs <- getPromoterSeq(tx_by_gene, fa_file, 96 | upstream=10, downstream=10) 97 | checkTrue(validObject(promoter_seqs)) 98 | checkTrue(is(promoter_seqs, 
"DNAStringSetList")) 99 | checkEquals(length(promoter_seqs), 2) 100 | checkIdentical(names(promoter_seqs), genes) 101 | checkIdentical(width(unlist(promoter_seqs, use.names=FALSE)), 102 | rep.int(20L, transcript_count)) 103 | # we are unable to check for specific DNA sequence, since 104 | # the UCSC annotation of these genes changes over time. 105 | 106 | terminator_seqs <- getTerminatorSeq(tx_by_gene, fa_file, 107 | upstream=10, downstream=10) 108 | checkTrue(validObject(terminator_seqs)) 109 | checkTrue(is(terminator_seqs, "DNAStringSetList")) 110 | checkEquals(length(terminator_seqs), 2) 111 | checkIdentical(names(terminator_seqs), genes) 112 | checkIdentical(width(unlist(terminator_seqs, use.names=FALSE)), 113 | rep.int(20L, transcript_count)) 114 | } 115 | 116 | test_GRangesBSgenomeHumanGetPromoterSeq <- function() { 117 | txdb <- restoreSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene) ## safety net 118 | e2f3_tx <- transcriptsBy(txdb, by="gene")[[e2f3]] 119 | #names(e2f3_tx) <- mcols(e2f3_tx)$tx_name 120 | transcript_count <- length(e2f3_tx) 121 | checkEquals(dim(mcols(e2f3_tx)), c(transcript_count, 2)) 122 | checkIdentical(colnames(mcols(e2f3_tx)), c("tx_id", "tx_name")) 123 | 124 | promoter_seqs <- getPromoterSeq(e2f3_tx, Hsapiens, 125 | upstream=10, downstream=0) 126 | checkTrue(validObject(promoter_seqs)) 127 | checkTrue(is(promoter_seqs, "DNAStringSet")) 128 | checkEquals(length(promoter_seqs), transcript_count) 129 | checkTrue(is.null(names(promoter_seqs))) 130 | checkIdentical(width(promoter_seqs), rep.int(10L, transcript_count)) 131 | # should be one more column in the output's metadata than in the input's metadata 132 | checkEquals(dim(mcols(promoter_seqs)), c(transcript_count, 3)) 133 | checkEquals(colnames(mcols(promoter_seqs)), c("tx_id", "tx_name", "geneID")) 134 | # the input, a GRanges, had no names -- which are the source 135 | # of geneID when the GRangesList version of this method is called. 
136 | # so ensure that this lack of information was passed along into the 137 | # metadata of the returned promoter_seqs 138 | checkTrue(all(is.na(mcols(promoter_seqs)$geneID))) 139 | } 140 | 141 | -------------------------------------------------------------------------------- /inst/unitTests/test_makeIdsForUniqueDataFrameRows.R: -------------------------------------------------------------------------------- 1 | ### 2 | 3 | test_makeIdsForUniqueDataFrameRows <- function() 4 | { 5 | x <- data.frame( 6 | chrom=c("chr2", "chr2", "chr2", "chr2", "chr1", 7 | "chr2", "chr2", "chr1", "chr3", "chr1"), 8 | strand=c("+", "-", "-", "+", "+", "+", "+", "-", "-", "+"), 9 | start=c(5, 2, 2, 5, 4, 5, 5, 4, 2, 1), 10 | end=c(15, 12, 12, 15, 14, 13, 15, 14, 12, 11) 11 | ) 12 | y <- unique(x)[GenomicFeatures:::makeIdsForUniqueDataFrameRows(x), ] 13 | row.names(y) <- NULL 14 | checkEquals(y, x) 15 | } 16 | 17 | -------------------------------------------------------------------------------- /inst/unitTests/test_mapIdsToRanges.R: -------------------------------------------------------------------------------- 1 | txdb <- local({ 2 | library(txdbmaker) # for makeTxDbFromGRanges() 3 | fl <- system.file(package = "GenomicFeatures", "extdata", 4 | "sample_ranges.rds") 5 | makeTxDbFromGRanges(readRDS(fl)) 6 | }) 7 | 8 | test_mapIdsToRanges_improper_inputs <- function() 9 | { 10 | checkException(mapIdsToRanges(txdb, keys = ""), 11 | "must be a named list") 12 | 13 | checkException(mapIdsToRanges(txdb, keys = "ENST000000271582"), 14 | "must be a named list") 15 | 16 | checkException(mapIdsToRanges(txdb, keys = list("ENST000000271582")), 17 | "must be a named list") 18 | 19 | checkException(mapIdsToRanges(txdb, 20 | keys = list(tx_name = "ENST000000271582"), 21 | column = 1), 22 | "'columns' must be 'NULL' or a character vector") 23 | } 24 | 25 | test_mapIdsToRanges_same_order <- function() 26 | { 27 | keys <- list(tx_name = c("ENST00000371582", "ENST00000371588", 28 | "ENST00000494752", 
"ENST00000614008", "ENST00000496771")) 29 | res <- mapIdsToRanges(txdb, keys = keys, type = "tx") 30 | checkEquals(names(res), keys[[1]]) 31 | 32 | # shuffle the order and make sure it remains equivalent 33 | for (i in seq_len(10)) { 34 | keys$tx_name <- sample(keys$tx_name) 35 | res <- mapIdsToRanges(txdb, keys = keys, type = "tx") 36 | checkEquals(names(res), keys[[1]]) 37 | } 38 | } 39 | 40 | test_mapIdsToRanges_missing_results <- function() 41 | { 42 | keys <- list(tx_name = c("ENST00000371582", "NOT_FOUND", "ENST00000494752")) 43 | res <- mapIdsToRanges(txdb, keys = keys, type = "tx") 44 | checkEquals(names(res), keys$tx_name) 45 | 46 | # shuffle the order and make sure it remains equivalent 47 | for (i in seq_len(10)) { 48 | keys$tx_name <- sample(keys$tx_name) 49 | res <- mapIdsToRanges(txdb, keys = keys, type = "tx") 50 | checkEquals(names(res), keys$tx_name) 51 | } 52 | } 53 | 54 | test_mapIdsToRanges_duplicate_ranges <- function() 55 | { 56 | # both of these transcripts are from the same gene 57 | keys <- list(tx_name = c("ENST00000371582", "ENST00000494752")) 58 | res <- mapIdsToRanges(txdb, keys = keys, type = "gene") 59 | 60 | #names match input 61 | checkEquals(names(res), keys[[1]]) 62 | # but values are the same 63 | checkTrue(all.equal(res[[1]], res[[2]], check.attributes = FALSE)) 64 | } 65 | 66 | test_mapIdsToRanges_duplicate_ids <- function() { 67 | keys <- list(tx_name = c("ENST00000371582", "ENST00000494752", 68 | "ENST00000371582")) 69 | res <- mapIdsToRanges(txdb, keys = keys, type = "gene") 70 | checkEquals(names(res), keys[[1]]) 71 | checkEquals(res[[1]], res[[3]]) 72 | } 73 | 74 | test_mapRangesToIds_empty <- function() 75 | { 76 | checkException(mapRangesToIds(txdb, NULL), list()) 77 | 78 | checkException(mapRangesToIds(txdb, list()), list()) 79 | } 80 | 81 | test_mapRangesToIds_matches <- function() 82 | { 83 | keys <- list(tx_name = c("ENST00000371582", "ENST00000371588", 84 | "ENST00000494752", "ENST00000614008", "ENST00000496771")) 
85 | res <- mapIdsToRanges(txdb, keys = keys, type = "gene") 86 | 87 | res2 <- mapRangesToIds(txdb, res, "tx") 88 | checkTrue(keys$tx_name[1] %in% res2[[1]]$tx_name) 89 | 90 | checkTrue(keys$tx_name[2] %in% res2[[2]]$tx_name) 91 | } 92 | -------------------------------------------------------------------------------- /inst/unitTests/test_nearest-methods.R: -------------------------------------------------------------------------------- 1 | quiet <- suppressWarnings 2 | 3 | test_GenomicRanges_distance <- function() 4 | { 5 | library(txdbmaker) # for makeTxDb() 6 | genes <- data.frame( 7 | tx_id=1:3, 8 | gene_id=c("gene1", "gene1", "gene2")) 9 | transcripts <- data.frame( 10 | tx_id=1:3, 11 | tx_chrom="chr1", 12 | tx_strand=c("+", "+", "-"), 13 | tx_start=c(1, 2001, 3001), 14 | tx_end=c(999, 2199, 3199)) 15 | splicings <- data.frame( 16 | tx_id=c(1L, 2L, 2L, 2L, 3L, 3L), 17 | cds_id=c(10L, 11L, 12L, 13L, 14L, 15L), 18 | exon_rank=c(1, 1, 2, 3, 1, 2), 19 | exon_start=c(1, 2001, 2101, 2131, 3001, 3131), 20 | exon_end=c(999, 2085, 2144, 2199, 3085, 3199), 21 | cds_start=c(1, 2022, 2101, 2131, 3001, 3131), 22 | cds_end=c(999, 2085, 2144, 2193, 3085, 3199)) 23 | txdb <- quiet(makeTxDb(transcripts, splicings, genes)) 24 | 25 | gr <- GRanges("chr1", IRanges(1050, width=1)) 26 | strand(gr) <- "-" 27 | d <- quiet(distance(gr, txdb, id="gene1", type="gene")) 28 | checkTrue(is.na(d)) 29 | strand(gr) <- "+" 30 | d_pos <- quiet(distance(gr, txdb, id="gene1", type="gene")) 31 | strand(gr) <- "*" 32 | d_star <- quiet(distance(gr, txdb, id="gene1", type="gene")) 33 | checkIdentical(d_pos, d_star) 34 | 35 | d_tx <- quiet(distance(gr, txdb, id="3", type="tx")) 36 | d_cds <- quiet(distance(gr, txdb, id="14", type="cds")) 37 | checkIdentical(d_tx, d_cds) 38 | } 39 | -------------------------------------------------------------------------------- /inst/unitTests/test_transcriptLengths.R: -------------------------------------------------------------------------------- 1 | 
test_transcriptLengths <- function() 2 | { 3 | library(txdbmaker) # for makeTxDbFromGFF() 4 | gff <- system.file("extdata", "ITAG4.1_gene_models.subset.gff", 5 | package="GenomicFeatures") 6 | txdb <- makeTxDbFromGFF(gff) 7 | txlens <- transcriptLengths(txdb, with.cds_len=TRUE, 8 | with.utr5_len=TRUE, 9 | with.utr3_len=TRUE) 10 | 11 | checkIdentical(class(txlens), "data.frame") 12 | checkIdentical(dim(txlens), c(10L, 8L)) 13 | 14 | expected_colnames <- c("tx_id", "tx_name", "gene_id", "nexon", "tx_len", 15 | "cds_len", "utr5_len", "utr3_len") 16 | checkIdentical(colnames(txlens), expected_colnames) 17 | 18 | checkIdentical(txlens$tx_len, 19 | txlens$cds_len + txlens$utr5_len + txlens$utr3_len) 20 | 21 | expected_nexon <- c(2L, 1L, 1L, 3L, 1L, 2L, 3L, 4L, 2L, 2L) 22 | checkIdentical(txlens$nexon, expected_nexon) 23 | 24 | expected_tx_len <- c(721L, 813L, 319L, 1046L, 264L, 25 | 819L, 802L, 453L, 249L, 889L) 26 | checkIdentical(txlens$tx_len, expected_tx_len) 27 | 28 | expected_cds_len <- c(471L, 663L, 276L, 918L, 264L, 29 | 261L, 369L, 453L, 249L, 516L) 30 | checkIdentical(txlens$cds_len, expected_cds_len) 31 | 32 | expected_utr5_len <- c(250L, 0L, 0L, 89L, 0L, 558L, 0L, 0L, 0L, 373L) 33 | checkIdentical(txlens$utr5_len, expected_utr5_len) 34 | 35 | expected_utr3_len <- c(0L, 150L, 43L, 39L, 0L, 0L, 433L, 0L, 0L, 0L) 36 | checkIdentical(txlens$utr3_len, expected_utr3_len) 37 | } 38 | 39 | -------------------------------------------------------------------------------- /inst/unitTests/test_transcripts.R: -------------------------------------------------------------------------------- 1 | test_transcripts <- function() 2 | { 3 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite", 4 | package="GenomicFeatures")) 5 | 6 | ## Test misuse. 7 | checkException(transcripts(data.frame()), silent=TRUE) 8 | checkException(transcripts(txdb, filter=list(bad=1:10)), silent=TRUE) 9 | checkException(transcripts(txdb, columns="bad"), silent=TRUE) 10 | 11 | ## Test 1. 
12 | current <- transcripts(txdb, filter=list(gene_id="139231")) 13 | metadata(current) <- list() 14 | 15 | target <- GRanges("chrX", 16 | ranges=IRanges(start=c(103411156, 103430747), 17 | end =c(103440582, 103440582)), 18 | strand=strand("+"), 19 | tx_id=142:143, 20 | tx_name=c("uc004elw.3", "uc004elx.3"), 21 | seqinfo=seqinfo(txdb)) 22 | checkIdentical(target, current) 23 | 24 | ## Test 2. 25 | filter <- list(tx_chrom=c("chr12", "chr14"), tx_strand="-") 26 | current <- transcripts(txdb, columns=c("tx_id", "tx_name", 27 | "exon_id", "exon_rank"), 28 | filter=filter) 29 | metadata(current) <- list() 30 | 31 | target <- GRanges("chr12", 32 | ranges=IRanges(start=52753790, end=52761309), 33 | strand=strand("-"), 34 | tx_id=87L, 35 | tx_name="uc001sag.3", 36 | exon_id=IntegerList(334:326), 37 | exon_rank=IntegerList(1:9), 38 | seqinfo=seqinfo(txdb)) 39 | checkIdentical(target, current) 40 | 41 | ## Test 3. 42 | filter <- list(gene_id=c("220004", "1183", "10186")) 43 | current <- transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id"), 44 | filter=filter) 45 | metadata(current) <- list() 46 | 47 | target_tx_id <- c(91L, 136:137) 48 | target_gene_id <- CharacterList("10186", "1183", "1183") 49 | target <- GRanges(c("chr13", "chrX", "chrX"), 50 | ranges=IRanges(start=c(39917029, 10124985, 10124985), 51 | end =c(40177356, 10205699, 10205699)), 52 | strand=strand(c("-", "+", "+")), 53 | tx_id=target_tx_id, 54 | tx_name=c("uc001uxf.3", "uc004csy.4", "uc011mid.3"), 55 | gene_id=target_gene_id, 56 | seqinfo=seqinfo(txdb)) 57 | checkIdentical(target, current) 58 | } 59 | 60 | test_transcripts_after_seqlevelsStyle_switch <- function() 61 | { 62 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite", 63 | package="GenomicFeatures")) 64 | checkIdentical(seqlevelsStyle(txdb), "UCSC") 65 | seqlevelsStyle(txdb) <- "NCBI" 66 | checkIdentical(seqlevelsStyle(txdb), c("NCBI", "UCSC")) 67 | 68 | filter <- list(gene_id=c("220004", "1183", "10186")) 69 | current <- 
transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id"), 70 | filter=filter) 71 | metadata(current) <- list() 72 | 73 | checkIdentical(c("13", "X", "X"), as.character(seqnames(current))) 74 | 75 | target_tx_id <- c(91L, 136:137) 76 | target_gene_id <- CharacterList("10186", "1183", "1183") 77 | target <- GRanges(c("13", "X", "X"), 78 | ranges=IRanges(start=c(39917029, 10124985, 10124985), 79 | end =c(40177356, 10205699, 10205699)), 80 | strand=strand(c("-", "+", "+")), 81 | tx_id=target_tx_id, 82 | tx_name=c("uc001uxf.3", "uc004csy.4", "uc011mid.3"), 83 | gene_id=target_gene_id, 84 | seqinfo=seqinfo(txdb)) 85 | checkIdentical(target, current) 86 | } 87 | 88 | test_exons <- function() 89 | { 90 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite", 91 | package="GenomicFeatures")) 92 | 93 | ## Test misuse. 94 | checkException(exons(data.frame()), silent=TRUE) 95 | checkException(exons(txdb, filter=list(bad=1:10)), silent=TRUE) 96 | 97 | ## Test 1. 98 | current <- exons(txdb, filter=list(tx_name="uc001gde.2")) 99 | metadata(current) <- list() 100 | 101 | target <- GRanges("chr1", 102 | ranges=IRanges(start=c(165513478, 165532742), 103 | end =c(165514155, 165533185)), 104 | strand=strand("+"), 105 | exon_id=29:30, 106 | seqinfo=seqinfo(txdb)) 107 | checkIdentical(target, current) 108 | 109 | ## Test 2. 
110 | filter <- list(exon_chrom=c("chr5", "chr14"), exon_strand="-") 111 | current <- exons(txdb, columns=c("exon_id", "tx_name", "gene_id"), 112 | filter=filter) 113 | metadata(current) <- list() 114 | 115 | target_ranges <- 116 | IRanges(start=c(134363424, 134366966, 134369403, 170732985), 117 | end =c(134365011, 134367198, 134369964, 170735759)) 118 | target <- GRanges("chr5", 119 | ranges=target_ranges, 120 | strand=strand("-"), 121 | exon_id=182:185, 122 | tx_name=CharacterList("uc010jea.3", "uc010jea.3", 123 | "uc010jea.3", "uc003mbe.2"), 124 | gene_id=CharacterList("5307", "5307", "5307", NULL), 125 | seqinfo=seqinfo(txdb)) 126 | checkIdentical(target, current) 127 | } 128 | 129 | test_cds <- function() 130 | { 131 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite", 132 | package="GenomicFeatures")) 133 | 134 | ## Test misuse. 135 | checkException(cds(data.frame()), silent=TRUE) 136 | checkException(cds(txdb, filter=list(bad=1:10)), silent=TRUE) 137 | 138 | ## Test 1. 139 | current <- cds(txdb, filter=list(tx_name="uc001gde.2")) 140 | metadata(current) <- list() 141 | 142 | target <- GRanges("chr1", 143 | ranges=IRanges(start=c(165513534, 165532742), 144 | end =c(165514155, 165533061)), 145 | strand=strand("+"), 146 | cds_id=23:24, 147 | seqinfo=seqinfo(txdb)) 148 | checkIdentical(target, current) 149 | 150 | ## Test 2. 
151 | filter <- list(cds_chrom=c("chr5", "chr14"), cds_strand="-") 152 | current <- cds(txdb, columns=c("exon_id", "tx_name", "gene_id"), 153 | filter=filter) 154 | metadata(current) <- list() 155 | 156 | target_ranges <- 157 | IRanges(start=c(134364469, 134366966, 134369403, 170735359), 158 | end =c(134365011, 134367198, 134369571, 170735634)) 159 | target <- GRanges("chr5", 160 | ranges=target_ranges, 161 | strand=strand("-"), 162 | exon_id=as(182:185, "IntegerList"), 163 | tx_name=CharacterList("uc010jea.3", "uc010jea.3", 164 | "uc010jea.3", "uc003mbe.2"), 165 | gene_id=CharacterList("5307", "5307", "5307", NULL), 166 | seqinfo=seqinfo(txdb)) 167 | checkIdentical(target, current) 168 | } 169 | 170 | test_promoters <- function() 171 | { 172 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite", 173 | package="GenomicFeatures")) 174 | tx <- transcripts(txdb, use.names=TRUE) 175 | current <- promoters(txdb) 176 | checkTrue(validObject(current)) 177 | checkEquals(colnames(mcols(current)), c("tx_id", "tx_name")) 178 | checkIdentical(current, promoters(tx)) 179 | current <- terminators(txdb) 180 | checkTrue(validObject(current)) 181 | checkEquals(colnames(mcols(current)), c("tx_id", "tx_name")) 182 | checkIdentical(current, terminators(tx)) 183 | } 184 | 185 | 186 | test_translateCols <- function(){ 187 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite", 188 | package="GenomicFeatures")) 189 | tx1 <- transcripts(txdb, columns=c("tx_id", "tx_name", "cds_id")) 190 | checkEquals(colnames(mcols(tx1)), c("tx_id", "tx_name", "cds_id")) 191 | tx2 <- transcripts(txdb, columns=c("TXID", "TXNAME", "CDSID")) 192 | checkEquals(colnames(mcols(tx2)), c("TXID", "TXNAME", "CDSID")) 193 | tx3 <- transcripts(txdb, columns=c(bob="CDSID")) 194 | checkEquals(colnames(mcols(tx3)), c("bob")) 195 | tx4 <- transcripts(txdb, columns=c(bob="cds_id")) 196 | checkEquals(colnames(mcols(tx4)), c("bob")) 197 | ## And these two cases should both explode. 
;) 198 | checkException(transcripts(txdb, columns=c(""))) 199 | checkException(transcripts(txdb, columns=c("bob"))) 200 | } 201 | 202 | -------------------------------------------------------------------------------- /inst/unitTests/test_transcriptsByOverlaps.R: -------------------------------------------------------------------------------- 1 | ### 2 | 3 | test_transcriptsByOverlaps <- function() 4 | { 5 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite", 6 | package="GenomicFeatures")) 7 | 8 | checkException(transcriptsByOverlaps(txdb), silent = TRUE) 9 | checkException(transcriptsByOverlaps(txdb, IRanges()), silent = TRUE) 10 | checkException(transcriptsByOverlaps(txdb, GRanges(), columns = "bad"), 11 | silent = TRUE) 12 | 13 | seqinfo <- seqinfo(txdb) 14 | seqlevels <- seqlevels(seqinfo) 15 | 16 | gr <- GRanges(seqnames = "chrX", 17 | ranges = IRanges(start=54071000, width=1), 18 | strand = strand("-")) 19 | want <- 20 | GRanges(seqnames = factor("chrX", levels = seqlevels), 21 | ranges = IRanges(start=53963113, end=54071569), 22 | strand = strand("-"), 23 | tx_id = 147L, 24 | tx_name = "uc004dsu.3") 25 | seqinfo(want) <- seqinfo 26 | want <- GenomicFeatures:::.assignMetadataList(want, txdb) 27 | checkIdentical(transcriptsByOverlaps(txdb, gr), want) 28 | 29 | ranges <- IRanges(start = c(113000000, 54071000, 54071000), 30 | width = c( 5000000, 1, 1)) 31 | chrom <- c("chr3", "chrX", "chrX") 32 | strand <- strand(c("+", "+", "-")) 33 | gr <- GRanges(seqnames = chrom, ranges = ranges, strand = strand) 34 | want <- GRanges(seqnames = 35 | factor(c("chr3", "chr3", "chr3", "chrX"), 36 | levels = seqlevels), 37 | ranges = IRanges(start = c(113666748, 113666822, 113676421, 38 | 53963113), 39 | end = c(113681827, 113681827, 113682211, 40 | 54071569)), 41 | strand = strand(c("+", "+", "+", "-")), 42 | tx_id = c(29:31, 147L)) 43 | seqinfo(want) <- seqinfo 44 | want <- GenomicFeatures:::.assignMetadataList(want, txdb) 45 | 
checkIdentical(transcriptsByOverlaps(txdb, gr, columns="tx_id"), want) 46 | } 47 | 48 | test_exonsByOverlaps <- function() 49 | { 50 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite", 51 | package="GenomicFeatures")) 52 | 53 | checkException(exonsByOverlaps(txdb), silent = TRUE) 54 | checkException(exonsByOverlaps(txdb, IRanges()), silent = TRUE) 55 | checkException(exonsByOverlaps(txdb, GRanges(), columns = "bad"), 56 | silent = TRUE) 57 | 58 | seqinfo <- seqinfo(txdb) 59 | seqlevels <- seqlevels(seqinfo) 60 | 61 | gr <- GRanges(seqnames = "chr3", 62 | ranges = IRanges(start=113677210, width=1), 63 | strand = strand("+")) 64 | want <- 65 | GRanges(seqnames = factor("chr3", levels = seqlevels), 66 | ranges = IRanges(start=c(113677210, 113677210), 67 | end =c(113677385, 113682211)), 68 | strand = strand("+"), 69 | exon_id = 139:140) 70 | seqinfo(want) <- seqinfo 71 | want <- GenomicFeatures:::.assignMetadataList(want, txdb) 72 | checkIdentical(exonsByOverlaps(txdb, gr), want) 73 | } 74 | 75 | test_cdsByOverlaps <- function() 76 | { 77 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite", 78 | package="GenomicFeatures")) 79 | 80 | checkException(cdsByOverlaps(txdb), silent = TRUE) 81 | checkException(cdsByOverlaps(txdb, IRanges()), silent = TRUE) 82 | checkException(cdsByOverlaps(txdb, GRanges(), columns = "bad"), 83 | silent = TRUE) 84 | 85 | seqinfo <- seqinfo(txdb) 86 | seqlevels <- seqlevels(seqinfo) 87 | 88 | gr <- GRanges(seqnames = "chr3", 89 | ranges = IRanges(start=113677210, width=1), 90 | strand = strand("+")) 91 | want <- 92 | GRanges(seqnames = factor("chr3", levels = seqlevels), 93 | ranges = IRanges(start=c(113677210, 113677210), 94 | end =c(113677385, 113677477)), 95 | strand = strand("+"), 96 | cds_id = 116:117) 97 | seqinfo(want) <- seqinfo 98 | want <- GenomicFeatures:::.assignMetadataList(want, txdb) 99 | checkIdentical(cdsByOverlaps(txdb, gr), want) 100 | } 101 | 
-------------------------------------------------------------------------------- /man/FeatureDb-class.Rd: -------------------------------------------------------------------------------- 1 | \name{FeatureDb-class} 2 | 3 | \alias{FeatureDb-class} 4 | \alias{class:FeatureDb} 5 | \alias{FeatureDb} 6 | 7 | \title{FeatureDb objects} 8 | 9 | \description{ 10 | WARNING: The FeatureDb/makeFeatureDbFromUCSC/features code base is 11 | no longer actively maintained and FeatureDb-related functionalities 12 | might get deprecated in the near future. Please use 13 | \code{\link[txdbmaker]{makeTxDbFromUCSC}} for a convenient way to 14 | import transcript annotations from UCSC online resources into 15 | Bioconductor. 16 | 17 | The FeatureDb class is a generic container for storing 18 | genomic locations of an arbitrary type of genomic features. 19 | 20 | See \code{?\link{TxDb}} for a container for storing transcript 21 | annotations. 22 | 23 | See \code{?\link{makeFeatureDbFromUCSC}} for a convenient way to 24 | make FeatureDb objects from UCSC online resources. 25 | } 26 | 27 | \section{Methods}{ 28 | In the code snippets below, \code{x} is a FeatureDb object. 29 | 30 | \describe{ 31 | \item{\code{metadata(x)}:}{ 32 | Return \code{x}'s metadata in a data frame. 33 | } 34 | } 35 | } 36 | 37 | \author{Marc Carlson} 38 | 39 | \seealso{ 40 | \itemize{ 41 | \item The \link{TxDb} class for storing transcript annotations. 42 | \item \code{\link{makeFeatureDbFromUCSC}} for a convenient way to 43 | make a FeatureDb object from UCSC online resources. 44 | \item \code{\link{saveDb}} and \code{\link{loadDb}} for 45 | saving and loading the database content of a FeatureDb object. 46 | \item \code{\link{features}} for how to extract genomic features 47 | from a FeatureDb object.
48 | } 49 | } 50 | 51 | \examples{ 52 | fdb_file <- system.file("extdata", "FeatureDb.sqlite", 53 | package="GenomicFeatures") 54 | fdb <- loadDb(fdb_file) 55 | fdb 56 | } 57 | 58 | \keyword{methods} 59 | \keyword{classes} 60 | -------------------------------------------------------------------------------- /man/TxDb-class.Rd: -------------------------------------------------------------------------------- 1 | \name{TxDb-class} 2 | 3 | \alias{TxDb-class} 4 | \alias{class:TxDb} 5 | \alias{TxDb} 6 | 7 | \alias{saveRDS,TxDb-method} 8 | 9 | \alias{organism,TxDb-method} 10 | \alias{seqlevels0,TxDb-method} 11 | \alias{seqlevels<-,TxDb-method} 12 | \alias{seqinfo,TxDb-method} 13 | \alias{isActiveSeq} 14 | \alias{isActiveSeq<-} 15 | \alias{isActiveSeq,TxDb-method} 16 | \alias{isActiveSeq<-,TxDb-method} 17 | \alias{show,TxDb-method} 18 | 19 | % coercion 20 | \alias{as.list,TxDb-method} 21 | 22 | \title{TxDb objects} 23 | 24 | \description{ 25 | The TxDb class is a container for storing transcript annotations. 26 | } 27 | 28 | \section{Methods}{ 29 | In the code snippets below, \code{x} is a TxDb object. 30 | 31 | \describe{ 32 | \item{\code{metadata(x)}:}{ 33 | Return \code{x}'s metadata in a data frame. 34 | } 35 | \item{\code{seqlevels0(x)}:}{ 36 | Get the \emph{sequence levels} originally in \code{x}. This ignores any 37 | change the user might have made to the \emph{sequence levels} with the 38 | \code{seqlevels} setter. 39 | } 40 | \item{\code{seqlevels(x)}, \code{seqlevels(x) <- value}:}{ 41 | Get or set the \emph{sequence levels} in \code{x}. 42 | } 43 | \item{\code{seqinfo(x)}, \code{seqinfo(x) <- value}:}{ 44 | Get or set the information about the underlying sequences. 
45 | Note that, for now, the setter only supports replacement of the 46 | sequence names, i.e., except for their sequence names (accessed with 47 | \code{seqnames(value)} and \code{seqnames(seqinfo(x))}, respectively), 48 | \link[GenomeInfoDb]{Seqinfo} objects \code{value} (supplied) and 49 | \code{seqinfo(x)} (current) must be identical. 50 | } 51 | \item{\code{isActiveSeq(x)}:}{ 52 | Return the currently active sequences for this txdb object as a 53 | named logical vector. Only active sequences will be tapped when 54 | using the supplied accessor methods. Inactive sequences will be 55 | ignored. By default, all available sequences will be active. 56 | } 57 | \item{\code{isActiveSeq(x) <- value}:}{ 58 | Allows the user to change which sequences will be actively 59 | accessed by the accessor methods by altering the contents of this 60 | named logical vector. 61 | } 62 | \item{\code{seqlevelsStyle(x)}, \code{seqlevelsStyle(x) <- value}:}{ 63 | Get or set the seqname style for \code{x}. 64 | See the \link[GenomeInfoDb]{seqlevelsStyle} generic getter and setter 65 | in the \pkg{GenomeInfoDb} package for more information. 66 | } 67 | \item{\code{as.list(x)}:}{ 68 | Dump the entire db into a list of data frames, say \code{txdb_dump}, 69 | that can then be used to recreate the original db with 70 | \code{do.call(txdbmaker::makeTxDb, txdb_dump)} with no loss of 71 | information (except possibly for some of the metadata). 72 | Note that the transcripts are dumped in the same order in all the 73 | data frames. 74 | } 75 | } 76 | } 77 | 78 | \author{Hervé Pagès, Marc Carlson} 79 | 80 | \seealso{ 81 | \itemize{ 82 | \item \code{\link[txdbmaker]{makeTxDbFromUCSC}}, 83 | \code{\link[txdbmaker]{makeTxDbFromBiomart}}, 84 | and \code{\link[txdbmaker]{makeTxDbFromEnsembl}} in 85 | the \pkg{txdbmaker} package for making a \link{TxDb} 86 | object from online resources. 
87 | 88 | \item \code{\link[txdbmaker]{makeTxDbFromGRanges}} and 89 | \code{\link[txdbmaker]{makeTxDbFromGFF}} in the \pkg{txdbmaker} 90 | package for making a \link{TxDb} object from a 91 | \link[GenomicRanges]{GRanges} object, or from a GFF or GTF file. 92 | 93 | \item \code{\link[AnnotationDbi]{saveDb}} and 94 | \code{\link[AnnotationDbi]{loadDb}} in the \pkg{AnnotationDbi} 95 | package for saving and loading a TxDb object as an SQLite file. 96 | 97 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}}, 98 | and \code{\link{transcriptsByOverlaps}}, for extracting 99 | genomic feature locations from a \link{TxDb}-like object. 100 | 101 | \item \code{\link{transcriptLengths}} for extracting the transcript 102 | lengths (and other metrics) from a \link{TxDb} object. 103 | 104 | \item \link[GenomicFeatures]{select-methods} for how to use the 105 | simple "select" interface to extract information from a 106 | TxDb object. 107 | 108 | \item The \link[GenomeInfoDb]{Seqinfo} class in the \pkg{GenomeInfoDb} 109 | package. 
110 | } 111 | } 112 | 113 | \examples{ 114 | txdb_file <- system.file("extdata", "Biomart_Ensembl_sample.sqlite", 115 | package="GenomicFeatures") 116 | txdb <- loadDb(txdb_file) 117 | txdb 118 | 119 | ## Use of seqinfo(): 120 | seqlevelsStyle(txdb) 121 | seqinfo(txdb) 122 | seqlevels(txdb) 123 | seqlengths(txdb) # shortcut for 'seqlengths(seqinfo(txdb))' 124 | isCircular(txdb) # shortcut for 'isCircular(seqinfo(txdb))' 125 | names(which(isCircular(txdb))) 126 | 127 | ## You can set user-supplied seqlevels on 'txdb' to restrict any further 128 | ## operations to a subset of chromosomes: 129 | seqlevels(txdb) <- c("Y", "6") 130 | ## Then you can restore the seqlevels stored in the db: 131 | seqlevels(txdb) <- seqlevels0(txdb) 132 | 133 | ## Use of as.list(): 134 | txdb_dump <- as.list(txdb) 135 | txdb_dump 136 | 137 | library(txdbmaker) # for makeTxDb() 138 | txdb1 <- do.call(makeTxDb, txdb_dump) 139 | stopifnot(identical(as.list(txdb1), txdb_dump)) 140 | } 141 | 142 | \keyword{methods} 143 | \keyword{classes} 144 | -------------------------------------------------------------------------------- /man/as-format-methods.Rd: -------------------------------------------------------------------------------- 1 | \name{as-format-methods} 2 | \alias{asBED,TxDb-method} 3 | \alias{asGFF,TxDb-method} 4 | 5 | \title{Coerce to file format structures} 6 | \description{ 7 | These functions coerce a \code{\linkS4class{TxDb}} object to a 8 | \code{\link[GenomicRanges:GRanges-class]{GRanges}} object with 9 | metadata columns encoding transcript structures according to the 10 | model of a standard file format. Currently, BED and GFF models are 11 | supported. If a \code{TxDb} is passed to 12 | \code{\link[BiocIO]{export}}, when targeting a BED or GFF file, 13 | this coercion occurs automatically. 
14 | } 15 | \usage{ 16 | \S4method{asBED}{TxDb}(x) 17 | \S4method{asGFF}{TxDb}(x) 18 | } 19 | 20 | \arguments{ 21 | \item{x}{ 22 | A \code{TxDb} object to coerce to a \code{GRanges}, 23 | structured as BED or GFF. 24 | } 25 | } 26 | 27 | \value{ 28 | For \code{asBED}, a \code{GRanges}, with the columns \code{name}, 29 | \code{thickStart}, \code{thickEnd}, \code{blockStarts}, 30 | \code{blockSizes} added. The thick regions correspond to the CDS 31 | regions, and the blocks represent the exons. The transcript IDs are 32 | stored in the \code{name} column. The ranges are the transcript bounds. 33 | 34 | For \code{asGFF}, a \code{GRanges}, with columns \code{type}, 35 | \code{Name}, \code{ID}, and \code{Parent}. The gene structures are 36 | expressed according to the conventions defined by the GFF3 spec. There 37 | are elements of each \code{type} of feature: \dQuote{gene}, 38 | \dQuote{mRNA}, \dQuote{exon} and \dQuote{cds}. The \code{Name} column 39 | contains the \code{gene_id} for genes, \code{tx_name} for transcripts, 40 | and \code{NA} for exons and cds regions. The \code{ID} column uses 41 | \code{gene_id} and \code{tx_id}, with the prefixes \dQuote{GeneID} and 42 | \dQuote{TxID} to ensure uniqueness across types. The exons and cds 43 | regions have \code{NA} for \code{ID}. The \code{Parent} column 44 | contains the \code{ID}s of the parent features. A feature may have 45 | multiple parents (the column is a \code{CharacterList}). Each exon 46 | belongs to one or more mRNAs, and mRNAs belong to a gene.
47 | } 48 | 49 | \author{ 50 | Michael Lawrence 51 | } 52 | 53 | \examples{ 54 | txdb_file <- system.file("extdata", "hg19_knownGene_sample.sqlite", 55 | package="GenomicFeatures") 56 | txdb <- loadDb(txdb_file) 57 | 58 | asBED(txdb) 59 | asGFF(txdb) 60 | } 61 | -------------------------------------------------------------------------------- /man/exonicParts.Rd: -------------------------------------------------------------------------------- 1 | \name{exonicParts} 2 | 3 | \alias{tidyTranscripts} 4 | \alias{tidyExons} 5 | \alias{tidyIntrons} 6 | \alias{exonicParts} 7 | \alias{intronicParts} 8 | 9 | \title{ 10 | Extract non-overlapping exonic or intronic parts from a TxDb-like object 11 | } 12 | 13 | \description{ 14 | \code{exonicParts} and \code{intronicParts} extract the non-overlapping 15 | (a.k.a. disjoint) exonic or intronic parts from a \link{TxDb}-like object. 16 | } 17 | 18 | \usage{ 19 | exonicParts(txdb, linked.to.single.gene.only=FALSE) 20 | intronicParts(txdb, linked.to.single.gene.only=FALSE) 21 | 22 | ## 3 helper functions used internally by exonicParts() and intronicParts(): 23 | tidyTranscripts(txdb, drop.geneless=FALSE) 24 | tidyExons(txdb, drop.geneless=FALSE) 25 | tidyIntrons(txdb, drop.geneless=FALSE) 26 | } 27 | 28 | \arguments{ 29 | \item{txdb}{ 30 | A \link{TxDb} object, or any \link{TxDb}-like object that supports the 31 | \code{\link{transcripts}()} and \code{\link{exonsBy}()} extractors 32 | (e.g. an \link[ensembldb]{EnsDb} object). 33 | } 34 | \item{linked.to.single.gene.only}{ 35 | \code{TRUE} or \code{FALSE}. 36 | 37 | If \code{FALSE} (the default), then the disjoint parts are obtained 38 | by calling \code{\link[IRanges]{disjoin}()} on all the exons (or introns) 39 | in \code{txdb}, including on exons (or introns) not linked to a gene or 40 | linked to more than one gene. 
41 | 42 | If \code{TRUE}, then the disjoint parts are obtained in 2 steps: 43 | \enumerate{ 44 | \item call \code{\link[IRanges]{disjoin}()} on the exons (or introns) 45 | linked to \emph{at least one gene}, 46 | 47 | \item then drop the parts linked to more than one gene from 48 | the set of exonic (or intronic) parts obtained previously. 49 | } 50 | } 51 | \item{drop.geneless}{ 52 | If \code{FALSE} (the default), then all the transcripts (or exons, or 53 | introns) get extracted from the \link{TxDb} object. 54 | 55 | If \code{TRUE}, then only the transcripts (or exons, or introns) that 56 | are linked to a gene get extracted from the \link{TxDb} object. 57 | 58 | Note that \code{drop.geneless} also impacts the order in which the 59 | features are returned: 60 | \itemize{ 61 | \item Transcripts: If \code{drop.geneless} is \code{FALSE} then 62 | transcripts are returned in the same order as with 63 | \code{\link{transcripts}}, which is expected to be by 64 | internal transcript id (\code{tx_id}). 65 | Otherwise they are ordered first by gene id (\code{gene_id}), 66 | then by internal transcript id. 67 | \item Exons: If \code{drop.geneless} is \code{FALSE} then exons are 68 | ordered first by internal transcript id (\code{tx_id}), 69 | then by exon rank (\code{exon_rank}). 70 | Otherwise they are ordered first by gene id (\code{gene_id}), 71 | then by internal transcript id, and then by exon rank. 72 | \item Introns: If \code{drop.geneless} is \code{FALSE} then introns 73 | are ordered by internal transcript id (\code{tx_id}). 74 | Otherwise they are ordered first by gene id (\code{gene_id}), 75 | then by internal transcript id. 76 | } 77 | } 78 | } 79 | 80 | \value{ 81 | \code{exonicParts} returns a disjoint and strictly sorted 82 | \link[GenomicRanges]{GRanges} object with 1 range per exonic part 83 | and with metadata columns \code{tx_id}, \code{tx_name}, \code{gene_id}, 84 | \code{exon_id}, \code{exon_name}, and \code{exon_rank}. 
85 | If \code{linked.to.single.gene.only} was set to \code{TRUE}, 86 | an additional \code{exonic_part} metadata column is added that 87 | indicates the rank of each exonic part within all the exonic parts 88 | linked to the same gene. 89 | 90 | \code{intronicParts} returns a disjoint and strictly sorted 91 | \link[GenomicRanges]{GRanges} object with 1 range per intronic part 92 | and with metadata columns \code{tx_id}, \code{tx_name}, and \code{gene_id}. 93 | If \code{linked.to.single.gene.only} was set to \code{TRUE}, 94 | an additional \code{intronic_part} metadata column is added that 95 | indicates the rank of each intronic part within all the intronic parts 96 | linked to the same gene. 97 | 98 | \code{tidyTranscripts} returns a \link[GenomicRanges]{GRanges} object 99 | with 1 range per transcript and with metadata columns \code{tx_id}, 100 | \code{tx_name}, and \code{gene_id}. 101 | 102 | \code{tidyExons} returns a \link[GenomicRanges]{GRanges} object 103 | with 1 range per exon and with metadata columns \code{tx_id}, 104 | \code{tx_name}, \code{gene_id}, \code{exon_id}, \code{exon_name}, 105 | and \code{exon_rank}. 106 | 107 | \code{tidyIntrons} returns a \link[GenomicRanges]{GRanges} object 108 | with 1 range per intron and with metadata columns \code{tx_id}, 109 | \code{tx_name}, and \code{gene_id}. 110 | } 111 | 112 | \author{Hervé Pagès} 113 | 114 | \seealso{ 115 | \itemize{ 116 | \item \code{\link[IRanges]{disjoin}} in the \pkg{IRanges} package. 117 | 118 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}}, 119 | and \code{\link{transcriptsByOverlaps}}, for extracting 120 | genomic feature locations from a \link{TxDb}-like object. 121 | 122 | \item \code{\link{transcriptLengths}} for extracting the transcript 123 | lengths (and other metrics) from a \link{TxDb} object. 124 | 125 | \item \code{\link{extendExonsIntoIntrons}} for extending exons into 126 | their adjacent introns. 
127 | 128 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript 129 | (or CDS) sequences from chromosome sequences. 130 | 131 | \item \code{\link{coverageByTranscript}} for computing coverage by 132 | transcript (or CDS) of a set of ranges. 133 | 134 | \item The \link{TxDb} class. 135 | } 136 | } 137 | 138 | \examples{ 139 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 140 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 141 | 142 | ## --------------------------------------------------------------------- 143 | ## exonicParts() 144 | ## --------------------------------------------------------------------- 145 | 146 | exonic_parts1 <- exonicParts(txdb) 147 | exonic_parts1 148 | 149 | ## Mapping from exonic parts to genes is many-to-many: 150 | gene_id1 <- mcols(exonic_parts1)$gene_id 151 | gene_id1 # CharacterList object 152 | table(lengths(gene_id1)) 153 | ## The number of known genes a Human exonic part can be linked to 154 | ## varies from 0 to 22! 155 | 156 | exonic_parts2 <- exonicParts(txdb, linked.to.single.gene.only=TRUE) 157 | exonic_parts2 158 | 159 | ## Mapping from exonic parts to genes now is many-to-one: 160 | gene_id2 <- mcols(exonic_parts2)$gene_id 161 | gene_id2[1:20] # character vector 162 | 163 | ## Select exonic parts for a given gene: 164 | exonic_parts2[gene_id2 \%in\% "643837"] 165 | 166 | ## Sanity checks: 167 | stopifnot(isDisjoint(exonic_parts1), isStrictlySorted(exonic_parts1)) 168 | stopifnot(isDisjoint(exonic_parts2), isStrictlySorted(exonic_parts2)) 169 | stopifnot(all(exonic_parts2 \%within\% reduce(exonic_parts1))) 170 | stopifnot(identical( 171 | lengths(gene_id1) == 1L, 172 | exonic_parts1 \%within\% exonic_parts2 173 | )) 174 | 175 | ## --------------------------------------------------------------------- 176 | ## intronicParts() 177 | ## --------------------------------------------------------------------- 178 | 179 | intronic_parts1 <- intronicParts(txdb) 180 | intronic_parts1 181 | 182 | ## Mapping from intronic parts to 
genes is many-to-many: 183 | mcols(intronic_parts1)$gene_id 184 | table(lengths(mcols(intronic_parts1)$gene_id)) 185 | ## A Human intronic part can be linked to 0 to 22 known genes! 186 | 187 | intronic_parts2 <- intronicParts(txdb, linked.to.single.gene.only=TRUE) 188 | intronic_parts2 189 | 190 | ## Mapping from intronic parts to genes now is many-to-one: 191 | class(mcols(intronic_parts2)$gene_id) # character vector 192 | 193 | ## Sanity checks: 194 | stopifnot(isDisjoint(intronic_parts1), isStrictlySorted(intronic_parts1)) 195 | stopifnot(isDisjoint(intronic_parts2), isStrictlySorted(intronic_parts2)) 196 | stopifnot(all(intronic_parts2 \%within\% reduce(intronic_parts1))) 197 | stopifnot(identical( 198 | lengths(mcols(intronic_parts1)$gene_id) == 1L, 199 | intronic_parts1 \%within\% intronic_parts2 200 | )) 201 | 202 | ## --------------------------------------------------------------------- 203 | ## Helper functions 204 | ## --------------------------------------------------------------------- 205 | 206 | tidyTranscripts(txdb) # Ordered by 'tx_id'. 207 | tidyTranscripts(txdb, drop.geneless=TRUE) # Ordered first by 'gene_id', 208 | # then by 'tx_id'. 209 | 210 | tidyExons(txdb) # Ordered first by 'tx_id', 211 | # then by 'exon_rank'. 212 | tidyExons(txdb, drop.geneless=TRUE) # Ordered first by 'gene_id', 213 | # then by 'tx_id', 214 | # then by 'exon_rank'. 215 | 216 | tidyIntrons(txdb) # Ordered by 'tx_id'. 217 | tidyIntrons(txdb, drop.geneless=TRUE) # Ordered first by 'gene_id', 218 | # then by 'tx_id'. 
219 | } 220 | 221 | \keyword{manip} 222 | -------------------------------------------------------------------------------- /man/extendExonsIntoIntrons.Rd: -------------------------------------------------------------------------------- 1 | \name{extendExonsIntoIntrons} 2 | 3 | \alias{extendExonsIntoIntrons} 4 | 5 | \title{ 6 | Extend exons by a given number of bases into their adjacent introns 7 | } 8 | 9 | \description{ 10 | \code{extendExonsIntoIntrons} extends the supplied exons by a given 11 | number of bases into their adjacent introns. 12 | } 13 | 14 | \usage{ 15 | extendExonsIntoIntrons(ex_by_tx, extent=2) 16 | } 17 | 18 | \arguments{ 19 | \item{ex_by_tx}{ 20 | A \link[GenomicRanges]{GRangesList} object containing exons grouped 21 | by transcript. This must be an object as returned by 22 | \code{\link{exonsBy}(txdb, by="tx")}, that is: 23 | \itemize{ 24 | \item each list element in \code{ex_by_tx} must be a 25 | \link[GenomicRanges]{GRanges} object representing the 26 | exons of a given transcript; 27 | \item the exons in each list element must be ordered by ascending 28 | rank with respect to their transcript. 29 | } 30 | } 31 | \item{extent}{ 32 | Size of the extent in number of bases. 2 by default. 33 | 34 | The first exon in a transcript will be extended by that amount on 35 | its 3' side only. The last exon in a transcript will be extended by 36 | that amount on its 5' side only. All other exons (i.e. intermediate 37 | exons) will be extended by that amount on \emph{each} side. 38 | 39 | Note that exons that belong to a single-exon transcript don't get 40 | extended. 41 | 42 | The default value of 2 corresponds to inclusion of the donor/acceptor 43 | intronic regions (typically GT/AG). 44 | } 45 | } 46 | 47 | \value{ 48 | A copy of \link[GenomicRanges]{GRangesList} object \code{ex_by_tx} 49 | where the original exon ranges have been extended. 50 | 51 | Names and metadata columns on \code{ex_by_tx} are propagated to the 52 | result. 
53 | } 54 | 55 | \author{Hervé Pagès} 56 | 57 | \seealso{ 58 | \itemize{ 59 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}}, 60 | and \code{\link{transcriptsByOverlaps}}, for extracting 61 | genomic feature locations from a \link{TxDb}-like object. 62 | 63 | \item \code{\link{exonicParts}} and \code{\link{intronicParts}} for 64 | extracting non-overlapping exonic or intronic parts from a 65 | TxDb-like object. 66 | 67 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript 68 | (or CDS) sequences from chromosome sequences. 69 | 70 | \item The \link{TxDb} class. 71 | } 72 | } 73 | 74 | \examples{ 75 | ## With toy transcripts: 76 | ex_by_tx <- GRangesList( 77 | TX1="chr1:10-20:+", 78 | TX2=c("chr1:10-20:+", "chr1:50-75:+"), 79 | TX3=c("chr1:10-20:+", "chr1:50-75:+", "chr1:100-120:+"), 80 | TX4="chr1:10-20:-", 81 | TX5=c("chr1:10-20:-", "chr1:50-75:-"), 82 | TX6=c("chr1:10-20:-", "chr1:50-75:-", "chr1:100-120:-") 83 | ) 84 | 85 | extended <- extendExonsIntoIntrons(ex_by_tx, extent=2) 86 | extended[1:3] 87 | extended[4:6] 88 | 89 | ## With real-world transcripts: 90 | library(TxDb.Celegans.UCSC.ce11.ensGene) 91 | txdb <- TxDb.Celegans.UCSC.ce11.ensGene 92 | ex_by_tx <- exonsBy(txdb, by="tx") 93 | ex_by_tx 94 | 95 | extendExonsIntoIntrons(ex_by_tx, extent=2) 96 | 97 | ## Sanity check: 98 | stopifnot(identical(extendExonsIntoIntrons(ex_by_tx, extent=0), ex_by_tx)) 99 | } 100 | 101 | \keyword{manip} 102 | -------------------------------------------------------------------------------- /man/extractUpstreamSeqs.Rd: -------------------------------------------------------------------------------- 1 | \name{extractUpstreamSeqs} 2 | 3 | \alias{extractUpstreamSeqs} 4 | \alias{extractUpstreamSeqs,GenomicRanges-method} 5 | \alias{extractUpstreamSeqs,TxDb-method} 6 | \alias{extractUpstreamSeqs,GRangesList-method} 7 | 8 | 9 | \title{Extract sequences upstream of a set of genes or transcripts} 10 | 11 | \description{ 12 | \code{extractUpstreamSeqs} is a 
generic function for extracting 13 | sequences upstream of a supplied set of genes or transcripts. 14 | } 15 | 16 | \usage{ 17 | extractUpstreamSeqs(x, genes, width=1000, ...) 18 | 19 | ## Dispatch is on the 2nd argument! 20 | 21 | \S4method{extractUpstreamSeqs}{GenomicRanges}(x, genes, width=1000) 22 | 23 | \S4method{extractUpstreamSeqs}{TxDb}(x, genes, width=1000, exclude.seqlevels=NULL) 24 | } 25 | 26 | \arguments{ 27 | \item{x}{ 28 | An object containing the chromosome sequences from which to extract the 29 | upstream sequences. It can be a \link[BSgenome]{BSgenome}, 30 | \link[rtracklayer]{TwoBitFile}, or \link[Rsamtools]{FaFile} object, 31 | or any \emph{genome sequence container}. 32 | More formally, \code{x} must be an object for which 33 | \code{\link[GenomeInfoDb]{seqinfo}} and \code{\link[Biostrings]{getSeq}} 34 | are defined. 35 | } 36 | \item{genes}{ 37 | An object containing the locations (i.e. chromosome name, start, end, and 38 | strand) of the genes or transcripts with respect to the reference genome. 39 | Only \link[GenomicRanges]{GenomicRanges} and \link{TxDb} objects 40 | are supported at the moment. If the latter, the gene locations are obtained 41 | by calling the \code{\link{genes}} function on the \link{TxDb} 42 | object internally. 43 | } 44 | \item{width}{ 45 | How many bases to extract upstream of each TSS (transcription start site). 46 | } 47 | \item{...}{ 48 | Additional arguments, for use in specific methods. 49 | } 50 | \item{exclude.seqlevels}{ 51 | A character vector containing the chromosome names (a.k.a. sequence levels) 52 | to exclude when the genes are obtained from a \link{TxDb} object. 53 | } 54 | } 55 | 56 | \value{ 57 | A \link[Biostrings]{DNAStringSet} object containing one upstream sequence 58 | per gene (or per transcript if \code{genes} is a 59 | \link[GenomicRanges]{GenomicRanges} object containing transcript ranges). 
60 | 61 | More precisely, if \code{genes} is a \link[GenomicRanges]{GenomicRanges} 62 | object, the returned object is \emph{parallel} to it, that is, the i-th 63 | element in the returned object is the upstream sequence corresponding to 64 | the i-th gene (or transcript) in \code{genes}. Also the names on the 65 | \link[GenomicRanges]{GenomicRanges} object are propagated to the returned 66 | object. 67 | 68 | If \code{genes} is a \link{TxDb} object, the names on the returned 69 | object are the gene IDs found in the \link{TxDb} object. To see the 70 | type of gene IDs (i.e. Entrez gene ID or Ensembl gene ID or ...), you can 71 | display \code{genes} with \code{show(genes)}. 72 | 73 | In addition, the returned object has the following metadata columns 74 | (accessible with \code{\link{mcols}}) that provide some information about 75 | the gene (or transcript) corresponding to each upstream sequence: 76 | \itemize{ 77 | \item \code{gene_seqnames}: the chromosome name of the gene (or 78 | transcript); 79 | \item \code{gene_strand}: the strand of the gene (or transcript); 80 | \item \code{gene_TSS}: the transcription start site of the gene (or 81 | transcript). 82 | } 83 | } 84 | 85 | \note{ 86 | IMPORTANT: Always make sure to use a TxDb package (or \link{TxDb} 87 | object) that contains a gene model compatible with the \emph{genome sequence 88 | container} \code{x}, that is, a gene model based on the exact same reference 89 | genome as \code{x}. 90 | 91 | See 92 | \url{http://bioconductor.org/packages/release/BiocViews.html#___TxDb} 93 | for the list of TxDb packages available in the current release of 94 | Bioconductor. 95 | Note that you can make your own custom \link{TxDb} object from 96 | various annotation resources by using one of the \code{makeTxDbFrom*()} 97 | functions defined in the \pkg{txdbmaker} package and listed in 98 | the "See also" section below. 
99 | } 100 | 101 | \author{Hervé Pagès} 102 | 103 | \seealso{ 104 | \itemize{ 105 | \item \code{\link[txdbmaker]{makeTxDbFromUCSC}}, 106 | \code{\link[txdbmaker]{makeTxDbFromBiomart}}, 107 | and \code{\link[txdbmaker]{makeTxDbFromEnsembl}} in 108 | the \pkg{txdbmaker} package for making a \link{TxDb} 109 | object from online resources. 110 | 111 | \item \code{\link[txdbmaker]{makeTxDbFromGRanges}} and 112 | \code{\link[txdbmaker]{makeTxDbFromGFF}} in the \pkg{txdbmaker} 113 | package for making a \link{TxDb} object from a 114 | \link[GenomicRanges]{GRanges} object, or from a GFF or GTF file. 115 | 116 | \item The \code{\link[BSgenome]{available.genomes}} function in the 117 | \pkg{BSgenome} package for checking availability of BSgenome 118 | data packages (and installing the desired one). 119 | 120 | \item The \link[BSgenome]{BSgenome}, \link[rtracklayer]{TwoBitFile}, and 121 | \link[Rsamtools]{FaFile} classes, defined and documented 122 | in the \pkg{BSgenome}, \pkg{rtracklayer}, and \pkg{Rsamtools} 123 | packages, respectively. 124 | 125 | \item The \link{TxDb} class. 126 | 127 | \item The \code{\link{genes}} function for extracting gene ranges from 128 | a \link{TxDb} object. 129 | 130 | \item The \link[GenomicRanges]{GenomicRanges} class defined and documented 131 | in the \pkg{GenomicRanges} package. 132 | 133 | \item The \link[Biostrings]{DNAStringSet} class defined and documented 134 | in the \pkg{Biostrings} package. 135 | 136 | \item The \code{\link[GenomeInfoDb]{seqinfo}} getter defined and documented 137 | in the \pkg{GenomeInfoDb} package. 138 | 139 | \item The \code{\link[Biostrings]{getSeq}} function for extracting 140 | subsequences from a sequence container. 
141 | } 142 | } 143 | 144 | \examples{ 145 | ## Load a genome: 146 | library(BSgenome.Dmelanogaster.UCSC.dm3) 147 | genome <- BSgenome.Dmelanogaster.UCSC.dm3 148 | genome 149 | 150 | ## Use a TxDb object: 151 | library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) 152 | txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene 153 | txdb # contains Ensembl gene IDs 154 | 155 | ## Because the chrU and chrUextra sequences are made of concatenated 156 | ## scaffolds (see https://genome.ucsc.edu/cgi-bin/hgGateway?db=dm3), 157 | ## extracting the upstream sequences for genes located on these 158 | ## scaffolds is not reliable. So we exclude them: 159 | exclude <- c("chrU", "chrUextra") 160 | up1000seqs <- extractUpstreamSeqs(genome, txdb, width=1000, 161 | exclude.seqlevels=exclude) 162 | up1000seqs # the names are Ensembl gene IDs 163 | mcols(up1000seqs) 164 | 165 | ## Upstream sequences for genes close to the chromosome bounds can be 166 | ## shorter than 1000 (note that this does not happen for circular 167 | ## chromosomes like chrM): 168 | table(width(up1000seqs)) 169 | mcols(up1000seqs)[width(up1000seqs) != 1000, ] 170 | } 171 | 172 | \keyword{manip} 173 | -------------------------------------------------------------------------------- /man/features.Rd: -------------------------------------------------------------------------------- 1 | \name{features} 2 | 3 | \alias{features} 4 | \alias{features,FeatureDb-method} 5 | 6 | \title{ 7 | Extract simple features from a FeatureDb object 8 | } 9 | 10 | \description{ 11 | WARNING: The FeatureDb/makeFeatureDbFromUCSC/features code base is 12 | no longer actively maintained and FeatureDb-related functionalities 13 | might get deprecated in the near future. Please use 14 | \code{\link{makeFeatureDbFromUCSC}} for a convenient way to 15 | import transcript annotations from UCSC online resources into 16 | Bioconductor. 17 | 18 | Generic function to extract genomic features from a FeatureDb object. 
19 | } 20 | 21 | \usage{ 22 | features(x) 23 | \S4method{features}{FeatureDb}(x) 24 | } 25 | 26 | \arguments{ 27 | \item{x}{ 28 | A \link{FeatureDb} object. 29 | } 30 | } 31 | 32 | 33 | \value{ a GRanges object } 34 | 35 | \author{ 36 | M. Carlson 37 | } 38 | 39 | \seealso{ 40 | \link{FeatureDb} 41 | } 42 | 43 | \examples{ 44 | fdb <- loadDb(system.file("extdata", "FeatureDb.sqlite", 45 | package="GenomicFeatures")) 46 | features(fdb) 47 | } 48 | -------------------------------------------------------------------------------- /man/getPromoterSeq-methods.Rd: -------------------------------------------------------------------------------- 1 | \name{getPromoterSeq} 2 | 3 | \alias{getPromoterSeq} 4 | \alias{getTerminatorSeq} 5 | \alias{getPromoterSeq,GRanges-method} 6 | \alias{getTerminatorSeq,GRanges-method} 7 | \alias{getPromoterSeq,GRangesList-method} 8 | \alias{getTerminatorSeq,GRangesList-method} 9 | 10 | \title{Get gene promoter or terminator sequences} 11 | 12 | \description{ 13 | Extract promoter or terminator sequences for the genes or transcripts 14 | specified in the query (a \link{GRanges} or \link{GRangesList} object) 15 | from a \link[BSgenome]{BSgenome} or \link[Rsamtools]{FaFile} object. 16 | } 17 | 18 | \usage{ 19 | \S4method{getPromoterSeq}{GRanges}(query, subject, upstream=2000, downstream=200) 20 | \S4method{getTerminatorSeq}{GRanges}(query, subject, upstream=2000, downstream=200) 21 | 22 | \S4method{getPromoterSeq}{GRangesList}(query, subject, upstream=2000, downstream=200) 23 | \S4method{getTerminatorSeq}{GRangesList}(query, subject, upstream=2000, downstream=200) 24 | } 25 | 26 | \arguments{ 27 | \item{query}{A \link[GenomicRanges]{GRanges} or 28 | \link[GenomicRanges]{GRangesList} object containing genes grouped by 29 | transcript. 
30 | } 31 | \item{subject}{A \link[BSgenome]{BSgenome} or \link[Rsamtools]{FaFile} object from which 32 | the sequences will be taken.} 33 | \item{upstream}{The number of DNA bases to include upstream of the TSS (transcription start site)} 34 | \item{downstream}{The number of DNA bases to include downstream of the TSS (transcription start site)} 35 | } 36 | 37 | \details{ 38 | \code{getPromoterSeq} and \code{getTerminatorSeq} are generic functions 39 | dispatching on query, which is either a GRanges or a GRangesList. 40 | They are convenience wrappers for the \code{promoters}, \code{terminators}, 41 | and \code{getSeq} functions. 42 | The purpose is to allow sequence extraction from either a 43 | \link[BSgenome]{BSgenome} or \link[Rsamtools]{FaFile} object. 44 | 45 | Default values for \code{upstream} and \code{downstream} were chosen based 46 | on our current understanding of gene regulation. On average, promoter 47 | regions in the mammalian genome are 5000 bp upstream and downstream of the 48 | transcription start site. 49 | } 50 | 51 | \value{ 52 | A \link[Biostrings]{DNAStringSet} or 53 | \link[Biostrings]{DNAStringSetList} instance corresponding to the 54 | GRanges or GRangesList supplied in the query. 55 | } 56 | 57 | \author{Paul Shannon} 58 | 59 | \seealso{ 60 | \itemize{ 61 | \item The \code{\link[GenomicRanges]{promoters}} man page in the 62 | \pkg{GenomicRanges} package for the \code{promoters()} and 63 | \code{terminators()} methods for \link[GenomicRanges]{GenomicRanges} 64 | objects. 65 | 66 | \item \code{\link[Biostrings]{getSeq}} in the \pkg{Biostrings} 67 | package for extracting a set of sequences from a sequence 68 | container like a \link[BSgenome]{BSgenome} or 69 | \link[Rsamtools]{FaFile} object. 
70 | } 71 | } 72 | 73 | \examples{ 74 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 75 | library(BSgenome.Hsapiens.UCSC.hg19) 76 | 77 | 78 | ## A GRangesList object describing all the known Human transcripts grouped 79 | ## by gene: 80 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 81 | tx_by_gene <- transcriptsBy(txdb, by="gene") 82 | 83 | e2f3 <- "1871" # entrez geneID for a cell cycle control transcription 84 | # factor, chr6 on the plus strand 85 | 86 | ## A GRanges object describing the three transcripts for gene 1871: 87 | e2f3_tx <- tx_by_gene[[e2f3]] 88 | 89 | ## Promoter sequences for gene 1871: 90 | e2f3_promoter_seqs <- getPromoterSeq(e2f3_tx, Hsapiens, 91 | upstream=40, downstream=15) 92 | e2f3_promoter_seqs 93 | 94 | mcols(e2f3_promoter_seqs) 95 | 96 | ## Terminator sequences for gene 1871: 97 | e2f3_terminator_seqs <- getTerminatorSeq(e2f3_tx, Hsapiens, 98 | upstream=25, downstream=10) 99 | 100 | e2f3_terminator_seqs 101 | 102 | mcols(e2f3_terminator_seqs) # same as 'mcols(e2f3_promoter_seqs)' 103 | 104 | ## All Human promoter sequences grouped by gene: 105 | getPromoterSeq(tx_by_gene, Hsapiens, upstream=6, downstream=4) 106 | } 107 | 108 | \keyword{methods} 109 | \keyword{manip} 110 | -------------------------------------------------------------------------------- /man/id2name.Rd: -------------------------------------------------------------------------------- 1 | \name{id2name} 2 | 3 | \alias{id2name} 4 | 5 | \title{ 6 | Map internal ids to external names for a given feature type 7 | } 8 | \description{ 9 | Utility function for retrieving the mapping from the internal ids 10 | to the external names of a given feature type. 
11 | } 12 | \usage{ 13 | id2name(txdb, feature.type=c("tx", "exon", "cds")) 14 | } 15 | \arguments{ 16 | \item{txdb}{A \link{TxDb} object.} 17 | \item{feature.type}{The feature type for which the mapping must be 18 | retrieved.} 19 | } 20 | \details{ 21 | Transcripts, exons and CDS parts in a \link{TxDb} object are 22 | stored in separate tables where the primary key is an integer 23 | called \emph{feature internal id}. This id is stored in the 24 | \code{"tx_id"} column for transcripts, in the \code{"exon_id"} 25 | column for exons, and in the \code{"cds_id"} column for CDS parts. 26 | Unlike other commonly used ids like Entrez Gene IDs or Ensembl IDs, 27 | this internal id was generated at the time the \link{TxDb} 28 | object was created and has no meaning outside the scope of this object. 29 | 30 | The \code{id2name} function can be used to translate this internal 31 | id into a more informative id or name called \emph{feature external 32 | name}. This name is stored in the \code{"tx_name"} column for 33 | transcripts, in the \code{"exon_name"} column for exons, and in 34 | the \code{"cds_name"} column for CDS parts. 35 | 36 | Note that, unlike the feature internal id, the feature external 37 | name is not guaranteed to be unique or even defined (the column 38 | can contain \code{NA}s). 39 | } 40 | 41 | \value{ 42 | A named character vector where the names are the internal ids and the 43 | values the external names. 44 | } 45 | 46 | \author{Hervé Pagès} 47 | 48 | \seealso{ 49 | \itemize{ 50 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}}, 51 | and \code{\link{transcriptsByOverlaps}}, for how to extract 52 | genomic features from a \link{TxDb} object. 53 | \item The \link{TxDb} class. 
54 | } 55 | } 56 | \examples{ 57 | txdb1_file <- system.file("extdata", "hg19_knownGene_sample.sqlite", 58 | package="GenomicFeatures") 59 | txdb1 <- loadDb(txdb1_file) 60 | id2name(txdb1, feature.type="tx")[1:4] 61 | id2name(txdb1, feature.type="exon")[1:4] 62 | id2name(txdb1, feature.type="cds")[1:4] 63 | 64 | txdb2_file <- system.file("extdata", "Biomart_Ensembl_sample.sqlite", 65 | package="GenomicFeatures") 66 | txdb2 <- loadDb(txdb2_file) 67 | id2name(txdb2, feature.type="tx")[1:4] 68 | id2name(txdb2, feature.type="exon")[1:4] 69 | id2name(txdb2, feature.type="cds")[1:4] 70 | } 71 | -------------------------------------------------------------------------------- /man/makeFeatureDbFromUCSC.Rd: -------------------------------------------------------------------------------- 1 | \name{makeFeatureDbFromUCSC} 2 | 3 | \alias{supportedUCSCFeatureDbTracks} 4 | \alias{supportedUCSCFeatureDbTables} 5 | \alias{UCSCFeatureDbTableSchema} 6 | \alias{makeFeatureDbFromUCSC} 7 | 8 | \title{ 9 | [Moved to txdbmaker] Make a FeatureDb object from annotations available 10 | at the UCSC Genome Browser 11 | } 12 | 13 | \description{ 14 | IMPORTANT NOTE: Starting with BioC 3.19, functions 15 | \code{supportedUCSCFeatureDbTracks()}, \code{supportedUCSCFeatureDbTables()}, 16 | \code{UCSCFeatureDbTableSchema()}, and \code{makeFeatureDbFromUCSC()} are 17 | defined in the \pkg{txdbmaker} package. 18 | } 19 | 20 | \seealso{ 21 | \code{txdbmaker::\link[txdbmaker]{supportedUCSCFeatureDbTracks}}, 22 | \code{txdbmaker::\link[txdbmaker]{supportedUCSCFeatureDbTables}}, 23 | \code{txdbmaker::\link[txdbmaker]{UCSCFeatureDbTableSchema}}, 24 | and \code{txdbmaker::\link[txdbmaker]{makeFeatureDbFromUCSC}} 25 | in the \pkg{txdbmaker} package. 
26 | } 27 | 28 | -------------------------------------------------------------------------------- /man/makeTxDb.Rd: -------------------------------------------------------------------------------- 1 | \name{makeTxDb} 2 | 3 | \alias{makeTxDb} 4 | 5 | \title{ 6 | [Moved to txdbmaker] Make a TxDb object from user supplied annotations 7 | } 8 | 9 | \description{ 10 | IMPORTANT NOTE: Starting with BioC 3.19, the \code{makeTxDb} function 11 | is defined in the \pkg{txdbmaker} package. 12 | } 13 | 14 | \seealso{ 15 | \code{txdbmaker::\link[txdbmaker]{makeTxDb}} in the \pkg{txdbmaker} 16 | package. 17 | } 18 | 19 | -------------------------------------------------------------------------------- /man/makeTxDbFromBiomart.Rd: -------------------------------------------------------------------------------- 1 | \name{makeTxDbFromBiomart} 2 | 3 | \alias{makeTxDbFromBiomart} 4 | \alias{getChromInfoFromBiomart} 5 | 6 | \title{ 7 | [Moved to txdbmaker] Make a TxDb object from annotations available 8 | on a BioMart database 9 | } 10 | 11 | \description{ 12 | IMPORTANT NOTE: Starting with BioC 3.19, functions 13 | \code{makeTxDbFromBiomart()} and \code{getChromInfoFromBiomart()} 14 | are defined in the \pkg{txdbmaker} package. 15 | } 16 | 17 | \seealso{ 18 | \code{txdbmaker::\link[txdbmaker]{makeTxDbFromBiomart}} and 19 | \code{txdbmaker::\link[txdbmaker]{getChromInfoFromBiomart}} in 20 | the \pkg{txdbmaker} package. 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/makeTxDbFromEnsembl.Rd: -------------------------------------------------------------------------------- 1 | \name{makeTxDbFromEnsembl} 2 | 3 | \alias{makeTxDbFromEnsembl} 4 | 5 | \title{ 6 | [Moved to txdbmaker] Make a TxDb object from an Ensembl database 7 | } 8 | 9 | \description{ 10 | IMPORTANT NOTE: Starting with BioC 3.19, the \code{makeTxDbFromEnsembl} 11 | function is defined in the \pkg{txdbmaker} package. 
12 | } 13 | 14 | \seealso{ 15 | \code{txdbmaker::\link[txdbmaker]{makeTxDbFromEnsembl}} in the 16 | \pkg{txdbmaker} package. 17 | } 18 | 19 | -------------------------------------------------------------------------------- /man/makeTxDbFromGFF.Rd: -------------------------------------------------------------------------------- 1 | \name{makeTxDbFromGFF} 2 | 3 | \alias{makeTxDbFromGFF} 4 | 5 | \title{ 6 | [Moved to txdbmaker] Make a TxDb object from annotations available 7 | as a GFF3 or GTF file 8 | } 9 | 10 | \description{ 11 | IMPORTANT NOTE: Starting with BioC 3.19, the \code{makeTxDbFromGFF} 12 | function is defined in the \pkg{txdbmaker} package. 13 | } 14 | 15 | \seealso{ 16 | \code{txdbmaker::\link[txdbmaker]{makeTxDbFromGFF}} in the \pkg{txdbmaker} 17 | package. 18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/makeTxDbFromGRanges.Rd: -------------------------------------------------------------------------------- 1 | \name{makeTxDbFromGRanges} 2 | 3 | \alias{makeTxDbFromGRanges} 4 | 5 | \title{[Moved to txdbmaker] Make a TxDb object from a GRanges object} 6 | 7 | \description{ 8 | IMPORTANT NOTE: Starting with BioC 3.19, the \code{makeTxDbFromGRanges} 9 | function is defined in the \pkg{txdbmaker} package. 10 | } 11 | 12 | \seealso{ 13 | \code{txdbmaker::\link[txdbmaker]{makeTxDbFromGRanges}} in the 14 | \pkg{txdbmaker} package. 
15 | } 16 | 17 | -------------------------------------------------------------------------------- /man/makeTxDbFromUCSC.Rd: -------------------------------------------------------------------------------- 1 | \name{makeTxDbFromUCSC} 2 | 3 | \alias{supportedUCSCtables} 4 | \alias{browseUCSCtrack} 5 | \alias{makeTxDbFromUCSC} 6 | 7 | \title{ 8 | [Moved to txdbmaker] Make a TxDb object from annotations available 9 | at the UCSC Genome Browser 10 | } 11 | 12 | \description{ 13 | IMPORTANT NOTE: Starting with BioC 3.19, functions 14 | \code{makeTxDbFromUCSC()}, \code{supportedUCSCtables()}, 15 | and \code{browseUCSCtrack()} are defined in the \pkg{txdbmaker} 16 | package. 17 | } 18 | 19 | \seealso{ 20 | \code{txdbmaker::\link[txdbmaker]{makeTxDbFromUCSC}}, 21 | \code{txdbmaker::\link[txdbmaker]{supportedUCSCtables}}, 22 | and \code{txdbmaker::\link[txdbmaker]{browseUCSCtrack}} 23 | in the \pkg{txdbmaker} package. 24 | } 25 | 26 | -------------------------------------------------------------------------------- /man/makeTxDbPackage.Rd: -------------------------------------------------------------------------------- 1 | \name{makeTxDbPackage} 2 | 3 | \alias{supportedMiRBaseBuildValues} 4 | \alias{makePackageName} 5 | \alias{makeTxDbPackage} 6 | \alias{makeTxDbPackageFromUCSC} 7 | \alias{makeFDbPackageFromUCSC} 8 | \alias{makeTxDbPackageFromBiomart} 9 | 10 | \title{ 11 | [Moved to txdbmaker] Making a TxDb package from annotations available 12 | at the UCSC Genome Browser, biomaRt or from another source. 13 | } 14 | 15 | \description{ 16 | IMPORTANT NOTE: Starting with BioC 3.19, functions 17 | \code{makeTxDbPackageFromUCSC()}, \code{makeFDbPackageFromUCSC()}, 18 | \code{makeTxDbPackageFromBiomart()}, \code{makeTxDbPackage()} 19 | \code{supportedMiRBaseBuildValues()} and \code{makePackageName()} 20 | are defined in the \pkg{txdbmaker} package. 
21 | } 22 | 23 | \seealso{ 24 | \code{txdbmaker::\link[txdbmaker]{makeTxDbPackageFromUCSC}}, 25 | \code{txdbmaker::\link[txdbmaker]{makeFDbPackageFromUCSC}}, 26 | \code{txdbmaker::\link[txdbmaker]{makeTxDbPackageFromBiomart}}, 27 | \code{txdbmaker::\link[txdbmaker]{makeTxDbPackage}}, 28 | \code{txdbmaker::\link[txdbmaker]{supportedMiRBaseBuildValues}}, 29 | and \code{txdbmaker::\link[txdbmaker]{makePackageName}} 30 | in the \pkg{txdbmaker} package. 31 | } 32 | 33 | -------------------------------------------------------------------------------- /man/mapIdsToRanges.Rd: -------------------------------------------------------------------------------- 1 | \docType{methods} 2 | \name{mapIdsToRanges} 3 | \alias{mapIdsToRanges} 4 | \alias{mapIdsToRanges,TxDb-method} 5 | \title{Map IDs to Genomic Ranges} 6 | \usage{ 7 | mapIdsToRanges(x, ...) 8 | 9 | \S4method{mapIdsToRanges}{TxDb}(x, keys, type = c("cds", "exon", "tx", 10 | "gene"), columns = NULL) 11 | } 12 | \arguments{ 13 | \item{x}{Database to use for mapping} 14 | 15 | \item{keys}{Values to lookup, passed to \code{\link{transcripts}} et. 
al.} 16 | 17 | \item{type}{Types of feature to return} 18 | 19 | \item{columns}{Additional metadata columns to include in the output} 20 | 21 | \item{...}{Additional arguments passed to methods} 22 | } 23 | \value{ 24 | \code{\link[GenomicRanges]{GRangesList}} corresponding to the keys 25 | } 26 | \description{ 27 | Map IDs to Genomic Ranges 28 | } 29 | \section{Methods (by class)}{ 30 | \itemize{ 31 | \item \code{TxDb}: TxDb method 32 | }} 33 | \examples{ 34 | library(txdbmaker) # for makeTxDbFromGRanges() 35 | fl <- system.file(package = "GenomicFeatures", "extdata", "sample_ranges.rds") 36 | txdb <- makeTxDbFromGRanges(readRDS(fl)) 37 | 38 | keys <- list(tx_name = c("ENST00000371582", "ENST00000371588", 39 | "ENST00000494752", "ENST00000614008", "ENST00000496771")) 40 | mapIdsToRanges(txdb, keys = keys, type = "tx") 41 | } 42 | -------------------------------------------------------------------------------- /man/mapRangesToIds.Rd: -------------------------------------------------------------------------------- 1 | \docType{methods} 2 | \name{mapRangesToIds} 3 | \alias{mapRangesToIds} 4 | \alias{mapRangesToIds,TxDb-method} 5 | \title{Map Genomic Ranges to IDs} 6 | \usage{ 7 | mapRangesToIds(x, ...) 8 | 9 | \S4method{mapRangesToIds}{TxDb}(x, ranges, type = c("cds", "exon", "tx", 10 | "gene"), columns = NULL, ...) 11 | } 12 | \arguments{ 13 | \item{x}{Database to use for mapping} 14 | 15 | \item{ranges}{range object used to subset} 16 | 17 | \item{type}{of feature to return} 18 | 19 | \item{columns}{additional metadata columns to include in the output.} 20 | 21 | \item{...}{Additional arguments passed to 22 | \code{\link[GenomicRanges]{findOverlaps}}} 23 | } 24 | \value{ 25 | \code{\link[S4Vectors]{DataFrame}} of mcols from the database. 
26 | } 27 | \description{ 28 | Map Genomic Ranges to IDs 29 | } 30 | \section{Methods (by class)}{ 31 | \itemize{ 32 | \item \code{TxDb}: TxDb method 33 | }} 34 | \examples{ 35 | library(txdbmaker) # for makeTxDbFromGRanges() 36 | fl <- system.file(package = "GenomicFeatures", "extdata", "sample_ranges.rds") 37 | txdb <- makeTxDbFromGRanges(readRDS(fl)) 38 | 39 | keys <- list(tx_name = c("ENST00000371582", "ENST00000371588", 40 | "ENST00000494752", "ENST00000614008", "ENST00000496771")) 41 | res <- mapIdsToRanges(txdb, keys = keys, type = "tx") 42 | mapRangesToIds(txdb, res, "tx") 43 | } 44 | -------------------------------------------------------------------------------- /man/nearest-methods.Rd: -------------------------------------------------------------------------------- 1 | \name{nearest-methods} 2 | 3 | \alias{nearest-methods} 4 | 5 | \alias{distance,GenomicRanges,TxDb-method} 6 | 7 | \title{Finding the nearest genomic range neighbor in a TxDb} 8 | 9 | \description{ 10 | A \code{distance()} method for TxDb objects. 11 | } 12 | 13 | \usage{ 14 | \S4method{distance}{GenomicRanges,TxDb}(x, y, ignore.strand=FALSE, 15 | ..., id, type=c("gene", "tx", "exon", "cds")) 16 | } 17 | 18 | \arguments{ 19 | \item{x}{ 20 | The query \link{GenomicRanges} instance. 21 | } 22 | \item{y}{ 23 | A \link{TxDb} object. The \code{id} is used to extract 24 | ranges from the \link{TxDb} which are then used to compute the 25 | distance from \code{x}. 26 | } 27 | \item{id}{ 28 | A \code{character} vector the same length as \code{x}. 29 | The \code{id} must be identifiers in the \link{TxDb} object. 30 | \code{type} indicates what type of identifier \code{id} is. 31 | } 32 | \item{type}{ 33 | A \code{character(1)} describing the \code{id}. 34 | Must be one of \sQuote{gene}, \sQuote{tx}, \sQuote{exon} or 35 | \sQuote{cds}. 36 | } 37 | \item{ignore.strand}{ 38 | A \code{logical} indicating if the strand of the ranges 39 | should be ignored. When \code{TRUE}, strand is set to \code{'+'}. 
40 | } 41 | \item{...}{ 42 | Additional arguments for methods. 43 | } 44 | } 45 | 46 | \details{ 47 | This \code{distance()} method returns the distance for each range in \code{x} 48 | to the range extracted from the \link{TxDb} object \code{y}. Values in 49 | \code{id} are matched to one of \sQuote{gene_id}, \sQuote{tx_id}, 50 | \sQuote{exon_id} or \sQuote{cds_id} identifiers in the \link{TxDb} 51 | and the corresponding ranges are extracted. The \code{type} argument 52 | specifies which identifier is represented in \code{id}. The extracted 53 | ranges are used in the distance calculation with the ranges in \code{x}. 54 | 55 | The method returns \code{NA} values when the genomic region defined 56 | by \code{id} cannot be collapsed into a single range (e.g., 57 | when a gene spans multiple chromosomes) or if the \code{id} 58 | is not found in \code{y}. 59 | 60 | The behavior of \code{distance()} with respect to zero-width ranges 61 | has changed in Bioconductor 2.12. See the man page \code{?distance} 62 | in the \pkg{IRanges} package for details. 63 | } 64 | 65 | \value{ 66 | An integer vector of distances between the ranges in \code{x} and \code{y}. 67 | } 68 | 69 | \author{Valerie Obenchain } 70 | 71 | \seealso{ 72 | \itemize{ 73 | \item \link[IRanges]{nearest-methods} in the \pkg{IRanges} package. 74 | \item \link[GenomicRanges]{nearest-methods} in the \pkg{GenomicRanges} 75 | package. 76 | } 77 | } 78 | 79 | \examples{ 80 | library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) 81 | txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene 82 | gr <- GRanges(c("chr2L", "chr2R"), 83 | IRanges(c(100000, 200000), width=100)) 84 | distance(gr, txdb, id=c("FBgn0259717", "FBgn0261501"), type="gene") 85 | distance(gr, txdb, id=c("10000", "23000"), type="cds") 86 | 87 | ## The id's must be in the appropriate order with respect to 'x'. 88 | distance(gr, txdb, id=c("4", "4097"), type="tx") 89 | 90 | ## 'id' "4" is on chr2L and "4097" is on chr2R. 
91 | transcripts(txdb, filter=list(tx_id=c("4", "4097"))) 92 | 93 | ## If we reverse the 'id' the chromosomes are incompatible with gr. 94 | distance(gr, txdb, id=c("4097", "4"), type="tx") 95 | 96 | ## distance() compares each 'x' to the corresponding 'y'. 97 | ## If an 'id' is not found in the TxDb 'y' will not 98 | ## be the same length as 'x' and an error is thrown. 99 | \dontrun{ 100 | distance(gr, txdb, id=c("FBgn0000008", "INVALID"), type="gene") ## will fail 101 | } 102 | } 103 | 104 | \keyword{utilities} 105 | -------------------------------------------------------------------------------- /man/proteinToGenome.Rd: -------------------------------------------------------------------------------- 1 | \name{proteinToGenome} 2 | 3 | \alias{proteinToGenome} 4 | \alias{proteinToGenome,GRangesList-method} 5 | \alias{proteinToGenome,ANY-method} 6 | 7 | \title{Map protein-relative coordinates to genomic coordinates} 8 | 9 | \description{ 10 | \code{proteinToGenome} is a generic function for mapping 11 | ranges of protein-relative positions to the genome. 12 | 13 | NOTE: This man page is for the \code{proteinToGenome} S4 generic 14 | function and methods defined in the \pkg{GenomicFeatures} package, 15 | which are (loosely) modeled on the \code{\link[ensembldb]{proteinToGenome}} 16 | function from the \pkg{ensembldb} package. 17 | See \code{?ensembldb::\link[ensembldb]{proteinToGenome}} for the latter. 18 | } 19 | 20 | \usage{ 21 | ## S4 generic function: 22 | proteinToGenome(x, db, ...) # dispatch is on 2nd argument 'db' 23 | 24 | \S4method{proteinToGenome}{ANY}(x, db) 25 | 26 | \S4method{proteinToGenome}{GRangesList}(x, db) 27 | } 28 | 29 | \arguments{ 30 | \item{x}{ 31 | A named \link[IRanges]{IRanges} object (or derivative) containing ranges 32 | of \emph{protein-relative positions} (protein-relative positions are 33 | positions relative to a protein sequence). 34 | 35 | The names on \code{x} must be transcript names present in \code{db}. 
36 | More precisely, for the default \code{proteinToGenome()} method, 37 | \code{names(x)} must be a subset of: 38 | \preformatted{ mcols(transcripts(db, columns="tx_name"))$tx_name 39 | } 40 | And for the method for \link[GenomicRanges]{GRangesList} objects, 41 | \code{names(x)} must be a subset of: 42 | \preformatted{ names(db) 43 | } 44 | } 45 | \item{db}{ 46 | For the default \code{proteinToGenome()} method: A \link{TxDb} 47 | object or any object that supports \code{\link{transcripts}()} 48 | and \code{\link{cdsBy}()} (e.g. an \link[ensembldb]{EnsDb} object 49 | from the \pkg{ensembldb} package). 50 | 51 | For the method for \link[GenomicRanges]{GRangesList} objects: 52 | A named \link[GenomicRanges]{GRangesList} object (or derivative) 53 | where each list element is a \link[GenomicRanges]{GRanges} object 54 | representing a CDS (the ranges in the \link[GenomicRanges]{GRanges} 55 | object must represent the CDS parts ordered by ascending exon rank). 56 | } 57 | \item{...}{ 58 | Further arguments to be passed to specific methods. 59 | } 60 | } 61 | 62 | \details{ 63 | The \code{proteinToGenome()} method for \link[GenomicRanges]{GRangesList} 64 | objects is the workhorse behind the default method. Note that the latter 65 | is a thin wrapper around the former, which simply does the following: 66 | \enumerate{ 67 | \item Use \code{\link{cdsBy}()} to extract the CDS parts from \code{db}. 68 | The CDS parts are returned in a \link[GenomicRanges]{GRangesList} 69 | object that has the names of the transcript on it (one transcript 70 | name per list element). 71 | \item Call \code{proteinToGenome()} on \code{x} and the 72 | \link[GenomicRanges]{GRangesList} object returned by 73 | \code{\link{cdsBy}()}. 74 | } 75 | } 76 | 77 | \value{ 78 | A named \link[GenomicRanges]{GRangesList} object \emph{parallel} to 79 | \code{x} (the transcript names on \code{x} are propagated). 
80 | The i-th list element in the returned object is the result of mapping 81 | the range of protein-relative positions \code{x[i]} to the genome. 82 | 83 | Note that a given range in \code{x} can only be mapped to the genome 84 | if the name on it is the name of a \emph{coding} transcript. If it's 85 | not (i.e. if it's the name of a \emph{non-coding} transcript), then 86 | an empty \link[GenomicRanges]{GRanges} object is placed in the returned 87 | object to indicate the impossible mapping, and a warning is issued. 88 | 89 | Otherwise, if a given range in \code{x} can be mapped to the 90 | genome, then the result of the mapping is represented by a 91 | non-empty \link[GenomicRanges]{GRanges} object. 92 | Note that this object represents the original CDS associated to 93 | \code{x}, trimmed on its 5' end or 3' end, or on both. 94 | Furthermore, this object will have the same metadata columns as the 95 | \link[GenomicRanges]{GRanges} object representing the original CDS, 96 | plus the 2 following ones: 97 | \itemize{ 98 | \item \code{protein_start}: The protein-relative start of the mapping. 99 | \item \code{protein_end}: The protein-relative end of the mapping. 100 | } 101 | } 102 | 103 | \note{ 104 | Unlike \code{ensembldb::\link[ensembldb]{proteinToGenome}()} which 105 | can work either with Ensembl protein IDs or Ensembl transcript IDs 106 | on \code{x}, the default \code{proteinToGenome()} method described 107 | above only accepts \emph{transcript names} on \code{x}. 108 | 109 | This means that, if the user is in possession of protein IDs, they 110 | must first replace them with the corresponding transcript IDs (referred 111 | to as \emph{transcript names} in the context of \link{TxDb} objects). 112 | How to do this exactly depends on the origin of those IDs (UCSC, 113 | Ensembl, GTF/GFF3 file, FlyBase, etc...) 114 | } 115 | 116 | \author{H. 
Pagès, using \code{ensembldb::proteinToGenome()} for 117 | inspiration and design.} 118 | 119 | \seealso{ 120 | \itemize{ 121 | \item The \code{\link[ensembldb]{proteinToGenome}} function in the 122 | \pkg{ensembldb} package, which the \code{proteinToGenome()} 123 | generic and methods documented in this man page are (loosely) 124 | modeled on. 125 | 126 | \item \link{TxDb} objects. 127 | 128 | \item \link[ensembldb]{EnsDb} objects (\link{TxDb}-like objects) in 129 | the \pkg{ensembldb} package. 130 | 131 | \item \code{\link{transcripts}} for extracting transcripts from a 132 | \link{TxDb}-like object. 133 | 134 | \item \code{\link{cdsBy}} for extracting CDS parts from a 135 | \link{TxDb}-like object. 136 | 137 | \item \link[IRanges]{IRanges} objects in the \pkg{IRanges} package. 138 | 139 | \item \link[GenomicRanges]{GRanges} and \link[GenomicRanges]{GRangesList} 140 | objects in the \pkg{GenomicRanges} package. 141 | } 142 | } 143 | 144 | \examples{ 145 | ## --------------------------------------------------------------------- 146 | ## USING TOY CDS 147 | ## --------------------------------------------------------------------- 148 | 149 | ## CDS1 has 2 CDS parts: 150 | CDS1 <- GRanges(c("chrX:11-60:+", "chrX:101-125:+")) 151 | 152 | ## CDS2 has 3 CDS parts: 153 | CDS2 <- GRanges(c("chrY:201-230:-", "chrY:101-125:-", "chrY:11-60:-")) 154 | 155 | ## Put them in a GRangesList object: 156 | cds_by_tx <- GRangesList(TX1=CDS1, TX2=CDS2) 157 | cds_by_tx 158 | 159 | x1 <- IRanges(start=8, end=20, names="TX1") 160 | proteinToGenome(x1, cds_by_tx) 161 | 162 | x2 <- IRanges(start=c(1, 18), end=c(25, 20), names=c("TX1", "TX1")) 163 | x2 164 | proteinToGenome(x2, cds_by_tx) 165 | 166 | x3 <- IRanges(start=8, end=15, names="TX2") 167 | proteinToGenome(x3, cds_by_tx) 168 | 169 | x4 <- c(x3, x2) 170 | x4 171 | proteinToGenome(x4, cds_by_tx) 172 | 173 | ## --------------------------------------------------------------------- 174 | ## USING A TxDb OBJECT 175 | ## 
--------------------------------------------------------------------- 176 | library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) 177 | txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene 178 | 179 | ## The first transcript (FBtr0309810) is non-coding: 180 | x <- IRanges(c(FBtr0309810="11-55", FBtr0306539="90-300")) 181 | res <- proteinToGenome(x, txdb) 182 | res 183 | } 184 | 185 | \keyword{methods} 186 | \keyword{utilities} 187 | -------------------------------------------------------------------------------- /man/select-methods.Rd: -------------------------------------------------------------------------------- 1 | \name{select-methods} 2 | 3 | \alias{select-methods} 4 | 5 | \alias{columns,TxDb-method} 6 | \alias{keytypes,TxDb-method} 7 | \alias{keys,TxDb-method} 8 | \alias{select,TxDb-method} 9 | 10 | \title{Using the "select" interface on TxDb objects} 11 | 12 | \description{ 13 | \code{select}, \code{columns} and \code{keys} can be used together to 14 | extract data from a \link{TxDb} object. 15 | } 16 | 17 | \details{ 18 | In the code snippets below, \code{x} is a \link{TxDb} object. 19 | 20 | \describe{ 21 | \item{\code{keytypes(x)}:}{ 22 | Allows the user to discover which keytypes can be passed to 23 | \code{select} or \code{keys} via the \code{keytype} argument. 24 | } 25 | \item{\code{keys(x, keytype, pattern, column, fuzzy)}:}{ Return keys for 26 | the database contained in the \link{TxDb} object. 27 | 28 | The \code{keytype} argument specifies the kind of keys that will 29 | be returned. By default \code{keys} will return the "GENEID" keys 30 | for the database. 31 | 32 | If \code{keys} is used with \code{pattern}, it will pattern match 33 | on the \code{keytype}. 34 | 35 | But if the \code{column} argument is also provided along with the 36 | \code{pattern} argument, then \code{pattern} will be matched 37 | against the values in \code{column} instead.
38 | 39 | And if \code{keys} is called with \code{column} and no 40 | \code{pattern} argument, then it will return all keys that have 41 | corresponding values in the \code{column} argument. 42 | 43 | Thus, the behavior of \code{keys} all depends on how many arguments are 44 | specified. 45 | 46 | Use of the \code{fuzzy} argument will toggle fuzzy matching to 47 | TRUE or FALSE. If \code{pattern} is not used, \code{fuzzy} is ignored. 48 | } 49 | \item{\code{columns(x)}:}{ 50 | Show which kinds of data can be returned for the 51 | \link{TxDb} object. 52 | } 53 | \item{\code{select(x, keys, columns, keytype)}:}{ 54 | When all the appropriate arguments are specified \code{select} 55 | will retrieve the matching data as a data.frame based on 56 | parameters for selected \code{keys} and \code{columns} and 57 | \code{keytype} arguments. 58 | } 59 | } 60 | 61 | } 62 | 63 | 64 | 65 | \author{Marc Carlson} 66 | 67 | \seealso{ 68 | \itemize{ 69 | \item \link[AnnotationDbi]{AnnotationDb-class} for a more detailed description 70 | of the methods \code{select}, \code{keytypes}, \code{keys} and \code{columns}. 71 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}}, 72 | and \code{\link{transcriptsByOverlaps}}, for other ways to 73 | extract genomic features from a \link{TxDb} object. 74 | \item The \link{TxDb} class.
75 | } 76 | } 77 | 78 | \examples{ 79 | txdb_file <- system.file("extdata", "Biomart_Ensembl_sample.sqlite", 80 | package="GenomicFeatures") 81 | txdb <- loadDb(txdb_file) 82 | txdb 83 | 84 | ## find key types 85 | keytypes(txdb) 86 | 87 | ## list IDs that can be used to filter 88 | head(keys(txdb, "GENEID")) 89 | head(keys(txdb, "TXID")) 90 | head(keys(txdb, "TXNAME")) 91 | 92 | ## list columns that can be returned by select 93 | columns(txdb) 94 | 95 | ## call select 96 | res <- select(txdb, head(keys(txdb, "GENEID")), 97 | columns=c("GENEID","TXNAME"), 98 | keytype="GENEID") 99 | head(res) 100 | } 101 | 102 | \keyword{methods} 103 | -------------------------------------------------------------------------------- /man/tRNAs.Rd: -------------------------------------------------------------------------------- 1 | \name{tRNAs} 2 | 3 | \alias{microRNAs} 4 | \alias{tRNAs} 5 | \alias{tRNAs,TxDb-method} 6 | 7 | \title{ 8 | Extract tRNA genomic ranges from an object 9 | } 10 | 11 | \description{ 12 | WARNING: The code base for \code{tRNAs()} is no longer actively 13 | maintained and the function might get deprecated in the near future. 14 | 15 | The \code{tRNAs()} function extracts tRNA genomic ranges from a 16 | \link{TxDb} object. 17 | } 18 | 19 | \usage{ 20 | tRNAs(x) 21 | } 22 | 23 | \arguments{ 24 | \item{x}{ 25 | A \link{TxDb} object. 26 | } 27 | } 28 | 29 | \value{ 30 | A \link[GenomicRanges]{GRanges} object. 31 | } 32 | 33 | \author{ 34 | M. Carlson 35 | } 36 | 37 | \seealso{ 38 | \itemize{ 39 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}}, and 40 | \code{\link{transcriptsByOverlaps}} for the core genomic features 41 | extractors. 42 | \item The \link{TxDb} class. 
43 | } 44 | } 45 | 46 | \keyword{methods} 47 | -------------------------------------------------------------------------------- /man/transcriptLengths.Rd: -------------------------------------------------------------------------------- 1 | \name{transcriptLengths} 2 | 3 | \alias{transcriptLengths} 4 | 5 | 6 | \title{Extract the transcript lengths (and other metrics) from a TxDb object} 7 | 8 | \description{ 9 | The \code{transcriptLengths} function extracts the transcript lengths from 10 | a \link{TxDb} object. It also returns the CDS and UTR lengths for each 11 | transcript if the user requested them. 12 | } 13 | 14 | \usage{ 15 | transcriptLengths(txdb, with.cds_len=FALSE, 16 | with.utr5_len=FALSE, with.utr3_len=FALSE, ...) 17 | } 18 | 19 | \arguments{ 20 | \item{txdb}{ 21 | A \link{TxDb} object. 22 | } 23 | \item{with.cds_len, with.utr5_len, with.utr3_len}{ 24 | \code{TRUE} or \code{FALSE}. Whether or not to also extract and return 25 | the CDS, 5' UTR, and 3' UTR lengths for each transcript. 26 | } 27 | \item{\dots}{ 28 | Additional arguments used by \code{transcripts} and other accessor 29 | functions. 30 | } 31 | } 32 | 33 | \details{ 34 | All the lengths are counted in number of nucleotides. 35 | 36 | The length of a processed transcript is just the sum of the lengths of its 37 | exons. This should not be confused with the length of the stretch of DNA 38 | transcribed into RNA (a.k.a. transcription unit), which can be obtained 39 | with \code{width(transcripts(txdb))}. 40 | } 41 | 42 | \value{ 43 | A data frame with 1 row per transcript. The rows are guaranteed to be in 44 | the same order as the elements of the \link[GenomicRanges]{GRanges} object 45 | returned by \code{\link{transcripts}(txdb)}. 46 | The data frame has between 5 and 8 columns, depending on what the user 47 | requested via the \code{with.cds_len}, \code{with.utr5_len}, and 48 | \code{with.utr3_len} arguments.
49 | 50 | The first 3 columns are the same as the metadata columns of the object 51 | returned by 52 | \preformatted{ transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id")) 53 | } 54 | that is: 55 | \itemize{ 56 | \item \code{tx_id}: The internal transcript ID. This ID is unique within 57 | the scope of the \link{TxDb} object. It is not an official or public 58 | ID (like an Ensembl or FlyBase ID) or an accession number, so it 59 | cannot be used to look up the transcript in public databases or in 60 | other \link{TxDb} objects. Furthermore, this ID could change when 61 | re-running the code that was used to make the \link{TxDb} object. 62 | \item \code{tx_name}: An official/public transcript name or ID that can 63 | be used to look up the transcript in public databases or in other 64 | \link{TxDb} objects. This column is not guaranteed to contain unique 65 | values and it can contain NAs. 66 | \item \code{gene_id}: The official/public ID of the gene that the 67 | transcript belongs to. Can be NA if the gene is unknown or if the 68 | transcript is not considered to belong to a gene. 69 | } 70 | 71 | The other columns are quantitative: 72 | \itemize{ 73 | \item \code{nexon}: The number of exons in the transcript. 74 | \item \code{tx_len}: The length of the processed transcript. 75 | \item \code{cds_len}: [optional] The length of the CDS region of the 76 | processed transcript. 77 | \item \code{utr5_len}: [optional] The length of the 5' UTR region of 78 | the processed transcript. 79 | \item \code{utr3_len}: [optional] The length of the 3' UTR region of 80 | the processed transcript. 81 | } 82 | } 83 | 84 | \author{Hervé Pagès} 85 | 86 | \seealso{ 87 | \itemize{ 88 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}}, 89 | and \code{\link{transcriptsByOverlaps}}, for extracting 90 | genomic feature locations from a \link{TxDb}-like object.
91 | 92 | \item \code{\link{exonicParts}} and \code{\link{intronicParts}} for 93 | extracting non-overlapping exonic or intronic parts from a 94 | TxDb-like object. 95 | 96 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript 97 | (or CDS) sequences from chromosome sequences. 98 | 99 | \item \code{\link{coverageByTranscript}} for computing coverage by 100 | transcript (or CDS) of a set of ranges. 101 | 102 | \item \code{\link[txdbmaker]{makeTxDbFromUCSC}}, 103 | \code{\link[txdbmaker]{makeTxDbFromBiomart}}, 104 | and \code{\link[txdbmaker]{makeTxDbFromEnsembl}} in 105 | the \pkg{txdbmaker} package for making a \link{TxDb} 106 | object from online resources. 107 | 108 | \item \code{\link[txdbmaker]{makeTxDbFromGRanges}} and 109 | \code{\link[txdbmaker]{makeTxDbFromGFF}} in the \pkg{txdbmaker} 110 | package for making a \link{TxDb} object from a 111 | \link[GenomicRanges]{GRanges} object, or from a GFF or GTF file. 112 | 113 | \item The \link{TxDb} class. 114 | } 115 | } 116 | 117 | \examples{ 118 | library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) 119 | txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene 120 | dm3_txlens <- transcriptLengths(txdb) 121 | head(dm3_txlens) 122 | 123 | dm3_txlens <- transcriptLengths(txdb, with.cds_len=TRUE, 124 | with.utr5_len=TRUE, 125 | with.utr3_len=TRUE) 126 | head(dm3_txlens) 127 | 128 | ## When cds_len is 0 (non-coding transcript), utr5_len and utr3_len 129 | ## must also be 0: 130 | non_coding <- dm3_txlens[dm3_txlens$cds_len == 0, ] 131 | stopifnot(all(non_coding[6:8] == 0)) 132 | 133 | ## When cds_len is not 0 (coding transcript), cds_len + utr5_len + 134 | ## utr3_len must be equal to tx_len: 135 | coding <- dm3_txlens[dm3_txlens$cds_len != 0, ] 136 | stopifnot(all(rowSums(coding[6:8]) == coding[[5]])) 137 | 138 | ## A sanity check: 139 | stopifnot(identical(dm3_txlens$tx_id, mcols(transcripts(txdb))$tx_id)) 140 | } 141 | 142 | \keyword{manip} 143 | 
-------------------------------------------------------------------------------- /man/transcriptLocs2refLocs.Rd: -------------------------------------------------------------------------------- 1 | \name{transcriptLocs2refLocs} 2 | 3 | \alias{transcriptWidths} 4 | \alias{transcriptLocs2refLocs} 5 | 6 | \title{Converting transcript-based locations into reference-based locations} 7 | 8 | \description{ 9 | \code{transcriptLocs2refLocs} converts transcript-based 10 | locations into reference-based (aka chromosome-based or genomic) 11 | locations. 12 | 13 | \code{transcriptWidths} computes the lengths of the transcripts 14 | (called the "widths" in this context) based on the boundaries 15 | of their exons. 16 | } 17 | 18 | \usage{ 19 | transcriptLocs2refLocs(tlocs, 20 | exonStarts=list(), exonEnds=list(), strand=character(0), 21 | decreasing.rank.on.minus.strand=FALSE, error.if.out.of.bounds=TRUE) 22 | 23 | transcriptWidths(exonStarts=list(), exonEnds=list()) 24 | } 25 | 26 | \arguments{ 27 | \item{tlocs}{ 28 | A list of integer vectors of the same length as \code{exonStarts} 29 | and \code{exonEnds}. Each element in \code{tlocs} must contain 30 | transcript-based locations. 31 | } 32 | \item{exonStarts, exonEnds}{ 33 | The starts and ends of the exons, respectively. 34 | 35 | Each argument can be a list of integer vectors, 36 | an \link[IRanges]{IntegerList} object, 37 | or a character vector where each element is a 38 | comma-separated list of integers. 39 | In addition, the lists represented by \code{exonStarts} 40 | and \code{exonEnds} must have the same shape i.e. 41 | have the same lengths and have elements of the same lengths. 42 | The length of \code{exonStarts} and \code{exonEnds} 43 | is the number of transcripts. 44 | } 45 | \item{strand}{ 46 | A character vector of the same length as \code{exonStarts} and 47 | \code{exonEnds} specifying the strand (\code{"+"} or \code{"-"}) 48 | from which the transcript is coming. 
49 | } 50 | \item{decreasing.rank.on.minus.strand}{ 51 | \code{TRUE} or \code{FALSE}. 52 | Describes the order of exons in transcripts located on the minus strand: 53 | are they ordered by increasing (default) or decreasing rank? 54 | } 55 | \item{error.if.out.of.bounds}{ 56 | \code{TRUE} or \code{FALSE}. 57 | Controls how out of bound \code{tlocs} are handled: an error is thrown 58 | (default) or \code{NA} is returned. 59 | } 60 | } 61 | 62 | \value{ 63 | For \code{transcriptLocs2refLocs}: A list of integer vectors of the same 64 | shape as \code{tlocs}. 65 | 66 | For \code{transcriptWidths}: An integer vector with one element per 67 | transcript. 68 | } 69 | 70 | \author{Hervé Pagès} 71 | 72 | \seealso{ 73 | \itemize{ 74 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript 75 | (or CDS) sequences from chromosomes. 76 | 77 | \item \code{\link{coverageByTranscript}} for computing coverage by 78 | transcript (or CDS) of a set of ranges. 79 | } 80 | } 81 | 82 | \examples{ 83 | ## --------------------------------------------------------------------- 84 | ## WITH A SMALL SET OF HUMAN TRANSCRIPTS 85 | ## --------------------------------------------------------------------- 86 | txdb_file <- system.file("extdata", "hg19_knownGene_sample.sqlite", 87 | package="GenomicFeatures") 88 | txdb <- loadDb(txdb_file) 89 | ex_by_tx <- exonsBy(txdb, by="tx", use.names=TRUE) 90 | genome <- BSgenome::getBSgenome("hg19") # load the hg19 genome 91 | tx_seqs <- extractTranscriptSeqs(genome, ex_by_tx) 92 | 93 | ## Get the reference-based locations of the first 4 (5' end) 94 | ## and last 4 (3' end) nucleotides in each transcript: 95 | tlocs <- lapply(width(tx_seqs), function(w) c(1:4, (w-3):w)) 96 | tx_strand <- sapply(strand(ex_by_tx), runValue) 97 | 98 | ## Note that, because of how we made them, 'tlocs', 'start(ex_by_tx)', 99 | ## 'end(ex_by_tx)' and 'tx_strand' are "parallel" objects i.e. 
they 100 | ## have the same length, and, for any valid positional index, the elements 101 | ## at this position correspond to each other. This is how 102 | ## transcriptLocs2refLocs() expects them to be! 103 | rlocs <- transcriptLocs2refLocs(tlocs, 104 | start(ex_by_tx), end(ex_by_tx), 105 | tx_strand, decreasing.rank.on.minus.strand=TRUE) 106 | 107 | ## --------------------------------------------------------------------- 108 | ## WITH TWO WORM TRANSCRIPTS: ZC101.3.1 AND F37B1.1.1 109 | ## --------------------------------------------------------------------- 110 | library(TxDb.Celegans.UCSC.ce11.ensGene) 111 | txdb <- TxDb.Celegans.UCSC.ce11.ensGene 112 | my_tx_names <- c("ZC101.3.1", "F37B1.1.1") 113 | ## Both transcripts are on chromosome II, the first one on its positive 114 | ## strand and the second one on its negative strand: 115 | my_tx <- transcripts(txdb, filter=list(tx_name=my_tx_names)) 116 | my_tx 117 | 118 | ## Using transcripts stored in a GRangesList object: 119 | ex_by_tx <- exonsBy(txdb, use.names=TRUE)[my_tx_names] 120 | genome <- BSgenome::getBSgenome("ce11") # load the ce11 genome 121 | tx_seqs <- extractTranscriptSeqs(genome, ex_by_tx) 122 | tx_seqs 123 | 124 | ## Since the 2 transcripts are on the same chromosome, an alternative 125 | ## is to store them in an IRangesList object and use that object with 126 | ## extractTranscriptSeqs(): 127 | ex_by_tx2 <- ranges(ex_by_tx) 128 | tx_seqs2 <- extractTranscriptSeqs(genome$chrII, ex_by_tx2, 129 | strand=strand(my_tx)) 130 | stopifnot(identical(as.character(tx_seqs), as.character(tx_seqs2))) 131 | 132 | ## Store exon starts and ends in two IntegerList objects for use with 133 | ## transcriptWidths() and transcriptLocs2refLocs(): 134 | exon_starts <- start(ex_by_tx) 135 | exon_ends <- end(ex_by_tx) 136 | 137 | ## Same as 'width(tx_seqs)': 138 | transcriptWidths(exonStarts=exon_starts, exonEnds=exon_ends) 139 | 140 | transcriptLocs2refLocs(list(c(1:2, 202:205, 1687:1688), 141 | c(1:2, 193:196, 721:722)),
142 | exonStarts=exon_starts, 143 | exonEnds=exon_ends, 144 | strand=c("+","-")) 145 | 146 | ## A sanity check: 147 | ref_locs <- transcriptLocs2refLocs(list(1:1688, 1:722), 148 | exonStarts=exon_starts, 149 | exonEnds=exon_ends, 150 | strand=c("+","-")) 151 | stopifnot(genome$chrII[ref_locs[[1]]] == tx_seqs[[1]]) 152 | stopifnot(complement(genome$chrII)[ref_locs[[2]]] == tx_seqs[[2]]) 153 | } 154 | 155 | \keyword{manip} 156 | -------------------------------------------------------------------------------- /man/transcriptsBy.Rd: -------------------------------------------------------------------------------- 1 | \name{transcriptsBy} 2 | 3 | \alias{transcriptsBy} 4 | \alias{transcriptsBy,TxDb-method} 5 | \alias{exonsBy} 6 | \alias{exonsBy,TxDb-method} 7 | \alias{cdsBy} 8 | \alias{cdsBy,TxDb-method} 9 | \alias{intronsByTranscript} 10 | \alias{intronsByTranscript,TxDb-method} 11 | \alias{fiveUTRsByTranscript} 12 | \alias{fiveUTRsByTranscript,TxDb-method} 13 | \alias{threeUTRsByTranscript} 14 | \alias{threeUTRsByTranscript,TxDb-method} 15 | 16 | \title{ 17 | Extract and group genomic features of a given type from a TxDb-like object 18 | } 19 | \description{ 20 | Generic functions to extract genomic features of a given type 21 | grouped based on another type of genomic feature. 22 | This page documents the methods for \link{TxDb} objects only. 23 | } 24 | \usage{ 25 | transcriptsBy(x, by=c("gene", "exon", "cds"), ...) 26 | \S4method{transcriptsBy}{TxDb}(x, by=c("gene", "exon", "cds"), use.names=FALSE) 27 | 28 | exonsBy(x, by=c("tx", "gene"), ...) 29 | \S4method{exonsBy}{TxDb}(x, by=c("tx", "gene"), use.names=FALSE) 30 | 31 | cdsBy(x, by=c("tx", "gene"), ...) 32 | \S4method{cdsBy}{TxDb}(x, by=c("tx", "gene"), use.names=FALSE) 33 | 34 | intronsByTranscript(x, ...) 35 | \S4method{intronsByTranscript}{TxDb}(x, use.names=FALSE) 36 | 37 | fiveUTRsByTranscript(x, ...) 38 | \S4method{fiveUTRsByTranscript}{TxDb}(x, use.names=FALSE) 39 | 40 | threeUTRsByTranscript(x, ...) 
41 | \S4method{threeUTRsByTranscript}{TxDb}(x, use.names=FALSE) 42 | } 43 | \arguments{ 44 | \item{x}{A \link{TxDb} object.} 45 | \item{...}{Arguments to be passed to or from methods.} 46 | \item{by}{One of \code{"gene"}, \code{"exon"}, \code{"cds"} or \code{"tx"}. 47 | Determines the grouping.} 48 | \item{use.names}{Controls how to set the names of the returned 49 | \link[GenomicRanges]{GRangesList} object. 50 | These functions return all the features of a given type (e.g. 51 | all the exons) grouped by another feature type (e.g. grouped by 52 | transcript) in a \link[GenomicRanges]{GRangesList} object. 53 | By default (i.e. if \code{use.names} is \code{FALSE}), the 54 | names of this \link[GenomicRanges]{GRangesList} object 55 | (aka the group names) are the internal ids of the features 56 | used for grouping (aka the grouping features), which are 57 | guaranteed to be unique. 58 | If \code{use.names} is \code{TRUE}, then the names of the 59 | grouping features are used instead of their internal ids. 60 | For example, when grouping by transcript (\code{by="tx"}), 61 | the default group names are the transcript internal ids 62 | (\code{"tx_id"}). But, if \code{use.names=TRUE}, the group 63 | names are the transcript names (\code{"tx_name"}). 64 | Note that, unlike the feature ids, the feature names are not 65 | guaranteed to be unique or even defined (they could be all 66 | \code{NA}s). A warning is issued when this happens. 67 | See \code{?\link{id2name}} for more information about 68 | feature internal ids and feature external names and how 69 | to map the former to the latter. 70 | 71 | Finally, \code{use.names=TRUE} cannot be used when grouping 72 | by gene (\code{by="gene"}). This is because, unlike for the 73 | other features, the gene ids are external ids (e.g. Entrez 74 | Gene or Ensembl ids) so the db doesn't have a \code{"gene_name"} 75 | column for storing alternate gene names.
76 | } 77 | } 78 | \details{ 79 | These functions return a \link[GenomicRanges]{GRangesList} object 80 | where the ranges within each of the elements are ordered according 81 | to the following rule: 82 | 83 | When using \code{exonsBy} or \code{cdsBy} with \code{by="tx"}, 84 | the returned exons or CDS parts are ordered by ascending rank for 85 | each transcript, that is, by their position in the transcript. 86 | In all other cases, the ranges will be ordered by chromosome, strand, 87 | start, and end values. 88 | } 89 | \value{A \link[GenomicRanges]{GRangesList} object.} 90 | \author{ 91 | M. Carlson, P. Aboyoun and H. Pagès 92 | } 93 | \seealso{ 94 | \itemize{ 95 | \item \code{\link{transcripts}} and \code{\link{transcriptsByOverlaps}} 96 | for more ways to extract genomic features 97 | from a \link{TxDb}-like object. 98 | 99 | \item \code{\link{transcriptLengths}} for extracting the transcript 100 | lengths (and other metrics) from a \link{TxDb} object. 101 | 102 | \item \code{\link{exonicParts}} and \code{\link{intronicParts}} for 103 | extracting non-overlapping exonic or intronic parts from a 104 | TxDb-like object. 105 | 106 | \item \code{\link{extendExonsIntoIntrons}} for extending exons 107 | into their adjacent introns. 108 | 109 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript 110 | (or CDS) sequences from chromosome sequences. 111 | 112 | \item \code{\link{coverageByTranscript}} for computing coverage by 113 | transcript (or CDS) of a set of ranges. 114 | 115 | \item \link[GenomicFeatures]{select-methods} for how to use the 116 | simple "select" interface to extract information from a 117 | \link{TxDb} object. 118 | 119 | \item \code{\link{id2name}} for mapping \link{TxDb} internal ids 120 | to external names for a given feature type. 121 | 122 | \item The \link{TxDb} class. 
123 | } 124 | } 125 | \examples{ 126 | txdb_file <- system.file("extdata", "hg19_knownGene_sample.sqlite", 127 | package="GenomicFeatures") 128 | txdb <- loadDb(txdb_file) 129 | 130 | ## Extract the transcripts grouped by gene: 131 | transcriptsBy(txdb, "gene") 132 | 133 | ## Extract the exons grouped by gene: 134 | exonsBy(txdb, "gene") 135 | 136 | ## Extract the CDS parts grouped by transcript: 137 | cds_by_tx0 <- cdsBy(txdb, "tx") 138 | ## With more informative group names: 139 | cds_by_tx1 <- cdsBy(txdb, "tx", use.names=TRUE) 140 | ## Note that 'cds_by_tx1' can also be obtained with: 141 | names(cds_by_tx0) <- id2name(txdb, feature.type="tx")[names(cds_by_tx0)] 142 | stopifnot(identical(cds_by_tx0, cds_by_tx1)) 143 | 144 | ## Extract the introns grouped by transcript: 145 | intronsByTranscript(txdb) 146 | 147 | ## Extract the 5' UTRs grouped by transcript: 148 | fiveUTRsByTranscript(txdb) 149 | fiveUTRsByTranscript(txdb, use.names=TRUE) # more informative group names 150 | } 151 | 152 | \keyword{methods} 153 | -------------------------------------------------------------------------------- /man/transcriptsByOverlaps.Rd: -------------------------------------------------------------------------------- 1 | \name{transcriptsByOverlaps} 2 | 3 | \alias{transcriptsByOverlaps} 4 | \alias{transcriptsByOverlaps,TxDb-method} 5 | \alias{exonsByOverlaps} 6 | \alias{exonsByOverlaps,TxDb-method} 7 | \alias{cdsByOverlaps} 8 | \alias{cdsByOverlaps,TxDb-method} 9 | 10 | \title{ 11 | Extract genomic features from a TxDb-like object based on their 12 | genomic location 13 | } 14 | \description{ 15 | Generic functions to extract genomic features for specified genomic 16 | locations. 17 | This page documents the methods for \link{TxDb} objects only. 18 | } 19 | \usage{ 20 | transcriptsByOverlaps(x, ranges, 21 | maxgap = -1L, minoverlap = 0L, 22 | type = c("any", "start", "end"), ...) 
23 | \S4method{transcriptsByOverlaps}{TxDb}(x, ranges, 24 | maxgap = -1L, minoverlap = 0L, 25 | type = c("any", "start", "end"), 26 | columns = c("tx_id", "tx_name")) 27 | 28 | exonsByOverlaps(x, ranges, 29 | maxgap = -1L, minoverlap = 0L, 30 | type = c("any", "start", "end"), ...) 31 | \S4method{exonsByOverlaps}{TxDb}(x, ranges, 32 | maxgap = -1L, minoverlap = 0L, 33 | type = c("any", "start", "end"), 34 | columns = "exon_id") 35 | 36 | cdsByOverlaps(x, ranges, 37 | maxgap = -1L, minoverlap = 0L, 38 | type = c("any", "start", "end"), ...) 39 | \S4method{cdsByOverlaps}{TxDb}(x, ranges, 40 | maxgap = -1L, minoverlap = 0L, 41 | type = c("any", "start", "end"), 42 | columns = "cds_id") 43 | } 44 | \arguments{ 45 | \item{x}{A \link{TxDb} object.} 46 | \item{ranges}{A \link[GenomicRanges]{GRanges} object to restrict the output.} 47 | \item{maxgap,minoverlap,type}{ 48 | Used in the internal call to \code{findOverlaps()} to detect overlaps. 49 | See \code{?\link[IRanges]{findOverlaps}} in the \pkg{IRanges} package 50 | for a description of these arguments. 51 | } 52 | \item{...}{Arguments to be passed to or from methods.} 53 | \item{columns}{Columns to include in the output. 54 | See \code{?\link{transcripts}} for the possible values.} 55 | } 56 | \details{ 57 | These functions subset the results of \code{\link{transcripts}}, 58 | \code{\link{exons}}, and \code{\link{cds}} function calls 59 | using the results of \code{\link[IRanges]{findOverlaps}} 60 | calls based on the specified \code{ranges}. 61 | } 62 | \value{A \link[GenomicRanges]{GRanges} object.} 63 | \author{ 64 | P. Aboyoun 65 | } 66 | \seealso{ 67 | \itemize{ 68 | \item \code{\link{transcripts}} and \code{\link{transcriptsBy}} 69 | for more ways to extract genomic features 70 | from a \link{TxDb}-like object. 71 | 72 | \item \code{\link{transcriptLengths}} for extracting the transcript 73 | lengths (and other metrics) from a \link{TxDb} object.
74 | 75 | \item \code{\link{exonicParts}} and \code{\link{intronicParts}} for 76 | extracting non-overlapping exonic or intronic parts from a 77 | TxDb-like object. 78 | 79 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript 80 | (or CDS) sequences from chromosome sequences. 81 | 82 | \item \code{\link{coverageByTranscript}} for computing coverage by 83 | transcript (or CDS) of a set of ranges. 84 | 85 | \item \link[GenomicFeatures]{select-methods} for how to use the 86 | simple "select" interface to extract information from a 87 | \link{TxDb} object. 88 | 89 | \item \code{\link{id2name}} for mapping \link{TxDb} internal ids 90 | to external names for a given feature type. 91 | 92 | \item The \link{TxDb} class. 93 | } 94 | } 95 | \examples{ 96 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite", 97 | package="GenomicFeatures")) 98 | gr <- GRanges(Rle("chr1", 2), 99 | IRanges(c(500,10500), c(10000,30000)), 100 | strand = Rle("-", 2)) 101 | transcriptsByOverlaps(txdb, gr) 102 | } 103 | 104 | \keyword{methods} 105 | -------------------------------------------------------------------------------- /tests/run_unitTests.R: -------------------------------------------------------------------------------- 1 | require("GenomicFeatures") || stop("unable to load GenomicFeatures package") 2 | GenomicFeatures:::.test() 3 | --------------------------------------------------------------------------------