├── .BBSoptions
├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── NEWS
├── R
├── FeatureDb-class.R
├── TxDb-SELECT-helpers.R
├── TxDb-class.R
├── TxDb-schema.R
├── coordinate-mapping-methods.R
├── coverageByTranscript.R
├── exonicParts.R
├── extendExonsIntoIntrons.R
├── extractTranscriptSeqs.R
├── extractUpstreamSeqs.R
├── features.R
├── getPromoterSeq-methods.R
├── id2name.R
├── makeFeatureDbFromUCSC.R
├── makeTxDb.R
├── makeTxDbFromBiomart.R
├── makeTxDbFromEnsembl.R
├── makeTxDbFromGFF.R
├── makeTxDbFromGRanges.R
├── makeTxDbFromUCSC.R
├── makeTxDbPackage.R
├── mapIdsToRanges.R
├── nearest-methods.R
├── proteinToGenome.R
├── select-methods.R
├── tRNAs.R
├── transcriptLengths.R
├── transcriptLocs2refLocs.R
├── transcripts.R
├── transcriptsBy.R
├── transcriptsByOverlaps.R
├── utils.R
└── zzz.R
├── README.md
├── TODO
├── inst
├── CITATION
├── extdata
│ ├── Biomart_Ensembl_sample.sqlite
│ ├── FeatureDb.sqlite
│ ├── ITAG4.1_gene_models.subset.gff
│ ├── cD.exByEdge-SG-Vig.Rda
│ ├── cD.exsByGenes-SG-Vig.Rda
│ ├── events.Rda
│ ├── hg19_knownGene_sample.sqlite
│ └── sample_ranges.rds
├── script
│ └── README
└── unitTests
│ ├── test_TxDb_seqinfo.R
│ ├── test_coordinate-mapping-methods.R
│ ├── test_exonicParts.R
│ ├── test_getPromoterSeq-methods.R
│ ├── test_makeIdsForUniqueDataFrameRows.R
│ ├── test_mapIdsToRanges.R
│ ├── test_nearest-methods.R
│ ├── test_select-methods.R
│ ├── test_transcriptLengths.R
│ ├── test_transcripts.R
│ ├── test_transcriptsBy.R
│ └── test_transcriptsByOverlaps.R
├── man
├── FeatureDb-class.Rd
├── TxDb-class.Rd
├── as-format-methods.Rd
├── coordinate-mapping-methods.Rd
├── coverageByTranscript.Rd
├── exonicParts.Rd
├── extendExonsIntoIntrons.Rd
├── extractTranscriptSeqs.Rd
├── extractUpstreamSeqs.Rd
├── features.Rd
├── getPromoterSeq-methods.Rd
├── id2name.Rd
├── makeFeatureDbFromUCSC.Rd
├── makeTxDb.Rd
├── makeTxDbFromBiomart.Rd
├── makeTxDbFromEnsembl.Rd
├── makeTxDbFromGFF.Rd
├── makeTxDbFromGRanges.Rd
├── makeTxDbFromUCSC.Rd
├── makeTxDbPackage.Rd
├── mapIdsToRanges.Rd
├── mapRangesToIds.Rd
├── nearest-methods.Rd
├── proteinToGenome.Rd
├── select-methods.Rd
├── tRNAs.Rd
├── transcriptLengths.Rd
├── transcriptLocs2refLocs.Rd
├── transcripts.Rd
├── transcriptsBy.Rd
└── transcriptsByOverlaps.Rd
├── tests
└── run_unitTests.R
└── vignettes
└── GenomicFeatures.Rmd
/.BBSoptions:
--------------------------------------------------------------------------------
1 | RunLongTests: TRUE
2 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | .BBSoptions
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.Rhistory
2 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: GenomicFeatures
2 | Title: Query the gene models of a given organism/assembly
3 | Description: Extract the genomic locations of genes, transcripts, exons,
4 | introns, and CDS, for the gene models stored in a TxDb object.
5 | A TxDb object is a small database that contains the gene models of
6 | a given organism/assembly. Bioconductor provides a small collection
7 | of TxDb objects in the form of ready-to-install TxDb packages for
8 | the most commonly studied organisms. Additionally, the user can
9 | easily make a TxDb object (or package) for the organism/assembly
10 | of their choice by using the tools from the txdbmaker package.
11 | biocViews: Genetics, Infrastructure, Annotation, Sequencing,
12 | GenomeAnnotation
13 | URL: https://bioconductor.org/packages/GenomicFeatures
14 | BugReports: https://github.com/Bioconductor/GenomicFeatures/issues
15 | Version: 1.61.3
16 | License: Artistic-2.0
17 | Encoding: UTF-8
18 | Authors@R: c(
19 | person("M.", "Carlson", role="aut"),
20 | person("H.", "Pagès", role=c("aut", "cre"),
21 | email="hpages.on.github@gmail.com"),
22 | person("P.", "Aboyoun", role="aut"),
23 | person("S.", "Falcon", role="aut"),
24 | person("M.", "Morgan", role="aut"),
25 | person("D.", "Sarkar", role="aut"),
26 | person("M.", "Lawrence", role="aut"),
27 | person("V.", "Obenchain", role="aut"),
28 | person("S.", "Arora", role="ctb"),
29 | person("J.", "MacDonald", role="ctb"),
30 | person("M.", "Ramos", role="ctb"),
31 | person("S.", "Saini", role="ctb"),
32 | person("P.", "Shannon", role="ctb"),
33 | person("L.", "Shepherd", role="ctb"),
34 | person("D.", "Tenenbaum", role="ctb"),
35 | person("D.", "Van Twisk", role="ctb"))
36 | Depends: BiocGenerics (>= 0.51.2), S4Vectors (>= 0.17.29),
37 | IRanges (>= 2.37.1), GenomeInfoDb (>= 1.35.8),
38 | GenomicRanges (>= 1.55.2), AnnotationDbi (>= 1.41.4)
39 | Imports: methods, utils, stats, DBI, XVector, Biostrings, rtracklayer
40 | Suggests: txdbmaker, org.Mm.eg.db, org.Hs.eg.db,
41 | BSgenome, BSgenome.Hsapiens.UCSC.hg19 (>= 1.3.17),
42 | BSgenome.Celegans.UCSC.ce11,
43 | BSgenome.Dmelanogaster.UCSC.dm3 (>= 1.3.17),
44 | FDb.UCSC.tRNAs,
45 | TxDb.Hsapiens.UCSC.hg19.knownGene,
46 | TxDb.Celegans.UCSC.ce11.ensGene,
47 | TxDb.Dmelanogaster.UCSC.dm3.ensGene (>= 2.7.1),
48 | TxDb.Mmusculus.UCSC.mm10.knownGene (>= 3.4.7),
49 | TxDb.Hsapiens.UCSC.hg19.lincRNAsTranscripts,
50 | TxDb.Hsapiens.UCSC.hg38.knownGene (>= 3.4.6),
51 | SNPlocs.Hsapiens.dbSNP144.GRCh38,
52 | Rsamtools, pasillaBamSubset (>= 0.0.5), GenomicAlignments (>= 1.15.7),
53 | ensembldb, AnnotationFilter,
54 | RUnit, BiocStyle, knitr, markdown
55 | VignetteBuilder: knitr
56 | Collate: utils.R
57 | TxDb-schema.R
58 | TxDb-SELECT-helpers.R
59 | TxDb-class.R FeatureDb-class.R
60 | mapIdsToRanges.R
61 | id2name.R
62 | transcripts.R
63 | transcriptsBy.R
64 | transcriptsByOverlaps.R
65 | transcriptLengths.R
66 | exonicParts.R
67 | extendExonsIntoIntrons.R
68 | features.R
69 | tRNAs.R
70 | extractTranscriptSeqs.R
71 | extractUpstreamSeqs.R
72 | getPromoterSeq-methods.R
73 | select-methods.R
74 | nearest-methods.R
75 | transcriptLocs2refLocs.R
76 | coordinate-mapping-methods.R
77 | proteinToGenome.R
78 | coverageByTranscript.R
79 | makeTxDb.R
80 | makeTxDbFromUCSC.R
81 | makeTxDbFromBiomart.R
82 | makeTxDbFromEnsembl.R
83 | makeTxDbFromGRanges.R
84 | makeTxDbFromGFF.R
85 | makeFeatureDbFromUCSC.R
86 | makeTxDbPackage.R
87 | zzz.R
88 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | import(methods)
2 | importFrom(stats, setNames)
3 | importFrom(utils, as.person)
4 |
5 | importMethodsFrom(DBI, dbGetQuery, dbListTables, dbListFields)
6 |
7 | import(AnnotationDbi)
8 | import(BiocGenerics)
9 | import(S4Vectors)
10 | import(IRanges)
11 | import(GenomeInfoDb)
12 | import(XVector)
13 | import(GenomicRanges)
14 |
15 | importClassesFrom(Biostrings, DNAString, DNAStringSet, MaskedDNAString)
16 | importFrom(Biostrings, DNAStringSet, reverseComplement, getSeq)
17 |
18 | importFrom(rtracklayer, asBED, asGFF)
19 |
20 |
21 | exportClasses(TxDb, FeatureDb)
22 |
23 | export(
24 | ## id2name.R:
25 | id2name,
26 |
27 | ## transcripts.R:
28 | transcripts, exons, cds, genes,
29 |
30 | ## transcriptsBy.R:
31 | transcriptsBy,
32 | exonsBy,
33 | cdsBy,
34 | intronsByTranscript,
35 | fiveUTRsByTranscript,
36 | threeUTRsByTranscript,
37 |
38 | ## transcriptsByOverlaps.R:
39 | transcriptsByOverlaps,
40 | exonsByOverlaps,
41 | cdsByOverlaps,
42 |
43 | ## transcriptLengths.R:
44 | transcriptLengths,
45 |
46 | ## exonicParts.R:
47 | tidyTranscripts, tidyExons, tidyIntrons,
48 | exonicParts, intronicParts,
49 |
50 | ## extendExonsIntoIntrons.R:
51 | extendExonsIntoIntrons,
52 |
53 | ## features.R:
54 | features,
55 |
56 | ## tRNAs.R:
57 | microRNAs,
58 | tRNAs,
59 |
60 | ## extractTranscriptSeqs.R:
61 | extractTranscriptSeqs,
62 |
63 | ## extractUpstreamSeqs.R:
64 | extractUpstreamSeqs,
65 |
66 | ## getPromoterSeq-methods.R:
67 | getPromoterSeq, getTerminatorSeq,
68 |
69 | ## transcriptLocs2refLocs.R:
70 | transcriptLocs2refLocs,
71 | transcriptWidths,
72 |
73 | ## coordinate-mapping-methods.R:
74 | mapToTranscripts, pmapToTranscripts,
75 | mapFromTranscripts, pmapFromTranscripts,
76 |
77 | ## proteinToGenome.R:
78 | proteinToGenome,
79 |
80 | ## coverageByTranscript.R:
81 | coverageByTranscript,
82 | pcoverageByTranscript
83 | )
84 |
85 | exportMethods(
86 | organism,
87 | show,
88 | as.list,
89 | seqlevels0, "seqlevels<-", seqinfo,
90 | transcripts, exons, cds, genes,
91 | promoters, terminators,
92 | transcriptsByOverlaps,
93 | exonsByOverlaps,
94 | cdsByOverlaps,
95 | transcriptsBy,
96 | exonsBy,
97 | cdsBy,
98 | intronsByTranscript,
99 | fiveUTRsByTranscript,
100 | threeUTRsByTranscript,
101 | tRNAs,
102 | extractTranscriptSeqs,
103 | extractUpstreamSeqs,
104 | getPromoterSeq, getTerminatorSeq,
105 | isActiveSeq,
106 | "isActiveSeq<-",
107 | asBED, asGFF,
108 | distance,
109 | mapToTranscripts, pmapToTranscripts,
110 | mapFromTranscripts, pmapFromTranscripts,
111 | mapIdsToRanges, mapRangesToIds,
112 | proteinToGenome
113 | )
114 |
115 |
116 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
117 | ### Stuff that has moved to txdbmaker
118 | ###
119 |
120 | export(
121 | ## makeTxDb.R:
122 | makeTxDb,
123 |
124 | ## makeTxDbFromUCSC.R:
125 | supportedUCSCtables,
126 | browseUCSCtrack,
127 | makeTxDbFromUCSC,
128 |
129 | ## makeTxDbFromBiomart.R:
130 | getChromInfoFromBiomart,
131 | makeTxDbFromBiomart,
132 |
133 | ## makeTxDbFromEnsembl.R:
134 | makeTxDbFromEnsembl,
135 |
136 | ## makeTxDbFromGRanges.R:
137 | makeTxDbFromGRanges,
138 |
139 | ## makeTxDbFromGFF.R:
140 | makeTxDbFromGFF,
141 |
142 | ## makeFeatureDbFromUCSC.R:
143 | supportedUCSCFeatureDbTracks,
144 | supportedUCSCFeatureDbTables,
145 | UCSCFeatureDbTableSchema,
146 | makeFeatureDbFromUCSC,
147 |
148 | ## makeTxDbPackage.R:
149 | supportedMiRBaseBuildValues,
150 | makePackageName,
151 | makeTxDbPackage,
152 | makeTxDbPackageFromUCSC,
153 | makeFDbPackageFromUCSC,
154 | makeTxDbPackageFromBiomart
155 | )
156 |
157 |
--------------------------------------------------------------------------------
/R/FeatureDb-class.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### FeatureDb objects
3 | ### -------------------------------------------------------------------------
4 |
5 |
6 | ## This is to try and tidy up before setRefClass()
7 | gc()
8 |
9 | .FeatureDb <-
10 | setRefClass("FeatureDb", contains="AnnotationDb")
11 |
12 |
13 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
14 | ### A low-level accessor (not exported).
15 | ###
16 |
17 | ## featuredbConn <- function(featuredb) featuredb$conn
18 |
19 |
20 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
21 | ### Validity of a FeatureDb object.
22 | ###
23 |
24 | .validate.colnames <- function(conn, colnames)
25 | {
26 | ## even though we don't know the name of the table, take advantage of the
27 | ## fact that there are only two tables and one of them is always called
28 | ## "metadata"
29 | tablenames <- dbListTables(conn)
30 | tablename <- tablenames[!tablenames %in% "metadata"]
31 | AnnotationDbi:::.valid.colnames(conn, tablename, colnames)
32 | }
33 |
34 | .valid.feature.table <- function(conn)
35 | {
36 | ## Restrict column name checking to just columns that we are demanding
37 | colnames <- c("chrom", "strand","chromStart","chromEnd")
38 | msg <- .validate.colnames(conn, colnames)
39 | if (!is.null(msg))
40 | return(msg)
41 | NULL
42 | }
43 |
44 |
45 | .valid.FeatureDb <- function(x)
46 | {
47 | conn <- dbconn(x)
48 | c(AnnotationDbi:::.valid.metadata.table(conn, "Db type",
49 | "FeatureDb"),
50 | .valid.feature.table(conn))
51 | }
52 |
53 |
54 | setValidity2("FeatureDb", .valid.FeatureDb)
55 |
56 |
57 |
58 |
59 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
60 | ### Low-level constructor (not exported).
61 | ###
62 |
63 | FeatureDb <- function(conn)
64 | {
65 | .FeatureDb$new(conn=conn)
66 | }
67 |
68 |
--------------------------------------------------------------------------------
/R/TxDb-SELECT-helpers.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### Helpers for SELECT'ing stuff from a TxDb object
3 | ### -------------------------------------------------------------------------
4 | ###
5 | ### Nothing in this file is exported.
6 | ###
7 |
8 |
9 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
10 | ### Low-level helpers (schema agnostic) for building SQL queries
11 | ###
12 |
13 | .as_qualified <- function(tables, columns) paste(tables, columns, sep=".")
14 |
15 | .tables_in_joins <- function(joins)
16 | {
17 | joins_len <- length(joins)
18 | stopifnot(joins_len %% 2L == 1L)
19 | joins[seq(1L, joins_len, by=2L)]
20 | }
21 |
22 | ### 'join_type' will be recycled to the nb of joins (= length(joins) %/% 2).
23 | .build_SQL_FROM <- function(joins, join_type="INNER")
24 | {
25 | joins_len <- length(joins)
26 | stopifnot(joins_len %% 2L == 1L)
27 | SQL <- joins[[1L]]
28 | if (joins_len == 1L)
29 | return(SQL)
30 | njoin <- joins_len %/% 2L
31 | stopifnot(length(join_type) == 1L || length(join_type) == njoin)
32 | ON_idx <- 2L * seq_len(njoin)
33 | ON <- joins[ON_idx]
34 | Rtables <- joins[ON_idx + 1L]
35 | c(SQL, paste0(join_type, " JOIN ", Rtables, " ON (", ON, ")"))
36 | }
37 |
38 | .build_SQL_FROM_splicing <- function(joins, cds_join_type="LEFT")
39 | {
40 | joins_len <- length(joins)
41 | stopifnot(joins_len %% 2L == 1L)
42 | SQL <- joins[[1L]]
43 | if (joins_len == 1L)
44 | return(SQL)
45 | njoin <- joins_len %/% 2L
46 | join_type <- rep.int("INNER", njoin)
47 | if (joins[[length(joins)]] == "cds")
48 | join_type[[length(join_type)]] <- cds_join_type
49 | paste0(.build_SQL_FROM(joins, join_type), collapse=" ")
50 | }
51 |
52 | .build_SQL_WHERE <- function(filter)
53 | {
54 | if (length(filter) == 0L)
55 | return("")
56 | sql <- lapply(seq_len(length(filter)),
57 | function(i) {
58 | fi <- filter[[i]]
59 | if (!is.numeric(fi))
60 | fi <- paste0("'", fi, "'")
61 | fi <- paste0("(", paste0(fi, collapse=","), ")")
62 | fi <- paste0(names(filter)[i], " IN ", fi)
63 | paste0("(", fi, ")")
64 | })
65 | paste0(unlist(sql), collapse=" AND ")
66 | }
67 |
68 | .build_SQL_SELECT <- function(columns, joins, distinct=FALSE,
69 | filter=list(), orderby=character(0))
70 | {
71 | SQL <- "SELECT"
72 | if (distinct)
73 | SQL <- c(SQL, "DISTINCT")
74 | SQL <- c(SQL, paste0(columns, collapse=", "),
75 | "FROM", .build_SQL_FROM(joins))
76 | if (length(filter) != 0L)
77 | SQL <- c(SQL, "WHERE", .build_SQL_WHERE(filter))
78 | if (length(orderby) != 0L)
79 | SQL <- c(SQL, "ORDER BY", paste0(orderby, collapse=", "))
80 | SQL
81 | }
82 |
83 |
84 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
85 | ### .TXDB_join_tables() and .TXDB_join_splicing_Rtables()
86 | ###
87 |
88 | .TXDB_join_tables <- function(tables)
89 | {
90 | tables <- unique(tables)
91 | if (length(tables) == 1L)
92 | return(tables)
93 | if (any(tables %in% c("exon", "cds")))
94 | tables <- c(tables, "splicing")
95 | ## Order tables & remove duplicates.
96 | join_order <- c("transcript", "splicing", "exon", "cds", "gene")
97 | tables <- intersect(join_order, tables)
98 | joins <- character(2L * length(tables) - 1L)
99 | ON_idx <- 2L * seq_len(length(tables) - 1L)
100 | ON <- sapply(2:length(tables), function(i) {
101 | Rtable <- tables[[i]]
102 | if (Rtable == "exon") {
103 | USING <- "_exon_id"
104 | Ltable <- "splicing"
105 | } else if (Rtable == "cds") {
106 | USING <- "_cds_id"
107 | Ltable <- "splicing"
108 | } else {
109 | USING <- "_tx_id"
110 | Ltable <- tables[[1L]]
111 | }
112 | Lcolumn <- .as_qualified(Ltable, USING)
113 | Rcolumn <- .as_qualified(Rtable, USING)
114 | paste(Lcolumn, Rcolumn, sep="=")
115 | })
116 | joins[ON_idx] <- ON
117 | joins[c(1L, ON_idx + 1L)] <- tables
118 | joins
119 | }
120 |
121 | .TXDB_join_splicing_Rtables <- function(tables=character(0))
122 | {
123 | if (!all(tables %in% TXDB_SPLICING_BUNDLE))
124 | stop("all tables must be in TXDB_SPLICING_BUNDLE")
125 | tables <- c("splicing", tables)
126 | ## Order tables & remove duplicates.
127 | tables <- intersect(TXDB_SPLICING_BUNDLE, tables)
128 | if (length(tables) == 1L)
129 | return(tables)
130 | joins <- character(2L * length(tables) - 1L)
131 | ON_idx <- 2L * seq_len(length(tables) - 1L)
132 | Rtables <- tables[-1L]
133 | USING <- TXDB_SPLICING_JOIN_USING[Rtables]
134 | Lcolumns <- .as_qualified("splicing", USING)
135 | Rcolumns <- .as_qualified(Rtables, USING)
136 | ON <- paste(Lcolumns, Rcolumns, sep="=")
137 | joins[ON_idx] <- ON
138 | joins[c(1L, ON_idx + 1L)] <- tables
139 | joins
140 | }
141 |
142 |
143 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
144 | ### The 2 flexible helpers for SELECT'ing stuff from a TxDb object:
145 | ### - TxDb_SELECT_from_INNER_JOIN()
146 | ### - TxDb_SELECT_from_splicing_bundle()
147 | ### They should satisfy the needs of most extractors defined in the package.
148 | ###
149 |
150 | ### The columns in 'columns' + those involved thru 'filter' and 'orderby' are
151 | ### collected and their corresponding tables are INNER JOIN'ed.
152 | TxDb_SELECT_from_INNER_JOIN <- function(txdb, table, columns, filter=list(),
153 | orderby=character(0))
154 | {
155 | schema_version <- TxDb_schema_version(txdb)
156 | tables <- TXDB_column2table(columns, from_table=table,
157 | schema_version=schema_version)
158 | where_columns <- names(filter)
159 | where_tables <- TXDB_column2table(where_columns, from_table=table,
160 | schema_version=schema_version)
161 | joins <- .TXDB_join_tables(c(table, tables, where_tables))
162 | orderby_tables <- TXDB_column2table(orderby, from_table=table,
163 | schema_version=schema_version)
164 | stopifnot(all(orderby_tables %in% .tables_in_joins(joins)))
165 | use_joins <- length(joins) > 1L
166 | if (use_joins) {
167 | columns <- .as_qualified(tables, columns)
168 | names(filter) <- .as_qualified(where_tables, where_columns)
169 | orderby <- .as_qualified(orderby_tables, orderby)
170 | }
171 | ## .build_SQL_SELECT() uses INNER joins.
172 | SQL <- .build_SQL_SELECT(columns, joins, distinct=use_joins,
173 | filter=filter, orderby=orderby)
174 | queryAnnotationDb(txdb, SQL)
175 | }
176 |
177 | ### Can only involve columns (thru 'columns', 'filter', and 'orderby') that
178 | ### belong to the tables in TXDB_SPLICING_BUNDLE at the moment.
179 | TxDb_SELECT_from_splicing_bundle <- function(txdb, columns,
180 | filter=list(),
181 | orderby=character(0),
182 | cds_join_type="LEFT")
183 | {
184 | schema_version <- TxDb_schema_version(txdb)
185 | tables <- TXDB_column2table(columns, from_table="splicing",
186 | schema_version=schema_version)
187 | where_columns <- names(filter)
188 | where_tables <- TXDB_column2table(where_columns, from_table="splicing",
189 | schema_version=schema_version)
190 | orderby_tables <- TXDB_column2table(orderby, from_table="splicing",
191 | schema_version=schema_version)
192 | joins <- .TXDB_join_splicing_Rtables(c(tables, where_tables,
193 | orderby_tables))
194 | use_joins <- length(joins) > 1L
195 | if (use_joins) {
196 | columns <- .as_qualified(tables, columns)
197 | names(filter) <- .as_qualified(where_tables, where_columns)
198 | orderby <- .as_qualified(orderby_tables, orderby)
199 | }
200 | from <- .build_SQL_FROM_splicing(joins, cds_join_type=cds_join_type)
201 | SQL <- .build_SQL_SELECT(columns, from, distinct=FALSE,
202 | filter=filter, orderby=orderby)
203 | queryAnnotationDb(txdb, SQL)
204 | }
205 |
206 |
207 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
208 | ### Convenience wrappers to the above flexible helpers for SELECT'ing stuff
209 | ### from a given TxDb table
210 | ###
211 |
212 | TxDb_SELECT_from_chrominfo <- function(txdb, filter=list(),
213 | orderby="_chrom_id")
214 | {
215 | schema_version <- TxDb_schema_version(txdb)
216 | columns <- TXDB_table_columns("chrominfo", schema_version=schema_version)
217 | TxDb_SELECT_from_INNER_JOIN(txdb, "chrominfo", columns,
218 | filter=filter, orderby=orderby)
219 | }
220 |
221 | TxDb_SELECT_from_transcript <- function(txdb, filter=list(),
222 | orderby="_tx_id")
223 | {
224 | schema_version <- TxDb_schema_version(txdb)
225 | columns <- TXDB_table_columns("transcript", schema_version=schema_version)
226 | TxDb_SELECT_from_INNER_JOIN(txdb, "transcript", columns,
227 | filter=filter, orderby=orderby)
228 | }
229 |
230 | ### Select rows from the virtual table obtained by joining the "splicing",
231 | ### "exon", and "cds" tables together.
232 | TxDb_SELECT_from_splicings <- function(txdb, filter=list(),
233 | orderby=c("_tx_id", "exon_rank"),
234 | cds_join_type="LEFT")
235 | {
236 | schema_version <- TxDb_schema_version(txdb)
237 | splicing_columns <- TXDB_table_columns("splicing",
238 | schema_version=schema_version)
239 | exon_columns <- TXDB_table_columns("exon", schema_version=schema_version)
240 | cds_columns <- TXDB_table_columns("cds", schema_version=schema_version)
241 | cds_columns <- cds_columns[c("id", "name", "start", "end")]
242 | columns <- unique(c(splicing_columns, exon_columns, cds_columns))
243 | TxDb_SELECT_from_splicing_bundle(txdb, columns,
244 | filter=filter, orderby=orderby,
245 | cds_join_type=cds_join_type)
246 | }
247 |
248 | TxDb_SELECT_from_gene <- function(txdb, filter=list(),
249 | orderby=c("_tx_id", "gene_id"))
250 | {
251 | schema_version <- TxDb_schema_version(txdb)
252 | columns <- TXDB_table_columns("gene", schema_version=schema_version)
253 | TxDb_SELECT_from_INNER_JOIN(txdb, "gene", columns,
254 | filter=filter, orderby=orderby)
255 | }
256 |
257 |
--------------------------------------------------------------------------------
/R/TxDb-schema.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### TxDb schema
3 | ### -------------------------------------------------------------------------
4 | ###
5 | ### Nothing in this file is exported.
6 | ###
7 | ### 7 tables:
8 | ### - chrominfo
9 | ### - transcript
10 | ### - exon
11 | ### - cds
12 | ### - splicing
13 | ### - gene
14 | ### - metadata (not described here)
15 |
16 |
17 | ### Not exported.
18 | DB_TYPE_NAME <- "Db type"
19 | DB_TYPE_VALUE <- "TxDb" # same as the name of the class below
20 | DB_SCHEMA_VERSION <- "1.2" # DON'T FORGET TO BUMP THIS WHEN YOU CHANGE THE
21 | # SCHEMA
22 |
23 | ### Return the *effective* schema version.
24 | TxDb_schema_version <- function(txdb)
25 | {
26 | conn <- if (is(txdb, "TxDb")) dbconn(txdb) else txdb
27 | version <- AnnotationDbi:::.getMetaValue(conn, "DBSCHEMAVERSION")
28 | numeric_version(version)
29 | }
30 |
31 |
32 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
33 | ### Table columns
34 | ###
35 |
36 | ### 'chrominfo' table
37 |
38 | TXDB_CHROMINFO_COLDEFS <- c(
39 | `_chrom_id`="INTEGER PRIMARY KEY",
40 | chrom="TEXT UNIQUE NOT NULL",
41 | length="INTEGER NULL",
42 | is_circular="INTEGER NULL"
43 | )
44 |
45 | TXDB_CHROMINFO_COLUMNS <- names(TXDB_CHROMINFO_COLDEFS)
46 |
47 | ### 'transcript', 'exon', and 'cds' tables (a.k.a. "feature tables")
48 |
49 | TXDB_FEATURE_COLDEFS <- c(
50 | id="INTEGER PRIMARY KEY",
51 | name="TEXT NULL",
52 | type="TEXT NULL",
53 | chrom="TEXT NOT NULL",
54 | strand="TEXT NOT NULL",
55 | start="INTEGER NOT NULL",
56 | end="INTEGER NOT NULL"
57 | )
58 |
59 | ### Tables "transcript", "exon", and "cds" must at least have columns with
60 | ### the core column tags.
61 | TXDB_CORE_COLTAGS <- c("id", "chrom", "strand", "start", "end")
62 | TXDB_ALL_COLTAGS <- names(TXDB_FEATURE_COLDEFS)
63 | TXDB_EXON_OR_CDS_COLTAGS <- TXDB_ALL_COLTAGS[TXDB_ALL_COLTAGS != "type"]
64 |
65 | .make_feature_columns <- function(prefix, tags)
66 | {
67 | fmt <- paste0("%s_", tags)
68 | id_pos <- match("id", tags)
69 | stopifnot(identical(id_pos, 1L))
70 | fmt[[id_pos]] <- paste0("_", fmt[[id_pos]])
71 | setNames(sprintf(fmt, prefix), tags)
72 | }
73 |
74 | TXDB_TRANSCRIPT_COLUMNS <- .make_feature_columns("tx", TXDB_ALL_COLTAGS)
75 | TXDB_EXON_COLUMNS <- .make_feature_columns("exon", TXDB_EXON_OR_CDS_COLTAGS)
76 | TXDB_CDS_COLUMNS <- .make_feature_columns("cds", TXDB_EXON_OR_CDS_COLTAGS)
77 |
78 | ### 'splicing' table
79 |
80 | TXDB_SPLICING_COLDEFS <- c(
81 | `_tx_id`="INTEGER NOT NULL",
82 | exon_rank="INTEGER NOT NULL",
83 | `_exon_id`="INTEGER NOT NULL",
84 | `_cds_id`="INTEGER NULL",
85 | cds_phase="INTEGER NULL"
86 | )
87 |
88 | TXDB_SPLICING_COLUMNS <- names(TXDB_SPLICING_COLDEFS)
89 |
90 | ### 'gene' table
91 |
92 | TXDB_GENE_COLDEFS <- c(
93 | gene_id="TEXT NOT NULL",
94 | `_tx_id`="INTEGER NOT NULL"
95 | )
96 |
97 | TXDB_GENE_COLUMNS <- names(TXDB_GENE_COLDEFS)
98 |
99 |
100 | ### Order of tables matters! "transcript" must be before "splicing" and "gene",
101 | ### and "exon" and "cds" must be before "splicing". See TXDB_column2table()
102 | ### below why.
103 | TXDB_COLUMNS <- list(
104 | chrominfo=TXDB_CHROMINFO_COLUMNS,
105 | transcript=TXDB_TRANSCRIPT_COLUMNS,
106 | exon=TXDB_EXON_COLUMNS,
107 | cds=TXDB_CDS_COLUMNS,
108 | splicing=TXDB_SPLICING_COLUMNS,
109 | gene=TXDB_GENE_COLUMNS
110 | )
111 |
112 |
113 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
114 | ### Build CREATE TABLE statements
115 | ###
116 |
117 | .build_SQL_CREATE_TABLE <- function(table, coldefs, constraints=NULL)
118 | {
119 | SQL <- "CREATE TABLE %s (%s\n)"
120 | coldefs <- c(paste(names(coldefs), coldefs), constraints)
121 | coldefs <- paste("\n ", coldefs, collapse=",")
122 | sprintf(SQL, table, coldefs)
123 | }
124 |
125 | build_SQL_CREATE_chrominfo_table <- function()
126 | {
127 | .build_SQL_CREATE_TABLE("chrominfo", TXDB_CHROMINFO_COLDEFS)
128 | }
129 |
130 | build_SQL_CREATE_feature_table <- function(table)
131 | {
132 | columns <- TXDB_COLUMNS[[table]]
133 | coldefs <- setNames(TXDB_FEATURE_COLDEFS[names(columns)], columns)
134 | foreign_key <- sprintf("FOREIGN KEY (%s) REFERENCES chrominfo (chrom)",
135 | columns[["chrom"]])
136 | .build_SQL_CREATE_TABLE(table, coldefs, foreign_key)
137 | }
138 |
139 | build_SQL_CREATE_splicing_table <- function()
140 | {
141 | unique_key <- "UNIQUE (_tx_id, exon_rank)"
142 | foreign_keys <- sprintf("FOREIGN KEY (_%s_id) REFERENCES %s",
143 | c("tx", "exon", "cds"),
144 | c("transcript", "exon", "cds"))
145 | constraints <- c(unique_key, foreign_keys)
146 | .build_SQL_CREATE_TABLE("splicing", TXDB_SPLICING_COLDEFS, constraints)
147 | }
148 |
149 | build_SQL_CREATE_gene_table <- function()
150 | {
151 | unique_key <- "UNIQUE (gene_id, _tx_id)"
152 | foreign_key <- "FOREIGN KEY (_tx_id) REFERENCES transcript"
153 | constraints <- c(unique_key, foreign_key)
154 | .build_SQL_CREATE_TABLE("gene", TXDB_GENE_COLDEFS, constraints)
155 | }
156 |
157 |
158 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
159 | ### Relationship between the 'splicing' table and the "feature tables"
160 | ###
161 | ### The 'splicing' table is the glue between the "feature tables".
162 | ###
163 |
164 | ### The "splicing right tables" can be bundled to the "splicing" table with
165 | ### a LEFT JOIN using the TXDB_SPLICING_JOIN_USING columns.
166 | TXDB_SPLICING_RTABLES <- c("transcript", "exon", "cds")
167 | TXDB_SPLICING_JOIN_USING <- setNames(c("_tx_id", "_exon_id", "_cds_id"),
168 | TXDB_SPLICING_RTABLES)
169 | TXDB_SPLICING_BUNDLE <- c("splicing", TXDB_SPLICING_RTABLES)
170 |
171 |
172 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
173 | ### Helper functions
174 | ###
175 |
176 | TXDB_tables <- function() names(TXDB_COLUMNS)
177 |
178 | TXDB_table_columns <- function(table, schema_version=NA)
179 | {
180 | columns <- TXDB_COLUMNS[[table]]
181 | if (is.na(schema_version))
182 | return(columns)
183 | if (table == "transcript" && schema_version < numeric_version("1.1"))
184 | columns <- columns[columns != "tx_type"]
185 | if (table == "splicing" && schema_version < numeric_version("1.2"))
186 | columns <- columns[columns != "cds_phase"]
187 | columns
188 | }
189 |
190 | ### When the same column belongs to more than one table (e.g. "_tx_id",
191 | ### "_exon_id", or "_cds_id"), then the table for which the column is a
192 | ### primary key is chosen by default. This behavior can be changed by passing
193 | ### the name of a table to 'from_table' in which case the priority is given to
194 | ### that table.
195 | TXDB_column2table <- function(columns, from_table=NA, schema_version=NA)
196 | {
197 | if (length(columns) == 0L)
198 | return(character(0))
199 | tables <- sapply(columns,
200 | function(column) {
201 | for (table in TXDB_tables()) {
202 | table_columns <- TXDB_table_columns(table,
203 | schema_version=schema_version)
204 | if (column %in% table_columns)
205 | return(table)
206 | }
207 | if (is.na(schema_version)) {
208 | in_schema <- ""
209 | } else {
210 | in_schema <- c(" in db schema ", as.character(schema_version))
211 | }
212 | stop(column, ": no such column", in_schema)
213 | }
214 | )
215 | if (!is.na(from_table)) {
216 | table_columns <- TXDB_table_columns(from_table,
217 | schema_version=schema_version)
218 | tables[columns %in% table_columns] <- from_table
219 | }
220 | tables
221 | }
222 |
223 |
--------------------------------------------------------------------------------
/R/exonicParts.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### Extraction of exonic and intronic parts
3 | ### -------------------------------------------------------------------------
4 | ###
5 | ### For all functions in this file, 'txdb' must be a TxDb object or any
6 | ### object that supports transcripts() and exonsBy() (e.g. EnsDb object).
7 | ###
8 |
9 |
10 | ### Works on whatever 'x' can be used as a splitting factor in splitAsList().
11 | ### TODO: Rename and move to a more appropriate place (IRanges?)
12 | .rank_in_group <- function(x)
13 | {
14 | groups <- splitAsList(seq_along(x), x)
15 | i <- unlist(groups, use.names=FALSE)
16 | ans <- sequence(lengths(groups))
17 | ans[i] <- ans
18 | ans
19 | }
20 |
21 |
22 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
23 | ### 3 helper functions used internally by exonicParts() and intronicParts()
24 | ###
25 |
26 | ### Return a GRanges object with 1 range per transcript and metadata columns
27 | ### tx_id, tx_name, and gene_id.
28 | ### If 'drop.geneless' is FALSE (the default) then the transcripts are
29 | ### returned in the same order as with transcripts(), which is expected
30 | ### to be by transcript id (tx_id). Otherwise they are ordered first by
31 | ### gene id (gene_id), then by transcript id.
32 | tidyTranscripts <- function(txdb, drop.geneless=FALSE)
33 | {
34 | tx <- transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id"))
35 | mcols(tx)$gene_id <- as.character(mcols(tx)$gene_id)
36 | if (drop.geneless) {
37 | gene_id <- mcols(tx)$gene_id
38 | tx_id <- mcols(tx)$tx_id
39 | tx <- tx[order(gene_id, tx_id, na.last=NA)]
40 | }
41 | tx
42 | }
43 |
44 | ### Return a GRangesList object parallel to 'tx_ids'. The supplied 'tx_ids'
45 | ### must be a subset of 'mcols(transcripts(txdb))$tx_id'.
46 | .exons_by_txids <- function(txdb, tx_ids)
47 | {
48 | if (anyDuplicated(tx_ids))
49 | stop(wmsg("\"transcripts\" method for ", class(txdb), " objects ",
50 | "seems broken, sorry"))
51 | ans <- exonsBy(txdb, by="tx")
52 | tx_ids <- as.character(tx_ids)
53 | ans_names <- names(ans)
54 | if (!identical(tx_ids, ans_names)) {
55 | m <- match(tx_ids, ans_names)
56 | if (anyNA(m))
57 | stop(wmsg("\"exonsBy\" method for ", class(txdb), " objects ",
58 | "seems broken, sorry"))
59 | ans <- ans[m]
60 | }
61 | ans
62 | }
63 |
64 | ### Return a GRanges object with 1 range per exon and metadata columns
65 | ### tx_id, tx_name, gene_id, exon_id, exon_name, and exon_rank.
66 | ### If 'drop.geneless' is FALSE (the default) then the exons are ordered first
67 | ### by transcript id (tx_id), then by exon rank (exon_rank). Otherwise they
68 | ### are ordered first by gene id (gene_id), then by transcript id, and then
69 | ### by exon rank.
70 | tidyExons <- function(txdb, drop.geneless=FALSE)
71 | {
72 | tx <- tidyTranscripts(txdb, drop.geneless=drop.geneless)
73 | ex_by_tx <- .exons_by_txids(txdb, mcols(tx)$tx_id)
74 |
75 | ans <- unlist(ex_by_tx, use.names=FALSE)
76 | idx <- rep(seq_along(tx), lengths(ex_by_tx))
77 | mcols(ans) <- cbind(mcols(tx)[idx, , drop=FALSE], mcols(ans))
78 | ans
79 | }
80 |
81 | ### Return a GRanges object with 1 range per intron and metadata columns
82 | ### tx_id, tx_name, and gene_id.
83 | ### If 'drop.geneless' is FALSE (the default) then the introns are ordered
84 | ### by transcript id (tx_id). Otherwise they are ordered first by gene id
85 | ### (gene_id), then by transcript id.
86 | tidyIntrons <- function(txdb, drop.geneless=FALSE)
87 | {
88 | tx <- tidyTranscripts(txdb, drop.geneless=drop.geneless)
89 | ex_by_tx <- .exons_by_txids(txdb, mcols(tx)$tx_id)
90 |
91 | introns_by_tx <- psetdiff(tx, ex_by_tx)
92 |
93 | ans <- unlist(introns_by_tx, use.names=FALSE)
94 | idx <- rep(seq_along(tx), lengths(introns_by_tx))
95 | mcols(ans) <- mcols(tx)[idx, , drop=FALSE]
96 | ans
97 | }
98 |
99 | .break_in_parts <- function(x, linked.to.single.gene.only=FALSE,
100 | extra_mcol="exonic_part")
101 | {
102 | ans <- disjoin(x, with.revmap=TRUE)
103 | revmap <- mcols(ans)$revmap
104 | ans_mcols <- lapply(mcols(x),
105 | function(col) {
106 | col <- unique(extractList(col, revmap))
107 | col[!is.na(col)]
108 | }
109 | )
110 | mcols(ans) <- DataFrame(ans_mcols)
111 | if (linked.to.single.gene.only) {
112 | keep_idx <- which(elementNROWS(mcols(ans)$gene_id) == 1L)
113 | ans <- ans[keep_idx]
114 | gene_id <- as.character(mcols(ans)$gene_id)
115 | mcols(ans)$gene_id <- gene_id
116 | ## Add "exonic_part" or "intronic_part" metadata column for
117 | ## compatibility with old disjointExons().
118 | mcols(ans)[[extra_mcol]] <- .rank_in_group(gene_id)
119 | }
120 | ans
121 | }
122 |
123 |
124 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
125 | ### exonicParts() and intronicParts()
126 | ###
127 | ### exonicParts() is a replacement for the old disjointExons() function, with
128 | ### the following differences/improvements:
129 | ###
130 | ### 1. Argument 'linked.to.single.gene.only' in exonicParts() replaces
131 | ### argument 'aggregateGenes' in disjointExons(), but has the opposite
132 | ### meaning, that is:
133 | ### exonicParts(txdb, linked.to.single.gene.only=TRUE)
134 | ### returns the same exonic parts as:
135 | ### disjointExons(txdb, aggregateGenes=FALSE)
136 | ###
137 | ### 2. Unlike disjointExons(txdb, aggregateGenes=TRUE),
138 | ### exonicParts(txdb, linked.to.single.gene.only=FALSE)
139 | ### does NOT discard exon parts that are not linked to a gene.
140 | ###
141 | ### 3. exonicParts() is almost 2x more efficient than disjointExons().
142 | ###
143 | ### 4. exonicParts() works out-of-the-box on any TxDb-like object that
144 | ### supports the transcripts() and exonsBy() extractors, e.g. on an
145 | ### EnsDb object.
146 | ###
147 | ### Note that disjointExons() was deprecated in BioC 3.13, then defunct in
148 | ### BioC 3.15, and finally removed from BioC 3.17.
149 |
150 | ### Return a disjoint and strictly sorted GRanges object with 1 range per
151 | ### exonic part and with metadata columns tx_id, tx_name, gene_id, exon_id,
152 | ### exon_name, and exon_rank.
153 | exonicParts <- function(txdb, linked.to.single.gene.only=FALSE)
154 | {
155 | if (!isTRUEorFALSE(linked.to.single.gene.only))
156 | stop("'linked.to.single.gene.only' must be TRUE or FALSE")
157 | ex <- tidyExons(txdb, drop.geneless=linked.to.single.gene.only)
158 | .break_in_parts(ex, linked.to.single.gene.only,
159 | extra_mcol="exonic_part")
160 | }
161 |
162 | ### Return a disjoint and strictly sorted GRanges object with 1 range per
163 | ### intronic part and with metadata columns tx_id, tx_name, and gene_id.
164 | intronicParts <- function(txdb, linked.to.single.gene.only=FALSE)
165 | {
166 | if (!isTRUEorFALSE(linked.to.single.gene.only))
167 | stop("'linked.to.single.gene.only' must be TRUE or FALSE")
168 | introns <- tidyIntrons(txdb, drop.geneless=linked.to.single.gene.only)
169 | .break_in_parts(introns, linked.to.single.gene.only,
170 | extra_mcol="intronic_part")
171 | }
172 |
173 |
--------------------------------------------------------------------------------
/R/extendExonsIntoIntrons.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### Extend exons by a given number of bases into their adjacent introns
3 | ### -------------------------------------------------------------------------
4 | ###
5 |
6 | extendExonsIntoIntrons <- function(ex_by_tx, extent=2)
7 | {
8 | if (!is(ex_by_tx, "GRangesList"))
9 | stop(wmsg("'ex_by_tx' must be a GRangesList object"))
10 | if (!isSingleNumber(extent))
11 | stop(wmsg("'extent' must be a single number"))
12 | if (!is.integer(extent))
13 | extent <- as.integer(extent)
14 |
15 | resize_idx <- which(lengths(ex_by_tx) >= 2L)
16 | ex_to_resize <- ex_by_tx[resize_idx]
17 |
18 | ## Resize first exons.
19 | first_ex <- heads(ex_to_resize, n=1L)
20 | unlisted <- unlist(first_ex, use.names=FALSE)
21 | unlisted <- resize(unlisted, width(unlisted) + extent,
22 | fix="start", use.names=FALSE)
23 | first_ex <- relist(unlisted, first_ex)
24 |
25 | ## Resize last exons.
26 | last_ex <- tails(ex_to_resize, n=1L)
27 | unlisted <- unlist(last_ex, use.names=FALSE)
28 | unlisted <- resize(unlisted, width(unlisted) + extent,
29 | fix="end", use.names=FALSE)
30 | last_ex <- relist(unlisted, last_ex)
31 |
32 | ## Resize intermediate exons.
33 | mid_ex <- tails(heads(ex_to_resize, n=-1L), n=-1L)
34 | unlisted <- unlist(mid_ex, use.names=FALSE)
35 | unlisted <- resize(unlisted, width=width(unlisted) + 2L*extent,
36 | fix="center", use.names=FALSE)
37 | mid_ex <- relist(unlisted, mid_ex)
38 |
39 | ## Put exons back together.
40 | ex_by_tx[resize_idx] <- pc(first_ex, mid_ex, last_ex)
41 | ex_by_tx
42 | }
43 |
44 |
--------------------------------------------------------------------------------
/R/extractTranscriptSeqs.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### extractTranscriptSeqs() and related tools
3 | ### -------------------------------------------------------------------------
4 |
5 |
6 | .unlist_strand <- function(strand, transcripts)
7 | {
8 | if (is.list(strand) || is(strand, "List")) {
9 | ## 'strand' is a list-like object.
10 | if (!identical(unname(elementNROWS(strand)),
11 | unname(elementNROWS(transcripts))))
12 | stop(wmsg("when 'strand' is a list-like object, it must have ",
13 | "the same \"shape\" as 'transcripts' (i.e. same length ",
14 | "plus 'strand[[i]]' must have the same length as ",
15 | "'transcripts[[i]]' for all 'i')"))
16 | return(strand(unlist(strand, use.names=FALSE)))
17 | }
18 | if (!(is.vector(strand) || is.factor(strand) || is(strand, "Rle")))
19 | stop(wmsg("'strand' must be an atomic vector, a factor, ",
20 | "an Rle object, or a list-like object"))
21 | strand <- strand(strand)
22 | strand <- S4Vectors:::V_recycle(strand, transcripts,
23 | "strand", "transcripts")
24 | rep.int(strand, elementNROWS(transcripts))
25 | }
26 |
27 | setGeneric("extractTranscriptSeqs", signature="x",
28 | function(x, transcripts, ...) standardGeneric("extractTranscriptSeqs")
29 | )
30 |
31 | setMethod("extractTranscriptSeqs", "DNAString",
32 | function(x, transcripts, strand="+")
33 | {
34 | if (!is(transcripts, "IntegerRangesList"))
35 | stop(wmsg("when 'x' is a DNAString object, ",
36 | "'transcripts' must be an IntegerRangesList object"))
37 | unlisted_strand <- .unlist_strand(strand, transcripts)
38 | if (!all(unlisted_strand %in% c("+", "-")))
39 | stop(wmsg("'strand' can only contain \"+\" and/or \"-\" values. ",
40 | "\"*\" is not allowed."))
41 | idx <- which(unlisted_strand == "-")
42 | exons <- extractList(x, unlist(transcripts, use.names=FALSE))
43 | exons[idx] <- reverseComplement(exons[idx])
44 | unstrsplit(relist(exons, transcripts))
45 | }
46 | )
47 |
48 | ### Check for transcripts that have exons located on more than one
49 | ### chromosome.
50 | .check_exon_chrom <- function(tx1)
51 | {
52 | run_lens <- runLength(seqnames(tx1))
53 | idx <- which(elementNROWS(run_lens) != 1L)
54 | if (length(idx) == 0L)
55 | return()
56 | tx1_names <- names(tx1)
57 | if (is.null(tx1_names)) {
58 | some_in1string <- ""
59 | } else {
60 | some_idx <- head(idx, n=2L)
61 | some_names <- tx1_names[some_idx]
62 | some_in1string <- paste0(some_names, collapse=", ")
63 | if (length(idx) > length(some_idx))
64 | some_in1string <- paste0("e.g. ", some_in1string, ", etc...")
65 | some_in1string <- paste0(" (", some_in1string, ")")
66 | }
67 | stop(wmsg("Some transcripts", some_in1string, " have exons located on ",
68 | "more than one chromosome. This is not supported yet."))
69 | }
70 |
71 | ### Check the "exon_rank" inner metadata column if present. When 'transcripts'
72 | ### contains CDS parts (instead of exons) grouped by transcript, some of the
73 | ### lowest or/and highest exon ranks can be missing.
74 | .check_exon_rank <- function(tx1)
75 | {
76 | exon_rank <- mcols(tx1@unlistData)$exon_rank
77 | if (is.null(exon_rank))
78 | return()
79 | if (!is.numeric(exon_rank))
80 | stop(wmsg("\"exon_rank\" inner metadata column in GRangesList ",
81 | "object 'transcripts' is not numeric"))
82 | if (!is.integer(exon_rank)) {
83 | warning(wmsg("\"exon_rank\" inner metadata column in GRangesList ",
84 | "object 'transcripts' is not integer"))
85 | exon_rank <- as.integer(exon_rank)
86 | }
87 | if (any(is.na(exon_rank)))
88 | stop(wmsg("\"exon_rank\" inner metadata column in GRangesList ",
89 | "object 'transcripts' contains NAs"))
90 |
91 | partitioning <- PartitioningByEnd(tx1)
92 | ## The 2 lines below are equivalent to:
93 | ## tmp <- relist(exon_rank, partitioning)
94 | ## min_rank <- min(tmp)
95 | ## but much faster!
96 | v <- Views(exon_rank, partitioning)
97 | min_rank <- viewMins(v)
98 | if (any(min_rank < 1L))
99 | stop(wmsg("\"exon_rank\" inner metadata column in GRangesList ",
100 | "object 'transcripts' contains ranks < 1"))
101 | tx1_eltNROWS <- elementNROWS(partitioning)
102 | target <- sequence(tx1_eltNROWS, from=min_rank)
103 | if (!identical(target, unname(exon_rank)))
104 | stop(wmsg("\"exon_rank\" inner metadata column in GRangesList ",
105 | "object 'transcripts' does not contain increasing ",
106 | "consecutive ranks for some transcripts"))
107 | }
108 |
109 | ### TODO: Incorporate this fast path to "unlist" method for XStringSet objects.
110 | .fast_XStringSet_unlist <- function(x)
111 | {
112 | # Disabling the fast path for now. Until I understand why using it
113 | # causes extractTranscriptSeqs(Hsapiens, TxDb.Hsapiens.UCSC.hg18.knownGene)
114 | # to use more memory (319.7 Mb) than when NOT using it (288.9 Mb).
115 | if (FALSE) {
116 | x_len <- length(x)
117 | if (x_len != 0L && length(x@pool) == 1L) {
118 | x_ranges <- x@ranges
119 | x_start <- start(x_ranges)
120 | x_end <- end(x_ranges)
121 | if (identical(x_end[-x_len] + 1L, x_start[-1L])) {
122 | ## The ranges are adjacent. We can unlist() without copying
123 | ## the sequence data!
124 | cat("using fast path (", x_len, ") ...\n")
125 | ans_class <- elementType(x)
126 | ans_shared <- x@pool[[1L]]
127 | ans_offset <- x_start[1L] - 1L
128 | ans_length <- x_end[x_len] - ans_offset
129 | ans <- new2(ans_class, shared=ans_shared,
130 | offset=ans_offset,
131 | length=ans_length,
132 | check = FALSE)
133 | return(ans)
134 | }
135 | }
136 | }
137 | unlist(x, use.names=FALSE)
138 | }
139 |
140 | .extract_and_combine <- function(x, seqname, ranges)
141 | {
142 | seqs <- getSeq(x, GRanges(seqname, ranges))
143 | ## For "getSeq" methods (like the method for GmapGenome objects) that
144 | ## return a character vector.
145 | if (is.character(seqs))
146 | seqs <- DNAStringSet(seqs)
147 | .fast_XStringSet_unlist(seqs)
148 | }
149 |
150 | .extractTranscriptSeqsFromOneSeq <- function(seqlevel, x, transcripts)
151 | {
152 | seqlevels(transcripts, pruning.mode="coarse") <- seqlevel
153 | strand <- strand(transcripts)
154 | transcripts <- ranges(transcripts)
155 | if (seqlevel %in% seqlevels(x)) {
156 | ## We try to load the less stuff possible i.e. only the nucleotides
157 | ## that participate in at least one exon.
158 | exons <- unlist(transcripts, use.names=FALSE)
159 | ranges_to_load <- reduce(exons, with.inframe.attrib=TRUE)
160 | x <- .extract_and_combine(x, seqlevel, ranges_to_load)
161 | exons <- attr(ranges_to_load, "inframe")
162 | transcripts <- relist(exons, transcripts)
163 | } else {
164 | ## Why do we need this?
165 | regex <- paste0("^", seqlevel, "$")
166 | x <- getSeq(x, regex)
167 | }
168 | extractTranscriptSeqs(x, transcripts, strand=strand)
169 | }
170 |
171 | .extractTranscriptSeqs_default <- function(x, transcripts, ...)
172 | {
173 | if (is(transcripts, "GRangesList")) {
174 | if (length(list(...)) != 0L)
175 | stop(wmsg("additional arguments are allowed only when ",
176 | "'transcripts' is not a GRangesList object"))
177 | } else {
178 | transcripts <- try(exonsBy(transcripts, by="tx", ...),
179 | silent=TRUE)
180 | if (is(transcripts, "try-error"))
181 | stop(wmsg("failed to extract the exon ranges from 'transcripts' ",
182 | "with exonsBy(transcripts, by=\"tx\", ...)"))
183 | }
184 | idx1 <- which(elementNROWS(transcripts) != 0L)
185 | tx1 <- transcripts[idx1]
186 | .check_exon_chrom(tx1)
187 | .check_exon_rank(tx1)
188 |
189 | tx1_seqlevels_in_use <- seqlevelsInUse(tx1)
190 | x_seqlevels <- seqlevels(x)
191 | ok <- tx1_seqlevels_in_use %in% x_seqlevels
192 | if (!all(ok)) {
193 | if (all(!ok))
194 | stop(wmsg("the transcripts in 'transcripts' are on chromosomes ",
195 | "that are not in 'x'"))
196 | seqlevel_not_in_x <- tx1_seqlevels_in_use[!ok][[1L]]
197 | stop(wmsg("some transcripts in 'transcripts' are on chromosomes ",
198 | "that are not in 'x' (e.g. some transcripts are on ",
199 | "chromosome \"", seqlevel_not_in_x, "\" but this ",
200 | "chromosome is not in 'x')"))
201 | }
202 | seqlevels(tx1) <- tx1_seqlevels_in_use
203 | ## 'seqnames1' is just an ordinary factor (not Rle) parallel to 'tx1'.
204 | seqnames1 <- unlist(runValue(seqnames(tx1)), use.names=FALSE)
205 | dnaset_list <- lapply(levels(seqnames1),
206 | .extractTranscriptSeqsFromOneSeq, x, tx1)
207 | ans <- rep.int(DNAStringSet(""), length(transcripts))
208 | names(ans) <- names(transcripts)
209 | ans[idx1] <- unsplit_list_of_XVectorList("DNAStringSet",
210 | dnaset_list,
211 | seqnames1)
212 | ans
213 | }
214 | setMethod("extractTranscriptSeqs", "ANY", .extractTranscriptSeqs_default)
215 |
216 |
--------------------------------------------------------------------------------
/R/extractUpstreamSeqs.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### extractUpstreamSeqs()
3 | ### -------------------------------------------------------------------------
4 |
5 |
6 | ### Dispatch is on the 2nd argument!
7 | setGeneric("extractUpstreamSeqs", signature="genes",
8 | function(x, genes, width=1000, ...) standardGeneric("extractUpstreamSeqs")
9 | )
10 |
11 | ### Will work on any object 'x' for which seqinfo() and getSeq() are defined
12 | ### e.g. BSgenome, FaFile, TwoBitFile, etc...
13 | setMethod("extractUpstreamSeqs", "GenomicRanges",
14 | function(x, genes, width=1000)
15 | {
16 | seqinfo(genes) <- merge(seqinfo(genes), seqinfo(x))
17 | upstream <- trim(suppressWarnings(flank(genes, width=width)))
18 | ans <- getSeq(x, upstream)
19 |
20 | ## Add metada columns to 'ans'.
21 | gene_seqnames <- seqnames(genes)
22 | gene_strand <- strand(genes)
23 | idx1 <- which(gene_strand != "-")
24 | idx2 <- which(gene_strand == "-")
25 | gene_TSS <- integer(length(genes))
26 | gene_TSS[idx1] <- start(genes)[idx1]
27 | gene_TSS[idx2] <- end(genes)[idx2]
28 | ans_mcols <- DataFrame(gene_seqnames=gene_seqnames,
29 | gene_strand=gene_strand,
30 | gene_TSS=gene_TSS)
31 | mcols(ans) <- ans_mcols
32 | ans
33 | }
34 | )
35 |
36 | setMethod("extractUpstreamSeqs", "TxDb",
37 | function(x, genes, width=1000, exclude.seqlevels=NULL)
38 | {
39 | genes <- sort(genes(genes))
40 | ## 'genes' is now a GRanges object.
41 | if (!is.null(exclude.seqlevels)) {
42 | if (!is.character(exclude.seqlevels))
43 | stop("'exclude.seqlevels' must be NULL or a character vector")
44 | idx <- match(exclude.seqlevels, seqlevels(genes))
45 | if (any(is.na(idx)))
46 | stop("'exclude.seqlevels' contains invalid seqlevels")
47 | seqlevels(genes, pruning.mode="coarse") <- seqlevels(genes)[-idx]
48 | }
49 | callGeneric(x, genes, width=width)
50 | }
51 | )
52 |
53 | ### 'genes' is assumed to contain transcripts grouped by gene e.g. as returned
54 | ### by transcriptsBy(..., by="gene").
55 | setMethod("extractUpstreamSeqs", "GRangesList",
56 | function(x, genes, width=1000)
57 | {
58 | stop("NOT READY YET, SORRY!")
59 | }
60 | )
61 |
62 |
--------------------------------------------------------------------------------
/R/features.R:
--------------------------------------------------------------------------------
1 | ## Very simple extractor to just return what is in our Feature.Db objects
2 |
3 | ## business end of things
4 |
5 | .extractDataCols <- function(conn, tableName){
6 | SQL <- paste0("SELECT * FROM ", tableName, ";")
7 | dbEasyQuery(conn, SQL)
8 | }
9 |
10 |
11 | .extractFeaturesAsGRanges <- function(db)
12 | {
13 | ## 1st figure out what table is not the metadata table.
14 | conn <- dbconn(db) ## featuredbconn(db)
15 | tableNames <- dbListTables(conn)
16 | tableName <- tableNames[!tableNames %in% "metadata"]
17 |
18 | ## Then learn what the columns are in that table and assign to otherCols
19 | colNames <- dbListFields(conn, tableName)
20 | reserved <- c("name", "chrom", "strand", "chromStart", "chromEnd")
21 | colNames <- colNames[!colNames %in% reserved]
22 |
23 | ## Extract the data
24 | df <- .extractDataCols(conn, tableName)
25 |
26 | ## Make & return the Object
27 | md <- metadata(db)
28 | genome <- md[md$name == "Genome", 'value']
29 | if (is.null(genome))
30 | genome <- NA_character_
31 | ans <-
32 | GRanges(seqnames = df$chrom,
33 | ranges = IRanges(df$chromStart, df$chromEnd, names=df$name),
34 | strand = sub('\\.', '*', df$strand),
35 | df[colNames],
36 | seqinfo = Seqinfo(unique(df$chrom), genome=genome))
37 | metadata(ans)[[1]] <- DataFrame(metadata(db))
38 | ans
39 | }
40 |
41 | setGeneric("features", signature="x",
42 | function(x) standardGeneric("features"))
43 |
44 | setMethod("features", "FeatureDb",
45 | function(x) .extractFeaturesAsGRanges(x))
46 |
47 | ## test code:
48 | ## library(GenomicFeatures)
49 | ## fdb <- loadDb("FeatureDb.sqlite")
50 | ## features(fdb)
51 |
--------------------------------------------------------------------------------
/R/getPromoterSeq-methods.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### getPromoterSeq() and getTerminatorSeq()
3 | ### -------------------------------------------------------------------------
4 |
5 | ### Original author: Paul Shannon
6 |
7 | ### NOTE (H. Pagès, Jan 22, 2024): Interface is inconsistent with
8 | ### extractTranscriptSeqs() or extractUpstreamSeqs().
9 | ### TODO: Implement extractPromoterSeqs() and extractTerminatorSeqs() and
10 | ### model them after extractTranscriptSeqs() or extractUpstreamSeqs().
11 | ### Then deprecate getPromoterSeq() and getTerminatorSeq() in favor of
12 | ### extractPromoterSeqs() and extractTerminatorSeqs().
13 |
14 | setGeneric("getPromoterSeq", signature="query",
15 | function(query, subject, upstream=2000, downstream=200)
16 | standardGeneric("getPromoterSeq"))
17 |
18 | setGeneric("getTerminatorSeq", signature="query",
19 | function(query, subject, upstream=2000, downstream=200)
20 | standardGeneric("getTerminatorSeq"))
21 |
22 | .GRanges_getPromoterSeq <- function(query, subject, FUN, upstream, downstream)
23 | {
24 | stopifnot(is(query, "GRanges"))
25 | promoter.granges <- FUN(query, upstream, downstream)
26 | result <- getSeq(subject, promoter.granges)
27 | md <- mcols(query)
28 | geneIDs <- names(query) # often NULL
29 | if (is.null(geneIDs))
30 | geneIDs <- rep(NA_character_, length(query))
31 | md$geneID <- geneIDs
32 | mcols(result) <- md
33 | result
34 | }
35 |
36 | setMethod("getPromoterSeq", "GRanges",
37 | function(query, subject, upstream=2000, downstream=200)
38 | .GRanges_getPromoterSeq(query, subject, promoters,
39 | upstream, downstream)
40 | )
41 |
42 | setMethod("getTerminatorSeq", "GRanges",
43 | function(query, subject, upstream=2000, downstream=200)
44 | .GRanges_getPromoterSeq(query, subject, terminators,
45 | upstream, downstream)
46 | )
47 |
48 | .GRangesList_getPromoterSeq <-
49 | function(query, subject, FUN, upstream, downstream)
50 | {
51 | stopifnot(is(query, "GRangesList"))
52 | unlisted_query <- unlist(query, use.names=FALSE) # GRanges object
53 | promoter.granges <- FUN(unlisted_query, upstream, downstream)
54 | result <- getSeq(subject, promoter.granges)
55 | md <- mcols(unlisted_query)
56 | geneIDs <- names(query)
57 | geneID.counts <- elementNROWS(query)
58 | geneIDs <- rep(geneIDs, geneID.counts) # H. Pagès: what if geneIDs is NULL?
59 | md$geneID <- geneIDs
60 | mcols(result) <- md
61 | relist(result, query)
62 | }
63 |
64 | setMethod("getPromoterSeq", "GRangesList",
65 | function(query, subject, upstream=2000, downstream=200)
66 | .GRangesList_getPromoterSeq(query, subject, promoters,
67 | upstream, downstream)
68 | )
69 |
70 | setMethod("getTerminatorSeq", "GRangesList",
71 | function(query, subject, upstream=2000, downstream=200)
72 | .GRangesList_getPromoterSeq(query, subject, terminators,
73 | upstream, downstream)
74 | )
75 |
76 |
--------------------------------------------------------------------------------
/R/id2name.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### Map internal ids to external names for a given feature type.
3 | ### -------------------------------------------------------------------------
4 |
5 |
6 | id2name <- function(txdb, feature.type=c("tx", "exon", "cds"))
7 | {
8 | if (!is(txdb, "TxDb"))
9 | stop("'txdb' must be a TxDb object")
10 | feature.type <- match.arg(feature.type)
11 | table <- switch(feature.type, tx="transcript", exon="exon", cds="cds")
12 | columns <- TXDB_table_columns(table)[c("id", "name")]
13 | df <- TxDb_SELECT_from_INNER_JOIN(txdb, table, columns)
14 | ans <- df[[columns[2L]]]
15 | names(ans) <- as.character(df[[columns[1L]]])
16 | ans
17 | }
18 |
19 |
--------------------------------------------------------------------------------
/R/makeFeatureDbFromUCSC.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### makeFeatureDbFromUCSC()
3 | ### -------------------------------------------------------------------------
4 |
5 | ### Everything in this file has moved to txdbmaker!
6 |
7 | supportedUCSCFeatureDbTracks <- function(genome)
8 | {
9 | call_fun_in_txdbmaker("supportedUCSCFeatureDbTracks", genome=genome)
10 | }
11 |
12 | supportedUCSCFeatureDbTables <- function(...)
13 | {
14 | call_fun_in_txdbmaker("supportedUCSCFeatureDbTables", ...)
15 | }
16 |
17 | UCSCFeatureDbTableSchema <- function(...)
18 | {
19 | call_fun_in_txdbmaker("UCSCFeatureDbTableSchema", ...)
20 | }
21 |
22 | makeFeatureDbFromUCSC <- function(...)
23 | {
24 | call_fun_in_txdbmaker("makeFeatureDbFromUCSC", ...)
25 | }
26 |
27 |
--------------------------------------------------------------------------------
/R/makeTxDb.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### Making TxDb objects
3 | ### -------------------------------------------------------------------------
4 |
5 | ### Everything in this file has moved to txdbmaker!
6 |
7 | makeTxDb <- function(...)
8 | {
9 | call_fun_in_txdbmaker("makeTxDb", ...)
10 | }
11 |
12 |
--------------------------------------------------------------------------------
/R/makeTxDbFromBiomart.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### makeTxDbFromBiomart()
3 | ### -------------------------------------------------------------------------
4 |
5 | ### Everything in this file has moved to txdbmaker!
6 |
7 | getChromInfoFromBiomart <- function(...)
8 | {
9 | call_fun_in_txdbmaker("getChromInfoFromBiomart", ...)
10 | }
11 |
12 |
13 | makeTxDbFromBiomart <- function(...)
14 | {
15 | call_fun_in_txdbmaker("makeTxDbFromBiomart", ...)
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/R/makeTxDbFromEnsembl.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### makeTxDbFromEnsembl()
3 | ### -------------------------------------------------------------------------
4 |
5 | ### Everything in this file has moved to txdbmaker!
6 |
7 | makeTxDbFromEnsembl <- function(...)
8 | {
9 | call_fun_in_txdbmaker("makeTxDbFromEnsembl", ...)
10 | }
11 |
12 |
--------------------------------------------------------------------------------
/R/makeTxDbFromGFF.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### makeTxDbFromGFF()
3 | ### -------------------------------------------------------------------------
4 |
5 | ### Everything in this file has moved to txdbmaker!
6 |
7 | makeTxDbFromGFF <- function(...)
8 | {
9 | call_fun_in_txdbmaker("makeTxDbFromGFF", ...)
10 | }
11 |
12 |
--------------------------------------------------------------------------------
/R/makeTxDbFromGRanges.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### makeTxDbFromGRanges()
3 | ### -------------------------------------------------------------------------
4 |
5 | ### Everything in this file has moved to txdbmaker!
6 |
7 | makeTxDbFromGRanges <- function(...)
8 | {
9 | call_fun_in_txdbmaker("makeTxDbFromGRanges", ...)
10 | }
11 |
12 |
--------------------------------------------------------------------------------
/R/makeTxDbFromUCSC.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### makeTxDbFromUCSC()
3 | ### -------------------------------------------------------------------------
4 |
5 | ### Everything in this file has moved to txdbmaker!
6 |
7 | supportedUCSCtables <- function(...)
8 | {
9 | call_fun_in_txdbmaker("supportedUCSCtables", ...)
10 | }
11 |
12 | browseUCSCtrack <- function(...)
13 | {
14 | call_fun_in_txdbmaker("browseUCSCtrack", ...)
15 | }
16 |
17 | makeTxDbFromUCSC <- function(...)
18 | {
19 | call_fun_in_txdbmaker("makeTxDbFromUCSC", ...)
20 | }
21 |
22 |
--------------------------------------------------------------------------------
/R/makeTxDbPackage.R:
--------------------------------------------------------------------------------
1 | ### Everything in this file has moved to txdbmaker!
2 |
3 | supportedMiRBaseBuildValues <- function()
4 | {
5 | call_fun_in_txdbmaker("supportedMiRBaseBuildValues")
6 | }
7 |
8 | makePackageName <- function(...)
9 | {
10 | call_fun_in_txdbmaker("makePackageName", ...)
11 | }
12 |
13 | makeTxDbPackage <- function(...)
14 | {
15 | call_fun_in_txdbmaker("makeTxDbPackage", ...)
16 | }
17 |
18 | makeTxDbPackageFromUCSC <- function(...)
19 | {
20 | call_fun_in_txdbmaker("makeTxDbPackageFromUCSC", ...)
21 | }
22 |
23 | makeFDbPackageFromUCSC <- function(...)
24 | {
25 | call_fun_in_txdbmaker("makeFDbPackageFromUCSC", ...)
26 | }
27 |
28 | makeTxDbPackageFromBiomart <- function(...)
29 | {
30 | call_fun_in_txdbmaker("makeTxDbPackageFromBiomart", ...)
31 | }
32 |
33 |
--------------------------------------------------------------------------------
/R/mapIdsToRanges.R:
--------------------------------------------------------------------------------
1 | setGeneric("mapIdsToRanges", signature="x",
2 | function(x, ...) standardGeneric("mapIdsToRanges")
3 | )
4 |
5 | setMethod("mapIdsToRanges", "TxDb",
6 | function(x,
7 | keys,
8 | type = c("cds", "exon", "tx", "gene"),
9 | columns = NULL)
10 | {
11 | .assert(is.list(keys) && .is.named(keys),
12 | "'keys' must be a named list")
13 |
14 | .assert(is.null(columns) || is.character(columns),
15 | "'columns' must be 'NULL' or a character vector")
16 |
17 | type <- match.arg(type)
18 |
19 | fun <- switch(type,
20 | cds = cds,
21 | exon = exons,
22 | tx = transcripts,
23 | gene = genes)
24 |
25 | res <- fun(x, keys, columns = unique(c(names(keys), columns)))
26 | matches <- match(mcols(res)[[names(keys)]], keys[[1]])
27 | ranges <- rep(res, lengths(matches))
28 |
29 | f <- factor(keys[[1]][unlist(matches, use.names = FALSE)],
30 | levels = unique(keys[[1]]))
31 | splitAsList(ranges, f, drop = FALSE)[keys[[1]]]
32 | })
33 |
34 | setGeneric("mapRangesToIds", signature="x",
35 | function(x, ...) standardGeneric("mapRangesToIds")
36 | )
37 |
38 | setMethod("mapRangesToIds", "TxDb",
39 | function(x,
40 | ranges,
41 | type = c("cds", "exon", "tx", "gene"),
42 | columns = NULL,
43 | ...)
44 | {
45 | type <- match.arg(type)
46 | .assert(is(ranges, "Vector"),
47 | "'ranges' must be a 'Vector'")
48 | .assert(is.null(columns) || is.character(columns),
49 | "'columns' must be 'NULL' or a character vector")
50 |
51 | fun <- switch(type,
52 | cds = cds,
53 | exon = exons,
54 | tx = transcripts,
55 | gene = genes)
56 |
57 | all <-
58 | if (is.null(columns)) {
59 | fun(x)
60 | } else {
61 | fun(x, columns = columns)
62 | }
63 |
64 | hits <- findOverlaps(ranges, all, ...)
65 | lapply(split(all[subjectHits(hits)], names(ranges)[queryHits(hits)]), mcols)
66 | })
67 |
68 | .assert <- function(x, message) {
69 | if(!x) {
70 | stop(message, call. = FALSE)
71 | }
72 | }
73 |
74 | .is.named <- function(x) {
75 | nm <- names(x)
76 | !is.null(nm) && all(!is.na(nm) & nzchar(nm))
77 | }
78 |
--------------------------------------------------------------------------------
/R/nearest-methods.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### nearest (and related) methods
3 | ### -------------------------------------------------------------------------
4 |
5 |
6 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
7 | ### distance
8 | ###
9 |
10 | setMethod("distance", c("GenomicRanges", "TxDb"),
11 | function(x, y, ignore.strand=FALSE, ..., id,
12 | type=c("gene", "tx", "exon", "cds"))
13 | {
14 | if (!identical(length(x), length(id)))
15 | stop("length(id) must equal length(x)")
16 | if (!is.character(id))
17 | stop("'id' must be a character")
18 |
19 | if (type == "gene") {
20 | .extractByGeneID(x, y, ignore.strand, id)
21 | } else {
22 | rng <- switch(type,
23 | tx=transcripts(y, "tx_id", filter=list(tx_id=id)),
24 | exon=exons(y, "exon_id", filter=list(exon_id=id)),
25 | cds=cds(y, "cds_id", filter=list(cds_id=id)))
26 | f <- factor(mcols(rng)[,])
27 | missing <- !id %in% levels(f)
28 | if (any(missing))
29 | warning(paste0("id(s): '", paste(unique(id[missing]),
30 | sep=","), "' were not found in 'y'"))
31 | ## rep out ranges according to 'id'
32 | rng <- rng[match(id[!missing], levels(f))]
33 | ans <- rep(NA_integer_, length(x))
34 | ans[!missing] <- distance(x[!missing], rng,
35 | ignore.strand=ignore.strand)
36 | stopifnot(length(ans) == length(id))
37 | ans
38 | }
39 | }
40 | )
41 |
42 | .extractByGeneID <- function(x, y, ignore.strand, id)
43 | {
44 | tx <- transcriptsBy(y, "gene")
45 | missing <- !id %in% names(tx)
46 | if (any(missing))
47 | warning(paste0("id(s): '", paste(unique(id[missing]), sep=","),
48 | "' were not found in 'y'"))
49 |
50 | group <- range(tx[names(tx) %in% id], ignore.strand=ignore.strand)
51 | multiRange <- lengths(group) > 1L
52 | if (any(multiRange)) {
53 | warning(paste0("id(s): '", paste(unique(names(multiRange)[multiRange]),
54 | sep=','),
55 | "' could not be collapsed to a single gene region"))
56 | group <- group[!multiRange]
57 | }
58 |
59 | valid <- (!id %in% names(multiRange)[multiRange]) & !missing
60 | ## rep out ranges according to 'id'
61 | rng <- unlist(group, use.names=FALSE)
62 | rng <- rng[match(id[valid], names(group))]
63 | ans <- rep(NA_integer_, length(x))
64 | ans[valid] <- distance(x[valid], rng, ignore.strand=ignore.strand)
65 | stopifnot(length(ans) == length(id))
66 | ans
67 | }
68 |
--------------------------------------------------------------------------------
/R/proteinToGenome.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### proteinToGenome()
3 | ### -------------------------------------------------------------------------
4 |
5 |
6 | ### Dispatch is on 2nd argument!
7 | setGeneric("proteinToGenome", signature="db",
8 | function(x, db, ...) standardGeneric("proteinToGenome")
9 | )
10 |
11 |
12 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
13 | ### Make fancy error or warning messages
14 | ###
15 |
16 | .make_bad_names_msg <- function(x_names, bad_idx, what="invalid name",
17 | max.show=5L)
18 | {
19 | nbad <- length(bad_idx)
20 | if (max.show == 0L) {
21 | msg <- c("the names on 'x' contain ", nbad, " ", what)
22 | if (nbad != 1L)
23 | msg <- c(msg, "s")
24 | return(paste(msg, collapse=""))
25 | }
26 | if (nbad == 1L) {
27 | msg <- c("The names on 'x' contain ", what)
28 | } else {
29 | msg <- c("The names on 'x' contain ", nbad, " ", what, "s")
30 | if (nbad > max.show) {
31 | if (max.show == 1L) {
32 | msg <- c(msg, " (showing the first one only)")
33 | } else {
34 | msg <- c(msg, " (showing the first ", max.show, " only)")
35 | }
36 | bad_idx <- head(bad_idx, n=max.show)
37 | }
38 | }
39 | bad_names <- x_names[bad_idx]
40 | paste0(paste(msg, collapse=""), ": ", paste(bad_names, collapse=", "))
41 | }
42 |
43 |
44 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
45 | ### proteinToGenome() method for GRangesList objects
46 | ###
47 |
48 | .make_protein_ranges_from_cumwidths <- function(cumwidths)
49 | {
50 | len <- length(cumwidths)
51 | protein_start <- c(1L, cumwidths[-len] %/% 3L + 1L)
52 | protein_end <- (2L + cumwidths) %/% 3L
53 | IRanges(protein_start, protein_end)
54 | }
55 |
56 | ### Trims first range in 'gr' on its 5' side by 'trim1' nucleotides,
57 | ### and last range in 'gr' on its 3' side by 'trim2' nucleotides.
58 | ### Other than that, everything else is preserved (length, names, metadata
59 | ### columns).
60 | .trim_first_and_last_ranges <- function(gr, trim1=0L, trim2=0L)
61 | {
62 | stopifnot(is(gr, "GRanges"),
63 | isSingleInteger(trim1),
64 | isSingleInteger(trim2))
65 |
66 | gr_ranges <- ranges(gr)
67 | gr_len <- length(gr_ranges)
68 | stopifnot(gr_len >= 1L)
69 |
70 | gr_start <- start(gr_ranges)
71 | gr_end <- end(gr_ranges)
72 | gr_strand <- S4Vectors:::decodeRle(strand(gr))
73 |
74 | ## Trim first range.
75 | strand1 <- gr_strand[[1L]]
76 | if (strand1 == "+") {
77 | ## Trim on the left.
78 | gr_start[[1L]] <- gr_start[[1L]] + trim1
79 | } else {
80 | ## Trim on the right.
81 | gr_end[[1L]] <- gr_end[[1L]] - trim1
82 | }
83 |
84 | ## Trim last range.
85 | strand2 <- gr_strand[[gr_len]]
86 | if (strand2 == "+") {
87 | ## Trim on the right.
88 | gr_end[[gr_len]] <- gr_end[[gr_len]] - trim2
89 | } else {
90 | ## Trim on the left.
91 | gr_start[[gr_len]] <- gr_start[[gr_len]] + trim2
92 | }
93 |
94 | if (any(gr_start > gr_end + 1L))
95 | stop(wmsg("invalid trimming"))
96 |
97 | ranges(gr) <- update_ranges(gr_ranges, start=gr_start, end=gr_end)
98 | gr
99 | }
100 |
101 | ### 'cds' must be a GRanges object representing the CDS parts of a given
102 | ### transcript/protein.
103 | ### 'protein_start' and 'protein_end' must be protein-relative coordinates
104 | ### i.e. coordinates (counted in Amino Acids) relative to the protein
105 | ### associated with 'cds'.
106 | ### Returns a GRanges object.
107 | .map_protein_to_cds <- function(protein_start, protein_end, cds)
108 | {
109 | stopifnot(isSingleNumber(protein_start),
110 | isSingleNumber(protein_end),
111 | protein_start <= protein_end,
112 | is(cds, "GRanges"))
113 | nparts <- length(cds)
114 | cds_widths <- width(cds)
115 | stopifnot(nparts >= 1L, all(cds_widths >= 1L))
116 | protein_start <- as.integer(protein_start)
117 | protein_end <- as.integer(protein_end)
118 | cds_cumwidths <- cumsum(cds_widths)
119 |
120 | ## Add metadata columns 'protein_start' and 'protein_end' to 'cds'.
121 | protein_ranges <- .make_protein_ranges_from_cumwidths(cds_cumwidths)
122 | protein_ranges <- DataFrame(protein_start=start(protein_ranges),
123 | protein_end=end(protein_ranges))
124 | mcols(cds) <- cbind(mcols(cds), protein_ranges)
125 |
126 | ## Translate protein-relative coordinates into 0-based CDS-relative
127 | ## coordinates.
128 | protein_start0 <- 3L * (protein_start - 1L)
129 | protein_end0 <- 3L * protein_end - 1L
130 |
131 | ## Find CDS parts touched by 'protein_start' and 'protein_end'.
132 | idx <- 1L + findInterval(c(protein_start0, protein_end0), cds_cumwidths)
133 | idx1 <- idx[[1L]]
134 | idx2 <- idx[[2L]]
135 | if (idx2 > nparts)
136 | idx2 <- nparts
137 |
138 | ## Extract all CDS parts touched by the [protein_start,protein_end] range.
139 | ans <- cds[idx1:idx2]
140 |
141 | ## Trim first and last ranges in 'ans' (trimming should **always**
142 | ## be valid).
143 | trim1 <- protein_start0 - sum(head(cds_widths, idx1 - 1L))
144 | trim2 <- cds_cumwidths[[idx2]] - protein_end0 - 1L
145 | ans <- .trim_first_and_last_ranges(ans, trim1, trim2)
146 |
147 | ## Adjust metadata columns 'protein_start' and 'protein_end' to account
148 | ## for trimming.
149 | ans_mcols <- mcols(ans)
150 | ans_mcols[1L, "protein_start"] <- protein_start
151 | ans_mcols[nrow(ans_mcols), "protein_end"] <- protein_end
152 | mcols(ans) <- ans_mcols
153 |
154 | ans
155 | }
156 |
157 | ### Returns a named GRangesList object parallel to 'x' (names on 'x' are
158 | ### propagated).
159 | setMethod("proteinToGenome", "GRangesList",
160 | function(x, db)
161 | {
162 | if (!is(x, "IRanges"))
163 | stop(wmsg("'x' must be an IRanges object or derivative"))
164 | x_names <- names(x)
165 | if (is.null(x_names))
166 | stop(wmsg("'x' must have names"))
167 | coding_tx_names <- names(db)
168 | if (is.null(coding_tx_names))
169 | stop(wmsg("'db' must have names when it's a GRangesList object"))
170 | non_coding_idx <- which(!(x_names %in% coding_tx_names))
171 | if (length(non_coding_idx) != 0L) {
172 | msg <- .make_bad_names_msg(x_names, non_coding_idx,
173 | what="non-coding transcript name")
174 | warning(wmsg(msg), immediate.=TRUE)
175 | }
176 | x_start <- start(x)
177 | x_end <- end(x)
178 | ## TODO: Replace this inefficient lapply-based implementation with
179 | ## something better.
180 | ans <- lapply(setNames(seq_along(x), x_names),
181 | function(i) {
182 | if (i %in% non_coding_idx)
183 | return(GRanges())
184 | tx_name <- x_names[[i]]
185 | cds <- db[[tx_name]]
186 | .map_protein_to_cds(x_start[[i]], x_end[[i]], cds)
187 | }
188 | )
189 | GRangesList(ans)
190 | }
191 | )
192 |
193 |
194 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
195 | ### Default proteinToGenome() method
196 | ###
197 |
198 | ### 'db' must be a TxDb object or any object that supports transcripts()
199 | ### (e.g. EnsDb object).
200 | .check_supplied_tx_names <- function(supplied_tx_names, db)
201 | {
202 | if (is.null(supplied_tx_names))
203 | stop(wmsg("'x' must have names and they must be transcript names"))
204 | stopifnot(is.character(supplied_tx_names))
205 | if (any(supplied_tx_names %in% c(NA_character_, "")))
206 | stop(wmsg("the names on 'x' cannot contain NAs or empty strings"))
207 | tx <- transcripts(db, columns="tx_name")
208 | tx_names <- mcols(tx)$tx_name
209 | bad <- !(supplied_tx_names %in% tx_names)
210 | if (all(bad))
211 | stop(wmsg("The names on 'x' must be transcript names present in ",
212 | "the supplied ", class(db), " object. Note that the ",
213 | "transcript names in this object can be obtained/seen ",
214 | "with:"),
215 | "\n tx <- transcripts(db, columns=\"tx_name\")",
216 | "\n mcols(tx)$tx_name")
217 | bad_idx <- which(bad)
218 | if (length(bad_idx) != 0L) {
219 | msg <- .make_bad_names_msg(supplied_tx_names, bad_idx,
220 | what="invalid transcript name")
221 | stop(wmsg(msg))
222 | }
223 | }
224 |
225 | ### 'db' must be a TxDb object or any object that supports cdsBy()
226 | ### (e.g. EnsDb object).
227 | .extract_cds_by_tx <- function(db, tx_names)
228 | {
229 | stopifnot(is.character(tx_names))
230 | if (!is(db, "EnsDb"))
231 | return(cdsBy(db, by="tx", use.names=TRUE))
232 | ## Should never happen in practice because if 'db' is an EnsDb object
233 | ## then the ensembldb package should be loaded already, and ensembldb
234 | ## depends on AnnotationFilter.
235 | if (!requireNamespace("AnnotationFilter", quietly=TRUE))
236 | stop(wmsg("Couldn't load the AnnotationFilter package. ",
237 | "The AnnotationFilter package is needed when ",
238 | "calling proteinToGenome() on an EnsDb object. ",
239 | "Please install it."))
240 | filter <- AnnotationFilter::TxIdFilter(tx_names)
241 | cdsBy(db, by="tx", filter=filter)
242 | }
243 |
244 | ### 'db' must be a TxDb object or any object that supports transcripts()
245 | ### and cdsBy() (e.g. EnsDb object).
246 | ### Returns a named GRangesList object parallel to 'x' (names on 'x' are
247 | ### propagated).
248 | .default_proteinToGenome <- function(x, db)
249 | {
250 | if (!is(x, "IRanges"))
251 | stop(wmsg("'x' must be an IRanges object or derivative"))
252 | x_names <- names(x)
253 | .check_supplied_tx_names(x_names, db)
254 | cds_by_tx <- .extract_cds_by_tx(db, x_names)
255 | proteinToGenome(x, cds_by_tx)
256 | }
257 |
258 | setMethod("proteinToGenome", "ANY", .default_proteinToGenome)
259 |
260 |
--------------------------------------------------------------------------------
/R/tRNAs.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### Extractors for features in other databases
3 | ### -------------------------------------------------------------------------
4 | ###
5 | ### This is for extractors that do NOT point to the TxDb proper.
6 | ### Such extractors can point to other databases OR they can
7 | ### point to other FeatureDbs within the same package.
8 |
9 | microRNAs <- function(x) .Defunct()
10 |
11 | .syncSeqlevel <- function(txdb, ans){
12 | isActSeq <- .isActiveSeq(txdb)
13 | n2oNames <- levels(seqnames(ans))
14 | n2o <- match(seqnames(seqinfo(txdb)), n2oNames)
15 | seqinfo(ans, new2old=n2o) <- seqinfo(txdb)
16 | seqlevels(ans, pruning.mode="coarse") <- names(isActSeq)[isActSeq]
17 | ans
18 | }
19 |
20 | ## main function
21 | .tRNAs <- function(x) {
22 | if (!requireNamespace("txdbmaker", quietly=TRUE))
23 | stop("Could not load package txdbmaker. Is it installed?\n\n ",
24 | wmsg("Note that the tRNAs() method for TxDb objects requires ",
25 | "the txdbmaker package. Please install it with:"),
26 | "\n\n BiocManager::install(\"txdbmaker\")")
27 |
28 | fdbpkg <- "FDb.UCSC.tRNAs"
29 | fdbenv <- loadNamespace(fdbpkg)
30 | ## get the current package name
31 | pkgName <- txdbmaker::makePackageName(x)
32 | ## from here we know what the FDB should MUST look like
33 | fdbName <- sub("TxDb","FDb",pkgName)
34 | fdbName <- unlist(strsplit(fdbName,"\\."))
35 | fdbName[5] <- "tRNAs"
36 | fdbString <- paste(fdbName,collapse=".")
37 | if (!exists(fdbString, envir=fdbenv)) {
38 | stop("there is no tRNA data available for this organism/source")
39 | } else {
40 | fdb <- get(fdbString, fdbenv)
41 | ans <- features(fdb)
42 | }
43 | ## Now check active seqs and set the seqlevels
44 | .syncSeqlevel(x, ans)
45 | }
46 |
47 | setGeneric("tRNAs", function(x) standardGeneric("tRNAs"))
48 |
49 | setMethod("tRNAs", "TxDb", .tRNAs)
50 |
51 |
52 | ## Test code for new TXTYPE support (BC vs new code)
53 | ## library(TxDb.Hsapiens.BioMart.ensembl.GRCh38);txdb2= TxDb.Hsapiens.BioMart.ensembl.GRCh38;transcripts(txdb2, columns='TXTYPE')
54 | ## exons(txdb2, columns='TXTYPE')
55 | ## And this one works now
56 | ## library(TxDb.Hsapiens.UCSC.hg19.knownGene);txdb = TxDb.Hsapiens.UCSC.hg19.knownGene;transcripts(txdb, columns='TXTYPE')
57 | ## But this still fails (argh):
58 | ## exons(txdb, columns='TXTYPE')
59 |
60 |
--------------------------------------------------------------------------------
/R/transcriptLengths.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### transcriptLengths()
3 | ### -------------------------------------------------------------------------
4 |
5 |
6 | .match_and_check <- function(rglist_names, tx_id)
7 | {
8 | if (is.null(rglist_names))
9 | stop(wmsg("internal error in transcriptLengths(): ",
10 | "no names on 'rglist'"))
11 | m <- match(rglist_names, tx_id)
12 | if (any(is.na(m)))
13 | stop(wmsg("internal error in transcriptLengths(): ",
14 | "some 'rglist' names cannot be mapped to 'tx_id'"))
15 | m
16 | }
17 |
18 | ### 'rglist' must be a named IntegerRangesList or GRangesList.
19 | ### 'tx_id' must be a character vector.
20 | .eltNROWS <- function(rglist, tx_id)
21 | {
22 | ans <- integer(length(tx_id))
23 | m <- .match_and_check(names(rglist), tx_id)
24 | ans[m] <- elementNROWS(rglist)
25 | ans
26 | }
27 |
28 | .sum_width <- function(rglist, tx_id)
29 | {
30 | ans <- integer(length(tx_id))
31 | m <- .match_and_check(names(rglist), tx_id)
32 | ans[m] <- sum(width(rglist))
33 | ans
34 | }
35 |
36 | ### The returned data frame has 1 row per transcript returned by
37 | ### 'transcripts(txdb)' and in the same order.
38 | ### NOTES:
39 | ### - The functions only accepts a TxDb object for now. We'll make it
40 | ### a generic function when we need to support other types of input.
41 | ### - The function could probably be made much faster by querying the
42 | ### TxDb object directly in SQL instead of calling exonsBy(), cdsBy(),
43 | ### fiveUTRsByTranscript(), and threeUTRsByTranscript() successively.
44 | transcriptLengths <- function(txdb, with.cds_len=FALSE,
45 | with.utr5_len=FALSE, with.utr3_len=FALSE,
46 | ...)
47 | {
48 | if (!isTRUEorFALSE(with.cds_len))
49 | stop("'with.cds_len' must be TRUE or FALSE")
50 | if (!isTRUEorFALSE(with.utr5_len))
51 | stop("'with.utr5_len' must be TRUE or FALSE")
52 | if (!isTRUEorFALSE(with.cds_len))
53 | stop("'with.utr3_len' must be TRUE or FALSE")
54 | tx <- transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id"),...)
55 | ans <- mcols(tx)
56 | ans$gene_id <- as.character(ans$gene_id)
57 | tx_id <- as.character(ans$tx_id) # because match() will want a character
58 |
59 | rg_by_tx <- exonsBy(txdb, by="tx", ...)
60 | ans$nexon <- .eltNROWS(rg_by_tx, tx_id)
61 | ans$tx_len <- .sum_width(rg_by_tx, tx_id)
62 | if (with.cds_len) {
63 | rg_by_tx <- cdsBy(txdb, by="tx", ...)
64 | ans$cds_len <- .sum_width(rg_by_tx, tx_id)
65 | }
66 | if (with.utr5_len) {
67 | rg_by_tx <- fiveUTRsByTranscript(txdb, ...)
68 | ans$utr5_len <- .sum_width(rg_by_tx, tx_id)
69 | }
70 | if (with.utr3_len) {
71 | rg_by_tx <- threeUTRsByTranscript(txdb, ...)
72 | ans$utr3_len <- .sum_width(rg_by_tx, tx_id)
73 | }
74 | as.data.frame(ans)
75 | }
76 |
77 |
--------------------------------------------------------------------------------
/R/transcriptLocs2refLocs.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### transcriptLocs2refLocs()
3 | ### -------------------------------------------------------------------------
4 |
5 |
6 | .normargExonStartsOrEnds <- function(exonStarts, argname)
7 | {
8 | if (is.list(exonStarts))
9 | return(exonStarts)
10 | if (is(exonStarts, "IntegerList"))
11 | return(as.list(exonStarts))
12 | if (is.character(exonStarts))
13 | return(toListOfIntegerVectors(exonStarts))
14 | stop("'", argname, "' must be a list of integer vectors, ",
15 | "an IntegerList object,\n or a character vector where ",
16 | "each element is a comma-separated list of\n integers")
17 | }
18 |
19 |
20 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
21 | ### transcriptLocs2refLocs()
22 | ###
23 |
24 | transcriptLocs2refLocs <- function(tlocs, exonStarts=list(), exonEnds=list(),
25 | strand=character(0),
26 | decreasing.rank.on.minus.strand=FALSE,
27 | error.if.out.of.bounds=TRUE)
28 | {
29 | if (!is.list(tlocs)) {
30 | if (!is(tlocs, "IntegerList"))
31 | stop("'tlocs' must be a list of integer vectors ",
32 | "or an IntegerList object")
33 | tlocs <- as.list(tlocs)
34 | }
35 | if (is(exonStarts, "IntegerRangesList")) {
36 | if (!identical(exonEnds, list()))
37 | stop("'exonEnds' cannot be specified ",
38 | "when 'exonStarts' is a IntegerRangesList object")
39 | exonEnds <- end(exonStarts)
40 | exonStarts <- start(exonStarts)
41 | }
42 | exonStarts <- .normargExonStartsOrEnds(exonStarts, "exonStarts")
43 | exonEnds <- .normargExonStartsOrEnds(exonEnds, "exonEnds")
44 | if (is.factor(strand))
45 | strand <- as.vector(strand)
46 | if (!is.character(strand))
47 | stop("'strand' must be a character vector")
48 | if (length(tlocs) != length(strand)
49 | || length(exonStarts) != length(strand)
50 | || length(exonEnds) != length(strand))
51 | stop("'tlocs', 'exonStarts', 'exonEnds' and 'strand' ",
52 | "must have the same length")
53 | if (!isTRUEorFALSE(decreasing.rank.on.minus.strand))
54 | stop("'decreasing.rank.on.minus.strand' must be TRUE or FALSE")
55 | GenomicRanges:::unsafe.transcriptLocs2refLocs(tlocs,
56 | exonStarts, exonEnds, strand,
57 | decreasing.rank.on.minus.strand,
58 | error.if.out.of.bounds)
59 | }
60 |
61 |
62 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
63 | ### transcriptWidths()
64 | ###
65 |
66 | transcriptWidths <- function(exonStarts=list(), exonEnds=list())
67 | {
68 | if (is(exonStarts, "IntegerRangesList")) {
69 | if (!identical(exonEnds, list()))
70 | stop("'exonEnds' cannot be specified ",
71 | "when 'exonStarts' is a IntegerRangesList object")
72 | exonEnds <- end(exonStarts)
73 | exonStarts <- start(exonStarts)
74 | }
75 | exonStarts <- .normargExonStartsOrEnds(exonStarts, "exonStarts")
76 | exonEnds <- .normargExonStartsOrEnds(exonEnds, "exonEnds")
77 | if (length(exonStarts) != length(exonEnds))
78 | stop("'exonStarts', 'exonEnds' must have the same length")
79 | GenomicRanges:::unsafe.transcriptWidths(exonStarts, exonEnds)
80 | }
81 |
82 |
--------------------------------------------------------------------------------
/R/transcriptsByOverlaps.R:
--------------------------------------------------------------------------------
1 | ###
2 |
3 | setGeneric("transcriptsByOverlaps", signature="x",
4 | function(x, ranges, maxgap = -1L, minoverlap = 0L,
5 | type = c("any", "start", "end"), ...)
6 | standardGeneric("transcriptsByOverlaps")
7 | )
8 |
9 | setMethod("transcriptsByOverlaps", "TxDb",
10 | function(x, ranges, maxgap = -1L, minoverlap = 0L,
11 | type = c("any", "start", "end"),
12 | columns = c("tx_id", "tx_name"))
13 | subsetByOverlaps(transcripts(x, columns = columns), ranges,
14 | maxgap = maxgap, minoverlap = minoverlap,
15 | type = match.arg(type))
16 | )
17 |
18 | setGeneric("exonsByOverlaps", signature="x",
19 | function(x, ranges, maxgap = -1L, minoverlap = 0L,
20 | type = c("any", "start", "end"), ...)
21 | standardGeneric("exonsByOverlaps")
22 | )
23 |
24 | setMethod("exonsByOverlaps", "TxDb",
25 | function(x, ranges, maxgap = -1L, minoverlap = 0L,
26 | type = c("any", "start", "end"),
27 | columns = "exon_id")
28 | subsetByOverlaps(exons(x, columns = columns), ranges,
29 | maxgap = maxgap, minoverlap = minoverlap,
30 | type = match.arg(type))
31 | )
32 |
33 | setGeneric("cdsByOverlaps", signature="x",
34 | function(x, ranges, maxgap = -1L, minoverlap = 0L,
35 | type = c("any", "start", "end"), ...)
36 | standardGeneric("cdsByOverlaps")
37 | )
38 |
39 | setMethod("cdsByOverlaps", "TxDb",
40 | function(x, ranges, maxgap = -1L, minoverlap = 0L,
41 | type = c("any", "start", "end"),
42 | columns = "cds_id")
43 | subsetByOverlaps(cds(x, columns = columns), ranges,
44 | maxgap = maxgap, minoverlap = minoverlap,
45 | type = match.arg(type))
46 | )
47 |
48 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### Miscellaneous low-level utils
3 | ### -------------------------------------------------------------------------
4 | ###
5 | ### Unless stated otherwise, nothing in this file is exported.
6 | ###
7 |
8 |
9 | call_fun_in_txdbmaker <- function(fun, ...)
10 | {
11 | msg <- c(fun, "() has moved from GenomicFeatures to the txdbmaker ",
12 | "package, and is formally defunct in GenomicFeatures ",
13 | ">= 1.61.1. Please call txdbmaker::", fun, "() to get rid ",
14 | "of this error.")
15 | .Defunct(msg=wmsg(msg))
16 | }
17 |
18 |
19 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
20 | ### DB related.
21 | ###
22 | ### Most of this stuff was copy/pasted from AnnotationDbi (trying to avoid
23 | ### depending on AnnotationDbi for now).
24 | ###
25 |
26 | ### Environment for storing run-time objects
27 | RTobjs <- new.env(hash=TRUE, parent=emptyenv())
28 |
29 | assign("debugSQL", FALSE, envir=RTobjs)
30 |
31 | debugSQL <- function()
32 | {
33 | debugSQL <- !get("debugSQL", envir=RTobjs)
34 | assign("debugSQL", debugSQL, envir=RTobjs)
35 | debugSQL
36 | }
37 |
38 |
39 | ### Use dbQuery(conn, SQL, 1) instead of dbQuery(conn, SQL)[[1]],
40 | ### it's much safer!
41 | dbEasyQuery <- function(conn, SQL, j0=NA)
42 | {
43 | if (get("debugSQL", envir=RTobjs)) {
44 | if (!is.character(SQL) || length(SQL) != 1L || is.na(SQL))
45 | stop("[debugSQL] 'SQL' must be a single string")
46 | cat("[debugSQL] SQL query: ", SQL, "\n", sep="")
47 | st <- system.time(data0 <- dbGetQuery(conn, SQL))
48 | cat("[debugSQL] time: ", st["user.self"], " seconds\n", sep="")
49 | } else {
50 | data0 <- dbGetQuery(conn, SQL)
51 | }
52 | if (is.na(j0))
53 | return(data0)
54 | ## Needed to deal properly with data frame with 0 column ("NULL data
55 | ## frames with 0 rows") returned by RSQLite when the result of a SELECT
56 | ## query has 0 row
57 | if (nrow(data0) == 0L)
58 | character(0)
59 | else
60 | data0[[j0]]
61 | }
62 |
63 | ### TODO: Put this in AnnotationDbi.
64 | queryAnnotationDb <- function(annotationdb, sql)
65 | {
66 | AnnotationDbi:::dbEasyQuery(dbconn(annotationdb),
67 | paste(sql, collapse="\n"))
68 | }
69 |
70 |
71 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
72 | ### Data frame related.
73 | ###
74 | ### TODO: Find a better home for these low-level data.frame utils.
75 | ###
76 |
77 | ### Not data.frame specific. Would also work on any matrix-like object.
78 | has_col <- function(x, colnames) {colnames %in% colnames(x)}
79 |
80 | makeZeroRowDataFrame <- function(col2class)
81 | {
82 | if (!is.character(col2class) || is.null(names(col2class)))
83 | stop("'col2class' must be a named character vector")
84 | as.data.frame(lapply(col2class, function(class) get(class)()),
85 | stringsAsFactors=FALSE)
86 | }
87 |
88 | ### Sets the class of (all or some of) the columns of a data.frame.
89 | ### Typical use:
90 | ### x <- setDataFrameColClass(x, c(colA="integer", colB="factor"))
91 | ### Note that if 'x' has more than one "colA" col, then *all* of them are
92 | ### coerced to integer.
93 | setDataFrameColClass <- function(x, col2class, drop.extra.cols=FALSE)
94 | {
95 | if (!is.data.frame(x))
96 | stop("'x' must be a data.frame")
97 | if (!is.character(col2class) || is.null(names(col2class)))
98 | stop("'col2class' must be a named character vector")
99 | if (!all(names(col2class) %in% colnames(x)))
100 | stop("'col2class' has invalid names")
101 | if (!isTRUEorFALSE(drop.extra.cols))
102 | stop("'drop.extra.cols' must be TRUE or FALSE")
103 | if (drop.extra.cols) {
104 | col_idx <- which(colnames(x) %in% names(col2class))
105 | } else {
106 | col_idx <- seq_len(ncol(x))
107 | }
108 | tmp <- lapply(col_idx,
109 | function(j)
110 | {
111 | col <- x[[j]]
112 | colname <- colnames(x)[j]
113 | if (!(colname %in% names(col2class)))
114 | return(col)
115 | class <- col2class[[colname]]
116 | FUNname <- paste("as", class, sep=".")
117 | if (exists(FUNname) && is.function(FUN <- get(FUNname)))
118 | return(FUN(col))
119 | as(col, class)
120 | })
121 | names(tmp) <- colnames(x)[col_idx]
122 | return(data.frame(tmp, check.names=FALSE, stringsAsFactors=FALSE))
123 | }
124 |
125 |
126 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
127 | ### ID assignment and/or reassignment.
128 | ###
129 |
130 | ### Returns the vector of ids such that 'unique(x)[ids, ]' is identical
131 | ### to 'x' (in the same way that 'levels(f)[f]' is identical to
132 | ### 'as.vector(f)' when 'f' is a character factor).
133 | ### This unambiguously defines 'ids'. In particular, it's not Locale
134 | ### specific, despite the fact that the current implementation uses a
135 | ### sorting approach.
136 | ### TODO: Remove! (not used anymore)
137 | makeIdsForUniqueDataFrameRows <- function(x)
138 | {
139 | if (!is.data.frame(x))
140 | stop("'x' must be a data.frame")
141 | x_order <- do.call(order, x)
142 | x_dups <- duplicated(x)
143 | ## First we make "provisory" ids. Those ids *are* Locale specific.
144 | prov_ids <- integer(nrow(x))
145 | prov_ids[x_order] <- cumsum(!x_dups[x_order])
146 | ## Convert the "provisory" ids into the final ids. The final ids are
147 | ## *not* Locale specific anymore.
148 | as.integer(factor(prov_ids, levels=unique(prov_ids)))
149 | }
150 |
151 |
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | .test <- function() BiocGenerics:::testPackage("GenomicFeatures")
2 |
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [
](https://bioconductor.org/)
2 |
3 | **GenomicFeatures** is an R/Bioconductor package for querying the gene models of a given organism/assembly.
4 |
5 | See https://bioconductor.org/packages/GenomicFeatures for more information including how to install the release version of the package (please refrain from installing directly from GitHub).
6 |
7 |
--------------------------------------------------------------------------------
/TODO:
--------------------------------------------------------------------------------
1 | o Fix handling of the 'filter' arg (transcripts(), etc...) when the txdb has
2 | user seqlevels on.
3 |
4 | o Add 'filter' arg to transcriptsBy(), exonsBy(), cdsBy(),
5 | fiveUTRByTranscript(), and threeUTRByTranscript(). exonsBy(), cdsBy(), and
6 | *UTRByTranscript() should at least support filtering by gene or transcript
7 | id.
8 |
9 | o Too many helper functions are defined and used internally to query the
10 | db:
11 | - dbEasyQuery() is defined in AnnotationDbi and GenomicFeatures with
12 | different definitions.
13 | - AnnotationDbi:::dbQuery() is the same as GenomicFeatures:::dbEasyQuery()
14 | - queryAnnotationDb()
15 | Clean this mess!
16 |
17 | o DB schema change: Replace tx_chrom, exon_chrom, and cds_chrom columns with
18 | _tx_chrom_id, _exon_chrom_id, and _cds_chrom_id.
19 |
20 | o Add the following indices to the db schema:
21 | CREATE INDEX splicing_tx_id ON splicing (_tx_id);
22 | CREATE INDEX splicing_exon_id ON splicing (_exon_id);
23 | CREATE INDEX splicing_cds_id ON splicing (_cds_id);
24 | CREATE INDEX gene_tx_id ON gene (_tx_id);
25 |
26 |
--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | citEntry(entry="article",
2 | title = "Software for Computing and Annotating Genomic Ranges",
3 | author = personList( as.person("Michael Lawrence" ),
4 | as.person("Wolfgang Huber" ),
5 | as.person("Herv\\'e Pag\\`es" ),
6 | as.person("Patrick Aboyoun" ),
7 | as.person("Marc Carlson" ),
8 | as.person("Robert Gentleman" ),
9 | as.person("Martin Morgan" ),
10 | as.person("Vincent Carey" )),
11 | year = 2013,
12 | journal = "{PLoS} Computational Biology",
13 | volume = "9",
14 | issue = "8",
15 | doi = "10.1371/journal.pcbi.1003118",
16 | url = "http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003118",
17 | textVersion =
18 | "Lawrence M, Huber W, Pag\\`es H, Aboyoun P, Carlson M, et al. (2013) Software for Computing and Annotating Genomic Ranges. PLoS Comput Biol 9(8): e1003118. doi:10.1371/journal.pcbi.1003118" )
19 |
--------------------------------------------------------------------------------
/inst/extdata/Biomart_Ensembl_sample.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/Biomart_Ensembl_sample.sqlite
--------------------------------------------------------------------------------
/inst/extdata/FeatureDb.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/FeatureDb.sqlite
--------------------------------------------------------------------------------
/inst/extdata/ITAG4.1_gene_models.subset.gff:
--------------------------------------------------------------------------------
1 | ##gff-version 3
2 | ##sequence-regionSL4.0ch00 1 9643250
3 | ##sequence-regionSL4.0ch01 1 90863682
4 | ##sequence-regionSL4.0ch02 1 53473368
5 | ##sequence-regionSL4.0ch03 1 65298490
6 | ##sequence-regionSL4.0ch04 1 64459972
7 | ##sequence-regionSL4.0ch05 1 65269487
8 | ##sequence-regionSL4.0ch06 1 47258699
9 | ##sequence-regionSL4.0ch07 1 67883646
10 | ##sequence-regionSL4.0ch08 1 63995357
11 | ##sequence-regionSL4.0ch09 1 68513564
12 | ##sequence-regionSL4.0ch10 1 64792705
13 | ##sequence-regionSL4.0ch11 1 54379777
14 | ##sequence-regionSL4.0ch12 1 66688036
15 | SL4.0ch00 maker_ITAG gene 2030916 2032369 . + . ID=gene:Solyc00g025400.2;Alias=Solyc00g025400;Name=Solyc00g025400.2;length=1453
16 | SL4.0ch00 maker_ITAG mRNA 2030916 2032369 . + . ID=mRNA:Solyc00g025400.2.1;Parent=gene:Solyc00g025400.2;Name=Solyc00g025400.2.1;Note=Unknown protein;_AED=1.00;_QI=250|0|0|0|0|0|2|0|156;_eAED=1.00
17 | SL4.0ch00 maker_ITAG five_prime_UTR 2030916 2031165 . + . ID=five_prime_UTR:Solyc00g025400.2.1.0;Parent=mRNA:Solyc00g025400.2.1
18 | SL4.0ch00 maker_ITAG exon 2030916 2031456 . + . ID=exon:Solyc00g025400.2.1.1;Parent=mRNA:Solyc00g025400.2.1
19 | SL4.0ch00 maker_ITAG CDS 2031166 2031456 . + 0 ID=CDS:Solyc00g025400.2.1.1;Parent=mRNA:Solyc00g025400.2.1
20 | SL4.0ch00 maker_ITAG exon 2032190 2032369 . + . ID=exon:Solyc00g025400.2.1.2;Parent=mRNA:Solyc00g025400.2.1
21 | SL4.0ch00 maker_ITAG CDS 2032190 2032369 . + 0 ID=CDS:Solyc00g025400.2.1.2;Parent=mRNA:Solyc00g025400.2.1
22 | ###
23 | SL4.0ch00 maker gene 2062209 2063021 . - . ID=gene:Solyc00g500104.2;Name=Solyc00g500104.2
24 | SL4.0ch00 maker mRNA 2062209 2063021 . - . ID=mRNA:Solyc00g500104.2.1;Parent=gene:Solyc00g500104.2;Name=Solyc00g500104.2.1;_aed=0.27;_eaed=0.27;_qi=0|-1|0|1|-1|1|1|150|220;Note=RNA-dependent RNA polymerase (AHRD V3.11 *-* tr|A0A2Z6JIR4|A0A2Z6JIR4_9VIRU);Dbxref=InterPro:IPR008686,Pfam:PF05919
25 | SL4.0ch00 maker three_prime_UTR 2062209 2062358 . - . ID=three_prime_UTR:Solyc00g500104.2.1.0;Parent=mRNA:Solyc00g500104.2.1
26 | SL4.0ch00 maker exon 2062209 2063021 . - . ID=exon:Solyc00g500104.2.1.1;Parent=mRNA:Solyc00g500104.2.1
27 | SL4.0ch00 maker CDS 2062359 2063021 . - 0 ID=CDS:Solyc00g500104.2.1.1;Parent=mRNA:Solyc00g500104.2.1
28 | ###
29 | SL4.0ch00 maker_ITAG gene 2081475 2081793 . - . ID=gene:Solyc00g142160.1;Alias=Solyc00g142160;Name=Solyc00g142160.1;length=318
30 | SL4.0ch00 maker_ITAG mRNA 2081475 2081793 . - . ID=mRNA:Solyc00g142160.1.1;Parent=gene:Solyc00g142160.1;Name=Solyc00g142160.1.1;Note=RNA-dependent RNA polymerase (AHRD V3.3 *-* A0A2Z6JIR4_9VIRU);_AED=0.64;_QI=0|-1|0|1|-1|0|1|43|91;_eAED=0.64
31 | SL4.0ch00 maker_ITAG three_prime_UTR 2081475 2081517 . - . ID=three_prime_UTR:Solyc00g142160.1.1.0;Parent=mRNA:Solyc00g142160.1.1
32 | SL4.0ch00 maker_ITAG exon 2081475 2081793 . - . ID=exon:Solyc00g142160.1.1.1;Parent=mRNA:Solyc00g142160.1.1
33 | SL4.0ch00 maker_ITAG CDS 2081518 2081793 . - 0 ID=CDS:Solyc00g142160.1.1.1;Parent=mRNA:Solyc00g142160.1.1
34 | ###
35 | SL4.0ch00 maker_ITAG gene 2081817 2083225 . - . ID=gene:Solyc00g142170.2;Alias=Solyc00g142170;Name=Solyc00g142170.2;length=1408
36 | SL4.0ch00 maker_ITAG mRNA 2081817 2083225 . - . ID=mRNA:Solyc00g142170.2.1;Parent=gene:Solyc00g142170.2;Name=Solyc00g142170.2.1;Note=RNA-dependent RNA polymerase (AHRD V3.3 *-* A0A2Z6JIR4_9VIRU);Dbxref=InterPro:IPR008686,Pfam:PF05919;_AED=0.38;_QI=89|0|0|0.33|0|0|3|39|305;_eAED=0.43
37 | SL4.0ch00 maker_ITAG three_prime_UTR 2081817 2081855 . - . ID=three_prime_UTR:Solyc00g142170.2.1.0;Parent=mRNA:Solyc00g142170.2.1
38 | SL4.0ch00 maker_ITAG exon 2081817 2082335 . - . ID=exon:Solyc00g142170.2.1.1;Parent=mRNA:Solyc00g142170.2.1
39 | SL4.0ch00 maker_ITAG CDS 2081856 2082335 . - 0 ID=CDS:Solyc00g142170.2.1.1;Parent=mRNA:Solyc00g142170.2.1
40 | SL4.0ch00 maker_ITAG exon 2082546 2082748 . - . ID=exon:Solyc00g142170.2.1.2;Parent=mRNA:Solyc00g142170.2.1
41 | SL4.0ch00 maker_ITAG CDS 2082546 2082748 . - 2 ID=CDS:Solyc00g142170.2.1.2;Parent=mRNA:Solyc00g142170.2.1
42 | SL4.0ch00 maker_ITAG CDS 2082902 2083136 . - 0 ID=CDS:Solyc00g142170.2.1.3;Parent=mRNA:Solyc00g142170.2.1
43 | SL4.0ch00 maker_ITAG exon 2082902 2083225 . - . ID=exon:Solyc00g142170.2.1.3;Parent=mRNA:Solyc00g142170.2.1
44 | SL4.0ch00 maker_ITAG five_prime_UTR 2083137 2083225 . - . ID=five_prime_UTR:Solyc00g142170.2.1.0;Parent=mRNA:Solyc00g142170.2.1
45 | ###
46 | SL4.0ch00 maker_ITAG gene 2136189 2136452 . - . ID=gene:Solyc00g500120.1;Alias=Solyc00g500120;Name=Solyc00g500120.1;length=263
47 | SL4.0ch00 maker_ITAG mRNA 2136189 2136452 . - . ID=mRNA:Solyc00g500120.1.1;Parent=gene:Solyc00g500120.1;Name=Solyc00g500120.1.1;Note=Unknown protein;_AED=0.46;_QI=0|-1|0|1|-1|0|1|0|87;_eAED=1.00
48 | SL4.0ch00 maker_ITAG exon 2136189 2136452 . - . ID=exon:Solyc00g500120.1.1.1;Parent=mRNA:Solyc00g500120.1.1
49 | SL4.0ch00 maker_ITAG CDS 2136189 2136452 . - 0 ID=CDS:Solyc00g500120.1.1.1;Parent=mRNA:Solyc00g500120.1.1
50 | ###
51 | SL4.0ch00 maker_ITAG gene 2139483 2140481 . - . ID=gene:Solyc00g500121.1;Alias=Solyc00g500121;Name=Solyc00g500121.1;length=998
52 | SL4.0ch00 maker_ITAG mRNA 2139483 2140481 . - . ID=mRNA:Solyc00g500121.1.1;Parent=gene:Solyc00g500121.1;Name=Solyc00g500121.1.1;Note=Unknown protein;_AED=0.38;_QI=558|1|0.5|1|0|0|2|0|86;_eAED=0.38
53 | SL4.0ch00 maker_ITAG exon 2139483 2139659 . - . ID=exon:Solyc00g500121.1.1.1;Parent=mRNA:Solyc00g500121.1.1
54 | SL4.0ch00 maker_ITAG CDS 2139483 2139659 . - 0 ID=CDS:Solyc00g500121.1.1.1;Parent=mRNA:Solyc00g500121.1.1
55 | SL4.0ch00 maker_ITAG CDS 2139840 2139923 . - 0 ID=CDS:Solyc00g500121.1.1.2;Parent=mRNA:Solyc00g500121.1.1
56 | SL4.0ch00 maker_ITAG exon 2139840 2140481 . - . ID=exon:Solyc00g500121.1.1.2;Parent=mRNA:Solyc00g500121.1.1
57 | SL4.0ch00 maker_ITAG five_prime_UTR 2139924 2140481 . - . ID=five_prime_UTR:Solyc00g500121.1.1.0;Parent=mRNA:Solyc00g500121.1.1
58 | ###
59 | SL4.0ch00 maker_ITAG gene 2187524 2191157 . - . ID=gene:Solyc00g500122.1;Alias=Solyc00g500122;Name=Solyc00g500122.1;length=3633
60 | SL4.0ch00 maker_ITAG mRNA 2187524 2191157 . - . ID=mRNA:Solyc00g500122.1.1;Parent=gene:Solyc00g500122.1;Name=Solyc00g500122.1.1;Note=Unknown protein;_AED=0.40;_QI=0|0|0.33|1|0|0|3|433|122;_eAED=0.51
61 | SL4.0ch00 maker_ITAG three_prime_UTR 2187524 2187956 . - . ID=three_prime_UTR:Solyc00g500122.1.1.0;Parent=mRNA:Solyc00g500122.1.1
62 | SL4.0ch00 maker_ITAG exon 2187524 2188176 . - . ID=exon:Solyc00g500122.1.1.1;Parent=mRNA:Solyc00g500122.1.1
63 | SL4.0ch00 maker_ITAG CDS 2187957 2188176 . - 1 ID=CDS:Solyc00g500122.1.1.1;Parent=mRNA:Solyc00g500122.1.1
64 | SL4.0ch00 maker_ITAG exon 2190928 2190992 . - . ID=exon:Solyc00g500122.1.1.2;Parent=mRNA:Solyc00g500122.1.1
65 | SL4.0ch00 maker_ITAG CDS 2190928 2190992 . - 0 ID=CDS:Solyc00g500122.1.1.2;Parent=mRNA:Solyc00g500122.1.1
66 | SL4.0ch00 maker_ITAG exon 2191074 2191157 . - . ID=exon:Solyc00g500122.1.1.3;Parent=mRNA:Solyc00g500122.1.1
67 | SL4.0ch00 maker_ITAG CDS 2191074 2191157 . - 0 ID=CDS:Solyc00g500122.1.1.3;Parent=mRNA:Solyc00g500122.1.1
68 | ###
69 | SL4.0ch00 maker gene 2329909 2332560 . - . ID=gene:Solyc00g160010.1;Name=Solyc00g160010.1
70 | SL4.0ch00 maker mRNA 2329909 2332560 . - . ID=mRNA:Solyc00g160010.1.1;Parent=gene:Solyc00g160010.1;Name=Solyc00g160010.1.1;_aed=0.77;_eaed=0.77;_qi=0|0|0|0.25|1|1|4|0|150;_merge_warning=1;Note=zinc finger CCCH domain-containing protein 30-like (AHRD V3.11 *-* XP_025884875.1)
71 | SL4.0ch00 maker exon 2329909 2329963 . - . ID=exon:Solyc00g160010.1.1.1;Parent=mRNA:Solyc00g160010.1.1
72 | SL4.0ch00 maker CDS 2329909 2329963 . - 1 ID=CDS:Solyc00g160010.1.1.1;Parent=mRNA:Solyc00g160010.1.1
73 | SL4.0ch00 maker exon 2330621 2330687 . - . ID=exon:Solyc00g160010.1.1.2;Parent=mRNA:Solyc00g160010.1.1
74 | SL4.0ch00 maker CDS 2330621 2330687 . - 2 ID=CDS:Solyc00g160010.1.1.2;Parent=mRNA:Solyc00g160010.1.1
75 | SL4.0ch00 maker exon 2330792 2330966 . - . ID=exon:Solyc00g160010.1.1.3;Parent=mRNA:Solyc00g160010.1.1
76 | SL4.0ch00 maker CDS 2330792 2330966 . - 0 ID=CDS:Solyc00g160010.1.1.3;Parent=mRNA:Solyc00g160010.1.1
77 | SL4.0ch00 maker exon 2332405 2332560 . - . ID=exon:Solyc00g160010.1.1.4;Parent=mRNA:Solyc00g160010.1.1
78 | SL4.0ch00 maker CDS 2332405 2332560 . - 0 ID=CDS:Solyc00g160010.1.1.4;Parent=mRNA:Solyc00g160010.1.1
79 | ###
80 | SL4.0ch00 maker gene 2377393 2378166 . - . ID=gene:Solyc00g160330.1;Name=Solyc00g160330.1
81 | SL4.0ch00 maker mRNA 2377393 2378166 . - . ID=mRNA:Solyc00g160330.1.1;Parent=gene:Solyc00g160330.1;Name=Solyc00g160330.1.1;_aed=0.58;_eaed=0.59;_qi=0|0|0|0.5|0|0.5|2|0|82;Note=zinc finger CCCH domain-containing protein 30-like (AHRD V3.11 *-* XP_025884875.1)
82 | SL4.0ch00 maker exon 2377393 2377434 . - . ID=exon:Solyc00g160330.1.1.1;Parent=mRNA:Solyc00g160330.1.1
83 | SL4.0ch00 maker CDS 2377393 2377434 . - 0 ID=CDS:Solyc00g160330.1.1.1;Parent=mRNA:Solyc00g160330.1.1
84 | SL4.0ch00 maker exon 2377960 2378166 . - . ID=exon:Solyc00g160330.1.1.2;Parent=mRNA:Solyc00g160330.1.1
85 | SL4.0ch00 maker CDS 2377960 2378166 . - 0 ID=CDS:Solyc00g160330.1.1.2;Parent=mRNA:Solyc00g160330.1.1
86 | ###
87 | SL4.0ch00 maker_ITAG gene 2379604 2380807 . - . ID=gene:Solyc00g007330.1;Alias=Solyc00g007330;Name=Solyc00g007330.1;length=1203
88 | SL4.0ch00 maker_ITAG mRNA 2379604 2380807 . - . ID=mRNA:Solyc00g007330.1.1;Parent=gene:Solyc00g007330.1;Name=Solyc00g007330.1.1;Note=Zinc finger transcription factor 1;_AED=1.00;_QI=373|0|0|0|0|0|2|0|171;_eAED=1.00
89 | SL4.0ch00 maker_ITAG CDS 2379604 2380119 . - 0 ID=CDS:Solyc00g007330.1.1.1;Parent=mRNA:Solyc00g007330.1.1
90 | SL4.0ch00 maker_ITAG exon 2379604 2380324 . - . ID=exon:Solyc00g007330.1.1.1;Parent=mRNA:Solyc00g007330.1.1
91 | SL4.0ch00 maker_ITAG five_prime_UTR 2380120 2380324 . - . ID=five_prime_UTR:Solyc00g007330.1.1.0;Parent=mRNA:Solyc00g007330.1.1
92 | SL4.0ch00 maker_ITAG exon 2380640 2380807 . - . ID=exon:Solyc00g007330.1.1.2;Parent=mRNA:Solyc00g007330.1.1
93 | SL4.0ch00 maker_ITAG five_prime_UTR 2380640 2380807 . - . ID=five_prime_UTR:Solyc00g007330.1.1.1;Parent=mRNA:Solyc00g007330.1.1
94 | ###
95 |
--------------------------------------------------------------------------------
/inst/extdata/cD.exByEdge-SG-Vig.Rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/cD.exByEdge-SG-Vig.Rda
--------------------------------------------------------------------------------
/inst/extdata/cD.exsByGenes-SG-Vig.Rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/cD.exsByGenes-SG-Vig.Rda
--------------------------------------------------------------------------------
/inst/extdata/events.Rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/events.Rda
--------------------------------------------------------------------------------
/inst/extdata/hg19_knownGene_sample.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/hg19_knownGene_sample.sqlite
--------------------------------------------------------------------------------
/inst/extdata/sample_ranges.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/GenomicFeatures/4b7c91ac86e98b9b2414af4b2d00211487ae0693/inst/extdata/sample_ranges.rds
--------------------------------------------------------------------------------
/inst/script/README:
--------------------------------------------------------------------------------
1 | Scripts make_tRNAFDb.R and make_TxDbs.R are now in the txdbmaker package!
2 |
3 |
--------------------------------------------------------------------------------
/inst/unitTests/test_TxDb_seqinfo.R:
--------------------------------------------------------------------------------
1 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
2 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
3 |
4 | test_rename_seqlevels <- function()
5 | {
6 | txdb <- restoreSeqlevels(txdb)
7 | new_seqlevels <- as.character(seq_along(seqlevels(txdb)))
8 | seqlevels(txdb) <- new_seqlevels
9 | checkIdentical(new_seqlevels, seqlevels(txdb))
10 | }
11 |
12 | test_restrict_seqlevels <- function()
13 | {
14 | ## This should work
15 | txdb <- restoreSeqlevels(txdb)
16 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5")
17 | checkEquals(length(seqinfo(txdb)), 1)
18 |
19 | ## This should work
20 | txdb <- restoreSeqlevels(txdb)
21 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5", chr6="6", chr4="4")
22 | checkTrue(length(seqinfo(txdb)) == 3)
23 | checkIdentical(c("5", "6", "4"), seqlevels(txdb))
24 | checkTrue(seqlengths(txdb)[2] == min(seqlengths(txdb)))
25 | checkTrue(seqlengths(txdb)[3] == max(seqlengths(txdb)))
26 |
27 | ## And this should NOT work
28 | txdb <- restoreSeqlevels(txdb)
29 | checkException(seqlevels(txdb, pruning.mode="coarse") <- c(foo="2"))
30 | }
31 |
32 | test_seqinfo_setter <- function()
33 | {
34 | txdb <- restoreSeqlevels(txdb)
35 | new_seqinfo <- seqinfo(txdb)
36 | seqnames(new_seqinfo) <- paste0("NEW_", seqnames(new_seqinfo))
37 | seqinfo(txdb, new2old=seq_along(new_seqinfo)) <- new_seqinfo
38 | checkIdentical(new_seqinfo, seqinfo(txdb))
39 |
40 | txdb <- restoreSeqlevels(txdb)
41 | new_seqinfo <- seqinfo(txdb)
42 | seqlengths(new_seqinfo) <- 5 * seqlengths(new_seqinfo)
43 | checkException(seqinfo(txdb) <- new_seqinfo)
44 |
45 | txdb <- restoreSeqlevels(txdb)
46 | new_seqinfo <- seqinfo(txdb)
47 | isCircular(new_seqinfo) <- rep(TRUE, length(new_seqinfo))
48 | checkException(seqinfo(txdb) <- new_seqinfo)
49 |
50 | txdb <- restoreSeqlevels(txdb)
51 | new_seqinfo <- seqinfo(txdb)
52 | genome(new_seqinfo) <- "foo"
53 | seqinfo(txdb) <- new_seqinfo
54 | checkIdentical(new_seqinfo, seqinfo(txdb))
55 | }
56 |
57 | test_transcripts_accessor <- function()
58 | {
59 | txdb <- restoreSeqlevels(txdb)
60 | txs1 <- transcripts(txdb)
61 | seqlevels(txs1, pruning.mode="coarse") <- c(chr5="5")
62 | ## Then change seqlevels for txdb
63 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5")
64 | txs2 <- transcripts(txdb)
65 | checkIdentical(txs1, txs2)
66 | }
67 |
68 | test_exons_accessor <- function()
69 | {
70 | txdb <- restoreSeqlevels(txdb)
71 | exs1 <- exons(txdb)
72 | seqlevels(exs1, pruning.mode="coarse") <- c(chr5="5")
73 | ## Then change seqlevels for txdb
74 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5")
75 | exs2 <- exons(txdb)
76 | checkIdentical(exs1, exs2)
77 | }
78 |
79 | test_cds_accessor <- function()
80 | {
81 | txdb <- restoreSeqlevels(txdb)
82 | cds1 <- cds(txdb)
83 | seqlevels(cds1, pruning.mode="coarse") <- c(chr5="5")
84 | ## Then change seqlevels for txdb
85 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5")
86 | cds2 <- cds(txdb)
87 | checkIdentical(cds1, cds2)
88 | }
89 |
90 | test_promoters_accessor <- function()
91 | {
92 | txdb <- restoreSeqlevels(txdb)
93 | prm1 <- promoters(txdb)
94 | seqlevels(prm1, pruning.mode="coarse") <- c(chr5="5")
95 | ## Then change seqlevels for txdb
96 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5")
97 | prm2 <- promoters(txdb)
98 | checkIdentical(prm1, prm2)
99 |
100 | txdb <- restoreSeqlevels(txdb)
101 | trmn1 <- terminators(txdb)
102 | seqlevels(trmn1, pruning.mode="coarse") <- c(chr5="5")
103 | ## Then change seqlevels for txdb
104 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5")
105 | trmn2 <- terminators(txdb)
106 | checkIdentical(trmn1, trmn2)
107 | }
108 |
109 | test_transcriptsBy_accessors <- function()
110 | {
111 | ## This one is a "fun" one.
112 | ## There are issues because some genes are annotated as being on
113 | ## TWO different chromosomes. Such genes are filtered for txs3,
114 | ## but NOT for txs4... Hmmmm.
115 | txdb <- restoreSeqlevels(txdb)
116 | txs3 <- transcriptsBy(txdb, by="gene")
117 | seqlevels(txs3, pruning.mode="coarse") <- c(chr5="5")
118 | ## Then change seqlevels for txdb
119 | seqlevels(txdb, pruning.mode="coarse") <- c(chr5="5")
120 | txs4 <- transcriptsBy(txdb, by="gene")
121 | ## checkIdentical(txs3, txs4) ## TROUBLE!!
122 |
123 | }
124 |
125 |
126 | ## What to do about this? The reason for the difference is because of order of operations. txs3 gets all the ranges and then removes any that are not kosher (this is correct), txs4 OTOH gets only ranges from chr5 (efficient!), but then fails to filter out things that have hybrid seqnames (as they were pre-filtered). I think I have to make the query less efficient to fix this, but I want to discuss it with Herve 1st to get a 2nd opinion.
127 |
--------------------------------------------------------------------------------
/inst/unitTests/test_exonicParts.R:
--------------------------------------------------------------------------------
1 | ###
2 |
3 | test_exonicParts <- function()
4 | {
5 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite",
6 | package="GenomicFeatures"))
7 |
8 | exonic_parts <- exonicParts(txdb, linked.to.single.gene.only=TRUE)
9 | checkIdentical(length(exonic_parts), 653L)
10 | expected_mcolnames <- c("tx_id", "tx_name", "gene_id", "exon_id",
11 | "exon_name", "exon_rank", "exonic_part")
12 | checkIdentical(expected_mcolnames, names(mcols(exonic_parts)))
13 | checkTrue(is.character(mcols(exonic_parts)$gene_id))
14 | checkTrue(is(mcols(exonic_parts)$tx_name, "CharacterList"))
15 | checkTrue(is.integer(mcols(exonic_parts)$exonic_part))
16 |
17 | exonic_parts <- exonicParts(txdb, linked.to.single.gene.only=FALSE)
18 | checkIdentical(length(exonic_parts), 660L)
19 | expected_mcolnames <- head(expected_mcolnames, n=-1L)
20 | checkIdentical(expected_mcolnames, names(mcols(exonic_parts)))
21 | }
22 |
23 |
--------------------------------------------------------------------------------
/inst/unitTests/test_getPromoterSeq-methods.R:
--------------------------------------------------------------------------------
1 | library(BSgenome.Hsapiens.UCSC.hg19)
2 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
3 | library(BSgenome.Dmelanogaster.UCSC.dm3)
4 | library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
5 | library(Rsamtools)
6 | library(pasillaBamSubset)
7 |
8 | e2f3 <- "1871" # human gene on the plus strand, chr6
9 | grb2 <- "2885" # human gene on the minus strand, chr17
10 |
11 | # a note on method: when the promoter sequence is 20 bases or more in length,
12 | # uscs blat will find these sequences, and a quick visual inspection of the
13 | # accompanying genome browser view at the right level of zoom, will
14 | # confirm that the per-transcript sequences is indeed correct.
15 | # there are a few tests of shorter sequences below as well, which
16 | # I checked in the genome browser, but this required a little more effort
17 | # than the length 20, blat approach.
18 |
19 | test_GRangesListBSgenomeHumanGetPromoterSeq <- function() {
20 | txdb <- restoreSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene) ## safety net
21 | genes <- c(e2f3, grb2)
22 | tx_by_gene <- transcriptsBy(txdb, by="gene")[genes]
23 | checkIdentical(names(tx_by_gene), genes)
24 | transcript_count <- length(unlist(tx_by_gene, use.names=FALSE))
25 |
26 | promoter_seqs <- getPromoterSeq(tx_by_gene, Hsapiens,
27 | upstream=10, downstream=0)
28 | checkTrue(validObject(promoter_seqs))
29 | checkTrue(is(promoter_seqs, "DNAStringSetList"))
30 | checkEquals(length(promoter_seqs), 2)
31 | checkIdentical(names(promoter_seqs), genes)
32 | checkIdentical(width(unlist(promoter_seqs, use.names=FALSE)),
33 | rep.int(10L, transcript_count))
34 |
35 | terminator_seqs <- getTerminatorSeq(tx_by_gene, Hsapiens,
36 | upstream=10, downstream=0)
37 | checkTrue(validObject(terminator_seqs))
38 | checkTrue(is(terminator_seqs, "DNAStringSetList"))
39 | checkEquals(length(terminator_seqs), 2)
40 | checkIdentical(names(terminator_seqs), genes)
41 | checkIdentical(width(unlist(terminator_seqs, use.names=FALSE)),
42 | rep.int(10L, transcript_count))
43 | }
44 |
45 | test_GRangesListBSgenomeFlyGetPromoterSeq <- function() {
46 | # two neighboring genes near beginning of chr3R, on opposite strands
47 | # gene_id flybase_id symbol
48 | # 40524 FBgn0037215 CG12582
49 | # 40526 FBgn0037217 CG14636
50 | # in 2012, UCSC reported 4 total transcripts for these two genes
51 | # in 2013, 6. there should be as many promoter_seqs as there
52 | # are transcripts, and they should each be of width
53 | # upstream + downstream. it is risky to check for specific
54 | # sequence in the promoter_seqs since the annotation and sequence
55 | # may change
56 |
57 | txdb <- restoreSeqlevels(TxDb.Dmelanogaster.UCSC.dm3.ensGene) ## safety net
58 | genes <- c("FBgn0037215", "FBgn0037217")
59 | tx_by_gene <- transcriptsBy(txdb, by="gene")[genes]
60 | checkIdentical(names(tx_by_gene), genes)
61 | transcript_count <- length(unlist(tx_by_gene, use.names=FALSE))
62 |
63 | promoter_seqs <- getPromoterSeq(tx_by_gene, Dmelanogaster,
64 | upstream=10, downstream=10)
65 | checkTrue(validObject(promoter_seqs))
66 | checkTrue(is(promoter_seqs, "DNAStringSetList"))
67 | checkEquals(length(promoter_seqs), 2)
68 | checkIdentical(names(promoter_seqs), genes)
69 | checkIdentical(width(unlist(promoter_seqs, use.names=FALSE)),
70 | rep.int(20L, transcript_count))
71 |
72 | terminator_seqs <- getPromoterSeq(tx_by_gene, Dmelanogaster,
73 | upstream=10, downstream=10)
74 | checkTrue(validObject(terminator_seqs))
75 | checkTrue(is(terminator_seqs, "DNAStringSetList"))
76 | checkEquals(length(terminator_seqs), 2)
77 | checkIdentical(names(terminator_seqs), genes)
78 | checkIdentical(width(unlist(terminator_seqs, use.names=FALSE)),
79 | rep.int(20L, transcript_count))
80 | }
81 |
82 | test_GRangesListFastaFlyGetPromoterSeq <- function() {
83 | # two neighboring genes near beginning of chr3R, on opposite strands
84 | # gene_id flybase_id symbol chr
85 | # 43766 FBgn0025740 plexB 4
86 | # 43769 FBgn0085432 pan 4
87 |
88 | txdb <- restoreSeqlevels(TxDb.Dmelanogaster.UCSC.dm3.ensGene) ## safety net
89 | genes <- c("FBgn0025740", "FBgn0085432")
90 | tx_by_gene <- transcriptsBy(txdb, by="gene")[genes]
91 | checkIdentical(names(tx_by_gene), genes)
92 | transcript_count <- length(unlist(tx_by_gene, use.names=FALSE))
93 | fa_file <- FaFile(dm3_chr4())
94 |
95 | promoter_seqs <- getPromoterSeq(tx_by_gene, fa_file,
96 | upstream=10, downstream=10)
97 | checkTrue(validObject(promoter_seqs))
98 | checkTrue(is(promoter_seqs, "DNAStringSetList"))
99 | checkEquals(length(promoter_seqs), 2)
100 | checkIdentical(names(promoter_seqs), genes)
101 | checkIdentical(width(unlist(promoter_seqs, use.names=FALSE)),
102 | rep.int(20L, transcript_count))
103 | # we are unable to check for specific DNA sequence, since
104 | # the UCSC annotation of these genes changes over time.
105 |
106 | terminator_seqs <- getPromoterSeq(tx_by_gene, fa_file,
107 | upstream=10, downstream=10)
108 | checkTrue(validObject(terminator_seqs))
109 | checkTrue(is(terminator_seqs, "DNAStringSetList"))
110 | checkEquals(length(terminator_seqs), 2)
111 | checkIdentical(names(terminator_seqs), genes)
112 | checkIdentical(width(unlist(terminator_seqs, use.names=FALSE)),
113 | rep.int(20L, transcript_count))
114 | }
115 |
116 | test_GRangesBSgenomeHumanGetPromoterSeq <- function() {
117 | txdb <- restoreSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene) ## safety net
118 | e2f3_tx <- transcriptsBy(txdb, by="gene")[[e2f3]]
119 | #names(e2f3_tx) <- mcols(e2f3_tx)$tx_name
120 | transcript_count <- length(e2f3_tx)
121 | checkEquals(dim(mcols(e2f3_tx)), c(transcript_count, 2))
122 | checkIdentical(colnames(mcols(e2f3_tx)), c("tx_id", "tx_name"))
123 |
124 | promoter_seqs <- getPromoterSeq(e2f3_tx, Hsapiens,
125 | upstream=10, downstream=0)
126 | checkTrue(validObject(promoter_seqs))
127 | checkTrue(is(promoter_seqs, "DNAStringSet"))
128 | checkEquals(length(promoter_seqs), transcript_count)
129 | checkTrue(is.null(names(promoter_seqs)))
130 | checkIdentical(width(promoter_seqs), rep.int(10L, transcript_count))
131 | # should be one more column in the metadata than in the metadata
132 | checkEquals(dim(mcols(promoter_seqs)), c(transcript_count, 3))
133 | checkEquals(colnames(mcols(promoter_seqs)), c("tx_id", "tx_name", "geneID"))
134 | # the input, a GRanges, had no names -- which are the source
135 | # of geneID when the GRangesList version of this methods is called.
136 | # so ensure that this lack of information was passed along into the
137 | # metadata of the returned promoter_seqs
138 | checkTrue(all(is.na(mcols(promoter_seqs)$geneID)))
139 | }
140 |
141 |
--------------------------------------------------------------------------------
/inst/unitTests/test_makeIdsForUniqueDataFrameRows.R:
--------------------------------------------------------------------------------
1 | ###
2 |
3 | test_makeIdsForUniqueDataFrameRows <- function()
4 | {
5 | x <- data.frame(
6 | chrom=c("chr2", "chr2", "chr2", "chr2", "chr1",
7 | "chr2", "chr2", "chr1", "chr3", "chr1"),
8 | strand=c("+", "-", "-", "+", "+", "+", "+", "-", "-", "+"),
9 | start=c(5, 2, 2, 5, 4, 5, 5, 4, 2, 1),
10 | end=c(15, 12, 12, 15, 14, 13, 15, 14, 12, 11)
11 | )
12 | y <- unique(x)[GenomicFeatures:::makeIdsForUniqueDataFrameRows(x), ]
13 | row.names(y) <- NULL
14 | checkEquals(y, x)
15 | }
16 |
17 |
--------------------------------------------------------------------------------
/inst/unitTests/test_mapIdsToRanges.R:
--------------------------------------------------------------------------------
1 | txdb <- local({
2 | library(txdbmaker) # for makeTxDbFromGRanges()
3 | fl <- system.file(package = "GenomicFeatures", "extdata",
4 | "sample_ranges.rds")
5 | makeTxDbFromGRanges(readRDS(fl))
6 | })
7 |
8 | test_mapIdsToRanges_improper_inputs <- function()
9 | {
10 | checkException(mapIdsToRanges(txdb, keys = ""),
11 | "must be a named list")
12 |
13 | checkException(mapIdsToRanges(txdb, keys = "ENST000000271582"),
14 | "must be a named list")
15 |
16 | checkException(mapIdsToRanges(txdb, keys = list("ENST000000271582")),
17 | "must be a named list")
18 |
19 | checkException(mapIdsToRanges(txdb,
20 | keys = list(tx_name = "ENST000000271582"),
21 | column = 1),
22 | "'columns' must be 'NULL' or a character vector")
23 | }
24 |
25 | test_mapIdsToRanges_same_order <- function()
26 | {
27 | keys <- list(tx_name = c("ENST00000371582", "ENST00000371588",
28 | "ENST00000494752", "ENST00000614008", "ENST00000496771"))
29 | res <- mapIdsToRanges(txdb, keys = keys, type = "tx")
30 | checkEquals(names(res), keys[[1]])
31 |
32 | # shuffle the order and make sure it remains equivalent
33 | for (i in seq_len(10)) {
34 | keys$tx_name <- sample(keys$tx_name)
35 | res <- mapIdsToRanges(txdb, keys = keys, type = "tx")
36 | checkEquals(names(res), keys[[1]])
37 | }
38 | }
39 |
40 | test_mapIdsToRanges_missing_results <- function()
41 | {
42 | keys <- list(tx_name = c("ENST00000371582", "NOT_FOUND", "ENST00000494752"))
43 | res <- mapIdsToRanges(txdb, keys = keys, type = "tx")
44 | checkEquals(names(res), keys$tx_name)
45 |
46 | # shuffle the order and make sure it remains equivalent
47 | for (i in seq_len(10)) {
48 | keys$tx_name <- sample(keys$tx_name)
49 | res <- mapIdsToRanges(txdb, keys = keys, type = "tx")
50 | checkEquals(names(res), keys$tx_name)
51 | }
52 | }
53 |
54 | test_mapIdsToRanges_duplicate_ranges <- function()
55 | {
56 | # both of these transcripts are from the same gene
57 | keys <- list(tx_name = c("ENST00000371582", "ENST00000494752"))
58 | res <- mapIdsToRanges(txdb, keys = keys, type = "gene")
59 |
60 | #names match input
61 | checkEquals(names(res), keys[[1]])
62 | # but values are the same
63 | checkTrue(all.equal(res[[1]], res[[2]], check.attributes = FALSE))
64 | }
65 |
66 | test_mapIdsToRanges_duplicate_ids <- function() {
67 | keys <- list(tx_name = c("ENST00000371582", "ENST00000494752",
68 | "ENST00000371582"))
69 | res <- mapIdsToRanges(txdb, keys = keys, type = "gene")
70 | checkEquals(names(res), keys[[1]])
71 | checkEquals(res[[1]], res[[3]])
72 | }
73 |
74 | test_mapRangesToIds_empty <- function()
75 | {
76 | checkException(mapRangesToIds(txdb, NULL), list())
77 |
78 | checkException(mapRangesToIds(txdb, list()), list())
79 | }
80 |
81 | test_mapRangesToIds_matches <- function()
82 | {
83 | keys <- list(tx_name = c("ENST00000371582", "ENST00000371588",
84 | "ENST00000494752", "ENST00000614008", "ENST00000496771"))
85 | res <- mapIdsToRanges(txdb, keys = keys, type = "gene")
86 |
87 | res2 <- mapRangesToIds(txdb, res, "tx")
88 | checkTrue(keys$tx_name[1] %in% res2[[1]]$tx_name)
89 |
90 | checkTrue(keys$tx_name[2] %in% res2[[2]]$tx_name)
91 | }
92 |
--------------------------------------------------------------------------------
/inst/unitTests/test_nearest-methods.R:
--------------------------------------------------------------------------------
1 | quiet <- suppressWarnings
2 |
3 | test_GenomicRanges_distance <- function()
4 | {
5 | library(txdbmaker) # for makeTxDb()
6 | genes <- data.frame(
7 | tx_id=1:3,
8 | gene_id=c("gene1", "gene1", "gene2"))
9 | transcripts <- data.frame(
10 | tx_id=1:3,
11 | tx_chrom="chr1",
12 | tx_strand=c("+", "+", "-"),
13 | tx_start=c(1, 2001, 3001),
14 | tx_end=c(999, 2199, 3199))
15 | splicings <- data.frame(
16 | tx_id=c(1L, 2L, 2L, 2L, 3L, 3L),
17 | cds_id=c(10L, 11L, 12L, 13L, 14L, 15L),
18 | exon_rank=c(1, 1, 2, 3, 1, 2),
19 | exon_start=c(1, 2001, 2101, 2131, 3001, 3131),
20 | exon_end=c(999, 2085, 2144, 2199, 3085, 3199),
21 | cds_start=c(1, 2022, 2101, 2131, 3001, 3131),
22 | cds_end=c(999, 2085, 2144, 2193, 3085, 3199))
23 | txdb <- quiet(makeTxDb(transcripts, splicings, genes))
24 |
25 | gr <- GRanges("chr1", IRanges(1050, width=1))
26 | strand(gr) <- "-"
27 | d <- quiet(distance(gr, txdb, id="gene1", type="gene"))
28 | checkTrue(is.na(d))
29 | strand(gr) <- "+"
30 | d_pos <- quiet(distance(gr, txdb, id="gene1", type="gene"))
31 | strand(gr) <- "*"
32 | d_star <- quiet(distance(gr, txdb, id="gene1", type="gene"))
33 | checkIdentical(d_pos, d_star)
34 |
35 | d_tx <- quiet(distance(gr, txdb, id="3", type="tx"))
36 | d_cds <- quiet(distance(gr, txdb, id="14", type="cds"))
37 | checkIdentical(d_tx, d_cds)
38 | }
39 |
--------------------------------------------------------------------------------
/inst/unitTests/test_transcriptLengths.R:
--------------------------------------------------------------------------------
1 | test_transcriptLengths <- function()
2 | {
3 | library(txdbmaker) # for makeTxDbFromGFF()
4 | gff <- system.file("extdata", "ITAG4.1_gene_models.subset.gff",
5 | package="GenomicFeatures")
6 | txdb <- makeTxDbFromGFF(gff)
7 | txlens <- transcriptLengths(txdb, with.cds_len=TRUE,
8 | with.utr5_len=TRUE,
9 | with.utr3_len=TRUE)
10 |
11 | checkIdentical(class(txlens), "data.frame")
12 | checkIdentical(dim(txlens), c(10L, 8L))
13 |
14 | expected_colnames <- c("tx_id", "tx_name", "gene_id", "nexon", "tx_len",
15 | "cds_len", "utr5_len", "utr3_len")
16 | checkIdentical(colnames(txlens), expected_colnames)
17 |
18 | checkIdentical(txlens$tx_len,
19 | txlens$cds_len + txlens$utr5_len + txlens$utr3_len)
20 |
21 | expected_nexon <- c(2L, 1L, 1L, 3L, 1L, 2L, 3L, 4L, 2L, 2L)
22 | checkIdentical(txlens$nexon, expected_nexon)
23 |
24 | expected_tx_len <- c(721L, 813L, 319L, 1046L, 264L,
25 | 819L, 802L, 453L, 249L, 889L)
26 | checkIdentical(txlens$tx_len, expected_tx_len)
27 |
28 | expected_cds_len <- c(471L, 663L, 276L, 918L, 264L,
29 | 261L, 369L, 453L, 249L, 516L)
30 | checkIdentical(txlens$cds_len, expected_cds_len)
31 |
32 | expected_utr5_len <- c(250L, 0L, 0L, 89L, 0L, 558L, 0L, 0L, 0L, 373L)
33 | checkIdentical(txlens$utr5_len, expected_utr5_len)
34 |
35 | expected_utr3_len <- c(0L, 150L, 43L, 39L, 0L, 0L, 433L, 0L, 0L, 0L)
36 | checkIdentical(txlens$utr3_len, expected_utr3_len)
37 | }
38 |
39 |
--------------------------------------------------------------------------------
/inst/unitTests/test_transcripts.R:
--------------------------------------------------------------------------------
1 | test_transcripts <- function()
2 | {
3 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite",
4 | package="GenomicFeatures"))
5 |
6 | ## Test misuse.
7 | checkException(transcripts(data.frame()), silent=TRUE)
8 | checkException(transcripts(txdb, filter=list(bad=1:10)), silent=TRUE)
9 | checkException(transcripts(txdb, columns="bad"), silent=TRUE)
10 |
11 | ## Test 1.
12 | current <- transcripts(txdb, filter=list(gene_id="139231"))
13 | metadata(current) <- list()
14 |
15 | target <- GRanges("chrX",
16 | ranges=IRanges(start=c(103411156, 103430747),
17 | end =c(103440582, 103440582)),
18 | strand=strand("+"),
19 | tx_id=142:143,
20 | tx_name=c("uc004elw.3", "uc004elx.3"),
21 | seqinfo=seqinfo(txdb))
22 | checkIdentical(target, current)
23 |
24 | ## Test 2.
25 | filter <- list(tx_chrom=c("chr12", "chr14"), tx_strand="-")
26 | current <- transcripts(txdb, columns=c("tx_id", "tx_name",
27 | "exon_id", "exon_rank"),
28 | filter=filter)
29 | metadata(current) <- list()
30 |
31 | target <- GRanges("chr12",
32 | ranges=IRanges(start=52753790, end=52761309),
33 | strand=strand("-"),
34 | tx_id=87L,
35 | tx_name="uc001sag.3",
36 | exon_id=IntegerList(334:326),
37 | exon_rank=IntegerList(1:9),
38 | seqinfo=seqinfo(txdb))
39 | checkIdentical(target, current)
40 |
41 | ## Test 3.
42 | filter <- list(gene_id=c("220004", "1183", "10186"))
43 | current <- transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id"),
44 | filter=filter)
45 | metadata(current) <- list()
46 |
47 | target_tx_id <- c(91L, 136:137)
48 | target_gene_id <- CharacterList("10186", "1183", "1183")
49 | target <- GRanges(c("chr13", "chrX", "chrX"),
50 | ranges=IRanges(start=c(39917029, 10124985, 10124985),
51 | end =c(40177356, 10205699, 10205699)),
52 | strand=strand(c("-", "+", "+")),
53 | tx_id=target_tx_id,
54 | tx_name=c("uc001uxf.3", "uc004csy.4", "uc011mid.3"),
55 | gene_id=target_gene_id,
56 | seqinfo=seqinfo(txdb))
57 | checkIdentical(target, current)
58 | }
59 |
60 | test_transcripts_after_seqlevelsStyle_switch <- function()
61 | {
62 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite",
63 | package="GenomicFeatures"))
64 | checkIdentical(seqlevelsStyle(txdb), "UCSC")
65 | seqlevelsStyle(txdb) <- "NCBI"
66 | checkIdentical(seqlevelsStyle(txdb), c("NCBI", "UCSC"))
67 |
68 | filter <- list(gene_id=c("220004", "1183", "10186"))
69 | current <- transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id"),
70 | filter=filter)
71 | metadata(current) <- list()
72 |
73 | checkIdentical(c("13", "X", "X"), as.character(seqnames(current)))
74 |
75 | target_tx_id <- c(91L, 136:137)
76 | target_gene_id <- CharacterList("10186", "1183", "1183")
77 | target <- GRanges(c("13", "X", "X"),
78 | ranges=IRanges(start=c(39917029, 10124985, 10124985),
79 | end =c(40177356, 10205699, 10205699)),
80 | strand=strand(c("-", "+", "+")),
81 | tx_id=target_tx_id,
82 | tx_name=c("uc001uxf.3", "uc004csy.4", "uc011mid.3"),
83 | gene_id=target_gene_id,
84 | seqinfo=seqinfo(txdb))
85 | checkIdentical(target, current)
86 | }
87 |
88 | test_exons <- function()
89 | {
90 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite",
91 | package="GenomicFeatures"))
92 |
93 | ## Test misuse.
94 | checkException(exons(data.frame()), silent=TRUE)
95 | checkException(exons(txdb, filter=list(bad=1:10)), silent=TRUE)
96 |
97 | ## Test 1.
98 | current <- exons(txdb, filter=list(tx_name="uc001gde.2"))
99 | metadata(current) <- list()
100 |
101 | target <- GRanges("chr1",
102 | ranges=IRanges(start=c(165513478, 165532742),
103 | end =c(165514155, 165533185)),
104 | strand=strand("+"),
105 | exon_id=29:30,
106 | seqinfo=seqinfo(txdb))
107 | checkIdentical(target, current)
108 |
109 | ## Test 2.
110 | filter <- list(exon_chrom=c("chr5", "chr14"), exon_strand="-")
111 | current <- exons(txdb, columns=c("exon_id", "tx_name", "gene_id"),
112 | filter=filter)
113 | metadata(current) <- list()
114 |
115 | target_ranges <-
116 | IRanges(start=c(134363424, 134366966, 134369403, 170732985),
117 | end =c(134365011, 134367198, 134369964, 170735759))
118 | target <- GRanges("chr5",
119 | ranges=target_ranges,
120 | strand=strand("-"),
121 | exon_id=182:185,
122 | tx_name=CharacterList("uc010jea.3", "uc010jea.3",
123 | "uc010jea.3", "uc003mbe.2"),
124 | gene_id=CharacterList("5307", "5307", "5307", NULL),
125 | seqinfo=seqinfo(txdb))
126 | checkIdentical(target, current)
127 | }
128 |
129 | test_cds <- function()
130 | {
131 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite",
132 | package="GenomicFeatures"))
133 |
134 | ## Test misuse.
135 | checkException(cds(data.frame()), silent=TRUE)
136 | checkException(cds(txdb, filter=list(bad=1:10)), silent=TRUE)
137 |
138 | ## Test 1.
139 | current <- cds(txdb, filter=list(tx_name="uc001gde.2"))
140 | metadata(current) <- list()
141 |
142 | target <- GRanges("chr1",
143 | ranges=IRanges(start=c(165513534, 165532742),
144 | end =c(165514155, 165533061)),
145 | strand=strand("+"),
146 | cds_id=23:24,
147 | seqinfo=seqinfo(txdb))
148 | checkIdentical(target, current)
149 |
150 | ## Test 2.
151 | filter <- list(cds_chrom=c("chr5", "chr14"), cds_strand="-")
152 | current <- cds(txdb, columns=c("exon_id", "tx_name", "gene_id"),
153 | filter=filter)
154 | metadata(current) <- list()
155 |
156 | target_ranges <-
157 | IRanges(start=c(134364469, 134366966, 134369403, 170735359),
158 | end =c(134365011, 134367198, 134369571, 170735634))
159 | target <- GRanges("chr5",
160 | ranges=target_ranges,
161 | strand=strand("-"),
162 | exon_id=as(182:185, "IntegerList"),
163 | tx_name=CharacterList("uc010jea.3", "uc010jea.3",
164 | "uc010jea.3", "uc003mbe.2"),
165 | gene_id=CharacterList("5307", "5307", "5307", NULL),
166 | seqinfo=seqinfo(txdb))
167 | checkIdentical(target, current)
168 | }
169 |
170 | test_promoters <- function()
171 | {
172 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite",
173 | package="GenomicFeatures"))
174 | tx <- transcripts(txdb, use.names=TRUE)
175 | current <- promoters(txdb)
176 | checkTrue(validObject(current))
177 | checkEquals(colnames(mcols(current)), c("tx_id", "tx_name"))
178 | checkIdentical(current, promoters(tx))
179 | current <- terminators(txdb)
180 | checkTrue(validObject(current))
181 | checkEquals(colnames(mcols(current)), c("tx_id", "tx_name"))
182 | checkIdentical(current, terminators(tx))
183 | }
184 |
185 |
186 | test_translateCols <- function(){
187 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite",
188 | package="GenomicFeatures"))
189 | tx1 <- transcripts(txdb, columns=c("tx_id", "tx_name", "cds_id"))
190 | checkEquals(colnames(mcols(tx1)), c("tx_id", "tx_name", "cds_id"))
191 | tx2 <- transcripts(txdb, columns=c("TXID", "TXNAME", "CDSID"))
192 | checkEquals(colnames(mcols(tx2)), c("TXID", "TXNAME", "CDSID"))
193 | tx3 <- transcripts(txdb, columns=c(bob="CDSID"))
194 | checkEquals(colnames(mcols(tx3)), c("bob"))
195 | tx4 <- transcripts(txdb, columns=c(bob="cds_id"))
196 | checkEquals(colnames(mcols(tx4)), c("bob"))
197 | ## And these two cases should both explode. ;)
198 | checkException(transcripts(txdb, columns=c("")))
199 | checkException(transcripts(txdb, columns=c("bob")))
200 | }
201 |
202 |
--------------------------------------------------------------------------------
/inst/unitTests/test_transcriptsByOverlaps.R:
--------------------------------------------------------------------------------
1 | ###
2 |
3 | test_transcriptsByOverlaps <- function()
4 | {
5 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite",
6 | package="GenomicFeatures"))
7 |
8 | checkException(transcriptsByOverlaps(txdb), silent = TRUE)
9 | checkException(transcriptsByOverlaps(txdb, IRanges()), silent = TRUE)
10 | checkException(transcriptsByOverlaps(txdb, GRanges(), columns = "bad"),
11 | silent = TRUE)
12 |
13 | seqinfo <- seqinfo(txdb)
14 | seqlevels <- seqlevels(seqinfo)
15 |
16 | gr <- GRanges(seqnames = "chrX",
17 | ranges = IRanges(start=54071000, width=1),
18 | strand = strand("-"))
19 | want <-
20 | GRanges(seqnames = factor("chrX", levels = seqlevels),
21 | ranges = IRanges(start=53963113, end=54071569),
22 | strand = strand("-"),
23 | tx_id = 147L,
24 | tx_name = "uc004dsu.3")
25 | seqinfo(want) <- seqinfo
26 | want <- GenomicFeatures:::.assignMetadataList(want, txdb)
27 | checkIdentical(transcriptsByOverlaps(txdb, gr), want)
28 |
29 | ranges <- IRanges(start = c(113000000, 54071000, 54071000),
30 | width = c( 5000000, 1, 1))
31 | chrom <- c("chr3", "chrX", "chrX")
32 | strand <- strand(c("+", "+", "-"))
33 | gr <- GRanges(seqnames = chrom, ranges = ranges, strand = strand)
34 | want <- GRanges(seqnames =
35 | factor(c("chr3", "chr3", "chr3", "chrX"),
36 | levels = seqlevels),
37 | ranges = IRanges(start = c(113666748, 113666822, 113676421,
38 | 53963113),
39 | end = c(113681827, 113681827, 113682211,
40 | 54071569)),
41 | strand = strand(c("+", "+", "+", "-")),
42 | tx_id = c(29:31, 147L))
43 | seqinfo(want) <- seqinfo
44 | want <- GenomicFeatures:::.assignMetadataList(want, txdb)
45 | checkIdentical(transcriptsByOverlaps(txdb, gr, columns="tx_id"), want)
46 | }
47 |
48 | test_exonsByOverlaps <- function()
49 | {
50 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite",
51 | package="GenomicFeatures"))
52 |
53 | checkException(exonsByOverlaps(txdb), silent = TRUE)
54 | checkException(exonsByOverlaps(txdb, IRanges()), silent = TRUE)
55 | checkException(exonsByOverlaps(txdb, GRanges(), columns = "bad"),
56 | silent = TRUE)
57 |
58 | seqinfo <- seqinfo(txdb)
59 | seqlevels <- seqlevels(seqinfo)
60 |
61 | gr <- GRanges(seqnames = "chr3",
62 | ranges = IRanges(start=113677210, width=1),
63 | strand = strand("+"))
64 | want <-
65 | GRanges(seqnames = factor("chr3", levels = seqlevels),
66 | ranges = IRanges(start=c(113677210, 113677210),
67 | end =c(113677385, 113682211)),
68 | strand = strand("+"),
69 | exon_id = 139:140)
70 | seqinfo(want) <- seqinfo
71 | want <- GenomicFeatures:::.assignMetadataList(want, txdb)
72 | checkIdentical(exonsByOverlaps(txdb, gr), want)
73 | }
74 |
75 | test_cdsByOverlaps <- function()
76 | {
77 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite",
78 | package="GenomicFeatures"))
79 |
80 | checkException(cdsByOverlaps(txdb), silent = TRUE)
81 | checkException(cdsByOverlaps(txdb, IRanges()), silent = TRUE)
82 | checkException(cdsByOverlaps(txdb, GRanges(), columns = "bad"),
83 | silent = TRUE)
84 |
85 | seqinfo <- seqinfo(txdb)
86 | seqlevels <- seqlevels(seqinfo)
87 |
88 | gr <- GRanges(seqnames = "chr3",
89 | ranges = IRanges(start=113677210, width=1),
90 | strand = strand("+"))
91 | want <-
92 | GRanges(seqnames = factor("chr3", levels = seqlevels),
93 | ranges = IRanges(start=c(113677210, 113677210),
94 | end =c(113677385, 113677477)),
95 | strand = strand("+"),
96 | cds_id = 116:117)
97 | seqinfo(want) <- seqinfo
98 | want <- GenomicFeatures:::.assignMetadataList(want, txdb)
99 | checkIdentical(cdsByOverlaps(txdb, gr), want)
100 | }
101 |
--------------------------------------------------------------------------------
/man/FeatureDb-class.Rd:
--------------------------------------------------------------------------------
1 | \name{FeatureDb-class}
2 |
3 | \alias{FeatureDb-class}
4 | \alias{class:FeatureDb}
5 | \alias{FeatureDb}
6 |
7 | \title{FeatureDb objects}
8 |
9 | \description{
10 | WARNING: The FeatureDb/makeFeatureDbFromUCSC/features code base is
11 | no longer actively maintained and FeatureDb-related functionalities
12 | might get deprecated in the near future. Please use
13 | \code{\link{makeFeatureDbFromUCSC}} for a convenient way to
14 | import transcript annotations from UCSC online resources into
15 | Bioconductor.
16 |
17 | The FeatureDb class is a generic container for storing
18 | genomic locations of an arbitrary type of genomic features.
19 |
20 | See \code{?\link{TxDb}} for a container for storing transcript
21 | annotations.
22 |
23 | See \code{?\link{makeFeatureDbFromUCSC}} for a convenient way to
24 | make FeatureDb objects from BioMart online resources.
25 | }
26 |
27 | \section{Methods}{
28 | In the code snippets below, \code{x} is a FeatureDb object.
29 |
30 | \describe{
31 | \item{\code{metadata(x)}:}{
32 | Return \code{x}'s metadata in a data frame.
33 | }
34 | }
35 | }
36 |
37 | \author{Marc Carlson}
38 |
39 | \seealso{
40 | \itemize{
41 | \item The \link{TxDb} class for storing transcript annotations.
42 | \item \code{\link{makeFeatureDbFromUCSC}} for a convenient way to
43 | make a FeatureDb object from UCSC online resources.
44 | \item \code{\link{saveDb}} and \code{\link{loadDb}} for
45 | saving and loading the database content of a FeatureDb object.
46 | \item \code{\link{features}} for how to extract genomic features
47 | from a FeatureDb object.
48 | }
49 | }
50 |
51 | \examples{
52 | fdb_file <- system.file("extdata", "FeatureDb.sqlite",
53 | package="GenomicFeatures")
54 | fdb <- loadDb(fdb_file)
55 | fdb
56 | }
57 |
58 | \keyword{methods}
59 | \keyword{classes}
60 |
--------------------------------------------------------------------------------
/man/TxDb-class.Rd:
--------------------------------------------------------------------------------
1 | \name{TxDb-class}
2 |
3 | \alias{TxDb-class}
4 | \alias{class:TxDb}
5 | \alias{TxDb}
6 |
7 | \alias{saveRDS,TxDb-method}
8 |
9 | \alias{organism,TxDb-method}
10 | \alias{seqlevels0,TxDb-method}
11 | \alias{seqlevels<-,TxDb-method}
12 | \alias{seqinfo,TxDb-method}
13 | \alias{isActiveSeq}
14 | \alias{isActiveSeq<-}
15 | \alias{isActiveSeq,TxDb-method}
16 | \alias{isActiveSeq<-,TxDb-method}
17 | \alias{show,TxDb-method}
18 |
19 | % coercion
20 | \alias{as.list,TxDb-method}
21 |
22 | \title{TxDb objects}
23 |
24 | \description{
25 | The TxDb class is a container for storing transcript annotations.
26 | }
27 |
28 | \section{Methods}{
29 | In the code snippets below, \code{x} is a TxDb object.
30 |
31 | \describe{
32 | \item{\code{metadata(x)}:}{
33 | Return \code{x}'s metadata in a data frame.
34 | }
35 | \item{\code{seqlevels0(x)}:}{
36 | Get the \emph{sequence levels} originally in \code{x}. This ignores any
37 | change the user might have made to the \emph{sequence levels} with the
38 | \code{seqlevels} setter.
39 | }
40 | \item{\code{seqlevels(x)}, \code{seqlevels(x) <- value}:}{
41 | Get or set the \emph{sequence levels} in \code{x}.
42 | }
43 | \item{\code{seqinfo(x)}, \code{seqinfo(x) <- value}:}{
44 | Get or set the information about the underlying sequences.
45 | Note that, for now, the setter only supports replacement of the
46 | sequence names, i.e., except for their sequence names (accessed with
47 | \code{seqnames(value)} and \code{seqnames(seqinfo(x))}, respectively),
48 | \link[GenomeInfoDb]{Seqinfo} objects \code{value} (supplied) and
49 | \code{seqinfo(x)} (current) must be identical.
50 | }
51 | \item{\code{isActiveSeq(x)}:}{
52 | Return the currently active sequences for this txdb object as a
53 | named logical vector. Only active sequences will be tapped when
54 | using the supplied accessor methods. Inactive sequences will be
55 | ignored. By default, all available sequences will be active.
56 | }
57 | \item{\code{isActiveSeq(x) <- value}:}{
58 | Allows the user to change which sequences will be actively
59 | accessed by the accessor methods by altering the contents of this
60 | named logical vector.
61 | }
62 | \item{\code{seqlevelsStyle(x)}, \code{seqlevelsStyle(x) <- value}:}{
63 | Get or set the seqname style for \code{x}.
64 | See the \link[GenomeInfoDb]{seqlevelsStyle} generic getter and setter
65 | in the \pkg{GenomeInfoDb} package for more information.
66 | }
67 | \item{\code{as.list(x)}:}{
68 | Dump the entire db into a list of data frames, say \code{txdb_dump},
69 | that can then be used to recreate the original db with
70 | \code{do.call(txdbmaker::makeTxDb, txdb_dump)} with no loss of
71 | information (except possibly for some of the metadata).
72 | Note that the transcripts are dumped in the same order in all the
73 | data frames.
74 | }
75 | }
76 | }
77 |
78 | \author{Hervé Pagès, Marc Carlson}
79 |
80 | \seealso{
81 | \itemize{
82 | \item \code{\link[txdbmaker]{makeTxDbFromUCSC}},
83 | \code{\link[txdbmaker]{makeTxDbFromBiomart}},
84 | and \code{\link[txdbmaker]{makeTxDbFromEnsembl}} in
85 | the \pkg{txdbmaker} package for making a \link{TxDb}
86 | object from online resources.
87 |
88 | \item \code{\link[txdbmaker]{makeTxDbFromGRanges}} and
89 | \code{\link[txdbmaker]{makeTxDbFromGFF}} in the \pkg{txdbmaker}
90 | package for making a \link{TxDb} object from a
91 | \link[GenomicRanges]{GRanges} object, or from a GFF or GTF file.
92 |
93 | \item \code{\link[AnnotationDbi]{saveDb}} and
94 | \code{\link[AnnotationDbi]{loadDb}} in the \pkg{AnnotationDbi}
95 | package for saving and loading a TxDb object as an SQLite file.
96 |
97 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}},
98 | and \code{\link{transcriptsByOverlaps}}, for extracting
99 | genomic feature locations from a \link{TxDb}-like object.
100 |
101 | \item \code{\link{transcriptLengths}} for extracting the transcript
102 | lengths (and other metrics) from a \link{TxDb} object.
103 |
104 | \item \link[GenomicFeatures]{select-methods} for how to use the
105 | simple "select" interface to extract information from a
106 | TxDb object.
107 |
108 | \item The \link[GenomeInfoDb]{Seqinfo} class in the \pkg{GenomeInfoDb}
109 | package.
110 | }
111 | }
112 |
113 | \examples{
114 | txdb_file <- system.file("extdata", "Biomart_Ensembl_sample.sqlite",
115 | package="GenomicFeatures")
116 | txdb <- loadDb(txdb_file)
117 | txdb
118 |
119 | ## Use of seqinfo():
120 | seqlevelsStyle(txdb)
121 | seqinfo(txdb)
122 | seqlevels(txdb)
123 | seqlengths(txdb) # shortcut for 'seqlengths(seqinfo(txdb))'
124 | isCircular(txdb) # shortcut for 'isCircular(seqinfo(txdb))'
125 | names(which(isCircular(txdb)))
126 |
127 | ## You can set user-supplied seqlevels on 'txdb' to restrict any further
128 | ## operations to a subset of chromosomes:
129 | seqlevels(txdb) <- c("Y", "6")
130 | ## Then you can restore the seqlevels stored in the db:
131 | seqlevels(txdb) <- seqlevels0(txdb)
132 |
133 | ## Use of as.list():
134 | txdb_dump <- as.list(txdb)
135 | txdb_dump
136 |
137 | library(txdbmaker) # for makeTxDb()
138 | txdb1 <- do.call(makeTxDb, txdb_dump)
139 | stopifnot(identical(as.list(txdb1), txdb_dump))
140 | }
141 |
142 | \keyword{methods}
143 | \keyword{classes}
144 |
--------------------------------------------------------------------------------
/man/as-format-methods.Rd:
--------------------------------------------------------------------------------
1 | \name{as-format-methods}
2 | \alias{asBED,TxDb-method}
3 | \alias{asGFF,TxDb-method}
4 |
5 | \title{Coerce to file format structures}
6 | \description{
7 | These functions coerce a \code{\linkS4class{TxDb}} object to a
8 | \code{\link[GenomicRanges:GRanges-class]{GRanges}} object with
9 | metadata columns encoding transcript structures according to the
10 | model of a standard file format. Currently, BED and GFF models are
11 | supported. If a \code{TxDb} is passed to
12 | \code{\link[BiocIO]{export}}, when targeting a BED or GFF file,
13 | this coercion occurs automatically.
14 | }
15 | \usage{
16 | \S4method{asBED}{TxDb}(x)
17 | \S4method{asGFF}{TxDb}(x)
18 | }
19 |
20 | \arguments{
21 | \item{x}{
22 | A \code{TxDb} object to coerce to a \code{GRanges},
23 | structured as BED or GFF.
24 | }
25 | }
26 |
27 | \value{
28 | For \code{asBED}, a \code{GRanges}, with the columns \code{name},
29 | \code{thickStart}, \code{thickEnd}, \code{blockStarts},
30 | \code{blockSizes} added. The thick regions correspond to the CDS
31 | regions, and the blocks represent the exons. The transcript IDs are
32 | stored in the \code{name} column. The ranges are the transcript bounds.
33 |
34 | For \code{asGFF}, a \code{GRanges}, with columns \code{type},
35 | \code{Name}, \code{ID},, and \code{Parent}. The gene structures are
36 | expressed according to the conventions defined by the GFF3 spec. There
37 | are elements of each \code{type} of feature: \dQuote{gene},
38 | \dQuote{mRNA} \dQuote{exon} and \dQuote{cds}. The \code{Name} column
39 | contains the \code{gene_id} for genes, \code{tx_name} for transcripts,
40 | and exons and cds regions are \code{NA}. The \code{ID} column uses
41 | \code{gene_id} and \code{tx_id}, with the prefixes \dQuote{GeneID} and
42 | \dQuote{TxID} to ensure uniqueness across types. The exons and cds
43 | regions have \code{NA} for \code{ID}. The \code{Parent} column
44 | contains the \code{ID}s of the parent features. A feature may have
45 | multiple parents (the column is a \code{CharacterList}). Each exon
46 | belongs to one or more mRNAs, and mRNAs belong to a gene.
47 | }
48 |
49 | \author{
50 | Michael Lawrence
51 | }
52 |
53 | \examples{
54 | txdb_file <- system.file("extdata", "hg19_knownGene_sample.sqlite",
55 | package="GenomicFeatures")
56 | txdb <- loadDb(txdb_file)
57 |
58 | asBED(txdb)
59 | asGFF(txdb)
60 | }
61 |
--------------------------------------------------------------------------------
/man/exonicParts.Rd:
--------------------------------------------------------------------------------
1 | \name{exonicParts}
2 |
3 | \alias{tidyTranscripts}
4 | \alias{tidyExons}
5 | \alias{tidyIntrons}
6 | \alias{exonicParts}
7 | \alias{intronicParts}
8 |
9 | \title{
10 | Extract non-overlapping exonic or intronic parts from a TxDb-like object
11 | }
12 |
13 | \description{
14 | \code{exonicParts} and \code{intronicParts} extract the non-overlapping
15 | (a.k.a. disjoint) exonic or intronic parts from a \link{TxDb}-like object.
16 | }
17 |
18 | \usage{
19 | exonicParts(txdb, linked.to.single.gene.only=FALSE)
20 | intronicParts(txdb, linked.to.single.gene.only=FALSE)
21 |
22 | ## 3 helper functions used internally by exonicParts() and intronicParts():
23 | tidyTranscripts(txdb, drop.geneless=FALSE)
24 | tidyExons(txdb, drop.geneless=FALSE)
25 | tidyIntrons(txdb, drop.geneless=FALSE)
26 | }
27 |
28 | \arguments{
29 | \item{txdb}{
30 | A \link{TxDb} object, or any \link{TxDb}-like object that supports the
31 | \code{\link{transcripts}()} and \code{\link{exonsBy}()} extractors
32 | (e.g. an \link[ensembldb]{EnsDb} object).
33 | }
34 | \item{linked.to.single.gene.only}{
35 | \code{TRUE} or \code{FALSE}.
36 |
37 | If \code{FALSE} (the default), then the disjoint parts are obtained
38 | by calling \code{\link[IRanges]{disjoin}()} on all the exons (or introns)
39 | in \code{txdb}, including on exons (or introns) not linked to a gene or
40 | linked to more than one gene.
41 |
42 | If \code{TRUE}, then the disjoint parts are obtained in 2 steps:
43 | \enumerate{
44 | \item call \code{\link[IRanges]{disjoin}()} on the exons (or introns)
45 | linked to \emph{at least one gene},
46 |
47 | \item then drop the parts linked to more than one gene from
48 | the set of exonic (or intronic) parts obtained previously.
49 | }
50 | }
51 | \item{drop.geneless}{
52 | If \code{FALSE} (the default), then all the transcripts (or exons, or
53 | introns) get extracted from the \link{TxDb} object.
54 |
55 | If \code{TRUE}, then only the transcripts (or exons, or introns) that
56 | are linked to a gene get extracted from the \link{TxDb} object.
57 |
58 | Note that \code{drop.geneless} also impacts the order in which the
59 | features are returned:
60 | \itemize{
61 | \item Transcripts: If \code{drop.geneless} is \code{FALSE} then
62 | transcripts are returned in the same order as with
63 | \code{\link{transcripts}}, which is expected to be by
64 | internal transcript id (\code{tx_id}).
65 | Otherwise they are ordered first by gene id (\code{gene_id}),
66 | then by internal transcript id.
67 | \item Exons: If \code{drop.geneless} is \code{FALSE} then exons are
68 | ordered first by internal transcript id (\code{tx_id}),
69 | then by exon rank (\code{exon_rank}).
70 | Otherwise they are ordered first by gene id (\code{gene_id}),
71 | then by internal transcript id, and then by exon rank.
72 | \item Introns: If \code{drop.geneless} is \code{FALSE} then introns
73 | are ordered by internal transcript id (\code{tx_id}).
74 | Otherwise they are ordered first by gene id (\code{gene_id}),
75 | then by internal transcript id.
76 | }
77 | }
78 | }
79 |
80 | \value{
81 | \code{exonicParts} returns a disjoint and strictly sorted
82 | \link[GenomicRanges]{GRanges} object with 1 range per exonic part
83 | and with metadata columns \code{tx_id}, \code{tx_name}, \code{gene_id},
84 | \code{exon_id}, \code{exon_name}, and \code{exon_rank}.
85 | If \code{linked.to.single.gene.only} was set to \code{TRUE},
86 | an additional \code{exonic_part} metadata column is added that
87 | indicates the rank of each exonic part within all the exonic parts
88 | linked to the same gene.
89 |
90 | \code{intronicParts} returns a disjoint and strictly sorted
91 | \link[GenomicRanges]{GRanges} object with 1 range per intronic part
92 | and with metadata columns \code{tx_id}, \code{tx_name}, and \code{gene_id}.
93 | If \code{linked.to.single.gene.only} was set to \code{TRUE},
94 | an additional \code{intronic_part} metadata column is added that
95 | indicates the rank of each intronic part within all the intronic parts
96 | linked to the same gene.
97 |
98 | \code{tidyTranscripts} returns a \link[GenomicRanges]{GRanges} object
99 | with 1 range per transcript and with metadata columns \code{tx_id},
100 | \code{tx_name}, and \code{gene_id}.
101 |
102 | \code{tidyExons} returns a \link[GenomicRanges]{GRanges} object
103 | with 1 range per exon and with metadata columns \code{tx_id},
104 | \code{tx_name}, \code{gene_id}, \code{exon_id}, \code{exon_name},
105 | and \code{exon_rank}.
106 |
107 | \code{tidyIntrons} returns a \link[GenomicRanges]{GRanges} object
108 | with 1 range per intron and with metadata columns \code{tx_id},
109 | \code{tx_name}, and \code{gene_id}.
110 | }
111 |
112 | \author{Hervé Pagès}
113 |
114 | \seealso{
115 | \itemize{
116 | \item \code{\link[IRanges]{disjoin}} in the \pkg{IRanges} package.
117 |
118 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}},
119 | and \code{\link{transcriptsByOverlaps}}, for extracting
120 | genomic feature locations from a \link{TxDb}-like object.
121 |
122 | \item \code{\link{transcriptLengths}} for extracting the transcript
123 | lengths (and other metrics) from a \link{TxDb} object.
124 |
125 | \item \code{\link{extendExonsIntoIntrons}} for extending exons into
126 | their adjacent introns.
127 |
128 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript
129 | (or CDS) sequences from chromosome sequences.
130 |
131 | \item \code{\link{coverageByTranscript}} for computing coverage by
132 | transcript (or CDS) of a set of ranges.
133 |
134 | \item The \link{TxDb} class.
135 | }
136 | }
137 |
138 | \examples{
139 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
140 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
141 |
142 | ## ---------------------------------------------------------------------
143 | ## exonicParts()
144 | ## ---------------------------------------------------------------------
145 |
146 | exonic_parts1 <- exonicParts(txdb)
147 | exonic_parts1
148 |
149 | ## Mapping from exonic parts to genes is many-to-many:
150 | gene_id1 <- mcols(exonic_parts1)$gene_id
151 | gene_id1 # CharacterList object
152 | table(lengths(gene_id1))
153 | ## The number of known genes a Human exonic part can be linked to
154 | ## varies from 0 to 22!
155 |
156 | exonic_parts2 <- exonicParts(txdb, linked.to.single.gene.only=TRUE)
157 | exonic_parts2
158 |
159 | ## Mapping from exonic parts to genes now is many-to-one:
160 | gene_id2 <- mcols(exonic_parts2)$gene_id
161 | gene_id2[1:20] # character vector
162 |
163 | ## Select exonic parts for a given gene:
164 | exonic_parts2[gene_id2 \%in\% "643837"]
165 |
166 | ## Sanity checks:
167 | stopifnot(isDisjoint(exonic_parts1), isStrictlySorted(exonic_parts1))
168 | stopifnot(isDisjoint(exonic_parts2), isStrictlySorted(exonic_parts2))
169 | stopifnot(all(exonic_parts2 \%within\% reduce(exonic_parts1)))
170 | stopifnot(identical(
171 | lengths(gene_id1) == 1L,
172 | exonic_parts1 \%within\% exonic_parts2
173 | ))
174 |
175 | ## ---------------------------------------------------------------------
176 | ## intronicParts()
177 | ## ---------------------------------------------------------------------
178 |
179 | intronic_parts1 <- intronicParts(txdb)
180 | intronic_parts1
181 |
182 | ## Mapping from intronic parts to genes is many-to-many:
183 | mcols(intronic_parts1)$gene_id
184 | table(lengths(mcols(intronic_parts1)$gene_id))
185 | ## A Human intronic part can be linked to 0 to 22 known genes!
186 |
187 | intronic_parts2 <- intronicParts(txdb, linked.to.single.gene.only=TRUE)
188 | intronic_parts2
189 |
190 | ## Mapping from intronic parts to genes now is many-to-one:
191 | class(mcols(intronic_parts2)$gene_id) # character vector
192 |
193 | ## Sanity checks:
194 | stopifnot(isDisjoint(intronic_parts1), isStrictlySorted(intronic_parts1))
195 | stopifnot(isDisjoint(intronic_parts2), isStrictlySorted(intronic_parts2))
196 | stopifnot(all(intronic_parts2 \%within\% reduce(intronic_parts1)))
197 | stopifnot(identical(
198 | lengths(mcols(intronic_parts1)$gene_id) == 1L,
199 | intronic_parts1 \%within\% intronic_parts2
200 | ))
201 |
202 | ## ---------------------------------------------------------------------
203 | ## Helper functions
204 | ## ---------------------------------------------------------------------
205 |
206 | tidyTranscripts(txdb) # Ordered by 'tx_id'.
207 | tidyTranscripts(txdb, drop.geneless=TRUE) # Ordered first by 'gene_id',
208 | # then by 'tx_id'.
209 |
210 | tidyExons(txdb) # Ordered first by 'tx_id',
211 | # then by 'exon_rank'.
212 | tidyExons(txdb, drop.geneless=TRUE) # Ordered first by 'gene_id',
213 | # then by 'tx_id',
214 | # then by 'exon_rank'.
215 |
216 | tidyIntrons(txdb) # Ordered by 'tx_id'.
217 | tidyIntrons(txdb, drop.geneless=TRUE) # Ordered first by 'gene_id',
218 | # then by 'tx_id'.
219 | }
220 |
221 | \keyword{manip}
222 |
--------------------------------------------------------------------------------
/man/extendExonsIntoIntrons.Rd:
--------------------------------------------------------------------------------
1 | \name{extendExonsIntoIntrons}
2 |
3 | \alias{extendExonsIntoIntrons}
4 |
5 | \title{
6 | Extend exons by a given number of bases into their adjacent introns
7 | }
8 |
9 | \description{
10 | \code{extendExonsIntoIntrons} extends the supplied exons by a given
11 | number of bases into their adjacent introns.
12 | }
13 |
14 | \usage{
15 | extendExonsIntoIntrons(ex_by_tx, extent=2)
16 | }
17 |
18 | \arguments{
19 | \item{ex_by_tx}{
20 | A \link[GenomicRanges]{GRangesList} object containing exons grouped
21 | by transcript. This must be an object as returned by
22 | \code{\link{exonsBy}(txdb, by="tx")}, that is:
23 | \itemize{
24 | \item each list element in \code{ex_by_tx} must be a
25 | \link[GenomicRanges]{GRanges} object representing the
26 | exons of a given transcript;
27 | \item the exons in each list element must be ordered by ascending
28 | rank with respect to their transcript.
29 | }
30 | }
31 | \item{extent}{
32 | Size of the extent in number of bases. 2 by default.
33 |
34 | The first exon in a transcript will be extended by that amount on
35 | its 3' side only. The last exon in a transcript will be extended by
36 | that amount on its 5' side only. All other exons (i.e. intermediate
37 | exons) will be extended by that amount on \emph{each} side.
38 |
39 | Note that exons that belong to a single-exon transcript don't get
40 | extended.
41 |
42 | The default value of 2 corresponds to inclusion of the donor/acceptor
43 | intronic regions (typically GT/AG).
44 | }
45 | }
46 |
47 | \value{
48 | A copy of \link[GenomicRanges]{GRangesList} object \code{ex_by_tx}
49 | where the original exon ranges have been extended.
50 |
51 | Names and metadata columns on \code{ex_by_tx} are propagated to the
52 | result.
53 | }
54 |
55 | \author{Hervé Pagès}
56 |
57 | \seealso{
58 | \itemize{
59 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}},
60 | and \code{\link{transcriptsByOverlaps}}, for extracting
61 | genomic feature locations from a \link{TxDb}-like object.
62 |
63 | \item \code{\link{exonicParts}} and \code{\link{intronicParts}} for
64 | extracting non-overlapping exonic or intronic parts from a
65 | TxDb-like object.
66 |
67 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript
68 | (or CDS) sequences from chromosome sequences.
69 |
70 | \item The \link{TxDb} class.
71 | }
72 | }
73 |
74 | \examples{
75 | ## With toy transcripts:
76 | ex_by_tx <- GRangesList(
77 | TX1="chr1:10-20:+",
78 | TX2=c("chr1:10-20:+", "chr1:50-75:+"),
79 | TX3=c("chr1:10-20:+", "chr1:50-75:+", "chr1:100-120:+"),
80 | TX4="chr1:10-20:-",
81 | TX5=c("chr1:10-20:-", "chr1:50-75:-"),
82 | TX6=c("chr1:10-20:-", "chr1:50-75:-", "chr1:100-120:-")
83 | )
84 |
85 | extended <- extendExonsIntoIntrons(ex_by_tx, extent=2)
86 | extended[1:3]
87 | extended[4:6]
88 |
89 | ## With real-world transcripts:
90 | library(TxDb.Celegans.UCSC.ce11.ensGene)
91 | txdb <- TxDb.Celegans.UCSC.ce11.ensGene
92 | ex_by_tx <- exonsBy(txdb, by="tx")
93 | ex_by_tx
94 |
95 | extendExonsIntoIntrons(ex_by_tx, extent=2)
96 |
97 | ## Sanity check:
98 | stopifnot(identical(extendExonsIntoIntrons(ex_by_tx, extent=0), ex_by_tx))
99 | }
100 |
101 | \keyword{manip}
102 |
--------------------------------------------------------------------------------
/man/extractUpstreamSeqs.Rd:
--------------------------------------------------------------------------------
1 | \name{extractUpstreamSeqs}
2 |
3 | \alias{extractUpstreamSeqs}
4 | \alias{extractUpstreamSeqs,GenomicRanges-method}
5 | \alias{extractUpstreamSeqs,TxDb-method}
6 | \alias{extractUpstreamSeqs,GRangesList-method}
7 |
8 |
9 | \title{Extract sequences upstream of a set of genes or transcripts}
10 |
11 | \description{
12 | \code{extractUpstreamSeqs} is a generic function for extracting
13 | sequences upstream of a supplied set of genes or transcripts.
14 | }
15 |
16 | \usage{
17 | extractUpstreamSeqs(x, genes, width=1000, ...)
18 |
19 | ## Dispatch is on the 2nd argument!
20 |
21 | \S4method{extractUpstreamSeqs}{GenomicRanges}(x, genes, width=1000)
22 |
23 | \S4method{extractUpstreamSeqs}{TxDb}(x, genes, width=1000, exclude.seqlevels=NULL)
24 | }
25 |
26 | \arguments{
27 | \item{x}{
28 | An object containing the chromosome sequences from which to extract the
29 | upstream sequences. It can be a \link[BSgenome]{BSgenome},
30 | \link[rtracklayer]{TwoBitFile}, or \link[Rsamtools]{FaFile} object,
31 | or any \emph{genome sequence container}.
32 | More formally, \code{x} must be an object for which
33 | \code{\link[GenomeInfoDb]{seqinfo}} and \code{\link[Biostrings]{getSeq}}
34 | are defined.
35 | }
36 | \item{genes}{
37 | An object containing the locations (i.e. chromosome name, start, end, and
38 | strand) of the genes or transcripts with respect to the reference genome.
39 | Only \link[GenomicRanges]{GenomicRanges} and \link{TxDb} objects
40 | are supported at the moment. If the latter, the gene locations are obtained
41 | by calling the \code{\link{genes}} function on the \link{TxDb}
42 | object internally.
43 | }
44 | \item{width}{
45 | How many bases to extract upstream of each TSS (transcription start site).
46 | }
47 | \item{...}{
48 | Additional arguments, for use in specific methods.
49 | }
50 | \item{exclude.seqlevels}{
51 | A character vector containing the chromosome names (a.k.a. sequence levels)
52 | to exclude when the genes are obtained from a \link{TxDb} object.
53 | }
54 | }
55 |
56 | \value{
57 | A \link[Biostrings]{DNAStringSet} object containing one upstream sequence
58 | per gene (or per transcript if \code{genes} is a
59 | \link[GenomicRanges]{GenomicRanges} object containing transcript ranges).
60 |
61 | More precisely, if \code{genes} is a \link[GenomicRanges]{GenomicRanges}
62 | object, the returned object is \emph{parallel} to it, that is, the i-th
63 | element in the returned object is the upstream sequence corresponding to
64 | the i-th gene (or transcript) in \code{genes}. Also the names on the
65 | \link[GenomicRanges]{GenomicRanges} object are propagated to the returned
66 | object.
67 |
68 | If \code{genes} is a \link{TxDb} object, the names on the returned
69 | object are the gene IDs found in the \link{TxDb} object. To see the
70 | type of gene IDs (i.e. Entrez gene ID or Ensembl gene ID or ...), you can
71 | display \code{genes} with \code{show(genes)}.
72 |
73 | In addition, the returned object has the following metadata columns
74 | (accessible with \code{\link{mcols}}) that provide some information about
75 | the gene (or transcript) corresponding to each upstream sequence:
76 | \itemize{
77 | \item \code{gene_seqnames}: the chromosome name of the gene (or
78 | transcript);
79 | \item \code{gene_strand}: the strand of the gene (or transcript);
80 | \item \code{gene_TSS}: the transcription start site of the gene (or
81 | transcript).
82 | }
83 | }
84 |
85 | \note{
86 | IMPORTANT: Always make sure to use a TxDb package (or \link{TxDb}
87 | object) that contains a gene model compatible with the \emph{genome sequence
88 | container} \code{x}, that is, a gene model based on the exact same reference
89 | genome as \code{x}.
90 |
91 | See
92 | \url{http://bioconductor.org/packages/release/BiocViews.html#___TxDb}
93 | for the list of TxDb packages available in the current release of
94 | Bioconductor.
95 | Note that you can make your own custom \link{TxDb} object from
96 | various annotation resources by using one of the \code{makeTxDbFrom*()}
97 | functions defined in the \pkg{txdbmaker} package and listed in
98 | the "See also" section below.
99 | }
100 |
101 | \author{Hervé Pagès}
102 |
103 | \seealso{
104 | \itemize{
105 | \item \code{\link[txdbmaker]{makeTxDbFromUCSC}},
106 | \code{\link[txdbmaker]{makeTxDbFromBiomart}},
107 | and \code{\link[txdbmaker]{makeTxDbFromEnsembl}} in
108 | the \pkg{txdbmaker} package for making a \link{TxDb}
109 | object from online resources.
110 |
111 | \item \code{\link[txdbmaker]{makeTxDbFromGRanges}} and
112 | \code{\link[txdbmaker]{makeTxDbFromGFF}} in the \pkg{txdbmaker}
113 | package for making a \link{TxDb} object from a
114 | \link[GenomicRanges]{GRanges} object, or from a GFF or GTF file.
115 |
116 | \item The \code{\link[BSgenome]{available.genomes}} function in the
117 | \pkg{BSgenome} package for checking avaibility of BSgenome
118 | data packages (and installing the desired one).
119 |
120 | \item The \link[BSgenome]{BSgenome}, \link[rtracklayer]{TwoBitFile}, and
121 | \link[Rsamtools]{FaFile} classes, defined and documented
122 | in the \pkg{BSgenome}, \pkg{rtracklayer}, and \pkg{Rsamtools}
123 | packages, respectively.
124 |
125 | \item The \link{TxDb} class.
126 |
127 | \item The \code{\link{genes}} function for extracting gene ranges from
128 | a \link{TxDb} object.
129 |
130 | \item The \link[GenomicRanges]{GenomicRanges} class defined and documented
131 | in the \pkg{GenomicRanges} package.
132 |
133 | \item The \link[Biostrings]{DNAStringSet} class defined and documented
134 | in the \pkg{Biostrings} package.
135 |
136 | \item The \code{\link[GenomeInfoDb]{seqinfo}} getter defined and documented
137 | in the \pkg{GenomeInfoDb} package.
138 |
139 | \item The \code{\link[Biostrings]{getSeq}} function for extracting
140 | subsequences from a sequence container.
141 | }
142 | }
143 |
144 | \examples{
145 | ## Load a genome:
146 | library(BSgenome.Dmelanogaster.UCSC.dm3)
147 | genome <- BSgenome.Dmelanogaster.UCSC.dm3
148 | genome
149 |
150 | ## Use a TxDb object:
151 | library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
152 | txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene
153 | txdb # contains Ensembl gene IDs
154 |
155 | ## Because the chrU and chrUextra sequences are made of concatenated
156 | ## scaffolds (see https://genome.ucsc.edu/cgi-bin/hgGateway?db=dm3),
157 | ## extracting the upstream sequences for genes located on these
158 | ## scaffolds is not reliable. So we exclude them:
159 | exclude <- c("chrU", "chrUextra")
160 | up1000seqs <- extractUpstreamSeqs(genome, txdb, width=1000,
161 | exclude.seqlevels=exclude)
162 | up1000seqs # the names are Ensembl gene IDs
163 | mcols(up1000seqs)
164 |
165 | ## Upstream sequences for genes close to the chromosome bounds can be
166 | ## shorter than 1000 (note that this does not happen for circular
167 | ## chromosomes like chrM):
168 | table(width(up1000seqs))
169 | mcols(up1000seqs)[width(up1000seqs) != 1000, ]
170 | }
171 |
172 | \keyword{manip}
173 |
--------------------------------------------------------------------------------
/man/features.Rd:
--------------------------------------------------------------------------------
1 | \name{features}
2 |
3 | \alias{features}
4 | \alias{features,FeatureDb-method}
5 |
6 | \title{
7 | Extract simple features from a FeatureDb object
8 | }
9 |
10 | \description{
11 | WARNING: The FeatureDb/makeFeatureDbFromUCSC/features code base is
12 | no longer actively maintained and FeatureDb-related functionalities
13 | might get deprecated in the near future. Please use
14 | \code{\link{makeFeatureDbFromUCSC}} for a convenient way to
15 | import transcript annotations from UCSC online resources into
16 | Bioconductor.
17 |
18 | Generic function to extract genomic features from a FeatureDb object.
19 | }
20 |
21 | \usage{
22 | features(x)
23 | \S4method{features}{FeatureDb}(x)
24 | }
25 |
26 | \arguments{
27 | \item{x}{
28 | A \link{FeatureDb} object.
29 | }
30 | }
31 |
32 |
33 | \value{ a GRanges object }
34 |
35 | \author{
36 | M. Carlson
37 | }
38 |
39 | \seealso{
40 | \link{FeatureDb}
41 | }
42 |
43 | \examples{
44 | fdb <- loadDb(system.file("extdata", "FeatureDb.sqlite",
45 | package="GenomicFeatures"))
46 | features(fdb)
47 | }
48 |
--------------------------------------------------------------------------------
/man/getPromoterSeq-methods.Rd:
--------------------------------------------------------------------------------
1 | \name{getPromoterSeq}
2 |
3 | \alias{getPromoterSeq}
4 | \alias{getTerminatorSeq}
5 | \alias{getPromoterSeq,GRanges-method}
6 | \alias{getTerminatorSeq,GRanges-method}
7 | \alias{getPromoterSeq,GRangesList-method}
8 | \alias{getTerminatorSeq,GRangesList-method}
9 |
10 | \title{Get gene promoter or terminator sequences}
11 |
12 | \description{
13 | Extract promoter or terminator sequences for the genes or transcripts
14 | specified in the query (a\link{GRanges} or \link{GRangesList} object)
15 | from a \link[BSgenome]{BSgenome} or \link[Rsamtools]{FaFile} object.
16 | }
17 |
18 | \usage{
19 | \S4method{getPromoterSeq}{GRanges}(query, subject, upstream=2000, downstream=200)
20 | \S4method{getTerminatorSeq}{GRanges}(query, subject, upstream=2000, downstream=200)
21 |
22 | \S4method{getPromoterSeq}{GRangesList}(query, subject, upstream=2000, downstream=200)
23 | \S4method{getTerminatorSeq}{GRangesList}(query, subject, upstream=2000, downstream=200)
24 | }
25 |
26 | \arguments{
27 | \item{query}{A \link[GenomicRanges]{GRanges} or
28 | \link[GenomicRanges]{GRangesList} object containing genes grouped by
29 | transcript.
30 | }
31 | \item{subject}{A \link[BSgenome]{BSgenome} or \link[Rsamtools]{FaFile} object from which
32 | the sequences will be taken.}
33 | \item{upstream}{The number of DNA bases to include upstream of the TSS (transcription start site)}
34 | \item{downstream}{The number of DNA bases to include downstream of the TSS (transcription start site)}
35 | }
36 |
37 | \details{
38 | \code{getPromoterSeq} and \code{getTerminatorSeq} are generic functions
39 | dispatching on query, which is either a GRanges or a GRangesList.
40 | They are convenience wrappers for the \code{promoters}, \code{terminators},
41 | and \code{getSeq} functions.
42 | The purpose is to allow sequence extraction from either a
43 | \link[BSgenome]{BSgenome} or \link[Rsamtools]{FaFile} object.
44 |
45 | Default values for \code{upstream} and \code{downstream} were chosen based
46 | on our current understanding of gene regulation. On average, promoter
47 | regions in the mammalian genome are 5000 bp upstream and downstream of the
48 | transcription start site.
49 | }
50 |
51 | \value{
52 | A \link[Biostrings]{DNAStringSet} or
53 | \link[Biostrings]{DNAStringSetList} instance corresponding to the
54 | GRanges or GRangesList supplied in the query.
55 | }
56 |
57 | \author{Paul Shannon}
58 |
59 | \seealso{
60 | \itemize{
61 | \item The \code{\link[GenomicRanges]{promoters}} man page in the
62 | \pkg{GenomicRanges} package for the \code{promoters()} and
63 | \code{terminators()} methods for \link[GenomicRanges]{GenomicRanges}
64 | objects.
65 |
66 | \item \code{\link[Biostrings]{getSeq}} in the \pkg{Biostrings}
67 | package for extracting a set of sequences from a sequence
68 | container like a \link[BSgenome]{BSgenome} or
69 | \link[Rsamtools]{FaFile} object.
70 | }
71 | }
72 |
73 | \examples{
74 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
75 | library(BSgenome.Hsapiens.UCSC.hg19)
76 |
77 |
78 | ## A GRangesList object describing all the known Human transcripts grouped
79 | ## by gene:
80 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
81 | tx_by_gene <- transcriptsBy(txdb, by="gene")
82 |
83 | e2f3 <- "1871" # entrez geneID for a cell cycle control transcription
84 | # factor, chr6 on the plus strand
85 |
86 | ## A GRanges object describing the three transcripts for gene 1871:
87 | e2f3_tx <- tx_by_gene[[e2f3]]
88 |
89 | ## Promoter sequences for gene 1871:
90 | e2f3_promoter_seqs <- getPromoterSeq(e2f3_tx, Hsapiens,
91 | upstream=40, downstream=15)
92 | e2f3_promoter_seqs
93 |
94 | mcols(e2f3_promoter_seqs)
95 |
96 | ## Terminator sequences for gene 1871:
97 | e2f3_terminator_seqs <- getTerminatorSeq(e2f3_tx, Hsapiens,
98 | upstream=25, downstream=10)
99 |
100 | e2f3_terminator_seqs
101 |
102 | mcols(e2f3_terminator_seqs) # same as 'mcols(e2f3_promoter_seqs)'
103 |
104 | ## All Human promoter sequences grouped by gene:
105 | getPromoterSeq(tx_by_gene, Hsapiens, upstream=6, downstream=4)
106 | }
107 |
108 | \keyword{methods}
109 | \keyword{manip}
110 |
--------------------------------------------------------------------------------
/man/id2name.Rd:
--------------------------------------------------------------------------------
1 | \name{id2name}
2 |
3 | \alias{id2name}
4 |
5 | \title{
6 | Map internal ids to external names for a given feature type
7 | }
8 | \description{
9 | Utility function for retrieving the mapping from the internal ids
10 | to the external names of a given feature type.
11 | }
12 | \usage{
13 | id2name(txdb, feature.type=c("tx", "exon", "cds"))
14 | }
15 | \arguments{
16 | \item{txdb}{A \link{TxDb} object.}
17 | \item{feature.type}{The feature type for which the mapping must be
18 | retrieved.}
19 | }
20 | \details{
21 | Transcripts, exons and CDS parts in a \link{TxDb} object are
22 | stored in seperate tables where the primary key is an integer
23 | called \emph{feature internal id}. This id is stored in the
24 | \code{"tx_id"} column for transcripts, in the \code{"exon_id"}
25 | column for exons, and in the \code{"cds_id"} column for CDS parts.
26 | Unlike other commonly used ids like Entrez Gene IDs or Ensembl IDs,
27 | this internal id was generated at the time the \link{TxDb}
28 | object was created and has no meaning outside the scope of this object.
29 |
30 | The \code{id2name} function can be used to translate this internal
31 | id into a more informative id or name called \emph{feature external
32 | name}. This name is stored in the \code{"tx_name"} column for
33 | transcripts, in the \code{"exon_name"} column for exons, and in
34 | the \code{"cds_name"} column for CDS parts.
35 |
36 | Note that, unlike the feature internal id, the feature external
37 | name is not guaranteed to be unique or even defined (the column
38 | can contain \code{NA}s).
39 | }
40 |
41 | \value{
42 | A named character vector where the names are the internal ids and the
43 | values the external names.
44 | }
45 |
46 | \author{Hervé Pagès}
47 |
48 | \seealso{
49 | \itemize{
50 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}},
51 | and \code{\link{transcriptsByOverlaps}}, for how to extract
52 | genomic features from a \link{TxDb} object.
53 | \item The \link{TxDb} class.
54 | }
55 | }
56 | \examples{
57 | txdb1_file <- system.file("extdata", "hg19_knownGene_sample.sqlite",
58 | package="GenomicFeatures")
59 | txdb1 <- loadDb(txdb1_file)
60 | id2name(txdb1, feature.type="tx")[1:4]
61 | id2name(txdb1, feature.type="exon")[1:4]
62 | id2name(txdb1, feature.type="cds")[1:4]
63 |
64 | txdb2_file <- system.file("extdata", "Biomart_Ensembl_sample.sqlite",
65 | package="GenomicFeatures")
66 | txdb2 <- loadDb(txdb2_file)
67 | id2name(txdb2, feature.type="tx")[1:4]
68 | id2name(txdb2, feature.type="exon")[1:4]
69 | id2name(txdb2, feature.type="cds")[1:4]
70 | }
71 |
--------------------------------------------------------------------------------
/man/makeFeatureDbFromUCSC.Rd:
--------------------------------------------------------------------------------
1 | \name{makeFeatureDbFromUCSC}
2 |
3 | \alias{supportedUCSCFeatureDbTracks}
4 | \alias{supportedUCSCFeatureDbTables}
5 | \alias{UCSCFeatureDbTableSchema}
6 | \alias{makeFeatureDbFromUCSC}
7 |
8 | \title{
9 | [Moved to txdbmaker] Make a FeatureDb object from annotations available
10 | at the UCSC Genome Browser
11 | }
12 |
13 | \description{
14 | IMPORTANT NOTE: Starting with BioC 3.19, functions
15 | \code{supportedUCSCFeatureDbTracks()}, \code{supportedUCSCFeatureDbTables()},
16 | \code{UCSCFeatureDbTableSchema()}, and \code{makeFeatureDbFromUCSC()} are
17 | defined in the \pkg{txdbmaker} package.
18 | }
19 |
20 | \seealso{
21 | \code{txdbmaker::\link[txdbmaker]{supportedUCSCFeatureDbTracks}},
22 | \code{txdbmaker::\link[txdbmaker]{supportedUCSCFeatureDbTables}},
23 | \code{txdbmaker::\link[txdbmaker]{UCSCFeatureDbTableSchema}},
24 | and \code{txdbmaker::\link[txdbmaker]{makeFeatureDbFromUCSC}}
25 | in the \pkg{txdbmaker} package.
26 | }
27 |
28 |
--------------------------------------------------------------------------------
/man/makeTxDb.Rd:
--------------------------------------------------------------------------------
1 | \name{makeTxDb}
2 |
3 | \alias{makeTxDb}
4 |
5 | \title{
6 | [Moved to txdbmaker] Make a TxDb object from user supplied annotations
7 | }
8 |
9 | \description{
10 | IMPORTANT NOTE: Starting with BioC 3.19, the \code{makeTxDb} function
11 | is defined in the \pkg{txdbmaker} package.
12 | }
13 |
14 | \seealso{
15 | \code{txdbmaker::\link[txdbmaker]{makeTxDb}} in the \pkg{txdbmaker}
16 | package.
17 | }
18 |
19 |
--------------------------------------------------------------------------------
/man/makeTxDbFromBiomart.Rd:
--------------------------------------------------------------------------------
1 | \name{makeTxDbFromBiomart}
2 |
3 | \alias{makeTxDbFromBiomart}
4 | \alias{getChromInfoFromBiomart}
5 |
6 | \title{
7 | [Moved to txdbmaker] Make a TxDb object from annotations available
8 | on a BioMart database
9 | }
10 |
11 | \description{
12 | IMPORTANT NOTE: Starting with BioC 3.19, functions
13 | \code{makeTxDbFromBiomart()} and \code{getChromInfoFromBiomart()}
14 | are defined in the \pkg{txdbmaker} package.
15 | }
16 |
17 | \seealso{
18 | \code{txdbmaker::\link[txdbmaker]{makeTxDbFromBiomart}} and
19 | \code{txdbmaker::\link[txdbmaker]{getChromInfoFromBiomart}} in
20 | the \pkg{txdbmaker} package.
21 | }
22 |
23 |
--------------------------------------------------------------------------------
/man/makeTxDbFromEnsembl.Rd:
--------------------------------------------------------------------------------
1 | \name{makeTxDbFromEnsembl}
2 |
3 | \alias{makeTxDbFromEnsembl}
4 |
5 | \title{
6 | [Moved to txdbmaker] Make a TxDb object from an Ensembl database
7 | }
8 |
9 | \description{
10 | IMPORTANT NOTE: Starting with BioC 3.19, the \code{makeTxDbFromEnsembl}
11 | function is defined in the \pkg{txdbmaker} package.
12 | }
13 |
14 | \seealso{
15 | \code{txdbmaker::\link[txdbmaker]{makeTxDbFromEnsembl}} in the
16 | \pkg{txdbmaker} package.
17 | }
18 |
19 |
--------------------------------------------------------------------------------
/man/makeTxDbFromGFF.Rd:
--------------------------------------------------------------------------------
1 | \name{makeTxDbFromGFF}
2 |
3 | \alias{makeTxDbFromGFF}
4 |
5 | \title{
6 | [Moved to txdbmaker] Make a TxDb object from annotations available
7 | as a GFF3 or GTF file
8 | }
9 |
10 | \description{
11 | IMPORTANT NOTE: Starting with BioC 3.19, the \code{makeTxDbFromGFF}
12 | function is defined in the \pkg{txdbmaker} package.
13 | }
14 |
15 | \seealso{
16 | \code{txdbmaker::\link[txdbmaker]{makeTxDbFromGFF}} in the \pkg{txdbmaker}
17 | package.
18 | }
19 |
20 |
--------------------------------------------------------------------------------
/man/makeTxDbFromGRanges.Rd:
--------------------------------------------------------------------------------
1 | \name{makeTxDbFromGRanges}
2 |
3 | \alias{makeTxDbFromGRanges}
4 |
5 | \title{[Moved to txdbmaker] Make a TxDb object from a GRanges object}
6 |
7 | \description{
8 | IMPORTANT NOTE: Starting with BioC 3.19, the \code{makeTxDbFromGRanges}
9 | function is defined in the \pkg{txdbmaker} package.
10 | }
11 |
12 | \seealso{
13 | \code{txdbmaker::\link[txdbmaker]{makeTxDbFromGRanges}} in the
14 | \pkg{txdbmaker} package.
15 | }
16 |
17 |
--------------------------------------------------------------------------------
/man/makeTxDbFromUCSC.Rd:
--------------------------------------------------------------------------------
1 | \name{makeTxDbFromUCSC}
2 |
3 | \alias{supportedUCSCtables}
4 | \alias{browseUCSCtrack}
5 | \alias{makeTxDbFromUCSC}
6 |
7 | \title{
8 | [Moved to txdbmaker] Make a TxDb object from annotations available
9 | at the UCSC Genome Browser
10 | }
11 |
12 | \description{
13 | IMPORTANT NOTE: Starting with BioC 3.19, functions
14 | \code{makeTxDbFromUCSC()}, \code{supportedUCSCtables()},
15 | and \code{browseUCSCtrack()} are defined in the \pkg{txdbmaker}
16 | package.
17 | }
18 |
19 | \seealso{
20 | \code{txdbmaker::\link[txdbmaker]{makeTxDbFromUCSC}},
21 | \code{txdbmaker::\link[txdbmaker]{supportedUCSCtables}},
22 | and \code{txdbmaker::\link[txdbmaker]{browseUCSCtrack}}
23 | in the \pkg{txdbmaker} package.
24 | }
25 |
26 |
--------------------------------------------------------------------------------
/man/makeTxDbPackage.Rd:
--------------------------------------------------------------------------------
1 | \name{makeTxDbPackage}
2 |
3 | \alias{supportedMiRBaseBuildValues}
4 | \alias{makePackageName}
5 | \alias{makeTxDbPackage}
6 | \alias{makeTxDbPackageFromUCSC}
7 | \alias{makeFDbPackageFromUCSC}
8 | \alias{makeTxDbPackageFromBiomart}
9 |
10 | \title{
11 | [Moved to txdbmaker] Making a TxDb package from annotations available
12 | at the UCSC Genome Browser, biomaRt or from another source.
13 | }
14 |
15 | \description{
16 | IMPORTANT NOTE: Starting with BioC 3.19, functions
17 | \code{makeTxDbPackageFromUCSC()}, \code{makeFDbPackageFromUCSC()},
18 | \code{makeTxDbPackageFromBiomart()}, \code{makeTxDbPackage()}
19 | \code{supportedMiRBaseBuildValues()} and \code{makePackageName()}
20 | are defined in the \pkg{txdbmaker} package.
21 | }
22 |
23 | \seealso{
24 | \code{txdbmaker::\link[txdbmaker]{makeTxDbPackageFromUCSC}},
25 | \code{txdbmaker::\link[txdbmaker]{makeFDbPackageFromUCSC}},
26 | \code{txdbmaker::\link[txdbmaker]{makeTxDbPackageFromBiomart}},
27 | \code{txdbmaker::\link[txdbmaker]{makeTxDbPackage}},
28 | \code{txdbmaker::\link[txdbmaker]{supportedMiRBaseBuildValues}},
29 | and \code{txdbmaker::\link[txdbmaker]{makePackageName}}
30 | in the \pkg{txdbmaker} package.
31 | }
32 |
33 |
--------------------------------------------------------------------------------
/man/mapIdsToRanges.Rd:
--------------------------------------------------------------------------------
1 | \docType{methods}
2 | \name{mapIdsToRanges}
3 | \alias{mapIdsToRanges}
4 | \alias{mapIdsToRanges,TxDb-method}
5 | \title{Map IDs to Genomic Ranges}
6 | \usage{
7 | mapIdsToRanges(x, ...)
8 |
9 | \S4method{mapIdsToRanges}{TxDb}(x, keys, type = c("cds", "exon", "tx",
10 | "gene"), columns = NULL)
11 | }
12 | \arguments{
13 | \item{x}{Database to use for mapping}
14 |
15 | \item{keys}{Values to lookup, passed to \code{\link{transcripts}} et. al.}
16 |
17 | \item{type}{Types of feature to return}
18 |
19 | \item{columns}{Additional metadata columns to include in the output}
20 |
21 | \item{...}{Additional arguments passed to methods}
22 | }
23 | \value{
24 | \code{\link[GenomicRanges]{GRangesList}} corresponding to the keys
25 | }
26 | \description{
27 | Map IDs to Genomic Ranges
28 | }
29 | \section{Methods (by class)}{
30 | \itemize{
31 | \item \code{TxDb}: TxDb method
32 | }}
33 | \examples{
34 | library(txdbmaker) # for makeTxDbFromGRanges()
35 | fl <- system.file(package = "GenomicFeatures", "extdata", "sample_ranges.rds")
36 | txdb <- makeTxDbFromGRanges(readRDS(fl))
37 |
38 | keys <- list(tx_name = c("ENST00000371582", "ENST00000371588",
39 | "ENST00000494752", "ENST00000614008", "ENST00000496771"))
40 | mapIdsToRanges(txdb, keys = keys, type = "tx")
41 | }
42 |
--------------------------------------------------------------------------------
/man/mapRangesToIds.Rd:
--------------------------------------------------------------------------------
1 | \docType{methods}
2 | \name{mapRangesToIds}
3 | \alias{mapRangesToIds}
4 | \alias{mapRangesToIds,TxDb-method}
5 | \title{Map Genomic Ranges to IDs}
6 | \usage{
7 | mapRangesToIds(x, ...)
8 |
9 | \S4method{mapRangesToIds}{TxDb}(x, ranges, type = c("cds", "exon", "tx",
10 | "gene"), columns = NULL, ...)
11 | }
12 | \arguments{
13 | \item{x}{Database to use for mapping}
14 |
15 | \item{ranges}{range object used to subset}
16 |
17 | \item{type}{of feature to return}
18 |
19 | \item{columns}{additional metadata columns to include in the output.}
20 |
21 | \item{...}{Additional arguments passed to
22 | \code{\link[GenomicRanges]{findOverlaps}}}
23 | }
24 | \value{
25 | \code{\link[S4Vectors]{DataFrame}} of mcols from the database.
26 | }
27 | \description{
28 | Map Genomic Ranges to IDs
29 | }
30 | \section{Methods (by class)}{
31 | \itemize{
32 | \item \code{TxDb}: TxDb method
33 | }}
34 | \examples{
35 | library(txdbmaker) # for makeTxDbFromGRanges()
36 | fl <- system.file(package = "GenomicFeatures", "extdata", "sample_ranges.rds")
37 | txdb <- makeTxDbFromGRanges(readRDS(fl))
38 |
39 | keys <- list(tx_name = c("ENST00000371582", "ENST00000371588",
40 | "ENST00000494752", "ENST00000614008", "ENST00000496771"))
41 | res <- mapIdsToRanges(txdb, keys = keys, type = "tx")
42 | mapRangesToIds(txdb, res, "tx")
43 | }
44 |
--------------------------------------------------------------------------------
/man/nearest-methods.Rd:
--------------------------------------------------------------------------------
1 | \name{nearest-methods}
2 |
3 | \alias{nearest-methods}
4 |
5 | \alias{distance,GenomicRanges,TxDb-method}
6 |
7 | \title{Finding the nearest genomic range neighbor in a TxDb}
8 |
9 | \description{
10 | A \code{distance()} method for TxDb objects.
11 | }
12 |
13 | \usage{
14 | \S4method{distance}{GenomicRanges,TxDb}(x, y, ignore.strand=FALSE,
15 | ..., id, type=c("gene", "tx", "exon", "cds"))
16 | }
17 |
18 | \arguments{
19 | \item{x}{
20 | The query \link{GenomicRanges} instance.
21 | }
22 | \item{y}{
23 | A \link{TxDb} object. The \code{id} is used to extract
24 | ranges from the \link{TxDb} which are then used to compute the
25 | distance from \code{x}.
26 | }
27 | \item{id}{
28 | A \code{character} vector the same length as \code{x}.
29 | The \code{id} must be identifiers in the \link{TxDb} object.
30 | \code{type} indicates what type of identifier \code{id} is.
31 | }
32 | \item{type}{
33 | A \code{character(1)} describing the \code{id}.
34 | Must be one of \sQuote{gene}, \sQuote{tx}, \sQuote{exon} or
35 | \sQuote{cds}.
36 | }
37 | \item{ignore.strand}{
38 | A \code{logical} indicating if the strand of the ranges
39 | should be ignored. When \code{TRUE}, strand is set to \code{'+'}.
40 | }
41 | \item{...}{
42 | Additional arguments for methods.
43 | }
44 | }
45 |
46 | \details{
47 | This \code{distance()} method returns the distance for each range in \code{x}
48 | to the range extracted from the \link{TxDb} object \code{y}. Values in
49 | \code{id} are matched to one of \sQuote{gene_id}, \sQuote{tx_id},
50 | \sQuote{exon_id} or \sQuote{cds_id} identifiers in the \link{TxDb}
51 | and the corresponding ranges are extracted. The \code{type} argument
52 | specifies which identifier is represented in \code{id}. The extracted
53 | ranges are used in the distance calculation with the ranges in \code{x}.
54 |
55 | The method returns \code{NA} values when the genomic region defined
56 | by \code{id} cannot be collapsed into a single range (e.g.,
57 | when a gene spans multiple chromosomes) or if the \code{id}
58 | is not found in \code{y}.
59 |
60 | The behavior of \code{distance()} with respect to zero-width ranges
61 | has changed in Bioconductor 2.12. See the man page \code{?distance}
62 | in the \pkg{IRanges} for details.
63 | }
64 |
65 | \value{
66 | An integer vector of distances between the ranges in \code{x} and \code{y}.
67 | }
68 |
69 | \author{Valerie Obenchain }
70 |
71 | \seealso{
72 | \itemize{
73 | \item \link[IRanges]{nearest-methods} in the \pkg{IRanges} package.
74 | \item \link[GenomicRanges]{nearest-methods} in the \pkg{GenomicRanges}
75 | package.
76 | }
77 | }
78 |
79 | \examples{
80 | library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
81 | txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene
82 | gr <- GRanges(c("chr2L", "chr2R"),
83 | IRanges(c(100000, 200000), width=100))
84 | distance(gr, txdb, id=c("FBgn0259717", "FBgn0261501"), type="gene")
85 | distance(gr, txdb, id=c("10000", "23000"), type="cds")
86 |
87 | ## The id's must be in the appropriate order with respect to 'x'.
88 | distance(gr, txdb, id=c("4", "4097"), type="tx")
89 |
90 | ## 'id' "4" is on chr2L and "4097" is on chr2R.
91 | transcripts(txdb, filter=list(tx_id=c("4", "4097")))
92 |
93 | ## If we reverse the 'id' the chromosomes are incompatable with gr.
94 | distance(gr, txdb, id=c("4097", "4"), type="tx")
95 |
96 | ## distance() compares each 'x' to the corresponding 'y'.
97 | ## If an 'id' is not found in the TxDb 'y' will not
98 | ## be the same lenth as 'x' and an error is thrown.
99 | \dontrun{
100 | distance(gr, txdb, id=c("FBgn0000008", "INVALID"), type="gene") ## will fail
101 | }
102 | }
103 |
104 | \keyword{utilities}
105 |
--------------------------------------------------------------------------------
/man/proteinToGenome.Rd:
--------------------------------------------------------------------------------
1 | \name{proteinToGenome}
2 |
3 | \alias{proteinToGenome}
4 | \alias{proteinToGenome,GRangesList-method}
5 | \alias{proteinToGenome,ANY-method}
6 |
7 | \title{Map protein-relative coordinates to genomic coordinates}
8 |
9 | \description{
10 | \code{proteinToGenome} is a generic function for mapping
11 | ranges of protein-relative positions to the genome.
12 |
13 | NOTE: This man page is for the \code{proteinToGenome} S4 generic
14 | function and methods defined in the \pkg{GenomicFeatures} package,
15 | which are (loosely) modeled on the \code{\link[ensembldb]{proteinToGenome}}
16 | function from the \pkg{ensembldb} package.
17 | See \code{?ensembldb::\link[ensembldb]{proteinToGenome}} for the latter.
18 | }
19 |
20 | \usage{
21 | ## S4 generic function:
22 | proteinToGenome(x, db, ...) # dispatch is on 2nd argument 'db'
23 |
24 | \S4method{proteinToGenome}{ANY}(x, db)
25 |
26 | \S4method{proteinToGenome}{GRangesList}(x, db)
27 | }
28 |
29 | \arguments{
30 | \item{x}{
31 | A named \link[IRanges]{IRanges} object (or derivative) containing ranges
32 | of \emph{protein-relative positions} (protein-relative positions are
33 | positions relative to a protein sequence).
34 |
35 | The names on \code{x} must be transcript names present in \code{db}.
36 | More precisely, for the default \code{proteinToGenome()} method,
37 | \code{names(x)} must be a subset of:
38 | \preformatted{ mcols(transcripts(db, columns="tx_name"))$tx_name
39 | }
40 | And for the method for \link[GenomicRanges]{GRangesList} objects,
41 | \code{names(x)} must be a subset of:
42 | \preformatted{ names(db)
43 | }
44 | }
45 | \item{db}{
46 | For the default \code{proteinToGenome()} method: A \link{TxDb}
47 | object or any object that supports \code{\link{transcripts}()}
48 | and \code{\link{cdsBy}()} (e.g. an \link[ensembldb]{EnsDb} object
49 | from the \pkg{ensembldb} package).
50 |
51 | For the method for \link[GenomicRanges]{GRangesList} objects:
52 | A named \link[GenomicRanges]{GRangesList} object (or derivative)
53 | where each list element is a \link[GenomicRanges]{GRanges} object
54 | representing a CDS (the ranges in the \link[GenomicRanges]{GRanges}
55 | object must represent the CDS parts ordered by ascending exon rank).
56 | }
57 | \item{...}{
58 | Further arguments to be passed to specific methods.
59 | }
60 | }
61 |
62 | \details{
63 | The \code{proteinToGenome()} method for \link[GenomicRanges]{GRangesList}
64 | objects is the workhorse behind the default method. Note that the latter
65 | is a thin wrapper around the former, which simply does the following:
66 | \enumerate{
67 | \item Use \code{\link{cdsBy}()} to extract the CDS parts from \code{db}.
68 | The CDS parts are returned in a \link[GenomicRanges]{GRangesList}
69 | object that has the names of the transcript on it (one transcript
70 | name per list element).
71 | \item Call \code{proteinToGenome()} on \code{x} and the
72 | \link[GenomicRanges]{GRangesList} object returned by
73 | \code{\link{cdsBy}()}.
74 | }
75 | }
76 |
77 | \value{
78 | A named \link[GenomicRanges]{GRangesList} object \emph{parallel} to
79 | \code{x} (the transcript names on \code{x} are propagated).
80 | The i-th list element in the returned object is the result of mapping
81 | the range of protein-relative positions \code{x[i]} to the genome.
82 |
83 | Note that a given range in \code{x} can only be mapped to the genome
84 | if the name on it is the name of a \emph{coding} transcript. If it's
85 | not (i.e. if it's the name of a \emph{non-coding} transcript), then
86 | an empty \link[GenomicRanges]{GRanges} object is placed in the returned
87 | object to indicate the impossible mapping, and a warning is issued.
88 |
89 | Otherwise, if a given range in \code{x} can be mapped to the
90 | genome, then the result of the mapping is represented by a
91 | non-empty \link[GenomicRanges]{GRanges} object.
92 | Note that this object represents the original CDS associated to
93 | \code{x}, trimmed on its 5' end or 3' end, or on both.
94 | Furthermore, this object will have the same metadata columns as the
95 | \link[GenomicRanges]{GRanges} object representing the original CDS,
96 | plus the 2 following ones:
97 | \itemize{
98 | \item \code{protein_start}: The protein-relative start of the mapping.
99 | \item \code{protein_end}: The protein-relative end of the mapping.
100 | }
101 | }
102 |
103 | \note{
104 | Unlike \code{ensembldb::\link[ensembldb]{proteinToGenome}()} which
105 | can work either with Ensembl protein IDs or Ensembl transcript IDs
106 | on \code{x}, the default \code{proteinToGenome()} method described
107 | above only accepts \emph{transcript names} on \code{x}.
108 |
109 | This means that, if the user is in possession of protein IDs, they
110 | must first replace them with the corresponding transcript IDs (referred
111 | to as \emph{transcript names} in the context of \link{TxDb} objects).
112 | How to do this exactly depends on the origin of those IDs (UCSC,
113 | Ensembl, GTF/GFF3 file, FlyBase, etc...)
114 | }
115 |
116 | \author{H. Pagès, using \code{ensembldb::proteinToGenome()} for
117 | inspiration and design.}
118 |
119 | \seealso{
120 | \itemize{
121 | \item The \code{\link[ensembldb]{proteinToGenome}} function in the
122 | \pkg{ensembldb} package, which the \code{proteinToGenome()}
123 | generic and methods documented in this man page are (loosely)
124 | modeled on.
125 |
126 | \item \link{TxDb} objects.
127 |
128 | \item \link[ensembldb]{EnsDb} objects (\link{TxDb}-like objects) in
129 | the \pkg{ensembldb} package.
130 |
131 | \item \code{\link{transcripts}} for extracting transcripts from a
132 | \link{TxDb}-like object.
133 |
134 | \item \code{\link{cdsBy}} for extracting CDS parts from a
135 | \link{TxDb}-like object.
136 |
137 | \item \link[IRanges]{IRanges} objects in the \pkg{IRanges} package.
138 |
139 | \item \link[GenomicRanges]{GRanges} and \link[GenomicRanges]{GRangesList}
140 | objects in the \pkg{GenomicRanges} package.
141 | }
142 | }
143 |
144 | \examples{
145 | ## ---------------------------------------------------------------------
146 | ## USING TOY CDS
147 | ## ---------------------------------------------------------------------
148 |
149 | ## CDS1 has 2 CDS parts:
150 | CDS1 <- GRanges(c("chrX:11-60:+", "chrX:101-125:+"))
151 |
152 | ## CDS2 has 3 CDS parts:
153 | CDS2 <- GRanges(c("chrY:201-230:-", "chrY:101-125:-", "chrY:11-60:-"))
154 |
155 | ## Put them in a GRangesList object:
156 | cds_by_tx <- GRangesList(TX1=CDS1, TX2=CDS2)
157 | cds_by_tx
158 |
159 | x1 <- IRanges(start=8, end=20, names="TX1")
160 | proteinToGenome(x1, cds_by_tx)
161 |
162 | x2 <- IRanges(start=c(1, 18), end=c(25, 20), names=c("TX1", "TX1"))
163 | x2
164 | proteinToGenome(x2, cds_by_tx)
165 |
166 | x3 <- IRanges(start=8, end=15, names="TX2")
167 | proteinToGenome(x3, cds_by_tx)
168 |
169 | x4 <- c(x3, x2)
170 | x4
171 | proteinToGenome(x4, cds_by_tx)
172 |
173 | ## ---------------------------------------------------------------------
174 | ## USING A TxDb OBJECT
175 | ## ---------------------------------------------------------------------
176 | library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
177 | txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene
178 |
179 | ## The first transcript (FBtr0309810) is non-coding:
180 | x <- IRanges(c(FBtr0309810="11-55", FBtr0306539="90-300"))
181 | res <- proteinToGenome(x, txdb)
182 | res
183 | }
184 |
185 | \keyword{methods}
186 | \keyword{utilities}
187 |
--------------------------------------------------------------------------------
/man/select-methods.Rd:
--------------------------------------------------------------------------------
1 | \name{select-methods}
2 |
3 | \alias{select-methods}
4 |
5 | \alias{columns,TxDb-method}
6 | \alias{keytypes,TxDb-method}
7 | \alias{keys,TxDb-method}
8 | \alias{select,TxDb-method}
9 |
10 | \title{Using the "select" interface on TxDb objects}
11 |
12 | \description{
13 | \code{select}, \code{columns} and \code{keys} can be used together to
14 | extract data from a \link{TxDb} object.
15 | }
16 |
17 | \details{
18 | In the code snippets below, \code{x} is a \link{TxDb} object.
19 |
20 | \describe{
21 | \item{\code{keytypes(x)}:}{
22 | allows the user to discover which keytypes can be passed in to
23 | \code{select} or \code{keys} and the \code{keytype} argument.
24 | }
25 | \item{\code{keys(x, keytype, pattern, column, fuzzy)}:}{ Return keys for
26 | the database contained in the \link{TxDb} object .
27 |
28 | The \code{keytype} argument specifies the kind of keys that will
29 | be returned. By default \code{keys} will return the "GENEID" keys
30 | for the database.
31 |
32 | If \code{keys} is used with \code{pattern}, it will pattern match
33 | on the \code{keytype}.
34 |
35 | But if the \code{column} argument is also provided along with the
36 | \code{pattern} argument, then \code{pattern} will be matched
37 | against the values in \code{column} instead.
38 |
39 | And if \code{keys} is called with \code{column} and no
40 | \code{pattern} argument, then it will return all keys that have
41 | corresponding values in the \code{column} argument.
42 |
43 | Thus, the behavior of \code{keys} all depends on how many arguments are
44 | specified.
45 |
46 | Use of the \code{fuzzy} argument will toggle fuzzy matching to
47 | TRUE or FALSE. If \code{pattern} is not used, fuzzy is ignored.
48 | }
49 | \item{\code{columns(x)}:}{
50 | Show which kinds of data can be returned for the
51 | \link{TxDb} object.
52 | }
53 | \item{\code{select(x, keys, columns, keytype)}:}{
54 | When all the appropriate arguments are specified \code{select}
55 | will retrieve the matching data as a data.frame based on
56 | parameters for selected \code{keys} and \code{columns} and
57 | \code{keytype} arguments.
58 | }
59 | }
60 |
61 | }
62 |
63 |
64 |
65 | \author{Marc Carlson}
66 |
67 | \seealso{
68 | \itemize{
69 | \item \link[AnnotationDbi]{AnnotationDb-class} for more descriptsion
70 | of methods \code{select},\code{keytypes},\code{keys} and \code{columns}.
71 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}},
72 | and \code{\link{transcriptsByOverlaps}}, for other ways to
73 | extract genomic features from a \link{TxDb} object.
74 | \item The \link{TxDb} class.
75 | }
76 | }
77 |
78 | \examples{
79 | txdb_file <- system.file("extdata", "Biomart_Ensembl_sample.sqlite",
80 | package="GenomicFeatures")
81 | txdb <- loadDb(txdb_file)
82 | txdb
83 |
84 | ## find key types
85 | keytypes(txdb)
86 |
87 | ## list IDs that can be used to filter
88 | head(keys(txdb, "GENEID"))
89 | head(keys(txdb, "TXID"))
90 | head(keys(txdb, "TXNAME"))
91 |
92 | ## list columns that can be returned by select
93 | columns(txdb)
94 |
95 | ## call select
96 | res <- select(txdb, head(keys(txdb, "GENEID")),
97 | columns=c("GENEID","TXNAME"),
98 | keytype="GENEID")
99 | head(res)
100 | }
101 |
102 | \keyword{methods}
103 |
--------------------------------------------------------------------------------
/man/tRNAs.Rd:
--------------------------------------------------------------------------------
1 | \name{tRNAs}
2 |
3 | \alias{microRNAs}
4 | \alias{tRNAs}
5 | \alias{tRNAs,TxDb-method}
6 |
7 | \title{
8 | Extract tRNA genomic ranges from an object
9 | }
10 |
11 | \description{
12 | WARNING: The code base for \code{tRNAs()} is no longer actively
13 | maintained and the function might get deprecated in the near future.
14 |
15 | The \code{tRNAs()} function extracts tRNA genomic ranges from a
16 | \link{TxDb} object.
17 | }
18 |
19 | \usage{
20 | tRNAs(x)
21 | }
22 |
23 | \arguments{
24 | \item{x}{
25 | A \link{TxDb} object.
26 | }
27 | }
28 |
29 | \value{
30 | A \link[GenomicRanges]{GRanges} object.
31 | }
32 |
33 | \author{
34 | M. Carlson
35 | }
36 |
37 | \seealso{
38 | \itemize{
39 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}}, and
40 | \code{\link{transcriptsByOverlaps}} for the core genomic features
41 | extractors.
42 | \item The \link{TxDb} class.
43 | }
44 | }
45 |
46 | \keyword{methods}
47 |
--------------------------------------------------------------------------------
/man/transcriptLengths.Rd:
--------------------------------------------------------------------------------
1 | \name{transcriptLengths}
2 |
3 | \alias{transcriptLengths}
4 |
5 |
6 | \title{Extract the transcript lengths (and other metrics) from a TxDb object}
7 |
8 | \description{
9 | The \code{transcriptLengths} function extracts the transcript lengths from
10 | a \link{TxDb} object. It also returns the CDS and UTR lengths for each
11 | transcript if the user requested them.
12 | }
13 |
14 | \usage{
15 | transcriptLengths(txdb, with.cds_len=FALSE,
16 | with.utr5_len=FALSE, with.utr3_len=FALSE, ...)
17 | }
18 |
19 | \arguments{
20 | \item{txdb}{
21 | A \link{TxDb} object.
22 | }
23 | \item{with.cds_len, with.utr5_len, with.utr3_len}{
24 | \code{TRUE} or \code{FALSE}. Whether or not to also extract and return
25 | the CDS, 5' UTR, and 3' UTR lengths for each transcript.
26 | }
27 | \item{\dots}{
28 | Additional arguments used by \code{transcripts} and other accessor
29 | functions.
30 | }
31 | }
32 |
33 | \details{
34 | All the lengths are counted in number of nucleotides.
35 |
36 | The length of a processed transcript is just the sum of the lengths of its
37 | exons. This should not be confounded with the length of the stretch of DNA
38 | transcribed into RNA (a.k.a. transcription unit), which can be obtained
39 | with \code{width(transcripts(txdb))}.
40 | }
41 |
42 | \value{
43 | A data frame with 1 row per transcript. The rows are guaranteed to be in
44 | the same order as the elements of the \link[GenomicRanges]{GRanges} object
45 | returned by \code{\link{transcripts}(txdb)}.
46 | The data frame has between 5 and 8 columns, depending on what the user
47 | requested via the \code{with.cds_len}, \code{with.utr5_len}, and
48 | \code{with.utr3_len} arguments.
49 |
50 | The first 3 columns are the same as the metadata columns of the object
51 | returned by
52 | \preformatted{ transcripts(txdb, columns=c("tx_id", "tx_name", "gene_id"))
53 | }
54 | that is:
55 | \itemize{
56 | \item \code{tx_id}: The internal transcript ID. This ID is unique within
57 | the scope of the \link{TxDb} object. It is not an official or public
58 | ID (like an Ensembl or FlyBase ID) or an Accession number, so it
59 | cannot be used to lookup the transcript in public data bases or in
60 | other \link{TxDb} objects. Furthermore, this ID could change when
61 | re-running the code that was used to make the \link{TxDb} object.
62 | \item \code{tx_name}: An official/public transcript name or ID that can
63 | be used to lookup the transcript in public data bases or in other
64 | \link{TxDb} objects. This column is not guaranteed to contain unique
65 | values and it can contain NAs.
66 | \item \code{gene_id}: The official/public ID of the gene that the
67 | transcript belongs to. Can be NA if the gene is unknown or if the
68 | transcript is not considered to belong to a gene.
69 | }
70 |
71 | The other columns are quantitative:
72 | \itemize{
73 | \item \code{nexon}: The number of exons in the transcript.
74 | \item \code{tx_len}: The length of the processed transcript.
75 | \item \code{cds_len}: [optional] The length of the CDS region of the
76 | processed transcript.
77 | \item \code{utr5_len}: [optional] The length of the 5' UTR region of
78 | the processed transcript.
79 | \item \code{utr3_len}: [optional] The length of the 3' UTR region of
80 | the processed transcript.
81 | }
82 | }
83 |
84 | \author{Hervé Pagès}
85 |
86 | \seealso{
87 | \itemize{
88 | \item \code{\link{transcripts}}, \code{\link{transcriptsBy}},
89 | and \code{\link{transcriptsByOverlaps}}, for extracting
90 | genomic feature locations from a \link{TxDb}-like object.
91 |
92 | \item \code{\link{exonicParts}} and \code{\link{intronicParts}} for
93 | extracting non-overlapping exonic or intronic parts from a
94 | TxDb-like object.
95 |
96 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript
97 | (or CDS) sequences from chromosome sequences.
98 |
99 | \item \code{\link{coverageByTranscript}} for computing coverage by
100 | transcript (or CDS) of a set of ranges.
101 |
102 | \item \code{\link[txdbmaker]{makeTxDbFromUCSC}},
103 | \code{\link[txdbmaker]{makeTxDbFromBiomart}},
104 | and \code{\link[txdbmaker]{makeTxDbFromEnsembl}} in
105 | the \pkg{txdbmaker} package for making a \link{TxDb}
106 | object from online resources.
107 |
108 | \item \code{\link[txdbmaker]{makeTxDbFromGRanges}} and
109 | \code{\link[txdbmaker]{makeTxDbFromGFF}} in the \pkg{txdbmaker}
110 | package for making a \link{TxDb} object from a
111 | \link[GenomicRanges]{GRanges} object, or from a GFF or GTF file.
112 |
113 | \item The \link{TxDb} class.
114 | }
115 | }
116 |
117 | \examples{
118 | library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
119 | txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene
120 | dm3_txlens <- transcriptLengths(txdb)
121 | head(dm3_txlens)
122 |
123 | dm3_txlens <- transcriptLengths(txdb, with.cds_len=TRUE,
124 | with.utr5_len=TRUE,
125 | with.utr3_len=TRUE)
126 | head(dm3_txlens)
127 |
128 | ## When cds_len is 0 (non-coding transcript), utr5_len and utr3_len
129 | ## must also be 0:
130 | non_coding <- dm3_txlens[dm3_txlens$cds_len == 0, ]
131 | stopifnot(all(non_coding[6:8] == 0))
132 |
133 | ## When cds_len is not 0 (coding transcript), cds_len + utr5_len +
134 | ## utr3_len must be equal to tx_len:
135 | coding <- dm3_txlens[dm3_txlens$cds_len != 0, ]
136 | stopifnot(all(rowSums(coding[6:8]) == coding[[5]]))
137 |
138 | ## A sanity check:
139 | stopifnot(identical(dm3_txlens$tx_id, mcols(transcripts(txdb))$tx_id))
140 | }
141 |
142 | \keyword{manip}
143 |
--------------------------------------------------------------------------------
/man/transcriptLocs2refLocs.Rd:
--------------------------------------------------------------------------------
1 | \name{transcriptLocs2refLocs}
2 |
3 | \alias{transcriptWidths}
4 | \alias{transcriptLocs2refLocs}
5 |
6 | \title{Converting transcript-based locations into reference-based locations}
7 |
8 | \description{
9 | \code{transcriptLocs2refLocs} converts transcript-based
10 | locations into reference-based (aka chromosome-based or genomic)
11 | locations.
12 |
13 | \code{transcriptWidths} computes the lengths of the transcripts
14 | (called the "widths" in this context) based on the boundaries
15 | of their exons.
16 | }
17 |
18 | \usage{
19 | transcriptLocs2refLocs(tlocs,
20 | exonStarts=list(), exonEnds=list(), strand=character(0),
21 | decreasing.rank.on.minus.strand=FALSE, error.if.out.of.bounds=TRUE)
22 |
23 | transcriptWidths(exonStarts=list(), exonEnds=list())
24 | }
25 |
26 | \arguments{
27 | \item{tlocs}{
28 | A list of integer vectors of the same length as \code{exonStarts}
29 | and \code{exonEnds}. Each element in \code{tlocs} must contain
30 | transcript-based locations.
31 | }
32 | \item{exonStarts, exonEnds}{
33 | The starts and ends of the exons, respectively.
34 |
35 | Each argument can be a list of integer vectors,
36 | an \link[IRanges]{IntegerList} object,
37 | or a character vector where each element is a
38 | comma-separated list of integers.
39 | In addition, the lists represented by \code{exonStarts}
40 | and \code{exonEnds} must have the same shape i.e.
41 | have the same lengths and have elements of the same lengths.
42 | The length of \code{exonStarts} and \code{exonEnds}
43 | is the number of transcripts.
44 | }
45 | \item{strand}{
46 | A character vector of the same length as \code{exonStarts} and
47 | \code{exonEnds} specifying the strand (\code{"+"} or \code{"-"})
48 | from which the transcript is coming.
49 | }
50 | \item{decreasing.rank.on.minus.strand}{
51 | \code{TRUE} or \code{FALSE}.
52 | Describes the order of exons in transcripts located on the minus strand:
53 | are they ordered by increasing (default) or decreasing rank?
54 | }
55 | \item{error.if.out.of.bounds}{
56 | \code{TRUE} or \code{FALSE}.
57 | Controls how out of bound \code{tlocs} are handled: an error is thrown
58 | (default) or \code{NA} is returned.
59 | }
60 | }
61 |
62 | \value{
63 | For \code{transcriptLocs2refLocs}: A list of integer vectors of the same
64 | shape as \code{tlocs}.
65 |
66 | For \code{transcriptWidths}: An integer vector with one element per
67 | transcript.
68 | }
69 |
70 | \author{Hervé Pagès}
71 |
72 | \seealso{
73 | \itemize{
74 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript
75 | (or CDS) sequences from chromosomes.
76 |
77 | \item \code{\link{coverageByTranscript}} for computing coverage by
78 | transcript (or CDS) of a set of ranges.
79 | }
80 | }
81 |
82 | \examples{
83 | ## ---------------------------------------------------------------------
84 | ## WITH A SMALL SET OF HUMAN TRANSCRIPTS
85 | ## ---------------------------------------------------------------------
86 | txdb_file <- system.file("extdata", "hg19_knownGene_sample.sqlite",
87 | package="GenomicFeatures")
88 | txdb <- loadDb(txdb_file)
89 | ex_by_tx <- exonsBy(txdb, by="tx", use.names=TRUE)
90 | genome <- BSgenome::getBSgenome("hg19") # load the hg19 genome
91 | tx_seqs <- extractTranscriptSeqs(genome, ex_by_tx)
92 |
93 | ## Get the reference-based locations of the first 4 (5' end)
94 | ## and last 4 (3' end) nucleotides in each transcript:
95 | tlocs <- lapply(width(tx_seqs), function(w) c(1:4, (w-3):w))
96 | tx_strand <- sapply(strand(ex_by_tx), runValue)
97 |
98 | ## Note that, because of how we made them, 'tlocs', 'start(ex_by_tx)',
99 | ## 'end(ex_by_tx)' and 'tx_strand' are "parallel" objects i.e. they
100 | ## have the same length, and, for any valid positional index, elements
101 | ## at this position are corresponding to each other. This is how
102 | ## transcriptLocs2refLocs() expects them to be!
103 | rlocs <- transcriptLocs2refLocs(tlocs,
104 | start(ex_by_tx), end(ex_by_tx),
105 | tx_strand, decreasing.rank.on.minus.strand=TRUE)
106 |
107 | ## ---------------------------------------------------------------------
108 | ## WITH TWO WORM TRANSCRIPTS: ZC101.3.1 AND F37B1.1.1
109 | ## ---------------------------------------------------------------------
110 | library(TxDb.Celegans.UCSC.ce11.ensGene)
111 | txdb <- TxDb.Celegans.UCSC.ce11.ensGene
112 | my_tx_names <- c("ZC101.3.1", "F37B1.1.1")
113 | ## Both transcripts are on chromosome II, the first one on its positive
114 | ## strand and the second one on its negative strand:
115 | my_tx <- transcripts(txdb, filter=list(tx_name=my_tx_names))
116 | my_tx
117 |
118 | ## Using transcripts stored in a GRangesList object:
119 | ex_by_tx <- exonsBy(txdb, use.names=TRUE)[my_tx_names]
120 | genome <- getBSgenome("ce11") # load the ce11 genome
121 | tx_seqs <- extractTranscriptSeqs(genome, ex_by_tx)
122 | tx_seqs
123 |
124 | ## Since the 2 transcripts are on the same chromosome, an alternative
125 | ## is to store them in an IRangesList object and use that object with
126 | ## extractTranscriptSeqs():
127 | ex_by_tx2 <- ranges(ex_by_tx)
128 | tx_seqs2 <- extractTranscriptSeqs(genome$chrII, ex_by_tx2,
129 | strand=strand(my_tx))
130 | stopifnot(identical(as.character(tx_seqs), as.character(tx_seqs2)))
131 |
132 | ## Store exon starts and ends in two IntegerList objects for use with
133 | ## transcriptWidths() and transcriptLocs2refLocs():
134 | exon_starts <- start(ex_by_tx)
135 | exon_ends <- end(ex_by_tx)
136 |
137 | ## Same as 'width(tx_seqs)':
138 | transcriptWidths(exonStarts=exon_starts, exonEnds=exon_ends)
139 |
140 | transcriptLocs2refLocs(list(c(1:2, 202:205, 1687:1688),
141 | c(1:2, 193:196, 721:722)),
142 | exonStarts=exon_starts,
143 | exonEnds=exon_ends,
144 | strand=c("+","-"))
145 |
146 | ## A sanity check:
147 | ref_locs <- transcriptLocs2refLocs(list(1:1688, 1:722),
148 | exonStarts=exon_starts,
149 | exonEnds=exon_ends,
150 | strand=c("+","-"))
151 | stopifnot(genome$chrII[ref_locs[[1]]] == tx_seqs[[1]])
152 | stopifnot(complement(genome$chrII)[ref_locs[[2]]] == tx_seqs[[2]])
153 | }
154 |
155 | \keyword{manip}
156 |
--------------------------------------------------------------------------------
/man/transcriptsBy.Rd:
--------------------------------------------------------------------------------
1 | \name{transcriptsBy}
2 |
3 | \alias{transcriptsBy}
4 | \alias{transcriptsBy,TxDb-method}
5 | \alias{exonsBy}
6 | \alias{exonsBy,TxDb-method}
7 | \alias{cdsBy}
8 | \alias{cdsBy,TxDb-method}
9 | \alias{intronsByTranscript}
10 | \alias{intronsByTranscript,TxDb-method}
11 | \alias{fiveUTRsByTranscript}
12 | \alias{fiveUTRsByTranscript,TxDb-method}
13 | \alias{threeUTRsByTranscript}
14 | \alias{threeUTRsByTranscript,TxDb-method}
15 |
16 | \title{
17 | Extract and group genomic features of a given type from a TxDb-like object
18 | }
19 | \description{
20 | Generic functions to extract genomic features of a given type
21 | grouped based on another type of genomic feature.
22 | This page documents the methods for \link{TxDb} objects only.
23 | }
24 | \usage{
25 | transcriptsBy(x, by=c("gene", "exon", "cds"), ...)
26 | \S4method{transcriptsBy}{TxDb}(x, by=c("gene", "exon", "cds"), use.names=FALSE)
27 |
28 | exonsBy(x, by=c("tx", "gene"), ...)
29 | \S4method{exonsBy}{TxDb}(x, by=c("tx", "gene"), use.names=FALSE)
30 |
31 | cdsBy(x, by=c("tx", "gene"), ...)
32 | \S4method{cdsBy}{TxDb}(x, by=c("tx", "gene"), use.names=FALSE)
33 |
34 | intronsByTranscript(x, ...)
35 | \S4method{intronsByTranscript}{TxDb}(x, use.names=FALSE)
36 |
37 | fiveUTRsByTranscript(x, ...)
38 | \S4method{fiveUTRsByTranscript}{TxDb}(x, use.names=FALSE)
39 |
40 | threeUTRsByTranscript(x, ...)
41 | \S4method{threeUTRsByTranscript}{TxDb}(x, use.names=FALSE)
42 | }
43 | \arguments{
44 | \item{x}{A \link{TxDb} object.}
45 | \item{...}{Arguments to be passed to or from methods.}
46 | \item{by}{One of \code{"gene"}, \code{"exon"}, \code{"cds"} or \code{"tx"}.
47 | Determines the grouping.}
48 | \item{use.names}{Controls how to set the names of the returned
49 | \link[GenomicRanges]{GRangesList} object.
50 | These functions return all the features of a given type (e.g.
51 | all the exons) grouped by another feature type (e.g. grouped by
52 | transcript) in a \link[GenomicRanges]{GRangesList} object.
53 | By default (i.e. if \code{use.names} is \code{FALSE}), the
54 | names of this \link[GenomicRanges]{GRangesList} object
55 | (aka the group names) are the internal ids of the features
56 | used for grouping (aka the grouping features), which are
57 | guaranteed to be unique.
58 | If \code{use.names} is \code{TRUE}, then the names of the
59 | grouping features are used instead of their internal ids.
60 | For example, when grouping by transcript (\code{by="tx"}),
61 | the default group names are the transcript internal ids
62 | (\code{"tx_id"}). But, if \code{use.names=TRUE}, the group
63 | names are the transcript names (\code{"tx_name"}).
64 | Note that, unlike the feature ids, the feature names are not
65 | guaranteed to be unique or even defined (they could be all
66 | \code{NA}s). A warning is issued when this happens.
67 | See \code{?\link{id2name}} for more information about
68 | feature internal ids and feature external names and how
69 | to map the formers to the latters.
70 |
71 | Finally, \code{use.names=TRUE} cannot be used when grouping
72 | by gene \code{by="gene"}. This is because, unlike for the
73 | other features, the gene ids are external ids (e.g. Entrez
74 | Gene or Ensembl ids) so the db doesn't have a \code{"gene_name"}
75 | column for storing alternate gene names.
76 | }
77 | }
78 | \details{
79 | These functions return a \link[GenomicRanges]{GRangesList} object
80 | where the ranges within each of the elements are ordered according
81 | to the following rule:
82 |
83 | When using \code{exonsBy} or \code{cdsBy} with \code{by="tx"},
84 | the returned exons or CDS parts are ordered by ascending rank for
85 | each transcript, that is, by their position in the transcript.
86 | In all other cases, the ranges will be ordered by chromosome, strand,
87 | start, and end values.
88 | }
89 | \value{A \link[GenomicRanges]{GRangesList} object.}
90 | \author{
91 | M. Carlson, P. Aboyoun and H. Pagès
92 | }
93 | \seealso{
94 | \itemize{
95 | \item \code{\link{transcripts}} and \code{\link{transcriptsByOverlaps}}
96 | for more ways to extract genomic features
97 | from a \link{TxDb}-like object.
98 |
99 | \item \code{\link{transcriptLengths}} for extracting the transcript
100 | lengths (and other metrics) from a \link{TxDb} object.
101 |
102 | \item \code{\link{exonicParts}} and \code{\link{intronicParts}} for
103 | extracting non-overlapping exonic or intronic parts from a
104 | TxDb-like object.
105 |
106 | \item \code{\link{extendExonsIntoIntrons}} for extending exons
107 | into their adjacent introns.
108 |
109 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript
110 | (or CDS) sequences from chromosome sequences.
111 |
112 | \item \code{\link{coverageByTranscript}} for computing coverage by
113 | transcript (or CDS) of a set of ranges.
114 |
115 | \item \link[GenomicFeatures]{select-methods} for how to use the
116 | simple "select" interface to extract information from a
117 | \link{TxDb} object.
118 |
119 | \item \code{\link{id2name}} for mapping \link{TxDb} internal ids
120 | to external names for a given feature type.
121 |
122 | \item The \link{TxDb} class.
123 | }
124 | }
125 | \examples{
126 | txdb_file <- system.file("extdata", "hg19_knownGene_sample.sqlite",
127 | package="GenomicFeatures")
128 | txdb <- loadDb(txdb_file)
129 |
130 | ## Extract the transcripts grouped by gene:
131 | transcriptsBy(txdb, "gene")
132 |
133 | ## Extract the exons grouped by gene:
134 | exonsBy(txdb, "gene")
135 |
136 | ## Extract the CDS parts grouped by transcript:
137 | cds_by_tx0 <- cdsBy(txdb, "tx")
138 | ## With more informative group names:
139 | cds_by_tx1 <- cdsBy(txdb, "tx", use.names=TRUE)
140 | ## Note that 'cds_by_tx1' can also be obtained with:
141 | names(cds_by_tx0) <- id2name(txdb, feature.type="tx")[names(cds_by_tx0)]
142 | stopifnot(identical(cds_by_tx0, cds_by_tx1))
143 |
144 | ## Extract the introns grouped by transcript:
145 | intronsByTranscript(txdb)
146 |
147 | ## Extract the 5' UTRs grouped by transcript:
148 | fiveUTRsByTranscript(txdb)
149 | fiveUTRsByTranscript(txdb, use.names=TRUE) # more informative group names
150 | }
151 |
152 | \keyword{methods}
153 |
--------------------------------------------------------------------------------
/man/transcriptsByOverlaps.Rd:
--------------------------------------------------------------------------------
1 | \name{transcriptsByOverlaps}
2 |
3 | \alias{transcriptsByOverlaps}
4 | \alias{transcriptsByOverlaps,TxDb-method}
5 | \alias{exonsByOverlaps}
6 | \alias{exonsByOverlaps,TxDb-method}
7 | \alias{cdsByOverlaps}
8 | \alias{cdsByOverlaps,TxDb-method}
9 |
10 | \title{
11 | Extract genomic features from a TxDb-like object based on their
12 | genomic location
13 | }
14 | \description{
15 | Generic functions to extract genomic features for specified genomic
16 | locations.
17 | This page documents the methods for \link{TxDb} objects only.
18 | }
19 | \usage{
20 | transcriptsByOverlaps(x, ranges,
21 | maxgap = -1L, minoverlap = 0L,
22 | type = c("any", "start", "end"), ...)
23 | \S4method{transcriptsByOverlaps}{TxDb}(x, ranges,
24 | maxgap = -1L, minoverlap = 0L,
25 | type = c("any", "start", "end"),
26 | columns = c("tx_id", "tx_name"))
27 |
28 | exonsByOverlaps(x, ranges,
29 | maxgap = -1L, minoverlap = 0L,
30 | type = c("any", "start", "end"), ...)
31 | \S4method{exonsByOverlaps}{TxDb}(x, ranges,
32 | maxgap = -1L, minoverlap = 0L,
33 | type = c("any", "start", "end"),
34 | columns = "exon_id")
35 |
36 | cdsByOverlaps(x, ranges,
37 | maxgap = -1L, minoverlap = 0L,
38 | type = c("any", "start", "end"), ...)
39 | \S4method{cdsByOverlaps}{TxDb}(x, ranges,
40 | maxgap = -1L, minoverlap = 0L,
41 | type = c("any", "start", "end"),
42 | columns = "cds_id")
43 | }
44 | \arguments{
45 | \item{x}{A \link{TxDb} object.}
46 | \item{ranges}{A \link[GenomicRanges]{GRanges} object to restrict the output.}
47 | \item{maxgap,minoverlap,type}{
48 | Used in the internal call to \code{findOverlaps()} to detect overlaps.
49 | See \code{?\link[IRanges]{findOverlaps}} in the \pkg{IRanges} package
50 | for a description of these arguments.
51 | }
52 | \item{...}{Arguments to be passed to or from methods.}
53 | \item{columns}{Columns to include in the output.
54 | See \code{?\link{transcripts}} for the possible values.}
55 | }
56 | \details{
57 | These functions subset the results of \code{\link{transcripts}},
58 | \code{\link{exons}}, and \code{\link{cds}} function calls with
59 | using the results of \code{\link[IRanges]{findOverlaps}}
60 | calls based on the specified \code{ranges}.
61 | }
62 | \value{ a GRanges object }
63 | \author{
64 | P. Aboyoun
65 | }
66 | \seealso{
67 | \itemize{
68 | \item \code{\link{transcripts}} and \code{\link{transcriptsBy}}
69 | for more ways to extract genomic features
70 | from a \link{TxDb}-like object.
71 |
72 | \item \code{\link{transcriptLengths}} for extracting the transcript
73 | lengths (and other metrics) from a \link{TxDb} object.
74 |
75 | \item \code{\link{exonicParts}} and \code{\link{intronicParts}} for
76 | extracting non-overlapping exonic or intronic parts from a
77 | TxDb-like object.
78 |
79 | \item \code{\link{extractTranscriptSeqs}} for extracting transcript
80 | (or CDS) sequences from chromosome sequences.
81 |
82 | \item \code{\link{coverageByTranscript}} for computing coverage by
83 | transcript (or CDS) of a set of ranges.
84 |
85 | \item \link[GenomicFeatures]{select-methods} for how to use the
86 | simple "select" interface to extract information from a
87 | \link{TxDb} object.
88 |
89 | \item \code{\link{id2name}} for mapping \link{TxDb} internal ids
90 | to external names for a given feature type.
91 |
92 | \item The \link{TxDb} class.
93 | }
94 | }
95 | \examples{
96 | txdb <- loadDb(system.file("extdata", "hg19_knownGene_sample.sqlite",
97 | package="GenomicFeatures"))
98 | gr <- GRanges(Rle("chr1", 2),
99 | IRanges(c(500,10500), c(10000,30000)),
100 | strand = Rle("-", 2))
101 | transcriptsByOverlaps(txdb, gr)
102 | }
103 |
104 | \keyword{methods}
105 |
--------------------------------------------------------------------------------
/tests/run_unitTests.R:
--------------------------------------------------------------------------------
1 | require("GenomicFeatures") || stop("unable to load GenomicFeatures package")
2 | GenomicFeatures:::.test()
3 |
--------------------------------------------------------------------------------