├── .Rbuildignore ├── .gitignore ├── vignettes ├── images │ ├── icons │ │ ├── home.png │ │ ├── next.png │ │ ├── note.png │ │ ├── prev.png │ │ ├── tip.png │ │ ├── up.png │ │ ├── INFO.txt │ │ ├── caution.png │ │ ├── download.png │ │ ├── example.png │ │ ├── important.png │ │ ├── warning.png │ │ ├── callouts │ │ │ ├── 1.png │ │ │ ├── 10.png │ │ │ ├── 11.png │ │ │ ├── 12.png │ │ │ ├── 13.png │ │ │ ├── 14.png │ │ │ ├── 15.png │ │ │ ├── 2.png │ │ │ ├── 3.png │ │ │ ├── 4.png │ │ │ ├── 5.png │ │ │ ├── 6.png │ │ │ ├── 7.png │ │ │ ├── 8.png │ │ │ └── 9.png │ │ └── callout-border.png │ └── GSE52564-samples-section.png ├── mystyles.css └── archs4.Rmd ├── inst ├── extdata │ ├── testdata-sample-ids.xlsx │ ├── blacklist.csv │ └── meta.yaml └── rmdparts │ ├── references.Rmd │ └── installation.Rmd ├── tests ├── testthat.R └── testthat │ ├── test-archs4-functional.R │ ├── helper-all.R │ ├── test-bioc-containers.R │ ├── test-expression-concordance.R │ ├── test-expression.R │ ├── test-feature-info.R │ └── test-sample-retrieval.R ├── R ├── package.R ├── blacklist.R ├── validate.R ├── zzz.R ├── rmd-helpers.R ├── h5utils.R ├── tidy.R ├── data-directory.R ├── bioc-containers.R ├── geo-utils.R ├── Archs4Repository.R ├── archs4-features.R ├── expression.R └── archs4-functional.R ├── man ├── blacklist.Rd ├── datadir.Rd ├── feature_source.Rd ├── estimate_norm_factors.Rd ├── archs4_meta.Rd ├── Archs4Repository.Rd ├── libstats.Rd ├── geo_id_type.Rd ├── estimate_repository_norm_factors.Rd ├── archs4_sources.Rd ├── archs4_sample_covariates.Rd ├── query_geo.Rd ├── archs4_local_data_dir_validate.Rd ├── lookup_gse.Rd ├── archs4_file_info.Rd ├── feature_lookup.Rd ├── dot-h5read.Rd ├── retrieve_sra_metadata.Rd ├── archs4_series_status.Rd ├── archs4_feature_info.Rd ├── archs4_local_data_dir_create.Rd ├── lookup_biosamples.Rd ├── as.DGEList.Rd ├── archs4_sample_table.Rd ├── archs4_file_path.Rd ├── fetch_expression.Rd ├── create_augmented_feature_info.Rd └── archs4_sample_info.Rd ├── DESCRIPTION ├── NAMESPACE ├── README.Rmd └── README.md /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | inst/doc 6 | -------------------------------------------------------------------------------- /vignettes/images/icons/home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/home.png -------------------------------------------------------------------------------- /vignettes/images/icons/next.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/next.png -------------------------------------------------------------------------------- /vignettes/images/icons/note.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/note.png -------------------------------------------------------------------------------- /vignettes/images/icons/prev.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/prev.png -------------------------------------------------------------------------------- /vignettes/images/icons/tip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/tip.png -------------------------------------------------------------------------------- /vignettes/images/icons/up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/up.png -------------------------------------------------------------------------------- /vignettes/images/icons/INFO.txt: -------------------------------------------------------------------------------- 1 | These icons were taken from asciidoc: 2 | 3 | http://www.methods.co.nz/asciidoc/images/icons/ 4 | -------------------------------------------------------------------------------- /vignettes/images/icons/caution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/caution.png -------------------------------------------------------------------------------- /vignettes/images/icons/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/download.png -------------------------------------------------------------------------------- /vignettes/images/icons/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/example.png -------------------------------------------------------------------------------- /vignettes/images/icons/important.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/important.png -------------------------------------------------------------------------------- /vignettes/images/icons/warning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/warning.png -------------------------------------------------------------------------------- /inst/extdata/testdata-sample-ids.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/inst/extdata/testdata-sample-ids.xlsx -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library("archs4") 2 | library("testthat") 3 | library("dplyr") 4 | library("rhdf5") 5 | 6 | test_check("archs4") 7 | -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/1.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/10.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/11.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/12.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/13.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/14.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/15.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/2.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/3.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/4.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/5.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/6.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/7.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/8.png -------------------------------------------------------------------------------- /vignettes/images/icons/callouts/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/9.png -------------------------------------------------------------------------------- /vignettes/images/icons/callout-border.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callout-border.png -------------------------------------------------------------------------------- /vignettes/images/GSE52564-samples-section.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/GSE52564-samples-section.png -------------------------------------------------------------------------------- /inst/extdata/blacklist.csv: -------------------------------------------------------------------------------- 1 | "series_id","reason" 2 | "GSE88850","scRNAseq" 3 | "GSE44618","scRNAseq" 4 | "GSE85783","scRNAseq" 5 | "GSE90014","scRNAseq" 6 | "GSE99095","scRNAseq" 7 | "GSE103239","scRNAseq" 8 | "GSE95630","scRNAseq" 9 | "GSE90015","methylation array?" 10 | -------------------------------------------------------------------------------- /R/package.R: -------------------------------------------------------------------------------- 1 | # Should we import GEOquery? 2 | 3 | #' @import checkmate 4 | #' @import dplyr 5 | #' @import purrr 6 | #' @import readr 7 | #' @import tidyr 8 | NULL 9 | 10 | # re-export generics from other packages 11 | 12 | #' @importFrom broom tidy 13 | #' @export tidy 14 | NULL -------------------------------------------------------------------------------- /tests/testthat/test-archs4-functional.R: -------------------------------------------------------------------------------- 1 | context("ARCHS4 Functional Interface") 2 | 3 | if (!exists("a4")) { 4 | # This is loaded by the testthat/helper-all.R script when testthat is running 5 | # the unit tests, but included here for convenience when doing interactive 6 | # test development 7 | a4 <- Archs4Repository() 8 | } 9 | 10 | -------------------------------------------------------------------------------- /R/blacklist.R: -------------------------------------------------------------------------------- 1 | #' Return a list of series/samples you may want to ignore for now 2 | #' 3 | #' The entries that appear here so far are because the data appear to come from 4 | #' single-cell experiments 5 | #' 6 | #' @export 7 | blacklist <- function() { 8 | fn <- system.file("extdata", "blacklist.csv", package = "archs4") 9 | read.csv(fn, stringsAsFactors = FALSE) 10 | } 11 | -------------------------------------------------------------------------------- /man/blacklist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/blacklist.R 3 | \name{blacklist} 4 | \alias{blacklist} 5 | \title{Return a list of series/samples you may want to ignore for now} 6 | \usage{ 7 | blacklist() 8 | } 9 | \description{ 10 | The entries that appear here so far are because the data appear to come from 11 | single-cell experiments 12 | } 13 | -------------------------------------------------------------------------------- /R/validate.R: -------------------------------------------------------------------------------- 1 | is_archs4_expression_file <- function(fn) { 2 | assert_character(fn) 3 | # last N characters must be '_matrix.h5' 4 | suffix <- '_matrix.h5$' 5 | isTRUE(length(grep(suffix, fn)[1L]) == 1L) 6 | } 7 | 8 | is_geo_series_id <- function(id) { 9 | assert_character(id) 10 | substr(id, 1, 3) == "GSE" 11 | } 12 | 13 | is_geo_sample_id <- function(id) { 14 | assert_character(id) 15 | substr(id, 1, 3) == "GSM" 16 | } -------------------------------------------------------------------------------- /man/datadir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R 3 | \name{datadir} 4 | \alias{datadir} 5 | \title{Retrieves the directory that contains the data for the Archs4Repository} 6 | \usage{ 7 | datadir(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{an \code{Archs4Repository}} 11 | } 12 | \description{ 13 | Retrieves the directory that contains the data for the Archs4Repository 14 | } 15 | -------------------------------------------------------------------------------- /man/feature_source.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/expression.R 3 | \name{feature_source} 4 | \alias{feature_source} 5 | \title{Retrieves (organism) source of gene/transcript identifiers} 6 | \usage{ 7 | feature_source(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A vector of identifiers} 11 | } 12 | \description{ 13 | Parses ensembl identifiers and determines if they are for genes, or 14 | transcripts, as well as the organism they should belong to (human, mouse) 15 | } 16 | -------------------------------------------------------------------------------- /man/estimate_norm_factors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/expression.R 3 | \name{estimate_norm_factors} 4 | \alias{estimate_norm_factors} 5 | \title{Calculates library size and norm factors for a specific dataset} 6 | \usage{ 7 | estimate_norm_factors(a4, key, n = 500, logratioTrim = 0.3, 8 | sumTrim = 0.05, doWeighting = TRUE, Acutoff = -1e+10, p = 0.75, 9 | ...) 10 | } 11 | \description{ 12 | This function will serialize the results of the library size and 13 | normalization factors into files inside \code{datadir(a4)} 14 | } 15 | -------------------------------------------------------------------------------- /man/archs4_meta.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R 3 | \name{meta} 4 | \alias{meta} 5 | \alias{archs4_meta} 6 | \title{Retrieves the meta information associated with an ARCHS4 datadir} 7 | \usage{ 8 | meta(x) 9 | 10 | archs4_meta(datadir = getOption("archs4.datadir")) 11 | } 12 | \arguments{ 13 | \item{x}{an \code{Archs4Repository}} 14 | 15 | \item{datadir}{the directory that holds the archs4 data} 16 | } 17 | \value{ 18 | a list-representation of the \code{meta.yaml} file 19 | } 20 | \description{ 21 | Retrieves the meta information associated with an ARCHS4 datadir 22 | } 23 | -------------------------------------------------------------------------------- /man/Archs4Repository.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R 3 | \name{Archs4Repository} 4 | \alias{Archs4Repository} 5 | \title{An interface to a locally downloaded ARCHS4 dataset} 6 | \usage{ 7 | Archs4Repository(datadir = getOption("archs4.datadir")) 8 | } 9 | \arguments{ 10 | \item{datadir}{The directory that stores the ARCHS4 data.} 11 | } 12 | \value{ 13 | an Arhcs4DataSet object 14 | } 15 | \description{ 16 | This instantiates an object that acts as a central broker to handle queries 17 | against the ARCHS4 dataset. Please refer to the vignette for instructions 18 | on how to setup a local directory to act as an Archs4Repository. 19 | } 20 | -------------------------------------------------------------------------------- /man/libstats.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R 3 | \name{libstats} 4 | \alias{libstats} 5 | \title{Extract the read depth and normalization factors for the samples} 6 | \usage{ 7 | libstats(x, with_a4libsize = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{an \code{Archs4Repository}} 11 | 12 | \item{with_a4libsize}{If \code{TRUE}, includes an \code{a4libsize} column, which 13 | was extracted from the \code{meta/reads_aligned} hdf5 file. Defaults to \code{FALSE}.} 14 | } 15 | \value{ 16 | a tibble with sample_id, a4libsize, libsize, normfactor 17 | } 18 | \description{ 19 | Extract the read depth and normalization factors for the samples 20 | } 21 | -------------------------------------------------------------------------------- /man/geo_id_type.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geo-utils.R 3 | \name{geo_id_type} 4 | \alias{geo_id_type} 5 | \title{Classify a vector of sample or series GEO ID's as such} 6 | \usage{ 7 | geo_id_type(id) 8 | } 9 | \arguments{ 10 | \item{id}{a character vector of \code{GSEnnnnn} or \code{GSMnnnnn} ids} 11 | } 12 | \value{ 13 | a tibble of \code{unique(id)} indicating if the id is a \code{"series"} 14 | (GSEnnnnn), \code{"sample"} (GSMnnnnn), or \code{"unknown"}. 15 | } 16 | \description{ 17 | GEO series identifiers all start with GSE and sample identifiers all 18 | start with GSM. We use that to identify what types of identifiers are 19 | passed into \code{id} 20 | } 21 | -------------------------------------------------------------------------------- /tests/testthat/helper-all.R: -------------------------------------------------------------------------------- 1 | # These commands are run before the tests and are made available for everyone 2 | # to use. 3 | 4 | # Instantiate a universal Archs4Repository because it takes a little while 5 | # (~4s) to construct one. 6 | if (!interactive() && identical(Sys.getenv("NOT_CRAN"), "true")) { 7 | # Putting it in this block, because helper-* function are run for convenience 8 | # in several scenarios, including when we: 9 | # 10 | # 1. Run devtools::load_all(); and 11 | # 2. Run devtools::document() 12 | # 13 | # In these scenarios, I don't want to instantiated the Archs4Repository, so 14 | # the if () shoudl only evaluate to true when we are actually *testing*. 15 | a4 <- Archs4Repository() 16 | } 17 | -------------------------------------------------------------------------------- /man/estimate_repository_norm_factors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/expression.R 3 | \name{estimate_repository_norm_factors} 4 | \alias{estimate_repository_norm_factors} 5 | \title{Estimate normalization factors for datasets in the Archs4Repository} 6 | \usage{ 7 | estimate_repository_norm_factors(a4) 8 | } 9 | \arguments{ 10 | \item{a4}{The \code{Arcsh4Repository}} 11 | } 12 | \value{ 13 | Invisibly returns a tibble of the the library size and normalization 14 | factors for the expression data in \code{a4} (invisibly). 15 | } 16 | \description{ 17 | This function will serialize the results of the library size and 18 | normalization factors into files inside \code{datadir(a4)} 19 | } 20 | -------------------------------------------------------------------------------- /man/archs4_sources.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R 3 | \name{sources} 4 | \alias{sources} 5 | \alias{archs4_sources} 6 | \title{Lists the different sources ARCHS4 is built for} 7 | \usage{ 8 | sources(x) 9 | 10 | archs4_sources(datadir = getOption("archs4.datadir")) 11 | } 12 | \arguments{ 13 | \item{x}{an \code{Archs4Repository}} 14 | 15 | \item{datadir}{the directory that holds the archs4 data} 16 | } 17 | \value{ 18 | a character vector listing the different sources (organisms) that 19 | the ARCHS4 repository has data for. 20 | } 21 | \description{ 22 | We hardocde these values in a lot of places ... who knows if one day these 23 | are updated? 24 | } 25 | -------------------------------------------------------------------------------- /man/archs4_sample_covariates.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R 3 | \name{sample_covariates} 4 | \alias{sample_covariates} 5 | \alias{archs4_sample_covariates} 6 | \title{Retrieves tibble of sample-level covariates available in mouse and human data.} 7 | \usage{ 8 | sample_covariates(x, ...) 9 | 10 | archs4_sample_covariates(datadir = getOption("archs4.datadir"), ...) 11 | } 12 | \arguments{ 13 | \item{x}{an \code{Archs4Repository}} 14 | 15 | \item{datadir}{the directory that holds the archs4 data} 16 | } 17 | \description{ 18 | Enumerate the sample covariates available in mouse and human data. 19 | Note that the covariates available in human and mouse are the same 20 | between the gene and transcript level files 21 | } 22 | -------------------------------------------------------------------------------- /man/query_geo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geo-utils.R 3 | \name{query_geo} 4 | \alias{query_geo} 5 | \title{Query NCBI GEO through its REST interface} 6 | \source{ 7 | https://www.ncbi.nlm.nih.gov/geo/info/download.html 8 | } 9 | \usage{ 10 | query_geo(accession, target = c("self", "gsm", "gpl", "gse", "all"), 11 | validate = FALSE, verbose = FALSE) 12 | } 13 | \arguments{ 14 | \item{accession}{Scalar character, GEO identifier for a series (GSE), a 15 | sample (GSM) or a platform (GPL).} 16 | 17 | \item{validate}{Scalar boolean, validate the retrieved xml file against 18 | NCBI's schema?} 19 | } 20 | \value{ 21 | xml2::xml_document object 22 | } 23 | \description{ 24 | Query NCBI GEO through its REST interface 25 | } 26 | \examples{ 27 | query_geo("GSE109171") 28 | } 29 | -------------------------------------------------------------------------------- /tests/testthat/test-bioc-containers.R: -------------------------------------------------------------------------------- 1 | context("Bioconductor Containers") 2 | 3 | if (!exists("a4")) { 4 | a4 <- Archs4Repository() 5 | } 6 | 7 | test_that("as.DGEList creates gene- and sample-level DGELists", { 8 | # This is a human dataset 9 | scovs <- c("Sample_title", "Sample_source_name_ch1") 10 | yg <- as.DGEList(a4, "GSE52564", feature_type = "gene", 11 | sample_columns = scovs) 12 | expect_true(all(substr(rownames(yg), 1, 7) == "ENSMUSG")) 13 | for (cov in scovs) { 14 | expect_is(yg$samples[[cov]], "character", info = paste("yg:", cov)) 15 | } 16 | 17 | 18 | yt <- as.DGEList(a4, "GSE52564", feature_type = "transcript", 19 | sample_columns = scovs) 20 | expect_true(all(substr(rownames(yt), 1, 7) == "ENSMUST")) 21 | for (cov in scovs) { 22 | expect_is(yt$samples[[cov]], "character", info = paste("yt:", cov)) 23 | } 24 | }) -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onLoad <- function(libname, pkgname) { 2 | ## Setup default option values 3 | opts <- options() 4 | 5 | pkg.opts <- list( 6 | archs4.datadir='~/.archs4data') 7 | toset <- !(names(pkg.opts) %in% names(opts)) 8 | if (any(toset)) { 9 | options(pkg.opts[toset]) 10 | } 11 | 12 | ddir <- getOption("archs4.datadir") 13 | kosher <- archs4_local_data_dir_validate(echo = FALSE, datadir = ddir) 14 | if (!isTRUE(kosher)) { 15 | packageStartupMessage( 16 | "Note that your default archs4 data directory is NOT setup correctly\n\n", 17 | " * Run `archs4_local_data_dir_validate()` to diagnose\n", 18 | " * Refer to the ARCHS4 Data Download section of the archs4 vignette ", 19 | "for more information\n\n", 20 | "Your default archs4 data directory (`getOption(\"archs4.datadir\")`) ", 21 | "is:\n\n ", ddir, "\n\n") 22 | } 23 | 24 | invisible() 25 | } 26 | -------------------------------------------------------------------------------- /man/archs4_local_data_dir_validate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data-directory.R 3 | \name{archs4_local_data_dir_validate} 4 | \alias{archs4_local_data_dir_validate} 5 | \title{Check "the health" of a local ARCHS4 data datadir} 6 | \usage{ 7 | archs4_local_data_dir_validate(echo = TRUE, 8 | datadir = getOption("archs4.datadir")) 9 | } 10 | \arguments{ 11 | \item{echo}{echo validation diagnostic message to stdout via \code{\link[base:cat]{base::cat()}}} 12 | 13 | \item{datadir}{the path to the datadir that stores local ARCHS4 data. 14 | Defaults to \code{getOption("archs4.datadir")}.} 15 | } 16 | \value{ 17 | A string that indicates "what's wrong", or \code{TRUE} if validation 18 | succeeds. 19 | } 20 | \description{ 21 | This function will notify the suer which files are missing from the 22 | ARCHS4 data datadir, and what course of action they can use to 23 | fix it. 24 | } 25 | -------------------------------------------------------------------------------- /man/lookup_gse.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geo-utils.R 3 | \name{lookup_gse} 4 | \alias{lookup_gse} 5 | \title{Retrieve information about a GEO series} 6 | \usage{ 7 | lookup_gse(accession, fields = c("Accession", "Title", "Summary", 8 | "Overall-Design", "Type", "Pubmed-ID", "Sample"), ...) 9 | } 10 | \arguments{ 11 | \item{accession}{Scalar character, GEO series identifier e.g. GSE109171} 12 | 13 | \item{fields}{Character vector specifying which fields to extract from the 14 | XML file returned by GEO} 15 | 16 | \item{...}{Additional arguments passed on to the \code{query_geo} function.} 17 | } 18 | \value{ 19 | List the requested \code{fields} 20 | } 21 | \description{ 22 | Queries NCBI GEO's REST interface to retrieve e.g. title, summary and the 23 | list of samples for a GEO series. 24 | } 25 | \examples{ 26 | if (interactive()) { 27 | lookup_gse("GSE109171") 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /man/archs4_file_info.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R 3 | \name{file_info} 4 | \alias{file_info} 5 | \alias{archs4_file_info} 6 | \title{Retrieves a table of files that back an Archs4Repository} 7 | \usage{ 8 | file_info(x) 9 | 10 | archs4_file_info(datadir = getOption("archs4.datadir")) 11 | } 12 | \arguments{ 13 | \item{x}{an \code{Archs4Repository}} 14 | 15 | \item{datadir}{the directory that stores the ARCHS4 data files} 16 | } 17 | \description{ 18 | A yaml iskept in the Archs4 data directory (\code{getOption("archs4.datadir")}) 19 | that links keys, (ie. \code{mouse_gene}) to the name of the file in the directory. 20 | This abstraction is introduced so that the version of these files can be 21 | updated with new downloads, and the user simply has to modify the yaml file 22 | so that they are used downstream 23 | 24 | Reference the "ARCHS4 Data Download" section in the vignette for more 25 | information. 26 | } 27 | -------------------------------------------------------------------------------- /tests/testthat/test-expression-concordance.R: -------------------------------------------------------------------------------- 1 | context("Expression Concordance") 2 | 3 | # There have been a few occassions where the ARCHS4 gene expression quantitation 4 | # has lined up far from what is expected. For example: 5 | # 6 | # 1. the values in the human v4 gene-level expression matrices lined up poorly 7 | # with internal estimates of the same, and this concordance was resolved when 8 | # the v5 gene-level matrices were published. 9 | # 2. The v5 mouse gene-level matrices look a bit off compared to internally 10 | # processed data, but the v4 versions of the seemed more inline with 11 | # expectation. 12 | 13 | # As a "validation set", a subset of gene level 14 | # `edgeR::cpm(., prior.count = 3, log = TRUE)` values are provided for studies 15 | # processed by: 16 | # 1. A STAR + salmon + GENOCDE ref piline; and 17 | # 2. recount2 18 | 19 | test_that("human expression is concordant with expectation", { 20 | 21 | }) 22 | 23 | test_that("mouse expression is concoordant with expectation", { 24 | 25 | }) -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: archs4 2 | Type: Package 3 | Title: Wrappers to query and extract ARCHS4 data 4 | Version: 0.5.2 5 | Authors@R: c( 6 | person("Steve", "Lianoglou", , "lianoglou@dnli.com", c("aut", "cre")), 7 | person("Thomas", "Sandmann", , "sandmann@dnli.com", c("aut")), 8 | person("Denali Therapeutics", role = c("cph", "fnd")) 9 | ) 10 | Description: An R interface to query and extract data from the ARCHS4 resource. 11 | License: Apache 2.0 12 | URL: https://github.com/denalitherapeutics/archs4#readme 13 | BugReports: https://github.com/denalitherapeutics/archs4/issues 14 | Imports: 15 | broom, 16 | checkmate, 17 | dplyr, 18 | edgeR, 19 | purrr, 20 | readr, 21 | rentrez, 22 | rhdf5, 23 | tibble, 24 | tidyr, 25 | xml2, 26 | yaml 27 | Suggests: 28 | GenomicRanges, 29 | rtracklayer, 30 | testthat, 31 | rmarkdown, 32 | knitr 33 | Encoding: UTF-8 34 | LazyData: true 35 | Roxygen: list(markdown = TRUE) 36 | RoxygenNote: 6.1.0 37 | VignetteBuilder: knitr 38 | -------------------------------------------------------------------------------- /inst/rmdparts/references.Rmd: -------------------------------------------------------------------------------- 1 | [//]: # (References ===========================================================) 2 | 3 | [archs4dl]: https://amp.pharm.mssm.edu/archs4/download.html 4 | [archs4eda]: https://amp.pharm.mssm.edu/archs4/data.html 5 | [archs4pub]: https://www.nature.com/articles/s41467-018-03751-6 6 | [archs4web]: https://amp.pharm.mssm.edu/archs4/ 7 | [barrespub]: http://www.jneurosci.org/content/34/36/11929.short 8 | [barresgeo]: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE52564 9 | [bioc]: http://bioconductor.org/ 10 | [biocLite]: https://www.bioconductor.org/install/#why-biocLite 11 | [blurtonpub]: https://www.ncbi.nlm.nih.gov/pubmed/28426964 12 | [blurtongeo]: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE89189 13 | [cran]: https://cran.r-project.org/ 14 | [geo]: https://www.ncbi.nlm.nih.gov/geo/ 15 | [kallisto]: https://pachterlab.github.io/kallisto/about 16 | [lachmann]: http://www.mountsinai.org/profiles/alexander-lachmann 17 | [maayanlab]: http://labs.icahn.mssm.edu/maayanlab/ 18 | [sra]: https://www.ncbi.nlm.nih.gov/sra 19 | -------------------------------------------------------------------------------- /man/feature_lookup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R 3 | \name{feature_lookup} 4 | \alias{feature_lookup} 5 | \title{Perform a loose/fuzzy lookup for a gene/transcript feature.} 6 | \usage{ 7 | feature_lookup(x, query, feature_type = "gene", source = "human", ...) 8 | } 9 | \arguments{ 10 | \item{x}{An Archs4Repository} 11 | 12 | \item{query}{a character string of feature names to look for} 13 | 14 | \item{feature_type}{"gene" or "transcript"} 15 | 16 | \item{source}{organism dataset to lookup} 17 | } 18 | \value{ 19 | a tibble of features that match against the query. The first column 20 | is the value of the query itself. If no match is found for a query, its 21 | row is all \code{NA}. 22 | } 23 | \description{ 24 | This funciton facilitates exploratory data analyses by trying to find gene 25 | or transcripts by different type of identifiers (symbol, ensembl_id, etc). 26 | } 27 | \examples{ 28 | a4 <- Archs4Repository() 29 | features <- feature_lookup(a4, c("CFAP65", "PECR", "ENSG00000131408"), 30 | feature_type = "gene", source ="human") 31 | } 32 | -------------------------------------------------------------------------------- /man/dot-h5read.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/h5utils.R 3 | \name{.h5read} 4 | \alias{.h5read} 5 | \title{Read data from an HDF5 file "with caution"} 6 | \usage{ 7 | .h5read(file, name, index = NULL, start = NULL, stride = NULL, 8 | block = NULL, count = NULL, compoundAsDataFrame = TRUE, 9 | callGeneric = TRUE, read.attributes = FALSE, drop = FALSE, ..., 10 | native = FALSE, default_value = NA, default_dim = 1L) 11 | } 12 | \description{ 13 | This is a simple wrapper to \code{\link[rhdf5:h5read]{rhdf5::h5read()}} which returns a default value 14 | if the "data element" (specified by \code{name}) does not exists within \code{file}. 15 | We use this file to read from ARCHS4 hdf5 files when we want to provide a 16 | little insurance to the evoling nature of their data formats. 17 | 18 | For instance this function is used when we try to read somethign like 19 | \code{"meta/reads_aligned"} because this information was not provided in earlier 20 | versions of these datasets, however \code{"meta/genes"} may use \code{\link[rhdf5:h5read]{rhdf5::h5read()}} 21 | directly because this has been around since "the beginning" 22 | } 23 | -------------------------------------------------------------------------------- /R/rmd-helpers.R: -------------------------------------------------------------------------------- 1 | #' Generates markdown enumeration of linked files to download for datadir 2 | #' 3 | #' @noRd 4 | #' 5 | #' @param datadir the archs4 data dir 6 | #' @return a character string 7 | md_archs4_download_bullet_list <- function(datadir = getOption("archs4.datadir")) { 8 | mdat <- archs4_meta(datadir) 9 | files <- mdat$files 10 | sources <- sapply(files, "[[", "source") 11 | sources <- setdiff(sources, "computed") 12 | mdown <- lapply(sources, function(s) { 13 | header <- sprintf("* %s\n", s) 14 | items <- lapply(files, function(f) { 15 | if (f$source != s) return(NULL) 16 | # ftp:// links aren't rendered correctly in Rmd's, so we need to print 17 | # http:// vs ftp:// links separately 18 | if (substr(f$url, 1, 4) == "http") { 19 | md <- sprintf(" - [`%s`](%s): %s", f$name, f$url, f$description) 20 | } else { 21 | md <- sprintf(" - `%s`: %s\n %s", f$name, f$description, f$url) 22 | } 23 | md 24 | }) 25 | items <- items[sapply(items, Negate(is.null))] 26 | items <- paste(items, collapse = "\n") 27 | paste0(header, items) 28 | }) 29 | paste(mdown, collapse = "\n") 30 | } 31 | -------------------------------------------------------------------------------- /man/retrieve_sra_metadata.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geo-utils.R 3 | \name{retrieve_sra_metadata} 4 | \alias{retrieve_sra_metadata} 5 | \title{Retrieve metadata for an SRA accession} 6 | \usage{ 7 | retrieve_sra_metadata(x, from = c("ena", "ncbi")) 8 | } 9 | \arguments{ 10 | \item{x}{SRA identifier} 11 | 12 | \item{from}{Scalar character, specifying either \code{ncbi} or \code{ena} as 13 | the source database} 14 | } 15 | \value{ 16 | A tbl_df data.frame 17 | } 18 | \description{ 19 | This function uses the EBI's or the NCBI's REST APIs to retrieve information 20 | about SRA data. 21 | Study accessions (ERP, SRP, DRP, PRJ prefixes), experiment accessions 22 | (ERX, SRX, DRX prefixes), sample accessions (ERS, SRS, DRS, SAM prefixes) 23 | and run accessions (ERR, SRR, DRR prefixes) can be supplied. 24 | For more information see \url{http://www.ebi.ac.uk/ena/browse/file-reports} 25 | } 26 | \note{ 27 | The output data.frame will be different for the two source databases. 28 | } 29 | \examples{ 30 | if (interactive()) { 31 | # retrieve study annotations 32 | retrieve_sra_metadata("SRP066489") 33 | # paired-end samples 34 | retrieve_sra_metadata("PRJEB2054", "ena") \%>\% 35 | dplyr::filter(sample_accession == "SAMEA728920") 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /man/archs4_series_status.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R 3 | \name{series_status} 4 | \alias{series_status} 5 | \alias{archs4_series_status} 6 | \title{Checks which samples from a single series are present/absent.} 7 | \usage{ 8 | series_status(x, id, ...) 9 | 10 | archs4_series_status(id, sample_table = archs4_sample_table(datadir = 11 | datadir), datadir = getOption("archs4.datadir"), ...) 12 | } 13 | \arguments{ 14 | \item{x}{an \code{Archs4Repository}} 15 | 16 | \item{id}{a single GEO series id, ie. \code{"GSEnnnnn"}} 17 | 18 | \item{sample_table}{the output from \code{\link[=archs4_sample_table]{archs4_sample_table()}}, which lists 19 | the series_id,sample_id combinations found in the ARCHS4 repository.} 20 | 21 | \item{datadir}{the directory that holds the archs4 data} 22 | } 23 | \value{ 24 | a tibble of information for a series. 25 | } 26 | \description{ 27 | Due to download or alignment issues, the ARCHS4 data processing pipeline may 28 | not include all of the samples included in a particular GEO series. This 29 | function will return a table with an \code{in_archs4} columns that indicates 30 | whether or not a sample from a particular series is present in ARCHS4. 31 | } 32 | \examples{ 33 | info <- archs4_series_status("GSE89189") 34 | } 35 | -------------------------------------------------------------------------------- /tests/testthat/test-expression.R: -------------------------------------------------------------------------------- 1 | context("Expression") 2 | 3 | if (!exists("a4")) { 4 | a4 <- Archs4Repository() 5 | } 6 | 7 | test_that("fetch_expression grabs the right count data", { 8 | gnz <- c("IGFL3", "GSTA4", "MRPS21") 9 | h5.fn <- file_path(a4, "human_gene") 10 | h5.idx <- match(gnz, rhdf5::h5read(h5.fn, "meta/genes")) 11 | 12 | for (i in seq(h5.idx)) { 13 | gene <- gnz[i] 14 | h5idx <- h5.idx[i] 15 | 16 | counts <- rhdf5::h5read(h5.fn, "data/expression", list(h5idx, NULL)) 17 | counts <- as.vector(counts) 18 | names(counts) <- rhdf5::h5read(h5.fn, "meta/Sample_geo_accession") 19 | counts <- counts[!is.na(counts)] 20 | 21 | samples <- sample_table(a4) %>% 22 | filter(!is.na(libsize), !is.na(normfactor)) %>% 23 | select(series_id, sample_id) %>% 24 | semi_join(tibble(sample_id = names(counts)), by = "sample_id") %>% 25 | distinct(sample_id, .keep_all = TRUE) 26 | 27 | counts <- counts[samples$sample_id] 28 | 29 | res <- fetch_expression(a4, gene, samples = samples, 30 | feature_type = "gene", source = "human") 31 | sample_key <- paste(res$series_id, res$sample_id, sep = "_") 32 | expect_true(sum(duplicated(sample_key)) == 0, info = gene) 33 | expect_setequal(names(counts), res$sample_id) 34 | xref <- match(names(counts), res$sample_id) 35 | expect_equal(unname(counts), res$count[xref]) 36 | } 37 | }) 38 | -------------------------------------------------------------------------------- /man/archs4_feature_info.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R 3 | \name{feature_info} 4 | \alias{feature_info} 5 | \alias{archs4_feature_info} 6 | \title{Retrieves the feature (gene/transcript) information for the archs4 data} 7 | \usage{ 8 | feature_info(x, feature_type = "gene", source = "human", 9 | augmented = TRUE, ...) 10 | 11 | archs4_feature_info(feature_type = "gene", source = "human", 12 | augmented = TRUE, datadir = getOption("archs4.datadir"), ...) 13 | } 14 | \arguments{ 15 | \item{x}{an \code{Archs4Repository}} 16 | 17 | \item{feature_type}{gene or transcript?} 18 | 19 | \item{source}{human or mouse} 20 | 21 | \item{augmented}{include extra gene- or transcript-level features? 22 | Default: \code{TRUE}} 23 | 24 | \item{...}{pass through} 25 | 26 | \item{datadir}{the directory that stores the ARCHS4 data files} 27 | } 28 | \value{ 29 | a tibble of information 30 | } 31 | \description{ 32 | Only the gene symbols (\code{meta/genes} in gene expression hd5 file) or entrez 33 | transcript identifiers (\code{meta/transcript} for the transcript hdf5 file) are 34 | stored in thse data. We use \code{\link[=create_augmented_feature_info]{create_augmented_feature_info()}} function to 35 | generate and store extra metadata for these features, which are then appended 36 | to these identifiers with this function. 37 | } 38 | \seealso{ 39 | \code{\link[=create_augmented_feature_info]{create_augmented_feature_info()}} 40 | } 41 | -------------------------------------------------------------------------------- /man/archs4_local_data_dir_create.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data-directory.R 3 | \name{archs4_local_data_dir_create} 4 | \alias{archs4_local_data_dir_create} 5 | \title{Initialize a local datadir to act as an ARCHS4 data datadir} 6 | \usage{ 7 | archs4_local_data_dir_create(datadir = getOption("archs4.datadir"), 8 | stop_if_exists = TRUE) 9 | } 10 | \arguments{ 11 | \item{datadir}{the path to the datadir to create (or initialize) as an 12 | ARCHS4 data datadir.} 13 | 14 | \item{stop_if_exists}{by default, this function will \code{stop} if \code{datadir} 15 | already exists. Set this to \code{FALSE} to continue. Setting it to \code{FALSE} is 16 | convenient to initialize the target \code{datadir} with a \code{meta.yaml} file. 17 | If a \code{meta.yaml} file already exists in \code{datadir}, then this function 18 | will stop unconditionally. Move the \code{datadir/meta.yaml} out of the way 19 | if you simply want to refresh it with the default version.} 20 | } 21 | \value{ 22 | invisibly returns the path to the \code{meta.yaml} in the target 23 | \code{datadir} 24 | } 25 | \description{ 26 | Initialize a local datadir to act as an ARCHS4 data datadir 27 | } 28 | \details{ 29 | A local datadir needs to be created and initialized (wth a \code{meta.yaml} 30 | file), to house ARCHS4 data for use in an Archs4Repository. This function 31 | creates that datadir and copies an initial \code{meta.yaml} file. 32 | 33 | Please refer to the vignette section "ARCHS4 Data Download" for more 34 | details. 35 | } 36 | -------------------------------------------------------------------------------- /man/lookup_biosamples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geo-utils.R 3 | \name{lookup_biosamples} 4 | \alias{lookup_biosamples} 5 | \title{Retrieve sample annotations from NCBI's Biosample database} 6 | \source{ 7 | https://www.ncbi.nlm.nih.gov/books/NBK25499/#\emph{chapter4_ESearch} 8 | } 9 | \usage{ 10 | lookup_biosamples(x, retmax = 1e+05 - 1L) 11 | } 12 | \arguments{ 13 | \item{x}{Character vector of sample identifiers to search the Biosample 14 | database for. Typically either \code{BioSample (SAMN)}, \code{SRA (SRS)} or 15 | \code{GEO (GSM)} accession numbers.} 16 | 17 | \item{retmax}{Scalar integer, the maximum number of (total) matches to 18 | retrieve from Entrez. See 19 | \url{https://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESearch_} 20 | for details. The number of records that can be retrieved in one query 21 | must be < 100,000.} 22 | } 23 | \value{ 24 | A tbl_df data.frame with sample annotations. Column names and numbers 25 | vary depending on the attributes available in the Biosample database. 26 | } 27 | \description{ 28 | This function uses the \code{rentrez} package to retrieve sample annotations 29 | from NCBI's Biosample database. 30 | } 31 | \examples{ 32 | if (interactive()) { 33 | # BioSample identifiers 34 | lookup_biosamples(c("GSM1947162", "GSM1947179")) 35 | # mixed SRS and GSM identifiers 36 | lookup_biosamples(c("SRS1171537", "SRS1171536", "GSM1947179")) 37 | # mixed samples from two different studies (with different attributes) 38 | lookup_biosamples(c("SRS1171537", "SRS1271536")) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /man/as.DGEList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bioc-containers.R 3 | \name{as.DGEList} 4 | \alias{as.DGEList} 5 | \title{Create a DGEList for the expression data of a series or set of samples.} 6 | \usage{ 7 | as.DGEList(x, id, features = NULL, sample_columns = c("Sample_title", 8 | "Sample_source_name_ch1"), feature_type = c("gene", "transcript"), 9 | row_id = c("ensembl", "symbol"), check_missing_samples = TRUE) 10 | } 11 | \arguments{ 12 | \item{id}{a vector of series or sample id's.} 13 | 14 | \item{features}{a feature-descriptor of the features you want to include 15 | counts for} 16 | 17 | \item{sample_columns}{the names of the sample covariates that are stored 18 | in the ARCHS4 Dataset; a complete list of what covariates are available 19 | in the ARCHS4 dataset is found using the \code{\link[=archs4_sample_covariates]{archs4_sample_covariates()}} 20 | function.} 21 | 22 | \item{feature_type}{do you want \code{"gene"} or \code{"transcript"} level expression?} 23 | 24 | \item{row_id}{either \code{"ensembl"} or \code{"symbol"}. If this is \code{"ensembl"} and 25 | \code{feature_type == "transcript"}, then we remove the rows from the count 26 | dataset that we couldn't map symbol -> ensembl_gene_id for.} 27 | } 28 | \value{ 29 | a \code{DGEList} of results 30 | } 31 | \description{ 32 | Create a DGEList for the expression data of a series or set of samples. 33 | } 34 | \examples{ 35 | a4 <- Archs4Repository() 36 | y <- as.DGEList(a4, "GSE89189", feature_type = "gene") 37 | } 38 | \seealso{ 39 | \code{\link[=fetch_expression]{fetch_expression()}} 40 | } 41 | -------------------------------------------------------------------------------- /man/archs4_sample_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R 3 | \name{sample_table} 4 | \alias{sample_table} 5 | \alias{archs4_sample_table} 6 | \title{Lists the GEO series and samples available in the human and mouse datasets} 7 | \usage{ 8 | sample_table(x, ...) 9 | 10 | archs4_sample_table(feature_type = c("all", "gene", "transcript"), 11 | unroll_series = TRUE, datadir = getOption("archs4.datadir")) 12 | } 13 | \arguments{ 14 | \item{x}{an \code{Archs4Repository}} 15 | 16 | \item{feature_type}{currently, the \code{"gene"} and \code{"transcript"} datasets 17 | are not the same.} 18 | 19 | \item{unroll_series}{There are some malformed series identifiers, like 20 | \code{"GSE36025Xx-xXGSE49417Xx-xXGSE49847"} when the same sample_id appears 21 | in multiple series. When this is \code{TRUE} (default), the series_id's are 22 | unrolled and cleaned up.} 23 | 24 | \item{datadir}{the directory that holds the archs4 data} 25 | } 26 | \value{ 27 | a tibble of series_id, sample_id, species columns 28 | } 29 | \description{ 30 | This function queries the human and mouse gene expression matrices from 31 | the ARCHS4 data release and combines their GEO series and sample identifiers 32 | into a long table, annotated with the organism the samples come from. 33 | } 34 | \details{ 35 | This function executes very quickly (less thatn 0.10th of a second), so most 36 | sample-level query functions in this package which you would think would 37 | benefit from specifying human/mouse don't have to, as they will join into 38 | this table to find out what species is being queried. 39 | } 40 | -------------------------------------------------------------------------------- /man/archs4_file_path.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R 3 | \name{file_path} 4 | \alias{file_path} 5 | \alias{archs4_file_path} 6 | \title{Identify the file path on the system for specific ARCHS4 files.} 7 | \usage{ 8 | file_path(x, key) 9 | 10 | archs4_file_path(key, stop_if_missing = TRUE, na_missing = TRUE, 11 | file_info = archs4_file_info(datadir), 12 | datadir = getOption("archs4.datadir")) 13 | } 14 | \arguments{ 15 | \item{x}{an \code{Archs4Repository}} 16 | 17 | \item{key}{the lookup key for the file, ie. \code{"human_gene"} or \code{"mouse_gene"}. 18 | The known keys are enumerated in \code{archs4_file_info()$key} column.} 19 | 20 | \item{stop_if_missing}{defaults to \code{TRUE}, which causes this function to 21 | throw an error if the file does not exist at the expected \code{file_path}. 22 | Set this to \code{FALSE} to simply raise a warning} 23 | 24 | \item{na_missing}{by default, we set paths to files that don't exist to 25 | \code{NA}. Set this to \code{FALSE} to retrieve the expected path of the missing 26 | file.} 27 | 28 | \item{file_info}{the output from \code{\link[=archs4_file_info]{archs4_file_info()}}, which enumerates the 29 | files used by the \code{Archs4Repository}.} 30 | 31 | \item{datadir}{the directory that stores the ARCHS4 data files} 32 | } 33 | \value{ 34 | a named (by \code{key}) character vector of paths to the filesystem that 35 | correspond to the entries in \code{key}. 36 | } 37 | \description{ 38 | By default, this function will throw an error if a file does not exist 39 | upon lookup. To return the \emph{expected} path to the, even if it does not 40 | exist on the file system, set \code{stop_if_missing = FALSE}. 41 | } 42 | -------------------------------------------------------------------------------- /R/h5utils.R: -------------------------------------------------------------------------------- 1 | #' Read data from an HDF5 file "with caution" 2 | #' 3 | #' @description 4 | #' This is a simple wrapper to [rhdf5::h5read()] which returns a default value 5 | #' if the "data element" (specified by `name`) does not exists within `file`. 6 | #' We use this file to read from ARCHS4 hdf5 files when we want to provide a 7 | #' little insurance to the evoling nature of their data formats. 8 | #' 9 | #' For instance this function is used when we try to read somethign like 10 | #' `"meta/reads_aligned"` because this information was not provided in earlier 11 | #' versions of these datasets, however `"meta/genes"` may use [rhdf5::h5read()] 12 | #' directly because this has been around since "the beginning" 13 | #' 14 | #' @importFrom rhdf5 h5read 15 | #' 16 | .h5read <- function(file, name, index=NULL, start=NULL, stride=NULL, block=NULL, 17 | count=NULL, compoundAsDataFrame = TRUE, callGeneric = TRUE, 18 | read.attributes = FALSE, drop = FALSE, ..., 19 | native = FALSE, default_value = NA, default_dim = 1L) { 20 | out <- try({ 21 | rhdf5::h5read(file = file, name = name, index = index, start = start, 22 | stride = stride, block = block, count = count, 23 | compoundAsDataFrame = compoundAsDataFrame, 24 | callGeneric = callGeneric, read.attributes = read.attributes, 25 | drop = drop, ..., native = native) 26 | }, silent = TRUE) 27 | if (is(out, "try-error")) { 28 | ndim <- length(default_dim) 29 | if (ndim == 1L) { 30 | out <- rep(default_value, default_dim) 31 | } else { 32 | out <- array(default_value, default_dim) 33 | } 34 | } 35 | 36 | if (is.null(dim(out)) || length(dim(out)) == 1L) { 37 | na.it <- out %in% c("na", "NA", "null", "NULL") 38 | out[na.it] <- NA 39 | } 40 | 41 | out 42 | } 43 | -------------------------------------------------------------------------------- /inst/extdata/meta.yaml: -------------------------------------------------------------------------------- 1 | sources: 2 | - mouse 3 | - human 4 | files: 5 | human_gene: 6 | source: archs4 7 | name: human_matrix.h5 8 | url: https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5 9 | description: human gene-level counts 10 | human_gene_info: 11 | source: computed 12 | name: human_gene_augmented_info.csv.gz 13 | description: augmented human gene-level feature information 14 | human_gtf: 15 | source: ensembl 16 | name: Homo_sapiens.GRCh38.90.gtf.gz 17 | url: ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_sapiens.GRCh38.90.gtf.gz 18 | description: gtf used for human transcript annotations 19 | human_transcript: 20 | source: archs4 21 | name: human_hiseq_transcript_v2.h5 22 | url: https://s3.amazonaws.com/mssm-seq-matrix/human_hiseq_transcript_v2.h5 23 | description: human transcript-level counts 24 | human_transcript_info: 25 | source: computed 26 | name: human_transcript_augmented_info.csv.gz 27 | description: augmented human transcript-level feature information 28 | mouse_gene: 29 | source: archs4 30 | name: mouse_matrix.h5 31 | url: https://s3.amazonaws.com/mssm-seq-matrix/mouse_matrix.h5 32 | description: mouse gene-level counts 33 | mouse_gtf: 34 | source: ensembl 35 | name: Mus_musculus.GRCm38.90.gtf.gz 36 | url: ftp://ftp.ensembl.org/pub/release-90/gtf/mus_musculus/Mus_musculus.GRCm38.90.gtf.gz 37 | description: gtf used for mouse transcript annotations 38 | mouse_gene_info: 39 | source: computed 40 | name: mouse_gene_augmented_info.csv.gz 41 | description: augmented mouse gene-level feature information 42 | mouse_transcript: 43 | source: archs4 44 | name: mouse_hiseq_transcript_v2.h5 45 | url: https://s3.amazonaws.com/mssm-seq-matrix/mouse_hiseq_transcript_v2.h5 46 | description: mouse transcript-level counts 47 | mouse_transcript_info: 48 | source: computed 49 | name: mouse_transcript_augmented_info.csv.gz 50 | description: augmented mouse transcript-level feature information 51 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,Archs4Repository) 4 | S3method(tidy,DGEList) 5 | S3method(tidy,EList) 6 | S3method(tidy,SummarizedExperiment) 7 | export(Archs4Repository) 8 | export(archs4_feature_info) 9 | export(archs4_file_info) 10 | export(archs4_file_path) 11 | export(archs4_local_data_dir_create) 12 | export(archs4_local_data_dir_validate) 13 | export(archs4_meta) 14 | export(archs4_sample_covariates) 15 | export(archs4_sample_info) 16 | export(archs4_sample_table) 17 | export(archs4_series_status) 18 | export(archs4_sources) 19 | export(as.DGEList) 20 | export(blacklist) 21 | export(create_augmented_feature_info) 22 | export(datadir) 23 | export(estimate_repository_norm_factors) 24 | export(feature_info) 25 | export(feature_lookup) 26 | export(fetch_expression) 27 | export(file_info) 28 | export(file_path) 29 | export(geo_id_type) 30 | export(libstats) 31 | export(lookup_biosamples) 32 | export(lookup_gse) 33 | export(meta) 34 | export(query_geo) 35 | export(retrieve_sra_metadata) 36 | export(sample_covariates) 37 | export(sample_info) 38 | export(sample_table) 39 | export(series_status) 40 | export(sources) 41 | export(tidy) 42 | import(checkmate) 43 | import(dplyr) 44 | import(purrr) 45 | import(readr) 46 | import(tidyr) 47 | importFrom(SummarizedExperiment,assay) 48 | importFrom(SummarizedExperiment,assayNames) 49 | importFrom(SummarizedExperiment,colData) 50 | importFrom(SummarizedExperiment,rowData) 51 | importFrom(broom,tidy) 52 | importFrom(edgeR,DGEList) 53 | importFrom(edgeR,cpm) 54 | importFrom(readr,cols) 55 | importFrom(readr,read_csv) 56 | importFrom(readr,read_tsv) 57 | importFrom(readr,type_convert) 58 | importFrom(rentrez,entrez_fetch) 59 | importFrom(rentrez,entrez_search) 60 | importFrom(reshape2,melt) 61 | importFrom(rhdf5,h5ls) 62 | importFrom(rhdf5,h5read) 63 | importFrom(stats,setNames) 64 | importFrom(tibble,set_tidy_names) 65 | importFrom(utils,write.csv) 66 | importFrom(xml2,read_xml) 67 | importFrom(xml2,xml_attr) 68 | importFrom(xml2,xml_children) 69 | importFrom(xml2,xml_contents) 70 | importFrom(xml2,xml_find_all) 71 | importFrom(xml2,xml_find_first) 72 | importFrom(xml2,xml_ns_strip) 73 | importFrom(xml2,xml_text) 74 | importFrom(xml2,xml_validate) 75 | importFrom(yaml,read_yaml) 76 | -------------------------------------------------------------------------------- /man/fetch_expression.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/expression.R 3 | \name{fetch_expression} 4 | \alias{fetch_expression} 5 | \title{Retrieve expression data for genes/transcripts across samples} 6 | \usage{ 7 | fetch_expression(a4, features, samples = NULL, feature_type = "gene", 8 | source = NULL, feature_meta = "symbol", 9 | sample_meta = c("Sample_title", "Sample_source_name_ch1"), 10 | prior.count = 3, ...) 11 | } 12 | \arguments{ 13 | \item{a4}{An \code{Archs4Repository}} 14 | 15 | \item{features}{a \code{tibble} of feature descriptors (as returned from 16 | \code{\link[=feature_info]{feature_info()}}). Really this table simply needs the following columns: 17 | \itemize{ 18 | \item \code{"a4name"} or \code{"ensembl_id" or}; 19 | \item \code{"feature_type"}: (gene or transcript) 20 | \item \code{"source"} (human or mouse),; 21 | }} 22 | 23 | \item{samples}{a samples identifier: ie. a tibble with series_id and 24 | sample_id columns.} 25 | 26 | \item{source}{"mouse" or "human" pass this in explicitly if there is no 27 | "source" column in your \code{features} or \code{samples} data.frame} 28 | 29 | \item{feature_meta}{additional columns from 30 | \code{feature_info(a4, feature_type = feature_type)}} 31 | 32 | \item{sample_meta}{metadata information to include for the samples. These 33 | values are extracted from the \code{meta/VALUE} files in the respective hdf5 34 | data files. See \code{\link[=archs4_sample_info]{archs4_sample_info()}} for more details.} 35 | 36 | \item{type}{"gene" or "transcript"-level data?} 37 | } 38 | \value{ 39 | a data.frame of expression data. Each row is an observation 40 | (one gene one sample) 41 | } 42 | \description{ 43 | Returns the estimated counts for the genes/transcripts enumerated in the 44 | \code{features} table for the samples enumerated in the \code{samples} table. 45 | } 46 | \details{ 47 | Note that the values returned are simply estimated counts. They are not 48 | normalized by sequencing depth. For now, the only use for this function is 49 | to compare how ratios of genes compare across samples. 50 | } 51 | \examples{ 52 | a4 <- Archs4Repository() 53 | gnz <- feature_lookup(a4, c("CFAP65", "PECR")) 54 | gexprs <- fetch_expression(a4, gnz) 55 | } 56 | \seealso{ 57 | \code{\link[=as.DGEList]{as.DGEList()}} 58 | } 59 | -------------------------------------------------------------------------------- /tests/testthat/test-feature-info.R: -------------------------------------------------------------------------------- 1 | context("Feature Info") 2 | 3 | if (!exists("a4")) { 4 | # This is loaded by the testthat/helper-all.R script when testthat is running 5 | # the unit tests, but included here for convenience when doing interactive 6 | # test development 7 | a4 <- Archs4Repository() 8 | } 9 | 10 | test_that("augmented feature info largely concordant", { 11 | a4 <- Archs4Repository("/Users/lianoglou/workspace/data/archs4/v4") 12 | a5 <- Archs4Repository("/Users/lianoglou/workspace/data/archs4/v5") 13 | 14 | v4.fi <- archs4_feature_info("gene", "mouse") 15 | v5.fi <- archs4_feature_info("gene", "mouse") 16 | }) 17 | 18 | test_that("feature-level metadata retrieval works", { 19 | # retrieve gene-level feature information with unique symbols 20 | mg <- feature_info(a4, feature_type = "gene", source = "mouse", 21 | distinct_symbol = TRUE) 22 | 23 | # all a4name entries should be non NA and nchar() >= 1 24 | expect_true(all(nchar(mg$a4name) >= 1)) 25 | expect_true(all(!is.na(mg$a4name))) 26 | expect_is(mg$h5idx, "integer") 27 | expect_true(all(!is.na(mg$h5idx))) 28 | # no duplicated symbols 29 | # isna.symbol <- is.na(mg$symbol) | 30 | # tolower(mg$symbol) == "na" | 31 | # tolower(mg$symbol) == "null" 32 | # expect_equal(sum(duplicated(mg$symbol) & !isna.symbol), 0) 33 | # isna.ens <- is.na(mg$ensembl_id) | 34 | # tolower(mg$ensembl_id) == "na" | 35 | # tolower(mg$ensembl_id) == "null" 36 | expect_equal(sum(duplicated(mg$ensembl_id) & !is.na(mg$ensembl_id)), 0) 37 | 38 | # There are some entries that we couldn't get identifiers for, but the ones 39 | # we got should all be prefixed with the mouse prefix. 40 | mg.ens <- filter(mg, !is.na(ensembl_id)) 41 | expect_true(all(substr(mg.ens$ensembl_id, 1, 7) == "ENSMUSG")) 42 | 43 | 44 | hg <- feature_info(a4, feature_type = "gene", source = "human", 45 | distinct_symbol = TRUE) 46 | # all a4name entries should be non NA and nchar() >= 1 47 | expect_true(all(nchar(hg$a4name) >= 1)) 48 | expect_true(all(!is.na(hg$a4name))) 49 | expect_is(hg$h5idx, "integer") 50 | expect_true(all(!is.na(hg$h5idx))) 51 | # no duplicated symbols 52 | expect_equal(sum(duplicated(hg$symbol) & !is.na(hg$symbol)), 0) 53 | 54 | hg.ens <- filter(hg, !is.na(ensembl_id)) 55 | expect_true(all(substr(hg.ens$ensembl_id, 1, 4) == "ENSG")) 56 | }) 57 | -------------------------------------------------------------------------------- /vignettes/mystyles.css: -------------------------------------------------------------------------------- 1 | /************************ Callouts ********************************************/ 2 | /* Too bad you can't inhert in CSS, change one of these you should change all */ 3 | div.tip { 4 | margin: 2px 10px 10px 0px; 5 | min-height: 55px; 6 | padding: 2px 10px 2px 85px; 7 | background-position: 5px 5px, 68px 0px; 8 | background-image: url('images/icons/tip.png'), url('images/icons/callout-border.png'); 9 | background-repeat: no-repeat, repeat-y; 10 | width: 90%; 11 | } 12 | 13 | div.note { 14 | margin: 2px 10px 10px 0px; 15 | min-height: 55px; 16 | padding: 2px 10px 2px 85px; 17 | background-position: 5px 5px, 68px 0px; 18 | background-image: url('images/icons/note.png'), url('images/icons/callout-border.png'); 19 | background-repeat: no-repeat, repeat-y; 20 | width: 90%; 21 | } 22 | 23 | div.caution { 24 | margin: 2px 10px 10px 0px; 25 | min-height: 55px; 26 | padding: 2px 10px 2px 85px; 27 | background-position: 5px 5px, 68px 0px; 28 | background-image: url('images/icons/caution.png'), url('images/icons/callout-border.png'); 29 | background-repeat: no-repeat, repeat-y; 30 | width: 90%; 31 | } 32 | 33 | div.warning { 34 | margin: 2px 10px 10px 0px; 35 | min-height: 55px; 36 | padding: 2px 10px 2px 85px; 37 | background-position: 5px 5px, 68px 0px; 38 | background-image: url('images/icons/warning.png'), url('images/icons/callout-border.png'); 39 | background-repeat: no-repeat, repeat-y; 40 | width: 90%; 41 | } 42 | 43 | div.important { 44 | margin: 2px 10px 10px 0px; 45 | min-height: 55px; 46 | padding: 2px 10px 2px 85px; 47 | background-position: 5px 5px, 68px 0px; 48 | background-image: url('images/icons/important.png'), url('images/icons/callout-border.png'); 49 | background-repeat: no-repeat, repeat-y; 50 | width: 90%; 51 | } 52 | 53 | div.example { 54 | margin: 2px 10px 10px 0px; 55 | min-height: 55px; 56 | padding: 2px 10px 2px 85px; 57 | background-position: 5px 5px, 68px 0px; 58 | background-image: url('images/icons/example.png'), url('images/icons/callout-border.png'); 59 | background-repeat: no-repeat, repeat-y; 60 | width: 90%; 61 | } 62 | 63 | div.download { 64 | margin: 2px 10px 10px 0px; 65 | min-height: 55px; 66 | padding: 2px 10px 2px 85px; 67 | background-position: 5px 5px, 68px 0px; 68 | background-image: url('images/icons/download.png'), url('images/icons/callout-border.png'); 69 | background-repeat: no-repeat, repeat-y; 70 | width: 90%; 71 | } 72 | -------------------------------------------------------------------------------- /man/create_augmented_feature_info.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/archs4-features.R 3 | \name{create_augmented_feature_info} 4 | \alias{create_augmented_feature_info} 5 | \title{Create meta information for the genes and transcripts in the ARCHS4 dataset.} 6 | \usage{ 7 | create_augmented_feature_info(datadir = getOption("archs4.datadir")) 8 | } 9 | \arguments{ 10 | \item{datadir}{The directory that has the mouse and human expression 11 | hdf5 files. There will be \code{SPECIES_FEATURETYPE_augmented_info.csv.gz} files 12 | saved in this directory whe this function completes.} 13 | } 14 | \description{ 15 | This is a preprocessing function that is required to successfully build an 16 | \code{Archs4Repository}. It is not really intended for use during analyses. 17 | 18 | This Function creates \emph{all} of the feature-level CSV files for the features 19 | enumerated in the \code{meta/genes} gene-level hdf5 file, and the 20 | \code{meta/transcript} transcript identfiers in the transctipt-level hdf5 file for 21 | the mouse and human files found in \code{datadir}. 22 | 23 | \strong{In order for this to work} you have to download the approprate human and 24 | mouse gtf files from ensembl and save them in \code{datadir}. Reference the 25 | \code{\link[=archs4_local_data_dir_validate]{archs4_local_data_dir_validate()}} function. 26 | 27 | For the initial relesae of the ARCHS4 dataset, the 28 | \code{Homo_sapiens.GRCh38.90.gtf.gz} and \code{Mus_musculus.GRCm38.90.gtf.gz} were 29 | used. 30 | } 31 | \details{ 32 | This function will write the augmented transcript- and gene-level files in 33 | the \code{datadir}, using the following pattern: 34 | \code{__augmented_info.csv.gz} 35 | 36 | Gene symbols are the only piece of information provided for the row-level 37 | identifieres for the gene count matrices. Furthermore, the gene symbol used 38 | in mouse are in all uppercase, which is not how genes are referred to there. 39 | In order to augment the gene symbol information with gene-level identifiers 40 | and other information, we parse relatively recent GTFs provided by GENCODE. 41 | 42 | The fruits of the labor generated by this function are used by the 43 | \code{\link[=archs4_feature_info]{archs4_feature_info()}} function. 44 | 45 | Note that this function will replace already existing "\code{augmented}" files 46 | if the already exist in \code{datadir}. 47 | } 48 | \seealso{ 49 | \code{\link[=archs4_feature_info]{archs4_feature_info()}} 50 | } 51 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "README-" 12 | ) 13 | ``` 14 | 15 | # Overview 16 | 17 | The `archs4` package provides utility functions to query and explore the 18 | expression profiling data made available through the 19 | [ARCHS4 project][archs4web], which is described in the following publication: 20 | 21 | [Massive mining of publicly available RNA-seq data from human and mouse][archs4pub]. 22 | 23 | Because this package requires the user to download a number of data files that 24 | are external to the package, the [installation instructions](#installation) are 25 | *a bit* more involved than other R packages, and we leave them for 26 | [the end of this document](#installation). 27 | 28 | # Usage 29 | 30 | After [successful installation](#installation) of this package, you can query 31 | the series and samples included in the ARCHS4 repository, as well as materialize 32 | the expresion data into well-known bioconductor assay containers for downstream 33 | analysis. 34 | 35 | To query GEO series and samples, you can use the `sample_info` function: 36 | 37 | ```{r, message=FALSE, warning=FALSE} 38 | library(archs4) 39 | 40 | a4 <- Archs4Repository() 41 | ids <- c('GSE89189', 'GSE29943', "GSM1095128", "GSM1095129", "GSM1095130") 42 | sample.info <- sample_info(a4, ids) 43 | head(sample.info) 44 | ``` 45 | 46 | You can use the `as.DGEList` function to materialize an `edgeR::DGEList` from a 47 | an arbitrary number of GEO sample and series identifiers. The only restriction 48 | is that the data from the series/samples must all be from the same species. 49 | 50 | The most often use-case will likely be to create a `DGEList` for a given study. 51 | For instance, the GEO series identifier [`"GSE89189"`][blurtongeo] refers to the 52 | expression data generated to support the 53 | [Abud et al. iPSC-Derived Human Microglia-like Cells ...][blurtonpub] paper. 54 | 55 | Creating a `DGEList` from this study will create an object with 27,024 genes 56 | across 37 samples in about 1.5 seconds: 57 | 58 | ```{r, eval = FALSE} 59 | yg <- as.DGEList(a4, "GSE89189", feature_type = "gene") 60 | ``` 61 | 62 | The following command retrieves the 178,135 transcript level counts for this 63 | experiment in about 1.5 seconds, as well: 64 | 65 | ```{r, eval = FALSE} 66 | yt <- as.DGEList(a4, "GSE89189", feature_type = "transcript") 67 | ``` 68 | 69 | # Installation 70 | 71 | ```{r child = "inst/rmdparts/installation.Rmd"} 72 | ``` 73 | 74 | [//]: # (References ===========================================================) 75 | ```{r child = "inst/rmdparts/references.Rmd"} 76 | ``` 77 | -------------------------------------------------------------------------------- /man/archs4_sample_info.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R 3 | \name{sample_info} 4 | \alias{sample_info} 5 | \alias{archs4_sample_info} 6 | \title{Retrieves information for samples by GSE series or sample IDs} 7 | \usage{ 8 | sample_info(x, id, columns = c("Sample_title", "Sample_source_name_ch1"), 9 | check_missing_samples = TRUE, ...) 10 | 11 | archs4_sample_info(id, columns = c("Sample_title", 12 | "Sample_source_name_ch1"), sample_table = archs4_sample_table(datadir = 13 | datadir), sample_covariates = archs4_sample_covariates(datadir), 14 | check_missing_samples = TRUE, datadir = getOption("archs4.datadir"), 15 | ...) 16 | } 17 | \arguments{ 18 | \item{x}{an \code{Archs4Repository}} 19 | 20 | \item{id}{a character vector of GEO series or sample ids.} 21 | 22 | \item{columns}{the names of the sample metadata columns desired. This 23 | defaults to \code{c("Sample_title", "Sample_source_name_ch1")}. The values 24 | in \code{columns} must be a subset of the values enumerated in 25 | \code{\link[=archs4_sample_covariates]{archs4_sample_covariates()}}.} 26 | 27 | \item{check_missing_samples}{When \code{TRUE} (the default), this function will 28 | check every unique GEO series identifier (\code{"GSEnnnn"}) for missing samples 29 | by using an NCBI Rest service via a call to \code{\link[=archs4_series_status]{archs4_series_status()}}, 30 | and \code{\link[=lookup_gse]{lookup_gse()}}.} 31 | 32 | \item{sample_table}{the output from \code{\link[=archs4_sample_table]{archs4_sample_table()}}, which lists 33 | the series_id,sample_id combinations found in the ARCHS4 repository.} 34 | 35 | \item{sample_covariates}{the \code{data.frame}-definition of the sample covariates 36 | found in the ARCHS4 datasetes, which is constructed via a call to 37 | \code{\link[=archs4_sample_covariates]{archs4_sample_covariates()}}. The parameter is included in here so that 38 | a cached version of this \code{data.frame} can be re-used.} 39 | 40 | \item{datadir}{the directory that holds the archs4 data} 41 | } 42 | \value{ 43 | a tibble of series_id, sample_id, sample_h5idx, sample_title, and 44 | sample_name columns. If the query sample or series query can't be found, 45 | then there will be an \code{NA} value for these columns. The \code{query_type} column 46 | will indicat whether the row was returned from querying by series or by 47 | sample. 48 | } 49 | \description{ 50 | Fetch a tibble of series and sample information by querying the arcsh4 51 | dataset by GEO sample (GSE) or sample (GSM) ids. 52 | 53 | For each unique GEO series identifier (\code{"GSEnnnn"}), we will check if the 54 | ARCHS4 dataset is missing any of its samples when \code{check_missing_samples} 55 | is set to \code{TRUE} (default). 56 | } 57 | \examples{ 58 | si <- archs4_sample_info("GSE52564") # ben barres transcriptome db ... 59 | } 60 | -------------------------------------------------------------------------------- /R/tidy.R: -------------------------------------------------------------------------------- 1 | #' @noRd 2 | #' @export 3 | #' @importFrom edgeR cpm 4 | #' @importFrom reshape2 melt 5 | #' @importFrom broom tidy 6 | #' 7 | #' @method tidy DGEList 8 | tidy.DGEList <- function(x, normalized.lib.sizes = TRUE, prior.count = 3, ...) { 9 | mats <- list( 10 | cpm = cpm(x, normalized.lib.sizes = normalized.lib.sizes, 11 | log = TRUE, prior.count = prior.count), 12 | count = x$counts) 13 | 14 | .tidy.core(mats, genes = x$genes, samples = x$samples) 15 | } 16 | 17 | #' @noRd 18 | #' @export 19 | #' @importFrom SummarizedExperiment assay assayNames rowData colData 20 | #' @importFrom edgeR DGEList 21 | #' @method tidy SummarizedExperiment 22 | tidy.SummarizedExperiment <- function(x, assay_name = NULL, is_counts = TRUE, 23 | ...) { 24 | if (is.null(assay_name)) { 25 | assay_name <- assayNames(x)[1L] 26 | } 27 | assert_string(assay_name) 28 | assert_choice(assay_name, assayNames(x)) 29 | 30 | dat <- assay(x, assay_name) 31 | ginfo <- as.data.frame(rowData(x)) 32 | sinfo <- as.data.frame(colData(x)) 33 | 34 | if (is_counts) { 35 | y <- calcNormFactors(DGEList(dat, samples = sinfo, genes = ginfo)) 36 | out <- tidy(y, ...) 37 | } else { 38 | out <- .tidy.core(dat, genes = ginfo, samples = sinfo, ...) 39 | } 40 | 41 | out 42 | } 43 | 44 | #' @noRd 45 | #' @export 46 | #' @method tidy EList 47 | tidy.EList <- function(x, ...) { 48 | mats <- list(cpm = x$E) 49 | if (is.matrix(x$weights)) { 50 | mats$weight <- x$weights 51 | rownames(mats$weight) <- rownames(x) 52 | colnames(mats$weight) <- colnames(x) 53 | } else { 54 | names(mats)[1L] <- "value" 55 | } 56 | 57 | .tidy.core(mats, genes = x$genes, samples = x$targets) 58 | } 59 | 60 | .tidy.core <- function(mats, genes, samples, ...) { 61 | if (is.matrix(mats)) mats <- list(value = mats) 62 | stopifnot(is.list(mats)) 63 | stopifnot(all(sapply(mats, is.matrix))) 64 | assert_named(mats, type = "unique") 65 | 66 | rnames <- rownames(mats[[1]]) 67 | snames <- colnames(mats[[1]]) 68 | genes$.gene_id <- rnames 69 | gid.col <- sapply(genes, function(xx) all(xx == rnames)) 70 | gid.col <- colnames(genes)[which(gid.col)[1L]] 71 | if (gid.col != ".gene_id") genes$.gene_id <- NULL 72 | 73 | samples$.sample_id <- snames 74 | sid.col <- sapply(samples, function(xx) all(xx == snames)) 75 | sid.col <- colnames(samples)[which(sid.col)[1L]] 76 | if (sid.col != ".sample_id") samples$.sample_id <- NULL 77 | 78 | adat.all <- lapply(names(mats), function(mname) { 79 | m <- mats[[mname]] 80 | stopifnot(all.equal(rownames(m), rnames)) 81 | m <- melt(m) 82 | m <- transform(m, Var1 = as.character(Var1), Var2 = as.character(Var2)) 83 | colnames(m) <- c(gid.col, sid.col, mname) 84 | m 85 | }) 86 | adat <- do.call(cbind, adat.all) 87 | # if there were multiple matrices, there will be multiple sample_id columns 88 | # so we remove those 89 | adat <- adat[, !duplicated(colnames(adat))] 90 | out <- inner_join(adat, genes, by = gid.col) 91 | out <- inner_join(out, samples, by = sid.col) 92 | out 93 | } 94 | -------------------------------------------------------------------------------- /R/data-directory.R: -------------------------------------------------------------------------------- 1 | # These are lower-level functions that support "the health" of the local 2 | # datadir that is used to store the data required to drive a 3 | # LocalArchs4Repository 4 | 5 | #' Initialize a local datadir to act as an ARCHS4 data datadir 6 | #' 7 | #' @details 8 | #' A local datadir needs to be created and initialized (wth a `meta.yaml` 9 | #' file), to house ARCHS4 data for use in an Archs4Repository. This function 10 | #' creates that datadir and copies an initial `meta.yaml` file. 11 | #' 12 | #' Please refer to the vignette section "ARCHS4 Data Download" for more 13 | #' details. 14 | #' 15 | #' @export 16 | #' 17 | #' @param datadir the path to the datadir to create (or initialize) as an 18 | #' ARCHS4 data datadir. 19 | #' @param stop_if_exists by default, this function will `stop` if `datadir` 20 | #' already exists. Set this to `FALSE` to continue. Setting it to `FALSE` is 21 | #' convenient to initialize the target `datadir` with a `meta.yaml` file. 22 | #' If a `meta.yaml` file already exists in `datadir`, then this function 23 | #' will stop unconditionally. Move the `datadir/meta.yaml` out of the way 24 | #' if you simply want to refresh it with the default version. 25 | #' @return invisibly returns the path to the `meta.yaml` in the target 26 | #' `datadir` 27 | archs4_local_data_dir_create <- function(datadir = getOption("archs4.datadir"), 28 | stop_if_exists = TRUE) { 29 | assert_character(datadir) 30 | d.exists <- file.exists(datadir) 31 | 32 | meta.in <- system.file("extdata", "meta.yaml", package = "archs4", 33 | mustWork = TRUE) 34 | meta.to <- file.path(datadir, "meta.yaml") 35 | 36 | if (d.exists && !dir.exists(datadir)) { 37 | stop("Desired output datadir is already file(!): ", datadir) 38 | } 39 | 40 | if (d.exists) { 41 | if (stop_if_exists) { 42 | stop("Output datadir already exisits: ", datadir) 43 | } else { 44 | if (file.exists(meta.to)) { 45 | stop("meta.yaml file already exists in output datadir, ", 46 | "remove it if you want to replace it with the default meta.yaml") 47 | } 48 | } 49 | } else { 50 | parent.dir <- assert_directory(dirname(datadir), "w") 51 | dir.create(datadir) 52 | } 53 | file.copy(meta.in, meta.to) 54 | invisible(meta.to) 55 | } 56 | 57 | #' Check "the health" of a local ARCHS4 data datadir 58 | #' 59 | #' This function will notify the suer which files are missing from the 60 | #' ARCHS4 data datadir, and what course of action they can use to 61 | #' fix it. 62 | #' 63 | #' @export 64 | #' @param echo echo validation diagnostic message to stdout via [base::cat()] 65 | #' @param datadir the path to the datadir that stores local ARCHS4 data. 66 | #' Defaults to `getOption("archs4.datadir")`. 67 | #' @return A string that indicates "what's wrong", or `TRUE` if validation 68 | #' succeeds. 69 | archs4_local_data_dir_validate <- function(echo = TRUE, 70 | datadir = getOption("archs4.datadir") 71 | ) { 72 | msg <- character() 73 | if (!dir.exists(datadir)) { 74 | msg <- paste0( 75 | "datadir does not exists, run ", 76 | '`archs4_local_data_dir_create("', datadir, '")`\n') 77 | if (echo) cat(msg) 78 | return(invisible(msg)) 79 | } 80 | 81 | meta.fn <- file.path(datadir, "meta.yaml") 82 | if (!file.exists(meta.fn)) { 83 | msg <- paste( 84 | "meta.yaml file is missing from the data datadir, run ", 85 | "`archs4_local_data_dir_create(datadir, stop_if_exists = FALSE)`\n") 86 | if (echo) cat(msg) 87 | return(invisible(msg)) 88 | } 89 | 90 | finfo <- archs4_file_info(datadir) 91 | missing <- filter(finfo, source == "archs4" & !file_exists) 92 | if (nrow(missing)) { 93 | msg <- paste0( 94 | "The following ARCHS4 files are missing, please download them:\n", 95 | paste0( 96 | sprintf(" * %s: %s", missing[["name"]], missing[["url"]]), 97 | collapse = "\n")) 98 | if (echo) cat(msg) 99 | return(invisible(msg)) 100 | } 101 | 102 | missing <- filter(finfo, source == "ensembl" & !file_exists) 103 | if (nrow(missing)) { 104 | msg <- paste0( 105 | "The following ensembl files are missing, please download them:\n", 106 | paste0( 107 | sprintf(" * %s: %s", missing[["name"]], missing[["url"]]), 108 | collapse = "\n")) 109 | if (echo) cat(msg) 110 | return(invisible(msg)) 111 | } 112 | 113 | missing <- filter(finfo, source == "computed" & !file_exists) 114 | if (nrow(missing)) { 115 | header <- "The following computed files are missing:" 116 | filez <- paste(sprintf(" * %s\n", missing[["name"]]), collapse = "") 117 | advice <- paste0( 118 | "You can create them by running:\n", 119 | " `create_augmented_feature_info(\"", datadir, "\")`") 120 | msg <- sprintf("%s\n%s\n%s\n\n", header, filez, advice) 121 | if (echo) cat(msg) 122 | return(invisible(msg)) 123 | } 124 | 125 | TRUE 126 | } 127 | 128 | -------------------------------------------------------------------------------- /tests/testthat/test-sample-retrieval.R: -------------------------------------------------------------------------------- 1 | context("Sample and Covariate Retrieval") 2 | 3 | if (!exists("a4")) { 4 | # This is loaded by the testthat/helper-all.R script when testthat is running 5 | # the unit tests, but included here for convenience when doing interactive 6 | # test development 7 | a4 <- Archs4Repository() 8 | } 9 | 10 | 11 | test_that("All expected samples come back when queried by series", { 12 | series <- "GSE52564" 13 | expected <- sample_table(a4) %>% 14 | filter(series_id == series) 15 | 16 | info <- sample_info(a4, series) 17 | full <- expected %>% 18 | full_join(info, by = c("series_id", "sample_id")) 19 | 20 | expect_equal(nrow(full), nrow(expected)) 21 | }) 22 | 23 | test_that("(archs4_)sample_status identifies samples missing from GEO series", { 24 | missing.none <- "GSE52564" # The Ben Barres dataset has all samples in ARCHS4 25 | # The blurton jones dataset (GSE89189) used to be missing some, but those have 26 | # been filled out 27 | missing.some <- "GSE43366" # The Chiu et al. SOD1 datasets is missing some 28 | 29 | ss.none <- series_status(a4, missing.none) 30 | expect_true(all(ss.none[["in_archs4"]])) 31 | 32 | ss.some <- series_status(a4, missing.some) 33 | expect_true(!all(ss.some[["in_archs4"]])) 34 | }) 35 | 36 | test_that("sample_info call handlies missing IDs gracefully", { 37 | # GSE43366 should have 42 samples, but is missing 7 of them as of April 8, 2018 38 | # when we are using "v2" of the datasets. 39 | expected <- tibble( 40 | series_id = "GSE43366", 41 | sample_id = paste0("GSM10611", 43:84)) 42 | ex.missing <- c("GSM1061148", "GSM1061149", "GSM1061150", "GSM1061151", 43 | "GSM1061152", "GSM1061153", "GSM1061154") 44 | ex.present <- setdiff(expected$sample_id, ex.missing) 45 | 46 | # Tests a mix of existing and missing sample identifiers. 47 | # Missing samples should have NAs in a number of colums. One colume that is 48 | # always returned is `organism`. 49 | res <- expect_warning(sample_info(a4, expected$sample_id), "not found") 50 | 51 | # Expect that we have one row for each entry in `expected.all` 52 | # we don't join on series_id because the sample_id's that were queried for 53 | # and are missing come back with NA series_id values. 54 | xx <- expected %>% 55 | full_join(res, by = c("sample_id")) 56 | expect_equal(nrow(xx), nrow(expected)) 57 | expect_setequal(res$sample_id, expected$sample_id) 58 | 59 | found <- filter(res, !is.na(organism)) 60 | expect_setequal(found$sample_id, ex.present) 61 | 62 | missed <- filter(res, is.na(organism)) 63 | expect_setequal(missed$sample_id, ex.missing) 64 | 65 | # When we query for all missing samples, we still return a tibble of the same 66 | # form as res.all, with all NAs where you expect them to be. We don't expect 67 | # to throw an error. 68 | amiss <- expect_warning({ 69 | sample_info(a4, ex.missing) 70 | }, "not found") 71 | expect_setequal(ex.missing, amiss$sample_id) 72 | expect_true(all(is.na(amiss$organism))) 73 | }) 74 | 75 | # Tests that are no longer necessary in v4+ matrices (they were made for v2). 76 | # These tests: 77 | # 1. Looked for missing samples in certain seriesexercised 78 | # 2. Identified discordant sample covariates among the mouse and human datasets 79 | 80 | # test_that("(archs4_)sample_info warns when querying series with missing samples", { 81 | # missing.none <- "GSE52564" # The Ben Barres dataset has all samples in ARCHS4 82 | # missing.some <- "GSE89189" # The blurton jones iPSC paper is missing some 83 | # 84 | # # This series should have no missing samples 85 | # si.none <- expect_silent(sample_info(a4, missing.none)) 86 | # 87 | # # This series has a few missing samples 88 | # wregex <- sprintf("%s series .*missing samples", missing.some) 89 | # status.some <- expect_warning(sample_info(a4, missing.some), wregex) 90 | # 91 | # # as.DGEList should also warn when we are missng samples 92 | # y <- expect_warning(as.DGEList(a4, missing.some), wregex) 93 | # }) 94 | 95 | # The v4 data matrices have the same covariates in the data matrices among 96 | # the mouse and human data. 97 | # ------------------------------------------------------------------------------ 98 | # test_that("sample_info returns desired covariate columns", { 99 | # # the code here looks a bit convoluted because it should be cleaned/updated 100 | # # to support testing "universal" covariates, mouse- and human-only covariates 101 | # # as well. 102 | # 103 | # ids.all <- tribble( 104 | # ~id, ~type, ~organism, ~complete, 105 | # "GSE69354", "series", "mouse", TRUE, 106 | # "GSE79525", "series", "mouse", TRUE, 107 | # "GSE98041", "series", "mouse", TRUE, 108 | # "GSE85702", "series", "mouse", FALSE, 109 | # "GSE99095", "series", "human", FALSE, 110 | # "GSE88681", "series", "human", TRUE) 111 | # 112 | # ids.query <- tribble( 113 | # ~id, ~type, ~organism, 114 | # "GSE69354", "series", "mouse", 115 | # "GSE88681", "series", "human", 116 | # "GSM1095128", "sample", "mouse", 117 | # "GSM1095129", "sample", "mouse", 118 | # "GSM1095130", "sample", "mouse") 119 | # 120 | # def.cols <- c("Sample_title", "Sample_source_name_ch1") 121 | # extra.cols <- c("Sample_molecule_ch1", "Sample_treatment_protocol_ch1", 122 | # "Sample_description") 123 | # 124 | # # The human data have these covariates that are not in mouse: 125 | # # * Sample_contact_laboratory 126 | # # * Sample_description 127 | # # * Sample_supplementary_file_2 128 | # h.only <- c("Sample_contact_laboratory", "Sample_description", 129 | # "Sample_supplementary_file_2") 130 | # # 131 | # # The mouse data have these covariates thata are not in human: 132 | # # * Sample_contact_state 133 | # # * Sample_growth_protocol_ch1 134 | # # * Sample_treatment_protocol_ch1 135 | # m.only <- c("Sample_contact_state", "Sample_growth_protocol_ch1", 136 | # "Sample_treatment_protocol_ch1") 137 | # 138 | # all.cols <- c(def.cols, extra.cols) 139 | # info <- sample_info(a4, ids.query$id, all.cols) 140 | # 141 | # for (col in all.cols) { 142 | # expect_is(info[[col]], "character", info = col) 143 | # } 144 | # 145 | # info.m <- filter(info, organism == "mouse") 146 | # for (col in intersect(h.only, colnames(info))) { 147 | # expect_true(all(is.na(info.m[[col]])), info = col) 148 | # } 149 | # 150 | # info.h <- filter(info, organism == "human") 151 | # for (col in intersect(m.only, colnames(info))) { 152 | # expect_true(all(is.na(info.h[[col]])), info = col) 153 | # } 154 | # }) 155 | 156 | -------------------------------------------------------------------------------- /R/bioc-containers.R: -------------------------------------------------------------------------------- 1 | #' Create a DGEList for the expression data of a series or set of samples. 2 | #' 3 | #' 4 | #' 5 | #' @export 6 | #' @importFrom rhdf5 h5read 7 | #' @importFrom edgeR DGEList 8 | #' @seealso [fetch_expression()] 9 | #' 10 | #' @param id a vector of series or sample id's. 11 | #' @param features a feature-descriptor of the features you want to include 12 | #' counts for 13 | #' @param sample_columns the names of the sample covariates that are stored 14 | #' in the ARCHS4 Dataset; a complete list of what covariates are available 15 | #' in the ARCHS4 dataset is found using the [archs4_sample_covariates()] 16 | #' function. 17 | #' @param feature_type do you want `"gene"` or `"transcript"` level expression? 18 | #' @param row_id either `"ensembl"` or `"symbol"`. If this is `"ensembl"` and 19 | #' `feature_type == "transcript"`, then we remove the rows from the count 20 | #' dataset that we couldn't map symbol -> ensembl_gene_id for. 21 | #' @return a `DGEList` of results 22 | #' 23 | #' @examples 24 | #' a4 <- Archs4Repository() 25 | #' y <- as.DGEList(a4, "GSE89189", feature_type = "gene") 26 | as.DGEList <- function(x, id, features = NULL, 27 | sample_columns = c("Sample_title", "Sample_source_name_ch1"), 28 | feature_type = c("gene", "transcript"), 29 | row_id = c("ensembl", "symbol"), 30 | check_missing_samples = TRUE) { 31 | assert_class(x, "Archs4Repository") 32 | feature_type <- match.arg(feature_type) 33 | row_id <- match.arg(row_id) 34 | 35 | if (!is.null(features)) { 36 | if (is.character(features)) { 37 | features <- feature_lookup(x, features, type = type) 38 | if (any(is.na(features$a4name))) { 39 | warning("Removing 'not found' features from query") 40 | features <- filter(features, !is.na(a4name)) 41 | } 42 | } 43 | assert_data_frame(features, min.rows = 1L) 44 | assert_subset(c("ensembl_id", "a4name"), colnames(features)) 45 | # features <- distinct(features, ensembl_id, .keep_all = TRUE) 46 | # ensembl_id is something of a "second class citizen". The a4name is 47 | # what has been there in the ARCHS4 data since "the beginning" 48 | features <- distinct(features, ensembl_id, .keep_all = TRUE) 49 | } 50 | 51 | # Identify the unique samples that are being queried ------------------------- 52 | si <- sample_info(x, id, columns = sample_columns, 53 | check_missing_samples = check_missing_samples) 54 | si <- as.data.frame(si, stringsAsFactors = FALSE) 55 | si <- distinct(si, sample_id, .keep_all = TRUE) 56 | rownames(si) <- si$sample_id 57 | 58 | # Check for identifiers that were not found 59 | not.found <- filter(si, is.na(organism)) 60 | if (nrow(not.found)) { 61 | stop("The following samples could not be found: ", 62 | paste(sprintf("%s::%s", not.found$series_id, not.found$sample_id), 63 | collapse = "; ")) 64 | } 65 | 66 | # Check that user is asking for samples that are all from the same species 67 | org <- unique(si$organism) 68 | if (length(org) != 1L) { 69 | stop("You are querying across species") 70 | } 71 | 72 | # Fetch feature meta information --------------------------------------------- 73 | finfo <- feature_info(x, feature_type, org, augmented = TRUE) 74 | finfo <- as.data.frame(finfo, stringsAsFactors = FALSE) 75 | if (feature_type == "gene") { 76 | if (row_id == "ensembl") { 77 | finfo <- filter(finfo, !is.na(ensembl_id)) 78 | rownames(finfo) <- finfo[["ensembl_id"]] 79 | } else { 80 | rownames(finfo) <- finfo[["a4name"]] 81 | } 82 | } else { 83 | dup.ensid <- duplicated(finfo[["ensembl_id"]]) 84 | if (any(dup.ensid)) { 85 | warning("Duplicated ensembl identifiers when version is removed, ", 86 | "rownames maintain their versioned id") 87 | rownames(finfo) <- finfo[["ensembl_id_full"]] 88 | } else { 89 | rownames(finfo) <- finfo[["ensembl_id"]] 90 | } 91 | } 92 | 93 | if (!is.null(features)) { 94 | # cf. note about ensembl_id being a second class citizen 95 | # finfo <- semi_join(finfo, features, by = "ensembl_id") 96 | # finfo <- semi_join(finfo, features, by = "a4name") 97 | # semi_join (and filter(?)) strips rownames! 98 | finfo <- subset(finfo, a4name %in% features[["a4name"]]) 99 | } 100 | 101 | # Fetch the count data for the given samples --------------------------------- 102 | h5col <- paste0("sample_h5idx_", feature_type) 103 | isna <- is.na(si[[h5col]]) 104 | if (any(isna)) { 105 | msg <- paste0( 106 | "The following samples do not have ", feature_type, 107 | "-level quantitation and will not be included in the expression ", 108 | "container:\n ", 109 | paste(sprintf("%s::%s", si$series_id[isna], si$sample_id[isna]), 110 | collapse = "; ")) 111 | warning(msg, immediate. = TRUE) 112 | si <- si[!isna,,drop = FALSE] 113 | } 114 | 115 | if (nrow(si) == 0L) { 116 | stop("No samples left to assemble expression data") 117 | } 118 | 119 | # Pull out pre-calcuated lib.size and norm.factor values. This has the 120 | # added benefit of pre-emptively identifying samples with weird (huge) 121 | # numbers that turn to NAs due to integer rollover, ie. we get random 122 | # errors of this variety when reading from the HDF5 file: 123 | # 124 | # integer value -2^63 replaced NA. See the section 125 | # 'Large integer data types' in the 'rhdf5' vignette 126 | # for more details. 127 | # 128 | # This happens when we fish data out from the hdf5 file, but also we 129 | # ran into this problem when running the `estimate_repository_norm_factors` 130 | # function when generating these lib.size and norm.factor values 131 | libinfo <- sample_table(x) %>% 132 | select(series_id, sample_id, lib.size = libsize, norm.factors = normfactor, 133 | a4libsize) %>% 134 | distinct(sample_id, .keep_all = TRUE) 135 | libinfo <- select(si, series_id, sample_id) %>% 136 | left_join(libinfo, by = c("series_id", "sample_id")) 137 | na.overflow <- is.na(libinfo$lib.size) | is.na(libinfo$norm.factors) 138 | if (any(na.overflow)) { 139 | warning("Removing ", sum(na.overflow), " samples due to libsize NA overflow issues") 140 | libinfo <- filter(libinfo, !na.overflow) 141 | si <- subset(si, sample_id %in% libinfo$sample_id) 142 | } 143 | 144 | # avg.lsize <- mean(libinfo$lib.size, na.rm = TRUE) 145 | # avg.nfact <- mean(libinfo$norm.factors, na.rm = TRUE) 146 | # libinfo <- libinfo %>% 147 | # transform(lib.size = ifelse(is.na(lib.size), a4libsize, lib.size), 148 | # norm.factors = ifelse(is.na(norm.factors), avg.nfact, norm.factors)) 149 | 150 | rownames(si) <- si[["sample_id"]] 151 | 152 | counts <- local({ 153 | key <- paste(org, feature_type, sep = "_") 154 | h5.fn <- file_path(x, key) 155 | index <- list(finfo$h5idx, si[[h5col]]) 156 | cnts <- rhdf5::h5read(h5.fn, "data/expression", index=index) 157 | colnames(cnts) <- rownames(si) 158 | # cnts <- cnts[finfo$h5idx,,drop = FALSE] 159 | rownames(cnts) <- rownames(finfo) 160 | cnts 161 | }) 162 | 163 | out <- suppressWarnings(edgeR::DGEList(counts, genes = finfo, samples = si)) 164 | xref <- match(colnames(out), libinfo$sample_id) 165 | if (any(is.na(xref))) { 166 | stop("Problem matching sample_id to libinfo data.frame") 167 | } 168 | if (!all(colnames(out) == libinfo$sample_id[xref])) { 169 | stop("Mismatch in outgoing DGEList to libinfo data.frame") 170 | } 171 | out$samples$lib.size <- libinfo$lib.size[xref] 172 | out$samples$norm.factors <- libinfo$norm.factors[xref] 173 | out 174 | } 175 | 176 | -------------------------------------------------------------------------------- /inst/rmdparts/installation.Rmd: -------------------------------------------------------------------------------- 1 | The installation of the `archs4` package is a bit more involved than a standard 2 | package installation and can be roughly broken down into three steps. 3 | 4 | 1. Install the R package along with its dependencies. 5 | 2. Download a number of (large) data files into a specific folder. 6 | 3. Generate metadata from the files downloaded in (2) for downstream use. 7 | 8 | We will walk you through each step in this section. 9 | 10 | ## R Package Installation 11 | 12 | The `arcsh4` package depends on other packages that are available through both 13 | [CRAN][cran] and [Bioconductor][bioc]. For that reason, we will use the 14 | [`BiocInstaller::biocLite()`][biocLite] function to install this package, which 15 | can seamlessly install packages from github, CRAN, and Bioconductor. 16 | 17 | ```{r, eval = FALSE} 18 | source("https://bioconductor.org/biocLite.R") 19 | biocLite("denalitherapeutics/archs4", build_vignettes=TRUE) 20 | library("archs4") 21 | ``` 22 | 23 | When you first load the `archs4` library, you will notice a startup message 24 | telling you that something isn't quite right with your `archs4` installation. 25 | The message will look something like this: 26 | 27 | ``` 28 | Note that your default archs4 data directory is NOT setup correctly 29 | 30 | * Run `archs4_local_data_dir_validate()` to diagnose 31 | * Refer to the ARCHS4 Data Download section of the archs4 vignette for more information 32 | 33 | Your default archs4 data directory (`getOption("archs4.datadir")`) is: 34 | 35 | ~/.archs4data 36 | ``` 37 | 38 | In order for the package to work correctly, you must download a number of files 39 | which are enumerated in the [Data File Download](#data-file-download) section 40 | below into a single directory. You will then instruct the `archs4` package the 41 | path to the directory that holds all of these files by setting the value of R's 42 | global `"archs4.datadir`" option to be the path to that directory. 43 | 44 | ## Data File Download 45 | 46 | You will have to create a directory on your filesystem which will hold a number 47 | of data files that the `archs4` package depends on. Let's call this 48 | directory `$ARCHS4DIR`, which we will define here to be `~/archs4v6data`. 49 | 50 | The `archs4` package provides the `archs4_local_data_dir_create()` convenience 51 | function which creates this directory and copies over a `meta.yaml` file into 52 | that directory. The purpose of this file is to specify the names of the 53 | downloaded files that correspond to the human and mouse-level gene and 54 | transcript-level data. 55 | 56 | ```{r, eval = FALSE} 57 | library(archs4) 58 | archs4dir <- "~/archs4v6data" 59 | archs4_local_data_dir_create(archs4dir) 60 | ``` 61 | 62 | Once this directory is created successfully, you will then have to download the 63 | following files into it: 64 | 65 | ```{r, echo = FALSE, results = "asis"} 66 | sysdir <- system.file("extdata", package = "archs4") 67 | cat(archs4:::md_archs4_download_bullet_list(sysdir)) 68 | ``` 69 | 70 | The enumerated items above contain links to the files that need to be 71 | downloaded. You can right-click on them and select `Save As ...` and instruct 72 | your web-browser to save them to your local `$ARCHS4DIR`. 73 | 74 | **NOTE**: Most all of the `archs4` functions accept a `datadir` parameter, which 75 | should be the path to `$ARCHS4DIR`. For convenience, the default value of this 76 | parameter is always set to `getOption("archs4.datadir")`. This means that you 77 | can modify your `~/.Rprofile` file to set the value of this option to 78 | `"~/archs4v2data"` (for instance), so that the package will always look there 79 | by default. If this option is not set in your `~/.Rprofile`, the 80 | default value for this option is "~/.archs4data". 81 | 82 | ## Feature-Level Metadata Generation 83 | 84 | The datasets currently made available by the [ARCHS4 Project][archs4web] only 85 | provide minimal feature-level metadata: 86 | 87 | * the features in the gene-level datasets are identified only by their symbol; 88 | and 89 | * only the ensembl transcript id's are provided for the features in the 90 | transcript-level datasets 91 | 92 | We want to augment these features with richer annotations, such as the ensembl 93 | gene identifiers or gene biotypes, for instance. 94 | 95 | To make such data generation automatic and easy for the user, once you have 96 | downloaded the Ensembl GTF files listed above into the `$ARCHS4DIR`, you can 97 | run the `create_augmented_feature_info()` to extract these extra feature-level 98 | metadata from the GTF files and store them as tables inside `$ARCHS4DIR` for 99 | later use. 100 | 101 | ```{r eval = FALSE} 102 | create_augmented_feature_info(archs4dir) 103 | ``` 104 | 105 | This function will load and parse the GTF files from human and mouse, and 106 | create gene- and transcript-level `*.csv.gz` files in the `$ARCHS4DIR` which 107 | the `archs4` package will then later use downstream. 108 | 109 | Once your `$ARCHS4DIR` is setup, you may find it convenient to set the default 110 | value for R's global `"archs4.datadir"` option to the `$ARCHS4DIR` directory you 111 | just setup. To do so, you can put the following line in your `~/.Rprofile` file: 112 | 113 | ```r 114 | options(archs4.datadir = "~/archs4v2data") 115 | ``` 116 | 117 | ## Library Size and Normalization Factors 118 | 119 | It is often convenient to extract normalized versions of the count data from 120 | the gene-level expression matrices. In order to do this "on-the-fly" we provide 121 | the `estimate_repository_norm_factors()` functions, which accepts an 122 | `Archs4Repository` object and essentially performs the steps necessary to 123 | create edgeR::TMM-like normalization factors across the entire ARCSH4 expression 124 | atlas. 125 | 126 | In order to avoid laoding the entire expression matrix into memory, the 127 | `estimate_repository_norm_factors()` splits up the processing in batches, 128 | loading a subset of samples at a time until it completes. **This process will 129 | take quite some time** (around two hours on a modern laptop). 130 | 131 | Note that recent versions of the ARCHS4 data do provide `meta/reads_aligned` 132 | and `meta/total_reads` entries. We don't use those here, but we can compare 133 | the values their with what we calculate above. 134 | 135 | ```{r eval = FALSE} 136 | a4 <- Archs4Repository(archs4dir) 137 | estimate_repository_norm_factors(a4) 138 | ``` 139 | 140 | ## ARCHS4 Installation Heatlh 141 | 142 | Because the installation of this package is a bit more involved than most, 143 | we have also provided an `archs4_local_data_dir_validate()` function, which 144 | you can run over your `$ARCHS4DIR` in order to check on "the health" of your 145 | install. 146 | 147 | This function will simply look at your `$ARCHS4DIR` to ensure that the required 148 | files are there, and tries to give you helpful error messages if not. 149 | 150 | For instance, if the first two files enumerated in the 151 | [Data File Download](#data-file-download) section were missing from 152 | your `$ARCHS4DIR` (ie. `human_matrix.h5` and `human_hiseq_transcript_v2.h5`), 153 | you would be warned that "something isn't right" when you first load the 154 | `archs4` package. You could then run the `archs4_local_data_dir_validate()` 155 | to see what is wrong: 156 | 157 | ```{r eval = FALSE} 158 | archs4_local_data_dir_validate(archs4dir) 159 | #> The following ARCHS4 files are missing, please download them: 160 | #> * human_matrix.h5: https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5 161 | #> * human_hiseq_transcript_v2.h5: #> https://s3.amazonaws.com/mssm-seq-matrix/human_hiseq_transcript_v2.h5 162 | ``` 163 | 164 | **NOTE:** If all installation and data download/processing steps have been 165 | completed successfully, a call to `archs4_local_data_dir_validate()` will simply 166 | return `TRUE`. 167 | 168 | 169 | ## Package Development 170 | 171 | If you are developing this package, you will find that it will be convenient 172 | to symlink the package's default `archs4.datadir` path (`~/.arcsh4data`) to the 173 | `$ARCHS4DIR` you just setup. This is because often times things like roxygen2 174 | document compilation, unit testing, etc. happen in a vanilla R workspace, which 175 | won't run the configuration that is prescribed in your `~/.Rprofile` file. 176 | 177 | -------------------------------------------------------------------------------- /vignettes/archs4.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ARCHS4 Overview" 3 | author: "Steve Lianoglou" 4 | date: "`r Sys.Date()`" 5 | packages: archs4 6 | output: 7 | rmarkdown::html_document: 8 | toc: true 9 | toc_float: true 10 | vignette: > 11 | %\VignetteIndexEntry{ARCHS4 Overview} 12 | %\VignetteEngine{knitr::rmarkdown} 13 | %\VignetteEncoding{UTF-8} 14 | --- 15 | 16 | ```{r setup, include = FALSE} 17 | knitr::opts_chunk$set( 18 | collapse = TRUE, 19 | comment = "#>" 20 | ) 21 | ``` 22 | 23 | # Overview 24 | 25 | ARCHS4 is a project ([web site][archs4web] | [publication][archs4pub]) 26 | lead by [Alex Lachmann][lachmann] from [The Ma'ayan Lab][maayanlab] which aims 27 | to reprocesses all of the human and mouse Illumina based (RNA) sequencing data 28 | from [GEO][geo] and [SRA][sra] and make it available for exploratory data 29 | analysis. 30 | 31 | To facilitate that, the raw sequencing data is processed through the same 32 | quantitation pipeline, which consists of [kallisto][kallisto] for quantitation 33 | and ensembl gene annotations for the transcript models. These data are then 34 | made available: 35 | 36 | 1. as binary files [for download][archs4dl]; and 37 | 2. via an [interactive exploratory analysis tool][archs4eda]. 38 | 39 | The scope of this package, for now, is to simply make all of the gene- and 40 | transcript-level expression data easily queryable and retrievable to an analyst 41 | within R. 42 | 43 | In this vignette, we show you how to install the `archs4` package and its 44 | required data files as well as example usage scenarios. 45 | 46 | # Installation 47 | 48 | ```{r child = system.file("rmdparts/installation.Rmd", package = "archs4")} 49 | ``` 50 | 51 | # Usage 52 | 53 | Currently, the `archs4` package facilitates easy query and retrieval of the 54 | samples that have been processed by the project. 55 | 56 | After you have successfully: 57 | 58 | 1. installed the `archs4` R package; 59 | 2. downloaded and build the data dependencies into your `$ARCHS4DIR`; and 60 | 3. set the R `archs4.datadir` option to point to your `$ARCHS4DIR`, ie. 61 | `options("archs4.datadir" = "$ARCHS4DIR)")` 62 | 63 | the use of this package will be most straightforward if you first instantiate a 64 | reference object to the ARCHS4 data repository, like so: 65 | 66 | ```{r a4, warning=FALSE, message=FALSE} 67 | library(archs4) 68 | library(dplyr) 69 | a4 <- Archs4Repository() 70 | ``` 71 | 72 | The `a4` object is what we will use to query and retrieve processed data from. 73 | As of now, the archs4 repository contains 4000+ projects with 100,000+ samples 74 | from both human and mouse experiments: 75 | 76 | ```{r show-a4} 77 | show(a4) 78 | ``` 79 | 80 | ## ARCHS4 Sample Queries 81 | 82 | In order to retrieve expression data from the ARCHS4 `a4` object, you will need 83 | to identify a GEO series id ("GSEnnnnn"), or GEO sample id ("GSMnnnnn") of 84 | interest. 85 | 86 | For instance, the RNA-seq expression data that was used for the 87 | "[RNA-Sequencing Transcriptome and Splicing Database of Glia, Neurons, and Vascular Cells of the Cerebral Cortex][barrespub]" is accessible using the 88 | [GSE52564 GEO series accession number][barresgeo]. When you click through to the 89 | [GEO landing page][barresgeo] and scroll down, you will find a section of the 90 | page that enumerates the samples in the series as shown in the picture below: 91 | 92 | ![](images/GSE52564-samples-section.png) 93 | 94 | We can see that this GEO Series has seventeen (17) samples associated with it. 95 | 96 | Let's query our ARCHS4 repository using the `sample_info()` function to see if 97 | we have them. The `sample_info()` function accepts a character vector of 98 | GEO identifiers (series, samples, or a mix of both) and queries the ARCHS4 99 | repository to see if they are present in the dataset. The function will return 100 | a `data.frame` where each row represents a single sample, and the columns 101 | include different sample-level metadata for the sample, which you can customize 102 | via the `columns` argument: 103 | 104 | ```{r sample-query} 105 | samples <- sample_info(a4, "GSE52564", columns = "Sample_title") 106 | ``` 107 | 108 | This function returns a `data.frame` with `r nrow(samples)` rows, which matches 109 | the number of samples listed on this projects [GEO landing page][barresgeo]. 110 | Let's take a peak: 111 | 112 | ```{r sample-peak} 113 | select(samples, sample_id, Sample_title) 114 | ``` 115 | 116 | Use the `sample_covaraites()` function to identify the names of all of the 117 | sample-level metadata variables that are available from the mouse and human 118 | datasets: 119 | 120 | ```{r scov-available} 121 | sample_covariates(a4) %>% head() 122 | ``` 123 | 124 | Any combination of the entries listed in the `name` column of the `data.frame` 125 | above can be used in the `columns` parameter of the `sample_info()` function. 126 | Although most of these covariates are available in both human and mouse, not 127 | all of them are. The `mouse` and `human` column indicate which covariates are 128 | found in which datasets. 129 | 130 | ## Expression Data 131 | 132 | The `archs4` package provides an `as.DGEList` function, which will accept a 133 | vector of series or sample identifiers and materialize a gene- or 134 | transcript-level `edgeR::DGEList` that they correspond to. 135 | 136 | For instance, to create a gene-level `DGEList` for this project, we would: 137 | 138 | ```{r} 139 | yg <- as.DGEList(a4, "GSE52564", feature_type = "gene") 140 | ``` 141 | 142 | This call returns a `r nrow(yg)` (gene) x `r ncol(yg)` (sample) DGEList that you 143 | can then easily feed into a downstream differential expression analysis pipeline 144 | such as edgeR or voom. 145 | 146 | Setting `feature_type = "transcript"` in the `as.DGEList` call will give you a 147 | DGEList with transcript-level quantitation. 148 | 149 | Any combination of GEO Series and GEO Sample identifiers will work in this 150 | function call so long as they all reference data generated from the same 151 | organism. 152 | 153 | ## Missing Samples 154 | 155 | In your course of using this data resource, you might find that your query for 156 | a particular GEO Series identifier will return fewer samples than are enumerated 157 | on its landing page. There are a number of reasons why individual samples did 158 | not make it into the ARCHS4 dataset. 159 | 160 | Some missing data is due to errors in the ARCHS4 data processing pipeline, where 161 | they may have encountered problems downloading raw data or irregularities in 162 | processing (aligning) it. There are also more mundane reasons, such as a single 163 | GEO Series identifier may contain both RNA-seq and microarray data, the latter 164 | of which would not be included in ARCHS4 resource. 165 | 166 | The `series_status()` function will tell you if there are missing samples 167 | from a GEO Series. 168 | 169 | ```{r} 170 | s1 <- series_status(a4, "GSE52564") 171 | head(s1) 172 | ``` 173 | 174 | A `TRUE` value in the `in_archs4` column indicates that the sample is present 175 | in the ARCHS4 repository. 176 | 177 | In this particular case, we can verify that all samples in the `"GSE52564"` 178 | series are found in the ARCHS4 dataset, ie. `all(s1$in_archs4) == TRUE` 179 | 180 | Other GEO Series, however, are not complete. For instance, 181 | [GEO Series GSE89189][blurtongeo] should contain 43 samples, however there 182 | are a few (six) that are missing: 183 | 184 | ```{r} 185 | series_status(a4, "GSE89189") %>% 186 | filter(!in_archs4) 187 | ``` 188 | 189 | When we use functions like `sample_info()` or `as.DGEList` which expand 190 | GEO Series-level identifiers into the samples they contain, the default 191 | behavior is to call the `series_status()` function internally to check whether 192 | all of the samples for a GEO Series are present in the ARCHS4 dataset. If not, 193 | the user will be warned: 194 | 195 | ```{r} 196 | y2 <- as.DGEList(a4, "GSE89189", feature_type = "gene") 197 | ``` 198 | 199 | **NOTE:** The `series_status()` function makes a call to an NCBI web-service to 200 | query the samples for a given series, and therefore (i) requires an active 201 | internet connection; and (ii) may take *a little* time to complete. 202 | 203 | # Conclusion 204 | 205 | The `archs4` package provides a convenient way to quickly access the vast 206 | amount of data made available by the [ARCHS4 Project][archs4web]. The package 207 | will evolve over time to suit the needs of analysts who work with this great 208 | resource via their R workspace. 209 | 210 | While we, the authors of this package, are big fans the 211 | [ARCHS4 Project][archs4web], please note that this package is not affiliated 212 | with, nor endorsed by the creators of ARCHS4. As such, any issues, bugs, or 213 | comments should be directed towards the authors of this package, or filed under 214 | its issue tracker: 215 | 216 | https://github.com/denalitherapeutics/archs4/issues 217 | 218 | 219 | [//]: # (References ===========================================================) 220 | ```{r child = system.file("rmdparts/references.Rmd", package = "archs4")} 221 | ``` 222 | -------------------------------------------------------------------------------- /R/geo-utils.R: -------------------------------------------------------------------------------- 1 | #' Classify a vector of sample or series GEO ID's as such 2 | #' 3 | #' GEO series identifiers all start with GSE and sample identifiers all 4 | #' start with GSM. We use that to identify what types of identifiers are 5 | #' passed into `id` 6 | #' 7 | #' @export 8 | #' 9 | #' @param id a character vector of `GSEnnnnn` or `GSMnnnnn` ids 10 | #' @return a tibble of `unique(id)` indicating if the id is a `"series"` 11 | #' (GSEnnnnn), `"sample"` (GSMnnnnn), or `"unknown"`. 12 | geo_id_type <- function(id) { 13 | id <- assert_character(id) %>% unique 14 | type <- case_when( 15 | is_geo_series_id(id) ~ "series", 16 | is_geo_sample_id(id) ~ "sample", 17 | TRUE ~ "unknown") 18 | tibble(id = id, type = type) 19 | } 20 | 21 | 22 | #' Query NCBI GEO through its REST interface 23 | #' 24 | #' @export 25 | #' @importFrom xml2 read_xml xml_validate xml_ns_strip xml_contents xml_find_all 26 | #' xml_text 27 | #' @source https://www.ncbi.nlm.nih.gov/geo/info/download.html 28 | #' 29 | #' @param accession Scalar character, GEO identifier for a series (GSE), a 30 | #' sample (GSM) or a platform (GPL). 31 | #' @param validate Scalar boolean, validate the retrieved xml file against 32 | #' NCBI's schema? 33 | #' @return xml2::xml_document object 34 | #' @examples 35 | #' query_geo("GSE109171") 36 | query_geo <- function(accession, target = c("self", "gsm", "gpl", "gse", "all"), 37 | validate = FALSE, verbose = FALSE) { 38 | target = match.arg(target) 39 | geo_url <- sprintf( 40 | paste0("https://www.ncbi.nlm.nih.gov/geo/query/", 41 | "acc.cgi?acc=%s&targ=%s&view=%s&form=%s"), 42 | accession, target, "full", "xml") 43 | if (verbose) { 44 | message(sprintf("Retrieving %s", geo_url)) 45 | } 46 | res <- xml2::read_xml(geo_url) 47 | 48 | if (validate) { 49 | schema_url <- strsplit(xml2::xml_attr(res, attr = "schemaLocation"), 50 | split = " ", fixed = TRUE)[[1]][-1] 51 | valid_xml <- xml2::xml_validate(res, schema = xml2::read_xml(schema_url)) 52 | if (!valid_xml) { 53 | stop(sprintf("Validation with schema %s failed", schema_url)) 54 | } 55 | } 56 | return(res) 57 | } 58 | 59 | #' Retrieve information about a GEO series 60 | #' 61 | #' Queries NCBI GEO's REST interface to retrieve e.g. title, summary and the 62 | #' list of samples for a GEO series. 63 | #' 64 | #' @export 65 | #' @importFrom xml2 xml_contents xml_find_all xml_text 66 | #' @importFrom stats setNames 67 | #' 68 | #' @param accession Scalar character, GEO series identifier e.g. GSE109171 69 | #' @param fields Character vector specifying which fields to extract from the 70 | #' XML file returned by GEO 71 | #' @param ... Additional arguments passed on to the `query_geo` function. 72 | #' @return List the requested `fields` 73 | #' @examples 74 | #' if (interactive()) { 75 | #' lookup_gse("GSE109171") 76 | #' } 77 | lookup_gse <- function(accession, 78 | fields = c("Accession", "Title", "Summary", 79 | "Overall-Design", "Type", "Pubmed-ID", 80 | "Sample"), 81 | ...) { 82 | fields <- match.arg(fields, several.ok = TRUE) 83 | xml <- query_geo(accession, target = "gse", ...) %>% 84 | xml2::xml_ns_strip() 85 | series_fields <- setdiff(fields, "Sample") 86 | series <- purrr::map( 87 | setNames(series_fields, tolower(series_fields)), 88 | .f = function(field) { 89 | xml %>% 90 | xml2::xml_find_first(xpath = "Series") %>% 91 | xml2::xml_find_first(field) %>% 92 | xml2::xml_text(trim = TRUE) 93 | }) 94 | sample_fields <- setdiff(fields, series_fields) 95 | samples <- purrr::map( 96 | setNames(sample_fields, tolower(sample_fields)), 97 | .f = function(field) { 98 | xml %>% 99 | xml2::xml_find_all(xpath = "Sample") %>% 100 | xml2::xml_text(trim = TRUE) 101 | }) 102 | append(series, samples) 103 | } 104 | 105 | #' Retrieve sample annotations from NCBI's Biosample database 106 | #' 107 | #' This function uses the `rentrez` package to retrieve sample annotations 108 | #' from NCBI's Biosample database. 109 | #' 110 | #' @export 111 | #' @importFrom rentrez entrez_search entrez_fetch 112 | #' @importFrom xml2 read_xml xml_find_all xml_text xml_attr xml_find_first 113 | #' xml_children 114 | #' @importFrom tibble set_tidy_names 115 | #' @importFrom readr type_convert cols 116 | #' @source https://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESearch_ 117 | #' 118 | #' @param x Character vector of sample identifiers to search the Biosample 119 | #' database for. Typically either `BioSample (SAMN)`, `SRA (SRS)` or 120 | #' `GEO (GSM)` accession numbers. 121 | #' @param retmax Scalar integer, the maximum number of (total) matches to 122 | #' retrieve from Entrez. See 123 | #' \url{https://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESearch_} 124 | #' for details. The number of records that can be retrieved in one query 125 | #' must be < 100,000. 126 | #' @return A tbl_df data.frame with sample annotations. Column names and numbers 127 | #' vary depending on the attributes available in the Biosample database. 128 | #' @examples 129 | #' if (interactive()) { 130 | #' # BioSample identifiers 131 | #' lookup_biosamples(c("GSM1947162", "GSM1947179")) 132 | #' # mixed SRS and GSM identifiers 133 | #' lookup_biosamples(c("SRS1171537", "SRS1171536", "GSM1947179")) 134 | #' # mixed samples from two different studies (with different attributes) 135 | #' lookup_biosamples(c("SRS1171537", "SRS1271536")) 136 | #' } 137 | lookup_biosamples <- function(x, retmax = 1e5 - 1L) { 138 | if (retmax >= 1e5) stop("retmax must be < 100,000") 139 | 140 | search_results <- rentrez::entrez_search( 141 | db = "biosample", retmax = retmax, 142 | term = sprintf("%s[ACCN]", paste(x, collapse = "[ACCN] OR "))) 143 | rentrez::entrez_fetch(db = "biosample", id = search_results$ids, 144 | rettype = "xml", retmax = retmax, 145 | parsed = FALSE) %>% 146 | xml2::read_xml() %>% 147 | xml2::xml_children() %>% 148 | purrr::map_df(.f = function(x) { 149 | tibble::data_frame( 150 | Title = xml2::xml_find_first( 151 | x, 152 | xpath = ".//Description/Title") %>% 153 | xml2::xml_text(trim = TRUE), 154 | OrganismName = xml2::xml_find_first( 155 | x, 156 | xpath = ".//Description/Organism/OrganismName") %>% 157 | xml2::xml_text(trim = TRUE), 158 | taxonomy_id = xml2::xml_find_all( 159 | x, 160 | xpath = ".//Description/Organism") %>% 161 | xml2::xml_attr("taxonomy_id"), 162 | BioSample = xml2::xml_find_first( 163 | x, 164 | xpath = './/Ids/Id[@db="BioSample"]') %>% 165 | xml2::xml_text(trim = TRUE), 166 | SRA = xml2::xml_find_first( 167 | x, 168 | xpath = './/Ids/Id[@db="SRA"]') %>% 169 | xml2::xml_text(trim = TRUE), 170 | GEO = xml2::xml_find_first( 171 | x, 172 | xpath = './/Ids/Id[@db="GEO"]') %>% 173 | xml2::xml_text(trim = TRUE), 174 | value = xml2::xml_find_all( 175 | x, 176 | xpath = ".//Attribute") %>% 177 | xml2::xml_text(trim = TRUE), 178 | key = xml2::xml_find_all( 179 | x, 180 | xpath = ".//Attribute") %>% 181 | xml2::xml_attr("attribute_name") 182 | ) 183 | }) %>% 184 | dplyr::distinct() %>% 185 | tidyr::spread(key = key, value = value) %>% 186 | tibble::set_tidy_names(syntactic = TRUE, quiet = TRUE) %>% 187 | dplyr::rename_all(tolower) %>% 188 | readr::type_convert(col_types = readr::cols()) 189 | } 190 | 191 | #' Retrieve metadata for an SRA accession 192 | #' 193 | #' This function uses the EBI's or the NCBI's REST APIs to retrieve information 194 | #' about SRA data. 195 | #' Study accessions (ERP, SRP, DRP, PRJ prefixes), experiment accessions 196 | #' (ERX, SRX, DRX prefixes), sample accessions (ERS, SRS, DRS, SAM prefixes) 197 | #' and run accessions (ERR, SRR, DRR prefixes) can be supplied. 198 | #' For more information see \url{http://www.ebi.ac.uk/ena/browse/file-reports} 199 | #' @param x SRA identifier 200 | #' @param from Scalar character, specifying either \code{ncbi} or \code{ena} as 201 | #' the source database 202 | #' @return A tbl_df data.frame 203 | #' @note The output data.frame will be different for the two source databases. 204 | #' @importFrom readr read_tsv read_csv cols 205 | #' @export 206 | #' @examples 207 | #' if (interactive()) { 208 | #' # retrieve study annotations 209 | #' retrieve_sra_metadata("SRP066489") 210 | #' # paired-end samples 211 | #' retrieve_sra_metadata("PRJEB2054", "ena") %>% 212 | #' dplyr::filter(sample_accession == "SAMEA728920") 213 | #' } 214 | retrieve_sra_metadata <- function(x, from = c("ena", "ncbi")) { 215 | from <- match.arg(from) 216 | runinfo <- switch( 217 | from, 218 | ncbi = { 219 | sra_url <- sprintf( 220 | paste0("https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?", 221 | "save=efetch&rettype=runinfo&db=sra&term=%s"), x) 222 | readr::read_csv(url(sra_url), col_types = readr::cols()) 223 | }, 224 | ena = { 225 | sra_url <- sprintf(paste0( 226 | "http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=%s", 227 | "&result=read_run"), x) 228 | readr::read_tsv(url(sra_url), col_types = readr::cols()) 229 | } 230 | ) 231 | return(runinfo) 232 | } 233 | -------------------------------------------------------------------------------- /R/Archs4Repository.R: -------------------------------------------------------------------------------- 1 | #' An interface to a locally downloaded ARCHS4 dataset 2 | #' 3 | #' This instantiates an object that acts as a central broker to handle queries 4 | #' against the ARCHS4 dataset. Please refer to the vignette for instructions 5 | #' on how to setup a local directory to act as an Archs4Repository. 6 | #' 7 | #' @export 8 | #' 9 | #' @param datadir The directory that stores the ARCHS4 data. 10 | #' @return an Arhcs4DataSet object 11 | Archs4Repository <- function(datadir = getOption("archs4.datadir")) { 12 | kosher.dir <- archs4_local_data_dir_validate(echo = FALSE, datadir) 13 | if (!isTRUE(kosher.dir)) { 14 | stop(kosher.dir, call. = FALSE) 15 | } 16 | 17 | asi <- archs4_sample_table(datadir = datadir) 18 | gstats <- asi %>% 19 | filter(!is.na(sample_h5idx_gene)) %>% 20 | group_by(organism) %>% 21 | summarize(nseries = length(unique(series_id)), 22 | nsamples = length(unique(sample_id))) %>% 23 | mutate(feature_type = "gene") 24 | 25 | tstats <- asi %>% 26 | filter(!is.na(sample_h5idx_transcript)) %>% 27 | group_by(organism) %>% 28 | summarize(nseries = length(unique(series_id)), 29 | nsamples = length(unique(sample_id))) %>% 30 | mutate(feature_type = "transcript") 31 | 32 | out <- list( 33 | datadir = datadir, 34 | file_info = archs4_file_info(datadir), 35 | meta = archs4_meta(datadir), 36 | sample_covariates = archs4_sample_covariates(datadir), 37 | sample_stats = bind_rows(gstats, tstats), 38 | sample_table = asi) 39 | 40 | # You *know* we're going to make a "remote" or "service" version of an 41 | # Archs4Repository, in due time ... 42 | class(out) <- c("LocalArchs4Repository", "Archs4Repository") 43 | out 44 | } 45 | 46 | #' @export 47 | #' @method print Archs4Repository 48 | print.Archs4Repository <- function(x, ...) { 49 | cat(format(x, ...), "\n") 50 | } 51 | 52 | format.Archs4Repository <- function(x, ...) { 53 | asi <- sample_table(x) 54 | 55 | out <- paste0( 56 | "===========================================================\n", 57 | paste(class(x)[1L], "object\n"), 58 | "-----------------------------------------------------------\n", 59 | "datadir: ", x$datadir, "\n\n", 60 | "mouse data:\n", 61 | "----------\n", 62 | " gene: ", nseries(x, "gene", "mouse"), " series; ", 63 | nsamples(x, "gene", "mouse"), " samples\n", 64 | " transcript: ", nseries(x, "transcript", "mouse"), " series; ", 65 | nsamples(x, "transcript", "mouse"), " samples\n\n", 66 | "human data:\n", 67 | "-----------\n", 68 | " gene: ", nseries(x, "gene", "human"), " series; ", 69 | nsamples(x, "gene", "human"), " samples\n", 70 | " transcript: ", nseries(x, "transcript", "human"), " series; ", 71 | nsamples(x, "transcript", "human"), " samples\n", 72 | "===========================================================\n") 73 | } 74 | 75 | nsamples <- function(x, feature_type = c("gene", "transcript"), 76 | source = archs4_sources()) { 77 | assert_class(x, "Archs4Repository") 78 | feature_type <- match.arg(feature_type) 79 | source <- match.arg(source) 80 | take <- x$sample_stats$organism == source & 81 | x$sample_stats$feature_type == feature_type 82 | x$sample_stats$nsamples[take] 83 | } 84 | 85 | nseries <- function(x, feature_type = c("gene", "transcript"), 86 | source = archs4_sources()) { 87 | assert_class(x, "Archs4Repository") 88 | feature_type <- match.arg(feature_type) 89 | source <- match.arg(source) 90 | take <- x$sample_stats$organism == source & 91 | x$sample_stats$feature_type == feature_type 92 | x$sample_stats$nseries[take] 93 | } 94 | 95 | #' Retrieves the directory that contains the data for the Archs4Repository 96 | #' 97 | #' @export 98 | #' @param x an `Archs4Repository` 99 | datadir <- function(x, ...) { 100 | assert_class(x, "Archs4Repository") 101 | x$datadir 102 | } 103 | 104 | #' @export 105 | #' @rdname archs4_feature_info 106 | #' @param x an `Archs4Repository` 107 | feature_info <- function(x, feature_type = "gene", source = "human", 108 | augmented = TRUE, ...) { 109 | assert_class(x, "Archs4Repository") 110 | assert_choice(feature_type, c("gene", "transcript")) 111 | assert_choice(source, sources(x)) 112 | archs4_feature_info(feature_type, source, augmented, datadir(x), ...) 113 | } 114 | 115 | #' Perform a loose/fuzzy lookup for a gene/transcript feature. 116 | #' 117 | #' This funciton facilitates exploratory data analyses by trying to find gene 118 | #' or transcripts by different type of identifiers (symbol, ensembl_id, etc). 119 | #' 120 | #' @export 121 | #' @param x An Archs4Repository 122 | #' @param query a character string of feature names to look for 123 | #' @param feature_type "gene" or "transcript" 124 | #' @param source organism dataset to lookup 125 | #' @return a tibble of features that match against the query. The first column 126 | #' is the value of the query itself. If no match is found for a query, its 127 | #' row is all `NA`. 128 | #' @examples 129 | #' a4 <- Archs4Repository() 130 | #' features <- feature_lookup(a4, c("CFAP65", "PECR", "ENSG00000131408"), 131 | #' feature_type = "gene", source ="human") 132 | feature_lookup <- function(x, query, feature_type = "gene", 133 | source = "human", ...) { 134 | assert_class(x, "Archs4Repository") 135 | assert_character(query, min.len = 1L, any.missing = FALSE) 136 | assert_choice(feature_type, c("gene", "transcript")) 137 | query <- unique(query) 138 | 139 | fi <- feature_info(x, feature_type, source, ...) 140 | if (feature_type == "gene") { 141 | search <- c("ensembl_id", "symbol", "entrez_id", "a4external") 142 | } else { 143 | search <- c("ensembl_id", "gene_id", "symbol") 144 | } 145 | search <- intersect(search, colnames(fi)) 146 | 147 | idxs <- sapply(query, function(qry) .fuzzy_lookup(fi, search, qry)) 148 | isna <- is.na(idxs) 149 | if (any(isna)) { 150 | warning("The following ", feature_type, " queries were not found", 151 | paste(query[isna], paste = ",")) 152 | # idxs <- idxs[!isna] 153 | # query <- query[!isna] 154 | } 155 | bind_cols( 156 | tibble(query = query), 157 | fi[idxs,]) 158 | } 159 | 160 | .fuzzy_lookup <- function(x, columns, query) { 161 | assert_data_frame(x) 162 | assert_subset(columns, colnames(x)) 163 | assert_string(query) 164 | query <- tolower(query) 165 | 166 | idx <- NA_integer_ 167 | for (cname in columns) { 168 | vals <- tolower(x[[cname]]) 169 | i <- which(vals == query) 170 | if (length(i) > 1) { 171 | warning("More than one row for `", query, "` found -- taking first one", 172 | call. = TRUE) 173 | idx <- i[1L] 174 | break 175 | } else if (length(i) == 1) { 176 | idx <- i 177 | } 178 | } 179 | if (is.null(idx)) { 180 | warning("Could not find feature using provided query: `", oquery, "`", 181 | immediate. = TRUE) 182 | } 183 | idx 184 | } 185 | 186 | 187 | #' @export 188 | #' @rdname archs4_file_info 189 | #' @param x an `Archs4Repository` 190 | file_info <- function(x) { 191 | assert_class(x, "Archs4Repository") 192 | x[["file_info"]] 193 | } 194 | 195 | #' @export 196 | #' @rdname archs4_file_path 197 | #' @param x an `Archs4Repository` 198 | file_path <- function(x, key) { 199 | assert_class(x, "Archs4Repository") 200 | archs4_file_path(key, file_info = file_info(x), datadir = datadir(x)) 201 | } 202 | 203 | #' Extract the read depth and normalization factors for the samples 204 | #' 205 | #' @export 206 | #' @param x an `Archs4Repository` 207 | #' @param with_a4libsize If `TRUE`, includes an `a4libsize` column, which 208 | #' was extracted from the `meta/reads_aligned` hdf5 file. Defaults to `FALSE`. 209 | #' @return a tibble with sample_id, a4libsize, libsize, normfactor 210 | libstats <- function(x, with_a4libsize = FALSE) { 211 | assert_class(x, "Archs4Repository") 212 | cols <- c("libsize", "normfactor") 213 | if (with_a4libsize) cols <- c("a4libsize", cols) 214 | sample_table(x) %>% 215 | select(sample_id, !!cols) %>% 216 | distinct(sample_id, .keep_all = TRUE) 217 | } 218 | 219 | #' @export 220 | #' @rdname archs4_sample_table 221 | #' @param x an `Archs4Repository` 222 | sample_table <- function(x, ...) { 223 | assert_class(x, "Archs4Repository") 224 | x$sample_table 225 | } 226 | 227 | #' @export 228 | #' @rdname archs4_series_status 229 | #' @param x an `Archs4Repository` 230 | series_status <- function(x, id, ...) { 231 | assert_class(x, "Archs4Repository") 232 | archs4_series_status(id, sample_table = sample_table(x), datadir = datadir(x)) 233 | } 234 | 235 | #' @export 236 | #' @rdname archs4_meta 237 | #' @param x an `Archs4Repository` 238 | meta <- function(x) { 239 | assert_class(x, "Archs4Repository") 240 | x$meta 241 | } 242 | 243 | #' @export 244 | #' @rdname archs4_sources 245 | #' @param x an `Archs4Repository` 246 | sources <- function(x) { 247 | assert_class(x, "Archs4Repository") 248 | meta(x)[["sources"]] 249 | } 250 | 251 | #' @export 252 | #' @rdname archs4_sample_covariates 253 | #' @param x an `Archs4Repository` 254 | sample_covariates <- function(x, ...) { 255 | assert_class(x, "Archs4Repository") 256 | x$sample_covariates 257 | } 258 | 259 | #' @export 260 | #' @rdname archs4_sample_info 261 | #' @param x an `Archs4Repository` 262 | sample_info <- function(x, id, 263 | columns = c("Sample_title", "Sample_source_name_ch1"), 264 | check_missing_samples = TRUE, ...) { 265 | assert_class(x, "Archs4Repository") 266 | archs4_sample_info(id, columns, sample_table(x), sample_covariates(x), 267 | check_missing_samples, datadir(x), ...) 268 | } 269 | 270 | -------------------------------------------------------------------------------- /R/archs4-features.R: -------------------------------------------------------------------------------- 1 | #' Create meta information for the genes and transcripts in the ARCHS4 dataset. 2 | #' 3 | #' @description 4 | #' This is a preprocessing function that is required to successfully build an 5 | #' `Archs4Repository`. It is not really intended for use during analyses. 6 | #' 7 | #' This Function creates *all* of the feature-level CSV files for the features 8 | #' enumerated in the `meta/genes` gene-level hdf5 file, and the 9 | #' `meta/transcript` transcript identfiers in the transctipt-level hdf5 file for 10 | #' the mouse and human files found in `datadir`. 11 | #' 12 | #' **In order for this to work** you have to download the approprate human and 13 | #' mouse gtf files from ensembl and save them in `datadir`. Reference the 14 | #' [archs4_local_data_dir_validate()] function. 15 | #' 16 | #' For the initial relesae of the ARCHS4 dataset, the 17 | #' `Homo_sapiens.GRCh38.90.gtf.gz` and `Mus_musculus.GRCm38.90.gtf.gz` were 18 | #' used. 19 | #' 20 | #' @details 21 | #' This function will write the augmented transcript- and gene-level files in 22 | #' the `datadir`, using the following pattern: 23 | #' `__augmented_info.csv.gz` 24 | #' 25 | #' Gene symbols are the only piece of information provided for the row-level 26 | #' identifieres for the gene count matrices. Furthermore, the gene symbol used 27 | #' in mouse are in all uppercase, which is not how genes are referred to there. 28 | #' In order to augment the gene symbol information with gene-level identifiers 29 | #' and other information, we parse relatively recent GTFs provided by GENCODE. 30 | #' 31 | #' The fruits of the labor generated by this function are used by the 32 | #' [archs4_feature_info()] function. 33 | #' 34 | #' Note that this function will replace already existing "`augmented`" files 35 | #' if the already exist in `datadir`. 36 | #' 37 | #' @export 38 | #' @importFrom utils write.csv 39 | #' @seealso [archs4_feature_info()] 40 | #' 41 | #' @param datadir The directory that has the mouse and human expression 42 | #' hdf5 files. There will be `SPECIES_FEATURETYPE_augmented_info.csv.gz` files 43 | #' saved in this directory whe this function completes. 44 | create_augmented_feature_info <- function(datadir = getOption("archs4.datadir")) { 45 | if (!requireNamespace("GenomicRanges", quietly = TRUE)) { 46 | stop("GenomicRanges package is required to create augmented feature tables") 47 | } 48 | if (!requireNamespace("rtracklayer", quietly = TRUE)) { 49 | stop("rtracklayer package is required to create augmented feature tables") 50 | } 51 | 52 | # Ensure that the appropriate GTF files exist 53 | sources <- archs4_sources() 54 | source.keys <- paste0(sources, "_gtf") 55 | gtfs <- archs4_file_path(source.keys, stop_if_missing = FALSE, 56 | datadir = datadir) 57 | if (any(is.na(gtfs))) { 58 | stop("Some GTF files are missing, please run ", 59 | "`archs4_local_data_dir_validate()` for further information") 60 | } 61 | 62 | for (s in sources) { 63 | message("Constructing ", s, " augmented annotations ...") 64 | gtf.fn <- gtfs[paste0(s, "_gtf")] 65 | tx.fn <- local({ 66 | key <- paste0(s, "_transcript_info") 67 | fn <- suppressWarnings( 68 | archs4_file_path(key, stop_if_missing = FALSE, na_missing = FALSE, 69 | datadir = datadir) 70 | ) 71 | sub("\\.gz$", "", fn) 72 | }) 73 | gn.fn <- local({ 74 | key <- paste0(s, "_gene_info") 75 | fn <- suppressWarnings( 76 | archs4_file_path(key, stop_if_missing = FALSE, na_missing = FALSE, 77 | datadir = datadir) 78 | ) 79 | sub("\\.gz$", "", fn) 80 | }) 81 | 82 | gr <- rtracklayer::import.gff(gtf.fn) 83 | 84 | a4.tinfo <- archs4_feature_info("transcript", s, augmented = FALSE) 85 | txinfo <- .augmented_transcript_info(gr, a4.tinfo) 86 | write.csv(txinfo, tx.fn, row.names = FALSE) 87 | system(paste("gzip", tx.fn)) 88 | 89 | a4.ginfo <- archs4_feature_info("gene", s, augmented = FALSE) 90 | ginfo <- .augmented_gene_info(gr, a4.ginfo) 91 | write.csv(ginfo, gn.fn, row.names = FALSE) 92 | system(paste("gzip", gn.fn)) 93 | message("") 94 | } 95 | 96 | } 97 | 98 | #' Helper function that creats augmented transcript information file. 99 | #' 100 | #' @noRd 101 | #' 102 | #' @param gr a GRanges object from .load_gtf, we assume this is enembl gtfs 103 | #' @param a4.tinfo The "raw" information that the ARCHS4 data stores for its 104 | #' transcripts, ie. the output from 105 | #' @return a decorated `a4.tinfo` tibble of augmented transcript information. 106 | .augmented_transcript_info <- function(gr, a4.tinfo) { 107 | requireNamespace("GenomicRanges", quietly = TRUE) 108 | dfa <- GenomicRanges::as.data.frame(gr) 109 | 110 | tx.info.all <- filter(dfa, type == "transcript") 111 | tx.info <- select(tx.info.all, transcript_id, transcript_biotype, 112 | gene_name, gene_id, gene_biotype, gene_source, 113 | seqnames, start, end, strand) 114 | if (any(duplicated(tx.info$transcript_id))) { 115 | stop("Duplicated transcript ids found in gtf file") 116 | } 117 | 118 | # After the full_join, we will find which transcripts are exclusive to the 119 | # gtf and the archs4 transcript file (hopefully none in the later case) 120 | check.me <- tx.info %>% 121 | full_join(a4.tinfo, by = c("transcript_id" = "ensembl_id")) 122 | stopifnot(sum(duplicated(check.me$transcript_id)) == 0L) 123 | 124 | gtf.only <- filter(check.me, is.na(h5idx)) 125 | message( 126 | nrow(gtf.only), " transcripts in gtf only ", 127 | sprintf("(%.2f%% of transcripts from gtf)", 128 | nrow(gtf.only) / nrow(tx.info))) 129 | 130 | a4.only <- filter(check.me, !is.na(h5idx) & is.na(gene_biotype)) 131 | message( 132 | nrow(a4.only), " transcripts are only in arch4 tx file ", 133 | sprintf("(%.2f%% of transcripts in ARCHS4 hdf5 file)", 134 | nrow(a4.only) / nrow(a4.tinfo))) 135 | 136 | out <- a4.tinfo %>% 137 | left_join(tx.info, by = c("ensembl_id" = "transcript_id")) %>% 138 | rename(symbol = gene_name) 139 | stopifnot( 140 | nrow(out) == nrow(a4.tinfo), 141 | all(out$h5idx == a4.tinfo$h5idx)) 142 | out 143 | } 144 | 145 | #' Helper function to create augmented gene-level metadata. 146 | #' 147 | #' @noRd 148 | #' 149 | #' @param x a GRanges object from .load_gtf, we assume this is enembl gtfs 150 | #' @param a4.ginfo result from 151 | #' `archs4_feature_info("gene", ..., augmented = FALSE)` 152 | #' @return a decorated `a4.ginfo` table. 153 | .augmented_gene_info <- function(gr, a4.ginfo) { 154 | requireNamespace("GenomicRanges", quietly = TRUE) 155 | has.ens <- "ens_id" %in% colnames(a4.ginfo) 156 | # Not trusting the ensembl_id's for now, due to this: 157 | # https://github.com/MaayanLab/archs4/issues/3 158 | has.ens <- FALSE 159 | if (has.ens) { 160 | a4.ginfo$join <- a4.ginfo$ens_id 161 | } else { 162 | a4.ginfo$join <- a4.ginfo$a4name 163 | } 164 | dup.join <- a4.ginfo$join[duplicated(a4.ginfo$join)] 165 | 166 | a4.ginfo.all <- a4.ginfo 167 | a4.ginfo <- filter(a4.ginfo, !join %in% dup.join) 168 | 169 | # a4.ginfo <- mutate(a4.ginfo, join = tolower(a4name)) 170 | stopifnot(sum(duplicated(a4.ginfo$join)) == 0L) 171 | 172 | gr.exons <- gr[gr$type == "exon"] 173 | grl <- GenomicRanges::split(gr.exons, gr.exons$gene_id) 174 | rgrl <- GenomicRanges::reduce(grl) 175 | gwidths <- sum(GenomicRanges::width(grl)) 176 | gwidths <- tibble(gene_id = names(gwidths), length = unname(gwidths)) 177 | 178 | dfa <- as.data.frame(gr) 179 | txstats <- dfa %>% 180 | filter(type == "transcript") %>% 181 | filter(!is.na(as.integer(seqnames)) | seqnames %in% c("X", "Y", "M")) %>% 182 | droplevels %>% 183 | group_by(gene_id) %>% 184 | summarize(ntx = n()) 185 | 186 | # Grooming a symbol-based, gene-level metadata table 187 | g.info.all <- filter(dfa, type == "gene") 188 | g.info <- g.info.all %>% 189 | mutate(seqnames = as.character(seqnames)) %>% 190 | filter(!is.na(as.integer(seqnames)) | seqnames %in% c("X", "Y", "M")) %>% 191 | droplevels %>% 192 | select(gene_id, gene_name, gene_biotype, seqnames, start, end, strand) 193 | if (has.ens) { 194 | g.info$join <- g.info$gene_id 195 | } else { 196 | g.info$join <- g.info$gene_name 197 | } 198 | 199 | if (any(duplicated(g.info.all$gene_id))) { 200 | stop("Duplicated gene ids") 201 | } 202 | 203 | # There are likely duplicated "join" (symbol) rows in g.info, so deal with 204 | # that first -- this is a hack. Let's order the info table so that the 205 | # annotation entries (ie. gene_biotype) we care about appear first and pick 206 | # the first by join 207 | if (FALSE) { 208 | dup.symbols <- g.info$join[duplicated(g.info$join)] 209 | wtf <- g.info %>% 210 | filter(join %in% dup.symbols) %>% 211 | arrange(join, gene_id) 212 | View(wtf) 213 | } 214 | bt.order <- c('protein_coding', 'miRNA', 'lincRNA', 'rRNA', 'snoRNA', 215 | 'scRNA', 'scaRNA', 'sRNA') 216 | bt.order <- c(bt.order, setdiff(g.info$gene_biotype, bt.order)) 217 | g.info$gene_biotype <- factor(g.info$gene_biotype, bt.order) 218 | g.info.u <- g.info %>% 219 | arrange(join, gene_biotype) %>% 220 | filter(!duplicated(join)) 221 | 222 | # After the full_join, we will find which genes are exclusive to the 223 | # gtf and the archs4 gene file (hopefully none in the later case) 224 | check.me <- full_join(g.info.u, a4.ginfo, by = "join") 225 | stopifnot(sum(duplicated(check.me$join)) == 0L) 226 | 227 | gtf.only <- filter(check.me, is.na(h5idx)) 228 | message( 229 | nrow(gtf.only), " genes in gtf only ", 230 | sprintf("(%.2f%% of genes from gtf)", 231 | nrow(gtf.only) / nrow(g.info.u))) 232 | 233 | a4.only <- filter(check.me, !is.na(h5idx) & is.na(gene_biotype)) 234 | message( 235 | nrow(a4.only), " genes are only in arch4 tx file ", 236 | sprintf("(%.2f%% of genes in ARCHS4 hdf5 file)", 237 | nrow(a4.only) / nrow(a4.ginfo))) 238 | 239 | out <- a4.ginfo %>% 240 | left_join(g.info.u, by = "join") %>% 241 | select(-join) %>% 242 | rename(symbol = gene_name) 243 | out <- left_join(out, gwidths, by = "gene_id") 244 | stopifnot( 245 | nrow(out) == nrow(a4.ginfo), 246 | all(out$h5idx == a4.ginfo$h5idx)) 247 | out 248 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Overview 4 | ======== 5 | 6 | The `archs4` package provides utility functions to query and explore the expression profiling data made available through the [ARCHS4 project](https://amp.pharm.mssm.edu/archs4/), which is described in the following publication: 7 | 8 | [Massive mining of publicly available RNA-seq data from human and mouse](https://www.nature.com/articles/s41467-018-03751-6). 9 | 10 | Because this package requires the user to download a number of data files that are external to the package, the [installation instructions](#installation) are *a bit* more involved than other R packages, and we leave them for [the end of this document](#installation). 11 | 12 | Usage 13 | ===== 14 | 15 | After [successful installation](#installation) of this package, you can query the series and samples included in the ARCHS4 repository, as well as materialize the expresion data into well-known bioconductor assay containers for downstream analysis. 16 | 17 | To query GEO series and samples, you can use the `sample_info` function: 18 | 19 | ``` r 20 | library(archs4) 21 | 22 | a4 <- Archs4Repository() 23 | ids <- c('GSE89189', 'GSE29943', "GSM1095128", "GSM1095129", "GSM1095130") 24 | sample.info <- sample_info(a4, ids) 25 | head(sample.info) 26 | #> # A tibble: 6 x 8 27 | #> series_id sample_id Sample_title Sample_source_name_ch1 query_type 28 | #> 29 | #> 1 GSE89189 GSM2360252 10318X2 iPS microglia series 30 | #> 2 GSE89189 GSM2360253 7028X2 iPS microglia series 31 | #> 3 GSE89189 GSM2360254 x2-1 iPS microglia series 32 | #> 4 GSE89189 GSM2360255 x2-2 iPS microglia series 33 | #> 5 GSE89189 GSM2360256 x2-3 iPS microglia series 34 | #> 6 GSE89189 GSM2360257 x2-4 iPS microglia series 35 | #> # ... with 3 more variables: sample_h5idx_gene , 36 | #> # sample_h5idx_transcript , organism 37 | ``` 38 | 39 | You can use the `as.DGEList` function to materialize an `edgeR::DGEList` from a an arbitrary number of GEO sample and series identifiers. The only restriction is that the data from the series/samples must all be from the same species. 40 | 41 | The most often use-case will likely be to create a `DGEList` for a given study. For instance, the GEO series identifier [`"GSE89189"`](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE89189) refers to the expression data generated to support the [Abud et al. iPSC-Derived Human Microglia-like Cells ...](https://www.ncbi.nlm.nih.gov/pubmed/28426964) paper. 42 | 43 | Creating a `DGEList` from this study will create an object with 27,024 genes across 37 samples in about 1.5 seconds: 44 | 45 | ``` r 46 | yg <- as.DGEList(a4, "GSE89189", feature_type = "gene") 47 | ``` 48 | 49 | The following command retrieves the 178,135 transcript level counts for this experiment in about 1.5 seconds, as well: 50 | 51 | ``` r 52 | yt <- as.DGEList(a4, "GSE89189", feature_type = "transcript") 53 | ``` 54 | 55 | Installation 56 | ============ 57 | 58 | The installation of the `archs4` package is a bit more involved than a standard package installation and can be roughly broken down into three steps. 59 | 60 | 1. Install the R package along with its dependencies. 61 | 2. Download a number of (large) data files into a specific folder. 62 | 3. Generate metadata from the files downloaded in (2) for downstream use. 63 | 64 | We will walk you through each step in this section. 65 | 66 | R Package Installation 67 | ---------------------- 68 | 69 | The `arcsh4` package depends on other packages that are available through both [CRAN](https://cran.r-project.org/) and [Bioconductor](http://bioconductor.org/). For that reason, we will use the [`BiocInstaller::biocLite()`](https://www.bioconductor.org/install/#why-biocLite) function to install this package, which can seamlessly install packages from github, CRAN, and Bioconductor. 70 | 71 | ``` r 72 | source("https://bioconductor.org/biocLite.R") 73 | biocLite("denalitherapeutics/archs4", build_vignettes=TRUE) 74 | library("archs4") 75 | ``` 76 | 77 | When you first load the `archs4` library, you will notice a startup message telling you that something isn't quite right with your `archs4` installation. The message will look something like this: 78 | 79 | Note that your default archs4 data directory is NOT setup correctly 80 | 81 | * Run `archs4_local_data_dir_validate()` to diagnose 82 | * Refer to the ARCHS4 Data Download section of the archs4 vignette for more information 83 | 84 | Your default archs4 data directory (`getOption("archs4.datadir")`) is: 85 | 86 | ~/.archs4data 87 | 88 | In order for the package to work correctly, you must download a number of files which are enumerated in the [Data File Download](#data-file-download) section below into a single directory. You will then instruct the `archs4` package the path to the directory that holds all of these files by setting the value of R's global `"archs4.datadir`" option to be the path to that directory. 89 | 90 | Data File Download 91 | ------------------ 92 | 93 | You will have to create a directory on your filesystem which will hold a number of data files that the `archs4` package depends on. Let's call this directory `$ARCHS4DIR`, which we will define here to be `~/archs4v2data`. 94 | 95 | The `archs4` package provides the `archs4_local_data_dir_create()` convenience function which creates this directory and copies over a `meta.yaml` file into that directory. The purpose of this file is to specify the names of the downloaded files that correspond to the human and mouse-level gene and transcript-level data. 96 | 97 | ``` r 98 | library(archs4) 99 | archs4dir <- "~/archs4v2data" 100 | archs4_local_data_dir_create(archs4dir) 101 | ``` 102 | 103 | Once this directory is created successfully, you will then have to download the following files into it: 104 | 105 | - archs4 106 | - [`human_matrix.h5`](https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5): human gene-level counts 107 | - [`human_hiseq_transcript_v2.h5`](https://s3.amazonaws.com/mssm-seq-matrix/human_hiseq_transcript_v2.h5): human transcript-level counts 108 | - [`mouse_matrix.h5`](https://s3.amazonaws.com/mssm-seq-matrix/mouse_matrix.h5): mouse gene-level counts 109 | - [`mouse_hiseq_transcript_v2.h5`](https://s3.amazonaws.com/mssm-seq-matrix/mouse_hiseq_transcript_v2.h5): mouse transcript-level counts 110 | - ensembl 111 | - `Homo_sapiens.GRCh38.90.gtf.gz`: gtf used for human transcript annotations 112 | - `Mus_musculus.GRCm38.90.gtf.gz`: gtf used for mouse transcript annotations 113 | 114 | The enumerated items above contain links to the files that need to be downloaded. You can right-click on them and select `Save As ...` and instruct your web-browser to save them to your local `$ARCHS4DIR`. 115 | 116 | **NOTE**: Most all of the `archs4` functions accept a `datadir` parameter, which should be the path to `$ARCHS4DIR`. For convenience, the default value of this parameter is always set to `getOption("archs4.datadir")`. This means that you can modify your `~/.Rprofile` file to set the value of this option to `"~/archs4v2data"` (for instance), so that the package will always look there by default. If this option is not set in your `~/.Rprofile`, the default value for this option is "~/.archs4data". 117 | 118 | Feature-Level Metadata Generation 119 | --------------------------------- 120 | 121 | The datasets currently made available by the [ARCHS4 Project](https://amp.pharm.mssm.edu/archs4/) only provide minimal feature-level metadata: 122 | 123 | - the features in the gene-level datasets are identified only by their symbol; and 124 | - only the ensembl transcript id's are provided for the features in the transcript-level datasets 125 | 126 | We want to augment these features with richer annotations, such as the ensembl gene identifiers or gene biotypes, for instance. 127 | 128 | To make such data generation automatic and easy for the user, once you have downloaded the Ensembl GTF files listed above into the `$ARCHS4DIR`, you can run the `create_augmented_feature_info()` to extract these extra feature-level metadata from the GTF files and store them as tables inside `$ARCHS4DIR` for later use. 129 | 130 | ``` r 131 | create_augmented_feature_info(archs4dir) 132 | ``` 133 | 134 | This function will load and parse the GTF files from human and mouse, and create gene- and transcript-level `*.csv.gz` files in the `$ARCHS4DIR` which the `archs4` package will then later use downstream. 135 | 136 | Once your `$ARCHS4DIR` is setup, you may find it convenient to set the default value for R's global `"archs4.datadir"` option to the `$ARCHS4DIR` directory you just setup. To do so, you can put the following line in your `~/.Rprofile` file: 137 | 138 | ``` r 139 | options(archs4.datadir = "~/archs4v2data") 140 | ``` 141 | 142 | ARCHS4 Installation Heatlh 143 | -------------------------- 144 | 145 | Because the installation of this package is a bit more involved than most, we have also provided an `archs4_local_data_dir_validate()` function, which you can run over your `$ARCHS4DIR` in order to check on "the health" of your install. 146 | 147 | This function will simply look at your `$ARCHS4DIR` to ensure that the required files are there, and tries to give you helpful error messages if not. 148 | 149 | For instance, if the first two files enumerated in the [Data File Download](#data-file-download) section were missing from your `$ARCHS4DIR` (ie. `human_matrix.h5` and `human_hiseq_transcript_v2.h5`), you would be warned that "something isn't right" when you first load the `archs4` package. You could then run the `archs4_local_data_dir_validate()` to see what is wrong: 150 | 151 | ``` r 152 | archs4_local_data_dir_validate(archs4dir) 153 | #> The following ARCHS4 files are missing, please download them: 154 | #> * human_matrix.h5: https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5 155 | #> * human_hiseq_transcript_v2.h5: #> https://s3.amazonaws.com/mssm-seq-matrix/human_hiseq_transcript_v2.h5 156 | ``` 157 | 158 | **NOTE:** If all installation and data download/processing steps have been completed successfully, a call to `archs4_local_data_dir_validate()` will simply return `TRUE`. 159 | 160 | Package Development 161 | ------------------- 162 | 163 | If you are developing this package, you will find that it will be convenient to symlink the package's default `archs4.datadir` path (`~/.arcsh4data`) to the `$ARCHS4DIR` you just setup. This is because often times things like roxygen2 document compilation, unit testing, etc. happen in a vanilla R workspace, which won't run the configuration that is prescribed in your `~/.Rprofile` file. 164 | -------------------------------------------------------------------------------- /R/expression.R: -------------------------------------------------------------------------------- 1 | #' Retrieve expression data for genes/transcripts across samples 2 | #' 3 | #' Returns the estimated counts for the genes/transcripts enumerated in the 4 | #' `features` table for the samples enumerated in the `samples` table. 5 | #' 6 | #' Note that the values returned are simply estimated counts. They are not 7 | #' normalized by sequencing depth. For now, the only use for this function is 8 | #' to compare how ratios of genes compare across samples. 9 | #' 10 | #' @export 11 | #' @importFrom edgeR cpm 12 | #' @importFrom reshape2 melt 13 | #' @seealso [as.DGEList()] 14 | #' 15 | #' @param a4 An `Archs4Repository` 16 | #' @param features a `tibble` of feature descriptors (as returned from 17 | #' [feature_info()]). Really this table simply needs the following columns: 18 | #' * `"a4name"` or `"ensembl_id" or `; 19 | #' * `"feature_type"`: (gene or transcript) 20 | #' * `"source"` (human or mouse),; 21 | #' @param samples a samples identifier: ie. a tibble with series_id and 22 | #' sample_id columns. 23 | #' @param type "gene" or "transcript"-level data? 24 | #' @param source "mouse" or "human" pass this in explicitly if there is no 25 | #' "source" column in your `features` or `samples` data.frame 26 | #' @param feature_meta additional columns from 27 | #' `feature_info(a4, feature_type = feature_type)` 28 | #' @param sample_meta metadata information to include for the samples. These 29 | #' values are extracted from the `meta/VALUE` files in the respective hdf5 30 | #' data files. See [archs4_sample_info()] for more details. 31 | #' @return a data.frame of expression data. Each row is an observation 32 | #' (one gene one sample) 33 | #' @examples 34 | #' a4 <- Archs4Repository() 35 | #' gnz <- feature_lookup(a4, c("CFAP65", "PECR")) 36 | #' gexprs <- fetch_expression(a4, gnz) 37 | fetch_expression <- function(a4, features, samples = NULL, 38 | feature_type = "gene", source = NULL, 39 | feature_meta = "symbol", 40 | sample_meta = c("Sample_title", "Sample_source_name_ch1"), 41 | prior.count = 3, ...) { 42 | assert_class(a4, "Archs4Repository") 43 | source <- .extract_source(features, samples, source) 44 | 45 | # Extract Features 46 | if (is.character(features)) { 47 | features <- feature_lookup(a4, features, feature_type = feature_type, 48 | source = source) 49 | } 50 | assert_data_frame(features) 51 | assert_subset(c("ensembl_id", "feature_type"), colnames(features)) 52 | 53 | features.all <- distinct(features, ensembl_id) 54 | features <- features.all %>% 55 | inner_join(feature_info(a4, feature_type, source), by = "ensembl_id") 56 | fmiss <- anti_join(features.all, features, by = "ensembl_id") 57 | if (nrow(fmiss) > 0L) { 58 | warning("Missing ", nrow(fmiss), " features from query") 59 | } 60 | 61 | all.samples <- a4 %>% 62 | sample_table(feature_type = type) %>% 63 | filter(organism == source) %>% 64 | select(series_id, sample_id) 65 | 66 | samples.specified <- !is.null(samples) 67 | if (!samples.specified) { 68 | samples <- all.samples 69 | } else { 70 | assert_data_frame(samples) 71 | assert_subset(c("series_id", "sample_id"), colnames(samples)) 72 | samples <- distinct(samples, series_id, sample_id, .keep_all = TRUE) 73 | missing.samples <- samples %>% 74 | anti_join(all.samples, by = c("series_id", "sample_id")) 75 | if (nrow(missing.samples)) { 76 | warning(nrow(missing.samples), " requested samples are not in archs4") 77 | samples <- samples %>% 78 | semi_join(all.samples, by = c("series_id", "sample_id")) 79 | } 80 | } 81 | 82 | sids <- unique(samples$sample_id) 83 | 84 | y <- as.DGEList(a4, sids, features = features, feature_type = feature_type, 85 | check_missing_samples = FALSE) 86 | 87 | counts <- reshape2::melt(y$counts) 88 | counts$Var1 <- as.character(counts$Var1) 89 | counts$Var2 <- as.character(counts$Var2) 90 | colnames(counts) <- c("ensembl_id", "sample_id", "count") 91 | 92 | cpms <- reshape2::melt(edgeR::cpm(y, prior.count = prior.count, log = TRUE)) 93 | counts$cpm <- cpms[[3]] 94 | 95 | out <- left_join(samples, counts, by = c("sample_id")) 96 | 97 | if (is.character(feature_meta)) { 98 | fi.meta <- feature_info(a4, feature_type = feature_type, source = source) 99 | fi.meta <- filter(fi.meta, !is.na(ensembl_id)) 100 | feature_meta <- unique(c("ensembl_id", feature_meta)) 101 | fi.meta <- fi.meta[, colnames(fi.meta) %in% feature_meta, drop = FALSE] 102 | if (ncol(fi.meta) > 1L) { 103 | out <- left_join(out, fi.meta, by = "ensembl_id") 104 | } 105 | } 106 | 107 | if (is.character(sample_meta)) { 108 | si <- tryCatch({ 109 | sample_info(a4, sids, sample_meta, check_missing_samples = FALSE) 110 | }, error = function(e) NULL) 111 | if (is.data.frame(si)) { 112 | scols <- unique(c("sample_id", sample_meta)) 113 | si <- si[, intersect(scols, colnames(si)), drop = FALSE] 114 | if (ncol(si) > 1L) { 115 | si <- distinct(si, sample_id, .keep_all = TRUE) 116 | out <- left_join(out, si, by = "sample_id") 117 | } 118 | } else { 119 | warning("There was an error fetching the sample metadata requested") 120 | } 121 | } 122 | 123 | out 124 | } 125 | 126 | .extract_source <- function(features, samples, source) { 127 | if (is.null(source)) { 128 | source <- character() 129 | if (is.data.frame(features)) { 130 | source <- unique(features$source) 131 | } 132 | if (is.data.frame(samples)) { 133 | source <- unique(c(source, samples$source)) 134 | } 135 | source <- source[!is.null(source)] 136 | if (length(source) == 0) { 137 | stop("Don't know where to fetch sources from") 138 | } 139 | } 140 | if (is.character(source)) { 141 | if (length(source) != 1L) { 142 | stop("Can't specificy multiple organisms for source -- one at a time!") 143 | } 144 | } else { 145 | stop("source expected to be a character by know") 146 | } 147 | if (!source %in% archs4_sources()) { 148 | stop("Unknown source:", source) 149 | } 150 | source 151 | } 152 | 153 | #' Retrieves (organism) source of gene/transcript identifiers 154 | #' 155 | #' Parses ensembl identifiers and determines if they are for genes, or 156 | #' transcripts, as well as the organism they should belong to (human, mouse) 157 | #' 158 | #' @param x A vector of identifiers 159 | feature_source <- function(x, ...) { 160 | stop("not implemented yet") 161 | } 162 | 163 | # library size and normalization factor estimation ============================= 164 | # 165 | # The code here was adapted from the code in edgeR. 166 | # We want to calculate and store library sizes and normalization factors 167 | # for an Archs4Repository, so we can calculate things like "cpm" on the fly ... 168 | # the super-fly. 169 | 170 | # Default top-level TMM normalization parameters: 171 | # 172 | # logratioTrim = 0.3 173 | # sumTrim = 0.05 174 | # doWeighting = TRUE 175 | # Acutoff = -1e10 176 | # p = 0.75 177 | # 178 | # Main TMM Calculation 179 | # f75 <- .calcFactorQuantile(data=x, lib.size=lib.size, p=0.75) 180 | # refColumn <- which.min(abs(f75-mean(f75))) 181 | # f <- rep(NA,ncol(x)) # holds norm factors 182 | # for(i in 1:ncol(x)) 183 | # f[i] <- .calcFactorWeighted(obs=x[,i], ref=x[,refColumn], 184 | # libsize.obs=lib.size[i], 185 | # libsize.ref=lib.size[refColumn], 186 | # logratioTrim=logratioTrim, 187 | # sumTrim=sumTrim, 188 | # doWeighting=doWeighting, 189 | # Acutoff=Acutoff) 190 | # 191 | # .calcFactorQuantile 192 | # y <- t(t(data)/lib.size) # divides each column by its library size 193 | # f <- apply(y,2,function(x) quantile(x,p=p)) # find value at upper quartile 194 | 195 | # .expr.datasets <- c("human_gene", "mouse_gene", 196 | # "human_transcript", "mouse_transcript") 197 | .expr.datasets <- c("human_gene", "mouse_gene") 198 | 199 | #' Estimate normalization factors for datasets in the Archs4Repository 200 | #' 201 | #' This function will serialize the results of the library size and 202 | #' normalization factors into files inside `datadir(a4)` 203 | #' 204 | #' @export 205 | #' @param a4 The `Arcsh4Repository` 206 | #' @return Invisibly returns a tibble of the the library size and normalization 207 | #' factors for the expression data in `a4` (invisibly). 208 | estimate_repository_norm_factors <- function(a4) { 209 | info <- lapply(.expr.datasets, function(key) { 210 | estimate_norm_factors(a4, key) 211 | }) 212 | invisible(info) 213 | } 214 | 215 | #' Calculates library size and norm factors for a specific dataset 216 | #' 217 | #' This function will serialize the results of the library size and 218 | #' normalization factors into files inside `datadir(a4)` 219 | #' 220 | #' 221 | estimate_norm_factors <- function(a4, key, n = 500, logratioTrim = 0.3, 222 | sumTrim = 0.05, doWeighting = TRUE, 223 | Acutoff = -1e10, p = 0.75, ...) { 224 | if (FALSE) { 225 | key = "mouse_gene" 226 | n = 500; logratioTrim = 0.3; sumTrim = 0.05; doWeighting = TRUE; 227 | Acutoff = -1e10; p = 0.75 228 | } 229 | 230 | key <- match.arg(key, .expr.datasets) 231 | h5.fn <- file_path(a4, key) 232 | fn.out <- paste0(key, "-normfactors.csv") 233 | fn.out <- file.path(datadir(a4), fn.out) 234 | 235 | # Almost could have saved a lot of time using the v4++ matrices since they 236 | # include a "meta/reads_aligned" vector, however to use TMM, we still need 237 | # to calculate the percentiles ... 238 | # sinfo <- tibble( 239 | # series_id = trimws(h5read(h5.fn, "meta/Sample_series_id")), 240 | # sample_id = trimws(h5read(h5.fn, "meta/Sample_geo_accession")), 241 | # libsize = h5read(h5.fn, "meta/reads_aligned")) 242 | sinfo <- .crankLibSize(h5.fn, n = n, p = p) 243 | fn.tmp <- sub("\\.csv$", "-tmp.csv", fn.out) 244 | write.csv(sinfo, fn.tmp, row.names = FALSE) 245 | 246 | # Identify the sample to use as the reference profile 247 | refColumn <- which.min(abs(sinfo$quantile - mean(sinfo$quantile, na.rm = TRUE))) 248 | ref.vals <- h5read(h5.fn, "data/expression", list(NULL, refColumn)) 249 | 250 | # Run a TMM for all samples against the reference profile 251 | nf <- .crankNormFactors(h5.fn, sinfo, ref.vals, n = n, 252 | logratioTrim = logratioTrim, sumTrim = sumTrim, 253 | doWeighting = doWeighting, Acutoff = Acutoff) 254 | sinfo$normfactor <- nf 255 | 256 | write.csv(sinfo, fn.out, row.names = FALSE) 257 | invisible(sinfo) 258 | } 259 | 260 | # before the v4 gene-level count matrices, the read counts per sample were not 261 | # included, so we were calculating them manually. 262 | .crankLibSize <- function(h5.fn, n = 500, p = 0.75) { 263 | message("Calculating library sizes ...") 264 | sinfo <- tibble( 265 | series_id = trimws(h5read(h5.fn, "meta/Sample_series_id")), 266 | sample_id = trimws(h5read(h5.fn, "meta/Sample_geo_accession"))) 267 | 268 | n.batches <- nrow(sinfo) / n 269 | remainder <- nrow(sinfo) %% n 270 | if (remainder > 0) n.batches <- n.batches + 1 271 | n.batches <- floor(n.batches) 272 | 273 | offset <- 0 274 | 275 | res <- vector("list", n.batches) 276 | for (i in 1:n.batches) { 277 | idx.start <- offset + 1 278 | idx.end <- idx.start + (if (i != n.batches) n else remainder) - 1L 279 | idxs <- idx.start:idx.end 280 | offset <- idx.end 281 | 282 | # lib size and percentile 283 | dat <- h5read(h5.fn, "data/expression", list(NULL, idxs)) 284 | lib.size <- colSums(dat) 285 | dat <- t(t(dat)/lib.size) 286 | # Somehow some samples had NA counts, so we need to protect from that 287 | fq <- apply(dat, 2, function(x) quantile(x, p = p, na.rm = TRUE)) 288 | res[[i]] <- data.frame(index = idxs, libsize = lib.size, quantile = fq) 289 | } 290 | res <- bind_rows(res) 291 | bind_cols(sinfo, res) 292 | } 293 | 294 | .crankNormFactors <- function(h5.fn, sinfo, ref.vals, n = n, 295 | logratioTrim = logratioTrim, sumTrim = sumTrim, 296 | doWeighting = doWeighting, Acutoff = Acutoff) { 297 | message("Calculating normalization factors ...") 298 | ref.size <- sum(ref.vals) 299 | # Figure out how to batch iterate over our data, we now have to reload the 300 | # data to calculate the normalization factors. 301 | n.batches <- nrow(sinfo) / n 302 | remainder <- nrow(sinfo) %% n 303 | if (remainder > 0) n.batches <- n.batches + 1 304 | n.batches <- floor(n.batches) 305 | 306 | # Load batches again and calc norm factors 307 | nf <- numeric(nrow(sinfo)) 308 | offset <- 0 309 | 310 | for (i in 1:n.batches) { 311 | idx.start <- offset + 1 312 | idx.end <- idx.start + (if (i != n.batches) n else remainder) - 1L 313 | idxs <- idx.start:idx.end 314 | offset <- idx.end 315 | 316 | # lib size and percentile 317 | dat <- h5read(h5.fn, "data/expression", list(NULL, idxs)) 318 | for (j in 1:ncol(dat)) { 319 | idx <- idxs[j] 320 | vals <- dat[,j] 321 | if (any(is.na(vals))) { 322 | nf[idx] <- NA_real_ 323 | } else { 324 | nf[idx] <- .calcFactorWeighted(obs=vals, ref=ref.vals, 325 | libsize.obs=sum(vals), 326 | libsize.ref=ref.size, 327 | logratioTrim=logratioTrim, 328 | sumTrim=sumTrim, 329 | doWeighting=doWeighting, 330 | Acutoff=Acutoff) 331 | } 332 | } 333 | } 334 | 335 | nf 336 | } 337 | 338 | # Calculates the TMM between two libraries. 339 | # 340 | # This is lifted straight from edgeR_3.22.2. The only reason it is included here 341 | # is because it is not exported (rightfully) from the package. 342 | .calcFactorWeighted <- function(obs, ref, libsize.obs=NULL, libsize.ref=NULL, 343 | logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE, 344 | Acutoff=-1e10) { 345 | obs <- as.numeric(obs) 346 | ref <- as.numeric(ref) 347 | 348 | if( is.null(libsize.obs) ) nO <- sum(obs) else nO <- libsize.obs 349 | if( is.null(libsize.ref) ) nR <- sum(ref) else nR <- libsize.ref 350 | 351 | logR <- log2((obs/nO)/(ref/nR)) # log ratio of expression, accounting for library size 352 | absE <- (log2(obs/nO) + log2(ref/nR))/2 # absolute expression 353 | v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref # estimated asymptotic variance 354 | 355 | # remove infinite values, cutoff based on A 356 | fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff) 357 | 358 | logR <- logR[fin] 359 | absE <- absE[fin] 360 | v <- v[fin] 361 | 362 | if(max(abs(logR)) < 1e-6) return(1) 363 | 364 | # taken from the original mean() function 365 | n <- length(logR) 366 | loL <- floor(n * logratioTrim) + 1 367 | hiL <- n + 1 - loL 368 | loS <- floor(n * sumTrim) + 1 369 | hiS <- n + 1 - loS 370 | 371 | # keep <- (rank(logR) %in% loL:hiL) & (rank(absE) %in% loS:hiS) 372 | # a fix from leonardo ivan almonacid cardenas, since rank() can return 373 | # non-integer values when there are a lot of ties 374 | keep <- (rank(logR)>=loL & rank(logR)<=hiL) & (rank(absE)>=loS & rank(absE)<=hiS) 375 | 376 | if(doWeighting) 377 | f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE) 378 | else 379 | f <- mean(logR[keep], na.rm=TRUE) 380 | 381 | # Results will be missing if the two libraries share no features with positive counts 382 | # In this case, return unity 383 | if(is.na(f)) f <- 0 384 | 2^f 385 | } 386 | -------------------------------------------------------------------------------- /R/archs4-functional.R: -------------------------------------------------------------------------------- 1 | # These functions define a "high-level" interface to working with the data 2 | # downloaded into an ARCHS4 Data Directory. They are the workhorses of this 3 | # package. 4 | # 5 | # Although users can interact with the ARCHS4 data directory "directly" using 6 | # these functions, most will find it more convenient to interact with ARCHS4 7 | # using an `Archs4Repository`. Many of the functions here are also available 8 | # against an `Archs4Repository` object, and are named by stripping the archs4_ 9 | # prefix here. 10 | # 11 | # For instance, the following are equivalent: 12 | # 13 | # R> archs4_feature_info(datadir) 14 | # 15 | # and 16 | # 17 | # R> a4 <- Archs4Repository(datadir) 18 | # R> feature_info(a4) 19 | # 20 | # Interacting with these data via the `Archs4Repository` should also better 21 | # future-proof your code. 22 | 23 | 24 | #' Retrieves the feature (gene/transcript) information for the archs4 data 25 | #' 26 | #' Only the gene symbols (`meta/genes` in gene expression hd5 file) or entrez 27 | #' transcript identifiers (`meta/transcript` for the transcript hdf5 file) are 28 | #' stored in thse data. We use [create_augmented_feature_info()] function to 29 | #' generate and store extra metadata for these features, which are then appended 30 | #' to these identifiers with this function. 31 | #' 32 | #' @export 33 | #' @importFrom readr read_csv 34 | #' @seealso [create_augmented_feature_info()] 35 | #' 36 | #' @param feature_type gene or transcript? 37 | #' @param source human or mouse 38 | #' @param augmented include extra gene- or transcript-level features? 39 | #' Default: `TRUE` 40 | #' @param datadir the directory that stores the ARCHS4 data files 41 | #' @param ... pass through 42 | #' @return a tibble of information 43 | archs4_feature_info <- function(feature_type = "gene", source = "human", 44 | augmented = TRUE, 45 | datadir = getOption("archs4.datadir"), ...) { 46 | assert_choice(feature_type, c("gene", "transcript")) 47 | assert_choice(source, archs4_sources(datadir)) 48 | assert_flag(augmented) 49 | 50 | fkey <- paste(source, feature_type, sep = "_") 51 | h5.fn <- archs4_file_path(fkey, datadir = datadir) 52 | 53 | if (feature_type == "gene") { 54 | # I am using `a4name` instead of symbol, because the values stored here 55 | # aren't universally/technically symbols. In the mouse dataset, the "symbol" 56 | # names are all uppercase, which is a non-canonical something. 57 | 58 | ainfo <- tibble(a4name = rhdf5::h5read(h5.fn, "meta/genes"), 59 | entrez_id = .h5read(h5.fn, "meta/gene_entrezid")) 60 | is.v2m <- source == "mouse" & all(ainfo$a4name == toupper(ainfo$a4name)) 61 | is.v2m <- TRUE 62 | if (source == "human") { 63 | ainfo[["a4external"]] <- .h5read(h5.fn, "meta/gene_hgnc") 64 | ainfo[["refseq_id"]] <- .h5read(h5.fn, "meta/gene_refseqid") 65 | join <- c(a4name = "a4name") 66 | } else { 67 | join <- c(a4name = "a4name") 68 | if (is.v2m) { 69 | # v2 of the mouse-level gene data had gene names in all caps. I am 70 | # changing this to "title case". 71 | ainfo[["join"]] <- paste0( 72 | substr(ainfo[["a4name"]], 1, 1), 73 | tolower(substring(ainfo[["a4name"]], 2))) 74 | ainfo[["ens_id"]] <- as.character(.h5read(h5.fn, "meta/gene_ensemblid")) 75 | } else { 76 | ainfo[["a4external"]] <- .h5read(h5.fn, "meta/gene_mgi") 77 | ainfo[["ens_id"]] <- as.character(.h5read(h5.fn, "meta/gene_ensemblid")) 78 | } 79 | # if (!all(is.na(ainfo[["ens_id"]]))) { 80 | # join <- c(ens_id = "ensembl_id") 81 | # } else { 82 | # join <- c(join = "join") 83 | # ainfo[["join"]] <- tolower(ainfo[["a4name"]]) 84 | # } 85 | } 86 | ainfo[["h5idx"]] <- seq(nrow(ainfo)) 87 | } else { 88 | ainfo <- tibble(ensembl_id_full = rhdf5::h5read(h5.fn, "meta/transcript"), 89 | ensembl_id = sub("\\.\\d+$", "", ensembl_id_full), 90 | length = rhdf5::h5read(h5.fn, "meta/transcriptlength"), 91 | h5idx = seq(ensembl_id_full)) 92 | } 93 | 94 | if (augmented) { 95 | aug.fn <- paste(source, feature_type, "info", sep = "_") 96 | aug.fn <- archs4_file_path(aug.fn, datadir = datadir) 97 | 98 | if (feature_type == "gene") { 99 | if (source == "human") { 100 | coltypes <- "cciccccciici" 101 | } else { 102 | coltypes <- "ccicccccciici" 103 | } 104 | meta <- readr::read_csv(aug.fn, col_types = coltypes) 105 | meta <- mutate(meta, 106 | entrez_id = ifelse(entrez_id == "null", NA, entrez_id)) 107 | meta <- rename(meta, ensembl_id = "gene_id") 108 | meta <- mutate(meta, feature_type = "gene") 109 | if (join == "join") { 110 | meta[["join"]] <- tolower(meta[["symbol"]]) 111 | } 112 | } else { 113 | join <- "ensembl_id_full" 114 | coltypes <- "cciicccccciic" 115 | meta <- readr::read_csv(aug.fn, col_types = coltypes) 116 | meta <- mutate(meta, feature_type = "transcript") 117 | } 118 | 119 | # remove duplicate columns in meta table except for join column 120 | meta <- meta[, !colnames(meta) %in% setdiff(colnames(ainfo), join)] 121 | meta <- meta[!duplicated(meta[[join]]) & !is.na(meta[[join]]),] 122 | tmp <- left_join(ainfo, meta, by = join) 123 | 124 | stopifnot( 125 | nrow(tmp) == nrow(ainfo), 126 | all(tmp[[join]] == ainfo[[join]])) 127 | ainfo <- tmp 128 | 129 | # if (feature_type == "gene") { 130 | # ainfo$symbol_guess <- ainfo$symbol 131 | # ainfo$symbol <- ainfo$a4name 132 | # } 133 | } 134 | 135 | if ("ens_id" %in% colnames(ainfo) && !"ensembl_id" %in% colnames(ainfo)) { 136 | ainfo <- rename(ainfo, ensembl_id = "ens_id") 137 | } 138 | if ("join" %in% colnames(ainfo)) ainfo[["join"]] <- NULL 139 | 140 | ainfo$source <- source 141 | ainfo 142 | } 143 | 144 | #' Retrieves a table of files that back an Archs4Repository 145 | #' 146 | #' @description 147 | #' A yaml iskept in the Archs4 data directory (`getOption("archs4.datadir")`) 148 | #' that links keys, (ie. `mouse_gene`) to the name of the file in the directory. 149 | #' This abstraction is introduced so that the version of these files can be 150 | #' updated with new downloads, and the user simply has to modify the yaml file 151 | #' so that they are used downstream 152 | #' 153 | #' Reference the "ARCHS4 Data Download" section in the vignette for more 154 | #' information. 155 | #' 156 | #' @export 157 | #' @param datadir the directory that stores the ARCHS4 data files 158 | archs4_file_info <- function(datadir = getOption("archs4.datadir")) { 159 | yml <- archs4_meta(datadir = datadir) 160 | finfo <- yml[["files"]] 161 | 162 | take <- function(l, wut) { 163 | val <- l[[wut]] 164 | if (is.null(val)) NA_character_ else val 165 | } 166 | 167 | tibble( 168 | key = names(finfo), 169 | source = sapply(finfo, take, "source"), 170 | name = sapply(finfo, take, "name"), 171 | url = sapply(finfo, take, "url"), 172 | desription = sapply(finfo, take, "description"), 173 | file_path = file.path(datadir, name), 174 | file_exists = file.exists(file_path)) 175 | } 176 | 177 | #' Identify the file path on the system for specific ARCHS4 files. 178 | #' 179 | #' By default, this function will throw an error if a file does not exist 180 | #' upon lookup. To return the *expected* path to the, even if it does not 181 | #' exist on the file system, set `stop_if_missing = FALSE`. 182 | #' 183 | #' @export 184 | #' @importFrom stats setNames 185 | #' 186 | #' @param key the lookup key for the file, ie. `"human_gene"` or `"mouse_gene"`. 187 | #' The known keys are enumerated in `archs4_file_info()$key` column. 188 | #' @param stop_if_missing defaults to `TRUE`, which causes this function to 189 | #' throw an error if the file does not exist at the expected `file_path`. 190 | #' Set this to `FALSE` to simply raise a warning 191 | #' @param na_missing by default, we set paths to files that don't exist to 192 | #' `NA`. Set this to `FALSE` to retrieve the expected path of the missing 193 | #' file. 194 | #' @param file_info the output from [archs4_file_info()], which enumerates the 195 | #' files used by the `Archs4Repository`. 196 | #' @param datadir the directory that stores the ARCHS4 data files 197 | #' @return a named (by `key`) character vector of paths to the filesystem that 198 | #' correspond to the entries in `key`. 199 | archs4_file_path <- function(key, stop_if_missing = TRUE, na_missing = TRUE, 200 | file_info = archs4_file_info(datadir), 201 | datadir = getOption("archs4.datadir")) { 202 | assert_character(key, min.len = 1L) 203 | assert_directory(datadir, 'r') 204 | assert_class(file_info, "data.frame") 205 | assert_names(c("key", "source", "file_path"), 206 | subset.of = colnames(file_info)) 207 | query <- tibble(key = key) %>% left_join(file_info, by = "key") 208 | qbad <- filter(query, is.na(source)) 209 | 210 | if (nrow(qbad)) { 211 | bad.key <- unique(qbad[["key"]]) 212 | stop("Unknown file queries: ", paste(bad.key, collapse = ", ")) 213 | } 214 | 215 | qmiss <- filter(query, !file_exists) 216 | if (nrow(qmiss)) { 217 | bad.key <- unique(qmiss[["key"]]) 218 | msg <- paste("Can not find archs4 file(s) on disk: ", 219 | paste(bad.key, collapse = ", ")) 220 | if (stop_if_missing) stop(msg) else warning(msg) 221 | if (na_missing) { 222 | query <- mutate(query, file_path = ifelse(file_exists, file_path, NA)) 223 | } 224 | } 225 | 226 | setNames(query[["file_path"]], query[["key"]]) 227 | } 228 | 229 | #' Retrieves tibble of sample-level covariates available in mouse and human data. 230 | #' 231 | #' Enumerate the sample covariates available in mouse and human data. 232 | #' Note that the covariates available in human and mouse are the same 233 | #' between the gene and transcript level files 234 | #' 235 | #' @export 236 | #' @param datadir the directory that holds the archs4 data 237 | archs4_sample_covariates <- function(datadir = getOption("archs4.datadir"), 238 | ...) { 239 | mouse.covs <- .sample_metadata_files("mouse_gene", datadir) %>% 240 | select(name) %>% 241 | mutate(mouse = TRUE) 242 | human.covs <- .sample_metadata_files("human_gene", datadir) %>% 243 | select(name) %>% 244 | mutate(human = TRUE) 245 | sample.covs <- mouse.covs %>% 246 | full_join(human.covs, by = "name") %>% 247 | mutate(mouse = ifelse(is.na(mouse), FALSE, TRUE), 248 | human = ifelse(is.na(human), FALSE, TRUE)) 249 | sample.covs 250 | } 251 | 252 | #' Enumerates internal files from hdf5 binary that contain sample metadata 253 | #' 254 | #' This function is intentionally not exported 255 | #' 256 | #' @importFrom rhdf5 h5ls 257 | #' @noRd 258 | #' 259 | #' @param file hdf5 file in `datadar` to use to identify the internal "metal" 260 | #' files that correspond to sample-level metadata. You shouldn't need to 261 | #' change this as all hdf5 data files should carry the same metadata. This is 262 | #' here mainly for debugging purposes. 263 | #' @param datadir the directory that holds the archs4 data 264 | #' @param ... pass through 265 | #' @return a vector of sample metadata names that are stored in archs4 266 | .sample_metadata_files <- function(file = "mouse_gene", 267 | datadir = getOption("archs4.datadir"), 268 | ...) { 269 | h5.fn <- archs4_file_path(file, datadir = datadir) 270 | all.files <- as_tibble(h5ls(h5.fn)) 271 | all.files[["idim"]] <- suppressWarnings(as.integer(all.files[["dim"]])) 272 | 273 | dat.dim <- local({ 274 | dd <- filter(all.files, group == "/data", name == "expression") 275 | if (nrow(dd) != 1L) { 276 | stop("The hdf5 file is not an archs4 expression matrix ", 277 | sprintf("[%s]", h5.fn)) 278 | } 279 | dd <- tolower(dd$dim) 280 | out <- sapply(strsplit(dd, " *x *")[[1L]], as.integer) 281 | setNames(out, c("rows", "columns")) 282 | }) 283 | nsamples <- dat.dim["columns"] 284 | nfeats <- dat.dim["rows"] 285 | 286 | sample.meta <- filter(all.files, group == "/meta", idim == nsamples) 287 | mutate(sample.meta, h5name = paste(group, name, sep = "/")) 288 | } 289 | 290 | #' Checks which samples from a single series are present/absent. 291 | #' 292 | #' Due to download or alignment issues, the ARCHS4 data processing pipeline may 293 | #' not include all of the samples included in a particular GEO series. This 294 | #' function will return a table with an `in_archs4` columns that indicates 295 | #' whether or not a sample from a particular series is present in ARCHS4. 296 | #' 297 | #' @export 298 | #' 299 | #' @param id a single GEO series id, ie. `"GSEnnnnn"` 300 | #' @param sample_table the output from [archs4_sample_table()], which lists 301 | #' the series_id,sample_id combinations found in the ARCHS4 repository. 302 | #' @param datadir the directory that holds the archs4 data 303 | #' @return a tibble of information for a series. 304 | #' @examples 305 | #' info <- archs4_series_status("GSE89189") 306 | archs4_series_status <- function(id, 307 | sample_table = archs4_sample_table(datadir = datadir), 308 | datadir = getOption("archs4.datadir"), 309 | ...) { 310 | assert_string(id) 311 | if (geo_id_type(id)[["type"]] != "series") { 312 | stop("A GEO series id is required (ie. \"GSEnnnnn\")") 313 | } 314 | 315 | gq <- lookup_gse(id) 316 | geo <- tibble(series_id = id, sample_id = gq[["sample"]]) 317 | 318 | archs <- filter(sample_table, series_id == id) 319 | archs <- transmute(archs, sample_id, in_archs4 = TRUE) 320 | 321 | out <- left_join(geo, archs, by = "sample_id") 322 | out[["in_archs4"]] <- !is.na(out[["in_archs4"]]) 323 | out 324 | } 325 | 326 | 327 | #' Retrieves information for samples by GSE series or sample IDs 328 | #' 329 | #' @description 330 | #' Fetch a tibble of series and sample information by querying the arcsh4 331 | #' dataset by GEO sample (GSE) or sample (GSM) ids. 332 | #' 333 | #' For each unique GEO series identifier (`"GSEnnnn"`), we will check if the 334 | #' ARCHS4 dataset is missing any of its samples when `check_missing_samples` 335 | #' is set to `TRUE` (default). 336 | #' 337 | #' @export 338 | #' 339 | #' @param id a character vector of GEO series or sample ids. 340 | #' @param columns the names of the sample metadata columns desired. This 341 | #' defaults to `c("Sample_title", "Sample_source_name_ch1")`. The values 342 | #' in `columns` must be a subset of the values enumerated in 343 | #' [archs4_sample_covariates()]. 344 | #' @param sample_table the output from [archs4_sample_table()], which lists 345 | #' the series_id,sample_id combinations found in the ARCHS4 repository. 346 | #' @param sample_covariates the `data.frame`-definition of the sample covariates 347 | #' found in the ARCHS4 datasetes, which is constructed via a call to 348 | #' [archs4_sample_covariates()]. The parameter is included in here so that 349 | #' a cached version of this `data.frame` can be re-used. 350 | #' @param check_missing_samples When `TRUE` (the default), this function will 351 | #' check every unique GEO series identifier (`"GSEnnnn"`) for missing samples 352 | #' by using an NCBI Rest service via a call to [archs4_series_status()], 353 | #' and [lookup_gse()]. 354 | #' @param datadir the directory that holds the archs4 data 355 | #' @return a tibble of series_id, sample_id, sample_h5idx, sample_title, and 356 | #' sample_name columns. If the query sample or series query can't be found, 357 | #' then there will be an `NA` value for these columns. The `query_type` column 358 | #' will indicat whether the row was returned from querying by series or by 359 | #' sample. 360 | #' @examples 361 | #' si <- archs4_sample_info("GSE52564") # ben barres transcriptome db ... 362 | archs4_sample_info <- function(id, 363 | columns = c("Sample_title", "Sample_source_name_ch1"), 364 | sample_table = archs4_sample_table(datadir = datadir), 365 | sample_covariates = archs4_sample_covariates(datadir), 366 | check_missing_samples = TRUE, 367 | datadir = getOption("archs4.datadir"), ...) { 368 | input <- geo_id_type(id) 369 | bad.id <- filter(input, type == "unknown") 370 | if (nrow(bad.id)) { 371 | stop("Malformed identifiers in query: ", paste(bad.id$id, collapse = ", ")) 372 | } 373 | 374 | # check sample metadata columns 375 | get.meta <- !is.null(columns) && length(columns) 376 | if (get.meta) { 377 | columns <- unique(columns) %>% 378 | assert_character(any.missing = FALSE) %>% 379 | assert_subset(sample_covariates$name) 380 | } else { 381 | columns <- character() 382 | } 383 | 384 | # This hurts: I'm doing this to ensure that queries to this function where 385 | # all `id`s are not found in the ARCHS4 repository still return a tibble 386 | # of the right dimensions, but has NAs in most places 387 | dummy <- tibble( 388 | series_id = character(), 389 | sample_id = character(), 390 | query_type = character(), 391 | sample_h5idx_gene = integer(), 392 | sample_h5idx_transcript = integer(), 393 | organism = character()) 394 | for (cname in columns) dummy[[cname]] <- character() 395 | 396 | series <- input %>% 397 | filter(type == "series") %>% 398 | rename(series_id = id) %>% 399 | left_join(sample_table, by = "series_id") 400 | bad.series <- series %>% 401 | filter(is.na(organism)) %>% 402 | distinct(series_id) %>% 403 | mutate(query_type = "series") 404 | if (nrow(bad.series)) { 405 | warning(nrow(bad.series), " series identifiers not found", 406 | immediate. = TRUE) 407 | series <- anti_join(series, bad.series, by = "series_id") 408 | bad.series <- bind_rows(bad.series, dummy) 409 | } 410 | 411 | with.missing.samples <- NULL 412 | if (check_missing_samples && nrow(series)) { 413 | for (s in unique(series[["series_id"]])) { 414 | check <- archs4_series_status(s, sample_table = sample_table, 415 | datadir = datadir) 416 | missing <- filter(check, !in_archs4) 417 | if (nrow(missing)) { 418 | msg <- paste( 419 | sprintf("The %s series has %d missing samples in ARCHS4:\n *", 420 | s, nrow(missing)), 421 | paste(missing[["sample_id"]], collapse ="\n * "), 422 | "\n") 423 | warning(msg, immediate. = TRUE, call. = FALSE) 424 | # TODO: Add `with_missing_samples` parameter, or does it make sense 425 | # to just automatically append these? 426 | # it hurts me to do this: 427 | with.missing.samples <- bind_rows(with.missing.samples, missing) 428 | } 429 | } 430 | } 431 | 432 | samples <- input %>% 433 | filter(type == "sample") %>% 434 | rename(sample_id = id) %>% 435 | left_join(sample_table, by = "sample_id") 436 | bad.samples <- samples %>% 437 | filter(is.na(organism)) %>% 438 | distinct(sample_id) %>% 439 | mutate(query_type = "sample") 440 | if (nrow(bad.samples)) { 441 | warning(nrow(bad.samples), " sample identifiers not found", 442 | immediate. = TRUE) 443 | samples <- anti_join(samples, bad.samples, by = "sample_id") 444 | bad.samples <- bind_rows(bad.samples, dummy) 445 | } 446 | 447 | query <- series %>% 448 | bind_rows(samples) %>% 449 | select(series_id, sample_id, query_type = type, 450 | sample_h5idx_gene, sample_h5idx_transcript, organism) 451 | 452 | # Only perform this if there >= 1 series or sample identifiers were found 453 | if (nrow(query) && get.meta) { 454 | res <- query %>% 455 | group_by(organism) %>% 456 | do({ 457 | .with_sample_info(., columns, .$organism[1L], sample_covariates, datadir) 458 | }) %>% 459 | ungroup 460 | } else { 461 | res <- query 462 | } 463 | 464 | out <- res %>% 465 | bind_rows(bad.series) %>% 466 | bind_rows(bad.samples) 467 | 468 | col.order <- c("series_id", "sample_id", columns) 469 | col.order <- c(col.order, setdiff(colnames(out), col.order)) 470 | out[, col.order] 471 | } 472 | 473 | #' Helper function that implements sample-level meatdata retrieval 474 | #' 475 | #' This function is only meant to be called within the `do({})` block in the 476 | #' [archs4_sample_info()] function, and as such does no argument checking and 477 | #' is intentionally not exported. 478 | #' 479 | #' @noRd 480 | #' 481 | #' @importFrom rhdf5 h5read 482 | .with_sample_info <- function(x, columns, organism, sample_covariates, datadir) { 483 | organism <- organism[1L] 484 | h5g.fn <- archs4_file_path(paste0(organism, "_gene"), datadir = datadir) 485 | h5t.fn <- archs4_file_path(paste0(organism, "_transcript"), datadir = datadir) 486 | 487 | out <- sapply(columns, function(i) { 488 | rep(NA_character_, nrow(x)) 489 | }, simplify = FALSE) 490 | 491 | # ensure that we only lookup covariates that are available for this organism 492 | scovs <- sample_covariates[["name"]][sample_covariates[[organism]]] 493 | columns <- intersect(columns, scovs) 494 | 495 | # we first try to get metadata from the gene matrix 496 | ginfo <- !is.na(x$sample_h5idx_gene) 497 | if (any(ginfo)) { 498 | i <- list(x$sample_h5idx_gene[ginfo]) 499 | for (what in columns) { 500 | out[[what]][ginfo] <- .h5read(h5g.fn, paste0("meta/", what), i) 501 | } 502 | } 503 | 504 | # anything left to fetch from the transcript hdf5? 505 | tinfo <- !ginfo 506 | if (any(tinfo)) { 507 | i <- list(x$sample_h5idx_transcript[tinfo]) 508 | for (what in columns) { 509 | out[[what]][tinfo] <- .h5read(h5t.fn, paste0("meta/", what), i) 510 | } 511 | } 512 | 513 | res <- bind_cols(out) 514 | bind_cols(x, res) 515 | } 516 | 517 | #' Lists the GEO series and samples available in the human and mouse datasets 518 | #' 519 | #' This function queries the human and mouse gene expression matrices from 520 | #' the ARCHS4 data release and combines their GEO series and sample identifiers 521 | #' into a long table, annotated with the organism the samples come from. 522 | #' 523 | #' This function executes very quickly (less thatn 0.10th of a second), so most 524 | #' sample-level query functions in this package which you would think would 525 | #' benefit from specifying human/mouse don't have to, as they will join into 526 | #' this table to find out what species is being queried. 527 | #' 528 | #' @export 529 | #' @importFrom readr read_csv 530 | #' @param feature_type currently, the `"gene"` and `"transcript"` datasets 531 | #' are not the same. 532 | #' @param unroll_series There are some malformed series identifiers, like 533 | #' `"GSE36025Xx-xXGSE49417Xx-xXGSE49847"` when the same sample_id appears 534 | #' in multiple series. When this is `TRUE` (default), the series_id's are 535 | #' unrolled and cleaned up. 536 | #' @param datadir the directory that holds the archs4 data 537 | #' @return a tibble of series_id, sample_id, species columns 538 | archs4_sample_table <- function(feature_type = c("all", "gene", "transcript"), 539 | unroll_series = TRUE, 540 | datadir = getOption("archs4.datadir")) { 541 | feature_type <- match.arg(feature_type) 542 | 543 | if (feature_type == "all") { 544 | gst <- archs4_sample_table("gene", unroll_series, datadir) 545 | tst <- archs4_sample_table("transcript", unroll_series, datadir) 546 | res <- gst %>% 547 | full_join(tst, by = c("series_id", "sample_id"), 548 | suffix = c("_gene", "_transcript")) %>% 549 | mutate(organism = ifelse(is.na(organism_gene), organism_transcript, 550 | organism_gene)) %>% 551 | mutate(organism_gene = NULL, organism_transcript = NULL) %>% 552 | select(series_id, sample_id, organism, everything()) 553 | } else { 554 | res <- map_dfr(archs4_sources(), function(source) { 555 | fkey <- paste(source, feature_type, sep = "_") 556 | h5.fn <- archs4_file_path(fkey, datadir = datadir) 557 | dat <- tibble(series_id = trimws(h5read(h5.fn, "meta/Sample_series_id")), 558 | sample_id = trimws(h5read(h5.fn, "meta/Sample_geo_accession")), 559 | organism = source, 560 | sample_h5idx = seq(sample_id)) 561 | if (feature_type == "gene") { 562 | dat[["a4libsize"]] <- as.vector(.h5read(h5.fn, 'meta/reads_aligned')) 563 | 564 | # Load manually estimated library size and normalization factors. 565 | sfn <- file.path(dirname(h5.fn), paste0(source, "_gene-normfactors.csv")) 566 | if (file.exists(sfn)) { 567 | sf <- suppressWarnings(read_csv(sfn, col_types = "cciidd")) 568 | sf <- select(sf, series_id, sample_id, libsize, normfactor) 569 | dat <- left_join(dat, sf, by = c("series_id", "sample_id")) 570 | } 571 | } 572 | if (unroll_series) { 573 | regex <- "[^GSEM0-9]+" 574 | dat <- separate_rows(dat, series_id, sep = regex) 575 | } 576 | }) 577 | } 578 | res 579 | } 580 | 581 | #' Lists the different sources ARCHS4 is built for 582 | #' 583 | #' We hardocde these values in a lot of places ... who knows if one day these 584 | #' are updated? 585 | #' 586 | #' @export 587 | #' @param datadir the directory that holds the archs4 data 588 | #' @return a character vector listing the different sources (organisms) that 589 | #' the ARCHS4 repository has data for. 590 | archs4_sources <- function(datadir = getOption("archs4.datadir")) { 591 | archs4_meta(datadir)[["sources"]] 592 | } 593 | 594 | #' Retrieves the meta information associated with an ARCHS4 datadir 595 | #' 596 | #' @export 597 | #' @importFrom yaml read_yaml 598 | #' 599 | #' @param datadir the directory that holds the archs4 data 600 | #' @return a list-representation of the `meta.yaml` file 601 | archs4_meta <- function(datadir = getOption("archs4.datadir")) { 602 | assert_directory(datadir, "r") 603 | fpath <- assert_file_exists(file.path(datadir, "meta.yaml"), "r", "yaml") 604 | yml <- yaml::read_yaml(fpath) 605 | 606 | # doing some validation on this file 607 | assert_character(yml[["sources"]]) 608 | assert_list(yml[["files"]]) 609 | yml 610 | } 611 | --------------------------------------------------------------------------------