├── .Rbuildignore
├── .gitignore
├── vignettes
    ├── images
    │   ├── icons
    │   │   ├── home.png
    │   │   ├── next.png
    │   │   ├── note.png
    │   │   ├── prev.png
    │   │   ├── tip.png
    │   │   ├── up.png
    │   │   ├── INFO.txt
    │   │   ├── caution.png
    │   │   ├── download.png
    │   │   ├── example.png
    │   │   ├── important.png
    │   │   ├── warning.png
    │   │   ├── callouts
    │   │   │   ├── 1.png
    │   │   │   ├── 10.png
    │   │   │   ├── 11.png
    │   │   │   ├── 12.png
    │   │   │   ├── 13.png
    │   │   │   ├── 14.png
    │   │   │   ├── 15.png
    │   │   │   ├── 2.png
    │   │   │   ├── 3.png
    │   │   │   ├── 4.png
    │   │   │   ├── 5.png
    │   │   │   ├── 6.png
    │   │   │   ├── 7.png
    │   │   │   ├── 8.png
    │   │   │   └── 9.png
    │   │   └── callout-border.png
    │   └── GSE52564-samples-section.png
    ├── mystyles.css
    └── archs4.Rmd
├── inst
    ├── extdata
    │   ├── testdata-sample-ids.xlsx
    │   ├── blacklist.csv
    │   └── meta.yaml
    └── rmdparts
    │   ├── references.Rmd
    │   └── installation.Rmd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-archs4-functional.R
    │   ├── helper-all.R
    │   ├── test-bioc-containers.R
    │   ├── test-expression-concordance.R
    │   ├── test-expression.R
    │   ├── test-feature-info.R
    │   └── test-sample-retrieval.R
├── R
    ├── package.R
    ├── blacklist.R
    ├── validate.R
    ├── zzz.R
    ├── rmd-helpers.R
    ├── h5utils.R
    ├── tidy.R
    ├── data-directory.R
    ├── bioc-containers.R
    ├── geo-utils.R
    ├── Archs4Repository.R
    ├── archs4-features.R
    ├── expression.R
    └── archs4-functional.R
├── man
    ├── blacklist.Rd
    ├── datadir.Rd
    ├── feature_source.Rd
    ├── estimate_norm_factors.Rd
    ├── archs4_meta.Rd
    ├── Archs4Repository.Rd
    ├── libstats.Rd
    ├── geo_id_type.Rd
    ├── estimate_repository_norm_factors.Rd
    ├── archs4_sources.Rd
    ├── archs4_sample_covariates.Rd
    ├── query_geo.Rd
    ├── archs4_local_data_dir_validate.Rd
    ├── lookup_gse.Rd
    ├── archs4_file_info.Rd
    ├── feature_lookup.Rd
    ├── dot-h5read.Rd
    ├── retrieve_sra_metadata.Rd
    ├── archs4_series_status.Rd
    ├── archs4_feature_info.Rd
    ├── archs4_local_data_dir_create.Rd
    ├── lookup_biosamples.Rd
    ├── as.DGEList.Rd
    ├── archs4_sample_table.Rd
    ├── archs4_file_path.Rd
    ├── fetch_expression.Rd
    ├── create_augmented_feature_info.Rd
    └── archs4_sample_info.Rd
├── DESCRIPTION
├── NAMESPACE
├── README.Rmd
└── README.md


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | inst/doc
6 | 


--------------------------------------------------------------------------------
/vignettes/images/icons/home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/home.png


--------------------------------------------------------------------------------
/vignettes/images/icons/next.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/next.png


--------------------------------------------------------------------------------
/vignettes/images/icons/note.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/note.png


--------------------------------------------------------------------------------
/vignettes/images/icons/prev.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/prev.png


--------------------------------------------------------------------------------
/vignettes/images/icons/tip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/tip.png


--------------------------------------------------------------------------------
/vignettes/images/icons/up.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/up.png


--------------------------------------------------------------------------------
/vignettes/images/icons/INFO.txt:
--------------------------------------------------------------------------------
1 | These icons were taken from asciidoc:
2 | 
3 |   http://www.methods.co.nz/asciidoc/images/icons/
4 | 


--------------------------------------------------------------------------------
/vignettes/images/icons/caution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/caution.png


--------------------------------------------------------------------------------
/vignettes/images/icons/download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/download.png


--------------------------------------------------------------------------------
/vignettes/images/icons/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/example.png


--------------------------------------------------------------------------------
/vignettes/images/icons/important.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/important.png


--------------------------------------------------------------------------------
/vignettes/images/icons/warning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/warning.png


--------------------------------------------------------------------------------
/inst/extdata/testdata-sample-ids.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/inst/extdata/testdata-sample-ids.xlsx


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library("archs4")
2 | library("testthat")
3 | library("dplyr")
4 | library("rhdf5")
5 | 
6 | test_check("archs4")
7 | 


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/1.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/10.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/11.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/12.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/13.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/14.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/15.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/2.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/3.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/4.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/5.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/6.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/7.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/8.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callouts/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callouts/9.png


--------------------------------------------------------------------------------
/vignettes/images/icons/callout-border.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/icons/callout-border.png


--------------------------------------------------------------------------------
/vignettes/images/GSE52564-samples-section.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/denalitherapeutics/archs4/HEAD/vignettes/images/GSE52564-samples-section.png


--------------------------------------------------------------------------------
/inst/extdata/blacklist.csv:
--------------------------------------------------------------------------------
 1 | "series_id","reason"
 2 | "GSE88850","scRNAseq"
 3 | "GSE44618","scRNAseq"
 4 | "GSE85783","scRNAseq"
 5 | "GSE90014","scRNAseq"
 6 | "GSE99095","scRNAseq"
 7 | "GSE103239","scRNAseq"
 8 | "GSE95630","scRNAseq"
 9 | "GSE90015","methylation array?"
10 | 


--------------------------------------------------------------------------------
/R/package.R:
--------------------------------------------------------------------------------
 1 | # Should we import GEOquery?
 2 | 
 3 | #' @import checkmate
 4 | #' @import dplyr
 5 | #' @import purrr
 6 | #' @import readr
 7 | #' @import tidyr
 8 | NULL
 9 | 
10 | # re-export generics from other packages
11 | 
12 | #' @importFrom broom tidy
13 | #' @export tidy
14 | NULL


--------------------------------------------------------------------------------
/tests/testthat/test-archs4-functional.R:
--------------------------------------------------------------------------------
 1 | context("ARCHS4 Functional Interface")
 2 | 
 3 | if (!exists("a4")) {
 4 |   # This is loaded by the testthat/helper-all.R script when testthat is running
 5 |   # the unit tests, but included here for convenience when doing interactive
 6 |   # test development
 7 |   a4 <- Archs4Repository()
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/R/blacklist.R:
--------------------------------------------------------------------------------
 1 | #' Return a list of series/samples you may want to ignore for now
 2 | #'
 3 | #' The entries that appear here so far are because the data appear to come from
 4 | #' single-cell experiments
 5 | #'
 6 | #' @export
 7 | blacklist <- function() {
 8 |   fn <- system.file("extdata", "blacklist.csv", package = "archs4")
 9 |   read.csv(fn, stringsAsFactors = FALSE)
10 | }
11 | 


--------------------------------------------------------------------------------
/man/blacklist.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/blacklist.R
 3 | \name{blacklist}
 4 | \alias{blacklist}
 5 | \title{Return a list of series/samples you may want to ignore for now}
 6 | \usage{
 7 | blacklist()
 8 | }
 9 | \description{
10 | The entries that appear here so far are because the data appear to come from
11 | single-cell experiments
12 | }
13 | 


--------------------------------------------------------------------------------
/R/validate.R:
--------------------------------------------------------------------------------
 1 | is_archs4_expression_file <- function(fn) {
 2 |   assert_character(fn)
 3 |   # last N characters must be '_matrix.h5'
 4 |   suffix <- '_matrix.h5$'
 5 |   isTRUE(length(grep(suffix, fn)[1L]) == 1L)
 6 | }
 7 | 
 8 | is_geo_series_id <- function(id) {
 9 |   assert_character(id)
10 |   substr(id, 1, 3) == "GSE"
11 | }
12 | 
13 | is_geo_sample_id <- function(id) {
14 |   assert_character(id)
15 |   substr(id, 1, 3) == "GSM"
16 | }


--------------------------------------------------------------------------------
/man/datadir.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R
 3 | \name{datadir}
 4 | \alias{datadir}
 5 | \title{Retrieves the directory that contains the data for the Archs4Repository}
 6 | \usage{
 7 | datadir(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{an \code{Archs4Repository}}
11 | }
12 | \description{
13 | Retrieves the directory that contains the data for the Archs4Repository
14 | }
15 | 


--------------------------------------------------------------------------------
/man/feature_source.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/expression.R
 3 | \name{feature_source}
 4 | \alias{feature_source}
 5 | \title{Retrieves (organism) source of gene/transcript identifiers}
 6 | \usage{
 7 | feature_source(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A vector of identifiers}
11 | }
12 | \description{
13 | Parses ensembl identifiers and determines if they are for genes, or
14 | transcripts, as well as the organism they should belong to (human, mouse)
15 | }
16 | 


--------------------------------------------------------------------------------
/man/estimate_norm_factors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/expression.R
 3 | \name{estimate_norm_factors}
 4 | \alias{estimate_norm_factors}
 5 | \title{Calculates library size and norm factors for a specific dataset}
 6 | \usage{
 7 | estimate_norm_factors(a4, key, n = 500, logratioTrim = 0.3,
 8 |   sumTrim = 0.05, doWeighting = TRUE, Acutoff = -1e+10, p = 0.75,
 9 |   ...)
10 | }
11 | \description{
12 | This function will serialize the results of the library size and
13 | normalization factors into files inside \code{datadir(a4)}
14 | }
15 | 


--------------------------------------------------------------------------------
/man/archs4_meta.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R
 3 | \name{meta}
 4 | \alias{meta}
 5 | \alias{archs4_meta}
 6 | \title{Retrieves the meta information associated with an ARCHS4 datadir}
 7 | \usage{
 8 | meta(x)
 9 | 
10 | archs4_meta(datadir = getOption("archs4.datadir"))
11 | }
12 | \arguments{
13 | \item{x}{an \code{Archs4Repository}}
14 | 
15 | \item{datadir}{the directory that holds the archs4 data}
16 | }
17 | \value{
18 | a list-representation of the \code{meta.yaml} file
19 | }
20 | \description{
21 | Retrieves the meta information associated with an ARCHS4 datadir
22 | }
23 | 


--------------------------------------------------------------------------------
/man/Archs4Repository.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R
 3 | \name{Archs4Repository}
 4 | \alias{Archs4Repository}
 5 | \title{An interface to a locally downloaded ARCHS4 dataset}
 6 | \usage{
 7 | Archs4Repository(datadir = getOption("archs4.datadir"))
 8 | }
 9 | \arguments{
10 | \item{datadir}{The directory that stores the ARCHS4 data.}
11 | }
12 | \value{
13 | an Arhcs4DataSet object
14 | }
15 | \description{
16 | This instantiates an object that acts as a central broker to handle queries
17 | against the ARCHS4 dataset. Please refer to the vignette for instructions
18 | on how to setup a local directory to act as an Archs4Repository.
19 | }
20 | 


--------------------------------------------------------------------------------
/man/libstats.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R
 3 | \name{libstats}
 4 | \alias{libstats}
 5 | \title{Extract the read depth and normalization factors for the samples}
 6 | \usage{
 7 | libstats(x, with_a4libsize = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{an \code{Archs4Repository}}
11 | 
12 | \item{with_a4libsize}{If \code{TRUE}, includes an \code{a4libsize} column, which
13 | was extracted from the \code{meta/reads_aligned} hdf5 file. Defaults to \code{FALSE}.}
14 | }
15 | \value{
16 | a tibble with sample_id, a4libsize, libsize, normfactor
17 | }
18 | \description{
19 | Extract the read depth and normalization factors for the samples
20 | }
21 | 


--------------------------------------------------------------------------------
/man/geo_id_type.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/geo-utils.R
 3 | \name{geo_id_type}
 4 | \alias{geo_id_type}
 5 | \title{Classify a vector of sample or series GEO ID's as such}
 6 | \usage{
 7 | geo_id_type(id)
 8 | }
 9 | \arguments{
10 | \item{id}{a character vector of \code{GSEnnnnn} or \code{GSMnnnnn} ids}
11 | }
12 | \value{
13 | a tibble of \code{unique(id)} indicating if the id is a \code{"series"}
14 | (GSEnnnnn), \code{"sample"} (GSMnnnnn), or \code{"unknown"}.
15 | }
16 | \description{
17 | GEO series identifiers all start with GSE and sample identifiers all
18 | start with GSM. We use that to identify what types of identifiers are
19 | passed into \code{id}
20 | }
21 | 


--------------------------------------------------------------------------------
/tests/testthat/helper-all.R:
--------------------------------------------------------------------------------
 1 | # These commands are run before the tests and are made available for everyone
 2 | # to use.
 3 | 
 4 | # Instantiate a universal Archs4Repository because it takes a little while
 5 | # (~4s) to construct one.
 6 | if (!interactive() && identical(Sys.getenv("NOT_CRAN"), "true")) {
 7 |   # Putting it in this block, because helper-* function are run for convenience
 8 |   # in several scenarios, including when we:
 9 |   #
10 |   #   1. Run devtools::load_all(); and
11 |   #   2. Run devtools::document()
12 |   #
13 |   # In these scenarios, I don't want to instantiated the Archs4Repository, so
14 |   # the if () shoudl only evaluate to true when we are actually *testing*.
15 |   a4 <- Archs4Repository()
16 | }
17 | 


--------------------------------------------------------------------------------
/man/estimate_repository_norm_factors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/expression.R
 3 | \name{estimate_repository_norm_factors}
 4 | \alias{estimate_repository_norm_factors}
 5 | \title{Estimate normalization factors for datasets in the Archs4Repository}
 6 | \usage{
 7 | estimate_repository_norm_factors(a4)
 8 | }
 9 | \arguments{
10 | \item{a4}{The \code{Arcsh4Repository}}
11 | }
12 | \value{
13 | Invisibly returns a tibble of the the library size and normalization
14 | factors for the expression data in \code{a4} (invisibly).
15 | }
16 | \description{
17 | This function will serialize the results of the library size and
18 | normalization factors into files inside \code{datadir(a4)}
19 | }
20 | 


--------------------------------------------------------------------------------
/man/archs4_sources.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R
 3 | \name{sources}
 4 | \alias{sources}
 5 | \alias{archs4_sources}
 6 | \title{Lists the different sources ARCHS4 is built for}
 7 | \usage{
 8 | sources(x)
 9 | 
10 | archs4_sources(datadir = getOption("archs4.datadir"))
11 | }
12 | \arguments{
13 | \item{x}{an \code{Archs4Repository}}
14 | 
15 | \item{datadir}{the directory that holds the archs4 data}
16 | }
17 | \value{
18 | a character vector listing the different sources (organisms) that
19 | the ARCHS4 repository has data for.
20 | }
21 | \description{
22 | We hardocde these values in a lot of places ... who knows if one day these
23 | are updated?
24 | }
25 | 


--------------------------------------------------------------------------------
/man/archs4_sample_covariates.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R
 3 | \name{sample_covariates}
 4 | \alias{sample_covariates}
 5 | \alias{archs4_sample_covariates}
 6 | \title{Retrieves tibble of sample-level covariates available in mouse and human data.}
 7 | \usage{
 8 | sample_covariates(x, ...)
 9 | 
10 | archs4_sample_covariates(datadir = getOption("archs4.datadir"), ...)
11 | }
12 | \arguments{
13 | \item{x}{an \code{Archs4Repository}}
14 | 
15 | \item{datadir}{the directory that holds the archs4 data}
16 | }
17 | \description{
18 | Enumerate the sample covariates available in mouse and human data.
19 | Note that the covariates available in human and mouse are the same
20 | between the gene and transcript level files
21 | }
22 | 


--------------------------------------------------------------------------------
/man/query_geo.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/geo-utils.R
 3 | \name{query_geo}
 4 | \alias{query_geo}
 5 | \title{Query NCBI GEO through its REST interface}
 6 | \source{
 7 | https://www.ncbi.nlm.nih.gov/geo/info/download.html
 8 | }
 9 | \usage{
10 | query_geo(accession, target = c("self", "gsm", "gpl", "gse", "all"),
11 |   validate = FALSE, verbose = FALSE)
12 | }
13 | \arguments{
14 | \item{accession}{Scalar character, GEO identifier for a series (GSE), a
15 | sample (GSM) or a platform (GPL).}
16 | 
17 | \item{validate}{Scalar boolean, validate the retrieved xml file against
18 | NCBI's schema?}
19 | }
20 | \value{
21 | xml2::xml_document object
22 | }
23 | \description{
24 | Query NCBI GEO through its REST interface
25 | }
26 | \examples{
27 | query_geo("GSE109171")
28 | }
29 | 


--------------------------------------------------------------------------------
/tests/testthat/test-bioc-containers.R:
--------------------------------------------------------------------------------
 1 | context("Bioconductor Containers")
 2 | 
 3 | if (!exists("a4")) {
 4 |   a4 <- Archs4Repository()
 5 | }
 6 | 
 7 | test_that("as.DGEList creates gene- and sample-level DGELists", {
 8 |   # This is a human dataset
 9 |   scovs <- c("Sample_title", "Sample_source_name_ch1")
10 |   yg <- as.DGEList(a4, "GSE52564", feature_type = "gene",
11 |                    sample_columns = scovs)
12 |   expect_true(all(substr(rownames(yg), 1, 7) == "ENSMUSG"))
13 |   for (cov in scovs) {
14 |     expect_is(yg$samples[[cov]], "character", info = paste("yg:", cov))
15 |   }
16 | 
17 | 
18 |   yt <- as.DGEList(a4, "GSE52564", feature_type = "transcript",
19 |                    sample_columns = scovs)
20 |   expect_true(all(substr(rownames(yt), 1, 7) == "ENSMUST"))
21 |   for (cov in scovs) {
22 |     expect_is(yt$samples[[cov]], "character", info = paste("yt:", cov))
23 |   }
24 | })


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
 1 | .onLoad <- function(libname, pkgname) {
 2 |   ## Setup default option values
 3 |   opts <- options()
 4 | 
 5 |   pkg.opts <- list(
 6 |     archs4.datadir='~/.archs4data')
 7 |   toset <- !(names(pkg.opts) %in% names(opts))
 8 |   if (any(toset)) {
 9 |     options(pkg.opts[toset])
10 |   }
11 | 
12 |   ddir <- getOption("archs4.datadir")
13 |   kosher <- archs4_local_data_dir_validate(echo = FALSE, datadir = ddir)
14 |   if (!isTRUE(kosher)) {
15 |     packageStartupMessage(
16 |       "Note that your default archs4 data directory is NOT setup correctly\n\n",
17 |       "  * Run `archs4_local_data_dir_validate()` to diagnose\n",
18 |       "  * Refer to the ARCHS4 Data Download section of the archs4 vignette ",
19 |       "for more information\n\n",
20 |       "Your default archs4 data directory (`getOption(\"archs4.datadir\")`) ",
21 |       "is:\n\n  ", ddir, "\n\n")
22 |   }
23 | 
24 |   invisible()
25 | }
26 | 


--------------------------------------------------------------------------------
/man/archs4_local_data_dir_validate.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data-directory.R
 3 | \name{archs4_local_data_dir_validate}
 4 | \alias{archs4_local_data_dir_validate}
 5 | \title{Check "the health" of a local ARCHS4 data datadir}
 6 | \usage{
 7 | archs4_local_data_dir_validate(echo = TRUE,
 8 |   datadir = getOption("archs4.datadir"))
 9 | }
10 | \arguments{
11 | \item{echo}{echo validation diagnostic message to stdout via \code{\link[base:cat]{base::cat()}}}
12 | 
13 | \item{datadir}{the path to the datadir that stores local ARCHS4 data.
14 | Defaults to \code{getOption("archs4.datadir")}.}
15 | }
16 | \value{
17 | A string that indicates "what's wrong", or \code{TRUE} if validation
18 | succeeds.
19 | }
20 | \description{
21 | This function will notify the suer which files are missing from the
22 | ARCHS4 data datadir, and what course of action they can use to
23 | fix it.
24 | }
25 | 


--------------------------------------------------------------------------------
/man/lookup_gse.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/geo-utils.R
 3 | \name{lookup_gse}
 4 | \alias{lookup_gse}
 5 | \title{Retrieve information about a GEO series}
 6 | \usage{
 7 | lookup_gse(accession, fields = c("Accession", "Title", "Summary",
 8 |   "Overall-Design", "Type", "Pubmed-ID", "Sample"), ...)
 9 | }
10 | \arguments{
11 | \item{accession}{Scalar character, GEO series identifier e.g. GSE109171}
12 | 
13 | \item{fields}{Character vector specifying which fields to extract from the
14 | XML file returned by GEO}
15 | 
16 | \item{...}{Additional arguments passed on to the \code{query_geo} function.}
17 | }
18 | \value{
19 | List the requested \code{fields}
20 | }
21 | \description{
22 | Queries NCBI GEO's REST interface to retrieve e.g. title, summary and the
23 | list of samples for a GEO series.
24 | }
25 | \examples{
26 | if (interactive()) {
27 |   lookup_gse("GSE109171")
28 | }
29 | }
30 | 


--------------------------------------------------------------------------------
/man/archs4_file_info.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R
 3 | \name{file_info}
 4 | \alias{file_info}
 5 | \alias{archs4_file_info}
 6 | \title{Retrieves a table of files that back an Archs4Repository}
 7 | \usage{
 8 | file_info(x)
 9 | 
10 | archs4_file_info(datadir = getOption("archs4.datadir"))
11 | }
12 | \arguments{
13 | \item{x}{an \code{Archs4Repository}}
14 | 
15 | \item{datadir}{the directory that stores the ARCHS4 data files}
16 | }
17 | \description{
18 | A yaml iskept in the Archs4 data directory (\code{getOption("archs4.datadir")})
19 | that links keys, (ie. \code{mouse_gene}) to the name of the file in the directory.
20 | This abstraction is introduced so that the version of these files can be
21 | updated with new downloads, and the user simply has to modify the yaml file
22 | so that they are used downstream
23 | 
24 | Reference the "ARCHS4 Data Download" section in the vignette for more
25 | information.
26 | }
27 | 


--------------------------------------------------------------------------------
/tests/testthat/test-expression-concordance.R:
--------------------------------------------------------------------------------
 1 | context("Expression Concordance")
 2 | 
 3 | # There have been a few occassions where the ARCHS4 gene expression quantitation
 4 | # has lined up far from what is expected. For example:
 5 | #
 6 | # 1. the values in the human v4 gene-level expression matrices lined up poorly
 7 | #    with internal estimates of the same, and this concordance was resolved when
 8 | #    the v5 gene-level matrices were published.
 9 | # 2. The v5 mouse gene-level matrices look a bit off compared to internally
10 | #    processed data, but the v4 versions of the seemed more inline with
11 | #    expectation.
12 | 
13 | # As a "validation set", a subset of gene level
14 | # `edgeR::cpm(., prior.count = 3, log = TRUE)` values are provided for studies
15 | # processed by:
16 | # 1. A STAR + salmon + GENOCDE ref piline; and
17 | # 2. recount2
18 | 
19 | test_that("human expression is concordant with expectation", {
20 | 
21 | })
22 | 
23 | test_that("mouse expression is concoordant with expectation", {
24 | 
25 | })


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: archs4
 2 | Type: Package
 3 | Title: Wrappers to query and extract ARCHS4 data
 4 | Version: 0.5.2
 5 | Authors@R: c(
 6 |     person("Steve", "Lianoglou", , "lianoglou@dnli.com", c("aut", "cre")),
 7 |     person("Thomas", "Sandmann", , "sandmann@dnli.com", c("aut")),
 8 |     person("Denali Therapeutics", role = c("cph", "fnd"))
 9 |     )
10 | Description: An R interface to query and extract data from the ARCHS4 resource.
11 | License: Apache 2.0
12 | URL: https://github.com/denalitherapeutics/archs4#readme
13 | BugReports: https://github.com/denalitherapeutics/archs4/issues
14 | Imports:
15 |     broom,
16 |     checkmate,
17 |     dplyr,
18 |     edgeR,
19 |     purrr,
20 |     readr,
21 |     rentrez,
22 |     rhdf5,
23 |     tibble,
24 |     tidyr,
25 |     xml2,
26 |     yaml
27 | Suggests:
28 |     GenomicRanges,
29 |     rtracklayer,
30 |     testthat,
31 |     rmarkdown,
32 |     knitr
33 | Encoding: UTF-8
34 | LazyData: true
35 | Roxygen: list(markdown = TRUE)
36 | RoxygenNote: 6.1.0
37 | VignetteBuilder: knitr
38 | 


--------------------------------------------------------------------------------
/inst/rmdparts/references.Rmd:
--------------------------------------------------------------------------------
 1 | [//]: # (References ===========================================================)
 2 | 
 3 | [archs4dl]: https://amp.pharm.mssm.edu/archs4/download.html
 4 | [archs4eda]: https://amp.pharm.mssm.edu/archs4/data.html
 5 | [archs4pub]: https://www.nature.com/articles/s41467-018-03751-6
 6 | [archs4web]: https://amp.pharm.mssm.edu/archs4/
 7 | [barrespub]: http://www.jneurosci.org/content/34/36/11929.short
 8 | [barresgeo]: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE52564
 9 | [bioc]: http://bioconductor.org/
10 | [biocLite]: https://www.bioconductor.org/install/#why-biocLite
11 | [blurtonpub]: https://www.ncbi.nlm.nih.gov/pubmed/28426964
12 | [blurtongeo]: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE89189
13 | [cran]: https://cran.r-project.org/
14 | [geo]: https://www.ncbi.nlm.nih.gov/geo/
15 | [kallisto]: https://pachterlab.github.io/kallisto/about
16 | [lachmann]: http://www.mountsinai.org/profiles/alexander-lachmann
17 | [maayanlab]: http://labs.icahn.mssm.edu/maayanlab/
18 | [sra]: https://www.ncbi.nlm.nih.gov/sra
19 | 


--------------------------------------------------------------------------------
/man/feature_lookup.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R
 3 | \name{feature_lookup}
 4 | \alias{feature_lookup}
 5 | \title{Perform a loose/fuzzy lookup for a gene/transcript feature.}
 6 | \usage{
 7 | feature_lookup(x, query, feature_type = "gene", source = "human", ...)
 8 | }
 9 | \arguments{
10 | \item{x}{An Archs4Repository}
11 | 
12 | \item{query}{a character string of feature names to look for}
13 | 
14 | \item{feature_type}{"gene" or "transcript"}
15 | 
16 | \item{source}{organism dataset to lookup}
17 | }
18 | \value{
19 | a tibble of features that match against the query. The first column
20 | is the value of the query itself. If no match is found for a query, its
21 | row is all \code{NA}.
22 | }
23 | \description{
24 | This funciton facilitates exploratory data analyses by trying to find gene
25 | or transcripts by different type of identifiers (symbol, ensembl_id, etc).
26 | }
27 | \examples{
28 | a4 <- Archs4Repository()
29 | features <- feature_lookup(a4, c("CFAP65", "PECR", "ENSG00000131408"),
30 |                            feature_type = "gene", source ="human")
31 | }
32 | 


--------------------------------------------------------------------------------
/man/dot-h5read.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/h5utils.R
 3 | \name{.h5read}
 4 | \alias{.h5read}
 5 | \title{Read data from an HDF5 file "with caution"}
 6 | \usage{
 7 | .h5read(file, name, index = NULL, start = NULL, stride = NULL,
 8 |   block = NULL, count = NULL, compoundAsDataFrame = TRUE,
 9 |   callGeneric = TRUE, read.attributes = FALSE, drop = FALSE, ...,
10 |   native = FALSE, default_value = NA, default_dim = 1L)
11 | }
12 | \description{
13 | This is a simple wrapper to \code{\link[rhdf5:h5read]{rhdf5::h5read()}} which returns a default value
14 | if the "data element" (specified by \code{name}) does not exists within \code{file}.
15 | We use this file to read from ARCHS4 hdf5 files when we want to provide a
16 | little insurance to the evoling nature of their data formats.
17 | 
18 | For instance this function is used when we try to read somethign like
19 | \code{"meta/reads_aligned"} because this information was not provided in earlier
20 | versions of these datasets, however \code{"meta/genes"} may use \code{\link[rhdf5:h5read]{rhdf5::h5read()}}
21 | directly because this has been around since "the beginning"
22 | }
23 | 


--------------------------------------------------------------------------------
/R/rmd-helpers.R:
--------------------------------------------------------------------------------
 1 | #' Generates markdown enumeration of linked files to download for datadir
 2 | #'
 3 | #' @noRd
 4 | #'
 5 | #' @param datadir the archs4 data dir
 6 | #' @return a character string
 7 | md_archs4_download_bullet_list <- function(datadir = getOption("archs4.datadir")) {
 8 |   mdat <- archs4_meta(datadir)
 9 |   files <- mdat$files
10 |   sources <- sapply(files, "[[", "source")
11 |   sources <- setdiff(sources, "computed")
12 |   mdown <- lapply(sources, function(s) {
13 |     header <- sprintf("* %s\n", s)
14 |     items <- lapply(files, function(f) {
15 |       if (f$source != s) return(NULL)
16 |       # ftp:// links aren't rendered correctly in Rmd's, so we need to print
17 |       # http:// vs ftp:// links separately
18 |       if (substr(f$url, 1, 4) == "http") {
19 |         md <- sprintf("    - [`%s`](%s): %s", f$name, f$url, f$description)
20 |       } else {
21 |         md <- sprintf("    - `%s`: %s\n         %s", f$name, f$description, f$url)
22 |       }
23 |       md
24 |     })
25 |     items <- items[sapply(items, Negate(is.null))]
26 |     items <- paste(items, collapse = "\n")
27 |     paste0(header, items)
28 |   })
29 |   paste(mdown, collapse = "\n")
30 | }
31 | 


--------------------------------------------------------------------------------
/man/retrieve_sra_metadata.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/geo-utils.R
 3 | \name{retrieve_sra_metadata}
 4 | \alias{retrieve_sra_metadata}
 5 | \title{Retrieve metadata for an SRA accession}
 6 | \usage{
 7 | retrieve_sra_metadata(x, from = c("ena", "ncbi"))
 8 | }
 9 | \arguments{
10 | \item{x}{SRA identifier}
11 | 
12 | \item{from}{Scalar character, specifying either \code{ncbi} or \code{ena} as
13 | the source database}
14 | }
15 | \value{
16 | A tbl_df data.frame
17 | }
18 | \description{
19 | This function uses the EBI's or the NCBI's REST APIs to retrieve information
20 | about SRA data.
21 | Study accessions (ERP, SRP, DRP, PRJ prefixes), experiment accessions
22 | (ERX, SRX, DRX prefixes), sample accessions (ERS, SRS, DRS, SAM prefixes)
23 | and run accessions (ERR, SRR, DRR prefixes) can be supplied.
24 | For more information see \url{http://www.ebi.ac.uk/ena/browse/file-reports}
25 | }
26 | \note{
27 | The output data.frame will be different for the two source databases.
28 | }
29 | \examples{
30 | if (interactive()) {
31 |    # retrieve study annotations
32 |    retrieve_sra_metadata("SRP066489")
33 |    # paired-end samples
34 |    retrieve_sra_metadata("PRJEB2054", "ena") \%>\%
35 |    dplyr::filter(sample_accession == "SAMEA728920")
36 | }
37 | }
38 | 


--------------------------------------------------------------------------------
/man/archs4_series_status.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R
 3 | \name{series_status}
 4 | \alias{series_status}
 5 | \alias{archs4_series_status}
 6 | \title{Checks which samples from a single series are present/absent.}
 7 | \usage{
 8 | series_status(x, id, ...)
 9 | 
10 | archs4_series_status(id, sample_table = archs4_sample_table(datadir =
11 |   datadir), datadir = getOption("archs4.datadir"), ...)
12 | }
13 | \arguments{
14 | \item{x}{an \code{Archs4Repository}}
15 | 
16 | \item{id}{a single GEO series id, ie. \code{"GSEnnnnn"}}
17 | 
18 | \item{sample_table}{the output from \code{\link[=archs4_sample_table]{archs4_sample_table()}}, which lists
19 | the series_id,sample_id combinations found in the ARCHS4 repository.}
20 | 
21 | \item{datadir}{the directory that holds the archs4 data}
22 | }
23 | \value{
24 | a tibble of information for a series.
25 | }
26 | \description{
27 | Due to download or alignment issues, the ARCHS4 data processing pipeline may
28 | not include all of the samples included in a particular GEO series. This
29 | function will return a table with an \code{in_archs4} columns that indicates
30 | whether or not a sample from a particular series is present in ARCHS4.
31 | }
32 | \examples{
33 | info <- archs4_series_status("GSE89189")
34 | }
35 | 


--------------------------------------------------------------------------------
/tests/testthat/test-expression.R:
--------------------------------------------------------------------------------
 1 | context("Expression")
 2 | 
 3 | if (!exists("a4")) {
 4 |   a4 <- Archs4Repository()
 5 | }
 6 | 
 7 | test_that("fetch_expression grabs the right count data", {
 8 |   gnz <- c("IGFL3", "GSTA4", "MRPS21")
 9 |   h5.fn <- file_path(a4, "human_gene")
10 |   h5.idx <- match(gnz, rhdf5::h5read(h5.fn, "meta/genes"))
11 | 
12 |   for (i in seq(h5.idx)) {
13 |     gene <- gnz[i]
14 |     h5idx <- h5.idx[i]
15 | 
16 |     counts <- rhdf5::h5read(h5.fn, "data/expression", list(h5idx, NULL))
17 |     counts <- as.vector(counts)
18 |     names(counts) <- rhdf5::h5read(h5.fn, "meta/Sample_geo_accession")
19 |     counts <- counts[!is.na(counts)]
20 | 
21 |     samples <- sample_table(a4) %>%
22 |       filter(!is.na(libsize), !is.na(normfactor)) %>%
23 |       select(series_id, sample_id) %>%
24 |       semi_join(tibble(sample_id = names(counts)), by = "sample_id") %>%
25 |       distinct(sample_id, .keep_all = TRUE)
26 | 
27 |     counts <- counts[samples$sample_id]
28 | 
29 |     res <- fetch_expression(a4, gene, samples = samples,
30 |                             feature_type = "gene", source = "human")
31 |     sample_key <- paste(res$series_id, res$sample_id, sep = "_")
32 |     expect_true(sum(duplicated(sample_key)) == 0, info = gene)
33 |     expect_setequal(names(counts), res$sample_id)
34 |     xref <- match(names(counts), res$sample_id)
35 |     expect_equal(unname(counts), res$count[xref])
36 |   }
37 | })
38 | 


--------------------------------------------------------------------------------
/man/archs4_feature_info.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R
 3 | \name{feature_info}
 4 | \alias{feature_info}
 5 | \alias{archs4_feature_info}
 6 | \title{Retrieves the feature (gene/transcript) information for the archs4 data}
 7 | \usage{
 8 | feature_info(x, feature_type = "gene", source = "human",
 9 |   augmented = TRUE, ...)
10 | 
11 | archs4_feature_info(feature_type = "gene", source = "human",
12 |   augmented = TRUE, datadir = getOption("archs4.datadir"), ...)
13 | }
14 | \arguments{
15 | \item{x}{an \code{Archs4Repository}}
16 | 
17 | \item{feature_type}{gene or transcript?}
18 | 
19 | \item{source}{human or mouse}
20 | 
21 | \item{augmented}{include extra gene- or transcript-level features?
22 | Default: \code{TRUE}}
23 | 
24 | \item{...}{pass through}
25 | 
26 | \item{datadir}{the directory that stores the ARCHS4 data files}
27 | }
28 | \value{
29 | a tibble of information
30 | }
31 | \description{
32 | Only the gene symbols (\code{meta/genes} in gene expression hd5 file) or entrez
33 | transcript identifiers (\code{meta/transcript} for the transcript hdf5 file) are
34 | stored in thse data. We use \code{\link[=create_augmented_feature_info]{create_augmented_feature_info()}} function to
35 | generate and store extra metadata for these features, which are then appended
36 | to these identifiers with this function.
37 | }
38 | \seealso{
39 | \code{\link[=create_augmented_feature_info]{create_augmented_feature_info()}}
40 | }
41 | 


--------------------------------------------------------------------------------
/man/archs4_local_data_dir_create.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data-directory.R
 3 | \name{archs4_local_data_dir_create}
 4 | \alias{archs4_local_data_dir_create}
 5 | \title{Initialize a local datadir to act as an ARCHS4 data datadir}
 6 | \usage{
 7 | archs4_local_data_dir_create(datadir = getOption("archs4.datadir"),
 8 |   stop_if_exists = TRUE)
 9 | }
10 | \arguments{
11 | \item{datadir}{the path to the datadir to create (or initialize) as an
12 | ARCHS4 data datadir.}
13 | 
14 | \item{stop_if_exists}{by default, this function will \code{stop} if \code{datadir}
15 | already exists. Set this to \code{FALSE} to continue. Setting it to \code{FALSE} is
16 | convenient to initialize the target \code{datadir} with a \code{meta.yaml} file.
17 | If a \code{meta.yaml} file already exists in \code{datadir}, then this function
18 | will stop unconditionally. Move the \code{datadir/meta.yaml} out of the way
19 | if you simply want to refresh it with the default version.}
20 | }
21 | \value{
22 | invisibly returns the path to the \code{meta.yaml} in the target
23 | \code{datadir}
24 | }
25 | \description{
26 | Initialize a local datadir to act as an ARCHS4 data datadir
27 | }
28 | \details{
29 | A local datadir needs to be created and initialized (wth a \code{meta.yaml}
30 | file), to house ARCHS4 data for use in an Archs4Repository. This function
31 | creates that datadir and copies an initial \code{meta.yaml} file.
32 | 
33 | Please refer to the vignette section "ARCHS4 Data Download" for more
34 | details.
35 | }
36 | 


--------------------------------------------------------------------------------
/man/lookup_biosamples.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/geo-utils.R
 3 | \name{lookup_biosamples}
 4 | \alias{lookup_biosamples}
 5 | \title{Retrieve sample annotations from NCBI's Biosample database}
 6 | \source{
 7 | https://www.ncbi.nlm.nih.gov/books/NBK25499/#\emph{chapter4_ESearch}
 8 | }
 9 | \usage{
10 | lookup_biosamples(x, retmax = 1e+05 - 1L)
11 | }
12 | \arguments{
13 | \item{x}{Character vector of sample identifiers to search the Biosample
14 | database for. Typically either \code{BioSample (SAMN)}, \code{SRA (SRS)} or
15 | \code{GEO (GSM)} accession numbers.}
16 | 
17 | \item{retmax}{Scalar integer, the maximum number of (total) matches to
18 | retrieve from Entrez. See
19 | \url{https://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESearch_}
20 | for details. The number of records that can be retrieved in one query
21 | must be < 100,000.}
22 | }
23 | \value{
24 | A tbl_df data.frame with sample annotations. Column names and numbers
25 | vary depending on the attributes available in the Biosample database.
26 | }
27 | \description{
28 | This function uses the \code{rentrez} package to retrieve sample annotations
29 | from NCBI's Biosample database.
30 | }
31 | \examples{
32 | if (interactive()) {
33 |  # BioSample identifiers
34 |  lookup_biosamples(c("GSM1947162", "GSM1947179"))
35 |  # mixed SRS and GSM identifiers
36 |  lookup_biosamples(c("SRS1171537", "SRS1171536", "GSM1947179"))
37 |  # mixed samples from two different studies (with different attributes)
38 |  lookup_biosamples(c("SRS1171537", "SRS1271536"))
39 | }
40 | }
41 | 


--------------------------------------------------------------------------------
/man/as.DGEList.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/bioc-containers.R
 3 | \name{as.DGEList}
 4 | \alias{as.DGEList}
 5 | \title{Create a DGEList for the expression data of a series or set of samples.}
 6 | \usage{
 7 | as.DGEList(x, id, features = NULL, sample_columns = c("Sample_title",
 8 |   "Sample_source_name_ch1"), feature_type = c("gene", "transcript"),
 9 |   row_id = c("ensembl", "symbol"), check_missing_samples = TRUE)
10 | }
11 | \arguments{
12 | \item{id}{a vector of series or sample id's.}
13 | 
14 | \item{features}{a feature-descriptor of the features you want to include
15 | counts for}
16 | 
17 | \item{sample_columns}{the names of the sample covariates that are stored
18 | in the ARCHS4 Dataset; a complete list of what covariates are available
19 | in the ARCHS4 dataset is found using the \code{\link[=archs4_sample_covariates]{archs4_sample_covariates()}}
20 | function.}
21 | 
22 | \item{feature_type}{do you want \code{"gene"} or \code{"transcript"} level expression?}
23 | 
24 | \item{row_id}{either \code{"ensembl"} or \code{"symbol"}. If this is \code{"ensembl"} and
25 | \code{feature_type == "transcript"}, then we remove the rows from the count
26 | dataset that we couldn't map symbol -> ensembl_gene_id for.}
27 | }
28 | \value{
29 | a \code{DGEList} of results
30 | }
31 | \description{
32 | Create a DGEList for the expression data of a series or set of samples.
33 | }
34 | \examples{
35 | a4 <- Archs4Repository()
36 | y <- as.DGEList(a4, "GSE89189", feature_type = "gene")
37 | }
38 | \seealso{
39 | \code{\link[=fetch_expression]{fetch_expression()}}
40 | }
41 | 


--------------------------------------------------------------------------------
/man/archs4_sample_table.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R
 3 | \name{sample_table}
 4 | \alias{sample_table}
 5 | \alias{archs4_sample_table}
 6 | \title{Lists the GEO series and samples available in the human and mouse datasets}
 7 | \usage{
 8 | sample_table(x, ...)
 9 | 
10 | archs4_sample_table(feature_type = c("all", "gene", "transcript"),
11 |   unroll_series = TRUE, datadir = getOption("archs4.datadir"))
12 | }
13 | \arguments{
14 | \item{x}{an \code{Archs4Repository}}
15 | 
16 | \item{feature_type}{currently, the \code{"gene"} and \code{"transcript"} datasets
17 | are not the same.}
18 | 
19 | \item{unroll_series}{There are some malformed series identifiers, like
20 | \code{"GSE36025Xx-xXGSE49417Xx-xXGSE49847"} when the same sample_id appears
21 | in multiple series. When this is \code{TRUE} (default), the series_id's are
22 | unrolled and cleaned up.}
23 | 
24 | \item{datadir}{the directory that holds the archs4 data}
25 | }
26 | \value{
27 | a tibble of series_id, sample_id, species columns
28 | }
29 | \description{
30 | This function queries the human and mouse gene expression matrices from
31 | the ARCHS4 data release and combines their GEO series and sample identifiers
32 | into a long table, annotated with the organism the samples come from.
33 | }
34 | \details{
35 | This function executes very quickly (less thatn 0.10th of a second), so most
36 | sample-level query functions in this package which you would think would
37 | benefit from specifying human/mouse don't have to, as they will join into
38 | this table to find out what species is being queried.
39 | }
40 | 


--------------------------------------------------------------------------------
/man/archs4_file_path.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R
 3 | \name{file_path}
 4 | \alias{file_path}
 5 | \alias{archs4_file_path}
 6 | \title{Identify the file path on the system for specific ARCHS4 files.}
 7 | \usage{
 8 | file_path(x, key)
 9 | 
10 | archs4_file_path(key, stop_if_missing = TRUE, na_missing = TRUE,
11 |   file_info = archs4_file_info(datadir),
12 |   datadir = getOption("archs4.datadir"))
13 | }
14 | \arguments{
15 | \item{x}{an \code{Archs4Repository}}
16 | 
17 | \item{key}{the lookup key for the file, ie. \code{"human_gene"} or \code{"mouse_gene"}.
18 | The known keys are enumerated in \code{archs4_file_info()$key} column.}
19 | 
20 | \item{stop_if_missing}{defaults to \code{TRUE}, which causes this function to
21 | throw an error if the file does not exist at the expected \code{file_path}.
22 | Set this to \code{FALSE} to simply raise a warning}
23 | 
24 | \item{na_missing}{by default, we set paths to files that don't exist to
25 | \code{NA}. Set this to \code{FALSE} to retrieve the expected path of the missing
26 | file.}
27 | 
28 | \item{file_info}{the output from \code{\link[=archs4_file_info]{archs4_file_info()}}, which enumerates the
29 | files used by the \code{Archs4Repository}.}
30 | 
31 | \item{datadir}{the directory that stores the ARCHS4 data files}
32 | }
33 | \value{
34 | a named (by \code{key}) character vector of paths to the filesystem that
35 | correspond to the entries in \code{key}.
36 | }
37 | \description{
38 | By default, this function will throw an error if a file does not exist
39 | upon lookup. To return the \emph{expected} path to the, even if it does not
40 | exist on the file system, set \code{stop_if_missing = FALSE}.
41 | }
42 | 


--------------------------------------------------------------------------------
/R/h5utils.R:
--------------------------------------------------------------------------------
 1 | #' Read data from an HDF5 file "with caution"
 2 | #'
 3 | #' @description
 4 | #' This is a simple wrapper to [rhdf5::h5read()] which returns a default value
 5 | #' if the "data element" (specified by `name`) does not exists within `file`.
 6 | #' We use this file to read from ARCHS4 hdf5 files when we want to provide a
 7 | #' little insurance to the evoling nature of their data formats.
 8 | #'
 9 | #' For instance this function is used when we try to read somethign like
10 | #' `"meta/reads_aligned"` because this information was not provided in earlier
11 | #' versions of these datasets, however `"meta/genes"` may use [rhdf5::h5read()]
12 | #' directly because this has been around since "the beginning"
13 | #'
14 | #' @importFrom rhdf5 h5read
15 | #'
16 | .h5read <- function(file, name, index=NULL, start=NULL, stride=NULL, block=NULL,
17 |                     count=NULL, compoundAsDataFrame = TRUE, callGeneric = TRUE,
18 |                     read.attributes = FALSE, drop = FALSE, ...,
19 |                     native = FALSE, default_value = NA, default_dim = 1L) {
20 |   out <- try({
21 |     rhdf5::h5read(file = file, name = name, index = index, start = start,
22 |                   stride = stride, block = block, count = count,
23 |                   compoundAsDataFrame = compoundAsDataFrame,
24 |                   callGeneric = callGeneric, read.attributes = read.attributes,
25 |                   drop = drop, ..., native = native)
26 |   }, silent = TRUE)
27 |   if (is(out, "try-error")) {
28 |     ndim <- length(default_dim)
29 |     if (ndim == 1L) {
30 |       out <- rep(default_value, default_dim)
31 |     } else {
32 |       out <- array(default_value, default_dim)
33 |     }
34 |   }
35 | 
36 |   if (is.null(dim(out)) || length(dim(out)) == 1L) {
37 |     na.it <- out %in% c("na", "NA", "null", "NULL")
38 |     out[na.it] <- NA
39 |   }
40 | 
41 |   out
42 | }
43 | 


--------------------------------------------------------------------------------
/inst/extdata/meta.yaml:
--------------------------------------------------------------------------------
 1 | sources:
 2 |   - mouse
 3 |   - human
 4 | files:
 5 |   human_gene:
 6 |     source: archs4
 7 |     name: human_matrix.h5
 8 |     url: https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5
 9 |     description: human gene-level counts
10 |   human_gene_info:
11 |     source: computed
12 |     name: human_gene_augmented_info.csv.gz
13 |     description: augmented human gene-level feature information
14 |   human_gtf:
15 |     source: ensembl
16 |     name: Homo_sapiens.GRCh38.90.gtf.gz
17 |     url: ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_sapiens.GRCh38.90.gtf.gz
18 |     description: gtf used for human transcript annotations
19 |   human_transcript:
20 |     source: archs4
21 |     name: human_hiseq_transcript_v2.h5
22 |     url: https://s3.amazonaws.com/mssm-seq-matrix/human_hiseq_transcript_v2.h5
23 |     description: human transcript-level counts
24 |   human_transcript_info:
25 |     source: computed
26 |     name: human_transcript_augmented_info.csv.gz
27 |     description: augmented human transcript-level feature information
28 |   mouse_gene:
29 |     source: archs4
30 |     name: mouse_matrix.h5
31 |     url: https://s3.amazonaws.com/mssm-seq-matrix/mouse_matrix.h5
32 |     description: mouse gene-level counts
33 |   mouse_gtf:
34 |     source: ensembl
35 |     name: Mus_musculus.GRCm38.90.gtf.gz
36 |     url: ftp://ftp.ensembl.org/pub/release-90/gtf/mus_musculus/Mus_musculus.GRCm38.90.gtf.gz
37 |     description: gtf used for mouse transcript annotations
38 |   mouse_gene_info:
39 |     source: computed
40 |     name: mouse_gene_augmented_info.csv.gz
41 |     description: augmented mouse gene-level feature information
42 |   mouse_transcript:
43 |     source: archs4
44 |     name: mouse_hiseq_transcript_v2.h5
45 |     url: https://s3.amazonaws.com/mssm-seq-matrix/mouse_hiseq_transcript_v2.h5
46 |     description: mouse transcript-level counts
47 |   mouse_transcript_info:
48 |     source: computed
49 |     name: mouse_transcript_augmented_info.csv.gz
50 |     description: augmented mouse transcript-level feature information
51 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(print,Archs4Repository)
 4 | S3method(tidy,DGEList)
 5 | S3method(tidy,EList)
 6 | S3method(tidy,SummarizedExperiment)
 7 | export(Archs4Repository)
 8 | export(archs4_feature_info)
 9 | export(archs4_file_info)
10 | export(archs4_file_path)
11 | export(archs4_local_data_dir_create)
12 | export(archs4_local_data_dir_validate)
13 | export(archs4_meta)
14 | export(archs4_sample_covariates)
15 | export(archs4_sample_info)
16 | export(archs4_sample_table)
17 | export(archs4_series_status)
18 | export(archs4_sources)
19 | export(as.DGEList)
20 | export(blacklist)
21 | export(create_augmented_feature_info)
22 | export(datadir)
23 | export(estimate_repository_norm_factors)
24 | export(feature_info)
25 | export(feature_lookup)
26 | export(fetch_expression)
27 | export(file_info)
28 | export(file_path)
29 | export(geo_id_type)
30 | export(libstats)
31 | export(lookup_biosamples)
32 | export(lookup_gse)
33 | export(meta)
34 | export(query_geo)
35 | export(retrieve_sra_metadata)
36 | export(sample_covariates)
37 | export(sample_info)
38 | export(sample_table)
39 | export(series_status)
40 | export(sources)
41 | export(tidy)
42 | import(checkmate)
43 | import(dplyr)
44 | import(purrr)
45 | import(readr)
46 | import(tidyr)
47 | importFrom(SummarizedExperiment,assay)
48 | importFrom(SummarizedExperiment,assayNames)
49 | importFrom(SummarizedExperiment,colData)
50 | importFrom(SummarizedExperiment,rowData)
51 | importFrom(broom,tidy)
52 | importFrom(edgeR,DGEList)
53 | importFrom(edgeR,cpm)
54 | importFrom(readr,cols)
55 | importFrom(readr,read_csv)
56 | importFrom(readr,read_tsv)
57 | importFrom(readr,type_convert)
58 | importFrom(rentrez,entrez_fetch)
59 | importFrom(rentrez,entrez_search)
60 | importFrom(reshape2,melt)
61 | importFrom(rhdf5,h5ls)
62 | importFrom(rhdf5,h5read)
63 | importFrom(stats,setNames)
64 | importFrom(tibble,set_tidy_names)
65 | importFrom(utils,write.csv)
66 | importFrom(xml2,read_xml)
67 | importFrom(xml2,xml_attr)
68 | importFrom(xml2,xml_children)
69 | importFrom(xml2,xml_contents)
70 | importFrom(xml2,xml_find_all)
71 | importFrom(xml2,xml_find_first)
72 | importFrom(xml2,xml_ns_strip)
73 | importFrom(xml2,xml_text)
74 | importFrom(xml2,xml_validate)
75 | importFrom(yaml,read_yaml)
76 | 


--------------------------------------------------------------------------------
/man/fetch_expression.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/expression.R
 3 | \name{fetch_expression}
 4 | \alias{fetch_expression}
 5 | \title{Retrieve expression data for genes/transcripts across samples}
 6 | \usage{
 7 | fetch_expression(a4, features, samples = NULL, feature_type = "gene",
 8 |   source = NULL, feature_meta = "symbol",
 9 |   sample_meta = c("Sample_title", "Sample_source_name_ch1"),
10 |   prior.count = 3, ...)
11 | }
12 | \arguments{
13 | \item{a4}{An \code{Archs4Repository}}
14 | 
15 | \item{features}{a \code{tibble} of feature descriptors (as returned from
16 | \code{\link[=feature_info]{feature_info()}}). Really this table simply needs the following columns:
17 | \itemize{
18 | \item \code{"a4name"} or \code{"ensembl_id" or};
19 | \item \code{"feature_type"}: (gene or transcript)
20 | \item \code{"source"} (human or mouse),;
21 | }}
22 | 
23 | \item{samples}{a samples identifier: ie. a tibble with series_id and
24 | sample_id columns.}
25 | 
26 | \item{source}{"mouse" or "human" pass this in explicitly if there is no
27 | "source" column in your \code{features} or \code{samples} data.frame}
28 | 
29 | \item{feature_meta}{additional columns from
30 | \code{feature_info(a4, feature_type = feature_type)}}
31 | 
32 | \item{sample_meta}{metadata information to include for the samples. These
33 | values are extracted from the \code{meta/VALUE} files in the respective hdf5
34 | data files. See \code{\link[=archs4_sample_info]{archs4_sample_info()}} for more details.}
35 | 
36 | \item{type}{"gene" or "transcript"-level data?}
37 | }
38 | \value{
39 | a data.frame of expression data. Each row is an observation
40 | (one gene one sample)
41 | }
42 | \description{
43 | Returns the estimated counts for the genes/transcripts enumerated in the
44 | \code{features} table for the samples enumerated in the \code{samples} table.
45 | }
46 | \details{
47 | Note that the values returned are simply estimated counts. They are not
48 | normalized by sequencing depth. For now, the only use for this function is
49 | to compare how ratios of genes compare across samples.
50 | }
51 | \examples{
52 | a4 <- Archs4Repository()
53 | gnz <- feature_lookup(a4, c("CFAP65", "PECR"))
54 | gexprs <- fetch_expression(a4, gnz)
55 | }
56 | \seealso{
57 | \code{\link[=as.DGEList]{as.DGEList()}}
58 | }
59 | 


--------------------------------------------------------------------------------
/tests/testthat/test-feature-info.R:
--------------------------------------------------------------------------------
 1 | context("Feature Info")
 2 | 
 3 | if (!exists("a4")) {
 4 |   # This is loaded by the testthat/helper-all.R script when testthat is running
 5 |   # the unit tests, but included here for convenience when doing interactive
 6 |   # test development
 7 |   a4 <- Archs4Repository()
 8 | }
 9 | 
10 | test_that("augmented feature info largely concordant", {
11 |   a4 <- Archs4Repository("/Users/lianoglou/workspace/data/archs4/v4")
12 |   a5 <- Archs4Repository("/Users/lianoglou/workspace/data/archs4/v5")
13 | 
14 |   v4.fi <- archs4_feature_info("gene", "mouse")
15 |   v5.fi <- archs4_feature_info("gene", "mouse")
16 | })
17 | 
18 | test_that("feature-level metadata retrieval works", {
19 |   # retrieve gene-level feature information with unique symbols
20 |   mg <- feature_info(a4, feature_type = "gene", source = "mouse",
21 |                      distinct_symbol = TRUE)
22 | 
23 |   # all a4name entries should be non NA and nchar() >= 1
24 |   expect_true(all(nchar(mg$a4name) >= 1))
25 |   expect_true(all(!is.na(mg$a4name)))
26 |   expect_is(mg$h5idx, "integer")
27 |   expect_true(all(!is.na(mg$h5idx)))
28 |   # no duplicated symbols
29 |   # isna.symbol <- is.na(mg$symbol) |
30 |   #   tolower(mg$symbol) == "na" |
31 |   #   tolower(mg$symbol) == "null"
32 |   # expect_equal(sum(duplicated(mg$symbol) & !isna.symbol), 0)
33 |   # isna.ens <- is.na(mg$ensembl_id) |
34 |   #   tolower(mg$ensembl_id) == "na" |
35 |   #   tolower(mg$ensembl_id) == "null"
36 |   expect_equal(sum(duplicated(mg$ensembl_id) & !is.na(mg$ensembl_id)), 0)
37 | 
38 |   # There are some entries that we couldn't get identifiers for, but the ones
39 |   # we got should all be prefixed with the mouse prefix.
40 |   mg.ens <- filter(mg, !is.na(ensembl_id))
41 |   expect_true(all(substr(mg.ens$ensembl_id, 1, 7) == "ENSMUSG"))
42 | 
43 | 
44 |   hg <- feature_info(a4, feature_type = "gene", source = "human",
45 |                      distinct_symbol = TRUE)
46 |   # all a4name entries should be non NA and nchar() >= 1
47 |   expect_true(all(nchar(hg$a4name) >= 1))
48 |   expect_true(all(!is.na(hg$a4name)))
49 |   expect_is(hg$h5idx, "integer")
50 |   expect_true(all(!is.na(hg$h5idx)))
51 |   # no duplicated symbols
52 |   expect_equal(sum(duplicated(hg$symbol) & !is.na(hg$symbol)), 0)
53 | 
54 |   hg.ens <- filter(hg, !is.na(ensembl_id))
55 |   expect_true(all(substr(hg.ens$ensembl_id, 1, 4) == "ENSG"))
56 | })
57 | 


--------------------------------------------------------------------------------
/vignettes/mystyles.css:
--------------------------------------------------------------------------------
 1 | /************************ Callouts ********************************************/
 2 | /* Too bad you can't inhert in CSS, change one of these you should change all */
 3 | div.tip {
 4 |   margin: 2px 10px 10px 0px;
 5 |   min-height: 55px;
 6 |   padding: 2px 10px 2px 85px;
 7 |   background-position: 5px 5px, 68px 0px;
 8 |   background-image: url('images/icons/tip.png'), url('images/icons/callout-border.png');
 9 |   background-repeat: no-repeat, repeat-y;
10 |   width: 90%;
11 | }
12 | 
13 | div.note {
14 |   margin: 2px 10px 10px 0px;
15 |   min-height: 55px;
16 |   padding: 2px 10px 2px 85px;
17 |   background-position: 5px 5px, 68px 0px;
18 |   background-image: url('images/icons/note.png'), url('images/icons/callout-border.png');
19 |   background-repeat: no-repeat, repeat-y;
20 |   width: 90%;
21 | }
22 | 
23 | div.caution {
24 |   margin: 2px 10px 10px 0px;
25 |   min-height: 55px;
26 |   padding: 2px 10px 2px 85px;
27 |   background-position: 5px 5px, 68px 0px;
28 |   background-image: url('images/icons/caution.png'), url('images/icons/callout-border.png');
29 |   background-repeat: no-repeat, repeat-y;
30 |   width: 90%;
31 | }
32 | 
33 | div.warning {
34 |   margin: 2px 10px 10px 0px;
35 |   min-height: 55px;
36 |   padding: 2px 10px 2px 85px;
37 |   background-position: 5px 5px, 68px 0px;
38 |   background-image: url('images/icons/warning.png'), url('images/icons/callout-border.png');
39 |   background-repeat: no-repeat, repeat-y;
40 |   width: 90%;
41 | }
42 | 
43 | div.important {
44 |   margin: 2px 10px 10px 0px;
45 |   min-height: 55px;
46 |   padding: 2px 10px 2px 85px;
47 |   background-position: 5px 5px, 68px 0px;
48 |   background-image: url('images/icons/important.png'), url('images/icons/callout-border.png');
49 |   background-repeat: no-repeat, repeat-y;
50 |   width: 90%;
51 | }
52 | 
53 | div.example {
54 |   margin: 2px 10px 10px 0px;
55 |   min-height: 55px;
56 |   padding: 2px 10px 2px 85px;
57 |   background-position: 5px 5px, 68px 0px;
58 |   background-image: url('images/icons/example.png'), url('images/icons/callout-border.png');
59 |   background-repeat: no-repeat, repeat-y;
60 |   width: 90%;
61 | }
62 | 
63 | div.download {
64 |   margin: 2px 10px 10px 0px;
65 |   min-height: 55px;
66 |   padding: 2px 10px 2px 85px;
67 |   background-position: 5px 5px, 68px 0px;
68 |   background-image: url('images/icons/download.png'), url('images/icons/callout-border.png');
69 |   background-repeat: no-repeat, repeat-y;
70 |   width: 90%;
71 | }
72 | 


--------------------------------------------------------------------------------
/man/create_augmented_feature_info.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/archs4-features.R
 3 | \name{create_augmented_feature_info}
 4 | \alias{create_augmented_feature_info}
 5 | \title{Create meta information for the genes and transcripts in the ARCHS4 dataset.}
 6 | \usage{
 7 | create_augmented_feature_info(datadir = getOption("archs4.datadir"))
 8 | }
 9 | \arguments{
10 | \item{datadir}{The directory that has the mouse and human expression
11 | hdf5 files. There will be \code{SPECIES_FEATURETYPE_augmented_info.csv.gz} files
12 | saved in this directory whe this function completes.}
13 | }
14 | \description{
15 | This is a preprocessing function that is required to successfully build an
16 | \code{Archs4Repository}. It is not really intended for use during analyses.
17 | 
18 | This Function creates \emph{all} of the feature-level CSV files for the features
19 | enumerated in the \code{meta/genes} gene-level hdf5 file, and the
20 | \code{meta/transcript} transcript identfiers in the transctipt-level hdf5 file for
21 | the mouse and human files found in \code{datadir}.
22 | 
23 | \strong{In order for this to work} you have to download the approprate human and
24 | mouse gtf files from ensembl and save them in \code{datadir}. Reference the
25 | \code{\link[=archs4_local_data_dir_validate]{archs4_local_data_dir_validate()}} function.
26 | 
27 | For the initial relesae of the ARCHS4 dataset, the
28 | \code{Homo_sapiens.GRCh38.90.gtf.gz} and \code{Mus_musculus.GRCm38.90.gtf.gz} were
29 | used.
30 | }
31 | \details{
32 | This function will write the augmented transcript- and gene-level files in
33 | the \code{datadir}, using the following pattern:
34 | \code{<organism>_<feature_type>_augmented_info.csv.gz}
35 | 
36 | Gene symbols are the only piece of information provided for the row-level
37 | identifieres for the gene count matrices. Furthermore, the gene symbol used
38 | in mouse are in all uppercase, which is not how genes are referred to there.
39 | In order to augment the gene symbol information with gene-level identifiers
40 | and other information, we parse relatively recent GTFs provided by GENCODE.
41 | 
42 | The fruits of the labor generated by this function are used by the
43 | \code{\link[=archs4_feature_info]{archs4_feature_info()}} function.
44 | 
45 | Note that this function will replace already existing "\code{augmented}" files
46 | if the already exist in \code{datadir}.
47 | }
48 | \seealso{
49 | \code{\link[=archs4_feature_info]{archs4_feature_info()}}
50 | }
51 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | output: github_document
 3 | ---
 4 | 
 5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 6 | 
 7 | ```{r, include = FALSE}
 8 | knitr::opts_chunk$set(
 9 |   collapse = TRUE,
10 |   comment = "#>",
11 |   fig.path = "README-"
12 | )
13 | ```
14 | 
15 | # Overview
16 | 
17 | The `archs4` package provides utility functions to query and explore the
18 | expression profiling data made available through the
19 | [ARCHS4 project][archs4web], which is described in the following publication:
20 | 
21 | [Massive mining of publicly available RNA-seq data from human and mouse][archs4pub]. 
22 | 
23 | Because this package requires the user to download a number of data files that
24 | are external to the package, the [installation instructions](#installation) are
25 | *a bit* more involved than other R packages, and we leave them for 
26 | [the end of this document](#installation).
27 | 
28 | # Usage
29 | 
30 | After [successful installation](#installation) of this package, you can query
31 | the series and samples included in the ARCHS4 repository, as well as materialize
32 | the expresion data into well-known bioconductor assay containers for downstream
33 | analysis.
34 | 
35 | To query GEO series and samples, you can use the `sample_info` function:
36 | 
37 | ```{r, message=FALSE, warning=FALSE}
38 | library(archs4)
39 | 
40 | a4 <- Archs4Repository()
41 | ids <- c('GSE89189', 'GSE29943', "GSM1095128", "GSM1095129", "GSM1095130")
42 | sample.info <- sample_info(a4, ids)
43 | head(sample.info)
44 | ```
45 | 
46 | You can use the `as.DGEList` function to materialize an `edgeR::DGEList` from a
47 | an arbitrary number of GEO sample and series identifiers. The only restriction
48 | is that the data from the series/samples must all be from the same species.
49 | 
50 | The most often use-case will likely be to create a `DGEList` for a given study.
51 | For instance, the GEO series identifier [`"GSE89189"`][blurtongeo] refers to the
52 | expression data generated to support the
53 | [Abud et al. iPSC-Derived Human Microglia-like Cells ...][blurtonpub] paper.
54 | 
55 | Creating a `DGEList` from this study will create an object with 27,024 genes
56 | across 37 samples in about 1.5 seconds:
57 | 
58 | ```{r, eval = FALSE}
59 | yg <- as.DGEList(a4, "GSE89189", feature_type = "gene")
60 | ```
61 | 
62 | The following command retrieves the 178,135 transcript level counts for this
63 | experiment in about 1.5 seconds, as well:
64 | 
65 | ```{r, eval = FALSE}
66 | yt <- as.DGEList(a4, "GSE89189", feature_type = "transcript")
67 | ```
68 | 
69 | # Installation
70 | 
71 | ```{r child = "inst/rmdparts/installation.Rmd"}
72 | ```
73 | 
74 | [//]: # (References ===========================================================)
75 | ```{r child = "inst/rmdparts/references.Rmd"}
76 | ```
77 | 


--------------------------------------------------------------------------------
/man/archs4_sample_info.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Archs4Repository.R, R/archs4-functional.R
 3 | \name{sample_info}
 4 | \alias{sample_info}
 5 | \alias{archs4_sample_info}
 6 | \title{Retrieves information for samples by GSE series or sample IDs}
 7 | \usage{
 8 | sample_info(x, id, columns = c("Sample_title", "Sample_source_name_ch1"),
 9 |   check_missing_samples = TRUE, ...)
10 | 
11 | archs4_sample_info(id, columns = c("Sample_title",
12 |   "Sample_source_name_ch1"), sample_table = archs4_sample_table(datadir =
13 |   datadir), sample_covariates = archs4_sample_covariates(datadir),
14 |   check_missing_samples = TRUE, datadir = getOption("archs4.datadir"),
15 |   ...)
16 | }
17 | \arguments{
18 | \item{x}{an \code{Archs4Repository}}
19 | 
20 | \item{id}{a character vector of GEO series or sample ids.}
21 | 
22 | \item{columns}{the names of the sample metadata columns desired. This
23 | defaults to \code{c("Sample_title", "Sample_source_name_ch1")}. The values
24 | in \code{columns} must be a subset of the values enumerated in
25 | \code{\link[=archs4_sample_covariates]{archs4_sample_covariates()}}.}
26 | 
27 | \item{check_missing_samples}{When \code{TRUE} (the default), this function will
28 | check every unique GEO series identifier (\code{"GSEnnnn"}) for missing samples
29 | by using an NCBI Rest service via a call to \code{\link[=archs4_series_status]{archs4_series_status()}},
30 | and \code{\link[=lookup_gse]{lookup_gse()}}.}
31 | 
32 | \item{sample_table}{the output from \code{\link[=archs4_sample_table]{archs4_sample_table()}}, which lists
33 | the series_id,sample_id combinations found in the ARCHS4 repository.}
34 | 
35 | \item{sample_covariates}{the \code{data.frame}-definition of the sample covariates
36 | found in the ARCHS4 datasetes, which is constructed via a call to
37 | \code{\link[=archs4_sample_covariates]{archs4_sample_covariates()}}. The parameter is included in here so that
38 | a cached version of this \code{data.frame} can be re-used.}
39 | 
40 | \item{datadir}{the directory that holds the archs4 data}
41 | }
42 | \value{
43 | a tibble of series_id, sample_id, sample_h5idx, sample_title, and
44 | sample_name columns. If the query sample or series query can't be found,
45 | then there will be an \code{NA} value for these columns. The \code{query_type} column
46 | will indicat whether the row was returned from querying by series or by
47 | sample.
48 | }
49 | \description{
50 | Fetch a tibble of series and sample information by querying the arcsh4
51 | dataset by GEO sample (GSE) or sample (GSM) ids.
52 | 
53 | For each unique GEO series identifier (\code{"GSEnnnn"}), we will check if the
54 | ARCHS4 dataset is missing any of its samples when \code{check_missing_samples}
55 | is set to \code{TRUE} (default).
56 | }
57 | \examples{
58 | si <- archs4_sample_info("GSE52564") # ben barres transcriptome db ...
59 | }
60 | 


--------------------------------------------------------------------------------
/R/tidy.R:
--------------------------------------------------------------------------------
 1 | #' @noRd
 2 | #' @export
 3 | #' @importFrom edgeR cpm
 4 | #' @importFrom reshape2 melt
 5 | #' @importFrom broom tidy
 6 | #'
 7 | #' @method tidy DGEList
 8 | tidy.DGEList <- function(x, normalized.lib.sizes = TRUE, prior.count = 3, ...) {
 9 |   mats <- list(
10 |     cpm = cpm(x, normalized.lib.sizes = normalized.lib.sizes,
11 |               log = TRUE, prior.count = prior.count),
12 |     count = x$counts)
13 | 
14 |   .tidy.core(mats, genes = x$genes, samples = x$samples)
15 | }
16 | 
17 | #' @noRd
18 | #' @export
19 | #' @importFrom SummarizedExperiment assay assayNames rowData colData
20 | #' @importFrom edgeR DGEList
21 | #' @method tidy SummarizedExperiment
22 | tidy.SummarizedExperiment <- function(x, assay_name = NULL, is_counts = TRUE,
23 |                                       ...) {
24 |   if (is.null(assay_name)) {
25 |     assay_name <- assayNames(x)[1L]
26 |   }
27 |   assert_string(assay_name)
28 |   assert_choice(assay_name, assayNames(x))
29 | 
30 |   dat <- assay(x, assay_name)
31 |   ginfo <- as.data.frame(rowData(x))
32 |   sinfo <- as.data.frame(colData(x))
33 | 
34 |   if (is_counts) {
35 |     y <- calcNormFactors(DGEList(dat, samples = sinfo, genes = ginfo))
36 |     out <- tidy(y, ...)
37 |   } else {
38 |     out <- .tidy.core(dat, genes = ginfo, samples = sinfo, ...)
39 |   }
40 | 
41 |   out
42 | }
43 | 
44 | #' @noRd
45 | #' @export
46 | #' @method tidy EList
47 | tidy.EList <- function(x, ...)  {
48 |   mats <- list(cpm = x$E)
49 |   if (is.matrix(x$weights)) {
50 |     mats$weight <- x$weights
51 |     rownames(mats$weight) <- rownames(x)
52 |     colnames(mats$weight) <- colnames(x)
53 |   } else {
54 |     names(mats)[1L] <- "value"
55 |   }
56 | 
57 |   .tidy.core(mats, genes = x$genes, samples = x$targets)
58 | }
59 | 
60 | .tidy.core <- function(mats, genes, samples, ...) {
61 |   if (is.matrix(mats)) mats <- list(value = mats)
62 |   stopifnot(is.list(mats))
63 |   stopifnot(all(sapply(mats, is.matrix)))
64 |   assert_named(mats, type = "unique")
65 | 
66 |   rnames <- rownames(mats[[1]])
67 |   snames <- colnames(mats[[1]])
68 |   genes$.gene_id <- rnames
69 |   gid.col <- sapply(genes, function(xx) all(xx == rnames))
70 |   gid.col <- colnames(genes)[which(gid.col)[1L]]
71 |   if (gid.col != ".gene_id") genes$.gene_id <- NULL
72 | 
73 |   samples$.sample_id <- snames
74 |   sid.col <- sapply(samples, function(xx) all(xx == snames))
75 |   sid.col <- colnames(samples)[which(sid.col)[1L]]
76 |   if (sid.col != ".sample_id") samples$.sample_id <- NULL
77 | 
78 |   adat.all <- lapply(names(mats), function(mname) {
79 |     m <- mats[[mname]]
80 |     stopifnot(all.equal(rownames(m), rnames))
81 |     m <- melt(m)
82 |     m <- transform(m, Var1 = as.character(Var1), Var2 = as.character(Var2))
83 |     colnames(m) <- c(gid.col, sid.col, mname)
84 |     m
85 |   })
86 |   adat <- do.call(cbind, adat.all)
87 |   # if there were multiple matrices, there will be multiple sample_id columns
88 |   # so we remove those
89 |   adat <- adat[, !duplicated(colnames(adat))]
90 |   out <- inner_join(adat, genes, by = gid.col)
91 |   out <- inner_join(out, samples, by = sid.col)
92 |   out
93 | }
94 | 


--------------------------------------------------------------------------------
/R/data-directory.R:
--------------------------------------------------------------------------------
  1 | # These are lower-level functions that support "the health" of the local
  2 | # datadir that is used to store the data required to drive a
  3 | # LocalArchs4Repository
  4 | 
  5 | #' Initialize a local datadir to act as an ARCHS4 data datadir
  6 | #'
  7 | #' @details
  8 | #' A local datadir needs to be created and initialized (wth a `meta.yaml`
  9 | #' file), to house ARCHS4 data for use in an Archs4Repository. This function
 10 | #' creates that datadir and copies an initial `meta.yaml` file.
 11 | #'
 12 | #' Please refer to the vignette section "ARCHS4 Data Download" for more
 13 | #' details.
 14 | #'
 15 | #' @export
 16 | #'
 17 | #' @param datadir the path to the datadir to create (or initialize) as an
 18 | #'   ARCHS4 data datadir.
 19 | #' @param stop_if_exists by default, this function will `stop` if `datadir`
 20 | #'   already exists. Set this to `FALSE` to continue. Setting it to `FALSE` is
 21 | #'   convenient to initialize the target `datadir` with a `meta.yaml` file.
 22 | #'   If a `meta.yaml` file already exists in `datadir`, then this function
 23 | #'   will stop unconditionally. Move the `datadir/meta.yaml` out of the way
 24 | #'   if you simply want to refresh it with the default version.
 25 | #' @return invisibly returns the path to the `meta.yaml` in the target
 26 | #'   `datadir`
 27 | archs4_local_data_dir_create <- function(datadir = getOption("archs4.datadir"),
 28 |                                          stop_if_exists = TRUE) {
 29 |   assert_character(datadir)
 30 |   d.exists <- file.exists(datadir)
 31 | 
 32 |   meta.in <- system.file("extdata", "meta.yaml", package = "archs4",
 33 |                          mustWork = TRUE)
 34 |   meta.to <- file.path(datadir, "meta.yaml")
 35 | 
 36 |   if (d.exists && !dir.exists(datadir)) {
 37 |     stop("Desired output datadir is already file(!): ", datadir)
 38 |   }
 39 | 
 40 |   if (d.exists) {
 41 |     if (stop_if_exists) {
 42 |       stop("Output datadir already exisits: ", datadir)
 43 |     } else {
 44 |       if (file.exists(meta.to)) {
 45 |         stop("meta.yaml file already exists in output datadir, ",
 46 |              "remove it if you want to replace it with the default meta.yaml")
 47 |       }
 48 |     }
 49 |   } else {
 50 |     parent.dir <- assert_directory(dirname(datadir), "w")
 51 |     dir.create(datadir)
 52 |   }
 53 |   file.copy(meta.in, meta.to)
 54 |   invisible(meta.to)
 55 | }
 56 | 
 57 | #' Check "the health" of a local ARCHS4 data datadir
 58 | #'
 59 | #' This function will notify the suer which files are missing from the
 60 | #' ARCHS4 data datadir, and what course of action they can use to
 61 | #' fix it.
 62 | #'
 63 | #' @export
 64 | #' @param echo echo validation diagnostic message to stdout via [base::cat()]
 65 | #' @param datadir the path to the datadir that stores local ARCHS4 data.
 66 | #'   Defaults to `getOption("archs4.datadir")`.
 67 | #' @return A string that indicates "what's wrong", or `TRUE` if validation
 68 | #'   succeeds.
 69 | archs4_local_data_dir_validate <- function(echo = TRUE,
 70 |                                            datadir = getOption("archs4.datadir")
 71 |                                            ) {
 72 |   msg <- character()
 73 |   if (!dir.exists(datadir)) {
 74 |     msg <- paste0(
 75 |       "datadir does not exists, run ",
 76 |       '`archs4_local_data_dir_create("', datadir, '")`\n')
 77 |     if (echo) cat(msg)
 78 |     return(invisible(msg))
 79 |   }
 80 | 
 81 |   meta.fn <- file.path(datadir, "meta.yaml")
 82 |   if (!file.exists(meta.fn)) {
 83 |     msg <- paste(
 84 |       "meta.yaml file is missing from the data datadir, run ",
 85 |       "`archs4_local_data_dir_create(datadir, stop_if_exists = FALSE)`\n")
 86 |     if (echo) cat(msg)
 87 |     return(invisible(msg))
 88 |   }
 89 | 
 90 |   finfo <- archs4_file_info(datadir)
 91 |   missing <- filter(finfo, source == "archs4" & !file_exists)
 92 |   if (nrow(missing)) {
 93 |     msg <- paste0(
 94 |       "The following ARCHS4 files are missing, please download them:\n",
 95 |       paste0(
 96 |         sprintf("  * %s: %s", missing[["name"]], missing[["url"]]),
 97 |         collapse = "\n"))
 98 |     if (echo) cat(msg)
 99 |     return(invisible(msg))
100 |   }
101 | 
102 |   missing <- filter(finfo, source == "ensembl" & !file_exists)
103 |   if (nrow(missing)) {
104 |     msg <- paste0(
105 |       "The following ensembl files are missing, please download them:\n",
106 |       paste0(
107 |         sprintf("  * %s: %s", missing[["name"]], missing[["url"]]),
108 |         collapse = "\n"))
109 |     if (echo) cat(msg)
110 |     return(invisible(msg))
111 |   }
112 | 
113 |   missing <- filter(finfo, source == "computed"  & !file_exists)
114 |   if (nrow(missing)) {
115 |     header <- "The following computed files are missing:"
116 |     filez <- paste(sprintf("  * %s\n", missing[["name"]]), collapse = "")
117 |     advice <- paste0(
118 |       "You can create them by running:\n",
119 |       "  `create_augmented_feature_info(\"", datadir, "\")`")
120 |     msg <- sprintf("%s\n%s\n%s\n\n", header, filez, advice)
121 |     if (echo) cat(msg)
122 |     return(invisible(msg))
123 |   }
124 | 
125 |   TRUE
126 | }
127 | 
128 | 


--------------------------------------------------------------------------------
/tests/testthat/test-sample-retrieval.R:
--------------------------------------------------------------------------------
  1 | context("Sample and Covariate Retrieval")
  2 | 
  3 | if (!exists("a4")) {
  4 |   # This is loaded by the testthat/helper-all.R script when testthat is running
  5 |   # the unit tests, but included here for convenience when doing interactive
  6 |   # test development
  7 |   a4 <- Archs4Repository()
  8 | }
  9 | 
 10 | 
 11 | test_that("All expected samples come back when queried by series", {
 12 |   series <- "GSE52564"
 13 |   expected <- sample_table(a4) %>%
 14 |     filter(series_id == series)
 15 | 
 16 |   info <- sample_info(a4, series)
 17 |   full <- expected %>%
 18 |     full_join(info, by = c("series_id", "sample_id"))
 19 | 
 20 |   expect_equal(nrow(full), nrow(expected))
 21 | })
 22 | 
 23 | test_that("(archs4_)sample_status identifies samples missing from GEO series", {
 24 |   missing.none <- "GSE52564" # The Ben Barres dataset has all samples in ARCHS4
 25 |   # The blurton jones dataset (GSE89189) used to be missing some, but those have
 26 |   # been filled out
 27 |   missing.some <- "GSE43366" # The Chiu et al. SOD1 datasets is missing some
 28 | 
 29 |   ss.none <- series_status(a4, missing.none)
 30 |   expect_true(all(ss.none[["in_archs4"]]))
 31 | 
 32 |   ss.some <- series_status(a4, missing.some)
 33 |   expect_true(!all(ss.some[["in_archs4"]]))
 34 | })
 35 | 
 36 | test_that("sample_info call handlies missing IDs gracefully", {
 37 |   # GSE43366 should have 42 samples, but is missing 7 of them as of April 8, 2018
 38 |   # when we are using "v2" of the datasets.
 39 |   expected <- tibble(
 40 |     series_id = "GSE43366",
 41 |     sample_id = paste0("GSM10611", 43:84))
 42 |   ex.missing <- c("GSM1061148", "GSM1061149", "GSM1061150", "GSM1061151",
 43 |                   "GSM1061152", "GSM1061153", "GSM1061154")
 44 |   ex.present <- setdiff(expected$sample_id, ex.missing)
 45 | 
 46 |   # Tests a mix of existing and missing sample identifiers.
 47 |   # Missing samples should have NAs in a number of colums. One colume that is
 48 |   # always returned is `organism`.
 49 |   res <- expect_warning(sample_info(a4, expected$sample_id), "not found")
 50 | 
 51 |   # Expect that we have one row for each entry in `expected.all`
 52 |   # we don't join on series_id because the sample_id's that were queried for
 53 |   # and are missing come back with NA series_id values.
 54 |   xx <- expected %>%
 55 |     full_join(res, by = c("sample_id"))
 56 |   expect_equal(nrow(xx), nrow(expected))
 57 |   expect_setequal(res$sample_id, expected$sample_id)
 58 | 
 59 |   found <- filter(res, !is.na(organism))
 60 |   expect_setequal(found$sample_id, ex.present)
 61 | 
 62 |   missed <- filter(res, is.na(organism))
 63 |   expect_setequal(missed$sample_id, ex.missing)
 64 | 
 65 |   # When we query for all missing samples, we still return a tibble of the same
 66 |   # form as res.all, with all NAs where you expect them to be. We don't expect
 67 |   # to throw an error.
 68 |   amiss <- expect_warning({
 69 |     sample_info(a4, ex.missing)
 70 |   }, "not found")
 71 |   expect_setequal(ex.missing, amiss$sample_id)
 72 |   expect_true(all(is.na(amiss$organism)))
 73 | })
 74 | 
 75 | # Tests that are no longer necessary in v4+ matrices (they were made for v2).
 76 | # These tests:
 77 | # 1. Looked for missing samples in certain seriesexercised
 78 | # 2. Identified discordant sample covariates among the mouse and human datasets
 79 | 
 80 | # test_that("(archs4_)sample_info warns when querying series with missing samples", {
 81 | #   missing.none <- "GSE52564" # The Ben Barres dataset has all samples in ARCHS4
 82 | #   missing.some <- "GSE89189" # The blurton jones iPSC paper is missing some
 83 | #
 84 | #   # This series should have no missing samples
 85 | #   si.none <- expect_silent(sample_info(a4, missing.none))
 86 | #
 87 | #   # This series has a few missing samples
 88 | #   wregex <- sprintf("%s series .*missing samples", missing.some)
 89 | #   status.some <- expect_warning(sample_info(a4, missing.some), wregex)
 90 | #
 91 | #   # as.DGEList should also warn when we are missng samples
 92 | #   y <- expect_warning(as.DGEList(a4, missing.some), wregex)
 93 | # })
 94 | 
 95 | # The v4 data matrices have the same covariates in the data matrices among
 96 | # the mouse and human data.
 97 | # ------------------------------------------------------------------------------
 98 | # test_that("sample_info returns desired covariate columns", {
 99 | #   # the code here looks a bit convoluted because it should be cleaned/updated
100 | #   # to support testing "universal" covariates, mouse- and human-only covariates
101 | #   # as well.
102 | #
103 | #   ids.all <- tribble(
104 | #     ~id,         ~type,            ~organism,        ~complete,
105 | #     "GSE69354",  "series",         "mouse",          TRUE,
106 | #     "GSE79525",  "series",         "mouse",          TRUE,
107 | #     "GSE98041",  "series",         "mouse",          TRUE,
108 | #     "GSE85702",  "series",         "mouse",          FALSE,
109 | #     "GSE99095",  "series",         "human",          FALSE,
110 | #     "GSE88681",  "series",         "human",          TRUE)
111 | #
112 | #   ids.query <- tribble(
113 | #     ~id,          ~type,     ~organism,
114 | #     "GSE69354",   "series",  "mouse",
115 | #     "GSE88681",   "series",  "human",
116 | #     "GSM1095128", "sample",  "mouse",
117 | #     "GSM1095129", "sample",  "mouse",
118 | #     "GSM1095130", "sample",  "mouse")
119 | #
120 | #   def.cols <- c("Sample_title", "Sample_source_name_ch1")
121 | #   extra.cols <- c("Sample_molecule_ch1", "Sample_treatment_protocol_ch1",
122 | #                   "Sample_description")
123 | #
124 | #   # The human data have these covariates that are not in mouse:
125 | #   # * Sample_contact_laboratory
126 | #   # * Sample_description
127 | #   # * Sample_supplementary_file_2
128 | #   h.only <- c("Sample_contact_laboratory", "Sample_description",
129 | #               "Sample_supplementary_file_2")
130 | #   #
131 | #   # The mouse data have these covariates thata are not in human:
132 | #   # * Sample_contact_state
133 | #   # * Sample_growth_protocol_ch1
134 | #   # * Sample_treatment_protocol_ch1
135 | #   m.only <- c("Sample_contact_state", "Sample_growth_protocol_ch1",
136 | #               "Sample_treatment_protocol_ch1")
137 | #
138 | #   all.cols <- c(def.cols, extra.cols)
139 | #   info <- sample_info(a4, ids.query$id, all.cols)
140 | #
141 | #   for (col in all.cols) {
142 | #     expect_is(info[[col]], "character", info = col)
143 | #   }
144 | #
145 | #   info.m <- filter(info, organism == "mouse")
146 | #   for (col in intersect(h.only, colnames(info))) {
147 | #     expect_true(all(is.na(info.m[[col]])), info = col)
148 | #   }
149 | #
150 | #   info.h <- filter(info, organism == "human")
151 | #   for (col in intersect(m.only, colnames(info))) {
152 | #     expect_true(all(is.na(info.h[[col]])), info = col)
153 | #   }
154 | # })
155 | 
156 | 


--------------------------------------------------------------------------------
/R/bioc-containers.R:
--------------------------------------------------------------------------------
  1 | #' Create a DGEList for the expression data of a series or set of samples.
  2 | #'
  3 | #'
  4 | #'
  5 | #' @export
  6 | #' @importFrom rhdf5 h5read
  7 | #' @importFrom edgeR DGEList
  8 | #' @seealso [fetch_expression()]
  9 | #'
 10 | #' @param id a vector of series or sample id's.
 11 | #' @param features a feature-descriptor of the features you want to include
 12 | #'   counts for
 13 | #' @param sample_columns the names of the sample covariates that are stored
 14 | #'   in the ARCHS4 Dataset; a complete list of what covariates are available
 15 | #'   in the ARCHS4 dataset is found using the [archs4_sample_covariates()]
 16 | #'   function.
 17 | #' @param feature_type do you want `"gene"` or `"transcript"` level expression?
 18 | #' @param row_id either `"ensembl"` or `"symbol"`. If this is `"ensembl"` and
 19 | #'   `feature_type == "transcript"`, then we remove the rows from the count
 20 | #'   dataset that we couldn't map symbol -> ensembl_gene_id for.
 21 | #' @return a `DGEList` of results
 22 | #'
 23 | #' @examples
 24 | #' a4 <- Archs4Repository()
 25 | #' y <- as.DGEList(a4, "GSE89189", feature_type = "gene")
 26 | as.DGEList <- function(x, id, features = NULL,
 27 |                        sample_columns = c("Sample_title", "Sample_source_name_ch1"),
 28 |                        feature_type = c("gene", "transcript"),
 29 |                        row_id = c("ensembl", "symbol"),
 30 |                        check_missing_samples = TRUE) {
 31 |   assert_class(x, "Archs4Repository")
 32 |   feature_type <- match.arg(feature_type)
 33 |   row_id <- match.arg(row_id)
 34 | 
 35 |   if (!is.null(features)) {
 36 |     if (is.character(features)) {
 37 |       features <- feature_lookup(x, features, type = type)
 38 |       if (any(is.na(features$a4name))) {
 39 |         warning("Removing 'not found' features from query")
 40 |         features <- filter(features, !is.na(a4name))
 41 |       }
 42 |     }
 43 |     assert_data_frame(features, min.rows = 1L)
 44 |     assert_subset(c("ensembl_id", "a4name"), colnames(features))
 45 |     # features <- distinct(features, ensembl_id, .keep_all = TRUE)
 46 |     # ensembl_id is something of a "second class citizen". The a4name is
 47 |     # what has been there in the ARCHS4 data since "the beginning"
 48 |     features <- distinct(features, ensembl_id, .keep_all = TRUE)
 49 |   }
 50 | 
 51 |   # Identify the unique samples that are being queried -------------------------
 52 |   si <- sample_info(x, id, columns = sample_columns,
 53 |                     check_missing_samples = check_missing_samples)
 54 |   si <- as.data.frame(si, stringsAsFactors = FALSE)
 55 |   si <- distinct(si, sample_id, .keep_all = TRUE)
 56 |   rownames(si) <- si$sample_id
 57 | 
 58 |   # Check for identifiers that were not found
 59 |   not.found <- filter(si, is.na(organism))
 60 |   if (nrow(not.found)) {
 61 |     stop("The following samples could not be found: ",
 62 |          paste(sprintf("%s::%s", not.found$series_id, not.found$sample_id),
 63 |                collapse = "; "))
 64 |   }
 65 | 
 66 |   # Check that user is asking for samples that are all from the same species
 67 |   org <- unique(si$organism)
 68 |   if (length(org) != 1L) {
 69 |     stop("You are querying across species")
 70 |   }
 71 | 
 72 |   # Fetch feature meta information ---------------------------------------------
 73 |   finfo <- feature_info(x, feature_type, org, augmented = TRUE)
 74 |   finfo <- as.data.frame(finfo, stringsAsFactors = FALSE)
 75 |   if (feature_type == "gene") {
 76 |     if (row_id == "ensembl") {
 77 |       finfo <- filter(finfo, !is.na(ensembl_id))
 78 |       rownames(finfo) <- finfo[["ensembl_id"]]
 79 |     } else {
 80 |       rownames(finfo) <- finfo[["a4name"]]
 81 |     }
 82 |   } else {
 83 |     dup.ensid <- duplicated(finfo[["ensembl_id"]])
 84 |     if (any(dup.ensid)) {
 85 |       warning("Duplicated ensembl identifiers when version is removed, ",
 86 |               "rownames maintain their versioned id")
 87 |       rownames(finfo) <- finfo[["ensembl_id_full"]]
 88 |     } else {
 89 |       rownames(finfo) <- finfo[["ensembl_id"]]
 90 |     }
 91 |   }
 92 | 
 93 |   if (!is.null(features)) {
 94 |     # cf. note about ensembl_id being a second class citizen
 95 |     # finfo <- semi_join(finfo, features, by = "ensembl_id")
 96 |     # finfo <- semi_join(finfo, features, by = "a4name")
 97 |     # semi_join (and filter(?)) strips rownames!
 98 |     finfo <- subset(finfo, a4name %in% features[["a4name"]])
 99 |   }
100 | 
101 |   # Fetch the count data for the given samples ---------------------------------
102 |   h5col <- paste0("sample_h5idx_", feature_type)
103 |   isna <- is.na(si[[h5col]])
104 |   if (any(isna)) {
105 |     msg <- paste0(
106 |       "The following samples do not have ", feature_type,
107 |       "-level quantitation and will not be included in the expression ",
108 |       "container:\n  ",
109 |       paste(sprintf("%s::%s", si$series_id[isna], si$sample_id[isna]),
110 |             collapse = "; "))
111 |     warning(msg, immediate. = TRUE)
112 |     si <- si[!isna,,drop = FALSE]
113 |   }
114 | 
115 |   if (nrow(si) == 0L) {
116 |     stop("No samples left to assemble expression data")
117 |   }
118 | 
119 |   # Pull out pre-calcuated lib.size and norm.factor values. This has the
120 |   # added benefit of pre-emptively identifying samples with weird (huge)
121 |   # numbers that turn to NAs due to integer rollover, ie. we get random
122 |   # errors of this variety when reading from the HDF5 file:
123 |   #
124 |   #   integer value -2^63 replaced NA. See the section
125 |   #   'Large integer data types' in the 'rhdf5' vignette
126 |   #   for more details.
127 |   #
128 |   # This happens when we fish data out from the hdf5 file, but also we
129 |   # ran into this problem when running the `estimate_repository_norm_factors`
130 |   # function when generating these lib.size and norm.factor values
131 |   libinfo <- sample_table(x) %>%
132 |     select(series_id, sample_id, lib.size = libsize, norm.factors = normfactor,
133 |            a4libsize) %>%
134 |     distinct(sample_id, .keep_all = TRUE)
135 |   libinfo <- select(si, series_id, sample_id) %>%
136 |     left_join(libinfo, by = c("series_id", "sample_id"))
137 |   na.overflow <- is.na(libinfo$lib.size) | is.na(libinfo$norm.factors)
138 |   if (any(na.overflow)) {
139 |     warning("Removing ", sum(na.overflow), " samples due to libsize NA overflow issues")
140 |     libinfo <- filter(libinfo, !na.overflow)
141 |     si <- subset(si, sample_id %in% libinfo$sample_id)
142 |   }
143 | 
144 |   # avg.lsize <- mean(libinfo$lib.size, na.rm = TRUE)
145 |   # avg.nfact <- mean(libinfo$norm.factors, na.rm = TRUE)
146 |   # libinfo <- libinfo %>%
147 |   #   transform(lib.size = ifelse(is.na(lib.size), a4libsize, lib.size),
148 |   #             norm.factors = ifelse(is.na(norm.factors), avg.nfact, norm.factors))
149 | 
150 |   rownames(si) <- si[["sample_id"]]
151 | 
152 |   counts <- local({
153 |     key <- paste(org, feature_type, sep = "_")
154 |     h5.fn <- file_path(x, key)
155 |     index <- list(finfo$h5idx, si[[h5col]])
156 |     cnts <- rhdf5::h5read(h5.fn, "data/expression", index=index)
157 |     colnames(cnts) <- rownames(si)
158 |     # cnts <- cnts[finfo$h5idx,,drop = FALSE]
159 |     rownames(cnts) <- rownames(finfo)
160 |     cnts
161 |   })
162 | 
163 |   out <- suppressWarnings(edgeR::DGEList(counts, genes = finfo, samples = si))
164 |   xref <- match(colnames(out), libinfo$sample_id)
165 |   if (any(is.na(xref))) {
166 |     stop("Problem matching sample_id to libinfo data.frame")
167 |   }
168 |   if (!all(colnames(out) == libinfo$sample_id[xref])) {
169 |     stop("Mismatch in outgoing DGEList to libinfo data.frame")
170 |   }
171 |   out$samples$lib.size <- libinfo$lib.size[xref]
172 |   out$samples$norm.factors <- libinfo$norm.factors[xref]
173 |   out
174 | }
175 | 
176 | 


--------------------------------------------------------------------------------
/inst/rmdparts/installation.Rmd:
--------------------------------------------------------------------------------
  1 | The installation of the `archs4` package is a bit more involved than a standard
  2 | package installation and can be roughly broken down into three steps.
  3 | 
  4 | 1. Install the R package along with its dependencies.
  5 | 2. Download a number of (large) data files into a specific folder.
  6 | 3. Generate metadata from the files downloaded in (2) for downstream use.
  7 | 
  8 | We will walk you through each step in this section.
  9 | 
 10 | ## R Package Installation
 11 | 
 12 | The `arcsh4` package depends on other packages that are available through both
 13 | [CRAN][cran] and [Bioconductor][bioc]. For that reason, we will use the
 14 | [`BiocInstaller::biocLite()`][biocLite] function to install this package, which
 15 | can seamlessly install packages from github, CRAN, and Bioconductor.
 16 | 
 17 | ```{r, eval = FALSE}
 18 | source("https://bioconductor.org/biocLite.R")
 19 | biocLite("denalitherapeutics/archs4", build_vignettes=TRUE)
 20 | library("archs4")
 21 | ```
 22 | 
 23 | When you first load the `archs4` library, you will notice a startup message
 24 | telling you that something isn't quite right with your `archs4` installation.
 25 | The message will look something like this:
 26 | 
 27 | ```
 28 | Note that your default archs4 data directory is NOT setup correctly
 29 | 
 30 |   * Run `archs4_local_data_dir_validate()` to diagnose
 31 |   * Refer to the ARCHS4 Data Download section of the archs4 vignette for more information
 32 | 
 33 | Your default archs4 data directory (`getOption("archs4.datadir")`) is:
 34 | 
 35 |   ~/.archs4data
 36 | ```
 37 | 
 38 | In order for the package to work correctly, you must download a number of files
 39 | which are enumerated in the [Data File Download](#data-file-download) section
 40 | below into a single directory. You will then instruct the `archs4` package the
 41 | path to the directory that holds all of these files by setting the value of R's
 42 | global `"archs4.datadir`" option to be the path to that directory.
 43 | 
 44 | ## Data File Download
 45 | 
 46 | You will have to create a directory on your filesystem which will hold a number
 47 | of data files that the `archs4` package depends on. Let's call this
 48 | directory `$ARCHS4DIR`, which we will define here to be `~/archs4v6data`.
 49 | 
 50 | The `archs4` package provides the `archs4_local_data_dir_create()` convenience
 51 | function which creates this directory and copies over a `meta.yaml` file into
 52 | that directory. The purpose of this file is to specify the names of the
 53 | downloaded files that correspond to the human and mouse-level gene and
 54 | transcript-level data.
 55 | 
 56 | ```{r, eval = FALSE}
 57 | library(archs4)
 58 | archs4dir <- "~/archs4v6data"
 59 | archs4_local_data_dir_create(archs4dir)
 60 | ```
 61 | 
 62 | Once this directory is created successfully, you will then have to download the
 63 | following files into it:
 64 | 
 65 | ```{r, echo = FALSE, results = "asis"}
 66 | sysdir <- system.file("extdata", package = "archs4")
 67 | cat(archs4:::md_archs4_download_bullet_list(sysdir))
 68 | ```
 69 | 
 70 | The enumerated items above contain links to the files that need to be
 71 | downloaded. You can right-click on them and select `Save As ...` and instruct
 72 | your web-browser to save them to your local `$ARCHS4DIR`.
 73 | 
 74 | **NOTE**: Most all of the `archs4` functions accept a `datadir` parameter, which
 75 | should be the path to `$ARCHS4DIR`. For convenience, the default value of this
 76 | parameter is always set to `getOption("archs4.datadir")`. This means that you
 77 | can modify your `~/.Rprofile` file to set the value of this option to
 78 | `"~/archs4v2data"` (for instance), so that the package will always look there
 79 | by default. If this option is not set in your `~/.Rprofile`, the
 80 | default value for this option is "~/.archs4data".
 81 | 
 82 | ## Feature-Level Metadata Generation
 83 | 
 84 | The datasets currently made available by the [ARCHS4 Project][archs4web] only
 85 | provide minimal feature-level metadata: 
 86 | 
 87 | * the features in the gene-level datasets are identified only by their symbol;
 88 |   and
 89 | * only the ensembl transcript id's are provided for the features in the
 90 |   transcript-level datasets
 91 | 
 92 | We want to augment these features with richer annotations, such as the ensembl
 93 | gene identifiers or gene biotypes, for instance.
 94 | 
 95 | To make such data generation automatic and easy for the user, once you have
 96 | downloaded the Ensembl GTF files listed above into the `$ARCHS4DIR`, you can
 97 | run the `create_augmented_feature_info()` to extract these extra feature-level
 98 | metadata from the GTF files and store them as tables inside `$ARCHS4DIR` for
 99 | later use.
100 | 
101 | ```{r eval = FALSE}
102 | create_augmented_feature_info(archs4dir)
103 | ```
104 | 
105 | This function will load and parse the GTF files from human and mouse, and
106 | create gene- and transcript-level `*.csv.gz` files in the `$ARCHS4DIR` which
107 | the `archs4` package will then later use downstream.
108 | 
109 | Once your `$ARCHS4DIR` is setup, you may find it convenient to set the default
110 | value for R's global `"archs4.datadir"` option to the `$ARCHS4DIR` directory you
111 | just setup. To do so, you can put the following line in your `~/.Rprofile` file:
112 | 
113 | ```r
114 | options(archs4.datadir = "~/archs4v2data")
115 | ```
116 | 
117 | ## Library Size and Normalization Factors
118 | 
119 | It is often convenient to extract normalized versions of the count data from
120 | the gene-level expression matrices. In order to do this "on-the-fly" we provide
121 | the `estimate_repository_norm_factors()` functions, which accepts an
122 | `Archs4Repository` object and essentially performs the steps necessary to
123 | create edgeR::TMM-like normalization factors across the entire ARCSH4 expression
124 | atlas.
125 | 
126 | In order to avoid laoding the entire expression matrix into memory, the
127 | `estimate_repository_norm_factors()` splits up the processing in batches,
128 | loading a subset of samples at a time until it completes. **This process will
129 | take quite some time** (around two hours on a modern laptop).
130 | 
131 | Note that recent versions of the ARCHS4 data do provide `meta/reads_aligned`
132 | and `meta/total_reads` entries. We don't use those here, but we can compare
133 | the values their with what we calculate above.
134 | 
135 | ```{r eval = FALSE}
136 | a4 <- Archs4Repository(archs4dir)
137 | estimate_repository_norm_factors(a4)
138 | ```
139 | 
140 | ## ARCHS4 Installation Heatlh
141 | 
142 | Because the installation of this package is a bit more involved than most,
143 | we have also provided an `archs4_local_data_dir_validate()` function, which
144 | you can run over your `$ARCHS4DIR` in order to check on "the health" of your
145 | install.
146 | 
147 | This function will simply look at your `$ARCHS4DIR` to ensure that the required
148 | files are there, and tries to give you helpful error messages if not.
149 | 
150 | For instance, if the first two files enumerated in the
151 | [Data File Download](#data-file-download) section were missing from
152 | your `$ARCHS4DIR` (ie. `human_matrix.h5` and `human_hiseq_transcript_v2.h5`),
153 | you would be warned that "something isn't right" when you first load the
154 | `archs4` package. You could then run the `archs4_local_data_dir_validate()`
155 | to see what is wrong:
156 | 
157 | ```{r eval = FALSE}
158 | archs4_local_data_dir_validate(archs4dir)
159 | #> The following ARCHS4 files are missing, please download them:
160 | #>   * human_matrix.h5: https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5
161 | #>   * human_hiseq_transcript_v2.h5: #> https://s3.amazonaws.com/mssm-seq-matrix/human_hiseq_transcript_v2.h5
162 | ```
163 | 
164 | **NOTE:** If all installation and data download/processing steps have been
165 | completed successfully, a call to `archs4_local_data_dir_validate()` will simply
166 | return `TRUE`.
167 | 
168 | 
169 | ## Package Development
170 | 
171 | If you are developing this package, you will find that it will be convenient
172 | to symlink the package's default `archs4.datadir` path (`~/.arcsh4data`) to the
173 | `$ARCHS4DIR` you just setup. This is because often times things like roxygen2
174 | document compilation, unit testing, etc. happen in a vanilla R workspace, which
175 | won't run the configuration that is prescribed in your `~/.Rprofile` file.
176 | 
177 | 


--------------------------------------------------------------------------------
/vignettes/archs4.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "ARCHS4 Overview"
  3 | author: "Steve Lianoglou"
  4 | date: "`r Sys.Date()`"
  5 | packages: archs4
  6 | output:
  7 |   rmarkdown::html_document:
  8 |     toc: true
  9 |     toc_float: true
 10 | vignette: >
 11 |   %\VignetteIndexEntry{ARCHS4 Overview}
 12 |   %\VignetteEngine{knitr::rmarkdown}
 13 |   %\VignetteEncoding{UTF-8}
 14 | ---
 15 | 
 16 | ```{r setup, include = FALSE}
 17 | knitr::opts_chunk$set(
 18 |   collapse = TRUE,
 19 |   comment = "#>"
 20 | )
 21 | ```
 22 | 
 23 | # Overview
 24 | 
 25 | ARCHS4 is a project ([web site][archs4web] | [publication][archs4pub])
 26 | lead by [Alex Lachmann][lachmann] from [The Ma'ayan Lab][maayanlab] which aims
 27 | to reprocesses all of the human and mouse Illumina based (RNA) sequencing data
 28 | from [GEO][geo] and [SRA][sra] and make it available for exploratory data
 29 | analysis.
 30 | 
 31 | To facilitate that, the raw sequencing data is processed through the same
 32 | quantitation pipeline, which consists of [kallisto][kallisto] for quantitation
 33 | and ensembl gene annotations for the transcript models. These data are then
 34 | made available:
 35 | 
 36 | 1. as binary files [for download][archs4dl]; and
 37 | 2. via an [interactive exploratory analysis tool][archs4eda].
 38 | 
 39 | The scope of this package, for now, is to simply make all of the gene- and
 40 | transcript-level expression data easily queryable and retrievable to an analyst
 41 | within R.
 42 | 
 43 | In this vignette, we show you how to install the `archs4` package and its
 44 | required data files as well as example usage scenarios.
 45 | 
 46 | # Installation
 47 | 
 48 | ```{r child = system.file("rmdparts/installation.Rmd", package = "archs4")}
 49 | ```
 50 | 
 51 | # Usage
 52 | 
 53 | Currently, the `archs4` package facilitates easy query and retrieval of the
 54 | samples that have been processed by the project.
 55 | 
 56 | After you have successfully:
 57 | 
 58 | 1. installed the `archs4` R package;
 59 | 2. downloaded and build the data dependencies into your `$ARCHS4DIR`; and
 60 | 3. set the R `archs4.datadir` option to point to your `$ARCHS4DIR`, ie.
 61 |    `options("archs4.datadir" = "$ARCHS4DIR)")`
 62 | 
 63 | the use of this package will be most straightforward if you first instantiate a
 64 | reference object to the ARCHS4 data repository, like so:
 65 | 
 66 | ```{r a4, warning=FALSE, message=FALSE}
 67 | library(archs4)
 68 | library(dplyr)
 69 | a4 <- Archs4Repository()
 70 | ```
 71 | 
 72 | The `a4` object is what we will use to query and retrieve processed data from.
 73 | As of now, the archs4 repository contains 4000+ projects with 100,000+ samples
 74 | from both human and mouse experiments:
 75 | 
 76 | ```{r show-a4}
 77 | show(a4)
 78 | ```
 79 | 
 80 | ## ARCHS4 Sample Queries
 81 | 
 82 | In order to retrieve expression data from the ARCHS4 `a4` object, you will need
 83 | to identify a GEO series id ("GSEnnnnn"), or GEO sample id ("GSMnnnnn") of
 84 | interest.
 85 | 
 86 | For instance, the RNA-seq expression data that was used for the
 87 | "[RNA-Sequencing Transcriptome and Splicing Database of Glia, Neurons, and Vascular Cells of the Cerebral Cortex][barrespub]" is accessible using the
 88 | [GSE52564 GEO series accession number][barresgeo]. When you click through to the
 89 | [GEO landing page][barresgeo] and scroll down, you will find a section of the
 90 | page that enumerates the samples in the series as shown in the picture below:
 91 | 
 92 | ![](images/GSE52564-samples-section.png)
 93 | 
 94 | We can see that this GEO Series has seventeen (17) samples associated with it.
 95 | 
 96 | Let's query our ARCHS4 repository using the `sample_info()` function to see if
 97 | we have them. The `sample_info()` function accepts a character vector of
 98 | GEO identifiers (series, samples, or a mix of both) and queries the ARCHS4
 99 | repository to see if they are present in the dataset. The function will return
100 | a `data.frame` where each row represents a single sample, and the columns
101 | include different sample-level  metadata for the sample, which you can customize
102 | via the `columns` argument:
103 | 
104 | ```{r sample-query}
105 | samples <- sample_info(a4, "GSE52564", columns = "Sample_title")
106 | ```
107 | 
108 | This function returns a `data.frame` with `r nrow(samples)` rows, which matches
109 | the number of samples listed on this projects [GEO landing page][barresgeo].
110 | Let's take a peak:
111 | 
112 | ```{r sample-peak}
113 | select(samples, sample_id, Sample_title)
114 | ```
115 | 
116 | Use the `sample_covaraites()` function to identify the names of all of the
117 | sample-level metadata variables that are available from the mouse and human
118 | datasets:
119 | 
120 | ```{r scov-available}
121 | sample_covariates(a4) %>% head()
122 | ```
123 | 
124 | Any combination of the entries listed in the `name` column of the `data.frame`
125 | above can be used in the `columns` parameter of the `sample_info()` function.
126 | Although most of these covariates are available in both human and mouse, not
127 | all of them are. The `mouse` and `human` column indicate which covariates are
128 | found in which datasets.
129 | 
130 | ## Expression Data
131 | 
132 | The `archs4` package provides an `as.DGEList` function, which will accept a
133 | vector of series or sample identifiers and materialize a gene- or
134 | transcript-level `edgeR::DGEList` that they correspond to.
135 | 
136 | For instance, to create a gene-level `DGEList` for this project, we would:
137 | 
138 | ```{r}
139 | yg <- as.DGEList(a4, "GSE52564", feature_type = "gene")
140 | ```
141 | 
142 | This call returns a `r nrow(yg)` (gene) x `r ncol(yg)` (sample) DGEList that you
143 | can then easily feed into a downstream differential expression analysis pipeline
144 | such as edgeR or voom.
145 | 
146 | Setting `feature_type = "transcript"` in the `as.DGEList` call will give you a
147 | DGEList with transcript-level quantitation.
148 | 
149 | Any combination of GEO Series and GEO Sample identifiers will work in this
150 | function call so long as they all reference data generated from the same
151 | organism.
152 | 
153 | ## Missing Samples
154 | 
155 | In your course of using this data resource, you might find that your query for
156 | a particular GEO Series identifier will return fewer samples than are enumerated
157 | on its landing page. There are a number of reasons why individual samples did
158 | not make it into the ARCHS4 dataset.
159 | 
160 | Some missing data is due to errors in the ARCHS4 data processing pipeline, where
161 | they may have encountered problems downloading raw data or irregularities in 
162 | processing (aligning) it. There are also more mundane reasons, such as a single
163 | GEO Series identifier may contain both RNA-seq and microarray data, the latter
164 | of which would not be included in ARCHS4 resource.
165 | 
166 | The `series_status()` function will tell you if there are  missing samples
167 | from a GEO Series.
168 | 
169 | ```{r}
170 | s1 <- series_status(a4, "GSE52564")
171 | head(s1)
172 | ```
173 | 
174 | A `TRUE` value in the `in_archs4` column indicates that the sample is present
175 | in the ARCHS4 repository.
176 | 
177 | In this particular case, we can verify that all samples in the `"GSE52564"`
178 | series are found in the ARCHS4 dataset, ie. `all(s1$in_archs4) == TRUE`
179 | 
180 | Other GEO Series, however, are not complete. For instance,
181 | [GEO Series GSE89189][blurtongeo] should contain 43 samples, however there
182 | are a few (six) that are missing:
183 | 
184 | ```{r}
185 | series_status(a4, "GSE89189") %>% 
186 |   filter(!in_archs4)
187 | ```
188 | 
189 | When we use functions like `sample_info()` or `as.DGEList` which expand
190 | GEO Series-level identifiers into the samples they contain, the default
191 | behavior is to call the `series_status()` function internally to check whether
192 | all of the samples for a GEO Series are present in the ARCHS4 dataset. If not,
193 | the user will be warned:
194 | 
195 | ```{r}
196 | y2 <- as.DGEList(a4, "GSE89189", feature_type = "gene")
197 | ```
198 | 
199 | **NOTE:** The `series_status()` function makes a call to an NCBI web-service to
200 | query the samples for a given series, and therefore (i) requires an active
201 | internet connection; and (ii) may take *a little* time to complete.
202 | 
203 | # Conclusion
204 | 
205 | The `archs4` package provides a convenient way to quickly access the vast
206 | amount of data made available by the [ARCHS4 Project][archs4web]. The package
207 | will evolve over time to suit the needs of analysts who work with this great
208 | resource via their R workspace.
209 | 
210 | While we, the authors of this package, are big fans the
211 | [ARCHS4 Project][archs4web], please note that this package is not affiliated
212 | with, nor endorsed by the creators of ARCHS4. As such, any issues, bugs, or
213 | comments should be directed towards the authors of this package, or filed under
214 | its issue tracker:
215 | 
216 | https://github.com/denalitherapeutics/archs4/issues
217 | 
218 | 
219 | [//]: # (References ===========================================================)
220 | ```{r child = system.file("rmdparts/references.Rmd", package = "archs4")}
221 | ```
222 | 


--------------------------------------------------------------------------------
/R/geo-utils.R:
--------------------------------------------------------------------------------
  1 | #' Classify a vector of sample or series GEO ID's as such
  2 | #'
  3 | #' GEO series identifiers all start with GSE and sample identifiers all
  4 | #' start with GSM. We use that to identify what types of identifiers are
  5 | #' passed into `id`
  6 | #'
  7 | #' @export
  8 | #'
  9 | #' @param id a character vector of `GSEnnnnn` or `GSMnnnnn` ids
 10 | #' @return a tibble of `unique(id)` indicating if the id is a `"series"`
 11 | #'   (GSEnnnnn), `"sample"` (GSMnnnnn), or `"unknown"`.
 12 | geo_id_type <- function(id) {
 13 |   id <- assert_character(id) %>% unique
 14 |   type <- case_when(
 15 |     is_geo_series_id(id) ~ "series",
 16 |     is_geo_sample_id(id) ~ "sample",
 17 |     TRUE                 ~ "unknown")
 18 |   tibble(id = id, type = type)
 19 | }
 20 | 
 21 | 
 22 | #' Query NCBI GEO through its REST interface
 23 | #'
 24 | #' @export
 25 | #' @importFrom xml2 read_xml xml_validate xml_ns_strip xml_contents xml_find_all
 26 | #'   xml_text
 27 | #' @source https://www.ncbi.nlm.nih.gov/geo/info/download.html
 28 | #'
 29 | #' @param accession Scalar character, GEO identifier for a series (GSE), a
 30 | #'   sample (GSM) or a platform (GPL).
 31 | #' @param validate Scalar boolean, validate the retrieved xml file against
 32 | #'   NCBI's schema?
 33 | #' @return xml2::xml_document object
 34 | #' @examples
 35 | #' query_geo("GSE109171")
 36 | query_geo <- function(accession, target = c("self", "gsm", "gpl", "gse", "all"),
 37 |                       validate = FALSE, verbose = FALSE) {
 38 |   target = match.arg(target)
 39 |   geo_url <- sprintf(
 40 |     paste0("https://www.ncbi.nlm.nih.gov/geo/query/",
 41 |            "acc.cgi?acc=%s&targ=%s&view=%s&form=%s"),
 42 |     accession, target, "full", "xml")
 43 |   if (verbose) {
 44 |     message(sprintf("Retrieving %s", geo_url))
 45 |   }
 46 |   res <- xml2::read_xml(geo_url)
 47 | 
 48 |   if (validate) {
 49 |     schema_url <- strsplit(xml2::xml_attr(res, attr = "schemaLocation"),
 50 |                            split = " ", fixed = TRUE)[[1]][-1]
 51 |     valid_xml <- xml2::xml_validate(res, schema = xml2::read_xml(schema_url))
 52 |     if (!valid_xml) {
 53 |       stop(sprintf("Validation with schema %s failed", schema_url))
 54 |     }
 55 |   }
 56 |   return(res)
 57 | }
 58 | 
 59 | #' Retrieve information about a GEO series
 60 | #'
 61 | #' Queries NCBI GEO's REST interface to retrieve e.g. title, summary and the
 62 | #' list of samples for a GEO series.
 63 | #'
 64 | #' @export
 65 | #' @importFrom xml2 xml_contents xml_find_all xml_text
 66 | #' @importFrom stats setNames
 67 | #'
 68 | #' @param accession Scalar character, GEO series identifier e.g. GSE109171
 69 | #' @param fields Character vector specifying which fields to extract from the
 70 | #'   XML file returned by GEO
 71 | #' @param ... Additional arguments passed on to the `query_geo` function.
 72 | #' @return List the requested `fields`
 73 | #' @examples
 74 | #' if (interactive()) {
 75 | #'   lookup_gse("GSE109171")
 76 | #' }
 77 | lookup_gse <- function(accession,
 78 |                        fields = c("Accession", "Title", "Summary",
 79 |                                   "Overall-Design", "Type", "Pubmed-ID",
 80 |                                   "Sample"),
 81 |                        ...) {
 82 |   fields <- match.arg(fields, several.ok = TRUE)
 83 |   xml <- query_geo(accession, target = "gse", ...) %>%
 84 |     xml2::xml_ns_strip()
 85 |   series_fields <- setdiff(fields, "Sample")
 86 |   series <- purrr::map(
 87 |     setNames(series_fields, tolower(series_fields)),
 88 |     .f = function(field) {
 89 |       xml %>%
 90 |         xml2::xml_find_first(xpath = "Series") %>%
 91 |         xml2::xml_find_first(field) %>%
 92 |         xml2::xml_text(trim = TRUE)
 93 |     })
 94 |   sample_fields <- setdiff(fields, series_fields)
 95 |   samples <- purrr::map(
 96 |     setNames(sample_fields, tolower(sample_fields)),
 97 |     .f = function(field) {
 98 |       xml %>%
 99 |         xml2::xml_find_all(xpath = "Sample") %>%
100 |         xml2::xml_text(trim = TRUE)
101 |     })
102 |   append(series, samples)
103 | }
104 | 
105 | #' Retrieve sample annotations from NCBI's Biosample database
106 | #'
107 | #' This function uses the `rentrez` package to retrieve sample annotations
108 | #' from NCBI's Biosample database.
109 | #'
110 | #' @export
111 | #' @importFrom rentrez entrez_search entrez_fetch
112 | #' @importFrom xml2 read_xml xml_find_all xml_text xml_attr xml_find_first
113 | #'   xml_children
114 | #' @importFrom tibble set_tidy_names
115 | #' @importFrom readr type_convert cols
116 | #' @source https://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESearch_
117 | #'
118 | #' @param x Character vector of sample identifiers to search the Biosample
119 | #'   database for. Typically either `BioSample (SAMN)`, `SRA (SRS)` or
120 | #'   `GEO (GSM)` accession numbers.
121 | #' @param retmax Scalar integer, the maximum number of (total) matches to
122 | #' retrieve from Entrez. See
123 | #'   \url{https://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESearch_}
124 | #'   for details. The number of records that can be retrieved in one query
125 | #'   must be < 100,000.
126 | #' @return A tbl_df data.frame with sample annotations. Column names and numbers
127 | #'   vary depending on the attributes available in the Biosample database.
128 | #' @examples
129 | #' if (interactive()) {
130 | #'  # BioSample identifiers
131 | #'  lookup_biosamples(c("GSM1947162", "GSM1947179"))
132 | #'  # mixed SRS and GSM identifiers
133 | #'  lookup_biosamples(c("SRS1171537", "SRS1171536", "GSM1947179"))
134 | #'  # mixed samples from two different studies (with different attributes)
135 | #'  lookup_biosamples(c("SRS1171537", "SRS1271536"))
136 | #' }
137 | lookup_biosamples <- function(x, retmax = 1e5 - 1L) {
138 |   if (retmax >= 1e5) stop("retmax must be < 100,000")
139 | 
140 |   search_results <- rentrez::entrez_search(
141 |     db = "biosample", retmax = retmax,
142 |     term = sprintf("%s[ACCN]", paste(x, collapse = "[ACCN] OR ")))
143 |   rentrez::entrez_fetch(db = "biosample", id = search_results$ids,
144 |                         rettype = "xml", retmax = retmax,
145 |                         parsed = FALSE) %>%
146 |     xml2::read_xml() %>%
147 |     xml2::xml_children() %>%
148 |     purrr::map_df(.f = function(x) {
149 |       tibble::data_frame(
150 |         Title = xml2::xml_find_first(
151 |           x,
152 |           xpath = ".//Description/Title") %>%
153 |           xml2::xml_text(trim = TRUE),
154 |         OrganismName = xml2::xml_find_first(
155 |           x,
156 |           xpath = ".//Description/Organism/OrganismName") %>%
157 |           xml2::xml_text(trim = TRUE),
158 |         taxonomy_id = xml2::xml_find_all(
159 |           x,
160 |           xpath = ".//Description/Organism") %>%
161 |           xml2::xml_attr("taxonomy_id"),
162 |         BioSample = xml2::xml_find_first(
163 |           x,
164 |           xpath = './/Ids/Id[@db="BioSample"]') %>%
165 |           xml2::xml_text(trim = TRUE),
166 |         SRA = xml2::xml_find_first(
167 |           x,
168 |           xpath = './/Ids/Id[@db="SRA"]') %>%
169 |           xml2::xml_text(trim = TRUE),
170 |         GEO = xml2::xml_find_first(
171 |           x,
172 |           xpath = './/Ids/Id[@db="GEO"]') %>%
173 |           xml2::xml_text(trim = TRUE),
174 |         value = xml2::xml_find_all(
175 |           x,
176 |           xpath = ".//Attribute") %>%
177 |           xml2::xml_text(trim = TRUE),
178 |         key = xml2::xml_find_all(
179 |           x,
180 |           xpath = ".//Attribute") %>%
181 |           xml2::xml_attr("attribute_name")
182 |       )
183 |     }) %>%
184 |     dplyr::distinct() %>%
185 |     tidyr::spread(key = key, value = value) %>%
186 |     tibble::set_tidy_names(syntactic = TRUE, quiet = TRUE) %>%
187 |     dplyr::rename_all(tolower) %>%
188 |     readr::type_convert(col_types = readr::cols())
189 | }
190 | 
191 | #' Retrieve metadata for an SRA accession
192 | #'
193 | #' This function uses the EBI's or the NCBI's REST APIs to retrieve information
194 | #' about SRA data.
195 | #' Study accessions (ERP, SRP, DRP, PRJ prefixes), experiment accessions
196 | #' (ERX, SRX, DRX prefixes), sample accessions (ERS, SRS, DRS, SAM prefixes)
197 | #' and run accessions (ERR, SRR, DRR prefixes) can be supplied.
198 | #' For more information see \url{http://www.ebi.ac.uk/ena/browse/file-reports}
199 | #' @param x SRA identifier
200 | #' @param from Scalar character, specifying either \code{ncbi} or \code{ena} as
201 | #' the source database
202 | #' @return A tbl_df data.frame
203 | #' @note The output data.frame will be different for the two source databases.
204 | #' @importFrom readr read_tsv read_csv cols
205 | #' @export
206 | #' @examples
207 | #' if (interactive()) {
208 | #'    # retrieve study annotations
209 | #'    retrieve_sra_metadata("SRP066489")
210 | #'    # paired-end samples
211 | #'    retrieve_sra_metadata("PRJEB2054", "ena") %>%
212 | #'    dplyr::filter(sample_accession == "SAMEA728920")
213 | #' }
214 | retrieve_sra_metadata <- function(x, from = c("ena", "ncbi")) {
215 |   from <- match.arg(from)
216 |   runinfo <- switch(
217 |     from,
218 |     ncbi = {
219 |       sra_url <- sprintf(
220 |         paste0("https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?",
221 |                "save=efetch&rettype=runinfo&db=sra&term=%s"), x)
222 |       readr::read_csv(url(sra_url), col_types = readr::cols())
223 |     },
224 |     ena = {
225 |       sra_url <- sprintf(paste0(
226 |         "http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=%s",
227 |         "&result=read_run"), x)
228 |       readr::read_tsv(url(sra_url), col_types = readr::cols())
229 |     }
230 |   )
231 |   return(runinfo)
232 | }
233 | 


--------------------------------------------------------------------------------
/R/Archs4Repository.R:
--------------------------------------------------------------------------------
  1 | #' An interface to a locally downloaded ARCHS4 dataset
  2 | #'
  3 | #' This instantiates an object that acts as a central broker to handle queries
  4 | #' against the ARCHS4 dataset. Please refer to the vignette for instructions
  5 | #' on how to setup a local directory to act as an Archs4Repository.
  6 | #'
  7 | #' @export
  8 | #'
  9 | #' @param datadir The directory that stores the ARCHS4 data.
 10 | #' @return an Arhcs4DataSet object
 11 | Archs4Repository <- function(datadir = getOption("archs4.datadir")) {
 12 |   kosher.dir <- archs4_local_data_dir_validate(echo = FALSE, datadir)
 13 |   if (!isTRUE(kosher.dir)) {
 14 |     stop(kosher.dir, call. = FALSE)
 15 |   }
 16 | 
 17 |   asi <- archs4_sample_table(datadir = datadir)
 18 |   gstats <- asi %>%
 19 |     filter(!is.na(sample_h5idx_gene)) %>%
 20 |     group_by(organism) %>%
 21 |     summarize(nseries = length(unique(series_id)),
 22 |               nsamples = length(unique(sample_id))) %>%
 23 |     mutate(feature_type = "gene")
 24 | 
 25 |   tstats <- asi %>%
 26 |     filter(!is.na(sample_h5idx_transcript)) %>%
 27 |     group_by(organism) %>%
 28 |     summarize(nseries = length(unique(series_id)),
 29 |               nsamples = length(unique(sample_id))) %>%
 30 |     mutate(feature_type = "transcript")
 31 | 
 32 |   out <- list(
 33 |     datadir = datadir,
 34 |     file_info = archs4_file_info(datadir),
 35 |     meta = archs4_meta(datadir),
 36 |     sample_covariates = archs4_sample_covariates(datadir),
 37 |     sample_stats = bind_rows(gstats, tstats),
 38 |     sample_table = asi)
 39 | 
 40 |   # You *know* we're going to make a "remote" or "service" version of an
 41 |   # Archs4Repository, in due time ...
 42 |   class(out) <- c("LocalArchs4Repository", "Archs4Repository")
 43 |   out
 44 | }
 45 | 
 46 | #' @export
 47 | #' @method print Archs4Repository
 48 | print.Archs4Repository <- function(x, ...) {
 49 |   cat(format(x, ...), "\n")
 50 | }
 51 | 
 52 | format.Archs4Repository <- function(x, ...) {
 53 |   asi <- sample_table(x)
 54 | 
 55 |   out <- paste0(
 56 |     "===========================================================\n",
 57 |     paste(class(x)[1L], "object\n"),
 58 |     "-----------------------------------------------------------\n",
 59 |     "datadir: ", x$datadir, "\n\n",
 60 |     "mouse data:\n",
 61 |     "----------\n",
 62 |     "       gene: ", nseries(x, "gene", "mouse"), " series; ",
 63 |                      nsamples(x, "gene", "mouse"), " samples\n",
 64 |     " transcript: ", nseries(x, "transcript", "mouse"), " series; ",
 65 |                      nsamples(x, "transcript", "mouse"), " samples\n\n",
 66 |     "human data:\n",
 67 |     "-----------\n",
 68 |     "       gene: ", nseries(x, "gene", "human"), " series; ",
 69 |                      nsamples(x, "gene", "human"), " samples\n",
 70 |     " transcript: ", nseries(x, "transcript", "human"), " series; ",
 71 |                      nsamples(x, "transcript", "human"), " samples\n",
 72 |     "===========================================================\n")
 73 | }
 74 | 
 75 | nsamples <- function(x, feature_type = c("gene", "transcript"),
 76 |                      source = archs4_sources()) {
 77 |   assert_class(x, "Archs4Repository")
 78 |   feature_type <- match.arg(feature_type)
 79 |   source <- match.arg(source)
 80 |   take <- x$sample_stats$organism == source &
 81 |     x$sample_stats$feature_type == feature_type
 82 |   x$sample_stats$nsamples[take]
 83 | }
 84 | 
 85 | nseries <- function(x, feature_type = c("gene", "transcript"),
 86 |                     source = archs4_sources()) {
 87 |   assert_class(x, "Archs4Repository")
 88 |   feature_type <- match.arg(feature_type)
 89 |   source <- match.arg(source)
 90 |   take <- x$sample_stats$organism == source &
 91 |     x$sample_stats$feature_type == feature_type
 92 |   x$sample_stats$nseries[take]
 93 | }
 94 | 
 95 | #' Retrieves the directory that contains the data for the Archs4Repository
 96 | #'
 97 | #' @export
 98 | #' @param x an `Archs4Repository`
 99 | datadir <- function(x, ...) {
100 |   assert_class(x, "Archs4Repository")
101 |   x$datadir
102 | }
103 | 
104 | #' @export
105 | #' @rdname archs4_feature_info
106 | #' @param x an `Archs4Repository`
107 | feature_info <- function(x, feature_type = "gene", source = "human",
108 |                          augmented = TRUE, ...) {
109 |   assert_class(x, "Archs4Repository")
110 |   assert_choice(feature_type, c("gene", "transcript"))
111 |   assert_choice(source, sources(x))
112 |   archs4_feature_info(feature_type, source, augmented, datadir(x), ...)
113 | }
114 | 
115 | #' Perform a loose/fuzzy lookup for a gene/transcript feature.
116 | #'
117 | #' This funciton facilitates exploratory data analyses by trying to find gene
118 | #' or transcripts by different type of identifiers (symbol, ensembl_id, etc).
119 | #'
120 | #' @export
121 | #' @param x An Archs4Repository
122 | #' @param query a character string of feature names to look for
123 | #' @param feature_type "gene" or "transcript"
124 | #' @param source organism dataset to lookup
125 | #' @return a tibble of features that match against the query. The first column
126 | #'   is the value of the query itself. If no match is found for a query, its
127 | #'   row is all `NA`.
128 | #' @examples
129 | #' a4 <- Archs4Repository()
130 | #' features <- feature_lookup(a4, c("CFAP65", "PECR", "ENSG00000131408"),
131 | #'                            feature_type = "gene", source ="human")
132 | feature_lookup <- function(x, query, feature_type = "gene",
133 |                            source = "human", ...) {
134 |   assert_class(x, "Archs4Repository")
135 |   assert_character(query, min.len = 1L, any.missing = FALSE)
136 |   assert_choice(feature_type, c("gene", "transcript"))
137 |   query <- unique(query)
138 | 
139 |   fi <- feature_info(x, feature_type, source, ...)
140 |   if (feature_type == "gene") {
141 |     search <- c("ensembl_id", "symbol", "entrez_id", "a4external")
142 |   } else {
143 |     search <- c("ensembl_id", "gene_id", "symbol")
144 |   }
145 |   search <- intersect(search, colnames(fi))
146 | 
147 |   idxs <- sapply(query, function(qry) .fuzzy_lookup(fi, search, qry))
148 |   isna <- is.na(idxs)
149 |   if (any(isna)) {
150 |     warning("The following ", feature_type, " queries were not found",
151 |             paste(query[isna], paste = ","))
152 |     # idxs <- idxs[!isna]
153 |     # query <- query[!isna]
154 |   }
155 |   bind_cols(
156 |     tibble(query = query),
157 |     fi[idxs,])
158 | }
159 | 
160 | .fuzzy_lookup <- function(x, columns, query) {
161 |   assert_data_frame(x)
162 |   assert_subset(columns, colnames(x))
163 |   assert_string(query)
164 |   query <- tolower(query)
165 | 
166 |   idx <- NA_integer_
167 |   for (cname in columns) {
168 |     vals <- tolower(x[[cname]])
169 |     i <- which(vals == query)
170 |     if (length(i) > 1) {
171 |       warning("More than one row for `", query, "` found -- taking first one",
172 |               call. = TRUE)
173 |       idx <- i[1L]
174 |       break
175 |     } else if (length(i) == 1) {
176 |       idx <- i
177 |     }
178 |   }
179 |   if (is.null(idx)) {
180 |     warning("Could not find feature using provided query: `", oquery, "`",
181 |             immediate. = TRUE)
182 |   }
183 |   idx
184 | }
185 | 
186 | 
187 | #' @export
188 | #' @rdname archs4_file_info
189 | #' @param x an `Archs4Repository`
190 | file_info <- function(x) {
191 |   assert_class(x, "Archs4Repository")
192 |   x[["file_info"]]
193 | }
194 | 
195 | #' @export
196 | #' @rdname archs4_file_path
197 | #' @param x an `Archs4Repository`
198 | file_path <- function(x, key) {
199 |   assert_class(x, "Archs4Repository")
200 |   archs4_file_path(key, file_info = file_info(x), datadir = datadir(x))
201 | }
202 | 
203 | #' Extract the read depth and normalization factors for the samples
204 | #'
205 | #' @export
206 | #' @param x an `Archs4Repository`
207 | #' @param with_a4libsize If `TRUE`, includes an `a4libsize` column, which
208 | #'   was extracted from the `meta/reads_aligned` hdf5 file. Defaults to `FALSE`.
209 | #' @return a tibble with sample_id, a4libsize, libsize, normfactor
210 | libstats <- function(x, with_a4libsize = FALSE) {
211 |   assert_class(x, "Archs4Repository")
212 |   cols <- c("libsize", "normfactor")
213 |   if (with_a4libsize) cols <- c("a4libsize", cols)
214 |   sample_table(x) %>%
215 |     select(sample_id, !!cols) %>%
216 |     distinct(sample_id, .keep_all = TRUE)
217 | }
218 | 
219 | #' @export
220 | #' @rdname archs4_sample_table
221 | #' @param x an `Archs4Repository`
222 | sample_table <- function(x, ...) {
223 |   assert_class(x, "Archs4Repository")
224 |   x$sample_table
225 | }
226 | 
227 | #' @export
228 | #' @rdname archs4_series_status
229 | #' @param x an `Archs4Repository`
230 | series_status <- function(x, id, ...) {
231 |   assert_class(x, "Archs4Repository")
232 |   archs4_series_status(id, sample_table = sample_table(x), datadir = datadir(x))
233 | }
234 | 
235 | #' @export
236 | #' @rdname archs4_meta
237 | #' @param x an `Archs4Repository`
238 | meta <- function(x) {
239 |   assert_class(x, "Archs4Repository")
240 |   x$meta
241 | }
242 | 
243 | #' @export
244 | #' @rdname archs4_sources
245 | #' @param x an `Archs4Repository`
246 | sources <- function(x) {
247 |   assert_class(x, "Archs4Repository")
248 |   meta(x)[["sources"]]
249 | }
250 | 
251 | #' @export
252 | #' @rdname archs4_sample_covariates
253 | #' @param x an `Archs4Repository`
254 | sample_covariates <- function(x, ...) {
255 |   assert_class(x, "Archs4Repository")
256 |   x$sample_covariates
257 | }
258 | 
259 | #' @export
260 | #' @rdname archs4_sample_info
261 | #' @param x an `Archs4Repository`
262 | sample_info <- function(x, id,
263 |                         columns = c("Sample_title", "Sample_source_name_ch1"),
264 |                         check_missing_samples = TRUE, ...) {
265 |   assert_class(x, "Archs4Repository")
266 |   archs4_sample_info(id, columns, sample_table(x), sample_covariates(x),
267 |                      check_missing_samples, datadir(x), ...)
268 | }
269 | 
270 | 


--------------------------------------------------------------------------------
/R/archs4-features.R:
--------------------------------------------------------------------------------
  1 | #' Create meta information for the genes and transcripts in the ARCHS4 dataset.
  2 | #'
  3 | #' @description
  4 | #' This is a preprocessing function that is required to successfully build an
  5 | #' `Archs4Repository`. It is not really intended for use during analyses.
  6 | #'
  7 | #' This Function creates *all* of the feature-level CSV files for the features
  8 | #' enumerated in the `meta/genes` gene-level hdf5 file, and the
  9 | #' `meta/transcript` transcript identfiers in the transctipt-level hdf5 file for
 10 | #' the mouse and human files found in `datadir`.
 11 | #'
 12 | #' **In order for this to work** you have to download the approprate human and
 13 | #' mouse gtf files from ensembl and save them in `datadir`. Reference the
 14 | #' [archs4_local_data_dir_validate()] function.
 15 | #'
 16 | #' For the initial relesae of the ARCHS4 dataset, the
 17 | #' `Homo_sapiens.GRCh38.90.gtf.gz` and `Mus_musculus.GRCm38.90.gtf.gz` were
 18 | #' used.
 19 | #'
 20 | #' @details
 21 | #' This function will write the augmented transcript- and gene-level files in
 22 | #' the `datadir`, using the following pattern:
 23 | #' `<organism>_<feature_type>_augmented_info.csv.gz`
 24 | #'
 25 | #' Gene symbols are the only piece of information provided for the row-level
 26 | #' identifieres for the gene count matrices. Furthermore, the gene symbol used
 27 | #' in mouse are in all uppercase, which is not how genes are referred to there.
 28 | #' In order to augment the gene symbol information with gene-level identifiers
 29 | #' and other information, we parse relatively recent GTFs provided by GENCODE.
 30 | #'
 31 | #' The fruits of the labor generated by this function are used by the
 32 | #' [archs4_feature_info()] function.
 33 | #'
 34 | #' Note that this function will replace already existing "`augmented`" files
 35 | #' if the already exist in `datadir`.
 36 | #'
 37 | #' @export
 38 | #' @importFrom utils write.csv
 39 | #' @seealso [archs4_feature_info()]
 40 | #'
 41 | #' @param datadir The directory that has the mouse and human expression
 42 | #'   hdf5 files. There will be `SPECIES_FEATURETYPE_augmented_info.csv.gz` files
 43 | #'   saved in this directory whe this function completes.
 44 | create_augmented_feature_info <- function(datadir = getOption("archs4.datadir")) {
 45 |   if (!requireNamespace("GenomicRanges", quietly = TRUE)) {
 46 |     stop("GenomicRanges package is required to create augmented feature tables")
 47 |   }
 48 |   if (!requireNamespace("rtracklayer", quietly = TRUE)) {
 49 |     stop("rtracklayer package is required to create augmented feature tables")
 50 |   }
 51 | 
 52 |   # Ensure that the appropriate GTF files exist
 53 |   sources <- archs4_sources()
 54 |   source.keys <- paste0(sources, "_gtf")
 55 |   gtfs <- archs4_file_path(source.keys, stop_if_missing = FALSE,
 56 |                            datadir = datadir)
 57 |   if (any(is.na(gtfs))) {
 58 |     stop("Some GTF files are missing, please run ",
 59 |          "`archs4_local_data_dir_validate()` for further information")
 60 |   }
 61 | 
 62 |   for (s in sources) {
 63 |     message("Constructing ", s, " augmented annotations ...")
 64 |     gtf.fn <- gtfs[paste0(s, "_gtf")]
 65 |     tx.fn <- local({
 66 |       key <- paste0(s, "_transcript_info")
 67 |       fn <- suppressWarnings(
 68 |         archs4_file_path(key, stop_if_missing = FALSE, na_missing = FALSE,
 69 |                          datadir = datadir)
 70 |       )
 71 |       sub("\\.gz$", "", fn)
 72 |     })
 73 |     gn.fn <- local({
 74 |       key <- paste0(s, "_gene_info")
 75 |       fn <- suppressWarnings(
 76 |         archs4_file_path(key, stop_if_missing = FALSE, na_missing = FALSE,
 77 |                          datadir = datadir)
 78 |       )
 79 |       sub("\\.gz$", "", fn)
 80 |     })
 81 | 
 82 |     gr <- rtracklayer::import.gff(gtf.fn)
 83 | 
 84 |     a4.tinfo <- archs4_feature_info("transcript", s, augmented = FALSE)
 85 |     txinfo <- .augmented_transcript_info(gr, a4.tinfo)
 86 |     write.csv(txinfo, tx.fn, row.names = FALSE)
 87 |     system(paste("gzip", tx.fn))
 88 | 
 89 |     a4.ginfo <- archs4_feature_info("gene", s, augmented = FALSE)
 90 |     ginfo <- .augmented_gene_info(gr, a4.ginfo)
 91 |     write.csv(ginfo, gn.fn, row.names = FALSE)
 92 |     system(paste("gzip", gn.fn))
 93 |     message("")
 94 |   }
 95 | 
 96 | }
 97 | 
 98 | #' Helper function that creats augmented transcript information file.
 99 | #'
100 | #' @noRd
101 | #'
102 | #' @param gr a GRanges object from .load_gtf, we assume this is enembl gtfs
103 | #' @param a4.tinfo The "raw" information that the ARCHS4 data stores for its
104 | #'   transcripts, ie. the output from
105 | #' @return a decorated `a4.tinfo` tibble of augmented transcript information.
106 | .augmented_transcript_info <- function(gr, a4.tinfo) {
107 |   requireNamespace("GenomicRanges", quietly = TRUE)
108 |   dfa <- GenomicRanges::as.data.frame(gr)
109 | 
110 |   tx.info.all <- filter(dfa, type == "transcript")
111 |   tx.info <- select(tx.info.all, transcript_id, transcript_biotype,
112 |                     gene_name, gene_id, gene_biotype, gene_source,
113 |                     seqnames, start, end, strand)
114 |   if (any(duplicated(tx.info$transcript_id))) {
115 |     stop("Duplicated transcript ids found in gtf file")
116 |   }
117 | 
118 |   # After the full_join, we will find which transcripts are exclusive to the
119 |   # gtf and the archs4 transcript file (hopefully none in the later case)
120 |   check.me <- tx.info %>%
121 |     full_join(a4.tinfo, by = c("transcript_id" = "ensembl_id"))
122 |   stopifnot(sum(duplicated(check.me$transcript_id)) == 0L)
123 | 
124 |   gtf.only <- filter(check.me, is.na(h5idx))
125 |   message(
126 |     nrow(gtf.only), " transcripts in gtf only ",
127 |     sprintf("(%.2f%% of transcripts from gtf)",
128 |             nrow(gtf.only) / nrow(tx.info)))
129 | 
130 |   a4.only <- filter(check.me, !is.na(h5idx) & is.na(gene_biotype))
131 |   message(
132 |     nrow(a4.only), " transcripts are only in arch4 tx file ",
133 |     sprintf("(%.2f%% of transcripts in ARCHS4 hdf5 file)",
134 |             nrow(a4.only) / nrow(a4.tinfo)))
135 | 
136 |   out <- a4.tinfo %>%
137 |     left_join(tx.info, by = c("ensembl_id" = "transcript_id")) %>%
138 |     rename(symbol = gene_name)
139 |   stopifnot(
140 |     nrow(out) == nrow(a4.tinfo),
141 |     all(out$h5idx == a4.tinfo$h5idx))
142 |   out
143 | }
144 | 
145 | #' Helper function to create augmented gene-level metadata.
146 | #'
147 | #' @noRd
148 | #'
149 | #' @param x a GRanges object from .load_gtf, we assume this is enembl gtfs
150 | #' @param a4.ginfo result from
151 | #'   `archs4_feature_info("gene", ..., augmented = FALSE)`
152 | #' @return a decorated `a4.ginfo` table.
153 | .augmented_gene_info <- function(gr, a4.ginfo) {
154 |   requireNamespace("GenomicRanges", quietly = TRUE)
155 |   has.ens <- "ens_id" %in% colnames(a4.ginfo)
156 |   # Not trusting the ensembl_id's for now, due to this:
157 |   # https://github.com/MaayanLab/archs4/issues/3
158 |   has.ens <- FALSE
159 |   if (has.ens) {
160 |     a4.ginfo$join <- a4.ginfo$ens_id
161 |   } else {
162 |     a4.ginfo$join <- a4.ginfo$a4name
163 |   }
164 |   dup.join <- a4.ginfo$join[duplicated(a4.ginfo$join)]
165 | 
166 |   a4.ginfo.all <- a4.ginfo
167 |   a4.ginfo <- filter(a4.ginfo, !join %in% dup.join)
168 | 
169 |   # a4.ginfo <- mutate(a4.ginfo, join = tolower(a4name))
170 |   stopifnot(sum(duplicated(a4.ginfo$join)) == 0L)
171 | 
172 |   gr.exons <- gr[gr$type == "exon"]
173 |   grl <- GenomicRanges::split(gr.exons, gr.exons$gene_id)
174 |   rgrl <- GenomicRanges::reduce(grl)
175 |   gwidths <- sum(GenomicRanges::width(grl))
176 |   gwidths <- tibble(gene_id = names(gwidths), length = unname(gwidths))
177 | 
178 |   dfa <- as.data.frame(gr)
179 |   txstats <- dfa %>%
180 |     filter(type == "transcript") %>%
181 |     filter(!is.na(as.integer(seqnames)) | seqnames %in% c("X", "Y", "M")) %>%
182 |     droplevels %>%
183 |     group_by(gene_id) %>%
184 |     summarize(ntx = n())
185 | 
186 |   # Grooming a symbol-based, gene-level metadata table
187 |   g.info.all <- filter(dfa, type == "gene")
188 |   g.info <- g.info.all %>%
189 |     mutate(seqnames = as.character(seqnames)) %>%
190 |     filter(!is.na(as.integer(seqnames)) | seqnames %in% c("X", "Y", "M")) %>%
191 |     droplevels %>%
192 |     select(gene_id, gene_name, gene_biotype, seqnames, start, end, strand)
193 |   if (has.ens) {
194 |     g.info$join <- g.info$gene_id
195 |   } else {
196 |     g.info$join <- g.info$gene_name
197 |   }
198 | 
199 |   if (any(duplicated(g.info.all$gene_id))) {
200 |     stop("Duplicated gene ids")
201 |   }
202 | 
203 |   # There are likely duplicated "join" (symbol) rows in g.info, so deal with
204 |   # that first -- this is a hack. Let's order the info table so that the
205 |   # annotation entries (ie. gene_biotype) we care about appear first and pick
206 |   # the first by join
207 |   if (FALSE) {
208 |     dup.symbols <- g.info$join[duplicated(g.info$join)]
209 |     wtf <- g.info %>%
210 |       filter(join %in% dup.symbols) %>%
211 |       arrange(join, gene_id)
212 |     View(wtf)
213 |   }
214 |   bt.order <- c('protein_coding', 'miRNA', 'lincRNA', 'rRNA', 'snoRNA',
215 |                 'scRNA', 'scaRNA', 'sRNA')
216 |   bt.order <- c(bt.order, setdiff(g.info$gene_biotype, bt.order))
217 |   g.info$gene_biotype <- factor(g.info$gene_biotype, bt.order)
218 |   g.info.u <- g.info %>%
219 |     arrange(join, gene_biotype) %>%
220 |     filter(!duplicated(join))
221 | 
222 |   # After the full_join, we will find which genes are exclusive to the
223 |   # gtf and the archs4 gene file (hopefully none in the later case)
224 |   check.me <- full_join(g.info.u, a4.ginfo, by = "join")
225 |   stopifnot(sum(duplicated(check.me$join)) == 0L)
226 | 
227 |   gtf.only <- filter(check.me, is.na(h5idx))
228 |   message(
229 |     nrow(gtf.only), " genes in gtf only ",
230 |     sprintf("(%.2f%% of genes from gtf)",
231 |             nrow(gtf.only) / nrow(g.info.u)))
232 | 
233 |   a4.only <- filter(check.me, !is.na(h5idx) & is.na(gene_biotype))
234 |   message(
235 |     nrow(a4.only), " genes are only in arch4 tx file ",
236 |     sprintf("(%.2f%% of genes in ARCHS4 hdf5 file)",
237 |             nrow(a4.only) / nrow(a4.ginfo)))
238 | 
239 |   out <- a4.ginfo %>%
240 |     left_join(g.info.u, by = "join") %>%
241 |     select(-join) %>%
242 |     rename(symbol = gene_name)
243 |   out <- left_join(out, gwidths, by = "gene_id")
244 |   stopifnot(
245 |     nrow(out) == nrow(a4.ginfo),
246 |     all(out$h5idx == a4.ginfo$h5idx))
247 |   out
248 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  3 | Overview
  4 | ========
  5 | 
  6 | The `archs4` package provides utility functions to query and explore the expression profiling data made available through the [ARCHS4 project](https://amp.pharm.mssm.edu/archs4/), which is described in the following publication:
  7 | 
  8 | [Massive mining of publicly available RNA-seq data from human and mouse](https://www.nature.com/articles/s41467-018-03751-6).
  9 | 
 10 | Because this package requires the user to download a number of data files that are external to the package, the [installation instructions](#installation) are *a bit* more involved than other R packages, and we leave them for [the end of this document](#installation).
 11 | 
 12 | Usage
 13 | =====
 14 | 
 15 | After [successful installation](#installation) of this package, you can query the series and samples included in the ARCHS4 repository, as well as materialize the expresion data into well-known bioconductor assay containers for downstream analysis.
 16 | 
 17 | To query GEO series and samples, you can use the `sample_info` function:
 18 | 
 19 | ``` r
 20 | library(archs4)
 21 | 
 22 | a4 <- Archs4Repository()
 23 | ids <- c('GSE89189', 'GSE29943', "GSM1095128", "GSM1095129", "GSM1095130")
 24 | sample.info <- sample_info(a4, ids)
 25 | head(sample.info)
 26 | #> # A tibble: 6 x 8
 27 | #>   series_id sample_id  Sample_title Sample_source_name_ch1 query_type
 28 | #>   <chr>     <chr>      <chr>        <chr>                  <chr>     
 29 | #> 1 GSE89189  GSM2360252 10318X2      iPS microglia          series    
 30 | #> 2 GSE89189  GSM2360253 7028X2       iPS microglia          series    
 31 | #> 3 GSE89189  GSM2360254 x2-1         iPS microglia          series    
 32 | #> 4 GSE89189  GSM2360255 x2-2         iPS microglia          series    
 33 | #> 5 GSE89189  GSM2360256 x2-3         iPS microglia          series    
 34 | #> 6 GSE89189  GSM2360257 x2-4         iPS microglia          series    
 35 | #> # ... with 3 more variables: sample_h5idx_gene <int>,
 36 | #> #   sample_h5idx_transcript <int>, organism <chr>
 37 | ```
 38 | 
 39 | You can use the `as.DGEList` function to materialize an `edgeR::DGEList` from a an arbitrary number of GEO sample and series identifiers. The only restriction is that the data from the series/samples must all be from the same species.
 40 | 
 41 | The most often use-case will likely be to create a `DGEList` for a given study. For instance, the GEO series identifier [`"GSE89189"`](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE89189) refers to the expression data generated to support the [Abud et al. iPSC-Derived Human Microglia-like Cells ...](https://www.ncbi.nlm.nih.gov/pubmed/28426964) paper.
 42 | 
 43 | Creating a `DGEList` from this study will create an object with 27,024 genes across 37 samples in about 1.5 seconds:
 44 | 
 45 | ``` r
 46 | yg <- as.DGEList(a4, "GSE89189", feature_type = "gene")
 47 | ```
 48 | 
 49 | The following command retrieves the 178,135 transcript level counts for this experiment in about 1.5 seconds, as well:
 50 | 
 51 | ``` r
 52 | yt <- as.DGEList(a4, "GSE89189", feature_type = "transcript")
 53 | ```
 54 | 
 55 | Installation
 56 | ============
 57 | 
 58 | The installation of the `archs4` package is a bit more involved than a standard package installation and can be roughly broken down into three steps.
 59 | 
 60 | 1.  Install the R package along with its dependencies.
 61 | 2.  Download a number of (large) data files into a specific folder.
 62 | 3.  Generate metadata from the files downloaded in (2) for downstream use.
 63 | 
 64 | We will walk you through each step in this section.
 65 | 
 66 | R Package Installation
 67 | ----------------------
 68 | 
 69 | The `arcsh4` package depends on other packages that are available through both [CRAN](https://cran.r-project.org/) and [Bioconductor](http://bioconductor.org/). For that reason, we will use the [`BiocInstaller::biocLite()`](https://www.bioconductor.org/install/#why-biocLite) function to install this package, which can seamlessly install packages from github, CRAN, and Bioconductor.
 70 | 
 71 | ``` r
 72 | source("https://bioconductor.org/biocLite.R")
 73 | biocLite("denalitherapeutics/archs4", build_vignettes=TRUE)
 74 | library("archs4")
 75 | ```
 76 | 
 77 | When you first load the `archs4` library, you will notice a startup message telling you that something isn't quite right with your `archs4` installation. The message will look something like this:
 78 | 
 79 |     Note that your default archs4 data directory is NOT setup correctly
 80 | 
 81 |       * Run `archs4_local_data_dir_validate()` to diagnose
 82 |       * Refer to the ARCHS4 Data Download section of the archs4 vignette for more information
 83 | 
 84 |     Your default archs4 data directory (`getOption("archs4.datadir")`) is:
 85 | 
 86 |       ~/.archs4data
 87 | 
 88 | In order for the package to work correctly, you must download a number of files which are enumerated in the [Data File Download](#data-file-download) section below into a single directory. You will then instruct the `archs4` package the path to the directory that holds all of these files by setting the value of R's global `"archs4.datadir`" option to be the path to that directory.
 89 | 
 90 | Data File Download
 91 | ------------------
 92 | 
 93 | You will have to create a directory on your filesystem which will hold a number of data files that the `archs4` package depends on. Let's call this directory `$ARCHS4DIR`, which we will define here to be `~/archs4v2data`.
 94 | 
 95 | The `archs4` package provides the `archs4_local_data_dir_create()` convenience function which creates this directory and copies over a `meta.yaml` file into that directory. The purpose of this file is to specify the names of the downloaded files that correspond to the human and mouse-level gene and transcript-level data.
 96 | 
 97 | ``` r
 98 | library(archs4)
 99 | archs4dir <- "~/archs4v2data"
100 | archs4_local_data_dir_create(archs4dir)
101 | ```
102 | 
103 | Once this directory is created successfully, you will then have to download the following files into it:
104 | 
105 | -   archs4
106 |     -   [`human_matrix.h5`](https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5): human gene-level counts
107 |     -   [`human_hiseq_transcript_v2.h5`](https://s3.amazonaws.com/mssm-seq-matrix/human_hiseq_transcript_v2.h5): human transcript-level counts
108 |     -   [`mouse_matrix.h5`](https://s3.amazonaws.com/mssm-seq-matrix/mouse_matrix.h5): mouse gene-level counts
109 |     -   [`mouse_hiseq_transcript_v2.h5`](https://s3.amazonaws.com/mssm-seq-matrix/mouse_hiseq_transcript_v2.h5): mouse transcript-level counts
110 | -   ensembl
111 |     -   `Homo_sapiens.GRCh38.90.gtf.gz`: gtf used for human transcript annotations <ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_sapiens.GRCh38.90.gtf.gz>
112 |     -   `Mus_musculus.GRCm38.90.gtf.gz`: gtf used for mouse transcript annotations <ftp://ftp.ensembl.org/pub/release-90/gtf/mus_musculus/Mus_musculus.GRCm38.90.gtf.gz>
113 | 
114 | The enumerated items above contain links to the files that need to be downloaded. You can right-click on them and select `Save As ...` and instruct your web-browser to save them to your local `$ARCHS4DIR`.
115 | 
116 | **NOTE**: Most all of the `archs4` functions accept a `datadir` parameter, which should be the path to `$ARCHS4DIR`. For convenience, the default value of this parameter is always set to `getOption("archs4.datadir")`. This means that you can modify your `~/.Rprofile` file to set the value of this option to `"~/archs4v2data"` (for instance), so that the package will always look there by default. If this option is not set in your `~/.Rprofile`, the default value for this option is "~/.archs4data".
117 | 
118 | Feature-Level Metadata Generation
119 | ---------------------------------
120 | 
121 | The datasets currently made available by the [ARCHS4 Project](https://amp.pharm.mssm.edu/archs4/) only provide minimal feature-level metadata:
122 | 
123 | -   the features in the gene-level datasets are identified only by their symbol; and
124 | -   only the ensembl transcript id's are provided for the features in the transcript-level datasets
125 | 
126 | We want to augment these features with richer annotations, such as the ensembl gene identifiers or gene biotypes, for instance.
127 | 
128 | To make such data generation automatic and easy for the user, once you have downloaded the Ensembl GTF files listed above into the `$ARCHS4DIR`, you can run the `create_augmented_feature_info()` to extract these extra feature-level metadata from the GTF files and store them as tables inside `$ARCHS4DIR` for later use.
129 | 
130 | ``` r
131 | create_augmented_feature_info(archs4dir)
132 | ```
133 | 
134 | This function will load and parse the GTF files from human and mouse, and create gene- and transcript-level `*.csv.gz` files in the `$ARCHS4DIR` which the `archs4` package will then later use downstream.
135 | 
136 | Once your `$ARCHS4DIR` is setup, you may find it convenient to set the default value for R's global `"archs4.datadir"` option to the `$ARCHS4DIR` directory you just setup. To do so, you can put the following line in your `~/.Rprofile` file:
137 | 
138 | ``` r
139 | options(archs4.datadir = "~/archs4v2data")
140 | ```
141 | 
142 | ARCHS4 Installation Heatlh
143 | --------------------------
144 | 
145 | Because the installation of this package is a bit more involved than most, we have also provided an `archs4_local_data_dir_validate()` function, which you can run over your `$ARCHS4DIR` in order to check on "the health" of your install.
146 | 
147 | This function will simply look at your `$ARCHS4DIR` to ensure that the required files are there, and tries to give you helpful error messages if not.
148 | 
149 | For instance, if the first two files enumerated in the [Data File Download](#data-file-download) section were missing from your `$ARCHS4DIR` (ie. `human_matrix.h5` and `human_hiseq_transcript_v2.h5`), you would be warned that "something isn't right" when you first load the `archs4` package. You could then run the `archs4_local_data_dir_validate()` to see what is wrong:
150 | 
151 | ``` r
152 | archs4_local_data_dir_validate(archs4dir)
153 | #> The following ARCHS4 files are missing, please download them:
154 | #>   * human_matrix.h5: https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5
155 | #>   * human_hiseq_transcript_v2.h5: #> https://s3.amazonaws.com/mssm-seq-matrix/human_hiseq_transcript_v2.h5
156 | ```
157 | 
158 | **NOTE:** If all installation and data download/processing steps have been completed successfully, a call to `archs4_local_data_dir_validate()` will simply return `TRUE`.
159 | 
160 | Package Development
161 | -------------------
162 | 
163 | If you are developing this package, you will find that it will be convenient to symlink the package's default `archs4.datadir` path (`~/.arcsh4data`) to the `$ARCHS4DIR` you just setup. This is because often times things like roxygen2 document compilation, unit testing, etc. happen in a vanilla R workspace, which won't run the configuration that is prescribed in your `~/.Rprofile` file.
164 | 


--------------------------------------------------------------------------------
/R/expression.R:
--------------------------------------------------------------------------------
  1 | #' Retrieve expression data for genes/transcripts across samples
  2 | #'
  3 | #' Returns the estimated counts for the genes/transcripts enumerated in the
  4 | #' `features` table for the samples enumerated in the `samples` table.
  5 | #'
  6 | #' Note that the values returned are simply estimated counts. They are not
  7 | #' normalized by sequencing depth. For now, the only use for this function is
  8 | #' to compare how ratios of genes compare across samples.
  9 | #'
 10 | #' @export
 11 | #' @importFrom edgeR cpm
 12 | #' @importFrom reshape2 melt
 13 | #' @seealso [as.DGEList()]
 14 | #'
 15 | #' @param a4 An `Archs4Repository`
 16 | #' @param features a `tibble` of feature descriptors (as returned from
 17 | #'   [feature_info()]). Really this table simply needs the following columns:
 18 | #'   * `"a4name"` or `"ensembl_id" or `;
 19 | #'   * `"feature_type"`: (gene or transcript)
 20 | #'   * `"source"` (human or mouse),;
 21 | #' @param samples a samples identifier: ie. a tibble with series_id and
 22 | #'   sample_id columns.
 23 | #' @param type "gene" or "transcript"-level data?
 24 | #' @param source "mouse" or "human" pass this in explicitly if there is no
 25 | #'   "source" column in your `features` or `samples` data.frame
 26 | #' @param feature_meta additional columns from
 27 | #'   `feature_info(a4, feature_type = feature_type)`
 28 | #' @param sample_meta metadata information to include for the samples. These
 29 | #'   values are extracted from the `meta/VALUE` files in the respective hdf5
 30 | #'   data files. See [archs4_sample_info()] for more details.
 31 | #' @return a data.frame of expression data. Each row is an observation
 32 | #'   (one gene one sample)
 33 | #' @examples
 34 | #' a4 <- Archs4Repository()
 35 | #' gnz <- feature_lookup(a4, c("CFAP65", "PECR"))
 36 | #' gexprs <- fetch_expression(a4, gnz)
 37 | fetch_expression <- function(a4, features, samples = NULL,
 38 |                              feature_type = "gene", source = NULL,
 39 |                              feature_meta = "symbol",
 40 |                              sample_meta = c("Sample_title", "Sample_source_name_ch1"),
 41 |                              prior.count = 3, ...) {
 42 |   assert_class(a4, "Archs4Repository")
 43 |   source <- .extract_source(features, samples, source)
 44 | 
 45 |   # Extract Features
 46 |   if (is.character(features)) {
 47 |     features <- feature_lookup(a4, features, feature_type = feature_type,
 48 |                                source = source)
 49 |   }
 50 |   assert_data_frame(features)
 51 |   assert_subset(c("ensembl_id", "feature_type"), colnames(features))
 52 | 
 53 |   features.all <- distinct(features, ensembl_id)
 54 |   features <- features.all %>%
 55 |     inner_join(feature_info(a4, feature_type, source), by = "ensembl_id")
 56 |   fmiss <- anti_join(features.all, features, by = "ensembl_id")
 57 |   if (nrow(fmiss) > 0L) {
 58 |     warning("Missing ", nrow(fmiss), " features from query")
 59 |   }
 60 | 
 61 |   all.samples <- a4 %>%
 62 |     sample_table(feature_type = type) %>%
 63 |     filter(organism == source) %>%
 64 |     select(series_id, sample_id)
 65 | 
 66 |   samples.specified <- !is.null(samples)
 67 |   if (!samples.specified) {
 68 |     samples <- all.samples
 69 |   } else {
 70 |     assert_data_frame(samples)
 71 |     assert_subset(c("series_id", "sample_id"), colnames(samples))
 72 |     samples <- distinct(samples, series_id, sample_id, .keep_all = TRUE)
 73 |     missing.samples <- samples %>%
 74 |       anti_join(all.samples, by = c("series_id", "sample_id"))
 75 |     if (nrow(missing.samples)) {
 76 |       warning(nrow(missing.samples), " requested samples are not in archs4")
 77 |       samples <- samples %>%
 78 |         semi_join(all.samples, by = c("series_id", "sample_id"))
 79 |     }
 80 |   }
 81 | 
 82 |   sids <- unique(samples$sample_id)
 83 | 
 84 |   y <- as.DGEList(a4, sids, features = features, feature_type = feature_type,
 85 |                   check_missing_samples = FALSE)
 86 | 
 87 |   counts <- reshape2::melt(y$counts)
 88 |   counts$Var1 <- as.character(counts$Var1)
 89 |   counts$Var2 <- as.character(counts$Var2)
 90 |   colnames(counts) <- c("ensembl_id", "sample_id", "count")
 91 | 
 92 |   cpms <- reshape2::melt(edgeR::cpm(y, prior.count = prior.count, log = TRUE))
 93 |   counts$cpm <- cpms[[3]]
 94 | 
 95 |   out <- left_join(samples, counts, by = c("sample_id"))
 96 | 
 97 |   if (is.character(feature_meta)) {
 98 |     fi.meta <- feature_info(a4, feature_type = feature_type, source = source)
 99 |     fi.meta <- filter(fi.meta, !is.na(ensembl_id))
100 |     feature_meta <- unique(c("ensembl_id", feature_meta))
101 |     fi.meta <- fi.meta[, colnames(fi.meta) %in% feature_meta, drop = FALSE]
102 |     if (ncol(fi.meta) > 1L) {
103 |       out <- left_join(out, fi.meta, by = "ensembl_id")
104 |     }
105 |   }
106 | 
107 |   if (is.character(sample_meta)) {
108 |     si <- tryCatch({
109 |       sample_info(a4, sids, sample_meta, check_missing_samples = FALSE)
110 |     }, error = function(e) NULL)
111 |     if (is.data.frame(si)) {
112 |       scols <- unique(c("sample_id", sample_meta))
113 |       si <- si[, intersect(scols, colnames(si)), drop = FALSE]
114 |       if (ncol(si) > 1L) {
115 |         si <- distinct(si, sample_id, .keep_all = TRUE)
116 |         out <- left_join(out, si, by = "sample_id")
117 |       }
118 |     } else {
119 |       warning("There was an error fetching the sample metadata requested")
120 |     }
121 |   }
122 | 
123 |   out
124 | }
125 | 
126 | .extract_source <- function(features, samples, source) {
127 |   if (is.null(source)) {
128 |     source <- character()
129 |     if (is.data.frame(features)) {
130 |       source <- unique(features$source)
131 |     }
132 |     if (is.data.frame(samples)) {
133 |       source <- unique(c(source, samples$source))
134 |     }
135 |     source <- source[!is.null(source)]
136 |     if (length(source) == 0) {
137 |       stop("Don't know where to fetch sources from")
138 |     }
139 |   }
140 |   if (is.character(source)) {
141 |     if (length(source) != 1L) {
142 |       stop("Can't specificy multiple organisms for source -- one at a time!")
143 |     }
144 |   } else {
145 |     stop("source expected to be a character by know")
146 |   }
147 |   if (!source %in% archs4_sources()) {
148 |     stop("Unknown source:", source)
149 |   }
150 |   source
151 | }
152 | 
153 | #' Retrieves (organism) source of gene/transcript identifiers
154 | #'
155 | #' Parses ensembl identifiers and determines if they are for genes, or
156 | #' transcripts, as well as the organism they should belong to (human, mouse)
157 | #'
158 | #' @param x A vector of identifiers
159 | feature_source <- function(x, ...) {
160 |   stop("not implemented yet")
161 | }
162 | 
163 | # library size and normalization factor estimation =============================
164 | #
165 | # The code here was adapted from the code in edgeR.
166 | # We want to calculate and store library sizes and normalization factors
167 | # for an Archs4Repository, so we can calculate things like "cpm" on the fly ...
168 | # the super-fly.
169 | 
170 | # Default top-level TMM normalization parameters:
171 | #
172 | #   logratioTrim = 0.3
173 | #   sumTrim = 0.05
174 | #   doWeighting = TRUE
175 | #   Acutoff = -1e10
176 | #   p = 0.75
177 | #
178 | # Main TMM Calculation
179 | #   f75 <- .calcFactorQuantile(data=x, lib.size=lib.size, p=0.75)
180 | #   refColumn <- which.min(abs(f75-mean(f75)))
181 | #   f <- rep(NA,ncol(x)) # holds norm factors
182 | #   for(i in 1:ncol(x))
183 | #     f[i] <- .calcFactorWeighted(obs=x[,i], ref=x[,refColumn],
184 | #                                 libsize.obs=lib.size[i],
185 | #                                 libsize.ref=lib.size[refColumn],
186 | #                                 logratioTrim=logratioTrim,
187 | #                                 sumTrim=sumTrim,
188 | #                                 doWeighting=doWeighting,
189 | #                                 Acutoff=Acutoff)
190 | #
191 | # .calcFactorQuantile
192 | # 	y <- t(t(data)/lib.size)           # divides each column by its library size
193 | #   f <- apply(y,2,function(x) quantile(x,p=p)) # find value at upper quartile
194 | 
195 | # .expr.datasets <- c("human_gene",       "mouse_gene",
196 | #                     "human_transcript", "mouse_transcript")
197 | .expr.datasets <- c("human_gene", "mouse_gene")
198 | 
199 | #' Estimate normalization factors for datasets in the Archs4Repository
200 | #'
201 | #' This function will serialize the results of the library size and
202 | #' normalization factors into files inside `datadir(a4)`
203 | #'
204 | #' @export
205 | #' @param a4 The `Arcsh4Repository`
206 | #' @return Invisibly returns a tibble of the the library size and normalization
207 | #'   factors for the expression data in `a4` (invisibly).
208 | estimate_repository_norm_factors <- function(a4) {
209 |   info <- lapply(.expr.datasets, function(key) {
210 |     estimate_norm_factors(a4, key)
211 |   })
212 |   invisible(info)
213 | }
214 | 
215 | #' Calculates library size and norm factors for a specific dataset
216 | #'
217 | #' This function will serialize the results of the library size and
218 | #' normalization factors into files inside `datadir(a4)`
219 | #'
220 | #'
221 | estimate_norm_factors <- function(a4, key, n = 500, logratioTrim = 0.3,
222 |                                   sumTrim = 0.05, doWeighting = TRUE,
223 |                                   Acutoff = -1e10, p = 0.75, ...) {
224 |   if (FALSE) {
225 |     key = "mouse_gene"
226 |     n = 500; logratioTrim = 0.3; sumTrim = 0.05; doWeighting = TRUE;
227 |     Acutoff = -1e10; p = 0.75
228 |   }
229 | 
230 |   key <- match.arg(key, .expr.datasets)
231 |   h5.fn <- file_path(a4, key)
232 |   fn.out <- paste0(key, "-normfactors.csv")
233 |   fn.out <- file.path(datadir(a4), fn.out)
234 | 
235 |   # Almost could have saved a lot of time using the v4++ matrices since they
236 |   # include a "meta/reads_aligned" vector, however to use TMM, we still need
237 |   # to calculate the percentiles ...
238 |   # sinfo <- tibble(
239 |   #   series_id = trimws(h5read(h5.fn, "meta/Sample_series_id")),
240 |   #   sample_id = trimws(h5read(h5.fn, "meta/Sample_geo_accession")),
241 |   #   libsize = h5read(h5.fn, "meta/reads_aligned"))
242 |   sinfo <- .crankLibSize(h5.fn, n = n, p = p)
243 |   fn.tmp <- sub("\\.csv$", "-tmp.csv", fn.out)
244 |   write.csv(sinfo, fn.tmp, row.names = FALSE)
245 | 
246 |   # Identify the sample to use as the reference profile
247 |   refColumn <- which.min(abs(sinfo$quantile - mean(sinfo$quantile, na.rm = TRUE)))
248 |   ref.vals <- h5read(h5.fn, "data/expression", list(NULL, refColumn))
249 | 
250 |   # Run a TMM for all samples against the reference profile
251 |   nf <- .crankNormFactors(h5.fn, sinfo, ref.vals, n = n,
252 |                           logratioTrim = logratioTrim, sumTrim = sumTrim,
253 |                           doWeighting = doWeighting, Acutoff = Acutoff)
254 |   sinfo$normfactor <- nf
255 | 
256 |   write.csv(sinfo, fn.out, row.names = FALSE)
257 |   invisible(sinfo)
258 | }
259 | 
260 | # before the v4 gene-level count matrices, the read counts per sample were not
261 | # included, so we were calculating them manually.
262 | .crankLibSize <- function(h5.fn, n = 500, p = 0.75) {
263 |   message("Calculating library sizes ...")
264 |   sinfo <- tibble(
265 |     series_id = trimws(h5read(h5.fn, "meta/Sample_series_id")),
266 |     sample_id = trimws(h5read(h5.fn, "meta/Sample_geo_accession")))
267 | 
268 |   n.batches <- nrow(sinfo) / n
269 |   remainder <- nrow(sinfo) %% n
270 |   if (remainder > 0) n.batches <- n.batches + 1
271 |   n.batches <- floor(n.batches)
272 | 
273 |   offset <- 0
274 | 
275 |   res <- vector("list", n.batches)
276 |   for (i in 1:n.batches) {
277 |     idx.start <- offset + 1
278 |     idx.end <- idx.start + (if (i != n.batches) n else remainder) - 1L
279 |     idxs <- idx.start:idx.end
280 |     offset <- idx.end
281 | 
282 |     # lib size and percentile
283 |     dat <- h5read(h5.fn, "data/expression", list(NULL, idxs))
284 |     lib.size <- colSums(dat)
285 |     dat <- t(t(dat)/lib.size)
286 |     # Somehow some samples had NA counts, so we need to protect from that
287 |     fq <- apply(dat, 2, function(x) quantile(x, p = p, na.rm = TRUE))
288 |     res[[i]] <- data.frame(index = idxs, libsize = lib.size, quantile = fq)
289 |   }
290 |   res <- bind_rows(res)
291 |   bind_cols(sinfo, res)
292 | }
293 | 
294 | .crankNormFactors <- function(h5.fn, sinfo, ref.vals, n = n,
295 |                               logratioTrim = logratioTrim, sumTrim = sumTrim,
296 |                               doWeighting = doWeighting, Acutoff = Acutoff) {
297 |   message("Calculating normalization factors ...")
298 |   ref.size <- sum(ref.vals)
299 |   # Figure out how to batch iterate over our data, we now have to reload the
300 |   # data to calculate the normalization factors.
301 |   n.batches <- nrow(sinfo) / n
302 |   remainder <- nrow(sinfo) %% n
303 |   if (remainder > 0) n.batches <- n.batches + 1
304 |   n.batches <- floor(n.batches)
305 | 
306 |   # Load batches again and calc norm factors
307 |   nf <- numeric(nrow(sinfo))
308 |   offset <- 0
309 | 
310 |   for (i in 1:n.batches) {
311 |     idx.start <- offset + 1
312 |     idx.end <- idx.start + (if (i != n.batches) n else remainder) - 1L
313 |     idxs <- idx.start:idx.end
314 |     offset <- idx.end
315 | 
316 |     # lib size and percentile
317 |     dat <- h5read(h5.fn, "data/expression", list(NULL, idxs))
318 |     for (j in 1:ncol(dat)) {
319 |       idx <- idxs[j]
320 |       vals <- dat[,j]
321 |       if (any(is.na(vals))) {
322 |         nf[idx] <- NA_real_
323 |       } else {
324 |         nf[idx] <- .calcFactorWeighted(obs=vals, ref=ref.vals,
325 |                                        libsize.obs=sum(vals),
326 |                                        libsize.ref=ref.size,
327 |                                        logratioTrim=logratioTrim,
328 |                                        sumTrim=sumTrim,
329 |                                        doWeighting=doWeighting,
330 |                                        Acutoff=Acutoff)
331 |       }
332 |     }
333 |   }
334 | 
335 |   nf
336 | }
337 | 
338 | # Calculates the TMM between two libraries.
339 | #
340 | # This is lifted straight from edgeR_3.22.2. The only reason it is included here
341 | # is because it is not exported (rightfully) from the package.
342 | .calcFactorWeighted <- function(obs, ref, libsize.obs=NULL, libsize.ref=NULL,
343 |                                 logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE,
344 |                                 Acutoff=-1e10) {
345 |   obs <- as.numeric(obs)
346 |   ref <- as.numeric(ref)
347 | 
348 |   if( is.null(libsize.obs) ) nO <- sum(obs) else nO <- libsize.obs
349 |   if( is.null(libsize.ref) ) nR <- sum(ref) else nR <- libsize.ref
350 | 
351 |   logR <- log2((obs/nO)/(ref/nR))			# log ratio of expression, accounting for library size
352 |   absE <- (log2(obs/nO) + log2(ref/nR))/2	# absolute expression
353 |   v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref	 # estimated asymptotic variance
354 | 
355 |   #	remove infinite values, cutoff based on A
356 |   fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)
357 | 
358 |   logR <- logR[fin]
359 |   absE <- absE[fin]
360 |   v <- v[fin]
361 | 
362 |   if(max(abs(logR)) < 1e-6) return(1)
363 | 
364 |   #	taken from the original mean() function
365 |   n <- length(logR)
366 |   loL <- floor(n * logratioTrim) + 1
367 |   hiL <- n + 1 - loL
368 |   loS <- floor(n * sumTrim) + 1
369 |   hiS <- n + 1 - loS
370 | 
371 |   #	keep <- (rank(logR) %in% loL:hiL) & (rank(absE) %in% loS:hiS)
372 |   #	a fix from leonardo ivan almonacid cardenas, since rank() can return
373 |   #	non-integer values when there are a lot of ties
374 |   keep <- (rank(logR)>=loL & rank(logR)<=hiL) & (rank(absE)>=loS & rank(absE)<=hiS)
375 | 
376 |   if(doWeighting)
377 |     f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
378 |   else
379 |     f <- mean(logR[keep], na.rm=TRUE)
380 | 
381 |   #	Results will be missing if the two libraries share no features with positive counts
382 |   #	In this case, return unity
383 |   if(is.na(f)) f <- 0
384 |   2^f
385 | }
386 | 


--------------------------------------------------------------------------------
/R/archs4-functional.R:
--------------------------------------------------------------------------------
  1 | # These functions define a "high-level" interface to working with the data
  2 | # downloaded into an ARCHS4 Data Directory. They are the workhorses of this
  3 | # package.
  4 | #
  5 | # Although users can interact with the ARCHS4 data directory "directly" using
  6 | # these functions, most will find it more convenient to interact with ARCHS4
  7 | # using an `Archs4Repository`. Many of the functions here are also available
  8 | # against an `Archs4Repository` object, and are named by stripping the archs4_
  9 | # prefix here.
 10 | #
 11 | # For instance, the following are equivalent:
 12 | #
 13 | #     R> archs4_feature_info(datadir)
 14 | #
 15 | #   and
 16 | #
 17 | #     R> a4 <- Archs4Repository(datadir)
 18 | #     R> feature_info(a4)
 19 | #
 20 | # Interacting with these data via the `Archs4Repository` should also better
 21 | # future-proof your code.
 22 | 
 23 | 
 24 | #' Retrieves the feature (gene/transcript) information for the archs4 data
 25 | #'
 26 | #' Only the gene symbols (`meta/genes` in gene expression hd5 file) or entrez
 27 | #' transcript identifiers (`meta/transcript` for the transcript hdf5 file) are
 28 | #' stored in thse data. We use [create_augmented_feature_info()] function to
 29 | #' generate and store extra metadata for these features, which are then appended
 30 | #' to these identifiers with this function.
 31 | #'
 32 | #' @export
 33 | #' @importFrom readr read_csv
 34 | #' @seealso [create_augmented_feature_info()]
 35 | #'
 36 | #' @param feature_type gene or transcript?
 37 | #' @param source human or mouse
 38 | #' @param augmented include extra gene- or transcript-level features?
 39 | #'   Default: `TRUE`
 40 | #' @param datadir the directory that stores the ARCHS4 data files
 41 | #' @param ... pass through
 42 | #' @return a tibble of information
 43 | archs4_feature_info <- function(feature_type = "gene", source = "human",
 44 |                                 augmented = TRUE,
 45 |                                 datadir = getOption("archs4.datadir"), ...) {
 46 |   assert_choice(feature_type, c("gene", "transcript"))
 47 |   assert_choice(source, archs4_sources(datadir))
 48 |   assert_flag(augmented)
 49 | 
 50 |   fkey <- paste(source, feature_type, sep = "_")
 51 |   h5.fn <- archs4_file_path(fkey, datadir = datadir)
 52 | 
 53 |   if (feature_type == "gene") {
 54 |     # I am using `a4name` instead of symbol, because the values stored here
 55 |     # aren't universally/technically symbols. In the mouse dataset, the "symbol"
 56 |     # names are all uppercase, which is a non-canonical something.
 57 | 
 58 |     ainfo <- tibble(a4name = rhdf5::h5read(h5.fn, "meta/genes"),
 59 |                     entrez_id = .h5read(h5.fn, "meta/gene_entrezid"))
 60 |     is.v2m <- source == "mouse" & all(ainfo$a4name == toupper(ainfo$a4name))
 61 |     is.v2m <- TRUE
 62 |     if (source == "human") {
 63 |       ainfo[["a4external"]] <- .h5read(h5.fn, "meta/gene_hgnc")
 64 |       ainfo[["refseq_id"]] <- .h5read(h5.fn, "meta/gene_refseqid")
 65 |       join <- c(a4name = "a4name")
 66 |     } else {
 67 |       join <- c(a4name = "a4name")
 68 |       if (is.v2m) {
 69 |         # v2 of the mouse-level gene data had gene names in all caps. I am
 70 |         # changing this to "title case".
 71 |         ainfo[["join"]] <- paste0(
 72 |           substr(ainfo[["a4name"]], 1, 1),
 73 |           tolower(substring(ainfo[["a4name"]], 2)))
 74 |         ainfo[["ens_id"]] <- as.character(.h5read(h5.fn, "meta/gene_ensemblid"))
 75 |       } else {
 76 |         ainfo[["a4external"]] <- .h5read(h5.fn, "meta/gene_mgi")
 77 |         ainfo[["ens_id"]] <- as.character(.h5read(h5.fn, "meta/gene_ensemblid"))
 78 |       }
 79 |       # if (!all(is.na(ainfo[["ens_id"]]))) {
 80 |       #   join <- c(ens_id = "ensembl_id")
 81 |       # } else {
 82 |       #   join <- c(join = "join")
 83 |       #   ainfo[["join"]] <- tolower(ainfo[["a4name"]])
 84 |       # }
 85 |     }
 86 |     ainfo[["h5idx"]] <- seq(nrow(ainfo))
 87 |   } else {
 88 |     ainfo <- tibble(ensembl_id_full = rhdf5::h5read(h5.fn, "meta/transcript"),
 89 |                     ensembl_id = sub("\\.\\d+$", "", ensembl_id_full),
 90 |                     length = rhdf5::h5read(h5.fn, "meta/transcriptlength"),
 91 |                     h5idx = seq(ensembl_id_full))
 92 |   }
 93 | 
 94 |   if (augmented) {
 95 |     aug.fn <- paste(source, feature_type, "info", sep = "_")
 96 |     aug.fn <- archs4_file_path(aug.fn, datadir = datadir)
 97 | 
 98 |     if (feature_type == "gene") {
 99 |       if (source == "human") {
100 |         coltypes <- "cciccccciici"
101 |       } else {
102 |         coltypes <- "ccicccccciici"
103 |       }
104 |       meta <- readr::read_csv(aug.fn, col_types = coltypes)
105 |       meta <- mutate(meta,
106 |                      entrez_id = ifelse(entrez_id == "null", NA, entrez_id))
107 |       meta <- rename(meta, ensembl_id = "gene_id")
108 |       meta <- mutate(meta, feature_type = "gene")
109 |       if (join == "join") {
110 |         meta[["join"]] <- tolower(meta[["symbol"]])
111 |       }
112 |     } else {
113 |       join <- "ensembl_id_full"
114 |       coltypes <- "cciicccccciic"
115 |       meta <- readr::read_csv(aug.fn, col_types = coltypes)
116 |       meta <- mutate(meta, feature_type = "transcript")
117 |     }
118 | 
119 |     # remove duplicate columns in meta table except for join column
120 |     meta <- meta[, !colnames(meta) %in% setdiff(colnames(ainfo), join)]
121 |     meta <- meta[!duplicated(meta[[join]]) & !is.na(meta[[join]]),]
122 |     tmp <- left_join(ainfo, meta, by = join)
123 | 
124 |     stopifnot(
125 |       nrow(tmp) == nrow(ainfo),
126 |       all(tmp[[join]] == ainfo[[join]]))
127 |     ainfo <- tmp
128 | 
129 |     # if (feature_type == "gene") {
130 |     #   ainfo$symbol_guess <- ainfo$symbol
131 |     #   ainfo$symbol <- ainfo$a4name
132 |     # }
133 |   }
134 | 
135 |   if ("ens_id" %in% colnames(ainfo) && !"ensembl_id" %in% colnames(ainfo)) {
136 |     ainfo <- rename(ainfo, ensembl_id = "ens_id")
137 |   }
138 |   if ("join" %in% colnames(ainfo)) ainfo[["join"]] <- NULL
139 | 
140 |   ainfo$source <- source
141 |   ainfo
142 | }
143 | 
144 | #' Retrieves a table of files that back an Archs4Repository
145 | #'
146 | #' @description
147 | #' A yaml iskept in the Archs4 data directory (`getOption("archs4.datadir")`)
148 | #' that links keys, (ie. `mouse_gene`) to the name of the file in the directory.
149 | #' This abstraction is introduced so that the version of these files can be
150 | #' updated with new downloads, and the user simply has to modify the yaml file
151 | #' so that they are used downstream
152 | #'
153 | #' Reference the "ARCHS4 Data Download" section in the vignette for more
154 | #' information.
155 | #'
156 | #' @export
157 | #' @param datadir the directory that stores the ARCHS4 data files
158 | archs4_file_info <- function(datadir = getOption("archs4.datadir")) {
159 |   yml <- archs4_meta(datadir = datadir)
160 |   finfo <- yml[["files"]]
161 | 
162 |   take <- function(l, wut) {
163 |     val <- l[[wut]]
164 |     if (is.null(val)) NA_character_ else val
165 |   }
166 | 
167 |   tibble(
168 |     key = names(finfo),
169 |     source = sapply(finfo, take, "source"),
170 |     name = sapply(finfo, take, "name"),
171 |     url = sapply(finfo, take, "url"),
172 |     desription = sapply(finfo, take, "description"),
173 |     file_path = file.path(datadir, name),
174 |     file_exists = file.exists(file_path))
175 | }
176 | 
177 | #' Identify the file path on the system for specific ARCHS4 files.
178 | #'
179 | #' By default, this function will throw an error if a file does not exist
180 | #' upon lookup. To return the *expected* path to the, even if it does not
181 | #' exist on the file system, set `stop_if_missing = FALSE`.
182 | #'
183 | #' @export
184 | #' @importFrom stats setNames
185 | #'
186 | #' @param key the lookup key for the file, ie. `"human_gene"` or `"mouse_gene"`.
187 | #'   The known keys are enumerated in `archs4_file_info()$key` column.
188 | #' @param stop_if_missing defaults to `TRUE`, which causes this function to
189 | #'   throw an error if the file does not exist at the expected `file_path`.
190 | #'   Set this to `FALSE` to simply raise a warning
191 | #' @param na_missing by default, we set paths to files that don't exist to
192 | #'   `NA`. Set this to `FALSE` to retrieve the expected path of the missing
193 | #'   file.
194 | #' @param file_info the output from [archs4_file_info()], which enumerates the
195 | #'   files used by the `Archs4Repository`.
196 | #' @param datadir the directory that stores the ARCHS4 data files
197 | #' @return a named (by `key`) character vector of paths to the filesystem that
198 | #'   correspond to the entries in `key`.
199 | archs4_file_path <- function(key, stop_if_missing = TRUE,  na_missing = TRUE,
200 |                              file_info = archs4_file_info(datadir),
201 |                              datadir = getOption("archs4.datadir")) {
202 |   assert_character(key, min.len = 1L)
203 |   assert_directory(datadir, 'r')
204 |   assert_class(file_info, "data.frame")
205 |   assert_names(c("key", "source", "file_path"),
206 |                subset.of = colnames(file_info))
207 |   query <- tibble(key = key) %>% left_join(file_info, by = "key")
208 |   qbad <- filter(query, is.na(source))
209 | 
210 |   if (nrow(qbad)) {
211 |     bad.key <- unique(qbad[["key"]])
212 |     stop("Unknown file queries: ", paste(bad.key, collapse = ", "))
213 |   }
214 | 
215 |   qmiss <- filter(query, !file_exists)
216 |   if (nrow(qmiss)) {
217 |     bad.key <- unique(qmiss[["key"]])
218 |     msg <- paste("Can not find archs4 file(s) on disk: ",
219 |                  paste(bad.key, collapse = ", "))
220 |     if (stop_if_missing) stop(msg) else warning(msg)
221 |     if (na_missing) {
222 |       query <- mutate(query, file_path = ifelse(file_exists, file_path, NA))
223 |     }
224 |   }
225 | 
226 |   setNames(query[["file_path"]], query[["key"]])
227 | }
228 | 
229 | #' Retrieves tibble of sample-level covariates available in mouse and human data.
230 | #'
231 | #' Enumerate the sample covariates available in mouse and human data.
232 | #' Note that the covariates available in human and mouse are the same
233 | #' between the gene and transcript level files
234 | #'
235 | #' @export
236 | #' @param datadir the directory that holds the archs4 data
237 | archs4_sample_covariates <- function(datadir = getOption("archs4.datadir"),
238 |                                      ...) {
239 |   mouse.covs <- .sample_metadata_files("mouse_gene", datadir) %>%
240 |     select(name) %>%
241 |     mutate(mouse = TRUE)
242 |   human.covs <- .sample_metadata_files("human_gene", datadir) %>%
243 |     select(name) %>%
244 |     mutate(human = TRUE)
245 |   sample.covs <- mouse.covs %>%
246 |     full_join(human.covs, by = "name") %>%
247 |     mutate(mouse = ifelse(is.na(mouse), FALSE, TRUE),
248 |            human = ifelse(is.na(human), FALSE, TRUE))
249 |   sample.covs
250 | }
251 | 
252 | #' Enumerates internal files from hdf5 binary that contain sample metadata
253 | #'
254 | #' This function is intentionally not exported
255 | #'
256 | #' @importFrom rhdf5 h5ls
257 | #' @noRd
258 | #'
259 | #' @param file hdf5 file in `datadar` to use to identify the internal "metal"
260 | #'   files that correspond to sample-level metadata. You shouldn't need to
261 | #'   change this as all hdf5 data files should carry the same metadata. This is
262 | #'   here mainly for debugging purposes.
263 | #' @param datadir the directory that holds the archs4 data
264 | #' @param ... pass through
265 | #' @return a vector of sample metadata names that are stored in archs4
266 | .sample_metadata_files <- function(file = "mouse_gene",
267 |                                    datadir = getOption("archs4.datadir"),
268 |                                    ...) {
269 |   h5.fn <- archs4_file_path(file, datadir = datadir)
270 |   all.files <- as_tibble(h5ls(h5.fn))
271 |   all.files[["idim"]] <- suppressWarnings(as.integer(all.files[["dim"]]))
272 | 
273 |   dat.dim <- local({
274 |     dd <- filter(all.files, group == "/data", name == "expression")
275 |     if (nrow(dd) != 1L) {
276 |       stop("The hdf5 file is not an archs4 expression matrix ",
277 |            sprintf("[%s]", h5.fn))
278 |     }
279 |     dd <- tolower(dd$dim)
280 |     out <- sapply(strsplit(dd, " *x *")[[1L]], as.integer)
281 |     setNames(out, c("rows", "columns"))
282 |   })
283 |   nsamples <- dat.dim["columns"]
284 |   nfeats <- dat.dim["rows"]
285 | 
286 |   sample.meta <- filter(all.files, group == "/meta", idim == nsamples)
287 |   mutate(sample.meta, h5name = paste(group, name, sep = "/"))
288 | }
289 | 
290 | #' Checks which samples from a single series are present/absent.
291 | #'
292 | #' Due to download or alignment issues, the ARCHS4 data processing pipeline may
293 | #' not include all of the samples included in a particular GEO series. This
294 | #' function will return a table with an `in_archs4` columns that indicates
295 | #' whether or not a sample from a particular series is present in ARCHS4.
296 | #'
297 | #' @export
298 | #'
299 | #' @param id a single GEO series id, ie. `"GSEnnnnn"`
300 | #' @param sample_table the output from [archs4_sample_table()], which lists
301 | #'   the series_id,sample_id combinations found in the ARCHS4 repository.
302 | #' @param datadir the directory that holds the archs4 data
303 | #' @return a tibble of information for a series.
304 | #' @examples
305 | #' info <- archs4_series_status("GSE89189")
306 | archs4_series_status <- function(id,
307 |                                  sample_table = archs4_sample_table(datadir = datadir),
308 |                                  datadir = getOption("archs4.datadir"),
309 |                                  ...) {
310 |   assert_string(id)
311 |   if (geo_id_type(id)[["type"]] != "series") {
312 |     stop("A GEO series id is required (ie. \"GSEnnnnn\")")
313 |   }
314 | 
315 |   gq <- lookup_gse(id)
316 |   geo <- tibble(series_id = id, sample_id = gq[["sample"]])
317 | 
318 |   archs <- filter(sample_table, series_id == id)
319 |   archs <- transmute(archs, sample_id, in_archs4 = TRUE)
320 | 
321 |   out <- left_join(geo, archs, by = "sample_id")
322 |   out[["in_archs4"]] <- !is.na(out[["in_archs4"]])
323 |   out
324 | }
325 | 
326 | 
327 | #' Retrieves information for samples by GSE series or sample IDs
328 | #'
329 | #' @description
330 | #' Fetch a tibble of series and sample information by querying the arcsh4
331 | #' dataset by GEO sample (GSE) or sample (GSM) ids.
332 | #'
333 | #' For each unique GEO series identifier (`"GSEnnnn"`), we will check if the
334 | #' ARCHS4 dataset is missing any of its samples when `check_missing_samples`
335 | #' is set to `TRUE` (default).
336 | #'
337 | #' @export
338 | #'
339 | #' @param id a character vector of GEO series or sample ids.
340 | #' @param columns the names of the sample metadata columns desired. This
341 | #'   defaults to `c("Sample_title", "Sample_source_name_ch1")`. The values
342 | #'   in `columns` must be a subset of the values enumerated in
343 | #'   [archs4_sample_covariates()].
344 | #' @param sample_table the output from [archs4_sample_table()], which lists
345 | #'   the series_id,sample_id combinations found in the ARCHS4 repository.
346 | #' @param sample_covariates the `data.frame`-definition of the sample covariates
347 | #'   found in the ARCHS4 datasetes, which is constructed via a call to
348 | #'   [archs4_sample_covariates()]. The parameter is included in here so that
349 | #'   a cached version of this `data.frame` can be re-used.
350 | #' @param check_missing_samples When `TRUE` (the default), this function will
351 | #'   check every unique GEO series identifier (`"GSEnnnn"`) for missing samples
352 | #'   by using an NCBI Rest service via a call to [archs4_series_status()],
353 | #'   and [lookup_gse()].
354 | #' @param datadir the directory that holds the archs4 data
355 | #' @return a tibble of series_id, sample_id, sample_h5idx, sample_title, and
356 | #'   sample_name columns. If the query sample or series query can't be found,
357 | #'   then there will be an `NA` value for these columns. The `query_type` column
358 | #'   will indicat whether the row was returned from querying by series or by
359 | #'   sample.
360 | #' @examples
361 | #' si <- archs4_sample_info("GSE52564") # ben barres transcriptome db ...
362 | archs4_sample_info <- function(id,
363 |                                columns = c("Sample_title", "Sample_source_name_ch1"),
364 |                                sample_table = archs4_sample_table(datadir = datadir),
365 |                                sample_covariates = archs4_sample_covariates(datadir),
366 |                                check_missing_samples = TRUE,
367 |                                datadir = getOption("archs4.datadir"), ...) {
368 |   input <- geo_id_type(id)
369 |   bad.id <- filter(input, type == "unknown")
370 |   if (nrow(bad.id)) {
371 |     stop("Malformed identifiers in query: ", paste(bad.id$id, collapse = ", "))
372 |   }
373 | 
374 |   # check sample metadata columns
375 |   get.meta <- !is.null(columns) && length(columns)
376 |   if (get.meta) {
377 |     columns <- unique(columns) %>%
378 |       assert_character(any.missing = FALSE) %>%
379 |       assert_subset(sample_covariates$name)
380 |   } else {
381 |     columns <- character()
382 |   }
383 | 
384 |   # This hurts: I'm doing this to ensure that queries to this function where
385 |   # all `id`s are not found in the ARCHS4 repository still return a tibble
386 |   # of the right dimensions, but has NAs in most places
387 |   dummy <- tibble(
388 |     series_id = character(),
389 |     sample_id = character(),
390 |     query_type = character(),
391 |     sample_h5idx_gene = integer(),
392 |     sample_h5idx_transcript = integer(),
393 |     organism = character())
394 |   for (cname in columns) dummy[[cname]] <- character()
395 | 
396 |   series <- input %>%
397 |     filter(type == "series") %>%
398 |     rename(series_id = id) %>%
399 |     left_join(sample_table, by = "series_id")
400 |   bad.series <- series %>%
401 |     filter(is.na(organism)) %>%
402 |     distinct(series_id) %>%
403 |     mutate(query_type = "series")
404 |   if (nrow(bad.series)) {
405 |     warning(nrow(bad.series), " series identifiers not found",
406 |             immediate. = TRUE)
407 |     series <- anti_join(series, bad.series, by = "series_id")
408 |     bad.series <- bind_rows(bad.series, dummy)
409 |   }
410 | 
411 |   with.missing.samples <- NULL
412 |   if (check_missing_samples && nrow(series)) {
413 |     for (s in unique(series[["series_id"]])) {
414 |       check <- archs4_series_status(s, sample_table = sample_table,
415 |                                     datadir = datadir)
416 |       missing <- filter(check, !in_archs4)
417 |       if (nrow(missing)) {
418 |         msg <- paste(
419 |           sprintf("The %s series has %d missing samples in ARCHS4:\n    *",
420 |                   s, nrow(missing)),
421 |           paste(missing[["sample_id"]], collapse ="\n    * "),
422 |           "\n")
423 |         warning(msg, immediate. = TRUE, call. = FALSE)
424 |         # TODO: Add `with_missing_samples` parameter, or does it make sense
425 |         #       to just automatically append these?
426 |         # it hurts me to do this:
427 |         with.missing.samples <- bind_rows(with.missing.samples, missing)
428 |       }
429 |     }
430 |   }
431 | 
432 |   samples <- input %>%
433 |     filter(type == "sample") %>%
434 |     rename(sample_id = id) %>%
435 |     left_join(sample_table, by = "sample_id")
436 |   bad.samples <- samples %>%
437 |     filter(is.na(organism)) %>%
438 |     distinct(sample_id) %>%
439 |     mutate(query_type = "sample")
440 |   if (nrow(bad.samples)) {
441 |     warning(nrow(bad.samples), " sample identifiers not found",
442 |             immediate. = TRUE)
443 |     samples <- anti_join(samples, bad.samples, by = "sample_id")
444 |     bad.samples <- bind_rows(bad.samples, dummy)
445 |   }
446 | 
447 |   query <- series %>%
448 |     bind_rows(samples) %>%
449 |     select(series_id, sample_id, query_type = type,
450 |            sample_h5idx_gene, sample_h5idx_transcript, organism)
451 | 
452 |   # Only perform this if there >= 1 series or sample identifiers were found
453 |   if (nrow(query) && get.meta) {
454 |     res <- query %>%
455 |       group_by(organism) %>%
456 |       do({
457 |         .with_sample_info(., columns, .$organism[1L], sample_covariates, datadir)
458 |       }) %>%
459 |       ungroup
460 |   } else {
461 |     res <- query
462 |   }
463 | 
464 |   out <- res %>%
465 |     bind_rows(bad.series) %>%
466 |     bind_rows(bad.samples)
467 | 
468 |   col.order <- c("series_id", "sample_id", columns)
469 |   col.order <- c(col.order, setdiff(colnames(out), col.order))
470 |   out[, col.order]
471 | }
472 | 
473 | #' Helper function that implements sample-level meatdata retrieval
474 | #'
475 | #' This function is only meant to be called within the `do({})` block in the
476 | #' [archs4_sample_info()] function, and as such does no argument checking and
477 | #' is intentionally not exported.
478 | #'
479 | #' @noRd
480 | #'
481 | #' @importFrom rhdf5 h5read
482 | .with_sample_info <- function(x, columns, organism, sample_covariates, datadir) {
483 |   organism <- organism[1L]
484 |   h5g.fn <- archs4_file_path(paste0(organism, "_gene"), datadir = datadir)
485 |   h5t.fn <- archs4_file_path(paste0(organism, "_transcript"), datadir = datadir)
486 | 
487 |   out <- sapply(columns, function(i) {
488 |     rep(NA_character_, nrow(x))
489 |   }, simplify = FALSE)
490 | 
491 |   # ensure that we only lookup covariates that are available for this organism
492 |   scovs <- sample_covariates[["name"]][sample_covariates[[organism]]]
493 |   columns <- intersect(columns, scovs)
494 | 
495 |   # we first try to get metadata from the gene matrix
496 |   ginfo <- !is.na(x$sample_h5idx_gene)
497 |   if (any(ginfo)) {
498 |     i <- list(x$sample_h5idx_gene[ginfo])
499 |     for (what in columns) {
500 |       out[[what]][ginfo] <- .h5read(h5g.fn, paste0("meta/", what), i)
501 |     }
502 |   }
503 | 
504 |   # anything left to fetch from the transcript hdf5?
505 |   tinfo <- !ginfo
506 |   if (any(tinfo)) {
507 |     i <- list(x$sample_h5idx_transcript[tinfo])
508 |     for (what in columns) {
509 |       out[[what]][tinfo] <- .h5read(h5t.fn, paste0("meta/", what), i)
510 |     }
511 |   }
512 | 
513 |   res <- bind_cols(out)
514 |   bind_cols(x, res)
515 | }
516 | 
517 | #' Lists the GEO series and samples available in the human and mouse datasets
518 | #'
519 | #' This function queries the human and mouse gene expression matrices from
520 | #' the ARCHS4 data release and combines their GEO series and sample identifiers
521 | #' into a long table, annotated with the organism the samples come from.
522 | #'
523 | #' This function executes very quickly (less thatn 0.10th of a second), so most
524 | #' sample-level query functions in this package which you would think would
525 | #' benefit from specifying human/mouse don't have to, as they will join into
526 | #' this table to find out what species is being queried.
527 | #'
528 | #' @export
529 | #' @importFrom readr read_csv
530 | #' @param feature_type currently, the `"gene"` and `"transcript"` datasets
531 | #'   are not the same.
532 | #' @param unroll_series There are some malformed series identifiers, like
533 | #'   `"GSE36025Xx-xXGSE49417Xx-xXGSE49847"` when the same sample_id appears
534 | #'   in multiple series. When this is `TRUE` (default), the series_id's are
535 | #'   unrolled and cleaned up.
536 | #' @param datadir the directory that holds the archs4 data
537 | #' @return a tibble of series_id, sample_id, species columns
538 | archs4_sample_table <- function(feature_type = c("all", "gene", "transcript"),
539 |                                 unroll_series = TRUE,
540 |                                 datadir = getOption("archs4.datadir")) {
541 |   feature_type <- match.arg(feature_type)
542 | 
543 |   if (feature_type == "all") {
544 |     gst <- archs4_sample_table("gene", unroll_series, datadir)
545 |     tst <- archs4_sample_table("transcript", unroll_series, datadir)
546 |     res <- gst %>%
547 |       full_join(tst, by = c("series_id", "sample_id"),
548 |                 suffix = c("_gene", "_transcript")) %>%
549 |       mutate(organism = ifelse(is.na(organism_gene), organism_transcript,
550 |                                organism_gene)) %>%
551 |       mutate(organism_gene = NULL, organism_transcript = NULL) %>%
552 |       select(series_id, sample_id, organism, everything())
553 |   } else {
554 |     res <- map_dfr(archs4_sources(), function(source) {
555 |       fkey <- paste(source, feature_type, sep = "_")
556 |       h5.fn <- archs4_file_path(fkey, datadir = datadir)
557 |       dat <- tibble(series_id = trimws(h5read(h5.fn, "meta/Sample_series_id")),
558 |                     sample_id = trimws(h5read(h5.fn, "meta/Sample_geo_accession")),
559 |                     organism = source,
560 |                     sample_h5idx = seq(sample_id))
561 |       if (feature_type == "gene") {
562 |         dat[["a4libsize"]] <- as.vector(.h5read(h5.fn, 'meta/reads_aligned'))
563 | 
564 |         # Load manually estimated library size and normalization factors.
565 |         sfn <- file.path(dirname(h5.fn), paste0(source, "_gene-normfactors.csv"))
566 |         if (file.exists(sfn)) {
567 |           sf <- suppressWarnings(read_csv(sfn, col_types = "cciidd"))
568 |           sf <- select(sf, series_id, sample_id, libsize, normfactor)
569 |           dat <- left_join(dat, sf, by = c("series_id", "sample_id"))
570 |         }
571 |       }
572 |       if (unroll_series) {
573 |         regex <- "[^GSEM0-9]+"
574 |         dat <- separate_rows(dat, series_id, sep = regex)
575 |       }
576 |     })
577 |   }
578 |   res
579 | }
580 | 
581 | #' Lists the different sources ARCHS4 is built for
582 | #'
583 | #' We hardocde these values in a lot of places ... who knows if one day these
584 | #' are updated?
585 | #'
586 | #' @export
587 | #' @param datadir the directory that holds the archs4 data
588 | #' @return a character vector listing the different sources (organisms) that
589 | #'   the ARCHS4 repository has data for.
590 | archs4_sources <- function(datadir = getOption("archs4.datadir")) {
591 |   archs4_meta(datadir)[["sources"]]
592 | }
593 | 
594 | #' Retrieves the meta information associated with an ARCHS4 datadir
595 | #'
596 | #' @export
597 | #' @importFrom yaml read_yaml
598 | #'
599 | #' @param datadir the directory that holds the archs4 data
600 | #' @return a list-representation of the `meta.yaml` file
601 | archs4_meta <- function(datadir = getOption("archs4.datadir")) {
602 |   assert_directory(datadir, "r")
603 |   fpath <- assert_file_exists(file.path(datadir, "meta.yaml"), "r", "yaml")
604 |   yml <- yaml::read_yaml(fpath)
605 | 
606 |   # doing some validation on this file
607 |   assert_character(yml[["sources"]])
608 |   assert_list(yml[["files"]])
609 |   yml
610 | }
611 | 


--------------------------------------------------------------------------------