├── tests
└── AnnotationHubData_unit_tests.R
├── inst
├── extdata
│ ├── inpDrosPsuedo.rda
│ └── badUCSCTracks
│ │ ├── allBadTracks.rda
│ │ ├── allPossibleTracks.rda
│ │ └── genomeTrackTable.Rda
├── unitTests
│ ├── cases
│ │ └── encodeDCCMetadata
│ │ │ ├── tbl.parsedEncodeMetadata.RData
│ │ │ ├── wgEncodeAwgDnaseUniform.info
│ │ │ └── wgEncodeAffyRnaChip.info
│ ├── test_ImportPreparer-class.R
│ ├── test_validityFunctions.R
│ ├── test_webAccessFunctions.R
│ ├── test_AnnotationHubConstructor.R
│ └── test_recipe.R
├── makefile
└── scripts
│ ├── addContributedResources.txt
│ └── singleContributedResourceTemplate.R
├── R
├── makeGrasp2Db.R
├── Message-class.R
├── ImportPreparer-class.R
├── makeAnnotationHubResource.R
├── makeUCSC2Bit.R
├── makeChEA.R
├── makeEnsemblGtfToGRanges.R
├── makeGencodeFasta.R
├── makeRefNet.R
├── validationFunctions.R
├── makeEnsemblTwoBit.R
├── makeStandardTxDbsToSqlite.R
├── makeInparanoid8ToDbs.R
├── makeStandardOrgDbsToSqlite.R
├── makeHaemCode.R
├── HubMetadata-class.R
├── webAccessFunctions.R
├── makeUCSCChain.R
├── ahmToJson.R
├── makedbSNPVCF.R
├── makeEncodeDCC.R
├── makeEnsemblFasta.R
├── trackWithAuxiliaryTableToGRangesRecipe.R
├── utils.R
└── makeGencodeGFF.R
├── man
├── flog.Rd
├── AnnotationHubData-package.Rd
├── upload_to_azure.Rd
├── upload_to_S3.Rd
├── ImportPreparer-class.Rd
├── makeGencodeFasta.Rd
├── makeEnsemblFasta.Rd
├── validationFunctions.Rd
├── makeStandardOrgDbs.Rd
├── updateResources.Rd
├── AnnotationHubMetadata-class.Rd
└── makeAnnotationHubMetadata.Rd
├── vignettes
└── IntroductionToAnnotationHubData.Rmd
├── appveyor.yml
├── DESCRIPTION
├── NAMESPACE
└── NEWS
/tests/AnnotationHubData_unit_tests.R:
--------------------------------------------------------------------------------
BiocGenerics:::testPackage("AnnotationHubData")

--------------------------------------------------------------------------------
/inst/extdata/inpDrosPsuedo.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/AnnotationHubData/devel/inst/extdata/inpDrosPsuedo.rda
--------------------------------------------------------------------------------
/inst/extdata/badUCSCTracks/allBadTracks.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/AnnotationHubData/devel/inst/extdata/badUCSCTracks/allBadTracks.rda
--------------------------------------------------------------------------------
/inst/extdata/badUCSCTracks/allPossibleTracks.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/AnnotationHubData/devel/inst/extdata/badUCSCTracks/allPossibleTracks.rda
--------------------------------------------------------------------------------
/inst/extdata/badUCSCTracks/genomeTrackTable.Rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/AnnotationHubData/devel/inst/extdata/badUCSCTracks/genomeTrackTable.Rda
--------------------------------------------------------------------------------
/inst/unitTests/cases/encodeDCCMetadata/tbl.parsedEncodeMetadata.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/AnnotationHubData/devel/inst/unitTests/cases/encodeDCCMetadata/tbl.parsedEncodeMetadata.RData
--------------------------------------------------------------------------------
/inst/unitTests/test_ImportPreparer-class.R:
--------------------------------------------------------------------------------
test_ImportPreparer_constructors <- function()
{
    classes <- names(getClassDef("ImportPreparer")@subclasses)
    ## need to be no-arg capable
    valid <- sapply(classes, function(cl) validObject(new(cl)))
    checkTrue(all(valid))
}

--------------------------------------------------------------------------------
/R/makeGrasp2Db.R:
--------------------------------------------------------------------------------
# The correct recipe (steps 1 and 2) is inside the grasp2db software package.

## FIXME: does not pass BiocVersion
## STEP 3: Call the helper to set up the newResources() method
makeAnnotationHubResource("Grasp2ImportPreparer",
                          grasp2db:::.makeAnnotationHubRecord)

--------------------------------------------------------------------------------
/man/flog.Rd:
--------------------------------------------------------------------------------
\name{flog}
\alias{flog}
\title{flog}

\description{

  Write a logging message to the console and to a file.

}

\usage{
flog(level, ...)
}

\arguments{
  \item{level}{A \code{character(1)} string object.}
  \item{\dots}{Further arguments.}
}

\details{

  Writes the message to the console and to a file.
}

\value{
  None.
}

\author{Dan Tenenbaum}

\seealso{\code{futile.logger}}

%%\example{
%% perhaps dan can provide one...
%% }

\keyword{classes}
--------------------------------------------------------------------------------
/R/Message-class.R:
--------------------------------------------------------------------------------
.Message <- setRefClass("Message",
    fields=list(
        name="character"
    ),
    methods=list(
        append = function(fmt, ...) {
            .self$name <- c(name, sprintf(fmt, ...))
            invisible(.self)
        },
        validity = function() {
            "report if any messages (e.g., after validity check)"
            if (length(name)) name else NULL
        },
        isComplete = function() {
            "stop if any messages"
            if (length(name)) {
                stop(paste(name, collapse="\n"))
            } else TRUE
        }
    )
)
--------------------------------------------------------------------------------
/R/ImportPreparer-class.R:
--------------------------------------------------------------------------------
## these classes are used for dispatch only

setClass("ImportPreparer", representation="VIRTUAL")

setMethod(show, "ImportPreparer", function(object) {
    cat("class:", class(object), "\n")
})

setGeneric("newResources", signature="importPreparer",
    function(importPreparer, currentMetadata = list(), ...)
        standardGeneric("newResources")
)

setGeneric("annotationHubRoot", signature="object",
    function(object)
        standardGeneric("annotationHubRoot"))

setGeneric("metadataList", signature="object",
    function(object)
        standardGeneric("metadataList"))

setGeneric("metadataTable", signature="object",
    function(object)
        standardGeneric("metadataTable"))

setGeneric("sourceUrls", signature="object",
    function(object)
        standardGeneric("sourceUrls"))

--------------------------------------------------------------------------------
/man/AnnotationHubData-package.Rd:
--------------------------------------------------------------------------------
\name{AnnotationHubData-package}
\alias{AnnotationHubRecipes}
\alias{AnnotationHubData-package}

\docType{package}
\title{
Transform public data resources into Bioconductor Data Structures
}
\description{
These recipes convert a wide variety and a growing number of public bioinformatic data sets into easily-used standard Bioconductor data structures.
}
\details{
This package provides a set of methods which convert bioinformatic data
resources into standard Bioconductor data types. For example, a UCSC
genome browser track, expressed as a BED file, is converted into a
GRanges object. Not every valuable data resource can be transformed
quite so easily; some require more elaborate transformation, and hence a
more specialized recipe. Every effort is made to limit the number of
recipes required. One strategy that helps is the principle of "zero
curation": unless absolutely required, the "cooked" version of the data
resource produced by a recipe is a simple and unembellished reflection
of the original data in its downloaded form.
}
\author{Dan Tenenbaum, Paul Shannon}

\seealso{\code{AnnotationHubMetadata-class}, \code{makeAnnotationHubMetadata}}

\keyword{package}
--------------------------------------------------------------------------------
/vignettes/IntroductionToAnnotationHubData.Rmd:
--------------------------------------------------------------------------------
---
title: "Introduction to AnnotationHubData"
author: "Lori Shepherd"
date: "Modified: February 2021. Compiled: `r format(Sys.Date(), '%d %b %Y')`"
output:
  BiocStyle::html_document:
    toc: true
---

# Overview

The AnnotationHubData package provides tools to acquire, annotate,
convert and store data for use in Bioconductor's `AnnotationHub`. Most of the
functions will be used by the Bioconductor Core Team. For information on how to
use `AnnotationHub` or how to create an `AnnotationHub` package please see the
vignettes in `AnnotationHub`.

# Creating an AnnotationHub Package or Converting to an AnnotationHub Package

Please see the HubPub vignette "CreateAHubPackage".
```
vignette("CreateAHubPackage", package="HubPub")
```

# Historical vignettes

The process for adding data to `AnnotationHub` has evolved substantially since
the first vignettes were written. Much of the information contained in those
documents is outdated or applicable only to repeat-run recipes added to the
code base. These documents have been retained for historical purposes and
are located in the inst/scripts/ directory of the `AnnotationHubData` package.
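
# Validating metadata locally

For contributors preparing a hub package, the metadata in
inst/extdata/metadata.csv can be checked locally before contacting the core
team. A minimal sketch, assuming a hypothetical package directory named
"MyHubPackage" that contains an inst/extdata/metadata.csv:

```
library(AnnotationHubData)
## Returns a list of AnnotationHubMetadata objects, or fails with an
## informative error when a field (e.g., Species, DispatchClass) is invalid.
ahms <- makeAnnotationHubMetadata("MyHubPackage", fileName="metadata.csv")
```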

--------------------------------------------------------------------------------
/man/upload_to_azure.Rd:
--------------------------------------------------------------------------------
\name{upload_to_azure}
\alias{upload_to_azure}
\alias{Azure}
\alias{DataLake}
\alias{AZURE_SAS_URL}
\title{Upload a file to Microsoft Azure Data Lake}
\description{This function is for uploading a file resource to the
  Microsoft Azure Data Lake.}
\usage{upload_to_azure(file, sas)}
\arguments{
  \item{file}{
    The file or directory to upload.
  }
  \item{sas}{
    A SAS URL for the designated destination on Microsoft Azure Data Lake.
  }
}
\details{
  Uses the \href{https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10}{azcopy Command Line Interface}
  to copy a file to Microsoft Azure Data Lake. Assumes azcopy is properly installed
  and that the \code{azcopy} program is in your PATH. The function
  performs a recursive copy automatically, so it can take a file or a directory
  for upload. The SAS URL is generated on Azure by someone who has
  permission to the desired destination. Please be sure to use the SAS URL
  and not the SAS token. The SAS URL can be provided as an argument; if
  the argument is not provided, the function will search for the system
  environment variable `AZURE_SAS_URL`.
}
\value{
  \code{TRUE} on success. If the command fails, the function
  will exit with an error.
}
\author{Lori Shepherd}

\examples{
\dontrun{
upload_to_azure("myfile.txt", "https://sasurl")
}
}
--------------------------------------------------------------------------------
/inst/makefile:
--------------------------------------------------------------------------------
PKG=AnnotationHubRecipes
default: build install

help:
	egrep "^#" makefile | sed "s/^#//"

# --- quickbuild: no vignette
#
quickbuild:
	(cd ../..; R CMD build --no-vignettes $(PKG))


# --- build
#
build:
	(cd ../..; R CMD build --no-vignettes $(PKG))

# --- install
#
install:
	(cd ../..; R CMD INSTALL $(PKG))

# --- check
#
check: clean build install
	(cd ../..; R CMD check --no-manual --no-vignettes --no-codoc --no-examples $(PKG))

# --- checkfull
#
checkfull:
	(cd ../..; R CMD build $(PKG))
	(cd ../..; R CMD check $(PKG))


# --- vanillaTest
# run all the unit tests, in a clean context
#

vanillaTest: build install
	- rm vanillaTest.out
	R --vanilla < vanillaTest.R > vanillaTest.out 2>&1

# --- vt
# run all the unit tests, in a clean context
#

vt: vanillaTest


# --- checkvig
# check just the vignette
#

checkvig:
	(cd ../..; R CMD check --no-manual --no-codoc --no-tests --no-examples $(PKG))


# --- tangle
# extract the R code from the vignette file
#

tangle:
	(cd ../vignettes; R CMD Stangle $(PKG).Rnw)



# --- sweave
# creates $(PKG).tex, runs all embedded examples
# run this before the pdf target
#
sweave:
	(cd ../vignettes; R CMD Sweave $(PKG).Rnw --pdf)

# --- pdf
# make and open $(PKG).pdf, the vignette
#

pdf: sweave
	(cd ../vignettes; open $(PKG).pdf)

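A usage sketch for upload_to_azure() (documented in man/upload_to_azure.Rd
above). The SAS URL below is a placeholder, and supplying it through the
AZURE_SAS_URL environment variable instead of the 'sas' argument is the
fallback described in that help page:

    ## not run: requires azcopy on the PATH and a real SAS URL
    Sys.setenv(AZURE_SAS_URL="https://account.blob.core.windows.net/container?sv=PLACEHOLDER")
    upload_to_azure("resources/")   ## the copy is recursive, so a directory works too
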
--------------------------------------------------------------------------------
/R/makeAnnotationHubResource.R:
--------------------------------------------------------------------------------
### =========================================================================
### makeAnnotationHubResource()
### -------------------------------------------------------------------------
###

## Creates a Preparer class and an associated newResources() method.
.generalNewResources <- function(importPreparer, currentMetadata,
                                 makeAnnotationHubMetadataFunction, ...)
{
    ## returns metadata
    ahms <- makeAnnotationHubMetadataFunction(currentMetadata, ...)

    ## add the importPreparer
    lapply(ahms, function(x) {
        x@PreparerClass <- class(importPreparer)[1]
        x
    })
}

makeAnnotationHubResource <- function(objName,
                                      makeAnnotationHubMetadataFunction,
                                      ..., where=topenv(parent.frame()))
{
    ## create class
    setClass(objName,
             contains="ImportPreparer",
             package="AnnotationHubData",
             where=where)

    ## FIXME: This doesn't seem to be the case - ie, no handling of 'old'.
    ## The job of this method is to only get resources that are "new".
    ## It takes an arg of "old" AHMs that can be used for filtering.
    ## So it will call the makeAnnotationHubMetadataFunction, and then
    ## toss out any currentMetadata() AHMs that are already present.

    ## create newResources method
    setMethod(newResources, objName, where=where,
        function(importPreparer, currentMetadata=list(), ...)
    {
        .generalNewResources(importPreparer, currentMetadata,
                             makeAnnotationHubMetadataFunction, ...)
    })
}
--------------------------------------------------------------------------------
/inst/unitTests/cases/encodeDCCMetadata/wgEncodeAwgDnaseUniform.info:
--------------------------------------------------------------------------------
wgEncodeAwgDnaseDuke8988tUniPk.narrowPeak.gz project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=8988T; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH001103; tableName=wgEncodeAwgDnaseDuke8988tUniPk; type=narrowPeak; md5sum=80fadeb7a14a72add38203910d937f50; size=1.7M
wgEncodeAwgDnaseDukeAosmcUniPk.narrowPeak.gz project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=AoSMC; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; tableName=wgEncodeAwgDnaseDukeAosmcUniPk; type=narrowPeak; md5sum=957b3477d43cef1c6abd41182b053418; size=1.5M
wgEncodeAwgDnaseDukeChorionUniPk.narrowPeak.gz project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=Chorion; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH000595; tableName=wgEncodeAwgDnaseDukeChorionUniPk; type=narrowPeak; md5sum=f0ce90b72c1cfaceda456e0dfd10db1e; size=1.6M
wgEncodeAwgDnaseDukeCllUniPk.narrowPeak.gz project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=CLL; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH001104; tableName=wgEncodeAwgDnaseDukeCllUniPk; type=narrowPeak; md5sum=fe463a299af6fbefa38beeba59426767; size=873K
wgEncodeAwgDnaseDukeFibroblUniPk.narrowPeak.gz project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=Fibrobl; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH000583; tableName=wgEncodeAwgDnaseDukeFibroblUniPk; type=narrowPeak; md5sum=4bf374cbbbda675e686c51de627a3d05; size=3.5M
wgEncodeAwgDnaseDukeFibropUniPk.narrowPeak.gz project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=FibroP; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH000605; tableName=wgEncodeAwgDnaseDukeFibropUniPk; type=narrowPeak; md5sum=905497fc0eaa1631b19af0e91599bb89; size=2.3M
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
# Based on https://github.com/krlmlr/r-appveyor
# DO NOT CHANGE the "init" and "install" sections below

# Download script file from GitHub
init:
  ps: |
        $ErrorActionPreference = "Stop"
        Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1"
        Import-Module '..\appveyor-tool.ps1'

install:
  ps: Bootstrap

# Adapt as necessary starting from here

build_script:
  # The following configuration will use the development version of Bioconductor. This
  # is because under the hood, r-appveyor relies on r-travis, and r-travis sets an
  # environment variable, BIOC_USE_DEVEL=${BIOC_USE_DEVEL:-"TRUE"}
  #
  # This is configurable, but devel is the default. Here's the source:
  # https://github.com/craigcitro/r-travis/blob/master/scripts/travis-tool.sh#L11
  #
  # Notice: we'll need to decide on a workflow if we want to provide CI builds of
  # release versions in AppVeyor.
  - git config --global user.name "travis"
  - git config --global user.email "travis@example.org"
  - travis-tool.sh install_bioc_deps
  - travis-tool.sh install_bioc grasp2db

test_script:
  - travis-tool.sh run_tests

on_failure:
  - 7z a failure.zip *.Rcheck\*
  - appveyor PushArtifact failure.zip

environment:
  global:
    WARNINGS_ARE_ERRORS: 1
    _R_CHECK_FORCE_SUGGESTS_: 0
    R_ARCH: x64
    USE_RTOOLS: true ## to be able to use Remotes (i.e. packages from non-CRAN sources)

  matrix:
    - R_VERSION: release

artifacts:
  - path: '*.Rcheck\**\*.log'
    name: Logs

  - path: '*.Rcheck\**\*.out'
    name: Logs

  - path: '*.Rcheck\**\*.fail'
    name: Logs

  - path: '*.Rcheck\**\*.Rout'
    name: Logs

  - path: '\*_*.tar.gz'
    name: Bits

  - path: '\*_*.zip'
    name: Bits

cache:
  - C:\RLibrary
--------------------------------------------------------------------------------
/man/upload_to_S3.Rd:
--------------------------------------------------------------------------------
\name{upload_to_S3}
\alias{upload_to_S3}
\alias{S3}
\alias{amazon}
\alias{AWS}
\title{
Upload a file to Amazon S3
}
\description{
This function is for uploading a file resource to the S3 cloud.
}
\usage{
upload_to_S3(file, remotename, bucket, profile, acl="public-read")
}
%- maybe also 'usage' for other objects documented here.
\arguments{
  \item{file}{
    The file to upload.
  }
  \item{remotename}{
    The name this file should have in S3, including any "keys"
    that are part of the name. This should not start with
    a slash (if it does, the leading slash will be removed),
    but can contain forward slashes.
  }
  \item{bucket}{
    Name of the S3 bucket to copy to.
  }
  \item{profile}{
    Corresponds to a profile set in the config file for the AWS CLI
    (see \href{http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html#cli-multiple-profiles}{the documentation}).
    If this argument is omitted, the default profile is used.
  }
  \item{acl}{
    Should be one of \code{private}, \code{public-read}, or \code{public-read-write}.
  }
}
\details{
  Uses the \href{https://aws.amazon.com/cli/}{AWS Command Line Interface}
  to copy a file to Amazon S3. Assumes the CLI is properly configured
  and that the \code{aws} program is in your PATH. The CLI should be
  configured with the credentials of a user who has permission to
  upload to the appropriate bucket. It's recommended to use
  \href{https://aws.amazon.com/iam/}{IAM} to set up users
  with limited permissions.

  There is an \code{RAmazonS3} package but it seems to have issues
  uploading files to S3.
}
\value{
  \code{TRUE} on success. If the command fails, the function
  will exit with an error.
}
\author{Dan Tenenbaum}

\examples{
\dontrun{
upload_to_S3("myfile.txt", "foo/bar/baz/yourfile.txt", "annotationhub")
# If this is successful, the file should be accessible at
# http://s3.amazonaws.com/annotationhub/foo/bar/baz/yourfile.txt
}
}
--------------------------------------------------------------------------------
/R/makeUCSC2Bit.R:
--------------------------------------------------------------------------------
make2bit <- function(currentMetadata, justRunUnitTest=FALSE,
                     BiocVersion=BiocManager::version()) {
    rsrc <- .getUCSCResources(fileType="2bit", dirName="bigZips",
                              fileName=".2bit", verbose=TRUE, justRunUnitTest)
    ## input_sources table
    sourceSize <- as.numeric(rsrc$size)
    sourceUrls <- rsrc$fileurl
    sourceVersion <- gsub(" ", "_", rsrc$date)
    sourceLastModifiedDate <- rsrc$date
    rdatapaths <- gsub(.ucscBase, "", sourceUrls)
    md5sum <- rsrc$md5sum

    ## resources table
    species <- rsrc$organism
    genome <- rsrc$from
    taxonomyId <- as.integer(rsrc$taxid)
    title <- rownames(rsrc)
    description <- sprintf("UCSC 2 bit file for %s", rsrc$from)

    Map(AnnotationHubMetadata,

        SourceSize = sourceSize,
        SourceUrl = sourceUrls,
        SourceVersion = sourceVersion,
        SourceLastModifiedDate = sourceLastModifiedDate,
        SourceMd5 = md5sum,

        Description = description,
        Title = title,
        Genome = genome,
        Species = species,
        TaxonomyId = taxonomyId,

        RDataPath = rdatapaths,

        MoreArgs=list(
            BiocVersion=BiocVersion,
            # input sources
            SourceType = "TwoBit",

            # resources
            DataProvider = "UCSC",
            Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
            Coordinate_1_based = FALSE,
            Location_Prefix = .ucscBase,
            RDataDateAdded = Sys.time(),

            #rdata table
            DispatchClass = "TwoBitFile",
            RDataClass = "TwoBitFile",

            Recipe = NA_character_,
            Tags = c("2bit", "UCSC", "genome")))
}

makeAnnotationHubResource("UCSC2BitPreparer", make2bit)
--------------------------------------------------------------------------------
/inst/unitTests/test_validityFunctions.R:
--------------------------------------------------------------------------------
txdb <- GenomeInfoDb::loadTaxonomyDb()
txdb <- rbind(txdb, c(NA, NA, ""))

test_getSpeciesList <- function(){
    list <- getSpeciesList()
    checkTrue(length(list) == dim(txdb)[1])
}

test_validSpecies <- function(){

    checkTrue(validSpecies("Homo sapiens", verbose=FALSE))
    checkTrue(!validSpecies("Homo Sapiens", verbose=FALSE))
    checkTrue(validSpecies(NA_character_))
}

test_suggestSpecies <- function(){

    vl1 <- Reduce(`|`, lapply(txdb[2:3], grepl, pattern = "Dictyoglomus",
                              ignore.case=TRUE))
    vl2 <- Reduce(`|`, lapply(txdb[2:3], grepl, pattern = "immobile",
                              ignore.case=TRUE))

    out <- suggestSpecies(c("Dictyoglomus", "immobile"))
    checkTrue((length(which(vl1)) + length(which(vl2))) == dim(out)[1])
}

test_validTaxId <- function(){

    checkTrue(is.null(AnnotationHubData::checkSpeciesTaxId(9606,
                                                           "Homo sapiens")))
    options(warn=2)
    checkException(AnnotationHubData::checkSpeciesTaxId(9999, "Homo sapiens"))
    options(warn=0)
}

test_validDispatchClass <- function(){

    checkTrue(validDispatchClass("GRanges"))
    checkTrue(validDispatchClass(c("GRanges", "Rda")))
    checkTrue(!validDispatchClass("somethingNotThere"))
    checkTrue(!validDispatchClass(c("GRanges", "somethingNotThere")))
}

test_FileLengths <- function(){

    checkTrue(AnnotationHubData:::.checkFileLengths(
        RDataPath = c("package/example1.bam", "package/example2.bai"),
        DispatchClass="BamFile"))
    checkException(AnnotationHubData:::.checkFileLengths(
        RDataPath = c("package/example1.bai", "package/example2.bam"),
        DispatchClass="BamFile"))
    checkException(AnnotationHubData:::.checkFileLengths(
        RDataPath = "package/example1.bam",
        DispatchClass="BamFile"))
    checkTrue(AnnotationHubData:::.checkFileLengths(
        RDataPath = "package/example1.rda",
        DispatchClass="Rda"))

}
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
Package: AnnotationHubData
Type: Package
Title: Transform public data resources into Bioconductor Data Structures
Version: 1.41.0
Encoding: UTF-8
Authors@R: c(
    person("Martin", "Morgan", role="ctb"),
    person("Marc", "Carlson", role="ctb"),
    person("Dan", "Tenenbaum", role="ctb"),
    person("Sonali", "Arora", role="ctb"),
    person("Paul", "Shannon", role="ctb"),
    person("Lori", "Shepherd", role="ctb"),
    person("Bioconductor Package Maintainer",
        email="maintainer@bioconductor.org", role="cre")
    )
Depends: R (>= 3.2.2), methods, utils, S4Vectors (>= 0.7.21),
        IRanges (>= 2.3.23), GenomicRanges, AnnotationHub (>= 2.15.15)
Suggests: RUnit, knitr, BiocStyle, grasp2db, GenomeInfoDbData, rmarkdown, HubPub
Imports: GenomicFeatures, Rsamtools, rtracklayer, BiocGenerics,
        jsonlite, BiocManager, biocViews, BiocCheck, graph,
        AnnotationDbi, Biobase, Biostrings, DBI, Seqinfo,
        GenomeInfoDb (>= 1.45.5), OrganismDbi, RSQLite,
        AnnotationForge, futile.logger (>= 1.3.0), XML, RCurl
Description: These recipes convert a wide variety and a growing number of
    public bioinformatic data sets into easily-used standard Bioconductor data
    structures.
License: Artistic-2.0
LazyLoad: yes
biocViews: DataImport
VignetteBuilder: knitr
Collate:
    Message-class.R
    ImportPreparer-class.R
    makeAnnotationHubResource.R
    HubMetadata-class.R
    AnnotationHubMetadata-class.R
    utils.R
    updateResources.R
    ahmToJson.R
    webAccessFunctions.R
    makeChEA.R
    makedbSNPVCF.R
    makeEncodeDCC.R
    makeEnsemblGtfToGRanges.R
    makeEnsemblFasta.R
    makeEpigenomeRoadmap.R
    makeGencodeFasta.R
    makeGencodeGFF.R
    makeGrasp2Db.R
    makeHaemCode.R
    makeInparanoid8ToDbs.R
    makeNCBIToOrgDbs.R
    makeStandardOrgDbsToSqlite.R
    makeStandardTxDbsToSqlite.R
    makeRefNet.R
    makeUCSCChain.R
    makeUCSC2Bit.R
    makeUCSCTracks.R
    trackWithAuxiliaryTableToGRangesRecipe.R
    UCSCTrackUpdateChecker.R
    makeEnsemblTwoBit.R
    validationFunctions.R
--------------------------------------------------------------------------------
/inst/scripts/addContributedResources.txt:
--------------------------------------------------------------------------------
## Contributed Annotations:

This doc describes how to add contributed (i.e., non-core generated)
resources to AnnotationHub. In general, these instructions pertain
to core team members only.

* Case 1: Single resources with no accompanying software package

  - Metadata

    Author follows these instructions to create a .R file that generates
    metadata for the resource(s):

    http://www.bioconductor.org/packages/3.5/bioc/vignettes/AnnotationHubData/inst/doc/IntroductionToAnnotationHubData.html#individual-resources

  - Test metadata

    Test the .R file provided by the author with
    AnnotationHubData::AnnotationHubMetadata(). Confirm the metadata fields
    are valid (reasonable title, version) and the paths are accurate.

  - Add metadata

    Add the metadata to the production database with the AnnotationHub docker.


* Case 2: Family of resources with accompanying software package

  - Software package

    Author creates a software package according to guidelines here:

    http://www.bioconductor.org/packages/3.5/bioc/vignettes/AnnotationHubData/inst/doc/IntroductionToAnnotationHubData.html#family-of-resources

  - Test metadata

    Check the metadata with AnnotationHubData::makeAnnotationHubMetadata().
    There can be more than one metadata.csv file, e.g., ensembl_version86.csv,
    ensembl_version87.csv etc. The package should have a record of all
    metadata added over time. For example, when version 88 files are added
    they should not remove the csv files for versions 86 and 87.

  - Add resources

    The resources can be 'stored' on a web site or in an S3 bucket. If they
    will be in S3, follow these steps:

    -- Create a new S3 bucket under annotationhub/ with the same name as the
       software package.
    -- Either the core team member adds the resources to the S3 bucket or
       the contributor adds them as the AnnotationContributor user. See the
       AnnotationHubData vignette for more details on the AnnotationContributor
       user.

    Once the resources are in the proper place, confirm they are public and
    can be downloaded by anyone.

  - Add metadata

    Add the metadata to the production database with the AnnotationHub docker.
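
* Quick check (both cases)

  A minimal interactive sketch of the "Test metadata" step above; the
  package path is hypothetical:

    library(AnnotationHubData)
    ahms <- makeAnnotationHubMetadata("/path/to/MyContributedPkg")
    ## inspect titles, versions, species and paths before inserting
    lapply(ahms, metadata)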

--------------------------------------------------------------------------------
/R/makeChEA.R:
--------------------------------------------------------------------------------
### =========================================================================
### makeChEA()
### -------------------------------------------------------------------------
###

### Recipe for ChEA transcription factor background file.

makeChEAToAHM <- function(currentMetadata,
                          baseUrl="http://amp.pharm.mssm.edu/",
                          justRunUnitTest=FALSE,
                          BiocVersion=BiocManager::version())
{
    files <- "result/kea/chea-background.zip"

    files <- paste0(baseUrl, files)
    rsrc <- .httrFileInfo(files, verbose=FALSE)
    title <- basename(rsrc$fileurl)

    ## input_sources table
    sourceSize <- as.numeric(rsrc$size)
    sourceUrls <- rsrc$fileurl
    sourceVersion <- gsub(" ", "_", rsrc$date)
    sourceLastModifiedDate <- rsrc$date

    ## resources table
    description <- .expandLine("ChEA background file, containing
        transcription factor data to run ChEA")

    Map(AnnotationHubMetadata,
        SourceSize = sourceSize,
        SourceUrl = sourceUrls,
        SourceVersion = sourceVersion,
        SourceLastModifiedDate = sourceLastModifiedDate,

        Description = description,
        Title = title,

        RDataPath = gsub(baseUrl, "", sourceUrls),

        MoreArgs=list(
            BiocVersion=BiocVersion,
            # input sources
            SourceType = "Zip",

            # resources
            Species = NA_character_,
            TaxonomyId = NA_integer_,
            Genome = NA_character_,
            DataProvider = "ChEA",
            Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
            Coordinate_1_based = FALSE,
            Location_Prefix = baseUrl,
            RDataDateAdded = Sys.time(),

            #rdata table
            DispatchClass = "ChEA",
            RDataClass = "data.frame",

            Tags = c("ChEA", "Transcription Factors"),

            Recipe = NA_character_))
}

makeAnnotationHubResource("ChEAImportPreparer", makeChEAToAHM)
--------------------------------------------------------------------------------
/R/makeEnsemblGtfToGRanges.R:
--------------------------------------------------------------------------------
## As of July 2016 this recipe was modified to store only metadata
## and no files in S3. AnnotationHub will expose available GTF files
## from Ensembl and the AnnotationHub::GTFFile dispatch class will
## convert the GTF to GRanges on the fly.

.ensemblGtfSourceUrls <-
    function(baseDir, baseUrl, release, justRunUnitTest, verbose=FALSE)
{
    want <- paste0(baseUrl, "release-", release, paste0("/", baseDir))
    urls <- unlist(lapply(want, function(url) {
        listing <- .ftpDirectoryInfo(url)
        subdir <- sub(".* ", "", listing[grep("^drwx", listing)])
        paste0(url, subdir, "/")
    }), use.names=FALSE)

    if (justRunUnitTest)
        urls <- urls[1:2] ## 2 organisms; possibly more files

    df <- .ftpFileInfo(urls, ".gtf.gz", verbose=verbose)
    rownames(df) <- NULL
    df
}

makeEnsemblGtfToAHM <-
    function(currentMetadata, baseUrl = "ftp://ftp.ensembl.org/pub/",
             baseDir = "gtf/", release, justRunUnitTest = FALSE,
             BiocVersion = BiocManager::version(), ...)
{
    ## get all file urls, size, date
    df <- .ensemblGtfSourceUrls(baseDir, baseUrl, release,
                                justRunUnitTest, ...)
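    ## 'df' is a data.frame with one row per *.gtf.gz file and the columns
    ## "fileurl", "date" and "size" (the shape .ftpFileInfo() returns, as
    ## checked in inst/unitTests/test_webAccessFunctions.R)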

    sourceUrls <- df$fileurl
    rdatapath <- gsub(baseUrl, "", sourceUrls)

    ## get genome, species, version, title
    meta <- .ensemblMetadataFromUrl(sourceUrls)
    description <- paste("Gene Annotation for", meta$species)

    Map(AnnotationHubMetadata,
        Description=description, Genome=meta$genome,
        SourceUrl=sourceUrls,
        SourceSize=as.numeric(df$size),
        SourceLastModifiedDate=df$date,
        SourceVersion=meta$sourceVersion,
        Species=meta$species,
        RDataPath=rdatapath,
        TaxonomyId=meta$taxonomyId, Title=meta$title,
        MoreArgs=list(
            BiocVersion=BiocVersion,
            Coordinate_1_based=TRUE,
            DataProvider="Ensembl",
            Maintainer="Bioconductor Maintainer <maintainer@bioconductor.org>",
            RDataClass="GRanges",
            DispatchClass="GTFFile",
            SourceType="GTF",
            Location_Prefix=baseUrl,
            RDataDateAdded=Sys.time(),
            Recipe=NA_character_,
            Tags=c("GTF", "ensembl", "Gene", "Transcript", "Annotation")))
}

makeAnnotationHubResource("EnsemblGtfImportPreparer", makeEnsemblGtfToAHM)
--------------------------------------------------------------------------------
/man/ImportPreparer-class.Rd:
--------------------------------------------------------------------------------
\name{ImportPreparer-class}
\docType{class}
\alias{newResources}
\alias{getImportPreparerClasses}

\alias{ImportPreparer-class}
\alias{show,ImportPreparer-method}
\alias{newResources,ImportPreparer-method}

\alias{UCSCTrackImportPreparer-class}
\alias{UCSCTrackImportPreparer}
\alias{newResources,UCSCTrackImportPreparer-method}
\alias{newResources,UCSCFullTrackImportPreparer-method}

\alias{HaemCodeImportPreparer-class}
\alias{HaemCodeImportPreparer}
\alias{newResources,HaemCodeImportPreparer-method}

\alias{EncodeImportPreparer-class}
\alias{EncodeImportPreparer}
\alias{newResources,EncodeImportPreparer-method}

\alias{annotationHubRoot}
\alias{metadataList}
\alias{metadataTable}
\alias{sourceUrls}

\alias{EnsemblFastaImportPreparer-class}
\alias{EnsemblFastaImportPreparer}
\alias{newResources,EnsemblFastaImportPreparer-method}

\alias{EnsemblGtfImportPreparer-class}
\alias{EnsemblGtfImportPreparer}
\alias{newResources,EnsemblGtfImportPreparer-method}

\alias{RefNetImportPreparer-class}
\alias{RefNetImportPreparer}
\alias{newResources,RefNetImportPreparer-method}

\alias{dbSNPVCFImportPreparer-class}
\alias{dbSNPVCFImportPreparer}
\alias{newResources,dbSNPVCFImportPreparer-method}

\alias{Grasp2ImportPreparer-class}
\alias{Grasp2ImportPreparer}
\alias{newResources,Grasp2ImportPreparer-method}

\alias{Inparanoid8ImportPreparer-class}
\alias{Inparanoid8ImportPreparer}
\alias{newResources,Inparanoid8ImportPreparer-method}

\alias{NCBIImportPreparer-class}
\alias{NCBIImportPreparer}
\alias{newResources,NCBIImportPreparer-method}

\alias{UCSCChainPreparer-class}
\alias{UCSCChainPreparer}
\alias{newResources,UCSCChainPreparer-method}


\title{Class \code{ImportPreparer} and generic \code{newResources}}

\description{

  The \code{ImportPreparer} and derived classes are used for dispatch
  during data discovery (see \code{\link{newResources}}). There is one
  \code{ImportPreparer} class for each data source of
  \code{\link{AnnotationHubMetadata}}.

  \code{newResources} is a generic function, with methods implemented
  for each \code{ImportPreparer}.

}

\author{Martin Morgan}

\seealso{
  \code{\linkS4class{AnnotationHubMetadata}}.
}

\examples{
getImportPreparerClasses()
}

\keyword{classes}



--------------------------------------------------------------------------------
/R/makeGencodeFasta.R:
--------------------------------------------------------------------------------
### =========================================================================
### makeGencodeFastaToAHM() and gencodeFastaToFaFile()
### -------------------------------------------------------------------------
###

### Recipe for human and mouse fasta files.
### http://www.gencodegenes.org/releases/
### ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human
### the above has been updated to
### ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human
### Files downloaded are listed in AnnotationHubData:::.gencodeDescription().

### metadata generator
makeGencodeFastaToAHM <- function(currentMetadata,
                                  baseUrl="ftp://ftp.ebi.ac.uk/pub/databases/gencode/",
                                  species=c("Human", "Mouse"), release,
                                  justRunUnitTest=FALSE,
                                  BiocVersion=BiocManager::version())
{
    species <- match.arg(species)
    rsrc <- .gencodeSourceUrls(species, release, filetype="fasta",
                               justRunUnitTest)

    rdatapath <- rsrc$rdatapath
    rdps <- rep(rdatapath, each=3)
    rdatapaths <- split(rdps, f=as.factor(rep(seq_along(rdatapath), each=3)))
    rdatapath <- lapply(rdatapaths,
        function(x) {
            x[1] <- sub("gz", "bgz", x[1])
            x[2] <- paste0(x[1], ".fai")
            x[3] <- paste0(x[1], ".gzi")
            x
        })

    description <- rsrc$description
    title <- basename(rsrc$fileurl)
    genome <- rsrc$genome
    sourceUrls <- rsrc$fileurl
    sourceVersion <- as.character(rsrc$date) ## should be character
    SourceLastModifiedDate <- rsrc$date ## should be "POSIXct" "POSIXt"
    SourceSize <- as.numeric(rsrc$size)
    tags <- strsplit(rsrc$tag, ",")
    species <- rsrc$species
    taxid <- rsrc$taxid

    Map(AnnotationHubMetadata,
        Description=description,
        Genome=genome,
        SourceUrl=sourceUrls,
        SourceSize=SourceSize,
        SourceLastModifiedDate=SourceLastModifiedDate,
        SourceVersion=sourceVersion,
        Species=species,
        RDataPath=rdatapath,
        TaxonomyId=taxid,
        Title=title,
        Tags=tags,
        MoreArgs=list(
            Coordinate_1_based = TRUE,
            DataProvider = "Gencode",
            Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
            RDataClass = c("FaFile", "FaFile", "FaFile"),
            DispatchClass="FaFile",
            SourceType="FASTA",
            Location_Prefix="https://bioconductorhubs.blob.core.windows.net/annotationhub/",
            RDataDateAdded = Sys.time(),
            Recipe="AnnotationHubData:::gencodeFastaToFaFile"))
}

gencodeFastaToFaFile <- function(ahm)
{
    .fastaToFaFile(ahm)
}

## create dispatch class and newResources() method
makeAnnotationHubResource("GencodeFastaImportPreparer", makeGencodeFastaToAHM)
--------------------------------------------------------------------------------
/inst/unitTests/cases/encodeDCCMetadata/wgEncodeAffyRnaChip.info:
--------------------------------------------------------------------------------
wgEncodeAffyRnaChipFiltTransfragsGm12878CellTotal.broadPeak.gz project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=cell; rnaExtract=total; origAssembly=hg18; dataVersion=ENCODE Feb 2009 Freeze; dccAccession=wgEncodeEH000016; dateSubmitted=2009-03-10; dateUnrestricted=2009-12-10; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878CellTotal; type=broadPeak; md5sum=c4049a3cfbb6b3e74cb776ee4e4309f9; size=13M
wgEncodeAffyRnaChipFiltTransfragsGm12878CytosolLongnonpolya.broadPeak.gz project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=cytosol; rnaExtract=longNonPolyA; origAssembly=hg18; dataVersion=ENCODE Nov 2008 Freeze; dccAccession=wgEncodeEH000012; dateSubmitted=2008-12-09; dateUnrestricted=2009-09-09; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878CytosolLongnonpolya; type=broadPeak; md5sum=08d5feb7211a40c99a5ce374c6d0a169; size=12M
wgEncodeAffyRnaChipFiltTransfragsGm12878CytosolLongpolya.broadPeak.gz project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=cytosol; rnaExtract=longPolyA; origAssembly=hg18; dataVersion=ENCODE Nov 2008 Freeze; dccAccession=wgEncodeEH000002; dateSubmitted=2008-11-21; dateUnrestricted=2009-08-21; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878CytosolLongpolya; type=broadPeak; md5sum=9cb30fe0ff4c6c6d9bf9add1957a77f5; size=13M
wgEncodeAffyRnaChipFiltTransfragsGm12878NucleolusTotal.broadPeak.gz project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=nucleolus; rnaExtract=total; origAssembly=hg18; dataVersion=ENCODE Sep 2009 Freeze; dccAccession=wgEncodeEH000026; dateSubmitted=2009-08-27; dateUnrestricted=2010-05-27; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878NucleolusTotal; type=broadPeak; md5sum=86fa269224f528b52bf15faa387af12d; size=14M
wgEncodeAffyRnaChipFiltTransfragsGm12878NucleusLongnonpolya.broadPeak.gz project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=nucleus; rnaExtract=longNonPolyA; origAssembly=hg18; dataVersion=ENCODE Nov 2008 Freeze; dccAccession=wgEncodeEH000003; dateSubmitted=2008-11-21; dateUnrestricted=2009-08-21; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878NucleusLongnonpolya; type=broadPeak; md5sum=c8d8604e208bcec85075a97e2d855c71; size=17M
wgEncodeAffyRnaChipFiltTransfragsGm12878NucleusLongpolya.broadPeak.gz project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=nucleus; rnaExtract=longPolyA; origAssembly=hg18; dataVersion=ENCODE Feb 2009 Freeze; dccAccession=wgEncodeEH000017; dateSubmitted=2009-04-01; dateUnrestricted=2010-01-01; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878NucleusLongpolya; type=broadPeak; md5sum=e59db684bc92f9e33587c70d4e93de39; size=17M
--------------------------------------------------------------------------------
/R/makeRefNet.R:
--------------------------------------------------------------------------------
## old file - RefNetImportPreparer-class.R
## changes - title should have full file name, species is "Homo sapiens" not 9606
## what is RefNet Genome?
## tags look like "interactions, interactions from gerstein-2012"

.amazonBaseUrl <- "https://bioconductorhubs.blob.core.windows.net/annotationhub/"

.getRefNetFileURIs <- function() {
    # everything is embedded in the second line of xml
    .refNetbase.url <- paste0(.amazonBaseUrl, "refnet/")
    filenames <- c("gerstein-2012.tsv_0.0.1.RData",
                   "hypoxiaSignaling-2006.tsv_0.0.1.RData",
                   "stamlabTFs-2012.tsv_0.0.1.RData",
                   "recon202.tsv_0.0.1.RData")
    paste0(.refNetbase.url, filenames)
}


.refnetFiles <- function() {
    files <- .getRefNetFileURIs()
    df <- .httrFileInfo(files, verbose=FALSE)
    title <- basename(files)

    filename.stem <- sub(".tsv_0.0.1.RData", "", title)
    description <- sprintf("Interactions from %s", filename.stem)
    cbind(df, title, description, stringsAsFactors=FALSE)
}


makeRefNetImporter <- function(currentMetadata, justRunUnitTest=FALSE,
                               BiocVersion=BiocManager::version()) {
    rsrc <- .refnetFiles()

    ## input_sources table
    sourceSize <- as.numeric(rsrc$size)
    sourceUrls <- gsub("_0.0.1.RData", "", rsrc$fileurl)
    sourceVersion <- gsub(" ", "_", rsrc$date)
    sourceLastModifiedDate <- rsrc$date

    ## resources table

    title <- rsrc$title
    description <- rsrc$description

    Tags <- lapply(rsrc$description, function(x) {
        c("refNet", "interactions", x)
    })

    Map(AnnotationHubMetadata,

        SourceSize = sourceSize,
        SourceUrl = sourceUrls,
        SourceVersion = sourceVersion,
        SourceLastModifiedDate = sourceLastModifiedDate,

        Description = description,
        Title = title,

        RDataPath = gsub(.amazonBaseUrl, "", sourceUrls),

        MoreArgs=list(
            BiocVersion=BiocVersion,
            # input sources
            SourceType = "RData",

            # resources
            Species = "Homo sapiens",
            TaxonomyId = 9606L,
            Genome = "RefNet Genome",
            DataProvider = "RefNet",
            Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
            Coordinate_1_based = FALSE,
            Location_Prefix = .amazonBaseUrl,
            RDataDateAdded = Sys.time(),


            #rdata table
            DispatchClass = "data.frame",
            RDataClass = "data.frame",

            Tags = c("refnet", "interactions"),
            Recipe = NA_character_))
}

makeAnnotationHubResource("RefNetImportPreparer", makeRefNetImporter)
--------------------------------------------------------------------------------
/man/makeGencodeFasta.Rd:
--------------------------------------------------------------------------------
\name{makeGencodeFasta}

\alias{makeGencodeFasta}
\alias{makeGencodeFastaToAHM}
\alias{gencodeFastaToFaFile}

\title{
Recipe to add Gencode FASTA resources to AnnotationHub
}

\description{
Create metadata and process raw Gencode FASTA files for inclusion in
AnnotationHub
}

\usage{
makeGencodeFastaToAHM(currentMetadata,
                      baseUrl="ftp://ftp.ebi.ac.uk/pub/databases/gencode/",
                      species=c("Human", "Mouse"), release,
                      justRunUnitTest=FALSE,
                      BiocVersion=BiocManager::version())

gencodeFastaToFaFile(ahm)
}

\details{
  \describe{
    \item{Documentation:}{
      http://www.gencodegenes.org/releases/
    }
    \item{File download location:}{
      ftp://ftp.ebi.ac.uk/pub/databases/gencode/. Gencode_human and
      Gencode_mouse are used.
    }
    \item{Files downloaded:}{
      Code is currently specific for human and mouse. Files chosen for
      download are described in AnnotationHubData:::.gencodeDescription().
    }
  }
}

\arguments{
  \item{currentMetadata}{
    Currently not used. Intended to be a list of metadata to filter, i.e.,
    records that do not need to be processed again. Need to remove or fix.
  }
  \item{baseUrl}{
    ftp file location.
  }
  \item{species}{
    A \code{character(1)} of the species. Currently "Human" and "Mouse"
    are supported.
  }
  \item{release}{
    A \code{character} string of the release number.
  }
  \item{justRunUnitTest}{
    A \code{logical}. When TRUE, a small number of records (usually 5) are
    processed instead of all.
  }
  \item{BiocVersion}{
    A \code{character} vector of Bioconductor versions the resources should be
    available for.
  }
  \item{ahm}{
    List of \code{AnnotationHubMetadata} instances.
  }
}


\value{
  \code{makeGencodeFastaToAHM} returns a list of \code{AnnotationHubMetadata}
  instances. \code{gencodeFastaToFaFile} returns nothing.
}

\seealso{
  \itemize{
    \item \link{updateResources}
    \item \link{AnnotationHubMetadata}
  }
}

\author{
  Bioconductor Core Team.
}

\examples{

## updateResources() generates metadata, processes records and
## pushes files to AWS S3 buckets.

## To run the GencodeFasta recipe specify
## 'preparerClasses = GencodeFastaImportPreparer'. The 'species' and 'release'
## arguments are passed to makeGencodeFastaToAHM().
\dontrun{
meta <- updateResources("/local/path",
                        BiocVersion = c("3.2", "3.3"),
                        preparerClasses = "GencodeFastaImportPreparer",
                        metadataOnly = TRUE, insert = FALSE,
                        justRunUnitTest = FALSE)

}
}

\keyword{methods}
--------------------------------------------------------------------------------
/inst/unitTests/test_webAccessFunctions.R:
--------------------------------------------------------------------------------
initialTimeout <- getOption("timeout")
setup <- function()
    options(timeout=5*60)
tearDown <- function()
    options(timeout=initialTimeout)

.httrRead <- AnnotationHubData:::.httrRead
.ftpDirectoryInfo <- AnnotationHubData:::.ftpDirectoryInfo
.ftpFileInfo <- AnnotationHubData:::.ftpFileInfo
.listRemoteFiles <- AnnotationHubData:::.listRemoteFiles
.getGenomeAbbrevs <- AnnotationHubData:::.getGenomeAbbrevs

test_httrRead <- function() {
    setup()
    hg19Url <- "http://hgdownload.cse.ucsc.edu/goldenpath/hg19/encodeDCC/"
    url <- paste0(hg19Url, "wgEncodeCshlLongRnaSeq/")
    ans <- .httrRead(url, "//pre/a/text()")
    checkTrue(is(ans, "data.frame"))
    checkTrue(names(ans) == "files")
    checkTrue(nrow(ans) > 0)
    tearDown()
}

test_ftpDirectoryInfo <- function(){
    setup()
    url <- "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/"
    ans <- .ftpDirectoryInfo(url)
    checkTrue(is(ans, "character"))
    checkTrue(is.null(names(ans)))
    checkTrue(length(ans) > 0L)
    tearDown()
}

test_ftpFileInfo <- function(){
    setup()
    url <- "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/"
    ans <- .ftpFileInfo(url, "chr.gtf.gz")
    checkTrue(is(ans, "data.frame"))
    checkIdentical(names(ans), c("fileurl", "date", "size"))
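    ## the release-98 homo_sapiens directory has exactly one file matching
    ## "chr.gtf.gz", so a single row is expected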
    checkTrue(nrow(ans) == 1L)

    ans <- .ftpFileInfo(url, ".gz")
    checkTrue(nrow(ans) > 0L)

    ans <- .ftpFileInfo(url, "FILE_THAT_DOESNT_EXIST.gz")
    checkTrue(is(ans, "data.frame"))
    checkIdentical(names(ans), c("fileurl", "date", "size"))
    checkTrue(nrow(ans) == 0L)
    tearDown()
}

test_listRemoteFiles <- function(){
    setup()
    url <- "ftp://ftp.ensembl.org/pub/"
    ans <- .listRemoteFiles(url)
    checkTrue(is(ans, "character"))
    checkTrue(is.null(names(ans)))
    checkTrue(length(ans) > 0L)
    tearDown()
}

## FIXME: revisit this when working on UCSCTrackUpdateChecker.R
## test_getGenomeAbbrevs <- function(){
##     smallSample <- c("hg19", "hg18", "hg17")
##     actualResult <- .getGenomeAbbrevs(smallSample)
##     expectedResult <- sort(smallSample)
##     checkEquals(actualResult, expectedResult)

##     # Viewing the FTP server content at ftp://hgdownload.cse.ucsc.edu/goldenPath/ , you'll
##     # notice that some files are actually symlinks to other directories. Since the
##     # getGenomeAbbrevs function claims to handle symlinks, we must test that specific case.
##     # The following are files that actually render the client a symlink. :
##     ### cb1 -> cbJul2002
##     ### ce1 -> ceMay2003
##     ### hg15 -> 10april2003/"
##     ### rn2 -> rnJan2003
##     # You should notice, however that results are returned in a sorted order.
##     sampleWithSymlink <- c("hg15", "cb1", "rn2", "ce1")
##     resultWithSymlink <- AnnotationHubData:::.getGenomeAbbrevs(sampleWithSymlink)
##     expectedResult <- sort(sampleWithSymlink)
##     checkEquals(resultWithSymlink, expectedResult)
## }
--------------------------------------------------------------------------------
/inst/unitTests/test_AnnotationHubConstructor.R:
--------------------------------------------------------------------------------

ahroot <- "/var/FastRWeb/web"
.AnnotationHubMetadata_args <- local({
    basepath <- paste0("goldenpath/hg19/encodeDCC/wgEncodeRikenCage/",
                       "wgEncodeRikenCageCd20CellPapTssHmm.bedRnaElements")

    list(AnnotationHubRoot=ahroot,
         SourceUrl=sprintf("http://hgdownload.cse.ucsc.edu/%s", basepath),
         SourceVersion=NA_character_,
         SourceLastModifiedDate=as.POSIXct("2015-01-01", tz="GMT"),
         SourceSize=as.numeric(99999),
         SourceMd5="2",
         SourceType="BED",
         Title="CD20 CAGE defined Transcriptional Start Sites",
         Description="120785 TSS sites ...",
         Species="Homo sapiens",
         TaxonomyId=9606L,
         Genome="hg19",
         Recipe="extendedBedToGranges",
         Tags=c("gene regulation", "ranged genomic data"),
         RDataClass="GRanges",
         Coordinate_1_based=TRUE,
         Maintainer="Paul Shannon ",
         DataProvider="hgdownload.cse.ucsc.edu",
         Notes="9 total columns...",
         RDataDateAdded=as.POSIXct("2013-01-01", tz="GMT"),
         DispatchClass="GRanges",
         PreparerClass="EncodeImportPreparer")
})

.AnnotationHubMetadata <-
    do.call("AnnotationHubMetadata", .AnnotationHubMetadata_args)

test_constructor <- function()
{
    ## construction from complete args
    args <- .AnnotationHubMetadata_args
    ahm <- do.call("AnnotationHubMetadata", args)
    checkTrue(validObject(ahm))
    ## ... correctly inserted into slots
    values <- metadata(ahm)
    test <- unlist(Map(identical, args, values[names(args)]))
    checkTrue(all(test))

    ## date / version coercion
    idx <- grep("(Version)", names(args))
    args[idx] <- sapply(args[idx], as.character)

    ahm1 <- do.call("AnnotationHubMetadata", args)
    checkIdentical(ahm, ahm1)
}

test_isComplete <- function()
{
    .isComplete <- AnnotationHubData:::.isComplete
    valid <- .AnnotationHubMetadata
    checkTrue(.isComplete(valid))

    ## zero-length 'required' field
    invalid <- valid
    metadata(invalid)$Title <- character()
    checkException(.isComplete(invalid), silent=TRUE)

    ## invalid email address
    invalid <- valid
    metadata(invalid)$Maintainer <- "User "
    checkException(.isComplete(invalid), silent=TRUE)

    ## species not in database
    invalid <- valid
    metadata(invalid)$Species <- "Unknown"
    checkException(.isComplete(invalid), silent=TRUE)
}

test_multi_input <- function()
{
    args <- .AnnotationHubMetadata_args
    rp <- "goldenpath/hg19/encodeDCC/wgEncodeRegDnaseClustered"
    files <- c("wgEncodeRegDnaseClustered.bed.gz",
               "wgEncodeRegDnaseClusteredInputs.tab")
    args$SourceUrl <-
        sprintf("http://hgdownload.cse.ucsc.edu/%s/%s", rp, files)
    args$SourceMd5 <- c("2", "2")
    args$SourceSize <- c(as.numeric(99999), as.numeric(99999))

    x <- do.call("AnnotationHubMetadata", args)
    checkEquals(2L, length(metadata(x)$SourceUrl))
    checkEquals(2L, length(metadata(x)$SourceMd5))
    checkEquals(2L, length(metadata(x)$SourceSize))
}
--------------------------------------------------------------------------------
/R/validationFunctions.R:
--------------------------------------------------------------------------------
getSpeciesList <- function(verbose=FALSE){
    if (!requireNamespace("GenomeInfoDbData", quietly = TRUE))
        stop("Requires GenomeInfoDbData. Please run:\n",
             "  BiocManager::install('GenomeInfoDbData')")
    if (verbose) message("Loading valid species information.")
    txdb <- GenomeInfoDb::loadTaxonomyDb()
    txdb <- rbind(txdb, c(NA, NA, ""))
    species <- trimws(paste(txdb$genus, txdb$species))
    species
}

validSpecies <- function(species, verbose=TRUE){
    speciesList <- getSpeciesList(verbose=verbose)
    res <- species %in% speciesList
    if (any(is.na(species)))
        res[is.na(species)] = TRUE
    if (any(!res) & verbose){
        message("Found invalid species.\n")
        print(species[!res])
        message("\nFor complete list of acceptable species run\n",
                "  'getSpeciesList()'\n",
                "For suggestions try\n",
                "  'suggestSpecies()'\n")
    }
    all(res)
}

suggestSpecies <- function(query, verbose=FALSE, op=c("|", "&")){
    op = match.arg(op)
    if (!requireNamespace("GenomeInfoDbData", quietly = TRUE))
Please run:\n", 32 | "  BiocManager::install('GenomeInfoDbData')") 33 | if (verbose) message("Loading valid species information.") 34 | txdb <- GenomeInfoDb::loadTaxonomyDb() 35 | txdb <- rbind(txdb, c(NA, NA, "")) 36 | sd <- txdb 37 | combo <- trimws(paste(txdb$genus, txdb$species)) 38 | sd$combo = combo 39 | if( op == "|"){ 40 | keep <- FALSE 41 | for (q in query) 42 | keep <- keep | Reduce(`|`, lapply(sd[2:4], grepl, pattern = q, 43 | ignore.case=TRUE)) 44 | }else { 45 | keep <- TRUE 46 | for (q in query) 47 | keep <- keep & Reduce(`|`, lapply(sd[2:4], grepl, pattern = q, 48 | ignore.case=TRUE)) 49 | } 50 | data.frame(taxonomyId = sd$tax_id[keep], species=sd$combo[keep]) 51 | } 52 | 53 | getValidSourceTypes <- function(){ 54 | 55 | # alphabetical 56 | expectedSourceTypes <- c("BAI", "BAM", "BED", "BigWig", "BioPax", 57 | "BioPaxLevel2", "BioPaxLevel3", "BLOB", "CEL", 58 | "CDF", "Chain", "CSV", 59 | "ensembl", "FASTA", "FASTQ", "FCS", "GFF", "GRASP", 60 | "GSEMatrix", "GTF", "HDF5", "HIC", "IDAT", "Inparanoid", 61 | "JSON", "MTX", "mtx.gz", "MySQL", "mzid", "mzML", "mzTab", 62 | "mzXML", "Multiple", "NCBI/blast2GO", "NCBI/ensembl", 63 | "NCBI/UniProt", "PDB", "PNG", "RDA", "RData", "RDS", "Simulated", "tab", 64 | "tar.gz", "TIFF", "TSV", "TwoBit", "TXT", "UCSC track", 65 | "VCF", "XLS/XLSX", "XML", "Zip") 66 | 67 | expectedSourceTypes 68 | 69 | } 70 | 71 | validDispatchClass <- function(dc, verbose=TRUE){ 72 | 73 | mat <- AnnotationHub::DispatchClassList() 74 | res <- dc %in% as.character(mat[,1]) 75 | if (any(!res) & verbose){ 76 | message("Found invalid DispatchClass.\n") 77 | print(dc[!res]) 78 | message("\nFor currently available DispatchClass run\n", 79 | "  'AnnotationHub::DispatchClassList()'\n") 80 | } 81 | all(res) 82 | } 83 | -------------------------------------------------------------------------------- /man/makeEnsemblFasta.Rd: -------------------------------------------------------------------------------- 1 | \name{makeEnsemblFasta} 2 | 3 | \alias{makeEnsemblFasta} 4 | \alias{makeEnsemblFastaToAHM} 5 | \alias{makeEnsemblTwoBitToAHM} 6 | \alias{ensemblFastaToFaFile} 7 | \alias{ensemblFastaToTwoBitFile} 8 | 9 | \title{ 10 | Functions to convert Ensembl FASTA files to FaFile and TwoBitFile for 11 | inclusion in AnnotationHub. 12 | } 13 | 14 | \description{ 15 | Transform an Ensembl FASTA file to a Bioconductor FaFile or TwoBitFile. 16 | } 17 | 18 | \usage{ 19 | makeEnsemblFastaToAHM(currentMetadata, baseUrl = "ftp://ftp.ensembl.org/pub/", 20 | baseDir = "fasta/", release, 21 | justRunUnitTest = FALSE, 22 | BiocVersion = BiocManager::version()) 23 | 24 | makeEnsemblTwoBitToAHM(currentMetadata, baseUrl = "ftp://ftp.ensembl.org/pub/", 25 | baseDir = "fasta/", release, 26 | justRunUnitTest = FALSE, 27 | BiocVersion = BiocManager::version()) 28 | 29 | ensemblFastaToFaFile(ahm) 30 | 31 | ensemblFastaToTwoBitFile(ahm) 32 | } 33 | 34 | \arguments{ 35 | \item{currentMetadata}{ 36 | Currently not used. Intended to be a list of metadata to filter, i.e., 37 | records that do not need to be processed again. Need to remove or fix. 38 | } 39 | \item{baseUrl}{ 40 | ftp file location. 41 | } 42 | \item{baseDir}{ 43 | ftp file directory. 44 | } 45 | \item{release}{ 46 | Integer (or character) Ensembl release number, e.g., "84". 47 | } 48 | \item{justRunUnitTest}{ 49 | A \code{logical}. When TRUE, a small number of records (usually 5) are 50 | processed instead of all. 51 | } 52 | \item{BiocVersion}{ 53 | A \code{character(1)} Bioconductor version. The resource will be available 54 | in Bioconductor versions >= this version. 
Default value is the current version, 55 | specified with BiocManager::version(). 56 | } 57 | \item{ahm}{ 58 | List of \code{AnnotationHubMetadata} instances. 59 | } 60 | } 61 | 62 | \details{ 63 | \code{makeEnsemblFastaToAHM} and \code{makeEnsemblTwoBitToAHM} process 64 | metadata into a list of \code{AnnotationHubMetadata} objects. 65 | 66 | \code{ensemblFastaToFaFile} unzips a .gz file, creates an index, and 67 | writes out .rz and .rz.fai files to disk. 68 | \code{ensemblFastaToTwoBitFile} converts a FASTA file to TwoBit format and 69 | writes the .2bit file out to disk. 70 | } 71 | 72 | \value{ 73 | \code{makeEnsemblFastaToAHM} and \code{makeEnsemblTwoBitToAHM} return 74 | a list of \code{AnnotationHubMetadata} objects. 75 | 76 | \code{ensemblFastaToFaFile} writes out .rz and .rz.fai files to disk. 77 | \code{ensemblFastaToTwoBitFile} writes out a .2bit file to disk. 78 | } 79 | 80 | \author{Bioconductor Core Team} 81 | 82 | \seealso{ 83 | \itemize{ 84 | \item \link{updateResources} 85 | \item \link{AnnotationHubMetadata} 86 | } 87 | } 88 | 89 | \examples{ 90 | ## updateResources() generates metadata, processes records, and 91 | ## pushes files to AWS S3 buckets. See ?updateResources for details. 92 | 93 | ## 'release' is passed to makeEnsemblFastaToAHM. 94 | \dontrun{ 95 | meta <- updateResources("/local/path", 96 | BiocVersion = c("3.2", "3.3"), 97 | preparerClasses = "EnsemblFastaImportPreparer", 98 | metadataOnly = TRUE, insert = FALSE, 99 | justRunUnitTest = FALSE, release = "83") 100 | } 101 | } 102 | 103 | \keyword{methods} 104 | -------------------------------------------------------------------------------- /R/makeEnsemblTwoBit.R: -------------------------------------------------------------------------------- 1 | ### ========================================================================= 2 | ### makeEnsemblTwoBit() 3 | ### ------------------------------------------------------------------------- 4 | ### 5 | 6 | .ensemblTwoBitTypes <- 7 | c("cdna\\.all", "dna_rm\\.(primary_assembly|toplevel)", 8 | "dna_sm\\.(primary_assembly|toplevel)", 9 | "dna\\.(primary_assembly|toplevel)", "ncrna") 10 | 11 | ## Metadata generator 12 | makeEnsemblTwoBitToAHM <- 13 | function(currentMetadata, baseUrl = "ftp://ftp.ensembl.org/pub/", 14 | baseDir = "fasta/", release, 15 | justRunUnitTest = FALSE, BiocVersion = BiocManager::version()) 16 | { 17 | if (length(release) > 1L) 18 | stop("'release' must be a single integer") 19 | if (length(BiocVersion) > 1L) 20 | stop("BiocVersion must be a single version") 21 | time1 <- Sys.time() 22 | regex <- paste0(".*release-", release) 23 | sourceUrl <- .ensemblFastaSourceUrls(baseUrl, baseDir, regex, 24 | baseTypes=.ensemblTwoBitTypes) 25 | if (justRunUnitTest) 26 | sourceUrl <- sourceUrl[1:5] 27 | 28 | sourceFile <- sub(baseUrl, "ensembl/", sourceUrl) 29 | meta <- .ensemblMetadataFromUrl(sourceUrl, twobit=TRUE) 30 | dnaType <- local({ 31 | x <- basename(dirname(sourceFile)) 32 | sub("(dna|rna)", "\\U\\1", x, perl=TRUE) 33 | }) 34 | description <- paste("TwoBit", dnaType, "sequence for", meta$species) 35 | 36 | rdataPath <- sub("\\.fa\\.gz$", ".2bit", sourceFile) 37 | 38 | Map(AnnotationHubMetadata, 39 | AnnotationHubRoot=currentMetadata$AnnotationHubRoot, 40 | Description=description, 41 | Genome=meta$genome, 42 | RDataPath=rdataPath, 43 | SourceUrl=sourceUrl, 44 | SourceVersion=meta$sourceVersion, 45 | Species=meta$species, 46 | TaxonomyId=meta$taxonomyId, 47 | Title=meta$title, 48 | SourceSize=meta$sourceSize, 49 | 
SourceLastModifiedDate=meta$sourceLastModifiedDate, 50 | MoreArgs=list( 51 | BiocVersion=package_version(BiocVersion), 52 | Coordinate_1_based = TRUE, 53 | DataProvider="Ensembl", 54 | Maintainer = "Bioconductor Maintainer ", 55 | SourceType="FASTA", 56 | DispatchClass="TwoBitFile", 57 | RDataClass="TwoBitFile", 58 | RDataDateAdded=Sys.time(), 59 | Recipe="AnnotationHubData:::ensemblFastaToTwoBitFile", 60 | Tags=c("TwoBit", "ensembl", "sequence", "2bit", "FASTA"))) 61 | } 62 | 63 | ensemblFastaToTwoBitFile <- function(ahm) 64 | { 65 | ## Convert .fa file to .2bit 66 | gc() 67 | twobitOut <- file.path(metadata(ahm)$HubRoot, 68 | dirname(metadata(ahm)$RDataPath), basename(outputFile(ahm))) 69 | srcFile <- sub('\\.2bit','.fa.gz', twobitOut) 70 | dna <- import(srcFile, "FASTA") 71 | gc() 72 | 73 | tryCatch({ 74 | ## ID as name 75 | ids <- sub(" .*", "", names(dna)) 76 | stopifnot(length(ids) == length(dna)) 77 | names(dna) <- ids 78 | dna <- Biostrings::replaceAmbiguities(dna) 79 | export(dna, twobitOut, "TwoBit") 80 | }, error = function(err) { 81 | ## note: 'call.' is an argument of warning()/stop(), not message() 82 | message("conversion failed", 83 | "\n  file: ", sQuote(srcFile), 84 | "\n  reason: ", conditionMessage(err)) 85 | }, finally = { 86 | ## 'finally' takes an expression (an anonymous function here would 87 | ## never be invoked); remove the .fa file and free memory exactly once 88 | if (exists("dna")){ 89 | rm("dna") 90 | gc() 91 | } 92 | unlink(srcFile) 93 | gc() 94 | }) 95 | } 96 | 97 | ## create the class and newResources() method 98 | makeAnnotationHubResource("EnsemblTwoBitPreparer", makeEnsemblTwoBitToAHM) 99 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | import(methods) 2 | import(S4Vectors) 3 | import(IRanges) 4 | import(GenomicRanges) 5 | import(AnnotationHub) 6 | import(OrganismDbi) 7 | import(GenomicFeatures) 8 | import(RCurl) 9 | import(BiocManager) 10 | import(Biostrings) 11 | 12 | importFrom(biocViews, recommendPackages, guessPackageType) 13 | importMethodsFrom(AnnotationDbi, exists, get, saveDb) 14 | importMethodsFrom(BiocGenerics, mapply, Map) 15 | importMethodsFrom(DBI, dbDriver, dbGetQuery) 16 | importMethodsFrom(Rsamtools, indexFa) 17 | importMethodsFrom(RSQLite, dbConnect, dbDisconnect) 18 | importMethodsFrom(rtracklayer, 19 | browserSession, "genome<-", getTable, 20 | import, "tableName<-", tableNames, track, 21 | "trackName<-", trackNames, ucscTableQuery, export 22 | ) 23 | importFrom(Biobase, AnnotatedDataFrame, ExpressionSet) 24 | importFrom(Seqinfo, Seqinfo) 25 | importFrom(GenomeInfoDb, loadTaxonomyDb) 26 | importFrom(rtracklayer, GRangesForUCSCGenome, ucscGenomes) 27 | importFrom(Rsamtools, bgzip) 28 | importFrom(AnnotationForge, makeInpDb, makeOrgPackageFromNCBI) 29 | importFrom(AnnotationDbi, loadDb) 30 | importFrom(tools, file_ext) 31 | importFrom(futile.logger, 32 | ERROR, INFO, TRACE, appender.file, 33 | flog.appender, flog.threshold, flog.info 34 | ) 35 | importFrom(jsonlite, fromJSON, toJSON) 36 | importFrom(parallel, detectCores) 37 | importFrom(stats, setNames) 38 | importFrom(XML, readHTMLTable, xmlParse, xmlValue, getNodeSet, htmlParse) 39 | importFrom(RSQLite, SQLite, sqliteCopyDatabase) 40 | importFrom(graphics, title) 41 | importFrom(utils, 42 | capture.output, data, download.file, 43 | read.delim, read.table, str, read.csv 44 | ) 45 | 46 | import(BiocCheck) 47 | import(biocViews) 48 | importFrom(graph, nodes) 49 | 50 | ### 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 51 | ### Export S4 classes 52 | ### 53 | 54 | exportClasses( 55 | HubMetadata, 56 | AnnotationHubMetadata, 57 | ImportPreparer, 58 | UCSCTrackImportPreparer, 59 | UCSCChainPreparer, 60 | Grasp2ImportPreparer, 61 | EnsemblGtfImportPreparer, 62 | EnsemblFastaImportPreparer, 63 | Inparanoid8ImportPreparer, 64 | NCBIImportPreparer 65 | ) 66 | 67 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 68 | ### Export non-generic functions 69 | ### 70 | 71 | export( 72 | AnnotationHubMetadata, makeAnnotationHubMetadata, 73 | UCSCTrackImportPreparer, 74 | #postProcessMetadata, 75 | flog, 76 | upload_to_S3, 77 | upload_to_azure, 78 | getImportPreparerClasses, 79 | makeAnnotationHubResource, 80 | HubMetadataFromJson, 81 | toJson, 82 | constructSeqInfo, 83 | makeEnsemblFastaToAHM, makeEnsemblTwoBitToAHM, 84 | ensemblFastaToFaFile, 85 | ensemblFastaToTwoBitFile, 86 | ahmToJson, 87 | newResources, updateResources, deleteResources, 88 | pushMetadata, pushResources, 89 | makeGencodeFastaToAHM, gencodeFastaToFaFile, 90 | makeStandardOrgDbsToAHM, makeStandardTxDbsToAHM, 91 | makeNCBIToOrgDbsToAHM, 92 | getSpeciesList, validSpecies, suggestSpecies, getValidSourceTypes, 93 | checkSpeciesTaxId, validDispatchClass 94 | ) 95 | 96 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 97 | ### Export S4 generics defined in AnnotationHubData + export corresponding 98 | ### methods 99 | ### 100 | 101 | export( 102 | recipeName, run, runRecipes, 103 | hubError, "hubError<-", 104 | inputFiles, outputFile, 105 | metadata, "metadata<-", 106 | metadataList, metadataTable, 107 | annotationHubRoot, 108 | sourceUrls 109 | ) 110 | 111 | exportMethods( 112 | recipeName, run, runRecipes, 113 | hubError, "hubError<-", 114 | inputFiles, outputFile, 115 | metadata, "metadata<-", 116 | metadataList, metadataTable, 117 | annotationHubRoot, 118 | sourceUrls 119 | ) 120 | 121 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 122 | ### Export S4 methods for generics not defined in AnnotationHubData 123 | ### 124 | 125 | exportMethods( 126 | show 127 | ) 128 | -------------------------------------------------------------------------------- /man/validationFunctions.Rd: -------------------------------------------------------------------------------- 1 | \name{validationFunctions} 2 | \alias{validationFunctions} 3 | 4 | \alias{getSpeciesList} 5 | \alias{validSpecies} 6 | \alias{suggestSpecies} 7 | \alias{getValidSourceTypes} 8 | \alias{checkSpeciesTaxId} 9 | \alias{validDispatchClass} 10 | 11 | \title{ValidationFunctions} 12 | 13 | \description{ 14 | Functions to assist in the validation process of creating the 15 | metadata.csv file for Hub Resources 16 | } 17 | 18 | \usage{ 19 | getSpeciesList(verbose=FALSE) 20 | 21 | validSpecies(species, verbose=TRUE) 22 | 23 | suggestSpecies(query, verbose=FALSE, op=c("|", "&")) 24 | 25 | getValidSourceTypes() 26 | 27 | checkSpeciesTaxId(txid, species, verbose=TRUE) 28 | 29 | validDispatchClass(dc, verbose=TRUE) 30 | } 31 | 32 | \arguments{ 33 | \item{species}{ 34 | species to validate (may be single value or list) 35 | } 36 | \item{query}{ 37 | terms to query. Whether AND or OR is determined by argument op. 
38 | } 39 | \item{verbose}{ 40 | Should additional information and useful tips be displayed. 41 | } 42 | \item{op}{ 43 | Should searching of multiple terms be conditional OR ("|") or AND 44 | ("&") 45 | } 46 | \item{txid}{taxonomy id (single value or list)} 47 | \item{dc}{Dispatch class to validate (may be single value or list)} 48 | } 49 | 50 | \details{ 51 | \itemize{ 52 | \item getSpeciesList: Provides a list of valid species as 53 | determined by the GenomeInfoDbData package specData.rda file. 54 | 55 | \item validSpecies: True/False if argument is considered a valid 56 | species based on the list generated by getSpeciesList. A species 57 | may be deemed invalid if the capitalization or 58 | punctuation mismatches. Use suggestSpecies to find similar terms. 59 | 60 | \item suggestSpecies: Based on a term or multiple terms suggest 61 | possible valid species. 62 | 63 | \item getValidSourceTypes: returns list of acceptable values for 64 | SourceType in metadata.csv. If you think a valid source type 65 | should be added to the list please reach out to 66 | maintainer@bioconductor.org 67 | 68 | \item checkSpeciesTaxId: cross-validates a list of species and 69 | taxonomy ids for expected values based on 70 | \code{GenomeInfoDb::loadTaxonomyDb()}. Warns when there is a 71 | mismatch. 72 | 73 | \item validDispatchClass: TRUE/FALSE if argument is considered a 74 | valid DispatchClass based on the currently available methods in 75 | AnnotationHub. Use \code{AnnotationHub::DispatchClassList()} to see 76 | the table of currently available methods. If a currently available 77 | method is not appropriate for your resource, please reach out to 78 | Lori Shepherd \email{Lori.Shepherd@roswellpark.org} to request a 79 | new method be added. 80 | 81 | } 82 | } 83 | 84 | \value{ 85 | \itemize{ 86 | \item For getSpeciesList: character vector of valid species 87 | \item For validSpecies: True/False if all species given as argument 88 | are valid 89 | \item For suggestSpecies: data.frame of taxonomy id and species name 90 | of possible valid species based on given query key words. 91 | \item For getValidSourceTypes: character vector of valid source 92 | types. 93 | \item For checkSpeciesTaxId: NULL if the check is verified. If verbose 94 | is true, a table of suggested values is shown along with the warning. 
95 | \item For validDispatchClass: True/False if all dispatch classes given 96 | as argument are valid 97 | } 98 | } 99 | 100 | \author{Lori Shepherd} 101 | 102 | \seealso{ 103 | \itemize{ 104 | \item \link{AnnotationHubMetadata} 105 | \item \link{makeAnnotationHubMetadata} 106 | } 107 | } 108 | 109 | \examples{ 110 | 111 | species = getSpeciesList() 112 | 113 | # following is TRUE 114 | 115 | validSpecies("Homo sapiens") 116 | # following is FALSE because of the lowercase starting "h" 117 | validSpecies("homo sapiens") 118 | 119 | # can provide multiple; if any are not valid, the result is FALSE 120 | # TRUE 121 | validSpecies(c("Homo sapiens", "Canis domesticus")) 122 | 123 | suggestSpecies("Canis") 124 | 125 | getValidSourceTypes() 126 | 127 | checkSpeciesTaxId(1003232, "Edhazardia aedis") 128 | checkSpeciesTaxId(9606, "Homo sapiens") 129 | 130 | validDispatchClass("GRanges") 131 | } 132 | 133 | \keyword{methods} 134 | -------------------------------------------------------------------------------- /R/makeStandardTxDbsToSqlite.R: -------------------------------------------------------------------------------- 1 | ### ======================================================================= 2 | ### makeStandardTxDbsToSqlite 3 | ### ------------------------------------------------------------------------- 4 | ### 5 | 6 | ## Extracts the sqlite files from the 'standard' TxDb packages in the current 7 | ## Bioconductor repo: 8 | ## 9 | ## http://www.bioconductor.org/packages/release/BiocViews.html#___TxDb. 10 | 11 | ## NOTES: 12 | ## - Recipe should be run after new TxDbs have been generated (right before 13 | ##   the next release). 14 | ## - BiocVersion should be the impending release / current devel. 15 | ## - May need to run AnnotationHubData:::.getTxDbs(TRUE) to load all 16 | ##   TxDbs if not in local R install. 17 | 18 | ## Returns list of loaded TxDb objects. 
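## Hedged usage sketch (editorial illustration, not package source): the
## package names below are hypothetical stand-ins; substitute the TxDbs
## current for the release.
## txdbs <- AnnotationHubData:::.getTxDbs(c("TxDb.Hsapiens.UCSC.hg38.knownGene",
##                                          "TxDb.Mmusculus.UCSC.mm10.knownGene"))
## sapply(txdbs, class)    # each element should be a "TxDb" object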
19 | .getTxDbs <- function(TxDbs) { 20 | lapply(TxDbs, function(xx) { 21 | if (!require(xx, character.only=TRUE)) 22 | BiocManager::install(xx, ask=FALSE) 23 | }) 24 | lapply(TxDbs, require, character.only=TRUE) 25 | res <- lapply(TxDbs, get) 26 | names(res) <- TxDbs 27 | res 28 | } 29 | 30 | .TxDbPkgMetadataFromObjs <- function(txdbs, biocversion) { 31 | title <- paste0(names(txdbs), '.sqlite') 32 | species <- unlist(lapply(txdbs, 33 | function(x){m <- metadata(x); m[m$name=='Organism', 2] })) 34 | taxonomyId <- as.integer(unlist(lapply(txdbs, 35 | function(x) { 36 | m <- metadata(x) 37 | id <- m[m$name=='TaxID', 2] 38 | if (!length(id)) 39 | id <- m[m$name=='Taxonomy ID', 2] 40 | id 41 | }))) 42 | 43 | sourceVersion <- sapply(txdbs, 44 | function(x) { 45 | m <- metadata(x) 46 | paste0('UCSC transcript based annotations generated ', 47 | strptime(m[m$name=='Creation time', 2], "%Y-%m-%d")) 48 | }, simplify=FALSE) 49 | url <- list(c("http://genome.ucsc.edu/", 50 | "http://hgdownload.cse.ucsc.edu/goldenPath")) 51 | list(title=title, 52 | species=species, 53 | taxonomyId=taxonomyId, 54 | genome=rep("UCSC genomes", length(title)), 55 | sourceUrl=rep(url, length(title)), 56 | sourceVersion=sourceVersion, 57 | description=paste("UCSC transcript based annotations for", species), 58 | rDataPath=paste0("ucsc/standard/", biocversion, "/",title)) 59 | } 60 | 61 | makeStandardTxDbsToAHM <- function(currentMetadata, justRunUnitTest = FALSE, 62 | BiocVersion = BiocManager::version(), 63 | TxDbs) { 64 | if (length(BiocVersion) > 1L) 65 | stop("length(BiocVersion) must == 1L") 66 | 67 | txdbs <- .getTxDbs(TxDbs) 68 | meta <- .TxDbPkgMetadataFromObjs(txdbs, biocversion=BiocVersion) 69 | Map(AnnotationHubMetadata, 70 | AnnotationHubRoot=currentMetadata$AnnotationHubRoot, 71 | Description=meta$description, 72 | Genome=meta$genome, 73 | SourceUrl=meta$sourceUrl, 74 | SourceVersion=meta$sourceVersion, 75 | Species=meta$species, 76 | TaxonomyId=meta$taxonomyId, 77 | Title=meta$title, 78 | RDataPath=meta$rDataPath, 79 | MoreArgs=list( 80 | BiocVersion=BiocVersion, 81 | Coordinate_1_based=TRUE, ## TRUE unless it "needs" to be FALSE 82 | DataProvider="UCSC", 83 | Maintainer="Bioconductor Maintainer ", 84 | RDataClass="TxDb", 85 | DispatchClass="SQLiteFile", 86 | SourceType="FASTA", 87 | RDataDateAdded = Sys.time(), 88 | Recipe="AnnotationHubData:::extractTxDbSqlite", 89 | Tags=c("UCSC", "Transcript", "Annotation")), 90 | USE.NAMES=FALSE) 91 | } 92 | 93 | ## Load the object and call saveDb() 94 | extractTxDbSqlite <- function(ahm) { 95 | dbFile <- metadata(ahm)$Title 96 | txdb <- sub('.sqlite','',dbFile) 97 | outputPath <- file.path(metadata(ahm)$AnnotationHubRoot, 98 | metadata(ahm)$RDataPath) 99 | if (!isSingleString(outputPath)) 100 | stop("'outputPath' must be a single string") 101 | sqliteCopyDatabase(dbconn(.getTxDbs(txdb)[[1]]), outputPath) 102 | outputFile(ahm) 103 | } 104 | 105 | makeAnnotationHubResource("TxDbFromPkgsImportPreparer", makeStandardTxDbsToAHM) 106 | -------------------------------------------------------------------------------- /R/makeInparanoid8ToDbs.R: -------------------------------------------------------------------------------- 1 | ## This recipe is no longer used. 
If reinstated, add this unit test 2 | ## back to test_recipes.R 3 | 4 | #test_Inparanoid8ImportPreparer_recipe <- function() { 5 | # suppressWarnings({ 6 | # ahms = updateResources(ahroot, BiocVersion, 7 | # preparerClasses = "Inparanoid8ImportPreparer", 8 | # insert = FALSE, metadataOnly=TRUE, 9 | # justRunUnitTest=TRUE) 10 | # }) 11 | # checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 12 | #} 13 | 14 | ## helper to make metadata list from the data 15 | .inparanoidMetadataFromUrl <- function(baseUrl, justRunUnitTest) { 16 | ## get all the subDirs 17 | subDirs <- AnnotationForge:::.getSubDirs(baseUrl) 18 | subDirs <- subDirs[!(subDirs %in% c('stderr/'))] 19 | species <- sub("/","",subDirs) 20 | allDirs <- file.path(baseUrl, species) 21 | ## We have the tax ID and the full species names in AnnotationForge already 22 | meta <- read.delim(system.file('extdata','inp8_Full_species_mapping', 23 | package='AnnotationForge'), 24 | sep="\t", header=TRUE, stringsAsFactors=FALSE) 25 | matches <- match(species, meta$inparanoidSpecies) 26 | fullSpecies <- meta$GenusAndSpecies[matches] 27 | taxonomyId <- as.integer(as.character(meta$taxID[matches])) 28 | ## get the name for the DB 29 | title <- paste0("hom.", 30 | gsub(" ","_",fullSpecies), 31 | ".inp8", 32 | ".sqlite") 33 | ## root <- setNames(rep(NA_character_, length(allDirs)), title) 34 | genome <- setNames(rep("inparanoid8 genomes", length(allDirs)), title) 35 | sourceVersion <- rep('Inparanoid version 8',length(allDirs)) 36 | description <- paste("Inparanoid 8 annotations about", fullSpecies) 37 | sourceUrl <- paste0(baseUrl,"/", species) 38 | 39 | rDataPath <- paste0("inparanoid8/Orthologs/",title) 40 | 41 | df <- data.frame(title=title, species = fullSpecies, 42 | taxonomyId = taxonomyId, genome = genome, sourceUrl=sourceUrl, 43 | sourceVersion = sourceVersion, 44 | description=description, rDataPath=rDataPath, stringsAsFactors=FALSE) 45 | rownames(df) <- NULL 46 | 47 | if(justRunUnitTest) 48 | df <- df[1:2, ] 49 | df 50 | } 51 | 52 | 53 | ## STEP 1: make function to process metadata into AHMs 54 | ## This function will return the AHMs and takes no args. 55 | ## It also must specify a recipe function. 56 | makeinparanoid8ToAHMs <- function(currentMetadata, justRunUnitTest, BiocVersion) { 57 | baseUrl <- 'http://inparanoid.sbc.su.se/download/current/Orthologs_other_formats' 58 | ## Then make the metadata for these 59 | meta <- .inparanoidMetadataFromUrl(baseUrl, justRunUnitTest) 60 | ## then make AnnotationHubMetadata objects. 61 | Map(AnnotationHubMetadata, 62 | ## AnnotationHubRoot=meta$annotationHubRoot, 63 | Description=meta$description, 64 | Genome=meta$genome, 65 | SourceUrl=meta$sourceUrl, 66 | SourceVersion=meta$sourceVersion, 67 | Species=meta$species, 68 | TaxonomyId=meta$taxonomyId, 69 | Title=meta$title, 70 | RDataPath=meta$rDataPath, 71 | MoreArgs=list( 72 | BiocVersion=BiocVersion, 73 | SourceType="Inparanoid", 74 | Coordinate_1_based = TRUE, ## TRUE unless it "needs" to be FALSE 75 | DataProvider = "Inparanoid8", 76 | Maintainer = "Marc Carlson ", 77 | RDataClass = "Inparanoid8Db", 78 | DispatchClass="SQLiteFile", 79 | RDataDateAdded = Sys.time(), 80 | Recipe = "AnnotationHubData:::inparanoid8ToDbsRecipe", 81 | Tags = c("Inparanoid", "Gene", "Homology", "Annotation"))) 82 | } 83 | 84 | 85 | 86 | ## STEP 2: Make a recipe function that takes an AnnotationHubRecipe 87 | ## object. 
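## Hedged sketch (illustration only; this recipe is inactive): a recipe
## receives a single AnnotationHubMetadata object and writes its output file.
## ahms <- makeinparanoid8ToAHMs(currentMetadata=list(),
##                               justRunUnitTest=TRUE,
##                               BiocVersion=BiocManager::version())
## inparanoid8ToDbsRecipe(ahms[[1]])   # writes the sqlite DB under the hub root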
88 | inparanoid8ToDbsRecipe <- function(ahm){ 89 | 90 | inputFiles <- metadata(ahm)$SourceFile 91 | dbname <- makeInpDb(dir=file.path(inputFiles,""), 92 | dataDir=tempdir()) 93 | db <- loadDb(file=dbname) 94 | outputPath <- file.path(metadata(ahm)$AnnotationHubRoot, 95 | metadata(ahm)$RDataPath) 96 | saveDb(db, file=outputPath) 97 | outputFile(ahm) 98 | } 99 | 100 | 101 | 102 | 103 | ## STEP 3: Call the helper to set up the newResources() method 104 | makeAnnotationHubResource("Inparanoid8ImportPreparer", 105 | makeinparanoid8ToAHMs) 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /R/makeStandardOrgDbsToSqlite.R: -------------------------------------------------------------------------------- 1 | ### ========================================================================= 2 | ### makeStandardOrgDbsToSqlite ('standard' OrgDbs) 3 | ### ------------------------------------------------------------------------- 4 | ### 5 | 6 | ## This recipe extracts the sqlite files from the 'standard' 7 | ## OrgDb packages in the current Bioconductor repo: 8 | ## 9 | ## http://www.bioconductor.org/packages/release/BiocViews.html#___OrgDb. 10 | 11 | ## This recipe should be run after the new OrgDbs have been generated for the 12 | ## next release. The version should be the current devel version, 13 | ## soon to roll over to the new release. 14 | 15 | ## The 'non-standard' OrgDbs are generated with makeNCBIToOrgDbs.R. 16 | 17 | ## Returns list of OrgDb objects 18 | ## NOTE: OrganismDbi:::.packageTaxIds is a static named character vector 19 | ## of package names and taxids. This file should be checked to 20 | ## confirm the package names match the current batch of OrgDb packages. 
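## Quick sanity check (illustration; assumes the OrgDb packages are already
## installed locally, so no download is triggered):
## orgDbs <- AnnotationHubData:::.getOrgDbs(downloadOrgDbs=FALSE)
## head(names(orgDbs))    # e.g. "org.Hs.eg.db", "org.Mm.eg.db", ...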
21 | .getOrgDbs <- function(downloadOrgDbs=FALSE) { 22 | dbNames <- OrganismDbi:::.packageTaxIds() 23 | if (downloadOrgDbs) { ## download, install 24 | lapply(dbNames, function(xx) { 25 | if (!requireNamespace(xx)) { 26 | BiocManager::install(xx, ask=FALSE) 27 | } 28 | }) 29 | } 30 | res <- mapply(get, dbNames, lapply(dbNames, asNamespace), SIMPLIFY=FALSE) 31 | names(res) <- dbNames 32 | res 33 | } 34 | 35 | .orgDbPkgMetadataFromObjs <- function(orgDbs, biocversion) { 36 | ## title 37 | title <- paste0(names(orgDbs), '.sqlite') 38 | ## organism 39 | species <- unlist(lapply(orgDbs, 40 | function(x){m <- metadata(x); m[m$name=='ORGANISM', 2] })) 41 | ## tax ID 42 | taxonomyId <- as.integer(unlist(lapply(orgDbs, 43 | function(x){m <- metadata(x); m[m$name=='TAXID', 2] }))) 44 | ## genome 45 | genome <- rep("NCBI genomes", length(title)) 46 | 47 | ## sourceUrl 48 | urls <- c("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/", 49 | "ftp://ftp.ensembl.org/pub/current_fasta") 50 | sourceUrl <- rep(list(urls), length(title)) 51 | ## sourceVersion 52 | dateMessage <- paste0('NCBI gene annotations as of ', as.character(date())) 53 | sourceVersion <- rep(dateMessage, length(title)) 54 | ## description 55 | description <- paste("NCBI gene ID based annotations about", species) 56 | ## rDataPath 57 | rDataPath <- paste0("ncbi/standard/",biocversion,"/",title) 58 | ## return as a list 59 | list(##annotationHubRoot = root, 60 | title=title, 61 | species = species, 62 | taxonomyId = taxonomyId, 63 | genome = genome, 64 | sourceUrl=sourceUrl, 65 | sourceVersion = sourceVersion, 66 | description=description, 67 | rDataPath=rDataPath) 68 | } 69 | 70 | makeStandardOrgDbsToAHM <- function(currentMetadata, justRunUnitTest=FALSE, 71 | BiocVersion=BiocManager::version(), 72 | downloadOrgDbs=TRUE) { 73 | if (length(BiocVersion) > 1L) 74 | stop("BiocVersion must be a single version") 75 | 76 | orgDbs <- .getOrgDbs(downloadOrgDbs) 77 | meta <- .orgDbPkgMetadataFromObjs(orgDbs, biocversion=BiocVersion) 78 | Map(AnnotationHubMetadata, 79 | AnnotationHubRoot=currentMetadata$AnnotationHubRoot, 80 | Description=meta$description, 81 | Genome=meta$genome, 82 | SourceUrl=meta$sourceUrl, 83 | SourceVersion=meta$sourceVersion, 84 | Species=meta$species, 85 | TaxonomyId=meta$taxonomyId, 86 | Title=meta$title, 87 | RDataPath=meta$rDataPath, 88 | MoreArgs=list( 89 | BiocVersion=BiocVersion, 90 | Coordinate_1_based = TRUE, ## TRUE unless it "needs" to be FALSE 91 | DataProvider = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/", 92 | Maintainer = "Bioconductor Maintainer ", 93 | RDataClass = "OrgDb", 94 | DispatchClass = "SQLiteFile", 95 | SourceType="NCBI/ensembl", 96 | RDataDateAdded = Sys.time(), 97 | Recipe = "AnnotationHubData:::extractOrgDbSqlite", 98 | Tags = c("NCBI", "Gene", "Annotation"))) 99 | } 100 | 101 | ## Load the object and call saveDb() 102 | extractOrgDbSqlite <- function(ahm) { 103 | dbFile <- metadata(ahm)$Title 104 | orgDbName <- sub('.sqlite','',dbFile) 105 | orgDbs <- .getOrgDbs() 106 | orgDb <- orgDbs[[orgDbName]] 107 | outputPath <- file.path(metadata(ahm)$AnnotationHubRoot, 108 | metadata(ahm)$RDataPath) 109 | if (!isSingleString(outputPath)) 110 | stop("'outputPath' must be a single string") 111 | sqliteCopyDatabase(dbconn(orgDb), outputPath) 112 | outputFile(ahm) 113 | } 114 | 115 | makeAnnotationHubResource("OrgDbFromPkgsImportPreparer", makeStandardOrgDbsToAHM) 116 | -------------------------------------------------------------------------------- /R/makeHaemCode.R: 
-------------------------------------------------------------------------------- 1 | ## This recipe is no longer used. Download site has moved from the hard coded 2 | ## location to 3 | ## http://codex.stemcells.cam.ac.uk/browse.php?repository=haemcode&organism=mmu 4 | ## If this is resurrected, reinstate unit test with a smaller file 5 | ## used for justRunUnitTest=TRUE. 6 | ## 7 | # test_HaemCodeImportPreparer_recipe <- function() { 8 | # ahms = updateResources(ahroot, BiocVersion, 9 | # preparerClasses = "HaemCodeImportPreparer", 10 | # insert = FALSE, metadataOnly=TRUE, justRunUnitTest=TRUE) 11 | # checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 12 | #} 13 | 14 | 15 | 16 | .haemcodeBaseUrl <- "http://haemcode.stemcells.cam.ac.uk/" 17 | 18 | .getHaemCodeFileNames <- function(justRunUnitTest) { 19 | filename <- system.file("extdata", "haemCodeFileList.txt", 20 | package="AnnotationHubData") 21 | stopifnot(file.exists(filename)) 22 | file.list <- scan(filename, what=character(0), sep="\n", quiet=TRUE) 23 | 24 | metadata.filename <- system.file(package="AnnotationHubData", "extdata", 25 | "annotation_haemcode.tsv") 26 | stopifnot(file.exists(metadata.filename)) 27 | tbl.md <- read.table(metadata.filename, sep="\t", header=TRUE, as.is=TRUE) 28 | metadata <- tbl.md[which(tbl.md$filename == file.list),] 29 | metadata <- apply(metadata,1, function(x) paste0(x, collapse=", ")) 30 | metadata <- rep(metadata, 3) 31 | names(metadata) <- NULL 32 | 33 | paths <- c(bigWig="blood/BigWig/mm10", 34 | peaks="blood/Peaks/mm10", 35 | geneList="blood/geneList") 36 | 37 | urls <- paste0(.haemcodeBaseUrl, paths) 38 | 39 | file.types <- c("bw", "bed", "csv") 40 | 41 | fileurls <- mapply(function(x, y){ 42 | paste0(x, "/", file.list,".", y) 43 | }, urls, file.types, USE.NAMES=FALSE, SIMPLIFY=FALSE) 44 | fileurls <- unlist(fileurls) 45 | 46 | if(justRunUnitTest) { 47 | fileurls <- fileurls[c(1,2)] 48 | metadata <- metadata[c(1,2)] 49 | } 50 | 51 | list(files= fileurls, metadata = metadata) 52 | } 53 | 54 | .getHaemCode <- function(justRunUnitTest=FALSE) { 55 | result <- .getHaemCodeFileNames(justRunUnitTest) 56 | 57 | haemfiles <- result$files 58 | tags <- result$metadata 59 | 60 | if(length(haemfiles)==0) 61 | stop(" File List not found! 
") 62 | 63 | df <- .httrFileInfo(haemfiles, verbose=TRUE) 64 | title <- basename(haemfiles) 65 | type <- tools::file_ext(title) 66 | 67 | fileType <- sapply(type, function(x) 68 | switch(x, bw="bigWig", bed="peak", csv="geneList"), 69 | USE.NAMES =FALSE) 70 | 71 | description <- paste0(fileType, " file from Haemcode") 72 | 73 | dispatchclass <- sapply(type, function(x) 74 | switch(x, bw="BigWigFile", bed="BEDFile", csv="CSVtoGranges"), 75 | USE.NAMES =FALSE) 76 | 77 | sourcetype <- sapply(type, function(x) 78 | switch(x, bw="BigWig", bed="BED", csv="CSV"), 79 | USE.NAMES =FALSE) 80 | 81 | rdataclass <- sapply(type, function(x) 82 | switch(x, bw="BigWigFile", bed="GRanges", csv="GRanges"), 83 | USE.NAMES =FALSE) 84 | 85 | 86 | cbind(df, title, description, fileType, tags, dispatchclass, 87 | sourcetype, rdataclass, stringsAsFactors=FALSE) 88 | 89 | } 90 | 91 | makeHaemCodeImporter <- function(currentMetadata, justRunUnitTest=FALSE, 92 | BiocVersion=BiocManager::version()) { 93 | rsrc <- .getHaemCode(justRunUnitTest) 94 | 95 | ## input_sources table 96 | sourceSize <- as.numeric(rsrc$size) 97 | sourceUrls <- rsrc$fileurl 98 | sourceVersion <- gsub(" ", "_", rsrc$date) # should be character 99 | SourceLastModifiedDate <- rsrc$date # should be "POSIXct" "POSIXt" 100 | sourceType <- rsrc$sourcetype 101 | rdataclass <- rsrc$rdataclass 102 | 103 | ## resources table 104 | title <- rsrc$title 105 | # dataprovider, species, taxonomyid, genome are same for all files 106 | description <- rsrc$description 107 | # maintainer, cordinateBased, status_id, location_prefix, rdataadded, 108 | # preparerclss are same for all files 109 | 110 | rdatapath <- sub(.haemcodeBaseUrl, "", sourceUrls) 111 | dispatchclass <- rsrc$dispatchclass 112 | 113 | tags <- strsplit(rsrc$tags, ", ") 114 | 115 | Map(AnnotationHubMetadata, 116 | 117 | SourceSize=sourceSize, 118 | SourceUrl=sourceUrls, 119 | SourceVersion=sourceVersion, 120 | SourceLastModifiedDate = SourceLastModifiedDate, 121 | SourceType = sourceType, 122 | 123 | Description=description, 124 | Title=title, 125 | 126 | RDataPath=rdatapath, 127 | DispatchClass = dispatchclass, 128 | RDataClass = rdataclass, 129 | 130 | Tags=tags, 131 | 132 | MoreArgs=list( 133 | BiocVersion=BiocVersion, 134 | DataProvider = "Haemcode", 135 | Species="Mus musculus", 136 | TaxonomyId=10090L, 137 | Genome= "mm10", 138 | Maintainer = "Bioconductor Maintainer ", 139 | Coordinate_1_based = FALSE, 140 | Location_Prefix = .haemcodeBaseUrl, 141 | RDataDateAdded = Sys.time(), 142 | Recipe = NA_character_) 143 | ) 144 | } 145 | 146 | makeAnnotationHubResource("HaemCodeImportPreparer", makeHaemCodeImporter) 147 | -------------------------------------------------------------------------------- /inst/unitTests/test_recipe.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ## Tests to just see if we can run all of our recipes 3 | 4 | ahroot <- file.path(getwd(),"Temp") 5 | BiocVersion <- as.character(BiocManager::version()) 6 | 7 | ## No longer used: 8 | ## test_HaemCodeImportPreparer_recipe 9 | ## test_Inparanoid8ImportPreparer_recipe 10 | ## test_BioPaxImportPreparer_recipe 11 | 12 | 13 | ## FIXME: 14 | ## Both UCSC broken because location / format of eutils file has changed; 15 | ## See .organismToTaxid() 16 | #test_UCSCChainPreparer_recipe <- function() { 17 | # ahms = updateResources(ahroot, BiocVersion, 18 | # preparerClasses = "UCSCChainPreparer", 19 | # insert = FALSE, 
metadataOnly=TRUE, justRunUnitTest=TRUE) 20 | #    checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 21 | #} 22 | # 23 | #test_UCSC2BitPreparer_recipe <- function() { 24 | #    ahms = updateResources(ahroot, BiocVersion, 25 | #                           preparerClasses = "UCSC2BitPreparer", 26 | #                           insert = FALSE, metadataOnly=TRUE, justRunUnitTest=TRUE) 27 | #    checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 28 | #} 29 | 30 | test_EncodeImportPreparer_recipe <- function() { 31 | ahms = updateResources(ahroot, BiocVersion, 32 | preparerClasses = "EncodeImportPreparer", 33 | insert = FALSE, metadataOnly=TRUE, justRunUnitTest=TRUE) 34 | checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 35 | } 36 | 37 | ## FIXME: 38 | ## Broken because of a change in R and encoding 39 | ## breaks makeEpigenomeRoadmap.R line 56 in gsub 40 | #test_EpigenomeRoadmapImportPreparer_recipe <- function() { 41 | #    ahms = updateResources(ahroot, BiocVersion, 42 | #                           preparerClasses = "EpigenomeRoadMapPreparer", 43 | #                           insert = FALSE, metadataOnly=TRUE, justRunUnitTest=TRUE) 44 | #    checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 45 | #} 46 | 47 | test_dbSNPVCFPreparer_recipe <- function() { 48 | ahms = updateResources(ahroot, BiocVersion, 49 | preparerClasses = "dbSNPVCFPreparer", 50 | insert = FALSE, metadataOnly=TRUE, justRunUnitTest=TRUE) 51 | checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 52 | } 53 | 54 | test_RefNetImportPreparer_recipe <- function() { 55 | ahms = updateResources(ahroot, BiocVersion, 56 | preparerClasses = "RefNetImportPreparer", 57 | insert = FALSE, metadataOnly=TRUE) 58 | checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 59 | } 60 | 61 | test_ChEAPreparer_recipe <- function() { 62 | ahms = updateResources(ahroot, BiocVersion, 63 | preparerClasses = "ChEAImportPreparer", 64 | insert = FALSE, metadataOnly=TRUE) 65 | checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 66 | } 67 | 68 | test_NCBIImportPreparer_recipe <- function() { 69 | ahms = updateResources(ahroot, BiocVersion, 70 | preparerClasses = "NCBIImportPreparer", 71 | insert = FALSE, metadataOnly=TRUE, 72 | justRunUnitTest=TRUE) 73 | checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 74 | } 75 | 76 | test_Grasp2Db_recipe <- function() { 77 | ahms = updateResources(ahroot, BiocVersion, 78 | preparerClasses = "Grasp2ImportPreparer", 79 | insert = FALSE, metadataOnly=TRUE, 80 | justRunUnitTest=TRUE) 81 | checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 82 | } 83 | 84 | ## FIXME: add test_EnsemblFasta 85 | ## test_EnsemblFastaTwoBitToAHM <- function() { 86 | ##     ahms = updateResources(ahroot, BiocVersion, 87 | ##                            preparerClasses = "EnsemblTwoBitPreparer", 88 | ##                            insert = FALSE, metadataOnly = TRUE, 89 | ##                            justRunUnitTest = TRUE, release = 96) 90 | ##     checkTrue(class(ahms[[1]]) == "AnnotationHubMetadata") 91 | ##     # fails before ensembl release 96 92 | ##     checkException( 93 | ##         updateResources(ahroot, BiocVersion, 94 | ##                         preparerClasses = "EnsemblTwoBitPreparer", 95 | ##                         insert = FALSE, metadataOnly = TRUE, 96 | ##                         justRunUnitTest = TRUE, release = 85) 97 | ##     ) 98 | ## } 99 | 100 | 101 | ## Test_EnsemblGtfToGRanges_recipe <- function() { 102 | ##     ahms = updateResources(ahroot, BiocVersion, 103 | ##                            preparerClasses = "EnsemblGtfImportPreparer", 104 | ##                            insert = FALSE, metadataOnly=TRUE, 105 | ##                            release = "96", justRunUnitTest=TRUE) 106 | ##     checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 107 | ##     # fails before ensembl release 96 108 | ##     checkException( 109 | ##         updateResources(ahroot, BiocVersion, 110 | ##                         preparerClasses = 
"EnsemblGtfImportPreparer", 111 | ## insert = FALSE, metadataOnly=TRUE, 112 | ## release = "85", justRunUnitTest=TRUE) 113 | ## ) 114 | ## } 115 | 116 | #test_GencodeGFF <- function() { 117 | # ahms = updateResources(ahroot, BiocVersion, 118 | # preparerClasses = "GencodeGffImportPreparer", 119 | # insert = FALSE, metadataOnly=TRUE, 120 | # justRunUnitTest=TRUE, release="31") 121 | # checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 122 | #} 123 | 124 | #test_GencodeFasta <- function() { 125 | # ahms = updateResources(ahroot, BiocVersion, 126 | # preparerClasses = "GencodeFastaImportPreparer", 127 | # insert = FALSE, metadataOnly=TRUE, 128 | # justRunUnitTest=TRUE, species="Human", 129 | # release="23") 130 | # checkTrue(class(ahms[[1]])=="AnnotationHubMetadata") 131 | #} 132 | -------------------------------------------------------------------------------- /R/HubMetadata-class.R: -------------------------------------------------------------------------------- 1 | ### ========================================================================= 2 | ### HubMetadata objects 3 | ### ------------------------------------------------------------------------- 4 | ### 5 | 6 | setOldClass(c("POSIXct", "POSIXt")) 7 | setOldClass("numeric_version") 8 | setOldClass(c("package_version", "numeric_version")) 9 | 10 | ## The prototype needs to be fully specified, using 'NA' to indicate 11 | ## unknown, otherwise to / from JSON is confused 12 | setClass("HubMetadata", 13 | representation( 14 | "VIRTUAL", 15 | HubRoot="character", 16 | BiocVersion="package_version", 17 | Coordinate_1_based="logical", 18 | DataProvider="character", 19 | DerivedMd5="character", 20 | Description='character', 21 | Genome="character", ## needed for record_id 22 | Maintainer="character", 23 | Notes='character', 24 | RDataClass="character", ## needed for record_id 25 | RDataDateAdded="POSIXct", 26 | RDataPath="character", 27 | Recipe="character", ## no longer needed for record_id 28 | SourceLastModifiedDate="POSIXct", 29 | SourceMd5="character", 30 | SourceSize="numeric", 31 | SourceUrl="character", ## needed for record_id 32 | SourceVersion="character", 33 | SourceType="character", 34 | Species="character", 35 | Tags='character', 36 | TaxonomyId="integer", ## needed for record_id 37 | Title="character", 38 | Location_Prefix="character", 39 | DispatchClass="character", 40 | PreparerClass="character", ## needed for record_id 41 | Error="character" 42 | ), 43 | prototype = prototype( 44 | HubRoot=NA_character_, 45 | BiocVersion=BiocManager::version(), 46 | Coordinate_1_based=NA, 47 | DataProvider=NA_character_, 48 | DerivedMd5=NA_character_, 49 | Description=NA_character_, 50 | Genome=NA_character_, 51 | Maintainer= 52 | "Bioconductor Package Maintainer ", 53 | Notes=NA_character_, 54 | RDataClass=NA_character_, 55 | RDataDateAdded=as.POSIXct(NA_character_), 56 | RDataPath=NA_character_, 57 | Recipe=NA_character_, 58 | SourceLastModifiedDate=as.POSIXct(NA_character_), 59 | SourceMd5=NA_character_, 60 | SourceSize=NA_real_, 61 | SourceVersion=NA_character_, 62 | SourceType=NA_character_, 63 | Species=NA_character_, 64 | Tags=NA_character_, 65 | TaxonomyId=NA_integer_, 66 | Title=NA_character_, 67 | Location_Prefix=NA_character_, 68 | DispatchClass=NA_character_, 69 | PreparerClass=NA_character_, 70 | Error="NA_character" 71 | ) 72 | ) 73 | 74 | ## ---------------------------------------------------------------------------- 75 | ## generics 76 | ## 77 | 78 | setGeneric("recipeName", signature="object", 79 | function(object) 
standardGeneric ("recipeName") 80 | ) 81 | 82 | setGeneric("inputFiles", signature="object", 83 | function(object, ...) standardGeneric ("inputFiles") 84 | ) 85 | 86 | setGeneric("outputFile", signature="object", 87 | function(object) standardGeneric ("outputFile") 88 | ) 89 | 90 | setGeneric("run", signature="object", 91 | function(object, recipeFunction, ...) standardGeneric ("run") 92 | ) 93 | 94 | setGeneric("hubError", function(x) standardGeneric("hubError")) 95 | 96 | setGeneric("hubError<-", signature=c("x", "value"), 97 | function(x, value) standardGeneric("hubError<-") 98 | ) 99 | 100 | ## ------------------------------------------------------------------------------ 101 | ## getters and setters 102 | ## 103 | 104 | setMethod("metadata", "HubMetadata", 105 | function(x, ...) { 106 | nms <- slotNames(class(x)) 107 | names(nms) <- nms 108 | lapply(nms, slot, object=x) 109 | } 110 | ) 111 | 112 | setReplaceMethod("metadata", c("HubMetadata", "list"), 113 | function(x, ..., value) 114 | do.call(new, c(class(x), x, value)) 115 | ) 116 | 117 | setMethod("recipeName", "HubMetadata", 118 | function(object) 119 | metadata(object)$Recipe 120 | ) 121 | 122 | setMethod("inputFiles", "HubMetadata", 123 | function(object, useRoot=TRUE) { 124 | if(useRoot==TRUE){ 125 | res <- file.path(metadata(object)$HubRoot, 126 | metadata(object)$RDataPath) 127 | }else{ 128 | res <- metadata(object)$SourceUrl 129 | } 130 | res 131 | } 132 | ) 133 | 134 | setMethod("outputFile", "HubMetadata", 135 | function(object) 136 | file.path(metadata(object)$HubRoot, 137 | basename(metadata(object)$RDataPath)) 138 | ) 139 | 140 | setMethod("hubError", "HubMetadata", 141 | function(x) x@Error 142 | ) 143 | 144 | setMethod("hubError", "list", 145 | function(x) 146 | { 147 | if (!all(sapply(x, is, "HubMetadata"))) 148 | stop("all elements of 'value' must be 'HubMetadata' objects") 149 | sapply(x, hubError) 150 | } 151 | ) 152 | 153 | setReplaceMethod("hubError", c("HubMetadata", "character"), 154 | function(x, value) 155 | { 156 | x@Error <- value 157 | x 158 | } 159 | ) 160 | 161 | setReplaceMethod("hubError", c("list", "character"), 162 | function(x, value) 163 | { 164 | if (!all(sapply(x, is, "HubMetadata"))) 165 | stop("all elements of 'x' must be 'HubMetadata' objects") 166 | lapply(x, "hubError<-", value=value) 167 | } 168 | ) 169 | 170 | ## ------------------------------------------------------------------------------ 171 | ## show 172 | ## 173 | 174 | setMethod(show, "HubMetadata", 175 | function(object) 176 | { 177 | cat("class: ", class(object), '\n', sep='') 178 | for (slt in sort(slotNames(object))) { 179 | value <- slot(object, slt) 180 | txt <- paste0(slt, ": ", paste0(as.character(value), collapse=" ")) 181 | cat(strwrap(txt), sep="\n ") 182 | } 183 | }) 184 | -------------------------------------------------------------------------------- /man/makeStandardOrgDbs.Rd: -------------------------------------------------------------------------------- 1 | \name{makeStandardOrgDbs} 2 | 3 | \alias{makeStandardOrgDbs} 4 | \alias{makeStandardOrgDbsToAHM} 5 | \alias{makeStandardTxDbs} 6 | \alias{makeStandardTxDbsToAHM} 7 | \alias{makeNonStandardOrgDbs} 8 | \alias{makeNCBIToOrgDbsToAHM} 9 | 10 | \title{Functions to add OrgDb and TxDb sqlite files to AnnotationHub} 11 | 12 | \description{Add OrgDb and TxDb sqlite files to AnnotationHub} 13 | 14 | \usage{ 15 | makeStandardOrgDbsToAHM(currentMetadata, justRunUnitTest = FALSE, 16 | BiocVersion = BiocManager::version(), 17 | downloadOrgDbs = TRUE) 18 | 19 | 
makeStandardTxDbsToAHM(currentMetadata, justRunUnitTest = FALSE, 20 | BiocVersion = BiocManager::version(), TxDbs) 21 | 22 | makeNCBIToOrgDbsToAHM(currentMetadata, justRunUnitTest = FALSE, 23 | BiocVersion = BiocManager::version(), 24 | baseUrl = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/") 25 | } 26 | 27 | \arguments{ 28 | \item{currentMetadata}{ 29 | Historically was intended to be a list of metadata to filter, i.e., 30 | records that do not need to be processed again. In some recipes this 31 | is used as a way to pass additional arguments. Need to remove or 32 | make consistent. 33 | } 34 | \item{baseUrl}{ 35 | A \code{character()}. The file location. 36 | } 37 | \item{justRunUnitTest}{ 38 | A \code{logical}. When TRUE, a small number of records (usually <= 5) are 39 | processed instead of all. 40 | } 41 | \item{BiocVersion}{ 42 | A \code{character(1)}. The resource will be available for Bioconductor 43 | versions greater than or equal to this version. Default is 44 | BiocManager::version(). 45 | } 46 | \item{TxDbs}{ 47 | Character vector of the \code{TxDb} names; generally includes 48 | \code{TxDbs} that were new or updated for the current release. 49 | } 50 | \item{downloadOrgDbs}{ 51 | A \code{logical}. Indicates if all \code{OrgDb} packages in the 52 | Bioconductor repo should be downloaded and installed. This should be 53 | \code{TRUE} the first time the recipe is run and can be \code{FALSE} for 54 | subsequent runs when testing. 55 | } 56 | } 57 | 58 | \details{ 59 | \code{makeStandardOrgDbsToAHM} and \code{makeStandardTxDbsToAHM} extract 60 | the sqlite files from the existing \code{OrgDb} and \code{TxDb} packages 61 | in the Bioconductor repositories and generate associated metadata. 62 | 63 | \code{makeNCBIToOrgDbsToAHM} creates sqlite files and metadata for 1000 64 | organisms with the \code{makeOrgPackageFromNCBI} function. These 65 | organisms are less 'mainstream' than those hosted in the Bioconductor 66 | repository (\code{makeStandardOrgDbsToAHM}) and the databases are less 67 | comprehensive because data only come from one source, NCBI. 68 | } 69 | 70 | \value{ 71 | List of \code{AnnotationHubMetadata} objects. 72 | } 73 | 74 | \author{Bioconductor Core Team} 75 | 76 | \seealso{ 77 | \itemize{ 78 | \item \link{updateResources} 79 | \item \link{AnnotationHubMetadata} 80 | } 81 | } 82 | 83 | \examples{ 84 | \dontrun{ 85 | ## In Bioconductor 3.5, one new TxDb was added and 4 active 86 | ## tracks were updated. This piece of code shows how to add these 5 87 | ## packages to AnnotationHub. 88 | 89 | ## Step I: generate metadata 90 | ## 91 | ## Generate the metadata with the low-level helper for inspection. 92 | TxDbs <- c("TxDb.Ggallus.UCSC.galGal5.refGene", 93 | "TxDb.Celegans.UCSC.ce11.refGene", 94 | "TxDb.Rnorvegicus.UCSC.rn5.refGene", 95 | "TxDb.Dmelanogaster.UCSC.dm6.ensGene", 96 | "TxDb.Rnorvegicus.UCSC.rn6.refGene") 97 | meta <- makeStandardTxDbsToAHM(currentMetadata=list(AnnotationHubRoot="TxDbs"), 98 | justRunUnitTest=FALSE, 99 | TxDbs = TxDbs) 100 | 101 | ## Once the low-level helper runs with no errors, try generating the 102 | ## metadata with the high-level wrapper updateResources(). Setting 103 | ## metadataOnly=TRUE will generate metadata only and not push resources 104 | ## to the data bucket. insert=FALSE prevents the metadata from being inserted in the 105 | ## database. 106 | ## 107 | ## The metadata generated by updateResources() will be the same as that 108 | ## generated by makeStandardTxDbsToAHM(). 
Both should be a list the same 109 | ## length as the number of TxDbs specified. 110 | meta <- updateResources("TxDbs", 111 | preparerClasses="TxDbFromPkgsImportPreparer", 112 | metadataOnly=TRUE, insert = FALSE, 113 | justRunUnitTest=FALSE, TxDbs = TxDbs) 114 | 115 | INFO [2017-04-11 09:12:09] Preparer Class: TxDbFromPkgsImportPreparer 116 | complete! 117 | > length(meta) 118 | [1] 5 119 | 120 | ## Step II: push resources to Azure 121 | ## 122 | ## If the metadata looks correct we are ready to push resources to Azure. 123 | ## Set metadataOnly=FALSE but keep insert=FALSE. 124 | 125 | ## export an environment variable with a core generated SAS URL for 126 | ## upload example: 127 | ## export AZURE_SAS_URL='https://bioconductorhubs.blob.core.windows.net/staginghub?sp=racwl&st=2022-02-08T15:57:00Z&se=2022-02-22T23:57:00Z&spr=https&sv=2020-08-04&sr=c&sig=fBtPzgrw1Akzlz%2Fwkne%2BQrxOKOdCzP1%2Fk5S%2FHk1LguE%3D' 128 | 129 | meta <- updateResources("TxDbs", 130 | BiocVersion="3.5", 131 | preparerClasses="TxDbFromPkgsImportPreparer", 132 | metadataOnly=FALSE, insert = FALSE, 133 | justRunUnitTest=FALSE, TxDbs = TxDbs) 134 | 135 | ## Step III: insert metadata in AnnotationHub production database 136 | ## 137 | ## Inserting the metadata in the database is usually done as a separate step 138 | ## and with the help of the AnnotationHub docker. 139 | ## Set metadataOnly=TRUE and insert=TRUE. 140 | meta <- updateResources("TxDbs", 141 | BiocVersion="3.5", 142 | preparerClasses="TxDbFromPkgsImportPreparer", 143 | metadataOnly=TRUE, insert = TRUE, 144 | justRunUnitTest=FALSE, TxDbs = TxDbs) 145 | 146 | } 147 | } 148 | 149 | \keyword{methods} 150 | -------------------------------------------------------------------------------- /inst/scripts/singleContributedResourceTemplate.R: -------------------------------------------------------------------------------- 1 | ## Community contributed resources. 
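## Suggested pre-flight checks before filling in a template (illustrative;
## these validation helpers are exported by this package):
## validSpecies("Vitis vinifera")               # TRUE
## checkSpeciesTaxId(29760L, "Vitis vinifera")  # NULL when consistent
## validDispatchClass("GRanges")                # TRUE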
2 | 3 | ## ----------------------------------------------------------------------- 4 | ## Timothee Flutre's GRanges from GFF 5 | ## ----------------------------------------------------------------------- 6 | 7 | ## Vitis vinifera URGI IGGP12Xv2_V3-20 8 | metadata <- AnnotationHubMetadata( 9 | Description="Gene Annotation for Vitis vinifera", 10 | Genome="IGGP12Xv2", 11 | Species="Vitis vinifera", 12 | SourceUrl="http://doi.org/10.15454/1.5009072354498936E12", 13 | SourceLastModifiedDate=as.POSIXct("2018-02-16"), 14 | SourceVersion="3.0", 15 | RDataPath="community/tflutre/Vvinifera_URGI_IGGP12Xv2_V3-20.gff3.Rdata", 16 | TaxonomyId=29760L, 17 | Title="Vvinifera_URGI_IGGP12Xv2_V3-20.gff3.Rdata", 18 | BiocVersion=package_version("3.6"), 19 | Coordinate_1_based=TRUE, 20 | DataProvider="URGI", 21 | Maintainer="Timothée Flutre ", 22 | RDataClass="GRanges", 23 | DispatchClass="GRanges", 24 | SourceType="GFF", 25 | RDataDateAdded=as.POSIXct(Sys.time()), 26 | Recipe=NA_character_, 27 | PreparerClass="None", 28 | Tags=c("GFF", "URGI", "Gene", "Transcript", "Annotation"), 29 | Notes="Compared to the original GFF3 file, chromosomes were slightly renamed to be compatible with the reference genome" 30 | ) 31 | 32 | ## Vitis vinifera URGI IGGP12Xv2 V3 33 | metadata <- AnnotationHubMetadata( 34 | Description="Gene Annotation for Vitis vinifera", 35 | Genome="IGGP12Xv2", 36 | Species="Vitis vinifera", 37 | SourceUrl="http://doi.org/10.15454/1.5009072354498936E12", 38 | SourceLastModifiedDate=as.POSIXct("2017-11-17"), 39 | SourceVersion="3.0", 40 | RDataPath="community/tflutre/Vvinifera_URGI_IGGP12Xv2_V3.gff3.Rdata", 41 | TaxonomyId=29760L, 42 | Title="Vvinifera_URGI_IGGP12Xv2_V3.gff3.Rdata", 43 | BiocVersion=package_version("3.6"), 44 | Coordinate_1_based=TRUE, 45 | DataProvider="URGI", 46 | Maintainer="Timothée Flutre ", 47 | RDataClass="GRanges", 48 | DispatchClass="GRanges", 49 | SourceType="GFF", 50 | RDataDateAdded=as.POSIXct(Sys.time()), 51 | Recipe=NA_character_, 52 | PreparerClass="None", 53 | Tags=c("GFF", "URGI", "Gene", "Transcript", "Annotation"), 54 | Notes="Compared to the original GFF3 file, chromosomes were slightly renamed to be compatible with the reference genome" 55 | ) 56 | 57 | ## Vitis vinifera CRIBI IGGP12Xv0 V2 58 | metadata <- AnnotationHubMetadata( 59 | Description="Gene Annotation for Vitis vinifera", 60 | Genome="IGGP12Xv0", 61 | Species="Vitis vinifera", 62 | SourceUrl="http://genomes.cribi.unipd.it/DATA/V2/V2.1/V2.1.gff3", 63 | SourceLastModifiedDate=as.POSIXct("2014-04-17"), 64 | SourceVersion="2.1", 65 | RDataPath="community/tflutre/", 66 | TaxonomyId=29760L, 67 | Title="Vvinifera_CRIBI_IGGP12Xv0_V2.1.gff3.Rdata", 68 | BiocVersion=package_version("3.3"), 69 | Coordinate_1_based=TRUE, 70 | DataProvider="CRIBI", 71 | Maintainer="Timothée Flutre ", -------------------------------------------------------------------------------- /R/webAccessFunctions.R: -------------------------------------------------------------------------------- 6 | ## Helper to collapse long multi-line strings (> 80 chars) 7 | .expandLine <- function(x) 8 | gsub("[[:space:]]{2,}"," ", x) 9 | 10 | ## ----------------------------------------------------------------------------- 11 | ## HTTP 12 | ## 13 | 14 | ## Parses XML from an HTTP page into a data.frame of filenames; 15 | ## lists filenames, or filenames ending with an extension, on the page. 16 | ## Also reads the md5sum in "md5sum.txt" on the same HTTP page. 
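## Illustrative call (hypothetical URL; any HTTP index page with an
## "md5sum.txt" companion file works the same way):
## df <- .httrRead("http://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver",
##                 extension=".gz", getmd5sum=TRUE)
## head(df)    # data.frame with columns 'files' and, here, 'md5sum'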
17 | .httrRead <- function(url, xpathString="//pre/a/text()", 18 | extension=NA_character_, getmd5sum=FALSE) { 19 | tryCatch({ 20 | result <- httpGET(url) 21 | html <- htmlParse(result, asText=TRUE) 22 | 23 | fls <- getNodeSet(html, xpathString) 24 | if (is(fls, "XMLNodeSet")) 25 | fls <- vapply(fls, xmlValue, character(1L)) 26 | 27 | md5exists <- length(grep("md5sum.txt", fls))!=0 28 | remove <- c("Name", "Size", "Last modified", "Description", 29 | "Parent Directory", "referenceSequences/", 30 | "files.txt", "md5sum.txt", "supplemental/") 31 | fls <- fls[!fls %in% remove ] 32 | 33 | ## filter by extension 34 | if(!is.na(extension)){ 35 | fls <- fls[grepl(paste0(extension, "$"), fls)] 36 | } 37 | 38 | ## UCSC chain and 2bit files have a file called md5sum.txt 39 | ## col1=md5sum, col2=filename 40 | ## note : not all chain files have md5sum on UCSC website! 41 | if(getmd5sum & md5exists & length(fls) != 0) { 42 | df <- read.table(paste0(url, "/", "md5sum.txt"), header=FALSE, 43 | stringsAsFactors=FALSE) 44 | md5sum <- df[match(fls, df[,2]),1] 45 | df <- data.frame(files=fls, md5sum=md5sum, stringsAsFactors=FALSE) 46 | } else 47 | df <- data.frame(files=fls, stringsAsFactors=FALSE) 48 | df 49 | 50 | }, error=function(err) { 51 | warning(basename(url), ": ", conditionMessage(err)) 52 | url=character() 53 | }) 54 | } 55 | 56 | 57 | ## Returns data.frame with fileurl, last modified date and file size 58 | .httrFileInfo <- function(urls, verbose=TRUE) { 59 | result <- lapply(urls, function(f){ 60 | if(verbose) 61 | message(paste0("getting file info: ", basename(f))) 62 | tryCatch({ 63 | h = suppressWarnings( 64 | httpGET(f, nobody=TRUE, filetime=TRUE, header=TRUE)) 65 | 66 | nams <- names(h$header) 67 | if("last-modified" %in% nams) 68 | h$header[c("last-modified", "content-length")] 69 | else 70 | c("last-modified"=NA, "content-length"=NA) 71 | }, error=function(err) { 72 | warning(basename(f), ": ", conditionMessage(err)) 73 | list("last-modified"=character(), "content-length"=character()) 74 | }) 75 | }) 76 | 77 | size <- as.numeric(sapply(result, "[[", "content-length")) 78 | date <- strptime(sapply(result, "[[", "last-modified"), 79 | "%a, %d %b %Y %H:%M:%S", tz="GMT") 80 | 81 | data.frame(fileurl=urls, date, size, stringsAsFactors=FALSE) 82 | } 83 | 84 | ## ----------------------------------------------------------------------------- 85 | ## FTP 86 | ## 87 | 88 | ## Returns a data.frame with fileurl, last modified date and file size. 89 | ## 'extension' can be a single file name with extension or just the extension. 
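## Sketch of the expected shape (the release path below is an assumption;
## adjust to a current Ensembl release):
## info <- .ftpFileInfo("ftp://ftp.ensembl.org/pub/release-84/gtf/homo_sapiens/",
##                      extension="gtf.gz")
## str(info)    # 'data.frame': columns fileurl, date, size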
90 | .ftpFileInfo <- function(url, extension, verbose=FALSE) { 91 | 92 | if (verbose) 93 | message(paste0("creating urls ...")) 94 | 95 | result <- lapply(url, function(ul) { 96 | message(ul) 97 | N.TRIES = 3L 98 | while (N.TRIES > 0L) { 99 | con <- tryCatch(getURL(ul), error=identity) 100 | if (!inherits(con, "error")) 101 | break 102 | Sys.sleep(300) 103 | N.TRIES <- N.TRIES - 1L 104 | } 105 | if (N.TRIES == 0L) { 106 | stop("'getURL()' failed:", 107 | "\n URL: ", ul, 108 | "\n error: ", conditionMessage(con)) 109 | } 110 | 111 | txt <- read.table(text=con, stringsAsFactors=FALSE, fill=TRUE) 112 | 113 | files <- txt[[9]] 114 | if (verbose) 115 | message(basename(ul)) 116 | 117 | pattern <- paste(paste0(extension, "$"), collapse="|") 118 | keep <- !grepl("00-", files) & grepl(pattern, files) 119 | txt <- txt[keep, ] 120 | if (nrow(txt) == 0L) 121 | return(data.frame(fileurl=character(), 122 | date=as.POSIXct(character()), 123 | size=numeric())) 124 | 125 | # last modified date and size 126 | dateraw <- apply(txt, 1, function(xx) paste(xx[6], xx[7], xx[8])) 127 | datestring <- lapply(dateraw, function(xx) { 128 | as.POSIXct(strptime(xx, format="%b %e %H:%M", tz="GMT")) 129 | }) 130 | if (any(is.na(datestring))) { 131 | datestring <- lapply(dateraw, function(xx) { 132 | as.POSIXct(strptime(xx, format="%b %e %Y", tz="GMT")) 133 | }) 134 | } 135 | 136 | data.frame(fileurl=paste0(ul, txt[[9]]), date=do.call(c, datestring), 137 | size=as.numeric(txt[[5]]), stringsAsFactors=FALSE) 138 | }) 139 | 140 | do.call(rbind, result) 141 | } 142 | 143 | .parseDirInfo <- function(info) { 144 | readLines(textConnection(trimws(info))) 145 | } 146 | 147 | # Return unparsed directory listing as character vector 148 | .ftpDirectoryInfo <- function(someUrl, filesOnly=FALSE) { 149 | curlHandle <- getCurlHandle(customrequest="LIST -R") 150 | info <- getURL(someUrl, curl=curlHandle) 151 | .parseDirInfo(info) 152 | } 153 | 154 | ## Return just the names of the files in an FTP directory 155 | ## Note, this will not do any cleaning of symlinks 156 | .listRemoteFiles <- function(someUrl){ 157 | curlHandle <- getCurlHandle(dirlistonly=TRUE) 158 | info <- getURL(someUrl, curl=curlHandle) 159 | .parseDirInfo(info) 160 | } 161 | -------------------------------------------------------------------------------- /R/makeUCSCChain.R: -------------------------------------------------------------------------------- 1 | .ucscBase <- "http://hgdownload.cse.ucsc.edu/" 2 | 3 | .getchainFiles <- function(url, fileName=NA_character_, verbose=TRUE) { 4 | result <- .httrRead(url, extension=fileName, getmd5sum=TRUE) 5 | if(length(result)) { 6 | files <- paste0(url, "/", result$files) 7 | df <- .httrFileInfo(files, verbose=TRUE) 8 | if(identical(names(result), c("files","md5sum"))) 9 | cbind(df, md5sum=result$md5sum, stringsAsFactors=FALSE) 10 | } else 11 | data.frame(fileurl=NA_character_, date=NA, 12 | size=NA, md5sum=NA_character_, stringsAsFactors=FALSE) 13 | } 14 | 15 | ## FIXME: eutils file (and interface?) has moved to 16 | ## "https://www.ncbi.nlm.nih.gov/books/NBK25501/" 17 | .organismToTaxid <- function(organism=character()) { 18 | ## query NCBI for taxonomy ID 19 | .eutils <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils" 20 | 21 | ## 1. ids 22 | uorganism <- unique(organism[!is.na(organism)]) 23 | query <- paste(uorganism, collapse=" OR ") 24 | url <- sprintf("%s/esearch.fcgi?db=taxonomy&term=%s&retmax=%d", 25 | .eutils, query, length(uorganism)) 26 | xml <- XML::xmlParse(url) 27 | 28 | ## 2. 
records 29 | id <- as.character(sapply(xml["//Id/text()"], xmlValue)) 30 | scin <- taxid <- character() 31 | if (length(id)) { 32 | query2 <- paste(id, collapse=",") 33 | url <- sprintf("%s/efetch.fcgi?db=taxonomy&id=%s&retmax=%d", 34 | .eutils, query2, length(uorganism)) 35 | xml <- XML::xmlParse(url) 36 | scin <- sapply(xml["/TaxaSet/Taxon/ScientificName"], xmlValue) 37 | taxid <- sapply(xml["/TaxaSet/Taxon/TaxId/text()"], xmlValue) 38 | } 39 | 40 | 41 | scin[which(scin %in% "Pongo abelii")] <- "Pongo pygmaeus abelii" 42 | scin[which(scin %in% "Xenopus (Silurana) tropicalis")]="Xenopus tropicalis" 43 | scin[which(scin %in% "Ictidomys tridecemlineatus")]="Spermophilus tridecemlineatus" 44 | 45 | ## there are 3 special cases: WE provide query, ncbi returns scin 46 | #a) query ="Pongo pygmaeus abelii", scin="Pongo abelii", taxid="9601" 47 | #b) query ="Xenopus tropicalis", scin="Xenopus (Silurana) tropicalis", taxid="8364" 48 | #c) query ="Spermophilus tridecemlineatus", scin="Ictidomys tridecemlineatus", taxid="43179" 49 | 50 | ## 3. Results 51 | as.integer(taxid[match(organism, scin)]) 52 | } 53 | 54 | .getUCSCResources <- 55 | function(fileType, dirName, fileName, verbose=FALSE, justRunUnitTest=FALSE) 56 | { 57 | ## get resource from UCSC 58 | .fileBase <- sprintf("%sgoldenPath", .ucscBase) 59 | genome_tbl <- rtracklayer::ucscGenomes(organism=TRUE) 60 | genomes <- genome_tbl$db 61 | ## remove faulty genome. 62 | rm <- c("cb1", "eboVir3", "dp2", "strPur1", "ci1", "calMil1","monDom1", 63 | "balAcu1" ,"musFur1") 64 | genomes <- setdiff(genomes, rm) 65 | 66 | urls <- sprintf("%s/%s/%s", .fileBase, genomes, dirName) 67 | 68 | if(justRunUnitTest) 69 | urls <- tail(urls, n=2) 70 | 71 | rsrc <- do.call(rbind, lapply(urls, .getchainFiles, 72 | fileName=fileName, verbose=verbose)) 73 | rsrc <- rsrc[complete.cases(rsrc),] 74 | title <- basename(rsrc$fileurl) 75 | 76 | ## parse the filename for each file type. 77 | switch(fileType, chain={ 78 | rsrc$from <- sub("^([[:alnum:]]+)To[A-Z].*", "\\1", title) 79 | rsrc$to <- sub(".*To([A-Z])([[:alnum:]]+).*", "\\L\\1\\E\\2", 80 | title, perl=TRUE) 81 | }, "2bit"={ 82 | rsrc$from <- sub(".2bit","", title) 83 | }, { 84 | stop("unknown fileType ", sQuote(fileType)) 85 | }) 86 | 87 | ## add the organism 88 | idx <- match(rsrc$from, genome_tbl$db) 89 | rsrc$organism <- rep(NA_character_, length(idx)) 90 | rsrc$organism[!is.na(idx)] <- genome_tbl[idx[!is.na(idx)], "organism"] 91 | 92 | ## add the taxonmy Id. 
93 | rsrc$taxid <- rep(NA_character_, length(idx)) 94 | rsrc$taxid[!is.na(idx)] <- .organismToTaxid(rsrc$organism[!is.na(idx)]) 95 | 96 | rsrc 97 | } 98 | 99 | makeUCSCChain <- function(currentMetadata, justRunUnitTest=FALSE, 100 | BiocVersion=BiocManager::version()) { 101 | rsrc <- .getUCSCResources(fileType="chain", dirName="liftOver", 102 | fileName="chain.gz", verbose=TRUE, justRunUnitTest) 103 | 104 | ## input_sources table 105 | sourceSize <- as.numeric(rsrc$size) 106 | sourceUrls <- rsrc$fileurl 107 | sourceVersion <- gsub(" ", "_", rsrc$date) 108 | sourceLastModifiedDate <- rsrc$date 109 | 110 | ## resources table 111 | species <- rsrc$organism 112 | genome <- rsrc$from 113 | taxonomyId <- as.integer(rsrc$taxid) 114 | title <- basename(rsrc$fileurl) 115 | description <- sprintf("UCSC liftOver chain file from %s to %s", 116 | rsrc$from, rsrc$to) 117 | rdatapaths <-gsub(.ucscBase, "",sourceUrls) 118 | md5sum <- rsrc$md5sum 119 | 120 | Map(AnnotationHubMetadata, 121 | 122 | SourceSize=sourceSize, 123 | SourceUrl=sourceUrls, 124 | SourceVersion=sourceVersion, 125 | SourceLastModifiedDate = sourceLastModifiedDate, 126 | SourceMd5 =md5sum, 127 | 128 | Description=description, 129 | Title=title, 130 | Genome=genome, 131 | Species=species, 132 | TaxonomyId=taxonomyId, 133 | 134 | RDataPath= rdatapaths, 135 | 136 | MoreArgs=list( 137 | BiocVersion=BiocVersion, 138 | # input sources 139 | SourceType= "Chain", 140 | 141 | # resources 142 | DataProvider = "UCSC", 143 | Maintainer = "Bioconductor Maintainer ", 144 | Coordinate_1_based = FALSE, 145 | Location_Prefix = .ucscBase, 146 | RDataDateAdded = Sys.time(), 147 | 148 | #rdata table 149 | DispatchClass= "ChainFile" , 150 | RDataClass = "GRanges", 151 | 152 | Recipe = NA_character_, 153 | Tags = c("liftOver", "chain", "UCSC", "genome", "homology"))) 154 | } 155 | 156 | makeAnnotationHubResource("UCSCChainPreparer", makeUCSCChain) 157 | -------------------------------------------------------------------------------- /R/ahmToJson.R: -------------------------------------------------------------------------------- 1 | ## Code for creating json records from the sqlite DB. 2 | 3 | ## RIGHT NOW the json looks sort of record centric. So I need to make 4 | ## code that takes AHMs and makes them into JSON. 5 | 6 | 7 | ## So 1st I need an exemplar AHM (will save one for now in inst/extdata 8 | 9 | ## So for testing: 10 | ## load(system.file('extdata','inpDrosPsuedo.rda', package='AnnotationHubData')) 11 | ## ahm 12 | 13 | 14 | ## helper to do cleanup and make sure things are present: 15 | cleanupLst <- function(lst){ 16 | if(is.na(lst[["recipe"]])){ return(lst) } 17 | 18 | if(length(lst[["recipe"]])==1){ 19 | lst[["recipe"]][[2]] <- "AnnotationHubData" 20 | } 21 | ## Unfortunately, I have no recipe args (so I can't fix that field) 22 | ## But I DO have this translation file Dan made me... 23 | ## looks like there are some issues with the data.. 
24 | if(lst[["recipe"]][1]=='extendedBedToGRanges'){ 25 | file <- system.file('extdata','titlesToRecipes.txt', 26 | package='AnnotationHubData') 27 | trns <- read.delim(file, header=FALSE, stringsAsFactors=FALSE) 28 | idx <- trns[[2]] %in% lst[["sourceurl"]] 29 | value <- trns[idx,][[1]] 30 | if(length(value)==1){ 31 | lst[["recipe"]][1] <- value 32 | }else{ 33 | warning("no matching value for recipe called 'extendedBedToGRanges'") 34 | } 35 | } 36 | lst 37 | } 38 | 39 | 40 | 41 | ## Dan suggests jsonlite 42 | ahmToJson <- function(ahm){ 43 | lst <- metadata(ahm) 44 | 45 | ## casting on elements that toJSON can't handle 46 | lst[['BiocVersion']] <- as.character(lst[['BiocVersion']]) 47 | lst[['SourceLastModifiedDate']] <- as.character(lst[['SourceLastModifiedDate']]) 48 | ## lower case all the names 49 | names(lst) <- tolower(names(lst)) 50 | 51 | ##TEMP cleanup the ahm (in future we want to stop using this!) 52 | lst <- cleanupLst(lst) 53 | 54 | rdatapaths <- Map(list, 55 | rdatapath=lst[['rdatapath']], 56 | rdataclass=lst[['rdataclass']], 57 | dispatchclass=lst[['dispatchclass']] 58 | ) 59 | ## using Map puts unwanted labels on things... 60 | names(rdatapaths) <- NULL 61 | 62 | input_sources <- Map(list, 63 | sourcesize=lst[['sourcesize']], 64 | sourceurl=lst[['sourceurl']], 65 | sourcetype=lst[['sourcetype']], 66 | sourceversion=lst[['sourceversion']], 67 | sourcemd5=lst[['sourcemd5']], 68 | sourcelastmodifieddate=lst[['sourcelastmodifieddate']] 69 | ) 70 | ## using Map puts unwanted labels on things... 71 | names(input_sources) <- NULL 72 | 73 | ## TODO: I need to have Map make lists but not have them be named horribly. 74 | ## So multiplexed like Map on rdatapaths below, but with result 75 | ## that looks like input_sources 76 | 77 | ## Now just need to re-arrange things a bit 78 | base <- list(title=lst[['title']], 79 | dataprovider=lst[['dataprovider']], 80 | species=lst[['species']], 81 | taxonomyid=as.integer(lst[['taxonomyid']]), 82 | genome=lst[['genome']], 83 | description=lst[['description']], 84 | coordinate_1_based=lst[['coordinate_1_based']], 85 | maintainer=lst[['maintainer']], 86 | rdataversion=lst[['rdataversion']], 87 | rdatadateadded=lst[['rdatadateadded']], 88 | ## FIXME - Old AHMs may not have Location_Prefix filled in! 89 | ## It should be http://s3.amazonaws.com/annotationhub/ or 90 | ## https://bioconductorhubs.blob.core.windows.net/annotationhub 91 | ## by default, for chain files it should be: 92 | ## http://hgdownload.cse.ucsc.edu/ 93 | location_prefix=lst[['location_prefix']], 94 | recipe=lst[['recipe']][1], 95 | recipe_package=ifelse(!is.na(lst[["recipe"]]), 96 | lst[['recipe']][2], lst[['recipe']][1] ), 97 | rdatapaths=rdatapaths, 98 | input_sources=input_sources, 99 | tags=lst[['tags']], 100 | biocversions=lst[['biocversion']], 101 | preparerclass=lst[['preparerclass']] 102 | ) 103 | 104 | ## then make JSON 105 | paste0(toJSON(base, auto_unbox=TRUE,na='null', pretty=TRUE), "\n") 106 | ## STILL: some issues here with no boxing where we want it (around 107 | ## sub-sets like 'versions' 108 | ## AND: some name-mangling in the tags... 109 | 110 | } 111 | 112 | 113 | 114 | 115 | ## Testing 116 | ## numExtends <- unlist(lapply(resources, function(x){x@Recipe[1]=='extendedBedToGRanges'})) 117 | 118 | 119 | ## NOTES from 4/21/14 120 | ## check on rdatasize and sourcesize (should not be NA?) - I think 121 | ## they are NA though- but double check this. 
- DONE
122 | ## values that are NA in the JSON should be set to null - DONE
123 | ## use ALL of the biocversions - DONE
124 | ## add sourceMd5, derivedMD5, sourceLastModifiedDate to the json
125 | ## (soon) - manually add these to the AHMs? - DONE
126 | 
127 | ## changes to the process for making Annotations:
128 | ## Export makeAnnotationHubResource (so it can be used externally in
129 | ## other packages) - DONE
130 | ## Allow currentMetadata to be passed in to the helper functions (add
131 | ## this to
132 | ## .generalNewResources::makeAnnotationHubMetadataFunction(currentMetadata,...)
133 | ## - DONE
134 | ## Recipes should use require() to minimize dependencies for
135 | ## annotations and suggests for things that are only needed by
136 | ## specific recipes. Or they could maybe just get away with importing.
137 | ## recipes and AHM generator should not have to define an AHMRoot
138 | ## (since this is always put in after the fact). Just use a default
139 | ## value for this.
140 | 
141 | 
142 | ## modernize all of the recipes so that they use the new system (the
143 | ## new simplified system).
144 | 
145 | 
146 | ## And actually we now need to also stop defining the AHMRoot in
147 | ## the recipes. (it is no longer necessary)
148 | 
149 | 
150 | ## Make sure that we can put a recipe into another package. - untested.
151 | 
152 | 
153 | ## look into the weird requirement for adding importPreparer subclasses to
154 | ## the NAMESPACE. - can we make this go away?
155 | 
156 | 
157 | ## Document makeAnnotationHubResource
158 | 
159 | 
160 | ## Fix the unit tests
--------------------------------------------------------------------------------
/R/makedbSNPVCF.R:
--------------------------------------------------------------------------------
1 | ### link to data description:
2 | ### http://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf/
3 | 
4 | ### ----------------------------------------------------------------------
5 | ### March 2016
6 | 
7 | ### Files in AH that point to ftp.ncbi.nih.gov/snp/organisms/*
8 | ### are no longer available:
9 | 
10 | ### We have 38 of them:
11 | 
12 | ### > length(query(hub, c("dbsnp", "vcf")))
13 | ### [1] 38
14 | ###
15 | ###
16 | ### Of the 38 full urls seen with query(hub, c("dbsnp", "vcf"))$sourceurl,
17 | ### there are 5 unique base directories:
18 | ###
19 | ### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/
20 | ### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b141_GRCh37p13/
21 | ### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b142_GRCh37p13/
22 | ### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b142_GRCh38/
23 | ### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b141_GRCh38/
24 | ###
25 | ### On the web site (ftp://ftp.ncbi.nih.gov/snp/organisms/) there are 7 base
26 | ### directories. b141 is no longer there and b144, b146 have been added.
27 | 
28 | ### The recipe has been updated to look in
29 | ### ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/
30 | ### ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/
31 | ### ----------------------------------------------------------------------
32 | 
33 | .dbSNPBaseUrl <- "ftp://ftp.ncbi.nlm.nih.gov/"
34 | 
35 | ## Choosing files from the archive so all files have a date stamp.
36 | ## It looks like files with no stamp in the current
37 | ## directory are either 'in progress' or 'subject to change'.
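## A minimal sketch (using only values that appear in .getdbSNP() below) of
## the source URLs this recipe ends up composing:
##   paste0(.dbSNPBaseUrl, "pub/clinvar/vcf_GRCh37/archive_1.0/2016/",
##          "clinvar_20160203.vcf.gz")
## ## -> "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/archive_1.0/2016/clinvar_20160203.vcf.gz"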
38 | .getdbSNP <- function(justRunUnitTest) {
39 |     baseUrl <- paste0(.dbSNPBaseUrl, "pub/clinvar/")
40 |     paths <- c(GRCh37="vcf_GRCh37/archive_1.0/2016/",
41 |                GRCh38="vcf_GRCh38/archive_1.0/2016/")
42 |     files <- c("clinvar_20160203", "clinvar_20160203_papu",
43 |                "common_and_clinical_20160203",
44 |                "common_no_known_medical_impact_20160203")
45 |     urls <- setNames(paste0(baseUrl, paths), names(paths))
46 | 
47 |     if (justRunUnitTest)
48 |         urls <- urls[1]
49 | 
50 |     genome <- rep(names(urls), each=length(files))
51 |     df <- .ftpFileInfo(url=urls, extension=paste0(files, ".vcf.gz"))
52 |     df$genome <- gsub("GRCh37clinical", "GRCh37", genome)
53 |     df <- cbind(df, title=basename(df$fileurl), stringsAsFactors = FALSE)
54 |     rownames(df) <- NULL
55 | 
56 |     map <- c(
57 |         `All` = .expandLine("VCF of all variations that meet the criteria
58 |             to be in a VCF file. This file is created once per dbSNP build."),
59 |         `All_papu` = .expandLine("VCF of all variations found in the
60 |             pseudoautosomal region (PAR), alternate loci, patch sequences and
61 |             unlocalized or unplaced contigs (papu)"),
62 |         `common_all` = .expandLine("VCF of all variations that are polymorphic
63 |             in at least one population in the 1000 Genomes project or any of the
64 |             following handles: 1000GENOMES, CSHL-HAPMAP, EGP_SNPS, NHLBI-ESP,
65 |             PGA-UW-FHCRC. A variation is polymorphic if the minor allele
66 |             frequency is at least 0.01 and the minor allele is present in
67 |             at least two samples."),
68 |         `clinvar` = .expandLine("VCF of variations from clinvar where 'YYYYMMDD'
69 |             represents the date the file was created. This file is created
70 |             weekly."),
71 |         `common_and_clinical` = .expandLine("Variations from common_all.vcf.gz
72 |             that are clinical. A clinical variation is one that appears in
73 |             clinvar_YYYYMMDD.vcf.gz with at least one of the following clinical
74 |             significance codes: 4 - probable-pathogenic, 5 - pathogenic,
75 |             6 - drug-response, 7 - histocompatibility, 255 - other.
76 |             This file is created weekly."),
77 |         `common_no_known_medical_impact` = .expandLine("Variations from
78 |             common_all.vcf.gz that do not meet the clinical criteria described
79 |             above.
This file is created weekly."))
80 | 
81 |     description <- character(nrow(df))  # one description slot per file in 'df'
82 |     for (i in seq_along(map))
83 |         description[grep(names(map)[i], df$title)] <- map[[i]]
84 | 
85 |     cbind(df, description, stringsAsFactors = FALSE)
86 | }
87 | 
88 | makedbSNPVCF <- function(currentMetadata, justRunUnitTest=TRUE,
89 |                          BiocVersion=BiocManager::version()) {
90 |     rsrc <- .getdbSNP(justRunUnitTest)
91 | 
92 |     ## input_sources table
93 |     sourceSize <- as.numeric(rsrc$size)
94 |     sourceUrls <- rsrc$fileurl
95 |     sourceVersion <- gsub(" ", "_", rsrc$date)
96 |     sourceLastModifiedDate <- rsrc$date
97 | 
98 |     ## resources table
99 |     title <- rsrc$title
100 |     description <- rsrc$description
101 |     genome <- rsrc$genome
102 | 
103 |     ## rdatapath should have 2 entries - for the VCF and its TabixFile
104 |     rdatapath <- sub(.dbSNPBaseUrl, "", rsrc$fileurl)
105 |     rdps <- rep(rdatapath, each=2)
106 |     rdatapaths <- split(rdps, f=as.factor(rep(seq_along(rdatapath),each=2)))
107 |     rdatapaths <- lapply(rdatapaths,
108 |                          function(x){x[2] <- paste0(x[2],".tbi") ; return(x)})
109 | 
110 |     tags <- lapply(genome,
111 |                    function(tag) c("dbSNP", tag, "VCF")
112 |     )
113 | 
114 |     Map(AnnotationHubMetadata,
115 |         SourceSize=sourceSize,
116 |         SourceUrl=sourceUrls,
117 |         SourceVersion=sourceVersion,
118 |         SourceLastModifiedDate=sourceLastModifiedDate,
119 | 
120 |         Description=description,
121 |         Title=title,
122 |         Genome=genome,
123 |         Tags=tags,
124 |         RDataPath=rdatapaths,
125 | 
126 |         MoreArgs=list(
127 |             BiocVersion=BiocVersion,
128 |             # input sources
129 |             SourceType= "VCF",
130 | 
131 |             # resources
132 |             Species="Homo sapiens",
133 |             TaxonomyId=9606L,
134 |             DataProvider = "dbSNP",
135 |             Maintainer = "Bioconductor Maintainer ",
136 |             Coordinate_1_based = FALSE,
137 |             Location_Prefix = .dbSNPBaseUrl,
138 |             RDataDateAdded = Sys.time(),
139 | 
140 |             #rdata table
141 |             DispatchClass= "dbSNPVCFFile" ,
142 |             RDataClass = c("VcfFile", "VcfFile"),
143 | 
144 |             Recipe = "AnnotationHubData:::ncbi_dbSNPVCFFile"))
145 | }
146 | 
147 | 
148 | ## recipe
149 | ncbi_dbSNPVCFFile <- function(ahm)
150 | {
151 |     ## The tbi file exists online, just download it.
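    ## (outputFile(ahm) has two elements here because makedbSNPVCF() above
    ## registered two RDataPath entries per resource: the .vcf.gz file and
    ## its .vcf.gz.tbi index.)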
152 | faIn <- normalizePath(inputFiles(ahm)) # file on ftp site 153 | faOut1 <- normalizePath(outputFile(ahm))[1] # vcf.gz file on localDir 154 | faOut2 <- outputFile(ahm)[2] # vcf.gz.tbi file on localDir 155 | 156 | if(!file.exists(faOut2)) { 157 | tbiFile <- paste0(metadata(ahm)$Location_Prefix, 158 | metadata(ahm)$RDataPath[2]) 159 | tbi <- download.file(tbiFile, faOut2) 160 | } 161 | faOut2 162 | } 163 | 164 | makeAnnotationHubResource("dbSNPVCFPreparer", makedbSNPVCF, quiet=TRUE) 165 | 166 | -------------------------------------------------------------------------------- /R/makeEncodeDCC.R: -------------------------------------------------------------------------------- 1 | # This is a new recipe for EncodeImportPreparer-class.R 2 | .ucscBase <- "http://hgdownload.cse.ucsc.edu/" 3 | 4 | .getTags <- function(url) { 5 | tagurl <- paste0(url, "files.txt") 6 | html <- httpGET(tagurl) 7 | 8 | html <- unlist(strsplit(html, "\n")) # split to get tags for each file 9 | lapply(html, function(t) { 10 | ta <- unlist(strsplit(t, "\t")) 11 | temp <- unlist(strsplit(ta[2],";")) 12 | temp <- trimws(temp) 13 | 14 | ## extract the md5sum if present 15 | md <- grep("md5sum=", temp, value=TRUE) 16 | md <- ifelse(length(md), gsub(".*=","", md), NA_character_) 17 | 18 | ## change "cell=8988T" to "8988T cell" 19 | n <- grep("cell=", temp, value=TRUE) 20 | n <- ifelse(length(n)!=0, paste0(gsub(".*=", "", n)," cell"), 21 | NA_character_) 22 | 23 | ## change "grant=Gingeras" to "Gingeras grant" 24 | g <- grep("grant=", temp, value=TRUE) 25 | g <- ifelse(length(g)!=0, paste0(gsub(".*=", "", g)," grant"), 26 | NA_character_) 27 | 28 | dv <- grep("dataVersion=", temp, value=TRUE) 29 | dv <- ifelse(length(dv)!=0, gsub(".*=", "", dv), NA_character_) 30 | 31 | ## get only important fields 32 | toMatch <- "dataType|lorigAssembly|type" 33 | temp <- temp[grepl(toMatch, temp)] 34 | 35 | ## remove everything before "=" 36 | temp <- gsub(".*=","", temp) 37 | 38 | ## add 39 | if(!is.na(n)) 40 | temp <- c(temp, n) 41 | 42 | if(!is.na(g)) 43 | temp <- c(temp, g) 44 | 45 | temp <- c("wgEncode", temp) 46 | temp <- temp[!grepl("None",temp)] 47 | 48 | list(tags=paste0(temp, collapse=", "), md5sum = md, 49 | sourceVersion=dv) 50 | }) 51 | } 52 | 53 | .cleanFiles <- function(url, isSubDir=FALSE) { 54 | fls <- .httrRead(url)$files 55 | 56 | if(length(fls) != 0) { 57 | if(isSubDir){ 58 | 59 | result <- .getTags(url) 60 | tags <- sapply(result, "[[", "tags") 61 | sourcemd5sum <- vapply(result, "[[",character(1), "md5sum") 62 | sourceVersion <- vapply(result, "[[", "", "sourceVersion") 63 | 64 | subst <- switch( basename(url), 65 | wgEncodeAwgTfbsUniform="wgEncodeAwgTfbs", 66 | wgEncodeAwgDnaseUniform="wgEncodeAwgDnase", 67 | wgEncodeGencodeV4="wgEncodeGencode", 68 | basename(url)) 69 | 70 | fls <- fls[grepl(subst,fls)] 71 | fls <- fls[!grepl("files.txt", fls)] 72 | if(length(tags)!=0) 73 | fls <- list(filename=fls, tags=tags, sourcemd5sum=sourcemd5sum, 74 | sourceVersion=sourceVersion) 75 | } 76 | } 77 | fls 78 | } 79 | 80 | .subDir <- function(url, verbose=TRUE) { 81 | contents <- .cleanFiles(url, isSubDir=TRUE) 82 | supported.formats <- c("narrowPeak", "broadPeak", "bedRnaElements", 83 | "gtf") 84 | tags <- contents$tags 85 | sourcemd5sum <- contents$sourcemd5sum 86 | files <- contents$filename 87 | sourceVersion <- contents$sourceVersion 88 | 89 | type <- sapply(strsplit(files, ".", fixed = TRUE), "[[", 2) 90 | idx <- type %in% supported.formats 91 | files <- files[idx] 92 | tags <- tags[idx] 93 | sourcemd5sum <- 
sourcemd5sum[idx] 94 | type <- type[idx] 95 | sourceVersion <- sourceVersion[idx] 96 | 97 | 98 | if(length(files)!=0) { 99 | files <- sprintf("%s%s", url, files) 100 | if(length(files)>5){ 101 | files<- files[1:5] 102 | tags<- tags[1:5] 103 | sourcemd5sum <- sourcemd5sum[1:5] 104 | type <- type[1:5] 105 | sourceVersion <- sourceVersion[1:5] 106 | } 107 | 108 | df <- .httrFileInfo(files, verbose) 109 | 110 | cbind(df, type, tags, sourcemd5sum, sourceVersion, 111 | stringsAsFactors=FALSE) 112 | } else 113 | data.frame(fileurl=character(), date=character(), size=numeric(), 114 | type= character(), stringsAsFactors=FALSE) 115 | } 116 | 117 | .encodeFiles <- function(justRunUnitTest=FALSE) { 118 | encode_url <- paste0(.ucscBase, "goldenpath/hg19/encodeDCC/") 119 | subdirs <- .cleanFiles(encode_url, isSubDir=FALSE) 120 | urls <- setNames(paste0(encode_url, subdirs), subdirs) 121 | 122 | if(justRunUnitTest) 123 | urls <- urls[c(1,2)] 124 | 125 | do.call(rbind, Map(.subDir, urls, verbose=TRUE)) 126 | } 127 | 128 | makeEncodeImporter <- function(currentMetadata, justRunUnitTest=FALSE, 129 | BiocVersion=BiocManager::version()) { 130 | rsrc <- .encodeFiles(justRunUnitTest) 131 | 132 | ## input_sources table 133 | sourceSize <- as.numeric(rsrc$size) 134 | sourceUrls <- rsrc$fileurl 135 | sourceVersion <- rsrc$sourceVersion # should be character 136 | SourceLastModifiedDate <- rsrc$date # should be "POSIXct" "POSIXt" 137 | sourceType <- sapply(rsrc$type, function(x) 138 | switch(x, 139 | broadPeak="BED", 140 | narrowPeak="BED", 141 | gtf="GTF", 142 | bedRnaElements="BED"), 143 | USE.NAMES =FALSE) 144 | 145 | dispatchclass <- sapply(rsrc$type, function(x) 146 | switch(x, 147 | broadPeak="UCSCBroadPeak", 148 | narrowPeak="UCSCNarrowPeak", 149 | gtf="GTFFile", 150 | bedRnaElements="UCSCBEDRnaElements"), 151 | USE.NAMES =FALSE) 152 | 153 | 154 | ## resources table 155 | title <- basename(rsrc$fileurl) 156 | description <- rsrc$description 157 | sourceMd5sum <- rsrc$sourcemd5sum 158 | 159 | rdatapath <- gsub(.ucscBase, "", sourceUrls) 160 | 161 | tags <- strsplit(rsrc$tags, ", ") 162 | 163 | Map(AnnotationHubMetadata, 164 | 165 | SourceSize=sourceSize, 166 | SourceUrl=sourceUrls, 167 | SourceVersion=sourceVersion, 168 | SourceLastModifiedDate = SourceLastModifiedDate, 169 | SourceType = sourceType, 170 | 171 | Description= paste0(rsrc$type, " file from ENCODE"), 172 | Title=title, 173 | 174 | RDataPath=rdatapath, 175 | DispatchClass = dispatchclass, 176 | 177 | Tags=tags, 178 | 179 | MoreArgs=list( 180 | BiocVersion=BiocVersion, 181 | # resources 182 | DataProvider = "UCSC", 183 | Species="Homo sapiens", 184 | TaxonomyId=9606L, 185 | Genome= "hg19", 186 | Maintainer = "Bioconductor Maintainer ", 187 | Coordinate_1_based = FALSE, 188 | ##status_id =2L, 189 | Location_Prefix = .ucscBase, 190 | RDataDateAdded = Sys.time(), 191 | ##PreparerClass = "EncodeImportPreparer", 192 | 193 | #rdata table 194 | RDataClass = "GRanges", 195 | 196 | Recipe = NA_character_)) 197 | } 198 | 199 | makeAnnotationHubResource("EncodeImportPreparer", makeEncodeImporter) 200 | 201 | -------------------------------------------------------------------------------- /man/updateResources.Rd: -------------------------------------------------------------------------------- 1 | \name{updateResources} 2 | \alias{updateResources} 3 | 4 | \alias{runRecipes} 5 | \alias{runRecipes,AnnotationHubMetadata-method} 6 | \alias{pushResources} 7 | \alias{pushMetadata} 8 | 9 | 10 | \title{updateResources} 11 | 12 | \description{ 13 | Add new resources to 
AnnotationHub
14 | }
15 | 
16 | \usage{
17 | updateResources(AnnotationHubRoot, BiocVersion = BiocManager::version(),
18 |                 preparerClasses = getImportPreparerClasses(),
19 |                 metadataOnly = TRUE, insert = FALSE,
20 |                 justRunUnitTest = FALSE, ...)
21 | 
22 | pushResources(allAhms, uploadToRemote = TRUE, download = TRUE)
23 | 
24 | pushMetadata(allAhms, url)
25 | }
26 | 
27 | \arguments{
28 |   \item{AnnotationHubRoot}{
29 |     Local path where files will be downloaded.
30 |   }
31 |   \item{BiocVersion}{
32 |     A \code{character(1)} Bioconductor version. The resource will be available
33 |     in Bioconductor versions greater than or equal to this version. The default
34 |     is the current version, as reported by BiocManager::version().
35 |   }
36 |   \item{preparerClasses}{
37 |     One of the \code{ImportPreparer} subclasses returned by
38 |     \code{getImportPreparerClasses()}. This class is used for dispatch during
39 |     data discovery.
40 |   }
41 |   \item{metadataOnly}{
42 |     A \code{logical} to specify the processing of metadata only or both
43 |     metadata and data files.
44 | 
45 |     When FALSE, metadata are generated and data files are downloaded,
46 |     processed and pushed to their final location in S3 buckets.
47 |     \code{metadataOnly = TRUE} produces only metadata and is useful for
48 |     testing.
49 |   }
50 |   \item{insert}{
51 |     NOTE: This option is for inserting metadata records in the
52 |     production data base (done by a Bioconductor core team member) and
53 |     is for internal use only.
54 | 
55 |     A \code{logical} to control if metadata are inserted in the AnnotationHub
56 |     db. By default this option is FALSE, which is a useful state in which
57 |     to test a new recipe and confirm the metadata fields are correct.
58 | 
59 |     When \code{insert = TRUE}, the "AH_SERVER_POST_URL" global option must
60 |     be set to the https location of the AnnotationHubServer in the global
61 |     environment or .Rprofile. Additionally, azcopy command line tools
62 |     must be installed on the local machine to push files to Azure buckets.
63 |     See \link{upload_to_azure}.
64 |   }
65 |   \item{justRunUnitTest}{
66 |     A \code{logical}. When TRUE, a small number of records (usually 5) are
67 |     processed instead of all.
68 |   }
69 |   \item{allAhms}{
70 |     List of \code{AnnotationHubMetadata} objects.
71 |   }
72 |   \item{url}{
73 |     URL of the AnnotationHub database where metadata will be inserted.
74 |   }
75 |   \item{uploadToRemote}{
76 |     A \code{logical} indicating whether resources should be uploaded
77 |     to the remote Bioconductor default location (currently Azure Data Lakes).
78 |   }
79 |   \item{download}{
80 |     A \code{logical} indicating whether resources should be downloaded from
81 |     the resource url.
82 |   }
83 |   \item{\dots}{
84 |     Arguments passed to other methods such as \code{regex}, \code{baseUrl},
85 |     \code{baseDir}.
86 |   }
87 | }
88 | 
89 | \details{
90 |   \itemize{
91 |     \item updateResources:
92 | 
93 |     \code{updateResources} is responsible for creating metadata records
94 |     and downloading, processing and pushing data files to their final
95 |     resting place. The \code{preparerClasses} argument is used in method
96 |     dispatch to determine which recipe is used.
97 | 
98 |     By manipulating the \code{metadataOnly}, \code{insert} and
99 |     \code{justRunUnitTest} arguments one can flexibly test the metadata
100 |     for a small number of records with or without downloading and
101 |     processing the data files.
102 | 
103 | 
104 |     \item global options:
105 | 
106 |     When \code{insert = TRUE} the "AH_SERVER_POST_URL" option must be
107 |     set to the https location of the AnnotationHub db.
108 | } 109 | } 110 | 111 | \value{ 112 | A list of \code{AnnotationHubMetadata} objects. 113 | } 114 | 115 | \author{Martin Morgan, Marc Carlson} 116 | 117 | \seealso{ 118 | \itemize{ 119 | \item \link{AnnotationHubMetadata} 120 | \item \link{upload_to_azure} 121 | } 122 | } 123 | 124 | \examples{ 125 | 126 | \dontrun{ 127 | 128 | ## ----------------------------------------------------------------------- 129 | ## Inspect metadata: 130 | ## ----------------------------------------------------------------------- 131 | ## A useful first step in testing a new recipe is to generate and 132 | ## inspect a small number of metadata records. The combination of 133 | ## 'metadataOnly=TRUE', 'insert=FALSE' and 'justRunUnitTest=TRUE' 134 | ## generates metadata for the first 5 records and does not download or 135 | ## process any data. 136 | 137 | meta <- updateResources("/local/path", 138 | BiocVersion = "3.3", 139 | preparerClasses = "EnsemblFastaImportPreparer", 140 | metadataOnly = TRUE, insert = FALSE, 141 | justRunUnitTest = TRUE, 142 | release = "84") 143 | 144 | INFO [2015-11-12 07:58:05] Preparer Class: EnsemblFastaImportPreparer 145 | Ailuropoda_melanoleuca.ailMel1.cdna.all.fa.gz 146 | Ailuropoda_melanoleuca.ailMel1.dna_rm.toplevel.fa.gz 147 | Ailuropoda_melanoleuca.ailMel1.dna_sm.toplevel.fa.gz 148 | Ailuropoda_melanoleuca.ailMel1.dna.toplevel.fa.gz 149 | Ailuropoda_melanoleuca.ailMel1.ncrna.fa.gz 150 | 151 | ## The return value is a list of metadata for the first 5 records: 152 | 153 | > names(meta) 154 | [1] "FASTA cDNA sequence for Ailuropoda melanoleuca" 155 | [2] "FASTA DNA sequence for Ailuropoda melanoleuca" 156 | [3] "FASTA DNA sequence for Ailuropoda melanoleuca" 157 | [4] "FASTA DNA sequence for Ailuropoda melanoleuca" 158 | [5] "FASTA ncRNA sequence for Ailuropoda melanoleuca" 159 | 160 | 161 | ## Each record is of class AnnotationHubMetadata: 162 | 163 | > class(meta[[1]]) 164 | [1] "AnnotationHubMetadata" 165 | attr(,"package") 166 | [1] "AnnotationHubData" 167 | 168 | ## ----------------------------------------------------------------------- 169 | ## Insert metadata in the db and process/push data files: 170 | ## ----------------------------------------------------------------------- 171 | ## This next code chunk creates the metadata and downloads and processes 172 | ## the data (metadataOnly=FALSE). If all files are successfully pushed to 173 | ## to their final resting place, metadata records are inserted in the 174 | ## AnnotationHub db (insert=TRUE). Metadata insertion is done by a 175 | ## Bioconductor team member; contact maintainer@bioconductor.org for help. 176 | 177 | meta <- updateResources("local/path", 178 | BiocVersion = "3.5", 179 | preparerClasses = "EnsemblFastaImportPreparer", 180 | metadataOnly = FALSE, insert = TRUE, 181 | justRunUnitTest = FALSE, 182 | regex = ".*release-81") 183 | 184 | ## ----------------------------------------------------------------------- 185 | ## Recovery helpers: 186 | ## ----------------------------------------------------------------------- 187 | 188 | ## pushResources() and pushMetadata() are both called from updateResources() 189 | ## but can be used solo for testing or completing a run that 190 | ## terminated unexpectedly. 
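## pushMetadata() posts to the URL held in the "AH_SERVER_POST_URL" option;
## the real value is internal to Bioconductor, so a purely hypothetical
## setting would look like:
## options(AH_SERVER_POST_URL = "https://annotationhub.example.org/metadata")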
191 | 
192 | ## Download, process and push to azure the last 2 files in 'meta':
193 | sub <- tail(meta, 2)
194 | pushResources(sub)
195 | 
196 | ## Insert metadata in the AnnotationHub db for the last 2 files in 'meta':
197 | 
198 | pushMetadata(sub, url = getOption("AH_SERVER_POST_URL"))
199 | }
200 | 
201 | }
202 | 
203 | \keyword{methods}
--------------------------------------------------------------------------------
/R/makeEnsemblFasta.R:
--------------------------------------------------------------------------------
1 | ### =========================================================================
2 | ### makeEnsemblFastaAHM() and ensemblFastaToFaFile()
3 | ### -------------------------------------------------------------------------
4 | ###
5 | 
6 | ## Adjust this expression in order to save painful re-processing of older files.
7 | ## .ensemblReleaseRegex <- ".*release-(69|7[[:digit:]]|8[[:digit:]])"
8 | ## .ensemblReleaseRegex <- ".*release-(79|8[[:digit:]])"
9 | ## for a speed run just do one set
10 | ## .ensemblReleaseRegex <- ".*release-81"
11 | 
12 | ## list directories below url/dir satisfying regex
13 | .ensemblDirUrl <-
14 |     function(url, dir, regex)
15 | {
16 |     lst <- .listRemoteFiles(url)
17 |     releases <- paste0(url, lst)
18 |     paste(grep(regex, releases, value=TRUE), dir, sep="/")
19 | }
20 | 
21 | ## NOTE: httr >= 1.2.0 doesn't support ftp last modified date and size
22 | ## FIXME: This should be combined with .httrFileInfo() and .ftpFileInfo()
23 | .ensemblMetadataFromUrl <- function(sourceUrl, twobit=FALSE, http=FALSE) {
24 |     releaseRegex <- ".*(release-[[:digit:]]+).*"
25 |     if (!twobit){
26 |         title <- sub("\\.gz$", "", basename(sourceUrl))
27 |     }else{
28 |         title <- sub("\\.fa\\.gz$", ".2bit", basename(sourceUrl))
29 |     }
30 |     root <- setNames(rep(NA_character_, length(sourceUrl)), title)
31 | 
32 |     releaseNum <- sub("release-", "", sub(releaseRegex, "\\1", sourceUrl[1]))
33 | 
34 |     # as of release 96 a file is present with species index for mappings
35 |     species_index <- GenomeInfoDb:::fetch_species_index_from_Ensembl_FTP(release=releaseNum)
36 | 
37 |     species <- vapply(strsplit(sourceUrl, '/'), function(x) x[[7]], character(1))
38 |     genome <- vapply(species, FUN.VALUE=character(1), USE.NAMES=FALSE,
39 |                      FUN=function(spc, tbl){
40 |                          message(spc, "\n")
41 |                          tbl[tbl$species == spc, "assembly"]
42 |                      }, tbl=species_index)
43 |     taxonomyId <- vapply(species, FUN.VALUE=integer(1), USE.NAMES=FALSE,
44 |                          FUN=function(spc, tbl){
45 |                              message(spc, "\n")
46 |                              tbl[tbl$species == spc, "taxonomy_id"]
47 |                          }, tbl=species_index)
48 | 
49 |     species <- sub("_", " ", species,fixed=TRUE)
50 | 
51 |     if (http) {
52 |         ftpInfo <- .httrFileInfo(sourceUrl)
53 |         sourceSize <- ftpInfo$size
54 |         sourceLastModDate <- ftpInfo$date
55 |     } else {
56 |         sourceSize <- as.numeric(NA)
57 |         sourceLastModDate <- as.POSIXct(NA)
58 |     }
59 | 
60 |     list(annotationHubRoot = root, title=title, species = species,
61 |          taxonomyId = as.integer(taxonomyId),
62 |          genome = genome,
63 |          sourceSize=sourceSize,
64 |          sourceLastModifiedDate=sourceLastModDate,
65 |          sourceVersion = sub(releaseRegex, "\\1", sourceUrl))
66 | }
67 | 
68 | .ensemblFastaTypes <-
69 |     c("cdna\\.all", "dna_rm\\.toplevel", "dna_sm\\.toplevel",
70 |       "dna\\.toplevel", "ncrna", "pep\\.all")
71 | 
72 | ## get urls
73 | .ensemblFastaSourceUrls <-
74 |     function(baseUrl, baseDir, regex, baseTypes=.ensemblFastaTypes)
75 | {
76 |     want <- .ensemblDirUrl(baseUrl, baseDir, regex)
77 | 
78 |     .processUrl <- function(url) {
79 |         listing <-
.ftpDirectoryInfo(url)
80 | 
81 |         subdirIdx <- grepl(".*/.*:", listing)
82 |         subdir <- sub("^.{2}(.*):$", "\\1", listing[subdirIdx])
83 |         fileTypes <- paste(baseTypes, collapse="|")
84 |         pat <- sprintf(".*(%s)\\.fa\\.gz$", fileTypes)
85 | 
86 |         fastaIdx <- grepl(pat, listing)
87 |         fasta <- sub(".* ", "", listing[fastaIdx])
88 | 
89 |         ## match subdir w/ fasta
90 |         subdir <- subdir[cumsum(subdirIdx)[fastaIdx]]
91 | 
92 |         ## Prefer "primary_assembly" to "toplevel" resources.
93 |         organisms <- unique(sub("(.+?)\\..*", "\\1", fasta, perl=TRUE))
94 |         keepIdxList <- sapply(organisms, function(x) {
95 |             orgFiles <- fasta[grep(paste0("^", x, "\\."), fasta)]
96 |             reBoth <- paste0("dna", c("_rm", "_sm", ""),
97 |                              "\\.(primary_assembly|toplevel)\\.")
98 |             toplevelIdx <-
99 |                 sapply(reBoth, function(x) length(grep(x, orgFiles)) > 1)
100 |             reToplevel <- paste0("dna", c("_rm", "_sm", ""),
101 |                                  "\\.toplevel\\.")[toplevelIdx]
102 | 
103 |             isRedundant <-
104 |                 sapply(reToplevel, function(x) grepl(x, orgFiles))
105 |             retVal <- rep(TRUE, length(orgFiles))
106 |             if (!is.null(dim(isRedundant))) {
107 |                 retVal <- !apply(isRedundant, 1, any)
108 |             }
109 | 
110 |             retVal
111 |         })
112 |         keepIdx <- base::unlist(keepIdxList)
113 |         fasta <- fasta[keepIdx]
114 |         subdir <- subdir[keepIdx]
115 | 
116 |         sprintf("%s%s/%s", url, subdir, fasta)
117 |     }
118 |     res <- base::unlist(lapply(want, .processUrl), use.names=FALSE)
119 | 
120 |     if (length(res) == 0) {
121 |         txt <- sprintf("no fasta files at %s",
122 |                        paste(sQuote(want), collapse=", "))
123 |         stop(paste(strwrap(txt, exdent=2), collapse="\n"))
124 |     }
125 |     res
126 | }
127 | 
128 | ## metadata generator
129 | makeEnsemblFastaToAHM <-
130 |     function(currentMetadata, baseUrl = "ftp://ftp.ensembl.org/pub/",
131 |              baseDir = "fasta/", release,
132 |              justRunUnitTest = FALSE, BiocVersion = BiocManager::version())
133 | {
134 |     time1 <- Sys.time()
135 |     regex <- paste0(".*release-", release)
136 |     sourceUrl <- .ensemblFastaSourceUrls(baseUrl, baseDir, regex)
137 |     if (justRunUnitTest)
138 |         sourceUrl <- sourceUrl[1:5]
139 | 
140 |     sourceFile <- sub(baseUrl, "ensembl/", sourceUrl)
141 |     meta <- .ensemblMetadataFromUrl(sourceUrl)
142 |     dnaType <- local({
143 |         x <- basename(dirname(sourceFile))
144 |         sub("(dna|rna)", "\\U\\1", x, perl=TRUE)
145 |     })
146 |     description <- paste("FASTA", dnaType, "sequence for", meta$species)
147 | 
148 |     ## rdatapaths db table needs an extra row for the index file
149 |     rdataPath <- sub(".gz$", ".bgz", sourceFile)
150 |     rdps <- rep(rdataPath, each=3)
151 |     rdatapaths <- split(rdps, f=as.factor(rep(1:length(rdataPath),each=3)))
152 |     ## second record of each set becomes the '.fai' index, the third the '.gzi'
153 |     rdatapaths <- lapply(rdatapaths,
154 |                          function(x){x[2] <- paste0(x[2],".fai") ; x[3] <-
155 |                              paste0(x[3],".gzi") ; return(x)})
156 | 
157 |     Map(AnnotationHubMetadata,
158 |         Description=description,
159 |         Genome=meta$genome,
160 |         RDataPath=rdatapaths,
161 |         SourceUrl=sourceUrl,
162 |         SourceVersion=meta$sourceVersion,
163 |         Species=meta$species,
164 |         TaxonomyId=meta$taxonomyId,
165 |         Title=meta$title,
166 |         SourceSize=meta$sourceSize,
167 |         SourceLastModifiedDate=meta$sourceLastModifiedDate,
168 |         MoreArgs=list(
169 |             BiocVersion=BiocVersion,
170 |             Coordinate_1_based = TRUE,
171 |             DataProvider="Ensembl",
172 |             Maintainer = "Bioconductor Maintainer ",
173 |             SourceType="FASTA",
174 |             DispatchClass="FaFile",
175 |             RDataClass=c("FaFile", "FaFile", "FaFile"),
176 |             RDataDateAdded=Sys.time(),
177 |             Recipe="AnnotationHubData:::ensemblFastaToFaFile",
178 |             Tags=c("FASTA",
"ensembl", "sequence"))) 179 | } 180 | 181 | ## Used in makeEnsemblFastaAHM() and makeGencodeFastaToAHM(): 182 | ## Unzips .gz file, indexes it and saves as .rz and .rz.fai. 183 | .fastaToFaFile <- function(ahm) 184 | { 185 | ## target output file 186 | faOut <- outputFile(ahm)[[1]] 187 | srcFile <- sub('.bgz$','.gz',faOut) 188 | ## unzip and index 189 | bgzip(srcFile) 190 | indexFa(faOut) 191 | } 192 | 193 | ensemblFastaToFaFile <- function(ahm) 194 | { 195 | .fastaToFaFile(ahm) 196 | } 197 | 198 | ## create dispatch class and newResources() method 199 | makeAnnotationHubResource("EnsemblFastaImportPreparer", makeEnsemblFastaToAHM) 200 | -------------------------------------------------------------------------------- /R/trackWithAuxiliaryTableToGRangesRecipe.R: -------------------------------------------------------------------------------- 1 | .makeAuxTable <- function(n, auxFiles, ahm){ 2 | 3 | colClasses <- metadata(ahm)$RecipeArgs$auxColClasses[n][[1]]$cols 4 | auxFile <- auxFiles[n] 5 | tbl.aux <- read.table(auxFile, sep="\t", colClasses=colClasses) 6 | colnames(tbl.aux) <- names(colClasses) 7 | tbl.aux 8 | } 9 | 10 | .getMergeArgs <- function(n, ahm){ 11 | metadata(ahm)$RecipeArgs$auxColClasses[n][[1]]$merge 12 | } 13 | 14 | 15 | ## from FILES (with json) 16 | trackWithAuxiliaryTablesToGRanges <- function(ahm) 17 | { 18 | mainFile <- inputFiles(ahm)[1] ## always the 1st one? - discuss with Dan and Paul 19 | auxFiles <- inputFiles(ahm)[-1] 20 | if(!(length(mainFile) == 1)) stop("No files present in input json.") 21 | if(!(length(auxFiles) >= 1)) stop("No auxiliary files listed in input json. Wrong recipe?") 22 | 23 | colClasses <- metadata(ahm)$RecipeArgs$mainColClasses 24 | tbl.main <- read.table(gzfile(mainFile), sep="\t", header=FALSE, 25 | colClasses=colClasses) 26 | colnames(tbl.main) <- names(colClasses) 27 | 28 | auxLen <- length(auxFiles) 29 | ## a couple for loops because we need to know 'n'... 
30 | auxTabs <- list() 31 | for(i in seq_len(auxLen)){ 32 | auxTabs[[i]] <- .makeAuxTable(i, auxFiles, ahm) 33 | } 34 | mergeArgs <- list() 35 | for(i in seq_len(auxLen)){ 36 | mergeArgs[[i]] <- .getMergeArgs(i, ahm) 37 | } 38 | 39 | ## merge together uses for loop again (to concentrate result down to one thing) 40 | for(i in seq_len(auxLen)){ 41 | if(i ==1){tbl <- tbl.main} 42 | ## otherwise recycle 43 | tbl <- merge(tbl, auxTabs[[i]], by.x=mergeArgs[[i]][["byX"]], 44 | by.y=mergeArgs[[i]][["byY"]], 45 | all.x=TRUE) 46 | } 47 | 48 | tbl <- .sortTableByChromosomalLocation(tbl) 49 | colnames <- colnames(tbl) 50 | requiredColnames <- c("seqname", "start", "end") 51 | stopifnot(all(requiredColnames %in% colnames)) 52 | otherColnames <- setdiff(colnames, requiredColnames) 53 | 54 | ## drop any rows withouth a seqname 55 | tbl <- tbl[!is.na(tbl$seqname),] 56 | 57 | if("strand" %in% otherColnames){ 58 | gr <- with(tbl, GRanges(seqname, IRanges(start, end), strand)) 59 | otherColnames <- setdiff(colnames, c(requiredColnames,"strand")) 60 | }else{ 61 | gr <- with(tbl, GRanges(seqname, IRanges(start, end))) 62 | } 63 | 64 | mcols(gr) <- DataFrame(tbl[, otherColnames]) 65 | 66 | # add seqlength & chromosome circularity information 67 | newSeqInfo <- constructSeqInfo(metadata(ahm)$Species, 68 | metadata(ahm)$Genome) 69 | # if gr only has a subset of all possible chromosomes, 70 | # then update those only 71 | seqinfo(gr) <- newSeqInfo[names(seqinfo(gr))] 72 | 73 | save(gr, file=outputFile(ahm)) 74 | if (!getOption("AnnotationHub_Use_Disk", FALSE)) { 75 | upload_to_S3(outputFile(ahm), metadata(ahm)$RDataPath) 76 | } 77 | 78 | outputFile(ahm) 79 | 80 | } # trackWithAuxiliaryTableToGRanges 81 | #------------------------------------------------------------------------------- 82 | 83 | 84 | 85 | 86 | ## helper to remove 'id' col 87 | .removeId <- function(table){ 88 | newColnames <- setdiff(colnames(table), "id") 89 | table[,newColnames] 90 | } 91 | ## compress a whole table one col at a time. 92 | ## This (currently) assumes all cols should be characters 93 | .compressTable <- function(table, levels){ 94 | sf <- factor(table$id,levels=levels) 95 | table <- .removeId(table) 96 | res <- DataFrame() 97 | for(i in seq_len(ncol(table))){ ## for ea. column 98 | col <- splitAsList(as.character(table[[i]]), f=sf) 99 | if(i==1){ 100 | res <- DataFrame(col) 101 | }else{ 102 | res <- DataFrame(res, DataFrame(col)) ## cbind doesn't work? 103 | } 104 | } 105 | colnames(res) <- colnames(table) 106 | res 107 | } 108 | 109 | 110 | .makeComplexGR <- function(tbl,auxFiles,auxTabs){ 111 | ## replace "chrom" with "seqnames". 
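    ## (UCSC track tables name these columns chrom/chromStart/chromEnd; the
    ## GRanges construction below expects seqname/start/end, hence the renames.)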
112 |     colnames(tbl)[colnames(tbl) %in% "chrom"] <- "seqname"
113 |     colnames(tbl)[colnames(tbl) %in% "chromStart"] <- "start"
114 |     colnames(tbl)[colnames(tbl) %in% "chromEnd"] <- "end"
115 | 
116 |     tbl <- .sortTableByChromosomalLocation(tbl)
117 |     colnames <- colnames(tbl)
118 |     requiredColnames <- c("seqname", "start", "end")
119 |     stopifnot(all(requiredColnames %in% colnames))
120 |     otherColnames <- setdiff(colnames, requiredColnames)
121 | 
122 |     ## drop any rows without a seqname
123 |     tbl <- tbl[!is.na(tbl$seqname),]
124 | 
125 |     if("strand" %in% otherColnames){
126 |         gr <- with(tbl, GRanges(seqname, IRanges(start, end), strand))
127 |         otherColnames <- setdiff(colnames, c(requiredColnames,"strand"))
128 |     }else{
129 |         gr <- with(tbl, GRanges(seqname, IRanges(start, end)))
130 |     }
131 |     ## append the initial mcols
132 |     mcols(gr) <- DataFrame(tbl[, otherColnames])
133 | 
134 | 
135 | 
136 |     ## make a splitting factor based on the initial table
137 |     splitFactor <- factor(tbl$id, levels=tbl$id)
138 | 
139 |     for(i in seq_along(auxFiles)){
140 |         new <- auxTabs[[i]]
141 |         if(identical(as.character(splitFactor), as.character(new$id))){
142 |             ## Add it in
143 |             mcols(gr) <- DataFrame(mcols(gr), .removeId(new))
144 |         }else{## otherwise compress it 1st
145 |             mcols(gr) <- DataFrame(mcols(gr),.compressTable(new,
146 |                                                             levels(splitFactor)))
147 |         }
148 |     }
149 |     gr
150 | }
151 | 
152 | ## Track AND auxiliary tables.
153 | ## Unfortunately, the schemas for some tracks are complex.
154 | ## This means that in the future I will have to use ucscSchema etc. to
155 | ## get the additional information so that I can properly assemble them.
156 | ## For now, we will check for "id" and only proceed if all tables have this.
157 | trackandTablesToGRangesRecipe <- function(ahm)
158 | {
159 |     session <- browserSession()
160 |     genome <- metadata(ahm)$Genome
161 |     genome(session) <- genome
162 |     sourceFile <- metadata(ahm)$SourceFile
163 |     track <- sub("^.+/database/","",sourceFile)
164 |     query <- ucscTableQuery(session, track)
165 |     tableNames <- tableNames(query)
166 | 
167 |     mainFile <- tableNames[1] ## always the 1st one, taken to be the main table
168 |     auxFiles <- tableNames[-1]
169 |     if(!(length(auxFiles) >= 1)) { ## this means we are done already
170 |         gr <- track(query)
171 |     }else{ ## have to do a merge 1st
172 |         ## have to "get" primary in table form to assure "id" will be present
173 |         tbl <- getTable(ucscTableQuery(session, mainFile))
174 | 
175 |         ## Now get the other tables
176 |         auxTabs <- list()
177 |         for(i in seq_along(auxFiles)){
178 |             ## query <- ucscTableQuery(session, track)
179 |             tableName(query) <- auxFiles[i]
180 |             auxTabs[[i]] <- getTable(query)
181 |         }
182 | 
183 |         allColNames <- list()
184 |         allColNames[[1]] <- colnames(tbl)
185 |         for(i in seq_len(length(auxTabs))){
186 |             idx <- i+1
187 |             #print(idx)
188 |             allColNames[[idx]] <- colnames(auxTabs[[i]])
189 |         }
190 |         ## for each element is there a value called "id"?
191 | idPresent <- unlist(lapply(allColNames, function(x){'id' %in% x})) 192 | 193 | if(all(idPresent)){ 194 | gr <- .makeComplexGR(tbl,auxFiles,auxTabs) 195 | }else{ 196 | message("track schema is too complex: using basic track instead") 197 | query <- ucscTableQuery(session, track) 198 | gr <- track(query) 199 | } 200 | 201 | } 202 | 203 | 204 | ## ## add seqlength & chromosome circularity information 205 | ## newSeqInfo <- constructSeqInfo(metadata(ahm)$Species, 206 | ## metadata(ahm)$Genome) 207 | ## ## if gr only has a subset of all possible chromosomes, 208 | ## ## then update those only 209 | ## seqinfo(gr) <- newSeqInfo[names(seqinfo(gr))] 210 | save(gr, file=outputFile(ahm)) 211 | outputFile(ahm) 212 | 213 | } # trackandTablesToGRangesRecipe 214 | #------------------------------------------------------------------------------- 215 | 216 | #------------------------------------------------------------------------------- 217 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | ### ========================================================================= 2 | ### Utility functions 3 | ### ------------------------------------------------------------------------- 4 | ### 5 | 6 | constructSeqInfo <- function(species, genome) 7 | { 8 | recognized.human <- species=="Homo sapiens" & genome %in% c("hg18", "hg19") 9 | recognized.mouse <- species=="Mus musculus" & genome %in% c("mm10") 10 | recognized <- recognized.human | recognized.mouse 11 | stopifnot(recognized) 12 | 13 | suppressMessages({ 14 | # chroms 1-22, X, Y, M are assumed to be the first 25 rows of the 15 | # data.frame 16 | if(recognized.human) 17 | tbl.chromInfo = 18 | GenomicFeatures:::.makeUCSCChrominfo (genome, 19 | circ_seqs="chrM") [1:25,] 20 | if(recognized.mouse) 21 | tbl.chromInfo = 22 | GenomicFeatures:::.makeUCSCChrominfo (genome, 23 | circ_seqs="chrM") [1:22,] 24 | 25 | }) 26 | 27 | Seqinfo(as.character(tbl.chromInfo$chrom), 28 | seqlengths=tbl.chromInfo$length, 29 | isCircular=tbl.chromInfo$is_circular, 30 | genome=genome) 31 | } 32 | 33 | .sortTableByChromosomalLocation <- function(tbl) 34 | { 35 | stopifnot (all (c ('seqname', 'start') %in% colnames (tbl))) 36 | factor.chromNames <- factor (tbl$seqname, 37 | levels=paste("chr", c(1:22, "X", "Y", "M"), 38 | sep='')) 39 | tbl$seqname <- factor.chromNames 40 | tbl <- tbl [order (tbl$seqname, tbl$start), ] 41 | invisible (tbl) 42 | 43 | } 44 | 45 | .printf <- function(...) print(noquote(sprintf(...))) 46 | 47 | ## from ?grep, by Luke Tierney 48 | URL_parts <- function(x) 49 | { 50 | m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x) 51 | parts <- do.call(rbind, 52 | lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L))) 53 | colnames(parts) <- c("protocol","host","port","path") 54 | parts 55 | } 56 | 57 | ## log messages to console AND to a file 58 | flog <- function(level, ...) 59 | { 60 | loggerFunction <- switch(level, 61 | "flog.fatal", 62 | "flog.error", 63 | "noop", 64 | "flog.warn", 65 | "noop", 66 | "flog.info", 67 | "noop", 68 | "flog.debug", 69 | "flog.trace" 70 | ) 71 | dots <- list(...) 72 | do.call(loggerFunction, dots) 73 | dots$name <- "file" 74 | do.call(loggerFunction, dots) 75 | } 76 | 77 | ## Uploading to S3 usually happens in AnnotationHubServer, but 78 | ## when running the track*Recipe recipes, it happens in 79 | ## AnnotationHubData. There is an RAmazonS3 R package but 80 | ## it does not work well for uploading files. 
Therefore this 81 | ## function expects the AWS CLI to be installed. 82 | ## See: https://aws.amazon.com/cli/ 83 | ## It should be configured with a user who can write to 84 | ## the appropriate bucket. 85 | upload_to_S3 <- 86 | function(file, remotename, bucket, profile, acl="public-read") 87 | { 88 | remotename <- sub("^\\/", "", remotename) 89 | #aws --profile ahs_content_uploader s3 cp --acl public-read test s3://annotationhub/loquat/vato/manichean/test 90 | profileStr <- " " 91 | if (!missing(profile)) 92 | profileStr <- paste("--profile ", profile) 93 | 94 | cmd <- "aws" 95 | if (length(file) != length(remotename)) 96 | stop("Length of file does not match length of remotename!") 97 | 98 | for (i in 1:length(file)) { 99 | thisFile <- file[i] 100 | thisRemoteName <- remotename[i] 101 | quotes = getOption("useFancyQuotes") 102 | on.exit(options(useFancyQuotes=quotes)) 103 | options(useFancyQuotes=FALSE) 104 | args <- sprintf("%s s3 cp --region us-east-1 --acl %s %s s3://%s/%s", 105 | profileStr, acl, dQuote(thisFile), bucket, dQuote(thisRemoteName)) 106 | res <- system2(cmd, args) 107 | if (res != 0) 108 | stop(sprintf("Failed to upload %s to S3! Result was %s.", file, res)) 109 | } 110 | 111 | TRUE 112 | } 113 | 114 | 115 | ## new function to upload to azure 116 | 117 | upload_to_azure <- 118 | function(file, sas) 119 | { 120 | if(missing(sas)){ 121 | sas = Sys.getenv("AZURE_SAS_URL", NA_character_) 122 | } 123 | if(is.na(sas)){ 124 | stop("AZURE_SAS_URL environment variables is not set or given") 125 | } 126 | stopifnot(startsWith(prefix="https", sas)) 127 | if(Sys.which("azcopy") == ""){ 128 | stop("Please download azcopy") 129 | } 130 | 131 | args <- paste0("copy --recursive ", file, " '", sas, "'") 132 | res <- system2("azcopy", args) 133 | if (res != 0) stop(sprintf("Failed to upload %s to Azure!", file)) 134 | TRUE 135 | } 136 | 137 | 138 | 139 | globalVariables(c("futile.logger")) 140 | 141 | .onLoad <- 142 | function(libname, pkgname) 143 | { 144 | logDir <- file.path(Sys.getenv("HOME"), 145 | sprintf(".%s", pkgname)) 146 | if (!file.exists(logDir)) 147 | { 148 | .printf("Creating log directory %s", logDir) 149 | dir.create(logDir) 150 | } 151 | l <- library 152 | l(futile.logger) 153 | flog.threshold(TRACE) 154 | flog.appender(appender.file(file.path(logDir, 155 | sprintf("%s.log", pkgname))), name="file") 156 | } 157 | 158 | 159 | `%_%` <- function(a, b) paste0(a, b) 160 | 161 | 162 | # Create "pointer" variables for large data sets. 
163 | ptr <- pointer <- function(..., pos=-1, envir=as.environment(pos), 164 | namedList=TRUE, expandCharacter=FALSE) 165 | { 166 | variableList <- tail(as.list(match.call()), -1) 167 | 168 | if (length(variableList) == 0) 169 | stop("Must supply reference object.") 170 | 171 | exclusions <- intersect(names(variableList), setdiff(names(formals()), 172 | "...")) 173 | for (exclusion in exclusions) 174 | variableList[[exclusion]] = NULL 175 | 176 | if (length(variableList) == 0) 177 | stop("Must supply reference object.") 178 | 179 | if (expandCharacter) { 180 | temp = character() 181 | for (variable in variableList) { 182 | if (typeof(variable) == "character") 183 | temp <- c(temp, variable) 184 | else if (typeof(variable) == "symbol") { 185 | evaluatedVariable <- eval(variable) 186 | if (typeof(evaluatedVariable) == "character") 187 | temp <- c(temp, evaluatedVariable) 188 | else if (is.environment(evaluatedVariable)) { 189 | for (name in ls(evaluatedVariable)) 190 | temp <- c(temp, variable %_% "$" %_% name) 191 | } 192 | else 193 | temp <- c(temp, as.character(variable)) 194 | } 195 | } 196 | pointerNames <- temp 197 | } 198 | else 199 | pointerNames <- as.character(variableList) 200 | 201 | returnList <- list() 202 | for (pointerName in pointerNames) { 203 | e <- envir 204 | pName <- pointerName 205 | 206 | reEnv <- "^(.+?)\\$(.+?)$" 207 | envMatch <- regexec(reEnv, pointerName) 208 | envMatches <- NULL 209 | if (envMatch[[1]][1] != -1) { 210 | envMatches <- regmatches(pointerName, envMatch)[[1]][2:3] 211 | e <- get(envMatches[1]) 212 | pName <- envMatches[2] 213 | } 214 | 215 | p <- list() 216 | p$object <- e 217 | p$name <- as.character(pName) 218 | class(p) <- "pointer" 219 | 220 | index <- length(returnList) + 1 221 | if (namedList) index <- p$name 222 | 223 | returnList[[index]] <- p 224 | } 225 | 226 | if (length(returnList) == 1) 227 | return (returnList[[1]]) 228 | 229 | return (returnList) 230 | } 231 | 232 | as.pointer <- function(x) 233 | { 234 | pointer(x) 235 | } 236 | 237 | is.pointer <- function(x) 238 | { 239 | return (inherits(x, "pointer")) 240 | } 241 | 242 | .. <- deref <- function(x) 243 | { 244 | if (is.environment(x)) return (x) 245 | else return (get(x$name, envir=x$object)) 246 | } 247 | `..<-` <- `deref<-` <- function(x, value) 248 | { 249 | if (is.pointer(x)) assign(x$name, value, envir=x$object) 250 | return (x) 251 | } 252 | 253 | print.pointer <- function(x, ...) 254 | { 255 | environment.name <- capture.output(print(x$object)) 256 | cat("Pointer to variable '", x$name, "' in ", environment.name, ":\n\n", sep="") 257 | str(..(x), ...) 258 | } 259 | 260 | ## usage: 261 | # x <- list(frog="frog", fish="~frog") 262 | # z <- pointer(x) 263 | # ..(z) 264 | # ..(z)$fish <- "trout" 265 | # ..(z) 266 | # x 267 | -------------------------------------------------------------------------------- /R/makeGencodeGFF.R: -------------------------------------------------------------------------------- 1 | # recipe to get GFF3 files from Genecode. 
2 | # important links
3 | # http://www.gencodegenes.org/releases/
4 | # ftp site: ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human
5 | # readme files for the Gencode project:
6 | # ftp://ftp.sanger.ac.uk/pub/gencode/README.txt
7 | # ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/_README.TXT
8 | 
9 | 
10 | 
11 | # for gff3 files
12 | # gencode.vX.annotation.gff3.gz
13 | # gencode.vX.chr_patch_hapl_scaff.annotation.gff3.gz
14 | # gencode.vX.polyAs.gff3.gz
15 | # gencode.vX.2wayconspseudos.gff3.gz
16 | # gencode.vX.long_noncoding_RNAs.gff3.gz
17 | # gencode.vX.tRNAs.gff3.gz
18 | 
19 | 
20 | # Only gff3 files will be added - since both gtf and gff3 contain the
21 | # same data, but gff3 is better (Herve). These files will not be stored
22 | # as GRanges on Amazon S3.
23 | 
24 | .gencodeBaseUrl <- "ftp://ftp.ebi.ac.uk/pub/databases/gencode/"
25 | 
26 | .gencodeFileFromUrl <- function(urls) {
27 |     unlist(sapply(urls, function(url) {
28 |         listing <- .ftpDirectoryInfo(url)
29 | 
30 |         ## find entries marking directory
31 |         idx <- grepl("^./", listing)
32 |         tag <- sub("./(.*):", "\\1/", listing[idx])
33 |         directory <- c("", tag)[cumsum(idx) + 1L]
34 |         ## complete URL
35 |         idx <- grepl("gencode", listing)
36 |         paste0(url, directory, sub(".*gencode", "gencode", listing))[idx]
37 | 
38 |     }, USE.NAMES=FALSE))
39 | }
40 | 
41 | .gencodeDescription <- function(fileurls){
42 |     # add description map here.
43 |     map <- c(
44 |         annotation.gff3.gz=.expandLine("Gene annotations
45 |             on reference chromosomes from Gencode"),
46 |         chr_patch_hapl_scaff.annotation.=.expandLine("Gene annotation
47 |             on reference-chromosomes/patches/scaffolds/haplotypes from Gencode"),
48 |         polyAs=.expandLine("files contain polyA signals, polyA sites and
49 |             pseudo polyAs manually annotated by HAVANA from only the reference
50 |             chromosome"),
51 |         wayconspseudos=.expandLine("pseudogenes predicted by the Yale
52 |             & UCSC pipelines, but not by Havana on reference chromosomes"),
53 |         long_noncoding_RNAs=.expandLine("sub-set of the main annotation files
54 |             on the reference chromosomes. They contain only the lncRNA genes.
55 |             Long non-coding RNA genes are considered the genes with any of
56 |             those biotypes: 'processed_transcript', 'lincRNA',
57 |             '3prime_overlapping_ncrna', 'antisense', 'non_coding',
58 |             'sense_intronic' , 'sense_overlapping' , 'TEC' , 'known_ncrna'."),
59 |         tRNAs =.expandLine("tRNA structures predicted by tRNA-Scan on
60 |             reference chromosomes"),
61 |         transcripts.fa.gz=.expandLine("Protein-coding transcript sequences
62 |             on reference chromosomes Fasta file"),
63 |         translations.fa.gz=.expandLine("Translations of protein-coding
64 |             transcripts on reference chromosomes Fasta file"),
65 |         lncRNA_transcripts.fa.gz=.expandLine("Long non-coding RNA
66 |             transcript sequences on reference chromosomes Fasta file."),
67 |         unmapped=.expandLine("Unmapped")
68 |     )
69 |     description <- character(length(fileurls))
70 |     for (i in seq_along(map))
71 |         description[grep(names(map)[i], fileurls)] <- map[[i]]
72 | 
73 |     description
74 | }
75 | 
76 | .gencodeGenome <- function(species, release) {
77 |     # this information is curated from Gencode's website
78 |     # link - http://www.gencodegenes.org/releases/
79 |     if (species=="Human")
80 |         tblurl <- "https://www.gencodegenes.org/human/releases"
81 |     else
82 |         tblurl <- "https://www.gencodegenes.org/mouse/releases"
83 | 
84 |     ## read in the table
85 |     tryCatch({
86 |         http <- RCurl::getURL(tblurl)
87 |         tbl <- XML::readHTMLTable(http, header=TRUE, stringsAsFactors=FALSE)
88 |     }, error = function(err) {
89 |         stop("Error reading ", tblurl,
90 |             ".\n  (an SSL issue has been reported on Ubuntu 20)")
91 |     })
92 | 
93 |     tbl <- tbl[[1]]
94 |     tblheader <- gsub("\n", "", colnames(tbl))
95 |     tblheader <- trimws(tblheader)
96 |     colnames(tbl) <- tblheader
97 | 
98 |     idx <- which(tbl[,"GENCODE release"]==release)
99 |     tbl[idx,"Genome assembly version"]
100 | }
101 | 
102 | 
103 | # Helper to retrieve GTF & GFF3 file urls from Gencode
104 | .gencodeSourceUrls <- function(species, release, filetype, justRunUnitTest)
105 | {
106 |     speciesUrl <- ifelse(species=="Human", "Gencode_human/", "Gencode_mouse/")
107 |     dirurl <- paste0(.gencodeBaseUrl, speciesUrl, "release_", release, "/")
108 |     names(dirurl) <- paste0(species, "_", release)
109 | 
110 |     fileurls <- .gencodeFileFromUrl(dirurl)
111 | 
112 |     idx <- integer()
113 |     if (tolower(filetype)=="gff")
114 |         idx <- grep("gff3", fileurls)
115 |     if (tolower(filetype)=="fasta")
116 |         idx <- grep("fa.gz", fileurls)
117 |     if (length(idx)==0)
118 |         stop("No files found.")
119 |     fileurls <- fileurls[idx]
120 | 
121 |     if (justRunUnitTest)
122 |         fileurls <- fileurls[1:2]
123 | 
124 |     ## tags
125 |     filename <- basename(fileurls)
126 |     filename <- sub(".gz","", filename)
127 |     tags <- gsub("[.]",",",filename)
128 | 
129 |     ## description
130 |     description <- .gencodeDescription(fileurls)
131 | 
132 |     ## rdatapath - files are served from the Gencode site; an earlier
133 |     ## version of this recipe converted them to GRanges stored on S3:
134 |     #rdatapath <- paste0("gencode/", species, "/release_", release,"/",
135 |     #                    basename(fileurls), ".Rda")
136 | 
137 |     rdatapath <- sub(.gencodeBaseUrl, "", fileurls)
138 | 
139 |     ## get date and size for files
140 |     df <- .httrFileInfo(fileurls)
141 |     rownames(df) <- NULL
142 | 
143 |     ## species, taxid, genome
144 |     scSpecies <- ifelse(species=="Human", "Homo sapiens", "Mus musculus")
145 |     taxid <- ifelse(species=="Human", 9606L, 10090L)
146 |     genome <- .gencodeGenome(species, release)
147 |     genome <- rep(genome, length(fileurls))
148 |     genome[grepl('_mapping/', rdatapath)] <-
149 |         gsub('.*/', '',
150 |             gsub('_mapping/.*', '',
151 |                 rdatapath[grepl('_mapping/', rdatapath)])
152 |         )
153 |     scSpecies <- rep(scSpecies, length(fileurls))
154 |     taxid <- rep(taxid, length(fileurls))
155 | 
156 |     cbind(df, rdatapath, description, tags, species=scSpecies, taxid, genome,
157 |         stringsAsFactors=FALSE)
158 | }
159 | 
160 | 
161 | ## STEP 1: make function to process metadata into AHMs
162 | makeGencodeGFFsToAHMs <- function(currentMetadata,
163 |     species=c("Human", "Mouse"),
164 |     release,
165 |     justRunUnitTest=FALSE,
166 |     BiocVersion=BiocManager::version()){
167 | 
168 |     ## important - here you need to know which species and release you want to
169 |     ## add files for.
170 |     species <- match.arg(species)
171 |     rsrc <- .gencodeSourceUrls(species = species, release = release,
172 |         filetype = "gff", justRunUnitTest = justRunUnitTest)
173 | 
174 |     description <- rsrc$description
175 |     title <- basename(rsrc$fileurl)
176 |     genome <- rsrc$genome
177 |     sourceUrls <- rsrc$fileurl
178 |     #
179 |     # FixMe: in .gencodeSourceUrls the date should be the LastModified
180 |     # time; in the webAccess function .httrFileInfo these urls carry that
181 |     # information in the body, not the header, but that function is used elsewhere
182 |     #
183 |     sourceVersion <- as.character(rsrc$date) ## should be character
184 |     if (all(is.na(sourceVersion))) {
185 |         sourceVersion <- rep(release, length(sourceVersion))
186 |     }
187 |     SourceLastModifiedDate <- rsrc$date ## should be "POSIXct" "POSIXt"
188 |     SourceSize <- as.numeric(rsrc$size)
189 |     tags <- strsplit(rsrc$tag, ",")
190 |     species <- rsrc$species
191 |     rdatapath <- rsrc$rdatapath
192 |     taxid <- rsrc$taxid
193 | 
194 |     Map(AnnotationHubMetadata,
195 |         Description=description,
196 |         Genome=genome,
197 |         SourceUrl=sourceUrls,
198 |         SourceSize=SourceSize,
199 |         SourceLastModifiedDate=SourceLastModifiedDate,
200 |         SourceVersion=sourceVersion,
201 |         Species=species,
202 |         RDataPath=rdatapath,
203 |         TaxonomyId=taxid,
204 |         Title=title,
205 |         Tags=tags,
206 |         MoreArgs=list(
207 |             BiocVersion=BiocVersion,
208 |             Coordinate_1_based = TRUE,
209 |             DataProvider = "Gencode",
210 |             Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
211 |             RDataClass = "GRanges",
212 |             DispatchClass="GFF3File",
213 |             SourceType="GFF",
214 |             Location_Prefix=.gencodeBaseUrl,
215 |             RDataDateAdded = Sys.time(),
216 |             Recipe="AnnotationHubData:::gencodeGFFToGRanges"))
217 | }
218 | 
219 | ## The recipe does no conversion: dispatch class "GFF3File" loads the
220 | ## GFF3 on the fly, so the recipe simply returns the downloaded file.
221 | gencodeGFFToGRanges <- function(ahm)
222 | {
223 |     outputFile(ahm)[[1]]
224 | }
225 | 
226 | ## STEP 2: Call the helper to set up the newResources() method
227 | makeAnnotationHubResource("GencodeGffImportPreparer",
228 |     makeGencodeGFFsToAHMs)
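## A minimal sketch of exercising this recipe directly (not run). The
## species and release shown are illustrative, and the call assumes
## network access to the Gencode FTP site:
##
##   ahms <- makeGencodeGFFsToAHMs(currentMetadata = list(),
##                                 species = "Human",
##                                 release = "23",
##                                 justRunUnitTest = TRUE)
##
## each element of 'ahms' is an AnnotationHubMetadata object describing
## one GFF3 file discovered under the requested release on the Gencode site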
--------------------------------------------------------------------------------
/man/AnnotationHubMetadata-class.Rd:
--------------------------------------------------------------------------------
1 | \name{AnnotationHubMetadata-class}
2 | \docType{class}
3 | 
4 | % Class:
5 | \alias{class:HubMetadata}
6 | \alias{HubMetadata-class}
7 | \alias{HubMetadata}
8 | \alias{class:AnnotationHubMetadata}
9 | \alias{AnnotationHubMetadata-class}
10 | 
11 | 
12 | % Constructors:
13 | \alias{AnnotationHubMetadata}
14 | 
15 | % Accessors:
16 | \alias{metadata}
17 | \alias{metadata,HubMetadata-method}
18 | \alias{metadata<-}
19 | \alias{metadata<-,HubMetadata,list-method}
20 | \alias{inputFiles}
21 | \alias{inputFiles,HubMetadata-method}
22 | \alias{outputFile}
23 | \alias{outputFile,HubMetadata-method}
24 | \alias{recipeName}
25 | \alias{recipeName,HubMetadata-method}
26 | \alias{hubError}
27 | \alias{hubError,list-method}
28 | \alias{hubError,HubMetadata-method}
29 | \alias{hubError<-}
30 | \alias{hubError<-,list,character-method}
31 | \alias{hubError<-,HubMetadata,character-method}
32 | 
33 | % Methods:
34 | \alias{run}
35 | \alias{run,AnnotationHubMetadata-method}
36 | 
37 | % Other:
38 | \alias{HubMetadataFromJson}
39 | \alias{toJson}
40 | \alias{constructSeqInfo}
41 | \alias{ahmToJson}
42 | \alias{deleteResources}
43 | \alias{getImportPreparerClasses}
44 | \alias{makeAnnotationHubResource}
45 | 
46 | % Show:
47 | \alias{show}
48 | \alias{show,HubMetadata-method}
49 | 
50 | 
51 | \title{Class \code{"AnnotationHubMetadata"} and methods}
52 | 
53 | \description{
54 | 
55 |   \code{AnnotationHubMetadata} is used to represent record(s) in the
56 |   server database.
57 | 
58 | }
59 | 
60 | \section{Objects from the Class}{
61 | 
62 |   Objects can be created by calls to the constructor,
63 |   \code{AnnotationHubMetadata()}.
64 | 
65 | }
66 | 
67 | \usage{
68 | AnnotationHubMetadata(AnnotationHubRoot, SourceUrl, SourceType,
69 |     SourceVersion, SourceLastModifiedDate, SourceMd5 =
70 |     NA_character_, SourceSize, DataProvider, Title,
71 |     Description, Species, TaxonomyId, Genome, Tags,
72 |     Recipe, RDataClass, RDataDateAdded, RDataPath,
73 |     Maintainer, ..., BiocVersion = BiocManager::version(),
74 |     Coordinate_1_based = TRUE, Notes = NA_character_,
75 |     DispatchClass, Location_Prefix =
76 |     "https://bioconductorhubs.blob.core.windows.net/annotationhub/")
77 | 
78 | toJson(x)
79 | constructSeqInfo(species, genome)
80 | 
81 | metadata(x, ...)
82 | hubError(x)
83 | inputFiles(object, ...)
84 | outputFile(object)
85 | ahmToJson(ahm)
86 | deleteResources(id)
87 | getImportPreparerClasses()
88 | makeAnnotationHubResource(objName, makeAnnotationHubMetadataFunction,
89 |     ..., where)
90 | }
91 | 
92 | \arguments{
93 |   \item{AnnotationHubRoot}{
94 |     \code{character(1)} Absolute path to directory structure
95 |     containing resources to be added to AnnotationHub. Internal use only.
96 |   }
97 |   \item{SourceUrl}{
98 |     \code{character()} URL of original resource(s).
99 |   }
100 |   \item{SourceType}{
101 |     \code{character()} Form of original data, e.g., BED, FASTA,
102 |     etc. \code{getValidSourceTypes()} lists currently acceptable
103 |     values. If nothing seems appropriate for your data, reach out to
104 |     maintainer@bioconductor.org.
105 | 
106 |   }
107 |   \item{SourceVersion}{
108 |     \code{character(1)} Version of original file.
109 |   }
110 |   \item{SourceLastModifiedDate}{
111 |     \code{POSIXct()} The date when the source was last modified.
112 |   }
113 |   \item{SourceMd5}{
114 |     \code{character()} md5 hash of original file.
115 |   }
116 |   \item{SourceSize}{
117 |     \code{numeric(1)} Size of original file in bytes.
118 |   }
119 |   \item{DataProvider}{
120 |     \code{character(1)} Provider of original data, e.g., NCBI,
121 |     UniProt etc.
122 |   }
123 |   \item{Title}{
124 |     \code{character(1)} Title for the resource with version or genome
125 |     build as appropriate.
126 |   }
127 |   \item{Description}{
128 |     \code{character(1)} Description of the resource. May include
129 |     details such as data type, format, study origin, sequencing
130 |     technology, treated vs control, number of samples etc.
131 |   }
132 |   \item{Species}{
133 |     \code{character(1)} Species name. For help on valid species see
134 |     \code{getSpeciesList}, \code{validSpecies}, or \code{suggestSpecies}.
135 | 
136 |   }
137 |   \item{TaxonomyId}{
138 |     \code{character(1)} NCBI code. There are
139 |     checks for a valid taxonomyId given the Species which produce
140 |     warnings. See GenomeInfoDb::loadTaxonomyDb() for the full validation
141 |     table.
142 | 
143 |   }
144 |   \item{Genome}{
145 |     \code{character(1)} Name of genome build.
146 |   }
147 |   \item{Tags}{
148 |     \code{character()} Free-form tags that serve as search terms.
149 |   }
150 |   \item{Recipe}{
151 |     \code{character(1)} Name of recipe function. Only applicable to
152 |     recipes created by the Bioconductor core team and included in
153 |     AnnotationHubData base code.
154 |   }
155 |   \item{RDataClass}{
156 |     \code{character()} Class of derived R object, e.g., GRanges. Length
157 |     must match the length of \code{RDataPath}.
158 |   }
159 |   \item{RDataDateAdded}{
160 |     \code{POSIXct()} Date resource was added to AnnotationHub. The
161 |     default is today's date and is auto-generated when metadata are
162 |     constructed. Resources will appear in snapshots with a date greater
163 |     than or equal to the \code{RDataDateAdded}.
164 |   }
165 |   \item{RDataPath}{
166 |     \code{character()} File path to where the object is stored in an AWS S3
167 |     bucket or on the web. This field should be the
168 |     remainder of the path to the resource. The
169 |     \code{Location_Prefix} will be prepended to
170 |     \code{RDataPath} for the full path to the resource.
171 |     If the resource is stored in Bioconductor's AWS S3
172 |     buckets, it should start with the name of the package associated
173 |     with the metadata and should not start with a leading
174 |     slash. It should include the resource file name. For
175 |     strongly associated files, like a bam file and its index
176 |     file, the two files should be separated with a colon
177 |     \code{:}. This will link a single hub id with the multiple files.
178 | 
179 |   }
180 |   \item{Maintainer}{
181 |     \code{character(1)} Maintainer name and email address,
182 |     \sQuote{A Maintainer \email{a.maintainer@email.com}}
183 |   }
184 |   \item{BiocVersion}{\code{character(1)}. The first Bioconductor version
185 |     the resource was made available for. Unless removed from the hub, the
186 |     resource will be available for all versions greater than or equal to this
187 |     field.
188 |   }
189 |   \item{Coordinate_1_based}{
190 |     \code{logical(1)} Do coordinates start with 1 or 0?
191 |   }
192 |   \item{DispatchClass}{
193 |     \code{character(1)}. Determines how data are loaded into R. The value for
194 |     this field should be \sQuote{Rda} if the data were serialized with
195 |     \code{save()} and \sQuote{Rds} if serialized with \code{saveRDS}. The
196 |     filename should have the appropriate \sQuote{rda} or \sQuote{rds}
197 |     extension.
198 | 
199 |     A number of dispatch classes are pre-defined in
200 |     AnnotationHub/R/AnnotationHubResource-class.R with the suffix
201 |     \sQuote{Resource}. For example, if you have sqlite files, the
202 |     AnnotationHubResource-class.R defines SQLiteFileResource so the
203 |     DispatchClass would be SQLiteFile. Contact maintainer@bioconductor.org if
204 |     you are not sure which class to use. The function
205 |     \code{AnnotationHub::DispatchClassList()} will output a
206 |     matrix of currently implemented DispatchClass and a brief
207 |     description of the utility of each. If a predefined class does not seem
208 |     appropriate, contact maintainer@bioconductor.org.
209 | 
210 |   }
211 |   \item{Location_Prefix}{
212 |     \code{character(1)} URL location of AWS S3 bucket or web site where
213 |     resource is located.
214 |   }
215 |   \item{Notes}{
216 |     \code{character()} Notes about the resource.
217 |   }
218 |   \item{ahm}{
219 |     An instance of class \code{AnnotationHubMetadata}.
220 |   }
221 |   \item{x}{
222 |     An instance of class \code{AnnotationHubMetadata}.
223 |   }
224 |   \item{object}{
225 |     An \code{AnnotationHubRecipe} instance.
226 |   }
227 |   \item{species}{
228 |     \code{character(1)} The organism, e.g., "Homo sapiens".
229 |   }
230 |   \item{genome}{
231 |     \code{character(1)} The genome build, e.g., "hg19".
232 |   }
233 |   \item{id}{
234 |     An id whose DB record is to be fully deleted.
235 |   }
236 |   \item{objName}{
237 |     \code{character(1)} The name of the PreparerClass used for dispatch.
238 |   }
239 |   \item{makeAnnotationHubMetadataFunction}{
240 |     \code{function} Function (name) that makes \code{AnnotationHubMetadata}
241 |     objects from the resource(s).
242 |   }
243 |   \item{where}{
244 |     Environment where the function definition is defined. The default value
245 |     is sufficient.
246 |   }
247 |   \item{\dots}{
248 |     Additional arguments passed to methods.
249 |   }
250 | }
251 | 
252 | \value{
253 |   \code{AnnotationHubMetadata} returns an instance of the class.
254 | 
255 |   \code{jsonPath} returns a \code{character(1)} representation of the
256 |   full path to the location of the \code{json} file associated with this
257 |   record.
258 | 
259 |   \code{toJson} returns the JSON representation of the record.
260 | 
261 |   \code{fromJson} returns an instance of the class, as parsed from the
262 |   JSON file.
263 | }
264 | 
265 | \author{Dan Tenenbaum and Marc Carlson}
266 | 
267 | 
268 | \examples{
269 | getClass("AnnotationHubMetadata")
270 | }
271 | 
272 | \keyword{classes}
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | CHANGES IN VERSION 1.29.0
2 | ------------------------
3 | 
4 | NEW FEATURES
5 | 
6 |   o 1.29.2 Added HIC as acceptable source type
7 |   o 1.29.1 Added CDF as acceptable source type
8 | 
9 | 
10 | CHANGES IN VERSION 1.25.0
11 | ------------------------
12 | 
13 | SIGNIFICANT UPDATES
14 | 
15 |   o 1.25.7 Update recipes to upload to azure. NonStandardOrgDb release recipe
16 |     updated
17 |   o 1.25.6 Update recipes to upload to azure. TwoBit ensembl and release
18 |     recipes for standard TxDb and OrgDb updated
19 | 
20 | NEW FEATURES
21 | 
22 |   o 1.25.5 Add helper function to upload to azure
23 | 
24 | MODIFICATIONS
25 | 
26 |   o 1.25.2 Changed makeAnnotationHubMetadata to point to Azure instead of AWS
27 | 
28 | CHANGES IN VERSION 1.21.0
29 | ------------------------
30 | 
31 | MODIFICATIONS
32 | 
33 |   o 1.21.9 Add PNG as valid source type
34 |   o 1.21.4 Removed vignette for creating an annotation hub package; refer
35 |     to the single vignette in AnnotationHub instead
36 |   o 1.21.3 Tags for database now a combination of biocViews and meta$Tags. Also
37 |     checks for valid AnnotationHub or AnnotationHubSoftware biocViews.
38 |   o 1.21.2 Add mtx.gz as valid source type
39 | 
40 | BUG CORRECTION
41 | 
42 |   o 1.21.3 Fixed bug to run make*HubMetadata using "."
43 | 
44 | INTERNAL BUG CORRECTION
45 | 
46 |   o 1.21.1 misplaced ! clause
47 | 
48 | REMOVED
49 | 
50 |   o 1.21.5 Removed BioPax; url no longer valid. Resources were old and never
51 |     used beyond first addition
52 | 
53 | CHANGES IN VERSION 1.19.0
54 | ------------------------
55 | 
56 | INTERNAL BUG CORRECTION
57 | 
58 |   o 1.19.2 Update Metadata from Ensembl function to use
59 |     GenomeInfoDb:::fetch_species_index_from_Ensembl_FTP instead of parsing the
60 |     file path
61 |   o 1.19.1 misplaced ! clause
62 | 
63 | 
64 | CHANGES IN VERSION 1.17.0
65 | ------------------------
66 | 
67 | MODIFICATIONS
68 | 
69 |   o 1.17.3 add check for valid Title and Description in metadata file. It
70 |     should not be empty or NA
71 |   o 1.17.2 add XML as valid source type
72 |   o 1.17.1 add GSEMatrix as valid source type
73 | 
74 | CHANGES IN VERSION 1.15.0
75 | ------------------------
76 | 
77 | MODIFICATIONS
78 | 
79 |   o 1.15.13 Added "BLOB" as a valid source type
80 |   o 1.15.7 Added "MTX" as a valid source type
81 |   o 1.15.6 Expanded documentation to clarify that data can be hosted
82 |     publicly, not strictly on Bioconductor AWS
83 |   o 1.15.4 Added "XLS/XLSX" as valid source type
84 | 
85 | INTERNAL BUG CORRECTION
86 | 
87 |   o 1.15.11 updated GencodeGFF recipes for potential future use (still to be
88 |     revisited in a later update, to load on the fly like the ensembl recipes)
89 |   o 1.15.5 remove validity check that is wrong/outdated
90 |   o 1.15.1 needToRerunNonStandardOrgDb added as helper function for
91 |     generating non-standard org dbs. 1.15.3 added try/catch in case AWS
92 |     buckets are unreachable.
93 | 
94 | CHANGES IN VERSION 1.13.0
95 | ------------------------
96 | 
97 | NEW FEATURES
98 | 
99 |   o Added ability to have multiple RDataPaths associated with a single hub
100 |     id for strongly associated files (like a bam and its bai index file)
101 |   o DispatchClass are now validated against AnnotationHub::DispatchClassList()
102 |     which contains currently available DispatchClass and a brief description
103 |     of the loading process.
104 | 
105 | CHANGES IN VERSION 1.11.0
106 | ------------------------
107 | MODIFICATIONS
108 | 
109 |   o Removed scripts for Pazar DB as website no longer active
110 |   o Update from BiocInstaller to BiocManager
111 | 
112 | NEW FEATURES
113 | 
114 |   o Species and taxonomyId are now validated against GenomeInfoDbData object
115 | 
116 | BUG FIX
117 | 
118 |   o Fix TwoBit resource recipe. Converts DNA that is not A,C,T,G,N to N due to
119 |     the design of rtracklayer::export for TwoBit
120 |   o Fix bug with assignment of tags in annotationhub
121 |   o makeEpigenomeRoadMap recipe updated to account for XML bug that cannot
122 |     handle http urls; updated to https
123 | 
124 | CHANGES IN VERSION 1.10.0
125 | ------------------------
126 | MODIFICATIONS
127 | 
128 |   o Moved readMetadataFromCsv back to AnnotationHubData.
129 | 
130 |   o Use AnnotationHubData::makeAnnotationHubMetadata to validate metadata.csv
131 | 
132 |   o readMetadataFromCsv is now an internal function
133 | 
134 | 
135 | CHANGES IN VERSION 1.8.0
136 | ------------------------
137 | 
138 | NEW FEATURES
139 | 
140 |   o Instead of using dropbox or ftp to deliver contributed resources to
141 |     Bioconductor Core, temporary access to the Annotation-Contributor user on
142 |     S3 is utilized.
143 | 
144 | MODIFICATIONS
145 | 
146 |   o Modified readMetadataFromCsv; make RDataPath a mandatory entry and, if
147 |     location_prefix is the Bioconductor S3 bucket, the Rdatapath must start
148 |     with the package name
149 | 
150 | BUG FIXES
151 | 
152 |   o Add garbage collection to fix twobit memory allocation error
153 | 
154 |   o Fix files not deleting due to special characters in file names
155 | 
156 |   o Import dbGetQuery from DBI
157 | 
158 |   o Remove hard coded biocVersion in unit tests
159 | 
160 | CHANGES IN VERSION 1.6.0
161 | ------------------------
162 | 
163 | NEW FEATURES
164 | 
165 |   o add makeStandardTxDbsToSqlite() recipe
166 | 
167 |   o add 'ensembl' and 'MySQL' as possible SourceType values
168 | 
169 |   o tidy and export makeStandard*ToAHMs and makeNCBIToOrgDbsToAHMs
170 | 
171 | MODIFICATIONS
172 | 
173 |   o move currentMetadata
174 | 
175 |   o tidy pushResources interface
176 | 
177 |   o modified parsing of species name and genome in .ensemblMetadataFromUrl()
178 | 
179 |   o modified standard OrgDb recipe
180 | 
181 |   o enhance and clean vignette
182 | 
183 |   o move 'Tags' check from readCsvFromMetadata() to
184 |     makeAnnotationHubMetadata()
185 | 
186 |   o remove dependency on xml2, curl, httr and probably other wheel
187 |     reinventions; alter imports and suggests
188 | 
189 |   o specify multiple 'Tags' as colon separated string instead of comma
190 |     separated; avoids problems with read.csv()
191 | 
192 |   o select data moved to GenomeInfoDbData package
193 | 
194 |   o Added additional documentation instructions for core members to add
195 |     contributed data to AnnotationHub
196 | 
197 |   o rename files; remove old JSON test file no longer applicable
198 | 
199 |   o pass 'install' argument down through recipe
200 | 
201 |   o General code tidy; remove unused functions and comments; clarify checks
202 | 
203 | BUG FIXES
204 | 
205 |   o readMetadataFromCsv() fills in DataProvider and Coordinate_1_based if missing
206 | 
207 |   o fix bug introduced in checking 'release' in makeEnsemblTwoBit recipe
208 | 
209 |   o makeAnnotationHubMetadata() now processes all inst/extdata/*.csv files
210 | 
211 |   o fix subset and import bug in makeAnnotationHubMetadata()
212 | 
213 |   o Fix bug in Rdatapath and sourceurl for makeEnsemblFasta.R
214 | 
215 | CHANGES IN VERSION 1.4.0
216 | ------------------------
217 | 
218 | NEW FEATURES
219 | 
220 |   o add script to generate user-contributed resources
221 | 
222 |   o makeEnsemblGtfToGRanges() no longer stores data in S3 but downloads
223 |     and converts to GRanges on the fly
224 | 
225 |   o add EnsemblFastaTwoBitToAHM unit test
226 | 
227 |   o add man page for makeEnsemblTwoBitToAHM and
228 |     ensemblFastaToTwoBitFile
229 | 
230 |   o add makeAnnotationHubMetadata() helper
231 | 
232 | MODIFICATIONS
233 | 
234 |   o move GSE62944-related code to ExperimentHub
235 | 
236 |   o move old vignettes to inst/scripts; add 'Introduction to
237 |     AnnotationHubData' vignette
238 | 
239 |   o remove fasta and twobit files on the fly
240 | 
241 |   o add 'uploadToS3' argument to pushResources() and runRecipes()
242 | 
243 |   o move readMetadataFromCsv() from ExperimentHubData to
244 |     AnnotationHubData
245 | 
246 |   o add 'fileName' arg to readMetadataFromCsv(); don't warn when
247 |     'Tags' are provided
248 | 
249 |   o specify length for args in readMetadataFromCsv()
250 | 
251 |   o makeAnnotationHubMetadata() populates PreparerClass with package name
252 | 
253 |   o add 'fileName' arg to makeAnnotationHubMetadata()
254 | 
255 | 
256 | CHANGES IN VERSION 1.2.0
257 | ------------------------
258 | 
259 | NEW FEATURES
260 | 
261 |   o add makeEnsemblTwoBit()
262 | 
263 |   o add hubError(), hubError<- generics and methods
264 | 
265 |   o create 'HubMetadata' class which 'AnnotationHubMetadata' inherits from
266 | 
267 | MODIFICATIONS
268 | 
269 |   o export ensemblFastaToTwoBitFile()
270 | 
271 |   o modifications due to changes in httr::HEAD():
272 |     - AFAICT httr::HEAD() >= 1.1.0 accepts https only, not ftp
273 |     - use xml2 instead of XML for parsing (httr >= 1.1.0 dependency change)
274 | 
275 |   o work on recipes:
276 |     - clean up ChEA and Gencode
277 |     - don't export tracksToUpdate(); was broken and not used
278 |     - reorg man pages; combine Ensembl Fasta and TwoBit on single man page
279 | 
280 |   o work on updateResources():
281 |     - push data to S3 before inserting metadata in db
282 |     - isolate pushResources() and pushMetadata() from updateResources()
283 |     - NOTE: Epigenome unit test is failing due to bad url. If not fixed by
284 |       the host the recipe will need to change.
285 | 
286 |   o update makedbSNPVCF() to look in new clinvar location
287 | 
288 | BUG FIXES
289 | 
290 |   o fix bugs in makedbSNPVCF() recipe related to genome and tags
291 | 
292 | 
293 | CHANGES IN VERSION 1.0.0
294 | ------------------------
295 | 
296 | BUG FIXES
297 | 
298 |   o ENSEMBL recipes discover gtf files on Windows.
299 | 
300 | 
301 | CHANGES IN VERSION 0.0.214
302 | --------------------------
303 | 
304 | NEW FEATURES
305 | 
306 |   o Have added vcf files from the following genome builds for humans
307 |     "human_9606/VCF/clinical_vcf_set/",
308 |     "human_9606_b141_GRCh37p13/VCF/",
309 |     "human_9606_b142_GRCh37p13/VCF/",
310 |     "human_9606_b142_GRCh37p13/VCF/clinical_vcf_set/"
311 | 
312 |   o For each genome build, where available, the following VCF file
313 |     formats are available
314 |     a) all.vcf.gz
315 |     b) all_papu.vcf.gz
316 |     c) common_all.vcf.gz
317 |     d) clinvar.vcf.gz
318 |     e) clinvar_papu
319 |     f) common_and_clinical
320 |     g) common_no_known_medical_impact
321 | 
322 |   o The user can refer to
323 |     http://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf/
324 |     for VCF file type formats
--------------------------------------------------------------------------------
/man/makeAnnotationHubMetadata.Rd:
--------------------------------------------------------------------------------
1 | \name{makeAnnotationHubMetadata}
2 | 
3 | \alias{makeAnnotationHubMetadata}
4 | 
5 | \title{
6 |   Make AnnotationHubMetadata objects from a csv file of metadata
7 | }
8 | 
9 | \description{
10 |   Make AnnotationHubMetadata objects from .csv files located in the
11 |   "inst/extdata/" directory of an AnnotationHub package.
12 | }
13 | 
14 | 
15 | \usage{
16 | makeAnnotationHubMetadata(pathToPackage, fileName=character())
17 | }
18 | 
19 | \arguments{
20 |   \item{pathToPackage}{
21 |     Full path to the data package, including the package name; no trailing slash
22 |   }
23 |   \item{fileName}{
24 |     Name of metadata file(s) with csv extension. If none are provided, all
25 |     files with .csv extension in "inst/extdata" will be processed.
26 |   }
27 | }
28 | 
29 | \details{
30 |   \itemize{
31 |     \item{makeAnnotationHubMetadata:}{
32 |       Reads the resource metadata from .csv files into an
33 |       \link{AnnotationHubMetadata} object. The \link{AnnotationHubMetadata}
34 |       is inserted in the AnnotationHub database. Intended for internal
35 |       use or for package authors checking the validity of package metadata.
36 |     }
37 |   }
38 | 
39 |   \itemize{
40 |     \item{Formatting metadata files:}{
41 | 
42 |       \code{makeAnnotationHubMetadata} reads .csv files of metadata
43 |       located in "inst/extdata". Internal functions perform checks for
44 |       required columns and data types and can be used by package authors
45 |       to validate their metadata before submitting the package for
46 |       review.
47 | 
48 |       The rows of the .csv file(s) represent individual \code{Hub}
49 |       resources (i.e., data objects) and the columns are the metadata
50 |       fields. All fields should be a single character string of length 1.
51 | 
52 |       Required Fields in metadata file:
53 |       \itemize{
54 |         \item Title: \code{character(1)}. Name of the resource. This can be
55 |           the exact file name (if self-describing) or a more complete
56 |           description.
57 | 
58 |         \item Description: \code{character(1)}. Brief description of the
59 |           resource, similar to the 'Description' field in a package
60 |           DESCRIPTION file.
61 | 
62 |         \item BiocVersion: \code{character(1)}. The first Bioconductor version
63 |           the resource was made available for. Unless removed from
64 |           the hub, the resource will be available for all versions
65 |           greater than or equal to this field. Generally the current
66 |           devel version of Bioconductor.
67 | 
68 |         \item Genome: \code{character(1)}. Genome. Can be NA.
69 | 
70 |         \item SourceType: \code{character(1)}. Format of original data, e.g., FASTA,
71 |           BAM, BigWig, etc. \code{getValidSourceTypes()} lists currently
72 |           acceptable values. If nothing seems appropriate for your data,
73 |           reach out to maintainer@bioconductor.org.
74 | 
75 |         \item SourceUrl: \code{character(1)}. Optional location of original
76 |           data files. Multiple urls should be provided as a comma separated
77 |           string.
78 | 
79 |         \item SourceVersion: \code{character(1)}. Version of original data.
80 | 
81 |         \item Species: \code{character(1)}. Species. For help on valid
82 |           species see \code{getSpeciesList}, \code{validSpecies}, or
83 |           \code{suggestSpecies}. Can be NA.
84 | 
85 |         \item TaxonomyId: \code{character(1)}. Taxonomy ID. There are
86 |           checks for a valid taxonomyId given the Species which produce
87 |           warnings. See GenomeInfoDb::loadTaxonomyDb() for the full validation
88 |           table. Can be NA.
89 | 
90 |         \item Coordinate_1_based: \code{logical}. TRUE if data are
91 |           1-based. Can be NA.
92 | 
93 |         \item DataProvider: \code{character(1)}. Name of company or institution
94 |           that supplied the original (raw) data.
95 | 
96 |         \item Maintainer: \code{character(1)}. Maintainer name and email in the
97 |           following format: Maintainer Name <username@address>.
98 | 
99 |         \item RDataClass: \code{character(1)}. R / Bioconductor class the data
100 |           are stored in, e.g., GRanges, SummarizedExperiment,
101 |           ExpressionSet etc. If the file is loaded or read into R,
102 |           what is the class of the object?
103 | 
104 |         \item DispatchClass: \code{character(1)}. Determines how data are
105 |           loaded into R. The value for this field should be
106 |           \sQuote{Rda} if the data were serialized with \code{save()} and
107 |           \sQuote{Rds} if serialized with \code{saveRDS}. The filename
108 |           should have the appropriate \sQuote{rda} or \sQuote{rds}
109 |           extension. There are other available DispatchClass types;
110 |           the function \code{AnnotationHub::DispatchClassList()} lists
111 |           them.
112 | 
113 |           A number of dispatch classes are pre-defined in
114 |           AnnotationHub/R/AnnotationHubResource-class.R with the suffix
115 |           \sQuote{Resource}. For example, if you have sqlite files, the
116 |           AnnotationHubResource-class.R defines SQLiteFileResource so
117 |           the DispatchClass would be SQLiteFile. Contact
118 |           maintainer@bioconductor.org if you are not sure which class
119 |           to use. The function
120 |           \code{AnnotationHub::DispatchClassList()} will output a
121 |           matrix of currently implemented DispatchClass and a brief
122 |           description of the utility of each. If a predefined class does not
123 |           seem appropriate, contact maintainer@bioconductor.org. An
124 |           all-purpose DispatchClass is \code{FilePath}, which, instead of
125 |           trying to load the file into R, only returns the path to the
126 |           locally downloaded file.
127 | 
128 |         \item Location_Prefix: \code{character(1)}. Do not include this field
129 |           if data are stored in the Bioconductor AWS S3; it will be
130 |           generated automatically.
131 | 
132 |           If data will be accessed from a location other than AWS S3,
133 |           this field should be the base url.
134 | 
135 |         \item RDataPath: \code{character()}. This field should be the
136 |           remainder of the path to the resource. The
137 |           \code{Location_Prefix} will be prepended to
138 |           \code{RDataPath} for the full path to the resource.
139 |           If the resource is stored in Bioconductor's AWS S3
140 |           buckets, it should start with the name of the package associated
141 |           with the metadata and should not start with a leading
142 |           slash. It should include the resource file name. For
143 |           strongly associated files, like a bam file and its index
144 |           file, the two files should be separated with a colon
145 |           \code{:}. This will link a single hub id with the multiple files.
146 | 
147 |         \item Tags: \code{character() vector}.
148 |           \sQuote{Tags} are search terms used to define a subset of
149 |           resources in a \code{Hub} object, e.g., in a call to \code{query}.
150 | 
151 |           \sQuote{Tags} are automatically generated from the
152 |           \sQuote{biocViews} in the DESCRIPTION and applied to all
153 |           resources of the metadata file. Optionally, maintainers can
154 |           define a \sQuote{Tags} column of the metadata to define tags
155 |           for each resource individually. Multiple \sQuote{Tags} are
156 |           specified as a colon separated string, e.g., tags for two
157 |           resources would look like this:
158 | 
159 |           \preformatted{
160 |           Tags=c("tag1:tag2:tag3", "tag1:tag3")
161 |           }
162 | 
163 | 
164 | 
165 |       }
166 |       NOTE: The metadata file can have additional columns beyond the 'Required
167 |       Fields' listed above. These values are not added to the Hub database but
168 |       they can be used in package functions to provide an additional level of
169 |       metadata on the resources.
170 | 
171 |       More on \code{Location_Prefix} and \code{RDataPath}. These two fields make up
172 |       the complete file path url for downloading the data file. If using
173 |       the Bioconductor AWS S3 bucket, the Location_Prefix should not be
174 |       included in the metadata file[s] as this field will be populated
175 |       automatically. The \code{RDataPath} will be the directory structure you
176 |       uploaded to S3. If you uploaded a directory \sQuote{MyAnnotation/}, and
177 |       that directory had a subdirectory \sQuote{v1/} that contained two files
178 |       \sQuote{counts.rds} and \sQuote{coldata.rds}, your metadata file will contain
179 |       two rows and the RDataPaths would be \sQuote{MyAnnotation/v1/counts.rds}
180 |       and \sQuote{MyAnnotation/v1/coldata.rds}. If you host your data on a
181 |       publicly accessible site, you must include a base url as the
182 |       \code{Location_Prefix}. If your data file was at
183 |       \sQuote{ftp://myinstituteserver/biostats/project2/counts.rds}, your
184 |       metadata file will have one row and the \code{Location_Prefix} would be
185 |       \sQuote{ftp://myinstituteserver/} and the \code{RDataPath} would be
186 |       \sQuote{biostats/project2/counts.rds}.
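
      To make the last example above concrete, here is a minimal sketch
      of the two path-related fields for that hypothetical externally
      hosted resource (all other required fields are elided):

      \preformatted{
      ## full download url = Location_Prefix + RDataPath
      Location_Prefix: ftp://myinstituteserver/
      RDataPath:       biostats/project2/counts.rds
      }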
187 | 
188 |     }
189 |   }
190 | }
191 | 
192 | \value{
193 |   A named list the length of \code{fileName}. Each element is a list
194 |   of \code{AnnotationHubMetadata} objects created from the .csv file.
195 | }
196 | 
197 | \seealso{
198 |   \itemize{
199 |     \item \code{\link{updateResources}}
200 |     \item \code{\link{AnnotationHubMetadata}} class
201 |   }
202 | }
203 | 
204 | \examples{
205 | 
206 | ## Each row of the metadata file represents a resource added to one of
207 | ## the 'Hubs'. This example creates a metadata.csv file for a single resource.
208 | ## In the case of multiple resources, the arguments below would be character
209 | ## vectors that produced multiple rows in the data.frame.
210 | 
211 | meta <- data.frame(
212 |     Title = "RNA-Sequencing dataset from study XYZ",
213 |     Description = paste0("RNA-seq data from study XYZ containing 10 normal ",
214 |                          "and 10 tumor samples represented as a ",
215 |                          "SummarizedExperiment"),
216 |     BiocVersion = "3.4",
217 |     Genome = "GRCh38",
218 |     SourceType = "BAM",
219 |     SourceUrl = "http://www.path/to/original/data/file",
220 |     SourceVersion = "Jan 01 2016",
221 |     Species = "Homo sapiens",
222 |     TaxonomyId = 9606,
223 |     Coordinate_1_based = TRUE,
224 |     DataProvider = "GEO",
225 |     Maintainer = "Your Name <youremail@provider.com>",
226 |     RDataClass = "SummarizedExperiment",
227 |     DispatchClass = "Rda",
228 |     ResourceName = "FileName.rda"
229 | )
230 | 
231 | \dontrun{
232 | ## Write the data out and put in the inst/extdata directory.
233 | write.csv(meta, file="metadata.csv", row.names=FALSE)
234 | 
235 | ## Test the validity of metadata.csv
236 | makeAnnotationHubMetadata("path/to/mypackage")
237 | }
238 | }
239 | 
240 | \keyword{methods}
--------------------------------------------------------------------------------