├── tests
    └── AnnotationHubData_unit_tests.R
├── inst
    ├── extdata
    │   ├── inpDrosPsuedo.rda
    │   └── badUCSCTracks
    │   │   ├── allBadTracks.rda
    │   │   ├── allPossibleTracks.rda
    │   │   └── genomeTrackTable.Rda
    ├── unitTests
    │   ├── cases
    │   │   └── encodeDCCMetadata
    │   │   │   ├── tbl.parsedEncodeMetadata.RData
    │   │   │   ├── wgEncodeAwgDnaseUniform.info
    │   │   │   └── wgEncodeAffyRnaChip.info
    │   ├── test_ImportPreparer-class.R
    │   ├── test_validityFunctions.R
    │   ├── test_webAccessFunctions.R
    │   ├── test_AnnotationHubConstructor.R
    │   └── test_recipe.R
    ├── makefile
    └── scripts
    │   ├── addContributedResources.txt
    │   └── singleContributedResourceTemplate.R
├── R
    ├── makeGrasp2Db.R
    ├── Message-class.R
    ├── ImportPreparer-class.R
    ├── makeAnnotationHubResource.R
    ├── makeUCSC2Bit.R
    ├── makeChEA.R
    ├── makeEnsemblGtfToGRanges.R
    ├── makeGencodeFasta.R
    ├── makeRefNet.R
    ├── validationFunctions.R
    ├── makeEnsemblTwoBit.R
    ├── makeStandardTxDbsToSqlite.R
    ├── makeInparanoid8ToDbs.R
    ├── makeStandardOrgDbsToSqlite.R
    ├── makeHaemCode.R
    ├── HubMetadata-class.R
    ├── webAccessFunctions.R
    ├── makeUCSCChain.R
    ├── ahmToJson.R
    ├── makedbSNPVCF.R
    ├── makeEncodeDCC.R
    ├── makeEnsemblFasta.R
    ├── trackWithAuxiliaryTableToGRangesRecipe.R
    ├── utils.R
    └── makeGencodeGFF.R
├── man
    ├── flog.Rd
    ├── AnnotationHubData-package.Rd
    ├── upload_to_azure.Rd
    ├── upload_to_S3.Rd
    ├── ImportPreparer-class.Rd
    ├── makeGencodeFasta.Rd
    ├── makeEnsemblFasta.Rd
    ├── validationFunctions.Rd
    ├── makeStandardOrgDbs.Rd
    ├── updateResources.Rd
    ├── AnnotationHubMetadata-class.Rd
    └── makeAnnotationHubMetadata.Rd
├── vignettes
    └── IntroductionToAnnotationHubData.Rmd
├── appveyor.yml
├── DESCRIPTION
├── NAMESPACE
└── NEWS


/tests/AnnotationHubData_unit_tests.R:
--------------------------------------------------------------------------------
1 | BiocGenerics:::testPackage("AnnotationHubData")
2 | 


--------------------------------------------------------------------------------
/inst/extdata/inpDrosPsuedo.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/AnnotationHubData/devel/inst/extdata/inpDrosPsuedo.rda


--------------------------------------------------------------------------------
/inst/extdata/badUCSCTracks/allBadTracks.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/AnnotationHubData/devel/inst/extdata/badUCSCTracks/allBadTracks.rda


--------------------------------------------------------------------------------
/inst/extdata/badUCSCTracks/allPossibleTracks.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/AnnotationHubData/devel/inst/extdata/badUCSCTracks/allPossibleTracks.rda


--------------------------------------------------------------------------------
/inst/extdata/badUCSCTracks/genomeTrackTable.Rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/AnnotationHubData/devel/inst/extdata/badUCSCTracks/genomeTrackTable.Rda


--------------------------------------------------------------------------------
/inst/unitTests/cases/encodeDCCMetadata/tbl.parsedEncodeMetadata.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/AnnotationHubData/devel/inst/unitTests/cases/encodeDCCMetadata/tbl.parsedEncodeMetadata.RData


--------------------------------------------------------------------------------
/inst/unitTests/test_ImportPreparer-class.R:
--------------------------------------------------------------------------------
1 | test_ImportPreparer_constructors <- function()
2 | {
3 |     classes <- names(getClassDef("ImportPreparer")@subclasses)
4 |     ## need to be no-arg capable
5 |     valid <- sapply(classes, function(cl) validObject(new(cl)))
6 |     checkTrue(all(valid))
7 | }
8 | 


--------------------------------------------------------------------------------
/R/makeGrasp2Db.R:
--------------------------------------------------------------------------------
1 | # The correct recipe(step 1 & 2) is inside grasp2db software package. 
2 | 
3 | ## FIXME: does not pass BiocVersion
4 | ## STEP 3:  Call the helper to set up the newResources() method
5 | makeAnnotationHubResource("Grasp2ImportPreparer",
6 |                           grasp2db:::.makeAnnotationHubRecord)
7 | 


--------------------------------------------------------------------------------
/man/flog.Rd:
--------------------------------------------------------------------------------
 1 | \name{flog}
 2 | \alias{flog}
 3 | \title{flog}
 4 | 
 5 | \description{
 6 | 
 7 |  Write logging message to console and a file.
 8 | 
 9 | }
10 | 
11 | \usage{
12 | flog(level, ...)
13 | }
14 | 
15 | \arguments{
16 |   \item{level}{A \code{characater(1)} string object.}
17 |   \item{\dots}{Further arguments.}
18 | }
19 | 
20 | \details{
21 | 
22 | Writes the message to the console and to a file.
23 | }
24 | 
25 | \value{
26 |   None.
27 | }
28 | 
29 | \author{Dan Tenenbaum}
30 | 
31 | \seealso{\code{futile.logger}}
32 | 
33 | %%\example{
34 | %%   perhaps dan can provide one...
35 | %% }
36 | 
37 | \keyword{classes}
38 | 


--------------------------------------------------------------------------------
/R/Message-class.R:
--------------------------------------------------------------------------------
 1 | .Message <- setRefClass("Message",
 2 |     fields=list(
 3 |         name="character"
 4 |     ),
 5 |     methods=list(
 6 |         append = function(fmt, ...) {
 7 |             .self$name <- c(name, sprintf(fmt, ...))
 8 |             invisible(.self)
 9 |         },
10 |         validity = function() {
11 |             "report if any messages (e.g., after validity check)"
12 |             if (length(name)) name else NULL
13 |         },
14 |         isComplete = function() {
15 |             "stop if any messages"
16 |             if (length(name)) {
17 |                 stop(paste(name, collapse="\n"))
18 |             } else TRUE
19 |         }
20 |     )
21 | )
22 | 


--------------------------------------------------------------------------------
/R/ImportPreparer-class.R:
--------------------------------------------------------------------------------
 1 | ## these classes are used for dispatch only
 2 | 
 3 | setClass("ImportPreparer", representation="VIRTUAL")
 4 | 
 5 | setMethod(show, "ImportPreparer", function(object) {
 6 |     cat("class:", class(object), "\n")
 7 | })
 8 | 
 9 | setGeneric("newResources", signature="importPreparer",
10 |     function(importPreparer, currentMetadata = list(), ...)
11 |         standardGeneric("newResources")
12 | )
13 | 
14 | setGeneric("annotationHubRoot", signature="object",
15 |            function(object)
16 |            standardGeneric("annotationHubRoot"))
17 | 
18 | setGeneric("metadataList", signature="object",
19 |            function(object)
20 |            standardGeneric ("metadataList"))
21 | 
22 | setGeneric("metadataTable", signature="object",
23 |            function(object)
24 |            standardGeneric ("metadataTable"))
25 | 
26 | setGeneric("sourceUrls", signature="object",
27 |            function(object)
28 |            standardGeneric("sourceUrls"))
29 | 
30 | 


--------------------------------------------------------------------------------
/man/AnnotationHubData-package.Rd:
--------------------------------------------------------------------------------
 1 | \name{AnnotationHubData-package}
 2 | \alias{AnnotationHubRecipes}
 3 | \alias{AnnotationHubData-package}
 4 | 
 5 | \docType{package}
 6 | \title{
 7 | Transform public data resources into Bioconductor Data Structures
 8 | }
 9 | \description{
10 | These recipes convert a wide variety and a growing number of public bioinformatic data sets into easily-used standard Bioconductor data structures.
11 | }
12 | \details{
13 | This package provides a set of methods which convert bioinformatic data
14 | resources into standard Bioconductor data types.  For example, a UCSC
15 | genome browser track, expressed as a BED file, is converted into a
16 | GRanges object.  Not every valuable data resource can be transformed
17 | quite so easily; some require more elaborate transformation, and hence a
18 | more specialized recipe.  Every effort is made to limit the number of
19 | recipes required.  One strategy that helps with the principle of "zero
20 | curation":  unless absolutely required, the "cooked" version of the data
21 | resource produced by a recipe is a simple and unembellished reflection
22 | of the original data in its downloaded form.
23 | }
24 | \author{Dan Tenenbaum, Paul Shannon}
25 | 
26 | \seealso{\code{AnnotationHubMetadata-class}, \code{makeAnnotationHubMetadata}}
27 | 
28 | \keyword{package}
29 | 


--------------------------------------------------------------------------------
/vignettes/IntroductionToAnnotationHubData.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Introduction to AnnotationHubData"
 3 | author: "Lori Shepherd"
 4 | date: "Modified: February 2021. Compiled: `r format(Sys.Date(), '%d %b %Y')`"
 5 | output:
 6 |   BiocStyle::html_document:
 7 |     toc: true
 8 | ---
 9 | <!--
10 | %% \VignetteEngine{knitr::rmarkdown}
11 | %\VignetteIndexEntry{Introduction to AnnotationHubData}
12 | -->
13 | 
14 | 
15 | # Overview
16 | 
17 | The AnnotationHubData package package provides tools to acquire, annotate,
18 | convert and store data for use in Bioconductor's `AnnotationHub`. Most of the
19 | functions will be used by the Bioconcutor Core Team. For information on how to
20 | use `AnnotationHub` or how to create an `AnnotationHub` package please see the
21 | vignettes in `AnnotationHub`. 
22 | 
23 | # Creating an AnnotationHub Package or Converting to an AnnotationHub Package
24 | 
25 | Please see HubPub Vignette "CreateAHubPackage".
26 | ```
27 | vignette("CreateAHubPackage", package="HubPub")
28 | ```
29 | 
30 | # Historical vignettes
31 | 
32 | The process for adding data to `AnnotationHub` has evolved substantially since
33 | the first vignettes were written. Much of the information contained in those
34 | documents is outdated or applicable only to repeat-run recipes added to the
35 | code base. These documents have been retained for historical purposes and
36 | are located in the inst/scripts/ directory of the `AnnotationHubData` package.
37 | 
38 | 


--------------------------------------------------------------------------------
/man/upload_to_azure.Rd:
--------------------------------------------------------------------------------
 1 | \name{upload_to_azure}
 2 | \alias{upload_to_azure}
 3 | \alias{Azure}
 4 | \alias{DataLake}
 5 | \alias{AZURE_SAS_URL}
 6 | \title{Upload a file to Microsoft Azure Data Lake}
 7 | \description{This function is for uploading a file resource to the
 8 |   Microsoft Azure Data Lake.}
 9 | \usage{upload_to_azure(file, sas)}
10 | \arguments{
11 |   \item{file}{
12 |     The file or directory to upload.
13 |   }
14 |   \item{sas}{
15 |     A SAS url for the designated destination on Microsoft Azure Data Lake.
16 |   }
17 | }
18 | \details{
19 | Uses the \href{https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10}{azcopy Command Line Interface}
20 | to copy a file to Microsoft Azure Data Lake. Assumes azcopy is properly installed
21 | and that the \code{azcopy} program is in your PATH. The function
22 | performs a recursive automatically so it can take a file or directory
23 | for upload. The SAS URL is generated on Azure by someone who has
24 | permission to the desired destination. Please be sure to use the SAS url
25 | and not the SAS token. The sas url can be provided as an argument; if
26 | the argument is not provided it will search for a system environment
27 | variable `AZURE_SAS_URL`.
28 | }
29 | \value{
30 | \code{TRUE} on success. If the command fails, the function
31 | will exit with an error. 
32 | }
33 | \author{Lori Shepherd}
34 | 
35 | \examples{
36 | \dontrun{
37 | upload_to_azure("myfile.txt", "https://sasurl")
38 | }
39 | }
40 | 


--------------------------------------------------------------------------------
/inst/makefile:
--------------------------------------------------------------------------------
 1 | PKG=AnnotationHubRecipes
 2 | default: build install
 3 | 
 4 | help:
 5 | 	egrep "^#" makefile | sed "s/^#//"
 6 | 
 7 | # --- quickbuild: no vignette
 8 | #
 9 | quickbuild:
10 | 	(cd ../..; R CMD build --no-vignettes $(PKG))
11 | 
12 | 
13 | # --- build
14 | #
15 | build:
16 | 	(cd ../..; R CMD build --no-vignettes $(PKG))
17 | 
18 | # --- install
19 | #
20 | install:
21 | 	(cd ../..; R CMD install $(PKG))
22 | 
23 | # --- check
24 | #
25 | check: clean build install
26 | 	(cd ../..; R CMD check --no-manual --no-vignettes --no-codoc --no-examples --no-manual  $(PKG))
27 | 
28 | # --- checkfull
29 | #
30 | checkfull: 
31 | 	(cd ../..; R CMD build $(PKG))
32 | 	(cd ../..; R CMD check $(PKG))
33 | 
34 | 
35 | # --- vanillaTest
36 | # run all the unit tests, in a clean context
37 | #
38 | 
39 | vanillaTest:  build install
40 | 	- rm vanillaTest.out
41 | 	R --vanilla < vanillaTest.R > vanillaTest.out 2>&1
42 | 
43 | # --- vt
44 | # run all the unit tests, in a clean context
45 | #
46 | 
47 | vt: vanillaTest
48 | 
49 | 
50 | # --- checkvig
51 | # check just the vignette
52 | #
53 | 
54 | checkvig:
55 | 	(cd ../..; R CMD check --no-manual --no-codoc --no-tests --no-examples $(PKG))
56 | 
57 | 
58 | # --- tangle
59 | # extract the R code from the vignette file
60 | #
61 | 
62 | tangle:
63 | 	(cd ../vignettes; R CMD Stangle $(PKG).Rnw)
64 | 
65 | 
66 | 
67 | # --- sweave
68 | # creates $(PKG).tex, runs all embedded examples
69 | # run this before the pdf target
70 | #
71 | sweave: 
72 | 	(cd ../vignettes; R CMD Sweave $(PKG).Rnw --pdf)
73 | 
74 | # --- pdf
75 | # make and open $(PKG).pdf, the vignette 
76 | #
77 | 
78 | pdf:  sweave
79 | 	(cd ../vignettes; open $(PKG).pdf)
80 | 
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/R/makeAnnotationHubResource.R:
--------------------------------------------------------------------------------
 1 | ### =========================================================================
 2 | ### makeAnnotationHubResource()
 3 | ### -------------------------------------------------------------------------
 4 | ###
 5 | 
 6 | ## Creates a Preparer class and associated newResource() method.
 7 | .generalNewResources <- function(importPreparer, currentMetadata,
 8 |                                  makeAnnotationHubMetadataFunction, ...)
 9 | {
10 |     ## returns metadata 
11 |     ahms <- makeAnnotationHubMetadataFunction(currentMetadata, ...)
12 | 
13 |     ## add the importPreparer  
14 |     lapply(ahms, function(x) {
15 |         x@PreparerClass<-class(importPreparer)[1]
16 |         x
17 |     })
18 | }
19 | 
20 | makeAnnotationHubResource <- function(objName, 
21 |                                       makeAnnotationHubMetadataFunction,
22 |                                       ..., where=topenv(parent.frame()))
23 | {
24 |     ## create class
25 |     setClass(objName,
26 |              contains="ImportPreparer",
27 |              package="AnnotationHubData",
28 |              where=where)
29 |    
30 |     ## FIXME: This doesn't seem to be the case - ie, no handling of 'old'.
31 |     ## The job of this method is to only get resources that are "new"
32 |     ## It takes an arg of "old" AHMs that can be used for filtering.    
33 |     ## So it will call the makeAnnotationHubMetadataFunction, and then
34 |     ## toss out any currentMetadata() AHMs that are already present.
35 | 
36 |     ## create newResources method 
37 |     setMethod(newResources, objName, where=where,
38 |               function(importPreparer, currentMetadata=list(), ...) 
39 |     {
40 |         .generalNewResources(importPreparer, currentMetadata,
41 |                              makeAnnotationHubMetadataFunction, ...)
42 |     })
43 | }
44 | 


--------------------------------------------------------------------------------
/inst/unitTests/cases/encodeDCCMetadata/wgEncodeAwgDnaseUniform.info:
--------------------------------------------------------------------------------
1 | wgEncodeAwgDnaseDuke8988tUniPk.narrowPeak.gz	project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=8988T; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH001103; tableName=wgEncodeAwgDnaseDuke8988tUniPk; type=narrowPeak; md5sum=80fadeb7a14a72add38203910d937f50; size=1.7M
2 | wgEncodeAwgDnaseDukeAosmcUniPk.narrowPeak.gz	project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=AoSMC; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; tableName=wgEncodeAwgDnaseDukeAosmcUniPk; type=narrowPeak; md5sum=957b3477d43cef1c6abd41182b053418; size=1.5M
3 | wgEncodeAwgDnaseDukeChorionUniPk.narrowPeak.gz	project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=Chorion; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH000595; tableName=wgEncodeAwgDnaseDukeChorionUniPk; type=narrowPeak; md5sum=f0ce90b72c1cfaceda456e0dfd10db1e; size=1.6M
4 | wgEncodeAwgDnaseDukeCllUniPk.narrowPeak.gz	project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=CLL; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH001104; tableName=wgEncodeAwgDnaseDukeCllUniPk; type=narrowPeak; md5sum=fe463a299af6fbefa38beeba59426767; size=873K
5 | wgEncodeAwgDnaseDukeFibroblUniPk.narrowPeak.gz	project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=Fibrobl; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH000583; tableName=wgEncodeAwgDnaseDukeFibroblUniPk; type=narrowPeak; md5sum=4bf374cbbbda675e686c51de627a3d05; size=3.5M
6 | wgEncodeAwgDnaseDukeFibropUniPk.narrowPeak.gz	project=wgEncode; lab=Duke; composite=wgEncodeAwgDnaseUniPk; dataType=DnaseSeq; view=Peaks; cell=FibroP; treatment=None; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH000605; tableName=wgEncodeAwgDnaseDukeFibropUniPk; type=narrowPeak; md5sum=905497fc0eaa1631b19af0e91599bb89; size=2.3M
7 | 


--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
 1 | # Based on https://github.com/krlmlr/r-appveyor
 2 | # DO NOT CHANGE the "init" and "install" sections below
 3 | 
 4 | # Download script file from GitHub
 5 | init:
 6 |   ps: |
 7 |         $ErrorActionPreference = "Stop"
 8 |         Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1"
 9 |         Import-Module '..\appveyor-tool.ps1'
10 | 
11 | install:
12 |   ps: Bootstrap
13 | 
14 | # Adapt as necessary starting from here
15 | 
16 | build_script:
17 |   # The following configuration will use the development version of Bioconductor.  This
18 |   # is because under the hood, r-appveyor relies on r-travis, and r-travis sets an
19 |   # environment variable , BIOC_USE_DEVEL=${BIOC_USE_DEVEL:-"TRUE"}
20 |   #
21 |   # This is configurable, but devel is the default.  Here's the source:
22 |   #     https://github.com/craigcitro/r-travis/blob/master/scripts/travis-tool.sh#L11
23 |   #
24 |   # Notice: We'll need to decide on a workflow, if we want to provide CI builds of
25 |   # release versions in AppVeyor .
26 |   - git config --global user.name "travis"
27 |   - git config --global user.email "travis@example.org"
28 |   - travis-tool.sh install_bioc_deps
29 |   - travis-tool.sh install_bioc grasp2db
30 | 
31 | test_script:
32 |   - travis-tool.sh run_tests
33 | 
34 | on_failure:
35 |   - 7z a failure.zip *.Rcheck\*
36 |   - appveyor PushArtifact failure.zip
37 | 
38 | environment:
39 |   global:
40 |     WARNINGS_ARE_ERRORS: 1
41 |     _R_CHECK_FORCE_SUGGESTS_: 0
42 |     R_ARCH: x64
43 |     USE_RTOOLS: true ## to be able to use Remotes (i.e. packages from non-CRAN sources)
44 | 
45 |   matrix:
46 |     - R_VERSION: release
47 | 
48 | artifacts:
49 |   - path: '*.Rcheck\**\*.log'
50 |     name: Logs
51 | 
52 |   - path: '*.Rcheck\**\*.out'
53 |     name: Logs
54 | 
55 |   - path: '*.Rcheck\**\*.fail'
56 |     name: Logs
57 | 
58 |   - path: '*.Rcheck\**\*.Rout'
59 |     name: Logs
60 | 
61 |   - path: '\*_*.tar.gz'
62 |     name: Bits
63 | 
64 |   - path: '\*_*.zip'
65 |     name: Bits
66 | 
67 | cache:
68 |   - C:\RLibrary
69 | 


--------------------------------------------------------------------------------
/man/upload_to_S3.Rd:
--------------------------------------------------------------------------------
 1 | \name{upload_to_S3}
 2 | \alias{upload_to_S3}
 3 | \alias{S3}
 4 | \alias{amazon}
 5 | \alias{AWS}
 6 | \title{
 7 | Upload a file to Amazon S3
 8 | }
 9 | \description{
10 | This function is for uploading a file resource to the S3 cloud.
11 | }
12 | \usage{
13 | upload_to_S3(file, remotename, bucket, profile, acl="public-read")
14 | }
15 | %- maybe also 'usage' for other objects documented here.
16 | \arguments{
17 |   \item{file}{
18 | The file to upload.
19 | }
20 |   \item{remotename}{
21 | The name this file should have in S3, including any "keys"
22 | that are part of the name. This should not start with 
23 | a slash (if it does, the leading slash will be removed),
24 | but can contain forward slashes.
25 | }
26 |   \item{bucket}{
27 | Name of the S3 bucket to copy to.
28 | }
29 |   \item{profile}{
30 | Corresponds to a profile set in the config file for the AWS CLI
31 | (see \href{http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html#cli-multiple-profiles}{the documentation}).
32 | If this argument is omitted,the default profile is used.
33 | }
34 |   \item{acl}{
35 | Should be one of \code{private}, \code{public-read}, or \code{public-read-write}.
36 | }
37 | }
38 | \details{
39 | Uses the \href{https://aws.amazon.com/cli/}{AWS Command Line Interface}
40 | to copy a file to Amazon S3. Assumes the CLI is properly configured
41 | and that the \code{aws} program is in your PATH. The CLI should be
42 | configured with the credentials of a user who has permission to
43 | upload to the appropriate bucket. It's recommended to use
44 | \href{https://aws.amazon.com/iam/}{IAM} to set up users
45 | with limited permissions.
46 | 
47 | There is an \code{RAmazonS3} package but it seems to have issues
48 | uploading files to S3.
49 | }
50 | \value{
51 | \code{TRUE} on success. If the command fails, the function
52 | will exit with an error. 
53 | }
54 | \author{Dan Tenenbaum}
55 | 
56 | \examples{
57 | \dontrun{
58 | upload_to_S3("myfile.txt", "foo/bar/baz/yourfile.txt")
59 | # If this is successful, the file should be accessible at 
60 | # http://s3.amazonaws.com/annotationhub/foo/bar/baz/yourfile.txt
61 | }
62 | }
63 | 


--------------------------------------------------------------------------------
/R/makeUCSC2Bit.R:
--------------------------------------------------------------------------------
 1 | make2bit <- function(currentMetadata, justRunUnitTest=FALSE, 
 2 |                      BiocVersion=BiocManager::version()) {
 3 |     rsrc <- .getUCSCResources(fileType="2bit", dirName="bigZips", 
 4 |                               fileName=".2bit", verbose=TRUE, justRunUnitTest)
 5 |     ## input_sources table
 6 |     sourceSize <- as.numeric(rsrc$size)
 7 |     sourceUrls <- rsrc$fileurl
 8 |     sourceVersion <- gsub(" ", "_", rsrc$date) 
 9 |     sourceLastModifiedDate <- rsrc$date
10 |     rdatapaths <- gsub(.ucscBase, "",sourceUrls)
11 |     md5sum <- rsrc$md5sum
12 |     
13 |     ## resources table
14 |     species <- rsrc$organism   
15 |     genome <- rsrc$from
16 |     taxonomyId <- as.integer(rsrc$taxid)           
17 |     title <- rownames(rsrc) 
18 |     description <- sprintf("UCSC 2 bit file for %s ", rsrc$from)
19 |     
20 |     Map(AnnotationHubMetadata,
21 |         
22 |         SourceSize = sourceSize,
23 |         SourceUrl = sourceUrls,
24 |         SourceVersion = sourceVersion,
25 |         SourceLastModifiedDate = sourceLastModifiedDate,
26 |         SourceMd5 =md5sum,
27 |         
28 |         Description = description,
29 |         Title = title,
30 |         Genome = genome,
31 |         Species = species, 
32 |         TaxonomyId = taxonomyId,
33 |         
34 |         RDataPath = rdatapaths,
35 |         
36 |         MoreArgs=list(
37 |             BiocVersion=BiocVersion,
38 |             # input sources 
39 |             SourceType = "TwoBit",
40 |             
41 |             # resources
42 |             DataProvider = "UCSC",
43 |             Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",         
44 |             Coordinate_1_based = FALSE,
45 |             Location_Prefix = .ucscBase,
46 |             RDataDateAdded = Sys.time(),
47 |                         
48 |             #rdata table
49 |             DispatchClass= "TwoBitFile" ,
50 |             RDataClass = "TwoBitFile",
51 |             
52 |             Recipe = NA_character_, 
53 |             Tags = c("2bit", "UCSC", "genome" )))
54 | }
55 | 
56 | makeAnnotationHubResource("UCSC2BitPreparer", make2bit)
57 | 


--------------------------------------------------------------------------------
/inst/unitTests/test_validityFunctions.R:
--------------------------------------------------------------------------------
 1 | txdb <- GenomeInfoDb::loadTaxonomyDb()
 2 | txdb <- rbind(txdb, c(NA, NA, ""))
 3 | 
 4 | test_getSpeciesList <- function(){
 5 |     list <- getSpeciesList()
 6 |     checkTrue(length(list) == dim(txdb)[1])
 7 | }
 8 | 
 9 | test_validSpecies <- function(){
10 | 
11 |     checkTrue(validSpecies("Homo sapiens", verbose=FALSE))
12 |     checkTrue(!validSpecies("Homo Sapiens", verbose=FALSE))
13 |     checkTrue(validSpecies(NA_character_))
14 | }
15 | 
16 | test_suggestSpecies <- function(){
17 | 
18 |     vl1 <- Reduce(`|`, lapply(txdb[2:3], grepl, pattern = "Dictyoglomus",
19 |         ignore.case=TRUE))
20 |     vl2 <- Reduce(`|`, lapply(txdb[2:3], grepl, pattern = "immobile",
21 |         ignore.case=TRUE))
22 | 
23 |     out <- suggestSpecies(c("Dictyoglomus", "immobile"))
24 |     checkTrue((length(which(vl1)) + length(which((vl2)))) == dim(out)[1])
25 | }
26 | 
27 | test_validTaxId <- function(){
28 | 
29 |     checkTrue(is.null(AnnotationHubData::checkSpeciesTaxId(9606,
30 |                                                            "Homo sapiens")))
31 |     options(warn=2)
32 |     checkException(AnnotationHubData::checkSpeciesTaxId(9999, "Homo sapiens"))
33 |     options(warn=0)
34 | }
35 | 
36 | test_validDispatchClass <- function(){
37 | 
38 |     checkTrue(validDispatchClass("GRanges"))
39 |     checkTrue(validDispatchClass(c("GRanges", "Rda")))
40 |     checkTrue(!validDispatchClass("somethingNotThere"))
41 |     checkTrue(!validDispatchClass(c("GRanges", "somethingNotThere")))
42 | }
43 | 
44 | test_FileLengths <- function(){
45 | 
46 |     checkTrue(AnnotationHubData:::.checkFileLengths(
47 |         RDataPath = c("package/example1.bam", "package/example2.bai"),
48 |         DispatchClass="BamFile"))
49 |     checkException(AnnotationHubData:::.checkFileLengths(
50 |         RDataPath = c("package/example1.bai", "package/example2.bam"),
51 |         DispatchClass="BamFile"))
52 |     checkException(AnnotationHubData:::.checkFileLengths(
53 |         RDataPath = "package/example1.bam",
54 |         DispatchClass="BamFile"))
55 |      checkTrue(AnnotationHubData:::.checkFileLengths(
56 |         RDataPath = "package/example1.rda",
57 |         DispatchClass="Rda"))
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: AnnotationHubData
 2 | Type: Package
 3 | Title: Transform public data resources into Bioconductor Data Structures
 4 | Version: 1.41.0
 5 | Encoding: UTF-8
 6 | Authors@R: c(
 7 | 	person("Martin", "Morgan", role="ctb"),
 8 | 	person("Marc", "Carlson", role="ctb"),
 9 | 	person("Dan", "Tenenbaum", role="ctb"),
10 | 	person("Sonali", "Arora", role="ctb"),
11 | 	person("Paul", "Shannon", role="ctb"),
12 | 	person("Lori", "Shepherd", role="ctb"),
13 | 	person("Bioconductor Package Maintainer",
14 | 	    email="maintainer@bioconductor.org", role="cre")
15 | 	)
16 | Depends: R (>= 3.2.2), methods, utils, S4Vectors (>= 0.7.21),
17 |     IRanges (>= 2.3.23), GenomicRanges, AnnotationHub (>= 2.15.15)
18 | Suggests: RUnit, knitr, BiocStyle, grasp2db, GenomeInfoDbData, rmarkdown, HubPub
19 | Imports: GenomicFeatures, Rsamtools, rtracklayer, BiocGenerics,
20 |     jsonlite, BiocManager, biocViews, BiocCheck, graph,
21 |     AnnotationDbi, Biobase, Biostrings, DBI, Seqinfo,
22 |     GenomeInfoDb (>= 1.45.5), OrganismDbi, RSQLite,
23 |     AnnotationForge, futile.logger (>= 1.3.0), XML, RCurl
24 | Description: These recipes convert a wide variety and a growing number of
25 |     public bioinformatic data sets into easily-used standard Bioconductor data
26 |     structures.
27 | License: Artistic-2.0
28 | LazyLoad: yes
29 | biocViews: DataImport
30 | VignetteBuilder: knitr
31 | Collate:
32 |     Message-class.R
33 |     ImportPreparer-class.R
34 |     makeAnnotationHubResource.R
35 |     HubMetadata-class.R
36 |     AnnotationHubMetadata-class.R
37 |     utils.R
38 |     updateResources.R
39 |     ahmToJson.R
40 |     webAccessFunctions.R
41 |     makeChEA.R
42 |     makedbSNPVCF.R
43 |     makeEncodeDCC.R
44 |     makeEnsemblGtfToGRanges.R
45 |     makeEnsemblFasta.R
46 |     makeEpigenomeRoadmap.R
47 |     makeGencodeFasta.R
48 |     makeGencodeGFF.R
49 |     makeGrasp2Db.R
50 |     makeHaemCode.R
51 |     makeInparanoid8ToDbs.R
52 |     makeNCBIToOrgDbs.R
53 |     makeStandardOrgDbsToSqlite.R
54 |     makeStandardTxDbsToSqlite.R
55 |     makeRefNet.R
56 |     makeUCSCChain.R
57 |     makeUCSC2Bit.R
58 |     makeUCSCTracks.R
59 |     trackWithAuxiliaryTableToGRangesRecipe.R
60 |     UCSCTrackUpdateChecker.R
61 |     makeEnsemblTwoBit.R
62 |     validationFunctions.R
63 | 


--------------------------------------------------------------------------------
/inst/scripts/addContributedResources.txt:
--------------------------------------------------------------------------------
 1 | ## Contributed Annotations:
 2 | 
 3 | This doc describes how to add contributed (i.e., non-core generated)
 4 | resources to AnnotationHub. In general, these instructions pertain
 5 | to core team members only.
 6 | 
 7 | * Case 1: Single resources with no accompanying software package
 8 | 
 9 | - Metadata
10 | 
11 |   Author follows these instructions to create a .R file that generates
12 |   metadata for the resource(s):
13 | 
14 |   http://www.bioconductor.org/packages/3.5/bioc/vignettes/AnnotationHubData/inst/doc/IntroductionToAnnotationHubData.html#individual-resources
15 | 
16 | - Test metadata
17 | 
18 |   Test the .R file provided by the author with
19 |   AnnotationHubData::AnnotationHubMetadata(). Confirm the metadata fields
20 |   are valid (reasonable title, version) and the paths are accurate.
21 | 
22 | - Add metadata
23 | 
24 |   Add the metadata to the production database with the AnnotationHub docker.
25 | 
26 | 
27 | * Case 2: Family of resources with accompanying software package
28 | 
29 | - Software package
30 | 
31 |   Author creates a software package according to guidelines here:
32 | 
33 |   http://www.bioconductor.org/packages/3.5/bioc/vignettes/AnnotationHubData/inst/doc/IntroductionToAnnotationHubData.html#family-of-resources
34 | 
35 | - Test metadata
36 | 
37 |   Check the metadata with AnnotationHubData::makeAnnnotationHubMetadata().
38 |   There can be more than one metadata.csv file, e.g., ensembl_version86.csv,
39 |   ensembl_version87.csv etc. The package should have record of all
40 |   metadata added over time. For example, when version 88 files are added
41 |   they should not remove the csv files for versions 86 and 87.
42 | 
43 | - Add resources
44 | 
45 |   The resources can be 'stored' on a web site or in an S3 bucket. If they 
46 |   will be in S3, follow these steps:
47 | 
48 |   -- Create a new S3 bucket under annotationhub/ with the same name as the 
49 |      software package.
50 |   -- Either the core team member adds the resources to the S3 bucket or 
51 |      the contributor adds them as the AnnotationContributor user. See the
52 |      AnnotationHubData vignette for more details on the AnnotationContributor
53 |      user.
54 | 
55 |      Once the resources are in the proper place, confirm they are public and 
56 |      can be downloaded by anyone.
57 | 
58 | - Add metadata
59 | 
60 |   Add the metadata to the production database with the AnnotationHub docker.
61 | 


--------------------------------------------------------------------------------
/R/makeChEA.R:
--------------------------------------------------------------------------------
 1 | ### =========================================================================
 2 | ### makeChEA()
 3 | ### -------------------------------------------------------------------------
 4 | ###
 5 | 
 6 | ### Recipe for ChEA transcription factor background file.
 7 | 
 8 | makeChEAToAHM <- function(currentMetadata, 
 9 |                           baseUrl="http://amp.pharm.mssm.edu/",
10 |                           justRunUnitTest=FALSE,
11 |                           BiocVersion=BiocManager::version()) 
12 | {
13 |     files <- "result/kea/chea-background.zip"
14 | 
15 |     files <- paste0(baseUrl, files)
16 |     rsrc <- .httrFileInfo(files, verbose=FALSE)
17 |     title <- basename(rsrc$fileurl)
18 |         
19 |     ## input_sources table
20 |     sourceSize <- as.numeric(rsrc$size)
21 |     sourceUrls <- rsrc$fileurl
22 |     sourceVersion <- gsub(" ", "_", rsrc$date) 
23 |     sourceLastModifiedDate <- rsrc$date
24 |     
25 |     ## resources table
26 |     description <- .expandLine("ChEA background file, containing  
27 |         transcription factor data to run ChEA") 
28 |     
29 |     Map(AnnotationHubMetadata,
30 |         SourceSize = sourceSize,
31 |         SourceUrl = sourceUrls,
32 |         SourceVersion = sourceVersion,
33 |         SourceLastModifiedDate = sourceLastModifiedDate,
34 |         
35 |         Description = description,
36 |         Title = title,
37 |         
38 |         RDataPath = gsub(baseUrl, "", sourceUrls),
39 |         
40 |         MoreArgs=list(
41 |             BiocVersion=BiocVersion,
42 |             # input sources 
43 |             SourceType = "Zip",
44 |             
45 |             # resources
46 |             Species = NA_character_, 
47 |             TaxonomyId = NA_integer_,
48 |             Genome = NA_character_,
49 |             DataProvider = "ChEA",
50 |             Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",         
51 |             Coordinate_1_based = FALSE,
52 |             Location_Prefix = baseUrl,
53 |             RDataDateAdded = Sys.time(),
54 |                         
55 |             #rdata table
56 |             DispatchClass = "ChEA",
57 |             RDataClass = "data.frame",
58 |             
59 |             Tags = c("ChEA","Transcription Factors"),
60 |             
61 |             Recipe = NA_character_ ))
62 | }
63 | 
64 | makeAnnotationHubResource("ChEAImportPreparer", makeChEAToAHM)
65 | 


--------------------------------------------------------------------------------
/R/makeEnsemblGtfToGRanges.R:
--------------------------------------------------------------------------------
 1 | ## As of July 2016 this recipe was modified to store only metadata
 2 | ## and no files in S3. AnnotationHub will expose available GTF files
 3 | ## from Ensembl and the AnnotationHub::GTFFile dispatch class will
 4 | ## convert the GTF to GRanges on the fly.
 5 | 
 6 | .ensemblGtfSourceUrls <-
 7 |     function(baseDir, baseUrl, release, justRunUnitTest, verbose=FALSE)
 8 | {
 9 |     want <- paste0(baseUrl, "release-", release, paste0("/", baseDir))
10 |     urls <- unlist(lapply(want, function(url) {
11 |         listing <- .ftpDirectoryInfo(url)
12 |         subdir <- sub(".* ", "", listing[grep("^drwx", listing)])
13 |         paste0(url, subdir, "/")
14 |     }), use.names=FALSE)
15 | 
16 |     if(justRunUnitTest)
17 |         urls <- urls[1:2] ## 2 organisms; possibly more files
18 | 
19 |     df <- .ftpFileInfo(urls, ".gtf.gz", verbose=verbose)
20 |     rownames(df) <- NULL
21 |     df
22 | }
23 | 
24 | makeEnsemblGtfToAHM <-
25 |     function(currentMetadata, baseUrl = "ftp://ftp.ensembl.org/pub/",
26 |              baseDir = "gtf/", release, justRunUnitTest = FALSE, 
27 |              BiocVersion = BiocManager::version(), ...)
28 | {
29 |     ## get all file urls, size, date
30 |     df <- .ensemblGtfSourceUrls(baseDir, baseUrl, release, 
31 |                                 justRunUnitTest, ...)
32 | 
33 |     sourceUrls <- df$fileurl
34 |     rdatapath <- gsub(baseUrl, "", sourceUrls)
35 | 
36 |     ## get genome, species, version, title
37 |     meta <- .ensemblMetadataFromUrl(sourceUrls)
38 |     description <- paste("Gene Annotation for", meta$species)
39 | 
40 |     Map(AnnotationHubMetadata,
41 |         Description=description, Genome=meta$genome,
42 |         SourceUrl=sourceUrls,
43 |         SourceSize=as.numeric(df$size),
44 |         SourceLastModifiedDate=df$date,
45 |         SourceVersion=meta$sourceVersion,
46 |         Species=meta$species,
47 |         RDataPath=rdatapath,
48 |         TaxonomyId=meta$taxonomyId, Title=meta$title,
49 |         MoreArgs=list(
50 |             BiocVersion=BiocVersion,
51 |             Coordinate_1_based=TRUE,
52 |             DataProvider="Ensembl",
53 |             Maintainer="Bioconductor Maintainer <maintainer@bioconductor.org>",
54 |             RDataClass="GRanges",
55 |             DispatchClass="GTFFile",
56 |             SourceType="GTF",
57 |             Location_Prefix=baseUrl,
58 |             RDataDateAdded=Sys.time(),
59 |             Recipe=NA_character_,
60 |             Tags=c("GTF", "ensembl", "Gene", "Transcript", "Annotation")))
61 | }
62 | 
63 | makeAnnotationHubResource("EnsemblGtfImportPreparer", makeEnsemblGtfToAHM)
64 | 


--------------------------------------------------------------------------------
/man/ImportPreparer-class.Rd:
--------------------------------------------------------------------------------
 1 | \name{ImportPreparer-class}
 2 | \docType{class}
 3 | \alias{newResources}
 4 | \alias{getImportPreparer}
 5 | 
 6 | \alias{ImportPreparer-class}
 7 | \alias{show,ImportPreparer-method}
 8 | \alias{newResources,ImportPreparer-method}
 9 | 
10 | \alias{UCSCTrackImportPreparer-class}
11 | \alias{UCSCTrackImportPreparer}
12 | \alias{newResources,UCSCTrackImportPreparer-method}
13 | \alias{newResources,UCSCFullTrackImportPreparer-method}
14 | 
15 | \alias{HaemCodeImportPreparer-class}
16 | \alias{HaemCodeImportPreparer}
17 | \alias{newResources,HaemCodeImportPreparer-method}
18 | 
19 | \alias{EncodeImportPreparer-class}
20 | \alias{EncodeImportPreparer}
21 | \alias{newResources,EncodeImportPreparer-method}
22 | 
23 | \alias{annotationHubRoot}
24 | \alias{metadataList}
25 | \alias{metadataTable}
26 | \alias{sourceUrls}
27 | 
28 | \alias{EnsemblFastaImportPreparer-class}
29 | \alias{EnsemblFastaImportPreparer}
30 | \alias{newResources,EnsemblFastaImportPreparer-method}
31 | 
32 | \alias{EnsemblGtfImportPreparer-class}
33 | \alias{EnsemblGtfImportPreparer}
34 | \alias{newResources,EnsemblGtfImportPreparer-method}
35 | 
36 | \alias{RefNetImportPreparer-class}
37 | \alias{RefNetImportPreparer}
38 | \alias{newResources,RefNetImportPreparer-method}
39 | 
40 | \alias{dbSNPVCFImportPreparer-class}
41 | \alias{dbSNPVCFImportPreparer}
42 | \alias{newResources,dbSNPVCFImportPreparer-method}
43 | 
44 | \alias{Grasp2ImportPreparer-class}
45 | \alias{Grasp2ImportPreparer}
46 | \alias{newResources,Grasp2ImportPreparer-method}
47 | 
48 | \alias{Inparanoid8ImportPreparer-class}
49 | \alias{Inparanoid8ImportPreparer}
50 | \alias{newResources,Inparanoid8ImportPreparer-method}
51 | 
52 | \alias{NCBIImportPreparer-class}
53 | \alias{NCBIImportPreparer}
54 | \alias{newResources,NCBIImportPreparer-method}
55 | 
56 | \alias{UCSCChainPreparer-class}
57 | \alias{UCSCChainPreparer}
58 | \alias{newResources,UCSCChainPreparer-method}
59 | 
60 | 
61 | \title{Class \code{ImportPreparer} and generic \code{newResources}}
62 | 
63 | \description{
64 | 
65 |   The \code{ImportPreparer} and derived classes are used for dispatch
66 |   during data discovery (see \code{\link{newResources}}). There is one
67 |   \code{ImportPreparer} class for each data source for
68 |   \code{\link{AnnotationHubMetadata}}.
69 | 
70 |   \code{newResources} is a generic function; with methods implemented
71 |   for each \code{ImportPreparer}.
72 | 
73 | }
74 | 
75 | \author{Martin Morgan}
76 | 
77 | \seealso{
78 |   \code{\linkS4class{AnnotationHubMetadata}}.
79 | }
80 | 
81 | \examples{
82 | getImportPreparerClasses()
83 | }
84 | 
85 | \keyword{classes}
86 | 
87 | 
88 | 
89 | 
90 | 


--------------------------------------------------------------------------------
/R/makeGencodeFasta.R:
--------------------------------------------------------------------------------
 1 | ### =========================================================================
 2 | ### makeGencodeFastaToAHM() and gencodeFastaToFaFile()
 3 | ### -------------------------------------------------------------------------
 4 | ###
 5 | 
 6 | ### Recipe for human and mouse fasta files.
 7 | ### http://www.gencodegenes.org/releases/
 8 | ### ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human
 9 | ### the above has been updated to
10 | ### ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human
11 | ### Files downloaded are listed in AnnotationHubData:::.gencodeDescription().
12 | 
13 | ### metadata generator
14 | makeGencodeFastaToAHM <- function(currentMetadata, 
15 |                                 baseUrl="ftp://ftp.ebi.ac.uk/pub/databases/gencode/",
16 |                                 species=c("Human", "Mouse"), release, 
17 |                                 justRunUnitTest=FALSE,
18 |                                 BiocVersion=BiocManager::version())
19 | {
20 |     species <- match.arg(species)
21 |     rsrc <- .gencodeSourceUrls(species, release, filetype="fasta", 
22 |                                justRunUnitTest)
23 |  
24 |     rdatapath <- rsrc$rdatapath
25 |     rdps <- rep(rdatapath, each=3)
26 |     rdatapaths <- split(rdps, f=as.factor(rep(seq_along(rdatapath),each=3)))
27 |     rdatapath <- lapply(rdatapaths, 
28 |         function(x){
29 |             x[1] <- sub("gz","bgz", x[1])
30 |             x[2] <- paste0(x[1],".fai")
31 |             x[3] <- paste0(x[1],".gzi")
32 |             x
33 |         })
34 | 
35 |     description <- rsrc$description
36 |     title <- basename(rsrc$fileurl)
37 |     genome <- rsrc$genome
38 |     sourceUrls <- rsrc$fileurl
39 |     sourceVersion <- as.character(rsrc$date) ## should be character
40 |     SourceLastModifiedDate <- rsrc$date  ## should be "POSIXct" "POSIXt"
41 |     SourceSize <- as.numeric(rsrc$size)
42 |     tags <- strsplit(rsrc$tag, ",")
43 |     species <- rsrc$species
44 |     taxid <- rsrc$taxid
45 |     
46 |     Map(AnnotationHubMetadata,
47 |         Description=description, 
48 |         Genome=genome,
49 |         SourceUrl=sourceUrls,
50 |         SourceSize=SourceSize,
51 |         SourceLastModifiedDate=SourceLastModifiedDate,
52 |         SourceVersion=sourceVersion,
53 |         Species=species,
54 |         RDataPath=rdatapath,
55 |         TaxonomyId=taxid, 
56 |         Title=title,
57 | 	Tags=tags, 
58 |         MoreArgs=list(
59 |           Coordinate_1_based = TRUE,
60 |           DataProvider = "Gencode",
61 |           Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
62 |           RDataClass = c("FaFile", "FaFile", "FaFile"),
63 |           DispatchClass="FaFile",
64 |           SourceType="FASTA",  
65 |           Location_Prefix="https://bioconductorhubs.blob.core.windows.net/annotationhub/",
66 |           RDataDateAdded = Sys.time(),
67 |           Recipe="AnnotationHubData:::gencodeFastaToFaFile")) 
68 | }
69 | 
70 | gencodeFastaToFaFile <- function(ahm)
71 | {
72 |     .fastaToFaFile(ahm)
73 | }
74 | 
75 | ## create dispatch class and newResources() method
76 | makeAnnotationHubResource("GencodeFastaImportPreparer", makeGencodeFastaToAHM)
77 | 


--------------------------------------------------------------------------------
/inst/unitTests/cases/encodeDCCMetadata/wgEncodeAffyRnaChip.info:
--------------------------------------------------------------------------------
1 | wgEncodeAffyRnaChipFiltTransfragsGm12878CellTotal.broadPeak.gz	project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=cell; rnaExtract=total; origAssembly=hg18; dataVersion=ENCODE Feb 2009 Freeze; dccAccession=wgEncodeEH000016; dateSubmitted=2009-03-10; dateUnrestricted=2009-12-10; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878CellTotal; type=broadPeak; md5sum=c4049a3cfbb6b3e74cb776ee4e4309f9; size=13M
2 | wgEncodeAffyRnaChipFiltTransfragsGm12878CytosolLongnonpolya.broadPeak.gz	project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=cytosol; rnaExtract=longNonPolyA; origAssembly=hg18; dataVersion=ENCODE Nov 2008 Freeze; dccAccession=wgEncodeEH000012; dateSubmitted=2008-12-09; dateUnrestricted=2009-09-09; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878CytosolLongnonpolya; type=broadPeak; md5sum=08d5feb7211a40c99a5ce374c6d0a169; size=12M
3 | wgEncodeAffyRnaChipFiltTransfragsGm12878CytosolLongpolya.broadPeak.gz	project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=cytosol; rnaExtract=longPolyA; origAssembly=hg18; dataVersion=ENCODE Nov 2008 Freeze; dccAccession=wgEncodeEH000002; dateSubmitted=2008-11-21; dateUnrestricted=2009-08-21; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878CytosolLongpolya; type=broadPeak; md5sum=9cb30fe0ff4c6c6d9bf9add1957a77f5; size=13M
4 | wgEncodeAffyRnaChipFiltTransfragsGm12878NucleolusTotal.broadPeak.gz	project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=nucleolus; rnaExtract=total; origAssembly=hg18; dataVersion=ENCODE Sep 2009 Freeze; dccAccession=wgEncodeEH000026; dateSubmitted=2009-08-27; dateUnrestricted=2010-05-27; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878NucleolusTotal; type=broadPeak; md5sum=86fa269224f528b52bf15faa387af12d; size=14M
5 | wgEncodeAffyRnaChipFiltTransfragsGm12878NucleusLongnonpolya.broadPeak.gz	project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=nucleus; rnaExtract=longNonPolyA; origAssembly=hg18; dataVersion=ENCODE Nov 2008 Freeze; dccAccession=wgEncodeEH000003; dateSubmitted=2008-11-21; dateUnrestricted=2009-08-21; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878NucleusLongnonpolya; type=broadPeak; md5sum=c8d8604e208bcec85075a97e2d855c71; size=17M
6 | wgEncodeAffyRnaChipFiltTransfragsGm12878NucleusLongpolya.broadPeak.gz	project=wgEncode; grant=Gingeras; lab=Affy; composite=wgEncodeAffyRnaChip; dataType=RnaChip; view=FiltTransfrags; cell=GM12878; localization=nucleus; rnaExtract=longPolyA; origAssembly=hg18; dataVersion=ENCODE Feb 2009 Freeze; dccAccession=wgEncodeEH000017; dateSubmitted=2009-04-01; dateUnrestricted=2010-01-01; subId=2121; tableName=wgEncodeAffyRnaChipFiltTransfragsGm12878NucleusLongpolya; type=broadPeak; md5sum=e59db684bc92f9e33587c70d4e93de39; size=17M
7 | 


--------------------------------------------------------------------------------
/R/makeRefNet.R:
--------------------------------------------------------------------------------
 1 | ## old file - RefNetImportPreparer-class.R 
 2 | ##changes - title should have full file name, species is "Homo sapiens" not 9606
 3 | ## what is RefNet Genome?
 4 | ## tags looks like "interactions, interactions from gerstein-2012"
 5 | 
 6 | .amazonBaseUrl <- "https://bioconductorhubs.blob.core.windows.net/annotationhub/"
 7 | 
 8 | .getRefNetFileURIs <- function() { 
 9 |     # everything is embedded in the second line of xml
10 |     .refNetbase.url  <- paste0(.amazonBaseUrl, "refnet/")
11 |     filenames <- c("gerstein-2012.tsv_0.0.1.RData" ,
12 |                 "hypoxiaSignaling-2006.tsv_0.0.1.RData", 
13 |                 "stamlabTFs-2012.tsv_0.0.1.RData", 
14 |                 "recon202.tsv_0.0.1.RData")      
15 |     paste0(.refNetbase.url, filenames)
16 | } 
17 | 
18 | 
19 | .refnetFiles <- function() {
20 |     files <- .getRefNetFileURIs()
21 |     df <- .httrFileInfo(files, verbose=FALSE)
22 |     title <- basename(files)
23 |     
24 |     filename.stem <- sub(".tsv_0.0.1.RData", "", title)
25 |     description <- sprintf("Interactions from %s", filename.stem)
26 |     cbind(df, title,  description, stringsAsFactors=FALSE)
27 | }
28 | 
29 | 
30 | makeRefNetImporter <- function(currentMetadata, justRunUnitTest=FALSE,
31 |                                BiocVersion=BiocManager::version()) {
32 |     rsrc <- .refnetFiles()
33 |     
34 |     ## input_sources table
35 |     sourceSize <- as.numeric(rsrc$size)
36 |     sourceUrls <- gsub("_0.0.1.RData", "", rsrc$fileurl)
37 |     sourceVersion <- gsub(" ", "_", rsrc$date) 
38 |     sourceLastModifiedDate <- rsrc$date
39 |     
40 |     ## resources table
41 |     
42 |     title <- rsrc$title
43 |     description <- rsrc$description
44 |     
45 |     Tags <- lapply(rsrc$description, function(x) {
46 |         c("refNet","interactions", x)
47 |     })
48 |     
49 |     Map(AnnotationHubMetadata,
50 |         
51 |         SourceSize = sourceSize,
52 |         SourceUrl = sourceUrls,
53 |         SourceVersion = sourceVersion,
54 |         SourceLastModifiedDate = sourceLastModifiedDate,
55 |         
56 |         Description = description,
57 |         Title = title,
58 |             
59 |         RDataPath = gsub(.amazonBaseUrl, "",sourceUrls),
60 |         
61 |         MoreArgs=list(
62 |             BiocVersion=BiocVersion,
63 |             # input sources 
64 |             SourceType = "RData",
65 |             
66 |             # resources
67 |             Species = "Homo sapiens", 
68 |             TaxonomyId = 9606L,
69 |             Genome = "RefNet Genome",
70 |             DataProvider = "RefNet",
71 |             Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",         
72 |             Coordinate_1_based = FALSE,
73 |             Location_Prefix = .amazonBaseUrl,
74 |             RDataDateAdded = Sys.time(),
75 |             
76 |             
77 |             #rdata table
78 |             DispatchClass = "data.frame" ,
79 |             RDataClass = "data.frame",
80 |             
81 |             Tags = c("refnet", "interactions"),
82 |             Recipe= NA_character_))
83 | }
84 | 
85 | makeAnnotationHubResource("RefNetImportPreparer", makeRefNetImporter)
86 | 


--------------------------------------------------------------------------------
/man/makeGencodeFasta.Rd:
--------------------------------------------------------------------------------
  1 | \name{makeGencodeFasta}
  2 | 
  3 | \alias{makeGencodeFasta}
  4 | \alias{makeGencodeFastaToAHM}
  5 | \alias{gencodeFastaToFaFile}
  6 | 
  7 | \title{
  8 |   Recipe to add Gencode FASTA resources to AnnotationHub
  9 | }
 10 | 
 11 | \description{
 12 |   Create metadata and process raw Gencode FASTA files for inclusion in
 13 |   AnnotationHub
 14 | }
 15 | 
 16 | \usage{
 17 | makeGencodeFastaToAHM(currentMetadata, 
 18 |                       baseUrl="ftp://ftp.ebi.ac.uk/pub/databases/gencode/",
 19 |                       species=c("Human", "Mouse"), release, 
 20 |                       justRunUnitTest=FALSE, 
 21 |                       BiocVersion=BiocManager::version())
 22 | 
 23 | gencodeFastaToFaFile(ahm)
 24 | }
 25 | 
 26 | \details{
 27 |   \describe{
 28 |     \item{Documentation:}{
 29 |       http://www.gencodegenes.org/releases/
 30 |     }
 31 |     \item{File download location:}{
 32 |       ftp://ftp.ebi.ac.uk/pub/databases/gencode/. Gencode_human and
 33 |       Gencode_mouse are used.
 34 |     }
 35 |     \item{Files downloaded:}{
 36 |       Code is currently specific for human and mouse. Files chosen for
 37 |       download are described in AnnotationHubData:::.gencodeDescription().
 38 |     }
 39 |   }
 40 | }
 41 | 
 42 | \arguments{
 43 |   \item{currentMetadata}{
 44 |     Currently not used. Intended to be a list of metadata to filter, i.e.,
 45 |     records that do not need to be processed again. Need to remove or fix.
 46 |   }
 47 |   \item{baseUrl}{
 48 |     ftp file location.
 49 |   } 
 50 |   \item{species}{
 51 |     A \code{character(1)} of the species. Currently "Human" and "Mouse"
 52 |     are supported.
 53 |   } 
 54 |   \item{release}{
 55 |     A \code{character} string of the release number.
 56 |   } 
 57 |   \item{justRunUnitTest}{
 58 |     A \code{logical}. When TRUE, a small number of records (usually 5) are
 59 |     processed instead of all.
 60 |   }
 61 |   \item{BiocVersion}{
 62 |     A \code{character} vector of Bioconductor versions the resources should be 
 63 |     available for.
 64 |   }
 65 |   \item{ahm}{
 66 |     List of \code{AnnotationHubMetadata} instances.
 67 |   }
 68 | }
 69 | 
 70 | 
 71 | \value{
 72 |   \code{makeGencodeFastaAHM} returns a list of \code{AnnotationHubMetadata}
 73 |   instances. \code{gencodeFastaToFaFile} returns nothing.
 74 | }
 75 | 
 76 | \seealso{
 77 |   \itemize{
 78 |     \item \link{updateResources}
 79 |     \item \link{AnnotationHubMetadata}
 80 |   }
 81 | }
 82 | 
 83 | \author{
 84 |   Bioconductor Core Team.
 85 | }
 86 | 
 87 | \examples{
 88 | 
 89 | ## updateResources() generates metadata, process records and
 90 | ## pushes files to AWS S3 buckets. 
 91 | 
 92 | ## To run the GencodeFasta recipe specify 
 93 | ## 'preparerClasses = GencodeFastaImportPreparer'. The 'species' and 'release' 
 94 | ## arguments are passed to makeGencodeFastaAHM().
 95 | \dontrun{
 96 | meta <- updateResources("/local/path", 
 97 |                         BiocVersion = c("3.2", "3.3"),
 98 |                         preparerClasses = "GencodeFastaImportPreparer",
 99 |                         metadataOnly = TRUE, insert = FALSE,
100 |                         justRunUnitTest = FALSE)
101 | 
102 | }
103 | }
104 | 
105 | \keyword{methods}
106 | 


--------------------------------------------------------------------------------
/inst/unitTests/test_webAccessFunctions.R:
--------------------------------------------------------------------------------
 1 | initialTimeout <- getOption("timeout")
 2 | setup <- function()
 3 |     options(timeout=5*60)
 4 | tearDown <- function()
 5 |     options(timeout=initialTimeout)
 6 | 
 7 | .httrRead <- AnnotationHubData:::.httrRead
 8 | .ftpDirectoryInfo <- AnnotationHubData:::.ftpDirectoryInfo
 9 | .ftpFileInfo <- AnnotationHubData:::.ftpFileInfo 
10 | .listRemoteFiles <- AnnotationHubData:::.listRemoteFiles
11 | .getGenomeAbbrevs <- AnnotationHubData:::.getGenomeAbbrevs
12 | 
13 | test_httrRead <- function() {
14 |     setup()
15 |     hg19Url <- "http://hgdownload.cse.ucsc.edu/goldenpath/hg19/encodeDCC/"
16 |     url <- paste0(hg19Url,"wgEncodeCshlLongRnaSeq/")
17 |     ans <- .httrRead(url, "//pre/a/text()")
18 |     checkTrue(is(ans, "data.frame"))
19 |     checkTrue(names(ans) == "files")
20 |     checkTrue(nrow(ans) > 0)
21 |     tearDown()
22 | }
23 | 
24 | test_ftpDirectoryInfo <- function(){
25 |     setup()
26 |     url <- "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/"
27 |     ans <- .ftpDirectoryInfo(url)
28 |     checkTrue(is(ans, "character"))
29 |     checkTrue(is.null(names(ans)))
30 |     checkTrue(length(ans) > 0L)
31 |     tearDown()
32 | }
33 | 
34 | test_ftpFileInfo <- function(){
35 |     setup()
36 |     url <- "ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/"
37 |     ans <- .ftpFileInfo(url, "chr.gtf.gz")
38 |     checkTrue(is(ans, "data.frame"))
39 |     checkIdentical(names(ans), c("fileurl", "date", "size"))
40 |     checkTrue(nrow(ans) == 1L)
41 | 
42 |     ans <- .ftpFileInfo(url, ".gz")
43 |     checkTrue(nrow(ans) > 0L)
44 | 
45 |     ans <- .ftpFileInfo(url, "FILE_THAT_DOESNT_EXIST.gz")
46 |     checkTrue(is(ans, "data.frame"))
47 |     checkIdentical(names(ans), c("fileurl", "date", "size"))
48 |     checkTrue(nrow(ans) == 0L)
49 |     tearDown()
50 | }
51 | 
52 | test_listRemoteFiles <- function(){
53 |     setup()
54 |     url <- "ftp://ftp.ensembl.org/pub/"
55 |     ans <- .listRemoteFiles(url)
56 |     checkTrue(is(ans, "character"))
57 |     checkTrue(is.null(names(ans)))
58 |     checkTrue(length(ans) > 0L)
59 |     tearDown()
60 | }
61 | 
62 | ## FIXME: revisit this when working on UCSCTrackUpdateChecker.R
63 | ## test_getGenomeAbbrevs <- function(){
64 | ##   smallSample <- c("hg19", "hg18", "hg17")
65 | ##   actualResult <- .getGenomeAbbrevs(smallSample)
66 | ##   expectedResult <- sort(smallSample)
67 | ##   checkEquals(actualResult, expectedResult)
68 | 
69 | ##   # Viewing the FTP server content at ftp://hgdownload.cse.ucsc.edu/goldenPath/ , you'll
70 | ##   # notice that some files are actually symlinks to other directories.  Since the
71 | ##   # getGenomeAbbrevs function claims to handle symlinks, we must test that specific case.
72 | ##   # The following are files that actually render the client a symlink. :
73 | ##   ### cb1 -> cbJul2002
74 | ##   ### ce1 -> ceMay2003
75 | ##   ### hg15 -> 10april2003/"
76 | ##   ### rn2 -> rnJan2003
77 | ##   # You should notice, however that results are returned in a sorted order.
78 | ##   sampleWithSymlink <- c("hg15", "cb1", "rn2", "ce1")
79 | ##   resultWithSymlink <- AnnotationHubData:::.getGenomeAbbrevs(sampleWithSymlink)
80 | ##   expectedResult <- sort(sampleWithSymlink)
81 | ##   checkEquals(resultWithSymlink, expectedResult)
82 | ## }
83 | 


--------------------------------------------------------------------------------
/inst/unitTests/test_AnnotationHubConstructor.R:
--------------------------------------------------------------------------------
 1 | 
 2 | ahroot <- "/var/FastRWeb/web"
 3 | .AnnotationHubMetadata_args <- local({
 4 |     basepath <- paste0("goldenpath/hg19/encodeDCC/wgEncodeRikenCage/",
 5 |                        "wgEncodeRikenCageCd20CellPapTssHmm.bedRnaElements")
 6 |  
 7 |     list(AnnotationHubRoot=ahroot,
 8 |          SourceUrl=sprintf("http://hgdownload.cse.ucsc.edu/%s", basepath),
 9 |          SourceVersion=NA_character_,
10 |          SourceLastModifiedDate=as.POSIXct("2015-01-01", tz="GMT"),
11 |          SourceSize=as.numeric(99999),
12 |          SourceMd5="2",
13 |          SourceType="BED",
14 |          Title="CD20 CAGE defined Transcriptional Start Sites",
15 |          Description="120785 TSS sites ...",
16 |          Species="Homo sapiens",
17 |          TaxonomyId=9606L,
18 |          Genome="hg19",
19 |          Recipe="extendedBedToGranges",
20 |          Tags=c("gene regulation", "ranged genomic data"),
21 |          RDataClass="GRanges",
22 |          Coordinate_1_based=TRUE,
23 |          Maintainer="Paul Shannon <pshannon@fhcrc.org>",
24 |          DataProvider="hgdownload.cse.ucsc.edu",
25 |          Notes="9 total columns...",
26 |          RDataDateAdded=as.POSIXct("2013-01-01", tz="GMT"),
27 |          DispatchClass="GRanges",
28 |          PreparerClass="EncodeImportPreparer")
29 | })
30 | 
31 | .AnnotationHubMetadata <- 
32 |     do.call("AnnotationHubMetadata", .AnnotationHubMetadata_args)
33 | 
34 | test_constructor <- function()
35 | {
36 |     ## construction from complete args
37 |     args <- .AnnotationHubMetadata_args
38 |     ahm <- do.call("AnnotationHubMetadata", args)
39 |     checkTrue(validObject(ahm))
40 |     ## ... correctly inserted into slots
41 |     values <- metadata(ahm)
42 |     test <- unlist(Map(identical, args, values[names(args)]))
43 |     checkTrue(all(test))
44 | 
45 |     ## date / version coercion
46 |     idx <- grep("(Version)", names(args))
47 |     args[idx] <- sapply(args[idx], as.character)
48 | 
49 |     ahm1 <- do.call("AnnotationHubMetadata", args)
50 |     checkIdentical(ahm, ahm1)
51 | }
52 | 
53 | test_isComplete <- function()
54 | {
55 |     .isComplete <- AnnotationHubData:::.isComplete
56 |     valid <- .AnnotationHubMetadata
57 |     checkTrue(.isComplete(valid))
58 | 
59 |     ## zero-length 'required' field
60 |     invalid <- valid
61 |     metadata(invalid)$Title <- character()
62 |     checkException(.isComplete(invalid), silent=TRUE)
63 | 
64 |     ## invalid email address
65 |     invalid <- valid
66 |     metadata(invalid)$Maintainer <- "User <user at fhcrc.org>"
67 |     checkException(.isComplete(invalid), silent=TRUE)
68 | 
69 |     ## species not in database
70 |     invalid <- valid
71 |     metadata(invalid)$Species <- "Unknown"
72 |     checkException(.isComplete(invalid), silent=TRUE)
73 | }
74 | 
75 | test_multi_input <- function()
76 | {
77 |     args <- .AnnotationHubMetadata_args
78 |     rp <- "goldenpath/hg19/encodeDCC/wgEncodeRegDnaseClustered"
79 |     files <- c("wgEncodeRegDnaseClustered.bed.gz",
80 |                "wgEncodeRegDnaseClusteredInputs.tab")
81 |     args$SourceUrl <-
82 |         sprintf("http://hgdownload.cse.ucsc.edu/%s/%s", rp, files)
83 |     args$SourceMd5 <- c("2","2")
84 |         args$SourceSize <- c(as.numeric(99999),as.numeric(99999))
85 | 
86 |     x <- do.call("AnnotationHubMetadata", args)
87 |     checkEquals(2L, length(metadata(x)$SourceUrl))
88 |     checkEquals(2L, length(metadata(x)$SourceMd5))
89 |     checkEquals(2L, length(metadata(x)$SourceSize))
90 | }
91 | 


--------------------------------------------------------------------------------
/R/validationFunctions.R:
--------------------------------------------------------------------------------
 1 | getSpeciesList <- function(verbose=FALSE){
 2 |     if (!requireNamespace("GenomeInfoDbData", quietly = TRUE))
 3 |         stop("Requires GenomeInfoDbData.  Please run:\n",
 4 |              "    BiocManager::install('GenomeInfoDbData')")
 5 |     if (verbose) message("Loading valid species information.")
 6 |     txdb <- GenomeInfoDb::loadTaxonomyDb()
 7 |     txdb <- rbind(txdb, c(NA, NA, ""))
 8 |     species <- trimws(paste(txdb$genus, txdb$species))
 9 |     species
10 | }
11 | 
12 | validSpecies <- function(species, verbose=TRUE){
13 |     speciesList <- getSpeciesList(verbose=verbose)
14 |     res <- species %in% speciesList
15 |     if (any(is.na(species)))
16 |         res[is.na(species)] = TRUE
17 |     if (any(!res) & verbose){
18 |         message("Found invalid species.\n")
19 |         print(species[!res])
20 |         message("\nFor complete list of acceptable species run\n",
21 |                 "    'getSpeciesList()'\n",
22 |                 "For suggestions try\n",
23 |                 "    'suggestSpecies()'\n")
24 |      }
25 |     all(res)
26 | }
27 | 
28 | suggestSpecies <- function(query, verbose=FALSE, op=c("|", "&")){
29 |     op = match.arg(op)
30 |     if (!requireNamespace("GenomeInfoDbData", quietly = TRUE))
31 |         stop("Requires GenomeInfoDbData.  Please run:\n",
32 |              "    BiocManager::install('GenomeInfoDbData')")
33 |     if (verbose) message("Loading valid species information.")
34 |     txdb <- GenomeInfoDb::loadTaxonomyDb()
35 |     txdb <- rbind(txdb, c(NA, NA, ""))
36 |     sd <- txdb
37 |     combo <- trimws(paste(txdb$genus, txdb$species))
38 |     sd$combo = combo
39 |     if( op == "|"){
40 |         keep <- FALSE
41 |         for (q in query)
42 |             keep <- keep | Reduce(`|`, lapply(sd[2:4], grepl, pattern = q,
43 |                                               ignore.case=TRUE))
44 |     }else {
45 |         keep <- TRUE
46 |         for (q in query)
47 |             keep <- keep & Reduce(`|`, lapply(sd[2:4], grepl, pattern = q,
48 |                                               ignore.case=TRUE))
49 |     }
50 |     data.frame(taxonomyId = sd$tax_id[keep], species=sd$combo[keep])
51 | }
52 | 
53 | getValidSourceTypes <- function(){
54 | 
55 |     # alphabetical
56 |     expectedSourceTypes <- c("BAI", "BAM", "BED", "BigWig", "BioPax",
57 |                              "BioPaxLevel2", "BioPaxLevel3", "BLOB", "CEL",
58 |                              "CDF", "Chain", "CSV",
59 |                              "ensembl", "FASTA", "FASTQ", "FCS", "GFF", "GRASP",
60 |                              "GSEMatrix", "GTF", "HDF5", "HIC", "IDAT", "Inparanoid",
61 |                              "JSON", "MTX", "mtx.gz", "MySQL", "mzid", "mzML", "mzTab",
62 |                              "mzXML", "Multiple", "NCBI/blast2GO", "NCBI/ensembl",
63 |                              "NCBI/UniProt", "PDB", "PNG", "RDA", "RData", "RDS", "Simulated", "tab",
64 |                              "tar.gz", "TIFF", "TSV", "TwoBit", "TXT", "UCSC track",
65 |                              "VCF", "XLS/XLSX", "XML", "Zip")
66 | 
67 |     expectedSourceTypes
68 | 
69 | }
70 | 
71 | validDispatchClass <- function(dc, verbose=TRUE){
72 | 
73 |     mat <- AnnotationHub::DispatchClassList()
74 |     res <- dc %in% as.character(mat[,1])
75 |     if (any(!res) & verbose){
76 |         message("Found invalid DispatchClass.\n")
77 |         print(dc[!res])
78 |         message("\nFor currently available DispatchClass run\n",
79 |                 "    'AnnotationHub::DispatchClassList()'\n")
80 |     }
81 |     all(res)
82 | }
83 | 


--------------------------------------------------------------------------------
/man/makeEnsemblFasta.Rd:
--------------------------------------------------------------------------------
  1 | \name{makeEnsemblFasta}
  2 | 
  3 | \alias{makeEnsemblFasta}
  4 | \alias{makeEnsemblFastaToAHM}
  5 | \alias{makeEnsemblTwoBitToAHM}
  6 | \alias{ensemblFastaToFaFile}
  7 | \alias{ensemblFastaToTwoBitFile}
  8 | 
  9 | \title{
 10 |   Functions to convert Ensembl FASTA files to FaFile and TwoBitFile for
 11 |   inclusion in AnnotationHub.
 12 | }
 13 | 
 14 | \description{
 15 |   Transform an Ensembl FASTA file to a Bioconductor FaFile or ToBitFile.
 16 | }
 17 | 
 18 | \usage{
 19 | makeEnsemblFastaToAHM(currentMetadata, baseUrl = "ftp://ftp.ensembl.org/pub/",
 20 |                       baseDir = "fasta/", release,
 21 |                       justRunUnitTest = FALSE,
 22 |                       BiocVersion = BiocManager::version())
 23 | 
 24 | makeEnsemblTwoBitToAHM(currentMetadata, baseUrl = "ftp://ftp.ensembl.org/pub/",
 25 |                        baseDir = "fasta/", release,
 26 |                        justRunUnitTest = FALSE,
 27 |                        BiocVersion = BiocManager::version())
 28 | 
 29 | ensemblFastaToFaFile(ahm)
 30 | 
 31 | ensemblFastaToTwoBitFile(ahm)
 32 | }
 33 | 
 34 | \arguments{
 35 |   \item{currentMetadata}{
 36 |     Currently not used. Intended to be a list of metadata to filter, i.e.,
 37 |     records that do not need to be processed again. Need to remove or fix.
 38 |   }
 39 |   \item{baseUrl}{
 40 |     ftp file location.
 41 |   } 
 42 |   \item{baseDir}{
 43 |     ftp file directory.
 44 |   } 
 45 |   \item{release}{
 46 |     Integer version number, e.g., "84".
 47 |   } 
 48 |   \item{justRunUnitTest}{
 49 |     A \code{logical}. When TRUE, a small number of records (usually 5) are
 50 |     processed instead of all.
 51 |   }
 52 |   \item{BiocVersion}{
 53 |     A \code{character(1)} Bioconductor version. The resource will be available
 54 |     in Bioconductor >= to this version. Default value is the current version,
 55 |     specified with BiocManager::version().
 56 |   }
 57 |   \item{ahm}{
 58 |     List of \code{AnnotationHubMetadata} instances.
 59 |   }
 60 | }
 61 | 
 62 | \details{
 63 |   \code{makeEnsemblFastaToAHM} and \code{makeEnsemblTwoBitToAHM} process
 64 |   metadata into a list of \code{AnnotationHubMetadata} objects. 
 65 | 
 66 |   \code{ensemblFastaToFaFile} unzips a .gz files, creates and index and
 67 |   writes out .rz and .rz.fai files to disk.
 68 |   \code{ensemblFastaToTwoBit} converts a fasta file to twobit format and
 69 |   writes the .2bit file out to disk.
 70 | }
 71 | 
 72 | \value{
 73 |   \code{makeEnsemblFastaToAHM} and \code{makeEnsemblTwoBitToAHM} return
 74 |   a list of \code{AnnotationHubMetadata} objects. 
 75 | 
 76 |   \code{ensemblFastaToFaFile} write out .rz and .rz.fai files to disk.
 77 |   \code{ensemblFastaToTwoBit} writes out a .2bit file to disk.
 78 | }
 79 | 
 80 | \author{Bioconductor Core Team}
 81 | 
 82 | \seealso{
 83 |   \itemize{
 84 |     \item \link{updateResources}
 85 |     \item \link{AnnotationHubMetadata}
 86 |   }
 87 | }
 88 | 
 89 | \examples{
 90 | ## updateResources() generates metadata, process records and
 91 | ## pushes files to AWS S3 buckets. See ?updateResources for details.
 92 | 
 93 | ## 'release' is passed to makeEnsemblFastaToFaFile.
 94 | \dontrun{
 95 | meta <- updateResources("/local/path", 
 96 |                         BiocVersion = c("3.2", "3.3"),
 97 |                         preparerClasses = "EnsemblFastaImportPreparer",
 98 |                         metadataOnly = TRUE, insert = FALSE,
 99 |                         justRunUnitTest = FALSE, release = "83")
100 | }
101 | }
102 | 
103 | \keyword{methods}
104 | 


--------------------------------------------------------------------------------
/R/makeEnsemblTwoBit.R:
--------------------------------------------------------------------------------
  1 | ### =========================================================================
  2 | ### makeEnsemblTwoBit()
  3 | ### -------------------------------------------------------------------------
  4 | ###
  5 | 
  6 | .ensemblTwoBitTypes <-
  7 |     c("cdna\\.all", "dna_rm\\.(primary_assembly|toplevel)",
  8 |       "dna_sm\\.(primary_assembly|toplevel)",
  9 |       "dna\\.(primary_assembly|toplevel)", "ncrna")
 10 | 
 11 | ## Metadata generator
 12 | makeEnsemblTwoBitToAHM <-
 13 |     function(currentMetadata, baseUrl = "ftp://ftp.ensembl.org/pub/",
 14 |              baseDir = "fasta/", release,
 15 |              justRunUnitTest = FALSE, BiocVersion = BiocManager::version())
 16 | {
 17 |     if (length(release) > 1L)
 18 |         stop("'release' must be a single integer")
 19 |     if (length(BiocVersion) > 1L)
 20 |         stop("BiocVersion must be a single version")
 21 |     time1 <- Sys.time()
 22 |     regex <- paste0(".*release-", release)
 23 |     sourceUrl <- .ensemblFastaSourceUrls(baseUrl, baseDir, regex,
 24 |         baseTypes=.ensemblTwoBitTypes)
 25 |     if (justRunUnitTest)
 26 |         sourceUrl <- sourceUrl[1:5]
 27 | 
 28 |     sourceFile <- sub(baseUrl, "ensembl/", sourceUrl)
 29 |     meta <- .ensemblMetadataFromUrl(sourceUrl, twobit=TRUE)
 30 |     dnaType <- local({
 31 |         x <- basename(dirname(sourceFile))
 32 |         sub("(dna|rna)", "\\U\\1", x, perl=TRUE)
 33 |     })
 34 |     description <- paste("TwoBit", dnaType, "sequence for", meta$species)
 35 | 
 36 |     rdataPath <- sub("\\.fa\\.gz$", ".2bit", sourceFile)
 37 | 
 38 |     Map(AnnotationHubMetadata,
 39 |         AnnotationHubRoot=currentMetadata$AnnotationHubRoot,
 40 |         Description=description,
 41 |         Genome=meta$genome,
 42 |         RDataPath=rdataPath,
 43 |         SourceUrl=sourceUrl,
 44 |         SourceVersion=meta$sourceVersion,
 45 |         Species=meta$species,
 46 |         TaxonomyId=meta$taxonomyId,
 47 |         Title=meta$title,
 48 |         SourceSize=meta$sourceSize,
 49 |         SourceLastModifiedDate=meta$sourceLastModifiedDate,
 50 |         MoreArgs=list(
 51 |             BiocVersion=package_version(BiocVersion),
 52 |             Coordinate_1_based = TRUE,
 53 |             DataProvider="Ensembl",
 54 |             Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
 55 |             SourceType="FASTA",
 56 |             DispatchClass="TwoBitFile",
 57 |             RDataClass="TwoBitFile",
 58 |             RDataDateAdded=Sys.time(),
 59 |             Recipe="AnnotationHubData:::ensemblFastaToTwoBitFile",
 60 |             Tags=c("TwoBit", "ensembl", "sequence", "2bit", "FASTA")))
 61 | }
 62 | 
 63 | ensemblFastaToTwoBitFile <- function(ahm)
 64 | {
 65 |     ## Convert .fa file to .2bit
 66 |     gc()
 67 |     twobitOut <- file.path(metadata(ahm)$HubRoot,
 68 |                            dirname(metadata(ahm)$RDataPath), basename(outputFile(ahm)))
 69 |     srcFile <- sub('\\.2bit','.fa.gz', twobitOut)
 70 |     dna <- import(srcFile, "FASTA")
 71 |     gc()
 72 | 
 73 |     tryCatch({
 74 |         ## ID as name
 75 |         ids <- sub(" .*", "", names(dna))
 76 |         stopifnot(length(ids) == length(dna))
 77 |         names(dna) <- ids
 78 |         dna <- Biostrings::replaceAmbiguities(dna)
 79 |         export(dna, twobitOut, "TwoBit")
 80 |     }, error = function(err) {
 81 |         message("conversion failed",
 82 |                 "\n  file: ",  sQuote(srcFile),
 83 |                 "\n  reason: ",  conditionMessage(err),
 84 |                 call.=FALSE)
 85 |     }, finally = function(){
 86 |         ## remove .fa file
 87 |         if (exists("dna")){
 88 |             rm("dna")
 89 |             gc()
 90 |         }
 91 |         system(paste0("rm ", srcFile))
 92 |         gc()
 93 |     })
 94 | 
 95 |     ## remove .fa file
 96 |     if (exists("dna")){
 97 |         rm("dna")
 98 |         gc()
 99 |     }
100 |     system(paste0("rm ", srcFile))
101 |     gc()
102 | }
103 | 
104 | ## create the class and newResources() method
105 | makeAnnotationHubResource("EnsemblTwoBitPreparer", makeEnsemblTwoBitToAHM)
106 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
  1 | import(methods)
  2 | import(S4Vectors)
  3 | import(IRanges)
  4 | import(GenomicRanges)
  5 | import(AnnotationHub)
  6 | import(OrganismDbi)
  7 | import(GenomicFeatures)
  8 | import(RCurl)
  9 | import(BiocManager)
 10 | import(Biostrings)
 11 | 
 12 | importFrom(biocViews, recommendPackages, guessPackageType)
 13 | importMethodsFrom(AnnotationDbi, exists, get, saveDb)
 14 | importMethodsFrom(BiocGenerics, mapply, Map)
 15 | importMethodsFrom(DBI, dbDriver, dbGetQuery)
 16 | importMethodsFrom(Rsamtools, indexFa)
 17 | importMethodsFrom(RSQLite, dbConnect, dbDisconnect)
 18 | importMethodsFrom(rtracklayer,
 19 |     browserSession, "genome<-", getTable,
 20 |     import, "tableName<-", tableNames, track,
 21 |     "trackName<-", trackNames, ucscTableQuery, export
 22 | )
 23 | importFrom(Biobase, AnnotatedDataFrame, ExpressionSet)
 24 | importFrom(Seqinfo, Seqinfo)
 25 | importFrom(GenomeInfoDb, loadTaxonomyDb)
 26 | importFrom(rtracklayer, GRangesForUCSCGenome, ucscGenomes)
 27 | importFrom(Rsamtools, bgzip)
 28 | importFrom(AnnotationForge, makeInpDb, makeOrgPackageFromNCBI)
 29 | importFrom(AnnotationDbi, loadDb)
 30 | importFrom(tools, file_ext)
 31 | importFrom(futile.logger,
 32 |     ERROR, INFO, TRACE, appender.file,
 33 |     flog.appender, flog.threshold, flog.info
 34 | )
 35 | importFrom(jsonlite, fromJSON, toJSON)
 36 | importFrom(parallel, detectCores)
 37 | importFrom(stats, setNames)
 38 | importFrom(XML, readHTMLTable, xmlParse, xmlValue, getNodeSet, htmlParse)
 39 | importFrom(RSQLite, SQLite, sqliteCopyDatabase)
 40 | importFrom(graphics, title)
 41 | importFrom(utils,
 42 |     capture.output, data, download.file,
 43 |     read.delim, read.table, str, read.csv
 44 |     )
 45 | 
 46 | import(BiocCheck)
 47 | import(biocViews)
 48 | importFrom(graph, nodes)
 49 | 
 50 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 51 | ### Export S4 classes
 52 | ###
 53 | 
 54 | exportClasses(
 55 |     HubMetadata,
 56 |     AnnotationHubMetadata,
 57 |     ImportPreparer,
 58 |     UCSCTrackImportPreparer,
 59 |     UCSCChainPreparer,
 60 |     Grasp2ImportPreparer,
 61 |     EnsemblGtfImportPreparer,
 62 |     EnsemblFastaImportPreparer,
 63 |     Inparanoid8ImportPreparer,
 64 |     NCBIImportPreparer
 65 | )
 66 | 
 67 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 68 | ### Export non-generic functions
 69 | ###
 70 | 
 71 | export(
 72 |     AnnotationHubMetadata, makeAnnotationHubMetadata,
 73 |     UCSCTrackImportPreparer,
 74 |     #postProcessMetadata,
 75 |     flog,
 76 |     upload_to_S3,
 77 |     upload_to_azure,
 78 |     getImportPreparerClasses,
 79 |     makeAnnotationHubResource,
 80 |     HubMetadataFromJson,
 81 |     toJson,
 82 |     constructSeqInfo,
 83 |     makeEnsemblFastaToAHM, makeEnsemblTwoBitToAHM,
 84 |     ensemblFastaToFaFile,
 85 |     ensemblFastaToTwoBitFile,
 86 |     ahmToJson,
 87 |     newResources, updateResources, deleteResources,
 88 |     pushMetadata, pushResources,
 89 |     makeGencodeFastaToAHM, gencodeFastaToFaFile,
 90 |     makeStandardOrgDbsToAHM, makeStandardTxDbsToAHM,
 91 |     makeNCBIToOrgDbsToAHM,
 92 |     getSpeciesList, validSpecies, suggestSpecies, getValidSourceTypes,
 93 |     checkSpeciesTaxId, validDispatchClass
 94 | )
 95 | 
 96 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 97 | ### Export S4 generics defined in AnnotationHubData + export corresponding
 98 | ### methods
 99 | ###
100 | 
101 | export(
102 |     recipeName, run, runRecipes,
103 |     hubError, "hubError<-",
104 |     inputFiles, outputFile,
105 |     metadata, "metadata<-",
106 |     metadataList, metadataTable,
107 |     annotationHubRoot,
108 |     sourceUrls
109 | )
110 | 
111 | exportMethods(
112 |     recipeName, run, runRecipes,
113 |     hubError, "hubError<-",
114 |     inputFiles, outputFile,
115 |     metadata, "metadata<-",
116 |     metadataList, metadataTable,
117 |     annotationHubRoot,
118 |     sourceUrls
119 | )
120 | 
121 | ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
122 | ### Export S4 methods for generics not defined in AnnotationHubData
123 | ###
124 | 
125 | exportMethods(
126 |     show
127 | )
128 | 


--------------------------------------------------------------------------------
/man/validationFunctions.Rd:
--------------------------------------------------------------------------------
  1 | \name{validationFunctions}
  2 | \alias{validationFunctions}
  3 | 
  4 | \alias{getSpeciesList}
  5 | \alias{validSpecies}
  6 | \alias{suggestSpecies}
  7 | \alias{getValidSourceTypes}
  8 | \alias{checkSpeciesTaxId}
  9 | \alias{validDispatchClass}
 10 | 
 11 | \title{ValidationFunctions}
 12 | 
 13 | \description{
 14 |   Functions to assist in the validation process of creating the
 15 |   metadata.csv file for Hub Resources
 16 | }
 17 | 
 18 | \usage{
 19 | getSpeciesList(verbose=FALSE)
 20 | 
 21 | validSpecies(species, verbose=TRUE)
 22 | 
 23 | suggestSpecies(query, verbose=FALSE, op=c("|", "&"))
 24 | 
 25 | getValidSourceTypes()
 26 | 
 27 | checkSpeciesTaxId(txid, species, verbose=TRUE)
 28 | 
 29 | validDispatchClass(dc, verbose=TRUE)
 30 | }
 31 | 
 32 | \arguments{
 33 |   \item{species}{
 34 |     species to validate (may be single value or list)
 35 |   }
 36 |   \item{query}{
 37 |     terms to query. Whether AND or OR is determined by argument op.
 38 |   }
 39 |   \item{verbose}{
 40 |     should additional information and useful tips be displayed
 41 |   }
 42 |   \item{op}{
 43 |     Should searching of mulitple terms be conditional OR ("|") or AND
 44 |     ("&")
 45 |   }
 46 |   \item{txid}{taxonomy id (single value or list)}
 47 |   \item{dc}{Dispatch class to validate (may be single value or list)}
 48 | }
 49 | 
 50 | \details{
 51 |    \itemize{
 52 |      \item getSpeciesList: Provides a list of valid species as
 53 |    determined by the GenomeInfoDbData package specData.rda file.
 54 | 
 55 |      \item validSpecies: True/False if argument is considered a valid
 56 |      species based on the list generated by getSpeciesList. A species
 57 |      may be deemed invalid if the capitalization mismatches or
 58 |      punctuation mismatches. Use suggestSpecies to find similar terms.
 59 | 
 60 |      \item suggestSpecies: Based on a term or multiple terms suggest
 61 |      possible valid species.
 62 | 
 63 |      \item getValidSourceTypes: returns list of acceptable values for
 64 |      SourceType in metadata.csv.  If you think a valid source type
 65 |      should be added to the list please reach out to
 66 |      maintainer@bioconductor.org
 67 | 
 68 |      \item checkSpeciesTaxId: cross validates a list of species and
 69 |      taxonomy ids for expected values based on
 70 |      \code{GenomeInfoDb::loadTaxonomyDb()}. Warning when there is a
 71 |      mismatch.
 72 | 
 73 |      \item validDispatchClass: TRUE/FALSE if argument is considered a
 74 |      valid DispatchClass based on the currently available methods in
 75 |      AnnotationHub. Use \code{AnnotationHub::DispatchClassList()} to see
 76 |      the table of currently available methods. If a currently available
 77 |      method is not appropriate for your resource, please reach out to
 78 |      Lori Shepherd \email{Lori.Shepherd@roswellpark.org} to request a
 79 |      new method be added.
 80 | 
 81 |    }
 82 | }
 83 | 
 84 | \value{
 85 |   \itemize{
 86 |     \item For getSpeciesList: character vector of valid species
 87 |     \item For validSpecies: True/False if all species given as argument
 88 |     are valid
 89 |     \item For suggestSpecies: data.frame of taxonomy id and species name
 90 |     of possible valid species based on given query key words.
 91 |     \item For getValidSourceTypes: character vector of valid source
 92 |     types.
 93 |     \item For checkSpeciesTaxId: NULL if check is verfified, If verbose
 94 |     is ture a table of suggested values along with the warning.
 95 |     \item For validDispatchClass: True/False if all dispatch class given
 96 |     as argument are valid
 97 |   }
 98 | }
 99 | 
100 | \author{Lori Shepherd}
101 | 
102 | \seealso{
103 |   \itemize{
104 |     \item \link{AnnotationHubMetadata}
105 |     \item \link{makeAnnotationHubMetadata}
106 |   }
107 | }
108 | 
109 | \examples{
110 | 
111 | species = getSpeciesList()
112 | 
113 | # following is TRUE
114 | 
115 | validSpecies("Homo sapiens")
116 | # followin is FALSE because of starting "h"
117 | validSpecies("homo sapiens")
118 | 
119 | # can provide multiple, if any are not valid FALSE
120 | # TRUE
121 | validSpecies(c("Homo sapiens", "Canis domesticus"))
122 | 
123 | suggestSpecies("Canis")
124 | 
125 | getValidSourceTypes()
126 | 
127 | checkSpeciesTaxId(1003232, "Edhazardia aedis")
128 | checkSpeciesTaxId(9606, "Homo sapiens")
129 | 
130 | validDispatchClass("GRanges")
131 | }
132 | 
133 | \keyword{methods}
134 | 


--------------------------------------------------------------------------------
/R/makeStandardTxDbsToSqlite.R:
--------------------------------------------------------------------------------
  1 | ### =======================================================================
  2 | ### makeStandardTxDbsToSqlite
  3 | ### -------------------------------------------------------------------------
  4 | ###
  5 | 
  6 | ## Extracts the sqlite files from the 'standard' TxDb packages in the current 
  7 | ## Bioconductor repo:
  8 | ## 
  9 | ##   http://www.bioconductor.org/packages/release/BiocViews.html#___TxDb.
 10 | 
 11 | ## NOTES:
 12 | ## - Recipe should be run after new TxDbs have been generated (right before
 13 | ##   the next release).
 14 | ## - BiocVersion() should be the impending release / current devel.
 15 | ## - May need to run AnnotationHubData:::.getTxDbs(TRUE) to load all
 16 | ##   TxDbs if not in local R install.
 17 | 
 18 | ## Returns list of loaded TxDb objects.
 19 | .getTxDbs <- function(TxDbs) {
 20 |     lapply(TxDbs, function(xx) {
 21 |         if (!require(xx, character.only=TRUE))
 22 |             BiocManager::install(xx, ask=FALSE)
 23 |     })
 24 |     lapply(TxDbs, require, character.only=TRUE)
 25 |     res <- lapply(TxDbs, get)
 26 |     names(res) <- TxDbs 
 27 |     res
 28 | }
 29 | 
 30 | .TxDbPkgMetadataFromObjs <- function(txdbs, biocversion) {
 31 |     title <- paste0(names(txdbs), '.sqlite')
 32 |     species <- unlist(lapply(txdbs,
 33 |         function(x){m <- metadata(x); m[m$name=='Organism', 2] }))
 34 |     taxonomyId <- as.integer(unlist(lapply(txdbs, 
 35 |         function(x) {
 36 |             m <- metadata(x) 
 37 |             id <- m[m$name=='TaxID', 2]
 38 |             if (!length(id))
 39 |                 id <- m[m$name=='Taxonomy ID', 2]
 40 |             id
 41 |         })))
 42 | 
 43 |     sourceVersion <- sapply(txdbs, 
 44 |         function(x) {
 45 |             m <- metadata(x) 
 46 |             paste0('UCSC transcript based annotations generated ', 
 47 |                    strptime(m[m$name=='Creation time', 2], "%Y-%m-%d")) 
 48 |         }, simplify=FALSE)
 49 |     url <- list(c("http://genome.ucsc.edu/", 
 50 |                   "http://hgdownload.cse.ucsc.edu/goldenPath"))
 51 |     list(title=title,
 52 |          species=species,
 53 |          taxonomyId=taxonomyId,
 54 |          genome=rep("UCSC genomes", length(title)),
 55 |          sourceUrl=rep(url, length(title)),
 56 |          sourceVersion=sourceVersion,
 57 |          description=paste("UCSC transcript based annotations for", species),
 58 |          rDataPath=paste0("ucsc/standard/", biocversion, "/",title))
 59 | }
 60 | 
 61 | makeStandardTxDbsToAHM <- function(currentMetadata, justRunUnitTest = FALSE, 
 62 |                                    BiocVersion = BiocManager::version(),
 63 |                                    TxDbs) {
 64 |     if (length(BiocVersion) > 1L)
 65 |         stop("length(BiocVersion) must == 1L")
 66 | 
 67 |     txdbs <- .getTxDbs(TxDbs)
 68 |     meta <- .TxDbPkgMetadataFromObjs(txdbs, biocversion=BiocVersion)
 69 |     Map(AnnotationHubMetadata,
 70 |         AnnotationHubRoot=currentMetadata$AnnotationHubRoot,
 71 |         Description=meta$description,
 72 |         Genome=meta$genome,
 73 |         SourceUrl=meta$sourceUrl,
 74 |         SourceVersion=meta$sourceVersion,
 75 |         Species=meta$species,
 76 |         TaxonomyId=meta$taxonomyId,
 77 |         Title=meta$title,
 78 |         RDataPath=meta$rDataPath,
 79 |         MoreArgs=list(
 80 |             BiocVersion=BiocVersion,
 81 |             Coordinate_1_based=TRUE, ## TRUE unless it "needs" to be FALSE
 82 |             DataProvider="UCSC",
 83 |             Maintainer="Bioconductor Maintainer <maintainer@bioconductor.org>",
 84 |             RDataClass="TxDb",
 85 |             DispatchClass="SQLiteFile",
 86 |             SourceType="FASTA",
 87 |             RDataDateAdded = Sys.time(),
 88 |             Recipe="AnnotationHubData:::extractTxDbSqlite",
 89 |             Tags=c("UCSC", "Transcript", "Annotation")),
 90 |         USE.NAMES=FALSE) 
 91 | }
 92 | 
 93 | ## Load the object and call saveDb()
 94 | extractTxDbSqlite <- function(ahm) {
 95 |     dbFile <- metadata(ahm)$Title
 96 |     txdb <- sub('.sqlite','',dbFile)
 97 |     outputPath <- file.path(metadata(ahm)$AnnotationHubRoot,
 98 |                             metadata(ahm)$RDataPath)
 99 |     if (!isSingleString(outputPath)) 
100 |         stop("'outputPath' must be a single string")
101 |     sqliteCopyDatabase(dbconn(.getTxDbs(txdb)[[1]]), outputPath)
102 |     outputFile(ahm)
103 | }
104 | 
105 | makeAnnotationHubResource("TxDbFromPkgsImportPreparer", makeStandardTxDbsToAHM)
106 | 


--------------------------------------------------------------------------------
/R/makeInparanoid8ToDbs.R:
--------------------------------------------------------------------------------
  1 | ## This recipe is no longer used. If reinstated, add this unit test
  2 | ## back to test_recipes.R
  3 | 
  4 | #test_Inparanoid8ImportPreparer_recipe <- function() {
  5 | #    suppressWarnings({
  6 | #    ahms = updateResources(ahroot, BiocVersion,
  7 | #                           preparerClasses = "Inparanoid8ImportPreparer",
  8 | #                           insert = FALSE, metadataOnly=TRUE,
  9 | #                           justRunUnitTest=TRUE)
 10 | #    })
 11 | #    checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
 12 | #}
 13 | 
 14 | ## helper to make metadata list from the data
 15 | .inparanoidMetadataFromUrl <- function(baseUrl, justRunUnitTest) {
 16 |     ## get all the subDirs
 17 |     subDirs <- AnnotationForge:::.getSubDirs(baseUrl)
 18 |     subDirs <- subDirs[!(subDirs %in% c('stderr/'))]
 19 |     species <- sub("/","",subDirs)
 20 |     allDirs <- file.path(baseUrl, species)
 21 |     ## We have the tax ID and the full species names in AnnotationForge already
 22 |     meta <- read.delim(system.file('extdata','inp8_Full_species_mapping',
 23 |                                    package='AnnotationForge'),
 24 |                        sep="\t", header=TRUE, stringsAsFactors=FALSE)
 25 |     matches <- match(species, meta$inparanoidSpecies)
 26 |     fullSpecies <- meta$GenusAndSpecies[matches]
 27 |     taxonomyId <- as.integer(as.character(meta$taxID[matches]))
 28 |     ## get the name for the DB
 29 |     title <- paste0("hom.",
 30 |                      gsub(" ","_",fullSpecies),
 31 |                      ".inp8",
 32 |                      ".sqlite")    
 33 |     ## root <- setNames(rep(NA_character_, length(allDirs)), title)
 34 |     genome <- setNames(rep("inparanoid8 genomes", length(allDirs)), title)
 35 |     sourceVersion <- rep('Inparanoid version 8',length(allDirs))
 36 |     description <- paste("Inparanoid 8 annotations about", fullSpecies)
 37 |     sourceUrl <- paste0(baseUrl,"/", species)
 38 |     
 39 |     rDataPath <- paste0("inparanoid8/Orthologs/",title)
 40 |         
 41 |     df <- data.frame(title=title, species = fullSpecies,
 42 |          taxonomyId = taxonomyId, genome = genome, sourceUrl=sourceUrl,
 43 |          sourceVersion = sourceVersion,
 44 |          description=description, rDataPath=rDataPath, stringsAsFactors=FALSE)
 45 |     rownames(df) <- NULL
 46 |     
 47 |     if(justRunUnitTest)
 48 |         df <- df[1:2, ]    
 49 |     df
 50 | }
 51 | 
 52 | 
 53 | ## STEP 1: make function to process metadata into AHMs
 54 | ## This function will return the AHMs and takes no args.
 55 | ## It also must specify a recipe function.
 56 | makeinparanoid8ToAHMs <- function(currentMetadata, justRunUnitTest, BiocVersion) {
 57 |     baseUrl <- 'http://inparanoid.sbc.su.se/download/current/Orthologs_other_formats'
 58 |     ## Then make the metadata for these
 59 |     meta <- .inparanoidMetadataFromUrl(baseUrl, justRunUnitTest)
 60 |     ## then make AnnotationHubMetadata objects.
 61 |     Map(AnnotationHubMetadata,
 62 |         ## AnnotationHubRoot=meta$annotationHubRoot,
 63 |         Description=meta$description,
 64 |         Genome=meta$genome,
 65 |         SourceUrl=meta$sourceUrl,
 66 |         SourceVersion=meta$sourceVersion,
 67 |         Species=meta$species,
 68 |         TaxonomyId=meta$taxonomyId,
 69 |         Title=meta$title,
 70 |         RDataPath=meta$rDataPath,
 71 |         MoreArgs=list(
 72 |           BiocVersion=BiocVersion,
 73 | 	  SourceType="Inparanoid",
 74 |           Coordinate_1_based = TRUE, ## TRUE unless it "needs" to be FALSE
 75 |           DataProvider = "Inparanoid8",
 76 |           Maintainer = "Marc Carlson <mcarlson@fhcrc.org>",
 77 |           RDataClass = "Inparanoid8Db",
 78 | 	  DispatchClass="SQLiteFile",	
 79 |           RDataDateAdded = Sys.time(),
 80 |           Recipe = "AnnotationHubData:::inparanoid8ToDbsRecipe",
 81 |           Tags = c("Inparanoid", "Gene", "Homology", "Annotation")))
 82 | }
 83 | 
 84 | 
 85 | 
 86 | ## STEP 2: Make a recipe function that takes an AnnotationHubRecipe
 87 | ## object.
 88 | inparanoid8ToDbsRecipe <- function(ahm){
 89 |     
 90 |     inputFiles <- metadata(ahm)$SourceFile 
 91 |     dbname <- makeInpDb(dir=file.path(inputFiles,""),
 92 |                         dataDir=tempdir())
 93 |     db <- loadDb(file=dbname)
 94 |     outputPath <- file.path(metadata(ahm)$AnnotationHubRoot,
 95 |                             metadata(ahm)$RDataPath)
 96 |     saveDb(db, file=outputPath) 
 97 |     outputFile(ahm)
 98 | }
 99 | 
100 | 
101 | 
102 | 
103 | ## STEP 3:  Call the helper to set up the newResources() method
104 | makeAnnotationHubResource("Inparanoid8ImportPreparer",
105 |                           makeinparanoid8ToAHMs)
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 


--------------------------------------------------------------------------------
/R/makeStandardOrgDbsToSqlite.R:
--------------------------------------------------------------------------------
  1 | ### =========================================================================
  2 | ### makeStandardOrgDbsToSqlite ('standard' OrgDbs)
  3 | ### -------------------------------------------------------------------------
  4 | ###
  5 | 
  6 | ## This recipe extracts the sqlite files from the 'standard'
  7 | ## OrgDb packages in the current Bioconductor repo:
  8 | ## 
  9 | ##   http://www.bioconductor.org/packages/release/BiocViews.html#___OrgDb.
 10 | 
 11 | ## This recipe should be run after the new OrgDbs have been generated for the
 12 | ## next release. The version should be the current devel version,
 13 | ## soon to roll over to the new release.
 14 | 
 15 | ## The 'non-standard' OrgDbs are generated with makeNCBIToOrgDbs.R.
 16 | 
 17 | ## Returns list of OrgDb objects
 18 | ## NOTE: OrganismDbi:::.packageTaxIds is a static named character vector
 19 | ##       of package names and taxids. This file should be checked to
 20 | ##       confirm the package names match the current batch of OrgDb packages.
 21 | .getOrgDbs <- function(downloadOrgDbs=FALSE) {
 22 |     dbNames <- OrganismDbi:::.packageTaxIds()
 23 |     if (downloadOrgDbs) {  ## download, install
 24 |         lapply(dbNames, function(xx) {
 25 |             if (!requireNamespace(xx)) {
 26 |                 BiocManager::install(xx, ask=FALSE)
 27 |             }
 28 |         })
 29 |     }
 30 |     res <- mapply(get, dbNames, lapply(dbNames, asNamespace), SIMPLIFY=FALSE)
 31 |     names(res) <- dbNames
 32 |     res
 33 | }
 34 | 
 35 | .orgDbPkgMetadataFromObjs <- function(orgDbs, biocversion) {
 36 |      ## title
 37 |      title <- paste0(names(orgDbs), '.sqlite')
 38 |      ## organism
 39 |      species <- unlist(lapply(orgDbs,
 40 |                  function(x){m <- metadata(x); m[m$name=='ORGANISM', 2] }))
 41 |      ## tax ID
 42 |      taxonomyId <- as.integer(unlist(lapply(orgDbs,
 43 |                  function(x){m <- metadata(x); m[m$name=='TAXID', 2] })))
 44 |      ## genome
 45 |      genome <- rep("NCBI genomes", length(title))
 46 | 
 47 |      ## sourceUrl
 48 |      urls <- c("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/",
 49 |                         "ftp://ftp.ensembl.org/pub/current_fasta")
 50 |      sourceUrl <- rep(list(urls), length(title))
 51 |      ## sourceVersion
 52 |      dateMessage <- paste0('NCBI gene annotations as of ', as.character(date()))
 53 |      sourceVersion <- rep(dateMessage, length(title))
 54 |      ## description 
 55 |      description <- paste("NCBI gene ID based annotations about", species)
 56 |      ## rDataPath
 57 |      rDataPath <- paste0("ncbi/standard/",biocversion,"/",title)
 58 |      ## return as a list
 59 |      list(##annotationHubRoot = root,
 60 |           title=title,
 61 |           species = species,
 62 |           taxonomyId = taxonomyId,
 63 |           genome = genome,
 64 |           sourceUrl=sourceUrl,
 65 |           sourceVersion = sourceVersion,
 66 |           description=description,
 67 |           rDataPath=rDataPath)
 68 | }
 69 | 
 70 | makeStandardOrgDbsToAHM <- function(currentMetadata, justRunUnitTest=FALSE, 
 71 |                                     BiocVersion=BiocManager::version(), 
 72 |                                     downloadOrgDbs=TRUE) {
 73 |     if (length(BiocVersion) > 1L)
 74 |         stop("BiocVersion must be a single version")
 75 | 
 76 |     orgDbs <- .getOrgDbs(downloadOrgDbs)
 77 |     meta <- .orgDbPkgMetadataFromObjs(orgDbs, biocversion=BiocVersion)
 78 |     Map(AnnotationHubMetadata,
 79 |         AnnotationHubRoot=currentMetadata$AnnotationHubRoot,
 80 |         Description=meta$description,
 81 |         Genome=meta$genome,
 82 |         SourceUrl=meta$sourceUrl,
 83 |         SourceVersion=meta$sourceVersion,
 84 |         Species=meta$species,
 85 |         TaxonomyId=meta$taxonomyId,
 86 |         Title=meta$title,
 87 |         RDataPath=meta$rDataPath,
 88 |         MoreArgs=list(
 89 |             BiocVersion=BiocVersion,
 90 |             Coordinate_1_based = TRUE, ## TRUE unless it "needs" to be FALSE
 91 |             DataProvider = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/",
 92 |             Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
 93 |             RDataClass = "OrgDb",
 94 |             DispatchClass = "SQLiteFile",
 95 |             SourceType="NCBI/ensembl",
 96 |             RDataDateAdded = Sys.time(),
 97 |             Recipe = "AnnotationHubData:::extractOrgDbSqlite",
 98 |             Tags = c("NCBI", "Gene", "Annotation"))) 
 99 | }
100 | 
101 | ## Load the object and call saveDb()
102 | extractOrgDbSqlite <- function(ahm) {
103 |     dbFile <- metadata(ahm)$Title
104 |     orgDbName <- sub('.sqlite','',dbFile)
105 |     orgDbs <- .getOrgDbs()
106 |     orgDb <- orgDbs[[orgDbName]]
107 |     outputPath <- file.path(metadata(ahm)$AnnotationHubRoot,
108 |                             metadata(ahm)$RDataPath)
109 |     if (!isSingleString(outputPath)) 
110 |         stop("'outputPath' must be a single string")
111 |     sqliteCopyDatabase(dbconn(orgDb), outputPath)
112 |     outputFile(ahm)
113 | }
114 | 
115 | makeAnnotationHubResource("OrgDbFromPkgsImportPreparer", makeStandardOrgDbsToAHM)
116 | 


--------------------------------------------------------------------------------
/R/makeHaemCode.R:
--------------------------------------------------------------------------------
  1 | ## This recipe is no longer used. Download site has moved from the hard coded 
  2 | ## location to 
  3 | ## http://codex.stemcells.cam.ac.uk/browse.php?repository=haemcode&organism=mmu
  4 | ## If this is resurrected, reinstate unit test with a smaller file
  5 | ## used for justRunUnitTest=TRUE.
  6 | ##
  7 | #  test_HaemCodeImportPreparer_recipe <- function() {
  8 | #      ahms = updateResources(ahroot, BiocVersion,
  9 | #          preparerClasses = "HaemCodeImportPreparer",
 10 | #          insert = FALSE, metadataOnly=TRUE, justRunUnitTest=TRUE)
 11 | #      checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
 12 | #}
 13 | 
 14 | 
 15 | 
 16 | .haemcodeBaseUrl <- "http://haemcode.stemcells.cam.ac.uk/"
 17 | 
 18 | .getHaemCodeFileNames <- function(justRunUnitTest) {
 19 |     filename <- system.file("extdata", "haemCodeFileList.txt", 
 20 |                             package="AnnotationHubData")
 21 |     stopifnot(file.exists(filename))
 22 |     file.list <- scan(filename, what=character(0), sep="\n", quiet=TRUE)
 23 |     
 24 |     metadata.filename <-  system.file(package="AnnotationHubData", "extdata",
 25 |                                       "annotation_haemcode.tsv")
 26 |     stopifnot(file.exists(metadata.filename))
 27 |     tbl.md <- read.table(metadata.filename, sep="\t", header=TRUE, as.is=TRUE)
 28 |     metadata <- tbl.md[which(tbl.md$filename == file.list),]
 29 |     metadata <- apply(metadata,1, function(x) paste0(x, collapse=", "))
 30 |     metadata <- rep(metadata, 3)
 31 |     names(metadata) <- NULL
 32 |     
 33 |     paths <- c(bigWig="blood/BigWig/mm10",
 34 |                peaks="blood/Peaks/mm10",
 35 |                geneList="blood/geneList")
 36 |     
 37 |     urls <- paste0(.haemcodeBaseUrl, paths)
 38 |     
 39 |     file.types <- c("bw", "bed", "csv")
 40 |     
 41 |     fileurls <- mapply(function(x, y){
 42 |         paste0(x, "/", file.list,".",  y)
 43 |     }, urls, file.types, USE.NAMES=FALSE, SIMPLIFY=FALSE)
 44 |     fileurls <- unlist(fileurls) 
 45 |     
 46 |     if(justRunUnitTest) {
 47 |        fileurls <- fileurls[c(1,2)]
 48 |        metadata <- metadata[c(1,2)]
 49 |     }
 50 | 
 51 |     list(files= fileurls, metadata = metadata)
 52 | }
 53 | 
 54 | .getHaemCode <- function(justRunUnitTest=FALSE) {
 55 |     result <- .getHaemCodeFileNames(justRunUnitTest) 
 56 |     
 57 |     haemfiles <- result$files
 58 |     tags <- result$metadata
 59 |     
 60 |     if(length(haemfiles)==0) 
 61 |         stop(" File List not found! ")
 62 |     
 63 |     df <- .httrFileInfo(haemfiles, verbose=TRUE)
 64 |     title <- basename(haemfiles)
 65 |     type <- tools::file_ext(title)
 66 |     
 67 |     fileType <- sapply(type, function(x) 
 68 |         switch(x, bw="bigWig", bed="peak", csv="geneList"), 
 69 |         USE.NAMES =FALSE)
 70 |     
 71 |     description <- paste0(fileType, " file from Haemcode")
 72 |         
 73 |     dispatchclass <- sapply(type, function(x) 
 74 |         switch(x, bw="BigWigFile", bed="BEDFile", csv="CSVtoGranges"), 
 75 |         USE.NAMES =FALSE)
 76 |     
 77 |     sourcetype <- sapply(type, function(x) 
 78 |         switch(x, bw="BigWig", bed="BED", csv="CSV"), 
 79 |         USE.NAMES =FALSE)
 80 |    
 81 |     rdataclass <- sapply(type, function(x)
 82 |         switch(x, bw="BigWigFile", bed="GRanges", csv="GRanges"),
 83 |         USE.NAMES =FALSE)
 84 |    
 85 |   
 86 |     cbind(df, title,  description, fileType, tags, dispatchclass,
 87 |           sourcetype, rdataclass, stringsAsFactors=FALSE)
 88 |     
 89 | }
 90 | 
 91 | makeHaemCodeImporter <- function(currentMetadata, justRunUnitTest=FALSE,
 92 |                                  BiocVersion=BiocManager::version()) {
 93 |     rsrc <- .getHaemCode(justRunUnitTest)
 94 |         
 95 |     ## input_sources table
 96 |     sourceSize <- as.numeric(rsrc$size)
 97 |     sourceUrls <- rsrc$fileurl
 98 |     sourceVersion <- gsub(" ", "_", rsrc$date) # should be character
 99 |     SourceLastModifiedDate <- rsrc$date  # should be "POSIXct" "POSIXt"
100 |     sourceType <- rsrc$sourcetype
101 |     rdataclass <- rsrc$rdataclass     
102 | 
103 |     ## resources table
104 |     title <- rsrc$title
105 |     # dataprovider, species, taxonomyid, genome are same for all files
106 |     description <- rsrc$description
107 |     # maintainer, cordinateBased, status_id, location_prefix, rdataadded, 
108 |     # preparerclss are same for all files
109 |     
110 |     rdatapath <- sub(.haemcodeBaseUrl, "", sourceUrls)
111 |     dispatchclass <- rsrc$dispatchclass
112 |     
113 |     tags <- strsplit(rsrc$tags, ", ")
114 |     
115 |     Map(AnnotationHubMetadata,
116 |         
117 |         SourceSize=sourceSize,
118 |         SourceUrl=sourceUrls,
119 |         SourceVersion=sourceVersion,
120 |         SourceLastModifiedDate = SourceLastModifiedDate,
121 |         SourceType = sourceType,
122 |         
123 |         Description=description, 
124 |         Title=title, 
125 |                 
126 |         RDataPath=rdatapath,
127 |         DispatchClass = dispatchclass,
128 |         RDataClass = rdataclass,        
129 | 
130 |         Tags=tags,
131 |         
132 |         MoreArgs=list(
133 |             BiocVersion=BiocVersion,
134 |             DataProvider = "Haemcode",
135 |             Species="Mus musculus",
136 |             TaxonomyId=10090L,
137 |             Genome= "mm10",
138 |             Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",            
139 |             Coordinate_1_based = FALSE,
140 |             Location_Prefix = .haemcodeBaseUrl,
141 |             RDataDateAdded = Sys.time(),
142 |             Recipe = NA_character_)
143 |             )
144 | }
145 | 
146 | makeAnnotationHubResource("HaemCodeImportPreparer", makeHaemCodeImporter)
147 | 


--------------------------------------------------------------------------------
/inst/unitTests/test_recipe.R:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | ## Tests to just see if we can run all of our recipes
  3 | 
  4 | ahroot <- file.path(getwd(),"Temp")
  5 | BiocVersion <- as.character(BiocManager::version())
  6 | 
  7 | ## No longer used:
  8 | ## test_HaemCodeImportPreparer_recipe
  9 | ## test_Inparanoid8ImportPreparer_recipe
 10 | ## test_BioPaxImportPreparer_recipe
 11 | 
 12 | 
 13 | ## FIXME:
 14 | ## Both UCSC broken because location / format of eutils file has changed;
 15 | ## See .organismToTaxid()
 16 | #test_UCSCChainPreparer_recipe <- function() {
 17 | #    ahms = updateResources(ahroot, BiocVersion,
 18 | #        preparerClasses = "UCSCChainPreparer",
 19 | #        insert = FALSE, metadataOnly=TRUE, justRunUnitTest=TRUE)
 20 | #    checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
 21 | #}
 22 | #
 23 | #test_UCSC2BitPreparer_recipe <- function() {
 24 | #    ahms = updateResources(ahroot, BiocVersion,
 25 | #        preparerClasses = "UCSC2BitPreparer",
 26 | #        insert = FALSE, metadataOnly=TRUE, justRunUnitTest=TRUE)
 27 | #    checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
 28 | #}
 29 | 
 30 | test_EncodeImportPreparer_recipe <- function() {
 31 |     ahms = updateResources(ahroot, BiocVersion,
 32 |         preparerClasses = "EncodeImportPreparer",
 33 |         insert = FALSE, metadataOnly=TRUE, justRunUnitTest=TRUE)
 34 |     checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
 35 | }
 36 | 
 37 | ## FIX ME:
 38 | ## Broken becasue of change in R and encoding
 39 | ## breaks makeEpigenomeRoadmap.R line 56 in gsub
 40 | #test_EpigenomeRoadmapImportPreparer_recipe <- function() {
 41 | #    ahms = updateResources(ahroot, BiocVersion,
 42 | #        preparerClasses = "EpigenomeRoadMapPreparer",
 43 | #        insert = FALSE, metadataOnly=TRUE, justRunUnitTest=TRUE)
 44 | #    checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
 45 | #}
 46 | 
 47 | test_dbSNPVCFPreparer_recipe <- function() {
 48 |     ahms = updateResources(ahroot, BiocVersion,
 49 |         preparerClasses = "dbSNPVCFPreparer",
 50 |         insert = FALSE, metadataOnly=TRUE, justRunUnitTest=TRUE)
 51 |     checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
 52 | }
 53 | 
 54 | test_RefNetImportPreparer_recipe <- function() {
 55 |     ahms = updateResources(ahroot, BiocVersion,
 56 |         preparerClasses = "RefNetImportPreparer",
 57 |         insert = FALSE, metadataOnly=TRUE)
 58 |      checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
 59 | }
 60 | 
 61 | test_ChEAPreparer_recipe <- function() {
 62 |     ahms = updateResources(ahroot, BiocVersion,
 63 |         preparerClasses = "ChEAImportPreparer",
 64 |         insert = FALSE, metadataOnly=TRUE)
 65 |     checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
 66 | }
 67 | 
 68 | test_NCBIImportPreparer_recipe <- function() {
 69 |     ahms = updateResources(ahroot, BiocVersion,
 70 |                                preparerClasses = "NCBIImportPreparer",
 71 |                                insert = FALSE, metadataOnly=TRUE,
 72 |                                justRunUnitTest=TRUE)
 73 |     checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
 74 | }
 75 | 
 76 | test_Grasp2Db_recipe <- function() {
 77 |     ahms = updateResources(ahroot, BiocVersion,
 78 |                            preparerClasses = "Grasp2ImportPreparer",
 79 |                            insert = FALSE, metadataOnly=TRUE,
 80 |                            justRunUnitTest=TRUE)
 81 |     checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
 82 | }
 83 | 
 84 | ## FIXME: add test_EnsemblFasta
 85 | ## test_EnsmblFastaTwoBitToAHM <- function() {
 86 | ##     ahms = updateResources(ahroot, BiocVersion,
 87 | ##                            preparerClasses = "EnsemblTwoBitPreparer",
 88 | ##                            insert = FALSE, metadataOnly = TRUE,
 89 | ##                            justRunUnitTest = TRUE, release = 96)
 90 | ##     checkTrue(class(ahms[[1]]) == "AnnotationHubMetadata")
 91 | ##     # fails before ensembl release 96
 92 | ##     checkException(
 93 | ##         updateResources(ahroot, BiocVersion,
 94 | ##                         preparerClasses = "EnsemblTwoBitPreparer",
 95 | ##                         insert = FALSE, metadataOnly = TRUE,
 96 | ##                         justRunUnitTest = TRUE, release = 85)
 97 | ##         )
 98 | ## }
 99 | 
100 | 
101 | ## Test_EnsemblGtfToGRanges_recipe <- function() {
102 | ##     ahms = updateResources(ahroot, BiocVersion,
103 | ##                            preparerClasses = "EnsemblGtfImportPreparer",
104 | ##                            insert = FALSE, metadataOnly=TRUE,
105 | ##                            release = "96", justRunUnitTest=TRUE)
106 | ##     checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
107 | ##     # fails before ensembl release 96
108 | ##     checkException(
109 | ##         updateResources(ahroot, BiocVersion,
110 | ##                         preparerClasses = "EnsemblGtfImportPreparer",
111 | ##                         insert = FALSE, metadataOnly=TRUE,
112 | ##                         release = "85", justRunUnitTest=TRUE)
113 | ##         )
114 | ## }
115 | 
116 | #test_GencodeGFF <- function() {
117 | #    ahms = updateResources(ahroot, BiocVersion,
118 | #                           preparerClasses = "GencodeGffImportPreparer",
119 | #                           insert = FALSE, metadataOnly=TRUE,
120 | #                           justRunUnitTest=TRUE, release="31")
121 | #    checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
122 | #}
123 | 
124 | #test_GencodeFasta <- function() {
125 | #    ahms = updateResources(ahroot, BiocVersion,
126 | #                           preparerClasses = "GencodeFastaImportPreparer",
127 | #                           insert = FALSE, metadataOnly=TRUE,
128 | #                           justRunUnitTest=TRUE, species="Human",
129 | #                           release="23")
130 | #    checkTrue(class(ahms[[1]])=="AnnotationHubMetadata")
131 | #}
132 | 


--------------------------------------------------------------------------------
/R/HubMetadata-class.R:
--------------------------------------------------------------------------------
  1 | ### =========================================================================
  2 | ### HubMetadata objects
  3 | ### -------------------------------------------------------------------------
  4 | ###
  5 | 
  6 | setOldClass(c("POSIXct", "POSIXt"))
  7 | setOldClass("numeric_version")
  8 | setOldClass(c("package_version", "numeric_version"))
  9 | 
 10 | ## The prototype needs to be fully specified, using 'NA' to indicate
 11 | ## unknown, otherwise to / from JSON is confused
 12 | setClass("HubMetadata",
 13 |     representation(
 14 |         "VIRTUAL",
 15 |         HubRoot="character",
 16 |         BiocVersion="package_version",
 17 |         Coordinate_1_based="logical",
 18 |         DataProvider="character",
 19 |         DerivedMd5="character",
 20 |         Description='character',
 21 |         Genome="character",                 ## needed for record_id
 22 |         Maintainer="character",
 23 |         Notes='character',
 24 |         RDataClass="character",             ## needed for record_id
 25 |         RDataDateAdded="POSIXct",
 26 |         RDataPath="character",
 27 |         Recipe="character",                 ## no longer needed for record_id
 28 |         SourceLastModifiedDate="POSIXct",
 29 |         SourceMd5="character",
 30 |         SourceSize="numeric",
 31 |         SourceUrl="character",              ## needed for record_id
 32 |         SourceVersion="character",
 33 |         SourceType="character",
 34 |         Species="character",
 35 |         Tags='character',
 36 |         TaxonomyId="integer",               ## needed for record_id
 37 |         Title="character",
 38 |         Location_Prefix="character",
 39 |         DispatchClass="character",
 40 |         PreparerClass="character",          ## needed for record_id
 41 |         Error="character"
 42 |     ),
 43 |     prototype = prototype(
 44 |         HubRoot=NA_character_,
 45 |         BiocVersion=BiocManager::version(),
 46 |         Coordinate_1_based=NA,
 47 |         DataProvider=NA_character_,
 48 |         DerivedMd5=NA_character_,
 49 |         Description=NA_character_,
 50 |         Genome=NA_character_,
 51 |         Maintainer=
 52 |             "Bioconductor Package Maintainer <maintainer@bioconductor.org>",
 53 |         Notes=NA_character_,
 54 |         RDataClass=NA_character_,
 55 |         RDataDateAdded=as.POSIXct(NA_character_),
 56 |         RDataPath=NA_character_,
 57 |         Recipe=NA_character_,
 58 |         SourceLastModifiedDate=as.POSIXct(NA_character_),
 59 |         SourceMd5=NA_character_,
 60 |         SourceSize=NA_real_,
 61 |         SourceVersion=NA_character_,
 62 |         SourceType=NA_character_,
 63 |         Species=NA_character_,
 64 |         Tags=NA_character_,
 65 |         TaxonomyId=NA_integer_,
 66 |         Title=NA_character_,
 67 |         Location_Prefix=NA_character_,
 68 |         DispatchClass=NA_character_,
 69 |         PreparerClass=NA_character_,
 70 |         Error="NA_character"
 71 |     )
 72 | )
 73 | 
 74 | ## ----------------------------------------------------------------------------
 75 | ## generics
 76 | ##
 77 | 
 78 | setGeneric("recipeName", signature="object",
 79 |     function(object) standardGeneric ("recipeName")
 80 | )
 81 | 
 82 | setGeneric("inputFiles", signature="object",
 83 |            function(object, ...) standardGeneric ("inputFiles")
 84 | )
 85 | 
 86 | setGeneric("outputFile", signature="object",
 87 |            function(object) standardGeneric ("outputFile")
 88 | )
 89 | 
 90 | setGeneric("run", signature="object",
 91 |     function(object, recipeFunction, ...) standardGeneric ("run")
 92 | )
 93 | 
 94 | setGeneric("hubError", function(x) standardGeneric("hubError"))
 95 | 
 96 | setGeneric("hubError<-", signature=c("x", "value"),
 97 |     function(x, value) standardGeneric("hubError<-")
 98 | )
 99 | 
100 | ## ------------------------------------------------------------------------------
101 | ## getters and setters
102 | ##
103 | 
104 | setMethod("metadata", "HubMetadata",
105 |     function(x, ...) {
106 |         nms <- slotNames(class(x))
107 |         names(nms) <- nms
108 |         lapply(nms, slot, object=x)
109 |     }
110 | )
111 | 
112 | setReplaceMethod("metadata", c("HubMetadata", "list"),
113 |      function(x, ..., value)
114 |          do.call(new, c(class(x), x, value))
115 | )
116 | 
117 | setMethod("recipeName", "HubMetadata",
118 |     function(object)
119 |         metadata(object)$Recipe
120 | )
121 | 
122 | setMethod("inputFiles", "HubMetadata",
123 |     function(object, useRoot=TRUE) {
124 |         if(useRoot==TRUE){
125 |             res <- file.path(metadata(object)$HubRoot,
126 |                              metadata(object)$RDataPath)
127 |         }else{
128 |             res <- metadata(object)$SourceUrl
129 |         }
130 |         res
131 |     }
132 | )
133 | 
134 | setMethod("outputFile", "HubMetadata",
135 |     function(object)
136 |         file.path(metadata(object)$HubRoot,
137 |                   basename(metadata(object)$RDataPath))
138 | )
139 | 
140 | setMethod("hubError", "HubMetadata",
141 |     function(x) x@Error
142 | )
143 | 
144 | setMethod("hubError", "list",
145 |     function(x)
146 |     {
147 |         if (!all(sapply(x, is, "HubMetadata")))
148 |             stop("all elements of 'value' must be 'HubMetadata' objects")
149 |         sapply(x, hubError)
150 |     }
151 | )
152 | 
153 | setReplaceMethod("hubError", c("HubMetadata", "character"),
154 |     function(x, value)
155 |     {
156 |         x@Error <- value
157 |         x
158 |     }
159 | )
160 | 
161 | setReplaceMethod("hubError", c("list", "character"),
162 |     function(x, value)
163 |     {
164 |         if (!all(sapply(x, is, "HubMetadata")))
165 |             stop("all elements of 'x' must be 'HubMetadata' objects")
166 |         lapply(x, "hubError<-", value=value)
167 |     }
168 | )
169 | 
170 | ## ------------------------------------------------------------------------------
171 | ## show
172 | ##
173 | 
174 | setMethod(show, "HubMetadata",
175 |     function(object)
176 | {
177 |     cat("class: ", class(object), '\n', sep='')
178 |     for (slt in sort(slotNames(object))) {
179 |         value <- slot(object, slt)
180 |         txt <- paste0(slt, ": ", paste0(as.character(value), collapse=" "))
181 |         cat(strwrap(txt), sep="\n  ")
182 |     }
183 | })
184 | 


--------------------------------------------------------------------------------
/man/makeStandardOrgDbs.Rd:
--------------------------------------------------------------------------------
  1 | \name{makeStandardOrgDbs}
  2 | 
  3 | \alias{makeStandardOrgDbs}
  4 | \alias{makeStandardOrgDbsToAHM}
  5 | \alias{makeStandardTxDbs}
  6 | \alias{makeStandardTxDbsToAHM}
  7 | \alias{makeNonStandardOrgDbs}
  8 | \alias{makeNCBIToOrgDbsToAHM}
  9 | 
 10 | \title{Functions to add OrgDb and TxDb sqlite files to AnnotationHub}
 11 | 
 12 | \description{Add OrgDb and TxDb sqlite files to AnnotationHub}
 13 | 
 14 | \usage{
 15 | makeStandardOrgDbsToAHM(currentMetadata, justRunUnitTest = FALSE, 
 16 |                         BiocVersion = BiocManager::version(), 
 17 |                         downloadOrgDbs = TRUE)
 18 | 
 19 | makeStandardTxDbsToAHM(currentMetadata, justRunUnitTest = FALSE, 
 20 |                        BiocVersion = BiocManager::version(), TxDbs)
 21 | 
 22 | makeNCBIToOrgDbsToAHM(currentMetadata, justRunUnitTest = FALSE,
 23 |                       BiocVersion = BiocManager::version(), 
 24 |                       baseUrl = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/")
 25 | }
 26 | 
 27 | \arguments{
 28 |   \item{currentMetadata}{
 29 |     Historically was intended to be a list of metadata to filter, i.e.,
 30 |     records that do not need to be processed again. In some recipes this
 31 |     is used as a way to pass additional arguments. Need to remove or 
 32 |     make consistent.
 33 |   }
 34 |   \item{baseUrl}{
 35 |     A \code{character()}. The file location.
 36 |   } 
 37 |   \item{justRunUnitTest}{
 38 |     A \code{logical}. When TRUE, a small number of records (usually <= 5) are
 39 |     processed instead of all.
 40 |   }
 41 |   \item{BiocVersion}{
 42 |     A \code{character(1)}. The resource will be available for Bioconductor 
 43 |     versions greater than and equal to this version. Default is
 44 |     BiocManaer::version().
 45 |   }
 46 |   \item{TxDbs}{
 47 |     Character vector of the \code{TxDb} names; generally includes
 48 |     \code{TxDbs} that were new or updated for the current release.
 49 |   }
 50 |   \item{downloadOrgDbs}{
 51 |     A \code{logical}. Indicates if all \code{OrgDb} packages in the 
 52 |     Bioconductor repo should be downloaded and installed. This should be 
 53 |     \code{TRUE} the first time the recipe is run and can be \code{FALSE} for
 54 |     subsequent runs when testing.
 55 |   }
 56 | }
 57 | 
 58 | \details{
 59 |   \code{makeStandardOrgDbsToAHM} and \code{makeStandardTxDbsToAHM} extracts
 60 |   the sqlite files from the existing \code{OrgDb} and \code{TxDb} packages
 61 |   in the Bioconductor repositories and generate associated metadata.
 62 | 
 63 |   \code{makeNCBIToOrgDbsToAHM} creates sqlite files and metadata for 1000
 64 |   organisms with the \code{makeOrgPackageFromNCBI} function. These
 65 |   organisms are less 'main stream' than those hosted in the Bioconductor
 66 |   repository (\code{makeStandardOrgDbsToAHM}) and the databases are less 
 67 |   comprehensive because data only come from one source, NCBI.
 68 | }
 69 | 
 70 | \value{
 71 |   List of \code{AnnotationHubMetadata} objects. 
 72 | }
 73 | 
 74 | \author{Bioconductor Core Team}
 75 | 
 76 | \seealso{
 77 |   \itemize{
 78 |     \item \link{updateResources}
 79 |     \item \link{AnnotationHubMetadata}
 80 |   }
 81 | }
 82 | 
 83 | \examples{
 84 | \dontrun{
 85 | ## In Bioconductor 3.5, one new TxDb was added and 4 active
 86 | ## tracks were updated. This piece of code shows how to add these 5
 87 | ## packages to AnnotationHub.
 88 | 
 89 | ## Step I: generate metadata 
 90 | ##
 91 | ## Generate the metadata with the low-level helper for inspection.
 92 | TxDbs <- c("TxDb.Ggallus.UCSC.galGal5.refGene",
 93 |            "TxDb.Celegans.UCSC.ce11.refGene",
 94 |            "TxDb.Rnorvegicus.UCSC.rn5.refGene",
 95 |            "TxDb.Dmelanogaster.UCSC.dm6.ensGene",
 96 |            "TxDb.Rnorvegicus.UCSC.rn6.refGene")
 97 | meta <- makeStandardTxDbsToAHM(currentMetadata=list(AnnotationHubRoot="TxDbs"), 
 98 |                                justRunUnitTest=FALSE, 
 99 |                                TxDbs = TxDbs)
100 | 
101 | ## Once the low-level helper runs with no errors, try generating the
102 | ## metadata with the high-level wrapper updateResources(). Setting
103 | ## metadataOnly=TRUE will generate metadata only and not push resources
104 | ## to data bucket. insert=FALSE prevents the metadata from being inserted in the
105 | ## database.
106 | ##
107 | ## The metadata generated by updateResources() will be the same as that
108 | ## generated by makeStandardTxDbsToAHM(). Both should be a list the same
109 | ## length as the number of TxDbs specified.
110 | meta <- updateResources("TxDbs",
111 |                         preparerClasses="TxDbFromPkgsImportPreparer",
112 |                         metadataOnly=TRUE, insert = FALSE,
113 |                         justRunUnitTest=FALSE, TxDbs = TxDbs)
114 | 
115 | INFO [2017-04-11 09:12:09] Preparer Class: TxDbFromPkgsImportPreparer
116 | complete!
117 | > length(meta)
118 | [1] 5
119 | 
120 | ## Step II: push resources to Azure 
121 | ##
122 | ## If the metadata looks correct we are ready to push resources to Azure.
123 | ## Set metadataOnly=FALSE but keep insert=FALSE.
124 | 
125 | ## export an environment variable with a core generated SAS URL for
126 | ##   upload example:
127 | ##  export AZURE_SAS_URL='https://bioconductorhubs.blob.core.windows.net/staginghub?sp=racwl&st=2022-02-08T15:57:00Z&se=2022-02-22T23:57:00Z&spr=https&sv=2020-08-04&sr=c&sig=fBtPzgrw1Akzlz%2Fwkne%2BQrxOKOdCzP1%2Fk5S%2FHk1LguE%3D'
128 | 
129 | meta <- updateResources("TxDbs",
130 |                         BiocVersion="3.5",
131 |                         preparerClasses="TxDbFromPkgsImportPreparer",
132 |                         metadataOnly=FALSE, insert = FALSE,
133 |                         justRunUnitTest=FALSE, TxDbs = TxDbs)
134 | 
135 | ## Step III: insert metadata in AnnotationHub production database
136 | ##
137 | ## Inserting the metadata in the database is usually done as a separte step
138 | ## and with the help of the AnnotationHub docker.
139 | ## Set metadataOnly=TRUE and insert=TRUE.
140 | meta <- updateResources("TxDbs",
141 |                         BiocVersion="3.5",
142 |                         preparerClasses="TxDbFromPkgsImportPreparer",
143 |                         metadataOnly=FALSE, insert = FALSE,
144 |                         justRunUnitTest=FALSE, TxDbs = TxDbs)
145 | 
146 | }
147 | }
148 | 
149 | \keyword{methods}
150 | 


--------------------------------------------------------------------------------
/inst/scripts/singleContributedResourceTemplate.R:
--------------------------------------------------------------------------------
  1 | ## Community contributed resources.
  2 | 
  3 | ## -----------------------------------------------------------------------
  4 | ## Timothee Flutre's GRanges from GFF
  5 | ## -----------------------------------------------------------------------
  6 | 
  7 | ## Vitis vinifera URGI IGGP12Xv2_V3-20
  8 | metadata <- AnnotationHubMetadata(
  9 |     Description="Gene Annotation for Vitis vinifera",
 10 |     Genome="IGGP12Xv2",
 11 |     Species="Vitis vinifera",
 12 |     SourceUrl="http://doi.org/10.15454/1.5009072354498936E12",
 13 |     SourceLastModifiedDate=as.POSIXct("2018-02-16"),
 14 |     SourceVersion="3.0",
 15 |     RDataPath="community/tflutre/Vvinifera_URGI_IGGP12Xv2_V3-20.gff3.Rdata",
 16 |     TaxonomyId=29760,
 17 |     Title="Vvinifera_URGI_IGGP12Xv2_V3-20.gff3.Rdata",
 18 |     BiocVersion=package_version("3.6"),
 19 |     Coordinate_1_based=TRUE,
 20 |     DataProvider="URGI",
 21 |     Maintainer="Timothée Flutre <timothee.flutre@inra.fr>",
 22 |     RDataClass="GRanges",
 23 |     DispatchClass="GRanges",
 24 |     SourceType="GFF",
 25 |     RDataDateAdded=as.POSIXct(Sys.time()),
 26 |     Recipe=NA_character_,
 27 |     PreparerClass="None",
 28 |     Tags=c("GFF", "URGI", "Gene", "Transcript", "Annotation"),
 29 |     Notes="compare to the original GFF3 file, chromosomes were slightly renamed to be compatible with the reference genome"
 30 | )
 31 | 
 32 | ## Vitis vinifera URGI IGGP12Xv2 V3
 33 | metadata <- AnnotationHubMetadata(
 34 |     Description="Gene Annotation for Vitis vinifera",
 35 |     Genome="IGGP12Xv2",
 36 |     Species="Vitis vinifera",
 37 |     SourceUrl="http://doi.org/10.15454/1.5009072354498936E12",
 38 |     SourceLastModifiedDate=as.POSIXct("2017-11-17"),
 39 |     SourceVersion="3.0",
 40 |     RDataPath="community/tflutre/Vvinifera_URGI_IGGP12Xv2_V3.gff3.Rdata",
 41 |     TaxonomyId=29760,
 42 |     Title="Vvinifera_URGI_IGGP12Xv2_V3.gff3.Rdata",
 43 |     BiocVersion=package_version("3.6"),
 44 |     Coordinate_1_based=TRUE,
 45 |     DataProvider="URGI",
 46 |     Maintainer="Timothée Flutre <timothee.flutre@inra.fr>",
 47 |     RDataClass="GRanges",
 48 |     DispatchClass="GRanges",
 49 |     SourceType="GFF",
 50 |     RDataDateAdded=as.POSIXct(Sys.time()),
 51 |     Recipe=NA_character_,
 52 |     PreparerClass="None",
 53 |     Tags=c("GFF", "URGI", "Gene", "Transcript", "Annotation"),
 54 |     Notes="compare to the original GFF3 file, chromosomes were slightly renamed to be compatible with the reference genome"
 55 | )
 56 | 
 57 | ## Vitis vinifera CRIBI IGGP12Xv0 V2
 58 | metadata <- AnnotationHubMetadata(
 59 |     Description="Gene Annotation for Vitis vinifera",
 60 |     Genome="IGGP12Xv0",
 61 |     Species="Vitis vinifera",
 62 |     SourceUrl="http://genomes.cribi.unipd.it/DATA/V2/V2.1/V2.1.gff3",
 63 |     SourceLastModifiedDate=as.POSIXct("2014-04-17"),
 64 |     SourceVersion="2.1",
 65 |     RDataPath="community/tflutre/",
 66 |     TaxonomyId=29760L,
 67 |     Title="Vvinifera_CRIBI_IGGP12Xv0_V2.1.gff3.Rdata",
 68 |     BiocVersion=package_version("3.3"),
 69 |     Coordinate_1_based=TRUE,
 70 |     DataProvider="CRIBI",
 71 |     Maintainer="Timothée Flutre <timothee.flutre@supagro.inra.fr",
 72 |     RDataClass="GRanges",
 73 |     DispatchClass="GRanges",
 74 |     SourceType="GFF",
 75 |     RDataDateAdded=as.POSIXct(Sys.time()),
 76 |     Recipe=NA_character_,
 77 |     PreparerClass="None",
 78 |     Tags=c("GFF", "CRIBI", "Gene", "Transcript", "Annotation"),
 79 |     Notes="chrUn renamed to chrUkn"
 80 | )
 81 | 
 82 | ## Vitis vinifera Genoscope IGGP12Xv0
 83 | metadata <- AnnotationHubMetadata(
 84 |     Description="Gene Annotation for Vitis vinifera",
 85 |     Genome="IGGP12Xv0",
 86 |     Species="Vitis vinifera",
 87 |     SourceUrl="http://www.genoscope.cns.fr/externe/Download/Projets/Projet_ML/data/12X/annotation/Vitis_vinifera_annotation.gff.gz",
 88 |     SourceLastModifiedDate=as.POSIXct("2010-03-19"),
 89 |     SourceVersion="1.0",
 90 |     RDataPath="community/tflutre/",
 91 |     TaxonomyId=29760L,
 92 |     Title="Vvinifera_Genoscope_IGGP12Xv0_V1.0.gff3.Rdata",
 93 |     BiocVersion=package_version("3.3"),
 94 |     Coordinate_1_based=TRUE,
 95 |     DataProvider="Genoscope",
 96 |     Maintainer="Timothée Flutre <timothee.flutre@supagro.inra.fr",
 97 |     RDataClass="GRanges",
 98 |     DispatchClass="GRanges",
 99 |     SourceType="GFF",
100 |     RDataDateAdded=as.POSIXct(Sys.time()),
101 |     Recipe=NA_character_,
102 |     PreparerClass="None",
103 |     Tags=c("GFF", "Genoscope", "Gene", "Transcript", "Annotation"),
104 |     Notes="Original file format was upgraded from GFF2 to GFF3; includes rows corresponding to gene/mRNA/CDS only; chrUn renamed to chrUkn"
105 | )
106 | 
107 | ## Vitis vinifera Genoscope IGGP8X
108 | metadata <- AnnotationHubMetadata(
109 |     Description="Gene Annotation for Vitis vinifera",
110 |     Genome="IGGP8X",
111 |     Species="Vitis vinifera",
112 |     SourceUrl="http://www.genoscope.cns.fr/externe/Download/Projets/Projet_ML/data/8X/annotation/Vitis_vinifera_annotation_v1.gff",
113 |     SourceLastModifiedDate=as.POSIXct("2007-10-09"),
114 |     SourceVersion="1.0",
115 |     RDataPath="community/tflutre/",
116 |     TaxonomyId=29760L,
117 |     Title="Vvinifera_Genoscope_IGGP8X_V1.0.gff3.Rdata",
118 |     BiocVersion=package_version("3.3"),
119 |     Coordinate_1_based=TRUE,
120 |     DataProvider="Genoscope",
121 |     Maintainer="Timothée Flutre <timothee.flutre@supagro.inra.fr",
122 |     RDataClass="GRanges",
123 |     DispatchClass="GRanges",
124 |     SourceType="GFF",
125 |     RDataDateAdded=as.POSIXct(Sys.time()),
126 |     Recipe=NA_character_,
127 |     PreparerClass="None",
128 |     Tags=c("GFF", "Genoscope", "Gene", "Transcript", "Annotation"),
129 |     Notes="Original file format was upgraded from GFF2 to GFF3; includes rows corresponding to gene/mRNA/CDS only; chrUn renamed to chrUkn"
130 | )
131 | 
132 | ## -----------------------------------------------------------------------
133 | ## upload to S3
134 | file <-
135 | bucket <- getOption("ANNOTATION_HUB_BUCKET_NAME", "annotationhub")
136 | remotePath <- paste0(metadata(metadata)$RDataPath, metadata(metadata)$Title)
137 | res <- upload_to_S3(file, remotePath, bucket)
138 | 
139 | ## insert metadata
140 | metadata$RDataPath = paste0(metadata$RDataPath, metadata$Title)
141 | url <- getOption("AH_SERVER_POST_URL")
142 | pushMetadata(list(metadata), url)
143 | 


--------------------------------------------------------------------------------
/R/webAccessFunctions.R:
--------------------------------------------------------------------------------
  1 | ### =========================================================================
  2 | ### HTTP and FTP helpers
  3 | ### -------------------------------------------------------------------------
  4 | ###
  5 | 
  6 | ## Remove remove "\n" when inserting long text (> 80 chars)
  7 | .expandLine <- function(x)
  8 |     gsub("[[:space:]]{2,}"," ", x)
  9 | 
 10 | ## -----------------------------------------------------------------------------
 11 | ## HTTP
 12 | ## 
 13 | 
 14 | ## Parses xml from a http page into a data.frame of filenames, 
 15 | ## List filenames or filenames ending with an extension on http page.
 16 | ## Also reads the md5sum in "md5sum.txt" on the same http page.
 17 | .httrRead <- function(url, xpathString="//pre/a/text()", 
 18 |                       extension=NA_character_, getmd5sum=FALSE) {
 19 |     tryCatch({
 20 |         result <- httpGET(url)
 21 |         html <- htmlParse(result, asText=TRUE)
 22 | 
 23 |         fls <- getNodeSet(html, xpathString)
 24 |         if (is(fls, "XMLNodeSet"))
 25 |             fls <- vapply(fls, xmlValue, character(1L))
 26 | 
 27 |         md5exists <- length(grep("md5sum.txt", fls))!=0
 28 |         remove <- c("Name", "Size", "Last modified", "Description",
 29 |                     "Parent Directory", "referenceSequences/",
 30 |                     "files.txt", "md5sum.txt", "supplemental/")
 31 |         fls <- fls[!fls %in% remove ]
 32 | 
 33 |         ## filter by extension
 34 |         if(!is.na(extension)){
 35 |             fls <- fls[grepl(paste0(extension, "$"), fls)]
 36 |         }
 37 | 
 38 |         ## UCSC chain and 2bit files have a file called md5sum.txt
 39 |         ## col1=md5sum, col2=filename
 40 |         ## note : not all chain files have md5sum on UCSC website!
 41 |         if(getmd5sum & md5exists & length(fls!=0)) {
 42 |             df <- read.table(paste0(url, "/", "md5sum.txt"), header=FALSE,
 43 |                              stringsAsFactors=FALSE)
 44 |             md5sum <- df[match(fls, df[,2]),1]
 45 |             df <- data.frame(files=fls, md5sum=md5sum,  stringsAsFactors=FALSE)
 46 |         } else
 47 |             df <- data.frame(files=fls, stringsAsFactors=FALSE)
 48 |         df
 49 | 
 50 |     }, error=function(err) {
 51 |         warning(basename(url), ": ", conditionMessage(err))
 52 |         url=character()
 53 |     })
 54 | }
 55 | 
 56 | 
 57 | ## Returns data.frame with fileurl, last modified date and file size
 58 | .httrFileInfo <- function(urls, verbose=TRUE) {
 59 |     result <- lapply(urls, function(f){
 60 |         if(verbose)
 61 |             message(paste0("getting file info: ", basename(f)))
 62 |         tryCatch({
 63 |             h = suppressWarnings(
 64 |               httpGET(f, nobody=TRUE, filetime=TRUE, header=TRUE))
 65 | 
 66 |             nams <- names(h$header)
 67 |             if("last-modified" %in% nams)
 68 |                  h$header[c("last-modified", "content-length")]
 69 | 	    else
 70 |                 c("last-modified"=NA, "content-length"=NA)
 71 |         }, error=function(err) {
 72 |         warning(basename(f), ": ", conditionMessage(err))
 73 |         list("last-modified"=character(), "content-length"=character())
 74 |         })
 75 |     })
 76 | 
 77 |     size <- as.numeric(sapply(result, "[[", "content-length"))
 78 |     date <- strptime(sapply(result, "[[", "last-modified"),
 79 |              "%a, %d %b %Y %H:%M:%S", tz="GMT")
 80 | 
 81 |     data.frame(fileurl=urls, date, size, stringsAsFactors=FALSE)
 82 | }
 83 | 
 84 | ## -----------------------------------------------------------------------------
 85 | ## FTP 
 86 | ## 
 87 | 
 88 | ## Returns a data.frame with fileurl, last modified date and file size.
 89 | ## 'extension' can be a single file name with extension or just the extension.
 90 | .ftpFileInfo <- function(url, extension, verbose=FALSE) {
 91 | 
 92 |     if (verbose)
 93 |         message(paste0("creating urls ..."))
 94 | 
 95 |     result <- lapply(url, function(ul) {
 96 |         message(ul)
 97 |         N.TRIES = 3L
 98 |         while (N.TRIES > 0L) {
 99 |             con <- tryCatch(getURL(ul), error=identity)
100 |             if (!inherits(con, "error"))
101 |                 break
102 |             Sys.sleep(300)
103 |             N.TRIES <- N.TRIES - 1L
104 |         }
105 |         if (N.TRIES == 0L) {
106 |             stop("'getURL()' failed:",
107 |                  "\n  URL: ", ul,
108 |                  "\n  error: ", conditionMessage(con))
109 |         }
110 | 
111 |         txt <- read.table(text=con, stringsAsFactors=FALSE, fill=TRUE)
112 | 
113 |         files <- txt[[9]]
114 |         if (verbose)
115 |             message(basename(ul))
116 | 
117 |         pattern <- paste(paste0(extension, "$"), collapse="|")
118 |         keep <- !grepl("00-", files) & grepl(pattern, files)
119 |         txt <- txt[keep, ]
120 |         if (nrow(txt) == 0L)
121 |             return(data.frame(fileurl=character(), 
122 |                               date=as.POSIXct(character()),
123 |                               size=numeric()))
124 | 
125 |         # last modified date and size
126 |         dateraw <- apply(txt, 1, function(xx) paste(xx[6], xx[7], xx[8]))
127 |         datestring <- lapply(dateraw, function(xx) {
128 |             as.POSIXct(strptime(xx, format="%b %e %H:%M", tz="GMT"))
129 |         })
130 |         if (any(is.na(datestring))) {
131 |             datestring <- lapply(dateraw, function(xx) {
132 |                 as.POSIXct(strptime(xx, format="%b %e %Y", tz="GMT"))
133 |             })
134 |         }
135 | 
136 |         data.frame(fileurl=paste0(ul, txt[[9]]), date=do.call(c, datestring), 
137 |                    size=as.numeric(txt[[5]]), stringsAsFactors=FALSE)
138 |     })
139 | 
140 |     do.call(rbind, result)
141 | }
142 | 
143 | .parseDirInfo <- function(info) {
144 |     readLines(textConnection(trimws(info)))
145 | }
146 | 
147 | # Return unparsed directory listing as character vector
148 | .ftpDirectoryInfo <- function(someUrl, filesOnly=FALSE) {
149 |     curlHandle <- getCurlHandle(customrequest="LIST -R")
150 |     info <- getURL(someUrl, curl=curlHandle)
151 |     .parseDirInfo(info)
152 | }
153 | 
154 | ## Return just the names of the files in an FTP directory
155 | ## Note, this will not do any cleaning of symlinks
156 | .listRemoteFiles <- function(someUrl){
157 |     curlHandle <- getCurlHandle(dirlistonly=TRUE)
158 |     info <- getURL(someUrl, curl=curlHandle)
159 |     .parseDirInfo(info)
160 | }
161 | 


--------------------------------------------------------------------------------
/R/makeUCSCChain.R:
--------------------------------------------------------------------------------
  1 | .ucscBase <- "http://hgdownload.cse.ucsc.edu/"
  2 | 
  3 | .getchainFiles <- function(url, fileName=NA_character_, verbose=TRUE) {
  4 |     result <- .httrRead(url, extension=fileName, getmd5sum=TRUE)
  5 |     if(length(result)) {
  6 |         files <-  paste0(url, "/", result$files)
  7 |         df <- .httrFileInfo(files, verbose=TRUE)
  8 |         if(identical(names(result), c("files","md5sum")))
  9 |             cbind(df, md5sum=result$md5sum, stringsAsFactors=FALSE)
 10 |     } else 
 11 |         data.frame(fileurl=NA_character_, date=NA, 
 12 |             size=NA, md5sum=NA_character_, stringsAsFactors=FALSE)
 13 | }
 14 | 
 15 | ## FIXME: eutils file (and interface?) has moved to 
 16 | ##        "https://www.ncbi.nlm.nih.gov/books/NBK25501/"
 17 | .organismToTaxid <- function(organism=character()) {
 18 |     ## query NCBI for taxonomy ID
 19 |     .eutils <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils"
 20 |     
 21 |     ## 1. ids
 22 |     uorganism <- unique(organism[!is.na(organism)])
 23 |     query <- paste(uorganism, collapse=" OR ")
 24 |     url <- sprintf("%s/esearch.fcgi?db=taxonomy&term=%s&retmax=%d",
 25 |         .eutils, query, length(uorganism))
 26 |     xml <- XML::xmlParse(url)
 27 |     
 28 |     ## 2. records
 29 |     id <- as.character(sapply(xml["//Id/text()"], xmlValue))
 30 |     scin <- taxid <- character()
 31 |     if (length(id)) {
 32 |         query2 <- paste(id, collapse=",")
 33 |         url <- sprintf("%s/efetch.fcgi?db=taxonomy&id=%s&retmax=%d",
 34 |             .eutils, query2, length(uorganism))
 35 |         xml <- XML::xmlParse(url)
 36 |         scin <- sapply(xml["/TaxaSet/Taxon/ScientificName"], xmlValue)
 37 |         taxid <- sapply(xml["/TaxaSet/Taxon/TaxId/text()"], xmlValue)
 38 |     }
 39 |     
 40 |     
 41 |     scin[which(scin %in% "Pongo abelii")] <- "Pongo pygmaeus abelii"
 42 |     scin[which(scin %in% "Xenopus (Silurana) tropicalis")]="Xenopus tropicalis"
 43 |     scin[which(scin %in% "Ictidomys tridecemlineatus")]="Spermophilus tridecemlineatus"
 44 |     
 45 |     ## there are 3 special cases: WE provide query, ncbi returns scin
 46 |     #a) query ="Pongo pygmaeus abelii", scin="Pongo abelii", taxid="9601"
 47 |     #b) query ="Xenopus tropicalis", scin="Xenopus (Silurana) tropicalis", taxid="8364"
 48 |     #c) query ="Spermophilus tridecemlineatus", scin="Ictidomys tridecemlineatus", taxid="43179"
 49 |         
 50 |     ## 3. Results
 51 |     as.integer(taxid[match(organism, scin)])
 52 | }
 53 | 
 54 | .getUCSCResources <- 
 55 |     function(fileType, dirName, fileName, verbose=FALSE, justRunUnitTest=FALSE)
 56 | {
 57 |     ## get resource from UCSC
 58 |     .fileBase <- sprintf("%sgoldenPath", .ucscBase)
 59 |     genome_tbl <- rtracklayer::ucscGenomes(organism=TRUE)
 60 |     genomes <- genome_tbl$db
 61 |     ## remove faulty genome. 
 62 |     rm <- c("cb1", "eboVir3", "dp2", "strPur1", "ci1", "calMil1","monDom1", 
 63 |             "balAcu1" ,"musFur1")
 64 |     genomes <- setdiff(genomes, rm)
 65 |     
 66 |     urls <- sprintf("%s/%s/%s", .fileBase, genomes, dirName)
 67 |     
 68 |     if(justRunUnitTest)
 69 |        	urls <- tail(urls, n=2)
 70 |        
 71 |     rsrc <- do.call(rbind, lapply(urls, .getchainFiles, 
 72 |         fileName=fileName, verbose=verbose))
 73 |     rsrc <- rsrc[complete.cases(rsrc),]
 74 |     title <- basename(rsrc$fileurl)
 75 | 
 76 |     ## parse the filename for each file type.
 77 |     switch(fileType, chain={
 78 |         rsrc$from <- sub("^([[:alnum:]]+)To[A-Z].*", "\\1", title)
 79 |         rsrc$to <- sub(".*To([A-Z])([[:alnum:]]+).*", "\\L\\1\\E\\2",
 80 |                        title, perl=TRUE)
 81 |     }, "2bit"={
 82 |         rsrc$from <- sub(".2bit","", title)
 83 |     }, {
 84 |         stop("unknown fileType ", sQuote(fileType))
 85 |     })
 86 |     
 87 |     ## add the organism 
 88 |     idx <- match(rsrc$from, genome_tbl$db)
 89 |     rsrc$organism <- rep(NA_character_, length(idx))
 90 |     rsrc$organism[!is.na(idx)] <- genome_tbl[idx[!is.na(idx)], "organism"]
 91 |     
 92 |     ## add the taxonmy Id. 
 93 |     rsrc$taxid <- rep(NA_character_, length(idx))
 94 |     rsrc$taxid[!is.na(idx)] <- .organismToTaxid(rsrc$organism[!is.na(idx)])
 95 |     
 96 |     rsrc
 97 | }
 98 | 
 99 | makeUCSCChain <- function(currentMetadata, justRunUnitTest=FALSE, 
100 |                           BiocVersion=BiocManager::version()) {
101 |     rsrc <- .getUCSCResources(fileType="chain", dirName="liftOver", 
102 |         fileName="chain.gz", verbose=TRUE, justRunUnitTest)
103 |     
104 |     ## input_sources table
105 |     sourceSize <- as.numeric(rsrc$size)
106 |     sourceUrls <- rsrc$fileurl
107 |     sourceVersion <- gsub(" ", "_", rsrc$date) 
108 |     sourceLastModifiedDate <- rsrc$date
109 |     
110 |     ## resources table
111 |     species <- rsrc$organism   
112 |     genome <- rsrc$from
113 |     taxonomyId <- as.integer(rsrc$taxid)           
114 |     title <- basename(rsrc$fileurl) 
115 |     description <- sprintf("UCSC liftOver chain file from %s to %s",
116 |                            rsrc$from, rsrc$to)
117 |     rdatapaths <-gsub(.ucscBase, "",sourceUrls)
118 |     md5sum <- rsrc$md5sum
119 |     
120 |     Map(AnnotationHubMetadata,
121 |         
122 |         SourceSize=sourceSize,
123 |         SourceUrl=sourceUrls,
124 |         SourceVersion=sourceVersion,
125 |         SourceLastModifiedDate = sourceLastModifiedDate,
126 |         SourceMd5 =md5sum, 
127 |         
128 |         Description=description,
129 |         Title=title,
130 |         Genome=genome,
131 |         Species=species, 
132 |         TaxonomyId=taxonomyId,
133 |         
134 |         RDataPath= rdatapaths,
135 |         
136 |         MoreArgs=list(
137 |             BiocVersion=BiocVersion,
138 |             # input sources 
139 |             SourceType= "Chain",
140 |             
141 |             # resources
142 |             DataProvider = "UCSC",
143 |             Maintainer =  "Bioconductor Maintainer <maintainer@bioconductor.org>",         
144 |             Coordinate_1_based = FALSE,
145 |             Location_Prefix = .ucscBase,
146 |             RDataDateAdded = Sys.time(),
147 |                         
148 |             #rdata table
149 |             DispatchClass= "ChainFile" ,
150 |             RDataClass = "GRanges",
151 |             
152 |             Recipe = NA_character_, 
153 |             Tags = c("liftOver", "chain", "UCSC", "genome", "homology")))
154 | }
155 | 
156 | makeAnnotationHubResource("UCSCChainPreparer", makeUCSCChain)
157 | 


--------------------------------------------------------------------------------
/R/ahmToJson.R:
--------------------------------------------------------------------------------
  1 | ## Code for creating json records from the sqlite DB.
  2 | 
  3 | ## RIGHT NOW the json looks sort of record centric.  So I need to make
  4 | ## code that takes AHMs and makes them into JSON.
  5 | 
  6 | 
  7 | ## So 1st I need an exemplar AHM (will save one for now in inst/extdata
  8 | 
  9 | ## So for testing: 
 10 | ## load(system.file('extdata','inpDrosPsuedo.rda', package='AnnotationHubData'))
 11 | ## ahm
 12 | 
 13 | 
 14 | ## helper to do cleanup and make sure things are present:
 15 | cleanupLst <- function(lst){
 16 |     if(is.na(lst[["recipe"]])){ return(lst) }
 17 |     
 18 |     if(length(lst[["recipe"]])==1){
 19 |         lst[["recipe"]][[2]] <- "AnnotationHubData"
 20 |     }
 21 |     ## Unfortunately, I have no recipe args (so I can't fix that field)
 22 |     ## But I DO have this translation file Dan made me...
 23 |     ## looks like there are some issues with the data..
 24 |     if(lst[["recipe"]][1]=='extendedBedToGRanges'){
 25 |         file <- system.file('extdata','titlesToRecipes.txt',
 26 |                             package='AnnotationHubData')
 27 |         trns <- read.delim(file, header=FALSE, stringsAsFactors=FALSE)
 28 |         idx <- trns[[2]] %in% lst[["sourceurl"]]
 29 |         value <- trns[idx,][[1]]
 30 |         if(length(value)==1){
 31 |             lst[["recipe"]][1] <- value
 32 |         }else{
 33 |            warning("no matching value for recipe called 'extendedBedToGRanges'")
 34 |         }
 35 |     }
 36 |     lst
 37 | }
 38 | 
 39 | 
 40 | 
 41 | ## Dan suggests jsonlite
 42 | ahmToJson <- function(ahm){
 43 |     lst <- metadata(ahm)    
 44 |     
 45 |     ## casting on elements that toJSON can't handle
 46 |     lst[['BiocVersion']] <- as.character(lst[['BiocVersion']])
 47 |     lst[['SourceLastModifiedDate']] <- as.character(lst[['SourceLastModifiedDate']])
 48 |     ## lower case all the names
 49 |     names(lst) <- tolower(names(lst))
 50 |     
 51 |     ##TEMP cleanup the ahm (in future we want to stop using this!)
 52 |     lst <- cleanupLst(lst)
 53 |     
 54 |     rdatapaths <- Map(list,
 55 |                       rdatapath=lst[['rdatapath']],
 56 |                       rdataclass=lst[['rdataclass']],
 57 |                       dispatchclass=lst[['dispatchclass']] 
 58 |                       )
 59 |     ## using Map puts unwanted labels on things...
 60 |     names(rdatapaths) <- NULL 
 61 | 
 62 |     input_sources <- Map(list,
 63 |                          sourcesize=lst[['sourcesize']],
 64 |                          sourceurl=lst[['sourceurl']],
 65 |                          sourcetype=lst[['sourcetype']],
 66 |                          sourceversion=lst[['sourceversion']],
 67 |                          sourcemd5=lst[['sourcemd5']],
 68 |                          sourcelastmodifieddate=lst[['sourcelastmodifieddate']]
 69 |                          )
 70 |     ## using Map puts unwanted labels on things...
 71 |     names(input_sources) <- NULL 
 72 |     
 73 |     ## TODO: I need to have Map make lists but not have them be named horribly.
 74 |     ## So multiplexed like Map on rdatapaths below, but with result
 75 |     ## that looks like input_sources
 76 |     
 77 |     ## Now just need to re-arrange things a bit
 78 |     base <- list(title=lst[['title']],
 79 |                  dataprovider=lst[['dataprovider']],
 80 |                  species=lst[['species']],
 81 |                  taxonomyid=as.integer(lst[['taxonomyid']]),
 82 |                  genome=lst[['genome']],
 83 |                  description=lst[['description']],
 84 |                  coordinate_1_based=lst[['coordinate_1_based']],
 85 |                  maintainer=lst[['maintainer']],
 86 |                  rdataversion=lst[['rdataversion']],
 87 |                  rdatadateadded=lst[['rdatadateadded']],
 88 |                  ## FIXME - Old AHMs may not have Location_Prefix filled in!
 89 |                  ## It should be http://s3.amazonaws.com/annotationhub/ or
 90 |                  ## https://bioconductorhubs.blob.core.windows.net/annotationhub
 91 |                  ## by default, for chain files it should be:
 92 |                  ## http://hgdownload.cse.ucsc.edu/
 93 |                  location_prefix=lst[['location_prefix']],
 94 |                  recipe=lst[['recipe']][1],
 95 |                  recipe_package=ifelse(!is.na(lst[["recipe"]]),
 96 |                    lst[['recipe']][2], lst[['recipe']][1] ),
 97 |                  rdatapaths=rdatapaths,                 
 98 |                  input_sources=input_sources,
 99 |                  tags=lst[['tags']],
100 |                  biocversions=lst[['biocversion']],
101 |                  preparerclass=lst[['preparerclass']]
102 |                  )
103 |     
104 |     ## then make JSON
105 |     paste0(toJSON(base, auto_unbox=TRUE,na='null', pretty=TRUE), "\n")
106 |     ## STILL: some issues here with no boxing where we want it (around
107 |     ## sub-sets like 'versions'
108 |     ## AND: some name-mangling in the tags...
109 |     
110 | }
111 | 
112 | 
113 | 
114 | 
115 | ## Testing
116 | ## numExtends <- unlist(lapply(resources, function(x){x@Recipe[1]=='extendedBedToGRanges'}))
117 | 
118 | 
119 | ## NOTES from 4/21/14
120 | ## check on rdatasize and sourcesize (should not be NA?) - I think
121 | ## they are NA though- but double check this. - DONE
122 | ## values that are NA in the JSON should be set to null - DONE
123 | ## use ALL of the biocversions - DONE
124 | ## add sourceMd5, derivedMD5, sourceLastModifiedDate to the json
125 | ## (soon) - manually add these to the AHMs? - DONE
126 | 
127 | ## changes to the process for making Annotations:
128 | ## Export makeAnnotationHubResource (so it can be used externally in
129 | ## other packages) - DONE
130 | ## Allow currentMetadata to be passed in to the helper functions (add
131 | ## this to
132 | ## .generalNewResources::makeAnnotationHubMetadataFunction(currentMEtadata,...)
133 | ## - DONE
134 | ## Recipes should use require() to minimize dependencies for
135 | ## annotations and suggests for things that are only needed by
136 | ## specific recipes. Or they could maybe just get away with importing.
137 | ## recipes and AHM generator should not have to define an AHMRoot
138 | ## (since this is alway put in after the fact. - Just use a default
139 | ## value for this. 
140 | 
141 | 
142 | ## modernize all of the recipes so that they use the new system (the
143 | ## new simplified system).
144 | 
145 | 
146 | ## And actually we now need to also stop defining the AHMRoot this in
147 | ## the recipes.  (it is no longer necessary)
148 | 
149 | 
150 | ## Make sure that we can put a recipe into another package. - untested.
151 | 
152 | 
153 | ## look into the weird requirement for adding importPreparer subclasses to
154 | ## the NAMESPACE. - can we make this go away?
155 | 
156 | 
157 | ## Document makeAnnotationHubResource
158 | 
159 | 
160 | ## Fix the unit tests
161 | 


--------------------------------------------------------------------------------
/R/makedbSNPVCF.R:
--------------------------------------------------------------------------------
  1 | ### link to data description:
  2 | ### http://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf/
  3 | 
  4 | ### ----------------------------------------------------------------------
  5 | ### March 2016
  6 | 
  7 | ### Files in AH that point to ftp.ncbi.nih.gov/snp/organisms/*
  8 | ### are no longer available:
  9 | 
 10 | ### We have 38 of them:
 11 | 
 12 | ### > length(query(hub, c("dbsnp", "vcf")))
 13 | ### [1] 38
 14 | ### 
 15 | ### 
 16 | ### Of the 38 full urls seen with query(hub, c("dbsnp", "vcf"))$sourceurl,
 17 | ### there are 5 unique base directories:
 18 | ### 
 19 | ### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/
 20 | ### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b141_GRCh37p13/
 21 | ### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b142_GRCh37p13/
 22 | ### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b142_GRCh38/
 23 | ### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b141_GRCh38/
 24 | ### 
 25 | ### On the web site (ftp://ftp.ncbi.nih.gov/snp/organisms/) there 7 base
 26 | ### directories. b141 is no longer there and b144, b146 have been added.
 27 | 
 28 | ### The recipe has been updated to look in 
 29 | ###    ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/
 30 | ###    ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/
 31 | ### ----------------------------------------------------------------------
 32 | 
 33 | .dbSNPBaseUrl <-"ftp://ftp.ncbi.nlm.nih.gov/"
 34 | 
 35 | ## Chosing files from archive so all files have a date stamp.
 36 | ## It looks like files with no stamp in the current
 37 | ## directory are either 'in progress' or 'subject to change'.
 38 | .getdbSNP <- function(justRunUnitTest) {
 39 |     baseUrl <- paste0(.dbSNPBaseUrl, "pub/clinvar/")
 40 |     paths <- c(GRCh37="vcf_GRCh37/archive_1.0/2016/", 
 41 |                GRCh38="vcf_GRCh38/archive_1.0/2016/")
 42 |     files <- c("clinvar_20160203", "clinvar_20160203_papu", 
 43 |                "common_and_clinical_20160203", 
 44 |                "common_no_known_medical_impact_20160203")
 45 |     urls <- setNames(paste0(baseUrl, paths), names(paths))
 46 | 
 47 |     if (justRunUnitTest)
 48 |         urls <- urls[1]
 49 |   
 50 |     genome <- rep(names(urls), each=length(files))
 51 |     df <- .ftpFileInfo(url=urls, extension=paste0(files, ".vcf.gz"))
 52 |     df$genome <- gsub("GRCh37clinical", "GRCh37", genome)
 53 |     df <- cbind(df, title=basename(df$fileurl), stringsAsFactors = FALSE)
 54 |     rownames(df) <- NULL
 55 |  
 56 |     map <- c(
 57 |         `All` = .expandLine("VCF of all variations that meet the criteria
 58 |         to be in a  VCF file.  This file is created once per dbSNP build."),
 59 |         `All_papu` = .expandLine("VCF of all variations found in the 
 60 |         psuedoautosomal region (PAR), alternate loci, patch sequences and 
 61 |         unlocalized or unplaced contigs(papu)"),
 62 |         `common_all` = .expandLine("VCF of all variations that are polymorphic
 63 |         in a least one population the 1000 Genomes project or any of the 
 64 |         following handles: 1000GENOMES, CSHL-HAPMAP, EGP_SNPS NHLBI-ESP,
 65 |         PGA-UW-FHCRC. A variation is polymorphic if the minor allele 
 66 |         frequency is at least 0.01 and the minor allele is present in 
 67 |         at least two samples."),
 68 |         `clinvar` = .expandLine("VCF of variations from clinvar where 'YYYYMMDD' 
 69 |         represents the date the file was created. This file is created 
 70 |         weekly."), 
 71 |         `common_and_clinical` = .expandLine("Variations from common_all.vcf.gz
 72 |         that are clinical.  A clinical variation is one the appears in 
 73 |         clinvar_YYYYMMDD.vcf.gz with at least one of the following clinical
 74 |         significance codes: 4 - probable-pathogenic, 5 - pathogenic, 
 75 |         6 - drug-response, 7 - histocompatibility, 255 - other, 
 76 |         This file is created weekly."),
 77 |         `common_no_known_medical_impact` = .expandLine("Variations from 
 78 |         common_all.vcf.gz that do not meet the clinical criteria described 
 79 |         above.  This file is created weekly."))
 80 |  
 81 |     description <- character(length(title))
 82 |     for (i in seq_along(map))
 83 |         description[grep(names(map)[i], df$title)] <- map[[i]]
 84 | 
 85 |     cbind(df, description, stringsAsFactors = FALSE)
 86 | }
 87 | 
 88 | makedbSNPVCF <- function(currentMetadata, justRunUnitTest=TRUE,
 89 |                          BiocVersion=BiocManager::version()) {
 90 |     rsrc <- .getdbSNP(justRunUnitTest)
 91 | 
 92 |     ## input_sources table
 93 |     sourceSize <- as.numeric(rsrc$size)
 94 |     sourceUrls <- rsrc$fileurl
 95 |     sourceVersion <- gsub(" ", "_", rsrc$date) 
 96 |     sourceLastModifiedDate <- rsrc$date
 97 |  
 98 |     ## resources table
 99 |     title <- rsrc$title
100 |     description <- rsrc$description
101 |     genome <- rsrc$genome
102 |  
103 |     ## rdatapath should have 2 entries -for the VCF and its TabixFile
104 |     rdatapath <- sub(.dbSNPBaseUrl, "", rsrc$fileurl)
105 |     rdps <- rep(rdatapath, each=2) 
106 |     rdatapaths <- split(rdps, f=as.factor(rep(seq_along(rdatapath),each=2)))
107 |     rdatapaths <- lapply(rdatapaths,
108 |                          function(x){x[2] <- paste0(x[2],".tbi") ; return(x)}) 
109 |  
110 |     tags <- lapply(genome, 
111 |         function(tag) c("dbSNP", tag, "VCF")
112 |     )
113 |  
114 |     Map(AnnotationHubMetadata,
115 |         SourceSize=sourceSize,
116 |         SourceUrl=sourceUrls,
117 |         SourceVersion=sourceVersion,
118 |         SourceLastModifiedDate=sourceLastModifiedDate,
119 |  
120 |         Description=description,
121 |         Title=title,
122 |         Genome=genome,
123 |         Tags=tags,
124 |         RDataPath=rdatapaths,
125 |  
126 |         MoreArgs=list(
127 |             BiocVersion=BiocVersion,
128 |             # input sources 
129 |             SourceType= "VCF",
130 |  
131 |             # resources
132 |             Species="Homo sapiens",
133 |             TaxonomyId=9606L, 
134 |             DataProvider = "dbSNP",
135 |             Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
136 |             Coordinate_1_based = FALSE,
137 |             Location_Prefix = .dbSNPBaseUrl,
138 |             RDataDateAdded = Sys.time(),
139 | 
140 |             #rdata table
141 |             DispatchClass= "dbSNPVCFFile" ,
142 |             RDataClass = c("VcfFile", "VcfFile"),
143 | 
144 |             Recipe = "AnnotationHubData:::ncbi_dbSNPVCFFile"))
145 | }
146 | 
147 | 
148 | ## recipe
149 | ncbi_dbSNPVCFFile <- function(ahm)
150 | {
151 |     ## The tbi file exists online, just download it.
152 |     faIn <- normalizePath(inputFiles(ahm))  # file on ftp site
153 |     faOut1 <- normalizePath(outputFile(ahm))[1] # vcf.gz file on localDir
154 |     faOut2 <- outputFile(ahm)[2]  # vcf.gz.tbi file on localDir
155 |  
156 |     if(!file.exists(faOut2)) {
157 |         tbiFile <- paste0(metadata(ahm)$Location_Prefix, 
158 |                           metadata(ahm)$RDataPath[2]) 
159 |         tbi <- download.file(tbiFile, faOut2)
160 |     }
161 |     faOut2
162 | }
163 | 
164 | makeAnnotationHubResource("dbSNPVCFPreparer", makedbSNPVCF, quiet=TRUE)
165 | 
166 | 


--------------------------------------------------------------------------------
/R/makeEncodeDCC.R:
--------------------------------------------------------------------------------
  1 | # This is a new recipe for EncodeImportPreparer-class.R 
  2 | .ucscBase <- "http://hgdownload.cse.ucsc.edu/"
  3 | 
  4 | .getTags <- function(url) {
  5 |     tagurl <- paste0(url, "files.txt")
  6 |     html <- httpGET(tagurl)
  7 |     
  8 |     html <- unlist(strsplit(html, "\n")) # split to get tags for each file
  9 |     lapply(html, function(t) {
 10 |         ta <- unlist(strsplit(t, "\t"))
 11 |         temp <- unlist(strsplit(ta[2],";"))
 12 |         temp <- trimws(temp)
 13 |         
 14 |         ## extract the md5sum if present
 15 |         md <- grep("md5sum=", temp, value=TRUE)
 16 |         md <- ifelse(length(md), gsub(".*=","", md), NA_character_)    
 17 |         
 18 |         ## change "cell=8988T" to "8988T cell"
 19 |         n <- grep("cell=", temp, value=TRUE)
 20 |         n <- ifelse(length(n)!=0, paste0(gsub(".*=", "", n)," cell"), 
 21 |                     NA_character_)
 22 | 
 23 |         ## change "grant=Gingeras" to "Gingeras grant"
 24 |         g <- grep("grant=", temp, value=TRUE)
 25 |         g <- ifelse(length(g)!=0, paste0(gsub(".*=", "", g)," grant"), 
 26 |                     NA_character_)
 27 |         
 28 |         dv <- grep("dataVersion=", temp, value=TRUE)
 29 |         dv <- ifelse(length(dv)!=0, gsub(".*=", "", dv), NA_character_)
 30 |         
 31 |         ## get only important fields
 32 |         toMatch <- "dataType|lorigAssembly|type"
 33 |         temp <- temp[grepl(toMatch, temp)]
 34 |         
 35 |         ## remove everything before "="
 36 |         temp <- gsub(".*=","", temp)
 37 |         
 38 |         ## add
 39 |         if(!is.na(n))
 40 |             temp <- c(temp, n)
 41 |         
 42 |         if(!is.na(g))
 43 |             temp <- c(temp, g)
 44 |     
 45 |         temp <- c("wgEncode", temp)
 46 |         temp <- temp[!grepl("None",temp)]
 47 |         
 48 |         list(tags=paste0(temp, collapse=", "), md5sum = md, 
 49 |              sourceVersion=dv) 
 50 |    })
 51 | }
 52 | 
 53 | .cleanFiles <- function(url, isSubDir=FALSE) {
 54 |     fls <- .httrRead(url)$files
 55 |         
 56 |     if(length(fls) != 0) {
 57 |         if(isSubDir){
 58 |             
 59 |             result <- .getTags(url)
 60 |             tags <- sapply(result, "[[", "tags")
 61 |             sourcemd5sum <- vapply(result, "[[",character(1),  "md5sum")
 62 |             sourceVersion <- vapply(result, "[[", "", "sourceVersion") 
 63 |             
 64 |             subst <- switch( basename(url),
 65 |                 wgEncodeAwgTfbsUniform="wgEncodeAwgTfbs",
 66 |                 wgEncodeAwgDnaseUniform="wgEncodeAwgDnase",
 67 |                 wgEncodeGencodeV4="wgEncodeGencode",
 68 |                 basename(url))                  
 69 |             
 70 |             fls <- fls[grepl(subst,fls)]
 71 |             fls <- fls[!grepl("files.txt", fls)]
 72 |             if(length(tags)!=0)
 73 |                 fls <- list(filename=fls, tags=tags, sourcemd5sum=sourcemd5sum,
 74 |                             sourceVersion=sourceVersion)
 75 | 	    }
 76 |     }    
 77 |     fls
 78 | }
 79 | 
 80 | .subDir <- function(url, verbose=TRUE) {
 81 |     contents <- .cleanFiles(url, isSubDir=TRUE)
 82 |     supported.formats <- c("narrowPeak", "broadPeak", "bedRnaElements", 
 83 |                            "gtf")
 84 |     tags <- contents$tags
 85 |     sourcemd5sum <- contents$sourcemd5sum
 86 |     files <- contents$filename
 87 |     sourceVersion <- contents$sourceVersion
 88 |     
 89 |     type <- sapply(strsplit(files, ".", fixed = TRUE), "[[", 2)
 90 |     idx <- type %in% supported.formats
 91 |     files <- files[idx]
 92 |     tags <- tags[idx]
 93 |     sourcemd5sum <- sourcemd5sum[idx]
 94 |     type <- type[idx]
 95 |     sourceVersion <- sourceVersion[idx]
 96 |  
 97 | 
 98 |     if(length(files)!=0) {
 99 |         files <-  sprintf("%s%s", url, files)
100 |          if(length(files)>5){
101 |              files<- files[1:5]
102 |              tags<- tags[1:5]
103 |              sourcemd5sum <- sourcemd5sum[1:5]
104 |              type <- type[1:5]
105 |              sourceVersion <- sourceVersion[1:5]
106 |          }
107 |             
108 |         df <- .httrFileInfo(files, verbose)
109 |         
110 |         cbind(df, type, tags, sourcemd5sum, sourceVersion, 
111 |               stringsAsFactors=FALSE)
112 |     } else 
113 |         data.frame(fileurl=character(), date=character(), size=numeric(),
114 |                    type= character(), stringsAsFactors=FALSE)
115 | }
116 | 
117 | .encodeFiles <- function(justRunUnitTest=FALSE) {
118 |     encode_url <- paste0(.ucscBase, "goldenpath/hg19/encodeDCC/")
119 |     subdirs <- .cleanFiles(encode_url, isSubDir=FALSE)
120 |     urls <- setNames(paste0(encode_url, subdirs), subdirs)
121 |     
122 |     if(justRunUnitTest)
123 |         urls <- urls[c(1,2)]
124 |     
125 |     do.call(rbind, Map(.subDir, urls, verbose=TRUE))
126 | }
127 | 
128 | makeEncodeImporter <- function(currentMetadata, justRunUnitTest=FALSE,
129 |                                BiocVersion=BiocManager::version()) {
130 |     rsrc <- .encodeFiles(justRunUnitTest)
131 |     
132 |     ## input_sources table
133 |     sourceSize <- as.numeric(rsrc$size)
134 |     sourceUrls <- rsrc$fileurl
135 |     sourceVersion <- rsrc$sourceVersion # should be character
136 |     SourceLastModifiedDate <- rsrc$date  # should be "POSIXct" "POSIXt"
137 |     sourceType <- sapply(rsrc$type, function(x) 
138 |         switch(x, 
139 |                broadPeak="BED", 
140 |                narrowPeak="BED",
141 |                gtf="GTF", 
142 |                bedRnaElements="BED"), 
143 |         USE.NAMES =FALSE)
144 |     
145 |     dispatchclass <- sapply(rsrc$type, function(x)
146 |        switch(x,
147 |                broadPeak="UCSCBroadPeak",
148 |                narrowPeak="UCSCNarrowPeak",
149 |                gtf="GTFFile",
150 |                bedRnaElements="UCSCBEDRnaElements"),
151 |        USE.NAMES =FALSE)
152 | 
153 |     
154 |     ## resources table
155 |     title <- basename(rsrc$fileurl)
156 |     description <- rsrc$description
157 |     sourceMd5sum <- rsrc$sourcemd5sum
158 |     
159 |     rdatapath <- gsub(.ucscBase, "", sourceUrls) 
160 |     
161 |     tags <- strsplit(rsrc$tags, ", ")
162 |     
163 |     Map(AnnotationHubMetadata,
164 |         
165 |         SourceSize=sourceSize,
166 |         SourceUrl=sourceUrls,
167 |         SourceVersion=sourceVersion,
168 |         SourceLastModifiedDate = SourceLastModifiedDate,
169 |         SourceType = sourceType,
170 | 	        
171 |         Description= paste0(rsrc$type, " file from ENCODE"), 
172 |         Title=title, 
173 |         
174 |         RDataPath=rdatapath,
175 |         DispatchClass = dispatchclass,  
176 |       
177 |         Tags=tags,
178 |         
179 |         MoreArgs=list(
180 |             BiocVersion=BiocVersion,
181 |             # resources
182 |             DataProvider = "UCSC",
183 |             Species="Homo sapiens",
184 |             TaxonomyId=9606L,
185 |             Genome= "hg19",
186 |             Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",            
187 |             Coordinate_1_based = FALSE,
188 |             ##status_id =2L, 
189 |             Location_Prefix = .ucscBase,
190 |             RDataDateAdded = Sys.time(),
191 |             ##PreparerClass = "EncodeImportPreparer",
192 |             
193 |             #rdata table
194 |             RDataClass = "GRanges",
195 |             
196 |             Recipe = NA_character_))
197 | }
198 | 
199 | makeAnnotationHubResource("EncodeImportPreparer", makeEncodeImporter)
200 | 
201 | 


--------------------------------------------------------------------------------
/man/updateResources.Rd:
--------------------------------------------------------------------------------
  1 | \name{updateResources}
  2 | \alias{updateResources}
  3 | 
  4 | \alias{runRecipes}
  5 | \alias{runRecipes,AnnotationHubMetadata-method}
  6 | \alias{pushResources}
  7 | \alias{pushMetadata}
  8 | 
  9 | 
 10 | \title{updateResources}
 11 | 
 12 | \description{
 13 |   Add new resources to AnnotationHub
 14 | }
 15 | 
 16 | \usage{
 17 | updateResources(AnnotationHubRoot, BiocVersion = BiocManager::version(),
 18 |                 preparerClasses = getImportPreparerClasses(),
 19 |                 metadataOnly = TRUE, insert = FALSE,
 20 |                 justRunUnitTest = FALSE, ...)
 21 | 
 22 | pushResources(allAhms, uploadToRemote = TRUE, download = TRUE)
 23 | 
 24 | pushMetadata(allAhms, url)
 25 | }
 26 | 
 27 | \arguments{
 28 |   \item{AnnotationHubRoot}{
 29 |     Local path where files will be downloaded.
 30 |   }
 31 |   \item{BiocVersion}{
 32 |     A \code{character(1)} Bioconductor version. The resource will be available
 33 |     in Bioconductor >= to this version. Default value is the current version,
 34 |     specified with BiocManager::version().
 35 |   }
 36 |   \item{preparerClasses}{
 37 |     One of the \code{ImportPreparer} subclasses defined in
 38 |     \code{getImportPreparer()}. This class is used for dispatch during
 39 |     data discovery.
 40 |   }
 41 |   \item{metadataOnly}{
 42 |     A \code{logical} to specify the processing of metadata only or both
 43 |     metadata and data files.
 44 | 
 45 |     When FALSE, metadata are generated and data files are downloaded,
 46 |     processed and pushed to their final location in S3 buckets.
 47 |     \code{metadata = TRUE} produces only metadata and is useful for
 48 |     testing.
 49 |   }
 50 |   \item{insert}{
 51 |     NOTE: This option is for inserting metadata records in the
 52 |     production data base (done by Bioconductor core team member) and
 53 |     is for internal use only.
 54 | 
 55 |     A \code{logical} to control if metadata are inserted in the AnnotationHub
 56 |     db. By default this option is FALSE which is a useful state in which
 57 |     to test a new recipe and confirm the metadata fields are correct.
 58 | 
 59 |     When \code{insert = TRUE}, the "AH_SERVER_POST_URL" global option must
 60 |     be set to the http location of the AnnotationHubServer in the global
 61 |     environment or .Rprofile. Additionally, azcopy command line tools
 62 |     must be installed on the local machine to push files to Azure buckets.
 63 |     See \link{upload_to_azure}.
 64 |   }
 65 |   \item{justRunUnitTest}{
 66 |     A \code{logical}. When TRUE, a small number of records (usually 5) are
 67 |     processed instead of all.
 68 |   }
 69 |   \item{allAhms}{
 70 |     List of \code{AnnotationHubMetadata} objects.
 71 |   }
 72 |   \item{url}{
 73 |     URL of AnnotationHub database where metadata will be inserted.
 74 |   }
 75 |   \item{uploadToRemote}{
 76 |     A \code{logical} indicating whether resources should be uploaded
 77 |     to remote bioconductor default location. Currently Azure Data Lakes.
 78 |   }
 79 |   \item{download}{
 80 |     A \code{logical} indicating whether resources should be downloaded from
 81 |     resource url.
 82 |   }
 83 |   \item{\dots}{
 84 |     Arguments passed to other methods such as \code{regex}, \code{baseUrl},
 85 |     \code{baseDir}.
 86 |   }
 87 | }
 88 | 
 89 | \details{
 90 |    \itemize{
 91 |      \item updateResources:
 92 | 
 93 |            \code{updateResources} is responsible for creating metadata records
 94 |            and downloading, processing and pushing data files to their final
 95 |            resting place. The \item{preparerClasses} argument is used in method
 96 |            dispatch to determine which recipe is used.
 97 | 
 98 |            By manipulating the \code{metadataOnly}, \code{insert} and
 99 |            \code{justRunUnitTest} arguments one can flexibly test the metadata
100 |            for a small number of records with or without downloading and
101 |            processing the data files.
102 | 
103 | 
104 |       \item global options:
105 | 
106 |             When \code{insert = TRUE} the "AH_SERVER_POST_URL" option must be
107 |             set to the https location of the AnnotationHub db.
108 |   }
109 | }
110 | 
111 | \value{
112 |   A list of \code{AnnotationHubMetadata} objects.
113 | }
114 | 
115 | \author{Martin Morgan, Marc Carlson}
116 | 
117 | \seealso{
118 |   \itemize{
119 |     \item \link{AnnotationHubMetadata}
120 |     \item \link{upload_to_azure}
121 |   }
122 | }
123 | 
124 | \examples{
125 | 
126 | \dontrun{
127 | 
128 | ## -----------------------------------------------------------------------
129 | ## Inspect metadata:
130 | ## -----------------------------------------------------------------------
131 | ## A useful first step in testing a new recipe is to generate and
132 | ## inspect a small number of metadata records. The combination of
133 | ## 'metadataOnly=TRUE', 'insert=FALSE' and 'justRunUnitTest=TRUE'
134 | ## generates metadata for the first 5 records and does not download or
135 | ## process any data.
136 | 
137 | meta <- updateResources("/local/path",
138 |                         BiocVersion = "3.3",
139 |                         preparerClasses = "EnsemblFastaImportPreparer",
140 |                         metadataOnly = TRUE, insert = FALSE,
141 |                         justRunUnitTest = TRUE,
142 |                         release = "84")
143 | 
144 | INFO [2015-11-12 07:58:05] Preparer Class: EnsemblFastaImportPreparer
145 | Ailuropoda_melanoleuca.ailMel1.cdna.all.fa.gz
146 | Ailuropoda_melanoleuca.ailMel1.dna_rm.toplevel.fa.gz
147 | Ailuropoda_melanoleuca.ailMel1.dna_sm.toplevel.fa.gz
148 | Ailuropoda_melanoleuca.ailMel1.dna.toplevel.fa.gz
149 | Ailuropoda_melanoleuca.ailMel1.ncrna.fa.gz
150 | 
151 | ## The return value is a list of metadata for the first 5 records:
152 | 
153 | > names(meta)
154 | [1] "FASTA cDNA sequence for Ailuropoda melanoleuca"
155 | [2] "FASTA DNA sequence for Ailuropoda melanoleuca"
156 | [3] "FASTA DNA sequence for Ailuropoda melanoleuca"
157 | [4] "FASTA DNA sequence for Ailuropoda melanoleuca"
158 | [5] "FASTA ncRNA sequence for Ailuropoda melanoleuca"
159 | 
160 | 
161 | ## Each record is of class AnnotationHubMetadata:
162 | 
163 | > class(meta[[1]])
164 | [1] "AnnotationHubMetadata"
165 | attr(,"package")
166 | [1] "AnnotationHubData"
167 | 
168 | ## -----------------------------------------------------------------------
169 | ## Insert metadata in the db and process/push data files:
170 | ## -----------------------------------------------------------------------
171 | ## This next code chunk creates the metadata and downloads and processes
172 | ## the data (metadataOnly=FALSE). If all files are successfully pushed to
173 | ## to their final resting place, metadata records are inserted in the 
174 | ## AnnotationHub db (insert=TRUE). Metadata insertion is done by a 
175 | ## Bioconductor team member; contact maintainer@bioconductor.org for help.
176 | 
177 | meta <- updateResources("local/path",
178 |                         BiocVersion = "3.5",
179 |                         preparerClasses = "EnsemblFastaImportPreparer",
180 |                         metadataOnly = FALSE, insert = TRUE,
181 |                         justRunUnitTest = FALSE,
182 |                         regex = ".*release-81")
183 | 
184 | ## -----------------------------------------------------------------------
185 | ## Recovery helpers:
186 | ## -----------------------------------------------------------------------
187 | 
188 | ## pushResources() and pushMetadata() are both called from updateResources()
189 | ## but can be used solo for testing or completing a run that
190 | ## terminated unexpectedly.
191 | 
192 | ## Download, process and push to azure the last 2 files in 'meta':
193 | sub <- meta[length(meta) - 1:length(meta)]
194 | pushResources(sub)
195 | 
196 | ## Insert metadata in the AnotationHub db for the last 2 files in 'meta':
197 | 
198 | pushMetadata(sub, url = getOption("AH_SERVER_POST_URL"))
199 | }
200 | 
201 | }
202 | 
203 | \keyword{methods}
204 | 


--------------------------------------------------------------------------------
/R/makeEnsemblFasta.R:
--------------------------------------------------------------------------------
  1 | ### =========================================================================
  2 | ### makeEnsemblFastaAHM() and ensemblFastaToFaFile()
  3 | ### -------------------------------------------------------------------------
  4 | ###
  5 | 
  6 | ## Adjust this expression in order to save painful-reprocessing of older files.
  7 | ## .ensemblReleaseRegex <- ".*release-(69|7[[:digit:]]|8[[:digit:]])"
  8 | ## .ensemblReleaseRegex <- ".*release-(79|8[[:digit:]])"
  9 | ## for a speed run just do one set
 10 | ## .ensemblReleaseRegex <- ".*release-81"
 11 | 
 12 | ## list directories below url/dir satisfying regex
 13 | .ensemblDirUrl <-
 14 |     function(url, dir, regex)
 15 | {
 16 |     lst <- .listRemoteFiles(url)
 17 |     releases <- paste0(url, lst)
 18 |     paste(grep(regex, releases, value=TRUE), dir, sep="/")
 19 | }
 20 | 
 21 | ## NOTE: httr >= 1.2.0 doesn't support ftp last modified date and size
 22 | ## FIXME: This should be combined with .httrFileInfo() and .ftpFileInfo()
 23 | .ensemblMetadataFromUrl <- function(sourceUrl, twobit=FALSE, http=FALSE) {
 24 |     releaseRegex <- ".*(release-[[:digit:]]+).*"
 25 |     if (!twobit){
 26 |         title <- sub("\\.gz$", "", basename(sourceUrl))
 27 |     }else{
 28 |         title <- sub("\\.fa\\.gz$", ".2bit", basename(sourceUrl))
 29 |     }
 30 |     root <- setNames(rep(NA_character_, length(sourceUrl)), title)
 31 | 
 32 |     releaseNum <- sub("release-", "", sub(releaseRegex, "\\1", sourceUrl[1]))
 33 | 
 34 |     # as of release 96 a file is present with species index for mappings
 35 |     species_index <- GenomeInfoDb:::fetch_species_index_from_Ensembl_FTP(release=releaseNum)
 36 | 
 37 |     species <- vapply(strsplit(sourceUrl, '/'), function(x) x[[7]], character(1))
 38 |     genome <- vapply(species, FUN.VALUE=character(1), USE.NAMES=FALSE,
 39 |                      FUN=function(spc, tbl){
 40 |                          message(spc, "\n")
 41 |                          tbl[tbl$species == spc, "assembly"]
 42 |                      }, tbl=species_index)
 43 |     taxonomyId <- vapply(species, FUN.VALUE=integer(1), USE.NAMES=FALSE,
 44 |                      FUN=function(spc, tbl){
 45 |                          message(spc, "\n")
 46 |                          tbl[tbl$species == spc, "taxonomy_id"]
 47 |                      }, tbl=species_index)
 48 | 
 49 |     species <- sub("_", " ", species,fixed=TRUE)
 50 | 
 51 |     if (http) {
 52 |        ftpInfo <- .httrFileInfo(sourceUrl)
 53 |        sourceSize <- ftpInfo$size
 54 |        sourceLastModDate <- ftpInfo$date
 55 |     } else {
 56 |         sourceSize <- as.numeric(NA)
 57 |         sourceLastModDate <- as.POSIXct(NA)
 58 |     }
 59 | 
 60 |     list(annotationHubRoot = root, title=title, species = species,
 61 |          taxonomyId = as.integer(taxonomyId),
 62 |          genome = genome,
 63 |          sourceSize=sourceSize,
 64 |          sourceLastModifiedDate=sourceLastModDate,
 65 |          sourceVersion = sub(releaseRegex, "\\1", sourceUrl))
 66 | }
 67 | 
 68 | .ensemblFastaTypes <-
 69 |     c("cdna\\.all", "dna_rm\\.toplevel", "dna_sm\\.toplevel",
 70 |       "dna\\.toplevel", "ncrna", "pep\\.all")
 71 | 
 72 | ## get urls
 73 | .ensemblFastaSourceUrls <-
 74 |     function(baseUrl, baseDir, regex, baseTypes=.ensemblFastaTypes)
 75 | {
 76 |     want <- .ensemblDirUrl(baseUrl, baseDir, regex)
 77 | 
 78 |     .processUrl <- function(url) {
 79 |         listing <- .ftpDirectoryInfo(url)
 80 | 
 81 |         subdirIdx <- grepl(".*/.*:", listing)
 82 |         subdir <- sub("^.{2}(.*):$", "\\1", listing[subdirIdx])
 83 |         fileTypes <- paste(baseTypes, collapse="|")
 84 |         pat <- sprintf(".*(%s)\\.fa\\.gz$", fileTypes)
 85 | 
 86 |         fastaIdx <- grepl(pat, listing)
 87 |         fasta <- sub(".* ", "", listing[fastaIdx])
 88 | 
 89 |         ## match subdir w/ fasta
 90 |         subdir <- subdir[cumsum(subdirIdx)[fastaIdx]]
 91 | 
 92 |         ## Prefer "primary_assembly" to "toplevel" resources.
 93 |         organisms <- unique(sub("(.+?)\\..*", "\\1", fasta, perl=TRUE))
 94 |         keepIdxList <- sapply(organisms, function(x) {
 95 |             orgFiles <- fasta[grep(paste0("^", x, "\\."), fasta)]
 96 |             reBoth <- paste0("dna", c("_rm", "_sm", ""),
 97 |                 "\\.(primary_assembly|toplevel)\\.")
 98 |             toplevelIdx <-
 99 |                 sapply(reBoth, function(x) length(grep(x, orgFiles)) > 1)
100 |             reToplevel <- paste0("dna", c("_rm", "_sm", ""),
101 |                 "\\.toplevel\\.")[toplevelIdx]
102 | 
103 |             isRedundant <-
104 |                 sapply(reToplevel, function(x) grepl(x, orgFiles))
105 |             retVal <- rep(TRUE, length(orgFiles))
106 |             if (!is.null(dim(isRedundant))) {
107 |               retVal <- !apply(isRedundant, 1, any)
108 |             }
109 | 
110 |             retVal
111 |         })
112 |         keepIdx <- base::unlist(keepIdxList)
113 |         fasta <- fasta[keepIdx]
114 |         subdir <- subdir[keepIdx]
115 | 
116 |         sprintf("%s%s/%s", url, subdir, fasta)
117 |     }
118 |     res <- base::unlist(lapply(want, .processUrl), use.names=FALSE)
119 | 
120 |     if (length(res) == 0) {
121 |         txt <- sprintf("no fasta files at %s",
122 |                        paste(sQuote(want), collapse=", "))
123 |         stop(paste(strwrap(txt, exdent=2), collapse="\n"))
124 |     }
125 |     res
126 | }
127 | 
128 | ## metadata generator
129 | makeEnsemblFastaToAHM <-
130 |     function(currentMetadata, baseUrl = "ftp://ftp.ensembl.org/pub/",
131 |              baseDir = "fasta/", release,
132 |              justRunUnitTest = FALSE, BiocVersion = BiocManager::version())
133 | {
134 |     time1 <- Sys.time()
135 |     regex <- paste0(".*release-", release)
136 |     sourceUrl <- .ensemblFastaSourceUrls(baseUrl, baseDir, regex)
137 |     if (justRunUnitTest)
138 |         sourceUrl <- sourceUrl[1:5]
139 | 
140 |     sourceFile <- sub(baseUrl, "ensembl/", sourceUrl)
141 |     meta <- .ensemblMetadataFromUrl(sourceUrl)
142 |     dnaType <- local({
143 |         x <- basename(dirname(sourceFile))
144 |         sub("(dna|rna)", "\\U\\1", x, perl=TRUE)
145 |     })
146 |     description <- paste("FASTA", dnaType, "sequence for", meta$species)
147 | 
148 |     ## rdatapaths db table needs an extra row for the index file
149 |     rdataPath <- sub(".gz$", ".bgz", sourceFile)
150 |     rdps <- rep(rdataPath, each=3)
151 |     rdatapaths <- split(rdps, f=as.factor(rep(1:length(rdataPath),each=3)))
152 |     ## second record of each set becomes the '.fai' file
153 |     rdatapaths <- lapply(rdatapaths,
154 |                          function(x){x[2] <- paste0(x[2],".fai") ; x[2] <-
155 |                                          paste0(x[3],".gzi") ; return(x)})
156 | 
157 |     Map(AnnotationHubMetadata,
158 |         Description=description,
159 |         Genome=meta$genome,
160 |         RDataPath=rdatapaths,
161 |         SourceUrl=sourceUrl,
162 |         SourceVersion=meta$sourceVersion,
163 |         Species=meta$species,
164 |         TaxonomyId=meta$taxonomyId,
165 |         Title=meta$title,
166 |         SourceSize=meta$sourceSize,
167 |         SourceLastModifiedDate=meta$sourceLastModifiedDate,
168 |         MoreArgs=list(
169 |           BiocVersion=BiocVersion,
170 |           Coordinate_1_based = TRUE,
171 |           DataProvider="Ensembl",
172 |           Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
173 |           SourceType="FASTA",
174 |           DispatchClass="FaFile",
175 |           RDataClass=c("FaFile", "FaFile", "FaFile"),
176 |           RDataDateAdded=Sys.time(),
177 |           Recipe="AnnotationHubData:::ensemblFastaToFaFile",
178 |           Tags=c("FASTA", "ensembl", "sequence")))
179 | }
180 | 
181 | ## Used in makeEnsemblFastaAHM() and makeGencodeFastaToAHM():
182 | ## Unzips .gz file, indexes it and saves as .rz and .rz.fai.
183 | .fastaToFaFile <- function(ahm)
184 | {
185 |     ## target output file
186 |     faOut <- outputFile(ahm)[[1]]
187 |     srcFile <- sub('.bgz$','.gz',faOut)
188 |     ## unzip and index
189 |     bgzip(srcFile)
190 |     indexFa(faOut)
191 | }
192 | 
193 | ensemblFastaToFaFile <- function(ahm)
194 | {
195 |     .fastaToFaFile(ahm)
196 | }
197 | 
198 | ## create dispatch class and newResources() method
199 | makeAnnotationHubResource("EnsemblFastaImportPreparer", makeEnsemblFastaToAHM)
200 | 


--------------------------------------------------------------------------------
/R/trackWithAuxiliaryTableToGRangesRecipe.R:
--------------------------------------------------------------------------------
  1 | .makeAuxTable <- function(n, auxFiles, ahm){
  2 |     
  3 |      colClasses <- metadata(ahm)$RecipeArgs$auxColClasses[n][[1]]$cols
  4 |      auxFile <- auxFiles[n]
  5 |      tbl.aux <- read.table(auxFile, sep="\t", colClasses=colClasses)
  6 |      colnames(tbl.aux) <- names(colClasses)
  7 |      tbl.aux
  8 | }
  9 | 
 10 | .getMergeArgs <- function(n, ahm){
 11 |     metadata(ahm)$RecipeArgs$auxColClasses[n][[1]]$merge
 12 | }
 13 | 
 14 | 
 15 | ## from FILES (with json)
 16 | trackWithAuxiliaryTablesToGRanges <- function(ahm)
 17 | {
 18 |      mainFile <- inputFiles(ahm)[1] ## always the 1st one? - discuss with Dan and Paul
 19 |      auxFiles <- inputFiles(ahm)[-1]
 20 |      if(!(length(mainFile) == 1)) stop("No files present in input json.") 
 21 |      if(!(length(auxFiles) >= 1)) stop("No auxiliary files listed in input json.  Wrong recipe?")
 22 | 
 23 |      colClasses <- metadata(ahm)$RecipeArgs$mainColClasses
 24 |      tbl.main <- read.table(gzfile(mainFile), sep="\t", header=FALSE,
 25 |                            colClasses=colClasses)
 26 |      colnames(tbl.main) <- names(colClasses)
 27 | 
 28 |      auxLen <- length(auxFiles)
 29 |      ## a couple for loops because we need to know 'n'...
 30 |      auxTabs <- list()
 31 |      for(i in seq_len(auxLen)){
 32 |          auxTabs[[i]] <- .makeAuxTable(i, auxFiles, ahm)
 33 |      } 
 34 |      mergeArgs <- list()
 35 |      for(i in seq_len(auxLen)){
 36 |          mergeArgs[[i]] <- .getMergeArgs(i, ahm)
 37 |      } 
 38 |      
 39 |      ## merge together uses for loop again (to concentrate result down to one thing)
 40 |      for(i in  seq_len(auxLen)){
 41 |          if(i ==1){tbl <- tbl.main}
 42 |          ## otherwise recycle
 43 |          tbl <- merge(tbl, auxTabs[[i]], by.x=mergeArgs[[i]][["byX"]],
 44 |                        by.y=mergeArgs[[i]][["byY"]],
 45 |                        all.x=TRUE)  
 46 |      }
 47 |      
 48 |      tbl <- .sortTableByChromosomalLocation(tbl)
 49 |      colnames <- colnames(tbl)
 50 |      requiredColnames <- c("seqname", "start", "end")
 51 |      stopifnot(all(requiredColnames %in% colnames))
 52 |      otherColnames <- setdiff(colnames, requiredColnames)
 53 | 
 54 |      ## drop any rows withouth a seqname
 55 |      tbl <- tbl[!is.na(tbl$seqname),]
 56 |      
 57 |      if("strand" %in%  otherColnames){
 58 |          gr <- with(tbl, GRanges(seqname, IRanges(start, end), strand))
 59 |          otherColnames <- setdiff(colnames, c(requiredColnames,"strand"))
 60 |      }else{  
 61 |          gr <- with(tbl, GRanges(seqname, IRanges(start, end)))
 62 |      }
 63 |      
 64 |      mcols(gr) <- DataFrame(tbl[, otherColnames])
 65 | 
 66 |         # add seqlength & chromosome circularity information
 67 |     newSeqInfo <- constructSeqInfo(metadata(ahm)$Species,
 68 |                                     metadata(ahm)$Genome)
 69 |         # if gr only has a subset of all possible chromosomes,
 70 |         # then update those only
 71 |     seqinfo(gr) <- newSeqInfo[names(seqinfo(gr))]
 72 | 
 73 |     save(gr, file=outputFile(ahm))
 74 |     if (!getOption("AnnotationHub_Use_Disk", FALSE)) {
 75 |         upload_to_S3(outputFile(ahm), metadata(ahm)$RDataPath)
 76 |     }
 77 | 
 78 |     outputFile(ahm)
 79 | 
 80 | } # trackWithAuxiliaryTableToGRanges
 81 | #-------------------------------------------------------------------------------
 82 | 
 83 | 
 84 | 
 85 | 
 86 | ## helper to remove 'id' col
 87 | .removeId <- function(table){ 
 88 |     newColnames <- setdiff(colnames(table), "id")
 89 |     table[,newColnames]
 90 | }
 91 | ## compress a whole table one col at a time.
 92 | ## This (currently) assumes all cols should be characters
 93 | .compressTable <- function(table, levels){
 94 |     sf <- factor(table$id,levels=levels)
 95 |     table <- .removeId(table)
 96 |     res <- DataFrame()
 97 |     for(i in seq_len(ncol(table))){ ## for ea. column
 98 |         col <- splitAsList(as.character(table[[i]]), f=sf)
 99 |         if(i==1){
100 |             res <- DataFrame(col)
101 |         }else{
102 |             res <- DataFrame(res, DataFrame(col)) ## cbind doesn't work?
103 |         }
104 |     }
105 |     colnames(res) <- colnames(table)
106 |     res
107 | }
108 | 
109 | 
110 | .makeComplexGR <- function(tbl,auxFiles,auxTabs){
111 |     ## replace "chrom" with "seqnames".
112 |     colnames(tbl)[colnames(tbl) %in% "chrom"] <- "seqname"
113 |     colnames(tbl)[colnames(tbl) %in% "chromStart"] <- "start"
114 |     colnames(tbl)[colnames(tbl) %in% "chromEnd"] <- "end"
115 |     
116 |     tbl <-.sortTableByChromosomalLocation(tbl)
117 |     colnames <- colnames(tbl)
118 |     requiredColnames <- c("seqname", "start", "end")
119 |     stopifnot(all(requiredColnames %in% colnames))
120 |     otherColnames <- setdiff(colnames, requiredColnames)
121 |     
122 |     ## drop any rows withouth a seqname
123 |     tbl <- tbl[!is.na(tbl$seqname),]
124 |     
125 |     if("strand" %in%  otherColnames){
126 |         gr <- with(tbl, GRanges(seqname, IRanges(start, end), strand))
127 |         otherColnames <- setdiff(colnames, c(requiredColnames,"strand"))
128 |     }else{  
129 |         gr <- with(tbl, GRanges(seqname, IRanges(start, end)))
130 |     }
131 |     ## append the initial mcols
132 |     mcols(gr) <- DataFrame(tbl[, otherColnames])
133 |     
134 |     
135 |     
136 |     ## make a spliting factor based on the initial table
137 |     splitFactor <- factor(tbl$id, levels=tbl$id)
138 |     
139 |     for(i in seq_along(auxFiles)){
140 |         new <- auxTabs[[i]]
141 |         if(identical(as.character(splitFactor), as.character(new$id))){
142 |             ## Add it in
143 |             mcols(gr) <- DataFrame(mcols(gr), .removeId(new))
144 |         }else{## otherwise compress it 1st               
145 |             mcols(gr) <- DataFrame(mcols(gr),.compressTable(new,
146 |                                                     levels(splitFactor)))
147 |         }
148 |     }
149 |     gr
150 | }
151 | 
152 | ## Track AND auxiliary tables.
153 | ## Unfortunately, the schemas for some tracks are complex.
154 | ## This means that in the future I will have to use ucscSchema etc. to
155 | ## get the addional information so that I can properly assemble them.
156 | ## For now, we will check for "id" and only proceed if all tables have this.
157 | trackandTablesToGRangesRecipe <- function(ahm)
158 | {
159 |     session <- browserSession()
160 |     genome <- metadata(ahm)$Genome
161 |     genome(session) <- genome
162 |     sourceFile <- metadata(ahm)$SourceFile
163 |     track <- sub("^.+/database/","",sourceFile)
164 |     query <- ucscTableQuery(session, track)
165 |     tableNames <- tableNames(query)
166 |     
167 |     mainFile <- tableNames[1] ## always the 1st one to be 2main table 
168 |     auxFiles <- tableNames[-1]    
169 |     if(!(length(auxFiles) >= 1)) { ## this means we are done already
170 |         gr <- track(query)
171 |     }else{ ## have to do a merge 1st
172 |         ## have to "get" primary in table form to assure "id" will be present
173 |         tbl <- getTable(ucscTableQuery(session, mainFile))
174 | 
175 |         ## Now get the other tables
176 |         auxTabs <- list()
177 |         for(i in seq_along(auxFiles)){
178 |             ## query <- ucscTableQuery(session, track)
179 |             tableName(query) <- auxFiles[i]
180 |             auxTabs[[i]] <- getTable(query)
181 |         }
182 | 
183 |         allColNames <- list()
184 |         allColNames[[1]] <- colnames(tbl)
185 |         for(i in seq_len(length(auxTabs))){
186 |             idx <- i+1
187 |             #print(idx)
188 |             allColNames[[idx]] <- colnames(auxTabs[[i]])
189 |         }
190 |         ## for each element is there a value called "id"?
191 |         idPresent <- unlist(lapply(allColNames, function(x){'id' %in% x}))
192 |         
193 |         if(all(idPresent)){
194 |             gr <- .makeComplexGR(tbl,auxFiles,auxTabs)
195 |         }else{
196 |             message("track schema is too complex: using basic track instead")
197 |             query <- ucscTableQuery(session, track)
198 |             gr <- track(query)
199 |         }
200 |         
201 |     }
202 |     
203 |     
204 | ##     ## add seqlength & chromosome circularity information
205 | ##     newSeqInfo <- constructSeqInfo(metadata(ahm)$Species,
206 | ##                                     metadata(ahm)$Genome)
207 | ##     ## if gr only has a subset of all possible chromosomes,
208 | ##     ## then update those only
209 | ##     seqinfo(gr) <- newSeqInfo[names(seqinfo(gr))]
210 |     save(gr, file=outputFile(ahm))
211 |     outputFile(ahm)
212 | 
213 | } # trackandTablesToGRangesRecipe
214 | #-------------------------------------------------------------------------------
215 | 
216 | #-------------------------------------------------------------------------------
217 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | ### =========================================================================
  2 | ### Utility functions
  3 | ### -------------------------------------------------------------------------
  4 | ###
  5 | 
  6 | constructSeqInfo <- function(species, genome)
  7 | {
  8 |   recognized.human <- species=="Homo sapiens" & genome %in% c("hg18", "hg19")
  9 |   recognized.mouse <- species=="Mus musculus" & genome %in% c("mm10")
 10 |   recognized <- recognized.human | recognized.mouse
 11 |   stopifnot(recognized)
 12 |  
 13 |   suppressMessages({
 14 |        # chroms 1-22, X, Y, M are assumed to be the first 25 rows of the
 15 |        # data.frame
 16 |      if(recognized.human)
 17 |         tbl.chromInfo =
 18 |             GenomicFeatures:::.makeUCSCChrominfo (genome,
 19 |                                                   circ_seqs="chrM") [1:25,]
 20 |      if(recognized.mouse)
 21 |         tbl.chromInfo =
 22 |             GenomicFeatures:::.makeUCSCChrominfo (genome,
 23 |                                                   circ_seqs="chrM") [1:22,]
 24 |  
 25 |      })
 26 | 
 27 |    Seqinfo(as.character(tbl.chromInfo$chrom), 
 28 |            seqlengths=tbl.chromInfo$length, 
 29 |            isCircular=tbl.chromInfo$is_circular,
 30 |            genome=genome)
 31 | }
 32 | 
 33 | .sortTableByChromosomalLocation <- function(tbl)
 34 | {
 35 |   stopifnot (all (c ('seqname', 'start') %in% colnames (tbl)))
 36 |   factor.chromNames <- factor (tbl$seqname,
 37 |                                levels=paste("chr", c(1:22, "X", "Y", "M"),
 38 |                                             sep=''))
 39 |   tbl$seqname <- factor.chromNames
 40 |   tbl <- tbl [order (tbl$seqname, tbl$start), ]
 41 |   invisible (tbl)
 42 | 
 43 | } 
 44 | 
 45 | .printf <- function(...) print(noquote(sprintf(...)))
 46 | 
 47 | ## from ?grep, by Luke Tierney
 48 | URL_parts <- function(x)
 49 | {
 50 |     m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
 51 |     parts <- do.call(rbind,
 52 |        lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L)))
 53 |     colnames(parts) <- c("protocol","host","port","path")
 54 |     parts
 55 | }
 56 | 
 57 | ## log messages to console AND to a file
 58 | flog <- function(level, ...)
 59 | {
 60 |     loggerFunction <- switch(level,
 61 |         "flog.fatal",
 62 |         "flog.error",
 63 |         "noop",
 64 |         "flog.warn",
 65 |         "noop",
 66 |         "flog.info",
 67 |         "noop",
 68 |         "flog.debug",
 69 |         "flog.trace"
 70 |         )
 71 |     dots <- list(...)
 72 |     do.call(loggerFunction, dots)
 73 |     dots$name <- "file"
 74 |     do.call(loggerFunction, dots)
 75 | }
 76 | 
 77 | ## Uploading to S3 usually happens in AnnotationHubServer, but 
 78 | ## when running the track*Recipe recipes, it happens in
 79 | ## AnnotationHubData. There is an RAmazonS3 R package but 
 80 | ## it does not work well for uploading files. Therefore this
 81 | ## function expects the AWS CLI to be installed. 
 82 | ## See: https://aws.amazon.com/cli/
 83 | ## It should be configured with a user who can write to 
 84 | ## the appropriate bucket. 
 85 | upload_to_S3 <- 
 86 |     function(file, remotename, bucket, profile, acl="public-read")
 87 | {
 88 |     remotename <- sub("^\\/", "", remotename)
 89 |     #aws --profile ahs_content_uploader s3 cp --acl public-read test s3://annotationhub/loquat/vato/manichean/test
 90 |     profileStr <- " "
 91 |     if (!missing(profile))
 92 |         profileStr <- paste("--profile ", profile)
 93 | 
 94 |     cmd <- "aws"
 95 |     if (length(file) != length(remotename))
 96 |         stop("Length of file does not match length of remotename!")
 97 | 
 98 |     for (i in 1:length(file)) {
 99 |         thisFile <- file[i]
100 |         thisRemoteName <- remotename[i]
101 |         quotes = getOption("useFancyQuotes")
102 |         on.exit(options(useFancyQuotes=quotes))
103 |         options(useFancyQuotes=FALSE)
104 |         args <- sprintf("%s s3 cp --region us-east-1 --acl %s %s s3://%s/%s",
105 |             profileStr, acl, dQuote(thisFile), bucket, dQuote(thisRemoteName))
106 |         res <- system2(cmd, args)
107 |         if (res != 0)
108 |             stop(sprintf("Failed to upload %s to S3! Result was %s.", file, res))
109 |     }
110 |     
111 |     TRUE
112 | }
113 | 
114 | 
115 | ## new function to upload to azure
116 | 
117 | upload_to_azure <-
118 |     function(file, sas)
119 | {
120 |     if(missing(sas)){
121 |         sas = Sys.getenv("AZURE_SAS_URL", NA_character_)
122 |     }
123 |     if(is.na(sas)){
124 |         stop("AZURE_SAS_URL environment variables is not set or given")
125 |     }
126 |     stopifnot(startsWith(prefix="https", sas))
127 |     if(Sys.which("azcopy") == ""){
128 |         stop("Please download azcopy")
129 |     }
130 | 
131 |     args <- paste0("copy --recursive ", file, " '", sas, "'")
132 |     res <- system2("azcopy", args)
133 |     if (res != 0) stop(sprintf("Failed to upload %s to Azure!", file))
134 |     TRUE
135 | }
136 | 
137 | 
138 | 
139 | globalVariables(c("futile.logger"))
140 | 
141 | .onLoad <-
142 |     function(libname, pkgname)
143 | {
144 |    logDir <- file.path(Sys.getenv("HOME"),
145 |         sprintf(".%s", pkgname))
146 |     if (!file.exists(logDir))
147 |     {
148 |         .printf("Creating log directory %s", logDir)
149 |         dir.create(logDir)
150 |     }
151 |     l <- library
152 |     l(futile.logger)
153 |     flog.threshold(TRACE)
154 |     flog.appender(appender.file(file.path(logDir,
155 |         sprintf("%s.log", pkgname))), name="file")
156 | }
157 | 
158 | 
159 | `%_%` <- function(a, b) paste0(a, b)
160 | 
161 | 
162 | # Create "pointer" variables for large data sets.
163 | ptr <- pointer <- function(..., pos=-1, envir=as.environment(pos),
164 |     namedList=TRUE, expandCharacter=FALSE)
165 | {
166 |     variableList <- tail(as.list(match.call()), -1)
167 | 
168 |     if (length(variableList) == 0)
169 |         stop("Must supply reference object.")
170 | 
171 |     exclusions <- intersect(names(variableList), setdiff(names(formals()),
172 |         "..."))
173 |     for (exclusion in exclusions)
174 |         variableList[[exclusion]] = NULL
175 | 
176 |     if (length(variableList) == 0)
177 |         stop("Must supply reference object.")
178 | 
179 |     if (expandCharacter) {
180 |         temp = character()
181 |         for (variable in variableList) {
182 |             if (typeof(variable) == "character")
183 |                 temp <- c(temp, variable)
184 |             else if (typeof(variable) == "symbol") {
185 |                 evaluatedVariable <- eval(variable)
186 |                 if (typeof(evaluatedVariable) == "character")
187 |                     temp <- c(temp, evaluatedVariable)
188 |                 else if (is.environment(evaluatedVariable)) {
189 |                     for (name in ls(evaluatedVariable))
190 |                         temp <- c(temp, variable %_% "$" %_% name)
191 |                 }
192 |                 else
193 |                     temp <- c(temp, as.character(variable))
194 |             }
195 |         }
196 |         pointerNames <- temp
197 |     }
198 |     else
199 |         pointerNames <- as.character(variableList)
200 | 
201 |     returnList <- list()
202 |     for (pointerName in pointerNames) {
203 |         e <- envir
204 |         pName <- pointerName
205 | 
206 |         reEnv <- "^(.+?)\\$(.+?)$"
207 |         envMatch <- regexec(reEnv, pointerName)
208 |         envMatches <- NULL
209 |         if (envMatch[[1]][1] != -1) {
210 |             envMatches <- regmatches(pointerName, envMatch)[[1]][2:3]
211 |             e <- get(envMatches[1])
212 |             pName <- envMatches[2]
213 |         }
214 | 
215 |         p <- list()
216 |         p$object <- e
217 |         p$name <- as.character(pName)
218 |         class(p) <- "pointer"
219 | 
220 |         index <- length(returnList) + 1
221 |         if (namedList) index <- p$name
222 | 
223 |         returnList[[index]] <- p
224 |     }
225 | 
226 |     if (length(returnList) == 1)
227 |         return (returnList[[1]])
228 | 
229 |     return (returnList)
230 | }
231 | 
232 | as.pointer <- function(x)
233 | {
234 |     pointer(x)
235 | }
236 | 
237 | is.pointer <- function(x)
238 | {
239 |     return (inherits(x, "pointer"))
240 | }
241 | 
242 | .. <- deref <- function(x)
243 | {
244 |     if (is.environment(x)) return (x)
245 |     else return (get(x$name, envir=x$object))
246 | }
247 | `..<-` <- `deref<-` <- function(x, value)
248 | {
249 |     if (is.pointer(x)) assign(x$name, value, envir=x$object)
250 |     return (x)
251 | }
252 | 
253 | print.pointer <- function(x, ...)
254 | {
255 |     environment.name <- capture.output(print(x$object))
256 |     cat("Pointer to variable '", x$name, "' in ", environment.name, ":\n\n", sep="")
257 |     str(..(x), ...)
258 | }
259 | 
260 | ## usage:
261 | # x <- list(frog="frog", fish="~frog")
262 | # z <- pointer(x)
263 | # ..(z)
264 | # ..(z)$fish <- "trout"
265 | # ..(z)
266 | # x
267 | 


--------------------------------------------------------------------------------
/R/makeGencodeGFF.R:
--------------------------------------------------------------------------------
  1 | # recipe to get GFF3 files from Genecode.
  2 | # importtant links
  3 | #http://www.gencodegenes.org/releases/
  4 | #ftp site: ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human
  5 | # readme file for genecode project
  6 | # the above code was updated documented at
  7 | # ftp://ftp.sanger.ac.uk/pub/gencode/README.txt
  8 | #ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/_README.TXT
  9 | 
 10 | 
 11 | # for gff3 files
 12 | #gencode.vX.annotation.gff3.gz
 13 | #gencode.vX.chr_patch_hapl_scaff.annotation.gff3.gz
 14 | #gencode.vX.polyAs.gff3.gz:
 15 | #gencode.vX.polyAs.gff3.gz:
 16 | #gencode.vX.2wayconspseudos.gff3.gz:
 17 | # gencode.vX.long_noncoding_RNAs.gff3.gz
 18 | #gencode.vX.tRNAs.gff3.gz
 19 | 
 20 | # only gff3 files will be added - since both gtf and gff3 contain same
 21 | # data, but gff3 is better (Herve) .These files will not be stored as
 22 | # a GRanges on amazon s3.
 23 | 
 24 | .gencodeBaseUrl <- "ftp://ftp.ebi.ac.uk/pub/databases/gencode/"
 25 | 
 26 | .gencodeFileFromUrl <- function(urls) {
 27 |     unlist(sapply(urls, function(url) {
 28 |         listing <- .ftpDirectoryInfo(url)
 29 | 
 30 |         ## find entries marking directory
 31 |         idx <- grepl("^./", listing)
 32 |         tag <- sub("./(.*):", "\\1/", listing[idx])
 33 |         directory <- c("", tag)[cumsum(idx) + 1L]
 34 |         ## complete URL
 35 |         idx <- grepl("gencode", listing)
 36 |         paste0(url, directory, sub(".*gencode", "gencode", listing))[idx]
 37 | 
 38 |     }, USE.NAMES=FALSE))
 39 | }
 40 | 
 41 | .gencodeDescription <- function(fileurls){
 42 |     # add description map here.
 43 |     map <- c(
 44 |       annotation.gff3.gz=.expandLine("Gene annotations
 45 |           on reference chromosomes from Gencode"),
 46 |       chr_patch_hapl_scaff.annotation.=.expandLine("Gene annotation
 47 |           on reference-chromosomes/patches/scaffolds/haplotypes from Gencode"),
 48 |       polyAs=.expandLine("files contain polyA signals, polyA sites and
 49 |           pseudo polyAs manually annotated by HAVANA from only the refrence
 50 |           chromosome"),
 51 |       wayconspseudos=.expandLine("pseudogenes predicted by the Yale
 52 |           & UCSC pipelines, but not by Havana on reference chromosomes"),
 53 |       long_noncoding_RNAs=.expandLine("sub-set of the main annotation files
 54 |           on the reference chromosomes. They contain only the lncRNA genes.
 55 |           Long non-coding RNA genes are considered the genes with any of
 56 |           those biotypes: 'processed_transcript', 'lincRNA',
 57 |           '3prime_overlapping_ncrna', 'antisense', 'non_coding',
 58 |           'sense_intronic' , 'sense_overlapping' , 'TEC' , 'known_ncrna'."),
 59 |       tRNAs =.expandLine("tRNA structures predicted by tRNA-Scan on
 60 |           reference chromosomes"),
 61 |       transcripts.fa.gz=.expandLine("Protein-coding transcript sequences
 62 |           on reference chromosomes Fasta file"),
 63 |       translations.fa.gz=.expandLine("Translations of protein-coding
 64 |           transcripts on reference chromosomes Fasta file"),
 65 |       lncRNA_transcripts.fa.gz=.expandLine("Long non-coding RNA
 66 |           transcript sequences on reference chromosomes Fasta file."),
 67 |       unmapped=.expandLine("Unmapped")
 68 |       )
 69 |     description <- character(length(fileurls))
 70 |     for (i in seq_along(map))
 71 |         description[grep(names(map)[i], fileurls)] <- map[[i]]
 72 | 
 73 |     description
 74 | }
 75 | 
 76 | .gencodeGenome <- function(species, release) {
 77 |     # this information is curated from Gencode's website
 78 |     # link - http://www.gencodegenes.org/releases/
 79 |     if (species=="Human")
 80 |       tblurl <- "https://www.gencodegenes.org/human/releases"
 81 |     else
 82 |       tblurl <- "https://www.gencodegenes.org/mouse/releases"
 83 | 
 84 |     ## read in the table
 85 |     tryCatch({
 86 |         http <- RCurl::getURL(tblurl)
 87 |         tbl <- XML::readHTMLTable(http, header=TRUE, stringsAsFactors=FALSE)
 88 |     },  error = function(err) {
 89 |         stop("Error reading ", tblurl,
 90 |     ".\n  SSL issue reported in Ubuntu 20?")
 91 |     })
 92 |     
 93 |     tbl <- tbl[[1]]
 94 |     tblheader <- gsub("\n", "", colnames(tbl))
 95 |     tblheader = trimws(tblheader)
 96 |     colnames(tbl) = tblheader
 97 | 
 98 |     idx <- which(tbl[,"GENCODE release"]==release)
 99 |     tbl[idx,"Genome assembly version"]
100 | }
101 | 
102 | 
103 | # Helper to retrieve GTF & GFF3 file urls from Gencode
104 | .gencodeSourceUrls <- function(species, release, filetype, justRunUnitTest)
105 | {
106 |     speciesUrl <- ifelse(species=="Human", "Gencode_human/", "Gencode_mouse/")
107 |     dirurl = paste0(.gencodeBaseUrl, speciesUrl, "release_", release, "/")
108 |     names(dirurl) <- paste0(species,"_", release)
109 | 
110 |     fileurls <-.gencodeFileFromUrl(dirurl)
111 | 
112 |     if (tolower(filetype)=="gff")
113 |        idx <-  grep("gff3", fileurls)
114 |     if(tolower(filetype)=="fasta")
115 |        idx <-  grep("fa.gz", fileurls)
116 |     fileurls <- fileurls[idx]
117 | 
118 |     if(length(idx)==0)
119 |      stop("No files found.")
120 | 
121 |      if(justRunUnitTest)
122 |         fileurls <- fileurls[1:2]
123 | 
124 |     ## tags
125 |     filename <- basename(fileurls)
126 |     filename <- sub(".gz","", filename)
127 |     tags <- gsub("[.]",",",filename)
128 | 
129 |     ## description
130 |     description <- .gencodeDescription(fileurls)
131 | 
132 |     ## rdatapath - these files will be made into GRanges and stored on S3.
133 |     #rdatapath <- paste0("gencode/", species, "/release_", release,"/",
134 |     #    basename(fileurls), ".Rda")
135 | 
136 |     rdatapath <- sub(.gencodeBaseUrl, "", fileurls)
137 | 
138 | 
139 |     ## get date and size for files
140 |     df <- .httrFileInfo(fileurls)
141 |     rownames(df) <- NULL
142 | 
143 |     ## species, taxid, genome
144 |     scSpecies <- ifelse(species=="Human", "Homo sapiens", "Mus musculus")
145 |     taxid <- ifelse(species=="Human", 9606L, 1090L)
146 |     genome <- .gencodeGenome(species, release)
147 |     genome <- rep(genome, length(fileurls))
148 |     genome[grepl('_mapping/', rdatapath)] <-
149 |         gsub('.*/', '',
150 |              gsub('_mapping/.*', '',
151 |                   rdatapath[grepl('_mapping/', rdatapath)])
152 |              )
153 |     scSpecies <- rep(scSpecies, length(fileurls))
154 |     taxid <- rep(taxid, length(fileurls))
155 | 
156 |     cbind(df, rdatapath, description, tags, species=scSpecies, taxid, genome,
157 |          stringsAsFactors=FALSE)
158 | }
159 | 
160 | 
161 | ## STEP 1: make function to process metadata into AHMs
162 | makeGencodeGFFsToAHMs <- function(currentMetadata,
163 |                                   species=c("Human", "Mouse"),
164 |                                   release,
165 |                                   justRunUnitTest=FALSE,
166 |                                   BiocVersion=BiocManager::version()){
167 | 
168 |     ## important - here you need to know which species and release you want to
169 |     ## add files for.
170 |     species <- match.arg(species)
171 |     rsrc <- .gencodeSourceUrls(species = species, release = release,
172 |         filetype = "gff", justRunUnitTest = justRunUnitTest)
173 | 
174 |     description <- rsrc$description
175 |     title <- basename(rsrc$fileurl)
176 |     genome <- rsrc$genome
177 |     sourceUrls <- rsrc$fileurl
178 |     #
179 |     # FixMe: in .gencodeSourceUrls the data should be LastModified time
180 |     #    in webAccess function .httrFileInfo these urls have that information
181 |     #    in the body not the header but this function is used elsewhere
182 |     #
183 |     sourceVersion <- as.character(rsrc$date) ## should be character
184 |     if(all(is.na(sourceVersion))){
185 |         sourceVersion = rep(release, length(sourceVersion))
186 |     }
187 |     SourceLastModifiedDate <- rsrc$date  ## should be "POSIXct" "POSIXt"
188 |     SourceSize <- as.numeric(rsrc$size)
189 |     tags <- strsplit(rsrc$tag, ",")
190 |     species <- rsrc$species
191 |     rdatapath <- rsrc$rdatapath
192 |     taxid <- rsrc$taxid
193 | 
194 |     Map(AnnotationHubMetadata,
195 |         Description=description,
196 |         Genome=genome,
197 |         SourceUrl=sourceUrls,
198 |         SourceSize=SourceSize,
199 |         SourceLastModifiedDate=SourceLastModifiedDate,
200 |         SourceVersion=sourceVersion,
201 |         Species=species,
202 |         RDataPath=rdatapath,
203 |         TaxonomyId=taxid,
204 |         Title=title,
205 |         Tags=tags,
206 |         MoreArgs=list(
207 |           BiocVersion=BiocVersion,
208 |           Coordinate_1_based = TRUE,
209 |           DataProvider = "Gencode",
210 |           Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
211 |           RDataClass = "GRanges",
212 |           DispatchClass="GFF3File",
213 |           SourceType="GFF",
214 |           Location_Prefix=.gencodeBaseUrl,
215 |           RDataDateAdded = Sys.time(),
216 |           Recipe="AnnotationHubData:::gencodeGFFToGRanges"))
217 | }
218 | 
219 | gencodeGFFToGRanges <- function(ahm)
220 | {
221 |     outputFile(ahm)[[1]]
222 | }
223 | 
224 | ## STEP 2:  Call the helper to set up the newResources() method
225 | makeAnnotationHubResource("GencodeGffImportPreparer",
226 |                           makeGencodeGFFsToAHMs)
227 | 


--------------------------------------------------------------------------------
/man/AnnotationHubMetadata-class.Rd:
--------------------------------------------------------------------------------
  1 | \name{AnnotationHubMetadata-class}
  2 | \docType{class}
  3 | 
  4 | % Class:
  5 | \alias{class:HubMetadata}
  6 | \alias{HubMetadata-class}
  7 | \alias{HubMetadata}
  8 | \alias{class:AnnotationHubMetadata}
  9 | \alias{AnnotationHubMetadata-class}
 10 | \alias{AnnotationHubMetadata}
 11 | 
 12 | % Constructors:
 13 | \alias{AnnotationHubMetadata}
 14 | 
 15 | % Accessors:
 16 | \alias{metadata}
 17 | \alias{metadata,HubMetadata-method}
 18 | \alias{metadata<-}
 19 | \alias{metadata<-,HubMetadata,list-method}
 20 | \alias{inputFiles}
 21 | \alias{inputFiles,HubMetadata-method}
 22 | \alias{outputFile}
 23 | \alias{outputFile,HubMetadata-method}
 24 | \alias{recipeName}
 25 | \alias{recipeName,HubMetadata-method}
 26 | \alias{hubError}
 27 | \alias{hubError,list-method}
 28 | \alias{hubError,HubMetadata-method}
 29 | \alias{hubError<-}
 30 | \alias{hubError<-,list,character-method}
 31 | \alias{hubError<-,HubMetadata,character-method}
 32 | 
 33 | % Methods:
 34 | \alias{run}
 35 | \alias{run,AnnotationHubMetadata-method}
 36 | 
 37 | % Other:
 38 | \alias{HubMetadataFromJson}
 39 | \alias{toJson}
 40 | \alias{constructSeqInfo}
 41 | \alias{ahmToJson}
 42 | \alias{deleteResources}
 43 | \alias{getImportPreparerClasses}
 44 | \alias{makeAnnotationHubResource}
 45 | 
 46 | % Show:
 47 | \alias{show}
 48 | \alias{show,HubMetadata-method}
 49 | 
 50 | 
 51 | \title{Class \code{"AnnotationHubMetadata"} and methods}
 52 | 
 53 | \description{
 54 | 
 55 |   \code{AnnotationHubMetadata} is used to represent record(s) in the
 56 |   server data base.
 57 | 
 58 | }
 59 | 
 60 | \section{Objects from the Class}{
 61 | 
 62 |   Objects can be created by calls to the constructor,
 63 |   \code{AnnotationHubMetadata()}.
 64 | 
 65 | }
 66 | 
 67 | \usage{
 68 | AnnotationHubMetadata(AnnotationHubRoot, SourceUrl, SourceType,
 69 | 		 SourceVersion, SourceLastModifiedDate, SourceMd5 =
 70 | 		 NA_character_, SourceSize, DataProvider, Title,
 71 | 		 Description, Species, TaxonomyId, Genome, Tags,
 72 | 		 Recipe, RDataClass, RDataDateAdded, RDataPath,
 73 | 		 Maintainer, ..., BiocVersion = BiocManager::version(),
 74 | 		 Coordinate_1_based = TRUE, Notes = NA_character_,
 75 | 		 DispatchClass, Location_Prefix =
 76 | 		 "https://bioconductorhubs.blob.core.windows.net/annotationhub/")
 77 | 
 78 | toJson(x)
 79 | constructSeqInfo(species, genome)
 80 | 
 81 | metadata(x, ...)
 82 | hubError(x)
 83 | inputFiles(object, ...)
 84 | outputFile(object)
 85 | ahmToJson(ahm)
 86 | deleteResources(id)
 87 | getImportPreparerClasses()
 88 | makeAnnotationHubResource(objName, makeAnnotationHubMetadataFunction,
 89 | 			  ..., where)
 90 | }
 91 | 
 92 | \arguments{
 93 |     \item{AnnotationHubRoot}{
 94 |       \code{character(1)} Absolute path to directory structure
 95 |       containing resources to be added to AnnotationHub. Internal use only.
 96 |     }
 97 |     \item{SourceUrl}{
 98 |       \code{character()} URL of original resource(s).
 99 |     }
100 |     \item{SourceType}{
101 |       \code{character()} Form of original data, e.g., BED, FASTA,
102 |     etc. \code{getValidSourceTypes()} list currently acceptable
103 |     values. If nothing seems appropiate for your data reach out to
104 |     maintainer@bioconductor.org.
105 | 
106 |     }
107 |     \item{SourceVersion}{
108 |       \code{character(1)} Version of original file.
109 |     }
110 |     \item{SourceLastModifiedDate}{
111 |       \code{POSIXct()} The date when the source was last modified.
112 |     }
113 |     \item{SourceMd5}{
114 |       \code{character()} md5 hash of original file.
115 |     }
116 |     \item{SourceSize}{
117 |       \code{numeric(1)} Size of original file in bytes.
118 |     }
119 |     \item{DataProvider}{
120 |       \code{character(1)} Provider of original data, e.g., NCBI,
121 |       UniProt etc.
122 |     }
123 |     \item{Title}{
124 |       \code{character(1)} Title for the resource with version or genome
125 |       build as appropriate.
126 |     }
127 |     \item{Description}{
128 |       \code{character(1)} Description of the resource. May include
129 |       details such as data type, format, study origin, sequencing
130 |       technology, treated vs control, number of samples etc.
131 |     }
132 |     \item{Species}{
133 |       \code{character(1)} Species name. For help on valid
134 | 	species see \code{getSpeciesList, validSpecies, or suggestSpecies.}
135 | 
136 |     }
137 |     \item{TaxonomyId}{
138 |       \code{character(1)}  NCBI code. There are
139 | 	checks for valid taxonomyId given the Species which produce
140 | 	warnings. See GenomeInfoDb::loadTaxonomyDb() for full validation
141 | 	table.
142 | 
143 |     }
144 |     \item{Genome}{
145 |       \code{character(1)} Name of genome build.
146 |     }
147 |     \item{Tags}{
148 |       \code{character()} Free-form tags that serve as search terms.
149 |     }
150 |     \item{Recipe}{
151 |       \code{character(1)} Name of recipe function. Only applicable to
152 |       recipes created by the Bioconductor core team and included in
153 |       AnnotationHubData base code.
154 |     }
155 |     \item{RDataClass}{
156 |       \code{character()} Class of derived R object, e.g., GRanges. Length
157 |       must match the length of \code{RDataPath}.
158 |     }
159 |     \item{RDataDateAdded}{
160 |       \code{POSIXct()} Date resource was added to AnnotationHub. The
161 |       default is today's date and is auto-generated when metadata are
162 |       constructed. Resources will appear in snapshots with a date greater
163 |       than or equal to the \code{RDataDateAdded}.
164 |     }
165 |     \item{RDataPath}{
166 |       \code{character()} File path to where object is stored in AWS S3
167 |       bucket or on the web.This field should be the
168 | 	      remainder of the path to the resource. The
169 | 	      \code{Location_Prefix} will be prepended to
170 | 	      \code{RDataPath} for the full path to the resource.
171 | 	      If the resource is stored in Bioconductor's AWS S3
172 | 	      buckets, it should start with the name of the package associated
173 | 	      with the metadata and should not start with a leading
174 | 	      slash. It should include the resource file name. For
175 | 	      strongly associated files, like a bam file and its index
176 | 	      file, the two files should be separates with a colon
177 | 	      \code{:}. This will link a single hub id with the multiple files.
178 | 
179 |     }
180 |     \item{Maintainer}{
181 |       \code{character(1)} Maintainer name and email address,
182 |       \sQuote{A Maintainer \url{a.maintainer@email.com}}
183 |     }
184 |     \item{BiocVersion}{\code{character(1)}. The first Bioconductor version
185 |       the resource was made available for. Unless removed from the hub, the
186 |       resource will be available for all versions greater than or equal to this
187 |       field.
188 |     }
189 |     \item{Coordinate_1_based}{
190 |       \code{logical(1)} Do coordinates start with 1 or 0?
191 |     }
192 |     \item{DispatchClass}{
193 |       \code{character(1)}. Determines how data are loaded into R. The value for
194 |       this field should be \sQuote{Rda} if the data were serialized with
195 |       \code{save()} and \sQuote{Rds} if serialized with \code{saveRDS}. The
196 |       filename should have the appropriate \sQuote{rda} or \sQuote{rds}
197 |       extension.
198 | 
199 |       A number of dispatch classes are pre-defined in
200 |       AnnotationHub/R/AnnotationHubResource-class.R with the suffix
201 |       \sQuote{Resource}. For example, if you have sqlite files, the
202 |       AnnotationHubResource-class.R defines SQLiteFileResource so the
203 |       DispatchClass would be SQLiteFile. Contact maintainer@bioconductor.org if
204 |       you are not sure which class to use. The function
205 | 	      \code{AnnotationHub::DispatchClassList()} will output a
206 | 	      matrix of currently implemented DispatchClass and brief
207 | 	      description of utility. If a predefine class does not seem
208 | 	      appropriate contact maintainer@bioconductor.org.
209 | 
210 |     }
211 |     \item{Location_Prefix}{
212 |       \code{character(1)} URL location of AWS S3 bucket or web site where
213 |       resource is located.
214 |     }
215 |     \item{Notes}{
216 |       \code{character()} Notes about the resource.
217 |     }
218 |     \item{ahm}{
219 |       An instance of class \code{AnnotationHubMetadata}.
220 |     }
221 |     \item{x}{
222 |       An instance of class \code{AnnotationHubMetadata}.
223 |     }
224 |     \item{object}{
225 |       An \code{AnnotationHubRecipe} instance.
226 |     }
227 |     \item{species}{
228 |       \code{character(1)} The organism, e.g., "Homo sapiens".
229 |     }
230 |     \item{genome}{
231 |       \code{character(1)} The genome build, e.g., "hg19".
232 |     }
233 |     \item{id}{
234 |       An id whose DB record is to be fully deleted.
235 |     }
236 |     \item{objName}{
237 |       \code{character(1)} The name of the PreparerClass used for dispatch.
238 |     }
239 |     \item{makeAnnotationHubMetadataFunction}{
240 |       \code{function} Function (name) that makes \code{AnnotationHubMetadata}
241 |       objects from the resource(s).
242 |     }
243 |     \item{where}{
244 |       Environment where function definition is defined. Default value
245 |       is sufficient.
246 |     }
247 |     \item{\dots}{
248 |       Additional arguments passed to methods.
249 |     }
250 | }
251 | 
252 | \value{
253 |   \code{AnnotationHubMetadata} returns an instance of the class.
254 | 
255 |   \code{jsonPath} returns a \code{character(1))} representation of the
256 |   full path to the location of the \code{json} file associated with this
257 |   record.
258 | 
259 |   \code{toJson} returns the JSON representation of the record.
260 | 
261 |   \code{fromJson} retuns an instance of the class, as parsed from the
262 |   JSON file.
263 | }
264 | 
265 | \author{Dan Tenenbaum and Marc Carlson}
266 | 
267 | 
268 | \examples{
269 | getClass("AnnotationHubMetadata")
270 | }
271 | 
272 | \keyword{classes}
273 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
  1 | CHANGES IN VERSION 1.29.0
  2 | ------------------------
  3 | 
  4 | NEW FEATURES
  5 | 
  6 |     o 1.29.2 Added HIC as acceptable source type 
  7 |     o 1.29.1 Added CDF as acceptable source type 
  8 | 
  9 | 
 10 | CHANGES IN VERSION 1.25.0
 11 | ------------------------
 12 | 
 13 | SIGNIFICANT UPDATES
 14 | 
 15 |     o 1.25.7 Update recipes to upload to azure. NonStandardOrgDb release recipe
 16 |     updated
 17 |     o 1.25.6 Update recipes to upload to azure. TwoBit ensembl and release
 18 |     recipes for standard TxDb and OrgDb updated
 19 | 
 20 | NEW FEATURES
 21 | 
 22 |     o 1.25.5 Add helper function to upload to azure
 23 | 
 24 | MODIFICATIONS
 25 | 
 26 |     o 1.25.2 Changed makeAnnotationHubMetadata to point to Azure instead of AWS
 27 | 
 28 | CHANGES IN VERSION 1.21.0
 29 | ------------------------
 30 | 
 31 | MODIFICATIONS
 32 | 
 33 |     o 1.21.9 Add PNG as valid source type
 34 |     o 1.21.4 Removed vignette for creating annotation hub package. Reference and
 35 |     refer to single vignette in AnnotationHub 
 36 |     o 1.21.3 Tags for database now combination of biocViews and meta$Tags. Also
 37 |     checks for valid AnnotationHub or AnnotationHubSoftware biocViews.
 38 |     o 1.21.2 Add mtx.gz as valid source type
 39 | 
 40 | BUG CORRECTION
 41 | 
 42 |     o 1.21.3 Fixed bug to run make*HubMetadata using "."
 43 | 
 44 | INTERNAL BUG CORRECTION
 45 | 
 46 |     o 1.21.1 misplaced ! clause
 47 | 
 48 | REMOVED
 49 | 
 50 |     o 1.21.5 Removed BioPax. url no longer valid. Resources were old and never
 51 |     used beyond first addition
 52 | 
 53 | CHANGES IN VERSION 1.19.0
 54 | ------------------------
 55 | 
 56 | INTERNAL BUG CORRECTION
 57 | 
 58 |     o 1.19.2 Update Metadata from Ensembl function to use
 59 |     GenomeInfoDb:::fetch_species_index_from_Ensembl_FTP instead of parsing the
 60 |     file path
 61 |     o 1.19.1 misplaced ! clause
 62 |  
 63 | 
 64 | CHANGES IN VERSION 1.17.0
 65 | ------------------------
 66 | 
 67 | MODIFICATIONS
 68 | 
 69 |     o 1.17.3 add check for valid Title and Description in metadata file. It
 70 |     should not be empty or NA
 71 |     o 1.17.2 add XML as valid source type
 72 |     o 1.17.1 add GSEMatrix as valid source type
 73 | 
 74 | CHANGES IN VERSION 1.15.0
 75 | ------------------------
 76 | 
 77 | MODIFICATIONS
 78 | 
 79 |     o 1.15.13 Added "BLOB" as a valid source type
 80 |     o 1.15.7 Added "MTX" as a valid source type
 81 |     o 1.15.6 Expanded documentation to clarify that data can be hosted
 82 |     publically not strictly Bioconductor AWS
 83 |     o 1.15.4 Added "XLS/XLSX" as valid source type
 84 | 
 85 | INTERNAL BUG CORRECTION
 86 | 
 87 |     o 1.15.11 updated GencodeGFF recipes for potential future use (still would
 88 |     revisit this with another update to do like ensembl on the fly)
 89 |     o 1.15.5 remove validity check that is wrong/outdated
 90 |     o 1.15.1 needToRerunNonStandardOrgDb added as helper function for when
 91 |     generating non standard org dbs. 1.15.3 added try catch in case aws buckets
 92 |     unreachable. 
 93 | 
 94 | CHANGES IN VERSION 1.13.0
 95 | ------------------------
 96 | 
 97 | NEW FEATURES
 98 | 
 99 |     o Added ability to have multiple RDataPaths associated with single hub
100 |     id for strongly associated files (like bam and its bai index file)
101 |     o DispatchClass are now validated against AnnotationHub::DispatchClassList()
102 |     which contains currently available DispatchClass and brief description of
103 |     loading process.
104 | 
105 | CHANGES IN VERSION 1.11.0
106 | ------------------------
107 | MODIFICATIONS
108 | 
109 |     o Removed scripts for Pazar DB as website no longer active
110 |     o Update from BiocInstaller to BiocManager
111 | 
112 | NEW FEATURES
113 | 
114 |     o Species and taxonomyId are now validated against GenomeInfoDbData object
115 | 
116 | BUG FIX
117 | 
118 |     o Fix TwoBit resource receipe. Converts DNA that is not A,C,T,G,N to N do to
119 |     design of rtracklayer::export for TwoBit
120 |     o Fix bug with assignment of tags in annotationhub
121 |     o makeEpigenomeRoadMap recipe updated to account for XML bug that cannot
122 |     handle http urls. updated to https
123 | 
124 | CHANGES IN VERSION 1.10.0
125 | ------------------------
126 | MODIFICATIONS
127 | 
128 |     o Moved readMetadataFromCsv back to AnnotationHubData.
129 | 
130 |     o Use AnnotationHubData::makeAnnotationHubMetadata to validate metadata.csv
131 | 
132 |     o readMetadataFromCsv is now internal function
133 | 
134 | 
135 | CHANGES IN VERSION 1.8.0
136 | ------------------------
137 | 
138 | NEW FEATURES
139 | 
140 |     o Instead of using dropbox or ftp to deliver contributed resources to
141 |     Bioconductor Core, temporary access to Annotation-Contributor user on S3 is
142 |     utilized.
143 | 
144 | MODIFICATIONS
145 | 
146 |     o Modified readMetadataFromCsv; make RDataPath mandatory entry and if
147 |     location_prefix is Bioconductor S3 bucket the Rdatapath must start with the
148 |     package name
149 | 
150 | BUG FIXES
151 | 
152 |     o Add garbage collection to fix twobit memory allocation error
153 | 
154 |     o Fix files not deleting do to special characters in file names
155 | 
156 |     o Import dbGetQuery from DBI
157 | 
158 |     o Remove hard coded biocVersion in unit tests
159 | 
160 | CHANGES IN VERSION 1.6.0
161 | ------------------------
162 | 
163 | NEW FEATURES
164 | 
165 |     o add makeStandardTxDbsToSqlite() recipe
166 | 
167 |     o add 'ensembl' and 'MySQL' as possible SourceType values
168 | 
169 |     o tidy and export makeStandard*ToAHMs and makeNCBIToOrgDbsToAHMs
170 | 
171 | MODIFICATIONS
172 | 
173 |     o move currentMetadata
174 | 
175 |     o tidy pushResources interface
176 | 
177 |     o modified parsing of species name and genome in .ensemblMetadataFromUrl()
178 | 
179 |     o modified standard OrgDb recipe
180 | 
181 |     o enhance and clean vignette
182 | 
183 |     o move 'Tags' check from readCsvFromMetadata() to
184 |     makeAnnotationHubMetadata()
185 | 
186 |     o remove dependency on xml2, curl, httr and probably other wheel
187 |     reinventions, alter imports and suggests
188 | 
189 |     o specify multiple 'Tags' as colon separated string instead of comma
190 |     separated; avoids problems with read.csv()
191 | 
192 |     o select data moved to GenomeInfoDbData package
193 | 
194 |     o Added additional documentation instructions for core members to add
195 |     contributed data to AnnotationHub
196 | 
197 |     o rename files; remove old JSON test file no longer applicable
198 | 
199 |     o pass 'install' argument down through recipe
200 | 
201 |     o General code tidy; remove unused functions and comments; clarify checks
202 | 
203 | BUG FIXES
204 | 
205 |     o readMetadataFromCsv() fills in DataProvider and Coordinate_1_based if missing
206 | 
207 |     o fix bug introduced in checking 'release' in makeEnsemblTwoBit recipe
208 | 
209 |     o makeAnnotationHubMetadata() now processes all inst/extdata/*.csv files
210 | 
211 |     o fix subset and import bug in makeAnnotationHubMetadata()
212 | 
213 |     o Fix bug in Rdatapath and sourceurl for makeEnsemblFasta.R
214 | 
215 | CHANGES IN VERSION 1.4.0
216 | ------------------------
217 | 
218 | NEW FEATURES
219 | 
220 |     o add script to generate user-contributed resources
221 | 
222 |     o makeEnsemblGtfToGRanges() no longer stores data in S3 but downloads
223 |       and converts to GRanges on the fly
224 | 
225 |     o add EnsemblFastaTwoBitToAHM unit test
226 | 
227 |     o add man page for makeEnsemblTwoBitToAHM and
228 |       ensemblFastaToTwoBitFile
229 | 
230 |     o add makeAnnotationHubMetadata() helper
231 | 
232 | MODIFICATIONS
233 | 
234 |     o move GSE62944-related code to ExperimentHub
235 | 
236 |     o move old vignettes to inst/scripts; add 'Introduction to
237 |       AnnotationHubData' vignette
238 | 
239 |     o remove fasta and towbit files on the fly
240 | 
241 |     o add 'uploadToS3' argument to pushResources() and runRecipes()
242 | 
243 |     o move readMetadataFromCsv() from ExperimentHubData to
244 |       AnnotationHubData
245 | 
246 |     o add 'fileName' arg to readMetadataFromCsv(); don't warn when
247 |       'Tags' are provided
248 | 
249 |     o specify length for args in readMetadataFromCsv()
250 | 
251 |     o makeAnnotationHubMetadata() populates PreparerClass with package name
252 | 
253 |     o add 'fileName' arg to makeAnnotationHubMetadata()
254 | 
255 | 
256 | CHANGES IN VERSION 1.2.0
257 | ------------------------
258 | 
259 | NEW FEATURES
260 | 
261 |     o add makeEnsemblTwoBit()
262 | 
263 |     o add hubError(), hubError<- generics and methods
264 | 
265 |     o create 'HubMetadata' class which 'AnnotationHubMetadata' inherits from
266 | 
267 | MODIFICATIONS
268 | 
269 |     o export ensemblFastaToTwoBitFile()
270 | 
271 |     o modifications due to changes in httr::HEAD():
272 |       - AFAICT httr::HEAD() >= 1.1.0 accepts https only, not ftp
273 |       - use xml2 instead of XML for parsing (httr >= 1.1.0 dependency change)
274 | 
275 |     o work on recipes:
276 |       - clean up ChEA and Gencode
277 |       - don't export tracksToUpdate(); was broken and not used
278 |       - reorg man pages; combine Ensembl Fasta and TwoBit on single man page
279 | 
280 |     o work on updateResources():
281 |       - push data to S3 before inserting metadata in db
282 |       - isolate pushResources() and pushMetadata() from updateResources()
283 |       - NOTE: Epigenome unit test is failing due to bad url. If not fixed by
284 | 	the host the recipe will need to change.
285 | 
286 |     o update makedbSNPVCF() to look in new clinvar location
287 | 
288 | BUG FIXES
289 | 
290 |     o fix bugs in makedbSNPVCF() recipe related to genome and tags
291 | 
292 | 
293 | CHANGES IN VERSION 1.0.0
294 | ------------------------
295 | 
296 | BUG FIXES
297 | 
298 |     o ENSEMBL recipes discover gtf files on Windows.
299 | 
300 | 
301 | CHANGES IN VERSION 0.0.214
302 | --------------------------
303 | 
304 | NEW FEATURES
305 | 
306 |    o Have added vcf files from the following genome builds for humans
307 |     "human_9606/VCF/clinical_vcf_set/",
308 |     "human_9606_b141_GRCh37p13/VCF/",
309 |     "human_9606_b142_GRCh37p13/VCF/",
310 |     "human_9606_b142_GRCh37p13/VCF/clinical_vcf_set/"
311 | 
312 |    o For each genome build, where available, the following VCF file
313 |      formats are available
314 |      a) all.vcf.gz
315 |      b) all_papu.vcf.gz
316 |      c) common_all.vcf.gz
317 |      d) clinvar.vcf.gz
318 |      e) clinvar_papu
319 |      f) common_and_clinical
320 |      g) common_no_known_medical_impact
321 | 
322 |    o The user can refer to
323 |      http://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf/
324 |      for VCF file type formats
325 | 


--------------------------------------------------------------------------------
/man/makeAnnotationHubMetadata.Rd:
--------------------------------------------------------------------------------
  1 | \name{makeAnnotationHubMetadata}
  2 | 
  3 | \alias{makeAnnotationHubMetadata}
  4 | 
  5 | \title{
  6 |   Make AnnotationHubMetadata objects from csv file of metadata
  7 | }
  8 | 
  9 | \description{
 10 |   Make AnnotationHubMetadata objects from .csv files located in the
 11 |   "inst/extdata/" package directory of an AnnotationHub package.
 12 | }
 13 | 
 14 | 
 15 | \usage{
 16 |   makeAnnotationHubMetadata(pathToPackage, fileName=character())
 17 | }
 18 | 
 19 | \arguments{
 20 |   \item{pathToPackage}{
 21 |     Full path to data package including the package name; no trailing slash
 22 |   }
 23 |   \item{fileName}{
 24 |     Name of metadata file(s) with csv extension. If none are provided, all
 25 |     files with .csv extension in "inst/extdata" will be processed.
 26 |   }
 27 | }
 28 | 
 29 | \details{
 30 |   \itemize{
 31 |     \item{makeAnnotationHubMetadata:}{
 32 |       Reads the resource metadata from .csv files into a
 33 |       \link{AnnotationHubMetadata} object. The \link{AnnotationHubMetadata}
 34 |       is inserted in the AnnotationHub database. Intended for internal
 35 |       use or package authors checking the validity of package metadata.
 36 |     }
 37 |   }
 38 | 
 39 |   \itemize{
 40 |     \item{Formatting metadata files:}{
 41 | 
 42 |       \code{makeAnnotationHubMetadata} reads .csv files of metadata
 43 |       located in "inst/extdata". Internal functions perform checks for
 44 |       required columns and data types and can be used by package authors
 45 |       to validate their metadata before submitting the package for
 46 |       review.
 47 | 
 48 |       The rows of the .csv file(s) represent individual \code{Hub}
 49 |       resources (i.e., data objects) and the columns are the metadata
 50 |       fields. All fields should be a single character string of length 1.
 51 | 
 52 |       Required Fields in metadata file:
 53 |       \itemize{
 54 | 	\item Title: \code{character(1)}. Name of the resource. This can be
 55 | 	      the exact file name (if self-describing) or a more complete
 56 | 	      description.
 57 | 
 58 | 	\item Description: \code{character(1)}. Brief description of the
 59 | 	      resource, similar to the 'Description' field in a package
 60 | 	      DESCRIPTION file.
 61 | 
 62 | 	\item BiocVersion: \code{character(1)}. The first Bioconductor version
 63 | 	      the resource was made available for. Unless removed from
 64 | 	      the hub, the resource will be available for all versions
 65 | 	      greater than or equal to this field. Generally the current
 66 | 	      devel version of Bioconductor.
 67 | 
 68 | 	\item Genome: \code{character(1)}. Genome. Can be NA.
 69 | 
 70 | 	\item SourceType: \code{character(1)}. Format of original data, e.g., FASTA,
 71 | 	BAM, BigWig, etc. \code{getValidSourceTypes()} list currently
 72 | 	acceptable values. If nothing seems appropiate for your data
 73 | 	reach out to maintainer@bioconductor.org.
 74 | 
 75 | 	\item SourceUrl: \code{character(1)}. Optional location of original
 76 | 	      data files. Multiple urls should be provided as a comma separated
 77 | 	      string.
 78 | 
 79 | 	\item SourceVersion: \code{character(1)}. Version of original data.
 80 | 
 81 | 	\item Species: \code{character(1)}. Species. For help on valid
 82 | 	species see \code{getSpeciesList, validSpecies, or
 83 | 	suggestSpecies. Can be NA.}
 84 | 
 85 | 	\item TaxonomyId: \code{character(1)}. Taxonomy ID. There are
 86 | 	checks for valid taxonomyId given the Species which produce
 87 | 	warnings. See GenomeInfoDb::loadTaxonomyDb() for full validation
 88 | 	table. Can be NA.
 89 | 
 90 | 	\item Coordinate_1_based: \code{logical}. TRUE if data are
 91 | 	1-based. Can be NA
 92 | 
 93 | 	\item DataProvider: \code{character(1)}. Name of company or institution
 94 | 	      that supplied the original (raw) data.
 95 | 
 96 | 	\item Maintainer: \code{character(1)}. Maintainer name and email in the
 97 | 	      following format: Maintainer Name <username@address>.
 98 | 
 99 | 	\item RDataClass: \code{character(1)}. R / Bioconductor class the data
100 | 	      are stored in, e.g., GRanges, SummarizedExperiment,
101 | 	      ExpressionSet etc. If the file is loaded or read into R
102 | 	      what is the class of the object.
103 | 	      
104 | 	\item DispatchClass: \code{character(1)}. Determines how data are
105 | 	      loaded into R. The value for this field should be
106 | 	      \sQuote{Rda} if the data were serialized with \code{save()} and
107 | 	      \sQuote{Rds} if serialized with \code{saveRDS}. The filename
108 | 	      should have the appropriate \sQuote{rda} or \sQuote{rds}
109 | 	      extension. There are other available DispathClass types
110 | 	      and the function \code{AnnotationHub::DispatchClassList()}
111 | 	      
112 | 	      A number of dispatch classes are pre-defined in
113 | 	      AnnotationHub/R/AnnotationHubResource-class.R with the suffix
114 | 	      \sQuote{Resource}. For example, if you have sqlite files, the
115 | 	      AnnotationHubResource-class.R defines SQLiteFileResource so
116 | 	      the DispatchClass would be SQLiteFile. Contact
117 | 	      maintainer@bioconductor.org if you are not sure which class
118 | 	      to use. The function
119 | 	      \code{AnnotationHub::DispatchClassList()} will output a
120 | 	      matrix of currently implemented DispatchClass and brief
121 | 	      description of utility. If a predefine class does not seem
122 | 	      appropriate contact maintainer@bioconductor.org.  An all
123 | 	      purpose DispathClass is \code{FilePath} that instead of trying
124 | 	      to load the file into R, will only return the path to the
125 | 	      locally downloaded file.
126 | 	      
127 | 	\item Location_Prefix: \code{character(1)}. Do not include this field
128 | 	      if data are stored in the Bioconductor AWS S3; it will be
129 | 	      generated automatically.
130 | 
131 | 	      If data will be accessed from a location other than AWS S3
132 | 	      this field should be the base url.
133 | 
134 | 	\item RDataPath: \code{character()}.This field should be the
135 | 	      remainder of the path to the resource. The
136 | 	      \code{Location_Prefix} will be prepended to
137 | 	      \code{RDataPath} for the full path to the resource.
138 | 	      If the resource is stored in Bioconductor's AWS S3
139 | 	      buckets, it should start with the name of the package associated
140 | 	      with the metadata and should not start with a leading
141 | 	      slash. It should include the resource file name. For
142 | 	      strongly associated files, like a bam file and its index
143 | 	      file, the two files should be separates with a colon
144 | 	      \code{:}. This will link a single hub id with the multiple files.
145 | 
146 |        \item Tags: \code{character() vector}.
147 | 	     \sQuote{Tags} are search terms used to define a subset of
148 | 	     resources in a \code{Hub} object, e.g, in a call to \code{query}.
149 | 
150 | 	     \sQuote{Tags} are automatically generated from the
151 | 	     \sQuote{biocViews} in the DESCRIPTION and applied to all
152 | 	     resources of the metadata file. Optionally, maintainers can
153 | 	     define \sQuote{Tags} column of the metadata to define tags
154 | 	     for each resource individually. Multiple \sQuote{Tags} are
155 | 	     specified as a colon separated string, e.g., tags for two
156 | 	     resources would look like this:
157 | 
158 | 	     \preformatted{
159 | 	     Tags=c("tag1:tag2:tag3", "tag1:tag3")
160 | 	     }
161 | 
162 | 
163 | 
164 |       }
165 |       NOTE: The metadata file can have additional columns beyond the 'Required
166 |       Fields' listed above. These values are not added to the Hub database but
167 |       they can be used in package functions to provide an additional level of
168 |       metadata on the resources.
169 | 
170 |       More on \code{Location_Prefix} and \code{RDataPath}. These two fields make up
171 |       the complete file path url for downloading the data file. If using
172 |       the Bioconductor AWS S3 bucket the Location_Prefix should not be
173 |       included in the metadata file[s] as this field will be populated
174 |       automatically.  The \code{RDataPath} will be the directory structure you
175 |       uploaded to S3. If you uploaded a directory \sQuote{MyAnnotation/}, and
176 |       that directory had a subdirectory \sQuote{v1/} that contained two files
177 |       \sQuote{counts.rds} and \sQuote{coldata.rds}, your metadata file will contain
178 |       two rows and the RDataPaths would be \sQuote{MyAnnotation/v1/counts.rds}
179 |       and \sQuote{MyAnnotation/v1/coldata.rds}.  If you host your data on a
180 |       publicly accessible site you must include a base url as the
181 |       \code{Location_Prefix}. If your data file was at
182 |       \sQuote{ftp://myinstiututeserver/biostats/project2/counts.rds}, your
183 |       metadata file will have one row and the \code{Location_Prefix} would be
184 |       \sQuote{ftp://myinstiututeserver/} and the \code{RDataPath} would be
185 |       \sQuote{biostats/project2/counts.rds}.
186 |       
187 |     }
188 |   }
189 | }
190 | 
191 | \value{
192 |     A named list the length of \code{fileName}. Each element is a list of
193 |     of \code{AnnotationHubMetadata} objects created from the .csv file.
194 | }
195 | 
196 | \seealso{
197 |   \itemize{
198 |     \item \code{\link{updateResources}}
199 |     \item \code{\link{AnnotationHubMetadata}} class
200 |   }
201 | }
202 | 
203 | \examples{
204 | 
205 | ## Each row of the metadata file represents a resource added to one of
206 | ## the 'Hubs'. This example creates a metadata.csv file for a single resource.
207 | ## In the case of multiple resources, the arguments below would be character
208 | ## vectors that produced multiple rows in the data.frame.
209 | 
210 | meta <- data.frame(
211 |     Title = "RNA-Sequencing dataset from study XYZ",
212 |     Description = paste0("RNA-seq data from study XYZ containing 10 normal ",
213 | 			 "and 10 tumor samples represented as a",
214 | 			 "SummarizedExperiment"),
215 |     BiocVersion = "3.4",
216 |     Genome = "GRCh38",
217 |     SourceType = "BAM",
218 |     SourceUrl = "http://www.path/to/original/data/file",
219 |     SourceVersion = "Jan 01 2016",
220 |     Species = "Homo sapiens",
221 |     TaxonomyId = 9606,
222 |     Coordinate_1_based = TRUE,
223 |     DataProvider = "GEO",
224 |     Maintainer = "Your Name <youremail@provider.com>",
225 |     RDataClass = "SummarizedExperiment",
226 |     DispatchClass = "Rda",
227 |     ResourceName = "FileName.rda"
228 | )
229 | 
230 | \dontrun{
231 | ## Write the data out and put in the inst/extdata directory.
232 | write.csv(meta, file="metadata.csv", row.names=FALSE)
233 | 
234 | ## Test the validity of metadata.csv
235 | makeAnnotationHubMetadata("path/to/mypackage")
236 | }
237 | }
238 | 
239 | \keyword{methods}
240 | 


--------------------------------------------------------------------------------