├── tests ├── testthat │ ├── test-epiflu.R │ ├── data │ │ └── test.fa │ ├── test-core.R │ ├── test-login.R │ ├── test-fasta.R │ ├── test-download.R │ └── test-query.R └── testthat.R ├── NAMESPACE ├── .Rbuildignore ├── .gitignore ├── man ├── read_fasta.Rd ├── login.Rd ├── download.Rd └── query.Rd ├── GISAIDR.Rproj ├── DESCRIPTION ├── R ├── read_fasta.R ├── export_fasta.R ├── query.R ├── internal_query.R ├── login.R ├── core.R └── download.R ├── .github └── workflows │ └── r.yml ├── README.md └── GISAID_LOCATIONS.txt /tests/testthat/test-epiflu.R: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(GISAIDR) 3 | 4 | test_check("GISAIDR") 5 | -------------------------------------------------------------------------------- /tests/testthat/data/test.fa: -------------------------------------------------------------------------------- 1 | >A 2 | TTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAG 3 | >B 4 | TTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAG 5 | >C 6 | TTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAG 7 | >D 8 | TTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAG 9 | -------------------------------------------------------------------------------- /tests/testthat/test-core.R: -------------------------------------------------------------------------------- 1 | test_that("createCommand returns list", { 2 | expect_true(is.list(createCommand(1,2,3,"Go", list()))) 3 | }) 4 | 5 | 6 | test_that("formatDataForRequest returns str", { 7 | expect_true(is.character(formatDataForRequest(1,2,3, list(), 1))) 8 | }) 9 | -------------------------------------------------------------------------------- /man/read_fasta.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read_fasta.R 3 | \name{read_fasta} 4 | \alias{read_fasta} 5 | \title{Read FASTA amino acids file into a dataframe} 6 | \usage{ 7 | read_fasta(file = NULL, get_sequence = TRUE) 8 | } 9 | \description{ 10 | This function reads a FASTA amino acids file into a dataframe 11 | } 12 | \details{ 13 | TAKEN FROM AMPIR: https://github.com/Legana/ampir 14 | } 15 | -------------------------------------------------------------------------------- /GISAIDR.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: GISAIDR 2 | Type: Package 3 | Title: R wrapper for GISAID "API" 4 | Version: 0.9.10 5 | Author: Wytamma Wirth 6 | Maintainer: Wytamma Wirth 7 | Description: programmatically interact with the GISAID database query. 8 | License: Please see the GISAID license 9 | Encoding: UTF-8 10 | LazyData: true 11 | Depends: R (>= 3.6.0) 12 | Suggests: 13 | testthat (>= 3.0.0) 14 | Config/testthat/edition: 3 15 | Imports: 16 | httr, 17 | rjson, 18 | xml2, 19 | openssl 20 | RoxygenNote: 7.1.1 21 | -------------------------------------------------------------------------------- /man/login.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/login.R 3 | \name{login} 4 | \alias{login} 5 | \title{Login to GISAID} 6 | \usage{ 7 | login(username, password, database = "EpiCoV") 8 | } 9 | \arguments{ 10 | \item{username}{GISAID username.} 11 | 12 | \item{password}{GISAID password.} 13 | } 14 | \value{ 15 | credentials used to query GISAID. 16 | } 17 | \description{ 18 | Login to GISAID 19 | } 20 | \examples{ 21 | username = Sys.getenv("GISAIDR_USERNAME") 22 | password = Sys.getenv("GISAIDR_PASSWORD") 23 | login(username, password) 24 | } 25 | -------------------------------------------------------------------------------- /man/download.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download.R 3 | \name{download} 4 | \alias{download} 5 | \title{Download from GISAID} 6 | \usage{ 7 | download( 8 | credentials, 9 | list_of_accession_ids, 10 | get_sequence = FALSE, 11 | clean_up = TRUE 12 | ) 13 | } 14 | \arguments{ 15 | \item{credentials}{GISAID credentials.} 16 | 17 | \item{list_of_accession_ids}{list of accession_id from GISAID} 18 | 19 | \item{get_sequence}{load sequences into data.frame after download} 20 | 21 | \item{clean_up}{delete downloaded files (e.g. fasta files) after download} 22 | } 23 | \value{ 24 | data.frame of complete data 25 | } 26 | \description{ 27 | Download from GISAID 28 | } 29 | -------------------------------------------------------------------------------- /tests/testthat/test-login.R: -------------------------------------------------------------------------------- 1 | test_that("login returns credentials", { 2 | username = Sys.getenv("GISAIDR_USERNAME") 3 | password = Sys.getenv("GISAIDR_PASSWORD") 4 | credentials <- login(username = username, password = password) 5 | expect_true(is.list(credentials)) 6 | 7 | }) 8 | 9 | test_that("invalid login fails", { 10 | username = 'NOT_A_REAL_PERSON' 11 | password = Sys.getenv("GISAIDR_PASSWORD") 12 | expect_error(login(username = username, password = password), 13 | "Username or password wrong!") 14 | }) 15 | 16 | test_that("EpiPox login works", { 17 | username = Sys.getenv("GISAIDR_USERNAME") 18 | password = Sys.getenv("GISAIDR_PASSWORD") 19 | credentials <- login(username = username, password = password, database="EpiPox") 20 | expect_true(is.list(credentials)) 21 | }) 22 | 23 | 24 | test_that("EpiRSV login works", { 25 | username = Sys.getenv("GISAIDR_USERNAME") 26 | password = Sys.getenv("GISAIDR_PASSWORD") 27 | credentials <- login(username = username, password = password, database="EpiRSV") 28 | expect_true(is.list(credentials)) 29 | }) 30 | -------------------------------------------------------------------------------- /R/read_fasta.R: -------------------------------------------------------------------------------- 1 | #' Read FASTA amino acids file into a dataframe 2 | #' 3 | #' This function reads a FASTA amino acids file into a dataframe 4 | #' 5 | #' TAKEN FROM AMPIR: https://github.com/Legana/ampir 6 | 7 | 8 | read_fasta <- function (file = NULL, get_sequence=TRUE) { 9 | fasta_lines <- readLines(file) 10 | 11 | ### get sequence names 12 | seq_name_index <- grep(">", fasta_lines) 13 | strain <- gsub(">", "", fasta_lines[seq_name_index]) 14 | 15 | if (get_sequence) { 16 | ### get sequence 17 | seq_aa_start_index <- seq_name_index + 1 18 | seq_aa_end_index <- c(seq_name_index, length(fasta_lines)+1)[-1]-1 19 | 20 | sequence <- rep(NA, length(seq_name_index)) 21 | 22 | ### replace NA content with actual sequence content, and concatenate the lines 23 | for(i in seq_along(seq_name_index)){ 24 | seq_aa_start <- seq_aa_start_index[i] 25 | seq_aa_end <- seq_aa_end_index[i] 26 | sequence[i] <- gsub("[[:space:]]", "", 27 | paste(fasta_lines[seq_aa_start:seq_aa_end], 28 | collapse = "")) 29 | } 30 | 31 | return(data.frame(strain, sequence, stringsAsFactors = FALSE)) 32 | } else { 33 | return(data.frame(strain, stringsAsFactors = FALSE)) 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /.github/workflows/r.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | # 6 | # See https://github.com/r-lib/actions/tree/master/examples#readme for 7 | # additional example workflows available for the R community. 8 | 9 | name: Build 10 | 11 | on: 12 | push: 13 | branches: [ master ] 14 | pull_request: 15 | branches: [ master ] 16 | schedule: 17 | - cron: '0 0 * * *' 18 | 19 | 20 | jobs: 21 | build: 22 | runs-on: ${{ matrix.config.os }} 23 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 24 | strategy: 25 | fail-fast: false 26 | max-parallel: 1 27 | matrix: 28 | config: 29 | - {os: macOS-latest, r: 'release'} 30 | - {os: windows-latest, r: 'release'} 31 | 32 | steps: 33 | - uses: actions/checkout@v2 34 | - name: Set up R ${{ matrix.config.os }} (${{ matrix.config.r }}) 35 | uses: r-lib/actions/setup-r@v2 36 | with: 37 | r-version: ${{ matrix.config.r }} 38 | http-user-agent: ${{ matrix.config.http-user-agent }} 39 | - name: Install dependencies 40 | run: | 41 | install.packages(c("remotes", "rcmdcheck", "curl")) 42 | remotes::install_deps(dependencies = TRUE) 43 | shell: Rscript {0} 44 | - name: Check 45 | env: 46 | GISAIDR_PASSWORD: ${{ secrets.GISAIDR_PASSWORD }} 47 | GISAIDR_USERNAME: ${{ secrets.GISAIDR_USERNAME }} 48 | run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") 49 | shell: Rscript {0} 50 | -------------------------------------------------------------------------------- /man/query.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/query.R 3 | \name{query} 4 | \alias{query} 5 | \title{Query GISAID Database} 6 | \usage{ 7 | query( 8 | credentials, 9 | location = NULL, 10 | lineage = NULL, 11 | from = NULL, 12 | from_subm = NULL, 13 | to = NULL, 14 | to_subm = NULL, 15 | start_index = 0, 16 | nrows = 50, 17 | load_all = FALSE, 18 | low_coverage_excl = FALSE, 19 | complete = FALSE, 20 | high_coverage = FALSE, 21 | collection_date_complete = FALSE, 22 | total = FALSE 23 | ) 24 | } 25 | \arguments{ 26 | \item{credentials}{GISAID credentials.} 27 | 28 | \item{location}{search for entries based on geographic location.} 29 | 30 | \item{lineage}{search for entries based on pango lineage designations.} 31 | 32 | \item{from}{search from specific collection date.} 33 | 34 | \item{from_subm}{search from specific submission date.} 35 | 36 | \item{to}{search to specific collection date.} 37 | 38 | \item{to_subm}{search to specific submission date.} 39 | 40 | \item{start_index}{page through results.} 41 | 42 | \item{nrows}{number of results to return.} 43 | 44 | \item{load_all}{return all results.} 45 | 46 | \item{low_coverage_excl}{exclude low coverage entries from the results.} 47 | 48 | \item{complete}{include only complete entries in the results.} 49 | 50 | \item{high_coverage}{include only high coverage entries in the results.} 51 | 52 | \item{collection_date_complete}{include only entries with complete in collection date the results.} 53 | 54 | \item{total}{returns the total number of sequences matching the query.} 55 | } 56 | \value{ 57 | Dataframe. 58 | } 59 | \description{ 60 | Query GISAID Database 61 | } 62 | -------------------------------------------------------------------------------- /tests/testthat/test-fasta.R: -------------------------------------------------------------------------------- 1 | test_that("read_fasta works", { 2 | fa_df <- read_fasta('data/test.fa') 3 | expect_equal(names(fa_df), c('strain', 'sequence')) 4 | expect_equal(fa_df$strain, c('A', 'B', 'C', 'D')) 5 | expect_equal(fa_df$sequence[4], 'TTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAG') 6 | }) 7 | 8 | 9 | test_that("export_fasta works", { 10 | 11 | # Define a temporary file for testing 12 | temp_file <- tempfile(fileext = ".fasta") 13 | 14 | # Mock seqs data frame 15 | seqs <- data.frame(country = c("Country1", "Country2"), 16 | pangolin_lineage = c("Lineage1", "Lineage2"), 17 | accession_id = c("Accession1", "Accession2"), 18 | date = as.Date(c("2023-01-01", "2023-02-01")), 19 | description = c("Description1", "Description2"), 20 | sequence = c("AGCT", "TCGA")) 21 | 22 | # Apply the function 23 | export_fasta(seqs, temp_file, columns = c("country", "pangolin_lineage", "accession_id", "date"), date_format = "%Y-%m") 24 | 25 | # Check if file exists 26 | expect_true(file.exists(temp_file)) 27 | 28 | # Read the generated file 29 | fasta_content <- readLines(temp_file) 30 | 31 | # Check the format of the FASTA headers 32 | fasta_headers <- fasta_content[seq(1, length(fasta_content), 2)] 33 | expected_headers <- c(">Country1@Lineage1@Accession1@2023-01", 34 | ">Country2@Lineage2@Accession2@2023-02") 35 | expect_equal(fasta_headers, expected_headers) 36 | 37 | # Check the sequences 38 | fasta_sequences <- trimws(fasta_content[seq(2, length(fasta_content), 2)]) 39 | expected_sequences <- c("AGCT", "TCGA") 40 | expect_equal(fasta_sequences, expected_sequences) 41 | 42 | # Remove temporary file 43 | unlink(temp_file) 44 | }) 45 | -------------------------------------------------------------------------------- /R/export_fasta.R: -------------------------------------------------------------------------------- 1 | #' Export Fasta 2 | #' 3 | #' @param seqs dataframe containing sequence information, output of function `query` 4 | #' @param out_file_name output file name 5 | #' @param export_dated_only Should entries without dates be exported? Set to TRUE to exlude sequences without a date (default) 6 | #' @param delimiter fasta sequence delimiter. Default `@` 7 | #' @param data_format Optional, specify date format eg for ISO format use '%Y-%m-%d' 8 | #' @param columns Specify columns in the input dataframe to use as fasta headers 9 | #' @return data.frame of complete data 10 | export_fasta <- function( 11 | seqs, 12 | out_file_name, 13 | export_dated_only = T, 14 | delimiter='@', 15 | date_format=NULL, 16 | columns=c('country', 'pangolin_lineage', 'accession_id', 'date') 17 | ){ 18 | if (all(columns %in% names(seqs))) { 19 | if('date' %in% columns){ 20 | if (is.null(date_format)) { 21 | dates <- round(lubridate::decimal_date(lubridate::ymd(seqs$date)), 3) 22 | } else { 23 | dates <- format(as.Date(seqs$date), date_format) 24 | } 25 | if(export_dated_only){ 26 | seqs <- seqs[!is.na(dates), ] 27 | dates <- dates[!is.na(dates)] 28 | } 29 | seqs$date <- dates 30 | } 31 | newnames <- do.call(paste, c(lapply(columns, function(x) {seqs[,x]}), sep = delimiter)) 32 | } else { 33 | missing_columns <- setdiff(columns, names(seqs)) 34 | message(sprintf("Missing columns %s. Defaulting to `description`", missing_columns)) 35 | newnames <- paste(seqs$description, sep = delimiter) 36 | } 37 | 38 | cat('', file = out_file_name) 39 | for(i in 1:nrow(seqs)){ 40 | cat('>', newnames[i], '\n', sep= '', file = out_file_name, append = T) 41 | cat(seqs$sequence[i], '\n', file = out_file_name, append = T) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /tests/testthat/test-download.R: -------------------------------------------------------------------------------- 1 | username = Sys.getenv("GISAIDR_USERNAME") 2 | password = Sys.getenv("GISAIDR_PASSWORD") 3 | credentials <- login(username = username, password = password) 4 | 5 | test_that("download returns data.frame", { 6 | df <- query(credentials = credentials, nrows = 2) 7 | list_of_accession_ids <- df$accession_id 8 | full_df <- download(credentials, list_of_accession_ids) 9 | expect_true(is.data.frame(full_df)) 10 | }) 11 | 12 | test_that("get sequence works", { 13 | df <- query(credentials = credentials, nrows = 2) 14 | list_of_accession_ids <- df$accession_id 15 | full_df <- download(credentials, list_of_accession_ids, get_sequence=TRUE) 16 | expect_true(hasName(full_df, "sequence")) 17 | }) 18 | 19 | test_that("download_files works for EpiCoV", { 20 | df <- query(credentials = credentials, nrows = 2) 21 | list_of_accession_ids <- df$accession_id 22 | download_results <- download_files(credentials, list_of_accession_ids, dates_and_location=TRUE, patient_status=TRUE, sequencing_technology=TRUE, augur_input=TRUE) 23 | dates_and_location <- download_results$dates_and_location 24 | patient_status <- download_results$patient_status 25 | sequencing_technology <- download_results$sequencing_technology 26 | augur_input_metadata <- download_results$augur_input$metadata 27 | augur_input_sequences <- download_results$augur_input$sequences 28 | 29 | download_results <- download_files(credentials, list_of_accession_ids, sequences=TRUE) 30 | sequences = download_results$sequences 31 | 32 | expect_true( 33 | is.data.frame(dates_and_location) && is.data.frame(patient_status) && is.data.frame(sequencing_technology) && is.data.frame(augur_input_metadata) && 34 | !is.null(augur_input_sequences) && !is.null(sequences) 35 | ) 36 | }) 37 | 38 | credentials <- login(username = username, password = password, database="EpiPox") 39 | 40 | test_that("download_files works for EpiPox", { 41 | df <- query(credentials = credentials, nrows = 2) 42 | list_of_accession_ids <- df$accession_id 43 | download_results <- download_files(credentials, list_of_accession_ids, dates_and_location=TRUE, patient_status=TRUE, sequencing_technology=TRUE, sequences=TRUE) 44 | dates_and_location <- download_results$dates_and_location 45 | patient_status <- download_results$patient_status 46 | sequencing_technology <- download_results$sequencing_technology 47 | sequences = download_results$sequences 48 | 49 | expect_true( 50 | is.data.frame(dates_and_location) && is.data.frame(patient_status) && is.data.frame(sequencing_technology) && 51 | !is.null(sequences) 52 | ) 53 | }) 54 | 55 | credentials <- login(username = username, password = password, database="EpiRSV") 56 | 57 | test_that("download_files works for EpiRSV", { 58 | df <- query(credentials = credentials, nrows = 2) 59 | list_of_accession_ids <- df$accession_id 60 | download_results <- download_files(credentials, list_of_accession_ids, dates_and_location=TRUE, patient_status=TRUE, sequencing_technology=TRUE, sequences=TRUE) 61 | dates_and_location <- download_results$dates_and_location 62 | patient_status <- download_results$patient_status 63 | sequencing_technology <- download_results$sequencing_technology 64 | sequences = download_results$sequences 65 | 66 | expect_true( 67 | is.data.frame(dates_and_location) && is.data.frame(patient_status) && is.data.frame(sequencing_technology) && 68 | !is.null(sequences) 69 | ) 70 | }) 71 | -------------------------------------------------------------------------------- /R/query.R: -------------------------------------------------------------------------------- 1 | 2 | #' Query GISAID Database 3 | #' 4 | #' @param credentials GISAID credentials. 5 | #' @param text full text search. 6 | #' @param location search for entries based on geographic location. 7 | #' @param lineage search for entries based on pango lineage designations. 8 | #' @param variant search for entries based on variant designation 9 | #' @param from search from specific collection date. 10 | #' @param to search to specific collection date. 11 | #' @param from_subm search from specific submission date. 12 | #' @param to_subm search to specific submission date. 13 | #' @param virus_name search for a virus_name. 14 | #' @param order_by order results by a column. 15 | #' @param order_asc order_by results in ascending order. 16 | #' @param start_index page through results. 17 | #' @param nrows number of results to return. 18 | #' @param load_all return all results. 19 | #' @param low_coverage_excl exclude low coverage entries from the results. 20 | #' @param complete include only complete entries in the results. 21 | #' @param high_coverage include only high coverage entries in the results. 22 | #' @param collection_date_complete include only entries with complete in collection date the results. 23 | #' @param total returns the total number of sequences matching the query. 24 | #' @param fast returns all of the accession_ids that match the query. 25 | #' @param aa_substitution_ceid returns all sequences with the selected amino acid mutation 26 | #' @param nucl_mutation_ceid returns all sequences with the selected nucleotide mutation 27 | #' @param subtype returns all sequences with the selected subtype 28 | #' @return data.frame 29 | query <- 30 | function(credentials, 31 | text = NULL, 32 | location = NULL, 33 | lineage = NULL, 34 | variant = NULL, 35 | from = NULL, 36 | from_subm = NULL, 37 | to = NULL, 38 | to_subm = NULL, 39 | virus_name = NULL, 40 | order_by = NULL, 41 | aa_substitution = NULL, 42 | nucl_mutation = NULL, 43 | subtype = NULL, 44 | order_asc = TRUE, 45 | start_index = 0, 46 | nrows = 50, 47 | load_all = FALSE, 48 | low_coverage_excl = FALSE, 49 | complete = FALSE, 50 | high_coverage = FALSE, 51 | collection_date_complete = FALSE, 52 | total = FALSE, 53 | fast = FALSE) { 54 | 55 | if (nrows > 50 && !total && !load_all && !fast) { 56 | message(paste0("Loading entries in batches...")) 57 | batches <- create_batches(start_index = start_index, nrows = nrows) 58 | results <- data.frame() 59 | for (i in 1:nrow(batches)) { 60 | results <- rbind(results, internal_query( 61 | credentials = credentials, 62 | text = text, 63 | location = location, 64 | lineage = lineage, 65 | variant = variant, 66 | from = from, 67 | from_subm = from_subm, 68 | to = to, 69 | to_subm = to_subm, 70 | virus_name = virus_name, 71 | order_by = order_by, 72 | order_asc = order_asc, 73 | start_index = batches[i,1], 74 | nrows = batches[i,2], 75 | low_coverage_excl = low_coverage_excl, 76 | complete = complete, 77 | high_coverage = high_coverage, 78 | collection_date_complete = collection_date_complete, 79 | aa_substitution = aa_substitution, 80 | nucl_mutation = nucl_mutation, 81 | subtype = subtype 82 | )) 83 | } 84 | results <- results[!duplicated(results$accession_id), ] # Remove duplicate entries 85 | rownames(results) <- NULL # reset index 86 | return(results) 87 | } else if (total | load_all | fast) { 88 | return( 89 | internal_query( 90 | credentials = credentials, 91 | text = text, 92 | location = location, 93 | lineage = lineage, 94 | variant = variant, 95 | from = from, 96 | from_subm = from_subm, 97 | to = to, 98 | to_subm = to_subm, 99 | virus_name = virus_name, 100 | order_by = order_by, 101 | order_asc = order_asc, 102 | load_all = load_all, 103 | low_coverage_excl = low_coverage_excl, 104 | complete = complete, 105 | high_coverage = high_coverage, 106 | collection_date_complete = collection_date_complete, 107 | aa_substitution = aa_substitution, 108 | nucl_mutation = nucl_mutation, 109 | total = total, 110 | fast = fast, 111 | subtype = subtype 112 | ) 113 | ) 114 | } else { 115 | return( 116 | internal_query( 117 | credentials = credentials, 118 | text = text, 119 | location = location, 120 | lineage = lineage, 121 | variant = variant, 122 | from = from, 123 | from_subm = from_subm, 124 | to = to, 125 | to_subm = to_subm, 126 | virus_name = virus_name, 127 | order_by = order_by, 128 | order_asc = order_asc, 129 | start_index = start_index, 130 | nrows = nrows, 131 | low_coverage_excl = low_coverage_excl, 132 | complete = complete, 133 | high_coverage = high_coverage, 134 | collection_date_complete = collection_date_complete, 135 | aa_substitution = aa_substitution, 136 | nucl_mutation = nucl_mutation, 137 | subtype = subtype 138 | ) 139 | ) 140 | } 141 | 142 | 143 | } 144 | -------------------------------------------------------------------------------- /tests/testthat/test-query.R: -------------------------------------------------------------------------------- 1 | username = Sys.getenv("GISAIDR_USERNAME") 2 | password = Sys.getenv("GISAIDR_PASSWORD") 3 | credentials <- login(username = username, password = password) 4 | # will break if log in fails... 5 | 6 | test_that("basic query works", { 7 | df <- query(credentials = credentials) 8 | expect_true(is.data.frame(df)) 9 | }) 10 | 11 | test_that("can change number of rows", { 12 | df <- query(credentials = credentials, nrows = 100, location = "Africa / ...") 13 | expect_equal(nrow(df), 100) 14 | }) 15 | 16 | test_that("can change index", { 17 | df1 <- query(credentials = credentials, 18 | start_index = 0) 19 | df2 <- query(credentials = credentials, 20 | start_index = 49) 21 | expect_true(df2[1, 1] == df1[50, 1]) 22 | }) 23 | 24 | test_that("expried session fails", { 25 | sid <- credentials$sid 26 | credentials$sid = "890C95496CO0CQ007Z8HDSG34GF1JZ3Z" 27 | expect_error(query( 28 | credentials = credentials 29 | ), 30 | "The session has expired. Please login again.") 31 | credentials$sid <- sid 32 | }) 33 | 34 | test_that("location search works", { 35 | df <- query(credentials = credentials, 36 | location = 'Australia') 37 | expect_true(all(lapply(df$location, 38 | function(x) 39 | grepl("Australia", x, fixed = TRUE)))) 40 | }) 41 | 42 | test_that("lineage search works", { 43 | df <- query(credentials = credentials, 44 | lineage = 'W.1') 45 | # need a better way to test this... 46 | expect_true(is.data.frame(df)) 47 | expect_true(nrow(df) == 50) 48 | }) 49 | 50 | test_that("combination search works", { 51 | df <- query(credentials = credentials, 52 | location = 'Australia', 53 | lineage = 'W.1') 54 | expect_true(df$accession_id[1] == "EPI_ISL_678350") 55 | }) 56 | 57 | test_that("load all works", { 58 | df <- query(credentials = credentials, 59 | lineage = 'W.1', 60 | load_all = TRUE) 61 | expect_true(nrow(df) > 50) 62 | }) 63 | 64 | test_that("date search works", { 65 | df <- 66 | query(credentials = credentials, 67 | from = '2021-04-05', 68 | to = '2021-04-05') 69 | expect_true(nrow(df) == 50) 70 | expect_true(df[10,]$collection_date == "2021-04-05") 71 | }) 72 | 73 | test_that("low_coverage_excl works", { 74 | df <- query(credentials = credentials, low_coverage_excl = TRUE) 75 | expect_true(nrow(df) == 50) 76 | expect_true(length(grep("Long stretches of NNNs", df$information)) == 0) 77 | }) 78 | 79 | test_that("complete works", { 80 | df <- query(credentials = credentials, complete = TRUE) 81 | expect_true(nrow(df) == 50) 82 | expect_true(all(df$length > 29000)) 83 | }) 84 | 85 | test_that("submission date search works", { 86 | df <- 87 | query(credentials = credentials, 88 | from_subm = '2021-04-05', 89 | to_subm = '2021-04-05') 90 | expect_true(nrow(df) == 50) 91 | expect_true(all(df$submission_date == "2021-04-05")) 92 | }) 93 | 94 | test_that("collection date complete works", { 95 | df <- 96 | query( 97 | credentials = credentials, 98 | lineage = 'BA.1', 99 | location = 'Australia', 100 | collection_date_complete = T 101 | ) 102 | expect_true(nrow(df) == 50) 103 | expect_true(all(nchar(df$collection_date) == 10)) 104 | }) 105 | 106 | test_that("high coverage works", { 107 | df <- 108 | query(credentials = credentials, high_coverage = T) 109 | expect_true(nrow(df) == 50) 110 | expect_true(length(grep("warn_sign", df$information)) == 0) 111 | }) 112 | 113 | test_that("total returns total", { 114 | total <- 115 | query(credentials = credentials, total=T) 116 | expect_true(is.numeric(total)) 117 | }) 118 | 119 | test_that("variant search works", { 120 | df <- 121 | query(credentials = credentials, variant='omicron') 122 | # variant information is not returned from query or download... 123 | # need a better way to test this... 124 | expect_true(is.data.frame(df)) 125 | expect_true(nrow(df) == 50) 126 | }) 127 | 128 | test_that("virus name search works", { 129 | df <- 130 | query(credentials = credentials, virus_name='hCoV-19/Ireland/D-BHTEST/2022') 131 | expect_true(is.data.frame(df)) 132 | expect_true(nrow(df) == 1) 133 | expect_true(df[,'virus_name'] == 'hCoV-19/Ireland/D-BHTEST/2022') 134 | }) 135 | 136 | test_that("fast works", { 137 | df <- query(credentials = credentials, 138 | lineage = 'W.1', 139 | fast = TRUE) 140 | expect_true(nrow(df) > 50) 141 | }) 142 | 143 | test_that("order_by works", { 144 | df <- query(credentials = credentials, order_by = 'submission_date') 145 | expect_true(df$submission_date[1] == "2020-01-10") 146 | }) 147 | 148 | test_that("aa_substitution works", { 149 | df <- query(credentials = credentials, 150 | aa_substitution = 'Spike_E484Q, Spike_H69del, -N_P13L', 151 | to_subm = '2023-02-22', 152 | order_by='submission_date') 153 | expect_true(is.data.frame(df)) 154 | expect_equal(df$submission_date[1], "2021-01-25") 155 | expect_equal(df$virus_name[1], "hCoV-19/England/PORT-2E19CC/2021") 156 | }) 157 | 158 | test_that("nucl_mutation works", { 159 | df <- query(credentials = credentials, 160 | nucl_mutation = '-T23599G, -C10029T, -C14408T, -A23403G, T22679C, G28881A, A24424T', 161 | to_subm = '2023-02-22', 162 | order_by='submission_date') 163 | expect_true(is.data.frame(df)) 164 | 165 | 166 | expect_equal(df$submission_date[1],"2021-12-29") 167 | }) 168 | 169 | test_that("text search works", { 170 | accession_ids = c("EPI_ISL_17398411", "EPI_ISL_17199001", "EPI_ISL_17409201", "EPI_ISL_17243716") 171 | df <- query(credentials = credentials, text = paste(accession_ids, collapse = "\n")) 172 | expect_true(nrow(df) == 4) 173 | }) 174 | 175 | test_that("empty result returns empty df", { 176 | df <- query( 177 | credentials = credentials, 178 | lineage = "FAKE.1.1.2", 179 | location = 'Australia', 180 | fast = TRUE 181 | ) 182 | expect_true(nrow(df) == 0) 183 | }) 184 | 185 | # ----------------------------------------------------------------------------- 186 | # EpiPox 187 | 188 | credentials <- login(username = username, password = password, database="EpiPox") 189 | # will break if log in fails... 190 | 191 | test_that("EpiPox basic query works", { 192 | df <- query(credentials = credentials) 193 | expect_true(is.data.frame(df)) 194 | }) 195 | 196 | test_that("EpiPox lineage search works using clades", { 197 | df <- query(credentials = credentials, lineage = 'IIb') 198 | # need a better way to test this... 199 | expect_true(is.data.frame(df)) 200 | expect_true(nrow(df) == 50) 201 | }) 202 | 203 | test_that("EpiPox lineage search works using lineages", { 204 | df <- query(credentials = credentials, lineage = 'IIb B.1.20') 205 | # need a better way to test this... 206 | expect_true(is.data.frame(df)) 207 | expect_true(nrow(df) == 50) 208 | }) 209 | 210 | test_that("EpiPox high coverage works", { 211 | df <- query(credentials = credentials, high_coverage = TRUE) 212 | expect_true(nrow(df) == 50) 213 | expect_true(length(grep("warn_sign", df$information)) == 0) 214 | }) 215 | 216 | # ----------------------------------------------------------------------------- 217 | # EpiRSV 218 | 219 | credentials <- login(username = username, password = password, database="EpiRSV") 220 | # will break if log in fails... 221 | 222 | test_that("EpiRSV basic query works", { 223 | df <- query(credentials = credentials) 224 | expect_true(is.data.frame(df)) 225 | }) 226 | 227 | test_that("EpiRSV subtype search works", { 228 | total_a <- query(credentials = credentials, subtype="A", total=TRUE) 229 | total_b <- query(credentials = credentials, subtype="B", total=TRUE) 230 | expect_true(total_a != total_b) 231 | }) 232 | 233 | test_that("EpiRSV complete works", { 234 | df <- query(credentials = credentials, complete = TRUE) 235 | expect_true(nrow(df) == 50) 236 | expect_true(all(df$length > 15000)) 237 | }) 238 | 239 | test_that("EpiRSV high coverage works", { 240 | df <- query(credentials = credentials, high_coverage = TRUE) 241 | expect_true(nrow(df) == 50) 242 | expect_true(length(grep("warn_sign", df$information)) == 0) 243 | }) 244 | -------------------------------------------------------------------------------- /R/internal_query.R: -------------------------------------------------------------------------------- 1 | #' Query GISAID Database 2 | #' 3 | #' @param credentials GISAID credentials. 4 | #' @param text full text search. 5 | #' @param location search for entries based on geographic location. 6 | #' @param lineage search for entries based on pango lineage designations. 7 | #' @param variant search for entries based on variant designation 8 | #' @param from search from specific collection date. 9 | #' @param to search to specific collection date. 10 | #' @param from_subm search from specific submission date. 11 | #' @param to_subm search to specific submission date. 12 | #' @param virus_name search for a virus_name. 13 | #' @param order_by order results by a column. 14 | #' @param order_asc order_by results in ascending order. 15 | #' @param start_index page through results. 16 | #' @param nrows number of results to return. 17 | #' @param load_all return all results. 18 | #' @param low_coverage_excl exclude low coverage entries from the results. 19 | #' @param complete include only complete entries in the results. 20 | #' @param high_coverage include only high coverage entries in the results. 21 | #' @param collection_date_complete include only entries with complete in collection date the results. 22 | #' @param total returns the total number of sequences matching the query. 23 | #' @param fast returns all of the accession_ids that match the query. 24 | #' @param aa_substitution returns all sequences with the amino acid mutation(s), negative selection by '-' prefix 25 | #' @param nucl_mutation returns all sequences with the nucleotide mutation(s), negative selection by '-' prefix 26 | #' @param subtype returns all sequences with the subtype 27 | #' @return Dataframe. 28 | internal_query <- 29 | function(credentials, 30 | text = NULL, 31 | location = NULL, 32 | lineage = NULL, 33 | variant = NULL, 34 | from = NULL, 35 | from_subm = NULL, 36 | to = NULL, 37 | to_subm = NULL, 38 | virus_name = NULL, 39 | order_by = NULL, 40 | aa_substitution = NULL, 41 | nucl_mutation = NULL, 42 | subtype = NULL, 43 | order_asc = TRUE, 44 | start_index = 0, 45 | nrows = 50, 46 | load_all = FALSE, 47 | low_coverage_excl = FALSE, 48 | complete = FALSE, 49 | high_coverage = FALSE, 50 | collection_date_complete = FALSE, 51 | total = FALSE, 52 | fast = FALSE 53 | ) { 54 | 55 | df <- tryCatch({ 56 | queue = list() 57 | 58 | # Simple Text Filter (EpiCoV only) 59 | if (!is.null(text) && credentials$database == "EpiCoV") { 60 | new_queue <- create_search_queue(credentials, credentials$text_ceid, text, 'DoSimpleSearch') 61 | queue <- append(queue, new_queue) 62 | } 63 | 64 | # Location Filter (All) 65 | if (!is.null(location)) { 66 | new_queue <- create_search_queue(credentials, credentials$location_ceid, location, 'FilterChange') 67 | queue <- append(queue, new_queue) 68 | } 69 | 70 | # Collection Date Complete (All) 71 | if (collection_date_complete) { 72 | new_queue <- create_search_queue(credentials, credentials$collection_date_complete_ceid, list('coldc'), 'FilterChange') 73 | queue <- append(queue, new_queue) 74 | } 75 | 76 | # Lineage (EpiCoV, EpiPox) 77 | if (!is.null(lineage)) { 78 | if (credentials$database == "EpiCoV"){ 79 | new_queue <- create_search_queue(credentials, credentials$lineage_ceid, lineage, 'LineageChange') 80 | queue <- append(queue, new_queue) 81 | } else if (credentials$database == "EpiPox") { 82 | new_queue <- create_search_queue(credentials, credentials$lineage_ceid, lineage, 'FilterChange') 83 | queue <- append(queue, new_queue) 84 | } 85 | } 86 | 87 | # Subtype (EpiRSV) 88 | if (!is.null(subtype) && credentials$database == "EpiRSV") { 89 | new_queue <- create_search_queue(credentials, credentials$subtype_ceid, subtype, 'FilterChange') 90 | queue <- append(queue, new_queue) 91 | } 92 | 93 | # Variant (EpiCoV) 94 | if (!is.null(variant) && credentials$database == "EpiCoV") { 95 | new_queue <- create_search_queue(credentials, credentials$variant_ceid, variant, 'VariantsChange') 96 | queue <- append(queue, new_queue) 97 | } 98 | 99 | # Virus Name (All) 100 | if (!is.null(virus_name)) { 101 | new_queue <- create_search_queue(credentials, credentials$virus_name_ceid, virus_name, 'FilterChange') 102 | queue <- append(queue, new_queue) 103 | } 104 | 105 | # From, collection date (All) 106 | if (!is.null(from)) { 107 | new_queue <- create_search_queue(credentials, credentials$from_ceid, from, 'FilterChange') 108 | queue <- append(queue, new_queue) 109 | } 110 | 111 | # From, submission date (All) 112 | if (!is.null(from_subm)) { 113 | new_queue <- create_search_queue(credentials, credentials$from_sub_ceid, from_subm, 'FilterChange') 114 | queue <- append(queue, new_queue) 115 | } 116 | 117 | # To, collection date (All) 118 | if (!is.null(to)) { 119 | new_queue <- create_search_queue(credentials, credentials$to_ceid, to, 'FilterChange') 120 | queue <- append(queue, new_queue) 121 | } 122 | 123 | # To, submission date (All) 124 | if (!is.null(to_subm)) { 125 | new_queue <- create_search_queue(credentials, credentials$to_sub_ceid, to_subm, 'FilterChange') 126 | queue <- append(queue, new_queue) 127 | } 128 | 129 | # Amino acid changes (all) 130 | if (!is.null(aa_substitution)) { 131 | new_queue <- create_search_queue(credentials, credentials$aa_substitution_ceid, aa_substitution, 'FilterChange') 132 | queue <- append(queue, new_queue) 133 | } 134 | 135 | # Nucleotide changes (EpiCoV) 136 | if (!is.null(nucl_mutation) && credentials$database == "EpiCoV") { 137 | new_queue <- create_search_queue(credentials, credentials$nucl_mutation_ceid, nucl_mutation, 'FilterChange') 138 | queue <- append(queue, new_queue) 139 | } 140 | 141 | # Low Coverage Exclude (All) 142 | if (low_coverage_excl) { 143 | new_queue <- create_search_queue(credentials, credentials$low_coverage_excl_ceid, list('lowco'), 'FilterChange') 144 | queue <- append(queue, new_queue) 145 | } 146 | 147 | # Quality (mixed) 148 | if (credentials$database == 'EpiCoV') { 149 | if (complete) { 150 | new_queue <- create_search_queue(credentials, credentials$complete_ceid, list('complete'), 'FilterChange') 151 | queue <- append(queue, new_queue) 152 | } 153 | if (high_coverage) { 154 | new_queue <- create_search_queue(credentials, credentials$highq_ceid, list('highq'), 'FilterChange') 155 | queue <- append(queue, new_queue) 156 | } 157 | } else { 158 | quality <- list() 159 | 160 | if (complete) { 161 | quality <- append(quality, 'complete') 162 | } 163 | if (high_coverage) { 164 | quality <- append(quality, 'highq') 165 | } 166 | 167 | if (length(quality) > 0) { 168 | new_queue <- create_search_queue(credentials, credentials$quality_ceid, quality, 'FilterChange') 169 | queue <- append(queue, new_queue) 170 | } 171 | } 172 | 173 | if (length(queue) > 0) { 174 | command_queue <- list(queue = queue) 175 | data <- 176 | formatDataForRequest( 177 | sid = credentials$sid, 178 | wid = credentials$wid, 179 | pid = credentials$pid, 180 | queue = command_queue, 181 | timestamp = timestamp() 182 | ) 183 | response <- httr::POST(GISAID_URL, httr::add_headers(.headers = headers), body = data) 184 | response_data <- parseResponse(response) 185 | log.debug(response_data) 186 | } 187 | 188 | queue = list() 189 | 190 | # ordering 191 | if (!is.null(order_by)) { 192 | if (credentials$database == 'EpiCoV') { 193 | order_by = covid_order_by_col_map[[order_by]] 194 | } else { 195 | # epipox and epirsv are missing the host column 196 | order_by = other_order_by_col_map[[order_by]] 197 | } 198 | command <- createCommand( 199 | wid = credentials$wid, 200 | pid = credentials$pid, 201 | cid = credentials$query_cid, 202 | cmd = 'SetSorting', 203 | params = list(order_by = order_by, order_asc = order_asc) 204 | ) 205 | queue <- append(queue, list(command)) 206 | } 207 | 208 | # pagination 209 | command <- createCommand( 210 | wid = credentials$wid, 211 | pid = credentials$pid, 212 | cid = credentials$query_cid, 213 | cmd = 'SetPaginating', 214 | params = list(start_index = start_index, rows_per_page = nrows) 215 | ) 216 | queue <- append(queue, list(command)) 217 | 218 | # get data 219 | command <- createCommand( 220 | wid = credentials$wid, 221 | pid = credentials$pid, 222 | cid = credentials$query_cid, 223 | cmd = 'GetData' 224 | ) 225 | 226 | queue <- append(queue, list(command)) 227 | 228 | command_queue <- list(queue = queue) 229 | 230 | data <- 231 | formatDataForRequest( 232 | sid = credentials$sid, 233 | wid = credentials$wid, 234 | pid = credentials$pid, 235 | queue = command_queue, 236 | timestamp = timestamp() 237 | ) 238 | response <- httr::GET(paste0(GISAID_URL, '?', data)) 239 | response_data <- parseResponse(response) 240 | 241 | if (total) { 242 | return(as.numeric(response_data$totalRecords)) 243 | } 244 | 245 | if (fast) { 246 | log.debug(paste0(GISAID_URL, "?sid=", credentials$sid)) 247 | accession_id_count <- response_data$totalRecords 248 | log.info(paste('Selecting all', accession_id_count, "accession_ids.")) 249 | df <- get_accession_ids(credentials = credentials) 250 | log.info(paste("Returning", start_index, "-", nrow(df), "of", accession_id_count, "accession_ids.")) 251 | 252 | if (accession_id_count > nrow(df)) { 253 | log.warn(paste("Could only get", nrow(df), "accession_ids. Narrow your search.")) 254 | } 255 | return(df) 256 | } 257 | 258 | if (load_all && response_data$totalRecords > nrows) { 259 | log.info(paste("Loading all", response_data$totalRecords, "entries.")) 260 | return( 261 | query( 262 | credentials = credentials, 263 | text = text, 264 | location = location, 265 | lineage = lineage, 266 | variant = variant, 267 | from = from, 268 | from_subm = from_subm, 269 | to = to, 270 | to_subm = to_subm, 271 | aa_substitution = aa_substitution, 272 | nucl_mutation = nucl_mutation, 273 | nrows = response_data$totalRecords, 274 | # set load_all to false to break the recursion 275 | load_all = FALSE, 276 | low_coverage_excl = low_coverage_excl, 277 | complete = complete, 278 | high_coverage = high_coverage, 279 | collection_date_complete = collection_date_complete, 280 | subtype = subtype 281 | ) 282 | ) 283 | } 284 | log.info(paste("Returning", start_index, "-", start_index + response_data$recordsReturned, "of ", response_data$totalRecords, "entries.")) 285 | log.debug(response_data$records) 286 | if (length(response_data$records) >= 1) { 287 | df <- data.frame(do.call(rbind, response_data$records)) 288 | df <- 289 | data.frame(lapply(df, function(col) { 290 | col[sapply(col, is.null)] <- NA 291 | unlist(col) 292 | })) 293 | } else { 294 | df <- data.frame(response_data$records) 295 | } 296 | df <- setColumnNames(df, credentials$database) 297 | df <- setDataTypes(df) 298 | }, 299 | finally = { 300 | # reset search params 301 | resetQuery(credentials) 302 | }) 303 | return(df) 304 | } 305 | -------------------------------------------------------------------------------- /R/login.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Login to GISAID 4 | #' 5 | #' @param username GISAID username. 6 | #' @param password GISAID password. 7 | #' @return credentials used to query GISAID. 8 | #' @examples 9 | #' username = Sys.getenv("GISAIDR_USERNAME") 10 | #' password = Sys.getenv("GISAIDR_PASSWORD") 11 | #' login(username, password) 12 | login <- function(username, password, database="EpiCoV") { 13 | if (!database %in% c("EpiCoV", "EpiRSV", "EpiPox")) { 14 | stop(sprintf("Database must be EpiCoV, EpiRSV or EpiPox (database=%s)", database)) 15 | } 16 | # get a session ID 17 | response <- send_request() 18 | home_page_text = httr::content(response, as = 'text') 19 | session_id <- 20 | extract_first_match("name=\"sid\" value='([^']*)", home_page_text) 21 | response <- send_request(paste0('sid=', session_id)) 22 | 23 | # Load the home page 24 | # TODO: ensure it's the covid-19 page 25 | login_page_text = httr::content(response, as = 'text') 26 | 27 | # extract the other IDs for log in stage 1 28 | WID <- extract_first_match('"WID"] = "([^"]*)', login_page_text) 29 | login_page_ID <- 30 | extract_first_match('PID"] = "([^"]*)', login_page_text) 31 | login_component_ID <- 32 | extract_first_match("sys.getC\\('(.*)').call\\('doLogin'", login_page_text) 33 | 34 | # frontend page 35 | # create doLogin command 36 | doLogin_command <- createCommand( 37 | wid = WID, 38 | pid = login_page_ID, 39 | cid = login_component_ID, 40 | cmd = 'doLogin', 41 | params = list(login = username, hash = openssl::md5(password)) 42 | ) 43 | 44 | # add the doLogin command to queue 45 | # commands can be built up into a pipeline in the queue 46 | queue <- list(queue = list(doLogin_command)) 47 | data <- 48 | formatDataForRequest(session_id, WID, login_page_ID, queue, timestamp()) 49 | 50 | response <- send_request(method = 'POST', data=data) 51 | response_data <- parseResponse(response) 52 | 53 | if (length(grep("^sys.goPage", response_data$responses[[1]]$data)) == 0) { 54 | # handle event that the default page is not frontend 55 | default_page <- send_request(paste0('sid=', session_id)) 56 | default_page_text = httr::content(default_page, as = 'text') 57 | 58 | EpiCov_CID <- extract_first_match("sys.call\\('(.{5,20})','Go'", default_page_text) 59 | 60 | goto_EpiCov_page_command <- createCommand( 61 | wid = WID, 62 | pid = login_page_ID, 63 | cid = EpiCov_CID, 64 | cmd = 'Go', 65 | params = list(page = 'corona2020') 66 | ) 67 | 68 | queue <- list(queue = list(goto_EpiCov_page_command)) 69 | 70 | data <- 71 | formatDataForRequest(session_id, WID, login_page_ID, queue, timestamp()) 72 | 73 | response <- send_request(data) 74 | response_data <- parseResponse(response) 75 | } 76 | 77 | frontend_page_ID <- extract_first_match("\\('(.*)')",response_data$responses[[1]]$data) 78 | 79 | frontend_page <- send_request(paste0('sid=', session_id, '&pid=', frontend_page_ID)) 80 | frontend_page_text = httr::content(frontend_page, as = 'text') 81 | if (grepl('sys.openOverlay', frontend_page_text, fixed = TRUE)) { 82 | # extract overlay pid 83 | overlay_window_ID <- paste0('wid_', extract_first_match("openOverlay\\('wid_(.{5,20})','pid_.{5,20}'", frontend_page_text)) 84 | overlay_page_ID <- paste0('pid_', extract_first_match("openOverlay\\('wid_.{5,20}','pid_(.{5,20})'", frontend_page_text)) 85 | 86 | # load overlay 87 | overlay_page <- send_request(paste0('sid=', session_id, '&pid=', overlay_page_ID)) 88 | overlay_page_text <- httr::content(overlay_page, as = 'text') 89 | 90 | # extract close cid 91 | close_overlay_cid <- extract_first_match("createComponent\\('(.{5,20})','CloseButtonComponent", overlay_page_text) 92 | 93 | # send close cmd 94 | close_overlay_command <- createCommand( 95 | wid = overlay_window_ID, 96 | pid = overlay_page_ID, 97 | cid = close_overlay_cid, 98 | cmd = 'Back' 99 | ) 100 | queue <- list(queue = list(close_overlay_command)) 101 | data <- 102 | formatDataForRequest(session_id, overlay_window_ID, overlay_page_ID, queue, timestamp()) 103 | response <- send_request(data) 104 | response_data <- parseResponse(response) 105 | } 106 | 107 | if (database=="EpiRSV") { 108 | EpiRSV_CID <- extract_first_match("sys.call\\('(.{5,20})','Go'", frontend_page_text) 109 | 110 | goto_EpiRSV_page_command <- createCommand( 111 | wid = WID, 112 | pid = frontend_page_ID, 113 | cid = EpiRSV_CID, 114 | cmd = 'Go', 115 | params = list(page = 'rsv') 116 | ) 117 | 118 | queue <- list(queue = list(goto_EpiRSV_page_command)) 119 | 120 | data <- 121 | formatDataForRequest(session_id, WID, login_page_ID, queue, timestamp()) 122 | 123 | response <- send_request(data) 124 | response_data <- parseResponse(response) 125 | RSV_page_ID <- 126 | extract_first_match("\\('(.*)')",response_data$responses[[1]]$data) 127 | RSV_page <- send_request(paste0('sid=', session_id, '&pid=', RSV_page_ID)) 128 | RSV_page_text = httr::content(RSV_page, as = 'text') 129 | 130 | RSV_actionbar_component_ID <- 131 | extract_first_match("sys-actionbar-action-ni\" onclick=\"sys.getC\\('([^']*)", 132 | RSV_page_text) 133 | 134 | response_data <- go_to_page(session_id, WID, RSV_page_ID, RSV_actionbar_component_ID, 'page_rsv.BrowsePage') 135 | customSearch_page_ID <- 136 | extract_first_match("\\('(.*)')",response_data$responses[[1]]$data) 137 | } else if (database=="EpiPox") { 138 | EpiPox_CID <- extract_first_match("sys.call\\('(.{5,20})','Go'", frontend_page_text) 139 | 140 | goto_EpiPox_page_command <- createCommand( 141 | wid = WID, 142 | pid = frontend_page_ID, 143 | cid = EpiPox_CID, 144 | cmd = 'Go', 145 | params = list(page = 'mpox') 146 | ) 147 | 148 | queue <- list(queue = list(goto_EpiPox_page_command)) 149 | 150 | data <- 151 | formatDataForRequest(session_id, WID, login_page_ID, queue, timestamp()) 152 | 153 | response <- send_request(data) 154 | response_data <- parseResponse(response) 155 | POX_page_ID <- 156 | extract_first_match("\\('(.*)')",response_data$responses[[1]]$data) 157 | POX_page <- send_request(paste0('sid=', session_id, '&pid=', POX_page_ID)) 158 | POX_page_text = httr::content(POX_page, as = 'text') 159 | 160 | POX_actionbar_component_ID <- 161 | extract_first_match("sys-actionbar-action-ni\" onclick=\"sys.getC\\('([^']*)", 162 | POX_page_text) 163 | 164 | response_data <- go_to_page(session_id, WID, POX_page_ID, POX_actionbar_component_ID, 'page_mpox.BrowsePage') 165 | customSearch_page_ID <- 166 | extract_first_match("\\('(.*)')",response_data$responses[[1]]$data) 167 | } else { 168 | # check for overlay 169 | 170 | COVID_actionbar_component_ID <- 171 | extract_first_match("sys-actionbar-action-ni\" onclick=\"sys.getC\\('([^']*)", 172 | frontend_page_text) 173 | response_data <- go_to_page(session_id, WID, frontend_page_ID, COVID_actionbar_component_ID, 'page_corona2020.Corona2020BrowsePage') 174 | customSearch_page_ID <- 175 | extract_first_match("\\('(.*)')",response_data$responses[[1]]$data) 176 | } 177 | customSearch_page_response <- send_request(paste0('sid=', session_id, '&pid=', customSearch_page_ID)) 178 | customSearch_page_text = httr::content(customSearch_page_response, as = 'text') 179 | 180 | query_cid <- extract_first_match("div class=\"sys-datatable.{0,30}\" id=\"(.{5,20})_table", customSearch_page_text) 181 | 182 | #selectAll_ceid <- extract_first_match("onclick=\"sys.getC\\(\"(.{5,20})\"\\).selectAll", customSearch_page_text) 183 | 184 | # Search 185 | if (database == 'EpiRSV'){ 186 | SearchComponent <- 'SearchComponent' 187 | } else if (database == 'EpiPox') { 188 | SearchComponent <- 'SearchComponent' 189 | } else { 190 | SearchComponent <- 'Corona2020SearchComponent' 191 | } 192 | search_cid <- extract_first_match(sprintf("sys.createComponent\\('(.{5,20})','%s'", SearchComponent), customSearch_page_text) 193 | 194 | # Location 195 | location_ceid <- extract_search_ceid('covv_location', customSearch_page_text) 196 | 197 | # Lineage 198 | if (database == 'EpiCoV'){ 199 | lineage_ceid <- extract_search_ceid('pangolin_lineage', customSearch_page_text) 200 | } else if (database == 'EpiPox') { 201 | lineage_ceid <- extract_search_ceid('covsurver_cladelineage', customSearch_page_text) 202 | } else { 203 | lineage_ceid <- NULL 204 | } 205 | 206 | # Subtype 207 | if (database == 'EpiRSV'){ 208 | subtype_ceid <- extract_search_ceid('covv_subtype', customSearch_page_text) 209 | } else { 210 | subtype_ceid <- NULL 211 | } 212 | 213 | # Virus Name 214 | virus_name_ceid <- extract_search_ceid('covv_virus_name', customSearch_page_text) 215 | 216 | # From 217 | from_ceid <- extract_search_ceid('covv_collection_date_from', customSearch_page_text) 218 | 219 | # from submission 220 | from_sub_ceid <- extract_search_ceid('covv_subm_date_from', customSearch_page_text) 221 | 222 | # To 223 | to_ceid <- extract_search_ceid('covv_collection_date_to', customSearch_page_text) 224 | 225 | # To submission 226 | to_sub_ceid <- extract_search_ceid('covv_subm_date_to', customSearch_page_text) 227 | 228 | # low_coverage_excl 229 | if (database != 'EpiPox'){ 230 | low_coverage_excl_ceid <- extract_search_ceid('low_quality', customSearch_page_text) 231 | } else { 232 | low_coverage_excl_ceid <- NULL 233 | } 234 | if (database == 'EpiCoV'){ 235 | # full text search 236 | text_ceid <- 237 | extract_search_ceid("fts", customSearch_page_text) 238 | # AA substitution/mutation- ", " separated values 239 | aa_substitution_ceid <- extract_search_ceid('mutation', customSearch_page_text) 240 | # nucleotide substitution/nuc mutation, ", " separated values 241 | nucl_mutation_ceid <- extract_search_ceid('nuc_mutation', customSearch_page_text) 242 | # Highq 243 | highq_ceid <- 244 | extract_search_ceid("highq", customSearch_page_text) 245 | # Complete 246 | complete_ceid <- 247 | extract_search_ceid("complete", customSearch_page_text) 248 | # collection date complete 249 | collection_date_complete_ceid <- 250 | extract_search_ceid('coldc', customSearch_page_text) 251 | # variants 252 | variant_ceid <- 253 | extract_search_ceid('variants', customSearch_page_text) 254 | # quality not used by EpiCov 255 | quality_ceid <- NULL 256 | } else { 257 | text_ceid <- NULL 258 | aa_substitution_ceid <- extract_search_ceid('mutation', customSearch_page_text) 259 | nucl_mutation_ceid <- NULL 260 | variant_ceid <- NULL 261 | complete_ceid <- NULL 262 | highq_ceid <- NULL 263 | # Complete and Highq 264 | quality_ceid <- 265 | extract_search_ceid("quality'", customSearch_page_text) # avoid match with quality2 266 | # collection date complete 267 | collection_date_complete_ceid <- 268 | extract_search_ceid('quality2', customSearch_page_text) 269 | } 270 | 271 | # send selection command 272 | selection_pid_wid <- get_selection_panel(session_id, WID, customSearch_page_ID, query_cid) 273 | 274 | #load panel 275 | # REFACTOR 276 | selection_page <- 277 | send_request(paste0('sid=', session_id, '&pid=', selection_pid_wid$pid)) 278 | 279 | selection_page_text = httr::content(selection_page, as = 'text') 280 | 281 | selection_panel_cid <- extract_first_match("onselect=\"sys.getC\\('(.{5,20})')", selection_page_text) 282 | selection_ceid <- extract_first_match("getFI\\('(.{5,20})').onSelect", selection_page_text) 283 | 284 | send_back_cmd(session_id, selection_pid_wid$wid, selection_pid_wid$pid, selection_panel_cid) 285 | 286 | # back 287 | credentials <- 288 | list( 289 | database = database, 290 | pid = customSearch_page_ID, 291 | sid = session_id, 292 | wid = WID, 293 | query_cid = query_cid, 294 | selection_panel_cid = selection_panel_cid, 295 | selection_ceid = selection_ceid, 296 | download_panel_cid = query_cid, 297 | text_ceid = text_ceid, 298 | location_ceid = location_ceid, 299 | search_cid = search_cid, 300 | aa_substitution_ceid = aa_substitution_ceid, 301 | nucl_mutation_ceid = nucl_mutation_ceid, 302 | lineage_ceid = lineage_ceid, 303 | virus_name_ceid = virus_name_ceid, 304 | from_ceid = from_ceid, 305 | from_sub_ceid = from_sub_ceid, 306 | to_ceid = to_ceid, 307 | to_sub_ceid = to_sub_ceid, 308 | low_coverage_excl_ceid = low_coverage_excl_ceid, 309 | highq_ceid = highq_ceid, 310 | complete_ceid = complete_ceid, 311 | collection_date_complete_ceid = collection_date_complete_ceid, 312 | quality_ceid = quality_ceid, 313 | variant_ceid = variant_ceid, 314 | subtype_ceid = subtype_ceid 315 | ) 316 | 317 | return(credentials) 318 | } 319 | -------------------------------------------------------------------------------- /R/core.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | GISAID_URL = "https://www.epicov.org/epi3/frontend" 5 | 6 | headers = c(accept = "application/json, text/javascript, */*; q=0.01", 7 | "content-type" = "application/x-www-form-urlencoded; charset=UTF-8") 8 | 9 | timestamp <- function() { 10 | return(as.character(as.integer(Sys.time()) * 1000)) 11 | } 12 | 13 | createCommand <- 14 | function(wid, 15 | pid, 16 | cid, 17 | cmd, 18 | params = setNames(list(), character(0)), 19 | equiv = NULL) { 20 | ev = list( 21 | wid = wid, 22 | pid = pid, 23 | cid = cid, 24 | cmd = cmd, 25 | params = params, 26 | equiv = equiv 27 | ) 28 | return(ev) 29 | } 30 | 31 | formatDataForRequest <- 32 | function(sid, wid, pid, queue, timestamp, mode = 'ajax') { 33 | data <- paste0( 34 | "sid=", 35 | sid, 36 | "&wid=", 37 | wid, 38 | "&pid=", 39 | pid, 40 | "&data=", 41 | utils::URLencode(rjson::toJSON(queue), reserved = TRUE), 42 | "&ts=", 43 | timestamp, 44 | "&mode=", 45 | mode 46 | ) 47 | return(data) 48 | } 49 | 50 | parseResponse <- function(res) { 51 | j = httr::content(res, as = 'parsed') 52 | if (length(j$responses) == 0 & length(j) == 2) { 53 | warning("There was an error please see: https://github.com/Wytamma/GISAIDR/issues/1") 54 | stop("Error! Please login again.") 55 | } 56 | if (isTRUE(grep('Error', j$responses[[1]]$data) == 1)) { 57 | warning(utils::URLdecode(strsplit(j$responses[[1]]$data, '"')[[1]][2])) 58 | stop("Internal server Error.") 59 | } 60 | if (isTRUE(grep('expired', j$responses[[1]]$data) == 1)) { 61 | stop("The session has expired. Please login again.") 62 | } 63 | if (isTRUE(grep('password', j$responses[[1]]$data) == 1)) { 64 | # make a better check 65 | stop("Username or password wrong!") 66 | } 67 | if (isTRUE(grep('No data.', j$responses[[1]]$data) == 1)) { 68 | # make a better check 69 | stop("No data found.") 70 | } 71 | if (isTRUE(grep('You have no access to this page', j$responses[[1]]$data) == 1)) { 72 | stop("You have don't have access to this page!") 73 | } 74 | return(j) 75 | 76 | } 77 | 78 | 79 | 80 | get_accession_ids <- function(credentials) { 81 | # select all check box 82 | #onclick="sys.getC("c_rfsc9v_w9").selectAll(this)" 83 | queue = list() 84 | # data: {"queue":[{"wid":"wid_rfsc9v_2ktt","pid":"pid_rfsc9v_2kwd","cid":"c_rfsc9v_w9","cmd":"CallAsync","params":{"col_name":"c","checked":true,"_async_cmd":"SelectAll"},"equiv":null}]} 85 | command <- createCommand( 86 | wid = credentials$wid, 87 | pid = credentials$pid, 88 | cid = credentials$query_cid, 89 | cmd = 'CallAsync', 90 | params = list(col_name = 'c', checked = TRUE, '_async_cmd' = 'SelectAll') 91 | ) 92 | queue <- append(queue, list(command)) 93 | command_queue <- list(queue = queue) 94 | 95 | data <- 96 | formatDataForRequest( 97 | sid = credentials$sid, 98 | wid = credentials$wid, 99 | pid = credentials$pid, 100 | queue = command_queue, 101 | timestamp = timestamp() 102 | ) 103 | res <- 104 | httr::POST(GISAID_URL, httr::add_headers(.headers = headers), body = data) 105 | j = httr::content(res, as = 'parsed') 106 | 107 | # {"callback_response": {"msg": null, "async_id": "_rfsc9v_2o8a"}, "__ready__": true} 108 | # wait for selection 109 | # extract check_async 110 | check_async_id = j$callback_response$async_id 111 | # while generateDownloadDone not ready 112 | is_ready = FALSE 113 | while (!is_ready) { 114 | res <- httr::GET(paste0('https://www.epicov.org/epi3/check_async/', check_async_id, '?_=', timestamp())) 115 | j <- parseResponse(res) 116 | is_ready <- j$is_ready 117 | if (!is_ready) { 118 | Sys.sleep(1) 119 | } 120 | } 121 | log.debug(j) 122 | 123 | # select button 124 | selection_pid_wid <- get_selection_panel(credentials$sid, credentials$wid, credentials$pid, credentials$query_cid) 125 | selection_page <- 126 | send_request(paste0('sid=', credentials$sid, '&pid=', selection_pid_wid$pid)) 127 | 128 | # csv button 129 | #{"queue":[{"wid":"wid_rfsc9v_2p1c","pid":"pid_rfsc9v_2p1d","cid":"c_rfsc9v_15u","cmd":"Download","params":{},"equiv":null}]} 130 | queue = list() 131 | command <- createCommand( 132 | wid = selection_pid_wid$wid, 133 | pid = selection_pid_wid$pid, 134 | cid = credentials$selection_panel_cid, 135 | cmd = 'Download', 136 | params = setNames(list(), character(0)) 137 | ) 138 | queue <- append(queue, list(command)) 139 | command_queue <- list(queue = queue) 140 | 141 | data <- 142 | formatDataForRequest( 143 | sid = credentials$sid, 144 | wid = credentials$wid, 145 | pid = credentials$pid, 146 | queue = command_queue, 147 | timestamp = timestamp() 148 | ) 149 | res <- 150 | httr::POST(GISAID_URL, httr::add_headers(.headers = headers), body = data) 151 | j = httr::content(res, as = 'parsed') 152 | url <- extract_first_match("sys.downloadFile\\(\"(.*)\",", j$responses[[1]]$data) 153 | log.debug(paste0('https://www.epicov.org/', url)) 154 | tryCatch( 155 | df <- read.csv(paste0('https://www.epicov.org/', url), header=F, col.names = c('accession_id')), 156 | error = function(e) df <- data.frame(col.names = c('accession_id')) 157 | ) 158 | # back 159 | send_back_cmd(credentials$sid, selection_pid_wid$wid, selection_pid_wid$pid, credentials$selection_panel_cid) 160 | resetQuery(credentials) 161 | return(df) 162 | } 163 | 164 | get_selection_panel <- function(session_id, WID, customSearch_page_ID, query_cid) { 165 | # selection changes every time you open it 166 | selection_command <- createCommand( 167 | wid = WID, 168 | pid = customSearch_page_ID, 169 | cid = query_cid, 170 | cmd = 'Selection', 171 | params = setNames(list(), character(0)) #hack for empty {} 172 | ) 173 | queue <- list(queue = list(selection_command)) 174 | 175 | data <- 176 | formatDataForRequest(session_id, WID, customSearch_page_ID, queue, timestamp()) 177 | 178 | response <- 179 | send_request(method='POST', data=data) 180 | 181 | response_data <- parseResponse(response) 182 | # extract PID 183 | # selection changes every time 184 | selection_pid <- 185 | strsplit(response_data$responses[[1]]$data, "'")[[1]][4] 186 | selection_wid <- 187 | strsplit(response_data$responses[[1]]$data, "'")[[1]][2] 188 | log.debug(sprintf("get_selection_panel (response_data): %s", response_data)) 189 | list(pid=selection_pid, wid=selection_wid) 190 | } 191 | 192 | get_download_panel <- function(session_id, WID, customSearch_page_ID, query_cid) { 193 | # selection changes every time you open it 194 | selection_command <- createCommand( 195 | wid = WID, 196 | pid = customSearch_page_ID, 197 | cid = query_cid, 198 | cmd = 'DownloadAllSequences', 199 | params = setNames(list(), character(0)) #hack for empty {} 200 | ) 201 | queue <- list(queue = list(selection_command)) 202 | 203 | data <- 204 | formatDataForRequest(session_id, WID, customSearch_page_ID, queue, timestamp()) 205 | 206 | response <- 207 | send_request(method='POST', data=data) 208 | 209 | response_data <- parseResponse(response) 210 | log.debug(sprintf("get_download_panel_pid_wid (response_data): %s", response_data)) 211 | # extract PID 212 | # selection changes every time 213 | download_pid <- 214 | strsplit(response_data$responses[[1]]$data, "'")[[1]][4] 215 | download_wid <- 216 | strsplit(response_data$responses[[1]]$data, "'")[[1]][2] 217 | 218 | list(pid=download_pid, wid=download_wid) 219 | } 220 | 221 | send_back_cmd <- function(session_id, WID, PID, CID ) { 222 | # send back command to get back to page 223 | # {"queue":[{"wid":"wid_r8fuui_7jgp","pid":"pid_r8fuui_7jgq","cid":"c_r8fuui_3uj","cmd":"Back","params":{},"equiv":null}]} 224 | selection_command <- createCommand( 225 | wid = WID, 226 | pid = PID, 227 | cid = CID, 228 | cmd = 'Back', 229 | params = setNames(list(), character(0)) #hack for empty {} 230 | ) 231 | queue <- list(queue = list(selection_command)) 232 | 233 | data <- 234 | formatDataForRequest(session_id, WID, PID, queue, timestamp()) 235 | 236 | response <- 237 | send_request(method='POST', data=data) 238 | 239 | response_data <- parseResponse(response) 240 | } 241 | 242 | select_entries <- function(credentials, list_of_accession_ids) { 243 | accession_ids_string <- paste(list_of_accession_ids, collapse=", ") 244 | 245 | selection_pid_wid <- get_selection_panel(credentials$sid, credentials$wid, credentials$pid, credentials$query_cid) 246 | 247 | #load panel 248 | selection_page <- 249 | send_request(paste0('sid=', credentials$sid, '&pid=', selection_pid_wid$pid)) 250 | 251 | ev1 <- createCommand( 252 | wid = selection_pid_wid$wid, 253 | pid = selection_pid_wid$pid, 254 | cid = credentials$selection_panel_cid, 255 | cmd = 'setTarget', 256 | params = list(cvalue=accession_ids_string, ceid=credentials$selection_ceid), #hack for empty {} 257 | equiv = paste0("ST", credentials$selection_ceid) 258 | ) 259 | 260 | ev2 <- createCommand( 261 | wid = selection_pid_wid$wid, 262 | pid = selection_pid_wid$pid, 263 | cid = credentials$selection_panel_cid, 264 | cmd = 'ChangeValue', 265 | params = list(cvalue=accession_ids_string, ceid=credentials$selection_ceid), #hack for empty {} 266 | equiv = paste0("CV", credentials$selection_ceid) 267 | ) 268 | ev3 <- createCommand( 269 | wid = selection_pid_wid$wid, 270 | pid = selection_pid_wid$pid, 271 | cid = credentials$selection_panel_cid, 272 | cmd = 'OK', 273 | params = setNames(list(), character(0)) #hack for empty {} 274 | ) 275 | json_queue <- list(queue = list(ev1, ev2, ev3)) 276 | data <- formatDataForRequest(credentials$sid, selection_pid_wid$wid, selection_pid_wid$pid, json_queue, timestamp()) 277 | response <- 278 | send_request(method='POST', data=data) 279 | response_data <-parseResponse(response) 280 | log.debug(response_data) 281 | if (isTRUE(grep('Back', response_data$responses[[2]]$data) == 1)) { 282 | send_back_cmd(credentials$sid, selection_pid_wid$wid, selection_pid_wid$pid, credentials$selection_panel_cid) 283 | } 284 | 285 | return(response) 286 | } 287 | 288 | resetQuery <- function(credentials) { 289 | queue = list() 290 | command <- createCommand( 291 | wid = credentials$wid, 292 | pid = credentials$pid, 293 | cid = credentials$search_cid, 294 | cmd = "Reset" 295 | ) 296 | queue <- append(queue, list(command)) 297 | command_queue <- list(queue = queue) 298 | data <- 299 | formatDataForRequest( 300 | sid = credentials$sid, 301 | wid = credentials$wid, 302 | pid = credentials$pid, 303 | queue = command_queue, 304 | timestamp = timestamp() 305 | ) 306 | res <- 307 | httr::POST(GISAID_URL, httr::add_headers(.headers = headers), body = data) 308 | } 309 | 310 | extract_search_ceid <- function(identifier, t) { 311 | regex <- paste0(".createFI\\('(.*)','.*Widget','", identifier) 312 | log.debug(sprintf("Extracting '%s' from '%s'", regex, substr(t, 0, 30))) 313 | ceid <- 314 | regmatches(t, 315 | regexpr( 316 | regex, 317 | t, 318 | perl = TRUE 319 | )) 320 | ceid <- strsplit(ceid, "'") 321 | tryCatch( 322 | ceid <- ceid[[1]][length(ceid[[1]]) - 4], 323 | error = function(e) { 324 | warning(paste0("Could not extract ", regex, " from ", substr(t, 0, 30))) 325 | e 326 | } 327 | ) 328 | 329 | return(ceid) 330 | } 331 | 332 | log.debug <- function(msg) { 333 | if (Sys.getenv("GISAIDR_DEBUG") == 1) { 334 | message(msg) 335 | } 336 | invisible() 337 | } 338 | 339 | log.error <- function(msg) { 340 | message(paste0(Sys.time(), "\tERROR: ", gsub("\n", " ", msg))) 341 | flush.console() 342 | invisible() 343 | } 344 | 345 | log.warn <- function(msg) { 346 | message(paste0(Sys.time(), "\tWARNING: ", gsub("\n", " ", msg))) 347 | flush.console() 348 | invisible() 349 | } 350 | 351 | log.info <- function(msg, level=1) { 352 | if (Sys.getenv("GISAIDR_VERBOSITY") != ""){ 353 | verbosity <- Sys.getenv("GISAIDR_VERBOSITY") 354 | } else { 355 | verbosity <- 1 356 | } 357 | if (verbosity >= level){ 358 | message(paste0(Sys.time(), "\tINFO: ", gsub("\n", " ", msg))) 359 | } 360 | flush.console() 361 | invisible() 362 | } 363 | 364 | send_request <- 365 | function(parameter_string = "", 366 | data = NULL, 367 | method = 'GET') { 368 | URL <- paste0(GISAIDR::GISAID_URL, '?', parameter_string) 369 | if (is.null(data)) { 370 | data <- "" 371 | } 372 | log.debug(sprintf("Sending request:\n Method -> %s\n URL -> %s\n data -> %s", method, URL, data)) 373 | if (method == 'GET') { 374 | response <- httr::GET(URL) 375 | } else if (method == 'POST') { 376 | response <- 377 | httr::POST(URL, httr::add_headers(.headers = GISAIDR::headers), body = data) 378 | } else { 379 | stop(sprintf("Method '%s' not allowed", method)) 380 | } 381 | if (response$status_code >= 500) { 382 | warning(sprintf("An error occurred while trying to %s %s", method, URL)) 383 | stop("Server error!") 384 | } 385 | response 386 | } 387 | 388 | extract_first_match <- function(regex, text) { 389 | log.debug(sprintf("Extracting '%s' from '%s'", regex, substr(text, 0, 30))) 390 | matches <- regmatches(text, regexec(regex, text)) 391 | tryCatch( 392 | return(matches[[1]][[2]]), 393 | error = function(e) stop(sprintf("Could not extract '%s' from '%s'", regex, substr(text, 0, 100))) 394 | ) 395 | 396 | } 397 | 398 | 399 | go_to_page <- function(session_id, WID, PID, CID, link) { 400 | go_command <- createCommand( 401 | wid = WID, 402 | pid = PID, 403 | cid = CID, 404 | cmd = 'Go', 405 | params = list(link = link) 406 | ) 407 | 408 | queue <- list(queue = list(go_command)) 409 | 410 | data <- 411 | formatDataForRequest(session_id, WID, PID, queue, timestamp()) 412 | 413 | response <- send_request(data) 414 | response_data <- parseResponse(response) 415 | return(response_data) 416 | } 417 | 418 | # 419 | # Variants <- 420 | # list( 421 | # alpha = 'B.1.1.7 / Q.*', 422 | # beta = 'B.1.351 / B.1.351.2 / B.1.351.3', 423 | # gamma = 'P.1 / P.1.*', 424 | # delta = 'B.1.617.2 / AY.*', 425 | # epsilon = 'B.1.427 / B.1.429', 426 | # eta = 'B.1.525', 427 | # iota = 'B.1.526', 428 | # kappa = 'B.1.617.1', 429 | # lambda = 'C.37 / C.37.1', 430 | # mu = 'B.1.621 / B.1.621.1', 431 | # omicron = 'B.1.1.529 / BA.*', 432 | # GH_490R = 'B.1.640 / B.1.640.*' 433 | # ) 434 | 435 | 436 | create_batches <- function(start_index, nrows, batch_size=50) { 437 | batches <- cbind( 438 | seq(0,nrows,batch_size), 439 | c(seq(batch_size,nrows,batch_size), 440 | nrows) 441 | ) 442 | batches <- batches + start_index 443 | if (batches[nrow(batches),1] - batches[nrow(batches),2] == 0) { 444 | batches <- head(batches, -1) 445 | } 446 | #colnames(batches) <- c('start_index', 'nrows') 447 | return (batches) 448 | } 449 | 450 | covid_order_by_col_map <- 451 | list( 452 | id = "b", 453 | virus_name = "d", 454 | passage_details_history = "e", 455 | accession_id = "f", 456 | collection_date = "g", 457 | submission_date = "h", 458 | information = "i", 459 | length = "j", 460 | host = "k", 461 | location = "l", 462 | originating_lab = "m", 463 | submitting_lab = "n" 464 | ) 465 | 466 | other_order_by_col_map <- 467 | list( 468 | id = "b", 469 | virus_name = "d", 470 | passage_details_history = "e", 471 | accession_id = "f", 472 | collection_date = "g", 473 | submission_date = "h", 474 | information = "i", 475 | length = "j", 476 | location = "k", 477 | originating_lab = "l", 478 | submitting_lab = "m" 479 | ) 480 | 481 | 482 | setColumnNames <- function(df, database) { 483 | if (database == 'EpiRSV') { 484 | names(df)[names(df) == "b"] <- "id" 485 | names(df)[names(df) == "d"] <- "virus_name" 486 | names(df)[names(df) == "e"] <- "passage_details_history" 487 | names(df)[names(df) == "f"] <- "accession_id" 488 | names(df)[names(df) == "g"] <- "collection_date" 489 | names(df)[names(df) == "h"] <- "submission_date" 490 | names(df)[names(df) == "i"] <- "information" 491 | names(df)[names(df) == "j"] <- "length" 492 | names(df)[names(df) == "k"] <- "location" 493 | names(df)[names(df) == "l"] <- "originating_lab" 494 | names(df)[names(df) == "m"] <- "submitting_lab" 495 | } else if (database == 'EpiPox') { 496 | names(df)[names(df) == "b"] <- "id" 497 | names(df)[names(df) == "d"] <- "virus_name" 498 | names(df)[names(df) == "e"] <- "passage_details_history" 499 | names(df)[names(df) == "f"] <- "accession_id" 500 | names(df)[names(df) == "g"] <- "collection_date" 501 | names(df)[names(df) == "h"] <- "submission_date" 502 | names(df)[names(df) == "i"] <- "information" 503 | names(df)[names(df) == "j"] <- "length" 504 | names(df)[names(df) == "k"] <- "location" 505 | names(df)[names(df) == "l"] <- "originating_lab" 506 | names(df)[names(df) == "m"] <- "submitting_lab" 507 | } else { 508 | colnames(df)[colnames(df) %in% c("b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n")] <- 509 | c( 510 | "id", 511 | "virus_name", 512 | "passage_details_history", 513 | "accession_id", 514 | "collection_date", 515 | "submission_date", 516 | "information", 517 | "length", 518 | "host", 519 | "location", 520 | "originating_lab", 521 | "submitting_lab" 522 | ) 523 | } 524 | return(df) 525 | } 526 | 527 | setDataTypes <- function(df) { 528 | # date 529 | return(df) 530 | } 531 | 532 | create_search_queue <- function(credentials, ceid, cvalue, cmd) { 533 | queue = list() 534 | command <- createCommand( 535 | wid = credentials$wid, 536 | pid = credentials$pid, 537 | cid = credentials$search_cid, 538 | cmd = 'setTarget', 539 | params = list(cvalue = cvalue, ceid = ceid), 540 | equiv = paste0('ST', ceid) 541 | ) 542 | queue <- append(queue, list(command)) 543 | 544 | command <- createCommand( 545 | wid = credentials$wid, 546 | pid = credentials$pid, 547 | cid = credentials$search_cid, 548 | cmd = 'ChangeValue', 549 | params = list(cvalue = cvalue, ceid = ceid), 550 | equiv = paste0('CV', ceid) 551 | ) 552 | 553 | queue <- append(queue, list(command)) 554 | 555 | command <- createCommand( 556 | wid = credentials$wid, 557 | pid = credentials$pid, 558 | cid = credentials$search_cid, 559 | cmd = cmd, 560 | params = list(ceid = ceid), 561 | ) 562 | 563 | queue <- append(queue, list(command)) 564 | 565 | return(queue) 566 | } 567 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GISAIDR 2 | 3 | [![Build](https://github.com/Wytamma/GISAIDR/actions/workflows/r.yml/badge.svg)](https://github.com/Wytamma/GISAIDR/actions/workflows/r.yml) 4 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5363500.svg)](https://doi.org/10.5281/zenodo.5363500) 5 | 6 | Programmatically interact with the GISAID EpiCoV, EpiPox, and EpiRSV databases. 7 | 8 | > [!TIP] 9 | > Please consider moving your research focus to an open pathogen on [pathoplexus.org](https://pathoplexus.org/). 10 | 11 | 12 | ## Citation 13 | 14 | If you use GISAIDR in your research please cite as: 15 | 16 | Wytamma Wirth, & Sebastian Duchene. (2022). GISAIDR: Programmatically interact with the GISAID databases. Zenodo. 17 | 18 | ## Installation 19 | 20 | Install from github using `devtools`. 21 | 22 | ``` r 23 | install.packages("devtools") # if you have not installed "devtools" package 24 | devtools::install_github("Wytamma/GISAIDR") 25 | library(GISAIDR) 26 | ``` 27 | 28 | ## Login 29 | 30 | Get `username` and `password` from [GISAID](https://www.epicov.org/). 31 | 32 | ``` r 33 | username = Sys.getenv("GISAIDR_USERNAME") 34 | password = Sys.getenv("GISAIDR_PASSWORD") 35 | ``` 36 | 37 | Login and save your credentials (they are used for all future database queries) 38 | 39 | ``` r 40 | credentials <- login(username = username, password = password) 41 | ``` 42 | 43 | ## Select a database 44 | 45 | The EpiCoV database is selected by default, however, GISAIDR also works with the EpiRSV and EpiPox databases (limited testing). 46 | 47 | ``` r 48 | credentials <- login(username = username, password = password, database="EpiRSV") 49 | # or 50 | credentials <- login(username = username, password = password, database="EpiPox") 51 | ``` 52 | 53 | Note: You need a GISAID account with access to EpiRSV and EpiPox. 54 | 55 | ## Get Data 56 | 57 | Query the database with `query()` using your credentials 58 | 59 | ``` r 60 | df <- query(credentials = credentials) 61 | head(df[0:6]) 62 | ``` 63 | 64 | | \# | id | virus_name | passage_details_history | accession_id | collection_date | submission_date | 65 | |-----|-----------------|-------------------------------|-------------------------|-----------------|-----------------|-----------------| 66 | | 1 | EPI_ISL_1789201 | hCoV-19/USA/IL-S21WGS954/2021 | Original | EPI_ISL_1789201 | 2021-04-16 | 2021-04-29 | 67 | | 2 | EPI_ISL_1789200 | hCoV-19/USA/IL-S21WGS885/2021 | Original | EPI_ISL_1789200 | 2021-04-02 | 2021-04-29 | 68 | | 3 | EPI_ISL_1789199 | hCoV-19/USA/IL-S21WGS884/2021 | Original | EPI_ISL_1789199 | 2021-04-12 | 2021-04-29 | 69 | | 4 | EPI_ISL_1789198 | hCoV-19/USA/IL-S21WGS883/2021 | Original | EPI_ISL_1789198 | 2021-04-14 | 2021-04-29 | 70 | | 5 | EPI_ISL_1789197 | hCoV-19/USA/IL-S21WGS882/2021 | Original | EPI_ISL_1789197 | 2021-04-15 | 2021-04-29 | 71 | | 6 | EPI_ISL_1789196 | hCoV-19/USA/IL-S21WGS881/2021 | Original | EPI_ISL_1789196 | 2021-04-13 | 2021-04-29 | 72 | 73 | ### Pagination 74 | 75 | Use `nrows` and `start_index` to page through results. GISAID limits the number of results returned with each request to 50. Internally GISAIDR runs a loop to batch queries with > 50 rows requested. See fast option below. 76 | 77 | ``` r 78 | df <- query(credentials = credentials, nrows = 1000, start_index = 100) 79 | nrow(df) 80 | ``` 81 | 82 | [1] 1000 83 | 84 | ### Fast query 85 | 86 | Use `fast` to load all of the accesion_ids that match the query. These accesion_ids can then be used in the `download` function to download up to 5000 sequences at a time. 87 | 88 | ``` r 89 | df <- query( 90 | credentials = credentials, 91 | location = "Oceania", 92 | from_subm = "2022-07-26", 93 | to_subm = "2022-07-28", 94 | fast = TRUE 95 | ) 96 | head(df$accession_id) 97 | ``` 98 | Selecting all 484 accession_ids. 99 | Returning 0-484 of 484 accession_ids. 100 | [1] "EPI_ISL_14061265" "EPI_ISL_14061266" "EPI_ISL_14061267" "EPI_ISL_14061268" "EPI_ISL_14061269" "EPI_ISL_14061270" 101 | 102 | ### Ordering 103 | 104 | Use `order_by` to order the results or `query` by a column. Use `order_asc` to change the direction of `order_by` (defaults to TRUE). 105 | 106 | ``` r 107 | df <- query(credentials = credentials, order_by = 'submission_date') 108 | df$submission_date 109 | ``` 110 | [1] "2020-01-10" "2020-01-10" "2020-01-11" "2020-01-11" "2020-01-11" "2020-01-12" "2020-01-14" 111 | [8] "2020-01-14" "2020-01-14" "2020-01-14" "2020-01-16" "2020-01-17" "2020-01-17" ... 112 | 113 | ### Full text search 114 | 115 | Use `text` for full text search. 116 | 117 | ``` r 118 | accession_ids = c("EPI_ISL_17398411", "EPI_ISL_17199001", "EPI_ISL_17409201", "EPI_ISL_17243716") 119 | df <- query(credentials = credentials, text = paste(accession_ids, collapse = "\n")) 120 | > df$accession_id 121 | ``` 122 | [1] "EPI_ISL_17199001" "EPI_ISL_17243716" "EPI_ISL_17398411" "EPI_ISL_17409201" 123 | 124 | ### Search by location 125 | 126 | Use `location` to search for entries based on geographic location. 127 | 128 | ``` r 129 | df <- query(credentials = credentials, location = 'Australia') 130 | df$location 131 | ``` 132 | 133 | [1] "Oceania / Australia / Western Australia" "Oceania / Australia / Queensland" 134 | [3] "Oceania / Australia / Queensland" "Oceania / Australia / Queensland" 135 | [5] "Oceania / Australia / Western Australia" ... 136 | 137 | A list of GISAID locations (not complete) can be found in [GISAID_LOCATIONS.txt](https://github.com/Wytamma/GISAIDR/blob/master/GISAID_LOCATIONS.txt). The location search is hierarchical e.g. querying for 'Africa / ...' will return all the regions within Africa, while querying for 'Africa / Angola / ...' will only return the regions in Angola. Region can be further subdivided by specifying more levels e.g. 'North America / USA / Alabama / Butler County'. The search uses pattern matching and does not have to follow the hierarchical format above. 138 | 139 | ### Search by lineage (EpiCoV) 140 | 141 | Use `lineage` to search for entries based on pango lineage designations. 142 | 143 | ``` r 144 | df <- query(credentials = credentials, lineage = 'B.1.1.7') 145 | full_df <- download(credentials = credentials, list_of_accession_ids = df$accession_id) # see below for download() info. 146 | full_df$pangolin_lineage 147 | ``` 148 | 149 | [1] "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" 150 | [11] "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" "B.1.1.7" 151 | [21] ... 152 | 153 | ### Search by Variant (EpiCoV) 154 | 155 | Variants can be queried by name e.g. 'omicron', 'gh/490r', 'delta', 'alpha', 'beta', 'gamma', 'lambda', or 'mu'. Unfortunately GISAID doesn't return the variant designation from the query or download so variants must be confirmed with pangolin_lineage or GISAID_clade. 156 | 157 | ``` r 158 | # VOC Omicron GRA (B.1.1.529+BA.*) first detected in Botswana/Hong Kong/South Africa 159 | omicron_df <- query(credentials = credentials, variant = 'omicron') 160 | omicron_full_df <- download(credentials = credentials, list_of_accession_ids = omicron_df$accession_id) 161 | omicron_full_df$pangolin_lineage 162 | ``` 163 | 164 | [1] "BA.2" "BA.2" "BA.2.10.1" "BA.2" "BA.2" "BA.5" "BA.2" "BA.2" "BA.2.3" 165 | [10] "BA.4" "BA.2" "BA.2" "BA.2" "BA.2" "BA.2" "BA.2" "BA.2" "BA.1.17" 166 | [19] ... 167 | 168 | ### Search by collection date 169 | 170 | Use `from` and `to` to search for entries from specific dates. 171 | 172 | ``` r 173 | df <- query(credentials = credentials, from = '2021-04-05', to = '2021-04-06') 174 | df$collection_date 175 | ``` 176 | 177 | [1] "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" 178 | [8] "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" 179 | [15] ... 180 | 181 | ### Search by submission date 182 | 183 | Use `from_subm` and `to_subm` to search for entries from specific dates. 184 | 185 | ``` r 186 | df <- query(credentials = credentials, from_subm = '2021-04-05', to_subm = '2021-04-05') 187 | df$submission_date 188 | ``` 189 | 190 | [1] "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" 191 | [8] "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" "2021-04-05" 192 | [15] ... 193 | 194 | ### Search by virus name 195 | 196 | Use `virus_name` to search for entries using the virus name. 197 | 198 | ``` r 199 | df <- query(credentials = credentials, virus_name="hCoV-19/Ireland/D-BHTEST/2022") 200 | df$virus_name 201 | ``` 202 | 203 | [1] "hCoV-19/Ireland/D-BHTEST/2022" 204 | 205 | Search for multiple virus names using a list. 206 | 207 | ``` 208 | virus_names <- list("hCoV-19/Ireland/KY-Enfer-230922004_A4/2022", "hCoV-19/Ireland/CO-Enfer-240922010_E9/2022") 209 | df <- query(credentials = credentials, virus_name=virus_names) 210 | df$virus_name 211 | ``` 212 | [1] "hCoV-19/Ireland/CO-Enfer-240922010_E9/2022" "hCoV-19/Ireland/KY-Enfer-230922004_A4/2022" 213 | 214 | You can also match parts of the virus name e.g. 215 | 216 | ``` 217 | df <- query(credentials = credentials, virus_name="hCoV-19/Ireland") 218 | df$virus_name 219 | ``` 220 | [1] "hCoV-19/Ireland/KY-Enfer-260922007_C6/2022" "hCoV-19/Ireland/KY-Enfer-260922007_C4/2022" 221 | [3] "hCoV-19/Ireland/KY-Enfer-260922007_C2/2022" "hCoV-19/Ireland/KY-Enfer-260922007_C10/2022" 222 | [5] "hCoV-19/Ireland/KY-Enfer-260922007_C1/2022" "hCoV-19/Ireland/CO-Enfer-260922007_B7/2022"... 223 | 224 | ### Search by AA Substitutions and Nucleotide Mutations 225 | 226 | Use `aa_substitution` and `nucl_mutation` to search for entries using amino acid Substitutions and nucleotide mutations. 227 | 228 | ``` r 229 | aa_substitution_df <- query(credentials = credentials, aa_substitution = 'Spike_E484Q, Spike_H69del, -N_P13L') 230 | nucl_mutation_df <- query(credentials = credentials, nucl_mutation = '-T23599G, -C10029T') 231 | ``` 232 | 233 | ### Exclude low coverage entries 234 | 235 | Use `low_coverage_excl` to exclude low coverage entries from the results. 236 | 237 | ``` r 238 | df <- query(credentials = credentials, low_coverage_excl = TRUE) 239 | grep("Long stretches of NNNs", df$information) 240 | ``` 241 | 242 | integer(0) 243 | 244 | ### Include only complete entries 245 | 246 | GISAID considers genomes \>29,000 nt as complete. Use `complete` to include only complete entries in the results. 247 | 248 | ``` r 249 | df <- query(credentials = credentials, complete = TRUE) 250 | all(df$length > 29000) 251 | ``` 252 | 253 | [1] TRUE 254 | 255 | ### Include only high coverage entries 256 | 257 | GISAID considers genomes with \<1% Ns and \<0.05% unique amino acid mutations as high coverage . Use `high_coverage` to include only high coverage entries in the results. 258 | 259 | ``` r 260 | df <- query(credentials = credentials, high_coverage = TRUE) 261 | length(grep("warn_sign", df$information)) == 0 262 | ``` 263 | 264 | [1] TRUE 265 | 266 | ### Include only entries with complete collection date 267 | 268 | Use `collection_date_complete` to include only entries with complete collection date. 269 | 270 | ``` r 271 | df <- query(credentials = credentials, collection_date_complete = TRUE) 272 | ``` 273 | 274 | ### Load all entries 275 | 276 | Use `load_all` to get all the entries that match your query without having to specify `nrows`. 277 | 278 | ``` r 279 | df <- query(credentials = credentials, lineage = 'W.1', load_all = TRUE) 280 | nrow(df) 281 | ``` 282 | 283 | [1] 100 284 | 285 | Note: you may end up downloading the entire GISAID database if your query is too general. 286 | 287 | ### Get total query matches 288 | 289 | Use `total` to get the number of entries that match you query. 290 | 291 | ``` r 292 | total <- query(credentials = credentials, total = TRUE) 293 | total 294 | ``` 295 | 296 | [1] 10145747 297 | 298 | ## Download Files 299 | 300 | The `download_files` function provides flexible downloading of specific data types from GISAID. Unlike the deprecated `download` function, it allows you to selectively download different types of files and returns them in a structured format. 301 | 302 | ### Basic usage 303 | 304 | To download specific file types, set the corresponding parameter to `TRUE`: 305 | 306 | ``` r 307 | df <- query(credentials = credentials, location = "Australia", nrows = 10) 308 | list_of_accession_ids <- df$accession_id 309 | 310 | # Download dates and location metadata 311 | results <- download_files( 312 | credentials = credentials, 313 | list_of_accession_ids = list_of_accession_ids, 314 | dates_and_location = TRUE 315 | ) 316 | head(results$dates_and_location) 317 | ``` 318 | 319 | ### Available download types 320 | 321 | The function supports the following download types: 322 | 323 | - `dates_and_location`: Date and Location metadata (TSV format) 324 | - `patient_status`: Patient Status metadata (TSV format) 325 | - `sequencing_technology`: Sequencing Technology metadata (TSV format) 326 | - `sequences`: Nucleotide sequences (FASTA format) 327 | - `augur_input`: Combined metadata and sequences for Augur analysis (EpiCoV only) 328 | 329 | ### Download multiple file types 330 | 331 | You can download multiple file types in a single call: 332 | 333 | ``` r 334 | results <- download_files( 335 | credentials = credentials, 336 | list_of_accession_ids = list_of_accession_ids, 337 | dates_and_location = TRUE, 338 | patient_status = TRUE, 339 | sequences = TRUE 340 | ) 341 | 342 | # Access different data types 343 | location_data <- results$dates_and_location 344 | patient_data <- results$patient_status 345 | sequence_data <- results$sequences 346 | ``` 347 | 348 | ### Augur input format (EpiCoV only) 349 | 350 | For phylogenetic analysis with Augur, you can download the special `augur_input` format which includes both metadata and sequences: 351 | 352 | ``` r 353 | results <- download_files( 354 | credentials = credentials, 355 | list_of_accession_ids = list_of_accession_ids, 356 | augur_input = TRUE 357 | ) 358 | 359 | metadata <- results$augur_input$metadata 360 | sequences <- results$augur_input$sequences 361 | ``` 362 | 363 | Note: The `augur_input` option is only available for the EpiCoV database. If you specify both `augur_input` and `sequences`, the standalone `sequences` download will be skipped to avoid redundancy. 364 | 365 | ### Return format 366 | 367 | The function returns a named list where each element corresponds to a requested download type: 368 | 369 | - For metadata types (`dates_and_location`, `patient_status`, `sequencing_technology`): Returns a data frame 370 | - For `sequences`: Returns a character string containing FASTA data 371 | - For `augur_input`: Returns a list with `metadata` (data frame) and `sequences` (character string) elements 372 | 373 | ### File cleanup 374 | 375 | By default, temporary files are automatically cleaned up after download. You can preserve them by setting `clean_up = FALSE`: 376 | 377 | ``` r 378 | results <- download_files( 379 | credentials = credentials, 380 | list_of_accession_ids = list_of_accession_ids, 381 | sequences = TRUE, 382 | clean_up = FALSE 383 | ) 384 | ``` 385 | 386 | Note: A maximum of 5000 sequences can be downloaded at a time. 387 | 388 | ## Download (deprecated) 389 | 390 | To download the full data set you need a list of accession IDs (which can be obtained from `query` results). This will also download the sequence data for each entry. 391 | 392 | ``` r 393 | full_df_with_seq <- download( 394 | credentials = credentials, 395 | list_of_accession_ids = list_of_accession_ids, 396 | ) 397 | full_df_with_seq$sequence 398 | ``` 399 | 400 | [1] "AGATCTGTTCTCTAAACGAACTTTAAAATCT... 401 | [2] "AGATCTGTTCTCTAAACGAACTTTAAAATCT... 402 | [3] "AGATCTGTTCTCTAAACGAACTTTAAAATCT... 403 | ... 404 | 405 | You can stop GISAIDR from loading the sequence data into the memory by setting get_sequence=FALSE. Note: the sequence data will still be downloaded. 406 | 407 | ``` r 408 | df <- query(credentials = credentials) 409 | list_of_accession_ids <- df$accession_id 410 | full_df <- download(credentials = credentials, list_of_accession_ids = list_of_accession_ids, get_sequence=FALSE) 411 | colnames(full_df) 412 | ``` 413 | 414 | [1] "strain" "virus" "gisaid_epi_isl" "genbank_accession" 415 | [5] "date" "region" "country" "division" 416 | [9] "location" "region_exposure" "country_exposure" "division_exposure" 417 | [13] "segment" "length" "host" "age" 418 | [17] "sex" "Nextstrain_clade" "pangolin_lineage" "GISAID_clade" 419 | [21] "originating_lab" "submitting_lab" "authors" "url" 420 | [25] "title" "paper_url" "date_submitted" "purpose_of_sequencing" 421 | 422 | Note: a maximum of 5000 results can be downloaded at a time. 423 | 424 | ### Export to fasta file 425 | 426 | Use the export_fasta function to write sequence data to a file in fasta format. The sequence names will be [country\@pango_lineage\@accesion_id\@date](mailto:country@pango_lineage@accesion_id@date), with the date in decimal format (requires the [lubridate](https://cran.r-project.org/web/packages/lubridate/index.html) package). The default is to only export sequences for which a decimal date could be set. To prevent this, use the argument export_dated_only = F. 427 | 428 | ``` r 429 | export_fasta(full_df_with_seq, out_file_name = 'GISAID_sequences.fasta') 430 | ``` 431 | 432 | Date format (default: decimal year) and delimiter (default: \@) can be set with the date_format and delimiter arguments respectively. 433 | 434 | ``` r 435 | export_fasta(full_df_with_seq, out_file_name = 'GISAID_sequences.fasta', date_format='%Y-%m-%d', delimiter='|') 436 | ``` 437 | 438 | Use the `columns` argument to choose which columns are included in the export. 439 | 440 | ``` r 441 | export_fasta(full_df_with_seq, out_file_name = 'GISAID_sequences.fasta', columns = c("accession_id", "country", "pangolin_lineage", "date")) 442 | ``` 443 | 444 | ## Errors 445 | 446 | GISAIDR relies on the custom selection interface of [gisaid.org](https://www.gisaid.org/). If GISAIDR is giving you errors, first check that it is not gisaid.org producing these errors. We can't do anything to fix errors with gisaid.org. 447 | 448 | If you have an epiflu account (i.e. you were using GISAID before COVID-19) you may have issues logging in as GISAID may default you to the epiflu database. 449 | 450 | ## Updating 451 | 452 | When updating GISAIDR run `detach("package:GISAIDR", unload=TRUE)` first to ensure the update is applied. 453 | 454 | ## Examples 455 | 456 | Download all of the 2020 entries from Asia but outside China. 457 | 458 | ``` r 459 | library(GISAIDR) 460 | 461 | # country list from GISAID 462 | Asia <- c('Asia / Afghanistan', 463 | 'Asia / Armenia', 464 | 'Asia / Bahrain', 465 | 'Asia / Bangladesh', 466 | 'Asia / Brunei', 467 | 'Asia / Cambodia', 468 | # 'Asia / China', # remove China 469 | 'Asia / Georgia', 470 | 'Asia / Hong Kong', 471 | 'Asia / India', 472 | 'Asia / Indonesia / ...', 473 | 'Asia / Iran', 474 | 'Asia / Iraq', 475 | 'Asia / Israel', 476 | 'Asia / Japan', 477 | 'Asia / Jordan / ...', 478 | 'Asia / Kazakhstan / ...', 479 | 'Asia / Kuwait', 480 | 'Asia / Lebanon', 481 | 'Asia / Malaysia', 482 | 'Asia / Mongolia', 483 | 'Asia / Myanmar', 484 | 'Asia / Nepal', 485 | 'Asia / Oman / ...', 486 | 'Asia / Pakistan', 487 | 'Asia / Palestine / ...', 488 | 'Asia / Philippines', 489 | 'Asia / Qatar / ...', 490 | 'Asia / Saudi Arabia', 491 | 'Asia / Singapore', 492 | 'Asia / South Korea', 493 | 'Asia / Sri Lanka', 494 | 'Asia / Taiwan', 495 | 'Asia / Thailand', 496 | 'Asia / Timor-Leste', 497 | 'Asia / United Arab Emirates', 498 | 'Asia / Uzbekistan', 499 | 'Asia / Vietnam') 500 | 501 | credentials <- login(username = username, password = password) 502 | 503 | asia_not_china_df <- data.frame() 504 | for (country in Asia) { 505 | print(country) 506 | df <- query(credentials = credentials, location = country, load_all = TRUE, from = "2020-01-01", to = "2020-12-31") 507 | asia_not_china_df <- rbind(asia_not_china_df, df) 508 | } 509 | head(asia_not_china_df) 510 | ``` 511 | 512 | ## Dev guide 513 | 514 | 1. Go to the search interface on (EpiCoV \> Seach). 515 | 2. Right click on the feature you want to add (e.g. the `complete` checkbox) and inspect the source code. 516 | 3. Find the `ceid` for this element (`
`) e.g. `ce_qxos9a_bi` 517 | 4. Find the the value of the checkbox element (``) e.g. `complete`. 518 | 5. Search for the id in the page source. In one of the `