├── .github ├── .gitignore └── workflows │ └── pkgdown.yaml ├── vignettes ├── .gitignore └── basics.Rmd ├── LICENSE ├── man ├── figures │ └── logo.png ├── pipe.Rd ├── ts_classify_result.Rd ├── ts_tt_installed.Rd ├── ts_make_name_df.Rd ├── filmy_taxonomy.Rd ├── ts_write_names.Rd ├── ts_parse_names.Rd ├── ts_resolve_names.Rd └── ts_match_names.Rd ├── tests ├── testthat.R └── testthat │ ├── _snaps │ ├── ts_write_names │ │ └── parsed_name.txt │ ├── ts_resolve_names.md │ ├── ts_parse_names.md │ └── ts_match_names.md │ ├── test-utils.R │ ├── test-ts_parse_names.R │ ├── test-ts_write_names.R │ ├── test-ts_resolve_names.R │ └── test-ts_match_names.R ├── data └── filmy_taxonomy.rda ├── .gitignore ├── .Rbuildignore ├── NAMESPACE ├── R ├── utils-pipe.R ├── data.R ├── ts_tt_installed.R ├── globals.R ├── ts_write_names.R ├── utils.R ├── ts_parse_names.R ├── ts_resolve_names.R └── ts_match_names.R ├── _pkgdown.yml ├── LICENSE.md ├── DESCRIPTION ├── data-raw └── filmy_taxonomy.R ├── README.Rmd └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2019 2 | COPYRIGHT HOLDER: Joel Nitta 3 | -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joelnitta/taxastand/HEAD/man/figures/logo.png -------------------------------------------------------------------------------- /tests/testthat.R: 
-------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(taxastand) 3 | 4 | test_check("taxastand") 5 | -------------------------------------------------------------------------------- /data/filmy_taxonomy.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joelnitta/taxastand/HEAD/data/filmy_taxonomy.rda -------------------------------------------------------------------------------- /tests/testthat/_snaps/ts_write_names/parsed_name.txt: -------------------------------------------------------------------------------- 1 | 5f207ff2-1||Foogenus|×|barspecies|var.|foosubsp|(L.) F. Bar 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Rprofile 5 | *.Rproj 6 | .DS_Store 7 | docs 8 | inst/doc 9 | /doc/ 10 | /Meta/ 11 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^taxastand\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.Rprofile$ 4 | ^LICENSE\.md$ 5 | ^data-raw$ 6 | ^README\.Rmd$ 7 | ^_pkgdown\.yml$ 8 | ^docs$ 9 | ^pkgdown$ 10 | ^\.github$ 11 | ^doc$ 12 | ^Meta$ 13 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export("%>%") 4 | export(ts_match_names) 5 | export(ts_parse_names) 6 | export(ts_resolve_names) 7 | export(ts_tt_installed) 8 | export(ts_write_names) 9 | importFrom(magrittr,"%>%") 10 | -------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 
1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr]{\%>\%}} for details. 4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | NULL 12 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/ts_resolve_names.md: -------------------------------------------------------------------------------- 1 | # Produces expected output with docker 2 | 3 | Code 4 | match_results 5 | Output 6 | query reference match_type 7 | 1 Gonocormus minutum Gonocormus minutus (Bl.) Bosch auto_fuzzy 8 | 9 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{magrittr::\link[magrittr]{\%>\%}} for details. 
11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/ts_classify_result.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{ts_classify_result} 4 | \alias{ts_classify_result} 5 | \title{Classify results of taxon-tools matching} 6 | \usage{ 7 | ts_classify_result(match_results) 8 | } 9 | \arguments{ 10 | \item{match_results}{Dataframe; output of tt_match_names()} 11 | } 12 | \value{ 13 | Dataframe with column \code{result_type} added 14 | } 15 | \description{ 16 | Classify results of taxon-tools matching 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Taxonomy of filmy ferns (family Hymenophyllaceae) 2 | #' 3 | #' A dataset containing taxonomic names and associated metadata for the 4 | #' fern family Hymenophyllaceae. Downloaded from the 5 | #' [Catalog of Life](http://www.catalogueoflife.org/), Version 1.5. 6 | #' All columns formatted according to 7 | #' [Darwin Core standard](https://dwc.tdwg.org/terms/). Only includes taxa 8 | #' at the species or infraspecies level. 9 | #' 10 | #' @format A data frame with 2729 rows and 31 variables. 11 | #' 12 | #' @source <http://www.catalogueoflife.org/> 13 | "filmy_taxonomy" 14 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | test_that("Making a dataframe with taxonomic names works", { 2 | expect_s3_class( 3 | ts_make_name_df("Foogenus x barspecies var. foosubsp (L.) F. 
Bar"), 4 | "data.frame" 5 | ) 6 | expect_error( 7 | ts_make_name_df(c("Foogenus", "Foogenus")), 8 | "Input taxa must be unique" 9 | ) 10 | expect_error( 11 | ts_make_name_df(c("Foogenus", NA)), 12 | "Input taxa may not contain NAs" 13 | ) 14 | expect_error( 15 | ts_classify_result("Foogenus"), 16 | "match_results must be of class 'data\\.frame'" 17 | ) 18 | }) 19 | -------------------------------------------------------------------------------- /man/ts_tt_installed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ts_tt_installed.R 3 | \name{ts_tt_installed} 4 | \alias{ts_tt_installed} 5 | \title{Test if \href{https://github.com/camwebb/taxon-tools}{taxon-tools} is installed} 6 | \usage{ 7 | ts_tt_installed() 8 | } 9 | \value{ 10 | \code{TRUE} if \href{https://github.com/camwebb/taxon-tools}{taxon-tools} is 11 | installed, or \code{FALSE} if not. 12 | } 13 | \description{ 14 | Test if \href{https://github.com/camwebb/taxon-tools}{taxon-tools} is installed 15 | } 16 | \examples{ 17 | ts_tt_installed() 18 | } 19 | -------------------------------------------------------------------------------- /R/ts_tt_installed.R: -------------------------------------------------------------------------------- 1 | #' Test if [taxon-tools](https://github.com/camwebb/taxon-tools) is installed 2 | #' 3 | #' @return `TRUE` if [taxon-tools](https://github.com/camwebb/taxon-tools) is 4 | #' installed, or `FALSE` if not. 
5 | #' @export 6 | #' 7 | #' @examples 8 | #' ts_tt_installed() 9 | ts_tt_installed <- function() { 10 | # Any error raised while running either command means "not installed" 11 | tryCatch( 12 | { 13 | for (tool in c("parsenames", "matchnames")) { 14 | processx::run(tool, "--version") 15 | } 16 | TRUE 17 | }, 18 | error = function(e) FALSE 19 | ) 20 | } 21 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | home: 2 | title: Standardize Taxonomic Names 3 | description: > 4 | Matches species names to a taxonomic standard. Resolves synonyms consistently and reproducibly. 5 | template: 6 | params: 7 | bootswatch: lumen 8 | reference: 9 | - title: "Parse names" 10 | - contents: 11 | - ts_parse_names 12 | - title: "Match names" 13 | - contents: 14 | - ts_match_names 15 | - title: "Resolve names" 16 | - contents: 17 | - ts_resolve_names 18 | - title: "Datasets" 19 | - contents: 20 | - filmy_taxonomy 21 | - title: "I/O" 22 | - contents: 23 | - ts_write_names 24 | - title: "Utilities" 25 | - contents: 26 | - ts_tt_installed 27 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/ts_parse_names.md: -------------------------------------------------------------------------------- 1 | # Parsing works with docker 2 | 3 | Code 4 | invisible(capture.output(parse_res <- ts_parse_names( 5 | "Foogenus x barspecies var. foosubsp (L.) F. Bar", docker = TRUE))) 6 | parse_res 7 | Output 8 | name id genus_hybrid_sign 9 | 1 Foogenus x barspecies var. foosubsp (L.) F. Bar 5f207ff2-1 10 | genus_name species_hybrid_sign specific_epithet infraspecific_rank 11 | 1 Foogenus × barspecies var. 12 | infraspecific_epithet author 13 | 1 foosubsp (L.) F. 
Bar 14 | 15 | -------------------------------------------------------------------------------- /man/ts_make_name_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{ts_make_name_df} 4 | \alias{ts_make_name_df} 5 | \title{Make a dataframe with taxonomic names} 6 | \usage{ 7 | ts_make_name_df(taxa) 8 | } 9 | \arguments{ 10 | \item{taxa}{Character vector; taxon names to be parsed by taxon-tools \code{parsenames}. 11 | Missing values not allowed. Must all be unique.} 12 | } 13 | \value{ 14 | Dataframe with two columns: \code{id} and \code{name} 15 | } 16 | \description{ 17 | Make a dataframe with taxonomic names 18 | } 19 | \examples{ 20 | \dontrun{ 21 | ts_make_name_df("Foogenus x barspecies var. foosubsp (L.) F. Bar") 22 | } 23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /man/filmy_taxonomy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{filmy_taxonomy} 5 | \alias{filmy_taxonomy} 6 | \title{Taxonomy of filmy ferns (family Hymenophyllaceae)} 7 | \format{ 8 | A data frame with 2729 rows and 31 variables. 9 | } 10 | \source{ 11 | \url{http://www.catalogueoflife.org/} 12 | } 13 | \usage{ 14 | filmy_taxonomy 15 | } 16 | \description{ 17 | A dataset containing taxonomic names and associated metadata for the 18 | fern family Hymenophyllaceae. Downloaded from the 19 | \href{http://www.catalogueoflife.org/}{Catalog of Life}, Version 1.5. 20 | All columns formatted according to 21 | \href{https://dwc.tdwg.org/terms/}{Darwin Core standard}. Only includes taxa 22 | at the species or infraspecies level. 
23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /tests/testthat/test-ts_parse_names.R: -------------------------------------------------------------------------------- 1 | test_that("Input checks work", { 2 | expect_error( 3 | ts_parse_names(c("Foogenus", "Foogenus")), 4 | "Input taxa must be unique" 5 | ) 6 | expect_error( 7 | ts_parse_names(c("Foogenus", NA)), 8 | "Input taxa may not contain NAs" 9 | ) 10 | }) 11 | 12 | test_that("Parsing works with docker", { 13 | skip_if_no_docker() 14 | expect_snapshot({ 15 | # Need invisible() and capture.output() to suppress spinner 16 | invisible( 17 | capture.output( 18 | parse_res <- ts_parse_names( 19 | "Foogenus x barspecies var. foosubsp (L.) F. Bar", 20 | docker = TRUE 21 | ) 22 | ) 23 | ) 24 | parse_res 25 | }) 26 | }) 27 | 28 | test_that("Parsing works with local taxon-tools", { 29 | skip_if_no_tt() 30 | expect_snapshot( 31 | ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. 
Bar") 32 | ) 33 | }) 34 | -------------------------------------------------------------------------------- /man/ts_write_names.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ts_write_names.R 3 | \name{ts_write_names} 4 | \alias{ts_write_names} 5 | \title{Write out parsed names to a text file} 6 | \usage{ 7 | ts_write_names(df, path) 8 | } 9 | \arguments{ 10 | \item{df}{Dataframe with parsed names} 11 | 12 | \item{path}{Path to write dataframe 13 | 14 | Writes out parsed names in a format that can be used by \href{https://github.com/camwebb/taxon-tools}{taxon-tools} 15 | (each part of the scientific name is separated by the pipe symbol (|), with one name per line).} 16 | } 17 | \value{ 18 | Path to parsed names 19 | } 20 | \description{ 21 | Write out parsed names to a text file 22 | } 23 | \examples{ 24 | if (ts_tt_installed()) { 25 | parsed_names <- ts_parse_names( 26 | "Foogenus x barspecies var. foosubsp (L.) F. 
Bar") 27 | temp_file <- tempfile() 28 | ts_write_names(parsed_names, temp_file) 29 | readLines(temp_file) 30 | file.remove(temp_file) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /R/globals.R: -------------------------------------------------------------------------------- 1 | # Generated by roxyglobals: do not edit by hand 2 | 3 | utils::globalVariables(c( 4 | "namestring", # 5 | "id", # 6 | "key_id", # 7 | "record", # 8 | "namestring_query", # 9 | "name", # 10 | "match_type", # 11 | "record", # 12 | "id", # 13 | "name", # 14 | "reference", # 15 | "match_type", # 16 | "result_type", # 17 | "acceptedNameUsageID", # 18 | "taxonID", # 19 | "taxonomicStatus", # 20 | "scientificName", # 21 | "resolved_name", # 22 | "resolved_status", # 23 | "n", # 24 | "matched_name", # 25 | "matched_status", # 26 | "query", # 27 | "result_type", # 28 | "n", # 29 | NULL 30 | )) 31 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2019 Joel Nitta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/testthat/test-ts_write_names.R: -------------------------------------------------------------------------------- 1 | test_that("Input checks work", { 2 | expect_error( 3 | ts_write_names("Foogenus", tempfile()), 4 | "df must be of class 'data\\.frame'" 5 | ) 6 | partial_names_df <- data.frame( 7 | id = "1", 8 | genus_hybrid_sign = "x" 9 | ) 10 | expect_error( 11 | ts_write_names(partial_names_df, tempfile()), 12 | "df must include the following columns" 13 | ) 14 | }) 15 | 16 | test_that("Produces expected output file with docker", { 17 | skip_if_no_docker() 18 | parsed_names <- ts_parse_names( 19 | "Foogenus x barspecies var. foosubsp (L.) F. Bar", 20 | docker = TRUE 21 | ) 22 | expect_snapshot_file( 23 | ts_write_names(parsed_names, "parsed_name.txt"), 24 | "parsed_name.txt" 25 | ) 26 | file.remove("parsed_name.txt") 27 | }) 28 | 29 | test_that("Produces expected output file without docker", { 30 | skip_if_no_tt() 31 | parsed_names <- ts_parse_names( 32 | "Foogenus x barspecies var. foosubsp (L.) F. 
Bar" 33 | ) 34 | expect_snapshot_file( 35 | ts_write_names(parsed_names, "parsed_name.txt"), 36 | "parsed_name.txt" 37 | ) 38 | file.remove("parsed_name.txt") 39 | }) 40 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: taxastand 2 | Title: Taxonomic Name Standardization 3 | Version: 1.0.0 4 | Authors@R: 5 | person(given = "Joel", 6 | family = "Nitta", 7 | role = c("aut", "cre"), 8 | email = "joelnitta@gmail.com") 9 | Description: Matches species names to a taxonomic standard. Resolves synonyms consistently and reproducibly. 10 | License: MIT + file LICENSE 11 | Encoding: UTF-8 12 | LazyData: true 13 | SystemRequirements: 14 | parsenames (), 15 | matchnames () 16 | Imports: 17 | assertr, 18 | assertthat, 19 | digest, 20 | dplyr, 21 | fs, 22 | glue, 23 | magrittr, 24 | processx, 25 | tibble, 26 | tidyr 27 | Roxygen: list( 28 | markdown = TRUE, 29 | roclets = c("collate", "namespace", "rd", "roxyglobals::global_roclet")) 30 | RoxygenNote: 7.3.2 31 | Depends: R (>= 4.1.0) 32 | Suggests: 33 | rmarkdown, 34 | knitr, 35 | roxyglobals (>= 0.2.1), 36 | testthat (>= 3.0.0), 37 | babelwhale 38 | Config/testthat/edition: 3 39 | Remotes: 40 | anthonynorth/roxyglobals 41 | VignetteBuilder: knitr 42 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | tags: ['*'] 7 | 8 | name: pkgdown 9 | 10 | jobs: 11 | pkgdown: 12 | runs-on: ubuntu-latest 13 | env: 14 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - uses: r-lib/actions/setup-pandoc@v1 19 | 20 | - uses: r-lib/actions/setup-r@v1 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v1 25 | with: 26 | extra-packages: pkgdown 27 | needs: website 28 | 29 | - name: Install dependencies 30 | run: | 31 | sudo apt-get install -y --no-install-recommends gawk 32 | git clone https://github.com/camwebb/taxon-tools.git 33 | cd taxon-tools 34 | git checkout 8f8b5e2611b6fdef1998b7878e93e60a9bc7c130 35 | make check 36 | sudo make install 37 | cd .. 38 | 39 | - name: Deploy package 40 | run: | 41 | git config --local user.name "$GITHUB_ACTOR" 42 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 43 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 44 | -------------------------------------------------------------------------------- /data-raw/filmy_taxonomy.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | # Load the example standard taxonomy for resolving names. 4 | 5 | # The example standard taxonomy is the family Hymenophyllaceae from 6 | # Catalog of Life (CoL). CoL provides persistent links to database dumps. 
7 | # This one was obtained by selecting "Hymenophyllaceae" for "family" 8 | # and "Complete data" on http://www.catalogueoflife.org/DCA_Export/index.php 9 | # on 2019-06-19 10 | 11 | # Download the zip file 12 | temp_dir <- fs::dir_create(tempdir()) 13 | download.file( 14 | "http://www.catalogueoflife.org/DCA_Export/zip/archive-family-hymenophyllaceae-bl3.zip", 15 | fs::path(temp_dir, "archive-family-hymenophyllaceae-bl3.zip") 16 | ) 17 | 18 | # Unzip 19 | unzip( 20 | fs::path(temp_dir, "archive-family-hymenophyllaceae-bl3.zip"), 21 | exdir = temp_dir 22 | ) 23 | 24 | # Read in taxonomy table, keep only 25 | # names at species rank and below 26 | # (warnings are produced because names at genus level 27 | # and above have NA for many fields). 28 | filmy_taxonomy <- read_tsv(fs::path(temp_dir, "taxa.txt")) %>% 29 | filter(str_detect(taxonRank, "species")) 30 | 31 | # Replace "v. d. Bosch" with "V. D. Bosch" (fixed(): "." must match literally) 32 | # see https://github.com/camwebb/taxon-tools/issues/10 33 | filmy_taxonomy <- 34 | filmy_taxonomy %>% 35 | dplyr::mutate( 36 | scientificName = stringr::str_replace_all( 37 | scientificName, 38 | stringr::fixed("v. d. Bosch"), 39 | "V. D. 
Bosch" 40 | ) 41 | ) 42 | 43 | usethis::use_data(filmy_taxonomy) 44 | -------------------------------------------------------------------------------- /tests/testthat/test-ts_resolve_names.R: -------------------------------------------------------------------------------- 1 | data(filmy_taxonomy) 2 | 3 | test_that("Input checks work", { 4 | expect_error( 5 | ts_resolve_names(10, data.frame(genus = "Foogenus")), 6 | "query must be of class" 7 | ) 8 | expect_error( 9 | ts_resolve_names(data.frame(genus = "Foogenus"), 10), 10 | "ref_taxonomy must be of class" 11 | ) 12 | }) 13 | 14 | test_that("Produces expected output with docker", { 15 | skip_if_no_docker() 16 | # Query a misspelled name 17 | match_results <- ts_match_names( 18 | query = "Gonocormus minutum", 19 | reference = unique(filmy_taxonomy$scientificName), 20 | simple = TRUE, 21 | docker = TRUE 22 | ) 23 | expect_s3_class( 24 | ts_resolve_names(match_results, filmy_taxonomy), 25 | "data.frame" 26 | ) 27 | expect_s3_class( 28 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy, docker = TRUE), 29 | "data.frame" 30 | ) 31 | expect_snapshot(match_results) 32 | }) 33 | 34 | 35 | test_that("Produces expected output without docker", { 36 | skip_if_no_tt() 37 | # Query a misspelled name 38 | match_results <- ts_match_names( 39 | query = "Gonocormus minutum", 40 | reference = unique(filmy_taxonomy$scientificName), 41 | simple = TRUE 42 | ) 43 | expect_s3_class( 44 | ts_resolve_names(match_results, filmy_taxonomy), 45 | "data.frame" 46 | ) 47 | expect_s3_class( 48 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy), 49 | "data.frame" 50 | ) 51 | expect_snapshot(match_results) 52 | }) 53 | -------------------------------------------------------------------------------- /R/ts_write_names.R: -------------------------------------------------------------------------------- 1 | #' Write out parsed names to a text file 2 | #' 3 | #' @param df Dataframe with parsed names 4 | #' @param path Path to write dataframe 5 
| #' 6 | #' @details Writes out parsed names in a format that can be used by [taxon-tools](https://github.com/camwebb/taxon-tools) 7 | #' (each part of the scientific name is separated by the pipe symbol (|), with one name per line). 8 | #' 9 | #' @autoglobal 10 | #' @return Path to parsed names 11 | #' @export 12 | #' @examples 13 | #' if (ts_tt_installed()) { 14 | #' parsed_names <- ts_parse_names( 15 | #' "Foogenus x barspecies var. foosubsp (L.) F. Bar") 16 | #' temp_file <- tempfile() 17 | #' ts_write_names(parsed_names, temp_file) 18 | #' readLines(temp_file) 19 | #' file.remove(temp_file) 20 | #' } 21 | ts_write_names <- function(df, path) { 22 | # Make vector of standard taxon-tools columns 23 | tt_col_names <- c( 24 | "id", 25 | "genus_hybrid_sign", 26 | "genus_name", 27 | "species_hybrid_sign", 28 | "specific_epithet", 29 | "infraspecific_rank", 30 | "infraspecific_epithet", 31 | "author" 32 | ) 33 | 34 | assertthat::assert_that( 35 | inherits(df, "data.frame"), 36 | msg = "df must be of class 'data.frame'" 37 | ) 38 | assertthat::assert_that( 39 | isTRUE(all(tt_col_names %in% colnames(df))), 40 | msg = glue::glue( 41 | "df must include the following columns: {paste(tt_col_names, collapse = ', ')}" 42 | ) 43 | ) 44 | 45 | # Subset to only taxon-tools columns, in order. Done before NA handling so 46 | # that unrelated columns of other types cannot make replace_na() fail 47 | df <- df[, tt_col_names, drop = FALSE] 48 | 49 | # Coerce to character, then replace NA values with "" 50 | df <- dplyr::mutate( 51 | df, 52 | dplyr::across( 53 | dplyr::everything(), 54 | ~ tidyr::replace_na(as.character(.x), "") 55 | ) 56 | ) 57 | 58 | # taxon-tools uses pipe as separator 59 | df <- tidyr::unite(df, col = "text", dplyr::all_of(tt_col_names), sep = "|") 60 | 61 | # write out text 62 | writeLines(df$text, path) 63 | 64 | path 65 | } 66 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/ts_match_names.md: -------------------------------------------------------------------------------- 1 | # Produces expected output in docker 2 | 3 | Code 4 | match_res 5 | Output 6 | query reference match_type 
id_query id_ref 7 | 1 Crepidomanes minutus Crepidomanes minutum auto_fuzzy c1ad73ec-1 19b861c8-1 8 | genus_hybrid_sign_query genus_name_query species_hybrid_sign_query 9 | 1 Crepidomanes 10 | specific_epithet_query infraspecific_rank_query infraspecific_epithet_query 11 | 1 minutus 12 | author_query genus_hybrid_sign_ref genus_name_ref species_hybrid_sign_ref 13 | 1 Crepidomanes 14 | specific_epithet_ref infraspecific_rank_ref infraspecific_epithet_ref 15 | 1 minutum 16 | author_ref 17 | 1 18 | 19 | # Manually matched names work 20 | 21 | Code 22 | match_res 23 | Output 24 | query reference match_type 25 | 1 Crepidomanes minutus Crepidomanes minutum auto_fuzzy 26 | 2 Hymeefee erae Hymenophyllum polyanthos manual 27 | 28 | # Names that can't be parsed don't show up in results 29 | 30 | Code 31 | match_res 32 | Output 33 | # A tibble: 1 x 3 34 | query reference match_type 35 | 36 | 1 Crepidomanes minutus Crepidomanes minutum auto_fuzzy 37 | 38 | # Manually matched names work with collapsed infrasp names 39 | 40 | Code 41 | match_res 42 | Output 43 | # A tibble: 6 x 3 44 | query reference match_type 45 | 46 | 1 Crepidomanes minutus Crepidomanes minutum auto_fuzzy 47 | 2 Crepidomanes minutawtaw Crepidomanes minutum manual 48 | 3 Blechnum lunare var. lunare Blechnum lunare exact 49 | 4 Blechnum lunare Blechnum lunare exact 50 | 5 Bar foo var. 
foo Bar foo manual 51 | 6 Bar foo Bar foo exact 52 | 53 | -------------------------------------------------------------------------------- /man/ts_parse_names.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ts_parse_names.R 3 | \name{ts_parse_names} 4 | \alias{ts_parse_names} 5 | \title{Parse taxonomic names} 6 | \usage{ 7 | ts_parse_names( 8 | taxa, 9 | tbl_out = getOption("ts_tbl_out", default = FALSE), 10 | quiet = FALSE, 11 | docker = getOption("ts_docker", default = FALSE) 12 | ) 13 | } 14 | \arguments{ 15 | \item{taxa}{Character vector; taxon names to be parsed by taxon-tools 16 | \code{parsenames}. Missing values not allowed. Must all be unique.} 17 | 18 | \item{tbl_out}{Logical vector of length 1; should a tibble be returned? 19 | If \code{FALSE} (default), output will be a data.frame. This argument can 20 | be controlled via the option \code{ts_tbl_out}; see Examples.} 21 | 22 | \item{quiet}{Logical; if TRUE, suppress warning messages that would normally 23 | be issued} 24 | 25 | \item{docker}{Logical; if TRUE, docker will be used to run taxon-tools 26 | (so that taxon-tools need not be installed).} 27 | } 28 | \value{ 29 | A dataframe including the following columns. 30 | \itemize{ 31 | \item id: A unique ID number assigned to the input name 32 | \item name: The input name 33 | \item genus_hybrid_sign: Hybrid sign for genus 34 | \item genus_name: Genus name 35 | \item species_hybrid_sign: Hybrid sign for species 36 | \item specific_epithet: Specific epithet (name) 37 | \item infraspecific_rank: Infraspecific rank 38 | \item infraspecific_epithet: Infraspecific epithet (name) 39 | \item author: Name of taxon 40 | } 41 | } 42 | \description{ 43 | Requires \href{https://github.com/camwebb/taxon-tools}{taxon-tools} or docker 44 | to be installed. 
45 | } 46 | \details{ 47 | Parses scientific names into their component parts (genus, species, variety, 48 | author, etc). 49 | } 50 | \examples{ 51 | # Using local taxon-tools installation 52 | if (ts_tt_installed()) { 53 | 54 | ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar") 55 | ts_parse_names( 56 | "Foogenus x barspecies var. foosubsp (L.) F. Bar", tbl_out = TRUE) 57 | 58 | # If you always want tibble output without specifying `tbl_out = TRUE` 59 | # every time, set the option: 60 | options(ts_tbl_out = TRUE) 61 | ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar") 62 | ts_parse_names("Crepidomanes minutum (Blume) K. Iwats.") 63 | 64 | } 65 | 66 | # Using docker 67 | if (babelwhale::test_docker_installation()) { 68 | 69 | ts_parse_names( 70 | "Foogenus x barspecies var. foosubsp (L.) F. Bar", 71 | docker = TRUE) 72 | 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /tests/testthat/test-ts_match_names.R: --------------------------------------------------------------------------------
# Tests for ts_match_names().
#
# NOTE: several tests below call expect_snapshot(match_res). Keep the variable
# name `match_res` as-is when editing; the stored snapshots in
# tests/testthat/_snaps/ts_match_names.md were generated from these calls.

# Argument validation: a numeric value is rejected for `query` and for
# `reference`, whether the other argument is a character vector or a dataframe.
test_that("Input checks work", {
  expect_error(
    ts_match_names(10, "Foogenus"),
    "query must be of class"
  )
  expect_error(
    ts_match_names("Foogenus", 10),
    "reference must be of class"
  )
  expect_error(
    ts_match_names(10, data.frame(genus = "Foogenus")),
    "query must be of class"
  )
  expect_error(
    ts_match_names(data.frame(genus = "Foogenus"), 10),
    "reference must be of class"
  )
})

# Match a single query against a single reference with docker = TRUE.
# Skipped when docker is not installed.
test_that("Produces expected output in docker", {
  skip_if_no_docker()
  match_res <- ts_match_names(
    "Crepidomanes minutus",
    "Crepidomanes minutum",
    docker = TRUE
  )
  expect_s3_class(match_res, "data.frame")
  expect_snapshot(match_res)
})

# Same query/reference pair as above, but without the `docker` argument.
# Skipped when taxon-tools is not installed.
test_that("Produces expected output without docker", {
  skip_if_no_tt()
  match_res <- ts_match_names(
    "Crepidomanes minutus",
    "Crepidomanes minutum"
  )
  expect_s3_class(match_res, "data.frame")
  expect_snapshot(match_res)
})

# `manual_match` supplies an explicit query -> match pair for "Hymeefee erae".
test_that("Manually matched names work", {
  skip_if_no_docker()
  match_res <- ts_match_names(
    query = c("Crepidomanes minutus", "Hymeefee erae"),
    reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
    manual_match = data.frame(
      query = "Hymeefee erae",
      match = "Hymenophyllum polyanthos"
    ),
    simple = TRUE,
    docker = TRUE
  )
  expect_snapshot(match_res)
})

# The first query is a string joining three names with " x "; per the test
# title it is expected not to be parseable and to be absent from the results.
test_that("Names that can't be parsed don't show up in results", {
  skip_if_no_docker()
  match_res <- ts_match_names(
    query = c(
      "Vanden kalamocarpa x Vanden nipponica x Vanden striata",
      "Crepidomanes minutus"
    ),
    reference = c(
      "Crepidomanes minutum"
    ),
    simple = TRUE,
    docker = TRUE,
    tbl_out = TRUE
  )
  expect_snapshot(match_res)
})

# Combines `manual_match` with `collapse_infra = TRUE`; one of the manually
# matched queries ("Bar foo var. foo") has identical specific and
# infraspecific epithets.
test_that("Manually matched names work with collapsed infrasp names", {
  skip_if_no_docker()
  match_res <- ts_match_names(
    query = c(
      "Crepidomanes minutus",
      "Crepidomanes minutawtaw",
      "Blechnum lunare var. lunare",
      "Blechnum lunare",
      "Bar foo var. foo",
      "Bar foo"
    ),
    reference = c(
      "Crepidomanes minutum",
      "Hymenophyllum polyanthos",
      "Blechnum lunare",
      "Bar foo"
    ),
    manual_match = data.frame(
      query = c("Bar foo var. foo", "Crepidomanes minutawtaw"),
      match = c("Bar foo", "Crepidomanes minutum")
    ),
    max_dist = 10,
    match_no_auth = FALSE,
    match_canon = FALSE,
    collapse_infra = TRUE,
    collapse_infra_exclude = NULL,
    simple = TRUE,
    docker = TRUE,
    tbl_out = TRUE
  )
  expect_snapshot(match_res)
})

# Each expect_error() below passes a differently malformed `manual_match`
# (or an incompatible `query`) and checks the resulting error message.
test_that("Incorrectly specified manual match fails", {
  skip_if_no_docker()
  # `match` value is not one of the reference names
  expect_error(
    ts_match_names(
      query = c("Crepidomanes minutus", "Hymeefee erae"),
      reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
      manual_match = data.frame(
        query = "Hymeefee erae",
        match = "Hymenophyllum poWHAT"
      ),
      simple = TRUE,
      docker = TRUE
    ),
    "One or more manually matched reference names not in reference data"
  )
  # The same query appears twice in `manual_match`
  expect_error(
    ts_match_names(
      query = c("Crepidomanes minutus", "Hymeefee erae"),
      reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
      manual_match = data.frame(
        query = c("Crepidomanes minutus", "Crepidomanes minutus"),
        match = c("Hymenophyllum polyanthos", "Crepidomanes minutum")
      ),
      simple = TRUE,
      docker = TRUE
    ),
    "All values of manual_match\\$query must be unique"
  )
  # `manual_match` has a `name` column instead of `query`
  expect_error(
    ts_match_names(
      query = c("Crepidomanes minutus", "Hymeefee erae"),
      reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
      manual_match = data.frame(
        name = c("Hymenophyllum polyantha", "Crepidomanes minutu"),
        match = c("Hymenophyllum polyanthos", "Crepidomanes minutum")
      ),
      simple = TRUE,
      docker = TRUE
    ),
    "manual_match must have `query` and `match` columns"
  )
  # `query` is the output of ts_parse_names() rather than a character vector
  expect_error(
    ts_match_names(
      query = ts_parse_names("Hymenophyllum polyantha", docker = TRUE),
      reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
      manual_match = data.frame(
        query = c("Hymenophyllum polyantha"),
        match = c("Hymenophyllum polyanthos")
      ),
      simple = TRUE,
      docker = TRUE
    ),
    "manual_match can only be used if query is a character vector"
  )
})

-------------------------------------------------------------------------------- /man/ts_resolve_names.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ts_resolve_names.R 3 | \name{ts_resolve_names} 4 | \alias{ts_resolve_names} 5 | \title{Resolve synonyms in taxonomic names} 6 | \usage{ 7 | ts_resolve_names( 8 | query, 9 | ref_taxonomy, 10 | max_dist = 10, 11 | match_no_auth = FALSE, 12 | match_canon = FALSE, 13 | collapse_infra = FALSE, 14 | collapse_infra_exclude = NULL, 15 | docker = getOption("ts_docker", default = FALSE), 16 | tbl_out = getOption("ts_tbl_out", default = FALSE) 17 | ) 18 | } 19 | \arguments{ 20 | \item{query}{Character vector or dataframe; taxonomic names to be resolved. 21 | If a character vector, missing values not allowed and all values must be 22 | unique. If a dataframe, should be taxonomic names matched with 23 | \code{\link{ts_match_names}()}.} 24 | 25 | \item{ref_taxonomy}{Dataframe; reference taxonomic data adhering to the 26 | \href{https://dwc.tdwg.org/terms/#taxon}{Darwin Core standard} with the 27 | following columns: 28 | \itemize{ 29 | \item \code{taxonID}: \href{https://dwc.tdwg.org/terms/#dwc:taxonID}{Unique identifier for each taxon}. 30 | \item \code{acceptedNameUsageID}: If the taxon is a synonym, the \href{https://dwc.tdwg.org/terms/#dwc:acceptedNameUsageID}{unique identifier for the accepted name} 31 | \item \code{taxonomicStatus}: \href{https://dwc.tdwg.org/terms/#dwc:taxonomicStatus}{The status of the use of the \code{scientificName} as a label for the taxon}.
32 | \item \code{scientificName}: \href{https://dwc.tdwg.org/terms/#dwc:scientificName}{The full scientific name of the taxon}, 33 | with authorship and date information if known. 34 | }} 35 | 36 | \item{max_dist}{Max Levenshtein distance to allow during fuzzy matching 37 | (total insertions, deletions and substitutions). Default: 10.} 38 | 39 | \item{match_no_auth}{Logical; If no author is given in the query and the name 40 | (without author) occurs only once in the reference, accept the name in the 41 | reference as a match. Default: to not allow such a match (\code{FALSE}).} 42 | 43 | \item{match_canon}{Logical; Allow a "canonical name" match if only the genus, 44 | species epithet, and infraspecific epithet (if present) match exactly. 45 | Default: to not allow such a match (\code{FALSE}).} 46 | 47 | \item{collapse_infra}{Logical; if the specific epithet and infraspecific 48 | epithet are the same, drop the infraspecific rank and epithet from the query. 49 | For more information, see \code{\link{ts_match_names}()}.} 50 | 51 | \item{collapse_infra_exclude}{Character vector; taxonomic names to exclude 52 | collapsing with \code{collapse_infra}. Any names used must match those in \code{query} 53 | exactly, or they won't be excluded.} 54 | 55 | \item{docker}{Logical; if TRUE, docker will be used to run taxon-tools 56 | (so that taxon-tools need not be installed).} 57 | 58 | \item{tbl_out}{Logical vector of length 1; should a tibble be returned? 59 | If \code{FALSE} (default), output will be a data.frame. This argument can 60 | be controlled via the option \code{ts_tbl_out}; see Examples.} 61 | } 62 | \value{ 63 | Dataframe; results of resolving synonyms in matched taxonomic names. 
64 | Includes the following columns: 65 | \itemize{ 66 | \item \code{query}: Query name 67 | \item \code{resolved_name}: Accepted name after resolving synonyms 68 | \item \code{matched_name}: Name matched to query 69 | \item \code{resolved_status}: Taxonomic status of the resolved name (same as \code{taxonomicStatus} in \code{ref_taxonomy}) 70 | \item \code{matched_status}: Taxonomic status of the matched name (same as \code{taxonomicStatus} in \code{ref_taxonomy}) 71 | \item \code{match_type}: Type of match (for a summary of match types, \href{https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes}{see taxon-tools manual}) 72 | } 73 | 74 | Names that could not be matched or resolve to multiple, different synonyms 75 | have \code{NA} for \code{resolved_name}. 76 | } 77 | \description{ 78 | After matching taxonomic names to a reference, some may match synonyms. This 79 | function resolves synonyms to their accepted names. 80 | } 81 | \details{ 82 | \code{query} can take as input either a character vector of taxonomic names, or 83 | the output of \code{\link{ts_match_names}()}. If the former, it will run 84 | \code{\link{ts_match_names}()} to match the query to \code{ref_taxonomy}, then 85 | resolve synonyms. If the latter, the scientific names in \code{ref_taxonomy} 86 | should be the same used as reference with \code{\link{ts_match_names}()} 87 | (this is not checked). 88 | 89 | \code{ref_taxonomy} must be taxonomic data adhering to the \href{https://dwc.tdwg.org/terms/#taxon}{Darwin Core standard}. 90 | Darwin Core includes many terms, but only four (\code{taxonID}, 91 | \code{acceptedNameUsageID}, \code{taxonomicStatus}, and \code{scientificName}) are required 92 | for this function. 
93 | } 94 | \examples{ 95 | if (ts_tt_installed()) { 96 | # Load reference taxonomy in Darwin Core format 97 | data(filmy_taxonomy) 98 | 99 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy) 100 | # If you always want tibble output without specifying `tbl_out = TRUE` 101 | # every time, set the option: 102 | options(ts_tbl_out = TRUE) 103 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy) 104 | } 105 | 106 | } 107 | -------------------------------------------------------------------------------- /R/utils.R: --------------------------------------------------------------------------------
#' Make a dataframe with taxonomic names
#'
#' Pairs each input name with an ID made of the first 8 characters of the
#' hash of the whole input vector, followed by "-" and the position of the
#' name in the input.
#'
#' @param taxa Character vector; taxon names to be parsed by taxon-tools
#' `parsenames`. Missing values not allowed. Must all be unique.
#'
#' @return Dataframe with two columns: `id` and `name`
#' @keywords internal
#' @examples
#' \dontrun{
#' ts_make_name_df("Foogenus x barspecies var. foosubsp (L.) F. Bar")
#' }
ts_make_name_df <- function(taxa) {
  assertthat::assert_that(is.character(taxa))
  assertthat::assert_that(
    assertthat::noNA(taxa),
    msg = "Input taxa may not contain NAs"
  )
  assertthat::assert_that(
    all(assertr::is_uniq(taxa)),
    msg = "Input taxa must be unique"
  )

  # Format input names as data frame with unique ID
  # ID is combination of first 8 chars of hash of the
  # input (taxa), followed by "-" and integer
  taxa_df <- data.frame(name = taxa)
  hash_prefix <- substr(digest::digest(taxa), 1, 8)
  # seq_len() + sprintf() give a zero-length result for zero rows, whereas
  # 1:nrow() would count down (c(1, 0)) and paste() would return one string
  taxa_df$id <- sprintf("%s-%d", hash_prefix, seq_len(nrow(taxa_df)))

  taxa_df[, c("id", "name")]
}

#' Classify results of taxon-tools matching
#'
#' Counts how many rows each `query` has in the match results and labels every
#' row as `"single_match"`, `"mult_match"`, or `"no_match"` based on that count
#' and on `match_type`.
#'
#' @param match_results Dataframe; output of ts_match_names(). Must include
#' the columns `query` and `match_type`.
#'
#' @return Dataframe with column `result_type` added
#' @keywords internal
#' @autoglobal
ts_classify_result <- function(match_results) {
  assertthat::assert_that(
    inherits(match_results, "data.frame"),
    msg = "match_results must be of class 'data.frame'"
  )
  match_results %>%
    # `n` = number of rows (candidate matches) per query
    dplyr::add_count(query) %>%
    dplyr::mutate(
      result_type = dplyr::case_when(
        match_type != "no_match" & n == 1 ~ "single_match",
        match_type != "no_match" & n > 1 ~ "mult_match",
        match_type == "no_match" ~ "no_match",
        TRUE ~ NA_character_
      )
    ) %>%
    # Every row must have been classified (e.g., `match_type` may not be NA)
    assertr::assert(assertr::not_na, result_type) %>%
    dplyr::select(-n)
}

# Helper function for tests: skip test if docker is not installed
skip_if_no_docker <- function() {
  if (babelwhale::test_docker_installation()) {
    return(invisible(TRUE))
  }
  testthat::skip("docker not installed")
}

# Helper function for tests: skip test if taxon-tools is not installed
skip_if_no_tt <- function() {
  if (ts_tt_installed()) {
    return(invisible(TRUE))
  }
  testthat::skip("taxon-tools not installed")
}

#' Run a containerised command with automatic mounting of files
#'
#' Similar to [babelwhale::run()], but automatically mounts files (and
#' directories) so the user doesn't have to keep track of volumes.
#'
#' The main difference to [babelwhale::run()] is the use of names for the
#' `args`; any file (or directory) that should be mounted inside the container
#' must be named `file`. The other elements (arguments) don't need to be named.
#' Note that it is fine to have multiple elements with the same name (`file`).
#'
#' This should generally work as long as the command accepts absolute paths
#' for file input. If that is not the case, use [babelwhale::run()] instead and
#' specify paths and mounting manually.
#'
#' @inheritParams babelwhale::run
#' @param args Character vector, arguments to the command. Any files or
#' directories that should be mounted must be named "file" (see example).
#' @param wd Local working directory to run command. If specified, the working
#' directory will be mounted to the docker container.
#' @param wd_in_container Working directory to run command in
#' the container. Defaults to the working directory mounted to the container
#' (`wd`).
#'
#' @return List, formatted as output from [processx::run()]
#' @noRd
#' @examples
#' \dontrun{
#' if (babelwhale::test_docker_installation()) {
#'
#' # Count the number of lines in the DESCRIPTION and LICENSE
#' # files of the babelwhale package
#' run_auto_mount(
#'   container_id = "alpine",
#'   command = "wc",
#'   args = c("-l",
#'     file = system.file("DESCRIPTION", package = "babelwhale"),
#'     file = system.file("LICENSE", package = "babelwhale")
#'   )
#' )
#'
#' }
#' }
run_auto_mount <- function(
  container_id,
  command,
  args = NULL,
  wd = NULL,
  wd_in_container = NULL,
  environment_variables = NULL,
  debug = FALSE,
  verbose = FALSE,
  stdout = "|",
  stderr = "|"
) {
  # Convert paths of file arguments to absolute for docker
  is_file_arg <- names(args) == "file"
  file_args <- args[is_file_arg]
  in_path <- fs::path_abs(file_args)
  in_file <- fs::path_file(in_path)
  in_dir <- fs::path_dir(in_path)

  # One mount point per file argument. seq_along() (rather than
  # 1:length()) so that no indices are produced when there are no file
  # arguments.
  mount_idx <- seq_along(in_dir)

  # Make (most likely) unique prefix for folder name that
  # won't conflict with an existing folder in the container
  # based on the hash of the container id and command
  prefix <- digest::digest(c(container_id, command))

  # Specify volume mounting for working directory
  wd_volume <- NULL
  if (!is.null(wd)) {
    wd_path <- fs::path_abs(wd)
    if (is.null(wd_in_container)) wd_in_container <- glue::glue("/{prefix}_wd")
    wd_volume <- glue::glue("{wd_path}:{wd_in_container}")
  }

  # Specify all volumes: one per file, plus working directory
  volumes <- unique(
    c(
      glue::glue("{in_dir}:/{prefix}_{mount_idx}"),
      wd_volume
    )
  )

  # Replace file arg paths with location in container
  files_in_container <- glue::glue("/{prefix}_{mount_idx}/{in_file}")
  args[is_file_arg] <- files_in_container

  # Run docker via babelwhale
  babelwhale::run(
    container_id = container_id,
    command = command,
    args = args,
    volumes = volumes,
    workspace = wd_in_container,
    environment_variables = environment_variables,
    debug = debug,
    verbose = verbose,
    stdout = stdout,
    stderr = stderr
  )
}

-------------------------------------------------------------------------------- /R/ts_parse_names.R: --------------------------------------------------------------------------------
#' Parse taxonomic names
#'
#' Requires [taxon-tools](https://github.com/camwebb/taxon-tools) or docker
#' to be installed.
#'
#' Parses scientific names into their component parts (genus, species, variety,
#' author, etc).
#'
#' @param taxa Character vector; taxon names to be parsed by taxon-tools
#' `parsenames`. Missing values not allowed. Must all be unique.
#' @param tbl_out Logical vector of length 1; should a tibble be returned?
#' If `FALSE` (default), output will be a data.frame. This argument can
#' be controlled via the option `ts_tbl_out`; see Examples.
#' @param quiet Logical; if TRUE, suppress warning messages that would normally
#' be issued
#' @param docker Logical; if TRUE, docker will be used to run taxon-tools
#' (so that taxon-tools need not be installed).
#'
#' @return A dataframe including the following columns.
#' - id: A unique ID assigned to the input name
#' - name: The input name
#' - genus_hybrid_sign: Hybrid sign for genus
#' - genus_name: Genus name
#' - species_hybrid_sign: Hybrid sign for species
#' - specific_epithet: Specific epithet (name)
#' - infraspecific_rank: Infraspecific rank
#' - infraspecific_epithet: Infraspecific epithet (name)
#' - author: Taxonomic author of the name
#'
#' Names that could not be parsed are excluded from the output, with a warning
#' unless `quiet` is `TRUE`. An error is raised if no name could be parsed.
#'
#' @autoglobal
#' @export
#' @examples
#' # Using local taxon-tools installation
#' if (ts_tt_installed()) {
#'
#' ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
#' ts_parse_names(
#'   "Foogenus x barspecies var. foosubsp (L.) F. Bar", tbl_out = TRUE)
#'
#' # If you always want tibble output without specifying `tbl_out = TRUE`
#' # every time, set the option:
#' options(ts_tbl_out = TRUE)
#' ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
#' ts_parse_names("Crepidomanes minutum (Blume) K. Iwats.")
#'
#' }
#'
#' # Using docker
#' if (babelwhale::test_docker_installation()) {
#'
#' ts_parse_names(
#'   "Foogenus x barspecies var. foosubsp (L.) F. Bar",
#'   docker = TRUE)
#'
#' }
#'
ts_parse_names <- function(
  taxa,
  tbl_out = getOption("ts_tbl_out", default = FALSE),
  quiet = FALSE,
  docker = getOption("ts_docker", default = FALSE)
) {
  # Check input: must be character vector, no NA values, all unique
  assertthat::assert_that(is.character(taxa))
  assertthat::assert_that(
    assertthat::noNA(taxa),
    msg = "Input taxa may not contain NAs"
  )
  assertthat::assert_that(
    all(assertr::is_uniq(taxa)),
    msg = "Input taxa must be unique"
  )
  assertthat::assert_that(assertthat::is.flag(tbl_out))
  assertthat::assert_that(assertthat::is.flag(docker))

  # Write out names formatted for parsing with taxon-tools to temp file
  # format:
  # `id_num|taxon_name`
  # for example,
  # `x-234|Foogenus x barspecies var. foosubsp (L.) F. Bar`
  taxa_tbl <- ts_make_name_df(taxa)
  taxa_tbl$record <- paste(taxa_tbl$id, taxa_tbl$name, sep = "|")
  ref_taxa_txt_file <- tempfile(
    pattern = digest::digest(taxa),
    fileext = ".txt"
  )
  # Remove the temp file however the function exits, so it is not left
  # behind when a check or the parser call below fails
  on.exit(unlink(ref_taxa_txt_file), add = TRUE)
  writeLines(taxa_tbl$record, ref_taxa_txt_file)

  # Parse reference names with taxon tools
  if (isTRUE(docker)) {
    assertthat::assert_that(
      requireNamespace("babelwhale", quietly = TRUE),
      msg = "babelwhale needs to be installed to use docker"
    )
    assertthat::assert_that(
      babelwhale::test_docker_installation(),
      msg = "docker not installed"
    )
    ref_parsed <- run_auto_mount(
      container_id = "camwebb/taxon-tools:v1.3.0",
      command = "parsenames",
      args = c(file = ref_taxa_txt_file)
    )
  } else {
    assertthat::assert_that(
      ts_tt_installed(),
      msg = "taxon-tools not installed"
    )
    ref_parsed <- processx::run("parsenames", ref_taxa_txt_file)
  }

  # Read in results of parsing, format as dataframe

  # The output is originally one record per line, with fields separated by
  # '|' (pipe symbol)
  parsed_names <- data.frame(
    record = strsplit(ref_parsed[["stdout"]], "\n")[[1]]
  )

  # Split these into separate columns
  name_parts <- c(
    "genus_hybrid_sign",
    "genus_name",
    "species_hybrid_sign",
    "specific_epithet",
    "infraspecific_rank",
    "infraspecific_epithet",
    "author"
  )

  parsed_names <- tidyr::separate(
    data = parsed_names,
    col = record,
    into = c("id", name_parts),
    sep = "\\|",
    fill = "right",
    remove = FALSE
  )

  # Fill in NA if that name part is missing
  parsed_names[parsed_names == ""] <- NA

  # Add "fail" column if all name parts are missing (couldn't be parsed
  # properly). seq_len() + vapply() keep this a zero-length logical when the
  # parser returned no records, so the check below reports that case.
  parsed_names$fail <- vapply(
    seq_len(nrow(parsed_names)),
    function(i) all(is.na(parsed_names[i, name_parts])),
    logical(1)
  )

  # Early exit if everything failed
  assertthat::assert_that(
    !all(parsed_names$fail),
    msg = "No names could be successfully parsed"
  )

  # Emit warning for failures
  if (any(parsed_names$fail) && isFALSE(quiet)) {
    failed_ids <- parsed_names$id[parsed_names$fail]
    failed_names <- paste(
      taxa_tbl$name[taxa_tbl$id %in% failed_ids],
      collapse = ", "
    )
    warning(glue::glue(
      "The following names could not be parsed and are excluded from results: {failed_names}"
    ))
  }

  # Add back in original name
  parsed_names <- dplyr::left_join(
    parsed_names,
    dplyr::select(taxa_tbl, id, name),
    by = "id"
  )

  # Remove failures, drop "fail" column
  parsed_names <- parsed_names[!parsed_names$fail, ]
  parsed_names$fail <- NULL

  # Return parsed names as dataframe or tibble
  results <- parsed_names[, c("name", "id", name_parts)]

  if (isTRUE(tbl_out)) return(tibble::as_tibble(results))

  results
}

-------------------------------------------------------------------------------- /man/ts_match_names.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ts_match_names.R 3 | \name{ts_match_names} 4 | \alias{ts_match_names} 5 | \title{Match taxonomic names to a reference} 6 | \usage{ 7 | ts_match_names( 8 | query, 9 | reference, 10 | manual_match = NULL, 11 | max_dist = 10, 12 | match_no_auth = FALSE, 13 | match_canon = FALSE, 14 | collapse_infra = FALSE, 15 | collapse_infra_exclude = NULL, 16 | simple = FALSE, 17 | docker = getOption("ts_docker", default = FALSE), 18 | tbl_out = getOption("ts_tbl_out", default = FALSE) 19 | ) 20 | } 21 | \arguments{ 22 | \item{query}{Character vector or dataframe; taxonomic names to be queried. 23 | If a character vector, missing values not allowed and all values must be 24 | unique. 25 | If a dataframe, should be taxonomic names parsed with 26 | \code{\link{ts_parse_names}()}.} 27 | 28 | \item{reference}{Character vector or dataframe; taxonomic names to use as 29 | reference. If a character vector, missing values not allowed and all values 30 | must be unique. If a dataframe, should be taxonomic names parsed with 31 | \code{\link{ts_parse_names}()}.} 32 | 33 | \item{manual_match}{Optional. Dataframe of manually matched names that will 34 | override any results from \code{taxon-tools}. Must include two columns, \code{query} 35 | and \code{match}.} 36 | 37 | \item{max_dist}{Max Levenshtein distance to allow during fuzzy matching 38 | (total insertions, deletions and substitutions).
Default: 10.} 39 | 40 | \item{match_no_auth}{Logical; If no author is given in the query and the name 41 | (without author) occurs only once in the reference, accept the name in the 42 | reference as a match. Default: to not allow such a match (\code{FALSE}).} 43 | 44 | \item{match_canon}{Logical; Allow a "canonical name" match if only the genus, 45 | species epithet, and infraspecific epithet (if present) match exactly. 46 | Default: to not allow such a match (\code{FALSE}).} 47 | 48 | \item{collapse_infra}{Logical; if the specific epithet and infraspecific 49 | epithet are the same, drop the infraspecific rank and epithet from the query.} 50 | 51 | \item{collapse_infra_exclude}{Character vector; taxonomic names to exclude 52 | from collapsing with \code{collapse_infra}. Any names used must match those in 53 | \code{query} exactly, or they won't be excluded.} 54 | 55 | \item{simple}{Logical; return the output in a simplified format with only the 56 | query name, matched reference name, and match type. Default: \code{FALSE}.} 57 | 58 | \item{docker}{Logical; if TRUE, docker will be used to run taxon-tools 59 | (so that taxon-tools need not be installed).} 60 | 61 | \item{tbl_out}{Logical vector of length 1; should a tibble be returned? 62 | If \code{FALSE} (default), output will be a data.frame. 
This argument can 63 | be controlled via the option \code{ts_tbl_out}; see Examples.} 64 | } 65 | \value{ 66 | Dataframe with the following columns (if \code{simple} is \code{FALSE}): 67 | \itemize{ 68 | \item query: Query name 69 | \item reference: Matched reference name 70 | \item match_type: Type of match (for a summary of match types, \href{https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes}{see taxon-tools manual}) 71 | \item id_query: Unique ID of query 72 | \item id_ref: Unique ID of reference 73 | \item genus_hybrid_sign_query: Genus hybrid sign in query 74 | \item genus_name_query: Genus name of query 75 | \item species_hybrid_sign_query: Species hybrid sign in query 76 | \item specific_epithet_query: Specific epithet of query 77 | \item infraspecific_rank_query: Infraspecific rank of query 78 | \item infraspecific_epithet_query: Infraspecific epithet of query 79 | \item author_query: Taxonomic author of query 80 | \item genus_hybrid_sign_ref: Genus hybrid sign in reference 81 | \item genus_name_ref: Genus name of reference 82 | \item species_hybrid_sign_ref: Species hybrid sign in reference 83 | \item specific_epithet_ref: Specific epithet of reference 84 | \item infraspecific_rank_ref: Infraspecific rank of reference 85 | \item infraspecific_epithet_ref: Infraspecific epithet of reference 86 | \item author_ref: Taxonomic author of reference 87 | } 88 | 89 | If \code{simple} is \code{TRUE}, only return the first three columns above. 90 | } 91 | \description{ 92 | Allows for orthographic differences between query and reference by using 93 | fuzzy matching on parsed taxonomic names. Requires 94 | \href{https://github.com/camwebb/taxon-tools}{taxon-tools} to be installed. 95 | } 96 | \details{ 97 | \code{taxon-tools} matches names in two steps: 98 | \enumerate{ 99 | \item Scientific names are parsed into their component parts (genus, species, 100 | variety, author, etc). 
101 | \item Names are fuzzily matched following taxonomic rules using the component 102 | parts. 103 | } 104 | 105 | For more information on rules used for matching, \href{https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes}{see taxon-tools manual}. 106 | 107 | Parsing is fairly fast (much faster than matching) but can take some time if 108 | the number of names is very large. If multiple queries will be made (e.g., to 109 | the same large reference database), it is recommended to first parse the 110 | names using \code{\link{ts_parse_names}()}, and use the results as input to 111 | \code{query} and/or \code{reference}. 112 | 113 | \code{collapse_infra} is useful in situations where the reference database does 114 | not use names that have the same specific epithet and infraspecific epithet. 115 | For example, reference name "Blechnum lunare" and query "Blechnum lunare var. 116 | lunare". In this case, if \code{collapse_infra} is \code{TRUE}, "Blechnum lunare" will 117 | be queried instead of "Blechnum lunare var. lunare". Note that the 118 | \code{match_type} will be "exact" even though the literal query and the matched 119 | name are different (see example below). 
120 | } 121 | \examples{ 122 | if(ts_tt_installed()) { 123 | ts_match_names( 124 | "Crepidomanes minutus", 125 | c("Crepidomanes minutum", "Hymenophyllum polyanthos"), 126 | simple = TRUE 127 | ) 128 | 129 | # If names are too distant, they won't match 130 | ts_match_names( 131 | query = "Crepidblah foo", 132 | reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"), 133 | simple = TRUE 134 | ) 135 | 136 | # But we can force a match manually 137 | ts_match_names( 138 | query = "Crepidblah foo", 139 | reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"), 140 | manual_match = data.frame( 141 | query = c("Crepidblah foo"), 142 | match = c("Crepidomanes minutum") 143 | ), 144 | simple = TRUE 145 | ) 146 | 147 | # If you always want tibble output without specifying `tbl_out = TRUE` 148 | # every time, set the option: 149 | options(ts_tbl_out = TRUE) 150 | ts_match_names( 151 | "Crepidomanes minutus", 152 | c("Crepidomanes minutum", "Hymenophyllum polyanthos") 153 | ) 154 | 155 | # Example using collapse_infra argument 156 | ts_match_names( 157 | c("Crepidomanes minutus", "Blechnum lunare var. lunare", 158 | "Blechnum lunare", "Bar foo var. foo", "Bar foo"), 159 | c("Crepidomanes minutum", "Hymenophyllum polyanthos", "Blechnum lunare", 160 | "Bar foo"), 161 | collapse_infra = TRUE, 162 | collapse_infra_exclude = "Bar foo var. 
foo", 163 | simple = TRUE 164 | ) 165 | } 166 | 167 | } 168 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/" 12 | ) 13 | ``` 14 | # taxastand 15 | 16 | 17 | [![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip) 18 | [![DOI](https://zenodo.org/badge/192684959.svg)](https://zenodo.org/badge/latestdoi/192684959) 19 | 20 | 21 | The goal of `taxastand` is to standardize species names from different sources, a common task in biology. 22 | 23 | Very often different biologists use different synonyms to refer to the same species. If we want to join data from different sources, their taxonomic names must be standardized first. This is what `taxastand` seeks to do in a reproducible and efficient manner. 24 | 25 | ## Important note 26 | 27 | **This package is in early development.** There may be major, breaking changes to functionality in the near future. If you use this package, I highly recommend using a package manager like [renv](https://rstudio.github.io/renv/articles/renv.html) so that later updates won't break your code. 28 | 29 | ## Taxonomic standard 30 | 31 | `taxastand` is based on matching names to a single **taxonomic standard**, that is, a database of accepted names and synonyms. As long as a single taxonomic standard is used, we can confidently resolve names from disparate sources. 32 | 33 | The taxonomic standard must conform to [Darwin Core standards](https://dwc.tdwg.org/). The user must provide this database (as a dataframe). 
There are many sources of taxonomic data online, including [GBIF](https://www.gbif.org/en/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c), [Catalog of Life](http://www.catalogueoflife.org/), and [ITIS](https://www.itis.gov/) to name a few. The [taxadb](https://github.com/ropensci/taxadb) package provides convenient functions for downloading various taxonomic databases that use Darwin Core. 34 | 35 | ## Installation 36 | 37 | `taxastand` can be installed from [r-universe](https://joelnitta.r-universe.dev) or [github](https://github.com/joelnitta). 38 | 39 | ``` r 40 | install.packages("taxastand", repos = 'https://joelnitta.r-universe.dev') 41 | ``` 42 | 43 | OR 44 | 45 | ``` r 46 | # install.packages("remotes") 47 | remotes::install_github("joelnitta/taxastand") 48 | ``` 49 | 50 | ## Dependencies 51 | 52 | `taxastand` depends on [taxon-tools](https://github.com/camwebb/taxon-tools) for taxonomic name matching. 53 | 54 | There are two options for using this dependency. 55 | 56 | - Install [docker](https://www.docker.com/) and set `docker = TRUE` when using `taxastand` functions. 57 | 58 | OR 59 | 60 | - Install the two programs included in [taxon-tools](https://github.com/camwebb/taxon-tools), `parsenames` and `matchnames`. 61 | 62 | ## Similar work 63 | 64 | - [ROpenSci](https://ropensci.org/) has a [task view](https://github.com/ropensci/taxonomy) summarizing many tools available for taxonomy. 65 | 66 | - [taxize](https://github.com/ropensci/taxize) is the "granddaddy" of taxonomy packages in R. It can search around 20 different taxonomic databases for names and retrieve taxonomic information. 67 | 68 | - [TNRS](http://tnrs.iplantcollaborative.org/), the Taxonomic Name Resolution Service, is a web application that resolves taxonomic names of plants according to one of six databases. 69 | 70 | - [taxizedb](https://github.com/ropensci/taxizedb) downloads taxonomic databases and provides tools to interface with them through SQL. 
71 | 72 | - [taxadb](https://github.com/ropensci/taxadb) also downloads and searches taxonomic databases. It can interface with them either through SQL or in-memory in R. 73 | 74 | - [taxonstand](https://cran.r-project.org/web/packages/Taxonstand/index.html) has a very similar goal to `taxastand`, but only uses [The Plant List (TPL)](http://www.theplantlist.org 75 | ) as its taxonomic standard and does not allow the user to provide their own. Note that TPL is no longer being updated as of 2013. 76 | 77 | ## Motivation 78 | 79 | Although existing web-based solutions for taxonomic name resolution are very useful, they may not be ideal for all situations: the choice of reference database to use for standardization is limited, they may not be able to handle very large queries, and the user has no guarantee that the same input will yield the same output at a later date due to changes in the remote database. 80 | 81 | Furthermore, matching of taxonomic names is not straightforward, since they are complex data structures including multiple components (e.g., genus, specific epithet, basionym author, combination author, etc). [Of the tools mentioned above](#similar-work) only [TNRS](http://tnrs.iplantcollaborative.org/) can fuzzily match taxonomic names based on their parsed components, but it does not allow for use of a local reference database. 82 | 83 | The motivation for `taxastand` is to provide greater flexibility and reproducibility by allowing for complete version control of the code and database used for name resolution, while implementing fuzzy matching of parsed taxonomic names. 84 | 85 | ## Example 86 | 87 | Here is an example of fuzzy matching followed by resolution of synonyms using the dataset included with the package. 
88 | 89 | ```{r filmy-example-show, eval = FALSE} 90 | library(taxastand) 91 | 92 | # Load example reference taxonomy in Darwin Core format 93 | data(filmy_taxonomy) 94 | 95 | # Take a look at the columns used by taxastand 96 | head(filmy_taxonomy[c( 97 | "taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")]) 98 | 99 | # As a test, resolve a misspelled name 100 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy) 101 | 102 | # We can now use the `resolved_name` column of this result for downstream 103 | # analyses joining on other datasets that have been resolved to the same 104 | # reference taxonomy. 105 | ``` 106 | 107 | ```{r filmy-example-hide, echo = FALSE} 108 | library(taxastand) 109 | 110 | # Load example reference taxonomy in Darwin Core format 111 | data(filmy_taxonomy) 112 | 113 | # Take a look at the columns used by taxastand 114 | head(filmy_taxonomy[c( 115 | "taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")]) 116 | 117 | # As a test, resolve a misspelled name 118 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy, docker = TRUE) 119 | 120 | # We can now use the `resolved_name` column of this result for downstream 121 | # analyses joining on other datasets that have been resolved to the same 122 | # reference taxonomy. 123 | ``` 124 | 125 | ## Citing this package 126 | 127 | If you use this package, please cite it! Here is an example: 128 | 129 | Nitta, JH (2021) taxastand: Taxonomic name standardization in R. https://doi.org/10.5281/zenodo.5726390 130 | 131 | The example DOI above is for the overall package. 132 | 133 | Here is the latest DOI, which you should use if you are using the latest 134 | version of the package: 135 | 136 | [![DOI](https://zenodo.org/badge/192684959.svg)](https://zenodo.org/badge/latestdoi/192684959) 137 | 138 | You can find DOIs for older versions by viewing the “Releases” menu on 139 | the right. 
140 | 141 | You should also cite the software that `taxastand` relies on, `taxon-tools`: https://github.com/camwebb/taxon-tools 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # taxastand 5 | 6 | 7 | 8 | [![Project Status: WIP – Initial development is in progress, but there 9 | has not yet been a stable, usable release suitable for the 10 | public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip) 11 | [![DOI](https://zenodo.org/badge/192684959.svg)](https://zenodo.org/badge/latestdoi/192684959) 12 | 13 | 14 | The goal of `taxastand` is to standardize species names from different 15 | sources, a common task in biology. 16 | 17 | Very often different biologists use different synonyms to refer to the 18 | same species. If we want to join data from different sources, their 19 | taxonomic names must be standardized first. This is what `taxastand` 20 | seeks to do in a reproducible and efficient manner. 21 | 22 | ## Important note 23 | 24 | **This package is in early development.** There may be major, breaking 25 | changes to functionality in the near future. If you use this package, I 26 | highly recommend using a package manager like 27 | [renv](https://rstudio.github.io/renv/articles/renv.html) so that later 28 | updates won’t break your code. 29 | 30 | ## Taxonomic standard 31 | 32 | `taxastand` is based on matching names to a single **taxonomic 33 | standard**, that is, a database of accepted names and synonyms. As long 34 | as a single taxonomic standard is used, we can confidently resolve names 35 | from disparate sources. 36 | 37 | The taxonomic standard must conform to [Darwin Core 38 | standards](https://dwc.tdwg.org/). The user must provide this database 39 | (as a dataframe). 
There are many sources of taxonomic data online, 40 | including 41 | [GBIF](https://www.gbif.org/en/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c), 42 | [Catalog of Life](http://www.catalogueoflife.org/), and 43 | [ITIS](https://www.itis.gov/) to name a few. The 44 | [taxadb](https://github.com/ropensci/taxadb) package provides convenient 45 | functions for downloading various taxonomic databases that use Darwin 46 | Core. 47 | 48 | ## Installation 49 | 50 | `taxastand` can be installed from 51 | [r-universe](https://joelnitta.r-universe.dev) or 52 | [github](https://github.com/joelnitta). 53 | 54 | ``` r 55 | install.packages("taxastand", repos = 'https://joelnitta.r-universe.dev') 56 | ``` 57 | 58 | OR 59 | 60 | ``` r 61 | # install.packages("remotes") 62 | remotes::install_github("joelnitta/taxastand") 63 | ``` 64 | 65 | ## Dependencies 66 | 67 | `taxastand` depends on 68 | [taxon-tools](https://github.com/camwebb/taxon-tools) for taxonomic name 69 | matching. 70 | 71 | There are two options for using this dependency. 72 | 73 | - Install [docker](https://www.docker.com/) and set `docker = TRUE` when 74 | using `taxastand` functions. 75 | 76 | OR 77 | 78 | - Install the two programs included in 79 | [taxon-tools](https://github.com/camwebb/taxon-tools), `parsenames` 80 | and `matchnames`. 81 | 82 | ## Similar work 83 | 84 | - [ROpenSci](https://ropensci.org/) has a [task 85 | view](https://github.com/ropensci/taxonomy) summarizing many tools 86 | available for taxonomy. 87 | 88 | - [taxize](https://github.com/ropensci/taxize) is the “granddaddy” of 89 | taxonomy packages in R. It can search around 20 different taxonomic 90 | databases for names and retrieve taxonomic information. 91 | 92 | - [TNRS](http://tnrs.iplantcollaborative.org/), the Taxonomic Name 93 | Resolution Service, is a web application that resolves taxonomic names 94 | of plants according to one of six databases. 
95 | 96 | - [taxizedb](https://github.com/ropensci/taxizedb) downloads taxonomic 97 | databases and provides tools to interface with them through SQL. 98 | 99 | - [taxadb](https://github.com/ropensci/taxadb) also downloads and 100 | searches taxonomic databases. It can interface with them either 101 | through SQL or in-memory in R. 102 | 103 | - [taxonstand](https://cran.r-project.org/web/packages/Taxonstand/index.html) 104 | has a very similar goal to `taxastand`, but only uses [The Plant List 105 | (TPL)](http://www.theplantlist.org) as its taxonomic standard and does 106 | not allow the user to provide their own. Note that TPL is no longer 107 | being updated as of 2013. 108 | 109 | ## Motivation 110 | 111 | Although existing web-based solutions for taxonomic name resolution are 112 | very useful, they may not be ideal for all situations: the choice of 113 | reference database to use for standardization is limited, they may not 114 | be able to handle very large queries, and the user has no guarantee that 115 | the same input will yield the same output at a later date due to changes 116 | in the remote database. 117 | 118 | Furthermore, matching of taxonomic names is not straightforward, since 119 | they are complex data structures including multiple components (e.g., 120 | genus, specific epithet, basionym author, combination author, etc). [Of 121 | the tools mentioned above](#similar-work) only 122 | [TNRS](http://tnrs.iplantcollaborative.org/) can fuzzily match taxonomic 123 | names based on their parsed components, but it does not allow for use of 124 | a local reference database. 125 | 126 | The motivation for `taxastand` is to provide greater flexibility and 127 | reproducibility by allowing for complete version control of the code and 128 | database used for name resolution, while implementing fuzzy matching of 129 | parsed taxonomic names. 
130 | 131 | ## Example 132 | 133 | Here is an example of fuzzy matching followed by resolution of synonyms 134 | using the dataset included with the package. 135 | 136 | ``` r 137 | library(taxastand) 138 | 139 | # Load example reference taxonomy in Darwin Core format 140 | data(filmy_taxonomy) 141 | 142 | # Take a look at the columns used by taxastand 143 | head(filmy_taxonomy[c( 144 | "taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")]) 145 | 146 | # As a test, resolve a misspelled name 147 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy) 148 | 149 | # We can now use the `resolved_name` column of this result for downstream 150 | # analyses joining on other datasets that have been resolved to the same 151 | # reference taxonomy. 152 | ``` 153 | 154 | #> taxonID acceptedNameUsageID taxonomicStatus 155 | #> 1 54115096 NA accepted name 156 | #> 2 54133783 54115097 synonym 157 | #> 3 54115097 NA accepted name 158 | #> 4 54133784 54115098 synonym 159 | #> 5 54115098 NA accepted name 160 | #> 6 54133785 54115099 synonym 161 | #> scientificName 162 | #> 1 Cephalomanes atrovirens Presl 163 | #> 2 Trichomanes crassum Copel. 164 | #> 3 Cephalomanes crassum (Copel.) M. G. Price 165 | #> 4 Trichomanes densinervium Copel. 166 | #> 5 Cephalomanes densinervium (Copel.) Copel. 167 | #> 6 Trichomanes infundibulare Alderw. 168 | #> query resolved_name 169 | #> 1 Gonocormus minutum Crepidomanes minutum (Bl.) K. Iwats. 170 | #> matched_name resolved_status matched_status match_type 171 | #> 1 Gonocormus minutus (Bl.) Bosch accepted name synonym auto_fuzzy 172 | 173 | ## Citing this package 174 | 175 | If you use this package, please cite it! Here is an example: 176 | 177 | Nitta, JH (2021) taxastand: Taxonomic name standardization in R. https://doi.org/10.5281/zenodo.5726390 178 | 179 | The example DOI above is for the overall package. 
180 | 181 | Here is the latest DOI, which you should use if you are using the latest 182 | version of the package: 183 | 184 | [![DOI](https://zenodo.org/badge/192684959.svg)](https://zenodo.org/badge/latestdoi/192684959) 185 | 186 | You can find DOIs for older versions by viewing the “Releases” menu on 187 | the right. 188 | 189 | You should also cite the software that `taxastand` relies on, 190 | `taxon-tools`: <https://github.com/camwebb/taxon-tools> 191 | -------------------------------------------------------------------------------- /R/ts_resolve_names.R: --------------------------------------------------------------------------------
#' Resolve synonyms in taxonomic names
#'
#' After matching taxonomic names to a reference, some may match synonyms. This
#' function resolves synonyms to their accepted names.
#'
#' `query` can take as input either a character vector of taxonomic names, or
#' the output of \code{\link{ts_match_names}()}. If the former, it will run
#' \code{\link{ts_match_names}()} to match the query to `ref_taxonomy`, then
#' resolve synonyms. If the latter, the scientific names in `ref_taxonomy`
#' should be the same used as reference with \code{\link{ts_match_names}()}
#' (this is not checked).
#'
#' `ref_taxonomy` must be taxonomic data adhering to the [Darwin Core standard](https://dwc.tdwg.org/terms/#taxon).
#' Darwin Core includes many terms, but only four (`taxonID`,
#' `acceptedNameUsageID`, `taxonomicStatus`, and `scientificName`) are required
#' for this function.
#'
#' A matched name is treated as accepted if its `acceptedNameUsageID` is
#' missing (`NA`), empty (`""`), or identical to its own `taxonID`. Any other
#' value marks the name as a synonym of the taxon with that `taxonID`.
#'
#' The matching arguments (`max_dist`, `match_no_auth`, `match_canon`,
#' `collapse_infra`, `collapse_infra_exclude`, and `docker`) are only used when
#' `query` is a character vector, since they are passed on to
#' \code{\link{ts_match_names}()}.
#'
#' @param query Character vector or dataframe; taxonomic names to be resolved.
#' If a character vector, missing values not allowed and all values must be
#' unique. If a dataframe, should be taxonomic names matched with
#' \code{\link{ts_match_names}()}.
#' @param ref_taxonomy Dataframe; reference taxonomic data adhering to the
#' [Darwin Core standard](https://dwc.tdwg.org/terms/#taxon) with the
#' following columns:
#' - `taxonID`: [Unique identifier for each taxon](https://dwc.tdwg.org/terms/#dwc:taxonID).
#' - `acceptedNameUsageID`: If the taxon is a synonym, the [unique identifier for the accepted name](https://dwc.tdwg.org/terms/#dwc:acceptedNameUsageID)
#' - `taxonomicStatus`: [The status of the use of the `scientificName` as a label for the taxon](https://dwc.tdwg.org/terms/#dwc:taxonomicStatus).
#' - `scientificName`: [The full scientific name of the taxon](https://dwc.tdwg.org/terms/#dwc:scientificName),
#' with authorship and date information if known.
#' @param max_dist Max Levenshtein distance to allow during fuzzy matching
#' (total insertions, deletions and substitutions). Default: 10.
#' @param match_no_auth Logical; If no author is given in the query and the name
#' (without author) occurs only once in the reference, accept the name in the
#' reference as a match. Default: to not allow such a match (`FALSE`).
#' @param match_canon Logical; Allow a "canonical name" match if only the genus,
#' species epithet, and infraspecific epithet (if present) match exactly.
#' Default: to not allow such a match (`FALSE`).
#' @param collapse_infra Logical; if the specific epithet and infraspecific
#' epithet are the same, drop the infraspecific rank and epithet from the query.
#' For more information, see \code{\link{ts_match_names}()}.
#' @param collapse_infra_exclude Character vector; taxonomic names to exclude
#' from collapsing with `collapse_infra`. Any names used must match those in
#' `query` exactly, or they won't be excluded.
#' @param docker Logical; if TRUE, docker will be used to run taxon-tools
#' (so that taxon-tools need not be installed). This argument can be
#' controlled via the option `ts_docker`.
#' @param tbl_out Logical vector of length 1; should a tibble be returned?
#' If `FALSE` (default), output will be a data.frame. This argument can
#' be controlled via the option `ts_tbl_out`; see Examples.
#'
#' @return Dataframe; results of resolving synonyms in matched taxonomic names.
#' Includes the following columns:
#' - `query`: Query name
#' - `resolved_name`: Accepted name after resolving synonyms
#' - `matched_name`: Name matched to query
#' - `resolved_status`: Taxonomic status of the resolved name (same as `taxonomicStatus` in `ref_taxonomy`)
#' - `matched_status`: Taxonomic status of the matched name (same as `taxonomicStatus` in `ref_taxonomy`)
#' - `match_type`: Type of match (for a summary of match types, [see taxon-tools manual](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes))
#'
#' Names that could not be resolved (no single match to an accepted name, and
#' no set of matched synonyms that all point to the same accepted name) have
#' `NA` for `resolved_name` and `resolved_status`. Such names may occupy more
#' than one row, one per candidate match.
#'
#' @autoglobal
#' @export
#' @examples
#' if (ts_tt_installed()) {
#'   # Load reference taxonomy in Darwin Core format
#'   data(filmy_taxonomy)
#'
#'   ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
#'   # If you always want tibble output without specifying `tbl_out = TRUE`
#'   # every time, set the option:
#'   old_opts <- options(ts_tbl_out = TRUE)
#'   ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
#'   # Restore the previous setting
#'   options(old_opts)
#' }
#'
ts_resolve_names <- function(
  query,
  ref_taxonomy,
  max_dist = 10,
  match_no_auth = FALSE,
  match_canon = FALSE,
  collapse_infra = FALSE,
  collapse_infra_exclude = NULL,
  docker = getOption("ts_docker", default = FALSE),
  tbl_out = getOption("ts_tbl_out", default = FALSE)
) {
  # Check input
  # (both tests are scalar, so use short-circuit `||` rather than `|`)
  assertthat::assert_that(
    is.character(query) || inherits(query, "data.frame"),
    msg = "query must be of class 'data.frame' or a character vector"
  )
  assertthat::assert_that(
    inherits(ref_taxonomy, "data.frame"),
    msg = "ref_taxonomy must be of class 'data.frame'"
  )
  # Fail early with a clear message if a required Darwin Core column is
  # absent; otherwise the error surfaces deep inside the dplyr pipelines below
  required_cols <- c(
    "taxonID",
    "acceptedNameUsageID",
    "taxonomicStatus",
    "scientificName"
  )
  missing_cols <- setdiff(required_cols, colnames(ref_taxonomy))
  assertthat::assert_that(
    length(missing_cols) == 0,
    msg = paste0(
      "ref_taxonomy is missing required column(s): ",
      paste(missing_cols, collapse = ", ")
    )
  )
  assertthat::assert_that(assertthat::is.flag(tbl_out))
  assertthat::assert_that(assertthat::is.flag(docker))
  if (!is.null(collapse_infra_exclude)) {
    assertthat::assert_that(is.character(collapse_infra_exclude))
  }

  # If needed, match names first
  if (is.character(query)) {
    match_results <- ts_match_names(
      query = query,
      reference = unique(ref_taxonomy$scientificName),
      max_dist = max_dist,
      match_no_auth = match_no_auth,
      match_canon = match_canon,
      collapse_infra = collapse_infra,
      collapse_infra_exclude = collapse_infra_exclude,
      simple = TRUE,
      docker = docker
    )
  } else {
    # The input check above guarantees `query` is a data frame here; it is
    # taken to be the output of ts_match_names() (not verified)
    match_results <- query
  }

  # Classify results of matching, then attach the taxonomic data of each
  # matched reference name
  match_results_classified_with_taxonomy <-
    match_results %>%
    ts_classify_result() %>%
    dplyr::select(query, reference, match_type, result_type) %>%
    dplyr::left_join(ref_taxonomy, by = c(reference = "scientificName"))

  # Separate out single matches to an accepted name (success type 1)
  accepted_single_match <-
    match_results_classified_with_taxonomy %>%
    # consider accepted names have either no acceptedNameUsageID or
    # acceptedNameUsageID is same as taxonID
    dplyr::filter(
      (is.na(acceptedNameUsageID) |
        acceptedNameUsageID == "" |
        taxonID == acceptedNameUsageID) &
        result_type == "single_match"
    ) %>%
    # The matched name is already accepted, so it is also the resolved name
    dplyr::select(
      query,
      resolved_name = reference,
      matched_name = reference,
      resolved_status = taxonomicStatus,
      matched_status = taxonomicStatus,
      match_type
    )

  # Separate out matches to a single synonym (success type 2)
  # NOTE(review): unlike success type 1, this path does not filter on
  # `result_type`. A query that matches both an accepted name and a synonym
  # of a different taxon appears to resolve via the synonym alone -- confirm
  # that this is intended.
  accepted_single_synonyms <-
    match_results_classified_with_taxonomy %>%
    # Consider synonym anything with acceptedNameUsageID not matching taxonID
    dplyr::filter(!is.na(acceptedNameUsageID)) %>%
    dplyr::filter(acceptedNameUsageID != "") %>%
    dplyr::filter(acceptedNameUsageID != taxonID) %>%
    # Join resolved names via synonym
    dplyr::left_join(
      dplyr::select(
        ref_taxonomy,
        taxonID,
        resolved_name = scientificName,
        resolved_status = taxonomicStatus
      ),
      by = c(acceptedNameUsageID = "taxonID")
    ) %>%
    dplyr::select(
      query,
      resolved_name,
      matched_name = reference,
      resolved_status,
      matched_status = taxonomicStatus,
      match_type
    ) %>%
    dplyr::group_by(query) %>%
    # Add count of number of resolved, accepted names per query
    dplyr::mutate(n = dplyr::n_distinct(resolved_name)) %>%
    dplyr::ungroup() %>%
    # Only keep those that resolve to the same name
    dplyr::filter(n == 1) %>%
    dplyr::select(-n)

  # Combine name resolution successes
  success <- dplyr::bind_rows(accepted_single_match, accepted_single_synonyms)

  # Anything else is a failure. `resolved_name` and `resolved_status` are
  # deliberately not selected, so they are filled with NA by bind_rows() below
  failure <-
    match_results_classified_with_taxonomy %>%
    dplyr::select(
      query,
      match_type,
      matched_status = taxonomicStatus,
      matched_name = reference
    ) %>%
    dplyr::anti_join(success, by = "query")

  # Combine into final results, checking that no query was lost or invented
  results <- dplyr::bind_rows(success, failure) %>%
    assertr::verify(all(query %in% match_results$query)) %>%
    assertr::verify(all(match_results$query %in% query)) %>%
    dplyr::select(
      query,
      resolved_name,
      matched_name,
      resolved_status,
      matched_status,
      match_type
    )

  # Return as tibble or dataframe
  if (isTRUE(tbl_out)) {
    return(tibble::as_tibble(results))
  }

  results
}
-------------------------------------------------------------------------------- /vignettes/basics.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The basics" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{The basics} 6 | %\VignetteEncoding{UTF-8} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | editor_options: 9 | chunk_output_type: console 10 | --- 11 | 12 | ```{r, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>" 16 | ) 17 | ``` 18 | 19 | This vignette explains the three basic steps of the taxonomic name resolution workflow, which consist of: 20 | 21 | 1. Name parsing 22 | 2. Name matching 23 | 3. Name resolution 24 | 25 | ## Setup 26 | 27 | We'll start by loading `taxastand`. For more information on installing `taxastand`, see [here](https://joelnitta.github.io/taxastand/index.html#installation).
28 | 29 | ```{r setup} 30 | library(taxastand) 31 | ``` 32 | 33 | ## Name parsing 34 | 35 | In R, scientific names are often just stored as character vectors (strings). For example, 36 | 37 | ```{r example-name} 38 | example_name <- "Crepidomanes minutum (Bl.) K. Iwats." 39 | ``` 40 | 41 | However, such a name actually consists of several distinct parts: 42 | 43 | ``` 44 | "Crepidomanes minutum (Bl.) K. Iwats." 45 | ------------- ------- --------------- 46 | | | | 47 | genus specific author 48 | epithet 49 | ``` 50 | 51 | Furthermore, in the case of this name, it was originally named by Blume (`(Bl.)`), then transferred to a different genus by Iwatsuki (`K. Iwats.`). 52 | 53 | When working with taxonomic names, it can be useful to **parse** the name into its component parts. That is what `ts_parse_names()` does. It takes a character vector as input and returns a dataframe: 54 | 55 | ```{r parse-example} 56 | ts_parse_names(example_name) 57 | ``` 58 | 59 | The first column, `name`, is the original input name. `id` is a unique identifier attached to the name. The rest of the columns are [ the parsed components of the name](https://joelnitta.github.io/taxastand/reference/ts_parse_names.html#value). 60 | 61 | Note that the [name parsing algorithm](https://github.com/camwebb/taxon-tools#parsenames) used by `taxastand` is case-sensitive! It assumes that the [standard capitalization of scientific names](https://en.wikipedia.org/wiki/Binomial_nomenclature#Writing_binomial_names) is being used: genus is capitalized, specific epithet is lower case, author is capitalized as a proper noun, etc. **Name parsing probably won't work without this type of capitalization.** 62 | 63 | Now that we've parsed a name, in the next section we will see why this is useful for matching names to each other. 64 | 65 | ## Name matching 66 | 67 | One reason that name parsing is important is because some scientific names may differ only in certain components. 
68 | 69 | For example, the species [*Hymenophyllum pectinatum*](https://www.tropicos.org/name/Search?name=Hymenophyllum%20pectinatum) actually corresponds to two different scientific names with different authors, *Hymenophyllum pectinatum* Nees & Blume and *Hymenophyllum pectinatum* Cav. 70 | 71 | We can see this by querying the name: 72 | 73 | ```{r match-example-1} 74 | ts_match_names( 75 | "Hymenophyllum pectinatum", 76 | c("Hymenophyllum pectinatum Nees & Blume", 77 | "Hymenophyllum pectinatum Cav."), 78 | simple = TRUE) 79 | ``` 80 | 81 | `ts_match_names()` matches both scientific names[^1], because the algorithm can't distinguish between them without additional information. So **it is almost always better to include the taxonomic author in the query**, to distinguish between such cases. 82 | 83 | [^1]: Note that `ts_match_names()` did the name parsing by calling `ts_parse_names()` for us internally. This is usually fine, but it can also take parsed names (dataframes) produced by `ts_parse_names()` as input to either `query` or `reference`. 84 | 85 | However, there can be quite a bit of variation in how authors are recorded. Sometimes names are abbreviated to different lengths, or the basionym author (an author name in parentheses) might get left out by accident, etc. The algorithm used by `taxastand` can account for this (to a point). Here is an example where the query lacks a basionym author: 86 | 87 | ```{r match-example-2} 88 | ts_match_names( 89 | "Hymenophyllum taiwanense C. V. Morton", 90 | c("Hymenophyllum taiwanense (Tagawa) C. V. Morton", 91 | "Hymenophyllum taiwanense De Vol"), 92 | simple = TRUE) 93 | ``` 94 | 95 | The name matching algorithm was able to narrow the match down to `Hymenophyllum taiwanense (Tagawa) C. V. Morton` even though the query lacked `(Tagawa)`. Furthermore, the `match_type` tells us how the matching was done: `auto_basio-` means an automatic match based on excluding the basionym author from the reference.
**It is recommended to always check any results that weren't identical** (`exact`) to verify that the matching algorithm worked correctly, especially for fuzzy matches (`auto_fuzzy`). 96 | 97 | Here is a summary of the values taken by `match_type` from [`taxon-tools`](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes): 98 | 99 | - `exact`: Exact match to all parts of the name (genus hybrid marker, genus name, species hybrid marker, species epithet, infraspecific rank signifier, infraspecific rank, author string). 100 | - `auto_punct`: Exact match to all parts of the name after removing mis-matching spaces, periods, non-ASCII author name characters, etc. 101 | - `auto_noauth` (only applies if `match_no_auth` is `TRUE`): Match between a query lacking an author and a reference name that, ignoring its author, occurs only once in the reference. 102 | - `auto_basio-`: Match after excluding the basionym author from the reference. For example, `Cardaminopsis umbrosa Czerep.` vs. `Cardaminopsis umbrosa (Turcz.) Czerep.`; the basionym author is `(Turcz.)`. 103 | - `auto_basio+`: Match after excluding the basionym author from the query. 104 | - `auto_in-`: Match after excluding all *in* elements from reference. An *in* element refers to phrases such as `Tagawa in Morton`. The version excluding *in* elements is `Tagawa`. 105 | - `auto_in+`: Match after excluding all *in* elements from query. 106 | - `auto_ex-`: Match after excluding all *in* and *ex* elements from reference. An *ex* element refers to phrases such as `Rändel ex D.F.Murray`. The version excluding *ex* elements is `Rändel`. 107 | - `auto_ex+`: Match after excluding all *in* and *ex* elements from query. 108 | - `auto_basexin`: Match after excluding all basionym authors and all *in* and *ex* elements from query and reference. 109 | - `auto_irank`: Match where all elements agree except for infraspecific rank.
110 | - `auto_fuzzy`: Fuzzy match; match between scientific names allowed up to threshold given by `max_dist`, the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) including total insertions, deletions and substitutions. 111 | - `cfonly`: Match by "canonical form", i.e., genus plus specific epithet plus infraspecific epithet (if present), not including the infraspecific specifier ("subsp.", etc.). 112 | - `no_match`: No match detected. 113 | 114 | The matching algorithm will prefer match codes higher in the list; so if a name could be matched both by `auto_punct` and `auto_fuzzy`, it will be matched based on `auto_punct`[^2]. 115 | 116 | [^2]: The algorithm used by `taxastand` is optimized for plants, algae, and fungi, which vary in their [taxonomic rules](https://www.iapt-taxon.org/nomen/main.php) somewhat from animals. For example, plants include basionym authors in parentheses followed by the combination author, and typically don't include the year, whereas animals normally include the year and may not provide the combination author. 117 | 118 | ## Name resolution 119 | 120 | Name resolution refers to the process of mapping a query name to its standard version. This could just be accounting for orthographic variations, or it could involve resolving synonyms: different names that actually refer to the same species. 121 | 122 | In order to conduct name resolution, we require a **taxonomic standard** in the form of a dataframe. `taxastand` requires that the taxonomic standard conform to [Darwin Core standards](https://dwc.tdwg.org/). There are many sources of taxonomic data online, including [GBIF](https://www.gbif.org/en/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c), [Catalog of Life](http://www.catalogueoflife.org/), and [ITIS](https://www.itis.gov/) among others. 
123 | 124 | `taxastand` comes supplied with an example taxonomic standard for filmy ferns (family Hymenophyllaceae): 125 | 126 | ```{r name-res-example-1} 127 | # Load example reference taxonomy in Darwin Core format 128 | data(filmy_taxonomy) 129 | 130 | # Take a look at the columns used by taxastand 131 | head(filmy_taxonomy[c("taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")]) 132 | ``` 133 | 134 | Here, `taxonID` is a unique identifier for each taxonomic name. `acceptedNameUsageID` only applies in the case of synonyms: it tells us the `taxonID` of the accepted name corresponding to that synonym. `taxonomicStatus` describes the status of the name, typically either as an accepted name, synonym, or something else ("dubious", etc.). Finally, the `scientificName` is the full scientific name, preferably with the author. 135 | 136 | In its most simple usage, `ts_resolve_names()` can take as input a character vector to `query`, and provide the resolved name in the taxonomic standard (`ref_taxonomy`): 137 | 138 | ```{r name-res-example-2} 139 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy) 140 | ``` 141 | 142 | In this case, the query, `Gonocormus minutum`, was a misspelled name that is actually a synonym for *Crepidomanes minutum* (Bl.) K. Iwats. Under the hood, `ts_resolve_names()` is calling both `ts_parse_names()` and `ts_match_names()` to do parsing and matching steps before name resolution[^3]. 143 | 144 | [^3]: You can pass the output of `ts_match_names()` to the `query` argument of `ts_resolve_names()` if you want to see the matching results first.
145 | 146 | However, when used this way, `ts_resolve_names()` may not be able to provide a resolved name if the input is not matched unambiguously: 147 | 148 | ```{r name-res-example-3} 149 | t_bifid_res <- ts_resolve_names("Trichomanes bifidum", filmy_taxonomy) 150 | head(t_bifid_res) 151 | dim(t_bifid_res) 152 | ``` 153 | 154 | In this case, name resolution using the default settings produced `r nrow(t_bifid_res)` possible answers! That is obviously far too many. Let's try to adjust the arguments and see if we can reduce the output: 155 | 156 | ```{r name-res-example-4} 157 | ts_resolve_names( 158 | "Trichomanes bifidum", filmy_taxonomy, 159 | match_no_auth = TRUE, match_canon = TRUE, max_dist = 5) 160 | ``` 161 | 162 | By allowing matches without the author name (we probably should have done that anyways, since the query lacked an author) and lowering the fuzzy match threshold, we are able to greatly reduce the number of possible resolved names. 163 | 164 | Name resolution workflows typically involve tweaking these arguments to resolve a maximum number of names automatically, followed by some amount of manual edits to the remaining resolved names. 165 | 166 | A benefit of `taxastand` is that, if during the name resolution workflow we discover mistakes in the reference database, the reference database can be edited so that the query names resolve correctly (this is not possible with packages that rely on querying a remote taxonomic database that can't be modified by the user). 167 | 168 | ## Conclusion 169 | 170 | This vignette illustrated the typical steps involved in name resolution with `taxastand` on some trivial examples. In another vignette, I will provide a more realistic example with a larger dataset. 
#' Match taxonomic names to a reference
#'
#' Allows for orthographic differences between query and reference by using
#' fuzzy matching on parsed taxonomic names. Requires
#' [taxon-tools](https://github.com/camwebb/taxon-tools) to be installed.
#'
#' `taxon-tools` matches names in two steps:
#' 1. Scientific names are parsed into their component parts (genus, species,
#'    variety, author, etc).
#' 2. Names are fuzzily matched following taxonomic rules using the component
#'    parts.
#'
#' For more information on rules used for matching, [see taxon-tools manual](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes).
#'
#' Parsing is fairly fast (much faster than matching) but can take some time if
#' the number of names is very large. If multiple queries will be made (e.g., to
#' the same large reference database), it is recommended to first parse the
#' names using \code{\link{ts_parse_names}()}, and use the results as input to
#' `query` and/or `reference`.
#'
#' `collapse_infra` is useful in situations where the reference database does
#' not use names that have the same specific epithet and infraspecific epithet.
#' For example, reference name "Blechnum lunare" and query "Blechnum lunare var.
#' lunare". In this case, if `collapse_infra` is `TRUE`, "Blechnum lunare" will
#' be queried instead of "Blechnum lunare var. lunare". Note that the
#' `match_type` will be "exact" even though the literal query and the matched
#' name are different (see example below).
#'
#' Parsed names and match results are exchanged with `taxon-tools` through
#' temporary files, which are removed when the function exits.
#'
#' @param query Character vector or dataframe; taxonomic names to be queried.
#' If a character vector, missing values not allowed and all values must be
#' unique.
#' If a dataframe, should be taxonomic names parsed with
#' \code{\link{ts_parse_names}()}.
#' @param reference Character vector or dataframe; taxonomic names to use as
#' reference. If a character vector, missing values not allowed and all values
#' must be unique. If a dataframe, should be taxonomic names parsed with
#' \code{\link{ts_parse_names}()}.
#' @param manual_match Optional. Dataframe of manually matched names that will
#' override any results from `taxon-tools`. Must include columns, `query`
#' and `match`; both must be character vectors without missing values, and
#' values of `query` must be unique. Can only be used if `query` is a
#' character vector.
#' @param max_dist Max Levenshtein distance to allow during fuzzy matching
#' (total insertions, deletions and substitutions). Default: 10.
#' @param match_no_auth Logical; If no author is given in the query and the name
#' (without author) occurs only once in the reference, accept the name in the
#' reference as a match. Default: to not allow such a match (`FALSE`).
#' @param match_canon Logical; Allow a "canonical name" match if only the genus,
#' species epithet, and infraspecific epithet (if present) match exactly.
#' Default: to not allow such a match (`FALSE`).
#' @param collapse_infra Logical; if the specific epithet and infraspecific
#' epithet are the same, drop the infraspecific rank and epithet from the query.
#' Default: `FALSE`.
#' @param collapse_infra_exclude Character vector; taxonomic names to exclude
#' from collapsing with `collapse_infra`. Any names used must match those in
#' `query` exactly, or they won't be excluded. Default: `NULL` (no exclusions).
#' @param simple Logical; return the output in a simplified format with only the
#' query name, matched reference name, and match type. Default: `FALSE`.
#' @param docker Logical; if TRUE, docker will be used to run taxon-tools
#' (so that taxon-tools need not be installed). Defaults to the value of the
#' option `ts_docker`, or `FALSE` if that option is not set.
#' @param tbl_out Logical vector of length 1; should a tibble be returned?
#' If `FALSE` (default), output will be a data.frame. This argument can
#' be controlled via the option `ts_tbl_out`; see Examples.
#'
#' @return Dataframe with the following columns (if `simple` is `FALSE`):
#' - query: Query name
#' - reference: Matched reference name
#' - match_type: Type of match (for a summary of match types, [see taxon-tools manual](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes))
#' - id_query: Unique ID of query
#' - id_ref: Unique ID of reference
#' - genus_hybrid_sign_query: Genus hybrid sign in query
#' - genus_name_query: Genus name of query
#' - species_hybrid_sign_query: Species hybrid sign in query
#' - specific_epithet_query: Specific epithet of query
#' - infraspecific_rank_query: Infraspecific rank of query
#' - infraspecific_epithet_query: Infraspecific epithet of query
#' - author_query: Taxonomic author of query
#' - genus_hybrid_sign_ref: Genus hybrid sign in reference
#' - genus_name_ref: Genus name of reference
#' - species_hybrid_sign_ref: Species hybrid sign in reference
#' - specific_epithet_ref: Specific epithet of reference
#' - infraspecific_rank_ref: Infraspecific rank of reference
#' - infraspecific_epithet_ref: Infraspecific epithet of reference
#' - author_ref: Taxonomic author of reference
#'
#' If `simple` is `TRUE`, only return the first three columns above.
#'
#' @autoglobal
#' @export
#' @examples
#' if(ts_tt_installed()) {
#'   ts_match_names(
#'     "Crepidomanes minutus",
#'     c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
#'     simple = TRUE
#'   )
#'
#'   # If names are too distant, they won't match
#'   ts_match_names(
#'     query = "Crepidblah foo",
#'     reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
#'     simple = TRUE
#'   )
#'
#'   # But we can force a match manually
#'   ts_match_names(
#'     query = "Crepidblah foo",
#'     reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
#'     manual_match = data.frame(
#'       query = c("Crepidblah foo"),
#'       match = c("Crepidomanes minutum")
#'     ),
#'     simple = TRUE
#'   )
#'
#'   # If you always want tibble output without specifying `tbl_out = TRUE`
#'   # every time, set the option:
#'   options(ts_tbl_out = TRUE)
#'   ts_match_names(
#'     "Crepidomanes minutus",
#'     c("Crepidomanes minutum", "Hymenophyllum polyanthos")
#'   )
#'
#'   # Example using collapse_infra argument
#'   ts_match_names(
#'     c("Crepidomanes minutus", "Blechnum lunare var. lunare",
#'       "Blechnum lunare", "Bar foo var. foo", "Bar foo"),
#'     c("Crepidomanes minutum", "Hymenophyllum polyanthos", "Blechnum lunare",
#'       "Bar foo"),
#'     collapse_infra = TRUE,
#'     collapse_infra_exclude = "Bar foo var. foo",
#'     simple = TRUE
#'   )
#' }
#'
ts_match_names <- function(
  query,
  reference,
  manual_match = NULL,
  max_dist = 10,
  match_no_auth = FALSE,
  match_canon = FALSE,
  collapse_infra = FALSE,
  collapse_infra_exclude = NULL,
  simple = FALSE,
  docker = getOption("ts_docker", default = FALSE),
  tbl_out = getOption("ts_tbl_out", default = FALSE)
) {
  # Check input
  assertthat::assert_that(
    is.character(query) | inherits(query, "data.frame"),
    msg = "query must be of class 'data.frame' or a character vector"
  )
  assertthat::assert_that(
    is.character(reference) | inherits(reference, "data.frame"),
    msg = "reference must be of class 'data.frame' or a character vector"
  )
  assertthat::assert_that(assertthat::is.number(max_dist))
  # These three are used as scalar conditions below, so require a single
  # non-missing logical rather than any logical vector
  assertthat::assert_that(assertthat::is.flag(match_no_auth))
  assertthat::assert_that(assertthat::noNA(match_no_auth))
  assertthat::assert_that(assertthat::is.flag(match_canon))
  assertthat::assert_that(assertthat::noNA(match_canon))
  assertthat::assert_that(assertthat::is.flag(simple))
  assertthat::assert_that(assertthat::noNA(simple))
  assertthat::assert_that(assertthat::is.flag(tbl_out))
  assertthat::assert_that(assertthat::is.flag(collapse_infra))
  if (!is.null(collapse_infra_exclude)) {
    assertthat::assert_that(is.character(collapse_infra_exclude))
  }
  assertthat::assert_that(assertthat::is.flag(docker))
  if (!is.null(manual_match)) {
    assertthat::assert_that(
      isTRUE(inherits(manual_match, "data.frame")),
      msg = "manual_match must be of class 'data.frame'"
    )
    assertthat::assert_that(
      isTRUE(
        all(c("query", "match") %in% colnames(manual_match))
      ),
      msg = "manual_match must have `query` and `match` columns"
    )
    assertthat::assert_that(
      is.character(manual_match$query)
    )
    assertthat::assert_that(
      is.character(manual_match$match)
    )
    assertthat::assert_that(
      assertthat::noNA(manual_match$query)
    )
    # Both columns must be free of missing values
    assertthat::assert_that(
      assertthat::noNA(manual_match$match)
    )
    assertthat::assert_that(
      isTRUE(!any(duplicated(manual_match$query))),
      msg = "All values of manual_match$query must be unique"
    )
    assertthat::assert_that(
      is.character(query),
      msg = "manual_match can only be used if query is a character vector"
    )
  }

  # Helper function to add a namestring to a dataframe of parsed names.
  # The namestring joins all parsed name parts with "_" so that rows with
  # identical parsed components can be identified. `paste()` (not `paste0()`)
  # is needed for `sep` to act as a delimiter between the parts.
  add_namestring <- function(df) {
    df$namestring <-
      paste(
        df$genus_hybrid_sign,
        df$genus_name,
        df$species_hybrid_sign,
        df$specific_epithet,
        df$infraspecific_rank,
        df$infraspecific_epithet,
        df$author,
        sep = "_"
      )
    df
  }

  # Parse or load query names
  if (is.character(query)) {
    # Optional: for manual matches, use matched name instead of query
    # to generate exact match
    if (!is.null(manual_match)) {
      manual_replacement_df <-
        data.frame(
          query_original = query
        ) |>
        dplyr::left_join(
          dplyr::select(
            manual_match,
            query_original = query,
            query_new = match
          ),
          by = "query_original",
          relationship = "one-to-one"
        ) |>
        dplyr::mutate(
          query_new = dplyr::coalesce(query_new, query_original)
        )
      # Several original queries may map to the same replacement name
      query <- manual_replacement_df$query_new |>
        unique()
    }
    # Parse the names (adds 'name' column)
    query_parsed_df <- ts_parse_names(query, docker = docker)
  } else {
    # Or, names are already parsed
    query_parsed_df <- query
  }

  # Optionally collapse infraspecific name
  if (isTRUE(collapse_infra)) {
    # Save a copy of original unmodified parsed query
    query_parsed_df_original <- query_parsed_df
    # Identify rows where infraspecific_epithet is the same as specific_epithet
    # (`%in% TRUE` turns NA comparisons into FALSE), skipping excluded names
    query_parsed_df$same_infra_species <-
      (query_parsed_df$specific_epithet ==
        query_parsed_df$infraspecific_epithet) %in%
        TRUE &
      !query_parsed_df$name %in% collapse_infra_exclude
    assertthat::assert_that(!anyNA(query_parsed_df$same_infra_species))
    # For rows where infraspecific_epithet is the same as specific_epithet,
    # delete infraspecific_epithet and infraspecific_rank
    query_parsed_df$infraspecific_epithet[
      query_parsed_df$same_infra_species
    ] <- NA
    query_parsed_df$infraspecific_rank[query_parsed_df$same_infra_species] <- NA
    query_parsed_df$same_infra_species <- NULL
    # Account for duplicates created after collapsing names: drop them,
    # but remember which retained id each original id maps to
    query_parsed_df <- add_namestring(query_parsed_df) |>
      dplyr::group_by(namestring) |>
      dplyr::mutate(key_id = dplyr::first(id)) |>
      dplyr::ungroup()
    id_map <- dplyr::select(query_parsed_df, id_query = key_id, id)
    query_parsed_df <- query_parsed_df[
      !duplicated(query_parsed_df$namestring),
    ]
    query_parsed_df$namestring <- NULL
  }

  # Write out parsed names to temporary file
  query_parsed_txt <- tempfile(
    pattern = digest::digest(query),
    fileext = ".txt"
  )
  # Remove the temporary file when the function exits (normally or on error)
  on.exit(unlink(query_parsed_txt), add = TRUE)
  if (fs::file_exists(query_parsed_txt)) fs::file_delete(query_parsed_txt)
  ts_write_names(query_parsed_df, query_parsed_txt)

  # Parse or load reference names
  if (is.character(reference)) {
    # Parse the names (adds 'name' column)
    ref_parsed_df <- ts_parse_names(reference, docker = docker)
  } else {
    # Or, names are already parsed
    ref_parsed_df <- reference
  }

  # Check that manually matched ref names are in data
  if (!is.null(manual_match)) {
    assertthat::assert_that(
      isTRUE(all(manual_match$match %in% ref_parsed_df$name)),
      msg = "One or more manually matched reference names not in reference data"
    )
  }

  # Write out parsed names to temporary file
  ref_parsed_txt <- tempfile(
    pattern = digest::digest(reference),
    fileext = ".txt"
  )
  on.exit(unlink(ref_parsed_txt), add = TRUE)
  if (fs::file_exists(ref_parsed_txt)) fs::file_delete(ref_parsed_txt)
  ts_write_names(ref_parsed_df, ref_parsed_txt)

  # Format argument flags (NULL entries are dropped when combined with `c()`)
  no_auth_flag <- if (match_no_auth) "-1" else NULL
  canon_flag <- if (match_canon) "-c" else NULL
  matching_flags <- c(
    "-e",
    max_dist,
    "-F", # no manual matching
    no_auth_flag,
    canon_flag
  )

  # Specify temporary output file
  match_results_txt <- tempfile(
    pattern = digest::digest(c(query, reference)),
    fileext = ".txt"
  )
  on.exit(unlink(match_results_txt), add = TRUE)
  if (fs::file_exists(match_results_txt)) fs::file_delete(match_results_txt)

  # Run taxon-tools matchnames

  if (isTRUE(docker)) {
    assertthat::assert_that(
      requireNamespace("babelwhale", quietly = TRUE),
      msg = "babelwhale needs to be installed to use docker"
    )
    assertthat::assert_that(
      babelwhale::test_docker_installation(),
      msg = "docker not installed"
    )
    # File paths are passed as elements named `file`
    match_results <- run_auto_mount(
      container_id = "camwebb/taxon-tools:v1.3.0",
      command = "matchnames",
      args = c(
        "-a",
        file = query_parsed_txt,
        "-b",
        file = ref_parsed_txt,
        "-o",
        file = match_results_txt,
        matching_flags
      )
    )
  } else {
    assertthat::assert_that(
      ts_tt_installed(),
      msg = "taxon-tools not installed"
    )
    match_results <- processx::run(
      command = "matchnames",
      args = c(
        "-a",
        query_parsed_txt,
        "-b",
        ref_parsed_txt,
        "-o",
        match_results_txt,
        matching_flags
      )
    )
  }

  # Read in results
  # Each line represents a single name from the query list (list A).
  # Seventeen pipe-delimited ("|") fields per row:
  # 1. User ID code in list A,
  # 2. Code in list B (if matched),
  # 3. Match type (see codes below),
  # 4-10. Parsed elements of name in list A.
  # 11-17 (in same format as name input), Parsed elements of name in list B.
  matchnames_cols <- c(
    "id_query",
    "id_ref",
    "match_type",
    "genus_hybrid_sign_query",
    "genus_name_query",
    "species_hybrid_sign_query",
    "specific_epithet_query",
    "infraspecific_rank_query",
    "infraspecific_epithet_query",
    "author_query",
    "genus_hybrid_sign_ref",
    "genus_name_ref",
    "species_hybrid_sign_ref",
    "specific_epithet_ref",
    "infraspecific_rank_ref",
    "infraspecific_epithet_ref",
    "author_ref"
  )

  results <- data.frame(record = readLines(match_results_txt))

  results <- tidyr::separate(
    data = results,
    col = record,
    into = matchnames_cols,
    sep = "\\|",
    fill = "right",
    remove = TRUE
  )

  # Convert empty strings to NA
  results <- dplyr::mutate(
    results,
    dplyr::across(dplyr::everything(), ~ dplyr::na_if(.x, ""))
  )

  # Add back in the original search terms (query and reference)
  results <- dplyr::left_join(
    results,
    dplyr::select(query_parsed_df, id_query = id, query = name),
    by = "id_query"
  )

  results <- dplyr::left_join(
    results,
    dplyr::select(ref_parsed_df, id_ref = id, reference = name),
    by = "id_ref"
  )

  results <- dplyr::select(
    results,
    query,
    reference,
    match_type,
    dplyr::everything()
  )

  # Add back in names that were duplicated due to collapsed infrasp names
  if (isTRUE(collapse_infra)) {
    results <-
      dplyr::select(
        query_parsed_df_original,
        query = name,
        id
      ) |>
      dplyr::left_join(id_map, by = "id") |>
      dplyr::left_join(
        dplyr::select(results, -query),
        by = "id_query"
      ) |>
      dplyr::select(-id) |>
      dplyr::select(query, reference, match_type, dplyr::everything())
  }

  # For manual matches, restore back to original query input, and specify
  # that match was made manually
  # NOTE(review): `query_new` is not dropped here, so it is carried into the
  # non-simple output as an extra column -- confirm whether that is intended
  if (!is.null(manual_match)) {
    results <-
      manual_replacement_df |>
      dplyr::inner_join(
        results,
        by = dplyr::join_by(query_new == query)
      ) |>
      dplyr::mutate(
        match_type = dplyr::case_when(
          query_original %in% manual_match$query ~ "manual",
          .default = match_type
        )
      ) |>
      dplyr::select(
        query = query_original,
        dplyr::everything()
      )
  }

  if (isTRUE(simple)) {
    results <- dplyr::select(results, query, reference, match_type)
  }

  if (isTRUE(tbl_out)) return(tibble::as_tibble(results))

  results
}