├── .github
│   ├── .gitignore
│   └── workflows
│       └── pkgdown.yaml
├── vignettes
│   ├── .gitignore
│   └── basics.Rmd
├── LICENSE
├── man
│   ├── figures
│   │   └── logo.png
│   ├── pipe.Rd
│   ├── ts_classify_result.Rd
│   ├── ts_tt_installed.Rd
│   ├── ts_make_name_df.Rd
│   ├── filmy_taxonomy.Rd
│   ├── ts_write_names.Rd
│   ├── ts_parse_names.Rd
│   ├── ts_resolve_names.Rd
│   └── ts_match_names.Rd
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── _snaps
│       │   ├── ts_write_names
│       │   │   └── parsed_name.txt
│       │   ├── ts_resolve_names.md
│       │   ├── ts_parse_names.md
│       │   └── ts_match_names.md
│       ├── test-utils.R
│       ├── test-ts_parse_names.R
│       ├── test-ts_write_names.R
│       ├── test-ts_resolve_names.R
│       └── test-ts_match_names.R
├── data
│   └── filmy_taxonomy.rda
├── .gitignore
├── .Rbuildignore
├── NAMESPACE
├── R
│   ├── utils-pipe.R
│   ├── data.R
│   ├── ts_tt_installed.R
│   ├── globals.R
│   ├── ts_write_names.R
│   ├── utils.R
│   ├── ts_parse_names.R
│   ├── ts_resolve_names.R
│   └── ts_match_names.R
├── _pkgdown.yml
├── LICENSE.md
├── DESCRIPTION
├── data-raw
│   └── filmy_taxonomy.R
├── README.Rmd
└── README.md
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2019
2 | COPYRIGHT HOLDER: Joel Nitta
3 |
--------------------------------------------------------------------------------
/man/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joelnitta/taxastand/HEAD/man/figures/logo.png
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(taxastand)
3 |
4 | test_check("taxastand")
5 |
--------------------------------------------------------------------------------
/data/filmy_taxonomy.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joelnitta/taxastand/HEAD/data/filmy_taxonomy.rda
--------------------------------------------------------------------------------
/tests/testthat/_snaps/ts_write_names/parsed_name.txt:
--------------------------------------------------------------------------------
1 | 5f207ff2-1||Foogenus|×|barspecies|var.|foosubsp|(L.) F. Bar
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Rprofile
5 | *.Rproj
6 | .DS_Store
7 | docs
8 | inst/doc
9 | /doc/
10 | /Meta/
11 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^taxastand\.Rproj$
2 | ^\.Rproj\.user$
3 | ^\.Rprofile$
4 | ^LICENSE\.md$
5 | ^data-raw$
6 | ^README\.Rmd$
7 | ^_pkgdown\.yml$
8 | ^docs$
9 | ^pkgdown$
10 | ^\.github$
11 | ^doc$
12 | ^Meta$
13 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export("%>%")
4 | export(ts_match_names)
5 | export(ts_parse_names)
6 | export(ts_resolve_names)
7 | export(ts_tt_installed)
8 | export(ts_write_names)
9 | importFrom(magrittr,"%>%")
10 |
--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
1 | #' Pipe operator
2 | #'
3 | #' See \code{magrittr::\link[magrittr]{\%>\%}} for details.
4 | #'
5 | #' @name %>%
6 | #' @rdname pipe
7 | #' @keywords internal
8 | #' @export
9 | #' @importFrom magrittr %>%
10 | #' @usage lhs \%>\% rhs
11 | NULL
12 |
--------------------------------------------------------------------------------
/tests/testthat/_snaps/ts_resolve_names.md:
--------------------------------------------------------------------------------
1 | # Produces expected output with docker
2 |
3 | Code
4 | match_results
5 | Output
6 | query reference match_type
7 | 1 Gonocormus minutum Gonocormus minutus (Bl.) Bosch auto_fuzzy
8 |
9 |
--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils-pipe.R
3 | \name{\%>\%}
4 | \alias{\%>\%}
5 | \title{Pipe operator}
6 | \usage{
7 | lhs \%>\% rhs
8 | }
9 | \description{
10 | See \code{magrittr::\link[magrittr]{\%>\%}} for details.
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/ts_classify_result.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{ts_classify_result}
4 | \alias{ts_classify_result}
5 | \title{Classify results of taxon-tools matching}
6 | \usage{
7 | ts_classify_result(match_results)
8 | }
9 | \arguments{
10 | \item{match_results}{Dataframe; output of tt_match_names()}
11 | }
12 | \value{
13 | Dataframe with column \code{result_type} added
14 | }
15 | \description{
16 | Classify results of taxon-tools matching
17 | }
18 | \keyword{internal}
19 |
--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
1 | #' Taxonomy of filmy ferns (family Hymenophyllaceae)
2 | #'
3 | #' A dataset containing taxonomic names and associated metadata for the
4 | #' fern family Hymenophyllaceae. Downloaded from the
5 | #' [Catalog of Life](http://www.catalogueoflife.org/), Version 1.5.
6 | #' All columns formatted according to
7 | #' [Darwin Core standard](https://dwc.tdwg.org/terms/). Only includes taxa
8 | #' at the species or infraspecies level.
9 | #'
10 | #' @format A data frame with 2729 rows and 31 variables.
11 | #'
12 | #' @source <http://www.catalogueoflife.org/>
13 | "filmy_taxonomy"
14 |
--------------------------------------------------------------------------------
/tests/testthat/test-utils.R:
--------------------------------------------------------------------------------
1 | test_that("Making a dataframe with taxonomic names works", {
2 |   expect_s3_class(
3 |     ts_make_name_df("Foogenus x barspecies var. foosubsp (L.) F. Bar"),
4 |     "data.frame"
5 |   )
6 | })
7 |
8 | test_that("Making a dataframe with taxonomic names checks its input", {
9 |   # Duplicated names are rejected
10 |   expect_error(
11 |     ts_make_name_df(c("Foogenus", "Foogenus")),
12 |     "Input taxa must be unique"
13 |   )
14 |   # Missing names are rejected
15 |   expect_error(
16 |     ts_make_name_df(c("Foogenus", NA)),
17 |     "Input taxa may not contain NAs"
18 |   )
19 | })
20 |
21 | test_that("Classifying match results checks its input", {
22 |   # Anything other than a dataframe is rejected
23 |   expect_error(
24 |     ts_classify_result("Foogenus"),
25 |     "match_results must be of class 'data\\.frame'"
26 |   )
27 | })
28 |
--------------------------------------------------------------------------------
/man/ts_tt_installed.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/ts_tt_installed.R
3 | \name{ts_tt_installed}
4 | \alias{ts_tt_installed}
5 | \title{Test if \href{https://github.com/camwebb/taxon-tools}{taxon-tools} is installed}
6 | \usage{
7 | ts_tt_installed()
8 | }
9 | \value{
10 | \code{TRUE} if \href{https://github.com/camwebb/taxon-tools}{taxon-tools} is
11 | installed, or \code{FALSE} if not.
12 | }
13 | \description{
14 | Test if \href{https://github.com/camwebb/taxon-tools}{taxon-tools} is installed
15 | }
16 | \examples{
17 | ts_tt_installed()
18 | }
19 |
--------------------------------------------------------------------------------
/R/ts_tt_installed.R:
--------------------------------------------------------------------------------
1 | #' Test if [taxon-tools](https://github.com/camwebb/taxon-tools) is installed
2 | #'
3 | #' Tries to run both taxon-tools executables, `parsenames` and `matchnames`,
4 | #' with the `--version` flag.
5 | #'
6 | #' @return `TRUE` if [taxon-tools](https://github.com/camwebb/taxon-tools) is
7 | #' installed, or `FALSE` if not.
8 | #' @export
9 | #'
10 | #' @examples
11 | #' ts_tt_installed()
12 | ts_tt_installed <- function() {
13 |   tryCatch(
14 |     {
15 |       # processx::run() raises an error when a command cannot be started or
16 |       # exits with a non-zero status; either way it is treated as "missing"
17 |       processx::run("parsenames", "--version")
18 |       processx::run("matchnames", "--version")
19 |       TRUE
20 |     },
21 |     error = function(e) FALSE
22 |   )
23 | }
24 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | home:
2 | title: Standardize Taxonomic Names
3 | description: >
4 | Matches species names to a taxonomic standard. Resolves synonyms consistently and reproducibly.
5 | template:
6 | params:
7 | bootswatch: lumen
8 | reference:
9 | - title: "Parse names"
10 | - contents:
11 | - ts_parse_names
12 | - title: "Match names"
13 | - contents:
14 | - ts_match_names
15 | - title: "Resolve names"
16 | - contents:
17 | - ts_resolve_names
18 | - title: "Datasets"
19 | - contents:
20 | - filmy_taxonomy
21 | - title: "I/O"
22 | - contents:
23 | - ts_write_names
24 | - title: "Utilities"
25 | - contents:
26 | - ts_tt_installed
27 |
--------------------------------------------------------------------------------
/tests/testthat/_snaps/ts_parse_names.md:
--------------------------------------------------------------------------------
1 | # Parsing works with docker
2 |
3 | Code
4 | invisible(capture.output(parse_res <- ts_parse_names(
5 | "Foogenus x barspecies var. foosubsp (L.) F. Bar", docker = TRUE)))
6 | parse_res
7 | Output
8 | name id genus_hybrid_sign
9 | 1 Foogenus x barspecies var. foosubsp (L.) F. Bar 5f207ff2-1
10 | genus_name species_hybrid_sign specific_epithet infraspecific_rank
11 | 1 Foogenus × barspecies var.
12 | infraspecific_epithet author
13 | 1 foosubsp (L.) F. Bar
14 |
15 |
--------------------------------------------------------------------------------
/man/ts_make_name_df.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{ts_make_name_df}
4 | \alias{ts_make_name_df}
5 | \title{Make a dataframe with taxonomic names}
6 | \usage{
7 | ts_make_name_df(taxa)
8 | }
9 | \arguments{
10 | \item{taxa}{Character vector; taxon names to be parsed by taxon-tools \code{parsenames}.
11 | Missing values not allowed. Must all be unique.}
12 | }
13 | \value{
14 | Dataframe with two columns: \code{id} and \code{name}
15 | }
16 | \description{
17 | Make a dataframe with taxonomic names
18 | }
19 | \examples{
20 | \dontrun{
21 | ts_make_name_df("Foogenus x barspecies var. foosubsp (L.) F. Bar")
22 | }
23 | }
24 | \keyword{internal}
25 |
--------------------------------------------------------------------------------
/man/filmy_taxonomy.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{filmy_taxonomy}
5 | \alias{filmy_taxonomy}
6 | \title{Taxonomy of filmy ferns (family Hymenophyllaceae)}
7 | \format{
8 | A data frame with 2729 rows and 31 variables.
9 | }
10 | \source{
11 | \url{http://www.catalogueoflife.org/}
12 | }
13 | \usage{
14 | filmy_taxonomy
15 | }
16 | \description{
17 | A dataset containing taxonomic names and associated metadata for the
18 | fern family Hymenophyllaceae. Downloaded from the
19 | \href{http://www.catalogueoflife.org/}{Catalog of Life}, Version 1.5.
20 | All columns formatted according to
21 | \href{https://dwc.tdwg.org/terms/}{Darwin Core standard}. Only includes taxa
22 | at the species or infraspecies level.
23 | }
24 | \keyword{datasets}
25 |
--------------------------------------------------------------------------------
/tests/testthat/test-ts_parse_names.R:
--------------------------------------------------------------------------------
1 | test_that("Input checks work", {
2 | expect_error(
3 | ts_parse_names(c("Foogenus", "Foogenus")),
4 | "Input taxa must be unique"
5 | )
6 | expect_error(
7 | ts_parse_names(c("Foogenus", NA)),
8 | "Input taxa may not contain NAs"
9 | )
10 | })
11 | # The stored docker snapshot records the expect_snapshot() code verbatim; keep it unchanged
12 | test_that("Parsing works with docker", {
13 | skip_if_no_docker()
14 | expect_snapshot({
15 | # Need invisible() and capture.output() to suppress spinner
16 | invisible(
17 | capture.output(
18 | parse_res <- ts_parse_names(
19 | "Foogenus x barspecies var. foosubsp (L.) F. Bar",
20 | docker = TRUE
21 | )
22 | )
23 | )
24 | parse_res
25 | })
26 | })
27 | # Same input without docker = TRUE; skip_if_no_tt() is defined elsewhere (presumably skips without taxon-tools)
28 | test_that("Parsing works with local taxon-tools", {
29 | skip_if_no_tt()
30 | expect_snapshot(
31 | ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
32 | )
33 | })
34 |
--------------------------------------------------------------------------------
/man/ts_write_names.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/ts_write_names.R
3 | \name{ts_write_names}
4 | \alias{ts_write_names}
5 | \title{Write out parsed names to a text file}
6 | \usage{
7 | ts_write_names(df, path)
8 | }
9 | \arguments{
10 | \item{df}{Dataframe with parsed names}
11 |
12 | \item{path}{Path to write dataframe
13 |
14 | Writes out parsed names in a format that can be used by \href{https://github.com/camwebb/taxon-tools}{taxon-tools}
15 | (each part of the scientific name is separated by the pipe symbol (|), with one name per line).}
16 | }
17 | \value{
18 | Path to parsed names
19 | }
20 | \description{
21 | Write out parsed names to a text file
22 | }
23 | \examples{
24 | if (ts_tt_installed()) {
25 | parsed_names <- ts_parse_names(
26 | "Foogenus x barspecies var. foosubsp (L.) F. Bar")
27 | temp_file <- tempfile()
28 | ts_write_names(parsed_names, temp_file)
29 | readLines(temp_file)
30 | file.remove(temp_file)
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/R/globals.R:
--------------------------------------------------------------------------------
1 | # Generated by roxyglobals: do not edit by hand
2 |
3 | utils::globalVariables(c(
4 | "namestring", #
5 | "id", #
6 | "key_id", #
7 | "record", #
8 | "namestring_query", #
9 | "name", #
10 | "match_type", #
11 | "record", #
12 | "id", #
13 | "name", #
14 | "reference", #
15 | "match_type", #
16 | "result_type", #
17 | "acceptedNameUsageID", #
18 | "taxonID", #
19 | "taxonomicStatus", #
20 | "scientificName", #
21 | "resolved_name", #
22 | "resolved_status", #
23 | "n", #
24 | "matched_name", #
25 | "matched_status", #
26 | "query", #
27 | "result_type", #
28 | "n", #
29 | NULL
30 | ))
31 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2019 Joel Nitta
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/testthat/test-ts_write_names.R:
--------------------------------------------------------------------------------
1 | test_that("Input checks work", {
2 |   # A bare character vector is not a dataframe
3 |   expect_error(
4 |     ts_write_names("Foogenus", tempfile()),
5 |     "df must be of class 'data\\.frame'"
6 |   )
7 |
8 |   # A dataframe missing most of the required columns is rejected
9 |   incomplete_df <- data.frame(id = "1", genus_hybrid_sign = "x")
10 |   expect_error(
11 |     ts_write_names(incomplete_df, tempfile()),
12 |     "df must include the following columns"
13 |   )
14 | })
15 |
16 | test_that("Produces expected output file with docker", {
17 |   skip_if_no_docker()
18 |   out_file <- "parsed_name.txt"
19 |   docker_parsed <- ts_parse_names(
20 |     "Foogenus x barspecies var. foosubsp (L.) F. Bar",
21 |     docker = TRUE
22 |   )
23 |   # ts_write_names() returns the path it wrote, which is what gets snapshotted
24 |   expect_snapshot_file(ts_write_names(docker_parsed, out_file), out_file)
25 |   file.remove(out_file)
26 | })
27 |
28 | test_that("Produces expected output file without docker", {
29 |   skip_if_no_tt()
30 |   out_file <- "parsed_name.txt"
31 |   local_parsed <- ts_parse_names(
32 |     "Foogenus x barspecies var. foosubsp (L.) F. Bar"
33 |   )
34 |   expect_snapshot_file(ts_write_names(local_parsed, out_file), out_file)
35 |   file.remove(out_file)
36 | })
37 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: taxastand
2 | Title: Taxonomic Name Standardization
3 | Version: 1.0.0
4 | Authors@R:
5 | person(given = "Joel",
6 | family = "Nitta",
7 | role = c("aut", "cre"),
8 | email = "joelnitta@gmail.com")
9 | Description: Matches species names to a taxonomic standard. Resolves synonyms consistently and reproducibly.
10 | License: MIT + file LICENSE
11 | Encoding: UTF-8
12 | LazyData: true
13 | SystemRequirements:
14 | parsenames (<https://github.com/camwebb/taxon-tools>),
15 | matchnames (<https://github.com/camwebb/taxon-tools>)
16 | Imports:
17 | assertr,
18 | assertthat,
19 | digest,
20 | dplyr,
21 | fs,
22 | glue,
23 | magrittr,
24 | processx,
25 | tibble,
26 | tidyr
27 | Roxygen: list(
28 | markdown = TRUE,
29 | roclets = c("collate", "namespace", "rd", "roxyglobals::global_roclet"))
30 | RoxygenNote: 7.3.2
31 | Depends: R (>= 4.1.0)
32 | Suggests:
33 | rmarkdown,
34 | knitr,
35 | roxyglobals (>= 0.2.1),
36 | testthat (>= 3.0.0),
37 | babelwhale
38 | Config/testthat/edition: 3
39 | Remotes:
40 | anthonynorth/roxyglobals
41 | VignetteBuilder: knitr
42 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | tags: ['*']
7 |
8 | name: pkgdown
9 |
10 | jobs:
11 | pkgdown:
12 | runs-on: ubuntu-latest
13 | env:
14 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
15 | steps:
16 | - uses: actions/checkout@v2
17 |
18 | - uses: r-lib/actions/setup-pandoc@v1
19 |
20 | - uses: r-lib/actions/setup-r@v1
21 | with:
22 | use-public-rspm: true
23 |
24 | - uses: r-lib/actions/setup-r-dependencies@v1
25 | with:
26 | extra-packages: pkgdown
27 | needs: website
28 |
29 | - name: Install dependencies
30 | run: |
31 | sudo apt-get install -y --no-install-recommends gawk
32 | git clone https://github.com/camwebb/taxon-tools.git
33 | cd taxon-tools
34 | git checkout 8f8b5e2611b6fdef1998b7878e93e60a9bc7c130
35 | make check
36 | sudo make install
37 | cd ..
38 |
39 | - name: Deploy package
40 | run: |
41 | git config --local user.name "$GITHUB_ACTOR"
42 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
43 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)'
44 |
--------------------------------------------------------------------------------
/data-raw/filmy_taxonomy.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 |
3 | # Load the example standard taxonomy for resolving names.
4 |
5 | # The example standard taxonomy is the family Hymenophyllaceae from
6 | # Catalog of Life (CoL). CoL provides persistent links to database dumps.
7 | # This one was obtained by selecting "Hymenophyllaceae" for "family"
8 | # and "Complete data" on http://www.catalogueoflife.org/DCA_Export/index.php
9 | # on 2019-06-19
10 |
11 | # Download the zip file. The local copy is named after the family archive
12 | # that is actually downloaded.
13 | temp_dir <- fs::dir_create(tempdir())
14 | zip_file <- fs::path(temp_dir, "archive-family-hymenophyllaceae-bl3.zip")
15 | download.file(
16 |   "http://www.catalogueoflife.org/DCA_Export/zip/archive-family-hymenophyllaceae-bl3.zip",
17 |   zip_file,
18 |   mode = "wb" # zip archives are binary
19 | )
20 |
21 | # Unzip
22 | unzip(zip_file, exdir = temp_dir)
23 |
24 | # Read in taxonomy table, keep only
25 | # names at species rank and below
26 | # (warnings are produced because names at genus level
27 | # and above have NA for many fields).
28 | filmy_taxonomy <- read_tsv(fs::path(temp_dir, "taxa.txt")) %>%
29 |   filter(str_detect(taxonRank, "species"))
30 |
31 | # Replace "v. d. Bosch" with "V. D. Bosch"
32 | # see https://github.com/camwebb/taxon-tools/issues/10
33 | # The pattern is matched literally with fixed(); as a regular expression
34 | # each "." would match any character.
35 | filmy_taxonomy <-
36 |   filmy_taxonomy %>%
37 |   dplyr::mutate(
38 |     scientificName = stringr::str_replace_all(
39 |       scientificName,
40 |       stringr::fixed("v. d. Bosch"),
41 |       "V. D. Bosch"
42 |     )
43 |   )
44 |
45 | # overwrite = TRUE so that re-running this script refreshes the packaged data
46 | usethis::use_data(filmy_taxonomy, overwrite = TRUE)
47 |
--------------------------------------------------------------------------------
/tests/testthat/test-ts_resolve_names.R:
--------------------------------------------------------------------------------
1 | data(filmy_taxonomy)
2 | # filmy_taxonomy is the example dataset documented in R/data.R (data/filmy_taxonomy.rda)
3 | test_that("Input checks work", {
4 | expect_error(
5 | ts_resolve_names(10, data.frame(genus = "Foogenus")),
6 | "query must be of class"
7 | )
8 | expect_error(
9 | ts_resolve_names(data.frame(genus = "Foogenus"), 10),
10 | "ref_taxonomy must be of class"
11 | )
12 | })
13 | # The stored snapshot records the expression `match_results` by name; keep the name unchanged
14 | test_that("Produces expected output with docker", {
15 | skip_if_no_docker()
16 | # Query a misspelled name
17 | match_results <- ts_match_names(
18 | query = "Gonocormus minutum",
19 | reference = unique(filmy_taxonomy$scientificName),
20 | simple = TRUE,
21 | docker = TRUE
22 | )
23 | expect_s3_class(
24 | ts_resolve_names(match_results, filmy_taxonomy),
25 | "data.frame"
26 | )
27 | expect_s3_class(
28 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy, docker = TRUE),
29 | "data.frame"
30 | )
31 | expect_snapshot(match_results)
32 | })
33 |
34 | # Same checks without docker = TRUE; skip_if_no_tt() is defined elsewhere (presumably skips without taxon-tools)
35 | test_that("Produces expected output without docker", {
36 | skip_if_no_tt()
37 | # Query a misspelled name
38 | match_results <- ts_match_names(
39 | query = "Gonocormus minutum",
40 | reference = unique(filmy_taxonomy$scientificName),
41 | simple = TRUE
42 | )
43 | expect_s3_class(
44 | ts_resolve_names(match_results, filmy_taxonomy),
45 | "data.frame"
46 | )
47 | expect_s3_class(
48 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy),
49 | "data.frame"
50 | )
51 | expect_snapshot(match_results)
52 | })
53 |
--------------------------------------------------------------------------------
/R/ts_write_names.R:
--------------------------------------------------------------------------------
1 | #' Write out parsed names to a text file
2 | #'
3 | #' Writes out parsed names in a format that can be used by
4 | #' [taxon-tools](https://github.com/camwebb/taxon-tools)
5 | #' (each part of the scientific name is separated by the pipe symbol (|), with
6 | #' one name per line).
7 | #'
8 | #' @param df Dataframe with parsed names. Must include the columns `id`,
9 | #'   `genus_hybrid_sign`, `genus_name`, `species_hybrid_sign`,
10 | #'   `specific_epithet`, `infraspecific_rank`, `infraspecific_epithet`, and
11 | #'   `author`; any other columns are ignored. Missing values are written as
12 | #'   empty strings.
13 | #' @param path Path to write dataframe
14 | #'
15 | #' @autoglobal
16 | #' @return Path to parsed names
17 | #' @export
18 | #' @examples
19 | #' if (ts_tt_installed()) {
20 | #'   parsed_names <- ts_parse_names(
21 | #'     "Foogenus x barspecies var. foosubsp (L.) F. Bar")
22 | #'   temp_file <- tempfile()
23 | #'   ts_write_names(parsed_names, temp_file)
24 | #'   readLines(temp_file)
25 | #'   file.remove(temp_file)
26 | #' }
27 | ts_write_names <- function(df, path) {
28 |   # Standard taxon-tools columns, in the order they are written out
29 |   tt_col_names <- c(
30 |     "id",
31 |     "genus_hybrid_sign",
32 |     "genus_name",
33 |     "species_hybrid_sign",
34 |     "specific_epithet",
35 |     "infraspecific_rank",
36 |     "infraspecific_epithet",
37 |     "author"
38 |   )
39 |
40 |   assertthat::assert_that(
41 |     inherits(df, "data.frame"),
42 |     msg = "df must be of class 'data.frame'"
43 |   )
44 |   assertthat::assert_that(
45 |     isTRUE(all(tt_col_names %in% colnames(df))),
46 |     msg = glue::glue(
47 |       "df must include the following columns: {paste(tt_col_names, collapse = ', ')}"
48 |     )
49 |   )
50 |
51 |   # Subset to only taxon-tools columns, in order. This happens before NA
52 |   # replacement so that unrelated columns cannot make the replacement fail.
53 |   df <- dplyr::select(df, dplyr::all_of(tt_col_names))
54 |
55 |   # Replace NA values with "". Columns are converted to character first so
56 |   # that non-character columns (e.g. an all-NA logical column) can hold "".
57 |   df <- dplyr::mutate(
58 |     df,
59 |     dplyr::across(
60 |       dplyr::all_of(tt_col_names),
61 |       ~ tidyr::replace_na(as.character(.), "")
62 |     )
63 |   )
64 |
65 |   # taxon-tools uses pipe as separator
66 |   df <- tidyr::unite(df, col = "text", dplyr::all_of(tt_col_names), sep = "|")
67 |
68 |   # write out text
69 |   writeLines(df$text, path)
70 |
71 |   path
72 | }
73 |
--------------------------------------------------------------------------------
/tests/testthat/_snaps/ts_match_names.md:
--------------------------------------------------------------------------------
1 | # Produces expected output in docker
2 |
3 | Code
4 | match_res
5 | Output
6 | query reference match_type id_query id_ref
7 | 1 Crepidomanes minutus Crepidomanes minutum auto_fuzzy c1ad73ec-1 19b861c8-1
8 | genus_hybrid_sign_query genus_name_query species_hybrid_sign_query
9 | 1 Crepidomanes
10 | specific_epithet_query infraspecific_rank_query infraspecific_epithet_query
11 | 1 minutus
12 | author_query genus_hybrid_sign_ref genus_name_ref species_hybrid_sign_ref
13 | 1 Crepidomanes
14 | specific_epithet_ref infraspecific_rank_ref infraspecific_epithet_ref
15 | 1 minutum
16 | author_ref
17 | 1
18 |
19 | # Manually matched names work
20 |
21 | Code
22 | match_res
23 | Output
24 | query reference match_type
25 | 1 Crepidomanes minutus Crepidomanes minutum auto_fuzzy
26 | 2 Hymeefee erae Hymenophyllum polyanthos manual
27 |
28 | # Names that can't be parsed don't show up in results
29 |
30 | Code
31 | match_res
32 | Output
33 | # A tibble: 1 x 3
34 | query reference match_type
35 |
36 | 1 Crepidomanes minutus Crepidomanes minutum auto_fuzzy
37 |
38 | # Manually matched names work with collapsed infrasp names
39 |
40 | Code
41 | match_res
42 | Output
43 | # A tibble: 6 x 3
44 | query reference match_type
45 |
46 | 1 Crepidomanes minutus Crepidomanes minutum auto_fuzzy
47 | 2 Crepidomanes minutawtaw Crepidomanes minutum manual
48 | 3 Blechnum lunare var. lunare Blechnum lunare exact
49 | 4 Blechnum lunare Blechnum lunare exact
50 | 5 Bar foo var. foo Bar foo manual
51 | 6 Bar foo Bar foo exact
52 |
53 |
--------------------------------------------------------------------------------
/man/ts_parse_names.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/ts_parse_names.R
3 | \name{ts_parse_names}
4 | \alias{ts_parse_names}
5 | \title{Parse taxonomic names}
6 | \usage{
7 | ts_parse_names(
8 | taxa,
9 | tbl_out = getOption("ts_tbl_out", default = FALSE),
10 | quiet = FALSE,
11 | docker = getOption("ts_docker", default = FALSE)
12 | )
13 | }
14 | \arguments{
15 | \item{taxa}{Character vector; taxon names to be parsed by taxon-tools
16 | \code{parsenames}. Missing values not allowed. Must all be unique.}
17 |
18 | \item{tbl_out}{Logical vector of length 1; should a tibble be returned?
19 | If \code{FALSE} (default), output will be a data.frame. This argument can
20 | be controlled via the option \code{ts_tbl_out}; see Examples.}
21 |
22 | \item{quiet}{Logical; if TRUE, suppress warning messages that would normally
23 | be issued}
24 |
25 | \item{docker}{Logical; if TRUE, docker will be used to run taxon-tools
26 | (so that taxon-tools need not be installed).}
27 | }
28 | \value{
29 | A dataframe including the following columns.
30 | \itemize{
31 | \item id: A unique ID number assigned to the input name
32 | \item name: The input name
33 | \item genus_hybrid_sign: Hybrid sign for genus
34 | \item genus_name: Genus name
35 | \item species_hybrid_sign: Hybrid sign for species
36 | \item specific_epithet: Specific epithet (name)
37 | \item infraspecific_rank: Infraspecific rank
38 | \item infraspecific_epithet: Infraspecific epithet (name)
39 | \item author: Name of taxon
40 | }
41 | }
42 | \description{
43 | Requires \href{https://github.com/camwebb/taxon-tools}{taxon-tools} or docker
44 | to be installed.
45 | }
46 | \details{
47 | Parses scientific names into their component parts (genus, species, variety,
48 | author, etc).
49 | }
50 | \examples{
51 | # Using local taxon-tools installation
52 | if (ts_tt_installed()) {
53 |
54 | ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
55 | ts_parse_names(
56 | "Foogenus x barspecies var. foosubsp (L.) F. Bar", tbl_out = TRUE)
57 |
58 | # If you always want tibble output without specifying `tbl_out = TRUE`
59 | # every time, set the option:
60 | options(ts_tbl_out = TRUE)
61 | ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
62 | ts_parse_names("Crepidomanes minutum (Blume) K. Iwats.")
63 |
64 | }
65 |
66 | # Using docker
67 | if (babelwhale::test_docker_installation()) {
68 |
69 | ts_parse_names(
70 | "Foogenus x barspecies var. foosubsp (L.) F. Bar",
71 | docker = TRUE)
72 |
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/tests/testthat/test-ts_match_names.R:
--------------------------------------------------------------------------------
1 | test_that("Input checks work", {
2 | expect_error(
3 | ts_match_names(10, "Foogenus"),
4 | "query must be of class"
5 | )
6 | expect_error(
7 | ts_match_names("Foogenus", 10),
8 | "reference must be of class"
9 | )
10 | expect_error(
11 | ts_match_names(10, data.frame(genus = "Foogenus")),
12 | "query must be of class"
13 | )
14 | expect_error(
15 | ts_match_names(data.frame(genus = "Foogenus"), 10),
16 | "reference must be of class"
17 | )
18 | })
19 |
20 | test_that("Produces expected output in docker", {
21 | skip_if_no_docker()
22 | match_res <- ts_match_names(
23 | "Crepidomanes minutus",
24 | "Crepidomanes minutum",
25 | docker = TRUE
26 | )
27 | expect_s3_class(match_res, "data.frame")
28 | expect_snapshot(match_res)
29 | })
30 |
31 | test_that("Produces expected output without docker", {
32 | skip_if_no_tt()
33 | match_res <- ts_match_names(
34 | "Crepidomanes minutus",
35 | "Crepidomanes minutum"
36 | )
37 | expect_s3_class(match_res, "data.frame")
38 | expect_snapshot(match_res)
39 | })
40 |
41 | test_that("Manually matched names work", {
42 | skip_if_no_docker()
43 | match_res <- ts_match_names(
44 | query = c("Crepidomanes minutus", "Hymeefee erae"),
45 | reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
46 | manual_match = data.frame(
47 | query = "Hymeefee erae",
48 | match = "Hymenophyllum polyanthos"
49 | ),
50 | simple = TRUE,
51 | docker = TRUE
52 | )
53 | expect_snapshot(match_res)
54 | })
55 |
# The first query is a three-way hybrid formula; the test title states that
# names which cannot be parsed are left out of the results. The snapshot pins
# the exact output.
test_that("Names that can't be parsed don't show up in results", {
  skip_if_no_docker()

  query_names <- c(
    "Vanden kalamocarpa x Vanden nipponica x Vanden striata",
    "Crepidomanes minutus"
  )
  ref_names <- c("Crepidomanes minutum")

  match_res <- ts_match_names(
    query = query_names,
    reference = ref_names,
    simple = TRUE,
    docker = TRUE,
    tbl_out = TRUE
  )

  expect_snapshot(match_res)
})
72 |
# Combines `manual_match` with `collapse_infra = TRUE`: one manually matched
# query ("Bar foo var. foo") is itself a name whose infraspecific epithet
# repeats the specific epithet. The snapshot pins the exact output.
test_that("Manually matched names work with collapsed infrasp names", {
  skip_if_no_docker()

  query_names <- c(
    "Crepidomanes minutus",
    "Crepidomanes minutawtaw",
    "Blechnum lunare var. lunare",
    "Blechnum lunare",
    "Bar foo var. foo",
    "Bar foo"
  )
  ref_names <- c(
    "Crepidomanes minutum",
    "Hymenophyllum polyanthos",
    "Blechnum lunare",
    "Bar foo"
  )
  forced_match <- data.frame(
    query = c("Bar foo var. foo", "Crepidomanes minutawtaw"),
    match = c("Bar foo", "Crepidomanes minutum")
  )

  match_res <- ts_match_names(
    query = query_names,
    reference = ref_names,
    manual_match = forced_match,
    max_dist = 10,
    match_no_auth = FALSE,
    match_canon = FALSE,
    collapse_infra = TRUE,
    collapse_infra_exclude = NULL,
    simple = TRUE,
    docker = TRUE,
    tbl_out = TRUE
  )

  expect_snapshot(match_res)
})
105 |
# Each expectation feeds ts_match_names() one malformed `manual_match` (or an
# unsupported query class) and checks for the corresponding error message.
test_that("Incorrectly specified manual match fails", {
  skip_if_no_docker()

  query_names <- c("Crepidomanes minutus", "Hymeefee erae")
  ref_names <- c("Crepidomanes minutum", "Hymenophyllum polyanthos")

  # Manually matched name is absent from the reference
  match_not_in_ref <- data.frame(
    query = "Hymeefee erae",
    match = "Hymenophyllum poWHAT"
  )
  expect_error(
    ts_match_names(
      query = query_names,
      reference = ref_names,
      manual_match = match_not_in_ref,
      simple = TRUE,
      docker = TRUE
    ),
    regexp = "One or more manually matched reference names not in reference data"
  )

  # The same query is manually matched twice
  duplicated_query <- data.frame(
    query = c("Crepidomanes minutus", "Crepidomanes minutus"),
    match = c("Hymenophyllum polyanthos", "Crepidomanes minutum")
  )
  expect_error(
    ts_match_names(
      query = query_names,
      reference = ref_names,
      manual_match = duplicated_query,
      simple = TRUE,
      docker = TRUE
    ),
    regexp = "All values of manual_match\\$query must be unique"
  )

  # Column is called `name` where `query` is required
  wrong_columns <- data.frame(
    name = c("Hymenophyllum polyantha", "Crepidomanes minutu"),
    match = c("Hymenophyllum polyanthos", "Crepidomanes minutum")
  )
  expect_error(
    ts_match_names(
      query = query_names,
      reference = ref_names,
      manual_match = wrong_columns,
      simple = TRUE,
      docker = TRUE
    ),
    regexp = "manual_match must have `query` and `match` columns"
  )

  # Query is a dataframe of parsed names, not a character vector
  single_match <- data.frame(
    query = c("Hymenophyllum polyantha"),
    match = c("Hymenophyllum polyanthos")
  )
  expect_error(
    ts_match_names(
      query = ts_parse_names("Hymenophyllum polyantha", docker = TRUE),
      reference = ref_names,
      manual_match = single_match,
      simple = TRUE,
      docker = TRUE
    ),
    regexp = "manual_match can only be used if query is a character vector"
  )
})
161 |
--------------------------------------------------------------------------------
/man/ts_resolve_names.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/ts_resolve_names.R
3 | \name{ts_resolve_names}
4 | \alias{ts_resolve_names}
5 | \title{Resolve synonyms in taxonomic names}
6 | \usage{
7 | ts_resolve_names(
8 | query,
9 | ref_taxonomy,
10 | max_dist = 10,
11 | match_no_auth = FALSE,
12 | match_canon = FALSE,
13 | collapse_infra = FALSE,
14 | collapse_infra_exclude = NULL,
15 | docker = getOption("ts_docker", default = FALSE),
16 | tbl_out = getOption("ts_tbl_out", default = FALSE)
17 | )
18 | }
19 | \arguments{
20 | \item{query}{Character vector or dataframe; taxonomic names to be resolved.
21 | If a character vector, missing values not allowed and all values must be
22 | unique. If a dataframe, should be taxonomic names matched with
23 | \code{\link{ts_match_names}()}.}
24 |
25 | \item{ref_taxonomy}{Dataframe; reference taxonomic data adhering to the
26 | \href{https://dwc.tdwg.org/terms/#taxon}{Darwin Core standard} with the
27 | following columns:
28 | \itemize{
29 | \item \code{taxonID}: \href{https://dwc.tdwg.org/terms/#dwc:taxonID}{Unique identifier for each taxon}.
30 | \item \code{acceptedNameUsageID}: If the taxon is a synonym, the \href{https://dwc.tdwg.org/terms/#dwc:acceptedNameUsageID}{unique identifier for the accepted name}
31 | \item \code{taxonomicStatus}: \href{https://dwc.tdwg.org/terms/#dwc:taxonomicStatus}{The status of the use of the \code{scientificName} as a label for the taxon}.
32 | \item \code{scientificName}: \href{https://dwc.tdwg.org/terms/#dwc:scientificName}{The full scientific name of the taxon},
33 | with authorship and date information if known.
34 | }}
35 |
36 | \item{max_dist}{Max Levenshtein distance to allow during fuzzy matching
37 | (total insertions, deletions and substitutions). Default: 10.}
38 |
39 | \item{match_no_auth}{Logical; If no author is given in the query and the name
40 | (without author) occurs only once in the reference, accept the name in the
41 | reference as a match. Default: to not allow such a match (\code{FALSE}).}
42 |
43 | \item{match_canon}{Logical; Allow a "canonical name" match if only the genus,
44 | species epithet, and infraspecific epithet (if present) match exactly.
45 | Default: to not allow such a match (\code{FALSE}).}
46 |
47 | \item{collapse_infra}{Logical; if the specific epithet and infraspecific
48 | epithet are the same, drop the infraspecific rank and epithet from the query.
49 | For more information, see \code{\link{ts_match_names}()}.}
50 |
51 | \item{collapse_infra_exclude}{Character vector; taxonomic names to exclude
52 | from collapsing with \code{collapse_infra}. Any names used must match those in \code{query}
53 | exactly, or they won't be excluded.}
54 |
55 | \item{docker}{Logical; if TRUE, docker will be used to run taxon-tools
56 | (so that taxon-tools need not be installed).}
57 |
58 | \item{tbl_out}{Logical vector of length 1; should a tibble be returned?
59 | If \code{FALSE} (default), output will be a data.frame. This argument can
60 | be controlled via the option \code{ts_tbl_out}; see Examples.}
61 | }
62 | \value{
63 | Dataframe; results of resolving synonyms in matched taxonomic names.
64 | Includes the following columns:
65 | \itemize{
66 | \item \code{query}: Query name
67 | \item \code{resolved_name}: Accepted name after resolving synonyms
68 | \item \code{matched_name}: Name matched to query
69 | \item \code{resolved_status}: Taxonomic status of the resolved name (same as \code{taxonomicStatus} in \code{ref_taxonomy})
70 | \item \code{matched_status}: Taxonomic status of the matched name (same as \code{taxonomicStatus} in \code{ref_taxonomy})
71 | \item \code{match_type}: Type of match (for a summary of match types, \href{https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes}{see taxon-tools manual})
72 | }
73 |
74 | Names that could not be matched or resolve to multiple, different synonyms
75 | have \code{NA} for \code{resolved_name}.
76 | }
77 | \description{
78 | After matching taxonomic names to a reference, some may match synonyms. This
79 | function resolves synonyms to their accepted names.
80 | }
81 | \details{
82 | \code{query} can take as input either a character vector of taxonomic names, or
83 | the output of \code{\link{ts_match_names}()}. If the former, it will run
84 | \code{\link{ts_match_names}()} to match the query to \code{ref_taxonomy}, then
85 | resolve synonyms. If the latter, the scientific names in \code{ref_taxonomy}
86 | should be the same used as reference with \code{\link{ts_match_names}()}
87 | (this is not checked).
88 |
89 | \code{ref_taxonomy} must be taxonomic data adhering to the \href{https://dwc.tdwg.org/terms/#taxon}{Darwin Core standard}.
90 | Darwin Core includes many terms, but only four (\code{taxonID},
91 | \code{acceptedNameUsageID}, \code{taxonomicStatus}, and \code{scientificName}) are required
92 | for this function.
93 | }
94 | \examples{
95 | if (ts_tt_installed()) {
96 | # Load reference taxonomy in Darwin Core format
97 | data(filmy_taxonomy)
98 |
99 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
100 | # If you always want tibble output without specifying `tbl_out = TRUE`
101 | # every time, set the option:
102 | options(ts_tbl_out = TRUE)
103 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
104 | }
105 |
106 | }
107 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
#' Make a dataframe with taxonomic names
#'
#' Validates a character vector of taxon names and pairs each name with an ID.
#' The ID is the first 8 characters of the hash of the whole input vector
#' (computed with `digest::digest()`), followed by "-" and the position of the
#' name in the input.
#'
#' @param taxa Character vector; taxon names to be parsed by taxon-tools
#'   `parsenames`. Missing values not allowed. Must all be unique.
#'
#' @return Dataframe with two columns: `id` and `name`, with one row per
#'   element of `taxa` (zero rows if `taxa` is empty).
#' @keywords internal
#' @examples
#' \dontrun{
#' ts_make_name_df("Foogenus x barspecies var. foosubsp (L.) F. Bar")
#' }
ts_make_name_df <- function(taxa) {
  assertthat::assert_that(is.character(taxa))
  assertthat::assert_that(
    assertthat::noNA(taxa),
    msg = "Input taxa may not contain NAs"
  )
  # NAs were rejected above, so "no duplicates" is the same as "all unique"
  assertthat::assert_that(
    anyDuplicated(taxa) == 0L,
    msg = "Input taxa must be unique"
  )

  # Format input names as data frame with unique ID
  # ID is combination of first 8 chars of hash of the
  # input (taxa), followed by "-" and integer
  taxa_df <- data.frame(name = taxa)
  n_taxa <- nrow(taxa_df)
  hash_prefix <- substr(digest::digest(taxa), 1, 8)

  # paste() treats a zero-length vector as "", which would produce one ID for
  # zero names, so the empty case is handled explicitly
  taxa_df$id <- if (n_taxa > 0) {
    paste(hash_prefix, seq_len(n_taxa), sep = "-")
  } else {
    character(0)
  }

  taxa_df[, c("id", "name")]
}
32 |
#' Classify results of taxon-tools matching
#'
#' Labels each row by whether its query matched anything and, if so, by how
#' many rows that query occupies in the results:
#' - `"single_match"`: matched, and the query appears in exactly one row
#' - `"mult_match"`: matched, and the query appears in more than one row
#' - `"no_match"`: `match_type` is `"no_match"`
#'
#' An error is raised if any row cannot be classified (e.g. `match_type` is
#' missing).
#'
#' @param match_results Dataframe; match results containing `query` and
#'   `match_type` columns (such as the output of `ts_match_names()`).
#'
#' @return Dataframe with column `result_type` added
#' @keywords internal
#' @autoglobal
ts_classify_result <- function(match_results) {
  assertthat::assert_that(
    inherits(match_results, "data.frame"),
    msg = "match_results must be of class 'data.frame'"
  )

  # Temporary column `n`: number of rows sharing the same query
  counted <- dplyr::add_count(match_results, query)

  classified <- dplyr::mutate(
    counted,
    result_type = dplyr::case_when(
      n == 1 & match_type != "no_match" ~ "single_match",
      n > 1 & match_type != "no_match" ~ "mult_match",
      match_type == "no_match" ~ "no_match",
      TRUE ~ NA_character_
    )
  )

  # Every row must have received a label
  checked <- assertr::assert(classified, assertr::not_na, result_type)

  # Drop the temporary count
  dplyr::select(checked, -n)
}
58 |
# Helper function for tests: skip test if docker is not available.
#
# babelwhale is only needed for docker support, so a missing babelwhale
# installation also results in a skip (not an error). Returns TRUE invisibly
# when docker can be used.
skip_if_no_docker <- function() {
  if (!requireNamespace("babelwhale", quietly = TRUE)) {
    testthat::skip("babelwhale not installed")
  }
  if (!isTRUE(babelwhale::test_docker_installation())) {
    testthat::skip("docker not installed")
  }
  invisible(TRUE)
}
66 |
# Helper function for tests: skip test if taxon-tools is not installed.
# Returns TRUE invisibly when ts_tt_installed() reports an installation.
skip_if_no_tt <- function() {
  tt_available <- ts_tt_installed()
  if (!tt_available) {
    testthat::skip("taxon-tools not installed")
  }
  invisible(TRUE)
}
74 |
#' Run a containerised command with automatic mounting of files
#'
#' Similar to [babelwhale::run()], but automatically mounts files (and
#' directories) so the user doesn't have to keep track of volumes.
#'
#' The main difference to [babelwhale::run()] is the use of names in `args`:
#' any file (or directory) that should be mounted inside the container must be
#' named `file`. The other elements (arguments) don't need to be named. Note
#' that it is fine to have multiple elements with the same name (`file`).
#'
#' The directory containing each file is mounted at its own location in the
#' container, and the corresponding element of `args` is replaced by the path
#' of the file inside the container.
#'
#' This should generally work as long as the command accepts absolute paths
#' for file input. If that is not the case, use [babelwhale::run()] instead
#' and specify paths and mounting manually.
#'
#' @inheritParams babelwhale::run
#' @param args Character vector, arguments to the command. Any files or
#'   directories that should be mounted must be named "file" (see example).
#'   May be `NULL` or contain no "file" elements, in which case no files are
#'   mounted.
#' @param wd Local working directory to run command. If specified, the working
#'   directory will be mounted to the docker container.
#' @param wd_in_container Working directory to run command in
#'   the container. Defaults to the working directory mounted to the container
#'   (`wd`).
#'
#' @return List, formatted as output from [processx::run()]
#' @noRd
#' @examples
#' \dontrun{
#' if (babelwhale::test_docker_installation()) {
#'
#'   # Count the number of lines in the DESCRIPTION and LICENSE
#'   # files of this package
#'   run_auto_mount(
#'     container_id = "alpine",
#'     command = "wc",
#'     args = c("-l",
#'       file = system.file("DESCRIPTION", package = "babelwhale"),
#'       file = system.file("LICENSE", package = "babelwhale")
#'     )
#'   )
#'
#' }
#' }
run_auto_mount <- function(
  container_id,
  command,
  args = NULL,
  wd = NULL,
  wd_in_container = NULL,
  environment_variables = NULL,
  debug = FALSE,
  verbose = FALSE,
  stdout = "|",
  stderr = "|"
) {
  # Locate the arguments flagged as files. Unlike `==`, `%in%` never yields
  # NA and gives an empty mask when `args` is NULL or has no names.
  is_file <- names(args) %in% "file"
  n_files <- sum(is_file)

  # Make (most likely) unique prefix for folder name that
  # won't conflict with an existing folder in the container
  # based on the hash of the container id and command
  prefix <- digest::digest(c(container_id, command))

  # One volume per file: the directory holding the file is mounted at
  # /<prefix>_<i>, and the file argument is rewritten to point there
  file_volumes <- character(0)
  if (n_files > 0) {
    # Convert paths of file arguments to absolute for docker
    in_path <- fs::path_abs(args[is_file])
    in_file <- fs::path_file(in_path)
    in_dir <- fs::path_dir(in_path)

    mount_point <- paste0("/", prefix, "_", seq_len(n_files))
    file_volumes <- paste0(in_dir, ":", mount_point)

    # Replace file arg paths with location in container
    args[is_file] <- paste0(mount_point, "/", in_file)
  }

  # Specify volume mounting for working directory
  wd_volume <- NULL
  if (!is.null(wd)) {
    wd_path <- fs::path_abs(wd)
    if (is.null(wd_in_container)) {
      wd_in_container <- paste0("/", prefix, "_wd")
    }
    wd_volume <- paste0(wd_path, ":", wd_in_container)
  }

  # Specify all volumes: one per file, plus working directory.
  # With nothing to mount, pass NULL to babelwhale::run().
  volumes <- unique(c(file_volumes, wd_volume))
  if (length(volumes) == 0) {
    volumes <- NULL
  }

  # Run docker via babelwhale
  babelwhale::run(
    container_id = container_id,
    command = command,
    args = args,
    volumes = volumes,
    workspace = wd_in_container,
    environment_variables = environment_variables,
    debug = debug,
    verbose = verbose,
    stdout = stdout,
    stderr = stderr
  )
}
174 |
--------------------------------------------------------------------------------
/R/ts_parse_names.R:
--------------------------------------------------------------------------------
#' Parse taxonomic names
#'
#' Requires [taxon-tools](https://github.com/camwebb/taxon-tools) or docker
#' to be installed.
#'
#' Parses scientific names into their component parts (genus, species, variety,
#' author, etc).
#'
#' Names that cannot be parsed are excluded from the results (with a warning
#' unless `quiet` is `TRUE`). If no name at all can be parsed, an error is
#' raised.
#'
#' @param taxa Character vector; taxon names to be parsed by taxon-tools
#'   `parsenames`. Missing values not allowed. Must all be unique.
#' @param tbl_out Logical vector of length 1; should a tibble be returned?
#'   If `FALSE` (default), output will be a data.frame. This argument can
#'   be controlled via the option `ts_tbl_out`; see Examples.
#' @param quiet Logical; if TRUE, suppress warning messages that would normally
#'   be issued
#' @param docker Logical; if TRUE, docker will be used to run taxon-tools
#'   (so that taxon-tools need not be installed).
#'
#' @return A dataframe including the following columns.
#' - name: The input name
#' - id: A unique ID assigned to the input name
#' - genus_hybrid_sign: Hybrid sign for genus
#' - genus_name: Genus name
#' - species_hybrid_sign: Hybrid sign for species
#' - specific_epithet: Specific epithet (name)
#' - infraspecific_rank: Infraspecific rank
#' - infraspecific_epithet: Infraspecific epithet (name)
#' - author: Taxonomic author of the name
#'
#' @autoglobal
#' @export
#' @examples
#' # Using local taxon-tools installation
#' if (ts_tt_installed()) {
#'
#'   ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
#'   ts_parse_names(
#'     "Foogenus x barspecies var. foosubsp (L.) F. Bar", tbl_out = TRUE)
#'
#'   # If you always want tibble output without specifying `tbl_out = TRUE`
#'   # every time, set the option:
#'   options(ts_tbl_out = TRUE)
#'   ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
#'   ts_parse_names("Crepidomanes minutum (Blume) K. Iwats.")
#'
#' }
#'
#' # Using docker
#' if (babelwhale::test_docker_installation()) {
#'
#'   ts_parse_names(
#'     "Foogenus x barspecies var. foosubsp (L.) F. Bar",
#'     docker = TRUE)
#'
#' }
#'
ts_parse_names <- function(
  taxa,
  tbl_out = getOption("ts_tbl_out", default = FALSE),
  quiet = FALSE,
  docker = getOption("ts_docker", default = FALSE)
) {
  # Check input: must be character vector, no NA values, all unique
  assertthat::assert_that(is.character(taxa))
  assertthat::assert_that(
    assertthat::noNA(taxa),
    msg = "Input taxa may not contain NAs"
  )
  assertthat::assert_that(
    all(assertr::is_uniq(taxa)),
    msg = "Input taxa must be unique"
  )
  assertthat::assert_that(assertthat::is.flag(tbl_out))
  assertthat::assert_that(assertthat::is.flag(quiet))
  assertthat::assert_that(assertthat::is.flag(docker))

  # Write out names formatted for parsing with taxon-tools to temp file
  # format:
  # `id_num|taxon_name`
  # for example,
  # `x-234|Foogenus x barspecies var. foosubsp (L.) F. Bar`
  taxa_tbl <- ts_make_name_df(taxa)
  taxa_tbl$record <- paste(taxa_tbl$id, taxa_tbl$name, sep = "|")
  taxa_txt_file <- tempfile(
    pattern = digest::digest(taxa),
    fileext = ".txt"
  )
  # Remove the temp file however the function exits, including on errors
  # raised while running taxon-tools
  on.exit(unlink(taxa_txt_file), add = TRUE)
  writeLines(taxa_tbl$record, taxa_txt_file)

  # Parse names with taxon-tools
  if (isTRUE(docker)) {
    assertthat::assert_that(
      requireNamespace("babelwhale", quietly = TRUE),
      msg = "babelwhale needs to be installed to use docker"
    )
    assertthat::assert_that(
      babelwhale::test_docker_installation(),
      msg = "docker not installed"
    )
    parse_res <- run_auto_mount(
      container_id = "camwebb/taxon-tools:v1.3.0",
      command = "parsenames",
      args = c(file = taxa_txt_file)
    )
  } else {
    assertthat::assert_that(
      ts_tt_installed(),
      msg = "taxon-tools not installed"
    )
    parse_res <- processx::run("parsenames", taxa_txt_file)
  }

  # Read in results of parsing, format as dataframe

  # The output is originally one record per line, with fields separated by
  # '|' (pipe symbol)
  parsed_names <- data.frame(
    record = strsplit(parse_res[["stdout"]], "\n")[[1]]
  )

  # Split these into separate columns
  name_parts <- c(
    "genus_hybrid_sign",
    "genus_name",
    "species_hybrid_sign",
    "specific_epithet",
    "infraspecific_rank",
    "infraspecific_epithet",
    "author"
  )

  parsed_names <- tidyr::separate(
    data = parsed_names,
    col = record,
    into = c("id", name_parts),
    sep = "\\|",
    fill = "right",
    remove = FALSE
  )

  # Fill in NA if that name part is missing
  parsed_names[parsed_names == ""] <- NA

  # Add "fail" column if all name parts are missing (couldn't be parsed
  # properly). Computed for all rows at once; also valid for zero rows.
  parsed_names$fail <- unname(
    rowSums(!is.na(parsed_names[, name_parts, drop = FALSE])) == 0
  )

  # Early exit if everything failed (or nothing was returned at all)
  assertthat::assert_that(
    !all(parsed_names$fail),
    msg = "No names could be successfully parsed"
  )

  # Emit warning for failures
  if (any(parsed_names$fail) && !isTRUE(quiet)) {
    failed_ids <- parsed_names$id[parsed_names$fail]
    failed_names <- paste(
      taxa_tbl$name[taxa_tbl$id %in% failed_ids],
      collapse = ", "
    )
    warning(glue::glue(
      "The following names could not be parsed and are excluded from results: {failed_names}"
    ))
  }

  # Add back in original name
  parsed_names <- dplyr::left_join(
    parsed_names,
    dplyr::select(taxa_tbl, id, name),
    by = "id"
  )

  # Remove failures, drop "fail" column
  parsed_names <- parsed_names[!parsed_names$fail, ]
  parsed_names$fail <- NULL

  # Return parsed names as dataframe or tibble
  results <- parsed_names[, c("name", "id", name_parts)]

  if (isTRUE(tbl_out)) return(tibble::as_tibble(results))

  results
}
187 |
--------------------------------------------------------------------------------
/man/ts_match_names.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/ts_match_names.R
3 | \name{ts_match_names}
4 | \alias{ts_match_names}
5 | \title{Match taxonomic names to a reference}
6 | \usage{
7 | ts_match_names(
8 | query,
9 | reference,
10 | manual_match = NULL,
11 | max_dist = 10,
12 | match_no_auth = FALSE,
13 | match_canon = FALSE,
14 | collapse_infra = FALSE,
15 | collapse_infra_exclude = NULL,
16 | simple = FALSE,
17 | docker = getOption("ts_docker", default = FALSE),
18 | tbl_out = getOption("ts_tbl_out", default = FALSE)
19 | )
20 | }
21 | \arguments{
22 | \item{query}{Character vector or dataframe; taxonomic names to be queried.
23 | If a character vector, missing values not allowed and all values must be
24 | unique.
25 | If a dataframe, should be taxonomic names parsed with
26 | \code{\link{ts_parse_names}()}.}
27 |
28 | \item{reference}{Character vector or dataframe; taxonomic names to use as
29 | reference. If a character vector, missing values not allowed and all values
30 | must be unique. If a dataframe, should be taxonomic names parsed with
31 | \code{\link{ts_parse_names}()}.}
32 |
33 | \item{manual_match}{Optional. Dataframe of manually matched names that will
34 | override any results from \code{taxon-tools}. Must include two columns, \code{query}
35 | and \code{match}.}
36 |
37 | \item{max_dist}{Max Levenshtein distance to allow during fuzzy matching
38 | (total insertions, deletions and substitutions). Default: 10.}
39 |
40 | \item{match_no_auth}{Logical; If no author is given in the query and the name
41 | (without author) occurs only once in the reference, accept the name in the
42 | reference as a match. Default: to not allow such a match (\code{FALSE}).}
43 |
44 | \item{match_canon}{Logical; Allow a "canonical name" match if only the genus,
45 | species epithet, and infraspecific epithet (if present) match exactly.
46 | Default: to not allow such a match (\code{FALSE}).}
47 |
48 | \item{collapse_infra}{Logical; if the specific epithet and infraspecific
49 | epithet are the same, drop the infraspecific rank and epithet from the query.}
50 |
51 | \item{collapse_infra_exclude}{Character vector; taxonomic names to exclude
52 | from collapsing with \code{collapse_infra}. Any names used must match those in
53 | \code{query} exactly, or they won't be excluded.}
54 |
55 | \item{simple}{Logical; return the output in a simplified format with only the
56 | query name, matched reference name, and match type. Default: \code{FALSE}.}
57 |
58 | \item{docker}{Logical; if TRUE, docker will be used to run taxon-tools
59 | (so that taxon-tools need not be installed).}
60 |
61 | \item{tbl_out}{Logical vector of length 1; should a tibble be returned?
62 | If \code{FALSE} (default), output will be a data.frame. This argument can
63 | be controlled via the option \code{ts_tbl_out}; see Examples.}
64 | }
65 | \value{
66 | Dataframe with the following columns (if \code{simple} is \code{FALSE}):
67 | \itemize{
68 | \item query: Query name
69 | \item reference: Matched reference name
70 | \item match_type: Type of match (for a summary of match types, \href{https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes}{see taxon-tools manual})
71 | \item id_query: Unique ID of query
72 | \item id_ref: Unique ID of reference
73 | \item genus_hybrid_sign_query: Genus hybrid sign in query
74 | \item genus_name_query: Genus name of query
75 | \item species_hybrid_sign_query: Species hybrid sign in query
76 | \item specific_epithet_query: Specific epithet of query
77 | \item infraspecific_rank_query: Infraspecific rank of query
78 | \item infraspecific_epithet_query: Infraspecific epithet of query
79 | \item author_query: Taxonomic author of query
80 | \item genus_hybrid_sign_ref: Genus hybrid sign in reference
81 | \item genus_name_ref: Genus name of reference
82 | \item species_hybrid_sign_ref: Species hybrid sign in reference
83 | \item specific_epithet_ref: Specific epithet of reference
84 | \item infraspecific_rank_ref: Infraspecific rank of reference
85 | \item infraspecific_epithet_ref: Infraspecific epithet of reference
86 | \item author_ref: Taxonomic author of reference
87 | }
88 |
89 | If \code{simple} is \code{TRUE}, only return the first three columns above.
90 | }
91 | \description{
92 | Allows for orthographic differences between query and reference by using
93 | fuzzy matching on parsed taxonomic names. Requires
94 | \href{https://github.com/camwebb/taxon-tools}{taxon-tools} or docker to be installed.
95 | }
96 | \details{
97 | \code{taxon-tools} matches names in two steps:
98 | \enumerate{
99 | \item Scientific names are parsed into their component parts (genus, species,
100 | variety, author, etc).
101 | \item Names are fuzzily matched following taxonomic rules using the component
102 | parts.
103 | }
104 |
105 | For more information on rules used for matching, \href{https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes}{see taxon-tools manual}.
106 |
107 | Parsing is fairly fast (much faster than matching) but can take some time if
108 | the number of names is very large. If multiple queries will be made (e.g., to
109 | the same large reference database), it is recommended to first parse the
110 | names using \code{\link{ts_parse_names}()}, and use the results as input to
111 | \code{query} and/or \code{reference}.
112 |
113 | \code{collapse_infra} is useful in situations where the reference database does
114 | not use names that have the same specific epithet and infraspecific epithet.
115 | For example, reference name "Blechnum lunare" and query "Blechnum lunare var.
116 | lunare". In this case, if \code{collapse_infra} is \code{TRUE}, "Blechnum lunare" will
117 | be queried instead of "Blechnum lunare var. lunare". Note that the
118 | \code{match_type} will be "exact" even though the literal query and the matched
119 | name are different (see example below).
120 | }
121 | \examples{
122 | if(ts_tt_installed()) {
123 | ts_match_names(
124 | "Crepidomanes minutus",
125 | c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
126 | simple = TRUE
127 | )
128 |
129 | # If names are too distant, they won't match
130 | ts_match_names(
131 | query = "Crepidblah foo",
132 | reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
133 | simple = TRUE
134 | )
135 |
136 | # But we can force a match manually
137 | ts_match_names(
138 | query = "Crepidblah foo",
139 | reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
140 | manual_match = data.frame(
141 | query = c("Crepidblah foo"),
142 | match = c("Crepidomanes minutum")
143 | ),
144 | simple = TRUE
145 | )
146 |
147 | # If you always want tibble output without specifying `tbl_out = TRUE`
148 | # every time, set the option:
149 | options(ts_tbl_out = TRUE)
150 | ts_match_names(
151 | "Crepidomanes minutus",
152 | c("Crepidomanes minutum", "Hymenophyllum polyanthos")
153 | )
154 |
155 | # Example using collapse_infra argument
156 | ts_match_names(
157 | c("Crepidomanes minutus", "Blechnum lunare var. lunare",
158 | "Blechnum lunare", "Bar foo var. foo", "Bar foo"),
159 | c("Crepidomanes minutum", "Hymenophyllum polyanthos", "Blechnum lunare",
160 | "Bar foo"),
161 | collapse_infra = TRUE,
162 | collapse_infra_exclude = "Bar foo var. foo",
163 | simple = TRUE
164 | )
165 | }
166 |
167 | }
168 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 |
6 |
7 | ```{r, include = FALSE}
8 | knitr::opts_chunk$set(
9 | collapse = TRUE,
10 | comment = "#>",
11 | fig.path = "man/figures/"
12 | )
13 | ```
14 | # taxastand
15 |
16 |
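17 | [![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)
18 | [![DOI](https://zenodo.org/badge/192684959.svg)](https://zenodo.org/badge/latestdoi/192684959)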
19 |
20 |
21 | The goal of `taxastand` is to standardize species names from different sources, a common task in biology.
22 |
23 | Very often different biologists use different synonyms to refer to the same species. If we want to join data from different sources, their taxonomic names must be standardized first. This is what `taxastand` seeks to do in a reproducible and efficient manner.
24 |
25 | ## Important note
26 |
27 | **This package is in early development.** There may be major, breaking changes to functionality in the near future. If you use this package, I highly recommend using a package manager like [renv](https://rstudio.github.io/renv/articles/renv.html) so that later updates won't break your code.
28 |
29 | ## Taxonomic standard
30 |
31 | `taxastand` is based on matching names to a single **taxonomic standard**, that is, a database of accepted names and synonyms. As long as a single taxonomic standard is used, we can confidently resolve names from disparate sources.
32 |
33 | The taxonomic standard must conform to [Darwin Core standards](https://dwc.tdwg.org/). The user must provide this database (as a dataframe). There are many sources of taxonomic data online, including [GBIF](https://www.gbif.org/en/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c), [Catalogue of Life](http://www.catalogueoflife.org/), and [ITIS](https://www.itis.gov/), to name a few. The [taxadb](https://github.com/ropensci/taxadb) package provides convenient functions for downloading various taxonomic databases that use Darwin Core.
34 |
35 | ## Installation
36 |
37 | `taxastand` can be installed from [r-universe](https://joelnitta.r-universe.dev) or [github](https://github.com/joelnitta).
38 |
39 | ``` r
40 | install.packages("taxastand", repos = 'https://joelnitta.r-universe.dev')
41 | ```
42 |
43 | OR
44 |
45 | ``` r
46 | # install.packages("remotes")
47 | remotes::install_github("joelnitta/taxastand")
48 | ```
49 |
50 | ## Dependencies
51 |
52 | `taxastand` depends on [taxon-tools](https://github.com/camwebb/taxon-tools) for taxonomic name matching.
53 |
54 | There are two options for using this dependency.
55 |
56 | - Install [docker](https://www.docker.com/) and set `docker = TRUE` when using `taxastand` functions.
57 |
58 | OR
59 |
60 | - Install the two programs included in [taxon-tools](https://github.com/camwebb/taxon-tools), `parsenames` and `matchnames`.
61 |
62 | ## Similar work
63 |
64 | - [ROpenSci](https://ropensci.org/) has a [task view](https://github.com/ropensci/taxonomy) summarizing many tools available for taxonomy.
65 |
66 | - [taxize](https://github.com/ropensci/taxize) is the "granddaddy" of taxonomy packages in R. It can search around 20 different taxonomic databases for names and retrieve taxonomic information.
67 |
68 | - [TNRS](http://tnrs.iplantcollaborative.org/), the Taxonomic Name Resolution Service, is a web application that resolves taxonomic names of plants according to one of six databases.
69 |
70 | - [taxizedb](https://github.com/ropensci/taxizedb) downloads taxonomic databases and provides tools to interface with them through SQL.
71 |
72 | - [taxadb](https://github.com/ropensci/taxadb) also downloads and searches taxonomic databases. It can interface with them either through SQL or in-memory in R.
73 |
74 | - [taxonstand](https://cran.r-project.org/web/packages/Taxonstand/index.html) has a very similar goal to `taxastand`, but only uses [The Plant List (TPL)](http://www.theplantlist.org)
75 | as its taxonomic standard and does not allow the user to provide their own. Note that TPL is no longer being updated as of 2013.
76 |
77 | ## Motivation
78 |
79 | Although existing web-based solutions for taxonomic name resolution are very useful, they may not be ideal for all situations: the choice of reference database to use for standardization is limited, they may not be able to handle very large queries, and the user has no guarantee that the same input will yield the same output at a later date due to changes in the remote database.
80 |
81 | Furthermore, matching of taxonomic names is not straightforward, since they are complex data structures including multiple components (e.g., genus, specific epithet, basionym author, combination author, etc). [Of the tools mentioned above](#similar-work) only [TNRS](http://tnrs.iplantcollaborative.org/) can fuzzily match taxonomic names based on their parsed components, but it does not allow for use of a local reference database.
82 |
83 | The motivation for `taxastand` is to provide greater flexibility and reproducibility by allowing for complete version control of the code and database used for name resolution, while implementing fuzzy matching of parsed taxonomic names.
84 |
85 | ## Example
86 |
87 | Here is an example of fuzzy matching followed by resolution of synonyms using the dataset included with the package.
88 |
89 | ```{r filmy-example-show, eval = FALSE}
90 | library(taxastand)
91 |
92 | # Load example reference taxonomy in Darwin Core format
93 | data(filmy_taxonomy)
94 |
95 | # Take a look at the columns used by taxastand
96 | head(filmy_taxonomy[c(
97 | "taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")])
98 |
99 | # As a test, resolve a misspelled name
100 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
101 |
102 | # We can now use the `resolved_name` column of this result for downstream
103 | # analyses joining on other datasets that have been resolved to the same
104 | # reference taxonomy.
105 | ```
106 |
107 | ```{r filmy-example-hide, echo = FALSE}
108 | library(taxastand)
109 |
110 | # Load example reference taxonomy in Darwin Core format
111 | data(filmy_taxonomy)
112 |
113 | # Take a look at the columns used by taxastand
114 | head(filmy_taxonomy[c(
115 | "taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")])
116 |
117 | # As a test, resolve a misspelled name
118 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy, docker = TRUE)
119 |
120 | # We can now use the `resolved_name` column of this result for downstream
121 | # analyses joining on other datasets that have been resolved to the same
122 | # reference taxonomy.
123 | ```
124 |
125 | ## Citing this package
126 |
127 | If you use this package, please cite it! Here is an example:
128 |
129 | Nitta, JH (2021) taxastand: Taxonomic name standardization in R. https://doi.org/10.5281/zenodo.5726390
130 |
131 | The example DOI above is for the overall package.
132 |
133 | Here is the latest DOI, which you should use if you are using the latest
134 | version of the package:
135 |
136 | [DOI](https://zenodo.org/badge/latestdoi/192684959)
137 |
138 | You can find DOIs for older versions by viewing the “Releases” menu on
139 | the right.
140 |
141 | You should also cite the software that `taxastand` relies on, `taxon-tools`: https://github.com/camwebb/taxon-tools
142 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # taxastand
5 |
6 |
7 |
8 | [Project Status: WIP](https://www.repostatus.org/#wip)
11 | [DOI](https://zenodo.org/badge/latestdoi/192684959)
12 |
13 |
14 | The goal of `taxastand` is to standardize species names from different
15 | sources, a common task in biology.
16 |
17 | Very often different biologists use different synonyms to refer to the
18 | same species. If we want to join data from different sources, their
19 | taxonomic names must be standardized first. This is what `taxastand`
20 | seeks to do in a reproducible and efficient manner.
21 |
22 | ## Important note
23 |
24 | **This package is in early development.** There may be major, breaking
25 | changes to functionality in the near future. If you use this package, I
26 | highly recommend using a package manager like
27 | [renv](https://rstudio.github.io/renv/articles/renv.html) so that later
28 | updates won’t break your code.
29 |
30 | ## Taxonomic standard
31 |
32 | `taxastand` is based on matching names to a single **taxonomic
33 | standard**, that is, a database of accepted names and synonyms. As long
34 | as a single taxonomic standard is used, we can confidently resolve names
35 | from disparate sources.
36 |
37 | The taxonomic standard must conform to [Darwin Core
38 | standards](https://dwc.tdwg.org/). The user must provide this database
39 | (as a dataframe). There are many sources of taxonomic data online,
40 | including
41 | [GBIF](https://www.gbif.org/en/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c),
42 | [Catalog of Life](http://www.catalogueoflife.org/), and
43 | [ITIS](https://www.itis.gov/) to name a few. The
44 | [taxadb](https://github.com/ropensci/taxadb) package provides convenient
45 | functions for downloading various taxonomic databases that use Darwin
46 | Core.
47 |
48 | ## Installation
49 |
50 | `taxastand` can be installed from
51 | [r-universe](https://joelnitta.r-universe.dev) or
52 | [github](https://github.com/joelnitta).
53 |
54 | ``` r
55 | install.packages("taxastand", repos = 'https://joelnitta.r-universe.dev')
56 | ```
57 |
58 | OR
59 |
60 | ``` r
61 | # install.packages("remotes")
62 | remotes::install_github("joelnitta/taxastand")
63 | ```
64 |
65 | ## Dependencies
66 |
67 | `taxastand` depends on
68 | [taxon-tools](https://github.com/camwebb/taxon-tools) for taxonomic name
69 | matching.
70 |
71 | There are two options for using this dependency.
72 |
73 | - Install [docker](https://www.docker.com/) and set `docker = TRUE` when
74 | using `taxastand` functions.
75 |
76 | OR
77 |
78 | - Install the two programs included in
79 | [taxon-tools](https://github.com/camwebb/taxon-tools), `parsenames`
80 | and `matchnames`.
81 |
82 | ## Similar work
83 |
84 | - [ROpenSci](https://ropensci.org/) has a [task
85 | view](https://github.com/ropensci/taxonomy) summarizing many tools
86 | available for taxonomy.
87 |
88 | - [taxize](https://github.com/ropensci/taxize) is the “granddaddy” of
89 | taxonomy packages in R. It can search around 20 different taxonomic
90 | databases for names and retrieve taxonomic information.
91 |
92 | - [TNRS](http://tnrs.iplantcollaborative.org/), the Taxonomic Name
93 | Resolution Service, is a web application that resolves taxonomic names
94 | of plants according to one of six databases.
95 |
96 | - [taxizedb](https://github.com/ropensci/taxizedb) downloads taxonomic
97 | databases and provides tools to interface with them through SQL.
98 |
99 | - [taxadb](https://github.com/ropensci/taxadb) also downloads and
100 | searches taxonomic databases. It can interface with them either
101 | through SQL or in-memory in R.
102 |
103 | - [taxonstand](https://cran.r-project.org/web/packages/Taxonstand/index.html)
104 | has a very similar goal to `taxastand`, but only uses [The Plant List
105 | (TPL)](http://www.theplantlist.org) as its taxonomic standard and does
106 | not allow the user to provide their own. Note that TPL is no longer
107 | being updated as of 2013.
108 |
109 | ## Motivation
110 |
111 | Although existing web-based solutions for taxonomic name resolution are
112 | very useful, they may not be ideal for all situations: the choice of
113 | reference database to use for standardization is limited, they may not
114 | be able to handle very large queries, and the user has no guarantee that
115 | the same input will yield the same output at a later date due to changes
116 | in the remote database.
117 |
118 | Furthermore, matching of taxonomic names is not straightforward, since
119 | they are complex data structures including multiple components (e.g.,
120 | genus, specific epithet, basionym author, combination author, etc). [Of
121 | the tools mentioned above](#similar-work) only
122 | [TNRS](http://tnrs.iplantcollaborative.org/) can fuzzily match taxonomic
123 | names based on their parsed components, but it does not allow for use of
124 | a local reference database.
125 |
126 | The motivation for `taxastand` is to provide greater flexibility and
127 | reproducibility by allowing for complete version control of the code and
128 | database used for name resolution, while implementing fuzzy matching of
129 | parsed taxonomic names.
130 |
131 | ## Example
132 |
133 | Here is an example of fuzzy matching followed by resolution of synonyms
134 | using the dataset included with the package.
135 |
136 | ``` r
137 | library(taxastand)
138 |
139 | # Load example reference taxonomy in Darwin Core format
140 | data(filmy_taxonomy)
141 |
142 | # Take a look at the columns used by taxastand
143 | head(filmy_taxonomy[c(
144 | "taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")])
145 |
146 | # As a test, resolve a misspelled name
147 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
148 |
149 | # We can now use the `resolved_name` column of this result for downstream
150 | # analyses joining on other datasets that have been resolved to the same
151 | # reference taxonomy.
152 | ```
153 |
154 | #> taxonID acceptedNameUsageID taxonomicStatus
155 | #> 1 54115096 NA accepted name
156 | #> 2 54133783 54115097 synonym
157 | #> 3 54115097 NA accepted name
158 | #> 4 54133784 54115098 synonym
159 | #> 5 54115098 NA accepted name
160 | #> 6 54133785 54115099 synonym
161 | #> scientificName
162 | #> 1 Cephalomanes atrovirens Presl
163 | #> 2 Trichomanes crassum Copel.
164 | #> 3 Cephalomanes crassum (Copel.) M. G. Price
165 | #> 4 Trichomanes densinervium Copel.
166 | #> 5 Cephalomanes densinervium (Copel.) Copel.
167 | #> 6 Trichomanes infundibulare Alderw.
168 | #> query resolved_name
169 | #> 1 Gonocormus minutum Crepidomanes minutum (Bl.) K. Iwats.
170 | #> matched_name resolved_status matched_status match_type
171 | #> 1 Gonocormus minutus (Bl.) Bosch accepted name synonym auto_fuzzy
172 |
173 | ## Citing this package
174 |
175 | If you use this package, please cite it! Here is an example:
176 |
177 | Nitta, JH (2021) taxastand: Taxonomic name standardization in R. https://doi.org/10.5281/zenodo.5726390
178 |
179 | The example DOI above is for the overall package.
180 |
181 | Here is the latest DOI, which you should use if you are using the latest
182 | version of the package:
183 |
184 | [DOI](https://zenodo.org/badge/latestdoi/192684959)
185 |
186 | You can find DOIs for older versions by viewing the “Releases” menu on
187 | the right.
188 |
189 | You should also cite the software that `taxastand` relies on,
190 | `taxon-tools`: <https://github.com/camwebb/taxon-tools>
191 |
--------------------------------------------------------------------------------
/R/ts_resolve_names.R:
--------------------------------------------------------------------------------
1 | #' Resolve synonyms in taxonomic names
2 | #'
3 | #' After matching taxonomic names to a reference, some may match synonyms. This
4 | #' function resolves synonyms to their accepted names.
5 | #'
6 | #' `query` can take as input either a character vector of taxonomic names, or
7 | #' the output of \code{\link{ts_match_names}()}. If the former, it will run
8 | #' \code{\link{ts_match_names}()} to match the query to `ref_taxonomy`, then
9 | #' resolve synonyms. If the latter, the scientific names in `ref_taxonomy`
10 | #' should be the same used as reference with \code{\link{ts_match_names}()}
11 | #' (this is not checked).
12 | #'
13 | #' `ref_taxonomy` must be taxonomic data adhering to the [Darwin Core standard](https://dwc.tdwg.org/terms/#taxon).
14 | #' Darwin Core includes many terms, but only four (`taxonID`,
15 | #' `acceptedNameUsageID`, `taxonomicStatus`, and `scientificName`) are required
16 | #' for this function.
17 | #'
18 | #' @param query Character vector or dataframe; taxonomic names to be resolved.
19 | #' If a character vector, missing values not allowed and all values must be
20 | #' unique. If a dataframe, should be taxonomic names matched with
21 | #' \code{\link{ts_match_names}()}.
22 | #' @param ref_taxonomy Dataframe; reference taxonomic data adhering to the
23 | #' [Darwin Core standard](https://dwc.tdwg.org/terms/#taxon) with the
24 | #' following columns:
25 | #' - `taxonID`: [Unique identifier for each taxon](https://dwc.tdwg.org/terms/#dwc:taxonID).
26 | #' - `acceptedNameUsageID`: If the taxon is a synonym, the [unique identifier for the accepted name](https://dwc.tdwg.org/terms/#dwc:acceptedNameUsageID)
27 | #' - `taxonomicStatus`: [The status of the use of the `scientificName` as a label for the taxon](https://dwc.tdwg.org/terms/#dwc:taxonomicStatus).
28 | #' - `scientificName`: [The full scientific name of the taxon](https://dwc.tdwg.org/terms/#dwc:scientificName),
29 | #' with authorship and date information if known.
30 | #' @param max_dist Max Levenshtein distance to allow during fuzzy matching
31 | #' (total insertions, deletions and substitutions). Default: 10.
32 | #' @param match_no_auth Logical; If no author is given in the query and the name
33 | #' (without author) occurs only once in the reference, accept the name in the
34 | #' reference as a match. Default: to not allow such a match (`FALSE`).
35 | #' @param match_canon Logical; Allow a "canonical name" match if only the genus,
36 | #' species epithet, and infraspecific epithet (if present) match exactly.
37 | #' Default: to not allow such a match (`FALSE`).
38 | #' @param collapse_infra Logical; if the specific epithet and infraspecific
39 | #' epithet are the same, drop the infraspecific rank and epithet from the query.
40 | #' For more information, see \code{\link{ts_match_names}()}.
41 | #' @param collapse_infra_exclude Character vector; taxonomic names to exclude
42 | #' collapsing with `collapse_infra`. Any names used must match those in `query`
43 | #' exactly, or they won't be excluded.
44 | #' @param docker Logical; if TRUE, docker will be used to run taxon-tools
45 | #' (so that taxon-tools need not be installed).
46 | #' @param tbl_out Logical vector of length 1; should a tibble be returned?
47 | #' If `FALSE` (default), output will be a data.frame. This argument can
48 | #' be controlled via the option `ts_tbl_out`; see Examples.
49 | #'
50 | #' @return Dataframe; results of resolving synonyms in matched taxonomic names.
51 | #' Includes the following columns:
52 | #' - `query`: Query name
53 | #' - `resolved_name`: Accepted name after resolving synonyms
54 | #' - `matched_name`: Name matched to query
55 | #' - `resolved_status`: Taxonomic status of the resolved name (same as `taxonomicStatus` in `ref_taxonomy`)
56 | #' - `matched_status`: Taxonomic status of the matched name (same as `taxonomicStatus` in `ref_taxonomy`)
57 | #' - `match_type`: Type of match (for a summary of match types, [see taxon-tools manual](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes))
58 | #'
59 | #' Names that could not be matched or resolve to multiple, different synonyms
60 | #' have `NA` for `resolved_name`.
61 | #'
62 | #' @autoglobal
63 | #' @export
64 | #' @examples
65 | #' if (ts_tt_installed()) {
66 | #' # Load reference taxonomy in Darwin Core format
67 | #' data(filmy_taxonomy)
68 | #'
69 | #' ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
70 | #' # If you always want tibble output without specifying `tbl_out = TRUE`
71 | #' # every time, set the option:
72 | #' options(ts_tbl_out = TRUE)
73 | #' ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
74 | #' }
75 | #'
76 | ts_resolve_names <- function(
77 |   query,
78 |   ref_taxonomy,
79 |   max_dist = 10,
80 |   match_no_auth = FALSE,
81 |   match_canon = FALSE,
82 |   collapse_infra = FALSE,
83 |   collapse_infra_exclude = NULL,
84 |   docker = getOption("ts_docker", default = FALSE),
85 |   tbl_out = getOption("ts_tbl_out", default = FALSE)
86 | ) {
87 |   # Check input. `||` is used because both class tests are scalar.
88 |   assertthat::assert_that(
89 |     is.character(query) || inherits(query, "data.frame"),
90 |     msg = "query must be of class 'data.frame' or a character vector"
91 |   )
92 |   assertthat::assert_that(
93 |     inherits(ref_taxonomy, "data.frame"),
94 |     msg = "ref_taxonomy must be of class 'data.frame'"
95 |   )
96 |   assertthat::assert_that(assertthat::is.flag(tbl_out))
97 |   assertthat::assert_that(assertthat::is.flag(docker))
98 |   if (!is.null(collapse_infra_exclude)) {
99 |     assertthat::assert_that(is.character(collapse_infra_exclude))
100 |   }
101 |   # Fail early with a clear message if a required Darwin Core column is
102 |   # missing, rather than with an opaque error from the joins below
103 |   required_cols <- c(
104 |     "taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName"
105 |   )
106 |   missing_cols <- setdiff(required_cols, colnames(ref_taxonomy))
107 |   assertthat::assert_that(
108 |     length(missing_cols) == 0,
109 |     msg = paste0(
110 |       "ref_taxonomy is missing required column(s): ",
111 |       paste(missing_cols, collapse = ", ")
112 |     )
113 |   )
114 |
115 |   # If needed, match names first
116 |   if (is.character(query)) {
117 |     match_results <- ts_match_names(
118 |       query = query,
119 |       reference = unique(ref_taxonomy[["scientificName"]]),
120 |       max_dist = max_dist,
121 |       match_no_auth = match_no_auth,
122 |       match_canon = match_canon,
123 |       collapse_infra = collapse_infra,
124 |       collapse_infra_exclude = collapse_infra_exclude,
125 |       simple = TRUE,
126 |       docker = docker
127 |     )
128 |   } else {
129 |     # The input check above guarantees that query is a dataframe here; it is
130 |     # used as-is (assumed to be output of ts_match_names(), not checked)
131 |     match_results <- query
132 |   }
133 |
134 |   # Classify results of matching, keep only the columns needed below, and
135 |   # attach the reference taxonomy row(s) of each matched name
136 |   match_results_classified_with_taxonomy <-
137 |     match_results %>%
138 |     ts_classify_result() %>%
139 |     dplyr::select(query, reference, match_type, result_type) %>%
140 |     dplyr::left_join(ref_taxonomy, by = c(reference = "scientificName"))
141 |
142 |   # Separate out single matches to an accepted name (success type 1)
143 |   accepted_single_match <-
144 |     match_results_classified_with_taxonomy %>%
145 |     # Consider a name accepted if acceptedNameUsageID is missing, empty, or
146 |     # the same as taxonID
147 |     dplyr::filter(
148 |       (is.na(acceptedNameUsageID) |
149 |         acceptedNameUsageID == "" |
150 |         taxonID == acceptedNameUsageID) &
151 |         result_type == "single_match"
152 |     ) %>%
153 |     # The matched name is itself the resolved name
154 |     dplyr::select(
155 |       query,
156 |       resolved_name = reference,
157 |       matched_name = reference,
158 |       resolved_status = taxonomicStatus,
159 |       matched_status = taxonomicStatus,
160 |       match_type
161 |     )
162 |
163 |   # Separate out matches to a single synonym (success type 2)
164 |   accepted_single_synonyms <-
165 |     match_results_classified_with_taxonomy %>%
166 |     # Consider synonym anything with acceptedNameUsageID not matching taxonID
167 |     dplyr::filter(!is.na(acceptedNameUsageID)) %>%
168 |     dplyr::filter(acceptedNameUsageID != "") %>%
169 |     dplyr::filter(acceptedNameUsageID != taxonID) %>%
170 |     # Join resolved names via synonym
171 |     dplyr::left_join(
172 |       dplyr::select(
173 |         ref_taxonomy,
174 |         taxonID,
175 |         resolved_name = scientificName,
176 |         resolved_status = taxonomicStatus
177 |       ),
178 |       by = c(acceptedNameUsageID = "taxonID")
179 |     ) %>%
180 |     dplyr::select(
181 |       query,
182 |       resolved_name,
183 |       matched_name = reference,
184 |       resolved_status,
185 |       matched_status = taxonomicStatus,
186 |       match_type
187 |     ) %>%
188 |     dplyr::group_by(query) %>%
189 |     # Add count of number of resolved, accepted names per query
190 |     dplyr::mutate(n = dplyr::n_distinct(resolved_name)) %>%
191 |     dplyr::ungroup() %>%
192 |     # Only keep those that resolve to the same name
193 |     dplyr::filter(n == 1) %>%
194 |     dplyr::select(-n)
195 |
196 |   # Combine name resolution successes
197 |   success <- dplyr::bind_rows(accepted_single_match, accepted_single_synonyms)
198 |
199 |   # Anything else is a failure; these rows carry no resolved_name or
200 |   # resolved_status, so bind_rows() below fills those columns with NA
201 |   failure <-
202 |     match_results_classified_with_taxonomy %>%
203 |     dplyr::select(
204 |       query,
205 |       match_type,
206 |       matched_status = taxonomicStatus,
207 |       matched_name = reference
208 |     ) %>%
209 |     dplyr::anti_join(success, by = "query")
210 |
211 |   # Combine into final results, verifying that the set of queries in the
212 |   # output is the same as in the matching results
213 |   results <- dplyr::bind_rows(success, failure) %>%
214 |     assertr::verify(all(query %in% match_results$query)) %>%
215 |     assertr::verify(all(match_results$query %in% query)) %>%
216 |     dplyr::select(
217 |       query,
218 |       resolved_name,
219 |       matched_name,
220 |       resolved_status,
221 |       matched_status,
222 |       match_type
223 |     )
224 |
225 |   # Return as tibble or dataframe
226 |   if (isTRUE(tbl_out)) return(tibble::as_tibble(results))
227 |
228 |   results
229 | }
230 |
--------------------------------------------------------------------------------
/vignettes/basics.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "The basics"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{The basics}
6 | %\VignetteEncoding{UTF-8}
7 | %\VignetteEngine{knitr::rmarkdown}
8 | editor_options:
9 | chunk_output_type: console
10 | ---
11 |
12 | ```{r, include = FALSE}
13 | knitr::opts_chunk$set(
14 | collapse = TRUE,
15 | comment = "#>"
16 | )
17 | ```
18 |
19 | This vignette explains the three basic steps of the taxonomic name resolution workflow, which consist of:
20 |
21 | 1. Name parsing
22 | 2. Name matching
23 | 3. Name resolution
24 |
25 | ## Setup
26 |
27 | We'll start by loading `taxastand`. For more information on installing `taxastand`, see [here](https://joelnitta.github.io/taxastand/index.html#installation).
28 |
29 | ```{r setup}
30 | library(taxastand)
31 | ```
32 |
33 | ## Name parsing
34 |
35 | In R, scientific names are often just stored as character vectors (strings). For example,
36 |
37 | ```{r example-name}
38 | example_name <- "Crepidomanes minutum (Bl.) K. Iwats."
39 | ```
40 |
41 | However, such a name actually consists of several distinct parts:
42 |
43 | ```
44 | "Crepidomanes minutum (Bl.) K. Iwats."
45 | ------------- ------- ---------------
46 | | | |
47 | genus specific author
48 | epithet
49 | ```
50 |
51 | Furthermore, in the case of this name, it was originally named by Blume (`(Bl.)`), then transferred to a different genus by Iwatsuki (`K. Iwats.`).
52 |
53 | When working with taxonomic names, it can be useful to **parse** the name into its component parts. That is what `ts_parse_names()` does. It takes a character vector as input and returns a dataframe:
54 |
55 | ```{r parse-example}
56 | ts_parse_names(example_name)
57 | ```
58 |
59 | The first column, `name`, is the original input name. `id` is a unique identifier attached to the name. The rest of the columns are [the parsed components of the name](https://joelnitta.github.io/taxastand/reference/ts_parse_names.html#value).
60 |
61 | Note that the [name parsing algorithm](https://github.com/camwebb/taxon-tools#parsenames) used by `taxastand` is case-sensitive! It assumes that the [standard capitalization of scientific names](https://en.wikipedia.org/wiki/Binomial_nomenclature#Writing_binomial_names) is being used: genus is capitalized, specific epithet is lower case, author is capitalized as a proper noun, etc. **Name parsing probably won't work without this type of capitalization.**
62 |
63 | Now that we've parsed a name, in the next section we will see why this is useful for matching names to each other.
64 |
65 | ## Name matching
66 |
67 | One reason that name parsing is important is because some scientific names may differ only in certain components.
68 |
69 | For example, the species [*Hymenophyllum pectinatum*](https://www.tropicos.org/name/Search?name=Hymenophyllum%20pectinatum) actually corresponds to two different scientific names with different authors, *Hymenophyllum pectinatum* Nees & Blume and *Hymenophyllum pectinatum* Cav.
70 |
71 | We can see this by querying the name:
72 |
73 | ```{r match-example-1}
74 | ts_match_names(
75 | "Hymenophyllum pectinatum",
76 | c("Hymenophyllum pectinatum Nees & Blume",
77 | "Hymenophyllum pectinatum Cav."),
78 | simple = TRUE)
79 | ```
80 |
81 | `ts_match_names()` matches both scientific names[^1], because the algorithm can't distinguish between them without additional information. So **it is almost always better to include the taxonomic author in the query**, to distinguish between such cases.
82 |
83 | [^1]: Note that `ts_match_names()` did the name parsing by calling `ts_parse_names()` for us internally. This is usually fine, but it can also take parsed names (dataframes) produced by `ts_parse_names()` as input to either `query` or `reference`.
84 |
85 | However, there can be quite a bit of variation in how authors are recorded. Sometimes names are abbreviated to different lengths, or the basionym author (an author name in parentheses) might get left out by accident, etc. The algorithm used by `taxastand` can account for this (to a point). Here is an example where the query lacks a basionym author:
86 |
87 | ```{r match-example-2}
88 | ts_match_names(
89 | "Hymenophyllum taiwanense C. V. Morton",
90 | c("Hymenophyllum taiwanense (Tagawa) C. V. Morton",
91 | "Hymenophyllum taiwanense De Vol"),
92 | simple = TRUE)
93 | ```
94 |
95 | The name matching algorithm was able to narrow the match down to `Hymenophyllum taiwanense (Tagawa) C. V. Morton` even though the query lacked `(Tagawa)`. Furthermore, the `match_type` tells us how the matching was done: `auto_basio-` means an automatic match based on excluding the basionym author from the reference. **It is recommended to always check any results that weren't identical** (`exact`) to verify that the matching algorithm worked correctly, especially for fuzzy matches (`auto_fuzzy`).
96 |
97 | Here is a summary of the values taken by `match_type` from [`taxon-tools`](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes):
98 |
99 | - `exact`: Exact match to all parts of the name (genus hybrid marker, genus name, species hybrid marker, species epithet, infraspecific rank signifier, infraspecific rank, author string).
100 | - `auto_punct`: Exact match to all parts of the name after removing mis-matching spaces, periods, non-ASCII author name characters, etc.
101 | - `auto_noauth` (only applies if `match_no_auth` is `TRUE`): Match between a query lacking an author and a reference name lacking an author that occurs only once in the reference.
102 | - `auto_basio-`: Match after excluding the basionym author from the reference. For example, `Cardaminopsis umbrosa Czerep.` vs. `Cardaminopsis umbrosa (Turcz.) Czerep.`; the basionym author is `(Turcz.)`.
103 | - `auto_basio+`: Match after excluding the basionym author from the query.
104 | - `auto_in-`: Match after excluding all *in* elements from reference. An *in* element refers to phrases such as `Tagawa in Morton`. The version excluding *in* elements is `Tagawa`.
105 | - `auto_in+`: Match after excluding all *in* elements from query.
106 | - `auto_ex-`: Match after excluding all *in* and *ex* elements from reference. An *ex* element refers to phrases such as `Rändel ex D.F.Murray`. The version excluding *ex* elements is `Rändel`.
107 | - `auto_ex+`: Match after excluding all *in* and *ex* elements from query.
108 | - `auto_basexin`: Match after excluding all basionym authors and all *in* and *ex* elements from query and reference.
109 | - `auto_irank`: Match where all elements agree except for infraspecific rank.
110 | - `auto_fuzzy`: Fuzzy match; match between scientific names allowed up to threshold given by `max_dist`, the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) including total insertions, deletions and substitutions.
111 | - `cfonly`: Match by "canonical form", i.e., genus plus specific epithet plus infraspecific epithet (if present), not including the infraspecific specifier ("subsp.", etc.).
112 | - `no_match`: No match detected.
113 |
114 | The matching algorithm will prefer match codes higher in the list; so if a name could be matched both by `auto_punct` and `auto_fuzzy`, it will be matched based on `auto_punct`[^2].
115 |
116 | [^2]: The algorithm used by `taxastand` is optimized for plants, algae, and fungi, which vary in their [taxonomic rules](https://www.iapt-taxon.org/nomen/main.php) somewhat from animals. For example, plants include basionym authors in parentheses followed by the combination author, and typically don't include the year, whereas animals normally include the year and may not provide the combination author.
117 |
118 | ## Name resolution
119 |
120 | Name resolution refers to the process of mapping a query name to its standard version. This could just be accounting for orthographic variations, or it could involve resolving synonyms: different names that actually refer to the same species.
121 |
122 | In order to conduct name resolution, we require a **taxonomic standard** in the form of a dataframe. `taxastand` requires that the taxonomic standard conform to [Darwin Core standards](https://dwc.tdwg.org/). There are many sources of taxonomic data online, including [GBIF](https://www.gbif.org/en/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c), [Catalog of Life](http://www.catalogueoflife.org/), and [ITIS](https://www.itis.gov/) among others.
123 |
124 | `taxastand` comes supplied with an example taxonomic standard for filmy ferns (family Hymenophyllaceae):
125 |
126 | ```{r name-res-example-1}
127 | # Load example reference taxonomy in Darwin Core format
128 | data(filmy_taxonomy)
129 |
130 | # Take a look at the columns used by taxastand
131 | head(filmy_taxonomy[c("taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")])
132 | ```
133 |
134 | Here, `taxonID` is a unique identifier for each taxonomic name. `acceptedNameUsageID` only applies in the case of synonyms: it tells us the `taxonID` of the accepted name corresponding to that synonym. `taxonomicStatus` describes the status of the name, typically either as an accepted name, synonym, or something else ("dubious", etc.). Finally, the `scientificName` is the full scientific name, preferably with the author.
135 |
136 | In its simplest usage, `ts_resolve_names()` takes a character vector as `query` and provides the resolved name according to the taxonomic standard (`ref_taxonomy`):
137 |
138 | ```{r name-res-example-2}
139 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
140 | ```
141 |
142 | In this case, the query, `Gonocormus minutum` was a misspelled name that is actually a synonym for *Crepidomanes minutum* (Bl.) K. Iwats. Under the hood, `ts_resolve_names()` is calling both `ts_parse_names()` and `ts_match_names()` to do parsing and matching steps before name resolution[^3].
143 |
144 | [^3]: You can also pass the output of `ts_match_names()` to the `query` argument of `ts_resolve_names()` if you want to inspect the matching results first.
145 |
146 | However, when used this way, `ts_resolve_names()` may not be able to provide a resolved name if the input is not matched unambiguously:
147 |
148 | ```{r name-res-example-3}
149 | t_bifid_res <- ts_resolve_names("Trichomanes bifidum", filmy_taxonomy)
150 | head(t_bifid_res)
151 | dim(t_bifid_res)
152 | ```
153 |
154 | In this case, name resolution using the default settings produced `r nrow(t_bifid_res)` possible answers! That is obviously far too many. Let's try to adjust the arguments and see if we can reduce the output:
155 |
156 | ```{r name-res-example-4}
157 | ts_resolve_names(
158 | "Trichomanes bifidum", filmy_taxonomy,
159 | match_no_auth = TRUE, match_canon = TRUE, max_dist = 5)
160 | ```
161 |
162 | By allowing matches without the author name (we probably should have done that anyway, since the query lacked an author) and lowering the fuzzy match threshold, we are able to greatly reduce the number of possible resolved names.
163 |
164 | Name resolution workflows typically involve tweaking these arguments to resolve a maximum number of names automatically, followed by some amount of manual edits to the remaining resolved names.
165 |
166 | A benefit of `taxastand` is that, if during the name resolution workflow we discover mistakes in the reference database, the reference database can be edited so that the query names resolve correctly (this is not possible with packages that rely on querying a remote taxonomic database that can't be modified by the user).
167 |
168 | ## Conclusion
169 |
170 | This vignette illustrated the typical steps involved in name resolution with `taxastand` on some trivial examples. In another vignette, I will provide a more realistic example with a larger dataset.
171 |
--------------------------------------------------------------------------------
/R/ts_match_names.R:
--------------------------------------------------------------------------------
1 | #' Match taxonomic names to a reference
2 | #'
3 | #' Allows for orthographic differences between query and reference by using
4 | #' fuzzy matching on parsed taxonomic names. Requires
5 | #' [taxon-tools](https://github.com/camwebb/taxon-tools) to be installed.
6 | #'
7 | #' `taxon-tools` matches names in two steps:
8 | #' 1. Scientific names are parsed into their component parts (genus, species,
9 | #' variety, author, etc).
10 | #' 2. Names are fuzzily matched following taxonomic rules using the component
11 | #' parts.
12 | #'
13 | #' For more information on rules used for matching, [see taxon-tools manual](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes).
14 | #'
15 | #' Parsing is fairly fast (much faster than matching) but can take some time if
16 | #' the number of names is very large. If multiple queries will be made (e.g., to
17 | #' the same large reference database), it is recommended to first parse the
18 | #' names using \code{\link{ts_parse_names}()}, and use the results as input to
19 | #' `query` and/or `reference`.
20 | #'
21 | #' `collapse_infra` is useful in situations where the reference database does
22 | #' not use names that have the same specific epithet and infraspecific epithet.
23 | #' For example, reference name "Blechnum lunare" and query "Blechnum lunare var.
24 | #' lunare". In this case, if `collapse_infra` is `TRUE`, "Blechnum lunare" will
25 | #' be queried instead of "Blechnum lunare var. lunare". Note that the
26 | #' `match_type` will be "exact" even though the literal query and the matched
27 | #' name are different (see example below).
28 | #'
29 | #' @param query Character vector or dataframe; taxonomic names to be queried.
30 | #' If a character vector, missing values not allowed and all values must be
31 | #' unique.
32 | #' If a dataframe, should be taxonomic names parsed with
33 | #' \code{\link{ts_parse_names}()}.
34 | #' @param reference Character vector or dataframe; taxonomic names to use as
35 | #' reference. If a character vector, missing values not allowed and all values
36 | #' must be unique. If a dataframe, should be taxonomic names parsed with
37 | #' \code{\link{ts_parse_names}()}.
38 | #' @param manual_match Optional. Dataframe of manually matched names that will
39 | #' override any results from `taxon-tools`. Must include columns, `query`
40 | #' and `match`. Can only be used if `query` is a character vector.
41 | #' @param max_dist Max Levenshtein distance to allow during fuzzy matching
42 | #' (total insertions, deletions and substitutions). Default: 10.
43 | #' @param match_no_auth Logical; If no author is given in the query and the name
44 | #' (without author) occurs only once in the reference, accept the name in the
45 | #' reference as a match. Default: to not allow such a match (`FALSE`).
46 | #' @param match_canon Logical; Allow a "canonical name" match if only the genus,
47 | #' species epithet, and infraspecific epithet (if present) match exactly.
48 | #' Default: to not allow such a match (`FALSE`).
49 | #' @param collapse_infra Logical; if the specific epithet and infraspecific
50 | #' epithet are the same, drop the infraspecific rank and epithet from the query.
51 | #' @param collapse_infra_exclude Character vector; taxonomic names to exclude
52 | #' from collapsing with `collapse_infra`. Any names used must match those in
53 | #' `query` exactly, or they won't be excluded.
54 | #' @param simple Logical; return the output in a simplified format with only the
55 | #' query name, matched reference name, and match type. Default: `FALSE`.
56 | #' @param docker Logical; if TRUE, docker will be used to run taxon-tools
57 | #' (so that taxon-tools need not be installed).
58 | #' @param tbl_out Logical vector of length 1; should a tibble be returned?
59 | #' If `FALSE` (default), output will be a data.frame. This argument can
60 | #' be controlled via the option `ts_tbl_out`; see Examples.
61 | #'
62 | #' @return Dataframe with the following columns (if `simple` is `FALSE`):
63 | #' - query: Query name
64 | #' - reference: Matched reference name
65 | #' - match_type: Type of match (for a summary of match types, [see taxon-tools manual](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes))
66 | #' - id_query: Unique ID of query
67 | #' - id_ref: Unique ID of reference
68 | #' - genus_hybrid_sign_query: Genus hybrid sign in query
69 | #' - genus_name_query: Genus name of query
70 | #' - species_hybrid_sign_query: Species hybrid sign in query
71 | #' - specific_epithet_query: Specific epithet of query
72 | #' - infraspecific_rank_query: Infraspecific rank of query
73 | #' - infraspecific_epithet_query: Infraspecific epithet of query
74 | #' - author_query: Taxonomic author of query
75 | #' - genus_hybrid_sign_ref: Genus hybrid sign in reference
76 | #' - genus_name_ref: Genus name of reference
77 | #' - species_hybrid_sign_ref: Species hybrid sign in reference
78 | #' - specific_epithet_ref: Specific epithet of reference
79 | #' - infraspecific_rank_ref: Infraspecific rank of reference
80 | #' - infraspecific_epithet_ref: Infraspecific epithet of reference
81 | #' - author_ref: Taxonomic author of reference
82 | #'
83 | #' If `simple` is `TRUE`, only return the first three columns above.
84 | #'
85 | #' @autoglobal
86 | #' @export
87 | #' @examples
88 | #' if(ts_tt_installed()) {
89 | #' ts_match_names(
90 | #' "Crepidomanes minutus",
91 | #' c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
92 | #' simple = TRUE
93 | #' )
94 | #'
95 | #' # If names are too distant, they won't match
96 | #' ts_match_names(
97 | #' query = "Crepidblah foo",
98 | #' reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
99 | #' simple = TRUE
100 | #' )
101 | #'
102 | #' # But we can force a match manually
103 | #' ts_match_names(
104 | #' query = "Crepidblah foo",
105 | #' reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
106 | #' manual_match = data.frame(
107 | #' query = c("Crepidblah foo"),
108 | #' match = c("Crepidomanes minutum")
109 | #' ),
110 | #' simple = TRUE
111 | #' )
112 | #'
113 | #' # If you always want tibble output without specifying `tbl_out = TRUE`
114 | #' # every time, set the option:
115 | #' options(ts_tbl_out = TRUE)
116 | #' ts_match_names(
117 | #' "Crepidomanes minutus",
118 | #' c("Crepidomanes minutum", "Hymenophyllum polyanthos")
119 | #' )
120 | #'
121 | #' # Example using collapse_infra argument
122 | #' ts_match_names(
123 | #' c("Crepidomanes minutus", "Blechnum lunare var. lunare",
124 | #' "Blechnum lunare", "Bar foo var. foo", "Bar foo"),
125 | #' c("Crepidomanes minutum", "Hymenophyllum polyanthos", "Blechnum lunare",
126 | #' "Bar foo"),
127 | #' collapse_infra = TRUE,
128 | #' collapse_infra_exclude = "Bar foo var. foo",
129 | #' simple = TRUE
130 | #' )
131 | #' }
132 | #'
ts_match_names <- function(
  query,
  reference,
  manual_match = NULL,
  max_dist = 10,
  match_no_auth = FALSE,
  match_canon = FALSE,
  collapse_infra = FALSE,
  collapse_infra_exclude = NULL,
  simple = FALSE,
  docker = getOption("ts_docker", default = FALSE),
  tbl_out = getOption("ts_tbl_out", default = FALSE)
) {
  # Check input ----
  assertthat::assert_that(
    is.character(query) || inherits(query, "data.frame"),
    msg = "query must be of class 'data.frame' or a character vector"
  )
  assertthat::assert_that(
    is.character(reference) || inherits(reference, "data.frame"),
    msg = "reference must be of class 'data.frame' or a character vector"
  )
  assertthat::assert_that(assertthat::is.number(max_dist))
  # These three are used in scalar `if` conditions below, so they must be
  # single, non-missing logicals
  assertthat::assert_that(assertthat::is.flag(match_no_auth))
  assertthat::assert_that(assertthat::noNA(match_no_auth))
  assertthat::assert_that(assertthat::is.flag(match_canon))
  assertthat::assert_that(assertthat::noNA(match_canon))
  assertthat::assert_that(assertthat::is.flag(simple))
  assertthat::assert_that(assertthat::noNA(simple))
  assertthat::assert_that(assertthat::is.flag(tbl_out))
  assertthat::assert_that(assertthat::is.flag(collapse_infra))
  if (!is.null(collapse_infra_exclude)) {
    assertthat::assert_that(is.character(collapse_infra_exclude))
  }
  assertthat::assert_that(assertthat::is.flag(docker))
  if (!is.null(manual_match)) {
    assertthat::assert_that(
      isTRUE(inherits(manual_match, "data.frame")),
      msg = "manual_match must be of class 'data.frame'"
    )
    assertthat::assert_that(
      isTRUE(
        all(c("query", "match") %in% colnames(manual_match))
      ),
      msg = "manual_match must have `query` and `match` columns"
    )
    assertthat::assert_that(
      is.character(manual_match$query)
    )
    assertthat::assert_that(
      is.character(manual_match$match)
    )
    assertthat::assert_that(
      assertthat::noNA(manual_match$query)
    )
    # Both columns must be free of missing values (previously `query` was
    # checked twice and `match` was never checked)
    assertthat::assert_that(
      assertthat::noNA(manual_match$match)
    )
    assertthat::assert_that(
      isTRUE(!any(duplicated(manual_match$query))),
      msg = "All values of manual_match$query must be unique"
    )
    assertthat::assert_that(
      is.character(query),
      msg = "manual_match can only be used if query is a character vector"
    )
  }

  # Helper: build one string per row of a dataframe of parsed names by joining
  # the parsed name components. `paste()` (not `paste0()`) is required so that
  # `sep` is honoured and components are delimited; without a delimiter,
  # different names could produce the same string.
  parsed_namestring <- function(df) {
    paste(
      df$genus_hybrid_sign,
      df$genus_name,
      df$species_hybrid_sign,
      df$specific_epithet,
      df$infraspecific_rank,
      df$infraspecific_epithet,
      df$author,
      sep = "_"
    )
  }

  # Parse or load query names ----
  if (is.character(query)) {
    # Optional: for manual matches, use matched name instead of query
    # to generate exact match
    if (!is.null(manual_match)) {
      manual_replacement_df <-
        data.frame(
          query_original = query
        ) |>
        dplyr::left_join(
          dplyr::select(
            manual_match,
            query_original = query,
            query_new = match
          ),
          by = "query_original",
          relationship = "one-to-one"
        ) |>
        dplyr::mutate(
          query_new = dplyr::coalesce(query_new, query_original)
        )
      # Several original queries may map onto the same name, so de-duplicate
      query <- unique(manual_replacement_df$query_new)
    }
    # Parse the names (adds 'name' column)
    query_parsed_df <- ts_parse_names(query, docker = docker)
  } else {
    # Or, names are already parsed
    query_parsed_df <- query
  }

  # Optionally collapse infraspecific name ----
  if (isTRUE(collapse_infra)) {
    # Save a copy of original unmodified parsed query
    query_parsed_df_original <- query_parsed_df
    # Identify rows where infraspecific_epithet is the same as
    # specific_epithet (`%in% TRUE` turns NA comparisons into FALSE),
    # skipping any names the user asked to exclude
    same_epithet <-
      (query_parsed_df$specific_epithet ==
        query_parsed_df$infraspecific_epithet) %in% TRUE
    is_excluded <- query_parsed_df$name %in% collapse_infra_exclude
    collapse_row <- same_epithet & !is_excluded
    assertthat::assert_that(!anyNA(collapse_row))
    # For those rows, delete infraspecific_epithet and infraspecific_rank
    query_parsed_df$infraspecific_epithet[collapse_row] <- NA
    query_parsed_df$infraspecific_rank[collapse_row] <- NA
    # Collapsing may create duplicated names. Record, for every original id,
    # the id of the first row with the same parsed name (`id_query`), then
    # keep only that first row for matching.
    namestring <- parsed_namestring(query_parsed_df)
    first_row <- match(namestring, namestring)
    id_map <- data.frame(
      id_query = query_parsed_df$id[first_row],
      id = query_parsed_df$id
    )
    query_parsed_df <- query_parsed_df[!duplicated(namestring), ]
  }

  # Write out parsed query names to temporary file ----
  query_parsed_txt <- tempfile(
    pattern = digest::digest(query),
    fileext = ".txt"
  )
  # Remove the temporary file when the function exits (normally or on error)
  on.exit(unlink(query_parsed_txt), add = TRUE)
  if (fs::file_exists(query_parsed_txt)) fs::file_delete(query_parsed_txt)
  ts_write_names(query_parsed_df, query_parsed_txt)

  # Parse or load reference names ----
  if (is.character(reference)) {
    # Parse the names (adds 'name' column)
    ref_parsed_df <- ts_parse_names(reference, docker = docker)
  } else {
    # Or, names are already parsed
    ref_parsed_df <- reference
  }

  # Check that manually matched ref names are in data
  if (!is.null(manual_match)) {
    assertthat::assert_that(
      isTRUE(all(manual_match$match %in% ref_parsed_df$name)),
      msg = "One or more manually matched reference names not in reference data"
    )
  }

  # Write out parsed reference names to temporary file ----
  ref_parsed_txt <- tempfile(
    pattern = digest::digest(reference),
    fileext = ".txt"
  )
  on.exit(unlink(ref_parsed_txt), add = TRUE)
  if (fs::file_exists(ref_parsed_txt)) fs::file_delete(ref_parsed_txt)
  ts_write_names(ref_parsed_df, ref_parsed_txt)

  # Format optional command-line flags (NULL is dropped by `c()`)
  no_auth_flag <- if (match_no_auth) "-1" else NULL
  canon_flag <- if (match_canon) "-c" else NULL
  # Flags shared by the docker and local calls; `max_dist` is coerced to
  # character by `c()`
  shared_flags <- c(
    "-e",
    max_dist,
    "-F", # no manual matching
    no_auth_flag,
    canon_flag
  )

  # Specify temporary output file
  match_results_txt <- tempfile(
    pattern = digest::digest(c(query, reference)),
    fileext = ".txt"
  )
  on.exit(unlink(match_results_txt), add = TRUE)
  if (fs::file_exists(match_results_txt)) fs::file_delete(match_results_txt)

  # Run taxon-tools matchnames ----
  if (isTRUE(docker)) {
    assertthat::assert_that(
      requireNamespace("babelwhale", quietly = TRUE),
      msg = "babelwhale needs to be installed to use docker"
    )
    assertthat::assert_that(
      babelwhale::test_docker_installation(),
      msg = "docker not installed"
    )
    # File paths are passed as elements named `file`, as in the original call
    run_auto_mount(
      container_id = "camwebb/taxon-tools:v1.3.0",
      command = "matchnames",
      args = c(
        "-a",
        file = query_parsed_txt,
        "-b",
        file = ref_parsed_txt,
        "-o",
        file = match_results_txt,
        shared_flags
      )
    )
  } else {
    assertthat::assert_that(
      ts_tt_installed(),
      msg = "taxon-tools not installed"
    )
    processx::run(
      command = "matchnames",
      args = c(
        "-a",
        query_parsed_txt,
        "-b",
        ref_parsed_txt,
        "-o",
        match_results_txt,
        shared_flags
      )
    )
  }

  # Read in results ----
  # Each line represents a single name from the query list (list A).
  # Seventeen pipe-delimited ("|") fields per row:
  # 1. User ID code in list A,
  # 2. Code in list B (if matched),
  # 3. Match type,
  # 4-10. Parsed elements of name in list A.
  # 11-17 (in same format as name input), Parsed elements of name in list B.
  matchnames_cols <- c(
    "id_query",
    "id_ref",
    "match_type",
    "genus_hybrid_sign_query",
    "genus_name_query",
    "species_hybrid_sign_query",
    "specific_epithet_query",
    "infraspecific_rank_query",
    "infraspecific_epithet_query",
    "author_query",
    "genus_hybrid_sign_ref",
    "genus_name_ref",
    "species_hybrid_sign_ref",
    "specific_epithet_ref",
    "infraspecific_rank_ref",
    "infraspecific_epithet_ref",
    "author_ref"
  )

  results <- data.frame(record = readLines(match_results_txt))

  results <- tidyr::separate(
    data = results,
    col = record,
    into = matchnames_cols,
    sep = "\\|",
    fill = "right",
    remove = TRUE
  )

  # Convert empty strings to NA
  results <- dplyr::mutate(
    results,
    dplyr::across(dplyr::everything(), ~ dplyr::na_if(.x, ""))
  )

  # Add back in the original search terms (query and reference)
  results <- dplyr::left_join(
    results,
    dplyr::select(query_parsed_df, id_query = id, query = name),
    by = "id_query"
  )

  results <- dplyr::left_join(
    results,
    dplyr::select(ref_parsed_df, id_ref = id, reference = name),
    by = "id_ref"
  )

  results <- dplyr::select(
    results,
    query,
    reference,
    match_type,
    dplyr::everything()
  )

  # Add back in names that were duplicated due to collapsed infrasp names:
  # start from every original query row and attach the result of the row that
  # represented it during matching (via `id_map`)
  if (isTRUE(collapse_infra)) {
    results <-
      dplyr::select(
        query_parsed_df_original,
        query = name,
        id
      ) |>
      dplyr::left_join(id_map, by = "id") |>
      dplyr::left_join(
        dplyr::select(results, -query),
        by = "id_query"
      ) |>
      dplyr::select(-id) |>
      dplyr::select(query, reference, match_type, dplyr::everything())
  }

  # For manual matches, restore back to original query input, and specify
  # that match was made manually
  # NOTE(review): `query_new` stays in the output when `simple` is FALSE, as
  # in the previous implementation; it is not listed in the documented return
  # columns -- confirm whether that is intended.
  if (!is.null(manual_match)) {
    results <-
      manual_replacement_df |>
      dplyr::inner_join(
        results,
        by = dplyr::join_by(query_new == query)
      ) |>
      dplyr::mutate(
        match_type = dplyr::case_when(
          query_original %in% manual_match$query ~ "manual",
          .default = match_type
        )
      ) |>
      dplyr::select(
        query = query_original,
        dplyr::everything()
      )
  }

  if (simple) {
    results <- dplyr::select(results, query, reference, match_type)
  }

  if (isTRUE(tbl_out)) {
    return(tibble::as_tibble(results))
  }

  results
}
481 |
--------------------------------------------------------------------------------