├── .github
    ├── .gitignore
    └── workflows
    │   └── pkgdown.yaml
├── vignettes
    ├── .gitignore
    └── basics.Rmd
├── LICENSE
├── man
    ├── figures
    │   └── logo.png
    ├── pipe.Rd
    ├── ts_classify_result.Rd
    ├── ts_tt_installed.Rd
    ├── ts_make_name_df.Rd
    ├── filmy_taxonomy.Rd
    ├── ts_write_names.Rd
    ├── ts_parse_names.Rd
    ├── ts_resolve_names.Rd
    └── ts_match_names.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── _snaps
    │       ├── ts_write_names
    │       │   └── parsed_name.txt
    │       ├── ts_resolve_names.md
    │       ├── ts_parse_names.md
    │       └── ts_match_names.md
    │   ├── test-utils.R
    │   ├── test-ts_parse_names.R
    │   ├── test-ts_write_names.R
    │   ├── test-ts_resolve_names.R
    │   └── test-ts_match_names.R
├── data
    └── filmy_taxonomy.rda
├── .gitignore
├── .Rbuildignore
├── NAMESPACE
├── R
    ├── utils-pipe.R
    ├── data.R
    ├── ts_tt_installed.R
    ├── globals.R
    ├── ts_write_names.R
    ├── utils.R
    ├── ts_parse_names.R
    ├── ts_resolve_names.R
    └── ts_match_names.R
├── _pkgdown.yml
├── LICENSE.md
├── DESCRIPTION
├── data-raw
    └── filmy_taxonomy.R
├── README.Rmd
└── README.md


/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2019
2 | COPYRIGHT HOLDER: Joel Nitta
3 | 


--------------------------------------------------------------------------------
/man/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joelnitta/taxastand/HEAD/man/figures/logo.png


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(taxastand)
3 | 
4 | test_check("taxastand")
5 | 


--------------------------------------------------------------------------------
/data/filmy_taxonomy.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joelnitta/taxastand/HEAD/data/filmy_taxonomy.rda


--------------------------------------------------------------------------------
/tests/testthat/_snaps/ts_write_names/parsed_name.txt:
--------------------------------------------------------------------------------
1 | 5f207ff2-1||Foogenus|×|barspecies|var.|foosubsp|(L.) F. Bar
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | .Rprofile
 5 | *.Rproj
 6 | .DS_Store
 7 | docs
 8 | inst/doc
 9 | /doc/
10 | /Meta/
11 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^taxastand\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^\.Rprofile$
 4 | ^LICENSE\.md$
 5 | ^data-raw$
 6 | ^README\.Rmd$
 7 | ^_pkgdown\.yml$
 8 | ^docs$
 9 | ^pkgdown$
10 | ^\.github$
11 | ^doc$
12 | ^Meta$
13 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export("%>%")
 4 | export(ts_match_names)
 5 | export(ts_parse_names)
 6 | export(ts_resolve_names)
 7 | export(ts_tt_installed)
 8 | export(ts_write_names)
 9 | importFrom(magrittr,"%>%")
10 | 


--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
 1 | #' Pipe operator
 2 | #'
 3 | #' See \code{magrittr::\link[magrittr]{\%>\%}} for details.
 4 | #'
 5 | #' @name %>%
 6 | #' @rdname pipe
 7 | #' @keywords internal
 8 | #' @export
 9 | #' @importFrom magrittr %>%
10 | #' @usage lhs \%>\% rhs
11 | NULL
12 | 


--------------------------------------------------------------------------------
/tests/testthat/_snaps/ts_resolve_names.md:
--------------------------------------------------------------------------------
1 | # Produces expected output with docker
2 | 
3 |     Code
4 |       match_results
5 |     Output
6 |                      query                      reference match_type
7 |       1 Gonocormus minutum Gonocormus minutus (Bl.) Bosch auto_fuzzy
8 | 
9 | 


--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils-pipe.R
 3 | \name{\%>\%}
 4 | \alias{\%>\%}
 5 | \title{Pipe operator}
 6 | \usage{
 7 | lhs \%>\% rhs
 8 | }
 9 | \description{
10 | See \code{magrittr::\link[magrittr]{\%>\%}} for details.
11 | }
12 | \keyword{internal}
13 | 


--------------------------------------------------------------------------------
/man/ts_classify_result.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{ts_classify_result}
 4 | \alias{ts_classify_result}
 5 | \title{Classify results of taxon-tools matching}
 6 | \usage{
 7 | ts_classify_result(match_results)
 8 | }
 9 | \arguments{
10 | \item{match_results}{Dataframe; output of ts_match_names()}
11 | }
12 | \value{
13 | Dataframe with column \code{result_type} added
14 | }
15 | \description{
16 | Classify results of taxon-tools matching
17 | }
18 | \keyword{internal}
19 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' Taxonomy of filmy ferns (family Hymenophyllaceae)
 2 | #'
 3 | #' A dataset containing taxonomic names and associated metadata for the
 4 | #' fern family Hymenophyllaceae. Downloaded from the
 5 | #' [Catalog of Life](http://www.catalogueoflife.org/), Version 1.5.
 6 | #' All columns formatted according to
 7 | #' [Darwin Core standard](https://dwc.tdwg.org/terms/). Only includes taxa
 8 | #' at the species or infraspecies level.
 9 | #'
10 | #' @format A data frame with 2729 rows and 31 variables.
11 | #'
12 | #' @source <http://www.catalogueoflife.org/>
13 | "filmy_taxonomy"
14 | 


--------------------------------------------------------------------------------
/tests/testthat/test-utils.R:
--------------------------------------------------------------------------------
 1 | test_that("Making a dataframe with taxonomic names works", {
 2 |   expect_s3_class(
 3 |     ts_make_name_df("Foogenus x barspecies var. foosubsp (L.) F. Bar"),
 4 |     "data.frame"
 5 |   )
 6 |   expect_error(
 7 |     ts_make_name_df(c("Foogenus", "Foogenus")),
 8 |     "Input taxa must be unique"
 9 |   )
10 |   expect_error(
11 |     ts_make_name_df(c("Foogenus", NA)),
12 |     "Input taxa may not contain NAs"
13 |   )
14 |   expect_error(
15 |     ts_classify_result("Foogenus"),
16 |     "match_results must be of class 'data\\.frame'"
17 |   )
18 | })
19 | 


--------------------------------------------------------------------------------
/man/ts_tt_installed.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ts_tt_installed.R
 3 | \name{ts_tt_installed}
 4 | \alias{ts_tt_installed}
 5 | \title{Test if \href{https://github.com/camwebb/taxon-tools}{taxon-tools} is installed}
 6 | \usage{
 7 | ts_tt_installed()
 8 | }
 9 | \value{
10 | \code{TRUE} if \href{https://github.com/camwebb/taxon-tools}{taxon-tools} is
11 | installed, or \code{FALSE} if not.
12 | }
13 | \description{
14 | Test if \href{https://github.com/camwebb/taxon-tools}{taxon-tools} is installed
15 | }
16 | \examples{
17 | ts_tt_installed()
18 | }
19 | 


--------------------------------------------------------------------------------
/R/ts_tt_installed.R:
--------------------------------------------------------------------------------
 1 | #' Test if [taxon-tools](https://github.com/camwebb/taxon-tools) is installed
 2 | #'
 3 | #' Runs `parsenames --version` and `matchnames --version`; if running either
 4 | #' command raises an error, taxon-tools is considered not installed.
 5 | #'
 6 | #' @return `TRUE` if [taxon-tools](https://github.com/camwebb/taxon-tools) is
 7 | #'   installed, or `FALSE` if not.
 8 | #' @export
 9 | #'
10 | #' @examples
11 | #' ts_tt_installed()
12 | ts_tt_installed <- function() {
13 |   tryCatch(
14 |     {
15 |       # Only success or failure matters here, so the output is not kept
16 |       processx::run("parsenames", "--version")
17 |       processx::run("matchnames", "--version")
18 |       TRUE
19 |     },
20 |     error = function(e) FALSE
21 |   )
22 | }
23 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
 1 | home:
 2 |   title: Standardize Taxonomic Names
 3 |   description: >
 4 |     Matches species names to a taxonomic standard. Resolves synonyms consistently and reproducibly.
 5 | template:
 6 |   params:
 7 |     bootswatch: lumen
 8 | reference:
 9 | - title: "Parse names"
10 | - contents:
11 |   - ts_parse_names
12 | - title: "Match names"
13 | - contents:
14 |   - ts_match_names
15 | - title: "Resolve names"
16 | - contents:
17 |   - ts_resolve_names
18 | - title: "Datasets"
19 | - contents:
20 |   - filmy_taxonomy
21 | - title: "I/O"
22 | - contents:
23 |   - ts_write_names
24 | - title: "Utilities"
25 | - contents:
26 |   - ts_tt_installed
27 | 


--------------------------------------------------------------------------------
/tests/testthat/_snaps/ts_parse_names.md:
--------------------------------------------------------------------------------
 1 | # Parsing works with docker
 2 | 
 3 |     Code
 4 |       invisible(capture.output(parse_res <- ts_parse_names(
 5 |         "Foogenus x barspecies var. foosubsp (L.) F. Bar", docker = TRUE)))
 6 |       parse_res
 7 |     Output
 8 |                                                    name         id genus_hybrid_sign
 9 |       1 Foogenus x barspecies var. foosubsp (L.) F. Bar 5f207ff2-1              <NA>
10 |         genus_name species_hybrid_sign specific_epithet infraspecific_rank
11 |       1   Foogenus                   ×       barspecies               var.
12 |         infraspecific_epithet      author
13 |       1              foosubsp (L.) F. Bar
14 | 
15 | 


--------------------------------------------------------------------------------
/man/ts_make_name_df.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{ts_make_name_df}
 4 | \alias{ts_make_name_df}
 5 | \title{Make a dataframe with taxonomic names}
 6 | \usage{
 7 | ts_make_name_df(taxa)
 8 | }
 9 | \arguments{
10 | \item{taxa}{Character vector; taxon names to be parsed by taxon-tools \code{parsenames}.
11 | Missing values not allowed. Must all be unique.}
12 | }
13 | \value{
14 | Dataframe with two columns: \code{id} and \code{name}
15 | }
16 | \description{
17 | Make a dataframe with taxonomic names
18 | }
19 | \examples{
20 | \dontrun{
21 | ts_make_name_df("Foogenus x barspecies var. foosubsp (L.) F. Bar")
22 | }
23 | }
24 | \keyword{internal}
25 | 


--------------------------------------------------------------------------------
/man/filmy_taxonomy.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{filmy_taxonomy}
 5 | \alias{filmy_taxonomy}
 6 | \title{Taxonomy of filmy ferns (family Hymenophyllaceae)}
 7 | \format{
 8 | A data frame with 2729 rows and 31 variables.
 9 | }
10 | \source{
11 | \url{http://www.catalogueoflife.org/}
12 | }
13 | \usage{
14 | filmy_taxonomy
15 | }
16 | \description{
17 | A dataset containing taxonomic names and associated metadata for the
18 | fern family Hymenophyllaceae. Downloaded from the
19 | \href{http://www.catalogueoflife.org/}{Catalog of Life}, Version 1.5.
20 | All columns formatted according to
21 | \href{https://dwc.tdwg.org/terms/}{Darwin Core standard}. Only includes taxa
22 | at the species or infraspecies level.
23 | }
24 | \keyword{datasets}
25 | 


--------------------------------------------------------------------------------
/tests/testthat/test-ts_parse_names.R:
--------------------------------------------------------------------------------
 1 | test_that("Input checks work", {
 2 |   expect_error(
 3 |     ts_parse_names(c("Foogenus", "Foogenus")),
 4 |     "Input taxa must be unique"
 5 |   )
 6 |   expect_error(
 7 |     ts_parse_names(c("Foogenus", NA)),
 8 |     "Input taxa may not contain NAs"
 9 |   )
10 | })
11 | 
12 | test_that("Parsing works with docker", {
13 |   skip_if_no_docker()
14 |   expect_snapshot({
15 |     # Need invisible() and capture.output() to suppress spinner
16 |     invisible(
17 |       capture.output(
18 |         parse_res <- ts_parse_names(
19 |           "Foogenus x barspecies var. foosubsp (L.) F. Bar",
20 |           docker = TRUE
21 |         )
22 |       )
23 |     )
24 |     parse_res
25 |   })
26 | })
27 | 
28 | test_that("Parsing works with local taxon-tools", {
29 |   skip_if_no_tt()
30 |   expect_snapshot(
31 |     ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
32 |   )
33 | })
34 | 


--------------------------------------------------------------------------------
/man/ts_write_names.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ts_write_names.R
 3 | \name{ts_write_names}
 4 | \alias{ts_write_names}
 5 | \title{Write out parsed names to a text file}
 6 | \usage{
 7 | ts_write_names(df, path)
 8 | }
 9 | \arguments{
10 | \item{df}{Dataframe with parsed names}
11 | 
12 | \item{path}{Path to write dataframe
13 | 
14 | Writes out parsed names in a format that can be used by \href{https://github.com/camwebb/taxon-tools}{taxon-tools}
15 | (each part of the scientific name is separated by the pipe symbol (|), with one name per line).}
16 | }
17 | \value{
18 | Path to parsed names
19 | }
20 | \description{
21 | Write out parsed names to a text file
22 | }
23 | \examples{
24 | if (ts_tt_installed()) {
25 |   parsed_names <- ts_parse_names(
26 |     "Foogenus x barspecies var. foosubsp (L.) F. Bar")
27 |   temp_file <- tempfile()
28 |   ts_write_names(parsed_names, temp_file)
29 |   readLines(temp_file)
30 |   file.remove(temp_file)
31 | }
32 | }
33 | 


--------------------------------------------------------------------------------
/R/globals.R:
--------------------------------------------------------------------------------
 1 | # Generated by roxyglobals: do not edit by hand
 2 | 
 3 | utils::globalVariables(c(
 4 |   "namestring", # <ts_match_names>
 5 |   "id", # <ts_match_names>
 6 |   "key_id", # <ts_match_names>
 7 |   "record", # <ts_match_names>
 8 |   "namestring_query", # <ts_match_names>
 9 |   "name", # <ts_match_names>
10 |   "match_type", # <ts_match_names>
11 |   "record", # <ts_parse_names>
12 |   "id", # <ts_parse_names>
13 |   "name", # <ts_parse_names>
14 |   "reference", # <ts_resolve_names>
15 |   "match_type", # <ts_resolve_names>
16 |   "result_type", # <ts_resolve_names>
17 |   "acceptedNameUsageID", # <ts_resolve_names>
18 |   "taxonID", # <ts_resolve_names>
19 |   "taxonomicStatus", # <ts_resolve_names>
20 |   "scientificName", # <ts_resolve_names>
21 |   "resolved_name", # <ts_resolve_names>
22 |   "resolved_status", # <ts_resolve_names>
23 |   "n", # <ts_resolve_names>
24 |   "matched_name", # <ts_resolve_names>
25 |   "matched_status", # <ts_resolve_names>
26 |   "query", # <ts_classify_result>
27 |   "result_type", # <ts_classify_result>
28 |   "n", # <ts_classify_result>
29 |   NULL
30 | ))
31 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2019 Joel Nitta
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tests/testthat/test-ts_write_names.R:
--------------------------------------------------------------------------------
 1 | test_that("Input checks work", {
 2 |   expect_error(
 3 |     ts_write_names("Foogenus", tempfile()),
 4 |     "df must be of class 'data\\.frame'"
 5 |   )
 6 |   partial_names_df <- data.frame(
 7 |     id = "1",
 8 |     genus_hybrid_sign = "x"
 9 |   )
10 |   expect_error(
11 |     ts_write_names(partial_names_df, tempfile()),
12 |     "df must include the following columns"
13 |   )
14 | })
15 | 
16 | test_that("Produces expected output file with docker", {
17 |   skip_if_no_docker()
18 |   parsed_names <- ts_parse_names(
19 |     "Foogenus x barspecies var. foosubsp (L.) F. Bar",
20 |     docker = TRUE
21 |   )
22 |   # Write to the session temp dir so nothing is left in the tests folder
23 |   # if the test stops before the cleanup step
24 |   out_file <- file.path(tempdir(), "parsed_name.txt")
25 |   expect_snapshot_file(
26 |     ts_write_names(parsed_names, out_file),
27 |     "parsed_name.txt"
28 |   )
29 |   file.remove(out_file)
30 | })
31 | 
32 | test_that("Produces expected output file without docker", {
33 |   skip_if_no_tt()
34 |   parsed_names <- ts_parse_names(
35 |     "Foogenus x barspecies var. foosubsp (L.) F. Bar"
36 |   )
37 |   out_file <- file.path(tempdir(), "parsed_name.txt")
38 |   expect_snapshot_file(
39 |     ts_write_names(parsed_names, out_file),
40 |     "parsed_name.txt"
41 |   )
42 |   file.remove(out_file)
43 | })
44 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: taxastand
 2 | Title: Taxonomic Name Standardization
 3 | Version: 1.0.0
 4 | Authors@R: 
 5 |     person(given = "Joel",
 6 |            family = "Nitta",
 7 |            role = c("aut", "cre"),
 8 |            email = "joelnitta@gmail.com")
 9 | Description: Matches species names to a taxonomic standard. Resolves synonyms consistently and reproducibly.
10 | License: MIT + file LICENSE
11 | Encoding: UTF-8
12 | LazyData: true
13 | SystemRequirements: 
14 |     parsenames (<https://github.com/camwebb/taxon-tools>),
15 |     matchnames (<https://github.com/camwebb/taxon-tools>)
16 | Imports: 
17 |     assertr,
18 |     assertthat,
19 |     digest,
20 |     dplyr,
21 |     fs,
22 |     glue,
23 |     magrittr,
24 |     processx,
25 |     tibble,
26 |     tidyr
27 | Roxygen: list(
28 |     markdown = TRUE,
29 |     roclets = c("collate", "namespace", "rd", "roxyglobals::global_roclet"))
30 | RoxygenNote: 7.3.2
31 | Depends: R (>= 4.1.0)
32 | Suggests: 
33 |     rmarkdown,
34 |     knitr,
35 |     roxyglobals (>= 0.2.1),
36 |     testthat (>= 3.0.0),
37 |     babelwhale
38 | Config/testthat/edition: 3
39 | Remotes: 
40 |     anthonynorth/roxyglobals
41 | VignetteBuilder: knitr
42 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |     tags: ['*']
 7 | 
 8 | name: pkgdown
 9 | 
10 | jobs:
11 |   pkgdown:
12 |     runs-on: ubuntu-latest
13 |     env:
14 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
15 |     steps:
16 |       - uses: actions/checkout@v2
17 | 
18 |       - uses: r-lib/actions/setup-pandoc@v1
19 | 
20 |       - uses: r-lib/actions/setup-r@v1
21 |         with:
22 |           use-public-rspm: true
23 | 
24 |       - uses: r-lib/actions/setup-r-dependencies@v1
25 |         with:
26 |           extra-packages: pkgdown
27 |           needs: website
28 | 
29 |       - name: Install dependencies
30 |         run: |
31 |           sudo apt-get install -y --no-install-recommends gawk
32 |           git clone https://github.com/camwebb/taxon-tools.git
33 |           cd taxon-tools
34 |           git checkout 8f8b5e2611b6fdef1998b7878e93e60a9bc7c130
35 |           make check
36 |           sudo make install
37 |           cd ..
38 | 
39 |       - name: Deploy package
40 |         run: |
41 |           git config --local user.name "$GITHUB_ACTOR"
42 |           git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
43 |           Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)'
44 | 


--------------------------------------------------------------------------------
/data-raw/filmy_taxonomy.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | 
 3 | # Load the example standard taxonomy for resolving names.
 4 | 
 5 | # The example standard taxonomy is the family Hymenophyllaceae from
 6 | # Catalog of Life (CoL). CoL provides persistent links to database dumps.
 7 | # This one was obtained by selecting "Hymenophyllaceae" for "family"
 8 | # and "Complete data" on http://www.catalogueoflife.org/DCA_Export/index.php
 9 | # on 2019-06-19
10 | 
11 | # Download the zip file
12 | temp_dir <- fs::dir_create(tempdir())
13 | zip_file <- fs::path(temp_dir, "archive-family-hymenophyllaceae-bl3.zip")
14 | download.file(
15 |   "http://www.catalogueoflife.org/DCA_Export/zip/archive-family-hymenophyllaceae-bl3.zip",
16 |   zip_file
17 | )
18 | 
19 | # Unzip
20 | unzip(zip_file, exdir = temp_dir)
21 | 
22 | # Read in taxonomy table, keep only
23 | # names at species rank and below
24 | # (warnings are produced because names at genus level
25 | # and above have NA for many fields).
26 | filmy_taxonomy <- read_tsv(fs::path(temp_dir, "taxa.txt")) %>%
27 |   filter(str_detect(taxonRank, "species"))
28 | 
29 | # Replace "v. d. Bosch" with "V. D. Bosch"
30 | # see https://github.com/camwebb/taxon-tools/issues/10
31 | # (fixed() makes the pattern a literal string; as a regular expression
32 | # each "." would match any character)
33 | filmy_taxonomy <-
34 |   filmy_taxonomy %>%
35 |   dplyr::mutate(
36 |     scientificName = stringr::str_replace_all(
37 |       scientificName,
38 |       stringr::fixed("v. d. Bosch"),
39 |       "V. D. Bosch"
40 |     )
41 |   )
42 | 
43 | usethis::use_data(filmy_taxonomy)
44 | 


--------------------------------------------------------------------------------
/tests/testthat/test-ts_resolve_names.R:
--------------------------------------------------------------------------------
 1 | data(filmy_taxonomy)
 2 | 
 3 | test_that("Input checks work", {
 4 |   expect_error(
 5 |     ts_resolve_names(10, data.frame(genus = "Foogenus")),
 6 |     "query must be of class"
 7 |   )
 8 |   expect_error(
 9 |     ts_resolve_names(data.frame(genus = "Foogenus"), 10),
10 |     "ref_taxonomy must be of class"
11 |   )
12 | })
13 | 
14 | test_that("Produces expected output with docker", {
15 |   skip_if_no_docker()
16 |   # Query a misspelled name
17 |   match_results <- ts_match_names(
18 |     query = "Gonocormus minutum",
19 |     reference = unique(filmy_taxonomy$scientificName),
20 |     simple = TRUE,
21 |     docker = TRUE
22 |   )
23 |   expect_s3_class(
24 |     ts_resolve_names(match_results, filmy_taxonomy),
25 |     "data.frame"
26 |   )
27 |   expect_s3_class(
28 |     ts_resolve_names("Gonocormus minutum", filmy_taxonomy, docker = TRUE),
29 |     "data.frame"
30 |   )
31 |   expect_snapshot(match_results)
32 | })
33 | 
34 | 
35 | test_that("Produces expected output without docker", {
36 |   skip_if_no_tt()
37 |   # Query a misspelled name
38 |   match_results <- ts_match_names(
39 |     query = "Gonocormus minutum",
40 |     reference = unique(filmy_taxonomy$scientificName),
41 |     simple = TRUE
42 |   )
43 |   expect_s3_class(
44 |     ts_resolve_names(match_results, filmy_taxonomy),
45 |     "data.frame"
46 |   )
47 |   expect_s3_class(
48 |     ts_resolve_names("Gonocormus minutum", filmy_taxonomy),
49 |     "data.frame"
50 |   )
51 |   expect_snapshot(match_results)
52 | })
53 | 


--------------------------------------------------------------------------------
/R/ts_write_names.R:
--------------------------------------------------------------------------------
 1 | #' Write out parsed names to a text file
 2 | #'
 3 | #' Writes out parsed names in a format that can be used by
 4 | #' [taxon-tools](https://github.com/camwebb/taxon-tools): each part of the
 5 | #' scientific name is separated by the pipe symbol (|), with one name per line.
 6 | #' Missing values are written as empty strings.
 7 | #'
 8 | #' @param df Dataframe with parsed names. Must include the columns `id`,
 9 | #'   `genus_hybrid_sign`, `genus_name`, `species_hybrid_sign`,
10 | #'   `specific_epithet`, `infraspecific_rank`, `infraspecific_epithet`, and
11 | #'   `author`; any other columns are ignored.
12 | #' @param path Path to write dataframe
13 | #'
14 | #' @autoglobal
15 | #' @return Path to parsed names
16 | #' @export
17 | #' @examples
18 | #' if (ts_tt_installed()) {
19 | #'   parsed_names <- ts_parse_names(
20 | #'     "Foogenus x barspecies var. foosubsp (L.) F. Bar")
21 | #'   temp_file <- tempfile()
22 | #'   ts_write_names(parsed_names, temp_file)
23 | #'   readLines(temp_file)
24 | #'   file.remove(temp_file)
25 | #' }
26 | ts_write_names <- function(df, path) {
27 |   # Make vector of standard taxon-tools columns, in output order
28 |   tt_col_names <- c(
29 |     "id",
30 |     "genus_hybrid_sign",
31 |     "genus_name",
32 |     "species_hybrid_sign",
33 |     "specific_epithet",
34 |     "infraspecific_rank",
35 |     "infraspecific_epithet",
36 |     "author"
37 |   )
38 | 
39 |   assertthat::assert_that(
40 |     inherits(df, "data.frame"),
41 |     msg = "df must be of class 'data.frame'"
42 |   )
43 |   assertthat::assert_that(
44 |     isTRUE(all(tt_col_names %in% colnames(df))),
45 |     msg = glue::glue(
46 |       "df must include the following columns: {paste(tt_col_names, collapse = ', ')}"
47 |     )
48 |   )
49 | 
50 |   # Subset to only taxon-tools columns, in order. Doing this first keeps
51 |   # unrelated columns out of the NA replacement below.
52 |   df <- df[, tt_col_names]
53 | 
54 |   # Replace NA values with "". Columns are converted to character first
55 |   # because the replacement value "" must be compatible with the column type
56 |   # (an all-NA column is often logical rather than character).
57 |   df <- dplyr::mutate(
58 |     df,
59 |     dplyr::across(
60 |       dplyr::everything(),
61 |       ~ tidyr::replace_na(as.character(.), "")
62 |     )
63 |   )
64 | 
65 |   # taxon-tools uses pipe as separator
66 |   df <- tidyr::unite(df, col = "text", dplyr::all_of(tt_col_names), sep = "|")
67 | 
68 |   # write out text
69 |   writeLines(df$text, path)
70 | 
71 |   path
72 | }
73 | 


--------------------------------------------------------------------------------
/tests/testthat/_snaps/ts_match_names.md:
--------------------------------------------------------------------------------
 1 | # Produces expected output in docker
 2 | 
 3 |     Code
 4 |       match_res
 5 |     Output
 6 |                        query            reference match_type   id_query     id_ref
 7 |       1 Crepidomanes minutus Crepidomanes minutum auto_fuzzy c1ad73ec-1 19b861c8-1
 8 |         genus_hybrid_sign_query genus_name_query species_hybrid_sign_query
 9 |       1                    <NA>     Crepidomanes                      <NA>
10 |         specific_epithet_query infraspecific_rank_query infraspecific_epithet_query
11 |       1                minutus                     <NA>                        <NA>
12 |         author_query genus_hybrid_sign_ref genus_name_ref species_hybrid_sign_ref
13 |       1         <NA>                  <NA>   Crepidomanes                    <NA>
14 |         specific_epithet_ref infraspecific_rank_ref infraspecific_epithet_ref
15 |       1              minutum                   <NA>                      <NA>
16 |         author_ref
17 |       1       <NA>
18 | 
19 | # Manually matched names work
20 | 
21 |     Code
22 |       match_res
23 |     Output
24 |                        query                reference match_type
25 |       1 Crepidomanes minutus     Crepidomanes minutum auto_fuzzy
26 |       2        Hymeefee erae Hymenophyllum polyanthos     manual
27 | 
28 | # Names that can't be parsed don't show up in results
29 | 
30 |     Code
31 |       match_res
32 |     Output
33 |       # A tibble: 1 x 3
34 |         query                reference            match_type
35 |         <chr>                <chr>                <chr>     
36 |       1 Crepidomanes minutus Crepidomanes minutum auto_fuzzy
37 | 
38 | # Manually matched names work with collapsed infrasp names
39 | 
40 |     Code
41 |       match_res
42 |     Output
43 |       # A tibble: 6 x 3
44 |         query                       reference            match_type
45 |         <chr>                       <chr>                <chr>     
46 |       1 Crepidomanes minutus        Crepidomanes minutum auto_fuzzy
47 |       2 Crepidomanes minutawtaw     Crepidomanes minutum manual    
48 |       3 Blechnum lunare var. lunare Blechnum lunare      exact     
49 |       4 Blechnum lunare             Blechnum lunare      exact     
50 |       5 Bar foo var. foo            Bar foo              manual    
51 |       6 Bar foo                     Bar foo              exact     
52 | 
53 | 


--------------------------------------------------------------------------------
/man/ts_parse_names.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ts_parse_names.R
 3 | \name{ts_parse_names}
 4 | \alias{ts_parse_names}
 5 | \title{Parse taxonomic names}
 6 | \usage{
 7 | ts_parse_names(
 8 |   taxa,
 9 |   tbl_out = getOption("ts_tbl_out", default = FALSE),
10 |   quiet = FALSE,
11 |   docker = getOption("ts_docker", default = FALSE)
12 | )
13 | }
14 | \arguments{
15 | \item{taxa}{Character vector; taxon names to be parsed by taxon-tools
16 | \code{parsenames}. Missing values not allowed. Must all be unique.}
17 | 
18 | \item{tbl_out}{Logical vector of length 1; should a tibble be returned?
19 | If \code{FALSE} (default), output will be a data.frame. This argument can
20 | be controlled via the option \code{ts_tbl_out}; see Examples.}
21 | 
22 | \item{quiet}{Logical; if TRUE, suppress warning messages that would normally
23 | be issued}
24 | 
25 | \item{docker}{Logical; if TRUE, docker will be used to run taxon-tools
26 | (so that taxon-tools need not be installed).}
27 | }
28 | \value{
29 | A dataframe including the following columns.
30 | \itemize{
31 | \item id: A unique ID number assigned to the input name
32 | \item name: The input name
33 | \item genus_hybrid_sign: Hybrid sign for genus
34 | \item genus_name: Genus name
35 | \item species_hybrid_sign: Hybrid sign for species
36 | \item specific_epithet: Specific epithet (name)
37 | \item infraspecific_rank: Infraspecific rank
38 | \item infraspecific_epithet: Infraspecific epithet (name)
39 | \item author: Taxonomic author of the name
40 | }
41 | }
42 | \description{
43 | Requires \href{https://github.com/camwebb/taxon-tools}{taxon-tools} or docker
44 | to be installed.
45 | }
46 | \details{
47 | Parses scientific names into their component parts (genus, species, variety,
48 | author, etc).
49 | }
50 | \examples{
51 | # Using local taxon-tools installation
52 | if (ts_tt_installed()) {
53 | 
54 |   ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
55 |   ts_parse_names(
56 |     "Foogenus x barspecies var. foosubsp (L.) F. Bar", tbl_out = TRUE)
57 | 
58 |   # If you always want tibble output without specifying `tbl_out = TRUE`
59 |   # every time, set the option:
60 |   options(ts_tbl_out = TRUE)
61 |   ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
62 |   ts_parse_names("Crepidomanes minutum (Blume) K. Iwats.")
63 | 
64 | }
65 | 
66 | # Using docker
67 | if (babelwhale::test_docker_installation()) {
68 | 
69 | ts_parse_names(
70 |   "Foogenus x barspecies var. foosubsp (L.) F. Bar",
71 |   docker = TRUE)
72 | 
73 | }
74 | 
75 | }
76 | 


--------------------------------------------------------------------------------
/tests/testthat/test-ts_match_names.R:
--------------------------------------------------------------------------------
  1 | # Input validation: a numeric value is not accepted as `query` or as
  2 | # `reference`, whether the other argument is a character vector or a
  3 | # data frame
  4 | test_that("Input checks work", {
  5 |   not_names <- 10
  6 |   names_chr <- "Foogenus"
  7 |   names_df <- data.frame(genus = "Foogenus")
  8 |   query_msg <- "query must be of class"
  9 |   ref_msg <- "reference must be of class"
 10 | 
 11 |   # Invalid query
 12 |   expect_error(ts_match_names(not_names, names_chr), query_msg)
 13 |   expect_error(ts_match_names(not_names, names_df), query_msg)
 14 | 
 15 |   # Invalid reference
 16 |   expect_error(ts_match_names(names_chr, not_names), ref_msg)
 17 |   expect_error(ts_match_names(names_df, not_names), ref_msg)
 18 | })
 19 | 
 20 | test_that("Produces expected output in docker", {
 21 |   skip_if_no_docker() # skipped when docker is not installed
 22 |   match_res <- ts_match_names(
 23 |     "Crepidomanes minutus",
 24 |     "Crepidomanes minutum",
 25 |     docker = TRUE # run taxon-tools via docker
 26 |   )
 27 |   expect_s3_class(match_res, "data.frame")
 28 |   expect_snapshot(match_res) # output is checked against the stored snapshot
 29 | })
 30 | 
 31 | test_that("Produces expected output without docker", {
 32 |   skip_if_no_tt() # skipped when taxon-tools is not installed
 33 |   match_res <- ts_match_names(
 34 |     "Crepidomanes minutus",
 35 |     "Crepidomanes minutum"
 36 |   )
 37 |   expect_s3_class(match_res, "data.frame")
 38 |   expect_snapshot(match_res) # output is checked against the stored snapshot
 39 | })
 40 | 
 41 | test_that("Manually matched names work", {
 42 |   skip_if_no_docker() # skipped when docker is not installed
 43 |   match_res <- ts_match_names(
 44 |     query = c("Crepidomanes minutus", "Hymeefee erae"),
 45 |     reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
 46 |     manual_match = data.frame( # overrides any taxon-tools result
 47 |       query = "Hymeefee erae",
 48 |       match = "Hymenophyllum polyanthos"
 49 |     ),
 50 |     simple = TRUE, # only query, reference, and match_type columns
 51 |     docker = TRUE
 52 |   )
 53 |   expect_snapshot(match_res)
 54 | })
 55 | 
 56 | test_that("Names that can't be parsed don't show up in results", {
 57 |   skip_if_no_docker() # skipped when docker is not installed
 58 |   match_res <- ts_match_names(
 59 |     query = c( # first name is presumably the one that can't be parsed
 60 |       "Vanden kalamocarpa x Vanden nipponica x Vanden striata",
 61 |       "Crepidomanes minutus"
 62 |     ),
 63 |     reference = c(
 64 |       "Crepidomanes minutum"
 65 |     ),
 66 |     simple = TRUE, # only query, reference, and match_type columns
 67 |     docker = TRUE,
 68 |     tbl_out = TRUE # return a tibble
 69 |   )
 70 |   expect_snapshot(match_res)
 71 | })
 72 | 
 73 | test_that("Manually matched names work with collapsed infrasp names", {
 74 |   skip_if_no_docker() # skipped when docker is not installed
 75 |   match_res <- ts_match_names(
 76 |     query = c(
 77 |       "Crepidomanes minutus",
 78 |       "Crepidomanes minutawtaw",
 79 |       "Blechnum lunare var. lunare", # infraspecific epithet = specific epithet
 80 |       "Blechnum lunare",
 81 |       "Bar foo var. foo", # same pattern, but also manually matched below
 82 |       "Bar foo"
 83 |     ),
 84 |     reference = c(
 85 |       "Crepidomanes minutum",
 86 |       "Hymenophyllum polyanthos",
 87 |       "Blechnum lunare",
 88 |       "Bar foo"
 89 |     ),
 90 |     manual_match = data.frame( # overrides any taxon-tools result
 91 |       query = c("Bar foo var. foo", "Crepidomanes minutawtaw"),
 92 |       match = c("Bar foo", "Crepidomanes minutum")
 93 |     ),
 94 |     max_dist = 10,
 95 |     match_no_auth = FALSE,
 96 |     match_canon = FALSE,
 97 |     collapse_infra = TRUE, # drop infraspecific part if same as specific epithet
 98 |     collapse_infra_exclude = NULL,
 99 |     simple = TRUE,
100 |     docker = TRUE,
101 |     tbl_out = TRUE
102 |   )
103 |   expect_snapshot(match_res)
104 | })
105 | 
106 | # Each malformed `manual_match` (or unsupported `query` type) should raise
107 | # its own error
108 | test_that("Incorrectly specified manual match fails", {
109 |   skip_if_no_docker()
110 | 
111 |   query_names <- c("Crepidomanes minutus", "Hymeefee erae")
112 |   ref_names <- c("Crepidomanes minutum", "Hymenophyllum polyanthos")
113 | 
114 |   # Match against the shared reference, forcing the matches in `manual`
115 |   match_manually <- function(manual, query = query_names) {
116 |     ts_match_names(
117 |       query, ref_names, manual_match = manual, simple = TRUE, docker = TRUE
118 |     )
119 |   }
120 | 
121 |   # Manually matched name is absent from the reference
122 |   bad_ref <- data.frame(query = "Hymeefee erae", match = "Hymenophyllum poWHAT")
123 |   expect_error(
124 |     match_manually(bad_ref),
125 |     "One or more manually matched reference names not in reference data"
126 |   )
127 | 
128 |   # The same query is listed twice
129 |   dup_query <- data.frame(
130 |     query = c("Crepidomanes minutus", "Crepidomanes minutus"),
131 |     match = c("Hymenophyllum polyanthos", "Crepidomanes minutum")
132 |   )
133 |   expect_error(
134 |     match_manually(dup_query),
135 |     "All values of manual_match\\$query must be unique"
136 |   )
137 | 
138 |   # Column is called `name` instead of `query`
139 |   wrong_cols <- data.frame(
140 |     name = c("Hymenophyllum polyantha", "Crepidomanes minutu"),
141 |     match = c("Hymenophyllum polyanthos", "Crepidomanes minutum")
142 |   )
143 |   expect_error(
144 |     match_manually(wrong_cols),
145 |     "manual_match must have `query` and `match` columns"
146 |   )
147 | 
148 |   # Query supplied as already-parsed names instead of a character vector
149 |   one_match <- data.frame(
150 |     query = "Hymenophyllum polyantha",
151 |     match = "Hymenophyllum polyanthos"
152 |   )
153 |   expect_error(
154 |     match_manually(
155 |       one_match,
156 |       query = ts_parse_names("Hymenophyllum polyantha", docker = TRUE)
157 |     ),
158 |     "manual_match can only be used if query is a character vector"
159 |   )
160 | })
161 | 


--------------------------------------------------------------------------------
/man/ts_resolve_names.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/ts_resolve_names.R
  3 | \name{ts_resolve_names}
  4 | \alias{ts_resolve_names}
  5 | \title{Resolve synonyms in taxonomic names}
  6 | \usage{
  7 | ts_resolve_names(
  8 |   query,
  9 |   ref_taxonomy,
 10 |   max_dist = 10,
 11 |   match_no_auth = FALSE,
 12 |   match_canon = FALSE,
 13 |   collapse_infra = FALSE,
 14 |   collapse_infra_exclude = NULL,
 15 |   docker = getOption("ts_docker", default = FALSE),
 16 |   tbl_out = getOption("ts_tbl_out", default = FALSE)
 17 | )
 18 | }
 19 | \arguments{
 20 | \item{query}{Character vector or dataframe; taxonomic names to be resolved.
 21 | If a character vector, missing values not allowed and all values must be
 22 | unique. If a dataframe, should be taxonomic names matched with
 23 | \code{\link{ts_match_names}()}.}
 24 | 
 25 | \item{ref_taxonomy}{Dataframe; reference taxonomic data adhering to the
 26 | \href{https://dwc.tdwg.org/terms/#taxon}{Darwin Core standard} with the
 27 | following columns:
 28 | \itemize{
 29 | \item \code{taxonID}: \href{https://dwc.tdwg.org/terms/#dwc:taxonID}{Unique identifier for each taxon}.
 30 | \item \code{acceptedNameUsageID}: If the taxon is a synonym, the \href{https://dwc.tdwg.org/terms/#dwc:acceptedNameUsageID}{unique identifier for the accepted name}
 31 | \item \code{taxonomicStatus}: \href{https://dwc.tdwg.org/terms/#dwc:taxonomicStatus}{The status of the use of the \code{scientificName} as a label for the taxon}.
 32 | \item \code{scientificName}: \href{https://dwc.tdwg.org/terms/#dwc:scientificName}{The full scientific name of the taxon},
 33 | with authorship and date information if known.
 34 | }}
 35 | 
 36 | \item{max_dist}{Max Levenshtein distance to allow during fuzzy matching
 37 | (total insertions, deletions and substitutions). Default: 10.}
 38 | 
 39 | \item{match_no_auth}{Logical; If no author is given in the query and the name
 40 | (without author) occurs only once in the reference, accept the name in the
 41 | reference as a match. Default: to not allow such a match (\code{FALSE}).}
 42 | 
 43 | \item{match_canon}{Logical; Allow a "canonical name" match if only the genus,
 44 | species epithet, and infraspecific epithet (if present) match exactly.
 45 | Default: to not allow such a match (\code{FALSE}).}
 46 | 
 47 | \item{collapse_infra}{Logical; if the specific epithet and infraspecific
 48 | epithet are the same, drop the infraspecific rank and epithet from the query.
 49 | For more information, see \code{\link{ts_match_names}()}.}
 50 | 
 51 | \item{collapse_infra_exclude}{Character vector; taxonomic names to exclude
 52 | from collapsing with \code{collapse_infra}. Any names used must match those in \code{query}
 53 | exactly, or they won't be excluded.}
 54 | 
 55 | \item{docker}{Logical; if TRUE, docker will be used to run taxon-tools
 56 | (so that taxon-tools need not be installed).}
 57 | 
 58 | \item{tbl_out}{Logical vector of length 1; should a tibble be returned?
 59 | If \code{FALSE} (default), output will be a data.frame. This argument can
 60 | be controlled via the option \code{ts_tbl_out}; see Examples.}
 61 | }
 62 | \value{
 63 | Dataframe; results of resolving synonyms in matched taxonomic names.
 64 | Includes the following columns:
 65 | \itemize{
 66 | \item \code{query}: Query name
 67 | \item \code{resolved_name}: Accepted name after resolving synonyms
 68 | \item \code{matched_name}: Name matched to query
 69 | \item \code{resolved_status}: Taxonomic status of the resolved name (same as \code{taxonomicStatus} in \code{ref_taxonomy})
 70 | \item \code{matched_status}: Taxonomic status of the matched name (same as \code{taxonomicStatus} in \code{ref_taxonomy})
 71 | \item \code{match_type}: Type of match (for a summary of match types, \href{https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes}{see taxon-tools manual})
 72 | }
 73 | 
 74 | Names that could not be matched or resolve to multiple, different synonyms
 75 | have \code{NA} for \code{resolved_name}.
 76 | }
 77 | \description{
 78 | After matching taxonomic names to a reference, some may match synonyms. This
 79 | function resolves synonyms to their accepted names.
 80 | }
 81 | \details{
 82 | \code{query} can take as input either a character vector of taxonomic names, or
 83 | the output of \code{\link{ts_match_names}()}. If the former, it will run
 84 | \code{\link{ts_match_names}()} to match the query to \code{ref_taxonomy}, then
 85 | resolve synonyms. If the latter, the scientific names in \code{ref_taxonomy}
 86 | should be the same used as reference with \code{\link{ts_match_names}()}
 87 | (this is not checked).
 88 | 
 89 | \code{ref_taxonomy} must be taxonomic data adhering to the \href{https://dwc.tdwg.org/terms/#taxon}{Darwin Core standard}.
 90 | Darwin Core includes many terms, but only four (\code{taxonID},
 91 | \code{acceptedNameUsageID}, \code{taxonomicStatus}, and \code{scientificName}) are required
 92 | for this function.
 93 | }
 94 | \examples{
 95 | if (ts_tt_installed()) {
 96 |   # Load reference taxonomy in Darwin Core format
 97 |   data(filmy_taxonomy)
 98 | 
 99 |   ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
100 |   # If you always want tibble output without specifying `tbl_out = TRUE`
101 |   # every time, set the option:
102 |   options(ts_tbl_out = TRUE)
103 |   ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
104 | }
105 | 
106 | }
107 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | #' Make a dataframe with taxonomic names
  2 | #'
  3 | #' @param taxa Character vector; taxon names to be parsed by taxon-tools
  4 | #' `parsenames`. Missing values not allowed. Must all be unique.
  5 | #'
  6 | #' @return Dataframe with two columns: `id` and `name`. Has zero rows if
  7 | #' `taxa` is empty.
  8 | #' @keywords internal
  9 | #' @examples
 10 | #' \dontrun{
 11 | #' ts_make_name_df("Foogenus x barspecies var. foosubsp (L.) F. Bar")
 12 | #' }
 13 | ts_make_name_df <- function(taxa) {
 14 |   assertthat::assert_that(is.character(taxa))
 15 |   assertthat::assert_that(
 16 |     assertthat::noNA(taxa),
 17 |     msg = "Input taxa may not contain NAs"
 18 |   )
 19 |   assertthat::assert_that(
 20 |     all(assertr::is_uniq(taxa)),
 21 |     msg = "Input taxa must be unique"
 22 |   )
 23 | 
 24 |   # ID is the first 8 chars of the hash of the input (taxa), followed by "-"
 25 |   # and the position of the name. seq_along() and sprintf() return nothing
 26 |   # for empty input, where `1:nrow()` and paste() would not
 27 |   taxa_df <- data.frame(name = taxa)
 28 |   hash_prefix <- substr(digest::digest(taxa), 1, 8)
 29 |   taxa_df$id <- sprintf("%s-%d", hash_prefix, seq_along(taxa))
 30 |   taxa_df[, c("id", "name")]
 31 | }
 32 | 
 33 | #' Classify results of taxon-tools matching
 34 | #'
 35 | #' @param match_results Dataframe; output of [ts_match_names()]
 36 | #'
 37 | #' @return `match_results` with character column `result_type` added
 38 | #' @keywords internal
 39 | #' @autoglobal
 40 | ts_classify_result <- function(match_results) {
 41 |   assertthat::assert_that(
 42 |     inherits(match_results, "data.frame"),
 43 |     msg = "match_results must be of class 'data.frame'"
 44 |   )
 45 |   counted <- dplyr::add_count(match_results, query) # adds count column `n`
 46 |   classified <- dplyr::mutate(
 47 |     counted,
 48 |     result_type = dplyr::case_when(
 49 |       match_type != "no_match" & n == 1 ~ "single_match",
 50 |       match_type != "no_match" & n > 1 ~ "mult_match",
 51 |       match_type == "no_match" ~ "no_match",
 52 |       TRUE ~ NA_character_
 53 |     )
 54 |   )
 55 |   checked <- assertr::assert(classified, assertr::not_na, result_type)
 56 |   dplyr::select(checked, -n) # drop the temporary count column
 57 | }
 58 | 
 59 | # Test helper: skip the current test unless docker is installed
 60 | skip_if_no_docker <- function() {
 61 |   if (!babelwhale::test_docker_installation()) {
 62 |     testthat::skip("docker not installed")
 63 |   }
 64 |   invisible(TRUE)
 65 | }
 66 | 
 67 | # Test helper: skip the current test unless taxon-tools is installed
 68 | skip_if_no_tt <- function() {
 69 |   if (!ts_tt_installed()) {
 70 |     testthat::skip("taxon-tools not installed")
 71 |   }
 72 |   invisible(TRUE)
 73 | }
 74 | 
 75 | #' Run a containerised command with automatic mounting of files
 76 | #'
 77 | #' Similar to [babelwhale::run()], but automatically mounts files (and
 78 | #' directories) so the user doesn't have to keep track of volumes.
 79 | #'
 80 | #' The main difference to [babelwhale::run()] is the use of names for `args`:
 81 | #' any file (or directory) that should be mounted inside the container must be
 82 | #' named `file`. The other elements (arguments) don't need to be named. Note
 83 | #' that it is fine to have multiple elements with the same name (`file`).
 84 | #'
 85 | #' This should generally work as long as the command accepts absolute paths
 86 | #' for file input. If that is not the case, use [babelwhale::run()] instead and
 87 | #' specify paths and mounting manually.
 88 | #'
 89 | #' @inheritParams babelwhale::run
 90 | #' @param args Character vector, arguments to the command. Any files or
 91 | #'   directories that should be mounted must be named "file" (see example).
 92 | #' @param wd Local working directory to run command. If specified, the working
 93 | #'   directory will be mounted to the docker container.
 94 | #' @param wd_in_container Working directory to run command in
 95 | #'   the container. Defaults to the working directory mounted to the container
 96 | #'   (`wd`).
 97 | #'
 98 | #' @return List, formatted as output from [processx::run()]
 99 | #' @noRd
100 | #' @examples
101 | #' \dontrun{
102 | #' if (babelwhale::test_docker_installation()) {
103 | #'
104 | #' # Count the number of lines in the DESCRIPTION and LICENSE
105 | #' # files of the babelwhale package
106 | #' run_auto_mount(
107 | #'   container_id = "alpine",
108 | #'   command = "wc",
109 | #'   args = c("-l",
110 | #'     file = system.file("DESCRIPTION", package = "babelwhale"),
111 | #'     file = system.file("LICENSE", package = "babelwhale")
112 | #'   )
113 | #' )
114 | #'
115 | #' }
116 | #' }
117 | run_auto_mount <- function(
118 |   container_id,
119 |   command,
120 |   args = NULL,
121 |   wd = NULL,
122 |   wd_in_container = NULL,
123 |   environment_variables = NULL,
124 |   debug = FALSE,
125 |   verbose = FALSE,
126 |   stdout = "|",
127 |   stderr = "|"
128 | ) {
129 |   # Convert paths of file arguments to absolute for docker
130 |   is_file_arg <- names(args) == "file"
131 |   in_path <- fs::path_abs(args[is_file_arg])
132 |   in_file <- fs::path_file(in_path)
133 |   in_dir <- fs::path_dir(in_path)
134 | 
135 |   # Make (most likely) unique prefix for folder names that
136 |   # won't conflict with an existing folder in the container
137 |   # based on the hash of the container id and command
138 |   prefix <- digest::digest(c(container_id, command))
139 | 
140 |   # One folder in the container per file argument. seq_along() gives no
141 |   # folders when there are no file arguments (1:length() would give 1, 0)
142 |   mount_dir <- glue::glue("/{prefix}_{seq_along(in_dir)}")
143 | 
144 |   # Specify volume mounting for working directory
145 |   wd_volume <- NULL
146 |   if (!is.null(wd)) {
147 |     wd_path <- fs::path_abs(wd)
148 |     if (is.null(wd_in_container)) wd_in_container <- glue::glue("/{prefix}_wd")
149 |     wd_volume <- glue::glue("{wd_path}:{wd_in_container}")
150 |   }
151 | 
152 |   # Specify all volumes: one per file, plus working directory
153 |   volumes <- unique(
154 |     c(glue::glue("{in_dir}:{mount_dir}"), wd_volume)
155 |   )
156 | 
157 |   # Replace file arg paths with location in container
158 |   args[is_file_arg] <- glue::glue("{mount_dir}/{in_file}")
159 | 
160 |   # Run docker via babelwhale
161 |   babelwhale::run(
162 |     container_id = container_id,
163 |     command = command,
164 |     args = args,
165 |     volumes = volumes,
166 |     workspace = wd_in_container,
167 |     environment_variables = environment_variables,
168 |     debug = debug,
169 |     verbose = verbose,
170 |     stdout = stdout,
171 |     stderr = stderr
172 |   )
173 | }
174 | 


--------------------------------------------------------------------------------
/R/ts_parse_names.R:
--------------------------------------------------------------------------------
  1 | #' Parse taxonomic names
  2 | #'
  3 | #' Requires [taxon-tools](https://github.com/camwebb/taxon-tools) or docker
  4 | #' to be installed.
  5 | #'
  6 | #' Parses scientific names into their component parts (genus, species, variety,
  7 | #' author, etc).
  8 | #'
  9 | #' @param taxa Character vector; taxon names to be parsed by taxon-tools
 10 | #' `parsenames`. Missing values not allowed. Must all be unique.
 11 | #' @param tbl_out Logical vector of length 1; should a tibble be returned?
 12 | #' If `FALSE` (default), output will be a data.frame. This argument can
 13 | #' be controlled via the option `ts_tbl_out`; see Examples.
 14 | #' @param quiet Logical; if TRUE, suppress warning messages that would normally
 15 | #' be issued
 16 | #' @param docker Logical; if TRUE, docker will be used to run taxon-tools
 17 | #' (so that taxon-tools need not be installed).
 18 | #'
 19 | #' @return A dataframe including the following columns.
 20 | #' - id: A unique ID number assigned to the input name
 21 | #' - name: The input name
 22 | #' - genus_hybrid_sign: Hybrid sign for genus
 23 | #' - genus_name: Genus name
 24 | #' - species_hybrid_sign: Hybrid sign for species
 25 | #' - specific_epithet: Specific epithet (name)
 26 | #' - infraspecific_rank: Infraspecific rank
 27 | #' - infraspecific_epithet: Infraspecific epithet (name)
 28 | #' - author: Taxonomic author of the name
 29 | #'
 30 | #' Names that could not be parsed are excluded from the output, with a warning
 31 | #' unless `quiet` is `TRUE`. An error is raised if no names could be parsed.
 32 | #'
 33 | #' @autoglobal
 34 | #' @export
 35 | #' @examples
 36 | #' # Using local taxon-tools installation
 37 | #' if (ts_tt_installed()) {
 38 | #'
 39 | #'   ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
 40 | #'   ts_parse_names(
 41 | #'     "Foogenus x barspecies var. foosubsp (L.) F. Bar", tbl_out = TRUE)
 42 | #'
 43 | #'   # If you always want tibble output without specifying `tbl_out = TRUE`
 44 | #'   # every time, set the option:
 45 | #'   options(ts_tbl_out = TRUE)
 46 | #'   ts_parse_names("Foogenus x barspecies var. foosubsp (L.) F. Bar")
 47 | #'   ts_parse_names("Crepidomanes minutum (Blume) K. Iwats.")
 48 | #'
 49 | #' }
 50 | #'
 51 | #' # Using docker
 52 | #' if (babelwhale::test_docker_installation()) {
 53 | #'   ts_parse_names(
 54 | #'     "Foogenus x barspecies var. foosubsp (L.) F. Bar",
 55 | #'     docker = TRUE)
 56 | #' }
 57 | ts_parse_names <- function(
 58 |   taxa,
 59 |   tbl_out = getOption("ts_tbl_out", default = FALSE),
 60 |   quiet = FALSE,
 61 |   docker = getOption("ts_docker", default = FALSE)
 62 | ) {
 63 |   # Check input: must be character vector, no NA values, all unique
 64 |   assertthat::assert_that(is.character(taxa))
 65 |   assertthat::assert_that(
 66 |     assertthat::noNA(taxa),
 67 |     msg = "Input taxa may not contain NAs"
 68 |   )
 69 |   assertthat::assert_that(
 70 |     all(assertr::is_uniq(taxa)),
 71 |     msg = "Input taxa must be unique"
 72 |   )
 73 |   assertthat::assert_that(assertthat::is.flag(tbl_out))
 74 |   assertthat::assert_that(assertthat::is.flag(docker))
 75 | 
 76 |   # Write out names formatted for parsing with taxon-tools to temp file
 77 |   # format:
 78 |   # `id_num|taxon_name`
 79 |   # for example,
 80 |   # `x-234|Foogenus x barspecies var. foosubsp (L.) F. Bar`
 81 |   taxa_tbl <- ts_make_name_df(taxa)
 82 |   taxa_tbl$record <- paste(taxa_tbl$id, taxa_tbl$name, sep = "|")
 83 |   ref_taxa_txt_file <- tempfile(
 84 |     pattern = digest::digest(taxa),
 85 |     fileext = ".txt"
 86 |   )
 87 |   # Remove the temp file on exit, even if parsing stops with an error
 88 |   on.exit(unlink(ref_taxa_txt_file), add = TRUE)
 89 |   writeLines(taxa_tbl$record, ref_taxa_txt_file)
 90 | 
 91 |   # Parse reference names with taxon tools
 92 |   if (isTRUE(docker)) {
 93 |     assertthat::assert_that(
 94 |       requireNamespace("babelwhale", quietly = TRUE),
 95 |       msg = "babelwhale needs to be installed to use docker"
 96 |     )
 97 |     assertthat::assert_that(
 98 |       babelwhale::test_docker_installation(),
 99 |       msg = "docker not installed"
100 |     )
101 |     ref_parsed <- run_auto_mount(
102 |       container_id = "camwebb/taxon-tools:v1.3.0",
103 |       command = "parsenames",
104 |       args = c(file = ref_taxa_txt_file)
105 |     )
106 |   } else {
107 |     assertthat::assert_that(
108 |       ts_tt_installed(),
109 |       msg = "taxon-tools not installed"
110 |     )
111 |     ref_parsed <- processx::run("parsenames", ref_taxa_txt_file)
112 |   }
113 | 
114 |   # Read in results of parsing, format as dataframe
115 | 
116 |   # The output is originally one record per line, with fields separated by
117 |   # '|' (pipe symbol)
118 |   parsed_names <- data.frame(
119 |     record = strsplit(ref_parsed[["stdout"]], "\n")[[1]]
120 |   )
121 | 
122 |   # Split these into separate columns
123 |   name_parts <- c(
124 |     "genus_hybrid_sign",
125 |     "genus_name",
126 |     "species_hybrid_sign",
127 |     "specific_epithet",
128 |     "infraspecific_rank",
129 |     "infraspecific_epithet",
130 |     "author"
131 |   )
132 | 
133 |   parsed_names <- tidyr::separate(
134 |     data = parsed_names,
135 |     col = record,
136 |     into = c("id", name_parts),
137 |     sep = "\\|",
138 |     fill = "right",
139 |     remove = FALSE
140 |   )
141 | 
142 |   # Fill in NA if that name part is missing
143 |   parsed_names[parsed_names == ""] <- NA
144 | 
145 |   # Add "fail" column if all name parts are missing (couldn't be parsed
146 |   # properly). rowSums() (unlike looping over 1:nrow()) also works with zero
147 |   # rows, so empty parsenames output reaches the "no names" error below
148 |   parsed_names$fail <- unname(rowSums(!is.na(parsed_names[name_parts])) == 0)
149 | 
150 |   # Early exit if everything failed
151 |   assertthat::assert_that(
152 |     !all(parsed_names$fail),
153 |     msg = "No names could be successfully parsed"
154 |   )
155 | 
156 |   # Emit warning for failures, listing the original input names that were
157 |   # dropped
158 |   if (any(parsed_names$fail) && !isTRUE(quiet)) {
159 |     failed_ids <- parsed_names$id[parsed_names$fail]
160 |     failed_names <- paste(
161 |       taxa_tbl$name[taxa_tbl$id %in% failed_ids],
162 |       collapse = ", "
163 |     )
164 |     warning(glue::glue(
165 |       "The following names could not be parsed and are excluded from results: {failed_names}"
166 |     ))
167 |   }
168 | 
169 |   # Add back in original name
170 |   parsed_names <- dplyr::left_join(
171 |     parsed_names,
172 |     dplyr::select(taxa_tbl, id, name),
173 |     by = "id"
174 |   )
175 | 
176 |   # Remove failures, drop "fail" column
177 |   parsed_names <- parsed_names[!parsed_names$fail, ]
178 |   parsed_names$fail <- NULL
179 | 
180 |   # Return parsed names as dataframe or tibble
181 |   results <- parsed_names[, c("name", "id", name_parts)]
182 | 
183 |   if (isTRUE(tbl_out)) return(tibble::as_tibble(results))
184 | 
185 |   results
186 | }
187 | 


--------------------------------------------------------------------------------
/man/ts_match_names.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/ts_match_names.R
  3 | \name{ts_match_names}
  4 | \alias{ts_match_names}
  5 | \title{Match taxonomic names to a reference}
  6 | \usage{
  7 | ts_match_names(
  8 |   query,
  9 |   reference,
 10 |   manual_match = NULL,
 11 |   max_dist = 10,
 12 |   match_no_auth = FALSE,
 13 |   match_canon = FALSE,
 14 |   collapse_infra = FALSE,
 15 |   collapse_infra_exclude = NULL,
 16 |   simple = FALSE,
 17 |   docker = getOption("ts_docker", default = FALSE),
 18 |   tbl_out = getOption("ts_tbl_out", default = FALSE)
 19 | )
 20 | }
 21 | \arguments{
 22 | \item{query}{Character vector or dataframe; taxonomic names to be queried.
 23 | If a character vector, missing values not allowed and all values must be
 24 | unique.
 25 | If a dataframe, should be taxonomic names parsed with
 26 | \code{\link{ts_parse_names}()}.}
 27 | 
 28 | \item{reference}{Character vector or dataframe; taxonomic names to use as
 29 | reference. If a character vector, missing values not allowed and all values
 30 | must be unique. If a dataframe, should be taxonomic names parsed with
 31 | \code{\link{ts_parse_names}()}.}
 32 | 
 33 | \item{manual_match}{Optional. Dataframe of manually matched names that will
 34 | override any results from \code{taxon-tools}. Must include two columns, \code{query}
 35 | and \code{match}.}
 36 | 
 37 | \item{max_dist}{Max Levenshtein distance to allow during fuzzy matching
 38 | (total insertions, deletions and substitutions). Default: 10.}
 39 | 
 40 | \item{match_no_auth}{Logical; If no author is given in the query and the name
 41 | (without author) occurs only once in the reference, accept the name in the
 42 | reference as a match. Default: to not allow such a match (\code{FALSE}).}
 43 | 
 44 | \item{match_canon}{Logical; Allow a "canonical name" match if only the genus,
 45 | species epithet, and infraspecific epithet (if present) match exactly.
 46 | Default: to not allow such a match (\code{FALSE}).}
 47 | 
 48 | \item{collapse_infra}{Logical; if the specific epithet and infraspecific
 49 | epithet are the same, drop the infraspecific rank and epithet from the query.}
 50 | 
 51 | \item{collapse_infra_exclude}{Character vector; taxonomic names to exclude
 52 | from collapsing with \code{collapse_infra}. Any names used must match those in
 53 | \code{query} exactly, or they won't be excluded.}
 54 | 
 55 | \item{simple}{Logical; return the output in a simplified format with only the
 56 | query name, matched reference name, and match type. Default: \code{FALSE}.}
 57 | 
 58 | \item{docker}{Logical; if TRUE, docker will be used to run taxon-tools
 59 | (so that taxon-tools need not be installed).}
 60 | 
 61 | \item{tbl_out}{Logical vector of length 1; should a tibble be returned?
 62 | If \code{FALSE} (default), output will be a data.frame. This argument can
 63 | be controlled via the option \code{ts_tbl_out}; see Examples.}
 64 | }
 65 | \value{
 66 | Dataframe with the following columns (if \code{simple} is \code{FALSE}):
 67 | \itemize{
 68 | \item query: Query name
 69 | \item reference: Matched reference name
 70 | \item match_type: Type of match (for a summary of match types, \href{https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes}{see taxon-tools manual})
 71 | \item id_query: Unique ID of query
 72 | \item id_ref: Unique ID of reference
 73 | \item genus_hybrid_sign_query: Genus hybrid sign in query
 74 | \item genus_name_query: Genus name of query
 75 | \item species_hybrid_sign_query: Species hybrid sign in query
 76 | \item specific_epithet_query: Specific epithet of query
 77 | \item infraspecific_rank_query: Infraspecific rank of query
 78 | \item infraspecific_epithet_query: Infraspecific epithet of query
 79 | \item author_query: Taxonomic author of query
 80 | \item genus_hybrid_sign_ref: Genus hybrid sign in reference
 81 | \item genus_name_ref: Genus name of reference
 82 | \item species_hybrid_sign_ref: Species hybrid sign in reference
 83 | \item specific_epithet_ref: Specific epithet of reference
 84 | \item infraspecific_rank_ref: Infraspecific rank of reference
 85 | \item infraspecific_epithet_ref: Infraspecific epithet of reference
 86 | \item author_ref: Taxonomic author of reference
 87 | }
 88 | 
 89 | If \code{simple} is \code{TRUE}, only return the first three columns above.
 90 | }
 91 | \description{
 92 | Allows for orthographic differences between query and reference by using
 93 | fuzzy matching on parsed taxonomic names. Requires
 94 | \href{https://github.com/camwebb/taxon-tools}{taxon-tools} or docker to be installed.
 95 | }
 96 | \details{
 97 | \code{taxon-tools} matches names in two steps:
 98 | \enumerate{
 99 | \item Scientific names are parsed into their component parts (genus, species,
100 | variety, author, etc).
101 | \item Names are fuzzily matched following taxonomic rules using the component
102 | parts.
103 | }
104 | 
105 | For more information on rules used for matching, \href{https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes}{see taxon-tools manual}.
106 | 
107 | Parsing is fairly fast (much faster than matching) but can take some time if
108 | the number of names is very large. If multiple queries will be made (e.g., to
109 | the same large reference database), it is recommended to first parse the
110 | names using \code{\link{ts_parse_names}()}, and use the results as input to
111 | \code{query} and/or \code{reference}.
112 | 
113 | \code{collapse_infra} is useful in situations where the reference database does
114 | not use names that have the same specific epithet and infraspecific epithet.
115 | For example, reference name "Blechnum lunare" and query "Blechnum lunare var.
116 | lunare". In this case, if \code{collapse_infra} is \code{TRUE}, "Blechnum lunare" will
117 | be queried instead of "Blechnum lunare var. lunare". Note that the
118 | \code{match_type} will be "exact" even though the literal query and the matched
119 | name are different (see example below).
120 | }
121 | \examples{
122 | if(ts_tt_installed()) {
123 |   ts_match_names(
124 |     "Crepidomanes minutus",
125 |     c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
126 |     simple = TRUE
127 |     )
128 | 
129 |   # If names are too distant, they won't match
130 |   ts_match_names(
131 |     query = "Crepidblah foo",
132 |     reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
133 |     simple = TRUE
134 |     )
135 | 
136 |   # But we can force a match manually
137 |   ts_match_names(
138 |     query = "Crepidblah foo",
139 |     reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
140 |     manual_match = data.frame(
141 |       query = c("Crepidblah foo"),
142 |       match = c("Crepidomanes minutum")
143 |     ),
144 |     simple = TRUE
145 |    )
146 | 
147 |   # If you always want tibble output without specifying `tbl_out = TRUE`
148 |   # every time, set the option:
149 |   options(ts_tbl_out = TRUE)
150 |   ts_match_names(
151 |     "Crepidomanes minutus",
152 |     c("Crepidomanes minutum", "Hymenophyllum polyanthos")
153 |     )
154 | 
155 |   # Example using collapse_infra argument
156 |   ts_match_names(
157 |     c("Crepidomanes minutus", "Blechnum lunare var. lunare",
158 |       "Blechnum lunare", "Bar foo var. foo", "Bar foo"),
159 |     c("Crepidomanes minutum", "Hymenophyllum polyanthos", "Blechnum lunare",
160 |       "Bar foo"),
161 |     collapse_infra = TRUE,
162 |     collapse_infra_exclude = "Bar foo var. foo",
163 |     simple = TRUE
164 |     )
165 | }
166 | 
167 | }
168 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | ---
  4 | 
  5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  6 | 
  7 | ```{r, include = FALSE}
  8 | knitr::opts_chunk$set(
  9 |   collapse = TRUE,
 10 |   comment = "#>",
 11 |   fig.path = "man/figures/"
 12 | )
 13 | ```
 14 | # taxastand <img src="man/figures/logo.png" align="right" alt="" width="120" />
 15 | 
 16 | <!-- badges: start -->
 17 | [![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)
 18 | [![DOI](https://zenodo.org/badge/192684959.svg)](https://zenodo.org/badge/latestdoi/192684959)
 19 | <!-- badges: end -->
 20 | 
 21 | The goal of `taxastand` is to standardize species names from different sources, a common task in biology. 
 22 | 
 23 | Very often different biologists use different synonyms to refer to the same species. If we want to join data from different sources, their taxonomic names must be standardized first. This is what `taxastand` seeks to do in a reproducible and efficient manner.
 24 | 
 25 | ## Important note
 26 | 
 27 | **This package is in early development.** There may be major, breaking changes to functionality in the near future. If you use this package, I highly recommend using a package manager like [renv](https://rstudio.github.io/renv/articles/renv.html) so that later updates won't break your code.
 28 | 
 29 | ## Taxonomic standard
 30 | 
 31 | `taxastand` is based on matching names to a single **taxonomic standard**, that is, a database of accepted names and synonyms. As long as a single taxonomic standard is used, we can confidently resolve names from disparate sources.
 32 | 
 33 | The taxonomic standard must conform to [Darwin Core standards](https://dwc.tdwg.org/). The user must provide this database (as a dataframe). There are many sources of taxonomic data online, including [GBIF](https://www.gbif.org/en/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c), [Catalogue of Life](http://www.catalogueoflife.org/), and [ITIS](https://www.itis.gov/) to name a few. The [taxadb](https://github.com/ropensci/taxadb) package provides convenient functions for downloading various taxonomic databases that use Darwin Core.
 34 | 
 35 | ## Installation
 36 | 
 37 | `taxastand` can be installed from [r-universe](https://joelnitta.r-universe.dev) or [GitHub](https://github.com/joelnitta/taxastand).
 38 | 
 39 | ``` r
 40 | install.packages("taxastand", repos = 'https://joelnitta.r-universe.dev')
 41 | ```
 42 | 
 43 | OR
 44 | 
 45 | ``` r
 46 | # install.packages("remotes")
 47 | remotes::install_github("joelnitta/taxastand")
 48 | ```
 49 | 
 50 | ## Dependencies
 51 | 
 52 | `taxastand` depends on [taxon-tools](https://github.com/camwebb/taxon-tools) for taxonomic name matching.
 53 | 
 54 | There are two options for using this dependency.
 55 | 
 56 | - Install [docker](https://www.docker.com/) and set `docker = TRUE` when using `taxastand` functions.
 57 | 
 58 | OR
 59 | 
 60 | - Install the two programs included in [taxon-tools](https://github.com/camwebb/taxon-tools), `parsenames` and `matchnames`.
 61 | 
 62 | ## Similar work
 63 | 
 64 | - [ROpenSci](https://ropensci.org/) has a [task view](https://github.com/ropensci/taxonomy) summarizing many tools available for taxonomy.
 65 | 
 66 | - [taxize](https://github.com/ropensci/taxize) is the "granddaddy" of taxonomy packages in R. It can search around 20 different taxonomic databases for names and retrieve taxonomic information.
 67 | 
 68 | - [TNRS](http://tnrs.iplantcollaborative.org/), the Taxonomic Name Resolution Service, is a web application that resolves taxonomic names of plants according to one of six databases.
 69 | 
 70 | - [taxizedb](https://github.com/ropensci/taxizedb) downloads taxonomic databases and provides tools to interface with them through SQL.
 71 | 
 72 | - [taxadb](https://github.com/ropensci/taxadb) also downloads and searches taxonomic databases. It can interface with them either through SQL or in-memory in R.
 73 | 
 74 | - [Taxonstand](https://cran.r-project.org/web/packages/Taxonstand/index.html) has a very similar goal to `taxastand`, but only uses [The Plant List (TPL)](http://www.theplantlist.org)
 75 |   as its taxonomic standard and does not allow the user to provide their own. Note that TPL is no longer being updated as of 2013.
 76 | 
 77 | ## Motivation
 78 | 
 79 | Although existing web-based solutions for taxonomic name resolution are very useful, they may not be ideal for all situations: the choice of reference database to use for standardization is limited, they may not be able to handle very large queries, and the user has no guarantee that the same input will yield the same output at a later date due to changes in the remote database. 
 80 | 
 81 | Furthermore, matching of taxonomic names is not straightforward, since they are complex data structures including multiple components (e.g., genus, specific epithet, basionym author, combination author, etc.). [Of the tools mentioned above](#similar-work), only [TNRS](http://tnrs.iplantcollaborative.org/) can fuzzily match taxonomic names based on their parsed components, but it does not allow for use of a local reference database.
 82 | 
 83 | The motivation for `taxastand` is to provide greater flexibility and reproducibility by allowing for complete version control of the code and database used for name resolution, while implementing fuzzy matching of parsed taxonomic names.
 84 | 
 85 | ## Example
 86 | 
 87 | Here is an example of fuzzy matching followed by resolution of synonyms using the dataset included with the package.
 88 | 
 89 | ```{r filmy-example-show, eval = FALSE}
 90 | library(taxastand)
 91 | 
 92 | # Load example reference taxonomy in Darwin Core format
 93 | data(filmy_taxonomy)
 94 | 
 95 | # Take a look at the columns used by taxastand
 96 | head(filmy_taxonomy[c(
 97 |   "taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")])
 98 | 
 99 | # As a test, resolve a misspelled name
100 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
101 | 
102 | # We can now use the `resolved_name` column of this result for downstream
103 | # analyses joining on other datasets that have been resolved to the same
104 | # reference taxonomy.
105 | ```
106 | 
107 | ```{r filmy-example-hide, echo = FALSE}
108 | library(taxastand)
109 | 
110 | # Load example reference taxonomy in Darwin Core format
111 | data(filmy_taxonomy)
112 | 
113 | # Take a look at the columns used by taxastand
114 | head(filmy_taxonomy[c(
115 |   "taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")])
116 | 
117 | # As a test, resolve a misspelled name
118 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy, docker = TRUE)
119 | 
120 | # We can now use the `resolved_name` column of this result for downstream
121 | # analyses joining on other datasets that have been resolved to the same
122 | # reference taxonomy.
123 | ```
124 | 
125 | ## Citing this package
126 | 
127 | If you use this package, please cite it! Here is an example:
128 | 
129 |     Nitta, JH (2021) taxastand: Taxonomic name standardization in R. https://doi.org/10.5281/zenodo.5726390
130 | 
131 | The example DOI above is for the overall package.
132 | 
133 | Here is the latest DOI, which you should use if you are using the latest
134 | version of the package:
135 | 
136 | [![DOI](https://zenodo.org/badge/192684959.svg)](https://zenodo.org/badge/latestdoi/192684959)
137 | 
138 | You can find DOIs for older versions by viewing the “Releases” menu on
139 | the right.
140 | 
141 | You should also cite the software that `taxastand` relies on, `taxon-tools`: https://github.com/camwebb/taxon-tools
142 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  3 | 
  4 | # taxastand <img src="man/figures/logo.png" align="right" alt="" width="120" />
  5 | 
  6 | <!-- badges: start -->
  7 | 
  8 | [![Project Status: WIP – Initial development is in progress, but there
  9 | has not yet been a stable, usable release suitable for the
 10 | public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)
 11 | [![DOI](https://zenodo.org/badge/192684959.svg)](https://zenodo.org/badge/latestdoi/192684959)
 12 | <!-- badges: end -->
 13 | 
 14 | The goal of `taxastand` is to standardize species names from different
 15 | sources, a common task in biology.
 16 | 
 17 | Very often different biologists use different synonyms to refer to the
 18 | same species. If we want to join data from different sources, their
 19 | taxonomic names must be standardized first. This is what `taxastand`
 20 | seeks to do in a reproducible and efficient manner.
 21 | 
 22 | ## Important note
 23 | 
 24 | **This package is in early development.** There may be major, breaking
 25 | changes to functionality in the near future. If you use this package, I
 26 | highly recommend using a package manager like
 27 | [renv](https://rstudio.github.io/renv/articles/renv.html) so that later
 28 | updates won’t break your code.
 29 | 
 30 | ## Taxonomic standard
 31 | 
 32 | `taxastand` is based on matching names to a single **taxonomic
 33 | standard**, that is, a database of accepted names and synonyms. As long
 34 | as a single taxonomic standard is used, we can confidently resolve names
 35 | from disparate sources.
 36 | 
 37 | The taxonomic standard must conform to [Darwin Core
 38 | standards](https://dwc.tdwg.org/). The user must provide this database
 39 | (as a dataframe). There are many sources of taxonomic data online,
 40 | including
 41 | [GBIF](https://www.gbif.org/en/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c),
 42 | [Catalogue of Life](http://www.catalogueoflife.org/), and
 43 | [ITIS](https://www.itis.gov/) to name a few. The
 44 | [taxadb](https://github.com/ropensci/taxadb) package provides convenient
 45 | functions for downloading various taxonomic databases that use Darwin
 46 | Core.
 47 | 
 48 | ## Installation
 49 | 
 50 | `taxastand` can be installed from
 51 | [r-universe](https://joelnitta.r-universe.dev) or
 52 | [GitHub](https://github.com/joelnitta/taxastand).
 53 | 
 54 | ``` r
 55 | install.packages("taxastand", repos = 'https://joelnitta.r-universe.dev')
 56 | ```
 57 | 
 58 | OR
 59 | 
 60 | ``` r
 61 | # install.packages("remotes")
 62 | remotes::install_github("joelnitta/taxastand")
 63 | ```
 64 | 
 65 | ## Dependencies
 66 | 
 67 | `taxastand` depends on
 68 | [taxon-tools](https://github.com/camwebb/taxon-tools) for taxonomic name
 69 | matching.
 70 | 
 71 | There are two options for using this dependency.
 72 | 
 73 | - Install [docker](https://www.docker.com/) and set `docker = TRUE` when
 74 |   using `taxastand` functions.
 75 | 
 76 | OR
 77 | 
 78 | - Install the two programs included in
 79 |   [taxon-tools](https://github.com/camwebb/taxon-tools), `parsenames`
 80 |   and `matchnames`.
 81 | 
 82 | ## Similar work
 83 | 
 84 | - [ROpenSci](https://ropensci.org/) has a [task
 85 |   view](https://github.com/ropensci/taxonomy) summarizing many tools
 86 |   available for taxonomy.
 87 | 
 88 | - [taxize](https://github.com/ropensci/taxize) is the “granddaddy” of
 89 |   taxonomy packages in R. It can search around 20 different taxonomic
 90 |   databases for names and retrieve taxonomic information.
 91 | 
 92 | - [TNRS](http://tnrs.iplantcollaborative.org/), the Taxonomic Name
 93 |   Resolution Service, is a web application that resolves taxonomic names
 94 |   of plants according to one of six databases.
 95 | 
 96 | - [taxizedb](https://github.com/ropensci/taxizedb) downloads taxonomic
 97 |   databases and provides tools to interface with them through SQL.
 98 | 
 99 | - [taxadb](https://github.com/ropensci/taxadb) also downloads and
100 |   searches taxonomic databases. It can interface with them either
101 |   through SQL or in-memory in R.
102 | 
103 | - [Taxonstand](https://cran.r-project.org/web/packages/Taxonstand/index.html)
104 |   has a very similar goal to `taxastand`, but only uses [The Plant List
105 |   (TPL)](http://www.theplantlist.org) as its taxonomic standard and does
106 |   not allow the user to provide their own. Note that TPL is no longer
107 |   being updated as of 2013.
108 | 
109 | ## Motivation
110 | 
111 | Although existing web-based solutions for taxonomic name resolution are
112 | very useful, they may not be ideal for all situations: the choice of
113 | reference database to use for standardization is limited, they may not
114 | be able to handle very large queries, and the user has no guarantee that
115 | the same input will yield the same output at a later date due to changes
116 | in the remote database.
117 | 
118 | Furthermore, matching of taxonomic names is not straightforward, since
119 | they are complex data structures including multiple components (e.g.,
120 | genus, specific epithet, basionym author, combination author, etc). [Of
121 | the tools mentioned above](#similar-work) only
122 | [TNRS](http://tnrs.iplantcollaborative.org/) can fuzzily match taxonomic
123 | names based on their parsed components, but it does not allow for use of
124 | a local reference database.
125 | 
126 | The motivation for `taxastand` is to provide greater flexibility and
127 | reproducibility by allowing for complete version control of the code and
128 | database used for name resolution, while implementing fuzzy matching of
129 | parsed taxonomic names.
130 | 
131 | ## Example
132 | 
133 | Here is an example of fuzzy matching followed by resolution of synonyms
134 | using the dataset included with the package.
135 | 
136 | ``` r
137 | library(taxastand)
138 | 
139 | # Load example reference taxonomy in Darwin Core format
140 | data(filmy_taxonomy)
141 | 
142 | # Take a look at the columns used by taxastand
143 | head(filmy_taxonomy[c(
144 |   "taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")])
145 | 
146 | # As a test, resolve a misspelled name
147 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
148 | 
149 | # We can now use the `resolved_name` column of this result for downstream
150 | # analyses joining on other datasets that have been resolved to the same
151 | # reference taxonomy.
152 | ```
153 | 
154 |     #>    taxonID acceptedNameUsageID taxonomicStatus
155 |     #> 1 54115096                  NA   accepted name
156 |     #> 2 54133783            54115097         synonym
157 |     #> 3 54115097                  NA   accepted name
158 |     #> 4 54133784            54115098         synonym
159 |     #> 5 54115098                  NA   accepted name
160 |     #> 6 54133785            54115099         synonym
161 |     #>                              scientificName
162 |     #> 1             Cephalomanes atrovirens Presl
163 |     #> 2                Trichomanes crassum Copel.
164 |     #> 3 Cephalomanes crassum (Copel.) M. G. Price
165 |     #> 4           Trichomanes densinervium Copel.
166 |     #> 5 Cephalomanes densinervium (Copel.) Copel.
167 |     #> 6         Trichomanes infundibulare Alderw.
168 |     #>                query                        resolved_name
169 |     #> 1 Gonocormus minutum Crepidomanes minutum (Bl.) K. Iwats.
170 |     #>                     matched_name resolved_status matched_status match_type
171 |     #> 1 Gonocormus minutus (Bl.) Bosch   accepted name        synonym auto_fuzzy
172 | 
173 | ## Citing this package
174 | 
175 | If you use this package, please cite it! Here is an example:
176 | 
177 |     Nitta, JH (2021) taxastand: Taxonomic name standardization in R. https://doi.org/10.5281/zenodo.5726390
178 | 
179 | The example DOI above is for the overall package.
180 | 
181 | Here is the latest DOI, which you should use if you are using the latest
182 | version of the package:
183 | 
184 | [![DOI](https://zenodo.org/badge/192684959.svg)](https://zenodo.org/badge/latestdoi/192684959)
185 | 
186 | You can find DOIs for older versions by viewing the “Releases” menu on
187 | the right.
188 | 
189 | You should also cite the software that `taxastand` relies on,
190 | `taxon-tools`: <https://github.com/camwebb/taxon-tools>
191 | 


--------------------------------------------------------------------------------
/R/ts_resolve_names.R:
--------------------------------------------------------------------------------
  1 | #' Resolve synonyms in taxonomic names
  2 | #'
  3 | #' After matching taxonomic names to a reference, some may match synonyms. This
  4 | #' function resolves synonyms to their accepted names.
  5 | #'
  6 | #' `query` can take as input either a character vector of taxonomic names, or
  7 | #' the output of \code{\link{ts_match_names}()}. If the former, it will run
  8 | #' \code{\link{ts_match_names}()} to match the query to `ref_taxonomy`, then
  9 | #' resolve synonyms. If the latter, the scientific names in `ref_taxonomy`
 10 | #' should be the same used as reference with \code{\link{ts_match_names}()}
 11 | #' (this is not checked), and the matching arguments (`max_dist`,
 12 | #' `match_no_auth`, `match_canon`, `collapse_infra`, `collapse_infra_exclude`,
 13 | #' and `docker`) are not used.
 14 | #'
 15 | #' `ref_taxonomy` must be taxonomic data adhering to the [Darwin Core standard](https://dwc.tdwg.org/terms/#taxon).
 16 | #' Darwin Core includes many terms, but only four (`taxonID`,
 17 | #' `acceptedNameUsageID`, `taxonomicStatus`, and `scientificName`) are required
 18 | #' for this function. An error is raised if any of these columns is missing.
 19 | #'
 20 | #' @param query Character vector or dataframe; taxonomic names to be resolved.
 21 | #'   If a character vector, missing values not allowed and all values must be
 22 | #'   unique. If a dataframe, should be taxonomic names matched with
 23 | #'   \code{\link{ts_match_names}()}.
 24 | #' @param ref_taxonomy Dataframe; reference taxonomic data adhering to the
 25 | #'   [Darwin Core standard](https://dwc.tdwg.org/terms/#taxon) with the
 26 | #'   following columns:
 27 | #' - `taxonID`: [Unique identifier for each taxon](https://dwc.tdwg.org/terms/#dwc:taxonID).
 28 | #' - `acceptedNameUsageID`: If the taxon is a synonym, the [unique identifier for the accepted name](https://dwc.tdwg.org/terms/#dwc:acceptedNameUsageID)
 29 | #' - `taxonomicStatus`: [The status of the use of the `scientificName` as a label for the taxon](https://dwc.tdwg.org/terms/#dwc:taxonomicStatus).
 30 | #' - `scientificName`: [The full scientific name of the taxon](https://dwc.tdwg.org/terms/#dwc:scientificName),
 31 | #' with authorship and date information if known.
 32 | #' @param max_dist Max Levenshtein distance to allow during fuzzy matching
 33 | #' (total insertions, deletions and substitutions). Default: 10.
 34 | #' @param match_no_auth Logical; If no author is given in the query and the name
 35 | #' (without author) occurs only once in the reference, accept the name in the
 36 | #' reference as a match. Default: to not allow such a match (`FALSE`).
 37 | #' @param match_canon Logical; Allow a "canonical name" match if only the genus,
 38 | #' species epithet, and infraspecific epithet (if present) match exactly.
 39 | #' Default: to not allow such a match (`FALSE`).
 40 | #' @param collapse_infra Logical; if the specific epithet and infraspecific
 41 | #' epithet are the same, drop the infraspecific rank and epithet from the query.
 42 | #' For more information, see \code{\link{ts_match_names}()}.
 43 | #' @param collapse_infra_exclude Character vector; taxonomic names to exclude
 44 | #' collapsing with `collapse_infra`. Any names used must match those in `query`
 45 | #' exactly, or they won't be excluded.
 46 | #' @param docker Logical; if TRUE, docker will be used to run taxon-tools
 47 | #' (so that taxon-tools need not be installed).
 48 | #' @param tbl_out Logical vector of length 1; should a tibble be returned?
 49 | #' If `FALSE` (default), output will be a data.frame. This argument can
 50 | #' be controlled via the option `ts_tbl_out`; see Examples.
 51 | #'
 52 | #' @return Dataframe; results of resolving synonyms in matched taxonomic names.
 53 | #' Includes the following columns:
 54 | #' - `query`: Query name
 55 | #' - `resolved_name`: Accepted name after resolving synonyms
 56 | #' - `matched_name`: Name matched to query
 57 | #' - `resolved_status`: Taxonomic status of the resolved name (same as `taxonomicStatus` in `ref_taxonomy`)
 58 | #' - `matched_status`: Taxonomic status of the matched name (same as `taxonomicStatus` in `ref_taxonomy`)
 59 | #' - `match_type`: Type of match (for a summary of match types, [see taxon-tools manual](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes))
 60 | #'
 61 | #' Names that could not be matched, or that could not be resolved to a single
 62 | #' accepted name, have `NA` for `resolved_name`.
 63 | #'
 64 | #' @autoglobal
 65 | #' @export
 66 | #' @examples
 67 | #' if (ts_tt_installed()) {
 68 | #'   # Load reference taxonomy in Darwin Core format
 69 | #'   data(filmy_taxonomy)
 70 | #'
 71 | #'   ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
 72 | #'   # If you always want tibble output without specifying `tbl_out = TRUE`
 73 | #'   # every time, set the option:
 74 | #'   options(ts_tbl_out = TRUE)
 75 | #'   ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
 76 | #' }
 77 | #'
 78 | ts_resolve_names <- function(
 79 |   query,
 80 |   ref_taxonomy,
 81 |   max_dist = 10,
 82 |   match_no_auth = FALSE,
 83 |   match_canon = FALSE,
 84 |   collapse_infra = FALSE,
 85 |   collapse_infra_exclude = NULL,
 86 |   docker = getOption("ts_docker", default = FALSE),
 87 |   tbl_out = getOption("ts_tbl_out", default = FALSE)
 88 | ) {
 89 |   # Check input
 90 |   assertthat::assert_that(
 91 |     # Scalar `||`: both operands are single logical values
 92 |     is.character(query) || inherits(query, "data.frame"),
 93 |     msg = "query must be of class 'data.frame' or a character vector"
 94 |   )
 95 |   assertthat::assert_that(
 96 |     inherits(ref_taxonomy, "data.frame"),
 97 |     msg = "ref_taxonomy must be of class 'data.frame'"
 98 |   )
 99 |   # Fail early with a clear message if a column used below is absent, rather
100 |   # than erroring later inside a join, filter, or select
101 |   required_cols <- c(
102 |     "taxonID",
103 |     "acceptedNameUsageID",
104 |     "taxonomicStatus",
105 |     "scientificName"
106 |   )
107 |   missing_cols <- setdiff(required_cols, colnames(ref_taxonomy))
108 |   assertthat::assert_that(
109 |     length(missing_cols) == 0,
110 |     msg = paste0(
111 |       "ref_taxonomy is missing required column(s): ",
112 |       paste(missing_cols, collapse = ", ")
113 |     )
114 |   )
115 |   assertthat::assert_that(assertthat::is.flag(tbl_out))
116 |   assertthat::assert_that(assertthat::is.flag(docker))
117 |   if (!is.null(collapse_infra_exclude)) {
118 |     assertthat::assert_that(is.character(collapse_infra_exclude))
119 |   }
120 | 
121 |   # If needed, match names first. The checks above guarantee `query` is
122 |   # either a character vector or a data frame, so no third branch is needed.
123 |   if (is.character(query)) {
124 |     match_results <- ts_match_names(
125 |       query = query,
126 |       reference = unique(ref_taxonomy$scientificName),
127 |       max_dist = max_dist,
128 |       match_no_auth = match_no_auth,
129 |       match_canon = match_canon,
130 |       collapse_infra = collapse_infra,
131 |       collapse_infra_exclude = collapse_infra_exclude,
132 |       simple = TRUE,
133 |       docker = docker
134 |     )
135 |   } else {
136 |     match_results <- query
137 |   }
138 | 
139 |   # Classify results of matching
140 |   match_results_classified_with_taxonomy <-
141 |     match_results %>%
142 |     ts_classify_result() %>%
143 |     dplyr::select(query, reference, match_type, result_type) %>%
144 |     dplyr::left_join(ref_taxonomy, by = c(reference = "scientificName"))
145 | 
146 |   # Separate out single matches to an accepted name (success type 1)
147 |   accepted_single_match <-
148 |     match_results_classified_with_taxonomy %>%
149 |     # Consider a name accepted if it has no acceptedNameUsageID, or if its
150 |     # acceptedNameUsageID is the same as its taxonID
151 |     dplyr::filter(
152 |       (is.na(acceptedNameUsageID) |
153 |         acceptedNameUsageID == "" |
154 |         taxonID == acceptedNameUsageID) &
155 |         result_type == "single_match"
156 |     ) %>%
157 |     dplyr::select(
158 |       query,
159 |       resolved_name = reference,
160 |       matched_name = reference,
161 |       resolved_status = taxonomicStatus,
162 |       matched_status = taxonomicStatus,
163 |       match_type
164 |     )
165 | 
166 |   # Separate out matches to a single synonym (success type 2)
167 |   # NOTE(review): rows are not filtered on `result_type` here, so a query with
168 |   # several matches can still succeed if its synonym rows all resolve to one
169 |   # name -- confirm this is intended
170 |   accepted_single_synonyms <-
171 |     match_results_classified_with_taxonomy %>%
172 |     # Consider synonym anything with acceptedNameUsageID not matching taxonID
173 |     dplyr::filter(!is.na(acceptedNameUsageID)) %>%
174 |     dplyr::filter(acceptedNameUsageID != "") %>%
175 |     dplyr::filter(acceptedNameUsageID != taxonID) %>%
176 |     # Join resolved names via synonym
177 |     dplyr::left_join(
178 |       dplyr::select(
179 |         ref_taxonomy,
180 |         taxonID,
181 |         resolved_name = scientificName,
182 |         resolved_status = taxonomicStatus
183 |       ),
184 |       by = c(acceptedNameUsageID = "taxonID")
185 |     ) %>%
186 |     dplyr::select(
187 |       query,
188 |       resolved_name,
189 |       matched_name = reference,
190 |       resolved_status,
191 |       matched_status = taxonomicStatus,
192 |       match_type
193 |     ) %>%
194 |     dplyr::group_by(query) %>%
195 |     # Add count of number of resolved, accepted names per query
196 |     dplyr::mutate(n = dplyr::n_distinct(resolved_name)) %>%
197 |     dplyr::ungroup() %>%
198 |     # Only keep those that resolve to the same name
199 |     dplyr::filter(n == 1) %>%
200 |     dplyr::select(-n)
201 | 
202 |   # Combine name resolution successes
203 |   success <- dplyr::bind_rows(accepted_single_match, accepted_single_synonyms)
204 | 
205 |   # Anything else is a failure. These rows carry no resolved_name or
206 |   # resolved_status, so bind_rows() below fills those columns with NA.
207 |   failure <-
208 |     match_results_classified_with_taxonomy %>%
209 |     dplyr::select(
210 |       query,
211 |       match_type,
212 |       matched_status = taxonomicStatus,
213 |       matched_name = reference
214 |     ) %>%
215 |     dplyr::anti_join(success, by = "query")
216 | 
217 |   # Combine into final results
218 |   results <- dplyr::bind_rows(success, failure) %>%
219 |     # Sanity check: the set of queries going out equals the set coming in
220 |     assertr::verify(all(query %in% match_results$query)) %>%
221 |     assertr::verify(all(match_results$query %in% query)) %>%
222 |     dplyr::select(
223 |       query,
224 |       resolved_name,
225 |       matched_name,
226 |       resolved_status,
227 |       matched_status,
228 |       match_type
229 |     )
230 | 
231 |   # Return as tibble or dataframe
232 |   if (isTRUE(tbl_out)) return(tibble::as_tibble(results))
233 | 
234 |   results
235 | }
236 | 


--------------------------------------------------------------------------------
/vignettes/basics.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "The basics"
  3 | output: rmarkdown::html_vignette
  4 | vignette: >
  5 |   %\VignetteIndexEntry{The basics}
  6 |   %\VignetteEncoding{UTF-8}
  7 |   %\VignetteEngine{knitr::rmarkdown}
  8 | editor_options: 
  9 |   chunk_output_type: console
 10 | ---
 11 | 
 12 | ```{r, include = FALSE}
 13 | knitr::opts_chunk$set(
 14 |   collapse = TRUE,
 15 |   comment = "#>"
 16 | )
 17 | ```
 18 | 
 19 | This vignette explains the three basic steps of the taxonomic name resolution workflow, which consist of:
 20 | 
 21 | 1. Name parsing
 22 | 2. Name matching
 23 | 3. Name resolution
 24 | 
 25 | ## Setup
 26 | 
 27 | We'll start by loading `taxastand`. For more information on installing `taxastand`, see [here](https://joelnitta.github.io/taxastand/index.html#installation).
 28 | 
 29 | ```{r setup}
 30 | library(taxastand)
 31 | ```
 32 | 
 33 | ## Name parsing
 34 | 
 35 | In R, scientific names are often just stored as character vectors (strings). For example,
 36 | 
 37 | ```{r example-name}
 38 | example_name <- "Crepidomanes minutum (Bl.) K. Iwats."
 39 | ```
 40 | 
 41 | However, such a name actually consists of several distinct parts:
 42 | 
 43 | ```
 44 | "Crepidomanes minutum (Bl.) K. Iwats."
 45 | ------------- ------- ---------------
 46 |       |         |          |
 47 |     genus    specific    author
 48 |              epithet
 49 | ```
 50 | 
 51 | Furthermore, in the case of this name, it was originally named by Blume (`(Bl.)`), then transferred to a different genus by Iwatsuki (`K. Iwats.`).
 52 | 
 53 | When working with taxonomic names, it can be useful to **parse** the name into its component parts. That is what `ts_parse_names()` does. It takes a character vector as input and returns a dataframe:
 54 | 
 55 | ```{r parse-example}
 56 | ts_parse_names(example_name)
 57 | ```
 58 | 
 59 | The first column, `name`, is the original input name. `id` is a unique identifier attached to the name. The rest of the columns are [the parsed components of the name](https://joelnitta.github.io/taxastand/reference/ts_parse_names.html#value).
 60 | 
 61 | Note that the [name parsing algorithm](https://github.com/camwebb/taxon-tools#parsenames) used by `taxastand` is case-sensitive! It assumes that the [standard capitalization of scientific names](https://en.wikipedia.org/wiki/Binomial_nomenclature#Writing_binomial_names) is being used: genus is capitalized, specific epithet is lower case, author is capitalized as a proper noun, etc. **Name parsing probably won't work without this type of capitalization.**
 62 | 
 63 | Now that we've parsed a name, in the next section we will see why this is useful for matching names to each other.
 64 | 
 65 | ## Name matching
 66 | 
 67 | One reason that name parsing is important is that some scientific names may differ only in certain components.
 68 | 
 69 | For example, the species [*Hymenophyllum pectinatum*](https://www.tropicos.org/name/Search?name=Hymenophyllum%20pectinatum) actually corresponds to two different scientific names with different authors, *Hymenophyllum pectinatum* Nees & Blume and *Hymenophyllum pectinatum* Cav.
 70 | 
 71 | We can see this by querying the name:
 72 | 
 73 | ```{r match-example-1}
 74 | ts_match_names(
 75 |   "Hymenophyllum pectinatum", 
 76 |   c("Hymenophyllum pectinatum Nees & Blume", 
 77 |     "Hymenophyllum pectinatum Cav."), 
 78 |   simple = TRUE)
 79 | ```
 80 | 
 81 | `ts_match_names()` matches both scientific names[^1], because the algorithm can't distinguish between them without additional information. So **it is almost always better to include the taxonomic author in the query**, to distinguish between such cases.
 82 | 
 83 | [^1]: Note that `ts_match_names()` did the name parsing by calling `ts_parse_names()` for us internally. This is usually fine, but it can also take parsed names (dataframes) produced by `ts_parse_names()` as input to either `query` or `reference`.
 84 | 
 85 | However, there can be quite a bit of variation in how authors are recorded. Sometimes names are abbreviated to different lengths, or the basionym author (an author name in parentheses) might get left out by accident, etc. The algorithm used by `taxastand` can account for this (to a point). Here is an example where the query lacks a basionym author:
 86 | 
 87 | ```{r match-example-2}
 88 | ts_match_names(
 89 |   "Hymenophyllum taiwanense C. V. Morton", 
 90 |   c("Hymenophyllum taiwanense (Tagawa) C. V. Morton", 
 91 |     "Hymenophyllum taiwanense De Vol"), 
 92 |   simple = TRUE)
 93 | ```
 94 | 
 95 | The name matching algorithm was able to narrow the match down to `Hymenophyllum taiwanense (Tagawa) C. V. Morton` even though the query lacked `(Tagawa)`. Furthermore, the `match_type` tells us how the matching was done: `auto_basio-` means an automatic match based on excluding the basionym author from the reference. **It is recommended to always check any results that weren't identical** (`exact`) to verify that the matching algorithm worked correctly, especially for fuzzy matches (`auto_fuzzy`). 
 96 | 
 97 | Here is a summary of the values taken by `match_type` from [`taxon-tools`](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes):
 98 | 
 99 | - `exact`: Exact match to all parts of the name (genus hybrid marker, genus name, species hybrid marker, species epithet, infraspecific rank signifier, infraspecific rank, author string).
100 | - `auto_punct`: Exact match to all parts of the name after removing mis-matching spaces, periods, non-ASCII author name characters, etc.
101 | - `auto_noauth` (only applies if `match_no_auth` is `TRUE`): Match between a query lacking an author and a reference name lacking an author that occurs only once in the reference.
102 | - `auto_basio-`: Match after excluding the basionym author from the reference. For example, `Cardaminopsis umbrosa Czerep.` vs. `Cardaminopsis umbrosa (Turcz.) Czerep.`; the basionym author is `(Turcz.)`.
103 | - `auto_basio+`: Match after excluding the basionym author from the query.
104 | - `auto_in-`: Match after excluding all *in* elements from reference. An *in* element refers to phrases such as `Tagawa in Morton`. The version excluding *in* elements is `Tagawa`.
105 | - `auto_in+`: Match after excluding all *in* elements from query.
106 | - `auto_ex-`: Match after excluding all *in* and *ex* elements from reference. An *ex* element refers to phrases such as `Rändel ex D.F.Murray`. The version excluding *ex* elements is `Rändel`.
107 | - `auto_ex+`: Match after excluding all *in* and *ex* elements from query.
108 | - `auto_basexin`: Match after excluding all basionym authors and all *in* and *ex* elements from query and reference.
109 | - `auto_irank`: Match where all elements agree except for infraspecific rank.
110 | - `auto_fuzzy`: Fuzzy match; match between scientific names allowed up to threshold given by `max_dist`, the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) including total insertions, deletions and substitutions.
111 | - `cfonly`: Match by "canonical form", i.e., genus plus specific epithet plus infraspecific epithet (if present), not including the infraspecific specifier ("subsp.", etc.).
112 | - `no_match`: No match detected.
113 | 
114 | The matching algorithm will prefer match codes higher in the list; so if a name could be matched both by `auto_punct` and `auto_fuzzy`, it will be matched based on `auto_punct`[^2].
115 | 
116 | [^2]: The algorithm used by `taxastand` is optimized for plants, algae, and fungi, which vary in their [taxonomic rules](https://www.iapt-taxon.org/nomen/main.php) somewhat from animals. For example, plants include basionym authors in parentheses followed by the combination author, and typically don't include the year, whereas animals normally include the year and may not provide the combination author.
117 | 
118 | ## Name resolution
119 | 
120 | Name resolution refers to the process of mapping a query name to its standard version. This could just be accounting for orthographic variations, or it could involve resolving synonyms: different names that actually refer to the same species.
121 | 
122 | In order to conduct name resolution, we require a **taxonomic standard** in the form of a dataframe. `taxastand` requires that the taxonomic standard conform to [Darwin Core standards](https://dwc.tdwg.org/). There are many sources of taxonomic data online, including [GBIF](https://www.gbif.org/en/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c), [Catalog of Life](http://www.catalogueoflife.org/), and [ITIS](https://www.itis.gov/) among others.
123 | 
124 | `taxastand` comes supplied with an example taxonomic standard for filmy ferns (family Hymenophyllaceae):
125 | 
126 | ```{r name-res-example-1}
127 | # Load example reference taxonomy in Darwin Core format
128 | data(filmy_taxonomy)
129 | 
130 | # Take a look at the columns used by taxastand
131 | head(filmy_taxonomy[c("taxonID", "acceptedNameUsageID", "taxonomicStatus", "scientificName")])
132 | ```
133 | 
134 | Here, `taxonID` is a unique identifier for each taxonomic name. `acceptedNameUsageID` only applies in the case of synonyms: it tells us the `taxonID` of the accepted name corresponding to that synonym. `taxonomicStatus` describes the status of the name, typically either as an accepted name, synonym, or something else ("dubious", etc.). Finally, the `scientificName` is the full scientific name, preferably with the author.
135 | 
136 | In its most simple usage, `ts_resolve_names()` can take as input a character vector to `query`, and provide the resolved name in the taxonomic standard (`reference`):
137 | 
138 | ```{r name-res-example-2}
139 | ts_resolve_names("Gonocormus minutum", filmy_taxonomy)
140 | ```
141 | 
142 | In this case, the query, `Gonocormus minutum`, was a misspelled name that is actually a synonym of *Crepidomanes minutum* (Bl.) K. Iwats. Under the hood, `ts_resolve_names()` is calling both `ts_parse_names()` and `ts_match_names()` to do parsing and matching steps before name resolution[^3].
143 | 
144 | [^3]: You can pass the output of `ts_match_names()` to the `query` input of `ts_resolve_names()` if you want to see the matching results first.
145 | 
146 | However, when used this way, `ts_resolve_names()` may not be able to provide a resolved name if the input is not matched unambiguously:
147 | 
148 | ```{r name-res-example-3}
149 | t_bifid_res <- ts_resolve_names("Trichomanes bifidum", filmy_taxonomy)
150 | head(t_bifid_res)
151 | dim(t_bifid_res)
152 | ```
153 | 
154 | In this case, name resolution using the default settings produced `r nrow(t_bifid_res)` possible answers! That is obviously far too many. Let's try to adjust the arguments and see if we can reduce the output:
155 | 
156 | ```{r name-res-example-4}
157 | ts_resolve_names(
158 |   "Trichomanes bifidum", filmy_taxonomy, 
159 |   match_no_auth = TRUE, match_canon = TRUE, max_dist = 5)
160 | ```
161 | 
162 | By allowing matches without the author name (we probably should have done that anyways, since the query lacked an author) and lowering the fuzzy match threshold, we are able to greatly reduce the number of possible resolved names.
163 | 
164 | Name resolution workflows typically involve tweaking these arguments to resolve a maximum number of names automatically, followed by some amount of manual edits to the remaining resolved names.
165 | 
166 | A benefit of `taxastand` is that, if during the name resolution workflow we discover mistakes in the reference database, the reference database can be edited so that the query names resolve correctly (this is not possible with packages that rely on querying a remote taxonomic database that can't be modified by the user).
167 | 
168 | ## Conclusion
169 | 
170 | This vignette illustrated the typical steps involved in name resolution with `taxastand` on some trivial examples. In another vignette, I will provide a more realistic example with a larger dataset.
171 | 


--------------------------------------------------------------------------------
/R/ts_match_names.R:
--------------------------------------------------------------------------------
  1 | #' Match taxonomic names to a reference
  2 | #'
  3 | #' Allows for orthographic differences between query and reference by using
  4 | #' fuzzy matching on parsed taxonomic names. Requires
  5 | #' [taxon-tools](https://github.com/camwebb/taxon-tools) to be installed.
  6 | #'
  7 | #' `taxon-tools` matches names in two steps:
  8 | #' 1. Scientific names are parsed into their component parts (genus, species,
  9 | #' variety, author, etc).
 10 | #' 2. Names are fuzzily matched following taxonomic rules using the component
 11 | #' parts.
 12 | #'
 13 | #' For more information on rules used for matching, [see taxon-tools manual](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes).
 14 | #'
 15 | #' Parsing is fairly fast (much faster than matching) but can take some time if
 16 | #' the number of names is very large. If multiple queries will be made (e.g., to
 17 | #' the same large reference database), it is recommended to first parse the
 18 | #' names using \code{\link{ts_parse_names}()}, and use the results as input to
 19 | #' `query` and/or `reference`.
 20 | #'
 21 | #' `collapse_infra` is useful in situations where the reference database does
 22 | #' not use names that have the same specific epithet and infraspecific epithet.
 23 | #' For example, reference name "Blechnum lunare" and query "Blechnum lunare var.
 24 | #' lunare". In this case, if `collapse_infra` is `TRUE`, "Blechnum lunare" will
 25 | #' be queried instead of "Blechnum lunare var. lunare". Note that the
 26 | #' `match_type` will be "exact" even though the literal query and the matched
 27 | #' name are different (see example below).
 28 | #'
 29 | #' @param query Character vector or dataframe; taxonomic names to be queried.
 30 | #' If a character vector, missing values not allowed and all values must be
 31 | #' unique.
 32 | #' If a dataframe, should be taxonomic names parsed with
 33 | #' \code{\link{ts_parse_names}()}.
 34 | #' @param reference  Character vector or dataframe; taxonomic names to use as
 35 | #' reference. If a character vector, missing values not allowed and all values
 36 | #' must be unique. If a dataframe, should be taxonomic names parsed with
 37 | #' \code{\link{ts_parse_names}()}.
 38 | #' @param manual_match Optional. Dataframe of manually matched names that will
 39 | #' override any results from `taxon-tools`. Must include columns, `query`
 40 | #' and `match`. Can only be used if `query` is a character vector.
 41 | #' @param max_dist Max Levenshtein distance to allow during fuzzy matching
 42 | #' (total insertions, deletions and substitutions). Default: 10.
 43 | #' @param match_no_auth Logical; If no author is given in the query and the name
 44 | #' (without author) occurs only once in the reference, accept the name in the
 45 | #' reference as a match. Default: to not allow such a match (`FALSE`).
 46 | #' @param match_canon Logical; Allow a "canonical name" match if only the genus,
 47 | #' species epithet, and infraspecific epithet (if present) match exactly.
 48 | #' Default: to not allow such a match (`FALSE`).
 49 | #' @param collapse_infra Logical; if the specific epithet and infraspecific
 50 | #' epithet are the same, drop the infraspecific rank and epithet from the query.
 51 | #' @param collapse_infra_exclude Character vector; taxonomic names to exclude
 52 | #' from collapsing with `collapse_infra`. Any names used must match those in
 53 | #' `query` exactly, or they won't be excluded.
 54 | #' @param simple Logical; return the output in a simplified format with only the
 55 | #' query name, matched reference name, and match type. Default: `FALSE`.
 56 | #' @param docker Logical; if TRUE, docker will be used to run taxon-tools
 57 | #' (so that taxon-tools need not be installed).
 58 | #' @param tbl_out Logical vector of length 1; should a tibble be returned?
 59 | #' If `FALSE` (default), output will be a data.frame. This argument can
 60 | #' be controlled via the option `ts_tbl_out`; see Examples.
 61 | #'
 62 | #' @return Dataframe with the following columns (if `simple` is `FALSE`):
 63 | #' - query: Query name
 64 | #' - reference: Matched reference name
 65 | #' - match_type: Type of match (for a summary of match types, [see taxon-tools manual](https://github.com/camwebb/taxon-tools/blob/master/doc/matchnames.md#matching-rules-and-output-codes))
 66 | #' - id_query: Unique ID of query
 67 | #' - id_ref: Unique ID of reference
 68 | #' - genus_hybrid_sign_query: Genus hybrid sign in query
 69 | #' - genus_name_query: Genus name of query
 70 | #' - species_hybrid_sign_query: Species hybrid sign in query
 71 | #' - specific_epithet_query: Specific epithet of query
 72 | #' - infraspecific_rank_query: Infraspecific rank of query
 73 | #' - infraspecific_epithet_query: Infraspecific epithet of query
 74 | #' - author_query: Taxonomic author of query
 75 | #' - genus_hybrid_sign_ref: Genus hybrid sign in reference
 76 | #' - genus_name_ref: Genus name of reference
 77 | #' - species_hybrid_sign_ref: Species hybrid sign in reference
 78 | #' - specific_epithet_ref: Specific epithet of reference
 79 | #' - infraspecific_rank_ref: Infraspecific rank of reference
 80 | #' - infraspecific_epithet_ref: Infraspecific epithet of reference
 81 | #' - author_ref: Taxonomic author of reference
 82 | #'
 83 | #' If `simple` is `TRUE`, only return the first three columns above.
 84 | #'
 85 | #' @autoglobal
 86 | #' @export
 87 | #' @examples
 88 | #' if(ts_tt_installed()) {
 89 | #'   ts_match_names(
 90 | #'     "Crepidomanes minutus",
 91 | #'     c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
 92 | #'     simple = TRUE
 93 | #'     )
 94 | #'
 95 | #'   # If names are too distant, they won't match
 96 | #'   ts_match_names(
 97 | #'     query = "Crepidblah foo",
 98 | #'     reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
 99 | #'     simple = TRUE
100 | #'     )
101 | #'
102 | #'   # But we can force a match manually
103 | #'   ts_match_names(
104 | #'     query = "Crepidblah foo",
105 | #'     reference = c("Crepidomanes minutum", "Hymenophyllum polyanthos"),
106 | #'     manual_match = data.frame(
107 | #'       query = c("Crepidblah foo"),
108 | #'       match = c("Crepidomanes minutum")
109 | #'     ),
110 | #'     simple = TRUE
111 | #'    )
112 | #'
113 | #'   # If you always want tibble output without specifying `tbl_out = TRUE`
114 | #'   # every time, set the option:
115 | #'   options(ts_tbl_out = TRUE)
116 | #'   ts_match_names(
117 | #'     "Crepidomanes minutus",
118 | #'     c("Crepidomanes minutum", "Hymenophyllum polyanthos")
119 | #'     )
120 | #'
121 | #'   # Example using collapse_infra argument
122 | #'   ts_match_names(
123 | #'     c("Crepidomanes minutus", "Blechnum lunare var. lunare",
124 | #'       "Blechnum lunare", "Bar foo var. foo", "Bar foo"),
125 | #'     c("Crepidomanes minutum", "Hymenophyllum polyanthos", "Blechnum lunare",
126 | #'       "Bar foo"),
127 | #'     collapse_infra = TRUE,
128 | #'     collapse_infra_exclude = "Bar foo var. foo",
129 | #'     simple = TRUE
130 | #'     )
131 | #' }
132 | #'
133 | ts_match_names <- function(
134 |   query,
135 |   reference,
136 |   manual_match = NULL,
137 |   max_dist = 10,
138 |   match_no_auth = FALSE,
139 |   match_canon = FALSE,
140 |   collapse_infra = FALSE,
141 |   collapse_infra_exclude = NULL,
142 |   simple = FALSE,
143 |   docker = getOption("ts_docker", default = FALSE),
144 |   tbl_out = getOption("ts_tbl_out", default = FALSE)
145 | ) {
146 |   # Check input
147 |   assertthat::assert_that(
148 |     is.character(query) | inherits(query, "data.frame"),
149 |     msg = "query must be of class 'data.frame' or a character vector"
150 |   )
151 |   assertthat::assert_that(
152 |     is.character(reference) | inherits(reference, "data.frame"),
153 |     msg = "reference must be of class 'data.frame' or a character vector"
154 |   )
155 |   assertthat::assert_that(assertthat::is.number(max_dist))
156 |   assertthat::assert_that(is.logical(match_no_auth))
157 |   assertthat::assert_that(is.logical(match_canon))
158 |   assertthat::assert_that(is.logical(simple))
159 |   assertthat::assert_that(assertthat::is.flag(tbl_out))
160 |   assertthat::assert_that(assertthat::is.flag(collapse_infra))
161 |   if (!is.null(collapse_infra_exclude)) {
162 |     assertthat::assert_that(is.character(collapse_infra_exclude))
163 |   }
164 |   assertthat::assert_that(assertthat::is.flag(docker))
165 |   if (!is.null(manual_match)) {
166 |     assertthat::assert_that(
167 |       isTRUE(inherits(manual_match, "data.frame")),
168 |       msg = "manual_match must be of class 'data.frame'"
169 |     )
170 |     assertthat::assert_that(
171 |       isTRUE(
172 |         all(c("query", "match") %in% colnames(manual_match))
173 |       ),
174 |       msg = "manual_match must have `query` and `match` columns"
175 |     )
176 |     assertthat::assert_that(
177 |       is.character(manual_match$query)
178 |     )
179 |     assertthat::assert_that(
180 |       is.character(manual_match$match)
181 |     )
182 |     assertthat::assert_that(
183 |       assertthat::noNA(manual_match$query)
184 |     )
185 |     assertthat::assert_that(
186 |       assertthat::noNA(manual_match$query)
187 |     )
188 |     assertthat::assert_that(
189 |       isTRUE(!any(duplicated(manual_match$query))),
190 |       msg = "All values of manual_match$query must be unique"
191 |     )
192 |     assertthat::assert_that(
193 |       is.character(query),
194 |       msg = "manual_match can only be used if query is a character vector"
195 |     )
196 |   }
197 | 
198 |   # Helper function to add a namestring to a dataframe of parsed names
199 |   add_namestring <- function(df) {
200 |     df$namestring <-
201 |       paste0(
202 |         df$genus_hybrid_sign,
203 |         df$genus_name,
204 |         df$species_hybrid_sign,
205 |         df$specific_epithet,
206 |         df$infraspecific_rank,
207 |         df$infraspecific_epithet,
208 |         df$author,
209 |         sep = "_"
210 |       )
211 |     df
212 |   }
213 | 
214 |   # Parse or load query names
215 |   if (is.character(query)) {
216 |     # Optional: for manual matches, use matched name instead of query
217 |     # to generate exact match
218 |     if (!is.null(manual_match)) {
219 |       manual_replacement_df <-
220 |         data.frame(
221 |           query_original = query
222 |         ) |>
223 |         dplyr::left_join(
224 |           dplyr::select(
225 |             manual_match,
226 |             query_original = query,
227 |             query_new = match
228 |           ),
229 |           by = "query_original",
230 |           relationship = "one-to-one"
231 |         ) |>
232 |         dplyr::mutate(
233 |           query_new = dplyr::coalesce(query_new, query_original)
234 |         )
235 |       query <- manual_replacement_df$query_new |>
236 |         unique()
237 |     }
238 |     # Parse the names (adds 'name' column)
239 |     query_parsed_df <- ts_parse_names(query, docker = docker)
240 |   } else {
241 |     # Or, names are already parsed
242 |     query_parsed_df <- query
243 |   }
244 | 
245 |   # Optionally collapse infraspecific name
246 |   if (isTRUE(collapse_infra)) {
247 |     # Save a copy of original unmodified parsed query
248 |     query_parsed_df_original <- query_parsed_df
249 |     # Identify rows where infraspecific_epithet is the same as specific_epithet
250 |     query_parsed_df$same_infra_species <-
251 |       (query_parsed_df$specific_epithet ==
252 |         query_parsed_df$infraspecific_epithet) %in%
253 |       TRUE &
254 |       !query_parsed_df$name %in% collapse_infra_exclude
255 |     assertthat::assert_that(!anyNA(query_parsed_df$same_infra_species))
256 |     # For rows where infraspecific_epithet is the same as specific_epithet,
257 |     # delete infraspecific_epithet and infraspecific_rank
258 |     query_parsed_df$infraspecific_epithet[
259 |       query_parsed_df$same_infra_species
260 |     ] <- NA
261 |     query_parsed_df$infraspecific_rank[query_parsed_df$same_infra_species] <- NA
262 |     query_parsed_df$same_infra_species <- NULL
263 |     # Account for duplicates created after collapsing names: drop them
264 |     query_parsed_df <- add_namestring(query_parsed_df) |>
265 |       dplyr::group_by(namestring) |>
266 |       dplyr::mutate(key_id = dplyr::first(id)) |>
267 |       dplyr::ungroup()
268 |     id_map <- dplyr::select(query_parsed_df, id_query = key_id, id)
269 |     query_parsed_df <- query_parsed_df[
270 |       !duplicated(query_parsed_df$namestring),
271 |     ]
272 |     query_parsed_df$namestring <- NULL
273 |   }
274 | 
275 |   # Write out parsed names to temporary file
276 |   query_parsed_txt <- tempfile(
277 |     pattern = digest::digest(query),
278 |     fileext = ".txt"
279 |   )
280 |   if (fs::file_exists(query_parsed_txt)) fs::file_delete(query_parsed_txt)
281 |   ts_write_names(query_parsed_df, query_parsed_txt)
282 | 
283 |   # Parse or load reference names
284 |   if (is.character(reference)) {
285 |     # Parse the names (adds 'name' column)
286 |     ref_parsed_df <- ts_parse_names(reference, docker = docker)
287 |   } else {
288 |     # Or, names are already parsed
289 |     ref_parsed_df <- reference
290 |   }
291 | 
292 |   # Check that manually matched ref names are in data
293 |   if (!is.null(manual_match)) {
294 |     assertthat::assert_that(
295 |       isTRUE(all(manual_match$match %in% ref_parsed_df$name)),
296 |       msg = "One or more manually matched reference names not in reference data"
297 |     )
298 |   }
299 | 
300 |   # Write out parsed names to temporary file
301 |   ref_parsed_txt <- tempfile(
302 |     pattern = digest::digest(reference),
303 |     fileext = ".txt"
304 |   )
305 |   if (fs::file_exists(ref_parsed_txt)) fs::file_delete(ref_parsed_txt)
306 |   ts_write_names(ref_parsed_df, ref_parsed_txt)
307 | 
308 |   # Format argument flags
309 |   if (match_no_auth) match_no_auth <- "-1" else match_no_auth <- NULL
310 |   if (match_canon) match_canon <- "-c" else match_canon <- NULL
311 | 
312 |   # Specify temporary output file
313 |   match_results_txt <- tempfile(
314 |     pattern = digest::digest(c(query, reference)),
315 |     fileext = ".txt"
316 |   )
317 |   if (fs::file_exists(match_results_txt)) fs::file_delete(match_results_txt)
318 | 
319 |   # Run taxon-tools matchnames
320 | 
321 |   if (isTRUE(docker)) {
322 |     assertthat::assert_that(
323 |       requireNamespace("babelwhale", quietly = TRUE),
324 |       msg = "babelwhale needs to be installed to use docker"
325 |     )
326 |     assertthat::assert_that(
327 |       babelwhale::test_docker_installation(),
328 |       msg = "docker not installed"
329 |     )
330 |     match_results <- run_auto_mount(
331 |       container_id = "camwebb/taxon-tools:v1.3.0",
332 |       command = "matchnames",
333 |       args = c(
334 |         "-a",
335 |         file = query_parsed_txt,
336 |         "-b",
337 |         file = ref_parsed_txt,
338 |         "-o",
339 |         file = match_results_txt,
340 |         "-e",
341 |         max_dist,
342 |         "-F", # no manual matching
343 |         match_no_auth,
344 |         match_canon
345 |       )
346 |     )
347 |   } else {
348 |     assertthat::assert_that(
349 |       ts_tt_installed(),
350 |       msg = "taxon-tools not installed"
351 |     )
352 |     match_results <- processx::run(
353 |       command = "matchnames",
354 |       args = c(
355 |         "-a",
356 |         query_parsed_txt,
357 |         "-b",
358 |         ref_parsed_txt,
359 |         "-o",
360 |         match_results_txt,
361 |         "-e",
362 |         max_dist,
363 |         "-F", # no manual matching
364 |         match_no_auth,
365 |         match_canon
366 |       )
367 |     )
368 |   }
369 | 
370 |   # Read in results
371 |   # Each line represents a single name from the query list (list A).
372 |   # Seventeen pipe-delimited (“|”) fields per row:
373 |   #  1. User ID code in list A,
374 |   #  2. Code in list B (if matched),
375 |   #  3. Match type (see codes below),
376 |   #  4-10. Parsed elements of name in list A.
377 |   #  11-17 (in same format as name input), Parsed elements of name in list B.
378 |   matchnames_cols <- c(
379 |     "id_query",
380 |     "id_ref",
381 |     "match_type",
382 |     "genus_hybrid_sign_query",
383 |     "genus_name_query",
384 |     "species_hybrid_sign_query",
385 |     "specific_epithet_query",
386 |     "infraspecific_rank_query",
387 |     "infraspecific_epithet_query",
388 |     "author_query",
389 |     "genus_hybrid_sign_ref",
390 |     "genus_name_ref",
391 |     "species_hybrid_sign_ref",
392 |     "specific_epithet_ref",
393 |     "infraspecific_rank_ref",
394 |     "infraspecific_epithet_ref",
395 |     "author_ref"
396 |   )
397 | 
398 |   results <- data.frame(record = readLines(match_results_txt))
399 | 
400 |   results <- tidyr::separate(
401 |     data = results,
402 |     col = record,
403 |     into = matchnames_cols,
404 |     sep = "\\|",
405 |     fill = "right",
406 |     remove = TRUE
407 |   )
408 | 
409 |   # Convert empty strings to NA
410 |   results <- dplyr::mutate(
411 |     results,
412 |     dplyr::across(dplyr::everything(), ~ dplyr::na_if(.x, ""))
413 |   )
414 | 
415 |   # Add back in the original search terms (query and reference)
416 |   results <- dplyr::left_join(
417 |     results,
418 |     dplyr::select(query_parsed_df, id_query = id, query = name),
419 |     by = "id_query"
420 |   )
421 | 
422 |   results <- dplyr::left_join(
423 |     results,
424 |     dplyr::select(ref_parsed_df, id_ref = id, reference = name),
425 |     by = "id_ref"
426 |   )
427 | 
428 |   results <- dplyr::select(
429 |     results,
430 |     query,
431 |     reference,
432 |     match_type,
433 |     dplyr::everything()
434 |   )
435 | 
436 |   # Add back in names that were duplicated due to collapsed infrasp names
437 |   if (isTRUE(collapse_infra)) {
438 |     results <-
439 |       dplyr::select(
440 |         query_parsed_df_original,
441 |         query = name,
442 |         id
443 |       ) |>
444 |       dplyr::left_join(id_map, by = "id") |>
445 |       dplyr::left_join(
446 |         dplyr::select(results, -query),
447 |         by = "id_query"
448 |       ) |>
449 |       dplyr::select(-id) |>
450 |       dplyr::select(query, reference, match_type, dplyr::everything())
451 |   }
452 | 
453 |   # For manual matches, restore back to original query input, and specify
454 |   # that match was made manually
455 |   if (!is.null(manual_match)) {
456 |     results <-
457 |       manual_replacement_df |>
458 |       dplyr::inner_join(
459 |         results,
460 |         by = dplyr::join_by(query_new == query)
461 |       ) |>
462 |       dplyr::mutate(
463 |         match_type = dplyr::case_when(
464 |           query_original %in% manual_match$query ~ "manual",
465 |           .default = match_type
466 |         )
467 |       ) |>
468 |       dplyr::select(
469 |         query = query_original,
470 |         dplyr::everything()
471 |       )
472 |   }
473 | 
474 |   if (simple == TRUE)
475 |     results <- dplyr::select(results, query, reference, match_type)
476 | 
477 |   if (isTRUE(tbl_out)) return(tibble::as_tibble(results))
478 | 
479 |   results
480 | }
481 | 


--------------------------------------------------------------------------------