├── test ├── testpack.R ├── autotest.R ├── vgtest.R └── smoke.R ├── pkg ├── vignettes │ ├── stringdist_api.pdf │ ├── loo2014stringdist.pdf │ ├── stringdist_C-Cpp_api.Rnw │ └── RJournal_6_111-122-2014.Rnw ├── src │ ├── Makevars │ ├── Makevars.win │ ├── qtree.h │ ├── dictionary.h │ ├── dist.h │ ├── hamming.c │ ├── utils.c │ ├── lcs.c │ ├── R_register_native.c │ ├── stringdist.h │ ├── lv.c │ ├── osa.c │ ├── jaro.c │ ├── utils.h │ ├── dl.c │ ├── stringdist.c │ ├── soundex.c │ ├── utf8ToInt.c │ └── Rstringdist.c ├── tests │ └── tinytest.R ├── inst │ ├── tinytest │ │ ├── test_gh_issue_88.R │ │ ├── test_gh_issue_78.R │ │ ├── test_gh_issue_59.R │ │ ├── test_phonetic.R │ │ ├── test_qgrams.R │ │ ├── test_seq_dist.R │ │ ├── test_afind.R │ │ ├── test_stringsim.R │ │ └── test_amatch.R │ ├── CITATION │ └── include │ │ └── stringdist_api.h ├── R │ ├── doc_api.R │ ├── phonetic.R │ ├── doc_parallel.R │ ├── utils.R │ ├── doc_encoding.R │ ├── qgrams.R │ ├── seqdist.R │ ├── afind.R │ ├── stringsim.R │ ├── doc_metrics.R │ ├── amatch.R │ └── stringdist.R ├── NAMESPACE ├── DESCRIPTION ├── README.md └── NEWS ├── examples ├── seq_qgrams.R ├── seq_sim.R ├── phonetic.R ├── stringsim.R ├── printable_ascii.R ├── amatch.R ├── seq_amatch.R ├── qgrams.R ├── seq_dist.R └── stringdist.R ├── .gitignore ├── compile.sh ├── ubsan.sh ├── test.r ├── Makefile ├── drat.sh └── README.md /test/testpack.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | test_package('stringdist') 3 | 4 | -------------------------------------------------------------------------------- /pkg/vignettes/stringdist_api.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markvanderloo/stringdist/HEAD/pkg/vignettes/stringdist_api.pdf -------------------------------------------------------------------------------- /pkg/vignettes/loo2014stringdist.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markvanderloo/stringdist/HEAD/pkg/vignettes/loo2014stringdist.pdf -------------------------------------------------------------------------------- /pkg/src/Makevars: -------------------------------------------------------------------------------- 1 | 2 | PKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS) 3 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) 4 | PKG_CPPFLAGS = -I../inst/include 5 | -------------------------------------------------------------------------------- /pkg/tests/tinytest.R: -------------------------------------------------------------------------------- 1 | if ( requireNamespace("tinytest", quietly=TRUE) ){ 2 | tinytest::test_package("stringdist") 3 | } 4 | 5 | 6 | -------------------------------------------------------------------------------- /pkg/src/Makevars.win: -------------------------------------------------------------------------------- 1 | 2 | PKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS) 3 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) 4 | PKG_CPPFLAGS = -I../inst/include 5 | -------------------------------------------------------------------------------- /test/autotest.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | 3 | 4 | dyn.load("../pkg/src/stringdist.so") 5 | auto_test("../pkg/R", "../pkg/tests/testthat") 6 | 7 | -------------------------------------------------------------------------------- /pkg/inst/tinytest/test_gh_issue_88.R: -------------------------------------------------------------------------------- 1 | options(sd_num_thread=1L) 2 | 3 | x <- c("ca", "abc", "cba") 4 | expect_equal(stringsimmatrix(x), t(stringsimmatrix(x))) 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /examples/seq_qgrams.R: -------------------------------------------------------------------------------- 1 | 2 | # compare the 2-gram overlap between sequences 1:3 and 2:4 3 | seq_qgrams(x = 1:3, y=2:4,q=2) 4 | 5 | # behavior when NA's are present. 6 | seq_qgrams(1:3,c(1,NA,2),NA_integer_) 7 | -------------------------------------------------------------------------------- /pkg/vignettes/stringdist_C-Cpp_api.Rnw: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pdfpages} 3 | %\VignetteIndexEntry{stringdist C/C++ API} 4 | 5 | \begin{document} 6 | \includepdf[pages=-, fitpaper=true]{stringdist_api.pdf} 7 | \end{document} 8 | -------------------------------------------------------------------------------- /examples/seq_sim.R: -------------------------------------------------------------------------------- 1 | L1 <- list(1:3,2:4) 2 | L2 <- list(1:3) 3 | seq_sim(L1,L2,method="osa") 4 | 5 | # note how missing values are handled (L2 is recycled over L1) 6 | L1 <- list(c(1L,NA_integer_,3L),2:4,NA_integer_) 7 | L2 <- list(1:3) 8 | seq_sim(L1,L2) 9 | 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # compiled source 2 | *.o 3 | *.so 4 | # temporary vim files 5 | *.swp 6 | 7 | # tex stuff 8 | *.pdf 9 | *.tex 10 | *.bbl 11 | *.blg 12 | *.log 13 | *.aux 14 | 15 | 16 | # ignored directories 17 | output/ 18 | pkg/man/ 19 | 20 | 21 | .Rproj.user 22 | -------------------------------------------------------------------------------- /pkg/inst/tinytest/test_gh_issue_78.R: -------------------------------------------------------------------------------- 1 | 2 | # x <- "IÑIGO", we avoid problems on Windows here. 3 | x <- intToUtf8(c(73, 209, 73, 71, 79)) 4 | 5 | expect_equal(stringdist("INIGO", x, method="lv", useBytes=FALSE),1) 6 | expect_equal(amatch("INIGO", x, method="lv",maxDist=1),1) 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /examples/phonetic.R: -------------------------------------------------------------------------------- 1 | # The following examples are from The Art of Computer Programming (part III, p. 395) 2 | # (Note that our algorithm is specified different from the one in TACP, see references.) 3 | phonetic(c('Euler','Gauss','Hilbert','Knuth','Lloyd','Lukasiewicz','Wachs'),method='soundex') 4 | 5 | 6 | -------------------------------------------------------------------------------- /examples/stringsim.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Calculate the similarity using the default method of optimal string alignment 4 | stringsim("ca", "abc") 5 | 6 | # Calculate the similarity using the Jaro-Winkler method 7 | # The p argument is passed on to stringdist 8 | stringsim('MARTHA','MATHRA',method='jw', p=0.1) 9 | 10 | -------------------------------------------------------------------------------- /examples/printable_ascii.R: -------------------------------------------------------------------------------- 1 | # define o-umlaut 2 | ouml <- intToUtf8("0x00F6") 3 | x <- c("Motorhead", paste0("Mot",ouml,"rhead")) 4 | # second element contains a non-ascii character 5 | printable_ascii(x) 6 | 7 | # Control characters (like carriage return) are also excluded 8 | printable_ascii("abc\r") 9 | 10 | 11 | -------------------------------------------------------------------------------- /pkg/src/qtree.h: -------------------------------------------------------------------------------- 1 | #ifndef SD_QTREE_H 2 | #define SD_QTREE_H 3 | 4 | /* binary tree; dictionary of qgrams */ 5 | 6 | typedef struct qnode { 7 | unsigned int *qgram; // the q-gram. 8 | double *n; // (vector of) counts. 9 | struct qnode *left; 10 | struct qnode *right; 11 | } qtree; 12 | 13 | 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /pkg/inst/tinytest/test_gh_issue_59.R: -------------------------------------------------------------------------------- 1 | # this would crash R because of over-asking memory 2 | # it depends on the system really, so we only run this at the 3 | # comfort of our home 4 | if (FALSE){ 5 | x <- paste(letters[sample(1:length(letters),32800,replace=TRUE)], collapse="") 6 | expect_error(stringdist(x,x)) 7 | } 8 | 9 | 10 | -------------------------------------------------------------------------------- /pkg/src/dictionary.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef SD_DICTIONARY_H 3 | #define SD_DICTIONARY_H 4 | 5 | /* Unsorted dictionary for dl distance */ 6 | typedef struct { 7 | // character 8 | unsigned int *key; 9 | // number found 10 | unsigned int *value; 11 | // size of dictionary 12 | unsigned int length; 13 | } dictionary; 14 | 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /pkg/inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry(bibtype = "article", 2 | title = "The stringdist package for approximate string matching" 3 | , author = person("M.P.J.", "van der Loo") 4 | , year = 2014 5 | , journal = {"The {R} {J}ournal"} 6 | , volume = 6 7 | , issue = 1 8 | , url = "https://CRAN.R-project.org/package=stringdist" 9 | , pages = {"111-122"} 10 | ) 11 | -------------------------------------------------------------------------------- /compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd pkg/src 4 | rm --verbose *.o *.so 5 | gcc-10 -std=gnu99 -I/usr/share/R/include -DNDEBUG -fpic -fopenmp -O4 -Wall -pipe -g -c -Wstrict-prototypes -Wformat *.c 6 | #gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -fpic -O2 -Wall -pipe -g -c *.c 7 | gcc -std=gnu99 -shared -o stringdist.so *.o -L/usr/lib/R/lib -lR 8 | cd ../../ 9 | 10 | -------------------------------------------------------------------------------- /pkg/R/doc_api.R: -------------------------------------------------------------------------------- 1 | #' @title 2 | #' Calling stringdist from \code{C} or \code{C++} 3 | #' 4 | #' @description 5 | #' As of version \code{0.9.5.0} several \code{C} level functions can be linked to 6 | #' and called from C code in other R packages. 7 | #' 8 | #' A description of the API can be found in \href{../doc/stringdist_api.pdf}{stringdist_api.pdf}. 9 | #' 10 | #' @name stringdist_api 11 | #' @rdname stringdist-api 12 | {} 13 | 14 | -------------------------------------------------------------------------------- /ubsan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # perform build and R CMD check with undefined behaviour sanitizer switched on. 4 | 5 | ## Thanks to Dirk for posting this command here: 6 | ## http://dirk.eddelbuettel.com/blog/2015/01/18/#ubsan-clang-container 7 | 8 | cd output 9 | 10 | R CMD build ../pkg 11 | 12 | docker run --rm -ti -v $(pwd):/mnt rocker/r-devel-ubsan-clang check.r --setwd /mnt --install-deps stringdist_*.tar.gz 13 | 14 | cd .. 15 | 16 | -------------------------------------------------------------------------------- /pkg/vignettes/RJournal_6_111-122-2014.Rnw: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pdfpages} 3 | %\VignetteIndexEntry{RJournal 6 111-122 (2014)} 4 | 5 | \begin{document} 6 | \includepdf[pages=-, fitpaper=true]{loo2014stringdist.pdf} 7 | 8 | \newpage{} 9 | \subsection*{Errata} 10 | A few things have changed after the paper was published. These changes 11 | are documented here. 12 | 13 | \begin{itemize} 14 | \item[Version 0.9.3:] $q$-gram distances are now always 0 when $q=0$ (used to be \texttt{Inf}). 15 | \end{itemize} 16 | 17 | \end{document} 18 | -------------------------------------------------------------------------------- /pkg/NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(afind) 4 | export(ain) 5 | export(amatch) 6 | export(extract) 7 | export(grab) 8 | export(grabl) 9 | export(phonetic) 10 | export(printable_ascii) 11 | export(qgrams) 12 | export(seq_ain) 13 | export(seq_amatch) 14 | export(seq_dist) 15 | export(seq_distmatrix) 16 | export(seq_qgrams) 17 | export(seq_sim) 18 | export(stringdist) 19 | export(stringdistmatrix) 20 | export(stringsim) 21 | export(stringsimmatrix) 22 | importFrom(parallel,detectCores) 23 | useDynLib(stringdist, .registration=TRUE) 24 | -------------------------------------------------------------------------------- /test/vgtest.R: -------------------------------------------------------------------------------- 1 | 2 | dyn.load("../pkg/src/stringdist.so") 3 | for ( f in dir("../pkg/R",full.names=TRUE) ) source(f) 4 | 5 | x <- paste0('Mot',intToUtf8(0x00F6),'rhead') 6 | y <- c('bastard','Motorhead') 7 | jwdist <- round(1-(1/3)*(8/9 + 8/10 + 1),3) 8 | 9 | #amatch(x, y, method='dl', maxDist=2, useBytes=TRUE) 10 | #stringdist(x[1], y[2], method='dl', useBytes=FALSE) 11 | #stringdist(x[1], y[2], method='dl', useBytes=TRUE) 12 | #stringdist("b","a" , method='dl',weight=c(1,1,0.5,1)) 13 | stringdist("b","a" , method='dl',weight=c(1,1,1,1)) 14 | #stringdist(x, y, method='osa', useBytes=TRUE) 15 | 16 | 17 | -------------------------------------------------------------------------------- /test.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | suppressPackageStartupMessages({ 4 | if (!require("docopt")) stop("docopt not installed") 5 | }) 6 | 7 | "Usage: test.r [nocovr] [snitch] 8 | 9 | nocovr Skip measuring test coverage. 10 | snitch Report lines not covered. 11 | " -> doc 12 | 13 | opt <- docopt(doc) 14 | 15 | if(!require(devtools)) stop('devtools not installed first') 16 | devtools::test('pkg') 17 | 18 | if (!opt$nocovr){ 19 | if(require(covr)){ 20 | cv <- covr::package_coverage('pkg') 21 | print(cv) 22 | if (opt$snitch) print(subset(tally_coverage(cv), value == 0),row.names=FALSE) 23 | } else { 24 | stop("covr not installed") 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /examples/amatch.R: -------------------------------------------------------------------------------- 1 | 2 | # lets see which sci-fi heroes are stringdistantly nearest 3 | amatch("leia",c("uhura","leela"),maxDist=5) 4 | 5 | # we can restrict the search 6 | amatch("leia",c("uhura","leela"),maxDist=1) 7 | 8 | # we can match each value in the find vector against values in the lookup table: 9 | amatch(c("leia","uhura"),c("ripley","leela","scully","trinity"),maxDist=2) 10 | 11 | # setting nomatch returns a different value when no match is found 12 | amatch("leia",c("uhura","leela"),maxDist=1,nomatch=0) 13 | 14 | # this is always true if maxDist is Inf 15 | ain("leia",c("uhura","leela"),maxDist=Inf) 16 | 17 | # Let's look in a neighbourhood of maximum 2 typo's (by default, the OSA algorithm is used) 18 | ain("leia",c("uhura","leela"), maxDist=2) 19 | 20 | 21 | -------------------------------------------------------------------------------- /examples/seq_amatch.R: -------------------------------------------------------------------------------- 1 | 2 | x <- list(1:3,c(3:1),c(1L,3L,4L)) 3 | table <- list( 4 | c(5L,3L,1L,2L) 5 | ,1:4 6 | ) 7 | seq_amatch(x,table,maxDist=2) 8 | 9 | # behaviour with missings 10 | seq_amatch(list(c(1L,NA_integer_,3L),NA_integer_), list(1:3),maxDist=1) 11 | 12 | 13 | \dontrun{ 14 | # Match sentences based on word order. Note: words must match exactly or they 15 | # are treated as completely different. 16 | # 17 | # For this example you need to have the 'hashr' package installed. 18 | x <- "Mary had a little lamb" 19 | x.words <- strsplit(x,"[[:blank:]]+") 20 | x.int <- hashr::hash(x.words) 21 | table <- c("a little lamb had Mary", 22 | "had Mary a little lamb") 23 | table.int <- hashr::hash(strsplit(table,"[[:blank:]]+")) 24 | seq_amatch(x.int,table.int,maxDist=3) 25 | } 26 | -------------------------------------------------------------------------------- /examples/qgrams.R: -------------------------------------------------------------------------------- 1 | 2 | qgrams('hello world',q=3) 3 | 4 | # q-grams are counted uniquely over a character vector 5 | qgrams(rep('hello world',2),q=3) 6 | 7 | # to count them separately, do something like 8 | x <- c('hello', 'world') 9 | lapply(x,qgrams, q=3) 10 | 11 | # output rows may be named, and you can pass any number of character vectors 12 | x <- "I will not buy this record, it is scratched" 13 | y <- "My hovercraft is full of eels" 14 | z <- c("this", "is", "a", "dead","parrot") 15 | qgrams(A = x, B = y, C = z,q=2) 16 | 17 | # a tonque twister, showing the effects of useBytes and useNames 18 | x <- "peter piper picked a peck of pickled peppers" 19 | qgrams(x, q=2) 20 | qgrams(x, q=2, useNames=FALSE) 21 | qgrams(x, q=2, useBytes=TRUE) 22 | qgrams(x, q=2, useBytes=TRUE, useNames=TRUE) 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | doc: 3 | R -s -e "pkgload::load_all('pkg');roxygen2::roxygenize('pkg')" 4 | 5 | pkg: doc 6 | R CMD build pkg 7 | 8 | install: pkg 9 | R CMD INSTALL *.tar.gz 10 | 11 | check: doc 12 | R CMD build pkg 13 | R CMD check *.tar.gz 14 | 15 | cran: doc 16 | R CMD build pkg 17 | R CMD check --as-cran *.tar.gz 18 | 19 | test: doc 20 | R -s -e "tinytest::build_install_test('pkg')" 21 | 22 | manual: doc 23 | R CMD Rd2pdf --force -o manual.pdf ./pkg 24 | 25 | revdep: pkg 26 | rm -rf revdep 27 | mkdir revdep 28 | mv *.tar.gz revdep 29 | R -s -e "out <- tools::check_packages_in_dir('revdep',reverse=list(which='most'),Ncpus=3); print(summary(out)); saveRDS(out, file='revdep/output.RDS')" 30 | 31 | covr: 32 | R -e 'covr::package_coverage("./pkg")' 33 | 34 | clean: 35 | rm -rf stringdist.Rcheck 36 | rm -rf revdep 37 | rm -f *.tar.gz 38 | 39 | 40 | -------------------------------------------------------------------------------- /drat.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | suppressPackageStartupMessages({ 4 | if (!require("drat")) stop("drat not installed") 5 | if (!require("docopt")) stop("docopt not installed") 6 | }) 7 | 8 | 9 | "Usage: drat.sh [commit] [--pkg FILE] [--dratrepo FOLDER] 10 | 11 | commit commit after insert? 12 | --pkg FILE The tarball to insert in the drat repo (by default the tarball in ./output) 13 | --dratrepo FOLDER path to root of drat repo [default: ../drat] 14 | " -> doc 15 | 16 | opt <- docopt(doc) 17 | 18 | stopifnot(file.exists(opt$dratrepo)) 19 | 20 | pkg <- opt$pkg 21 | if ( is.null(pkg) ){ 22 | pkg <- dir("output/",pattern = ".*tar\\.gz",full.names = TRUE) 23 | } 24 | 25 | if (!file.exists(pkg)){ 26 | stop(sprintf("%s not found",pkg)) 27 | } 28 | 29 | drat::insertPackage(pkg, repodir=opt$dratrepo, commit=opt$commit) 30 | 31 | cat(sprintf("Inserted %s into %s %s\n" 32 | , pkg 33 | , opt$dratrepo 34 | , if(opt$commit) "and committed" else "" 35 | )) 36 | -------------------------------------------------------------------------------- /pkg/inst/tinytest/test_phonetic.R: -------------------------------------------------------------------------------- 1 | options(sd_num_thread=2) 2 | 3 | ### ------------------------------------------------------------- 4 | 5 | ## Soundex 6 | 7 | testset <- "name;code 8 | Robert;R163 9 | rupert;R163 10 | Rubin;R150 11 | Ashcraft;A261 12 | asHCroft;A261 13 | Tymczak;T522 14 | Pfister;P236 15 | gutierrez;G362 16 | Jackson;J250 17 | washington;W252 18 | Lee;L000 19 | NA;NA" 20 | testset <- read.csv2(textConnection(testset), stringsAsFactors=FALSE) 21 | expect_equal(phonetic(testset$name,"soundex"), testset$code) 22 | expect_equal(phonetic(testset$name,"soundex",useBytes=TRUE), testset$code) 23 | expect_warning(phonetic(paste0('Mot',intToUtf8(0x00F6),'rhead'))) 24 | 25 | 26 | ## soundex handles encoding 27 | ouml <- intToUtf8("0x00F6") 28 | # non-ascii within string 29 | expect_warning(phonetic(paste0("Mot",ouml,"rhead"),method='soundex')) 30 | # non-ascii at beginning of string 31 | expect_warning(phonetic(paste0(ouml,"zzy"),method='soundex')) 32 | # non-printable in string (carriage return) 33 | cr <- "\r" 34 | expect_warning(phonetic(paste0(cr,"hello"),method='soundex')) 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /pkg/inst/tinytest/test_qgrams.R: -------------------------------------------------------------------------------- 1 | options(sd_num_thread=2) 2 | ## qgrams 3 | 4 | ## qgram edge cases 5 | expect_equivalent(qgrams('a' , q=1), as.matrix(c(a=1))) # basic test 6 | expect_equivalent(qgrams('aa', q=1), as.matrix(c(a=2))) # idem 7 | expect_equivalent(qgrams(c('a','a'),q=1), as.matrix(c(a=2))) # count unique n-grams 8 | expect_equivalent(qgrams(c(NA,'a'), q=1), as.matrix(c(a=1))) # skip NA's 9 | expect_equivalent(qgrams(NA,q=1), matrix(0,nrow=1,ncol=0)) # skip all 10 | expect_equivalent(qgrams(c("a","ab"), q=2), as.matrix(table("ab"))) # skip q>nchar 11 | expect_equivalent(qgrams(c("a"),q=2), matrix(0,nrow=1,ncol=0)) # skip all 12 | expect_equivalent(qgrams(c(''),q=0), matrix(table(''))) # empty string, q=0 13 | 14 | 15 | ## qgrams 16 | expect_equivalent(qgrams("a",q=1),array(1,dim=c(1,1))) 17 | expect_equivalent(qgrams("a",q=1,useBytes=TRUE),array(1,dim=c(1,1))) 18 | 19 | 20 | ## seq_qgrams 21 | expect_equivalent( 22 | seq_qgrams(1:3,2:4,q=2) 23 | ,matrix(c( 24 | 1,2,1,0 25 | ,2,3,1,1 26 | ,3,4,0,1 27 | ),nrow=3,byrow=TRUE) 28 | ) 29 | 30 | -------------------------------------------------------------------------------- /pkg/src/dist.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef SD_DIST_H 3 | #define SD_DIST_H 4 | 5 | #include "qtree.h" 6 | #include "dictionary.h" 7 | 8 | 9 | double osa_dist(unsigned int *, int, unsigned int *, int, double *, double *); 10 | dictionary *new_dictionary(unsigned int); 11 | void free_dictionary(dictionary *); 12 | double dl_dist(unsigned int *, int, unsigned int *, int, double *, dictionary *, double *); 13 | double hamming_dist(unsigned int *, int, unsigned int *, int); 14 | double lcs_dist(unsigned int *, int, unsigned int *, int, double *); 15 | double lv_dist(unsigned int *, int, unsigned int *, int, double *, double *); 16 | double osa_dist(unsigned int *, int, unsigned int *, int, double *, double *); 17 | double jaro_winkler_dist(unsigned int *, int, unsigned int *, int, double, double, double *, double *); 18 | qtree *new_qtree(int, int); 19 | void free_qtree(void); 20 | double qgram_dist(unsigned int *, int, unsigned int *, int, unsigned int, qtree **, int); 21 | double soundex_dist(unsigned int *, int, unsigned int *, int, unsigned int *); 22 | double running_cosine_dist(unsigned int *, int, unsigned int *, int, unsigned int, qtree **, double *); 23 | #endif 24 | -------------------------------------------------------------------------------- /examples/seq_dist.R: -------------------------------------------------------------------------------- 1 | # Distances between lists of integer vectors. Note the postfix 'L' to force 2 | # integer storage. The shorter argument is recycled over (\code{a}) 3 | a <- list(c(102L, 107L)) # fu 4 | b <- list(c(102L,111L,111L),c(102L,111L,111L)) # foo, fo 5 | seq_dist(a,b) 6 | 7 | # translate strings to a list of integer sequences 8 | a <- lapply(c("foo","bar","baz"),utf8ToInt) 9 | seq_distmatrix(a) 10 | 11 | # Note how missing values are treated. NA's as part of the sequence are treated 12 | # as an integer (the representation of NA_integer_). 13 | a <- list(NA_integer_,c(102L, 107L)) 14 | b <- list(c(102L,111L,111L),c(102L,111L,NA_integer_)) 15 | seq_dist(a,b) 16 | 17 | \dontrun{ 18 | # Distance between sentences based on word order. Note: words must match exactly or they 19 | # are treated as completely different. 20 | # 21 | # For this example you need to have the 'hashr' package installed. 22 | a <- "Mary had a little lamb" 23 | a.words <- strsplit(a,"[[:blank:]]+") 24 | a.int <- hashr::hash(a.words) 25 | b <- c("a little lamb had Mary", 26 | "had Mary a little lamb") 27 | b.int <- hashr::hash(strsplit(b,"[[:blank:]]+")) 28 | seq_dist(a.int,b.int) 29 | } 30 | 31 | -------------------------------------------------------------------------------- /pkg/src/hamming.c: -------------------------------------------------------------------------------- 1 | /* stringdist - a C library of string distance algorithms with an interface to R. 2 | * Copyright (C) 2013 Mark van der Loo 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | * 17 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 18 | * 19 | * 20 | * This code is gratefully based on Nick Logan's github repository 21 | * https://github.com/ugexe/Text--Levenshtein--Damerau--XS/blob/master/damerau-int.c 22 | * 23 | */ 24 | 25 | 26 | 27 | #include "utils.h" 28 | 29 | double hamming_dist(unsigned int *a, int len_a, unsigned int *b, int len_b){ 30 | double h=0; 31 | if (len_a != len_b) return 1.0/0.0; 32 | for(int i=0; i < len_a; ++i){ 33 | if (a[i] != b[i]) h++; 34 | } 35 | return h; 36 | } 37 | 38 | -------------------------------------------------------------------------------- /pkg/src/utils.c: -------------------------------------------------------------------------------- 1 | /* stringdist - a C library of string distance algorithms with an interface to R. 2 | * Copyright (C) 2013 Mark van der Loo 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | * 17 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 18 | * 19 | */ 20 | 21 | 22 | #define USE_RINTERNALS 23 | #include 24 | #include 25 | 26 | 27 | unsigned int max_length(SEXP x){ 28 | unsigned int t=0, m; 29 | 30 | if (TYPEOF(x) == VECSXP){ 31 | for (int i=0; i 3 | License: GPL-3 4 | Title: Approximate String Matching, Fuzzy Text Search, and String Distance Functions 5 | Type: Package 6 | LazyLoad: yes 7 | Authors@R: c( person("Mark", "van der Loo", role=c("aut","cre") 8 | , email="mark.vanderloo@gmail.com" 9 | , comment= c(ORCID="0000-0002-9807-4686")) 10 | , person("Jan", "van der Laan", role="ctb") 11 | , person("R Core Team","" , role="ctb") 12 | , person("Nick","Logan" , role="ctb") 13 | , person("Chris","Muir" , role="ctb") 14 | , person("Johannes", "Gruber" , role="ctb") 15 | , person("Brian","Ripley" , role="ctb")) 16 | Description: Implements an approximate string matching version of R's native 17 | 'match' function. Also offers fuzzy text search based on various string 18 | distance measures. Can calculate various string distances based on edits 19 | (Damerau-Levenshtein, Hamming, Levenshtein, optimal sting alignment), qgrams (q- 20 | gram, cosine, jaccard distance) or heuristic metrics (Jaro, Jaro-Winkler). An 21 | implementation of soundex is provided as well. Distances can be computed between 22 | character vectors while taking proper care of encoding or between integer 23 | vectors representing generic sequences. This package is built for speed and 24 | runs in parallel by using 'openMP'. An API for C or C++ is exposed as well. 25 | Reference: MPJ van der Loo (2014) . 26 | Version: 0.9.15 27 | Depends: 28 | R (>= 2.15.3) 29 | URL: https://github.com/markvanderloo/stringdist 30 | BugReports: https://github.com/markvanderloo/stringdist/issues 31 | Suggests: 32 | tinytest 33 | Imports: parallel 34 | Encoding: UTF-8 35 | RoxygenNote: 7.3.2 36 | -------------------------------------------------------------------------------- /pkg/src/lcs.c: -------------------------------------------------------------------------------- 1 | /* stringdist - a C library of string distance algorithms with an interface to R. 2 | * Copyright (C) 2013 Mark van der Loo 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | * 17 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 18 | */ 19 | 20 | #include "utils.h" 21 | 22 | /* Longest common substring 23 | * - basically edit distance, only allowing insertions and deletions, at the cost of 1. 24 | */ 25 | double lcs_dist(unsigned int *a, int na, unsigned int *b, int nb, double *scores){ 26 | if (!na){ 27 | return (double) nb; 28 | } 29 | if (!nb){ 30 | return (double) na; 31 | } 32 | 33 | int i, j; 34 | int M, I = na+1, L = na+1, J = nb+1; 35 | 36 | for ( i = 0; i < I; ++i ){ 37 | scores[i] = i; 38 | } 39 | for ( j = 1; j < J; ++j, L += I ){ 40 | scores[L] = j; 41 | } 42 | 43 | for ( i = 1; i <= na; ++i ){ 44 | M = 0; L = I; 45 | for ( j = 1; j <= nb; ++j, L += I, M += I ){ 46 | if ( a[i-1] == b[j-1] ){ // equality, copy previous score 47 | scores[i + L] = scores[i-1 + M]; 48 | } else { 49 | scores[i + L] = MIN( 50 | scores[i-1 + L] + 1 , // deletion 51 | scores[i + M] + 1 // insertion 52 | ); 53 | } 54 | 55 | } 56 | } 57 | 58 | return scores[I*J - 1]; 59 | } 60 | 61 | -------------------------------------------------------------------------------- /pkg/src/R_register_native.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include // for NULL 4 | #include 5 | 6 | /* FIXME: 7 | Check these declarations against the C/Fortran source code. 8 | */ 9 | 10 | /* .Call calls */ 11 | extern SEXP R_afind(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); 12 | extern SEXP R_all_int(SEXP); 13 | extern SEXP R_amatch(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); 14 | extern SEXP R_get_qgrams(SEXP, SEXP); 15 | extern SEXP R_lengths(SEXP); 16 | extern SEXP R_lower_tri(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); 17 | extern SEXP R_soundex(SEXP, SEXP); 18 | extern SEXP R_stringdist(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); 19 | 20 | static const R_CallMethodDef CallEntries[] = { 21 | {"R_afind", (DL_FUNC) &R_afind, 10}, 22 | {"R_all_int", (DL_FUNC) &R_all_int, 1}, 23 | {"R_amatch", (DL_FUNC) &R_amatch, 12}, 24 | {"R_get_qgrams", (DL_FUNC) &R_get_qgrams, 2}, 25 | {"R_lengths", (DL_FUNC) &R_lengths, 1}, 26 | {"R_lower_tri", (DL_FUNC) &R_lower_tri, 8}, 27 | {"R_soundex", (DL_FUNC) &R_soundex, 2}, 28 | {"R_stringdist", (DL_FUNC) &R_stringdist, 9}, 29 | {NULL, NULL, 0} 30 | }; 31 | 32 | void R_init_stringdist(DllInfo *dll) 33 | { 34 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 35 | R_useDynamicSymbols(dll, FALSE); 36 | 37 | /* used by external packages linking to internal xts code from C */ 38 | R_RegisterCCallable("stringdist","R_all_int",(DL_FUNC) &R_all_int); 39 | R_RegisterCCallable("stringdist","R_amatch",(DL_FUNC) &R_amatch); 40 | R_RegisterCCallable("stringdist","R_get_qgrams",(DL_FUNC) &R_get_qgrams); 41 | R_RegisterCCallable("stringdist","R_lengths",(DL_FUNC) &R_lengths); 42 | R_RegisterCCallable("stringdist","R_lower_tri",(DL_FUNC) &R_lower_tri); 43 | R_RegisterCCallable("stringdist","R_soundex",(DL_FUNC) &R_soundex); 44 | R_RegisterCCallable("stringdist","R_stringdist",(DL_FUNC) &R_stringdist); 45 | 46 | } 47 | -------------------------------------------------------------------------------- /pkg/src/stringdist.h: -------------------------------------------------------------------------------- 1 | 2 | /* stringdist - a C library of string distance algorithms with an interface to R. 3 | * Copyright (C) 2013 Mark van der Loo 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see . 17 | * 18 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 19 | */ 20 | 21 | 22 | #ifndef SD_STRINGDIST_H 23 | #define SD_STRINGDIST_H 24 | 25 | #include "dictionary.h" 26 | #include "qtree.h" 27 | #include "dist.h" 28 | 29 | typedef enum Distance { 30 | osa 31 | , lv 32 | , dl 33 | , hamming 34 | , lcs 35 | , qgram 36 | , cosine 37 | , jaccard 38 | , jw 39 | , soundex 40 | , running_cosine} Distance; 41 | 42 | typedef struct { 43 | Distance distance; 44 | // workspace 45 | double *work; 46 | // [optional] weight vector 47 | double *weight; 48 | // dictionary object for dl-distance 49 | dictionary *dict; 50 | // tree object to store q-grams 51 | qtree *tree; 52 | // the q in qgrams 53 | unsigned int q; 54 | // Winkler's penalty factor 55 | double p; 56 | // Winkler's boost threshold 57 | double bt; 58 | // fail indicator 59 | unsigned int ifail; 60 | } Stringdist; 61 | 62 | Stringdist *open_stringdist(Distance, int, int, ...); 63 | 64 | double stringdist(Stringdist *, unsigned int *, int, unsigned int *, int); 65 | 66 | void close_stringdist(Stringdist *S); 67 | void reset_stringdist(Stringdist *S); 68 | 69 | 70 | 71 | #endif 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /pkg/R/phonetic.R: -------------------------------------------------------------------------------- 1 | 2 | #' Phonetic algorithms 3 | #' 4 | #' Translate strings to phonetic codes. Similar sounding strings should get 5 | #' similar or equal codes. 6 | #' 7 | #' @param x a character vector whose elements are phonetically encoded. 8 | #' @param method name of the algorithm used. The default is \code{"soundex"}. 9 | #' @param useBytes Perform byte-wise comparison. \code{useBytes=TRUE} is faster 10 | #' but may yield different results depending on character encoding. For more 11 | #' information see the documentation of \code{\link{stringdist}}. 12 | #' 13 | #' @details 14 | #' Currently, only the soundex algorithm is implemented. Note that soundex coding 15 | #' is only meaningful for characters in the ranges a-z and A-Z. Soundex coding of strings 16 | #' containing non-printable ascii or non-ascii characters may be system-dependent and should 17 | #' not be trusted. If non-ascii or non-printable ascii charcters are encountered, a warning 18 | #' is emitted. 19 | #' 20 | #' @seealso \code{\link{printable_ascii}} 21 | #' 22 | #' 23 | #' @return 24 | #' The returns value depends on the method used. However, all currently 25 | #' implemented methods return a character vector of the same length of the input 26 | #' vector. Output characters are in the system's native encoding. 27 | #' 28 | #' @references 29 | #' \itemize{ 30 | #' \item{The Soundex algorithm implemented is the algorithm used by the 31 | #' \href{https://www.archives.gov/research/census/soundex}{National Archives}. 32 | #' This algorithm differs slightly from the original algorithm patented by R.C. Russell 33 | #' (US patents 1261167 (1918) and 1435663 (1922)). 34 | #' } 35 | #' } 36 | #' 37 | #' @example ../examples/phonetic.R 38 | #' 39 | #' @export 40 | phonetic <- function(x, method = c("soundex"), useBytes = FALSE) { 41 | x <- as.character(x) 42 | method <- match.arg(method) 43 | stopifnot(is.logical(useBytes)) 44 | if (!useBytes) x <- enc2utf8(x) 45 | if (method == "soundex") { 46 | r <- .Call("R_soundex", x, useBytes,PACKAGE="stringdist") 47 | if (!useBytes) int2char(r) else r 48 | } 49 | } 50 | 51 | int2char <- function(x) { 52 | enc2native(sapply(x, intToUtf8)) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /pkg/src/lv.c: -------------------------------------------------------------------------------- 1 | /* stringdist - a C library of string distance algorithms with an interface to R. 2 | * Copyright (C) 2013 Mark van der Loo 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | * 17 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 18 | */ 19 | 20 | #include "utils.h" 21 | 22 | 23 | /* Levenshtein distance 24 | * Computes Levenshtein distance 25 | * - Simplified from restricted DL pseudocode at http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance 26 | * - Extended with custom weights and maxDistance 27 | */ 28 | double lv_dist( 29 | unsigned int *a, int na, // source 30 | unsigned int *b, int nb, // target 31 | double *weight, 32 | double *scores){ 33 | if (!na){ 34 | return (double) nb * weight[1]; // del score 35 | } 36 | if (!nb){ 37 | return (double) na * weight[0]; // ins score 38 | } 39 | 40 | int i, j; 41 | int I = na+1, L = na+1, J = nb+1; 42 | double sub; 43 | 44 | for ( i = 0; i < I; ++i ){ 45 | scores[i] = i * weight[0]; 46 | } 47 | for ( j = 1; j < J; ++j, L += I ){ 48 | scores[L] = j * weight[1]; 49 | } 50 | 51 | 52 | int M; 53 | for ( i = 1; i <= na; ++i ){ 54 | L = I; M= 0; 55 | for ( j = 1; j <= nb; ++j, L += I, M += I ){ 56 | sub = (a[i-1] == b[j-1]) ? 0 : weight[2]; 57 | scores[i + I*j] = MIN(MIN( 58 | scores[i-1 + L] + weight[0], // deletion 59 | scores[i + M] + weight[1]), // insertion 60 | scores[i-1 + M] + sub // substitution 61 | ); 62 | } 63 | } 64 | 65 | double score = scores[I*J-1]; 66 | return score; 67 | } 68 | 69 | -------------------------------------------------------------------------------- /pkg/src/osa.c: -------------------------------------------------------------------------------- 1 | /* stringdist - a C library of string distance algorithms with an interface to R. 2 | * Copyright (C) 2013 Mark van der Loo 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | * 17 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 18 | */ 19 | 20 | #include "utils.h" 21 | 22 | /* Optimal string alignment algorithm. 23 | * Computes Damerau-Levenshtein distance, restricted to single transpositions. 24 | * - See pseudocode at http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance 25 | * - Extended with custom weights and maxDistance 26 | */ 27 | double osa_dist(unsigned int *a, int na, unsigned int *b, int nb, double *weight, double *scores){ 28 | 29 | if (!na){ 30 | return (double) nb * weight[1]; // ins weight 31 | } 32 | if (!nb){ 33 | return (double) na * weight[0]; // del weight 34 | } 35 | 36 | int i, j; 37 | int M, I = na+1, L=na+1, J = nb+1; 38 | double sub, tran; 39 | 40 | for ( i = 0; i < I; ++i ){ 41 | scores[i] = i * weight[0]; 42 | } 43 | for ( j = 1; j < J; ++j, L += I ){ 44 | scores[L] = j * weight[1]; 45 | } 46 | 47 | for ( i = 1; i <= na; ++i ){ 48 | L = I; M = 0; 49 | for ( j = 1; j <= nb; ++j, L += I, M += I ){ 50 | if (a[i-1] == b[j-1]){ 51 | sub = 0; 52 | tran= 0; 53 | } else { 54 | sub = weight[2]; 55 | tran= weight[3]; 56 | } 57 | 58 | scores[i + L] = MIN(MIN( 59 | scores[i-1 + L] + weight[0], // deletion 60 | scores[i + M] + weight[1]), // insertion 61 | scores[i-1 + M] + sub // substitution 62 | ); 63 | if ( i>1 && j>1 && a[i-1] == b[j-2] && a[i-2] == b[j-1] ){ 64 | scores[i + L] = MIN(scores[i + L], scores[i-2 + M-I] + tran); // transposition 65 | } 66 | } 67 | } 68 | double score = scores[I*J-1]; 69 | return score; 70 | } 71 | 72 | -------------------------------------------------------------------------------- /pkg/R/doc_parallel.R: -------------------------------------------------------------------------------- 1 | #' @title 2 | #' Multithreading and parallelization in \pkg{stringdist} 3 | #' 4 | #' 5 | #' @description This page describes how \pkg{stringdist} uses parallel processing. 6 | #' 7 | #' @section Multithreading and parallelization in \pkg{stringdist}: 8 | #' The core 9 | #' functions of \pkg{stringdist} are implemented in C. On systems where 10 | #' \code{openMP} is available, \pkg{stringdist} will automatically take 11 | #' advantage of multiple cores. The 12 | #' \href{https://cran.r-project.org/doc/manuals/r-release/R-exts.html#OpenMP-support}{section 13 | #' on OpenMP} of the 14 | #' \href{https://cran.r-project.org/doc/manuals/r-release/R-exts.html}{Writing 15 | #' R Extensions} manual discusses on what systems OpenMP is available (at the time of writing more or 16 | #' less anywhere except on OSX). 17 | #' 18 | #' By default, the number of threads to use is taken from \code{options('sd_num_thread')}. 19 | #' When the package is loaded, the value for this option is determined as follows: 20 | #' \itemize{ 21 | #' \item{If the environment variable \code{OMP_NUM_THREADS} is set, this value is taken.} 22 | #' \item{Otherwise, the number of available cores is determined with \code{parallel::detectCores()} 23 | #' If this fails, the number of threads is set to 1 (with a message). If the nr of detected 24 | #' cores exceeds three, the number of used cores is set to \eqn{n-1}.} 25 | #' \item{If available, the environment variable \code{OMP_THREAD_LIMIT} is 26 | #' determined and The number of threads is set to the lesser of 27 | #' \code{OMP_THREAD_LIMIT} and the number of detected cores.} 28 | #' } 29 | #' 30 | #' The latter step makes sure that on machines with \eqn{n>3} cores, \eqn{n-1} 31 | #' cores are used. Some benchmarking showed that using all cores is often slower 32 | #' in such cases. This is probably because at least one of the threads will be 33 | #' shared with the operating system. 34 | #' 35 | #' Functions that use multithreading have an option named \code{nthread} that 36 | #' controls the maximum number of threads to use. If you need to do large 37 | #' calculations, it is probably a good idea to benchmark the performance on your 38 | #' machine(s) as a function of \code{'nthread'}, for example using the 39 | #' \href{https://cran.r-project.org/package=microbenchmark}{microbenchmark} 40 | #' package of Mersmann. 41 | #' 42 | #' 43 | #' 44 | #' 45 | #' @seealso 46 | #' \itemize{ 47 | #' \item{Functions running multithreaded: \code{\link{stringdist}}, \code{\link{stringdistmatrix}}, \code{\link{amatch}}, \code{\link{ain}} } 48 | #' } 49 | #' 50 | #' @name stringdist-parallelization 51 | #' @rdname stringdist-parallelization 52 | {} 53 | -------------------------------------------------------------------------------- /test/smoke.R: -------------------------------------------------------------------------------- 1 | # smoke tests. 2 | set.seed(1864) 3 | 4 | #dyn.load("../pkg/src/stringdist.so") 5 | #for (f in dir("../pkg/R/",full.names=TRUE)) d <- source(f) 6 | #options(sd_num_thread=4) 7 | library(stringdist) 8 | 9 | # printable ascii 10 | pascii <- sapply(33:126,intToUtf8) 11 | # the ascii set 12 | ascii <- sapply(0:127,intToUtf8) 13 | # the utf8 set 14 | utf8 <- sapply(0:0x10FFFF,intToUtf8) 15 | utf8 <- utf8[stringi::stri_enc_isutf8(utf8)] 16 | 17 | 18 | # methods 19 | edit <- c("osa", "lv", "dl", "lcs", "hamming") 20 | qgrm <- c("qgram","cosine", "jaccard") 21 | heur <- c("jw") 22 | phon <- c("soundex") 23 | 24 | useBytes <- c(TRUE,FALSE) 25 | nthread=1:4 26 | q <- 0:3 27 | p <- c(0,0.01,0.25) 28 | 29 | tests <- list( 30 | test_edit = expand.grid(method = edit, useBytes = useBytes, nthread=1:4,stringsAsFactors = FALSE) 31 | , test_qgrm = expand.grid(method = qgrm, useBytes = useBytes, nthread=1:4,q = q, stringsAsFactors = FALSE) 32 | , test_heur = expand.grid(method = heur, useBytes = useBytes, nthread=1:4,p = p, stringsAsFactors = FALSE) 33 | , test_phon = expand.grid(method = phon, useBytes = useBytes, nthread=1:4,stringsAsFactors=FALSE) 34 | ) 35 | 36 | rand_string <- function(n, len, alphabet=pascii){ 37 | stopifnot( length(len) == 1 || length(len) == n ) 38 | if (length(len)==1){ 39 | sapply(1:n,function(i) paste0(sample(alphabet, len, replace=TRUE),collapse="")) 40 | } else { 41 | sapply(1:n,function(i) paste0(sample(alphabet, len[i], replace=TRUE),collapse="")) 42 | } 43 | } 44 | 45 | N1 <- 250 46 | N2 <- N1 47 | len1 <- sample(0:50,N1,replace=TRUE) 48 | len2 <- sample(0:50,N2,replace=TRUE) 49 | # smoke test for edit-based distances 50 | str1 <- rand_string(N1,len1,pascii) 51 | str2 <- c(rand_string(N2,len2,pascii) 52 | , rand_string(N2,len2,ascii) 53 | , rand_string(N2,len2,utf8)) 54 | 55 | for (tst in tests ){ 56 | for ( i in 1:nrow(tst) ){ 57 | args <- c(list(a=str1,b=str2),as.list(tst[i,])) 58 | tryCatch(do.call(stringdist,args), error=function(e){ 59 | cat(sprintf("Failure with message\n%s\n",e$message)) 60 | print(tst[i,]) 61 | }, warning=function(w){ 62 | cat(sprintf("Warning with message\n%s\n",w$message)) 63 | print(tst[i,]) 64 | }) 65 | } 66 | } 67 | 68 | for (tst in tests ){ 69 | for ( i in 1:nrow(tst) ){ 70 | args <- c(list(x=str1,table=str2),as.list(tst[i,])) 71 | tryCatch(do.call(amatch,args), error=function(e){ 72 | cat(sprintf("Failure with message\n%s\n",e$message)) 73 | print(tst[i,]) 74 | }, warning=function(w){ 75 | cat(sprintf("Warning with message\n%s\n",w$message)) 76 | print(tst[i,]) 77 | }) 78 | } 79 | } 80 | 81 | 82 | 83 | #tst <- tests[[4]] 84 | #args <- c(list(a=str1[1],b=str2[1]),as.list(tst[1,])) 85 | #do.call(stringdist,args) 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /pkg/README.md: -------------------------------------------------------------------------------- 1 | [![Mentioned in Awesome Official Statistics ](https://awesome.re/mentioned-badge.svg)](https://github.com/SNStatComp/awesome-official-statistics-software) 2 | 3 | ## stringdist 4 | 5 | * Approximate matching and string distance calculations for R. 6 | * All distance and matching operations are system- and encoding-independent. 7 | * Built for speed, using [openMP](https://www.openmp.org/) for parallel computing. 8 | 9 | The package offers the following main functions: 10 | 11 | * `stringdist` computes pairwise distances between two input character vectors (shorter one is recycled) 12 | * `stringdistmatrix` computes the distance matrix for one or two vectors 13 | * `stringsim` computes a string similarity between 0 and 1, based on `stringdist` 14 | * `amatch` is a fuzzy matching equivalent of R's native `match` function 15 | * `ain` is a fuzzy matching equivalent of R's native `%in%` operator 16 | * `seq_dist`, `seq_distmatrix`, `seq_amatch` and `seq_ain` for distances between, and matching of integer sequences. 17 | 18 | These functions are built upon `C`-code that re-implements some common (weighted) string 19 | distance functions. Distance functions include: 20 | 21 | * Hamming distance; 22 | * Levenshtein distance (weighted) 23 | * Restricted Damerau-Levenshtein distance (weighted, a.k.a. Optimal String Alignment) 24 | * Full Damerau-Levenshtein distance 25 | * Longest Common Substring distance 26 | * Q-gram distance 27 | * cosine distance for q-gram count vectors (= 1-cosine similarity) 28 | * Jaccard distance for q-gram count vectors (= 1-Jaccard similarity) 29 | * Jaro, and Jaro-Winkler distance 30 | * Soundex-based string distance 31 | 32 | Also, there are some utility functions: 33 | 34 | * `qgrams()` tabulates the qgrams in one or more `character` vectors. 35 | * `seq_qrams()` tabulates the qgrams (somtimes called ngrams) in one or more `integer` vectors. 36 | * `phonetic()` computes phonetic codes of strings (currently only soundex) 37 | * `printable_ascii()` is a utility function that detects non-printable ascii or non-ascii characters. 38 | 39 | #### C API 40 | 41 | Some of `stringdist`'s underlying `C` functions can be called directly from 42 | `C` code in other packages. The description of the API can be found by either 43 | typing `?stringdist_api` in the R console or open the vignette directly as follows: 44 | 45 | ``` 46 | vignette("stringdist_C-Cpp_api", package="stringdist") 47 | ``` 48 | 49 | Examples of packages that link to `stringdist` can be found 50 | [here](https://github.com/markvanderloo/linkstringdist) and 51 | [here](https://github.com/ChrisMuir/refinr). 52 | 53 | 54 | #### Resources 55 | 56 | * A [paper](https://journal.r-project.org/archive/2014-1/loo.pdf) on stringdist has been published in the R-journal 57 | * [Slides](https://www.markvanderloo.eu/files/share/loo2014approximate.pdf) of a talk given at te _useR!2014_ conference. 58 | 59 | -------------------------------------------------------------------------------- /pkg/R/utils.R: -------------------------------------------------------------------------------- 1 | 2 | RECYCLEWARNING <- NULL 3 | 4 | # calling message from .onLoad gives a NOTE on build, so we avoid it here. 5 | mymsg <- message 6 | 7 | .onLoad <- function(libname, pkgname){ 8 | RECYCLEWARNING <<- gettext(tryCatch( (1:2)+(1:3),warning=function(w) w$message )) 9 | 10 | 11 | omp_num_threads <- as.numeric(Sys.getenv("OMP_NUM_THREADS")) 12 | if (!is.na(omp_num_threads) && omp_num_threads > 0){ 13 | nthread <- omp_num_threads 14 | } else { 15 | nthread <- parallel::detectCores() 16 | if ( is.na(nthread) || !is.numeric(nthread) ){ 17 | nthread <- 1L 18 | mymsg("Could not detect number of cores, defaulting to 1.") 19 | } else { 20 | # nthread=nr of detected cores. Leave one core free. 21 | if (nthread >= 4) nthread <- nthread - 1 22 | } 23 | } 24 | 25 | ## if OMP_THREAD_LIMIT is set, maximize on that limit. 26 | omp_thread_limit <- as.numeric(Sys.getenv("OMP_THREAD_LIMIT")) 27 | if ( is.na(omp_thread_limit) ) omp_thread_limit <- nthread 28 | nthread <- min(omp_thread_limit,nthread) 29 | 30 | options(sd_num_thread=as.integer(nthread)) 31 | } 32 | 33 | # When necessary and possible, argument is coverted to integers. 34 | ensure_int_list <- function(x){ 35 | if (is.integer(x)|is.numeric(x)) return(list(as.integer(x))) 36 | if (!is.list(x)) stop("argument must be 'list', 'integer' or 'numeric'") 37 | if (!all_int(x)){ 38 | lapply(x,as.integer) 39 | } else { 40 | x 41 | } 42 | } 43 | 44 | setNames <- function(object, nm){ 45 | names(object) <- nm 46 | object 47 | } 48 | 49 | #' Detect the presence of non-printable or non-ascii characters 50 | #' 51 | #' @param x a \code{character} vector 52 | #' 53 | #' @details 54 | #' Printable ASCII characters consist of space, \code{A-Z}, \code{a-z}, \code{0-9} and the characters 55 | #' 56 | #' \code{! "" # $ \% & ' ( ) * + , . / : ; < = > ? @@ [ ] \\ ^ _ ` { | } ~ -} 57 | #' 58 | #' Note that this excludes tab (as it is a control character). 59 | #' 60 | #' 61 | #' @example ../examples/printable_ascii.R 62 | #' 63 | #' 64 | #' @return A \code{logical} indicating which elements consist solely of printable ASCII characters. 65 | #' @export 66 | printable_ascii <- function(x){ 67 | # To portably detect ASCII characters, we need to specify them literally. Hence this monster of a character class. See ?regexp. 68 | 69 | # notes: 70 | # - caret (^) at the beginning negates what comes after, the caret in the middle is the actual character. 71 | # - the closing square bracket ] needs to be specified first 72 | # - double quote " and backslash are escaped 73 | # - the dash "-" is specified at the end since it would indicate a range otherwise 74 | # - see ? regexp. 75 | charclass <- paste0("[^]" 76 | , "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 77 | , "0123456789" 78 | , " !\"#$%&'()*+,./:;<=>?@[\\^_`{|}~-" 79 | , "]" 80 | ) 81 | !grepl(charclass,x) 82 | } 83 | 84 | # check whether all elements of a list are of type 'integer'. 85 | # x MUST be a list. 86 | all_int <- function(x){ 87 | .Call("R_all_int",x,PACKAGE="stringdist") 88 | } 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /examples/stringdist.R: -------------------------------------------------------------------------------- 1 | 2 | # Simple example using optimal string alignment 3 | stringdist("ca","abc") 4 | 5 | # computing a 'dist' object 6 | d <- stringdistmatrix(c('foo','bar','boo','baz')) 7 | # try plot(hclust(d)) 8 | 9 | # The following gives a matrix 10 | stringdistmatrix(c("foo","bar","boo"),c("baz","buz")) 11 | 12 | # An example using Damerau-Levenshtein distance (multiple editing of substrings allowed) 13 | stringdist("ca","abc",method="dl") 14 | 15 | # string distance matching is case sensitive: 16 | stringdist("ABC","abc") 17 | 18 | # so you may want to normalize a bit: 19 | stringdist(tolower("ABC"),"abc") 20 | 21 | # stringdist recycles the shortest argument: 22 | stringdist(c('a','b','c'),c('a','c')) 23 | 24 | # stringdistmatrix gives the distance matrix (by default for optimal string alignment): 25 | stringdist(c('a','b','c'),c('a','c')) 26 | 27 | # different edit operations may be weighted; e.g. weighted substitution: 28 | stringdist('ab','ba',weight=c(1,1,1,0.5)) 29 | 30 | # Non-unit weights for insertion and deletion makes the distance metric asymetric 31 | stringdist('ca','abc') 32 | stringdist('abc','ca') 33 | stringdist('ca','abc',weight=c(0.5,1,1,1)) 34 | stringdist('abc','ca',weight=c(0.5,1,1,1)) 35 | 36 | # Hamming distance is undefined for 37 | # strings of unequal lengths so stringdist returns Inf 38 | stringdist("ab","abc",method="h") 39 | # For strings of eqal length it counts the number of unequal characters as they occur 40 | # in the strings from beginning to end 41 | stringdist("hello","HeLl0",method="h") 42 | 43 | # The lcs (longest common substring) distance returns the number of 44 | # characters that are not part of the lcs. 45 | # 46 | # Here, the lcs is either 'a' or 'b' and one character cannot be paired: 47 | stringdist('ab','ba',method="lcs") 48 | # Here the lcs is 'surey' and 'v', 'g' and one 'r' of 'surgery' are not paired 49 | stringdist('survey','surgery',method="lcs") 50 | 51 | 52 | # q-grams are based on the difference between occurrences of q consecutive characters 53 | # in string a and string b. 54 | # Since each character abc occurs in 'abc' and 'cba', the q=1 distance equals 0: 55 | stringdist('abc','cba',method='qgram',q=1) 56 | 57 | # since the first string consists of 'ab','bc' and the second 58 | # of 'cb' and 'ba', the q=2 distance equals 4 (they have no q=2 grams in common): 59 | stringdist('abc','cba',method='qgram',q=2) 60 | 61 | # Wikipedia has the following example of the Jaro-distance. 62 | stringdist('MARTHA','MATHRA',method='jw') 63 | # Note that stringdist gives a _distance_ where wikipedia gives the corresponding 64 | # _similarity measure_. To get the wikipedia result: 65 | 1 - stringdist('MARTHA','MATHRA',method='jw') 66 | 67 | # The corresponding Jaro-Winkler distance can be computed by setting p=0.1 68 | stringdist('MARTHA','MATHRA',method='jw',p=0.1) 69 | # or, as a similarity measure 70 | 1 - stringdist('MARTHA','MATHRA',method='jw',p=0.1) 71 | 72 | # This gives distance 1 since Euler and Gauss translate to different soundex codes. 73 | stringdist('Euler','Gauss',method='soundex') 74 | # Euler and Ellery translate to the same code and have distance 0 75 | stringdist('Euler','Ellery',method='soundex') 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /pkg/inst/tinytest/test_seq_dist.R: -------------------------------------------------------------------------------- 1 | options(sd_num_thread=2) 2 | ## seq_dist 3 | # tests against cases that used to segfault when we did not check 4 | # NULL cases. 5 | expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread=1:4)) 6 | expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread="foo")) 7 | expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread=integer(0))) 8 | expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread=NULL)) 9 | 10 | # A simple test to see that everything is passed on to the correct 11 | # algorithm 12 | ## Methods are selected and computed correctly 13 | expect_equal( 14 | seq_dist(a = list(c(1L,2L,3L)), b = list(c(2L,1L,3L)), method="osa") 15 | , 1 ) 16 | expect_equal( 17 | seq_dist(a = list(c(1L,2L,3L)), b = list(c(2L,1L,3L)), method="lv") 18 | , 2 ) 19 | # the case setting 'dl' apart from 'osa' 20 | expect_equal( 21 | seq_dist(a = list(c(2L,1L)), b = list(c(1L,3L,2L)), method="dl") 22 | , 2 ) 23 | expect_equal( 24 | seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="hamming") 25 | , 1 ) 26 | expect_equal( 27 | seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="lcs") 28 | , 2 ) 29 | expect_equal( 30 | seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="qgram",q=2) 31 | , 4 ) 32 | 33 | expect_equal( 34 | round(1-seq_dist(list(utf8ToInt("martha")),list(utf8ToInt("marhta")),method='jw'),3) 35 | , 0.944 36 | ) 37 | expect_error( 38 | seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="soundex") 39 | ) 40 | 41 | 42 | ## Conversion for non-integer-list arguments 43 | expect_equal(seq_dist(list(c(1,2,3)),list(c(2,3,4))),seq_dist(as.numeric(c(1,2,3)),as.numeric(c(2,3,4)))) 44 | expect_equal(seq_dist(list(c(1,2,3)),list(c(2,3,4))),seq_dist(c(1,2,3), c(2,3,4))) 45 | expect_equal(seq_distmatrix(list(c(1,2,3)),list(c(2,3,4))), seq_distmatrix(as.numeric(c(1,2,3)),as.numeric(c(2,3,4)))) 46 | expect_equal(seq_distmatrix(list(c(1,2,3)),list(c(2,3,4))), seq_distmatrix(c(1,2,3),c(2,3,4))) 47 | expect_equal(seq_distmatrix(list(c(1,2,3))),seq_distmatrix(c(1,2,3))) 48 | expect_equal(seq_distmatrix(list(c(1,2,3))),seq_distmatrix(as.numeric(c(1,2,3)))) 49 | 50 | 51 | ## Some edge cases 52 | expect_equal(length(seq_dist(list(),list(c(1L)))),0) 53 | expect_equal(length(seq_dist(list(),list())),0) 54 | 55 | 56 | ## Elementary tests on seq_distmatrix 57 | 58 | expect_equivalent(seq_distmatrix(1:10),dist(0)) 59 | expect_equivalent(seq_distmatrix(1:10,list(1:10)),matrix(0)) 60 | expect_equivalent( 61 | as.matrix(seq_distmatrix(list(c(1,2,3),c(2,3,4))) ) 62 | , matrix(c(0,2,2,0),nrow=2) 63 | ) 64 | expect_equal( 65 | as.matrix(seq_distmatrix(list(x=c(1,2,3),y=c(2,3,4)),useNames="names") ) 66 | , matrix(c(0,2,2,0),nrow=2,dimnames=list(c('x','y'),c('x','y'))) 67 | ) 68 | expect_equal( 69 | seq_distmatrix(list(x=c(1,2,3),y=c(2,3,4)),list(x=c(1,2,3),y=c(2,3,4)),useNames="names") 70 | , matrix(c(0,2,2,0),nrow=2,dimnames=list(c('x','y'),c('x','y'))) 71 | ) 72 | expect_equal(class(seq_distmatrix(list(c(1,2,3),c(2,3,4)))),"dist") 73 | expect_equivalent( 74 | as.matrix(seq_distmatrix(list(c(1,2,3),c(2,3,4))),seq_distmatrix(list(c(1,2,3),c(2,3,4)),list(c(1,2,3),c(2,3,4))) ) 75 | , matrix(c(0,2,2,0),nrow=2) 76 | ) 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /pkg/inst/tinytest/test_afind.R: -------------------------------------------------------------------------------- 1 | options(sd_num_thread=1L) 2 | 3 | # tests against cases that used to segfault when we did not check 4 | # NULL cases. 5 | expect_error(afind("a","b",nthread=1:4)) 6 | expect_error(afind("a","b",nthread="foo")) 7 | expect_error(afind("a","b",nthread=integer(0))) 8 | expect_error(afind("a","b",nthread=NULL)) 9 | 10 | 11 | 12 | 13 | 14 | texts = c("When I grow up, I want to be" 15 | , "one of the harversters of the sea" 16 | , "I think before my days are gone" 17 | , "I want to be a fisherman") 18 | 19 | patterns = c("fish", "gone","to be") 20 | 21 | out <- afind(texts, patterns, method="osa") 22 | 23 | location <- matrix(c( 24 | 1, 1, 24, 25 | 6, 1, 28, 26 | 1, 28, 6, 27 | 16, 3, 8), 28 | nrow=4, byrow=TRUE) 29 | 30 | 31 | distance <- matrix(c( 32 | 4, 3, 0, 33 | 2, 2, 3, 34 | 3, 0, 2, 35 | 0, 3, 0), 36 | nrow=4, byrow=TRUE) 37 | 38 | match <- matrix(c( 39 | "When", "When", "to be", 40 | "f th", "one ", "he se", 41 | "I th", "gone", "nk be", 42 | "fish", "want", "to be"), 43 | nrow=4, byrow=TRUE) 44 | 45 | 46 | expect_equal(out$location, location) 47 | expect_equal(out$distance, distance) 48 | expect_equal(out$match, match) 49 | 50 | # test paralellization 51 | 52 | out1 <- afind(texts, patterns, method="osa", nthread=2L) 53 | expect_identical(out, out1) 54 | 55 | # test 'value' option 56 | out2 <- afind(texts, patterns, value=FALSE) 57 | expect_equal(length(out2), 2) 58 | 59 | 60 | # test grep/grepl equivalents 'grab', 'grabl' 61 | 62 | expect_equal(grab(texts, "harvester", maxDist=2), 2) 63 | expect_equal(grab(texts, "harvester", value=TRUE, maxDist=2), "harverste") 64 | expect_equal(grabl(texts, "harvester", maxDist=2) 65 | , c(FALSE,TRUE,FALSE,FALSE)) 66 | 67 | expect_equal(extract(texts, "harvester", maxDist=2) 68 | , matrix(c(NA, "harverste",NA,NA),nrow=4) ) 69 | 70 | ## Test running_cosine 71 | pattern <- c("phish", "want to") 72 | 73 | expect_identical( 74 | afind(texts, pattern, method="cosine", q=3) 75 | , afind(texts, pattern, method="running_cosine", q=3) 76 | ) 77 | 78 | 79 | ## test whether the correct positions are returned for all methods. 80 | 81 | methods = names(stringdist:::METHODS) 82 | methods = methods[!methods %in% c("soundex","hamming")] 83 | text <- "If you squeeze my lizzard, I put my snake on you." 84 | pattern <- "lizard" 85 | 86 | for ( method in methods ){ 87 | expect_equal(afind(text, pattern, method=method, q=3, p=0.1)$location[1,1], 19, info=method) 88 | } 89 | 90 | ## test the usual edge cases 91 | 92 | # notice: window size = 0. 93 | expect_equal(afind("foo","")$distance[1], 0) 94 | 95 | expect_equal(afind("foo",NA)$distance[1], NA_real_) 96 | expect_equal(afind("foo",NA)$location[1], NA_integer_) 97 | expect_equal(afind("foo",NA)$match[1], NA_character_) 98 | 99 | expect_equal(afind(NA,"foo")$distance[1], NA_real_) 100 | expect_equal(afind(NA,"foo")$location[1], NA_integer_) 101 | expect_equal(afind(NA,"foo")$match[1], NA_character_) 102 | 103 | expect_equal(afind("","foo")$distance[1], 3) 104 | expect_equal(afind("","foo")$location[1], 1) 105 | expect_equal(afind("","foo")$match[1], "") 106 | 107 | expect_equal(grab("foo", ""), 1L) 108 | expect_equal(grabl("foo",""), TRUE) 109 | expect_equal(grab("foo",NA), integer(0)) 110 | 111 | # note that 'grepl' gives FALSE in this case (which is inconsistent with 112 | # grepl(NA, NA), grepl(NA, "foo"). 113 | expect_equal(grabl("foo",NA), NA) 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /pkg/inst/tinytest/test_stringsim.R: -------------------------------------------------------------------------------- 1 | options(sd_num_thread=2) 2 | ## stringsim 3 | 4 | # We expect that two completely different strings have a similarity of 5 | # 0 and two completely equal strings a similarity of 1 6 | methods <- c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex") 7 | for (method in methods) { 8 | 9 | expect_equal(stringsim("bb", "cc", method=method), 0) 10 | expect_equal(stringsim("bb", "bb", method=method), 1) 11 | } 12 | 13 | 14 | ## edgecases 15 | for (method in methods[c(1:5,9:10)]){ 16 | expect_equal(stringsim(c("a", ""), "", method=method), c(0, 1)) 17 | 18 | expect_equal(stringsim(c("kkk", "bbb"), "bbb", method=method), 19 | stringsim("bbb", c("kkk", "bbb"), method=method)) 20 | } 21 | for (method in methods[6:8]){ 22 | expect_equal(stringsim(c("a", ""), "", method=method,q=0), c(1, 1)) 23 | 24 | expect_equal(stringsim(c("kkk", "bbb"), "bbb", method=method,q=0), 25 | stringsim("bbb", c("kkk", "bbb"), method=method,q=0)) 26 | } 27 | 28 | 29 | 30 | 31 | ## Stringsim gets correct values with or without useBytes 32 | x <- "ao" 33 | y <- paste0("a",intToUtf8(0x00F6)) # o-umlaut 34 | expect_equal(stringsim(x,y,method="osa", useBytes=FALSE), 1-1/2) 35 | expect_equal(stringsim(x,y,method="osa", useBytes=TRUE ), 1-2/3) 36 | expect_equal(stringsim(x,y,method="lv", useBytes=FALSE), 1-1/2) 37 | expect_equal(stringsim(x,y,method="lv", useBytes=TRUE ), 1-2/3) 38 | expect_equal(stringsim(x,y,method="dl", useBytes=FALSE), 1-1/2) 39 | expect_equal(stringsim(x,y,method="dl", useBytes=TRUE ), 1-2/3) 40 | expect_equal(stringsim(x,y,method="hamming", useBytes=FALSE), 1-1/2) 41 | expect_equal(stringsim(x,y,method="hamming", useBytes=TRUE ), 1-1) 42 | expect_equal(stringsim(x,y,method="lcs", useBytes=FALSE), 1-1/2) 43 | expect_equal(stringsim(x,y,method="lcs", useBytes=TRUE ), 1-3/5) 44 | expect_equal(stringsim(x,y,method="qgram", q=1, useBytes=FALSE), 1-1/2) 45 | expect_equal(stringsim(x,y,method="qgram", q=1, useBytes=TRUE ), 1-3/5) 46 | expect_equal(stringsim(x,y,method="cosine", q=1, useBytes=FALSE), 1-1/2) 47 | expect_equal(stringsim(x,y,method="cosine", q=1, useBytes=TRUE ), 1-(1-1/sqrt(6))) 48 | expect_equal(stringsim(x,y,method="jaccard", q=1, useBytes=FALSE), 1-2/3) 49 | expect_equal(stringsim(x,y,method="jaccard", q=1, useBytes=TRUE ), 1-3/4) 50 | expect_equal(stringsim(x,y,method="jw", useBytes=FALSE), 1-1/3) 51 | expect_equal(stringsim(x,y,method="jw", useBytes=TRUE ), (1/2 + 1/3 +1)/3) 52 | 53 | # stringsimmatrix 54 | x <- names(islands)[1:10] 55 | y <- rev(x) # o-umlaut 56 | expect_true(inherits(stringsimmatrix(x,y,method="osa", useBytes=FALSE), "matrix")) 57 | expect_equal(dim(stringsimmatrix(x,y,method="osa", useBytes=FALSE)), c(10, 10)) 58 | expect_equal(stringsimmatrix(x,y,method="osa", useBytes=FALSE)[2, 2], 0.2) 59 | expect_true(inherits(stringsimmatrix(x,method="osa", useBytes=FALSE), "matrix")) 60 | expect_equal(dim(stringsimmatrix(x,method="osa", useBytes=FALSE)), c(10, 10)) 61 | expect_equal(stringsimmatrix(x,method="osa", useBytes=FALSE)[2, 9], 0.2) 62 | expect_warning(stringdistmatrix(list('a'))) 63 | expect_warning(stringdistmatrix(list('a'),list('b'))) 64 | 65 | ## seq_sim 66 | 67 | # We used to have list(1:3, 2:4) and list(1:3). This occasionally 68 | # gave failing tests, and only in the context of expect_equal (both 69 | # for tinytest and testthat. Therefore this may point to a hard-to-reproduce 70 | # bug in R's JIT compiler. 71 | expect_equal( 72 | seq_sim(list(c(1,2,3),c(2,3,4)), list(c(1,2,3)), method="cosine") 73 | , stringsim(c("abc","bcd"),"abc", method="cosine") 74 | ) 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /pkg/R/doc_encoding.R: -------------------------------------------------------------------------------- 1 | #' @title 2 | #' String metrics in \pkg{stringdist} 3 | #' 4 | #' @description 5 | #' This page gives an overview of encoding handling in \pkg{stringst}. 6 | #' 7 | #' 8 | #' @section Encoding in \pkg{stringdist}: 9 | #' 10 | #' All character strings are stored as a sequence of bytes. An encoding 11 | #' system relates a byte, or a short sequence of bytes to a symbol. Over the years, many 12 | #' encoding systems have been developed, and not all OS's and softwares use the same encoding 13 | #' as default. Similarly, depending on the system R is running on, R may use a 14 | #' different encoding for storing strings internally. 15 | #' 16 | #' The \pkg{stringdist} package is designed so users in principle need not 17 | #' worry about this. Strings are converted to \code{UTF-32} (unsigned integer) 18 | #' by default prior to any further computation. This means that results are 19 | #' encoding-independent and that strings are interpreted as a sequence of 20 | #' symbols, not as a sequence of pure bytes. In functions where this is 21 | #' relevant, this may be switched by setting the \code{useBytes} option to 22 | #' \code{TRUE}. However, keep in mind that results will then likely depend on the 23 | #' system R is running on, except when your strings are pure ASCII. 24 | #' Also, for multi-byte encodings, results for byte-wise computations 25 | #' will usually differ from results using encoded computations. 26 | #' 27 | #' Prior to \pkg{stringdist} version 0.9, setting \code{useBytes=TRUE} could 28 | #' give a significant performance enhancement. Since version 0.9, translation 29 | #' to integer is done by C code internal to \pkg{stringdist} and the difference in 30 | #' performance is now negligible. 31 | #' 32 | #' @section Unicode normalisation: 33 | #' In \code{utf-8}, the same (accented) character may be represented as several byte sequences. For example, an u-umlaut 34 | #' can be represented with a single byte code or as a byte code representing \code{'u'} followed by a modifier byte code 35 | #' that adds the umlaut. The \href{https://cran.r-project.org/package=stringi}{stringi} package 36 | #' of Gagolevski and Tartanus offers unicode normalisation tools. 37 | #' 38 | #' @section Some tips on character encoding and transliteration: 39 | #' Some algorithms (like soundex) are defined only on the printable ASCII character set. This excludes any character 40 | #' with accents for example. Translating accented characters to the non-accented ones is a form of transliteration. On 41 | #' many systems running R (but not all!) you can achieve this with 42 | #' 43 | #' \code{iconv(x,to="ASCII//TRANSLIT")}, 44 | #' 45 | #' where \code{x} is your character vector. See the documentation of \code{\link[base]{iconv}} for details. 46 | #' 47 | #' The \code{stringi} package (Gagolewski and Tartanus) should work on any system. The command 48 | #' \code{stringi::stri_trans_general(x,"Latin-ASCII")} transliterates character vector \code{x} to ASCII. 49 | #' 50 | #' @references 51 | #' \itemize{ 52 | #' \item{The help page of \code{\link[base]{Encoding}}} describes how R handles encoding. 53 | #' \item{The help page of \code{\link[base]{iconv}} has a good overview of base R's 54 | #' encoding conversion options. The capabilities of \code{iconv} depend on the system R is running on. 55 | #' The \pkg{stringi} package offers platform-independent encoding and normalization tools.} 56 | #' } 57 | #' 58 | #' @seealso 59 | #' \itemize{ 60 | #' \item{Functions using re-encoding: \code{\link{stringdist}}, \code{\link{stringdistmatrix}}, \code{\link{amatch}}, \code{\link{ain}}, \code{\link{qgrams}}} 61 | #' \item{Encoding related: \code{\link{printable_ascii}}} 62 | #' } 63 | #' @name stringdist-encoding 64 | #' @rdname stringdist-encoding 65 | {} 66 | 67 | -------------------------------------------------------------------------------- /pkg/src/jaro.c: -------------------------------------------------------------------------------- 1 | /* stringdist - a C library of string distance algorithms with an interface to R. 2 | * Copyright (C) 2013 Mark van der Loo 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | * 17 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 18 | */ 19 | 20 | #include "utils.h" 21 | #include 22 | 23 | 24 | // Winkler's l-factor (nr of matching characters at beginning of the string). 25 | static double get_l(unsigned int *a, unsigned int *b, int n){ 26 | int i=0; 27 | double l; 28 | while ( a[i] == b[i] && i < n ){ 29 | i++; 30 | } 31 | l = (double) i; 32 | return l; 33 | } 34 | 35 | 36 | /* jaro distance (see http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance). 37 | * 38 | * a : string (in int rep) 39 | * b : string (in int rep) 40 | * x : length of a (in uints) 41 | * y : length of b (in uints) 42 | * p : Winkler's p-factor in (0,0.25) 43 | * bt : Winkler's (undocumented?) boost threshold 44 | * work : workspace, minimally of length x + y 45 | * 46 | */ 47 | double jaro_winkler_dist( 48 | unsigned int *a 49 | , int x 50 | , unsigned int *b 51 | , int y 52 | , double p 53 | , double bt 54 | , double *w 55 | , double *work 56 | ){ 57 | 58 | 59 | // edge case 60 | if ( x == 0 && y == 0 ) return 0; 61 | 62 | for (int k=0; k < x + y; k++) work[k] = 0; 63 | 64 | // we need space for integers (or do a lot of conversions) 65 | unsigned int *wrk = (unsigned int*) work; 66 | unsigned int *matcha = wrk 67 | , *matchb = wrk + x; 68 | unsigned int left, right; 69 | 70 | // number of matches 71 | int m = 0; 72 | // max transposition distance 73 | int M = MAX(MAX(x,y)/2 - 1,0); 74 | 75 | // store the match indices. Indices are stored as i+1 because 0 is used as 'no match' 76 | for ( int i = 0; i < x; ++i){ 77 | left = MAX(0,i-M); 78 | right = MIN(y,i+M); 79 | for ( int j = left; j <= right; j++){ 80 | if ((a[i] == b[j]) && (matchb[j]==0)){ 81 | matcha[i] = i+1; 82 | matchb[j] = j+1; 83 | m += 1; 84 | break; 85 | } 86 | } 87 | } 88 | 89 | // copy matches so they're easy to compare for transposition counting 90 | int j = 0; 91 | for (int i=0; i < x; ++i){ 92 | if (matcha[i]){ 93 | matcha[j] = a[matcha[i]-1]; 94 | ++j; 95 | } 96 | } 97 | j = 0; 98 | for (int i=0; i < y; ++i){ 99 | if (matchb[i]){ 100 | matchb[j] = b[matchb[i]-1]; 101 | ++j; 102 | } 103 | } 104 | 105 | // count 'transpositions', the Jaro way. 106 | double t = 0.0; 107 | for ( int k=0; k 0 && d > bt ){ 119 | int n = MIN(MIN(x,y),4); 120 | d = d - get_l(a,b,n)*p*d; 121 | } 122 | 123 | return d; 124 | } 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /pkg/src/utils.h: -------------------------------------------------------------------------------- 1 | /* stringdist - a C library of string distance algorithms with an interface to R. 2 | * Copyright (C) 2013 Mark van der Loo 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | * 17 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 18 | */ 19 | 20 | #ifndef sd_utils_h 21 | #define sd_utils_h 22 | 23 | #define USE_RINTERNALS 24 | #include 25 | #include 26 | #include 27 | 28 | /* integer recycling macro */ 29 | #define RECYCLE(X,Y) ( (X) == (Y) ? 0 : (X) ) 30 | 31 | #define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) 32 | #define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) 33 | #define ABS(X) ((X)<0 ? -1*(X) : (X)) 34 | 35 | 36 | /* A structure for storing integer reps of strings */ 37 | typedef struct { 38 | // array of pointers to integer representation of strings, stored in data. 39 | unsigned int **string; 40 | // array of string lengths. 41 | int *str_len; 42 | // storage room for integer representation of strings 43 | unsigned int *data; 44 | } Stringset; 45 | 46 | 47 | /* size of longest object in a SEXP 48 | * 49 | * 50 | * 51 | */ 52 | unsigned int max_length(SEXP); 53 | 54 | /* Get element from SEXP list and determine some parameters. 55 | * 56 | * Input: 57 | * x: A list of integer vectors or a character vector 58 | * i: Index in x: what element to extract. 59 | * bytes: (boolean) if (bytes) then x is assumed to be a character vector. 60 | * 61 | * Output: 62 | * len : the length of the i'th object in x. 63 | * isna : wether the i'th object represents an NA 64 | * c : if (bytes) then c will contain the values of the i'th element in x, coerced to integers. 65 | * 66 | * Return value: 67 | * A pointer to the integer representation of the i'th object in x. 68 | * 69 | */ 70 | unsigned int *get_elem(SEXP x, R_xlen_t i, int bytes, int intdist, int *len, int *isna, unsigned int *c); 71 | 72 | 73 | /* (mutlithreaded) recycling. 74 | * 75 | * This avoids having to compute i % ni at every iteration while 76 | * recycling over a vector. 77 | * 78 | * In the default case, the counter i starts at omp_thread_num() and 79 | * is increased with omp_get_num_threads() (round robin parallelization). 80 | * If a counter is increased to above the length of a vector, it's value 81 | * is decreased with the vector length (recycling). 82 | * 83 | * There is an edge case when omp_get_num_threads() is less than the 84 | * vector's length. In that case, when i > the vector length, the 85 | * modulus is computed anyway. 86 | * 87 | * 88 | * Input: 89 | * i : integer, current index. 90 | * nthreads : number of threads working on the vector. 91 | * ni : vector length 92 | * 93 | * 94 | */ 95 | static inline int recycle(int i, int nthreads, int ni){ 96 | i += nthreads; 97 | if ( i >= ni ) 98 | i = (nthreads < ni) ? (i - ni) : (i % ni); 99 | return i; 100 | } 101 | 102 | /* Create a new stringset from an character vector. 103 | * 104 | * Translates character vectors to integers. Input is expected in utf-8 format. 105 | * Translation can be bytewise (bytes=1) or interpreted utf8. 106 | * 107 | * Output: Pointer to a Stringset. 108 | * 109 | * 110 | * 111 | */ 112 | Stringset *new_stringset(SEXP str, int bytes, int intdist); 113 | 114 | /* Clean up a Stringset. */ 115 | void free_stringset(Stringset *s); 116 | 117 | 118 | #endif 119 | -------------------------------------------------------------------------------- /pkg/R/qgrams.R: -------------------------------------------------------------------------------- 1 | #' Get a table of qgram counts from one or more character vectors. 2 | #' 3 | #' @section Details: 4 | #' The input is converted to \code{character}. If \code{useBytes=TRUE}, each element is 5 | #' converted to \code{utf8} and then to \code{integer} as in \code{\link{stringdist}}. 6 | #' Next,the data is passed to the underlying routine. 7 | #' 8 | #' Strings with less than \code{q} characters and elements containing \code{NA} are skipped. Using \code{q=0} 9 | #' therefore counts the number of empty strings \code{""} occuring in each argument. 10 | #' 11 | #' @param ... any number of (named) arguments, that will be coerced to character with \code{as.character}. 12 | #' @param q size of q-gram, must be non-negative. 13 | #' @param useBytes Determine byte-wise qgrams. \code{useBytes=TRUE} is faster but may yield different 14 | #' results depending on character encoding. For \code{ASCII} it is identical. See also \code{\link{stringdist}} under Encoding issues. 15 | #' @param useNames Add q-grams as column names. If \code{useBytes=useNames=TRUE}, the q-byte sequences are represented as 2 hexadecimal numbers 16 | #' per byte, separated by a vertical bar (\code{|}). 17 | #' @param .list Will be concatenated with the \code{...} argument(s). Useful for adding character vectors named \code{'q'} or \code{'useNames'}. 18 | #' @return A table with \eqn{q}-gram counts. Detected \eqn{q}-grams are column names and the argument names as row names. 19 | #' If no argument names were provided, they will be generated. 20 | #' 21 | #' @seealso \code{\link{stringdist}}, \code{\link{amatch}} 22 | #' 23 | #' @example ../examples/qgrams.R 24 | #' @export 25 | qgrams <- function(..., .list=NULL,q=1L,useBytes=FALSE, useNames=!useBytes){ 26 | stopifnot(is.numeric(q), length(q)==1, !is.na(q), q>=0) 27 | q <- as.integer(q) 28 | 29 | if (!is.null(.list) && length(.list) == 0) .list=NULL 30 | L <- lapply(c(list(...),.list), as.character) 31 | if (length(L) == 0) return(array(dim=c(0,0))) 32 | L <- setnames(L) 33 | 34 | if (q==0){ 35 | return( matrix( sapply(L,function(x) sum(x=="")) 36 | , ncol=1 37 | , dimnames = list(names(L), NULL)) ) 38 | } 39 | 40 | 41 | L <- lapply(L,char2int) 42 | 43 | v <- .Call("R_get_qgrams",L,as.integer(q),PACKAGE="stringdist") 44 | 45 | nqgrams <- length(v)/length(L) 46 | qgrams <- NULL 47 | if (useNames){ 48 | if ( q == 0 ){ 49 | qgrams = "" 50 | } else { 51 | Q <- attr(v,"qgrams") 52 | attr(v,"qgrams") <- NULL 53 | A <- array(Q,dim=c(q, nqgrams)) 54 | qgrams = if( useBytes ){ 55 | apply(A,2,function(x) paste(as.raw(x),collapse="|")) 56 | } else { 57 | enc2native(apply(A,2,intToUtf8)) 58 | } 59 | } 60 | } 61 | array(v, 62 | dim=c(length(L), nqgrams), 63 | dimnames = list( 64 | names(L), 65 | qgrams 66 | ) 67 | ) 68 | } 69 | 70 | 71 | setnames <- function(x){ 72 | list_names <- names(x) 73 | generic_names <- paste0("V",seq_along(x)) 74 | if (is.null(list_names)){ 75 | return(setNames(x,generic_names)) 76 | } 77 | I <- list_names == "" 78 | names(x)[I] <- generic_names[I] 79 | x 80 | } 81 | 82 | #' Get a table of qgram counts for integer sequences 83 | #' 84 | #' 85 | #' @param ... Any number of (named) arguments that will be coerced with \code{as.integer} 86 | #' @param .list Will be concatenated with the \code{...} argument(s). Useful for adding integer vectors named 'q'. 87 | #' @param q The size of q-gramming. 88 | #' 89 | #' @return 90 | #' A \code{matrix} containing q-gram profiles. Columns 1 to \code{q} contain the 91 | #' encountered q-grams. The ensuing (named) columns contain the q-gram counts 92 | #' per vector. Run the example for a simple overview. 93 | #' 94 | #' Missing values in integer sequences are treated as any other number. 95 | #' 96 | #' @example ../examples/seq_qgrams.R 97 | #' 98 | #' @seealso \code{\link{seq_dist}}, \code{\link{seq_amatch}} 99 | #' 100 | #' @export 101 | #' 102 | seq_qgrams <- function(...,.list=NULL,q=1L){ 103 | L <- lapply(c(list(...),.list),function(x) list(as.integer(x))) 104 | if (length(L) == 0) return(array(dim=c(0,0))) 105 | L <- setnames(L) 106 | v <- .Call("R_get_qgrams",L,as.integer(q), PACKAGE="stringdist") 107 | Q <- attr(v,"qgrams") 108 | nqgrams <- length(v)/length(L) 109 | Q <- t(array(Q,dim=c(q,nqgrams),dimnames=list(paste0("q",1:q),NULL))) 110 | A <- matrix(v, nrow=nqgrams, ncol=length(L),byrow=TRUE,dimnames=list(NULL,paste0("n.",names(L)))) 111 | cbind(Q,A) 112 | } 113 | -------------------------------------------------------------------------------- /pkg/src/dl.c: -------------------------------------------------------------------------------- 1 | /* stringdist - a C library of string distance algorithms with an interface to R. 2 | * Copyright (C) 2013 Mark van der Loo 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | * 17 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 18 | * 19 | * 20 | * This code is gratefully based on Nick Logan's github repository 21 | * https://github.com/ugexe/Text--Levenshtein--Damerau--XS/blob/master/damerau-int.c 22 | * 23 | * 24 | * Changes/additions wrt original code: 25 | * - Added R.h, Rdefines.h inclusion 26 | * - Added R interface function 27 | * - Added edit weights (function is now of type double) 28 | * - Added corner cases for length-zero strings. 29 | * - Replaced linked list dictionary with fixed-size struct for loop 30 | * externalization of memory allocation. 31 | * - Externalized allocation of dynamic programming matrix. 32 | * 33 | * 34 | */ 35 | 36 | 37 | #include "utils.h" 38 | #include "dictionary.h" 39 | 40 | /* 41 | static void print_dict(dictionary *d){ 42 | for ( int i=0; ilength; i++){ 43 | Rprintf("d[%d] = %d; ", i, d->key[i]); 44 | } 45 | Rprintf("\n"); 46 | } 47 | */ 48 | 49 | static void reset_dictionary(dictionary *d){ 50 | int nbytes = sizeof(unsigned int)*(d->length); 51 | memset(d->key , 0, nbytes); 52 | memset(d->value, 0, nbytes); 53 | } 54 | 55 | dictionary *new_dictionary(unsigned int length){ 56 | dictionary *d = (dictionary *) malloc(sizeof(dictionary)); 57 | if ( d == NULL ){ 58 | return NULL; 59 | } 60 | d->key = (unsigned int *) malloc(length*sizeof(int)); 61 | d->value = (unsigned int *) malloc(length*sizeof(int)); 62 | if ( d->key == NULL || d->value == NULL){ 63 | free(d->key); 64 | free(d->value); 65 | free(d); 66 | return NULL; 67 | } 68 | d->length = length; 69 | reset_dictionary(d); 70 | return d; 71 | } 72 | 73 | void free_dictionary(dictionary *d){ 74 | if ( d != NULL ){ 75 | free(d->key); 76 | free(d->value); 77 | free(d); 78 | } 79 | } 80 | 81 | 82 | static void uniquePush(dictionary *d, unsigned int key){ 83 | int i=0; 84 | while (d->key[i] && d->key[i] != key){ 85 | ++i; 86 | } 87 | d->key[i] = key; 88 | } 89 | 90 | static unsigned int which(dictionary *d, unsigned int key){ 91 | int i=0; 92 | while( d->key[i] != key ){ 93 | ++i; 94 | } 95 | return i; 96 | } 97 | 98 | 99 | // note: src (tgt) will be indexed to their x + 1 (y+1). 100 | double dl_dist( 101 | unsigned int *src, 102 | int x, 103 | unsigned int *tgt, 104 | int y, 105 | double *weight, 106 | dictionary *dict, 107 | double *scores 108 | ){ 109 | 110 | if (!x){ 111 | return (double) y * weight[1]; // ins weight 112 | } 113 | if (!y){ 114 | return (double) x * weight[0]; // del weight 115 | } 116 | 117 | unsigned int swapCount, targetCharCount,i,j; 118 | double delScore, insScore, subScore, swapScore; 119 | double score_ceil = x + y; 120 | 121 | /* intialize matrix start values */ 122 | scores[0] = score_ceil; 123 | scores[1 * (y + 2) + 0] = weight[0]; //score_ceil; 124 | scores[0 * (y + 2) + 1] = weight[1]; //score_ceil; 125 | scores[1 * (y + 2) + 1] = 0; 126 | 127 | uniquePush(dict,src[0]); 128 | uniquePush(dict,tgt[0]); 129 | 130 | /* work loops */ 131 | /* i = src index */ 132 | /* j = tgt index */ 133 | for(i=1;i<=x;i++){ 134 | uniquePush(dict,src[i]); 135 | scores[(i+1) * (y + 2) + 1] = i * weight[0]; 136 | scores[(i+1) * (y + 2) + 0] = score_ceil; 137 | swapCount = 0; 138 | 139 | for(j=1;j<=y;j++){ 140 | if(i == 1) { 141 | uniquePush(dict,tgt[j]); 142 | scores[1 * (y + 2) + (j + 1)] = j * weight[1]; 143 | scores[0 * (y + 2) + (j + 1)] = score_ceil; 144 | } 145 | targetCharCount = dict->value[which(dict, tgt[j-1])]; 146 | swapScore = scores[targetCharCount * (y + 2) + swapCount] + (i - targetCharCount - 1 + j - swapCount) * weight[3]; 147 | 148 | if(src[i-1] != tgt[j-1]){ 149 | subScore = scores[i * (y + 2) + j] + weight[2]; 150 | insScore = scores[(i+1) * (y + 2) + j] + weight[1]; 151 | delScore = scores[i * (y + 2) + (j + 1)] + weight[0]; 152 | scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore, MIN(delScore, MIN(insScore, subScore) )); 153 | } else { 154 | swapCount = j; 155 | scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore); 156 | } 157 | } 158 | 159 | dict->value[which(dict,src[i-1])] = i; 160 | } 161 | 162 | double score = scores[(x+1) * (y + 2) + (y + 1)]; 163 | reset_dictionary(dict); 164 | return score; 165 | } 166 | 167 | 168 | -------------------------------------------------------------------------------- /pkg/src/stringdist.c: -------------------------------------------------------------------------------- 1 | 2 | /* stringdist - a C library of string distance algorithms with an interface to R. 3 | * Copyright (C) 2013 Mark van der Loo 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see . 17 | * 18 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include "dist.h" 25 | #include "stringdist.h" 26 | 27 | #define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) 28 | 29 | // todo: remove spurious include. 30 | #include 31 | 32 | /* 33 | * 34 | * 35 | * 36 | */ 37 | Stringdist *open_stringdist(Distance d, int str_len_a, int str_len_b, ...){ 38 | va_list args; 39 | va_start(args, str_len_b); 40 | 41 | Stringdist *S = (Stringdist *) malloc(sizeof(Stringdist)); 42 | (*S) = (Stringdist) {d, NULL, NULL, NULL, NULL, 0L, 0.0, 0.0, 0L}; 43 | switch (d){ 44 | case osa : 45 | S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) * sizeof(double)); 46 | S->weight = (double *) malloc(4*sizeof(double)); 47 | memcpy(S->weight, va_arg(args, double *), 4*sizeof(double)); 48 | break; 49 | case lv : 50 | S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) *sizeof(double)); 51 | S->weight = (double *) malloc(3 * sizeof(double)); 52 | memcpy(S->weight, va_arg(args, double *), 3*sizeof(double)); 53 | break; 54 | case dl : 55 | S->dict = new_dictionary( str_len_a + str_len_b + 1); 56 | S->work = (double *) malloc( (str_len_a + 3) * (str_len_b + 3) * sizeof(double)); 57 | S->weight = (double *) malloc(4*sizeof(double)); 58 | memcpy(S->weight, va_arg(args, double *), 4*sizeof(double)); 59 | break; 60 | case hamming : 61 | break; 62 | case lcs : 63 | S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) *sizeof(double)); 64 | break; 65 | case qgram : 66 | S->q = va_arg(args, unsigned int); 67 | S->tree = new_qtree(S->q, 2L); 68 | break; 69 | case cosine : 70 | S->q = va_arg(args, unsigned int); 71 | S->tree = new_qtree(S->q, 2L); 72 | break; 73 | case jaccard : 74 | S->q = va_arg(args, unsigned int); 75 | S->tree = new_qtree(S->q, 2L); 76 | break; 77 | case running_cosine : 78 | S->q = va_arg(args, unsigned int); 79 | S->tree = new_qtree(S->q, 2L); 80 | S->work = (double *) malloc(3*sizeof(double)); 81 | break; 82 | case jw : 83 | S->work = (double *) malloc( sizeof(double) * (str_len_a+str_len_b)); 84 | S->weight = (double *) malloc(3L*sizeof(double)); 85 | memcpy(S->weight, va_arg(args, double *), 3*sizeof(double)); 86 | S->p = va_arg(args, double); 87 | S->bt = va_arg(args, double); 88 | break; 89 | case soundex : 90 | break; 91 | default : 92 | break; 93 | }; 94 | 95 | va_end(args); 96 | 97 | if ( (d == osa || d == lv || d == dl || d == lcs || d== jw) && S->work == NULL ){ 98 | close_stringdist(S); 99 | return(NULL); 100 | } 101 | return S; 102 | 103 | } 104 | 105 | void close_stringdist(Stringdist *S){ 106 | free(S->work); 107 | free(S->weight); 108 | 109 | if (S->distance == dl){ 110 | free_dictionary(S->dict); 111 | } 112 | if (S->distance == qgram || S->distance == cosine || S->distance == jaccard){ 113 | free_qtree(); 114 | } 115 | free(S); 116 | } 117 | 118 | void reset_stringdist(Stringdist *S){ 119 | if (S->distance == running_cosine){ 120 | free_qtree(); 121 | S->tree = new_qtree(S->q, 2L); 122 | } 123 | } 124 | 125 | 126 | 127 | 128 | 129 | double stringdist(Stringdist *S, unsigned int *str_a, int len_a, unsigned int *str_b, int len_b){ 130 | double d = -1.0; 131 | 132 | switch(S->distance){ 133 | case osa : 134 | return osa_dist(str_a, len_a, str_b, len_b, S->weight, S->work); 135 | case lv : 136 | return lv_dist( str_a, len_a, str_b, len_b, S->weight, S->work); 137 | case dl : 138 | return dl_dist(str_a, len_a, str_b, len_b, S->weight, S->dict, S->work); 139 | case hamming : 140 | return hamming_dist(str_a, len_a, str_b, len_b); 141 | case lcs : 142 | return lcs_dist(str_a, len_a, str_b, len_b, S->work); 143 | case qgram : 144 | return qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 0L); 145 | case cosine : 146 | return qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 1L); 147 | case jaccard : 148 | return qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 2L); 149 | case running_cosine: 150 | return running_cosine_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, S->work); 151 | case jw : 152 | return jaro_winkler_dist(str_a, len_a, str_b, len_b, S->p, S->bt, S->weight, S->work); 153 | case soundex : 154 | return soundex_dist(str_a, len_a, str_b, len_b, &(S->ifail)); 155 | default : 156 | break; 157 | // set errno, return -1 158 | } 159 | return d; 160 | } 161 | 162 | 163 | 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /pkg/R/seqdist.R: -------------------------------------------------------------------------------- 1 | #' Compute distance metrics between integer sequences 2 | #' 3 | #' \code{seq_dist} computes pairwise string distances between elements of 4 | #' \code{a} and \code{b}, where the argument with less elements is recycled. 5 | #' \code{seq_distmatrix} computes the distance matrix with rows according to 6 | #' \code{a} and columns according to \code{b}. 7 | #' 8 | #' 9 | #' @section Notes: 10 | #' Input vectors are converted with \code{as.integer}. This causes truncation for numeric 11 | #' vectors (e.g. \code{pi} will be treated as \code{3L}). 12 | #' 13 | #' @param a (\code{list} of) \code{integer} or \code{numeric} vector(s). Will be converted with \code{as.integer} (target) 14 | #' @param b (\code{list} of) \code{integer} or \code{numeric} vector(s). Will be converted with \code{as.integer} (source). 15 | #' Optional for \code{seq_distmatrix}. 16 | #' @param method Distance metric. See \code{\link{stringdist-metrics}} 17 | #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for 18 | #' deletion, insertion, substitution and transposition, in that order. When 19 | #' \code{method='lv'}, the penalty for transposition is ignored. When 20 | #' \code{method='jw'}, the weights associated with characters of \code{a}, 21 | #' characters from \code{b} and the transposition weight, in that order. 22 | #' Weights must be positive and not exceed 1. \code{weight} is ignored 23 | #' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'}, 24 | #' \code{'Jaccard'}, or \code{'lcs'} 25 | #' @param q Size of the \eqn{q}-gram; must be nonnegative. Only applies to 26 | #' \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}. 27 | #' @param p Prefix factor for Jaro-Winkler distance. The valid range for 28 | #' \code{p} is \code{0 <= p <= 0.25}. If \code{p=0} (default), the 29 | #' Jaro-distance is returned. Applies only to \code{method='jw'}. 30 | #' @param bt Winkler's boost threshold. Winkler's prefix factor is 31 | #' only applied when the Jaro distance is larger than \code{bt} 32 | #' Applies only to \code{method='jw'} and \code{p>0}. 33 | #' @param nthread Maximum number of threads to use. By default, a sensible 34 | #' number of threads is chosen, see \code{\link{stringdist-parallelization}}. 35 | #' 36 | #' @return 37 | #' 38 | #' \code{seq_dist} returns a numeric vector with pairwise distances between \code{a} 39 | #' and \code{b} of length \code{max(length(a),length(b)}. 40 | #' 41 | #' For \code{seq_distmatrix} there are two options. If \code{b} is missing, the 42 | #' \code{\link[stats]{dist}} object corresponding to the \code{length(a) X 43 | #' length(a)} distance matrix is returned. If \code{b} is specified, the 44 | #' \code{length(a) X length(b)} distance matrix is returned. 45 | #' 46 | #' If any element of \code{a} or \code{b} is \code{NA_integer_}, the distance with 47 | #' any matched integer vector will result in \code{NA}. Missing values in the sequences 48 | #' themselves are treated as a number and not treated specially (Also see the examples). 49 | #' 50 | #' @seealso \code{\link{seq_sim}}, \code{\link{seq_amatch}}, \code{\link{seq_qgrams}} 51 | #' 52 | #' @example ../examples/seq_dist.R 53 | #' @export 54 | seq_dist <- function(a, b 55 | , method=c("osa","lv","dl","hamming","lcs", "qgram","cosine","jaccard","jw") 56 | , weight=c(d=1,i=1,s=1,t=1) 57 | , q=1, p=0, bt=0 58 | , nthread = getOption("sd_num_thread") 59 | ){ 60 | a <- ensure_int_list(a) 61 | b <- ensure_int_list(b) 62 | 63 | stopifnot( 64 | all(is.finite(weight)) 65 | , all(weight > 0) 66 | , all(weight <=1) 67 | , q >= 0 68 | , p <= 0.25 69 | , p >= 0 70 | , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) 71 | , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) 72 | , length(nthread) == 1 73 | , is.numeric(nthread) 74 | , nthread > 0 75 | ) 76 | 77 | 78 | if (length(a) == 0 || length(b) == 0){ 79 | return(numeric(0)) 80 | } 81 | if ( max(length(a),length(b)) %% min(length(a),length(b)) != 0 ){ 82 | warning(RECYCLEWARNING) 83 | } 84 | method <- match.arg(method) 85 | nthread <- as.integer(nthread) 86 | if (method == 'jw') weight <- weight[c(2,1,3)] 87 | do_dist(a=b, b=a 88 | , method=method 89 | , weight=weight 90 | , q=q 91 | , p=p 92 | , bt=bt 93 | , nthread=nthread) 94 | } 95 | 96 | #' @param useNames label the output matrix with \code{names(a)} and \code{names(b)}? 97 | #' @rdname seq_dist 98 | #' @export 99 | seq_distmatrix <- function(a, b 100 | , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw") 101 | , weight=c(d=1,i=1,s=1,t=1), q=1, p=0, bt=0 102 | , useNames=c("names","none") 103 | , nthread = getOption("sd_num_thread") 104 | ){ 105 | useNames <- match.arg(useNames) 106 | method <- match.arg(method) 107 | nthread <- as.integer(nthread) 108 | if (method == 'jw') weight <- weight[c(2,1,3)] 109 | 110 | a <- ensure_int_list(a) 111 | 112 | # if b is missing, generate a 'dist' object. 113 | if (missing(b)){ 114 | return( lower_tri(a 115 | , method=method 116 | , weight=weight 117 | , q=q 118 | , p=p 119 | , bt=bt 120 | , useNames=useNames 121 | , nthread=nthread) 122 | ) 123 | } 124 | 125 | b <- ensure_int_list(b) 126 | if (length(a) == 0 || length(b) == 0){ 127 | return(matrix(numeric(0))) 128 | } 129 | 130 | if (useNames == "names"){ 131 | rowns <- names(a) 132 | colns <- names(b) 133 | } 134 | 135 | 136 | x <- vapply(b 137 | , function(src) do_dist(list(src), b=a, method=method, weight=weight, q=q, p=p,bt=bt, nthread=nthread) 138 | , USE.NAMES=FALSE, FUN.VALUE=numeric(length(a)) 139 | ) 140 | 141 | if (useNames == "names" ){ 142 | structure(matrix(x,nrow=length(a),ncol=length(b), dimnames=list(rowns,colns))) 143 | } else { 144 | matrix(x,nrow=length(a),ncol=length(b)) 145 | } 146 | 147 | } 148 | 149 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![CRAN](http://www.r-pkg.org/badges/version/stringdist)](http://cran.r-project.org/web/packages/stringdist/NEWS) 3 | [![status](https://tinyverse.netlify.com/badge/stringdist)](https://CRAN.R-project.org/package=stringdist) 4 | [![Downloads](http://cranlogs.r-pkg.org/badges/stringdist)](http://cran.r-project.org/package=stringdist/)[![Research software impact](http://depsy.org/api/package/cran/stringdist/badge.svg)](http://depsy.org/package/r/stringdist)[![Mentioned in Awesome Official Statistics ](https://awesome.re/mentioned-badge.svg)](http://www.awesomeofficialstatistics.org) 5 | 6 | 7 | 8 | 9 | 10 | 11 | ## stringdist 12 | 13 | * Approximate matching, fuzzy text search, and string distance calculations for R. 14 | * All distance and matching operations are system- and encoding-independent. 15 | * Built for speed, using [openMP](https://www.openmp.org/) for parallel computing. 16 | 17 | 18 | ## Citing 19 | 20 | Please cite the [R-Journal article](https://journal.r-project.org/archive/2014/RJ-2014-011/index.html) 21 | 22 | ``` 23 | @article{RJ-2014-011, 24 | author = {Mark P.J. van der Loo}, 25 | title = {{The stringdist Package for Approximate String Matching}}, 26 | year = {2014}, 27 | journal = {{The R Journal}}, 28 | doi = {10.32614/RJ-2014-011}, 29 | url = {https://doi.org/10.32614/RJ-2014-011}, 30 | pages = {111--122}, 31 | volume = {6}, 32 | number = {1} 33 | } 34 | ``` 35 | 36 | ## Functionality 37 | 38 | The package offers the following main functions: 39 | 40 | * `stringdist` computes pairwise distances between two input character vectors (shorter one is recycled) 41 | * `stringdistmatrix` computes the distance matrix for one or two vectors 42 | * `stringsim` computes a string similarity between 0 and 1, based on `stringdist` 43 | * `amatch` is a fuzzy matching equivalent of R's native `match` function 44 | * `ain` is a fuzzy matching equivalent of R's native `%in%` operator 45 | * `afind` finds the location of fuzzy matches of a short string in a long string. 46 | * `seq_dist`, `seq_distmatrix`, `seq_amatch` and `seq_ain` for distances between, and matching of integer sequences. (see also the [hashr](https://github.com/markvanderloo/hashr) package). 47 | 48 | These functions are built upon `C`-code that re-implements some common (weighted) string 49 | distance functions. Distance functions include: 50 | 51 | * Hamming distance; 52 | * Levenshtein distance (weighted); 53 | * Restricted Damerau-Levenshtein distance (weighted, a.k.a. Optimal String Alignment); 54 | * Full Damerau-Levenshtein distance (weighted); 55 | * Longest Common Substring distance; 56 | * Q-gram distance 57 | * cosine distance for q-gram count vectors (= 1-cosine similarity) 58 | * Jaccard distance for q-gram count vectors (= 1-Jaccard similarity) 59 | * Jaro, and Jaro-Winkler distance 60 | * Soundex-based string distance. 61 | 62 | Also, there are some utility functions: 63 | 64 | * `qgrams()` tabulates the qgrams in one or more `character` vectors. 65 | * `seq_qrams()` tabulates the qgrams (somtimes called ngrams) in one or more `integer` vectors. 66 | * `phonetic()` computes phonetic codes of strings (currently only soundex) 67 | * `printable_ascii()` is a utility function that detects non-printable ascii or non-ascii characters. 68 | 69 | #### C API 70 | 71 | As of version `0.9.5.0` you can call a number of `stringdist` functions directly 72 | from the `C` code of your R package. The description of the API can be found 73 | 74 | - By typing `?stringdist_api` in the R console 75 | - By browsing the package's help index to `User guides, package vignettes and other documentation` and clicking on `doc/stringdist_api.pdf`. 76 | - Or you can find the file's location as follows 77 | 78 | ``` 79 | system.file("doc/stringdist_api.pdf", package="stringdist") 80 | ``` 81 | 82 | Examples of packages that link to `stringdist` can be found [here](https://github.com/markvanderloo/linkstringdist) and 83 | [here](https://github.com/ChrisMuir/refinr). 84 | 85 | 86 | 87 | 88 | #### Installation 89 | 90 | To install the latest release from CRAN, open an R terminal and type 91 | 92 | `install.packages('stringdist')` 93 | 94 | 95 | To obtain the package from the very latest source code open a `bash` terminal (or `git bash` if you work under Windows 96 | with `msysgit`) and type 97 | 98 | ``` 99 | git clone https://github.com/markvanderloo/stringdist.git 100 | cd stringdist 101 | bash ./build.bash 102 | R CMD INSTALL output/stringdist_*.tar.gz 103 | ``` 104 | 105 | Warning: the github version can change any time and may not even build properly. As most 106 | of the code is written in `C`, the development version may crash your `R`-session. 107 | 108 | 109 | 110 | #### Resources 111 | 112 | * A [paper](http://journal.r-project.org/archive/2014-1/loo.pdf) on stringdist has been published in the R-journal 113 | * [Slides](http://www.slideshare.net/MarkVanDerLoo/stringdist-use-r2014) of te _useR!2014_ conference. 114 | 115 | #### Note to users: deprecated arguments removed as of version 0.9.5.0 116 | 117 | The following arguments have been obsolete since 2015 and have been removed in the 0.9.5.0 release (spring 2018) 118 | 119 | * Argument `cluster` for function `stringdistmatrix`. 120 | * Argument `maxDist` for functions `stringdist` and `stringdistmatrix` (not `amatch`). 121 | * Argument `ncores` for function `stringdistmatrix` 122 | 123 | 124 | #### Note to users: deprecated arguments as of >= 0.9.0, >= 0.9.2 125 | 126 | Parallelization used to be based on R's ```parallel``` package, that works by spawning several R sessions in the background. As of version 0.9.0, ```stringdist``` uses the more efficient ```openMP``` protocol to parallelize everything under the hood. 127 | 128 | The following arguments have become obsolete and will be removed somewhere in 2016: 129 | * Argument `cluster` for function `stringdistmatrix`. 130 | * Argument `maxDist` for functions `stringdist` and `stringdistmatrix` (not `amatch`). 131 | * Argument `ncores` for function `stringdistmatrix` 132 | 133 | 134 | -------------------------------------------------------------------------------- /pkg/R/afind.R: -------------------------------------------------------------------------------- 1 | #' Stringdist-based fuzzy text search 2 | #' 3 | #' \code{afind} slides a window of fixed width over a string \code{x} and 4 | #' computes the distance between the each window and the sought-after 5 | #' \code{pattern}. The location, content, and distance corresponding to the 6 | #' window with the best match is returned. 7 | #' 8 | #' 9 | #' @param x strings to search in 10 | #' @param pattern strings to find (not a regular expression). For \code{grab}, 11 | #' \code{grabl}, and \code{extract} this must be a single string. 12 | #' @param window width of moving window. 13 | #' @param value toggle return matrix with matched strings. 14 | #' @inheritParams amatch 15 | #' 16 | #' @details 17 | #' Matching is case-sensitive. Both \code{x} and \code{pattern} are converted 18 | #' to \code{UTF-8} prior to search, unless \code{useBytes=TRUE}, in which case 19 | #' the distances are measured bytewise. 20 | #' 21 | #' Code is parallelized over the \code{x} variable: each value of \code{x} 22 | #' is scanned for every element in \code{pattern} using a separate thread (when \code{nthread} 23 | #' is larger than 1). 24 | #' 25 | #' The functions \code{grab} and \code{grabl} are approximate string matching 26 | #' functions that somewhat resemble base R's \code{\link[base]{grep}} and 27 | #' \code{\link[base:grep]{grepl}}. They are implemented as convenience wrappers 28 | #' of \code{afind}. 29 | #' 30 | #' @section Running cosine distance: 31 | #' This algorithm gains efficiency by using that two consecutive windows have 32 | #' a large overlap in their q-gram profiles. It gives the same result as 33 | #' the \code{"cosine"} distance, but much faster. 34 | #' 35 | #' 36 | #' @return 37 | #' For \code{afind}: a \code{list} of three matrices, each with 38 | #' \code{length(x)} rows and \code{length(pattern)} columns. In each matrix, 39 | #' element \eqn{(i,j)} corresponds to \code{x[i]} and \code{pattern[j]}. The 40 | #' names and description of each matrix is as follows. 41 | #' \itemize{ 42 | #' \item{\code{location}. \code{[integer]}, location of the start of best matching window. 43 | #' When \code{useBytes=FALSE}, this corresponds to the location of a \code{UTF} code point 44 | #' in \code{x}, possibly after conversion from its original encoding.} 45 | #' \item{\code{distance}. \code{[character]}, the string distance between pattern and 46 | #' the best matching window.} 47 | #' \item{\code{match}. \code{[character]}, the first, best matching window.} 48 | #' 49 | #' } 50 | #' 51 | #' @family matching 52 | #' 53 | #' @examples 54 | #' texts = c("When I grow up, I want to be" 55 | #' , "one of the harvesters of the sea" 56 | #' , "I think before my days are gone" 57 | #' , "I want to be a fisherman") 58 | #' patterns = c("fish", "gone","to be") 59 | #' 60 | #' afind(texts, patterns, method="running_cosine", q=3) 61 | #' 62 | #' grabl(texts,"grew", maxDist=1) 63 | #' extract(texts, "harvested", maxDist=3) 64 | #' 65 | #' 66 | #' @export 67 | afind <- function(x, pattern, window=NULL 68 | , value=TRUE 69 | , method = c("osa","lv","dl","hamming","lcs", "qgram","cosine","running_cosine","jaccard","jw","soundex") 70 | , useBytes = FALSE 71 | , weight=c(d=1,i=1,s=1,t=1) 72 | , q = 1 73 | , p = 0 74 | , bt = 0 75 | , nthread = getOption("sd_num_thread") 76 | ){ 77 | 78 | stopifnot( 79 | all(is.finite(weight)) 80 | , all(weight > 0) 81 | , all(weight <=1) 82 | , is.null(window) || window >= 1 83 | , q >= 0 84 | , p <= 0.25 85 | , p >= 0 86 | , is.logical(useBytes) && !is.na(useBytes) 87 | , is.logical(value) && !is.na(value) 88 | , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) 89 | , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) 90 | , length(nthread) == 1 91 | , is.numeric(nthread) 92 | , nthread > 0 93 | ) 94 | 95 | x <- as.character(x) 96 | pattern <- as.character(pattern) 97 | if ( !useBytes ){ 98 | x <- enc2utf8(x) 99 | pattern <- enc2utf8(pattern) 100 | } 101 | 102 | if (is.null(window)){ 103 | window = nchar(pattern, type = if (useBytes) "bytes" else "char") 104 | } 105 | 106 | if (length(x) == 0) return(numeric(0)) 107 | 108 | method <- match.arg(method) 109 | if (method == 'jw') weight <- weight[c(2,1,3)] 110 | 111 | 112 | method <- METHODS[method] 113 | if ( is.na(method) ){ 114 | stop(sprintf("method '%s' is not defined",method)) 115 | } 116 | 117 | L <- .Call("R_afind" 118 | , x 119 | , pattern 120 | , as.integer(window) 121 | , method 122 | , as.double(weight) 123 | , as.double(p) 124 | , as.double(bt) 125 | , as.integer(q) 126 | , as.integer(useBytes) 127 | , as.integer(nthread) 128 | , PACKAGE="stringdist") 129 | 130 | names(L) <- c("location", "distance") 131 | 132 | if (isTRUE(value)){ 133 | matches = sapply(seq_along(pattern), function(i){ 134 | substr(x, L[[1]][,i], L[[1]][,i] + window[i]-1) 135 | }) 136 | L$match <- matrix(matches, nrow=length(x)) 137 | } 138 | 139 | L 140 | } 141 | 142 | 143 | 144 | 145 | #' @rdname afind 146 | #' @param ... passed to \code{afind}. 147 | #' @param maxDist Only windows with distance \code{<= maxDist} are considered a match. 148 | #' @return 149 | #' For \code{grab}, an \code{integer} vector, indicating in which elements of 150 | #' \code{x} a match was found with a distance \code{<= maxDist}. The matched 151 | #' values when \code{value=TRUE} (equivalent to \code{\link[base]{grep}}). 152 | #' @export 153 | grab <- function(x, pattern, maxDist=Inf, value=FALSE, ...){ 154 | stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1) 155 | L <- afind(x, pattern, value=value, ...) 156 | if (!value){ 157 | which(L$distance <= maxDist) 158 | } else { 159 | L$match[L$distance <= maxDist ] 160 | } 161 | } 162 | 163 | #' @rdname afind 164 | #' @param ... passed to \code{afind}. 165 | #' @return 166 | #' For \code{grabl}, a \code{logical} vector, indicating in which elements of 167 | #' \code{x} a match was found with a distance \code{<= maxDist}. (equivalent 168 | #' to \code{\link[base:grep]{grepl}}). 169 | #' @export 170 | grabl <- function(x, pattern, maxDist=Inf, ...){ 171 | stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1) 172 | L <- afind(x, pattern, value=FALSE, ...) 173 | as.logical(L$distance <= maxDist) 174 | } 175 | 176 | 177 | #' @rdname afind 178 | #' 179 | #' @return 180 | #' For \code{extract}, a \code{character} matrix with \code{length(x)} rows and 181 | #' \code{length(pattern)} columns. If match was found, element \eqn{(i,j)} 182 | #' contains the match, otherwise it is set to \code{NA}. 183 | #' @export 184 | extract <- function(x, pattern, maxDist = Inf, ...){ 185 | stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1) 186 | L <- afind(x, pattern, value=TRUE, ...) 187 | out <- L$match 188 | out[L$distance > maxDist] <- NA_character_ 189 | out 190 | } 191 | 192 | 193 | -------------------------------------------------------------------------------- /pkg/R/stringsim.R: -------------------------------------------------------------------------------- 1 | #' Compute similarity scores between strings 2 | #' 3 | #' \code{stringsim} computes pairwise string similarities between elements of 4 | #' \code{character} vectors \code{a} and \code{b}, where the vector with less 5 | #' elements is recycled. 6 | #' \code{stringsimmatrix} computes the string similarity matrix with rows 7 | #' according to \code{a} and columns according to \code{b}. 8 | #' 9 | #' @param a R object (target); will be converted by \code{as.character}. 10 | #' @param b R object (source); will be converted by \code{as.character}. 11 | #' @param method Method for distance calculation. The default is \code{"osa"}, 12 | #' see \code{\link{stringdist-metrics}}. 13 | #' @param useBytes Perform byte-wise comparison, see \code{\link{stringdist-encoding}}. 14 | #' @param q Size of the \eqn{q}-gram; must be nonnegative. Only applies to 15 | #' \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}. 16 | #' @param ... additional arguments are passed on to \code{\link{stringdist}} and 17 | #' \code{\link{stringdistmatrix}} respectively. 18 | #' @return 19 | #' \code{stringsim} returns a vector with similarities, which are values between 20 | #' 0 and 1 where 1 corresponds to perfect similarity (distance 0) and 0 to 21 | #' complete dissimilarity. \code{NA} is returned when \code{\link{stringdist}} 22 | #' returns \code{NA}. Distances equal to \code{Inf} are truncated to a 23 | #' similarity of 0. \code{stringsimmatrix} works the same way but, equivalent to 24 | #' \code{\link{stringdistmatrix}}, returns a similarity matrix instead of a 25 | #' vector. 26 | #' 27 | #' @details 28 | #' The similarity is calculated by first calculating the distance using 29 | #' \code{\link{stringdist}}, dividing the distance by the maximum 30 | #' possible distance, and substracting the result from 1. 31 | #' This results in a score between 0 and 1, with 1 32 | #' corresponding to complete similarity and 0 to complete dissimilarity. 33 | #' Note that complete similarity only means equality for distances satisfying 34 | #' the identity property. This is not the case e.g. for q-gram based distances 35 | #' (for example if q=1, anagrams are completely similar). 36 | #' For distances where weights can be specified, the maximum distance 37 | #' is currently computed by assuming that all weights are equal to 1. 38 | #' 39 | #' @example ../examples/stringsim.R 40 | #' @export 41 | stringsim <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs", 42 | "qgram", "cosine", "jaccard", "jw", "soundex"), useBytes=FALSE, q = 1, ...) { 43 | # Calculate the distance 44 | method <- match.arg(method) 45 | dist <- stringdist::stringdist(a, b, method=method, useBytes=useBytes, q=q, ...) 46 | 47 | nctype <- if (useBytes) "bytes" else "char" 48 | normalize_dist(dist, a, b, method=method, nctype=nctype, q=q) 49 | } 50 | 51 | 52 | #' @rdname stringsim 53 | #' @export 54 | stringsimmatrix <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs", 55 | "qgram", "cosine", "jaccard", "jw", "soundex"), useBytes=FALSE, q = 1, ...) { 56 | # Calculate the distance 57 | method <- match.arg(method) 58 | nctype <- if (useBytes) "bytes" else "char" 59 | if (missing(b)){ 60 | dist <- stringdist::stringdistmatrix(a, method=method, useBytes=useBytes, q=q, ...) 61 | normalize_dist(dist, a= rep(a,length(a)), b = rep(a,each=length(a)), method=method, nctype=nctype, q=q) 62 | } else { 63 | dist <- stringdist::stringdistmatrix(a, b, method=method, useBytes=useBytes, q=q, ...) 64 | normalize_dist(dist, a=rep(a,length(b)), b=rep(b,each=length(a)), method=method, nctype=nctype, q=q) 65 | } 66 | } 67 | 68 | 69 | #' Compute similarity scores between sequences of integers 70 | #' 71 | #' @param a \code{list} of \code{integer} vectors (target) 72 | #' @param b \code{list} of \code{integer} vectors (source). Optional for 73 | #' \code{seq_distmatrix}. 74 | #' @param method Method for distance calculation. The default is \code{"osa"}, 75 | #' see \code{\link{stringdist-metrics}}. 76 | #' @param q Size of the \eqn{q}-gram; must be nonnegative. Only applies to 77 | #' \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}. 78 | #' @param ... additional arguments are passed on to \code{\link{seq_dist}}. 79 | #' 80 | #' @return 81 | #' A \code{numeric} vector of length \code{max(length(a),length(b))}. If one of the 82 | #' entries in \code{a} or \code{b} is \code{NA_integer_}, all comparisons with that 83 | #' element result in \code{NA}. Missings occurring within the sequences are treated 84 | #' as an ordinary number (the representation of \code{NA_integer_}). 85 | #' 86 | #' @example ../examples/seq_sim.R 87 | #' @seealso \code{\link{seq_dist}}, \code{\link{seq_amatch}} 88 | #' @export 89 | seq_sim <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs", 90 | "qgram", "cosine", "jaccard", "jw"), q = 1, ...) { 91 | 92 | method <- match.arg(method) 93 | dist <- stringdist::seq_dist(a, b, method=method, q=q, ...) 94 | normalize_dist(dist, a, b, method=method, q=q) 95 | } 96 | 97 | 98 | #### HELPER FUNCTIONS --------------------------------------------------------- 99 | 100 | # get lengths of sequences (internal function) 101 | lengths <- function(x,...){ 102 | UseMethod("lengths") 103 | } 104 | 105 | lengths.character <- function(x, type="char",...){ 106 | nchar(x,type=type) 107 | } 108 | 109 | lengths.list <- function(x,...){ 110 | .Call("R_lengths",x, PACKAGE="stringdist") 111 | } 112 | 113 | normalize_dist <- function(dist, a, b, method, nctype="char",q=1L){ 114 | 115 | if (inherits(dist, "dist")) dist <- as.matrix(dist) 116 | 117 | # Normalise the distance by dividing it by the maximum possible distance 118 | if (method == "hamming") { 119 | max_dist <- if (length(b) > length(a)) lengths(b,type=nctype) else lengths(a,type=nctype) 120 | max_dist[max_dist == 0] <- 1 121 | sim <- 1 - dist/max_dist 122 | } else if (method == "lcs") { 123 | max_dist <- lengths(a,type=nctype) + lengths(b,type=nctype) 124 | max_dist[max_dist == 0] <- 1 125 | sim <- 1 - dist/max_dist 126 | } else if (method == "lv") { 127 | max_dist <- pmax(lengths(a,type=nctype), lengths(b,type=nctype)) 128 | max_dist[max_dist == 0] <- 1 129 | sim <- 1 - dist/max_dist 130 | } else if (method == "osa") { 131 | max_dist <- pmax(lengths(a,type=nctype), lengths(b,type=nctype)) 132 | max_dist[max_dist == 0] <- 1 133 | sim <- 1 - dist/max_dist 134 | } else if (method == "dl") { 135 | max_dist <- pmax(lengths(a,type=nctype), lengths(b,type=nctype)) 136 | max_dist[max_dist == 0] <- 1 137 | sim <- 1 - dist/max_dist 138 | } else if (method == "qgram") { 139 | max_dist <- (lengths(a,type=nctype) + lengths(b,type=nctype) - 2*q + 2) 140 | max_dist[max_dist < 0] <- 1 141 | sim <- 1 - dist/max_dist 142 | } else if (method == "cosine") { 143 | sim <- 1 - dist 144 | } else if (method == "jaccard") { 145 | sim <- 1 - dist 146 | } else if (method == "jw") { 147 | sim <- 1 - dist 148 | } else if (method == "soundex") { 149 | sim <- 1 - dist 150 | } 151 | # all metrics can have distances == Inf; for similariy score set these to 0 152 | sim[sim < 0] <- 0 153 | sim 154 | } 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /pkg/src/soundex.c: -------------------------------------------------------------------------------- 1 | /* stringdist - a C library of string distance algorithms with an interface to R. 2 | * Copyright (C) 2013 Mark van der Loo 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | * 17 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 18 | */ 19 | 20 | #include "utils.h" 21 | #include 22 | 23 | // Translate similar sounding consonants to numeric codes; vowels are all 24 | // translated to 'a' and voiceless characters (and other characters) are 25 | // translated to 'h'. 26 | // Upper and lower case ASCII characters are treated as separate cases, 27 | // avoiding the use of 'tolower' whose effect depends on locale. 28 | static unsigned int translate_soundex(unsigned int c) { 29 | switch ( c ) { 30 | case 'b': 31 | case 'f': 32 | case 'p': 33 | case 'v': 34 | case 'B': 35 | case 'F': 36 | case 'P': 37 | case 'V': 38 | return '1'; 39 | case 'c': 40 | case 'g': 41 | case 'j': 42 | case 'k': 43 | case 'q': 44 | case 's': 45 | case 'x': 46 | case 'z': 47 | case 'C': 48 | case 'G': 49 | case 'J': 50 | case 'K': 51 | case 'Q': 52 | case 'S': 53 | case 'X': 54 | case 'Z': 55 | return '2'; 56 | case 'd': 57 | case 't': 58 | case 'D': 59 | case 'T': 60 | return '3'; 61 | case 'l': 62 | case 'L': 63 | return '4'; 64 | case 'm': 65 | case 'n': 66 | case 'M': 67 | case 'N': 68 | return '5'; 69 | case 'r': 70 | case 'R': 71 | return '6'; 72 | case 'h': 73 | case 'w': 74 | case 'H': 75 | case 'W': 76 | return 'h'; 77 | case 'a': 78 | case 'e': 79 | case 'i': 80 | case 'o': 81 | case 'u': 82 | case 'y': 83 | case 'A': 84 | case 'E': 85 | case 'I': 86 | case 'O': 87 | case 'U': 88 | case 'Y': 89 | return 'a'; // use 'a' to encode vowels 90 | case '!': // we will allow all printable ASCII characters. 91 | case '"': 92 | case '#': 93 | case '$': 94 | case '%': 95 | case '&': 96 | case '\'': 97 | case '(': 98 | case ')': 99 | case '*': 100 | case '+': 101 | case ',': 102 | case '-': 103 | case '.': 104 | case '/': 105 | case ':': 106 | case ';': 107 | case '<': 108 | case '=': 109 | case '>': 110 | case '?': 111 | case '@': 112 | case '[': 113 | case '\\': 114 | case ']': 115 | case '^': 116 | case '_': 117 | case '`': 118 | case '{': 119 | case '|': 120 | case '}': 121 | case '~': 122 | case '0': 123 | case '1': 124 | case '2': 125 | case '3': 126 | case '4': 127 | case '5': 128 | case '6': 129 | case '7': 130 | case '8': 131 | case '9': 132 | case ' ': 133 | return 'h'; // ignored characters; voiceless symbols. 134 | default: 135 | return '?'; // other characters are ignored with a warning 136 | } 137 | } 138 | 139 | // Translate a string to a soundex phonetic code 140 | // 141 | // str: the input string 142 | // str_len: the length of the input string 143 | // result: the character vector in which the soundex code is copied. This 144 | // should be a vector of a least length 4. 145 | // output: the number of non-ascii or non-printable ascii characters 146 | // encountered during translation. 147 | static unsigned int soundex(const unsigned int* str, unsigned int str_len, unsigned int* result) { 148 | if (!str || !result) return 0; 149 | if (str_len == 0) { 150 | unsigned int j; 151 | for (j = 0; j < 4; ++j) result[j] = '0'; 152 | return 0; 153 | } 154 | unsigned int i = 0, j = 0, nfail = 0; 155 | unsigned int cj = translate_soundex(str[j]); 156 | // the first character is copied directly and not translated to a numerical 157 | // code 158 | if ( cj == '?' ){ 159 | // the translated character is non-printable ASCII or non-ASCII. 160 | ++nfail; 161 | result[0] = str[0]; 162 | } else { 163 | result[0] = toupper(str[0]); 164 | } 165 | //result[0] = str[0] < 128 ? toupper(str[0]) : str[0]; 166 | for (i = 1; i < str_len && j < 3; ++i) { 167 | unsigned int ci = translate_soundex(str[i]); 168 | if (ci == 'a') { 169 | // vowels are not added to the result; but we do set the previous 170 | // character to the vower because two consonants with a vowel in between 171 | // are not merged 172 | cj = ci; 173 | } else if (ci != 'h') { 174 | // a consonant that is not equal to the previous consonant is added to 175 | // the result 176 | if (ci != cj) { 177 | result[++j] = ci; 178 | cj = ci; 179 | } 180 | } 181 | if ( ci == '?' ){ 182 | // the translated character is non-printable ASCII or non-ASCII. 183 | ++nfail; 184 | } 185 | } 186 | // pad with zeros 187 | for (++j ; j < 4; ++j) result[j] = '0'; 188 | return nfail; 189 | } 190 | 191 | double soundex_dist(unsigned int *a, int a_len, unsigned int *b, int b_len, unsigned int *nfail) { 192 | 193 | unsigned int sa[4]; 194 | unsigned int sb[4]; 195 | (*nfail) += soundex(a, a_len, sa); 196 | (*nfail) += soundex(b, b_len, sb); 197 | for (unsigned int i = 0; i < 4; ++i) 198 | if (sa[i] != sb[i]) return 1.0; 199 | return 0.0; 200 | } 201 | 202 | // ================================ R INTERFACE =============================== 203 | 204 | static void check_fail(unsigned int nfail){ 205 | if ( nfail > 0 ){ 206 | warning("soundex encountered %d non-printable ASCII or non-ASCII" 207 | "\n characters. Results may be unreliable, see ?printable_ascii",nfail); 208 | } 209 | } 210 | 211 | SEXP R_soundex(SEXP x, SEXP useBytes) { 212 | PROTECT(x); 213 | PROTECT(useBytes); 214 | 215 | int n = length(x); 216 | int bytes = INTEGER(useBytes)[0]; 217 | 218 | // when a and b are character vectors; create unsigned int vectors in which 219 | // the elements of and b will be copied 220 | unsigned int *s = NULL; 221 | int ml = max_length(x); 222 | s = (unsigned int *) malloc( (1L+ml) * sizeof(unsigned int)); 223 | if (s == NULL) { 224 | UNPROTECT(2); 225 | error("Unable to allocate enough memory"); 226 | } 227 | if (bytes) { 228 | // create output variable 229 | SEXP y = allocVector(STRSXP, n); 230 | PROTECT(y); 231 | // compute soundexes, skipping NA's 232 | unsigned int nfail = 0; 233 | int len_s, isna_s; 234 | char sndx[5]; 235 | unsigned int sndx_int[4]; 236 | for (int i = 0; i < n; ++i) { 237 | get_elem(x, i, bytes,0L, &len_s, &isna_s, s); 238 | if (isna_s) { 239 | SET_STRING_ELT(y, i, R_NaString); 240 | } else { 241 | nfail += soundex(s, len_s, sndx_int); 242 | for (unsigned int j = 0; j < 4; ++j) sndx[j] = (char) sndx_int[j]; 243 | sndx[4] = 0; 244 | SET_STRING_ELT(y, i, mkChar(sndx)); 245 | } 246 | } 247 | // cleanup and return 248 | check_fail(nfail); 249 | free(s); 250 | UNPROTECT(3); 251 | return y; 252 | } else { 253 | // create output variable 254 | SEXP y = allocVector(VECSXP, n); 255 | PROTECT(y); 256 | // compute soundexes, skipping NA's 257 | unsigned int nfail = 0; 258 | int len_s, isna_s; 259 | for (int i = 0; i < n; ++i) { 260 | get_elem(x, i, bytes, 0L, &len_s, &isna_s, s); 261 | if (isna_s) { 262 | SEXP sndx = allocVector(INTSXP, 1); 263 | PROTECT(sndx); 264 | INTEGER(sndx)[0] = NA_INTEGER; 265 | SET_VECTOR_ELT(y, i, sndx); 266 | UNPROTECT(1); 267 | } else { 268 | SEXP sndx = allocVector(INTSXP, 4); 269 | PROTECT(sndx); 270 | nfail += soundex(s, len_s, (unsigned int *)INTEGER(sndx)); 271 | SET_VECTOR_ELT(y, i, sndx); 272 | UNPROTECT(1); 273 | } 274 | } 275 | // cleanup and return 276 | check_fail(nfail); 277 | free(s); 278 | UNPROTECT(3); 279 | return y; 280 | } 281 | } 282 | 283 | 284 | 285 | -------------------------------------------------------------------------------- /pkg/src/utf8ToInt.c: -------------------------------------------------------------------------------- 1 | /* stringdist - a C library of string distance algorithms with an interface to R. 2 | * Copyright (C) 2013 Mark van der Loo 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | * 17 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 18 | */ 19 | 20 | #include 21 | #include "utils.h" 22 | 23 | 24 | /* This function is gratefully copied from the R core distribution. 25 | * It is replicated here to facilitate multicore processing through 26 | * openmp (as it is not part of R's API). 27 | * 28 | * Convert a single character to integer. 29 | * 30 | * *s input buffer (must be utf8) 31 | * *w output buffer 32 | * 33 | * value: 34 | * >0 : The number of bytes in the multi-byte representation 35 | * 0 : End of string reached. 36 | * <0 : Invalid input (not interpretable as UTF-8) 37 | * 38 | * 39 | */ 40 | static int mbrtoint(unsigned int *w, const char *s) 41 | { 42 | unsigned int byte; 43 | byte = *((unsigned char *)s); 44 | 45 | if (byte == 0) { 46 | *w = 0; 47 | return 0; 48 | } else if (byte < 0xC0) { 49 | *w = (int) byte; 50 | return 1; 51 | } else if (byte < 0xE0) { 52 | if (!s[1]) return -2; 53 | if ((s[1] & 0xC0) == 0x80) { 54 | *w = (int) (((byte & 0x1F) << 6) | (s[1] & 0x3F)); 55 | return 2; 56 | } else return -1; 57 | } else if (byte < 0xF0) { 58 | if (!s[1] || !s[2]) return -2; 59 | if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) { 60 | *w = (int) (((byte & 0x0F) << 12) 61 | | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F)); 62 | byte = *w; 63 | if (byte >= 0xD800 && byte <= 0xDFFF) return -1; /* surrogate */ 64 | if (byte == 0xFFFE || byte == 0xFFFF) return -1; 65 | return 3; 66 | } else return -1; 67 | } else if (byte < 0xF8) { 68 | if (!s[1] || !s[2] || !s[3]) return -2; 69 | if (((s[1] & 0xC0) == 0x80) 70 | && ((s[2] & 0xC0) == 0x80) 71 | && ((s[3] & 0xC0) == 0x80)) { 72 | *w = (int) (((byte & 0x07) << 18) 73 | | ((s[1] & 0x3F) << 12) 74 | | ((s[2] & 0x3F) << 6) 75 | | (s[3] & 0x3F)); 76 | byte = *w; 77 | return 4; 78 | } else return -1; 79 | } else if (byte < 0xFC) { 80 | if (!s[1] || !s[2] || !s[3] || !s[4]) return -2; 81 | if (((s[1] & 0xC0) == 0x80) 82 | && ((s[2] & 0xC0) == 0x80) 83 | && ((s[3] & 0xC0) == 0x80) 84 | && ((s[4] & 0xC0) == 0x80)) { 85 | *w = (int) (((byte & 0x03) << 24) 86 | | ((s[1] & 0x3F) << 18) 87 | | ((s[2] & 0x3F) << 12) 88 | | ((s[3] & 0x3F) << 6) 89 | | (s[4] & 0x3F)); 90 | byte = *w; 91 | return 5; 92 | } else return -1; 93 | } else { 94 | if (!s[1] || !s[2] || !s[3] || !s[4] || !s[5]) return -2; 95 | if (((s[1] & 0xC0) == 0x80) 96 | && ((s[2] & 0xC0) == 0x80) 97 | && ((s[3] & 0xC0) == 0x80) 98 | && ((s[4] & 0xC0) == 0x80) 99 | && ((s[5] & 0xC0) == 0x80)) { 100 | *w = (int) (((byte & 0x01) << 30) 101 | | ((s[1] & 0x3F) << 24) 102 | | ((s[2] & 0x3F) << 18) 103 | | ((s[3] & 0x3F) << 12) 104 | | ((s[5] & 0x3F) << 6) 105 | | (s[5] & 0x3F)); 106 | byte = *w; 107 | return 6; 108 | } else return -1; 109 | } 110 | /* return -2; not reached */ 111 | } 112 | 113 | 114 | /* Translate a UTF-8 string to integers. 115 | * 116 | * Input 117 | * 118 | * str : pointer to input string. 119 | * outbuf: pointer to output buffer. 120 | * 121 | * Returns: 122 | * The number of logical characters converted. 123 | * 124 | */ 125 | static int utf8_to_int(const char *str, unsigned int *outbuf){ 126 | 127 | unsigned int *p = outbuf; 128 | char *s = (char *) str; 129 | int nbytes 130 | , str_len = 0L; 131 | 132 | for (int i=0;; i++){ 133 | nbytes = mbrtoint(p, s); 134 | if (nbytes > 0){ 135 | p += 1L; 136 | str_len += 1L; 137 | s += nbytes; 138 | } else if (nbytes == 0L ){ 139 | return str_len; 140 | } else if (nbytes == -1L) { // non-utf-8 sequence encountered. 141 | return -1; 142 | } 143 | } 144 | } 145 | 146 | 147 | // Get one element from x (VECSXP or STRSXP) convert to usigned int if necessary and store in c 148 | // TODO: this can probably be a bit optimized by decreasing the use of the *_ELT macros. 149 | unsigned int *get_elem(SEXP x, R_xlen_t i, int bytes, int intdist, int *len, int *isna, unsigned int *c){ 150 | 151 | if ( intdist ){ 152 | // we need a copy with trailing zero in this case since some distances 153 | // (e.g the dl-distance) expects this 154 | *isna = ( INTEGER(VECTOR_ELT(x,i))[0] == NA_INTEGER ); 155 | (*len) = length(VECTOR_ELT(x,i)); 156 | // this implicitly converts from int to unsigned int (but that should not influence the result). 157 | memcpy(c , INTEGER(VECTOR_ELT(x,i)), (*len) * sizeof(int)); 158 | c[*len] = 0; 159 | } else { 160 | *isna = ( STRING_ELT(x,i) == NA_STRING ); 161 | if (bytes){ 162 | (*len) = length(STRING_ELT(x,i)); 163 | for (int j=0; j < *len; j++ ){ 164 | c[j] = CHAR(STRING_ELT(x,i))[j]; 165 | } 166 | c[*len] = 0; 167 | } else { 168 | (*len) = utf8_to_int( CHAR(STRING_ELT(x,i)), c); 169 | if ( *len < 0 ){ 170 | error("Encountered byte sequence not representing an utf-8 character.\n"); 171 | } 172 | } 173 | } 174 | return c; 175 | } 176 | 177 | /* byte-by-byte char to int translation 178 | * 179 | * 180 | * Return value: the number of bytes converted. 181 | * 182 | * 183 | */ 184 | static int char_to_int(const char *str, unsigned int *outbuf){ 185 | unsigned int *p = outbuf; 186 | char *s = (char *) str; 187 | int str_len = 0L; 188 | while (*s){ 189 | (*p) = (unsigned int) *s; 190 | p++; 191 | s++; 192 | str_len++; 193 | } 194 | return str_len; 195 | } 196 | 197 | Stringset *new_stringset(SEXP str, int bytes, int intdist){ 198 | size_t nstr = length(str); 199 | Stringset *s; 200 | s = (Stringset *) malloc(sizeof(Stringset)); 201 | 202 | // get and set string lengths. 203 | s->str_len = (int *) malloc(nstr * sizeof(int)); 204 | 205 | size_t nbytes = 0L; 206 | 207 | if ( intdist ){ 208 | for (size_t i=0; istring = (unsigned int **) malloc(nstr * sizeof(int *)); 218 | // room for int rep of strings, including a trailing zero (needed by e.g. by full dl-distance) 219 | // this is enough room for byte-by-byte translation, so for UTF-8 it will be too much. 220 | s->data = (unsigned int *) malloc( (nstr + nbytes) * sizeof(int)); 221 | 222 | int *t = s->str_len; 223 | unsigned int *d = s->data; 224 | 225 | if ( intdist ){ 226 | for (size_t i=0L; i < nstr; i++, t++){ 227 | if ( INTEGER(VECTOR_ELT(str,i))[0] == NA_INTEGER ){ 228 | (*t) = NA_INTEGER; 229 | } else { 230 | (*t) = length(VECTOR_ELT(str,i)); 231 | memcpy(d, INTEGER(VECTOR_ELT(str,i)), (*t)*sizeof(int) ); 232 | s->string[i] = d; 233 | (*(d + (*t))) = 0L; // append a zero. 234 | d += (*t) + 1L; 235 | } 236 | } 237 | } else if ( bytes ){ 238 | for (size_t i=0L; i < nstr; i++, t++){ 239 | if ( STRING_ELT(str,i) == NA_STRING ){ 240 | (*t) = NA_INTEGER; 241 | } else { 242 | (*t) = char_to_int(CHAR(STRING_ELT(str,i)), d); 243 | s->string[i] = d; 244 | (*(d + (*t))) = 0L; // append a zero. 245 | d += (*t) + 1L; 246 | } 247 | } 248 | } else { 249 | for (size_t i=0L; i < nstr; i++, t++){ 250 | if ( STRING_ELT(str,i) == NA_STRING ){ 251 | (*t) = NA_INTEGER; 252 | } else { 253 | (*t) = utf8_to_int(CHAR(STRING_ELT(str,i)), d); 254 | s->string[i] = d; 255 | (*(d + (*t))) = 0L; // append a zero. 256 | d += (*t) + 1L; 257 | } 258 | } 259 | } 260 | 261 | return s; 262 | } 263 | 264 | void free_stringset(Stringset *s){ 265 | free(s->string); 266 | free(s->data); 267 | free(s->str_len); 268 | free(s); 269 | } 270 | 271 | 272 | 273 | -------------------------------------------------------------------------------- /pkg/R/doc_metrics.R: -------------------------------------------------------------------------------- 1 | #' @title 2 | #' String metrics in \pkg{stringdist} 3 | #' 4 | #' @description 5 | #' This page gives an overview of the string dissimilarity measures offered by 6 | #' \pkg{stringdist}. 7 | #' 8 | #' @section String Metrics: 9 | #' String metrics are ways of quantifying the dissimilarity between two finite 10 | #' sequences, usually text strings. Over the years, many such measures have been 11 | #' developed. Some are based on a mathematical understanding of the set of all 12 | #' strings that can be composed from a finite alphabet, others are based on more 13 | #' heuristic principles, such as how a text string sounds when pronounced by a 14 | #' native English speaker. 15 | #' 16 | #' The terms 'string metrics' and 'string distance' are used more or less 17 | #' interchangibly in literature. From a mathematical point of view, string 18 | #' metrics often do not obey the demands that are usually required from a 19 | #' distance function. For example, it is not true for all string metrics that a 20 | #' distance of 0 means that two strings are the same (e.g. in the \eqn{q}-gram 21 | #' distance). Nevertheless, string metrics are very useful in practice and have 22 | #' many applications. 23 | #' 24 | #' The metric you need to choose for an application strongly depends on both the 25 | #' nature of the string (what does the string represent?) and the cause of 26 | #' dissimilarities between the strings you are measuring. For example, if you 27 | #' are comparing human-typed names that may contain typo's, the Jaro-Winkler 28 | #' distance may be of use. If you are comparing names that were written down 29 | #' after hearing them, a phonetic distance may be a better choice. 30 | #' 31 | #' Currently, the following distance metrics are supported by \pkg{stringdist}. 32 | #' \tabular{ll}{ 33 | #' \bold{Method name} \tab \bold{Description}\cr 34 | #' \code{osa} \tab Optimal string aligment, (restricted Damerau-Levenshtein distance).\cr 35 | #' \code{lv} \tab Levenshtein distance (as in R's native \code{\link[utils]{adist}}).\cr 36 | #' \code{dl} \tab Full Damerau-Levenshtein distance.\cr 37 | #' \code{hamming} \tab Hamming distance (\code{a} and \code{b} must have same nr of characters).\cr 38 | #' \code{lcs} \tab Longest common substring distance.\cr 39 | #' \code{qgram} \tab \eqn{q}-gram distance. \cr 40 | #' \code{cosine} \tab cosine distance between \eqn{q}-gram profiles \cr 41 | #' \code{jaccard} \tab Jaccard distance between \eqn{q}-gram profiles \cr 42 | #' \code{jw} \tab Jaro, or Jaro-Winkler distance.\cr 43 | #' \code{soundex} \tab Distance based on soundex encoding (see below) 44 | #' } 45 | #' 46 | #' 47 | #' @section A short description of string metrics supported by \pkg{stringdist}: 48 | #' 49 | #' See \href{https://journal.r-project.org/archive/2014-1/loo.pdf}{Van der Loo 50 | #' (2014)} for an extensive description and references. The review papers of 51 | #' Navarro (2001) and Boytsov (2011) provide excellent technical overviews of 52 | #' respectively online and offline string matching algorithms. 53 | #' 54 | #' The \bold{Hamming distance} (\code{method='hamming'}) counts the number of 55 | #' character substitutions that turns \code{b} into \code{a}. If \code{a} 56 | #' and \code{b} have different number of characters the distance is \code{Inf}. 57 | #' 58 | #' The \bold{Levenshtein distance} (\code{method='lv'}) counts the number of 59 | #' deletions, insertions and substitutions necessary to turn \code{b} into 60 | #' \code{a}. This method is equivalent to \code{R}'s native \code{\link[utils]{adist}} 61 | #' function. 62 | #' 63 | #' The \bold{Optimal String Alignment distance} (\code{method='osa'}) is like the Levenshtein 64 | #' distance but also allows transposition of adjacent characters. Here, each 65 | #' substring may be edited only once. (For example, a character cannot be transposed twice 66 | #' to move it forward in the string). 67 | #' 68 | #' The \bold{full Damerau-Levenshtein distance} (\code{method='dl'}) is like the optimal 69 | #' string alignment distance except that it allows for multiple edits on substrings. 70 | #' 71 | #' The \bold{longest common substring} (method='lcs') is defined as the longest string that can be 72 | #' obtained by pairing characters from \code{a} and \code{b} while keeping the order 73 | #' of characters intact. The \bold{lcs-distance} is defined as the number of unpaired characters. 74 | #' The distance is equivalent to the edit distance allowing only deletions and insertions, 75 | #' each with weight one. 76 | #' 77 | #' A \bold{\eqn{q}-gram} (method='qgram') is a subsequence of \eqn{q} \emph{consecutive} 78 | #' characters of a string. If \eqn{x} (\eqn{y}) is the vector of counts 79 | #' of \eqn{q}-gram occurrences in \code{a} (\code{b}), the \bold{\eqn{q}-gram distance} 80 | #' is given by the sum over the absolute differences \eqn{|x_i-y_i|}. 81 | #' The computation is aborted when \code{q} is is larger than the length of 82 | #' any of the strings. In that case \code{Inf} is returned. 83 | #' 84 | #' The \bold{cosine distance} (method='cosine') is computed as \eqn{1-x\cdot 85 | #' y/(\|x\|\|y\|)}, where \eqn{x} and \eqn{y} were defined above. 86 | #' 87 | #' Let \eqn{X} be the set of unique \eqn{q}-grams in \code{a} and \eqn{Y} the set of unique 88 | #' \eqn{q}-grams in \code{b}. The \bold{Jaccard distance} (\code{method='jaccard'}) is given by \eqn{1-|X\cap Y|/|X\cup Y|}. 89 | #' 90 | #' The \bold{Jaro distance} (\code{method='jw'}, \code{p=0}), is a number 91 | #' between 0 (exact match) and 1 (completely dissimilar) measuring 92 | #' dissimilarity between strings. It is defined to be 0 when both strings have 93 | #' length 0, and 1 when there are no character matches between \code{a} and 94 | #' \code{b}. Otherwise, the Jaro distance is defined as 95 | #' \eqn{1-(1/3)(w_1m/|a| + w_2m/|b| + w_3(m-t)/m)}. 96 | #' Here,\eqn{|a|} indicates the number of characters in \code{a}, \eqn{m} is 97 | #' the number of character matches and \eqn{t} the number of transpositions of 98 | #' matching characters. The \eqn{w_i} are weights associated with the characters 99 | #' in \code{a}, characters in \code{b} and with transpositions. A character 100 | #' \eqn{c} of \code{a} \emph{matches} a character from \code{b} when \eqn{c} 101 | #' occurs in \code{b}, and the index of \eqn{c} in \code{a} differs less than 102 | #' \eqn{\max(|a|,|b|)/2 -1} (where we use integer division) from the index of 103 | #' \eqn{c} in \code{b}. Two matching characters are transposed when they are 104 | #' matched but they occur in different order in string \code{a} and \code{b}. 105 | #' 106 | #' The \bold{Jaro-Winkler distance} (\code{method=jw}, \code{0 0} 9 | #' 10 | #' 11 | #' @section Note on \code{NA} handling: 12 | #' \code{R}'s native \code{\link[base]{match}} function matches \code{NA} with 13 | #' \code{NA}. This may feel inconsistent with \code{R}'s usual \code{NA} 14 | #' handling, since for example \code{NA==NA} yields 15 | #' \code{NA} rather than \code{TRUE}. In most cases, one may reason about the 16 | #' behaviour under \code{NA} along the lines of ``if one of the arguments is 17 | #' \code{NA}, the result shall be \code{NA}'', simply because not all 18 | #' information necessary to execute the function is available. One uses special 19 | #' functions such as \code{is.na}, \code{is.null} \emph{etc.} to handle special 20 | #' values. 21 | #' 22 | #' The \code{amatch} function mimics the behaviour of \code{\link[base]{match}} 23 | #' by default: \code{NA} is matched with \code{NA} and with nothing else. Note 24 | #' that this is inconsistent with the behaviour of \code{\link{stringdist}} 25 | #' since \code{stringdist} yields \code{NA} when at least one of the arguments 26 | #' is \code{NA}. The same inconsistency exists between \code{\link[base]{match}} 27 | #' and \code{\link[utils]{adist}}. In \code{amatch} this behaviour can be 28 | #' controlled by setting \code{matchNA=FALSE}. In that case, if any of the 29 | #' arguments in \code{x} is \code{NA}, the \code{nomatch} value is returned, 30 | #' regardless of whether \code{NA} is present in \code{table}. In 31 | #' \code{\link[base]{match}} the behaviour can be controlled by setting the 32 | #' \code{incomparables} option. 33 | #' 34 | #' 35 | #' @param x elements to be approximately matched: will be coerced to 36 | #' \code{character} unless it is a list consisting of \code{integer} vectors. 37 | #' @param table lookup table for matching. Will be coerced to \code{character} 38 | #' unless it is a list consting of \code{integer} vectors. 39 | #' @param nomatch The value to be returned when no match is found. This is 40 | #' coerced to integer. 41 | #' @param matchNA Should \code{NA}'s be matched? Default behaviour mimics the 42 | #' behaviour of base \code{\link[base]{match}}, meaning that \code{NA} matches 43 | #' \code{NA} (see also the note on \code{NA} handling below). 44 | #' @param method Matching algorithm to use. See \code{\link{stringdist-metrics}}. 45 | #' @param useBytes Perform byte-wise comparison. See \code{\link{stringdist-encoding}}. 46 | #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for 47 | #' deletion, insertion, substitution and transposition, in that order. When 48 | #' \code{method='lv'}, the penalty for transposition is ignored. When 49 | #' \code{method='jw'}, the weights associated with characters of \code{a}, 50 | #' characters from \code{b} and the transposition weight, in that order. 51 | #' Weights must be positive and not exceed 1. \code{weight} is ignored 52 | #' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'}, 53 | #' \code{'Jaccard'}, \code{'lcs'}, or \code{'soundex'}. 54 | #' @param maxDist Elements in \code{x} will not be matched with elements of 55 | #' \code{table} if their distance is larger than \code{maxDist}. Note that the 56 | #' maximum distance between strings depends on the method: it should always be 57 | #' specified. 58 | #' @param nthread Number of threads used by the underlying C-code. A sensible 59 | #' default is chosen, see \code{\link{stringdist-parallelization}}. 60 | #' 61 | #' @param q q-gram size, only when method is \code{'qgram'}, \code{'jaccard'}, 62 | #' or \code{'cosine'}. 63 | #' @param p Winklers 'prefix' parameter for Jaro-Winkler distance, with 64 | #' \eqn{0\leq p\leq0.25}. Only when method is \code{'jw'} 65 | #' @param bt Winkler's boost threshold. Winkler's prefix factor is 66 | #' only applied when the Jaro distance is larger than \code{bt}. 67 | #' Applies only to \code{method='jw'} and \code{p>0}. 68 | #' 69 | #' @return \code{amatch} returns the position of the closest match of \code{x} 70 | #' in \code{table}. When multiple matches with the same smallest distance 71 | #' metric exist, the first one is returned. \code{ain} returns a 72 | #' \code{logical} vector of length \code{length(x)} indicating wether an 73 | #' element of \code{x} approximately matches an element in \code{table}. 74 | #' 75 | #' @family matching 76 | #' 77 | #' @example ../examples/amatch.R 78 | #' @export 79 | amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE 80 | , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard", "jw", "soundex") 81 | , useBytes = FALSE 82 | , weight=c(d=1,i=1,s=1,t=1) 83 | , maxDist=0.1, q=1, p=0, bt=0 84 | , nthread = getOption("sd_num_thread")){ 85 | 86 | x <- as.character(x) 87 | table <- as.character(table) 88 | 89 | if (!useBytes){ 90 | x <- enc2utf8(x) 91 | table <- enc2utf8(table) 92 | } 93 | 94 | method <- match.arg(method) 95 | stopifnot( 96 | all(is.finite(weight)) 97 | , all(weight > 0) 98 | , all(weight <=1) 99 | , q >= 0 100 | , p <= 0.25 101 | , p >= 0 102 | , matchNA %in% c(TRUE,FALSE) 103 | , maxDist > 0 104 | , is.logical(useBytes) 105 | , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) 106 | , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) 107 | , length(nthread) == 1 108 | , is.numeric(nthread) 109 | , nthread > 0 110 | ) 111 | if (method == 'jw') weight <- weight[c(2,1,3)] 112 | method <- METHODS[method] 113 | if ( is.na(method) ){ 114 | stop(sprintf("method '%s' is not defined",method)) 115 | } 116 | 117 | .Call("R_amatch", x, table, method 118 | , as.integer(nomatch), as.integer(matchNA) 119 | , as.double(weight), as.double(p), as.double(bt) 120 | , as.integer(q) , as.double(maxDist), as.integer(useBytes) 121 | , as.integer(nthread) 122 | , PACKAGE="stringdist" 123 | ) 124 | 125 | } 126 | 127 | #' @param ... parameters to pass to \code{amatch} (except \code{nomatch}) 128 | #' 129 | #' 130 | #' @rdname amatch 131 | #' @export 132 | ain <- function(x,table,...){ 133 | amatch(x, table, nomatch=0, ...) > 0 134 | } 135 | 136 | #' Approximate matching for integer sequences. 137 | #' 138 | #' 139 | #' For a \code{list} of integer vectors \code{x}, find the closest matches in a 140 | #' \code{list} of integer or numeric vectors in \code{table.} 141 | #' 142 | #' @section Notes: 143 | #' \code{seq_ain} is currently defined as 144 | #' 145 | #' \code{seq_ain(x,table,...) <- function(x,table,...) amatch(x, table, nomatch=0,...) > 0} 146 | #' 147 | #' All input vectors are converted with \code{as.integer}. This causes truncation for numeric 148 | #' vectors (e.g. \code{pi} will be treated as \code{3L}). 149 | #' 150 | #' 151 | #' @param x (\code{list} of) \code{integer} or \code{numeric} vector(s) to be 152 | #' approximately matched. Will be converted with \code{as.integer}. 153 | #' @param table (\code{list} of) \code{integer} or \code{numeric} vector(s) 154 | #' serving as lookup table for matching. Will be converted with 155 | #' \code{as.integer}. 156 | #' @param nomatch The value to be returned when no match is found. This is 157 | #' coerced to integer. 158 | #' @param matchNA Should \code{NA}'s be matched? Default behaviour mimics the 159 | #' behaviour of base \code{\link[base]{match}}, meaning that \code{NA} matches 160 | #' \code{NA}. With \code{NA}, we mean a missing entry in the \code{list}, represented as \code{NA_integer_}. 161 | #' If one of the integer sequences stored in the list has an \code{NA} entry, 162 | #' this is just treated as another integer (the representation of 163 | #' \code{NA_integer_}). 164 | #' @param method Matching algorithm to use. See \code{\link{stringdist-metrics}}. 165 | #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for 166 | #' deletion, insertion, substitution and transposition, in that order. When 167 | #' \code{method='lv'}, the penalty for transposition is ignored. When 168 | #' \code{method='jw'}, the weights associated with integers in elements of \code{a}, 169 | #' integers in elements of \code{b} and the transposition weight, in that order. 170 | #' Weights must be positive and not exceed 1. \code{weight} is ignored 171 | #' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'}, 172 | #' \code{'Jaccard'}, or \code{'lcs'}. 173 | #' @param maxDist Elements in \code{x} will not be matched with elements of 174 | #' \code{table} if their distance is larger than \code{maxDist}. Note that the 175 | #' maximum distance between strings depends on the method: it should always be 176 | #' specified. 177 | #' @param nthread Number of threads used by the underlying C-code. A sensible 178 | #' default is chosen, see \code{\link{stringdist-parallelization}}. 179 | #' 180 | #' @param q q-gram size, only when method is \code{'qgram'}, \code{'jaccard'}, 181 | #' or \code{'cosine'}. 182 | #' @param p Winkler's prefix parameter for Jaro-Winkler distance, with 183 | #' \eqn{0\leq p\leq0.25}. Only when method is \code{'jw'} 184 | #' @param bt Winkler's boost threshold. Winkler's prefix factor is 185 | #' only applied when the Jaro distance is larger than \code{bt}. 186 | #' Applies only to \code{method='jw'} and \code{p>0}. 187 | #' @return \code{seq_amatch} returns the position of the closest match of \code{x} 188 | #' in \code{table}. When multiple matches with the same minimal distance 189 | #' metric exist, the first one is returned. \code{seq_ain} returns a 190 | #' \code{logical} vector of length \code{length(x)} indicating wether an 191 | #' element of \code{x} approximately matches an element in \code{table}. 192 | #' 193 | #' @seealso \code{\link{seq_dist}}, \code{\link{seq_sim}}, \code{\link{seq_qgrams}} 194 | #' 195 | #' @example ../examples/seq_amatch.R 196 | #' @export 197 | seq_amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE 198 | , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard", "jw") 199 | , weight=c(d=1,i=1,s=1,t=1) 200 | , maxDist=0.1, q=1, p=0, bt=0 201 | , nthread = getOption("sd_num_thread")){ 202 | 203 | x <- ensure_int_list(x) 204 | table <- ensure_int_list(table) 205 | 206 | method <- match.arg(method) 207 | stopifnot( 208 | all(is.finite(weight)) 209 | , all(weight > 0) 210 | , all(weight <=1) 211 | , q >= 0 212 | , p <= 0.25 213 | , p >= 0 214 | , matchNA %in% c(TRUE,FALSE) 215 | , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) 216 | , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) 217 | , length(nthread) == 1 218 | , is.numeric(nthread) 219 | , nthread > 0 220 | ) 221 | if (method == 'jw') weight <- weight[c(2,1,3)] 222 | method <- METHODS[method] 223 | if ( is.na(method) ){ 224 | stop(sprintf("method '%s' is not defined",method)) 225 | } 226 | 227 | .Call("R_amatch", x, table, method 228 | , as.integer(nomatch), as.integer(matchNA) 229 | , as.double(weight), as.double(p), as.double(bt) 230 | , as.integer(q) , as.double(maxDist), 0L 231 | , as.integer(nthread) 232 | , PACKAGE="stringdist" 233 | ) 234 | } 235 | 236 | #' @param ... parameters to pass to \code{seq_amatch} (except \code{nomatch}) 237 | #' 238 | #' 239 | #' @rdname seq_amatch 240 | #' @export 241 | seq_ain <- function(x,table,...){ 242 | seq_amatch(x, table, nomatch=0, ...) > 0 243 | } 244 | 245 | 246 | 247 | -------------------------------------------------------------------------------- /pkg/NEWS: -------------------------------------------------------------------------------- 1 | version 0.9.15 2 | - Fixe issue with zero-length 'nthreads' argument in all exported functions 3 | with this parameter. (Thanks to Brian Ripley for the notification and pointer 4 | to the problem) 5 | 6 | version 0.9.14 7 | - Fixed issue with zero-length strings in 'qgrams' (Thanks to Brian Ripley 8 | for the notification and pointer to the origin of the problem) 9 | 10 | version 0.9.12 11 | - apparently R_xlen_t is long long int on CLANG/Windows and long int on gcc-13/debian 12 | 13 | version 0.9.11 14 | - Fixed a warning in gcc-13: changed specifier from %d to %ld. 15 | (Thanks to Kurt Hornik for the head's up) 16 | 17 | version 0.9.10 18 | - Fixed another warning generated by new C compiler that I overlooked. 19 | (Thanks to the CRAN team for the head's up) 20 | 21 | version 0.9.9 22 | - Fixed warnings generated by new C compiler. (function prototypes must 23 | now be defined completely). (Thanks to Kurt Hornik for the head's up.) 24 | 25 | version 0.9.8 26 | - Fixed some issues on C-level causing problems with the 27 | CLANG compiler. (Thanks to Brian Ripley for not only 28 | reporting this, but also sending updated code with 29 | fixes). 30 | 31 | 32 | version 0.9.7 33 | - Fixes in use of INTEGER() and VECTOR_ELT() after updates in R's C API. 34 | this affected 'afind' and 'max_length' (internally). (Thanks to Luke 35 | Tierny and Kurt Hornik for the notification). 36 | - Fix in 'amatch' causing utf-8 characters to be ignored in some 37 | cases (thanks to Joan Mime for reporting #78). 38 | - Fix: segfault when 'afind' was called with many search patterns or many 39 | texts to be searched. 40 | - Fix: stringsimmatrix was not normalized correctly (Thanks to Tamas Ferenci 41 | for reporting GH). 42 | 43 | 44 | version 0.9.6.3 45 | - Resubmit. Fixed an URL redirect that was detected by CRAN. 46 | 47 | version 0.9.6.2 48 | - Resubmit. Fixed url issues detected by CRAN, added doi to description 49 | as per CRAN request. 50 | 51 | version 0.9.6.1 52 | - Bugfix: afind/grab/grabl returned wrong results on MacOS only. 53 | (thanks to Prof. Brian Ripley for the notification and for running tests 54 | on his personal machine and to Tomas Kalibera for making the 55 | ubuntu-rchk docker image available). 56 | 57 | version 0.9.6 58 | - New function 'afind': find approximate matches in text based on string distance. 59 | - New functions 'grab', 'grabl': fuzzy matching equivalent to 'grep' and 'grepl'. 60 | - New function 'extract': fuzzy matching equivalent of stringr::str_extract. 61 | - New algorithm 'running_cosine': fast fuzzy text search using cosine distance. 62 | - New function 'stringsimmatrix' (Thanks to Johannes Gruber). 63 | - Number of threads used is now reported when loading 'stringdist'. 64 | - Internal fixes (in some cases class() == 'class' was used). 65 | 66 | version 0.9.5.5 67 | - Changed two URLs to canonical form in README.md (https://) to comply with 68 | CRAN policy. 69 | 70 | version 0.9.5.4 71 | - Some tests using seq_dist() would fail unpredictably when the input was 72 | defined with lazily evaluated arguments, e.g. list(1:3, 2:4); but only in the 73 | context of NSE by a test suite ('tinytest', 'testthat'). Tests were replaced by 74 | literal versions, e.g. list(c(1,2,3), c(2,3,4)). 75 | 76 | version 0.9.5.3 77 | - Update in test suite to stay on CRAN 78 | 79 | version 0.9.5.2 80 | - RJournal paper and C/C++ api docs are now presented as vignette. 81 | - Switched to tinytest framework 82 | - Fix: stringdist could cause a segfault for edit distances between very long 83 | strings. (Thanks to GH user gllipatz) 84 | 85 | 86 | version 0.9.5.1 87 | - Fixed header file for C API 88 | 89 | version 0.9.5.0 90 | - New contributor: Chris Muir 91 | - C/C++ API now exposed for packages LinkingTo stringdist. See `?stringdist_api` 92 | - Arguments 'maxDist', 'ncores', 'cluster' of functions 'stringdist' and 93 | 'stringdistmatrix' have been deprecated for several years and are now 94 | removed. 95 | - Fixed edge case where cosine distance with q=1, between strings of repeating characters 96 | yielded Inf (Thanks to Markus Dumke) 97 | 98 | 99 | version 0.9.4.6 100 | - Fixed argument passing error in lower_tri (thanks to Kurt Hornik) 101 | 102 | version 0.9.4.5 103 | - New argument 'bt' implementing Winkler's boost threshold for the Jaro-Winkler distance 104 | - stringdist(a,b,method="qgram") returns correct value when q>nchar(a) (or b). 105 | (Thanks to Giora Simchoni). Also affects stringdistmatrix, amatch, seq_dist, 106 | and seq_distmatrix. 107 | - registered native routines as now recommended by CRAN 108 | 109 | version 0.9.4.4 110 | - updated default nr of threads to comply to CRAN policy (thanks to Kurt Hornik). 111 | The default nr of cores now equals OMP_NUM_THREADS if set. See 112 | ?'stringdist-parallelization' for the full policy. 113 | 114 | version 0.9.4.2 115 | - bugfix in stringdistmatrix(a): value of p, for jw-distance was ignored 116 | (thanks to Max Fritsche) 117 | - bugfix in stringdistmatrix(a): Would segfault on q-gram w/input > ~7k strings 118 | and q>1 (thanks to Connor McKay) 119 | - bugfix in jaccard distance: distance not always correct when passing multiple 120 | strings (thanks to Robert Carlson) 121 | 122 | version 0.9.4.1 123 | - stringdistmatrix(a) now outputs long vectors (issue #45, thanks to Wouter 124 | Touw). For stringdistmatrix(a,b) this was already the case, but the length 125 | of rows and columns remains restricted to 2^31-1 since long input vectors are 126 | not supported (yet). 127 | - bugfix in osa/dl/lv distances w/unequal edit weights (thanks to Nathalia Potocka) 128 | 129 | version 0.9.4 130 | - bugfix: edge case for zero-size for lower tridiagonal dist matrices (caused 131 | UBSAN to fire, but gave correct results). 132 | - bugfix in jw distance: not symmetric for certain cases (thanks to github user gtumuluri) 133 | 134 | version 0.9.3 135 | - new function for tokenizing integer sequences: seq_qgrams 136 | - new function for matching integer sequences: seq_amatch 137 | - new functions computing distances between integer sequences: seq_dist, seq_distmatrix 138 | - q-gram based distances are now always 0 when q=0 (used to be Inf if at least 139 | one of the arguments was not the empty string) 140 | - stringdist, stringdistmatrix now emit warning when presented with 'list' argument 141 | - small c-side code optimizations 142 | - bugfix in dl, lv, osa distance: weights were not taken into account properly 143 | (thanks to Zach Price) 144 | 145 | version 0.9.2 146 | - Update fixing some errors (missing documentation, tests) in the 0.9.1 release. 147 | - Fixed a few possible memory leaks. 148 | 149 | version 0.9.1 150 | - Argument 'useNames' of 'stringdistmatrix' now accepts 'none', 'strings', and 'names' 151 | - New function 'stringsim' computes string similarities between 0 and 1 based on 'stringdist' 152 | - Calling 'stringdistmatrix' with a single argument returns an object of class 'dist' 153 | - Argument 'cluster' to stringdistmatrix is phased out. It is now ignored with a message. 154 | - Specifying 'ncores' was already ignored but now also causes a warning 155 | - internal: rewrite of the R/C interface, saving about 1/3 of C-code, making extending easier 156 | - bugfix in stringdistmatrix: output was transposed when length(a)==1 (Thanks to github user cpoonolly) 157 | - Safer core detection to avoid a failure under Cygwin (thanks to Lauri Koobas) 158 | 159 | version 0.9.0 160 | - C-code underlying stringdist and amatch now automatically use multithreading based on openMP. 161 | The default number of threads is governed by options('sd_num_thread'). 162 | - stringdist, stringdistmatrix, amatch and ain gain nthread argument which can 163 | overwrite the default maximum number of threads. 164 | - Argument 'maxDist' is phased out for 'stringdist' and 'stringdistmatrix'. 165 | Specifying it causes a message. 166 | - Argument 'ncores' is phased out for 'stringdistmatrix'. It is now ignored and 167 | specifying it causes a message. 168 | - bugfix in amatch/dl. In certain cases, the best match went undetected. 169 | - Documentation improved and rearranged with string metrics, encoding, and 170 | parallelization now documented as separate topics. 171 | 172 | version 0.8.2 173 | - Fixed a few warnings issued by the CLANG compiler (thanks to Brian Ripley). 174 | This fixes a bug in amatch/jaccard 175 | - Fixed a bug in stringdist/osa, dl: NA incorectly returned (thanks to Lauri 176 | Koobas). 177 | 178 | version 0.8.1 179 | - stringdistmatrix returns dimensionless matrix when both arguments have length 180 | zero (thanks to Richie Cotton) 181 | - stringdistmatrix gains argument 'useNames' (thanks to Richie Cotton) 182 | - Package now 'Imports' parallel rather than 'Depends' on it. 183 | - bugfix in optimal string alignment distance: the nr of transpositions was 184 | sometimes overcounted (thanks to Frank Binder) 185 | - rearranged the documentation. 186 | 187 | version 0.8.0 188 | - Added soundex-based string distance (thanks to Jan van der Laan) 189 | - New function 'phonetic' translates strings to phonetic codes using soundex 190 | (thanks to Jan van der Laan) 191 | - New function 'printable_ascii' detects non-printable ascii or non-ascii 192 | characters. 193 | - Precision issue: cosine distance between equal strings would be O(1e-16) in 194 | stead of 0.0 (thanks to Ben Haller). 195 | - Code cleaning: somewhat better performance when maxDist is unspecified in 196 | stringdist. It remains deprecated. 197 | - Row names in the output array of 'qgrams' are now in system native encoding 198 | (used to be utf8 for all systems). 199 | - updated CITATION with page number info as the R Journal is now out. 200 | 201 | version 0.7.3 202 | - bugfix in jw-distance: out-of-range access in C-code caused R to crash in 203 | some cases (thanks to Carol Gan) 204 | - bugfix in dl distance: in some cases, distances could be one unit too high. 205 | - Updated CITATION file: paper to appear in The R Journal vol 6 (2014). 206 | - Some updates in documentation. 207 | 208 | version 0.7.2 209 | - function 'qgrams' gains .list argument 210 | - bugfix in multicore option of stringdistmatrix 211 | - bugfix in substitution weight of DL-distance (undercounted when w4 != 1 in 212 | some cases) 213 | - bugfix in dl.c: C-function read outside of array. 214 | 215 | version 0.7.0 216 | - added useBytes option: up to ~3-fold speed gain at the cost of possible 217 | encoding-dependent results. 218 | - new memory allocation method for q-grams increases speed between ~5% and ~30% 219 | depending on q and input string. 220 | - function 'qgrams' gains useNames option. 221 | - jaro-winkler distance gains weight argument. 222 | - C-code optimization in edit-based distances: 10~20% speed increase depending 223 | on input. 224 | - bugfix in amatch: sometimes NA was erroneously returned. 225 | - bugfix in amatch/lcs: hamming distance method was called erroneously. 226 | 227 | version 0.6.1 228 | - bugfix in parallel version of stringdistmatrix: parameter p was not passed 229 | (thanks to Ricardo Saporta) 230 | - bugfix in lv/osa/dl: maxDist ignored in certain cases 231 | 232 | version 0.6.0 233 | - added amatch function: approximate matching version of 'match' 234 | - added ain function: approximate matching version of '%in%' 235 | - qgrams now accepts arbitrary number of arguments. Outputs array, not table 236 | - added cosine distance 237 | - added Jaccard distance 238 | - added Jaro and Jaro-Winkler distances 239 | - small performance tweeks in underlying C code 240 | - Edge case in stringdistmatrix: output is now always of class matrix 241 | - Default maxDist is now Inf (this is only to make it more intuitive and does 242 | not break previous code) 243 | - BREAKING CHANGE: output -1 is replaced by Inf for all distance methods 244 | 245 | 246 | version 0.5.0 247 | - added qgram counting function 'qgrams' 248 | - faster edge case handling in osa method. 249 | - edge case in lv/osa/dl methods: distance returned length(b) in stead of -1 250 | when length(a) == 0, maxDist < length(b). 251 | - bugfix in lv/osa/dl method: maxDist returned when length(a) > maxDist > 0 252 | (thanks to Daniel Reckhard). 253 | - Hamming distance (method='h') now returns -1 for strings of unequal lengts 254 | (used to emit error). 255 | - added longest common substring distance (method='lcs'). 256 | - added qgram distance method. 257 | - stringdistmatrix gains cluster argument. 258 | 259 | version 0.4.2 260 | - Fix in error message for hamming distance 261 | - Workaround for system-dependent translation of utf8 NA characters 262 | 263 | version 0.4.0 264 | - First release 265 | -------------------------------------------------------------------------------- /pkg/R/stringdist.R: -------------------------------------------------------------------------------- 1 | #' A package for string distance calculation and approximate string matching. 2 | #' 3 | #' 4 | #' The \pkg{stringdist} package offers fast and platform-independent string 5 | #' metrics. Its main purpose is to compute various string distances and to do 6 | #' approximate text matching between character vectors. As of version 0.9.3, 7 | #' it is also possible to compute distances between sequences represented by 8 | #' integer vectors. 9 | #' 10 | #' 11 | #' A typical use is to match strings that are not precisely the same. For 12 | #' example 13 | #' 14 | #' \code{ amatch(c("hello","g'day"),c("hi","hallo","ola"),maxDist=2)} 15 | #' 16 | #' returns \code{c(2,NA)} since \code{"hello"} matches closest with 17 | #' \code{"hallo"}, and within the maximum (optimal string alignment) distance. 18 | #' The second element, \code{"g'day"}, matches closest with \code{"ola"} but 19 | #' since the distance equals 4, no match is reported. 20 | #' 21 | #' A second typical use is to compute string distances. For example 22 | #' 23 | #' \code{ stringdist(c("g'day"),c("hi","hallo","ola"))} 24 | #' 25 | #' Returns \code{c(5,5,4)} since these are the distances between \code{"g'day"} 26 | #' and respectively \code{"hi"}, \code{"hallo"}, and \code{"ola"}. 27 | #' 28 | #' A third typical use would be to compute a \code{dist} object. The command 29 | #' 30 | #' \code{stringdistmatrix(c("foo","bar","boo","baz"))} 31 | #' 32 | #' returns an object of class \code{dist} that can be used by clustering 33 | #' algorithms such as \code{stats::hclust}. 34 | #' 35 | #' A fourth use is to compute string distances between general sequences, 36 | #' represented as integer vectors (which must be stored in a \code{list}): 37 | #' 38 | #' \code{seq_dist( list(c(1L,1L,2L)), list(c(1L,2L,1L),c(2L,3L,1L,2L)) )} 39 | #' 40 | #' The above code yields the vector \code{c(1,2)} (the first shorter first 41 | #' argument is recycled over the longer second argument) 42 | #' 43 | #' Besides documentation for each function, the main topics documented are: 44 | #' 45 | #' \itemize{ 46 | #' \item{\code{\link{stringdist-metrics}} -- string metrics supported by the package} 47 | #' \item{\code{\link{stringdist-encoding}} -- how encoding is handled by the package} 48 | #' \item{\code{\link{stringdist-parallelization}} -- on multithreading } 49 | #' } 50 | #' 51 | #' @section Acknowledgements: 52 | #' \itemize{ 53 | #' \item{The code for the full Damerau-Levenshtein distance was adapted from Nick Logan's 54 | #' \href{https://github.com/ugexe/Text--Levenshtein--Damerau--XS/blob/master/damerau-int.c}{public github repository}.} 55 | #' \item{C code for converting UTF-8 to integer was copied from the R core for performance reasons.} 56 | #' \item{The code for soundex conversion and string similarity was kindly contributed by Jan van der Laan.} 57 | #' } 58 | #' @section Citation: 59 | #' If you would like to cite this package, please cite the \href{https://journal.r-project.org/archive/2014-1/loo.pdf}{R Journal Paper}: 60 | #' \itemize{ 61 | #' \item{M.P.J. van der Loo (2014). The \code{stringdist} package for approximate string matching. 62 | #' R Journal 6(1) pp 111-122} 63 | #' } 64 | #' Or use \code{citation('stringdist')} to get a bibtex item. 65 | #' 66 | #' @name stringdist-package 67 | #' @docType package 68 | #' @useDynLib stringdist, .registration=TRUE 69 | #' @importFrom parallel detectCores 70 | #' 71 | #' 72 | #' 73 | "_PACKAGE" 74 | 75 | listwarning <- function(x,y){ 76 | sprintf(" 77 | You are passing one or more arguments of type 'list' to 78 | '%s'. These arguments will be converted with 'as.character' 79 | which is likeley not to give what you want (did you mean to use '%s'?). 80 | This warning can be avoided by explicitly converting the argument(s). 81 | ",x,y) 82 | } 83 | 84 | #' Compute distance metrics between strings 85 | #' 86 | #' 87 | #' \code{stringdist} computes pairwise string distances between elements of 88 | #' \code{a} and \code{b}, where the argument with less elements is recycled. 89 | #' \code{stringdistmatrix} computes the string distance matrix with rows 90 | #' according to 91 | #' \code{a} and columns according to \code{b}. 92 | #' 93 | #' 94 | #' @param a R object (target); will be converted by \code{as.character} 95 | #' @param b R object (source); will be converted by \code{as.character} 96 | #' This argument is optional for \code{stringdistmatrix} (see section \code{Value}). 97 | #' @param method Method for distance calculation. The default is \code{"osa"}, 98 | #' see \code{\link{stringdist-metrics}}. 99 | #' @param useBytes Perform byte-wise comparison, see 100 | #' \code{\link{stringdist-encoding}}. 101 | #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for 102 | #' deletion, insertion, substitution and transposition, in that order. When 103 | #' \code{method='lv'}, the penalty for transposition is ignored. When 104 | #' \code{method='jw'}, the weights associated with characters of \code{a}, 105 | #' characters from \code{b} and the transposition weight, in that order. 106 | #' Weights must be positive and not exceed 1. \code{weight} is ignored 107 | #' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'}, 108 | #' \code{'Jaccard'}, \code{'lcs'}, or \code{soundex}. 109 | #' @param q Size of the \eqn{q}-gram; must be nonnegative. Only applies to 110 | #' \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}. 111 | #' @param p Prefix factor for Jaro-Winkler distance. The valid range for 112 | #' \code{p} is \code{0 <= p <= 0.25}. If \code{p=0} (default), the 113 | #' Jaro-distance is returned. Applies only to \code{method='jw'}. 114 | #' @param bt Winkler's boost threshold. Winkler's prefix factor is 115 | #' only applied when the Jaro distance is larger than \code{bt}. 116 | #' Applies only to \code{method='jw'} and \code{p>0}. 117 | #' @param nthread Maximum number of threads to use. By default, a sensible 118 | #' number of threads is chosen, see \code{\link{stringdist-parallelization}}. 119 | #' 120 | #' @seealso \code{\link{stringsim}}, \code{\link{qgrams}}, \code{\link{amatch}}, \code{\link{afind}} 121 | #' 122 | #' @return For \code{stringdist}, a vector with string distances of size 123 | #' \code{max(length(a),length(b))}. 124 | #' 125 | #' For \code{stringdistmatrix}: if both \code{a} and \code{b} are passed, a 126 | #' \code{length(a)xlength(b)} \code{matrix}. If a single argument \code{a} is 127 | #' given an object of class \code{\link[stats]{dist}} is returned. 128 | #' 129 | #' Distances are nonnegative if they can be computed, \code{NA} if any of the 130 | #' two argument strings is \code{NA} and \code{Inf} when \code{maxDist} is 131 | #' exceeded or, in case of the hamming distance, when the two compared strings 132 | #' have different length. 133 | #' 134 | #' 135 | #' @example ../examples/stringdist.R 136 | #' @export 137 | stringdist <- function(a, b 138 | , method=c("osa","lv","dl","hamming","lcs", "qgram","cosine","jaccard","jw","soundex") 139 | , useBytes = FALSE 140 | , weight=c(d=1,i=1,s=1,t=1) 141 | , q = 1 142 | , p = 0 143 | , bt = 0 144 | , nthread = getOption("sd_num_thread") 145 | ){ 146 | if (is.list(a)|is.list(b)) 147 | warning(listwarning("stringdist","seq_dist")) 148 | 149 | stopifnot( 150 | all(is.finite(weight)) 151 | , all(weight > 0) 152 | , all(weight <=1) 153 | , q >= 0 154 | , p <= 0.25 155 | , p >= 0 156 | , is.logical(useBytes) 157 | , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) 158 | , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) 159 | , length(nthread) == 1 160 | , is.numeric(nthread) 161 | , nthread > 0 162 | ) 163 | 164 | # note: enc2utf8 is very efficient when the native encoding is already UTF-8. 165 | a <- as.character(a) 166 | b <- as.character(b) 167 | if ( !useBytes ){ 168 | a <- enc2utf8(a) 169 | b <- enc2utf8(b) 170 | } 171 | 172 | if (length(a) == 0 || length(b) == 0){ 173 | return(numeric(0)) 174 | } 175 | if ( max(length(a),length(b)) %% min(length(a),length(b)) != 0 ){ 176 | warning(RECYCLEWARNING) 177 | } 178 | method <- match.arg(method) 179 | nthread <- as.integer(nthread) 180 | 181 | if (method == 'jw') weight <- weight[c(2,1,3)] 182 | do_dist(a=b, b=a 183 | , method=method 184 | , weight=weight 185 | , q=q 186 | , p=p 187 | , bt=bt 188 | , useBytes=useBytes 189 | , nthread=nthread) 190 | } 191 | 192 | 193 | #' @param useNames Use input vectors as row and column names? 194 | #' 195 | #' 196 | #' @rdname stringdist 197 | #' @export 198 | stringdistmatrix <- function(a, b 199 | , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw","soundex") 200 | , useBytes = FALSE 201 | , weight=c(d=1,i=1,s=1,t=1) 202 | , q = 1 203 | , p = 0 204 | , bt = 0 205 | , useNames=c('none','strings','names') 206 | , nthread = getOption("sd_num_thread") 207 | ){ 208 | if (is.list(a)|| (!missing(b) && is.list(b)) ){ 209 | warning(listwarning("stringdistmatrix","seq_distmatrix")) 210 | } 211 | 212 | # for backward compatability with stringdist <= 0.9.0 213 | if (identical(useNames, FALSE)) useNames <- "none" 214 | if (identical(useNames, TRUE)) useNames <- "strings" 215 | useNames <- match.arg(useNames) 216 | 217 | method <- match.arg(method) 218 | nthread <- as.integer(nthread) 219 | stopifnot( 220 | all(is.finite(weight)) 221 | , all(weight > 0) 222 | , all(weight <=1) 223 | , q >= 0 224 | , p <= 0.25 225 | , p >= 0 226 | , is.logical(useBytes) 227 | , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE) 228 | , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE) 229 | , length(nthread) == 1 230 | , is.numeric(nthread) 231 | , nthread > 0 232 | ) 233 | 234 | if (method == 'jw') weight <- weight[c(2,1,3)] 235 | 236 | # if b is missing, generate a 'dist' object. 237 | if (missing(b)){ 238 | if (useNames == "names"){ 239 | a <- setNames(as.character(a),names(a)) 240 | } else { 241 | a <- as.character(a) 242 | } 243 | return( lower_tri(a 244 | , method=method 245 | , useBytes=useBytes 246 | , weight=weight 247 | , q=q 248 | , p=p 249 | , bt=bt 250 | , useNames=useNames 251 | , nthread=nthread) 252 | ) 253 | } 254 | 255 | if (useNames == "names"){ 256 | rowns <- names(a) 257 | colns <- names(b) 258 | } 259 | 260 | # NOTE: this strips off names 261 | a <- as.character(a) 262 | b <- as.character(b) 263 | 264 | if (useNames=="strings"){ 265 | rowns <- a 266 | colns <- b 267 | } 268 | 269 | 270 | if (!useBytes){ 271 | a <- enc2utf8(a) 272 | b <- enc2utf8(b) 273 | } 274 | 275 | if (length(a) == 0 || length(b) == 0){ 276 | return(matrix(numeric(0))) 277 | } 278 | 279 | x <- vapply(b, do_dist, USE.NAMES=FALSE, FUN.VALUE=numeric(length(a)) 280 | , a, method,weight, q, p, bt, useBytes, nthread) 281 | 282 | if (useNames %in% c("strings","names") ){ 283 | structure(matrix(x,nrow=length(a),ncol=length(b), dimnames=list(rowns,colns))) 284 | } else { 285 | matrix(x,nrow=length(a),ncol=length(b)) 286 | } 287 | } 288 | 289 | 290 | char2int <- function(x){ 291 | # For some OS's enc2utf8 had unexpected behavior for NA's, 292 | # see https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=15201. 293 | # This is fixed for R >= 2.15.3. 294 | # i <- !is.na(x) 295 | # x[i] <- enc2utf8(x[i]) 296 | lapply(enc2utf8(x),utf8ToInt) 297 | } 298 | 299 | # enum-type in stringdist.h 300 | METHODS <- c( 301 | osa = 0L 302 | , lv = 1L 303 | , dl = 2L 304 | , hamming = 3L 305 | , lcs = 4L 306 | , qgram = 5L 307 | , cosine = 6L 308 | , jaccard = 7L 309 | , jw = 8L 310 | , soundex = 9L 311 | , running_cosine = 10L 312 | ) 313 | 314 | 315 | do_dist <- function(a, b, method, weight, q, p, bt, useBytes=FALSE, nthread=1L){ 316 | 317 | if (method=='soundex' && !all(printable_ascii(a) & printable_ascii(b)) ){ 318 | warning("Non-printable ascii or non-ascii characters in soundex. Results may be unreliable. See ?printable_ascii.") 319 | } 320 | method <- METHODS[method] 321 | if ( is.na(method) ){ 322 | stop(sprintf("method '%s' is not defined",method)) 323 | } 324 | 325 | d <- .Call("R_stringdist", a, b, method 326 | , as.double(weight), as.double(p), as.double(bt), as.integer(q) 327 | , as.integer(useBytes), as.integer(nthread) 328 | , PACKAGE="stringdist" 329 | ) 330 | 331 | d 332 | } 333 | 334 | # more efficient function that returns a square distance matrix as a 'stats::dist' object. 335 | lower_tri <- function(a 336 | , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw","soundex") 337 | , useBytes = FALSE 338 | , weight=c(d=1,i=1,s=1,t=1) 339 | , q=1 340 | , p=0 341 | , bt=0 342 | , useNames=FALSE 343 | , nthread = getOption("sd_num_thread") 344 | ){ 345 | methnr <- METHODS[method] 346 | if (is.na(method)){ 347 | stop(sprintf("method '%s' is not defined",method)) 348 | } 349 | 350 | x <- .Call("R_lower_tri", a, methnr 351 | , as.double(weight), as.double(p), as.double(bt) 352 | , as.integer(q), as.integer(useBytes), as.integer(nthread) 353 | , PACKAGE="stringdist") 354 | 355 | attributes(x) <- list(class='dist' 356 | , Size = length(a) 357 | , Diag = FALSE 358 | , Upper = FALSE 359 | , method = method) 360 | if (useNames == "strings") attr(x,"Labels") <- as.character(a) 361 | if (useNames == "names" ) attr(x,"Labels") <- names(a) 362 | 363 | x 364 | } 365 | 366 | 367 | 368 | 369 | -------------------------------------------------------------------------------- /pkg/inst/include/stringdist_api.h: -------------------------------------------------------------------------------- 1 | 2 | /* stringdist - a C library of string distance algorithms with an interface to R. 3 | * Copyright (C) 2013 Mark van der Loo 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see . 17 | * 18 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 19 | */ 20 | 21 | #ifndef _STRINGDIST_API_H 22 | #define _STRINGDIST_API_H 23 | 24 | #include 25 | 26 | #ifdef HAVE_VISIBILITY_ATTRIBUTE 27 | # define attribute_hidden __attribute__ ((visibility ("hidden"))) 28 | #else 29 | # define attribute_hidden 30 | #endif 31 | 32 | #ifdef __cplusplus 33 | extern "C" { 34 | #endif 35 | 36 | /** 37 | * @mainpage Stringdist C API 38 | * 39 | * @author Mark van der Loo, Jan van der Laan, R Core Team, Paul Hsieh, Chris Muir 40 | * @version `R package stringdist` version `0.9.5.0` and higher. 41 | * 42 | * @section using Using the stringdist C API 43 | * To call the functions described here from your package you need to: 44 | * 45 | * 1. Make sure that `stringdist` is installed. 46 | * 2. Add `stringdist` to `Imports` (or `Depends`) and `LinkingTo` in the `DESCRIPTION` file. 47 | * 3. In your source file under the package's `/src` directory, add the line 48 | * ``` 49 | * #include 50 | * ``` 51 | * 52 | * 53 | * An example of a published package using this API is 54 | * [refinr](https://CRAN.R-project.org/package=refinr). A minimal example can be 55 | * found [here](https://github.com/markvanderloo/linkstringdist). 56 | * 57 | * @section encoding Character encoding 58 | * All `character` vector input is expected to be in `UTF-8` (this also allows 59 | * `ASCII`). Distance computations are based on UTF [code 60 | * points](https://en.wikipedia.org/wiki/Code_point) unless `useBytes` is 61 | * `TRUE`, in which case distances are computed over byte sequences. Using 62 | * non-UTF-8 encoded strings is untested and is highly likely to result in 63 | * errors. 64 | * 65 | * @section threads Thread safety 66 | * 67 | * It is not safe to call functions from `stringdist` C API from 68 | * multiple concurrent threads. 69 | * 70 | * 71 | * 72 | */ 73 | 74 | 75 | /** 76 | * @file stringdist_api.h 77 | * @brief Functions exported from the stringdist package. 78 | * 79 | */ 80 | 81 | 82 | /* 83 | SEXP attribute_hidden sd_all_int(SEXP X) 84 | { 85 | static SEXP(*fun)(SEXP) = NULL; 86 | if (fun == NULL) fun = (SEXP(*)(SEXP)) R_GetCCallable("stringdist","R_all_int"); 87 | return fun(X); 88 | } 89 | */ 90 | 91 | /** 92 | * @brief Find the location of values in `x` in `table` by approximate matching. 93 | * 94 | * @param x `[character]` vector. 95 | * @param table `[character]` vector (lookup table) 96 | * @param method `[integer]` scalar, indicating the distance method as follows 97 | * @parblock 98 | * - 0: Optimal String Alignment (`"osa"`) 99 | * - 1: Levenshtein (`"lv"`) 100 | * - 2: Damerau-Levenshtein (`"dl"`) 101 | * - 3: Hamming (`"hamming"`) 102 | * - 4: Longest Common Substring (`"lcs"`) 103 | * - 5: q-gram (`"qgram"`) 104 | * - 6: cosine (`"cosine"`) 105 | * - 7: Jaccard (`"jaccard"`) 106 | * - 8: Jaro-Winkler (`"jw"`) 107 | * - 9: Soundex (`"soundex"`) 108 | * @endparblock 109 | * @param nomatch `[integer]` The value to be returned when no match is found. 110 | * @param matchNA Should `NA`s be matched? Default behaviour mimics the 111 | * behaviour of base `match`, meaning that `NA` matches `NA` (see also the note 112 | * on `NA` handling below). 113 | * @param weight `[numeric]` vector. Edit penalty 114 | * @parblock 115 | * For `method='osa'` or`'dl'`, the penalty for 116 | * deletion, insertion, substitution and transposition, in that order. When 117 | * `method='lv'`, the penalty for transposition is ignored. When 118 | * `method='jw'`, the weights associated with characters of `a`, 119 | * characters from `b` and the transposition weight, in that order. 120 | * Weights must be positive and not exceed 1. `weight` is ignored 121 | * completely for other methods 122 | * @endparblock 123 | * @param q `[integer]` scalar. Size of the q-gram; must be nonnegative. Only 124 | * applies to `method='qgram'`, `'jaccard'` or `'cosine'`. 125 | * @param maxDistance `[numeric]` scalar. The maximum distance allowed for matching. 126 | * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The 127 | * valid range for `p` is `0 <= p <= 0.25`. If `p=0` (default), the 128 | * Jaro-distance is returned. Applies only to `method='jw'`. 129 | * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty 130 | * factor is only applied when the Jaro distance is larger than `bt`. Applies 131 | * only to `method='jw'` and `p>0`. 132 | * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to 133 | * integer prior to distance calculation) 134 | * @param nthread `[integer]` scalar. Maximum number of threads to use. 135 | * 136 | * 137 | * @return 138 | * `[integer]` vector of `length(x)` with indices in `table`. 139 | */ 140 | SEXP attribute_hidden sd_amatch(SEXP x, SEXP table, SEXP method 141 | , SEXP nomatch, SEXP matchNA 142 | , SEXP weight, SEXP p, SEXP bt, SEXP q 143 | , SEXP maxDistance, SEXP useBytes 144 | , SEXP nthrd) 145 | { 146 | static SEXP(*fun)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP) = NULL; 147 | if (fun == NULL) fun = (SEXP(*)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP)) R_GetCCallable("stringdist","R_amatch"); 148 | return fun(x, table, method, nomatch, matchNA, weight, p, bt, q, maxDistance, useBytes, nthrd); 149 | } 150 | 151 | /** 152 | * @brief Compute q-gram counts 153 | * 154 | * @param a `[character]` vector 155 | * @param qq `[integer`] scalar. 156 | * 157 | * @return 158 | * A `[numeric]` vector of `length(a)*n_qgrams`, where `n_qrams` is the number 159 | * of different `qgrams` observed in the elements of `a`. The output vector has 160 | * an attribute called `qgrams`, which is an integer vector of size 161 | * `q*n_qgrams` containing integer (UTF-32) labels for the q-grams 162 | * sequentially. 163 | * 164 | */ 165 | SEXP attribute_hidden sd_get_qgrams(SEXP a, SEXP qq) 166 | { 167 | static SEXP(*fun)(SEXP, SEXP) = NULL; 168 | if (fun == NULL) fun = (SEXP(*)(SEXP, SEXP)) R_GetCCallable("stringdist","R_get_qgrams"); 169 | return fun(a, qq); 170 | } 171 | 172 | /* 173 | SEXP attribute_hidden sd_lengths(SEXP X) 174 | { 175 | static SEXP(*fun)(SEXP) = NULL; 176 | if (fun == NULL) fun = (SEXP(*)(SEXP)) R_GetCCallable("stringdist","R_lengths"); 177 | return fun(X); 178 | } 179 | */ 180 | 181 | /** 182 | * @brief Lower tridiagonal elements of distance matrix. 183 | * 184 | * @param a `[character]` vector 185 | * @param method `[integer]` scalar, indicating the distance method as follows 186 | * @parblock 187 | * - 0: Optimal String Alignment (`"osa"`) 188 | * - 1: Levenshtein (`"lv"`) 189 | * - 2: Damerau-Levenshtein (`"dl"`) 190 | * - 3: Hamming (`"hamming"`) 191 | * - 4: Longest Common Substring (`"lcs"`) 192 | * - 5: q-gram (`"qgram"`) 193 | * - 6: cosine (`"cosine"`) 194 | * - 7: Jaccard (`"jaccard"`) 195 | * - 8: Jaro-Winkler (`"jw"`) 196 | * - 9: Soundex (`"soundex"`) 197 | * @endparblock 198 | * @param weight `[numeric]` vector. Edit penalty 199 | * @parblock 200 | * For `method='osa'` or`'dl'`, the penalty for deletion, insertion, 201 | * substitution and transposition, in that order. When `method='lv'`, the 202 | * penalty for transposition is ignored. When `method='jw'`, the weights 203 | * associated with characters of `a`, characters from `b` and the 204 | * transposition weight, in that order. Weights must be positive and not 205 | * exceed 1. `weight` is ignored completely for other methods 206 | * @endparblock 207 | * @param q `[integer]` scalar. Size of the q-gram; must be nonnegative. Only 208 | * applies to `method='qgram'`, `'jaccard'` or `'cosine'`. 209 | * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The 210 | * valid range for `p` is `0 <= p <= 0.25`. If `p=0` (default), the 211 | * Jaro-distance is returned. Applies only to `method='jw'`. 212 | * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty 213 | * factor is only applied when the Jaro distance is larger than `bt`. Applies 214 | * only to `method='jw'` and `p>0`. 215 | * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to 216 | * integer prior to distance calculation) 217 | * @param nthread `[integer]` scalar. Maximum number of threads to use. 218 | * 219 | * @return 220 | * A `[numeric]` vector of length `n*(n-1)/2`, where `n=length(a)`. It contains 221 | * the positive values of consequtive columns of the distance matrix. Also see 222 | * the R-code in `stringdist:::lower_tri`. 223 | */ 224 | SEXP attribute_hidden sd_lower_tri(SEXP a, SEXP method 225 | , SEXP weight, SEXP p, SEXP bt, SEXP q 226 | , SEXP useBytes, SEXP nthrd) 227 | { 228 | static SEXP(*fun)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP) = NULL; 229 | if (fun == NULL) fun = (SEXP(*)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP)) R_GetCCallable("stringdist","R_lower_tri"); 230 | return fun(a, method, weight, p, bt, q, useBytes, nthrd); 231 | } 232 | 233 | /** 234 | * @brief Compute soundex code 235 | * 236 | * @param[in] x `[character]` vector 237 | * @param[in] useBytes `[logical]` scalar. 238 | * 239 | * @return 240 | * 241 | * A `list` with `length(x)` element. Each element is a length 4 integer vector 242 | * representing a 4-character soundex code. The integers are ASCII code points. 243 | * 244 | */ 245 | SEXP attribute_hidden sd_soundex(SEXP x, SEXP useBytes) 246 | { 247 | static SEXP(*fun)(SEXP, SEXP) = NULL; 248 | if (fun == NULL) fun = (SEXP(*)(SEXP, SEXP)) R_GetCCallable("stringdist","R_soundex"); 249 | return fun(x, useBytes); 250 | } 251 | 252 | /** 253 | * @brief compute string distances 254 | * 255 | * @param a `[character]` vector 256 | * @param b `[character]` vector 257 | * @param method `[integer]` scalar, indicating the distance method as follows 258 | * @parblock 259 | * - 0: Optimal String Alignment (`"osa"`) 260 | * - 1: Levenshtein (`"lv"`) 261 | * - 2: Damerau-Levenshtein (`"dl"`) 262 | * - 3: Hamming (`"hamming"`) 263 | * - 4: Longest Common Substring (`"lcs"`) 264 | * - 5: q-gram (`"qgram"`) 265 | * - 6: cosine (`"cosine"`) 266 | * - 7: Jaccard (`"jaccard"`) 267 | * - 8: Jaro-Winkler (`"jw"`) 268 | * - 9: Soundex (`"soundex"`) 269 | * @endparblock 270 | * @param weight `[numeric]` vector. Edit penalty 271 | * @parblock 272 | * For `method='osa'` or`'dl'`, the penalty for deletion, insertion, 273 | * substitution and transposition, in that order. When `method='lv'`, the 274 | * penalty for transposition is ignored. When `method='jw'`, the weights 275 | * associated with characters of `a`, characters from `b` and the 276 | * transposition weight, in that order. Weights must be positive and not 277 | * exceed 1. `weight` is ignored completely for other methods 278 | * @endparblock 279 | * @param q `[integer]` scalar. Size of the q-gram; must be nonnegative. Only 280 | * applies to `method='qgram'`, `'jaccard'` or `'cosine'`. 281 | * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The 282 | * valid range for `p` is `0 <= p <= 0.25`. If `p=0` (default), the 283 | * Jaro-distance is returned. Applies only to `method='jw'`. 284 | * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty 285 | * factor is only applied when the Jaro distance is larger than `bt`. Applies 286 | * only to `method='jw'` and `p>0`. 287 | * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to 288 | * integer prior to distance calculation) 289 | * @param nthread `[integer]` scalar. Maximum number of threads to use. 290 | * 291 | * 292 | * @return 293 | * A `[numeric]` vector of length `max(length(a),length(b))` where the shortest 294 | * vector is recycled over the longer (no warnings are given when the longer 295 | * length is not an integer multiple of the shorter length). 296 | * 297 | * 298 | */ 299 | SEXP attribute_hidden sd_stringdist(SEXP a, SEXP b, SEXP method 300 | , SEXP weight, SEXP p, SEXP bt, SEXP q 301 | , SEXP useBytes, SEXP nthrd) 302 | { 303 | static SEXP(*fun)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP) = NULL; 304 | if (fun == NULL) fun = (SEXP(*)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP)) R_GetCCallable("stringdist","R_stringdist"); 305 | return fun(a, b, method, weight, p, bt, q, useBytes, nthrd); 306 | } 307 | 308 | 309 | #ifdef __cplusplus 310 | } 311 | #endif 312 | 313 | #endif 314 | -------------------------------------------------------------------------------- /pkg/src/Rstringdist.c: -------------------------------------------------------------------------------- 1 | /* stringdist - a C library of string distance algorithms with an interface to R. 2 | * Copyright (C) 2013 Mark van der Loo 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | * 17 | * You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #ifdef _OPENMP 24 | #include 25 | #endif 26 | #include "utils.h" 27 | #include "stringdist.h" 28 | 29 | #define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) 30 | #define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) 31 | 32 | 33 | static Stringdist *R_open_stringdist(Distance d, int max_len_a, int max_len_b, SEXP weight, SEXP p, SEXP bt, SEXP q){ 34 | 35 | Stringdist *sd = NULL; 36 | if (d == osa || d == lv || d == dl || d == hamming || d == lcs){ 37 | sd = open_stringdist(d, max_len_a, max_len_b, REAL(weight)); 38 | } else if (d == qgram || d == cosine || d == jaccard || d == running_cosine){ 39 | sd = open_stringdist(d, max_len_a, max_len_b, (unsigned int) INTEGER(q)[0]); 40 | } else if ( d == jw ){ 41 | sd = open_stringdist(d, max_len_a, max_len_b, REAL(weight), REAL(p)[0], REAL(bt)[0]); 42 | } else if (d == soundex) { 43 | sd = open_stringdist(d, max_len_a, max_len_b); 44 | } 45 | if ( sd == NULL ){ 46 | error("Could not allocate enough memory"); 47 | } 48 | return sd; 49 | } 50 | 51 | 52 | 53 | SEXP R_stringdist(SEXP a, SEXP b, SEXP method 54 | , SEXP weight, SEXP p, SEXP bt, SEXP q 55 | , SEXP useBytes, SEXP nthrd){ 56 | 57 | int na = length(a) 58 | , nb = length(b) 59 | , bytes = INTEGER(useBytes)[0] 60 | , ml_a = max_length(a) 61 | , ml_b = max_length(b) 62 | , nt = (na > nb) ? na : nb 63 | , intdist = TYPEOF(a) == VECSXP ? 1 : 0; // expect lists of integers? 64 | 65 | // output vector 66 | SEXP yy; 67 | yy = PROTECT(allocVector(REALSXP, nt)); 68 | double *y = REAL(yy); 69 | 70 | #ifdef _OPENMP 71 | int nthreads = MIN(INTEGER(nthrd)[0],MAX(na,nb)); 72 | #pragma omp parallel num_threads(nthreads) default(none) \ 73 | shared(y,na,nb, R_PosInf, NA_REAL, bytes, intdist, method, weight, p, bt, q, ml_a, ml_b, nt, a, b) 74 | #endif 75 | { 76 | 77 | Stringdist *sd = R_open_stringdist( (Distance) INTEGER(method)[0] 78 | , ml_a, ml_b 79 | , weight 80 | , p 81 | , bt 82 | , q 83 | ); 84 | 85 | unsigned int *s = NULL, *t = NULL; 86 | s = (unsigned int *) malloc(( 2L + ml_a + ml_b) * sizeof(int)); 87 | 88 | if ( (sd==NULL) | (bytes && s == NULL) ) nt = -1; 89 | t = s + ml_a + 1L; 90 | 91 | int len_s, len_t, isna_s, isna_t 92 | , i = 0, j = 0, ID = 0, num_threads = 1; 93 | 94 | #ifdef _OPENMP 95 | ID = omp_get_thread_num(); 96 | num_threads = omp_get_num_threads(); 97 | i = recycle(ID-num_threads, num_threads, na); 98 | j = recycle(ID-num_threads, num_threads, nb); 99 | #endif 100 | for ( int k=ID; k < nt; k += num_threads ){ 101 | get_elem(a, i, bytes, intdist, &len_s, &isna_s, s); 102 | get_elem(b, j, bytes, intdist, &len_t, &isna_t, t); 103 | if (isna_s || isna_t){ 104 | y[k] = NA_REAL; 105 | } else { 106 | y[k] = stringdist(sd, s, len_s, t, len_t); 107 | if ( y[k] < 0 ) y[k] = R_PosInf; 108 | } 109 | i = recycle(i, num_threads, na); 110 | j = recycle(j, num_threads, nb); 111 | } 112 | 113 | 114 | close_stringdist(sd); 115 | 116 | free(s); 117 | } // end of parallel region 118 | 119 | UNPROTECT(1); 120 | if (nt < 0 ) error("Unable to allocate enough memory"); 121 | return(yy); 122 | } 123 | 124 | /* amatch 125 | * 126 | */ 127 | SEXP R_amatch(SEXP x, SEXP table, SEXP method 128 | , SEXP nomatch, SEXP matchNA 129 | , SEXP weight, SEXP p, SEXP bt, SEXP q 130 | , SEXP maxDistance, SEXP useBytes 131 | , SEXP nthrd){ 132 | 133 | 134 | int nx = length(x) 135 | , ntable = length(table) 136 | , no_match = INTEGER(nomatch)[0] 137 | , match_na = INTEGER(matchNA)[0] 138 | , bytes = INTEGER(useBytes)[0] 139 | , ml_x = max_length(x) 140 | , ml_t = max_length(table) 141 | , intdist = TYPEOF(x) == VECSXP ? 1 : 0; // list of integers? 142 | 143 | 144 | double maxDist = REAL(maxDistance)[0]; 145 | 146 | // convert to integer. 147 | Stringset *X = new_stringset(x, bytes, intdist); 148 | Stringset *T = new_stringset(table, bytes, intdist); 149 | 150 | // output vector 151 | SEXP yy; 152 | yy = PROTECT(allocVector(INTSXP, nx)); 153 | int *y = INTEGER(yy); 154 | 155 | #ifdef _OPENMP 156 | int nthreads = MAX(MIN(INTEGER(nthrd)[0],nx),0); 157 | #pragma omp parallel num_threads(nthreads) default(none) \ 158 | shared(X, T, y, R_PosInf, NA_INTEGER, nx, ntable, no_match, match_na, ml_x, ml_t, method, weight, p, bt, q, maxDist) 159 | #endif 160 | { 161 | /* claim space for workhorse */ 162 | 163 | Stringdist *sd = R_open_stringdist( (Distance) INTEGER(method)[0] 164 | , ml_x, ml_t 165 | , weight 166 | , p 167 | , bt 168 | , q 169 | ); 170 | 171 | double d = R_PosInf, d1 = R_PosInf; 172 | int index, len_X, len_T; 173 | unsigned int *str; 174 | unsigned int **tab; 175 | 176 | #ifdef _OPENMP 177 | #pragma omp for 178 | #endif 179 | for ( int i=0; istr_len[i]; 182 | d1 = R_PosInf; 183 | str = X->string[i]; 184 | tab = T->string; 185 | for ( int j=0; jstr_len[j]; 187 | if (len_X != NA_INTEGER && len_T != NA_INTEGER ){ // both are char (usual case) 188 | d = stringdist(sd, str, len_X, *tab, len_T); 189 | if ( d <= maxDist && d < d1){ 190 | index = j + 1; 191 | if ( fabs(d) < 1e-14 ){ 192 | break; // exact match 193 | } 194 | d1 = d; 195 | } 196 | } else if ( len_X == NA_INTEGER && len_T == NA_INTEGER ) { // both are NA 197 | index = match_na ? j + 1 : no_match; 198 | break; 199 | } 200 | } 201 | y[i] = index; 202 | } 203 | str=NULL; 204 | tab=NULL; 205 | close_stringdist(sd); 206 | } // end of parallel region 207 | free_stringset(X); 208 | free_stringset(T); 209 | UNPROTECT(1); 210 | 211 | return(yy); 212 | } // end R_amatch 213 | 214 | 215 | // Lower tridiagonal distance matrix for a single vector argument. 216 | 217 | static int get_j(R_xlen_t k, R_xlen_t n){ 218 | double nd = (double) n; 219 | double kd = (double) k; 220 | double u = ceil( (2.*nd - 3.)/2. - sqrt(pow(nd-.5,2.) - 2.*(kd + 1.)) ); 221 | 222 | return (int) u; 223 | } 224 | 225 | /* max n for objects of length n(n-1). 226 | * 227 | */ 228 | #ifdef LONG_VECTOR_SUPPORT 229 | #define MAXN ( (R_xlen_t) (0.5 + 1.5 * sqrt((double) R_XLEN_T_MAX)) ) 230 | #else 231 | #define MAXN ( (R_xlen_t) (0.5 + 1.5 * sqrt((double) R_LEN_T_MAX)) ) 232 | #endif 233 | 234 | SEXP R_lower_tri(SEXP a, SEXP method 235 | , SEXP weight, SEXP p, SEXP bt, SEXP q 236 | , SEXP useBytes, SEXP nthrd){ 237 | 238 | int bytes = INTEGER(useBytes)[0] 239 | , ml = max_length(a) 240 | , intdist = TYPEOF(a) == VECSXP ? 1 : 0; // expect list of integer vectors? 241 | 242 | // Long vectors on platforms where LONG_VECTOR_SUPPORT is defined. 243 | R_xlen_t n = xlength(a) 244 | , N = n*(n-1)/2; 245 | 246 | if ( n > MAXN ){ 247 | error("Length of input vector (%1.0f) exceeds maximum allowed for this platform (%1.0f)",(double) n,(double) MAXN); 248 | } 249 | 250 | 251 | // output vector 252 | SEXP yy; 253 | yy = PROTECT(allocVector(REALSXP, N)); 254 | // nothing to do if n=1 255 | if (n == 1L) goto end ; 256 | double *y = REAL(yy); 257 | 258 | 259 | #ifdef _OPENMP 260 | int nthreads = MIN(INTEGER(nthrd)[0],N); 261 | nthreads = MIN(nthreads, n); 262 | #pragma omp parallel num_threads(nthreads) default(none) \ 263 | shared(y,n,N, R_PosInf, NA_REAL, bytes, intdist, method, weight, p, bt, q, ml, a) 264 | #endif 265 | { 266 | 267 | Stringdist *sd = R_open_stringdist( (Distance) INTEGER(method)[0] 268 | , ml, ml 269 | , weight 270 | , p 271 | , bt 272 | , q 273 | ); 274 | 275 | unsigned int *s = NULL, *t = NULL; 276 | s = (unsigned int *) malloc(( 2L + 2*ml) * sizeof(int)); 277 | 278 | if ( (sd==NULL) | (bytes && s == NULL) ) n = -1; 279 | t = s + ml + 1L; 280 | 281 | int len_s, len_t, isna_s, isna_t 282 | , j = 0 283 | , thread_id = 0 284 | , n_threads = 1; 285 | 286 | R_xlen_t pp = 0 287 | , k_start = 0 288 | , i = 0 289 | , col_max = n-1 290 | , k_end = N; 291 | 292 | #ifdef _OPENMP 293 | thread_id = omp_get_thread_num(); 294 | n_threads = omp_get_num_threads(); 295 | #endif 296 | // some administration to parallelize the loop. 297 | pp = N / n_threads; 298 | k_start = thread_id * pp; 299 | k_end = (thread_id < n_threads - 1 ) ? k_start + pp : N; 300 | j = get_j(k_start, n); 301 | i = k_start + j * (j - 2*n + 3)/2; 302 | for ( R_xlen_t k=k_start; k < k_end; k++ ){ 303 | i++; 304 | get_elem(a, i, bytes, intdist, &len_s, &isna_s, s); 305 | get_elem(a, j, bytes, intdist, &len_t, &isna_t, t); 306 | 307 | if (isna_s || isna_t){ 308 | y[k] = NA_REAL; 309 | } else { 310 | y[k] = stringdist(sd, s, len_s, t, len_t); 311 | if ( y[k] < 0 ) y[k] = R_PosInf; 312 | } 313 | if ( i == col_max ){ 314 | j++; 315 | i = j; 316 | } 317 | } 318 | 319 | free(s); 320 | close_stringdist(sd); 321 | } // end of parallel region 322 | 323 | end: 324 | UNPROTECT(1); 325 | if (n < 0 ) error("Unable to allocate enough memory"); 326 | return(yy); 327 | } 328 | 329 | // afind 330 | // For each string in 'a', return the starting position of 331 | // the best match with 'pattern'. 332 | SEXP R_afind(SEXP a, SEXP pattern, SEXP width 333 | , SEXP method, SEXP weight, SEXP p, SEXP bt 334 | , SEXP q, SEXP useBytes, SEXP nthrd) 335 | { 336 | 337 | int na = length(a) // nr of texts to search 338 | , npat = length(pattern) // nr of patterns 339 | , ml_a = max_length(a) // max length of searched string 340 | , ml_b = max_length(pattern) // max length of the pattern. 341 | , intdist = 0 // no distances between integer sequences (yet) 342 | , bytes = INTEGER(useBytes)[0]; 343 | 344 | 345 | int *window = INTEGER(width); // access the window widths. 346 | 347 | // output list 348 | SEXP out_list; 349 | out_list = PROTECT(allocVector(VECSXP, 2)); 350 | 351 | // output location 352 | SEXP out_loc = PROTECT(allocMatrix(INTSXP, na, npat)); 353 | SET_VECTOR_ELT(out_list,0, out_loc); 354 | int *yloc = INTEGER(out_loc); 355 | 356 | // output distance 357 | SEXP out_dist = PROTECT(allocMatrix(REALSXP, na, npat)); 358 | SET_VECTOR_ELT(out_list,1, out_dist); 359 | double *ydist = REAL(out_dist); 360 | // Setup stringdist structure. 361 | // find maximum window length 362 | int max_window = 0; 363 | for ( int i=0; i= len_s ){ // is the text shorter than the window? 415 | yloc[offset + i] = 1L; 416 | ydist[offset + i] = stringdist(sd, s, len_s, t, len_t); 417 | } else { // slide window over text and compute distances 418 | max_k = len_s - current_window; 419 | d_min = R_PosInf; 420 | k_min = 0; 421 | for (int k = 0; k <= max_k; k++){ 422 | d = stringdist(sd, s + k, current_window, t, len_t); 423 | if ( d < d_min ){ 424 | d_min = d; 425 | k_min = k; 426 | } 427 | } // end loop over windows 428 | yloc[offset + i] = k_min + 1; 429 | ydist[offset + i] = d_min; 430 | reset_stringdist(sd); 431 | } 432 | } // end loop over patterns 433 | } // end loop over strings 434 | close_stringdist(sd); 435 | } // end parallel region 436 | UNPROTECT(3); 437 | return(out_list); 438 | 439 | } 440 | 441 | // helper function to determine whether all is INTSXP 442 | 443 | SEXP R_all_int(SEXP X){ 444 | 445 | SEXP all_int; 446 | all_int = PROTECT(allocVector(LGLSXP,1L)); 447 | 448 | int n = length(X); 449 | LOGICAL(all_int)[0] = 1L; 450 | for (int i=0; i