├── test
    ├── testpack.R
    ├── autotest.R
    ├── vgtest.R
    └── smoke.R
├── pkg
    ├── vignettes
    │   ├── stringdist_api.pdf
    │   ├── loo2014stringdist.pdf
    │   ├── stringdist_C-Cpp_api.Rnw
    │   └── RJournal_6_111-122-2014.Rnw
    ├── src
    │   ├── Makevars
    │   ├── Makevars.win
    │   ├── qtree.h
    │   ├── dictionary.h
    │   ├── dist.h
    │   ├── hamming.c
    │   ├── utils.c
    │   ├── lcs.c
    │   ├── R_register_native.c
    │   ├── stringdist.h
    │   ├── lv.c
    │   ├── osa.c
    │   ├── jaro.c
    │   ├── utils.h
    │   ├── dl.c
    │   ├── stringdist.c
    │   ├── soundex.c
    │   ├── utf8ToInt.c
    │   └── Rstringdist.c
    ├── tests
    │   └── tinytest.R
    ├── inst
    │   ├── tinytest
    │   │   ├── test_gh_issue_88.R
    │   │   ├── test_gh_issue_78.R
    │   │   ├── test_gh_issue_59.R
    │   │   ├── test_phonetic.R
    │   │   ├── test_qgrams.R
    │   │   ├── test_seq_dist.R
    │   │   ├── test_afind.R
    │   │   ├── test_stringsim.R
    │   │   └── test_amatch.R
    │   ├── CITATION
    │   └── include
    │   │   └── stringdist_api.h
    ├── R
    │   ├── doc_api.R
    │   ├── phonetic.R
    │   ├── doc_parallel.R
    │   ├── utils.R
    │   ├── doc_encoding.R
    │   ├── qgrams.R
    │   ├── seqdist.R
    │   ├── afind.R
    │   ├── stringsim.R
    │   ├── doc_metrics.R
    │   ├── amatch.R
    │   └── stringdist.R
    ├── NAMESPACE
    ├── DESCRIPTION
    ├── README.md
    └── NEWS
├── examples
    ├── seq_qgrams.R
    ├── seq_sim.R
    ├── phonetic.R
    ├── stringsim.R
    ├── printable_ascii.R
    ├── amatch.R
    ├── seq_amatch.R
    ├── qgrams.R
    ├── seq_dist.R
    └── stringdist.R
├── .gitignore
├── compile.sh
├── ubsan.sh
├── test.r
├── Makefile
├── drat.sh
└── README.md


/test/testpack.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | test_package('stringdist')
3 | 
4 | 


--------------------------------------------------------------------------------
/pkg/vignettes/stringdist_api.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markvanderloo/stringdist/HEAD/pkg/vignettes/stringdist_api.pdf


--------------------------------------------------------------------------------
/pkg/vignettes/loo2014stringdist.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markvanderloo/stringdist/HEAD/pkg/vignettes/loo2014stringdist.pdf


--------------------------------------------------------------------------------
/pkg/src/Makevars:
--------------------------------------------------------------------------------
1 | 
2 | PKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS)
3 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
4 | PKG_CPPFLAGS = -I../inst/include
5 | 


--------------------------------------------------------------------------------
/pkg/tests/tinytest.R:
--------------------------------------------------------------------------------
1 | if ( requireNamespace("tinytest", quietly=TRUE) ){
2 |   tinytest::test_package("stringdist")
3 | }
4 | 
5 | 
6 | 


--------------------------------------------------------------------------------
/pkg/src/Makevars.win:
--------------------------------------------------------------------------------
1 | 
2 | PKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS)
3 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
4 | PKG_CPPFLAGS = -I../inst/include
5 | 


--------------------------------------------------------------------------------
/test/autotest.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | 
3 | 
4 | dyn.load("../pkg/src/stringdist.so")
5 | auto_test("../pkg/R", "../pkg/tests/testthat")
6 | 
7 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_gh_issue_88.R:
--------------------------------------------------------------------------------
1 | options(sd_num_thread=1L)
2 | 
3 | x <- c("ca", "abc", "cba")
4 | expect_equal(stringsimmatrix(x), t(stringsimmatrix(x)))
5 | 
6 | 
7 | 
8 |  
9 | 


--------------------------------------------------------------------------------
/examples/seq_qgrams.R:
--------------------------------------------------------------------------------
1 | 
2 | # compare the 2-gram overlap between sequences 1:3 and 2:4
3 | seq_qgrams(x = 1:3, y=2:4,q=2)
4 | 
5 | # behavior when NA's are present.
6 | seq_qgrams(1:3,c(1,NA,2),NA_integer_)
7 | 


--------------------------------------------------------------------------------
/pkg/vignettes/stringdist_C-Cpp_api.Rnw:
--------------------------------------------------------------------------------
1 | \documentclass{article}
2 | \usepackage{pdfpages}
3 | %\VignetteIndexEntry{stringdist C/C++ API}
4 | 
5 | \begin{document}
6 | \includepdf[pages=-, fitpaper=true]{stringdist_api.pdf}
7 | \end{document}
8 | 


--------------------------------------------------------------------------------
/examples/seq_sim.R:
--------------------------------------------------------------------------------
 1 | L1 <- list(1:3,2:4)
 2 | L2 <- list(1:3)
 3 | seq_sim(L1,L2,method="osa")
 4 | 
 5 | # note how missing values are handled (L2 is recycled over L1)
 6 | L1 <- list(c(1L,NA_integer_,3L),2:4,NA_integer_)
 7 | L2 <- list(1:3)
 8 | seq_sim(L1,L2)
 9 | 
10 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # compiled source
 2 | *.o
 3 | *.so
 4 | # temporary vim files
 5 | *.swp
 6 | 
 7 | # tex stuff
 8 | *.pdf
 9 | *.tex
10 | *.bbl
11 | *.blg
12 | *.log
13 | *.aux
14 | 
15 | 
16 | # ignored directories
17 | output/
18 | pkg/man/
19 | 
20 | 
21 | .Rproj.user
22 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_gh_issue_78.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # x <- "IÑIGO", we avoid problems on Windows here.
 3 | x <- intToUtf8(c(73, 209,  73,  71,  79))
 4 | 
 5 | expect_equal(stringdist("INIGO", x, method="lv", useBytes=FALSE),1)
 6 | expect_equal(amatch("INIGO", x, method="lv",maxDist=1),1)
 7 | 
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/examples/phonetic.R:
--------------------------------------------------------------------------------
1 | # The following examples are from The Art of Computer Programming (part III, p. 395)
2 | # (Note that our algorithm is specified different from the one in TACP, see references.)
3 | phonetic(c('Euler','Gauss','Hilbert','Knuth','Lloyd','Lukasiewicz','Wachs'),method='soundex')
4 | 
5 | 
6 | 


--------------------------------------------------------------------------------
/examples/stringsim.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | # Calculate the similarity using the default method of optimal string alignment
 4 | stringsim("ca", "abc")
 5 | 
 6 | # Calculate the similarity using the Jaro-Winkler method
 7 | # The p argument is passed on to stringdist
 8 | stringsim('MARTHA','MATHRA',method='jw', p=0.1)
 9 | 
10 | 


--------------------------------------------------------------------------------
/examples/printable_ascii.R:
--------------------------------------------------------------------------------
 1 | # define o-umlaut
 2 | ouml <- intToUtf8("0x00F6")
 3 | x <- c("Motorhead", paste0("Mot",ouml,"rhead"))
 4 | # second element contains a non-ascii character
 5 | printable_ascii(x)
 6 | 
 7 | # Control characters (like carriage return) are also excluded
 8 | printable_ascii("abc\r")
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/pkg/src/qtree.h:
--------------------------------------------------------------------------------
 1 | #ifndef SD_QTREE_H
 2 | #define SD_QTREE_H
 3 | 
 4 | /* binary tree; dictionary of qgrams */
 5 | 
 6 | typedef struct qnode {
 7 |   unsigned int *qgram; // the q-gram.
 8 |   double *n;           // (vector of) counts.
 9 |   struct qnode *left;
10 |   struct qnode *right;
11 | } qtree;
12 | 
13 | 
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_gh_issue_59.R:
--------------------------------------------------------------------------------
 1 | # this would crash R because of over-asking memory
 2 | # it depends on the system really, so we only run this at the
 3 | # comfort of our home
 4 | if (FALSE){
 5 |   x <- paste(letters[sample(1:length(letters),32800,replace=TRUE)], collapse="")
 6 |   expect_error(stringdist(x,x))
 7 | }
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/pkg/src/dictionary.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef SD_DICTIONARY_H
 3 | #define SD_DICTIONARY_H
 4 | 
 5 | /* Unsorted dictionary for dl distance */
 6 | typedef struct {
 7 |   // character
 8 |   unsigned int *key; 
 9 |   // number found
10 |   unsigned int *value;
11 |   // size of dictionary
12 |   unsigned int length;
13 | } dictionary;
14 | 
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/pkg/inst/CITATION:
--------------------------------------------------------------------------------
 1 | bibentry(bibtype = "article",
 2 |  title = "The stringdist package for approximate string matching"
 3 |  , author = person("M.P.J.", "van der Loo")
 4 |  , year = 2014
 5 |  , journal = {"The {R} {J}ournal"}
 6 |  , volume = 6
 7 |  , issue  = 1
 8 |  , url = "https://CRAN.R-project.org/package=stringdist"
 9 |  , pages = {"111-122"}
10 | )
11 | 


--------------------------------------------------------------------------------
/compile.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | cd pkg/src
 4 | rm --verbose *.o *.so
 5 | gcc-10 -std=gnu99 -I/usr/share/R/include -DNDEBUG -fpic -fopenmp -O4 -Wall -pipe  -g  -c -Wstrict-prototypes -Wformat *.c 
 6 | #gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -fpic -O2 -Wall -pipe  -g  -c *.c 
 7 | gcc -std=gnu99 -shared -o stringdist.so *.o -L/usr/lib/R/lib -lR
 8 | cd ../../
 9 | 
10 | 


--------------------------------------------------------------------------------
/pkg/R/doc_api.R:
--------------------------------------------------------------------------------
 1 | #' @title
 2 | #' Calling stringdist from \code{C} or \code{C++}
 3 | #'
 4 | #' @description
 5 | #' As of version \code{0.9.5.0} several \code{C} level functions can be linked to
 6 | #' and called from C code in other R packages.
 7 | #'
 8 | #' A description of the API can be found in \href{../doc/stringdist_api.pdf}{stringdist_api.pdf}.
 9 | #'
10 | #' @name stringdist_api
11 | #' @rdname stringdist-api
12 | {}
13 | 
14 | 


--------------------------------------------------------------------------------
/ubsan.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # perform build and R CMD check with undefined behaviour sanitizer switched on.
 4 | 
 5 | ## Thanks to Dirk for posting this command here:
 6 | ## http://dirk.eddelbuettel.com/blog/2015/01/18/#ubsan-clang-container
 7 | 
 8 | cd output
 9 | 
10 | R CMD build ../pkg 
11 | 
12 | docker run --rm -ti -v $(pwd):/mnt rocker/r-devel-ubsan-clang check.r --setwd /mnt --install-deps stringdist_*.tar.gz
13 | 
14 | cd ..
15 | 
16 | 


--------------------------------------------------------------------------------
/pkg/vignettes/RJournal_6_111-122-2014.Rnw:
--------------------------------------------------------------------------------
 1 | \documentclass{article}
 2 | \usepackage{pdfpages}
 3 | %\VignetteIndexEntry{RJournal 6 111-122 (2014)}
 4 | 
 5 | \begin{document}
 6 | \includepdf[pages=-, fitpaper=true]{loo2014stringdist.pdf}
 7 | 
 8 | \newpage{}
 9 | \subsection*{Errata}
10 | A few things have changed after the paper was published. These changes
11 | are documented here.
12 | 
13 | \begin{itemize}
14 | \item[Version 0.9.3:] $q$-gram distances are now always 0 when $q=0$ (used to be \texttt{Inf}).
15 | \end{itemize}
16 | 
17 | \end{document}
18 | 


--------------------------------------------------------------------------------
/pkg/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(afind)
 4 | export(ain)
 5 | export(amatch)
 6 | export(extract)
 7 | export(grab)
 8 | export(grabl)
 9 | export(phonetic)
10 | export(printable_ascii)
11 | export(qgrams)
12 | export(seq_ain)
13 | export(seq_amatch)
14 | export(seq_dist)
15 | export(seq_distmatrix)
16 | export(seq_qgrams)
17 | export(seq_sim)
18 | export(stringdist)
19 | export(stringdistmatrix)
20 | export(stringsim)
21 | export(stringsimmatrix)
22 | importFrom(parallel,detectCores)
23 | useDynLib(stringdist, .registration=TRUE)
24 | 


--------------------------------------------------------------------------------
/test/vgtest.R:
--------------------------------------------------------------------------------
 1 | 
 2 | dyn.load("../pkg/src/stringdist.so")
 3 | for ( f in dir("../pkg/R",full.names=TRUE) ) source(f)
 4 | 
 5 |  x <- paste0('Mot',intToUtf8(0x00F6),'rhead')
 6 |  y <- c('bastard','Motorhead')
 7 |  jwdist <- round(1-(1/3)*(8/9 + 8/10 + 1),3)
 8 | 
 9 | #amatch(x, y, method='dl', maxDist=2, useBytes=TRUE)
10 | #stringdist(x[1], y[2], method='dl', useBytes=FALSE)
11 | #stringdist(x[1], y[2], method='dl', useBytes=TRUE)
12 | #stringdist("b","a" , method='dl',weight=c(1,1,0.5,1))
13 | stringdist("b","a" , method='dl',weight=c(1,1,1,1))
14 | #stringdist(x, y, method='osa', useBytes=TRUE)
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/test.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/Rscript
 2 | 
 3 | suppressPackageStartupMessages({
 4 |   if (!require("docopt")) stop("docopt not installed")
 5 | })
 6 | 
 7 | "Usage: test.r [nocovr] [snitch]
 8 | 
 9 | nocovr Skip measuring test coverage.
10 | snitch Report lines not covered.
11 | " -> doc
12 | 
13 | opt <- docopt(doc)
14 | 
15 | if(!require(devtools)) stop('devtools not installed first')
16 | devtools::test('pkg')
17 | 
18 | if (!opt$nocovr){
19 |   if(require(covr)){ 
20 |    cv <- covr::package_coverage('pkg')
21 |    print(cv)
22 |    if (opt$snitch) print(subset(tally_coverage(cv), value == 0),row.names=FALSE)
23 |   } else {
24 |     stop("covr not installed")
25 |   }
26 | }
27 | 


--------------------------------------------------------------------------------
/examples/amatch.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # lets see which sci-fi heroes are stringdistantly nearest
 3 | amatch("leia",c("uhura","leela"),maxDist=5)
 4 | 
 5 | # we can restrict the search
 6 | amatch("leia",c("uhura","leela"),maxDist=1)
 7 | 
 8 | # we can match each value in the find vector against values in the lookup table:
 9 | amatch(c("leia","uhura"),c("ripley","leela","scully","trinity"),maxDist=2)
10 | 
11 | # setting nomatch returns a different value when no match is found
12 | amatch("leia",c("uhura","leela"),maxDist=1,nomatch=0)
13 | 
14 | # this is always true if maxDist is Inf
15 | ain("leia",c("uhura","leela"),maxDist=Inf)
16 | 
17 | # Let's look in a neighbourhood of maximum 2 typo's (by default, the OSA algorithm is used)
18 | ain("leia",c("uhura","leela"), maxDist=2)
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/examples/seq_amatch.R:
--------------------------------------------------------------------------------
 1 | 
 2 | x <- list(1:3,c(3:1),c(1L,3L,4L))
 3 | table <- list(
 4 |   c(5L,3L,1L,2L)
 5 |   ,1:4
 6 | )
 7 | seq_amatch(x,table,maxDist=2)
 8 | 
 9 | # behaviour with missings
10 | seq_amatch(list(c(1L,NA_integer_,3L),NA_integer_), list(1:3),maxDist=1)
11 | 
12 | 
13 | \dontrun{
14 | # Match sentences based on word order. Note: words must match exactly or they
15 | # are treated as completely different.
16 | #
17 | # For this example you need to have the 'hashr' package installed.
18 | x <- "Mary had a little lamb"
19 | x.words <- strsplit(x,"[[:blank:]]+")
20 | x.int <- hashr::hash(x.words)
21 | table <- c("a little lamb had Mary",
22 |            "had Mary a little lamb")
23 | table.int <- hashr::hash(strsplit(table,"[[:blank:]]+"))
24 | seq_amatch(x.int,table.int,maxDist=3)
25 | }
26 | 


--------------------------------------------------------------------------------
/examples/qgrams.R:
--------------------------------------------------------------------------------
 1 | 
 2 | qgrams('hello world',q=3)
 3 | 
 4 | # q-grams are counted uniquely over a character vector
 5 | qgrams(rep('hello world',2),q=3)
 6 | 
 7 | # to count them separately, do something like
 8 | x <- c('hello', 'world')
 9 | lapply(x,qgrams, q=3)
10 | 
11 | # output rows may be named, and you can pass any number of character vectors
12 | x <- "I will not buy this record, it is scratched"
13 | y <- "My hovercraft is full of eels"
14 | z <- c("this", "is", "a", "dead","parrot")
15 | qgrams(A = x, B = y, C = z,q=2)
16 | 
17 | # a tonque twister, showing the effects of useBytes and useNames
18 | x <- "peter piper picked a peck of pickled peppers"
19 | qgrams(x, q=2) 
20 | qgrams(x, q=2, useNames=FALSE) 
21 | qgrams(x, q=2, useBytes=TRUE)
22 | qgrams(x, q=2, useBytes=TRUE, useNames=TRUE)
23 | 
24 | 
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | doc: 
 3 | 	R -s -e "pkgload::load_all('pkg');roxygen2::roxygenize('pkg')"
 4 | 
 5 | pkg: doc
 6 | 	R CMD build pkg
 7 | 
 8 | install: pkg
 9 | 	R CMD INSTALL *.tar.gz
10 | 
11 | check: doc
12 | 	R CMD build pkg
13 | 	R CMD check *.tar.gz
14 | 
15 | cran: doc
16 | 	R CMD build pkg
17 | 	R CMD check --as-cran *.tar.gz
18 | 
19 | test: doc
20 | 	R -s -e "tinytest::build_install_test('pkg')"
21 | 
22 | manual: doc
23 | 	R CMD Rd2pdf --force -o manual.pdf ./pkg
24 | 
25 | revdep: pkg
26 | 	rm -rf revdep
27 | 	mkdir revdep
28 | 	mv *.tar.gz revdep
29 | 	R -s -e "out <- tools::check_packages_in_dir('revdep',reverse=list(which='most'),Ncpus=3); print(summary(out)); saveRDS(out, file='revdep/output.RDS')"
30 | 
31 | covr: 
32 | 	R -e 'covr::package_coverage("./pkg")'
33 | 
34 | clean:
35 | 	rm -rf stringdist.Rcheck
36 | 	rm -rf revdep
37 | 	rm -f *.tar.gz
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/drat.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/Rscript
 2 | 
 3 | suppressPackageStartupMessages({
 4 |   if (!require("drat")) stop("drat not installed")
 5 |   if (!require("docopt")) stop("docopt not installed")
 6 | })
 7 | 
 8 |  
 9 | "Usage: drat.sh [commit] [--pkg FILE] [--dratrepo FOLDER] 
10 | 
11 | commit commit after insert? 
12 | --pkg FILE The tarball to insert in the drat repo (by default the tarball in ./output)
13 | --dratrepo FOLDER path to root of drat repo [default: ../drat]
14 | " -> doc
15 | 
16 | opt <- docopt(doc)
17 | 
18 | stopifnot(file.exists(opt$dratrepo))
19 | 
20 | pkg <- opt$pkg
21 | if ( is.null(pkg) ){
22 |   pkg <- dir("output/",pattern = ".*tar\\.gz",full.names = TRUE)
23 | } 
24 | 
25 | if (!file.exists(pkg)){
26 |   stop(sprintf("%s not found",pkg))
27 | }
28 | 
29 | drat::insertPackage(pkg, repodir=opt$dratrepo, commit=opt$commit)
30 | 
31 | cat(sprintf("Inserted %s into %s %s\n"
32 |   , pkg
33 |   , opt$dratrepo
34 |   , if(opt$commit) "and committed" else ""
35 | ))
36 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_phonetic.R:
--------------------------------------------------------------------------------
 1 | options(sd_num_thread=2)
 2 | 
 3 | ### -------------------------------------------------------------
 4 | 
 5 | ## Soundex
 6 | 
 7 |   testset <- "name;code
 8 | Robert;R163
 9 | rupert;R163
10 | Rubin;R150
11 | Ashcraft;A261
12 | asHCroft;A261
13 | Tymczak;T522
14 | Pfister;P236
15 | gutierrez;G362
16 | Jackson;J250
17 | washington;W252
18 | Lee;L000
19 | NA;NA"
20 |   testset <- read.csv2(textConnection(testset), stringsAsFactors=FALSE)
21 |   expect_equal(phonetic(testset$name,"soundex"), testset$code)
22 |   expect_equal(phonetic(testset$name,"soundex",useBytes=TRUE), testset$code)
23 |   expect_warning(phonetic(paste0('Mot',intToUtf8(0x00F6),'rhead')))  
24 | 
25 | 
26 | ## soundex handles encoding
27 |   ouml <- intToUtf8("0x00F6")
28 |   # non-ascii within string
29 |   expect_warning(phonetic(paste0("Mot",ouml,"rhead"),method='soundex'))
30 |   # non-ascii at beginning of string
31 |   expect_warning(phonetic(paste0(ouml,"zzy"),method='soundex'))
32 |   # non-printable in string (carriage return)
33 |   cr <- "\r"
34 |   expect_warning(phonetic(paste0(cr,"hello"),method='soundex'))
35 | 
36 | 
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_qgrams.R:
--------------------------------------------------------------------------------
 1 | options(sd_num_thread=2)
 2 | ## qgrams
 3 | 
 4 | ## qgram edge cases
 5 |   expect_equivalent(qgrams('a' ,      q=1), as.matrix(c(a=1)))   # basic test
 6 |   expect_equivalent(qgrams('aa',      q=1), as.matrix(c(a=2)))   # idem
 7 |   expect_equivalent(qgrams(c('a','a'),q=1), as.matrix(c(a=2)))   # count unique n-grams
 8 |   expect_equivalent(qgrams(c(NA,'a'), q=1), as.matrix(c(a=1)))   # skip NA's
 9 |   expect_equivalent(qgrams(NA,q=1), matrix(0,nrow=1,ncol=0))          # skip all
10 |   expect_equivalent(qgrams(c("a","ab"), q=2), as.matrix(table("ab"))) # skip q>nchar
11 |   expect_equivalent(qgrams(c("a"),q=2), matrix(0,nrow=1,ncol=0))      # skip all
12 |   expect_equivalent(qgrams(c(''),q=0), matrix(table('')))          # empty string, q=0
13 | 
14 | 
15 | ## qgrams
16 |   expect_equivalent(qgrams("a",q=1),array(1,dim=c(1,1)))
17 |   expect_equivalent(qgrams("a",q=1,useBytes=TRUE),array(1,dim=c(1,1)))
18 | 
19 | 
20 | ## seq_qgrams
21 |   expect_equivalent(
22 |     seq_qgrams(1:3,2:4,q=2)
23 |     ,matrix(c(
24 |        1,2,1,0
25 |       ,2,3,1,1
26 |       ,3,4,0,1
27 |     ),nrow=3,byrow=TRUE)
28 |   )
29 | 
30 | 


--------------------------------------------------------------------------------
/pkg/src/dist.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef SD_DIST_H
 3 | #define SD_DIST_H
 4 | 
 5 | #include "qtree.h"
 6 | #include "dictionary.h"
 7 | 
 8 | 
 9 | double osa_dist(unsigned int *, int, unsigned int *, int, double *, double *);
10 | dictionary *new_dictionary(unsigned int);
11 | void free_dictionary(dictionary *);
12 | double dl_dist(unsigned int *, int, unsigned int *, int, double *, dictionary *, double *);
13 | double hamming_dist(unsigned int *, int, unsigned int *, int);
14 | double lcs_dist(unsigned int *, int, unsigned int *, int, double *);
15 | double lv_dist(unsigned int *, int, unsigned int *, int, double *, double *);
16 | double osa_dist(unsigned int *, int, unsigned int *, int, double *, double *);
17 | double jaro_winkler_dist(unsigned int *, int, unsigned int *, int, double, double, double *, double *);
18 | qtree *new_qtree(int, int);
19 | void free_qtree(void);
20 | double qgram_dist(unsigned int *, int, unsigned int *, int, unsigned int, qtree **, int);
21 | double soundex_dist(unsigned int *, int, unsigned int *, int, unsigned int *);
22 | double running_cosine_dist(unsigned int *, int, unsigned int *, int, unsigned int, qtree **, double *);
23 | #endif
24 | 


--------------------------------------------------------------------------------
/examples/seq_dist.R:
--------------------------------------------------------------------------------
 1 | # Distances between lists of integer vectors. Note the postfix 'L' to force 
 2 | # integer storage. The shorter argument is recycled over (\code{a})
 3 | a <- list(c(102L, 107L))                        # fu
 4 | b <- list(c(102L,111L,111L),c(102L,111L,111L))  # foo, fo
 5 | seq_dist(a,b)
 6 | 
 7 | # translate strings to a list of integer sequences 
 8 | a <- lapply(c("foo","bar","baz"),utf8ToInt)
 9 | seq_distmatrix(a)
10 | 
11 | # Note how missing values are treated. NA's as part of the sequence are treated 
12 | # as an integer (the representation of NA_integer_).
13 | a <- list(NA_integer_,c(102L, 107L))
14 | b <- list(c(102L,111L,111L),c(102L,111L,NA_integer_))  
15 | seq_dist(a,b)
16 | 
17 | \dontrun{
18 | # Distance between sentences based on word order. Note: words must match exactly or they
19 | # are treated as completely different.
20 | #
21 | # For this example you need to have the 'hashr' package installed.
22 | a <- "Mary had a little lamb"
23 | a.words <- strsplit(a,"[[:blank:]]+")
24 | a.int <- hashr::hash(a.words)
25 | b <- c("a little lamb had Mary",
26 |            "had Mary a little lamb")
27 | b.int <- hashr::hash(strsplit(b,"[[:blank:]]+"))
28 | seq_dist(a.int,b.int)
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/pkg/src/hamming.c:
--------------------------------------------------------------------------------
 1 | /*  stringdist - a C library of string distance algorithms with an interface to R.
 2 |  *  Copyright (C) 2013  Mark van der Loo
 3 |  *
 4 |  *  This program is free software: you can redistribute it and/or modify
 5 |  *  it under the terms of the GNU General Public License as published by
 6 |  *  the Free Software Foundation, either version 3 of the License, or
 7 |  *  (at your option) any later version.
 8 |  *
 9 |  *  This program is distributed in the hope that it will be useful,
10 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |  *  GNU General Public License for more details.
13 |  *
14 |  *  You should have received a copy of the GNU General Public License
15 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
16 |  *
17 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
18 |  *
19 |  *
20 |  * This code is gratefully based on Nick Logan's github repository
21 |  * https://github.com/ugexe/Text--Levenshtein--Damerau--XS/blob/master/damerau-int.c
22 |  *
23 |  */ 
24 | 
25 | 
26 | 
27 | #include "utils.h"
28 | 
29 | double hamming_dist(unsigned int *a, int len_a, unsigned int *b, int len_b){
30 |   double h=0;
31 |     if (len_a != len_b) return 1.0/0.0;
32 |     for(int i=0; i < len_a; ++i){
33 |      if (a[i] != b[i]) h++;
34 |     }
35 |   return h;
36 | }
37 | 
38 | 


--------------------------------------------------------------------------------
/pkg/src/utils.c:
--------------------------------------------------------------------------------
 1 | /*  stringdist - a C library of string distance algorithms with an interface to R.
 2 |  *  Copyright (C) 2013  Mark van der Loo
 3 |  *
 4 |  *  This program is free software: you can redistribute it and/or modify
 5 |  *  it under the terms of the GNU General Public License as published by
 6 |  *  the Free Software Foundation, either version 3 of the License, or
 7 |  *  (at your option) any later version.
 8 |  *
 9 |  *  This program is distributed in the hope that it will be useful,
10 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |  *  GNU General Public License for more details.
13 |  *
14 |  *  You should have received a copy of the GNU General Public License
15 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
16 |  *
17 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
18 |  *
19 |  */
20 | 
21 | 
22 | #define USE_RINTERNALS
23 | #include <R.h>
24 | #include <Rdefines.h>
25 | 
26 | 
27 | unsigned int max_length(SEXP x){
28 |   unsigned int t=0, m;
29 | 
30 |   if (TYPEOF(x) == VECSXP){
31 |     for (int i=0; i<length(x); ++i){
32 |       m = length(VECTOR_ELT(x,i));
33 |       if (t < m) t = m;
34 |     }
35 |   } else {
36 |     for (int i=0; i<length(x); ++i){
37 |       m = length(STRING_ELT(x,i));
38 |       if (t < m) t = m;
39 |     }
40 | 
41 |   }
42 |   return t;
43 | }
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/pkg/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: stringdist
 2 | Maintainer: Mark van der Loo <mark.vanderloo@gmail.com>
 3 | License: GPL-3
 4 | Title: Approximate String Matching, Fuzzy Text Search, and String Distance Functions
 5 | Type: Package
 6 | LazyLoad: yes
 7 | Authors@R: c( person("Mark", "van der Loo", role=c("aut","cre")
 8 |              , email="mark.vanderloo@gmail.com"
 9 |              , comment= c(ORCID="0000-0002-9807-4686"))
10 |             , person("Jan", "van der Laan", role="ctb")
11 |             , person("R Core Team",""     , role="ctb")
12 |             , person("Nick","Logan"       , role="ctb")
13 |             , person("Chris","Muir"       , role="ctb")
14 |             , person("Johannes", "Gruber" , role="ctb")
15 |             , person("Brian","Ripley"     , role="ctb"))
16 | Description: Implements an approximate string matching version of R's native
17 |     'match' function. Also offers fuzzy text search based on various string
18 |      distance measures. Can calculate various string distances based on edits
19 |     (Damerau-Levenshtein, Hamming, Levenshtein, optimal sting alignment), qgrams (q-
20 |     gram, cosine, jaccard distance) or heuristic metrics (Jaro, Jaro-Winkler). An
21 |     implementation of soundex is provided as well. Distances can be computed between
22 |     character vectors while taking proper care of encoding or between integer
23 |     vectors representing generic sequences. This package is built for speed and
24 |     runs in parallel by using 'openMP'. An API for C or C++ is exposed as well.
25 |     Reference: MPJ van der Loo (2014) <doi:10.32614/RJ-2014-011>.
26 | Version: 0.9.15
27 | Depends:
28 |     R (>= 2.15.3)
29 | URL: https://github.com/markvanderloo/stringdist
30 | BugReports: https://github.com/markvanderloo/stringdist/issues
31 | Suggests:
32 |     tinytest
33 | Imports: parallel
34 | Encoding: UTF-8
35 | RoxygenNote: 7.3.2
36 | 


--------------------------------------------------------------------------------
/pkg/src/lcs.c:
--------------------------------------------------------------------------------
 1 | /*  stringdist - a C library of string distance algorithms with an interface to R.
 2 |  *  Copyright (C) 2013  Mark van der Loo
 3 |  *
 4 |  *  This program is free software: you can redistribute it and/or modify
 5 |  *  it under the terms of the GNU General Public License as published by
 6 |  *  the Free Software Foundation, either version 3 of the License, or
 7 |  *  (at your option) any later version.
 8 |  *
 9 |  *  This program is distributed in the hope that it will be useful,
10 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |  *  GNU General Public License for more details.
13 |  *
14 |  *  You should have received a copy of the GNU General Public License
15 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
16 |  *
17 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
18 |  */
19 | 
20 | #include "utils.h"
21 | 
22 | /* Longest common substring
23 |  * - basically edit distance, only allowing insertions and deletions, at the cost of 1.
24 |  */
25 | double lcs_dist(unsigned int *a, int na, unsigned int *b, int nb, double *scores){
26 |   if (!na){
27 |     return (double) nb;
28 |   }
29 |   if (!nb){
30 |     return (double) na;
31 |   }
32 | 
33 |   int i, j;
34 |   int M, I = na+1, L = na+1, J = nb+1;
35 |   
36 |   for ( i = 0; i < I; ++i ){
37 |     scores[i] = i;
38 |   }
39 |   for ( j = 1; j < J; ++j, L += I ){
40 |     scores[L] = j;
41 |   }
42 | 
43 |   for ( i = 1; i <= na; ++i ){
44 |     M = 0; L = I;
45 |     for ( j = 1; j <= nb; ++j, L += I, M += I ){
46 |       if ( a[i-1] == b[j-1] ){ // equality, copy previous score
47 |         scores[i + L] = scores[i-1 + M];
48 |       } else {
49 |         scores[i + L] = MIN(
50 |           scores[i-1 + L] + 1 ,     // deletion
51 |           scores[i   + M] + 1       // insertion
52 |         );
53 |       }
54 |       
55 |     }
56 |   }
57 |    
58 |   return scores[I*J - 1];
59 | }
60 | 
61 | 


--------------------------------------------------------------------------------
/pkg/src/R_register_native.c:
--------------------------------------------------------------------------------
 1 | #include <R.h>
 2 | #include <Rinternals.h>
 3 | #include <stdlib.h> // for NULL
 4 | #include <R_ext/Rdynload.h>
 5 | 
 6 | /* FIXME: 
 7 |    Check these declarations against the C/Fortran source code.
 8 | */
 9 | 
10 | /* .Call calls */
11 | extern SEXP R_afind(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
12 | extern SEXP R_all_int(SEXP);
13 | extern SEXP R_amatch(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
14 | extern SEXP R_get_qgrams(SEXP, SEXP);
15 | extern SEXP R_lengths(SEXP);
16 | extern SEXP R_lower_tri(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
17 | extern SEXP R_soundex(SEXP, SEXP);
18 | extern SEXP R_stringdist(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
19 | 
20 | static const R_CallMethodDef CallEntries[] = {
21 |     {"R_afind",      (DL_FUNC) &R_afind,      10},
22 |     {"R_all_int",    (DL_FUNC) &R_all_int,     1},
23 |     {"R_amatch",     (DL_FUNC) &R_amatch,     12},
24 |     {"R_get_qgrams", (DL_FUNC) &R_get_qgrams,  2},
25 |     {"R_lengths",    (DL_FUNC) &R_lengths,     1},
26 |     {"R_lower_tri",  (DL_FUNC) &R_lower_tri,   8},
27 |     {"R_soundex",    (DL_FUNC) &R_soundex,     2},
28 |     {"R_stringdist", (DL_FUNC) &R_stringdist,  9},
29 |     {NULL, NULL, 0}
30 | };
31 | 
32 | void R_init_stringdist(DllInfo *dll)
33 | {
34 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
35 |     R_useDynamicSymbols(dll, FALSE);
36 | 
37 |     /* used by external packages linking to internal xts code from C */
38 |     R_RegisterCCallable("stringdist","R_all_int",(DL_FUNC) &R_all_int);
39 |     R_RegisterCCallable("stringdist","R_amatch",(DL_FUNC) &R_amatch);
40 |     R_RegisterCCallable("stringdist","R_get_qgrams",(DL_FUNC) &R_get_qgrams);
41 |     R_RegisterCCallable("stringdist","R_lengths",(DL_FUNC) &R_lengths);
42 |     R_RegisterCCallable("stringdist","R_lower_tri",(DL_FUNC) &R_lower_tri);
43 |     R_RegisterCCallable("stringdist","R_soundex",(DL_FUNC) &R_soundex);
44 |     R_RegisterCCallable("stringdist","R_stringdist",(DL_FUNC) &R_stringdist);
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/pkg/src/stringdist.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /*  stringdist - a C library of string distance algorithms with an interface to R.
 3 |  *  Copyright (C) 2013  Mark van der Loo
 4 |  *
 5 |  *  This program is free software: you can redistribute it and/or modify
 6 |  *  it under the terms of the GNU General Public License as published by
 7 |  *  the Free Software Foundation, either version 3 of the License, or
 8 |  *  (at your option) any later version.
 9 |  *
10 |  *  This program is distributed in the hope that it will be useful,
11 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 |  *  GNU General Public License for more details.
14 |  *
15 |  *  You should have received a copy of the GNU General Public License
16 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
17 |  *
18 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
19 |  */
20 | 
21 | 
22 | #ifndef SD_STRINGDIST_H
23 | #define SD_STRINGDIST_H
24 | 
25 | #include "dictionary.h"
26 | #include "qtree.h"
27 | #include "dist.h"
28 | 
29 | typedef enum Distance { 
30 |     osa
31 |   , lv
32 |   , dl
33 |   , hamming
34 |   , lcs
35 |   , qgram
36 |   , cosine
37 |   , jaccard
38 |   , jw
39 |   , soundex
40 |   , running_cosine} Distance;
41 | 
42 | typedef struct {
43 |   Distance distance;
44 |   // workspace
45 |   double *work;
46 |   // [optional] weight vector
47 |   double *weight;
48 |   // dictionary object for dl-distance
49 |   dictionary *dict;
50 |   // tree object to store q-grams
51 |   qtree *tree;
52 |   // the q in qgrams
53 |   unsigned int q;
54 |   // Winkler's penalty factor
55 |   double p;
56 |   // Winkler's boost threshold
57 |   double bt;
58 |   // fail indicator
59 |   unsigned int ifail;
60 | } Stringdist;
61 | 
62 | Stringdist *open_stringdist(Distance, int, int, ...);
63 | 
64 | double stringdist(Stringdist *, unsigned int *, int, unsigned int *, int);
65 | 
66 | void close_stringdist(Stringdist *S);
67 | void reset_stringdist(Stringdist *S);
68 | 
69 | 
70 | 
71 | #endif
72 | 
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/pkg/R/phonetic.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' Phonetic algorithms
 3 | #' 
 4 | #' Translate strings to phonetic codes. Similar sounding strings should get 
 5 | #' similar or equal codes. 
 6 | #'
 7 | #' @param x a character vector whose elements are phonetically encoded. 
 8 | #' @param method name of the algorithm used. The default is \code{"soundex"}.
 9 | #' @param useBytes Perform byte-wise comparison. \code{useBytes=TRUE} is faster 
10 | #'    but may yield different results depending on character encoding. For more
11 | #'    information see the documentation of \code{\link{stringdist}}.
12 | #'
13 | #' @details
14 | #' Currently, only the soundex algorithm is implemented. Note that soundex coding
15 | #' is only meaningful for characters in the ranges a-z and A-Z. Soundex coding of strings 
16 | #' containing non-printable ascii or non-ascii characters may be system-dependent and should 
17 | #' not be trusted. If non-ascii or non-printable ascii charcters are encountered, a warning 
18 | #' is emitted.
19 | #' 
20 | #' @seealso \code{\link{printable_ascii}}
21 | #' 
22 | #' 
23 | #' @return
24 | #' The returns value depends on the method used. However, all currently 
25 | #' implemented methods return a character vector of the same length of the input
26 | #' vector. Output characters are in the system's native encoding.
27 | #'
28 | #' @references
29 | #' \itemize{
30 | #' \item{The Soundex algorithm implemented is the algorithm used by the 
31 | #'   \href{https://www.archives.gov/research/census/soundex}{National Archives}. 
32 | #'   This algorithm differs slightly from the original algorithm patented by R.C. Russell 
33 | #'   (US patents 1261167 (1918) and 1435663 (1922)). 
34 | #' }
35 | #' }
36 | #'
37 | #' @example ../examples/phonetic.R
38 | #'
39 | #' @export
40 | phonetic <- function(x, method = c("soundex"), useBytes = FALSE) {
41 |   x <- as.character(x)
42 |   method <- match.arg(method)
43 |   stopifnot(is.logical(useBytes))
44 |   if (!useBytes) x <- enc2utf8(x)
45 |   if (method == "soundex") {
46 |     r <- .Call("R_soundex", x, useBytes,PACKAGE="stringdist")
47 |     if (!useBytes) int2char(r) else r
48 |   } 
49 | }
50 | 
51 | int2char <- function(x) {
52 |   enc2native(sapply(x, intToUtf8))
53 | }
54 | 
55 | 


--------------------------------------------------------------------------------
/pkg/src/lv.c:
--------------------------------------------------------------------------------
 1 | /*  stringdist - a C library of string distance algorithms with an interface to R.
 2 |  *  Copyright (C) 2013  Mark van der Loo
 3 |  *
 4 |  *  This program is free software: you can redistribute it and/or modify
 5 |  *  it under the terms of the GNU General Public License as published by
 6 |  *  the Free Software Foundation, either version 3 of the License, or
 7 |  *  (at your option) any later version.
 8 |  *
 9 |  *  This program is distributed in the hope that it will be useful,
10 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |  *  GNU General Public License for more details.
13 |  *
14 |  *  You should have received a copy of the GNU General Public License
15 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
16 |  *
17 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
18 |  */
19 | 
20 | #include "utils.h"
21 | 
22 | 
23 | /* Levenshtein distance
24 |  * Computes Levenshtein distance
25 |  * - Simplified from restricted DL pseudocode at http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
26 |  * - Extended with custom weights and maxDistance
27 |  */
28 | double lv_dist(
29 |   unsigned int *a, int na, // source
30 |   unsigned int *b, int nb, // target
31 |   double *weight, 
32 |   double *scores){
33 |   if (!na){
34 |     return (double) nb * weight[1]; // del score
35 |   }
36 |   if (!nb){
37 |     return (double) na * weight[0]; // ins score
38 |   }
39 | 
40 |   int i, j;
41 |   int I = na+1, L = na+1, J = nb+1;
42 |   double sub;
43 | 
44 |   for ( i = 0; i < I; ++i ){
45 |     scores[i] = i * weight[0];
46 |   }
47 |   for ( j = 1; j < J; ++j, L += I ){
48 |    scores[L] = j * weight[1];
49 |   }
50 | 
51 | 
52 |   int M;
53 |   for ( i = 1; i <= na; ++i ){
54 |     L = I; M= 0; 
55 |     for ( j = 1; j <= nb; ++j, L += I, M += I ){
56 |       sub = (a[i-1] == b[j-1]) ? 0 : weight[2];
57 |       scores[i + I*j] = MIN(MIN( 
58 |         scores[i-1 + L] + weight[0],     // deletion
59 |         scores[i   + M] + weight[1]),    // insertion
60 |         scores[i-1 + M] + sub            // substitution
61 |       );
62 |     }
63 |   }
64 | 
65 |   double score = scores[I*J-1];
66 |   return score;
67 | }
68 | 
69 | 


--------------------------------------------------------------------------------
/pkg/src/osa.c:
--------------------------------------------------------------------------------
 1 | /*  stringdist - a C library of string distance algorithms with an interface to R.
 2 |  *  Copyright (C) 2013  Mark van der Loo
 3 |  *
 4 |  *  This program is free software: you can redistribute it and/or modify
 5 |  *  it under the terms of the GNU General Public License as published by
 6 |  *  the Free Software Foundation, either version 3 of the License, or
 7 |  *  (at your option) any later version.
 8 |  *
 9 |  *  This program is distributed in the hope that it will be useful,
10 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |  *  GNU General Public License for more details.
13 |  *
14 |  *  You should have received a copy of the GNU General Public License
15 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
16 |  *
17 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
18 |  */
19 | 
20 | #include "utils.h"
21 | 
22 | /* Optimal string alignment algorithm. 
23 |  * Computes Damerau-Levenshtein distance, restricted to single transpositions.
24 |  * - See pseudocode at http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
25 |  * - Extended with custom weights and maxDistance
26 |  */
27 | double osa_dist(unsigned int *a, int na, unsigned int *b, int nb, double *weight, double *scores){
28 | 
29 |   if (!na){
30 |     return (double) nb * weight[1]; // ins weight
31 |   }
32 |   if (!nb){
33 |     return (double) na * weight[0]; // del weight
34 |   }
35 | 
36 |   int i, j;
37 |   int M, I = na+1, L=na+1, J = nb+1;
38 |   double sub, tran;
39 | 
40 |    for ( i = 0; i < I; ++i ){
41 |       scores[i] = i * weight[0];
42 |    }
43 |    for ( j = 1; j < J; ++j, L += I ){
44 |       scores[L] = j * weight[1];
45 |    }
46 | 
47 | for ( i = 1; i <= na; ++i ){
48 |   L = I; M = 0;
49 |   for ( j = 1; j <= nb; ++j, L += I, M += I ){
50 |     if (a[i-1] == b[j-1]){
51 |       sub = 0;
52 |       tran= 0;
53 |     } else {
54 |       sub = weight[2];
55 |       tran= weight[3];
56 |     }
57 | 
58 |     scores[i + L] = MIN(MIN( 
59 |       scores[i-1 + L] + weight[0],     // deletion
60 |       scores[i   + M] + weight[1]),    // insertion
61 |       scores[i-1 + M] + sub            // substitution
62 |     );
63 |     if ( i>1 && j>1 && a[i-1] == b[j-2] && a[i-2] == b[j-1] ){
64 |       scores[i + L] = MIN(scores[i + L], scores[i-2 + M-I] + tran); // transposition
65 |     }
66 |   }
67 | }
68 |   double score = scores[I*J-1];
69 |   return score;
70 | }
71 | 
72 | 


--------------------------------------------------------------------------------
/pkg/R/doc_parallel.R:
--------------------------------------------------------------------------------
 1 | #' @title
 2 | #' Multithreading and parallelization in \pkg{stringdist}
 3 | #'
 4 | #' 
 5 | #' @description This page describes how \pkg{stringdist} uses parallel processing.
 6 | #'
 7 | #' @section Multithreading and parallelization in \pkg{stringdist}: 
 8 | #'   The core 
 9 | #'   functions of \pkg{stringdist} are implemented in C. On systems where 
10 | #'   \code{openMP} is available, \pkg{stringdist} will automatically take 
11 | #'   advantage of multiple cores. The
12 | #'   \href{https://cran.r-project.org/doc/manuals/r-release/R-exts.html#OpenMP-support}{section
13 | #'   on OpenMP} of the
14 | #'   \href{https://cran.r-project.org/doc/manuals/r-release/R-exts.html}{Writing
15 | #'   R Extensions} manual discusses on what systems OpenMP is available (at the time of writing more or
16 | #'   less anywhere except on OSX).
17 | #'
18 | #' By default, the number of threads to use is taken from \code{options('sd_num_thread')}.
19 | #' When the package is loaded, the value for this option is determined as follows:
20 | #' \itemize{
21 | #'   \item{If the environment variable \code{OMP_NUM_THREADS} is set, this value is taken.}
22 | #'   \item{Otherwise, the number of available cores is determined with \code{parallel::detectCores()}
23 | #'   If this fails, the number of threads is set to 1 (with a message). If the nr of detected
24 | #'   cores exceeds three, the number of used cores is set to \eqn{n-1}.}
25 | #'   \item{If available, the environment variable \code{OMP_THREAD_LIMIT} is
26 | #'   determined and The number of threads is set to the lesser of
27 | #'   \code{OMP_THREAD_LIMIT} and the number of detected cores.}
28 | #' }
29 | #'
30 | #' The latter step makes sure that on machines with \eqn{n>3} cores, \eqn{n-1} 
31 | #' cores are used. Some benchmarking showed that using all cores is often slower
32 | #' in such cases. This is probably because at least one of the threads will be
33 | #' shared with the operating system.
34 | #'
35 | #' Functions that use multithreading have an option named \code{nthread} that
36 | #' controls the maximum number of threads to use. If you need to do large
37 | #' calculations, it is probably a good idea to benchmark the performance on your
38 | #' machine(s) as a function of \code{'nthread'}, for example using the 
39 | #' \href{https://cran.r-project.org/package=microbenchmark}{microbenchmark}
40 | #' package of Mersmann.
41 | #'
42 | #'
43 | #'
44 | #'
45 | #' @seealso
46 | #' \itemize{
47 | #'  \item{Functions running multithreaded: \code{\link{stringdist}}, \code{\link{stringdistmatrix}}, \code{\link{amatch}}, \code{\link{ain}} }
48 | #' }
49 | #'
50 | #' @name stringdist-parallelization
51 | #' @rdname stringdist-parallelization
52 | {}
53 | 


--------------------------------------------------------------------------------
/test/smoke.R:
--------------------------------------------------------------------------------
 1 | # smoke tests.
 2 | set.seed(1864)
 3 | 
 4 | #dyn.load("../pkg/src/stringdist.so")
 5 | #for (f in dir("../pkg/R/",full.names=TRUE)) d <- source(f)
 6 | #options(sd_num_thread=4)
 7 | library(stringdist)
 8 | 
 9 | # printable ascii
10 | pascii <- sapply(33:126,intToUtf8)
11 | # the ascii set
12 | ascii <- sapply(0:127,intToUtf8)
13 | # the utf8 set
14 | utf8 <- sapply(0:0x10FFFF,intToUtf8)
15 | utf8 <- utf8[stringi::stri_enc_isutf8(utf8)]
16 | 
17 | 
18 | # methods
19 | edit <- c("osa", "lv", "dl", "lcs", "hamming") 
20 | qgrm <- c("qgram","cosine", "jaccard")
21 | heur <- c("jw")
22 | phon <- c("soundex")
23 | 
24 | useBytes <- c(TRUE,FALSE)
25 | nthread=1:4
26 | q <- 0:3
27 | p <- c(0,0.01,0.25)
28 | 
29 | tests <- list(
30 |     test_edit = expand.grid(method = edit, useBytes = useBytes, nthread=1:4,stringsAsFactors = FALSE)
31 |   , test_qgrm = expand.grid(method = qgrm, useBytes = useBytes, nthread=1:4,q = q, stringsAsFactors = FALSE)
32 |   , test_heur = expand.grid(method = heur, useBytes = useBytes, nthread=1:4,p = p, stringsAsFactors = FALSE)
33 |   , test_phon = expand.grid(method = phon, useBytes = useBytes, nthread=1:4,stringsAsFactors=FALSE)
34 | )
35 | 
36 | rand_string <- function(n, len, alphabet=pascii){
37 |   stopifnot( length(len) == 1 || length(len) == n )
38 |   if (length(len)==1){
39 |     sapply(1:n,function(i) paste0(sample(alphabet, len, replace=TRUE),collapse=""))
40 |   } else {
41 |     sapply(1:n,function(i) paste0(sample(alphabet, len[i], replace=TRUE),collapse=""))    
42 |   }  
43 | }
44 | 
45 | N1 <- 250
46 | N2 <- N1
47 | len1 <- sample(0:50,N1,replace=TRUE)
48 | len2 <- sample(0:50,N2,replace=TRUE)
49 | # smoke test for edit-based distances
50 | str1 <- rand_string(N1,len1,pascii)
51 | str2 <- c(rand_string(N2,len2,pascii)
52 |           , rand_string(N2,len2,ascii)
53 |           , rand_string(N2,len2,utf8))
54 | 
55 | for (tst in tests ){
56 |   for ( i in 1:nrow(tst) ){
57 |     args <- c(list(a=str1,b=str2),as.list(tst[i,]))
58 |     tryCatch(do.call(stringdist,args), error=function(e){
59 |       cat(sprintf("Failure with message\n%s\n",e$message))
60 |       print(tst[i,])
61 |     }, warning=function(w){
62 |       cat(sprintf("Warning with message\n%s\n",w$message))
63 |       print(tst[i,])      
64 |     })
65 |   }
66 | }
67 | 
68 | for (tst in tests ){
69 |   for ( i in 1:nrow(tst) ){
70 |     args <- c(list(x=str1,table=str2),as.list(tst[i,]))
71 |     tryCatch(do.call(amatch,args), error=function(e){
72 |       cat(sprintf("Failure with message\n%s\n",e$message))
73 |       print(tst[i,])
74 |     }, warning=function(w){
75 |       cat(sprintf("Warning with message\n%s\n",w$message))
76 |       print(tst[i,])      
77 |     })
78 |   }
79 | }
80 | 
81 | 
82 | 
83 | #tst <- tests[[4]]
84 | #args <- c(list(a=str1[1],b=str2[1]),as.list(tst[1,]))
85 | #do.call(stringdist,args)
86 | 
87 | 
88 | 
89 | 
90 | 
91 | 
92 | 
93 | 
94 | 


--------------------------------------------------------------------------------
/pkg/README.md:
--------------------------------------------------------------------------------
 1 | [![Mentioned in Awesome Official Statistics ](https://awesome.re/mentioned-badge.svg)](https://github.com/SNStatComp/awesome-official-statistics-software)
 2 | 
 3 | ## stringdist
 4 | 
 5 | * Approximate matching and string distance calculations for R. 
 6 | * All distance and matching operations are system- and encoding-independent.
 7 | * Built for speed, using [openMP](https://www.openmp.org/) for parallel computing.
 8 | 
 9 | The package offers the following main functions:
10 | 
11 | * `stringdist`  computes pairwise distances between two input character vectors (shorter one is recycled)
12 | * `stringdistmatrix` computes the distance matrix for one or two vectors
13 | * `stringsim` computes a string similarity between 0 and 1, based on `stringdist`
14 | * `amatch` is a fuzzy matching equivalent of R's native `match` function
15 | * `ain` is a fuzzy matching equivalent of R's native `%in%` operator
16 | * `seq_dist`, `seq_distmatrix`, `seq_amatch` and `seq_ain` for distances between, and matching of integer sequences. 
17 | 
18 | These functions are built upon `C`-code that re-implements some common (weighted) string
19 | distance functions. Distance functions include:
20 | 
21 | * Hamming distance; 
22 | * Levenshtein distance (weighted)
23 | * Restricted Damerau-Levenshtein distance (weighted, a.k.a. Optimal String Alignment)
24 | * Full Damerau-Levenshtein distance
25 | * Longest Common Substring distance
26 | * Q-gram distance
27 | * cosine distance for q-gram count vectors (= 1-cosine similarity)
28 | * Jaccard distance for q-gram count vectors (= 1-Jaccard similarity)
29 | * Jaro, and Jaro-Winkler distance
30 | * Soundex-based string distance
31 | 
32 | Also, there are some utility functions:
33 | 
34 | * `qgrams()` tabulates the qgrams in one or more `character` vectors.
35 | * `seq_qrams()` tabulates the qgrams (somtimes called ngrams) in one or more `integer` vectors.
36 | * `phonetic()` computes phonetic codes of strings (currently only soundex)
37 | * `printable_ascii()` is a utility function that detects non-printable ascii or non-ascii characters.
38 | 
39 | #### C API
40 | 
41 | Some of `stringdist`'s underlying `C` functions can be called directly from
42 | `C` code in other packages. The description of the API can be found by either
43 | typing `?stringdist_api` in the R console or open the vignette directly as follows:
44 | 
45 | ```
46 | vignette("stringdist_C-Cpp_api", package="stringdist")
47 | ```
48 | 
49 | Examples of packages that link to `stringdist` can be found
50 | [here](https://github.com/markvanderloo/linkstringdist) and
51 | [here](https://github.com/ChrisMuir/refinr).
52 | 
53 | 
54 | #### Resources
55 | 
56 | * A [paper](https://journal.r-project.org/archive/2014-1/loo.pdf) on stringdist has been published in the R-journal
57 | * [Slides](https://www.markvanderloo.eu/files/share/loo2014approximate.pdf) of a talk given at te _useR!2014_ conference.
58 | 
59 | 


--------------------------------------------------------------------------------
/pkg/R/utils.R:
--------------------------------------------------------------------------------
 1 | 
 2 | RECYCLEWARNING <- NULL
 3 | 
 4 | # calling message from .onLoad gives a NOTE on build, so we avoid it here.
 5 | mymsg <- message
 6 | 
 7 | .onLoad <- function(libname, pkgname){
 8 |   RECYCLEWARNING <<- gettext(tryCatch( (1:2)+(1:3),warning=function(w) w$message ))
 9 | 
10 |   
11 |   omp_num_threads <- as.numeric(Sys.getenv("OMP_NUM_THREADS"))
12 |   if (!is.na(omp_num_threads) && omp_num_threads > 0){
13 |     nthread <- omp_num_threads
14 |   } else {
15 |     nthread <- parallel::detectCores()
16 |     if ( is.na(nthread) || !is.numeric(nthread) ){
17 |       nthread <- 1L
18 |       mymsg("Could not detect number of cores, defaulting to 1.")
19 |     } else {
20 |       # nthread=nr of detected cores. Leave one core free.
21 |       if (nthread >= 4) nthread <- nthread - 1
22 |     }
23 |   }
24 | 
25 |   ## if OMP_THREAD_LIMIT is set, maximize on that limit.
26 |   omp_thread_limit <- as.numeric(Sys.getenv("OMP_THREAD_LIMIT"))
27 |   if ( is.na(omp_thread_limit) ) omp_thread_limit <- nthread
28 |   nthread <- min(omp_thread_limit,nthread)
29 |  
30 |   options(sd_num_thread=as.integer(nthread))
31 | }
32 | 
33 | # When necessary and possible, argument is coverted to integers.
34 | ensure_int_list <- function(x){
35 |   if (is.integer(x)|is.numeric(x)) return(list(as.integer(x)))
36 |   if (!is.list(x)) stop("argument must be 'list', 'integer' or 'numeric'")
37 |   if (!all_int(x)){
38 |     lapply(x,as.integer)
39 |   } else {
40 |     x
41 |   }
42 | }
43 | 
44 | setNames <- function(object, nm){
45 |   names(object) <- nm
46 |   object
47 | }
48 | 
49 | #' Detect the presence of non-printable or non-ascii characters
50 | #' 
51 | #' @param x a \code{character} vector
52 | #'
53 | #' @details
54 | #' Printable ASCII characters consist of space, \code{A-Z}, \code{a-z}, \code{0-9} and the characters
55 | #'
56 | #' \code{! "" # $ \% & ' ( ) * + , . / : ; < = > ? @@ [ ] \\ ^ _ ` { | } ~ -} 
57 | #'
58 | #' Note that this excludes tab (as it is a control character).
59 | #'
60 | #' 
61 | #' @example ../examples/printable_ascii.R
62 | #'
63 | #'
64 | #' @return A \code{logical} indicating which elements consist solely of printable ASCII characters.
65 | #' @export 
66 | printable_ascii <- function(x){ 
67 |   # To portably detect ASCII characters, we need to specify them literally. Hence this monster of a character class. See ?regexp.
68 | 
69 |   # notes:
70 |   # - caret (^) at the beginning negates what comes after, the caret in the middle is the actual character.
71 |   # - the closing square bracket ] needs to be specified first
72 |   # - double quote " and backslash are escaped
73 |   # - the dash "-" is specified at the end since it would indicate a range otherwise
74 |   # - see ? regexp.
75 |   charclass <- paste0("[^]"
76 |     , "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
77 |     , "0123456789"
78 |     , " !\"#$%&'()*+,./:;<=>?@[\\^_`{|}~-"
79 |     , "]"
80 |   )  
81 |   !grepl(charclass,x)
82 | }
83 | 
84 | # check whether all elements of a list are of type 'integer'.
85 | # x MUST be a list.
86 | all_int <- function(x){
87 |   .Call("R_all_int",x,PACKAGE="stringdist")
88 | }
89 | 
90 | 
91 | 
92 | 
93 | 
94 | 
95 | 
96 | 


--------------------------------------------------------------------------------
/examples/stringdist.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Simple example using optimal string alignment
 3 | stringdist("ca","abc")
 4 | 
 5 | # computing a 'dist' object
 6 | d <- stringdistmatrix(c('foo','bar','boo','baz'))
 7 | # try plot(hclust(d))
 8 | 
 9 | # The following gives a matrix
10 | stringdistmatrix(c("foo","bar","boo"),c("baz","buz"))
11 | 
12 | # An example using Damerau-Levenshtein distance (multiple editing of substrings allowed)
13 | stringdist("ca","abc",method="dl")
14 | 
15 | # string distance matching is case sensitive:
16 | stringdist("ABC","abc")
17 | 
18 | # so you may want to normalize a bit:
19 | stringdist(tolower("ABC"),"abc")
20 | 
21 | # stringdist recycles the shortest argument:
22 | stringdist(c('a','b','c'),c('a','c'))
23 | 
24 | # stringdistmatrix gives the distance matrix (by default for optimal string alignment):
25 | stringdist(c('a','b','c'),c('a','c'))
26 | 
27 | # different edit operations may be weighted; e.g. weighted substitution:
28 | stringdist('ab','ba',weight=c(1,1,1,0.5))
29 | 
30 | # Non-unit weights for insertion and deletion makes the distance metric asymetric
31 | stringdist('ca','abc')
32 | stringdist('abc','ca')
33 | stringdist('ca','abc',weight=c(0.5,1,1,1))
34 | stringdist('abc','ca',weight=c(0.5,1,1,1))
35 | 
36 | # Hamming distance is undefined for 
37 | # strings of unequal lengths so stringdist returns Inf
38 | stringdist("ab","abc",method="h")
39 | # For strings of eqal length it counts the number of unequal characters as they occur
40 | # in the strings from beginning to end
41 | stringdist("hello","HeLl0",method="h")
42 | 
43 | # The lcs (longest common substring) distance returns the number of 
44 | # characters that are not part of the lcs.
45 | #
46 | # Here, the lcs is either 'a' or 'b' and one character cannot be paired:
47 | stringdist('ab','ba',method="lcs")
48 | # Here the lcs is 'surey' and 'v', 'g' and one 'r' of 'surgery' are not paired
49 | stringdist('survey','surgery',method="lcs")
50 | 
51 | 
52 | # q-grams are based on the difference between occurrences of q consecutive characters
53 | # in string a and string b.
54 | # Since each character abc occurs in 'abc' and 'cba', the q=1 distance equals 0:
55 | stringdist('abc','cba',method='qgram',q=1)
56 | 
57 | # since the first string consists of 'ab','bc' and the second 
58 | # of 'cb' and 'ba', the q=2 distance equals 4 (they have no q=2 grams in common):
59 | stringdist('abc','cba',method='qgram',q=2)
60 | 
61 | # Wikipedia has the following example of the Jaro-distance. 
62 | stringdist('MARTHA','MATHRA',method='jw')
63 | # Note that stringdist gives a  _distance_ where wikipedia gives the corresponding 
64 | # _similarity measure_. To get the wikipedia result:
65 | 1 - stringdist('MARTHA','MATHRA',method='jw')
66 | 
67 | # The corresponding Jaro-Winkler distance can be computed by setting p=0.1
68 | stringdist('MARTHA','MATHRA',method='jw',p=0.1)
69 | # or, as a similarity measure
70 | 1 - stringdist('MARTHA','MATHRA',method='jw',p=0.1)
71 | 
72 | # This gives distance 1 since Euler and Gauss translate to different soundex codes.
73 | stringdist('Euler','Gauss',method='soundex')
74 | # Euler and Ellery translate to the same code and have distance 0
75 | stringdist('Euler','Ellery',method='soundex')
76 | 
77 | 
78 | 
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_seq_dist.R:
--------------------------------------------------------------------------------
 1 | options(sd_num_thread=2)
 2 | ## seq_dist
 3 | # tests against cases that used to segfault when we did not check
 4 | # NULL cases.
 5 | expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread=1:4))
 6 | expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread="foo"))
 7 | expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread=integer(0)))
 8 | expect_error(seq_dist(a=list(c(1L,2L,3L)), b=list(c(2L,1L,3L)),nthread=NULL))
 9 | 
10 | # A simple test to see that everything is passed on to the correct
11 | # algorithm
12 | ## Methods are selected and computed correctly
13 |   expect_equal(
14 |     seq_dist(a = list(c(1L,2L,3L)), b = list(c(2L,1L,3L)), method="osa")
15 |     , 1 )
16 |   expect_equal(
17 |     seq_dist(a = list(c(1L,2L,3L)), b = list(c(2L,1L,3L)), method="lv")
18 |     , 2 )
19 |   # the case setting 'dl' apart from 'osa'
20 |   expect_equal(
21 |     seq_dist(a = list(c(2L,1L)), b = list(c(1L,3L,2L)), method="dl")
22 |     , 2 )
23 |   expect_equal(
24 |     seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="hamming")
25 |     , 1 )
26 |   expect_equal(
27 |     seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="lcs")
28 |     , 2 )
29 |   expect_equal(
30 |     seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="qgram",q=2)
31 |   , 4 )
32 |   
33 |   expect_equal(
34 |     round(1-seq_dist(list(utf8ToInt("martha")),list(utf8ToInt("marhta")),method='jw'),3)
35 |     , 0.944
36 |   )
37 |   expect_error(
38 |     seq_dist(a = list(c(1L,2L,3L)), b = list(c(1L,0L,3L)), method="soundex")
39 |   )
40 | 
41 | 
42 | ## Conversion for non-integer-list arguments
43 |   expect_equal(seq_dist(list(c(1,2,3)),list(c(2,3,4))),seq_dist(as.numeric(c(1,2,3)),as.numeric(c(2,3,4))))
44 |   expect_equal(seq_dist(list(c(1,2,3)),list(c(2,3,4))),seq_dist(c(1,2,3), c(2,3,4)))
45 |   expect_equal(seq_distmatrix(list(c(1,2,3)),list(c(2,3,4))), seq_distmatrix(as.numeric(c(1,2,3)),as.numeric(c(2,3,4))))  
46 |   expect_equal(seq_distmatrix(list(c(1,2,3)),list(c(2,3,4))), seq_distmatrix(c(1,2,3),c(2,3,4)))  
47 |   expect_equal(seq_distmatrix(list(c(1,2,3))),seq_distmatrix(c(1,2,3)))
48 |   expect_equal(seq_distmatrix(list(c(1,2,3))),seq_distmatrix(as.numeric(c(1,2,3))))
49 | 
50 | 
51 | ## Some edge cases
52 |   expect_equal(length(seq_dist(list(),list(c(1L)))),0)
53 |   expect_equal(length(seq_dist(list(),list())),0)
54 | 
55 | 
56 | ## Elementary tests on seq_distmatrix
57 | 
58 |   expect_equivalent(seq_distmatrix(1:10),dist(0))
59 |   expect_equivalent(seq_distmatrix(1:10,list(1:10)),matrix(0))
60 |   expect_equivalent(
61 |     as.matrix(seq_distmatrix(list(c(1,2,3),c(2,3,4))) )
62 |     , matrix(c(0,2,2,0),nrow=2)
63 |   )
64 |   expect_equal(
65 |     as.matrix(seq_distmatrix(list(x=c(1,2,3),y=c(2,3,4)),useNames="names") )
66 |     , matrix(c(0,2,2,0),nrow=2,dimnames=list(c('x','y'),c('x','y')))
67 |   )
68 |   expect_equal(
69 |     seq_distmatrix(list(x=c(1,2,3),y=c(2,3,4)),list(x=c(1,2,3),y=c(2,3,4)),useNames="names")
70 |     , matrix(c(0,2,2,0),nrow=2,dimnames=list(c('x','y'),c('x','y')))
71 |   )
72 |   expect_equal(class(seq_distmatrix(list(c(1,2,3),c(2,3,4)))),"dist")
73 |   expect_equivalent(
74 |     as.matrix(seq_distmatrix(list(c(1,2,3),c(2,3,4))),seq_distmatrix(list(c(1,2,3),c(2,3,4)),list(c(1,2,3),c(2,3,4))) )
75 |     , matrix(c(0,2,2,0),nrow=2)
76 |   )
77 |   
78 | 
79 | 
80 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_afind.R:
--------------------------------------------------------------------------------
  1 | options(sd_num_thread=1L)
  2 | 
  3 | # tests against cases that used to segfault when we did not check
  4 | # NULL cases.
  5 | expect_error(afind("a","b",nthread=1:4))
  6 | expect_error(afind("a","b",nthread="foo"))
  7 | expect_error(afind("a","b",nthread=integer(0)))
  8 | expect_error(afind("a","b",nthread=NULL))
  9 | 
 10 | 
 11 | 
 12 | 
 13 | 
 14 | texts = c("When I grow up, I want to be"
 15 |   , "one of the harversters of the sea"
 16 |   , "I think before my days are gone"
 17 |   , "I want to be a fisherman")
 18 | 
 19 | patterns = c("fish", "gone","to be")
 20 | 
 21 | out <- afind(texts, patterns, method="osa")
 22 | 
 23 | location <- matrix(c(
 24 |    1,    1,  24,
 25 |    6,    1,  28,
 26 |    1,   28,   6,
 27 |   16,    3,   8),
 28 |   nrow=4, byrow=TRUE)
 29 | 
 30 | 
 31 | distance <- matrix(c(
 32 |    4,    3,    0,
 33 |    2,    2,    3,
 34 |    3,    0,    2,
 35 |    0,    3,    0),
 36 |    nrow=4, byrow=TRUE)
 37 | 
 38 | match <- matrix(c(
 39 |   "When", "When", "to be",
 40 |   "f th", "one ", "he se",
 41 |   "I th", "gone", "nk be",
 42 |   "fish", "want", "to be"),
 43 |    nrow=4, byrow=TRUE)
 44 | 
 45 | 
 46 | expect_equal(out$location, location)
 47 | expect_equal(out$distance, distance)
 48 | expect_equal(out$match,    match)
 49 | 
 50 | # test paralellization
 51 | 
 52 | out1 <- afind(texts, patterns, method="osa", nthread=2L)
 53 | expect_identical(out, out1)
 54 | 
 55 | # test 'value' option
 56 | out2 <- afind(texts, patterns, value=FALSE)
 57 | expect_equal(length(out2), 2)
 58 | 
 59 | 
 60 | # test grep/grepl equivalents 'grab', 'grabl'
 61 | 
 62 | expect_equal(grab(texts, "harvester", maxDist=2), 2)
 63 | expect_equal(grab(texts, "harvester", value=TRUE, maxDist=2), "harverste")
 64 | expect_equal(grabl(texts, "harvester", maxDist=2)
 65 |             , c(FALSE,TRUE,FALSE,FALSE))
 66 | 
 67 | expect_equal(extract(texts, "harvester", maxDist=2)
 68 |             , matrix(c(NA, "harverste",NA,NA),nrow=4) )
 69 | 
 70 | ## Test running_cosine
 71 | pattern <- c("phish", "want to")
 72 | 
 73 | expect_identical(
 74 |     afind(texts, pattern, method="cosine",         q=3)
 75 |   , afind(texts, pattern, method="running_cosine", q=3)
 76 | )
 77 | 
 78 | 
 79 | ## test whether the correct positions are returned for all methods.
 80 | 
 81 | methods = names(stringdist:::METHODS)
 82 | methods = methods[!methods %in% c("soundex","hamming")]
 83 | text <- "If you squeeze my lizzard, I put my snake on you."
 84 | pattern <- "lizard"
 85 | 
 86 | for ( method in methods ){
 87 |   expect_equal(afind(text, pattern, method=method, q=3, p=0.1)$location[1,1], 19, info=method)
 88 | }
 89 | 
 90 | ## test the usual edge cases
 91 | 
 92 | # notice: window size = 0.
 93 | expect_equal(afind("foo","")$distance[1], 0)
 94 | 
 95 | expect_equal(afind("foo",NA)$distance[1], NA_real_)
 96 | expect_equal(afind("foo",NA)$location[1], NA_integer_)
 97 | expect_equal(afind("foo",NA)$match[1], NA_character_)
 98 | 
 99 | expect_equal(afind(NA,"foo")$distance[1], NA_real_)
100 | expect_equal(afind(NA,"foo")$location[1], NA_integer_)
101 | expect_equal(afind(NA,"foo")$match[1], NA_character_)
102 | 
103 | expect_equal(afind("","foo")$distance[1], 3)
104 | expect_equal(afind("","foo")$location[1], 1)
105 | expect_equal(afind("","foo")$match[1], "")
106 | 
107 | expect_equal(grab("foo", ""), 1L)
108 | expect_equal(grabl("foo",""), TRUE)
109 | expect_equal(grab("foo",NA), integer(0))
110 | 
111 | # note that 'grepl' gives FALSE in this case (which is inconsistent with
112 | # grepl(NA, NA), grepl(NA, "foo").
113 | expect_equal(grabl("foo",NA), NA)
114 | 
115 | 
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 
128 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_stringsim.R:
--------------------------------------------------------------------------------
 1 | options(sd_num_thread=2)
 2 | ## stringsim
 3 | 
 4 | # We expect that two completely different strings have a similarity of
 5 | # 0 and two completely equal strings a similarity of 1
 6 | methods <- c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex")
 7 | for (method in methods) {
 8 | 
 9 |     expect_equal(stringsim("bb", "cc", method=method), 0)
10 |     expect_equal(stringsim("bb", "bb", method=method), 1)
11 | }
12 | 
13 | 
14 | ## edgecases
15 | for (method in methods[c(1:5,9:10)]){
16 |     expect_equal(stringsim(c("a", ""), "", method=method), c(0, 1))
17 |     
18 |     expect_equal(stringsim(c("kkk", "bbb"), "bbb", method=method), 
19 |       stringsim("bbb", c("kkk", "bbb"), method=method))
20 | }
21 | for (method in methods[6:8]){
22 |     expect_equal(stringsim(c("a", ""), "", method=method,q=0), c(1, 1))
23 |     
24 |     expect_equal(stringsim(c("kkk", "bbb"), "bbb", method=method,q=0), 
25 |                 stringsim("bbb", c("kkk", "bbb"), method=method,q=0))
26 | }
27 | 
28 | 
29 | 
30 | 
31 | ## Stringsim gets correct values with or without useBytes
32 |   x <- "ao"
33 |   y <- paste0("a",intToUtf8(0x00F6)) # o-umlaut
34 |   expect_equal(stringsim(x,y,method="osa", useBytes=FALSE), 1-1/2)
35 |   expect_equal(stringsim(x,y,method="osa", useBytes=TRUE ), 1-2/3)
36 |   expect_equal(stringsim(x,y,method="lv", useBytes=FALSE),  1-1/2)
37 |   expect_equal(stringsim(x,y,method="lv", useBytes=TRUE ),  1-2/3)
38 |   expect_equal(stringsim(x,y,method="dl", useBytes=FALSE),  1-1/2)
39 |   expect_equal(stringsim(x,y,method="dl", useBytes=TRUE ),  1-2/3)
40 |   expect_equal(stringsim(x,y,method="hamming", useBytes=FALSE), 1-1/2)
41 |   expect_equal(stringsim(x,y,method="hamming", useBytes=TRUE ), 1-1)
42 |   expect_equal(stringsim(x,y,method="lcs", useBytes=FALSE),     1-1/2)
43 |   expect_equal(stringsim(x,y,method="lcs", useBytes=TRUE ),     1-3/5)
44 |   expect_equal(stringsim(x,y,method="qgram", q=1, useBytes=FALSE), 1-1/2)
45 |   expect_equal(stringsim(x,y,method="qgram", q=1, useBytes=TRUE ), 1-3/5)
46 |   expect_equal(stringsim(x,y,method="cosine", q=1, useBytes=FALSE), 1-1/2)
47 |   expect_equal(stringsim(x,y,method="cosine", q=1, useBytes=TRUE ), 1-(1-1/sqrt(6)))
48 |   expect_equal(stringsim(x,y,method="jaccard", q=1, useBytes=FALSE), 1-2/3)
49 |   expect_equal(stringsim(x,y,method="jaccard", q=1, useBytes=TRUE ), 1-3/4)
50 |   expect_equal(stringsim(x,y,method="jw", useBytes=FALSE), 1-1/3)
51 |   expect_equal(stringsim(x,y,method="jw", useBytes=TRUE ), (1/2 + 1/3 +1)/3)
52 | 
53 | # stringsimmatrix
54 |   x <- names(islands)[1:10] 
55 |   y <- rev(x) # o-umlaut
56 |   expect_true(inherits(stringsimmatrix(x,y,method="osa", useBytes=FALSE), "matrix"))
57 |   expect_equal(dim(stringsimmatrix(x,y,method="osa", useBytes=FALSE)), c(10, 10))
58 |   expect_equal(stringsimmatrix(x,y,method="osa", useBytes=FALSE)[2, 2], 0.2)
59 |   expect_true(inherits(stringsimmatrix(x,method="osa", useBytes=FALSE), "matrix"))
60 |   expect_equal(dim(stringsimmatrix(x,method="osa", useBytes=FALSE)), c(10, 10))
61 |   expect_equal(stringsimmatrix(x,method="osa", useBytes=FALSE)[2, 9], 0.2)
62 |   expect_warning(stringdistmatrix(list('a')))
63 |   expect_warning(stringdistmatrix(list('a'),list('b')))
64 |  
65 | ## seq_sim
66 | 
67 | # We used to have list(1:3, 2:4) and list(1:3). This occasionally
68 | # gave failing tests, and only in the context of expect_equal (both
69 | # for tinytest and testthat. Therefore this may point to a hard-to-reproduce
70 | # bug in R's JIT compiler.
71 |   expect_equal(
72 |        seq_sim(list(c(1,2,3),c(2,3,4)), list(c(1,2,3)), method="cosine")
73 |      , stringsim(c("abc","bcd"),"abc", method="cosine") 
74 |   )
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 


--------------------------------------------------------------------------------
/pkg/R/doc_encoding.R:
--------------------------------------------------------------------------------
 1 | #' @title 
 2 | #' String metrics in \pkg{stringdist}
 3 | #' 
 4 | #' @description
 5 | #' This page gives an overview of encoding handling in \pkg{stringst}. 
 6 | #'
 7 | #' 
 8 | #' @section Encoding in \pkg{stringdist}:
 9 | #' 
10 | #' All character strings are stored as a sequence of bytes. An encoding
11 | #' system relates a byte, or a short sequence of bytes to a symbol. Over the years, many 
12 | #' encoding systems have been developed, and not all OS's and softwares use the same encoding 
13 | #' as default. Similarly, depending on the system R is running on, R may use a
14 | #' different encoding for storing strings internally.
15 | #'
16 | #' The \pkg{stringdist} package is designed so users in principle need not
17 | #' worry about this. Strings are converted to \code{UTF-32} (unsigned integer)
18 | #' by default prior to any further computation. This means that results are
19 | #' encoding-independent and that strings are interpreted as a sequence of
20 | #' symbols, not as a sequence of pure bytes. In functions where this is
21 | #' relevant, this may be switched by setting the \code{useBytes} option to
22 | #' \code{TRUE}. However, keep in mind that results will then likely depend on the
23 | #' system R is running on, except when your strings are pure ASCII.
24 | #' Also, for multi-byte encodings, results for byte-wise computations
25 | #' will usually differ from results using encoded computations.
26 | #'
27 | #' Prior to \pkg{stringdist} version 0.9, setting \code{useBytes=TRUE} could 
28 | #' give a significant performance enhancement. Since version 0.9, translation
29 | #' to integer is done by C code internal to \pkg{stringdist} and the difference in
30 | #' performance is now negligible.
31 | #'
32 | #' @section Unicode normalisation:
33 | #' In \code{utf-8}, the same (accented) character may be represented as several byte sequences. For example, an u-umlaut
34 | #' can be represented with a single byte code or as a byte code representing \code{'u'} followed by a modifier byte code
35 | #' that adds the umlaut. The \href{https://cran.r-project.org/package=stringi}{stringi} package 
36 | #' of Gagolevski and Tartanus offers unicode normalisation tools. 
37 | #'
38 | #' @section Some tips on character encoding and transliteration:
39 | #' Some algorithms (like soundex) are defined only on the printable ASCII character set. This excludes any character
40 | #' with accents for example. Translating accented characters to the non-accented ones is a form of transliteration. On
41 | #' many systems running R (but not all!) you can achieve this with 
42 | #' 
43 | #' \code{iconv(x,to="ASCII//TRANSLIT")}, 
44 | #' 
45 | #' where \code{x} is your character vector. See the documentation of \code{\link[base]{iconv}} for details.
46 | #'
47 | #' The \code{stringi} package (Gagolewski and Tartanus) should work on any system. The command 
48 | #' \code{stringi::stri_trans_general(x,"Latin-ASCII")} transliterates character vector \code{x} to ASCII.
49 | #'
50 | #' @references
51 | #' \itemize{
52 | #'  \item{The help page of \code{\link[base]{Encoding}}} describes how R handles encoding.
53 | #'  \item{The help page of \code{\link[base]{iconv}} has a good overview of base R's 
54 | #'       encoding conversion options. The capabilities of \code{iconv} depend on the system R is running on.
55 | #'       The \pkg{stringi} package offers platform-independent encoding and normalization tools.}
56 | #' }
57 | #'
58 | #' @seealso
59 | #' \itemize{
60 | #' \item{Functions using re-encoding: \code{\link{stringdist}}, \code{\link{stringdistmatrix}}, \code{\link{amatch}}, \code{\link{ain}}, \code{\link{qgrams}}}
61 | #' \item{Encoding related: \code{\link{printable_ascii}}}
62 | #' }
63 | #' @name stringdist-encoding
64 | #' @rdname stringdist-encoding
65 | {}
66 | 
67 | 


--------------------------------------------------------------------------------
/pkg/src/jaro.c:
--------------------------------------------------------------------------------
  1 | /*  stringdist - a C library of string distance algorithms with an interface to R.
  2 |  *  Copyright (C) 2013  Mark van der Loo
  3 |  *
  4 |  *  This program is free software: you can redistribute it and/or modify
  5 |  *  it under the terms of the GNU General Public License as published by
  6 |  *  the Free Software Foundation, either version 3 of the License, or
  7 |  *  (at your option) any later version.
  8 |  *
  9 |  *  This program is distributed in the hope that it will be useful,
 10 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |  *  GNU General Public License for more details.
 13 |  *
 14 |  *  You should have received a copy of the GNU General Public License
 15 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 16 |  *
 17 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
 18 |  */
 19 | 
 20 | #include "utils.h"
 21 | #include <string.h>
 22 | 
 23 | 
 24 | // Winkler's l-factor (nr of matching characters at beginning of the string).
 25 | static double get_l(unsigned int *a, unsigned int *b, int n){
 26 |   int i=0;
 27 |   double l;
 28 |   while ( a[i] == b[i] && i < n ){ 
 29 |     i++;
 30 |   }
 31 |   l = (double) i;
 32 |   return l;
 33 | }
 34 | 
 35 | 
 36 | /* jaro distance (see http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance).
 37 |  *
 38 |  * a    : string (in int rep)
 39 |  * b    : string (in int rep)
 40 |  * x    : length of a (in uints)
 41 |  * y    : length of b (in uints)
 42 |  * p    : Winkler's p-factor in (0,0.25)
 43 |  * bt   : Winkler's (undocumented?) boost threshold
 44 |  * work : workspace, minimally of length x + y
 45 |  *
 46 |  */
 47 | double jaro_winkler_dist(
 48 |      unsigned int *a 
 49 |      , int x
 50 |      , unsigned int *b
 51 |      , int y
 52 |      , double p
 53 |      , double bt
 54 |      , double *w
 55 |      , double *work
 56 |   ){
 57 | 
 58 | 
 59 |   // edge case
 60 |   if ( x == 0 && y == 0 ) return 0;
 61 | 
 62 |   for (int k=0; k < x + y; k++) work[k] = 0;
 63 | 
 64 |   // we need space for integers (or do a lot of conversions)
 65 |   unsigned int *wrk = (unsigned int*) work;
 66 |   unsigned int *matcha = wrk
 67 |              , *matchb = wrk + x;  
 68 |   unsigned int left, right;
 69 | 
 70 |   // number of matches
 71 |   int m = 0;
 72 |   // max transposition distance
 73 |   int M = MAX(MAX(x,y)/2 - 1,0);
 74 | 
 75 |   // store the match indices. Indices are stored as i+1 because 0 is used as 'no match'
 76 |   for ( int i = 0; i < x; ++i){
 77 |     left = MAX(0,i-M);
 78 |     right = MIN(y,i+M);
 79 |     for ( int j = left; j <= right; j++){
 80 |       if ((a[i] == b[j]) && (matchb[j]==0)){
 81 |          matcha[i] = i+1;
 82 |          matchb[j] = j+1;
 83 |          m += 1;
 84 |          break;
 85 |       }
 86 |     }
 87 |   }
 88 | 
 89 |   // copy matches so they're easy to compare for transposition counting
 90 |   int j = 0;
 91 |   for (int i=0; i < x; ++i){
 92 |     if (matcha[i]){ 
 93 |       matcha[j] = a[matcha[i]-1];
 94 |       ++j;
 95 |     }
 96 |   }
 97 |   j = 0;
 98 |   for (int i=0; i < y; ++i){
 99 |     if (matchb[i]){ 
100 |       matchb[j] = b[matchb[i]-1];
101 |       ++j;
102 |     }
103 |   }
104 | 
105 |   // count 'transpositions', the Jaro way.
106 |   double t = 0.0;
107 |   for ( int k=0; k<m; ++k){
108 |     if (matcha[k] != matchb[k]) t += 0.5;
109 |   }
110 | 
111 |   double d;
112 |   if ( m < 1 ){
113 |     d = 1.0;
114 |   } else {
115 |     d = 1.0 - (1.0/3.0)*(w[0]*m/((double) x) + w[1]*m/((double) y) + w[2]*(m-t)/m);
116 |   }
117 |   // Winkler's penalty factor
118 |   if ( p > 0 && d > bt ){
119 |     int n = MIN(MIN(x,y),4);
120 |     d =  d - get_l(a,b,n)*p*d; 
121 |   }
122 | 
123 |   return d;
124 | }
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/pkg/src/utils.h:
--------------------------------------------------------------------------------
  1 | /*  stringdist - a C library of string distance algorithms with an interface to R.
  2 |  *  Copyright (C) 2013  Mark van der Loo
  3 |  *
  4 |  *  This program is free software: you can redistribute it and/or modify
  5 |  *  it under the terms of the GNU General Public License as published by
  6 |  *  the Free Software Foundation, either version 3 of the License, or
  7 |  *  (at your option) any later version.
  8 |  *
  9 |  *  This program is distributed in the hope that it will be useful,
 10 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |  *  GNU General Public License for more details.
 13 |  *
 14 |  *  You should have received a copy of the GNU General Public License
 15 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 16 |  *
 17 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
 18 |  */
 19 | 
 20 | #ifndef sd_utils_h
 21 | #define sd_utils_h
 22 | 
 23 | #define USE_RINTERNALS
 24 | #include <stdlib.h>
 25 | #include <R.h>
 26 | #include <Rdefines.h>
 27 | 
 28 | /* integer recycling macro  */
 29 | #define RECYCLE(X,Y) ( (X) == (Y) ? 0 : (X) )
 30 | 
 31 | #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
 32 | #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
 33 | #define ABS(X) ((X)<0 ? -1*(X) : (X))
 34 | 
 35 | 
 36 | /* A structure for storing integer reps of strings */
 37 | typedef struct {
 38 |   // array of pointers to integer representation of strings, stored in data.
 39 |   unsigned int **string;
 40 |   //  array of string lengths.
 41 |   int *str_len;
 42 |   //  storage room for integer representation of strings
 43 |   unsigned int *data;
 44 | } Stringset;
 45 | 
 46 | 
 47 | /* size of longest object in a SEXP
 48 |  *
 49 |  *
 50 |  *
 51 |  */
 52 | unsigned int max_length(SEXP);
 53 | 
 54 | /* Get element from SEXP list and determine some parameters.
 55 |  *
 56 |  * Input:
 57 |  * x: A list of integer vectors or a character vector
 58 |  * i: Index in x: what element to extract.
 59 |  * bytes: (boolean) if (bytes) then x is assumed to be a character vector.
 60 |  * 
 61 |  * Output:
 62 |  * len  : the length of the i'th object in x.
 63 |  * isna : wether the i'th object represents an NA
 64 |  * c    : if (bytes) then c will contain the values of the i'th element in x, coerced to integers.
 65 |  *
 66 |  * Return value: 
 67 |  * A pointer to the integer representation of the i'th object in x.
 68 |  *
 69 |  */
 70 | unsigned int *get_elem(SEXP x, R_xlen_t i, int bytes, int intdist, int *len, int *isna, unsigned int *c);
 71 | 
 72 | 
 73 | /* (mutlithreaded) recycling.
 74 |  *
 75 |  * This avoids having to compute i % ni at every iteration while
 76 |  * recycling over a vector. 
 77 |  *
 78 |  * In the default case, the counter i starts at omp_thread_num() and
 79 |  * is increased with omp_get_num_threads() (round robin parallelization).
 80 |  * If a counter is increased to above the length of a vector, it's value
 81 |  * is decreased with the vector length (recycling).
 82 |  *
 83 |  * There is an edge case when omp_get_num_threads() is less than the
 84 |  * vector's length. In that case, when i > the vector length, the 
 85 |  * modulus is computed anyway.
 86 |  *
 87 |  *
 88 |  * Input:
 89 |  * i : integer, current index.
 90 |  * nthreads : number of threads working on the vector.
 91 |  * ni : vector length
 92 |  *
 93 |  *
 94 |  */
 95 | static inline int recycle(int i, int nthreads, int ni){
 96 |   i += nthreads;
 97 |   if ( i >= ni )
 98 |     i = (nthreads < ni) ? (i - ni) : (i % ni);
 99 |   return i;
100 | }
101 | 
102 | /* Create a new stringset from an character vector.
103 |  *
104 |  * Translates character vectors to integers. Input is expected in utf-8 format.
105 |  * Translation can be bytewise (bytes=1) or interpreted utf8.
106 |  *
107 |  * Output: Pointer to a Stringset.
108 |  *
109 |  *
110 |  *
111 |  */
112 | Stringset *new_stringset(SEXP str, int bytes, int intdist);
113 | 
114 | /* Clean up a Stringset. */
115 | void free_stringset(Stringset *s);
116 | 
117 | 
118 | #endif
119 | 


--------------------------------------------------------------------------------
/pkg/R/qgrams.R:
--------------------------------------------------------------------------------
  1 | #' Get a table of qgram counts from one or more character vectors.
  2 | #'
  3 | #' @section Details:
  4 | #' The input is converted to \code{character}. If \code{useBytes=TRUE}, each element is 
  5 | #' converted to \code{utf8} and then to \code{integer} as in \code{\link{stringdist}}. 
  6 | #' Next,the data is passed to the underlying routine.
  7 | #' 
  8 | #' Strings with less than \code{q} characters and elements containing \code{NA} are skipped. Using \code{q=0} 
  9 | #' therefore counts the number of empty strings \code{""} occuring in each argument.
 10 | #' 
 11 | #' @param ... any number of (named) arguments, that will be coerced to character with \code{as.character}.
 12 | #' @param q size of q-gram, must be non-negative.
 13 | #' @param useBytes Determine byte-wise qgrams. \code{useBytes=TRUE} is faster but may yield different
 14 | #' 	results depending on character encoding. For \code{ASCII} it is identical. See also \code{\link{stringdist}} under Encoding issues.
 15 | #' @param useNames Add q-grams as column names. If \code{useBytes=useNames=TRUE}, the q-byte sequences are represented as 2 hexadecimal numbers
 16 | #'   per byte, separated by a vertical bar (\code{|}).
 17 | #' @param .list Will be concatenated with the \code{...} argument(s). Useful for adding character vectors named \code{'q'} or \code{'useNames'}.
 18 | #' @return A table with \eqn{q}-gram counts. Detected \eqn{q}-grams are column names and the argument names as row names.
 19 | #' If no argument names were provided, they will be generated.
 20 | #'
 21 | #' @seealso \code{\link{stringdist}}, \code{\link{amatch}}
 22 | #'
 23 | #' @example ../examples/qgrams.R
 24 | #' @export
 25 | qgrams <- function(..., .list=NULL,q=1L,useBytes=FALSE, useNames=!useBytes){
 26 |   stopifnot(is.numeric(q), length(q)==1, !is.na(q), q>=0)
 27 |   q <- as.integer(q)
 28 | 
 29 |   if (!is.null(.list) && length(.list) == 0) .list=NULL
 30 |   L <- lapply(c(list(...),.list), as.character)
 31 |   if (length(L) == 0) return(array(dim=c(0,0)))
 32 |   L <- setnames(L)
 33 | 
 34 |   if (q==0){
 35 |     return( matrix( sapply(L,function(x) sum(x==""))
 36 |                   , ncol=1
 37 |                   , dimnames = list(names(L), NULL)) )
 38 |   }
 39 | 
 40 | 
 41 |   L <- lapply(L,char2int)
 42 | 
 43 |   v <- .Call("R_get_qgrams",L,as.integer(q),PACKAGE="stringdist")
 44 |   
 45 |   nqgrams <- length(v)/length(L)
 46 |   qgrams <- NULL
 47 |   if (useNames){  
 48 |     if ( q == 0 ){
 49 |       qgrams = ""
 50 |     } else {
 51 |       Q <- attr(v,"qgrams")
 52 |       attr(v,"qgrams") <- NULL
 53 |       A <- array(Q,dim=c(q, nqgrams))
 54 |       qgrams = if( useBytes ){
 55 |             apply(A,2,function(x) paste(as.raw(x),collapse="|")) 
 56 |           } else {
 57 |             enc2native(apply(A,2,intToUtf8))
 58 |           }
 59 |     }
 60 |   }
 61 |   array(v,
 62 |     dim=c(length(L), nqgrams),
 63 |     dimnames = list(
 64 |     names(L),
 65 |     qgrams
 66 |    ) 
 67 |   )
 68 | }
 69 | 
 70 | 
 71 | setnames <- function(x){
 72 |   list_names <- names(x)
 73 |   generic_names <- paste0("V",seq_along(x))
 74 |   if (is.null(list_names)){
 75 |     return(setNames(x,generic_names))
 76 |   }
 77 |   I <- list_names == ""
 78 |   names(x)[I] <- generic_names[I]
 79 |   x
 80 | }
 81 | 
 82 | #' Get a table of qgram counts for integer sequences
 83 | #' 
 84 | #' 
 85 | #' @param ... Any number of (named) arguments that will be coerced with \code{as.integer}
 86 | #' @param .list Will be concatenated with the \code{...} argument(s). Useful for adding integer vectors named 'q'.
 87 | #' @param q The size of q-gramming.
 88 | #' 
 89 | #' @return 
 90 | #' A \code{matrix} containing q-gram profiles. Columns 1 to \code{q} contain the
 91 | #' encountered q-grams. The ensuing (named) columns contain the q-gram counts
 92 | #' per vector. Run the example for a simple overview.
 93 | #' 
 94 | #' Missing values in integer sequences are treated as any other number.  
 95 | #' 
 96 | #' @example ../examples/seq_qgrams.R
 97 | #' 
 98 | #' @seealso \code{\link{seq_dist}}, \code{\link{seq_amatch}}
 99 | #' 
100 | #' @export 
101 | #' 
102 | seq_qgrams <- function(...,.list=NULL,q=1L){
103 |   L <- lapply(c(list(...),.list),function(x) list(as.integer(x)))
104 |   if (length(L) == 0) return(array(dim=c(0,0)))
105 |   L <- setnames(L)
106 |   v <- .Call("R_get_qgrams",L,as.integer(q), PACKAGE="stringdist")
107 |   Q <- attr(v,"qgrams")
108 |   nqgrams <- length(v)/length(L)
109 |   Q <- t(array(Q,dim=c(q,nqgrams),dimnames=list(paste0("q",1:q),NULL)))
110 |   A <- matrix(v, nrow=nqgrams, ncol=length(L),byrow=TRUE,dimnames=list(NULL,paste0("n.",names(L))))
111 |   cbind(Q,A)
112 | }
113 | 


--------------------------------------------------------------------------------
/pkg/src/dl.c:
--------------------------------------------------------------------------------
  1 | /*  stringdist - a C library of string distance algorithms with an interface to R.
  2 |  *  Copyright (C) 2013  Mark van der Loo
  3 |  *
  4 |  *  This program is free software: you can redistribute it and/or modify
  5 |  *  it under the terms of the GNU General Public License as published by
  6 |  *  the Free Software Foundation, either version 3 of the License, or
  7 |  *  (at your option) any later version.
  8 |  *
  9 |  *  This program is distributed in the hope that it will be useful,
 10 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |  *  GNU General Public License for more details.
 13 |  *
 14 |  *  You should have received a copy of the GNU General Public License
 15 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 16 |  *
 17 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
 18 |  *
 19 |  *
 20 |  * This code is gratefully based on Nick Logan's github repository
 21 |  * https://github.com/ugexe/Text--Levenshtein--Damerau--XS/blob/master/damerau-int.c
 22 |  * 
 23 |  *
 24 |  * Changes/additions wrt original code:
 25 |  * - Added R.h, Rdefines.h inclusion
 26 |  * - Added R interface function
 27 |  * - Added edit weights (function is now of type double)
 28 |  * - Added corner cases for length-zero strings.
 29 |  * - Replaced linked list dictionary with fixed-size struct for loop 
 30 |  *    externalization of memory allocation.
 31 |  * - Externalized allocation of dynamic programming matrix.
 32 |  * 
 33 |  * 
 34 |  */
 35 | 
 36 | 
 37 | #include "utils.h"
 38 | #include "dictionary.h"
 39 | 
 40 | /*
 41 | static void print_dict(dictionary *d){
 42 |   for ( int i=0; i<d->length; i++){
 43 |     Rprintf("d[%d] = %d; ", i, d->key[i]);
 44 |   }
 45 |   Rprintf("\n");
 46 | }
 47 | */
 48 | 
 49 | static void reset_dictionary(dictionary *d){
 50 |   int nbytes = sizeof(unsigned int)*(d->length);
 51 |   memset(d->key  , 0, nbytes);
 52 |   memset(d->value, 0, nbytes);
 53 | }
 54 | 
 55 | dictionary *new_dictionary(unsigned int length){
 56 |   dictionary *d = (dictionary *) malloc(sizeof(dictionary));
 57 |   if ( d == NULL ){
 58 |     return NULL;
 59 |   }
 60 |   d->key   = (unsigned int *) malloc(length*sizeof(int));
 61 |   d->value = (unsigned int *) malloc(length*sizeof(int));
 62 |   if ( d->key == NULL || d->value == NULL){
 63 |     free(d->key);
 64 |     free(d->value);
 65 |     free(d);
 66 |     return NULL;
 67 |   }
 68 |   d->length = length;
 69 |   reset_dictionary(d);
 70 |   return d;
 71 | }
 72 | 
 73 | void free_dictionary(dictionary *d){
 74 |   if ( d != NULL ){
 75 |     free(d->key);
 76 |     free(d->value);
 77 |     free(d);
 78 |   }
 79 | }
 80 | 
 81 | 
 82 | static void uniquePush(dictionary *d, unsigned int key){
 83 |   int i=0;
 84 |   while (d->key[i] && d->key[i] != key){
 85 |     ++i;
 86 |   }
 87 |   d->key[i] = key;
 88 | }
 89 | 
 90 | static unsigned int which(dictionary *d, unsigned int key){
 91 |   int i=0;
 92 |   while( d->key[i] != key ){
 93 |      ++i;
 94 |   }
 95 |   return i;
 96 | }
 97 | 
 98 | 
 99 | // note: src (tgt) will be indexed to their x + 1 (y+1).
100 | double dl_dist(
101 |       unsigned int *src,
102 |       int x,
103 |       unsigned int *tgt,
104 |       int y,
105 |       double *weight,
106 |       dictionary *dict,
107 |       double *scores
108 |     ){
109 | 
110 |   if (!x){
111 |     return (double) y * weight[1]; // ins weight
112 |   }
113 |   if (!y){
114 |     return (double) x * weight[0]; // del weight
115 |   }
116 | 
117 |   unsigned int swapCount, targetCharCount,i,j;
118 |   double delScore, insScore, subScore, swapScore;
119 |   double score_ceil =  x + y;
120 |   
121 |   /* intialize matrix start values */
122 |   scores[0] = score_ceil;  
123 |   scores[1 * (y + 2) + 0] = weight[0]; //score_ceil;
124 |   scores[0 * (y + 2) + 1] = weight[1]; //score_ceil;
125 |   scores[1 * (y + 2) + 1] = 0;
126 | 
127 |   uniquePush(dict,src[0]);
128 |   uniquePush(dict,tgt[0]);
129 | 
130 |   /* work loops    */
131 |   /* i = src index */
132 |   /* j = tgt index */
133 |   for(i=1;i<=x;i++){ 
134 |     uniquePush(dict,src[i]);
135 |     scores[(i+1) * (y + 2) + 1] = i * weight[0];
136 |     scores[(i+1) * (y + 2) + 0] = score_ceil;
137 |     swapCount = 0;
138 |     
139 |     for(j=1;j<=y;j++){
140 |       if(i == 1) {
141 |         uniquePush(dict,tgt[j]);
142 |         scores[1 * (y + 2) + (j + 1)] = j * weight[1];
143 |         scores[0 * (y + 2) + (j + 1)] = score_ceil;
144 |       }
145 |       targetCharCount = dict->value[which(dict, tgt[j-1])];
146 |       swapScore = scores[targetCharCount * (y + 2) + swapCount] + (i - targetCharCount - 1 + j - swapCount) *  weight[3];
147 | 
148 |       if(src[i-1] != tgt[j-1]){
149 |         subScore = scores[i * (y + 2) + j] + weight[2];
150 |         insScore = scores[(i+1) * (y + 2) + j] + weight[1];
151 |         delScore = scores[i * (y + 2) + (j + 1)] + weight[0];
152 |         scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore, MIN(delScore, MIN(insScore, subScore) ));
153 |       } else {
154 |         swapCount = j;
155 |         scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
156 |       }
157 |     }
158 |     
159 |    dict->value[which(dict,src[i-1])] = i;    
160 |   }
161 | 
162 |   double score = scores[(x+1) * (y + 2) + (y + 1)];
163 |   reset_dictionary(dict);
164 |   return score;
165 | }
166 | 
167 | 
168 | 


--------------------------------------------------------------------------------
/pkg/src/stringdist.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /*  stringdist - a C library of string distance algorithms with an interface to R.
  3 |  *  Copyright (C) 2013  Mark van der Loo
  4 |  *
  5 |  *  This program is free software: you can redistribute it and/or modify
  6 |  *  it under the terms of the GNU General Public License as published by
  7 |  *  the Free Software Foundation, either version 3 of the License, or
  8 |  *  (at your option) any later version.
  9 |  *
 10 |  *  This program is distributed in the hope that it will be useful,
 11 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 13 |  *  GNU General Public License for more details.
 14 |  *
 15 |  *  You should have received a copy of the GNU General Public License
 16 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 17 |  *
 18 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
 19 |  */
 20 | 
 21 | #include <stdlib.h>
 22 | #include <string.h>
 23 | #include <stdarg.h>
 24 | #include "dist.h"
 25 | #include "stringdist.h"
 26 | 
 27 | #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
 28 | 
 29 | // todo: remove spurious include.
 30 | #include <R.h>
 31 | 
 32 | /* 
 33 |  *
 34 |  *
 35 |  *
 36 |  */
 37 | Stringdist *open_stringdist(Distance d, int str_len_a, int str_len_b, ...){
 38 |   va_list args;
 39 |   va_start(args, str_len_b);
 40 | 
 41 |   Stringdist *S = (Stringdist *) malloc(sizeof(Stringdist)); 
 42 |   (*S) = (Stringdist) {d, NULL, NULL, NULL, NULL, 0L, 0.0, 0.0, 0L};
 43 |   switch (d){
 44 |     case osa :
 45 |       S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) * sizeof(double)); 
 46 |       S->weight = (double *) malloc(4*sizeof(double));
 47 |       memcpy(S->weight, va_arg(args, double *), 4*sizeof(double));
 48 |       break;
 49 |     case lv :
 50 |       S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) *sizeof(double));
 51 |       S->weight = (double *) malloc(3 * sizeof(double));
 52 |       memcpy(S->weight, va_arg(args, double *), 3*sizeof(double));
 53 |       break;
 54 |     case dl :
 55 |       S->dict = new_dictionary( str_len_a + str_len_b + 1);
 56 |       S->work = (double *) malloc( (str_len_a + 3) * (str_len_b + 3) * sizeof(double)); 
 57 |       S->weight = (double *) malloc(4*sizeof(double));
 58 |       memcpy(S->weight, va_arg(args, double *), 4*sizeof(double));
 59 |       break;
 60 |     case hamming :
 61 |       break;
 62 |     case lcs :
 63 |       S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) *sizeof(double));
 64 |       break;
 65 |     case qgram :
 66 |       S->q = va_arg(args, unsigned int);
 67 |       S->tree = new_qtree(S->q, 2L); 
 68 |       break;
 69 |     case cosine :
 70 |       S->q = va_arg(args, unsigned int);
 71 |       S->tree = new_qtree(S->q, 2L); 
 72 |       break;
 73 |     case jaccard :
 74 |       S->q = va_arg(args, unsigned int);
 75 |       S->tree = new_qtree(S->q, 2L); 
 76 |       break;
 77 |     case running_cosine :
 78 |       S->q = va_arg(args, unsigned int);
 79 |       S->tree = new_qtree(S->q, 2L);
 80 |       S->work = (double *) malloc(3*sizeof(double));
 81 |       break;
 82 |     case jw :
 83 |       S->work = (double *) malloc( sizeof(double) * (str_len_a+str_len_b));
 84 |       S->weight = (double *) malloc(3L*sizeof(double));
 85 |       memcpy(S->weight, va_arg(args, double *), 3*sizeof(double));
 86 |       S->p = va_arg(args, double);
 87 |       S->bt = va_arg(args, double);
 88 |       break;
 89 |     case soundex :
 90 |       break;
 91 |     default :
 92 |       break;
 93 |   };
 94 | 
 95 |   va_end(args);
 96 | 
 97 |   if ( (d == osa || d == lv || d == dl || d == lcs || d== jw) &&  S->work == NULL ){ 
 98 |       close_stringdist(S);
 99 |       return(NULL);
100 |   } 
101 |   return S; 
102 |   
103 | }
104 | 
105 | void close_stringdist(Stringdist *S){
106 |   free(S->work);
107 |   free(S->weight);
108 | 
109 |   if (S->distance == dl){
110 |     free_dictionary(S->dict);
111 |   }
112 |   if (S->distance == qgram || S->distance == cosine || S->distance == jaccard){
113 |     free_qtree();
114 |   }
115 |   free(S);
116 | }
117 | 
118 | void reset_stringdist(Stringdist *S){
119 |   if (S->distance == running_cosine){
120 |       free_qtree();
121 |       S->tree = new_qtree(S->q, 2L);
122 |   }
123 | }
124 | 
125 | 
126 | 
127 | 
128 | 
129 | double stringdist(Stringdist *S, unsigned int *str_a, int len_a, unsigned int *str_b, int len_b){
130 |   double d = -1.0;
131 | 
132 |   switch(S->distance){
133 |     case osa :
134 |      return osa_dist(str_a, len_a, str_b, len_b, S->weight, S->work);
135 |     case lv :
136 |       return lv_dist( str_a, len_a, str_b, len_b, S->weight, S->work);
137 |     case dl :
138 |       return dl_dist(str_a, len_a, str_b, len_b, S->weight, S->dict, S->work);
139 |     case hamming :
140 |       return hamming_dist(str_a, len_a, str_b, len_b);
141 |     case lcs :
142 |       return lcs_dist(str_a, len_a, str_b, len_b, S->work); 
143 |     case qgram :
144 |       return qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 0L);
145 |     case cosine :
146 |       return qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 1L);
147 |     case jaccard :
148 |       return qgram_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, 2L);
149 |     case running_cosine:
150 |       return running_cosine_dist(str_a, len_a, str_b, len_b, S->q, &S->tree, S->work);
151 |     case jw :
152 |       return jaro_winkler_dist(str_a, len_a, str_b, len_b, S->p, S->bt, S->weight, S->work);
153 |     case soundex :
154 |       return soundex_dist(str_a, len_a, str_b, len_b, &(S->ifail));
155 |     default :
156 |       break;
157 |       // set errno, return -1
158 |   }
159 |     return d;
160 | }
161 | 
162 | 
163 | 
164 | 
165 | 
166 | 
167 | 


--------------------------------------------------------------------------------
/pkg/R/seqdist.R:
--------------------------------------------------------------------------------
  1 | #' Compute distance metrics between integer sequences
  2 | #'
  3 | #' \code{seq_dist} computes pairwise string distances between elements of 
  4 | #' \code{a} and \code{b}, where the argument with less elements is recycled. 
  5 | #' \code{seq_distmatrix} computes the distance matrix with rows according to
  6 | #' \code{a} and columns according to \code{b}.
  7 | #'
  8 | #'
  9 | #' @section Notes:
 10 | #' Input vectors are converted with \code{as.integer}. This causes truncation for numeric
 11 | #' vectors (e.g. \code{pi} will be treated as \code{3L}).
 12 | #'
 13 | #' @param a (\code{list} of) \code{integer} or \code{numeric} vector(s). Will be converted with \code{as.integer}  (target)
 14 | #' @param b (\code{list} of) \code{integer} or \code{numeric} vector(s). Will be converted with \code{as.integer} (source). 
 15 | #'    Optional for \code{seq_distmatrix}.
 16 | #' @param method Distance metric. See \code{\link{stringdist-metrics}}
 17 | #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for
 18 | #'   deletion, insertion, substitution and transposition, in that order. When
 19 | #'   \code{method='lv'}, the penalty for transposition is ignored. When
 20 | #'   \code{method='jw'}, the weights associated with characters of \code{a},
 21 | #'   characters from \code{b} and the transposition weight, in that order. 
 22 | #'   Weights must be positive and not exceed 1. \code{weight} is ignored
 23 | #'   completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
 24 | #'   \code{'Jaccard'}, or \code{'lcs'}
 25 | #' @param q  Size of the \eqn{q}-gram; must be nonnegative. Only applies to
 26 | #'   \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}.
 27 | #' @param p Prefix factor for Jaro-Winkler distance. The valid range for 
 28 | #'   \code{p} is \code{0 <= p <= 0.25}. If \code{p=0} (default), the
 29 | #'   Jaro-distance is returned. Applies only to \code{method='jw'}.
 30 | #' @param bt Winkler's boost threshold. Winkler's prefix factor is
 31 | #'   only applied when the Jaro distance is larger than \code{bt}
 32 | #'   Applies only to \code{method='jw'} and \code{p>0}.
 33 | #' @param nthread Maximum number of threads to use. By default, a sensible
 34 | #'   number of threads is chosen, see \code{\link{stringdist-parallelization}}.
 35 | #'
 36 | #' @return 
 37 | #' 
 38 | #' \code{seq_dist} returns a numeric vector with pairwise distances between \code{a}
 39 | #' and \code{b} of length \code{max(length(a),length(b)}.
 40 | #' 
 41 | #' For \code{seq_distmatrix} there are two options. If \code{b} is missing, the 
 42 | #' \code{\link[stats]{dist}} object corresponding to the \code{length(a) X
 43 | #' length(a)} distance matrix is returned. If \code{b} is specified, the
 44 | #' \code{length(a) X length(b)} distance matrix is returned.
 45 | #'    
 46 | #' If any element of \code{a} or \code{b} is \code{NA_integer_}, the distance with
 47 | #' any matched integer vector will result in \code{NA}. Missing values in the sequences
 48 | #' themselves are treated as a number and not treated specially (Also see the examples).
 49 | #'   
 50 | #' @seealso \code{\link{seq_sim}}, \code{\link{seq_amatch}}, \code{\link{seq_qgrams}} 
 51 | #'  
 52 | #' @example ../examples/seq_dist.R
 53 | #' @export
 54 | seq_dist <- function(a, b
 55 |   , method=c("osa","lv","dl","hamming","lcs", "qgram","cosine","jaccard","jw")
 56 |   , weight=c(d=1,i=1,s=1,t=1) 
 57 |   , q=1, p=0, bt=0
 58 |   , nthread = getOption("sd_num_thread")
 59 | ){
 60 |   a <- ensure_int_list(a)
 61 |   b <- ensure_int_list(b)
 62 |   
 63 |   stopifnot(
 64 |      all(is.finite(weight))
 65 |     , all(weight > 0)
 66 |     , all(weight <=1)
 67 |     , q >= 0
 68 |     , p <= 0.25
 69 |     , p >= 0
 70 |     , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE)
 71 |     , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE)
 72 |     , length(nthread) == 1
 73 |     , is.numeric(nthread)
 74 |     , nthread > 0
 75 |   )
 76 |   
 77 |   
 78 |   if (length(a) == 0 || length(b) == 0){ 
 79 |     return(numeric(0))
 80 |   }
 81 |   if ( max(length(a),length(b)) %% min(length(a),length(b)) != 0 ){
 82 |     warning(RECYCLEWARNING)
 83 |   }
 84 |   method <- match.arg(method)
 85 |   nthread <- as.integer(nthread)
 86 |   if (method == 'jw') weight <- weight[c(2,1,3)]
 87 |   do_dist(a=b, b=a
 88 |     , method=method
 89 |     , weight=weight
 90 |     , q=q
 91 |     , p=p
 92 |     , bt=bt
 93 |     , nthread=nthread)
 94 | }
 95 | 
 96 | #' @param useNames label the output matrix with \code{names(a)} and \code{names(b)}?
 97 | #' @rdname seq_dist
 98 | #' @export 
 99 | seq_distmatrix <- function(a, b
100 |    , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw")
101 |    , weight=c(d=1,i=1,s=1,t=1),  q=1, p=0, bt=0
102 |    , useNames=c("names","none")
103 |    , nthread = getOption("sd_num_thread")
104 | ){
105 |   useNames <- match.arg(useNames)
106 |   method <- match.arg(method)
107 |   nthread <- as.integer(nthread)
108 |   if (method == 'jw') weight <- weight[c(2,1,3)]
109 |   
110 |   a <- ensure_int_list(a)
111 |   
112 |   # if b is missing, generate a 'dist' object.  
113 |   if (missing(b)){ 
114 |     return( lower_tri(a
115 |         , method=method
116 |         , weight=weight
117 |         , q=q
118 |         , p=p
119 |         , bt=bt
120 |         , useNames=useNames
121 |         , nthread=nthread)
122 |     )
123 |   }
124 |  
125 |   b <- ensure_int_list(b)
126 |   if (length(a) == 0 || length(b) == 0){ 
127 |     return(matrix(numeric(0)))
128 |   }
129 |   
130 |   if (useNames == "names"){
131 |     rowns <- names(a)
132 |     colns <- names(b)
133 |   }
134 |   
135 |   
136 |   x <- vapply(b
137 |       , function(src) do_dist(list(src), b=a, method=method, weight=weight, q=q, p=p,bt=bt, nthread=nthread)
138 |       , USE.NAMES=FALSE, FUN.VALUE=numeric(length(a))
139 |     )
140 |   
141 |   if (useNames == "names" ){  
142 |     structure(matrix(x,nrow=length(a),ncol=length(b), dimnames=list(rowns,colns)))
143 |   } else {
144 |     matrix(x,nrow=length(a),ncol=length(b)) 
145 |   }
146 |   
147 | }
148 | 
149 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | [![CRAN](http://www.r-pkg.org/badges/version/stringdist)](http://cran.r-project.org/web/packages/stringdist/NEWS)
  3 | [![status](https://tinyverse.netlify.com/badge/stringdist)](https://CRAN.R-project.org/package=stringdist)
  4 | [![Downloads](http://cranlogs.r-pkg.org/badges/stringdist)](http://cran.r-project.org/package=stringdist/)[![Research software impact](http://depsy.org/api/package/cran/stringdist/badge.svg)](http://depsy.org/package/r/stringdist)[![Mentioned in Awesome Official Statistics ](https://awesome.re/mentioned-badge.svg)](http://www.awesomeofficialstatistics.org)
  5 | 
  6 | 
  7 | 
  8 | 
  9 | 
 10 | 
 11 | ## stringdist
 12 | 
 13 | * Approximate matching, fuzzy text search, and string distance calculations for R. 
 14 | * All distance and matching operations are system- and encoding-independent.
 15 | * Built for speed, using [openMP](https://www.openmp.org/) for parallel computing.
 16 | 
 17 | 
 18 | ## Citing
 19 | 
 20 | Please cite the [R-Journal article](https://journal.r-project.org/archive/2014/RJ-2014-011/index.html)
 21 | 
 22 | ```
 23 | @article{RJ-2014-011,
 24 |   author = {Mark P.J. van der Loo},
 25 |   title = {{The stringdist Package for Approximate String Matching}},
 26 |   year = {2014},
 27 |   journal = {{The R Journal}},
 28 |   doi = {10.32614/RJ-2014-011},
 29 |   url = {https://doi.org/10.32614/RJ-2014-011},
 30 |   pages = {111--122},
 31 |   volume = {6},
 32 |   number = {1}
 33 | }
 34 | ```
 35 | 
 36 | ## Functionality 
 37 | 
 38 | The package offers the following main functions:
 39 | 
 40 | * `stringdist`  computes pairwise distances between two input character vectors (shorter one is recycled)
 41 | * `stringdistmatrix` computes the distance matrix for one or two vectors
 42 | * `stringsim` computes a string similarity between 0 and 1, based on `stringdist`
 43 | * `amatch` is a fuzzy matching equivalent of R's native `match` function
 44 | * `ain` is a fuzzy matching equivalent of R's native `%in%` operator
 45 | * `afind` finds the location of fuzzy matches of a short string in a long string.
 46 | * `seq_dist`, `seq_distmatrix`, `seq_amatch` and `seq_ain` for distances between, and matching of integer sequences. (see also the [hashr](https://github.com/markvanderloo/hashr) package).
 47 | 
 48 | These functions are built upon `C`-code that re-implements some common (weighted) string
 49 | distance functions. Distance functions include:
 50 | 
 51 | * Hamming distance; 
 52 | * Levenshtein distance (weighted);
 53 | * Restricted Damerau-Levenshtein distance (weighted, a.k.a. Optimal String Alignment);
 54 | * Full Damerau-Levenshtein distance (weighted);
 55 | * Longest Common Substring distance;
 56 | * Q-gram distance
 57 | * cosine distance for q-gram count vectors (= 1-cosine similarity)
 58 | * Jaccard distance for q-gram count vectors (= 1-Jaccard similarity)
 59 | * Jaro, and Jaro-Winkler distance
 60 | * Soundex-based string distance.
 61 | 
 62 | Also, there are some utility functions:
 63 | 
 64 | * `qgrams()` tabulates the qgrams in one or more `character` vectors.
 65 | * `seq_qrams()` tabulates the qgrams (somtimes called ngrams) in one or more `integer` vectors.
 66 | * `phonetic()` computes phonetic codes of strings (currently only soundex)
 67 | * `printable_ascii()` is a utility function that detects non-printable ascii or non-ascii characters.
 68 | 
 69 | #### C API
 70 | 
 71 | As of version `0.9.5.0`  you can call a number of `stringdist` functions directly
 72 | from the `C` code of your R package. The description of the API can be found 
 73 | 
 74 | - By typing `?stringdist_api` in the R console
 75 | - By browsing the package's help index to `User guides, package vignettes and other documentation` and clicking on `doc/stringdist_api.pdf`.
 76 | - Or you can find the file's location as follows
 77 | 
 78 | ```
 79 | system.file("doc/stringdist_api.pdf", package="stringdist")
 80 | ```
 81 | 
 82 | Examples of packages that link to `stringdist` can be found [here](https://github.com/markvanderloo/linkstringdist) and
 83 | [here](https://github.com/ChrisMuir/refinr).
 84 | 
 85 | 
 86 | 
 87 | 
 88 | #### Installation
 89 | 
 90 | To install the latest release from CRAN, open an R terminal and type
 91 | 
 92 | `install.packages('stringdist')`
 93 | 
 94 | 
 95 | To obtain the package from the very latest source code open a `bash` terminal (or `git bash` if you work under Windows
 96 | with `msysgit`) and type
 97 | 
 98 | ```
 99 | git clone https://github.com/markvanderloo/stringdist.git
100 | cd stringdist
101 | bash ./build.bash
102 | R CMD INSTALL output/stringdist_*.tar.gz
103 | ```
104 | 
105 | Warning: the github version can change any time and may not even build properly. As most
106 | of the code is written in `C`, the development version may crash your `R`-session.
107 | 
108 | 
109 | 
110 | #### Resources
111 | 
112 | * A [paper](http://journal.r-project.org/archive/2014-1/loo.pdf) on stringdist has been published in the R-journal
113 | * [Slides](http://www.slideshare.net/MarkVanDerLoo/stringdist-use-r2014) of te _useR!2014_ conference.
114 | 
115 | #### Note to users: deprecated arguments removed as of version 0.9.5.0
116 | 
117 | The following arguments have been obsolete since 2015 and have been removed in the 0.9.5.0 release (spring 2018)
118 | 
119 | * Argument `cluster` for function `stringdistmatrix`.
120 | * Argument `maxDist` for functions `stringdist` and `stringdistmatrix` (not `amatch`).
121 | * Argument `ncores` for function `stringdistmatrix` 
122 | 
123 | 
124 | #### Note to users: deprecated arguments as of >= 0.9.0, >= 0.9.2
125 | 
126 | Parallelization used to be based on R's ```parallel``` package, that works by spawning several R sessions in the background. As of version 0.9.0, ```stringdist``` uses the more efficient ```openMP``` protocol to parallelize everything under the hood. 
127 | 
128 | The following arguments have become obsolete and will be removed somewhere in 2016:
129 | * Argument `cluster` for function `stringdistmatrix`.
130 | * Argument `maxDist` for functions `stringdist` and `stringdistmatrix` (not `amatch`).
131 | * Argument `ncores` for function `stringdistmatrix` 
132 | 
133 | 
134 | 


--------------------------------------------------------------------------------
/pkg/R/afind.R:
--------------------------------------------------------------------------------
  1 | #' Stringdist-based fuzzy text search
  2 | #'
  3 | #' \code{afind} slides a window of fixed width over a string \code{x} and
  4 | #' computes the distance between the each window and the sought-after
  5 | #' \code{pattern}. The location, content, and distance corresponding to the
  6 | #' window with the best match is returned. 
  7 | #'
  8 | #'
  9 | #' @param x  strings to search in
 10 | #' @param pattern strings to find (not a regular expression). For \code{grab},
 11 | #' \code{grabl}, and \code{extract} this must be a single string.
 12 | #' @param window  width of moving window.
 13 | #' @param value toggle return matrix with matched strings.
 14 | #' @inheritParams amatch
 15 | #'
 16 | #' @details
 17 | #' Matching is case-sensitive.  Both \code{x} and \code{pattern} are converted
 18 | #' to \code{UTF-8} prior to search, unless \code{useBytes=TRUE}, in which case
 19 | #' the distances are measured bytewise.
 20 | #'
 21 | #' Code is parallelized over the \code{x} variable: each value of \code{x}
 22 | #' is scanned for every element in \code{pattern} using a separate thread (when \code{nthread}
 23 | #' is larger than 1).
 24 | #'
 25 | #' The functions \code{grab} and \code{grabl} are approximate string matching
 26 | #' functions that somewhat resemble base R's \code{\link[base]{grep}} and
 27 | #' \code{\link[base:grep]{grepl}}. They are implemented as convenience wrappers
 28 | #' of \code{afind}. 
 29 | #'
 30 | #' @section Running cosine distance:
 31 | #' This algorithm gains efficiency by using that two consecutive windows have
 32 | #' a large overlap in their q-gram profiles. It gives the same result as
 33 | #' the \code{"cosine"} distance, but much faster.
 34 | #'
 35 | #'
 36 | #' @return
 37 | #' For \code{afind}: a \code{list} of three matrices, each with
 38 | #' \code{length(x)} rows and \code{length(pattern)} columns. In each matrix,
 39 | #' element \eqn{(i,j)} corresponds to \code{x[i]} and \code{pattern[j]}. The 
 40 | #' names and description of each matrix is as follows.
 41 | #' \itemize{
 42 | #' \item{\code{location}. \code{[integer]}, location of the start of best matching window.
 43 | #'       When \code{useBytes=FALSE}, this corresponds to the location of a \code{UTF} code point
 44 | #'       in \code{x}, possibly after conversion from its original encoding.}
 45 | #' \item{\code{distance}. \code{[character]}, the string distance between pattern and
 46 | #'       the best matching window.}
 47 | #' \item{\code{match}. \code{[character]}, the first, best matching window.}
 48 | #' 
 49 | #' }
 50 | #' 
 51 | #' @family matching
 52 | #'
 53 | #' @examples
 54 | #' texts = c("When I grow up, I want to be"
 55 | #'        , "one of the harvesters of the sea"
 56 | #'        , "I think before my days are gone"
 57 | #'        , "I want to be a fisherman")
 58 | #' patterns = c("fish", "gone","to be")
 59 | #'
 60 | #' afind(texts, patterns, method="running_cosine", q=3)
 61 | #'
 62 | #' grabl(texts,"grew", maxDist=1)
 63 | #' extract(texts, "harvested", maxDist=3)
 64 | #'
 65 | #'
 66 | #' @export
 67 | afind <- function(x, pattern, window=NULL
 68 |   , value=TRUE
 69 |   , method = c("osa","lv","dl","hamming","lcs", "qgram","cosine","running_cosine","jaccard","jw","soundex")
 70 |   , useBytes = FALSE
 71 |   , weight=c(d=1,i=1,s=1,t=1) 
 72 |   , q  = 1
 73 |   , p  = 0
 74 |   , bt = 0
 75 |   , nthread = getOption("sd_num_thread")
 76 |   ){
 77 |   
 78 |   stopifnot(
 79 |     all(is.finite(weight))
 80 |     , all(weight > 0)
 81 |     , all(weight <=1)
 82 |     , is.null(window) || window >= 1
 83 |     , q >= 0
 84 |     , p <= 0.25
 85 |     , p >= 0
 86 |     , is.logical(useBytes) && !is.na(useBytes)
 87 |     , is.logical(value) && !is.na(value)
 88 |     , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE)
 89 |     , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE)
 90 |     , length(nthread) == 1
 91 |     , is.numeric(nthread)
 92 |     , nthread > 0
 93 |   )
 94 | 
 95 |   x <- as.character(x)
 96 |   pattern <- as.character(pattern)
 97 |   if ( !useBytes ){
 98 |     x <- enc2utf8(x)
 99 |     pattern <- enc2utf8(pattern)
100 |   }
101 | 
102 |   if (is.null(window)){ 
103 |     window = nchar(pattern, type = if (useBytes) "bytes" else "char")
104 |   }
105 | 
106 |   if (length(x) == 0) return(numeric(0))
107 | 
108 |   method <- match.arg(method)
109 |   if (method == 'jw') weight <- weight[c(2,1,3)]
110 | 
111 | 
112 |   method <- METHODS[method]
113 |   if ( is.na(method) ){
114 |     stop(sprintf("method '%s' is not defined",method))
115 |   }
116 |   
117 |   L <- .Call("R_afind"
118 |     , x
119 |     , pattern
120 |     , as.integer(window)
121 |     , method
122 |     , as.double(weight)
123 |     , as.double(p)
124 |     , as.double(bt)
125 |     , as.integer(q)
126 |     , as.integer(useBytes)
127 |     , as.integer(nthread)
128 |     , PACKAGE="stringdist")
129 |   
130 |   names(L) <- c("location", "distance")
131 | 
132 |   if (isTRUE(value)){
133 |     matches = sapply(seq_along(pattern), function(i){ 
134 |       substr(x, L[[1]][,i], L[[1]][,i] + window[i]-1)
135 |     })
136 |     L$match <- matrix(matches, nrow=length(x))
137 |   }
138 | 
139 |   L
140 | }
141 | 
142 | 
143 | 
144 | 
145 | #' @rdname afind
146 | #' @param ... passed to \code{afind}.
147 | #' @param maxDist Only windows with distance \code{<= maxDist} are considered a match.
148 | #' @return 
149 | #' For \code{grab}, an \code{integer} vector, indicating in which elements of
150 | #' \code{x} a match was found with a distance \code{<= maxDist}. The matched
151 | #' values when \code{value=TRUE} (equivalent to \code{\link[base]{grep}}).
152 | #' @export
153 | grab <- function(x, pattern, maxDist=Inf, value=FALSE, ...){
154 |   stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1)
155 |   L <- afind(x, pattern, value=value, ...)
156 |   if (!value){
157 |     which(L$distance <= maxDist)
158 |   } else {
159 |     L$match[L$distance <= maxDist ]
160 |   }
161 | }
162 | 
163 | #' @rdname afind
164 | #' @param ... passed to \code{afind}.
165 | #' @return 
166 | #' For \code{grabl}, a \code{logical} vector, indicating in which elements of
167 | #' \code{x} a match was found with a distance \code{<= maxDist}.  (equivalent
168 | #' to \code{\link[base:grep]{grepl}}).
169 | #' @export
170 | grabl <- function(x, pattern, maxDist=Inf, ...){
171 |   stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1)
172 |   L <- afind(x, pattern, value=FALSE, ...)
173 |   as.logical(L$distance <= maxDist)
174 | }
175 | 
176 | 
177 | #' @rdname afind
178 | #'
179 | #' @return
180 | #' For \code{extract}, a \code{character} matrix with \code{length(x)} rows and
181 | #' \code{length(pattern)} columns.  If match was found, element \eqn{(i,j)}
182 | #' contains the match, otherwise it is set to \code{NA}.
183 | #' @export
184 | extract <- function(x, pattern, maxDist = Inf, ...){
185 |   stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1)
186 |   L <- afind(x, pattern, value=TRUE, ...)
187 |   out <- L$match
188 |   out[L$distance > maxDist] <- NA_character_
189 |   out
190 | }
191 | 
192 | 
193 | 


--------------------------------------------------------------------------------
/pkg/R/stringsim.R:
--------------------------------------------------------------------------------
  1 | #' Compute similarity scores between strings
  2 | #'
  3 | #' \code{stringsim} computes pairwise string similarities between elements of
  4 | #' \code{character} vectors \code{a} and \code{b}, where the vector with less
  5 | #' elements is recycled. 
  6 | #' \code{stringsimmatrix} computes the string similarity matrix with rows
  7 | #' according to \code{a} and columns according to \code{b}.
  8 | #'
  9 | #' @param a R object (target); will be converted by \code{as.character}.
 10 | #' @param b R object (source); will be converted by \code{as.character}.
 11 | #' @param method Method for distance calculation. The default is \code{"osa"}, 
 12 | #'   see \code{\link{stringdist-metrics}}.
 13 | #' @param useBytes Perform byte-wise comparison, see \code{\link{stringdist-encoding}}.
 14 | #' @param q  Size of the \eqn{q}-gram; must be nonnegative. Only applies to
 15 | #'   \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}.
 16 | #' @param ... additional arguments are passed on to \code{\link{stringdist}} and
 17 | #'   \code{\link{stringdistmatrix}} respectively.
 18 | #' @return
 19 | #' \code{stringsim} returns a vector with similarities, which are values between
 20 | #' 0 and 1 where 1 corresponds to perfect similarity (distance 0) and 0 to
 21 | #' complete dissimilarity. \code{NA} is returned when \code{\link{stringdist}}
 22 | #' returns \code{NA}. Distances equal to \code{Inf} are truncated to a
 23 | #' similarity of 0. \code{stringsimmatrix} works the same way but, equivalent to
 24 | #' \code{\link{stringdistmatrix}}, returns a similarity matrix instead of a
 25 | #' vector.
 26 | #'
 27 | #' @details
 28 | #' The similarity is calculated by first calculating the distance using
 29 | #' \code{\link{stringdist}}, dividing the distance by the maximum
 30 | #' possible distance, and substracting the result from 1. 
 31 | #' This results in a score between 0 and 1, with 1
 32 | #' corresponding to complete similarity and 0 to complete dissimilarity.
 33 | #' Note that complete similarity only means equality for distances satisfying
 34 | #' the identity property. This is not the case e.g. for q-gram based distances
 35 | #' (for example if q=1, anagrams are completely similar).
 36 | #' For distances where weights can be specified, the maximum distance 
 37 | #' is currently computed by assuming that all weights are equal to 1.
 38 | #'
 39 | #' @example ../examples/stringsim.R
 40 | #' @export
 41 | stringsim <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs",
 42 |   "qgram", "cosine", "jaccard", "jw", "soundex"), useBytes=FALSE, q = 1, ...) {
 43 |   # Calculate the distance 
 44 |   method <- match.arg(method)
 45 |   dist <- stringdist::stringdist(a, b, method=method, useBytes=useBytes, q=q, ...)
 46 | 
 47 |   nctype <- if (useBytes) "bytes" else "char"
 48 |   normalize_dist(dist, a, b, method=method, nctype=nctype, q=q)
 49 | }
 50 | 
 51 | 
 52 | #' @rdname stringsim
 53 | #' @export
 54 | stringsimmatrix <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs",
 55 |                                               "qgram", "cosine", "jaccard", "jw", "soundex"), useBytes=FALSE, q = 1, ...) {
 56 |   # Calculate the distance 
 57 |   method <- match.arg(method)
 58 |   nctype <- if (useBytes) "bytes" else "char"
 59 |   if (missing(b)){
 60 |     dist <- stringdist::stringdistmatrix(a, method=method, useBytes=useBytes, q=q, ...)
 61 |     normalize_dist(dist, a= rep(a,length(a)), b = rep(a,each=length(a)), method=method, nctype=nctype, q=q)
 62 |   } else {
 63 |     dist <- stringdist::stringdistmatrix(a, b, method=method, useBytes=useBytes, q=q, ...)
 64 |     normalize_dist(dist, a=rep(a,length(b)), b=rep(b,each=length(a)), method=method, nctype=nctype, q=q)
 65 |   }
 66 | }
 67 | 
 68 | 
 69 | #' Compute similarity scores between sequences of integers
 70 | #' 
 71 | #' @param a \code{list} of \code{integer} vectors (target)
 72 | #' @param b \code{list} of \code{integer} vectors (source). Optional for
 73 | #'   \code{seq_distmatrix}.
 74 | #' @param method Method for distance calculation. The default is \code{"osa"}, 
 75 | #'   see \code{\link{stringdist-metrics}}.
 76 | #' @param q  Size of the \eqn{q}-gram; must be nonnegative. Only applies to
 77 | #'   \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}.
 78 | #' @param ... additional arguments are passed on to \code{\link{seq_dist}}.
 79 | #'
 80 | #' @return 
 81 | #' A \code{numeric} vector of length \code{max(length(a),length(b))}. If one of the
 82 | #' entries in \code{a} or \code{b} is \code{NA_integer_}, all comparisons with that
 83 | #' element result in \code{NA}. Missings occurring within the sequences are treated
 84 | #' as an ordinary number (the representation of \code{NA_integer_}).
 85 | #'   
 86 | #' @example ../examples/seq_sim.R
 87 | #' @seealso \code{\link{seq_dist}}, \code{\link{seq_amatch}}
 88 | #' @export
 89 | seq_sim <- function(a, b, method = c("osa", "lv", "dl", "hamming", "lcs",
 90 |    "qgram", "cosine", "jaccard", "jw"),  q = 1, ...) {
 91 | 
 92 |   method <- match.arg(method)
 93 |   dist <- stringdist::seq_dist(a, b, method=method, q=q, ...)
 94 |   normalize_dist(dist, a, b, method=method, q=q)
 95 | }
 96 | 
 97 | 
 98 | #### HELPER FUNCTIONS ---------------------------------------------------------
 99 | 
100 | # get lengths of sequences (internal function)
101 | lengths <- function(x,...){
102 |   UseMethod("lengths")
103 | }
104 | 
105 | lengths.character <- function(x, type="char",...){
106 |   nchar(x,type=type)
107 | }
108 | 
109 | lengths.list <- function(x,...){
110 |   .Call("R_lengths",x, PACKAGE="stringdist")
111 | }
112 | 
113 | normalize_dist <- function(dist, a, b, method, nctype="char",q=1L){
114 | 
115 |   if (inherits(dist, "dist")) dist <- as.matrix(dist)
116 |   
117 |   # Normalise the distance by dividing it by the maximum possible distance
118 |   if (method == "hamming") {
119 |     max_dist <- if (length(b) > length(a)) lengths(b,type=nctype) else lengths(a,type=nctype)
120 |     max_dist[max_dist == 0] <- 1
121 |     sim <- 1 - dist/max_dist
122 |   } else if (method == "lcs") {
123 |     max_dist <- lengths(a,type=nctype) + lengths(b,type=nctype)
124 |     max_dist[max_dist == 0] <- 1
125 |     sim <- 1 - dist/max_dist
126 |   } else if (method == "lv") {
127 |     max_dist <- pmax(lengths(a,type=nctype), lengths(b,type=nctype))
128 |     max_dist[max_dist == 0] <- 1
129 |     sim <- 1 - dist/max_dist
130 |   } else if (method == "osa") {
131 |     max_dist <- pmax(lengths(a,type=nctype), lengths(b,type=nctype))
132 |     max_dist[max_dist == 0] <- 1
133 |     sim <- 1 - dist/max_dist
134 |   } else if (method == "dl") {
135 |     max_dist <- pmax(lengths(a,type=nctype), lengths(b,type=nctype))
136 |     max_dist[max_dist == 0] <- 1
137 |     sim <- 1 - dist/max_dist
138 |   } else if (method == "qgram") {
139 |     max_dist <- (lengths(a,type=nctype) + lengths(b,type=nctype) - 2*q + 2)
140 |     max_dist[max_dist < 0] <- 1
141 |     sim <- 1 - dist/max_dist
142 |   } else if (method == "cosine") {
143 |     sim <- 1 - dist
144 |   } else if (method == "jaccard") {
145 |     sim <- 1 - dist
146 |   } else if (method == "jw") {
147 |     sim <- 1 - dist
148 |   } else if (method == "soundex") {
149 |     sim <- 1 - dist
150 |   }
151 |   # all metrics can have distances == Inf; for similariy score set these to 0
152 |   sim[sim < 0] <- 0
153 |   sim
154 | }
155 | 
156 | 
157 | 
158 | 


--------------------------------------------------------------------------------
/pkg/src/soundex.c:
--------------------------------------------------------------------------------
  1 | /*  stringdist - a C library of string distance algorithms with an interface to R.
  2 |  *  Copyright (C) 2013  Mark van der Loo
  3 |  *
  4 |  *  This program is free software: you can redistribute it and/or modify
  5 |  *  it under the terms of the GNU General Public License as published by
  6 |  *  the Free Software Foundation, either version 3 of the License, or
  7 |  *  (at your option) any later version.
  8 |  *
  9 |  *  This program is distributed in the hope that it will be useful,
 10 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |  *  GNU General Public License for more details.
 13 |  *
 14 |  *  You should have received a copy of the GNU General Public License
 15 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 16 |  *
 17 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
 18 |  */
 19 | 
 20 | #include "utils.h"
 21 | #include <ctype.h>
 22 | 
 23 | // Translate similar sounding consonants to numeric codes; vowels are all 
 24 | // translated to 'a' and voiceless characters (and other characters) are 
 25 | // translated to 'h'.
 26 | // Upper and lower case ASCII characters are treated as separate cases,
 27 | // avoiding the use of 'tolower' whose effect depends on locale.
 28 | static unsigned int translate_soundex(unsigned int c) {
 29 |   switch ( c ) {
 30 |     case 'b':
 31 |     case 'f':
 32 |     case 'p':
 33 |     case 'v':
 34 |     case 'B':
 35 |     case 'F':
 36 |     case 'P':
 37 |     case 'V':
 38 |       return '1';
 39 |     case 'c':
 40 |     case 'g':
 41 |     case 'j':
 42 |     case 'k':
 43 |     case 'q':
 44 |     case 's':
 45 |     case 'x':
 46 |     case 'z':
 47 |     case 'C':
 48 |     case 'G':
 49 |     case 'J':
 50 |     case 'K':
 51 |     case 'Q':
 52 |     case 'S':
 53 |     case 'X':
 54 |     case 'Z':
 55 |       return '2';
 56 |     case 'd':
 57 |     case 't':
 58 |     case 'D':
 59 |     case 'T':
 60 |       return '3';
 61 |     case 'l':
 62 |     case 'L':
 63 |       return '4';
 64 |     case 'm':
 65 |     case 'n':
 66 |     case 'M':
 67 |     case 'N':
 68 |       return '5';
 69 |     case 'r':
 70 |     case 'R':
 71 |       return '6';
 72 |     case 'h':
 73 |     case 'w':
 74 |     case 'H':
 75 |     case 'W':
 76 |       return 'h';
 77 |     case 'a':
 78 |     case 'e':
 79 |     case 'i':
 80 |     case 'o':
 81 |     case 'u':
 82 |     case 'y':
 83 |     case 'A':
 84 |     case 'E':
 85 |     case 'I':
 86 |     case 'O':
 87 |     case 'U':
 88 |     case 'Y':
 89 |       return 'a'; // use 'a' to encode vowels
 90 |     case '!': // we will allow all printable ASCII characters.
 91 |     case '"':
 92 |     case '#':
 93 |     case '$':
 94 |     case '%':
 95 |     case '&':
 96 |     case '\'':
 97 |     case '(':
 98 |     case ')':
 99 |     case '*':
100 |     case '+':
101 |     case ',':
102 |     case '-':
103 |     case '.':
104 |     case '/':
105 |     case ':':
106 |     case ';':
107 |     case '<':
108 |     case '=':
109 |     case '>':
110 |     case '?':
111 |     case '@':
112 |     case '[':
113 |     case '\\':
114 |     case ']':
115 |     case '^':
116 |     case '_':
117 |     case '`':
118 |     case '{':
119 |     case '|':
120 |     case '}':
121 |     case '~':
122 |     case '0':
123 |     case '1':
124 |     case '2':
125 |     case '3':
126 |     case '4':
127 |     case '5':
128 |     case '6':
129 |     case '7':
130 |     case '8':
131 |     case '9':
132 |     case ' ':
133 |       return 'h'; // ignored characters; voiceless symbols.
134 |     default:
135 |       return '?'; // other characters are ignored with a warning
136 |   }
137 | }
138 | 
139 | // Translate a string to a soundex phonetic code
140 | //
141 | // str: the input string
142 | // str_len: the length of the input string
143 | // result: the character vector in which the soundex code is copied. This 
144 | //    should be a vector of a least length 4.
145 | // output: the number of non-ascii or non-printable ascii characters
146 | // encountered during translation.
147 | static unsigned int soundex(const unsigned int* str, unsigned int str_len, unsigned int* result) {
148 |   if (!str || !result) return 0;
149 |   if (str_len == 0) {
150 |     unsigned int j;
151 |     for (j = 0; j < 4; ++j) result[j] = '0';
152 |     return 0;
153 |   }
154 |   unsigned int i = 0, j = 0, nfail = 0;
155 |   unsigned int cj = translate_soundex(str[j]);
156 |   // the first character is copied directly and not translated to a numerical
157 |   // code
158 |   if ( cj == '?' ){
159 |     // the translated character is non-printable ASCII or non-ASCII.
160 |     ++nfail;
161 |     result[0] = str[0];
162 |   } else {
163 |     result[0] = toupper(str[0]);
164 |   }
165 |   //result[0] = str[0] < 128 ? toupper(str[0]) : str[0];
166 |   for (i = 1; i < str_len && j < 3; ++i) {
167 |     unsigned int ci = translate_soundex(str[i]);
168 |     if (ci == 'a') {
169 |       // vowels are not added to the result; but we do set the previous
170 |       // character to the vower because two consonants with a vowel in between
171 |       // are not merged
172 |       cj = ci;
173 |     } else if (ci != 'h') {
174 |       // a consonant that is not equal to the previous consonant is added to 
175 |       // the result
176 |       if (ci != cj) {
177 |         result[++j] = ci;
178 |         cj = ci;
179 |       }
180 |     }
181 |     if ( ci == '?' ){
182 |       // the translated character is non-printable ASCII or non-ASCII.
183 |       ++nfail;
184 |     }
185 |   }
186 |   // pad with zeros
187 |   for (++j ; j < 4; ++j) result[j] = '0';
188 |   return nfail;
189 | }
190 | 
191 | double soundex_dist(unsigned int *a, int a_len, unsigned int *b, int b_len, unsigned int *nfail) {
192 | 
193 |   unsigned int sa[4];
194 |   unsigned int sb[4];
195 |   (*nfail) += soundex(a, a_len, sa);
196 |   (*nfail) += soundex(b, b_len, sb);
197 |   for (unsigned int i = 0; i < 4; ++i) 
198 |     if (sa[i] != sb[i]) return 1.0;
199 |   return 0.0;
200 | }
201 | 
202 | // ================================ R INTERFACE ===============================
203 | 
204 | static void check_fail(unsigned int nfail){
205 |   if ( nfail > 0 ){
206 |     warning("soundex encountered %d non-printable ASCII or non-ASCII"
207 |     "\n  characters. Results may be unreliable, see ?printable_ascii",nfail);
208 |   }
209 | }
210 | 
211 | SEXP R_soundex(SEXP x, SEXP useBytes) {
212 |   PROTECT(x);
213 |   PROTECT(useBytes);
214 | 
215 |   int n = length(x);
216 |   int bytes = INTEGER(useBytes)[0];
217 | 
218 |   // when a and b are character vectors; create unsigned int vectors in which
219 |   // the elements of and b will be copied
220 |   unsigned int *s = NULL;
221 |   int ml = max_length(x);
222 |   s = (unsigned int *) malloc( (1L+ml) * sizeof(unsigned int));
223 |   if (s == NULL) {
224 |     UNPROTECT(2);
225 |     error("Unable to allocate enough memory");
226 |   }
227 |   if (bytes) {
228 |     // create output variable
229 |     SEXP y = allocVector(STRSXP, n);
230 |     PROTECT(y);
231 |     // compute soundexes, skipping NA's
232 |     unsigned int nfail = 0;
233 |     int len_s, isna_s;
234 |     char sndx[5];
235 |     unsigned int sndx_int[4];
236 |     for (int i = 0; i < n; ++i) {
237 |       get_elem(x, i, bytes,0L, &len_s, &isna_s, s);
238 |       if (isna_s) {
239 |         SET_STRING_ELT(y, i, R_NaString);
240 |       } else { 
241 |         nfail += soundex(s, len_s, sndx_int);
242 |         for (unsigned int j = 0; j < 4; ++j) sndx[j] = (char) sndx_int[j];
243 |         sndx[4] = 0;
244 |         SET_STRING_ELT(y, i, mkChar(sndx));
245 |       } 
246 |     }
247 |     // cleanup and return
248 |     check_fail(nfail);
249 |     free(s);
250 |     UNPROTECT(3);
251 |     return y;
252 |   } else {
253 |     // create output variable
254 |     SEXP y = allocVector(VECSXP, n);
255 |     PROTECT(y);
256 |     // compute soundexes, skipping NA's
257 |     unsigned int nfail = 0;
258 |     int len_s, isna_s;
259 |     for (int i = 0; i < n; ++i) {
260 |       get_elem(x, i, bytes, 0L, &len_s, &isna_s, s);
261 |       if (isna_s) {
262 |         SEXP sndx = allocVector(INTSXP, 1);
263 |         PROTECT(sndx);
264 |         INTEGER(sndx)[0] = NA_INTEGER;
265 |         SET_VECTOR_ELT(y, i, sndx);
266 |         UNPROTECT(1);
267 |       } else { 
268 |         SEXP sndx = allocVector(INTSXP, 4);
269 |         PROTECT(sndx);
270 |         nfail += soundex(s, len_s, (unsigned int *)INTEGER(sndx));
271 |         SET_VECTOR_ELT(y, i, sndx);
272 |         UNPROTECT(1);
273 |       } 
274 |     }
275 |     // cleanup and return
276 |     check_fail(nfail);
277 |     free(s);
278 |     UNPROTECT(3);
279 |     return y;
280 |   }
281 | }
282 | 
283 | 
284 | 
285 | 


--------------------------------------------------------------------------------
/pkg/src/utf8ToInt.c:
--------------------------------------------------------------------------------
  1 | /*  stringdist - a C library of string distance algorithms with an interface to R.
  2 |  *  Copyright (C) 2013  Mark van der Loo
  3 |  *
  4 |  *  This program is free software: you can redistribute it and/or modify
  5 |  *  it under the terms of the GNU General Public License as published by
  6 |  *  the Free Software Foundation, either version 3 of the License, or
  7 |  *  (at your option) any later version.
  8 |  *
  9 |  *  This program is distributed in the hope that it will be useful,
 10 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |  *  GNU General Public License for more details.
 13 |  *
 14 |  *  You should have received a copy of the GNU General Public License
 15 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 16 |  *
 17 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
 18 |  */
 19 | 
 20 | #include <string.h>
 21 | #include "utils.h"
 22 | 
 23 | 
 24 | /* This function is gratefully copied from the R core distribution.
 25 |  * It is replicated here to facilitate multicore processing through
 26 |  * openmp (as it is not part of R's API).
 27 |  *
 28 |  * Convert a single character to integer.
 29 |  *
 30 |  * *s input buffer (must be utf8)
 31 |  * *w output buffer
 32 |  *
 33 |  * value:
 34 |  * >0 : The number of bytes in the multi-byte representation
 35 |  * 0  : End of string reached.
 36 |  * <0 : Invalid input (not interpretable as UTF-8)
 37 |  * 
 38 |  * 
 39 |  */
 40 | static int mbrtoint(unsigned int *w, const char *s)
 41 | {
 42 |     unsigned int byte;
 43 |     byte = *((unsigned char *)s);
 44 | 
 45 |     if (byte == 0) {
 46 |   *w = 0;
 47 |   return 0;
 48 |     } else if (byte < 0xC0) {
 49 |   *w = (int) byte;
 50 |   return 1;
 51 |     } else if (byte < 0xE0) {
 52 |   if (!s[1]) return -2;
 53 |   if ((s[1] & 0xC0) == 0x80) {
 54 |       *w = (int) (((byte & 0x1F) << 6) | (s[1] & 0x3F));
 55 |       return 2;
 56 |   } else return -1;
 57 |     } else if (byte < 0xF0) {
 58 |   if (!s[1] || !s[2]) return -2;
 59 |   if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
 60 |       *w = (int) (((byte & 0x0F) << 12)
 61 |       | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F));
 62 |       byte = *w;
 63 |       if (byte >= 0xD800 && byte <= 0xDFFF) return -1; /* surrogate */
 64 |       if (byte == 0xFFFE || byte == 0xFFFF) return -1;
 65 |       return 3;
 66 |   } else return -1;
 67 |     } else if (byte < 0xF8) {
 68 |   if (!s[1] || !s[2] || !s[3]) return -2;
 69 |   if (((s[1] & 0xC0) == 0x80)
 70 |       && ((s[2] & 0xC0) == 0x80)
 71 |       && ((s[3] & 0xC0) == 0x80)) {
 72 |       *w = (int) (((byte & 0x07) << 18)
 73 |       | ((s[1] & 0x3F) << 12)
 74 |       | ((s[2] & 0x3F) << 6)
 75 |       | (s[3] & 0x3F));
 76 |       byte = *w;
 77 |       return 4;
 78 |   } else return -1;
 79 |     } else if (byte < 0xFC) {
 80 |   if (!s[1] || !s[2] || !s[3] || !s[4]) return -2;
 81 |   if (((s[1] & 0xC0) == 0x80)
 82 |       && ((s[2] & 0xC0) == 0x80)
 83 |       && ((s[3] & 0xC0) == 0x80)
 84 |       && ((s[4] & 0xC0) == 0x80)) {
 85 |       *w = (int) (((byte & 0x03) << 24)
 86 |       | ((s[1] & 0x3F) << 18)
 87 |       | ((s[2] & 0x3F) << 12)
 88 |       | ((s[3] & 0x3F) << 6)
 89 |       | (s[4] & 0x3F));
 90 |       byte = *w;
 91 |       return 5;
 92 |   } else return -1;
 93 |     } else {
 94 |   if (!s[1] || !s[2] || !s[3] || !s[4] || !s[5]) return -2;
 95 |   if (((s[1] & 0xC0) == 0x80)
 96 |       && ((s[2] & 0xC0) == 0x80)
 97 |       && ((s[3] & 0xC0) == 0x80)
 98 |       && ((s[4] & 0xC0) == 0x80)
 99 |       && ((s[5] & 0xC0) == 0x80)) {
100 |       *w = (int) (((byte & 0x01) << 30)
101 |       | ((s[1] & 0x3F) << 24)
102 |       | ((s[2] & 0x3F) << 18)
103 |       | ((s[3] & 0x3F) << 12)
104 |       | ((s[5] & 0x3F) << 6)
105 |       | (s[5] & 0x3F));
106 |       byte = *w;
107 |       return 6;
108 |   } else return -1;
109 |     }
110 |     /* return -2; not reached */
111 | }
112 | 
113 | 
114 | /* Translate a UTF-8 string to integers.
115 |  *
116 |  * Input
117 |  *
118 |  * str   : pointer to input string.
119 |  * outbuf: pointer to output buffer.
120 |  *
121 |  * Returns:
122 |  * The number of logical characters converted.
123 |  *
124 |  */
125 | static int utf8_to_int(const char *str, unsigned int *outbuf){
126 | 
127 |   unsigned int *p = outbuf;
128 |   char *s = (char *) str;
129 |   int nbytes
130 |     , str_len = 0L;
131 | 
132 |   for (int i=0;; i++){
133 |     nbytes = mbrtoint(p, s);
134 |     if (nbytes > 0){
135 |       p += 1L;
136 |       str_len += 1L;
137 |       s += nbytes;
138 |     } else if (nbytes == 0L ){
139 |       return str_len;
140 |     } else if (nbytes == -1L) { // non-utf-8 sequence encountered.
141 |       return -1; 
142 |     }
143 |   }
144 | }
145 | 
146 | 
147 | // Get one element from x (VECSXP or STRSXP) convert to usigned int if necessary and store in c
148 | // TODO: this can probably be a bit optimized by decreasing the use of the *_ELT macros.
149 | unsigned int *get_elem(SEXP x, R_xlen_t i, int bytes, int intdist, int *len, int *isna, unsigned int *c){
150 | 
151 |   if ( intdist ){
152 |     // we need a copy with trailing zero in this case since some distances 
153 |     // (e.g the dl-distance) expects this
154 |     *isna = ( INTEGER(VECTOR_ELT(x,i))[0] == NA_INTEGER );
155 |     (*len) = length(VECTOR_ELT(x,i));
156 |     // this implicitly converts from int to unsigned int (but that should not influence the result).
157 |     memcpy(c , INTEGER(VECTOR_ELT(x,i)), (*len) * sizeof(int));
158 |     c[*len] = 0;
159 |   } else {
160 |     *isna = ( STRING_ELT(x,i) == NA_STRING );
161 |     if (bytes){
162 |       (*len)  = length(STRING_ELT(x,i));
163 |       for (int j=0; j < *len; j++ ){
164 |         c[j] = CHAR(STRING_ELT(x,i))[j];
165 |       }
166 |         c[*len] = 0;
167 |     } else {
168 |       (*len)  = utf8_to_int( CHAR(STRING_ELT(x,i)), c);
169 |       if ( *len < 0 ){
170 |         error("Encountered byte sequence not representing an utf-8 character.\n");
171 |       }
172 |     }
173 |   }
174 |   return  c;
175 | }
176 | 
177 | /* byte-by-byte char to int translation 
178 |  *
179 |  *
180 |  * Return value: the number of bytes converted.
181 |  *
182 |  *
183 |  */
184 | static int char_to_int(const char *str, unsigned int *outbuf){
185 |   unsigned int *p = outbuf;
186 |   char *s = (char *) str;
187 |   int str_len = 0L;
188 |   while (*s){
189 |     (*p) =  (unsigned int) *s;
190 |     p++;
191 |     s++;
192 |     str_len++;
193 |   }
194 |   return str_len;
195 | }
196 | 
197 | Stringset *new_stringset(SEXP str, int bytes, int intdist){
198 |   size_t nstr = length(str);
199 |   Stringset *s;
200 |   s = (Stringset *) malloc(sizeof(Stringset));
201 | 
202 |   // get and set string lengths.
203 |   s->str_len = (int *) malloc(nstr * sizeof(int));
204 | 
205 |   size_t nbytes = 0L;
206 | 
207 |   if ( intdist ){
208 |     for (size_t i=0; i<nstr; i++){
209 |       nbytes += length(VECTOR_ELT(str,i));
210 |     }
211 |   } else {
212 |     for (size_t i=0; i<nstr; i++){
213 |       nbytes += length(STRING_ELT(str,i));
214 |     }
215 |   }
216 | 
217 |   s->string = (unsigned int **) malloc(nstr * sizeof(int *));
218 |   // room for int rep of strings, including a trailing zero (needed by e.g. by full dl-distance)
219 |   // this is enough room for byte-by-byte translation, so for UTF-8 it will be too much.
220 |   s->data = (unsigned int *) malloc( (nstr + nbytes) * sizeof(int));
221 | 
222 |   int *t = s->str_len;
223 |   unsigned int *d = s->data;
224 | 
225 |   if ( intdist ){
226 |     for (size_t i=0L; i < nstr; i++, t++){
227 |       if ( INTEGER(VECTOR_ELT(str,i))[0] == NA_INTEGER ){
228 |         (*t) = NA_INTEGER; 
229 |       } else {
230 |         (*t) = length(VECTOR_ELT(str,i));
231 |         memcpy(d, INTEGER(VECTOR_ELT(str,i)), (*t)*sizeof(int) );
232 |         s->string[i] = d;
233 |         (*(d + (*t))) = 0L; // append a zero.
234 |         d += (*t) + 1L;
235 |       }
236 |     }
237 |   } else if ( bytes ){
238 |     for (size_t i=0L; i < nstr; i++, t++){
239 |       if ( STRING_ELT(str,i) == NA_STRING ){
240 |         (*t) = NA_INTEGER; 
241 |       } else {
242 |         (*t) = char_to_int(CHAR(STRING_ELT(str,i)), d);
243 |         s->string[i] = d;
244 |         (*(d + (*t))) = 0L; // append a zero.
245 |         d += (*t) + 1L;
246 |       }
247 |     }
248 |   } else {
249 |     for (size_t i=0L; i < nstr; i++, t++){
250 |       if ( STRING_ELT(str,i) == NA_STRING ){
251 |         (*t) = NA_INTEGER; 
252 |       } else {
253 |         (*t) = utf8_to_int(CHAR(STRING_ELT(str,i)), d); 
254 |         s->string[i] = d;
255 |         (*(d + (*t))) = 0L; // append a zero.
256 |         d += (*t) + 1L;
257 |       }
258 |     }
259 |   }
260 | 
261 |   return s;
262 | }
263 | 
264 | void free_stringset(Stringset *s){
265 |   free(s->string);
266 |   free(s->data);
267 |   free(s->str_len);
268 |   free(s);  
269 | }
270 | 
271 | 
272 | 
273 | 


--------------------------------------------------------------------------------
/pkg/R/doc_metrics.R:
--------------------------------------------------------------------------------
  1 | #' @title 
  2 | #' String metrics in \pkg{stringdist}
  3 | #' 
  4 | #' @description
  5 | #' This page gives an overview of the string dissimilarity measures offered by
  6 | #' \pkg{stringdist}.
  7 | #' 
  8 | #' @section String Metrics:
  9 | #' String metrics are ways of quantifying the dissimilarity between two finite
 10 | #' sequences, usually text strings. Over the years, many such measures have been
 11 | #' developed. Some are based on a mathematical understanding of the set of all 
 12 | #' strings that can be composed from a finite alphabet, others are based on more
 13 | #' heuristic principles, such as how a text string sounds when pronounced by a 
 14 | #' native English speaker.
 15 | #'
 16 | #' The terms 'string metrics' and 'string distance' are used more or less
 17 | #' interchangibly in literature. From a mathematical point of view, string
 18 | #' metrics often do not obey the demands that are usually required from a
 19 | #' distance function. For example, it is not true for all string metrics that a
 20 | #' distance of 0 means that two strings are the same (e.g. in the \eqn{q}-gram
 21 | #' distance). Nevertheless, string metrics are very useful in practice and have
 22 | #' many applications.
 23 | #'
 24 | #' The metric you need to choose for an application strongly depends on both the
 25 | #' nature of the string (what does the string represent?) and the cause of
 26 | #' dissimilarities between the strings you are measuring. For example, if you
 27 | #' are comparing human-typed names that may contain typo's, the Jaro-Winkler
 28 | #' distance may be of use. If you are comparing names that were written down
 29 | #' after hearing them, a phonetic distance may be a better choice.
 30 | #'
 31 | #' Currently, the following distance metrics are supported by \pkg{stringdist}.
 32 | #' \tabular{ll}{
 33 | #'    \bold{Method name} \tab \bold{Description}\cr
 34 | #'    \code{osa} \tab Optimal string aligment, (restricted Damerau-Levenshtein distance).\cr
 35 | #'    \code{lv} \tab Levenshtein distance (as in R's native \code{\link[utils]{adist}}).\cr
 36 | #'    \code{dl} \tab Full Damerau-Levenshtein distance.\cr
 37 | #'    \code{hamming}  \tab Hamming distance (\code{a} and \code{b} must have same nr of characters).\cr
 38 | #'    \code{lcs} \tab Longest common substring distance.\cr
 39 | #'    \code{qgram} \tab \eqn{q}-gram distance. \cr
 40 | #'    \code{cosine} \tab cosine distance between \eqn{q}-gram profiles \cr
 41 | #'    \code{jaccard} \tab Jaccard distance between \eqn{q}-gram profiles \cr
 42 | #'    \code{jw} \tab Jaro, or Jaro-Winkler distance.\cr
 43 | #'    \code{soundex} \tab Distance based on soundex encoding (see below)
 44 | #' }
 45 | #'
 46 | #'
 47 | #' @section A short description of string metrics supported by \pkg{stringdist}:
 48 | #'
 49 | #' See \href{https://journal.r-project.org/archive/2014-1/loo.pdf}{Van der Loo
 50 | #' (2014)} for an extensive description and references. The review papers of
 51 | #' Navarro (2001) and Boytsov (2011) provide excellent technical overviews of
 52 | #' respectively online and offline string matching algorithms.
 53 | #' 
 54 | #' The \bold{Hamming distance} (\code{method='hamming'}) counts the number of 
 55 | #' character substitutions that turns \code{b} into \code{a}. If \code{a} 
 56 | #' and \code{b} have different number of characters the distance is \code{Inf}. 
 57 | #'
 58 | #' The \bold{Levenshtein distance} (\code{method='lv'}) counts the number of 
 59 | #' deletions, insertions and substitutions necessary to turn \code{b} into 
 60 | #' \code{a}. This method is equivalent to \code{R}'s native \code{\link[utils]{adist}} 
 61 | #' function. 
 62 | #'
 63 | #' The \bold{Optimal String Alignment distance} (\code{method='osa'}) is like the Levenshtein 
 64 | #' distance but also allows transposition of adjacent characters. Here, each 
 65 | #' substring  may be edited only once. (For example, a character cannot be transposed twice
 66 | #' to move it forward in the string). 
 67 | #'
 68 | #' The \bold{full Damerau-Levenshtein distance} (\code{method='dl'}) is like the optimal 
 69 | #' string alignment distance except that it allows for multiple edits on substrings. 
 70 | #'
 71 | #' The \bold{longest common substring} (method='lcs') is defined as the longest string that can be 
 72 | #' obtained by pairing characters from \code{a} and \code{b} while keeping the order 
 73 | #' of characters intact. The \bold{lcs-distance} is defined as the number of unpaired characters. 
 74 | #' The distance is equivalent to the edit distance allowing only deletions and insertions, 
 75 | #' each with weight one. 
 76 | #'
 77 | #' A \bold{\eqn{q}-gram} (method='qgram') is a subsequence of \eqn{q} \emph{consecutive} 
 78 | #' characters of a string. If \eqn{x} (\eqn{y}) is the vector of counts
 79 | #' of \eqn{q}-gram occurrences in \code{a} (\code{b}), the \bold{\eqn{q}-gram distance} 
 80 | #' is given by the sum over the absolute differences \eqn{|x_i-y_i|}.
 81 | #' The computation is aborted when \code{q} is is larger than the length of 
 82 | #' any of the strings. In that case \code{Inf}  is returned.
 83 | #'
 84 | #' The \bold{cosine distance} (method='cosine') is computed as \eqn{1-x\cdot
 85 | #' y/(\|x\|\|y\|)}, where \eqn{x} and \eqn{y} were defined above.
 86 | #'
 87 | #' Let \eqn{X} be the set of unique \eqn{q}-grams in \code{a} and \eqn{Y} the set of unique 
 88 | #' \eqn{q}-grams in \code{b}. The \bold{Jaccard distance} (\code{method='jaccard'}) is given by \eqn{1-|X\cap Y|/|X\cup Y|}.
 89 | #'
 90 | #' The \bold{Jaro distance} (\code{method='jw'}, \code{p=0}), is a number
 91 | #' between 0 (exact match) and 1 (completely dissimilar) measuring 
 92 | #' dissimilarity between strings.  It is defined to be 0 when both strings have
 93 | #' length 0, and 1 when  there are no character matches between \code{a} and
 94 | #' \code{b}.  Otherwise, the Jaro distance is defined as 
 95 | #' \eqn{1-(1/3)(w_1m/|a| + w_2m/|b| + w_3(m-t)/m)}. 
 96 | #' Here,\eqn{|a|} indicates the number of characters in \code{a}, \eqn{m} is
 97 | #' the number of character matches and \eqn{t} the number of transpositions of
 98 | #' matching characters. The \eqn{w_i} are weights associated with the characters
 99 | #' in \code{a}, characters in \code{b} and with transpositions.  A character
100 | #' \eqn{c} of \code{a} \emph{matches} a character from \code{b} when \eqn{c}
101 | #' occurs in \code{b}, and the index of \eqn{c} in \code{a} differs less than
102 | #' \eqn{\max(|a|,|b|)/2 -1} (where we use integer division) from the index of
103 | #' \eqn{c} in \code{b}. Two matching characters are transposed when they are
104 | #' matched but they occur in different order in string \code{a} and \code{b}.
105 | #'  
106 | #' The \bold{Jaro-Winkler distance} (\code{method=jw}, \code{0<p<=0.25}) adds a
107 | #' correction term to the Jaro-distance. It is defined as \eqn{d - l\cdot p\cdot d}, where
108 | #' \eqn{d} is the Jaro-distance. Here,  \eqn{l} is obtained by counting, from
109 | #' the start of the input strings, after how many characters the first
110 | #' character mismatch between the two strings occurs, with a maximum of four. The
111 | #' factor \eqn{p} is a 'prefix' factor, which in the work of Winkler is often
112 | #' chosen \eqn{0.1}.
113 | #'
114 | #' For the \bold{soundex} distance (method='soundex'), strings are translated to a soundex code 
115 | #' (see \code{\link{phonetic}} for a specification). The
116 | #' distance between strings is 0 when they have the same soundex code,
117 | #' otherwise 1. Note that soundex recoding is only meaningful for characters
118 | #' in the ranges a-z and A-Z. A warning is emitted when non-printable or non-ascii
119 | #' characters are encountered. Also see \code{\link{printable_ascii}}.
120 | #'
121 | #' The \bold{running_cosine} distance is an implementatation of the cosine
122 | #' distance especially meant for fuzzy text search as in \code{\link{afind}}.
123 | #' In fuzzy search a window of \code{n} characters slides across a (long)
124 | #' string while for each position of the window the distance between the part
125 | #' of the string in the window and a search pattern is computed. The (position
126 | #' of) the window with the shortest distance to the search pattern is returned.
127 | #' Sliding the window with a single position only affects the \eqn{q}-grams at
128 | #' the beginning and end of the window, and the 'running cosine' distance uses
129 | #' this and a few other tricks to save calculations.
130 | #' 
131 | #'
132 | #'
133 | #' @references
134 | #'
135 | #' \itemize{
136 | #' \item{
137 | #'   MPJ van der Loo (2014) \emph{The stringdist package for approximate string matching}. The R Journal \bold{6}(1) 111-122.
138 | #' }
139 | #' \item{
140 | #'  L. Boytsov (2011). \emph{Indexing methods for approximate dictionary searching: comparative analyses}. ACM Journal of experimental
141 | #'  algorithmics \bold{16} 1-88.
142 | #' }
143 | #' \item{
144 | #'  G. Navarro (2001). \emph{A guided tour to approximate string matching}. ACM Computing Surveys \bold{33} 31-88.
145 | #' }
146 | #' }
147 | #' @seealso
148 | #' \itemize{ 
149 | #'  \item{Functions applying string metrics to text: \code{\link{stringdist}},
150 | #'    \code{\link{stringdistmatrix}}, \code{\link{amatch}}}
151 | #'  \item{Functions applying string metrics to integer sequences:
152 | #'    \code{\link{seq_dist}}, \code{\link{seq_distmatrix}}, \code{\link{seq_amatch}} }
153 | #'  \item{Encoding issues: \code{\link{stringdist-encoding}}  }
154 | #' }
155 | #' 
156 | #' @name stringdist-metrics
157 | #' @rdname stringdist-metrics
158 | {}
159 | 
160 | 
161 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_amatch.R:
--------------------------------------------------------------------------------
  1 | options(sd_num_thread=2)
  2 | ## amatch: Optimal String Alignment
  3 | # tests against cases that used to segfault when we did not check
  4 | # NULL cases.
  5 | expect_error(amatch("a","b",nthread=1:4))
  6 | expect_error(amatch("a","b",nthread="foo"))
  7 | expect_error(amatch("a","b",nthread=integer(0)))
  8 | expect_error(amatch("a","b",nthread=NULL))
  9 | 
 10 | 
 11 | ## simple test and multiple edge cases
 12 |   expect_equal(amatch("aa",c("ba","bb"), method="osa",maxDist=1L), 1L)
 13 |   expect_equal(amatch("aa",c("bb","bb"), method="osa",maxDist=1L), NA_integer_)
 14 |   expect_equal(amatch("aa",c("bbb"), method="osa",maxDist=2L), NA_integer_)
 15 |   expect_equal(amatch("bbb",c("aa"), method="osa",maxDist=2L), NA_integer_)
 16 |   expect_equal(amatch("","", method="osa"), 1L)
 17 |   expect_equal(amatch(NA,"a", method="osa"), NA_integer_)
 18 |   expect_equal(amatch(NA,"a", method="osa", nomatch=0L),0L)
 19 |   expect_equal(amatch(NA,NA, method="osa"), 1L)
 20 |   expect_equal(amatch(NA,NA, method="osa",matchNA=FALSE), NA_integer_)
 21 |   expect_equal(amatch(NA,NA, method="osa",matchNA=FALSE, nomatch=0L), 0L)
 22 |   expect_equal(amatch(NA,NA, method="osa",matchNA=FALSE, nomatch=7L), 7L)
 23 |   expect_equal(amatch("aa","bb", method="osa",maxDist=1), NA_integer_)
 24 |   expect_equal(amatch("aa","bb", method="osa",maxDist=1), NA_integer_)
 25 |   expect_equal(amatch(c("m","fem"),c("male","female"),method="osa",maxDist=Inf), c(1,2))
 26 | 
 27 | 
 28 | 
 29 | 
 30 | ## amatch: Damerau-Levenshtein
 31 | 
 32 | ## simple test and multiple edge cases
 33 |   expect_equal(amatch("aa", c("ba","bb"), method="dl",maxDist=1L), 1L)
 34 |   expect_equal(amatch("aa",c("bb","bb"), method="dl",maxDist=1L), NA_integer_)
 35 |   expect_equal(amatch("aa",c("bbb"), method="dl",maxDist=2L), NA_integer_)
 36 |   expect_equal(amatch("bbb",c("aa"), method="dl",maxDist=2L), NA_integer_)
 37 |   expect_equal(amatch("","", method="dl"), 1L)
 38 |   expect_equal(amatch(NA,"a", method="dl"), NA_integer_)
 39 |   expect_equal(amatch(NA,"a", method="dl",nomatch=0L), 0L)
 40 |   expect_equal(amatch(NA,NA, method="dl"), 1L)
 41 |   expect_equal(amatch(NA,NA, method="dl",matchNA=FALSE), NA_integer_)
 42 |   expect_equal(amatch(NA,NA, method="dl",matchNA=FALSE,nomatch=0L), 0L)
 43 |   expect_equal(amatch(NA,NA, method="dl",matchNA=FALSE,nomatch=7L), 7L)
 44 |   expect_equal(amatch("aa","bb", method="dl",maxDist=1), NA_integer_)
 45 |   expect_equal(amatch("aa","bb", method="dl",maxDist=1), NA_integer_)
 46 |   expect_equal(amatch(c("m","fem"),c("male","female"),method="dl",maxDist=Inf), c(1,2))
 47 | 
 48 | 
 49 | ## amatch: Hamming
 50 | 
 51 | ## simple test and multiple edge cases
 52 |   expect_equal(amatch("aa", c("ba","bb"), method="hamming",maxDist=1L), 1L)
 53 |   expect_equal(amatch("aa",c("bb","bb"), method="hamming",maxDist=1L), NA_integer_)
 54 |   expect_equal(amatch(NA,c(NA,NA),method="hamming"),1L)
 55 |   expect_equal(amatch("","", method="hamming"), 1L)
 56 |   expect_equal(amatch(NA,"a", method="hamming"), NA_integer_)
 57 |   expect_equal(amatch(NA,"a", method="hamming",nomatch=0L), 0L)
 58 |   expect_equal(amatch(NA,NA, method="hamming"), 1L)
 59 |   expect_equal(amatch(NA,NA, method="hamming",matchNA=FALSE), NA_integer_)
 60 |   expect_equal(amatch(NA,NA, method="hamming",matchNA=FALSE,nomatch=0L), 0L)
 61 |   expect_equal(amatch(NA,NA, method="hamming",matchNA=FALSE,nomatch=7L), 7L)
 62 |   expect_equal(amatch("aa","bb", method="hamming",maxDist=1), NA_integer_)
 63 | 
 64 | 
 65 | 
 66 | ## amatch: Jaro and Jaro-Winkler
 67 | 
 68 | ## simple test and multiple edge cases
 69 |   expect_equal(amatch("aa", c("ba","bb"), method="jw",maxDist=1L), 1L)
 70 |   expect_equal(amatch("aa",c("bb","bb"), method="jw",maxDist=0.5), NA_integer_)
 71 |   expect_equal(amatch(NA,c(NA,NA),method="jw"),1L)
 72 |   expect_equal(amatch("","", method="jw"), 1L)
 73 |   expect_equal(amatch(NA,"a", method="jw"), NA_integer_)
 74 |   expect_equal(amatch(NA,"a", method="jw",nomatch=0L), 0L)
 75 |   expect_equal(amatch(NA,NA, method="jw"), 1L)
 76 |   expect_equal(amatch(NA,NA, method="jw",matchNA=FALSE), NA_integer_)
 77 |   expect_equal(amatch(NA,NA, method="jw",matchNA=FALSE,nomatch=0L), 0L)
 78 |   expect_equal(amatch(NA,NA, method="jw",matchNA=FALSE,nomatch=7L), 7L)
 79 |   expect_equal(amatch(c("m","fem"),c("male","female"),method="jw",maxDist=Inf), c(1,2))
 80 | 
 81 | ## amatch: Longest Common Substring
 82 | 
 83 | ## simple test and multiple edge cases
 84 |   expect_equal(amatch("aa", c("ba","bb"), method="lcs",maxDist=2L), 1L)
 85 |   expect_equal(amatch("aa",c("bb","bb"), method="lcs",maxDist=1L), NA_integer_)
 86 |   expect_equal(amatch("aa",c("bbb"), method="lcs",maxDist=2L), NA_integer_)
 87 |   expect_equal(amatch("bbb",c("aa"), method="lcs",maxDist=2L), NA_integer_)
 88 |   expect_equal(amatch(NA,c(NA,NA),method="lcs"),1L)
 89 |   expect_equal(amatch("","", method="lcs"), 1L)
 90 |   expect_equal(amatch(NA,"a", method="lcs"), NA_integer_)
 91 |   expect_equal(amatch(NA,"a", method="lcs",nomatch=0L), 0L)
 92 |   expect_equal(amatch(NA,NA, method="lcs"), 1L)
 93 |   expect_equal(amatch(NA,NA, method="lcs",matchNA=FALSE), NA_integer_)
 94 |   expect_equal(amatch(NA,NA, method="lcs",matchNA=FALSE,nomatch=0L), 0L)
 95 |   expect_equal(amatch(NA,NA, method="lcs",matchNA=FALSE,nomatch=7L), 7L)
 96 | 
 97 | 
 98 | ## amatch: Levenshtein
 99 | 
100 | ## simple test and multiple edge cases
101 |   expect_equal(amatch("aa", c("ba","bb"), method="lv",maxDist=1L), 1L)
102 |   expect_equal(amatch("aa",c("bb","bb"), method="lv",maxDist=1L), NA_integer_)
103 |   expect_equal(amatch("aa",c("bbb"), method="lv",maxDist=2L), NA_integer_)
104 |   expect_equal(amatch("bbb",c("aa"), method="lv",maxDist=2L), NA_integer_)
105 |   expect_equal(amatch(NA,c(NA,NA),method="lv"),1L)
106 |   expect_equal(amatch("","", method="lv"), 1L)
107 |   expect_equal(amatch(NA,"a", method="lv"), NA_integer_)
108 |   expect_equal(amatch(NA,"a", method="lv",nomatch=0L), 0L)
109 |   expect_equal(amatch(NA,NA, method="lv"), 1L)
110 |   expect_equal(amatch(NA,NA, method="lv",matchNA=FALSE), NA_integer_)
111 |   expect_equal(amatch(NA,NA, method="lv",matchNA=FALSE,nomatch=0L), 0L)
112 |   expect_equal(amatch(NA,NA, method="lv",matchNA=FALSE,nomatch=7L), 7L)
113 |   expect_equal(amatch(c("m","fem"),c("male","female"),method="lv",maxDist=Inf), c(1,2))
114 | 
115 | 
116 | ## amatch: qgrams
117 | 
118 | ## simple test and multiple edge cases
119 |   expect_equal(amatch("aa", c("ba","bb"), method="qgram",maxDist=2), 1L)
120 |   expect_equal(amatch("aa",c("bb","bb"), method="qgram",maxDist=1L), NA_integer_)
121 |   expect_equal(amatch(NA,c(NA,NA),method="qgram"),1L)
122 |   expect_equal(amatch("","", method="qgram", q=0), 1L)
123 |   expect_equal(amatch(NA,"a", method="qgram"), NA_integer_)
124 |   expect_equal(amatch(NA,"a", method="qgram",nomatch=0L), 0L)
125 |   expect_equal(amatch(NA,NA, method="qgram"), 1L)
126 |   expect_equal(amatch(NA,NA, method="qgram",matchNA=FALSE), NA_integer_)
127 |   expect_equal(amatch(NA,NA, method="qgram",matchNA=FALSE,nomatch=0L), 0L)
128 |   expect_equal(amatch(NA,NA, method="qgram",matchNA=FALSE,nomatch=7L), 7L)
129 |   # bug reported by Robert Carlson
130 |   expect_equal( 
131 |       amatch("600 EXAMPLE AVE NJ 8629", 
132 |         c("2100 EXAMPLE AVE NJ 8619", "600 EXAMPLE AVE NJ 8629"), method="jaccard")
133 |       , 2L)
134 | 
135 | 
136 | 
137 | ## amatch: Soundex
138 | 
139 | ## simple test and multiple edge cases
140 |   expect_equal(amatch("smith", c("smyth","smelt"), method="soundex"), 1L)
141 |   expect_equal(amatch("smith",c("bb","bb"), method="soundex"), NA_integer_)
142 |   expect_equal(amatch("smith",c("whashington"), method="soundex"), NA_integer_)
143 |   expect_equal(amatch("washington",c("smith"), method="soundex"), NA_integer_)
144 |   expect_equal(amatch(NA,c(NA,NA),method="soundex"),1L)
145 |   expect_equal(amatch("","", method="soundex"), 1L)
146 |   expect_equal(amatch(NA,"a", method="soundex"), NA_integer_)
147 |   expect_equal(amatch(NA,"a", method="soundex",nomatch=0L), 0L)
148 |   expect_equal(amatch(NA,NA, method="soundex"), 1L)
149 |   expect_equal(amatch(NA,NA, method="soundex",matchNA=FALSE), NA_integer_)
150 |   expect_equal(amatch(NA,NA, method="soundex",matchNA=FALSE,nomatch=0L), 0L)
151 |   expect_equal(amatch(NA,NA, method="soundex",matchNA=FALSE,nomatch=7L), 7L)
152 | 
153 | 
154 | ## amatch: useBytes
155 | 
156 | ## bytewise matching differs from character wise matching
157 |   x <- paste0('Mot',intToUtf8(0x00F6),'rhead') 
158 |   y <- c('bastard','Motorhead') 
159 |   jwdist <- round(1-(1/3)*(8/9 + 8/10 + 1),3)
160 | 
161 |   expect_equal(amatch(x, y, method='dl', maxDist=2, useBytes=TRUE), 2);
162 |   expect_equal(amatch(x, y, method='dl', maxDist=1, nomatch=0L, useBytes=TRUE), 0L);
163 |   expect_equal(amatch(x, x, method='hamming',maxDist=1L, useBytes=TRUE),1); 
164 |   expect_equal(amatch(x, y, method='hamming',nomatch=0L,useBytes=TRUE), 0L);
165 |   expect_equal(amatch(x, y, method='jw', maxDist=1.0, useBytes=TRUE), 2);
166 |   expect_equal(amatch(x, y, method='jw', maxDist=jwdist-0.01, nomatch=0L, useBytes=TRUE), 0L);
167 |   expect_equal(amatch(x, y, method='lcs',maxDist=3, useBytes=TRUE),2L); 
168 |   expect_equal(amatch(x, y, method='lcs',maxDist=2, useBytes=TRUE, nomatch=0L), 0L);
169 |   expect_equal(amatch(x, y, method='lv',maxDist=2, useBytes=TRUE),2L); 
170 |   expect_equal(amatch(x, y, method='lv',maxDist=1, useBytes=TRUE, nomatch=0L), 0L);
171 |   expect_equal(amatch(x, y, method='osa',maxDist=2, useBytes=TRUE),2L); 
172 |   expect_equal(amatch(x, y, method='osa',maxDist=1, useBytes=TRUE, nomatch=0L), 0L);
173 |   expect_equal(amatch(x, y, method='qgram',maxDist=7, q=3, useBytes=TRUE),2L); 
174 |   expect_equal(amatch(x, y, method='qgram',maxDist=6, q=3, useBytes=TRUE, nomatch=0L), 0L);
175 | 
176 | 
177 | ## seq_amatch
178 | 
179 | ## Input checks for seq_amatch
180 |   expect_equal(seq_amatch(list(1:10),list(1:10)),seq_amatch(1:10,1:10))
181 |   expect_equal(seq_amatch(list(1:10),list(1:10)),seq_amatch(as.numeric(1:10),as.numeric(1:10)))
182 | 
183 | 
184 | ## Some elementary tests for seq_amatch and seq_ain
185 |   
186 |   x <- list(c(1L,3L,2L))
187 |   table <- list(
188 |     c(2L,3L,4L)
189 |     ,c(1L,2L,3L)
190 |   )
191 |   expect_equal(seq_amatch(x,table,maxDist=3), 2)
192 |   expect_equal(seq_amatch(list(NA_integer_),table,maxDist=3),NA_integer_ )
193 |   expect_true(seq_ain(x,table,maxDist=3))
194 |   expect_false(seq_ain(x,table))
195 | 
196 | 
197 | expect_error(seq_amatch(x,table,nthread=1:4))
198 | expect_error(seq_amatch(x,table,nthread="foo"))
199 | expect_error(seq_amatch(x,table,nthread=integer(0)))
200 | expect_error(seq_amatch(x,table,nthread=NULL))
201 | 
202 | 
203 | 


--------------------------------------------------------------------------------
/pkg/R/amatch.R:
--------------------------------------------------------------------------------
  1 | #' Approximate string matching
  2 | #'
  3 | #' Approximate string matching equivalents of \code{R}'s native
  4 | #' \code{\link[base]{match}} and \code{\%in\%}.
  5 | #'
  6 | #' \code{ain} is currently defined as 
  7 | #' 
  8 | #' \code{ain(x,table,...) <- function(x,table,...) amatch(x, table, nomatch=0,...) > 0}
  9 | #'
 10 | #'
 11 | #' @section Note on \code{NA} handling:
 12 | #' \code{R}'s native \code{\link[base]{match}} function matches \code{NA} with
 13 | #' \code{NA}. This may feel inconsistent with \code{R}'s usual \code{NA}
 14 | #' handling, since for example \code{NA==NA} yields
 15 | #' \code{NA} rather than \code{TRUE}. In most cases, one may reason about the
 16 | #' behaviour under \code{NA} along the lines of ``if one of the arguments is
 17 | #' \code{NA}, the result shall be \code{NA}'', simply because not all
 18 | #' information necessary to execute the function is available. One uses special 
 19 | #' functions such as \code{is.na}, \code{is.null} \emph{etc.} to handle special
 20 | #' values.
 21 | #'
 22 | #' The \code{amatch} function mimics the behaviour of \code{\link[base]{match}}
 23 | #' by default: \code{NA} is matched with \code{NA} and with nothing else. Note
 24 | #' that this is inconsistent with the behaviour of \code{\link{stringdist}} 
 25 | #' since \code{stringdist} yields \code{NA} when at least one of the arguments
 26 | #' is \code{NA}. The same inconsistency exists between \code{\link[base]{match}}
 27 | #' and \code{\link[utils]{adist}}. In \code{amatch} this behaviour can be
 28 | #' controlled by setting \code{matchNA=FALSE}. In that case, if any of the
 29 | #' arguments in \code{x} is \code{NA}, the \code{nomatch} value is returned,
 30 | #' regardless of whether \code{NA} is present in \code{table}. In
 31 | #' \code{\link[base]{match}} the behaviour can be controlled by setting the
 32 | #' \code{incomparables} option.
 33 | #'
 34 | #' 
 35 | #' @param x elements to be approximately matched: will be coerced to 
 36 | #'   \code{character} unless it is a list consisting of \code{integer} vectors.
 37 | #' @param table lookup table for matching. Will be coerced to \code{character}
 38 | #'   unless it is a list consting of \code{integer} vectors.
 39 | #' @param nomatch The value to be returned when no match is found. This is
 40 | #'   coerced to integer.
 41 | #' @param matchNA Should \code{NA}'s be matched? Default behaviour mimics the 
 42 | #'   behaviour of base \code{\link[base]{match}}, meaning that \code{NA} matches
 43 | #'   \code{NA} (see also the note on \code{NA} handling below).
 44 | #' @param method Matching algorithm to use. See \code{\link{stringdist-metrics}}.
 45 | #' @param useBytes Perform byte-wise comparison. See \code{\link{stringdist-encoding}}.
 46 | #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for
 47 | #'   deletion, insertion, substitution and transposition, in that order. When
 48 | #'   \code{method='lv'}, the penalty for transposition is ignored. When
 49 | #'   \code{method='jw'}, the weights associated with characters of \code{a},
 50 | #'   characters from \code{b} and the transposition weight, in that order. 
 51 | #'   Weights must be positive and not exceed 1. \code{weight} is ignored
 52 | #'   completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
 53 | #'   \code{'Jaccard'}, \code{'lcs'}, or \code{'soundex'}.
 54 | #' @param maxDist Elements in \code{x} will not be matched with elements of 
 55 | #'   \code{table} if their distance is larger than \code{maxDist}. Note that the
 56 | #'   maximum distance between strings depends on the method: it should always be
 57 | #'   specified.
 58 | #' @param nthread Number of threads used by the underlying C-code. A sensible
 59 | #'   default is chosen, see \code{\link{stringdist-parallelization}}.
 60 | #'   
 61 | #' @param q q-gram size, only when method is \code{'qgram'}, \code{'jaccard'},
 62 | #'   or \code{'cosine'}.
 63 | #' @param p Winklers 'prefix' parameter for Jaro-Winkler distance, with
 64 | #'   \eqn{0\leq p\leq0.25}. Only when method is \code{'jw'}
 65 | #' @param bt Winkler's boost threshold. Winkler's prefix factor is
 66 | #'   only applied when the Jaro distance is larger than \code{bt}.
 67 | #'   Applies only to \code{method='jw'} and \code{p>0}.
 68 | #'
 69 | #' @return \code{amatch} returns the position of the closest match of \code{x}
 70 | #'   in \code{table}. When multiple matches with the same smallest distance
 71 | #'   metric exist, the first one is returned. \code{ain} returns a
 72 | #'   \code{logical} vector of length \code{length(x)} indicating wether an
 73 | #'   element of \code{x} approximately matches an element in \code{table}.
 74 | #'
 75 | #' @family matching
 76 | #'
 77 | #' @example ../examples/amatch.R
 78 | #' @export
 79 | amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE
 80 |   , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard", "jw", "soundex") 
 81 |   , useBytes = FALSE
 82 |   , weight=c(d=1,i=1,s=1,t=1)
 83 |   , maxDist=0.1, q=1, p=0, bt=0
 84 |   , nthread = getOption("sd_num_thread")){
 85 | 
 86 |   x <- as.character(x)
 87 |   table <- as.character(table)
 88 | 
 89 |   if (!useBytes){
 90 |     x <- enc2utf8(x)
 91 |     table <- enc2utf8(table)
 92 |   }
 93 | 
 94 |   method <- match.arg(method)
 95 |   stopifnot(
 96 |       all(is.finite(weight))
 97 |       , all(weight > 0)
 98 |       , all(weight <=1)
 99 |       , q >= 0
100 |       , p <= 0.25
101 |       , p >= 0
102 |       , matchNA %in% c(TRUE,FALSE)
103 |       , maxDist > 0
104 |       , is.logical(useBytes)
105 |       , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE)
106 |       , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE)
107 |       , length(nthread) == 1
108 |       , is.numeric(nthread)
109 |       , nthread > 0
110 |   )
111 |   if (method == 'jw') weight <- weight[c(2,1,3)]
112 |   method <- METHODS[method]
113 |   if ( is.na(method) ){
114 |     stop(sprintf("method '%s' is not defined",method))
115 |   }
116 | 
117 |   .Call("R_amatch", x, table, method
118 |     , as.integer(nomatch), as.integer(matchNA)
119 |     , as.double(weight), as.double(p), as.double(bt)
120 |     , as.integer(q) , as.double(maxDist), as.integer(useBytes)
121 |     , as.integer(nthread)
122 |     , PACKAGE="stringdist"
123 |   )
124 | 
125 | }
126 | 
127 | #' @param ... parameters to pass to \code{amatch} (except \code{nomatch})
128 | #'
129 | #'
130 | #' @rdname amatch
131 | #' @export 
132 | ain <- function(x,table,...){
133 |   amatch(x, table, nomatch=0, ...) > 0
134 | }
135 | 
136 | #' Approximate matching for integer sequences.
137 | #'
138 | #'
139 | #' For a \code{list} of integer vectors \code{x}, find the closest matches in a
140 | #' \code{list} of integer or numeric vectors in \code{table.}
141 | #'
142 | #' @section Notes:
143 | #' \code{seq_ain} is currently defined as 
144 | #' 
145 | #' \code{seq_ain(x,table,...) <- function(x,table,...) amatch(x, table, nomatch=0,...) > 0}
146 | #'
147 | #' All input vectors are converted with \code{as.integer}. This causes truncation for numeric
148 | #' vectors (e.g. \code{pi} will be treated as \code{3L}).
149 | #'
150 | #'
151 | #' @param x (\code{list} of) \code{integer} or \code{numeric} vector(s) to be
152 | #'   approximately matched. Will be converted with \code{as.integer}.
153 | #' @param table (\code{list} of) \code{integer} or \code{numeric} vector(s)
154 | #'   serving as lookup table for matching. Will be converted with
155 | #'   \code{as.integer}.
156 | #' @param nomatch The value to be returned when no match is found. This is
157 | #'   coerced to integer.
158 | #' @param matchNA Should \code{NA}'s be matched? Default behaviour mimics the 
159 | #'   behaviour of base \code{\link[base]{match}}, meaning that \code{NA} matches
160 | #'   \code{NA}. With \code{NA}, we mean a missing entry in the \code{list}, represented as \code{NA_integer_}. 
161 | #'   If one of the integer sequences stored in the list has an \code{NA} entry,
162 | #'   this is just treated as another integer (the representation of
163 | #'   \code{NA_integer_}).
164 | #' @param method Matching algorithm to use. See \code{\link{stringdist-metrics}}.
165 | #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for
166 | #'   deletion, insertion, substitution and transposition, in that order. When
167 | #'   \code{method='lv'}, the penalty for transposition is ignored. When
168 | #'   \code{method='jw'}, the weights associated with integers in elements of \code{a},
169 | #'   integers in elements of \code{b} and the transposition weight, in that order. 
170 | #'   Weights must be positive and not exceed 1. \code{weight} is ignored
171 | #'   completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
172 | #'   \code{'Jaccard'}, or \code{'lcs'}.
173 | #' @param maxDist Elements in \code{x} will not be matched with elements of 
174 | #'   \code{table} if their distance is larger than \code{maxDist}. Note that the
175 | #'   maximum distance between strings depends on the method: it should always be
176 | #'   specified.
177 | #' @param nthread Number of threads used by the underlying C-code. A sensible
178 | #'   default is chosen, see \code{\link{stringdist-parallelization}}.
179 | #'   
180 | #' @param q q-gram size, only when method is \code{'qgram'}, \code{'jaccard'},
181 | #'   or \code{'cosine'}.
182 | #' @param p Winkler's prefix parameter for Jaro-Winkler distance, with
183 | #'   \eqn{0\leq p\leq0.25}. Only when method is \code{'jw'}
184 | #' @param bt Winkler's boost threshold. Winkler's prefix factor is
185 | #'   only applied when the Jaro distance is larger than \code{bt}.
186 | #'   Applies only to \code{method='jw'} and \code{p>0}.
187 | #' @return \code{seq_amatch} returns the position of the closest match of \code{x}
188 | #'   in \code{table}. When multiple matches with the same minimal distance
189 | #'   metric exist, the first one is returned. \code{seq_ain} returns a
190 | #'   \code{logical} vector of length \code{length(x)} indicating wether an
191 | #'   element of \code{x} approximately matches an element in \code{table}.
192 | #' 
193 | #' @seealso \code{\link{seq_dist}}, \code{\link{seq_sim}}, \code{\link{seq_qgrams}}
194 | #'
195 | #' @example ../examples/seq_amatch.R
196 | #' @export
197 | seq_amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE
198 |   , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard", "jw") 
199 |   , weight=c(d=1,i=1,s=1,t=1)
200 |   , maxDist=0.1, q=1, p=0, bt=0
201 |   , nthread = getOption("sd_num_thread")){
202 | 
203 |   x <- ensure_int_list(x)
204 |   table <- ensure_int_list(table)
205 |   
206 |   method <- match.arg(method)
207 |   stopifnot(
208 |       all(is.finite(weight))
209 |       , all(weight > 0)
210 |       , all(weight <=1)
211 |       , q >= 0
212 |       , p <= 0.25
213 |       , p >= 0
214 |       , matchNA %in% c(TRUE,FALSE)
215 |       , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE)
216 |       , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE)
217 |       , length(nthread) == 1
218 |       , is.numeric(nthread)
219 |       , nthread > 0
220 |   )
221 |   if (method == 'jw') weight <- weight[c(2,1,3)]
222 |   method <- METHODS[method]
223 |   if ( is.na(method) ){
224 |     stop(sprintf("method '%s' is not defined",method))
225 |   }
226 | 
227 |   .Call("R_amatch", x, table, method
228 |     , as.integer(nomatch), as.integer(matchNA)
229 |     , as.double(weight), as.double(p), as.double(bt)
230 |     , as.integer(q) , as.double(maxDist), 0L
231 |     , as.integer(nthread)
232 |     , PACKAGE="stringdist"
233 |   )
234 | }
235 | 
236 | #' @param ... parameters to pass to \code{seq_amatch} (except \code{nomatch})
237 | #'
238 | #'
239 | #' @rdname seq_amatch
240 | #' @export 
241 | seq_ain <- function(x,table,...){
242 |   seq_amatch(x, table, nomatch=0, ...) > 0
243 | }
244 | 
245 | 
246 | 
247 | 


--------------------------------------------------------------------------------
/pkg/NEWS:
--------------------------------------------------------------------------------
  1 | version 0.9.15
  2 | - Fixe issue with zero-length 'nthreads' argument in all exported functions
  3 |   with this parameter. (Thanks to Brian Ripley for the notification and pointer
  4 |   to the problem)
  5 | 
  6 | version 0.9.14
  7 | - Fixed issue with zero-length strings in 'qgrams' (Thanks to Brian Ripley
  8 |   for the notification and pointer to the origin of the problem)
  9 | 
 10 | version 0.9.12
 11 | - apparently R_xlen_t is long long int on CLANG/Windows and long int on gcc-13/debian
 12 | 
 13 | version 0.9.11
 14 | - Fixed a warning in gcc-13: changed specifier from %d to %ld.
 15 |   (Thanks to Kurt Hornik for the head's up)
 16 | 
 17 | version 0.9.10
 18 | - Fixed another warning generated by new C compiler that I overlooked.
 19 |   (Thanks to the CRAN team for the head's up)
 20 | 
 21 | version 0.9.9
 22 | - Fixed warnings generated by new C compiler. (function prototypes must
 23 |   now be defined completely). (Thanks to Kurt Hornik for the head's up.)
 24 | 
 25 | version 0.9.8
 26 | - Fixed some issues on C-level causing problems with the
 27 |   CLANG compiler. (Thanks to Brian Ripley for not only
 28 |   reporting this, but also sending updated code with 
 29 |   fixes).
 30 | 
 31 | 
 32 | version 0.9.7
 33 | - Fixes in use of INTEGER() and VECTOR_ELT() after updates in R's C API.
 34 |   this affected 'afind' and 'max_length' (internally). (Thanks to Luke
 35 |   Tierny and Kurt Hornik for the notification).
 36 | - Fix in 'amatch' causing utf-8 characters to be ignored in some
 37 |   cases (thanks to Joan Mime for reporting #78).
 38 | - Fix: segfault when 'afind' was called with many search patterns or many
 39 |   texts to be searched.
 40 | - Fix: stringsimmatrix was not normalized correctly (Thanks to Tamas Ferenci
 41 |   for reporting GH).
 42 | 
 43 | 
 44 | version 0.9.6.3
 45 | - Resubmit. Fixed an URL redirect that was detected by CRAN.
 46 | 
 47 | version 0.9.6.2
 48 | - Resubmit. Fixed url issues detected by CRAN, added doi to description
 49 |   as per CRAN request.
 50 | 
 51 | version 0.9.6.1
 52 | - Bugfix: afind/grab/grabl returned wrong results on MacOS only.  
 53 |   (thanks to Prof. Brian Ripley for the notification and for running tests 
 54 |    on his personal machine and to Tomas Kalibera for making the 
 55 |    ubuntu-rchk docker image available).
 56 | 
 57 | version 0.9.6
 58 | - New function 'afind': find approximate matches in text based on string distance.
 59 | - New functions 'grab', 'grabl': fuzzy matching equivalent to 'grep' and 'grepl'.
 60 | - New function 'extract': fuzzy matching equivalent of stringr::str_extract.
 61 | - New algorithm 'running_cosine': fast fuzzy text search using cosine distance.
 62 | - New function 'stringsimmatrix' (Thanks to Johannes Gruber).
 63 | - Number of threads used is now reported when loading 'stringdist'.
 64 | - Internal fixes (in some cases class() == 'class' was used).
 65 | 
 66 | version 0.9.5.5
 67 | - Changed two URLs to canonical form in README.md (https://) to comply with
 68 |   CRAN policy.
 69 | 
 70 | version 0.9.5.4
 71 | - Some tests using seq_dist() would fail unpredictably when the input was
 72 |   defined with lazily evaluated arguments, e.g. list(1:3, 2:4); but only in the
 73 |   context of NSE by a test suite ('tinytest', 'testthat'). Tests were replaced by
 74 |   literal versions, e.g. list(c(1,2,3), c(2,3,4)).
 75 | 
 76 | version 0.9.5.3
 77 | - Update in test suite to stay on CRAN
 78 | 
 79 | version 0.9.5.2
 80 | - RJournal paper and C/C++ api docs are now presented as vignette.
 81 | - Switched to tinytest framework
 82 | - Fix: stringdist could cause a segfault for edit distances between very long 
 83 |   strings. (Thanks to GH user gllipatz)
 84 | 
 85 | 
 86 | version 0.9.5.1
 87 | - Fixed header file for C API
 88 | 
 89 | version 0.9.5.0
 90 | - New contributor: Chris Muir
 91 | - C/C++ API now exposed for packages LinkingTo stringdist. See `?stringdist_api`
 92 | - Arguments 'maxDist', 'ncores', 'cluster' of functions 'stringdist' and
 93 |   'stringdistmatrix' have been deprecated for several years and are now
 94 |   removed.
 95 | - Fixed edge case where cosine distance with q=1, between strings of repeating characters
 96 |   yielded Inf (Thanks to Markus Dumke)
 97 | 
 98 | 
 99 | version 0.9.4.6
100 | - Fixed argument passing error in lower_tri (thanks to Kurt Hornik)
101 | 
102 | version 0.9.4.5
103 | - New argument 'bt' implementing Winkler's boost threshold for the Jaro-Winkler distance
104 | - stringdist(a,b,method="qgram") returns correct value when q>nchar(a) (or b). 
105 |   (Thanks to Giora Simchoni). Also affects stringdistmatrix, amatch, seq_dist,
106 |   and seq_distmatrix.
107 | - registered native routines as now recommended by CRAN
108 | 
109 | version 0.9.4.4
110 | - updated default nr of threads to comply to CRAN policy (thanks to Kurt Hornik).
111 |   The default nr of cores now equals OMP_NUM_THREADS if set. See 
112 |   ?'stringdist-parallelization' for the full policy.
113 | 
114 | version 0.9.4.2
115 | - bugfix in stringdistmatrix(a): value of p, for jw-distance was ignored
116 |   (thanks to Max Fritsche)
117 | - bugfix in stringdistmatrix(a): Would segfault on q-gram w/input > ~7k strings
118 |   and q>1 (thanks to Connor McKay)
119 | - bugfix in jaccard distance: distance not always correct when passing multiple
120 |   strings (thanks to Robert Carlson)
121 | 
122 | version 0.9.4.1
123 | - stringdistmatrix(a) now outputs long vectors (issue #45, thanks to Wouter
124 |   Touw).  For stringdistmatrix(a,b) this was already the case, but the length
125 |   of rows and columns remains restricted to 2^31-1 since long input vectors are
126 |   not supported (yet).
127 | - bugfix in osa/dl/lv distances w/unequal edit weights (thanks to Nathalia Potocka)
128 | 
129 | version 0.9.4
130 | - bugfix: edge case for zero-size for lower tridiagonal dist matrices (caused
131 |   UBSAN to fire, but gave correct results).
132 | - bugfix in jw distance: not symmetric for certain cases (thanks to github user gtumuluri)
133 | 
134 | version 0.9.3
135 | - new function for tokenizing integer sequences: seq_qgrams
136 | - new function for matching integer sequences: seq_amatch
137 | - new functions computing distances between integer sequences: seq_dist, seq_distmatrix
138 | - q-gram based distances are now always 0 when q=0 (used to be Inf if at least
139 |   one of the arguments was not the empty string)
140 | - stringdist, stringdistmatrix now emit warning when presented with 'list' argument
141 | - small c-side code optimizations
142 | - bugfix in dl, lv, osa distance: weights were not taken into account properly
143 |   (thanks to Zach Price)
144 | 
145 | version 0.9.2
146 | - Update  fixing some errors (missing documentation, tests) in the 0.9.1 release.
147 | - Fixed a few possible memory leaks.
148 | 
149 | version 0.9.1
150 | - Argument 'useNames' of 'stringdistmatrix' now accepts 'none', 'strings', and 'names'
151 | - New function 'stringsim' computes string similarities between 0 and 1 based on 'stringdist'
152 | - Calling 'stringdistmatrix' with a single argument returns an object of class 'dist'
153 | - Argument 'cluster' to stringdistmatrix is phased out. It is now ignored with a message.
154 | - Specifying 'ncores' was already ignored but now also causes a warning
155 | - internal: rewrite of the R/C interface, saving about 1/3 of C-code, making extending easier
156 | - bugfix in stringdistmatrix: output was transposed when length(a)==1 (Thanks to github user cpoonolly)
157 | - Safer core detection to avoid a failure under Cygwin (thanks to Lauri Koobas)
158 | 
159 | version 0.9.0
160 | - C-code underlying stringdist and amatch now automatically use multithreading based on openMP. 
161 |   The default number of threads is governed by options('sd_num_thread'). 
162 | - stringdist, stringdistmatrix, amatch and ain gain nthread argument which can
163 |   overwrite the default maximum number of threads.
164 | - Argument 'maxDist' is phased out for 'stringdist' and 'stringdistmatrix'.
165 |   Specifying it causes a message.
166 | - Argument 'ncores' is phased out for 'stringdistmatrix'. It is now ignored and
167 |   specifying it causes a message.
168 | - bugfix in amatch/dl. In certain cases, the best match went undetected.
169 | - Documentation improved and rearranged with string metrics, encoding, and
170 |   parallelization now documented as separate topics.
171 | 
172 | version 0.8.2
173 | - Fixed a few warnings issued by the CLANG compiler (thanks to Brian Ripley).
174 |   This fixes a bug in amatch/jaccard
175 | - Fixed a bug in stringdist/osa, dl: NA incorectly returned (thanks to Lauri
176 |   Koobas).
177 | 
178 | version 0.8.1
179 | - stringdistmatrix returns dimensionless matrix when both arguments have length
180 |   zero (thanks to Richie Cotton)
181 | - stringdistmatrix gains argument 'useNames' (thanks to Richie Cotton)
182 | - Package now 'Imports' parallel rather than 'Depends' on it.
183 | - bugfix in optimal string alignment distance: the nr of transpositions was
184 |   sometimes overcounted (thanks to Frank Binder)
185 | - rearranged the documentation.
186 | 
187 | version 0.8.0
188 | - Added soundex-based string distance (thanks to Jan van der Laan)
189 | - New function 'phonetic' translates strings to phonetic codes using soundex
190 |   (thanks to Jan van der Laan)
191 | - New function 'printable_ascii' detects non-printable ascii or non-ascii
192 |   characters.
193 | - Precision issue: cosine distance between equal strings would be O(1e-16) in
194 |   stead of 0.0 (thanks to Ben Haller).
195 | - Code cleaning: somewhat better performance when maxDist is unspecified in
196 |   stringdist. It remains deprecated.
197 | - Row names in the output array of 'qgrams' are now in system native encoding
198 |   (used to be utf8 for all systems).
199 | - updated CITATION with page number info as the R Journal is now out.
200 | 
201 | version 0.7.3
202 | - bugfix in jw-distance: out-of-range access in C-code caused R to crash in
203 |   some cases (thanks to Carol Gan)
204 | - bugfix in dl distance: in some cases, distances could be one unit too high.
205 | - Updated CITATION file: paper to appear in The R Journal vol 6 (2014).
206 | - Some updates in documentation.
207 | 
208 | version 0.7.2
209 | - function 'qgrams' gains .list argument
210 | - bugfix in multicore option of stringdistmatrix
211 | - bugfix in substitution weight of DL-distance (undercounted when w4 != 1 in
212 |   some cases)
213 | - bugfix in dl.c: C-function read outside of array. 
214 | 
215 | version 0.7.0
216 | - added useBytes option: up to ~3-fold speed gain at the cost of possible
217 |   encoding-dependent results.
218 | - new memory allocation method for q-grams increases speed between ~5% and ~30%
219 |   depending on q and input string.
220 | - function 'qgrams' gains useNames option.
221 | - jaro-winkler distance gains weight argument.
222 | - C-code optimization in edit-based distances:  10~20% speed increase depending
223 |   on input.
224 | - bugfix in amatch: sometimes NA was erroneously returned.
225 | - bugfix in amatch/lcs: hamming distance method was called erroneously.
226 | 
227 | version 0.6.1
228 | - bugfix in parallel version of stringdistmatrix: parameter p was not passed
229 |   (thanks to Ricardo Saporta)
230 | - bugfix in lv/osa/dl: maxDist ignored in certain cases
231 | 
232 | version 0.6.0
233 | - added amatch function: approximate matching version of 'match'
234 | - added ain function: approximate matching version of '%in%'
235 | - qgrams now accepts arbitrary number of arguments. Outputs array, not table
236 | - added cosine distance
237 | - added Jaccard distance
238 | - added Jaro and Jaro-Winkler distances
239 | - small performance tweeks in underlying C code
240 | - Edge case in stringdistmatrix: output is now always of class matrix
241 | - Default maxDist is now Inf (this is only to make it more intuitive and does
242 |   not break previous code)
243 | - BREAKING CHANGE: output -1 is replaced by Inf for all distance methods
244 | 
245 | 
246 | version 0.5.0
247 | - added qgram counting function 'qgrams'
248 | - faster edge case handling in osa method.
249 | - edge case in lv/osa/dl methods: distance returned length(b) in stead of -1
250 |   when length(a) == 0, maxDist < length(b).
251 | - bugfix in lv/osa/dl method: maxDist returned when length(a) > maxDist > 0
252 |   (thanks to Daniel Reckhard).
253 | - Hamming distance (method='h') now returns -1 for strings of unequal lengts
254 |   (used to emit error).
255 | - added longest common substring distance (method='lcs').
256 | - added qgram distance method.
257 | - stringdistmatrix gains cluster argument.
258 | 
259 | version 0.4.2
260 | - Fix in error message for hamming distance
261 | - Workaround for system-dependent translation of utf8 NA characters
262 | 
263 | version 0.4.0
264 | - First release
265 | 


--------------------------------------------------------------------------------
/pkg/R/stringdist.R:
--------------------------------------------------------------------------------
  1 | #' A package for string distance calculation and approximate string matching.
  2 | #'
  3 | #'
  4 | #' The \pkg{stringdist} package offers fast and platform-independent string
  5 | #' metrics. Its main purpose is to compute various string distances and to do 
  6 | #' approximate text matching between character vectors. As of version 0.9.3,
  7 | #' it is also possible to compute distances between sequences represented by
  8 | #' integer vectors.
  9 | #' 
 10 | #' 
 11 | #' A typical use is to match strings that are not precisely the same. For
 12 | #' example
 13 | #'
 14 | #' \code{  amatch(c("hello","g'day"),c("hi","hallo","ola"),maxDist=2)}
 15 | #'
 16 | #' returns \code{c(2,NA)} since \code{"hello"} matches closest with
 17 | #' \code{"hallo"}, and within the maximum (optimal string alignment) distance.
 18 | #' The second element, \code{"g'day"}, matches closest with \code{"ola"} but
 19 | #' since the distance equals 4, no match is reported.
 20 | #'
 21 | #' A second typical use is to compute string distances. For example 
 22 | #'
 23 | #' \code{  stringdist(c("g'day"),c("hi","hallo","ola"))}
 24 | #'
 25 | #' Returns \code{c(5,5,4)} since these are the distances between \code{"g'day"}
 26 | #' and respectively \code{"hi"}, \code{"hallo"}, and \code{"ola"}.
 27 | #'
 28 | #' A third typical use would be to compute a \code{dist} object. The command
 29 | #' 
 30 | #' \code{stringdistmatrix(c("foo","bar","boo","baz"))}
 31 | #'
 32 | #' returns an object of class \code{dist} that can be used by clustering
 33 | #' algorithms such as \code{stats::hclust}.
 34 | #' 
 35 | #' A fourth use is to compute string distances between general sequences,
 36 | #' represented as integer vectors (which must be stored in a \code{list}):
 37 | #'
 38 | #' \code{seq_dist( list(c(1L,1L,2L)), list(c(1L,2L,1L),c(2L,3L,1L,2L)) )}
 39 | #'
 40 | #' The above code yields the vector \code{c(1,2)} (the first shorter first
 41 | #' argument is recycled over the longer second argument)
 42 | #' 
 43 | #' Besides documentation for each function, the main topics documented are:
 44 | #' 
 45 | #' \itemize{
 46 | #' \item{\code{\link{stringdist-metrics}} -- string metrics supported by the package}
 47 | #' \item{\code{\link{stringdist-encoding}} -- how encoding is handled by the package}
 48 | #' \item{\code{\link{stringdist-parallelization}} -- on multithreading }
 49 | #' }
 50 | #'
 51 | #' @section Acknowledgements:
 52 | #' \itemize{
 53 | #'   \item{The code for the full Damerau-Levenshtein distance was adapted from Nick Logan's
 54 | #'   \href{https://github.com/ugexe/Text--Levenshtein--Damerau--XS/blob/master/damerau-int.c}{public github repository}.}
 55 | #'   \item{C code for converting UTF-8 to integer was copied from the R core for performance reasons.}
 56 | #'   \item{The code for soundex conversion and string similarity was kindly contributed by Jan van der Laan.}
 57 | #' }
 58 | #' @section Citation:
 59 | #' If you would like to cite this package, please cite the \href{https://journal.r-project.org/archive/2014-1/loo.pdf}{R Journal Paper}: 
 60 | #' \itemize{
 61 | #' \item{M.P.J. van der Loo (2014). The \code{stringdist} package for approximate string matching. 
 62 | #'  R Journal 6(1) pp 111-122}
 63 | #' }
 64 | #' Or use \code{citation('stringdist')} to get a bibtex item.
 65 | #'
 66 | #' @name stringdist-package
 67 | #' @docType package
 68 | #' @useDynLib stringdist, .registration=TRUE
 69 | #' @importFrom parallel detectCores 
 70 | #'
 71 | #'
 72 | #' 
 73 | "_PACKAGE"
 74 | 
 75 | listwarning <- function(x,y){
 76 | sprintf("
 77 | You are passing one or more arguments of type 'list' to
 78 | '%s'. These arguments will be converted with 'as.character'
 79 | which is likeley not to give what you want (did you mean to use '%s'?).
 80 | This warning can be avoided by explicitly converting the argument(s).
 81 | ",x,y)
 82 | }
 83 | 
 84 | #' Compute distance metrics between strings
 85 | #'
 86 | #' 
 87 | #' \code{stringdist} computes pairwise string distances between elements of
 88 | #' \code{a} and \code{b}, where the argument with less elements is recycled.
 89 | #' \code{stringdistmatrix} computes the string distance matrix with rows
 90 | #' according to
 91 | #' \code{a} and columns according to \code{b}.
 92 | #' 
 93 | #'
 94 | #' @param a R object (target); will be converted by \code{as.character}
 95 | #' @param b R object (source); will be converted by \code{as.character}
 96 | #'   This argument is optional for \code{stringdistmatrix} (see section \code{Value}).
 97 | #' @param method Method for distance calculation. The default is \code{"osa"},
 98 | #'   see \code{\link{stringdist-metrics}}.
 99 | #' @param useBytes Perform byte-wise comparison, see
100 | #'   \code{\link{stringdist-encoding}}.
101 | #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for
102 | #'   deletion, insertion, substitution and transposition, in that order. When
103 | #'   \code{method='lv'}, the penalty for transposition is ignored. When
104 | #'   \code{method='jw'}, the weights associated with characters of \code{a},
105 | #'   characters from \code{b} and the transposition weight, in that order. 
106 | #'   Weights must be positive and not exceed 1. \code{weight} is ignored
107 | #'   completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
108 | #'   \code{'Jaccard'}, \code{'lcs'}, or \code{soundex}.
109 | #' @param q  Size of the \eqn{q}-gram; must be nonnegative. Only applies to
110 | #'   \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}.
111 | #' @param p Prefix factor for Jaro-Winkler distance. The valid range for 
112 | #'   \code{p} is \code{0 <= p <= 0.25}. If \code{p=0} (default), the
113 | #'   Jaro-distance is returned. Applies only to \code{method='jw'}.
114 | #' @param bt Winkler's boost threshold. Winkler's prefix factor is
115 | #'   only applied when the Jaro distance is larger than \code{bt}.
116 | #'   Applies only to \code{method='jw'} and \code{p>0}.
117 | #' @param nthread Maximum number of threads to use. By default, a sensible
118 | #'   number of threads is chosen, see \code{\link{stringdist-parallelization}}.
119 | #'  
120 | #' @seealso \code{\link{stringsim}}, \code{\link{qgrams}}, \code{\link{amatch}}, \code{\link{afind}}
121 | #'
122 | #' @return For \code{stringdist},  a vector with string distances of size
123 | #'   \code{max(length(a),length(b))}.
124 | #'  
125 | #' For \code{stringdistmatrix}: if both \code{a} and \code{b} are passed, a
126 | #' \code{length(a)xlength(b)} \code{matrix}. If a single argument \code{a} is
127 | #' given an object of class \code{\link[stats]{dist}} is returned.
128 | #'  
129 | #' Distances are nonnegative if they can be computed, \code{NA} if any of the
130 | #' two argument strings is \code{NA} and \code{Inf} when \code{maxDist} is
131 | #' exceeded or, in case of the hamming distance, when the two compared strings
132 | #' have different length.
133 | #'  
134 | #'  
135 | #' @example ../examples/stringdist.R
136 | #' @export
137 | stringdist <- function(a, b
138 |   , method=c("osa","lv","dl","hamming","lcs", "qgram","cosine","jaccard","jw","soundex")
139 |   , useBytes = FALSE
140 |   , weight=c(d=1,i=1,s=1,t=1) 
141 |   , q  = 1
142 |   , p  = 0
143 |   , bt = 0
144 |   , nthread = getOption("sd_num_thread")
145 | ){
146 |   if (is.list(a)|is.list(b))
147 |     warning(listwarning("stringdist","seq_dist"))
148 |             
149 |   stopifnot(
150 |     all(is.finite(weight))
151 |     , all(weight > 0)
152 |     , all(weight <=1)
153 |     , q >= 0
154 |     , p <= 0.25
155 |     , p >= 0
156 |     , is.logical(useBytes)
157 |     , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE)
158 |     , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE)
159 |     , length(nthread) == 1
160 |     , is.numeric(nthread)
161 |     , nthread > 0
162 |   )
163 |   
164 |   # note: enc2utf8 is very efficient when the native encoding is already UTF-8.
165 |   a <- as.character(a)
166 |   b <- as.character(b)
167 |   if ( !useBytes ){
168 |     a <- enc2utf8(a)
169 |     b <- enc2utf8(b)
170 |   }
171 |   
172 |   if (length(a) == 0 || length(b) == 0){ 
173 |     return(numeric(0))
174 |   }
175 |   if ( max(length(a),length(b)) %% min(length(a),length(b)) != 0 ){
176 |     warning(RECYCLEWARNING)
177 |   }
178 |   method <- match.arg(method)
179 |   nthread <- as.integer(nthread)
180 | 
181 |   if (method == 'jw') weight <- weight[c(2,1,3)]
182 |   do_dist(a=b, b=a
183 |     , method=method
184 |     , weight=weight
185 |     , q=q
186 |     , p=p
187 |     , bt=bt
188 |     , useBytes=useBytes
189 |     , nthread=nthread)
190 | }
191 | 
192 | 
193 | #' @param useNames Use input vectors as row and column names?
194 | #'
195 | #'
196 | #' @rdname stringdist
197 | #' @export
198 | stringdistmatrix <- function(a, b
199 |   , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw","soundex")
200 |   , useBytes = FALSE
201 |   , weight=c(d=1,i=1,s=1,t=1)  
202 |   , q  = 1
203 |   , p  = 0
204 |   , bt = 0
205 |   , useNames=c('none','strings','names') 
206 |   , nthread = getOption("sd_num_thread")
207 | ){
208 |   if (is.list(a)|| (!missing(b) && is.list(b)) ){
209 |    warning(listwarning("stringdistmatrix","seq_distmatrix"))
210 |   }
211 | 
212 |   # for backward compatability with stringdist <= 0.9.0
213 |   if (identical(useNames, FALSE)) useNames <- "none"
214 |   if (identical(useNames, TRUE)) useNames <- "strings"
215 |   useNames <- match.arg(useNames)
216 |   
217 |   method <- match.arg(method)
218 |   nthread <- as.integer(nthread)
219 |   stopifnot(
220 |     all(is.finite(weight))
221 |     , all(weight > 0)
222 |     , all(weight <=1)
223 |     , q >= 0
224 |     , p <= 0.25
225 |     , p >= 0
226 |     , is.logical(useBytes)
227 |     , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE)
228 |     , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE)
229 |     , length(nthread) == 1
230 |     , is.numeric(nthread)
231 |     , nthread > 0
232 |   )
233 |   
234 |   if (method == 'jw') weight <- weight[c(2,1,3)]
235 |   
236 |   # if b is missing, generate a 'dist' object.  
237 |   if (missing(b)){ 
238 |     if (useNames == "names"){
239 |       a <- setNames(as.character(a),names(a))
240 |     } else {
241 |       a <- as.character(a)
242 |     }
243 |     return( lower_tri(a
244 |         , method=method
245 |         , useBytes=useBytes
246 |         , weight=weight
247 |         , q=q
248 |         , p=p
249 |         , bt=bt
250 |         , useNames=useNames
251 |         , nthread=nthread)
252 |     )
253 |   }
254 | 
255 |   if (useNames == "names"){
256 |     rowns <- names(a)
257 |     colns <- names(b)
258 |   }
259 |   
260 |   # NOTE: this strips off names
261 |   a <- as.character(a)
262 |   b <- as.character(b)
263 | 
264 |   if (useNames=="strings"){
265 |     rowns <- a
266 |     colns <- b
267 |   } 
268 |   
269 |   
270 |   if (!useBytes){
271 |     a <- enc2utf8(a)
272 |     b <- enc2utf8(b)
273 |   }
274 |  
275 |   if (length(a) == 0 || length(b) == 0){ 
276 |    return(matrix(numeric(0)))
277 |   }
278 | 
279 |   x <- vapply(b, do_dist, USE.NAMES=FALSE, FUN.VALUE=numeric(length(a))
280 |           , a, method,weight, q, p, bt, useBytes, nthread)
281 | 
282 |   if (useNames %in% c("strings","names") ){  
283 |     structure(matrix(x,nrow=length(a),ncol=length(b), dimnames=list(rowns,colns)))
284 |   } else {
285 |     matrix(x,nrow=length(a),ncol=length(b)) 
286 |   }
287 | }
288 | 
289 | 
290 | char2int <- function(x){
291 |   # For some OS's enc2utf8 had unexpected behavior for NA's,
292 |   # see https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=15201.
293 |   # This is fixed for R >= 2.15.3.
294 |   # i <- !is.na(x)
295 |   # x[i] <- enc2utf8(x[i])
296 |   lapply(enc2utf8(x),utf8ToInt)
297 | }
298 | 
299 | #  enum-type in stringdist.h
300 | METHODS <- c(
301 |     osa     = 0L
302 |   , lv      = 1L
303 |   , dl      = 2L
304 |   , hamming = 3L
305 |   , lcs     = 4L
306 |   , qgram   = 5L
307 |   , cosine  = 6L
308 |   , jaccard = 7L
309 |   , jw      = 8L
310 |   , soundex = 9L
311 |   , running_cosine = 10L
312 | )
313 | 
314 | 
315 | do_dist <- function(a, b, method, weight, q, p, bt, useBytes=FALSE, nthread=1L){
316 |   
317 |   if (method=='soundex' && !all(printable_ascii(a) & printable_ascii(b)) ){
318 |     warning("Non-printable ascii or non-ascii characters in soundex. Results may be unreliable. See ?printable_ascii.")
319 |   }
320 |   method <- METHODS[method]
321 |   if ( is.na(method) ){
322 |     stop(sprintf("method '%s' is not defined",method))
323 |   }
324 | 
325 |   d <- .Call("R_stringdist", a, b, method
326 |     , as.double(weight), as.double(p), as.double(bt), as.integer(q)
327 |     , as.integer(useBytes), as.integer(nthread)
328 |     , PACKAGE="stringdist"
329 |   )
330 | 
331 |   d
332 | }
333 | 
334 | # more efficient function that returns a square distance matrix as a 'stats::dist' object.
335 | lower_tri <- function(a
336 |   , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw","soundex")
337 |   , useBytes = FALSE
338 |   , weight=c(d=1,i=1,s=1,t=1)
339 |   , q=1
340 |   , p=0
341 |   , bt=0
342 |   , useNames=FALSE
343 |   , nthread = getOption("sd_num_thread")
344 | ){
345 |   methnr <- METHODS[method]
346 |   if (is.na(method)){
347 |     stop(sprintf("method '%s' is not defined",method))
348 |   }
349 | 
350 |   x <- .Call("R_lower_tri", a, methnr
351 |          , as.double(weight), as.double(p), as.double(bt)
352 |          , as.integer(q), as.integer(useBytes), as.integer(nthread)
353 |          , PACKAGE="stringdist")
354 |   
355 |   attributes(x) <- list(class='dist'
356 |     , Size     = length(a)
357 |     , Diag   = FALSE
358 |     , Upper  = FALSE
359 |     , method = method)
360 |   if (useNames == "strings") attr(x,"Labels") <- as.character(a)
361 |   if (useNames == "names" ) attr(x,"Labels") <- names(a)
362 |   
363 |   x
364 | }
365 | 
366 | 
367 | 
368 | 
369 | 


--------------------------------------------------------------------------------
/pkg/inst/include/stringdist_api.h:
--------------------------------------------------------------------------------
  1 | 
  2 | /*  stringdist - a C library of string distance algorithms with an interface to R.
  3 |  *  Copyright (C) 2013  Mark van der Loo
  4 |  *
  5 |  *  This program is free software: you can redistribute it and/or modify
  6 |  *  it under the terms of the GNU General Public License as published by
  7 |  *  the Free Software Foundation, either version 3 of the License, or
  8 |  *  (at your option) any later version.
  9 |  *
 10 |  *  This program is distributed in the hope that it will be useful,
 11 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 13 |  *  GNU General Public License for more details.
 14 |  *
 15 |  *  You should have received a copy of the GNU General Public License
 16 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 17 |  *
 18 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
 19 |  */
 20 | 
 21 | #ifndef _STRINGDIST_API_H
 22 | #define _STRINGDIST_API_H
 23 | 
 24 | #include <R_ext/Rdynload.h>
 25 | 
 26 | #ifdef HAVE_VISIBILITY_ATTRIBUTE
 27 | # define attribute_hidden __attribute__ ((visibility ("hidden")))
 28 | #else
 29 | # define attribute_hidden
 30 | #endif
 31 | 
 32 | #ifdef __cplusplus
 33 | extern "C" {
 34 | #endif
 35 | 
 36 | /**
 37 |  * @mainpage Stringdist C API
 38 |  *
 39 |  * @author Mark van der Loo, Jan van der Laan, R Core Team, Paul Hsieh, Chris Muir
 40 |  * @version `R package stringdist` version `0.9.5.0` and higher.
 41 |  *
 42 |  * @section using Using the stringdist C API
 43 |  * To call the functions described here from your package you need to:
 44 |  * 
 45 |  * 1. Make sure that `stringdist` is installed.
 46 |  * 2. Add `stringdist` to `Imports` (or `Depends`) and `LinkingTo` in the `DESCRIPTION` file.
 47 |  * 3. In your source file under the package's `/src` directory, add the line 
 48 |  * ```
 49 |  * #include <stringdist_api.h>
 50 |  * ```
 51 |  * 
 52 |  * 
 53 |  * An example of a published package using this API is
 54 |  * [refinr](https://CRAN.R-project.org/package=refinr). A minimal example can be
 55 |  * found [here](https://github.com/markvanderloo/linkstringdist).
 56 |  *
 57 |  * @section encoding Character encoding
 58 |  * All `character` vector input is expected to be in `UTF-8` (this also allows
 59 |  * `ASCII`).  Distance computations are based on UTF [code
 60 |  * points](https://en.wikipedia.org/wiki/Code_point) unless `useBytes` is
 61 |  * `TRUE`, in which case distances are computed over byte sequences. Using
 62 |  * non-UTF-8 encoded strings is untested and is highly likely to result in
 63 |  * errors.
 64 |  *
 65 |  * @section threads Thread safety
 66 |  *
 67 |  * It is not safe to call functions from `stringdist` C API from 
 68 |  * multiple concurrent threads.
 69 |  *
 70 |  *
 71 |  *
 72 |  */
 73 | 
 74 | 
 75 | /** 
 76 |  * @file stringdist_api.h
 77 |  * @brief Functions exported from the stringdist package.
 78 |  *
 79 |  */
 80 | 
 81 | 
 82 | /*
 83 | SEXP attribute_hidden sd_all_int(SEXP X)
 84 | {
 85 |   static SEXP(*fun)(SEXP) = NULL;
 86 |   if (fun == NULL) fun = (SEXP(*)(SEXP)) R_GetCCallable("stringdist","R_all_int");
 87 |   return fun(X);
 88 | }
 89 | */
 90 | 
 91 | /**
 92 |  * @brief Find the location of values in `x` in `table` by approximate matching.
 93 |  * 
 94 |  * @param x `[character]` vector.
 95 |  * @param table `[character]` vector (lookup table)
 96 |  * @param method `[integer]` scalar, indicating the distance method as follows
 97 |  *   @parblock
 98 |  *    - 0: Optimal String Alignment (`"osa"`)
 99 |  *    - 1: Levenshtein (`"lv"`)
100 |  *    - 2: Damerau-Levenshtein (`"dl"`)
101 |  *    - 3: Hamming (`"hamming"`)
102 |  *    - 4: Longest Common Substring (`"lcs"`)
103 |  *    - 5: q-gram (`"qgram"`)
104 |  *    - 6: cosine (`"cosine"`)
105 |  *    - 7: Jaccard (`"jaccard"`)
106 |  *    - 8: Jaro-Winkler (`"jw"`)
107 |  *    - 9: Soundex (`"soundex"`)
108 |  *   @endparblock
109 |  * @param nomatch `[integer]` The value to be returned when no match is found. 
110 |  * @param matchNA Should `NA`s be matched? Default behaviour mimics the
111 |  *   behaviour of base `match`, meaning that `NA` matches `NA` (see also the note
112 |  *   on `NA` handling below).
113 |  * @param weight `[numeric]` vector. Edit penalty
114 |  *   @parblock
115 |  *     For `method='osa'` or`'dl'`, the penalty for
116 |  *     deletion, insertion, substitution and transposition, in that order. When
117 |  *     `method='lv'`, the penalty for transposition is ignored. When
118 |  *     `method='jw'`, the weights associated with characters of `a`,
119 |  *     characters from `b` and the transposition weight, in that order. 
120 |  *     Weights must be positive and not exceed 1. `weight` is ignored
121 |  *     completely for other methods
122 |  *   @endparblock
123 |  * @param q  `[integer]` scalar. Size of the q-gram; must be nonnegative. Only
124 |  *   applies to `method='qgram'`, `'jaccard'` or `'cosine'`.
125 |  * @param maxDistance `[numeric]` scalar. The maximum distance allowed for matching.
126 |  * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The
127 |  *   valid range for `p` is `0 <= p <= 0.25`. If `p=0` (default), the
128 |  *   Jaro-distance is returned. Applies only to `method='jw'`.
129 |  * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty
130 |  *   factor is only applied when the Jaro distance is larger than `bt`.  Applies
131 |  *   only to `method='jw'` and `p>0`.
132 |  * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to
133 |  *   integer prior to distance calculation)
134 |  * @param nthread `[integer]` scalar. Maximum number of threads to use. 
135 |  *
136 |  *
137 |  * @return
138 |  * `[integer]` vector of `length(x)` with indices in `table`.
139 |  */
140 | SEXP attribute_hidden sd_amatch(SEXP x, SEXP table, SEXP method 
141 |                                   , SEXP nomatch, SEXP matchNA
142 |                                   , SEXP weight, SEXP p, SEXP bt, SEXP q
143 |                                   , SEXP maxDistance, SEXP useBytes
144 |                                   , SEXP nthrd)
145 | {
146 |   static SEXP(*fun)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP) = NULL;
147 |   if (fun == NULL) fun = (SEXP(*)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP)) R_GetCCallable("stringdist","R_amatch");
148 |   return fun(x, table, method, nomatch, matchNA, weight, p, bt, q, maxDistance, useBytes, nthrd);
149 | }
150 | 
151 | /**
152 |  * @brief Compute q-gram counts
153 |  * 
154 |  * @param a `[character]` vector 
155 |  * @param qq `[integer`] scalar. 
156 |  * 
157 |  * @return
158 |  * A `[numeric]` vector of `length(a)*n_qgrams`, where `n_qrams` is the number
159 |  * of different `qgrams` observed in the elements of `a`. The output vector has
160 |  * an attribute called `qgrams`, which is an integer vector of size
161 |  * `q*n_qgrams` containing integer (UTF-32) labels for the q-grams
162 |  * sequentially.
163 |  * 
164 |  */
165 | SEXP attribute_hidden sd_get_qgrams(SEXP a, SEXP qq)
166 | {
167 |   static SEXP(*fun)(SEXP, SEXP) = NULL;
168 |   if (fun == NULL) fun = (SEXP(*)(SEXP, SEXP)) R_GetCCallable("stringdist","R_get_qgrams");
169 |   return fun(a, qq);
170 | }
171 | 
172 | /*
173 | SEXP attribute_hidden sd_lengths(SEXP X)
174 | {
175 |   static SEXP(*fun)(SEXP) = NULL;
176 |   if (fun == NULL) fun = (SEXP(*)(SEXP)) R_GetCCallable("stringdist","R_lengths");
177 |   return fun(X);
178 | }
179 | */
180 | 
181 | /**
182 |  * @brief Lower tridiagonal elements of distance matrix.
183 |  *
184 |  * @param a `[character]` vector
185 |  * @param method `[integer]` scalar, indicating the distance method as follows
186 |  *   @parblock
187 |  *    - 0: Optimal String Alignment (`"osa"`)
188 |  *    - 1: Levenshtein (`"lv"`)
189 |  *    - 2: Damerau-Levenshtein (`"dl"`)
190 |  *    - 3: Hamming (`"hamming"`)
191 |  *    - 4: Longest Common Substring (`"lcs"`)
192 |  *    - 5: q-gram (`"qgram"`)
193 |  *    - 6: cosine (`"cosine"`)
194 |  *    - 7: Jaccard (`"jaccard"`)
195 |  *    - 8: Jaro-Winkler (`"jw"`)
196 |  *    - 9: Soundex (`"soundex"`)
197 |  *   @endparblock
198 |  * @param weight `[numeric]` vector. Edit penalty
199 |  *   @parblock
200 |  *     For `method='osa'` or`'dl'`, the penalty for deletion, insertion,
201 |  *     substitution and transposition, in that order. When `method='lv'`, the
202 |  *     penalty for transposition is ignored. When `method='jw'`, the weights
203 |  *     associated with characters of `a`, characters from `b` and the
204 |  *     transposition weight, in that order.  Weights must be positive and not
205 |  *     exceed 1. `weight` is ignored completely for other methods
206 |  *   @endparblock
207 |  * @param q  `[integer]` scalar. Size of the q-gram; must be nonnegative. Only
208 |  *   applies to `method='qgram'`, `'jaccard'` or `'cosine'`.
209 |  * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The
210 |  *   valid range for `p` is `0 <= p <= 0.25`. If `p=0` (default), the
211 |  *   Jaro-distance is returned. Applies only to `method='jw'`.
212 |  * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty
213 |  *   factor is only applied when the Jaro distance is larger than `bt`.  Applies
214 |  *   only to `method='jw'` and `p>0`.
215 |  * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to
216 |  *   integer prior to distance calculation)
217 |  * @param nthread `[integer]` scalar. Maximum number of threads to use. 
218 |  *
219 |  * @return
220 |  *   A `[numeric]` vector of length `n*(n-1)/2`, where `n=length(a)`. It contains
221 |  *   the positive values of consequtive columns of the distance matrix. Also see
222 |  *   the R-code in  `stringdist:::lower_tri`.
223 |  */
224 | SEXP attribute_hidden sd_lower_tri(SEXP a, SEXP method
225 |                                      , SEXP weight, SEXP p,  SEXP bt, SEXP q
226 |                                      , SEXP useBytes, SEXP nthrd)
227 | {
228 |   static SEXP(*fun)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP) = NULL;
229 |   if (fun == NULL) fun = (SEXP(*)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP)) R_GetCCallable("stringdist","R_lower_tri");
230 |   return fun(a, method, weight, p, bt, q, useBytes, nthrd);
231 | }
232 | 
233 | /**
234 |  * @brief Compute soundex code
235 |  *
236 |  * @param[in] x `[character]` vector
237 |  * @param[in] useBytes `[logical]` scalar.
238 |  *
239 |  * @return
240 |  *
241 |  * A `list` with `length(x)` element. Each element is a length 4 integer vector 
242 |  * representing a 4-character soundex code. The integers are ASCII code points.
243 |  *
244 |  */
245 | SEXP attribute_hidden sd_soundex(SEXP x, SEXP useBytes)
246 | {
247 |   static SEXP(*fun)(SEXP, SEXP) = NULL;
248 |   if (fun == NULL) fun = (SEXP(*)(SEXP, SEXP)) R_GetCCallable("stringdist","R_soundex");
249 |   return fun(x, useBytes);
250 | }
251 | 
252 | /**
253 |  * @brief compute string distances
254 |  * 
255 |  * @param a `[character]` vector
256 |  * @param b `[character]` vector
257 |  * @param method `[integer]` scalar, indicating the distance method as follows
258 |  *   @parblock
259 |  *    - 0: Optimal String Alignment (`"osa"`)
260 |  *    - 1: Levenshtein (`"lv"`)
261 |  *    - 2: Damerau-Levenshtein (`"dl"`)
262 |  *    - 3: Hamming (`"hamming"`)
263 |  *    - 4: Longest Common Substring (`"lcs"`)
264 |  *    - 5: q-gram (`"qgram"`)
265 |  *    - 6: cosine (`"cosine"`)
266 |  *    - 7: Jaccard (`"jaccard"`)
267 |  *    - 8: Jaro-Winkler (`"jw"`)
268 |  *    - 9: Soundex (`"soundex"`)
269 |  *   @endparblock
270 |  * @param weight `[numeric]` vector. Edit penalty
271 |  *   @parblock
272 |  *     For `method='osa'` or`'dl'`, the penalty for deletion, insertion,
273 |  *     substitution and transposition, in that order. When `method='lv'`, the
274 |  *     penalty for transposition is ignored. When `method='jw'`, the weights
275 |  *     associated with characters of `a`, characters from `b` and the
276 |  *     transposition weight, in that order.  Weights must be positive and not
277 |  *     exceed 1. `weight` is ignored completely for other methods
278 |  *   @endparblock
279 |  * @param q  `[integer]` scalar. Size of the q-gram; must be nonnegative. Only
280 |  *   applies to `method='qgram'`, `'jaccard'` or `'cosine'`.
281 |  * @param p `[numeric]` scalar. Penalty factor for Jaro-Winkler distance. The
282 |  *   valid range for `p` is `0 <= p <= 0.25`. If `p=0` (default), the
283 |  *   Jaro-distance is returned. Applies only to `method='jw'`.
284 |  * @param bt `[numeric]` vector. Winkler's boost threshold. Winkler's penalty
285 |  *   factor is only applied when the Jaro distance is larger than `bt`.  Applies
286 |  *   only to `method='jw'` and `p>0`.
287 |  * @param useBytes Perform byte-wise comparison (i.e. do not translate UTF-8 to
288 |  *   integer prior to distance calculation)
289 |  * @param nthread `[integer]` scalar. Maximum number of threads to use. 
290 |  *
291 |  *
292 |  * @return
293 |  *   A `[numeric]` vector of length `max(length(a),length(b))` where the shortest
294 |  *   vector is recycled over the longer (no warnings are given when the longer
295 |  *   length is not an integer multiple of the shorter length).
296 |  *
297 |  *
298 |  */
299 | SEXP attribute_hidden sd_stringdist(SEXP a, SEXP b, SEXP method
300 |                                       , SEXP weight, SEXP p, SEXP bt, SEXP q
301 |                                       , SEXP useBytes, SEXP nthrd)
302 | {
303 |   static SEXP(*fun)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP) = NULL;
304 |   if (fun == NULL) fun = (SEXP(*)(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP)) R_GetCCallable("stringdist","R_stringdist");
305 |   return fun(a, b, method, weight, p, bt, q, useBytes, nthrd);
306 | }
307 | 
308 | 
309 | #ifdef __cplusplus
310 | }
311 | #endif
312 | 
313 | #endif
314 | 


--------------------------------------------------------------------------------
/pkg/src/Rstringdist.c:
--------------------------------------------------------------------------------
  1 | /*  stringdist - a C library of string distance algorithms with an interface to R.
  2 |  *  Copyright (C) 2013  Mark van der Loo
  3 |  *
  4 |  *  This program is free software: you can redistribute it and/or modify
  5 |  *  it under the terms of the GNU General Public License as published by
  6 |  *  the Free Software Foundation, either version 3 of the License, or
  7 |  *  (at your option) any later version.
  8 |  *
  9 |  *  This program is distributed in the hope that it will be useful,
 10 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |  *  GNU General Public License for more details.
 13 |  *
 14 |  *  You should have received a copy of the GNU General Public License
 15 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 16 |  *
 17 |  *  You can contact the author at: mark _dot_ vanderloo _at_ gmail _dot_ com
 18 |  */
 19 | 
 20 | #include <stdlib.h>
 21 | #include <stdint.h>
 22 | #include <math.h>
 23 | #ifdef _OPENMP
 24 | #include <omp.h>
 25 | #endif
 26 | #include "utils.h"
 27 | #include "stringdist.h"
 28 | 
 29 | #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
 30 | #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
 31 | 
 32 | 
 33 | static Stringdist *R_open_stringdist(Distance d, int max_len_a, int max_len_b, SEXP weight, SEXP p, SEXP bt, SEXP q){
 34 | 
 35 |   Stringdist *sd = NULL;
 36 |   if (d == osa || d == lv || d == dl || d == hamming || d == lcs){
 37 |     sd = open_stringdist(d, max_len_a, max_len_b, REAL(weight));
 38 |   } else if (d == qgram || d == cosine || d == jaccard || d == running_cosine){
 39 |     sd = open_stringdist(d, max_len_a, max_len_b, (unsigned int) INTEGER(q)[0]);
 40 |   } else if ( d == jw ){
 41 |     sd = open_stringdist(d, max_len_a, max_len_b, REAL(weight), REAL(p)[0], REAL(bt)[0]);
 42 |   } else if (d == soundex) {
 43 |     sd = open_stringdist(d, max_len_a, max_len_b);
 44 |   }
 45 |   if ( sd == NULL ){
 46 |     error("Could not allocate enough memory");
 47 |   }
 48 |   return sd;
 49 | }
 50 | 
 51 | 
 52 | 
 53 | SEXP R_stringdist(SEXP a, SEXP b, SEXP method
 54 |   , SEXP weight, SEXP p, SEXP bt, SEXP q
 55 |   , SEXP useBytes, SEXP nthrd){
 56 | 
 57 |   int na = length(a)
 58 |     , nb = length(b)
 59 |     , bytes = INTEGER(useBytes)[0]
 60 |     , ml_a = max_length(a)
 61 |     , ml_b = max_length(b)
 62 |     , nt = (na > nb) ? na : nb
 63 |     , intdist = TYPEOF(a) == VECSXP ? 1 : 0; // expect lists of integers? 
 64 |  
 65 |   // output vector
 66 |   SEXP yy;
 67 |   yy = PROTECT(allocVector(REALSXP, nt));
 68 |   double *y = REAL(yy);
 69 | 
 70 |   #ifdef _OPENMP 
 71 |   int  nthreads = MIN(INTEGER(nthrd)[0],MAX(na,nb));
 72 |   #pragma omp parallel num_threads(nthreads) default(none) \
 73 |       shared(y,na,nb, R_PosInf, NA_REAL, bytes, intdist, method, weight, p, bt, q, ml_a, ml_b, nt, a, b)
 74 |   #endif
 75 |   {
 76 | 
 77 |     Stringdist *sd = R_open_stringdist( (Distance) INTEGER(method)[0]
 78 |         , ml_a, ml_b
 79 |         , weight
 80 |         , p
 81 |         , bt
 82 |         , q
 83 |     );
 84 | 
 85 |     unsigned int *s = NULL, *t = NULL;
 86 |     s = (unsigned int *) malloc(( 2L + ml_a + ml_b) * sizeof(int));
 87 | 
 88 |     if ( (sd==NULL) | (bytes && s == NULL) ) nt = -1;
 89 |     t = s + ml_a + 1L;
 90 |       
 91 |     int len_s, len_t, isna_s, isna_t
 92 |       , i = 0, j = 0, ID = 0, num_threads = 1;
 93 | 
 94 |     #ifdef _OPENMP
 95 |     ID = omp_get_thread_num();
 96 |     num_threads = omp_get_num_threads();
 97 |     i = recycle(ID-num_threads, num_threads, na);
 98 |     j = recycle(ID-num_threads, num_threads, nb);
 99 |     #endif
100 |     for ( int k=ID; k < nt; k += num_threads ){
101 |       get_elem(a, i, bytes, intdist, &len_s, &isna_s, s);
102 |       get_elem(b, j, bytes, intdist, &len_t, &isna_t, t);
103 |       if (isna_s || isna_t){
104 |         y[k] = NA_REAL;
105 |       } else {
106 |         y[k] = stringdist(sd, s, len_s, t, len_t);
107 |         if ( y[k] < 0 ) y[k] = R_PosInf;
108 |       }
109 |       i = recycle(i, num_threads, na);
110 |       j = recycle(j, num_threads, nb);
111 |     }
112 |     
113 | 
114 |     close_stringdist(sd);
115 | 
116 |     free(s);
117 |   } // end of parallel region
118 | 
119 |   UNPROTECT(1);
120 |   if (nt < 0 ) error("Unable to allocate enough memory");
121 |   return(yy);
122 | }
123 | 
124 | /* amatch
125 |  *
126 |  */
127 | SEXP R_amatch(SEXP x, SEXP table, SEXP method 
128 |   , SEXP nomatch, SEXP matchNA
129 |   , SEXP weight, SEXP p, SEXP bt, SEXP q
130 |   , SEXP maxDistance, SEXP useBytes
131 |   , SEXP nthrd){
132 | 
133 | 
134 |   int nx = length(x)
135 |     , ntable = length(table)
136 |     , no_match = INTEGER(nomatch)[0]
137 |     , match_na = INTEGER(matchNA)[0]
138 |     , bytes = INTEGER(useBytes)[0]
139 |     , ml_x = max_length(x)
140 |     , ml_t = max_length(table)
141 |     , intdist = TYPEOF(x) == VECSXP ? 1 : 0; // list of integers?
142 | 
143 | 
144 |   double maxDist = REAL(maxDistance)[0];
145 | 
146 |   // convert to integer. 
147 |   Stringset *X = new_stringset(x, bytes, intdist);
148 |   Stringset *T = new_stringset(table, bytes, intdist);
149 | 
150 |   // output vector
151 |   SEXP yy;
152 |   yy = PROTECT(allocVector(INTSXP, nx));
153 |   int *y = INTEGER(yy);
154 |   
155 |   #ifdef _OPENMP
156 |   int nthreads = MAX(MIN(INTEGER(nthrd)[0],nx),0);
157 |   #pragma omp parallel num_threads(nthreads) default(none) \
158 |     shared(X, T, y, R_PosInf, NA_INTEGER, nx, ntable, no_match, match_na, ml_x, ml_t, method, weight, p, bt, q, maxDist)
159 |   #endif
160 |   {
161 |     /* claim space for workhorse */
162 | 
163 |     Stringdist *sd = R_open_stringdist( (Distance) INTEGER(method)[0]
164 |         , ml_x, ml_t
165 |         , weight
166 |         , p
167 |         , bt
168 |         , q
169 |     );
170 | 
171 |     double d = R_PosInf, d1 = R_PosInf;
172 |     int index, len_X, len_T;
173 |     unsigned int *str;
174 |     unsigned int **tab;
175 | 
176 |     #ifdef _OPENMP
177 |     #pragma omp for
178 |     #endif
179 |     for ( int i=0; i<nx; i++){
180 |       index = no_match;
181 |       len_X = X->str_len[i]; 
182 |       d1 = R_PosInf;
183 |       str = X->string[i];
184 |       tab = T->string;
185 |       for ( int j=0; j<ntable; j++, tab++){
186 |         len_T = T->str_len[j];
187 |         if (len_X != NA_INTEGER && len_T != NA_INTEGER ){        // both are char (usual case)
188 |           d = stringdist(sd, str, len_X, *tab, len_T);
189 |           if ( d <= maxDist && d < d1){ 
190 |             index = j + 1;
191 |             if ( fabs(d) < 1e-14 ){ 
192 |               break; // exact match
193 |             }
194 |             d1 = d;
195 |           }
196 |         } else if ( len_X == NA_INTEGER && len_T == NA_INTEGER ) {  // both are NA
197 |           index = match_na ? j + 1 : no_match;
198 |           break;
199 |         }
200 |       }
201 |       y[i] = index;
202 |     }
203 |     str=NULL;
204 |     tab=NULL;
205 |     close_stringdist(sd);
206 |   } // end of parallel region
207 |   free_stringset(X);
208 |   free_stringset(T);
209 |   UNPROTECT(1);
210 | 
211 |   return(yy);
212 | } // end R_amatch
213 | 
214 | 
215 | // Lower tridiagonal distance matrix for a single vector argument.
216 | 
217 | static int get_j(R_xlen_t k, R_xlen_t n){
218 |   double nd = (double) n;
219 |   double kd = (double) k;
220 |   double u = ceil( (2.*nd - 3.)/2. - sqrt(pow(nd-.5,2.) - 2.*(kd + 1.)) );
221 | 
222 |   return (int) u;
223 | }
224 | 
225 | /* max n for objects of length n(n-1).
226 |  * 
227 | */
228 | #ifdef LONG_VECTOR_SUPPORT
229 | #define MAXN ( (R_xlen_t) (0.5 + 1.5 * sqrt((double) R_XLEN_T_MAX)) )
230 | #else
231 | #define MAXN ( (R_xlen_t) (0.5 + 1.5 * sqrt((double) R_LEN_T_MAX)) )
232 | #endif
233 | 
234 | SEXP R_lower_tri(SEXP a, SEXP method
235 |   , SEXP weight, SEXP p,  SEXP bt, SEXP q
236 |   , SEXP useBytes, SEXP nthrd){
237 | 
238 |   int bytes = INTEGER(useBytes)[0]
239 |     , ml = max_length(a)
240 |     , intdist = TYPEOF(a) == VECSXP ? 1 : 0; // expect list of integer vectors? 
241 | 
242 |   // Long vectors on platforms where LONG_VECTOR_SUPPORT is defined.
243 |   R_xlen_t n = xlength(a)
244 |     , N = n*(n-1)/2;
245 | 
246 |   if ( n > MAXN ){
247 |     error("Length of input vector (%1.0f) exceeds maximum allowed for this platform (%1.0f)",(double) n,(double) MAXN);
248 |   }  
249 |  
250 |  
251 |   // output vector
252 |   SEXP yy;
253 |   yy = PROTECT(allocVector(REALSXP, N));
254 |   // nothing to do if n=1 
255 |   if (n == 1L) goto end ;
256 |   double *y = REAL(yy);
257 | 
258 | 
259 |   #ifdef _OPENMP 
260 |   int  nthreads = MIN(INTEGER(nthrd)[0],N);
261 |   nthreads = MIN(nthreads, n);
262 |   #pragma omp parallel num_threads(nthreads) default(none) \
263 |       shared(y,n,N, R_PosInf, NA_REAL, bytes, intdist, method, weight, p, bt, q, ml, a)
264 |   #endif
265 |   {
266 | 
267 |     Stringdist *sd = R_open_stringdist( (Distance) INTEGER(method)[0]
268 |         , ml, ml
269 |         , weight
270 |         , p
271 |         , bt
272 |         , q
273 |     );
274 | 
275 |     unsigned int *s = NULL, *t = NULL;
276 |     s = (unsigned int *) malloc(( 2L + 2*ml) * sizeof(int));
277 | 
278 |     if ( (sd==NULL) | (bytes && s == NULL) ) n = -1;
279 |     t = s + ml + 1L;
280 |       
281 |     int len_s, len_t, isna_s, isna_t
282 |       , j = 0
283 |       , thread_id = 0
284 |       , n_threads = 1;
285 | 
286 |     R_xlen_t pp = 0
287 |       , k_start = 0
288 |       , i = 0
289 |       , col_max = n-1
290 |       , k_end = N;
291 | 
292 |     #ifdef _OPENMP
293 |       thread_id  = omp_get_thread_num();
294 |       n_threads = omp_get_num_threads();
295 |     #endif
296 |       // some administration to parallelize the loop.
297 |       pp = N / n_threads;
298 |       k_start = thread_id * pp;
299 |       k_end   = (thread_id < n_threads - 1 ) ? k_start + pp : N;
300 |       j = get_j(k_start, n);
301 |       i = k_start + j * (j - 2*n + 3)/2;
302 |     for ( R_xlen_t k=k_start; k < k_end; k++ ){
303 |       i++;
304 |       get_elem(a, i, bytes, intdist, &len_s, &isna_s, s);
305 |       get_elem(a, j, bytes, intdist, &len_t, &isna_t, t);
306 | 
307 |       if (isna_s || isna_t){
308 |         y[k] = NA_REAL;
309 |       } else {
310 |         y[k] = stringdist(sd, s, len_s, t, len_t);
311 |         if ( y[k] < 0 ) y[k] = R_PosInf;
312 |       }
313 |       if ( i == col_max ){
314 |         j++;
315 |         i = j;
316 |       }
317 |     }
318 | 
319 |     free(s);
320 |     close_stringdist(sd);
321 |   } // end of parallel region
322 | 
323 |   end:
324 |   UNPROTECT(1);
325 |   if (n < 0 ) error("Unable to allocate enough memory");
326 |   return(yy);
327 | }
328 | 
329 | // afind
330 | // For each string in 'a', return the starting position of
331 | // the best match with 'pattern'.
332 | SEXP R_afind(SEXP a, SEXP pattern, SEXP width
333 |   , SEXP method, SEXP weight, SEXP p, SEXP bt
334 |   , SEXP q, SEXP useBytes, SEXP nthrd)
335 | {
336 |   
337 |   int na = length(a)              // nr of  texts to search
338 |     , npat = length(pattern)      // nr of patterns
339 |     , ml_a   = max_length(a)      // max length of searched string
340 |     , ml_b = max_length(pattern)  // max length of the pattern.
341 |     , intdist = 0                 // no distances between integer sequences (yet)
342 |     , bytes = INTEGER(useBytes)[0];
343 | 
344 | 
345 |   int *window = INTEGER(width);   // access the window widths.
346 | 
347 |   // output list
348 |   SEXP out_list;
349 |   out_list = PROTECT(allocVector(VECSXP, 2));
350 | 
351 |   // output location
352 |   SEXP out_loc = PROTECT(allocMatrix(INTSXP, na, npat));
353 |   SET_VECTOR_ELT(out_list,0, out_loc); 
354 |   int *yloc = INTEGER(out_loc);
355 | 
356 |   // output distance
357 |   SEXP out_dist =  PROTECT(allocMatrix(REALSXP, na, npat));
358 |   SET_VECTOR_ELT(out_list,1, out_dist);
359 |   double *ydist = REAL(out_dist);
360 |   // Setup stringdist structure.
361 |   // find maximum window length
362 |   int max_window = 0;
363 |   for ( int i=0; i<npat; i++){
364 |     if (max_window < window[i]){
365 |       max_window = window[i];
366 |     }
367 |   }
368 | 
369 |  
370 |   #ifdef _OPENMP 
371 |   int  nthreads = MIN(INTEGER(nthrd)[0],na);
372 |   #pragma omp parallel num_threads(nthreads) default(none) \
373 |       shared(yloc,ydist, na, npat, R_PosInf, NA_REAL, NA_INTEGER, bytes, intdist, \
374 |       method, weight, p, bt, q, ml_a, ml_b, window, max_window, a, pattern)
375 |   #endif
376 |   {  // start parallel region
377 | 
378 | 
379 |     Stringdist *sd = R_open_stringdist( (Distance) INTEGER(method)[0]
380 |         , max_window, ml_b
381 |         , weight
382 |         , p
383 |         , bt
384 |         , q
385 |     );
386 | 
387 |     // allocate memory to store the strings
388 |     unsigned int *s = NULL, *t = NULL;
389 |     s = (unsigned int *) malloc(( 2L + ml_a + ml_b) * sizeof(int));
390 |     
391 |     // t is the location of the pattern
392 |     t = s + ml_a + 1L;
393 |     
394 |     int len_s, len_t, isna_s, isna_t, max_k, k_min, current_window, offset;
395 |     int ID=0, num_threads=1;
396 |     
397 |     double d, d_min;
398 | 
399 |     #ifdef _OPENMP
400 |     ID = omp_get_thread_num();
401 |     num_threads = omp_get_num_threads();
402 |     #endif
403 |     for ( int i = ID; i < na; i += num_threads ){
404 |       // get text to search
405 |       get_elem(a, i, bytes, intdist, &len_s, &isna_s, s);
406 |       for( int j = 0; j < npat; j++){
407 |         // get pattern
408 |         get_elem(pattern, j, bytes, intdist, &len_t, &isna_t, t);
409 |         current_window = window[j];
410 |         offset = j*na;
411 |         if (isna_s || isna_t){ // something to search in, or find?
412 |           yloc[offset + i]  = NA_INTEGER;
413 |           ydist[offset + i] = NA_REAL;
414 |         } else if ( current_window >= len_s ){ // is the text shorter than the window?
415 |           yloc[offset + i]  = 1L;
416 |           ydist[offset + i] = stringdist(sd, s, len_s, t, len_t); 
417 |         } else { // slide window over text and compute distances
418 |           max_k = len_s - current_window;
419 |           d_min = R_PosInf;
420 |           k_min = 0;
421 |           for (int k = 0; k <= max_k; k++){
422 |             d = stringdist(sd, s + k, current_window, t, len_t);
423 |             if ( d < d_min ){
424 |               d_min = d;
425 |               k_min = k;
426 |             }
427 |           } // end loop over windows
428 |           yloc[offset + i]  = k_min + 1;
429 |           ydist[offset + i] = d_min;
430 |           reset_stringdist(sd);
431 |         }
432 |       } // end loop over patterns
433 |     } // end loop over strings
434 |     close_stringdist(sd);
435 |   } // end parallel region
436 |   UNPROTECT(3);
437 |   return(out_list);
438 | 
439 | }
440 | 
441 | // helper function to determine  whether all is INTSXP
442 | 
443 | SEXP R_all_int(SEXP X){
444 |   
445 |   SEXP all_int;
446 |   all_int = PROTECT(allocVector(LGLSXP,1L));
447 | 
448 |   int n = length(X);
449 |   LOGICAL(all_int)[0] =  1L;
450 |   for (int i=0; i<n; i++){
451 |     if (TYPEOF(VECTOR_ELT(X,i)) != INTSXP){
452 |       LOGICAL(all_int)[0] = 0L;
453 |       break;
454 |     }
455 |   }
456 | 
457 |   UNPROTECT(1);
458 |   return all_int;
459 | 
460 | }
461 | 
462 | // helper function determining the lengths of all elements of a list.
463 | 
464 | SEXP R_lengths(SEXP X){
465 |   PROTECT(X);
466 |   int n = length(X);
467 |   SEXP out;
468 |   out = PROTECT(allocVector(INTSXP, n));
469 |   
470 |   int *outp = INTEGER(out);  
471 | 
472 |   for ( int i=0; i<n; i++, outp++ ) {
473 |     (*outp) = length(VECTOR_ELT(X,i));
474 |   }
475 |   UNPROTECT(2);
476 |   return out;
477 | }
478 | 
479 | 
480 | 
481 | 
482 | 
483 | 
484 | 
485 | 
486 | 
487 | 
488 | 
489 | 
490 | 
491 | 
492 | 
493 | 


--------------------------------------------------------------------------------