0}
9 | #'
10 | #'
11 | #' @section Note on \code{NA} handling:
12 | #' \code{R}'s native \code{\link[base]{match}} function matches \code{NA} with
13 | #' \code{NA}. This may feel inconsistent with \code{R}'s usual \code{NA}
14 | #' handling, since for example \code{NA==NA} yields
15 | #' \code{NA} rather than \code{TRUE}. In most cases, one may reason about the
16 | #' behaviour under \code{NA} along the lines of ``if one of the arguments is
17 | #' \code{NA}, the result shall be \code{NA}'', simply because not all
18 | #' information necessary to execute the function is available. One uses special
19 | #' functions such as \code{is.na}, \code{is.null} \emph{etc.} to handle special
20 | #' values.
21 | #'
22 | #' The \code{amatch} function mimics the behaviour of \code{\link[base]{match}}
23 | #' by default: \code{NA} is matched with \code{NA} and with nothing else. Note
24 | #' that this is inconsistent with the behaviour of \code{\link{stringdist}}
25 | #' since \code{stringdist} yields \code{NA} when at least one of the arguments
26 | #' is \code{NA}. The same inconsistency exists between \code{\link[base]{match}}
27 | #' and \code{\link[utils]{adist}}. In \code{amatch} this behaviour can be
28 | #' controlled by setting \code{matchNA=FALSE}. In that case, if any of the
29 | #' arguments in \code{x} is \code{NA}, the \code{nomatch} value is returned,
30 | #' regardless of whether \code{NA} is present in \code{table}. In
31 | #' \code{\link[base]{match}} the behaviour can be controlled by setting the
32 | #' \code{incomparables} option.
33 | #'
34 | #'
35 | #' @param x elements to be approximately matched: will be coerced to
36 | #' \code{character} unless it is a list consisting of \code{integer} vectors.
37 | #' @param table lookup table for matching. Will be coerced to \code{character}
38 | #' unless it is a list consisting of \code{integer} vectors.
39 | #' @param nomatch The value to be returned when no match is found. This is
40 | #' coerced to integer.
41 | #' @param matchNA Should \code{NA}'s be matched? Default behaviour mimics the
42 | #' behaviour of base \code{\link[base]{match}}, meaning that \code{NA} matches
43 | #' \code{NA} (see also the note on \code{NA} handling below).
44 | #' @param method Matching algorithm to use. See \code{\link{stringdist-metrics}}.
45 | #' @param useBytes Perform byte-wise comparison. See \code{\link{stringdist-encoding}}.
46 | #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for
47 | #' deletion, insertion, substitution and transposition, in that order. When
48 | #' \code{method='lv'}, the penalty for transposition is ignored. When
49 | #' \code{method='jw'}, the weights associated with characters of \code{a},
50 | #' characters from \code{b} and the transposition weight, in that order.
51 | #' Weights must be positive and not exceed 1. \code{weight} is ignored
52 | #' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
53 | #' \code{'jaccard'}, \code{'lcs'}, or \code{'soundex'}.
54 | #' @param maxDist Elements in \code{x} will not be matched with elements of
55 | #' \code{table} if their distance is larger than \code{maxDist}. Note that the
56 | #' maximum distance between strings depends on the method: it should always be
57 | #' specified.
58 | #' @param nthread Number of threads used by the underlying C-code. A sensible
59 | #' default is chosen, see \code{\link{stringdist-parallelization}}.
60 | #'
61 | #' @param q q-gram size, only when method is \code{'qgram'}, \code{'jaccard'},
62 | #' or \code{'cosine'}.
63 | #' @param p Winkler's 'prefix' parameter for Jaro-Winkler distance, with
64 | #' \eqn{0\leq p\leq0.25}. Only when method is \code{'jw'}
65 | #' @param bt Winkler's boost threshold. Winkler's prefix factor is
66 | #' only applied when the Jaro distance is larger than \code{bt}.
67 | #' Applies only to \code{method='jw'} and \code{p>0}.
68 | #'
69 | #' @return \code{amatch} returns the position of the closest match of \code{x}
70 | #' in \code{table}. When multiple matches with the same smallest distance
71 | #' metric exist, the first one is returned. \code{ain} returns a
72 | #' \code{logical} vector of length \code{length(x)} indicating whether an
73 | #' element of \code{x} approximately matches an element in \code{table}.
74 | #'
75 | #' @family matching
76 | #'
77 | #' @example ../examples/amatch.R
78 | #' @export
amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE
  , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard", "jw", "soundex")
  , useBytes = FALSE
  , weight=c(d=1,i=1,s=1,t=1)
  , maxDist=0.1, q=1, p=0, bt=0
  , nthread = getOption("sd_num_thread")){

  # Both inputs are always converted to character vectors.
  x <- as.character(x)
  table <- as.character(table)

  # Unless byte-wise comparison is requested, convert both inputs to UTF-8.
  if (!useBytes){
    x <- enc2utf8(x)
    table <- enc2utf8(table)
  }

  # Resolve the (possibly abbreviated) method to one full name, so that every
  # check below works on a length-one string.
  method <- match.arg(method)
  stopifnot(
      all(is.finite(weight))
    , all(weight > 0)
    , all(weight <= 1)
    , q >= 0
    , p <= 0.25
    , p >= 0
    , matchNA %in% c(TRUE,FALSE)
    , maxDist > 0
    , is.logical(useBytes)
    # 'osa' and 'dl' need at least four weights; 'lv' and 'jw' at least three.
    , !(method %in% c('osa','dl')) || length(weight) >= 4
    , !(method %in% c('lv','jw'))  || length(weight) >= 3
    , length(nthread) == 1
    , is.numeric(nthread)
    , nthread > 0
  )

  # For 'jw' the first two weights are swapped before they are passed on.
  if (method == 'jw') weight <- weight[c(2,1,3)]

  # Look the method up in METHODS. The name is kept in 'method' so the error
  # message reports the requested method rather than the failed lookup (NA).
  method_code <- METHODS[method]
  if ( is.na(method_code) ){
    stop(sprintf("method '%s' is not defined",method))
  }

  .Call("R_amatch", x, table, method_code
    , as.integer(nomatch), as.integer(matchNA)
    , as.double(weight), as.double(p), as.double(bt)
    , as.integer(q) , as.double(maxDist), as.integer(useBytes)
    , as.integer(nthread)
    , PACKAGE="stringdist"
  )

}
126 |
127 | #' @param ... parameters to pass to \code{amatch} (except \code{nomatch})
128 | #'
129 | #'
130 | #' @rdname amatch
131 | #' @export
ain <- function(x, table, ...){
  # amatch() is asked to report 0 for "no match", so any position above 0
  # means an approximate match was found.
  positions <- amatch(x, table, nomatch=0, ...)
  positions > 0
}
135 |
136 | #' Approximate matching for integer sequences.
137 | #'
138 | #'
139 | #' For a \code{list} of integer vectors \code{x}, find the closest matches in a
140 | #' \code{list} of integer or numeric vectors in \code{table}.
141 | #'
142 | #' @section Notes:
143 | #' \code{seq_ain} is currently defined as
144 | #'
145 | #' \code{seq_ain <- function(x,table,...) seq_amatch(x, table, nomatch=0, ...) > 0}
146 | #'
147 | #' All input vectors are converted with \code{as.integer}. This causes truncation for numeric
148 | #' vectors (e.g. \code{pi} will be treated as \code{3L}).
149 | #'
150 | #'
151 | #' @param x (\code{list} of) \code{integer} or \code{numeric} vector(s) to be
152 | #' approximately matched. Will be converted with \code{as.integer}.
153 | #' @param table (\code{list} of) \code{integer} or \code{numeric} vector(s)
154 | #' serving as lookup table for matching. Will be converted with
155 | #' \code{as.integer}.
156 | #' @param nomatch The value to be returned when no match is found. This is
157 | #' coerced to integer.
158 | #' @param matchNA Should \code{NA}'s be matched? Default behaviour mimics the
159 | #' behaviour of base \code{\link[base]{match}}, meaning that \code{NA} matches
160 | #' \code{NA}. With \code{NA}, we mean a missing entry in the \code{list}, represented as \code{NA_integer_}.
161 | #' If one of the integer sequences stored in the list has an \code{NA} entry,
162 | #' this is just treated as another integer (the representation of
163 | #' \code{NA_integer_}).
164 | #' @param method Matching algorithm to use. See \code{\link{stringdist-metrics}}.
165 | #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for
166 | #' deletion, insertion, substitution and transposition, in that order. When
167 | #' \code{method='lv'}, the penalty for transposition is ignored. When
168 | #' \code{method='jw'}, the weights associated with integers in elements of \code{a},
169 | #' integers in elements of \code{b} and the transposition weight, in that order.
170 | #' Weights must be positive and not exceed 1. \code{weight} is ignored
171 | #' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
172 | #' \code{'jaccard'}, or \code{'lcs'}.
173 | #' @param maxDist Elements in \code{x} will not be matched with elements of
174 | #' \code{table} if their distance is larger than \code{maxDist}. Note that the
175 | #' maximum distance between strings depends on the method: it should always be
176 | #' specified.
177 | #' @param nthread Number of threads used by the underlying C-code. A sensible
178 | #' default is chosen, see \code{\link{stringdist-parallelization}}.
179 | #'
180 | #' @param q q-gram size, only when method is \code{'qgram'}, \code{'jaccard'},
181 | #' or \code{'cosine'}.
182 | #' @param p Winkler's prefix parameter for Jaro-Winkler distance, with
183 | #' \eqn{0\leq p\leq0.25}. Only when method is \code{'jw'}
184 | #' @param bt Winkler's boost threshold. Winkler's prefix factor is
185 | #' only applied when the Jaro distance is larger than \code{bt}.
186 | #' Applies only to \code{method='jw'} and \code{p>0}.
187 | #' @return \code{seq_amatch} returns the position of the closest match of \code{x}
188 | #' in \code{table}. When multiple matches with the same minimal distance
189 | #' metric exist, the first one is returned. \code{seq_ain} returns a
190 | #' \code{logical} vector of length \code{length(x)} indicating whether an
191 | #' element of \code{x} approximately matches an element in \code{table}.
192 | #'
193 | #' @seealso \code{\link{seq_dist}}, \code{\link{seq_sim}}, \code{\link{seq_qgrams}}
194 | #'
195 | #' @example ../examples/seq_amatch.R
196 | #' @export
seq_amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE
  , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard", "jw")
  , weight=c(d=1,i=1,s=1,t=1)
  , maxDist=0.1, q=1, p=0, bt=0
  , nthread = getOption("sd_num_thread")){

  # Both inputs are passed through ensure_int_list() before matching.
  x <- ensure_int_list(x)
  table <- ensure_int_list(table)

  # Resolve the (possibly abbreviated) method to one full name, so that every
  # check below works on a length-one string.
  method <- match.arg(method)
  stopifnot(
      all(is.finite(weight))
    , all(weight > 0)
    , all(weight <= 1)
    , q >= 0
    , p <= 0.25
    , p >= 0
    , matchNA %in% c(TRUE,FALSE)
    # 'osa' and 'dl' need at least four weights; 'lv' and 'jw' at least three.
    , !(method %in% c('osa','dl')) || length(weight) >= 4
    , !(method %in% c('lv','jw'))  || length(weight) >= 3
    , length(nthread) == 1
    , is.numeric(nthread)
    , nthread > 0
  )

  # For 'jw' the first two weights are swapped before they are passed on.
  if (method == 'jw') weight <- weight[c(2,1,3)]

  # Look the method up in METHODS. The name is kept in 'method' so the error
  # message reports the requested method rather than the failed lookup (NA).
  method_code <- METHODS[method]
  if ( is.na(method_code) ){
    stop(sprintf("method '%s' is not defined",method))
  }

  # The useBytes slot of the C routine is fixed at 0L for integer sequences.
  .Call("R_amatch", x, table, method_code
    , as.integer(nomatch), as.integer(matchNA)
    , as.double(weight), as.double(p), as.double(bt)
    , as.integer(q) , as.double(maxDist), 0L
    , as.integer(nthread)
    , PACKAGE="stringdist"
  )
}
235 |
236 | #' @param ... parameters to pass to \code{seq_amatch} (except \code{nomatch})
237 | #'
238 | #'
239 | #' @rdname seq_amatch
240 | #' @export
seq_ain <- function(x, table, ...){
  # seq_amatch() is asked to report 0 for "no match", so any position above 0
  # means an approximate match was found.
  positions <- seq_amatch(x, table, nomatch=0, ...)
  positions > 0
}
244 |
245 |
246 |
247 |
--------------------------------------------------------------------------------
/pkg/NEWS:
--------------------------------------------------------------------------------
1 | version 0.9.15
2 | - Fixed issue with zero-length 'nthread' argument in all exported functions
3 | with this parameter. (Thanks to Brian Ripley for the notification and pointer
4 | to the problem)
5 |
6 | version 0.9.14
7 | - Fixed issue with zero-length strings in 'qgrams' (Thanks to Brian Ripley
8 | for the notification and pointer to the origin of the problem)
9 |
10 | version 0.9.12
11 | - apparently R_xlen_t is long long int on CLANG/Windows and long int on gcc-13/debian
12 |
13 | version 0.9.11
14 | - Fixed a warning in gcc-13: changed specifier from %d to %ld.
15 | (Thanks to Kurt Hornik for the head's up)
16 |
17 | version 0.9.10
18 | - Fixed another warning generated by new C compiler that I overlooked.
19 | (Thanks to the CRAN team for the head's up)
20 |
21 | version 0.9.9
22 | - Fixed warnings generated by new C compiler. (function prototypes must
23 | now be defined completely). (Thanks to Kurt Hornik for the head's up.)
24 |
25 | version 0.9.8
26 | - Fixed some issues on C-level causing problems with the
27 | CLANG compiler. (Thanks to Brian Ripley for not only
28 | reporting this, but also sending updated code with
29 | fixes).
30 |
31 |
32 | version 0.9.7
33 | - Fixes in use of INTEGER() and VECTOR_ELT() after updates in R's C API.
34 | this affected 'afind' and 'max_length' (internally). (Thanks to Luke
35 | Tierney and Kurt Hornik for the notification).
36 | - Fix in 'amatch' causing utf-8 characters to be ignored in some
37 | cases (thanks to Joan Mime for reporting #78).
38 | - Fix: segfault when 'afind' was called with many search patterns or many
39 | texts to be searched.
40 | - Fix: stringsimmatrix was not normalized correctly (Thanks to Tamas Ferenci
41 | for reporting GH).
42 |
43 |
44 | version 0.9.6.3
45 | - Resubmit. Fixed an URL redirect that was detected by CRAN.
46 |
47 | version 0.9.6.2
48 | - Resubmit. Fixed url issues detected by CRAN, added doi to description
49 | as per CRAN request.
50 |
51 | version 0.9.6.1
52 | - Bugfix: afind/grab/grabl returned wrong results on MacOS only.
53 | (thanks to Prof. Brian Ripley for the notification and for running tests
54 | on his personal machine and to Tomas Kalibera for making the
55 | ubuntu-rchk docker image available).
56 |
57 | version 0.9.6
58 | - New function 'afind': find approximate matches in text based on string distance.
59 | - New functions 'grab', 'grabl': fuzzy matching equivalent to 'grep' and 'grepl'.
60 | - New function 'extract': fuzzy matching equivalent of stringr::str_extract.
61 | - New algorithm 'running_cosine': fast fuzzy text search using cosine distance.
62 | - New function 'stringsimmatrix' (Thanks to Johannes Gruber).
63 | - Number of threads used is now reported when loading 'stringdist'.
64 | - Internal fixes (in some cases class() == 'class' was used).
65 |
66 | version 0.9.5.5
67 | - Changed two URLs to canonical form in README.md (https://) to comply with
68 | CRAN policy.
69 |
70 | version 0.9.5.4
71 | - Some tests using seq_dist() would fail unpredictably when the input was
72 | defined with lazily evaluated arguments, e.g. list(1:3, 2:4); but only in the
73 | context of NSE by a test suite ('tinytest', 'testthat'). Tests were replaced by
74 | literal versions, e.g. list(c(1,2,3), c(2,3,4)).
75 |
76 | version 0.9.5.3
77 | - Update in test suite to stay on CRAN
78 |
79 | version 0.9.5.2
80 | - RJournal paper and C/C++ api docs are now presented as vignette.
81 | - Switched to tinytest framework
82 | - Fix: stringdist could cause a segfault for edit distances between very long
83 | strings. (Thanks to GH user gllipatz)
84 |
85 |
86 | version 0.9.5.1
87 | - Fixed header file for C API
88 |
89 | version 0.9.5.0
90 | - New contributor: Chris Muir
91 | - C/C++ API now exposed for packages LinkingTo stringdist. See `?stringdist_api`
92 | - Arguments 'maxDist', 'ncores', 'cluster' of functions 'stringdist' and
93 | 'stringdistmatrix' have been deprecated for several years and are now
94 | removed.
95 | - Fixed edge case where cosine distance with q=1, between strings of repeating characters
96 | yielded Inf (Thanks to Markus Dumke)
97 |
98 |
99 | version 0.9.4.6
100 | - Fixed argument passing error in lower_tri (thanks to Kurt Hornik)
101 |
102 | version 0.9.4.5
103 | - New argument 'bt' implementing Winkler's boost threshold for the Jaro-Winkler distance
104 | - stringdist(a,b,method="qgram") returns correct value when q>nchar(a) (or b).
105 | (Thanks to Giora Simchoni). Also affects stringdistmatrix, amatch, seq_dist,
106 | and seq_distmatrix.
107 | - registered native routines as now recommended by CRAN
108 |
109 | version 0.9.4.4
110 | - updated default nr of threads to comply to CRAN policy (thanks to Kurt Hornik).
111 | The default nr of cores now equals OMP_NUM_THREADS if set. See
112 | ?'stringdist-parallelization' for the full policy.
113 |
114 | version 0.9.4.2
115 | - bugfix in stringdistmatrix(a): value of p, for jw-distance was ignored
116 | (thanks to Max Fritsche)
117 | - bugfix in stringdistmatrix(a): Would segfault on q-gram w/input > ~7k strings
118 | and q>1 (thanks to Connor McKay)
119 | - bugfix in jaccard distance: distance not always correct when passing multiple
120 | strings (thanks to Robert Carlson)
121 |
122 | version 0.9.4.1
123 | - stringdistmatrix(a) now outputs long vectors (issue #45, thanks to Wouter
124 | Touw). For stringdistmatrix(a,b) this was already the case, but the length
125 | of rows and columns remains restricted to 2^31-1 since long input vectors are
126 | not supported (yet).
127 | - bugfix in osa/dl/lv distances w/unequal edit weights (thanks to Nathalia Potocka)
128 |
129 | version 0.9.4
130 | - bugfix: edge case for zero-size for lower tridiagonal dist matrices (caused
131 | UBSAN to fire, but gave correct results).
132 | - bugfix in jw distance: not symmetric for certain cases (thanks to github user gtumuluri)
133 |
134 | version 0.9.3
135 | - new function for tokenizing integer sequences: seq_qgrams
136 | - new function for matching integer sequences: seq_amatch
137 | - new functions computing distances between integer sequences: seq_dist, seq_distmatrix
138 | - q-gram based distances are now always 0 when q=0 (used to be Inf if at least
139 | one of the arguments was not the empty string)
140 | - stringdist, stringdistmatrix now emit warning when presented with 'list' argument
141 | - small c-side code optimizations
142 | - bugfix in dl, lv, osa distance: weights were not taken into account properly
143 | (thanks to Zach Price)
144 |
145 | version 0.9.2
146 | - Update fixing some errors (missing documentation, tests) in the 0.9.1 release.
147 | - Fixed a few possible memory leaks.
148 |
149 | version 0.9.1
150 | - Argument 'useNames' of 'stringdistmatrix' now accepts 'none', 'strings', and 'names'
151 | - New function 'stringsim' computes string similarities between 0 and 1 based on 'stringdist'
152 | - Calling 'stringdistmatrix' with a single argument returns an object of class 'dist'
153 | - Argument 'cluster' to stringdistmatrix is phased out. It is now ignored with a message.
154 | - Specifying 'ncores' was already ignored but now also causes a warning
155 | - internal: rewrite of the R/C interface, saving about 1/3 of C-code, making extending easier
156 | - bugfix in stringdistmatrix: output was transposed when length(a)==1 (Thanks to github user cpoonolly)
157 | - Safer core detection to avoid a failure under Cygwin (thanks to Lauri Koobas)
158 |
159 | version 0.9.0
160 | - C-code underlying stringdist and amatch now automatically use multithreading based on openMP.
161 | The default number of threads is governed by options('sd_num_thread').
162 | - stringdist, stringdistmatrix, amatch and ain gain nthread argument which can
163 | overwrite the default maximum number of threads.
164 | - Argument 'maxDist' is phased out for 'stringdist' and 'stringdistmatrix'.
165 | Specifying it causes a message.
166 | - Argument 'ncores' is phased out for 'stringdistmatrix'. It is now ignored and
167 | specifying it causes a message.
168 | - bugfix in amatch/dl. In certain cases, the best match went undetected.
169 | - Documentation improved and rearranged with string metrics, encoding, and
170 | parallelization now documented as separate topics.
171 |
172 | version 0.8.2
173 | - Fixed a few warnings issued by the CLANG compiler (thanks to Brian Ripley).
174 | This fixes a bug in amatch/jaccard
175 | - Fixed a bug in stringdist/osa, dl: NA incorrectly returned (thanks to Lauri
176 | Koobas).
177 |
178 | version 0.8.1
179 | - stringdistmatrix returns dimensionless matrix when both arguments have length
180 | zero (thanks to Richie Cotton)
181 | - stringdistmatrix gains argument 'useNames' (thanks to Richie Cotton)
182 | - Package now 'Imports' parallel rather than 'Depends' on it.
183 | - bugfix in optimal string alignment distance: the nr of transpositions was
184 | sometimes overcounted (thanks to Frank Binder)
185 | - rearranged the documentation.
186 |
187 | version 0.8.0
188 | - Added soundex-based string distance (thanks to Jan van der Laan)
189 | - New function 'phonetic' translates strings to phonetic codes using soundex
190 | (thanks to Jan van der Laan)
191 | - New function 'printable_ascii' detects non-printable ascii or non-ascii
192 | characters.
193 | - Precision issue: cosine distance between equal strings would be O(1e-16)
194 | instead of 0.0 (thanks to Ben Haller).
195 | - Code cleaning: somewhat better performance when maxDist is unspecified in
196 | stringdist. It remains deprecated.
197 | - Row names in the output array of 'qgrams' are now in system native encoding
198 | (used to be utf8 for all systems).
199 | - updated CITATION with page number info as the R Journal is now out.
200 |
201 | version 0.7.3
202 | - bugfix in jw-distance: out-of-range access in C-code caused R to crash in
203 | some cases (thanks to Carol Gan)
204 | - bugfix in dl distance: in some cases, distances could be one unit too high.
205 | - Updated CITATION file: paper to appear in The R Journal vol 6 (2014).
206 | - Some updates in documentation.
207 |
208 | version 0.7.2
209 | - function 'qgrams' gains .list argument
210 | - bugfix in multicore option of stringdistmatrix
211 | - bugfix in substitution weight of DL-distance (undercounted when w4 != 1 in
212 | some cases)
213 | - bugfix in dl.c: C-function read outside of array.
214 |
215 | version 0.7.0
216 | - added useBytes option: up to ~3-fold speed gain at the cost of possible
217 | encoding-dependent results.
218 | - new memory allocation method for q-grams increases speed between ~5% and ~30%
219 | depending on q and input string.
220 | - function 'qgrams' gains useNames option.
221 | - jaro-winkler distance gains weight argument.
222 | - C-code optimization in edit-based distances: 10~20% speed increase depending
223 | on input.
224 | - bugfix in amatch: sometimes NA was erroneously returned.
225 | - bugfix in amatch/lcs: hamming distance method was called erroneously.
226 |
227 | version 0.6.1
228 | - bugfix in parallel version of stringdistmatrix: parameter p was not passed
229 | (thanks to Ricardo Saporta)
230 | - bugfix in lv/osa/dl: maxDist ignored in certain cases
231 |
232 | version 0.6.0
233 | - added amatch function: approximate matching version of 'match'
234 | - added ain function: approximate matching version of '%in%'
235 | - qgrams now accepts arbitrary number of arguments. Outputs array, not table
236 | - added cosine distance
237 | - added Jaccard distance
238 | - added Jaro and Jaro-Winkler distances
239 | - small performance tweaks in underlying C code
240 | - Edge case in stringdistmatrix: output is now always of class matrix
241 | - Default maxDist is now Inf (this is only to make it more intuitive and does
242 | not break previous code)
243 | - BREAKING CHANGE: output -1 is replaced by Inf for all distance methods
244 |
245 |
246 | version 0.5.0
247 | - added qgram counting function 'qgrams'
248 | - faster edge case handling in osa method.
249 | - edge case in lv/osa/dl methods: distance returned length(b) instead of -1
250 | when length(a) == 0, maxDist < length(b).
251 | - bugfix in lv/osa/dl method: maxDist returned when length(a) > maxDist > 0
252 | (thanks to Daniel Reckhard).
253 | - Hamming distance (method='h') now returns -1 for strings of unequal lengths
254 | (used to emit error).
255 | - added longest common substring distance (method='lcs').
256 | - added qgram distance method.
257 | - stringdistmatrix gains cluster argument.
258 |
259 | version 0.4.2
260 | - Fix in error message for hamming distance
261 | - Workaround for system-dependent translation of utf8 NA characters
262 |
263 | version 0.4.0
264 | - First release
265 |
--------------------------------------------------------------------------------
/pkg/R/stringdist.R:
--------------------------------------------------------------------------------
1 | #' A package for string distance calculation and approximate string matching.
2 | #'
3 | #'
4 | #' The \pkg{stringdist} package offers fast and platform-independent string
5 | #' metrics. Its main purpose is to compute various string distances and to do
6 | #' approximate text matching between character vectors. As of version 0.9.3,
7 | #' it is also possible to compute distances between sequences represented by
8 | #' integer vectors.
9 | #'
10 | #'
11 | #' A typical use is to match strings that are not precisely the same. For
12 | #' example
13 | #'
14 | #' \code{ amatch(c("hello","g'day"),c("hi","hallo","ola"),maxDist=2)}
15 | #'
16 | #' returns \code{c(2,NA)} since \code{"hello"} matches closest with
17 | #' \code{"hallo"}, and within the maximum (optimal string alignment) distance.
18 | #' The second element, \code{"g'day"}, matches closest with \code{"ola"} but
19 | #' since the distance equals 4, no match is reported.
20 | #'
21 | #' A second typical use is to compute string distances. For example
22 | #'
23 | #' \code{ stringdist(c("g'day"),c("hi","hallo","ola"))}
24 | #'
25 | #' Returns \code{c(5,5,4)} since these are the distances between \code{"g'day"}
26 | #' and respectively \code{"hi"}, \code{"hallo"}, and \code{"ola"}.
27 | #'
28 | #' A third typical use would be to compute a \code{dist} object. The command
29 | #'
30 | #' \code{stringdistmatrix(c("foo","bar","boo","baz"))}
31 | #'
32 | #' returns an object of class \code{dist} that can be used by clustering
33 | #' algorithms such as \code{stats::hclust}.
34 | #'
35 | #' A fourth use is to compute string distances between general sequences,
36 | #' represented as integer vectors (which must be stored in a \code{list}):
37 | #'
38 | #' \code{seq_dist( list(c(1L,1L,2L)), list(c(1L,2L,1L),c(2L,3L,1L,2L)) )}
39 | #'
40 | #' The above code yields the vector \code{c(1,2)} (the shorter first
41 | #' argument is recycled over the longer second argument).
42 | #'
43 | #' Besides documentation for each function, the main topics documented are:
44 | #'
45 | #' \itemize{
46 | #' \item{\code{\link{stringdist-metrics}} -- string metrics supported by the package}
47 | #' \item{\code{\link{stringdist-encoding}} -- how encoding is handled by the package}
48 | #' \item{\code{\link{stringdist-parallelization}} -- on multithreading }
49 | #' }
50 | #'
51 | #' @section Acknowledgements:
52 | #' \itemize{
53 | #' \item{The code for the full Damerau-Levenshtein distance was adapted from Nick Logan's
54 | #' \href{https://github.com/ugexe/Text--Levenshtein--Damerau--XS/blob/master/damerau-int.c}{public github repository}.}
55 | #' \item{C code for converting UTF-8 to integer was copied from the R core for performance reasons.}
56 | #' \item{The code for soundex conversion and string similarity was kindly contributed by Jan van der Laan.}
57 | #' }
58 | #' @section Citation:
59 | #' If you would like to cite this package, please cite the \href{https://journal.r-project.org/archive/2014-1/loo.pdf}{R Journal Paper}:
60 | #' \itemize{
61 | #' \item{M.P.J. van der Loo (2014). The \code{stringdist} package for approximate string matching.
62 | #' R Journal 6(1) pp 111-122}
63 | #' }
64 | #' Or use \code{citation('stringdist')} to get a bibtex item.
65 | #'
66 | #' @name stringdist-package
67 | #' @docType package
68 | #' @useDynLib stringdist, .registration=TRUE
69 | #' @importFrom parallel detectCores
70 | #'
71 | #'
72 | #'
73 | "_PACKAGE"
74 |
# Build the warning text shown when a 'list' argument is passed to a function
# that converts its input with 'as.character'. 'x' fills the first '%s' slot of
# the template and 'y' the second.
listwarning <- function(x,y){
  template <- "
You are passing one or more arguments of type 'list' to
'%s'. These arguments will be converted with 'as.character'
which is likeley not to give what you want (did you mean to use '%s'?).
This warning can be avoided by explicitly converting the argument(s).
"
  sprintf(template, x, y)
}
83 |
84 | #' Compute distance metrics between strings
85 | #'
86 | #'
87 | #' \code{stringdist} computes pairwise string distances between elements of
88 | #' \code{a} and \code{b}, where the argument with fewer elements is recycled.
89 | #' \code{stringdistmatrix} computes the string distance matrix with rows
90 | #' according to
91 | #' \code{a} and columns according to \code{b}.
92 | #'
93 | #'
94 | #' @param a R object (target); will be converted by \code{as.character}
95 | #' @param b R object (source); will be converted by \code{as.character}
96 | #' This argument is optional for \code{stringdistmatrix} (see section \code{Value}).
97 | #' @param method Method for distance calculation. The default is \code{"osa"},
98 | #' see \code{\link{stringdist-metrics}}.
99 | #' @param useBytes Perform byte-wise comparison, see
100 | #' \code{\link{stringdist-encoding}}.
101 | #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for
102 | #' deletion, insertion, substitution and transposition, in that order. When
103 | #' \code{method='lv'}, the penalty for transposition is ignored. When
104 | #' \code{method='jw'}, the weights associated with characters of \code{a},
105 | #' characters from \code{b} and the transposition weight, in that order.
106 | #' Weights must be positive and not exceed 1. \code{weight} is ignored
107 | #' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
108 | #' \code{'jaccard'}, \code{'lcs'}, or \code{'soundex'}.
109 | #' @param q Size of the \eqn{q}-gram; must be nonnegative. Only applies to
110 | #' \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}.
111 | #' @param p Prefix factor for Jaro-Winkler distance. The valid range for
112 | #' \code{p} is \code{0 <= p <= 0.25}. If \code{p=0} (default), the
113 | #' Jaro-distance is returned. Applies only to \code{method='jw'}.
114 | #' @param bt Winkler's boost threshold. Winkler's prefix factor is
115 | #' only applied when the Jaro distance is larger than \code{bt}.
116 | #' Applies only to \code{method='jw'} and \code{p>0}.
117 | #' @param nthread Maximum number of threads to use. By default, a sensible
118 | #' number of threads is chosen, see \code{\link{stringdist-parallelization}}.
119 | #'
120 | #' @seealso \code{\link{stringsim}}, \code{\link{qgrams}}, \code{\link{amatch}}, \code{\link{afind}}
121 | #'
122 | #' @return For \code{stringdist}, a vector with string distances of size
123 | #' \code{max(length(a),length(b))}.
124 | #'
125 | #' For \code{stringdistmatrix}: if both \code{a} and \code{b} are passed, a
126 | #' \code{length(a)xlength(b)} \code{matrix}. If a single argument \code{a} is
127 | #' given an object of class \code{\link[stats]{dist}} is returned.
128 | #'
129 | #' Distances are nonnegative if they can be computed, \code{NA} if any of the
130 | #' two argument strings is \code{NA} and \code{Inf} when \code{maxDist} is
131 | #' exceeded or, in case of the hamming distance, when the two compared strings
132 | #' have different length.
133 | #'
134 | #'
135 | #' @example ../examples/stringdist.R
136 | #' @export
stringdist <- function(a, b
  , method=c("osa","lv","dl","hamming","lcs", "qgram","cosine","jaccard","jw","soundex")
  , useBytes = FALSE
  , weight=c(d=1,i=1,s=1,t=1)
  , q = 1
  , p = 0
  , bt = 0
  , nthread = getOption("sd_num_thread")
){
  if (is.list(a) || is.list(b))
    warning(listwarning("stringdist","seq_dist"))

  # Resolve the method before validating: the weight-length checks below
  # depend on it, and an abbreviated or defaulted 'method' would otherwise
  # not be compared against the full method names.
  method <- match.arg(method)

  stopifnot(
      all(is.finite(weight))
    , all(weight > 0)
    , all(weight <= 1)
    , q >= 0
    , p <= 0.25
    , p >= 0
    , is.logical(useBytes)
    # 'osa' and 'dl' need at least four weights; 'lv' and 'jw' at least three.
    , !(method %in% c('osa','dl')) || length(weight) >= 4
    , !(method %in% c('lv','jw'))  || length(weight) >= 3
    , length(nthread) == 1
    , is.numeric(nthread)
    , nthread > 0
  )

  # note: enc2utf8 is very efficient when the native encoding is already UTF-8.
  a <- as.character(a)
  b <- as.character(b)
  if ( !useBytes ){
    a <- enc2utf8(a)
    b <- enc2utf8(b)
  }

  # Nothing to compare when either input is empty.
  if (length(a) == 0 || length(b) == 0){
    return(numeric(0))
  }
  # Warn when the longer length is not a multiple of the shorter one.
  if ( max(length(a),length(b)) %% min(length(a),length(b)) != 0 ){
    warning(RECYCLEWARNING)
  }
  nthread <- as.integer(nthread)

  # For 'jw' the first two weights are swapped before they are passed on.
  if (method == 'jw') weight <- weight[c(2,1,3)]
  # NOTE(review): 'a' and 'b' are handed to do_dist() in swapped order, as in
  # the original code; confirm against do_dist() before changing.
  do_dist(a=b, b=a
    , method=method
    , weight=weight
    , q=q
    , p=p
    , bt=bt
    , useBytes=useBytes
    , nthread=nthread)
}
191 |
192 |
193 | #' @param useNames Use input vectors as row and column names?
194 | #'
195 | #'
196 | #' @rdname stringdist
197 | #' @export
stringdistmatrix <- function(a, b
  , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw","soundex")
  , useBytes = FALSE
  , weight=c(d=1,i=1,s=1,t=1)
  , q = 1
  , p = 0
  , bt = 0
  , useNames=c('none','strings','names')
  , nthread = getOption("sd_num_thread")
){
  have_b <- !missing(b)

  # List input is not meant for this function; warn and point to seq_distmatrix.
  if (is.list(a) || (have_b && is.list(b))) {
    warning(listwarning("stringdistmatrix","seq_distmatrix"))
  }

  # Logical 'useNames' is still accepted for backward compatibility with
  # stringdist <= 0.9.0: TRUE means "strings", FALSE means "none".
  if (identical(useNames, TRUE)) {
    useNames <- "strings"
  } else if (identical(useNames, FALSE)) {
    useNames <- "none"
  }
  useNames <- match.arg(useNames)
  method <- match.arg(method)
  nthread <- as.integer(nthread)

  stopifnot(
      all(is.finite(weight))
    , all(weight > 0)
    , all(weight <=1)
    , q >= 0
    , p <= 0.25
    , p >= 0
    , is.logical(useBytes)
    , ifelse(method %in% c('osa','dl'), length(weight) >= 4, TRUE)
    , ifelse(method %in% c('lv','jw') , length(weight) >= 3, TRUE)
    , length(nthread) == 1
    , is.numeric(nthread)
    , nthread > 0
  )

  # For 'jw' the first two weights are interchanged before the computation.
  if (method == 'jw') {
    weight <- weight[c(2,1,3)]
  }

  # Single-argument form: hand over to lower_tri, which returns a 'dist' object.
  if (!have_b) {
    a <- if (useNames == "names") {
      setNames(as.character(a), names(a))
    } else {
      as.character(a)
    }
    return(lower_tri(a
      , method=method
      , useBytes=useBytes
      , weight=weight
      , q=q
      , p=p
      , bt=bt
      , useNames=useNames
      , nthread=nthread))
  }

  # Grab the names before as.character() strips them.
  names_a <- names(a)
  names_b <- names(b)
  a <- as.character(a)
  b <- as.character(b)

  # Row/column labels; "strings" uses the strings as they are before any
  # re-encoding. NULL means no dimnames.
  labels <- switch(useNames
    , names   = list(names_a, names_b)
    , strings = list(a, b)
    , NULL
  )

  if (!useBytes) {
    a <- enc2utf8(a)
    b <- enc2utf8(b)
  }

  if (length(a) == 0 || length(b) == 0) {
    return(matrix(numeric(0)))
  }

  # One column per element of b: each call passes a single element of b as
  # do_dist's first argument and the whole of a as its second.
  dists <- vapply(X = b, FUN = do_dist, FUN.VALUE = numeric(length(a))
    , b = a, method = method, weight = weight, q = q, p = p, bt = bt
    , useBytes = useBytes, nthread = nthread
    , USE.NAMES = FALSE)

  matrix(dists, nrow = length(a), ncol = length(b), dimnames = labels)
}
288 |
289 |
# Convert each string in a character vector to its integer code points.
# Returns a list with one integer vector per element of x.
char2int <- function(x){
  # Historical note: on some platforms enc2utf8 mishandled NA's
  # (https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=15201, fixed for
  # R >= 2.15.3), so the whole vector is now re-encoded in one go.
  utf8_strings <- enc2utf8(x)
  lapply(X = utf8_strings, FUN = utf8ToInt)
}
298 |
# Integer method codes passed to the C routines (enum-type in stringdist.h).
# The position in the name vector, counted from zero, is the code.
METHODS <- structure(
  0:10
  , names = c(
      "osa"            #  0
    , "lv"             #  1
    , "dl"             #  2
    , "hamming"        #  3
    , "lcs"            #  4
    , "qgram"          #  5
    , "cosine"         #  6
    , "jaccard"        #  7
    , "jw"             #  8
    , "soundex"        #  9
    , "running_cosine" # 10
  )
)
313 |
314 |
# Worker that calls the compiled routine "R_stringdist".
#
# a, b      inputs handed unchanged to the C routine (stringdist and
#           stringdistmatrix pass character vectors).
# method    name of the distance method; must be a name in METHODS.
# weight, p, bt   coerced to double before the call.
# q         coerced to integer before the call.
# useBytes, nthread   coerced to integer before the call.
#
# Returns whatever "R_stringdist" returns. Stops with an error when 'method'
# is not a name in METHODS.
do_dist <- function(a, b, method, weight, q, p, bt, useBytes=FALSE, nthread=1L){

  if (method=='soundex' && !all(printable_ascii(a) & printable_ascii(b)) ){
    warning("Non-printable ascii or non-ascii characters in soundex. Results may be unreliable. See ?printable_ascii.")
  }

  # Keep the method name intact so the error message can report it; the
  # integer code is stored separately.
  method_code <- METHODS[method]
  if ( is.na(method_code) ){
    stop(sprintf("method '%s' is not defined",method))
  }

  .Call("R_stringdist", a, b, method_code
    , as.double(weight), as.double(p), as.double(bt), as.integer(q)
    , as.integer(useBytes), as.integer(nthread)
    , PACKAGE="stringdist"
  )
}
333 |
# More efficient routine for the single-argument case: calls the compiled
# routine "R_lower_tri" and dresses the result up as a 'stats::dist' object.
#
# a         input vector (stringdistmatrix passes a character vector,
#           optionally named).
# method    name of the distance method; must be a name in METHODS. When the
#           default (the full vector of choices) is left in place, the first
#           entry is used.
# useNames  "strings" labels the result with as.character(a), "names" with
#           names(a); any other value adds no labels.
# Remaining arguments are coerced and passed on to the C routine.
#
# Stops with an error when 'method' is not a name in METHODS.
lower_tri <- function(a
  , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw","soundex")
  , useBytes = FALSE
  , weight=c(d=1,i=1,s=1,t=1)
  , q=1
  , p=0
  , bt=0
  , useNames=FALSE
  , nthread = getOption("sd_num_thread")
){
  # The default is the whole vector of choices; reduce it to a single name so
  # that the lookup and the check below operate on a scalar.
  method <- method[1L]

  # Check the looked-up code rather than the name: an unknown name is a
  # non-NA string, but its lookup in METHODS yields NA.
  methnr <- METHODS[method]
  if (is.na(methnr)){
    stop(sprintf("method '%s' is not defined",method))
  }

  x <- .Call("R_lower_tri", a, methnr
    , as.double(weight), as.double(p), as.double(bt)
    , as.integer(q), as.integer(useBytes), as.integer(nthread)
    , PACKAGE="stringdist")

  # Replace all attributes with those of a 'dist' object.
  attributes(x) <- list(class='dist'
    , Size   = length(a)
    , Diag   = FALSE
    , Upper  = FALSE
    , method = method)
  if (useNames == "strings") attr(x,"Labels") <- as.character(a)
  if (useNames == "names"  ) attr(x,"Labels") <- names(a)

  x
}
365 |
366 |
367 |
368 |
369 |
--------------------------------------------------------------------------------
/pkg/inst/include/stringdist_api.h:
--------------------------------------------------------------------------------
1 |
2 | /* stringdist - a C library of string distance algorithms with an interface to R.
3 | * Copyright (C) 2013 Mark van der Loo
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, either version 3 of the License, or
8 | * (at your option) any later version.
9 | *
10 | * This program is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | * GNU General Public License for more details.
14 | *
15 | * You should have received a copy of the GNU General Public License
16 | * along with this program. If not, see