├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── include
    ├── common.hpp
    ├── fuzzywuzzy.hpp
    ├── levenshtein.h
    ├── process.hpp
    ├── string_matcher.hpp
    ├── utils.hpp
    └── wrapper.hpp
├── src
    ├── CMakeLists.txt
    ├── fuzzywuzzy.cpp
    ├── levenshtein.c
    ├── process.cpp
    ├── string_matcher.cpp
    ├── utils.cpp
    └── wrapper.cpp
└── test
    ├── CMakeLists.txt
    └── main.cpp


/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | .idea/
3 | cmake-build-debug/
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Must come first: it establishes the policy defaults for everything below.
 2 | cmake_minimum_required(VERSION 3.0)
 3 | 
 4 | message(STATUS "CMake version: ${CMAKE_VERSION}")
 5 | 
 6 | # Concatenates the extra arguments, in order and with no separator, and
 7 | # stores the resulting string in the caller's variable named by result_var.
 8 | #   join(<result_var> [<piece>...])
 9 | function(join result_var)
10 |   string(CONCAT joined ${ARGN})
11 |   set(${result_var} "${joined}" PARENT_SCOPE)
12 | endfunction()
13 | 
14 | # Determine if fuzzywuzzy is built as a subproject (using add_subdirectory)
15 | # or if it is the master project.
16 | set(MASTER_PROJECT OFF)
17 | if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
18 |   set(MASTER_PROJECT ON)
19 | endif()
20 | 
21 | # Fall back to a Release build when no build type was requested.
22 | # This has to happen ahead of the project command, because project() may
23 | # assign CMAKE_BUILD_TYPE on its own (it does so for nmake).
24 | if (NOT CMAKE_BUILD_TYPE)
25 |   join(doc "Choose the type of build, options are: "
26 |            "None(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel.")
27 |   set(CMAKE_BUILD_TYPE Release CACHE STRING "${doc}")
28 | endif()
29 | # FUZZ_TEST takes its default value from MASTER_PROJECT.
30 | option(FUZZ_TEST "Generate the test target." ${MASTER_PROJECT})
31 | 
32 | project(fuzzywuzzy LANGUAGES C CXX)
33 | 
34 | message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
35 | # Runtime output goes to bin/ under the top-level build directory.
36 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
37 | # Warning flags for GCC/Clang; this only defines the variable, it is not applied here.
38 | if (CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
39 |   set(PEDANTIC_COMPILE_FLAGS -Wall -Wextra -Wshadow -pedantic)
40 | endif()
41 | # Append <project>/cmake to the CMake module search path.
42 | set(CMAKE_MODULE_PATH
43 |   ${CMAKE_MODULE_PATH}
44 |   ${PROJECT_SOURCE_DIR}/cmake)
45 | 
46 | add_subdirectory(src)
47 | 
48 | # Build and register the tests only when the FUZZ_TEST option is enabled.
49 | if (FUZZ_TEST)
50 |   enable_testing()
51 |   add_subdirectory(test)
52 | endif()
53 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   <signature of Ty Coon>, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | This is an in-progress port of [seatgeek's fuzzywuzzy](https://github.com/seatgeek/fuzzywuzzy/) Python library to C++.
 2 | When done, this library will have the same interface and behavior.
 3 | 
 4 | The underlying C library ([python-Levenshtein](https://github.com/miohtama/python-Levenshtein), mirrored [here](https://github.com/Tmplt/python-Levenshtein)) has been stripped of its Python interfacing
 5 | and wrapped in some C++ code.
 6 | 
 7 | | files in `src/` | Python/C-lib equivalent |
 8 | | ----- | ----------------------- |
 9 | | `fuzzywuzzy.{c,h}pp` and `string_matcher.{c,h}pp` | Line-by-line Python-to-C++ translations of the Python library and python-Levenshtein's `StringMatcher.py`. |
10 | | `wrapper.{c,h}pp` | (Python-interfaced-)C-to-C++ wrapper of `ratio_py`, `get_opcodes_py`, `get_matching_blocks_py`, etc. from python-Levenshtein. |
11 | | `utils.{c,h}pp` | Utility functions, translated from the Python library's `utils.py`. |
12 | | `levenshtein.{c,h}` | The underlying C functions, copied verbatim. |
13 | 
14 | Usage
15 | -----
16 | ```cpp
17 | #include "fuzzywuzzy.hpp"
18 | ```
19 | 
20 | **Simple Ratio**
21 | ```cpp
22 | fuzz::ratio("this is a test", "this is a test!"); // returns 97
23 | ```
24 | 
25 | **Partial Ratio**
26 | ```cpp
27 | fuzz::partial_ratio("this is a test", "this is a test!"); // returns 100
28 | ```
29 | 
30 | **Token Sort Ratio**
31 | ```cpp
32 | fuzz::ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"); // returns 91
33 | 
34 | fuzz::token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"); // returns 100
35 | ```
36 | 
37 | **Token Set Ratio**
38 | ```cpp
39 | fuzz::token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"); // returns 83 (this should be 84)
40 | 
41 | fuzz::token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"); // returns 100
42 | ```


--------------------------------------------------------------------------------
/include/common.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | #include <vector>
 5 | 
 6 | #ifdef __clang__  // Clang: take string_view from the standard header
 7 |     #include <string_view>
 8 |     using std::string_view;
 9 | #else  // any other compiler: fall back to the experimental header
10 |     #include <experimental/string_view>
11 |     using std::experimental::string_view;
12 | #endif  // __clang__
13 | 
14 | using std::vector;
15 | using std::string;
16 | 


--------------------------------------------------------------------------------
/include/fuzzywuzzy.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "common.hpp"
 4 | 
 5 | namespace /* I'm in your mind... */ fuzz {
 6 | /* Every function below takes two strings and a full_process flag (default true) and returns an unsigned int. */
 7 | /*                          */
 8 | /* Basic scoring functions. */
 9 | /*                          */
10 | 
11 | /* Calculates a simple Levenshtein ratio between the two strings. */
12 | unsigned int ratio(const string &s1, const string &s2, const bool full_process = true);
13 | 
14 | /*
15 |  * Returns the ratio of the most similar substring
16 |  * as a number between 0 and 100.
17 |  */
18 | unsigned int partial_ratio(const string &s1, const string &s2, const bool full_process = true);
19 | 
20 | /*                             */
21 | /* Advanced scoring functions. */
22 | /*                             */
23 | 
24 | /*
25 |  * Returns a measure of the strings' similarity between 0 and 100,
26 |  * sorting the tokens before comparing.
27 |  */
28 | unsigned int token_sort_ratio(const string &s1, const string &s2, const bool full_process = true);
29 | unsigned int token_sort_partial_ratio(const string &s1, const string &s2, const bool full_process = true);
30 | 
31 | /*
32 |  * Splits the strings into tokens and computes intersections and
33 |  * remainders between the tokens of the two strings. A comparison string
34 |  * is then built up and is compared using the simple ratio algorithm.
35 |  * Useful for strings where words appear redundantly.
36 |  */
37 | unsigned int token_set_ratio(const string &s1, const string &s2, const bool full_process = true);
38 | 
39 | /*
40 |  * Returns the ratio of the most similar substring as a number
41 |  * between 0 and 100, sorting the tokens before comparing.
42 |  */
43 | unsigned int partial_token_set_ratio(const string &s1, const string &s2, const bool full_process = true);
44 | 
45 | /*                 */
46 | /* Combination API */
47 | /*                 */
48 | 
49 | /*
50 |  * Quick ratio comparison between two strings.
51 |  * Runs utils::full_process on both strings.
52 |  * Short circuits if either string is empty after processing.
53 |  */
54 | unsigned int quick_ratio(const string &s1, const string &s2, const bool full_process = true);
55 | 
56 | /*
57 |  * Returns a measure of the strings' similarity between 0 and 100, using different algorithms.
58 |  *
59 |  * Steps in the order they occur:
60 |  *  #. Run utils::full_process on both strings
61 |  *  #. Short circuit if either string is empty
62 |  *  #. Take the ratio of the two processed strings
63 |  *  #. Run checks to compare the length of the strings:
64 |  *    * If one of the strings is more than 1.5 times as long as the other,
65 |  *      use partial_ratio comparisons -- scale partial results by 0.9
66 |  *      (this makes sure only full results can return 100)
67 |  *    * If one of the strings is over 8 times as long as the other,
68 |  *      scale by 0.6 instead
69 |  *
70 |  *  #. Run the other ratio functions
71 |  *    * If using partial ratio functions, call partial_ratio,
72 |  *      token_sort_partial_ratio and partial_token_set_ratio.
73 |  *      Then scale all of these by the ratio based on length.
74 |  *    * Otherwise call token_sort_ratio and token_set_ratio
75 |  *      and scale these results by 0.95 (on top of any partial scalars)
76 |  *
77 |  *  #. Take the highest value from these results, round it, and return
78 |  *     as an integer.
79 |  */
80 | unsigned int weighted_ratio(const string &s1, const string &s2, const bool full_process = true);
81 | 
82 | /* I'm not in your mind */ }
83 | 


--------------------------------------------------------------------------------
/include/levenshtein.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file has been altered to better fit fuzzywuzzy.
  3 |  * To see all changes done, please diff this file with
  4 |  * <https://github.com/Tmplt/python-Levenshtein/blob/master/Levenshtein.c>
  5 |  *
  6 |  * Summary:
  7 |  *   - stripped all python-related code and data types;
  8 |  */
  9 | 
 10 | /* @(#) $Id: Levenshtein.h,v 1.22 2005/01/13 20:02:56 yeti Exp $ */
 11 | #ifndef LEVENSHTEIN_H
 12 | #define LEVENSHTEIN_H
 13 | 
 14 | #ifndef size_t
 15 | #  include <stdlib.h>
 16 | #endif
 17 | 
 18 | /* A bit dirty. */
 19 | #ifndef _LEV_STATIC_PY
 20 | #  define _LEV_STATIC_PY /* */
 21 | #endif
 22 | 
 23 | /* In C, this is just wchar_t and unsigned char, in Python, lev_wchar can
 24 |  * be anything.  If you really want to cheat, define wchar_t to any integer
 25 |  * type you like before including Levenshtein.h and recompile it. */
 26 | #ifndef lev_wchar
 27 | #  ifndef wchar_t
 28 | #    include <wchar.h>
 29 | #  endif
 30 | #  define lev_wchar wchar_t
 31 | #endif
 32 | typedef unsigned char lev_byte;
 33 | 
 34 | /* Edit operation type.
 35 |  * DON'T CHANGE! The values are used as array indices and the bits are
 36 |  * occasionally used as flags. */
 37 | typedef enum {
 38 |     LEV_EDIT_KEEP = 0,
 39 |     LEV_EDIT_REPLACE = 1,
 40 |     LEV_EDIT_INSERT = 2,
 41 |     LEV_EDIT_DELETE = 3,
 42 |     LEV_EDIT_LAST  /* sometimes returned when an error occurs */
 43 | } LevEditType;
 44 | 
 45 | /* Error codes returned by the edit-operation check functions. */
 46 | typedef enum {
 47 |     LEV_EDIT_ERR_OK = 0,
 48 |     LEV_EDIT_ERR_TYPE,  /* nonexistent edit type */
 49 |     LEV_EDIT_ERR_OUT,  /* edit out of string bounds */
 50 |     LEV_EDIT_ERR_ORDER,  /* ops are not ordered */
 51 |     LEV_EDIT_ERR_BLOCK,  /* inconsistent block boundaries (block ops) */
 52 |     LEV_EDIT_ERR_SPAN,  /* sequence is not a full transformation (block ops) */
 53 |     LEV_EDIT_ERR_LAST  /* final enumerator, follows LEV_EDIT_ERR_SPAN */
 54 | } LevEditOpError;
 55 | 
 56 | /* String averaging method (not used yet). */
 57 | typedef enum {
 58 |     LEV_AVG_HEAD = 0,  /* take operations from the head */
 59 |     LEV_AVG_TAIL,  /* take operations from the tail */
 60 |     LEV_AVG_SPREAD,  /* take an equidistantly distributed subset */
 61 |     LEV_AVG_BLOCK,  /* take a random continuous block */
 62 |     LEV_AVG_RANDOM,  /* take a random subset */
 63 |     LEV_AVG_LAST
 64 | } LevAveragingType;
 65 | 
 66 | /* Edit operation (atomic).
 67 |  * This is the `native' atomic edit operation.  It differs from difflib's
 68 |  * in that it represents a change of one character, not a block.  We
 69 |  * usually don't care about LEV_EDIT_KEEP, though the functions can handle
 70 |  * it.  Positions are interpreted as being at the left edge of a character.
 71 |  */
 72 | typedef struct {
 73 |     LevEditType type;  /* editing operation type */
 74 |     size_t spos;  /* source block position */
 75 |     size_t dpos;  /* destination position */
 76 | } LevEditOp;
 77 | 
 78 | /* Edit operation (difflib-compatible).
 79 |  * This is not `native', but conversion functions exist.  These fields exactly
 80 |  * correspond to the codeops() tuple fields (and this method is also the
 81 |  * source of the silly OpCode name).  Sequences must span over complete
 82 |  * strings; subsequences are simply edit sequences with more (or larger)
 83 |  * LEV_EDIT_KEEP blocks.
 84 |  */
 85 | typedef struct {
 86 |     LevEditType type;  /* editing operation type */
 87 |     size_t sbeg, send;  /* source block begin, end */
 88 |     size_t dbeg, dend;  /* destination block begin, end */
 89 | } LevOpCode;
 90 | 
 91 | /* Matching block (difflib-compatible). */
 92 | typedef struct {
 93 |     size_t spos;  /* start of the block in the source string */
 94 |     size_t dpos;  /* start of the block in the destination string */
 95 |     size_t len;  /* length of the block */
 96 | } LevMatchingBlock;
 97 | 
 98 | size_t
 99 | lev_edit_distance(size_t len1,
100 |                   const lev_byte *string1,
101 |                   size_t len2,
102 |                   const lev_byte *string2,
103 |                   int xcost);
104 | 
105 | size_t
106 | lev_u_edit_distance(size_t len1,
107 |                     const lev_wchar *string1,
108 |                     size_t len2,
109 |                     const lev_wchar *string2,
110 |                     int xcost);
111 | 
112 | LevEditOp*
113 | lev_editops_find(size_t len1,
114 |                  const lev_byte *string1,
115 |                  size_t len2,
116 |                  const lev_byte *string2,
117 |                  size_t *n);
118 | 
119 | LevOpCode*
120 | lev_editops_to_opcodes(size_t n,
121 |                        const LevEditOp *ops,
122 |                        size_t *nb,
123 |                        size_t len1,
124 |                        size_t len2);
125 | 
126 | LevMatchingBlock*
127 | lev_opcodes_matching_blocks(size_t len1,
128 |                             __attribute__((unused)) size_t len2,
129 |                             size_t nb,
130 |                             const LevOpCode *bops,
131 |                             size_t *nmblocks);
132 | 
133 | LevMatchingBlock*
134 | lev_editops_matching_blocks(size_t len1,
135 |                             size_t len2,
136 |                             size_t n,
137 |                             const LevEditOp *ops,
138 |                             size_t *nmblocks);
139 | 
140 | 
141 | #endif


--------------------------------------------------------------------------------
/include/process.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "common.hpp"
 4 | #include <utility>
 5 | #include <functional>
 6 | 
 7 | #include "utils.hpp"
 8 | #include "fuzzywuzzy.hpp"
 9 | 
10 | namespace fuzz
11 | {
12 | using std::pair;
13 | using std::function;
14 | /*
15 |  * Finds the best matches in a vector of choices. Returns a vector of pairs which
16 |  * contain the matches and their respective scores.
17 |  */
18 | vector<pair<string, int>> extractWithoutOrder(const string& query, const vector<string>& choices
19 |     , function<string(string)> processor=utils::full_process, function<int(string, string, const bool)> scorer=weighted_ratio
20 |     , int score_cutoff=0);
21 | 
22 | /*
23 |  * Convenience function for getting the choices with the best scores.
24 |  * NOTE(review): `limit` presumably caps the number of results -- confirm in process.cpp. */
25 | vector<pair<string, int>> extractBests(const string& query, const vector<string>& choices
26 |     , function<string(string)> processor=utils::full_process, function<int(string, string, const bool)> scorer=weighted_ratio
27 |     , int score_cutoff = 0, intmax_t limit = 5);
28 | 
29 | /*
30 |  * Convenience function for getting the choices with the best scores; takes the
31 |  * same parameters as extractBests() except for score_cutoff. */
32 | vector<pair<string, int>> extract(const string& query, const vector<string>& choices
33 |     , function<string(string)> processor=utils::full_process, function<int(string, string, const bool)> scorer=weighted_ratio
34 |     , intmax_t limit = 5);
35 | 
36 | /*
37 |  * This is a convenience function which returns the single best choice
38 |  * (delivered inside a vector, as the return type shows). */
39 | vector<pair<string, int>> extractOne(const string& query, const vector<string>& choices
40 |     , function<string(string)> processor=utils::full_process, function<int(string, string, const bool)> scorer=weighted_ratio
41 |     , int score_cutoff = 0);
42 | /*
43 |  * This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify
44 |  * and remove duplicates. Specifically, it uses extract() to identify duplicates that
45 |  * score greater than a user defined threshold. Then, it looks for the longest item in the duplicate list
46 |  * since we assume this item contains the most entity information and returns that. It breaks string
47 |  * length ties on an alphabetical sort.
48 |  *
49 |  * Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the
50 |  *     returned deduplicated list will likely be shorter. Raise the threshold for dedupe() to be less
51 |  *     sensitive.
52 |  */
53 | vector<string> dedupe(const vector<string>& contains_dupes, int threshold=70
54 |     , function<int(string, string, const bool)> scorer=token_set_ratio);
55 | 
56 | } // ns fuzz
57 | 
58 | 


--------------------------------------------------------------------------------
/include/string_matcher.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "common.hpp"
 4 | #include "levenshtein.h"
 5 | 
 6 | namespace fuzz {
 7 | 
 8 | enum { not_set = -1 };  // initial value of string_matcher::ratio_ and ::distance_
 9 | // Pairs two strings with storage for results derived from them (see the private members).
10 | class string_matcher {
11 | public:
12 |     explicit string_matcher(string s1, string s2)
13 |         : s1_(s1), s2_(s2) {}
14 |     // NOTE(review): the setters presumably invalidate stored results via reset_cache() -- confirm.
15 |     void set_strings(const string s1, const string s2);
16 |     void set_string1(const string s1);
17 |     void set_string2(const string s2);
18 |     // Edit structures, using the types declared in levenshtein.h.
19 |     vector<LevMatchingBlock> get_matching_blocks();
20 |     vector<LevOpCode> get_opcodes();
21 |     vector<LevEditOp> get_editops();
22 |     // NOTE(review): callers in fuzzywuzzy.cpp treat ratio() as a fraction (compared with 0.995).
23 |     double ratio();
24 |     double real_quick_ratio();
25 | 
26 | protected:
27 | 
28 | private:
29 |     string s1_, s2_;
30 |     double ratio_ = not_set;
31 |     int distance_ = not_set;
32 | 
33 |     vector<LevMatchingBlock> matching_blocks_;
34 |     vector<LevOpCode> op_codes_;
35 |     vector<LevEditOp> edit_ops_;
36 | 
37 |     void reset_cache();
38 | };
39 | 
40 | }  // ns fuzz
41 | 


--------------------------------------------------------------------------------
/include/utils.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "common.hpp"
 4 | 
 5 | #include <algorithm>  // std::max_element()
 6 | 
 7 | namespace fuzz {
 8 | 
 9 | namespace utils {
10 | // The non-template helpers are only declared here; their bodies live elsewhere.
11 | unsigned int percent_round(double val);
12 | 
13 | unsigned int intr(double val);
14 | 
15 | vector<string> split_string(const string &str, const char c = ' ');
16 | 
17 | string& trim(string &str);
18 | 
19 | string join(const vector<string> &v, const string &sep = " ");
20 | 
21 | string full_process(string str);
22 | 
23 | size_t min(size_t a, size_t b);
24 | 
25 | /*
26 |  * Variadic "extension" of std::max(): returns the largest argument.
27 |  *
28 |  * The type of the first argument decides what every other argument is
29 |  * cast to (and therefore the return type), so pass the widest type first;
30 |  * e.g. max(1u, 2.9) yields 2u because 2.9 is truncated before comparing.
31 |  * Both variants below follow this contract and return by value.
32 |  */
33 | #ifdef CPP17
34 | 
35 | template <typename First, typename... T>
36 | First max(const First &f, const T &... t)
37 | {
38 |     First best = f;
39 |     // Fold over the pack; the explicit cast lets mixed argument types compile.
40 |     ((best = std::max(best, static_cast<First>(t))), ...);
41 |     return best;
42 | }
43 | 
44 | #else
45 | 
46 | template <typename T, typename... Args>
47 | T max(const T &first, const Args &... args)
48 | {
49 |     // The initializer_list overload of std::max() returns the leftmost
50 |     // greatest element, the same element std::max_element() would pick.
51 |     return std::max({first, static_cast<T>(args)...});
52 | }
53 | 
54 | #endif
55 | 
56 | }  // ns utils
57 | 
58 | }  // ns fuzz
59 | 


--------------------------------------------------------------------------------
/include/wrapper.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "common.hpp"
 4 | 
 5 | extern "C" {
 6 | #include "levenshtein.h"
 7 | }
 8 | 
 9 | using std::vector;
10 | 
11 | namespace wrapper {
12 | // NOTE(review): presumably thin C++ wrappers over the C functions in levenshtein.h -- confirm in wrapper.cpp.
13 | double ratio(const string &str1, const string &str2);
14 | 
15 | vector<LevMatchingBlock> get_matching_blocks(vector<LevOpCode> &v, string &s1, string &s2);
16 | vector<LevOpCode> get_opcodes(string &s1, string &s2);
17 | vector<LevOpCode> get_opcodes(vector<LevEditOp> &ops, string &s1, string &s2);
18 | vector<LevEditOp> get_editops(string &s1, string &s2);
19 | 
20 | }  // ns wrapper
21 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Public headers live in <project>/include.  LIB_INCLUDE_DIRS is extended
 2 | # rather than overwritten so that pre-populated directories stay honoured.
 3 | set(LIB_INCLUDE_DIRS ${LIB_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/include)
 4 | 
 5 | # C core, built position-independent so it can be linked into the shared library.
 6 | add_library(levenshtein STATIC levenshtein.c)
 7 | set_property(TARGET levenshtein PROPERTY POSITION_INDEPENDENT_CODE ON)
 8 | target_include_directories(levenshtein PUBLIC ${LIB_INCLUDE_DIRS})
 9 | 
10 | # Sources are listed explicitly rather than globbed, so additions show up in
11 | # diffs and stray files (e.g. editor backups) are never compiled.
12 | add_library(fuzzywuzzy SHARED
13 |   fuzzywuzzy.cpp
14 |   process.cpp
15 |   string_matcher.cpp
16 |   utils.cpp
17 |   wrapper.cpp
18 | )
19 | # PUBLIC keeps the usage requirements transitive for consumers of the library.
20 | target_include_directories(fuzzywuzzy PUBLIC ${LIB_INCLUDE_DIRS})
21 | target_link_libraries(fuzzywuzzy PUBLIC levenshtein)
22 | 


--------------------------------------------------------------------------------
/src/fuzzywuzzy.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <set>
  3 | #include <cmath>
  4 | 
  5 | #include "fuzzywuzzy.hpp"
  6 | #include "string_matcher.hpp"
  7 | #include "utils.hpp"
  8 | 
  9 | namespace fuzz {
 10 | /* Percent-rounded string_matcher ratio of the (optionally pre-processed) inputs. */
 11 | unsigned int ratio(const string &s1, const string &s2, const bool full_process)
 12 | {
 13 |     const string lhs = full_process ? utils::full_process(s1) : s1;
 14 |     const string rhs = full_process ? utils::full_process(s2) : s2;
 15 | 
 16 |     string_matcher matcher(lhs, rhs);
 17 |     return utils::percent_round(matcher.ratio());
 18 | }
 19 | 
 20 | unsigned int partial_ratio(const string &s1, const string &s2, const bool full_process)
 21 | {
 22 |     /* Best alignment of the shorter string inside the longer one: 100 as soon
 23 |      * as a window's ratio exceeds 0.995, else the percent-rounded best window
 24 |      * ratio, or 0 if there are no matching blocks. */
 25 |     const string p1 = full_process ? utils::full_process(s1) : s1;
 26 |     const string p2 = full_process ? utils::full_process(s2) : s2;
 27 | 
 28 |     /* On equal lengths the first string is treated as the shorter one. */
 29 |     const bool first_is_shorter = p1.length() <= p2.length();
 30 |     const string &shorter = first_is_shorter ? p1 : p2;
 31 |     const string &longer  = first_is_shorter ? p2 : p1;
 32 | 
 33 |     auto m = string_matcher(shorter, longer);
 34 |     const auto blocks = m.get_matching_blocks();
 35 | 
 36 |     /*
 37 |      * Each block represents a string of matching characters
 38 |      * in a string of the form (idx_1, idx_2, len). The best
 39 |      * partial match will block align with at least one
 40 |      * of those blocks.
 41 |      * e.g. shorter = "abcd", longer "XXXbcdeEEE"
 42 |      * block = (1, 3, 3)
 43 |      * best score == ratio("abcd", "Xbcd")
 44 |      */
 45 |     bool have_score = false;
 46 |     double best = 0.0;
 47 |     for (const auto &block : blocks) {
 48 |         /*
 49 |          * spos/dpos are size_t, so "dpos - spos" wraps around when the block
 50 |          * starts later in the shorter string; clamp to 0 explicitly instead.
 51 |          */
 52 |         const size_t long_start = block.dpos > block.spos ? block.dpos - block.spos : 0;
 53 | 
 54 |         /* substr(pos, count): a window as long as the shorter string. */
 55 |         auto m2 = string_matcher(shorter, longer.substr(long_start, shorter.length()));
 56 |         const double r = m2.ratio();
 57 | 
 58 |         if (r > 0.995)
 59 |             return 100;
 60 |         if (!have_score || r > best) {
 61 |             best = r;
 62 |             have_score = true;
 63 |         }
 64 |     }
 65 | 
 66 |     return have_score ? utils::percent_round(best) : 0;
 67 | }
 68 | 
 69 | /* Optionally cleans `s`, then returns its tokens sorted, joined and trimmed. */
 70 | static string proccess_and_sort(const string &s, const bool full_process)
 71 | {
 72 |     string text = s;
 73 |     if (full_process)
 74 |         text = utils::full_process(s);
 75 | 
 76 |     auto words = utils::split_string(text);
 77 |     std::sort(words.begin(), words.end());
 78 |     string joined = utils::join(words);
 79 |     return utils::trim(joined);
 80 | }
 80 | /* ratio() of the two inputs after their tokens have been sorted. */
 81 | unsigned int token_sort_ratio(const string &s1, const string &s2, const bool full_proccess)
 82 | {
 83 |     /* NOTE: do we need force_ascii? */
 84 |     const string lhs = proccess_and_sort(s1, full_proccess);
 85 |     const string rhs = proccess_and_sort(s2, full_proccess);
 86 |     /* ratio() runs with its default full_process argument here. */
 87 |     return ratio(lhs, rhs);
 88 | }
 89 | /* partial_ratio() of the two inputs after their tokens have been sorted. */
 90 | unsigned int token_sort_partial_ratio(const string &s1, const string &s2, const bool full_proccess)
 91 | {
 92 |     /* NOTE: do we need force_ascii? */
 93 |     const string lhs = proccess_and_sort(s1, full_proccess);
 94 |     const string rhs = proccess_and_sort(s2, full_proccess);
 95 |     /* partial_ratio() runs with its default full_process argument here. */
 96 |     return partial_ratio(lhs, rhs);
 97 | }
 98 | 
 99 | /*
100 |  * Find all alphanumeric tokens in each string and:
101 |  *  - treat them as a set,
102 |  *  - construct two strings of the form <sorted_intersection><sorted_remainder>,
103 |  *  - take ratios of those two strings, and
104 |  *  - check for unordered partial matches.
105 |  * `partial` selects partial_ratio() instead of ratio() for the comparisons.
106 |  * Returns 0 if either (optionally pre-processed) input is empty.
107 |  */
108 | static unsigned int token_set_ratio(const string &s1, const string &s2, bool partial, const bool full_process)
109 | {
110 |     const string p1 = full_process ? utils::full_process(s1) : s1;
111 |     const string p2 = full_process ? utils::full_process(s2) : s2;
112 | 
113 |     if (p1.empty() || p2.empty())
114 |         return 0;
115 | 
116 |     const auto split1 = utils::split_string(p1), split2 = utils::split_string(p2);
117 |     const std::set<string> tokens1(split1.cbegin(), split1.cend()),
118 |                            tokens2(split2.cbegin(), split2.cend());
119 | 
120 |     /* std::set iterates in sorted order, so the ranges produced below are
121 |      * already sorted and need no additional std::sort() pass. */
122 |     vector<string> intersection, diff1to2, diff2to1;
123 | 
124 |     std::set_intersection(tokens1.cbegin(), tokens1.cend(),
125 |                           tokens2.cbegin(), tokens2.cend(),
126 |                           std::back_inserter(intersection));
127 |     std::set_difference(tokens1.cbegin(), tokens1.cend(),
128 |                         tokens2.cbegin(), tokens2.cend(),
129 |                         std::back_inserter(diff1to2));
130 |     std::set_difference(tokens2.cbegin(), tokens2.cend(),
131 |                         tokens1.cbegin(), tokens1.cend(),
132 |                         std::back_inserter(diff2to1));
133 | 
134 |     auto sorted_sect = utils::join(intersection),
135 |          sorted_1to2 = utils::join(diff1to2),
136 |          sorted_2to1 = utils::join(diff2to1);
137 | 
138 |     auto combined_1to2 = sorted_sect + " " + sorted_1to2,
139 |          combined_2to1 = sorted_sect + " " + sorted_2to1;
140 | 
141 |     sorted_sect = utils::trim(sorted_sect);
142 |     combined_1to2 = utils::trim(combined_1to2);
143 |     combined_2to1 = utils::trim(combined_2to1);
144 | 
145 |     /* Score the common part against each combination, and both combinations
146 |      * against each other; the best of the three wins. */
147 |     auto ratio_func = partial ? partial_ratio : ratio;
148 |     auto pairwise = vector<unsigned int>{
149 |         ratio_func(sorted_sect, combined_1to2, full_process),
150 |         ratio_func(sorted_sect, combined_2to1, full_process),
151 |         ratio_func(combined_1to2, combined_2to1, full_process)
152 |     };
153 |     return *std::max_element(pairwise.cbegin(), pairwise.cend());
154 | }
155 | /* Public entry point: the four-argument token_set_ratio() with partial = false. */
156 | unsigned int token_set_ratio(const string &s1, const string &s2, const bool full_process)
157 | {
158 |     return token_set_ratio(s1, s2, false, full_process);
159 | }
160 | /* Public entry point: the four-argument token_set_ratio() with partial = true. */
161 | unsigned int partial_token_set_ratio(const string &s1, const string &s2, const bool full_process)
162 | {
163 |     return token_set_ratio(s1, s2, true, full_process);
164 | }
165 | /* ratio() of the optionally pre-processed inputs; 0 if either one is empty. */
166 | unsigned int quick_ratio(const string &s1, const string &s2, const bool full_process)
167 | {
168 |     const string lhs = full_process ? utils::full_process(s1) : s1;
169 |     const string rhs = full_process ? utils::full_process(s2) : s2;
170 | 
171 |     /* Only non-empty pairs are scored. */
172 |     if (!lhs.empty() && !rhs.empty())
173 |         return ratio(lhs, rhs);
174 |     return 0;
175 | }
176 | /* Best of the plain, token-sort and token-set scores (partial variants when the lengths differ by 1.5x or more). */
177 | unsigned int weighted_ratio(const string &s1, const string &s2, const bool full_process)
178 | {
179 |     string p1 = full_process ? utils::full_process(s1) : s1;
180 |     string p2 = full_process ? utils::full_process(s2) : s2;
181 | 
182 |     if (p1.length() == 0 || p2.length() == 0)
183 |         return 0;
184 | 
185 |     double unbase_scale = 0.95;
186 |     double partial_scale = 0.90;
187 | 
188 |     /* Kept as double: utils::max() casts every argument to the type of the
189 |      * first one, so an integral `base` would truncate the scaled scores. */
190 |     const double base = ratio(p1, p2);
191 |     double len_ratio = static_cast<double>(utils::max(p1.length(), p2.length())) /
192 |             static_cast<double>(utils::min(p1.length(), p2.length()));
193 | 
194 |     /* If strings are similar length, don't use partials. */
195 |     const bool try_partial = !(len_ratio < 1.5);
196 | 
197 |     /* If one string is much much shorter than the other. */
198 |     if (len_ratio > 8)
199 |         partial_scale = 0.60;
200 | 
201 |     if (try_partial) {
202 |         double partial = partial_ratio(p1, p2) * partial_scale;
203 |         double ptsor = token_sort_partial_ratio(p1, p2) * unbase_scale * partial_scale;
204 |         double ptser = partial_token_set_ratio(p1, p2) * unbase_scale * partial_scale;
205 | 
206 |         return utils::intr(utils::max(base, partial, ptsor, ptser));
207 |     } else {
208 |         double tsor = token_sort_ratio(p1, p2, false) * unbase_scale;
209 |         double tser = token_set_ratio(p1, p2, false) * unbase_scale;
210 | 
211 |         return utils::intr(utils::max(base, tsor, tser));
212 |     }
213 | }
214 | 
215 | }  // ns fuzz
216 | 


--------------------------------------------------------------------------------
/src/levenshtein.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file has been altered to better fit fuzzywuzzy.
  3 |  * To see all changes made, please diff this file with
  4 |  * <https://github.com/Tmplt/python-Levenshtein/blob/master/Levenshtein.c>
  5 |  *
  6 |  * Summary:
  7 |  *   - stripped all python-related code and data types;
  8 |  *   - fixed some spelling errors.
  9 |  */
 10 | 
 11 | /*
 12 |  * Levenshtein.c
 13 |  * @(#) $Id: Levenshtein.c,v 1.41 2005/01/13 20:05:36 yeti Exp $
 14 |  * Python extension computing Levenshtein distances, string similarities,
 15 |  * median strings and other goodies.
 16 |  *
 17 |  * Copyright (C) 2002-2003 David Necas (Yeti) <yeti@physics.muni.cz>.
 18 |  *
 19 |  * The Taus113 random generator:
 20 |  * Copyright (C) 2002 Atakan Gurkan
 21 |  * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough
 22 |  * (see below for more)
 23 |  *
 24 |  * This program is free software; you can redistribute it and/or modify it
 25 |  * under the terms of the GNU General Public License as published by the Free
 26 |  * Software Foundation; either version 2 of the License, or (at your option)
 27 |  * any later version.
 28 |  *
 29 |  * This program is distributed in the hope that it will be useful, but WITHOUT
 30 |  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 31 |  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 32 |  * more details.
 33 |  *
 34 |  * You should have received a copy of the GNU General Public License along
 35 |  * with this program; if not, write to the Free Software Foundation, Inc.,
 36 |  * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 37 |  **/
 38 | 
 39 | /**
 40 |  * TODO:
 41 |  *
 42 |  * - Implement weighted string averaging, see:
 43 |  *   H. Bunke et. al.: On the Weighted Mean of a Pair of Strings,
 44 |  *         Pattern Analysis and Applications 2002, 5(1): 23-30.
 45 |  *   X. Jiang et. al.: Dynamic Computations of Generalized Median Strings,
 46 |  *         Pattern Analysis and Applications 2002, ???.
 47 |  *   The latter also contains an interesting median-search algorithm.
 48 |  *
 49 |  * - Deal with stray symbols in greedy median() and median_improve().
 50 |  *   There are two possibilities:
 51 |  *    (i) Remember which strings contain which symbols.  This allows certain
 52 |  *        small optimizations when processing them.
 53 |  *   (ii) Use some overall heuristics to find symbols which aren't worth
 54 |  *        trying.  This is very appealing, but hard to do properly
 55 |  *        (requires some inequality strong enough to allow practical exclusion
 56 |  *        of certain symbols -- at certain positions)
 57 |  *
 58 |  * - Editops should be an object that only *looks* like a list (which means
 59 |  *   it is a list in duck typing) to avoid never-ending conversions from
 60 |  *   Python lists to LevEditOp arrays and back
 61 |  *
 62 |  * - Optimize munkers_blackman(), it's pretty dumb (no memory of visited
 63 |  *   columns/rows)
 64 |  *
 65 |  * - Make it really usable as a C library (needs some wrappers, headers, ...,
 66 |  *   and maybe even documentation ;-)
 67 |  *
 68 |  * - Add interface to various interesting auxiliary results, namely
 69 |  *   set and sequence distance (only ratio is exported), the map from
 70 |  *   munkers_blackman() itself, ...
 71 |  *
 72 |  * - Generalizations:
 73 |  *   - character weight matrix/function
 74 |  *   - arbitrary edit operation costs, decomposable edit operations
 75 |  *
 76 |  * - Create a test suite
 77 |  *
 78 |  * - Add more interesting algorithms ;-)
 79 |  *
 80 |  * Postponed TODO (investigated, and a big `but' was found):
 81 |  *
 82 |  * - A linear approximate set median algorithm:
 83 |  *   P. Indyk: Sublinear time algorithms for metric space problems,
 84 |  *         STOC 1999, http://citeseer.nj.nec.com/indyk00sublinear.html.
 85 |  *   BUT: The algorithm seems to be advantageous only in the case of very
 86 |  *   large sets -- if my estimates are correct (the article itself is quite
 87 |  *   `asymptotic'), say 10^5 at least.  On smaller sets either one would get
 88 |  *   only an extremely rough median estimate, or the number of distance
 89 |  *   computations would be in fact higher than in the dumb O(n^2) algorithm.
 90 |  *
 91 |  * - Improve setmedian() speed with triangular inequality, see:
 92 |  *   Juan, A., E. Vidal: An Algorithm for Fast Median Search,
 93 |  *         1997, http://citeseer.nj.nec.com/article/juan97algorithm.html
 94 |  *   BUT: It doesn't seem to help much in spaces of high dimension (see the
 95 |  *   discussion and graphs in the article itself), a few percents at most,
 96 |  *   and strings behave like a space with a very high dimension (locally), so
 97 |  *   who knows, it probably wouldn't help much.
 98 |  *
 99 |  **/
100 | 
101 | #ifndef _GNU_SOURCE
102 | #  define _GNU_SOURCE
103 | #endif
104 | 
105 | #include <string.h>
106 | #include <math.h>
107 | /* for debugging */
108 | #include <stdio.h>
109 | 
110 | #include <assert.h>
111 | #include "levenshtein.h"
112 | 
113 | /**
114 |  * lev_edit_distance:
115 |  * @len1: The length of @string1.
116 |  * @string1: A sequence of bytes of length @len1, may contain NUL characters.
117 |  * @len2: The length of @string2.
118 |  * @string2: A sequence of bytes of length @len2, may contain NUL characters.
119 |  * @xcost: If nonzero, the replace operation has weight 2, otherwise all
120 |  *         edit operations have equal weights of 1.
121 |  *
122 |  * Computes Levenshtein edit distance of two strings.
123 |  * Returns: The edit distance, or (size_t)(-1) when the working row cannot
124 |  *          be allocated.
125 |  **/
126 | size_t
127 | lev_edit_distance(size_t len1, const lev_byte *string1,
128 |                   size_t len2, const lev_byte *string2,
129 |                   int xcost) {
130 |     size_t i;
131 |     size_t *row;  /* we only need to keep one row of costs */
132 |     size_t *end;
133 |     size_t half;
134 | 
135 |     /* strip common prefix */
136 |     while (len1 > 0 && len2 > 0 && *string1 == *string2) {
137 |         len1--;
138 |         len2--;
139 |         string1++;
140 |         string2++;
141 |     }
142 | 
143 |     /* strip common suffix */
144 |     while (len1 > 0 && len2 > 0 && string1[len1 - 1] == string2[len2 - 1]) {
145 |         len1--;
146 |         len2--;
147 |     }
148 | 
149 |     /* catch trivial cases */
150 |     if (len1 == 0)
151 |         return len2;
152 |     if (len2 == 0)
153 |         return len1;
154 | 
155 |     /* make the inner cycle (i.e. string2) the longer one */
156 |     if (len1 > len2) {
157 |         size_t nx = len1;
158 |         const lev_byte *sx = string1;
159 |         len1 = len2;
160 |         len2 = nx;
161 |         string1 = string2;
162 |         string2 = sx;
163 |     }
164 |     /* check len1 == 1 separately */
165 |     if (len1 == 1) {
166 |         if (xcost)
167 |             return len2 + 1 - 2 * (memchr(string2, *string1, len2) != NULL);
168 |         else
169 |             return len2 - (memchr(string2, *string1, len2) != NULL);
170 |     }
171 |     len1++;
172 |     len2++;  /* from here on len1/len2 are matrix dimensions (string length + 1) */
173 |     half = len1 >> 1;
174 | 
175 |     /* initialize first row */
176 |     row = (size_t *) malloc(len2 * sizeof(size_t));
177 |     if (!row)
178 |         return (size_t) (-1);
179 |     end = row + len2 - 1;  /* last cell of the row */
180 |     for (i = 0; i < len2 - (xcost ? 0 : half); i++)
181 |         row[i] = i;
182 | 
183 |     /* go through the matrix and compute the costs.  yes, this is an extremely
184 |      * obfuscated version, but also extremely memory-conservative and relatively
185 |      * fast.  */
186 |     if (xcost) {
187 |         for (i = 1; i < len1; i++) {
188 |             size_t *p = row + 1;
189 |             const lev_byte char1 = string1[i - 1];
190 |             const lev_byte *char2p = string2;
191 |             size_t D = i;
192 |             size_t x = i;
193 |             while (p <= end) {
194 |                 if (char1 == *(char2p++))
195 |                     x = --D;
196 |                 else
197 |                     x++;
198 |                 D = *p;
199 |                 D++;
200 |                 if (x > D)
201 |                     x = D;
202 |                 *(p++) = x;
203 |             }
204 |         }
205 |     } else {
206 |         /* in this case we don't have to scan two corner triangles (of size len1/2)
207 |          * in the matrix because no best path can go through them. note this
208 |          * breaks when len1 == len2 == 2 so the memchr() special case above is
209 |          * necessary */
210 |         row[0] = len1 - half - 1;
211 |         for (i = 1; i < len1; i++) {
212 |             size_t *p;
213 |             const lev_byte char1 = string1[i - 1];
214 |             const lev_byte *char2p;
215 |             size_t D, x;
216 |             /* skip the upper triangle */
217 |             if (i >= len1 - half) {
218 |                 size_t offset = i - (len1 - half);
219 |                 size_t c3;
220 | 
221 |                 char2p = string2 + offset;
222 |                 p = row + offset;
223 |                 c3 = *(p++) + (char1 != *(char2p++));
224 |                 x = *p;
225 |                 x++;
226 |                 D = x;
227 |                 if (x > c3)
228 |                     x = c3;
229 |                 *(p++) = x;
230 |             } else {
231 |                 p = row + 1;
232 |                 char2p = string2;
233 |                 D = x = i;
234 |             }
235 |             /* skip the lower triangle */
236 |             if (i <= half + 1)
237 |                 end = row + len2 + i - half - 2;
238 |             /* main */
239 |             while (p <= end) {
240 |                 size_t c3 = --D + (char1 != *(char2p++));
241 |                 x++;
242 |                 if (x > c3)
243 |                     x = c3;
244 |                 D = *p;
245 |                 D++;
246 |                 if (x > D)
247 |                     x = D;
248 |                 *(p++) = x;
249 |             }
250 |             /* lower triangle sentinel */
251 |             if (i <= half) {
252 |                 size_t c3 = --D + (char1 != *char2p);
253 |                 x++;
254 |                 if (x > c3)
255 |                     x = c3;
256 |                 *p = x;
257 |             }
258 |         }
259 |     }
260 | 
261 |     i = *end;  /* i is reused to carry the result past free() */
262 |     free(row);
263 |     return i;
264 | }
265 | 
266 | /**
267 |  * editops_from_cost_matrix:
268 |  * @len1: Number of rows of @matrix; @string1 is read at indices 0..len1-2.
269 |  * @string1: The source string, may contain NUL characters.
270 |  * @off1: Offset added to every source position stored in the result.
271 |  * @len2: Number of columns of @matrix; @string2 is read at indices 0..len2-2.
272 |  * @string2: The destination string, may contain NUL characters.
273 |  * @off2: Offset added to every destination position stored in the result.
274 |  * @matrix: The cost matrix (row-major, @len1 x @len2).  It is always freed.
275 |  * @n: Where the number of edit operations should be stored.
276 |  *
277 |  * Reconstructs the optimal edit sequence from the cost matrix @matrix.
278 |  * Returns: The optimal edit sequence, as a newly allocated array of
279 |  *          elementary edit operations; its length is stored in @n.
280 |  *          NULL with @n == 0 when no edits are needed, NULL with
281 |  *          @n == (size_t)(-1) on allocation failure or an inconsistent matrix.
282 |  **/
283 | static LevEditOp*
284 | editops_from_cost_matrix(size_t len1, const lev_byte *string1, size_t off1,
285 |                          size_t len2, const lev_byte *string2, size_t off2,
286 |                          size_t *matrix, size_t *n)
287 | {
288 |     size_t *p;
289 |     size_t i, j, pos;
290 |     LevEditOp *ops;
291 |     int dir = 0;
292 | 
293 |     pos = *n = matrix[len1*len2 - 1];
294 |     if (!*n) {
295 |         free(matrix);
296 |         return NULL;
297 |     }
298 |     ops = (LevEditOp*)malloc((*n)*sizeof(LevEditOp));
299 |     if (!ops) {
300 |         free(matrix);
301 |         *n = (size_t)(-1);
302 |         return NULL;
303 |     }
304 |     i = len1 - 1;
305 |     j = len2 - 1;
306 |     p = matrix + len1*len2 - 1;
307 |     while (i || j) {
308 |         /* prefer continuing in the same direction */
309 |         if (dir < 0 && j && *p == *(p - 1) + 1) {
310 |             pos--;
311 |             ops[pos].type = LEV_EDIT_INSERT;
312 |             ops[pos].spos = i + off1;
313 |             ops[pos].dpos = --j + off2;
314 |             p--;
315 |             continue;
316 |         }
317 |         if (dir > 0 && i && *p == *(p - len2) + 1) {
318 |             pos--;
319 |             ops[pos].type = LEV_EDIT_DELETE;
320 |             ops[pos].spos = --i + off1;
321 |             ops[pos].dpos = j + off2;
322 |             p -= len2;
323 |             continue;
324 |         }
325 |         if (i && j && *p == *(p - len2 - 1) && string1[i - 1] == string2[j - 1]) {
326 |             /* don't be stupid like difflib, don't store LEV_EDIT_KEEP */
327 |             i--;
328 |             j--;
329 |             p -= len2 + 1;
330 |             dir = 0;
331 |             continue;
332 |         }
333 |         if (i && j && *p == *(p - len2 - 1) + 1) {
334 |             pos--;
335 |             ops[pos].type = LEV_EDIT_REPLACE;
336 |             ops[pos].spos = --i + off1;
337 |             ops[pos].dpos = --j + off2;
338 |             p -= len2 + 1;
339 |             dir = 0;
340 |             continue;
341 |         }
342 |         /* we can't turn directly from -1 to 1; going diagonally is better, but check it (dir == 0) */
343 |         if (dir == 0 && j && *p == *(p - 1) + 1) {
344 |             pos--;
345 |             ops[pos].type = LEV_EDIT_INSERT;
346 |             ops[pos].spos = i + off1;
347 |             ops[pos].dpos = --j + off2;
348 |             p--;
349 |             dir = -1;
350 |             continue;
351 |         }
352 |         if (dir == 0 && i && *p == *(p - len2) + 1) {
353 |             pos--;
354 |             ops[pos].type = LEV_EDIT_DELETE;
355 |             ops[pos].spos = --i + off1;
356 |             ops[pos].dpos = j + off2;
357 |             p -= len2;
358 |             dir = 1;
359 |             continue;
360 |         }
361 |         /* lost: abort in debug builds; with NDEBUG fail cleanly instead of spinning forever */
362 |         assert("lost in the cost matrix" == NULL);
363 |         free(ops);
364 |         free(matrix);
365 |         *n = (size_t)(-1);
366 |         return NULL;
367 |     }
368 |     free(matrix);
369 |     return ops;
370 | }
371 | 
372 | 
373 | /**
374 |  * lev_editops_find:
375 |  * @len1: The length of @string1.
376 |  * @string1: A string of length @len1, may contain NUL characters.
377 |  * @len2: The length of @string2.
378 |  * @string2: A string of length @len2, may contain NUL characters.
379 |  * @n: Where the number of edit operations should be stored.
380 |  *
381 |  * Find an optimal edit sequence from @string1 to @string2.
382 |  *
383 |  * When there's more than one optimal sequence, a one is arbitrarily (though
384 |  * deterministically) chosen.
385 |  *
386 |  * Returns: The optimal edit sequence, as a newly allocated array of
387 |  *          elementary edit operations, it length is stored in @n.
388 |  *          It is normalized, i.e., keep operations are not included.
389 |  **/
390 | LevEditOp*
391 | lev_editops_find(size_t len1, const lev_byte *string1,
392 |                  size_t len2, const lev_byte *string2,
393 |                  size_t *n)
394 | {
395 |     size_t len1o, len2o;
396 |     size_t i;
397 |     size_t *matrix; /* cost matrix */
398 | 
399 |     /* strip common prefix */
400 |     len1o = 0;
401 |     while (len1 > 0 && len2 > 0 && *string1 == *string2) {
402 |         len1--;
403 |         len2--;
404 |         string1++;
405 |         string2++;
406 |         len1o++;
407 |     }
408 |     len2o = len1o;
409 | 
410 |     /* strip common suffix */
411 |     while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) {
412 |         len1--;
413 |         len2--;
414 |     }
415 |     len1++;
416 |     len2++;
417 | 
418 |     /* initalize first row and column */
419 |     matrix = (size_t*)malloc(len1*len2*sizeof(size_t));
420 |     if (!matrix) {
421 |         *n = (size_t)(-1);
422 |         return NULL;
423 |     }
424 |     for (i = 0; i < len2; i++)
425 |         matrix[i] = i;
426 |     for (i = 1; i < len1; i++)
427 |         matrix[len2*i] = i;
428 | 
429 |     /* find the costs and fill the matrix */
430 |     for (i = 1; i < len1; i++) {
431 |         size_t *prev = matrix + (i - 1)*len2;
432 |         size_t *p = matrix + i*len2;
433 |         size_t *end = p + len2 - 1;
434 |         const lev_byte char1 = string1[i - 1];
435 |         const lev_byte *char2p = string2;
436 |         size_t x = i;
437 |         p++;
438 |         while (p <= end) {
439 |             size_t c3 = *(prev++) + (char1 != *(char2p++));
440 |             x++;
441 |             if (x > c3)
442 |                 x = c3;
443 |             c3 = *prev + 1;
444 |             if (x > c3)
445 |                 x = c3;
446 |             *(p++) = x;
447 |         }
448 |     }
449 | 
450 |     /* find the way back */
451 |     return editops_from_cost_matrix(len1, string1, len1o,
452 |                                     len2, string2, len2o,
453 |                                     matrix, n);
454 | }
455 | 
/**
 * lev_u_edit_distance:
 * @len1: The length of @string1.
 * @string1: A sequence of Unicode characters of length @len1, may contain NUL
 *           characters.
 * @len2: The length of @string2.
 * @string2: A sequence of Unicode characters of length @len2, may contain NUL
 *           characters.
 * @xcost: If nonzero, the replace operation has weight 2, otherwise all
 *         edit operations have equal weights of 1.
 *
 * Computes Levenshtein edit distance of two Unicode strings.
 *
 * Common prefixes and suffixes are stripped first; the remaining distance is
 * computed with a single cost row that is updated in place.
 *
 * Returns: The edit distance, or (size_t)(-1) if the cost row cannot be
 *          allocated.
 **/
size_t
lev_u_edit_distance(size_t len1, const lev_wchar *string1,
                    size_t len2, const lev_wchar *string2,
                    int xcost)
{
  size_t i;
  size_t *row;  /* we only need to keep one row of costs */
  size_t *end;
  size_t half;

  /* strip common prefix */
  while (len1 > 0 && len2 > 0 && *string1 == *string2) {
    len1--;
    len2--;
    string1++;
    string2++;
  }

  /* strip common suffix */
  while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) {
    len1--;
    len2--;
  }

  /* catch trivial cases: one side is empty, so the distance is the length
   * of the other side */
  if (len1 == 0)
    return len2;
  if (len2 == 0)
    return len1;

  /* make the inner cycle (i.e. string2) the longer one */
  if (len1 > len2) {
    size_t nx = len1;
    const lev_wchar *sx = string1;
    len1 = len2;
    len2 = nx;
    string1 = string2;
    string2 = sx;
  }
  /* check len1 == 1 separately: a linear scan of string2 for the single
   * character decides the result */
  if (len1 == 1) {
    lev_wchar z = *string1;
    const lev_wchar *p = string2;
    for (i = len2; i; i--) {
      if (*(p++) == z)
        return len2 - 1;
    }
    /* not found: one more than len2 - 1 without xcost, two more with it */
    return len2 + (xcost != 0);
  }
  /* from here on len1 and len2 count matrix rows and columns */
  len1++;
  len2++;
  half = len1 >> 1;

  /* initialize first row */
  row = (size_t*)malloc(len2*sizeof(size_t));
  if (!row)
    return (size_t)(-1);
  end = row + len2 - 1;
  /* without xcost the last `half` cells are not initialized here */
  for (i = 0; i < len2 - (xcost ? 0 : half); i++)
    row[i] = i;

  /* go through the matrix and compute the costs.  yes, this is an extremely
   * obfuscated version, but also extremely memory-conservative and relatively
   * fast.  */
  if (xcost) {
    for (i = 1; i < len1; i++) {
      size_t *p = row + 1;
      const lev_wchar char1 = string1[i - 1];
      const lev_wchar *char2p = string2;
      /* D: value diagonally up-left of *p; x: value left of *p */
      size_t D = i - 1;
      size_t x = i;
      while (p <= end) {
        if (char1 == *(char2p++))
          x = D;
        else
          x++;
        /* *p still holds the previous row's value (the cell above) */
        D = *p;
        if (x > D + 1)
          x = D + 1;
        *(p++) = x;
      }
    }
  }
  else {
    /* in this case we don't have to scan two corner triangles (of size len1/2)
     * in the matrix because no best path can go through them. note this
     * breaks when len1 == len2 == 2 so the len1 == 1 special case above is
     * necessary */
    row[0] = len1 - half - 1;
    for (i = 1; i < len1; i++) {
      size_t *p;
      const lev_wchar char1 = string1[i - 1];
      const lev_wchar *char2p;
      size_t D, x;
      /* skip the upper triangle */
      if (i >= len1 - half) {
        size_t offset = i - (len1 - half);
        size_t c3;

        char2p = string2 + offset;
        p = row + offset;
        c3 = *(p++) + (char1 != *(char2p++));
        x = *p;
        x++;
        D = x;
        if (x > c3)
          x = c3;
        *(p++) = x;
      }
      else {
        p = row + 1;
        char2p = string2;
        D = x = i;
      }
      /* skip the lower triangle */
      if (i <= half + 1)
        end = row + len2 + i - half - 2;
      /* main */
      while (p <= end) {
        size_t c3 = --D + (char1 != *(char2p++));
        x++;
        if (x > c3)
          x = c3;
        D = *p;
        D++;
        if (x > D)
          x = D;
        *(p++) = x;
      }
      /* lower triangle sentinel */
      if (i <= half) {
        size_t c3 = --D + (char1 != *char2p);
        x++;
        if (x > c3)
          x = c3;
        *p = x;
      }
    }
  }

  /* the result is the last cell written; read it before releasing the row */
  i = *end;
  free(row);
  return i;
}
615 | 
/**
 * lev_editops_to_opcodes:
 * @n: The size of @ops.
 * @ops: An array of elementary edit operations.
 * @nb: Where the number of difflib block operation codes should be stored.
 * @len1: The length of the source string.
 * @len2: The length of the destination string.
 *
 * Converts elementary edit operations to difflib block operation codes.
 *
 * Note the string lengths are necessary since difflib doesn't allow omitting
 * keep operations.
 *
 * The conversion runs in two passes over @ops: the first counts the blocks,
 * the second fills the allocated array.  Both passes must walk @ops the same
 * way, which the final assertion checks.
 *
 * Returns: The converted block operation codes, as a newly allocated array;
 *          its length is stored in @nb.  If the allocation fails, %NULL is
 *          returned and @nb is set to (size_t)(-1).
 **/
LevOpCode*
lev_editops_to_opcodes(size_t n, const LevEditOp *ops, size_t *nb,
                       size_t len1, size_t len2)
{
    size_t nbl, i, spos, dpos;
    const LevEditOp *o;
    LevOpCode *bops, *b;
    LevEditType type;

    /* compute the number of blocks */
    nbl = 0;
    o = ops;
    spos = dpos = 0;
    type = LEV_EDIT_KEEP;
    for (i = n; i; ) {
        /* simply pretend there are no keep blocks */
        while (o->type == LEV_EDIT_KEEP && --i)
            o++;
        if (!i)
            break;
        /* a gap before the next operation is an implicit keep block */
        if (spos < o->spos || dpos < o->dpos) {
            nbl++;
            spos = o->spos;
            dpos = o->dpos;
        }
        nbl++;
        type = o->type;
        /* consume the whole run of contiguous operations of this type */
        switch (type) {
            case LEV_EDIT_REPLACE:
                do {
                    spos++;
                    dpos++;
                    i--;
                    o++;
                } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
                break;

            case LEV_EDIT_DELETE:
                do {
                    spos++;
                    i--;
                    o++;
                } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
                break;

            case LEV_EDIT_INSERT:
                do {
                    dpos++;
                    i--;
                    o++;
                } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
                break;

            default:
                break;
        }
    }
    /* trailing keep block up to the ends of both strings */
    if (spos < len1 || dpos < len2)
        nbl++;

    /* convert */
    b = bops = (LevOpCode*)malloc(nbl*sizeof(LevOpCode));
    if (!bops) {
        *nb = (size_t)(-1);
        return NULL;
    }
    o = ops;
    spos = dpos = 0;
    type = LEV_EDIT_KEEP;
    for (i = n; i; ) {
        /* simply pretend there are no keep blocks */
        while (o->type == LEV_EDIT_KEEP && --i)
            o++;
        if (!i)
            break;
        b->sbeg = spos;
        b->dbeg = dpos;
        /* emit the implicit keep block first, then start the real one */
        if (spos < o->spos || dpos < o->dpos) {
            b->type = LEV_EDIT_KEEP;
            spos = b->send = o->spos;
            dpos = b->dend = o->dpos;
            b++;
            b->sbeg = spos;
            b->dbeg = dpos;
        }
        type = o->type;
        switch (type) {
            case LEV_EDIT_REPLACE:
                do {
                    spos++;
                    dpos++;
                    i--;
                    o++;
                } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
                break;

            case LEV_EDIT_DELETE:
                do {
                    spos++;
                    i--;
                    o++;
                } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
                break;

            case LEV_EDIT_INSERT:
                do {
                    dpos++;
                    i--;
                    o++;
                } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
                break;

            default:
                break;
        }
        /* close the block at the positions reached by the run */
        b->type = type;
        b->send = spos;
        b->dend = dpos;
        b++;
    }
    if (spos < len1 || dpos < len2) {
        /* the remaining tails are kept, so they must be equally long */
        assert(len1 - spos == len2 - dpos);
        b->type = LEV_EDIT_KEEP;
        b->sbeg = spos;
        b->dbeg = dpos;
        b->send = len1;
        b->dend = len2;
        b++;
    }
    /* both passes must agree on the number of blocks */
    assert((size_t)(b - bops) == nbl);

    *nb = nbl;
    return bops;
}
766 | 
767 | /**
768 |  * lev_opcodes_matching_blocks:
769 |  * @len1: The length of the source string.
770 |  * @len2: The length of the destination string.
771 |  * @nb: The size of @bops.
772 |  * @bops: An array of difflib block edit operation codes.
773 |  * @nmblocks: Where the number of matching block should be stored.
774 |  *
775 |  * Computes the matching block corresponding to an optimal edit @bops.
776 |  *
777 |  * Returns: The matching blocks as a newly allocated array, it length is
778 |  *          stored in @nmblocks.
779 |  **/
780 | LevMatchingBlock*
781 | lev_opcodes_matching_blocks(size_t len1,
782 |                             __attribute__((unused)) size_t len2,
783 |                             size_t nb,
784 |                             const LevOpCode *bops,
785 |                             size_t *nmblocks)
786 | {
787 |     size_t nmb, i;
788 |     const LevOpCode *b;
789 |     LevMatchingBlock *mblocks, *mb;
790 | 
791 |     /* compute the number of matching blocks */
792 |     nmb = 0;
793 |     b = bops;
794 |     for (i = nb; i; i--, b++) {
795 |         if (b->type == LEV_EDIT_KEEP) {
796 |             nmb++;
797 |             /* adjacent KEEP blocks -- we never produce it, but... */
798 |             while (i && b->type == LEV_EDIT_KEEP) {
799 |                 i--;
800 |                 b++;
801 |             }
802 |             if (!i)
803 |                 break;
804 |         }
805 |     }
806 | 
807 |     /* convert */
808 |     mb = mblocks = (LevMatchingBlock*)malloc(nmb*sizeof(LevOpCode));
809 |     if (!mblocks) {
810 |         *nmblocks = (size_t)(-1);
811 |         return NULL;
812 |     }
813 |     b = bops;
814 |     for (i = nb; i; i--, b++) {
815 |         if (b->type == LEV_EDIT_KEEP) {
816 |             mb->spos = b->sbeg;
817 |             mb->dpos = b->dbeg;
818 |             /* adjacent KEEP blocks -- we never produce it, but... */
819 |             while (i && b->type == LEV_EDIT_KEEP) {
820 |                 i--;
821 |                 b++;
822 |             }
823 |             if (!i) {
824 |                 mb->len = len1 - mb->spos;
825 |                 mb++;
826 |                 break;
827 |             }
828 |             mb->len = b->sbeg - mb->spos;
829 |             mb++;
830 |         }
831 |     }
832 |     assert((size_t)(mb - mblocks) == nmb);
833 | 
834 |     *nmblocks = nmb;
835 |     return mblocks;
836 | }
837 | 
838 | /**
839 |  * lev_editops_matching_blocks:
840 |  * @len1: The length of the source string.
841 |  * @len2: The length of the destination string.
842 |  * @n: The size of @ops.
843 |  * @ops: An array of elementary edit operations.
844 |  * @nmblocks: Where the number of matching block should be stored.
845 |  *
846 |  * Computes the matching block corresponding to an optimal edit @ops.
847 |  *
848 |  * Returns: The matching blocks as a newly allocated array, it length is
849 |  *          stored in @nmblocks.
850 |  **/
851 | LevMatchingBlock*
852 | lev_editops_matching_blocks(size_t len1,
853 |                             size_t len2,
854 |                             size_t n,
855 |                             const LevEditOp *ops,
856 |                             size_t *nmblocks)
857 | {
858 |     size_t nmb, i, spos, dpos;
859 |     LevEditType type;
860 |     const LevEditOp *o;
861 |     LevMatchingBlock *mblocks, *mb;
862 | 
863 |     /* compute the number of matching blocks */
864 |     nmb = 0;
865 |     o = ops;
866 |     spos = dpos = 0;
867 |     type = LEV_EDIT_KEEP;
868 |     for (i = n; i; ) {
869 |         /* simply pretend there are no keep blocks */
870 |         while (o->type == LEV_EDIT_KEEP && --i)
871 |             o++;
872 |         if (!i)
873 |             break;
874 |         if (spos < o->spos || dpos < o->dpos) {
875 |             nmb++;
876 |             spos = o->spos;
877 |             dpos = o->dpos;
878 |         }
879 |         type = o->type;
880 |         switch (type) {
881 |             case LEV_EDIT_REPLACE:
882 |                 do {
883 |                     spos++;
884 |                     dpos++;
885 |                     i--;
886 |                     o++;
887 |                 } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
888 |                 break;
889 | 
890 |             case LEV_EDIT_DELETE:
891 |                 do {
892 |                     spos++;
893 |                     i--;
894 |                     o++;
895 |                 } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
896 |                 break;
897 | 
898 |             case LEV_EDIT_INSERT:
899 |                 do {
900 |                     dpos++;
901 |                     i--;
902 |                     o++;
903 |                 } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
904 |                 break;
905 | 
906 |             default:
907 |                 break;
908 |         }
909 |     }
910 |     if (spos < len1 || dpos < len2)
911 |         nmb++;
912 | 
913 |     /* fill the info */
914 |     mb = mblocks = (LevMatchingBlock*)malloc(nmb*sizeof(LevOpCode));
915 |     if (!mblocks) {
916 |         *nmblocks = (size_t)(-1);
917 |         return NULL;
918 |     }
919 |     o = ops;
920 |     spos = dpos = 0;
921 |     type = LEV_EDIT_KEEP;
922 |     for (i = n; i; ) {
923 |         /* simply pretend there are no keep blocks */
924 |         while (o->type == LEV_EDIT_KEEP && --i)
925 |             o++;
926 |         if (!i)
927 |             break;
928 |         if (spos < o->spos || dpos < o->dpos) {
929 |             mb->spos = spos;
930 |             mb->dpos = dpos;
931 |             mb->len = o->spos - spos;
932 |             spos = o->spos;
933 |             dpos = o->dpos;
934 |             mb++;
935 |         }
936 |         type = o->type;
937 |         switch (type) {
938 |             case LEV_EDIT_REPLACE:
939 |                 do {
940 |                     spos++;
941 |                     dpos++;
942 |                     i--;
943 |                     o++;
944 |                 } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
945 |                 break;
946 | 
947 |             case LEV_EDIT_DELETE:
948 |                 do {
949 |                     spos++;
950 |                     i--;
951 |                     o++;
952 |                 } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
953 |                 break;
954 | 
955 |             case LEV_EDIT_INSERT:
956 |                 do {
957 |                     dpos++;
958 |                     i--;
959 |                     o++;
960 |                 } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
961 |                 break;
962 | 
963 |             default:
964 |                 break;
965 |         }
966 |     }
967 |     if (spos < len1 || dpos < len2) {
968 |         assert(len1 - spos == len2 - dpos);
969 |         mb->spos = spos;
970 |         mb->dpos = dpos;
971 |         mb->len = len1 - spos;
972 |         mb++;
973 |     }
974 |     assert((size_t)(mb - mblocks) == nmb);
975 | 
976 |     *nmblocks = nmb;
977 |     return mblocks;
978 | }
979 | 


--------------------------------------------------------------------------------
/src/process.cpp:
--------------------------------------------------------------------------------
  1 | #include "process.hpp"
  2 | 
  3 | #include <set>
  4 | 
  5 | namespace fuzz
  6 | {
  7 | 
  8 | using std::set;
  9 | 
 10 | vector<pair<string, int>> extractWithoutOrder(const string& query, const vector<string>& choices
 11 |     , function<string(string)> processor, function<int(string, string, const bool)> scorer
 12 |     , int score_cutoff)
 13 | {
 14 |     string processed_query = processor(query);
 15 | 
 16 |     /* TODO: Avoid running full_process twice. */
 17 |     
 18 |     auto score_func = [&scorer] (const string& s1, const string& s2) { return scorer(s1, s2, false); };
 19 |     auto pre_processor = utils::full_process;
 20 | 
 21 |     /* NOTE: Why? But the Python version does the following. */
 22 |     /* processed_query = pre_processor(processed_query) */
 23 | 
 24 |     vector<pair<string, int>> results;
 25 |     for(const auto& choice : choices) {
 26 |         string processed = pre_processor(processor(choice));
 27 |         int score = score_func(processed_query, processed);
 28 |         if(score >= score_cutoff)
 29 |             results.emplace_back(choice, score);
 30 |     }
 31 | 
 32 |     return results;
 33 | }
 34 | 
 35 | vector<pair<string, int>> extractBests(const string& query, const vector<string>& choices
 36 |     , function<string(string)> processor, function<int(string, string, const bool)> scorer
 37 |     , int score_cutoff, intmax_t limit)
 38 | {
 39 |     auto sl = extractWithoutOrder(query, choices, processor, scorer, score_cutoff);
 40 |     if(limit == -1)
 41 |         return sl;
 42 | 
 43 |     std::partial_sort(sl.begin(), sl.begin()+limit, sl.end(), [](const auto& a, const auto& b){ return a.second > b.second; });
 44 | 
 45 |     /* If limit < 0, it means to return everything. Since vector::size() is always */
 46 |     /* larger than -1, we can combine the check. */
 47 |     if(sl.size() > limit) {
 48 |         sl.resize((size_t)limit);
 49 |         sl.shrink_to_fit();
 50 |     }
 51 |     return sl;
 52 | }
 53 | 
 54 | vector<pair<string, int>> extract(const string& query, const vector<string>& choices
 55 |     , function<string(string)> processor, function<int(string, string, const bool)> scorer
 56 |     , intmax_t limit)
 57 | {
 58 |     return extractBests(query, choices, processor, scorer, 0, limit);
 59 | }
 60 | 
 61 | vector<pair<string, int>> extractOne(const string& query, const vector<string>& choices
 62 |     , function<string(string)> processor, function<int(string, string, const bool)> scorer
 63 |     , int score_cutoff)
 64 | {
 65 |     return extractBests(query, choices, processor, scorer, score_cutoff, 1);
 66 | }
 67 | 
 68 | vector<string> dedupe(const vector<string>& contains_dupes, int threshold, function<int(string, string, const bool)> scorer)
 69 | {
 70 |     /* NOTE: This function is a translation of the python and it can be optimized a lot. The original algorithm is */
 71 |     /* far from ideal. */
 72 |     vector<string> extractor;
 73 | 
 74 |     for(const auto& str : contains_dupes) {
 75 |         auto matches = extract(str, contains_dupes, utils::full_process, scorer, -1);
 76 | 
 77 |         vector<string> filtered;
 78 |         for(size_t i=0;i<matches.size();i++) {
 79 |             int score = matches[i].second;
 80 |             string value = matches[i].first;
 81 |             if(score > threshold)
 82 |                 filtered.push_back(value);
 83 |         }
 84 | 
 85 |         /* if there is only 1 item in *filtered*, no duplicates were found so append to *extracted* */
 86 |         if(filtered.size() == 1)
 87 |             extractor.push_back(*filtered.begin());
 88 |         else if(filtered.size() != 0) {
 89 |             /* alpha sort */
 90 |             std::stable_sort(filtered.begin(), filtered.end(), [](const auto& a, const auto& b){ return a[0] > b[0]; });
 91 | 
 92 |             /* length sort */
 93 |             std::stable_sort(filtered.begin(), filtered.end(), [](const auto& a, const auto& b){ return a.size() > b.size(); });
 94 | 
 95 |             /* take first item as our 'canonical example' */
 96 |             extractor.push_back(*filtered.begin());
 97 |         }
 98 |     }
 99 | 
100 |     // uniquify *extractor* list
101 |     set<string> keys;
102 |     for(auto str : extractor)
103 |         keys.insert(str);
104 |         
105 |     /* check that extractor differs from contain_dupes (e.g. duplicates were found) */
106 |     /* if not, then return the original list */
107 |     if(keys.size() == contains_dupes.size())
108 |         return contains_dupes;
109 |     else
110 |         return vector<string>(keys.begin(), keys.end());  
111 | }
112 | 
113 | }  // ns fuzz
114 | 
115 | 


--------------------------------------------------------------------------------
/src/string_matcher.cpp:
--------------------------------------------------------------------------------
 1 | #include "string_matcher.hpp"
 2 | #include "wrapper.hpp"
 3 | 
 4 | #include <algorithm>
 5 | #include <iostream>
 6 | 
 7 | namespace fuzz {
 8 | 
 9 | void string_matcher::set_strings(const string s1, const string s2)
10 | {
11 |     s1_ = s1;
12 |     s2_ = s2;
13 | 
14 |     reset_cache();
15 | }
16 | 
17 | void string_matcher::set_string1(const string s1)
18 | {
19 |     s1_ = s1;
20 |     reset_cache();
21 | }
22 | 
23 | void string_matcher::set_string2(const string s2)
24 | {
25 |     s2_ = s2;
26 |     reset_cache();
27 | }
28 | 
29 | void string_matcher::reset_cache()
30 | {
31 |     ratio_ = distance_ = 0;
32 | 
33 |     matching_blocks_.clear();
34 |     op_codes_.clear();
35 |     edit_ops_.clear();
36 | }
37 | 
38 | vector<LevOpCode> string_matcher::get_opcodes()
39 | {
40 |     if (op_codes_.empty())
41 |         op_codes_ = wrapper::get_opcodes(s1_, s2_);
42 | 
43 |     return op_codes_;
44 | }
45 | 
46 | vector<LevEditOp> string_matcher::get_editops()
47 | {
48 |    if (edit_ops_.empty())
49 |        edit_ops_ = wrapper::get_editops(s1_, s2_);
50 |     return edit_ops_;
51 | }
52 | 
53 | vector<LevMatchingBlock> string_matcher::get_matching_blocks()
54 | {
55 |     if (matching_blocks_.empty()) {
56 |         auto ops = get_opcodes();
57 |         matching_blocks_ = wrapper::get_matching_blocks(ops, s1_, s2_);
58 |     }
59 |     return matching_blocks_;
60 | }
61 | 
62 | double string_matcher::ratio()
63 | {
64 |     if (ratio_ == not_set)
65 |         ratio_ = wrapper::ratio(s1_, s2_);
66 |     return ratio_;
67 | }
68 | 
69 | double string_matcher::real_quick_ratio()
70 | {
71 |     size_t len1 = s1_.length(), len2 = s2_.length();
72 |     return 2.0 * static_cast<double>(std::min(len1, len2)) / static_cast<double>((len1 + len2));
73 | }
74 | 
75 | }  // ns fuzz
76 | 


--------------------------------------------------------------------------------
/src/utils.cpp:
--------------------------------------------------------------------------------
  1 | #include "utils.hpp"
  2 | 
  3 | #include <cmath>
  4 | 
  5 | namespace fuzz {
  6 | 
  7 | namespace utils {
  8 | 
  9 | /*
 10 |  * Return a rounded percentage in the range [0,100].
 11 |  */
 12 | unsigned int percent_round(double val)
 13 | {
 14 |     return intr(100 * val);
 15 | }
 16 | 
 17 | /*
 18 |  * Return a correctly rounded integer.
 19 |  */
 20 | unsigned int intr(double val)
 21 | {
 22 |     return static_cast<unsigned int>(std::round(val));
 23 | }
 24 | 
 25 | /*
 26 |  * Split a string into multiple strings when a character is met.
 27 |  * Returns all tokens in an array.
 28 |  */
 29 | vector<string> split_string(const string &str, const char c)
 30 | {
 31 |     vector<string> tokens;
 32 |     string word;
 33 |     for (const auto &len : str) {
 34 |         if (len == c && word.size()) {
 35 |             tokens.push_back(word);
 36 |             word.clear();
 37 |         } else if (len != c) {
 38 |             word += len;
 39 |         }
 40 |     }
 41 | 
 42 |     if(word.size()) {
 43 |         tokens.push_back(word);
 44 |     }
 45 | 
 46 |     return tokens;
 47 | }
 48 | 
 49 | /*
 50 |  * Removes leading and trailing whitespace characters from
 51 |  * the passed string.
 52 |  */
 53 | string& trim(string &str)
 54 | {
 55 |     auto isspace = [](char ch) {
 56 |         /* NOTE: should we specify the locale? */
 57 |         return !std::isspace(ch);
 58 |     };
 59 | 
 60 |     /* Strip leading whitespace.. */
 61 |     auto start = std::find_if(str.cbegin(), str.cend(), isspace);
 62 |     str.erase(str.cbegin(), start);
 63 | 
 64 |     /* .. and trailing. */
 65 |     auto end = std::find_if(str.crbegin(), str.crend(), isspace);
 66 |     str.erase(end.base(), str.cend());
 67 | 
 68 |     return str;
 69 | }
 70 | 
 71 | /*
 72 |  * Akin to Pythons join: concatenate a vector of strings
 73 |  * with intervening occurrences of sep.
 74 |  */
 75 | string join(const vector<string> &v, const string &sep)
 76 | {
 77 |     string retstr = "";
 78 |     for (const auto &str : v)
 79 |         retstr += str + (str == v.back() ? "" : sep);
 80 | 
 81 |     return retstr;
 82 | }
 83 | 
 84 | /*
 85 |  * Process the string by
 86 |  *  - replace non-alphanumeric characters with whitespace,
 87 |  *  - trim whitespace, and
 88 |  *  - forcing to lower case.
 89 |  */
 90 | string full_process(string str)
 91 | {
 92 |     /* Replace non-alphanumeric characters with whitespace, */
 93 |     std::replace_if(str.begin(), str.end(), [](char ch) {
 94 |         /* NOTE: same thing here: specify locale? */
 95 |         return !std::isalnum(ch);
 96 |     }, ' ');
 97 | 
 98 |     /* trim whitespace, and */
 99 |     str = utils::trim(str);
100 | 
101 |     /* force to lower case. */
102 |     std::transform(str.begin(), str.end(), str.begin(), ::tolower);
103 | 
104 |     return str;
105 | }
106 | 
/*
 * Smaller of two size_t values.
 */
size_t min(size_t a, size_t b)
{
    if (b < a)
        return b;
    return a;
}
114 | 
115 | }  // ns utils
116 | 
117 | }  // ns fuzz
118 | 


--------------------------------------------------------------------------------
/src/wrapper.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | 
  3 | #include "wrapper.hpp"
  4 | 
  5 | namespace wrapper {
  6 | 
  7 | double ratio(const string &str1, const string &str2)
  8 | {
  9 |     size_t len1 = str1.length(),
 10 |            len2 = str2.length();
 11 | 
 12 |     const lev_byte *lb1 = reinterpret_cast<const lev_byte *>(str1.c_str()),
 13 |                    *lb2 = reinterpret_cast<const lev_byte *>(str2.c_str());
 14 | 
 15 |     size_t lensum = len1 + len2;
 16 |     size_t edit_dist = lev_edit_distance(len1, lb1, len2, lb2, 1);
 17 | 
 18 |     return static_cast<double>(lensum - edit_dist) / static_cast<double>(lensum);
 19 | }
 20 | 
 21 | vector<LevOpCode> get_opcodes(string &s1, string &s2)
 22 | {
 23 |     /*
 24 |      * Find the edit operations between s1 and s2, convert them to opcodes
 25 |      * and copy those into a vector.  Both C arrays are released with free()
 26 |      * before returning; a null result from either call yields an empty vector.
 27 |      */
 28 |     vector<LevOpCode> result;
 29 |     const size_t n1 = s1.length(), n2 = s2.length();
 30 |     const lev_byte *bytes1 = reinterpret_cast<const lev_byte *>(s1.c_str());
 31 |     const lev_byte *bytes2 = reinterpret_cast<const lev_byte *>(s2.c_str());
 32 | 
 33 |     size_t n_ops;
 34 |     LevEditOp *edit_ops = lev_editops_find(n1, bytes1, n2, bytes2, &n_ops);
 35 |     if (edit_ops == nullptr)
 36 |         return result;
 37 | 
 38 |     size_t n_codes;
 39 |     LevOpCode *codes = lev_editops_to_opcodes(n_ops, edit_ops, &n_codes, n1, n2);
 40 |     if (codes != nullptr) {
 41 |         result.assign(codes, codes + n_codes);
 42 |         free(codes);
 43 |     }
 44 |     free(edit_ops);
 45 |     return result;
 46 | }
 47 | 
 48 | vector<LevEditOp> get_editops(string &s1, string &s2)
 49 | {
 50 |     /*
 51 |      * Ask lev_editops_find() for the edit operations between s1 and s2 and
 52 |      * copy them into a vector.  The returned C array is released with free();
 53 |      * a null result yields an empty vector.
 54 |      */
 55 |     const size_t n1 = s1.length(), n2 = s2.length();
 56 |     const lev_byte *bytes1 = reinterpret_cast<const lev_byte *>(s1.c_str());
 57 |     const lev_byte *bytes2 = reinterpret_cast<const lev_byte *>(s2.c_str());
 58 | 
 59 |     size_t count;
 60 |     LevEditOp *raw = lev_editops_find(n1, bytes1, n2, bytes2, &count);
 61 | 
 62 |     vector<LevEditOp> result;
 63 |     if (raw != nullptr) {
 64 |         result.assign(raw, raw + count);
 65 |         free(raw);
 66 |     }
 67 |     return result;
 68 | }
 69 | 
 70 | vector<LevOpCode> get_opcodes(vector<LevEditOp> &v, string &s1, string &s2)
 71 | {
 72 |     /*
 73 |      * Convert the edit operations in v to opcodes via lev_editops_to_opcodes(),
 74 |      * passing the lengths of s1 and s2.  The C array it returns is copied into
 75 |      * a vector and then released with free(); a null result yields an empty vector.
 76 |      */
 77 |     vector<LevOpCode> result;
 78 |     size_t n_codes;
 79 | 
 80 |     LevOpCode *codes = lev_editops_to_opcodes(v.size(), v.data(), &n_codes,
 81 |                                               s1.length(), s2.length());
 82 |     if (codes == nullptr)
 83 |         return result;
 84 | 
 85 |     result.assign(codes, codes + n_codes);
 86 |     free(codes);
 87 | 
 88 |     return result;
 89 | }
 90 | 
 91 | vector<LevMatchingBlock> get_matching_blocks(vector<LevOpCode> &v, string &s1, string &s2)
 92 | {
 93 |     /*
 94 |      * Hand the opcodes in v and the lengths of s1 and s2 to
 95 |      * lev_opcodes_matching_blocks() and copy the array it returns into a
 96 |      * vector.  The array is released with free(); null yields an empty vector.
 97 |      */
 98 |     vector<LevMatchingBlock> result;
 99 |     size_t n_blocks;
100 |     LevMatchingBlock *raw = lev_opcodes_matching_blocks(
101 |         s1.length(), s2.length(), v.size(), v.data(), &n_blocks);
102 |     if (raw == nullptr)
103 |         return result;
104 | 
105 |     result.assign(raw, raw + n_blocks);
106 |     free(raw);
107 |     return result;
108 | }
109 | 
110 | }  // ns wrapper
111 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Test driver: builds the `main` executable and links it against fuzzywuzzy.
2 | set(LIB_INCLUDE_DIRS ${LIB_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/include)
3 | 
4 | add_executable(main main.cpp)
5 | target_include_directories(main PRIVATE ${LIB_INCLUDE_DIRS})
6 | target_link_libraries(main PRIVATE fuzzywuzzy)


--------------------------------------------------------------------------------
/test/main.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "fuzzywuzzy.hpp"
 4 | #include "process.hpp"
 5 | 
 6 | int main()
 7 | {
 8 |     /* Smoke test: print the results of three ratio calls, then call dedupe once. */
 9 |     const string mind = "I'm in your mind", mind_fuzz = "I'm in your mind fuzz";
10 |     std::cout << fuzz::ratio(mind, mind_fuzz) << '\n';
11 |     std::cout << fuzz::partial_ratio(mind, mind_fuzz) << '\n';
12 | 
13 |     const string bear = "fuzzy wuzzy was a bear", raeb = "wuzzy fuzzy was a bear";
14 |     std::cout << fuzz::token_sort_ratio(bear, raeb) << '\n';
15 |     std::vector<string> words = {"fuzzy", "wuzzy", "wuzzy", "fuzzy", "fuzzy", " "};
16 |     auto deduped = fuzz::dedupe(words);
17 | }
18 | 


--------------------------------------------------------------------------------