├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── include ├── common.hpp ├── fuzzywuzzy.hpp ├── levenshtein.h ├── process.hpp ├── string_matcher.hpp ├── utils.hpp └── wrapper.hpp ├── src ├── CMakeLists.txt ├── fuzzywuzzy.cpp ├── levenshtein.c ├── process.cpp ├── string_matcher.cpp ├── utils.cpp └── wrapper.cpp └── test ├── CMakeLists.txt └── main.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .idea/ 3 | cmake-build-debug/ 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Joins arguments and places the results in ${result_var}. 2 | function(join result_var) 3 | set(result ) 4 | foreach (arg ${ARGN}) 5 | set(result "${result}${arg}") 6 | endforeach () 7 | set(${result_var} "${result}" PARENT_SCOPE) 8 | endfunction() 9 | 10 | message(STATUS "CMake version: ${CMAKE_VERSION}") 11 | 12 | cmake_minimum_required(VERSION 3.0) 13 | 14 | # Determine if fuzzywuzzy is built as a subproject (using add_subdirectory) 15 | # or if it is the master project. 16 | set(MASTER_PROJECT OFF) 17 | if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) 18 | set(MASTER_PROJECT ON) 19 | endif() 20 | 21 | # Set the default CMAKE_BUILD_TYPE to Release. 22 | # This should be done before the project command since the latter can set 23 | # CMAKE_BUILD_TYPE itself (it does so for nmake). 24 | if (NOT CMAKE_BUILD_TYPE) 25 | join(doc "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or " 26 | "CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel.") 27 | set(CMAKE_BUILD_TYPE Release CACHE STRING ${doc}) 28 | endif() 29 | 30 | option(FUZZ_TEST "Generate the test target." 
${MASTER_PROJECT}) 31 | 32 | project(fuzzywuzzy LANGUAGES C CXX) 33 | 34 | message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") 35 | 36 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 37 | 38 | if (CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) 39 | set(PEDANTIC_COMPILE_FLAGS -Wall -Wextra -Wshadow -pedantic) 40 | endif() 41 | 42 | set(CMAKE_MODULE_PATH 43 | ${CMAKE_MODULE_PATH} 44 | ${PROJECT_SOURCE_DIR}/cmake) 45 | 46 | add_subdirectory(src) 47 | add_subdirectory(test) 48 | # Enable CTest only when the FUZZ_TEST option (declared above) is ON. 49 | if (FUZZ_TEST) 50 | enable_testing() 51 | #add_subdirectory(test) 52 | endif() 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. 
Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 
55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. 
You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. 
(This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. 
These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. 
If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 
305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 
340 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is an in-progress port of [seatgeek's fuzzywuzzy](https://github.com/seatgeek/fuzzywuzzy/) Python library to C++. 2 | When done, this library will have the same interface and behavior. 3 | 4 | The underlying C-library ([python-Levenshtein](https://github.com/miohtama/python-Levenshtein), mirrored [here](https://github.com/Tmplt/python-Levenshtein)) has been stripped of its Python interfacing 5 | and wrapped in some C++ code. 6 | 7 | | files in `src/` | Python/C-lib equivalent | 8 | | ----- | ----------------------- | 9 | | `fuzzywuzzy.{c,h}pp` and `string_matcher.{c,h}pp` | Line-by-line Python-to-C++ translations of the Python library and python-Levenshtein's `StringMatcher.py`. | 10 | | `wrapper.{c,h}pp` | (Python-interfaced-)C-to-C++ wrapper of `ratio_py`, `get_opcodes_py`, `get_matching_blocks_py`, etc. from python-Levenshtein. | 11 | | `utils.{c,h}pp` | Utility functions, translated from the Python library's `utils.py`. | 12 | | `levenshtein.{c,h}` | The underlying C functions, copied verbatim. 
| 13 | 14 | Usage 15 | ----- 16 | ```cpp 17 | #include 18 | ``` 19 | 20 | **Simple Ratio** 21 | ```cpp 22 | fuzz::ratio("this is a test", "this is a test!"); // returns 97 23 | ``` 24 | 25 | **Partial Ratio** 26 | ```cpp 27 | fuzz::partial_ratio("this is a test", "this is a test!"); // returns 100 28 | ``` 29 | 30 | **Token Sort Ratio** 31 | ```cpp 32 | fuzz::ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"); // returns 91 33 | 34 | fuzz::token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"); // returns 100 35 | ``` 36 | 37 | **Token Set Ratio** 38 | ```cpp 39 | fuzz::token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"); // returns 83 (this should be 84) 40 | 41 | fuzz::token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"); // returns 100 42 | ``` -------------------------------------------------------------------------------- /include/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifdef __clang__ 7 | #include 8 | using std::string_view; 9 | #else 10 | #include 11 | using std::experimental::string_view; 12 | #endif 13 | 14 | using std::vector; 15 | using std::string; 16 | -------------------------------------------------------------------------------- /include/fuzzywuzzy.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | 5 | namespace /* I'm in your mind... */ fuzz { 6 | 7 | /* */ 8 | /* Basic scoring functions. */ 9 | /* */ 10 | 11 | /* Calculates a Levenshtein simple ratio between the two strings. */ 12 | unsigned int ratio(const string &s1, const string &s2, const bool full_process = true); 13 | 14 | /* 15 | * Return the ratio of the most similar substring 16 | * as a number between 0 and 100. 
17 | */ 18 | unsigned int partial_ratio(const string &s1, const string &s2, const bool full_process = true); 19 | 20 | /* */ 21 | /* Advanced scoring functions. */ 22 | /* */ 23 | 24 | /* 25 | * Returns a measure of the strings' similarity between 0 and 100 26 | * but sorting the tokens before comparing. 27 | */ 28 | unsigned int token_sort_ratio(const string &s1, const string &s2, const bool full_process = true); 29 | unsigned int token_sort_partial_ratio(const string &s1, const string &s2, const bool full_process = true); 30 | 31 | /* 32 | * Splits the strings into tokens and computes intersections and 33 | * remainders between the tokens of the two strings. A comparison string 34 | * is then built up and is compared using the simple ratio algorithm. 35 | * Useful for strings where words appear redundantly. 36 | */ 37 | unsigned int token_set_ratio(const string &s1, const string &s2, const bool full_process = true); 38 | 39 | /* 40 | * Returns the ratio of the most similar substring as a number 41 | * between 0 and 100 but sorting the tokens before comparing. 42 | */ 43 | unsigned int partial_token_set_ratio(const string &s1, const string &s2, const bool full_process = true); 44 | 45 | /* */ 46 | /* Combination API */ 47 | /* */ 48 | 49 | /* 50 | * Quick ratio comparison between two strings. 51 | * Runs utils::full_process on both strings. 52 | * Short circuits if either string is empty after processing. 53 | */ 54 | unsigned int quick_ratio(const string &s1, const string &s2, const bool full_process = true); 55 | 56 | /* 57 | * Returns a measure of the strings' similarity between 0 and 100, using different algorithms. 58 | * 59 | * Steps in the order they occur: 60 | * #. Run utils::full_process on both strings 61 | * #. Short circuit if either string is empty 62 | * #. Take the ratio of the two processed strings 63 | * #. 
Run checks to compare the length of the strings: 64 | * * If one of the strings is more than 1.5 times as long as the other, 65 | * use partial_ratio comparisons -- scale partial results by 0.9 66 | * (this makes sure only full results can return 100) 67 | * * If one of the strings is over 8 times as long as the other, 68 | * scale by 0.6 instead 69 | * 70 | * #. Run the other ratio functions 71 | * * If using partial ratio functions, call partial_ratio, 72 | * partial_token_sort_ratio and partial_token_set_ratio. 73 | * Then scale all of these by the ratio based on length. 74 | * * Otherwise call token_sort_ratio and token_set_ratio 75 | * and scale these results by 0.95 (on top of any partial scalars) 76 | * 77 | * #. Take the highest value from these results, round it, and return 78 | * as an integer. 79 | */ 80 | unsigned int weighted_ratio(const string &s1, const string &s2, const bool full_process = true); 81 | 82 | /* I'm not in your mind */ } 83 | -------------------------------------------------------------------------------- /include/levenshtein.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This file has been altered to better fit fuzzywuzzy. 3 | * To see all changes made, please diff this file with 4 | * 5 | * 6 | * Summary: 7 | * - stripped all python-related code and data types; 8 | */ 9 | 10 | /* @(#) $Id: Levenshtein.h,v 1.22 2005/01/13 20:02:56 yeti Exp $ */ 11 | #ifndef LEVENSHTEIN_H 12 | #define LEVENSHTEIN_H 13 | 14 | #ifndef size_t 15 | # include 16 | #endif 17 | 18 | /* A bit dirty. */ 19 | #ifndef _LEV_STATIC_PY 20 | # define _LEV_STATIC_PY /* */ 21 | #endif 22 | 23 | /* In C, this is just wchar_t and unsigned char, in Python, lev_wchar can 24 | * be anything. If you really want to cheat, define wchar_t to any integer 25 | * type you like before including Levenshtein.h and recompile it. 
*/ 26 | #ifndef lev_wchar 27 | # ifndef wchar_t 28 | # include 29 | # endif 30 | # define lev_wchar wchar_t 31 | #endif 32 | typedef unsigned char lev_byte; 33 | 34 | /* Edit operation type 35 | * DON'T CHANGE! used as array indices and the bits are occasionally used 36 | * as flags */ 37 | typedef enum { 38 | LEV_EDIT_KEEP = 0, 39 | LEV_EDIT_REPLACE = 1, 40 | LEV_EDIT_INSERT = 2, 41 | LEV_EDIT_DELETE = 3, 42 | LEV_EDIT_LAST /* sometimes returned when an error occurs */ 43 | } LevEditType; 44 | 45 | /* Error codes returned by editop check functions */ 46 | typedef enum { 47 | LEV_EDIT_ERR_OK = 0, 48 | LEV_EDIT_ERR_TYPE, /* nonexistent edit type */ 49 | LEV_EDIT_ERR_OUT, /* edit out of string bounds */ 50 | LEV_EDIT_ERR_ORDER, /* ops are not ordered */ 51 | LEV_EDIT_ERR_BLOCK, /* inconsistent block boundaries (block ops) */ 52 | LEV_EDIT_ERR_SPAN, /* sequence is not a full transformation (block ops) */ 53 | LEV_EDIT_ERR_LAST 54 | } LevEditOpError; 55 | 56 | /* string averaging method (UNUSED yet) */ 57 | typedef enum { 58 | LEV_AVG_HEAD = 0, /* take operations from the head */ 59 | LEV_AVG_TAIL, /* take operations from the tail */ 60 | LEV_AVG_SPREAD, /* take an equidistantly distributed subset */ 61 | LEV_AVG_BLOCK, /* take a random continuous block */ 62 | LEV_AVG_RANDOM, /* take a random subset */ 63 | LEV_AVG_LAST 64 | } LevAveragingType; 65 | 66 | /* Edit operation (atomic). 67 | * This is the `native' atomic edit operation. It differs from the difflib 68 | * one because it represents a change of one character, not a block. And 69 | * we usually don't care about LEV_EDIT_KEEP, though the functions can handle 70 | * them. The positions are interpreted as at the left edge of a character. 71 | */ 72 | typedef struct { 73 | LevEditType type; /* editing operation type */ 74 | size_t spos; /* source block position */ 75 | size_t dpos; /* destination position */ 76 | } LevEditOp; 77 | 78 | /* Edit operation (difflib-compatible). 
79 | * This is not `native', but conversion functions exist. These fields exactly 80 | * correspond to the codeops() tuples fields (and this method is also the 81 | * source of the silly OpCode name). Sequences must span over complete 82 | * strings, subsequences are simply edit sequences with more (or larger) 83 | * LEV_EDIT_KEEP blocks. 84 | */ 85 | typedef struct { 86 | LevEditType type; /* editing operation type */ 87 | size_t sbeg, send; /* source block begin, end */ 88 | size_t dbeg, dend; /* destination block begin, end */ 89 | } LevOpCode; 90 | 91 | /* Matching block (difflib-compatible). */ 92 | typedef struct { 93 | size_t spos; 94 | size_t dpos; 95 | size_t len; 96 | } LevMatchingBlock; 97 | 98 | size_t 99 | lev_edit_distance(size_t len1, 100 | const lev_byte *string1, 101 | size_t len2, 102 | const lev_byte *string2, 103 | int xcost); 104 | 105 | size_t 106 | lev_u_edit_distance(size_t len1, 107 | const lev_wchar *string1, 108 | size_t len2, 109 | const lev_wchar *string2, 110 | int xcost); 111 | 112 | LevEditOp* 113 | lev_editops_find(size_t len1, 114 | const lev_byte *string1, 115 | size_t len2, 116 | const lev_byte *string2, 117 | size_t *n); 118 | 119 | LevOpCode* 120 | lev_editops_to_opcodes(size_t n, 121 | const LevEditOp *ops, 122 | size_t *nb, 123 | size_t len1, 124 | size_t len2); 125 | 126 | LevMatchingBlock* 127 | lev_opcodes_matching_blocks(size_t len1, 128 | __attribute__((unused)) size_t len2, 129 | size_t nb, 130 | const LevOpCode *bops, 131 | size_t *nmblocks); 132 | 133 | LevMatchingBlock* 134 | lev_editops_matching_blocks(size_t len1, 135 | size_t len2, 136 | size_t n, 137 | const LevEditOp *ops, 138 | size_t *nmblocks); 139 | 140 | 141 | #endif -------------------------------------------------------------------------------- /include/process.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | #include 5 | #include 6 | 7 | #include "utils.hpp" 8 | 
#include "fuzzywuzzy.hpp"

/*
 * NOTE(review): every template argument list in this listing (for example
 * on `vector', `function' and `template') was stripped during extraction.
 * The declarations below are kept exactly as they appear; restore the
 * arguments from the upstream headers before compiling.
 */

namespace fuzz
{
    using std::pair;
    using std::function;
    /*
     * Finds the best matches in a vector of choices. Returns a vector of pairs which
     * contains the matches and their respective scores.
     *
     * `processor' defaults to utils::full_process, `scorer' to weighted_ratio
     * and `score_cutoff' to 0.
     */
    vector> extractWithoutOrder(const string& query, const vector& choices
        , function processor=utils::full_process, function scorer=weighted_ratio
        , int score_cutoff=0);

    /*
     * Convenience function for getting the choices with best scores.
     * Takes both a `score_cutoff' (default 0) and a `limit' (default 5).
     */
    vector> extractBests(const string& query, const vector& choices
        , function processor=utils::full_process, function scorer=weighted_ratio
        , int score_cutoff = 0, intmax_t limit = 5);

    /*
     * Convenience function for getting the choices with best scores.
     * Unlike extractBests() this overload has no `score_cutoff' parameter,
     * only a `limit' (default 5).
     */
    vector> extract(const string& query, const vector& choices
        , function processor=utils::full_process, function scorer=weighted_ratio
        , intmax_t limit = 5);

    /*
     * This is a convenience method which returns the single best choice.
     * Note that the declared return type is still a vector.
     */
    vector> extractOne(const string& query, const vector& choices
        , function processor=utils::full_process, function scorer=weighted_ratio
        , int score_cutoff = 0);
    /*
     * This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify
     * and remove duplicates. Specifically, it uses the process.extract to identify duplicates that
     * score greater than a user defined threshold. Then, it looks for the longest item in the duplicate list
     * since we assume this item contains the most entity information and returns that. It breaks string
     * length ties on an alphabetical sort.
     *
     * Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the
     * returned deduplicated list will likely be shorter. Raise the threshold for fuzzy_dedupe to be less
     * sensitive.
     */
    vector dedupe(const vector& contains_dupes, int threshold=70
        , function scorer=token_set_ratio);

} // ns fuzz

--------------------------------------------------------------------------------
/include/string_matcher.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "common.hpp"
#include "levenshtein.h"

namespace fuzz {

/* Marker stored in the cached members below until a value is computed. */
enum { not_set = -1 };

/*
 * Holds a pair of strings and exposes matching blocks, opcodes, edit
 * operations and similarity ratios for them.
 */
class string_matcher {
public:
    explicit string_matcher(string s1, string s2)
        : s1_(s1), s2_(s2) {}

    void set_strings(const string s1, const string s2);
    void set_string1(const string s1);
    void set_string2(const string s2);

    vector get_matching_blocks();
    vector get_opcodes();
    vector get_editops();

    double ratio();
    double real_quick_ratio();

protected:

private:
    string s1_, s2_;
    /* Both caches start out as not_set (-1). */
    double ratio_ = not_set;
    int distance_ = not_set;

    vector matching_blocks_;
    vector op_codes_;
    vector edit_ops_;

    void reset_cache();
};

} // ns fuzz

--------------------------------------------------------------------------------
/include/utils.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "common.hpp"

#include // std::max_element()

namespace fuzz {

namespace utils {

unsigned int percent_round(double val);

unsigned int intr(double val);

vector split_string(const string &str, const char c = ' ');

string& trim(string &str);

string join(const vector &v, const string &sep = " ");

string full_process(string str);

size_t min(size_t a, size_t b);

#ifdef CPP17

/*
 * Variadic max built on a fold expression over std::max(); returns a
 * reference to the largest argument.
 */
template
decltype(auto) max(const First &f, const T & ... t)
{
    const First *retval = &f;
    ( (retval = &std::max(*retval, t)), ... );
    return *retval;
}

#else

/*
 * An "extension" of std::max() so that more than two arguments
 * can be passed. The first argument decides what everything else
 * is cast to: every further argument goes through a static_cast to the
 * first argument's type before the comparison, so passing an integral
 * first argument truncates floating-point arguments.
 *
 * Hopefully the compiler will complain if we pass this something stupid.
 * NOTE: Can this be done when omitting first?
 */
template
auto max(const T &first, const Args&... args)
{
    /* Copy all arguments into one vector and pick the largest element. */
    std::vector vec = {first, static_cast(args)...};
    auto max = std::max_element(vec.cbegin(), vec.cend());

    return *max;
}

#endif

} // ns utils

} // ns fuzz

--------------------------------------------------------------------------------
/include/wrapper.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "common.hpp"

/* levenshtein.h declares C functions, hence the C linkage block. */
extern "C" {
#include "levenshtein.h"
}

using std::vector;

namespace wrapper {

double ratio(const string &str1, const string &str2);

vector get_matching_blocks(vector &v, string &s1, string &s2);
vector get_opcodes(string &s1, string &s2);
vector get_opcodes(vector &ops, string &s1, string &s2);
vector get_editops(string &s1, string &s2);

} // ns wrapper

--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
set(LIB_INCLUDE_DIRS ${LIB_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/include)
include_directories(${LIB_INCLUDE_DIRS})

# The C Levenshtein core is a static, position-independent library so that it
# can be linked into the shared fuzzywuzzy library.
add_library(levenshtein STATIC levenshtein.c)
set_property(TARGET levenshtein PROPERTY POSITION_INDEPENDENT_CODE ON)

file(GLOB_RECURSE SOURCES RELATIVE ${PROJECT_SOURCE_DIR}/src *.cpp*)
add_library(fuzzywuzzy SHARED ${SOURCES})
target_link_libraries(fuzzywuzzy levenshtein) 10 | -------------------------------------------------------------------------------- /src/fuzzywuzzy.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "fuzzywuzzy.hpp" 6 | #include "string_matcher.hpp" 7 | #include "utils.hpp" 8 | 9 | namespace fuzz { 10 | 11 | unsigned int ratio(const string &s1, const string &s2, const bool full_process) 12 | { 13 | string p1 = full_process ? utils::full_process(s1) : s1; 14 | string p2 = full_process ? utils::full_process(s2) : s2; 15 | 16 | auto m = string_matcher(p1, p2); 17 | return utils::percent_round(m.ratio()); 18 | } 19 | 20 | unsigned int partial_ratio(const string &s1, const string &s2, const bool full_process) 21 | { 22 | string p1 = full_process ? utils::full_process(s1) : s1; 23 | string p2 = full_process ? utils::full_process(s2) : s2; 24 | 25 | string shorter, longer; 26 | 27 | if (p1.length() <= p2.length()) { 28 | shorter = p1; 29 | longer = p2; 30 | } else { 31 | shorter = p2; 32 | longer = p1; 33 | } 34 | 35 | auto m = string_matcher(shorter, longer); 36 | auto blocks = m.get_matching_blocks(); 37 | 38 | /* 39 | * Each block represents a string of matching characters 40 | * in a string of the form (idx_1, idx_2, len). The best 41 | * partial match will block align with at least one 42 | * of those blocks. 43 | * e.g. 
shorter = "abcd", longer "XXXbcdeEEE" 44 | * block = (1, 3, 3) 45 | * best score == ratio("abcd", "Xbcd") 46 | */ 47 | vector scores; 48 | for (const auto &block : blocks) { 49 | size_t long_start = utils::max(0, block.dpos - block.spos); 50 | size_t long_end = shorter.length(); 51 | 52 | auto long_substr = longer.substr(long_start, long_end); 53 | auto m2 = string_matcher(shorter, long_substr); 54 | double r = m2.ratio(); 55 | 56 | if (r > 0.995) 57 | return 100; 58 | else 59 | scores.push_back(r); 60 | } 61 | 62 | if (scores.empty()) 63 | return 0; 64 | 65 | double max = *std::max_element(scores.cbegin(), scores.cend()); 66 | return utils::percent_round(max); 67 | } 68 | 69 | /* Returns a cleaned string with tokens sorted. */ 70 | static string proccess_and_sort(const string &s, const bool full_process) 71 | { 72 | string ps = (full_process ? utils::full_process(s) : s); 73 | 74 | auto tokens = utils::split_string(ps); 75 | std::sort(tokens.begin(), tokens.end()); 76 | string sorted = utils::join(tokens); 77 | 78 | return utils::trim(sorted); 79 | } 80 | 81 | unsigned int token_sort_ratio(const string &s1, const string &s2, const bool full_proccess) 82 | { 83 | /* NOTE: do we need force_ascii? */ 84 | string sorted1 = proccess_and_sort(s1, full_proccess); 85 | string sorted2 = proccess_and_sort(s2, full_proccess); 86 | 87 | return ratio(sorted1, sorted2); 88 | } 89 | 90 | unsigned int token_sort_partial_ratio(const string &s1, const string &s2, const bool full_proccess) 91 | { 92 | /* NOTE: do we need force_ascii? */ 93 | string sorted1 = proccess_and_sort(s1, full_proccess); 94 | string sorted2 = proccess_and_sort(s2, full_proccess); 95 | 96 | return partial_ratio(sorted1, sorted2); 97 | } 98 | 99 | /* 100 | * Find all alphanumeric tokens in each string and: 101 | * - treat them as a set, 102 | * - construct two strings of the form , 103 | * - take ratios of those two strings, and 104 | * - check for unordered partial matches. 
105 | */ 106 | static unsigned int token_set_ratio(const string &s1, const string &s2, bool partial, const bool full_process) 107 | { 108 | string p1 = full_process ? utils::full_process(s1) : s1; 109 | string p2 = full_process ? utils::full_process(s2) : s2; 110 | 111 | if (p1.length() == 0 || p2.length() == 0) 112 | return 0; 113 | 114 | auto split1 = utils::split_string(p1), split2 = utils::split_string(p2); 115 | auto tokens1 = std::set(split1.cbegin(), split1.cend()), 116 | tokens2 = std::set(split2.cbegin(), split2.cend()); 117 | 118 | vector intersection, diff1to2, diff2to1; 119 | 120 | std::set_intersection(tokens1.cbegin(), tokens1.cend(), 121 | tokens2.cbegin(), tokens2.cend(), 122 | std::back_inserter(intersection)); 123 | 124 | std::set_difference(tokens1.cbegin(), tokens1.cend(), 125 | tokens2.cbegin(), tokens2.cend(), 126 | std::back_inserter(diff1to2)); 127 | std::set_difference(tokens2.cbegin(), tokens2.cend(), 128 | tokens1.cbegin(), tokens1.cend(), 129 | std::back_inserter(diff2to1)); 130 | 131 | std::sort(intersection.begin(), intersection.end()); 132 | std::sort(diff1to2.begin(), diff1to2.end()); 133 | std::sort(diff2to1.begin(), diff2to1.end()); 134 | 135 | auto sorted_sect = utils::join(intersection), 136 | sorted_1to2 = utils::join(diff1to2), 137 | sorted_2to1 = utils::join(diff2to1); 138 | 139 | auto combined_1to2 = sorted_sect + " " + sorted_1to2, 140 | combined_2to1 = sorted_sect + " " + sorted_2to1; 141 | 142 | sorted_sect = utils::trim(sorted_sect); 143 | combined_1to2 = utils::trim(combined_1to2); 144 | combined_2to1 = utils::trim(combined_2to1); 145 | 146 | auto ratio_func = partial ? 
partial_ratio : ratio; 147 | auto pairwise = vector{ 148 | ratio_func(sorted_sect, combined_1to2, full_process), 149 | ratio_func(sorted_sect, combined_2to1, full_process), 150 | ratio_func(combined_1to2, combined_2to1, full_process) 151 | }; 152 | 153 | return *std::max_element(pairwise.cbegin(), pairwise.cend()); 154 | } 155 | 156 | unsigned int token_set_ratio(const string &s1, const string &s2, const bool full_process) 157 | { 158 | return token_set_ratio(s1, s2, false, full_process); 159 | } 160 | 161 | unsigned int partial_token_set_ratio(const string &s1, const string &s2, const bool full_process) 162 | { 163 | return token_set_ratio(s1, s2, true, full_process); 164 | } 165 | 166 | unsigned int quick_ratio(const string &s1, const string &s2, const bool full_process) 167 | { 168 | string p1 = full_process ? utils::full_process(s1) : s1; 169 | string p2 = full_process ? utils::full_process(s2) : s2; 170 | 171 | if (p1.length() == 0 || p2.length() == 0) 172 | return 0; 173 | 174 | return ratio(p1, p2); 175 | } 176 | 177 | unsigned int weighted_ratio(const string &s1, const string &s2, const bool full_process) 178 | { 179 | string p1 = full_process ? utils::full_process(s1) : s1; 180 | string p2 = full_process ? utils::full_process(s2) : s2; 181 | 182 | if (p1.length() == 0 || p2.length() == 0) 183 | return 0; 184 | 185 | bool try_partial = true; 186 | double unbase_scale = 0.95; 187 | double partial_scale = 0.90; 188 | 189 | auto base = ratio(p1, p2); 190 | double len_ratio = static_cast(utils::max(p1.length(), p2.length())) / 191 | static_cast(utils::min(p1.length(), p2.length())); 192 | 193 | /* If strings are similar length, don't use partials. */ 194 | if (len_ratio < 1.5) 195 | try_partial = false; 196 | 197 | /* If one string is much much shorter than the other. 
*/ 198 | if (len_ratio > 8) 199 | partial_scale = 0.60; 200 | 201 | if (try_partial) { 202 | double partial = partial_ratio(p1, p2) * partial_scale; 203 | double ptsor = token_sort_partial_ratio(p1, p2) * unbase_scale * partial_scale; 204 | double ptser = partial_token_set_ratio(p1, p2) * unbase_scale * partial_scale; 205 | 206 | return utils::intr(utils::max(base, partial, ptsor, ptser)); 207 | } else { 208 | double tsor = token_sort_ratio(p1, p2, false) * unbase_scale; 209 | double tser = token_set_ratio(p1, p2, false) * unbase_scale; 210 | 211 | return utils::intr(utils::max(base, tsor, tser)); 212 | } 213 | } 214 | 215 | } // ns fuzz 216 | -------------------------------------------------------------------------------- /src/levenshtein.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file has been altered to better fit fuzzywuzzy. 3 | * To se all changes done, please diff this file with 4 | * 5 | * 6 | * Summary: 7 | * - stripped all python-related code and data types; 8 | * - fixed some spelling errors. 9 | */ 10 | 11 | /* 12 | * Levenshtein.c 13 | * @(#) $Id: Levenshtein.c,v 1.41 2005/01/13 20:05:36 yeti Exp $ 14 | * Python extension computing Levenshtein distances, string similarities, 15 | * median strings and other goodies. 16 | * 17 | * Copyright (C) 2002-2003 David Necas (Yeti) . 18 | * 19 | * The Taus113 random generator: 20 | * Copyright (C) 2002 Atakan Gurkan 21 | * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough 22 | * (see below for more) 23 | * 24 | * This program is free software; you can redistribute it and/or modify it 25 | * under the terms of the GNU General Public License as published by the Free 26 | * Software Foundation; either version 2 of the License, or (at your option) 27 | * any later version. 
*
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 **/

/**
 * TODO:
 *
 * - Implement weighted string averaging, see:
 *   H. Bunke et al.: On the Weighted Mean of a Pair of Strings,
 *   Pattern Analysis and Applications 2002, 5(1): 23-30.
 *   X. Jiang et al.: Dynamic Computations of Generalized Median Strings,
 *   Pattern Analysis and Applications 2002, ???.
 *   The latter also contains an interesting median-search algorithm.
 *
 * - Deal with stray symbols in greedy median() and median_improve().
 *   There are two possibilities:
 *   (i) Remember which strings contain which symbols. This allows certain
 *   small optimizations when processing them.
 *   (ii) Use some overall heuristics to find symbols which aren't worth
trying. This is very appealing, but hard to do properly
 *   (requires some inequality strong enough to allow practical exclusion
 *   of certain symbols -- at certain positions)
 *
 * - Editops should be an object that only *looks* like a list (which means
 *   it is a list in duck typing) to avoid never-ending conversions from
 *   Python lists to LevEditOp arrays and back
 *
 * - Optimize munkers_blackman(), it's pretty dumb (no memory of visited
 *   columns/rows)
 *
 * - Make it really usable as a C library (needs some wrappers, headers, ...,
 *   and maybe even documentation ;-)
 *
 * - Add interface to various interesting auxiliary results, namely
 *   set and sequence distance (only ratio is exported), the map from
 *   munkers_blackman() itself, ...
 *
 * - Generalizations:
 *   - character weight matrix/function
 *   - arbitrary edit operation costs, decomposable edit operations
 *
 * - Create a test suite
 *
 * - Add more interesting algorithms ;-)
 *
 * Postponed TODO (investigated, and a big `but' was found):
 *
 * - A linear approximate set median algorithm:
 *   P. Indyk: Sublinear time algorithms for metric space problems,
 *   STOC 1999, http://citeseer.nj.nec.com/indyk00sublinear.html.
 *   BUT: The algorithm seems to be advantageous only in the case of very
 *   large sets -- if my estimates are correct (the article itself is quite
 *   `asymptotic'), say 10^5 at least. On smaller sets either one would get
 *   only an extremely rough median estimate, or the number of distance
 *   computations would be in fact higher than in the dumb O(n^2) algorithm.
 *
 * - Improve setmedian() speed with the triangle inequality, see:
 *   Juan, A., E.
Vidal: An Algorithm for Fast Median Search, 93 | * 1997, http://citeseer.nj.nec.com/article/juan97algorithm.html 94 | * BUT: It doesn't seem to help much in spaces of high dimension (see the 95 | * discussion and graphs in the article itself), a few percents at most, 96 | * and strings behave like a space with a very high dimension (locally), so 97 | * who knows, it probably wouldn't help much. 98 | * 99 | **/ 100 | 101 | #ifndef _GNU_SOURCE 102 | # define _GNU_SOURCE 103 | #endif 104 | 105 | #include 106 | #include 107 | /* for debugging */ 108 | #include 109 | 110 | #include 111 | #include "levenshtein.h" 112 | 113 | /** 114 | * lev_edit_distance: 115 | * @len1: The length of @string1. 116 | * @string1: A sequence of bytes of length @len1, may contain NUL characters. 117 | * @len2: The length of @string2. 118 | * @string2: A sequence of bytes of length @len2, may contain NUL characters. 119 | * @xcost: If nonzero, the replace operation has weight 2, otherwise all 120 | * edit operations have equal weights of 1. 121 | * 122 | * Computes Levenshtein edit distance of two strings. 123 | * 124 | * Returns: The edit distance. 125 | **/ 126 | size_t 127 | lev_edit_distance(size_t len1, const lev_byte *string1, 128 | size_t len2, const lev_byte *string2, 129 | int xcost) { 130 | size_t i; 131 | size_t *row; /* we only need to keep one row of costs */ 132 | size_t *end; 133 | size_t half; 134 | 135 | /* strip common prefix */ 136 | while (len1 > 0 && len2 > 0 && *string1 == *string2) { 137 | len1--; 138 | len2--; 139 | string1++; 140 | string2++; 141 | } 142 | 143 | /* strip common suffix */ 144 | while (len1 > 0 && len2 > 0 && string1[len1 - 1] == string2[len2 - 1]) { 145 | len1--; 146 | len2--; 147 | } 148 | 149 | /* catch trivial cases */ 150 | if (len1 == 0) 151 | return len2; 152 | if (len2 == 0) 153 | return len1; 154 | 155 | /* make the inner cycle (i.e. 
string2) the longer one */ 156 | if (len1 > len2) { 157 | size_t nx = len1; 158 | const lev_byte *sx = string1; 159 | len1 = len2; 160 | len2 = nx; 161 | string1 = string2; 162 | string2 = sx; 163 | } 164 | /* check len1 == 1 separately */ 165 | if (len1 == 1) { 166 | if (xcost) 167 | return len2 + 1 - 2 * (memchr(string2, *string1, len2) != NULL); 168 | else 169 | return len2 - (memchr(string2, *string1, len2) != NULL); 170 | } 171 | len1++; 172 | len2++; 173 | half = len1 >> 1; 174 | 175 | /* initialize first row */ 176 | row = (size_t *) malloc(len2 * sizeof(size_t)); 177 | if (!row) 178 | return (size_t) (-1); 179 | end = row + len2 - 1; 180 | for (i = 0; i < len2 - (xcost ? 0 : half); i++) 181 | row[i] = i; 182 | 183 | /* go through the matrix and compute the costs. yes, this is an extremely 184 | * obfuscated version, but also extremely memory-conservative and relatively 185 | * fast. */ 186 | if (xcost) { 187 | for (i = 1; i < len1; i++) { 188 | size_t *p = row + 1; 189 | const lev_byte char1 = string1[i - 1]; 190 | const lev_byte *char2p = string2; 191 | size_t D = i; 192 | size_t x = i; 193 | while (p <= end) { 194 | if (char1 == *(char2p++)) 195 | x = --D; 196 | else 197 | x++; 198 | D = *p; 199 | D++; 200 | if (x > D) 201 | x = D; 202 | *(p++) = x; 203 | } 204 | } 205 | } else { 206 | /* in this case we don't have to scan two corner triangles (of size len1/2) 207 | * in the matrix because no best path can go thought them. 
note this 208 | * breaks when len1 == len2 == 2 so the memchr() special case above is 209 | * necessary */ 210 | row[0] = len1 - half - 1; 211 | for (i = 1; i < len1; i++) { 212 | size_t *p; 213 | const lev_byte char1 = string1[i - 1]; 214 | const lev_byte *char2p; 215 | size_t D, x; 216 | /* skip the upper triangle */ 217 | if (i >= len1 - half) { 218 | size_t offset = i - (len1 - half); 219 | size_t c3; 220 | 221 | char2p = string2 + offset; 222 | p = row + offset; 223 | c3 = *(p++) + (char1 != *(char2p++)); 224 | x = *p; 225 | x++; 226 | D = x; 227 | if (x > c3) 228 | x = c3; 229 | *(p++) = x; 230 | } else { 231 | p = row + 1; 232 | char2p = string2; 233 | D = x = i; 234 | } 235 | /* skip the lower triangle */ 236 | if (i <= half + 1) 237 | end = row + len2 + i - half - 2; 238 | /* main */ 239 | while (p <= end) { 240 | size_t c3 = --D + (char1 != *(char2p++)); 241 | x++; 242 | if (x > c3) 243 | x = c3; 244 | D = *p; 245 | D++; 246 | if (x > D) 247 | x = D; 248 | *(p++) = x; 249 | } 250 | /* lower triangle sentinel */ 251 | if (i <= half) { 252 | size_t c3 = --D + (char1 != *char2p); 253 | x++; 254 | if (x > c3) 255 | x = c3; 256 | *p = x; 257 | } 258 | } 259 | } 260 | 261 | i = *end; 262 | free(row); 263 | return i; 264 | } 265 | 266 | /** 267 | * editops_from_cost_matrix: 268 | * @len1: The length of @string1. 269 | * @string1: A string of length @len1, may contain NUL characters. 270 | * @o1: The offset where the matrix starts from the start of @string1. 271 | * @len2: The length of @string2. 272 | * @string2: A string of length @len2, may contain NUL characters. 273 | * @o2: The offset where the matrix starts from the start of @string2. 274 | * @matrix: The cost matrix. 275 | * @n: Where the number of edit operations should be stored. 276 | * 277 | * Reconstructs the optimal edit sequence from the cost matrix @matrix. 278 | * 279 | * The matrix is freed. 
*
 * As used here, @len1 and @len2 are the matrix dimensions: the code reads
 * the final cost from matrix[len1*len2 - 1] and starts backtracking at
 * row len1 - 1, column len2 - 1.
 *
 * Returns: The optimal edit sequence, as a newly allocated array of
 *          elementary edit operations, its length is stored in @n.
 *          NULL is returned both when no operations are needed (@n is 0)
 *          and when the array cannot be allocated (@n is (size_t)(-1)).
 **/
static LevEditOp*
editops_from_cost_matrix(size_t len1, const lev_byte *string1, size_t off1,
                         size_t len2, const lev_byte *string2, size_t off2,
                         size_t *matrix, size_t *n)
{
  size_t *p;
  size_t i, j, pos;
  LevEditOp *ops;
  /* direction of the previous step: -1 after an insert, 1 after a delete,
   * 0 after a diagonal move (or at the start) */
  int dir = 0;

  /* the bottom-right cell holds the total cost, i.e. the number of ops */
  pos = *n = matrix[len1*len2 - 1];
  if (!*n) {
    free(matrix);
    return NULL;
  }
  ops = (LevEditOp*)malloc((*n)*sizeof(LevEditOp));
  if (!ops) {
    free(matrix);
    *n = (size_t)(-1);
    return NULL;
  }
  /* walk back from the bottom-right corner, filling ops from the end */
  i = len1 - 1;
  j = len2 - 1;
  p = matrix + len1*len2 - 1;
  while (i || j) {
    /* prefer continuing in the same direction */
    if (dir < 0 && j && *p == *(p - 1) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_INSERT;
      ops[pos].spos = i + off1;
      ops[pos].dpos = --j + off2;
      p--;
      continue;
    }
    if (dir > 0 && i && *p == *(p - len2) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_DELETE;
      ops[pos].spos = --i + off1;
      ops[pos].dpos = j + off2;
      p -= len2;
      continue;
    }
    /* diagonal step over equal characters: a keep, which is not recorded */
    if (i && j && *p == *(p - len2 - 1)
        && string1[i - 1] == string2[j - 1]) {
      /* don't be stupid like difflib, don't store LEV_EDIT_KEEP */
      i--;
      j--;
      p -= len2 + 1;
      dir = 0;
      continue;
    }
    /* diagonal step that costs one: a replace */
    if (i && j && *p == *(p - len2 - 1) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_REPLACE;
      ops[pos].spos = --i + off1;
      ops[pos].dpos = --j + off2;
      p -= len2 + 1;
      dir = 0;
      continue;
    }
    /* we can't turn directly from -1 to 1, in this case it would be better
     * to go diagonally, but check it (dir == 0) */
    if (dir == 0 && j && *p == *(p - 1) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_INSERT;
      ops[pos].spos = i + off1;
      ops[pos].dpos = --j + off2;
      p--;
      dir = -1;
      continue;
    }
    if (dir == 0 && i && *p == *(p - len2) + 1) {
      pos--;
      ops[pos].type = LEV_EDIT_DELETE;
      ops[pos].spos = --i + off1;
      ops[pos].dpos = j + off2;
      p -= len2;
      dir = 1;
      continue;
    }
    /* coredump right now, later might be too late ;-) */
    assert("lost in the cost matrix" == NULL);
  }
  free(matrix);

  return ops;
}


/**
 * lev_editops_find:
 * @len1: The length of @string1.
 * @string1: A string of length @len1, may contain NUL characters.
 * @len2: The length of @string2.
 * @string2: A string of length @len2, may contain NUL characters.
 * @n: Where the number of edit operations should be stored.
 *
 * Find an optimal edit sequence from @string1 to @string2.
 *
 * When there's more than one optimal sequence, one is arbitrarily (though
 * deterministically) chosen.
 *
 * Returns: The optimal edit sequence, as a newly allocated array of
 *          elementary edit operations, its length is stored in @n.
 *          It is normalized, i.e., keep operations are not included.
389 | **/ 390 | LevEditOp* 391 | lev_editops_find(size_t len1, const lev_byte *string1, 392 | size_t len2, const lev_byte *string2, 393 | size_t *n) 394 | { 395 | size_t len1o, len2o; 396 | size_t i; 397 | size_t *matrix; /* cost matrix */ 398 | 399 | /* strip common prefix */ 400 | len1o = 0; 401 | while (len1 > 0 && len2 > 0 && *string1 == *string2) { 402 | len1--; 403 | len2--; 404 | string1++; 405 | string2++; 406 | len1o++; 407 | } 408 | len2o = len1o; 409 | 410 | /* strip common suffix */ 411 | while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { 412 | len1--; 413 | len2--; 414 | } 415 | len1++; 416 | len2++; 417 | 418 | /* initalize first row and column */ 419 | matrix = (size_t*)malloc(len1*len2*sizeof(size_t)); 420 | if (!matrix) { 421 | *n = (size_t)(-1); 422 | return NULL; 423 | } 424 | for (i = 0; i < len2; i++) 425 | matrix[i] = i; 426 | for (i = 1; i < len1; i++) 427 | matrix[len2*i] = i; 428 | 429 | /* find the costs and fill the matrix */ 430 | for (i = 1; i < len1; i++) { 431 | size_t *prev = matrix + (i - 1)*len2; 432 | size_t *p = matrix + i*len2; 433 | size_t *end = p + len2 - 1; 434 | const lev_byte char1 = string1[i - 1]; 435 | const lev_byte *char2p = string2; 436 | size_t x = i; 437 | p++; 438 | while (p <= end) { 439 | size_t c3 = *(prev++) + (char1 != *(char2p++)); 440 | x++; 441 | if (x > c3) 442 | x = c3; 443 | c3 = *prev + 1; 444 | if (x > c3) 445 | x = c3; 446 | *(p++) = x; 447 | } 448 | } 449 | 450 | /* find the way back */ 451 | return editops_from_cost_matrix(len1, string1, len1o, 452 | len2, string2, len2o, 453 | matrix, n); 454 | } 455 | 456 | /** 457 | * lev_u_edit_distance: 458 | * @len1: The length of @string1. 459 | * @string1: A sequence of Unicode characters of length @len1, may contain NUL 460 | * characters. 461 | * @len2: The length of @string2. 462 | * @string2: A sequence of Unicode characters of length @len2, may contain NUL 463 | * characters. 
* @xcost: If nonzero, the replace operation has weight 2, otherwise all
 *         edit operations have equal weights of 1.
 *
 * Computes Levenshtein edit distance of two Unicode strings.
 *
 * Returns: The edit distance, or (size_t)(-1) when the working row cannot
 *          be allocated.
 **/
size_t
lev_u_edit_distance(size_t len1, const lev_wchar *string1,
                    size_t len2, const lev_wchar *string2,
                    int xcost)
{
  size_t i;
  size_t *row;  /* we only need to keep one row of costs */
  size_t *end;
  size_t half;

  /* strip common prefix */
  while (len1 > 0 && len2 > 0 && *string1 == *string2) {
    len1--;
    len2--;
    string1++;
    string2++;
  }

  /* strip common suffix */
  while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) {
    len1--;
    len2--;
  }

  /* catch trivial cases */
  if (len1 == 0)
    return len2;
  if (len2 == 0)
    return len1;

  /* make the inner cycle (i.e. string2) the longer one */
  if (len1 > len2) {
    size_t nx = len1;
    const lev_wchar *sx = string1;
    len1 = len2;
    len2 = nx;
    string1 = string2;
    string2 = sx;
  }
  /* check len1 == 1 separately: a linear scan for the single character */
  if (len1 == 1) {
    lev_wchar z = *string1;
    const lev_wchar *p = string2;
    for (i = len2; i; i--) {
      if (*(p++) == z)
        return len2 - 1;
    }
    /* not found: one replace (cost 2 with xcost) plus len2 - 1 inserts */
    return len2 + (xcost != 0);
  }
  /* from here on len1/len2 are the matrix dimensions (length + 1) */
  len1++;
  len2++;
  half = len1 >> 1;

  /* initialize first row */
  row = (size_t*)malloc(len2*sizeof(size_t));
  if (!row)
    return (size_t)(-1);
  end = row + len2 - 1;
  for (i = 0; i < len2 - (xcost ? 0 : half); i++)
    row[i] = i;

  /* go through the matrix and compute the costs.  yes, this is an extremely
   * obfuscated version, but also extremely memory-conservative and relatively
   * fast. */
  if (xcost) {
    for (i = 1; i < len1; i++) {
      size_t *p = row + 1;
      const lev_wchar char1 = string1[i - 1];
      const lev_wchar *char2p = string2;
      size_t D = i - 1;
      size_t x = i;
      while (p <= end) {
        if (char1 == *(char2p++))
          x = D;
        else
          x++;
        D = *p;
        if (x > D + 1)
          x = D + 1;
        *(p++) = x;
      }
    }
  }
  else {
    /* in this case we don't have to scan two corner triangles (of size len1/2)
     * in the matrix because no best path can go through them.  note this
     * breaks when len1 == len2 == 2 so the len1 == 1 special case above is
     * necessary */
    row[0] = len1 - half - 1;
    for (i = 1; i < len1; i++) {
      size_t *p;
      const lev_wchar char1 = string1[i - 1];
      const lev_wchar *char2p;
      size_t D, x;
      /* skip the upper triangle */
      if (i >= len1 - half) {
        size_t offset = i - (len1 - half);
        size_t c3;

        char2p = string2 + offset;
        p = row + offset;
        c3 = *(p++) + (char1 != *(char2p++));
        x = *p;
        x++;
        D = x;
        if (x > c3)
          x = c3;
        *(p++) = x;
      }
      else {
        p = row + 1;
        char2p = string2;
        D = x = i;
      }
      /* skip the lower triangle */
      if (i <= half + 1)
        end = row + len2 + i - half - 2;
      /* main */
      while (p <= end) {
        size_t c3 = --D + (char1 != *(char2p++));
        x++;
        if (x > c3)
          x = c3;
        D = *p;
        D++;
        if (x > D)
          x = D;
        *(p++) = x;
      }
      /* lower triangle sentinel */
      if (i <= half) {
        size_t c3 = --D + (char1 != *char2p);
        x++;
        if (x > c3)
          x = c3;
        *p = x;
      }
    }
  }

  /* the distance is the last cell written; copy it out before freeing */
  i = *end;
  free(row);
  return i;
}

/**
 * lev_editops_to_opcodes:
 * @n: The size of @ops.
 * @ops: An array of elementary edit operations.
* @nb: Where the number of difflib block operation codes should be stored.
 * @len1: The length of the source string.
 * @len2: The length of the destination string.
 *
 * Converts elementary edit operations to difflib block operation codes.
 *
 * Note the string lengths are necessary since difflib doesn't allow omitting
 * keep operations.
 *
 * The conversion runs in two passes over @ops with the same grouping logic:
 * the first only counts the blocks, the second fills the allocated array.
 *
 * Returns: The converted block operation codes, as a newly allocated array;
 *          its length is stored in @nb.  On allocation failure NULL is
 *          returned and @nb is set to (size_t)(-1).
 **/
LevOpCode*
lev_editops_to_opcodes(size_t n, const LevEditOp *ops, size_t *nb,
                       size_t len1, size_t len2)
{
  size_t nbl, i, spos, dpos;
  const LevEditOp *o;
  LevOpCode *bops, *b;
  LevEditType type;

  /* compute the number of blocks */
  nbl = 0;
  o = ops;
  spos = dpos = 0;
  type = LEV_EDIT_KEEP;
  for (i = n; i; ) {
    /* simply pretend there are no keep blocks */
    while (o->type == LEV_EDIT_KEEP && --i)
      o++;
    if (!i)
      break;
    /* a gap before the next operation is an implicit KEEP block */
    if (spos < o->spos || dpos < o->dpos) {
      nbl++;
      spos = o->spos;
      dpos = o->dpos;
    }
    nbl++;
    type = o->type;
    /* consume the run of consecutive operations of the same type.
     * NOTE(review): the default case consumes nothing, so an operation with
     * any other type value would keep this loop from advancing -- confirm
     * callers only pass validated operations. */
    switch (type) {
      case LEV_EDIT_REPLACE:
      do {
        spos++;
        dpos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

      case LEV_EDIT_DELETE:
      do {
        spos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

      case LEV_EDIT_INSERT:
      do {
        dpos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

      default:
      break;
    }
  }
  /* whatever remains after the last operation is a trailing KEEP block */
  if (spos < len1 || dpos < len2)
    nbl++;

  /* convert */
  /* NOTE(review): nbl is 0 when @n, @len1 and @len2 are all 0; malloc(0) may
   * return NULL, which is reported as an allocation failure below. */
  b = bops = (LevOpCode*)malloc(nbl*sizeof(LevOpCode));
  if (!bops) {
    *nb = (size_t)(-1);
    return NULL;
  }
  o = ops;
  spos = dpos = 0;
  type = LEV_EDIT_KEEP;
  for (i = n; i; ) {
    /* simply pretend there are no keep blocks */
    while (o->type == LEV_EDIT_KEEP && --i)
      o++;
    if (!i)
      break;
    b->sbeg = spos;
    b->dbeg = dpos;
    /* emit the implicit KEEP block that precedes this operation */
    if (spos < o->spos || dpos < o->dpos) {
      b->type = LEV_EDIT_KEEP;
      spos = b->send = o->spos;
      dpos = b->dend = o->dpos;
      b++;
      b->sbeg = spos;
      b->dbeg = dpos;
    }
    type = o->type;
    switch (type) {
      case LEV_EDIT_REPLACE:
      do {
        spos++;
        dpos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

      case LEV_EDIT_DELETE:
      do {
        spos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

      case LEV_EDIT_INSERT:
      do {
        dpos++;
        i--;
        o++;
      } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
      break;

      default:
      break;
    }
    b->type = type;
    b->send = spos;
    b->dend = dpos;
    b++;
  }
  /* trailing KEEP block: both remainders must have the same length */
  if (spos < len1 || dpos < len2) {
    assert(len1 - spos == len2 - dpos);
    b->type = LEV_EDIT_KEEP;
    b->sbeg = spos;
    b->dbeg = dpos;
    b->send = len1;
    b->dend = len2;
    b++;
  }
  /* the fill pass must produce exactly as many blocks as were counted */
  assert((size_t)(b - bops) == nbl);

  *nb = nbl;
  return bops;
}

/**
 * lev_opcodes_matching_blocks:
 * @len1: The length of the source string.
 * @len2: The length of the destination string.
 * @nb: The size of @bops.
 * @bops: An array of difflib block edit operation codes.
 * @nmblocks: Where the number of matching block should be stored.
 *
 * Computes the matching block corresponding to an optimal edit @bops.
 *
 * Returns: The matching blocks as a newly allocated array, its length is
 *          stored in @nmblocks.
779 | **/ 780 | LevMatchingBlock* 781 | lev_opcodes_matching_blocks(size_t len1, 782 | __attribute__((unused)) size_t len2, 783 | size_t nb, 784 | const LevOpCode *bops, 785 | size_t *nmblocks) 786 | { 787 | size_t nmb, i; 788 | const LevOpCode *b; 789 | LevMatchingBlock *mblocks, *mb; 790 | 791 | /* compute the number of matching blocks */ 792 | nmb = 0; 793 | b = bops; 794 | for (i = nb; i; i--, b++) { 795 | if (b->type == LEV_EDIT_KEEP) { 796 | nmb++; 797 | /* adjacent KEEP blocks -- we never produce it, but... */ 798 | while (i && b->type == LEV_EDIT_KEEP) { 799 | i--; 800 | b++; 801 | } 802 | if (!i) 803 | break; 804 | } 805 | } 806 | 807 | /* convert */ 808 | mb = mblocks = (LevMatchingBlock*)malloc(nmb*sizeof(LevOpCode)); 809 | if (!mblocks) { 810 | *nmblocks = (size_t)(-1); 811 | return NULL; 812 | } 813 | b = bops; 814 | for (i = nb; i; i--, b++) { 815 | if (b->type == LEV_EDIT_KEEP) { 816 | mb->spos = b->sbeg; 817 | mb->dpos = b->dbeg; 818 | /* adjacent KEEP blocks -- we never produce it, but... */ 819 | while (i && b->type == LEV_EDIT_KEEP) { 820 | i--; 821 | b++; 822 | } 823 | if (!i) { 824 | mb->len = len1 - mb->spos; 825 | mb++; 826 | break; 827 | } 828 | mb->len = b->sbeg - mb->spos; 829 | mb++; 830 | } 831 | } 832 | assert((size_t)(mb - mblocks) == nmb); 833 | 834 | *nmblocks = nmb; 835 | return mblocks; 836 | } 837 | 838 | /** 839 | * lev_editops_matching_blocks: 840 | * @len1: The length of the source string. 841 | * @len2: The length of the destination string. 842 | * @n: The size of @ops. 843 | * @ops: An array of elementary edit operations. 844 | * @nmblocks: Where the number of matching block should be stored. 845 | * 846 | * Computes the matching block corresponding to an optimal edit @ops. 847 | * 848 | * Returns: The matching blocks as a newly allocated array, it length is 849 | * stored in @nmblocks. 
850 | **/ 851 | LevMatchingBlock* 852 | lev_editops_matching_blocks(size_t len1, 853 | size_t len2, 854 | size_t n, 855 | const LevEditOp *ops, 856 | size_t *nmblocks) 857 | { 858 | size_t nmb, i, spos, dpos; 859 | LevEditType type; 860 | const LevEditOp *o; 861 | LevMatchingBlock *mblocks, *mb; 862 | 863 | /* compute the number of matching blocks */ 864 | nmb = 0; 865 | o = ops; 866 | spos = dpos = 0; 867 | type = LEV_EDIT_KEEP; 868 | for (i = n; i; ) { 869 | /* simply pretend there are no keep blocks */ 870 | while (o->type == LEV_EDIT_KEEP && --i) 871 | o++; 872 | if (!i) 873 | break; 874 | if (spos < o->spos || dpos < o->dpos) { 875 | nmb++; 876 | spos = o->spos; 877 | dpos = o->dpos; 878 | } 879 | type = o->type; 880 | switch (type) { 881 | case LEV_EDIT_REPLACE: 882 | do { 883 | spos++; 884 | dpos++; 885 | i--; 886 | o++; 887 | } while (i && o->type == type && spos == o->spos && dpos == o->dpos); 888 | break; 889 | 890 | case LEV_EDIT_DELETE: 891 | do { 892 | spos++; 893 | i--; 894 | o++; 895 | } while (i && o->type == type && spos == o->spos && dpos == o->dpos); 896 | break; 897 | 898 | case LEV_EDIT_INSERT: 899 | do { 900 | dpos++; 901 | i--; 902 | o++; 903 | } while (i && o->type == type && spos == o->spos && dpos == o->dpos); 904 | break; 905 | 906 | default: 907 | break; 908 | } 909 | } 910 | if (spos < len1 || dpos < len2) 911 | nmb++; 912 | 913 | /* fill the info */ 914 | mb = mblocks = (LevMatchingBlock*)malloc(nmb*sizeof(LevOpCode)); 915 | if (!mblocks) { 916 | *nmblocks = (size_t)(-1); 917 | return NULL; 918 | } 919 | o = ops; 920 | spos = dpos = 0; 921 | type = LEV_EDIT_KEEP; 922 | for (i = n; i; ) { 923 | /* simply pretend there are no keep blocks */ 924 | while (o->type == LEV_EDIT_KEEP && --i) 925 | o++; 926 | if (!i) 927 | break; 928 | if (spos < o->spos || dpos < o->dpos) { 929 | mb->spos = spos; 930 | mb->dpos = dpos; 931 | mb->len = o->spos - spos; 932 | spos = o->spos; 933 | dpos = o->dpos; 934 | mb++; 935 | } 936 | type = o->type; 937 | 
switch (type) { 938 | case LEV_EDIT_REPLACE: 939 | do { 940 | spos++; 941 | dpos++; 942 | i--; 943 | o++; 944 | } while (i && o->type == type && spos == o->spos && dpos == o->dpos); 945 | break; 946 | 947 | case LEV_EDIT_DELETE: 948 | do { 949 | spos++; 950 | i--; 951 | o++; 952 | } while (i && o->type == type && spos == o->spos && dpos == o->dpos); 953 | break; 954 | 955 | case LEV_EDIT_INSERT: 956 | do { 957 | dpos++; 958 | i--; 959 | o++; 960 | } while (i && o->type == type && spos == o->spos && dpos == o->dpos); 961 | break; 962 | 963 | default: 964 | break; 965 | } 966 | } 967 | if (spos < len1 || dpos < len2) { 968 | assert(len1 - spos == len2 - dpos); 969 | mb->spos = spos; 970 | mb->dpos = dpos; 971 | mb->len = len1 - spos; 972 | mb++; 973 | } 974 | assert((size_t)(mb - mblocks) == nmb); 975 | 976 | *nmblocks = nmb; 977 | return mblocks; 978 | } 979 | -------------------------------------------------------------------------------- /src/process.cpp: -------------------------------------------------------------------------------- 1 | #include "process.hpp" 2 | 3 | #include 4 | 5 | namespace fuzz 6 | { 7 | 8 | using std::set; 9 | 10 | vector> extractWithoutOrder(const string& query, const vector& choices 11 | , function processor, function scorer 12 | , int score_cutoff) 13 | { 14 | string processed_query = processor(query); 15 | 16 | /* TODO: Avoid running full_process twice. */ 17 | 18 | auto score_func = [&scorer] (const string& s1, const string& s2) { return scorer(s1, s2, false); }; 19 | auto pre_processor = utils::full_process; 20 | 21 | /* NOTE: Why? But the Python version does the following. 
*/ 22 | /* processed_query = pre_processor(processed_query) */ 23 | 24 | vector> results; 25 | for(const auto& choice : choices) { 26 | string processed = pre_processor(processor(choice)); 27 | int score = score_func(processed_query, processed); 28 | if(score >= score_cutoff) 29 | results.emplace_back(choice, score); 30 | } 31 | 32 | return results; 33 | } 34 | 35 | vector> extractBests(const string& query, const vector& choices 36 | , function processor, function scorer 37 | , int score_cutoff, intmax_t limit) 38 | { 39 | auto sl = extractWithoutOrder(query, choices, processor, scorer, score_cutoff); 40 | if(limit == -1) 41 | return sl; 42 | 43 | std::partial_sort(sl.begin(), sl.begin()+limit, sl.end(), [](const auto& a, const auto& b){ return a.second > b.second; }); 44 | 45 | /* If limit < 0, it means to return everything. Since vector::size() is always */ 46 | /* larger than -1, we can combine the check. */ 47 | if(sl.size() > limit) { 48 | sl.resize((size_t)limit); 49 | sl.shrink_to_fit(); 50 | } 51 | return sl; 52 | } 53 | 54 | vector> extract(const string& query, const vector& choices 55 | , function processor, function scorer 56 | , intmax_t limit) 57 | { 58 | return extractBests(query, choices, processor, scorer, 0, limit); 59 | } 60 | 61 | vector> extractOne(const string& query, const vector& choices 62 | , function processor, function scorer 63 | , int score_cutoff) 64 | { 65 | return extractBests(query, choices, processor, scorer, score_cutoff, 1); 66 | } 67 | 68 | vector dedupe(const vector& contains_dupes, int threshold, function scorer) 69 | { 70 | /* NOTE: This function is a translation of the python and it can be optimized a lot. The original algorithm is */ 71 | /* far from ideal. 
*/ 72 | vector extractor; 73 | 74 | for(const auto& str : contains_dupes) { 75 | auto matches = extract(str, contains_dupes, utils::full_process, scorer, -1); 76 | 77 | vector filtered; 78 | for(size_t i=0;i threshold) 82 | filtered.push_back(value); 83 | } 84 | 85 | /* if there is only 1 item in *filtered*, no duplicates were found so append to *extracted* */ 86 | if(filtered.size() == 1) 87 | extractor.push_back(*filtered.begin()); 88 | else if(filtered.size() != 0) { 89 | /* alpha sort */ 90 | std::stable_sort(filtered.begin(), filtered.end(), [](const auto& a, const auto& b){ return a[0] > b[0]; }); 91 | 92 | /* length sort */ 93 | std::stable_sort(filtered.begin(), filtered.end(), [](const auto& a, const auto& b){ return a.size() > b.size(); }); 94 | 95 | /* take first item as our 'canonical example' */ 96 | extractor.push_back(*filtered.begin()); 97 | } 98 | } 99 | 100 | // uniquify *extractor* list 101 | set keys; 102 | for(auto str : extractor) 103 | keys.insert(str); 104 | 105 | /* check that extractor differs from contain_dupes (e.g. 
duplicates were found) */ 106 | /* if not, then return the original list */ 107 | if(keys.size() == contains_dupes.size()) 108 | return contains_dupes; 109 | else 110 | return vector(keys.begin(), keys.end()); 111 | } 112 | 113 | } // ns fuzz 114 | 115 | -------------------------------------------------------------------------------- /src/string_matcher.cpp: -------------------------------------------------------------------------------- 1 | #include "string_matcher.hpp" 2 | #include "wrapper.hpp" 3 | 4 | #include 5 | #include 6 | 7 | namespace fuzz { 8 | 9 | void string_matcher::set_strings(const string s1, const string s2) 10 | { 11 | s1_ = s1; 12 | s2_ = s2; 13 | 14 | reset_cache(); 15 | } 16 | 17 | void string_matcher::set_string1(const string s1) 18 | { 19 | s1_ = s1; 20 | reset_cache(); 21 | } 22 | 23 | void string_matcher::set_string2(const string s2) 24 | { 25 | s2_ = s2; 26 | reset_cache(); 27 | } 28 | 29 | void string_matcher::reset_cache() 30 | { 31 | ratio_ = distance_ = 0; 32 | 33 | matching_blocks_.clear(); 34 | op_codes_.clear(); 35 | edit_ops_.clear(); 36 | } 37 | 38 | vector string_matcher::get_opcodes() 39 | { 40 | if (op_codes_.empty()) 41 | op_codes_ = wrapper::get_opcodes(s1_, s2_); 42 | 43 | return op_codes_; 44 | } 45 | 46 | vector string_matcher::get_editops() 47 | { 48 | if (edit_ops_.empty()) 49 | edit_ops_ = wrapper::get_editops(s1_, s2_); 50 | return edit_ops_; 51 | } 52 | 53 | vector string_matcher::get_matching_blocks() 54 | { 55 | if (matching_blocks_.empty()) { 56 | auto ops = get_opcodes(); 57 | matching_blocks_ = wrapper::get_matching_blocks(ops, s1_, s2_); 58 | } 59 | return matching_blocks_; 60 | } 61 | 62 | double string_matcher::ratio() 63 | { 64 | if (ratio_ == not_set) 65 | ratio_ = wrapper::ratio(s1_, s2_); 66 | return ratio_; 67 | } 68 | 69 | double string_matcher::real_quick_ratio() 70 | { 71 | size_t len1 = s1_.length(), len2 = s2_.length(); 72 | return 2.0 * static_cast(std::min(len1, len2)) / static_cast((len1 + len2)); 
73 | } 74 | 75 | } // ns fuzz 76 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.hpp" 2 | 3 | #include 4 | 5 | namespace fuzz { 6 | 7 | namespace utils { 8 | 9 | /* 10 | * Return a rounded percentage in the range [0,100]. 11 | */ 12 | unsigned int percent_round(double val) 13 | { 14 | return intr(100 * val); 15 | } 16 | 17 | /* 18 | * Return a correctly rounded integer. 19 | */ 20 | unsigned int intr(double val) 21 | { 22 | return static_cast(std::round(val)); 23 | } 24 | 25 | /* 26 | * Split a string into multiple strings when a character is met. 27 | * Returns all tokens in an array. 28 | */ 29 | vector split_string(const string &str, const char c) 30 | { 31 | vector tokens; 32 | string word; 33 | for (const auto &len : str) { 34 | if (len == c && word.size()) { 35 | tokens.push_back(word); 36 | word.clear(); 37 | } else if (len != c) { 38 | word += len; 39 | } 40 | } 41 | 42 | if(word.size()) { 43 | tokens.push_back(word); 44 | } 45 | 46 | return tokens; 47 | } 48 | 49 | /* 50 | * Removes leading and trailing whitespace characters from 51 | * the passed string. 52 | */ 53 | string& trim(string &str) 54 | { 55 | auto isspace = [](char ch) { 56 | /* NOTE: should we specify the locale? */ 57 | return !std::isspace(ch); 58 | }; 59 | 60 | /* Strip leading whitespace.. */ 61 | auto start = std::find_if(str.cbegin(), str.cend(), isspace); 62 | str.erase(str.cbegin(), start); 63 | 64 | /* .. and trailing. */ 65 | auto end = std::find_if(str.crbegin(), str.crend(), isspace); 66 | str.erase(end.base(), str.cend()); 67 | 68 | return str; 69 | } 70 | 71 | /* 72 | * Akin to Pythons join: concatenate a vector of strings 73 | * with intervening occurrences of sep. 74 | */ 75 | string join(const vector &v, const string &sep) 76 | { 77 | string retstr = ""; 78 | for (const auto &str : v) 79 | retstr += str + (str == v.back() ? 
"" : sep); 80 | 81 | return retstr; 82 | } 83 | 84 | /* 85 | * Process the string by 86 | * - replace non-alphanumeric characters with whitespace, 87 | * - trim whitespace, and 88 | * - forcing to lower case. 89 | */ 90 | string full_process(string str) 91 | { 92 | /* Replace non-alphanumeric characters with whitespace, */ 93 | std::replace_if(str.begin(), str.end(), [](char ch) { 94 | /* NOTE: same thing here: specify locale? */ 95 | return !std::isalnum(ch); 96 | }, ' '); 97 | 98 | /* trim whitespace, and */ 99 | str = utils::trim(str); 100 | 101 | /* force to lower case. */ 102 | std::transform(str.begin(), str.end(), str.begin(), ::tolower); 103 | 104 | return str; 105 | } 106 | 107 | /* 108 | * std::min, but for size_t. 109 | */ 110 | size_t min(size_t a, size_t b) 111 | { 112 | return a < b ? a : b; 113 | } 114 | 115 | } // ns utils 116 | 117 | } // ns fuzz 118 | -------------------------------------------------------------------------------- /src/wrapper.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "wrapper.hpp" 4 | 5 | namespace wrapper { 6 | 7 | double ratio(const string &str1, const string &str2) 8 | { 9 | size_t len1 = str1.length(), 10 | len2 = str2.length(); 11 | 12 | const lev_byte *lb1 = reinterpret_cast(str1.c_str()), 13 | *lb2 = reinterpret_cast(str2.c_str()); 14 | 15 | size_t lensum = len1 + len2; 16 | size_t edit_dist = lev_edit_distance(len1, lb1, len2, lb2, 1); 17 | 18 | return static_cast(lensum - edit_dist) / static_cast(lensum); 19 | } 20 | 21 | vector get_opcodes(string &s1, string &s2) 22 | { 23 | vector opcodes; 24 | size_t len1, len2, nb, n; 25 | const lev_byte *lb1, *lb2; 26 | LevEditOp *ops; 27 | LevOpCode *bops; 28 | 29 | len1 = s1.length(); 30 | len2 = s2.length(); 31 | 32 | lb1 = reinterpret_cast(s1.c_str()); 33 | lb2 = reinterpret_cast(s2.c_str()); 34 | 35 | ops = lev_editops_find(len1, lb1, len2, lb2, &n); 36 | if (ops != nullptr) { 37 | bops = 
lev_editops_to_opcodes(n, ops, &nb, len1, len2); 38 | if (bops != nullptr) { 39 | opcodes.assign(bops, bops + nb); 40 | free(bops); 41 | } 42 | free(ops); 43 | } 44 | 45 | return opcodes; 46 | } 47 | 48 | vector get_editops(string &s1, string &s2) 49 | { 50 | vector editops; 51 | size_t len1, len2, n; 52 | const lev_byte *lb1, *lb2; 53 | LevEditOp *ops; 54 | 55 | len1 = s1.length(); 56 | len2 = s2.length(); 57 | 58 | lb1 = reinterpret_cast(s1.c_str()); 59 | lb2 = reinterpret_cast(s2.c_str()); 60 | 61 | ops = lev_editops_find(len1, lb1, len2, lb2, &n); 62 | if (ops != nullptr) { 63 | editops.assign(ops, ops + n); 64 | free(ops); 65 | } 66 | 67 | return editops; 68 | } 69 | 70 | vector get_opcodes(vector &v, string &s1, string &s2) 71 | { 72 | vector opcodes; 73 | size_t len1, len2, n; 74 | LevEditOp *ops; 75 | LevOpCode *bops; 76 | 77 | n = v.size(); 78 | len1 = s1.length(); 79 | len2 = s2.length(); 80 | 81 | ops = v.data(); 82 | bops = lev_editops_to_opcodes(n, ops, &n, len1, len2); 83 | if (bops != nullptr) { 84 | opcodes.assign(bops, bops + n); 85 | free(bops); 86 | } 87 | 88 | return opcodes; 89 | } 90 | 91 | vector get_matching_blocks(vector &v, string &s1, string &s2) 92 | { 93 | vector blocks; 94 | size_t n, nmb, len1, len2; 95 | LevMatchingBlock *mblocks; 96 | 97 | n = v.size(); 98 | len1 = s1.length(); 99 | len2 = s2.length(); 100 | 101 | mblocks = lev_opcodes_matching_blocks(len1, len2, n, v.data(), &nmb); 102 | if (mblocks != nullptr) { 103 | blocks.assign(mblocks, mblocks + nmb); 104 | free(mblocks); 105 | } 106 | 107 | return blocks; 108 | } 109 | 110 | } // ns wrapper 111 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LIB_INCLUDE_DIRS ${LIB_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/include) 2 | include_directories(${LIB_INCLUDE_DIRS}) 3 | 4 | file(GLOB_RECURSE SOURCES RELATIVE ${PROJECT_SOURCE_DIR}/test *.c[p]*) 5 
| add_executable(main ${SOURCES}) 6 | target_link_libraries(main fuzzywuzzy) -------------------------------------------------------------------------------- /test/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "fuzzywuzzy.hpp" 4 | #include "process.hpp" 5 | 6 | int main() 7 | { 8 | const string a = "I'm in your mind", b = "I'm in your mind fuzz"; 9 | const string c = "fuzzy wuzzy was a bear", d = "wuzzy fuzzy was a bear"; 10 | 11 | std::cout << fuzz::ratio(a, b) << '\n'; 12 | std::cout << fuzz::partial_ratio(a, b) << '\n'; 13 | std::cout << fuzz::token_sort_ratio(c, d) << '\n'; 14 | 15 | std::vector v = {"fuzzy", "wuzzy", "wuzzy", "fuzzy", "fuzzy", " "}; 16 | auto erg = fuzz::dedupe(v); 17 | } 18 | --------------------------------------------------------------------------------