├── .gitattributes ├── .gitignore ├── test ├── distance │ ├── examples │ │ └── ocr.hpp │ ├── CMakeLists.txt │ ├── tests-OSA.cpp │ ├── tests-Hamming.cpp │ ├── tests-Jaro.cpp │ ├── tests-JaroWinkler.cpp │ └── tests-DamerauLevenshtein.cpp ├── tests-common.cpp └── CMakeLists.txt ├── rapidfuzz ├── rapidfuzz_all.hpp ├── details │ ├── simd.hpp │ ├── type_traits.hpp │ ├── CharSet.hpp │ ├── SplittedSentenceView.hpp │ ├── common.hpp │ ├── Range.hpp │ ├── common_impl.hpp │ ├── Matrix.hpp │ ├── GrowingHashmap.hpp │ ├── intrinsics.hpp │ └── PatternMatchVector.hpp ├── distance │ ├── Prefix_impl.hpp │ ├── Postfix_impl.hpp │ ├── Hamming_impl.hpp │ ├── Indel_impl.hpp │ ├── JaroWinkler_impl.hpp │ ├── Jaro.hpp │ ├── Prefix.hpp │ ├── Postfix.hpp │ ├── DamerauLevenshtein_impl.hpp │ ├── JaroWinkler.hpp │ ├── Hamming.hpp │ ├── DamerauLevenshtein.hpp │ └── Indel.hpp └── distance.hpp ├── rapidfuzz_reference ├── README.md ├── Indel.hpp ├── common.hpp ├── LCSseq.hpp ├── Hamming.hpp ├── JaroWinkler.hpp ├── OSA.hpp ├── Levenshtein.hpp ├── Jaro.hpp └── DamerauLevenshtein.hpp ├── examples └── cmake_installed │ ├── CMakeLists.txt │ └── main.cpp ├── docs └── literature │ ├── hyrro_2002.bib │ ├── hyrro_lcs_2004.bib │ ├── hyrro_2004.bib │ ├── myers_1999.bib │ └── wagner_fischer_1974.bib ├── cmake └── rapidfuzzConfig.cmake.in ├── .github ├── workflows │ ├── documentation.yml │ └── cmake.yml └── RapidFuzz.svg ├── fuzzing ├── CMakeLists.txt ├── fuzz_indel_editops.cpp ├── fuzz_levenshtein_editops.cpp ├── fuzzing.hpp ├── fuzz_indel_distance.cpp ├── fuzz_jaro_similarity.cpp ├── fuzz_osa_distance.cpp ├── fuzz_damerau_levenshtein_distance.cpp ├── fuzz_lcs_similarity.cpp └── fuzz_levenshtein_distance.cpp ├── .clang-format ├── bench ├── CMakeLists.txt ├── bench-jarowinkler.cpp ├── bench-lcs.cpp ├── bench-fuzz.cpp └── bench-levenshtein.cpp ├── LICENSE ├── Doxyfile ├── CHANGELOG.md ├── tools └── amalgamation.py └── CMakeLists.txt /.gitattributes: 
-------------------------------------------------------------------------------- 1 | *.impl linguist-language=C++ 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .cache/ 3 | .idea/ 4 | build/ 5 | .cache/ 6 | *.data 7 | *.so 8 | *.o 9 | *.out -------------------------------------------------------------------------------- /test/distance/examples/ocr.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | extern std::basic_string ocr_example1; 5 | extern std::basic_string ocr_example2; 6 | -------------------------------------------------------------------------------- /rapidfuzz/rapidfuzz_all.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include -------------------------------------------------------------------------------- /rapidfuzz_reference/README.md: -------------------------------------------------------------------------------- 1 | ## rapidfuzz_reference 2 | 3 | This includes reference implementations of various string matching algorithms, 4 | which can be used to validate the results of faster implementations. 
-------------------------------------------------------------------------------- /examples/cmake_installed/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | project(cmake_installed CXX) 3 | 4 | find_package(rapidfuzz REQUIRED) 5 | add_executable(foo main.cpp) 6 | target_link_libraries(foo rapidfuzz::rapidfuzz) -------------------------------------------------------------------------------- /docs/literature/hyrro_2002.bib: -------------------------------------------------------------------------------- 1 | @article{hyrro_2002, 2 | author = {Hyyro, Heikki}, 3 | year = {2002}, 4 | month = {10}, 5 | pages = {}, 6 | title = {Explaining and Extending the Bit-parallel Approximate String Matching Algorithm of Myers} 7 | } 8 | -------------------------------------------------------------------------------- /examples/cmake_installed/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main() 6 | { 7 | std::string a = "aaaa"; 8 | std::string b = "abab"; 9 | std::cout << rapidfuzz::fuzz::ratio(a, b) << std::endl; 10 | } -------------------------------------------------------------------------------- /docs/literature/hyrro_lcs_2004.bib: -------------------------------------------------------------------------------- 1 | @article{hyrro_lcs_2004, 2 | author = {Hyyro, Heikki}, 3 | year = {2004}, 4 | month = {08}, 5 | pages = {}, 6 | title = {Bit-Parallel LCS-length Computation Revisited}, 7 | journal = {Proc. 
15th Australasian Workshop on Combinatorial Algorithms (AWOCA 2004)} 8 | } -------------------------------------------------------------------------------- /docs/literature/hyrro_2004.bib: -------------------------------------------------------------------------------- 1 | @article{hyrro_2004, 2 | author = {Hyyro, Heikki}, 3 | year = {2004}, 4 | month = {08}, 5 | pages = {}, 6 | title = {Bit-Parallel LCS-length Computation Revisited}, 7 | journal = {Proc. 15th Australasian Workshop on Combinatorial Algorithms (AWOCA 2004)} 8 | } 9 | -------------------------------------------------------------------------------- /cmake/rapidfuzzConfig.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | # Avoid repeatedly including the targets 4 | if(NOT TARGET rapidfuzz::rapidfuzz) 5 | # Provide path for scripts 6 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") 7 | 8 | include(${CMAKE_CURRENT_LIST_DIR}/rapidfuzzTargets.cmake) 9 | endif() -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | build_docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - run: sudo apt-get install -y doxygen 14 | - run: doxygen ./Doxyfile 15 | - uses: peaceiris/actions-gh-pages@v3 16 | with: 17 | github_token: ${{ secrets.GITHUB_TOKEN }} 18 | publish_dir: ./doxygen/html -------------------------------------------------------------------------------- /docs/literature/myers_1999.bib: -------------------------------------------------------------------------------- 1 | @article{myers_1999, 2 | author = {Myers, Gene}, 3 | title = {A Fast Bit-Vector Algorithm for Approximate String Matching Based on Dynamic Programming}, 4 | year = {1999}, 5 | issue_date = {May 1999}, 
6 | publisher = {Association for Computing Machinery}, 7 | address = {New York, NY, USA}, 8 | volume = {46}, 9 | number = {3}, 10 | issn = {0004-5411}, 11 | url = {https://doi.org/10.1145/316542.316550}, 12 | doi = {10.1145/316542.316550}, 13 | journal = {J. ACM}, 14 | month = may, 15 | pages = {395–415}, 16 | numpages = {21}, 17 | keywords = {approximate string search, sequence comparison, bit-parallelism} 18 | } 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /test/distance/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | function(rapidfuzz_add_test test) 2 | add_executable(test_${test} tests-${test}.cpp examples/ocr.cpp) 3 | target_link_libraries(test_${test} ${PROJECT_NAME}) 4 | target_link_libraries(test_${test} Catch2::Catch2WithMain) 5 | if (RAPIDFUZZ_ENABLE_LINTERS) 6 | target_link_libraries(test_${test} project_warnings) 7 | endif() 8 | add_test(NAME ${test} COMMAND test_${test}) 9 | endfunction() 10 | 11 | rapidfuzz_add_test(Hamming) 12 | rapidfuzz_add_test(Indel) 13 | rapidfuzz_add_test(LCSseq) 14 | rapidfuzz_add_test(Levenshtein) 15 | rapidfuzz_add_test(DamerauLevenshtein) 16 | rapidfuzz_add_test(OSA) 17 | rapidfuzz_add_test(Jaro) 18 | rapidfuzz_add_test(JaroWinkler) 19 | -------------------------------------------------------------------------------- /rapidfuzz/details/simd.hpp: -------------------------------------------------------------------------------- 1 | 2 | /* SPDX-License-Identifier: MIT */ 3 | /* Copyright © 2022 Max Bachmann */ 4 | #pragma once 5 | 6 | /* RAPIDFUZZ_LTO_HACK is used to differentiate functions between different 7 | * translation units to avoid warnings when using lto */ 8 | #ifndef RAPIDFUZZ_EXCLUDE_SIMD 9 | # if __AVX2__ 10 | # define RAPIDFUZZ_SIMD 11 | # define RAPIDFUZZ_AVX2 12 | # define RAPIDFUZZ_LTO_HACK 0 13 | # include 14 | 15 | # elif (defined(_M_AMD64) || defined(_M_X64)) || defined(__SSE2__) 16 | # define 
RAPIDFUZZ_SIMD 17 | # define RAPIDFUZZ_SSE2 18 | # define RAPIDFUZZ_LTO_HACK 1 19 | # include 20 | # endif 21 | #endif -------------------------------------------------------------------------------- /fuzzing/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | function(create_fuzzer fuzzer) 2 | add_executable(fuzz_${fuzzer} fuzz_${fuzzer}.cpp) 3 | target_compile_features(fuzz_${fuzzer} PUBLIC cxx_std_17) 4 | target_link_libraries(fuzz_${fuzzer} PRIVATE rapidfuzz::rapidfuzz) 5 | 6 | target_compile_options(fuzz_${fuzzer} PRIVATE -g -O1 -fsanitize=fuzzer,address -march=native) 7 | target_link_libraries(fuzz_${fuzzer} PRIVATE -fsanitize=fuzzer,address) 8 | endfunction(create_fuzzer) 9 | 10 | create_fuzzer(lcs_similarity) 11 | 12 | create_fuzzer(levenshtein_distance) 13 | create_fuzzer(levenshtein_editops) 14 | 15 | create_fuzzer(indel_distance) 16 | create_fuzzer(indel_editops) 17 | 18 | create_fuzzer(osa_distance) 19 | 20 | create_fuzzer(damerau_levenshtein_distance) 21 | 22 | create_fuzzer(jaro_similarity) 23 | -------------------------------------------------------------------------------- /fuzzing/fuzz_indel_editops.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/Indel.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | 10 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 11 | { 12 | std::basic_string s1, s2; 13 | if (!extract_strings(data, size, s1, s2)) return 0; 14 | 15 | int64_t score = rapidfuzz_reference::indel_distance(s1, s2); 16 | rapidfuzz::Editops ops = rapidfuzz::indel_editops(s1, s2); 17 | 18 | /* if the edit script has the reference length, applying it to s1 has to reproduce s2 */ 19 | if (static_cast(ops.size()) == score && s2 != rapidfuzz::editops_apply(ops, s1, s2)) 20 | throw std::logic_error("indel_editops failed"); 21 | 22 | return 0; 23 | } 24 | 
-------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | ColumnLimit: 110 2 | IndentWidth: 4 3 | AccessModifierOffset: -4 4 | 5 | AllowShortIfStatementsOnASingleLine: true 6 | PointerAlignment: Left 7 | AllowShortBlocksOnASingleLine: Always 8 | AllowShortFunctionsOnASingleLine: None 9 | BreakBeforeBraces: Custom 10 | AlwaysBreakTemplateDeclarations: true 11 | BraceWrapping: 12 | SplitEmptyFunction: false 13 | AfterCaseLabel: true 14 | AfterClass: false 15 | AfterControlStatement: MultiLine 16 | AfterEnum: false 17 | AfterFunction: true 18 | AfterNamespace: false 19 | AfterStruct: false 20 | AfterUnion: false 21 | BeforeCatch: true 22 | BeforeElse: true 23 | SplitEmptyRecord: false 24 | SplitEmptyNamespace: false 25 | AllowAllConstructorInitializersOnNextLine: true 26 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 27 | AllowShortCaseLabelsOnASingleLine: true 28 | IndentPPDirectives: AfterHash 29 | -------------------------------------------------------------------------------- /rapidfuzz_reference/Indel.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | 6 | #include "Levenshtein.hpp" 7 | #include 8 | 9 | namespace rapidfuzz_reference { 10 | 11 | template 12 | int64_t indel_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 13 | int64_t score_cutoff = std::numeric_limits::max()) 14 | { 15 | return levenshtein_distance(first1, last1, first2, last2, {1, 1, 2}, score_cutoff); 16 | } 17 | 18 | template 19 | int64_t indel_distance(const Sentence1& s1, const Sentence2& s2, 20 | int64_t score_cutoff = std::numeric_limits::max()) 21 | { 22 | return levenshtein_distance(s1, s2, {1, 1, 2}, score_cutoff); 23 | } 24 | 25 | } // namespace rapidfuzz_reference 26 | 
-------------------------------------------------------------------------------- /rapidfuzz_reference/common.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | 9 | namespace rapidfuzz_reference { 10 | 11 | template 12 | class Matrix { 13 | public: 14 | Matrix(size_t _rows, size_t _cols) : rows(_rows), cols(_cols) 15 | { 16 | matrix = new T[rows * cols]; 17 | std::fill(matrix, matrix + rows * cols, T()); 18 | } 19 | 20 | ~Matrix() 21 | { 22 | delete[] matrix; 23 | } 24 | 25 | T& operator()(ptrdiff_t row, ptrdiff_t col) 26 | { 27 | return matrix[static_cast(row) + static_cast(col) * rows]; 28 | } 29 | 30 | T& back() 31 | { 32 | return matrix[rows * cols - 1]; 33 | } 34 | 35 | size_t rows; 36 | size_t cols; 37 | T* matrix; 38 | }; 39 | 40 | } // namespace rapidfuzz_reference 41 | -------------------------------------------------------------------------------- /bench/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | FetchContent_Declare(googletest 3 | GIT_REPOSITORY https://github.com/google/googletest.git 4 | GIT_TAG v1.12.x) 5 | 6 | FetchContent_Declare(googlebenchmark 7 | GIT_REPOSITORY https://github.com/google/benchmark.git 8 | GIT_TAG main) # need master for benchmark::benchmark 9 | 10 | FetchContent_MakeAvailable( 11 | googletest 12 | googlebenchmark) 13 | 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") 15 | 16 | function(rapidfuzz_add_benchmark NAME SOURCE) 17 | add_executable(bench_${NAME} ${SOURCE}) 18 | target_link_libraries(bench_${NAME} PRIVATE ${PROJECT_NAME}) 19 | target_link_libraries(bench_${NAME} PRIVATE benchmark::benchmark) 20 | endfunction() 21 | 22 | rapidfuzz_add_benchmark(lcs bench-lcs.cpp) 23 | rapidfuzz_add_benchmark(fuzz bench-fuzz.cpp) 24 | rapidfuzz_add_benchmark(levenshtein 
bench-levenshtein.cpp) 25 | rapidfuzz_add_benchmark(jarowinkler bench-jarowinkler.cpp) 26 | -------------------------------------------------------------------------------- /rapidfuzz_reference/LCSseq.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "Indel.hpp" 6 | 7 | #include 8 | #include 9 | 10 | namespace rapidfuzz_reference { 11 | 12 | template 13 | int64_t lcs_seq_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 14 | int64_t score_cutoff = 0) 15 | { 16 | int64_t maximum = std::distance(first1, last1) + std::distance(first2, last2); 17 | int64_t dist = indel_distance(first1, last1, first2, last2); 18 | int64_t sim = (maximum - dist) / 2; 19 | return (sim >= score_cutoff) ? sim : 0; 20 | } 21 | 22 | template 23 | int64_t lcs_seq_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0) 24 | { 25 | return lcs_seq_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff); 26 | } 27 | 28 | } // namespace rapidfuzz_reference 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2020 Max Bachmann 2 | Copyright © 2011 Adam Cohen 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining 5 | a copy of this software and associated documentation files (the 6 | "Software"), to deal in the Software without restriction, including 7 | without limitation the rights to use, copy, modify, merge, publish, 8 | distribute, sublicense, and/or sell copies of the Software, and to 9 | permit persons to whom the Software is furnished to do so, subject to 10 | the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial 
portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 19 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 20 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 21 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Prefix_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include "rapidfuzz/details/common.hpp" 6 | #include 7 | #include 8 | 9 | namespace rapidfuzz::detail { 10 | 11 | class Prefix : public SimilarityBase::max()> { 12 | friend SimilarityBase::max()>; 13 | friend NormalizedMetricBase; 14 | 15 | template 16 | static int64_t maximum(Range s1, Range s2) 17 | { 18 | return std::max(s1.size(), s2.size()); 19 | } 20 | 21 | template 22 | static int64_t _similarity(Range s1, Range s2, int64_t score_cutoff, 23 | [[maybe_unused]] int64_t score_hint) 24 | { 25 | int64_t dist = static_cast(remove_common_prefix(s1, s2)); 26 | return (dist >= score_cutoff) ? 
dist : 0; 27 | } 28 | }; 29 | 30 | } // namespace rapidfuzz::detail 31 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Postfix_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include "rapidfuzz/details/common.hpp" 6 | #include 7 | #include 8 | 9 | namespace rapidfuzz::detail { 10 | 11 | class Postfix : public SimilarityBase::max()> { 12 | friend SimilarityBase::max()>; 13 | friend NormalizedMetricBase; 14 | 15 | template 16 | static int64_t maximum(Range s1, Range s2) 17 | { 18 | return std::max(s1.size(), s2.size()); 19 | } 20 | 21 | template 22 | static int64_t _similarity(Range s1, Range s2, int64_t score_cutoff, 23 | [[maybe_unused]] int64_t score_hint) 24 | { 25 | int64_t dist = static_cast(remove_common_suffix(s1, s2)); 26 | return (dist >= score_cutoff) ? dist : 0; 27 | } 28 | }; 29 | 30 | } // namespace rapidfuzz::detail 31 | -------------------------------------------------------------------------------- /docs/literature/wagner_fischer_1974.bib: -------------------------------------------------------------------------------- 1 | @article{wagner_fischer_1974, 2 | author = {Wagner, Robert A. and Fischer, Michael J.}, 3 | title = {The String-to-String Correction Problem}, 4 | year = {1974}, 5 | issue_date = {Jan. 1974}, 6 | publisher = {Association for Computing Machinery}, 7 | address = {New York, NY, USA}, 8 | volume = {21}, 9 | number = {1}, 10 | issn = {0004-5411}, 11 | url = {https://doi.org/10.1145/321796.321811}, 12 | doi = {10.1145/321796.321811}, 13 | abstract = {The string-to-string correction problem is to determine the distance between two strings as measured by the minimum cost sequence of “edit operations” needed to change the one string into the other. 
The edit operations investigated allow changing one symbol of a string into another single symbol, deleting one symbol from a string, or inserting a single symbol into a string. An algorithm is presented which solves this problem in time proportional to the product of the lengths of the two strings. Possible applications are to the problems of automatic spelling correction and determining the longest subsequence of characters common to two strings.}, 14 | journal = {J. ACM}, 15 | month = jan, 16 | pages = {168–173}, 17 | numpages = {6} 18 | } 19 | 20 | 21 | -------------------------------------------------------------------------------- /rapidfuzz_reference/Hamming.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | 9 | namespace rapidfuzz_reference { 10 | 11 | template 12 | int64_t hamming_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 13 | int64_t score_cutoff = std::numeric_limits::max()) 14 | { 15 | ptrdiff_t len1 = std::distance(first1, last1); 16 | ptrdiff_t len2 = std::distance(first2, last2); 17 | if (len1 != len2) throw std::invalid_argument("Sequences are not the same length."); 18 | 19 | int64_t dist = 0; 20 | for (ptrdiff_t i = 0; i < len1; ++i) 21 | dist += bool(first1[i] != first2[i]); 22 | 23 | return (dist <= score_cutoff) ? 
dist : score_cutoff + 1; 24 | } 25 | 26 | /* Convenience overload: forwards both sequences to the iterator-based hamming_distance. */ 27 | template 28 | int64_t hamming_distance(const Sentence1& s1, const Sentence2& s2, 29 | int64_t score_cutoff = std::numeric_limits::max()) 30 | { 31 | return hamming_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff); 32 | } 33 | 34 | } // namespace rapidfuzz_reference 35 | -------------------------------------------------------------------------------- /test/tests-common.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | TEST_CASE("remove affix") 6 | { 7 | std::string s1 = "aabbbbaaaa"; 8 | std::string s2 = "aaabbbbaaaaa"; 9 | 10 | { 11 | rapidfuzz::detail::Range s1_(s1); 12 | rapidfuzz::detail::Range s2_(s2); 13 | REQUIRE(rapidfuzz::detail::remove_common_prefix(s1_, s2_) == 2); 14 | REQUIRE(s1_ == rapidfuzz::detail::Range("bbbbaaaa")); 15 | REQUIRE(s2_ == rapidfuzz::detail::Range("abbbbaaaaa")); 16 | } 17 | 18 | { 19 | rapidfuzz::detail::Range s1_(s1); 20 | rapidfuzz::detail::Range s2_(s2); 21 | REQUIRE(rapidfuzz::detail::remove_common_suffix(s1_, s2_) == 4); 22 | REQUIRE(s1_ == rapidfuzz::detail::Range("aabbbb")); 23 | REQUIRE(s2_ == rapidfuzz::detail::Range("aaabbbba")); 24 | } 25 | 26 | { 27 | rapidfuzz::detail::Range s1_(s1); 28 | rapidfuzz::detail::Range s2_(s2); 29 | auto affix = rapidfuzz::detail::remove_common_affix(s1_, s2_); 30 | REQUIRE(affix.prefix_len == 2); 31 | REQUIRE(affix.suffix_len == 4); 32 | REQUIRE(s1_ == rapidfuzz::detail::Range("bbbb")); 33 | REQUIRE(s2_ == rapidfuzz::detail::Range("abbbba")); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /rapidfuzz_reference/JaroWinkler.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "Jaro.hpp" 6 | 7 | namespace rapidfuzz_reference { 8 | 9 | template 
>> 11 | double jaro_winkler_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last, 12 | double prefix_weight = 0.1, double score_cutoff = 0.0) 13 | { 14 | int64_t min_len = std::min(std::distance(P_first, P_last), std::distance(T_first, T_last)); 15 | int64_t max_prefix = std::min(min_len, 4); 16 | 17 | int64_t prefix = 0; 18 | for (; prefix < max_prefix; ++prefix) 19 | if (T_first[prefix] != P_first[prefix]) break; 20 | 21 | double Sim = jaro_similarity(P_first, P_last, T_first, T_last); 22 | if (Sim > 0.7) Sim += static_cast(prefix) * prefix_weight * (1.0 - Sim); 23 | 24 | return (Sim >= score_cutoff) ? Sim : 0; 25 | } 26 | 27 | template 28 | double jaro_winkler_similarity(const Sentence1& s1, const Sentence2& s2, double prefix_weight = 0.1, 29 | double score_cutoff = 0.0) 30 | { 31 | return jaro_winkler_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), prefix_weight, 32 | score_cutoff); 33 | } 34 | 35 | } /* namespace rapidfuzz_reference */ 36 | -------------------------------------------------------------------------------- /fuzzing/fuzz_levenshtein_editops.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/Levenshtein.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | 10 | void validate_editops(const std::basic_string& s1, const std::basic_string& s2, int64_t score, int64_t score_hint = std::numeric_limits::max()) 11 | { 12 | rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2, score_hint); 13 | if (static_cast(ops.size()) == score && s2 != rapidfuzz::editops_apply(ops, s1, s2)) 14 | throw std::logic_error("levenshtein_editops failed"); 15 | } 16 | 17 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 18 | { 19 | std::basic_string s1, s2; 20 | if (!extract_strings(data, size, s1, s2)) return 0; 21 | 22 | 
/* hirschbergs algorithm is only used for very long sequences which are apparently not generated a lot by 23 | * the fuzzer */ 24 | for (int i = 0; i < 10; i++) { 25 | int64_t score = rapidfuzz_reference::levenshtein_distance(s1, s2); 26 | validate_editops(s1, s2, score); 27 | validate_editops(s1, s2, score, 64); 28 | validate_editops(s1, s2, score, score != 0 ? score - 1 : 0); 29 | validate_editops(s1, s2, score, score); 30 | 31 | s1 = str_multiply(s1, 2); 32 | s2 = str_multiply(s2, 2); 33 | } 34 | 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /fuzzing/fuzzing.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | static inline bool extract_strings(const uint8_t* data, size_t size, std::basic_string& s1, 7 | std::basic_string& s2) 8 | { 9 | if (size <= sizeof(uint32_t)) { 10 | return false; 11 | } 12 | uint32_t len1 = *(uint32_t*)data; 13 | 14 | if (len1 > size - sizeof(len1)) { 15 | return false; 16 | } 17 | 18 | data += sizeof(len1); 19 | size -= sizeof(len1); 20 | s1 = std::basic_string(data, len1); 21 | s2 = std::basic_string(data + len1, size - len1); 22 | return true; 23 | } 24 | 25 | template 26 | static inline T pow(T x, unsigned int p) 27 | { 28 | if (p == 0) return 1; 29 | if (p == 1) return x; 30 | 31 | T tmp = pow(x, p / 2); 32 | if (p % 2 == 0) 33 | return tmp * tmp; 34 | else 35 | return x * tmp * tmp; 36 | } 37 | 38 | template 39 | std::basic_string str_multiply(std::basic_string a, size_t b) 40 | { 41 | std::basic_string output; 42 | while (b--) 43 | output += a; 44 | 45 | return output; 46 | } 47 | 48 | template 49 | void print_seq(const std::string& name, const std::basic_string& seq) 50 | { 51 | std::cout << name << " len: " << seq.size() << " content: "; 52 | for (const auto& ch : seq) 53 | std::cout << static_cast(ch) << " "; 54 | std::cout << std::endl; 55 | } 56 | 
-------------------------------------------------------------------------------- /fuzzing/fuzz_indel_distance.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/Indel.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | void validate_distance(const std::basic_string& s1, const std::basic_string& s2, 12 | int64_t score_cutoff) 13 | { 14 | auto dist = rapidfuzz::indel_distance(s1, s2, score_cutoff); 15 | auto reference_dist = rapidfuzz_reference::indel_distance(s1, s2, score_cutoff); 16 | if (dist != reference_dist) { 17 | print_seq("s1: ", s1); 18 | print_seq("s2: ", s2); 19 | throw std::logic_error(std::string("indel distance failed (score_cutoff = ") + 20 | std::to_string(score_cutoff) + std::string(", reference_score = ") + 21 | std::to_string(reference_dist) + std::string(", score = ") + 22 | std::to_string(dist) + ")"); 23 | } 24 | } 25 | 26 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 27 | { 28 | std::basic_string s1, s2; 29 | if (!extract_strings(data, size, s1, s2)) return 0; 30 | 31 | validate_distance(s1, s2, 0); 32 | validate_distance(s1, s2, 1); 33 | validate_distance(s1, s2, 2); 34 | validate_distance(s1, s2, 3); 35 | validate_distance(s1, s2, 4); 36 | validate_distance(s1, s2, std::numeric_limits::max()); 37 | 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /rapidfuzz/details/type_traits.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2020 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace rapidfuzz { 13 | 14 | namespace detail { 15 | template 16 | auto inner_type(T const*) -> T; 17 | 18 | template 19 | 
auto inner_type(T const&) -> typename T::value_type; 20 | } // namespace detail 21 | 22 | template 23 | using char_type = decltype(detail::inner_type(std::declval())); 24 | 25 | /* backport of std::iter_value_t from C++20 26 | * This does not cover the complete functionality, but should be enough for 27 | * the use cases in this library 28 | */ 29 | template 30 | using iter_value_t = typename std::iterator_traits::value_type; 31 | 32 | // taken from 33 | // https://stackoverflow.com/questions/16893992/check-if-type-can-be-explicitly-converted 34 | template 35 | struct is_explicitly_convertible { 36 | template 37 | static void f(T); 38 | 39 | template 40 | static constexpr auto test(int /*unused*/) -> decltype(f(static_cast(std::declval())), true) 41 | { 42 | return true; 43 | } 44 | 45 | template 46 | static constexpr auto test(...) -> bool 47 | { 48 | return false; 49 | } 50 | 51 | static bool const value = test(0); 52 | }; 53 | 54 | } // namespace rapidfuzz 55 | -------------------------------------------------------------------------------- /fuzzing/fuzz_jaro_similarity.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/Jaro.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | bool is_close(double a, double b, double epsilon) 12 | { 13 | return fabs(a - b) <= ((fabs(a) < fabs(b) ? 
fabs(b) : fabs(a)) * epsilon); 14 | } 15 | 16 | void validate_distance(const std::basic_string& s1, const std::basic_string& s2) 17 | { 18 | double reference_sim = rapidfuzz_reference::jaro_similarity(s1, s2); 19 | double sim = rapidfuzz::jaro_similarity(s1, s2); 20 | 21 | if (!is_close(sim, reference_sim, 0.0001)) { 22 | print_seq("s1", s1); 23 | print_seq("s2", s2); 24 | throw std::logic_error(std::string("jaro similarity failed (reference_score = ") + 25 | std::to_string(reference_sim) + std::string(", score = ") + 26 | std::to_string(sim) + ")"); 27 | } 28 | } 29 | 30 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 31 | { 32 | std::basic_string s1, s2; 33 | if (!extract_strings(data, size, s1, s2)) return 0; 34 | 35 | validate_distance(s1, s2); 36 | 37 | /* test long sequences */ 38 | for (unsigned int i = 2; i < 9; ++i) { 39 | std::basic_string s1_ = str_multiply(s1, pow(2, i)); 40 | std::basic_string s2_ = str_multiply(s2, pow(2, i)); 41 | 42 | if (s1_.size() > 10000 || s2_.size() > 10000) break; 43 | 44 | validate_distance(s1_, s2_); 45 | } 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Hamming_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | 9 | namespace rapidfuzz::detail { 10 | 11 | class Hamming : public DistanceBase::max()> { 12 | friend DistanceBase::max()>; 13 | friend NormalizedMetricBase; 14 | 15 | template 16 | static int64_t maximum(Range s1, Range) 17 | { 18 | return s1.size(); 19 | } 20 | 21 | template 22 | static int64_t _distance(Range s1, Range s2, int64_t score_cutoff, 23 | [[maybe_unused]] int64_t score_hint) 24 | { 25 | if (s1.size() != s2.size()) throw std::invalid_argument("Sequences are not the same length."); 26 | 27 | int64_t dist = 0; 28 | 
for (ptrdiff_t i = 0; i < s1.size(); ++i) 29 | dist += bool(s1[i] != s2[i]); 30 | 31 | return (dist <= score_cutoff) ? dist : score_cutoff + 1; 32 | } 33 | }; 34 | 35 | template 36 | Editops hamming_editops(Range s1, Range s2, int64_t) 37 | { 38 | if (s1.size() != s2.size()) throw std::invalid_argument("Sequences are not the same length."); 39 | 40 | Editops ops; 41 | for (ptrdiff_t i = 0; i < s1.size(); ++i) 42 | if (s1[i] != s2[i]) ops.emplace_back(EditType::Replace, i, i); 43 | 44 | ops.set_src_len(static_cast(s1.size())); 45 | ops.set_dest_len(static_cast(s2.size())); 46 | return ops; 47 | } 48 | 49 | } // namespace rapidfuzz::detail 50 | -------------------------------------------------------------------------------- /rapidfuzz/details/CharSet.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright (c) 2022 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace rapidfuzz::detail { 13 | 14 | /* 15 | * taken from https://stackoverflow.com/a/17251989/11335032 16 | */ 17 | template 18 | bool CanTypeFitValue(const U value) 19 | { 20 | const intmax_t botT = intmax_t(std::numeric_limits::min()); 21 | const intmax_t botU = intmax_t(std::numeric_limits::min()); 22 | const uintmax_t topT = uintmax_t(std::numeric_limits::max()); 23 | const uintmax_t topU = uintmax_t(std::numeric_limits::max()); 24 | return !((botT > botU && value < static_cast(botT)) || (topT < topU && value > static_cast(topT))); 25 | } 26 | 27 | template 28 | struct CharSet; 29 | 30 | template 31 | struct CharSet { 32 | using UCharT1 = typename std::make_unsigned::type; 33 | 34 | std::array::max() + 1> m_val; 35 | 36 | CharSet() : m_val{} 37 | {} 38 | 39 | void insert(CharT1 ch) 40 | { 41 | m_val[UCharT1(ch)] = true; 42 | } 43 | 44 | template 45 | bool find(CharT2 ch) const 46 | { 47 | if (!CanTypeFitValue(ch)) return false; 48 | 49 | 
return m_val[UCharT1(ch)]; 50 | } 51 | }; 52 | 53 | template 54 | struct CharSet { 55 | std::unordered_set m_val; 56 | 57 | CharSet() : m_val{} 58 | {} 59 | 60 | void insert(CharT1 ch) 61 | { 62 | m_val.insert(ch); 63 | } 64 | 65 | template 66 | bool find(CharT2 ch) const 67 | { 68 | if (!CanTypeFitValue(ch)) return false; 69 | 70 | return m_val.find(CharT1(ch)) != m_val.end(); 71 | } 72 | }; 73 | 74 | } // namespace rapidfuzz::detail -------------------------------------------------------------------------------- /fuzzing/fuzz_osa_distance.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/OSA.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | void validate_distance(int64_t reference_dist, const std::basic_string& s1, 12 | const std::basic_string& s2, int64_t score_cutoff) 13 | { 14 | if (reference_dist > score_cutoff) reference_dist = score_cutoff + 1; 15 | 16 | auto dist = rapidfuzz::osa_distance(s1, s2, score_cutoff); 17 | if (dist != reference_dist) { 18 | print_seq("s1", s1); 19 | print_seq("s2", s2); 20 | throw std::logic_error(std::string("osa distance failed (score_cutoff = ") + 21 | std::to_string(score_cutoff) + std::string(", reference_score = ") + 22 | std::to_string(reference_dist) + std::string(", score = ") + 23 | std::to_string(dist) + ")"); 24 | } 25 | } 26 | 27 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 28 | { 29 | std::basic_string s1, s2; 30 | if (!extract_strings(data, size, s1, s2)) return 0; 31 | 32 | int64_t reference_dist = rapidfuzz_reference::osa_distance(s1, s2); 33 | 34 | /* test small band */ 35 | for (int64_t i = 4; i < 32; ++i) 36 | validate_distance(reference_dist, s1, s2, i); 37 | 38 | /* unrestricted */ 39 | validate_distance(reference_dist, s1, s2, std::numeric_limits::max()); 40 | 41 | /* test long 
sequences */ 42 | for (unsigned int i = 2; i < 9; ++i) { 43 | std::basic_string s1_ = str_multiply(s1, pow(2, i)); 44 | std::basic_string s2_ = str_multiply(s2, pow(2, i)); 45 | 46 | if (s1_.size() > 10000 || s2_.size() > 10000) break; 47 | 48 | reference_dist = rapidfuzz_reference::osa_distance(s1_, s2_); 49 | validate_distance(reference_dist, s1_, s2_, std::numeric_limits::max()); 50 | } 51 | 52 | return 0; 53 | } 54 | -------------------------------------------------------------------------------- /fuzzing/fuzz_damerau_levenshtein_distance.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/DamerauLevenshtein.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | void validate_distance(int64_t reference_dist, const std::basic_string& s1, 12 | const std::basic_string& s2, int64_t score_cutoff) 13 | { 14 | if (reference_dist > score_cutoff) reference_dist = score_cutoff + 1; 15 | 16 | auto dist = rapidfuzz::experimental::damerau_levenshtein_distance(s1, s2, score_cutoff); 17 | if (dist != reference_dist) { 18 | print_seq("s1", s1); 19 | print_seq("s2", s2); 20 | throw std::logic_error(std::string("osa distance failed (score_cutoff = ") + 21 | std::to_string(score_cutoff) + std::string(", reference_score = ") + 22 | std::to_string(reference_dist) + std::string(", score = ") + 23 | std::to_string(dist) + ")"); 24 | } 25 | } 26 | 27 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 28 | { 29 | std::basic_string s1, s2; 30 | if (!extract_strings(data, size, s1, s2)) return 0; 31 | 32 | int64_t reference_dist = rapidfuzz_reference::damerau_levenshtein_distance(s1, s2); 33 | 34 | /* test small band */ 35 | for (int64_t i = 4; i < 32; ++i) 36 | validate_distance(reference_dist, s1, s2, i); 37 | 38 | /* unrestricted */ 39 | validate_distance(reference_dist, 
s1, s2, std::numeric_limits::max()); 40 | 41 | /* test long sequences */ 42 | for (unsigned int i = 2; i < 9; ++i) { 43 | std::basic_string s1_ = str_multiply(s1, pow(2, i)); 44 | std::basic_string s2_ = str_multiply(s2, pow(2, i)); 45 | 46 | if (s1_.size() > 10000 || s2_.size() > 10000) break; 47 | 48 | reference_dist = rapidfuzz_reference::damerau_levenshtein_distance(s1_, s2_); 49 | validate_distance(reference_dist, s1_, s2_, std::numeric_limits::max()); 50 | } 51 | 52 | return 0; 53 | } 54 | -------------------------------------------------------------------------------- /fuzzing/fuzz_lcs_similarity.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/LCSseq.hpp" 5 | #include "fuzzing.hpp" 6 | #include "rapidfuzz/details/Range.hpp" 7 | #include 8 | #include 9 | #include 10 | 11 | template 12 | void validate_simd(const std::basic_string& s1, const std::basic_string& s2) 13 | { 14 | #ifdef RAPIDFUZZ_SIMD 15 | size_t count = s1.size() / MaxLen + ((s1.size() % MaxLen) != 0); 16 | rapidfuzz::experimental::MultiLCSseq scorer(count); 17 | 18 | std::vector> strings; 19 | 20 | for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) { 21 | if (std::distance(it1, s1.end()) < static_cast(MaxLen)) { 22 | strings.emplace_back(it1, s1.end()); 23 | break; 24 | } 25 | else { 26 | strings.emplace_back(it1, it1 + MaxLen); 27 | } 28 | } 29 | 30 | for (const auto& s : strings) 31 | scorer.insert(s); 32 | 33 | std::vector simd_results(scorer.result_count()); 34 | scorer.similarity(&simd_results[0], simd_results.size(), s2); 35 | 36 | for (size_t i = 0; i < strings.size(); ++i) { 37 | int64_t reference_score = rapidfuzz_reference::lcs_seq_similarity(strings[i], s2); 38 | if (reference_score != simd_results[i]) { 39 | print_seq("s1: ", s1); 40 | print_seq("s2: ", s2); 41 | throw std::logic_error(std::string("lcs distance using 
simd failed (score_cutoff = ") + 42 | std::string(", reference_score = ") + std::to_string(reference_score) + 43 | std::string(", score = ") + std::to_string(simd_results[i]) + ")"); 44 | } 45 | } 46 | #else 47 | (void)s1; 48 | (void)s2; 49 | #endif 50 | } 51 | 52 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 53 | { 54 | std::basic_string s1, s2; 55 | if (!extract_strings(data, size, s1, s2)) { 56 | return 0; 57 | } 58 | 59 | if (s1.size() == 0) { 60 | return 0; 61 | } 62 | 63 | validate_simd<8>(s1, s2); 64 | validate_simd<16>(s1, s2); 65 | validate_simd<32>(s1, s2); 66 | validate_simd<64>(s1, s2); 67 | 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /rapidfuzz_reference/OSA.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "common.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace rapidfuzz_reference { 14 | 15 | template 16 | Matrix osa_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) 17 | { 18 | ptrdiff_t len1 = std::distance(first1, last1); 19 | ptrdiff_t len2 = std::distance(first2, last2); 20 | 21 | Matrix matrix(static_cast(len1) + 1, static_cast(len2) + 1); 22 | 23 | for (ptrdiff_t i = 0; i <= len1; ++i) 24 | matrix(static_cast(i), 0) = i; 25 | for (ptrdiff_t i = 0; i <= len2; ++i) 26 | matrix(0, static_cast(i)) = i; 27 | 28 | for (ptrdiff_t pos1 = 0; pos1 < len1; ++pos1) { 29 | for (ptrdiff_t pos2 = 0; pos2 < len2; ++pos2) { 30 | ptrdiff_t cost = (first1[pos1] == first2[pos2]) ? 
0 : 1; 31 | 32 | matrix(pos1 + 1, pos2 + 1) = 33 | std::min({matrix(pos1, pos2 + 1) + 1, matrix(pos1 + 1, pos2) + 1, matrix(pos1, pos2) + cost}); 34 | 35 | if (pos1 == 0 || pos2 == 0) continue; 36 | if (first1[pos1] != first2[pos2 - 1]) continue; 37 | if (first1[pos1 - 1] != first2[pos2]) continue; 38 | 39 | matrix(pos1 + 1, pos2 + 1) = 40 | std::min(matrix(pos1 + 1, pos2 + 1), matrix(pos1 - 1, pos2 - 1) + cost); 41 | } 42 | } 43 | 44 | return matrix; 45 | } 46 | 47 | template 48 | int64_t osa_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 49 | int64_t score_cutoff = std::numeric_limits::max()) 50 | { 51 | auto matrix = osa_matrix(first1, last1, first2, last2); 52 | int64_t dist = matrix.back(); 53 | return (dist <= score_cutoff) ? dist : score_cutoff + 1; 54 | } 55 | 56 | template 57 | int64_t osa_distance(const Sentence1& s1, const Sentence2& s2, 58 | int64_t score_cutoff = std::numeric_limits::max()) 59 | { 60 | return osa_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff); 61 | } 62 | 63 | } // namespace rapidfuzz_reference 64 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(Catch2 3 QUIET) 2 | if (Catch2_FOUND) 3 | message("Using system supplied version of Catch2") 4 | else() 5 | message("Using FetchContent to load Catch2") 6 | include(FetchContent) 7 | FetchContent_Declare( 8 | Catch2 9 | GIT_REPOSITORY https://github.com/catchorg/Catch2.git 10 | GIT_TAG v3.0.1 11 | ) 12 | FetchContent_MakeAvailable(Catch2) 13 | endif() 14 | 15 | if (RAPIDFUZZ_ENABLE_LINTERS) 16 | # include aminya & jason turner's C++ best practices recommended cmake project utilities 17 | message("Enable Linters on test build") 18 | include(FetchContent) 19 | 20 | if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20) 21 | FetchContent_Declare(_project_options URL 
https://github.com/aminya/project_options/archive/refs/tags/v0.26.2.zip) 22 | else() 23 | FetchContent_Declare(_project_options URL https://github.com/aminya/project_options/archive/refs/tags/v0.25.2.zip) 24 | endif() 25 | FetchContent_MakeAvailable(_project_options) 26 | include(${_project_options_SOURCE_DIR}/Index.cmake) 27 | 28 | project_options( 29 | # ENABLE_CACHE 30 | # ENABLE_CONAN 31 | WARNINGS_AS_ERRORS 32 | # ENABLE_CPPCHECK 33 | # ENABLE_CLANG_TIDY 34 | # ENABLE_INCLUDE_WHAT_YOU_USE 35 | # ENABLE_COVERAGE 36 | # ENABLE_PCH 37 | # PCH_HEADERS 38 | # ENABLE_DOXYGEN 39 | # ENABLE_IPO 40 | # ENABLE_USER_LINKER 41 | # ENABLE_BUILD_WITH_TIME_TRACE 42 | # ENABLE_UNITY 43 | # ENABLE_SANITIZER_ADDRESS 44 | # ENABLE_SANITIZER_LEAK 45 | # ENABLE_SANITIZER_UNDEFINED_BEHAVIOR 46 | # ENABLE_SANITIZER_THREAD 47 | # ENABLE_SANITIZER_MEMORY 48 | # CLANG_WARNINGS "-Weverything" 49 | ) 50 | endif() 51 | 52 | function(rapidfuzz_add_test test) 53 | add_executable(test_${test} tests-${test}.cpp) 54 | target_link_libraries(test_${test} ${PROJECT_NAME}) 55 | target_link_libraries(test_${test} Catch2::Catch2WithMain) 56 | if (RAPIDFUZZ_ENABLE_LINTERS) 57 | target_link_libraries(test_${test} project_warnings) 58 | endif() 59 | add_test(NAME ${test} COMMAND test_${test}) 60 | endfunction() 61 | 62 | rapidfuzz_add_test(fuzz) 63 | rapidfuzz_add_test(common) 64 | 65 | add_subdirectory(distance) 66 | -------------------------------------------------------------------------------- /rapidfuzz/details/SplittedSentenceView.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace rapidfuzz::detail { 8 | 9 | template 10 | class SplittedSentenceView { 11 | public: 12 | using CharT = iter_value_t; 13 | 14 | SplittedSentenceView(RangeVec sentence) noexcept( 15 | std::is_nothrow_move_constructible_v>) 16 | : m_sentence(std::move(sentence)) 17 | {} 18 | 19 | size_t dedupe(); 20 | 
size_t size() const; 21 | 22 | size_t length() const 23 | { 24 | return size(); 25 | } 26 | 27 | bool empty() const 28 | { 29 | return m_sentence.empty(); 30 | } 31 | 32 | size_t word_count() const 33 | { 34 | return m_sentence.size(); 35 | } 36 | 37 | std::basic_string join() const; 38 | 39 | const RangeVec& words() const 40 | { 41 | return m_sentence; 42 | } 43 | 44 | private: 45 | RangeVec m_sentence; 46 | }; 47 | 48 | template 49 | size_t SplittedSentenceView::dedupe() 50 | { 51 | size_t old_word_count = word_count(); 52 | m_sentence.erase(std::unique(m_sentence.begin(), m_sentence.end()), m_sentence.end()); 53 | return old_word_count - word_count(); 54 | } 55 | 56 | template 57 | size_t SplittedSentenceView::size() const 58 | { 59 | if (m_sentence.empty()) return 0; 60 | 61 | // there is a whitespace between each word 62 | size_t result = m_sentence.size() - 1; 63 | for (const auto& word : m_sentence) { 64 | result += static_cast(std::distance(word.begin(), word.end())); 65 | } 66 | 67 | return result; 68 | } 69 | 70 | template 71 | auto SplittedSentenceView::join() const -> std::basic_string 72 | { 73 | if (m_sentence.empty()) { 74 | return std::basic_string(); 75 | } 76 | 77 | auto sentence_iter = m_sentence.begin(); 78 | std::basic_string joined(sentence_iter->begin(), sentence_iter->end()); 79 | const std::basic_string whitespace{0x20}; 80 | ++sentence_iter; 81 | for (; sentence_iter != m_sentence.end(); ++sentence_iter) { 82 | joined.append(whitespace) 83 | .append(std::basic_string(sentence_iter->begin(), sentence_iter->end())); 84 | } 85 | return joined; 86 | } 87 | 88 | } // namespace rapidfuzz::detail 89 | -------------------------------------------------------------------------------- /rapidfuzz_reference/Levenshtein.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "common.hpp" 6 | #include 7 | 
#include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace rapidfuzz_reference { 14 | 15 | struct LevenshteinWeightTable { 16 | int64_t insert_cost; 17 | int64_t delete_cost; 18 | int64_t replace_cost; 19 | }; 20 | 21 | template 22 | Matrix levenshtein_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 23 | LevenshteinWeightTable weights = {1, 1, 1}) 24 | { 25 | ptrdiff_t len1 = std::distance(first1, last1); 26 | ptrdiff_t len2 = std::distance(first2, last2); 27 | 28 | Matrix matrix(static_cast(len1) + 1, static_cast(len2) + 1); 29 | 30 | for (ptrdiff_t i = 0; i <= len1; ++i) 31 | matrix(i, 0) = i * weights.delete_cost; 32 | for (ptrdiff_t i = 0; i <= len2; ++i) 33 | matrix(0, i) = i * weights.insert_cost; 34 | 35 | for (ptrdiff_t pos1 = 0; pos1 < len1; ++pos1) { 36 | for (ptrdiff_t pos2 = 0; pos2 < len2; ++pos2) { 37 | ptrdiff_t cost = (first1[pos1] == first2[pos2]) ? 0 : weights.replace_cost; 38 | 39 | matrix(pos1 + 1, pos2 + 1) = 40 | std::min({matrix(pos1, pos2 + 1) + weights.delete_cost, 41 | matrix(pos1 + 1, pos2) + weights.insert_cost, matrix(pos1, pos2) + cost}); 42 | } 43 | } 44 | 45 | return matrix; 46 | } 47 | 48 | template 49 | int64_t levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 50 | LevenshteinWeightTable weights = {1, 1, 1}, 51 | int64_t score_cutoff = std::numeric_limits::max()) 52 | { 53 | auto matrix = levenshtein_matrix(first1, last1, first2, last2, weights); 54 | int64_t dist = matrix.back(); 55 | return (dist <= score_cutoff) ? 
dist : score_cutoff + 1; 56 | } 57 | 58 | template 59 | int64_t levenshtein_distance(const Sentence1& s1, const Sentence2& s2, 60 | LevenshteinWeightTable weights = {1, 1, 1}, 61 | int64_t score_cutoff = std::numeric_limits::max()) 62 | { 63 | return levenshtein_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), weights, 64 | score_cutoff); 65 | } 66 | 67 | } // namespace rapidfuzz_reference 68 | -------------------------------------------------------------------------------- /rapidfuzz_reference/Jaro.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "common.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace rapidfuzz_reference { 15 | 16 | template 17 | double jaro_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last, 18 | double score_cutoff = 0.0) 19 | { 20 | size_t P_len = static_cast(std::distance(P_first, P_last)); 21 | size_t T_len = static_cast(std::distance(T_first, T_last)); 22 | 23 | if (!P_len || !T_len) return 0; 24 | 25 | std::vector P_flag(P_len + 1); 26 | std::vector T_flag(T_len + 1); 27 | 28 | size_t Bound = std::max(P_len, T_len) / 2; 29 | if (Bound > 0) Bound--; 30 | 31 | size_t CommonChars = 0; 32 | for (size_t i = 0; i < T_len; i++) { 33 | size_t lowlim = (i >= Bound) ? i - Bound : 0; 34 | size_t hilim = (i + Bound <= P_len - 1) ? 
(i + Bound) : P_len - 1; 35 | for (size_t j = lowlim; j <= hilim; j++) { 36 | if (!P_flag[j] && (P_first[static_cast(j)] == T_first[static_cast(i)])) { 37 | T_flag[i] = 1; 38 | P_flag[j] = 1; 39 | CommonChars++; 40 | break; 41 | } 42 | } 43 | } 44 | 45 | // Count the number of transpositions 46 | size_t Transpositions = 0; 47 | size_t k = 0; 48 | for (size_t i = 0; i < T_len; i++) { 49 | if (T_flag[i]) { 50 | size_t j = k; 51 | for (; j < P_len; j++) { 52 | if (P_flag[j]) { 53 | k = j + 1; 54 | break; 55 | } 56 | } 57 | if (T_first[static_cast(i)] != P_first[static_cast(j)]) Transpositions++; 58 | } 59 | } 60 | 61 | Transpositions /= 2; 62 | double Sim = 0; 63 | Sim += static_cast(CommonChars) / static_cast(P_len); 64 | Sim += static_cast(CommonChars) / static_cast(T_len); 65 | Sim += (static_cast(CommonChars) - static_cast(Transpositions)) / 66 | static_cast(CommonChars); 67 | Sim /= 3.0; 68 | return (Sim >= score_cutoff) ? Sim : 0; 69 | } 70 | 71 | template 72 | double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) 73 | { 74 | return jaro_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff); 75 | } 76 | 77 | } /* namespace rapidfuzz_reference */ 78 | -------------------------------------------------------------------------------- /rapidfuzz_reference/DamerauLevenshtein.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "common.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace rapidfuzz_reference { 15 | 16 | template 17 | Matrix damerau_levenshtein_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) 18 | { 19 | ptrdiff_t len1 = std::distance(first1, last1); 20 | ptrdiff_t len2 = std::distance(first2, last2); 21 | ptrdiff_t infinite = len1 + len2; 22 | 23 
| std::unordered_map da; 24 | Matrix matrix(static_cast(len1) + 2, static_cast(len2) + 2); 25 | matrix(0, 0) = infinite; 26 | 27 | for (ptrdiff_t i = 0; i <= len1; ++i) { 28 | matrix(i + 1, 0) = infinite; 29 | matrix(i + 1, 1) = i; 30 | } 31 | for (ptrdiff_t i = 0; i <= len2; ++i) { 32 | matrix(0, i + 1) = infinite; 33 | matrix(1, i + 1) = i; 34 | } 35 | 36 | for (ptrdiff_t pos1 = 0; pos1 < len1; ++pos1) { 37 | ptrdiff_t db = 0; 38 | for (ptrdiff_t pos2 = 0; pos2 < len2; ++pos2) { 39 | int64_t i1 = da[static_cast(first2[pos2])]; 40 | ptrdiff_t j1 = db; 41 | ptrdiff_t cost = 1; 42 | if (first1[pos1] == first2[pos2]) { 43 | cost = 0; 44 | db = pos2 + 1; 45 | } 46 | 47 | matrix(pos1 + 2, pos2 + 2) = 48 | std::min({matrix(pos1 + 1, pos2 + 1) + cost, matrix(pos1 + 2, pos2 + 1) + 1, 49 | matrix(pos1 + 1, pos2 + 2) + 1, matrix(i1, j1) + (pos1 - i1) + 1 + (pos2 - j1) 50 | 51 | }); 52 | } 53 | 54 | da[first1[pos1]] = pos1 + 1; 55 | } 56 | 57 | return matrix; 58 | } 59 | 60 | template 61 | int64_t damerau_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 62 | int64_t score_cutoff = std::numeric_limits::max()) 63 | { 64 | auto matrix = damerau_levenshtein_matrix(first1, last1, first2, last2); 65 | int64_t dist = matrix.back(); 66 | return (dist <= score_cutoff) ? 
dist : score_cutoff + 1; 67 | } 68 | 69 | template 70 | int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2, 71 | int64_t score_cutoff = std::numeric_limits::max()) 72 | { 73 | return damerau_levenshtein_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), 74 | score_cutoff); 75 | } 76 | 77 | } // namespace rapidfuzz_reference 78 | -------------------------------------------------------------------------------- /bench/bench-jarowinkler.cpp: -------------------------------------------------------------------------------- 1 | #include "rapidfuzz/distance/Jaro.hpp" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | std::string generate(int max_length) 9 | { 10 | std::string possible_characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; 11 | std::random_device rd; 12 | std::mt19937 engine(rd()); 13 | std::uniform_int_distribution<> dist(0, static_cast(possible_characters.size() - 1)); 14 | std::string ret = ""; 15 | for (int i = 0; i < max_length; i++) { 16 | int random_index = dist(engine); 17 | ret += possible_characters[static_cast(random_index)]; 18 | } 19 | return ret; 20 | } 21 | 22 | template 23 | std::basic_string str_multiply(std::basic_string a, unsigned int b) 24 | { 25 | std::basic_string output; 26 | while (b--) 27 | output += a; 28 | 29 | return output; 30 | } 31 | 32 | static void BM_JaroLongSimilarSequence(benchmark::State& state) 33 | { 34 | size_t len = state.range(0); 35 | size_t score_cutoff = state.range(1); 36 | std::string s1 = std::string("a") + str_multiply(std::string("b"), (len - 2)) + std::string("a"); 37 | std::string s2 = str_multiply(std::string("b"), len); 38 | 39 | size_t num = 0; 40 | for (auto _ : state) { 41 | benchmark::DoNotOptimize(rapidfuzz::jaro_similarity(s1, s2)); 42 | ++num; 43 | } 44 | 45 | state.counters["Rate"] = benchmark::Counter(static_cast(num * len), benchmark::Counter::kIsRate); 46 | state.counters["InvRate"] = 
benchmark::Counter(static_cast(num * len), 47 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 48 | } 49 | 50 | static void BM_JaroLongNonSimilarSequence(benchmark::State& state) 51 | { 52 | size_t len = state.range(0); 53 | size_t score_cutoff = state.range(1); 54 | std::string s1 = str_multiply(std::string("a"), len); 55 | std::string s2 = str_multiply(std::string("b"), len); 56 | 57 | size_t num = 0; 58 | for (auto _ : state) { 59 | benchmark::DoNotOptimize(rapidfuzz::jaro_similarity(s1, s2)); 60 | ++num; 61 | } 62 | 63 | state.counters["Rate"] = benchmark::Counter(static_cast(num * len), benchmark::Counter::kIsRate); 64 | state.counters["InvRate"] = benchmark::Counter(static_cast(num * len), 65 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 66 | } 67 | 68 | BENCHMARK(BM_JaroLongSimilarSequence) 69 | ->Args({100, 30}) 70 | ->Args({500, 30}) 71 | ->Args({5000, 30}) 72 | ->Args({10000, 30}) 73 | ->Args({20000, 30}) 74 | ->Args({50000, 30}); 75 | 76 | BENCHMARK(BM_JaroLongNonSimilarSequence) 77 | ->Args({100, 30}) 78 | ->Args({500, 30}) 79 | ->Args({5000, 30}) 80 | ->Args({10000, 30}) 81 | ->Args({20000, 30}) 82 | ->Args({50000, 30}); 83 | 84 | BENCHMARK_MAIN(); -------------------------------------------------------------------------------- /rapidfuzz/details/common.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace rapidfuzz::detail { 17 | 18 | template 19 | struct DecomposedSet { 20 | SplittedSentenceView difference_ab; 21 | SplittedSentenceView difference_ba; 22 | SplittedSentenceView intersection; 23 | DecomposedSet(SplittedSentenceView diff_ab, SplittedSentenceView diff_ba, 24 | SplittedSentenceView intersect) 25 | : 
difference_ab(std::move(diff_ab)), 26 | difference_ba(std::move(diff_ba)), 27 | intersection(std::move(intersect)) 28 | {} 29 | }; 30 | 31 | /** 32 | * @defgroup Common Common 33 | * Common utilities shared among multiple functions 34 | * @{ 35 | */ 36 | 37 | static inline double NormSim_to_NormDist(double score_cutoff, double imprecision = 0.00001) 38 | { 39 | return std::min(1.0, 1.0 - score_cutoff + imprecision); 40 | } 41 | 42 | template 43 | DecomposedSet set_decomposition(SplittedSentenceView a, 44 | SplittedSentenceView b); 45 | 46 | constexpr double result_cutoff(double result, double score_cutoff) 47 | { 48 | return (result >= score_cutoff) ? result : 0; 49 | } 50 | 51 | template 52 | constexpr double norm_distance(int64_t dist, int64_t lensum, double score_cutoff = 0) 53 | { 54 | double max = static_cast(Max); 55 | return result_cutoff((lensum > 0) ? (max - max * static_cast(dist) / static_cast(lensum)) 56 | : max, 57 | score_cutoff); 58 | } 59 | 60 | template 61 | static inline int64_t score_cutoff_to_distance(double score_cutoff, int64_t lensum) 62 | { 63 | return static_cast(std::ceil(static_cast(lensum) * (1.0 - score_cutoff / Max))); 64 | } 65 | 66 | template 67 | StringAffix remove_common_affix(Range& s1, Range& s2); 68 | 69 | template 70 | size_t remove_common_prefix(Range& s1, Range& s2); 71 | 72 | template 73 | size_t remove_common_suffix(Range& s1, Range& s2); 74 | 75 | template > 76 | SplittedSentenceView sorted_split(InputIt first, InputIt last); 77 | 78 | /**@}*/ 79 | 80 | } // namespace rapidfuzz::detail 81 | 82 | #include 83 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Indel_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace rapidfuzz::detail { 12 | 13 | 
template 14 | int64_t indel_distance(const BlockPatternMatchVector& block, Range s1, Range s2, 15 | int64_t score_cutoff) 16 | { 17 | int64_t maximum = s1.size() + s2.size(); 18 | int64_t lcs_cutoff = std::max(0, maximum / 2 - score_cutoff); 19 | int64_t lcs_sim = lcs_seq_similarity(block, s1, s2, lcs_cutoff); 20 | int64_t dist = maximum - 2 * lcs_sim; 21 | return (dist <= score_cutoff) ? dist : score_cutoff + 1; 22 | } 23 | 24 | template 25 | double indel_normalized_distance(const BlockPatternMatchVector& block, Range s1, Range s2, 26 | double score_cutoff) 27 | { 28 | int64_t maximum = s1.size() + s2.size(); 29 | int64_t cutoff_distance = static_cast(std::ceil(static_cast(maximum) * score_cutoff)); 30 | int64_t dist = indel_distance(block, s1, s2, cutoff_distance); 31 | double norm_dist = (maximum) ? static_cast(dist) / static_cast(maximum) : 0.0; 32 | return (norm_dist <= score_cutoff) ? norm_dist : 1.0; 33 | } 34 | 35 | template 36 | double indel_normalized_similarity(const BlockPatternMatchVector& block, Range s1, 37 | Range s2, double score_cutoff) 38 | { 39 | double cutoff_score = NormSim_to_NormDist(score_cutoff); 40 | double norm_dist = indel_normalized_distance(block, s1, s2, cutoff_score); 41 | double norm_sim = 1.0 - norm_dist; 42 | return (norm_sim >= score_cutoff) ? 
norm_sim : 0.0; 43 | } 44 | 45 | class Indel : public DistanceBase::max()> { 46 | friend DistanceBase::max()>; 47 | friend NormalizedMetricBase; 48 | 49 | template 50 | static int64_t maximum(Range s1, Range s2) 51 | { 52 | return s1.size() + s2.size(); 53 | } 54 | 55 | template 56 | static int64_t _distance(Range s1, Range s2, int64_t score_cutoff, int64_t score_hint) 57 | { 58 | int64_t maximum = Indel::maximum(s1, s2); 59 | int64_t lcs_cutoff = std::max(0, maximum / 2 - score_cutoff); 60 | int64_t lcs_hint = std::max(0, maximum / 2 - score_hint); 61 | int64_t lcs_sim = LCSseq::similarity(s1, s2, lcs_cutoff, lcs_hint); 62 | int64_t dist = maximum - 2 * lcs_sim; 63 | return (dist <= score_cutoff) ? dist : score_cutoff + 1; 64 | } 65 | }; 66 | 67 | } // namespace rapidfuzz::detail 68 | -------------------------------------------------------------------------------- /rapidfuzz/distance/JaroWinkler_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #include 5 | 6 | namespace rapidfuzz::detail { 7 | 8 | template 9 | double jaro_winkler_similarity(Range P, Range T, double prefix_weight, 10 | double score_cutoff) 11 | { 12 | int64_t P_len = P.size(); 13 | int64_t T_len = T.size(); 14 | int64_t min_len = std::min(P_len, T_len); 15 | int64_t prefix = 0; 16 | int64_t max_prefix = std::min(min_len, 4); 17 | 18 | for (; prefix < max_prefix; ++prefix) 19 | if (T[prefix] != P[prefix]) break; 20 | 21 | double jaro_score_cutoff = score_cutoff; 22 | if (jaro_score_cutoff > 0.7) { 23 | double prefix_sim = static_cast(prefix) * prefix_weight; 24 | 25 | if (prefix_sim >= 1.0) 26 | jaro_score_cutoff = 0.7; 27 | else 28 | jaro_score_cutoff = std::max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0)); 29 | } 30 | 31 | double Sim = jaro_similarity(P, T, jaro_score_cutoff); 32 | if (Sim > 0.7) Sim += static_cast(prefix) * prefix_weight * (1.0 - 
Sim); 33 | 34 | return (Sim >= score_cutoff) ? Sim : 0; 35 | } 36 | 37 | template 38 | double jaro_winkler_similarity(const BlockPatternMatchVector& PM, Range P, Range T, 39 | double prefix_weight, double score_cutoff) 40 | { 41 | int64_t P_len = P.size(); 42 | int64_t T_len = T.size(); 43 | int64_t min_len = std::min(P_len, T_len); 44 | int64_t prefix = 0; 45 | int64_t max_prefix = std::min(min_len, 4); 46 | 47 | for (; prefix < max_prefix; ++prefix) 48 | if (T[prefix] != P[prefix]) break; 49 | 50 | double jaro_score_cutoff = score_cutoff; 51 | if (jaro_score_cutoff > 0.7) { 52 | double prefix_sim = static_cast(prefix) * prefix_weight; 53 | 54 | if (prefix_sim >= 1.0) 55 | jaro_score_cutoff = 0.7; 56 | else 57 | jaro_score_cutoff = std::max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0)); 58 | } 59 | 60 | double Sim = jaro_similarity(PM, P, T, jaro_score_cutoff); 61 | if (Sim > 0.7) Sim += static_cast(prefix) * prefix_weight * (1.0 - Sim); 62 | 63 | return (Sim >= score_cutoff) ? 
Sim : 0; 64 | } 65 | 66 | class JaroWinkler : public SimilarityBase { 67 | friend SimilarityBase; 68 | friend NormalizedMetricBase; 69 | 70 | template 71 | static double maximum(Range, Range, double) noexcept 72 | { 73 | return 1.0; 74 | } 75 | 76 | template 77 | static double _similarity(Range s1, Range s2, double prefix_weight, 78 | double score_cutoff, [[maybe_unused]] double score_hint) 79 | { 80 | return jaro_winkler_similarity(s1, s2, prefix_weight, score_cutoff); 81 | } 82 | }; 83 | 84 | } // namespace rapidfuzz::detail 85 | -------------------------------------------------------------------------------- /test/distance/tests-OSA.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | template 8 | std::basic_string str_multiply(std::basic_string a, unsigned int b) 9 | { 10 | std::basic_string output; 11 | while (b--) 12 | output += a; 13 | 14 | return output; 15 | } 16 | 17 | template 18 | int64_t osa_distance(const Sentence1& s1, const Sentence2& s2, 19 | int64_t max = std::numeric_limits::max()) 20 | { 21 | int64_t res1 = rapidfuzz::osa_distance(s1, s2, max); 22 | int64_t res2 = rapidfuzz::osa_distance(s1.begin(), s1.end(), s2.begin(), s2.end(), max); 23 | rapidfuzz::CachedOSA scorer(s1); 24 | int64_t res3 = scorer.distance(s2, max); 25 | int64_t res4 = scorer.distance(s2.begin(), s2.end(), max); 26 | #ifdef RAPIDFUZZ_SIMD 27 | if (s1.size() <= 64) { 28 | std::vector results(256 / 8); 29 | 30 | if (s1.size() <= 8) { 31 | rapidfuzz::experimental::MultiOSA<8> simd_scorer(1); 32 | simd_scorer.insert(s1); 33 | simd_scorer.distance(&results[0], results.size(), s2, max); 34 | } 35 | else if (s1.size() <= 16) { 36 | rapidfuzz::experimental::MultiOSA<16> simd_scorer(1); 37 | simd_scorer.insert(s1); 38 | simd_scorer.distance(&results[0], results.size(), s2, max); 39 | } 40 | else if (s1.size() <= 32) { 41 | rapidfuzz::experimental::MultiOSA<32> simd_scorer(1); 42 | 
simd_scorer.insert(s1); 43 | simd_scorer.distance(&results[0], results.size(), s2, max); 44 | } 45 | else { 46 | rapidfuzz::experimental::MultiOSA<64> simd_scorer(1); 47 | simd_scorer.insert(s1); 48 | simd_scorer.distance(&results[0], results.size(), s2, max); 49 | } 50 | 51 | REQUIRE(res1 == results[0]); 52 | } 53 | #endif 54 | REQUIRE(res1 == res2); 55 | REQUIRE(res1 == res3); 56 | REQUIRE(res1 == res4); 57 | return res1; 58 | } 59 | 60 | /* test some very simple cases of the osa distance */ 61 | TEST_CASE("osa[simple]") 62 | { 63 | { 64 | std::string s1 = ""; 65 | std::string s2 = ""; 66 | REQUIRE(osa_distance(s1, s2) == 0); 67 | } 68 | 69 | { 70 | std::string s1 = "aaaa"; 71 | std::string s2 = ""; 72 | REQUIRE(osa_distance(s1, s2) == 4); 73 | REQUIRE(osa_distance(s2, s1) == 4); 74 | REQUIRE(osa_distance(s1, s2, 1) == 2); 75 | REQUIRE(osa_distance(s2, s1, 1) == 2); 76 | } 77 | 78 | { 79 | std::string s1 = "CA"; 80 | std::string s2 = "ABC"; 81 | REQUIRE(osa_distance(s1, s2) == 3); 82 | } 83 | 84 | { 85 | std::string s1 = "CA"; 86 | std::string s2 = "AC"; 87 | REQUIRE(osa_distance(s1, s2) == 1); 88 | } 89 | 90 | { 91 | std::string filler = str_multiply(std::string("a"), 64); 92 | std::string s1 = std::string("a") + filler + "CA" + filler + std::string("a"); 93 | std::string s2 = std::string("b") + filler + "AC" + filler + std::string("b"); 94 | REQUIRE(osa_distance(s1, s2) == 3); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /fuzzing/fuzz_levenshtein_distance.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/Levenshtein.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | template 12 | void validate_simd(const std::basic_string& s1, const std::basic_string& s2) 13 | { 14 | #ifdef RAPIDFUZZ_SIMD 15 | size_t count = 
s1.size() / MaxLen + ((s1.size() % MaxLen) != 0); 16 | if (count == 0) return; 17 | 18 | rapidfuzz::experimental::MultiLevenshtein scorer(count); 19 | 20 | std::vector> strings; 21 | 22 | for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) { 23 | if (std::distance(it1, s1.end()) < static_cast(MaxLen)) { 24 | strings.emplace_back(it1, s1.end()); 25 | break; 26 | } 27 | else { 28 | strings.emplace_back(it1, it1 + MaxLen); 29 | } 30 | } 31 | 32 | for (const auto& s : strings) 33 | scorer.insert(s); 34 | 35 | std::vector simd_results(scorer.result_count()); 36 | scorer.distance(&simd_results[0], simd_results.size(), s2); 37 | 38 | for (size_t i = 0; i < strings.size(); ++i) { 39 | int64_t reference_score = rapidfuzz_reference::levenshtein_distance(strings[i], s2); 40 | if (reference_score != simd_results[i]) { 41 | print_seq("s1: ", s1); 42 | print_seq("s2: ", s2); 43 | throw std::logic_error(std::string("levenshtein distance using simd failed (reference_score = ") + 44 | std::to_string(reference_score) + std::string(", score = ") + 45 | std::to_string(simd_results[i]) + ")"); 46 | } 47 | } 48 | #else 49 | (void)s1; 50 | (void)s2; 51 | #endif 52 | } 53 | 54 | void validate_distance(int64_t reference_dist, const std::basic_string& s1, 55 | const std::basic_string& s2, int64_t score_cutoff) 56 | { 57 | if (reference_dist > score_cutoff) reference_dist = score_cutoff + 1; 58 | 59 | auto dist = rapidfuzz::levenshtein_distance(s1, s2, {1, 1, 1}, score_cutoff); 60 | if (dist != reference_dist) { 61 | print_seq("s1: ", s1); 62 | print_seq("s2: ", s2); 63 | throw std::logic_error(std::string("levenshtein distance failed (score_cutoff = ") + 64 | std::to_string(score_cutoff) + std::string(", reference_score = ") + 65 | std::to_string(reference_dist) + std::string(", score = ") + 66 | std::to_string(dist) + ")"); 67 | } 68 | 69 | validate_simd<8>(s1, s2); 70 | validate_simd<16>(s1, s2); 71 | validate_simd<32>(s1, s2); 72 | validate_simd<64>(s1, s2); 73 | } 74 | 75 | 
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 76 | { 77 | std::basic_string s1, s2; 78 | if (!extract_strings(data, size, s1, s2)) return 0; 79 | 80 | int64_t reference_dist = rapidfuzz_reference::levenshtein_distance(s1, s2); 81 | 82 | /* test mbleven */ 83 | for (int64_t i = 0; i < 4; ++i) 84 | validate_distance(reference_dist, s1, s2, i); 85 | 86 | /* test small band */ 87 | for (int64_t i = 4; i < 32; ++i) 88 | validate_distance(reference_dist, s1, s2, i); 89 | 90 | /* unrestricted */ 91 | validate_distance(reference_dist, s1, s2, std::numeric_limits::max()); 92 | 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /test/distance/tests-Hamming.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | template 7 | int64_t hamming_distance(const Sentence1& s1, const Sentence2& s2, 8 | int64_t max = std::numeric_limits::max()) 9 | { 10 | int64_t res1 = rapidfuzz::hamming_distance(s1, s2, max); 11 | int64_t res2 = rapidfuzz::hamming_distance(s1.begin(), s1.end(), s2.begin(), s2.end(), max); 12 | rapidfuzz::CachedHamming scorer(s1); 13 | int64_t res3 = scorer.distance(s2, max); 14 | int64_t res4 = scorer.distance(s2.begin(), s2.end(), max); 15 | REQUIRE(res1 == res2); 16 | REQUIRE(res1 == res3); 17 | REQUIRE(res1 == res4); 18 | return res1; 19 | } 20 | 21 | template 22 | int64_t hamming_similarity(const Sentence1& s1, const Sentence2& s2, int64_t max = 0) 23 | { 24 | int64_t res1 = rapidfuzz::hamming_similarity(s1, s2, max); 25 | int64_t res2 = rapidfuzz::hamming_similarity(s1.begin(), s1.end(), s2.begin(), s2.end(), max); 26 | rapidfuzz::CachedHamming scorer(s1); 27 | int64_t res3 = scorer.similarity(s2, max); 28 | int64_t res4 = scorer.similarity(s2.begin(), s2.end(), max); 29 | REQUIRE(res1 == res2); 30 | REQUIRE(res1 == res3); 31 | REQUIRE(res1 == res4); 32 | return res1; 33 | } 34 | 35 | 
template 36 | double hamming_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0) 37 | { 38 | double res1 = rapidfuzz::hamming_normalized_distance(s1, s2, score_cutoff); 39 | double res2 = 40 | rapidfuzz::hamming_normalized_distance(s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff); 41 | rapidfuzz::CachedHamming scorer(s1); 42 | double res3 = scorer.normalized_distance(s2, score_cutoff); 43 | double res4 = scorer.normalized_distance(s2.begin(), s2.end(), score_cutoff); 44 | REQUIRE(res1 == Catch::Approx(res2).epsilon(0.0001)); 45 | REQUIRE(res1 == Catch::Approx(res3).epsilon(0.0001)); 46 | REQUIRE(res1 == Catch::Approx(res4).epsilon(0.0001)); 47 | return res1; 48 | } 49 | 50 | template 51 | double hamming_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) 52 | { 53 | double res1 = rapidfuzz::hamming_normalized_similarity(s1, s2, score_cutoff); 54 | double res2 = 55 | rapidfuzz::hamming_normalized_similarity(s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff); 56 | rapidfuzz::CachedHamming scorer(s1); 57 | double res3 = scorer.normalized_similarity(s2, score_cutoff); 58 | double res4 = scorer.normalized_similarity(s2.begin(), s2.end(), score_cutoff); 59 | REQUIRE(res1 == Catch::Approx(res2).epsilon(0.0001)); 60 | REQUIRE(res1 == Catch::Approx(res3).epsilon(0.0001)); 61 | REQUIRE(res1 == Catch::Approx(res4).epsilon(0.0001)); 62 | return res1; 63 | } 64 | 65 | TEST_CASE("Hamming") 66 | { 67 | std::string test = "aaaa"; 68 | std::string diff_a = "abaa"; 69 | std::string diff_b = "aaba"; 70 | std::string diff_len = "aaaaa"; 71 | 72 | SECTION("hamming calculates correct distances") 73 | { 74 | REQUIRE(hamming_distance(test, test) == 0); 75 | REQUIRE(hamming_distance(test, diff_a) == 1); 76 | REQUIRE(hamming_distance(test, diff_b) == 1); 77 | REQUIRE(hamming_distance(diff_a, diff_b) == 2); 78 | } 79 | 80 | SECTION("hamming raises exception for different lengths") 81 | { 82 | 
REQUIRE_THROWS_AS(rapidfuzz::hamming_distance(test, diff_len), std::invalid_argument); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /test/distance/tests-Jaro.cpp: -------------------------------------------------------------------------------- 1 | #include "../../rapidfuzz_reference/Jaro.hpp" 2 | #include 3 | #include 4 | #include 5 | 6 | using Catch::Approx; 7 | 8 | /* Test helper: computes the Jaro similarity of s1 and s2 through the free functions and through CachedJaro, in both the plain and the normalized variants, requires that all results agree and returns the first one. */ template 9 | double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) 10 | { 11 | double res1 = rapidfuzz::jaro_similarity(s1, s2, score_cutoff); 12 | double res2 = rapidfuzz::jaro_similarity(s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff); 13 | double res3 = rapidfuzz::jaro_normalized_similarity(s1, s2, score_cutoff); 14 | double res4 = 15 | rapidfuzz::jaro_normalized_similarity(s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff); 16 | rapidfuzz::CachedJaro scorer(s1); 17 | double res5 = scorer.similarity(s2, score_cutoff); 18 | double res6 = scorer.similarity(s2.begin(), s2.end(), score_cutoff); /* res7/res8 use the cached normalized API (counterpart of res3/res4) instead of repeating res5/res6 */ 19 | double res7 = scorer.normalized_similarity(s2, score_cutoff); 20 | double res8 = scorer.normalized_similarity(s2.begin(), s2.end(), score_cutoff); 21 | REQUIRE(res1 == Approx(res2)); 22 | REQUIRE(res1 == Approx(res3)); 23 | REQUIRE(res1 == Approx(res4)); 24 | REQUIRE(res1 == Approx(res5)); 25 | REQUIRE(res1 == Approx(res6)); 26 | REQUIRE(res1 == Approx(res7)); 27 | REQUIRE(res1 == Approx(res8)); 28 | return res1; 29 | } 30 | 31 | template 32 | double jaro_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0) 33 | { 34 | double res1 = rapidfuzz::jaro_distance(s1, s2, score_cutoff); 35 | double res2 = rapidfuzz::jaro_distance(s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff); 36 | double res3 = rapidfuzz::jaro_normalized_distance(s1, s2, score_cutoff); 37 | double res4 = 38 | rapidfuzz::jaro_normalized_distance(s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff); 39 | 
rapidfuzz::CachedJaro scorer(s1); 40 | double res5 = scorer.distance(s2, score_cutoff); 41 | double res6 = scorer.distance(s2.begin(), s2.end(), score_cutoff); /* res7/res8 use the cached normalized API (counterpart of res3/res4) instead of repeating res5/res6 */ 42 | double res7 = scorer.normalized_distance(s2, score_cutoff); 43 | double res8 = scorer.normalized_distance(s2.begin(), s2.end(), score_cutoff); 44 | REQUIRE(res1 == Approx(res2)); 45 | REQUIRE(res1 == Approx(res3)); 46 | REQUIRE(res1 == Approx(res4)); 47 | REQUIRE(res1 == Approx(res5)); 48 | REQUIRE(res1 == Approx(res6)); 49 | REQUIRE(res1 == Approx(res7)); 50 | REQUIRE(res1 == Approx(res8)); 51 | return res1; 52 | } 53 | 54 | /** 55 | * @name JaroWinklerFlagCharsTest 56 | */ 57 | TEST_CASE("JaroWinklerTest") 58 | { 59 | std::array names = {"james", "robert", "john", "michael", "william", 60 | "david", "joseph", "thomas", "charles", "mary", 61 | "patricia", "jennifer", "linda", "elizabeth", "barbara", 62 | "susan", "jessica", "sarah", "karen"}; 63 | 64 | SECTION("testFullResultWithScoreCutoff") 65 | { 66 | for (double score_cutoff = 0.0; score_cutoff < 1.1; score_cutoff += 0.1) 67 | for (const auto& name1 : names) 68 | for (const auto& name2 : names) { 69 | INFO("name1: " << name1 << ", name2: " << name2 << ", score_cutoff: " << score_cutoff); 70 | double Sim_original = rapidfuzz_reference::jaro_similarity(name1, name2, score_cutoff); 71 | double Sim_bitparallel = jaro_similarity(name1, name2, score_cutoff); 72 | double Dist_bitparallel = jaro_distance(name1, name2, 1.0 - score_cutoff); 73 | 74 | REQUIRE(Sim_original == Approx(Sim_bitparallel)); 75 | REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel)); 76 | } 77 | } 78 | } -------------------------------------------------------------------------------- /Doxyfile: -------------------------------------------------------------------------------- 1 | # Doxyfile 1.8.20 2 | 3 | PROJECT_NAME = RapidFuzz 4 | 5 | OUTPUT_DIRECTORY = doxygen 6 | 7 | 8 | 9 | # If you use STL classes (i.e. std::string, std::vector, etc.) 
but do not want 10 | # to include (a tag file for) the STL sources as input, then you should set this 11 | # tag to YES in order to let doxygen match function declarations and 12 | # definitions whose arguments contain STL classes (e.g. func(std::string); 13 | # versus func(std::string) {}). This also makes the inheritance and collaboration 14 | # diagrams that involve STL classes more complete and accurate. 15 | # The default value is: NO. 16 | 17 | BUILTIN_STL_SUPPORT = YES 18 | 19 | 20 | EXTRACT_PRIVATE = YES 21 | 22 | 23 | 24 | EXTRACT_STATIC = YES 25 | 26 | 27 | HIDE_UNDOC_MEMBERS = YES 28 | 29 | HIDE_UNDOC_CLASSES = YES 30 | 31 | 32 | # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with 33 | # their full class and namespace scopes in the documentation. If set to YES, the 34 | # scope will be hidden. 35 | # The default value is: NO. 36 | 37 | HIDE_SCOPE_NAMES = NO 38 | 39 | # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will 40 | # append additional text to a page's title, such as Class Reference. If set to 41 | # YES the compound reference will be hidden. 42 | # The default value is: NO. 43 | 44 | HIDE_COMPOUND_REFERENCE= NO 45 | 46 | # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of 47 | # the files that are included by a file in the documentation of that file. 48 | # The default value is: YES. 49 | 50 | SHOW_INCLUDE_FILES = YES 51 | 52 | # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each 53 | # grouped member an include statement to the documentation, telling the reader 54 | # which file to include in order to use the member. 55 | # The default value is: NO. 56 | 57 | SHOW_GROUPED_MEMB_INC = YES 58 | 59 | 60 | # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo 61 | # list. This list is created by putting \todo commands in the documentation. 62 | # The default value is: YES. 
63 | 64 | GENERATE_TODOLIST = NO 65 | 66 | SHOW_FILES = NO 67 | 68 | 69 | # The CITE_BIB_FILES tag can be used to specify one or more bib files containing 70 | # the reference definitions. This must be a list of .bib files. The .bib 71 | # extension is automatically appended if omitted. This requires the bibtex tool 72 | # to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. 73 | # For LaTeX the style of the bibliography can be controlled using 74 | # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the 75 | # search path. See also \cite for info how to create references. 76 | 77 | CITE_BIB_FILES = docs/literature/hyrro_lcs_2004 \ 78 | docs/literature/hyrro_2002 \ 79 | docs/literature/hyrro_2004 \ 80 | docs/literature/myers_1999 \ 81 | docs/literature/wagner_fischer_1974 82 | 83 | 84 | EXTRA_PACKAGES = amsmath xr amsfonts 85 | 86 | #--------------------------------------------------------------------------- 87 | # Configuration options related to the input files 88 | #--------------------------------------------------------------------------- 89 | 90 | INPUT = rapidfuzz 91 | 92 | FILE_PATTERNS = *.c \ 93 | *.cxx \ 94 | *.cpp \ 95 | *.h \ 96 | *.hpp \ 97 | *.md 98 | 99 | #--------------------------------------------------------------------------- 100 | # Configuration options related to the LaTeX output 101 | #--------------------------------------------------------------------------- 102 | 103 | GENERATE_LATEX = NO 104 | 105 | HAVE_DOT = YES 106 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Jaro.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | 6 | #include "rapidfuzz/details/Range.hpp" 7 | #include 8 | #include 9 | 10 | namespace rapidfuzz { 11 | 12 | template 13 | double jaro_distance(InputIt1 first1, InputIt1 
last1, InputIt2 first2, InputIt2 last2, 14 | double score_cutoff = 1.0) 15 | { 16 | return detail::Jaro::distance(first1, last1, first2, last2, score_cutoff, score_cutoff); 17 | } 18 | 19 | template 20 | double jaro_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0) 21 | { 22 | return detail::Jaro::distance(s1, s2, score_cutoff, score_cutoff); 23 | } 24 | 25 | template 26 | double jaro_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 27 | double score_cutoff = 0.0) 28 | { 29 | return detail::Jaro::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff); 30 | } 31 | 32 | template 33 | double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) 34 | { 35 | return detail::Jaro::similarity(s1, s2, score_cutoff, score_cutoff); 36 | } 37 | 38 | template 39 | double jaro_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 40 | double score_cutoff = 1.0) 41 | { 42 | return detail::Jaro::normalized_distance(first1, last1, first2, last2, score_cutoff, score_cutoff); 43 | } 44 | 45 | template 46 | double jaro_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0) 47 | { 48 | return detail::Jaro::normalized_distance(s1, s2, score_cutoff, score_cutoff); 49 | } 50 | 51 | template 52 | double jaro_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 53 | double score_cutoff = 0.0) 54 | { 55 | return detail::Jaro::normalized_similarity(first1, last1, first2, last2, score_cutoff, score_cutoff); 56 | } 57 | 58 | template 59 | double jaro_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) 60 | { 61 | return detail::Jaro::normalized_similarity(s1, s2, score_cutoff, score_cutoff); 62 | } 63 | 64 | template 65 | struct CachedJaro : public detail::CachedSimilarityBase, double, 0, 1> { 66 | template 67 | explicit CachedJaro(const Sentence1& s1_) : 
CachedJaro(detail::to_begin(s1_), detail::to_end(s1_)) 68 | {} 69 | 70 | template 71 | CachedJaro(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(detail::Range(first1, last1)) 72 | {} 73 | 74 | private: 75 | friend detail::CachedSimilarityBase, double, 0, 1>; 76 | friend detail::CachedNormalizedMetricBase>; 77 | 78 | template 79 | double maximum(detail::Range) const 80 | { 81 | return 1.0; 82 | } 83 | 84 | template 85 | double _similarity(detail::Range s2, double score_cutoff, 86 | [[maybe_unused]] double score_hint) const 87 | { 88 | return detail::jaro_similarity(PM, detail::Range(s1), s2, score_cutoff); 89 | } 90 | 91 | std::basic_string s1; 92 | detail::BlockPatternMatchVector PM; 93 | }; 94 | 95 | template 96 | explicit CachedJaro(const Sentence1& s1_) -> CachedJaro>; 97 | 98 | template 99 | CachedJaro(InputIt1 first1, InputIt1 last1) -> CachedJaro>; 100 | 101 | } // namespace rapidfuzz 102 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Changelog 2 | 3 | ### [1.10.5] - 4 | #### Fixed 5 | - fix some floating point comparisons in the test suite 6 | 7 | ### [1.10.4] - 2022-12-14 8 | #### Changed 9 | - Linters are now disabled in test builds by default and can be enabled using `RAPIDFUZZ_ENABLE_LINTERS` 10 | 11 | ### [1.10.3] - 2022-12-13 12 | #### Fixed 13 | - fix warning about `project_options` when building the test suite with `cmake>=3.24` 14 | 15 | ### [1.10.2] - 2022-12-01 16 | #### Fixed 17 | - `fuzz::partial_ratio` was not always symmetric when `len(s1) == len(s2)` 18 | - fix undefined behavior in experimental SIMD implementation 19 | 20 | ### [1.10.1] - 2022-11-02 21 | #### Fixed 22 | - fix broken sse2 support 23 | 24 | ### [1.10.0] - 2022-10-29 25 | #### Fixed 26 | - fix bug in `Levenshtein.editops` leading to crashes when used with `score_hint` 27 | 28 | #### Changed 29 | - add `score_hint` argument to 
cached implementations 30 | - add `score_hint` argument to Levenshtein functions 31 | 32 | ### [1.9.0] - 2022-10-22 33 | #### Added 34 | - added `Prefix`/`Postfix` similarity 35 | 36 | ### [1.8.0] - 2022-10-02 37 | #### Fixed 38 | - fixed incorrect score_cutoff handling in `lcs_seq_distance` 39 | 40 | #### Added 41 | - added experimental simd support for `ratio`/`Levenshtein`/`LCSseq`/`Indel` 42 | - add Jaro and JaroWinkler 43 | 44 | ### [1.7.0] - 2022-09-18 45 | #### Added 46 | - add editops to hamming distance 47 | 48 | #### Performance 49 | - strip common affix in osa distance 50 | 51 | ### [1.6.0] - 2022-09-16 52 | #### Added 53 | - add optimal string alignment (OSA) distance 54 | 55 | ### [1.5.0] - 2022-09-11 56 | #### Fixed 57 | - `fuzz::partial_ratio` did not find the optimal alignment in some edge cases 58 | 59 | #### Performance 60 | - improve performance of `fuzz::partial_ratio` 61 | 62 | ### [1.4.1] - 2022-09-11 63 | #### Fixed 64 | - fix type mismatch error 65 | 66 | ### [1.4.0] - 2022-09-10 67 | #### Performance 68 | - improve performance of Levenshtein distance/editops calculation for long 69 | sequences when providing a `score_cutoff`/`score_hint` 70 | 71 | ### [1.3.0] - 2022-09-03 72 | #### Performance 73 | - improve performance of Levenshtein distance 74 | - improve performance when `score_cutoff = 1` 75 | - improve performance for long sequences when `3 < score_cutoff < 32` 76 | - improve performance of Levenshtein editops 77 | 78 | #### Fixed 79 | - fix incorrect results of partial_ratio for long needles 80 | 81 | ### [1.2.0] - 2022-08-20 82 | #### Added 83 | - added Damerau-Levenshtein implementation 84 | - Not API stable yet, since it will be extended with weights in a future version 85 | 86 | ### [1.1.1] - 2022-07-29 87 | #### Performance 88 | - improve performance for banded Levenshtein implementation 89 | 90 | ### [1.1.0] - 2022-07-29 91 | #### Fixed 92 | - fix banded Levenshtein implementation 93 | 94 | #### Changed 95 | - implement 
Hirschberg's algorithm to reduce memory usage of 96 | levenshtein_editops 97 | 98 | ### [1.0.5] - 2022-07-23 99 | #### Fixed 100 | - fix opcode conversion for empty source sequence 101 | 102 | ### [1.0.4] - 2022-06-29 103 | #### Fixed 104 | - fix implementation of hamming_normalized_similarity 105 | - fix implementation of CachedLCSseq::distance 106 | 107 | ### [1.0.3] - 2022-06-24 108 | #### Fixed 109 | - fix integer wraparound in partial_ratio/partial_ratio_alignment 110 | 111 | ### [1.0.2] - 2022-06-11 112 | #### Fixed 113 | - fix unlimited recursion in CachedLCSseq::similarity 114 | - reduce compiler warnings 115 | 116 | ### [1.0.1] - 2022-04-16 117 | #### Fixed 118 | - fix undefined behavior in sorted_split incrementing iterator past the end 119 | - fix use after free in editops calculation 120 | - reduce compiler warnings 121 | 122 | ### [1.0.1] - 2022-04-16 123 | #### Added 124 | - added LCSseq (longest common subsequence) implementation 125 | 126 | #### Fixed 127 | - reduced compiler warnings 128 | - consider float imprecision in score_cutoff 129 | - fix incorrect score_cutoff handling in token_set_ratio and token_ratio 130 | - fix template deduction guides on MSVC 131 | -------------------------------------------------------------------------------- /.github/workflows/cmake.yml: -------------------------------------------------------------------------------- 1 | name: CMake 2 | 3 | on: [push, pull_request] 4 | 5 | env: 6 | BUILD_TYPE: Release 7 | 8 | jobs: 9 | build_linux_clang: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | BUILD_TYPE: [Release, Debug] 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | 19 | - name: Configure CMake 20 | run: cmake -B build -DCMAKE_BUILD_TYPE=${{matrix.BUILD_TYPE}} -DRAPIDFUZZ_BUILD_TESTING=1 -DRAPIDFUZZ_ENABLE_LINTERS=1 -DRAPIDFUZZ_BUILD_FUZZERS=1 -DCMAKE_CXX_COMPILER=clang++ 21 | 22 | - name: Build 23 | run: cmake --build build --config ${{matrix.BUILD_TYPE}} 24 | 25 | - name: Test 
26 | working-directory: build 27 | run: ctest -C ${{matrix.BUILD_TYPE}} --rerun-failed --output-on-failure 28 | 29 | - name: Fuzz Test 30 | working-directory: build 31 | run: | 32 | fuzzing/fuzz_lcs_similarity -max_total_time=30 33 | fuzzing/fuzz_levenshtein_distance -max_total_time=30 34 | fuzzing/fuzz_levenshtein_editops -max_total_time=30 35 | fuzzing/fuzz_indel_distance -max_total_time=30 36 | fuzzing/fuzz_indel_editops -max_total_time=30 37 | fuzzing/fuzz_osa_distance -max_total_time=30 38 | fuzzing/fuzz_damerau_levenshtein_distance -max_total_time=30 39 | 40 | build_linux_gcc: 41 | runs-on: ubuntu-latest 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | BUILD_TYPE: [Release, Debug] 46 | 47 | steps: 48 | - uses: actions/checkout@v2 49 | 50 | - name: Configure CMake 51 | run: cmake -B build -DCMAKE_BUILD_TYPE=${{matrix.BUILD_TYPE}} -DRAPIDFUZZ_BUILD_TESTING=1 -DRAPIDFUZZ_ENABLE_LINTERS=1 -DCMAKE_CXX_COMPILER=g++ 52 | 53 | - name: Build 54 | run: cmake --build build --config ${{matrix.BUILD_TYPE}} 55 | 56 | - name: Test 57 | working-directory: build 58 | run: ctest -C ${{matrix.BUILD_TYPE}} --rerun-failed --output-on-failure 59 | 60 | build_windows: 61 | runs-on: windows-latest 62 | strategy: 63 | fail-fast: false 64 | matrix: 65 | BUILD_TYPE: [Release, Debug] 66 | 67 | steps: 68 | - uses: actions/checkout@v2 69 | 70 | - name: Configure CMake 71 | run: cmake -B build -DCMAKE_BUILD_TYPE=${{matrix.BUILD_TYPE}} -DRAPIDFUZZ_BUILD_TESTING=1 -DRAPIDFUZZ_ENABLE_LINTERS=1 72 | 73 | - name: Build 74 | run: cmake --build build --config ${{matrix.BUILD_TYPE}} 75 | 76 | - name: Test 77 | working-directory: build 78 | run: ctest -C ${{matrix.BUILD_TYPE}} --rerun-failed --output-on-failure 79 | 80 | build_cmake_installed: 81 | runs-on: ubuntu-latest 82 | 83 | steps: 84 | - uses: actions/checkout@v2 85 | 86 | - name: Configure CMake 87 | run: cmake -B build -DCMAKE_BUILD_TYPE=Release 88 | 89 | - name: Install RapidFuzz 90 | run: sudo cmake --build build --target install 
91 | 92 | - name: Configure example project 93 | working-directory: examples/cmake_installed 94 | run: cmake -B build -DCMAKE_BUILD_TYPE=Release 95 | 96 | - name: Build example project 97 | working-directory: examples/cmake_installed 98 | run: cmake --build build --config ${{env.BUILD_TYPE}} 99 | 100 | - name: Run example project 101 | working-directory: examples/cmake_installed/build 102 | run: ./foo 103 | 104 | build_cpack_installed: 105 | runs-on: ubuntu-latest 106 | 107 | steps: 108 | - uses: actions/checkout@v2 109 | 110 | - name: Configure CMake 111 | run: cmake -B build -DCMAKE_BUILD_TYPE=Release 112 | 113 | - name: Install RapidFuzz 114 | working-directory: build 115 | run: | 116 | cpack -G DEB 117 | sudo dpkg -i *.deb 118 | 119 | - name: Configure example project 120 | working-directory: examples/cmake_installed 121 | run: cmake -B build -DCMAKE_BUILD_TYPE=Release 122 | 123 | - name: Build example project 124 | working-directory: examples/cmake_installed 125 | run: cmake --build build --config ${{env.BUILD_TYPE}} 126 | 127 | - name: Run example project 128 | working-directory: examples/cmake_installed/build 129 | run: ./foo 130 | -------------------------------------------------------------------------------- /tools/amalgamation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # disclaimer: this file is mostly copied from Catch2 3 | 4 | import os 5 | import re 6 | import datetime 7 | import sys 8 | import subprocess 9 | 10 | root_path = os.path.dirname(os.path.realpath( os.path.dirname(sys.argv[0]))) 11 | version_string = "1.0.2" 12 | 13 | starting_header = os.path.join(root_path, 'rapidfuzz', 'rapidfuzz_all.hpp') 14 | output_header = os.path.join(root_path, 'extras', 'rapidfuzz_amalgamated.hpp') 15 | output_cpp = os.path.join(root_path, 'extras', 'rapidfuzz_amalgamated.cpp') 16 | 17 | # These are the copyright comments in each file, we want to ignore them 18 | def is_copyright_line(line): 
19 | copyright_lines = [ 20 | '/* SPDX-License-Identifier: MIT', 21 | '/* Copyright ' 22 | ] 23 | 24 | for copyright_line in copyright_lines: 25 | if line.startswith(copyright_line): 26 | return True 27 | return False 28 | 29 | 30 | # The header of the amalgamated file: copyright information + explanation 31 | # what this file is. 32 | file_header = '''\ 33 | // Licensed under the MIT License . 34 | // SPDX-License-Identifier: MIT 35 | // RapidFuzz v{version_string} 36 | // Generated: {generation_time} 37 | // ---------------------------------------------------------- 38 | // This file is an amalgamation of multiple different files. 39 | // You probably shouldn't edit it directly. 40 | // ---------------------------------------------------------- 41 | ''' 42 | 43 | # Returns file header with proper version string and generation time 44 | def formatted_file_header(): 45 | return file_header.format(version_string=version_string, 46 | generation_time=datetime.datetime.now()) 47 | 48 | # Which headers were already concatenated (and thus should not be 49 | # processed again) 50 | concatenated_headers = set() 51 | 52 | internal_include_parser = re.compile(r'\s*# *include <(rapidfuzz/.*)>.*') 53 | 54 | def concatenate_file(out, filename: str) -> int: 55 | # Gathers statistics on how many headers were expanded 56 | concatenated = 1 57 | with open(filename, mode='r', encoding='utf-8') as input: 58 | for line in input: 59 | if is_copyright_line(line): 60 | continue 61 | 62 | if line.startswith('#pragma once'): 63 | continue 64 | 65 | m = internal_include_parser.match(line) 66 | # anything that isn't a RapidFuzz header can just be copied to 67 | # the resulting file 68 | if not m: 69 | out.write(line) 70 | continue 71 | 72 | next_header = m.group(1) 73 | # We have to avoid re-expanding the same header over and 74 | # over again 75 | if next_header in concatenated_headers: 76 | continue 77 | concatenated_headers.add(next_header) 78 | out.write("\n") 79 | concatenated += 
concatenate_file(out, os.path.join(root_path, next_header)) 80 | out.write("\n") 81 | 82 | return concatenated 83 | 84 | 85 | def generate_header(): 86 | with open(output_header, mode='w', encoding='utf-8') as header: 87 | header.write(formatted_file_header()) 88 | header.write('#ifndef RAPIDFUZZ_AMALGAMATED_HPP_INCLUDED\n') 89 | header.write('#define RAPIDFUZZ_AMALGAMATED_HPP_INCLUDED\n') 90 | print('Concatenated {} headers'.format(concatenate_file(header, starting_header))) 91 | header.write('#endif // RAPIDFUZZ_AMALGAMATED_HPP_INCLUDED\n') 92 | 93 | # format output properly 94 | subprocess.run(["clang-format", "-i", output_header]) 95 | 96 | generate_header() 97 | 98 | 99 | # Notes: 100 | # * For .cpp files, internal includes have to be stripped and rewritten 101 | # * for .hpp files, internal includes have to be resolved and included 102 | # * The .cpp file needs to start with `#include "catch_amalgamated.hpp" 103 | # * include guards can be left/stripped, doesn't matter 104 | # * *.cpp files should be included sorted, to minimize diffs between versions 105 | # * *.hpp files should also be somehow sorted -> use catch_all.hpp as the 106 | # * entrypoint 107 | # * allow disabling main in the .cpp amalgamation -------------------------------------------------------------------------------- /rapidfuzz/distance.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace rapidfuzz { 17 | 18 | template 19 | std::basic_string editops_apply(const Editops& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, 20 | InputIt2 last2) 21 | { 22 | auto len1 = static_cast(std::distance(first1, last1)); 23 | auto len2 = static_cast(std::distance(first2, last2)); 24 | 25 | std::basic_string 
res_str; 26 | res_str.resize(len1 + len2); 27 | size_t src_pos = 0; 28 | size_t dest_pos = 0; 29 | 30 | for (const auto& op : ops) { 31 | /* matches between last and current editop */ 32 | while (src_pos < op.src_pos) { 33 | res_str[dest_pos] = static_cast(first1[static_cast(src_pos)]); 34 | src_pos++; 35 | dest_pos++; 36 | } 37 | 38 | switch (op.type) { 39 | case EditType::None: 40 | case EditType::Replace: 41 | res_str[dest_pos] = static_cast(first2[static_cast(op.dest_pos)]); 42 | src_pos++; 43 | dest_pos++; 44 | break; 45 | case EditType::Insert: 46 | res_str[dest_pos] = static_cast(first2[static_cast(op.dest_pos)]); 47 | dest_pos++; 48 | break; 49 | case EditType::Delete: src_pos++; break; 50 | } 51 | } 52 | 53 | /* matches after the last editop */ 54 | while (src_pos < len1) { 55 | res_str[dest_pos] = static_cast(first1[static_cast(src_pos)]); 56 | src_pos++; 57 | dest_pos++; 58 | } 59 | 60 | res_str.resize(dest_pos); 61 | return res_str; 62 | } 63 | 64 | template 65 | std::basic_string editops_apply(const Editops& ops, const Sentence1& s1, const Sentence2& s2) 66 | { 67 | return editops_apply(ops, detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), 68 | detail::to_end(s2)); 69 | } 70 | 71 | template 72 | std::basic_string opcodes_apply(const Opcodes& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2, 73 | InputIt2 last2) 74 | { 75 | auto len1 = static_cast(std::distance(first1, last1)); 76 | auto len2 = static_cast(std::distance(first2, last2)); 77 | 78 | std::basic_string res_str; 79 | res_str.resize(len1 + len2); 80 | size_t dest_pos = 0; 81 | 82 | for (const auto& op : ops) { 83 | switch (op.type) { 84 | case EditType::None: 85 | for (auto i = op.src_begin; i < op.src_end; ++i) { 86 | res_str[dest_pos++] = static_cast(first1[static_cast(i)]); 87 | } 88 | break; 89 | case EditType::Replace: 90 | case EditType::Insert: 91 | for (auto i = op.dest_begin; i < op.dest_end; ++i) { 92 | res_str[dest_pos++] = static_cast(first2[static_cast(i)]); 93 
| } 94 | break; 95 | case EditType::Delete: break; 96 | } 97 | } 98 | 99 | res_str.resize(dest_pos); 100 | return res_str; 101 | } 102 | 103 | template 104 | std::basic_string opcodes_apply(const Opcodes& ops, const Sentence1& s1, const Sentence2& s2) 105 | { 106 | return opcodes_apply(ops, detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), 107 | detail::to_end(s2)); 108 | } 109 | 110 | } // namespace rapidfuzz 111 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Prefix.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace rapidfuzz { 11 | 12 | template 13 | int64_t prefix_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 14 | int64_t score_cutoff = std::numeric_limits::max()) 15 | { 16 | return detail::Prefix::distance(first1, last1, first2, last2, score_cutoff, score_cutoff); 17 | } 18 | 19 | template 20 | int64_t prefix_distance(const Sentence1& s1, const Sentence2& s2, 21 | int64_t score_cutoff = std::numeric_limits::max()) 22 | { 23 | return detail::Prefix::distance(s1, s2, score_cutoff, score_cutoff); 24 | } 25 | 26 | template 27 | int64_t prefix_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 28 | int64_t score_cutoff = 0) 29 | { 30 | return detail::Prefix::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff); 31 | } 32 | 33 | template 34 | int64_t prefix_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0) 35 | { 36 | return detail::Prefix::similarity(s1, s2, score_cutoff, score_cutoff); 37 | } 38 | 39 | template 40 | double prefix_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 41 | double score_cutoff = 1.0) 42 | { 43 | return 
detail::Prefix::normalized_distance(first1, last1, first2, last2, score_cutoff, score_cutoff); 44 | } 45 | 46 | template 47 | double prefix_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0) 48 | { 49 | return detail::Prefix::normalized_distance(s1, s2, score_cutoff, score_cutoff); 50 | } 51 | 52 | template 53 | double prefix_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 54 | double score_cutoff = 0.0) 55 | { 56 | return detail::Prefix::normalized_similarity(first1, last1, first2, last2, score_cutoff, score_cutoff); 57 | } 58 | 59 | template 60 | double prefix_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) 61 | { 62 | return detail::Prefix::normalized_similarity(s1, s2, score_cutoff, score_cutoff); 63 | } 64 | 65 | template 66 | struct CachedPrefix : public detail::CachedSimilarityBase, int64_t, 0, 67 | std::numeric_limits::max()> { 68 | template 69 | explicit CachedPrefix(const Sentence1& s1_) : CachedPrefix(detail::to_begin(s1_), detail::to_end(s1_)) 70 | {} 71 | 72 | template 73 | CachedPrefix(InputIt1 first1, InputIt1 last1) : s1(first1, last1) 74 | {} 75 | 76 | private: 77 | friend detail::CachedSimilarityBase, int64_t, 0, 78 | std::numeric_limits::max()>; 79 | friend detail::CachedNormalizedMetricBase>; 80 | 81 | template 82 | int64_t maximum(detail::Range s2) const 83 | { 84 | return std::max(static_cast(s1.size()), s2.size()); 85 | } 86 | 87 | template 88 | int64_t _similarity(detail::Range s2, int64_t score_cutoff, 89 | [[maybe_unused]] int64_t score_hint) const 90 | { 91 | return detail::Prefix::similarity(s1, s2, score_cutoff, score_cutoff); 92 | } 93 | 94 | std::basic_string s1; 95 | }; 96 | 97 | template 98 | explicit CachedPrefix(const Sentence1& s1_) -> CachedPrefix>; 99 | 100 | template 101 | CachedPrefix(InputIt1 first1, InputIt1 last1) -> CachedPrefix>; 102 | 103 | /**@}*/ 104 | 105 | } // namespace rapidfuzz 106 | 
-------------------------------------------------------------------------------- /rapidfuzz/distance/Postfix.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace rapidfuzz { 11 | 12 | template 13 | int64_t postfix_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 14 | int64_t score_cutoff = std::numeric_limits::max()) 15 | { 16 | return detail::Postfix::distance(first1, last1, first2, last2, score_cutoff, score_cutoff); 17 | } 18 | 19 | template 20 | int64_t postfix_distance(const Sentence1& s1, const Sentence2& s2, 21 | int64_t score_cutoff = std::numeric_limits::max()) 22 | { 23 | return detail::Postfix::distance(s1, s2, score_cutoff, score_cutoff); 24 | } 25 | 26 | template 27 | int64_t postfix_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 28 | int64_t score_cutoff = 0) 29 | { 30 | return detail::Postfix::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff); 31 | } 32 | 33 | template 34 | int64_t postfix_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0) 35 | { 36 | return detail::Postfix::similarity(s1, s2, score_cutoff, score_cutoff); 37 | } 38 | 39 | template 40 | double postfix_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 41 | double score_cutoff = 1.0) 42 | { 43 | return detail::Postfix::normalized_distance(first1, last1, first2, last2, score_cutoff, score_cutoff); 44 | } 45 | 46 | template 47 | double postfix_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0) 48 | { 49 | return detail::Postfix::normalized_distance(s1, s2, score_cutoff, score_cutoff); 50 | } 51 | 52 | template 53 | double postfix_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 
last2, 54 | double score_cutoff = 0.0) 55 | { 56 | return detail::Postfix::normalized_similarity(first1, last1, first2, last2, score_cutoff, score_cutoff); 57 | } 58 | 59 | template 60 | double postfix_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) 61 | { 62 | return detail::Postfix::normalized_similarity(s1, s2, score_cutoff, score_cutoff); 63 | } 64 | 65 | template 66 | struct CachedPostfix : public detail::CachedSimilarityBase, int64_t, 0, 67 | std::numeric_limits::max()> { 68 | template 69 | explicit CachedPostfix(const Sentence1& s1_) : CachedPostfix(detail::to_begin(s1_), detail::to_end(s1_)) 70 | {} 71 | 72 | template 73 | CachedPostfix(InputIt1 first1, InputIt1 last1) : s1(first1, last1) 74 | {} 75 | 76 | private: 77 | friend detail::CachedSimilarityBase, int64_t, 0, 78 | std::numeric_limits::max()>; 79 | friend detail::CachedNormalizedMetricBase>; 80 | 81 | template 82 | int64_t maximum(detail::Range s2) const 83 | { 84 | return std::max(static_cast(s1.size()), s2.size()); 85 | } 86 | 87 | template 88 | int64_t _similarity(detail::Range s2, int64_t score_cutoff, 89 | [[maybe_unused]] int64_t score_hint) const 90 | { 91 | return detail::Postfix::similarity(s1, s2, score_cutoff, score_hint); 92 | } 93 | 94 | std::basic_string s1; 95 | }; 96 | 97 | template 98 | explicit CachedPostfix(const Sentence1& s1_) -> CachedPostfix>; 99 | 100 | template 101 | CachedPostfix(InputIt1 first1, InputIt1 last1) -> CachedPostfix>; 102 | 103 | /**@}*/ 104 | 105 | } // namespace rapidfuzz 106 | -------------------------------------------------------------------------------- /test/distance/tests-JaroWinkler.cpp: -------------------------------------------------------------------------------- 1 | #include "../../rapidfuzz_reference/JaroWinkler.hpp" 2 | #include 3 | #include 4 | #include 5 | 6 | using Catch::Approx; 7 | 8 | template 9 | double jaro_winkler_similarity(const Sentence1& s1, const Sentence2& s2, double prefix_weight = 0.1, 
10 | double score_cutoff = 0.0) 11 | { 12 | double res1 = rapidfuzz::jaro_winkler_similarity(s1, s2, prefix_weight, score_cutoff); 13 | double res2 = rapidfuzz::jaro_winkler_similarity(s1.begin(), s1.end(), s2.begin(), s2.end(), 14 | prefix_weight, score_cutoff); 15 | double res3 = rapidfuzz::jaro_winkler_normalized_similarity(s1, s2, prefix_weight, score_cutoff); 16 | double res4 = rapidfuzz::jaro_winkler_normalized_similarity(s1.begin(), s1.end(), s2.begin(), s2.end(), 17 | prefix_weight, score_cutoff); 18 | rapidfuzz::CachedJaroWinkler scorer(s1, prefix_weight); 19 | double res5 = scorer.similarity(s2, score_cutoff); 20 | double res6 = scorer.similarity(s2.begin(), s2.end(), score_cutoff); 21 | double res7 = scorer.similarity(s2, score_cutoff); 22 | double res8 = scorer.similarity(s2.begin(), s2.end(), score_cutoff); 23 | REQUIRE(res1 == Approx(res2)); 24 | REQUIRE(res1 == Approx(res3)); 25 | REQUIRE(res1 == Approx(res4)); 26 | REQUIRE(res1 == Approx(res5)); 27 | REQUIRE(res1 == Approx(res6)); 28 | REQUIRE(res1 == Approx(res7)); 29 | REQUIRE(res1 == Approx(res8)); 30 | return res1; 31 | } 32 | 33 | template 34 | double jaro_winkler_distance(const Sentence1& s1, const Sentence2& s2, double prefix_weight = 0.1, 35 | double score_cutoff = 1.0) 36 | { 37 | double res1 = rapidfuzz::jaro_winkler_distance(s1, s2, prefix_weight, score_cutoff); 38 | double res2 = rapidfuzz::jaro_winkler_distance(s1.begin(), s1.end(), s2.begin(), s2.end(), prefix_weight, 39 | score_cutoff); 40 | double res3 = rapidfuzz::jaro_winkler_normalized_distance(s1, s2, prefix_weight, score_cutoff); 41 | double res4 = rapidfuzz::jaro_winkler_normalized_distance(s1.begin(), s1.end(), s2.begin(), s2.end(), 42 | prefix_weight, score_cutoff); 43 | rapidfuzz::CachedJaroWinkler scorer(s1, prefix_weight); 44 | double res5 = scorer.distance(s2, score_cutoff); 45 | double res6 = scorer.distance(s2.begin(), s2.end(), score_cutoff); 46 | double res7 = scorer.distance(s2, score_cutoff); 47 | double res8 = 
scorer.distance(s2.begin(), s2.end(), score_cutoff); 48 | REQUIRE(res1 == Approx(res2)); 49 | REQUIRE(res1 == Approx(res3)); 50 | REQUIRE(res1 == Approx(res4)); 51 | REQUIRE(res1 == Approx(res5)); 52 | REQUIRE(res1 == Approx(res6)); 53 | REQUIRE(res1 == Approx(res7)); 54 | REQUIRE(res1 == Approx(res8)); 55 | return res1; 56 | } 57 | 58 | /** 59 | * @name JaroWinklerFlagCharsTest 60 | */ 61 | TEST_CASE("JaroWinklerTest") 62 | { 63 | std::array names = {"james", "robert", "john", "michael", "william", 64 | "david", "joseph", "thomas", "charles", "mary", 65 | "patricia", "jennifer", "linda", "elizabeth", "barbara", 66 | "susan", "jessica", "sarah", "karen"}; 67 | 68 | SECTION("testFullResultWithScoreCutoff") 69 | { 70 | for (double score_cutoff = 0.0; score_cutoff < 1.1; score_cutoff += 0.1) 71 | for (const auto& name1 : names) 72 | for (const auto& name2 : names) { 73 | INFO("name1: " << name1 << ", name2: " << name2 << ", score_cutoff: " << score_cutoff); 74 | double Sim_original = 75 | rapidfuzz_reference::jaro_winkler_similarity(name1, name2, 0.1, score_cutoff); 76 | double Sim_bitparallel = jaro_winkler_similarity(name1, name2, 0.1, score_cutoff); 77 | double Dist_bitparallel = jaro_winkler_distance(name1, name2, 0.1, 1.0 - score_cutoff); 78 | REQUIRE(Sim_original == Approx(Sim_bitparallel)); 79 | REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel)); 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /bench/bench-lcs.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | std::string generate(int max_length) 9 | { 10 | std::string possible_characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; 11 | std::random_device rd; 12 | std::mt19937 engine(rd()); 13 | std::uniform_int_distribution<> dist(0, static_cast(possible_characters.size() - 1)); 14 | std::string ret = ""; 15 
| for (int i = 0; i < max_length; i++) { 16 | int random_index = dist(engine); 17 | ret += possible_characters[static_cast(random_index)]; 18 | } 19 | return ret; 20 | } 21 | 22 | template 23 | static void BM_LCS(benchmark::State& state) 24 | { 25 | std::vector seq1; 26 | std::vector seq2; 27 | for (int i = 0; i < 256; i++) 28 | seq1.push_back(generate(MaxLen)); 29 | for (int i = 0; i < 10000; i++) 30 | seq2.push_back(generate(MaxLen)); 31 | 32 | size_t num = 0; 33 | for (auto _ : state) { 34 | for (size_t j = 0; j < seq2.size(); ++j) 35 | for (size_t i = 0; i < seq1.size(); ++i) 36 | benchmark::DoNotOptimize(rapidfuzz::lcs_seq_distance(seq1[i], seq2[j])); 37 | 38 | num += seq1.size() * seq2.size(); 39 | } 40 | 41 | state.counters["Rate"] = benchmark::Counter(static_cast(num), benchmark::Counter::kIsRate); 42 | state.counters["InvRate"] = benchmark::Counter(static_cast(num), 43 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 44 | } 45 | 46 | template 47 | static void BM_LCS_Cached(benchmark::State& state) 48 | { 49 | std::vector seq1; 50 | std::vector seq2; 51 | for (int i = 0; i < 256; i++) 52 | seq1.push_back(generate(MaxLen)); 53 | for (int i = 0; i < 10000; i++) 54 | seq2.push_back(generate(MaxLen)); 55 | 56 | size_t num = 0; 57 | for (auto _ : state) { 58 | for (const auto& str1 : seq1) { 59 | rapidfuzz::CachedLCSseq scorer(str1); 60 | for (size_t j = 0; j < seq2.size(); ++j) 61 | benchmark::DoNotOptimize(scorer.similarity(seq2[j])); 62 | } 63 | num += seq1.size() * seq2.size(); 64 | } 65 | 66 | state.counters["Rate"] = benchmark::Counter(static_cast(num), benchmark::Counter::kIsRate); 67 | state.counters["InvRate"] = benchmark::Counter(static_cast(num), 68 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 69 | } 70 | 71 | #ifdef RAPIDFUZZ_SIMD 72 | template 73 | static void BM_LCS_SIMD(benchmark::State& state) 74 | { 75 | std::vector seq1; 76 | std::vector seq2; 77 | std::vector results(64); 78 | for (int i = 0; i < 64; i++) 79 | 
seq1.push_back(generate(MaxLen)); 80 | for (int i = 0; i < 10000; i++) 81 | seq2.push_back(generate(MaxLen)); 82 | 83 | size_t num = 0; 84 | for (auto _ : state) { 85 | rapidfuzz::experimental::MultiLCSseq scorer(seq1.size()); 86 | for (const auto& str1 : seq1) 87 | scorer.insert(str1); 88 | 89 | for (const auto& str2 : seq2) 90 | scorer.similarity(&results[0], results.size(), str2); 91 | 92 | num += seq1.size() * seq2.size(); 93 | } 94 | 95 | state.counters["Rate"] = benchmark::Counter(static_cast(num), benchmark::Counter::kIsRate); 96 | state.counters["InvRate"] = benchmark::Counter(static_cast(num), 97 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 98 | } 99 | #endif 100 | 101 | BENCHMARK_TEMPLATE(BM_LCS, 8); 102 | BENCHMARK_TEMPLATE(BM_LCS, 16); 103 | BENCHMARK_TEMPLATE(BM_LCS, 32); 104 | BENCHMARK_TEMPLATE(BM_LCS, 64); 105 | 106 | BENCHMARK_TEMPLATE(BM_LCS_Cached, 8); 107 | BENCHMARK_TEMPLATE(BM_LCS_Cached, 16); 108 | BENCHMARK_TEMPLATE(BM_LCS_Cached, 32); 109 | BENCHMARK_TEMPLATE(BM_LCS_Cached, 64); 110 | 111 | #ifdef RAPIDFUZZ_SIMD 112 | BENCHMARK_TEMPLATE(BM_LCS_SIMD, 8); 113 | BENCHMARK_TEMPLATE(BM_LCS_SIMD, 16); 114 | BENCHMARK_TEMPLATE(BM_LCS_SIMD, 32); 115 | BENCHMARK_TEMPLATE(BM_LCS_SIMD, 64); 116 | #endif 117 | 118 | BENCHMARK_MAIN(); -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Cmake config largely taken from catch2 2 | cmake_minimum_required(VERSION 3.5) 3 | 4 | if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24) 5 | cmake_policy(SET CMP0135 NEW) 6 | endif() 7 | 8 | # detect if Catch is being bundled, 9 | # disable testsuite in that case 10 | if(NOT DEFINED PROJECT_NAME) 11 | set(NOT_SUBPROJECT ON) 12 | else() 13 | set(NOT_SUBPROJECT OFF) 14 | endif() 15 | 16 | option(RAPIDFUZZ_BUILD_TESTING "Build tests" OFF) 17 | option(RAPIDFUZZ_ENABLE_LINTERS "Enable Linters for the test builds" OFF) 18 | 
option(RAPIDFUZZ_BUILD_BENCHMARKS "Build benchmarks" OFF) 19 | option(RAPIDFUZZ_BUILD_FUZZERS "Build fuzzers" OFF) 20 | 21 | # RapidFuzz's build breaks if done in-tree. You probably should not build 22 | # things in tree anyway, but we can allow projects that include RapidFuzz 23 | # as a subproject to build in-tree as long as it is not in our tree. 24 | if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) 25 | message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt") 26 | endif() 27 | 28 | project(rapidfuzz LANGUAGES CXX VERSION 1.10.4) 29 | 30 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") 31 | include(GNUInstallDirs) 32 | include(CMakePackageConfigHelpers) 33 | 34 | # Basic paths 35 | set(BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) 36 | set(SOURCES_DIR ${BASE_DIR}/rapidfuzz) 37 | set(TEST_DIR ${BASE_DIR}/test) 38 | set(BENCHMARK_DIR ${BASE_DIR}/tests/bench) 39 | set(EXAMPLES_DIR ${BASE_DIR}/examples) 40 | 41 | add_library(rapidfuzz INTERFACE) 42 | 43 | # provide a namespaced alias for clients to 'link' against if RapidFuzz is included as a sub-project 44 | add_library(rapidfuzz::rapidfuzz ALIAS rapidfuzz) 45 | 46 | target_compile_features(rapidfuzz INTERFACE cxx_std_17) 47 | 48 | target_include_directories(rapidfuzz 49 | INTERFACE 50 | $ 51 | $ 52 | ) 53 | 54 | # Build tests only if requested 55 | if(RAPIDFUZZ_BUILD_TESTING AND NOT_SUBPROJECT) 56 | include(CTest) 57 | enable_testing() 58 | add_subdirectory(test) 59 | endif() 60 | 61 | # Build examples only if requested 62 | if(RAPIDFUZZ_BUILD_EXAMPLES) 63 | #add_subdirectory(examples) 64 | endif() 65 | 66 | # Build benchmarks only if requested 67 | if(RAPIDFUZZ_BUILD_BENCHMARKS) 68 | add_subdirectory(bench) 69 | endif() 70 | 71 | # Build fuzz tests only if requested 72 | if(RAPIDFUZZ_BUILD_FUZZERS) 73 | add_subdirectory(fuzzing) 74 | endif() 75 | 76 | # Only perform the installation steps when RapidFuzz is not being used as 77 | 
# a subproject via `add_subdirectory` 78 | if (NOT_SUBPROJECT) 79 | set(RAPIDFUZZ_CMAKE_CONFIG_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/rapidfuzz") 80 | 81 | install( 82 | TARGETS 83 | rapidfuzz 84 | EXPORT 85 | rapidfuzzTargets 86 | DESTINATION 87 | ${CMAKE_INSTALL_LIBDIR} 88 | ) 89 | 90 | install( 91 | EXPORT 92 | rapidfuzzTargets 93 | NAMESPACE 94 | rapidfuzz:: 95 | DESTINATION 96 | ${RAPIDFUZZ_CMAKE_CONFIG_DESTINATION} 97 | ) 98 | 99 | install( 100 | DIRECTORY 101 | rapidfuzz 102 | DESTINATION 103 | ${CMAKE_INSTALL_INCLUDEDIR} 104 | FILES_MATCHING 105 | PATTERN "*.hpp" 106 | PATTERN "*.impl" 107 | ) 108 | 109 | configure_package_config_file( 110 | ${CMAKE_CURRENT_LIST_DIR}/cmake/${PROJECT_NAME}Config.cmake.in 111 | ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake 112 | INSTALL_DESTINATION ${RAPIDFUZZ_CMAKE_CONFIG_DESTINATION} 113 | ) 114 | 115 | write_basic_package_version_file( 116 | "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" 117 | COMPATIBILITY SameMajorVersion 118 | ) 119 | 120 | install( 121 | FILES 122 | "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" 123 | "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" 124 | DESTINATION 125 | ${RAPIDFUZZ_CMAKE_CONFIG_DESTINATION} 126 | ) 127 | 128 | # CPack/CMake started taking the package version from project version 3.12 129 | # So we need to set the version manually for older CMake versions 130 | if(${CMAKE_VERSION} VERSION_LESS "3.12.0") 131 | set(CPACK_PACKAGE_VERSION ${PROJECT_VERSION}) 132 | endif() 133 | 134 | set(CPACK_PACKAGE_VENDOR "Max Bachmann") 135 | set(CPACK_PACKAGE_CONTACT "https://github.com/maxbachmann/rapidfuzz-cpp") 136 | include(CPack) 137 | 138 | endif(NOT_SUBPROJECT) 139 | -------------------------------------------------------------------------------- /rapidfuzz/details/Range.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright (c) 2022 Max Bachmann */ 
#pragma once

// NOTE(review): the include targets and all template argument lists in this
// header were reconstructed from how the names are used below -- confirm them
// against the original header before merging.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <limits>
#include <ostream>
#include <stdexcept>
#include <vector>

namespace rapidfuzz::detail {

/**
 * @brief Optimizer hint stating that `b` is true.
 *
 * Maps to `__builtin_assume` on clang, to a `__builtin_unreachable()` guarded
 * by `!b` on GCC and to `__assume` on MSVC. On any other compiler the body is
 * empty and the call has no effect.
 */
static inline void assume(bool b)
{
#if defined(__clang__)
    __builtin_assume(b);
#elif defined(__GNUC__) || defined(__GNUG__)
    if (!b) __builtin_unreachable();
#elif defined(_MSC_VER)
    __assume(b);
#endif
}

/** @brief Begin of a zero-terminated character sequence: the pointer itself. */
template <typename CharT>
CharT* to_begin(CharT* s)
{
    return s;
}

/** @brief Begin of a container, found through `begin` (std or ADL). */
template <typename T>
auto to_begin(T& x)
{
    using std::begin;
    return begin(x);
}

/**
 * @brief End of a zero-terminated character sequence.
 *
 * Advances from `s` until an element comparing equal to 0 is found and returns
 * a pointer to that terminator. `s` is assumed to be non-null.
 */
template <typename CharT>
CharT* to_end(CharT* s)
{
    assume(s != nullptr);
    while (*s != 0)
        ++s;

    return s;
}

/** @brief End of a container, found through `end` (std or ADL). */
template <typename T>
auto to_end(T& x)
{
    using std::end;
    return end(x);
}

/**
 * @brief Non-owning view over the iterator pair `[first, last)`.
 *
 * The view stores only the two iterators; the underlying sequence has to
 * outlive it. Several members (`operator[]`, `remove_prefix`, `remove_suffix`,
 * `subseq`, `back`) use iterator arithmetic and therefore need random access
 * iterators; `rbegin`/`rend`/`reversed` need at least bidirectional ones.
 */
template <typename Iter>
class Range {
    Iter _first;
    Iter _last;

public:
    using value_type = typename std::iterator_traits<Iter>::value_type;
    using iterator = Iter;
    using reverse_iterator = std::reverse_iterator<iterator>;

    constexpr Range(Iter first, Iter last) : _first(first), _last(last)
    {}

    /** @brief View over everything between `to_begin(x)` and `to_end(x)`. */
    template <typename T>
    constexpr Range(T& x) : _first(to_begin(x)), _last(to_end(x))
    {}

    constexpr iterator begin() const noexcept
    {
        return _first;
    }
    constexpr iterator end() const noexcept
    {
        return _last;
    }

    constexpr reverse_iterator rbegin() const noexcept
    {
        return reverse_iterator(end());
    }
    constexpr reverse_iterator rend() const noexcept
    {
        return reverse_iterator(begin());
    }

    /** @brief Number of elements in the view (signed). */
    constexpr ptrdiff_t size() const
    {
        return std::distance(_first, _last);
    }
    constexpr bool empty() const
    {
        return size() == 0;
    }
    /** @brief True when the view is not empty. */
    explicit constexpr operator bool() const
    {
        return !empty();
    }
    /** @brief Unchecked element access. */
    constexpr decltype(auto) operator[](ptrdiff_t n) const
    {
        return _first[n];
    }

    /** @brief Drops the first `n` elements; `n` is not range checked. */
    constexpr void remove_prefix(ptrdiff_t n)
    {
        _first += n;
    }
    /** @brief Drops the last `n` elements; `n` is not range checked. */
    constexpr void remove_suffix(ptrdiff_t n)
    {
        _last -= n;
    }

    /**
     * @brief Sub-view starting at `pos` with at most `count` elements.
     *
     * @param pos    index of the first element of the sub-view
     * @param count  maximum length; clamped to the elements available after `pos`
     * @return view over `[pos, pos + min(count, size() - pos))`
     * @throws std::out_of_range when `pos > size()`
     */
    constexpr Range subseq(ptrdiff_t pos = 0, ptrdiff_t count = std::numeric_limits<ptrdiff_t>::max()) const
    {
        if (pos > size()) throw std::out_of_range("Index out of range in Range::subseq");

        auto start = _first + pos;
        /* compare against the remaining length first so `start + count` never runs past `_last` */
        if (std::distance(start, _last) < count) return {start, _last};
        return {start, start + count};
    }

    /** @brief First element; the view must not be empty. */
    constexpr decltype(auto) front() const
    {
        return *(_first);
    }

    /** @brief Last element; the view must not be empty. */
    constexpr decltype(auto) back() const
    {
        return *(_last - 1);
    }

    /** @brief The same elements viewed back to front. */
    constexpr Range<reverse_iterator> reversed() const
    {
        return {rbegin(), rend()};
    }

    /**
     * @brief Debug output: `[` then every element as a number followed by `, `, then `]`.
     *
     * The separator is also written after the last element.
     */
    friend std::ostream& operator<<(std::ostream& os, const Range& seq)
    {
        os << "[";
        for (auto x : seq)
            /* NOTE(review): cast target reconstructed as uint64_t -- confirm */
            os << static_cast<uint64_t>(x) << ", ";
        os << "]";
        return os;
    }
};

template <typename T>
Range(T& x) -> Range<decltype(to_begin(x))>;

/** @brief Element-wise equality; views of different length are never equal. */
template <typename InputIt1, typename InputIt2>
inline bool operator==(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return std::equal(a.begin(), a.end(), b.begin(), b.end());
}

template <typename InputIt1, typename InputIt2>
inline bool operator!=(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return !(a == b);
}

/** @brief Lexicographical ordering; the remaining relational operators derive from it. */
template <typename InputIt1, typename InputIt2>
inline bool operator<(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return (std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end()));
}

template <typename InputIt1, typename InputIt2>
inline bool operator>(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return b < a;
}

template <typename InputIt1, typename InputIt2>
inline bool operator<=(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return !(b < a);
}

template <typename InputIt1, typename InputIt2>
inline bool operator>=(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return !(a < b);
}

template <typename InputIt>
using RangeVec = std::vector<Range<InputIt>>;

} // namespace rapidfuzz::detail
-------------------------------------------------------------------------------- /rapidfuzz/details/common_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2020 Max Bachmann */ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace rapidfuzz::detail { 11 | 12 | template 13 | DecomposedSet set_decomposition(SplittedSentenceView a, 14 | SplittedSentenceView b) 15 | { 16 | a.dedupe(); 17 | b.dedupe(); 18 | 19 | RangeVec intersection; 20 | RangeVec difference_ab; 21 | RangeVec difference_ba = b.words(); 22 | 23 | for (const auto& current_a : a.words()) { 24 | auto element_b = std::find(difference_ba.begin(), difference_ba.end(), current_a); 25 | 26 | if (element_b != difference_ba.end()) { 27 | difference_ba.erase(element_b); 28 | intersection.push_back(current_a); 29 | } 30 | else { 31 | difference_ab.push_back(current_a); 32 | } 33 | } 34 | 35 | return {difference_ab, difference_ba, intersection}; 36 | } 37 | 38 | /** 39 | * Removes common prefix of two string views 40 | */ 41 | template 42 | size_t remove_common_prefix(Range& s1, Range& s2) 43 | { 44 | auto first1 = std::begin(s1); 45 | auto prefix = 46 | std::distance(first1, std::mismatch(first1, std::end(s1), std::begin(s2), std::end(s2)).first); 47 | s1.remove_prefix(prefix); 48 | s2.remove_prefix(prefix); 49 | return static_cast(prefix); 50 | } 51 | 52 | /** 53 | * Removes common suffix of two string views 54 | */ 55 | template 56 | size_t remove_common_suffix(Range& s1, Range& s2) 57 | { 58 | auto rfirst1 = std::rbegin(s1); 59 | auto suffix = 60 | std::distance(rfirst1, std::mismatch(rfirst1, std::rend(s1), std::rbegin(s2), std::rend(s2)).first); 61 | s1.remove_suffix(suffix); 62 | s2.remove_suffix(suffix); 63 | return static_cast(suffix); 64 | } 65 | 66 | /** 67 | * Removes common affix of two string views 68 | */ 69 | template 70 | StringAffix remove_common_affix(Range& s1, 
Range& s2) 71 | { 72 | return StringAffix{remove_common_prefix(s1, s2), remove_common_suffix(s1, s2)}; 73 | } 74 | 75 | template 76 | struct is_space_dispatch_tag : std::integral_constant {}; 77 | 78 | template 79 | struct is_space_dispatch_tag::type> 80 | : std::integral_constant {}; 81 | 82 | /* 83 | * Implementation of is_space for char types that are at least 2 Byte in size 84 | */ 85 | template 86 | bool is_space_impl(const CharT ch, std::integral_constant) 87 | { 88 | switch (ch) { 89 | case 0x0009: 90 | case 0x000A: 91 | case 0x000B: 92 | case 0x000C: 93 | case 0x000D: 94 | case 0x001C: 95 | case 0x001D: 96 | case 0x001E: 97 | case 0x001F: 98 | case 0x0020: 99 | case 0x0085: 100 | case 0x00A0: 101 | case 0x1680: 102 | case 0x2000: 103 | case 0x2001: 104 | case 0x2002: 105 | case 0x2003: 106 | case 0x2004: 107 | case 0x2005: 108 | case 0x2006: 109 | case 0x2007: 110 | case 0x2008: 111 | case 0x2009: 112 | case 0x200A: 113 | case 0x2028: 114 | case 0x2029: 115 | case 0x202F: 116 | case 0x205F: 117 | case 0x3000: return true; 118 | } 119 | return false; 120 | } 121 | 122 | /* 123 | * Implementation of is_space for char types that are 1 Byte in size 124 | */ 125 | template 126 | bool is_space_impl(const CharT ch, std::integral_constant) 127 | { 128 | switch (ch) { 129 | case 0x0009: 130 | case 0x000A: 131 | case 0x000B: 132 | case 0x000C: 133 | case 0x000D: 134 | case 0x001C: 135 | case 0x001D: 136 | case 0x001E: 137 | case 0x001F: 138 | case 0x0020: return true; 139 | } 140 | return false; 141 | } 142 | 143 | /* 144 | * checks whether unicode characters have the bidirectional 145 | * type 'WS', 'B' or 'S' or the category 'Zs' 146 | */ 147 | template 148 | bool is_space(const CharT ch) 149 | { 150 | return is_space_impl(ch, is_space_dispatch_tag{}); 151 | } 152 | 153 | template 154 | SplittedSentenceView sorted_split(InputIt first, InputIt last) 155 | { 156 | RangeVec splitted; 157 | auto second = first; 158 | 159 | for (; first != last; first = second + 1) { 
160 | second = std::find_if(first, last, is_space); 161 | 162 | if (first != second) { 163 | splitted.emplace_back(first, second); 164 | } 165 | 166 | if (second == last) break; 167 | } 168 | 169 | std::sort(splitted.begin(), splitted.end()); 170 | 171 | return SplittedSentenceView(splitted); 172 | } 173 | 174 | } // namespace rapidfuzz::detail 175 | -------------------------------------------------------------------------------- /rapidfuzz/distance/DamerauLevenshtein_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace rapidfuzz::detail { 17 | 18 | template 19 | struct RowId { 20 | IntType val = -1; 21 | friend bool operator==(const RowId& lhs, const RowId& rhs) 22 | { 23 | return lhs.val == rhs.val; 24 | } 25 | 26 | friend bool operator!=(const RowId& lhs, const RowId& rhs) 27 | { 28 | return !(lhs == rhs); 29 | } 30 | }; 31 | 32 | /* 33 | * based on the paper 34 | * "Linear space string correction algorithm using the Damerau-Levenshtein distance" 35 | * from Chunchun Zhao and Sartaj Sahni 36 | */ 37 | template 38 | int64_t damerau_levenshtein_distance_zhao(Range s1, Range s2, int64_t max) 39 | { 40 | IntType len1 = static_cast(s1.size()); 41 | IntType len2 = static_cast(s2.size()); 42 | IntType maxVal = static_cast(std::max(len1, len2) + 1); 43 | assert(std::numeric_limits::max() > maxVal); 44 | 45 | HybridGrowingHashmap::value_type, RowId> last_row_id; 46 | size_t size = static_cast(s2.size() + 2); 47 | assume(size != 0); 48 | std::vector FR_arr(size, maxVal); 49 | std::vector R1_arr(size, maxVal); 50 | std::vector R_arr(size); 51 | R_arr[0] = maxVal; 52 | std::iota(R_arr.begin() + 1, R_arr.end(), IntType(0)); 53 | 54 | IntType* R = &R_arr[1]; 55 | IntType* R1 = 
&R1_arr[1]; 56 | IntType* FR = &FR_arr[1]; 57 | 58 | for (IntType i = 1; i <= len1; i++) { 59 | std::swap(R, R1); 60 | IntType last_col_id = -1; 61 | IntType last_i2l1 = R[0]; 62 | R[0] = i; 63 | IntType T = maxVal; 64 | 65 | for (IntType j = 1; j <= len2; j++) { 66 | ptrdiff_t diag = R1[j - 1] + static_cast(s1[i - 1] != s2[j - 1]); 67 | ptrdiff_t left = R[j - 1] + 1; 68 | ptrdiff_t up = R1[j] + 1; 69 | ptrdiff_t temp = std::min({diag, left, up}); 70 | 71 | if (s1[i - 1] == s2[j - 1]) { 72 | last_col_id = j; // last occurrence of s1_i 73 | FR[j] = R1[j - 2]; // save H_k-1,j-2 74 | T = last_i2l1; // save H_i-2,l-1 75 | } 76 | else { 77 | ptrdiff_t k = last_row_id.get(static_cast(s2[j - 1])).val; 78 | ptrdiff_t l = last_col_id; 79 | 80 | if ((j - l) == 1) { 81 | ptrdiff_t transpose = FR[j] + (i - k); 82 | temp = std::min(temp, transpose); 83 | } 84 | else if ((i - k) == 1) { 85 | ptrdiff_t transpose = T + (j - l); 86 | temp = std::min(temp, transpose); 87 | } 88 | } 89 | 90 | last_i2l1 = R[j]; 91 | R[j] = static_cast(temp); 92 | } 93 | last_row_id[s1[i - 1]].val = i; 94 | } 95 | 96 | int64_t dist = R[s2.size()]; 97 | return (dist <= max) ? 
dist : max + 1; 98 | } 99 | 100 | template 101 | int64_t damerau_levenshtein_distance(Range s1, Range s2, int64_t max) 102 | { 103 | int64_t min_edits = std::abs(s1.size() - s2.size()); 104 | if (min_edits > max) return max + 1; 105 | 106 | /* common affix does not affect Levenshtein distance */ 107 | remove_common_affix(s1, s2); 108 | 109 | ptrdiff_t maxVal = std::max(s1.size(), s2.size()) + 1; 110 | if (std::numeric_limits::max() > maxVal) 111 | return damerau_levenshtein_distance_zhao(s1, s2, max); 112 | else if (std::numeric_limits::max() > maxVal) 113 | return damerau_levenshtein_distance_zhao(s1, s2, max); 114 | else 115 | return damerau_levenshtein_distance_zhao(s1, s2, max); 116 | } 117 | 118 | class DamerauLevenshtein 119 | : public DistanceBase::max()> { 120 | friend DistanceBase::max()>; 121 | friend NormalizedMetricBase; 122 | 123 | template 124 | static int64_t maximum(Range s1, Range s2) 125 | { 126 | return std::max(s1.size(), s2.size()); 127 | } 128 | 129 | template 130 | static int64_t _distance(Range s1, Range s2, int64_t score_cutoff, 131 | [[maybe_unused]] int64_t score_hint) 132 | { 133 | return damerau_levenshtein_distance(s1, s2, score_cutoff); 134 | } 135 | }; 136 | 137 | } // namespace rapidfuzz::detail -------------------------------------------------------------------------------- /rapidfuzz/distance/JaroWinkler.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | 6 | #include "rapidfuzz/details/Range.hpp" 7 | #include 8 | #include 9 | 10 | namespace rapidfuzz { 11 | 12 | template >> 14 | double jaro_winkler_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 15 | double prefix_weight = 0.1, double score_cutoff = 1.0) 16 | { 17 | return detail::JaroWinkler::distance(first1, last1, first2, last2, prefix_weight, score_cutoff, 18 | score_cutoff); 19 | } 20 | 21 | template 22 | 
double jaro_winkler_distance(const Sentence1& s1, const Sentence2& s2, double prefix_weight = 0.1, 23 | double score_cutoff = 1.0) 24 | { 25 | return detail::JaroWinkler::distance(s1, s2, prefix_weight, score_cutoff, score_cutoff); 26 | } 27 | 28 | template >> 30 | double jaro_winkler_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 31 | double prefix_weight = 0.1, double score_cutoff = 0.0) 32 | { 33 | return detail::JaroWinkler::similarity(first1, last1, first2, last2, prefix_weight, score_cutoff, 34 | score_cutoff); 35 | } 36 | 37 | template 38 | double jaro_winkler_similarity(const Sentence1& s1, const Sentence2& s2, double prefix_weight = 0.1, 39 | double score_cutoff = 0.0) 40 | { 41 | return detail::JaroWinkler::similarity(s1, s2, prefix_weight, score_cutoff, score_cutoff); 42 | } 43 | 44 | template >> 46 | double jaro_winkler_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 47 | double prefix_weight = 0.1, double score_cutoff = 1.0) 48 | { 49 | return detail::JaroWinkler::normalized_distance(first1, last1, first2, last2, prefix_weight, score_cutoff, 50 | score_cutoff); 51 | } 52 | 53 | template 54 | double jaro_winkler_normalized_distance(const Sentence1& s1, const Sentence2& s2, double prefix_weight = 0.1, 55 | double score_cutoff = 1.0) 56 | { 57 | return detail::JaroWinkler::normalized_distance(s1, s2, prefix_weight, score_cutoff, score_cutoff); 58 | } 59 | 60 | template >> 62 | double jaro_winkler_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 63 | double prefix_weight = 0.1, double score_cutoff = 0.0) 64 | { 65 | return detail::JaroWinkler::normalized_similarity(first1, last1, first2, last2, prefix_weight, 66 | score_cutoff, score_cutoff); 67 | } 68 | 69 | template 70 | double jaro_winkler_normalized_similarity(const Sentence1& s1, const Sentence2& s2, 71 | double prefix_weight = 0.1, double score_cutoff = 0.0) 72 | { 73 | return 
detail::JaroWinkler::normalized_similarity(s1, s2, prefix_weight, score_cutoff, score_cutoff); 74 | } 75 | 76 | template 77 | struct CachedJaroWinkler : public detail::CachedSimilarityBase, double, 0, 1> { 78 | template 79 | explicit CachedJaroWinkler(const Sentence1& s1_, double _prefix_weight = 0.1) 80 | : CachedJaroWinkler(detail::to_begin(s1_), detail::to_end(s1_), _prefix_weight) 81 | {} 82 | 83 | template 84 | CachedJaroWinkler(InputIt1 first1, InputIt1 last1, double _prefix_weight = 0.1) 85 | : prefix_weight(_prefix_weight), s1(first1, last1), PM(detail::Range(first1, last1)) 86 | {} 87 | 88 | private: 89 | friend detail::CachedSimilarityBase, double, 0, 1>; 90 | friend detail::CachedNormalizedMetricBase>; 91 | 92 | template 93 | double maximum(detail::Range) const 94 | { 95 | return 1.0; 96 | } 97 | 98 | template 99 | double _similarity(detail::Range s2, double score_cutoff, 100 | [[maybe_unused]] double score_hint) const 101 | { 102 | return detail::jaro_winkler_similarity(PM, detail::Range(s1), s2, prefix_weight, score_cutoff); 103 | } 104 | 105 | double prefix_weight; 106 | std::basic_string s1; 107 | detail::BlockPatternMatchVector PM; 108 | }; 109 | 110 | template 111 | explicit CachedJaroWinkler(const Sentence1& s1_, double _prefix_weight = 0.1) 112 | -> CachedJaroWinkler>; 113 | 114 | template 115 | CachedJaroWinkler(InputIt1 first1, InputIt1 last1, double _prefix_weight = 0.1) 116 | -> CachedJaroWinkler>; 117 | 118 | } // namespace rapidfuzz 119 | -------------------------------------------------------------------------------- /rapidfuzz/details/Matrix.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright (c) 2022 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace rapidfuzz::detail { 15 | 16 | template 17 | struct BitMatrixView { 18 | 19 | using 
value_type = T; 20 | using size_type = size_t; 21 | using pointer = std::conditional_t; 22 | using reference = std::conditional_t; 23 | 24 | BitMatrixView(pointer vector, size_type cols) noexcept : m_vector(vector), m_cols(cols) 25 | {} 26 | 27 | reference operator[](size_type col) noexcept 28 | { 29 | assert(col < m_cols); 30 | return m_vector[col]; 31 | } 32 | 33 | size_type size() const noexcept 34 | { 35 | return m_cols; 36 | } 37 | 38 | private: 39 | pointer m_vector; 40 | size_type m_cols; 41 | }; 42 | 43 | template 44 | struct BitMatrix { 45 | 46 | using value_type = T; 47 | 48 | BitMatrix() : m_rows(0), m_cols(0), m_matrix(nullptr) 49 | {} 50 | 51 | BitMatrix(size_t rows, size_t cols, T val) : m_rows(rows), m_cols(cols), m_matrix(nullptr) 52 | { 53 | if (m_rows && m_cols) m_matrix = new T[m_rows * m_cols]; 54 | std::fill_n(m_matrix, m_rows * m_cols, val); 55 | } 56 | 57 | BitMatrix(const BitMatrix& other) : m_rows(other.m_rows), m_cols(other.m_cols), m_matrix(nullptr) 58 | { 59 | if (m_rows && m_cols) m_matrix = new T[m_rows * m_cols]; 60 | std::copy(other.m_matrix, other.m_matrix + m_rows * m_cols, m_matrix); 61 | } 62 | 63 | BitMatrix(BitMatrix&& other) noexcept : m_rows(0), m_cols(0), m_matrix(nullptr) 64 | { 65 | other.swap(*this); 66 | } 67 | 68 | BitMatrix& operator=(BitMatrix&& other) noexcept 69 | { 70 | other.swap(*this); 71 | return *this; 72 | } 73 | 74 | BitMatrix& operator=(const BitMatrix& other) 75 | { 76 | BitMatrix temp = other; 77 | temp.swap(*this); 78 | return *this; 79 | } 80 | 81 | void swap(BitMatrix& rhs) noexcept 82 | { 83 | using std::swap; 84 | swap(m_rows, rhs.m_rows); 85 | swap(m_cols, rhs.m_cols); 86 | swap(m_matrix, rhs.m_matrix); 87 | } 88 | 89 | ~BitMatrix() 90 | { 91 | delete[] m_matrix; 92 | } 93 | 94 | BitMatrixView operator[](size_t row) noexcept 95 | { 96 | assert(row < m_rows); 97 | return {&m_matrix[row * m_cols], m_cols}; 98 | } 99 | 100 | BitMatrixView operator[](size_t row) const noexcept 101 | { 102 | assert(row < 
m_rows); 103 | return {&m_matrix[row * m_cols], m_cols}; 104 | } 105 | 106 | size_t rows() const noexcept 107 | { 108 | return m_rows; 109 | } 110 | 111 | size_t cols() const noexcept 112 | { 113 | return m_cols; 114 | } 115 | 116 | private: 117 | size_t m_rows; 118 | size_t m_cols; 119 | T* m_matrix; 120 | }; 121 | 122 | template 123 | struct ShiftedBitMatrix { 124 | using value_type = T; 125 | 126 | ShiftedBitMatrix() 127 | {} 128 | 129 | ShiftedBitMatrix(size_t rows, size_t cols, T val) : m_matrix(rows, cols, val), m_offsets(rows) 130 | {} 131 | 132 | ShiftedBitMatrix(const ShiftedBitMatrix& other) : m_matrix(other.m_matrix), m_offsets(other.m_offsets) 133 | {} 134 | 135 | ShiftedBitMatrix(ShiftedBitMatrix&& other) noexcept 136 | { 137 | other.swap(*this); 138 | } 139 | 140 | ShiftedBitMatrix& operator=(ShiftedBitMatrix&& other) noexcept 141 | { 142 | other.swap(*this); 143 | return *this; 144 | } 145 | 146 | ShiftedBitMatrix& operator=(const ShiftedBitMatrix& other) 147 | { 148 | ShiftedBitMatrix temp = other; 149 | temp.swap(*this); 150 | return *this; 151 | } 152 | 153 | void swap(ShiftedBitMatrix& rhs) noexcept 154 | { 155 | using std::swap; 156 | swap(m_matrix, rhs.m_matrix); 157 | swap(m_offsets, rhs.m_offsets); 158 | } 159 | 160 | bool test_bit(size_t row, size_t col, bool default_ = false) const noexcept 161 | { 162 | ptrdiff_t offset = static_cast(m_offsets[row]); 163 | 164 | if (offset < 0) { 165 | col += static_cast(-offset); 166 | } 167 | else if (col >= static_cast(offset)) { 168 | col -= static_cast(offset); 169 | } 170 | /* bit on the left of the band */ 171 | else { 172 | return default_; 173 | } 174 | 175 | size_t word_size = sizeof(value_type) * 8; 176 | size_t col_word = col / word_size; 177 | uint64_t col_mask = value_type(1) << (col % word_size); 178 | 179 | return bool(m_matrix[row][col_word] & col_mask); 180 | } 181 | 182 | auto operator[](size_t row) noexcept 183 | { 184 | return m_matrix[row]; 185 | } 186 | 187 | auto operator[](size_t 
row) const noexcept 188 | { 189 | return m_matrix[row]; 190 | } 191 | 192 | void set_offset(size_t row, ptrdiff_t offset) 193 | { 194 | m_offsets[row] = offset; 195 | } 196 | 197 | private: 198 | BitMatrix m_matrix; 199 | std::vector m_offsets; 200 | }; 201 | 202 | } // namespace rapidfuzz::detail -------------------------------------------------------------------------------- /rapidfuzz/details/GrowingHashmap.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright (c) 2022 Max Bachmann */ 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace rapidfuzz::detail { 12 | 13 | /* hashmap for integers which can only grow, but can't remove elements */ 14 | template 15 | struct GrowingHashmap { 16 | using key_type = T_Key; 17 | using value_type = T_Entry; 18 | using size_type = unsigned int; 19 | 20 | private: 21 | static constexpr size_type min_size = 8; 22 | struct MapElem { 23 | key_type key; 24 | value_type value = value_type(); 25 | }; 26 | 27 | int used; 28 | int fill; 29 | int mask; 30 | MapElem* m_map; 31 | 32 | public: 33 | GrowingHashmap() : used(0), fill(0), mask(-1), m_map(NULL) 34 | {} 35 | ~GrowingHashmap() 36 | { 37 | delete[] m_map; 38 | } 39 | 40 | GrowingHashmap(const GrowingHashmap& other) : used(other.used), fill(other.fill), mask(other.mask) 41 | { 42 | int size = mask + 1; 43 | /* a source without a table has size 0: stay NULL instead of allocating new MapElem[0], so get() and operator[] keep taking their NULL path on the copy */ m_map = (other.m_map != NULL) ? new MapElem[size] : NULL; 44 | std::copy(other.m_map, other.m_map + size, m_map); 45 | } 46 | 47 | GrowingHashmap(GrowingHashmap&& other) noexcept : GrowingHashmap() 48 | { 49 | swap(*this, other); 50 | } 51 | 52 | GrowingHashmap& operator=(GrowingHashmap other) 53 | { 54 | swap(*this, other); 55 | return *this; 56 | } 57 | 58 | friend void swap(GrowingHashmap& first, GrowingHashmap& second) noexcept 59 | { 60 | std::swap(first.used, second.used); 61 | std::swap(first.fill, second.fill); 62 | std::swap(first.mask, second.mask); 63 | 
std::swap(first.m_map, second.m_map); 64 | } 65 | 66 | size_type size() const 67 | { 68 | return used; 69 | } 70 | size_type capacity() const 71 | { 72 | return mask + 1; 73 | } 74 | bool empty() const 75 | { 76 | return used == 0; 77 | } 78 | 79 | value_type get(key_type key) const noexcept 80 | { 81 | if (m_map == NULL) return value_type(); 82 | 83 | return m_map[lookup(key)].value; 84 | } 85 | 86 | value_type& operator[](key_type key) noexcept 87 | { 88 | if (m_map == NULL) allocate(); 89 | 90 | size_t i = lookup(key); 91 | 92 | if (m_map[i].value == value_type()) { 93 | /* resize when 2/3 full */ 94 | if (++fill * 3 >= (mask + 1) * 2) { 95 | grow((used + 1) * 2); 96 | i = lookup(key); 97 | } 98 | 99 | used++; 100 | } 101 | 102 | m_map[i].key = key; 103 | return m_map[i].value; 104 | } 105 | 106 | private: 107 | void allocate() 108 | { 109 | mask = min_size - 1; 110 | m_map = new MapElem[min_size]; 111 | } 112 | 113 | /** 114 | * lookup key inside the hashmap using a similar collision resolution 115 | * strategy to CPython and Ruby 116 | */ 117 | size_t lookup(key_type key) const 118 | { 119 | size_t hash = static_cast(key); 120 | size_t i = hash & static_cast(mask); 121 | 122 | if (m_map[i].value == value_type() || m_map[i].key == key) return i; 123 | 124 | size_t perturb = hash; 125 | while (true) { 126 | i = (i * 5 + perturb + 1) & static_cast(mask); 127 | if (m_map[i].value == value_type() || m_map[i].key == key) return i; 128 | 129 | perturb >>= 5; 130 | } 131 | } 132 | 133 | void grow(int minUsed) 134 | { 135 | int newSize = mask + 1; 136 | while (newSize <= minUsed) 137 | newSize <<= 1; 138 | 139 | MapElem* oldMap = m_map; 140 | m_map = new MapElem[static_cast(newSize)]; 141 | 142 | fill = used; 143 | mask = newSize - 1; 144 | 145 | for (int i = 0; used > 0; i++) 146 | if (oldMap[i].value != value_type()) { 147 | size_t j = lookup(oldMap[i].key); 148 | 149 | m_map[j].key = oldMap[i].key; 150 | m_map[j].value = oldMap[i].value; 151 | used--; 152 | } 153 | 
154 | used = fill; 155 | delete[] oldMap; 156 | } 157 | }; 158 | 159 | template 160 | struct HybridGrowingHashmap { 161 | using key_type = T_Key; 162 | using value_type = T_Entry; 163 | 164 | HybridGrowingHashmap() 165 | { 166 | m_extendedAscii.fill(value_type()); 167 | } 168 | 169 | value_type get(char key) const noexcept 170 | { 171 | /** treat char as value between 0 and 127 for performance reasons */ 172 | return m_extendedAscii[static_cast(key)]; 173 | } 174 | 175 | template 176 | value_type get(CharT key) const noexcept 177 | { 178 | if (key >= 0 && key <= 255) 179 | return m_extendedAscii[static_cast(key)]; 180 | else 181 | return m_map.get(static_cast(key)); 182 | } 183 | 184 | value_type& operator[](char key) noexcept 185 | { 186 | /** treat char as value between 0 and 127 for performance reasons */ 187 | return m_extendedAscii[static_cast(key)]; 188 | } 189 | 190 | template 191 | value_type& operator[](CharT key) 192 | { 193 | if (key >= 0 && key <= 255) 194 | return m_extendedAscii[static_cast(key)]; 195 | else 196 | return m_map[static_cast(key)]; 197 | } 198 | 199 | private: 200 | GrowingHashmap m_map; 201 | std::array m_extendedAscii; 202 | }; 203 | 204 | } // namespace rapidfuzz::detail -------------------------------------------------------------------------------- /test/distance/tests-DamerauLevenshtein.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | template 10 | std::basic_string str_multiply(std::basic_string a, unsigned int b) 11 | { 12 | std::basic_string output; 13 | while (b--) 14 | output += a; 15 | 16 | return output; 17 | } 18 | 19 | template 20 | int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2, 21 | int64_t max = std::numeric_limits::max()) 22 | { 23 | int64_t res1 = rapidfuzz::experimental::damerau_levenshtein_distance(s1, s2, max); 24 | int64_t res2 = 
rapidfuzz::experimental::damerau_levenshtein_distance(s1.begin(), s1.end(), s2.begin(), 25 | s2.end(), max); 26 | rapidfuzz::experimental::CachedDamerauLevenshtein scorer(s1); 27 | int64_t res3 = scorer.distance(s2, max); 28 | int64_t res4 = scorer.distance(s2.begin(), s2.end(), max); 29 | REQUIRE(res1 == res2); 30 | REQUIRE(res1 == res3); 31 | REQUIRE(res1 == res4); 32 | return res1; 33 | } 34 | 35 | template 36 | int64_t damerau_levenshtein_similarity(const Sentence1& s1, const Sentence2& s2, int64_t max = 0) 37 | { 38 | int64_t res1 = rapidfuzz::experimental::damerau_levenshtein_similarity(s1, s2, max); 39 | int64_t res2 = rapidfuzz::experimental::damerau_levenshtein_similarity(s1.begin(), s1.end(), s2.begin(), 40 | s2.end(), max); 41 | rapidfuzz::experimental::CachedDamerauLevenshtein scorer(s1); 42 | int64_t res3 = scorer.similarity(s2, max); 43 | int64_t res4 = scorer.similarity(s2.begin(), s2.end(), max); 44 | REQUIRE(res1 == res2); 45 | REQUIRE(res1 == res3); 46 | REQUIRE(res1 == res4); 47 | return res1; 48 | } 49 | 50 | template 51 | double damerau_levenshtein_normalized_distance(const Sentence1& s1, const Sentence2& s2, 52 | double score_cutoff = 1.0) 53 | { 54 | double res1 = rapidfuzz::experimental::damerau_levenshtein_normalized_distance(s1, s2, score_cutoff); 55 | double res2 = rapidfuzz::experimental::damerau_levenshtein_normalized_distance( 56 | s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff); 57 | rapidfuzz::experimental::CachedDamerauLevenshtein scorer(s1); 58 | double res3 = scorer.normalized_distance(s2, score_cutoff); 59 | double res4 = scorer.normalized_distance(s2.begin(), s2.end(), score_cutoff); 60 | REQUIRE(res1 == Catch::Approx(res2).epsilon(0.0001)); 61 | REQUIRE(res1 == Catch::Approx(res3).epsilon(0.0001)); 62 | REQUIRE(res1 == Catch::Approx(res4).epsilon(0.0001)); 63 | return res1; 64 | } 65 | 66 | template 67 | double damerau_levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2, 68 | double score_cutoff 
= 0.0) 69 | { 70 | double res1 = rapidfuzz::experimental::damerau_levenshtein_normalized_similarity(s1, s2, score_cutoff); 71 | double res2 = rapidfuzz::experimental::damerau_levenshtein_normalized_similarity( 72 | s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff); 73 | rapidfuzz::experimental::CachedDamerauLevenshtein scorer(s1); 74 | double res3 = scorer.normalized_similarity(s2, score_cutoff); 75 | double res4 = scorer.normalized_similarity(s2.begin(), s2.end(), score_cutoff); 76 | REQUIRE(res1 == Catch::Approx(res2).epsilon(0.0001)); 77 | REQUIRE(res1 == Catch::Approx(res3).epsilon(0.0001)); 78 | REQUIRE(res1 == Catch::Approx(res4).epsilon(0.0001)); 79 | return res1; 80 | } 81 | 82 | TEST_CASE("Levenshtein") 83 | { 84 | std::string test = "aaaa"; 85 | std::wstring no_suffix = L"aaa"; 86 | std::string no_suffix2 = "aaab"; 87 | std::string swapped1 = "abaa"; 88 | std::string swapped2 = "baaa"; 89 | std::string replace_all = "bbbb"; 90 | 91 | SECTION("damerau levenshtein calculates correct distances") 92 | { 93 | REQUIRE(damerau_levenshtein_distance(test, test) == 0); 94 | REQUIRE(damerau_levenshtein_distance(test, no_suffix) == 1); 95 | REQUIRE(damerau_levenshtein_distance(swapped1, swapped2) == 1); 96 | REQUIRE(damerau_levenshtein_distance(test, no_suffix2) == 1); 97 | REQUIRE(damerau_levenshtein_distance(test, replace_all) == 4); 98 | 99 | { 100 | std::string s1 = "CA"; 101 | std::string s2 = "ABC"; 102 | REQUIRE(damerau_levenshtein_distance(s1, s2) == 2); 103 | } 104 | } 105 | 106 | SECTION("weighted levenshtein calculates correct ratios") 107 | { 108 | REQUIRE(damerau_levenshtein_normalized_similarity(test, test) == 1.0); 109 | REQUIRE(damerau_levenshtein_normalized_similarity(test, no_suffix) == 110 | Catch::Approx(0.75).epsilon(0.0001)); 111 | REQUIRE(damerau_levenshtein_normalized_similarity(swapped1, swapped2) == 112 | Catch::Approx(0.75).epsilon(0.0001)); 113 | REQUIRE(damerau_levenshtein_normalized_similarity(test, no_suffix2) == 114 | 
Catch::Approx(0.75).epsilon(0.0001)); 115 | REQUIRE(damerau_levenshtein_normalized_similarity(test, replace_all) == 0.0); 116 | 117 | { 118 | std::string s1 = "CA"; 119 | std::string s2 = "ABC"; 120 | REQUIRE(damerau_levenshtein_normalized_similarity(s1, s2) == 121 | Catch::Approx(0.33333).epsilon(0.0001)); 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /rapidfuzz/details/intrinsics.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #if defined(_MSC_VER) && !defined(__clang__) 14 | # include 15 | #endif 16 | 17 | namespace rapidfuzz::detail { 18 | 19 | template 20 | T bit_mask_lsb(int n) 21 | { 22 | T mask = static_cast(-1); 23 | if (n < static_cast(sizeof(T) * 8)) { 24 | mask += static_cast(1) << n; 25 | } 26 | return mask; 27 | } 28 | 29 | template 30 | bool bittest(T a, int bit) 31 | { 32 | return (a >> bit) & 1; 33 | } 34 | 35 | /* 36 | * shift right without undefined behavior for shifts >= bit width 37 | */ 38 | template 39 | constexpr uint64_t shr64(uint64_t a, U shift) 40 | { 41 | return (shift < 64) ? a >> shift : 0; 42 | } 43 | 44 | /* 45 | * shift left without undefined behavior for shifts >= bit width 46 | */ 47 | template 48 | constexpr uint64_t shl64(uint64_t a, U shift) 49 | { 50 | return (shift < 64) ? 
a << shift : 0; 51 | } 52 | 53 | constexpr uint64_t addc64(uint64_t a, uint64_t b, uint64_t carryin, uint64_t* carryout) 54 | { 55 | /* todo should use _addcarry_u64 when available */ 56 | a += carryin; 57 | *carryout = a < carryin; 58 | a += b; 59 | *carryout |= a < b; 60 | return a; 61 | } 62 | 63 | template 64 | constexpr T ceil_div(T a, U divisor) 65 | { 66 | T _div = static_cast(divisor); 67 | return a / _div + static_cast(a % _div != 0); 68 | } 69 | 70 | static inline int popcount(uint64_t x) 71 | { 72 | return static_cast(std::bitset<64>(x).count()); 73 | } 74 | 75 | static inline int popcount(uint32_t x) 76 | { 77 | return static_cast(std::bitset<32>(x).count()); 78 | } 79 | 80 | static inline int popcount(uint16_t x) 81 | { 82 | return static_cast(std::bitset<16>(x).count()); 83 | } 84 | 85 | static inline int popcount(uint8_t x) 86 | { 87 | static constexpr int bit_count[256] = { 88 | 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 89 | 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 90 | 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 91 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 92 | 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 93 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 94 | 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 95 | 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; 96 | return bit_count[x]; 97 | } 98 | 99 | template 100 | constexpr T rotl(T x, unsigned int n) 101 | { 102 | unsigned int num_bits = std::numeric_limits::digits; 103 | assert(n < num_bits); 104 | unsigned int count_mask = num_bits - 1; 105 | 106 | #if _MSC_VER && !defined(__clang__) 
107 | # pragma warning(push) 108 | /* unary minus operator applied to unsigned type, result still unsigned */ 109 | # pragma warning(disable : 4146) 110 | #endif 111 | return (x << n) | (x >> (-n & count_mask)); 112 | #if _MSC_VER && !defined(__clang__) 113 | # pragma warning(pop) 114 | #endif 115 | } 116 | 117 | /** 118 | * Extract the lowest set bit from a. If no bits are set in a returns 0. 119 | */ 120 | template 121 | constexpr T blsi(T a) 122 | { 123 | #if _MSC_VER && !defined(__clang__) 124 | # pragma warning(push) 125 | /* unary minus operator applied to unsigned type, result still unsigned */ 126 | # pragma warning(disable : 4146) 127 | #endif 128 | return a & -a; 129 | #if _MSC_VER && !defined(__clang__) 130 | # pragma warning(pop) 131 | #endif 132 | } 133 | 134 | /** 135 | * Clear the lowest set bit in a. 136 | */ 137 | template 138 | constexpr T blsr(T x) 139 | { 140 | return x & (x - 1); 141 | } 142 | 143 | /** 144 | * Sets all the lower bits of the result to 1 up to and including lowest set bit (=1) in a. 145 | * If a is zero, blsmsk sets all bits to 1. 
146 | */ 147 | template 148 | constexpr T blsmsk(T a) 149 | { 150 | return a ^ (a - 1); 151 | } 152 | 153 | #if defined(_MSC_VER) && !defined(__clang__) 154 | static inline int countr_zero(uint32_t x) 155 | { 156 | unsigned long trailing_zero = 0; 157 | _BitScanForward(&trailing_zero, x); 158 | return trailing_zero; 159 | } 160 | 161 | # if defined(_M_ARM) || defined(_M_X64) 162 | static inline int countr_zero(uint64_t x) 163 | { 164 | unsigned long trailing_zero = 0; 165 | _BitScanForward64(&trailing_zero, x); 166 | return trailing_zero; 167 | } 168 | # else 169 | static inline int countr_zero(uint64_t x) 170 | { 171 | uint32_t msh = (uint32_t)(x >> 32); 172 | uint32_t lsh = (uint32_t)(x & 0xFFFFFFFF); 173 | if (lsh != 0) return countr_zero(lsh); 174 | return 32 + countr_zero(msh); 175 | } 176 | # endif 177 | 178 | #else /* gcc / clang */ 179 | static inline int countr_zero(uint32_t x) 180 | { 181 | return __builtin_ctz(x); 182 | } 183 | 184 | static inline int countr_zero(uint64_t x) 185 | { 186 | return __builtin_ctzll(x); 187 | } 188 | #endif 189 | 190 | template 191 | constexpr void unroll_impl(std::integer_sequence, F&& f) 192 | { 193 | (f(std::integral_constant{}), ...); 194 | } 195 | 196 | template 197 | constexpr void unroll(F&& f) 198 | { 199 | unroll_impl(std::make_integer_sequence{}, std::forward(f)); 200 | } 201 | 202 | } // namespace rapidfuzz::detail 203 | -------------------------------------------------------------------------------- /.github/RapidFuzz.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | image/svg+xml 46 | 50 | 51 | 54 | 55 | 56 | 57 | 69 | 70 | 76 | Page-1 78 | 84 | 89 | Rectangle 91 | 92 | 95 | 96 | 104 | 105 | 110 | Rectangle.2 112 | 113 | 116 | 117 | 125 | 126 | 131 | Sheet.3 133 | Rapid 135 | 138 | 143 | 151 | Rapid 159 | 160 | 165 | Sheet.4 167 | Fuzz 169 | 172 | 177 | 185 | Fuzz 193 | 194 | 195 | 196 | 
-------------------------------------------------------------------------------- /rapidfuzz/details/PatternMatchVector.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright (c) 2022 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace rapidfuzz::detail { 19 | 20 | struct BitvectorHashmap { 21 | BitvectorHashmap() : m_map() 22 | {} 23 | 24 | template 25 | uint64_t get(CharT key) const noexcept 26 | { 27 | return m_map[lookup(static_cast(key))].value; 28 | } 29 | 30 | template 31 | uint64_t& operator[](CharT key) noexcept 32 | { 33 | uint32_t i = lookup(static_cast(key)); 34 | m_map[i].key = static_cast(key); 35 | return m_map[i].value; 36 | } 37 | 38 | private: 39 | /** 40 | * lookup key inside the hashmap using a similar collision resolution 41 | * strategy to CPython and Ruby 42 | */ 43 | uint32_t lookup(uint64_t key) const noexcept 44 | { 45 | uint32_t i = key % 128; 46 | 47 | if (!m_map[i].value || m_map[i].key == key) return i; 48 | 49 | uint64_t perturb = key; 50 | while (true) { 51 | i = (static_cast(i) * 5 + perturb + 1) % 128; 52 | if (!m_map[i].value || m_map[i].key == key) return i; 53 | 54 | perturb >>= 5; 55 | } 56 | } 57 | 58 | struct MapElem { 59 | uint64_t key = 0; 60 | uint64_t value = 0; 61 | }; 62 | std::array m_map; 63 | }; 64 | 65 | struct PatternMatchVector { 66 | PatternMatchVector() : m_extendedAscii() 67 | {} 68 | 69 | template 70 | PatternMatchVector(Range s) : m_extendedAscii() 71 | { 72 | insert(s); 73 | } 74 | 75 | size_t size() const noexcept 76 | { 77 | return 1; 78 | } 79 | 80 | template 81 | void insert(Range s) noexcept 82 | { 83 | uint64_t mask = 1; 84 | for (const auto& ch : s) { 85 | insert_mask(ch, mask); 86 | mask <<= 1; 87 | } 88 | } 89 | 90 | template 91 | void insert(CharT key, 
int64_t pos) noexcept 92 | { 93 | insert_mask(key, UINT64_C(1) << pos); 94 | } 95 | 96 | uint64_t get(char key) const noexcept 97 | { 98 | /** treat char as value between 0 and 127 for performance reasons */ 99 | return m_extendedAscii[static_cast(key)]; 100 | } 101 | 102 | template 103 | uint64_t get(CharT key) const noexcept 104 | { 105 | if (key >= 0 && key <= 255) 106 | return m_extendedAscii[static_cast(key)]; 107 | else 108 | return m_map.get(key); 109 | } 110 | 111 | template 112 | uint64_t get(size_t block, CharT key) const noexcept 113 | { 114 | assert(block == 0); 115 | (void)block; 116 | return get(key); 117 | } 118 | 119 | void insert_mask(char key, uint64_t mask) noexcept 120 | { 121 | /** treat char as value between 0 and 127 for performance reasons */ 122 | m_extendedAscii[static_cast(key)] |= mask; 123 | } 124 | 125 | template 126 | void insert_mask(CharT key, uint64_t mask) noexcept 127 | { 128 | if (key >= 0 && key <= 255) 129 | m_extendedAscii[static_cast(key)] |= mask; 130 | else 131 | m_map[key] |= mask; 132 | } 133 | 134 | private: 135 | BitvectorHashmap m_map; 136 | std::array m_extendedAscii; 137 | }; 138 | /** Pattern-match bit masks for a string split into blocks of 64 positions (m_block_count = ceil_div(str_len, 64)). Keys in [0, 255] live in the m_extendedAscii bit matrix; all other keys go to m_map, an array of one BitvectorHashmap per block that is allocated on first use. NOTE(review): m_map is an owning raw pointer freed in the destructor, yet this struct declares no copy or move operations, so an implicitly generated copy would delete[] the same array twice - confirm instances are never copied, or define/delete those operations. */ 139 | struct BlockPatternMatchVector { 140 | BlockPatternMatchVector() = delete; 141 | 142 | BlockPatternMatchVector(size_t str_len) 143 | : m_block_count(ceil_div(str_len, 64)), m_map(nullptr), m_extendedAscii(256, m_block_count, 0) 144 | {} 145 | 146 | template 147 | BlockPatternMatchVector(Range s) : BlockPatternMatchVector(static_cast(s.size())) 148 | { 149 | insert(s); 150 | } 151 | 152 | ~BlockPatternMatchVector() 153 | { 154 | delete[] m_map; 155 | } 156 | 157 | size_t size() const noexcept 158 | { 159 | return m_block_count; 160 | } 161 | /** Sets bit \p pos of the mask stored for \p ch in \p block. The mask is built with a 64-bit shift, so \p pos is assumed to be in [0, 63]. */ 162 | template 163 | void insert(size_t block, CharT ch, int pos) noexcept 164 | { 165 | uint64_t mask = UINT64_C(1) << pos; 166 | insert_mask(block, ch, mask); 167 | } 168 | 169 | /** 170 | * @brief for every index i of \p s, ORs a one-bit mask for s[i] into block i / 64; the mask starts at bit 0 and is rotated by one per character (presumably a 64-bit rotate-left, i.e. bit i % 64 - rotl is defined elsewhere) 171 | * @tparam InputIt 172 | * 
@param s range of characters to insert 173 | * 174 | */ 175 | template 176 | void insert(Range s) noexcept 177 | { 178 | auto len = s.size(); 179 | uint64_t mask = 1; 180 | for (ptrdiff_t i = 0; i < len; ++i) { 181 | size_t block = static_cast(i) / 64; 182 | insert_mask(block, s[i], mask); 183 | mask = rotl(mask, 1); 184 | } 185 | } 186 | /** ORs \p mask into the entry of \p key in \p block. Keys in [0, 255] use the bit matrix; any other key uses the per-block hashmap array, which is allocated here on first use. */ 187 | template 188 | void insert_mask(size_t block, CharT key, uint64_t mask) noexcept 189 | { 190 | assert(block < size()); 191 | if (key >= 0 && key <= 255) 192 | m_extendedAscii[static_cast(key)][block] |= mask; 193 | else { 194 | if (!m_map) m_map = new BitvectorHashmap[m_block_count]; 195 | m_map[block][key] |= mask; 196 | } 197 | } 198 | 199 | void insert_mask(size_t block, char key, uint64_t mask) noexcept 200 | { 201 | insert_mask(block, static_cast(key), mask); 202 | } 203 | /** Returns the mask stored for \p key in \p block; for a key outside [0, 255] the result is 0 when no hashmap array has been allocated. */ 204 | template 205 | uint64_t get(size_t block, CharT key) const noexcept 206 | { 207 | if (key >= 0 && key <= 255) 208 | return m_extendedAscii[static_cast(key)][block]; 209 | else if (m_map) 210 | return m_map[block].get(key); 211 | else 212 | return 0; 213 | } 214 | 215 | uint64_t get(size_t block, char ch) const noexcept 216 | { 217 | return get(block, static_cast(ch)); 218 | } 219 | 220 | private: 221 | size_t m_block_count; 222 | BitvectorHashmap* m_map; 223 | BitMatrix m_extendedAscii; 224 | }; 225 | 226 | } // namespace rapidfuzz::detail 227 | -------------------------------------------------------------------------------- /bench/bench-fuzz.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using rapidfuzz::fuzz::partial_ratio; 7 | using rapidfuzz::fuzz::partial_token_ratio; 8 | using rapidfuzz::fuzz::partial_token_set_ratio; 9 | using rapidfuzz::fuzz::partial_token_sort_ratio; 10 | using rapidfuzz::fuzz::ratio; 11 | using rapidfuzz::fuzz::token_ratio; 12 | using rapidfuzz::fuzz::token_set_ratio; 13 | using rapidfuzz::fuzz::token_sort_ratio; 14 | using 
rapidfuzz::fuzz::WRatio; 15 | /* Micro-benchmarks for the rapidfuzz::fuzz scorers on short std::wstring inputs. Each scorer is timed once on an identical pair (labelled Similar Strings) and once on a pair whose only common character is the space (labelled Different Strings). benchmark::DoNotOptimize keeps the otherwise unused result from being optimized away. */ 16 | static void BM_FuzzRatio1(benchmark::State& state) 17 | { 18 | std::wstring a = L"aaaaa aaaaa"; 19 | for (auto _ : state) { 20 | benchmark::DoNotOptimize(ratio(a, a)); 21 | } 22 | state.SetLabel("Similar Strings"); 23 | } 24 | 25 | static void BM_FuzzRatio2(benchmark::State& state) 26 | { 27 | std::wstring a = L"aaaaa aaaaa"; 28 | std::wstring b = L"bbbbb bbbbb"; 29 | for (auto _ : state) { 30 | benchmark::DoNotOptimize(ratio(a, b)); 31 | } 32 | state.SetLabel("Different Strings"); 33 | } 34 | 35 | BENCHMARK(BM_FuzzRatio1); 36 | BENCHMARK(BM_FuzzRatio2); 37 | 38 | static void BM_FuzzPartialRatio1(benchmark::State& state) 39 | { 40 | std::wstring a = L"aaaaa aaaaa"; 41 | for (auto _ : state) { 42 | benchmark::DoNotOptimize(partial_ratio(a, a)); 43 | } 44 | state.SetLabel("Similar Strings"); 45 | } 46 | 47 | static void BM_FuzzPartialRatio2(benchmark::State& state) 48 | { 49 | std::wstring a = L"aaaaa aaaaa"; 50 | std::wstring b = L"bbbbb bbbbb"; 51 | for (auto _ : state) { 52 | benchmark::DoNotOptimize(partial_ratio(a, b)); 53 | } 54 | state.SetLabel("Different Strings"); 55 | } 56 | 57 | BENCHMARK(BM_FuzzPartialRatio1); 58 | BENCHMARK(BM_FuzzPartialRatio2); 59 | 60 | static void BM_FuzzTokenSort1(benchmark::State& state) 61 | { 62 | std::wstring a = L"aaaaa aaaaa"; 63 | for (auto _ : state) { 64 | benchmark::DoNotOptimize(token_sort_ratio(a, a)); 65 | } 66 | state.SetLabel("Similar Strings"); 67 | } 68 | 69 | static void BM_FuzzTokenSort2(benchmark::State& state) 70 | { 71 | std::wstring a = L"aaaaa aaaaa"; 72 | std::wstring b = L"bbbbb bbbbb"; 73 | for (auto _ : state) { 74 | benchmark::DoNotOptimize(token_sort_ratio(a, b)); 75 | } 76 | state.SetLabel("Different Strings"); 77 | } 78 | 79 | BENCHMARK(BM_FuzzTokenSort1); 80 | BENCHMARK(BM_FuzzTokenSort2); 81 | 82 | static void BM_FuzzPartialTokenSort1(benchmark::State& state) 83 | { 84 | std::wstring a = L"aaaaa aaaaa"; 85 | for (auto _ : state) { 86 | 
benchmark::DoNotOptimize(partial_token_sort_ratio(a, a)); 87 | } 88 | state.SetLabel("Similar Strings"); 89 | } 90 | 91 | static void BM_FuzzPartialTokenSort2(benchmark::State& state) 92 | { 93 | std::wstring a = L"aaaaa aaaaa"; 94 | std::wstring b = L"bbbbb bbbbb"; 95 | for (auto _ : state) { 96 | benchmark::DoNotOptimize(partial_token_sort_ratio(a, b)); 97 | } 98 | state.SetLabel("Different Strings"); 99 | } 100 | 101 | BENCHMARK(BM_FuzzPartialTokenSort1); 102 | BENCHMARK(BM_FuzzPartialTokenSort2); 103 | 104 | static void BM_FuzzTokenSet1(benchmark::State& state) 105 | { 106 | std::wstring a = L"aaaaa aaaaa"; 107 | for (auto _ : state) { 108 | benchmark::DoNotOptimize(token_set_ratio(a, a)); 109 | } 110 | state.SetLabel("Similar Strings"); 111 | } 112 | 113 | static void BM_FuzzTokenSet2(benchmark::State& state) 114 | { 115 | std::wstring a = L"aaaaa aaaaa"; 116 | std::wstring b = L"bbbbb bbbbb"; 117 | for (auto _ : state) { 118 | benchmark::DoNotOptimize(token_set_ratio(a, b)); 119 | } 120 | state.SetLabel("Different Strings"); 121 | } 122 | 123 | BENCHMARK(BM_FuzzTokenSet1); 124 | BENCHMARK(BM_FuzzTokenSet2); 125 | 126 | static void BM_FuzzPartialTokenSet1(benchmark::State& state) 127 | { 128 | std::wstring a = L"aaaaa aaaaa"; 129 | for (auto _ : state) { 130 | benchmark::DoNotOptimize(partial_token_set_ratio(a, a)); 131 | } 132 | state.SetLabel("Similar Strings"); 133 | } 134 | 135 | static void BM_FuzzPartialTokenSet2(benchmark::State& state) 136 | { 137 | std::wstring a = L"aaaaa aaaaa"; 138 | std::wstring b = L"bbbbb bbbbb"; 139 | for (auto _ : state) { 140 | benchmark::DoNotOptimize(partial_token_set_ratio(a, b)); 141 | } 142 | state.SetLabel("Different Strings"); 143 | } 144 | 145 | BENCHMARK(BM_FuzzPartialTokenSet1); 146 | BENCHMARK(BM_FuzzPartialTokenSet2); 147 | 148 | static void BM_FuzzToken1(benchmark::State& state) 149 | { 150 | std::wstring a = L"aaaaa aaaaa"; 151 | for (auto _ : state) { 152 | benchmark::DoNotOptimize(token_ratio(a, a)); 153 | } 
154 | state.SetLabel("Similar Strings"); 155 | } 156 | 157 | static void BM_FuzzToken2(benchmark::State& state) 158 | { 159 | std::wstring a = L"aaaaa aaaaa"; 160 | std::wstring b = L"bbbbb bbbbb"; 161 | for (auto _ : state) { 162 | benchmark::DoNotOptimize(token_ratio(a, b)); 163 | } 164 | state.SetLabel("Different Strings"); 165 | } 166 | 167 | BENCHMARK(BM_FuzzToken1); 168 | BENCHMARK(BM_FuzzToken2); 169 | 170 | static void BM_FuzzPartialToken1(benchmark::State& state) 171 | { 172 | std::wstring a = L"aaaaa aaaaa"; 173 | for (auto _ : state) { 174 | benchmark::DoNotOptimize(partial_token_ratio(a, a)); 175 | } 176 | state.SetLabel("Similar Strings"); 177 | } 178 | 179 | static void BM_FuzzPartialToken2(benchmark::State& state) 180 | { 181 | std::wstring a = L"aaaaa aaaaa"; 182 | std::wstring b = L"bbbbb bbbbb"; 183 | for (auto _ : state) { 184 | benchmark::DoNotOptimize(partial_token_ratio(a, b)); 185 | } 186 | state.SetLabel("Different Strings"); 187 | } 188 | 189 | BENCHMARK(BM_FuzzPartialToken1); 190 | BENCHMARK(BM_FuzzPartialToken2); 191 | 192 | static void BM_FuzzWRatio1(benchmark::State& state) 193 | { 194 | std::wstring a = L"aaaaa aaaaa"; 195 | for (auto _ : state) { 196 | benchmark::DoNotOptimize(WRatio(a, a)); 197 | } 198 | state.SetLabel("Similar Strings"); 199 | } 200 | /* NOTE(review): BM_FuzzWRatio3 is defined before BM_FuzzWRatio2 but registered after it in the BENCHMARK list below. */ 201 | static void BM_FuzzWRatio3(benchmark::State& state) 202 | { 203 | std::wstring a = L"aaaaa aaaaa"; 204 | std::wstring b = L"bbbbb bbbbb"; 205 | for (auto _ : state) { 206 | benchmark::DoNotOptimize(WRatio(a, b)); 207 | } 208 | state.SetLabel("Different Strings"); 209 | } 210 | /* WRatio on inputs of different lengths (7 and 15 characters). */ 211 | static void BM_FuzzWRatio2(benchmark::State& state) 212 | { 213 | std::wstring a = L"aaaaa b"; 214 | std::wstring b = L"bbbbb bbbbbbbbb"; 215 | for (auto _ : state) { 216 | benchmark::DoNotOptimize(WRatio(a, b)); 217 | } 218 | state.SetLabel("Different length Strings"); 219 | } 220 | 221 | BENCHMARK(BM_FuzzWRatio1); 222 | BENCHMARK(BM_FuzzWRatio2); 223 | BENCHMARK(BM_FuzzWRatio3); 224 | 225 | 
BENCHMARK_MAIN(); 226 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Hamming.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace rapidfuzz { 11 | 12 | /** 13 | * @brief Calculates the Hamming distance between two strings. 14 | * 15 | * @details 16 | * Both strings require a similar length 17 | * 18 | * 19 | * @tparam Sentence1 This is a string that can be converted to 20 | * basic_string_view 21 | * @tparam Sentence2 This is a string that can be converted to 22 | * basic_string_view 23 | * 24 | * @param s1 25 | * string to compare with s2 (for type info check Template parameters above) 26 | * @param s2 27 | * string to compare with s1 (for type info check Template parameters above) 28 | * @param score_cutoff 29 | * Maximum Hamming distance between s1 and s2, that is 30 | * considered as a result. If the distance is bigger than score_cutoff, 31 | * score_cutoff + 1 is returned instead. Default is std::numeric_limits::max(), 32 | * which deactivates this behaviour. 
33 | * 34 | * @return Hamming distance between s1 and s2 35 | */ 36 | template 37 | int64_t hamming_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 38 | int64_t score_cutoff = std::numeric_limits::max()) 39 | { 40 | return detail::Hamming::distance(first1, last1, first2, last2, score_cutoff, score_cutoff); 41 | } 42 | 43 | template 44 | int64_t hamming_distance(const Sentence1& s1, const Sentence2& s2, 45 | int64_t score_cutoff = std::numeric_limits::max()) 46 | { 47 | return detail::Hamming::distance(s1, s2, score_cutoff, score_cutoff); 48 | } 49 | 50 | template 51 | int64_t hamming_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 52 | int64_t score_cutoff = 0) 53 | { 54 | return detail::Hamming::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff); 55 | } 56 | 57 | template 58 | int64_t hamming_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0) 59 | { 60 | return detail::Hamming::similarity(s1, s2, score_cutoff, score_cutoff); 61 | } 62 | 63 | template 64 | double hamming_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 65 | double score_cutoff = 1.0) 66 | { 67 | return detail::Hamming::normalized_distance(first1, last1, first2, last2, score_cutoff, score_cutoff); 68 | } 69 | 70 | template 71 | double hamming_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0) 72 | { 73 | return detail::Hamming::normalized_distance(s1, s2, score_cutoff, score_cutoff); 74 | } 75 | /* Edit-operation variants: they take a score_hint (not a score_cutoff) and forward it unchanged to detail::hamming_editops. */ 76 | template 77 | Editops hamming_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 78 | int64_t score_hint = std::numeric_limits::max()) 79 | { 80 | return detail::hamming_editops(detail::Range(first1, last1), detail::Range(first2, last2), score_hint); 81 | } 82 | 83 | template 84 | Editops hamming_editops(const Sentence1& s1, const Sentence2& s2, 85 | int64_t score_hint = std::numeric_limits::max()) 86 | { 
87 | return detail::hamming_editops(detail::Range(s1), detail::Range(s2), score_hint); 88 | } 89 | 90 | /** 91 | * @brief Calculates a normalized hamming similarity 92 | * 93 | * @details 94 | * Both strings require a similar length 95 | * 96 | * 97 | * @tparam Sentence1 This is a string that can be converted to 98 | * basic_string_view 99 | * @tparam Sentence2 This is a string that can be converted to 100 | * basic_string_view 101 | * 102 | * @param s1 103 | * string to compare with s2 (for type info check Template parameters above) 104 | * @param s2 105 | * string to compare with s1 (for type info check Template parameters above) 106 | * @param score_cutoff 107 | * Optional argument for a score threshold as a float between 0 and 1.0. 108 | * For ratio < score_cutoff 0 is returned instead. Default is 0, 109 | * which deactivates this behaviour. 110 | * 111 | * @return Normalized hamming similarity between s1 and s2 112 | * as a float between 0 and 1.0 113 | */ 114 | template 115 | double hamming_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 116 | double score_cutoff = 0.0) 117 | { 118 | return detail::Hamming::normalized_similarity(first1, last1, first2, last2, score_cutoff, score_cutoff); 119 | } 120 | 121 | template 122 | double hamming_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) 123 | { 124 | return detail::Hamming::normalized_similarity(s1, s2, score_cutoff, score_cutoff); 125 | } 126 | /** Hamming scorer for a fixed first string: the constructor copies the characters of s1 into the std::basic_string member so that it can later be compared against many s2. */ 127 | template 128 | struct CachedHamming : public detail::CachedDistanceBase, int64_t, 0, 129 | std::numeric_limits::max()> { 130 | template 131 | explicit CachedHamming(const Sentence1& s1_) : CachedHamming(detail::to_begin(s1_), detail::to_end(s1_)) 132 | {} 133 | 134 | template 135 | CachedHamming(InputIt1 first1, InputIt1 last1) : s1(first1, last1) 136 | {} 137 | 138 | private: 139 | friend detail::CachedDistanceBase, int64_t, 0, std::numeric_limits::max()>; 140 | friend 
detail::CachedNormalizedMetricBase>; 141 | /* Largest possible distance against s2; only the length of s2 is used. */ 142 | template 143 | int64_t maximum(detail::Range s2) const 144 | { 145 | return s2.size(); 146 | } 147 | /* NOTE(review): score_hint is marked [[maybe_unused]] but is in fact forwarded to detail::Hamming::distance. */ 148 | template 149 | int64_t _distance(detail::Range s2, int64_t score_cutoff, 150 | [[maybe_unused]] int64_t score_hint) const 151 | { 152 | return detail::Hamming::distance(s1, s2, score_cutoff, score_hint); 153 | } 154 | 155 | std::basic_string s1; 156 | }; 157 | 158 | template 159 | explicit CachedHamming(const Sentence1& s1_) -> CachedHamming>; 160 | 161 | template 162 | CachedHamming(InputIt1 first1, InputIt1 last1) -> CachedHamming>; 163 | 164 | /**@}*/ 165 | 166 | } // namespace rapidfuzz 167 | -------------------------------------------------------------------------------- /rapidfuzz/distance/DamerauLevenshtein.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | /* NOTE(review): unlike the sibling headers Hamming.hpp and Indel.hpp, this header has no pragma once include guard - confirm whether that is intentional. */ 4 | #include 5 | #include 6 | #include 7 | 8 | namespace rapidfuzz { 9 | /* the API will require a change when adding custom weights */ 10 | namespace experimental { 11 | /** 12 | * @brief Calculates the Damerau Levenshtein distance between two strings. 13 | * 14 | * 15 | * @tparam Sentence1 This is a string that can be converted to 16 | * basic_string_view 17 | * @tparam Sentence2 This is a string that can be converted to 18 | * basic_string_view 19 | * 20 | * @param s1 21 | * string to compare with s2 (for type info check Template parameters above) 22 | * @param s2 23 | * string to compare with s1 (for type info check Template parameters above) 24 | * @param score_cutoff 25 | * Maximum Damerau Levenshtein distance between s1 and s2, that is 26 | * considered as a result. If the distance is bigger than score_cutoff, 27 | * score_cutoff + 1 is returned instead. Default is std::numeric_limits::max(), 28 | * which deactivates this behaviour. 
29 | * 30 | * @return Damerau Levenshtein distance between s1 and s2 31 | */ 32 | template 33 | int64_t damerau_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 34 | int64_t score_cutoff = std::numeric_limits::max()) 35 | { 36 | return detail::DamerauLevenshtein::distance(first1, last1, first2, last2, score_cutoff, score_cutoff); 37 | } 38 | 39 | template 40 | int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2, 41 | int64_t score_cutoff = std::numeric_limits::max()) 42 | { 43 | return detail::DamerauLevenshtein::distance(s1, s2, score_cutoff, score_cutoff); 44 | } 45 | 46 | template 47 | int64_t damerau_levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 48 | int64_t score_cutoff = 0) 49 | { 50 | return detail::DamerauLevenshtein::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff); 51 | } 52 | 53 | template 54 | int64_t damerau_levenshtein_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0) 55 | { 56 | return detail::DamerauLevenshtein::similarity(s1, s2, score_cutoff, score_cutoff); 57 | } 58 | 59 | template 60 | double damerau_levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, 61 | InputIt2 last2, double score_cutoff = 1.0) 62 | { 63 | return detail::DamerauLevenshtein::normalized_distance(first1, last1, first2, last2, score_cutoff, 64 | score_cutoff); 65 | } 66 | 67 | template 68 | double damerau_levenshtein_normalized_distance(const Sentence1& s1, const Sentence2& s2, 69 | double score_cutoff = 1.0) 70 | { 71 | return detail::DamerauLevenshtein::normalized_distance(s1, s2, score_cutoff, score_cutoff); 72 | } 73 | /* NOTE(review): the similar-length sentence in the details section below matches the wording of the Hamming documentation and looks copied from there; CachedDamerauLevenshtein::maximum in this header uses max(len(s1), len(s2)), which suggests lengths may differ - confirm and drop the sentence if so. */ 74 | /** 75 | * @brief Calculates a normalized Damerau Levenshtein similarity 76 | * 77 | * @details 78 | * Both strings require a similar length 79 | * 80 | * 81 | * @tparam Sentence1 This is a string that can be converted to 82 | * basic_string_view 83 | * @tparam Sentence2 
This is a string that can be converted to 84 | * basic_string_view 85 | * 86 | * @param s1 87 | * string to compare with s2 (for type info check Template parameters above) 88 | * @param s2 89 | * string to compare with s1 (for type info check Template parameters above) 90 | * @param score_cutoff 91 | * Optional argument for a score threshold as a float between 0 and 1.0. 92 | * For ratio < score_cutoff 0 is returned instead. Default is 0, 93 | * which deactivates this behaviour. 94 | * 95 | * @return Normalized Damerau Levenshtein similarity between s1 and s2 96 | * as a float between 0 and 1.0 97 | */ 98 | template 99 | double damerau_levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, 100 | InputIt2 last2, double score_cutoff = 0.0) 101 | { 102 | return detail::DamerauLevenshtein::normalized_similarity(first1, last1, first2, last2, score_cutoff, 103 | score_cutoff); 104 | } 105 | 106 | template 107 | double damerau_levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2, 108 | double score_cutoff = 0.0) 109 | { 110 | return detail::DamerauLevenshtein::normalized_similarity(s1, s2, score_cutoff, score_cutoff); 111 | } 112 | /** Damerau Levenshtein scorer for a fixed first string: it stores a copy of s1 and its _distance simply calls damerau_levenshtein_distance(s1, s2, score_cutoff), ignoring score_hint. The maximum distance is the larger of the two lengths. */ 113 | template 114 | struct CachedDamerauLevenshtein : public detail::CachedDistanceBase, int64_t, 115 | 0, std::numeric_limits::max()> { 116 | template 117 | explicit CachedDamerauLevenshtein(const Sentence1& s1_) 118 | : CachedDamerauLevenshtein(detail::to_begin(s1_), detail::to_end(s1_)) 119 | {} 120 | 121 | template 122 | CachedDamerauLevenshtein(InputIt1 first1, InputIt1 last1) : s1(first1, last1) 123 | {} 124 | 125 | private: 126 | friend detail::CachedDistanceBase, int64_t, 0, 127 | std::numeric_limits::max()>; 128 | friend detail::CachedNormalizedMetricBase>; 129 | 130 | template 131 | int64_t maximum(detail::Range s2) const 132 | { 133 | return std::max(static_cast(s1.size()), s2.size()); 134 | } 135 | 136 | template 137 | int64_t _distance(detail::Range s2, int64_t score_cutoff, 138 | 
[[maybe_unused]] int64_t score_hint) const 139 | { 140 | return damerau_levenshtein_distance(s1, s2, score_cutoff); 141 | } 142 | 143 | std::basic_string s1; 144 | }; 145 | 146 | template 147 | explicit CachedDamerauLevenshtein(const Sentence1& s1_) -> CachedDamerauLevenshtein>; 148 | 149 | template 150 | CachedDamerauLevenshtein(InputIt1 first1, InputIt1 last1) -> CachedDamerauLevenshtein>; 151 | 152 | } // namespace experimental 153 | } // namespace rapidfuzz 154 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Indel.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | namespace rapidfuzz { 11 | /* Free-function Indel API: thin wrappers that forward to detail::Indel and pass score_cutoff twice (presumably once as the cutoff and once as the score hint - the detail signature is not visible here). */ 12 | template 13 | int64_t indel_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 14 | int64_t score_cutoff = std::numeric_limits::max()) 15 | { 16 | return detail::Indel::distance(first1, last1, first2, last2, score_cutoff, score_cutoff); 17 | } 18 | 19 | template 20 | int64_t indel_distance(const Sentence1& s1, const Sentence2& s2, 21 | int64_t score_cutoff = std::numeric_limits::max()) 22 | { 23 | return detail::Indel::distance(s1, s2, score_cutoff, score_cutoff); 24 | } 25 | /* NOTE(review): both indel_similarity overloads default their int64_t score_cutoff to the floating-point literal 0.0; the other integer similarity functions in this code base use 0. */ 26 | template 27 | int64_t indel_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 28 | int64_t score_cutoff = 0.0) 29 | { 30 | return detail::Indel::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff); 31 | } 32 | 33 | template 34 | int64_t indel_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0.0) 35 | { 36 | return detail::Indel::similarity(s1, s2, score_cutoff, score_cutoff); 37 | } 38 | 39 | template 40 | double indel_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 41 | double score_cutoff = 1.0) 42 | { 43 | return 
detail::Indel::normalized_distance(first1, last1, first2, last2, score_cutoff, score_cutoff); 44 | } 45 | 46 | template 47 | double indel_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0) 48 | { 49 | return detail::Indel::normalized_distance(s1, s2, score_cutoff, score_cutoff); 50 | } 51 | 52 | template 53 | double indel_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 54 | double score_cutoff = 0.0) 55 | { 56 | return detail::Indel::normalized_similarity(first1, last1, first2, last2, score_cutoff, score_cutoff); 57 | } 58 | 59 | template 60 | double indel_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) 61 | { 62 | return detail::Indel::normalized_similarity(s1, s2, score_cutoff, score_cutoff); 63 | } 64 | /* The Indel edit operations are obtained directly from lcs_seq_editops. */ 65 | template 66 | Editops indel_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) 67 | { 68 | return lcs_seq_editops(first1, last1, first2, last2); 69 | } 70 | 71 | template 72 | Editops indel_editops(const Sentence1& s1, const Sentence2& s2) 73 | { 74 | return lcs_seq_editops(s1, s2); 75 | } 76 | /* MultiIndel (only compiled when RAPIDFUZZ_SIMD is defined) scores several stored strings against one s2 in a single call; it wraps a MultiLCSseq scorer and remembers the length of every inserted string. */ 77 | #ifdef RAPIDFUZZ_SIMD 78 | namespace experimental { 79 | template 80 | struct MultiIndel 81 | : public detail::MultiDistanceBase, int64_t, 0, std::numeric_limits::max()> { 82 | private: 83 | friend detail::MultiDistanceBase, int64_t, 0, std::numeric_limits::max()>; 84 | friend detail::MultiNormalizedMetricBase>; 85 | 86 | public: 87 | MultiIndel(size_t count) : scorer(count) 88 | {} 89 | 90 | /** 91 | * @brief get minimum size required for result vectors passed into 92 | * - distance 93 | * - similarity 94 | * - normalized_distance 95 | * - normalized_similarity 96 | * 97 | * @return minimum vector size 98 | */ 99 | size_t result_count() const 100 | { 101 | return scorer.result_count(); 102 | } 103 | 104 | template 105 | void insert(const Sentence1& s1_) 106 | { 107 | insert(detail::to_begin(s1_), detail::to_end(s1_)); 
108 | } 109 | 110 | template 111 | void insert(InputIt1 first1, InputIt1 last1) 112 | { 113 | scorer.insert(first1, last1); 114 | str_lens.push_back(static_cast(std::distance(first1, last1))); 115 | } 116 | 117 | private: 118 | template 119 | void _distance(int64_t* scores, size_t score_count, detail::Range s2, 120 | int64_t score_cutoff = std::numeric_limits::max()) const 121 | { 122 | scorer.similarity(scores, score_count, s2); 123 | /* scores now holds one LCS similarity per inserted string; convert each in place to distance = len(s1) + len(s2) - 2 * lcs, reporting score_cutoff + 1 for anything above score_cutoff */ 124 | for (size_t i = 0; i < get_input_count(); ++i) { 125 | int64_t maximum_ = maximum(i, s2); 126 | int64_t dist = maximum_ - 2 * scores[i]; 127 | scores[i] = (dist <= score_cutoff) ? dist : score_cutoff + 1; 128 | } 129 | } 130 | 131 | template 132 | int64_t maximum(size_t s1_idx, detail::Range s2) const 133 | { 134 | return static_cast(str_lens[s1_idx]) + s2.size(); 135 | } 136 | 137 | size_t get_input_count() const noexcept 138 | { 139 | return str_lens.size(); 140 | } 141 | 142 | std::vector str_lens; 143 | MultiLCSseq scorer; 144 | }; 145 | } /* namespace experimental */ 146 | #endif 147 | /** Indel scorer for a fixed first string: it stores the length of s1 and a CachedLCSseq scorer built from the same range. */ 148 | template 149 | struct CachedIndel : public detail::CachedDistanceBase, int64_t, 0, 150 | std::numeric_limits::max()> { 151 | template 152 | explicit CachedIndel(const Sentence1& s1_) : CachedIndel(detail::to_begin(s1_), detail::to_end(s1_)) 153 | {} 154 | 155 | template 156 | CachedIndel(InputIt1 first1, InputIt1 last1) : s1_len(std::distance(first1, last1)), scorer(first1, last1) 157 | {} 158 | 159 | private: 160 | friend detail::CachedDistanceBase, int64_t, 0, std::numeric_limits::max()>; 161 | friend detail::CachedNormalizedMetricBase>; 162 | 163 | template 164 | int64_t maximum(detail::Range s2) const 165 | { 166 | return s1_len + s2.size(); 167 | } 168 | /* Derives the Indel distance from the LCS similarity: dist = maximum - 2 * lcs_sim with maximum = s1_len + s2.size(). The distance cutoff and hint are first translated into LCS cutoffs (maximum / 2 - value, floored at 0); a distance above score_cutoff is reported as score_cutoff + 1. */ 169 | template 170 | int64_t _distance(detail::Range s2, int64_t score_cutoff, int64_t score_hint) const 171 | { 172 | int64_t maximum_ = maximum(s2); 173 | int64_t lcs_cutoff = std::max(0, maximum_ / 2 - score_cutoff); 174 | int64_t lcs_cutoff_hint = std::max(0, maximum_ / 2 - 
score_hint); 175 | int64_t lcs_sim = scorer.similarity(s2, lcs_cutoff, lcs_cutoff_hint); 176 | int64_t dist = maximum_ - 2 * lcs_sim; 177 | return (dist <= score_cutoff) ? dist : score_cutoff + 1; 178 | } 179 | 180 | int64_t s1_len; 181 | CachedLCSseq scorer; 182 | }; 183 | 184 | template 185 | explicit CachedIndel(const Sentence1& s1_) -> CachedIndel>; 186 | 187 | template 188 | CachedIndel(InputIt1 first1, InputIt1 last1) -> CachedIndel>; 189 | 190 | } // namespace rapidfuzz 191 | -------------------------------------------------------------------------------- /bench/bench-levenshtein.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | /* Returns a string of exactly max_length characters drawn uniformly from the digits and ASCII letters. NOTE(review): a new std::random_device and std::mt19937 are constructed on every call. */ 7 | std::string generate(int max_length) 8 | { 9 | std::string possible_characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; 10 | std::random_device rd; 11 | std::mt19937 engine(rd()); 12 | std::uniform_int_distribution<> dist(0, static_cast(possible_characters.size() - 1)); 13 | std::string ret = ""; 14 | for (int i = 0; i < max_length; i++) { 15 | int random_index = dist(engine); 16 | ret += possible_characters[static_cast(random_index)]; 17 | } 18 | return ret; 19 | } 20 | /* Returns the string a concatenated b times (an empty string for b == 0). */ 21 | template 22 | std::basic_string str_multiply(std::basic_string a, unsigned int b) 23 | { 24 | std::basic_string output; 25 | while (b--) 26 | output += a; 27 | 28 | return output; 29 | } 30 | 31 | // Define another benchmark 32 | static void BM_LevWeightedDist1(benchmark::State& state) 33 | { 34 | std::string a = "aaaaa aaaaa"; 35 | for (auto _ : state) { 36 | benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(a, a)); 37 | } 38 | state.SetLabel("Similar Strings"); 39 | } 40 | 41 | static void BM_LevWeightedDist2(benchmark::State& state) 42 | { 43 | std::string a = "aaaaa aaaaa"; 44 | std::string b = "bbbbb bbbbb"; 45 | for (auto _ : state) { 46 | benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(a, b)); 47 | } 48 | 
state.SetLabel("Different Strings"); 49 | } 50 | 51 | static void BM_LevNormWeightedDist1(benchmark::State& state) 52 | { 53 | std::string a = "aaaaa aaaaa"; 54 | for (auto _ : state) { 55 | benchmark::DoNotOptimize(rapidfuzz::levenshtein_normalized_distance(a, a)); 56 | } 57 | state.SetLabel("Similar Strings"); 58 | } 59 | 60 | static void BM_LevNormWeightedDist2(benchmark::State& state) 61 | { 62 | std::string a = "aaaaa aaaaa"; 63 | std::string b = "bbbbb bbbbb"; 64 | for (auto _ : state) { 65 | benchmark::DoNotOptimize(rapidfuzz::levenshtein_normalized_distance(a, b)); 66 | } 67 | state.SetLabel("Different Strings"); 68 | } 69 | /* Long, nearly identical inputs: s1 is the letter a, then len - 2 times b, then a; s2 is len times b, so the strings differ only in the first and last character. range(0) is the length, range(1) the score_cutoff; the Rate counters are based on num * len. */ 70 | static void BM_LevLongSimilarSequence(benchmark::State& state) 71 | { 72 | size_t len = state.range(0); 73 | size_t score_cutoff = state.range(1); 74 | std::string s1 = std::string("a") + str_multiply(std::string("b"), (len - 2)) + std::string("a"); 75 | std::string s2 = str_multiply(std::string("b"), len); 76 | 77 | size_t num = 0; 78 | for (auto _ : state) { 79 | benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(s1, s2, {1, 1, 1}, score_cutoff)); 80 | ++num; 81 | } 82 | 83 | state.counters["Rate"] = benchmark::Counter(static_cast(num * len), benchmark::Counter::kIsRate); 84 | state.counters["InvRate"] = benchmark::Counter(static_cast(num * len), 85 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 86 | } 87 | /* Long inputs without any common character: s1 is len times a, s2 is len times b. */ 88 | static void BM_LevLongNonSimilarSequence(benchmark::State& state) 89 | { 90 | size_t len = state.range(0); 91 | size_t score_cutoff = state.range(1); 92 | std::string s1 = str_multiply(std::string("a"), len); 93 | std::string s2 = str_multiply(std::string("b"), len); 94 | 95 | size_t num = 0; 96 | for (auto _ : state) { 97 | benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(s1, s2, {1, 1, 1}, score_cutoff)); 98 | ++num; 99 | } 100 | 101 | state.counters["Rate"] = benchmark::Counter(static_cast(num * len), benchmark::Counter::kIsRate); 102 | state.counters["InvRate"] = 
benchmark::Counter(static_cast(num * len), 103 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 104 | } 105 | /* All-pairs benchmark: 256 x 10000 random strings of MaxLen characters are compared with the free function levenshtein_distance in every iteration; num counts the comparisons for the Rate counters. */ 106 | template 107 | static void BM_Levenshtein(benchmark::State& state) 108 | { 109 | std::vector seq1; 110 | std::vector seq2; 111 | for (int i = 0; i < 256; i++) 112 | seq1.push_back(generate(MaxLen)); 113 | for (int i = 0; i < 10000; i++) 114 | seq2.push_back(generate(MaxLen)); 115 | 116 | size_t num = 0; 117 | for (auto _ : state) { 118 | for (size_t j = 0; j < seq2.size(); ++j) 119 | for (size_t i = 0; i < seq1.size(); ++i) 120 | benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(seq1[i], seq2[j])); 121 | 122 | num += seq1.size() * seq2.size(); 123 | } 124 | 125 | state.counters["Rate"] = benchmark::Counter(static_cast(num), benchmark::Counter::kIsRate); 126 | state.counters["InvRate"] = benchmark::Counter(static_cast(num), 127 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 128 | } 129 | /* Same data set, but one CachedLevenshtein scorer is built per seq1 string inside the timed loop. NOTE(review): this variant measures scorer.similarity() whereas BM_Levenshtein measures levenshtein_distance - confirm the two are meant to be compared. */ 130 | template 131 | static void BM_Levenshtein_Cached(benchmark::State& state) 132 | { 133 | std::vector seq1; 134 | std::vector seq2; 135 | for (int i = 0; i < 256; i++) 136 | seq1.push_back(generate(MaxLen)); 137 | for (int i = 0; i < 10000; i++) 138 | seq2.push_back(generate(MaxLen)); 139 | 140 | size_t num = 0; 141 | for (auto _ : state) { 142 | for (const auto& str1 : seq1) { 143 | rapidfuzz::CachedLevenshtein scorer(str1); 144 | for (size_t j = 0; j < seq2.size(); ++j) 145 | benchmark::DoNotOptimize(scorer.similarity(seq2[j])); 146 | } 147 | num += seq1.size() * seq2.size(); 148 | } 149 | 150 | state.counters["Rate"] = benchmark::Counter(static_cast(num), benchmark::Counter::kIsRate); 151 | state.counters["InvRate"] = benchmark::Counter(static_cast(num), 152 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 153 | } 154 | /* SIMD variant: 64 strings are inserted into one MultiLevenshtein scorer per iteration and scored against 10000 strings. NOTE(review): the results vector is never passed to benchmark::DoNotOptimize, unlike the results in the other benchmarks. */ 155 | #ifdef RAPIDFUZZ_SIMD 156 | template 157 | static void BM_Levenshtein_SIMD(benchmark::State& state) 158 | { 159 | std::vector seq1; 160 | std::vector seq2; 161 | std::vector results(64); 162 | 
for (int i = 0; i < 64; i++) 163 | seq1.push_back(generate(MaxLen)); 164 | for (int i = 0; i < 10000; i++) 165 | seq2.push_back(generate(MaxLen)); 166 | 167 | size_t num = 0; 168 | for (auto _ : state) { 169 | rapidfuzz::experimental::MultiLevenshtein scorer(seq1.size()); 170 | for (const auto& str1 : seq1) 171 | scorer.insert(str1); 172 | 173 | for (const auto& str2 : seq2) 174 | scorer.similarity(&results[0], results.size(), str2); 175 | 176 | num += seq1.size() * seq2.size(); 177 | } 178 | 179 | state.counters["Rate"] = benchmark::Counter(static_cast(num), benchmark::Counter::kIsRate); 180 | state.counters["InvRate"] = benchmark::Counter(static_cast(num), 181 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 182 | } 183 | #endif 184 | 185 | BENCHMARK(BM_LevLongSimilarSequence) 186 | ->Args({100, 30}) 187 | ->Args({500, 30}) 188 | ->Args({5000, 30}) 189 | ->Args({10000, 30}) 190 | ->Args({20000, 30}) 191 | ->Args({50000, 30}); 192 | 193 | BENCHMARK(BM_LevLongNonSimilarSequence) 194 | ->Args({100, 30}) 195 | ->Args({500, 30}) 196 | ->Args({5000, 30}) 197 | ->Args({10000, 30}) 198 | ->Args({20000, 30}) 199 | ->Args({50000, 30}); 200 | 201 | BENCHMARK(BM_LevWeightedDist1); 202 | BENCHMARK(BM_LevWeightedDist2); 203 | 204 | BENCHMARK(BM_LevNormWeightedDist1); 205 | BENCHMARK(BM_LevNormWeightedDist2); 206 | 207 | BENCHMARK_TEMPLATE(BM_Levenshtein, 8); 208 | BENCHMARK_TEMPLATE(BM_Levenshtein, 16); 209 | BENCHMARK_TEMPLATE(BM_Levenshtein, 32); 210 | BENCHMARK_TEMPLATE(BM_Levenshtein, 64); 211 | 212 | BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 8); 213 | BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 16); 214 | BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 32); 215 | BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 64); 216 | 217 | #ifdef RAPIDFUZZ_SIMD 218 | BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 8); 219 | BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 16); 220 | BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 32); 221 | BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 64); 222 | #endif 223 | 224 | 
BENCHMARK_MAIN(); --------------------------------------------------------------------------------