Page-1

├── .gitattributes ├── .gitignore ├── test ├── distance │ ├── examples │ │ └── ocr.hpp │ ├── CMakeLists.txt │ ├── tests-OSA.cpp │ ├── tests-Hamming.cpp │ ├── tests-Jaro.cpp │ ├── tests-JaroWinkler.cpp │ └── tests-DamerauLevenshtein.cpp ├── tests-common.cpp └── CMakeLists.txt ├── rapidfuzz ├── rapidfuzz_all.hpp ├── details │ ├── simd.hpp │ ├── type_traits.hpp │ ├── CharSet.hpp │ ├── SplittedSentenceView.hpp │ ├── common.hpp │ ├── Range.hpp │ ├── common_impl.hpp │ ├── Matrix.hpp │ ├── GrowingHashmap.hpp │ ├── intrinsics.hpp │ └── PatternMatchVector.hpp ├── distance │ ├── Prefix_impl.hpp │ ├── Postfix_impl.hpp │ ├── Hamming_impl.hpp │ ├── Indel_impl.hpp │ ├── JaroWinkler_impl.hpp │ ├── Jaro.hpp │ ├── Prefix.hpp │ ├── Postfix.hpp │ ├── DamerauLevenshtein_impl.hpp │ ├── JaroWinkler.hpp │ ├── Hamming.hpp │ ├── DamerauLevenshtein.hpp │ └── Indel.hpp └── distance.hpp ├── rapidfuzz_reference ├── README.md ├── Indel.hpp ├── common.hpp ├── LCSseq.hpp ├── Hamming.hpp ├── JaroWinkler.hpp ├── OSA.hpp ├── Levenshtein.hpp ├── Jaro.hpp └── DamerauLevenshtein.hpp ├── examples └── cmake_installed │ ├── CMakeLists.txt │ └── main.cpp ├── docs └── literature │ ├── hyrro_2002.bib │ ├── hyrro_lcs_2004.bib │ ├── hyrro_2004.bib │ ├── myers_1999.bib │ └── wagner_fischer_1974.bib ├── cmake └── rapidfuzzConfig.cmake.in ├── .github ├── workflows │ ├── documentation.yml │ └── cmake.yml └── RapidFuzz.svg ├── fuzzing ├── CMakeLists.txt ├── fuzz_indel_editops.cpp ├── fuzz_levenshtein_editops.cpp ├── fuzzing.hpp ├── fuzz_indel_distance.cpp ├── fuzz_jaro_similarity.cpp ├── fuzz_osa_distance.cpp ├── fuzz_damerau_levenshtein_distance.cpp ├── fuzz_lcs_similarity.cpp └── fuzz_levenshtein_distance.cpp ├── .clang-format ├── bench ├── CMakeLists.txt ├── bench-jarowinkler.cpp ├── bench-lcs.cpp ├── bench-fuzz.cpp └── bench-levenshtein.cpp ├── LICENSE ├── Doxyfile ├── CHANGELOG.md ├── tools └── amalgamation.py └── CMakeLists.txt /.gitattributes: 
-------------------------------------------------------------------------------- 1 | *.impl linguist-language=C++ 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .cache/ 3 | .idea/ 4 | build/ 5 | .cache/ 6 | *.data 7 | *.so 8 | *.o 9 | *.out -------------------------------------------------------------------------------- /test/distance/examples/ocr.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | extern std::basic_string ocr_example1; 5 | extern std::basic_string ocr_example2; 6 | -------------------------------------------------------------------------------- /rapidfuzz/rapidfuzz_all.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include -------------------------------------------------------------------------------- /rapidfuzz_reference/README.md: -------------------------------------------------------------------------------- 1 | ## rapidfuzz_reference 2 | 3 | This includes reference implementations of various string matching algorithms, 4 | which can be used to validate the results of faster implementations. 
-------------------------------------------------------------------------------- /examples/cmake_installed/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | project(cmake_installed CXX) 3 | 4 | find_package(rapidfuzz REQUIRED) 5 | add_executable(foo main.cpp) 6 | target_link_libraries(foo rapidfuzz::rapidfuzz) -------------------------------------------------------------------------------- /docs/literature/hyrro_2002.bib: -------------------------------------------------------------------------------- 1 | @article{hyrro_2002, 2 | author = {Hyyro, Heikki}, 3 | year = {2002}, 4 | month = {10}, 5 | pages = {}, 6 | title = {Explaining and Extending the Bit-parallel Approximate String Matching Algorithm of Myers} 7 | } 8 | -------------------------------------------------------------------------------- /examples/cmake_installed/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main() 6 | { 7 | std::string a = "aaaa"; 8 | std::string b = "abab"; 9 | std::cout << rapidfuzz::fuzz::ratio(a, b) << std::endl; 10 | } -------------------------------------------------------------------------------- /docs/literature/hyrro_lcs_2004.bib: -------------------------------------------------------------------------------- 1 | @article{hyrro_lcs_2004, 2 | author = {Hyyro, Heikki}, 3 | year = {2004}, 4 | month = {08}, 5 | pages = {}, 6 | title = {Bit-Parallel LCS-length Computation Revisited}, 7 | journal = {Proc. 
15th Australasian Workshop on Combinatorial Algorithms (AWOCA 2004)} 8 | } -------------------------------------------------------------------------------- /docs/literature/hyrro_2004.bib: -------------------------------------------------------------------------------- 1 | @article{hyrro_2004, 2 | author = {Hyyro, Heikki}, 3 | year = {2004}, 4 | month = {08}, 5 | pages = {}, 6 | title = {Bit-Parallel LCS-length Computation Revisited}, 7 | journal = {Proc. 15th Australasian Workshop on Combinatorial Algorithms (AWOCA 2004)} 8 | } 9 | -------------------------------------------------------------------------------- /cmake/rapidfuzzConfig.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | # Avoid repeatedly including the targets 4 | if(NOT TARGET rapidfuzz::rapidfuzz) 5 | # Provide path for scripts 6 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") 7 | 8 | include(${CMAKE_CURRENT_LIST_DIR}/rapidfuzzTargets.cmake) 9 | endif() -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | build_docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - run: sudo apt-get install -y doxygen 14 | - run: doxygen ./Doxyfile 15 | - uses: peaceiris/actions-gh-pages@v3 16 | with: 17 | github_token: ${{ secrets.GITHUB_TOKEN }} 18 | publish_dir: ./doxygen/html -------------------------------------------------------------------------------- /docs/literature/myers_1999.bib: -------------------------------------------------------------------------------- 1 | @article{myers_1999, 2 | author = {Myers, Gene}, 3 | title = {A Fast Bit-Vector Algorithm for Approximate String Matching Based on Dynamic Programming}, 4 | year = {1999}, 5 | issue_date = {May 1999}, 
6 | publisher = {Association for Computing Machinery}, 7 | address = {New York, NY, USA}, 8 | volume = {46}, 9 | number = {3}, 10 | issn = {0004-5411}, 11 | url = {https://doi.org/10.1145/316542.316550}, 12 | doi = {10.1145/316542.316550}, 13 | journal = {J. ACM}, 14 | month = may, 15 | pages = {395–415}, 16 | numpages = {21}, 17 | keywords = {approximate string search, sequence comparison, bit-parallelism} 18 | } 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /test/distance/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | function(rapidfuzz_add_test test) 2 | add_executable(test_${test} tests-${test}.cpp examples/ocr.cpp) 3 | target_link_libraries(test_${test} ${PROJECT_NAME}) 4 | target_link_libraries(test_${test} Catch2::Catch2WithMain) 5 | if (RAPIDFUZZ_ENABLE_LINTERS) 6 | target_link_libraries(test_${test} project_warnings) 7 | endif() 8 | add_test(NAME ${test} COMMAND test_${test}) 9 | endfunction() 10 | 11 | rapidfuzz_add_test(Hamming) 12 | rapidfuzz_add_test(Indel) 13 | rapidfuzz_add_test(LCSseq) 14 | rapidfuzz_add_test(Levenshtein) 15 | rapidfuzz_add_test(DamerauLevenshtein) 16 | rapidfuzz_add_test(OSA) 17 | rapidfuzz_add_test(Jaro) 18 | rapidfuzz_add_test(JaroWinkler) 19 | -------------------------------------------------------------------------------- /rapidfuzz/details/simd.hpp: -------------------------------------------------------------------------------- 1 | 2 | /* SPDX-License-Identifier: MIT */ 3 | /* Copyright © 2022 Max Bachmann */ 4 | #pragma once 5 | 6 | /* RAPIDFUZZ_LTO_HACK is used to differentiate functions between different 7 | * translation units to avoid warnings when using lto */ 8 | #ifndef RAPIDFUZZ_EXCLUDE_SIMD 9 | # if __AVX2__ 10 | # define RAPIDFUZZ_SIMD 11 | # define RAPIDFUZZ_AVX2 12 | # define RAPIDFUZZ_LTO_HACK 0 13 | # include 14 | 15 | # elif (defined(_M_AMD64) || defined(_M_X64)) || defined(__SSE2__) 16 | # define 
RAPIDFUZZ_SIMD 17 | # define RAPIDFUZZ_SSE2 18 | # define RAPIDFUZZ_LTO_HACK 1 19 | # include 20 | # endif 21 | #endif -------------------------------------------------------------------------------- /fuzzing/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | function(create_fuzzer fuzzer) 2 | add_executable(fuzz_${fuzzer} fuzz_${fuzzer}.cpp) 3 | target_compile_features(fuzz_${fuzzer} PUBLIC cxx_std_17) 4 | target_link_libraries(fuzz_${fuzzer} PRIVATE rapidfuzz::rapidfuzz) 5 | 6 | target_compile_options(fuzz_${fuzzer} PRIVATE -g -O1 -fsanitize=fuzzer,address -march=native) 7 | target_link_libraries(fuzz_${fuzzer} PRIVATE -fsanitize=fuzzer,address) 8 | endfunction(create_fuzzer) 9 | 10 | create_fuzzer(lcs_similarity) 11 | 12 | create_fuzzer(levenshtein_distance) 13 | create_fuzzer(levenshtein_editops) 14 | 15 | create_fuzzer(indel_distance) 16 | create_fuzzer(indel_editops) 17 | 18 | create_fuzzer(osa_distance) 19 | 20 | create_fuzzer(damerau_levenshtein_distance) 21 | 22 | create_fuzzer(jaro_similarity) 23 | -------------------------------------------------------------------------------- /fuzzing/fuzz_indel_editops.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/Indel.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | 10 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 11 | { 12 | std::basic_string s1, s2; 13 | if (!extract_strings(data, size, s1, s2)) return 0; 14 | 15 | int64_t score = rapidfuzz_reference::indel_distance(s1, s2); 16 | rapidfuzz::Editops ops = rapidfuzz::indel_editops(s1, s2); 17 | 18 | if (static_cast(ops.size()) == score && s2 != rapidfuzz::editops_apply(ops, s1, s2)) 19 | throw std::logic_error("indel_editops failed"); 20 | 21 | return 0; 22 | } 23 | 
-------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | ColumnLimit: 110 2 | IndentWidth: 4 3 | AccessModifierOffset: -4 4 | 5 | AllowShortIfStatementsOnASingleLine: true 6 | PointerAlignment: Left 7 | AllowShortBlocksOnASingleLine: Always 8 | AllowShortFunctionsOnASingleLine: None 9 | BreakBeforeBraces: Custom 10 | AlwaysBreakTemplateDeclarations: true 11 | BraceWrapping: 12 | SplitEmptyFunction: false 13 | AfterCaseLabel: true 14 | AfterClass: false 15 | AfterControlStatement: MultiLine 16 | AfterEnum: false 17 | AfterFunction: true 18 | AfterNamespace: false 19 | AfterStruct: false 20 | AfterUnion: false 21 | BeforeCatch: true 22 | BeforeElse: true 23 | SplitEmptyRecord: false 24 | SplitEmptyNamespace: false 25 | AllowAllConstructorInitializersOnNextLine: true 26 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 27 | AllowShortCaseLabelsOnASingleLine: true 28 | IndentPPDirectives: AfterHash 29 | -------------------------------------------------------------------------------- /rapidfuzz_reference/Indel.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | 6 | #include "Levenshtein.hpp" 7 | #include 8 | 9 | namespace rapidfuzz_reference { 10 | 11 | template 12 | int64_t indel_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 13 | int64_t score_cutoff = std::numeric_limits::max()) 14 | { 15 | return levenshtein_distance(first1, last1, first2, last2, {1, 1, 2}, score_cutoff); 16 | } 17 | 18 | template 19 | int64_t indel_distance(const Sentence1& s1, const Sentence2& s2, 20 | int64_t score_cutoff = std::numeric_limits::max()) 21 | { 22 | return levenshtein_distance(s1, s2, {1, 1, 2}, score_cutoff); 23 | } 24 | 25 | } // namespace rapidfuzz_reference 26 | 
-------------------------------------------------------------------------------- /rapidfuzz_reference/common.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | 9 | namespace rapidfuzz_reference { 10 | 11 | template 12 | class Matrix { 13 | public: 14 | Matrix(size_t _rows, size_t _cols) : rows(_rows), cols(_cols) 15 | { 16 | matrix = new T[rows * cols]; 17 | std::fill(matrix, matrix + rows * cols, T()); 18 | } 19 | 20 | ~Matrix() 21 | { 22 | delete[] matrix; 23 | } 24 | 25 | T& operator()(ptrdiff_t row, ptrdiff_t col) 26 | { 27 | return matrix[static_cast(row) + static_cast(col) * rows]; 28 | } 29 | 30 | T& back() 31 | { 32 | return matrix[rows * cols - 1]; 33 | } 34 | 35 | size_t rows; 36 | size_t cols; 37 | T* matrix; 38 | }; 39 | 40 | } // namespace rapidfuzz_reference 41 | -------------------------------------------------------------------------------- /bench/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | FetchContent_Declare(googletest 3 | GIT_REPOSITORY https://github.com/google/googletest.git 4 | GIT_TAG v1.12.x) 5 | 6 | FetchContent_Declare(googlebenchmark 7 | GIT_REPOSITORY https://github.com/google/benchmark.git 8 | GIT_TAG main) # need master for benchmark::benchmark 9 | 10 | FetchContent_MakeAvailable( 11 | googletest 12 | googlebenchmark) 13 | 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") 15 | 16 | function(rapidfuzz_add_benchmark NAME SOURCE) 17 | add_executable(bench_${NAME} ${SOURCE}) 18 | target_link_libraries(bench_${NAME} PRIVATE ${PROJECT_NAME}) 19 | target_link_libraries(bench_${NAME} PRIVATE benchmark::benchmark) 20 | endfunction() 21 | 22 | rapidfuzz_add_benchmark(lcs bench-lcs.cpp) 23 | rapidfuzz_add_benchmark(fuzz bench-fuzz.cpp) 24 | rapidfuzz_add_benchmark(levenshtein 
bench-levenshtein.cpp) 25 | rapidfuzz_add_benchmark(jarowinkler bench-jarowinkler.cpp) 26 | -------------------------------------------------------------------------------- /rapidfuzz_reference/LCSseq.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "Indel.hpp" 6 | 7 | #include 8 | #include 9 | 10 | namespace rapidfuzz_reference { 11 | 12 | template 13 | int64_t lcs_seq_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 14 | int64_t score_cutoff = 0) 15 | { 16 | int64_t maximum = std::distance(first1, last1) + std::distance(first2, last2); 17 | int64_t dist = indel_distance(first1, last1, first2, last2); 18 | int64_t sim = (maximum - dist) / 2; 19 | return (sim >= score_cutoff) ? sim : 0; 20 | } 21 | 22 | template 23 | int64_t lcs_seq_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0) 24 | { 25 | return lcs_seq_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff); 26 | } 27 | 28 | } // namespace rapidfuzz_reference 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2020 Max Bachmann 2 | Copyright © 2011 Adam Cohen 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining 5 | a copy of this software and associated documentation files (the 6 | "Software"), to deal in the Software without restriction, including 7 | without limitation the rights to use, copy, modify, merge, publish, 8 | distribute, sublicense, and/or sell copies of the Software, and to 9 | permit persons to whom the Software is furnished to do so, subject to 10 | the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial 
portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 19 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 20 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 21 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Prefix_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include "rapidfuzz/details/common.hpp" 6 | #include 7 | #include 8 | 9 | namespace rapidfuzz::detail { 10 | 11 | class Prefix : public SimilarityBase::max()> { 12 | friend SimilarityBase::max()>; 13 | friend NormalizedMetricBase; 14 | 15 | template 16 | static int64_t maximum(Range s1, Range s2) 17 | { 18 | return std::max(s1.size(), s2.size()); 19 | } 20 | 21 | template 22 | static int64_t _similarity(Range s1, Range s2, int64_t score_cutoff, 23 | [[maybe_unused]] int64_t score_hint) 24 | { 25 | int64_t dist = static_cast(remove_common_prefix(s1, s2)); 26 | return (dist >= score_cutoff) ? 
dist : 0; 27 | } 28 | }; 29 | 30 | } // namespace rapidfuzz::detail 31 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Postfix_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include "rapidfuzz/details/common.hpp" 6 | #include 7 | #include 8 | 9 | namespace rapidfuzz::detail { 10 | 11 | class Postfix : public SimilarityBase::max()> { 12 | friend SimilarityBase::max()>; 13 | friend NormalizedMetricBase; 14 | 15 | template 16 | static int64_t maximum(Range s1, Range s2) 17 | { 18 | return std::max(s1.size(), s2.size()); 19 | } 20 | 21 | template 22 | static int64_t _similarity(Range s1, Range s2, int64_t score_cutoff, 23 | [[maybe_unused]] int64_t score_hint) 24 | { 25 | int64_t dist = static_cast(remove_common_suffix(s1, s2)); 26 | return (dist >= score_cutoff) ? dist : 0; 27 | } 28 | }; 29 | 30 | } // namespace rapidfuzz::detail 31 | -------------------------------------------------------------------------------- /docs/literature/wagner_fischer_1974.bib: -------------------------------------------------------------------------------- 1 | @article{wagner_fischer_1974, 2 | author = {Wagner, Robert A. and Fischer, Michael J.}, 3 | title = {The String-to-String Correction Problem}, 4 | year = {1974}, 5 | issue_date = {Jan. 1974}, 6 | publisher = {Association for Computing Machinery}, 7 | address = {New York, NY, USA}, 8 | volume = {21}, 9 | number = {1}, 10 | issn = {0004-5411}, 11 | url = {https://doi.org/10.1145/321796.321811}, 12 | doi = {10.1145/321796.321811}, 13 | abstract = {The string-to-string correction problem is to determine the distance between two strings as measured by the minimum cost sequence of “edit operations” needed to change the one string into the other. 
The edit operations investigated allow changing one symbol of a string into another single symbol, deleting one symbol from a string, or inserting a single symbol into a string. An algorithm is presented which solves this problem in time proportional to the product of the lengths of the two strings. Possible applications are to the problems of automatic spelling correction and determining the longest subsequence of characters common to two strings.}, 14 | journal = {J. ACM}, 15 | month = jan, 16 | pages = {168–173}, 17 | numpages = {6} 18 | } 19 | 20 | 21 | -------------------------------------------------------------------------------- /rapidfuzz_reference/Hamming.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | 9 | namespace rapidfuzz_reference { 10 | 11 | template 12 | int64_t hamming_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 13 | int64_t score_cutoff = std::numeric_limits::max()) 14 | { 15 | ptrdiff_t len1 = std::distance(first1, last1); 16 | ptrdiff_t len2 = std::distance(first2, last2); 17 | if (len1 != len2) throw std::invalid_argument("Sequences are not the same length."); 18 | 19 | int64_t dist = 0; 20 | for (ptrdiff_t i = 0; i < len1; ++i) 21 | dist += bool(first1[i] != first2[i]); 22 | 23 | return (dist <= score_cutoff) ? 
dist : score_cutoff + 1; 24 | } 25 | 26 | template 27 | int64_t hamming_distance(const Sentence1& s1, const Sentence2& s2, 28 | int64_t score_cutoff = std::numeric_limits::max()) 29 | { 30 | return hamming_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff); 31 | } 32 | 33 | } // namespace rapidfuzz_reference 34 | -------------------------------------------------------------------------------- /test/tests-common.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | TEST_CASE("remove affix") 6 | { 7 | std::string s1 = "aabbbbaaaa"; 8 | std::string s2 = "aaabbbbaaaaa"; 9 | 10 | { 11 | rapidfuzz::detail::Range s1_(s1); 12 | rapidfuzz::detail::Range s2_(s2); 13 | REQUIRE(rapidfuzz::detail::remove_common_prefix(s1_, s2_) == 2); 14 | REQUIRE(s1_ == rapidfuzz::detail::Range("bbbbaaaa")); 15 | REQUIRE(s2_ == rapidfuzz::detail::Range("abbbbaaaaa")); 16 | } 17 | 18 | { 19 | rapidfuzz::detail::Range s1_(s1); 20 | rapidfuzz::detail::Range s2_(s2); 21 | REQUIRE(rapidfuzz::detail::remove_common_suffix(s1_, s2_) == 4); 22 | REQUIRE(s1_ == rapidfuzz::detail::Range("aabbbb")); 23 | REQUIRE(s2_ == rapidfuzz::detail::Range("aaabbbba")); 24 | } 25 | 26 | { 27 | rapidfuzz::detail::Range s1_(s1); 28 | rapidfuzz::detail::Range s2_(s2); 29 | auto affix = rapidfuzz::detail::remove_common_affix(s1_, s2_); 30 | REQUIRE(affix.prefix_len == 2); 31 | REQUIRE(affix.suffix_len == 4); 32 | REQUIRE(s1_ == rapidfuzz::detail::Range("bbbb")); 33 | REQUIRE(s2_ == rapidfuzz::detail::Range("abbbba")); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /rapidfuzz_reference/JaroWinkler.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "Jaro.hpp" 6 | 7 | namespace rapidfuzz_reference { 8 | 9 | template 
>> 11 | double jaro_winkler_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last, 12 | double prefix_weight = 0.1, double score_cutoff = 0.0) 13 | { 14 | int64_t min_len = std::min(std::distance(P_first, P_last), std::distance(T_first, T_last)); 15 | int64_t max_prefix = std::min(min_len, 4); 16 | 17 | int64_t prefix = 0; 18 | for (; prefix < max_prefix; ++prefix) 19 | if (T_first[prefix] != P_first[prefix]) break; 20 | 21 | double Sim = jaro_similarity(P_first, P_last, T_first, T_last); 22 | if (Sim > 0.7) Sim += static_cast(prefix) * prefix_weight * (1.0 - Sim); 23 | 24 | return (Sim >= score_cutoff) ? Sim : 0; 25 | } 26 | 27 | template 28 | double jaro_winkler_similarity(const Sentence1& s1, const Sentence2& s2, double prefix_weight = 0.1, 29 | double score_cutoff = 0.0) 30 | { 31 | return jaro_winkler_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), prefix_weight, 32 | score_cutoff); 33 | } 34 | 35 | } /* namespace rapidfuzz_reference */ 36 | -------------------------------------------------------------------------------- /fuzzing/fuzz_levenshtein_editops.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/Levenshtein.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | 10 | void validate_editops(const std::basic_string& s1, const std::basic_string& s2, int64_t score, int64_t score_hint = std::numeric_limits::max()) 11 | { 12 | rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2, score_hint); 13 | if (static_cast(ops.size()) == score && s2 != rapidfuzz::editops_apply(ops, s1, s2)) 14 | throw std::logic_error("levenshtein_editops failed"); 15 | } 16 | 17 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 18 | { 19 | std::basic_string s1, s2; 20 | if (!extract_strings(data, size, s1, s2)) return 0; 21 | 22 | 
/* hirschbergs algorithm is only used for very long sequences which are apparently not generated a lot by 23 | * the fuzzer */ 24 | for (int i = 0; i < 10; i++) { 25 | int64_t score = rapidfuzz_reference::levenshtein_distance(s1, s2); 26 | validate_editops(s1, s2, score); 27 | validate_editops(s1, s2, score, 64); 28 | validate_editops(s1, s2, score, score != 0 ? score - 1 : 0); 29 | validate_editops(s1, s2, score, score); 30 | 31 | s1 = str_multiply(s1, 2); 32 | s2 = str_multiply(s2, 2); 33 | } 34 | 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /fuzzing/fuzzing.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | static inline bool extract_strings(const uint8_t* data, size_t size, std::basic_string& s1, 7 | std::basic_string& s2) 8 | { 9 | if (size <= sizeof(uint32_t)) { 10 | return false; 11 | } 12 | uint32_t len1 = *(uint32_t*)data; 13 | 14 | if (len1 > size - sizeof(len1)) { 15 | return false; 16 | } 17 | 18 | data += sizeof(len1); 19 | size -= sizeof(len1); 20 | s1 = std::basic_string(data, len1); 21 | s2 = std::basic_string(data + len1, size - len1); 22 | return true; 23 | } 24 | 25 | template 26 | static inline T pow(T x, unsigned int p) 27 | { 28 | if (p == 0) return 1; 29 | if (p == 1) return x; 30 | 31 | T tmp = pow(x, p / 2); 32 | if (p % 2 == 0) 33 | return tmp * tmp; 34 | else 35 | return x * tmp * tmp; 36 | } 37 | 38 | template 39 | std::basic_string str_multiply(std::basic_string a, size_t b) 40 | { 41 | std::basic_string output; 42 | while (b--) 43 | output += a; 44 | 45 | return output; 46 | } 47 | 48 | template 49 | void print_seq(const std::string& name, const std::basic_string& seq) 50 | { 51 | std::cout << name << " len: " << seq.size() << " content: "; 52 | for (const auto& ch : seq) 53 | std::cout << static_cast(ch) << " "; 54 | std::cout << std::endl; 55 | } 56 | 
-------------------------------------------------------------------------------- /fuzzing/fuzz_indel_distance.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/Indel.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | void validate_distance(const std::basic_string& s1, const std::basic_string& s2, 12 | int64_t score_cutoff) 13 | { 14 | auto dist = rapidfuzz::indel_distance(s1, s2, score_cutoff); 15 | auto reference_dist = rapidfuzz_reference::indel_distance(s1, s2, score_cutoff); 16 | if (dist != reference_dist) { 17 | print_seq("s1: ", s1); 18 | print_seq("s2: ", s2); 19 | throw std::logic_error(std::string("indel distance failed (score_cutoff = ") + 20 | std::to_string(score_cutoff) + std::string(", reference_score = ") + 21 | std::to_string(reference_dist) + std::string(", score = ") + 22 | std::to_string(dist) + ")"); 23 | } 24 | } 25 | 26 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 27 | { 28 | std::basic_string s1, s2; 29 | if (!extract_strings(data, size, s1, s2)) return 0; 30 | 31 | validate_distance(s1, s2, 0); 32 | validate_distance(s1, s2, 1); 33 | validate_distance(s1, s2, 2); 34 | validate_distance(s1, s2, 3); 35 | validate_distance(s1, s2, 4); 36 | validate_distance(s1, s2, std::numeric_limits::max()); 37 | 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /rapidfuzz/details/type_traits.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2020 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace rapidfuzz { 13 | 14 | namespace detail { 15 | template 16 | auto inner_type(T const*) -> T; 17 | 18 | template 19 | 
auto inner_type(T const&) -> typename T::value_type; 20 | } // namespace detail 21 | 22 | template 23 | using char_type = decltype(detail::inner_type(std::declval())); 24 | 25 | /* backport of std::iter_value_t from C++20 26 | * This does not cover the complete functionality, but should be enough for 27 | * the use cases in this library 28 | */ 29 | template 30 | using iter_value_t = typename std::iterator_traits::value_type; 31 | 32 | // taken from 33 | // https://stackoverflow.com/questions/16893992/check-if-type-can-be-explicitly-converted 34 | template 35 | struct is_explicitly_convertible { 36 | template 37 | static void f(T); 38 | 39 | template 40 | static constexpr auto test(int /*unused*/) -> decltype(f(static_cast(std::declval())), true) 41 | { 42 | return true; 43 | } 44 | 45 | template 46 | static constexpr auto test(...) -> bool 47 | { 48 | return false; 49 | } 50 | 51 | static bool const value = test(0); 52 | }; 53 | 54 | } // namespace rapidfuzz 55 | -------------------------------------------------------------------------------- /fuzzing/fuzz_jaro_similarity.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/Jaro.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | bool is_close(double a, double b, double epsilon) 12 | { 13 | return fabs(a - b) <= ((fabs(a) < fabs(b) ? 
fabs(b) : fabs(a)) * epsilon); 14 | } 15 | 16 | void validate_distance(const std::basic_string& s1, const std::basic_string& s2) 17 | { 18 | double reference_sim = rapidfuzz_reference::jaro_similarity(s1, s2); 19 | double sim = rapidfuzz::jaro_similarity(s1, s2); 20 | 21 | if (!is_close(sim, reference_sim, 0.0001)) { 22 | print_seq("s1", s1); 23 | print_seq("s2", s2); 24 | throw std::logic_error(std::string("jaro similarity failed (reference_score = ") + 25 | std::to_string(reference_sim) + std::string(", score = ") + 26 | std::to_string(sim) + ")"); 27 | } 28 | } 29 | 30 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 31 | { 32 | std::basic_string s1, s2; 33 | if (!extract_strings(data, size, s1, s2)) return 0; 34 | 35 | validate_distance(s1, s2); 36 | 37 | /* test long sequences */ 38 | for (unsigned int i = 2; i < 9; ++i) { 39 | std::basic_string s1_ = str_multiply(s1, pow(2, i)); 40 | std::basic_string s2_ = str_multiply(s2, pow(2, i)); 41 | 42 | if (s1_.size() > 10000 || s2_.size() > 10000) break; 43 | 44 | validate_distance(s1_, s2_); 45 | } 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Hamming_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | 9 | namespace rapidfuzz::detail { 10 | 11 | class Hamming : public DistanceBase::max()> { 12 | friend DistanceBase::max()>; 13 | friend NormalizedMetricBase; 14 | 15 | template 16 | static int64_t maximum(Range s1, Range) 17 | { 18 | return s1.size(); 19 | } 20 | 21 | template 22 | static int64_t _distance(Range s1, Range s2, int64_t score_cutoff, 23 | [[maybe_unused]] int64_t score_hint) 24 | { 25 | if (s1.size() != s2.size()) throw std::invalid_argument("Sequences are not the same length."); 26 | 27 | int64_t dist = 0; 28 | 
for (ptrdiff_t i = 0; i < s1.size(); ++i) 29 | dist += bool(s1[i] != s2[i]); 30 | 31 | return (dist <= score_cutoff) ? dist : score_cutoff + 1; 32 | } 33 | }; 34 | 35 | template 36 | Editops hamming_editops(Range s1, Range s2, int64_t) 37 | { 38 | if (s1.size() != s2.size()) throw std::invalid_argument("Sequences are not the same length."); 39 | 40 | Editops ops; 41 | for (ptrdiff_t i = 0; i < s1.size(); ++i) 42 | if (s1[i] != s2[i]) ops.emplace_back(EditType::Replace, i, i); 43 | 44 | ops.set_src_len(static_cast(s1.size())); 45 | ops.set_dest_len(static_cast(s2.size())); 46 | return ops; 47 | } 48 | 49 | } // namespace rapidfuzz::detail 50 | -------------------------------------------------------------------------------- /rapidfuzz/details/CharSet.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright (c) 2022 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace rapidfuzz::detail { 13 | 14 | /* 15 | * taken from https://stackoverflow.com/a/17251989/11335032 16 | */ 17 | template 18 | bool CanTypeFitValue(const U value) 19 | { 20 | const intmax_t botT = intmax_t(std::numeric_limits::min()); 21 | const intmax_t botU = intmax_t(std::numeric_limits::min()); 22 | const uintmax_t topT = uintmax_t(std::numeric_limits::max()); 23 | const uintmax_t topU = uintmax_t(std::numeric_limits::max()); 24 | return !((botT > botU && value < static_cast(botT)) || (topT < topU && value > static_cast(topT))); 25 | } 26 | 27 | template 28 | struct CharSet; 29 | 30 | template 31 | struct CharSet { 32 | using UCharT1 = typename std::make_unsigned::type; 33 | 34 | std::array::max() + 1> m_val; 35 | 36 | CharSet() : m_val{} 37 | {} 38 | 39 | void insert(CharT1 ch) 40 | { 41 | m_val[UCharT1(ch)] = true; 42 | } 43 | 44 | template 45 | bool find(CharT2 ch) const 46 | { 47 | if (!CanTypeFitValue(ch)) return false; 48 | 49 | 
return m_val[UCharT1(ch)]; 50 | } 51 | }; 52 | 53 | template 54 | struct CharSet { 55 | std::unordered_set m_val; 56 | 57 | CharSet() : m_val{} 58 | {} 59 | 60 | void insert(CharT1 ch) 61 | { 62 | m_val.insert(ch); 63 | } 64 | 65 | template 66 | bool find(CharT2 ch) const 67 | { 68 | if (!CanTypeFitValue(ch)) return false; 69 | 70 | return m_val.find(CharT1(ch)) != m_val.end(); 71 | } 72 | }; 73 | 74 | } // namespace rapidfuzz::detail -------------------------------------------------------------------------------- /fuzzing/fuzz_osa_distance.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/OSA.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | void validate_distance(int64_t reference_dist, const std::basic_string& s1, 12 | const std::basic_string& s2, int64_t score_cutoff) 13 | { 14 | if (reference_dist > score_cutoff) reference_dist = score_cutoff + 1; 15 | 16 | auto dist = rapidfuzz::osa_distance(s1, s2, score_cutoff); 17 | if (dist != reference_dist) { 18 | print_seq("s1", s1); 19 | print_seq("s2", s2); 20 | throw std::logic_error(std::string("osa distance failed (score_cutoff = ") + 21 | std::to_string(score_cutoff) + std::string(", reference_score = ") + 22 | std::to_string(reference_dist) + std::string(", score = ") + 23 | std::to_string(dist) + ")"); 24 | } 25 | } 26 | 27 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 28 | { 29 | std::basic_string s1, s2; 30 | if (!extract_strings(data, size, s1, s2)) return 0; 31 | 32 | int64_t reference_dist = rapidfuzz_reference::osa_distance(s1, s2); 33 | 34 | /* test small band */ 35 | for (int64_t i = 4; i < 32; ++i) 36 | validate_distance(reference_dist, s1, s2, i); 37 | 38 | /* unrestricted */ 39 | validate_distance(reference_dist, s1, s2, std::numeric_limits::max()); 40 | 41 | /* test long 
sequences */ 42 | for (unsigned int i = 2; i < 9; ++i) { 43 | std::basic_string s1_ = str_multiply(s1, pow(2, i)); 44 | std::basic_string s2_ = str_multiply(s2, pow(2, i)); 45 | 46 | if (s1_.size() > 10000 || s2_.size() > 10000) break; 47 | 48 | reference_dist = rapidfuzz_reference::osa_distance(s1_, s2_); 49 | validate_distance(reference_dist, s1_, s2_, std::numeric_limits::max()); 50 | } 51 | 52 | return 0; 53 | } 54 | -------------------------------------------------------------------------------- /fuzzing/fuzz_damerau_levenshtein_distance.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/DamerauLevenshtein.hpp" 5 | #include "fuzzing.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | void validate_distance(int64_t reference_dist, const std::basic_string& s1, 12 | const std::basic_string& s2, int64_t score_cutoff) 13 | { 14 | if (reference_dist > score_cutoff) reference_dist = score_cutoff + 1; 15 | 16 | auto dist = rapidfuzz::experimental::damerau_levenshtein_distance(s1, s2, score_cutoff); 17 | if (dist != reference_dist) { 18 | print_seq("s1", s1); 19 | print_seq("s2", s2); 20 | throw std::logic_error(std::string("osa distance failed (score_cutoff = ") + 21 | std::to_string(score_cutoff) + std::string(", reference_score = ") + 22 | std::to_string(reference_dist) + std::string(", score = ") + 23 | std::to_string(dist) + ")"); 24 | } 25 | } 26 | 27 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 28 | { 29 | std::basic_string s1, s2; 30 | if (!extract_strings(data, size, s1, s2)) return 0; 31 | 32 | int64_t reference_dist = rapidfuzz_reference::damerau_levenshtein_distance(s1, s2); 33 | 34 | /* test small band */ 35 | for (int64_t i = 4; i < 32; ++i) 36 | validate_distance(reference_dist, s1, s2, i); 37 | 38 | /* unrestricted */ 39 | validate_distance(reference_dist, 
s1, s2, std::numeric_limits::max()); 40 | 41 | /* test long sequences */ 42 | for (unsigned int i = 2; i < 9; ++i) { 43 | std::basic_string s1_ = str_multiply(s1, pow(2, i)); 44 | std::basic_string s2_ = str_multiply(s2, pow(2, i)); 45 | 46 | if (s1_.size() > 10000 || s2_.size() > 10000) break; 47 | 48 | reference_dist = rapidfuzz_reference::damerau_levenshtein_distance(s1_, s2_); 49 | validate_distance(reference_dist, s1_, s2_, std::numeric_limits::max()); 50 | } 51 | 52 | return 0; 53 | } 54 | -------------------------------------------------------------------------------- /fuzzing/fuzz_lcs_similarity.cpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #include "../rapidfuzz_reference/LCSseq.hpp" 5 | #include "fuzzing.hpp" 6 | #include "rapidfuzz/details/Range.hpp" 7 | #include 8 | #include 9 | #include 10 | 11 | template 12 | void validate_simd(const std::basic_string& s1, const std::basic_string& s2) 13 | { 14 | #ifdef RAPIDFUZZ_SIMD 15 | size_t count = s1.size() / MaxLen + ((s1.size() % MaxLen) != 0); 16 | rapidfuzz::experimental::MultiLCSseq scorer(count); 17 | 18 | std::vector> strings; 19 | 20 | for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) { 21 | if (std::distance(it1, s1.end()) < static_cast(MaxLen)) { 22 | strings.emplace_back(it1, s1.end()); 23 | break; 24 | } 25 | else { 26 | strings.emplace_back(it1, it1 + MaxLen); 27 | } 28 | } 29 | 30 | for (const auto& s : strings) 31 | scorer.insert(s); 32 | 33 | std::vector simd_results(scorer.result_count()); 34 | scorer.similarity(&simd_results[0], simd_results.size(), s2); 35 | 36 | for (size_t i = 0; i < strings.size(); ++i) { 37 | int64_t reference_score = rapidfuzz_reference::lcs_seq_similarity(strings[i], s2); 38 | if (reference_score != simd_results[i]) { 39 | print_seq("s1: ", s1); 40 | print_seq("s2: ", s2); 41 | throw std::logic_error(std::string("lcs distance using 
simd failed (score_cutoff = ") + 42 | std::string(", reference_score = ") + std::to_string(reference_score) + 43 | std::string(", score = ") + std::to_string(simd_results[i]) + ")"); 44 | } 45 | } 46 | #else 47 | (void)s1; 48 | (void)s2; 49 | #endif 50 | } 51 | 52 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 53 | { 54 | std::basic_string s1, s2; 55 | if (!extract_strings(data, size, s1, s2)) { 56 | return 0; 57 | } 58 | 59 | if (s1.size() == 0) { 60 | return 0; 61 | } 62 | 63 | validate_simd<8>(s1, s2); 64 | validate_simd<16>(s1, s2); 65 | validate_simd<32>(s1, s2); 66 | validate_simd<64>(s1, s2); 67 | 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /rapidfuzz_reference/OSA.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "common.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace rapidfuzz_reference { 14 | 15 | template 16 | Matrix osa_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) 17 | { 18 | ptrdiff_t len1 = std::distance(first1, last1); 19 | ptrdiff_t len2 = std::distance(first2, last2); 20 | 21 | Matrix matrix(static_cast(len1) + 1, static_cast(len2) + 1); 22 | 23 | for (ptrdiff_t i = 0; i <= len1; ++i) 24 | matrix(static_cast(i), 0) = i; 25 | for (ptrdiff_t i = 0; i <= len2; ++i) 26 | matrix(0, static_cast(i)) = i; 27 | 28 | for (ptrdiff_t pos1 = 0; pos1 < len1; ++pos1) { 29 | for (ptrdiff_t pos2 = 0; pos2 < len2; ++pos2) { 30 | ptrdiff_t cost = (first1[pos1] == first2[pos2]) ? 
0 : 1; 31 | 32 | matrix(pos1 + 1, pos2 + 1) = 33 | std::min({matrix(pos1, pos2 + 1) + 1, matrix(pos1 + 1, pos2) + 1, matrix(pos1, pos2) + cost}); 34 | 35 | if (pos1 == 0 || pos2 == 0) continue; 36 | if (first1[pos1] != first2[pos2 - 1]) continue; 37 | if (first1[pos1 - 1] != first2[pos2]) continue; 38 | 39 | matrix(pos1 + 1, pos2 + 1) = 40 | std::min(matrix(pos1 + 1, pos2 + 1), matrix(pos1 - 1, pos2 - 1) + cost); 41 | } 42 | } 43 | 44 | return matrix; 45 | } 46 | 47 | template 48 | int64_t osa_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 49 | int64_t score_cutoff = std::numeric_limits::max()) 50 | { 51 | auto matrix = osa_matrix(first1, last1, first2, last2); 52 | int64_t dist = matrix.back(); 53 | return (dist <= score_cutoff) ? dist : score_cutoff + 1; 54 | } 55 | 56 | template 57 | int64_t osa_distance(const Sentence1& s1, const Sentence2& s2, 58 | int64_t score_cutoff = std::numeric_limits::max()) 59 | { 60 | return osa_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff); 61 | } 62 | 63 | } // namespace rapidfuzz_reference 64 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(Catch2 3 QUIET) 2 | if (Catch2_FOUND) 3 | message("Using system supplied version of Catch2") 4 | else() 5 | message("Using FetchContent to load Catch2") 6 | include(FetchContent) 7 | FetchContent_Declare( 8 | Catch2 9 | GIT_REPOSITORY https://github.com/catchorg/Catch2.git 10 | GIT_TAG v3.0.1 11 | ) 12 | FetchContent_MakeAvailable(Catch2) 13 | endif() 14 | 15 | if (RAPIDFUZZ_ENABLE_LINTERS) 16 | # include aminya & jason turner's C++ best practices recommended cmake project utilities 17 | message("Enable Linters on test build") 18 | include(FetchContent) 19 | 20 | if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20) 21 | FetchContent_Declare(_project_options URL 
https://github.com/aminya/project_options/archive/refs/tags/v0.26.2.zip) 22 | else() 23 | FetchContent_Declare(_project_options URL https://github.com/aminya/project_options/archive/refs/tags/v0.25.2.zip) 24 | endif() 25 | FetchContent_MakeAvailable(_project_options) 26 | include(${_project_options_SOURCE_DIR}/Index.cmake) 27 | 28 | project_options( 29 | # ENABLE_CACHE 30 | # ENABLE_CONAN 31 | WARNINGS_AS_ERRORS 32 | # ENABLE_CPPCHECK 33 | # ENABLE_CLANG_TIDY 34 | # ENABLE_INCLUDE_WHAT_YOU_USE 35 | # ENABLE_COVERAGE 36 | # ENABLE_PCH 37 | # PCH_HEADERS 38 | # ENABLE_DOXYGEN 39 | # ENABLE_IPO 40 | # ENABLE_USER_LINKER 41 | # ENABLE_BUILD_WITH_TIME_TRACE 42 | # ENABLE_UNITY 43 | # ENABLE_SANITIZER_ADDRESS 44 | # ENABLE_SANITIZER_LEAK 45 | # ENABLE_SANITIZER_UNDEFINED_BEHAVIOR 46 | # ENABLE_SANITIZER_THREAD 47 | # ENABLE_SANITIZER_MEMORY 48 | # CLANG_WARNINGS "-Weverything" 49 | ) 50 | endif() 51 | 52 | function(rapidfuzz_add_test test) 53 | add_executable(test_${test} tests-${test}.cpp) 54 | target_link_libraries(test_${test} ${PROJECT_NAME}) 55 | target_link_libraries(test_${test} Catch2::Catch2WithMain) 56 | if (RAPIDFUZZ_ENABLE_LINTERS) 57 | target_link_libraries(test_${test} project_warnings) 58 | endif() 59 | add_test(NAME ${test} COMMAND test_${test}) 60 | endfunction() 61 | 62 | rapidfuzz_add_test(fuzz) 63 | rapidfuzz_add_test(common) 64 | 65 | add_subdirectory(distance) 66 | -------------------------------------------------------------------------------- /rapidfuzz/details/SplittedSentenceView.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace rapidfuzz::detail { 8 | 9 | template 10 | class SplittedSentenceView { 11 | public: 12 | using CharT = iter_value_t; 13 | 14 | SplittedSentenceView(RangeVec sentence) noexcept( 15 | std::is_nothrow_move_constructible_v>) 16 | : m_sentence(std::move(sentence)) 17 | {} 18 | 19 | size_t dedupe(); 20 | 
size_t size() const; 21 | 22 | size_t length() const 23 | { 24 | return size(); 25 | } 26 | 27 | bool empty() const 28 | { 29 | return m_sentence.empty(); 30 | } 31 | 32 | size_t word_count() const 33 | { 34 | return m_sentence.size(); 35 | } 36 | 37 | std::basic_string join() const; 38 | 39 | const RangeVec& words() const 40 | { 41 | return m_sentence; 42 | } 43 | 44 | private: 45 | RangeVec m_sentence; 46 | }; 47 | 48 | template 49 | size_t SplittedSentenceView::dedupe() 50 | { 51 | size_t old_word_count = word_count(); 52 | m_sentence.erase(std::unique(m_sentence.begin(), m_sentence.end()), m_sentence.end()); 53 | return old_word_count - word_count(); 54 | } 55 | 56 | template 57 | size_t SplittedSentenceView::size() const 58 | { 59 | if (m_sentence.empty()) return 0; 60 | 61 | // there is a whitespace between each word 62 | size_t result = m_sentence.size() - 1; 63 | for (const auto& word : m_sentence) { 64 | result += static_cast(std::distance(word.begin(), word.end())); 65 | } 66 | 67 | return result; 68 | } 69 | 70 | template 71 | auto SplittedSentenceView::join() const -> std::basic_string 72 | { 73 | if (m_sentence.empty()) { 74 | return std::basic_string(); 75 | } 76 | 77 | auto sentence_iter = m_sentence.begin(); 78 | std::basic_string joined(sentence_iter->begin(), sentence_iter->end()); 79 | const std::basic_string whitespace{0x20}; 80 | ++sentence_iter; 81 | for (; sentence_iter != m_sentence.end(); ++sentence_iter) { 82 | joined.append(whitespace) 83 | .append(std::basic_string(sentence_iter->begin(), sentence_iter->end())); 84 | } 85 | return joined; 86 | } 87 | 88 | } // namespace rapidfuzz::detail 89 | -------------------------------------------------------------------------------- /rapidfuzz_reference/Levenshtein.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "common.hpp" 6 | #include 7 | 
#include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace rapidfuzz_reference { 14 | 15 | struct LevenshteinWeightTable { 16 | int64_t insert_cost; 17 | int64_t delete_cost; 18 | int64_t replace_cost; 19 | }; 20 | 21 | template 22 | Matrix levenshtein_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 23 | LevenshteinWeightTable weights = {1, 1, 1}) 24 | { 25 | ptrdiff_t len1 = std::distance(first1, last1); 26 | ptrdiff_t len2 = std::distance(first2, last2); 27 | 28 | Matrix matrix(static_cast(len1) + 1, static_cast(len2) + 1); 29 | 30 | for (ptrdiff_t i = 0; i <= len1; ++i) 31 | matrix(i, 0) = i * weights.delete_cost; 32 | for (ptrdiff_t i = 0; i <= len2; ++i) 33 | matrix(0, i) = i * weights.insert_cost; 34 | 35 | for (ptrdiff_t pos1 = 0; pos1 < len1; ++pos1) { 36 | for (ptrdiff_t pos2 = 0; pos2 < len2; ++pos2) { 37 | ptrdiff_t cost = (first1[pos1] == first2[pos2]) ? 0 : weights.replace_cost; 38 | 39 | matrix(pos1 + 1, pos2 + 1) = 40 | std::min({matrix(pos1, pos2 + 1) + weights.delete_cost, 41 | matrix(pos1 + 1, pos2) + weights.insert_cost, matrix(pos1, pos2) + cost}); 42 | } 43 | } 44 | 45 | return matrix; 46 | } 47 | 48 | template 49 | int64_t levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 50 | LevenshteinWeightTable weights = {1, 1, 1}, 51 | int64_t score_cutoff = std::numeric_limits::max()) 52 | { 53 | auto matrix = levenshtein_matrix(first1, last1, first2, last2, weights); 54 | int64_t dist = matrix.back(); 55 | return (dist <= score_cutoff) ? 
dist : score_cutoff + 1; 56 | } 57 | 58 | template 59 | int64_t levenshtein_distance(const Sentence1& s1, const Sentence2& s2, 60 | LevenshteinWeightTable weights = {1, 1, 1}, 61 | int64_t score_cutoff = std::numeric_limits::max()) 62 | { 63 | return levenshtein_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), weights, 64 | score_cutoff); 65 | } 66 | 67 | } // namespace rapidfuzz_reference 68 | -------------------------------------------------------------------------------- /rapidfuzz_reference/Jaro.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "common.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace rapidfuzz_reference { 15 | 16 | template 17 | double jaro_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last, 18 | double score_cutoff = 0.0) 19 | { 20 | size_t P_len = static_cast(std::distance(P_first, P_last)); 21 | size_t T_len = static_cast(std::distance(T_first, T_last)); 22 | 23 | if (!P_len || !T_len) return 0; 24 | 25 | std::vector P_flag(P_len + 1); 26 | std::vector T_flag(T_len + 1); 27 | 28 | size_t Bound = std::max(P_len, T_len) / 2; 29 | if (Bound > 0) Bound--; 30 | 31 | size_t CommonChars = 0; 32 | for (size_t i = 0; i < T_len; i++) { 33 | size_t lowlim = (i >= Bound) ? i - Bound : 0; 34 | size_t hilim = (i + Bound <= P_len - 1) ? 
(i + Bound) : P_len - 1; 35 | for (size_t j = lowlim; j <= hilim; j++) { 36 | if (!P_flag[j] && (P_first[static_cast(j)] == T_first[static_cast(i)])) { 37 | T_flag[i] = 1; 38 | P_flag[j] = 1; 39 | CommonChars++; 40 | break; 41 | } 42 | } 43 | } 44 | 45 | // Count the number of transpositions 46 | size_t Transpositions = 0; 47 | size_t k = 0; 48 | for (size_t i = 0; i < T_len; i++) { 49 | if (T_flag[i]) { 50 | size_t j = k; 51 | for (; j < P_len; j++) { 52 | if (P_flag[j]) { 53 | k = j + 1; 54 | break; 55 | } 56 | } 57 | if (T_first[static_cast(i)] != P_first[static_cast(j)]) Transpositions++; 58 | } 59 | } 60 | 61 | Transpositions /= 2; 62 | double Sim = 0; 63 | Sim += static_cast(CommonChars) / static_cast(P_len); 64 | Sim += static_cast(CommonChars) / static_cast(T_len); 65 | Sim += (static_cast(CommonChars) - static_cast(Transpositions)) / 66 | static_cast(CommonChars); 67 | Sim /= 3.0; 68 | return (Sim >= score_cutoff) ? Sim : 0; 69 | } 70 | 71 | template 72 | double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) 73 | { 74 | return jaro_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff); 75 | } 76 | 77 | } /* namespace rapidfuzz_reference */ 78 | -------------------------------------------------------------------------------- /rapidfuzz_reference/DamerauLevenshtein.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #pragma once 5 | #include "common.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace rapidfuzz_reference { 15 | 16 | template 17 | Matrix damerau_levenshtein_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2) 18 | { 19 | ptrdiff_t len1 = std::distance(first1, last1); 20 | ptrdiff_t len2 = std::distance(first2, last2); 21 | ptrdiff_t infinite = len1 + len2; 22 | 23 
| std::unordered_map da; 24 | Matrix matrix(static_cast(len1) + 2, static_cast(len2) + 2); 25 | matrix(0, 0) = infinite; 26 | 27 | for (ptrdiff_t i = 0; i <= len1; ++i) { 28 | matrix(i + 1, 0) = infinite; 29 | matrix(i + 1, 1) = i; 30 | } 31 | for (ptrdiff_t i = 0; i <= len2; ++i) { 32 | matrix(0, i + 1) = infinite; 33 | matrix(1, i + 1) = i; 34 | } 35 | 36 | for (ptrdiff_t pos1 = 0; pos1 < len1; ++pos1) { 37 | ptrdiff_t db = 0; 38 | for (ptrdiff_t pos2 = 0; pos2 < len2; ++pos2) { 39 | int64_t i1 = da[static_cast(first2[pos2])]; 40 | ptrdiff_t j1 = db; 41 | ptrdiff_t cost = 1; 42 | if (first1[pos1] == first2[pos2]) { 43 | cost = 0; 44 | db = pos2 + 1; 45 | } 46 | 47 | matrix(pos1 + 2, pos2 + 2) = 48 | std::min({matrix(pos1 + 1, pos2 + 1) + cost, matrix(pos1 + 2, pos2 + 1) + 1, 49 | matrix(pos1 + 1, pos2 + 2) + 1, matrix(i1, j1) + (pos1 - i1) + 1 + (pos2 - j1) 50 | 51 | }); 52 | } 53 | 54 | da[first1[pos1]] = pos1 + 1; 55 | } 56 | 57 | return matrix; 58 | } 59 | 60 | template 61 | int64_t damerau_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, 62 | int64_t score_cutoff = std::numeric_limits::max()) 63 | { 64 | auto matrix = damerau_levenshtein_matrix(first1, last1, first2, last2); 65 | int64_t dist = matrix.back(); 66 | return (dist <= score_cutoff) ? 
dist : score_cutoff + 1; 67 | } 68 | 69 | template 70 | int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2, 71 | int64_t score_cutoff = std::numeric_limits::max()) 72 | { 73 | return damerau_levenshtein_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), 74 | score_cutoff); 75 | } 76 | 77 | } // namespace rapidfuzz_reference 78 | -------------------------------------------------------------------------------- /bench/bench-jarowinkler.cpp: -------------------------------------------------------------------------------- 1 | #include "rapidfuzz/distance/Jaro.hpp" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | std::string generate(int max_length) 9 | { 10 | std::string possible_characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; 11 | std::random_device rd; 12 | std::mt19937 engine(rd()); 13 | std::uniform_int_distribution<> dist(0, static_cast(possible_characters.size() - 1)); 14 | std::string ret = ""; 15 | for (int i = 0; i < max_length; i++) { 16 | int random_index = dist(engine); 17 | ret += possible_characters[static_cast(random_index)]; 18 | } 19 | return ret; 20 | } 21 | 22 | template 23 | std::basic_string str_multiply(std::basic_string a, unsigned int b) 24 | { 25 | std::basic_string output; 26 | while (b--) 27 | output += a; 28 | 29 | return output; 30 | } 31 | 32 | static void BM_JaroLongSimilarSequence(benchmark::State& state) 33 | { 34 | size_t len = state.range(0); 35 | size_t score_cutoff = state.range(1); 36 | std::string s1 = std::string("a") + str_multiply(std::string("b"), (len - 2)) + std::string("a"); 37 | std::string s2 = str_multiply(std::string("b"), len); 38 | 39 | size_t num = 0; 40 | for (auto _ : state) { 41 | benchmark::DoNotOptimize(rapidfuzz::jaro_similarity(s1, s2)); 42 | ++num; 43 | } 44 | 45 | state.counters["Rate"] = benchmark::Counter(static_cast(num * len), benchmark::Counter::kIsRate); 46 | state.counters["InvRate"] = 
benchmark::Counter(static_cast(num * len), 47 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 48 | } 49 | 50 | static void BM_JaroLongNonSimilarSequence(benchmark::State& state) 51 | { 52 | size_t len = state.range(0); 53 | size_t score_cutoff = state.range(1); 54 | std::string s1 = str_multiply(std::string("a"), len); 55 | std::string s2 = str_multiply(std::string("b"), len); 56 | 57 | size_t num = 0; 58 | for (auto _ : state) { 59 | benchmark::DoNotOptimize(rapidfuzz::jaro_similarity(s1, s2)); 60 | ++num; 61 | } 62 | 63 | state.counters["Rate"] = benchmark::Counter(static_cast(num * len), benchmark::Counter::kIsRate); 64 | state.counters["InvRate"] = benchmark::Counter(static_cast(num * len), 65 | benchmark::Counter::kIsRate | benchmark::Counter::kInvert); 66 | } 67 | 68 | BENCHMARK(BM_JaroLongSimilarSequence) 69 | ->Args({100, 30}) 70 | ->Args({500, 30}) 71 | ->Args({5000, 30}) 72 | ->Args({10000, 30}) 73 | ->Args({20000, 30}) 74 | ->Args({50000, 30}); 75 | 76 | BENCHMARK(BM_JaroLongNonSimilarSequence) 77 | ->Args({100, 30}) 78 | ->Args({500, 30}) 79 | ->Args({5000, 30}) 80 | ->Args({10000, 30}) 81 | ->Args({20000, 30}) 82 | ->Args({50000, 30}); 83 | 84 | BENCHMARK_MAIN(); -------------------------------------------------------------------------------- /rapidfuzz/details/common.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2021 Max Bachmann */ 3 | 4 | #pragma once 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace rapidfuzz::detail { 17 | 18 | template 19 | struct DecomposedSet { 20 | SplittedSentenceView difference_ab; 21 | SplittedSentenceView difference_ba; 22 | SplittedSentenceView intersection; 23 | DecomposedSet(SplittedSentenceView diff_ab, SplittedSentenceView diff_ba, 24 | SplittedSentenceView intersect) 25 | : 
difference_ab(std::move(diff_ab)), 26 | difference_ba(std::move(diff_ba)), 27 | intersection(std::move(intersect)) 28 | {} 29 | }; 30 | 31 | /** 32 | * @defgroup Common Common 33 | * Common utilities shared among multiple functions 34 | * @{ 35 | */ 36 | 37 | static inline double NormSim_to_NormDist(double score_cutoff, double imprecision = 0.00001) 38 | { 39 | return std::min(1.0, 1.0 - score_cutoff + imprecision); 40 | } 41 | 42 | template 43 | DecomposedSet set_decomposition(SplittedSentenceView a, 44 | SplittedSentenceView b); 45 | 46 | constexpr double result_cutoff(double result, double score_cutoff) 47 | { 48 | return (result >= score_cutoff) ? result : 0; 49 | } 50 | 51 | template 52 | constexpr double norm_distance(int64_t dist, int64_t lensum, double score_cutoff = 0) 53 | { 54 | double max = static_cast(Max); 55 | return result_cutoff((lensum > 0) ? (max - max * static_cast(dist) / static_cast(lensum)) 56 | : max, 57 | score_cutoff); 58 | } 59 | 60 | template 61 | static inline int64_t score_cutoff_to_distance(double score_cutoff, int64_t lensum) 62 | { 63 | return static_cast(std::ceil(static_cast(lensum) * (1.0 - score_cutoff / Max))); 64 | } 65 | 66 | template 67 | StringAffix remove_common_affix(Range& s1, Range& s2); 68 | 69 | template 70 | size_t remove_common_prefix(Range& s1, Range& s2); 71 | 72 | template 73 | size_t remove_common_suffix(Range& s1, Range& s2); 74 | 75 | template > 76 | SplittedSentenceView sorted_split(InputIt first, InputIt last); 77 | 78 | /**@}*/ 79 | 80 | } // namespace rapidfuzz::detail 81 | 82 | #include 83 | -------------------------------------------------------------------------------- /rapidfuzz/distance/Indel_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace rapidfuzz::detail { 12 | 13 | 
/* Indel distance derived from an LCS similarity. maximum = s1.size() + s2.size(); lcs_seq_similarity is called with the cutoff max(0, maximum / 2 - score_cutoff); dist = maximum - 2 * lcs_sim. Returns dist when dist <= score_cutoff, otherwise score_cutoff + 1. */ template 14 | int64_t indel_distance(const BlockPatternMatchVector& block, Range s1, Range s2, 15 | int64_t score_cutoff) 16 | { 17 | int64_t maximum = s1.size() + s2.size(); 18 | int64_t lcs_cutoff = std::max(0, maximum / 2 - score_cutoff); 19 | int64_t lcs_sim = lcs_seq_similarity(block, s1, s2, lcs_cutoff); 20 | int64_t dist = maximum - 2 * lcs_sim; 21 | return (dist <= score_cutoff) ? dist : score_cutoff + 1; 22 | } 23 | 24 | /* Normalized Indel distance. Converts score_cutoff into an absolute cutoff ceil(maximum * score_cutoff), calls indel_distance with it, and divides the result by maximum (0.0 when maximum is 0). Returns the normalized value when it is <= score_cutoff, otherwise 1.0. */ template 25 | double indel_normalized_distance(const BlockPatternMatchVector& block, Range s1, Range s2, 26 | double score_cutoff) 27 | { 28 | int64_t maximum = s1.size() + s2.size(); 29 | int64_t cutoff_distance = static_cast(std::ceil(static_cast(maximum) * score_cutoff)); 30 | int64_t dist = indel_distance(block, s1, s2, cutoff_distance); 31 | double norm_dist = (maximum) ? static_cast(dist) / static_cast(maximum) : 0.0; 32 | return (norm_dist <= score_cutoff) ? norm_dist : 1.0; 33 | } 34 | 35 | /* Normalized Indel similarity: 1.0 - indel_normalized_distance, where the distance cutoff is obtained from score_cutoff via NormSim_to_NormDist. Returns the similarity when it is >= score_cutoff, otherwise 0.0. */ template 36 | double indel_normalized_similarity(const BlockPatternMatchVector& block, Range s1, 37 | Range s2, double score_cutoff) 38 | { 39 | double cutoff_score = NormSim_to_NormDist(score_cutoff); 40 | double norm_dist = indel_normalized_distance(block, s1, s2, cutoff_score); 41 | double norm_sim = 1.0 - norm_dist; 42 | return (norm_sim >= score_cutoff) ? 
norm_sim : 0.0; 43 | } 44 | 45 | /* Indel metric class deriving from DistanceBase. maximum() returns s1.size() + s2.size(). _distance() turns score_cutoff and score_hint into LCS bounds max(0, maximum / 2 - value), calls LCSseq::similarity, computes maximum - 2 * lcs_sim, and returns that distance when it is <= score_cutoff, otherwise score_cutoff + 1. */ class Indel : public DistanceBase::max()> { 46 | friend DistanceBase::max()>; 47 | friend NormalizedMetricBase; 48 | 49 | template 50 | static int64_t maximum(Range s1, Range s2) 51 | { 52 | return s1.size() + s2.size(); 53 | } 54 | 55 | template 56 | static int64_t _distance(Range s1, Range s2, int64_t score_cutoff, int64_t score_hint) 57 | { 58 | int64_t maximum = Indel::maximum(s1, s2); 59 | int64_t lcs_cutoff = std::max(0, maximum / 2 - score_cutoff); 60 | int64_t lcs_hint = std::max(0, maximum / 2 - score_hint); 61 | int64_t lcs_sim = LCSseq::similarity(s1, s2, lcs_cutoff, lcs_hint); 62 | int64_t dist = maximum - 2 * lcs_sim; 63 | return (dist <= score_cutoff) ? dist : score_cutoff + 1; 64 | } 65 | }; 66 | 67 | } // namespace rapidfuzz::detail 68 | -------------------------------------------------------------------------------- /rapidfuzz/distance/JaroWinkler_impl.hpp: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: MIT */ 2 | /* Copyright © 2022-present Max Bachmann */ 3 | 4 | #include 5 | 6 | namespace rapidfuzz::detail { 7 | 8 | template 9 | double jaro_winkler_similarity(Range P, Range T, double prefix_weight, 10 | double score_cutoff) 11 | { 12 | int64_t P_len = P.size(); 13 | int64_t T_len = T.size(); 14 | int64_t min_len = std::min(P_len, T_len); 15 | int64_t prefix = 0; 16 | int64_t max_prefix = std::min