├── .gitignore ├── CMakeLists.txt ├── README.md ├── SymSpell.vcxproj ├── SymSpellCpp.sln ├── SymSpellTest.cpp ├── SymSpellTest.vcxproj ├── data └── frequency_dictionary_en_82_765.txt ├── include ├── Helpers.h └── SymSpell.h └── src └── SymSpell.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .vscode/ -------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
1 | # cmake_minimum_required must be the first command, before project() (CMake requirement)
2 | cmake_minimum_required(VERSION 2.8)
3 | project(symspell)
4 | include_directories(include)
5 |
6 | set(CMAKE_CXX_FLAGS "-O3 -g3")
7 |
8 | set(CMAKE_CXX_STANDARD 17)
9 |
10 | add_library(symspell src/SymSpell.cpp)
11 | # link the library target instead of compiling src/SymSpell.cpp a second time into the test binary
12 | add_executable(symspelltest SymSpellTest.cpp)
13 | target_link_libraries(symspelltest symspell)
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | This project is a C++ port from https://github.com/wolfgarbe/SymSpell v6.5
2 |
3 | Please note that you can disable/enable unicode support by the macro #define UNICODE_SUPPORT in the header SymSpell.h
4 |
5 | Tested with English and Japanese and this version produces results similar to the original C# code
-------------------------------------------------------------------------------- /SymSpell.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 15.0 23 | {0329E929-BF68-4864-A4B4-2D151C0156D4} 24 | SymSpell 25 | 10.0.17763.0 26 | 27 | 28 | 29 | StaticLibrary 30 | true 31 | v141 32 | 33 | 34 | 35 | 36 | StaticLibrary 37 | false 38 | v141 39 | true 40 | 41 | 42 | 43 | 44 | StaticLibrary 45 | true 46 | v141 47 | 48 | 49 | 50 | 51 | StaticLibrary 52 | false 53 | v141 54 | true 55 | 56 | 57 | 58 | 
59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | $(SolutionDir)$(Configuration)\ 78 | $(SolutionDir)$(Configuration)\temp\ 79 | 80 | 81 | $(SolutionDir)$(Configuration)\ 82 | $(SolutionDir)$(Configuration)\temp\ 83 | 84 | 85 | $(SolutionDir)$(Configuration)\temp\ 86 | 87 | 88 | $(SolutionDir)$(Configuration)\temp\ 89 | 90 | 91 | 92 | Level3 93 | Disabled 94 | true 95 | true 96 | include;C:\Program Files (x86)\Visual Leak Detector\include;%(AdditionalIncludeDirectories) 97 | 98 | 99 | 100 | 101 | Level3 102 | Disabled 103 | true 104 | true 105 | include;C:\Program Files (x86)\Visual Leak Detector\include;%(AdditionalIncludeDirectories) 106 | 107 | 108 | 109 | 110 | Level3 111 | MaxSpeed 112 | true 113 | true 114 | true 115 | true 116 | include;C:\Program Files (x86)\Visual Leak Detector\include;%(AdditionalIncludeDirectories) 117 | 118 | 119 | true 120 | true 121 | 122 | 123 | 124 | 125 | Level3 126 | MaxSpeed 127 | true 128 | true 129 | true 130 | true 131 | include;C:\Program Files (x86)\Visual Leak Detector\include;%(AdditionalIncludeDirectories) 132 | 133 | 134 | true 135 | true 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | true 148 | true 149 | true 150 | true 151 | true 152 | 153 | 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /SymSpellCpp.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.28307.1062 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SymSpell", "SymSpell.vcxproj", "{0329E929-BF68-4864-A4B4-2D151C0156D4}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SymSpellTest", "SymSpellTest.vcxproj", "{1ADF0935-D2BA-402D-8106-388F3701C98E}" 9 | ProjectSection(ProjectDependencies) = postProject 10 
| {0329E929-BF68-4864-A4B4-2D151C0156D4} = {0329E929-BF68-4864-A4B4-2D151C0156D4} 11 | EndProjectSection 12 | EndProject 13 | Global 14 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 15 | Debug|x64 = Debug|x64 16 | Debug|x86 = Debug|x86 17 | Release|x64 = Release|x64 18 | Release|x86 = Release|x86 19 | EndGlobalSection 20 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 21 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Debug|x64.ActiveCfg = Debug|x64 22 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Debug|x64.Build.0 = Debug|x64 23 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Debug|x86.ActiveCfg = Debug|Win32 24 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Debug|x86.Build.0 = Debug|Win32 25 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Release|x64.ActiveCfg = Release|x64 26 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Release|x64.Build.0 = Release|x64 27 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Release|x86.ActiveCfg = Release|Win32 28 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Release|x86.Build.0 = Release|Win32 29 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Debug|x64.ActiveCfg = Debug|x64 30 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Debug|x64.Build.0 = Debug|x64 31 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Debug|x86.ActiveCfg = Debug|Win32 32 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Debug|x86.Build.0 = Debug|Win32 33 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Release|x64.ActiveCfg = Release|x64 34 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Release|x64.Build.0 = Release|x64 35 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Release|x86.ActiveCfg = Release|Win32 36 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Release|x86.Build.0 = Release|Win32 37 | EndGlobalSection 38 | GlobalSection(SolutionProperties) = preSolution 39 | HideSolutionNode = FALSE 40 | EndGlobalSection 41 | GlobalSection(ExtensibilityGlobals) = postSolution 42 | SolutionGuid = {FBE80B29-1094-4BB4-9FEB-44818F6CEC08} 43 | EndGlobalSection 44 | EndGlobal 45 | 
-------------------------------------------------------------------------------- /SymSpellTest.cpp: -------------------------------------------------------------------------------- 1 | // Test.cpp : This file contains the 'main' function. Program execution begins and ends there. 2 | #include 3 | 4 | #include 5 | #include 6 | // #include 7 | 8 | #ifdef UNICODE_SUPPORT 9 | 10 | void TestJapanese() 11 | { 12 | _setmode(_fileno(stdout), _O_U16TEXT); 13 | 14 | const int initialCapacity = 1000; 15 | const int maxEditDistance = 2; 16 | const int prefixLength = 5; 17 | SymSpell symSpell(initialCapacity, maxEditDistance, prefixLength); 18 | int start = clock(); 19 | symSpell.LoadDictionary(XL("frequency_jp_names.txt"), 0, 1, XL('\t')); 20 | int end = clock(); 21 | float time = (float)((end - start) / (CLOCKS_PER_SEC / 1000)); 22 | xcout << XL("Library loaded: ") << time << XL(" ms") << endl; 23 | 24 | xcout << XL("-------Testing Japanese word segmentation-------") << endl; 25 | vector sentences = { XL("�n�ӓc��"), XL("�—ѓc��"), XL("�n�瑽��") }; 26 | for (int i = 0; i < sentences.size(); i++) 27 | { 28 | Info results = symSpell.WordSegmentation(sentences[i]); 29 | xcout << sentences[i] << XL(" -> ") << results.getSegmented() << XL(" -> ") << results.getCorrected() << endl; 30 | } 31 | 32 | xcout << XL("-------Testing Japanese word correction-------") << endl; 33 | sentences = { XL("�j��"), XL("�c��") }; 34 | for (int i = 0; i < sentences.size(); i++) 35 | { 36 | vector results = symSpell.Lookup(sentences[i], Verbosity::Closest); 37 | xcout << sentences[i] << XL(" -> ") << results[0].term << endl; 38 | } 39 | } 40 | 41 | void main() 42 | { 43 | TestJapanese(); 44 | } 45 | 46 | #else 47 | 48 | void TestEnglish() 49 | { 50 | const int initialCapacity = 82765; 51 | const int maxEditDistance = 2; 52 | const int prefixLength = 3; 53 | SymSpell symSpell(initialCapacity, maxEditDistance, prefixLength); 54 | int start = clock(); 55 | string corpus_path = 
"../data/frequency_dictionary_en_82_765.txt"; 56 | symSpell.LoadDictionary(corpus_path, 0, 1, XL(' ')); 57 | int end = clock(); 58 | float time = (float)((end - start) / (CLOCKS_PER_SEC / 1000)); 59 | xcout << XL("Library loaded: ") << time << XL(" ms") << endl; 60 | 61 | xcout << XL("-------Testing English word segmentation-------") << endl; 62 | vector sentences = { XL("thequickbrownfoxjumpsoverthelazydog"), 63 | XL("itwasabrightcolddayinaprilandtheclockswerestrikingthirteen"), 64 | XL("itwasthebestoftimesitwastheworstoftimesitwastheageofwisdomitwastheageoffoolishness") }; 65 | for (int i = 0; i < sentences.size(); i++) 66 | { 67 | Info results = symSpell.WordSegmentation(sentences[i]); 68 | xcout << sentences[i] << XL(" -> ") << results.getCorrected() << endl; 69 | } 70 | 71 | xcout << XL("-------Testing English word correction-------") << endl; 72 | sentences = { XL("tke"), XL("abolution"), XL("intermedaite") }; 73 | for (int i = 0; i < sentences.size(); i++) 74 | { 75 | vector results = symSpell.Lookup(sentences[i], Verbosity::Closest); 76 | xcout << sentences[i] << XL(" -> ") << results[0].term << endl; 77 | } 78 | 79 | xcout << XL("-------Testing English compound correction-------") << endl; 80 | sentences = { XL("whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him"), 81 | XL("in te dhird qarter oflast jear he hadlearned ofca sekretplan"), 82 | XL("the bigjest playrs in te strogsommer film slatew ith plety of funn") }; 83 | for (int i = 0; i < sentences.size(); i++) 84 | { 85 | vector results = symSpell.LookupCompound(sentences[i]); 86 | xcout << sentences[i] << XL(" -> ") << results[0].term << endl; 87 | } 88 | } 89 | 90 | int main() 91 | { 92 | TestEnglish(); 93 | return 0; 94 | } 95 | 96 | #endif 97 | 98 | 99 | -------------------------------------------------------------------------------- /SymSpellTest.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 
| Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | 24 | 25 | 15.0 26 | {1ADF0935-D2BA-402D-8106-388F3701C98E} 27 | Win32Proj 28 | Test 29 | 10.0.17763.0 30 | 31 | 32 | 33 | Application 34 | true 35 | v141 36 | Unicode 37 | 38 | 39 | Application 40 | false 41 | v141 42 | true 43 | Unicode 44 | 45 | 46 | Application 47 | true 48 | v141 49 | Unicode 50 | 51 | 52 | Application 53 | false 54 | v141 55 | true 56 | Unicode 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | false 78 | $(LibraryPath) 79 | $(SolutionDir)$(Configuration)\ 80 | $(SolutionDir)$(Configuration)\temp\ 81 | 82 | 83 | true 84 | $(SolutionDir)$(Configuration)\temp\ 85 | 86 | 87 | true 88 | $(SolutionDir)$(Configuration)\ 89 | $(SolutionDir)$(Configuration)\temp\ 90 | 91 | 92 | false 93 | $(SolutionDir)$(Configuration)\temp\ 94 | 95 | 96 | 97 | 98 | 99 | Level3 100 | MaxSpeed 101 | true 102 | true 103 | true 104 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 105 | true 106 | include; 107 | 108 | 109 | Console 110 | true 111 | true 112 | true 113 | $(SolutionDir)$(Configuration)\ 114 | SymSpell.lib 115 | 116 | 117 | 118 | 119 | 120 | 121 | Level3 122 | Disabled 123 | true 124 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 125 | true 126 | include; 127 | 128 | 129 | Console 130 | true 131 | $(SolutionDir)$(Configuration)\ 132 | SymSpell.lib 133 | 134 | 135 | 136 | 137 | 138 | 139 | Level3 140 | Disabled 141 | true 142 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 143 | true 144 | include; 145 | 146 | 147 | Console 148 | true 149 | $(SolutionDir)$(Configuration)\ 150 | SymSpell.lib 151 | 152 | 153 | 154 | 155 | 156 | 157 | Level3 158 | MaxSpeed 159 | true 160 | true 161 | true 162 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 163 | true 164 | include; 165 | 166 | 167 | Console 168 | true 169 | true 170 | true 171 | $(SolutionDir)$(Configuration)\ 172 | 
SymSpell.lib 173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /include/Helpers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef _WIN32 4 | # ifdef LIBRARY_EXPORTS 5 | # define LIBRARY_API __declspec(dllexport) 6 | # else 7 | # define LIBRARY_API __declspec(dllimport) 8 | # endif 9 | #else 10 | # define LIBRARY_API 11 | #endif 12 | 13 | #define Dictionary unordered_map 14 | #define HashSet unordered_set 15 | #ifdef UNICODE_SUPPORT 16 | # define xstring wstring 17 | # define xchar wchar_t 18 | # define xifstream wifstream 19 | # define xstringstream wstringstream 20 | # define isxspace iswspace 21 | # define xregex wregex 22 | # define xsmatch wsmatch 23 | # define to_xstring to_wstring 24 | # define XL(x) L##x 25 | # define xcout wcout 26 | #else 27 | # define xstring string 28 | # define xchar char 29 | # define xifstream ifstream 30 | # define xstringstream stringstream 31 | # define isxspace isspace 32 | # define xregex regex 33 | # define xsmatch smatch 34 | # define to_xstring to_string 35 | # define XL(x) x 36 | # define xcout cout 37 | #endif 38 | 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | using namespace std; 47 | 48 | 49 | // A growable list of elements that's optimized to support adds, but not deletes, 50 | // of large numbers of elements, storing data in a way that's friendly to the garbage 51 | // collector (not backed by a monolithic array object), and can grow without needing 52 | // to copy the entire backing array contents from the old backing array to the new. 
53 | template <class T> // NOTE(review): extraction stripped every <...> sequence; template syntax restored throughout this block
54 | class ChunkArray
55 | {
56 | private:
57 | const int ChunkSize = 4096; //this must be a power of 2, otherwise can't optimize Row and Col functions
58 | const int DivShift = 12; // number of bits to shift right to do division by ChunkSize (the bit position of ChunkSize)
59 | int Row(unsigned int index) { return index >> DivShift; } // same as index / ChunkSize
60 | int Col(unsigned int index) { return index & (ChunkSize - 1); } //same as index % ChunkSize
61 | int Capacity() { return Values.size() * ChunkSize; }
62 |
63 | public:
64 | vector<vector<T>> Values;
65 | int Count;
66 |
67 | ChunkArray()
68 | {
69 | Count = 0;
70 | }
71 |
72 | void Reserve(int initialCapacity)
73 | {
74 | int chunks = (initialCapacity + ChunkSize - 1) / ChunkSize;
75 | Values.resize(chunks);
76 | for (int i = 0; i < chunks; ++i)
77 | {
78 | Values[i].resize(ChunkSize);
79 | }
80 | }
81 |
82 | int Add(T &value)
83 | {
84 | if (Count == Capacity())
85 | {
86 | Values.push_back(vector<T>());
87 | Values[Values.size() - 1].resize(ChunkSize);
88 | }
89 |
90 | int row = Row(Count);
91 | int col = Col(Count);
92 |
93 | Values[row][col] = value;
94 | return Count++;
95 | }
96 |
97 | void Clear()
98 | {
99 | Count = 0;
100 | }
101 |
102 | T& At(unsigned int index)
103 | {
104 | return Values[Row(index)][Col(index)];
105 | }
106 |
107 | void Set(unsigned int index, T &value)
108 | {
109 | Values[Row(index)][Col(index)] = value;
110 | }
111 | };
112 |
113 | class Helpers {
114 | public:
115 | /// Determines the proper return value of an edit distance function when one or
116 | /// both strings are null.
117 | static int NullDistanceResults(xstring string1, xstring string2, double maxDistance) {
118 | if (string1.empty())
119 | return (string2.empty()) ? 0 : (string2.size() <= maxDistance) ? string2.size() : -1;
120 | return (string1.size() <= maxDistance) ? 
string1.size() : -1; 121 | } 122 | 123 | /// Determines the proper return value of an similarity function when one or 124 | /// both strings are null. 125 | static int NullSimilarityResults(xstring string1, xstring string2, double minSimilarity) { 126 | return (string1.empty() && string2.empty()) ? 1 : (0 <= minSimilarity) ? 0 : -1; 127 | } 128 | 129 | /// Calculates starting position and lengths of two strings such that common 130 | /// prefix and suffix substrings are excluded. 131 | /// Expects string1.size() to be less than or equal to string2.size() 132 | static void PrefixSuffixPrep(xstring string1, xstring string2, int &len1, int &len2, int &start) { 133 | len2 = string2.size(); 134 | len1 = string1.size(); // this is also the minimum length of the two strings 135 | // suffix common to both strings can be ignored 136 | while (len1 != 0 && string1[len1 - 1] == string2[len2 - 1]) { 137 | len1 = len1 - 1; len2 = len2 - 1; 138 | } 139 | // prefix common to both strings can be ignored 140 | start = 0; 141 | while (start != len1 && string1[start] == string2[start]) start++; 142 | if (start != 0) { 143 | len2 -= start; // length of the part excluding common prefix and suffix 144 | len1 -= start; 145 | } 146 | } 147 | 148 | /// Calculate a similarity measure from an edit distance. 149 | /// The length of the longer of the two strings the edit distance is from. 150 | /// The edit distance between two strings. 151 | /// A similarity value from 0 to 1.0 (1 - (length / distance)). 152 | static double ToSimilarity(int distance, int length) { 153 | return (distance < 0) ? -1 : 1 - (distance / (double)length); 154 | } 155 | 156 | /// Calculate an edit distance from a similarity measure. 157 | /// The length of the longer of the two strings the edit distance is from. 158 | /// The similarity measure between two strings. 159 | /// An edit distance from 0 to length (length * (1 - similarity)). 
160 | static int ToDistance(double similarity, int length) { 161 | return (int)((length * (1 - similarity)) + .0000000001); 162 | } 163 | 164 | /// 165 | /// CompareTo of two intergers 166 | /// 167 | static int CompareTo(int64_t mainValue, int64_t compareValue) 168 | { 169 | if (mainValue == compareValue) 170 | return 0; 171 | else if (mainValue > compareValue) 172 | return 1; 173 | else 174 | return -1; 175 | } 176 | }; 177 | 178 | 179 | /// Types implementing the IDistance interface provide methods 180 | /// for computing a relative distance between two strings. 181 | class IDistance { 182 | public: 183 | /// Return a measure of the distance between two strings. 184 | /// One of the strings to compare. 185 | /// The other string to compare. 186 | /// 0 if the strings are equivalent, otherwise a positive number whose 187 | /// magnitude increases as difference between the strings increases. 188 | virtual double Distance(xstring string1, xstring string2) = 0; 189 | 190 | /// Return a measure of the distance between two strings. 191 | /// One of the strings to compare. 192 | /// The other string to compare. 193 | /// The maximum distance that is of interest. 194 | /// -1 if the distance is greater than the maxDistance, 0 if the strings 195 | /// are equivalent, otherwise a positive number whose magnitude increases as 196 | /// difference between the strings increases. 197 | virtual double Distance(xstring string1, xstring string2, double maxDistance) = 0; 198 | }; 199 | 200 | 201 | /// Types implementing the ISimilarity interface provide methods 202 | /// for computing a normalized value of similarity between two strings. 203 | class ISimilarity { 204 | public: 205 | /// Return a measure of the similarity between two strings. 206 | /// One of the strings to compare. 207 | /// The other string to compare. 208 | /// The degree of similarity 0 to 1.0, where 0 represents a lack of any 209 | /// notable similarity, and 1 represents equivalent strings. 
210 | virtual double Similarity(xstring string1, xstring string2) = 0; 211 | 212 | /// Return a measure of the similarity between two strings. 213 | /// One of the strings to compare. 214 | /// The other string to compare. 215 | /// The minimum similarity that is of interest. 216 | /// The degree of similarity 0 to 1.0, where -1 represents a similarity 217 | /// lower than minSimilarity, otherwise, a number between 0 and 1.0 where 0 218 | /// represents a lack of any notable similarity, and 1 represents equivalent 219 | /// strings. 220 | virtual double Similarity(xstring string1, xstring string2, double minSimilarity) = 0; 221 | }; 222 | 223 | /// 224 | /// Class providing optimized methods for computing Damerau-Levenshtein Optimal string 225 | /// Alignment (OSA) comparisons between two strings. 226 | /// 227 | /// 228 | /// Copyright ©2015-2018 SoftWx, Inc. 229 | /// The inspiration for creating highly optimized edit distance functions was 230 | /// from Sten Hjelmqvist's "Fast, memory efficient" algorithm, described at 231 | /// http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm 232 | /// The Damerau-Levenshtein algorithm is basically the Levenshtein algorithm with a 233 | /// modification that considers transposition of two adjacent characters as a single edit. 234 | /// The optimized algorithm was described in detail in my post at 235 | /// http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html 236 | /// Also see http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance 237 | /// Note that this implementation of Damerau-Levenshtein is the simpler and faster optimal 238 | /// string alignment (aka restricted edit) distance that difers slightly from the classic 239 | /// algorithm by imposing the restriction that no substring is edited more than once. 
So, 240 | /// for example, "CA" to "ABC" has an edit distance of 2 by a complete application of 241 | /// Damerau-Levenshtein, but has a distance of 3 by the method implemented here, that uses 242 | /// the optimal string alignment algorithm. This means that this algorithm is not a true 243 | /// metric since it does not uphold the triangle inequality. In real use though, this OSA 244 | /// version may be desired. Besides being faster, it does not give the lower distance score 245 | /// for transpositions that occur across long distances. Actual human error transpositions 246 | /// are most likely for adjacent characters. For example, the classic Damerau algorithm 247 | /// gives a distance of 1 for these two strings: "sated" and "dates" (it counts the 's' and 248 | /// 'd' as a single transposition. The optimal string alignment version of Damerau in this 249 | /// class gives a distance of 2 for these two strings (2 substitutions), as it only counts 250 | /// transpositions for adjacent characters. 251 | /// The methods in this class are not threadsafe. Use the static versions in the Distance 252 | /// class if that is required. 253 | class DamerauOSA : public IDistance { 254 | private: 255 | vector baseChar1Costs; 256 | vector basePrevChar1Costs; 257 | 258 | public: 259 | /// Create a new instance of DamerauOSA. 260 | DamerauOSA() { 261 | } 262 | 263 | /// Create a new instance of DamerauOSA using the specified expected 264 | /// maximum string length that will be encountered. 265 | /// By specifying the max expected string length, better memory efficiency 266 | /// can be achieved. 267 | /// The expected maximum length of strings that will 268 | /// be passed to the edit distance functions. 
269 | DamerauOSA(int expectedMaxstringLength) { 270 | if (expectedMaxstringLength <= 0) throw "expectedMaxstringLength must be larger than 0"; 271 | this->baseChar1Costs = vector(expectedMaxstringLength, 0); 272 | this->basePrevChar1Costs = vector(expectedMaxstringLength, 0); 273 | } 274 | 275 | /// Compute and return the Damerau-Levenshtein optimal string 276 | /// alignment edit distance between two strings. 277 | /// https://github.com/softwx/SoftWx.Match 278 | /// This method is not threadsafe. 279 | /// One of the strings to compare. 280 | /// The other string to compare. 281 | /// 0 if the strings are equivalent, otherwise a positive number whose 282 | /// magnitude increases as difference between the strings increases. 283 | double Distance(xstring string1, xstring string2) { 284 | if (string1.empty()) return string2.size(); 285 | if (string2.empty()) return string1.size(); 286 | 287 | // if strings of different lengths, ensure shorter string is in string1. This can result in a little 288 | // faster speed by spending more time spinning just the inner loop during the main processing. 289 | if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; } 290 | 291 | // identify common suffix and/or prefix that can be ignored 292 | int len1, len2, start; 293 | Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start); 294 | if (len1 == 0) return len2; 295 | 296 | if (len2 > this->baseChar1Costs.size()) { 297 | this->baseChar1Costs = vector(len2, 0); 298 | this->basePrevChar1Costs = vector(len2, 0); 299 | } 300 | return Distance(string1, string2, len1, len2, start, this->baseChar1Costs, this->basePrevChar1Costs); 301 | } 302 | 303 | /// Compute and return the Damerau-Levenshtein optimal string 304 | /// alignment edit distance between two strings. 305 | /// https://github.com/softwx/SoftWx.Match 306 | /// This method is not threadsafe. 307 | /// One of the strings to compare. 308 | /// The other string to compare. 
309 | /// The maximum distance that is of interest. 310 | /// -1 if the distance is greater than the maxDistance, 0 if the strings 311 | /// are equivalent, otherwise a positive number whose magnitude increases as 312 | /// difference between the strings increases. 313 | double Distance(xstring string1, xstring string2, double maxDistance) { 314 | if (string1.empty() || string2.empty()) return Helpers::NullDistanceResults(string1, string2, maxDistance); 315 | if (maxDistance <= 0) return (string1 == string2) ? 0 : -1; 316 | maxDistance = ceil(maxDistance); 317 | int iMaxDistance = (maxDistance <= INT_MAX) ? (int)maxDistance : INT_MAX; 318 | 319 | // if strings of different lengths, ensure shorter string is in string1. This can result in a little 320 | // faster speed by spending more time spinning just the inner loop during the main processing. 321 | if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; } 322 | if (string2.size() - string1.size() > iMaxDistance) return -1; 323 | 324 | // identify common suffix and/or prefix that can be ignored 325 | int len1, len2, start; 326 | Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start); 327 | if (len1 == 0) return (len2 <= iMaxDistance) ? len2 : -1; 328 | 329 | if (len2 > this->baseChar1Costs.size()) { 330 | this->baseChar1Costs = vector(len2, 0); 331 | this->basePrevChar1Costs = vector(len2, 0); 332 | } 333 | if (iMaxDistance < len2) { 334 | return Distance(string1, string2, len1, len2, start, iMaxDistance, this->baseChar1Costs, this->basePrevChar1Costs); 335 | } 336 | return Distance(string1, string2, len1, len2, start, this->baseChar1Costs, this->basePrevChar1Costs); 337 | } 338 | 339 | /// Return Damerau-Levenshtein optimal string alignment similarity 340 | /// between two strings (1 - (damerau distance / len of longer string)). 341 | /// One of the strings to compare. 342 | /// The other string to compare. 
343 | /// The degree of similarity 0 to 1.0, where 0 represents a lack of any 344 | /// noteable similarity, and 1 represents equivalent strings. 345 | double Similarity(xstring string1, xstring string2) { 346 | if (string1.empty()) return (string2.empty()) ? 1 : 0; 347 | if (string2.empty()) return 0; 348 | 349 | // if strings of different lengths, ensure shorter string is in string1. This can result in a little 350 | // faster speed by spending more time spinning just the inner loop during the main processing. 351 | if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; } 352 | 353 | // identify common suffix and/or prefix that can be ignored 354 | int len1, len2, start; 355 | Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start); 356 | if (len1 == 0) return 1.0; 357 | 358 | if (len2 > this->baseChar1Costs.size()) { 359 | this->baseChar1Costs = vector(len2, 0); 360 | this->basePrevChar1Costs = vector(len2, 0); 361 | } 362 | return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start, this->baseChar1Costs, this->basePrevChar1Costs) 363 | , string2.size()); 364 | } 365 | 366 | /// Return Damerau-Levenshtein optimal string alignment similarity 367 | /// between two strings (1 - (damerau distance / len of longer string)). 368 | /// One of the strings to compare. 369 | /// The other string to compare. 370 | /// The minimum similarity that is of interest. 371 | /// The degree of similarity 0 to 1.0, where -1 represents a similarity 372 | /// lower than minSimilarity, otherwise, a number between 0 and 1.0 where 0 373 | /// represents a lack of any noteable similarity, and 1 represents equivalent 374 | /// strings. 
375 | double Similarity(xstring string1, xstring string2, double minSimilarity) { 376 | if (minSimilarity < 0 || minSimilarity > 1) throw "minSimilarity must be in range 0 to 1.0"; 377 | if (string1.empty() || string2.empty()) return Helpers::NullSimilarityResults(string1, string2, minSimilarity); 378 | 379 | // if strings of different lengths, ensure shorter string is in string1. This can result in a little 380 | // faster speed by spending more time spinning just the inner loop during the main processing. 381 | if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; } 382 | 383 | int iMaxDistance = Helpers::ToDistance(minSimilarity, string2.size()); 384 | if (string2.size() - string1.size() > iMaxDistance) return -1; 385 | if (iMaxDistance <= 0) return (string1 == string2) ? 1 : -1; 386 | 387 | // identify common suffix and/or prefix that can be ignored 388 | int len1, len2, start; 389 | Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start); 390 | if (len1 == 0) return 1.0; 391 | 392 | if (len2 > this->baseChar1Costs.size()) { 393 | this->baseChar1Costs = vector(len2, 0); 394 | this->basePrevChar1Costs = vector(len2, 0); 395 | } 396 | if (iMaxDistance < len2) { 397 | return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start, iMaxDistance, this->baseChar1Costs, this->basePrevChar1Costs) 398 | , string2.size()); 399 | } 400 | return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start, this->baseChar1Costs, this->basePrevChar1Costs) 401 | , string2.size()); 402 | } 403 | 404 | /// Internal implementation of the core Damerau-Levenshtein, optimal string alignment algorithm. 
405 | /// https://github.com/softwx/SoftWx.Match
406 | static int Distance(const xstring& string1, const xstring& string2, int len1, int len2, int start, vector<int>& char1Costs, vector<int>& prevChar1Costs) { // cost buffers taken by reference: pass-by-value copied both vectors on every call; results are unchanged because char1Costs is fully re-initialized below and first-row reads of prevChar1Costs are discarded by the (i != 0) guard. Matches the int[] (reference) semantics of the C# original; <int> restored (stripped by extraction).
407 | int j;
408 | for (j = 0; j < len2; j++) char1Costs[j] = j + 1;
409 | xchar char1 = XL(' ');
410 | int currentCost = 0;
411 | for (int i = 0; i < len1; ++i) {
412 | xchar prevChar1 = char1;
413 | char1 = string1[start + i];
414 | xchar char2 = XL(' ');
415 | int leftCharCost, aboveCharCost;
416 | leftCharCost = aboveCharCost = i;
417 | int nextTransCost = 0;
418 | for (j = 0; j < len2; ++j) {
419 | int thisTransCost = nextTransCost;
420 | nextTransCost = prevChar1Costs[j];
421 | prevChar1Costs[j] = currentCost = leftCharCost; // cost of diagonal (substitution)
422 | leftCharCost = char1Costs[j]; // left now equals current cost (which will be diagonal at next iteration)
423 | xchar prevChar2 = char2;
424 | char2 = string2[start + j];
425 | if (char1 != char2) {
426 | //substitution if neither of two conditions below
427 | if (aboveCharCost < currentCost) currentCost = aboveCharCost; // deletion
428 | if (leftCharCost < currentCost) currentCost = leftCharCost; // insertion
429 | ++currentCost;
430 | if ((i != 0) && (j != 0)
431 | && (char1 == prevChar2)
432 | && (prevChar1 == char2)
433 | && (thisTransCost + 1 < currentCost)) {
434 | currentCost = thisTransCost + 1; // transposition
435 | }
436 | }
437 | char1Costs[j] = aboveCharCost = currentCost;
438 | }
439 | }
440 | return currentCost;
441 | }
442 |
443 | /// Internal implementation of the core Damerau-Levenshtein, optimal string alignment algorithm
444 | /// that accepts a maxDistance.
445 | static int Distance(const xstring& string1, const xstring& string2, int len1, int len2, int start, int maxDistance, vector<int>& char1Costs, vector<int>& prevChar1Costs) { // scratch cost buffers by reference (pass-by-value copied both vectors per call); stale cells from a prior call cannot affect the result: char1Costs is fully re-initialized below, and the one unwritten prevChar1Costs cell entering each row's window is only read on the row's last iteration, where its value is never consumed. Matches the C# reference's array semantics; <int> restored.
446 | int i, j;
447 | //for (j = 0; j < maxDistance; j++) char1Costs[j] = j+1;
448 | for (j = 0; j < maxDistance; j++)
449 | char1Costs[j] = j + 1;
450 | for (; j < len2;) char1Costs[j++] = maxDistance + 1;
451 | int lenDiff = len2 - len1;
452 | int jStartOffset = maxDistance - lenDiff;
453 | int jStart = 0;
454 | int jEnd = maxDistance;
455 | xchar char1 = XL(' ');
456 | int currentCost = 0;
457 | for (i = 0; i < len1; ++i) {
458 | xchar prevChar1 = char1;
459 | char1 = string1[start + i];
460 | xchar char2 = XL(' ');
461 | int leftCharCost, aboveCharCost;
462 | leftCharCost = aboveCharCost = i;
463 | int nextTransCost = 0;
464 | // no need to look beyond window of lower right diagonal - maxDistance cells (lower right diag is i - lenDiff)
465 | // and the upper left diagonal + maxDistance cells (upper left is i)
466 | jStart += (i > jStartOffset) ? 1 : 0;
467 | jEnd += (jEnd < len2) ? 
1 : 0; 468 | for (j = jStart; j < jEnd; ++j) { 469 | int thisTransCost = nextTransCost; 470 | nextTransCost = prevChar1Costs[j]; 471 | prevChar1Costs[j] = currentCost = leftCharCost; // cost on diagonal (substitution) 472 | leftCharCost = char1Costs[j]; // left now equals current cost (which will be diagonal at next iteration) 473 | xchar prevChar2 = char2; 474 | char2 = string2[start + j]; 475 | if (char1 != char2) { 476 | // substitution if neither of two conditions below 477 | if (aboveCharCost < currentCost) currentCost = aboveCharCost; // deletion 478 | if (leftCharCost < currentCost) currentCost = leftCharCost; // insertion 479 | ++currentCost; 480 | if ((i != 0) && (j != 0) 481 | && (char1 == prevChar2) 482 | && (prevChar1 == char2) 483 | && (thisTransCost + 1 < currentCost)) { 484 | currentCost = thisTransCost + 1; // transposition 485 | } 486 | } 487 | char1Costs[j] = aboveCharCost = currentCost; 488 | } 489 | if (char1Costs[i + lenDiff] > maxDistance) return -1; 490 | } 491 | return (currentCost <= maxDistance) ? currentCost : -1; 492 | } 493 | }; 494 | 495 | 496 | /// 497 | /// Class providing optimized methods for computing Levenshtein comparisons between two strings. 498 | /// 499 | /// 500 | /// Copyright ©2015-2018 SoftWx, Inc. 501 | /// The inspiration for creating highly optimized edit distance functions was 502 | /// from Sten Hjelmqvist's "Fast, memory efficient" algorithm, described at 503 | /// http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm 504 | /// The Levenshtein algorithm computes the edit distance metric between two strings, i.e. 505 | /// the number of insertion, deletion, and substitution edits required to transform one 506 | /// string to the other. This value will be >= 0, where 0 indicates identical strings. 507 | /// Comparisons are case sensitive, so for example, "Fred" and "fred" will have a 508 | /// distance of 1. 
The optimized algorithm was described in detail in my post at 509 | /// http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html 510 | /// Also see http://en.wikipedia.org/wiki/Levenshtein_distance for general information. 511 | /// The methods in this class are not threadsafe. Use the static versions in the Distance 512 | /// class if that is required. 513 | class Levenshtein : public IDistance, ISimilarity { 514 | private: 515 | vector baseChar1Costs; 516 | 517 | public: 518 | 519 | /// Create a new instance of DamerauOSA. 520 | Levenshtein() { 521 | 522 | } 523 | 524 | /// Create a new instance of Levenshtein using the specified expected 525 | /// maximum string length that will be encountered. 526 | /// By specifying the max expected string length, better memory efficiency 527 | /// can be achieved. 528 | /// The expected maximum length of strings that will 529 | /// be passed to the Levenshtein methods. 530 | Levenshtein(int expectedMaxstringLength) { 531 | if (expectedMaxstringLength <= 0) throw "expectedMaxstringLength must be larger than 0"; 532 | this->baseChar1Costs = vector(expectedMaxstringLength, 0); 533 | } 534 | 535 | /// Compute and return the Levenshtein edit distance between two strings. 536 | /// https://github.com/softwx/SoftWx.Match 537 | /// This method is not threadsafe. 538 | /// One of the strings to compare. 539 | /// The other string to compare. 540 | /// 0 if the strings are equivalent, otherwise a positive number whose 541 | /// magnitude increases as difference between the strings increases. 542 | double Distance(xstring string1, xstring string2) { 543 | if (string1.empty()) return string2.size(); 544 | if (string2.empty()) return string1.size(); 545 | 546 | // if strings of different lengths, ensure shorter string is in string1. This can result in a little 547 | // faster speed by spending more time spinning just the inner loop during the main processing. 
548 | if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; } 549 | 550 | // identify common suffix and/or prefix that can be ignored 551 | int len1, len2, start; 552 | Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start); 553 | if (len1 == 0) return len2; 554 | 555 | return Distance(string1, string2, len1, len2, start, 556 | (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0))); 557 | } 558 | 559 | /// Compute and return the Levenshtein edit distance between two strings. 560 | /// https://github.com/softwx/SoftWx.Match 561 | /// This method is not threadsafe. 562 | /// One of the strings to compare. 563 | /// The other string to compare. 564 | /// The maximum distance that is of interest. 565 | /// -1 if the distance is greater than the maxDistance, 0 if the strings 566 | /// are equivalent, otherwise a positive number whose magnitude increases as 567 | /// difference between the strings increases. 568 | double Distance(xstring string1, xstring string2, double maxDistance) { 569 | if (string1.empty() || string2.empty()) return Helpers::NullDistanceResults(string1, string2, maxDistance); 570 | if (maxDistance <= 0) return (string1 == string2) ? 0 : -1; 571 | maxDistance = ceil(maxDistance); 572 | int iMaxDistance = (maxDistance <= INT_MAX) ? (int)maxDistance : INT_MAX; 573 | 574 | // if strings of different lengths, ensure shorter string is in string1. This can result in a little 575 | // faster speed by spending more time spinning just the inner loop during the main processing. 576 | if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; } 577 | if (string2.size() - string1.size() > iMaxDistance) return -1; 578 | 579 | // identify common suffix and/or prefix that can be ignored 580 | int len1, len2, start; 581 | Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start); 582 | if (len1 == 0) return (len2 <= iMaxDistance) ? 
len2 : -1; 583 | 584 | if (iMaxDistance < len2) { 585 | return Distance(string1, string2, len1, len2, start, iMaxDistance, 586 | (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0))); 587 | } 588 | return Distance(string1, string2, len1, len2, start, 589 | (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0))); 590 | } 591 | 592 | /// Return Levenshtein similarity between two strings 593 | /// (1 - (levenshtein distance / len of longer string)). 594 | /// One of the strings to compare. 595 | /// The other string to compare. 596 | /// The degree of similarity 0 to 1.0, where 0 represents a lack of any 597 | /// noteable similarity, and 1 represents equivalent strings. 598 | double Similarity(xstring string1, xstring string2) { 599 | if (string1.empty()) return (string2.empty()) ? 1 : 0; 600 | if (string2.empty()) return 0; 601 | 602 | // if strings of different lengths, ensure shorter string is in string1. This can result in a little 603 | // faster speed by spending more time spinning just the inner loop during the main processing. 604 | if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; } 605 | 606 | // identify common suffix and/or prefix that can be ignored 607 | int len1, len2, start; 608 | Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start); 609 | if (len1 == 0) return 1.0; 610 | 611 | return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start, 612 | (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0))) 613 | , string2.size()); 614 | } 615 | 616 | 617 | /// Return Levenshtein similarity between two strings 618 | /// (1 - (levenshtein distance / len of longer string)). 619 | /// One of the strings to compare. 620 | /// The other string to compare. 621 | /// The minimum similarity that is of interest. 
622 | /// The degree of similarity 0 to 1.0, where -1 represents a similarity 623 | /// lower than minSimilarity, otherwise, a number between 0 and 1.0 where 0 624 | /// represents a lack of any noteable similarity, and 1 represents equivalent 625 | /// strings. 626 | double Similarity(xstring string1, xstring string2, double minSimilarity) { 627 | if (minSimilarity < 0 || minSimilarity > 1) throw "minSimilarity must be in range 0 to 1.0"; 628 | if (string1.empty() || string2.empty()) return Helpers::NullSimilarityResults(string1, string2, minSimilarity); 629 | 630 | // if strings of different lengths, ensure shorter string is in string1. This can result in a little 631 | // faster speed by spending more time spinning just the inner loop during the main processing. 632 | if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; } 633 | 634 | int iMaxDistance = Helpers::ToDistance(minSimilarity, string2.size()); 635 | if (string2.size() - string1.size() > iMaxDistance) return -1; 636 | if (iMaxDistance == 0) return (string1 == string2) ? 1 : -1; 637 | 638 | // identify common suffix and/or prefix that can be ignored 639 | int len1, len2, start; 640 | Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start); 641 | if (len1 == 0) return 1.0; 642 | 643 | if (iMaxDistance < len2) { 644 | return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start, iMaxDistance, 645 | (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0))) 646 | , string2.size()); 647 | } 648 | return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start, 649 | (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0))) 650 | , string2.size()); 651 | } 652 | 653 | /// Internal implementation of the core Levenshtein algorithm. 
654 | /// https://github.com/softwx/SoftWx.Match 655 | static int Distance(xstring string1, xstring string2, int len1, int len2, int start, vector &char1Costs) { 656 | for (int j = 0; j < len2; j++) 657 | char1Costs[j] = j + 1; 658 | int currentCharCost = 0; 659 | if (start == 0) { 660 | for (int i = 0; i < len1; ++i) { 661 | int leftCharCost, aboveCharCost; 662 | leftCharCost = aboveCharCost = i; 663 | xchar char1 = string1[i]; 664 | for (int j = 0; j < len2; ++j) { 665 | currentCharCost = leftCharCost; // cost on diagonal (substitution) 666 | leftCharCost = char1Costs[j]; 667 | if (string2[j] != char1) { 668 | // substitution if neither of two conditions below 669 | if (aboveCharCost < currentCharCost) currentCharCost = aboveCharCost; // deletion 670 | if (leftCharCost < currentCharCost) currentCharCost = leftCharCost; // insertion 671 | ++currentCharCost; 672 | } 673 | char1Costs[j] = aboveCharCost = currentCharCost; 674 | } 675 | } 676 | } 677 | else { 678 | for (int i = 0; i < len1; ++i) { 679 | int leftCharCost, aboveCharCost; 680 | leftCharCost = aboveCharCost = i; 681 | xchar char1 = string1[start + i]; 682 | for (int j = 0; j < len2; ++j) { 683 | currentCharCost = leftCharCost; // cost on diagonal (substitution) 684 | leftCharCost = char1Costs[j]; 685 | if (string2[start + j] != char1) { 686 | // substitution if neither of two conditions below 687 | if (aboveCharCost < currentCharCost) currentCharCost = aboveCharCost; // deletion 688 | if (leftCharCost < currentCharCost) currentCharCost = leftCharCost; // insertion 689 | ++currentCharCost; 690 | } 691 | char1Costs[j] = aboveCharCost = currentCharCost; 692 | } 693 | } 694 | } 695 | return currentCharCost; 696 | } 697 | 698 | /// Internal implementation of the core Levenshtein algorithm that accepts a maxDistance. 
699 | /// https://github.com/softwx/SoftWx.Match 700 | static int Distance(xstring string1, xstring string2, int len1, int len2, int start, int maxDistance, vector &char1Costs) { 701 | // if (maxDistance >= len2) return Distance(string1, string2, len1, len2, start, char1Costs); 702 | int i, j; 703 | for (j = 0; j < maxDistance; j++) 704 | char1Costs[j] = j+1; 705 | for (; j < len2;) char1Costs[j++] = maxDistance + 1; 706 | int lenDiff = len2 - len1; 707 | int jStartOffset = maxDistance - lenDiff; 708 | int jStart = 0; 709 | int jEnd = maxDistance; 710 | int currentCost = 0; 711 | if (start == 0) { 712 | for (i = 0; i < len1; ++i) { 713 | xchar char1 = string1[i]; 714 | int prevChar1Cost, aboveCharCost; 715 | prevChar1Cost = aboveCharCost = i; 716 | // no need to look beyond window of lower right diagonal - maxDistance cells (lower right diag is i - lenDiff) 717 | // and the upper left diagonal + maxDistance cells (upper left is i) 718 | jStart += (i > jStartOffset) ? 1 : 0; 719 | jEnd += (jEnd < len2) ? 1 : 0; 720 | for (j = jStart; j < jEnd; ++j) { 721 | currentCost = prevChar1Cost; // cost on diagonal (substitution) 722 | prevChar1Cost = char1Costs[j]; 723 | if (string2[j] != char1) { 724 | // substitution if neither of two conditions below 725 | if (aboveCharCost < currentCost) currentCost = aboveCharCost; // deletion 726 | if (prevChar1Cost < currentCost) currentCost = prevChar1Cost; // insertion 727 | ++currentCost; 728 | } 729 | char1Costs[j] = aboveCharCost = currentCost; 730 | } 731 | if (char1Costs[i + lenDiff] > maxDistance) return -1; 732 | } 733 | } 734 | else { 735 | for (i = 0; i < len1; ++i) { 736 | xchar char1 = string1[start + i]; 737 | int prevChar1Cost, aboveCharCost; 738 | prevChar1Cost = aboveCharCost = i; 739 | // no need to look beyond window of lower right diagonal - maxDistance cells (lower right diag is i - lenDiff) 740 | // and the upper left diagonal + maxDistance cells (upper left is i) 741 | jStart += (i > jStartOffset) ? 
1 : 0; 742 | jEnd += (jEnd < len2) ? 1 : 0; 743 | for (j = jStart; j < jEnd; ++j) { 744 | currentCost = prevChar1Cost; // cost on diagonal (substitution) 745 | prevChar1Cost = char1Costs[j]; 746 | if (string2[start + j] != char1) { 747 | // substitution if neither of two conditions below 748 | if (aboveCharCost < currentCost) currentCost = aboveCharCost; // deletion 749 | if (prevChar1Cost < currentCost) currentCost = prevChar1Cost; // insertion 750 | ++currentCost; 751 | } 752 | char1Costs[j] = aboveCharCost = currentCost; 753 | } 754 | if (char1Costs[i + lenDiff] > maxDistance) return -1; 755 | } 756 | } 757 | return (currentCost <= maxDistance) ? currentCost : -1; 758 | } 759 | }; 760 | 761 | 762 | /// Supported edit distance algorithms. 763 | enum DistanceAlgorithm { 764 | /// Levenshtein algorithm. 765 | LevenshteinDistance, 766 | /// Damerau optimal string alignment algorithm. 767 | DamerauOSADistance 768 | }; 769 | 770 | /// Wrapper for third party edit distance algorithms. 771 | class EditDistance { 772 | private: 773 | DistanceAlgorithm algorithm; 774 | IDistance* distanceComparer; 775 | DamerauOSA damerauOSADistance; 776 | Levenshtein levenshteinDistance; 777 | 778 | public: 779 | /// Create a new EditDistance object. 780 | /// The desired edit distance algorithm. 781 | EditDistance(DistanceAlgorithm algorithm) { 782 | this->algorithm = algorithm; 783 | switch (algorithm) { 784 | case DistanceAlgorithm::DamerauOSADistance: 785 | this->distanceComparer = &damerauOSADistance; 786 | break; 787 | case DistanceAlgorithm::LevenshteinDistance: 788 | this->distanceComparer = &levenshteinDistance; 789 | break; 790 | default: 791 | throw "Unknown distance algorithm."; 792 | } 793 | } 794 | 795 | /// Compare a string to the base string to determine the edit distance, 796 | /// using the previously selected algorithm. 797 | /// The string to compare. 798 | /// The maximum distance allowed. 799 | /// The edit distance (or -1 if maxDistance exceeded). 
800 | int Compare(xstring string1, xstring string2, int maxDistance) { 801 | return (int)this->distanceComparer->Distance(string1, string2, maxDistance); 802 | } 803 | }; 804 | 805 | class Node 806 | { 807 | public: 808 | xstring suggestion; 809 | int next; 810 | }; 811 | 812 | class Entry 813 | { 814 | public: 815 | int count; 816 | int first; 817 | }; 818 | 819 | /// An intentionally opacque class used to temporarily stage 820 | /// dictionary data during the adding of many words. By staging the 821 | /// data during the building of the dictionary data, significant savings 822 | /// of time can be achieved, as well as a reduction in final memory usage. 823 | class SuggestionStage 824 | { 825 | private: 826 | Dictionary Deletes; 827 | ChunkArray Nodes; 828 | 829 | public: 830 | /// Create a new instance of SuggestionStage. 831 | /// Specifying ann accurate initialCapacity is not essential, 832 | /// but it can help speed up processing by alleviating the need for 833 | /// data restructuring as the size grows. 834 | /// The expected number of words that will be added. 835 | SuggestionStage(int initialCapacity) 836 | { 837 | Deletes.reserve(initialCapacity); 838 | Nodes.Reserve(initialCapacity * 2); 839 | } 840 | 841 | /// Gets the count of unique delete words. 842 | int DeleteCount() { return Deletes.size(); } 843 | 844 | /// Gets the total count of all suggestions for all deletes. 845 | int NodeCount() { return Nodes.Count; } 846 | 847 | /// Clears all the data from the SuggestionStaging. 848 | void Clear() 849 | { 850 | Deletes.clear(); 851 | Nodes.Clear(); 852 | } 853 | 854 | void Add(int deleteHash, xstring suggestion) 855 | { 856 | auto deletesFinded = Deletes.find(deleteHash); 857 | Entry newEntry; 858 | newEntry.count = 0; 859 | newEntry.first = -1; 860 | Entry entry = (deletesFinded == Deletes.end()) ? 
newEntry : deletesFinded->second; 861 | int next = entry.first; 862 | entry.count++; 863 | entry.first = Nodes.Count; 864 | Deletes[deleteHash] = entry; 865 | Node item; 866 | item.suggestion = suggestion; 867 | item.next = next; // 1st semantic errors, this should not be Nodes.Count 868 | Nodes.Add(item); 869 | } 870 | 871 | void CommitTo(Dictionary>* permanentDeletes) 872 | { 873 | auto permanentDeletesEnd = permanentDeletes->end(); 874 | for (auto it = Deletes.begin(); it != Deletes.end(); ++it) 875 | { 876 | auto permanentDeletesFinded = permanentDeletes->find(it->first); 877 | vector suggestions; 878 | int i; 879 | if (permanentDeletesFinded != permanentDeletesEnd) 880 | { 881 | suggestions = permanentDeletesFinded->second; 882 | i = suggestions.size(); 883 | 884 | vector newSuggestions; 885 | newSuggestions.reserve(suggestions.size() + it->second.count); 886 | std::copy(suggestions.begin(), suggestions.end(), back_inserter(newSuggestions)); 887 | //permanentDeletes[it->first] = newSuggestions; 888 | } 889 | else 890 | { 891 | i = 0; 892 | int32_t count = it->second.count; 893 | suggestions.reserve(count); 894 | //permanentDeletes[it->first] = suggestions; 895 | } 896 | 897 | int next = it->second.first; 898 | while (next >= 0) 899 | { 900 | auto node = Nodes.At(next); 901 | suggestions.push_back(node.suggestion); 902 | next = node.next; 903 | ++i; 904 | } 905 | (*permanentDeletes)[it->first] = suggestions; 906 | } 907 | } 908 | }; 909 | 910 | 911 | /// Spelling suggestion returned from Lookup. 912 | class SuggestItem 913 | { 914 | public: 915 | /// The suggested correctly spelled word. 916 | xstring term = XL(""); 917 | /// Edit distance between searched for word and suggestion. 918 | int distance = 0; 919 | /// Frequency of suggestion in the dictionary (a measure of how common the word is). 920 | int64_t count = 0; 921 | 922 | /// Create a new instance of SuggestItem. 923 | /// The suggested word. 924 | /// Edit distance from search word. 
925 | /// Frequency of suggestion in dictionary. 926 | SuggestItem() 927 | { 928 | } 929 | 930 | 931 | SuggestItem(xstring term, int distance, int64_t count) 932 | { 933 | this->term = term; 934 | this->distance = distance; 935 | this->count = count; 936 | } 937 | 938 | int CompareTo(SuggestItem other) 939 | { 940 | // order by distance ascending, then by frequency count descending, and then by alphabet descending 941 | int disCom = Helpers::CompareTo(this->distance, other.distance); 942 | if (disCom != 0) 943 | return disCom; 944 | int cntCom = Helpers::CompareTo(other.count, this->count); 945 | if (cntCom != 0) 946 | return cntCom; 947 | return this->term.compare(other.term); 948 | } 949 | 950 | bool Equals(const SuggestItem obj) 951 | { 952 | return this->term == obj.term && this->distance == obj.distance && this->count == obj.count; 953 | } 954 | 955 | int GetHashCode() 956 | { 957 | return hash{}(this->term); 958 | } 959 | 960 | xstring Tostring() 961 | { 962 | return XL("{") + term + XL(", ") + to_xstring(distance) + XL(", ") + to_xstring(count) + XL("}"); 963 | } 964 | static bool compare(SuggestItem s1, SuggestItem s2) 965 | { 966 | return s1.CompareTo(s2) < 0 ? 
1 : 0; 967 | } 968 | 969 | void set(SuggestItem exam) 970 | { 971 | this->term = exam.term; 972 | this->distance = exam.distance; 973 | this->count = exam.count; 974 | } 975 | }; 976 | -------------------------------------------------------------------------------- /include/SymSpell.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | //#define UNICODE_SUPPORT 13 | #include "Helpers.h" 14 | 15 | 16 | // SymSpell: 1 million times faster through Symmetric Delete spelling correction algorithm 17 | // 18 | // The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup 19 | // for a given Damerau-Levenshtein distance. It is six orders of magnitude faster and language independent. 20 | // Opposite to other algorithms only deletes are required, no transposes + replaces + inserts. 21 | // Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term. 22 | // Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters! 23 | // 24 | // SymSpell supports compound splitting / decompounding of multi-word input strings with three cases: 25 | // 1. mistakenly inserted space into a correct word led to two incorrect terms 26 | // 2. mistakenly omitted space between two correct words led to one incorrect combined term 27 | // 3. 
// multiple independent input terms with/without spelling errors

// Copyright (C) 2019 Wolf Garbe
// Version: 6.5
// Author: Wolf Garbe wolf.garbe@faroo.com
// Maintainer: Wolf Garbe wolf.garbe@faroo.com
// URL: https://github.com/wolfgarbe/symspell
// Description: https://medium.com/@wolfgarbe/1000x-faster-spelling-correction-algorithm-2012-8701fcd87a5f
//
// MIT License
// Copyright (c) 2019 Wolf Garbe
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
// and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
// https://opensource.org/licenses/MIT

// Tunable defaults used by the SymSpell constructor below.
#define DEFAULT_SEPARATOR_CHAR XL(' ')
#define DEFAULT_MAX_EDIT_DISTANCE 2
#define DEFAULT_PREFIX_LENGTH 7
#define DEFAULT_COUNT_THRESHOLD 1
#define DEFAULT_INITIAL_CAPACITY 82765
#define DEFAULT_COMPACT_LEVEL 5
#define min3(a, b, c) (min(a, min(b, c)))
#define MAXINT LLONG_MAX
// NOTE(review): `M` expands to nothing and `uint`/`MAXLONG` shadow common
// platform names — presumably Windows-compat shims; confirm before removing.
#define M
#define MAXLONG MAXINT
#define uint unsigned int

// Erase leading whitespace in place (whitespace per isxspace).
static inline void ltrim(xstring& s)
{

    s.erase(s.begin(),find_if(s.begin(), s.end(), [](xchar ch){
        return !isxspace(ch);
    }));
}

// Erase trailing whitespace in place (whitespace per isxspace).
static inline void rtrim(xstring& s)
{
    s.erase(find_if(s.rbegin(), s.rend(), [](xchar ch){
        return !isxspace(ch);
    }).base(), s.end());
}

// Erase both leading and trailing whitespace in place.
static inline void trim(xstring& s)
{
    ltrim(s);
    rtrim(s);
}

// Result carrier for WordSegmentation: the segmented string, its corrected
// form, the summed edit distance, and the summed log-probability.
class Info
{
private:

    xstring segmentedstring;   // input with spaces inserted
    xstring correctedstring;   // segmented input with spelling corrections applied
    int distanceSum;           // total edit distance across all segments
    double probabilityLogSum;  // sum of log word-occurrence probabilities
public:
    // Set all four fields at once.
    void set(xstring& seg, xstring& cor, int d, double prob)
    {
        segmentedstring = seg;
        correctedstring = cor;
        distanceSum = d;
        probabilityLogSum = prob;
    };
    xstring getSegmented()
    {
        return segmentedstring;
    };
    xstring getCorrected()
    {
        return correctedstring;
    };
    int getDistance()
    {
        return distanceSum;
    };
    double getProbability()
    {
        return probabilityLogSum;
    };
};

/// Controls the closeness/quantity of returned spelling suggestions.
enum Verbosity
{
    /// Top suggestion with the highest term frequency of the suggestions of smallest edit distance found.
    Top,
    /// All suggestions of smallest edit distance found, suggestions ordered by term frequency.
118 | Closest, 119 | /// All suggestions within maxEditDistance, suggestions ordered by edit distance 120 | /// , then by term frequency (slower, no early termination). 121 | All 122 | }; 123 | 124 | class SymSpell 125 | { 126 | protected: 127 | int initialCapacity; 128 | int maxDictionaryEditDistance; 129 | int prefixLength; //prefix length 5..7 130 | long countThreshold; //a threshold might be specified, when a term occurs so frequently in the corpus that it is considered a valid word for spelling correction 131 | int compactMask; 132 | DistanceAlgorithm distanceAlgorithm = DistanceAlgorithm::DamerauOSADistance; 133 | int maxDictionaryWordLength; //maximum dictionary term length 134 | // Dictionary that contains a mapping of lists of suggested correction words to the hashCodes 135 | // of the original words and the deletes derived from them. Collisions of hashCodes is tolerated, 136 | // because suggestions are ultimately verified via an edit distance function. 137 | // A list of suggestions might have a single suggestion, or multiple suggestions. 138 | Dictionary> *deletes = NULL; 139 | // Dictionary of unique correct spelling words, and the frequency count for each word. 140 | Dictionary words; 141 | // Dictionary of unique words that are below the count threshold for being considered correct spellings. 142 | Dictionary belowThresholdWords; 143 | 144 | public: 145 | /// Maximum edit distance for dictionary precalculation. 146 | int MaxDictionaryEditDistance(); 147 | 148 | /// Length of prefix, from which deletes are generated. 149 | int PrefixLength(); 150 | 151 | /// Length of longest word in the dictionary. 152 | int MaxLength(); 153 | 154 | /// Count threshold for a word to be considered a valid word for spelling correction. 155 | long CountThreshold(); 156 | 157 | /// Number of unique words in the dictionary. 158 | int WordCount(); 159 | 160 | /// Number of word prefixes and intermediate word deletes encoded in the dictionary. 
161 | int EntryCount(); 162 | 163 | /// Create a new instance of SymSpell. 164 | /// Specifying an accurate initialCapacity is not essential, 165 | /// but it can help speed up processing by alleviating the need for 166 | /// data restructuring as the size grows. 167 | /// The expected number of words in dictionary. 168 | /// Maximum edit distance for doing lookups. 169 | /// The length of word prefixes used for spell checking. 170 | /// The minimum frequency count for dictionary words to be considered correct spellings. 171 | /// Degree of favoring lower memory use over speed (0=fastest,most memory, 16=slowest,least memory). 172 | 173 | SymSpell(int initialCapacity = DEFAULT_INITIAL_CAPACITY, int maxDictionaryEditDistance = DEFAULT_MAX_EDIT_DISTANCE 174 | , int prefixLength = DEFAULT_PREFIX_LENGTH, int countThreshold = DEFAULT_COUNT_THRESHOLD 175 | , unsigned char compactLevel = DEFAULT_COMPACT_LEVEL); 176 | 177 | ~SymSpell(); 178 | 179 | /// Create/Update an entry in the dictionary. 180 | /// For every word there are deletes with an edit distance of 1..maxEditDistance created and added to the 181 | /// dictionary. Every delete entry has a suggestions list, which points to the original term(s) it was created from. 182 | /// The dictionary may be dynamically updated (word frequency and new words) at any time by calling CreateDictionaryEntry 183 | /// The word to add to dictionary. 184 | /// The frequency count for word. 185 | /// Optional staging object to speed up adding many entries by staging them to a temporary structure. 186 | /// True if the word was added as a new correctly spelled word, 187 | /// or false if the word is added as a below threshold word, or updates an 188 | /// existing correctly spelled word. 
189 | bool CreateDictionaryEntry(xstring key, int64_t count, SuggestionStage* staging); 190 | 191 | Dictionary bigrams; 192 | int64_t bigramCountMin = MAXLONG; 193 | 194 | /// Load multiple dictionary entries from a file of word/frequency count pairs 195 | /// Merges with any dictionary data already loaded. 196 | /// The path+filename of the file. 197 | /// The column position of the word. 198 | /// The column position of the frequency count. 199 | /// Separator characters between term(s) and count. 200 | /// True if file loaded, or false if file not found. 201 | bool LoadBigramDictionary(string corpus, int termIndex, int countIndex, xchar separatorChars = DEFAULT_SEPARATOR_CHAR); 202 | 203 | /// Load multiple dictionary entries from a file of word/frequency count pairs 204 | /// Merges with any dictionary data already loaded. 205 | /// The path+filename of the file. 206 | /// The column position of the word. 207 | /// The column position of the frequency count. 208 | /// Separator characters between term(s) and count. 209 | /// True if file loaded, or false if file not found. 210 | bool LoadBigramDictionary(xifstream& corpusStream, int termIndex, int countIndex, xchar separatorChars = DEFAULT_SEPARATOR_CHAR); 211 | 212 | /// Load multiple dictionary entries from a file of word/frequency count pairs 213 | /// Merges with any dictionary data already loaded. 214 | /// The path+filename of the file. 215 | /// The column position of the word. 216 | /// The column position of the frequency count. 217 | /// Separator characters between term(s) and count. 218 | /// True if file loaded, or false if file not found. 219 | bool LoadDictionary(string corpus, int termIndex, int countIndex, xchar separatorChars = DEFAULT_SEPARATOR_CHAR); 220 | 221 | /// Load multiple dictionary entries from a stream of word/frequency count pairs 222 | /// Merges with any dictionary data already loaded. 223 | /// The stream containing the word/frequency count pairs. 
224 | /// The column position of the word. 225 | /// The column position of the frequency count. 226 | /// Separator characters between term(s) and count. 227 | /// True if stream loads. 228 | bool LoadDictionary(xifstream& corpusStream, int termIndex, int countIndex, xchar separatorChars = DEFAULT_SEPARATOR_CHAR); 229 | 230 | /// Load multiple dictionary words from a file containing plain text. 231 | /// Merges with any dictionary data already loaded. 232 | /// The path+filename of the file. 233 | /// True if file loaded, or false if file not found. 234 | bool CreateDictionary(string corpus); 235 | 236 | /// Load multiple dictionary words from a stream containing plain text. 237 | /// Merges with any dictionary data already loaded. 238 | /// The stream containing the plain text. 239 | /// True if stream loads. 240 | bool CreateDictionary(xifstream& corpusStream); 241 | 242 | /// Remove all below threshold words from the dictionary. 243 | /// This can be used to reduce memory consumption after populating the dictionary from 244 | /// a corpus using CreateDictionary. 245 | void PurgeBelowThresholdWords(); 246 | 247 | /// Commit staged dictionary additions. 248 | /// Used when you write your own process to load multiple words into the 249 | /// dictionary, and as part of that process, you first created a SuggestionsStage 250 | /// object, and passed that to CreateDictionaryEntry calls. 251 | /// The SuggestionStage object storing the staged data. 252 | void CommitStaged(SuggestionStage* staging); 253 | 254 | /// Find suggested spellings for a given input word, using the maximum 255 | /// edit distance specified during construction of the SymSpell dictionary. 256 | /// The word being spell checked. 257 | /// The value controlling the quantity/closeness of the retuned suggestions. 258 | /// A List of SuggestItem object representing suggested correct spellings for the input word, 259 | /// sorted by edit distance, and secondarily by count frequency. 
260 | vector Lookup(xstring input, Verbosity verbosity); 261 | 262 | /// Find suggested spellings for a given input word, using the maximum 263 | /// edit distance specified during construction of the SymSpell dictionary. 264 | /// The word being spell checked. 265 | /// The value controlling the quantity/closeness of the retuned suggestions. 266 | /// The maximum edit distance between input and suggested words. 267 | /// A List of SuggestItem object representing suggested correct spellings for the input word, 268 | /// sorted by edit distance, and secondarily by count frequency. 269 | vector Lookup(xstring input, Verbosity verbosity, int maxEditDistance); 270 | 271 | /// Find suggested spellings for a given input word. 272 | /// The word being spell checked. 273 | /// The value controlling the quantity/closeness of the retuned suggestions. 274 | /// The maximum edit distance between input and suggested words. 275 | /// Include input word in suggestions, if no words within edit distance found. 276 | /// A List of SuggestItem object representing suggested correct spellings for the input word, 277 | /// sorted by edit distance, and secondarily by count frequency. 278 | vector Lookup(xstring input, Verbosity verbosity, int maxEditDistance, bool includeUnknown); 279 | 280 | private: 281 | //check whether all delete chars are present in the suggestion prefix in correct order, otherwise this is just a hash collision 282 | bool DeleteInSuggestionPrefix(xstring deleteSugg, int deleteLen, xstring suggestion, int suggestionLen); 283 | 284 | //create a non-unique wordlist from sample text 285 | //language independent (e.g. 
works with Chinese characters) 286 | vector ParseWords(xstring text); 287 | 288 | //inexpensive and language independent: only deletes, no transposes + replaces + inserts 289 | //replaces and inserts are expensive and language dependent (Chinese has 70,000 Unicode Han characters) 290 | HashSet* Edits(xstring word, int editDistance, HashSet* deleteWords); 291 | 292 | HashSet EditsPrefix(xstring key); 293 | 294 | int GetstringHash(xstring s); 295 | 296 | public: 297 | //###################### 298 | 299 | //LookupCompound supports compound aware automatic spelling correction of multi-word input strings with three cases: 300 | //1. mistakenly inserted space into a correct word led to two incorrect terms 301 | //2. mistakenly omitted space between two correct words led to one incorrect combined term 302 | //3. multiple independent input terms with/without spelling errors 303 | 304 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging). 305 | /// The string being spell checked. 306 | /// A List of SuggestItem object representing suggested correct spellings for the input string. 307 | vector LookupCompound(xstring input); 308 | 309 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging). 310 | /// The string being spell checked. 311 | /// The maximum edit distance between input and suggested words. 312 | /// A List of SuggestItem object representing suggested correct spellings for the input string. 313 | vector LookupCompound(xstring input, int editDistanceMax); 314 | 315 | //###### 316 | 317 | //WordSegmentation divides a string into words by inserting missing spaces at the appropriate positions 318 | //misspelled words are corrected and do not affect segmentation 319 | //existing spaces are allowed and considered for optimum segmentation 320 | 321 | //SymSpell.WordSegmentation uses a novel approach *without* recursion. 
322 | //https://medium.com/@wolfgarbe/fast-word-segmentation-for-noisy-text-2c2c41f9e8da 323 | //While each string of length n can be segmentend in 2^n−1 possible compositions https://en.wikipedia.org/wiki/Composition_(combinatorics) 324 | //SymSpell.WordSegmentation has a linear runtime O(n) to find the optimum composition 325 | 326 | //number of all words in the corpus used to generate the frequency dictionary 327 | //this is used to calculate the word occurrence probability p from word counts c : p=c/N 328 | //N equals the sum of all counts c in the dictionary only if the dictionary is complete, but not if the dictionary is truncated or filtered 329 | static const int64_t N = 1024908267229L; 330 | 331 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging). 332 | /// The string being spell checked. 333 | /// The word segmented string, 334 | /// the word segmented and spelling corrected string, 335 | /// the Edit distance sum between input string and corrected string, 336 | /// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is). 337 | Info WordSegmentation(xstring input); 338 | 339 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging). 340 | /// The string being spell checked. 341 | /// The maximum edit distance between input and corrected words 342 | /// (0=no correction/segmentation only). 343 | /// The word segmented string, 344 | /// the word segmented and spelling corrected string, 345 | /// the Edit distance sum between input string and corrected string, 346 | /// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is). 347 | Info WordSegmentation(xstring input, int maxEditDistance); 348 | 349 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging). 350 | /// The string being spell checked. 
351 | /// The maximum word length that should be considered. 352 | /// The maximum edit distance between input and corrected words 353 | /// (0=no correction/segmentation only). 354 | /// The word segmented string, 355 | /// the word segmented and spelling corrected string, 356 | /// the Edit distance sum between input string and corrected string, 357 | /// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is). 358 | Info WordSegmentation(xstring input, int maxEditDistance, int maxSegmentationWordLength); 359 | }; 360 | 361 | -------------------------------------------------------------------------------- /src/SymSpell.cpp: -------------------------------------------------------------------------------- 1 | #include "SymSpell.h" 2 | #include 3 | /// Maximum edit distance for dictionary precalculation. 4 | int SymSpell::MaxDictionaryEditDistance() 5 | { 6 | return this->maxDictionaryEditDistance; 7 | } 8 | 9 | /// Length of prefix, from which deletes are generated. 10 | int SymSpell::PrefixLength() 11 | { 12 | return this->prefixLength; 13 | } 14 | 15 | /// Length of longest word in the dictionary. 16 | int SymSpell::MaxLength() 17 | { 18 | return this->maxDictionaryWordLength; 19 | } 20 | 21 | /// Count threshold for a word to be considered a valid word for spelling correction. 22 | long SymSpell::CountThreshold() 23 | { 24 | return this->countThreshold; 25 | } 26 | 27 | /// Number of unique words in the dictionary. 28 | int SymSpell::WordCount() 29 | { 30 | return this->words.size(); 31 | } 32 | 33 | /// Number of word prefixes and intermediate word deletes encoded in the dictionary. 34 | int SymSpell::EntryCount() 35 | { 36 | return this->deletes->size(); 37 | } 38 | 39 | /// Create a new instanc of SymSpell. 40 | /// Specifying ann accurate initialCapacity is not essential, 41 | /// but it can help speed up processing by alleviating the need for 42 | /// data restructuring as the size grows. 
43 | /// The expected number of words in dictionary. 44 | /// Maximum edit distance for doing lookups. 45 | /// The length of word prefixes used for spell checking.. 46 | /// The minimum frequency count for dictionary words to be considered correct spellings. 47 | /// Degree of favoring lower memory use over speed (0=fastest,most memory, 16=slowest,least memory). 48 | SymSpell::SymSpell(int initialCapacity, int maxDictionaryEditDistance 49 | , int prefixLength, int countThreshold, unsigned char compactLevel) 50 | { 51 | if (initialCapacity < 0) throw std::invalid_argument("initialCapacity"); 52 | if (maxDictionaryEditDistance < 0) throw std::invalid_argument("maxDictionaryEditDistance"); 53 | if (prefixLength < 1 || prefixLength <= maxDictionaryEditDistance) throw std::invalid_argument("prefixLength"); 54 | if (countThreshold < 0) throw std::invalid_argument("countThreshold"); 55 | if (compactLevel > 16) throw std::invalid_argument("compactLevel"); 56 | 57 | this->initialCapacity = initialCapacity; 58 | this->words.reserve(initialCapacity); 59 | this->maxDictionaryEditDistance = maxDictionaryEditDistance; 60 | this->prefixLength = prefixLength; 61 | this->countThreshold = countThreshold; 62 | if (compactLevel > 16) compactLevel = 16; 63 | this->compactMask = (UINT_MAX >> (3 + compactLevel)) << 2; 64 | this->maxDictionaryWordLength = 0; 65 | this->words = Dictionary(initialCapacity); 66 | } 67 | 68 | SymSpell::~SymSpell() { 69 | if (this->deletes != nullptr) { 70 | delete this->deletes; 71 | } 72 | } 73 | 74 | /// Create/Update an entry in the dictionary. 75 | /// For every word there are deletes with an edit distance of 1..maxEditDistance created and added to the 76 | /// dictionary. Every delete entry has a suggestions list, which points to the original term(s) it was created from. 77 | /// The dictionary may be dynamically updated (word frequency and new words) at any time by calling CreateDictionaryEntry 78 | /// The word to add to dictionary. 
79 | /// The frequency count for word. 80 | /// Optional staging object to speed up adding many entries by staging them to a temporary structure. 81 | /// True if the word was added as a new correctly spelled word, 82 | /// or false if the word is added as a below threshold word, or updates an 83 | /// existing correctly spelled word. 84 | bool SymSpell::CreateDictionaryEntry(xstring key, int64_t count, SuggestionStage* staging) 85 | { 86 | 87 | if (count <= 0) 88 | { 89 | if (this->countThreshold > 0) return false; // no point doing anything if count is zero, as it can't change anything 90 | count = 0; 91 | } 92 | int countPrevious = -1; 93 | // look first in below threshold words, update count, and allow promotion to correct spelling word if count reaches threshold 94 | // threshold must be >1 for there to be the possibility of low threshold words 95 | auto belowThresholdWordsFinded = belowThresholdWords.find(key); 96 | auto wordsFinded = words.find(key); 97 | if (countThreshold > 1 && belowThresholdWordsFinded != belowThresholdWords.end()) 98 | { 99 | countPrevious = belowThresholdWordsFinded->second; 100 | 101 | // calculate new count for below threshold word 102 | count = (MAXINT - countPrevious > count) ? countPrevious + count : MAXINT; 103 | // has reached threshold - remove from below threshold collection (it will be added to correct words below) 104 | if (count >= countThreshold) 105 | { 106 | belowThresholdWords.erase(key); 107 | } 108 | else 109 | { 110 | //belowThresholdWords[key] = count; 111 | belowThresholdWords.insert(pair(key, count)); 112 | return false; 113 | } 114 | } 115 | else if (wordsFinded != words.end()) 116 | { 117 | countPrevious = wordsFinded->second; 118 | // just update count if it's an already added above threshold word 119 | count = (MAXINT - countPrevious > count) ? 
countPrevious + count : MAXINT; 120 | //words[key] = count; 121 | words.insert(pair(key, count)); 122 | return false; 123 | } 124 | else if (count < CountThreshold()) 125 | { 126 | // new or existing below threshold word 127 | //belowThresholdWords[key] = count; 128 | belowThresholdWords.insert(pair(key, count)); 129 | return false; 130 | } 131 | 132 | // what we have at this point is a new, above threshold word 133 | //words[key] = count; 134 | words.insert(pair(key, count)); 135 | 136 | //edits/suggestions are created only once, no matter how often word occurs 137 | //edits/suggestions are created only as soon as the word occurs in the corpus, 138 | //even if the same term existed before in the dictionary as an edit from another word 139 | if (key.size() > this->maxDictionaryWordLength) this->maxDictionaryWordLength = key.size(); 140 | 141 | //create deletes 142 | HashSet edits = EditsPrefix(key); 143 | // if not staging suggestions, put directly into main data structure 144 | if (staging != NULL) 145 | { 146 | /*ofstream outfile; 147 | outfile.open("log.txt", ios::app); 148 | outfile << words.size() << " " << key <Add(GetstringHash(*it), key); 152 | //outfile << "-" << *it; 153 | } 154 | /*outfile << endl; 155 | outfile.close();*/ 156 | } 157 | else 158 | { 159 | 160 | for (auto it = edits.begin(); it != edits.end(); ++it) 161 | { 162 | int deleteHash = GetstringHash(*it); 163 | auto deletesFinded = deletes->find(deleteHash); 164 | std::vector suggestions; 165 | if(deletesFinded != deletes->end()) 166 | //if (deletes.TryGetValue(deleteHash, out string[] suggestions)) 167 | { 168 | suggestions = deletesFinded->second; 169 | std::vector newSuggestions(suggestions.size() + 1); 170 | for(int id=0; id(1); 179 | //(*deletes)[deleteHash] = suggestions; 180 | (*deletes).insert(pair>(deleteHash, suggestions)); 181 | } 182 | suggestions[suggestions.size() - 1] = key; 183 | } 184 | 185 | } 186 | 187 | return true; 188 | } 189 | 190 | /// Load multiple dictionary entries 
from a file of word/frequency count pairs 191 | /// Merges with any dictionary data already loaded. 192 | /// The path+filename of the file. 193 | /// The column position of the word. 194 | /// The column position of the frequency count. 195 | /// Separator characters between term(s) and count. 196 | /// True if file loaded, or false if file not found. 197 | bool SymSpell::LoadBigramDictionary(string corpus, int termIndex, int countIndex, xchar separatorChars) 198 | { 199 | xifstream corpusStream; 200 | corpusStream.open(corpus); 201 | #ifdef UNICODE_SUPPORT 202 | locale utf8(locale(), new codecvt_utf8); 203 | corpusStream.imbue(utf8); 204 | #endif 205 | if (!corpusStream.is_open()) 206 | return false; 207 | 208 | return LoadBigramDictionary(corpusStream, termIndex, countIndex, separatorChars); 209 | } 210 | 211 | /// Load multiple dictionary entries from a file of word/frequency count pairs 212 | /// Merges with any dictionary data already loaded. 213 | /// The path+filename of the file. 214 | /// The column position of the word. 215 | /// The column position of the frequency count. 216 | /// Separator characters between term(s) and count. 217 | /// True if file loaded, or false if file not found. 218 | bool SymSpell::LoadBigramDictionary(xifstream& corpusStream, int termIndex, int countIndex, xchar separatorChars) 219 | { 220 | xstring line; 221 | int linePartsLength = (separatorChars == DEFAULT_SEPARATOR_CHAR) ? 3 : 2; 222 | //process a single line at a time only for memory efficiency 223 | while (getline(corpusStream, line)) 224 | { 225 | vector lineParts; 226 | xstringstream ss(line); 227 | xstring token; 228 | while (getline(ss, token, separatorChars)) 229 | lineParts.push_back(token); 230 | xstring key; 231 | int64_t count; 232 | if (lineParts.size() >= linePartsLength) 233 | { 234 | //if default (whitespace) is defined as separator take 2 term parts, otherwise take only one 235 | key = (separatorChars == DEFAULT_SEPARATOR_CHAR) ? 
lineParts[termIndex] + XL(" ") + lineParts[termIndex + 1]: lineParts[termIndex]; 236 | try{ 237 | count = stoll(lineParts[countIndex]); 238 | 239 | }catch(...) { 240 | //maybe log something here 241 | printf("Cannot convert %s to integer\n", lineParts[countIndex].c_str()); 242 | } 243 | } 244 | else 245 | { 246 | //if default (whitespace) is defined as separator take 2 term parts, otherwise take only one 247 | key = line; 248 | count = 1; 249 | } 250 | pair element(key, count); 251 | bigrams.insert(element); 252 | if (count < bigramCountMin) bigramCountMin = count; 253 | } 254 | 255 | if (bigrams.empty()) 256 | return false; 257 | return true; 258 | } 259 | 260 | /// Load multiple dictionary entries from a file of word/frequency count pairs 261 | /// Merges with any dictionary data already loaded. 262 | /// The path+filename of the file. 263 | /// The column position of the word. 264 | /// The column position of the frequency count. 265 | /// Separator characters between term(s) and count. 266 | /// True if file loaded, or false if file not found. 267 | bool SymSpell::LoadDictionary(string corpus, int termIndex, int countIndex, xchar separatorChars) 268 | { 269 | 270 | xifstream corpusStream(corpus); 271 | /*corpusStream.open(corpus);*/ 272 | #ifdef UNICODE_SUPPORT 273 | locale utf8(locale(), new codecvt_utf8); 274 | corpusStream.imbue(utf8); 275 | #endif 276 | if (!corpusStream.is_open()) 277 | return false; 278 | 279 | return LoadDictionary(corpusStream, termIndex, countIndex, separatorChars); 280 | } 281 | 282 | /// Load multiple dictionary entries from a stream of word/frequency count pairs 283 | /// Merges with any dictionary data already loaded. 284 | /// The stream containing the word/frequency count pairs. 285 | /// The column position of the word. 286 | /// The column position of the frequency count. 287 | /// Separator characters between term(s) and count. 288 | /// True if stream loads. 
289 | bool SymSpell::LoadDictionary(xifstream& corpusStream, int termIndex, int countIndex, xchar separatorChars) 290 | { 291 | SuggestionStage staging(16384); 292 | xstring line; 293 | int i = 0; 294 | int start, end; 295 | start = clock(); 296 | //process a single line at a time only for memory efficiency 297 | while (getline(corpusStream, line)) 298 | { 299 | //end = clock(); 300 | //cout << float((end - start) * 1000 / CLOCKS_PER_SEC) << " "; 301 | i++; 302 | vector lineParts; 303 | xstringstream ss(line); 304 | xstring token; 305 | //start = clock(); 306 | while (getline(ss, token, separatorChars)) 307 | lineParts.push_back(token); 308 | if (lineParts.size() >= 2) 309 | { 310 | int64_t count = stoll(lineParts[countIndex]); 311 | 312 | CreateDictionaryEntry(lineParts[termIndex], count, &staging); 313 | } 314 | else 315 | { 316 | CreateDictionaryEntry(line, 1, &staging); 317 | } 318 | 319 | } 320 | if (this->deletes == NULL) 321 | this->deletes = new Dictionary>(staging.DeleteCount()); 322 | CommitStaged(&staging); 323 | if (this->EntryCount() == 0) 324 | return false; 325 | return true; 326 | } 327 | 328 | /// Load multiple dictionary words from a file containing plain text. 329 | /// Merges with any dictionary data already loaded. 330 | /// The path+filename of the file. 331 | /// True if file loaded, or false if file not found. 332 | bool SymSpell::CreateDictionary(string corpus) 333 | { 334 | xifstream corpusStream; 335 | corpusStream.open(corpus); 336 | #ifdef UNICODE_SUPPORT 337 | locale utf8(locale(), new codecvt_utf8); 338 | corpusStream.imbue(utf8); 339 | #endif 340 | if (!corpusStream.is_open()) return false; 341 | 342 | return CreateDictionary(corpusStream); 343 | } 344 | 345 | /// Load multiple dictionary words from a stream containing plain text. 346 | /// Merges with any dictionary data already loaded. 347 | /// The stream containing the plain text. 348 | /// True if stream loads. 
349 | bool SymSpell::CreateDictionary(xifstream& corpusStream) 350 | { 351 | xstring line; 352 | SuggestionStage staging = SuggestionStage(16384); 353 | while (getline(corpusStream, line)) 354 | { 355 | for (xstring key : ParseWords(line)) 356 | { 357 | CreateDictionaryEntry(key, 1, &staging); 358 | } 359 | 360 | } 361 | if (this->deletes == NULL) this->deletes = new Dictionary>(staging.DeleteCount()); 362 | CommitStaged(&staging); 363 | if (this->EntryCount() == 0) 364 | return false; 365 | return true; 366 | } 367 | 368 | /// Remove all below threshold words from the dictionary. 369 | /// This can be used to reduce memory consumption after populating the dictionary from 370 | /// a corpus using CreateDictionary. 371 | void SymSpell::PurgeBelowThresholdWords() 372 | { 373 | belowThresholdWords.clear(); 374 | } 375 | 376 | /// Commit staged dictionary additions. 377 | /// Used when you write your own process to load multiple words into the 378 | /// dictionary, and as part of that process, you first created a SuggestionsStage 379 | /// object, and passed that to CreateDictionaryEntry calls. 380 | /// The SuggestionStage object storing the staged data. 381 | void SymSpell::CommitStaged(SuggestionStage* staging) 382 | { 383 | staging->CommitTo(deletes); 384 | } 385 | 386 | /// Find suggested spellings for a given input word, using the maximum 387 | /// edit distance specified during construction of the SymSpell dictionary. 388 | /// The word being spell checked. 389 | /// The value controlling the quantity/closeness of the retuned suggestions. 390 | /// A vector of SuggestItem object representing suggested correct spellings for the input word, 391 | /// sorted by edit distance, and secondarily by count frequency. 
392 | vector SymSpell::Lookup(xstring input, Verbosity verbosity) 393 | { 394 | return Lookup(input, verbosity, this->maxDictionaryEditDistance, false); 395 | } 396 | 397 | /// Find suggested spellings for a given input word, using the maximum 398 | /// edit distance specified during construction of the SymSpell dictionary. 399 | /// The word being spell checked. 400 | /// The value controlling the quantity/closeness of the retuned suggestions. 401 | /// The maximum edit distance between input and suggested words. 402 | /// A vector of SuggestItem object representing suggested correct spellings for the input word, 403 | /// sorted by edit distance, and secondarily by count frequency. 404 | vector SymSpell::Lookup(xstring input, Verbosity verbosity, int maxEditDistance) 405 | { 406 | return Lookup(input, verbosity, maxEditDistance, false); 407 | } 408 | 409 | /// Find suggested spellings for a given input word. 410 | /// The word being spell checked. 411 | /// The value controlling the quantity/closeness of the retuned suggestions. 412 | /// The maximum edit distance between input and suggested words. 413 | /// Include input word in suggestions, if no words within edit distance found. 414 | /// A vector of SuggestItem object representing suggested correct spellings for the input word, 415 | /// sorted by edit distance, and secondarily by count frequency. 
416 | vector SymSpell::Lookup(xstring input, Verbosity verbosity, int maxEditDistance, bool includeUnknown) 417 | { 418 | //verbosity=Top: the suggestion with the highest term frequency of the suggestions of smallest edit distance found 419 | //verbosity=Closest: all suggestions of smallest edit distance found, the suggestions are ordered by term frequency 420 | //verbosity=All: all suggestions <= maxEditDistance, the suggestions are ordered by edit distance, then by term frequency (slower, no early termination) 421 | 422 | // maxEditDistance used in Lookup can't be bigger than the maxDictionaryEditDistance 423 | // used to construct the underlying dictionary structure. 424 | int skip = 0; 425 | if (maxEditDistance > this->maxDictionaryEditDistance) throw std::invalid_argument("maxEditDistance"); 426 | 427 | vector suggestions; 428 | int inputLen = input.size(); 429 | // early exit - word is too big to possibly match any words 430 | if (inputLen - maxEditDistance > this->maxDictionaryWordLength) skip = 1; 431 | 432 | // quick look for exact match 433 | int64_t suggestionCount = 0; 434 | if (words.count(input) && !skip) 435 | { 436 | suggestionCount = words.at(input); 437 | suggestions.push_back(SuggestItem(input, 0, suggestionCount)); 438 | // early exit - return exact match, unless caller wants all matches 439 | if (verbosity != All) skip = 1; 440 | } 441 | 442 | //early termination, if we only want to check if word in dictionary or get its frequency e.g. 
for word segmentation 443 | if (maxEditDistance == 0) skip = 1; 444 | 445 | if (!skip) 446 | { 447 | // deletes we've considered already 448 | HashSet hashset1; 449 | // suggestions we've considered already 450 | HashSet hashset2; 451 | // we considered the input already in the word.TryGetValue above 452 | hashset2.insert(input); 453 | 454 | int maxEditDistance2 = maxEditDistance; 455 | int candidatePointer = 0; 456 | vector singleSuggestion = { XL("") }; 457 | vector candidates; 458 | 459 | //add original prefix 460 | int inputPrefixLen = inputLen; 461 | if (inputPrefixLen > prefixLength) 462 | { 463 | inputPrefixLen = prefixLength; 464 | candidates.push_back(input.substr(0, inputPrefixLen)); 465 | } 466 | else 467 | { 468 | candidates.push_back(input); 469 | } 470 | auto distanceComparer = EditDistance(this->distanceAlgorithm); 471 | while (candidatePointer < candidates.size()) 472 | { 473 | xstring candidate = candidates[candidatePointer++]; 474 | int candidateLen = candidate.size(); 475 | int lengthDiff = inputPrefixLen - candidateLen; 476 | 477 | //save some time - early termination 478 | //if candidate distance is already higher than suggestion distance, than there are no better suggestions to be expected 479 | if (lengthDiff > maxEditDistance2) 480 | { 481 | // skip to next candidate if Verbosity.All, look no further if Verbosity.Top or Closest 482 | // (candidates are ordered by delete distance, so none are closer than current) 483 | if (verbosity == Verbosity::All) continue; 484 | break; 485 | } 486 | 487 | //read candidate entry from dictionary 488 | if (deletes != nullptr and deletes->count(GetstringHash(candidate))) 489 | { 490 | vector dictSuggestions = deletes->at(GetstringHash(candidate)); 491 | //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list 492 | for (int i = 0; i < dictSuggestions.size(); i++) 493 | { 494 | auto suggestion = dictSuggestions[i]; 495 | int suggestionLen = 
suggestion.size(); 496 | if (suggestion == input) continue; 497 | if ((abs(suggestionLen - inputLen) > maxEditDistance2) // input and sugg lengths diff > allowed/current best distance 498 | || (suggestionLen < candidateLen) // sugg must be for a different delete string, in same bin only because of hash collision 499 | || (suggestionLen == candidateLen && suggestion != candidate)) // if sugg len = delete len, then it either equals delete or is in same bin only because of hash collision 500 | continue; 501 | auto suggPrefixLen = min(suggestionLen, prefixLength); 502 | if (suggPrefixLen > inputPrefixLen && (suggPrefixLen - candidateLen) > maxEditDistance2) continue; 503 | 504 | //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0 505 | //We allow simultaneous edits (deletes) of maxEditDistance on on both the dictionary and the input term. 506 | //For replaces and adjacent transposes the resulting edit distance stays <= maxEditDistance. 507 | //For inserts and deletes the resulting edit distance might exceed maxEditDistance. 508 | //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides. 509 | //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for maxEditDistance=1) 510 | //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2. 
511 | int distance = 0; 512 | int min_len = 0; 513 | if (candidateLen == 0) 514 | { 515 | //suggestions which have no common chars with input (inputLen<=maxEditDistance && suggestionLen<=maxEditDistance) 516 | distance = max(inputLen, suggestionLen); 517 | auto flag = hashset2.insert(suggestion); 518 | if (distance > maxEditDistance2 || !flag.second) continue; 519 | } 520 | else if (suggestionLen == 1) 521 | { 522 | // not entirely sure what happens here yet 523 | if (input.find(suggestion[0]) == input.npos) 524 | distance = inputLen; 525 | else 526 | distance = inputLen - 1; 527 | 528 | auto flag = hashset2.insert(suggestion); 529 | if (distance > maxEditDistance2 || !flag.second) continue; 530 | } 531 | else 532 | //number of edits in prefix ==maxediddistance AND no identic suffix 533 | //, then editdistance>maxEditDistance and no need for Levenshtein calculation 534 | // (inputLen >= prefixLength) && (suggestionLen >= prefixLength) 535 | if ((prefixLength - maxEditDistance == candidateLen) 536 | && (((min_len = min(inputLen, suggestionLen) - prefixLength) > 1) 537 | && (input.substr(inputLen + 1 - min_len) != suggestion.substr(suggestionLen + 1 - min_len))) 538 | || ((min_len > 0) && (input[inputLen - min_len] != suggestion[suggestionLen - min_len]) 539 | && ((input[inputLen - min_len - 1] != suggestion[suggestionLen - min_len]) 540 | || (input[inputLen - min_len] != suggestion[suggestionLen - min_len - 1])))) 541 | { 542 | continue; 543 | } 544 | else 545 | { 546 | // DeleteInSuggestionPrefix is somewhat expensive, and only pays off when verbosity is Top or Closest. 
547 | if ((verbosity != All && !DeleteInSuggestionPrefix(candidate, candidateLen, suggestion, suggestionLen)) 548 | || !hashset2.insert(suggestion).second) continue; 549 | distance = distanceComparer.Compare(input, suggestion, maxEditDistance2); 550 | if (distance < 0) continue; 551 | } 552 | 553 | //save some time 554 | //do not process higher distances than those already found, if verbosity 0) 560 | { 561 | switch (verbosity) 562 | { 563 | case Closest: 564 | { 565 | //we will calculate DamLev distance only to the smallest found distance so far 566 | if (distance < maxEditDistance2) suggestions.clear(); 567 | break; 568 | } 569 | case Top: 570 | { 571 | if (distance < maxEditDistance2 || suggestionCount > suggestions[0].count) 572 | { 573 | maxEditDistance2 = distance; 574 | suggestions[0] = si; 575 | } 576 | continue; 577 | } 578 | } 579 | } 580 | if (verbosity != All) maxEditDistance2 = distance; 581 | suggestions.push_back(si); 582 | } 583 | }//end foreach 584 | }//end if 585 | 586 | //add edits 587 | //derive edits (deletes) from candidate (input) and add them to candidates list 588 | //this is a recursive process until the maximum edit distance has been reached 589 | if ((lengthDiff < maxEditDistance) && (candidateLen <= prefixLength)) 590 | { 591 | //save some time 592 | //do not create edits with edit distance smaller than suggestions already found 593 | if (verbosity != All && lengthDiff >= maxEditDistance2) continue; 594 | 595 | for (int i = 0; i < candidateLen; i++) 596 | { 597 | xstring temp(candidate); 598 | xstring del = temp.erase(i, 1); 599 | 600 | if (hashset1.insert(del).second) { candidates.push_back(del); } 601 | } 602 | } 603 | }//end while 604 | 605 | //sort by ascending edit distance, then by descending word frequency 606 | if (suggestions.size() > 1) sort(suggestions.begin(), suggestions.end(), [](SuggestItem& l, SuggestItem& r){ 607 | return l.CompareTo(r) < 0? 
1 : 0; 608 | }); 609 | } 610 | if (includeUnknown && (suggestions.size() == 0)) suggestions.push_back(SuggestItem(input, maxEditDistance + 1, 0)); 611 | return suggestions; 612 | }//end if 613 | 614 | //check whether all delete chars are present in the suggestion prefix in correct order, otherwise this is just a hash collision 615 | bool SymSpell::DeleteInSuggestionPrefix(xstring deleteSugg, int deleteLen, xstring suggestion, int suggestionLen) 616 | { 617 | if (deleteLen == 0) return true; 618 | if (prefixLength < suggestionLen) suggestionLen = prefixLength; 619 | int j = 0; 620 | for (int i = 0; i < deleteLen; i++) 621 | { 622 | xchar delChar = deleteSugg[i]; 623 | while (j < suggestionLen && delChar != suggestion[j]) j++; 624 | if (j == suggestionLen) return false; 625 | } 626 | return true; 627 | } 628 | 629 | //create a non-unique wordlist from sample text 630 | //language independent (e.g. works with Chinese characters) 631 | vector SymSpell::ParseWords(xstring text) 632 | { 633 | // \w Alphanumeric characters (including non-latin characters, umlaut characters and digits) plus "_" 634 | // \d Digits 635 | // Compatible with non-latin characters, does not split words at apostrophes 636 | xregex r(XL("['’\\w\\-\\[_\\]]+")); 637 | xsmatch m; 638 | vector matches; 639 | //for benchmarking only: with CreateDictionary("big.txt","") and the text corpus from http://norvig.com/big.txt the Regex below provides the exact same number of dictionary items as Norvigs regex "[a-z]+" (which splits words at apostrophes & incompatible with non-latin characters) 640 | //MatchCollection mc = Regex.Matches(text.ToLower(), @"[\w-[\d_]]+"); 641 | xstring::const_iterator ptr(text.cbegin()); 642 | while(regex_search(ptr, text.cend(),m,r)) 643 | { 644 | matches.push_back(m[0]); 645 | ptr = m.suffix().first; 646 | } 647 | return matches; 648 | } 649 | 650 | //inexpensive and language independent: only deletes, no transposes + replaces + inserts 651 | //replaces and inserts are expensive 
and language dependent (Chinese has 70,000 Unicode Han characters) 652 | HashSet* SymSpell::Edits(xstring word, int editDistance, HashSet* deleteWords) 653 | { 654 | editDistance++; 655 | if (word.size() > 1) 656 | { 657 | for (int i = 0; i < word.size(); i++) 658 | { 659 | xstring temp(word); 660 | xstring del = temp.erase(i, 1); 661 | if (deleteWords->insert(del).second) 662 | { 663 | //recursion, if maximum edit distance not yet reached 664 | if (editDistance < maxDictionaryEditDistance) Edits(del, editDistance, deleteWords); 665 | } 666 | } 667 | } 668 | return deleteWords; 669 | } 670 | 671 | HashSet SymSpell::EditsPrefix(xstring key) 672 | { 673 | HashSet hashSet = HashSet(); 674 | if (key.size() <= maxDictionaryEditDistance) hashSet.insert(XL("")); 675 | if (key.size() > prefixLength) key = key.substr(0, prefixLength); 676 | hashSet.insert(key); 677 | Edits(key, 0, &hashSet); 678 | return hashSet; 679 | } 680 | 681 | int SymSpell::GetstringHash(xstring s) 682 | { 683 | //return s.GetHashCode(); 684 | 685 | int len = s.size(); 686 | int lenMask = len; 687 | if (lenMask > 3) lenMask = 3; 688 | 689 | uint hash = 2166136261; 690 | for (auto i = 0; i < len; i++) 691 | { 692 | //unchecked, its fine even if it can be overflowed 693 | { 694 | hash ^= s[i]; 695 | hash *= 16777619; 696 | } 697 | } 698 | 699 | hash &= this->compactMask; 700 | hash |= (uint)lenMask; 701 | return (int)hash; 702 | } 703 | 704 | 705 | //###################### 706 | 707 | //LookupCompound supports compound aware automatic spelling correction of multi-word input strings with three cases: 708 | //1. mistakenly inserted space into a correct word led to two incorrect terms 709 | //2. mistakenly omitted space between two correct words led to one incorrect combined term 710 | //3. multiple independent input terms with/without spelling errors 711 | 712 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging). 713 | /// The string being spell checked. 
/// A vector of SuggestItem object representing suggested correct spellings for the input string.
vector<SuggestItem> SymSpell::LookupCompound(xstring input)
{
    // delegate to the explicit-distance overload using the dictionary default
    return LookupCompound(input, this->maxDictionaryEditDistance);
}

/// Find suggested spellings for a multi-word input string (supports word splitting/merging).
/// The string being spell checked.
/// The maximum edit distance between input and suggested words.
/// A vector of SuggestItem object representing suggested correct spellings for the input string.
vector<SuggestItem> SymSpell::LookupCompound(xstring input, int editDistanceMax)
{
    //parse input string into single terms
    vector<xstring> termList1 = ParseWords(input);

    vector<SuggestItem> suggestions;     //suggestions for a single term
    vector<SuggestItem> suggestionParts; //1 line with separate parts
    auto distanceComparer = EditDistance(this->distanceAlgorithm);

    //translate every term to its best suggestion, otherwise it remains unchanged
    bool lastCombi = false;
    for (int i = 0; i < termList1.size(); i++)
    {
        suggestions = Lookup(termList1[i], Top, editDistanceMax);

        //combi check (merge current term with the previous one), always before split
        if ((i > 0) && !lastCombi)
        {
            vector<SuggestItem> suggestionsCombi = Lookup(termList1[i - 1] + termList1[i], Top, editDistanceMax);

            if (suggestionsCombi.size() > 0)
            {
                SuggestItem best1 = suggestionParts[suggestionParts.size() - 1];
                SuggestItem best2 = SuggestItem();
                if (suggestions.size() > 0)
                {
                    best2 = suggestions[0];
                }
                else
                {
                    //unknown word
                    best2.term = termList1[i];
                    //estimated edit distance
                    best2.distance = editDistanceMax + 1;
                    //estimated word occurrence probability P=10 / (N * 10^word length l)
                    best2.count = (long)((double)10 / pow((double)10, (double)best2.term.size())); // 0;
                }

                //distance1 = edit distance between the two split terms and their best
                //corrections: used as the comparative value for the combined correction
                int distance1 = best1.distance + best2.distance;
                if ((distance1 >= 0) && ((suggestionsCombi[0].distance + 1 < distance1) || ((suggestionsCombi[0].distance + 1 == distance1) && ((double)suggestionsCombi[0].count > (double)best1.count / (double)N * (double)best2.count))))
                {
                    suggestionsCombi[0].distance++;
                    suggestionParts[suggestionParts.size() - 1] = suggestionsCombi[0];
                    lastCombi = true;
                    goto nextTerm;
                }
            }
        }
        lastCombi = false;

        //alway split terms without suggestion / never split terms with suggestion ed=0 / never split single char terms
        if ((suggestions.size() > 0) && ((suggestions[0].distance == 0) || (termList1[i].size() == 1)))
        {
            //choose best suggestion
            suggestionParts.push_back(suggestions[0]);
        }
        else
        {
            //if no perfect suggestion, split word into pairs
            SuggestItem suggestionSplitBest;

            //add original term
            if (suggestions.size() > 0) suggestionSplitBest.set(suggestions[0]);

            if (termList1[i].size() > 1)
            {
                // try every split point j within the term
                for (int j = 1; j < termList1[i].size(); j++)
                {
                    xstring part1 = termList1[i].substr(0, j);
                    xstring part2 = termList1[i].substr(j);
                    SuggestItem suggestionSplit = SuggestItem();
                    vector<SuggestItem> suggestions1 = Lookup(part1, Top, editDistanceMax);
                    if (suggestions1.size() > 0)
                    {
                        vector<SuggestItem> suggestions2 = Lookup(part2, Top, editDistanceMax);
                        if (suggestions2.size() > 0)
                        {
                            //select best suggestion for split pair
                            suggestionSplit.term = suggestions1[0].term + XL(" ") + suggestions2[0].term;

                            int distance2 = distanceComparer.Compare(termList1[i], suggestionSplit.term, editDistanceMax);
                            if (distance2 < 0) distance2 = editDistanceMax + 1;

                            // count != 0 means a best candidate already exists; prefer
                            // smaller distance, and reset count so any smaller-distance
                            // candidate always wins the count comparison below
                            if (suggestionSplitBest.count)
                            {
                                if (distance2 > suggestionSplitBest.distance) continue;
                                if (distance2 < suggestionSplitBest.distance) suggestionSplitBest.count = 0;
                            }

                            suggestionSplit.distance = distance2;
                            //if bigram exists in bigram dictionary
                            if (bigrams.count(suggestionSplit.term))
                            {
                                long bigramCount = bigrams.at(suggestionSplit.term);
                                suggestionSplit.count = bigramCount;

                                //increase count, if split.corrections are part of or identical to input
                                //single term correction exists
                                if (suggestions.size() > 0)
                                {
                                    //alternatively remove the single term from suggestionsSplit, but then other splittings could win
                                    if ((suggestions1[0].term + suggestions2[0].term == termList1[i]))
                                    {
                                        //make count bigger than count of single term correction
                                        suggestionSplit.count = max(suggestionSplit.count, suggestions[0].count + 2);
                                    }
                                    else if ((suggestions1[0].term == suggestions[0].term) || (suggestions2[0].term == suggestions[0].term))
                                    {
                                        //make count bigger than count of single term correction
                                        suggestionSplit.count = max(suggestionSplit.count, suggestions[0].count + 1);
                                    }
                                }
                                //no single term correction exists
                                else if ((suggestions1[0].term + suggestions2[0].term == termList1[i]))
                                {
                                    suggestionSplit.count = max(suggestionSplit.count, max(suggestions1[0].count, suggestions2[0].count) + 2);
                                }

                            }
                            else
                            {
                                //The Naive Bayes probability of the word combination is the product of the two word probabilities: P(AB) = P(A) * P(B)
                                //use it to estimate the frequency count of the combination, which then is used to rank/select the best splitting variant
                                suggestionSplit.count = min(bigramCountMin, (int64_t)((double)suggestions1[0].count / (double)N * (double)suggestions2[0].count));
                            }

                            if (suggestionSplit.count > suggestionSplitBest.count) suggestionSplitBest.set(suggestionSplit);
                        }
                    }
                }

                if (suggestionSplitBest.count)
                {
                    //select best suggestion for split pair
                    suggestionParts.push_back(suggestionSplitBest);
                }
                else
                {
                    // no split produced a candidate: keep the term as unknown word
                    SuggestItem si = SuggestItem();
                    si.term = termList1[i];
                    //estimated word occurrence probability P=10 / (N * 10^word length l)
                    si.count = (long)((double)10 / pow((double)10, (double)si.term.size()));
                    si.distance = editDistanceMax + 1;
                    suggestionParts.push_back(si);
                }
            }
            else
            {
                // single-character term with no suggestion: keep it as unknown word
                SuggestItem si = SuggestItem();
                si.term = termList1[i];
                //estimated word occurrence probability P=10 / (N * 10^word length l)
                si.count = (long)((double)10 / pow((double)10, (double)si.term.size()));
                si.distance = editDistanceMax + 1;
                suggestionParts.push_back(si);
            }
        }
    nextTerm:;
    }

    // join all corrected parts into one suggestion line; the combined count is the
    // product of the per-part probabilities scaled back by N
    SuggestItem suggestion = SuggestItem();

    double count = N;
    xstring s(XL("")) ;
    for (SuggestItem si : suggestionParts)
    {
        s += (si.term + XL(" "));
        count *= (double)si.count / (double)N;
    }
    suggestion.count = (long)count;
    rtrim(s);
    suggestion.term = s;
    suggestion.distance = distanceComparer.Compare(input, suggestion.term, MAXINT);

    vector<SuggestItem> suggestionsLine;
    suggestionsLine.push_back(suggestion);
    return suggestionsLine;
}

/// Find suggested spellings for a multi-word input string (supports word splitting/merging).
/// The string being spell checked.
/// The word segmented string,
/// the word segmented and spelling corrected string,
/// the Edit distance sum between input string and corrected string,
/// the Sum of word occurence probabilities in log scale (a measure of how common and probable the corrected segmentation is).
Info SymSpell::WordSegmentation(xstring input)
{
    // delegate using the dictionary-wide defaults for edit distance and word length
    return WordSegmentation(input, this->MaxDictionaryEditDistance(), this->maxDictionaryWordLength);
}

/// Find suggested spellings for a multi-word input string (supports word splitting/merging).
/// The string being spell checked.
/// The maximum edit distance between input and corrected words
/// (0=no correction/segmentation only).
/// The word segmented string,
/// the word segmented and spelling corrected string,
/// the Edit distance sum between input string and corrected string,
/// the Sum of word occurence probabilities in log scale (a measure of how common and probable the corrected segmentation is).
Info SymSpell::WordSegmentation(xstring input, int maxEditDistance)
{
    // delegate using the dictionary-wide default for maximum word length
    return WordSegmentation(input, maxEditDistance, this->maxDictionaryWordLength);
}

/// Find suggested spellings for a multi-word input string (supports word splitting/merging).
/// The string being spell checked.
/// The maximum word length that should be considered.
/// The maximum edit distance between input and corrected words
/// (0=no correction/segmentation only).
/// The word segmented string,
/// the word segmented and spelling corrected string,
/// the Edit distance sum between input string and corrected string,
/// the Sum of word occurence probabilities in log scale (a measure of how common and probable the corrected segmentation is).
936 | Info SymSpell::WordSegmentation(xstring input, int maxEditDistance, int maxSegmentationWordLength) 937 | { 938 | int arraySize = min(maxSegmentationWordLength, (int)input.size()); 939 | vector compositions = vector(arraySize); 940 | int circularIndex = -1; 941 | 942 | //outer loop (column): all possible part start positions 943 | for (int j = 0; j < input.size(); j++) 944 | { 945 | //inner loop (row): all possible part lengths (from start position): part can't be bigger than longest word in dictionary (other than long unknown word) 946 | int imax = min((int)input.size() - j, maxSegmentationWordLength); 947 | for (int i = 1; i <= imax; i++) 948 | { 949 | //get top spelling correction/ed for part 950 | xstring part = input.substr(j, i); 951 | int separatorLength = 0; 952 | int topEd = 0; 953 | double topProbabilityLog = 0; 954 | xstring topResult = XL(""); 955 | 956 | if (isxspace(part[0])) 957 | { 958 | //remove space for levensthein calculation 959 | part = part.substr(1); 960 | } 961 | else 962 | { 963 | //add ed+1: space did not exist, had to be inserted 964 | separatorLength = 1; 965 | } 966 | 967 | //remove space from part1, add number of removed spaces to topEd 968 | topEd += part.size(); 969 | //remove space 970 | xregex r(XL("(\\s)+")); 971 | part = regex_replace(part, r, XL("")); 972 | // part = part.Replace(" ", ""); //=System.Text.RegularExpressions.Regex.Replace(part1, @"\s+", ""); 973 | // //add number of removed spaces to ed 974 | topEd -= part.size(); 975 | 976 | vector results = this->Lookup(part, Top, maxEditDistance); 977 | if (results.size() > 0) 978 | { 979 | topResult = results[0].term; 980 | topEd += results[0].distance; 981 | //Naive Bayes Rule 982 | //we assume the word probabilities of two words to be independent 983 | //therefore the resulting probability of the word combination is the product of the two word probabilities 984 | 985 | //instead of computing the product of probabilities we are computing the sum of the logarithm of 
probabilities 986 | //because the probabilities of words are about 10^-10, the product of many such small numbers could exceed (underflow) the floating number range and become zero 987 | //log(ab)=log(a)+log(b) 988 | topProbabilityLog = log10((double)results[0].count / (double)N); 989 | } 990 | else 991 | { 992 | topResult = part; 993 | //default, if word not found 994 | //otherwise long input text would win as long unknown word (with ed=edmax+1 ), although there there should many spaces inserted 995 | topEd += part.size(); 996 | topProbabilityLog = log10(10.0 / (N * pow(10.0, part.size()))); 997 | } 998 | 999 | int destinationIndex = ((i + circularIndex) % arraySize); 1000 | 1001 | //set values in first loop 1002 | if (j == 0) 1003 | { 1004 | compositions[destinationIndex].set(part, topResult, topEd, topProbabilityLog); 1005 | } 1006 | else if ((i == maxSegmentationWordLength) 1007 | //replace values if better probabilityLogSum, if same edit distance OR one space difference 1008 | || (((compositions[circularIndex].getDistance() + topEd == compositions[destinationIndex].getDistance()) \ 1009 | || (compositions[circularIndex].getDistance() + separatorLength + topEd == compositions[destinationIndex].getDistance())) \ 1010 | && (compositions[destinationIndex].getProbability() < compositions[circularIndex].getProbability() + topProbabilityLog)) 1011 | //replace values if smaller edit distance 1012 | || (compositions[circularIndex].getDistance() + separatorLength + topEd < compositions[destinationIndex].getDistance())) 1013 | { 1014 | xstring seg = compositions[circularIndex].getSegmented()+ XL(" ") + part; 1015 | xstring correct = compositions[circularIndex].getCorrected() + XL(" ") + topResult; 1016 | int d = compositions[circularIndex].getDistance() + separatorLength + topEd; 1017 | double prob = compositions[circularIndex].getProbability() + topProbabilityLog; 1018 | compositions[destinationIndex].set(seg, correct, d, prob); 1019 | } 1020 | } 1021 | circularIndex++; 
if (circularIndex == arraySize) circularIndex = 0; 1022 | } 1023 | return compositions[circularIndex]; 1024 | } --------------------------------------------------------------------------------