├── .gitignore
├── CMakeLists.txt
├── README.md
├── SymSpell.vcxproj
├── SymSpellCpp.sln
├── SymSpellTest.cpp
├── SymSpellTest.vcxproj
├── data
└── frequency_dictionary_en_82_765.txt
├── include
├── Helpers.h
└── SymSpell.h
└── src
└── SymSpell.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | .vscode/
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
# Build configuration for the SymSpell C++ port: a static library (symspell)
# plus a test driver executable (symspelltest).
#
# Fixes over the previous version:
#  - cmake_minimum_required() must precede project(); it sets policy defaults.
#  - CMake 2.8 is deprecated and predates CMAKE_CXX_STANDARD (needs >= 3.1);
#    3.15 is required for the multi-value $<CXX_COMPILER_ID:...> genex below.
#  - Directory-scoped include_directories() and the global CMAKE_CXX_FLAGS
#    overwrite are replaced with target-scoped equivalents, so the include
#    path propagates to consumers of the library.
#  - symspelltest now links the symspell library target instead of compiling
#    src/SymSpell.cpp a second time.
cmake_minimum_required(VERSION 3.15)
project(symspell CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

add_library(symspell src/SymSpell.cpp)
target_include_directories(symspell PUBLIC include)
# Optimization + debug info, gated to compilers that accept GNU-style flags.
target_compile_options(symspell PRIVATE
  "$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-O3;-g3>")

add_executable(symspelltest SymSpellTest.cpp)
target_link_libraries(symspelltest PRIVATE symspell)
target_compile_options(symspelltest PRIVATE
  "$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-O3;-g3>")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This project is a C++ port of https://github.com/wolfgarbe/SymSpell v6.5
2 |
3 | Please note that you can disable/enable Unicode support via the macro #define UNICODE_SUPPORT in the header SymSpell.h
4 |
5 | Tested with English and Japanese; this version produced results similar to the original C# code
--------------------------------------------------------------------------------
/SymSpell.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Release
10 | Win32
11 |
12 |
13 | Debug
14 | x64
15 |
16 |
17 | Release
18 | x64
19 |
20 |
21 |
22 | 15.0
23 | {0329E929-BF68-4864-A4B4-2D151C0156D4}
24 | SymSpell
25 | 10.0.17763.0
26 |
27 |
28 |
29 | StaticLibrary
30 | true
31 | v141
32 |
33 |
34 |
35 |
36 | StaticLibrary
37 | false
38 | v141
39 | true
40 |
41 |
42 |
43 |
44 | StaticLibrary
45 | true
46 | v141
47 |
48 |
49 |
50 |
51 | StaticLibrary
52 | false
53 | v141
54 | true
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 | $(SolutionDir)$(Configuration)\
78 | $(SolutionDir)$(Configuration)\temp\
79 |
80 |
81 | $(SolutionDir)$(Configuration)\
82 | $(SolutionDir)$(Configuration)\temp\
83 |
84 |
85 | $(SolutionDir)$(Configuration)\temp\
86 |
87 |
88 | $(SolutionDir)$(Configuration)\temp\
89 |
90 |
91 |
92 | Level3
93 | Disabled
94 | true
95 | true
96 | include;C:\Program Files (x86)\Visual Leak Detector\include;%(AdditionalIncludeDirectories)
97 |
98 |
99 |
100 |
101 | Level3
102 | Disabled
103 | true
104 | true
105 | include;C:\Program Files (x86)\Visual Leak Detector\include;%(AdditionalIncludeDirectories)
106 |
107 |
108 |
109 |
110 | Level3
111 | MaxSpeed
112 | true
113 | true
114 | true
115 | true
116 | include;C:\Program Files (x86)\Visual Leak Detector\include;%(AdditionalIncludeDirectories)
117 |
118 |
119 | true
120 | true
121 |
122 |
123 |
124 |
125 | Level3
126 | MaxSpeed
127 | true
128 | true
129 | true
130 | true
131 | include;C:\Program Files (x86)\Visual Leak Detector\include;%(AdditionalIncludeDirectories)
132 |
133 |
134 | true
135 | true
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 | true
148 | true
149 | true
150 | true
151 | true
152 |
153 |
154 |
155 |
156 |
157 |
--------------------------------------------------------------------------------
/SymSpellCpp.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.28307.1062
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SymSpell", "SymSpell.vcxproj", "{0329E929-BF68-4864-A4B4-2D151C0156D4}"
7 | EndProject
8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SymSpellTest", "SymSpellTest.vcxproj", "{1ADF0935-D2BA-402D-8106-388F3701C98E}"
9 | ProjectSection(ProjectDependencies) = postProject
10 | {0329E929-BF68-4864-A4B4-2D151C0156D4} = {0329E929-BF68-4864-A4B4-2D151C0156D4}
11 | EndProjectSection
12 | EndProject
13 | Global
14 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
15 | Debug|x64 = Debug|x64
16 | Debug|x86 = Debug|x86
17 | Release|x64 = Release|x64
18 | Release|x86 = Release|x86
19 | EndGlobalSection
20 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
21 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Debug|x64.ActiveCfg = Debug|x64
22 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Debug|x64.Build.0 = Debug|x64
23 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Debug|x86.ActiveCfg = Debug|Win32
24 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Debug|x86.Build.0 = Debug|Win32
25 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Release|x64.ActiveCfg = Release|x64
26 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Release|x64.Build.0 = Release|x64
27 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Release|x86.ActiveCfg = Release|Win32
28 | {0329E929-BF68-4864-A4B4-2D151C0156D4}.Release|x86.Build.0 = Release|Win32
29 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Debug|x64.ActiveCfg = Debug|x64
30 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Debug|x64.Build.0 = Debug|x64
31 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Debug|x86.ActiveCfg = Debug|Win32
32 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Debug|x86.Build.0 = Debug|Win32
33 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Release|x64.ActiveCfg = Release|x64
34 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Release|x64.Build.0 = Release|x64
35 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Release|x86.ActiveCfg = Release|Win32
36 | {1ADF0935-D2BA-402D-8106-388F3701C98E}.Release|x86.Build.0 = Release|Win32
37 | EndGlobalSection
38 | GlobalSection(SolutionProperties) = preSolution
39 | HideSolutionNode = FALSE
40 | EndGlobalSection
41 | GlobalSection(ExtensibilityGlobals) = postSolution
42 | SolutionGuid = {FBE80B29-1094-4BB4-9FEB-44818F6CEC08}
43 | EndGlobalSection
44 | EndGlobal
45 |
--------------------------------------------------------------------------------
/SymSpellTest.cpp:
--------------------------------------------------------------------------------
1 | // Test.cpp : This file contains the 'main' function. Program execution begins and ends there.
2 | #include
3 |
4 | #include
5 | #include
6 | // #include
7 |
8 | #ifdef UNICODE_SUPPORT
9 |
// Exercises SymSpell against a Japanese name dictionary: word segmentation
// followed by single-word lookup correction. Results are printed to xcout
// (wcout in the Unicode build).
void TestJapanese()
{
	// Switch stdout to UTF-16 so wide characters print correctly.
	// NOTE(review): Windows-CRT-only call (<io.h>/<fcntl.h>); this test is
	// not portable as written — confirm intended platforms.
	_setmode(_fileno(stdout), _O_U16TEXT);

	const int initialCapacity = 1000;
	const int maxEditDistance = 2;
	const int prefixLength = 5;
	SymSpell symSpell(initialCapacity, maxEditDistance, prefixLength);
	int start = clock();
	// Dictionary columns: term at index 0, frequency at index 1, tab-separated.
	symSpell.LoadDictionary(XL("frequency_jp_names.txt"), 0, 1, XL('\t'));
	int end = clock();
	// Elapsed wall/CPU time in milliseconds (integer division, then cast).
	float time = (float)((end - start) / (CLOCKS_PER_SEC / 1000));
	xcout << XL("Library loaded: ") << time << XL(" ms") << endl;

	xcout << XL("-------Testing Japanese word segmentation-------") << endl;
	// NOTE(review): these literals were mangled by a lossy re-encoding of the
	// source file; the original contains Japanese text (likely person names).
	vector sentences = { XL("�n�ӓc��"), XL("�ѓc��"), XL("�n�瑽��") };
	for (int i = 0; i < sentences.size(); i++)
	{
		Info results = symSpell.WordSegmentation(sentences[i]);
		xcout << sentences[i] << XL(" -> ") << results.getSegmented() << XL(" -> ") << results.getCorrected() << endl;
	}

	xcout << XL("-------Testing Japanese word correction-------") << endl;
	sentences = { XL("�j��"), XL("�c��") };
	for (int i = 0; i < sentences.size(); i++)
	{
		// Verbosity::Closest: only candidates at the smallest edit distance.
		// NOTE(review): results[0] is accessed unchecked — assumes Lookup
		// always returns at least one suggestion; confirm against SymSpell.
		vector results = symSpell.Lookup(sentences[i], Verbosity::Closest);
		xcout << sentences[i] << XL(" -> ") << results[0].term << endl;
	}
}
40 |
41 | void main()
42 | {
43 | TestJapanese();
44 | }
45 |
46 | #else
47 |
// Exercises SymSpell against the bundled English frequency dictionary:
// word segmentation, single-word correction, and compound (multi-word)
// correction, printing each input alongside its result.
void TestEnglish()
{
	// Capacity matches the 82,765-entry dictionary loaded below.
	const int initialCapacity = 82765;
	const int maxEditDistance = 2;
	const int prefixLength = 3;
	SymSpell symSpell(initialCapacity, maxEditDistance, prefixLength);
	int start = clock();
	// Path is relative to the working directory — assumes the test is run
	// from a build subdirectory next to data/.
	string corpus_path = "../data/frequency_dictionary_en_82_765.txt";
	// Dictionary columns: term at index 0, frequency at index 1, space-separated.
	symSpell.LoadDictionary(corpus_path, 0, 1, XL(' '));
	int end = clock();
	// Elapsed time in milliseconds (integer division, then cast).
	float time = (float)((end - start) / (CLOCKS_PER_SEC / 1000));
	xcout << XL("Library loaded: ") << time << XL(" ms") << endl;

	xcout << XL("-------Testing English word segmentation-------") << endl;
	vector sentences = { XL("thequickbrownfoxjumpsoverthelazydog"),
	XL("itwasabrightcolddayinaprilandtheclockswerestrikingthirteen"),
	XL("itwasthebestoftimesitwastheworstoftimesitwastheageofwisdomitwastheageoffoolishness") };
	for (int i = 0; i < sentences.size(); i++)
	{
		Info results = symSpell.WordSegmentation(sentences[i]);
		xcout << sentences[i] << XL(" -> ") << results.getCorrected() << endl;
	}

	xcout << XL("-------Testing English word correction-------") << endl;
	sentences = { XL("tke"), XL("abolution"), XL("intermedaite") };
	for (int i = 0; i < sentences.size(); i++)
	{
		// Verbosity::Closest: only candidates at the smallest edit distance.
		// NOTE(review): results[0] is accessed unchecked — assumes Lookup
		// always returns at least one suggestion; confirm against SymSpell.
		vector results = symSpell.Lookup(sentences[i], Verbosity::Closest);
		xcout << sentences[i] << XL(" -> ") << results[0].term << endl;
	}

	xcout << XL("-------Testing English compound correction-------") << endl;
	sentences = { XL("whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him"),
	XL("in te dhird qarter oflast jear he hadlearned ofca sekretplan"),
	XL("the bigjest playrs in te strogsommer film slatew ith plety of funn") };
	for (int i = 0; i < sentences.size(); i++)
	{
		vector results = symSpell.LookupCompound(sentences[i]);
		xcout << sentences[i] << XL(" -> ") << results[0].term << endl;
	}
}
89 |
// Entry point for the non-Unicode build: runs the English test suite.
int main()
{
	TestEnglish();
	return 0;
}
95 |
96 | #endif
97 |
98 |
99 |
--------------------------------------------------------------------------------
/SymSpellTest.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Release
10 | Win32
11 |
12 |
13 | Debug
14 | x64
15 |
16 |
17 | Release
18 | x64
19 |
20 |
21 |
22 |
23 |
24 |
25 | 15.0
26 | {1ADF0935-D2BA-402D-8106-388F3701C98E}
27 | Win32Proj
28 | Test
29 | 10.0.17763.0
30 |
31 |
32 |
33 | Application
34 | true
35 | v141
36 | Unicode
37 |
38 |
39 | Application
40 | false
41 | v141
42 | true
43 | Unicode
44 |
45 |
46 | Application
47 | true
48 | v141
49 | Unicode
50 |
51 |
52 | Application
53 | false
54 | v141
55 | true
56 | Unicode
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 | false
78 | $(LibraryPath)
79 | $(SolutionDir)$(Configuration)\
80 | $(SolutionDir)$(Configuration)\temp\
81 |
82 |
83 | true
84 | $(SolutionDir)$(Configuration)\temp\
85 |
86 |
87 | true
88 | $(SolutionDir)$(Configuration)\
89 | $(SolutionDir)$(Configuration)\temp\
90 |
91 |
92 | false
93 | $(SolutionDir)$(Configuration)\temp\
94 |
95 |
96 |
97 |
98 |
99 | Level3
100 | MaxSpeed
101 | true
102 | true
103 | true
104 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
105 | true
106 | include;
107 |
108 |
109 | Console
110 | true
111 | true
112 | true
113 | $(SolutionDir)$(Configuration)\
114 | SymSpell.lib
115 |
116 |
117 |
118 |
119 |
120 |
121 | Level3
122 | Disabled
123 | true
124 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
125 | true
126 | include;
127 |
128 |
129 | Console
130 | true
131 | $(SolutionDir)$(Configuration)\
132 | SymSpell.lib
133 |
134 |
135 |
136 |
137 |
138 |
139 | Level3
140 | Disabled
141 | true
142 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions)
143 | true
144 | include;
145 |
146 |
147 | Console
148 | true
149 | $(SolutionDir)$(Configuration)\
150 | SymSpell.lib
151 |
152 |
153 |
154 |
155 |
156 |
157 | Level3
158 | MaxSpeed
159 | true
160 | true
161 | true
162 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
163 | true
164 | include;
165 |
166 |
167 | Console
168 | true
169 | true
170 | true
171 | $(SolutionDir)$(Configuration)\
172 | SymSpell.lib
173 |
174 |
175 |
176 |
177 |
178 |
--------------------------------------------------------------------------------
/include/Helpers.h:
--------------------------------------------------------------------------------
#pragma once

// LIBRARY_API decorates symbols for DLL export/import on Windows builds
// (export when building the library, import when consuming it); it expands
// to nothing on other platforms.
#ifdef _WIN32
#	ifdef LIBRARY_EXPORTS
#		define LIBRARY_API __declspec(dllexport)
#	else
#		define LIBRARY_API __declspec(dllimport)
#	endif
#else
#	define LIBRARY_API
#endif

// Shorthand aliases for the standard hash containers used in this project.
#define Dictionary unordered_map
#define HashSet unordered_set

// The x* aliases select wide (wchar_t) or narrow (char) string facilities
// depending on UNICODE_SUPPORT, so the rest of the code base is written once
// against xstring/xchar/xcout/etc. XL(x) turns a literal into the matching
// width (L-prefixed for wide builds, unchanged for narrow builds).
#ifdef UNICODE_SUPPORT
#	define xstring wstring
#	define xchar wchar_t
#	define xifstream wifstream
#	define xstringstream wstringstream
#	define isxspace iswspace
#	define xregex wregex
#	define xsmatch wsmatch
#	define to_xstring to_wstring
#	define XL(x) L##x
#	define xcout wcout
#else
#	define xstring string
#	define xchar char
#	define xifstream ifstream
#	define xstringstream stringstream
#	define isxspace isspace
#	define xregex regex
#	define xsmatch smatch
#	define to_xstring to_string
#	define XL(x) x
#	define xcout cout
#endif
38 |
39 | #include
40 | #include
41 | #include
42 | #include
43 | #include
44 | #include
45 | #include
46 | using namespace std;
47 |
48 |
49 | // A growable list of elements that's optimized to support adds, but not deletes,
50 | // of large numbers of elements, storing data in a way that's friendly to the garbage
51 | // collector (not backed by a monolithic array object), and can grow without needing
52 | // to copy the entire backing array contents from the old backing array to the new.
// NOTE(review): the template parameter list was stripped when this file was
// dumped; presumably `template <typename T>` — T is the element type below.
template
class ChunkArray
{
private:
	const int ChunkSize = 4096; //this must be a power of 2, otherwise can't optimize Row and Col functions
	const int DivShift = 12; // number of bits to shift right to do division by ChunkSize (the bit position of ChunkSize)
	int Row(unsigned int index) { return index >> DivShift; } // same as index / ChunkSize
	int Col(unsigned int index) { return index & (ChunkSize - 1); } //same as index % ChunkSize
	// Total slots currently allocated across all chunks.
	// NOTE(review): returns int from a size_t product — overflows past ~2^31
	// elements; presumably fine for dictionary-sized data, but confirm.
	int Capacity() { return Values.size() * ChunkSize; }

public:
	// Outer vector of fixed-size chunks; growing appends a chunk instead of
	// reallocating and copying one monolithic backing array.
	vector> Values;
	// Number of elements logically stored (may be less than Capacity()).
	int Count;

	ChunkArray()
	{
		Count = 0;
	}

	// Pre-allocates enough chunks to hold initialCapacity elements.
	// Does not change Count.
	void Reserve(int initialCapacity)
	{
		// Round up so a partial chunk still gets allocated.
		int chunks = (initialCapacity + ChunkSize - 1) / ChunkSize;
		Values.resize(chunks);
		for (int i = 0; i < chunks; ++i)
		{
			Values[i].resize(ChunkSize);
		}
	}

	// Appends value, growing by one chunk when full; returns the index at
	// which the element was stored.
	int Add(T &value)
	{
		if (Count == Capacity())
		{
			Values.push_back(vector());
			Values[Values.size() - 1].resize(ChunkSize);
		}

		int row = Row(Count);
		int col = Col(Count);

		Values[row][col] = value;
		return Count++;
	}

	// Logically empties the array; allocated chunks are kept for reuse.
	void Clear()
	{
		Count = 0;
	}

	// Unchecked element access; index must be < Count.
	T& At(unsigned int index)
	{
		return Values[Row(index)][Col(index)];
	}

	// Unchecked overwrite of an existing slot; index must be < Capacity().
	void Set(unsigned int index, T &value)
	{
		Values[Row(index)][Col(index)] = value;
	}
};
112 |
// Small static utilities shared by the edit-distance/similarity classes:
// null-input conventions, common prefix/suffix trimming, and conversions
// between distance and similarity measures.
class Helpers {
public:
	/// Determines the proper return value of an edit distance function when one or
	/// both strings are null (empty). Returns the other string's length if it is
	/// within maxDistance, otherwise -1.
	/// NOTE(review): size() (size_t) is narrowed to int here — fine for
	/// dictionary-sized words, but confirm for very long inputs.
	static int NullDistanceResults(xstring string1, xstring string2, double maxDistance) {
		if (string1.empty())
			return (string2.empty()) ? 0 : (string2.size() <= maxDistance) ? string2.size() : -1;
		return (string1.size() <= maxDistance) ? string1.size() : -1;
	}

	/// Determines the proper return value of a similarity function when one or
	/// both strings are null (empty): 1 if both are empty, else 0 when a zero
	/// similarity still meets minSimilarity, otherwise -1.
	static int NullSimilarityResults(xstring string1, xstring string2, double minSimilarity) {
		return (string1.empty() && string2.empty()) ? 1 : (0 <= minSimilarity) ? 0 : -1;
	}

	/// Calculates starting position and lengths of two strings such that common
	/// prefix and suffix substrings are excluded.
	/// Expects string1.size() to be less than or equal to string2.size()
	static void PrefixSuffixPrep(xstring string1, xstring string2, int &len1, int &len2, int &start) {
		len2 = string2.size();
		len1 = string1.size(); // this is also the minimum length of the two strings
		// suffix common to both strings can be ignored
		while (len1 != 0 && string1[len1 - 1] == string2[len2 - 1]) {
			len1 = len1 - 1; len2 = len2 - 1;
		}
		// prefix common to both strings can be ignored
		start = 0;
		while (start != len1 && string1[start] == string2[start]) start++;
		if (start != 0) {
			len2 -= start; // length of the part excluding common prefix and suffix
			len1 -= start;
		}
	}

	/// Calculate a similarity measure from an edit distance.
	/// The length of the longer of the two strings the edit distance is from.
	/// The edit distance between two strings.
	/// A similarity value from 0 to 1.0 (1 - (distance / length)), or -1 for a
	/// negative (sentinel) distance.
	static double ToSimilarity(int distance, int length) {
		return (distance < 0) ? -1 : 1 - (distance / (double)length);
	}

	/// Calculate an edit distance from a similarity measure.
	/// The length of the longer of the two strings the edit distance is from.
	/// The similarity measure between two strings.
	/// An edit distance from 0 to length (length * (1 - similarity)).
	/// The tiny epsilon guards against floating-point round-down in the cast.
	static int ToDistance(double similarity, int length) {
		return (int)((length * (1 - similarity)) + .0000000001);
	}

	///
	/// Three-way CompareTo of two integers: 0 if equal, 1 if mainValue is
	/// greater, -1 if it is smaller.
	///
	static int CompareTo(int64_t mainValue, int64_t compareValue)
	{
		if (mainValue == compareValue)
			return 0;
		else if (mainValue > compareValue)
			return 1;
		else
			return -1;
	}
};
177 |
178 |
179 | /// Types implementing the IDistance interface provide methods
180 | /// for computing a relative distance between two strings.
181 | class IDistance {
182 | public:
183 | /// Return a measure of the distance between two strings.
184 | /// One of the strings to compare.
185 | /// The other string to compare.
186 | /// 0 if the strings are equivalent, otherwise a positive number whose
187 | /// magnitude increases as difference between the strings increases.
188 | virtual double Distance(xstring string1, xstring string2) = 0;
189 |
190 | /// Return a measure of the distance between two strings.
191 | /// One of the strings to compare.
192 | /// The other string to compare.
193 | /// The maximum distance that is of interest.
194 | /// -1 if the distance is greater than the maxDistance, 0 if the strings
195 | /// are equivalent, otherwise a positive number whose magnitude increases as
196 | /// difference between the strings increases.
197 | virtual double Distance(xstring string1, xstring string2, double maxDistance) = 0;
198 | };
199 |
200 |
201 | /// Types implementing the ISimilarity interface provide methods
202 | /// for computing a normalized value of similarity between two strings.
203 | class ISimilarity {
204 | public:
205 | /// Return a measure of the similarity between two strings.
206 | /// One of the strings to compare.
207 | /// The other string to compare.
208 | /// The degree of similarity 0 to 1.0, where 0 represents a lack of any
209 | /// notable similarity, and 1 represents equivalent strings.
210 | virtual double Similarity(xstring string1, xstring string2) = 0;
211 |
212 | /// Return a measure of the similarity between two strings.
213 | /// One of the strings to compare.
214 | /// The other string to compare.
215 | /// The minimum similarity that is of interest.
216 | /// The degree of similarity 0 to 1.0, where -1 represents a similarity
217 | /// lower than minSimilarity, otherwise, a number between 0 and 1.0 where 0
218 | /// represents a lack of any notable similarity, and 1 represents equivalent
219 | /// strings.
220 | virtual double Similarity(xstring string1, xstring string2, double minSimilarity) = 0;
221 | };
222 |
223 | ///
224 | /// Class providing optimized methods for computing Damerau-Levenshtein Optimal string
225 | /// Alignment (OSA) comparisons between two strings.
226 | ///
227 | ///
228 | /// Copyright ©2015-2018 SoftWx, Inc.
229 | /// The inspiration for creating highly optimized edit distance functions was
230 | /// from Sten Hjelmqvist's "Fast, memory efficient" algorithm, described at
231 | /// http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm
232 | /// The Damerau-Levenshtein algorithm is basically the Levenshtein algorithm with a
233 | /// modification that considers transposition of two adjacent characters as a single edit.
234 | /// The optimized algorithm was described in detail in my post at
235 | /// http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
236 | /// Also see http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
237 | /// Note that this implementation of Damerau-Levenshtein is the simpler and faster optimal
238 | /// string alignment (aka restricted edit) distance that differs slightly from the classic
239 | /// algorithm by imposing the restriction that no substring is edited more than once. So,
240 | /// for example, "CA" to "ABC" has an edit distance of 2 by a complete application of
241 | /// Damerau-Levenshtein, but has a distance of 3 by the method implemented here, that uses
242 | /// the optimal string alignment algorithm. This means that this algorithm is not a true
243 | /// metric since it does not uphold the triangle inequality. In real use though, this OSA
244 | /// version may be desired. Besides being faster, it does not give the lower distance score
245 | /// for transpositions that occur across long distances. Actual human error transpositions
246 | /// are most likely for adjacent characters. For example, the classic Damerau algorithm
247 | /// gives a distance of 1 for these two strings: "sated" and "dates" (it counts the 's' and
248 | /// 'd' as a single transposition). The optimal string alignment version of Damerau in this
249 | /// class gives a distance of 2 for these two strings (2 substitutions), as it only counts
250 | /// transpositions for adjacent characters.
251 | /// The methods in this class are not threadsafe. Use the static versions in the Distance
252 | /// class if that is required.
class DamerauOSA : public IDistance {
private:
	// Scratch cost rows reused across calls (this reuse is why instances are
	// not threadsafe). NOTE(review): the element type was stripped in this
	// dump; presumably vector<int> throughout this class.
	vector baseChar1Costs;
	vector basePrevChar1Costs;

public:
	/// Create a new instance of DamerauOSA.
	DamerauOSA() {
	}

	/// Create a new instance of DamerauOSA using the specified expected
	/// maximum string length that will be encountered.
	/// By specifying the max expected string length, better memory efficiency
	/// can be achieved.
	/// The expected maximum length of strings that will
	/// be passed to the edit distance functions.
	DamerauOSA(int expectedMaxstringLength) {
		if (expectedMaxstringLength <= 0) throw "expectedMaxstringLength must be larger than 0";
		this->baseChar1Costs = vector(expectedMaxstringLength, 0);
		this->basePrevChar1Costs = vector(expectedMaxstringLength, 0);
	}

	/// Compute and return the Damerau-Levenshtein optimal string
	/// alignment edit distance between two strings.
	/// https://github.com/softwx/SoftWx.Match
	/// This method is not threadsafe.
	/// One of the strings to compare.
	/// The other string to compare.
	/// 0 if the strings are equivalent, otherwise a positive number whose
	/// magnitude increases as difference between the strings increases.
	double Distance(xstring string1, xstring string2) {
		if (string1.empty()) return string2.size();
		if (string2.empty()) return string1.size();

		// if strings of different lengths, ensure shorter string is in string1. This can result in a little
		// faster speed by spending more time spinning just the inner loop during the main processing.
		if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; }

		// identify common suffix and/or prefix that can be ignored
		int len1, len2, start;
		Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start);
		if (len1 == 0) return len2;

		// Grow the scratch rows to fit the trimmed length of the longer string.
		if (len2 > this->baseChar1Costs.size()) {
			this->baseChar1Costs = vector(len2, 0);
			this->basePrevChar1Costs = vector(len2, 0);
		}
		return Distance(string1, string2, len1, len2, start, this->baseChar1Costs, this->basePrevChar1Costs);
	}

	/// Compute and return the Damerau-Levenshtein optimal string
	/// alignment edit distance between two strings.
	/// https://github.com/softwx/SoftWx.Match
	/// This method is not threadsafe.
	/// One of the strings to compare.
	/// The other string to compare.
	/// The maximum distance that is of interest.
	/// -1 if the distance is greater than the maxDistance, 0 if the strings
	/// are equivalent, otherwise a positive number whose magnitude increases as
	/// difference between the strings increases.
	double Distance(xstring string1, xstring string2, double maxDistance) {
		if (string1.empty() || string2.empty()) return Helpers::NullDistanceResults(string1, string2, maxDistance);
		if (maxDistance <= 0) return (string1 == string2) ? 0 : -1;
		maxDistance = ceil(maxDistance);
		int iMaxDistance = (maxDistance <= INT_MAX) ? (int)maxDistance : INT_MAX;

		// if strings of different lengths, ensure shorter string is in string1. This can result in a little
		// faster speed by spending more time spinning just the inner loop during the main processing.
		if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; }
		// Length difference alone already exceeds the cap: early out.
		if (string2.size() - string1.size() > iMaxDistance) return -1;

		// identify common suffix and/or prefix that can be ignored
		int len1, len2, start;
		Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start);
		if (len1 == 0) return (len2 <= iMaxDistance) ? len2 : -1;

		if (len2 > this->baseChar1Costs.size()) {
			this->baseChar1Costs = vector(len2, 0);
			this->basePrevChar1Costs = vector(len2, 0);
		}
		// Use the banded (windowed) variant only when the cap actually
		// restricts the search; otherwise the plain variant is cheaper.
		if (iMaxDistance < len2) {
			return Distance(string1, string2, len1, len2, start, iMaxDistance, this->baseChar1Costs, this->basePrevChar1Costs);
		}
		return Distance(string1, string2, len1, len2, start, this->baseChar1Costs, this->basePrevChar1Costs);
	}

	/// Return Damerau-Levenshtein optimal string alignment similarity
	/// between two strings (1 - (damerau distance / len of longer string)).
	/// One of the strings to compare.
	/// The other string to compare.
	/// The degree of similarity 0 to 1.0, where 0 represents a lack of any
	/// notable similarity, and 1 represents equivalent strings.
	double Similarity(xstring string1, xstring string2) {
		if (string1.empty()) return (string2.empty()) ? 1 : 0;
		if (string2.empty()) return 0;

		// if strings of different lengths, ensure shorter string is in string1. This can result in a little
		// faster speed by spending more time spinning just the inner loop during the main processing.
		if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; }

		// identify common suffix and/or prefix that can be ignored
		int len1, len2, start;
		Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start);
		if (len1 == 0) return 1.0;

		if (len2 > this->baseChar1Costs.size()) {
			this->baseChar1Costs = vector(len2, 0);
			this->basePrevChar1Costs = vector(len2, 0);
		}
		return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start, this->baseChar1Costs, this->basePrevChar1Costs)
			, string2.size());
	}

	/// Return Damerau-Levenshtein optimal string alignment similarity
	/// between two strings (1 - (damerau distance / len of longer string)).
	/// One of the strings to compare.
	/// The other string to compare.
	/// The minimum similarity that is of interest.
	/// The degree of similarity 0 to 1.0, where -1 represents a similarity
	/// lower than minSimilarity, otherwise, a number between 0 and 1.0 where 0
	/// represents a lack of any notable similarity, and 1 represents equivalent
	/// strings.
	double Similarity(xstring string1, xstring string2, double minSimilarity) {
		if (minSimilarity < 0 || minSimilarity > 1) throw "minSimilarity must be in range 0 to 1.0";
		if (string1.empty() || string2.empty()) return Helpers::NullSimilarityResults(string1, string2, minSimilarity);

		// if strings of different lengths, ensure shorter string is in string1. This can result in a little
		// faster speed by spending more time spinning just the inner loop during the main processing.
		if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; }

		// Convert the similarity floor into a distance cap for early outs.
		int iMaxDistance = Helpers::ToDistance(minSimilarity, string2.size());
		if (string2.size() - string1.size() > iMaxDistance) return -1;
		if (iMaxDistance <= 0) return (string1 == string2) ? 1 : -1;

		// identify common suffix and/or prefix that can be ignored
		int len1, len2, start;
		Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start);
		if (len1 == 0) return 1.0;

		if (len2 > this->baseChar1Costs.size()) {
			this->baseChar1Costs = vector(len2, 0);
			this->basePrevChar1Costs = vector(len2, 0);
		}
		if (iMaxDistance < len2) {
			return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start, iMaxDistance, this->baseChar1Costs, this->basePrevChar1Costs)
				, string2.size());
		}
		return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start, this->baseChar1Costs, this->basePrevChar1Costs)
			, string2.size());
	}

	/// Internal implementation of the core Damerau-Levenshtein, optimal string alignment algorithm.
	/// Operates on the trimmed region [start, start+len) of each string using
	/// two rolling cost rows (current and previous) instead of a full matrix.
	/// https://github.com/softwx/SoftWx.Match
	static int Distance(xstring string1, xstring string2, int len1, int len2, int start, vector char1Costs, vector prevChar1Costs) {
		int j;
		// Initialize the first cost row: distance from the empty prefix.
		for (j = 0; j < len2; j++) char1Costs[j] = j + 1;
		xchar char1 = XL(' ');
		int currentCost = 0;
		for (int i = 0; i < len1; ++i) {
			xchar prevChar1 = char1;
			char1 = string1[start + i];
			xchar char2 = XL(' ');
			int leftCharCost, aboveCharCost;
			leftCharCost = aboveCharCost = i;
			int nextTransCost = 0;
			for (j = 0; j < len2; ++j) {
				int thisTransCost = nextTransCost;
				nextTransCost = prevChar1Costs[j];
				prevChar1Costs[j] = currentCost = leftCharCost; // cost of diagonal (substitution)
				leftCharCost = char1Costs[j]; // left now equals current cost (which will be diagonal at next iteration)
				xchar prevChar2 = char2;
				char2 = string2[start + j];
				if (char1 != char2) {
					//substitution if neither of two conditions below
					if (aboveCharCost < currentCost) currentCost = aboveCharCost; // deletion
					if (leftCharCost < currentCost) currentCost = leftCharCost; // insertion
					++currentCost;
					// OSA extension: adjacent transposition counts as one edit.
					if ((i != 0) && (j != 0)
						&& (char1 == prevChar2)
						&& (prevChar1 == char2)
						&& (thisTransCost + 1 < currentCost)) {
						currentCost = thisTransCost + 1; // transposition
					}
				}
				char1Costs[j] = aboveCharCost = currentCost;
			}
		}
		return currentCost;
	}

	/// Internal implementation of the core Damerau-Levenshtein, optimal string alignment algorithm
	/// that accepts a maxDistance. Only a diagonal band of width ~2*maxDistance
	/// is evaluated; cells outside the band cannot yield a distance within the
	/// cap. Returns -1 as soon as the band's lower-right cell exceeds the cap.
	static int Distance(xstring string1, xstring string2, int len1, int len2, int start, int maxDistance, vector char1Costs, vector prevChar1Costs) {
		int i, j;
		//for (j = 0; j < maxDistance; j++) char1Costs[j] = j+1;
		for (j = 0; j < maxDistance; j++)
			char1Costs[j] = j + 1;
		// Cells beyond the band start above the cap so they never win a min.
		for (; j < len2;) char1Costs[j++] = maxDistance + 1;
		int lenDiff = len2 - len1;
		int jStartOffset = maxDistance - lenDiff;
		int jStart = 0;
		int jEnd = maxDistance;
		xchar char1 = XL(' ');
		int currentCost = 0;
		for (i = 0; i < len1; ++i) {
			xchar prevChar1 = char1;
			char1 = string1[start + i];
			xchar char2 = XL(' ');
			int leftCharCost, aboveCharCost;
			leftCharCost = aboveCharCost = i;
			int nextTransCost = 0;
			// no need to look beyond window of lower right diagonal - maxDistance cells (lower right diag is i - lenDiff)
			// and the upper left diagonal + maxDistance cells (upper left is i)
			jStart += (i > jStartOffset) ? 1 : 0;
			jEnd += (jEnd < len2) ? 1 : 0;
			for (j = jStart; j < jEnd; ++j) {
				int thisTransCost = nextTransCost;
				nextTransCost = prevChar1Costs[j];
				prevChar1Costs[j] = currentCost = leftCharCost; // cost on diagonal (substitution)
				leftCharCost = char1Costs[j]; // left now equals current cost (which will be diagonal at next iteration)
				xchar prevChar2 = char2;
				char2 = string2[start + j];
				if (char1 != char2) {
					// substitution if neither of two conditions below
					if (aboveCharCost < currentCost) currentCost = aboveCharCost; // deletion
					if (leftCharCost < currentCost) currentCost = leftCharCost; // insertion
					++currentCost;
					// OSA extension: adjacent transposition counts as one edit.
					if ((i != 0) && (j != 0)
						&& (char1 == prevChar2)
						&& (prevChar1 == char2)
						&& (thisTransCost + 1 < currentCost)) {
						currentCost = thisTransCost + 1; // transposition
					}
				}
				char1Costs[j] = aboveCharCost = currentCost;
			}
			// The cell on the final-answer diagonal already exceeds the cap:
			// no completion of this row can come back under it.
			if (char1Costs[i + lenDiff] > maxDistance) return -1;
		}
		return (currentCost <= maxDistance) ? currentCost : -1;
	}
};
494 |
495 |
///
/// Class providing optimized methods for computing Levenshtein comparisons between two strings.
///
///
/// Copyright ©2015-2018 SoftWx, Inc.
/// The inspiration for creating highly optimized edit distance functions was
/// from Sten Hjelmqvist's "Fast, memory efficient" algorithm, described at
/// http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm
/// The Levenshtein algorithm computes the edit distance metric between two strings, i.e.
/// the number of insertion, deletion, and substitution edits required to transform one
/// string to the other. This value will be >= 0, where 0 indicates identical strings.
/// Comparisons are case sensitive, so for example, "Fred" and "fred" will have a
/// distance of 1. The optimized algorithm was described in detail in the post at
/// http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
/// Also see http://en.wikipedia.org/wiki/Levenshtein_distance for general information.
/// The methods in this class are not thread-safe (they share a scratch buffer).
/// Use the static versions in the Distance class if that is required.
class Levenshtein : public IDistance, ISimilarity {
private:
    // Reusable scratch row of the cost matrix, grown on demand by the public
    // methods below.
    // NOTE(review): the element type appears to have been lost in a text
    // export ('vector' without a template argument) -- presumably vector<int>;
    // confirm against the repository.
    vector baseChar1Costs;

public:

    /// Create a new instance of Levenshtein; the scratch buffer is grown on
    /// first use.
    Levenshtein() {

    }

    /// Create a new instance of Levenshtein using the specified expected
    /// maximum string length that will be encountered.
    /// By specifying the max expected string length, better memory efficiency
    /// can be achieved.
    /// expectedMaxstringLength: the expected maximum length of strings that
    /// will be passed to the Levenshtein methods; must be > 0 or a
    /// const char* message is thrown.
    Levenshtein(int expectedMaxstringLength) {
        if (expectedMaxstringLength <= 0) throw "expectedMaxstringLength must be larger than 0";
        this->baseChar1Costs = vector(expectedMaxstringLength, 0);
    }

    /// Compute and return the Levenshtein edit distance between two strings.
    /// https://github.com/softwx/SoftWx.Match
    /// This method is not thread-safe.
    /// string1: one of the strings to compare.
    /// string2: the other string to compare.
    /// Returns 0 if the strings are equivalent, otherwise a positive number whose
    /// magnitude increases as difference between the strings increases.
    double Distance(xstring string1, xstring string2) {
        // An empty string is at distance length-of-the-other from the other.
        if (string1.empty()) return string2.size();
        if (string2.empty()) return string1.size();

        // if strings of different lengths, ensure shorter string is in string1. This can result in a little
        // faster speed by spending more time spinning just the inner loop during the main processing.
        if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; }

        // identify common suffix and/or prefix that can be ignored
        int len1, len2, start;
        Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start);
        if (len1 == 0) return len2;

        // Grow the scratch buffer when too small, then run the core algorithm.
        // NOTE(review): len2 (int) is compared against size() (unsigned) --
        // safe while len2 >= 0, which PrefixSuffixPrep guarantees here.
        return Distance(string1, string2, len1, len2, start,
            (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0)));
    }

    /// Compute and return the Levenshtein edit distance between two strings.
    /// https://github.com/softwx/SoftWx.Match
    /// This method is not thread-safe.
    /// string1: one of the strings to compare.
    /// string2: the other string to compare.
    /// maxDistance: the maximum distance that is of interest.
    /// Returns -1 if the distance is greater than the maxDistance, 0 if the
    /// strings are equivalent, otherwise a positive number whose magnitude
    /// increases as difference between the strings increases.
    double Distance(xstring string1, xstring string2, double maxDistance) {
        if (string1.empty() || string2.empty()) return Helpers::NullDistanceResults(string1, string2, maxDistance);
        if (maxDistance <= 0) return (string1 == string2) ? 0 : -1;
        // Round the requested limit up and clamp it into int range.
        maxDistance = ceil(maxDistance);
        int iMaxDistance = (maxDistance <= INT_MAX) ? (int)maxDistance : INT_MAX;

        // if strings of different lengths, ensure shorter string is in string1. This can result in a little
        // faster speed by spending more time spinning just the inner loop during the main processing.
        if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; }
        // Length difference alone already exceeds the limit (string2 is the
        // longer one, so the unsigned subtraction cannot wrap).
        if (string2.size() - string1.size() > iMaxDistance) return -1;

        // identify common suffix and/or prefix that can be ignored
        int len1, len2, start;
        Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start);
        if (len1 == 0) return (len2 <= iMaxDistance) ? len2 : -1;

        // Use the banded implementation only when the band is narrower than
        // the full row; otherwise the plain implementation is cheaper.
        if (iMaxDistance < len2) {
            return Distance(string1, string2, len1, len2, start, iMaxDistance,
                (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0)));
        }
        return Distance(string1, string2, len1, len2, start,
            (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0)));
    }

    /// Return Levenshtein similarity between two strings
    /// (1 - (levenshtein distance / len of longer string)).
    /// string1: one of the strings to compare.
    /// string2: the other string to compare.
    /// Returns the degree of similarity 0 to 1.0, where 0 represents a lack of
    /// any notable similarity, and 1 represents equivalent strings.
    double Similarity(xstring string1, xstring string2) {
        if (string1.empty()) return (string2.empty()) ? 1 : 0;
        if (string2.empty()) return 0;

        // if strings of different lengths, ensure shorter string is in string1. This can result in a little
        // faster speed by spending more time spinning just the inner loop during the main processing.
        if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; }

        // identify common suffix and/or prefix that can be ignored
        int len1, len2, start;
        Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start);
        if (len1 == 0) return 1.0;

        return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start,
            (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0)))
            , string2.size());
    }


    /// Return Levenshtein similarity between two strings
    /// (1 - (levenshtein distance / len of longer string)).
    /// string1: one of the strings to compare.
    /// string2: the other string to compare.
    /// minSimilarity: the minimum similarity that is of interest (0..1.0).
    /// Returns -1 when the similarity is lower than minSimilarity, otherwise a
    /// number between 0 and 1.0 where 0 represents a lack of any notable
    /// similarity, and 1 represents equivalent strings.
    double Similarity(xstring string1, xstring string2, double minSimilarity) {
        if (minSimilarity < 0 || minSimilarity > 1) throw "minSimilarity must be in range 0 to 1.0";
        if (string1.empty() || string2.empty()) return Helpers::NullSimilarityResults(string1, string2, minSimilarity);

        // if strings of different lengths, ensure shorter string is in string1. This can result in a little
        // faster speed by spending more time spinning just the inner loop during the main processing.
        if (string1.size() > string2.size()) { xstring t = string1; string1 = string2; string2 = t; }

        // Convert the similarity floor into a distance ceiling over string2.
        int iMaxDistance = Helpers::ToDistance(minSimilarity, string2.size());
        if (string2.size() - string1.size() > iMaxDistance) return -1;
        if (iMaxDistance == 0) return (string1 == string2) ? 1 : -1;

        // identify common suffix and/or prefix that can be ignored
        int len1, len2, start;
        Helpers::PrefixSuffixPrep(string1, string2, len1, len2, start);
        if (len1 == 0) return 1.0;

        // Banded implementation when the band is narrower than the full row.
        if (iMaxDistance < len2) {
            return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start, iMaxDistance,
                (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0)))
                , string2.size());
        }
        return Helpers::ToSimilarity(Distance(string1, string2, len1, len2, start,
            (this->baseChar1Costs = (len2 <= this->baseChar1Costs.size()) ? this->baseChar1Costs : vector(len2, 0)))
            , string2.size());
    }

    /// Internal implementation of the core Levenshtein algorithm over the
    /// distinct middle sections of the two strings, using a single reusable
    /// row buffer (char1Costs) instead of a full matrix.
    /// The start == 0 case is split out so the hot inner loop can index the
    /// strings without the constant offset addition.
    /// https://github.com/softwx/SoftWx.Match
    static int Distance(xstring string1, xstring string2, int len1, int len2, int start, vector &char1Costs) {
        // First row: distance from the empty prefix to each prefix of string2.
        for (int j = 0; j < len2; j++)
            char1Costs[j] = j + 1;
        int currentCharCost = 0;
        if (start == 0) {
            for (int i = 0; i < len1; ++i) {
                int leftCharCost, aboveCharCost;
                leftCharCost = aboveCharCost = i;
                xchar char1 = string1[i];
                for (int j = 0; j < len2; ++j) {
                    currentCharCost = leftCharCost; // cost on diagonal (substitution)
                    leftCharCost = char1Costs[j];
                    if (string2[j] != char1) {
                        // substitution if neither of two conditions below
                        if (aboveCharCost < currentCharCost) currentCharCost = aboveCharCost; // deletion
                        if (leftCharCost < currentCharCost) currentCharCost = leftCharCost; // insertion
                        ++currentCharCost;
                    }
                    char1Costs[j] = aboveCharCost = currentCharCost;
                }
            }
        }
        else {
            // Identical to the branch above except characters are read at the
            // shared-prefix offset `start`.
            for (int i = 0; i < len1; ++i) {
                int leftCharCost, aboveCharCost;
                leftCharCost = aboveCharCost = i;
                xchar char1 = string1[start + i];
                for (int j = 0; j < len2; ++j) {
                    currentCharCost = leftCharCost; // cost on diagonal (substitution)
                    leftCharCost = char1Costs[j];
                    if (string2[start + j] != char1) {
                        // substitution if neither of two conditions below
                        if (aboveCharCost < currentCharCost) currentCharCost = aboveCharCost; // deletion
                        if (leftCharCost < currentCharCost) currentCharCost = leftCharCost; // insertion
                        ++currentCharCost;
                    }
                    char1Costs[j] = aboveCharCost = currentCharCost;
                }
            }
        }
        return currentCharCost;
    }

    /// Internal implementation of the core Levenshtein algorithm that accepts
    /// a maxDistance: only the diagonal band of width related to maxDistance
    /// is computed, with early exit (-1) once the limit is provably exceeded.
    /// https://github.com/softwx/SoftWx.Match
    static int Distance(xstring string1, xstring string2, int len1, int len2, int start, int maxDistance, vector &char1Costs) {
        // if (maxDistance >= len2) return Distance(string1, string2, len1, len2, start, char1Costs);
        int i, j;
        // First row: cells inside the band get their true cost; the rest are
        // primed with maxDistance + 1 so they never win a min() comparison.
        for (j = 0; j < maxDistance; j++)
            char1Costs[j] = j+1;
        for (; j < len2;) char1Costs[j++] = maxDistance + 1;
        int lenDiff = len2 - len1;
        int jStartOffset = maxDistance - lenDiff;
        int jStart = 0;
        int jEnd = maxDistance;
        int currentCost = 0;
        if (start == 0) {
            for (i = 0; i < len1; ++i) {
                xchar char1 = string1[i];
                int prevChar1Cost, aboveCharCost;
                prevChar1Cost = aboveCharCost = i;
                // no need to look beyond window of lower right diagonal - maxDistance cells (lower right diag is i - lenDiff)
                // and the upper left diagonal + maxDistance cells (upper left is i)
                jStart += (i > jStartOffset) ? 1 : 0;
                jEnd += (jEnd < len2) ? 1 : 0;
                for (j = jStart; j < jEnd; ++j) {
                    currentCost = prevChar1Cost; // cost on diagonal (substitution)
                    prevChar1Cost = char1Costs[j];
                    if (string2[j] != char1) {
                        // substitution if neither of two conditions below
                        if (aboveCharCost < currentCost) currentCost = aboveCharCost; // deletion
                        if (prevChar1Cost < currentCost) currentCost = prevChar1Cost; // insertion
                        ++currentCost;
                    }
                    char1Costs[j] = aboveCharCost = currentCost;
                }
                // Early exit: the answer-diagonal cell already exceeds the limit.
                if (char1Costs[i + lenDiff] > maxDistance) return -1;
            }
        }
        else {
            // Identical to the branch above except characters are read at the
            // shared-prefix offset `start`.
            for (i = 0; i < len1; ++i) {
                xchar char1 = string1[start + i];
                int prevChar1Cost, aboveCharCost;
                prevChar1Cost = aboveCharCost = i;
                // no need to look beyond window of lower right diagonal - maxDistance cells (lower right diag is i - lenDiff)
                // and the upper left diagonal + maxDistance cells (upper left is i)
                jStart += (i > jStartOffset) ? 1 : 0;
                jEnd += (jEnd < len2) ? 1 : 0;
                for (j = jStart; j < jEnd; ++j) {
                    currentCost = prevChar1Cost; // cost on diagonal (substitution)
                    prevChar1Cost = char1Costs[j];
                    if (string2[start + j] != char1) {
                        // substitution if neither of two conditions below
                        if (aboveCharCost < currentCost) currentCost = aboveCharCost; // deletion
                        if (prevChar1Cost < currentCost) currentCost = prevChar1Cost; // insertion
                        ++currentCost;
                    }
                    char1Costs[j] = aboveCharCost = currentCost;
                }
                if (char1Costs[i + lenDiff] > maxDistance) return -1;
            }
        }
        return (currentCost <= maxDistance) ? currentCost : -1;
    }
};
760 |
761 |
/// Supported edit distance algorithms.
/// NOTE: enumerator order/values are relied on by switch dispatch in
/// EditDistance below; do not reorder.
enum DistanceAlgorithm {
    /// Levenshtein algorithm (insert, delete, substitute).
    LevenshteinDistance,
    /// Damerau optimal string alignment algorithm (also counts transpositions).
    DamerauOSADistance
};
769 |
770 | /// Wrapper for third party edit distance algorithms.
771 | class EditDistance {
772 | private:
773 | DistanceAlgorithm algorithm;
774 | IDistance* distanceComparer;
775 | DamerauOSA damerauOSADistance;
776 | Levenshtein levenshteinDistance;
777 |
778 | public:
779 | /// Create a new EditDistance object.
780 | /// The desired edit distance algorithm.
781 | EditDistance(DistanceAlgorithm algorithm) {
782 | this->algorithm = algorithm;
783 | switch (algorithm) {
784 | case DistanceAlgorithm::DamerauOSADistance:
785 | this->distanceComparer = &damerauOSADistance;
786 | break;
787 | case DistanceAlgorithm::LevenshteinDistance:
788 | this->distanceComparer = &levenshteinDistance;
789 | break;
790 | default:
791 | throw "Unknown distance algorithm.";
792 | }
793 | }
794 |
795 | /// Compare a string to the base string to determine the edit distance,
796 | /// using the previously selected algorithm.
797 | /// The string to compare.
798 | /// The maximum distance allowed.
799 | /// The edit distance (or -1 if maxDistance exceeded).
800 | int Compare(xstring string1, xstring string2, int maxDistance) {
801 | return (int)this->distanceComparer->Distance(string1, string2, maxDistance);
802 | }
803 | };
804 |
// One element of a staged suggestion list. Nodes live inside a ChunkArray and
// form a singly linked list via indices: `next` is the index of the following
// node, or -1 to terminate the list (see SuggestionStage::Add/CommitTo).
class Node
{
public:
    xstring suggestion;
    int next;
};
811 |
// Head record for one staged delete-hash: `count` is the number of staged
// suggestions and `first` is the index of the head Node in the staging
// ChunkArray (-1 when the list is empty).
class Entry
{
public:
    int count;
    int first;
};
818 |
819 | /// An intentionally opacque class used to temporarily stage
820 | /// dictionary data during the adding of many words. By staging the
821 | /// data during the building of the dictionary data, significant savings
822 | /// of time can be achieved, as well as a reduction in final memory usage.
823 | class SuggestionStage
824 | {
825 | private:
826 | Dictionary Deletes;
827 | ChunkArray Nodes;
828 |
829 | public:
830 | /// Create a new instance of SuggestionStage.
831 | /// Specifying ann accurate initialCapacity is not essential,
832 | /// but it can help speed up processing by alleviating the need for
833 | /// data restructuring as the size grows.
834 | /// The expected number of words that will be added.
835 | SuggestionStage(int initialCapacity)
836 | {
837 | Deletes.reserve(initialCapacity);
838 | Nodes.Reserve(initialCapacity * 2);
839 | }
840 |
841 | /// Gets the count of unique delete words.
842 | int DeleteCount() { return Deletes.size(); }
843 |
844 | /// Gets the total count of all suggestions for all deletes.
845 | int NodeCount() { return Nodes.Count; }
846 |
847 | /// Clears all the data from the SuggestionStaging.
848 | void Clear()
849 | {
850 | Deletes.clear();
851 | Nodes.Clear();
852 | }
853 |
854 | void Add(int deleteHash, xstring suggestion)
855 | {
856 | auto deletesFinded = Deletes.find(deleteHash);
857 | Entry newEntry;
858 | newEntry.count = 0;
859 | newEntry.first = -1;
860 | Entry entry = (deletesFinded == Deletes.end()) ? newEntry : deletesFinded->second;
861 | int next = entry.first;
862 | entry.count++;
863 | entry.first = Nodes.Count;
864 | Deletes[deleteHash] = entry;
865 | Node item;
866 | item.suggestion = suggestion;
867 | item.next = next; // 1st semantic errors, this should not be Nodes.Count
868 | Nodes.Add(item);
869 | }
870 |
871 | void CommitTo(Dictionary>* permanentDeletes)
872 | {
873 | auto permanentDeletesEnd = permanentDeletes->end();
874 | for (auto it = Deletes.begin(); it != Deletes.end(); ++it)
875 | {
876 | auto permanentDeletesFinded = permanentDeletes->find(it->first);
877 | vector suggestions;
878 | int i;
879 | if (permanentDeletesFinded != permanentDeletesEnd)
880 | {
881 | suggestions = permanentDeletesFinded->second;
882 | i = suggestions.size();
883 |
884 | vector newSuggestions;
885 | newSuggestions.reserve(suggestions.size() + it->second.count);
886 | std::copy(suggestions.begin(), suggestions.end(), back_inserter(newSuggestions));
887 | //permanentDeletes[it->first] = newSuggestions;
888 | }
889 | else
890 | {
891 | i = 0;
892 | int32_t count = it->second.count;
893 | suggestions.reserve(count);
894 | //permanentDeletes[it->first] = suggestions;
895 | }
896 |
897 | int next = it->second.first;
898 | while (next >= 0)
899 | {
900 | auto node = Nodes.At(next);
901 | suggestions.push_back(node.suggestion);
902 | next = node.next;
903 | ++i;
904 | }
905 | (*permanentDeletes)[it->first] = suggestions;
906 | }
907 | }
908 | };
909 |
910 |
911 | /// Spelling suggestion returned from Lookup.
912 | class SuggestItem
913 | {
914 | public:
915 | /// The suggested correctly spelled word.
916 | xstring term = XL("");
917 | /// Edit distance between searched for word and suggestion.
918 | int distance = 0;
919 | /// Frequency of suggestion in the dictionary (a measure of how common the word is).
920 | int64_t count = 0;
921 |
922 | /// Create a new instance of SuggestItem.
923 | /// The suggested word.
924 | /// Edit distance from search word.
925 | /// Frequency of suggestion in dictionary.
926 | SuggestItem()
927 | {
928 | }
929 |
930 |
931 | SuggestItem(xstring term, int distance, int64_t count)
932 | {
933 | this->term = term;
934 | this->distance = distance;
935 | this->count = count;
936 | }
937 |
938 | int CompareTo(SuggestItem other)
939 | {
940 | // order by distance ascending, then by frequency count descending, and then by alphabet descending
941 | int disCom = Helpers::CompareTo(this->distance, other.distance);
942 | if (disCom != 0)
943 | return disCom;
944 | int cntCom = Helpers::CompareTo(other.count, this->count);
945 | if (cntCom != 0)
946 | return cntCom;
947 | return this->term.compare(other.term);
948 | }
949 |
950 | bool Equals(const SuggestItem obj)
951 | {
952 | return this->term == obj.term && this->distance == obj.distance && this->count == obj.count;
953 | }
954 |
955 | int GetHashCode()
956 | {
957 | return hash{}(this->term);
958 | }
959 |
960 | xstring Tostring()
961 | {
962 | return XL("{") + term + XL(", ") + to_xstring(distance) + XL(", ") + to_xstring(count) + XL("}");
963 | }
964 | static bool compare(SuggestItem s1, SuggestItem s2)
965 | {
966 | return s1.CompareTo(s2) < 0 ? 1 : 0;
967 | }
968 |
969 | void set(SuggestItem exam)
970 | {
971 | this->term = exam.term;
972 | this->distance = exam.distance;
973 | this->count = exam.count;
974 | }
975 | };
976 |
--------------------------------------------------------------------------------
/include/SymSpell.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | //#define UNICODE_SUPPORT
13 | #include "Helpers.h"
14 |
15 |
16 | // SymSpell: 1 million times faster through Symmetric Delete spelling correction algorithm
17 | //
18 | // The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup
19 | // for a given Damerau-Levenshtein distance. It is six orders of magnitude faster and language independent.
20 | // Opposite to other algorithms only deletes are required, no transposes + replaces + inserts.
21 | // Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term.
22 | // Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters!
23 | //
24 | // SymSpell supports compound splitting / decompounding of multi-word input strings with three cases:
25 | // 1. mistakenly inserted space into a correct word led to two incorrect terms
26 | // 2. mistakenly omitted space between two correct words led to one incorrect combined term
27 | // 3. multiple independent input terms with/without spelling errors
28 |
29 | // Copyright (C) 2019 Wolf Garbe
30 | // Version: 6.5
31 | // Author: Wolf Garbe wolf.garbe@faroo.com
32 | // Maintainer: Wolf Garbe wolf.garbe@faroo.com
33 | // URL: https://github.com/wolfgarbe/symspell
34 | // Description: https://medium.com/@wolfgarbe/1000x-faster-spelling-correction-algorithm-2012-8701fcd87a5f
35 | //
36 | // MIT License
37 | // Copyright (c) 2019 Wolf Garbe
38 | // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
39 | // documentation files (the "Software"), to deal in the Software without restriction, including without limitation
40 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
41 | // and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
42 | //
43 | // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
44 | // https://opensource.org/licenses/MIT
45 |
// Defaults consumed by the SymSpell constructor and dictionary loaders below.
#define DEFAULT_SEPARATOR_CHAR XL(' ')
#define DEFAULT_MAX_EDIT_DISTANCE 2
#define DEFAULT_PREFIX_LENGTH 7
#define DEFAULT_COUNT_THRESHOLD 1
#define DEFAULT_INITIAL_CAPACITY 82765
#define DEFAULT_COMPACT_LEVEL 5
// Minimum of three values.
#define min3(a, b, c) (min(a, min(b, c)))
#define MAXINT LLONG_MAX
// NOTE(review): 'M' expands to nothing and is not referenced in this header --
// looks like a stray/leftover definition; confirm before removing.
#define M
#define MAXLONG MAXINT
// NOTE(review): object-like macro aliasing a type name; a typedef/using would
// be safer, but it is left as-is so existing uses keep compiling.
#define uint unsigned int
57 | static inline void ltrim(xstring& s)
58 | {
59 |
60 | s.erase(s.begin(),find_if(s.begin(), s.end(), [](xchar ch){
61 | return !isxspace(ch);
62 | }));
63 | }
64 |
65 | static inline void rtrim(xstring& s)
66 | {
67 | s.erase(find_if(s.rbegin(), s.rend(), [](xchar ch){
68 | return !isxspace(ch);
69 | }).base(), s.end());
70 | }
71 |
// Strip whitespace from both ends of s in place.
static inline void trim(xstring& s)
{
    ltrim(s);
    rtrim(s);
}
77 |
78 | class Info
79 | {
80 | private:
81 |
82 | xstring segmentedstring;
83 | xstring correctedstring;
84 | int distanceSum;
85 | double probabilityLogSum;
86 | public:
87 | void set(xstring& seg, xstring& cor, int d, double prob)
88 | {
89 | segmentedstring = seg;
90 | correctedstring = cor;
91 | distanceSum = d;
92 | probabilityLogSum = prob;
93 | };
94 | xstring getSegmented()
95 | {
96 | return segmentedstring;
97 | };
98 | xstring getCorrected()
99 | {
100 | return correctedstring;
101 | };
102 | int getDistance()
103 | {
104 | return distanceSum;
105 | };
106 | double getProbability()
107 | {
108 | return probabilityLogSum;
109 | };
110 | };
111 |
/// Controls the closeness/quantity of returned spelling suggestions,
/// escalating from fewest results (Top) to most (All).
enum Verbosity
{
    /// Top suggestion with the highest term frequency of the suggestions of smallest edit distance found.
    Top,
    /// All suggestions of smallest edit distance found, suggestions ordered by term frequency.
    Closest,
    /// All suggestions within maxEditDistance, suggestions ordered by edit distance
    /// , then by term frequency (slower, no early termination).
    All
};
123 |
124 | class SymSpell
125 | {
126 | protected:
127 | int initialCapacity;
128 | int maxDictionaryEditDistance;
129 | int prefixLength; //prefix length 5..7
130 | long countThreshold; //a threshold might be specified, when a term occurs so frequently in the corpus that it is considered a valid word for spelling correction
131 | int compactMask;
132 | DistanceAlgorithm distanceAlgorithm = DistanceAlgorithm::DamerauOSADistance;
133 | int maxDictionaryWordLength; //maximum dictionary term length
134 | // Dictionary that contains a mapping of lists of suggested correction words to the hashCodes
135 | // of the original words and the deletes derived from them. Collisions of hashCodes is tolerated,
136 | // because suggestions are ultimately verified via an edit distance function.
137 | // A list of suggestions might have a single suggestion, or multiple suggestions.
138 | Dictionary> *deletes = NULL;
139 | // Dictionary of unique correct spelling words, and the frequency count for each word.
140 | Dictionary words;
141 | // Dictionary of unique words that are below the count threshold for being considered correct spellings.
142 | Dictionary belowThresholdWords;
143 |
144 | public:
145 | /// Maximum edit distance for dictionary precalculation.
146 | int MaxDictionaryEditDistance();
147 |
148 | /// Length of prefix, from which deletes are generated.
149 | int PrefixLength();
150 |
151 | /// Length of longest word in the dictionary.
152 | int MaxLength();
153 |
154 | /// Count threshold for a word to be considered a valid word for spelling correction.
155 | long CountThreshold();
156 |
157 | /// Number of unique words in the dictionary.
158 | int WordCount();
159 |
160 | /// Number of word prefixes and intermediate word deletes encoded in the dictionary.
161 | int EntryCount();
162 |
163 | /// Create a new instanc of SymSpell.
164 | /// Specifying ann accurate initialCapacity is not essential,
165 | /// but it can help speed up processing by alleviating the need for
166 | /// data restructuring as the size grows.
167 | /// The expected number of words in dictionary.
168 | /// Maximum edit distance for doing lookups.
169 | /// The length of word prefixes used for spell checking..
170 | /// The minimum frequency count for dictionary words to be considered correct spellings.
171 | /// Degree of favoring lower memory use over speed (0=fastest,most memory, 16=slowest,least memory).
172 |
173 | SymSpell(int initialCapacity = DEFAULT_INITIAL_CAPACITY, int maxDictionaryEditDistance = DEFAULT_MAX_EDIT_DISTANCE
174 | , int prefixLength = DEFAULT_PREFIX_LENGTH, int countThreshold = DEFAULT_COUNT_THRESHOLD
175 | , unsigned char compactLevel = DEFAULT_COMPACT_LEVEL);
176 |
177 | ~SymSpell();
178 |
179 | /// Create/Update an entry in the dictionary.
180 | /// For every word there are deletes with an edit distance of 1..maxEditDistance created and added to the
181 | /// dictionary. Every delete entry has a suggestions list, which points to the original term(s) it was created from.
182 | /// The dictionary may be dynamically updated (word frequency and new words) at any time by calling CreateDictionaryEntry
183 | /// The word to add to dictionary.
184 | /// The frequency count for word.
185 | /// Optional staging object to speed up adding many entries by staging them to a temporary structure.
186 | /// True if the word was added as a new correctly spelled word,
187 | /// or false if the word is added as a below threshold word, or updates an
188 | /// existing correctly spelled word.
189 | bool CreateDictionaryEntry(xstring key, int64_t count, SuggestionStage* staging);
190 |
191 | Dictionary bigrams;
192 | int64_t bigramCountMin = MAXLONG;
193 |
194 | /// Load multiple dictionary entries from a file of word/frequency count pairs
195 | /// Merges with any dictionary data already loaded.
196 | /// The path+filename of the file.
197 | /// The column position of the word.
198 | /// The column position of the frequency count.
199 | /// Separator characters between term(s) and count.
200 | /// True if file loaded, or false if file not found.
201 | bool LoadBigramDictionary(string corpus, int termIndex, int countIndex, xchar separatorChars = DEFAULT_SEPARATOR_CHAR);
202 |
203 | /// Load multiple dictionary entries from a file of word/frequency count pairs
204 | /// Merges with any dictionary data already loaded.
205 | /// The path+filename of the file.
206 | /// The column position of the word.
207 | /// The column position of the frequency count.
208 | /// Separator characters between term(s) and count.
209 | /// True if file loaded, or false if file not found.
210 | bool LoadBigramDictionary(xifstream& corpusStream, int termIndex, int countIndex, xchar separatorChars = DEFAULT_SEPARATOR_CHAR);
211 |
212 | /// Load multiple dictionary entries from a file of word/frequency count pairs
213 | /// Merges with any dictionary data already loaded.
214 | /// The path+filename of the file.
215 | /// The column position of the word.
216 | /// The column position of the frequency count.
217 | /// Separator characters between term(s) and count.
218 | /// True if file loaded, or false if file not found.
219 | bool LoadDictionary(string corpus, int termIndex, int countIndex, xchar separatorChars = DEFAULT_SEPARATOR_CHAR);
220 |
221 | /// Load multiple dictionary entries from a stream of word/frequency count pairs
222 | /// Merges with any dictionary data already loaded.
223 | /// The stream containing the word/frequency count pairs.
224 | /// The column position of the word.
225 | /// The column position of the frequency count.
226 | /// Separator characters between term(s) and count.
227 | /// True if stream loads.
228 | bool LoadDictionary(xifstream& corpusStream, int termIndex, int countIndex, xchar separatorChars = DEFAULT_SEPARATOR_CHAR);
229 |
230 | /// Load multiple dictionary words from a file containing plain text.
231 | /// Merges with any dictionary data already loaded.
232 | /// The path+filename of the file.
233 | /// True if file loaded, or false if file not found.
234 | bool CreateDictionary(string corpus);
235 |
236 | /// Load multiple dictionary words from a stream containing plain text.
237 | /// Merges with any dictionary data already loaded.
238 | /// The stream containing the plain text.
239 | /// True if stream loads.
240 | bool CreateDictionary(xifstream& corpusStream);
241 |
242 | /// Remove all below threshold words from the dictionary.
243 | /// This can be used to reduce memory consumption after populating the dictionary from
244 | /// a corpus using CreateDictionary.
245 | void PurgeBelowThresholdWords();
246 |
247 | /// Commit staged dictionary additions.
248 | /// Used when you write your own process to load multiple words into the
249 | /// dictionary, and as part of that process, you first created a SuggestionsStage
250 | /// object, and passed that to CreateDictionaryEntry calls.
251 | /// The SuggestionStage object storing the staged data.
252 | void CommitStaged(SuggestionStage* staging);
253 |
254 | /// Find suggested spellings for a given input word, using the maximum
255 | /// edit distance specified during construction of the SymSpell dictionary.
256 | /// The word being spell checked.
257 | /// The value controlling the quantity/closeness of the retuned suggestions.
258 | /// A List of SuggestItem object representing suggested correct spellings for the input word,
259 | /// sorted by edit distance, and secondarily by count frequency.
260 | vector Lookup(xstring input, Verbosity verbosity);
261 |
262 | /// Find suggested spellings for a given input word, using the maximum
263 | /// edit distance specified during construction of the SymSpell dictionary.
264 | /// The word being spell checked.
265 | /// The value controlling the quantity/closeness of the retuned suggestions.
266 | /// The maximum edit distance between input and suggested words.
267 | /// A List of SuggestItem object representing suggested correct spellings for the input word,
268 | /// sorted by edit distance, and secondarily by count frequency.
269 | vector Lookup(xstring input, Verbosity verbosity, int maxEditDistance);
270 |
271 | /// Find suggested spellings for a given input word.
272 | /// The word being spell checked.
273 | /// The value controlling the quantity/closeness of the retuned suggestions.
274 | /// The maximum edit distance between input and suggested words.
275 | /// Include input word in suggestions, if no words within edit distance found.
276 | /// A List of SuggestItem object representing suggested correct spellings for the input word,
277 | /// sorted by edit distance, and secondarily by count frequency.
278 | vector Lookup(xstring input, Verbosity verbosity, int maxEditDistance, bool includeUnknown);
279 |
280 | private:
281 | //check whether all delete chars are present in the suggestion prefix in correct order, otherwise this is just a hash collision
282 | bool DeleteInSuggestionPrefix(xstring deleteSugg, int deleteLen, xstring suggestion, int suggestionLen);
283 |
284 | //create a non-unique wordlist from sample text
285 | //language independent (e.g. works with Chinese characters)
286 | vector ParseWords(xstring text);
287 |
288 | //inexpensive and language independent: only deletes, no transposes + replaces + inserts
289 | //replaces and inserts are expensive and language dependent (Chinese has 70,000 Unicode Han characters)
290 | HashSet* Edits(xstring word, int editDistance, HashSet* deleteWords);
291 |
292 | HashSet EditsPrefix(xstring key);
293 |
294 | int GetstringHash(xstring s);
295 |
296 | public:
297 | //######################
298 |
299 | //LookupCompound supports compound aware automatic spelling correction of multi-word input strings with three cases:
300 | //1. mistakenly inserted space into a correct word led to two incorrect terms
301 | //2. mistakenly omitted space between two correct words led to one incorrect combined term
302 | //3. multiple independent input terms with/without spelling errors
303 |
304 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging).
305 | /// The string being spell checked.
306 | /// A List of SuggestItem object representing suggested correct spellings for the input string.
307 | vector LookupCompound(xstring input);
308 |
309 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging).
310 | /// The string being spell checked.
311 | /// The maximum edit distance between input and suggested words.
312 | /// A List of SuggestItem object representing suggested correct spellings for the input string.
313 | vector LookupCompound(xstring input, int editDistanceMax);
314 |
315 | //######
316 |
317 | //WordSegmentation divides a string into words by inserting missing spaces at the appropriate positions
318 | //misspelled words are corrected and do not affect segmentation
319 | //existing spaces are allowed and considered for optimum segmentation
320 |
321 | //SymSpell.WordSegmentation uses a novel approach *without* recursion.
322 | //https://medium.com/@wolfgarbe/fast-word-segmentation-for-noisy-text-2c2c41f9e8da
323 | //While each string of length n can be segmentend in 2^n−1 possible compositions https://en.wikipedia.org/wiki/Composition_(combinatorics)
324 | //SymSpell.WordSegmentation has a linear runtime O(n) to find the optimum composition
325 |
326 | //number of all words in the corpus used to generate the frequency dictionary
327 | //this is used to calculate the word occurrence probability p from word counts c : p=c/N
328 | //N equals the sum of all counts c in the dictionary only if the dictionary is complete, but not if the dictionary is truncated or filtered
329 | static const int64_t N = 1024908267229L;
330 |
331 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging).
332 | /// The string being spell checked.
333 | /// The word segmented string,
334 | /// the word segmented and spelling corrected string,
335 | /// the Edit distance sum between input string and corrected string,
336 | /// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is).
337 | Info WordSegmentation(xstring input);
338 |
339 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging).
340 | /// The string being spell checked.
341 | /// The maximum edit distance between input and corrected words
342 | /// (0=no correction/segmentation only).
343 | /// The word segmented string,
344 | /// the word segmented and spelling corrected string,
345 | /// the Edit distance sum between input string and corrected string,
346 | /// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is).
347 | Info WordSegmentation(xstring input, int maxEditDistance);
348 |
349 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging).
350 | /// The string being spell checked.
351 | /// The maximum word length that should be considered.
352 | /// The maximum edit distance between input and corrected words
353 | /// (0=no correction/segmentation only).
354 | /// The word segmented string,
355 | /// the word segmented and spelling corrected string,
356 | /// the Edit distance sum between input string and corrected string,
357 | /// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is).
358 | Info WordSegmentation(xstring input, int maxEditDistance, int maxSegmentationWordLength);
359 | };
360 |
361 |
--------------------------------------------------------------------------------
/src/SymSpell.cpp:
--------------------------------------------------------------------------------
1 | #include "SymSpell.h"
2 | #include
3 | /// Maximum edit distance for dictionary precalculation.
4 | int SymSpell::MaxDictionaryEditDistance()
5 | {
6 | return this->maxDictionaryEditDistance;
7 | }
8 |
9 | /// Length of prefix, from which deletes are generated.
10 | int SymSpell::PrefixLength()
11 | {
12 | return this->prefixLength;
13 | }
14 |
15 | /// Length of longest word in the dictionary.
16 | int SymSpell::MaxLength()
17 | {
18 | return this->maxDictionaryWordLength;
19 | }
20 |
21 | /// Count threshold for a word to be considered a valid word for spelling correction.
22 | long SymSpell::CountThreshold()
23 | {
24 | return this->countThreshold;
25 | }
26 |
27 | /// Number of unique words in the dictionary.
28 | int SymSpell::WordCount()
29 | {
30 | return this->words.size();
31 | }
32 |
33 | /// Number of word prefixes and intermediate word deletes encoded in the dictionary.
34 | int SymSpell::EntryCount()
35 | {
36 | return this->deletes->size();
37 | }
38 |
39 | /// Create a new instanc of SymSpell.
40 | /// Specifying ann accurate initialCapacity is not essential,
41 | /// but it can help speed up processing by alleviating the need for
42 | /// data restructuring as the size grows.
43 | /// The expected number of words in dictionary.
44 | /// Maximum edit distance for doing lookups.
45 | /// The length of word prefixes used for spell checking..
46 | /// The minimum frequency count for dictionary words to be considered correct spellings.
47 | /// Degree of favoring lower memory use over speed (0=fastest,most memory, 16=slowest,least memory).
48 | SymSpell::SymSpell(int initialCapacity, int maxDictionaryEditDistance
49 | , int prefixLength, int countThreshold, unsigned char compactLevel)
50 | {
51 | if (initialCapacity < 0) throw std::invalid_argument("initialCapacity");
52 | if (maxDictionaryEditDistance < 0) throw std::invalid_argument("maxDictionaryEditDistance");
53 | if (prefixLength < 1 || prefixLength <= maxDictionaryEditDistance) throw std::invalid_argument("prefixLength");
54 | if (countThreshold < 0) throw std::invalid_argument("countThreshold");
55 | if (compactLevel > 16) throw std::invalid_argument("compactLevel");
56 |
57 | this->initialCapacity = initialCapacity;
58 | this->words.reserve(initialCapacity);
59 | this->maxDictionaryEditDistance = maxDictionaryEditDistance;
60 | this->prefixLength = prefixLength;
61 | this->countThreshold = countThreshold;
62 | if (compactLevel > 16) compactLevel = 16;
63 | this->compactMask = (UINT_MAX >> (3 + compactLevel)) << 2;
64 | this->maxDictionaryWordLength = 0;
65 | this->words = Dictionary(initialCapacity);
66 | }
67 |
68 | SymSpell::~SymSpell() {
69 | if (this->deletes != nullptr) {
70 | delete this->deletes;
71 | }
72 | }
73 |
/// Create/Update an entry in the dictionary.
/// For every word there are deletes with an edit distance of 1..maxEditDistance created and added to the
/// dictionary. Every delete entry has a suggestions list, which points to the original term(s) it was created from.
/// The dictionary may be dynamically updated (word frequency and new words) at any time by calling CreateDictionaryEntry
/// key: the word to add to the dictionary.
/// count: the frequency count for the word.
/// staging: optional staging object to speed up adding many entries by staging them to a temporary structure.
/// Returns true if the word was added as a new correctly spelled word,
/// or false if the word is added as a below threshold word, or updates an
/// existing correctly spelled word.
/// NOTE(review): several lines in this function appear garbled by a '<'-stripping
/// extraction artifact (missing template arguments, fused loop headers). The code is
/// kept verbatim below; compare against upstream SymSpellCpp before building.
bool SymSpell::CreateDictionaryEntry(xstring key, int64_t count, SuggestionStage* staging)
{

    if (count <= 0)
    {
        if (this->countThreshold > 0) return false; // no point doing anything if count is zero, as it can't change anything
        count = 0;
    }
    int countPrevious = -1;
    // look first in below threshold words, update count, and allow promotion to correct spelling word if count reaches threshold
    // threshold must be >1 for there to be the possibility of low threshold words
    auto belowThresholdWordsFinded = belowThresholdWords.find(key);
    auto wordsFinded = words.find(key);
    if (countThreshold > 1 && belowThresholdWordsFinded != belowThresholdWords.end())
    {
        countPrevious = belowThresholdWordsFinded->second;

        // calculate new count for below threshold word (saturating add up to MAXINT)
        count = (MAXINT - countPrevious > count) ? countPrevious + count : MAXINT;
        // has reached threshold - remove from below threshold collection (it will be added to correct words below)
        if (count >= countThreshold)
        {
            belowThresholdWords.erase(key);
        }
        else
        {
            // NOTE(review): insert() does not overwrite an existing key, so this may
            // leave the old count in place — presumably operator[] (see the commented
            // line) was intended; confirm against upstream.
            //belowThresholdWords[key] = count;
            belowThresholdWords.insert(pair(key, count));
            return false;
        }
    }
    else if (wordsFinded != words.end())
    {
        countPrevious = wordsFinded->second;
        // just update count if it's an already added above threshold word (saturating add)
        count = (MAXINT - countPrevious > count) ? countPrevious + count : MAXINT;
        //words[key] = count;
        words.insert(pair(key, count));
        return false;
    }
    else if (count < CountThreshold())
    {
        // new or existing below threshold word
        //belowThresholdWords[key] = count;
        belowThresholdWords.insert(pair(key, count));
        return false;
    }

    // what we have at this point is a new, above threshold word
    //words[key] = count;
    words.insert(pair(key, count));

    //edits/suggestions are created only once, no matter how often word occurs
    //edits/suggestions are created only as soon as the word occurs in the corpus,
    //even if the same term existed before in the dictionary as an edit from another word
    if (key.size() > this->maxDictionaryWordLength) this->maxDictionaryWordLength = key.size();

    //create deletes (all prefix deletes within maxDictionaryEditDistance)
    HashSet edits = EditsPrefix(key);
    // if not staging suggestions, put directly into main data structure
    // NOTE(review): the comment above contradicts the condition — staged entries are
    // added to 'staging' here, and the direct-insert path is the else branch below.
    if (staging != NULL)
    {
        // NOTE(review): the next line fuses a commented-out logging block with the
        // loop header and the staging->Add(...) call — garbled by extraction.
        /*ofstream outfile;
        outfile.open("log.txt", ios::app);
        outfile << words.size() << " " << key <Add(GetstringHash(*it), key);
        //outfile << "-" << *it;
    }
    /*outfile << endl;
    outfile.close();*/
    }
    else
    {

        for (auto it = edits.begin(); it != edits.end(); ++it)
        {
            int deleteHash = GetstringHash(*it);
            auto deletesFinded = deletes->find(deleteHash);
            std::vector suggestions;
            if(deletesFinded != deletes->end())
            //if (deletes.TryGetValue(deleteHash, out string[] suggestions))
            {
                // hash bin already exists: grow its suggestion list by one slot
                suggestions = deletesFinded->second;
                std::vector newSuggestions(suggestions.size() + 1);
                // NOTE(review): the next line fuses the copy loop, the reassignment,
                // and the else-branch vector construction — garbled by extraction.
                for(int id=0; id(1);
                //(*deletes)[deleteHash] = suggestions;
                (*deletes).insert(pair>(deleteHash, suggestions));
            }
            // place the new source word in the last (newly added) slot
            suggestions[suggestions.size() - 1] = key;
        }

    }

    return true;
}
189 |
/// Load multiple bigram dictionary entries from a file of word/frequency count pairs.
/// Merges with any dictionary data already loaded.
/// corpus: the path+filename of the file.
/// termIndex: the column position of the word.
/// countIndex: the column position of the frequency count.
/// separatorChars: separator characters between term(s) and count.
/// Returns true if file loaded, or false if file not found.
bool SymSpell::LoadBigramDictionary(string corpus, int termIndex, int countIndex, xchar separatorChars)
{
    xifstream corpusStream;
    corpusStream.open(corpus);
#ifdef UNICODE_SUPPORT
    // imbue a UTF-8 conversion facet so wide-character streams decode the file correctly
    // NOTE(review): 'codecvt_utf8' is missing its template argument here — looks like
    // an extraction artifact; confirm the intended element type against the header.
    locale utf8(locale(), new codecvt_utf8);
    corpusStream.imbue(utf8);
#endif
    if (!corpusStream.is_open())
        return false;

    // delegate the actual parsing to the stream-based overload
    return LoadBigramDictionary(corpusStream, termIndex, countIndex, separatorChars);
}
210 |
211 | /// Load multiple dictionary entries from a file of word/frequency count pairs
212 | /// Merges with any dictionary data already loaded.
213 | /// The path+filename of the file.
214 | /// The column position of the word.
215 | /// The column position of the frequency count.
216 | /// Separator characters between term(s) and count.
217 | /// True if file loaded, or false if file not found.
218 | bool SymSpell::LoadBigramDictionary(xifstream& corpusStream, int termIndex, int countIndex, xchar separatorChars)
219 | {
220 | xstring line;
221 | int linePartsLength = (separatorChars == DEFAULT_SEPARATOR_CHAR) ? 3 : 2;
222 | //process a single line at a time only for memory efficiency
223 | while (getline(corpusStream, line))
224 | {
225 | vector lineParts;
226 | xstringstream ss(line);
227 | xstring token;
228 | while (getline(ss, token, separatorChars))
229 | lineParts.push_back(token);
230 | xstring key;
231 | int64_t count;
232 | if (lineParts.size() >= linePartsLength)
233 | {
234 | //if default (whitespace) is defined as separator take 2 term parts, otherwise take only one
235 | key = (separatorChars == DEFAULT_SEPARATOR_CHAR) ? lineParts[termIndex] + XL(" ") + lineParts[termIndex + 1]: lineParts[termIndex];
236 | try{
237 | count = stoll(lineParts[countIndex]);
238 |
239 | }catch(...) {
240 | //maybe log something here
241 | printf("Cannot convert %s to integer\n", lineParts[countIndex].c_str());
242 | }
243 | }
244 | else
245 | {
246 | //if default (whitespace) is defined as separator take 2 term parts, otherwise take only one
247 | key = line;
248 | count = 1;
249 | }
250 | pair element(key, count);
251 | bigrams.insert(element);
252 | if (count < bigramCountMin) bigramCountMin = count;
253 | }
254 |
255 | if (bigrams.empty())
256 | return false;
257 | return true;
258 | }
259 |
/// Load multiple dictionary entries from a file of word/frequency count pairs.
/// Merges with any dictionary data already loaded.
/// corpus: the path+filename of the file.
/// termIndex: the column position of the word.
/// countIndex: the column position of the frequency count.
/// separatorChars: separator characters between term(s) and count.
/// Returns true if file loaded, or false if file not found.
bool SymSpell::LoadDictionary(string corpus, int termIndex, int countIndex, xchar separatorChars)
{

    xifstream corpusStream(corpus);
    /*corpusStream.open(corpus);*/
#ifdef UNICODE_SUPPORT
    // imbue a UTF-8 conversion facet so wide-character streams decode the file correctly
    // NOTE(review): 'codecvt_utf8' is missing its template argument here — looks like
    // an extraction artifact; confirm the intended element type against the header.
    locale utf8(locale(), new codecvt_utf8);
    corpusStream.imbue(utf8);
#endif
    if (!corpusStream.is_open())
        return false;

    // delegate the actual parsing to the stream-based overload
    return LoadDictionary(corpusStream, termIndex, countIndex, separatorChars);
}
281 |
282 | /// Load multiple dictionary entries from a stream of word/frequency count pairs
283 | /// Merges with any dictionary data already loaded.
284 | /// The stream containing the word/frequency count pairs.
285 | /// The column position of the word.
286 | /// The column position of the frequency count.
287 | /// Separator characters between term(s) and count.
288 | /// True if stream loads.
289 | bool SymSpell::LoadDictionary(xifstream& corpusStream, int termIndex, int countIndex, xchar separatorChars)
290 | {
291 | SuggestionStage staging(16384);
292 | xstring line;
293 | int i = 0;
294 | int start, end;
295 | start = clock();
296 | //process a single line at a time only for memory efficiency
297 | while (getline(corpusStream, line))
298 | {
299 | //end = clock();
300 | //cout << float((end - start) * 1000 / CLOCKS_PER_SEC) << " ";
301 | i++;
302 | vector lineParts;
303 | xstringstream ss(line);
304 | xstring token;
305 | //start = clock();
306 | while (getline(ss, token, separatorChars))
307 | lineParts.push_back(token);
308 | if (lineParts.size() >= 2)
309 | {
310 | int64_t count = stoll(lineParts[countIndex]);
311 |
312 | CreateDictionaryEntry(lineParts[termIndex], count, &staging);
313 | }
314 | else
315 | {
316 | CreateDictionaryEntry(line, 1, &staging);
317 | }
318 |
319 | }
320 | if (this->deletes == NULL)
321 | this->deletes = new Dictionary>(staging.DeleteCount());
322 | CommitStaged(&staging);
323 | if (this->EntryCount() == 0)
324 | return false;
325 | return true;
326 | }
327 |
/// Load multiple dictionary words from a file containing plain text.
/// Merges with any dictionary data already loaded.
/// corpus: the path+filename of the file.
/// Returns true if file loaded, or false if file not found.
bool SymSpell::CreateDictionary(string corpus)
{
    xifstream corpusStream;
    corpusStream.open(corpus);
#ifdef UNICODE_SUPPORT
    // imbue a UTF-8 conversion facet so wide-character streams decode the file correctly
    // NOTE(review): 'codecvt_utf8' is missing its template argument here — looks like
    // an extraction artifact; confirm the intended element type against the header.
    locale utf8(locale(), new codecvt_utf8);
    corpusStream.imbue(utf8);
#endif
    if (!corpusStream.is_open()) return false;

    // delegate the actual parsing to the stream-based overload
    return CreateDictionary(corpusStream);
}
344 |
/// Load multiple dictionary words from a stream containing plain text.
/// Merges with any dictionary data already loaded.
/// corpusStream: the stream containing the plain text.
/// Returns true if at least one entry was committed, false otherwise.
bool SymSpell::CreateDictionary(xifstream& corpusStream)
{
    xstring line;
    // stage entries and commit once at the end for speed
    SuggestionStage staging = SuggestionStage(16384);
    while (getline(corpusStream, line))
    {
        // every token ParseWords extracts counts as one occurrence
        for (xstring key : ParseWords(line))
        {
            CreateDictionaryEntry(key, 1, &staging);
        }

    }
    // lazily allocate the delete map sized to the staged delete count
    // NOTE(review): 'Dictionary>' is missing its template arguments — extraction
    // artifact; confirm the intended type against the header.
    if (this->deletes == NULL) this->deletes = new Dictionary>(staging.DeleteCount());
    CommitStaged(&staging);
    if (this->EntryCount() == 0)
        return false;
    return true;
}
367 |
368 | /// Remove all below threshold words from the dictionary.
369 | /// This can be used to reduce memory consumption after populating the dictionary from
370 | /// a corpus using CreateDictionary.
371 | void SymSpell::PurgeBelowThresholdWords()
372 | {
373 | belowThresholdWords.clear();
374 | }
375 |
376 | /// Commit staged dictionary additions.
377 | /// Used when you write your own process to load multiple words into the
378 | /// dictionary, and as part of that process, you first created a SuggestionsStage
379 | /// object, and passed that to CreateDictionaryEntry calls.
380 | /// The SuggestionStage object storing the staged data.
381 | void SymSpell::CommitStaged(SuggestionStage* staging)
382 | {
383 | staging->CommitTo(deletes);
384 | }
385 |
386 | /// Find suggested spellings for a given input word, using the maximum
387 | /// edit distance specified during construction of the SymSpell dictionary.
388 | /// The word being spell checked.
389 | /// The value controlling the quantity/closeness of the retuned suggestions.
390 | /// A vector of SuggestItem object representing suggested correct spellings for the input word,
391 | /// sorted by edit distance, and secondarily by count frequency.
392 | vector SymSpell::Lookup(xstring input, Verbosity verbosity)
393 | {
394 | return Lookup(input, verbosity, this->maxDictionaryEditDistance, false);
395 | }
396 |
397 | /// Find suggested spellings for a given input word, using the maximum
398 | /// edit distance specified during construction of the SymSpell dictionary.
399 | /// The word being spell checked.
400 | /// The value controlling the quantity/closeness of the retuned suggestions.
401 | /// The maximum edit distance between input and suggested words.
402 | /// A vector of SuggestItem object representing suggested correct spellings for the input word,
403 | /// sorted by edit distance, and secondarily by count frequency.
404 | vector SymSpell::Lookup(xstring input, Verbosity verbosity, int maxEditDistance)
405 | {
406 | return Lookup(input, verbosity, maxEditDistance, false);
407 | }
408 |
/// Find suggested spellings for a given input word.
/// input: the word being spell checked.
/// verbosity: controls the quantity/closeness of the returned suggestions.
/// maxEditDistance: the maximum edit distance between input and suggested words.
/// includeUnknown: include input word in suggestions, if no words within edit distance found.
/// Returns a vector of SuggestItem representing suggested correct spellings for the input word,
/// sorted by edit distance, and secondarily by count frequency.
/// NOTE(review): one region below (original lines 554-559) was fused into a single
/// garbled comment line by a '<'-stripping extraction artifact — the declaration of
/// 'si' and the surrounding 'if' condition are missing. Kept verbatim; restore from
/// upstream SymSpellCpp before building.
vector SymSpell::Lookup(xstring input, Verbosity verbosity, int maxEditDistance, bool includeUnknown)
{
    //verbosity=Top: the suggestion with the highest term frequency of the suggestions of smallest edit distance found
    //verbosity=Closest: all suggestions of smallest edit distance found, the suggestions are ordered by term frequency
    //verbosity=All: all suggestions <= maxEditDistance, the suggestions are ordered by edit distance, then by term frequency (slower, no early termination)

    // maxEditDistance used in Lookup can't be bigger than the maxDictionaryEditDistance
    // used to construct the underlying dictionary structure.
    int skip = 0;
    if (maxEditDistance > this->maxDictionaryEditDistance) throw std::invalid_argument("maxEditDistance");

    vector suggestions;
    int inputLen = input.size();
    // early exit - word is too big to possibly match any words
    if (inputLen - maxEditDistance > this->maxDictionaryWordLength) skip = 1;

    // quick look for exact match
    int64_t suggestionCount = 0;
    if (words.count(input) && !skip)
    {
        suggestionCount = words.at(input);
        suggestions.push_back(SuggestItem(input, 0, suggestionCount));
        // early exit - return exact match, unless caller wants all matches
        if (verbosity != All) skip = 1;
    }

    //early termination, if we only want to check if word in dictionary or get its frequency e.g. for word segmentation
    if (maxEditDistance == 0) skip = 1;

    if (!skip)
    {
        // deletes we've considered already
        HashSet hashset1;
        // suggestions we've considered already
        HashSet hashset2;
        // we considered the input already in the word.TryGetValue above
        hashset2.insert(input);

        // maxEditDistance2 shrinks as better suggestions are found (Top/Closest pruning)
        int maxEditDistance2 = maxEditDistance;
        int candidatePointer = 0;
        // NOTE(review): singleSuggestion appears unused in the visible code — possibly
        // a leftover from the C# original; verify before removing.
        vector singleSuggestion = { XL("") };
        // breadth-first worklist of delete candidates, processed in insertion order
        vector candidates;

        //add original prefix
        int inputPrefixLen = inputLen;
        if (inputPrefixLen > prefixLength)
        {
            inputPrefixLen = prefixLength;
            candidates.push_back(input.substr(0, inputPrefixLen));
        }
        else
        {
            candidates.push_back(input);
        }
        auto distanceComparer = EditDistance(this->distanceAlgorithm);
        while (candidatePointer < candidates.size())
        {
            xstring candidate = candidates[candidatePointer++];
            int candidateLen = candidate.size();
            int lengthDiff = inputPrefixLen - candidateLen;

            //save some time - early termination
            //if candidate distance is already higher than suggestion distance, than there are no better suggestions to be expected
            if (lengthDiff > maxEditDistance2)
            {
                // skip to next candidate if Verbosity.All, look no further if Verbosity.Top or Closest
                // (candidates are ordered by delete distance, so none are closer than current)
                if (verbosity == Verbosity::All) continue;
                break;
            }

            //read candidate entry from dictionary
            if (deletes != nullptr and deletes->count(GetstringHash(candidate)))
            {
                vector dictSuggestions = deletes->at(GetstringHash(candidate));
                //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
                for (int i = 0; i < dictSuggestions.size(); i++)
                {
                    auto suggestion = dictSuggestions[i];
                    int suggestionLen = suggestion.size();
                    if (suggestion == input) continue;
                    if ((abs(suggestionLen - inputLen) > maxEditDistance2) // input and sugg lengths diff > allowed/current best distance
                        || (suggestionLen < candidateLen) // sugg must be for a different delete string, in same bin only because of hash collision
                        || (suggestionLen == candidateLen && suggestion != candidate)) // if sugg len = delete len, then it either equals delete or is in same bin only because of hash collision
                        continue;
                    auto suggPrefixLen = min(suggestionLen, prefixLength);
                    if (suggPrefixLen > inputPrefixLen && (suggPrefixLen - candidateLen) > maxEditDistance2) continue;

                    //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                    //We allow simultaneous edits (deletes) of maxEditDistance on on both the dictionary and the input term.
                    //For replaces and adjacent transposes the resulting edit distance stays <= maxEditDistance.
                    //For inserts and deletes the resulting edit distance might exceed maxEditDistance.
                    //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
                    //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for maxEditDistance=1)
                    //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
                    int distance = 0;
                    int min_len = 0;
                    if (candidateLen == 0)
                    {
                        //suggestions which have no common chars with input (inputLen<=maxEditDistance && suggestionLen<=maxEditDistance)
                        distance = max(inputLen, suggestionLen);
                        auto flag = hashset2.insert(suggestion);
                        if (distance > maxEditDistance2 || !flag.second) continue;
                    }
                    else if (suggestionLen == 1)
                    {
                        // single-char suggestion: distance is inputLen (char absent from
                        // input) or inputLen-1 (char present somewhere in input)
                        if (input.find(suggestion[0]) == input.npos)
                            distance = inputLen;
                        else
                            distance = inputLen - 1;

                        auto flag = hashset2.insert(suggestion);
                        if (distance > maxEditDistance2 || !flag.second) continue;
                    }
                    else
                        //number of edits in prefix ==maxediddistance AND no identic suffix
                        //, then editdistance>maxEditDistance and no need for Levenshtein calculation
                        // (inputLen >= prefixLength) && (suggestionLen >= prefixLength)
                        if ((prefixLength - maxEditDistance == candidateLen)
                            && (((min_len = min(inputLen, suggestionLen) - prefixLength) > 1)
                                && (input.substr(inputLen + 1 - min_len) != suggestion.substr(suggestionLen + 1 - min_len)))
                            || ((min_len > 0) && (input[inputLen - min_len] != suggestion[suggestionLen - min_len])
                                && ((input[inputLen - min_len - 1] != suggestion[suggestionLen - min_len])
                                    || (input[inputLen - min_len] != suggestion[suggestionLen - min_len - 1]))))
                        {
                            continue;
                        }
                        else
                        {
                            // DeleteInSuggestionPrefix is somewhat expensive, and only pays off when verbosity is Top or Closest.
                            if ((verbosity != All && !DeleteInSuggestionPrefix(candidate, candidateLen, suggestion, suggestionLen))
                                || !hashset2.insert(suggestion).second) continue;
                            distance = distanceComparer.Compare(input, suggestion, maxEditDistance2);
                            if (distance < 0) continue;
                        }

                    //save some time
                    // NOTE(review): the next comment line is the garbled fusion of original
                    // lines 554-559 (missing: the SuggestItem 'si' construction and the
                    // 'if (suggestions.size() > 0)' guard) — restore from upstream.
                    //do not process higher distances than those already found, if verbosity 0)
                    {
                        switch (verbosity)
                        {
                        case Closest:
                        {
                            //we will calculate DamLev distance only to the smallest found distance so far
                            if (distance < maxEditDistance2) suggestions.clear();
                            break;
                        }
                        case Top:
                        {
                            // keep exactly one suggestion: the closest, ties broken by higher count
                            if (distance < maxEditDistance2 || suggestionCount > suggestions[0].count)
                            {
                                maxEditDistance2 = distance;
                                suggestions[0] = si;
                            }
                            continue;
                        }
                        }
                    }
                    if (verbosity != All) maxEditDistance2 = distance;
                    suggestions.push_back(si);
                }
            }//end foreach
            //end if

            //add edits
            //derive edits (deletes) from candidate (input) and add them to candidates list
            //this is a recursive process until the maximum edit distance has been reached
            if ((lengthDiff < maxEditDistance) && (candidateLen <= prefixLength))
            {
                //save some time
                //do not create edits with edit distance smaller than suggestions already found
                if (verbosity != All && lengthDiff >= maxEditDistance2) continue;

                for (int i = 0; i < candidateLen; i++)
                {
                    xstring temp(candidate);
                    xstring del = temp.erase(i, 1);

                    // hashset1 deduplicates deletes so each is expanded only once
                    if (hashset1.insert(del).second) { candidates.push_back(del); }
                }
            }
        }//end while

        //sort by ascending edit distance, then by descending word frequency
        if (suggestions.size() > 1) sort(suggestions.begin(), suggestions.end(), [](SuggestItem& l, SuggestItem& r){
            return l.CompareTo(r) < 0? 1 : 0;
        });
    }
    if (includeUnknown && (suggestions.size() == 0)) suggestions.push_back(SuggestItem(input, maxEditDistance + 1, 0));
    return suggestions;
}//end if
613 |
614 | //check whether all delete chars are present in the suggestion prefix in correct order, otherwise this is just a hash collision
615 | bool SymSpell::DeleteInSuggestionPrefix(xstring deleteSugg, int deleteLen, xstring suggestion, int suggestionLen)
616 | {
617 | if (deleteLen == 0) return true;
618 | if (prefixLength < suggestionLen) suggestionLen = prefixLength;
619 | int j = 0;
620 | for (int i = 0; i < deleteLen; i++)
621 | {
622 | xchar delChar = deleteSugg[i];
623 | while (j < suggestionLen && delChar != suggestion[j]) j++;
624 | if (j == suggestionLen) return false;
625 | }
626 | return true;
627 | }
628 |
629 | //create a non-unique wordlist from sample text
630 | //language independent (e.g. works with Chinese characters)
631 | vector SymSpell::ParseWords(xstring text)
632 | {
633 | // \w Alphanumeric characters (including non-latin characters, umlaut characters and digits) plus "_"
634 | // \d Digits
635 | // Compatible with non-latin characters, does not split words at apostrophes
636 | xregex r(XL("['’\\w\\-\\[_\\]]+"));
637 | xsmatch m;
638 | vector matches;
639 | //for benchmarking only: with CreateDictionary("big.txt","") and the text corpus from http://norvig.com/big.txt the Regex below provides the exact same number of dictionary items as Norvigs regex "[a-z]+" (which splits words at apostrophes & incompatible with non-latin characters)
640 | //MatchCollection mc = Regex.Matches(text.ToLower(), @"[\w-[\d_]]+");
641 | xstring::const_iterator ptr(text.cbegin());
642 | while(regex_search(ptr, text.cend(),m,r))
643 | {
644 | matches.push_back(m[0]);
645 | ptr = m.suffix().first;
646 | }
647 | return matches;
648 | }
649 |
650 | //inexpensive and language independent: only deletes, no transposes + replaces + inserts
651 | //replaces and inserts are expensive and language dependent (Chinese has 70,000 Unicode Han characters)
652 | HashSet* SymSpell::Edits(xstring word, int editDistance, HashSet* deleteWords)
653 | {
654 | editDistance++;
655 | if (word.size() > 1)
656 | {
657 | for (int i = 0; i < word.size(); i++)
658 | {
659 | xstring temp(word);
660 | xstring del = temp.erase(i, 1);
661 | if (deleteWords->insert(del).second)
662 | {
663 | //recursion, if maximum edit distance not yet reached
664 | if (editDistance < maxDictionaryEditDistance) Edits(del, editDistance, deleteWords);
665 | }
666 | }
667 | }
668 | return deleteWords;
669 | }
670 |
671 | HashSet SymSpell::EditsPrefix(xstring key)
672 | {
673 | HashSet hashSet = HashSet();
674 | if (key.size() <= maxDictionaryEditDistance) hashSet.insert(XL(""));
675 | if (key.size() > prefixLength) key = key.substr(0, prefixLength);
676 | hashSet.insert(key);
677 | Edits(key, 0, &hashSet);
678 | return hashSet;
679 | }
680 |
681 | int SymSpell::GetstringHash(xstring s)
682 | {
683 | //return s.GetHashCode();
684 |
685 | int len = s.size();
686 | int lenMask = len;
687 | if (lenMask > 3) lenMask = 3;
688 |
689 | uint hash = 2166136261;
690 | for (auto i = 0; i < len; i++)
691 | {
692 | //unchecked, its fine even if it can be overflowed
693 | {
694 | hash ^= s[i];
695 | hash *= 16777619;
696 | }
697 | }
698 |
699 | hash &= this->compactMask;
700 | hash |= (uint)lenMask;
701 | return (int)hash;
702 | }
703 |
704 |
705 | //######################
706 |
707 | //LookupCompound supports compound aware automatic spelling correction of multi-word input strings with three cases:
708 | //1. mistakenly inserted space into a correct word led to two incorrect terms
709 | //2. mistakenly omitted space between two correct words led to one incorrect combined term
710 | //3. multiple independent input terms with/without spelling errors
711 |
712 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging).
713 | /// The string being spell checked.
714 | /// A vector of SuggestItem object representing suggested correct spellings for the input string.
715 | vector SymSpell::LookupCompound(xstring input)
716 | {
717 | return LookupCompound(input, this->maxDictionaryEditDistance);
718 | }
719 |
720 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging).
721 | /// The string being spell checked.
722 | /// The maximum edit distance between input and suggested words.
723 | /// A vector of SuggestItem object representing suggested correct spellings for the input string.
724 | vector SymSpell::LookupCompound(xstring input, int editDistanceMax)
725 | {
726 | //parse input string into single terms
727 | vector termList1 = ParseWords(input);
728 |
729 | vector suggestions; //suggestions for a single term
730 | vector suggestionParts; //1 line with separate parts
731 | auto distanceComparer = EditDistance(this->distanceAlgorithm);
732 |
733 | //translate every term to its best suggestion, otherwise it remains unchanged
734 | bool lastCombi = false;
735 | for (int i = 0; i < termList1.size(); i++)
736 | {
737 | suggestions = Lookup(termList1[i], Top, editDistanceMax);
738 |
739 | //combi check, always before split
740 | if ((i > 0) && !lastCombi)
741 | {
742 | vector suggestionsCombi = Lookup(termList1[i - 1] + termList1[i], Top, editDistanceMax);
743 |
744 | if (suggestionsCombi.size() > 0)
745 | {
746 | SuggestItem best1 = suggestionParts[suggestionParts.size() - 1];
747 | SuggestItem best2 = SuggestItem();
748 | if (suggestions.size() > 0)
749 | {
750 | best2 = suggestions[0];
751 | }
752 | else
753 | {
754 | //unknown word
755 | best2.term = termList1[i];
756 | //estimated edit distance
757 | best2.distance = editDistanceMax + 1;
758 | //estimated word occurrence probability P=10 / (N * 10^word length l)
759 | best2.count = (long)((double)10 / pow((double)10, (double)best2.term.size())); // 0;
760 | }
761 |
762 | //distance1=edit distance between 2 split terms und their best corrections : als comparative value for the combination
763 | int distance1 = best1.distance + best2.distance;
764 | if ((distance1 >= 0) && ((suggestionsCombi[0].distance + 1 < distance1) || ((suggestionsCombi[0].distance + 1 == distance1) && ((double)suggestionsCombi[0].count > (double)best1.count / (double)N * (double)best2.count))))
765 | {
766 | suggestionsCombi[0].distance++;
767 | suggestionParts[suggestionParts.size() - 1] = suggestionsCombi[0];
768 | lastCombi = true;
769 | goto nextTerm;
770 | }
771 | }
772 | }
773 | lastCombi = false;
774 |
775 | //alway split terms without suggestion / never split terms with suggestion ed=0 / never split single char terms
776 | if ((suggestions.size() > 0) && ((suggestions[0].distance == 0) || (termList1[i].size() == 1)))
777 | {
778 | //choose best suggestion
779 | suggestionParts.push_back(suggestions[0]);
780 | }
781 | else
782 | {
783 | //if no perfect suggestion, split word into pairs
784 | SuggestItem suggestionSplitBest;
785 |
786 | //add original term
787 | if (suggestions.size() > 0) suggestionSplitBest.set(suggestions[0]);
788 |
789 | if (termList1[i].size() > 1)
790 | {
791 | for (int j = 1; j < termList1[i].size(); j++)
792 | {
793 | xstring part1 = termList1[i].substr(0, j);
794 | xstring part2 = termList1[i].substr(j);
795 | SuggestItem suggestionSplit = SuggestItem();
796 | vector suggestions1 = Lookup(part1, Top, editDistanceMax);
797 | if (suggestions1.size() > 0)
798 | {
799 | vector suggestions2 = Lookup(part2, Top, editDistanceMax);
800 | if (suggestions2.size() > 0)
801 | {
802 | //select best suggestion for split pair
803 | suggestionSplit.term = suggestions1[0].term + XL(" ") + suggestions2[0].term;
804 |
805 | int distance2 = distanceComparer.Compare(termList1[i], suggestionSplit.term, editDistanceMax);
806 | if (distance2 < 0) distance2 = editDistanceMax + 1;
807 |
808 | if (suggestionSplitBest.count)
809 | {
810 | if (distance2 > suggestionSplitBest.distance) continue;
811 | if (distance2 < suggestionSplitBest.distance) suggestionSplitBest.count = 0;
812 | }
813 |
814 | suggestionSplit.distance = distance2;
815 | //if bigram exists in bigram dictionary
816 | if (bigrams.count(suggestionSplit.term))
817 | {
818 | long bigramCount = bigrams.at(suggestionSplit.term);
819 | suggestionSplit.count = bigramCount;
820 |
821 | //increase count, if split.corrections are part of or identical to input
822 | //single term correction exists
823 | if (suggestions.size() > 0)
824 | {
825 | //alternatively remove the single term from suggestionsSplit, but then other splittings could win
826 | if ((suggestions1[0].term + suggestions2[0].term == termList1[i]))
827 | {
828 | //make count bigger than count of single term correction
829 | suggestionSplit.count = max(suggestionSplit.count, suggestions[0].count + 2);
830 | }
831 | else if ((suggestions1[0].term == suggestions[0].term) || (suggestions2[0].term == suggestions[0].term))
832 | {
833 | //make count bigger than count of single term correction
834 | suggestionSplit.count = max(suggestionSplit.count, suggestions[0].count + 1);
835 | }
836 | }
837 | //no single term correction exists
838 | else if ((suggestions1[0].term + suggestions2[0].term == termList1[i]))
839 | {
840 | suggestionSplit.count = max(suggestionSplit.count, max(suggestions1[0].count, suggestions2[0].count) + 2);
841 | }
842 |
843 | }
844 | else
845 | {
846 | //The Naive Bayes probability of the word combination is the product of the two word probabilities: P(AB) = P(A) * P(B)
847 | //use it to estimate the frequency count of the combination, which then is used to rank/select the best splitting variant
848 | suggestionSplit.count = min(bigramCountMin, (int64_t)((double)suggestions1[0].count / (double)N * (double)suggestions2[0].count));
849 | }
850 |
851 | if (suggestionSplit.count > suggestionSplitBest.count) suggestionSplitBest.set(suggestionSplit);
852 | }
853 | }
854 | }
855 |
856 | if (suggestionSplitBest.count)
857 | {
858 | //select best suggestion for split pair
859 | suggestionParts.push_back(suggestionSplitBest);
860 | }
861 | else
862 | {
863 | SuggestItem si = SuggestItem();
864 | si.term = termList1[i];
865 | //estimated word occurrence probability P=10 / (N * 10^word length l)
866 | si.count = (long)((double)10 / pow((double)10, (double)si.term.size()));
867 | si.distance = editDistanceMax + 1;
868 | suggestionParts.push_back(si);
869 | }
870 | }
871 | else
872 | {
873 | SuggestItem si = SuggestItem();
874 | si.term = termList1[i];
875 | //estimated word occurrence probability P=10 / (N * 10^word length l)
876 | si.count = (long)((double)10 / pow((double)10, (double)si.term.size()));
877 | si.distance = editDistanceMax + 1;
878 | suggestionParts.push_back(si);
879 | }
880 | }
881 | nextTerm:;
882 | }
883 |
884 | SuggestItem suggestion = SuggestItem();
885 |
886 | double count = N;
887 | xstring s(XL("")) ;
888 | for (SuggestItem si : suggestionParts)
889 | {
890 | s += (si.term + XL(" "));
891 | count *= (double)si.count / (double)N;
892 | }
893 | suggestion.count = (long)count;
894 | rtrim(s);
895 | suggestion.term = s;
896 | suggestion.distance = distanceComparer.Compare(input, suggestion.term, MAXINT);
897 |
898 | vector suggestionsLine;
899 | suggestionsLine.push_back(suggestion);
900 | return suggestionsLine;
901 | }
902 |
903 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging).
904 | /// The string being spell checked.
905 | /// The word segmented string,
906 | /// the word segmented and spelling corrected string,
907 | /// the Edit distance sum between input string and corrected string,
908 | /// the Sum of word occurence probabilities in log scale (a measure of how common and probable the corrected segmentation is).
909 | Info SymSpell::WordSegmentation(xstring input)
910 | {
911 | return WordSegmentation(input, this->MaxDictionaryEditDistance(), this->maxDictionaryWordLength);
912 | }
913 |
914 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging).
915 | /// The string being spell checked.
916 | /// The maximum edit distance between input and corrected words
917 | /// (0=no correction/segmentation only).
918 | /// The word segmented string,
919 | /// the word segmented and spelling corrected string,
920 | /// the Edit distance sum between input string and corrected string,
921 | /// the Sum of word occurence probabilities in log scale (a measure of how common and probable the corrected segmentation is).
922 | Info SymSpell::WordSegmentation(xstring input, int maxEditDistance)
923 | {
924 | return WordSegmentation(input, maxEditDistance, this->maxDictionaryWordLength);
925 | }
926 |
927 | /// Find suggested spellings for a multi-word input string (supports word splitting/merging).
928 | /// The string being spell checked.
929 | /// The maximum word length that should be considered.
930 | /// The maximum edit distance between input and corrected words
931 | /// (0=no correction/segmentation only).
932 | /// The word segmented string,
933 | /// the word segmented and spelling corrected string,
934 | /// the Edit distance sum between input string and corrected string,
935 | /// the Sum of word occurence probabilities in log scale (a measure of how common and probable the corrected segmentation is).
936 | Info SymSpell::WordSegmentation(xstring input, int maxEditDistance, int maxSegmentationWordLength)
937 | {
938 | int arraySize = min(maxSegmentationWordLength, (int)input.size());
939 | vector compositions = vector(arraySize);
940 | int circularIndex = -1;
941 |
942 | //outer loop (column): all possible part start positions
943 | for (int j = 0; j < input.size(); j++)
944 | {
945 | //inner loop (row): all possible part lengths (from start position): part can't be bigger than longest word in dictionary (other than long unknown word)
946 | int imax = min((int)input.size() - j, maxSegmentationWordLength);
947 | for (int i = 1; i <= imax; i++)
948 | {
949 | //get top spelling correction/ed for part
950 | xstring part = input.substr(j, i);
951 | int separatorLength = 0;
952 | int topEd = 0;
953 | double topProbabilityLog = 0;
954 | xstring topResult = XL("");
955 |
956 | if (isxspace(part[0]))
957 | {
958 | //remove space for levensthein calculation
959 | part = part.substr(1);
960 | }
961 | else
962 | {
963 | //add ed+1: space did not exist, had to be inserted
964 | separatorLength = 1;
965 | }
966 |
967 | //remove space from part1, add number of removed spaces to topEd
968 | topEd += part.size();
969 | //remove space
970 | xregex r(XL("(\\s)+"));
971 | part = regex_replace(part, r, XL(""));
972 | // part = part.Replace(" ", ""); //=System.Text.RegularExpressions.Regex.Replace(part1, @"\s+", "");
973 | // //add number of removed spaces to ed
974 | topEd -= part.size();
975 |
976 | vector results = this->Lookup(part, Top, maxEditDistance);
977 | if (results.size() > 0)
978 | {
979 | topResult = results[0].term;
980 | topEd += results[0].distance;
981 | //Naive Bayes Rule
982 | //we assume the word probabilities of two words to be independent
983 | //therefore the resulting probability of the word combination is the product of the two word probabilities
984 |
985 | //instead of computing the product of probabilities we are computing the sum of the logarithm of probabilities
986 | //because the probabilities of words are about 10^-10, the product of many such small numbers could exceed (underflow) the floating number range and become zero
987 | //log(ab)=log(a)+log(b)
988 | topProbabilityLog = log10((double)results[0].count / (double)N);
989 | }
990 | else
991 | {
992 | topResult = part;
993 | //default, if word not found
994 | //otherwise long input text would win as long unknown word (with ed=edmax+1 ), although there there should many spaces inserted
995 | topEd += part.size();
996 | topProbabilityLog = log10(10.0 / (N * pow(10.0, part.size())));
997 | }
998 |
999 | int destinationIndex = ((i + circularIndex) % arraySize);
1000 |
1001 | //set values in first loop
1002 | if (j == 0)
1003 | {
1004 | compositions[destinationIndex].set(part, topResult, topEd, topProbabilityLog);
1005 | }
1006 | else if ((i == maxSegmentationWordLength)
1007 | //replace values if better probabilityLogSum, if same edit distance OR one space difference
1008 | || (((compositions[circularIndex].getDistance() + topEd == compositions[destinationIndex].getDistance()) \
1009 | || (compositions[circularIndex].getDistance() + separatorLength + topEd == compositions[destinationIndex].getDistance())) \
1010 | && (compositions[destinationIndex].getProbability() < compositions[circularIndex].getProbability() + topProbabilityLog))
1011 | //replace values if smaller edit distance
1012 | || (compositions[circularIndex].getDistance() + separatorLength + topEd < compositions[destinationIndex].getDistance()))
1013 | {
1014 | xstring seg = compositions[circularIndex].getSegmented()+ XL(" ") + part;
1015 | xstring correct = compositions[circularIndex].getCorrected() + XL(" ") + topResult;
1016 | int d = compositions[circularIndex].getDistance() + separatorLength + topEd;
1017 | double prob = compositions[circularIndex].getProbability() + topProbabilityLog;
1018 | compositions[destinationIndex].set(seg, correct, d, prob);
1019 | }
1020 | }
1021 | circularIndex++; if (circularIndex == arraySize) circularIndex = 0;
1022 | }
1023 | return compositions[circularIndex];
1024 | }
--------------------------------------------------------------------------------