├── .DS_Store ├── .gitattributes ├── .github └── workflows │ └── dotnet.yml ├── .gitignore ├── LICENSE ├── README.md ├── appveyor.yml └── src ├── .DS_Store ├── StringComparison.Test ├── StringComparison.Test.csproj └── Tests.cs └── StringComparison ├── .DS_Store ├── .idea └── .idea.StringComparison │ └── .idea │ ├── .gitignore │ ├── indexLayout.xml │ └── vcs.xml ├── CheckSimilarity.cs ├── Enums ├── StringComparisonOption.cs └── StringComparisonTolerance.cs ├── HammingDistance.cs ├── JaccardDistance.cs ├── JaroDistance.cs ├── JaroWinklerDistance.cs ├── LevenshteinDistance.cs ├── LongestCommonSubsequence.cs ├── LongestCommonSubstring.cs ├── Operations.cs ├── OverlapCoefficient.cs ├── RatcliffObershelpSimilarity.cs ├── SorensenDiceDistance.cs ├── StringComparison.csproj ├── StringComparison.sln └── TanimotoCoefficient.cs /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bezzad/StringComparison/cd869855264e36e5f65bf081e695d9af057d478b/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.github/workflows/dotnet.yml: --------------------------------------------------------------------------------
# This workflow will build a .NET project
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-net

name: .NET Ubuntu x64

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Setup .NET 8.0
      uses: actions/setup-dotnet@v4
      with:
        dotnet-version: 8.0.x

    - name: Restore dependencies
      run: dotnet restore ./src/StringComparison/StringComparison.csproj

    - name: Build
      run: dotnet build ./src/StringComparison/StringComparison.csproj --no-restore

    # The steps above restore and build only the library project. The test
    # project is a separate .csproj that has not been restored or built at this
    # point, so `dotnet test` must be allowed to restore and build it itself;
    # passing --no-build/--no-restore here would leave no test assembly to run.
    - name: Test
      run: dotnet test ./src/StringComparison.Test/StringComparison.Test.csproj --verbosity normal
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | [Xx]64/ 19 | [Xx]86/ 20 | [Bb]uild/ 21 | bld/ 22 | [Bb]in/ 23 | [Oo]bj/ 24 | 25 | # Visual Studio 2015 cache/options directory 26 | .vs/ 27 | # Uncomment if you have tasks that create the project's static files in wwwroot 28 | #wwwroot/ 29 | 30 | # MSTest test Results 31 | [Tt]est[Rr]esult*/ 32 | [Bb]uild[Ll]og.* 33 | 34 | # NUNIT 35 | *.VisualState.xml 36 | TestResult.xml 37 | 38 | # Build Results of an ATL Project 39 | [Dd]ebugPS/ 40 | [Rr]eleasePS/ 41 | dlldata.c 42 | 43 | # DNX 44 | project.lock.json 45 | artifacts/ 46 | 47 | *_i.c 48 | *_p.c 49 | *_i.h 50 | *.ilk 51 | *.meta 52 | *.obj 53 | *.pch 54 | *.pdb 55 | *.pgc 56 | *.pgd 57 | *.rsp 58 | *.sbr 59 | *.tlb 60 | *.tli 61 | *.tlh 62 | *.tmp 63 | *.tmp_proj 64 | *.log 65 | *.vspscc 66 | *.vssscc 67 | .builds 68 | *.pidb 69 | *.svclog 70 | *.scc 71 | 72 | # Chutzpah Test files 73 | _Chutzpah* 74 | 75 | # Visual C++ cache files 76 | ipch/ 77 | *.aps 78 | *.ncb 79 | *.opendb 80 | *.opensdf 81 | *.sdf 82 | *.cachefile 83 | *.VC.db 84 | 85 | # Visual Studio profiler 86 | *.psess 87 | *.vsp 88 | *.vspx 89 | *.sap 90 | 91 | # TFS 2012 Local Workspace 92 | $tf/ 93 | 94 | # Guidance Automation Toolkit 95 | *.gpState 96 | 97 | # ReSharper is a .NET coding add-in 98 | _ReSharper*/ 99 | *.[Rr]e[Ss]harper 100 | *.DotSettings.user 101 | 102 | # JustCode is a .NET coding add-in 103 | .JustCode 104 | 105 | # TeamCity is a build add-in 106 | _TeamCity* 107 | 108 | # DotCover is a Code Coverage Tool 109 | *.dotCover 110 | 111 | # NCrunch 112 | _NCrunch_* 113 | .*crunch*.local.xml 114 | nCrunchTemp_* 115 | 116 | # MightyMoose 117 | *.mm.* 118 | AutoTest.Net/ 119 | 120 | # Web workbench (sass) 121 | .sass-cache/ 122 | 123 | # Installshield output folder 124 | 
[Ee]xpress/ 125 | 126 | # DocProject is a documentation generator add-in 127 | DocProject/buildhelp/ 128 | DocProject/Help/*.HxT 129 | DocProject/Help/*.HxC 130 | DocProject/Help/*.hhc 131 | DocProject/Help/*.hhk 132 | DocProject/Help/*.hhp 133 | DocProject/Help/Html2 134 | DocProject/Help/html 135 | 136 | # Click-Once directory 137 | publish/ 138 | 139 | # Publish Web Output 140 | *.[Pp]ublish.xml 141 | *.azurePubxml 142 | 143 | # TODO: Un-comment the next line if you do not want to checkin 144 | # your web deploy settings because they may include unencrypted 145 | # passwords 146 | #*.pubxml 147 | *.publishproj 148 | 149 | # NuGet Packages 150 | *.nupkg 151 | # The packages folder can be ignored because of Package Restore 152 | **/packages/* 153 | # except build/, which is used as an MSBuild target. 154 | !**/packages/build/ 155 | # Uncomment if necessary however generally it will be regenerated when needed 156 | #!**/packages/repositories.config 157 | # NuGet v3's project.json files produces more ignoreable files 158 | *.nuget.props 159 | *.nuget.targets 160 | 161 | # Microsoft Azure Build Output 162 | csx/ 163 | *.build.csdef 164 | 165 | # Microsoft Azure Emulator 166 | ecf/ 167 | rcf/ 168 | 169 | # Microsoft Azure ApplicationInsights config file 170 | ApplicationInsights.config 171 | 172 | # Windows Store app package directory 173 | AppPackages/ 174 | BundleArtifacts/ 175 | 176 | # Visual Studio cache files 177 | # files ending in .cache can be ignored 178 | *.[Cc]ache 179 | # but keep track of directories ending in .cache 180 | !*.[Cc]ache/ 181 | 182 | # Others 183 | ClientBin/ 184 | [Ss]tyle[Cc]op.* 185 | ~$* 186 | *~ 187 | *.dbmdl 188 | *.dbproj.schemaview 189 | *.pfx 190 | *.publishsettings 191 | node_modules/ 192 | orleans.codegen.cs 193 | 194 | # RIA/Silverlight projects 195 | Generated_Code/ 196 | 197 | # Backup & report files from converting an old project file 198 | # to a newer Visual Studio version. 
Backup files are not needed, 199 | # because we have git ;-) 200 | _UpgradeReport_Files/ 201 | Backup*/ 202 | UpgradeLog*.XML 203 | UpgradeLog*.htm 204 | 205 | # SQL Server files 206 | *.mdf 207 | *.ldf 208 | 209 | # Business Intelligence projects 210 | *.rdl.data 211 | *.bim.layout 212 | *.bim_*.settings 213 | 214 | # Microsoft Fakes 215 | FakesAssemblies/ 216 | 217 | # GhostDoc plugin setting file 218 | *.GhostDoc.xml 219 | 220 | # Node.js Tools for Visual Studio 221 | .ntvs_analysis.dat 222 | 223 | # Visual Studio 6 build log 224 | *.plg 225 | 226 | # Visual Studio 6 workspace options file 227 | *.opt 228 | 229 | # Visual Studio LightSwitch build output 230 | **/*.HTMLClient/GeneratedArtifacts 231 | **/*.DesktopClient/GeneratedArtifacts 232 | **/*.DesktopClient/ModelManifest.xml 233 | **/*.Server/GeneratedArtifacts 234 | **/*.Server/ModelManifest.xml 235 | _Pvt_Extensions 236 | 237 | # LightSwitch generated files 238 | GeneratedArtifacts/ 239 | ModelManifest.xml 240 | 241 | # Paket dependency manager 242 | .paket/paket.exe 243 | 244 | # FAKE - F# Make 245 | .fake/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Behzad Khosravifar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build status](https://ci.appveyor.com/api/projects/status/ha4n02pu9jsraq1t?svg=true)](https://ci.appveyor.com/project/Behzadkhosravifar/stringcomparison) 2 | [![nuget version](https://img.shields.io/nuget/v/stringcomparison.svg)](https://www.nuget.org/packages/stringcomparison) 3 | [![Nuget downloads](http://img.shields.io/nuget/dt/stringcomparison.svg)](https://www.nuget.org/packages/stringcomparison/) 4 | 5 | ---------------------------------------------------- 6 | 7 | # String Comparison 8 | 9 | String Comparison for C#.NET 10 | 11 | ## Project Description 12 | 13 | StringComparison is a library developed for reconciling naming conventions between different models of the electric grid. 14 | I have stripped off the power system specific code and put together what can effectively be used as a string extension for determining approximate equality between two strings. 15 | All of the algorithms used here have been pulled from online resources, translated into C#, and compiled into this library. 16 | I found several other similar open-source implementations around but nothing for .NET/C#. Adding the *.dll to your project will give you access to this extension and the individual extensions under the hood of the `IsSimilar()` extension. 
17 | 18 | ## Algorithms included in this project 19 | 20 | * [Hamming Distance](http://en.wikipedia.org/wiki/Hamming_distance) 21 | * [Jaccard Distance](http://en.wikipedia.org/wiki/Jaccard_index) 22 | * [Jaro Distance](http://en.wikipedia.org/wiki/Jaro_distance) 23 | * [Jaro-Winkler Distance](http://en.wikipedia.org/wiki/Jaro_distance) 24 | * [Levenshtein Distance](http://en.wikipedia.org/wiki/Levenshtein_distance) 25 | * [Longest Common Subsequence](http://en.wikipedia.org/wiki/Longest_common_subsequence_problem) 26 | * [Longest Common Substring](http://en.wikipedia.org/wiki/Longest_common_substring) 27 | * [Overlap Coefficient](http://en.wikipedia.org/wiki/Overlap_coefficient) 28 | * [Ratcliff-Obershelp Similarity](http://www.morfoedro.it/doc.php?n=223&lang=en) 29 | * [Sorensen-Dice Distance](http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) 30 | * [Tanimoto Coefficient](http://en.wikipedia.org/wiki/Tanimoto_coefficient#Tanimoto_coefficient_.28extended_Jaccard_coefficient.29) 31 | 32 | ## String Comparison 33 | 34 | While all of the algorithms are exposed and can be used individually to obtain their raw results, 35 | they have also been combined so that they can selectively be used to judge the approximate equality of two strings. 36 | This is done through the `IsSimilar` extension and by setting the desired `StringComparisonOption` flags and `StringComparisonTolerance`. 37 | 38 | For two strings that are to be compared approximately, a boolean determination of equality can be obtained as shown in the Usage section below. 39 | 40 | ## Installation 41 | 42 | Download the latest release from https://github.com/Behzadkhosravifar/StringComparison/releases 43 | 44 | or install from NuGet https://www.nuget.org/packages/StringComparison. 
To install, run the following command in the Package Manager Console: 45 | ``` 46 | Install-Package StringComparison 47 | ``` 48 | 49 | ## Usage 50 | 51 | ```c# 52 | 53 | string source = "behzad"; 54 | string target = "behsad"; 55 | 56 | // Choose which algorithms should weigh in for the comparison. 57 | // StringComparisonOption is a [Flags] enum, so the values are combined with `|`. 58 | var options = 59 | StringComparisonOption.UseOverlapCoefficient | 60 | StringComparisonOption.UseLongestCommonSubsequence | 61 | StringComparisonOption.UseLongestCommonSubstring; 62 | 63 | // Choose the relative strength of the comparison - is it almost exactly equal? or is it just close? 64 | var tolerance = StringComparisonTolerance.Strong; 65 | 66 | // Get a boolean determination of approximate equality 67 | bool result = source.IsSimilar(target, tolerance, options); 68 | double howManySimilar = source.Similarity(target, options); 69 | double simLevenshtein = source.LevenshteinDistancePercentage(target); 70 | double simJaro = 1 - source.JaroDistance(target); 71 | 72 | ``` 73 | 74 | ## Contributing 75 | 76 | 1. Fork it! 77 | 2. Create your feature branch: `git checkout -b my-new-feature` 78 | 3. Commit your changes: `git commit -am 'Add some feature'` 79 | 4. Push to the branch: `git push origin my-new-feature` 80 | 5. 
Submit a pull request :) 81 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | image: Visual Studio 2017 2 | configuration: Release 3 | platform: Any CPU 4 | before_build: 5 | - cmd: nuget restore .\src\ 6 | build: 7 | verbosity: normal -------------------------------------------------------------------------------- /src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bezzad/StringComparison/cd869855264e36e5f65bf081e695d9af057d478b/src/.DS_Store -------------------------------------------------------------------------------- /src/StringComparison.Test/StringComparison.Test.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | 8 | false 9 | true 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/StringComparison.Test/Tests.cs: --------------------------------------------------------------------------------
using StringComparison.Enums;
using Xunit.Abstractions;

namespace StringComparison.Test;

/// <summary>
/// Exercises every comparison algorithm exposed through <see cref="StringComparisonOption"/> and
/// writes the resulting similarity/distance figures to the xUnit test output.
/// </summary>
public class Tests(ITestOutputHelper output)
{
    /// <summary>
    /// Runs all algorithms over a set of (mostly Persian) title pairs. The values are logged for
    /// inspection; the only assertion is the tolerance-consistency check in <see cref="ExecuteAll"/>.
    /// </summary>
    [Theory]
    [InlineData("قلعه حیوانات", "قلعه حیوانات")]
    [InlineData("قلعه حیوانات", "قلعه ای برای حیوانات")]
    [InlineData("قلعه حیوانات", "قلعه‌ی حیوانات")]
    [InlineData("قلعه حیوانات", "قلعه حیوانات (خلاصه کتاب)")]
    [InlineData("قلعه حیوانات", "مزرعه حیوانات")]
    [InlineData("قلعه حیوانات", "کلینیک حیوانات")]
    [InlineData("قلعه حیوانات", "خانه حیوانات")]
    [InlineData("من قبل از تو", "من پیش از تو")]
    [InlineData("ماهی طلایی", "ماهی سیاه")]
    [InlineData("او من", "من او")]
    [InlineData("آتشین دیوار", "دیوار آتشین")]
    [InlineData("ملت عشق", "عشق")]
    [InlineData("دخیل عشق", "انسان و عشق")]
    [InlineData("مامان و بابای سیاه پلنگ صورتی", "مامبای سیاه و عشق صورتی")]
    [InlineData("باهم او من", "من او باهم")]
    [InlineData("تحلیلی بر پوسترسازی دفاع مقدس و دو جنگ جهانی- بخش دوم",
        "تحلیلی بر پوسترسازی دفاع مقدس و دو جنگ جهانی- بخش اول")]
    [InlineData("کیمیاگران", "کیمیاگر")]
    [InlineData("کیمیاگری", "کیمیاگر")]
    [InlineData(" کیمیاگر ", "کیمیاگر")]
    [InlineData("شبیه‌سازی عشق", "مدل‌سازی عشق")]
    [InlineData("تار", "راز")]
    [InlineData("قاز", "راز")]
    [InlineData("قار", "تار")]
    [InlineData("ماهنامه", "روزنامه")]
    public void UseAllAlgorithms(string source, string target)
    {
        output.WriteLine($" {source} !==! {target}");
        output.WriteLine("--------------------");

        ExecuteAll(source, target);
    }

    /// <summary>
    /// For every algorithm flag, finds the strictest tolerance (Strong = 1 .. Weak = 3) at which the
    /// pair is considered similar, logs the figures, and asserts that "similar at some tolerance"
    /// agrees with "similar at the Weak tolerance".
    /// </summary>
    private void ExecuteAll(string source, string target)
    {
        // The generic overload is required: the loop variable must be a StringComparisonOption
        // so that it can be pattern-matched and OR-ed with other flags below.
        foreach (var option in Enum.GetValues<StringComparisonOption>())
        {
            // CaseSensitive and Normalized are modifiers, not algorithms; on their own they
            // select nothing to compute.
            if (option is StringComparisonOption.CaseSensitive or
                StringComparisonOption.Normalized)
                continue;

            var sOpt = option | StringComparisonOption.Normalized;

            // Walk from the strictest tolerance to the weakest; 4 means "never similar".
            var tolerance = 1;
            while (tolerance <= 3)
            {
                var isSimilar = source.IsSimilar(target, (StringComparisonTolerance)tolerance, sOpt);
                if (isSimilar)
                    break;
                tolerance++;
            }

            var matched = tolerance < 4;
            var toleranceName = matched ? ((StringComparisonTolerance)tolerance).ToString() : "!";

            Assert.Equal(matched, source.IsSimilar(target, StringComparisonTolerance.Weak, sOpt));
            output.WriteLine($"{option} Similarity: % {source.Similarity(target, sOpt) * 100} {toleranceName}");
            output.WriteLine($"{option} Distance: " + source.DiffPercent(target, sOpt));
            output.WriteLine("");
        }
    }

    /// <summary>
    /// Checks the Strong-tolerance verdict for reordered names.
    /// NOTE(review): despite the method name, only <see cref="StringComparisonOption.UseJaccardDistance"/>
    /// is exercised here; the name is kept so existing test filters keep working.
    /// </summary>
    [Theory]
    [InlineData("Alex Taremi", "Taromi, Alex", true)]
    [InlineData("Mohammad Taremi", "Taromi, Alex", false)]
    public void TestIsSimilarUseOverlapAndJaro(string source, string target, bool expected)
    {
        // act
        var result = source.IsSimilar(target, StringComparisonTolerance.Strong,
            StringComparisonOption.UseJaccardDistance);

        Assert.Equal(expected, result);

        ExecuteAll(source, target);
    }
}
-------------------------------------------------------------------------------- /src/StringComparison/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bezzad/StringComparison/cd869855264e36e5f65bf081e695d9af057d478b/src/StringComparison/.DS_Store -------------------------------------------------------------------------------- /src/StringComparison/.idea/.idea.StringComparison/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Rider ignored files 5 | /projectSettingsUpdater.xml 6 | /.idea.StringComparisonCore.iml 7 | /contentModel.xml 8 | /modules.xml 9 | # Editor-based HTTP Client requests 10 | /httpRequests/ 11 | # Datasource local storage ignored files 12 | /dataSources/ 13 | /dataSources.local.xml 14 | -------------------------------------------------------------------------------- /src/StringComparison/.idea/.idea.StringComparison/.idea/indexLayout.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ../../../StringComparison 6 | ../../src 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/StringComparison/.idea/.idea.StringComparison/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/StringComparison/CheckSimilarity.cs: --------------------------------------------------------------------------------
using StringComparison.Enums;

namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// Decides whether two strings are approximately equal by comparing their averaged
    /// difference (see <see cref="DiffPercent"/>) against the threshold of the given tolerance.
    /// </summary>
    /// <param name="source">The first string.</param>
    /// <param name="target">The string to compare against.</param>
    /// <param name="tolerance">
    /// Strong requires a difference below 0.25, Normal below 0.5 and Weak below 0.75.
    /// Any other value yields <c>false</c>.
    /// </param>
    /// <param name="options">Algorithm and pre-processing flags, combined with <c>|</c>.</param>
    /// <returns><c>true</c> when the averaged difference is below the tolerance threshold.</returns>
    /// <exception cref="InvalidOperationException">
    /// Both strings are non-empty but <paramref name="options"/> selects no algorithm.
    /// </exception>
    public static bool IsSimilar(this string source, string target,
        StringComparisonTolerance tolerance,
        StringComparisonOption options)
    {
        // Computed before the switch so that invalid options surface regardless of tolerance.
        var diff = DiffPercent(source, target, options);

        return tolerance switch
        {
            StringComparisonTolerance.Strong => diff < 0.25,
            StringComparisonTolerance.Normal => diff < 0.5,
            StringComparisonTolerance.Weak => diff < 0.75,
            _ => false
        };
    }

    /// <summary>
    /// Averages the difference reported by every algorithm selected in <paramref name="options"/>.
    /// Each term is a difference: similarity-style results are inverted with <c>1 - value</c>.
    /// </summary>
    /// <param name="source">The first string.</param>
    /// <param name="target">The string to compare against.</param>
    /// <param name="options">Algorithm and pre-processing flags, combined with <c>|</c>.</param>
    /// <returns>
    /// The mean of the selected difference terms, or 1 when either string is null or empty
    /// (before or after pre-processing).
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// Both strings are non-empty but <paramref name="options"/> selects no algorithm.
    /// </exception>
    public static double DiffPercent(this string source, string target, StringComparisonOption options)
    {
        if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
            return 1;

        if (!options.HasFlag(StringComparisonOption.CaseSensitive))
        {
            // Capitalize is defined elsewhere in this class; presumably it case-folds both
            // strings so that the comparison ignores case - TODO confirm.
            source = source.Capitalize();
            target = target.Capitalize();
        }

        if (options.HasFlag(StringComparisonOption.Normalized))
        {
            source = source.SanitizeToAlphanumeric();
            target = target.SanitizeToAlphanumeric();
        }

        // Pre-processing can strip a string down to nothing (e.g. punctuation only). Treat that
        // like the empty-input case above instead of dividing by a zero length below.
        if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
            return 1;

        var shorterLength = Math.Min(source.Length, target.Length);
        var longerLength = Math.Max(source.Length, target.Length);
        var comparisonResults = new List<double>();

        // Min: 0 Max: 1. HammingDistance pads the shorter string, so it counts mismatches over
        // the longer length; dividing by target.Length alone could exceed 1.
        if (options.HasFlag(StringComparisonOption.UseHammingDistance))
            comparisonResults.Add(source.HammingDistance(target) / longerLength);

        // Min: 0 Max: 1
        if (options.HasFlag(StringComparisonOption.UseJaccardDistance))
            comparisonResults.Add(source.JaccardDistance(target));

        // Min: 0 Max: 1. Despite its name, JaroDistance yields a similarity (0 when the strings
        // share no characters, 1 for identical strings without repeated characters), so invert it.
        if (options.HasFlag(StringComparisonOption.UseJaroDistance))
            comparisonResults.Add(1 - source.JaroDistance(target));

        // Min: 0 Max: 1. Built on JaroDistance, so it is a similarity as well.
        if (options.HasFlag(StringComparisonOption.UseJaroWinklerDistance))
            comparisonResults.Add(1 - source.JaroWinklerDistance(target));

        // Min: 0 Max: LevenshteinDistanceUpperBounds - LevenshteinDistanceLowerBounds
        // Min: LevenshteinDistanceLowerBounds Max: LevenshteinDistanceUpperBounds
        // The normalized variant takes precedence when both Levenshtein flags are set.
        if (options.HasFlag(StringComparisonOption.UseNormalizedLevenshteinDistance))
            comparisonResults.Add(source.NormalizedLevenshteinDistance(target) /
                                  (longerLength - source.LevenshteinDistanceLowerBounds(target)));
        else if (options.HasFlag(StringComparisonOption.UseLevenshteinDistance))
            comparisonResults.Add(source.LevenshteinDistancePercentage(target));

        if (options.HasFlag(StringComparisonOption.UseLongestCommonSubsequence))
            comparisonResults.Add(1 - source.LongestCommonSubsequence(target).Length / (double)shorterLength);

        if (options.HasFlag(StringComparisonOption.UseLongestCommonSubstring))
            comparisonResults.Add(1 - source.LongestCommonSubstring(target).Length / (double)shorterLength);

        // Min: 0 Max: 1
        if (options.HasFlag(StringComparisonOption.UseSorensenDiceDistance))
            comparisonResults.Add(source.SorensenDiceDistance(target));

        // Min: 0 Max: 1
        if (options.HasFlag(StringComparisonOption.UseOverlapCoefficient))
            comparisonResults.Add(1 - source.OverlapCoefficient(target));

        // Min: 0 Max: 1
        if (options.HasFlag(StringComparisonOption.UseRatcliffObershelpSimilarity))
            comparisonResults.Add(1 - source.RatcliffObershelpSimilarity(target));

        // Min: 0 Max: 1
        if (options.HasFlag(StringComparisonOption.UseTanimotoCoefficient))
            comparisonResults.Add(1 - source.TanimotoCoefficient(target));

        // Same exception type Average() raises on an empty sequence, but with a message that
        // tells the caller what is actually wrong.
        if (comparisonResults.Count == 0)
            throw new InvalidOperationException(
                "No comparison algorithm was selected; combine at least one 'Use...' flag into the options.");

        return comparisonResults.Average();
    }

    /// <summary>
    /// The complement of <see cref="DiffPercent"/>: <c>1 - DiffPercent(source, target, options)</c>.
    /// </summary>
    public static double Similarity(this string source, string target, StringComparisonOption options)
    {
        return 1 - DiffPercent(source, target, options);
    }

    /// <summary>
    /// Trims the string and removes every character that is neither a letter, a digit nor
    /// white space. Note that inner white space is kept, despite the method name.
    /// </summary>
    public static string SanitizeToAlphanumeric(this string source)
    {
        // Materialize once instead of concatenating character by character.
        return new string(source.Trim()
            .Where(c => char.IsWhiteSpace(c) || char.IsLetterOrDigit(c))
            .ToArray());
    }
}
-------------------------------------------------------------------------------- /src/StringComparison/Enums/StringComparisonOption.cs: --------------------------------------------------------------------------------
namespace StringComparison.Enums;

/// <summary>
/// Flags that select which algorithms take part in a comparison and how the inputs are
/// pre-processed. Values are powers of two and are meant to be combined with <c>|</c>.
/// </summary>
[Flags]
public enum StringComparisonOption
{
    /// <summary>Compare the strings as given; when absent, both are case-folded first.</summary>
    CaseSensitive = 1,

    /// <summary>Strip everything except letters, digits and white space before comparing.</summary>
    Normalized = 2,

    UseHammingDistance = 4,
    UseJaccardDistance = 8,
    UseJaroDistance = 16,
    UseJaroWinklerDistance = 32,

    /// <summary>Ignored when <see cref="UseNormalizedLevenshteinDistance"/> is also set.</summary>
    UseLevenshteinDistance = 64,

    UseLongestCommonSubsequence = 128,
    UseLongestCommonSubstring = 256,
    UseNormalizedLevenshteinDistance = 512,
    UseOverlapCoefficient = 1024,
    UseRatcliffObershelpSimilarity = 2048,
    UseSorensenDiceDistance = 4096,
    UseTanimotoCoefficient = 8192
}
-------------------------------------------------------------------------------- /src/StringComparison/Enums/StringComparisonTolerance.cs: --------------------------------------------------------------------------------
namespace StringComparison.Enums;

/// <summary>
/// How close two strings must be to count as similar; each level maps to a maximum
/// averaged difference in <c>IsSimilar</c>.
/// </summary>
public enum StringComparisonTolerance
{
    /// <summary>Averaged difference must be below 0.25.</summary>
    Strong = 1,

    /// <summary>Averaged difference must be below 0.5.</summary>
    Normal = 2,

    /// <summary>Averaged difference must be below 0.75.</summary>
    Weak = 3
}
-------------------------------------------------------------------------------- /src/StringComparison/HammingDistance.cs: --------------------------------------------------------------------------------
namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// Counts the positions at which the two strings hold different characters. Strings of
    /// unequal length are compared as if the shorter one were right-padded with spaces.
    /// </summary>
    /// <param name="source">The first string.</param>
    /// <param name="target">The string to compare against.</param>
    /// <returns>The number of mismatching positions, between 0 and the longer length.</returns>
    public static double HammingDistance(this string source, string target)
    {
        var length = Math.Max(source.Length, target.Length);
        var mismatches = 0;

        for (var i = 0; i < length; i++)
        {
            // Positions past the end of the shorter string behave like a padding space.
            var left = i < source.Length ? source[i] : ' ';
            var right = i < target.Length ? target[i] : ' ';

            if (left != right)
                mismatches++;
        }

        return mismatches;
    }
}
-------------------------------------------------------------------------------- /src/StringComparison/JaccardDistance.cs: --------------------------------------------------------------------------------
namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// The Jaccard distance between the character sets of the two strings: <c>1 - JaccardIndex</c>.
    /// </summary>
    public static double JaccardDistance(this string source, string target)
    {
        return 1 - source.JaccardIndex(target);
    }

    /// <summary>
    /// The Jaccard index of the two strings' distinct characters: the size of the intersection
    /// divided by the size of the union.
    /// </summary>
    /// <returns>A value between 0 and 1; 1 when both strings are empty.</returns>
    public static double JaccardIndex(this string source, string target)
    {
        var unionCount = source.Union(target).Count();

        // Two empty sets are conventionally identical; this also avoids 0 / 0 = NaN.
        if (unionCount == 0)
            return 1;

        return (double)source.Intersect(target).Count() / unionCount;
    }
}
-------------------------------------------------------------------------------- /src/StringComparison/JaroDistance.cs: --------------------------------------------------------------------------------
namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// Computes a Jaro-style score from the distinct characters the two strings share:
    /// <c>(m/|source| + m/|target| + (m - t)/m) / 3</c>, where <c>m</c> is the number of distinct
    /// common characters and <c>t</c> is half the edit distance between the two orderings in
    /// which those characters first appear.
    /// </summary>
    /// <remarks>
    /// Despite the name, larger values mean more alike: the result is 0 when the strings share
    /// no characters and 1 for identical strings without repeated characters.
    /// </remarks>
    /// <param name="source">The first string.</param>
    /// <param name="target">The string to compare against.</param>
    /// <returns>A value between 0 and 1.</returns>
    public static double JaroDistance(this string source, string target)
    {
        // Distinct common characters, in order of first appearance in each string.
        var commonInSourceOrder = string.Concat(source.Intersect(target));

        // Kept as a double so the ratios below are not truncated by integer division.
        double m = commonInSourceOrder.Length;

        if (commonInSourceOrder.Length == 0) return 0;

        var commonInTargetOrder = string.Concat(target.Intersect(source));
        var t = commonInSourceOrder.LevenshteinDistance(commonInTargetOrder) / 2;

        return (m / source.Length + m / target.Length + (m - t) / m) / 3;
    }
}
-------------------------------------------------------------------------------- /src/StringComparison/JaroWinklerDistance.cs: --------------------------------------------------------------------------------
namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// The Jaro score boosted for a shared prefix, using a prefix scale of 0.1.
    /// </summary>
    public static double JaroWinklerDistance(this string source, string target)
    {
        return JaroWinklerDistanceWithPrefixScale(source, target, 0.1);
    }

    /// <summary>
    /// The Jaro score boosted for a shared prefix:
    /// <c>jaro + prefixLength * prefixScale * (1 - jaro)</c>.
    /// </summary>
    /// <param name="source">The first string.</param>
    /// <param name="target">The string to compare against.</param>
    /// <param name="prefixScale">
    /// Weight given to each matching prefix character; clamped to the range 0 to 0.25 so that,
    /// with at most four prefix characters counted, the result cannot exceed 1.
    /// </param>
    public static double JaroWinklerDistanceWithPrefixScale(string source, string target, double prefixScale)
    {
        prefixScale = Math.Clamp(prefixScale, 0, 0.25);

        var jaroDistance = source.JaroDistance(target);
        var commonPrefixLength = CommonPrefixLength(source, target);

        return jaroDistance + commonPrefixLength * prefixScale * (1 - jaroDistance);
    }

    /// <summary>
    /// Length of the prefix the two strings share, counting at most four characters.
    /// </summary>
    private static double CommonPrefixLength(string source, string target)
    {
        const int prefixCap = 4;

        var limit = Math.Min(prefixCap, Math.Min(source.Length, target.Length));
        var length = 0;

        while (length < limit && source[length] == target[length])
            length++;

        return length;
    }
}
-------------------------------------------------------------------------------- 
/src/StringComparison/LevenshteinDistance.cs:
--------------------------------------------------------------------------------
namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// Calculate the minimum number of single-character edits needed to change the source into the target,
    /// allowing insertions, deletions, and substitutions.
    /// <para>
    /// Both inputs are trimmed before comparison and a null input is treated as an empty string.
    /// </para>
    /// <para>
    /// Time complexity: O(n * m), where n and m are the lengths of the two (trimmed) strings.
    /// Only two rows of the edit matrix are kept in memory at any time.
    /// </para>
    /// </summary>
    /// <param name="source">The string to transform.</param>
    /// <param name="target">The string to transform the source into.</param>
    /// <returns>
    /// The number of edits required to transform the source into the target. This is at most the length of the
    /// longest string, and at least the difference in length between the two strings.
    /// </returns>
    private static double LevenshteinDistance(this string source, string target)
    {
        source = source?.Trim() ?? string.Empty;
        target = target?.Trim() ?? string.Empty;

        if (source == target)
        {
            return 0;
        }

        // Turning an empty string into the other one takes one insertion per character.
        if (source.Length == 0)
        {
            return target.Length;
        }

        if (target.Length == 0)
        {
            return source.Length;
        }

        // previousRow[j] holds the distance between source[0..i-1) and target[0..j).
        var previousRow = new int[target.Length + 1];
        var currentRow = new int[target.Length + 1];

        for (var j = 0; j <= target.Length; j++)
        {
            previousRow[j] = j;
        }

        for (var i = 1; i <= source.Length; i++)
        {
            currentRow[0] = i;

            for (var j = 1; j <= target.Length; j++)
            {
                var cost = target[j - 1] == source[i - 1] ? 0 : 1;

                var deletion = previousRow[j] + 1;
                var insertion = currentRow[j - 1] + 1;
                var substitution = previousRow[j - 1] + cost;

                currentRow[j] = Math.Min(Math.Min(deletion, insertion), substitution);
            }

            var finishedRow = currentRow;
            currentRow = previousRow;
            previousRow = finishedRow;
        }

        // After the final swap the last completed row lives in previousRow.
        return previousRow[target.Length];
    }

    /// <summary>
    /// Calculate the Levenshtein distance, shifted so that its lower bound is zero instead of the
    /// difference in length between the two strings.
    /// <para>
    /// Both inputs are trimmed (null is treated as empty) before the distance and the lower bound are
    /// computed, so the two terms always refer to the same pair of strings and the result is never negative.
    /// </para>
    /// </summary>
    /// <param name="source">The string to transform.</param>
    /// <param name="target">The string to transform the source into.</param>
    /// <returns>
    /// The Levenshtein distance, normalized so that the lower bound is always zero, rather than the difference in
    /// length between the two strings.
    /// </returns>
    public static double NormalizedLevenshteinDistance(this string source, string target)
    {
        source = source?.Trim() ?? string.Empty;
        target = target?.Trim() ?? string.Empty;

        var unnormalizedLevenshteinDistance = source.LevenshteinDistance(target);

        return unnormalizedLevenshteinDistance - source.LevenshteinDistanceLowerBounds(target);
    }

    /// <summary>
    /// The upper bounds is either the length of the longer string, or the Hamming distance.
    /// </summary>
    /// <param name="source">The first string.</param>
    /// <param name="target">The second string.</param>
    /// <returns>
    /// The Hamming distance when both strings have the same length; otherwise the length of the longer string.
    /// </returns>
    public static double LevenshteinDistanceUpperBounds(this string source, string target)
    {
        // If the two strings are the same length then the Hamming Distance is the upper bounds of the Levenshtein Distance.
        if (source.Length == target.Length)
        {
            return source.HammingDistance(target);
        }

        // Otherwise, the upper bound is the length of the longer string.
        return Math.Max(source.Length, target.Length);
    }

    /// <summary>
    /// The lower bounds is the difference in length between the two strings.
    /// </summary>
    /// <param name="source">The first string.</param>
    /// <param name="target">The second string.</param>
    /// <returns>The absolute difference between the two string lengths.</returns>
    public static double LevenshteinDistanceLowerBounds(this string source, string target)
    {
        return Math.Abs(source.Length - target.Length);
    }

    /// <summary>
    /// Calculate the Levenshtein distance of two strings as a fraction of the longer string's length.
    /// </summary>
    /// <remarks>
    /// The value is a distance ratio: 0 means the (trimmed) strings are identical and larger values mean
    /// more edits are needed. NOTE(review): the method name and its earlier documentation describe this as a
    /// similarity; confirm with callers which reading they rely on before changing the formula.
    /// </remarks>
    /// <param name="source">Source string to compare with.</param>
    /// <param name="target">Targeted string to compare.</param>
    /// <returns>
    /// The edit distance divided by the length of the longer input, from 0 to 1.0;
    /// 0.0 when either input is null or empty.
    /// </returns>
    public static double LevenshteinDistancePercentage(this string source, string target)
    {
        if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
        {
            return 0.0;
        }

        var stepsToSame = LevenshteinDistance(source, target);
        return stepsToSame / Math.Max(source.Length, target.Length);
    }
}
--------------------------------------------------------------------------------
/src/StringComparison/LongestCommonSubsequence.cs:
--------------------------------------------------------------------------------
namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// Finds a longest sequence of characters that appears in both strings in the same order,
    /// not necessarily contiguously.
    /// </summary>
    /// <param name="source">The first string.</param>
    /// <param name="target">The second string.</param>
    /// <returns>
    /// A longest common subsequence, or an empty string when either input is null or empty
    /// or the strings share no characters.
    /// </returns>
    public static string LongestCommonSubsequence(this string source, string target)
    {
        if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
        {
            return string.Empty;
        }

        var lengths = LongestCommonSubsequenceLengthTable(source, target);

        return Backtrack(lengths, source, target, source.Length, target.Length);
    }

    /// <summary>
    /// Builds the dynamic-programming table in which cell [i, j] is the length of the longest common
    /// subsequence of the first i characters of <paramref name="source"/> and the first j characters
    /// of <paramref name="target"/>.
    /// </summary>
    /// <param name="source">The first string.</param>
    /// <param name="target">The second string.</param>
    /// <returns>A table of size (source.Length + 1) x (target.Length + 1).</returns>
    private static int[,] LongestCommonSubsequenceLengthTable(string source, string target)
    {
        // Row 0 and column 0 stay at their default of 0: an empty prefix shares nothing.
        var c = new int[source.Length + 1, target.Length + 1];

        for (var i = 1; i <= source.Length; i++)
        {
            for (var j = 1; j <= target.Length; j++)
            {
                c[i, j] = source[i - 1] == target[j - 1]
                    ? c[i - 1, j - 1] + 1
                    : Math.Max(c[i, j - 1], c[i - 1, j]);
            }
        }

        return c;
    }

    /// <summary>
    /// Walks the length table from cell [i, j] back towards the origin and collects the matched characters.
    /// Implemented as a loop so that long inputs cannot exhaust the call stack.
    /// </summary>
    /// <param name="c">The table produced by <see cref="LongestCommonSubsequenceLengthTable"/>.</param>
    /// <param name="source">The first string.</param>
    /// <param name="target">The second string.</param>
    /// <param name="i">Number of leading characters of <paramref name="source"/> to consider.</param>
    /// <param name="j">Number of leading characters of <paramref name="target"/> to consider.</param>
    /// <returns>The common subsequence of the two prefixes, in left-to-right order.</returns>
    private static string Backtrack(int[,] c, string source, string target, int i, int j)
    {
        // Characters are discovered last-to-first; a stack hands them back first-to-last.
        var matched = new Stack<char>();

        while (i > 0 && j > 0)
        {
            if (source[i - 1] == target[j - 1])
            {
                matched.Push(source[i - 1]);
                i--;
                j--;
            }
            else if (c[i, j - 1] > c[i - 1, j])
            {
                j--;
            }
            else
            {
                i--;
            }
        }

        return new string(matched.ToArray());
    }
}
--------------------------------------------------------------------------------
/src/StringComparison/LongestCommonSubstring.cs:
--------------------------------------------------------------------------------
using System.Text;

namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// Finds the longest run of consecutive characters that appears in both strings.
    /// When several runs share the maximum length, the first one encountered while scanning
    /// <paramref name="source"/> from left to right is returned.
    /// </summary>
    /// <param name="source">The first string.</param>
    /// <param name="target">The second string.</param>
    /// <returns>
    /// The longest common substring; an empty string when the inputs share no characters;
    /// null when either input is null or empty.
    /// </returns>
    public static string LongestCommonSubstring(this string source, string target)
    {
        if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
        {
            return null;
        }

        // previousRow[j] / currentRow[j]: length of the common run ending at source[i - 1] / source[i]
        // and target[j]. Only the previous row is ever read, so two rows are enough.
        var previousRow = new int[target.Length];
        var currentRow = new int[target.Length];
        var longestLength = 0;
        var longestStart = 0;

        for (var i = 0; i < source.Length; i++)
        {
            for (var j = 0; j < target.Length; j++)
            {
                if (source[i] != target[j])
                {
                    currentRow[j] = 0;
                    continue;
                }

                currentRow[j] = i == 0 || j == 0 ? 1 : previousRow[j - 1] + 1;

                // Strictly greater: keeps the first run found among equally long ones.
                if (currentRow[j] > longestLength)
                {
                    longestLength = currentRow[j];
                    longestStart = i - longestLength + 1;
                }
            }

            var finishedRow = currentRow;
            currentRow = previousRow;
            previousRow = finishedRow;
        }

        return source.Substring(longestStart, longestLength);
    }
}
--------------------------------------------------------------------------------
/src/StringComparison/Operations.cs:
--------------------------------------------------------------------------------
namespace StringComparison;

public static class Operations
{
    /// <summary>
    /// Converts every character of the string to upper case via <see cref="string.ToUpper()"/>.
    /// </summary>
    /// <param name="source">The string to convert; may be null.</param>
    /// <returns>The upper-cased string, or null when <paramref name="source"/> is null.</returns>
    public static string Capitalize(this string source)
    {
        return source?.ToUpper();
    }

    /// <summary>
    /// Splits a string into an array holding one single-character string per character.
    /// </summary>
    /// <param name="source">The string to split.</param>
    /// <returns>An array with the same length as <paramref name="source"/>.</returns>
    public static string[] SplitIntoIndividualElements(string source)
    {
        return source.Select(character => character.ToString()).ToArray();
    }

    /// <summary>
    /// Concatenates the elements of the list, in order, into one string.
    /// </summary>
    /// <param name="source">The elements to join.</param>
    /// <returns>The concatenation of all elements; an empty string for an empty list.</returns>
    public static string MergeIndividualElementsIntoString(IList<string> source)
    {
        // string.Concat builds the result in one pass instead of re-copying it for every element.
        return string.Concat(source);
    }

    /// <summary>
    /// Lists the prefixes of the string with lengths 0 through <c>source.Length - 1</c>.
    /// The empty prefix is included; the full string is not.
    /// </summary>
    /// <param name="source">The string whose prefixes are listed.</param>
    /// <returns>One prefix per character of <paramref name="source"/>, shortest first.</returns>
    public static List<string> ListPrefixes(this string source)
    {
        return Enumerable.Range(0, source.Length)
            .Select(length => source.Substring(0, length))
            .ToList();
    }

    /// <summary>Lists every run of two consecutive characters in the string.</summary>
    /// <param name="source">The string to split into bigrams.</param>
    /// <returns>The bigrams in order of appearance, or null when the string is shorter than two characters.</returns>
    public static List<string> ListBiGrams(this string source)
    {
        return ListNGrams(source, 2);
    }

    /// <summary>Lists every run of three consecutive characters in the string.</summary>
    /// <param name="source">The string to split into trigrams.</param>
    /// <returns>The trigrams in order of appearance, or null when the string is shorter than three characters.</returns>
    public static List<string> ListTriGrams(this string source)
    {
        return ListNGrams(source, 3);
    }

    /// <summary>
    /// Lists every run of <paramref name="n"/> consecutive characters in the string,
    /// including the one that ends at the last character.
    /// </summary>
    /// <param name="source">The string to split into n-grams.</param>
    /// <param name="n">The length of each n-gram.</param>
    /// <returns>
    /// The <c>source.Length - n + 1</c> n-grams in order of appearance,
    /// or null when <paramref name="n"/> exceeds the string length.
    /// </returns>
    public static List<string> ListNGrams(this string source, int n)
    {
        if (n > source.Length)
        {
            return null;
        }

        var nGrams = new List<string>();

        // The last valid start index is Length - n, so the bound is inclusive.
        for (var i = 0; i <= source.Length - n; i++)
        {
            nGrams.Add(source.Substring(i, n));
        }

        return nGrams;
    }
}
--------------------------------------------------------------------------------
/src/StringComparison/OverlapCoefficient.cs:
--------------------------------------------------------------------------------
namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// Computes the overlap coefficient of two strings: the number of distinct characters they share,
    /// divided by the length of the shorter string.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the numerator counts distinct characters while the denominator counts every
    /// character, so strings with repeated characters score below 1 even when identical
    /// ("aa" vs "aa" gives 0.5). Confirm whether callers rely on this before changing it.
    /// </remarks>
    /// <param name="source">The first string.</param>
    /// <param name="target">The second string.</param>
    /// <returns>The coefficient, or 0 when either input is null or empty.</returns>
    public static double OverlapCoefficient(this string source, string target)
    {
        // Without this guard an empty input divides 0 by 0 and yields NaN.
        if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
        {
            return 0;
        }

        double sharedCharacters = source.Intersect(target).Count();

        return sharedCharacters / Math.Min(source.Length, target.Length);
    }
}
--------------------------------------------------------------------------------
/src/StringComparison/RatcliffObershelpSimilarity.cs:
--------------------------------------------------------------------------------
namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// Computes twice the number of distinct characters shared by the two strings,
    /// divided by the sum of their lengths.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the classic Ratcliff/Obershelp measure counts matching characters by recursively
    /// locating longest common substrings; this implementation uses a distinct-character intersection
    /// instead. Confirm whether that simplification is intended.
    /// </remarks>
    /// <param name="source">The first string.</param>
    /// <param name="target">The second string.</param>
    /// <returns>The similarity score, or 0 when either input is null or empty.</returns>
    public static double RatcliffObershelpSimilarity(this string source, string target)
    {
        // Two empty strings would otherwise divide 0 by 0 and yield NaN.
        if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
        {
            return 0;
        }

        double sharedCharacters = source.Intersect(target).Count();

        return 2 * sharedCharacters / (source.Length + target.Length);
    }
}
--------------------------------------------------------------------------------
/src/StringComparison/SorensenDiceDistance.cs:
--------------------------------------------------------------------------------
namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// Computes the Sorensen-Dice distance, defined as one minus <see cref="SorensenDiceIndex"/>.
    /// </summary>
    /// <param name="source">The first string.</param>
    /// <param name="target">The second string.</param>
    /// <returns>The distance; 1 when either input is null or empty.</returns>
    public static double SorensenDiceDistance(this string source, string target)
    {
        return 1 - source.SorensenDiceIndex(target);
    }

    /// <summary>
    /// Computes the Sorensen-Dice index over characters: twice the number of distinct characters
    /// shared by the two strings, divided by the sum of their lengths.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the numerator counts distinct characters while the denominator counts every
    /// character, so identical strings with repeated characters score below 1
    /// ("aa" vs "aa" gives 0.5). Confirm whether callers rely on this before changing it.
    /// </remarks>
    /// <param name="source">The first string.</param>
    /// <param name="target">The second string.</param>
    /// <returns>The index, or 0 when either input is null or empty.</returns>
    public static double SorensenDiceIndex(this string source, string target)
    {
        // Two empty strings would otherwise divide 0 by 0 and yield NaN.
        if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
        {
            return 0;
        }

        double sharedCharacters = source.Intersect(target).Count();

        return 2 * sharedCharacters / (source.Length + target.Length);
    }
}
--------------------------------------------------------------------------------
/src/StringComparison/StringComparison.csproj:
--------------------------------------------------------------------------------
1 |  2 | 3 | 4 | latestmajor 5 | enable 6 | disable 7 | StringComparison 8 | net8.0;netstandard2.0;netstandard2.1 9 | 2.0.0 10 | StringComparison 11 | While all of the algorithms are exposed and can be used and can provide their raw results, they have been conveniently combined in a way that they can selectively be used to judge the approximate equality of two strings. 
12 | 2017-2025 13 | https://github.com/bezzad/StringComparisonCore 14 | https://github.com/bezzad/StringComparisonCore 15 | git 16 | string; comparation; comparison; comparer; approximate 17 | DotNet core version of StringComparison project 18 | README.md 19 | embedded 20 | 21 | 22 | true 23 | 24 | 25 | true 26 | MIT 27 | 28 | 29 | 30 | 31 | True 32 | \ 33 | 34 | 35 | True 36 | \ 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/StringComparison/StringComparison.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StringComparison.Test", "..\StringComparison.Test\StringComparison.Test.csproj", "{E9EF2A4C-F9F0-4861-BD14-A35986D78CA2}" 4 | EndProject 5 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution", "Solution", "{0100F9D1-CDEF-4550-B64F-6337919629F8}" 6 | ProjectSection(SolutionItems) = preProject 7 | ..\..\LICENSE = ..\..\LICENSE 8 | ..\..\README.md = ..\..\README.md 9 | EndProjectSection 10 | EndProject 11 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StringComparison", "StringComparison.csproj", "{5D230323-D73E-4212-B062-6D880F0404FC}" 12 | EndProject 13 | Global 14 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 15 | Debug|Any CPU = Debug|Any CPU 16 | Release|Any CPU = Release|Any CPU 17 | EndGlobalSection 18 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 19 | {E9EF2A4C-F9F0-4861-BD14-A35986D78CA2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 20 | {E9EF2A4C-F9F0-4861-BD14-A35986D78CA2}.Debug|Any CPU.Build.0 = Debug|Any CPU 21 | {E9EF2A4C-F9F0-4861-BD14-A35986D78CA2}.Release|Any CPU.ActiveCfg = Release|Any CPU 22 | {E9EF2A4C-F9F0-4861-BD14-A35986D78CA2}.Release|Any CPU.Build.0 = Release|Any CPU 23 | {5D230323-D73E-4212-B062-6D880F0404FC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 24 | 
{5D230323-D73E-4212-B062-6D880F0404FC}.Debug|Any CPU.Build.0 = Debug|Any CPU 25 | {5D230323-D73E-4212-B062-6D880F0404FC}.Release|Any CPU.ActiveCfg = Release|Any CPU 26 | {5D230323-D73E-4212-B062-6D880F0404FC}.Release|Any CPU.Build.0 = Release|Any CPU 27 | EndGlobalSection 28 | EndGlobal 29 |
--------------------------------------------------------------------------------
/src/StringComparison/TanimotoCoefficient.cs:
--------------------------------------------------------------------------------
namespace StringComparison;

public static partial class ComparisonMetrics
{
    /// <summary>
    /// Computes the Tanimoto coefficient over characters: the number of distinct shared characters,
    /// divided by the sum of both string lengths minus that shared count.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the shared count uses distinct characters while the lengths count every character,
    /// so identical strings with repeated characters score below 1 ("aa" vs "aa" gives 1/3).
    /// Confirm whether callers rely on this before changing it.
    /// </remarks>
    /// <param name="source">The first string.</param>
    /// <param name="target">The second string.</param>
    /// <returns>The coefficient, or 0 when either input is null or empty.</returns>
    public static double TanimotoCoefficient(this string source, string target)
    {
        // Two empty strings would otherwise divide 0 by 0 and yield NaN.
        if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
        {
            return 0;
        }

        double sourceLength = source.Length;
        double targetLength = target.Length;
        double sharedCharacters = source.Intersect(target).Count();

        return sharedCharacters / (sourceLength + targetLength - sharedCharacters);
    }
}
--------------------------------------------------------------------------------