├── AhoCorasick ├── key.snk ├── AhoCorasick.csproj ├── Extensions.cs ├── Trie.cs ├── AhoCorasick.cs └── CharComparer.cs ├── .github └── dependabot.yml ├── .nuget └── packages.config ├── LICENSE ├── AhoCorasick.Tests ├── AhoCorasick.Tests.csproj ├── CharComparerTests.cs └── UnitTests.cs ├── AhoCorasick.SqlClr ├── AhoCorasick.SqlClr.sln ├── AhoCorasick.SqlClr.sqlproj ├── Contains.cs └── dist │ └── AhoCorasick.SqlClr_Create.sql ├── AhoCorasick.sln ├── appveyor.yml ├── .gitignore └── README.md /AhoCorasick/key.snk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mganss/AhoCorasick/HEAD/AhoCorasick/key.snk -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: nuget 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "04:00" 8 | open-pull-requests-limit: 10 9 | -------------------------------------------------------------------------------- /.nuget/packages.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Michael Ganss 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission 
notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /AhoCorasick.Tests/AhoCorasick.Tests.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | net462;net6.0 4 | Ganss.Text.Tests 5 | true 6 | opencover 7 | ../coverage.xml 8 | [AhoCorasick]* 9 | 10 | 11 | 12 | 13 | 14 | 15 | all 16 | runtime; build; native; contentfiles; analyzers 17 | 18 | 19 | 20 | 21 | 22 | all 23 | runtime; build; native; contentfiles; analyzers; buildtransitive 24 | 25 | 26 | -------------------------------------------------------------------------------- /AhoCorasick.SqlClr/AhoCorasick.SqlClr.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2013 4 | VisualStudioVersion = 12.0.31101.0 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{00D1A9C2-B5F0-4AF3-8072-F6C62B433612}") = "AhoCorasick.SqlClr", "AhoCorasick.SqlClr.sqlproj", "{FBC73EEF-133B-492B-9EBB-77E7484F3485}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {FBC73EEF-133B-492B-9EBB-77E7484F3485}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | 
{FBC73EEF-133B-492B-9EBB-77E7484F3485}.Debug|Any CPU.Build.0 = Debug|Any CPU 16 | {FBC73EEF-133B-492B-9EBB-77E7484F3485}.Debug|Any CPU.Deploy.0 = Debug|Any CPU 17 | {FBC73EEF-133B-492B-9EBB-77E7484F3485}.Release|Any CPU.ActiveCfg = Release|Any CPU 18 | {FBC73EEF-133B-492B-9EBB-77E7484F3485}.Release|Any CPU.Build.0 = Release|Any CPU 19 | {FBC73EEF-133B-492B-9EBB-77E7484F3485}.Release|Any CPU.Deploy.0 = Release|Any CPU 20 | EndGlobalSection 21 | GlobalSection(SolutionProperties) = preSolution 22 | HideSolutionNode = FALSE 23 | EndGlobalSection 24 | EndGlobal 25 | -------------------------------------------------------------------------------- /AhoCorasick.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.26730.12 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AhoCorasick", "AhoCorasick\AhoCorasick.csproj", "{A1F19E27-06C8-4DB1-94CE-24B4F10330DF}" 7 | EndProject 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AhoCorasick.Tests", "AhoCorasick.Tests\AhoCorasick.Tests.csproj", "{860F4149-D7F3-495A-99BE-8AC3FBAE4176}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|Any CPU = Debug|Any CPU 13 | Release|Any CPU = Release|Any CPU 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {A1F19E27-06C8-4DB1-94CE-24B4F10330DF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 17 | {A1F19E27-06C8-4DB1-94CE-24B4F10330DF}.Debug|Any CPU.Build.0 = Debug|Any CPU 18 | {A1F19E27-06C8-4DB1-94CE-24B4F10330DF}.Release|Any CPU.ActiveCfg = Release|Any CPU 19 | {A1F19E27-06C8-4DB1-94CE-24B4F10330DF}.Release|Any CPU.Build.0 = Release|Any CPU 20 | {860F4149-D7F3-495A-99BE-8AC3FBAE4176}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 21 | {860F4149-D7F3-495A-99BE-8AC3FBAE4176}.Debug|Any CPU.Build.0 = 
Debug|Any CPU 22 | {860F4149-D7F3-495A-99BE-8AC3FBAE4176}.Release|Any CPU.ActiveCfg = Release|Any CPU 23 | {860F4149-D7F3-495A-99BE-8AC3FBAE4176}.Release|Any CPU.Build.0 = Release|Any CPU 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {12FCD39C-9278-4D4D-8438-9A0FCDCA85F3} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | version: 2.0.{build} 2 | skip_tags: true 3 | image: Visual Studio 2022 4 | environment: 5 | access_token: 6 | secure: Eq6BjtZ80BXKLwFMg76IjuQAvbLjbojIF/X/ARouGVhxPneJtgDfCXMPNgJ7KBKq 7 | sonar_token: 8 | secure: W7pHKhuTW6Lh8WlXJNTOIaOzeuxLi+H6Nqmnm4pr28jM6jyIpOZ+1r10lIQi0eCA 9 | JAVA_HOME: C:\Program Files\Java\jdk19 10 | build_script: 11 | - dotnet restore 12 | - dotnet pack --include-symbols --include-source -c Release AhoCorasick 13 | test_script: 14 | - ps: | 15 | if (-not $env:APPVEYOR_PULL_REQUEST_NUMBER) { 16 | dotnet tool install --global dotnet-sonarscanner 17 | dotnet sonarscanner begin /k:"mganss_AhoCorasick" /v:$env:APPVEYOR_BUILD_VERSION /o:"mganss-github" /d:sonar.host.url="https://sonarcloud.io" /d:sonar.login="$env:sonar_token" /d:sonar.cs.opencover.reportsPaths="$($env:APPVEYOR_BUILD_FOLDER)\coverage.xml" /d:sonar.coverage.exclusions="**/Program.cs" 18 | dotnet build 19 | } 20 | - dotnet test /p:CollectCoverage=true AhoCorasick.Tests\AhoCorasick.Tests.csproj /p:Include="[AhoCorasick]*" -f net6.0 21 | - ps: cp coverage.*.xml ./coverage.xml 22 | - ps: | 23 | if (-not $env:APPVEYOR_PULL_REQUEST_NUMBER) { 24 | dotnet sonarscanner end /d:sonar.login="$env:sonar_token" 25 | } 26 | - pip install codecov 27 | - codecov -f "coverage.xml" 28 | artifacts: 29 | - path: 'AhoCorasick\**\*.*nupkg' 30 | deploy: 31 | - 
provider: GitHub 32 | tag: v$(APPVEYOR_BUILD_VERSION) 33 | release: $(APPVEYOR_BUILD_VERSION) 34 | description: '$(APPVEYOR_REPO_COMMIT_MESSAGE)' 35 | auth_token: 36 | secure: Eq6BjtZ80BXKLwFMg76IjuQAvbLjbojIF/X/ARouGVhxPneJtgDfCXMPNgJ7KBKq 37 | draft: true 38 | on: 39 | branch: master 40 | -------------------------------------------------------------------------------- /AhoCorasick/AhoCorasick.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | Ganss.Text 4 | AhoCorasick 5 | Implements the Aho-Corasick string search algorithm 6 | Copyright 2013-$([System.DateTime]::Now.Year) Michael Ganss 7 | 1.0.0 8 | 2.0.0.0 9 | $(AppVeyor_Build_Version).0 10 | $(AppVeyor_Build_Version) 11 | Michael Ganss 12 | net40;netstandard2.0 13 | AhoCorasick 14 | AhoCorasick 15 | aho-corasick;aho;corasick;string;search;match;substring 16 | https://github.com/mganss/AhoCorasick 17 | https://github.com/mganss/AhoCorasick/blob/master/LICENSE 18 | README.md 19 | git 20 | git://github.com/mganss/AhoCorasick 21 | Ganss.Text 22 | true 23 | bin\$(Configuration)\$(TargetFramework)\AhoCorasick.xml 24 | true 25 | key.snk 26 | snupkg 27 | true 28 | true 29 | true 30 | snupkg 31 | latest 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /AhoCorasick/Extensions.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | 3 | namespace Ganss.Text 4 | { 5 | /// 6 | /// Provides extension methods. 7 | /// 8 | public static class Extensions 9 | { 10 | /// 11 | /// Determines whether this instance contains the specified words. 12 | /// 13 | /// The text. 14 | /// The words. 15 | /// The matched words. 16 | public static IEnumerable Contains(this string text, IEnumerable words) 17 | { 18 | return new AhoCorasick(words).Search(text); 19 | } 20 | 21 | /// 22 | /// Determines whether this instance contains the specified words. 
23 | /// 24 | /// The text. 25 | /// The words. 26 | /// The matched words. 27 | public static IEnumerable Contains(this string text, params string[] words) 28 | { 29 | return new AhoCorasick(words).Search(text); 30 | } 31 | 32 | /// 33 | /// Determines whether this instance contains the specified words. 34 | /// 35 | /// The text. 36 | /// The comparer used to compare individual characters. 37 | /// The words. 38 | /// The matched words. 39 | public static IEnumerable Contains(this string text, IEqualityComparer comparer, IEnumerable words) 40 | { 41 | return new AhoCorasick(comparer, words).Search(text); 42 | } 43 | 44 | /// 45 | /// Determines whether this instance contains the specified words. 46 | /// 47 | /// The text. 48 | /// The comparer used to compare individual characters. 49 | /// The words. 50 | /// The matched words. 51 | public static IEnumerable Contains(this string text, IEqualityComparer comparer, params string[] words) 52 | { 53 | return new AhoCorasick(comparer, words).Search(text); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.sln.docstates 8 | 9 | # Build results 10 | 11 | [Dd]ebug/ 12 | [Rr]elease/ 13 | x64/ 14 | build/ 15 | [Bb]in/ 16 | [Oo]bj/ 17 | 18 | # MSTest test Results 19 | [Tt]est[Rr]esult*/ 20 | [Bb]uild[Ll]og.* 21 | 22 | *_i.c 23 | *_p.c 24 | *.ilk 25 | *.meta 26 | *.obj 27 | *.pch 28 | *.pdb 29 | *.pgc 30 | *.pgd 31 | *.rsp 32 | *.sbr 33 | *.tlb 34 | *.tli 35 | *.tlh 36 | *.tmp 37 | *.tmp_proj 38 | *.log 39 | *.vspscc 40 | *.vssscc 41 | .builds 42 | *.pidb 43 | *.log 44 | *.scc 45 | 46 | # Visual C++ cache files 47 | ipch/ 48 | *.aps 49 | *.ncb 50 | *.opensdf 51 | *.sdf 52 | *.cachefile 53 | 54 | # Visual Studio profiler 55 | *.psess 56 | *.vsp 57 | *.vspx 58 | 59 | # Guidance Automation Toolkit 60 | *.gpState 61 | 62 | # ReSharper is a .NET coding add-in 63 | _ReSharper*/ 64 | *.[Rr]e[Ss]harper 65 | 66 | # TeamCity is a build add-in 67 | _TeamCity* 68 | 69 | # DotCover is a Code Coverage Tool 70 | *.dotCover 71 | 72 | # NCrunch 73 | *.ncrunch* 74 | .*crunch*.local.xml 75 | 76 | # Installshield output folder 77 | [Ee]xpress/ 78 | 79 | # DocProject is a documentation generator add-in 80 | DocProject/buildhelp/ 81 | DocProject/Help/*.HxT 82 | DocProject/Help/*.HxC 83 | DocProject/Help/*.hhc 84 | DocProject/Help/*.hhk 85 | DocProject/Help/*.hhp 86 | DocProject/Help/Html2 87 | DocProject/Help/html 88 | 89 | # Click-Once directory 90 | publish/ 91 | 92 | # Publish Web Output 93 | *.Publish.xml 94 | *.pubxml 95 | 96 | # NuGet Packages Directory 97 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 98 | packages/ 99 | 100 | # Windows Azure Build Output 101 | csx 102 | *.build.csdef 103 | 104 | # Windows Store app package directory 105 | AppPackages/ 106 | 107 | # Others 108 | sql/ 109 | *.Cache 110 | ClientBin/ 111 | [Ss]tyle[Cc]op.* 112 | ~$* 113 | *~ 114 | *.dbmdl 115 | *.[Pp]ublish.xml 116 | *.pfx 117 | *.publishsettings 118 | 119 | # RIA/Silverlight projects 120 | Generated_Code/ 121 | 122 | # 
Backup & report files from converting an old project file to a newer 123 | # Visual Studio version. Backup files are not needed, because we have git ;-) 124 | _UpgradeReport_Files/ 125 | Backup*/ 126 | UpgradeLog*.XML 127 | UpgradeLog*.htm 128 | 129 | # SQL Server files 130 | App_Data/*.mdf 131 | App_Data/*.ldf 132 | 133 | # ========================= 134 | # Windows detritus 135 | # ========================= 136 | 137 | # Windows image file caches 138 | Thumbs.db 139 | ehthumbs.db 140 | 141 | # Folder config file 142 | Desktop.ini 143 | 144 | # Recycle Bin used on file shares 145 | $RECYCLE.BIN/ 146 | 147 | # Mac crap 148 | .DS_Store 149 | 150 | *.nupkg 151 | TestResult.xml 152 | coverage.xml 153 | .vs/ 154 | /OpenCover 155 | -------------------------------------------------------------------------------- /AhoCorasick.Tests/CharComparerTests.cs: -------------------------------------------------------------------------------- 1 | using NUnit.Framework; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Globalization; 5 | using System.Linq; 6 | using System.Text; 7 | using System.Threading; 8 | using System.Threading.Tasks; 9 | 10 | namespace Ganss.Text.Tests 11 | { 12 | public class CharComparerTests 13 | { 14 | const char SmallDotlessI = '\u0131'; 15 | const char CapitalIWithDot = '\u0130'; 16 | const char CapitalSharpS = '\u1e9e'; 17 | const char LatinSmallCapitalR = '\u0280'; 18 | const char LatinLetterYR = '\u01a6'; 19 | 20 | [Test] 21 | public void OrdinalTest() 22 | { 23 | var c = CharComparer.Ordinal; 24 | Assert.That(c.Equals('i', 'i'), Is.True); 25 | Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False); 26 | Assert.That(c.Equals('ß', CapitalSharpS), Is.False); 27 | Assert.That(c.Equals(LatinSmallCapitalR, LatinLetterYR), Is.False); 28 | 29 | c = CharComparer.OrdinalIgnoreCase; 30 | Assert.That(c.Equals('i', 'I'), Is.True); 31 | Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False); 32 | Assert.That(c.Equals(SmallDotlessI, 'I'), Is.False); 
33 | Assert.That(c.Equals('ß', CapitalSharpS), Is.False); 34 | Assert.That(c.Equals(LatinSmallCapitalR, LatinLetterYR), Is.True); 35 | } 36 | 37 | [Test] 38 | public void InvariantTest() 39 | { 40 | var c = CharComparer.InvariantCulture; 41 | Assert.That(c.Equals('i', 'i'), Is.True); 42 | Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False); 43 | Assert.That(c.Equals('ß', CapitalSharpS), Is.False); 44 | Assert.That(c.Equals(LatinSmallCapitalR, LatinLetterYR), Is.False); 45 | 46 | c = CharComparer.InvariantCultureIgnoreCase; 47 | Assert.That(c.Equals('i', 'I'), Is.True); 48 | Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False); 49 | Assert.That(c.Equals(SmallDotlessI, 'I'), Is.False); 50 | Assert.That(c.Equals('ß', CapitalSharpS), Is.True); 51 | } 52 | 53 | [Test] 54 | public void CultureTest() 55 | { 56 | CultureInfo.CurrentCulture = new CultureInfo("tr-TR"); 57 | var c = CharComparer.CurrentCulture; 58 | Assert.That(c.Equals('i', 'i'), Is.True); 59 | Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False); 60 | Assert.That(c.Equals('ß', CapitalSharpS), Is.False); 61 | Assert.That(c.Equals(LatinSmallCapitalR, LatinLetterYR), Is.False); 62 | 63 | c = CharComparer.CurrentCultureIgnoreCase; 64 | Assert.That(c.Equals('i', 'I'), Is.False); 65 | Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False); 66 | Assert.That(c.Equals(SmallDotlessI, 'I'), Is.True); 67 | Assert.That(c.Equals('i', CapitalIWithDot), Is.True); 68 | Assert.That(c.Equals('ß', CapitalSharpS), Is.True); 69 | 70 | Assert.That(c.GetHashCode('i'), Is.EqualTo(c.GetHashCode(CapitalIWithDot))); 71 | Assert.That(c.GetHashCode(SmallDotlessI), Is.EqualTo(c.GetHashCode('I'))); 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /AhoCorasick/Trie.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | 3 | namespace Ganss.Text 4 | { 5 | /// 6 | /// A Trie. 
7 |     ///
8 |     public class Trie
9 |     {
10 |         /// <summary>
11 |         /// Gets or sets the child nodes.
12 |         /// </summary>
13 |         /// <value>
14 |         /// The child nodes.
15 |         /// </value>
16 |         public Dictionary<char, Trie> Next { get; set; }
17 |
18 |         /// <summary>
19 |         /// Gets or sets a value indicating whether this instance represents a word in the dictionary.
20 |         /// </summary>
21 |         /// <value>
22 |         /// <c>true</c> if this instance is a word in the dictionary; otherwise, <c>false</c>.
23 |         /// </value>
24 |         public bool IsWord { get; set; }
25 |
26 |         /// <summary>
27 |         /// Gets or sets the failure node.
28 |         /// </summary>
29 |         /// <value>
30 |         /// The failure node.
31 |         /// </value>
32 |         public Trie Fail { get; set; }
33 |
34 |         /// <summary>
35 |         /// Gets or sets the parent node.
36 |         /// </summary>
37 |         /// <value>
38 |         /// The parent node.
39 |         /// </value>
40 |         public Trie Parent { get; set; }
41 |
42 |         /// <summary>
43 |         /// Gets the word prefix this node represents.
44 |         /// </summary>
45 |         /// <value>
46 |         /// The word prefix.
47 |         /// </value>
48 |         public string Word { get; private set; }
49 |
50 |         /// <summary>
51 |         /// Initializes a new instance of the <see cref="Trie"/> class.
52 |         /// </summary>
53 |         public Trie()
54 |         {
55 |             Word = "";
56 |             Next = new Dictionary<char, Trie>();
57 |         }
58 |
59 |         /// <summary>
60 |         /// Initializes a new instance of the <see cref="Trie"/> class.
61 |         /// </summary>
62 |         /// <param name="comparer">The comparer used to compare individual characters.</param>
63 |         public Trie(IEqualityComparer<char> comparer)
64 |         {
65 |             Word = "";
66 |             Next = new Dictionary<char, Trie>(comparer);
67 |         }
68 |
69 |         /// <summary>
70 |         /// Adds the specified word to the trie, creating any missing nodes along its path (iterative; no substring copies).
71 |         /// </summary>
72 |         /// <param name="word">The word to add. Must not be null or empty, otherwise <see cref="System.ArgumentException"/> is thrown.</param>
73 |         /// <returns>The node representing the last character of <paramref name="word"/>; its <see cref="IsWord"/> flag is set.</returns>
74 |         public virtual Trie Add(string word)
75 |         {
76 |             if (string.IsNullOrEmpty(word))
77 |                 throw new System.ArgumentException("The word must not be null or empty.", nameof(word));
78 |             var node = this;
79 |             foreach (var c in word)
80 |             {
81 |                 if (!node.Next.TryGetValue(c, out Trie child))
82 |                     node.Next[c] = child = new Trie(node.Next.Comparer) { Parent = node, Word = node.Word + c };
83 |                 node = child; // descend one level per character instead of recursing on Substring(1)
84 |             }
85 |             node.IsWord = true;
86 |             return node;
87 |         }
88 |
89 |         ///
90 |         /// Finds the failure node for a specified suffix within the given range of indices.
91 |         ///
92 |         /// The string containing the suffix.
93 |         /// The start index of the suffix within the string.
94 | /// The end index (exclusive) of the suffix within the string. 95 | /// The failure node or null if no failure node is found. 96 | 97 | public virtual Trie ExploreFailLink(string word, int startIndex, int endIndex) 98 | { 99 | var node = this; 100 | 101 | for (int i = startIndex; i < endIndex; i++) 102 | { 103 | if (!node.Next.TryGetValue(word[i], out node)) 104 | { 105 | return null; 106 | } 107 | } 108 | 109 | return node; 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /AhoCorasick.SqlClr/AhoCorasick.SqlClr.sqlproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | AhoCorasick.SqlClr 8 | 2.0 9 | 4.1 10 | {fbc73eef-133b-492b-9ebb-77e7484f3485} 11 | Microsoft.Data.Tools.Schema.Sql.Sql120DatabaseSchemaProvider 12 | Database 13 | 14 | 15 | AhoCorasick.SqlClr 16 | AhoCorasick.SqlClr 17 | 1033, CI 18 | BySchemaAndSchemaType 19 | True 20 | v4.0 21 | CS 22 | Properties 23 | False 24 | True 25 | True 26 | True 27 | 28 | 29 | bin\Release\ 30 | $(MSBuildProjectName).sql 31 | False 32 | pdbonly 33 | true 34 | false 35 | true 36 | prompt 37 | 4 38 | NET40 39 | 40 | 41 | bin\Debug\ 42 | $(MSBuildProjectName).sql 43 | false 44 | true 45 | full 46 | false 47 | true 48 | true 49 | prompt 50 | 4 51 | NET40 52 | 53 | 54 | 11.0 55 | 56 | True 57 | 11.0 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | AhoCorasick.cs 67 | 68 | 69 | CharComparer.cs 70 | 71 | 72 | Trie.cs 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /AhoCorasick.Tests/UnitTests.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Linq; 3 | using Ganss.Text; 4 | using System.Collections.Generic; 5 | using NUnit.Framework; 6 | using System.Globalization; 7 | 8 | namespace Ganss.Text.Tests 9 | { 10 | public class WordMatchList: List 
11 | { 12 | public void Add(int index, string word) 13 | { 14 | Add(new WordMatch { Index = index, Word = word }); 15 | } 16 | } 17 | 18 | public class UnitTests 19 | { 20 | [Test] 21 | public void SearchWikipediaTest() 22 | { 23 | // from https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm 24 | var ac = new AhoCorasick("a", "ab", "bab", "bc", "bca", "c", "caa"); 25 | var m = ac.Search("abccab").ToList(); 26 | var expected = new WordMatchList { { 0, "a" }, { 0, "ab" }, { 1, "bc" }, { 2, "c" }, { 3, "c" }, { 4, "a" }, { 4, "ab" } }; 27 | Assert.That(m, Is.EquivalentTo(expected)); 28 | } 29 | 30 | [Test] 31 | public void SimpleTest() 32 | { 33 | var ac = new AhoCorasick("a"); 34 | Assert.That(ac.Search("a").ToList(), Is.EquivalentTo(new WordMatchList { { 0, "a" } })); 35 | Assert.That(ac.Search("b"), Is.Empty); 36 | } 37 | 38 | [Test] 39 | public void SearchNullEmptyTest() 40 | { 41 | var ac = new AhoCorasick("a"); 42 | var m = ac.Search(null).ToList(); 43 | Assert.That(m, Is.Empty); 44 | m = ac.Search("").ToList(); 45 | Assert.That(m, Is.Empty); 46 | } 47 | 48 | [Test] 49 | public void SearchMultipleTest() 50 | { 51 | var ac = new AhoCorasick("her", "their", "eye", "iris", "he", "is", "si"); 52 | var m = ac.Search("theye iris irisis").ToList(); 53 | var expected = new WordMatchList { { 1, "he" }, { 2, "eye" }, { 6, "iris" }, { 8, "is" }, { 11, "iris" }, { 13, "is" }, { 14, "si" }, { 15, "is" } }; 54 | Assert.That(m, Is.EquivalentTo(expected)); 55 | } 56 | 57 | [Test] 58 | public void SearchIvankTest() 59 | { 60 | // from http://blog.ivank.net/aho-corasick-algorithm-in-as3.html 61 | var ac = new AhoCorasick("take", "fast", "sofa"); 62 | var m = ac.Search("takeso fasofast fassofatake sosso sofastake so").ToList(); 63 | var expected = new WordMatchList { { 0, "take" }, { 9, "sofa" }, { 11, "fast" }, { 19, "sofa" }, { 23, "take" }, { 34, "sofa" }, { 36, "fast" }, { 39, "take" } }; 64 | Assert.That(m, Is.EquivalentTo(expected)); 65 | } 66 | 67 | 
[Test] 68 | public void StringExtensionTest() 69 | { 70 | var m = "abc".Contains("abd", "bc", "ab").ToList(); 71 | Assert.That(m, Is.EquivalentTo(new WordMatchList { { 0, "ab" }, { 1, "bc" } })); 72 | m = "abc".Contains(new List { "abd", "bc", "ab" }).ToList(); 73 | Assert.That(m, Is.EquivalentTo(new WordMatchList { { 0, "ab" }, { 1, "bc" } })); 74 | m = "ABC".Contains(CharComparer.OrdinalIgnoreCase, "abd", "bc", "ab").ToList(); 75 | Assert.That(m, Is.EquivalentTo(new WordMatchList { { 0, "ab" }, { 1, "bc" } })); 76 | m = "ABC".Contains(CharComparer.OrdinalIgnoreCase, new List { "abd", "bc", "ab" }).ToList(); 77 | Assert.That(m, Is.EquivalentTo(new WordMatchList { { 0, "ab" }, { 1, "bc" } })); 78 | } 79 | 80 | [Test] 81 | public void UpperCaseTest() 82 | { 83 | var ac = new AhoCorasick("a", "ab", "bab", "bC", "bca", "c", "caa"); 84 | var m = ac.Search("abCcab").ToList(); 85 | var expected = new WordMatchList { { 0, "a" }, { 0, "ab" }, { 1, "bC" }, { 3, "c" }, { 4, "a" }, { 4, "ab" } }; 86 | Assert.That(m, Is.EquivalentTo(expected)); 87 | } 88 | 89 | [Test] 90 | public void OrdinalIgnoreCaseTest() 91 | { 92 | var ac = new AhoCorasick(CharComparer.OrdinalIgnoreCase, "a", "ab", "bab", "bC", "bca", "c", "caa"); 93 | var m = ac.Search("abCcab").ToList(); 94 | var expected = new WordMatchList { { 0, "a" }, { 0, "ab" }, { 1, "bC" }, { 2, "c" }, { 3, "c" }, { 4, "a" }, { 4, "ab" } }; 95 | Assert.That(m, Is.EquivalentTo(expected)); 96 | } 97 | 98 | [Test] 99 | public void OverloadsTest() 100 | { 101 | var ac = new AhoCorasick(new List { "a" }); 102 | Assert.That(ac.Search("a").ToList(), Is.EquivalentTo(new WordMatchList { { 0, "a" } })); 103 | Assert.That(ac.Search("b"), Is.Empty); 104 | 105 | ac = new AhoCorasick(CharComparer.OrdinalIgnoreCase, new List { "a", "ab", "bab", "bC", "bca", "c", "caa" }); 106 | var m = ac.Search("abCcab").ToList(); 107 | var expected = new WordMatchList { { 0, "a" }, { 0, "ab" }, { 1, "bC" }, { 2, "c" }, { 3, "c" }, { 4, "a" }, { 4, "ab" } }; 
108 | Assert.That(m, Is.EquivalentTo(expected)); 109 | 110 | ac = new AhoCorasick(); 111 | ac.Add("a"); 112 | ac.BuildFail(); 113 | Assert.That(ac.Search("a").ToList(), Is.EquivalentTo(new WordMatchList { { 0, "a" } })); 114 | Assert.That(ac.Search("b"), Is.Empty); 115 | 116 | ac = new AhoCorasick(CharComparer.Create(CultureInfo.InvariantCulture, true), "a", "ab", "bab", "bc", "bca", "c", "caa"); 117 | m = ac.Search("abccab").ToList(); 118 | expected = new WordMatchList { { 0, "a" }, { 0, "ab" }, { 1, "bc" }, { 2, "c" }, { 3, "c" }, { 4, "a" }, { 4, "ab" } }; 119 | Assert.That(m, Is.EquivalentTo(expected)); 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /AhoCorasick.SqlClr/Contains.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Data; 3 | using System.Data.SqlClient; 4 | using System.Data.SqlTypes; 5 | using Microsoft.SqlServer.Server; 6 | using System.Xml.Linq; 7 | using System.Linq; 8 | using Ganss.Text; 9 | using System.Collections; 10 | using System.Collections.Generic; 11 | using System.Security.Cryptography; 12 | using System.Text; 13 | using System.Globalization; 14 | 15 | public partial class UserDefinedFunctions 16 | { 17 | private static AhoCorasick BuildAhoCorasick(SqlXml xml, SqlString culture) 18 | { 19 | var xe = XElement.Load(xml.CreateReader()); 20 | var words = xe.Elements().Select(e => e.FirstAttribute.Value); 21 | var c = culture.Value.Split(':'); 22 | var ignoreCase = c.Length > 1 && c[1] == "i"; 23 | CharComparer cc; 24 | switch (c[0]) 25 | { 26 | case "c": 27 | cc = CharComparer.Create(CultureInfo.CurrentCulture, ignoreCase); 28 | break; 29 | case "n": 30 | cc = CharComparer.Create(CultureInfo.InvariantCulture, ignoreCase); 31 | break; 32 | case "o": 33 | case "": 34 | cc = ignoreCase ? 
CharComparer.OrdinalIgnoreCase : CharComparer.Ordinal; 35 | break; 36 | default: 37 | cc = CharComparer.Create(CultureInfo.GetCultureInfo(c[0]), ignoreCase); 38 | break; 39 | } 40 | var ac = new AhoCorasick(cc, words); 41 | return ac; 42 | } 43 | 44 | [SqlFunction(FillRowMethodName = "FillRow", TableDefinition = @"""Index"" int, Word nvarchar(MAX)", 45 | IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)] 46 | public static IEnumerable ContainsWordsTable(SqlXml xml, SqlString text, SqlString culture) 47 | { 48 | var ac = BuildAhoCorasick(xml, culture); 49 | var matches = ac.Search(text.Value); 50 | 51 | return matches; 52 | } 53 | 54 | [SqlFunction(IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)] 55 | public static bool ContainsWords(SqlXml xml, SqlString text, SqlString culture) 56 | { 57 | return ContainsWordsTable(xml, text, culture).Cast().Any(); 58 | } 59 | 60 | public static void FillRow(object obj, out int index, out SqlString word) 61 | { 62 | var match = (WordMatch)obj; 63 | index = match.Index; 64 | word = match.Word; 65 | } 66 | 67 | [SqlFunction(IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)] 68 | public static string CreateAhoCorasick(SqlXml xml, SqlString culture) 69 | { 70 | var ac = BuildAhoCorasick(xml, culture); 71 | var hash = Hash(xml.Value + culture.Value); 72 | Objects[hash] = ac; 73 | return hash; 74 | } 75 | 76 | [SqlFunction(IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)] 77 | public static bool DeleteAhoCorasick(SqlString obj) 78 | { 79 | Objects.Remove(obj.Value); 80 | return true; 81 | } 82 | 83 | [SqlFunction(IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)] 84 | public static bool 
ClearAhoCorasick() 85 | { 86 | Objects.Clear(); 87 | return true; 88 | } 89 | 90 | [SqlFunction(FillRowMethodName = "FillRow", TableDefinition = @"""Index"" int, Word nvarchar(MAX)", IsDeterministic = true, IsPrecise = true, 91 | DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)] 92 | public static IEnumerable ContainsWordsTableByObject(SqlString text, SqlString obj) 93 | { 94 | var ac = Objects[obj.Value]; 95 | var matches = ac.Search(text.Value); 96 | 97 | return matches; 98 | } 99 | 100 | [SqlFunction(IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)] 101 | public static bool ContainsWordsByObject(SqlString text, SqlString obj) 102 | { 103 | return ContainsWordsTableByObject(text, obj).Cast().Any(); 104 | } 105 | 106 | [SqlFunction(FillRowMethodName = "FillRow", TableDefinition = @"""Index"" int, Word nvarchar(MAX)", IsDeterministic = true, IsPrecise = true, 107 | DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)] 108 | public static IEnumerable ContainsWordsBoundedTableByObject(SqlString text, SqlString obj) 109 | { 110 | var ac = Objects[obj.Value]; 111 | var t = text.Value; 112 | var matches = ac.Search(t).Cast().Where(m => 113 | { 114 | var start = m.Index == 0 || !char.IsLetterOrDigit(t[m.Index - 1]); 115 | var end = (m.Index + m.Word.Length) == t.Length || !char.IsLetterOrDigit(t[m.Index + m.Word.Length]); 116 | return start && end; 117 | }); 118 | 119 | return matches; 120 | } 121 | 122 | [SqlFunction(IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)] 123 | public static bool ContainsWordsBoundedByObject(SqlString text, SqlString obj) 124 | { 125 | return ContainsWordsBoundedTableByObject(text, obj).Cast().Any(); 126 | } 127 | 128 | [SqlFunction(FillRowMethodName = "FillRowList", TableDefinition = @"Hash nvarchar(MAX)", IsDeterministic = true, IsPrecise = 
true,
129 |         DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)]
130 |     public static IEnumerable ListAhoCorasick()
131 |     {
132 |         return Objects.Keys;
133 |     }
134 |
135 |     public static void FillRowList(object obj, out SqlString word)
136 |     {
137 |         word = (string)obj;
138 |     }
139 |
140 |     // Returns the upper-case hexadecimal MD5 digest of the UTF-8 bytes of s.
141 |     // CreateAhoCorasick uses the result as the key under which an automaton is stored in Objects.
142 |     private static string Hash(string s)
143 |     {
144 |         // HashAlgorithm implements IDisposable: dispose it instead of leaking one instance per call.
145 |         using (var md5 = MD5.Create())
146 |         {
147 |             return string.Concat(md5.ComputeHash(Encoding.UTF8.GetBytes(s)).Select(b => b.ToString("X2")));
148 |         }
149 |     }
150 |
151 |     private static readonly Dictionary<string, AhoCorasick> Objects = new Dictionary<string, AhoCorasick>();
152 | }
153 |
--------------------------------------------------------------------------------
/AhoCorasick/AhoCorasick.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 |
3 | namespace Ganss.Text
4 | {
5 |     ///
6 |     /// Represents a word match.
7 |     ///
8 |     public struct WordMatch
9 |     {
10 |         ///
11 |         /// Gets or sets the index of the matched word in the searched text string.
12 |         ///
13 |         ///
14 |         /// The index.
15 |         ///
16 |         public int Index { get; set; }
17 |
18 |         ///
19 |         /// Gets or sets the matched word.
20 |         ///
21 |         ///
22 |         /// The matched word.
23 |         ///
24 |         public string Word { get; set; }
25 |     }
26 |
27 |     ///
28 |     /// Implements the Aho-Corasick algorithm.
29 |     ///
30 |     public class AhoCorasick
31 |     {
32 |         ///
33 |         /// Gets or sets the trie.
34 |         ///
35 |         ///
36 |         /// The trie.
37 |         ///
38 |         protected Trie Trie { get; set; }
39 |
40 |         ///
41 |         /// Initializes a new instance of the class.
42 |         /// Does not build the failure nodes. Call after adding words before calling .
43 |         ///
44 |         public AhoCorasick()
45 |         {
46 |             Trie = new Trie();
47 |         }
48 |
49 |         ///
50 |         /// Initializes a new instance of the class.
51 |         ///
52 |         /// Does not build the failure nodes. Call after adding words before calling .
53 |         /// The comparer used to compare individual characters.
54 | public AhoCorasick(IEqualityComparer comparer) 55 | { 56 | Trie = new Trie(comparer); 57 | } 58 | 59 | /// <summary> 60 | /// Initializes a new instance of the <see cref="AhoCorasick"/> class. 61 | /// </summary> 62 | /// <param name="words">The words to find.</param> 63 | public AhoCorasick(params string[] words) 64 | : this() 65 | { 66 | Add(words); 67 | } 68 | 69 | /// <summary> 70 | /// Initializes a new instance of the <see cref="AhoCorasick"/> class. 71 | /// </summary> 72 | /// <param name="words">The words to find.</param> 73 | public AhoCorasick(IEnumerable words) 74 | : this() 75 | { 76 | Add(words); 77 | } 78 | 79 | /// <summary> 80 | /// Initializes a new instance of the <see cref="AhoCorasick"/> class. 81 | /// </summary> 82 | /// <param name="comparer">The comparer used to compare individual characters.</param> 83 | /// <param name="words">The words to find.</param> 84 | public AhoCorasick(IEqualityComparer comparer, params string[] words) 85 | : this(comparer) 86 | { 87 | Add(words); 88 | } 89 | 90 | /// <summary> 91 | /// Initializes a new instance of the <see cref="AhoCorasick"/> class. 92 | /// </summary> 93 | /// <param name="comparer">The comparer used to compare individual characters.</param> 94 | /// <param name="words">The words to find.</param> 95 | public AhoCorasick(IEqualityComparer comparer, IEnumerable words) 96 | : this(comparer) 97 | { 98 | Add(words); 99 | } 100 | 101 | /// <summary> 102 | /// Adds the specified word. 103 | /// </summary> 104 | /// <remarks>Does not build the failure nodes. Call <see cref="BuildFail"/> after adding words before calling <see cref="Search"/>.</remarks> 105 | /// <param name="word">The word.</param> 106 | public void Add(string word) 107 | { 108 | Trie.Add(word); 109 | } 110 | 111 | /// <summary> 112 | /// Adds the specified words and then calls <see cref="BuildFail"/>. 113 | /// </summary> 114 | /// <param name="words">The words.</param> 115 | public void Add(IEnumerable words) 116 | { 117 | foreach (var word in words) 118 | { 119 | Trie.Add(word); 120 | } 121 | 122 | BuildFail(); 123 | } 124 | 125 | /// <summary> 126 | /// Builds the failure nodes necessary to perform search. 127 | /// </summary> 128 | /// <param name="node">The start node; when null, the root trie is used.</param> 
129 | public void BuildFail(Trie node = null) 130 | { 131 | node ??= Trie; 132 | /* i starts at 1 and advances until ExploreFailLink returns a non-null link; presumably each i tries the suffix of word starting at i - ExploreFailLink is defined in Trie.cs, not visible here. Nodes whose Fail is already set are skipped by the loop condition. */ 133 | var word = node.Word; 134 | for (int i = 1; i < word.Length && node.Fail == null; i++) 135 | node.Fail = Trie.ExploreFailLink(word, i, word.Length); 136 | /* Recurse into every child so each node of the subtree is processed. */ 137 | foreach (var subNode in node.Next.Values) 138 | BuildFail(subNode); 139 | } 140 | 141 | /// <summary> 142 | /// Searches for words in the specified text. 143 | /// </summary> 144 | /// <param name="text">The text to search; a null text yields no matches.</param> 145 | /// <returns>The matched words, yielded lazily in order of the position at which each match ends.</returns> 146 | public virtual IEnumerable Search(string text) 147 | { 148 | if (text == null) yield break; 149 | 150 | var current = Trie; 151 | 152 | for (int i = 0; i < text.Length; i++) 153 | { 154 | var c = text[i]; 155 | // Follow failure links until a node has a transition on c; current ends up null if none does. 156 | while (current != null && !current.Next.ContainsKey(c)) 157 | current = current.Fail; 158 | // No transition found anywhere on the failure chain: restart from the root. 159 | current ??= Trie; 160 | // TryGetValue assigns current even when it fails (to null); the next iteration then resets it to the root. 161 | if (current.Next.TryGetValue(c, out current)) 162 | { 163 | var node = current; 164 | // Report every word ending at position i: the node itself, then each node on its failure chain. 165 | while (node != null) 166 | { 167 | if (node.IsWord) 168 | { 169 | var word = node.Word; 170 | var offset = i + 1 - word.Length; 171 | yield return new WordMatch { Index = offset, Word = word }; 172 | } 173 | 174 | node = node.Fail; 175 | } 176 | } 177 | } 178 | } 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /AhoCorasick/CharComparer.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Globalization; 4 | using System.Linq; 5 | using System.Text; 6 | 7 | namespace Ganss.Text 8 | { 9 | class OrdinalCharComparer : CharComparer 10 | { 11 | private readonly bool _ignoreCase; 12 | 13 | public OrdinalCharComparer(bool ignoreCase = false) 14 | { 15 | _ignoreCase = ignoreCase; 16 | } 17 | // Ordinal equality on the char code; when _ignoreCase is set, both chars are upper-cased with char.ToUpperInvariant first. 18 | public override bool Equals(char x, char y) 19 | { 20 | return _ignoreCase ? 
((uint)char.ToUpperInvariant(x)).Equals(((uint)char.ToUpperInvariant(y))) 21 | : ((uint)x).Equals((uint)y); 22 | } 23 | /* Hash agrees with Equals: the upper-cased char is hashed when case is ignored, the raw char otherwise. */ 24 | public override int GetHashCode(char obj) 25 | { 26 | return _ignoreCase ? (int)char.ToUpperInvariant(obj) : (int)obj; 27 | } 28 | } 29 | /* Culture-aware comparer. The NET40 variant delegates to StringComparer.Create; the other variant uses CultureInfo.CompareInfo with CompareOptions.IgnoreCase or CompareOptions.None. Both compare chars as one-character strings. */ 30 | #if NET40 31 | class CultureCharComparer : CharComparer 32 | { 33 | private readonly StringComparer _stringComparer; 34 | 35 | public CultureCharComparer(CultureInfo cultureInfo, bool ignoreCase = false) 36 | { 37 | _stringComparer = StringComparer.Create(cultureInfo, ignoreCase); 38 | } 39 | 40 | public override bool Equals(char x, char y) 41 | { 42 | return _stringComparer.Equals(x.ToString(), y.ToString()); 43 | } 44 | 45 | public override int GetHashCode(char obj) 46 | { 47 | return _stringComparer.GetHashCode(obj.ToString()); 48 | } 49 | } 50 | #else 51 | class CultureCharComparer: CharComparer 52 | { 53 | private readonly CompareInfo _compareInfo; 54 | private readonly bool _ignoreCase; 55 | 56 | public CultureCharComparer(CultureInfo cultureInfo, bool ignoreCase = false) 57 | { 58 | _compareInfo = cultureInfo.CompareInfo; 59 | _ignoreCase = ignoreCase; 60 | } 61 | 62 | public override bool Equals(char x, char y) 63 | { 64 | return _compareInfo.Compare(x.ToString(), y.ToString(), _ignoreCase ? CompareOptions.IgnoreCase : CompareOptions.None) == 0; 65 | } 66 | 67 | public override int GetHashCode(char obj) 68 | { 69 | return _compareInfo.GetHashCode(obj.ToString(), _ignoreCase ? CompareOptions.IgnoreCase : CompareOptions.None); 70 | } 71 | } 72 | #endif 73 | 74 | /// <summary> 75 | /// Represents a char comparison operation that uses specific case and culture-based or ordinal comparison rules. 76 | /// </summary> 77 | public abstract class CharComparer: EqualityComparer 78 | { 79 | private static readonly CharComparer _ordinalIgnoreCase = new OrdinalCharComparer(ignoreCase: true); 80 | 81 | /// <summary> 82 | /// Gets a <see cref="CharComparer"/> object that performs a case-insensitive ordinal comparison. 83 | /// </summary> 84 | /// <value> 85 | /// A shared <see cref="CharComparer"/> object. 
86 | /// </value> 87 | public static CharComparer OrdinalIgnoreCase 88 | { 89 | get 90 | { 91 | return _ordinalIgnoreCase; 92 | } 93 | } 94 | 95 | private static readonly CharComparer _ordinal = new OrdinalCharComparer(ignoreCase: false); 96 | 97 | /// <summary> 98 | /// Gets a <see cref="CharComparer"/> object that performs a case-sensitive ordinal comparison. 99 | /// </summary> 100 | /// <value> 101 | /// A shared <see cref="CharComparer"/> object. 102 | /// </value> 103 | public static CharComparer Ordinal 104 | { 105 | get 106 | { 107 | return _ordinal; 108 | } 109 | } 110 | 111 | private static readonly CharComparer _invariantCultureIgnoreCase = new CultureCharComparer(CultureInfo.InvariantCulture, ignoreCase: true); 112 | 113 | /// <summary> 114 | /// Gets a <see cref="CharComparer"/> object that performs a case-insensitive comparison using the comparison rules of the invariant culture. 115 | /// </summary> 116 | /// <value> 117 | /// A shared <see cref="CharComparer"/> object. 118 | /// </value> 119 | public static CharComparer InvariantCultureIgnoreCase 120 | { 121 | get 122 | { 123 | return _invariantCultureIgnoreCase; 124 | } 125 | } 126 | 127 | private static readonly CharComparer _invariantCulture = new CultureCharComparer(CultureInfo.InvariantCulture, ignoreCase: false); 128 | 129 | /// <summary> 130 | /// Gets a <see cref="CharComparer"/> object that performs a case-sensitive comparison using the comparison rules of the invariant culture. 131 | /// </summary> 132 | /// <value> 133 | /// A shared <see cref="CharComparer"/> object. 134 | /// </value> 135 | public static CharComparer InvariantCulture 136 | { 137 | get 138 | { 139 | return _invariantCulture; 140 | } 141 | } 142 | 143 | /// <summary> 144 | /// Gets a <see cref="CharComparer"/> object that performs a case-sensitive comparison using the comparison rules of the current culture. 145 | /// </summary> 146 | /// <value> 147 | /// A new <see cref="CharComparer"/> object, created on every access from <see cref="CultureInfo.CurrentCulture"/>. 148 | /// </value> 149 | public static CharComparer CurrentCulture 150 | { 151 | get 152 | { 153 | return new CultureCharComparer(CultureInfo.CurrentCulture, ignoreCase: false); 154 | } 155 | } 156 | 157 | /// <summary> 158 | /// Gets a <see cref="CharComparer"/> object that performs a case-insensitive comparison using the comparison rules of the current culture. 159 | /// </summary> 160 | /// <value> 161 | /// A new <see cref="CharComparer"/> object, created on every access from <see cref="CultureInfo.CurrentCulture"/>. 
162 | /// </value> 163 | public static CharComparer CurrentCultureIgnoreCase 164 | { 165 | get 166 | { 167 | return new CultureCharComparer(CultureInfo.CurrentCulture, ignoreCase: true); 168 | } 169 | } 170 | 171 | /// <summary> 172 | /// Creates a <see cref="CharComparer"/> object that compares characters according to the rules of a specified culture. 173 | /// </summary> 174 | /// <param name="cultureInfo">A culture whose linguistic rules are used to perform a character comparison.</param> 175 | /// <param name="ignoreCase">true to specify that comparison operations be case-insensitive; false to specify that comparison operations be case-sensitive.</param> 176 | /// <returns>A new <see cref="CharComparer"/> object that performs character comparisons according to the comparison rules used by the <paramref name="cultureInfo"/> parameter and the case rule specified by the <paramref name="ignoreCase"/> parameter.</returns> 177 | public static CharComparer Create(CultureInfo cultureInfo, bool ignoreCase) 178 | { 179 | return new CultureCharComparer(cultureInfo, ignoreCase); 180 | } 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AhoCorasick 2 | 3 | [![Version](https://img.shields.io/nuget/v/AhoCorasick.svg)](https://www.nuget.org/packages/AhoCorasick) 4 | [![Build status](https://ci.appveyor.com/api/projects/status/b8lxercfn9spio95/branch/master?svg=true)](https://ci.appveyor.com/project/mganss/ahocorasick/branch/master) 5 | [![Coverage Status](https://coveralls.io/repos/mganss/AhoCorasick/badge.svg?branch=master&service=github)](https://coveralls.io/github/mganss/AhoCorasick?branch=master) 6 | [![netstandard2.0](https://img.shields.io/badge/netstandard-2.0-brightgreen.svg)](https://img.shields.io/badge/netstandard-2.0-brightgreen.svg) 7 | [![net40](https://img.shields.io/badge/net-40-brightgreen.svg)](https://img.shields.io/badge/net-40-brightgreen.svg) 8 | 9 | This is an implementation of the [Aho-Corasick](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm) string matching algorithm for .NET (netstandard2.0 and net40) 
and SQL Server (SQL CLR). Mostly ported from [xudejian/aho-corasick](https://github.com/xudejian/aho-corasick) in CoffeeScript. 10 | 11 | ## Usage 12 | 13 | ```C# 14 | var ac = new AhoCorasick("a", "ab", "bab", "bc", "bca", "c", "caa"); 15 | var results = ac.Search("abccab").ToList(); 16 | 17 | Assert.AreEqual(0, results[0].Index); // index into the searched text 18 | Assert.AreEqual("a", results[0].Word); // matched word 19 | // ... 20 | ``` 21 | 22 | or 23 | 24 | ```C# 25 | var results = "abccab".Contains("a", "ab", "bab", "bc", "bca", "c", "caa").ToList(); 26 | ``` 27 | 28 | ### Custom char comparison 29 | 30 | You can optionally supply an `IEqualityComparer<char>` to perform custom char comparisons when searching for substrings. Several implementations with comparers that mirror `StringComparer` are included. 31 | 32 | ```C# 33 | var results = "AbCcab".Contains(CharComparer.OrdinalIgnoreCase, "a", "ab", "c").ToList(); 34 | ``` 35 | 36 | ## SQL CLR Functions 37 | 38 | There are also several SQL CLR user-defined functions that can be used to perform fast substring matching 39 | in Microsoft SQL Server. To use them: 40 | 41 | 1. Make sure you have [enabled CLR integration](https://msdn.microsoft.com/en-us/library/ms131048.aspx) 42 | 2. Execute [AhoCorasick.SqlClr_Create.sql](AhoCorasick.SqlClr/dist/AhoCorasick.SqlClr_Create.sql) 43 | 44 | For one-off queries, you can use the functions that rebuild the trie on each query, e.g. 45 | 46 | ```SQL 47 | select top(100) * from Posts P 48 | where dbo.ContainsWords((select Word from Words for xml raw, root('root')), P.Body, 'o') = 1 49 | ``` 50 | 51 | The words to match are always supplied as XML where the values are taken from the first attribute of all elements directly beneath the root node. Be careful to select the word column as the only or first column; otherwise you'll end up matching the wrong words. The XML in the example above looks like this: 52 | 53 | ```XML 54 | <root> 55 | <row Word="actor" /> 56 | <row Word="factor" /> 57 | <row Word="factors" /> 58 | ... 
59 | </root> 60 | ``` 61 | 62 | [Here's more](https://www.simple-talk.com/sql/learn-sql-server/using-the-for-xml-clause-to-return-query-results-as-xml/) about FOR XML. 63 | 64 | The last parameter in the function indicates the culture to use since there is no way to use SQL Server collations in SQL CLR code. Values can be: 65 | 66 | |Value|Character comparison| 67 | |-----|--------------------| 68 | |c|Current Culture| 69 | |n|Invariant Culture| 70 | |o or Empty|Ordinal| 71 | |Culture name, e.g. "de-de"|Specific [.NET Culture](https://msdn.microsoft.com/en-us/library/system.globalization.cultureinfo.name.aspx)| 72 | 73 | The culture identifier can be suffixed by `:i` indicating case-insensitive matching. 74 | 75 | ### Static objects 76 | 77 | The function in the example above has the problem that the trie is rebuilt for each query even though the input always stays the same. To overcome this problem, there are a number of functions to manage the creation and destruction of static objects whose handles can be saved in SQL variables. Example: 78 | 79 | ```SQL 80 | declare @ac nvarchar(32); 81 | set @ac = dbo.CreateAhoCorasick((select Word from Words for xml raw, root('root')), 'en-us:i'); 82 | select * from Posts P 83 | where dbo.ContainsWordsByObject(P.Body, @ac) = 1; 84 | ``` 85 | 86 | This is a lot faster than the first example because the trie is created only once and then reused for each row in the query. The handle (@ac) is a hash value generated from the words to match and the culture. The corresponding object is saved in a static dictionary. You can list the currently active objects using `dbo.ListAhoCorasick()`, remove all objects using `dbo.ClearAhoCorasick()` or remove only one object using `dbo.DeleteAhoCorasick(@ac)`. 87 | 88 | ### Getting all matches 89 | 90 | The examples above only checked if the words occurred in the queried texts. 
If you want to get the matched words and the indexes where they occur in the queried texts, you can use the supplied table-valued functions. For example: 91 | 92 | ```SQL 93 | declare @ac nvarchar(32); 94 | set @ac = dbo.CreateAhoCorasick((select Word from Words for xml raw, root('root')), 'o'); 95 | select top(100) * from Posts P 96 | cross apply dbo.ContainsWordsTableByObject(P.Body, @ac) W 97 | ``` 98 | 99 | This will return a table such as this: 100 | 101 | |ID |Body |Index |Word | 102 | |---|---|---|---| 103 | |1 |What factors related...|5|factor| 104 | |1 |What factors related...|6|actor| 105 | |1 |What factors related...|5|factors| 106 | |...| 107 | 108 | ### Word boundaries 109 | 110 | There are also functions that return only matches occurring at word boundaries: `dbo.ContainsWordsBoundedByObject()` and `dbo.ContainsWordsBoundedTableByObject()`. Word boundaries here are essentially the same as [`\b` in regexes](http://www.regular-expressions.info/wordboundaries.html), i.e. matches will occur as if words were specified as `\bword\b`: a match is returned only if the characters directly before and after it (if any) are not letters or digits. 111 | 112 | ### Forcing parallelism 113 | 114 | Although these kinds of queries lend themselves very well to parallel execution, SQL Server tends to overestimate the cost of parallel queries and builds non-parallel plans most of the time where user-defined functions are involved. You can force a parallel plan by using a trace flag (more about this [here](http://sqlblog.com/blogs/paul_white/archive/2011/12/23/forcing-a-parallel-query-execution-plan.aspx)): 115 | 116 | ```SQL 117 | declare @ac nvarchar(32); 118 | set @ac = dbo.CreateAhoCorasick((select Word from Words for xml raw, root('root')), 'en-us:i'); 119 | select * from Posts P 120 | where dbo.ContainsWordsBoundedByObject(P.Body, @ac) = 1 121 | OPTION (RECOMPILE, QUERYTRACEON 8649) 122 | ``` 123 | 124 | Parallel operators are identified by a yellow badge with two arrows in the query plan. 
125 | 126 | ### Performance 127 | 128 | Here's a benchmark searching for ~5000 words (average length 7) in ~250,000 texts (average length ~900): 129 | 130 | |SQL|AhoCorasick| 131 | |---|-----------| 132 | |560s|7s| 133 | 134 | The SQL query used was this: 135 | 136 | ```SQL 137 | select * from Posts P 138 | where exists (select * from Words W where CHARINDEX(W.Word, P.Text) > 0) 139 | ``` 140 | 141 | #### But I can simply use full-text search 142 | 143 | No. The [CONTAINS](https://msdn.microsoft.com/en-us/library/ms187787.aspx) predicate can only search for a single literal or variable at a time. You can't use it in a join or subquery to search for a column value of a table in the query, i.e. this won't work: 144 | 145 | ```SQL 146 | select * from Posts P 147 | where exists (select * from Words W where CONTAINS(P.Text, W.Word)) 148 | ``` 149 | 150 | If you know of a way to make this work using FTS (perhaps using a cursor?) let me know. 151 | -------------------------------------------------------------------------------- /AhoCorasick.SqlClr/dist/AhoCorasick.SqlClr_Create.sql: -------------------------------------------------------------------------------- 1 | PRINT N'Creating [AhoCorasick.SqlClr]...'; 2 | 3 | 4 | GO 5 | CREATE ASSEMBLY [AhoCorasick.SqlClr] 6 | AUTHORIZATION [dbo] 7 | FROM | 9 | 10 | GO 11 | ALTER ASSEMBLY [AhoCorasick.SqlClr] 12 | DROP FILE ALL 13 | ADD FILE FROM hoCorasick.SqlClr.pdb'; 14 | 15 | 16 | GO 17 | PRINT N'Creating [dbo].[ContainsWords]...'; 18 | 19 | 20 | GO 21 | CREATE FUNCTION [dbo].[ContainsWords] 22 | (@xml XML, @text NVARCHAR (MAX), @culture NVARCHAR (MAX)) 23 | RETURNS BIT 24 | AS 25 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWords] 26 | 27 | 28 | GO 29 | PRINT N'Creating [dbo].[CreateAhoCorasick]...'; 30 | 31 | 32 | GO 33 | CREATE FUNCTION [dbo].[CreateAhoCorasick] 34 | (@xml XML, @culture NVARCHAR (MAX)) 35 | RETURNS NVARCHAR (MAX) 36 | AS 37 | EXTERNAL NAME 
[AhoCorasick.SqlClr].[UserDefinedFunctions].[CreateAhoCorasick] 38 | 39 | 40 | GO 41 | PRINT N'Creating [dbo].[DeleteAhoCorasick]...'; 42 | 43 | 44 | GO 45 | CREATE FUNCTION [dbo].[DeleteAhoCorasick] 46 | (@obj NVARCHAR (MAX)) 47 | RETURNS BIT 48 | AS 49 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[DeleteAhoCorasick] 50 | 51 | 52 | GO 53 | PRINT N'Creating [dbo].[ClearAhoCorasick]...'; 54 | 55 | 56 | GO 57 | CREATE FUNCTION [dbo].[ClearAhoCorasick] 58 | ( ) 59 | RETURNS BIT 60 | AS 61 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ClearAhoCorasick] 62 | 63 | 64 | GO 65 | PRINT N'Creating [dbo].[ContainsWordsByObject]...'; 66 | 67 | 68 | GO 69 | CREATE FUNCTION [dbo].[ContainsWordsByObject] 70 | (@text NVARCHAR (MAX), @obj NVARCHAR (MAX)) 71 | RETURNS BIT 72 | AS 73 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWordsByObject] 74 | 75 | 76 | GO 77 | PRINT N'Creating [dbo].[ContainsWordsBoundedByObject]...'; 78 | 79 | 80 | GO 81 | CREATE FUNCTION [dbo].[ContainsWordsBoundedByObject] 82 | (@text NVARCHAR (MAX), @obj NVARCHAR (MAX)) 83 | RETURNS BIT 84 | AS 85 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWordsBoundedByObject] 86 | 87 | 88 | GO 89 | PRINT N'Creating [dbo].[ContainsWordsTable]...'; 90 | 91 | 92 | GO 93 | CREATE FUNCTION [dbo].[ContainsWordsTable] 94 | (@xml XML, @text NVARCHAR (MAX), @culture NVARCHAR (MAX)) 95 | RETURNS 96 | TABLE ( 97 | [Index] INT NULL, 98 | [Word] NVARCHAR (MAX) NULL) 99 | AS 100 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWordsTable] 101 | 102 | 103 | GO 104 | PRINT N'Creating [dbo].[ContainsWordsTableByObject]...'; 105 | 106 | 107 | GO 108 | CREATE FUNCTION [dbo].[ContainsWordsTableByObject] 109 | (@text NVARCHAR (MAX), @obj NVARCHAR (MAX)) 110 | RETURNS 111 | TABLE ( 112 | [Index] INT NULL, 113 | [Word] NVARCHAR (MAX) NULL) 114 | AS 115 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWordsTableByObject] 116 | 
117 | 118 | GO 119 | PRINT N'Creating [dbo].[ContainsWordsBoundedTableByObject]...'; 120 | 121 | 122 | GO 123 | CREATE FUNCTION [dbo].[ContainsWordsBoundedTableByObject] 124 | (@text NVARCHAR (MAX), @obj NVARCHAR (MAX)) 125 | RETURNS 126 | TABLE ( 127 | [Index] INT NULL, 128 | [Word] NVARCHAR (MAX) NULL) 129 | AS 130 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWordsBoundedTableByObject] 131 | 132 | 133 | GO 134 | PRINT N'Creating [dbo].[ListAhoCorasick]...'; 135 | 136 | 137 | GO 138 | CREATE FUNCTION [dbo].[ListAhoCorasick] 139 | ( ) 140 | RETURNS 141 | TABLE ( 142 | [Hash] NVARCHAR (MAX) NULL) 143 | AS 144 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ListAhoCorasick] 145 | 146 | 147 | GO 148 | PRINT N'Update complete.'; 149 | 150 | 151 | GO 152 | --------------------------------------------------------------------------------