├── AhoCorasick
│   ├── key.snk
│   ├── AhoCorasick.csproj
│   ├── Extensions.cs
│   ├── Trie.cs
│   ├── AhoCorasick.cs
│   └── CharComparer.cs
├── .github
│   └── dependabot.yml
├── .nuget
│   └── packages.config
├── LICENSE
├── AhoCorasick.Tests
│   ├── AhoCorasick.Tests.csproj
│   ├── CharComparerTests.cs
│   └── UnitTests.cs
├── AhoCorasick.SqlClr
│   ├── AhoCorasick.SqlClr.sln
│   ├── AhoCorasick.SqlClr.sqlproj
│   ├── Contains.cs
│   └── dist
│       └── AhoCorasick.SqlClr_Create.sql
├── AhoCorasick.sln
├── appveyor.yml
├── .gitignore
└── README.md
/AhoCorasick/key.snk:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mganss/AhoCorasick/HEAD/AhoCorasick/key.snk
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: nuget
4 | directory: "/"
5 | schedule:
6 | interval: daily
7 | time: "04:00"
8 | open-pull-requests-limit: 10
9 |
--------------------------------------------------------------------------------
/.nuget/packages.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Michael Ganss
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/AhoCorasick.Tests/AhoCorasick.Tests.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 | net462;net6.0
4 | Ganss.Text.Tests
5 | true
6 | opencover
7 | ../coverage.xml
8 | [AhoCorasick]*
9 |
10 |
11 |
12 |
13 |
14 |
15 | all
16 | runtime; build; native; contentfiles; analyzers
17 |
18 |
19 |
20 |
21 |
22 | all
23 | runtime; build; native; contentfiles; analyzers; buildtransitive
24 |
25 |
26 |
--------------------------------------------------------------------------------
/AhoCorasick.SqlClr/AhoCorasick.SqlClr.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 2013
4 | VisualStudioVersion = 12.0.31101.0
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{00D1A9C2-B5F0-4AF3-8072-F6C62B433612}") = "AhoCorasick.SqlClr", "AhoCorasick.SqlClr.sqlproj", "{FBC73EEF-133B-492B-9EBB-77E7484F3485}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Release|Any CPU = Release|Any CPU
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {FBC73EEF-133B-492B-9EBB-77E7484F3485}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | {FBC73EEF-133B-492B-9EBB-77E7484F3485}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | {FBC73EEF-133B-492B-9EBB-77E7484F3485}.Debug|Any CPU.Deploy.0 = Debug|Any CPU
17 | {FBC73EEF-133B-492B-9EBB-77E7484F3485}.Release|Any CPU.ActiveCfg = Release|Any CPU
18 | {FBC73EEF-133B-492B-9EBB-77E7484F3485}.Release|Any CPU.Build.0 = Release|Any CPU
19 | {FBC73EEF-133B-492B-9EBB-77E7484F3485}.Release|Any CPU.Deploy.0 = Release|Any CPU
20 | EndGlobalSection
21 | GlobalSection(SolutionProperties) = preSolution
22 | HideSolutionNode = FALSE
23 | EndGlobalSection
24 | EndGlobal
25 |
--------------------------------------------------------------------------------
/AhoCorasick.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.26730.12
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AhoCorasick", "AhoCorasick\AhoCorasick.csproj", "{A1F19E27-06C8-4DB1-94CE-24B4F10330DF}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AhoCorasick.Tests", "AhoCorasick.Tests\AhoCorasick.Tests.csproj", "{860F4149-D7F3-495A-99BE-8AC3FBAE4176}"
9 | EndProject
10 | Global
11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
12 | Debug|Any CPU = Debug|Any CPU
13 | Release|Any CPU = Release|Any CPU
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {A1F19E27-06C8-4DB1-94CE-24B4F10330DF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
17 | {A1F19E27-06C8-4DB1-94CE-24B4F10330DF}.Debug|Any CPU.Build.0 = Debug|Any CPU
18 | {A1F19E27-06C8-4DB1-94CE-24B4F10330DF}.Release|Any CPU.ActiveCfg = Release|Any CPU
19 | {A1F19E27-06C8-4DB1-94CE-24B4F10330DF}.Release|Any CPU.Build.0 = Release|Any CPU
20 | {860F4149-D7F3-495A-99BE-8AC3FBAE4176}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
21 | {860F4149-D7F3-495A-99BE-8AC3FBAE4176}.Debug|Any CPU.Build.0 = Debug|Any CPU
22 | {860F4149-D7F3-495A-99BE-8AC3FBAE4176}.Release|Any CPU.ActiveCfg = Release|Any CPU
23 | {860F4149-D7F3-495A-99BE-8AC3FBAE4176}.Release|Any CPU.Build.0 = Release|Any CPU
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {12FCD39C-9278-4D4D-8438-9A0FCDCA85F3}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
1 | version: 2.0.{build}
2 | skip_tags: true
3 | image: Visual Studio 2022
4 | environment:
5 | access_token:
6 | secure: Eq6BjtZ80BXKLwFMg76IjuQAvbLjbojIF/X/ARouGVhxPneJtgDfCXMPNgJ7KBKq
7 | sonar_token:
8 | secure: W7pHKhuTW6Lh8WlXJNTOIaOzeuxLi+H6Nqmnm4pr28jM6jyIpOZ+1r10lIQi0eCA
9 | JAVA_HOME: C:\Program Files\Java\jdk19
10 | build_script:
11 | - dotnet restore
12 | - dotnet pack --include-symbols --include-source -c Release AhoCorasick
13 | test_script:
14 | - ps: |
15 | if (-not $env:APPVEYOR_PULL_REQUEST_NUMBER) {
16 | dotnet tool install --global dotnet-sonarscanner
17 | dotnet sonarscanner begin /k:"mganss_AhoCorasick" /v:$env:APPVEYOR_BUILD_VERSION /o:"mganss-github" /d:sonar.host.url="https://sonarcloud.io" /d:sonar.login="$env:sonar_token" /d:sonar.cs.opencover.reportsPaths="$($env:APPVEYOR_BUILD_FOLDER)\coverage.xml" /d:sonar.coverage.exclusions="**/Program.cs"
18 | dotnet build
19 | }
20 | - dotnet test /p:CollectCoverage=true AhoCorasick.Tests\AhoCorasick.Tests.csproj /p:Include="[AhoCorasick]*" -f net6.0
21 | - ps: cp coverage.*.xml ./coverage.xml
22 | - ps: |
23 | if (-not $env:APPVEYOR_PULL_REQUEST_NUMBER) {
24 | dotnet sonarscanner end /d:sonar.login="$env:sonar_token"
25 | }
26 | - pip install codecov
27 | - codecov -f "coverage.xml"
28 | artifacts:
29 | - path: 'AhoCorasick\**\*.*nupkg'
30 | deploy:
31 | - provider: GitHub
32 | tag: v$(APPVEYOR_BUILD_VERSION)
33 | release: $(APPVEYOR_BUILD_VERSION)
34 | description: '$(APPVEYOR_REPO_COMMIT_MESSAGE)'
35 | auth_token:
36 | secure: Eq6BjtZ80BXKLwFMg76IjuQAvbLjbojIF/X/ARouGVhxPneJtgDfCXMPNgJ7KBKq
37 | draft: true
38 | on:
39 | branch: master
40 |
--------------------------------------------------------------------------------
/AhoCorasick/AhoCorasick.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 | Ganss.Text
4 | AhoCorasick
5 | Implements the Aho-Corasick string search algorithm
6 | Copyright 2013-$([System.DateTime]::Now.Year) Michael Ganss
7 | 1.0.0
8 | 2.0.0.0
9 | $(AppVeyor_Build_Version).0
10 | $(AppVeyor_Build_Version)
11 | Michael Ganss
12 | net40;netstandard2.0
13 | AhoCorasick
14 | AhoCorasick
15 | aho-corasick;aho;corasick;string;search;match;substring
16 | https://github.com/mganss/AhoCorasick
17 | https://github.com/mganss/AhoCorasick/blob/master/LICENSE
18 | README.md
19 | git
20 | git://github.com/mganss/AhoCorasick
21 | Ganss.Text
22 | true
23 | bin\$(Configuration)\$(TargetFramework)\AhoCorasick.xml
24 | true
25 | key.snk
26 | snupkg
27 | true
28 | true
29 | true
30 | snupkg
31 | latest
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/AhoCorasick/Extensions.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 |
3 | namespace Ganss.Text
4 | {
5 | /// <summary>
6 | /// Provides string extension methods that search a text with the Aho-Corasick algorithm.
7 | /// </summary>
8 | public static class Extensions
9 | {
10 |     /// <summary>
11 |     /// Finds the specified words in this text; a new <see cref="AhoCorasick"/> instance is built on every call.
12 |     /// </summary>
13 |     /// <param name="text">The text.</param>
14 |     /// <param name="words">The words.</param>
15 |     /// <returns>The matched words.</returns>
16 |     public static IEnumerable<WordMatch> Contains(this string text, IEnumerable<string> words)
17 |     {
18 |         return new AhoCorasick(words).Search(text);
19 |     }
20 |
21 |     /// <summary>
22 |     /// Finds the specified words in this text; a new <see cref="AhoCorasick"/> instance is built on every call.
23 |     /// </summary>
24 |     /// <param name="text">The text.</param>
25 |     /// <param name="words">The words.</param>
26 |     /// <returns>The matched words.</returns>
27 |     public static IEnumerable<WordMatch> Contains(this string text, params string[] words)
28 |     {
29 |         return new AhoCorasick(words).Search(text);
30 |     }
31 |
32 |     /// <summary>
33 |     /// Finds the specified words in this text; a new <see cref="AhoCorasick"/> instance is built on every call.
34 |     /// </summary>
35 |     /// <param name="text">The text.</param>
36 |     /// <param name="comparer">The comparer used to compare individual characters.</param>
37 |     /// <param name="words">The words.</param>
38 |     /// <returns>The matched words.</returns>
39 |     public static IEnumerable<WordMatch> Contains(this string text, IEqualityComparer<char> comparer, IEnumerable<string> words)
40 |     {
41 |         return new AhoCorasick(comparer, words).Search(text);
42 |     }
43 |
44 |     /// <summary>
45 |     /// Finds the specified words in this text; a new <see cref="AhoCorasick"/> instance is built on every call.
46 |     /// </summary>
47 |     /// <param name="text">The text.</param>
48 |     /// <param name="comparer">The comparer used to compare individual characters.</param>
49 |     /// <param name="words">The words.</param>
50 |     /// <returns>The matched words.</returns>
51 |     public static IEnumerable<WordMatch> Contains(this string text, IEqualityComparer<char> comparer, params string[] words)
52 |     {
53 |         return new AhoCorasick(comparer, words).Search(text);
54 |     }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 |
4 | # User-specific files
5 | *.suo
6 | *.user
7 | *.sln.docstates
8 |
9 | # Build results
10 |
11 | [Dd]ebug/
12 | [Rr]elease/
13 | x64/
14 | build/
15 | [Bb]in/
16 | [Oo]bj/
17 |
18 | # MSTest test Results
19 | [Tt]est[Rr]esult*/
20 | [Bb]uild[Ll]og.*
21 |
22 | *_i.c
23 | *_p.c
24 | *.ilk
25 | *.meta
26 | *.obj
27 | *.pch
28 | *.pdb
29 | *.pgc
30 | *.pgd
31 | *.rsp
32 | *.sbr
33 | *.tlb
34 | *.tli
35 | *.tlh
36 | *.tmp
37 | *.tmp_proj
38 | *.log
39 | *.vspscc
40 | *.vssscc
41 | .builds
42 | *.pidb
43 | *.log
44 | *.scc
45 |
46 | # Visual C++ cache files
47 | ipch/
48 | *.aps
49 | *.ncb
50 | *.opensdf
51 | *.sdf
52 | *.cachefile
53 |
54 | # Visual Studio profiler
55 | *.psess
56 | *.vsp
57 | *.vspx
58 |
59 | # Guidance Automation Toolkit
60 | *.gpState
61 |
62 | # ReSharper is a .NET coding add-in
63 | _ReSharper*/
64 | *.[Rr]e[Ss]harper
65 |
66 | # TeamCity is a build add-in
67 | _TeamCity*
68 |
69 | # DotCover is a Code Coverage Tool
70 | *.dotCover
71 |
72 | # NCrunch
73 | *.ncrunch*
74 | .*crunch*.local.xml
75 |
76 | # Installshield output folder
77 | [Ee]xpress/
78 |
79 | # DocProject is a documentation generator add-in
80 | DocProject/buildhelp/
81 | DocProject/Help/*.HxT
82 | DocProject/Help/*.HxC
83 | DocProject/Help/*.hhc
84 | DocProject/Help/*.hhk
85 | DocProject/Help/*.hhp
86 | DocProject/Help/Html2
87 | DocProject/Help/html
88 |
89 | # Click-Once directory
90 | publish/
91 |
92 | # Publish Web Output
93 | *.Publish.xml
94 | *.pubxml
95 |
96 | # NuGet Packages Directory
97 | ## Ignored on the assumption that NuGet Package Restore is enabled (the rule on the next line is already active)
98 | packages/
99 |
100 | # Windows Azure Build Output
101 | csx
102 | *.build.csdef
103 |
104 | # Windows Store app package directory
105 | AppPackages/
106 |
107 | # Others
108 | sql/
109 | *.Cache
110 | ClientBin/
111 | [Ss]tyle[Cc]op.*
112 | ~$*
113 | *~
114 | *.dbmdl
115 | *.[Pp]ublish.xml
116 | *.pfx
117 | *.publishsettings
118 |
119 | # RIA/Silverlight projects
120 | Generated_Code/
121 |
122 | # Backup & report files from converting an old project file to a newer
123 | # Visual Studio version. Backup files are not needed, because we have git ;-)
124 | _UpgradeReport_Files/
125 | Backup*/
126 | UpgradeLog*.XML
127 | UpgradeLog*.htm
128 |
129 | # SQL Server files
130 | App_Data/*.mdf
131 | App_Data/*.ldf
132 |
133 | # =========================
134 | # Windows detritus
135 | # =========================
136 |
137 | # Windows image file caches
138 | Thumbs.db
139 | ehthumbs.db
140 |
141 | # Folder config file
142 | Desktop.ini
143 |
144 | # Recycle Bin used on file shares
145 | $RECYCLE.BIN/
146 |
147 | # Mac crap
148 | .DS_Store
149 |
150 | *.nupkg
151 | TestResult.xml
152 | coverage.xml
153 | .vs/
154 | /OpenCover
155 |
--------------------------------------------------------------------------------
/AhoCorasick.Tests/CharComparerTests.cs:
--------------------------------------------------------------------------------
1 | using NUnit.Framework;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Globalization;
5 | using System.Linq;
6 | using System.Text;
7 | using System.Threading;
8 | using System.Threading.Tasks;
9 |
10 | namespace Ganss.Text.Tests
11 | {
12 | public class CharComparerTests
13 | {
14 |     const char SmallDotlessI = '\u0131';
15 |     const char CapitalIWithDot = '\u0130';
16 |     const char CapitalSharpS = '\u1e9e';
17 |     const char LatinSmallCapitalR = '\u0280';
18 |     const char LatinLetterYR = '\u01a6';
19 |
20 |     [Test]
21 |     public void OrdinalTest()
22 |     {
23 |         var c = CharComparer.Ordinal;
24 |         Assert.That(c.Equals('i', 'i'), Is.True);
25 |         Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False);
26 |         Assert.That(c.Equals('ß', CapitalSharpS), Is.False);
27 |         Assert.That(c.Equals(LatinSmallCapitalR, LatinLetterYR), Is.False);
28 |
29 |         c = CharComparer.OrdinalIgnoreCase;
30 |         Assert.That(c.Equals('i', 'I'), Is.True);
31 |         Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False);
32 |         Assert.That(c.Equals(SmallDotlessI, 'I'), Is.False);
33 |         Assert.That(c.Equals('ß', CapitalSharpS), Is.False);
34 |         Assert.That(c.Equals(LatinSmallCapitalR, LatinLetterYR), Is.True);
35 |     }
36 |
37 |     [Test]
38 |     public void InvariantTest()
39 |     {
40 |         var c = CharComparer.InvariantCulture;
41 |         Assert.That(c.Equals('i', 'i'), Is.True);
42 |         Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False);
43 |         Assert.That(c.Equals('ß', CapitalSharpS), Is.False);
44 |         Assert.That(c.Equals(LatinSmallCapitalR, LatinLetterYR), Is.False);
45 |
46 |         c = CharComparer.InvariantCultureIgnoreCase;
47 |         Assert.That(c.Equals('i', 'I'), Is.True);
48 |         Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False);
49 |         Assert.That(c.Equals(SmallDotlessI, 'I'), Is.False);
50 |         Assert.That(c.Equals('ß', CapitalSharpS), Is.True);
51 |     }
52 |
53 |     [Test, SetCulture("tr-TR")]
54 |     public void CultureTest()
55 |     {
56 |         // [SetCulture] applies tr-TR to this test only; NUnit restores the previous culture afterwards, so other tests are unaffected.
57 |         var c = CharComparer.CurrentCulture;
58 |         Assert.That(c.Equals('i', 'i'), Is.True);
59 |         Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False);
60 |         Assert.That(c.Equals('ß', CapitalSharpS), Is.False);
61 |         Assert.That(c.Equals(LatinSmallCapitalR, LatinLetterYR), Is.False);
62 |
63 |         c = CharComparer.CurrentCultureIgnoreCase;
64 |         Assert.That(c.Equals('i', 'I'), Is.False);
65 |         Assert.That(c.Equals(SmallDotlessI, 'i'), Is.False);
66 |         Assert.That(c.Equals(SmallDotlessI, 'I'), Is.True);
67 |         Assert.That(c.Equals('i', CapitalIWithDot), Is.True);
68 |         Assert.That(c.Equals('ß', CapitalSharpS), Is.True);
69 |
70 |         Assert.That(c.GetHashCode('i'), Is.EqualTo(c.GetHashCode(CapitalIWithDot)));
71 |         Assert.That(c.GetHashCode(SmallDotlessI), Is.EqualTo(c.GetHashCode('I')));
72 |     }
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/AhoCorasick/Trie.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 |
3 | namespace Ganss.Text
4 | {
5 | /// <summary>
6 | /// A trie node; a freshly constructed node represents the empty prefix and child nodes extend it by one character.
7 | /// </summary>
8 | public class Trie
9 | {
10 |     /// <summary>
11 |     /// Gets or sets the child nodes.
12 |     /// </summary>
13 |     /// <value>
14 |     /// The child nodes, keyed by the character that leads to them.
15 |     /// </value>
16 |     public Dictionary<char, Trie> Next { get; set; }
17 |
18 |     /// <summary>
19 |     /// Gets or sets a value indicating whether this instance represents a word in the dictionary.
20 |     /// </summary>
21 |     /// <value>
22 |     /// <c>true</c> if this instance is a word in the dictionary; otherwise, <c>false</c>.
23 |     /// </value>
24 |     public bool IsWord { get; set; }
25 |
26 |     /// <summary>
27 |     /// Gets or sets the failure node.
28 |     /// </summary>
29 |     /// <value>
30 |     /// The failure node.
31 |     /// </value>
32 |     public Trie Fail { get; set; }
33 |
34 |     /// <summary>
35 |     /// Gets or sets the parent node.
36 |     /// </summary>
37 |     /// <value>
38 |     /// The parent node.
39 |     /// </value>
40 |     public Trie Parent { get; set; }
41 |
42 |     /// <summary>
43 |     /// Gets the word prefix this node represents.
44 |     /// </summary>
45 |     /// <value>
46 |     /// The word prefix.
47 |     /// </value>
48 |     public string Word { get; private set; }
49 |
50 |     /// <summary>
51 |     /// Initializes a new instance of the <see cref="Trie"/> class.
52 |     /// </summary>
53 |     public Trie()
54 |     {
55 |         Word = "";
56 |         Next = new Dictionary<char, Trie>();
57 |     }
58 |
59 |     /// <summary>
60 |     /// Initializes a new instance of the <see cref="Trie"/> class.
61 |     /// </summary>
62 |     /// <param name="comparer">The comparer used to compare individual characters.</param>
63 |     public Trie(IEqualityComparer<char> comparer)
64 |     {
65 |         Word = "";
66 |         Next = new Dictionary<char, Trie>(comparer);
67 |     }
68 |
69 |     /// <summary>Adds the specified word to the trie, creating any missing nodes along its path.</summary>
70 |     /// <param name="word">The word; must not be null or empty.</param>
71 |     /// <returns>The node that represents the complete word.</returns>
72 |     /// <exception cref="System.ArgumentException"><paramref name="word"/> is null or empty.</exception>
73 |     /// <remarks>Walks the word iteratively so that no intermediate substrings are allocated.</remarks>
74 |     public virtual Trie Add(string word)
75 |     {
76 |         if (string.IsNullOrEmpty(word))
77 |             throw new System.ArgumentException("Word must not be null or empty.", nameof(word));
78 |         var node = this;
79 |         foreach (var c in word)
80 |         {
81 |             if (!node.Next.TryGetValue(c, out Trie child))
82 |                 node.Next[c] = child = new Trie(node.Next.Comparer) { Parent = node, Word = node.Word + c };
83 |             node = child;
84 |         }
85 |         node.IsWord = true;
86 |         return node;
87 |     }
88 |
89 |     /// <summary>
90 |     /// Finds the failure node for a specified suffix within the given range of indices.
91 |     /// </summary>
92 |     /// <param name="word">The string containing the suffix.</param>
93 |     /// <param name="startIndex">The start index of the suffix within the string.</param>
94 |     /// <param name="endIndex">The end index (exclusive) of the suffix within the string.</param>
95 |     /// <returns>The failure node or null if no failure node is found.</returns>
96 |     /// <remarks>Follows the characters from startIndex up to endIndex downwards from this node, one child per character.</remarks>
97 |     public virtual Trie ExploreFailLink(string word, int startIndex, int endIndex)
98 |     {
99 |         var node = this;
100 |
101 |         for (int i = startIndex; i < endIndex; i++)
102 |         {
103 |             if (!node.Next.TryGetValue(word[i], out node))
104 |             {
105 |                 return null; // the path breaks here, so the suffix is not present below this node
106 |             }
107 |         }
108 |
109 |         return node;
110 |     }
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
/AhoCorasick.SqlClr/AhoCorasick.SqlClr.sqlproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | AnyCPU
7 | AhoCorasick.SqlClr
8 | 2.0
9 | 4.1
10 | {fbc73eef-133b-492b-9ebb-77e7484f3485}
11 | Microsoft.Data.Tools.Schema.Sql.Sql120DatabaseSchemaProvider
12 | Database
13 |
14 |
15 | AhoCorasick.SqlClr
16 | AhoCorasick.SqlClr
17 | 1033, CI
18 | BySchemaAndSchemaType
19 | True
20 | v4.0
21 | CS
22 | Properties
23 | False
24 | True
25 | True
26 | True
27 |
28 |
29 | bin\Release\
30 | $(MSBuildProjectName).sql
31 | False
32 | pdbonly
33 | true
34 | false
35 | true
36 | prompt
37 | 4
38 | NET40
39 |
40 |
41 | bin\Debug\
42 | $(MSBuildProjectName).sql
43 | false
44 | true
45 | full
46 | false
47 | true
48 | true
49 | prompt
50 | 4
51 | NET40
52 |
53 |
54 | 11.0
55 |
56 | True
57 | 11.0
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 | AhoCorasick.cs
67 |
68 |
69 | CharComparer.cs
70 |
71 |
72 | Trie.cs
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
--------------------------------------------------------------------------------
/AhoCorasick.Tests/UnitTests.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Linq;
3 | using Ganss.Text;
4 | using System.Collections.Generic;
5 | using NUnit.Framework;
6 | using System.Globalization;
7 |
8 | namespace Ganss.Text.Tests
9 | {
10 | public class WordMatchList : List<WordMatch> // test helper: a list of expected matches
11 | {
12 |     public void Add(int index, string word) // lets collection initializers use { index, "word" } entries
13 |     {
14 |         Add(new WordMatch { Index = index, Word = word });
15 |     }
16 | }
17 |
18 | public class UnitTests
19 | {
20 |     [Test]
21 |     public void SearchWikipediaTest()
22 |     {
23 |         // from https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
24 |         var ac = new AhoCorasick("a", "ab", "bab", "bc", "bca", "c", "caa");
25 |         var m = ac.Search("abccab").ToList();
26 |         var expected = new WordMatchList { { 0, "a" }, { 0, "ab" }, { 1, "bc" }, { 2, "c" }, { 3, "c" }, { 4, "a" }, { 4, "ab" } };
27 |         Assert.That(m, Is.EquivalentTo(expected));
28 |     }
29 |
30 |     [Test]
31 |     public void SimpleTest()
32 |     {
33 |         var ac = new AhoCorasick("a");
34 |         Assert.That(ac.Search("a").ToList(), Is.EquivalentTo(new WordMatchList { { 0, "a" } }));
35 |         Assert.That(ac.Search("b"), Is.Empty);
36 |     }
37 |
38 |     [Test]
39 |     public void SearchNullEmptyTest()
40 |     {
41 |         var ac = new AhoCorasick("a");
42 |         var m = ac.Search(null).ToList();
43 |         Assert.That(m, Is.Empty);
44 |         m = ac.Search("").ToList();
45 |         Assert.That(m, Is.Empty);
46 |     }
47 |
48 |     [Test]
49 |     public void SearchMultipleTest()
50 |     {
51 |         var ac = new AhoCorasick("her", "their", "eye", "iris", "he", "is", "si");
52 |         var m = ac.Search("theye iris irisis").ToList();
53 |         var expected = new WordMatchList { { 1, "he" }, { 2, "eye" }, { 6, "iris" }, { 8, "is" }, { 11, "iris" }, { 13, "is" }, { 14, "si" }, { 15, "is" } };
54 |         Assert.That(m, Is.EquivalentTo(expected));
55 |     }
56 |
57 |     [Test]
58 |     public void SearchIvankTest()
59 |     {
60 |         // from http://blog.ivank.net/aho-corasick-algorithm-in-as3.html
61 |         var ac = new AhoCorasick("take", "fast", "sofa");
62 |         var m = ac.Search("takeso fasofast fassofatake sosso sofastake so").ToList();
63 |         var expected = new WordMatchList { { 0, "take" }, { 9, "sofa" }, { 11, "fast" }, { 19, "sofa" }, { 23, "take" }, { 34, "sofa" }, { 36, "fast" }, { 39, "take" } };
64 |         Assert.That(m, Is.EquivalentTo(expected));
65 |     }
66 |
67 |     [Test]
68 |     public void StringExtensionTest()
69 |     {
70 |         var m = "abc".Contains("abd", "bc", "ab").ToList();
71 |         Assert.That(m, Is.EquivalentTo(new WordMatchList { { 0, "ab" }, { 1, "bc" } }));
72 |         m = "abc".Contains(new List<string> { "abd", "bc", "ab" }).ToList();
73 |         Assert.That(m, Is.EquivalentTo(new WordMatchList { { 0, "ab" }, { 1, "bc" } }));
74 |         m = "ABC".Contains(CharComparer.OrdinalIgnoreCase, "abd", "bc", "ab").ToList();
75 |         Assert.That(m, Is.EquivalentTo(new WordMatchList { { 0, "ab" }, { 1, "bc" } }));
76 |         m = "ABC".Contains(CharComparer.OrdinalIgnoreCase, new List<string> { "abd", "bc", "ab" }).ToList();
77 |         Assert.That(m, Is.EquivalentTo(new WordMatchList { { 0, "ab" }, { 1, "bc" } }));
78 |     }
79 |
80 |     [Test]
81 |     public void UpperCaseTest()
82 |     {
83 |         var ac = new AhoCorasick("a", "ab", "bab", "bC", "bca", "c", "caa");
84 |         var m = ac.Search("abCcab").ToList();
85 |         var expected = new WordMatchList { { 0, "a" }, { 0, "ab" }, { 1, "bC" }, { 3, "c" }, { 4, "a" }, { 4, "ab" } };
86 |         Assert.That(m, Is.EquivalentTo(expected));
87 |     }
88 |
89 |     [Test]
90 |     public void OrdinalIgnoreCaseTest()
91 |     {
92 |         var ac = new AhoCorasick(CharComparer.OrdinalIgnoreCase, "a", "ab", "bab", "bC", "bca", "c", "caa");
93 |         var m = ac.Search("abCcab").ToList();
94 |         var expected = new WordMatchList { { 0, "a" }, { 0, "ab" }, { 1, "bC" }, { 2, "c" }, { 3, "c" }, { 4, "a" }, { 4, "ab" } };
95 |         Assert.That(m, Is.EquivalentTo(expected));
96 |     }
97 |
98 |     [Test]
99 |     public void OverloadsTest()
100 |     {
101 |         var ac = new AhoCorasick(new List<string> { "a" });
102 |         Assert.That(ac.Search("a").ToList(), Is.EquivalentTo(new WordMatchList { { 0, "a" } }));
103 |         Assert.That(ac.Search("b"), Is.Empty);
104 |
105 |         ac = new AhoCorasick(CharComparer.OrdinalIgnoreCase, new List<string> { "a", "ab", "bab", "bC", "bca", "c", "caa" });
106 |         var m = ac.Search("abCcab").ToList();
107 |         var expected = new WordMatchList { { 0, "a" }, { 0, "ab" }, { 1, "bC" }, { 2, "c" }, { 3, "c" }, { 4, "a" }, { 4, "ab" } };
108 |         Assert.That(m, Is.EquivalentTo(expected));
109 |
110 |         ac = new AhoCorasick();
111 |         ac.Add("a");
112 |         ac.BuildFail(); // the parameterless constructor does not build failure nodes, so build them before searching
113 |         Assert.That(ac.Search("a").ToList(), Is.EquivalentTo(new WordMatchList { { 0, "a" } }));
114 |         Assert.That(ac.Search("b"), Is.Empty);
115 |
116 |         ac = new AhoCorasick(CharComparer.Create(CultureInfo.InvariantCulture, true), "a", "ab", "bab", "bc", "bca", "c", "caa");
117 |         m = ac.Search("abccab").ToList();
118 |         expected = new WordMatchList { { 0, "a" }, { 0, "ab" }, { 1, "bc" }, { 2, "c" }, { 3, "c" }, { 4, "a" }, { 4, "ab" } };
119 |         Assert.That(m, Is.EquivalentTo(expected));
120 |     }
121 | }
122 | }
123 |
--------------------------------------------------------------------------------
/AhoCorasick.SqlClr/Contains.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Data;
3 | using System.Data.SqlClient;
4 | using System.Data.SqlTypes;
5 | using Microsoft.SqlServer.Server;
6 | using System.Xml.Linq;
7 | using System.Linq;
8 | using Ganss.Text;
9 | using System.Collections;
10 | using System.Collections.Generic;
11 | using System.Security.Cryptography;
12 | using System.Text;
13 | using System.Globalization;
14 |
15 | public partial class UserDefinedFunctions
16 | {
17 |     private static AhoCorasick BuildAhoCorasick(SqlXml xml, SqlString culture)
18 |     {
19 |         var xe = XElement.Load(xml.CreateReader());
20 |         var words = xe.Elements().Select(e => e.FirstAttribute.Value); // each child element carries its word in its first attribute
21 |         var c = culture.Value.Split(':'); // "<comparer>[:i]" - an "i" after the colon requests case-insensitive matching
22 |         var ignoreCase = c.Length > 1 && c[1] == "i";
23 |         CharComparer cc;
24 |         switch (c[0])
25 |         {
26 |             case "c": // current culture
27 |                 cc = CharComparer.Create(CultureInfo.CurrentCulture, ignoreCase);
28 |                 break;
29 |             case "n": // invariant culture
30 |                 cc = CharComparer.Create(CultureInfo.InvariantCulture, ignoreCase);
31 |                 break;
32 |             case "o": // ordinal, also the default for an empty specifier
33 |             case "":
34 |                 cc = ignoreCase ? CharComparer.OrdinalIgnoreCase : CharComparer.Ordinal;
35 |                 break;
36 |             default: // anything else is treated as a culture name
37 |                 cc = CharComparer.Create(CultureInfo.GetCultureInfo(c[0]), ignoreCase);
38 |                 break;
39 |         }
40 |         var ac = new AhoCorasick(cc, words);
41 |         return ac;
42 |     }
43 |
44 |     [SqlFunction(FillRowMethodName = "FillRow", TableDefinition = @"""Index"" int, Word nvarchar(MAX)",
45 |         IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)]
46 |     public static IEnumerable ContainsWordsTable(SqlXml xml, SqlString text, SqlString culture)
47 |     {
48 |         var ac = BuildAhoCorasick(xml, culture);
49 |         var matches = ac.Search(text.Value);
50 |
51 |         return matches;
52 |     }
53 |
54 |     [SqlFunction(IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)]
55 |     public static bool ContainsWords(SqlXml xml, SqlString text, SqlString culture)
56 |     {
57 |         return ContainsWordsTable(xml, text, culture).Cast<WordMatch>().Any();
58 |     }
59 |
60 |     public static void FillRow(object obj, out int index, out SqlString word)
61 |     {
62 |         var match = (WordMatch)obj;
63 |         index = match.Index;
64 |         word = match.Word;
65 |     }
66 |
67 |     [SqlFunction(IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)]
68 |     public static string CreateAhoCorasick(SqlXml xml, SqlString culture)
69 |     {
70 |         var ac = BuildAhoCorasick(xml, culture);
71 |         var hash = Hash(xml.Value + culture.Value); // the handle is derived from the inputs, so identical inputs map to the same entry
72 |         Objects[hash] = ac;
73 |         return hash;
74 |     }
75 |
76 |     [SqlFunction(IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)]
77 |     public static bool DeleteAhoCorasick(SqlString obj)
78 |     {
79 |         Objects.Remove(obj.Value);
80 |         return true;
81 |     }
82 |
83 |     [SqlFunction(IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)]
84 |     public static bool ClearAhoCorasick()
85 |     {
86 |         Objects.Clear();
87 |         return true;
88 |     }
89 |
90 |     [SqlFunction(FillRowMethodName = "FillRow", TableDefinition = @"""Index"" int, Word nvarchar(MAX)", IsDeterministic = true, IsPrecise = true,
91 |         DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)]
92 |     public static IEnumerable ContainsWordsTableByObject(SqlString text, SqlString obj)
93 |     {
94 |         var ac = Objects[obj.Value]; // throws KeyNotFoundException when the handle was never created or has been deleted
95 |         var matches = ac.Search(text.Value);
96 |
97 |         return matches;
98 |     }
99 |
100 |     [SqlFunction(IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)]
101 |     public static bool ContainsWordsByObject(SqlString text, SqlString obj)
102 |     {
103 |         return ContainsWordsTableByObject(text, obj).Cast<WordMatch>().Any();
104 |     }
105 |
106 |     [SqlFunction(FillRowMethodName = "FillRow", TableDefinition = @"""Index"" int, Word nvarchar(MAX)", IsDeterministic = true, IsPrecise = true,
107 |         DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)]
108 |     public static IEnumerable ContainsWordsBoundedTableByObject(SqlString text, SqlString obj)
109 |     {
110 |         var ac = Objects[obj.Value];
111 |         var t = text.Value;
112 |         var matches = ac.Search(t).Cast<WordMatch>().Where(m =>
113 |         {
114 |             var start = m.Index == 0 || !char.IsLetterOrDigit(t[m.Index - 1]); // no letter or digit directly before the match
115 |             var end = (m.Index + m.Word.Length) == t.Length || !char.IsLetterOrDigit(t[m.Index + m.Word.Length]); // nor directly after it
116 |             return start && end;
117 |         });
118 |
119 |         return matches;
120 |     }
121 |
122 |     [SqlFunction(IsDeterministic = true, IsPrecise = true, DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)]
123 |     public static bool ContainsWordsBoundedByObject(SqlString text, SqlString obj)
124 |     {
125 |         return ContainsWordsBoundedTableByObject(text, obj).Cast<WordMatch>().Any();
126 |     }
127 |
128 |     [SqlFunction(FillRowMethodName = "FillRowList", TableDefinition = @"Hash nvarchar(MAX)", IsDeterministic = true, IsPrecise = true,
129 |         DataAccess = DataAccessKind.None, SystemDataAccess = SystemDataAccessKind.None)]
130 |     public static IEnumerable ListAhoCorasick()
131 |     {
132 |         return Objects.Keys;
133 |     }
134 |
135 |     public static void FillRowList(object obj, out SqlString word)
136 |     {
137 |         word = (string)obj;
138 |     }
139 |
140 |     private static string Hash(string s)
141 |     {
142 |         using (var md5 = MD5.Create()) // MD5 only derives a lookup key here; dispose the algorithm once the digest is computed
143 |             return string.Concat(md5.ComputeHash(Encoding.UTF8.GetBytes(s)).Select(b => b.ToString("X2")));
144 |     }
145 |     private static readonly Dictionary<string, AhoCorasick> Objects = new Dictionary<string, AhoCorasick>(); // NOTE(review): plain Dictionary shared by all callers - confirm concurrent access cannot occur
146 | }
147 |
--------------------------------------------------------------------------------
/AhoCorasick/AhoCorasick.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 |
3 | namespace Ganss.Text
4 | {
5 | /// <summary>
6 | /// Represents a word match.
7 | /// </summary>
8 | public struct WordMatch
9 | {
10 | /// <summary>
11 | /// Gets or sets the index of the matched word in the searched text string.
12 | /// </summary>
13 | /// <value>
14 | /// The zero-based index at which the matched word starts.
15 | /// </value>
16 | public int Index { get; set; }
17 |
18 | /// <summary>
19 | /// Gets or sets the matched word.
20 | /// </summary>
21 | /// <value>
22 | /// The matched word.
23 | /// </value>
24 | public string Word { get; set; }
25 | }
26 |
27 | /// <summary>
28 | /// Implements the Aho-Corasick algorithm.
29 | /// </summary>
30 | public class AhoCorasick
31 | {
32 | /// <summary>
33 | /// Gets or sets the trie.
34 | /// </summary>
35 | /// <value>
36 | /// The trie.
37 | /// </value>
38 | protected Trie Trie { get; set; }
39 |
40 | /// <summary>
41 | /// Initializes a new instance of the <see cref="AhoCorasick"/> class.
42 | /// Does not build the failure nodes. Call <see cref="BuildFail"/> after adding words before calling <see cref="Search"/>.
43 | /// </summary>
44 | public AhoCorasick()
45 | {
46 | Trie = new Trie();
47 | }
48 |
49 | /// <summary>
50 | /// Initializes a new instance of the <see cref="AhoCorasick"/> class.
51 | /// </summary>
52 | /// <remarks>Does not build the failure nodes. Call <see cref="BuildFail"/> after adding words before calling <see cref="Search"/>.</remarks>
53 | /// <param name="comparer">The comparer used to compare individual characters.</param>
54 | public AhoCorasick(IEqualityComparer comparer) // NOTE(review): type argument looks stripped in this listing; presumably IEqualityComparer<char> - confirm.
55 | {
56 | Trie = new Trie(comparer);
57 | }
58 |
59 | /// <summary>
60 | /// Initializes a new instance of the <see cref="AhoCorasick"/> class.
61 | /// </summary>
62 | /// <param name="words">The words to find.</param>
63 | public AhoCorasick(params string[] words)
64 | : this()
65 | {
66 | Add(words); // An array argument binds to the sequence overload of Add, which also calls BuildFail.
67 | }
68 |
69 | /// <summary>
70 | /// Initializes a new instance of the <see cref="AhoCorasick"/> class.
71 | /// </summary>
72 | /// <param name="words">The words to find.</param>
73 | public AhoCorasick(IEnumerable words) // NOTE(review): type argument looks stripped in this listing; presumably IEnumerable<string> - confirm.
74 | : this()
75 | {
76 | Add(words);
77 | }
78 |
79 | /// <summary>
80 | /// Initializes a new instance of the <see cref="AhoCorasick"/> class.
81 | /// </summary>
82 | /// <param name="comparer">The comparer used to compare individual characters.</param>
83 | /// <param name="words">The words to find.</param>
84 | public AhoCorasick(IEqualityComparer comparer, params string[] words) // NOTE(review): presumably IEqualityComparer<char> - confirm.
85 | : this(comparer)
86 | {
87 | Add(words);
88 | }
89 |
90 | /// <summary>
91 | /// Initializes a new instance of the <see cref="AhoCorasick"/> class.
92 | /// </summary>
93 | /// <param name="comparer">The comparer used to compare individual characters.</param>
94 | /// <param name="words">The words to find.</param>
95 | public AhoCorasick(IEqualityComparer comparer, IEnumerable words) // NOTE(review): presumably IEqualityComparer<char> and IEnumerable<string> - confirm.
96 | : this(comparer)
97 | {
98 | Add(words);
99 | }
100 |
101 | /// <summary>
102 | /// Adds the specified word.
103 | /// </summary>
104 | /// <remarks>Does not build the failure nodes. Call <see cref="BuildFail"/> after adding words before calling <see cref="Search"/>.</remarks>
105 | /// <param name="word">The word.</param>
106 | public void Add(string word)
107 | {
108 | Trie.Add(word);
109 | }
110 |
111 | /// <summary>
112 | /// Adds the specified words.
113 | /// </summary>
114 | /// <param name="words">The words.</param>
115 | public void Add(IEnumerable words) // NOTE(review): type argument looks stripped in this listing; presumably IEnumerable<string> - confirm.
116 | {
117 | foreach (var word in words)
118 | {
119 | Trie.Add(word);
120 | }
121 |
122 | BuildFail(); // Unlike Add(string), this overload rebuilds the failure nodes itself.
123 | }
124 |
125 | /// <summary>
126 | /// Builds the failure nodes necessary to perform search.
127 | /// </summary>
128 | /// <param name="node">The start node.</param>
129 | public void BuildFail(Trie node = null)
130 | {
131 | node ??= Trie; // No start node given: begin at the root trie.
132 |
133 | var word = node.Word;
134 | for (int i = 1; i < word.Length && node.Fail == null; i++) // Try increasing start offsets and stop as soon as a fail node has been found.
135 | node.Fail = Trie.ExploreFailLink(word, i, word.Length); // NOTE(review): presumably looks up the suffix of word starting at i from the root - confirm against Trie.ExploreFailLink.
136 |
137 | foreach (var subNode in node.Next.Values) // Recurse into every child node.
138 | BuildFail(subNode);
139 | }
140 |
141 | /// <summary>
142 | /// Searches for words in the specified text.
143 | /// </summary>
144 | /// <param name="text">The text.</param>
145 | /// <returns>The matched words.</returns>
146 | public virtual IEnumerable Search(string text) // NOTE(review): type argument looks stripped in this listing; the body yields WordMatch values, so presumably IEnumerable<WordMatch> - confirm.
147 | {
148 | if (text == null) yield break; // A null text produces an empty result.
149 |
150 | var current = Trie;
151 |
152 | for (int i = 0; i < text.Length; i++)
153 | {
154 | var c = text[i];
155 |
156 | while (current != null && !current.Next.ContainsKey(c)) // Follow fail links until a node has a transition on c or the chain ends.
157 | current = current.Fail;
158 |
159 | current ??= Trie; // The fail chain ended: restart from the root.
160 |
161 | if (current.Next.TryGetValue(c, out current)) // On a miss the out parameter leaves current null; the next iteration resets it to the root.
162 | {
163 | var node = current;
164 |
165 | while (node != null) // Walk the fail chain so every word ending at position i is reported.
166 | {
167 | if (node.IsWord)
168 | {
169 | var word = node.Word;
170 | var offset = i + 1 - word.Length; // Start index of the match: i is the index of its last character.
171 | yield return new WordMatch { Index = offset, Word = word };
172 | }
173 |
174 | node = node.Fail;
175 | }
176 | }
177 | }
178 | }
179 | }
180 | }
181 |
--------------------------------------------------------------------------------
/AhoCorasick/CharComparer.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Globalization;
4 | using System.Linq;
5 | using System.Text;
6 |
7 | namespace Ganss.Text
8 | {
9 | class OrdinalCharComparer : CharComparer // Ordinal char comparer, optionally case-insensitive.
10 | {
11 | private readonly bool _ignoreCase;
12 |
13 | public OrdinalCharComparer(bool ignoreCase = false)
14 | {
15 | _ignoreCase = ignoreCase;
16 | }
17 |
18 | public override bool Equals(char x, char y)
19 | {
20 | return _ignoreCase ? ((uint)char.ToUpperInvariant(x)).Equals(((uint)char.ToUpperInvariant(y))) // Case-insensitive: compare the invariant upper-case forms.
21 | : ((uint)x).Equals((uint)y); // Case-sensitive: compare the raw char values.
22 | }
23 |
24 | public override int GetHashCode(char obj)
25 | {
26 | return _ignoreCase ? (int)char.ToUpperInvariant(obj) : (int)obj; // Hashes the same value that Equals compares, so equal chars hash alike.
27 | }
28 | }
29 |
30 | #if NET40
31 | class CultureCharComparer : CharComparer // NET40 variant: delegates to a StringComparer created for the given culture.
32 | {
33 | private readonly StringComparer _stringComparer;
34 |
35 | public CultureCharComparer(CultureInfo cultureInfo, bool ignoreCase = false)
36 | {
37 | _stringComparer = StringComparer.Create(cultureInfo, ignoreCase);
38 | }
39 |
40 | public override bool Equals(char x, char y)
41 | {
42 | return _stringComparer.Equals(x.ToString(), y.ToString()); // Chars are compared as one-character strings.
43 | }
44 |
45 | public override int GetHashCode(char obj)
46 | {
47 | return _stringComparer.GetHashCode(obj.ToString()); // Same comparer as Equals, applied to the one-character string.
48 | }
49 | }
50 | #else
51 | class CultureCharComparer: CharComparer // Variant compiled when NET40 is not defined: uses the culture's CompareInfo directly.
52 | {
53 | private readonly CompareInfo _compareInfo;
54 | private readonly bool _ignoreCase;
55 |
56 | public CultureCharComparer(CultureInfo cultureInfo, bool ignoreCase = false)
57 | {
58 | _compareInfo = cultureInfo.CompareInfo;
59 | _ignoreCase = ignoreCase;
60 | }
61 |
62 | public override bool Equals(char x, char y)
63 | {
64 | return _compareInfo.Compare(x.ToString(), y.ToString(), _ignoreCase ? CompareOptions.IgnoreCase : CompareOptions.None) == 0; // Chars are compared as one-character strings; a result of 0 means equal.
65 | }
66 |
67 | public override int GetHashCode(char obj)
68 | {
69 | return _compareInfo.GetHashCode(obj.ToString(), _ignoreCase ? CompareOptions.IgnoreCase : CompareOptions.None); // Uses the same CompareOptions as Equals.
70 | }
71 | }
72 | #endif
73 |
74 | /// <summary>
75 | /// Represents a char comparison operation that uses specific case and culture-based or ordinal comparison rules.
76 | /// </summary>
77 | public abstract class CharComparer: EqualityComparer // NOTE(review): type argument looks stripped in this listing; the Equals(char, char) overrides in the derived classes suggest EqualityComparer<char> - confirm.
78 | {
79 | private static readonly CharComparer _ordinalIgnoreCase = new OrdinalCharComparer(ignoreCase: true);
80 |
81 | /// <summary>
82 | /// Gets a <see cref="CharComparer"/> object that performs a case-insensitive ordinal comparison.
83 | /// </summary>
84 | /// <value>
85 | /// A <see cref="CharComparer"/> object.
86 | /// </value>
87 | public static CharComparer OrdinalIgnoreCase
88 | {
89 | get
90 | {
91 | return _ordinalIgnoreCase;
92 | }
93 | }
94 |
95 | private static readonly CharComparer _ordinal = new OrdinalCharComparer(ignoreCase: false);
96 |
97 | /// <summary>
98 | /// Gets a <see cref="CharComparer"/> object that performs a case-sensitive ordinal comparison.
99 | /// </summary>
100 | /// <value>
101 | /// A <see cref="CharComparer"/> object.
102 | /// </value>
103 | public static CharComparer Ordinal
104 | {
105 | get
106 | {
107 | return _ordinal;
108 | }
109 | }
110 |
111 | private static readonly CharComparer _invariantCultureIgnoreCase = new CultureCharComparer(CultureInfo.InvariantCulture, ignoreCase: true);
112 |
113 | /// <summary>
114 | /// Gets a <see cref="CharComparer"/> object that performs a case-insensitive comparison using the comparison rules of the invariant culture.
115 | /// </summary>
116 | /// <value>
117 | /// A <see cref="CharComparer"/> object.
118 | /// </value>
119 | public static CharComparer InvariantCultureIgnoreCase
120 | {
121 | get
122 | {
123 | return _invariantCultureIgnoreCase;
124 | }
125 | }
126 |
127 | private static readonly CharComparer _invariantCulture = new CultureCharComparer(CultureInfo.InvariantCulture, ignoreCase: false);
128 |
129 | /// <summary>
130 | /// Gets a <see cref="CharComparer"/> object that performs a case-sensitive comparison using the comparison rules of the invariant culture.
131 | /// </summary>
132 | /// <value>
133 | /// A <see cref="CharComparer"/> object.
134 | /// </value>
135 | public static CharComparer InvariantCulture
136 | {
137 | get
138 | {
139 | return _invariantCulture;
140 | }
141 | }
142 |
143 | /// <summary>
144 | /// Gets a <see cref="CharComparer"/> object that performs a case-sensitive comparison using the comparison rules of the current culture.
145 | /// </summary>
146 | /// <value>
147 | /// A <see cref="CharComparer"/> object.
148 | /// </value>
149 | public static CharComparer CurrentCulture
150 | {
151 | get
152 | {
153 | return new CultureCharComparer(CultureInfo.CurrentCulture, ignoreCase: false); // Not cached: a new comparer is built on every access from the culture current at that moment.
154 | }
155 | }
156 |
157 | /// <summary>
158 | /// Gets a <see cref="CharComparer"/> object that performs a case-insensitive comparison using the comparison rules of the current culture.
159 | /// </summary>
160 | /// <value>
161 | /// A <see cref="CharComparer"/> object.
162 | /// </value>
163 | public static CharComparer CurrentCultureIgnoreCase
164 | {
165 | get
166 | {
167 | return new CultureCharComparer(CultureInfo.CurrentCulture, ignoreCase: true); // Not cached: a new comparer is built on every access from the culture current at that moment.
168 | }
169 | }
170 |
171 | /// <summary>
172 | /// Creates a <see cref="CharComparer"/> object that compares characters according to the rules of a specified culture.
173 | /// </summary>
174 | /// <param name="cultureInfo">A culture whose linguistic rules are used to perform a string comparison.</param>
175 | /// <param name="ignoreCase">true to specify that comparison operations be case-insensitive; false to specify that comparison operations be case-sensitive.</param>
176 | /// <returns>A new <see cref="CharComparer"/> object that performs character comparisons according to the comparison rules used by the <paramref name="cultureInfo"/> parameter and the case rule specified by the <paramref name="ignoreCase"/> parameter.</returns>
177 | public static CharComparer Create(CultureInfo cultureInfo, bool ignoreCase)
178 | {
179 | return new CultureCharComparer(cultureInfo, ignoreCase);
180 | }
181 | }
182 | }
183 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AhoCorasick
2 |
3 | [NuGet package](https://www.nuget.org/packages/AhoCorasick)
4 | [AppVeyor build status](https://ci.appveyor.com/project/mganss/ahocorasick/branch/master)
5 | [Coveralls coverage report](https://coveralls.io/github/mganss/AhoCorasick?branch=master)
6 | [netstandard 2.0 badge](https://img.shields.io/badge/netstandard-2.0-brightgreen.svg)
7 | [net 40 badge](https://img.shields.io/badge/net-40-brightgreen.svg)
8 |
9 | This is an implementation of the [Aho-Corasick](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm) string matching algorithm for .NET (netstandard2.0 and net40) and SQL Server (SQL CLR). Mostly ported from [xudejian/aho-corasick](https://github.com/xudejian/aho-corasick) in CoffeeScript.
10 |
11 | ## Usage
12 |
13 | ```C#
14 | var ac = new AhoCorasick("a", "ab", "bab", "bc", "bca", "c", "caa");
15 | var results = ac.Search("abccab").ToList();
16 |
17 | Assert.AreEqual(0, results[0].Index); // index into the searched text
18 | Assert.AreEqual("a", results[0].Word); // matched word
19 | // ...
20 | ```
21 |
22 | or
23 |
24 | ```C#
25 | var results = "abccab".Contains("a", "ab", "bab", "bc", "bca", "c", "caa").ToList();
26 | ```
27 |
28 | ### Custom char comparison
29 |
30 | You can optionally supply an `IEqualityComparer` to perform custom char comparisons when searching for substrings. Several implementations with comparers that mirror `StringComparer` are included.
31 |
32 | ```C#
33 | var results = "AbCcab".Contains(CharComparer.OrdinalIgnoreCase, "a", "ab", "c").ToList();
34 | ```
35 |
36 | ## SQL CLR Functions
37 |
38 | There are also several SQL CLR user defined functions that can be used to perform fast substring matching
39 | in Microsoft SQL Server. To use this:
40 |
41 | 1. Make sure you have [enabled CLR integration](https://msdn.microsoft.com/en-us/library/ms131048.aspx)
42 | 2. Execute [AhoCorasick.SqlClr_Create.sql](AhoCorasick.SqlClr/dist/AhoCorasick.SqlClr_Create.sql)
43 |
44 | For one-off queries, you can use the functions that rebuild the trie on each query, e.g.
45 |
46 | ```SQL
47 | select top(100) * from Posts P
48 | where dbo.ContainsWords((select Word from Words for xml raw, root('root')), P.Body, 'o') = 1
49 | ```
50 |
51 | The words to match are always supplied as XML, where the values are taken from the first attribute of all elements directly beneath the root node. Be careful to select the word column as the only or first column; otherwise, you'll end up matching the wrong words. The XML in the example above looks like this:
52 |
53 | ```XML
54 | <root>
55 |   <row Word="factor" />
56 |   <row Word="actor" />
57 |   <row Word="factors" />
58 |   ...
59 | </root>
60 | ```
61 |
62 | [Here's more](https://www.simple-talk.com/sql/learn-sql-server/using-the-for-xml-clause-to-return-query-results-as-xml/) about FOR XML.
63 |
64 | The last parameter in the function indicates the culture to use since there is no way to use SQL Server collations in SQL CLR code. Values can be:
65 |
66 | |Value|Character comparison|
67 | |-----|--------------------|
68 | |c|Current Culture|
69 | |n|Invariant Culture|
70 | |o or Empty|Ordinal|
71 | |Culture name, e.g. "de-de"|Specific [.NET Culture](https://msdn.microsoft.com/en-us/library/system.globalization.cultureinfo.name.aspx)|
72 |
73 | The culture identifier can be suffixed by `:i` indicating case-insensitive matching.
74 |
75 | ### Static objects
76 |
77 | The function in the example above has the problem that the trie is rebuilt for each query even though the input always stays the same. To overcome this problem, there are a number of functions to manage the creation and destruction of static objects whose handles can be saved in SQL variables. Example:
78 |
79 | ```SQL
80 | declare @ac nvarchar(32);
81 | set @ac = dbo.CreateAhoCorasick((select Word from Words for xml raw, root('root')), 'en-us:i');
82 | select * from Posts P
83 | where dbo.ContainsWordsByObject(P.Body, @ac) = 1;
84 | ```
85 |
86 | This is a lot faster than the first example because the trie is created only once and then reused for each row in the query. The handle (@ac) is a hash value generated from the words to match and the culture. The corresponding object is saved in a static dictionary. You can list the currently active objects using `dbo.ListAhoCorasick()`, remove all objects using `dbo.ClearAhoCorasick()` or remove only one object using `dbo.DeleteAhoCorasick(@ac)`.
87 |
88 | ### Getting all matches
89 |
90 | The examples above only checked if the words occurred in the queried texts. If you want to get the matched words and the indexes where they occur in the queried texts you can use the supplied table-valued functions. For example:
91 |
92 | ```SQL
93 | declare @ac nvarchar(32);
94 | set @ac = dbo.CreateAhoCorasick((select Word from Words for xml raw, root('root')), 'o');
95 | select top(100) * from Posts P
96 | cross apply dbo.ContainsWordsTableByObject(P.Body, @ac) W
97 | ```
98 |
99 | This will return a table such as this:
100 |
101 | |ID |Body |Index |Word |
102 | |---|---|---|---|
103 | |1 |What factors related...|5|factor|
104 | |1 |What factors related...|6|actor|
105 | |1 |What factors related...|5|factors|
106 | |...|
107 |
108 | ### Word boundaries
109 |
110 | There are also functions that return only matches occurring at word boundaries: `dbo.ContainsWordsBoundedByObject()` and `dbo.ContainsWordsBoundedTableByObject()`. Word boundaries here are the same as [`\b` in regexes](http://www.regular-expressions.info/wordboundaries.html), i.e. matches will occur as if words were specified as `\bword\b`.
111 |
112 | ### Forcing parallelism
113 |
114 | Although these kinds of queries lend themselves very well to parallel execution, SQL Server tends to overestimate the cost of parallel queries and builds non-parallel plans most of the time where user defined functions are involved. You can force a parallel plan by using a trace flag (more about this [here](http://sqlblog.com/blogs/paul_white/archive/2011/12/23/forcing-a-parallel-query-execution-plan.aspx)):
115 |
116 | ```SQL
117 | declare @ac nvarchar(32);
118 | set @ac = dbo.CreateAhoCorasick((select Word from Words for xml raw, root('root')), 'en-us:i');
119 | select * from Posts P
120 | where dbo.ContainsWordsBoundedByObject(P.Body, @ac) = 1
121 | OPTION (RECOMPILE, QUERYTRACEON 8649)
122 | ```
123 |
124 | Parallel operators are identified by a yellow badge with two arrows in the query plan.
125 |
126 | ### Performance
127 |
128 | Here's a benchmark searching for ~5000 words (average length 7) in ~250,000 texts (average length ~900):
129 |
130 | |SQL|AhoCorasick|
131 | |---|-----------|
132 | |560s|7s|
133 |
134 | The SQL query used was this:
135 |
136 | ```SQL
137 | select * from Posts P
138 | where exists (select * from Words W where CHARINDEX(W.Word, P.Text) > 0)
139 | ```
140 |
141 | #### But I can simply use full-text search
142 |
143 | No. The [CONTAINS](https://msdn.microsoft.com/en-us/library/ms187787.aspx) predicate can only search for a single literal or variable at a time. You can't use it in a join or subquery to search for a column value of a table in the query, i.e. this won't work:
144 |
145 | ```SQL
146 | select * from Posts P
147 | where exists (select * from Words W where CONTAINS(P.Text, W.Word))
148 | ```
149 |
150 | If you know of a way to make this work using FTS (perhaps using a cursor?) let me know.
151 |
--------------------------------------------------------------------------------
/AhoCorasick.SqlClr/dist/AhoCorasick.SqlClr_Create.sql:
--------------------------------------------------------------------------------
1 | PRINT N'Creating [AhoCorasick.SqlClr]...';
2 |
3 |
4 | GO
5 | CREATE ASSEMBLY [AhoCorasick.SqlClr] -- Registers the CLR assembly that the functions below bind to through EXTERNAL NAME.
6 | AUTHORIZATION [dbo]
7 | FROM 0x -- NOTE(review): only the 0x prefix of the binary literal is present in this listing; the assembly bytes appear to have been removed - confirm against the real script.
8 |
9 |
10 | GO
11 | ALTER ASSEMBLY [AhoCorasick.SqlClr] -- Drops all attached files and adds one. NOTE(review): the ADD FILE clause below looks truncated (no binary literal, unbalanced quote) - confirm against the real script.
12 | DROP FILE ALL
13 | ADD FILE FROM hoCorasick.SqlClr.pdb';
14 |
15 |
16 | GO
17 | PRINT N'Creating [dbo].[ContainsWords]...';
18 |
19 |
20 | GO
21 | CREATE FUNCTION [dbo].[ContainsWords] -- Scalar function taking the word list as XML, the text to search and a culture string.
22 | (@xml XML, @text NVARCHAR (MAX), @culture NVARCHAR (MAX))
23 | RETURNS BIT
24 | AS
25 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWords] -- Implemented by the CLR method of the same name.
26 |
27 |
28 | GO
29 | PRINT N'Creating [dbo].[CreateAhoCorasick]...';
30 |
31 |
32 | GO
33 | CREATE FUNCTION [dbo].[CreateAhoCorasick] -- Scalar function taking the word list as XML and a culture string; returns a string.
34 | (@xml XML, @culture NVARCHAR (MAX))
35 | RETURNS NVARCHAR (MAX)
36 | AS
37 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[CreateAhoCorasick] -- Implemented by the CLR method of the same name.
38 |
39 |
40 | GO
41 | PRINT N'Creating [dbo].[DeleteAhoCorasick]...';
42 |
43 |
44 | GO
45 | CREATE FUNCTION [dbo].[DeleteAhoCorasick] -- Scalar function taking one object handle string.
46 | (@obj NVARCHAR (MAX))
47 | RETURNS BIT
48 | AS
49 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[DeleteAhoCorasick] -- Implemented by the CLR method of the same name.
50 |
51 |
52 | GO
53 | PRINT N'Creating [dbo].[ClearAhoCorasick]...';
54 |
55 |
56 | GO
57 | CREATE FUNCTION [dbo].[ClearAhoCorasick] -- Parameterless scalar function.
58 | ( )
59 | RETURNS BIT
60 | AS
61 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ClearAhoCorasick] -- Implemented by the CLR method of the same name.
62 |
63 |
64 | GO
65 | PRINT N'Creating [dbo].[ContainsWordsByObject]...';
66 |
67 |
68 | GO
69 | CREATE FUNCTION [dbo].[ContainsWordsByObject] -- Scalar function taking the text to search and an object handle string.
70 | (@text NVARCHAR (MAX), @obj NVARCHAR (MAX))
71 | RETURNS BIT
72 | AS
73 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWordsByObject] -- Implemented by the CLR method of the same name.
74 |
75 |
76 | GO
77 | PRINT N'Creating [dbo].[ContainsWordsBoundedByObject]...';
78 |
79 |
80 | GO
81 | CREATE FUNCTION [dbo].[ContainsWordsBoundedByObject] -- Scalar function taking the text to search and an object handle string.
82 | (@text NVARCHAR (MAX), @obj NVARCHAR (MAX))
83 | RETURNS BIT
84 | AS
85 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWordsBoundedByObject] -- Implemented by the CLR method of the same name.
86 |
87 |
88 | GO
89 | PRINT N'Creating [dbo].[ContainsWordsTable]...';
90 |
91 |
92 | GO
93 | CREATE FUNCTION [dbo].[ContainsWordsTable] -- Table-valued function taking the word list as XML, the text to search and a culture string.
94 | (@xml XML, @text NVARCHAR (MAX), @culture NVARCHAR (MAX))
95 | RETURNS
96 | TABLE (
97 | [Index] INT NULL, -- Bracketed because INDEX is a T-SQL keyword.
98 | [Word] NVARCHAR (MAX) NULL)
99 | AS
100 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWordsTable] -- Implemented by the CLR method of the same name.
101 |
102 |
103 | GO
104 | PRINT N'Creating [dbo].[ContainsWordsTableByObject]...';
105 |
106 |
107 | GO
108 | CREATE FUNCTION [dbo].[ContainsWordsTableByObject] -- Table-valued function taking the text to search and an object handle string.
109 | (@text NVARCHAR (MAX), @obj NVARCHAR (MAX))
110 | RETURNS
111 | TABLE (
112 | [Index] INT NULL, -- Bracketed because INDEX is a T-SQL keyword.
113 | [Word] NVARCHAR (MAX) NULL)
114 | AS
115 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWordsTableByObject] -- Implemented by the CLR method of the same name.
116 |
117 |
118 | GO
119 | PRINT N'Creating [dbo].[ContainsWordsBoundedTableByObject]...';
120 |
121 |
122 | GO
123 | CREATE FUNCTION [dbo].[ContainsWordsBoundedTableByObject] -- Table-valued function taking the text to search and an object handle string.
124 | (@text NVARCHAR (MAX), @obj NVARCHAR (MAX))
125 | RETURNS
126 | TABLE (
127 | [Index] INT NULL, -- Bracketed because INDEX is a T-SQL keyword.
128 | [Word] NVARCHAR (MAX) NULL)
129 | AS
130 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ContainsWordsBoundedTableByObject] -- Implemented by the CLR method of the same name.
131 |
132 |
133 | GO
134 | PRINT N'Creating [dbo].[ListAhoCorasick]...';
135 |
136 |
137 | GO
138 | CREATE FUNCTION [dbo].[ListAhoCorasick] -- Parameterless table-valued function with a single Hash column.
139 | ( )
140 | RETURNS
141 | TABLE (
142 | [Hash] NVARCHAR (MAX) NULL)
143 | AS
144 | EXTERNAL NAME [AhoCorasick.SqlClr].[UserDefinedFunctions].[ListAhoCorasick] -- Implemented by the CLR method of the same name.
145 |
146 |
147 | GO
148 | PRINT N'Update complete.';
149 |
150 |
151 | GO
152 |
--------------------------------------------------------------------------------