├── .github ├── FUNDING.yml └── workflows │ └── dotnet.yml ├── images ├── icon.png └── icon.cs ├── doc ├── linqpad-samples │ ├── FileOrder.txt │ ├── Diagnostic tools.linq │ ├── Quick start.linq │ ├── Nested types.linq │ └── Using record types.linq └── spec.md ├── RegExtract.Test ├── Usage.a008.verified.txt ├── Usage.a007.verified.txt ├── Usage.a002.verified.txt ├── Usage.a006.verified.txt ├── Usage.a013.verified.txt ├── Usage.a001.verified.txt ├── Usage.a004.verified.txt ├── Usage.a012.verified.txt ├── Usage.a009.verified.txt ├── Usage.a010.verified.txt ├── Usage.a005.verified.txt ├── Usage.a011.verified.txt ├── Usage.a003.verified.txt ├── VerifyShim.cs ├── RegExtract.Test.csproj └── Usage.cs ├── LICENSE ├── RegExtract.sln ├── tools └── ExtractionPlanInspector.linq ├── RegExtract ├── RegExtractExtensions.cs ├── RegExtract.csproj ├── ExtractionPlanTypeWrapper.cs ├── ExtractionPlan.cs ├── ExtractionPlanNodeTypes.cs ├── ExtractionPlanNode.cs └── RegexCaptureGroupTree.cs ├── .gitignore └── README.md /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | ko_fi: sblom 2 | -------------------------------------------------------------------------------- /images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sblom/RegExtract/HEAD/images/icon.png -------------------------------------------------------------------------------- /doc/linqpad-samples/FileOrder.txt: -------------------------------------------------------------------------------- 1 | Quick start.linq 2 | Nested types.linq 3 | Using record types.linq 4 | Diagnostic tools.linq -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a008.verified.txt: -------------------------------------------------------------------------------- 1 | CollectionInitializer>[1] ( 2 | StaticParseMethod[1] () 3 | ) 4 | 5 | ↓-------↓₀ 6 | (\d+ ?)+₁ 7 | 
-------------------------------------------------------------------------------- /RegExtract.Test/Usage.a007.verified.txt: -------------------------------------------------------------------------------- 1 | CollectionInitializer>[1] ( 2 | StaticParseMethod[1] () 3 | ) 4 | 5 | (?:↓----↓ ?)+₀ 6 | (\d+)₁ 7 | -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a002.verified.txt: -------------------------------------------------------------------------------- 1 | CollectionInitializer>[1] ( 2 | StaticParseMethod[2] () 3 | ) 4 | 5 | ↓----------↓₀ 6 | (↓----↓ ?)+₁ 7 | (\d+)₂ 8 | -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a006.verified.txt: -------------------------------------------------------------------------------- 1 | CollectionInitializer>[1] ( 2 | StaticParseMethod[2] () 3 | ) 4 | 5 | ↓----------↓₀ 6 | (↓----↓ ?)+₁ 7 | (\d+)₂ 8 | -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a013.verified.txt: -------------------------------------------------------------------------------- 1 | Constructor[0] ( 2 | StaticParseMethod[1] (), 3 | StaticParseMethod[2] (), 4 | StaticParseMethod[3] () 5 | ) 6 | 7 | ↓----↓ ↓----↓ ↓----↓₀ 8 | (\d+)₁ (\d+)₂ (\d+)₃ -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a001.verified.txt: -------------------------------------------------------------------------------- 1 | CollectionInitializer>[1] ( 2 | ConstructTuple<(char,char)>[1] ( 3 | StaticParseMethod[2] (), 4 | StaticParseMethod[3] () 5 | ) 6 | ) 7 | 8 | ↓------------↓₀ 9 | (↓---↓↓---↓)+₁ 10 | (\w)₂(\w)₃ 11 | -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a004.verified.txt: -------------------------------------------------------------------------------- 1 | CollectionInitializer>[1] ( 2 | 
ConstructTuple<(char,int)>[1] ( 3 | StaticParseMethod[2] (), 4 | StaticParseMethod[3] () 5 | ) 6 | ) 7 | 8 | ↓-------------------↓₀ 9 | (↓-----↓↓----↓,? ?)+₁ 10 | ([RL])₂(\d+)₃ 11 | -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a012.verified.txt: -------------------------------------------------------------------------------- 1 | ConstructTuple<(string,List)>[0] ( 2 | StringCast[1] (), 3 | CollectionInitializer>[2] ( 4 | StaticParseMethod[3] () 5 | ) 6 | ) 7 | 8 | ↓----↓{↓------------↓}₀ 9 | (\w+)₁ (↓------↓,?)+₂ 10 | ([^,]+)₃ 11 | -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a009.verified.txt: -------------------------------------------------------------------------------- 1 | CollectionInitializer>[1] ( 2 | ConstructTuple<(string,string)>[1] ( 3 | StringCast[2] (), 4 | StringCast[3] () 5 | ) 6 | ) 7 | 8 | ↓--------------------------↓₀ 9 | (↓-------↓↓-----------↓,?)+₁ 10 | ([a-z]+)₂([=-][0-9]?)₃ 11 | -------------------------------------------------------------------------------- /images/icon.cs: -------------------------------------------------------------------------------- 1 | using System.Text.RegularExpressions; 2 | 3 | 4 | (int,string) value; 5 | // ⬆ ⬆ 6 | new Regex(@" (\d+)\s+(.*) "); 7 | 8 | 9 | 10 | // Enable bold on: Comment, Keyword, Punctuation, Regex Quantifier, Regex Grouping, Regex Character Class -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a010.verified.txt: -------------------------------------------------------------------------------- 1 | CollectionInitializer>>>[1] ( 2 | CollectionInitializer>>[2] ( 3 | CollectionInitializer>[3] ( 4 | StaticParseMethod[3] () 5 | ) 6 | ) 7 | ) 8 | 9 | ↓------------------↓₀ 10 | (↓----------↓,? 
?)+₁ 11 | (↓----↓ ?)+₂ 12 | (\w)+₃ 13 | -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a005.verified.txt: -------------------------------------------------------------------------------- 1 | CollectionInitializer>[1] ( 2 | StringCast[2] (), 3 | ConstructTuple<(string,string)>[3] ( 4 | StringCast[4] (), 5 | StringCast[5] () 6 | ) 7 | ) 8 | 9 | ↓------------------------------------↓₀ 10 | (↓----↓ = \(↓---------------↓\);? ?)+₁ 11 | (...)₂ (↓----↓, ↓----↓)₃ 12 | (...)₄ (...)₅ 13 | -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a011.verified.txt: -------------------------------------------------------------------------------- 1 | ConstructTuple<((char?,string),List)>[0] ( 2 | ConstructTuple<(char?,string)>[1] ( 3 | StaticParseMethod[2] (), 4 | StringCast[3] () 5 | ), 6 | CollectionInitializer>[4] ( 7 | StringCast[5] () 8 | ) 9 | ) 10 | 11 | ^↓------------------↓ -> ↓---------------↓$₀ 12 | (↓------↓↓-------↓)₁ (↓-------↓,? ?)+₄ 13 | ([%&])?₂([a-z]+)₃ ([a-z]+)₅ 14 | -------------------------------------------------------------------------------- /doc/linqpad-samples/Diagnostic tools.linq: -------------------------------------------------------------------------------- 1 | 2 | C:\src\public\RegExtract\RegExtract\bin\Debug\netstandard2.1\RegExtract.dll 3 | RegExtract 4 | 5 | 6 | var plan = ExtractionPlan<((int x, int y), char ch, string pwd)>.CreatePlan(new Regex(@"((\d+)-(\d+)) (.): (.*)")); 7 | var diagnostics = plan.ToString("x"); 8 | 9 | Util.WithStyle(diagnostics, "font-family:consolas").DumpFixed(); -------------------------------------------------------------------------------- /doc/linqpad-samples/Quick start.linq: -------------------------------------------------------------------------------- 1 | 2 | C:\src\public\RegExtract\RegExtract\bin\Debug\netstandard2.1\RegExtract.dll 3 | RegExtract 4 | 5 | 6 | "Hello, world!" 
7 | .Extract(@"Hello, (\w+)!") 8 | .Dump("Simple extraction"); 9 | 10 | "Party like it's 1999!" 11 | .Extract<(string verb, int year)>(@"(\w+) like it's (\d+)!") 12 | .Dump("Extract multiple captures"); 13 | -------------------------------------------------------------------------------- /RegExtract.Test/Usage.a003.verified.txt: -------------------------------------------------------------------------------- 1 | Constructor[0] ( 2 | StaticParseMethod[1] (), 3 | CollectionInitializer>[2] ( 4 | Constructor[2] ( 5 | CollectionInitializer>[3] ( 6 | ConstructTuple<(int,string)>[3] ( 7 | StaticParseMethod[4] (), 8 | StringCast[5] () 9 | ) 10 | ) 11 | ) 12 | ) 13 | ) 14 | 15 | Game ↓----↓: ↓---------------------------↓₀ 16 | (\d+)₁ (↓-------------------↓;? ?)+₂ 17 | (↓----↓ ↓----↓,? ?)+₃ 18 | (\d+)₄ (\w+)₅ 19 | -------------------------------------------------------------------------------- /doc/linqpad-samples/Nested types.linq: -------------------------------------------------------------------------------- 1 | 2 | C:\src\public\RegExtract\RegExtract\bin\Debug\netstandard2.1\RegExtract.dll 3 | RegExtract 4 | 5 | 6 | // You can nest parentheses in the Regex. The order of the types in your tuple 7 | // should correspond to the open parenthesis of each capture group. 8 | (Uri uri, string protocol, string host, int port, string path) nested = 9 | "https://nuget.org:443/packages/RegExtract" 10 | .Extract<(Uri, string, string, int, string)>(@"((\S+)://(\S+):(\d+)(\S*))"); 11 | 12 | nested.Dump("Nested parens in Regex"); 13 | -------------------------------------------------------------------------------- /RegExtract.Test/VerifyShim.cs: -------------------------------------------------------------------------------- 1 | // Compatibility shim for older target frameworks that don't have Verify packages available. 
2 | #if NETCOREAPP3_1 || NET462 3 | using System; 4 | using System.Threading.Tasks; 5 | 6 | namespace VerifyXunit 7 | { 8 | [AttributeUsage(AttributeTargets.Class, AllowMultiple = false)] 9 | internal sealed class UsesVerifyAttribute : Attribute { } 10 | } 11 | 12 | namespace VerifyTests 13 | { 14 | // Minimal Verifier stub to allow the test code to compile and run on older TFMs. 15 | internal static class Verifier 16 | { 17 | public static Task Verify(object? input) => Task.CompletedTask; 18 | public static Task Verify(string input) => Task.CompletedTask; 19 | } 20 | } 21 | #endif 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT 2 | 3 | Copyright (c) Scott Blomquist 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /RegExtract.Test/RegExtract.Test.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | netcoreapp8.0;netcoreapp3.1;net462;net10.0 5 | enable 6 | false 7 | latest 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | runtime; build; native; contentfiles; analyzers; buildtransitive 16 | all 17 | 18 | 19 | runtime; build; native; contentfiles; analyzers; buildtransitive 20 | all 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /doc/linqpad-samples/Using record types.linq: -------------------------------------------------------------------------------- 1 | 2 | C:\src\public\RegExtract\RegExtract\bin\Debug\netstandard2.1\RegExtract.dll 3 | RegExtract 4 | 5 | 6 | // Instead of a tuple, you can use any type with a single non-default public constructor. 7 | // The most useful examples of this will probably be C# 9's record types. 8 | UrlPositionalRecord urlPositional = "https://nuget.org:443/packages/RegExtract" 9 | .Extract 10 | (@"((\S+)://(\S+):(\d+)(\S*))"); 11 | 12 | urlPositional.Dump("Record type (positional)"); 13 | 14 | // A record can also declare init-only properties instead of a positional constructor; 15 | // the example below extracts into such a record (UrlRecord). 
16 | UrlRecord urlProperties = "https://nuget.org:443/packages/RegExtract" 17 | .Extract(@"(?\S+)://(?\S+):(?\d+)(?\S*)"); 18 | 19 | urlProperties.Dump("Record type (properties)"); 20 | 21 | record UrlPositionalRecord(Uri uri, string protocol, string host, int port, string path); 22 | 23 | record UrlRecord 24 | { 25 | public string protocol { get; init; } 26 | public string host { get; init; } 27 | public int port { get; init; } 28 | public string path { get; init; } 29 | } 30 | -------------------------------------------------------------------------------- /RegExtract.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.0.32014.148 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "RegExtract", "RegExtract\RegExtract.csproj", "{7F764AB9-4BEA-4128-B1DA-17B51166715D}" 7 | EndProject 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "RegExtract.Test", "RegExtract.Test\RegExtract.Test.csproj", "{836F68E6-27F6-417F-B216-3B48BBE34862}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|Any CPU = Debug|Any CPU 13 | Release|Any CPU = Release|Any CPU 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {7F764AB9-4BEA-4128-B1DA-17B51166715D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 17 | {7F764AB9-4BEA-4128-B1DA-17B51166715D}.Debug|Any CPU.Build.0 = Debug|Any CPU 18 | {7F764AB9-4BEA-4128-B1DA-17B51166715D}.Release|Any CPU.ActiveCfg = Release|Any CPU 19 | {7F764AB9-4BEA-4128-B1DA-17B51166715D}.Release|Any CPU.Build.0 = Release|Any CPU 20 | {836F68E6-27F6-417F-B216-3B48BBE34862}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 21 | {836F68E6-27F6-417F-B216-3B48BBE34862}.Debug|Any CPU.Build.0 = Debug|Any CPU 22 | {836F68E6-27F6-417F-B216-3B48BBE34862}.Release|Any CPU.ActiveCfg = 
Release|Any CPU 23 | {836F68E6-27F6-417F-B216-3B48BBE34862}.Release|Any CPU.Build.0 = Release|Any CPU 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {EC1F0F9D-41E7-458E-AF8A-74B6C9FFB231} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /tools/ExtractionPlanInspector.linq: -------------------------------------------------------------------------------- 1 | 2 | C:\src\public\RegExtract\RegExtract\bin\Debug\netstandard2.1\RegExtract.dll 3 | RegExtract 4 | 5 | 6 | //"2-12 c: abcdefghi".ShowExtractionPlan<((int,int), char, string)>(@"((\d+)-(\d+)) (.): (.*)") 7 | //"The quick brown fox jumps over the lazy dog".ShowExtractionPlan>>(@"(?:((\w)+) ?)+") 8 | //"2-12 18-3 10-5".ShowExtractionPlan>(@"((\d+)-(\d+) ?)+") 9 | //"faded yellow bags contain 4 mirrored fuchsia bags, 4 dotted indigo bags, 3 faded orange bags, 5 plaid crimson bags.".ShowExtractionPlan<(string,string,string,List,List)>(@"^(.+) bags contain(?: (no other bags)\.| ((\d+) (.*?)) bags?[,.])+$").Dump(); 10 | //"faded yellow bags contain 4 mirrored fuchsia bags, 4 dotted indigo bags, 3 faded orange bags, 5 plaid crimson bags.".ShowExtractionPlan<(string,string,List<(int,string)>)>(@"^(.+) bags contain(?: (no other bags)\.| ((\d+) (.*?)) bags?[,.])+$").Dump(); 11 | 12 | //"2-12 c: abcdefghi".ShowExtractionPlan<((int, int), char?, string)>(@"((\d+)-(\d+)) (.): (.*)").Dump(); 13 | 14 | //"faded yellow bags contain 4 mirrored fuchsia bags, 4 dotted indigo bags, 3 faded orange bags, 5 plaid crimson bags.".Extr<(string,string,string,List,List)>(@"^(.+) bags contain(?: (no other bags)\.| ((\d+) (.*?)) bags?[,.])+$") 15 | 16 | //RegexExtractionPlan.CreatePlan<(string, string, List<(int?, string)?>)>(@"^(.+) bags contain(?: (no other bags)\.| ((\d+) (.*?)) bags?[,.])+$").Dump(); 17 | //var 
plan = RegexExtractionPlan.CreatePlan>>(@"((\w)+ ?)+").Dump(); 18 | //plan.Execute(Regex.Match("The quick brown fox jumps over the lazy dog",@"(?:((\w)+) ?)+").Dump()).Dump(); 19 | 20 | RegexExtractionPlan.CreatePlan<(long, string, int, char, string, int, char, string, int, char, string)>(@"(((.)(.)(.)(.)(.)(.)(.)(.)(.)))").Dump(); 21 | 22 | record bound(int lo, int hi); 23 | record rule(string range, int lo, int hi, char ch, string pwd); -------------------------------------------------------------------------------- /doc/spec.md: -------------------------------------------------------------------------------- 1 | # List of tuple 2 | 3 | 4 | 5 | 14 | 24 | 25 |
6 | 7 | ```mermaid 8 | graph TD 9 | Start["List<(char,char)>"] --> tuple["(char,char)"] 10 | tuple --> char1["char"] 11 | tuple --> char2["char"] 12 | ``` 13 | 15 | 16 | ```mermaid 17 | graph TD 18 | A1["((.)(.))+"] 19 | A1 --> tuple["(.)(.)"] 20 | tuple --> char1["."] 21 | tuple --> char2["."] 22 | ``` 23 |
26 | 27 | # List of primitive type 28 | 29 | 30 | 31 | 38 | 46 | 47 | 54 | 62 | 63 |
32 | 33 | ```mermaid 34 | graph TD 35 | Start["List#lt;int>"] --> int 36 | ``` 37 | 39 | 40 | ```mermaid 41 | graph TD 42 | A1["((\d+),? ?)+"] 43 | A1 --> int["(\d+),? ?"] 44 | ``` 45 | 48 | 49 | ```mermaid 50 | graph TD 51 | Start["List#lt;int>"] --> int 52 | ``` 53 | 55 | 56 | ```mermaid 57 | graph TD 58 | A1["(\d+ ?)+"] 59 | A1 --> int["\d+ ?"] 60 | ``` 61 |
64 | 65 | # Dictionary of tuple including list 66 | 67 | 68 | 69 | 79 | 90 | 91 |
70 | 71 | ```mermaid 72 | graph TD 73 | Start["Dictionary#lt;int,List#lt;string>>"] --> tuple["(int,List#lt;string>)"] 74 | tuple --> int 75 | tuple --> list["List#lt;string>"] 76 | list --> string 77 | ``` 78 | 80 | 81 | ```mermaid 82 | graph TD 83 | A1["((\d+) = ((\w+),? ?)+)+"] 84 | A1 --> tuple["(\d+) = ((\w+),? ?)+"] 85 | tuple --> int["\d+"] 86 | tuple --> list["(\w+),? ?"] 87 | list --> string["\w+"] 88 | ``` 89 |
92 | 93 | # Record with two values 94 | 95 | 96 | 97 | 105 | 114 | 115 |
98 | 99 | ```mermaid 100 | graph TD 101 | Start["record(long,long)"] --> long1[long] 102 | Start --> long2[long] 103 | ``` 104 | 106 | 107 | ```mermaid 108 | graph TD 109 | A1["mem\[(\d+)\] = (\d+)"] 110 | A1 --> long1["\d+"] 111 | A1 --> long2["\d+"] 112 | ``` 113 |
116 | 117 | # List of List of List of char 118 | 119 | 120 | 121 | 130 | 139 | 140 |
122 | 123 | ```mermaid 124 | graph TD 125 | Start["List#lt;List#lt;List#lt;char>>>"] --> listlist["List#lt;List#lt;char>>"] 126 | listlist --> list["List#lt;char>"] 127 | list --> char 128 | ``` 129 | 131 | 132 | ```mermaid 133 | graph TD 134 | A1["(((\w)+\s+)+,? ?)+"] --> listlist["((\w)+\s+)+,? ?"] 135 | listlist --> list["(\w)+\s+"] 136 | list --> char["\w"] 137 | ``` 138 |
141 | -------------------------------------------------------------------------------- /.github/workflows/dotnet.yml: -------------------------------------------------------------------------------- 1 | name: dotnet 2 | 3 | permissions: 4 | contents: read 5 | packages: write 6 | id-token: write 7 | 8 | on: 9 | push: 10 | branches: [ main ] 11 | tags: ["*"] 12 | pull_request: 13 | branches: [ main ] 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | with: 21 | fetch-depth: 0 22 | - uses: actions/setup-dotnet@v4 23 | with: 24 | dotnet-version: '8.0.x' 25 | - uses: actions/setup-dotnet@v4 26 | with: 27 | dotnet-version: '3.1.x' 28 | - uses: gittools/actions/gitversion/setup@v0.9.7 29 | with: 30 | versionSpec: "5.x" 31 | - id: gitversion 32 | uses: gittools/actions/gitversion/execute@v0.9.7 33 | - name: Restore dependencies 34 | run: dotnet restore 35 | - name: Build 36 | run: dotnet build --configuration Release --no-restore 37 | - name: Test 38 | run: | 39 | if [ "$RUNNER_OS" = "Linux" ]; then 40 | # Skip net462 on Linux (Mono/OpenSSL compatibility issues) 41 | dotnet test --configuration Release --no-build --verbosity normal --framework '!net462' 42 | else 43 | dotnet test --configuration Release --no-build --verbosity normal 44 | fi 45 | shell: bash 46 | - run: | 47 | dotnet pack \ 48 | --include-source \ 49 | --include-symbols \ 50 | --configuration Release \ 51 | --no-build \ 52 | --no-restore \ 53 | -p:PackageVersion="${{ env.GitVersion_FullSemVer }}" \ 54 | RegExtract/RegExtract.csproj \ 55 | --output ${{ github.workspace }}/nugets/ 56 | - uses: actions/upload-artifact@v4 57 | with: 58 | name: nugets 59 | path: nugets 60 | 61 | nuget-push-dev: 62 | runs-on: ubuntu-latest 63 | if: github.ref == 'refs/heads/main' 64 | needs: build 65 | 66 | steps: 67 | - name: download artifact 68 | uses: actions/download-artifact@v4 69 | with: 70 | name: nugets 71 | 72 | - name: setup dotnet 73 | uses: 
actions/setup-dotnet@v4 74 | with: 75 | dotnet-version: '8.0.x' 76 | source-url: https://nuget.pkg.github.com/sblom/index.json 77 | env: 78 | NUGET_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 79 | 80 | - name: nuget push 81 | run: dotnet nuget push *.nupkg *.snupkg --skip-duplicate --api-key ${{ secrets.GITHUB_TOKEN }} 82 | 83 | nuget-push-prod: 84 | runs-on: ubuntu-latest 85 | if: startsWith(github.ref, 'refs/tags/') 86 | needs: build 87 | 88 | steps: 89 | - uses: actions/download-artifact@v4 90 | with: 91 | name: nugets 92 | 93 | # Get a short-lived NuGet API key 94 | - name: NuGet login (OIDC → temp API key) 95 | uses: NuGet/login@v1 96 | id: login 97 | with: 98 | user: ${{secrets.NUGET_USER}} 99 | 100 | # Push the package 101 | - name: NuGet push 102 | run: dotnet nuget push *.nupkg *.snupkg --skip-duplicate --api-key ${{steps.login.outputs.NUGET_API_KEY}} --source https://api.nuget.org/v3/index.json -------------------------------------------------------------------------------- /RegExtract/RegExtractExtensions.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Reflection; 5 | using System.Text.RegularExpressions; 6 | 7 | namespace RegExtract 8 | { 9 | public enum RegExtractOptions 10 | { 11 | None = 0x0, 12 | Strict = 1<<0 13 | } 14 | 15 | public static class RegExtractExtensions 16 | { 17 | public static T? Extract(this string str, string rx, RegExtractOptions options = RegExtractOptions.None) 18 | { 19 | return Extract(str, rx, RegexOptions.None, options); 20 | } 21 | 22 | public static T? Extract(this string str, string rx, RegexOptions rxOptions, RegExtractOptions options = RegExtractOptions.None) 23 | { 24 | var match = Regex.Match(str, rx, rxOptions); 25 | 26 | var plan = ExtractionPlan.CreatePlan(new Regex(rx)); 27 | return (T)plan.Extract(match); 28 | } 29 | 30 | public static T? 
Extract(this string str, Regex rx, RegExtractOptions options = RegExtractOptions.None) 31 | { 32 | var match = rx.Match(str); 33 | 34 | var plan = ExtractionPlan.CreatePlan(rx); 35 | return (T)plan.Extract(match); 36 | } 37 | 38 | public static T? Extract(this string str, ExtractionPlan plan) 39 | { 40 | return plan.Extract(str); 41 | } 42 | 43 | public static T? Extract(this string str, RegExtractOptions options = RegExtractOptions.None) 44 | { 45 | var rx = GetRegexFromType(typeof(T)); 46 | 47 | return Extract(str, rx, options); 48 | } 49 | 50 | static Regex GetRegexFromType(Type type) 51 | { 52 | var field = type.GetField("REGEXTRACT_REGEX_PATTERN", BindingFlags.Public | BindingFlags.Static); 53 | if (field is not { IsLiteral: true, IsInitOnly: false }) 54 | throw new ArgumentException("No string, Regex, or Match provided, and extraction type doesn't have public const string REGEXTRACT_REGEX_PATTERN."); 55 | string rxPattern = (string)field.GetValue(null); 56 | 57 | RegexOptions rxOptions = RegexOptions.None; 58 | field = type.GetField("REGEXTRACT_REGEX_OPTIONS", BindingFlags.Public | BindingFlags.Static); 59 | if (field is { IsLiteral: true, IsInitOnly: false }) rxOptions = (RegexOptions)field.GetValue(null); 60 | 61 | return new Regex(rxPattern, rxOptions); 62 | } 63 | 64 | public static IEnumerable Extract(this IEnumerable str, string rx, RegExtractOptions options = RegExtractOptions.None) 65 | { 66 | return Extract(str, rx, RegexOptions.None, options); 67 | } 68 | 69 | public static IEnumerable Extract(this IEnumerable str, string rx, RegexOptions rxOptions, RegExtractOptions options = RegExtractOptions.None) 70 | { 71 | return Extract(str, new Regex(rx, rxOptions), options); 72 | } 73 | 74 | public static IEnumerable Extract(this IEnumerable str, ExtractionPlan plan) 75 | { 76 | return str.Select(plan.Extract); 77 | } 78 | 79 | public static IEnumerable Extract(this IEnumerable str, RegExtractOptions options = RegExtractOptions.None) 80 | { 81 | var rx = 
GetRegexFromType(typeof(T)); 82 | return Extract(str, rx, options); 83 | } 84 | 85 | public static IEnumerable Extract(this IEnumerable str, Regex rx, RegExtractOptions options = RegExtractOptions.None) 86 | { 87 | var plan = ExtractionPlan.CreatePlan(rx, options); 88 | return str.Select(s => plan.Extract(rx.Match(s))); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /RegExtract/RegExtract.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | netstandard2.0;netstandard2.1;net40;net10.0 5 | enable 6 | Latest 7 | 8 | 9 | 10 | 11 | Clean & simple, idiomatic C# RegEx-based line parser. 12 | 13 | RegExtract takes a string and a regular expression template and deserializes to a provided type. Works especially well with records and tuples, but can extract to any type that provides either a `static T Parse(string)` method or a single-string constructor (`T(string)`). 14 | 15 | See more docs at project page: https://github.com/sblom/RegExtract 16 | 17 | Scott Blomquist 18 | Copyright © Scott Blomquist. All rights reserved. 
19 | 20 | 21 | https://github.com/sblom/RegExtract 22 | MIT 23 | regex parse parser reader deserialize deserializer valuetuple record deserialization parsing regular-expression regular-expressions regular expression expressions type string tuple linqpad-samples 24 | icon.png 25 | 26 | 27 | 28 | 29 | true 30 | true 31 | true 32 | snupkg 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | true 43 | true 44 | 45 | 46 | 47 | $(GITVERSION_ASSEMBLYSEMVER) 48 | $(GITVERSION_ASSEMBLYFILESEMVER) 49 | 0.0.0.0 50 | 0.0.0.0 51 | 52 | 53 | 54 | 55 | <_Parameter1>RegExtract.Test 56 | 57 | 58 | <_Parameter1>LINQPadQuery 59 | 60 | 61 | 62 | 63 | 64 | false 65 | 66 | 67 | false 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | all 82 | runtime; build; native; contentfiles; analyzers; buildtransitive 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /RegExtract/ExtractionPlanTypeWrapper.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections; 3 | using System.Collections.Generic; 4 | using System.Globalization; 5 | using System.Linq; 6 | using System.Reflection; 7 | using System.Text; 8 | 9 | namespace RegExtract 10 | { 11 | internal class ExtractionPlanTypeWrapper 12 | { 13 | private static Dictionary _typeWrappers = new(); 14 | 15 | public static ExtractionPlanTypeWrapper Wrap(Type? 
type) 16 | { 17 | type ??= typeof(void); 18 | if (_typeWrappers.TryGetValue(type, out var wrap)) 19 | return wrap; 20 | else 21 | return (_typeWrappers[type] = new ExtractionPlanTypeWrapper(type)); 22 | } 23 | 24 | private ExtractionPlanTypeWrapper(Type type) 25 | { 26 | Type = type; 27 | _genericArguments = new Lazy(() => NonNullableType.Type.GetGenericArguments()); 28 | _addMethod = new Lazy(() => Type.GetMethod("Add", BindingFlags.Public | BindingFlags.Instance, null, GenericArguments, null)); 29 | _nonNullableType = new Lazy(() => { var type = Nullable.GetUnderlyingType(Type); return type != null ? ExtractionPlanTypeWrapper.Wrap(type) : this; }); 30 | } 31 | 32 | public Type Type { get; } 33 | 34 | private Lazy _nonNullableType; 35 | public ExtractionPlanTypeWrapper NonNullableType => _nonNullableType.Value; 36 | 37 | private Lazy _genericArguments; 38 | public Type[]? GenericArguments => _genericArguments.Value; 39 | 40 | private bool? _isNullable = null; 41 | public bool IsNullable => _isNullable.HasValue ? _isNullable.Value : ((bool)(_isNullable = Nullable.GetUnderlyingType(Type) != null)); 42 | 43 | private bool? _isTuple = null; 44 | public bool IsTuple => _isTuple.HasValue ? _isTuple.Value : ((bool)(_isTuple = NonNullableType.Type.FullName.StartsWith(VALUETUPLE_TYPENAME))); 45 | 46 | // We use C#'s definition of an initializable collection, which is any type that implements IEnumerable and has a public Add() method. 47 | // In our case, we also require that the Add() method has parameters of the same type as the collection's generic parameters. 48 | private bool? _isInitializableCollection = null; 49 | public bool IsInitializableCollection => _isInitializableCollection.HasValue ? _isInitializableCollection.Value : ((bool)(_isInitializableCollection = IsInitializableCollectionImpl())); 50 | 51 | private Lazy _addMethod; 52 | public MethodInfo? 
AddMethod => _addMethod.Value; 53 | 54 | private bool IsInitializableCollectionImpl() 55 | { 56 | var genericParameters = GenericArguments; 57 | var addMethod = Type.GetMethod("Add", BindingFlags.Public | BindingFlags.Instance, null, genericParameters, null); 58 | 59 | return Type.GetInterfaces().Any(i => i == typeof(IEnumerable)) && addMethod != null; 60 | } 61 | 62 | private ConstructorInfo[]? _constructors = null; 63 | public ConstructorInfo[] Constructors => _constructors ?? (_constructors = NonNullableType.Type.GetConstructors()); 64 | 65 | public bool IsContainerOfSize(int numParams) 66 | { 67 | var constructors = Constructors.Where(cons => cons.GetParameters().Length == numParams); 68 | return constructors.Count() == 1; 69 | } 70 | 71 | public bool IsDirectlyConstructable { 72 | get 73 | { 74 | if (Type == typeof(string)) 75 | { 76 | return true; 77 | } 78 | 79 | if (IsTuple) 80 | { 81 | return false; 82 | } 83 | 84 | var parse = NonNullableType.Type.GetMethod("Parse", 85 | BindingFlags.Static | BindingFlags.Public, 86 | null, 87 | new Type[] { typeof(string) }, 88 | null); 89 | 90 | if (parse is not null) 91 | { 92 | return true; 93 | } 94 | 95 | if (NonNullableType.Type.BaseType == typeof(Enum)) 96 | { 97 | return true; 98 | } 99 | 100 | return false; 101 | } 102 | } 103 | 104 | protected Type[] GetTupleArgumentsList(Type type) 105 | { 106 | var typeArgs = type.GetGenericArguments(); 107 | 108 | if (IsTuple && typeArgs.Length == 8) 109 | { 110 | return typeArgs.Take(7).Concat(GetTupleArgumentsList(typeArgs[7])).ToArray(); 111 | } 112 | else 113 | { 114 | return typeArgs; 115 | } 116 | } 117 | 118 | private const string VALUETUPLE_TYPENAME = "System.ValueTuple`"; 119 | private const string NULLABLE_TYPENAME = "System.Nullable`"; 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio 
temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 
| *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage results 141 | *.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 
192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. 
Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 285 | *.vbw 286 | 287 | # Visual Studio LightSwitch build output 288 | **/*.HTMLClient/GeneratedArtifacts 289 | **/*.DesktopClient/GeneratedArtifacts 290 | **/*.DesktopClient/ModelManifest.xml 291 | **/*.Server/GeneratedArtifacts 292 | **/*.Server/ModelManifest.xml 293 | _Pvt_Extensions 294 | 295 | # Paket dependency manager 296 | .paket/paket.exe 297 | paket-files/ 298 | 299 | # FAKE - F# Make 300 | .fake/ 301 | 302 | # CodeRush personal settings 303 | .cr/personal 304 | 305 | # Python Tools for Visual Studio (PTVS) 306 | __pycache__/ 307 | *.pyc 308 | 309 | # Cake - Uncomment if you are using it 310 | # tools/** 311 | # !tools/packages.config 312 | 313 | # Tabs Studio 314 | *.tss 315 | 316 | # Telerik's JustMock configuration file 317 | *.jmconfig 318 | 319 | # BizTalk build output 320 | *.btp.cs 321 | *.btm.cs 322 | *.odx.cs 323 | *.xsd.cs 324 | 325 | # OpenCover UI analysis results 326 | OpenCover/ 327 | 328 | # Azure Stream Analytics local run output 329 | ASALocalRun/ 330 | 331 | # MSBuild Binary and Structured Log 332 | *.binlog 333 | 334 | # NVidia Nsight GPU debugger 
configuration file 335 | *.nvuser 336 | 337 | # MFractors (Xamarin productivity tool) working folder 338 | .mfractor/ 339 | 340 | # Local History for Visual Studio 341 | .localhistory/ 342 | 343 | # BeatPulse healthcheck temp database 344 | healthchecksdb 345 | 346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 347 | MigrationBackup/ 348 | 349 | # Ionide (cross platform F# VS Code tools) working folder 350 | .ionide/ 351 | 352 | # Verify snapshot test received files (not part of source control) 353 | *.received.txt 354 | -------------------------------------------------------------------------------- /RegExtract/ExtractionPlan.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Reflection; 6 | using System.Text.RegularExpressions; 7 | 8 | using RegExtract.ExtractionPlanNodeTypes; 9 | using RegExtract.RegexTools; 10 | 11 | namespace RegExtract 12 | { 13 | public class ExtractionPlan: IFormattable 14 | { 15 | ExtractionPlanNode Plan { get; set; } 16 | RegexCaptureGroupTree? _tree; 17 | 18 | protected ExtractionPlan() 19 | { 20 | Plan = new UninitializedNode(); 21 | } 22 | 23 | public T Extract(string str) 24 | { 25 | return (T)Plan.Execute(_tree?.Regex.Match(str) ?? 
Regex.Match("",""))!; 26 | } 27 | 28 | public T Extract(Match match) 29 | { 30 | return (T)Plan.Execute(match)!; 31 | } 32 | 33 | static public ExtractionPlan CreatePlan(Regex regex, RegExtractOptions reOptions= RegExtractOptions.None) 34 | { 35 | ExtractionPlan plan = new ExtractionPlan(); 36 | plan.InitializePlan(regex); 37 | 38 | return plan; 39 | } 40 | 41 | internal void InitializePlan(Regex regex) 42 | { 43 | _tree = new RegexCaptureGroupTree(regex); 44 | var type = ExtractionPlanTypeWrapper.Wrap(typeof(T)); 45 | 46 | Plan = AssignTypesToTree(_tree.Tree, type); 47 | } 48 | 49 | 50 | ExtractionPlanNode BindPropertyPlan(RegexCaptureGroupNode tree, ExtractionPlanTypeWrapper type, string name) 51 | { 52 | type = type.NonNullableType; 53 | 54 | // TODO: Figure out how to move this into ExtractionPlanTypeWrapper, and do some caching 55 | var property = type.Type.GetProperty(name); 56 | 57 | if (property is null) 58 | throw new ArgumentException($"Could not find property for named capture group '{name}'."); 59 | 60 | type = ExtractionPlanTypeWrapper.Wrap(property.PropertyType); 61 | 62 | return AssignTypesToTree(tree, type); 63 | } 64 | 65 | ExtractionPlanNode BindConstructorPlan(RegexCaptureGroupNode tree, ExtractionPlanTypeWrapper type, int paramNum, int paramCount) 66 | { 67 | var constructors = type.Constructors 68 | .Where(cons => cons.GetParameters().Length == paramCount); 69 | 70 | if (type.IsInitializableCollection) 71 | { 72 | try 73 | { 74 | type = ExtractionPlanTypeWrapper.Wrap(type.GenericArguments?[paramNum]); 75 | } 76 | catch (IndexOutOfRangeException) 77 | { 78 | throw new ArgumentException($"Capture group '{tree.name}' represents too many parameters for collection {type.Type.FullName}"); 79 | } 80 | } 81 | else if (type.IsTuple) 82 | { 83 | try 84 | { 85 | type = ExtractionPlanTypeWrapper.Wrap(type.GenericArguments?[paramNum]); 86 | } 87 | catch (IndexOutOfRangeException) 88 | { 89 | throw new ArgumentException($"Capture group '{tree.name}' 
represents too many parameters for tuple {type.Type.FullName}"); 90 | } 91 | } 92 | else if (constructors?.Count() == 1) 93 | { 94 | var constructor = constructors.Single(); 95 | 96 | try 97 | { 98 | type = ExtractionPlanTypeWrapper.Wrap(constructor.GetParameters()[paramNum].ParameterType); 99 | } 100 | catch (IndexOutOfRangeException) 101 | { 102 | throw new ArgumentException($"Capture group '{tree.name}' represents too many parameters for constructor {type.Type.FullName}"); 103 | } 104 | } 105 | 106 | return AssignTypesToTree(tree, type); 107 | } 108 | 109 | ExtractionPlanNode BindTupleConstructorPlan(string name, IEnumerable nodes, ExtractionPlanTypeWrapper tupleType) 110 | { 111 | var typeArgs = tupleType.GenericArguments; 112 | 113 | List groups = new(); 114 | 115 | foreach (var (node, type, idx) in nodes.Zip(typeArgs, (n, t) => (n,t)).Select(((x,i) => (x.n, x.t, i)))) 116 | { 117 | if (idx < 7) 118 | { 119 | groups.Add(BindConstructorPlan(node, tupleType, idx, typeArgs?.Length ?? 0)); 120 | } 121 | else 122 | { 123 | groups.Add(BindTupleConstructorPlan(name, nodes.Skip(7), ExtractionPlanTypeWrapper.Wrap(type))); 124 | } 125 | } 126 | 127 | return ExtractionPlanNode.Bind(name, tupleType, groups.ToArray(), new ExtractionPlanNode[0]); 128 | } 129 | 130 | private ExtractionPlanNode AssignTypesToTree(RegexCaptureGroupNode tree, ExtractionPlanTypeWrapper type) 131 | { 132 | List groups = new(); 133 | List namedgroups = new(); 134 | 135 | if (tree.children is [] or [{children: []}] && type.IsDirectlyConstructable) 136 | { 137 | // We're at a leaf in the type hierarchy, and all we need is a string. 138 | // If there's an inner capture group, use it to narrow the match. 
139 | if (tree.children.Length == 1) 140 | { 141 | tree = tree.children.Single(); 142 | } 143 | 144 | return ExtractionPlanNode.BindLeaf(tree.name, type, groups.ToArray(), namedgroups.ToArray()); 145 | } 146 | else if (type.IsTuple) 147 | { 148 | return BindTupleConstructorPlan(tree.name, tree.children, type); 149 | } 150 | else if (type.IsInitializableCollection) 151 | { 152 | var typeParams = type.GenericArguments; 153 | 154 | if (tree.name == "0") 155 | { 156 | return AssignTypesToTree(tree.children.Single(), type); 157 | } 158 | 159 | if ((typeParams?.Length ?? 0) < 2 && !((typeParams?.Length ?? 0) > 0 && ExtractionPlanTypeWrapper.Wrap(typeParams.First()).IsInitializableCollection)) 160 | { 161 | return ExtractionPlanNode.Bind(tree.name, type, new[] { BindConstructorPlan(tree, type, 0, 1) }, new ExtractionPlanNode[0]); 162 | } 163 | 164 | foreach (var node in tree.children) 165 | { 166 | var plan = BindConstructorPlan(node, type, groups.Count, tree.NumberedGroups.Count()); 167 | groups.Add(plan); 168 | } 169 | // TODO: assert that there are no named groups 170 | } 171 | else 172 | { 173 | foreach (var node in tree.children) 174 | { 175 | if (int.TryParse(node.name, out var num)) 176 | { 177 | var plan = BindConstructorPlan(node, type, groups.Count, tree.NumberedGroups.Count()); 178 | groups.Add(plan); 179 | } 180 | else 181 | { 182 | namedgroups.Add(BindPropertyPlan(node, type, node.name)); 183 | } 184 | } 185 | } 186 | 187 | return ExtractionPlanNode.Bind(tree.name, type, groups.ToArray(), namedgroups.ToArray()); 188 | } 189 | 190 | public object ToDump() => this; 191 | 192 | public override string ToString() => 193 | Plan.ShowPlanTree().Replace("\t", "").Replace("\n", ""); 194 | 195 | public string ToString(string? format) => ToString(format, null); 196 | 197 | public string ToString(string? format, IFormatProvider? formatProvider) 198 | { 199 | if (format == "x") 200 | return Plan.ShowPlanTree() + "\n\n" + _tree?.TreeViz() ?? 
""; 201 | else return Plan.ShowPlanTree().Replace("\t", "").Replace("\n", ""); 202 | } 203 | } 204 | } 205 | -------------------------------------------------------------------------------- /RegExtract/ExtractionPlanNodeTypes.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Diagnostics; 4 | using System.Linq; 5 | using System.Reflection; 6 | using System.Text.RegularExpressions; 7 | 8 | namespace RegExtract.ExtractionPlanNodeTypes 9 | { 10 | internal record UninitializedNode() : 11 | ExtractionPlanNode("", ExtractionPlanTypeWrapper.Wrap(typeof(void)), new ExtractionPlanNode[0], new ExtractionPlanNode[0]) 12 | { 13 | internal override object? Execute(Match match, int captureStart, int captureLength, Dictionary cache) 14 | { 15 | throw new InvalidOperationException("Extraction plan was not initialized before execution."); 16 | } 17 | } 18 | 19 | internal record CollectionInitializerNode(string groupName, ExtractionPlanTypeWrapper type, ExtractionPlanNode[] constructorParams, ExtractionPlanNode[] propertySetters) : 20 | ExtractionPlanNode(groupName, type, constructorParams, propertySetters) 21 | { 22 | internal override object? Execute(Match match, int captureStart, int captureLength, Dictionary cache) 23 | { 24 | var genericArgs = type.GenericArguments; 25 | 26 | // TODO: Create a pre-sized collection 27 | var vals = Activator.CreateInstance(type.Type); 28 | var addMethod = type.Type.GetMethod("Add"); 29 | 30 | object?[] itemVals = new object[genericArgs?.Length ?? 
0]; 31 | 32 | var rangeArray = constructorParams.Select(c => Ranges(match, groupName, captureStart, captureLength, cache).GetEnumerator()).ToArray(); 33 | 34 | do 35 | { 36 | for (int i = 0; i < genericArgs?.Length; i++) 37 | { 38 | if (rangeArray[i].MoveNext()) 39 | { 40 | itemVals[i] = constructorParams[i].Execute(match, rangeArray[i].Current.Index, rangeArray[i].Current.Length, cache); 41 | } 42 | else 43 | { 44 | goto no_more; 45 | } 46 | } 47 | addMethod.Invoke(vals, itemVals); 48 | } while (true); 49 | 50 | no_more:; 51 | 52 | foreach (var range in rangeArray) 53 | { 54 | range.Dispose(); 55 | } 56 | 57 | return vals; 58 | } 59 | } 60 | 61 | internal record ConstructTupleNode(string groupName, ExtractionPlanTypeWrapper type, ExtractionPlanNode[] constructorParams, ExtractionPlanNode[] propertySetters) : 62 | ExtractionPlanNode(groupName, type, constructorParams, propertySetters) 63 | { 64 | ConstructorInfo? _constructor = null; 65 | 66 | ConstructorInfo constructor 67 | { 68 | get 69 | { 70 | if (_constructor != null) return _constructor; 71 | else 72 | { 73 | var wrappedType = type.NonNullableType; 74 | return (_constructor = wrappedType.Type.GetConstructor(wrappedType.Type.GetGenericArguments())); 75 | } 76 | } 77 | } 78 | 79 | internal override object? 
Construct(Match match, ExtractionPlanTypeWrapper type, (string Value, int Index, int Length) range, Dictionary cache) 80 | { 81 | type = type.NonNullableType; 82 | var constructor = type.Type.GetConstructor(type.Type.GetGenericArguments()); 83 | 84 | return constructor.Invoke(constructorParams.Select(i => i.Execute(match, range.Index, range.Length, cache)).ToArray()); 85 | } 86 | 87 | internal override void Validate() 88 | { 89 | base.Validate(); 90 | } 91 | } 92 | 93 | internal record ConstructorNode(string groupName, ExtractionPlanTypeWrapper type, ExtractionPlanNode[] constructorParams, ExtractionPlanNode[] propertySetters) : 94 | ExtractionPlanNode(groupName, type, constructorParams, propertySetters) 95 | { 96 | ConstructorInfo? _constructor = null; 97 | 98 | ConstructorInfo constructor 99 | { 100 | get 101 | { 102 | return _constructor ?? type.Constructors.Where(cons => cons.GetParameters().Length == constructorParams.Length).Single(); 103 | } 104 | } 105 | 106 | internal override object? Construct(Match match, ExtractionPlanTypeWrapper type, (string Value, int Index, int Length) range, Dictionary cache) 107 | { 108 | return constructor.Invoke(constructorParams.Select(i => i.Execute(match, range.Index, range.Length, cache)).ToArray()); 109 | } 110 | 111 | internal override void Validate() 112 | { 113 | var constructors = type.Constructors 114 | .Where(cons => cons.GetParameters().Length == constructorParams.Length); 115 | 116 | if (constructors.Count() != 1) 117 | throw new InvalidOperationException($"{nameof(ConstructorNode)} has wrong number of constructor params."); 118 | 119 | base.Validate(); 120 | } 121 | } 122 | 123 | internal record EnumParseNode(string groupName, ExtractionPlanTypeWrapper type, ExtractionPlanNode[] constructorParams, ExtractionPlanNode[] propertySetters) : 124 | ExtractionPlanNode(groupName, type, constructorParams, propertySetters) 125 | { 126 | internal override object? 
Construct(Match match, ExtractionPlanTypeWrapper type, (string Value, int Index, int Length) range, Dictionary cache) 127 | { 128 | return Enum.Parse(type.Type, range.Value); 129 | } 130 | } 131 | 132 | internal record StringConstructorNode(string groupName, ExtractionPlanTypeWrapper type, ExtractionPlanNode[] constructorParams, ExtractionPlanNode[] propertySetters) : 133 | ExtractionPlanNode(groupName, type, constructorParams, propertySetters) 134 | { 135 | ConstructorInfo? _constructor = null; 136 | 137 | ConstructorInfo constructor 138 | { 139 | get 140 | { 141 | return _constructor ?? (type.NonNullableType.Type.GetConstructor(new[] { typeof(string) })); 142 | } 143 | } 144 | 145 | 146 | internal override object? Construct(Match match, ExtractionPlanTypeWrapper type, (string Value, int Index, int Length) range, Dictionary cache) 147 | { 148 | Debug.Assert(type == this.type); 149 | return constructor.Invoke(new[] { range.Value }); 150 | } 151 | 152 | internal override void Validate() 153 | { 154 | var constructor = type.NonNullableType.Type.GetConstructor(new[] { typeof(string) }); 155 | 156 | if (constructor is null || constructorParams.Length != 0) 157 | throw new InvalidOperationException($"{nameof(StringConstructorNode)} has wrong type or constructor params."); 158 | 159 | base.Validate(); 160 | } 161 | } 162 | 163 | internal record StaticParseMethodNode(string groupName, ExtractionPlanTypeWrapper type, ExtractionPlanNode[] constructorParams, ExtractionPlanNode[] propertySetters) : 164 | ExtractionPlanNode(groupName, type, constructorParams, propertySetters) 165 | { 166 | MethodInfo? _parse = null; 167 | 168 | MethodInfo parse 169 | { 170 | get 171 | { 172 | return _parse ?? (_parse = type.NonNullableType.Type 173 | .GetMethod("Parse", 174 | BindingFlags.Static | BindingFlags.Public, 175 | null, 176 | new Type[] { typeof(string) }, 177 | null)); 178 | } 179 | } 180 | 181 | internal override object? 
Construct(Match match, ExtractionPlanTypeWrapper type, (string Value, int Index, int Length) range, Dictionary cache) 182 | { 183 | return parse.Invoke(null, new object[] { range.Value }); 184 | } 185 | 186 | internal override void Validate() 187 | { 188 | var parse = type.NonNullableType.Type.GetMethod("Parse", 189 | BindingFlags.Static | BindingFlags.Public, 190 | null, 191 | new Type[] { typeof(string) }, 192 | null); 193 | 194 | if (parse is null) 195 | throw new InvalidOperationException($"{nameof(StaticParseMethodNode)} has wrong type or constructor params."); 196 | 197 | base.Validate(); 198 | } 199 | } 200 | 201 | internal record StringCastNode(string groupName, ExtractionPlanTypeWrapper type, ExtractionPlanNode[] constructorParams, ExtractionPlanNode[] propertySetters) : 202 | ExtractionPlanNode(groupName, type, constructorParams, propertySetters) 203 | { 204 | internal override object? Construct(Match match, ExtractionPlanTypeWrapper type, (string Value, int Index, int Length) range, Dictionary cache) 205 | { 206 | return range.Value; 207 | } 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /RegExtract/ExtractionPlanNode.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Reflection; 6 | using System.Text; 7 | using System.Text.RegularExpressions; 8 | using RegExtract.ExtractionPlanNodeTypes; 9 | 10 | namespace RegExtract 11 | { 12 | internal record ExtractionPlanNode(string groupName, ExtractionPlanTypeWrapper type, ExtractionPlanNode[] constructorParams, ExtractionPlanNode[] propertyNodes) 13 | { 14 | public string ShowPlanTree() 15 | { 16 | StringBuilder builder = new(); 17 | 18 | builder.Append(this.GetType().Name.Replace("Node","")).Append("<").Append(string.Join(",",FriendlyTypeName(type.Type))).Append(">[").Append(int.TryParse(groupName, 
out var _) ? groupName : '"' + groupName + '"').Append("] ("); 19 | if (constructorParams.Any()) 20 | { 21 | builder.Append("\n"); 22 | builder.Append(string.Join(",\n", constructorParams.Select(param => "\t" + param.ShowPlanTree().Replace("\n", "\n\t")))); 23 | builder.Append("\n)"); 24 | } 25 | else 26 | { 27 | builder.Append(")"); 28 | } 29 | if (propertyNodes.Any()) 30 | { 31 | builder.Append(" {\n"); 32 | builder.Append(string.Join(",\n", propertyNodes.Select(param => "\t" + param.groupName + " = " + param.ShowPlanTree().Replace("\n", "\n\t")))); 33 | builder.Append("\n}"); 34 | } 35 | 36 | return builder.ToString(); 37 | } 38 | 39 | string FriendlyTypeName(Type type) 40 | { 41 | var keyword = type.Name switch 42 | { 43 | "Byte" => "byte", 44 | "SByte" => "sbyte", 45 | "Float" => "float", 46 | "Double" => "double", 47 | "Decimal" => "decimal", 48 | "Int16" => "short", 49 | "UInt16" => "ushort", 50 | "Int32" => "int", 51 | "UInt32" => "uint", 52 | "Int64" => "long", 53 | "UInt64" => "ulong", 54 | "Char" => "char", 55 | "String" => "string", 56 | _ => null 57 | }; 58 | 59 | if (keyword is not null) return keyword; 60 | 61 | if (Nullable.GetUnderlyingType(type) is Type nonNullableType) return FriendlyTypeName(nonNullableType) + "?"; 62 | 63 | var args = type.GetGenericArguments(); 64 | 65 | if (type.FullName.StartsWith(VALUETUPLE_TYPENAME)) return "(" + String.Join(",", args.Select(arg => FriendlyTypeName(arg))) + ")"; 66 | 67 | if (args.Any()) 68 | { 69 | return type.Name.Split('`')[0] + "<" + String.Join(",", args.Select(arg => FriendlyTypeName(arg))) + ">"; 70 | } 71 | 72 | else return type.Name; 73 | } 74 | 75 | internal static ExtractionPlanNode Bind(string groupName, ExtractionPlanTypeWrapper type, ExtractionPlanNode[] constructorParams, ExtractionPlanNode[] propertySetters) 76 | { 77 | var innerType = type.NonNullableType; 78 | 79 | var multiConstructor = innerType.Constructors 80 | .Where(cons => cons.GetParameters().Length == constructorParams.Length); 
81 | 82 | var staticParseMethod = innerType.Type.GetMethod("Parse", 83 | BindingFlags.Static | BindingFlags.Public, 84 | null, 85 | new Type[] { typeof(string) }, 86 | null); 87 | 88 | var stringConstructor = innerType.Type.GetConstructor(new[] { typeof(string) }); 89 | 90 | 91 | 92 | ExtractionPlanNode node; 93 | 94 | if (type.IsInitializableCollection) 95 | node = new CollectionInitializerNode(groupName, type, constructorParams, propertySetters); 96 | else if (innerType.IsTuple) 97 | node = new ConstructTupleNode(groupName, type, constructorParams, propertySetters); 98 | else if (multiConstructor.Count() == 1 && (constructorParams.Any() || propertySetters.Any())) 99 | node = new ConstructorNode(groupName, type, constructorParams, propertySetters); 100 | else if (!constructorParams.Any() && !propertySetters.Any() && stringConstructor != null) 101 | node = new StringConstructorNode(groupName, type, new ExtractionPlanNode[0], new ExtractionPlanNode[0]); 102 | else 103 | throw new ArgumentException("Couldn't find appropriate constructor for type."); 104 | 105 | node.Validate(); 106 | 107 | return node; 108 | } 109 | 110 | internal static ExtractionPlanNode BindLeaf(string groupName, ExtractionPlanTypeWrapper type, ExtractionPlanNode[] constructorParams, ExtractionPlanNode[] propertySetters) 111 | { 112 | var innerType = type.NonNullableType; 113 | 114 | var staticParseMethod = innerType.Type.GetMethod("Parse", 115 | BindingFlags.Static | BindingFlags.Public, 116 | null, 117 | new Type[] { typeof(string) }, 118 | null); 119 | 120 | var stringConstructor = innerType.Type.GetConstructor(new[] { typeof(string) }); 121 | 122 | 123 | 124 | ExtractionPlanNode node; 125 | 126 | if (innerType.IsTuple) 127 | throw new ArgumentException("Tuple in type cannot be bound to leaf of regex capture group tree."); 128 | else if (staticParseMethod is not null) 129 | node = new StaticParseMethodNode(groupName, type, constructorParams, propertySetters); 130 | else if (stringConstructor is 
not null) 131 | node = new StringConstructorNode(groupName, type, constructorParams, propertySetters); 132 | else if (innerType.Type.BaseType == typeof(Enum)) 133 | node = new EnumParseNode(groupName, type, constructorParams, propertySetters); 134 | else 135 | node = new StringCastNode(groupName, type, constructorParams, propertySetters); 136 | 137 | node.Validate(); 138 | 139 | return node; 140 | } 141 | 142 | 143 | internal virtual void Validate() 144 | { 145 | return; 146 | } 147 | 148 | internal virtual object? Construct(Match match, ExtractionPlanTypeWrapper type, (string Value, int Index, int Length) range, Dictionary cache) 149 | { 150 | throw new InvalidOperationException("Can't construct a node based on base ExtractionPlanNode type."); 151 | } 152 | 153 | protected IEnumerable<(string Value, int Index, int Length)> Ranges(Match match, string groupName, int captureStart, int captureLength, Dictionary cache) 154 | { 155 | if (!cache.ContainsKey(groupName)) 156 | { 157 | cache[groupName] = AsEnumerable(match.Groups[groupName].Captures) 158 | .Select(cap => (cap.Value, cap.Index, cap.Length)) 159 | .ToArray(); 160 | } 161 | return cache[groupName].Where(cap => cap.Index >= captureStart && cap.Index + cap.Length <= captureStart + captureLength); 162 | } 163 | 164 | internal virtual object? Execute(Match match, int captureStart, int captureLength, Dictionary cache) 165 | { 166 | object? 
result = null; 167 | 168 | var ranges = Ranges(match, groupName, captureStart, captureLength, cache).ToArray(); 169 | 170 | if (!ranges.Any()) 171 | { 172 | if (type.Type.IsClass || Nullable.GetUnderlyingType(type.Type) != null) return null; 173 | else return Convert.ChangeType(null, type.Type); 174 | } 175 | else 176 | { 177 | var lastRange = ranges.Last(); 178 | 179 | result = Construct(match, type.NonNullableType, lastRange, cache); 180 | 181 | if (result is not null) 182 | { 183 | foreach (var prop in propertyNodes) 184 | { 185 | result.GetType().GetProperty(prop.groupName).GetSetMethod().Invoke(result, new[] { prop.Execute(match, lastRange.Index, lastRange.Length, cache) }); 186 | } 187 | } 188 | } 189 | 190 | return result; 191 | } 192 | 193 | internal object? Execute(Match match) 194 | { 195 | if (!match.Success) 196 | { 197 | throw new ArgumentException("Regex didn't match."); 198 | } 199 | 200 | Dictionary cache = new(); 201 | 202 | return Execute(match, match.Groups[0].Index, match.Groups[0].Length, cache); 203 | } 204 | 205 | protected const string VALUETUPLE_TYPENAME = "System.ValueTuple`"; 206 | protected const string NULLABLE_TYPENAME = "System.Nullable`"; 207 | 208 | protected static IEnumerable AsEnumerable(CaptureCollection cc) 209 | { 210 | foreach (Capture c in cc) 211 | { 212 | yield return c; 213 | } 214 | } 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /RegExtract/RegexCaptureGroupTree.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Diagnostics; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Text.RegularExpressions; 7 | 8 | namespace RegExtract.RegexTools 9 | { 10 | internal record RegexCaptureGroupNode(string name, RegexCaptureGroupNode[] children, ((int start, int length) range, string text) substring) 11 | { 12 | public IEnumerable NamedGroups => 
children.Where(node => !int.TryParse(node.name, out var _)); 13 | public IEnumerable NumberedGroups => children.Where(node => int.TryParse(node.name, out var _)); 14 | } 15 |

    /// <summary>
    /// Parses the pattern text of a <see cref="System.Text.RegularExpressions.Regex"/> into a tree of
    /// <see cref="RegexCaptureGroupNode"/>s that mirrors how the pattern's capture groups nest.
    /// </summary>
    /// <remarks>
    /// The scanner is hand-written: it tracks backslash escapes, character classes (including
    /// <c>-[...]</c> subtraction), non-capturing <c>(?...)</c> constructs, and named groups written as
    /// <c>(?&lt;name&gt;...)</c> or <c>(?'name'...)</c>.
    /// NOTE(review): <see cref="RegexOptions"/> are not consulted while scanning, so options that change
    /// group numbering or pattern syntax are presumably unsupported - confirm against callers.
    /// </remarks>
    internal class RegexCaptureGroupTree
    {
        /// <summary>
        /// Names of every capture group found, starting with the implicit group <c>"0"</c>.
        /// After parsing, numbered groups come first, followed by named groups in order of appearance.
        /// </summary>
        public List<string> Groups { get; private set; } = new() { "0" };

        /// <summary>The regular expression whose pattern was parsed.</summary>
        public Regex Regex { get; private set; }
        private string _regexString;

        /// <summary>Root node (group <c>"0"</c>) of the capture group tree; built on demand if missing.</summary>
        public RegexCaptureGroupNode Tree
        {
            get
            {
                if (_tree is null)
                {
                    InitializeTree();
                }
                return _tree!;
            }
            private set
            {
                _tree = value;
            }
        }
        private RegexCaptureGroupNode? _tree;

        /// <summary>Builds the capture group tree for an existing <see cref="System.Text.RegularExpressions.Regex"/>.</summary>
        /// <param name="rx">Regex whose pattern (<c>rx.ToString()</c>) is scanned.</param>
        public RegexCaptureGroupTree(Regex rx)
        {
            Regex = rx;
            _regexString = rx.ToString();
            InitializeTree();
        }

        /// <summary>Compiles <paramref name="rx"/> into a <see cref="System.Text.RegularExpressions.Regex"/> and builds its capture group tree.</summary>
        /// <param name="rx">Regular expression pattern.</param>
        public RegexCaptureGroupTree(string rx) : this(new Regex(rx))
        {
            _regexString = rx;
        }

        /// <summary>Parses the whole pattern starting at offset 0 as group <c>"0"</c>.</summary>
        private void InitializeTree()
        {
            int loc = 0, num = 0;
            Tree = BuildCaptureGroupTree(ref loc, ref num, 0);
        }

        /// <summary>True for characters accepted after the first character of a group name.</summary>
        private static bool IsGroupNameChar(char c) => char.IsLetterOrDigit(c) || c == '_';

        /// <summary>
        /// Scans a <c>{n}</c>, <c>{n,}</c> or <c>{n,m}</c> quantifier.
        /// </summary>
        /// <param name="openBrace">Index of the <c>{</c>.</param>
        /// <returns>Index of the closing <c>}</c>, or -1 when the text is not a well-formed quantifier.</returns>
        private int ScanBraceQuantifier(int openBrace)
        {
            int i = openBrace + 1;
            int firstDigit = i;

            while (i < _regexString.Length && _regexString[i] is >= '0' and <= '9') i++;
            if (i == firstDigit) return -1; // at least one digit is required before ',' or '}'

            if (i < _regexString.Length && _regexString[i] == ',')
            {
                i++;
                while (i < _regexString.Length && _regexString[i] is >= '0' and <= '9') i++;
            }

            return i < _regexString.Length && _regexString[i] == '}' ? i : -1;
        }

        /// <summary>
        /// Finds where a group that closes at <paramref name="closeParen"/> really ends once an optional
        /// quantifier (<c>?</c>, <c>+</c>, <c>*</c>, <c>{...}</c>) and an optional trailing lazy <c>?</c>
        /// are taken into account.
        /// </summary>
        /// <param name="closeParen">Index of the group's <c>)</c>.</param>
        /// <returns>Index of the last character belonging to the group; <paramref name="closeParen"/> when no quantifier follows.</returns>
        private int FindQuantifierEnd(int closeParen)
        {
            int next = closeParen + 1;
            if (next >= _regexString.Length) return closeParen;

            int end;
            if (_regexString[next] is '?' or '+' or '*')
            {
                end = next;
            }
            else if (_regexString[next] is '{')
            {
                end = ScanBraceQuantifier(next);
                // A '{' that does not start a well-formed quantifier is a literal and is not part of the group.
                if (end < 0) return closeParen;
            }
            else
            {
                return closeParen;
            }

            if (end + 1 < _regexString.Length && _regexString[end + 1] is '?') end++;
            return end;
        }

        /// <summary>
        /// Recursively parses one capture group (or, for group <c>"0"</c>, the whole pattern).
        /// </summary>
        /// <param name="loc">Scan position; on entry the first character inside the group, on return the group's last character (its <c>)</c> or the end of its quantifier).</param>
        /// <param name="num">Running count of unnamed groups seen so far; incremented for each new one.</param>
        /// <param name="start">Index of the group's opening <c>(</c> (0 for the root).</param>
        /// <param name="name">Group name for named groups; <c>null</c> to use the current value of <paramref name="num"/>.</param>
        /// <returns>The node for this group, with its nested groups as children.</returns>
        /// <exception cref="Exception">An invalid group name, unbalanced parentheses, or an unterminated character class.</exception>
        private RegexCaptureGroupNode BuildCaptureGroupTree(ref int loc, ref int num, int start, string? name = null)
        {
            string myname = name ?? num.ToString();
            List<RegexCaptureGroupNode> children = new();

            int charGroupLevels = 0; // nesting depth inside [...] (subtraction can nest classes)
            bool escape = false;     // the next character is escaped and must be skipped
            int nameStart = -1;      // index of the first character of a group name being read, else -1
            int ignoreGroups = 0;    // open (?...) constructs that do not capture
            char openchar = ' ';     // '<' or '\'' that opened the name being read
            int groupStart = 0;      // index of the most recent '('

            for (; loc < _regexString.Length; loc++)
            {
                if (escape)
                {
                    escape = false;
                    continue;
                }

                if (nameStart != -1)
                {
                    char c = _regexString[loc];
                    if ((openchar == '<' && c == '>') || (openchar == '\'' && c == '\''))
                    {
                        loc++;
                        var parsedName = _regexString.Substring(nameStart, loc - nameStart - 1);
                        Groups.Add(parsedName);
                        children.Add(BuildCaptureGroupTree(ref loc, ref num, groupStart, parsedName));
                        nameStart = -1;
                        continue;
                    }

                    // Validate the character being read. This used to test the first character of the name
                    // instead, which wrongly rejected names beginning with '_' and accepted everything else.
                    if (!IsGroupNameChar(c)) throw new Exception("Group Name must be a valid C identifier.");
                    continue;
                }

                if (charGroupLevels > 0)
                {
                    if (_regexString[loc] == '\\')
                    {
                        escape = true;
                        continue;
                    }
                    if (_regexString[loc] == '-' && _regexString[loc + 1] == '[')
                    {
                        // Character class subtraction: step into the nested class. The character right after
                        // '[' (or '[^') is consumed here so that a leading ']' is treated as a literal.
                        loc += 2;
                        if (_regexString[loc] == '^') loc++;
                        if (_regexString[loc] == '\\') escape = true;
                        charGroupLevels++;
                        continue;
                    }
                    else if (_regexString[loc] == ']') charGroupLevels--;
                    continue;
                }

                switch (_regexString[loc])
                {
                    case '\\':
                        escape = true;
                        break;
                    case '(':
                        groupStart = loc;
                        if (_regexString[loc + 1] == '?')
                        {
                            // ? may be followed by lookbehind (which starts out looking like a <> named group) or something that's clearly not a named group
                            if ((_regexString[loc + 2] == '<' && (_regexString[loc + 3] == '=' || _regexString[loc + 3] == '!')) || (_regexString[loc + 2] != '<' && _regexString[loc + 2] != '\''))
                            {
                                ignoreGroups++;
                                continue;
                            }
                            // otherwise, it's a named group
                            else
                            {
                                openchar = _regexString[loc + 2];
                                loc += 3;
                                nameStart = loc;
                                if (!char.IsLetter(_regexString[nameStart]) && _regexString[nameStart] != '_') throw new Exception("Group Name must be a valid C identifier.");
                            }
                        }
                        else
                        {
                            num++;
                            loc++;
                            Groups.Add(num.ToString());
                            children.Add(BuildCaptureGroupTree(ref loc, ref num, groupStart));
                        }
                        break;
                    case ')':
                        if (ignoreGroups > 0)
                        {
                            ignoreGroups--;
                            continue;
                        }
                        else
                        {
                            if (myname == "0") throw new Exception("Too many close parens.");

                            // The group's text includes any quantifier that directly follows the ')'.
                            loc = FindQuantifierEnd(loc);
                            return new RegexCaptureGroupNode(myname, children.ToArray(), ((start, loc - start + 1), _regexString.Substring(start, loc - start + 1)));
                        }
                    case '[':
                        // Consume the character right after '[' (or '[^') so that a leading ']' is a literal.
                        loc++;
                        if (_regexString[loc] == '^') loc++;
                        if (_regexString[loc] == '\\') escape = true;
                        charGroupLevels++;
                        break;
                    default:
                        break;
                }
            }

            // TODO: These should probably be asserts, because Regex has validated everything after constructor is complete.
            if (loc > _regexString.Length) throw new Exception("Parser over-ran end of regex string.");
            if (myname != "0") throw new Exception("Not enough close parens.");
            if (charGroupLevels > 0) throw new Exception("Unterminated char group.");

            // Stable sort: numbered groups first, then named groups, each in order of appearance.
            Groups = Groups.OrderBy(name => int.TryParse(name, out var _) ? 0 : 1).ToList();
            Debug.Assert(Groups.Zip(Regex.GetGroupNames(), (a, b) => a == b).All(b => b), "Group List doesn't match Regex.GetGroupNames()");

            return new RegexCaptureGroupNode(myname, children.ToArray(), ((start, loc - start), _regexString.Substring(start, loc - start)));
        }

        /// <summary>Renders a non-negative integer with Unicode subscript digits (e.g. 12 becomes "₁₂").</summary>
        /// <exception cref="InvalidOperationException"><paramref name="i"/> is negative.</exception>
        string IntToSubscripts(int i)
        {
            if (i < 0) throw new InvalidOperationException("That's impossible!");

            // '₀'..'₉' are consecutive code points, so each decimal digit maps by offset.
            return string.Concat(i.ToString().Select(digit => (char)('₀' + (digit - '0'))));
        }

        /// <summary>
        /// Renders the capture group tree as text: the pattern on the first line, and below it each nested
        /// group written under arrows that mark where it sits in its parent.
        /// </summary>
        public string TreeViz() => String.Join("\n", TreeViz(Tree));

        /// <summary>Renders one node and its descendants as a block of lines.</summary>
        /// <param name="tree">Node to render.</param>
        /// <returns>The lines of the block; the first line is the node itself.</returns>
        string[] TreeViz(RegexCaptureGroupNode tree)
        {
            var text = tree.substring.text;

            // Numbered groups are tagged with their number in subscript; named groups get no tag.
            var tag = int.TryParse(tree.name, out var num) ? IntToSubscripts(num) : "";

            if (!tree.children.Any())
            {
                return new[] { $"{text}{tag}" };
            }

            var left = tree.substring.range.start;
            var right = left + tree.substring.range.length;

            var blocks = tree.children.Select(child => TreeViz(child)).ToArray();
            var height = blocks.Max(block => block.Length);

            // Pad shorter blocks with blank lines so every block has the same number of rows.
            for (int i = 0; i < blocks.Length; i++)
            {
                if (blocks[i].Length != height)
                {
                    var blank = new string(' ', blocks[i][0].Length);
                    blocks[i] = blocks[i].Concat(Enumerable.Repeat(blank, height - blocks[i].Length)).ToArray();
                }
            }

            // Pattern text before the first child appears on the top line and as blanks on the rows below.
            var leadWidth = tree.children[0].substring.range.start - left;
            var topline = text.Substring(0, leadWidth);
            var rows = Enumerable.Repeat(new string(' ', leadWidth), height).ToArray();

            for (int i = 0; i < blocks.Length; i++)
            {
                var child = tree.children[i];
                var childEnd = child.substring.range.start + child.substring.range.length;
                var nextStart = i == blocks.Length - 1 ? right : tree.children[i + 1].substring.range.start;
                var gap = new string(' ', nextStart - childEnd);

                for (int j = 0; j < height; j++)
                {
                    rows[j] += blocks[i][j] + gap;
                }

                // On the top line the child is replaced by an arrow span as wide as its rendered block,
                // followed by the parent's own text up to the next child (or the end of the parent).
                var width = blocks[i][0].Length;
                topline += width < 2 ? new string('↓', width) : "↓" + new string('-', width - 2) + "↓";
                topline += text.Substring(childEnd - left, nextStart - childEnd);
            }

            // Rows below the top line are padded so they stay as wide as the tagged top line.
            var tagPad = new string(' ', tag.Length);
            return new[] { $"{topline}{tag}" }.Concat(rows.Select(row => row + tagPad)).ToArray();
        }
    }
}
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RegExtract 2 | Quick and dirty idiomatic C# line parser that extracts text into practical data types. 
3 | 4 | [![dotnet](https://github.com/sblom/RegExtract/workflows/dotnet/badge.svg)](https://github.com/sblom/RegExtract/actions) 5 | [![NuGet](https://img.shields.io/nuget/v/RegExtract.svg)](https://www.nuget.org/packages/RegExtract/) 6 | [![Downloads](https://img.shields.io/nuget/dt/RegExtract.svg)](https://www.nuget.org/packages/RegExtract/) 7 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) 8 | 9 | ## Table of Contents 10 | - [RegExtract](#regextract) 11 | - [Table of Contents](#table-of-contents) 12 | - [Release History (newest first)](#release-history-newest-first) 13 | - [Using RegExtract](#using-regextract) 14 | - [Basic extraction to `ValueTuple`](#basic-extraction-to-valuetuple) 15 | - [Extracting from multiple input strings (`IEnumerable`)](#extracting-from-multiple-input-strings-ienumerablestring) 16 | - [Extracting to collections (such as `List`)](#extracting-to-collections-such-as-listt) 17 | - [Nullable types](#nullable-types) 18 | - [Nesting compound types (such as tuples) and collections](#nesting-compound-types-such-as-tuples-and-collections) 19 | - [Tuple that contains Collections](#tuple-that-contains-collections) 20 | - [Collection that contains a tuple](#collection-that-contains-a-tuple) 21 | - [Collection that contains a collection](#collection-that-contains-a-collection) 22 | - [Collections with more than one argument (including `Dictionary<,>`)](#collections-with-more-than-one-argument-including-dictionary) 23 | - [`record`s and other compound types](#records-and-other-compound-types) 24 | - [Extracting named capture groups to properties](#extracting-named-capture-groups-to-properties) 25 | - [Other supported types](#other-supported-types) 26 | - [Including REGEXTRACT\_REGEX\_PATTERN templates on types](#including-regextract_regex_pattern-templates-on-types) 27 | - [Performance and troubleshooting](#performance-and-troubleshooting) 28 | - [Creating a re-usable `ExtractionPlan`](#creating-a-re-usable-extractionplan) 
29 | - [Inspecting an extraction plan](#inspecting-an-extraction-plan) 30 | - [Regular Expression reference](#regular-expression-reference) 31 | - [History](#history) 32 | 33 | ## Release History (newest first) 34 | 35 | |Release Number |Release Date | Main Features | 36 | |--|--|--| 37 | | 3.0 | FUTURE ROADMAP | Source Generator support to eliminate run-time reflection 38 | | 2.1 | December 15, 2023 | Added caching for up to 6x speedup
Made tuples less magic | 39 | | 2.0 | December 14, 2023 | Rewrote planning engine with better Collections support | 40 | | 1.0 | December 20, 2020 | First modern release with tree-based extraction planner | 41 | | 0.9 | early December 2020 | Pre-release prototypes | 42 |
43 | History of pre-release versions 44 | 45 | |Release Number |Release Date | Main Features | 46 | |--|--|--| 47 | | 0.9.24 | December 2020 | Extraction planner fully operational | 48 | | 0.9.19 | December 2020 | Prototype extraction planner to support nested types | 49 | | 0.9.16 | December 2020 | Add support for REGEXTRACT_REGEX_PATTERN templates | 50 | | 0.9.11 | December 2020 | Add support for Enums | 51 | | 0.9.10 | December 2020 | More support for Lists | 52 | | 0.9.6 | December 2020 | Add support for Lists and Nullables | 53 | | 0.9.4 | December 2020 | Add support for named capture groups initializing properties | 54 | | 0.9.2 | December 2020 | Add positional records | 55 | | 0.9 | December 2020 | Extract capture groups to tuples, and that's all | 56 | 57 |
58 | 59 | # Using RegExtract 60 | 61 | ## Basic extraction to `ValueTuple` 62 | 63 | Let's say you have a string `2-10 c: abcdefghi`, consisting of two `int`s separated by a dash (-), a `char` followed by a colon (:), and a `string`. 64 | 65 | You could use the regular expression `@"(\d+)-(\d+) (.): (.*)"` to extract that into a tuple `(int min, int max, char ch, string str)`. 66 | Or you could use `@"((\d+)-(\d+)) (.): (.*)"` to extract into a nested tuple `((int min, int max) range, char ch, string str)`. 67 | 68 | > [!TIP] 69 | > If you need a primer on helpful regular expression syntax, see the [Regular Expression reference](#regular-expression-reference) section below. 70 | 71 | In C# code, those two examples would look like: 72 | 73 | ```cs 74 | using RegExtract; 75 | 76 | var input = "2-10 c: abcdefghi"; 77 | 78 | var flat_tuple = input.Extract<( int min, int max, char ch, string str)>(@"(\d+)-(\d+) (.): (.*)"); 79 | var nested_tuple = input.Extract<((int min, int max) range, char ch, string str)>(@"((\d+)-(\d+)) (.): (.*)"); 80 | ``` 81 | 82 | > [!NOTE] 83 | > The nesting of your capture groups (parts wrapped in `()`) in your regular expression must match the nesting of your type hierarchy. 84 | 85 | ## Extracting from multiple input strings (`IEnumerable<string>`) 86 | 87 | There are many variations on RegExtract extension methods, but there are two that you will use most often. 88 | The first one is the `.Extract<T>()` method demonstrated above. 89 | It's an extension method on `string`, and returns a single fully constructed instance of your type hierarchy `T`. 90 | The other one is very similar, but it accepts any `IEnumerable<string>`. 
91 | 92 | Here's an example of using the `IEnumerable<string>` extension method: 93 | 94 | ```cs 95 | using RegExtract; 96 | 97 | var inputs = new[] { 98 | "2-10 c: abcdefghi", 99 | "3-7 e: qwertyuiop" 100 | }; 101 | 102 | IEnumerable<(int,int,char,string)> results = 103 | inputs.Extract<( int min, int max, char ch, string str)>(@"(\d+)-(\d+) (.): (.*)"); 104 | ``` 105 | 106 | Notice that the actual `.Extract<>()` call looks nearly identical to the version that takes a single `string`. 107 | This makes it trivial to switch between extracting a single instance and extracting from each string in an `IEnumerable<string>`. 108 | 109 | > [!TIP] 110 | > These are the two most common RegExtract methods to use, but if you're going to be using the same extraction plan multiple times, you should first [create a reusable `ExtractionPlan`](#creating-a-re-usable-extractionplan), so that RegExtract only has to parse your regular expression and type hierarchy once. 111 | 112 | ## Extracting to collections (such as `List<T>`) 113 | 114 | In addition to arbitrarily long `ValueTuple`s as demonstrated above, RegExtract supports any collection type that works with [C#'s Collection Initializer syntax](https://learn.microsoft.com/en-us/dotnet/csharp/programming-guide/classes-and-structs/object-and-collection-initializers#collection-initializers) (commonly `List<>`, `HashSet<>`, and `Dictionary<,>`). 115 | 116 | > [!TIP] 117 | > C# collection initializers work with *any collection type that implements `IEnumerable` [the non-generic one, in particular] and has Add with the appropriate signature*. 118 | 119 | To extract a list, you should include a capture group in your regular expression that repeats. 
120 | For example, to break a sentence up into individual words, you can do something like: 121 | 122 | ```cs 123 | using RegExtract; 124 | 125 | var input = "The quick brown fox jumps over the lazy dog."; 126 | 127 | var words_with_trailing_spaces = input.Extract<List<string>>(@"(\w+ ?)+"); 128 | var words_without_spaces = input.Extract<List<string>>(@"((\w+) ?)+"); 129 | ``` 130 | 131 | Notice in the first example (`words_with_trailing_spaces`), there is only one capture group, and everything inside it is treated as part of the match. 132 | As a result, the strings in the List include trailing spaces (except for "dog", which stopped matching before the trailing period (.)). 133 | 134 | In the second example (`words_without_spaces`), an optional final set of parens was included immediately around `\w+`. 135 | As a result, the strings in the second list will only include the words themselves without trailing spaces. 136 | 137 | > [!NOTE] 138 | > As illustrated by the `words_without_spaces` example, you can always optionally include an extra capture group to capture only a relevant subpart inside the repeating capture group of any **Collection** type. 139 | > 140 | > This is useful if the repeated capture group includes optional separators such as spaces, commas, semicolons, etc., and it allows you to include only the interesting part without the separator. 141 | 142 | ## Nullable types 143 | 144 | Any time a type hierarchy expects a value but there's no corresponding Capture (because of an optional capture group, for example), RegExtract considers the extracted value to be `null`. 145 | For reference types, this works exactly how you'd expect. 146 | For value types, you'll get an `InvalidCastException ("Null object cannot be converted to a value type.")` unless you have marked the value type as nullable in your type hierarchy. 147 | 148 | > [!NOTE] 149 | > Collection types will always be constructed and will never be extracted as a `null`. 
150 | > Unlike missing Captures for non-collection values, when there are no matches the collection will simply be empty. 151 | 152 | An example of extracting to a nullable type (or not): 153 | 154 | ```csharp 155 | using RegExtract; 156 | 157 | // This will succeed because int? can be null. 158 | var nullable = "".Extract<int?>(@"(\d+)"); 159 | // ^Nullable 160 | 161 | // This will throw an exception because \d+ doesn't match anything 162 | // and the int value is required by the type system. 163 | var not_nullable = "".Extract<int>(@"(\d+)"); 164 | // ^Not nullable 165 | ``` 166 | 167 | > [!TIP] 168 | > You can use nullable types in combination with the regular expression alternation operator (`|`) to extract to a different type depending on the details of the match. 169 | 170 | An example of using nullable types to support regular expression alternation (`|`): 171 | ```csharp 172 | var (n,s) = "str".Extract<(int?,string)>(@"(\d+)|(.*)"); 173 | ``` 174 | 175 | ## Nesting compound types (such as tuples) and collections 176 | 177 | You'll frequently find that you need to nest a **compound type** (such as a tuple) inside a **collection** or that you need to nest a **collection** inside a **compound type**. 178 | RegExtract can handle arbitrarily deeply nested mixes of any supported data types. 179 | 180 | ### Tuple that contains Collections 181 | ```csharp 182 | using RegExtract; 183 | 184 | var input = "Item #1: 27 61 49 58 44 2 69 78"; 185 | 186 | var result = input.Extract<(int itemno, HashSet<int> set)>(@"Item #(\d+): (\d+ ?)+"); 187 | ``` 188 | 189 | > [!TIP] 190 | > As you can see in this example, a `HashSet<>` works just like a `List<>`. 191 | 192 | ### Collection that contains a tuple 193 | ```csharp 194 | using RegExtract; 195 | 196 | var input = "red 10, blue 25, green 12, yellow 19"; 197 | 198 | var result = input.Extract>(@"((\w+) (\d+),? 
?)+"); 199 | ``` 200 | 201 | ### Collection that contains a collection 202 | ```csharp 203 | using RegExtract; 204 | 205 | var input = "The quick brown fox jumps over the lazy dog"; 206 | 207 | var result = input.Extract<List<List<char>>>(@"((\w)+ ?)+"); 208 | ``` 209 | 210 | ## Collections with more than one argument (including `Dictionary<,>`) 211 | 212 | C# collection initializers will work with `.Add()` methods that take more than one parameter, such as the `.Add(TKey key, TValue value)` that `Dictionary<,>` implements. 213 | RegExtract doesn't have the benefit of inferring generic type arguments from examples of parameters, however, since everything is a `string` before extraction. 214 | So, instead, RegExtract will only consider an `.Add()` method whose parameter types match the generic arguments `TKey` and `TValue`. 215 | 216 | Example using a `Dictionary<string, int>`: 217 | ```csharp 218 | using RegExtract; 219 | 220 | var input = "red 10, blue 25, green 12, yellow 19"; 221 | 222 | var result = input.Extract<Dictionary<string, int>>(@"((\w+) (\d+),? ?)+"); 223 | ``` 224 | 225 | > [!TIP] 226 | > RegExtract doesn't yet support having, for example, a capture group with the `value` before the `key`. 227 | > (They have to be in the order that the collection's `.Add()` method expects them.) 228 | > 229 | > You can work around this by capturing to a `List<(TValue value, TKey key)>` and then using `list.ToDictionary(vk => vk.key, vk => vk.value)` to convert to a `Dictionary<,>` that's organized the way you want. 230 | 231 | 232 | ## `record`s and other compound types 233 | 234 | You can build almost anything you need using `ValueTuple`s and `List<>`s, and for simple, ad hoc scenarios that's often where I begin and end. 
235 | 236 | However, when it comes time to extract inputs to more richly modeled types, you'll use RegExtract's support for types such as `record`s that have a single obvious constructor (some might say a [primary constructor!](https://learn.microsoft.com/en-us/dotnet/csharp/programming-guide/classes-and-structs/instance-constructors#primary-constructors)) with the number of parameters corresponding to the number of capture groups in your regular expression. 237 | (Strictly speaking, it doesn't have to be a record, and you don't have to use primary constructor syntax—absolutely any type with a constructor of the right shape is fine.) 238 | 239 | > [!NOTE] 240 | > For custom compound types such as `record`s or `struct`s or `class`es, RegExtract looks for a single public constructor that takes the same number of arguments as the number of capture groups nested inside the compound type's capture group. 241 | > 242 | > It then uses the types of the constructor arguments to determine what types to construct for the nested capture groups. 243 | 244 | Here's an example using a couple of nested `record` types and `List`s: 245 | ```csharp 246 | using RegExtract; 247 | 248 | var input = "Game 14: 9 green, 4 red; 6 blue, 1 red, 7 green; 3 blue, 5 green"; 249 | 250 | var game = input.Extract<Game>(@"Game (\d+): (((\d+) (\w+),? ?)+;? ?)+"); 251 | 252 | record Game(int id, List<Draw> draws); 253 | record Draw(List<(int count, string color)> colors); 254 | ``` 255 | 256 | ## Extracting named capture groups to properties 257 | 258 | All of the examples of compound types so far make use of constructors with positional semantics. 259 | RegExtract uses typical (non-named) capture groups as parameters destined for a tuple slot or a constructor parameter. 260 | 261 | Regular expressions also support named capture groups. 262 | They look like `(?<name>pattern_goes_here)`. 
263 | When RegExtract encounters a named capture group, the captures from it are used to call a property setter on the type being extracted after the type is fully constructed from (non-named) positional capture groups. 264 | 265 | A simple example: 266 | ```csharp 267 | using RegExtract; 268 | 269 | var input = 270 | 271 | var result = input.Extract