├── .gitignore ├── LICENSE ├── README.md ├── nuget.config ├── scripts ├── 35MSSharedLib1024.snk ├── release.yml └── test.yml ├── srm.sln ├── srm ├── AutomataException.cs ├── Match.cs ├── Regex.cs ├── RegexOptions.cs ├── RegexToAutomatonConverter.cs ├── SymbolicRegexBuilder.cs ├── SymbolicRegexNode.cs ├── algebras │ ├── BDD.cs │ ├── BDDAlgebra.cs │ ├── BV.cs │ ├── BV64Algebra.cs │ ├── BVAlgebra.cs │ ├── CharSetSolver.cs │ ├── CharacterEncoding.cs │ ├── IBooleanAlgebra.cs │ ├── ICharAlgebra.cs │ ├── IntervalSet.cs │ ├── MintermGenerator.cs │ └── RangeConverter.cs ├── icon.png ├── matcher │ ├── BooleanDecisionTree.cs │ ├── DecisionTree.cs │ ├── IMatcher.cs │ ├── SymbolicRegexMatcher.cs │ ├── UTF8Encoding.cs │ └── VectorizedIndexOf.cs ├── parser │ ├── RegexBoyerMoore.cs │ ├── RegexCharClass.cs │ ├── RegexCode.cs │ ├── RegexFCD.cs │ ├── RegexNode.cs │ ├── RegexParser.cs │ ├── RegexReplacement.cs │ ├── RegexTree.cs │ └── SR.cs ├── printing │ └── RegexCharSetPrinter.cs ├── srm.csproj ├── unicode │ ├── IgnoreCaseRelation.cs │ ├── IgnoreCaseRelationGenerator.cs │ ├── IgnoreCaseTransformer.cs │ ├── UnicodeCategoryRanges.cs │ ├── UnicodeCategoryRangesGenerator.cs │ └── UnicodeCategoryTheory.cs └── utils │ └── StringUtility.cs ├── tests ├── MatchingTests.cs ├── SerializationTests.cs └── tests.csproj └── unicode_table_gen ├── Program.cs └── unicode_table_gen.csproj /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE directories 2 | .vs/ 3 | .vscode/ 4 | 5 | # Build directories 6 | bin/ 7 | obj/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Microsoft Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without 
restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Symbolic Regex Matcher (SRM) 2 | 3 | SRM is a high-performance regular expression matching engine with predictable performance characteristics. SRM implements a fully compatible subset of the .NET regex language, which mainly omits non-regular features. It provides comparable throughput to popular native libraries, such as RE2, with a pure C# codebase. 4 | 5 | SRM combines advanced symbolic reasoning with a regex derivatives based matching approach. For an overview of the theory behind SRM please see: 6 | [Olli Saarikivi, Margus Veanes, Tiki Wan, Eric Xu. *Symbolic Regex Matcher*. In TACAS 2019.](https://doi.org/10.1007/978-3-030-17462-0_24) 7 | 8 | # Usage 9 | 10 | The API mostly follows that of `System.Text.RegularExpressions`: 11 | 12 | ``` 13 | using Microsoft.SRM; 14 | ... 
15 | string input = "Hello World!"; 16 | var regex = new Regex(".l*."); 17 | bool hasLs = regex.IsMatch(input); // True 18 | var matches = regex.Matches(input); // list of Match structs for "ello" and "rld" 19 | ``` 20 | 21 | # Building and running tests 22 | 23 | The library is built and tested with [.NET Core 3.1](https://dotnet.microsoft.com/download/dotnet-core/3.1). To build the project and run the tests run: 24 | 25 | ``` 26 | dotnet build 27 | dotnet test 28 | ``` 29 | 30 | # Regenerate unicode character tables 31 | 32 | SRM uses unicode character tables recovered from the .NET runtime. To regenerate them for a new version of the runtime run: 33 | 34 | ``` 35 | cd unicode_table_gen 36 | dotnet run ../srm/unicode 37 | ``` 38 | -------------------------------------------------------------------------------- /nuget.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /scripts/35MSSharedLib1024.snk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AutomataDotNet/srm/7c7eec9c4c974610f246e2502d93730335e70fa9/scripts/35MSSharedLib1024.snk -------------------------------------------------------------------------------- /scripts/release.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | variables: 4 | ReleaseVersion: '1.2.2' 5 | 6 | pool: 7 | vmImage: "windows-latest" 8 | 9 | steps: 10 | - script: dotnet build srm --configuration Release -p:BuildType=Official -p:Version=$(ReleaseVersion) --output $(Build.ArtifactStagingDirectory) 11 | displayName: 'Build' 12 | - task: EsrpCodeSigning@1 13 | displayName: 'StrongName sign' 14 | inputs: 15 | ConnectedServiceName: 'srm-esrp-signing' 16 | FolderPath: $(Build.ArtifactStagingDirectory) 17 | Pattern: srm.dll 18 | signConfigType: 'inlineSignParams' 19 
| inlineOperation: | 20 | [ 21 | { 22 | "KeyCode" : "CP-233863-SN", 23 | "OperationCode" : "StrongNameSign", 24 | "Parameters" : {}, 25 | "ToolName" : "sign", 26 | "ToolVersion" : "1.0" 27 | }, 28 | { 29 | "KeyCode" : "CP-233863-SN", 30 | "OperationCode" : "StrongNameVerify", 31 | "Parameters" : {}, 32 | "ToolName" : "sign", 33 | "ToolVersion" : "1.0" 34 | } 35 | ] 36 | SessionTimeout: '60' 37 | MaxConcurrency: '50' 38 | MaxRetryAttempts: '5' 39 | - script: dotnet pack srm -p:PackageVersion=$(ReleaseVersion) --output $(Build.ArtifactStagingDirectory) --no-build -p:OutputPath=$(Build.ArtifactStagingDirectory) 40 | displayName: 'Pack' 41 | - task: EsrpCodeSigning@1 42 | displayName: 'NuGet sign' 43 | inputs: 44 | ConnectedServiceName: 'srm-esrp-signing' 45 | FolderPath: $(Build.ArtifactStagingDirectory) 46 | Pattern: Microsoft.Automata.SRM.$(ReleaseVersion).nupkg 47 | signConfigType: 'inlineSignParams' 48 | inlineOperation: | 49 | [ 50 | { 51 | "KeyCode" : "CP-401405", 52 | "OperationCode" : "NuGetSign", 53 | "Parameters" : {}, 54 | "ToolName" : "sign", 55 | "ToolVersion" : "1.0" 56 | }, 57 | { 58 | "KeyCode" : "CP-401405", 59 | "OperationCode" : "NuGetVerify", 60 | "Parameters" : {}, 61 | "ToolName" : "sign", 62 | "ToolVersion" : "1.0" 63 | } 64 | ] 65 | SessionTimeout: '60' 66 | MaxConcurrency: '50' 67 | MaxRetryAttempts: '5' 68 | - task: PublishPipelineArtifact@1 69 | inputs: 70 | targetPath: $(Build.ArtifactStagingDirectory)\Microsoft.Automata.SRM.$(ReleaseVersion).nupkg 71 | artifactName: 'NuGetPackage' -------------------------------------------------------------------------------- /scripts/test.yml: -------------------------------------------------------------------------------- 1 | pool: 2 | vmImage: "windows-latest" 3 | 4 | steps: 5 | - task: DotNetCoreCLI@2 6 | displayName: 'Test' 7 | inputs: 8 | command: test -------------------------------------------------------------------------------- /srm.sln: 
-------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.26124.0 5 | MinimumVisualStudioVersion = 15.0.26124.0 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "srm", "srm\srm.csproj", "{69ED8C3B-1140-441B-8FEB-AA05855C84F5}" 7 | EndProject 8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "tests", "tests\tests.csproj", "{70878658-B583-496F-A113-BE95FDF2E4EF}" 9 | EndProject 10 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "unicode_table_gen", "unicode_table_gen\unicode_table_gen.csproj", "{548048A4-FC83-41E1-A070-BDA5B814C254}" 11 | EndProject 12 | Global 13 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 14 | Debug|Any CPU = Debug|Any CPU 15 | Debug|x64 = Debug|x64 16 | Debug|x86 = Debug|x86 17 | Release|Any CPU = Release|Any CPU 18 | Release|x64 = Release|x64 19 | Release|x86 = Release|x86 20 | EndGlobalSection 21 | GlobalSection(SolutionProperties) = preSolution 22 | HideSolutionNode = FALSE 23 | EndGlobalSection 24 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 25 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 26 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Debug|Any CPU.Build.0 = Debug|Any CPU 27 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Debug|x64.ActiveCfg = Debug|Any CPU 28 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Debug|x64.Build.0 = Debug|Any CPU 29 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Debug|x86.ActiveCfg = Debug|Any CPU 30 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Debug|x86.Build.0 = Debug|Any CPU 31 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Release|Any CPU.ActiveCfg = Release|Any CPU 32 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Release|Any CPU.Build.0 = Release|Any CPU 33 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Release|x64.ActiveCfg = Release|Any CPU 34 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Release|x64.Build.0 = Release|Any CPU 
35 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Release|x86.ActiveCfg = Release|Any CPU 36 | {69ED8C3B-1140-441B-8FEB-AA05855C84F5}.Release|x86.Build.0 = Release|Any CPU 37 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 38 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Debug|Any CPU.Build.0 = Debug|Any CPU 39 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Debug|x64.ActiveCfg = Debug|Any CPU 40 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Debug|x64.Build.0 = Debug|Any CPU 41 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Debug|x86.ActiveCfg = Debug|Any CPU 42 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Debug|x86.Build.0 = Debug|Any CPU 43 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Release|Any CPU.ActiveCfg = Release|Any CPU 44 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Release|Any CPU.Build.0 = Release|Any CPU 45 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Release|x64.ActiveCfg = Release|Any CPU 46 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Release|x64.Build.0 = Release|Any CPU 47 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Release|x86.ActiveCfg = Release|Any CPU 48 | {70878658-B583-496F-A113-BE95FDF2E4EF}.Release|x86.Build.0 = Release|Any CPU 49 | {548048A4-FC83-41E1-A070-BDA5B814C254}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 50 | {548048A4-FC83-41E1-A070-BDA5B814C254}.Debug|Any CPU.Build.0 = Debug|Any CPU 51 | {548048A4-FC83-41E1-A070-BDA5B814C254}.Debug|x64.ActiveCfg = Debug|Any CPU 52 | {548048A4-FC83-41E1-A070-BDA5B814C254}.Debug|x64.Build.0 = Debug|Any CPU 53 | {548048A4-FC83-41E1-A070-BDA5B814C254}.Debug|x86.ActiveCfg = Debug|Any CPU 54 | {548048A4-FC83-41E1-A070-BDA5B814C254}.Debug|x86.Build.0 = Debug|Any CPU 55 | {548048A4-FC83-41E1-A070-BDA5B814C254}.Release|Any CPU.ActiveCfg = Release|Any CPU 56 | {548048A4-FC83-41E1-A070-BDA5B814C254}.Release|Any CPU.Build.0 = Release|Any CPU 57 | {548048A4-FC83-41E1-A070-BDA5B814C254}.Release|x64.ActiveCfg = Release|Any CPU 58 | {548048A4-FC83-41E1-A070-BDA5B814C254}.Release|x64.Build.0 = Release|Any CPU 59 | 
using System;
using System.Collections.Generic;
using System.Text;

namespace Microsoft.SRM
{
    /// <summary>
    /// Exception thrown by the automata constructions.
    /// </summary>
    public class AutomataException : Exception
    {
        public const string UnrecognizedRegex =
            "Unrecognized regex construct";
        public const string CharSetMustBeNonempty =
            "Set must be nonempty";
        public const string CharacterEncodingIsUnspecified =
            "Character encoding is unspecified";
        public const string InternalError =
            "Internal error";

        /// <summary>
        /// The kind of exception. Stays <see cref="AutomataExceptionKind.Unspecified"/>
        /// unless a kind is passed to the constructor.
        /// </summary>
        public readonly AutomataExceptionKind kind = AutomataExceptionKind.Unspecified;

        /// <summary>
        /// Creates an exception with the given message and inner exception.
        /// </summary>
        public AutomataException(string message, Exception innerException)
            : base(message, innerException)
        {
        }

        /// <summary>
        /// Creates an exception with the given message.
        /// </summary>
        public AutomataException(string message)
            : base(message)
        {
        }

        /// <summary>
        /// Creates an exception of the given kind; the message is derived from the kind.
        /// </summary>
        public AutomataException(AutomataExceptionKind kind)
            : base(GetMessage(kind))
        {
            this.kind = kind;
        }

        /// <summary>
        /// Creates an exception of the given kind that wraps an inner exception.
        /// </summary>
        public AutomataException(AutomataExceptionKind kind, Exception innerException)
            : base(GetMessage(kind), innerException)
        {
            this.kind = kind;
        }

        /// <summary>
        /// Maps a kind to its message text. Kinds without a dedicated
        /// message constant fall back to the enum member name.
        /// </summary>
        private static string GetMessage(AutomataExceptionKind kind)
        {
            if (kind == AutomataExceptionKind.CharacterEncodingIsUnspecified)
            {
                return CharacterEncodingIsUnspecified;
            }

            if (kind == AutomataExceptionKind.CharSetMustBeNonempty)
            {
                return CharSetMustBeNonempty;
            }

            if (kind == AutomataExceptionKind.UnrecognizedRegex)
            {
                return UnrecognizedRegex;
            }

            if (kind == AutomataExceptionKind.InternalError)
            {
                return InternalError;
            }

            return kind.ToString();
        }
    }

    /// <summary>
    /// Kinds of exceptions that may be thrown by the Automata library operations.
    /// </summary>
    public enum AutomataExceptionKind
    {
        UnrecognizedRegex,
        CharSetMustBeNonempty,
        CharacterEncodingIsUnspecified,
        InternalError,
        Unspecified,
        InvalidArguments,
        CharSetMustBeNontrivial,
        CompactSerializationNodeLimitViolation,
        CompactSerializationBitLimitViolation,
        CompactDeserializationError,
        SetIsEmpty,
        InvalidArgument,
        IncompatibleAlgebras,
        NotSupported,
        BooleanAlgebraIsNotAtomic,
        OrdinalIsTooLarge,
        UnexpectedMTBDDTerminal,
        AlgebraMustBeCharSetSolver,
        MTBDDsNotSupportedForThisOperation,
        BDDSerializationNodeLimitViolation,
        BDDSerializationBitLimitViolation,
        BDDDeserializationError,
        BitOutOfRange,
        InternalError_SymbolicRegex,
        MustNotAcceptEmptyString,
        NrOfMintermsCanBeAtMost64,
    }
}
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;

namespace Microsoft.SRM
{
    /// <summary>
    /// Symbolic regex. Construction converts the pattern to a symbolic regex,
    /// computes its minterms and picks a matcher implementation based on how
    /// many minterms there are.
    /// </summary>
    [Serializable]
    public class Regex
    {
        private static readonly CharSetSolver solver;
        private static readonly RegexToAutomatonConverter converter;

        static Regex()
        {
            solver = new CharSetSolver();
            converter = new RegexToAutomatonConverter(solver);
        }

        private IMatcher matcher;

        /// <summary>
        /// Constructs a regex from the given pattern using <c>RegexOptions.None</c>.
        /// </summary>
        /// <param name="pattern">the regex pattern</param>
        public Regex(string pattern) : this(pattern, RegexOptions.None) { }

        /// <summary>
        /// Constructs a regex from the given pattern and options.
        /// </summary>
        /// <param name="pattern">the regex pattern</param>
        /// <param name="options">regex options</param>
        public Regex(string pattern, RegexOptions options)
        {
            var root = converter.ConvertToSymbolicRegex(pattern, options, keepAnchors: true);
            var partition = root.ComputeMinterms();
            if (partition.Length > 64)
            {
                // more than 64 bits needed to represent a set
                matcher = new SymbolicRegexBV(root, solver, converter.srBuilder, partition, options);
            }
            else
            {
                // enough to use 64 bits
                matcher = new SymbolicRegexUInt64(root, solver, converter.srBuilder, partition, options);
            }
        }

        /// <summary>
        /// Returns true iff the input string matches.
        /// </summary>
        /// <param name="input">given input string</param>
        /// <param name="startat">start position in the input</param>
        /// <param name="endat">end position in the input, -1 means that the value is unspecified and taken to be input.Length-1</param>
        public bool IsMatch(string input, int startat = 0, int endat = -1)
            => matcher.IsMatch(input, startat, endat);

        /// <summary>
        /// Returns all matches as pairs (startindex, length) in the input string.
        /// </summary>
        /// <param name="input">given input string</param>
        /// <param name="limit">as soon as this many matches have been found the search terminates, 0 or negative value means that there is no bound, default is 0</param>
        /// <param name="startat">start position in the input, default is 0</param>
        /// <param name="endat">end position in the input, -1 means that the value is unspecified and taken to be input.Length-1</param>
        public List<Match> Matches(string input, int limit = 0, int startat = 0, int endat = -1)
            => matcher.Matches(input, limit, startat, endat);

        /// <summary>
        /// Serialize this symbolic regex matcher to the given file.
        /// If formatter is null then an instance of
        /// System.Runtime.Serialization.Formatters.Binary.BinaryFormatter is used.
        /// </summary>
        /// <param name="file">file where the serialization is stored</param>
        /// <param name="formatter">given formatter</param>
        public void Serialize(string file, IFormatter formatter = null)
        {
            // 'using' guarantees the file handle is released even when the
            // formatter throws part-way through serialization.
            using (var stream = new FileStream(file, FileMode.Create, FileAccess.Write, FileShare.None))
            {
                Serialize(stream, formatter);
            }
        }

        /// <summary>
        /// Serialize this symbolic regex matcher to the given stream.
        /// If formatter is null then an instance of
        /// System.Runtime.Serialization.Formatters.Binary.BinaryFormatter is used.
        /// The stream is left open; the caller owns it.
        /// </summary>
        /// <param name="stream">stream where the serialization is stored</param>
        /// <param name="formatter">given formatter</param>
        public void Serialize(Stream stream, IFormatter formatter = null)
        {
            if (formatter == null)
            {
                formatter = new BinaryFormatter();
            }

            formatter.Serialize(stream, this);
        }

        /// <summary>
        /// Deserialize the matcher of a symbolic regex from the given file using the given formatter.
        /// If formatter is null then an instance of
        /// System.Runtime.Serialization.Formatters.Binary.BinaryFormatter is used.
        /// </summary>
        /// <param name="file">source file of the serialized matcher</param>
        /// <param name="formatter">given formatter</param>
        /// <returns>the deserialized regex</returns>
        public static Regex Deserialize(string file, IFormatter formatter = null)
        {
            // 'using' guarantees the file handle is released even when
            // deserialization fails (e.g. corrupt or incompatible file).
            using (Stream stream = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                return Deserialize(stream, formatter);
            }
        }

        /// <summary>
        /// Deserialize the matcher of a symbolic regex from the given stream using the given formatter.
        /// If formatter is null then an instance of
        /// System.Runtime.Serialization.Formatters.Binary.BinaryFormatter is used.
        /// The stream is left open; the caller owns it.
        /// </summary>
        /// <param name="stream">source stream of the serialized matcher</param>
        /// <param name="formatter">given formatter</param>
        /// <returns>the deserialized regex</returns>
        public static Regex Deserialize(Stream stream, IFormatter formatter = null)
        {
            // SECURITY NOTE(review): the default BinaryFormatter is unsafe on
            // untrusted data. Only deserialize streams from a trusted source,
            // or pass a safer IFormatter explicitly.
            if (formatter == null)
            {
                formatter = new BinaryFormatter();
            }

            Regex matcher = (Regex)formatter.Deserialize(stream);
            return matcher;
        }
    }
}
static RegexOptions Vectorize = new RegexOptions(1024); 20 | 21 | private int value; 22 | 23 | private RegexOptions(int value) 24 | { 25 | this.value = value; 26 | } 27 | 28 | public static RegexOptions operator|(RegexOptions left, RegexOptions right) 29 | { 30 | return new RegexOptions(left.value | right.value); 31 | } 32 | 33 | public static RegexOptions operator^(RegexOptions left, RegexOptions right) 34 | { 35 | return new RegexOptions(left.value ^ right.value); 36 | } 37 | 38 | public static RegexOptions operator&(RegexOptions left, RegexOptions right) 39 | { 40 | return new RegexOptions(left.value & right.value); 41 | } 42 | 43 | public static implicit operator int(RegexOptions ourOptions) 44 | { 45 | return ourOptions.value; 46 | } 47 | 48 | public static implicit operator System.Text.RegularExpressions.RegexOptions(RegexOptions ourOptions) 49 | { 50 | var theirOptions = System.Text.RegularExpressions.RegexOptions.None; 51 | var handledOptions = None; 52 | Action handleEquivalentOption = (o, t) => 53 | { 54 | if ((ourOptions & o) != 0) 55 | { 56 | theirOptions |= t; 57 | handledOptions |= o; 58 | } 59 | }; 60 | Action ignoreOption = t => 61 | { 62 | if ((ourOptions & t) != 0) 63 | { 64 | handledOptions |= t; 65 | } 66 | }; 67 | handleEquivalentOption(IgnoreCase, System.Text.RegularExpressions.RegexOptions.IgnoreCase); 68 | handleEquivalentOption(Multiline, System.Text.RegularExpressions.RegexOptions.Multiline); 69 | handleEquivalentOption(Singleline, System.Text.RegularExpressions.RegexOptions.Singleline); 70 | handleEquivalentOption(IgnorePatternWhitespace, System.Text.RegularExpressions.RegexOptions.IgnorePatternWhitespace); 71 | handleEquivalentOption(CultureInvariant, System.Text.RegularExpressions.RegexOptions.CultureInvariant); 72 | handleEquivalentOption(ECMAScript, System.Text.RegularExpressions.RegexOptions.ECMAScript); 73 | ignoreOption(Vectorize); 74 | Debug.Assert(handledOptions == ourOptions); 75 | return theirOptions; 76 | } 77 | 78 | public 
static implicit operator RegexOptions(System.Text.RegularExpressions.RegexOptions theirOptions) 79 | { 80 | var ourOptions = None; 81 | var handledOptions = System.Text.RegularExpressions.RegexOptions.None; 82 | Action handleEquivalentOption = (o, t) => 83 | { 84 | if ((theirOptions & t) != 0) 85 | { 86 | ourOptions |= o; 87 | handledOptions |= t; 88 | } 89 | }; 90 | Action ignoreOption = t => 91 | { 92 | if ((theirOptions & t) != 0) 93 | { 94 | handledOptions |= t; 95 | } 96 | }; 97 | handleEquivalentOption(IgnoreCase, System.Text.RegularExpressions.RegexOptions.IgnoreCase); 98 | handleEquivalentOption(Multiline, System.Text.RegularExpressions.RegexOptions.Multiline); 99 | handleEquivalentOption(Singleline, System.Text.RegularExpressions.RegexOptions.Singleline); 100 | handleEquivalentOption(IgnorePatternWhitespace, System.Text.RegularExpressions.RegexOptions.IgnorePatternWhitespace); 101 | handleEquivalentOption(CultureInvariant, System.Text.RegularExpressions.RegexOptions.CultureInvariant); 102 | handleEquivalentOption(ECMAScript, System.Text.RegularExpressions.RegexOptions.ECMAScript); 103 | ignoreOption(System.Text.RegularExpressions.RegexOptions.RightToLeft); 104 | ignoreOption(System.Text.RegularExpressions.RegexOptions.Compiled); 105 | ignoreOption(System.Text.RegularExpressions.RegexOptions.ExplicitCapture); 106 | Debug.Assert(handledOptions == theirOptions); 107 | return ourOptions; 108 | } 109 | } 110 | } -------------------------------------------------------------------------------- /srm/algebras/BDD.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | 4 | namespace Microsoft.SRM 5 | { 6 | /// 7 | /// Represents a Binary Decision Diagram. 8 | /// 9 | public class BDD 10 | { 11 | /// 12 | /// The encoding of the set for lower ordinals for the case when the current bit is 1. 13 | /// The value is null iff IsLeaf is true. 
14 | /// 15 | public readonly BDD One; 16 | 17 | /// 18 | /// The encoding of the set for lower ordinals for the case when the current bit is 0. 19 | /// The value is null iff IsLeaf is true. 20 | /// 21 | public readonly BDD Zero; 22 | 23 | 24 | public readonly BDDAlgebra algebra; 25 | 26 | /// 27 | /// Ordinal of this bit if nonleaf 28 | /// 29 | public readonly int Ordinal; 30 | 31 | internal BDD(BDDAlgebra algebra, int ordinal, BDD one, BDD zero) 32 | { 33 | this.One = one; 34 | this.Zero = zero; 35 | this.Ordinal = ordinal; 36 | this.algebra = algebra; 37 | } 38 | 39 | /// 40 | /// True iff the node is a terminal (One and Zero are null). 41 | /// 42 | public bool IsLeaf 43 | { 44 | get { return One == null; } 45 | } 46 | 47 | /// 48 | /// True iff the set is full. 49 | /// 50 | public bool IsFull 51 | { 52 | get { return this == algebra.True; } 53 | } 54 | 55 | /// 56 | /// True iff the set is empty. 57 | /// 58 | public bool IsEmpty 59 | { 60 | get { return this == algebra.False; } 61 | } 62 | 63 | /// 64 | /// Counts the number of nodes (both terminals and nonterminals) in the BDD. 65 | /// 66 | public int CountNodes() 67 | { 68 | if (IsLeaf) 69 | return 1; 70 | 71 | HashSet visited = new HashSet(); 72 | Stack stack = new Stack(); 73 | stack.Push(this); 74 | visited.Add(this); 75 | while (stack.Count > 0) 76 | { 77 | BDD a = stack.Pop(); 78 | if (!a.IsLeaf) 79 | { 80 | if (visited.Add(a.One)) 81 | stack.Push(a.One); 82 | if (visited.Add(a.Zero)) 83 | stack.Push(a.Zero); 84 | } 85 | } 86 | return visited.Count; 87 | } 88 | 89 | /// 90 | /// Gets the lexicographically minimum bitvector in this BDD as a ulong. 91 | /// Assumes that this BDD is nonempty and that its ordinal is at most 63. 
92 | /// 93 | public ulong GetMin() 94 | { 95 | var set = this; 96 | 97 | if (set.IsFull) 98 | return (ulong)0; 99 | 100 | if (set.IsEmpty) 101 | throw new AutomataException(AutomataExceptionKind.SetIsEmpty); 102 | 103 | if (set.Ordinal > 63) 104 | throw new AutomataException(AutomataExceptionKind.OrdinalIsTooLarge); 105 | 106 | ulong res = 0; 107 | 108 | while (!set.IsLeaf) 109 | { 110 | if (set.Zero.IsEmpty) //the bit must be set to 1 111 | { 112 | res = res | ((ulong)1 << set.Ordinal); 113 | set = set.One; 114 | } 115 | else 116 | set = set.Zero; 117 | } 118 | 119 | return res; 120 | } 121 | 122 | public static BDD operator >>(BDD x, int k) 123 | { 124 | return x.algebra.ShiftRight(x, k); 125 | } 126 | 127 | public static BDD operator <<(BDD x, int k) 128 | { 129 | return x.algebra.ShiftLeft(x, k); 130 | } 131 | 132 | public static BDD operator &(BDD x, BDD y) 133 | { 134 | return x.algebra.MkAnd(x, y); 135 | } 136 | 137 | public static BDD operator |(BDD x, BDD y) 138 | { 139 | return x.algebra.MkOr(x, y); 140 | } 141 | 142 | public static BDD operator !(BDD x) 143 | { 144 | return x.algebra.MkNot(x); 145 | } 146 | } 147 | } 148 | 149 | -------------------------------------------------------------------------------- /srm/algebras/BV.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Runtime.CompilerServices; 4 | using System.Runtime.Serialization; 5 | 6 | 7 | namespace Microsoft.SRM 8 | { 9 | /// 10 | /// Represents a bitvector 11 | /// 12 | [Serializable] 13 | public class BV : IComparable, ISerializable 14 | { 15 | internal ulong first; 16 | internal ulong[] more; 17 | 18 | /// 19 | /// Constructs a bitvector 20 | /// 21 | /// first 64 bits 22 | /// remaining bits in 64 increments 23 | public BV(ulong first, params ulong[] more) 24 | { 25 | this.first = first; 26 | this.more = more; 27 | } 28 | 29 | /// 30 | /// Bitwise AND 31 | /// 32 | 
[MethodImpl(MethodImplOptions.AggressiveInlining)] 33 | public static BV operator &(BV x, BV y) 34 | { 35 | int k = (x.more.Length <= y.more.Length ? x.more.Length : y.more.Length); 36 | var first = x.first & y.first; 37 | var more = new ulong[k]; 38 | for (int i = 0; i < k; i++) 39 | { 40 | more[i] = x.more[i] & y.more[i]; 41 | } 42 | return new BV(first, more); 43 | } 44 | 45 | /// 46 | /// Bitwise OR 47 | /// 48 | public static BV operator |(BV x, BV y) 49 | { 50 | int k = (x.more.Length <= y.more.Length ? x.more.Length : y.more.Length); 51 | var first = x.first | y.first; 52 | var more = new ulong[k]; 53 | for (int i = 0; i < k; i++) 54 | { 55 | more[i] = x.more[i] | y.more[i]; 56 | } 57 | return new BV(first, more); 58 | } 59 | 60 | /// 61 | /// Bitwise XOR 62 | /// 63 | public static BV operator ^(BV x, BV y) 64 | { 65 | int k = (x.more.Length <= y.more.Length ? x.more.Length : y.more.Length); 66 | var first = x.first ^ y.first; 67 | var more = new ulong[x.more.Length]; 68 | for (int i = 0; i < x.more.Length; i++) 69 | { 70 | more[i] = x.more[i] ^ y.more[i]; 71 | } 72 | return new BV(first, more); 73 | } 74 | 75 | /// 76 | /// Bitwise NOT 77 | /// 78 | public static BV operator ~(BV x) 79 | { 80 | var first_compl = ~x.first; 81 | var more_compl = Array.ConvertAll(x.more, n => ~n); 82 | var compl = new BV(first_compl, more_compl); 83 | return compl; 84 | } 85 | 86 | /// 87 | /// less than 88 | /// 89 | public static bool operator <(BV x, BV y) 90 | { 91 | return x.CompareTo(y) < 0; 92 | } 93 | 94 | /// 95 | /// greater than 96 | /// 97 | public static bool operator >(BV x, BV y) 98 | { 99 | return x.CompareTo(y) > 0; 100 | } 101 | 102 | /// 103 | /// less than or equal 104 | /// 105 | public static bool operator <=(BV x, BV y) 106 | { 107 | return x.CompareTo(y) <= 0; 108 | } 109 | 110 | /// 111 | /// greater than or equal 112 | /// 113 | public static bool operator >=(BV x, BV y) 114 | { 115 | return x.CompareTo(y) >= 0; 116 | } 117 | 118 | /// 119 | /// 
Shows the serialized representation 120 | /// 121 | public override string ToString() 122 | { 123 | return Serialize(); 124 | } 125 | 126 | public override bool Equals(object obj) 127 | { 128 | BV that = obj as BV; 129 | if (that == null) 130 | return false; 131 | if (this == that) 132 | return true; 133 | if (this.first != that.first) 134 | return false; 135 | if (that.more.Length != this.more.Length) 136 | return false; 137 | for (int i = 0; i < more.Length; i++) 138 | { 139 | if (more[i] != that.more[i]) 140 | return false; 141 | } 142 | return true; 143 | } 144 | 145 | public override int GetHashCode() 146 | { 147 | int h = first.GetHashCode(); 148 | for (int i = 0; i < more.Length; i++) 149 | { 150 | h = (h << 5) ^ more[i].GetHashCode(); 151 | } 152 | return h; 153 | } 154 | 155 | public int CompareTo(object obj) 156 | { 157 | BV that = obj as BV; 158 | if (that == null) 159 | return 1; 160 | else if (this.more.Length != that.more.Length) 161 | { 162 | return this.more.Length.CompareTo(that.more.Length); 163 | } 164 | else 165 | { 166 | int k = this.more.Length; 167 | if (k > 0) 168 | { 169 | int i = k - 1; 170 | while (i >= 0) 171 | { 172 | var comp = this.more[i].CompareTo(that.more[i]); 173 | if (comp == 0) 174 | i = i - 1; 175 | else 176 | return comp; 177 | } 178 | } 179 | return this.first.CompareTo(that.first); 180 | } 181 | } 182 | 183 | #region serialization 184 | /// 185 | /// Serialize 186 | /// 187 | public void GetObjectData(SerializationInfo info, StreamingContext context) 188 | { 189 | info.AddValue("bv", Serialize()); 190 | } 191 | /// 192 | /// Deserialize 193 | /// 194 | public BV(SerializationInfo info, StreamingContext context) 195 | { 196 | var s = info.GetString("bv"); 197 | Deserialize_Helper(s, out first, out more); 198 | } 199 | 200 | /// 201 | /// Serialize BV into a string of hexadecimal numerals, separated by '.', 202 | /// each numeral representing an unsigned 64-bit integer in hexadecimal using lowercase a-f 203 | /// 204 | /// 
205 | public string Serialize() 206 | { 207 | string str = this.first.ToString("x") + "." + string.Join(".", Array.ConvertAll(this.more, x => x.ToString("x"))); 208 | return str; 209 | } 210 | 211 | /// 212 | /// Deserialize BV from given string that was produced by Serialize (nothing after the first '.' denotes an empty tail) 213 | /// 214 | /// BV in serialized form 215 | public static BV Deserialize(string s) 216 | { 217 | ulong first; 218 | ulong[] rest; 219 | Deserialize_Helper(s, out first, out rest); 220 | return new BV(first, rest); 221 | } 222 | 223 | private static void Deserialize_Helper(string s, out ulong first, out ulong[] rest) 224 | { 225 | int i = s.IndexOf('.'); 226 | first = ulong.Parse(s.Substring(0, i), System.Globalization.NumberStyles.HexNumber); 227 | rest = (i + 1 == s.Length ? new ulong[0] : Array.ConvertAll(s.Substring(i + 1).Split('.'), x => ulong.Parse(x, System.Globalization.NumberStyles.HexNumber))); // Serialize writes "first." when 'more' is empty; splitting "" would yield one empty numeral that cannot be parsed 228 | } 229 | #endregion 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /srm/algebras/BV64Algebra.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Runtime.CompilerServices; 4 | using System.Runtime.Serialization; 5 | 6 | namespace Microsoft.SRM 7 | { 8 | /// 9 | /// Bit vector algebra of up to 64 bits 10 | /// 11 | [Serializable] 12 | public class BV64Algebra : BVAlgebraBase, ICharAlgebra, ISerializable 13 | { 14 | [NonSerialized] 15 | MintermGenerator mtg; 16 | [NonSerialized] 17 | ulong zero = 0; 18 | [NonSerialized] 19 | ulong all; 20 | [NonSerialized] 21 | internal ulong[] atoms; 22 | 23 | public ulong ComputeDomainSize(ulong set) 24 | { 25 | int size = 0; 26 | for (int i = 0; i < atoms.Length; i++) 27 | { 28 | if (IsSatisfiable(set & atoms[i])) 29 | size += partition[i].Count; 30 | } 31 | return (ulong)size; 32 | } 33 | 34 | public static BV64Algebra Create(CharSetSolver solver, BDD[] minterms) 35 | { 36 | if (minterms.Length > 64) 37 | throw new
AutomataException(AutomataExceptionKind.NrOfMintermsCanBeAtMost64); 38 | var dtree = DecisionTree.Create(solver, minterms); 39 | var partitionBase = Array.ConvertAll(minterms, m => solver.ToRanges(m)); 40 | var partition = Array.ConvertAll(partitionBase, p => new IntervalSet(p)); 41 | return new BV64Algebra(dtree, partition); 42 | } 43 | 44 | private BV64Algebra(DecisionTree dtree, IntervalSet[] partition) : base(dtree, partition, partition.Length) 45 | { 46 | this.all = ulong.MaxValue >> (64 - this.nrOfBits); 47 | this.mtg = new MintermGenerator(this); 48 | this.atoms = new ulong[this.nrOfBits]; 49 | for (int i = 0; i < this.nrOfBits; i++) 50 | { 51 | atoms[i] = ((ulong)1) << i; 52 | } 53 | } 54 | 55 | /// 56 | /// Create a variant of the algebra where each minterms is replaced with a singleton set starting from '0' 57 | /// Used for testing purposes. 58 | /// 59 | internal BV64Algebra ReplaceMintermsWithVisibleCharacters() 60 | { 61 | Func f = x => 62 | { 63 | int k; 64 | if (x <= 26) 65 | k = ('A' + (x - 1)); 66 | else if (x <= 52) 67 | k = ('a' + (x - 27)); 68 | else if (x <= 62) 69 | k = ('0' + (x - 53)); 70 | else 71 | k = '='; 72 | return k; 73 | }; 74 | var simplified_partition = new IntervalSet[this.partition.Length]; 75 | int[] precomp = new int[256]; 76 | for (int i=1; i < simplified_partition.Length; i++) 77 | { 78 | int k = f(i); 79 | simplified_partition[i] = new IntervalSet(new Tuple((uint)k,(uint)k)); 80 | precomp[k] = i; 81 | } 82 | var zeroIntervals = new List>(); 83 | int lower = 0; 84 | int upper = 0; 85 | for (int i = 1; i <= 'z' + 1; i++) 86 | { 87 | if (precomp[i] == 0) 88 | { 89 | if (upper == i - 1) 90 | upper += 1; 91 | else 92 | { 93 | zeroIntervals.Add(new Tuple((uint)lower, (uint)upper)); 94 | lower = i; 95 | upper = i; 96 | } 97 | } 98 | } 99 | zeroIntervals.Add(new Tuple((uint)lower, 0xFFFF)); 100 | simplified_partition[0] = new IntervalSet(zeroIntervals.ToArray()); 101 | 102 | var simplified_dtree = new DecisionTree(precomp, new 
DecisionTree.BST(0, null, null)); 103 | return new BV64Algebra(simplified_dtree, simplified_partition); 104 | } 105 | 106 | public ulong False 107 | { 108 | get 109 | { 110 | return zero; 111 | } 112 | } 113 | 114 | public bool IsExtensional 115 | { 116 | get 117 | { 118 | return true; 119 | } 120 | } 121 | 122 | public ulong True 123 | { 124 | get 125 | { 126 | return all; 127 | } 128 | } 129 | 130 | public BitWidth Encoding 131 | { 132 | get 133 | { 134 | throw new NotSupportedException(); 135 | } 136 | } 137 | 138 | public CharSetSolver CharSetProvider 139 | { 140 | get 141 | { 142 | throw new NotSupportedException(); 143 | } 144 | } 145 | 146 | public bool AreEquivalent(ulong predicate1, ulong predicate2) 147 | { 148 | return predicate1 == predicate2; 149 | } 150 | 151 | public IEnumerable> GenerateMinterms(params ulong[] constraints) 152 | { 153 | return this.mtg.GenerateMinterms(constraints); 154 | } 155 | 156 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 157 | public bool IsSatisfiable(ulong predicate) 158 | { 159 | return predicate != zero; 160 | } 161 | 162 | public ulong MkAnd(params ulong[] predicates) 163 | { 164 | var and = all; 165 | for (int i = 0; i < predicates.Length; i++) 166 | { 167 | and = and & predicates[i]; 168 | if (and == zero) 169 | return zero; 170 | } 171 | return and; 172 | } 173 | 174 | public ulong MkAnd(IEnumerable predicates) 175 | { 176 | throw new NotImplementedException(); 177 | } 178 | 179 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 180 | public ulong MkAnd(ulong predicate1, ulong predicate2) 181 | { 182 | return predicate1 & predicate2; 183 | } 184 | 185 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 186 | public ulong MkDiff(ulong predicate1, ulong predicate2) 187 | { 188 | return predicate1 & ~predicate2; 189 | } 190 | 191 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 192 | public ulong MkNot(ulong predicate) 193 | { 194 | return all & ~predicate; 195 | } 196 | 197 | public ulong MkOr(IEnumerable 
predicates) 198 | { 199 | var res = zero; 200 | foreach (var p in predicates) 201 | { 202 | res = res | p; 203 | if (res == all) 204 | return all; 205 | } 206 | return res; 207 | } 208 | 209 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 210 | public ulong MkOr(ulong predicate1, ulong predicate2) 211 | { 212 | return predicate1 | predicate2; 213 | } 214 | 215 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 216 | public ulong MkSymmetricDifference(ulong p1, ulong p2) 217 | { 218 | return (p1 ^ p2); 219 | } 220 | 221 | public ulong MkRangeConstraint(char lower, char upper, bool caseInsensitive = false) 222 | { 223 | throw new NotSupportedException(); 224 | } 225 | 226 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 227 | public ulong MkCharConstraint(char c, bool caseInsensitive = false) 228 | { 229 | if (caseInsensitive == true) 230 | throw new AutomataException(AutomataExceptionKind.NotSupported); 231 | return this.atoms[this.dtree.GetId(c)]; 232 | } 233 | 234 | /// 235 | /// Assumes that set is a union of some minterms (or empty). 236 | /// If null then 0 is returned. 237 | /// 238 | public ulong ConvertFromCharSet(BDD set) 239 | { 240 | if (set == null) 241 | return zero; 242 | var alg = set.algebra; 243 | ulong res = this.zero; 244 | for (int i = 0; i < partition.Length; i++) 245 | { 246 | BDD bdd_i = partition[i].AsBDD(alg); 247 | var conj = alg.MkAnd(bdd_i, set); 248 | if (alg.IsSatisfiable(conj)) 249 | { 250 | res = res | atoms[i]; 251 | } 252 | } 253 | return res; 254 | } 255 | 256 | /// 257 | /// Pretty print the bitvector predicate as a character class. 
258 | /// 259 | /// given bitvector predicate 260 | public string PrettyPrint(ulong bv) 261 | { 262 | var lab1 = PrettyPrintHelper(bv, false); 263 | var lab2 = PrettyPrintHelper(~bv, true); 264 | if (lab1.Length <= lab2.Length) 265 | return lab1; 266 | else 267 | return lab2; 268 | 269 | } 270 | 271 | string PrettyPrintHelper(ulong bv, bool complement) 272 | { 273 | List sets = new List(); 274 | for (int i = 0; i < atoms.Length; i++) 275 | if (IsSatisfiable(bv & atoms[i])) 276 | sets.Add(partition[i]); 277 | var set = IntervalSet.Merge(sets); 278 | var res = set.ToCharacterClass(complement); 279 | return res; 280 | } 281 | 282 | public BDD ConvertToCharSet(BDDAlgebra solver, ulong pred) 283 | { 284 | BDD res = solver.False; 285 | if (!pred.Equals(this.zero)) 286 | { 287 | for (int i = 0; i < atoms.Length; i++) 288 | { 289 | //construct the union of the corresponding atoms 290 | if (!(pred & atoms[i]).Equals(this.zero)) 291 | { 292 | BDD bdd_i = partition[i].AsBDD(solver); 293 | res = solver.MkOr(res, bdd_i); 294 | } 295 | } 296 | } 297 | return res; 298 | } 299 | 300 | public ulong[] GetPartition() 301 | { 302 | return atoms; 303 | } 304 | 305 | public IEnumerable GenerateAllCharacters(ulong set) 306 | { 307 | for (int i = 0; i < atoms.Length; i++) 308 | { 309 | if (IsSatisfiable(atoms[i] & set)) 310 | foreach (uint elem in partition[i].Enumerate()) 311 | yield return (char)elem; 312 | } 313 | } 314 | 315 | #region serialization 316 | /// 317 | /// Serialize 318 | /// 319 | public void GetObjectData(SerializationInfo info, StreamingContext context) 320 | { 321 | info.AddValue("d", dtree); 322 | info.AddValue("p", SerializePartition()); 323 | } 324 | 325 | /// 326 | /// Deserialize 327 | /// 328 | public BV64Algebra(SerializationInfo info, StreamingContext context) 329 | : this((DecisionTree)info.GetValue("d", typeof(DecisionTree)), 330 | DeserializePartition(info.GetString("p"))) 331 | { 332 | } 333 | 334 | /// 335 | /// Serialize s as a hexadecimal numeral using 
lowercase letters 336 | /// 337 | /// given predicate 338 | public string SerializePredicate(ulong s) 339 | { 340 | return s.ToString("x"); 341 | } 342 | 343 | /// 344 | /// Deserialize s from a string created by SerializePredicate 345 | /// 346 | /// given hexadecimal numeral representation 347 | public ulong DeserializePredicate(string s) 348 | { 349 | return ulong.Parse(s, System.Globalization.NumberStyles.HexNumber); 350 | } 351 | #endregion 352 | 353 | public ulong MkCharPredicate(string name, ulong pred) 354 | { 355 | throw new NotImplementedException(); 356 | } 357 | 358 | } 359 | } -------------------------------------------------------------------------------- /srm/algebras/BVAlgebra.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Runtime.CompilerServices; 4 | using System.Runtime.Serialization; 5 | 6 | namespace Microsoft.SRM 7 | { 8 | public abstract class BVAlgebraBase 9 | { 10 | internal DecisionTree dtree; 11 | internal IntervalSet[] partition; 12 | internal int nrOfBits; 13 | 14 | internal BVAlgebraBase(DecisionTree dtree, IntervalSet[] partition, int nrOfBits) 15 | { 16 | this.dtree = dtree; 17 | this.partition = partition; 18 | this.nrOfBits = nrOfBits; 19 | } 20 | 21 | protected string SerializePartition() 22 | { 23 | string s = ""; 24 | for (int i = 0; i < partition.Length; i++) 25 | { 26 | if (i > 0) 27 | s += ";"; 28 | s += partition[i].Serialize(); 29 | } 30 | return s; 31 | } 32 | 33 | protected static IntervalSet[] DeserializePartition(string s) 34 | { 35 | var blocks = s.Split(';'); 36 | var intervalSets = Array.ConvertAll(blocks, IntervalSet.Parse); 37 | return intervalSets; 38 | } 39 | } 40 | /// 41 | /// Bit vector algebra 42 | /// 43 | [Serializable] 44 | public class BVAlgebra : BVAlgebraBase, ICharAlgebra, ISerializable 45 | { 46 | [NonSerialized] 47 | MintermGenerator mtg; 48 | [NonSerialized] 49 | BV zero; 50 | 
[NonSerialized] 51 | BV ones; 52 | [NonSerialized] 53 | ulong[] all0; 54 | [NonSerialized] 55 | ulong[] all1; 56 | [NonSerialized] 57 | internal BV[] atoms; 58 | 59 | public ulong ComputeDomainSize(BV set) 60 | { 61 | int size = 0; 62 | for (int i = 0; i < atoms.Length; i++) 63 | { 64 | if (IsSatisfiable(set & atoms[i])) 65 | size += partition[i].Count; 66 | } 67 | return (ulong)size; 68 | } 69 | 70 | public static BVAlgebra Create(CharSetSolver solver, BDD[] minterms) 71 | { 72 | var dtree = DecisionTree.Create(solver, minterms); 73 | var partitionBase = Array.ConvertAll(minterms, m => solver.ToRanges(m)); 74 | var partition = Array.ConvertAll(partitionBase, p => new IntervalSet(p)); 75 | return new BVAlgebra(dtree, partition); 76 | } 77 | 78 | private BVAlgebra(DecisionTree dtree, IntervalSet[] partition) : base(dtree, partition, partition.Length) 79 | { 80 | var K = (nrOfBits - 1) / 64; // number of 64-bit words needed in addition to 'first' 81 | int last = nrOfBits % 64; 82 | ulong lastMask = (last == 0 ? ulong.MaxValue : (((ulong)1 << last) - 1)); // valid bits of the most significant word 83 | all0 = new ulong[K]; 84 | all1 = new ulong[K]; 85 | for (int i = 0; i < K; i++) 86 | { 87 | all0[i] = 0; // every word of the all-zero vector is 0 88 | if (i < K - 1) 89 | { 90 | all1[i] = ulong.MaxValue; 91 | } 92 | else 93 | { 94 | all1[i] = lastMask; 95 | } 96 | } 97 | this.zero = new BV(0, all0); 98 | this.ones = new BV((K == 0 ? 
lastMask : ulong.MaxValue), all1); 99 | this.mtg = new MintermGenerator(this); 100 | this.atoms = new BV[nrOfBits]; 101 | for (int i = 0; i < nrOfBits; i++) 102 | { 103 | atoms[i] = MkBV(i); 104 | } 105 | } 106 | 107 | public BV False 108 | { 109 | get 110 | { 111 | return zero; 112 | } 113 | } 114 | 115 | public bool IsExtensional 116 | { 117 | get 118 | { 119 | return true; 120 | } 121 | } 122 | 123 | public BV True 124 | { 125 | get 126 | { 127 | return ones; 128 | } 129 | } 130 | 131 | public BitWidth Encoding 132 | { 133 | get 134 | { 135 | throw new NotSupportedException(); 136 | } 137 | } 138 | 139 | public CharSetSolver CharSetProvider 140 | { 141 | get 142 | { 143 | throw new NotSupportedException(); 144 | } 145 | } 146 | 147 | public bool AreEquivalent(BV predicate1, BV predicate2) 148 | { 149 | return predicate1.Equals(predicate2); 150 | } 151 | 152 | public IEnumerable> GenerateMinterms(params BV[] constraints) 153 | { 154 | return this.mtg.GenerateMinterms(constraints); 155 | } 156 | 157 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 158 | public bool IsSatisfiable(BV predicate) 159 | { 160 | return !predicate.Equals(zero); 161 | } 162 | 163 | public BV MkAnd(params BV[] predicates) 164 | { 165 | var and = ones; 166 | for (int i = 0; i < predicates.Length; i++) 167 | { 168 | and = and & predicates[i]; 169 | if (and.Equals(zero)) 170 | return zero; 171 | } 172 | return and; 173 | } 174 | 175 | public BV MkAnd(IEnumerable predicates) 176 | { 177 | throw new NotImplementedException(); 178 | } 179 | 180 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 181 | public BV MkAnd(BV predicate1, BV predicate2) 182 | { 183 | return predicate1 & predicate2; 184 | } 185 | 186 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 187 | public BV MkDiff(BV predicate1, BV predicate2) 188 | { 189 | return predicate1 & ~predicate2; 190 | } 191 | 192 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 193 | public BV MkNot(BV predicate) 194 | { 195 | return ones 
& ~predicate; 196 | } 197 | 198 | public BV MkOr(IEnumerable predicates) 199 | { 200 | var res = zero; 201 | foreach (var p in predicates) 202 | { 203 | res = res | p; 204 | if (res.Equals(ones)) 205 | return ones; 206 | } 207 | return res; 208 | } 209 | 210 | public BV MkOr(BV predicate1, BV predicate2) 211 | { 212 | return predicate1 | predicate2; 213 | } 214 | 215 | public BV MkBV(params int[] truebits) 216 | { 217 | ulong first = 0; 218 | var more = new ulong[this.all0.Length]; 219 | for (int i = 0; i < truebits.Length; i++) 220 | { 221 | int b = truebits[i]; 222 | if (b >= nrOfBits || b < 0) 223 | throw new AutomataException(AutomataExceptionKind.BitOutOfRange); 224 | int k = b / 64; 225 | int j = b % 64; 226 | if (k == 0) 227 | first = first | ((ulong)1 << j); 228 | else 229 | more[k-1] = more[k-1] | ((ulong)1 << j); 230 | } 231 | var bv = new BV(first, more); 232 | return bv; 233 | } 234 | 235 | public BV MkRangeConstraint(char lower, char upper, bool caseInsensitive = false) 236 | { 237 | throw new NotSupportedException(); 238 | } 239 | 240 | public BV MkCharConstraint(char c, bool caseInsensitive = false) 241 | { 242 | if (caseInsensitive == true) 243 | throw new AutomataException(AutomataExceptionKind.NotSupported); 244 | 245 | int i = this.dtree.GetId(c); 246 | return this.atoms[i]; 247 | } 248 | 249 | /// 250 | /// Assumes that set is a union of some minterms (or empty). 251 | /// If null then null is returned. 
252 | /// 253 | public BV ConvertFromCharSet(BDD set) 254 | { 255 | if (set == null) 256 | return null; 257 | var alg = set.algebra; 258 | BV res = this.zero; 259 | for (int i = 0; i < partition.Length; i++) 260 | { 261 | BDD bdd_i = partition[i].AsBDD(alg); 262 | var conj = alg.MkAnd(bdd_i, set); 263 | if (alg.IsSatisfiable(conj)) 264 | { 265 | res = res | atoms[i]; 266 | } 267 | } 268 | return res; 269 | } 270 | 271 | public BDD ConvertToCharSet(BDDAlgebra solver, BV pred) 272 | { 273 | BDD res = solver.False; 274 | if (!pred.Equals(this.zero)) 275 | { 276 | for (int i = 0; i < atoms.Length; i++) 277 | { 278 | //construct the union of the corresponding atoms 279 | if (!(pred & atoms[i]).Equals(this.zero)) 280 | { 281 | BDD bdd_i = partition[i].AsBDD(solver); 282 | res = solver.MkOr(res, bdd_i); 283 | } 284 | } 285 | } 286 | return res; 287 | } 288 | 289 | public BV[] GetPartition() 290 | { 291 | return atoms; 292 | } 293 | 294 | public IEnumerable GenerateAllCharacters(BV set) 295 | { 296 | for (int i = 0; i < atoms.Length; i++) 297 | { 298 | if (IsSatisfiable(atoms[i] & set)) 299 | foreach (uint elem in partition[i].Enumerate()) 300 | yield return (char)elem; 301 | } 302 | } 303 | 304 | #region serialization 305 | /// 306 | /// Serialize 307 | /// 308 | public void GetObjectData(SerializationInfo info, StreamingContext context) 309 | { 310 | info.AddValue("d", dtree); 311 | info.AddValue("p", SerializePartition()); 312 | } 313 | 314 | /// 315 | /// Deserialize 316 | /// 317 | public BVAlgebra(SerializationInfo info, StreamingContext context) 318 | : this((DecisionTree)info.GetValue("d", typeof(DecisionTree)), 319 | DeserializePartition(info.GetString("p"))) 320 | { 321 | } 322 | 323 | /// 324 | /// calls bv.Serialize() 325 | /// 326 | public string SerializePredicate(BV bv) 327 | { 328 | return bv.Serialize(); 329 | } 330 | 331 | /// 332 | /// calls BV.Deserialize(s) 333 | /// 334 | public BV DeserializePredicate(string s) 335 | { 336 | return BV.Deserialize(s); 
337 | } 338 | #endregion 339 | 340 | public BV MkCharPredicate(string name, BV pred) 341 | { 342 | throw new NotImplementedException(); 343 | } 344 | } 345 | } 346 | -------------------------------------------------------------------------------- /srm/algebras/CharSetSolver.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | //using RestrictKeyType = System.Int64; 4 | using System.IO; 5 | using System.Text.RegularExpressions; 6 | 7 | namespace Microsoft.SRM 8 | { 9 | /// 10 | /// Provides functionality to build character sets, to perform boolean operations over character sets, 11 | /// and to construct an SFA over character sets from a regex. 12 | /// Character sets are represented by bitvector sets. 13 | /// 14 | public class CharSetSolver : BDDAlgebra, ICharAlgebra 15 | { 16 | 17 | int _bw; 18 | 19 | public BitWidth Encoding 20 | { 21 | get { return (BitWidth)_bw; } 22 | } 23 | 24 | /// 25 | /// Construct the solver for BitWidth.BV16 26 | /// 27 | public CharSetSolver() : this(BitWidth.BV16) 28 | { 29 | } 30 | 31 | /// 32 | /// Construct a character set solver for the given character encoding (nr of bits). 33 | /// 34 | public CharSetSolver(BitWidth bits) : base() 35 | { 36 | if (!CharacterEncodingTool.IsSpecified(bits)) 37 | throw new AutomataException(AutomataExceptionKind.CharacterEncodingIsUnspecified); 38 | _bw = (int)bits; 39 | } 40 | 41 | IgnoreCaseTransformer _IgnoreCase = null; 42 | IgnoreCaseTransformer IgnoreCase 43 | { 44 | get 45 | { 46 | if (_IgnoreCase == null) 47 | _IgnoreCase = new IgnoreCaseTransformer(this); 48 | return _IgnoreCase; 49 | } 50 | } 51 | 52 | BDD[] charPredTable = new BDD[1 << 16]; 53 | 54 | /// 55 | /// Make a character containing the given character c. 56 | /// If c is a lower case or upper case character and ignoreCase is true 57 | /// then add both the upper case and the lower case characters. 
58 | /// 59 | public BDD MkCharConstraint(char c, bool ignoreCase = false) 60 | { 61 | int i = (int)c; 62 | if (charPredTable[i] == null) 63 | charPredTable[i] = MkSetFrom((uint)c, _bw - 1); 64 | if (ignoreCase) 65 | return IgnoreCase.Apply(charPredTable[i]); 66 | return charPredTable[i]; 67 | } 68 | 69 | /// 70 | /// Make a CharSet from all the characters in the range from m to n. 71 | /// Returns the empty set if n is less than m 72 | /// 73 | public BDD MkCharSetFromRange(char m, char n) 74 | { 75 | return MkSetFromRange((uint)m, (uint)n, _bw-1); 76 | } 77 | 78 | /// 79 | /// Make a character set that is the union of the character sets of the given ranges. 80 | /// 81 | public BDD MkCharSetFromRanges(IEnumerable> ranges) 82 | { 83 | BDD res = False; 84 | foreach (var range in ranges) 85 | res = MkOr(res, MkSetFromRange(range.Item1, range.Item2, _bw -1)); 86 | return res; 87 | } 88 | 89 | /// 90 | /// Make a character set of all the characters in the interval from c to d. 91 | /// If ignoreCase is true ignore cases for upper and lower case characters by including both versions. 92 | /// 93 | public BDD MkRangeConstraint(char c, char d, bool ignoreCase = false) 94 | { 95 | var res = MkSetFromRange((uint)c, (uint)d, _bw - 1); 96 | if (ignoreCase) 97 | res = IgnoreCase.Apply(res); 98 | return res; 99 | } 100 | 101 | /// 102 | /// Make a BDD encoding of k least significant bits of all the integers in the ranges 103 | /// 104 | internal BDD MkBddForIntRanges(IEnumerable ranges) 105 | { 106 | BDD bdd = False; 107 | foreach (var range in ranges) 108 | bdd = MkOr(bdd, MkSetFromRange((uint)range[0], (uint)range[1], _bw - 1)); 109 | return bdd; 110 | } 111 | 112 | #region Serialializing and deserializing BDDs 113 | 114 | /// 115 | /// Represent the set as an integer array. 116 | /// Assumes that the bdd has less than 2^14 nodes and at most 16 variables. 
117 | /// 118 | internal int[] SerializeCompact(BDD bdd) 119 | { 120 | //return SerializeBasedOnRanges(bdd); 121 | return SerializeCompact2(bdd); 122 | } 123 | 124 | /// 125 | /// Represent the set as an integer array. 126 | /// Assumes that the bdd has at most 2^14 nodes and at most 16 variables. 127 | /// 128 | int[] SerializeCompact2(BDD bdd) 129 | { 130 | // encode the bdd directly 131 | // 132 | // the element at index 0 is the false node 133 | // the element at index 1 is the true node 134 | // and entry at index i>1 is node i and has the structure 135 | // (ordinal trueNode falseNode) 136 | // where ordinal uses 4 bits and trueNode and falseNode each use 14 bits 137 | // Assumes that the bdd has less than 2^14 nodes and at most 16 variables. 138 | // BDD.False is represented by int[]{0}. 139 | // BDD.True is represented by int[]{0,0}. 140 | // The root of the BDD (Other than True or False) is node 2 141 | 142 | if (bdd.IsEmpty) 143 | return new int[] { 0 }; 144 | if (bdd.IsFull) 145 | return new int[] { 0, 0 }; 146 | 147 | int nrOfNodes = bdd.CountNodes(); 148 | 149 | if (nrOfNodes > (1 << 14)) 150 | throw new AutomataException(AutomataExceptionKind.CompactSerializationNodeLimitViolation); 151 | 152 | int[] res = new int[nrOfNodes]; 153 | 154 | 155 | //here we know that bdd is neither empty nor full 156 | var done = new Dictionary(); 157 | done[False] = 0; 158 | done[True] = 1; 159 | 160 | Stack stack = new Stack(); 161 | stack.Push(bdd); 162 | done[bdd] = 2; 163 | 164 | int doneCount = 3; 165 | 166 | while (stack.Count > 0) 167 | { 168 | BDD b = stack.Pop(); 169 | if (!done.ContainsKey(b.One)) 170 | { 171 | done[b.One] = (doneCount++); 172 | stack.Push(b.One); 173 | } 174 | if (!done.ContainsKey(b.Zero)) 175 | { 176 | done[b.Zero] = (doneCount++); 177 | stack.Push(b.Zero); 178 | } 179 | int bId = done[b]; 180 | int fId = done[b.Zero]; 181 | int tId = done[b.One]; 182 | 183 | if (b.Ordinal > 15) 184 | throw new 
AutomataException(AutomataExceptionKind.CompactSerializationBitLimitViolation); 185 | 186 | res[bId] = (b.Ordinal << 28) | (tId << 14) | fId; 187 | } 188 | return res; 189 | } 190 | 191 | /// 192 | /// Recreates a BDD from an int array that has been created using SerializeCompact 193 | /// 194 | internal BDD DeserializeCompact(int[] arcs) 195 | { 196 | //return DeserializeBasedOnRanges(arcs); 197 | return DeserializeCompact2(arcs); 198 | } 199 | 200 | /// 201 | /// Recreates a BDD from an int array that has been created using SerializeCompact; throws AutomataException (CompactDeserializationError) if a child index lies outside the array or the node ordering is violated 202 | /// 203 | BDD DeserializeCompact2(int[] arcs) 204 | { 205 | if (arcs.Length == 1) 206 | return False; 207 | if (arcs.Length == 2) 208 | return True; 209 | 210 | //organized by order 211 | //note that all arcs are strictly increasing in levels 212 | var levels = new List[16]; 213 | 214 | BDD[] bddMap = new BDD[arcs.Length]; 215 | bddMap[0] = False; 216 | bddMap[1] = True; 217 | 218 | for (int i = 2; i < arcs.Length; i++) 219 | { 220 | int x = ((arcs[i] >> 28) & 0xF); 221 | if (levels[x] == null) 222 | levels[x] = new List(); 223 | levels[x].Add(i); 224 | } 225 | 226 | //create the BDD nodes according to the levels x 227 | //this is to ensure proper internalization 228 | for (int x = 0; x < 16; x++) 229 | { 230 | if (levels[x] != null) 231 | { 232 | foreach (int i in levels[x]) 233 | { 234 | int one = ((arcs[i] >> 14) & 0x3FFF); 235 | int zero = (arcs[i] & 0x3FFF); 236 | if (one >= bddMap.Length || zero >= bddMap.Length) //valid indices are 0..Length-1, so Length itself is already out of range 237 | throw new AutomataException(AutomataExceptionKind.CompactDeserializationError); 238 | var oneBranch = bddMap[one]; 239 | var zeroBranch = bddMap[zero]; 240 | var bdd = MkBvSet(x, oneBranch, zeroBranch); 241 | bddMap[i] = bdd; 242 | if (bdd.Ordinal <= bdd.One.Ordinal || bdd.Ordinal <= bdd.Zero.Ordinal) 243 | throw new AutomataException(AutomataExceptionKind.CompactDeserializationError); 244 | } 245 | } 246 | } 247 | 248 | return bddMap[2]; 249 | } 250 | #endregion 251 | 252 | /// 253 | /// Identity
function, returns s. 254 | /// 255 | public BDD ConvertFromCharSet(BDD s) 256 | { 257 | return s; 258 | } 259 | 260 | /// 261 | /// Returns this character set solver. 262 | /// 263 | public CharSetSolver CharSetProvider 264 | { 265 | get { return this; } 266 | } 267 | 268 | /// 269 | /// Returns pred. 270 | /// 271 | public BDD MkCharPredicate(string name, BDD pred) 272 | { 273 | return pred; 274 | } 275 | 276 | public IEnumerable GenerateAllCharacters(BDD bvSet, bool inRevereseOrder = false) 277 | { 278 | foreach (var c in GenerateAllElements(bvSet, inRevereseOrder)) 279 | yield return (char)c; 280 | } 281 | 282 | public IEnumerable GenerateAllCharacters(BDD set) 283 | { 284 | return GenerateAllCharacters(set, false); 285 | } 286 | 287 | 288 | /// 289 | /// Calculate the number of elements in the set. 290 | /// 291 | /// the given set 292 | /// the cardinality of the set 293 | public ulong ComputeDomainSize(BDD set) 294 | { 295 | var card = ComputeDomainSize(set, _bw - 1); 296 | return card; 297 | } 298 | 299 | /// 300 | /// Returns true iff the set contains exactly one element. 301 | /// 302 | /// the given set 303 | /// true iff the set is a singleton 304 | public bool IsSingleton(BDD set) 305 | { 306 | var card = ComputeDomainSize(set, _bw - 1); 307 | return card == (long)1; 308 | } 309 | 310 | /// 311 | /// Convert the set into an equivalent array of ranges. The ranges are nonoverlapping and ordered. 312 | /// If limit > 0 then returns null if the total number of ranges exceeds limit. 
313 | /// 314 | public Tuple[] ToRanges(BDD set, int limit = 0) 315 | { 316 | return ToRanges(set, _bw - 1, limit); 317 | } 318 | 319 | IEnumerable GenerateAllCharactersInOrder(BDD set) 320 | { 321 | var ranges = ToRanges(set); 322 | foreach (var range in ranges) 323 | for (long i = range.Item1; i <= range.Item2; i++) // long index: a uint would wrap past uint.MaxValue and never terminate 324 | yield return (uint)i; 325 | } 326 | 327 | IEnumerable GenerateAllCharactersInReverseOrder(BDD set) 328 | { 329 | var ranges = ToRanges(set); 330 | for (int j = ranges.Length - 1; j >= 0; j--) 331 | for (long i = ranges[j].Item2; i >= ranges[j].Item1; i--) // long index: a uint would wrap below 0 and never terminate when the range starts at 0 332 | yield return (uint)i; 333 | } 334 | 335 | /// 336 | /// Generate all characters that are members of the set in alphabetical order, smallest first, provided that inReverseOrder is false. 337 | /// 338 | /// the given set 339 | /// if true the members are generated in reverse alphabetical order with the largest first, otherwise in alphabetical order 340 | /// enumeration of all characters in the set, the enumeration is empty if the set is empty 341 | public IEnumerable GenerateAllElements(BDD set, bool inReverseOrder) 342 | { 343 | if (set == False) 344 | return GenerateNothing(); 345 | else if (inReverseOrder) 346 | return GenerateAllCharactersInReverseOrder(set); 347 | else 348 | return GenerateAllCharactersInOrder(set); 349 | } 350 | 351 | IEnumerable GenerateNothing() 352 | { 353 | yield break; 354 | } 355 | 356 | public BDD ConvertToCharSet(BDDAlgebra alg, BDD pred) 357 | { 358 | return pred; 359 | } 360 | 361 | #region code generation 362 | 363 | public BDD[] GetPartition() 364 | { 365 | throw new NotSupportedException(); 366 | } 367 | 368 | #endregion 369 | 370 | public override string SerializePredicate(BDD s) 371 | { 372 | throw new NotImplementedException(); 373 | } 374 | 375 | public override BDD DeserializePredicate(string s) 376 | { 377 | throw new NotImplementedException(); 378 | } 379 | } 380 | } 381 | 
-------------------------------------------------------------------------------- /srm/algebras/CharacterEncoding.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace Microsoft.SRM 6 | { 7 | /// 8 | /// Number of bits used in bitvectors. 9 | /// 10 | public enum BitWidth 11 | { 12 | /// 13 | /// 7 bit ASCII encoding 14 | /// 15 | BV7 = 7, 16 | /// 17 | /// 8 bit Extended ASCII encoding 18 | /// 19 | BV8 = 8, 20 | /// 21 | /// 16 bit bit-vector encoding 22 | /// 23 | BV16 = 16, 24 | /// 25 | /// 32 bit bit-vector encoding 26 | /// 27 | BV32 = 32, 28 | ///// 29 | ///// 64 bit bit-vector encoding 30 | ///// 31 | BV64 = 64 32 | } 33 | 34 | /// 35 | /// Provides functionality for character encodings. 36 | /// 37 | public static class CharacterEncodingTool 38 | { 39 | /// 40 | /// Maps ASCII to 7, extended ASCII to 8, and other encodings to 16. 41 | /// Throws AutomataException if IsSpecified(encoding) is false. 42 | /// 43 | /// 44 | /// either 7, 8, or 16 45 | public static int Truncate(BitWidth encoding) 46 | { 47 | switch (encoding) 48 | { 49 | case BitWidth.BV7: return 7; 50 | case BitWidth.BV8: return 8; 51 | case BitWidth.BV16: return 16; 52 | case BitWidth.BV32: return 16; 53 | case BitWidth.BV64: return 16; 54 | default: 55 | throw new AutomataException(AutomataExceptionKind.CharacterEncodingIsUnspecified); 56 | } 57 | } 58 | 59 | /// 60 | /// Returns true iff encoding is equal to one of the values declared in the BitWidth enum.
61 | /// 62 | public static bool IsSpecified(BitWidth encoding) 63 | { 64 | return (encoding == BitWidth.BV7 || 65 | encoding == BitWidth.BV32 || 66 | encoding == BitWidth.BV8 || 67 | encoding == BitWidth.BV64 || 68 | encoding == BitWidth.BV16); 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /srm/algebras/IBooleanAlgebra.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | 4 | namespace Microsoft.SRM 5 | { 6 | /// 7 | /// Generic Boolean Algebra solver. 8 | /// Provides operations for conjunction, disjunction, and negation. 9 | /// Allows to decide if a predicate is satisfiable and if two predicates are equivalent. 10 | /// 11 | /// predicates 12 | public interface IBooleanAlgebra 13 | { 14 | /// 15 | /// Top element of the Boolean algebra, corresponds to the value true. 16 | /// 17 | S True { get; } 18 | 19 | /// 20 | /// Bottom element of the Boolean algebra, corresponds to the value false. 21 | /// 22 | S False { get; } 23 | 24 | /// 25 | /// Make a conjunction of predicate1 and predicate2. 26 | /// 27 | S MkAnd(S predicate1, S predicate2); 28 | 29 | /// 30 | /// Make a conjunction of all the predicates in the enumeration. 31 | /// Returns True if the enumeration is empty. 32 | /// 33 | S MkAnd(IEnumerable predicates); 34 | 35 | /// 36 | /// Make a conjunction of all the predicates. 37 | /// Returns True if the enumeration is empty. 38 | /// 39 | S MkAnd(params S[] predicates); 40 | 41 | /// 42 | /// Make a disjunction of predicate1 and predicate2. 43 | /// 44 | S MkOr(S predicate1, S predicate2); 45 | 46 | /// 47 | /// Make a disjunction of all the predicates in the enumeration. 48 | /// Must return False if the enumeration is empty. 49 | /// 50 | S MkOr(IEnumerable predicates); 51 | 52 | /// 53 | /// Negate the predicate. 
54 | /// 55 | S MkNot(S predicate); 56 | 57 | /// 58 | /// Compute the predicate and(predicate1,not(predicate2)) 59 | /// 60 | S MkDiff(S predicate1, S predicate2); 61 | 62 | /// 63 | /// Returns true iff the predicate is satisfiable. 64 | /// 65 | bool IsSatisfiable(S predicate); 66 | 67 | /// 68 | /// Returns true iff predicate1 is equivalent to predicate2. 69 | /// 70 | bool AreEquivalent(S predicate1, S predicate2); 71 | 72 | /// 73 | /// True iff any two equivalent predicates are identical. 74 | /// 75 | bool IsExtensional { get; } 76 | 77 | /// 78 | /// Given an array of constraints {c_1, c_2, ..., c_n} where n>=0. 79 | /// Enumerate all satisfiable Boolean combinations Tuple({b_1, b_2, ..., b_n}, c) 80 | /// where c is satisfisable and equivalent to c'_1 & c'_2 & ... & c'_n, 81 | /// where c'_i = c_i if b_i = true and c'_i is Not(c_i) otherwise. 82 | /// If n=0 return Tuple({},True) 83 | /// 84 | /// array of constraints 85 | /// Booolean combinations that are satisfiable 86 | IEnumerable> GenerateMinterms(params S[] constraints); 87 | 88 | /// 89 | /// Serialize the predicate using characters in [0-9a-f\-\.] 90 | /// 91 | /// given predicate 92 | string SerializePredicate(S s); 93 | 94 | /// 95 | /// Deserialize the predicate from a string constructed with Serialize 96 | /// 97 | /// given serialized predicate 98 | S DeserializePredicate(string s); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /srm/algebras/ICharAlgebra.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace Microsoft.SRM 7 | { 8 | /// 9 | /// Extends ICharAlgebra with character predicate solving and predicate pretty printing. 
10 | /// 11 | /// predicates 12 | public interface ICharAlgebra : IBooleanAlgebra 13 | { 14 | BitWidth Encoding { get; } 15 | 16 | /// 17 | /// Make a constraint describing the set of all characters between a (inclusive) and b (inclusive). 18 | /// Add both uppercase and lowercase elelements if caseInsensitive is true. 19 | /// 20 | PRED MkRangeConstraint(char lower, char upper, bool caseInsensitive = false); 21 | 22 | /// 23 | /// Make a constraint describing a singleton set containing the character c, or 24 | /// a set containing also the upper and lowercase versions of c if caseInsensitive is true. 25 | /// 26 | /// if true include both the uppercase and the lowercase versions of the given character 27 | /// the given character 28 | PRED MkCharConstraint(char c, bool caseInsensitive = false); 29 | 30 | /// 31 | /// Make a term that encodes the given character set. 32 | /// 33 | PRED ConvertFromCharSet(BDD set); 34 | 35 | /// 36 | /// Compute the number of elements in the set 37 | /// 38 | ulong ComputeDomainSize(PRED set); 39 | 40 | /// 41 | /// Enumerate all characters in the set 42 | /// 43 | /// given set 44 | IEnumerable GenerateAllCharacters(PRED set); 45 | 46 | /// 47 | /// Convert a predicate into a set of characters. 48 | /// 49 | BDD ConvertToCharSet(BDDAlgebra solver, PRED pred); 50 | 51 | /// 52 | /// Gets the underlying character set solver. 53 | /// 54 | CharSetSolver CharSetProvider { get; } 55 | 56 | /// 57 | /// If named definitions are possible, 58 | /// makes a named definition of pred, as a unary relation symbol, 59 | /// such that, for all x, name(x) holds iff body(x) holds. Returns the 60 | /// atom name(x) that is equivalent to pred(x). 61 | /// If named definitions are not supported, returns pred. 62 | /// 63 | PRED MkCharPredicate(string name, PRED pred); 64 | 65 | /// 66 | /// Returns a partition of the full domain. 
67 | /// 68 | PRED[] GetPartition(); 69 | } 70 | } 71 | 72 | -------------------------------------------------------------------------------- /srm/algebras/IntervalSet.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Runtime.Serialization; 4 | using System.Text; 5 | 6 | namespace Microsoft.SRM 7 | { 8 | /// 9 | /// Represents a sorted finite set of finite intervals representing characters 10 | /// 11 | [Serializable] 12 | public class IntervalSet : ISerializable 13 | { 14 | Tuple[] intervals; 15 | 16 | /// 17 | /// Create a new interval set 18 | /// 19 | /// given intervals 20 | public IntervalSet(params Tuple[] intervals) 21 | { 22 | this.intervals = intervals; 23 | } 24 | 25 | /// 26 | /// Gets the index'th element where index is in [0..Count-1]. 27 | /// Throws IndexOutOfRangeException() if index is out of range. 28 | /// 29 | public uint this[int index] 30 | { 31 | get 32 | { 33 | int k = index; 34 | for (int i = 0; i < intervals.Length; i++) 35 | { 36 | int ith_size = (int)intervals[i].Item2 - (int)intervals[i].Item1 + 1; 37 | if (k < ith_size) 38 | return intervals[i].Item1 + (uint)k; 39 | else 40 | k = k - ith_size; 41 | } 42 | throw new IndexOutOfRangeException(); 43 | } 44 | } 45 | 46 | int count = -1; 47 | 48 | /// 49 | /// Number of elements in the set 50 | /// 51 | public int Count 52 | { 53 | get 54 | { 55 | if (count == -1) 56 | { 57 | int s = 0; 58 | for (int i = 0; i < intervals.Length; i++) 59 | { 60 | s += (int)intervals[i].Item2 - (int)intervals[i].Item1 + 1; 61 | } 62 | count = s; 63 | } 64 | return count; 65 | } 66 | } 67 | 68 | public bool IsEmpty 69 | { 70 | get { return Count == 0; } 71 | } 72 | 73 | private static int CompareTuples(Tuple x, Tuple y) 74 | { 75 | return x.Item1.CompareTo(y.Item1); 76 | } 77 | 78 | internal static IntervalSet Merge(IEnumerable sets) 79 | { 80 | List> merged = new List>(); 81 | foreach (var set in sets) 
82 | merged.AddRange(set.intervals); 83 | 84 | merged.Sort(CompareTuples); 85 | return new IntervalSet(merged.ToArray()); 86 | } 87 | 88 | public BDD AsBDD(BDDAlgebra alg) 89 | { 90 | var res = alg.False; 91 | for (int i = 0; i < intervals.Length; i++) 92 | res = res | alg.MkSetFromRange(intervals[i].Item1, intervals[i].Item2, 15); 93 | return res; 94 | } 95 | 96 | public IEnumerable Enumerate() 97 | { 98 | for (int i = 0; i < intervals.Length; i++) 99 | { 100 | for (uint j = intervals[i].Item1; j < intervals[i].Item2; j++) 101 | yield return j; 102 | yield return intervals[i].Item2; 103 | } 104 | } 105 | 106 | internal string ToCharacterClass(bool isComplement) 107 | { 108 | if (IsEmpty) 109 | return "[0-[0]]"; 110 | 111 | string res = ""; 112 | uint m = intervals[0].Item1; 113 | uint n = intervals[0].Item2; 114 | for (int i = 1; i < intervals.Length; i++) 115 | { 116 | if (intervals[i].Item1 == n + 1) 117 | n = intervals[i].Item2; 118 | else 119 | { 120 | res += ToCharacterClassInterval(m, n); 121 | m = intervals[i].Item1; 122 | n = intervals[i].Item2; 123 | } 124 | } 125 | res += ToCharacterClassInterval(m, n); 126 | if (isComplement || res.Length > 1) 127 | { 128 | res = "[" + (isComplement ? 
"^" : "") + res + "]"; 129 | } 130 | return res; 131 | } 132 | 133 | private static string ToCharacterClassInterval(uint m, uint n) 134 | { 135 | if (m == 0 && n == 0xFFFF) 136 | return "."; 137 | 138 | if (m == n) 139 | return StringUtility.Escape((char)m); 140 | 141 | string res = StringUtility.Escape((char)m); 142 | if (n > m + 1) 143 | res += "-"; 144 | res += StringUtility.Escape((char)n); 145 | return res; 146 | } 147 | 148 | public override string ToString() 149 | { 150 | return ToCharacterClass(false); 151 | } 152 | 153 | #region custom serialization 154 | /// 155 | /// Serialize 156 | /// 157 | public void GetObjectData(SerializationInfo info, StreamingContext context) 158 | { 159 | string s = Serialize(); 160 | info.AddValue("i", s); 161 | } 162 | /// 163 | /// Deserialize 164 | /// 165 | public IntervalSet(SerializationInfo info, StreamingContext context) 166 | { 167 | string s = info.GetString("i"); 168 | intervals = Deserialize(s); 169 | } 170 | 171 | /// 172 | /// Returns a string that can be parsed back to IntervalSet 173 | /// 174 | public string Serialize() 175 | { 176 | string s = ""; 177 | for (int i=0; i < intervals.Length; i++) 178 | { 179 | if (i > 0) 180 | s += ","; 181 | s += intervals[i].Item1.ToString(); 182 | s += "-"; 183 | s += intervals[i].Item2.ToString(); 184 | } 185 | return s; 186 | } 187 | 188 | static Tuple[] Deserialize(string s) 189 | { 190 | Func> f = pair => 191 | { 192 | string[] vals = pair.Split('-'); 193 | return new Tuple(uint.Parse(vals[0]), uint.Parse(vals[1])); 194 | }; 195 | var intervals = Array.ConvertAll(s.Split(','), pair => f(pair)); 196 | return intervals; 197 | } 198 | 199 | /// 200 | /// Parse the interval set from a string s that was produced with Serialize 201 | /// 202 | /// given serialization 203 | public static IntervalSet Parse(string s) 204 | { 205 | var intervals = Deserialize(s); 206 | return new IntervalSet(intervals); 207 | } 208 | #endregion 209 | } 210 | } 211 | 
-------------------------------------------------------------------------------- /srm/algebras/MintermGenerator.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace Microsoft.SRM 6 | { 7 | 8 | /// 9 | /// Provides a generic implementation for minterm generation over a given Boolean Algebra. 10 | /// 11 | /// type of predicates 12 | public class MintermGenerator 13 | { 14 | IBooleanAlgebra ba; 15 | 16 | bool hashCodesRespectEquivalence; 17 | 18 | /// 19 | /// Constructs a minterm generator for a given Boolean Algebra. 20 | /// 21 | /// given Boolean Algebra 22 | public MintermGenerator(IBooleanAlgebra ba) 23 | { 24 | this.ba = ba; 25 | hashCodesRespectEquivalence = ba.IsExtensional; 26 | } 27 | 28 | /// 29 | /// Returns GenerateMinterms(true, preds). 30 | /// 31 | public IEnumerable> GenerateMinterms(params PRED[] preds) 32 | { 33 | return GenerateMinterms(true, preds); 34 | } 35 | 36 | /// 37 | /// Given an array of predidates {p_1, p_2, ..., p_n} where n>=0. 38 | /// Enumerate all satisfiable Boolean combinations Tuple({b_1, b_2, ..., b_n}, p) 39 | /// where p is satisfiable and equivalent to p'_1 & p'_2 & ... & p'_n, 40 | /// where p'_i = p_i if b_i = true and p'_i is Not(p_i) otherwise. 41 | /// If n=0 return Tuple({},True). 
42 | /// 43 | /// array of predicates 44 | /// optimization flag: if true, uses equivalence checking to cluster equivalent predicates; otherwise does not use equivalence checking 45 | /// all minterms of the given predicate sequence 46 | public IEnumerable> GenerateMinterms(bool useEquivalenceChecking, params PRED[] preds) 47 | { 48 | if (preds.Length == 0) 49 | { 50 | yield return new Tuple(new bool[] { }, ba.True); 51 | } 52 | else 53 | { 54 | var count = preds.Length; 55 | 56 | List nonequivalentSets = new List(); 57 | 58 | //work only with nonequivalent sets as distinct elements 59 | var indexLookup = new Dictionary(); 60 | var newIndexMap = new Dictionary(); 61 | var equivs = new List>(); 62 | 63 | for (int i = 0; i < count; i++) 64 | { 65 | int newIndex; 66 | EquivClass equiv = CreateEquivalenceClass(useEquivalenceChecking, preds[i]); 67 | if (!newIndexMap.TryGetValue(equiv, out newIndex)) 68 | { 69 | newIndex = newIndexMap.Count; 70 | newIndexMap[equiv] = newIndex; 71 | nonequivalentSets.Add(preds[i]); 72 | equivs.Add(new List()); 73 | } 74 | indexLookup[i] = newIndex; 75 | equivs[newIndex].Add(i); 76 | } 77 | 78 | //var pairs = new List>(GenerateMinterms1(nonequivalentSets.ToArray())); 79 | //foreach (var pair in pairs) 80 | //{ 81 | // var characteristic = new bool[preds.Length]; 82 | // for (int i = 0; i < count; i++) 83 | // if (pair.First.Contains(indexLookup[i])) 84 | // characteristic[i] = true; 85 | // yield return 86 | // new Tuple(characteristic, pair.Second); 87 | //} 88 | 89 | var tree = new PartitonTree(ba); 90 | foreach (var psi in nonequivalentSets) 91 | tree.Refine(psi); 92 | foreach (var leaf in tree.GetLeaves()) 93 | { 94 | var characteristic = new bool[preds.Length]; 95 | foreach (var k in leaf.GetPath()) 96 | foreach (var n in equivs[k]) 97 | characteristic[n] = true; 98 | yield return 99 | new Tuple(characteristic, leaf.phi); 100 | } 101 | } 102 | } 103 | 104 | EquivClass CreateEquivalenceClass(bool useEquivalenceChecking, PRED set) 105 
| { 106 | return new EquivClass(useEquivalenceChecking, this, set); 107 | } 108 | 109 | private class EquivClass 110 | { 111 | PRED set; 112 | MintermGenerator gen; 113 | bool useEquivalenceChecking; 114 | 115 | internal EquivClass(bool useEquivalenceChecking, MintermGenerator gen, PRED set) 116 | { 117 | this.set = set; 118 | this.gen = gen; 119 | this.useEquivalenceChecking = useEquivalenceChecking; 120 | } 121 | 122 | public override int GetHashCode() 123 | { 124 | if (useEquivalenceChecking && !gen.hashCodesRespectEquivalence) 125 | //cannot rely on equivalent predicates having the same hashcode 126 | //so all predicates end up in the same bucket that causes a linear search 127 | //with Equals to check equivalence when useEquivalenceChecking=true 128 | return 0; 129 | else 130 | return set.GetHashCode(); 131 | } 132 | 133 | public override bool Equals(object obj) 134 | { 135 | if (useEquivalenceChecking) 136 | return gen.ba.AreEquivalent(set, ((EquivClass)obj).set); 137 | else 138 | return set.Equals(((EquivClass)obj).set); 139 | } 140 | } 141 | } 142 | 143 | internal class PartitonTree 144 | { 145 | PartitonTree parent; 146 | int nr; 147 | internal PRED phi; 148 | IBooleanAlgebra solver; 149 | PartitonTree left; 150 | PartitonTree right; //complement 151 | internal PartitonTree(IBooleanAlgebra solver) 152 | { 153 | this.solver = solver; 154 | nr = -1; 155 | parent = null; 156 | this.phi = solver.True; 157 | this.left = null; 158 | this.right = null; 159 | } 160 | PartitonTree(IBooleanAlgebra solver, int depth, PartitonTree parent, PRED phi, PartitonTree left, PartitonTree right) 161 | { 162 | this.solver = solver; 163 | this.parent = parent; 164 | this.nr = depth; 165 | this.phi = phi; 166 | this.left = left; 167 | this.right = right; 168 | } 169 | 170 | internal void Refine(PRED psi) 171 | { 172 | 173 | if (left == null && right == null) 174 | { 175 | #region leaf 176 | var phi_and_psi = solver.MkAnd(phi, psi); 177 | if (solver.IsSatisfiable(phi_and_psi)) 178 
| { 179 | var phi_min_psi = solver.MkAnd(phi, solver.MkNot(psi)); 180 | if (solver.IsSatisfiable(phi_min_psi)) 181 | { 182 | left = new PartitonTree(solver, nr + 1, this, phi_and_psi, null, null); 183 | right = new PartitonTree(solver, nr + 1, this, phi_min_psi, null, null); 184 | } 185 | else // [[phi]] subset of [[psi]] 186 | left = new PartitonTree(solver, nr + 1, this, phi, null, null); //psi must true 187 | } 188 | else // [[phi]] subset of [[not(psi)]] 189 | right = new PartitonTree(solver, nr + 1, this, phi, null, null); //psi must be false 190 | #endregion 191 | } 192 | else if (left == null) 193 | right.Refine(psi); 194 | else if (right == null) 195 | left.Refine(psi); 196 | else 197 | { 198 | #region nonleaf 199 | var phi_and_psi = solver.MkAnd(phi, psi); 200 | if (solver.IsSatisfiable(phi_and_psi)) 201 | { 202 | var phi_min_psi = solver.MkAnd(phi, solver.MkNot(psi)); 203 | if (solver.IsSatisfiable(phi_min_psi)) 204 | { 205 | left.Refine(psi); 206 | right.Refine(psi); 207 | } 208 | else // [[phi]] subset of [[psi]] 209 | { 210 | left.ExtendLeft(); //psi is true 211 | right.ExtendLeft(); 212 | } 213 | } 214 | else // [[phi]] subset of [[not(psi)]] 215 | { 216 | left.ExtendRight(); 217 | right.ExtendRight(); //psi is false 218 | } 219 | #endregion 220 | } 221 | } 222 | 223 | private void ExtendRight() 224 | { 225 | if (left == null && right == null) 226 | right = new PartitonTree(solver, nr + 1, this, phi, null, null); 227 | else if (left == null) 228 | right.ExtendRight(); 229 | else if (right == null) 230 | left.ExtendRight(); 231 | else 232 | { 233 | left.ExtendRight(); 234 | right.ExtendRight(); 235 | } 236 | } 237 | 238 | private void ExtendLeft() 239 | { 240 | if (left == null && right == null) 241 | left = new PartitonTree(solver, nr + 1, this, phi, null, null); 242 | else if (left == null) 243 | right.ExtendLeft(); 244 | else if (right == null) 245 | left.ExtendLeft(); 246 | else 247 | { 248 | left.ExtendLeft(); 249 | right.ExtendLeft(); 250 | } 251 
| } 252 | 253 | internal IEnumerable GetPath() 254 | { 255 | for (var curr = this; curr.parent != null; curr = curr.parent) 256 | if (curr.parent.left == curr) //curr is the left child of its parent 257 | yield return curr.nr; 258 | } 259 | 260 | internal IEnumerable> GetLeaves() 261 | { 262 | if (left == null && right == null) 263 | yield return this; 264 | else if (right == null) 265 | foreach (var leaf in left.GetLeaves()) 266 | yield return leaf; 267 | else if (left == null) 268 | foreach (var leaf in right.GetLeaves()) 269 | yield return leaf; 270 | else 271 | { 272 | foreach (var leaf in left.GetLeaves()) 273 | yield return leaf; 274 | foreach (var leaf in right.GetLeaves()) 275 | yield return leaf; 276 | } 277 | } 278 | } 279 | } 280 | -------------------------------------------------------------------------------- /srm/algebras/RangeConverter.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | 4 | namespace Microsoft.SRM 5 | { 6 | internal class RangeConverter 7 | { 8 | Dictionary[]> rangeCache = new Dictionary[]>(); 9 | 10 | internal RangeConverter() 11 | { 12 | } 13 | 14 | //e.g. if b = 6 and p = 2 and ranges = (in binary form) {[0000 1010, 0000 1110]} i.e. 
[x0A,x0E] 15 | //then res = {[0000 1010, 0000 1110], [0001 1010, 0001 1110], 16 | // [0010 1010, 0010 1110], [0011 1010, 0011 1110]}, 17 | Tuple[] LiftRanges(int b, int p, Tuple[] ranges) 18 | { 19 | if (p == 0) 20 | return ranges; 21 | 22 | int k = b - p; 23 | uint maximal = ((uint)1 << k) - 1; 24 | 25 | Tuple[] res = new Tuple[(1 << p) * (ranges.Length)]; 26 | int j = 0; 27 | for (uint i = 0; i < (1 << p); i++) 28 | { 29 | uint prefix = (i << k); 30 | foreach (var range in ranges) 31 | res[j++] = new Tuple(range.Item1 | prefix, range.Item2 | prefix); 32 | } 33 | 34 | //the range wraps around : [0...][...2^k-1][2^k...][...2^(k+1)-1] 35 | if (ranges[0].Item1 == 0 && ranges[ranges.Length - 1].Item2 == maximal) 36 | { 37 | //merge consequtive ranges, we know that res has at least two elements here 38 | List> res1 = new List>(); 39 | var from = res[0].Item1; 40 | var to = res[0].Item2; 41 | for (int i = 1; i < res.Length; i++) 42 | { 43 | if (to == res[i].Item1 - 1) 44 | to = res[i].Item2; 45 | else 46 | { 47 | res1.Add(new Tuple(from, to)); 48 | from = res[i].Item1; 49 | to = res[i].Item2; 50 | } 51 | } 52 | res1.Add(new Tuple(from, to)); 53 | res = res1.ToArray(); 54 | } 55 | 56 | //CheckBug(res); 57 | return res; 58 | } 59 | 60 | Tuple[] ToRanges1(BDD set) 61 | { 62 | Tuple[] ranges; 63 | if (!rangeCache.TryGetValue(set, out ranges)) 64 | { 65 | int b = set.Ordinal; 66 | uint mask = (uint)1 << b; 67 | if (set.Zero.IsEmpty) 68 | { 69 | #region 0-case is empty 70 | if (set.One.IsFull) 71 | { 72 | var range = new Tuple(mask, (mask << 1) - 1); 73 | ranges = new Tuple[] { range }; 74 | } 75 | else //1-case is neither full nor empty 76 | { 77 | var ranges1 = LiftRanges(b, (b - set.One.Ordinal) - 1, ToRanges1(set.One)); 78 | ranges = new Tuple[ranges1.Length]; 79 | for (int i = 0; i < ranges1.Length; i++) 80 | { 81 | ranges[i] = new Tuple(ranges1[i].Item1 | mask, ranges1[i].Item2 | mask); 82 | } 83 | } 84 | #endregion 85 | } 86 | else if (set.Zero.IsFull) 87 | { 88 | 
#region 0-case is full 89 | if (set.One.IsEmpty) 90 | { 91 | var range = new Tuple(0, mask - 1); 92 | ranges = new Tuple[] { range }; 93 | } 94 | else 95 | { 96 | var rangesR = LiftRanges(b, (b - set.One.Ordinal) - 1, ToRanges1(set.One)); 97 | var range = rangesR[0]; 98 | if (range.Item1 == 0) 99 | { 100 | ranges = new Tuple[rangesR.Length]; 101 | ranges[0] = new Tuple(0, range.Item2 | mask); 102 | for (int i = 1; i < rangesR.Length; i++) 103 | { 104 | ranges[i] = new Tuple(rangesR[i].Item1 | mask, rangesR[i].Item2 | mask); 105 | } 106 | } 107 | else 108 | { 109 | ranges = new Tuple[rangesR.Length + 1]; 110 | ranges[0] = new Tuple(0, mask - 1); 111 | for (int i = 0; i < rangesR.Length; i++) 112 | { 113 | ranges[i + 1] = new Tuple(rangesR[i].Item1 | mask, rangesR[i].Item2 | mask); 114 | } 115 | } 116 | } 117 | #endregion 118 | } 119 | else 120 | { 121 | #region 0-case is neither full nor empty 122 | var rangesL = LiftRanges(b, (b - set.Zero.Ordinal) - 1, ToRanges1(set.Zero)); 123 | var last = rangesL[rangesL.Length - 1]; 124 | 125 | if (set.One.IsEmpty) 126 | { 127 | ranges = rangesL; 128 | } 129 | 130 | else if (set.One.IsFull) 131 | { 132 | var ranges1 = new List>(); 133 | for (int i = 0; i < rangesL.Length - 1; i++) 134 | ranges1.Add(rangesL[i]); 135 | if (last.Item2 == (mask - 1)) 136 | { 137 | ranges1.Add(new Tuple(last.Item1, (mask << 1) - 1)); 138 | } 139 | else 140 | { 141 | ranges1.Add(last); 142 | ranges1.Add(new Tuple(mask, (mask << 1) - 1)); 143 | } 144 | ranges = ranges1.ToArray(); 145 | } 146 | else //general case: neither 0-case, not 1-case is full or empty 147 | { 148 | var rangesR0 = ToRanges1(set.One); 149 | 150 | var rangesR = LiftRanges(b, (b - set.One.Ordinal) - 1, rangesR0); 151 | 152 | var first = rangesR[0]; 153 | 154 | if (last.Item2 == (mask - 1) && first.Item1 == 0) //merge together the last and first ranges 155 | { 156 | ranges = new Tuple[rangesL.Length + rangesR.Length - 1]; 157 | for (int i = 0; i < rangesL.Length - 1; i++) 158 | 
ranges[i] = rangesL[i]; 159 | ranges[rangesL.Length - 1] = new Tuple(last.Item1, first.Item2 | mask); 160 | for (int i = 1; i < rangesR.Length; i++) 161 | ranges[rangesL.Length - 1 + i] = new Tuple(rangesR[i].Item1 | mask, rangesR[i].Item2 | mask); 162 | } 163 | else 164 | { 165 | ranges = new Tuple[rangesL.Length + rangesR.Length]; 166 | for (int i = 0; i < rangesL.Length; i++) 167 | ranges[i] = rangesL[i]; 168 | for (int i = 0; i < rangesR.Length; i++) 169 | ranges[rangesL.Length + i] = new Tuple(rangesR[i].Item1 | mask, rangesR[i].Item2 | mask); 170 | } 171 | 172 | } 173 | #endregion 174 | } 175 | rangeCache[set] = ranges; 176 | } 177 | return ranges; 178 | } 179 | 180 | /// 181 | /// Convert the set into an equivalent array of ranges. 182 | /// The ranges are nonoverlapping and ordered. 183 | /// 184 | public Tuple[] ToRanges(BDD set, int maxBit) 185 | { 186 | if (set.IsEmpty) 187 | return new Tuple[] { }; 188 | else if (set.IsFull) 189 | return new Tuple[] { new Tuple(0, ((((uint)1 << maxBit) << 1) - 1)) }; //note: maxBit could be 31 190 | else 191 | return LiftRanges(maxBit + 1, maxBit - set.Ordinal, ToRanges1(set)); 192 | } 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /srm/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AutomataDotNet/srm/7c7eec9c4c974610f246e2502d93730335e70fa9/srm/icon.png -------------------------------------------------------------------------------- /srm/matcher/BooleanDecisionTree.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.Serialization; 4 | 5 | namespace Microsoft.SRM 6 | { 7 | /// 8 | /// Decision tree for mapping character ranges into corresponding partition block ids 9 | /// 10 | [Serializable] 11 | internal class BooleanDecisionTree : ISerializable 12 | { 13 | 
[NonSerialized] 14 | internal bool[] precomputed; 15 | [NonSerialized] 16 | internal DecisionTree.BST bst; 17 | 18 | internal BooleanDecisionTree(bool[] precomputed, DecisionTree.BST bst) 19 | { 20 | this.precomputed = precomputed; 21 | this.bst = bst; 22 | } 23 | 24 | /// 25 | /// Crteate a Boolean decision tree. 26 | /// References to solver and domain are not saved in the resulting decision tree. 27 | /// 28 | /// character alberbra 29 | /// elements that map to true 30 | /// upper limit for block ids for characters to be precomputed in an array (default is 0xFF, i.e. extended ASCII) 31 | /// 32 | internal static BooleanDecisionTree Create(CharSetSolver solver, BDD domain, ushort precomputeLimit = 0xFF) 33 | { 34 | BDD domain_compl = solver.MkNot(domain); 35 | var partition = new BDD[] { domain_compl, domain }; 36 | if (precomputeLimit == 0) 37 | { 38 | return new BooleanDecisionTree(new bool[] { }, MkBST(new DecisionTree.PartitionCut(solver, partition), 0, 0xFFFF)); 39 | } 40 | 41 | bool[] precomp = Precompute(solver, domain, precomputeLimit); 42 | DecisionTree.BST bst = null; 43 | if (precomputeLimit < ushort.MaxValue) 44 | bst = MkBST(new DecisionTree.PartitionCut(solver, partition), precomputeLimit + 1, ushort.MaxValue); 45 | 46 | return new BooleanDecisionTree(precomp, bst); 47 | } 48 | 49 | private static bool[] Precompute(CharSetSolver solver, BDD domain, int precomputeLimit) 50 | { 51 | bool[] precomp = new bool[precomputeLimit + 1]; 52 | Func F = i => 53 | { 54 | var bdd = solver.MkCharConstraint((char)i); 55 | if (solver.IsSatisfiable(solver.MkAnd(bdd, domain))) 56 | return true; 57 | else 58 | return false; 59 | }; 60 | for (int c = 0; c <= precomputeLimit; c++) 61 | { 62 | precomp[c] = F(c); 63 | } 64 | return precomp; 65 | } 66 | 67 | private static DecisionTree.BST MkBST(DecisionTree.PartitionCut partition, int from, int to) 68 | { 69 | var cut = partition.Cut(from, to); 70 | if (cut.IsEmpty) 71 | return null; 72 | else 73 | { 74 | int block_id = 
cut.GetSigletonId(); 75 | if (block_id >= 0) 76 | //there is precisely one block remaining 77 | return new DecisionTree.BST(block_id, null, null); 78 | else 79 | { 80 | //it must be that 'from < to' 81 | //or else there could only have been one block 82 | int mid = (from + to) / 2; 83 | var left = MkBST(cut, from, mid); 84 | var right = MkBST(cut, mid + 1, to); 85 | //it must be that either left != null or right != null 86 | if (left == null) 87 | return right; 88 | else if (right == null) 89 | return left; 90 | else 91 | return new DecisionTree.BST(mid + 1, left, right); 92 | } 93 | } 94 | } 95 | 96 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 97 | public bool Contains(ushort c) 98 | { 99 | return (c < precomputed.Length ? precomputed[c] : bst.Find(c) == 1); 100 | } 101 | 102 | #region serialization 103 | /// 104 | /// Serialize 105 | /// 106 | public void GetObjectData(SerializationInfo info, StreamingContext context) 107 | { 108 | info.AddValue("p", SerializePrecomputed()); 109 | info.AddValue("b", bst.Serialize()); 110 | } 111 | /// 112 | /// Deserialize 113 | /// 114 | public BooleanDecisionTree(SerializationInfo info, StreamingContext context) 115 | { 116 | precomputed = DeserializePrecomputed(info.GetString("p")); 117 | this.bst = DecisionTree.BST.Deserialize(info.GetString("b")); 118 | } 119 | 120 | string SerializePrecomputed() 121 | { 122 | char[] chars = Array.ConvertAll(precomputed, b => (b ? '1' : '0')); 123 | var s = new String(chars); 124 | return s; 125 | } 126 | 127 | static bool[] DeserializePrecomputed(string s) 128 | { 129 | var vals = Array.ConvertAll(s.ToCharArray(), c => (c == '1' ? 
true : false)); 130 | return vals; 131 | } 132 | #endregion 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /srm/matcher/DecisionTree.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.Serialization; 4 | using System.Text; 5 | 6 | namespace Microsoft.SRM 7 | { 8 | /// 9 | /// Decision tree for mapping character ranges into corresponding partition block ids 10 | /// 11 | [Serializable] 12 | public class DecisionTree : ISerializable 13 | { 14 | [NonSerialized] 15 | internal int[] precomputed; 16 | [NonSerialized] 17 | internal BST bst; 18 | 19 | internal BST Tree 20 | { 21 | get 22 | { 23 | return bst; 24 | } 25 | } 26 | 27 | public DecisionTree(int[] precomputed, BST bst) 28 | { 29 | this.precomputed = precomputed; 30 | this.bst = bst; 31 | } 32 | 33 | /// 34 | /// Crteate a decision tree that maps a character into a partion block id 35 | /// 36 | /// character alberbra 37 | /// partition of the whole set of all characters into pairwise disjoint nonempty sets 38 | /// upper limit for block ids for characters to be precomputed in an array (default is 0xFF, i.e. extended ASCII) 39 | /// 40 | internal static DecisionTree Create(CharSetSolver solver, BDD[] partition, ushort precomputeLimit = 0xFF) 41 | { 42 | if (partition.Length == 1) 43 | //there is no actual partition, everything maps to one id 0, e.g. 
as in .* 44 | return new DecisionTree(new int[(int)precomputeLimit], new BST(0, null, null)); 45 | 46 | if (precomputeLimit == 0) 47 | return new DecisionTree(new int[] { }, MkBST(new PartitionCut(solver, partition), 0, 0xFFFF)); 48 | 49 | int[] precomp = Precompute(solver, partition, precomputeLimit); 50 | BST bst = null; 51 | if (precomputeLimit < ushort.MaxValue) 52 | bst = MkBST(new PartitionCut(solver, partition), precomputeLimit + 1, ushort.MaxValue); 53 | 54 | return new DecisionTree(precomp, bst); 55 | } 56 | 57 | private static int[] Precompute(CharSetSolver solver, BDD[] partition, int precomputeLimit) 58 | { 59 | int[] precomp = new int[precomputeLimit + 1]; 60 | Func GetPartitionId = i => 61 | { 62 | for (int j = 0; j < partition.Length; j++) 63 | { 64 | var i_bdd = solver.MkCharConstraint((char)i); 65 | if (solver.IsSatisfiable(solver.MkAnd(i_bdd, partition[j]))) 66 | { 67 | return j; 68 | } 69 | } 70 | return -1; 71 | }; 72 | for (int c = 0; c <= precomputeLimit; c++) 73 | { 74 | int id = GetPartitionId(c); 75 | if (id < 0) 76 | throw new AutomataException(AutomataExceptionKind.InternalError); 77 | precomp[c] = id; 78 | } 79 | return precomp; 80 | } 81 | 82 | private static BST MkBST(PartitionCut partition, int from, int to) 83 | { 84 | var cut = partition.Cut(from, to); 85 | if (cut.IsEmpty) 86 | return null; 87 | else 88 | { 89 | int block_id = cut.GetSigletonId(); 90 | if (block_id >= 0) 91 | //there is precisely one block remaining 92 | return new BST(block_id, null, null); 93 | else 94 | { 95 | //it must be that 'from < to' 96 | //or else there could only have been one block 97 | int mid = (from + to) / 2; 98 | var left = MkBST(cut, from, mid); 99 | var right = MkBST(cut, mid + 1, to); 100 | //it must be that either left != null or right != null 101 | if (left == null) 102 | return right; 103 | else if (right == null) 104 | return left; 105 | else 106 | return new BST(mid + 1, left, right); 107 | } 108 | } 109 | } 110 | 111 | 
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int GetId(ushort c)
{
    // Codes below the table length are answered from the precomputed array;
    // every other code is resolved by searching the tree.
    return c < precomputed.Length ? precomputed[c] : bst.Find(c);
}

/// <summary>
/// Used in the decision tree to locate minterm ids of nonascii characters
/// </summary>
public class BST
{
    //[NonSerialized]
    int node;
    //[NonSerialized]
    BST left;
    //[NonSerialized]
    BST right;

    /// <summary>Left child; null exactly when this node is a leaf.</summary>
    internal BST Left => left;

    /// <summary>Right child.</summary>
    internal BST Right => right;

    /// <summary>A node counts as a leaf when it has no left child.</summary>
    internal bool IsLeaf => left == null;

    /// <summary>Value stored in this node.</summary>
    internal int Node => node;

    /// <summary>
    /// Descends from this node, going left while <paramref name="charCode"/> is smaller
    /// than the node value and right otherwise, and returns the value of the leaf reached.
    /// </summary>
    internal int Find(int charCode)
    {
        BST current = this;
        while (current.left != null)
        {
            current = charCode < current.node ? current.left : current.right;
        }
        return current.node; //return the leaf
    }

    public BST(int node, BST left, BST right)
    {
        this.node = node;
        this.left = left;
        this.right = right;
    }

    public override string ToString() => Serialize();

    #region custom serialization
    // Leaves are written as "<node>#", inner nodes as "(<node>,<left>,<right>)".
    void SerializeHelper(StringBuilder sb)
    {
        if (!IsLeaf)
        {
            sb.Append("(");
            sb.Append(node);
            sb.Append(",");
            left.SerializeHelper(sb);
            sb.Append(",");
            right.SerializeHelper(sb);
            sb.Append(")");
            return;
        }

        sb.Append(string.Format("{0}#", node));
    }

    /// <summary>Returns the textual form of the tree rooted at this node.</summary>
    public string Serialize()
    {
        var builder = new StringBuilder();
        SerializeHelper(builder);
        return builder.ToString();
    }

    /// <summary>Rebuilds a tree from text produced by <see cref="Serialize"/>.</summary>
    public static BST Deserialize(string s)
    {
        int consumed;
        return DeserializeHelper(s, 0, out consumed);
    }

    // Parses one node starting at position i; next_i receives the position just
    // after the text of that node.
    static BST DeserializeHelper(string s, int i, out int next_i)
    {
        if (s[i] != '(')
        {
            //leaf l(node)
            int hash = s.IndexOf('#', i);
            int leafValue = int.Parse(s.Substring(i, hash - i));
            next_i = hash + 1;
            return new BST(leafValue, null, null);
        }

        int comma = s.IndexOf(',', i + 1);
        int pivot = int.Parse(s.Substring(i + 1, comma - (i + 1)));
        int afterLeft;
        BST leftChild = DeserializeHelper(s, comma + 1, out afterLeft);
        int afterRight;
        BST rightChild = DeserializeHelper(s, afterLeft + 1, out afterRight);
        next_i = afterRight + 1;
        return new BST(pivot, leftChild, rightChild);
    }
    #endregion
}

/// <summary>
/// Represents a cut of the original partition wrt some interval
/// </summary>
internal class PartitionCut
{
    BDD[] blocks;
    CharSetSolver solver;

    internal PartitionCut(CharSetSolver solver, BDD[] blocks)
    {
        this.blocks = blocks;
        this.solver = solver;
    }

    /// <summary>True when every block of the cut is empty.</summary>
    internal bool IsEmpty => Array.TrueForAll(blocks, block => block.IsEmpty);

    /// <summary>
    /// Returns the index of the only nonempty block, or -1 when there is no
    /// nonempty block or more than one.
    /// </summary>
    internal int GetSigletonId()
    {
        int found = -1;
        for (int index = 0; index < blocks.Length; index++)
        {
            if (blocks[index].IsEmpty)
                continue;

            if (found >= 0)
                return -1; //there is more than one nonempty block

            found = index;
        }
        return found;
    }

    /// <summary>
    /// Intersects every block with the character range [lower, upper] and returns
    /// the resulting cut.
    /// </summary>
    internal PartitionCut Cut(int lower, int upper)
    {
        var range = solver.MkCharSetFromRange((char)lower, (char)upper);
        var restricted = Array.ConvertAll(blocks, block => solver.MkAnd(block, range));
        return new PartitionCut(solver, restricted);
    }
}

#region serialization
/// <summary>
/// Serialize
/// </summary>
public void GetObjectData(SerializationInfo info, StreamingContext context)
{
    // "p" carries the precomputed table, "b" the serialized tree.
    info.AddValue("p", SerializePrecomputed());
    info.AddValue("b", bst.Serialize());
}
///
/// Deserialize
///
public DecisionTree(SerializationInfo info, StreamingContext context)
{
    // Reads back the two values written by GetObjectData.
    precomputed = DeserializePrecomputed(info.GetString("p"));
    bst = BST.Deserialize(info.GetString("b"));
}

/// <summary>
/// Renders the precomputed table as a comma-separated list of its entries.
/// </summary>
string SerializePrecomputed()
{
    // string.Join builds the text in one pass; appending with += in a loop
    // recopies the whole string for every entry.
    return string.Join<int>(",", precomputed);
}

/// <summary>
/// Parses a comma-separated list of integers produced by SerializePrecomputed.
/// </summary>
static int[] DeserializePrecomputed(string s)
{
    // An empty table serializes to "", which Split turns into one empty token
    // that int.Parse rejects; map it back to an empty table instead.
    if (s.Length == 0)
        return new int[0];

    var vals = Array.ConvertAll(s.Split(','), x => int.Parse(x));
    return vals;
}
#endregion
}
}
--------------------------------------------------------------------------------
/srm/matcher/IMatcher.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Reflection;
using System.Runtime.Serialization;
using System.IO;

namespace Microsoft.SRM
{
    /// <summary>
    /// Provides IsMatch and Matches methods.
    /// </summary>
    public interface IMatcher
    {
        /// <summary>
        /// Returns true iff the input string matches.
        /// </summary>
        /// <param name="input">given input string</param>
        /// <param name="startat">start position in the input, default is 0</param>
        /// <param name="endat">end position in the input, -1 means that the value is unspecified and taken to be input.Length-1</param>
        bool IsMatch(string input, int startat = 0, int endat = -1);

        /// <summary>
        /// Returns all matches as pairs (startindex, length) in the input string.
        /// </summary>
        /// <param name="input">given input string</param>
        /// <param name="limit">as soon as this many matches have been found the search terminates, 0 or negative value means that there is no bound, default is 0</param>
        /// <param name="startat">start position in the input, default is 0</param>
        /// <param name="endat">end position in the input, -1 means that the value is unspecified and taken to be input.Length-1</param>
        List<Match> Matches(string input, int limit = 0, int startat = 0, int endat = -1);
    }
}
--------------------------------------------------------------------------------
/srm/matcher/UTF8Encoding.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Runtime.CompilerServices;

namespace Microsoft.SRM
{
    /// <summary>
    /// Methods for decoding UTF8 encoded strings.
    /// </summary>
    public static class UTF8Encoding
    {
        ///
        /// Decode the next codepoint in the input.
        /// Here input[i] is assumed to be non-ASCII.
        /// The input byte array is assumed to be valid UTF8 encoded Unicode text.
///
/// input: UTF8 encoded Unicode text
/// i: position of the current start byte
/// step: how many bytes were consumed
/// codepoint: computed Unicode codepoint
///
internal static void DecodeNextNonASCII(byte[] input, int i, out int step, out int codepoint)
{
    int b = input[i];
    // (b & 1110.0000 == 1100.0000)
    // so b has the form 110x.xxxx
    // startbyte of two byte encoding
    if ((b & 0xE0) == 0xC0)
    {
        codepoint = ((b & 0x1F) << 6) | (input[i + 1] & 0x3F);
        step = 2;
    }
    // (b & 1111.0000 == 1110.0000)
    // so b has the form 1110.xxxx
    // startbyte of three byte encoding
    else if ((b & 0xF0) == 0xE0)
    {
        codepoint = ((b & 0x0F) << 12) | ((input[i + 1] & 0x3F) << 6) | (input[i + 2] & 0x3F);
        step = 3;
    }
    // (b & 1111.1000 == 1111.0000)
    // so b has the form 1111.0xxx
    // must be startbyte of four byte encoding
    else
    {
        codepoint = ((b & 0x07) << 18) | ((input[i + 1] & 0x3F) << 12) | ((input[i + 2] & 0x3F) << 6) | (input[i + 3] & 0x3F);
        step = 4;
    }
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ushort HighSurrogate(int codepoint)
{
    //given codepoint = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000
    // compute H
    return (ushort)(((codepoint - 0x10000) >> 10) | 0xD800);
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ushort LowSurrogate(int codepoint)
{
    //given codepoint = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000
    //compute L
    var cp = (ushort)(((codepoint - 0x10000) & 0x3FF) | 0xDC00);
    return cp;
}
    }
}
--------------------------------------------------------------------------------
/srm/matcher/VectorizedIndexOf.cs:
--------------------------------------------------------------------------------
using System.Linq;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Collections;
using System.Collections.Generic;
using System;
using System.Linq.Expressions;

namespace Microsoft.SRM
{
    /// <summary>
    /// Index-of searches over char and byte buffers that use System.Numerics vectors
    /// to test one vector-width block of input at a time, followed by an
    /// element-by-element scan of the remaining tail.
    /// </summary>
    public static class VectorizedIndexOf
    {
        static int vecUshortSize = Vector<ushort>.Count;
        static int vecUintSize = Vector<uint>.Count;
        static int vecByteSize = Vector<byte>.Count;

#if UNSAFE

        /// <summary>
        /// Returns the first index at or after <paramref name="start"/> where
        /// <paramref name="toMatch"/> occurs in the buffer of <paramref name="length"/> chars,
        /// or -1. An empty <paramref name="toMatch"/> matches at <paramref name="start"/>.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal unsafe static int UnsafeIndexOf(char* chars, int length, int start, string toMatch)
        {
            if (toMatch.Length == 0)
            {
                return start;
            }
            if (toMatch.Length == 1)
            {
                return UnsafeIndexOf1(chars, length, start, toMatch[0], new Vector<ushort>(toMatch[0]));
            }

            fixed (char* toMatchp = toMatch)
            {
                var first = new Vector<ushort>((ushort)toMatchp[0]);
                int lastOffset = toMatch.Length - 1;
                var last = new Vector<ushort>((ushort)toMatchp[lastOffset]);

                int i = start;
                // Last block start for which both vector reads (at i and at i + lastOffset) stay in bounds.
                int lastVec = length - vecUshortSize - lastOffset;
                for (; i <= lastVec; i += vecUshortSize)
                {
                    var vecFirst = Unsafe.Read<Vector<ushort>>(chars + i);
                    var vecLast = Unsafe.Read<Vector<ushort>>(chars + i + lastOffset);

                    var eqFirst = Vector.Equals(vecFirst, first);
                    var eqLast = Vector.Equals(vecLast, last);

                    // Lanes where both the first and the last pattern char line up are candidates.
                    var mask = Vector.BitwiseAnd(eqFirst, eqLast);

                    if (!Vector.EqualsAll(mask, Vector<ushort>.Zero))
                    {
                        for (int j = 0; j < vecUshortSize; ++j)
                        {
                            if (mask[j] != 0)
                            {
                                var ij = i + j;
                                for (int k = 0; k <= lastOffset; ++k)
                                {
                                    if (chars[ij + k] != toMatchp[k])
                                        goto MATCH_FAIL;
                                }
                                return ij;
                            }
                        MATCH_FAIL:;
                        }
                    }
                }

                // Scalar tail. A match needs toMatch.Length chars starting at i, so the last
                // admissible start is length - toMatch.Length; going further would read
                // chars[i + k] beyond the end of the buffer.
                int lastStart = length - toMatch.Length;
                for (; i <= lastStart; ++i)
                {
                    for (int k = 0; k <= lastOffset; ++k)
                    {
                        if (chars[i + k] != toMatchp[k])
                            goto REMAINDER_MATCH_FAIL;
                    }
                    return i;
                REMAINDER_MATCH_FAIL:;
                }
                return -1;
            }
        }

        /// <summary>
        /// Returns the first index at or after <paramref name="start"/> whose char is accepted by
        /// <paramref name="toMatch"/>, or -1. <paramref name="toMatchVecs"/> are used to skip
        /// blocks in which none of the vectors has a lane equal to the input.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal unsafe static int UnsafeIndexOf(char* chars, int length, int start, BooleanDecisionTree toMatch, Vector<ushort>[] toMatchVecs)
        {
            //System.Diagnostics.Debug.Assert(Vector.IsHardwareAccelerated);
            fixed (bool* toMatch_precomputed = toMatch.precomputed)
            {
                int i = start;
                int lastVec = length - vecUshortSize;
                int toMatch_precomputed_length = toMatch.precomputed.Length;
                int toMatchVecs_Length = toMatchVecs.Length;
                for (; i <= lastVec; i += vecUshortSize)
                {
                    var vec = Unsafe.Read<Vector<ushort>>(chars + i);
                    for (int k = 0; k < toMatchVecs_Length; k++)
                    {
                        var searchVec = toMatchVecs[k];
                        if (Vector.EqualsAny(vec, searchVec))
                        {
                            // Some lane hit: scan the block from its start against the full predicate.
                            for (int j = 0; j < vecUshortSize; ++j)
                            {
                                int ij = i + j;
                                var c = chars[ij];
                                if (c < toMatch_precomputed_length ? toMatch_precomputed[c] : toMatch.bst.Find(c) == 1)
                                    return ij;
                            }
                        }
                    }
                }
                for (; i < length; ++i)
                {
                    if (toMatch.Contains(chars[i])) return i;
                }
                return -1;
            }
        }

        /// <summary>
        /// Single-vector variant of the search above.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal unsafe static int UnsafeIndexOf(char* chars, int length, int start, BooleanDecisionTree toMatch, Vector<ushort> toMatchVec)
        {
            //System.Diagnostics.Debug.Assert(Vector.IsHardwareAccelerated);
            fixed (bool* toMatch_precomputed = toMatch.precomputed)
            {
                int i = start;
                int lastVec = length - vecUshortSize;
                int toMatch_precomputed_length = toMatch.precomputed.Length;
                for (; i <= lastVec; i += vecUshortSize)
                {
                    var vec = Unsafe.Read<Vector<ushort>>(chars + i);
                    var searchVec = toMatchVec;
                    if (Vector.EqualsAny(vec, searchVec))
                    {
                        for (int j = 0; j < vecUshortSize; ++j)
                        {
                            int ij = i + j;
                            var c = chars[ij];
                            if (c < toMatch_precomputed_length ? toMatch_precomputed[c] : toMatch.bst.Find(c) == 1)
                                return ij;
                        }
                    }
                }
                for (; i < length; ++i)
                {
                    if (toMatch.Contains(chars[i])) return i;
                }
                return -1;
            }
        }

        /// <summary>
        /// Returns the first index at or after <paramref name="start"/> whose char equals
        /// <paramref name="toMatch"/>, or -1.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal unsafe static int UnsafeIndexOf1(char* chars, int length, int start, ushort toMatch, Vector<ushort> toMatchVec)
        {
            //System.Diagnostics.Debug.Assert(Vector.IsHardwareAccelerated);
            int i = start;
            int lastVec = length - vecUshortSize;
            for (; i <= lastVec; i += vecUshortSize)
            {
                var vec = Unsafe.Read<Vector<ushort>>(chars + i);
                if (Vector.EqualsAny(vec, toMatchVec))
                {
                    for (int j = 0; j < vecUshortSize; ++j)
                    {
                        int ij = i + j;
                        if (toMatch == chars[ij]) return ij;
                    }
                }
            }
            for (; i < length; ++i)
            {
                if (toMatch == chars[i]) return i;
            }
            return -1;
        }

        /// <summary>
        /// Returns the first index at or after <paramref name="start"/> whose byte is one of
        /// <paramref name="toMatch"/>, or -1.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal unsafe static int UnsafeIndexOfByte(byte[] input, int start, byte[] toMatch)
        {
            var toMatchVecs = toMatch.Select(x => new Vector<byte>(x)).ToArray();
            fixed (byte* bytes = input)
            {
                var length = input.Length;
                int i = start;
                int lastVec = length - vecByteSize;
                for (; i <= lastVec; i += vecByteSize)
                {
                    var vec = Unsafe.Read<Vector<byte>>(bytes + i);
                    foreach (var searchVec in toMatchVecs)
                    {
                        if (Vector.EqualsAny(vec, searchVec))
                        {
                            // The block holds vecByteSize bytes, so the scan must cover all of
                            // them; bounding it by the ushort lane count would skip the upper
                            // part of the block and lose matches located there.
                            for (int j = 0; j < vecByteSize; ++j)
                            {
                                if (toMatch.Contains(input[i + j])) return i + j;
                            }
                        }
                    }
                }
                for (; i < input.Length; ++i)
                {
                    if (toMatch.Contains(input[i])) return i;
                }
                return -1;
            }
        }

        /// <summary>
        /// Returns the first index at or after <paramref name="i"/> whose byte equals
        /// <paramref name="toMatch"/>, or -1.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal unsafe static int UnsafeIndexOfByte(byte[] input, int i, byte toMatch, Vector<byte> toMatchVec)
        {
            var length = input.Length;
            int lastVec = length - vecByteSize;
            fixed (byte* bytes = input)
            {
                for (; i <= lastVec; i += vecByteSize)
                {
                    var vec = Unsafe.Read<Vector<byte>>(bytes + i);
                    if (Vector.EqualsAny(vec, toMatchVec))
                    {
                        return Array.IndexOf(input, toMatch, i);
                    }
                }
                return Array.IndexOf(input, toMatch, i);
            }
        }

#endif

        /// <summary>
        /// Returns the first index at or after <paramref name="i"/> whose byte equals
        /// <paramref name="toMatch"/>, or -1. <paramref name="toMatchVec"/> is used to skip
        /// whole blocks that have no lane equal to it.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int IndexOfByte(byte[] input, int i, byte toMatch, Vector<byte> toMatchVec)
        {
            int lastVec = input.Length - vecByteSize;
            while (i <= lastVec && !Vector.EqualsAny(new Vector<byte>(input, i), toMatchVec))
                i += vecByteSize;
            return Array.IndexOf(input, toMatch, i);
        }

        /// <summary>
        /// Returns the first index at or after <paramref name="i"/> where the byte sequence
        /// <paramref name="seqToMatch"/> occurs in <paramref name="input"/>, or -1.
        /// <paramref name="firstToMatchVec"/> is used to skip whole blocks that have no lane
        /// equal to it.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int IndexOfByteSeq(byte[] input, int i, byte[] seqToMatch, Vector<byte> firstToMatchVec)
        {
            int length = input.Length;
            int lastVec = length - vecByteSize;
            byte firstToMatch = seqToMatch[0];
            int seqToMatch_length = seqToMatch.Length;
            // Last position at which the whole sequence still fits into the input.
            int lastStart = length - seqToMatch_length;

            while (i <= lastStart)
            {
                // While a full block is available, skip it outright when no lane hits.
                if (i <= lastVec && !Vector.EqualsAny(new Vector<byte>(input, i), firstToMatchVec))
                {
                    i += vecByteSize;
                    continue;
                }

                i = Array.IndexOf(input, firstToMatch, i);
                // No further candidate, or the candidate is too close to the end to fit.
                if (i < 0 || i > lastStart)
                    return -1;

                int j = 1;
                while (j < seqToMatch_length && input[i + j] == seqToMatch[j])
                    j += 1;
                if (j == seqToMatch_length)
                    return i;

                // Mismatch: keep searching from the next position, so that every candidate
                // in the tail is examined and not only the first one.
                i += 1;
            }
            return -1;
        }
    }
}
--------------------------------------------------------------------------------
/srm/parser/RegexBoyerMoore.cs: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Copyright (c) Microsoft Corporation. All rights reserved. 4 | // 5 | //------------------------------------------------------------------------------ 6 | 7 | // The RegexBoyerMoore object precomputes the Boyer-Moore 8 | // tables for fast string scanning. These tables allow 9 | // you to scan for the first occurance of a string within 10 | // a large body of text without examining every character. 11 | // The performance of the heuristic depends on the actual 12 | // string and the text being searched, but usually, the longer 13 | // the string that is being searched for, the fewer characters 14 | // need to be examined. 15 | 16 | namespace System.Text.RegularExpressions 17 | { 18 | 19 | using System.Collections; 20 | using System.Diagnostics; 21 | using System.Globalization; 22 | 23 | internal sealed class RegexBoyerMoore { 24 | internal int[] _positive; 25 | internal int[] _negativeASCII; 26 | internal int[][] _negativeUnicode; 27 | internal String _pattern; 28 | internal int _lowASCII; 29 | internal int _highASCII; 30 | internal bool _rightToLeft; 31 | internal bool _caseInsensitive; 32 | internal CultureInfo _culture; 33 | 34 | internal const int infinite = 0x7FFFFFFF; 35 | 36 | /* 37 | * Constructs a Boyer-Moore state machine for searching for the string 38 | * pattern. The string must not be zero-length. 39 | */ 40 | internal RegexBoyerMoore(String pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture) { 41 | /* 42 | * Sorry, you just can't use Boyer-Moore to find an empty pattern. 43 | * We're doing this for your own protection. (Really, for speed.) 44 | */ 45 | Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. 
This is bad for perf"); 46 | 47 | int beforefirst; 48 | int last; 49 | int bump; 50 | int examine; 51 | int scan; 52 | int match; 53 | char ch; 54 | 55 | // We do the ToLower character by character for consistency. With surrogate chars, doing 56 | // a ToLower on the entire string could actually change the surrogate pair. This is more correct 57 | // linguistically, but since Regex doesn't support surrogates, it's more important to be 58 | // consistent. 59 | if (caseInsensitive) { 60 | StringBuilder sb = new StringBuilder(pattern.Length); 61 | for (int i=0; i This algorithm appears to be a simplified variant of the 90 | * standard Boyer-Moore good suffix calculation. It could 91 | * be one of D.M. Sunday's variations, but I have not found which one. 92 | * 93 | * Maybe someday rewrite this with the real Boyer-Moore algorithm and split it 94 | * out into a separate piece of code in the BCL. 95 | * 96 | */ 97 | _positive = new int[pattern.Length]; 98 | 99 | examine = last; 100 | ch = pattern[examine]; 101 | _positive[examine] = bump; 102 | examine -= bump; 103 | 104 | for (;;) { 105 | // find an internal char (examine) that matches the tail 106 | 107 | for (;;) { 108 | if (examine == beforefirst) 109 | goto OuterloopBreak; 110 | if (pattern[examine] == ch) 111 | break; 112 | examine -= bump; 113 | } 114 | 115 | match = last; 116 | scan = examine; 117 | 118 | // find the length of the match 119 | 120 | for (;;) { 121 | if (scan == beforefirst || pattern[match] != pattern[scan]) { 122 | // at the end of the match, note the difference in _positive 123 | // this is not the length of the match, but the distance from the internal match 124 | // to the tail suffix. 
125 | if (_positive[match] == 0) 126 | _positive[match] = match - scan; 127 | 128 | // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan)); 129 | 130 | break; 131 | } 132 | 133 | scan -= bump; 134 | match -= bump; 135 | } 136 | 137 | examine -= bump; 138 | } 139 | 140 | OuterloopBreak: 141 | 142 | match = last - bump; 143 | 144 | // scan for the chars for which there are no shifts that yield a different candidate 145 | 146 | /* 147 | * The inside of the if statement used to say 148 | * "_positive[match] = last - beforefirst;" 149 | * I've changed it to the below code. This 150 | * is slightly less agressive in how much we skip, but at worst it 151 | * should mean a little more work rather than skipping a potential 152 | * match. 153 | * 154 | */ 155 | while (match != beforefirst) { 156 | if (_positive[match] == 0) 157 | _positive[match] = bump; 158 | 159 | match -= bump; 160 | } 161 | 162 | //System.Diagnostics.Debug.WriteLine("good suffix shift table:"); 163 | //for (int i=0; i<_positive.Length; i++) 164 | // System.Diagnostics.Debug.WriteLine("\t_positive[" + i + "] = " + _positive[i]); 165 | 166 | 167 | /* 168 | * PART II - the bad-character shift table 169 | * 170 | * compute the negative requirement: 171 | * if char "ch" is the reject character when testing position "i", 172 | * we can slide up by _negative[ch]; 173 | * (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch)) 174 | * 175 | * the lookup table is divided into ASCII and Unicode portions; 176 | * only those parts of the Unicode 16-bit code set that actually 177 | * appear in the string are in the table. (Maximum size with 178 | * Unicode is 65K; ASCII only case is 512 bytes.) 
179 | */ 180 | 181 | _negativeASCII = new int[128]; 182 | 183 | for (int i = 0; i < 128; i++) 184 | _negativeASCII[i] = last - beforefirst; 185 | 186 | _lowASCII = 127; 187 | _highASCII = 0; 188 | 189 | for (examine = last; examine != beforefirst; examine -= bump) { 190 | ch = pattern[examine]; 191 | 192 | if (ch < 128) { 193 | if (_lowASCII > ch) 194 | _lowASCII = ch; 195 | 196 | if (_highASCII < ch) 197 | _highASCII = ch; 198 | 199 | if (_negativeASCII[ch] == last - beforefirst) 200 | _negativeASCII[ch] = last - examine; 201 | } 202 | else { 203 | int i = ch >> 8; 204 | int j = ch & 0xFF; 205 | 206 | if (_negativeUnicode == null) { 207 | _negativeUnicode = new int[256][]; 208 | } 209 | 210 | if (_negativeUnicode[i] == null) { 211 | int[] newarray = new int[256]; 212 | 213 | for (int k = 0; k < 256; k++) 214 | newarray[k] = last - beforefirst; 215 | 216 | if (i == 0) { 217 | System.Array.Copy(_negativeASCII, newarray, 128); 218 | _negativeASCII = newarray; 219 | } 220 | 221 | _negativeUnicode[i] = newarray; 222 | } 223 | 224 | if (_negativeUnicode[i][j] == last - beforefirst) 225 | _negativeUnicode[i][j] = last - examine; 226 | } 227 | } 228 | } 229 | 230 | private bool MatchPattern(string text, int index) { 231 | if (_caseInsensitive) { 232 | if( text.Length - index < _pattern.Length) { 233 | return false; 234 | } 235 | 236 | TextInfo textinfo = _culture.TextInfo; 237 | for( int i = 0; i < _pattern.Length; i++) { 238 | Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!"); 239 | if( textinfo.ToLower(text[index + i]) != _pattern[i]) { 240 | return false; 241 | } 242 | } 243 | return true; 244 | } 245 | else { 246 | return(0 == String.CompareOrdinal(_pattern, 0, text, index, _pattern.Length)); 247 | } 248 | } 249 | 250 | /* 251 | * When a regex is anchored, we can do a quick IsMatch test instead of a Scan 252 | */ 253 | internal bool IsMatch(String text, int index, int beglimit, int endlimit) { 254 | 
255 | if (!_rightToLeft) { 256 | if (index < beglimit || endlimit - index < _pattern.Length) 257 | return false; 258 | 259 | return MatchPattern(text, index); 260 | } 261 | else { 262 | if (index > endlimit || index - beglimit < _pattern.Length) 263 | return false; 264 | 265 | return MatchPattern(text, index - _pattern.Length); 266 | } 267 | } 268 | 269 | 270 | /* 271 | * Scan uses the Boyer-Moore algorithm to find the first occurrance 272 | * of the specified string within text, beginning at index, and 273 | * constrained within beglimit and endlimit. 274 | * 275 | * The direction and case-sensitivity of the match is determined 276 | * by the arguments to the RegexBoyerMoore constructor. 277 | */ 278 | internal int Scan(String text, int index, int beglimit, int endlimit) { 279 | int test; 280 | int test2; 281 | int match; 282 | int startmatch; 283 | int endmatch; 284 | int advance; 285 | int defadv; 286 | int bump; 287 | char chMatch; 288 | char chTest; 289 | int[] unicodeLookup; 290 | 291 | if (!_rightToLeft) { 292 | defadv = _pattern.Length; 293 | startmatch = _pattern.Length - 1; 294 | endmatch = 0; 295 | test = index + defadv - 1; 296 | bump = 1; 297 | } 298 | else { 299 | defadv = -_pattern.Length; 300 | startmatch = 0; 301 | endmatch = -defadv - 1; 302 | test = index + defadv; 303 | bump = -1; 304 | } 305 | 306 | chMatch = _pattern[startmatch]; 307 | 308 | for (;;) { 309 | if (test >= endlimit || test < beglimit) 310 | return -1; 311 | 312 | chTest = text[test]; 313 | 314 | if (_caseInsensitive) 315 | chTest = Char.ToLower(chTest, _culture); 316 | 317 | if (chTest != chMatch) { 318 | if (chTest < 128) 319 | advance = _negativeASCII[chTest]; 320 | else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) 321 | advance = unicodeLookup[chTest & 0xFF]; 322 | else 323 | advance = defadv; 324 | 325 | test += advance; 326 | } 327 | else { // if (chTest == chMatch) 328 | test2 = test; 329 | match = startmatch; 330 | 331 | for 
(;;) { 332 | if (match == endmatch) 333 | return(_rightToLeft ? test2 + 1 : test2); 334 | 335 | match -= bump; 336 | test2 -= bump; 337 | 338 | chTest = text[test2]; 339 | 340 | if (_caseInsensitive) 341 | chTest = Char.ToLower(chTest, _culture); 342 | 343 | if (chTest != _pattern[match]) { 344 | advance = _positive[match]; 345 | if ((chTest & 0xFF80) == 0) 346 | test2 = (match - startmatch) + _negativeASCII[chTest]; 347 | else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) 348 | test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF]; 349 | else { 350 | test += advance; 351 | break; 352 | } 353 | 354 | if (_rightToLeft ? test2 < advance : test2 > advance) 355 | advance = test2; 356 | 357 | test += advance; 358 | break; 359 | } 360 | } 361 | } 362 | } 363 | } 364 | 365 | /* 366 | * Used when dumping for debugging. 367 | */ 368 | public override String ToString() { 369 | return _pattern; 370 | } 371 | 372 | #if DBG 373 | public String Dump(String indent) { 374 | StringBuilder sb = new StringBuilder(); 375 | 376 | sb.Append(indent + "BM Pattern: " + _pattern + "\n"); 377 | sb.Append(indent + "Positive: "); 378 | for (int i = 0; i < _positive.Length; i++) { 379 | sb.Append(_positive[i].ToString(CultureInfo.InvariantCulture) + " "); 380 | } 381 | sb.Append("\n"); 382 | 383 | if (_negativeASCII != null) { 384 | sb.Append(indent + "Negative table\n"); 385 | for (int i = 0; i < _negativeASCII.Length; i++) { 386 | if (_negativeASCII[i] != _pattern.Length) { 387 | sb.Append(indent + " " + Regex.Escape(Convert.ToString((char)i, CultureInfo.InvariantCulture)) + " " + _negativeASCII[i].ToString(CultureInfo.InvariantCulture) + "\n"); 388 | } 389 | } 390 | } 391 | 392 | return sb.ToString(); 393 | } 394 | #endif 395 | } 396 | } 397 | -------------------------------------------------------------------------------- /srm/parser/RegexReplacement.cs: -------------------------------------------------------------------------------- 1 | 
//------------------------------------------------------------------------------
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
//------------------------------------------------------------------------------

// The RegexReplacement class represents a substitution string for
// use when using regexs to search/replace, etc. It's logically
// a sequence of intermixed (1) constant strings and (2) group numbers.

namespace System.Text.RegularExpressions {

    using System.Collections;
    using System.Collections.Generic;

    internal sealed class RegexReplacement {
        /*
         * Since RegexReplacement shares the same parser as Regex,
         * the constructor takes a RegexNode which is a concatenation
         * of constant strings and backreferences.
         */
#if SILVERLIGHT
        internal RegexReplacement(String rep, RegexNode concat, Dictionary<Int32, Int32> _caps) {
#else
        internal RegexReplacement(String rep, RegexNode concat, Hashtable _caps) {
#endif
            _rep = rep;

            if (concat.Type() != RegexNode.Concatenate)
                throw new ArgumentException(SR.GetString(SR.ReplacementError));

            StringBuilder literal = new StringBuilder();
            List<String> strings = new List<String>();
            List<Int32> rules = new List<Int32>();

            int childCount = concat.ChildCount();
            for (int index = 0; index < childCount; index++) {
                RegexNode child = concat.Child(index);
                int kind = child.Type();

                if (kind == RegexNode.Multi) {
                    literal.Append(child._str);
                }
                else if (kind == RegexNode.One) {
                    literal.Append(child._ch);
                }
                else if (kind == RegexNode.Ref) {
                    // pending literal text becomes its own string rule before the group rule
                    FlushLiteral(literal, strings, rules);

                    int slot = child._m;
                    if (_caps != null && slot >= 0)
                        slot = (int)_caps[slot];

                    // group rules are negative and sit below the special-pattern constants
                    rules.Add(-Specials - 1 - slot);
                }
                else {
                    throw new ArgumentException(SR.GetString(SR.ReplacementError));
                }
            }

            FlushLiteral(literal, strings, rules);

            _strings = strings;
            _rules = rules;
        }

        /*
         * If any literal text has been collected, records it as the next string
         * constant together with a rule pointing at it, and clears the buffer.
         */
        private static void FlushLiteral(StringBuilder literal, List<String> strings, List<Int32> rules) {
            if (literal.Length == 0)
                return;

            rules.Add(strings.Count);
            strings.Add(literal.ToString());
            literal.Length = 0;
        }

        internal String _rep;
        internal List<String> _strings; // table of string constants
        internal List<Int32> _rules; // negative -> group #, positive -> string #

        // constants for special insertion patterns

        internal const int Specials = 4;
        internal const int LeftPortion = -1;
        internal const int RightPortion = -2;
        internal const int LastGroup = -3;
        internal const int WholeString = -4;

        /*
         * The original pattern string
         */
        internal String Pattern {
            get {
                return _rep;
            }
        }
    }

}
--------------------------------------------------------------------------------
/srm/parser/RegexTree.cs:
--------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
//------------------------------------------------------------------------------

// RegexTree is just a wrapper for a node tree with some
// global information attached.
9 | 10 | namespace System.Text.RegularExpressions { 11 | 12 | using System.Collections; 13 | using System.Collections.Generic; 14 | 15 | // PIETER: made public instead of internal for 16 | // direct access from Automata.Tests 17 | public sealed class RegexTree { 18 | #if SILVERLIGHT 19 | internal RegexTree(RegexNode root, Dictionary caps, Int32[] capnumlist, int captop, Dictionary capnames, String[] capslist, RegexOptions opts) 20 | #else 21 | internal RegexTree(RegexNode root, Hashtable caps, Int32[] capnumlist, int captop, Hashtable capnames, String[] capslist, RegexOptions opts) 22 | #endif 23 | 24 | { 25 | _root = root; 26 | _caps = caps; 27 | _capnumlist = capnumlist; 28 | _capnames = capnames; 29 | _capslist = capslist; 30 | _captop = captop; 31 | _options = opts; 32 | } 33 | 34 | internal RegexNode _root; 35 | #if SILVERLIGHT 36 | internal Dictionary _caps; 37 | #else 38 | internal Hashtable _caps; 39 | #endif 40 | internal Int32[] _capnumlist; 41 | #if SILVERLIGHT 42 | internal Dictionary _capnames; 43 | #else 44 | internal Hashtable _capnames; 45 | #endif 46 | internal String[] _capslist; 47 | internal RegexOptions _options; 48 | internal int _captop; 49 | 50 | #if DBG 51 | internal void Dump() { 52 | _root.Dump(); 53 | } 54 | 55 | internal bool Debug { 56 | get { 57 | return(_options & RegexOptions.Debug) != 0; 58 | } 59 | } 60 | #endif 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /srm/parser/SR.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace System.Text.RegularExpressions 6 | { 7 | static class SR 8 | { 9 | public static string GetString(string s, params object[] o) 10 | { 11 | return s; 12 | } 13 | 14 | public const string ReplacementError = "ReplacementError"; 15 | public const string UnexpectedOpcode = "UnexpectedOpcode"; 16 | public const string TooManyParens = 
"TooManyParens"; 17 | public const string NestedQuantify = "NestedQuantify"; 18 | public const string QuantifyAfterNothing = "QuantifyAfterNothing"; 19 | public const string InternalError = "InternalError"; 20 | public const string IllegalRange = "IllegalRange"; 21 | public const string NotEnoughParens = "NotEnoughParens"; 22 | public const string BadClassInCharRange = "BadClassInCharRange"; 23 | public const string SubtractionMustBeLast = "SubtractionMustBeLast"; 24 | public const string ReversedCharRange = "ReversedCharRange"; 25 | public const string UnterminatedBracket = "UnterminatedBracket"; 26 | public const string InvalidGroupName = "InvalidGroupName"; 27 | public const string CapnumNotZero = "CapnumNotZero"; 28 | public const string UndefinedBackref = "UndefinedBackref"; 29 | public const string MalformedReference = "MalformedReference"; 30 | public const string AlternationCantHaveComment = "AlternationCantHaveComment"; 31 | public const string AlternationCantCapture = "AlternationCantCapture"; 32 | public const string UnrecognizedGrouping = "UnrecognizedGrouping"; 33 | public const string IllegalEndEscape = "IllegalEndEscape"; 34 | public const string CaptureGroupOutOfRange = "CaptureGroupOutOfRange"; 35 | public const string TooFewHex = "TooFewHex"; 36 | public const string MissingControl = "MissingControl"; 37 | public const string UnrecognizedControl = "UnrecognizedControl"; 38 | public const string UnrecognizedEscape = "UnrecognizedEscape"; 39 | public const string IncompleteSlashP = "IncompleteSlashP"; 40 | public const string MalformedSlashP = "MalformedSlashP"; 41 | public const string IllegalCondition = "IllegalCondition"; 42 | public const string TooManyAlternates = "TooManyAlternates"; 43 | public const string MakeException = "MakeException"; 44 | public const string UndefinedNameRef = "UndefinedNameRef"; 45 | public const string UndefinedReference = "UndefinedReference"; 46 | public const string UnterminatedComment = "UnterminatedComment"; 47 | 
// Tail of the SR string-constant class (srm/parser/SR.cs); carried over unchanged.
        public const string MalformedNameRef = "MalformedNameRef";
        public const string UnknownProperty = "UnknownProperty";
    }
}

using System.Text;

namespace Microsoft.SRM
{
    /// <summary>
    /// Renders a character-set predicate (BDD) as regex character-class syntax.
    /// </summary>
    internal static class RegexCharSetPrinter
    {
        // Index of DecimalDigitNumber (Nd) in the category table; it is what \d denotes here.
        private const int DecimalDigitCategory = 8;

        /// <summary>
        /// Produces a regex fragment that denotes <paramref name="label"/>.
        /// Well-known classes are tried first (\d, \D, \w, \W, \s, \S, then a single
        /// Unicode category); anything else is printed as a single escaped character
        /// or as a bracket expression built from the set's ranges.
        /// </summary>
        /// <param name="label">character set to print</param>
        /// <param name="categorizer">source of the predefined category predicates</param>
        /// <param name="solver">solver used for complementing and range extraction</param>
        /// <returns>regex syntax for the set</returns>
        internal static string ToRegexCharSet(BDD label, IUnicodeCategoryTheory<BDD> categorizer, CharSetSolver solver)
        {
            if (categorizer.CategoryCondition(DecimalDigitCategory) == label)
                return @"\d";
            if (solver.MkNot(categorizer.CategoryCondition(DecimalDigitCategory)) == label)
                return @"\D";
            if (categorizer.WordLetterCondition == label)
                return @"\w";
            if (solver.MkNot(categorizer.WordLetterCondition) == label)
                return @"\W";
            if (categorizer.WhiteSpaceCondition == label)
                return @"\s";
            if (solver.MkNot(categorizer.WhiteSpaceCondition) == label)
                return @"\S";

            string[] abbreviations = categorizer.UnicodeCategoryStandardAbbreviations;
            for (int i = 0; i < abbreviations.Length; i++)
            {
                if (categorizer.CategoryCondition(i) == label)
                {
                    // Lower-case \p is the positive class. The previous \P{..} denotes the
                    // complement of the category, i.e. the opposite of the set being printed.
                    return @"\p{" + abbreviations[i] + "}";
                }
            }

            var ranges = solver.ToRanges(label);
            if (ranges.Length == 1 && ranges[0].Item1 == ranges[0].Item2)
            {
                return StringUtility.Escape((char)ranges[0].Item1);
            }

            var res = new StringBuilder("[");
            foreach (var range in ranges)
            {
                res.Append(StringUtility.EscapeWithNumericSpace((char)range.Item1));
                if (range.Item1 == range.Item2)
                    continue; // singleton: the lower bound is the whole range

                // Two adjacent characters are listed side by side; longer ranges use lo-hi.
                if (range.Item1 != range.Item2 - 1)
                    res.Append("-");
                res.Append(StringUtility.EscapeWithNumericSpace((char)range.Item2));
            }
            res.Append("]");
            return res.ToString();
        }
    }
}
using System;
using System.Collections.Generic;
using System.Text;
using System.Globalization;
using System.IO;

namespace Microsoft.SRM
{
    /// <summary>
    /// Generates the C# source file holding the serialized ignore-case relation.
    /// </summary>
    public static class IgnoreCaseRelationGenerator
    {
        /// <summary>
        /// Writes the file <paramref name="classname"/>.cs into <paramref name="path"/>.
        /// An existing file is overwritten, even if it is marked read-only.
        /// </summary>
        /// <param name="namespacename">namespace of the generated class</param>
        /// <param name="classname">name of the generated class (also the file name)</param>
        /// <param name="path">target directory; "" means the current directory</param>
        /// <exception cref="ArgumentNullException">classname or path is null</exception>
        public static void Generate(string namespacename, string classname, string path)
        {
            if (classname == null)
                throw new ArgumentNullException(nameof(classname));
            if (path == null)
                throw new ArgumentNullException(nameof(path));

            if (path != "" && !path.EndsWith("/", StringComparison.Ordinal))
                path = path + "/";

            string version = System.Environment.Version.ToString();

            string prefix = @"/// <summary>
/// Automatically generated by IgnoreCaseRelationGenerator for System.Environment.Version = " + version + @"
/// </summary>
namespace " + namespacename + @"
{
internal static class " + classname + @"
{";

            string suffix = @"}
}
";
            string fileName = string.Format("{1}{0}.cs", classname, path);
            FileInfo fi = new FileInfo(fileName);
            if (fi.Exists)
                fi.IsReadOnly = false;

            // 'using' guarantees the file handle is released even when table generation throws.
            using (StreamWriter sw = new StreamWriter(fileName))
            {
                sw.WriteLine(prefix);

                CreateUlongArray(sw);
                //CreateStringArray(sw);

                sw.WriteLine(suffix);
            }
        }

        /// <summary>
        /// Emits the relation as a serialized BDD: each pair is encoded as
        /// ((character shifted left by 16 bits) AND its equivalence class).
        /// </summary>
        private static void CreateUlongArray(StreamWriter sw)
        {
            sw.WriteLine("/// <summary>");
            sw.WriteLine("/// Serialized BDD for mapping characters to their case-ignoring equivalence classes.");
            sw.WriteLine("/// </summary>");
            sw.WriteLine("public static ulong[] ignorecase = new ulong[]{");
            CharSetSolver solver = new CharSetSolver();

            Dictionary<char, BDD> ignoreCase = ComputeIgnoreCaseDictionary(solver);

            BDD ignorecase = solver.False;
            foreach (var kv in ignoreCase)
            {
                var a = solver.MkCharSetFromRange(kv.Key, kv.Key);
                var b = kv.Value;
                ignorecase = ignorecase | ((a << 16) & b);
            }
            var ignorecaseArray = solver.Serialize(ignorecase);
            for (int i = 0; i < ignorecaseArray.Length; i++)
                sw.WriteLine("0x{0:X16},", ignorecaseArray[i]);

            sw.WriteLine("};"); //end of array
        }

        /// <summary>
        /// Maps every character that has case variants to the set of characters
        /// it is equivalent to when case is ignored.
        /// </summary>
        private static Dictionary<char, BDD> ComputeIgnoreCaseDictionary(CharSetSolver solver)
        {
            var ignoreCase = new Dictionary<char, BDD>();
            for (uint i = 0; i <= 0xFFFF; i++)
            {
                char c = (char)i;
                char cU = char.ToUpper(c);
                char cL = char.ToLower(c);
                if (c == cU && c == cL)
                    continue; // no case variants

                //make sure that the regex engine considers c as being equivalent to cU and cL, else ignore c
                //in some cases c != cU but the regex engine does not consider the characters equivalent wrt the ignore-case option.
                //These characters are:
                //c=\xB5,cU=\u039C
                //c=\u0131,cU=I
                //c=\u017F,cU=S
                //c=\u0345,cU=\u0399
                //c=\u03C2,cU=\u03A3
                //c=\u03D0,cU=\u0392
                //c=\u03D1,cU=\u0398
                //c=\u03D5,cU=\u03A6
                //c=\u03D6,cU=\u03A0
                //c=\u03F0,cU=\u039A
                //c=\u03F1,cU=\u03A1
                //c=\u03F5,cU=\u0395
                //c=\u1E9B,cU=\u1E60
                //c=\u1FBE,cU=\u0399
                if (!System.Text.RegularExpressions.Regex.IsMatch(cU.ToString() + cL.ToString(), "^(?i:" + StringUtility.Escape(c) + ")+$"))
                    continue;

                // Merge the classes already recorded for any of the three characters.
                BDD equiv = solver.False;
                BDD known;
                if (ignoreCase.TryGetValue(c, out known))
                    equiv = equiv | known;
                if (ignoreCase.TryGetValue(cU, out known))
                    equiv = equiv | known;
                if (ignoreCase.TryGetValue(cL, out known))
                    equiv = equiv | known;

                equiv = equiv | solver.MkCharSetFromRange(c, c) | solver.MkCharSetFromRange(cU, cU) | solver.MkCharSetFromRange(cL, cL);

                // Every member of the merged class must point at the merged class.
                foreach (char d in solver.GenerateAllCharacters(equiv))
                    ignoreCase[d] = equiv;
            }
            return ignoreCase;
        }

        /// <summary>
        /// Alternative output format: one string per equivalence class.
        /// Not called by <see cref="Generate"/> (the call there is commented out).
        /// </summary>
        private static void CreateStringArray(StreamWriter sw)
        {
            sw.WriteLine("/// <summary>");
            sw.WriteLine("/// Each string corresponds to an equivalence class of characters when case is ignored.");
            sw.WriteLine("/// </summary>");
            sw.WriteLine("public static string[] ignorecase = new string[]{");
            CharSetSolver solver = new CharSetSolver();

            Dictionary<char, BDD> ignoreCase = ComputeIgnoreCaseDictionary(solver);

            HashSet<BDD> done = new HashSet<BDD>();
            foreach (var kv in ignoreCase)
            {
                if (!done.Add(kv.Value))
                    continue; // class already written

                var ranges = solver.ToRanges(kv.Value);
                List<char> s = new List<char>();
                for (int i = 0; i < ranges.Length; i++)
                {
                    var l = (int)ranges[i].Item1;
                    var h = (int)ranges[i].Item2;
                    for (int j = l; j <= h; j++)
                        s.Add((char)j);
                }
                var str = StringUtility.Escape(new String(s.ToArray()));
                sw.WriteLine(@"{0},", str);
            }
            sw.WriteLine("};"); //end of array
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Microsoft.SRM
{
    /// <summary>
    /// Closes character sets under case: built from the generated ignore-case relation.
    /// </summary>
    internal class IgnoreCaseTransformer
    {
        BDD IgnoreCaseRel;
        BDD domain;
        CharSetSolver solver;

        /// <summary>
        /// Deserializes the generated relation and derives its domain by
        /// shifting the relation right by 16 bits.
        /// </summary>
        public IgnoreCaseTransformer(CharSetSolver charSetSolver)
        {
            solver = charSetSolver;
            IgnoreCaseRel = charSetSolver.Deserialize(Microsoft.SRM.Generated.IgnoreCaseRelation.ignorecase);
            domain = IgnoreCaseRel >> 16;
        }

        /// <summary>
        /// For all letters in the bdd add their lower and upper case equivalents.
        /// </summary>
        /// <returns>
        /// the input itself when it shares nothing with the relation's domain,
        /// otherwise the input united with its case-equivalent characters
        /// </returns>
        public BDD Apply(BDD bdd)
        {
            bool untouched = (domain & bdd).IsEmpty;
            if (untouched)
                return bdd;

            BDD equivalents = (bdd & IgnoreCaseRel) >> 16;
            return equivalents | bdd;
        }

        /// <summary>
        /// True iff <paramref name="c"/> lies in the domain of the ignore-case relation.
        /// </summary>
        public bool IsInDomain(char c)
        {
            BDD single = solver.MkCharConstraint(c);
            return !(single & domain).IsEmpty;
        }
    }
}
using System;
using System.Collections.Generic;
using System.Text;
using System.Globalization;
using System.IO;

namespace Microsoft.SRM
{
    /// <summary>
    /// Utility for generating unicode category ranges and corresponding binary decision diagrams
    /// </summary>
    public static class UnicodeCategoryRangesGenerator
    {
        // Number of UnicodeCategory codes written out (codes 0 through 29).
        private const int CategoryCount = 30;

        /// <summary>
        /// Create a file classname.cs in the directory path.
        /// The file contains the static class with name classname and has namespace namespacename.
        /// The class has static fields that map unicode categories to their character ranges
        /// and provide whitespace ranges.
        /// The fields are for the ASCII case (7 bits), CP437 case (8 bits) and for UTF16 (16 bits).
        /// Overwrites an existing file, even if the existing file is write protected.
        /// </summary>
        /// <param name="namespacename">namespace for the class</param>
        /// <param name="classname">name of the class</param>
        /// <param name="path">path where the file is written</param>
        /// <exception cref="ArgumentNullException">classname or path is null</exception>
        public static void Generate(string namespacename, string classname, string path)
        {
            if (classname == null)
                throw new ArgumentNullException(nameof(classname));
            if (path == null)
                throw new ArgumentNullException(nameof(path));

            if (path != "" && !path.EndsWith("/", StringComparison.Ordinal))
                path = path + "/";

            string version = System.Environment.Version.ToString();

            string prefix = @"/// <summary>
/// Automatically generated by UnicodeCategoryRangesGenerator for System.Environment.Version = " + version + @"
/// </summary>
namespace " + namespacename + @"
{
internal static class " + classname + @"
{";

            string suffix = @"}
}
";
            string fileName = string.Format("{1}{0}.cs", classname, path);
            FileInfo fi = new FileInfo(fileName);
            if (fi.Exists)
                fi.IsReadOnly = false;

            // 'using' guarantees the file handle is released even when table generation throws.
            using (StreamWriter sw = new StreamWriter(fileName))
            {
                sw.WriteLine(prefix);

                sw.WriteLine("#region ASCII");
                WriteRangeFields(BitWidth.BV7, sw, "ASCII");
                sw.WriteLine("#endregion");
                sw.WriteLine();

                sw.WriteLine("#region CP437");
                WriteRangeFields(BitWidth.BV8, sw, "CP437");
                sw.WriteLine("#endregion");
                sw.WriteLine();

                sw.WriteLine("#region Unicode (UTF16)");
                WriteRangeFields(BitWidth.BV16, sw, "Unicode");
                sw.WriteLine("#endregion");
                sw.WriteLine();

                sw.WriteLine(suffix);
            }
        }

        /// <summary>
        /// Writes the fields for one encoding. Plain range tables are only written
        /// for the 7-bit encoding; BDD encodings are written for every encoding.
        /// </summary>
        /// <param name="encoding">bit width; its integer value is used as the number of bits</param>
        /// <param name="sw">destination writer</param>
        /// <param name="field">prefix of the generated field names</param>
        private static void WriteRangeFields(BitWidth encoding, StreamWriter sw, string field)
        {
            int bits = (int)encoding;
            int maxChar = (1 << bits) - 1;
            var catMap = new Dictionary<UnicodeCategory, Ranges>();
            for (int c = 0; c < CategoryCount; c++)
                catMap[(UnicodeCategory)c] = new Ranges();
            Ranges whitespace = new Ranges();
            Ranges wordcharacter = new Ranges();
            for (int i = 0; i <= maxChar; i++)
            {
                char ch = (char)i;
                if (char.IsWhiteSpace(ch))
                    whitespace.Add(i);
                UnicodeCategory cat = char.GetUnicodeCategory(ch);
                catMap[cat].Add(i);
                int catCode = (int)cat;
                // The word-character range table is only emitted for the 7-bit case (see below).
                if (bits == 7)
                    if (catCode == 0 || catCode == 1 || catCode == 2 || catCode == 3 || catCode == 4 || catCode == 5 || catCode == 8 || catCode == 18)
                        wordcharacter.Add(i);
            }
            //generate bdd reprs for each of the category ranges
            BDD[] catBDDs = new BDD[CategoryCount];
            CharSetSolver bddb = new CharSetSolver(encoding);
            for (int c = 0; c < CategoryCount; c++)
                catBDDs[c] = bddb.MkBddForIntRanges(catMap[(UnicodeCategory)c].ranges);

            BDD whitespaceBdd = bddb.MkBddForIntRanges(whitespace.ranges);

            //word characters: union of categories 0,1,2,3,4,5,8,18
            //(category 5 is included here, same as in the range table above)
            BDD wordCharBdd = bddb.MkOr(catBDDs[0],
                              bddb.MkOr(catBDDs[1],
                              bddb.MkOr(catBDDs[2],
                              bddb.MkOr(catBDDs[3],
                              bddb.MkOr(catBDDs[4],
                              bddb.MkOr(catBDDs[5],
                              bddb.MkOr(catBDDs[8], catBDDs[18])))))));
            if (bits == 7)
            {
                sw.WriteLine(@"/// <summary>
/// Array of 30 UnicodeCategory ranges. Each entry is a pair of integers.
/// corresponding to the lower and upper bounds of the unicodes of the characters
/// that have the given UnicodeCategory code (between 0 and 29).
/// </summary>");
                sw.WriteLine("public static int[][][] " + field + " = new int[][][]{");
                // Entries are consumed by category code, so they are written in code
                // order explicitly instead of relying on Dictionary enumeration order.
                for (int code = 0; code < CategoryCount; code++)
                {
                    UnicodeCategory c = (UnicodeCategory)code;
                    sw.WriteLine("//{0}({1}):", c, code);
                    if (catMap[c].Count == 0)
                        sw.WriteLine("null,");
                    else
                    {
                        sw.WriteLine("new int[][]{");
                        foreach (int[] range in catMap[c].ranges)
                            sw.WriteLine(" new int[]{" + string.Format("{0},{1}", range[0], range[1]) + "},");
                        sw.WriteLine("},");
                    }
                }
                sw.WriteLine("};");
            }

            sw.WriteLine(@"/// <summary>
/// Compact BDD encodings of the categories.
/// </summary>");
            sw.WriteLine("public static int[][] " + field + "Bdd = new int[][]{");
            for (int code = 0; code < CategoryCount; code++)
            {
                sw.WriteLine("//{0}({1}):", (UnicodeCategory)code, code);
                BDD catBdd = catBDDs[code];
                if (catBdd == null || catBdd.IsEmpty)
                    sw.WriteLine("null, //false");
                else if (catBdd.IsFull)
                    sw.WriteLine("new int[]{0,0}, //true");
                else
                {
                    sw.WriteLine("new int[]{");
                    foreach (var arc in bddb.SerializeCompact(catBdd))
                        sw.WriteLine("{0},", arc);
                    sw.WriteLine("},");
                }
            }
            sw.WriteLine("};");

            if (bits == 7)
            {
                sw.WriteLine(@"/// <summary>
/// Whitespace character ranges.
/// </summary>");
                sw.WriteLine("public static int[][] " + field + "Whitespace = new int[][]{");
                foreach (int[] range in whitespace.ranges)
                    sw.WriteLine(" new int[]{" + string.Format("{0},{1}", range[0], range[1]) + "},");
                sw.WriteLine("};");

                sw.WriteLine(@"/// <summary>
/// Word character ranges.
/// </summary>");
                sw.WriteLine("public static int[][] " + field + "WordCharacter = new int[][]{");
                foreach (int[] range in wordcharacter.ranges)
                    sw.WriteLine(" new int[]{" + string.Format("{0},{1}", range[0], range[1]) + "},");
                sw.WriteLine("};");
            }

            sw.WriteLine(@"/// <summary>
/// Compact BDD encoding of the whitespace characters.
/// </summary>");
            sw.WriteLine("public static int[] " + field + "WhitespaceBdd = new int[]{");
            foreach (var arc in bddb.SerializeCompact(whitespaceBdd))
                sw.WriteLine("{0},", arc);
            sw.WriteLine("};");

            sw.WriteLine(@"/// <summary>
/// Compact BDD encoding of word characters
/// </summary>");
            sw.WriteLine("public static int[] " + field + "WordCharacterBdd = new int[]{");
            foreach (var arc in bddb.SerializeCompact(wordCharBdd))
                sw.WriteLine("{0},", arc);
            sw.WriteLine("};");
        }
    }

    /// <summary>
    /// Used internally for creating a collection of ranges for serialization.
    /// Each range is a two-element array {lower, upper}, both inclusive.
    /// </summary>
    internal class Ranges
    {
        public List<int[]> ranges = new List<int[]>();

        public Ranges()
        {
        }

        /// <summary>
        /// Adds <paramref name="n"/>: extends the first range whose upper bound is n - 1,
        /// or starts a new singleton range when there is none.
        /// </summary>
        public void Add(int n)
        {
            for (int i = 0; i < ranges.Count; i++)
            {
                if (ranges[i][1] == (n - 1))
                {
                    ranges[i][1] = n;
                    return;
                }
            }
            ranges.Add(new int[] { n, n });
        }

        /// <summary>Number of ranges collected so far.</summary>
        public int Count
        {
            get { return ranges.Count; }
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.SRM.Generated;

namespace Microsoft.SRM
{
    /// <summary>
    /// Maps unicode categories to corresponding character predicates.
    /// </summary>
    /// <typeparam name="PRED">predicates</typeparam>
    public interface IUnicodeCategoryTheory<PRED>
    {
        /// <summary>
        /// Gets the unicode category condition for unicode category cat, that must be an integer between 0 and 29
        /// </summary>
        PRED CategoryCondition(int cat);

        /// <summary>
        /// Gets the white space condition
        /// </summary>
        PRED WhiteSpaceCondition { get; }

        /// <summary>
        /// Gets the word letter (\w) condition
        /// </summary>
        PRED WordLetterCondition { get; }

        /// <summary>
        /// Standard two-letter abbreviations of the categories, indexed by category code.
        /// </summary>
        string[] UnicodeCategoryStandardAbbreviations { get; }
    }

    /// <summary>
    /// Builds the category, whitespace and word-letter predicates from the generated
    /// tables: range tables for the 7-bit encoding, compact BDDs otherwise.
    /// </summary>
    internal class UnicodeCategoryTheory<PRED> : IUnicodeCategoryTheory<PRED>
    {
        ICharAlgebra<PRED> solver;
        PRED[] catConditions = new PRED[30];
        PRED whiteSpaceCondition = default(PRED);
        PRED wordLetterCondition = default(PRED);

        public string[] UnicodeCategoryStandardAbbreviations
        {
            get
            {
                return unicodeCategoryStandardAbbreviations;
            }
        }

        #region unicode category abbreviations
        public static string[] unicodeCategoryStandardAbbreviations = new string[30]{
            "Lu", //0: UppercaseLetter
            "Ll", //1: LowercaseLetter
            "Lt", //2: TitlecaseLetter
            "Lm", //3: ModifierLetter
            "Lo", //4: OtherLetter
            "Mn", //5: NonSpacingMark
            "Mc", //6: SpacingCombiningMark
            "Me", //7: EnclosingMark
            "Nd", //8: DecimalDigitNumber
            "Nl", //9: LetterNumber
            "No", //10: OtherNumber
            "Zs", //11: SpaceSeparator
            "Zl", //12: LineSeparator
            "Zp", //13: ParagraphSeparator
            "Cc", //14: Control
            "Cf", //15: Format
            "Cs", //16: Surrogate
            "Co", //17: PrivateUse
            "Pc", //18: ConnectorPunctuation
            "Pd", //19: DashPunctuation
            "Ps", //20: OpenPunctuation
            "Pe", //21: ClosePunctuation
            "Pi", //22: InitialQuotePunctuation
            "Pf", //23: FinalQuotePunctuation
            "Po", //24: OtherPunctuation
            "Sm", //25: MathSymbol
            "Sc", //26: CurrencySymbol
            "Sk", //27: ModifierSymbol
            "So", //28: OtherSymbol
            "Cn", //29: OtherNotAssigned
        };
        #endregion

        /// <summary>
        /// Predicate name for a category code: "Is" followed by the UnicodeCategory enum name.
        /// </summary>
        public static string UnicodeCategoryPredicateName(int cat)
        {
            string catName = ((System.Globalization.UnicodeCategory)cat).ToString();
            return "Is" + catName;
        }

        /// <summary>
        /// Creates the theory over <paramref name="solver"/> and builds all predicates eagerly.
        /// </summary>
        public UnicodeCategoryTheory(ICharAlgebra<PRED> solver)
        {
            this.solver = solver;
            InitializeUnicodeCategoryDefinitions();
        }

        // Union of the inclusive ranges {lower, upper}.
        PRED MkRangesConstraint(IEnumerable<int[]> ranges)
        {
            PRED res = solver.False;
            foreach (var range in ranges)
                res = solver.MkOr(res, solver.MkRangeConstraint((char)range[0], (char)range[1]));
            return res;
        }

        // Converts a compact serialized BDD from the generated tables into a predicate.
        PRED MkFromCompactBdd(int[] serialized)
        {
            return solver.ConvertFromCharSet(solver.CharSetProvider.DeserializeCompact(serialized));
        }

        // Builds the predicate of one category for the solver's encoding.
        // A null table entry in the 7-bit and 8-bit tables denotes the empty category.
        PRED MkCategoryCondition(int i)
        {
            if (solver.Encoding == BitWidth.BV7)
            {
                //use ranges directly
                if (UnicodeCategoryRanges.ASCII[i] == null)
                    return solver.False;
                return solver.MkCharPredicate(
                    UnicodeCategoryPredicateName(i), MkRangesConstraint(UnicodeCategoryRanges.ASCII[i]));
            }
            if (solver.Encoding == BitWidth.BV8)
            {
                //use BDDs
                if (UnicodeCategoryRanges.CP437Bdd[i] == null)
                    return solver.False;
                return solver.MkCharPredicate(
                    UnicodeCategoryPredicateName(i), MkFromCompactBdd(UnicodeCategoryRanges.CP437Bdd[i]));
            }
            //use BDDs
            return solver.MkCharPredicate(
                UnicodeCategoryPredicateName(i), MkFromCompactBdd(UnicodeCategoryRanges.UnicodeBdd[i]));
        }

        // Builds the whitespace predicate for the solver's encoding.
        PRED MkWhiteSpaceCondition()
        {
            if (solver.Encoding == BitWidth.BV7)
                return solver.MkCharPredicate(
                    "IsWhitespace", MkRangesConstraint(UnicodeCategoryRanges.ASCIIWhitespace));
            if (solver.Encoding == BitWidth.BV8)
                return solver.MkCharPredicate(
                    "IsWhitespace", MkFromCompactBdd(UnicodeCategoryRanges.CP437WhitespaceBdd));
            return solver.MkCharPredicate(
                "IsWhitespace", MkFromCompactBdd(UnicodeCategoryRanges.UnicodeWhitespaceBdd));
        }

        // Builds the word-letter (\w) predicate for the solver's encoding.
        PRED MkWordLetterCondition()
        {
            if (solver.Encoding == BitWidth.BV7)
                return solver.MkCharPredicate(
                    "IsWordletter", MkRangesConstraint(UnicodeCategoryRanges.ASCIIWordCharacter));
            if (solver.Encoding == BitWidth.BV8)
                return solver.MkCharPredicate(
                    "IsWordletter", MkFromCompactBdd(UnicodeCategoryRanges.CP437WordCharacterBdd));
            return solver.MkCharPredicate(
                "IsWordletter", MkFromCompactBdd(UnicodeCategoryRanges.UnicodeWordCharacterBdd));
        }

        private void InitializeUnicodeCategoryDefinitions()
        {
            for (int i = 0; i < 30; i++)
                catConditions[i] = MkCategoryCondition(i);
            whiteSpaceCondition = MkWhiteSpaceCondition();
            wordLetterCondition = MkWordLetterCondition();
        }

        #region IUnicodeCategoryTheory Members

        /// <summary>
        /// Gets the predicate of category <paramref name="i"/>; it is rebuilt when the
        /// stored value equals default(PRED).
        /// </summary>
        public PRED CategoryCondition(int i)
        {
            if (object.Equals(catConditions[i], default(PRED))) //uninitialized
                catConditions[i] = MkCategoryCondition(i);
            return catConditions[i];
        }

        public PRED WhiteSpaceCondition
        {
            get
            {
                if (object.Equals(whiteSpaceCondition, default(PRED)))
                    whiteSpaceCondition = MkWhiteSpaceCondition();
                return whiteSpaceCondition;
            }
        }

        public PRED WordLetterCondition
        {
            get
            {
                if (object.Equals(wordLetterCondition, default(PRED)))
                    wordLetterCondition = MkWordLetterCondition();
                return wordLetterCondition;
            }
        }

        #endregion
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Microsoft.SRM
{
    /// <summary>
    /// Provides some character escaping routines for strings.
    /// </summary>
    internal static class StringUtility
    {
        #region Escaping strings

        /// <summary>
        /// Make an escaped string from a character.
        /// ASCII letters and digits and the characters = ; / ! are returned as they are,
        /// NUL and line feed as \0 and \n, every other character up to 0xFF as \xHH and
        /// everything above as \uHHHH.
        /// </summary>
        /// <param name="c">given character</param>
        /// <param name="useNumericRepresentationOnly">if true then use numeric hexadecimal escaping of all characters</param>
        /// <returns>the escaped representation of c</returns>
        public static string Escape(char c, bool useNumericRepresentationOnly = false)
        {
            int code = (int)c;

            if (useNumericRepresentationOnly)
                return code <= 0xFF ? ToHexRepr(code) : ToUnicodeRepr(code);

            if (code > 255)
                return ToUnicodeRepr(code);

            if (code > 126)
                return ToHexRepr(code);

            switch (c)
            {
                case '\0':
                    return @"\0";
                case '\n':
                    return @"\n";
                case '=':
                case ';':
                case '/':
                case '!':
                    return c.ToString();
                default:
                    // Other control characters (tab, carriage return, ...) and
                    // punctuation, including space, are written numerically.
                    return IsAsciiLetterOrDigit(code) ? c.ToString() : ToHexRepr(code);
            }
        }

        /// <summary>
        /// Make an escaped string from a character; space is written as \x20.
        /// </summary>
        internal static string EscapeWithNumericSpace(char c)
        {
            return c == ' ' ? ToHexRepr((int)c) : Escape(c);
        }

        static bool IsAsciiLetterOrDigit(int code)
        {
            return ('a' <= code && code <= 'z')
                || ('A' <= code && code <= 'Z')
                || ('0' <= code && code <= '9');
        }

        // \x followed by at least two upper-case hex digits.
        static string ToHexRepr(int code)
        {
            return "\\x" + code.ToString("X2", System.Globalization.CultureInfo.InvariantCulture);
        }

        // \u followed by at least four upper-case hex digits.
        static string ToUnicodeRepr(int i)
        {
            return "\\u" + i.ToString("X4", System.Globalization.CultureInfo.InvariantCulture);
        }

        /// <summary>
        /// Makes an escaped string from a literal string s.
        /// Appends '\"' at the start and end of the encoded string.
        /// </summary>
        public static string Escape(string s)
        {
            StringBuilder sb = new StringBuilder();
            sb.Append("\"");
            foreach (char c in s)
            {
                sb.Append(Escape(c));
            }
            sb.Append("\"");
            return sb.ToString();
        }

        /// <summary>
        /// Unescapes any escaped characters in the input string.
        /// (Same as System.Text.RegularExpressions.Regex.Unescape)
        /// </summary>
        public static string Unescape(string s)
        {
            return System.Text.RegularExpressions.Regex.Unescape(s);
        }
        #endregion

        /// <summary>
        /// Encodes s as the comma-separated decimal codes of its characters.
        /// A null or empty input is returned unchanged.
        /// </summary>
        internal static string SerializeStringToCharCodeSequence(string s)
        {
            if (string.IsNullOrEmpty(s))
                return s;
            // Invariant culture: the sequence is a machine-readable format.
            var encodedChars = Array.ConvertAll(s.ToCharArray(),
                c => ((int)c).ToString(System.Globalization.CultureInfo.InvariantCulture));
            return string.Join(",", encodedChars);
        }

        /// <summary>
        /// Inverse of <see cref="SerializeStringToCharCodeSequence"/>.
        /// A null or empty input is returned unchanged.
        /// </summary>
        internal static string DeserializeStringFromCharCodeSequence(string s)
        {
            if (string.IsNullOrEmpty(s))
                return s;
            var encodedChars = s.Split(',');
            return new String(Array.ConvertAll(encodedChars,
                x => (char)int.Parse(x, System.Globalization.CultureInfo.InvariantCulture)));
        }
    }
}

using System.Runtime.Serialization.Formatters.Binary;
using System.Text.RegularExpressions;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace Microsoft.SRM
{
    /// <summary>
    /// Matching tests that also check that a regex finds the same matches
    /// after being serialized to a file and deserialized again.
    /// </summary>
    [TestClass]
    public class RegexMatcherTests
    {
        [TestMethod]
        public void TestSRM()
        {
            var sr = new Microsoft.SRM.Regex(@"a[^ab]+b");
            var input = "xaTAG1bxaTAG2bc";
            var matches = sr.Matches(input);
            Assert.AreEqual(2, matches.Count);
            Assert.AreEqual(1, matches[0].Index);
            Assert.AreEqual(6, matches[0].Length);
            Assert.AreEqual(8, matches[1].Index);
            Assert.AreEqual(6, matches[1].Length);
            AssertSameMatchesAfterRoundTrip(sr, input, matches);
        }

        [TestMethod]
        public void TestSRM_singlePass()
        {
            var sr = new Microsoft.SRM.Regex(@"abcbc1|cbc2");
            var input = "xxxabcbc1yyyccbc2xxx";
            var matches = sr.Matches(input);
            Assert.AreEqual(2, matches.Count);
            Assert.AreEqual(3, matches[0].Index);
            Assert.AreEqual(6, matches[0].Length);
            Assert.AreEqual(13, matches[1].Index);
            Assert.AreEqual(4, matches[1].Length);
            AssertSameMatchesAfterRoundTrip(sr, input, matches);
        }

        [TestMethod]
        public void TestSRM_singletonSeq()
        {
            var sr = new Microsoft.SRM.Regex(@"a[bB]c");
            var input = "xxxabcyyyaBcxxx";
            var matches = sr.Matches(input);
            Assert.AreEqual(2, matches.Count);
            Assert.AreEqual(3, matches[0].Index);
            Assert.AreEqual(3, matches[0].Length);
            Assert.AreEqual(9, matches[1].Index);
            Assert.AreEqual(3, matches[1].Length);
            AssertSameMatchesAfterRoundTrip(sr, input, matches);
        }

        // Serializes the regex to a unique temporary file, reloads it and checks that
        // the reloaded regex finds the same matches. The file is removed afterwards so
        // that test runs neither collide with each other nor leave files behind.
        private static void AssertSameMatchesAfterRoundTrip(
            Microsoft.SRM.Regex regex, string input, System.Collections.ICollection expected)
        {
            string file = System.IO.Path.Combine(
                System.IO.Path.GetTempPath(), System.Guid.NewGuid().ToString("N") + ".bin");
            try
            {
                regex.Serialize(file);
                var reloaded = Microsoft.SRM.Regex.Deserialize(file);
                CollectionAssert.AreEqual(expected, reloaded.Matches(input));
            }
            finally
            {
                System.IO.File.Delete(file);
            }
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Microsoft.SRM
{
    /// <summary>
    /// Command-line entry point that regenerates the Unicode tables.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Expects exactly one argument, the directory the generated files are written to.
        /// </summary>
        /// <returns>0 on success, 1 when the argument count is wrong</returns>
        static int Main(string[] args)
        {
            if (args.Length != 1)
            {
                // Name the expected argument so the message says what to pass.
                System.Console.WriteLine("usage: unicode_table_gen <target_directory>");
                return 1;
            }

            string targetDirectory = args[0];
            UnicodeCategoryRangesGenerator.Generate("Microsoft.SRM.Generated", "UnicodeCategoryRanges", targetDirectory);
            IgnoreCaseRelationGenerator.Generate("Microsoft.SRM.Generated", "IgnoreCaseRelation", targetDirectory);
            return 0;
        }
    }
}