├── .gitignore ├── go.mod ├── go.sum ├── convert_rtl_test.go ├── .vscode └── launch.json ├── LICENSE ├── convert_re2_test.go ├── README.md ├── _runtestmain.go ├── regexp_analysis.go ├── convert_pcre_test.go ├── main.go ├── convert.go └── convert_findfirstchar.go /.gitignore: -------------------------------------------------------------------------------- 1 | regexp2cg 2 | example/* 3 | example_old/* 4 | test/* 5 | .DS_Store -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/dlclark/regexp2cg 2 | 3 | go 1.21.4 4 | 5 | require github.com/pkg/errors v0.9.1 6 | 7 | require github.com/dlclark/regexp2 v1.11.1-0.20240706002540-f5f79a46e241 8 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/dlclark/regexp2 v1.11.1-0.20240706002540-f5f79a46e241 h1:tdmAsILCdj0TcUrYGU+GigSbtsl0ouA0O7+GXgmMzg0= 2 | github.com/dlclark/regexp2 v1.11.1-0.20240706002540-f5f79a46e241/go.mod h1:YvCrhrh/qlds8EhFKPtJprdXn5fWBllSw1qo99dZyiQ= 3 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 4 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 5 | -------------------------------------------------------------------------------- /convert_rtl_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/dlclark/regexp2/syntax" 7 | ) 8 | 9 | func TestRightToLeft_Basic(t *testing.T) { 10 | pattern := `foo\d+` 11 | s := "0123456789foo4567890foo1foo 0987" 12 | exec := generateAndCompile(t, pattern, syntax.RightToLeft) 13 | 14 | runMatch(t, pattern, exec, s, " 0: foo1") 15 | } 16 | 17 | func TestRightToLeft_StartAt(t *testing.T) { 18 | pattern := `\d` 19 | exec := generateAndCompile(t, pattern, syntax.RightToLeft) 20 | 21 | runMatch(t, pattern, exec, "0123", " 0: 3") 22 | } 23 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | 8 | { 9 | "name": "Run With Pattern", 10 | "type": "go", 11 | "request": "launch", 12 | "mode": "debug", 13 | "program": ".", 14 | "args": ["-path", "../regex-redux", "-o", "../regex-redux/testing.go" ], //["-expr", "(?'abc'\\w+):\\k{2}", "-opt", "0"] 15 | }, 16 | { 17 | "name": "Test PCRE", 18 | "type": "go", 19 | "request": "launch", 20 | "mode": "test", 21 | "program": ".", 22 | "args": [] 23 | } 24 | ] 25 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Doug Clark 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /convert_re2_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/dlclark/regexp2/syntax" 7 | ) 8 | 9 | func runNoMatch(t *testing.T, pattern, reExec, input string) { 10 | m := matchString(t, pattern, reExec, input) 11 | validateNoMatch(t, pattern, m, input) 12 | } 13 | 14 | func runMatch(t *testing.T, pattern, reExec, input, expected string) { 15 | m := matchString(t, pattern, reExec, input) 16 | validateMatch(t, pattern, m, expected, input) 17 | } 18 | 19 | func TestRE2NamedAscii_Concat(t *testing.T) { 20 | pattern := "[[:digit:]a]" 21 | exec := generateAndCompile(t, pattern, syntax.RE2) 22 | 23 | runNoMatch(t, pattern, exec, "b") 24 | 25 | runMatch(t, pattern, exec, "a", " 0: a") 26 | 27 | runNoMatch(t, pattern, exec, "[") 28 | 29 | runMatch(t, pattern, exec, "5", " 0: 5") 30 | } 31 | 32 | func TestRE2Dollar_Singleline(t *testing.T) { 33 | // PCRE allows for \n after the $ and RE2 doesn't 34 | pattern := `^ac$\n` 35 | exec := generateAndCompile(t, pattern, syntax.RE2) 36 | 37 | runNoMatch(t, pattern, exec, "ac") 38 | runNoMatch(t, pattern, exec, "ac\n") 39 | } 40 | 41 | func TestRE2Dollar_Multiline(t *testing.T) { 42 | pattern := `^ac$\n` 43 | exec := generateAndCompile(t, pattern, syntax.RE2|syntax.Multiline) 44 | 45 | runNoMatch(t, pattern, exec, "ac") 46 | runMatch(t, pattern, exec, "ac\n", " 0: ac\\x0a") 47 | } 48 | 49 | func TestRE2ExtendedZero(t *testing.T) { 50 | notZero := "߀" // \u07c0 51 | 52 | exec := generateAndCompile(t, `^\d$`, syntax.RE2) 53 | runNoMatch(t, `^\d$`, exec, notZero) 54 | 55 | exec = generateAndCompile(t, `^\D$`, syntax.RE2) 56 | runMatch(t, `^\D$`, exec, notZero, " 0: \\xdf\\x80") 57 | } 58 | 59 | func TestRegularExtendedZero(t *testing.T) { 60 | notZero := "߀" // \u07c0 61 | 62 | exec := generateAndCompile(t, `^\d$`, 0) 63 | runMatch(t, `^\d$`, exec, notZero, " 0: \\xdf\\x80") 64 | 65 | exec = generateAndCompile(t, `^\D$`, 0) 66 | runNoMatch(t, `^\D$`, exec, notZero) 67 | } 68 | 69 | func TestRE2Word(t *testing.T) { 70 | exec := generateAndCompile(t, `\w`, syntax.RE2) 71 | runNoMatch(t, `\w`, exec, "å") 72 | 73 | exec = generateAndCompile(t, `\W`, syntax.RE2) 74 | runMatch(t, `\W`, exec, "å", " 0: \\xc3\\xa5") 75 | } 76 | 77 | func TestRegularWord(t *testing.T) { 78 | exec := generateAndCompile(t, `\w`, 0) 79 | runMatch(t, `\w`, exec, "å", " 0: \\xc3\\xa5") 80 | 81 | exec = generateAndCompile(t, `\W`, 0) 82 | runNoMatch(t, `\W`, exec, "å") 83 | } 84 | 85 | func TestRE2Space(t *testing.T) { 86 | exec := generateAndCompile(t, `\s`, syntax.RE2) 87 | runNoMatch(t, `\s`, exec, "\x0b") 88 | 89 | exec = generateAndCompile(t, `\S`, syntax.RE2) 90 | runMatch(t, `\S`, exec, "\x0b", " 0: \\x0b") 91 | } 92 | 93 | func TestRegularSpace(t *testing.T) { 94 | exec := generateAndCompile(t, `\s`, 0) 95 | runMatch(t, `\s`, exec, "\x0b", " 0: \\x0b") 96 | 97 | exec = generateAndCompile(t, `\S`, 0) 98 | runNoMatch(t, `\S`, exec, "\x0b") 99 | } 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # What is it? 2 | `regexp2cg` will convert `regexp2` patterns that typically run as interpreted state machines into Go code that can be compiled and optimized by the Go compiler. 
This can have a dramatic runtime performance improvement--typically ~300%, but it can be 10x depending on the pattern and workload. The tradeoff is an increase in Go program compile time due to larger code size. For hot-path regexp patterns this tradeoff is often worth it. 3 | 4 | # Usage 5 | 6 | ## Get the code... 7 | ```bash 8 | go install github.com/dlclark/regexp2cg 9 | ``` 10 | 11 | This will download `regexp2cg` from github, compile, and install it. 12 | 13 | **Since `regexp2cg` is currently experimental, to use the pre-compiled regex's your projects will need a specific `code_gen` branch of the `regexp2` library**: 14 | 15 | ```bash 16 | go get github.com/dlclark/regexp2@code_gen 17 | ``` 18 | 19 | Eventually these changes will be merged into `regexp2` proper, but since there are a large number of changes I want to roll this in slowly. 20 | 21 | ## Run it... 22 | 23 | ```bash 24 | regexp2cg -o regexp2_codegen.go 25 | ``` 26 | 27 | By default regexp2cg will search code files (excluding tests) in the current working directory for these patterns: 28 | `regexp2.MustCompile("Pattern", options)` and `regexp2.Compile("Pattern", options)` 29 | 30 | If it finds any instances of this pattern in the code it will make a new file (specified via `-o`, I recommend `regexp2_codegen.go`) that contains state machines for each pattern+options combination found. During `init` it will register these state machines with `regexp2` so the MustCompile method knows to return our code generated, compiled state machine instance instead of an regexp2 interpreter. 31 | 32 | The original code is not changed in any way. A state machine replacement is registered with `regexp2` for that pattern and options and that's it. If you want to "undo" the change, delete the new file created by `regexp2cg` and the original regexp will once again be interpreted instead of compiled. 33 | 34 | You can also convert a single, given pattern via the command line options `-expr ["my pattern"]` and `-opt [options as int]` and by default it'll output the converted code to STDOUT. 35 | 36 | For future runs you may want to add a [`//go:generate` comment](https://go.dev/blog/generate) with the `regexp2cg` command to one of your files. 37 | 38 | # Notes 39 | * `regexp2cg` uses an AST parser to find the MustCompile and Compile methods, so the code needs to be in a compiling state for the patterns to be detected. 40 | * The pattern and options specified cannot be dynamic -- if the pattern comes from a function call or is pieced together via string concatenation (e.g. `"pattern" + var + "more pattern"`) then it will not be converted. The concept only works for fully known-at-compile-time patterns and options. 41 | * If specified, the output file is overwritten entirely 42 | * The directory searching for code isn't recursive, you'll need to run `regexp2cg` in each directory you want to generate pre-compiled patterns for. 43 | 44 | # Original code 45 | C# 11 added a compile-time regex generator: https://github.com/dotnet/runtime/tree/main/src/libraries/System.Text.RegularExpressions/gen 46 | 47 | This is a pure Go port of that generator (v9.0 preview) using the [regexp2](githug.com/dlclark/regexp2) engine as the base. 48 | 49 | # Not supported patterns 50 | Per the C# implementation patterns that contain the following cannot be dynamically generated: 51 | * Case insensitive back-references (I may have fixed this in the port) 52 | * RegexNode Tree depth of 40 or larger. This makes incredibly large code files that can impact compile performance. 
The value 40 is inherited from the C# compiler limitations. Will need to play with Go compiler to see what a reasonable value is. 53 | 54 | # Reporting issues 55 | This utility is new and likely has errors. If you think you found a bug please confirm the pattern works as expected on https://regex101.com using the .NET Flavor. Please include a short Go Test that uses the pattern, options, match text, and expected results. 56 | 57 | # Future plans ... 58 | It might be nice to be able to exclude a pattern from the directory processing. Maybe add a command line option `--exclude "Pattern"` or you add support for an exclude comment above the `regexp2.MustCompile` line: `// regexpcg: exclude`. 59 | 60 | -------------------------------------------------------------------------------- /_runtestmain.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "os" 7 | "strings" 8 | 9 | "github.com/dlclark/regexp2" 10 | ) 11 | 12 | // our file that runs the given regex and options against the args 13 | // and outputs the results in a known way for comparison 14 | 15 | func main() { 16 | re := regexp2.MustCompile(__PATTERN__, __OPTIONS__) 17 | 18 | if len(os.Args) > 2 { 19 | //debug mode 20 | fmt.Printf("Pattern: %s\n", re.String()) 21 | //fmt.Printf("Options: %v\n", re.Options) 22 | } 23 | 24 | m, err := re.FindStringMatch(unEscapeToMatch(os.Args[1])) 25 | if err != nil { 26 | fmt.Printf("ERROR: %v", err) 27 | } 28 | if m == nil { 29 | fmt.Println("No match") 30 | return 31 | } 32 | 33 | g := m.Groups() 34 | for i := 0; i < len(g); i++ { 35 | val := "" 36 | if len(g[i].Captures) > 0 { 37 | val = unEscapeGroup(g[i].String()) 38 | } 39 | fmt.Printf("%2v: %s\n", i, val) 40 | } 41 | 42 | } 43 | 44 | func unEscapeGroup(val string) string { 45 | // use hex for chars 0x00-0x1f, 0x7f-0xff 46 | buf := &bytes.Buffer{} 47 | 48 | for i := 0; i < len(val); i++ { 49 | ch := val[i] 50 | if ch <= 0x1f || ch >= 0x7f { 51 | //write it as a \x00 52 | fmt.Fprintf(buf, "\\x%.2x", ch) 53 | } else { 54 | // write as-is 55 | buf.WriteByte(ch) 56 | } 57 | } 58 | 59 | return buf.String() 60 | } 61 | 62 | func unEscapeToMatch(line string) string { 63 | idx := strings.IndexRune(line, '\\') 64 | // no slashes means no unescape needed 65 | if idx == -1 { 66 | return line 67 | } 68 | 69 | buf := bytes.NewBufferString(line[:idx]) 70 | // get the runes for the rest of the string -- we're going full parser scan on this 71 | 72 | inEscape := false 73 | // take any \'s and convert them 74 | for i := idx; i < len(line); i++ { 75 | ch := line[i] 76 | if ch == '\\' { 77 | if inEscape { 78 | buf.WriteByte(ch) 79 | } 80 | inEscape = !inEscape 81 | continue 82 | } 83 | if inEscape { 84 | switch ch { 85 | case 'x': 86 | buf.WriteByte(scanHex(line, &i)) 87 | case 'a': 88 | buf.WriteByte(0x07) 89 | case 'b': 90 | buf.WriteByte('\b') 91 | case 'e': 92 | buf.WriteByte(0x1b) 93 | case 'f': 94 | buf.WriteByte('\f') 95 | case 'n': 96 | buf.WriteByte('\n') 97 | case 'r': 98 | buf.WriteByte('\r') 99 | case 't': 100 | buf.WriteByte('\t') 101 | case 'v': 102 | buf.WriteByte(0x0b) 103 | default: 104 | if ch >= '0' && ch <= '7' { 105 | buf.WriteByte(scanOctal(line, &i)) 106 | } else { 107 | buf.WriteByte(ch) 108 | //panic(fmt.Sprintf("unexpected escape '%v' in %v", string(ch), line)) 109 | } 110 | } 111 | inEscape = false 112 | } else { 113 | buf.WriteByte(ch) 114 | } 115 | } 116 | 117 | return buf.String() 118 | } 119 | 120 | func scanHex(line string, idx *int) byte { 
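// Note: on entry *idx points at the 'x' of a \xHH escape; the two following
// hex digits are consumed and decoded into a single byte (e.g. \x41 -> 'A').
// Fewer than two remaining characters is treated as a malformed escape and panics.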
121 | if *idx >= len(line)-2 { 122 | panic(fmt.Sprintf("not enough hex chars in %v at %v", line, *idx)) 123 | } 124 | (*idx)++ 125 | d1 := hexDigit(line[*idx]) 126 | (*idx)++ 127 | d2 := hexDigit(line[*idx]) 128 | if d1 < 0 || d2 < 0 { 129 | panic("bad hex chars") 130 | } 131 | 132 | return byte(d1*0x10 + d2) 133 | } 134 | 135 | // Returns n <= 0xF for a hex digit. 136 | func hexDigit(ch byte) int { 137 | 138 | if d := uint(ch - '0'); d <= 9 { 139 | return int(d) 140 | } 141 | 142 | if d := uint(ch - 'a'); d <= 5 { 143 | return int(d + 0xa) 144 | } 145 | 146 | if d := uint(ch - 'A'); d <= 5 { 147 | return int(d + 0xa) 148 | } 149 | 150 | return -1 151 | } 152 | 153 | // Scans up to three octal digits (stops before exceeding 0377). 154 | func scanOctal(line string, idx *int) byte { 155 | // Consume octal chars only up to 3 digits and value 0377 156 | 157 | // octals can be 3,2, or 1 digit 158 | c := 3 159 | 160 | if diff := len(line) - *idx; c > diff { 161 | c = diff 162 | } 163 | 164 | i := 0 165 | d := int(line[*idx] - '0') 166 | for c > 0 && d <= 7 { 167 | i *= 8 168 | i += d 169 | 170 | c-- 171 | (*idx)++ 172 | if *idx < len(line) { 173 | d = int(line[*idx] - '0') 174 | } 175 | } 176 | (*idx)-- 177 | 178 | // Octal codes only go up to 255. Any larger and the behavior that Perl follows 179 | // is simply to truncate the high bits. 180 | i &= 0xFF 181 | 182 | return byte(i) 183 | } 184 | -------------------------------------------------------------------------------- /regexp_analysis.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "github.com/dlclark/regexp2/syntax" 4 | 5 | type analysisResults struct { 6 | // true if the whole tree successfully processed, otherwise false 7 | // if false we assume the worst and "unknown" but this should be rare 8 | complete bool 9 | 10 | //Set of nodes that are considered to be atomic based on themselves or their ancestry. 11 | isAtomicByAncestor map[*syntax.RegexNode]struct{} 12 | 13 | //Set of nodes that directly or indirectly contain capture groups. 14 | containsCapture map[*syntax.RegexNode]struct{} 15 | 16 | //Set of nodes that directly or indirectly contain backtracking constructs that aren't hidden internaly by atomic constructs. 17 | mayBacktrack map[*syntax.RegexNode]struct{} 18 | 19 | //Set of nodes contained inside loops. 20 | inLoops map[*syntax.RegexNode]struct{} 21 | 22 | hasIgnoreCase bool 23 | hasRightToLeft bool 24 | } 25 | 26 | func analyze(tree *syntax.RegexTree) *analysisResults { 27 | var results = &analysisResults{ 28 | isAtomicByAncestor: make(map[*syntax.RegexNode]struct{}), 29 | containsCapture: make(map[*syntax.RegexNode]struct{}), 30 | } 31 | 32 | results.complete = tryAnalyze(tree.Root, results, true, false) 33 | 34 | return results 35 | } 36 | 37 | func tryAnalyze(node *syntax.RegexNode, results *analysisResults, isAtomicByAncestor bool, isInLoop bool) bool { 38 | //TODO: stack-depth check? 39 | 40 | // Track whether we've seen any nodes with various options set. 41 | results.hasIgnoreCase = results.hasIgnoreCase || (node.Options&syntax.IgnoreCase) != 0 42 | results.hasRightToLeft = results.hasRightToLeft || (node.Options&syntax.RightToLeft) != 0 43 | 44 | // Track whether this node is inside of a loop. 
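// isInLoop is inherited from the walk: it becomes true once an ancestor Loop or
// Lazyloop node has been seen (set in the type switch below), and the inLoops
// set itself is only allocated the first time it is needed.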
45 | if isInLoop { 46 | if results.inLoops == nil { 47 | results.inLoops = make(map[*syntax.RegexNode]struct{}) 48 | } 49 | results.inLoops[node] = struct{}{} 50 | } 51 | 52 | if isAtomicByAncestor { 53 | // We've been told by our parent that we should be considered atomic, so add ourselves 54 | // to the atomic collection. 55 | results.isAtomicByAncestor[node] = struct{}{} 56 | } else { 57 | // Certain kinds of nodes incur backtracking logic themselves: add them to the backtracking collection. 58 | // We may later find that a node contains another that has backtracking; we'll add nodes based on that 59 | // after examining the children. 60 | if node.T == syntax.NtAlternate || 61 | (node.M != node.N && 62 | (node.T == syntax.NtLoop || node.T == syntax.NtLazyloop || 63 | node.T == syntax.NtOneloop || node.T == syntax.NtNotoneloop || node.T == syntax.NtSetloop || 64 | node.T == syntax.NtOnelazy || node.T == syntax.NtNotonelazy || node.T == syntax.NtSetlazy)) { 65 | 66 | results.addMayBacktrack(node) 67 | } 68 | } 69 | 70 | // Update state for certain node types. 71 | var isAtomicBySelf = false 72 | switch node.T { 73 | case syntax.NtAtomic, syntax.NtNegLook, syntax.NtPosLook: 74 | isAtomicBySelf = true 75 | case syntax.NtCapture: 76 | results.containsCapture[node] = struct{}{} 77 | case syntax.NtLoop, syntax.NtLazyloop: 78 | isInLoop = true 79 | } 80 | 81 | childCount := len(node.Children) 82 | for i := 0; i < childCount; i++ { 83 | child := node.Children[i] 84 | 85 | // Determine whether the child should be treated as atomic (whether anything 86 | // can backtrack into it), which is influenced by whether this node (the child's 87 | // parent) is considered atomic by itself or by its parent. 88 | treatChildAsAtomic := isAtomicByAncestor || isAtomicBySelf 89 | 90 | // If the parent is atomic, so is the child. That's the whole purpose 91 | // of the Atomic node, and lookarounds are also implicitly atomic. 92 | if !(node.T == syntax.NtAtomic || node.T == syntax.NtNegLook || node.T == syntax.NtPosLook || 93 | node.T == syntax.NtAlternate || node.T == syntax.NtBackRefCond || node.T == syntax.NtExprCond || 94 | node.T == syntax.NtCapture || 95 | (node.T == syntax.NtConcatenate && i == childCount-1) || 96 | ((node.T == syntax.NtLoop || node.T == syntax.NtLazyloop) && node.N == 1)) { 97 | // if these conditions aren't met then we're not atomic 98 | treatChildAsAtomic = false 99 | } 100 | 101 | //analyze the child 102 | if !tryAnalyze(child, results, treatChildAsAtomic, isInLoop) { 103 | return false 104 | } 105 | 106 | // If the child contains captures, so too does this parent. 107 | if _, ok := results.containsCapture[child]; ok { 108 | results.containsCapture[node] = struct{}{} 109 | } 110 | 111 | // If the child might require backtracking into it, so too might the parent, 112 | // unless the parent is itself considered atomic. Here we don't consider parental 113 | // atomicity, as we need to surface upwards to the parent whether any backtracking 114 | // will be visible from this node to it. 
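// For example, in a subexpression like `(?:a|b)+` that is not already atomic via
// an ancestor, the alternation is marked as backtracking and that mark propagates
// up through the loop and any enclosing concatenation, stopping only at an atomic
// group or lookaround (which are atomic by themselves).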
115 | 116 | if !isAtomicBySelf && results.mayBacktrack != nil { 117 | _, ok := results.mayBacktrack[child] 118 | if ok { 119 | results.addMayBacktrack(node) 120 | } 121 | } 122 | } 123 | 124 | return true 125 | } 126 | 127 | func (a *analysisResults) IsAtomicByAncestor(node *syntax.RegexNode) bool { 128 | _, ok := a.isAtomicByAncestor[node] 129 | return ok 130 | } 131 | 132 | func (a *analysisResults) MayContainCapture(node *syntax.RegexNode) bool { 133 | if !a.complete { 134 | return true 135 | } 136 | _, ok := a.containsCapture[node] 137 | return ok 138 | } 139 | 140 | func (a *analysisResults) MayBacktrack(node *syntax.RegexNode) bool { 141 | if !a.complete { 142 | return true 143 | } 144 | if a.mayBacktrack == nil { 145 | return false 146 | } 147 | _, ok := a.mayBacktrack[node] 148 | return ok 149 | } 150 | 151 | func (a *analysisResults) addMayBacktrack(node *syntax.RegexNode) { 152 | if a.mayBacktrack == nil { 153 | a.mayBacktrack = make(map[*syntax.RegexNode]struct{}) 154 | } 155 | a.mayBacktrack[node] = struct{}{} 156 | } 157 | 158 | func (a *analysisResults) IsInLoop(node *syntax.RegexNode) bool { 159 | if !a.complete { 160 | return true 161 | } 162 | if a.inLoops == nil { 163 | return false 164 | } 165 | _, ok := a.inLoops[node] 166 | return ok 167 | } 168 | 169 | func (a *analysisResults) HasIgnoreCase() bool { 170 | return !a.complete || a.hasIgnoreCase 171 | } 172 | 173 | func (a *analysisResults) HasRightToLeft() bool { 174 | return !a.complete || a.hasRightToLeft 175 | } 176 | -------------------------------------------------------------------------------- /convert_pcre_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "fmt" 7 | "log" 8 | "os" 9 | "os/exec" 10 | "path/filepath" 11 | "regexp" 12 | "slices" 13 | "strconv" 14 | "strings" 15 | "testing" 16 | 17 | "github.com/dlclark/regexp2/syntax" 18 | "github.com/pkg/errors" 19 | ) 20 | 21 | // Process the file "testoutput1" from PCRE2 v10.21 (public domain) 22 | var totalCount, failCount = 0, 0 23 | 24 | func TestConversion(t *testing.T) { 25 | defer func() { 26 | if failCount > 0 { 27 | t.Logf("%v of %v patterns failed", failCount, totalCount) 28 | } 29 | }() 30 | // open our test patterns file and run through it 31 | // validating results as we go 32 | file, err := os.Open("testoutput1") 33 | if err != nil { 34 | log.Fatal(err) 35 | } 36 | defer file.Close() 37 | 38 | // the high level structure of the file: 39 | // #comments - ignore only outside of the pattern 40 | // pattern (could be multi-line, could be surrounded by "" or //) after the / there are the options some we understand, some we don't 41 | // test case 42 | // 0: success case 43 | // \= Expect no match (ignored) 44 | // another test case 45 | // No Match 46 | // 47 | // another pattern ...etc 48 | 49 | scanner := bufio.NewScanner(file) 50 | // main pattern loop 51 | for scanner.Scan() { 52 | // reading the file a line at a time 53 | line := scanner.Text() 54 | 55 | if trim := strings.TrimSpace(line); trim == "" || strings.HasPrefix(trim, "#") { 56 | // skip blanks and comments 57 | continue 58 | } 59 | 60 | patternStart := line[0] 61 | if patternStart != '/' && patternStart != '"' { 62 | // an error! 
expected a pattern but we didn't understand what was in the file 63 | t.Fatalf("Unknown file format, expected line to start with '/' or '\"', line in: %v", line) 64 | } 65 | 66 | // start building our pattern, handling multi-line patterns 67 | pattern := line 68 | totalCount++ 69 | 70 | // keep appending the lines to our pattern string until we 71 | // find our closing tag, don't allow the first char to match on the 72 | // line start, but subsequent lines could end on the first char 73 | allowFirst := false 74 | for !containsEnder(line, patternStart, allowFirst) { 75 | if !scanner.Scan() { 76 | // an error! expected more pattern, but got eof 77 | t.Fatalf("Unknown file format, expected more pattern text, but got EOF, pattern so far: %v", pattern) 78 | } 79 | line = scanner.Text() 80 | pattern += fmt.Sprintf("\n%s", line) 81 | allowFirst = true 82 | } 83 | 84 | //subtest 85 | t.Run(fmt.Sprintf("pcre %s", pattern), func(t *testing.T) { 86 | 87 | //t.Logf("Compile Pattern: %v", pattern) 88 | // we have our raw pattern! -- we need to convert this to a compiled regex 89 | reExec := compileRawPattern(t, pattern) 90 | //t.Logf("Program created: %v", reExec) 91 | var ( 92 | capsIdx map[int]int 93 | m string 94 | toMatch string 95 | ) 96 | // now we need to parse the test cases if there are any 97 | // they start with 4 spaces -- if we don't get a 4-space start then 98 | // we're back out to our next pattern 99 | for scanner.Scan() { 100 | line = scanner.Text() 101 | 102 | // blank line is our separator for a new pattern 103 | if strings.TrimSpace(line) == "" { 104 | break 105 | } 106 | 107 | // could be either " " or "\= Expect" 108 | if strings.HasPrefix(line, "\\= Expect") { 109 | continue 110 | } else if strings.HasPrefix(line, " ") { 111 | // trim off leading spaces for our text to match 112 | toMatch = line[4:] 113 | // trim off trailing spaces too 114 | toMatch = strings.TrimRight(toMatch, " ") 115 | 116 | m = matchString(t, pattern, reExec, toMatch) 117 | 118 | capsIdx = make(map[int]int) 119 | continue 120 | //t.Fatalf("Expected match text to start with 4 spaces, instead got: '%v'", line) 121 | } else if strings.HasPrefix(line, "No match") { 122 | validateNoMatch(t, pattern, m, toMatch) 123 | // no match means we're done 124 | continue 125 | } else if subs := matchGroup.FindStringSubmatch(line); len(subs) == 3 { 126 | gIdx, _ := strconv.Atoi(subs[1]) 127 | if _, ok := capsIdx[gIdx]; !ok { 128 | capsIdx[gIdx] = 0 129 | } 130 | validateMatch(t, pattern, m, line, toMatch) 131 | capsIdx[gIdx]++ 132 | continue 133 | } else { 134 | // no match -- problem 135 | t.Fatalf("Unknown file format, expected match or match group but got '%v'", line) 136 | } 137 | } 138 | }) 139 | } 140 | 141 | if err := scanner.Err(); err != nil { 142 | log.Fatal(err) 143 | } 144 | } 145 | 146 | var matchGroup = regexp.MustCompile(`^\s*(\d+): (.*)`) 147 | 148 | func problem(t *testing.T, input string, args ...interface{}) { 149 | failCount++ 150 | t.Errorf(input, args...) 
151 | } 152 | 153 | func validateNoMatch(t *testing.T, pattern string, m string, toMatch string) { 154 | if len(m) == 0 || m == "No match\n" { 155 | return 156 | } 157 | 158 | problem(t, "Expected no match for pattern '%v' with input '%v', but got '%v'", pattern, toMatch, m) 159 | } 160 | 161 | func validateMatch(t *testing.T, pattern string, m string, line, toMatch string) { 162 | if len(m) == 0 { 163 | // already error'd earlier up stream 164 | return 165 | } 166 | 167 | if m == "No match\n" { 168 | // we didn't match, but should have 169 | problem(t, "Expected match for pattern '%v' with input '%v', but got no match", pattern, toMatch) 170 | return 171 | } 172 | 173 | // find our line in our output 174 | lines := strings.Split(m, "\n") 175 | if !slices.Contains(lines, line) { 176 | // we did not find our line in the input 177 | problem(t, "Did not find expected line '%s' for pattern '%v' with input '%v'. Got '%s'", line, pattern, toMatch, m) 178 | } 179 | } 180 | 181 | // returns the path to an executable for running tests against this pattern 182 | func compileRawPattern(t *testing.T, pattern string) string { 183 | // check our end for RegexOptions -trim them off 184 | index := strings.LastIndexAny(pattern, "/\"") 185 | // 186 | // Append "= Debug" to compare details between corefx and regexp2 on the PCRE test suite 187 | // 188 | var opts syntax.RegexOptions 189 | 190 | if index+1 < len(pattern) { 191 | textOptions := pattern[index+1:] 192 | pattern = pattern[:index+1] 193 | // there are lots of complex options here 194 | for _, textOpt := range strings.Split(textOptions, ",") { 195 | switch textOpt { 196 | case "dupnames": 197 | // we don't know how to handle this... 198 | default: 199 | if strings.Contains(textOpt, "i") { 200 | opts |= syntax.IgnoreCase 201 | } 202 | if strings.Contains(textOpt, "s") { 203 | opts |= syntax.Singleline 204 | } 205 | if strings.Contains(textOpt, "m") { 206 | opts |= syntax.Multiline 207 | } 208 | if strings.Contains(textOpt, "x") { 209 | opts |= syntax.IgnorePatternWhitespace 210 | } 211 | } 212 | } 213 | 214 | } 215 | 216 | // trim off first and last char 217 | pattern = pattern[1 : len(pattern)-1] 218 | 219 | defer func() { 220 | if rec := recover(); rec != nil { 221 | problem(t, "PANIC in compiling \"%v\": %v", pattern, rec) 222 | } 223 | }() 224 | 225 | return generateAndCompile(t, pattern, opts) 226 | } 227 | 228 | func generateAndCompile(t *testing.T, pattern string, opts syntax.RegexOptions) string { 229 | genPattern, err := os.CreateTemp("", "*.go") 230 | if err != nil { 231 | panic("could not create tmp file: " + err.Error()) 232 | } 233 | c, err := newConverter(genPattern, "main") 234 | if err != nil { 235 | t.Error(errors.Wrap(err, "code generation error")) 236 | } 237 | if err := c.addRegexp("MyFile.go:120:10", "MyPattern", pattern, opts); err != nil { 238 | t.Error(errors.Wrap(err, "code generation error")) 239 | } 240 | if err := c.addFooter(); err != nil { 241 | t.Error(errors.Wrap(err, "code generation error")) 242 | } 243 | 244 | // compile our tmp file 245 | 246 | // get our output file name 247 | outFile, _ := os.CreateTemp("", "") 248 | 249 | // get go path 250 | goPath, _ := exec.LookPath("go") 251 | 252 | // customize the main file for this pattern 253 | mainFile, _ := os.CreateTemp("", "*.go") 254 | origMainFile, _ := filepath.Abs("_runtestmain.go") 255 | mainContent, _ := os.ReadFile(origMainFile) 256 | mainContent = bytes.Replace(mainContent, []byte("__PATTERN__"), []byte(fmt.Sprintf("%#v", pattern)), 1) 257 | mainContent = 
bytes.Replace(mainContent, []byte("__OPTIONS__"), []byte(fmt.Sprintf("%#v", opts)), 1) 258 | mainFile.Write(mainContent) 259 | 260 | // build! 261 | cmd := exec.Command(goPath, "build", "-o", outFile.Name(), genPattern.Name(), mainFile.Name()) 262 | if out, err := cmd.CombinedOutput(); err != nil { 263 | t.Log(string(out)) 264 | t.Errorf("build error for pattern %v", pattern) 265 | os.Remove(outFile.Name()) 266 | return "" 267 | } 268 | 269 | // our executable! 270 | return outFile.Name() 271 | } 272 | 273 | func matchString(t *testing.T, pattern string, reExec string, toMatch string) string { 274 | if len(reExec) == 0 { 275 | return "" 276 | } 277 | 278 | escp := "" 279 | var err error 280 | if toMatch != "\\" { 281 | escp = toMatch // unEscapeToMatch(toMatch) 282 | } 283 | //t.Logf("Testing: %v", escp) 284 | cmd := exec.Command(reExec, escp) 285 | out, err := cmd.CombinedOutput() 286 | if err != nil { 287 | problem(t, "Error matching \"%v\" in pattern \"%v\": %v", toMatch, pattern, err) 288 | } 289 | //t.Logf("Result: %v", string(out)) 290 | return string(out) 291 | } 292 | 293 | func containsEnder(line string, ender byte, allowFirst bool) bool { 294 | index := strings.LastIndexByte(line, ender) 295 | if index > 0 { 296 | return true 297 | } else if index == 0 && allowFirst { 298 | return true 299 | } 300 | return false 301 | } 302 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "go/ast" 7 | "go/parser" 8 | "go/token" 9 | "io" 10 | "io/fs" 11 | "log" 12 | "os" 13 | "path/filepath" 14 | "slices" 15 | "strconv" 16 | 17 | "github.com/pkg/errors" 18 | 19 | "github.com/dlclark/regexp2/syntax" 20 | ) 21 | 22 | // single regex options - if expr is set then use that, opt, and package 23 | var expr = flag.String("expr", "", "the regexp to convert to Go code to output file") 24 | var opt = flag.Int("opt", 0, "bitwise options to use when compiling the regexp") 25 | var pkg = flag.String("package", "regexp2codegen", "package to use when converting a single regexp") 26 | 27 | // if not single regex then scan the path and convert all regex's we find, optionally including test files 28 | var path = flag.String("path", ".", "file path to scan and generate regexp's for") 29 | var tests = flag.Bool("test", false, "true if go tests should be scanned as well") 30 | 31 | // universal options 32 | var out = flag.String("o", "", "output file to write generated regexp code into, if the file exists overwrites it. 
defaults to stdout") 33 | 34 | func main() { 35 | flag.Parse() 36 | 37 | if expr != nil && len(*expr) > 0 { 38 | options := syntax.RegexOptions(0) 39 | if opt != nil { 40 | options = syntax.RegexOptions(*opt) 41 | } 42 | convertSingle(*expr, options, *pkg) 43 | return 44 | } 45 | 46 | convPath, _ := os.Getwd() 47 | if path != nil && len(*path) > 0 { 48 | convPath, _ = filepath.Abs(*path) 49 | } 50 | 51 | convertPath(convPath, *tests) 52 | } 53 | 54 | func getOutStream() (io.Writer, string) { 55 | outPath := "" 56 | if out == nil || len(*out) == 0 { 57 | return os.Stdout, "" 58 | } 59 | 60 | outPath, _ = filepath.Abs(*out) 61 | file, err := os.Create(outPath) 62 | if err != nil { 63 | log.Fatalf("error creating out file: %v", err) 64 | } 65 | 66 | return file, outPath 67 | } 68 | 69 | func convertSingle(expr string, opts syntax.RegexOptions, pkg string) { 70 | stream, _ := getOutStream() 71 | if stream == nil { 72 | log.Fatalf("unable to open output") 73 | } 74 | c, err := newConverter(stream, pkg) 75 | if err != nil { 76 | log.Fatal(errors.Wrap(err, "code generation error")) 77 | } 78 | if err := c.addRegexp("command line", "MyPattern", expr, opts); err != nil { 79 | log.Fatal(errors.Wrap(err, "code generation error")) 80 | } 81 | if err := c.addFooter(); err != nil { 82 | log.Fatal(errors.Wrap(err, "code generation error")) 83 | } 84 | } 85 | 86 | func convertPath(path string, includeTest bool) { 87 | log.Printf("Create regexp for path %s, include tests=%v", path, includeTest) 88 | //Create a FileSet to work with 89 | fset := token.NewFileSet() 90 | 91 | pkgs, err := parser.ParseDir(fset, path, func(fi fs.FileInfo) bool { 92 | // if we disallow tests and match the test pattern then skip 93 | if !includeTest { 94 | if ok, _ := filepath.Match("*_test.go", fi.Name()); ok { 95 | return false 96 | } 97 | } 98 | return true 99 | }, parser.ParseComments|parser.SkipObjectResolution) 100 | 101 | if err != nil { 102 | log.Fatalf("unable to parse go file: %v", err) 103 | } 104 | 105 | var alias string 106 | var c *converter 107 | var stream io.Writer 108 | var outFile string 109 | for p, pkg := range pkgs { 110 | for f, file := range pkg.Files { 111 | if !importsRegexp2(file, &alias) { 112 | // we don't import the regexp engine in this file 113 | // so we won't find anything here 114 | continue 115 | } 116 | if alias == "" { 117 | alias = "regexp2" 118 | } 119 | log.Printf("file %v imports regexp2", f) 120 | ast.Inspect(file, func(n ast.Node) bool { 121 | // Find asignment statements 122 | // a := regexp2.MustCompile("pattern", 0) 123 | // var a = regexp2.... 
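// Both forms are handled below: *ast.ValueSpec covers `var a = ...` declarations
// and *ast.AssignStmt covers `a := ...` / `a = ...` assignments; in both cases
// isStaticCompileCall checks that the call is regexp2.MustCompile/Compile with a
// statically known pattern and options.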
124 | if varDec, ok := n.(*ast.ValueSpec); ok { 125 | // var dec 126 | for i, val := range varDec.Values { 127 | ok, pat, opt, pos := isStaticCompileCall(val, alias) 128 | if ok { 129 | fset.Position(pos).String() 130 | log.Printf("%s: adding pattern %#v options %v", fset.Position(pos), pat, opt) 131 | // first find inits a converter 132 | if c == nil { 133 | stream, outFile = getOutStream() 134 | if stream == nil { 135 | log.Fatalf("unable to open output") 136 | } 137 | c, err = newConverter(stream, p) 138 | if err != nil { 139 | log.Fatal(errors.Wrap(err, "code generation error")) 140 | } 141 | } 142 | 143 | if err := c.addRegexp(getLocation(fset, pos, outFile), getName(varDec.Names[i]), pat, syntax.RegexOptions(opt)); err != nil { 144 | log.Fatal(errors.Wrap(err, "code generation error")) 145 | } 146 | } 147 | } 148 | } else if assign, ok := n.(*ast.AssignStmt); ok { 149 | for i, exp := range assign.Rhs { 150 | ok, pat, opt, pos := isStaticCompileCall(exp, alias) 151 | if ok { 152 | log.Printf("%s: adding pattern %#v options %v", fset.Position(pos), pat, opt) 153 | // first find inits a converter 154 | if c == nil { 155 | stream, outFile = getOutStream() 156 | if stream == nil { 157 | log.Fatalf("unable to open output") 158 | } 159 | c, err = newConverter(stream, p) 160 | if err != nil { 161 | log.Fatal(errors.Wrap(err, "code generation error")) 162 | } 163 | } 164 | 165 | if err := c.addRegexp(getLocation(fset, pos, outFile), getName(assign.Lhs[i]), pat, syntax.RegexOptions(opt)); err != nil { 166 | log.Fatal(errors.Wrap(err, "code generation error")) 167 | } 168 | } 169 | } 170 | } 171 | 172 | return true 173 | }) 174 | } 175 | } 176 | if c != nil { 177 | if err := c.addFooter(); err != nil { 178 | log.Fatal(errors.Wrap(err, "code generation error")) 179 | } 180 | } 181 | } 182 | 183 | // returns a location in the fileset relative to the output path given 184 | // or pwd if output path is blank 185 | func getLocation(fset *token.FileSet, pos token.Pos, outPath string) string { 186 | fullPos := fset.Position(pos) 187 | 188 | //make filename relative to our pwd 189 | if outPath == "" { 190 | outPath, _ = os.Getwd() 191 | } else { 192 | outPath = filepath.Dir(outPath) 193 | } 194 | 195 | file, _ := filepath.Rel(outPath, fullPos.Filename) 196 | return fmt.Sprint(file, ":", fullPos.Line, ":", fullPos.Column) 197 | } 198 | 199 | func getName(lhs ast.Node) string { 200 | // should be an ident 201 | if ident, ok := lhs.(*ast.Ident); ok { 202 | return ident.Name 203 | } 204 | return "" 205 | } 206 | 207 | func isStaticCompileCall(n ast.Node, importAlias string) (ok bool, pattern string, opts int, patternPos token.Pos) { 208 | funcCall, ok := n.(*ast.CallExpr) 209 | if !ok { 210 | return false, "", 0, 0 211 | } 212 | 213 | if len(funcCall.Args) < 1 || len(funcCall.Args) > 2 { 214 | return false, "", 0, 0 215 | } 216 | 217 | if match, _ := isSelector(funcCall.Fun, importAlias, "MustCompile", "Compile"); match { 218 | pattern, ok = extractPattern(funcCall.Args[0]) 219 | if !ok { 220 | return false, "", 0, 0 221 | } 222 | 223 | opts = 0 // Default options 224 | if len(funcCall.Args) == 2 { 225 | tmpOpts, ok := getOpts(funcCall.Args[1], importAlias) 226 | if ok { 227 | opts = tmpOpts 228 | } 229 | } 230 | 231 | return true, pattern, opts, funcCall.Args[0].Pos() 232 | } 233 | 234 | return false, "", 0, 0 235 | } 236 | 237 | func extractPattern(arg ast.Expr) (pattern string, ok bool) { 238 | switch v := arg.(type) { 239 | case *ast.BasicLit: // Direct string literal 240 | if v.Kind == token.STRING { 
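// v.Value still includes the surrounding quotes; strconv.Unquote accepts both
// interpreted ("...") and raw (`...`) string literals.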
241 | pattern, _ = strconv.Unquote(v.Value) // Extract string 242 | return pattern, true 243 | } 244 | case *ast.BinaryExpr: // Concatenated strings 245 | // it does work here but the code for loading the pre-compiled regexp doesn't work with it, left it for future 246 | left, ok1 := extractPattern(v.X) 247 | right, ok2 := extractPattern(v.Y) 248 | if ok1 && ok2 { 249 | return left + right, true 250 | } 251 | case *ast.Ident: // Constant or variable 252 | // Example: const myPattern = "pattern" 253 | if v.Obj != nil && v.Obj.Kind == ast.Con { 254 | if valueSpec, ok := v.Obj.Decl.(*ast.ValueSpec); ok { 255 | if len(valueSpec.Values) > 0 { 256 | return extractPattern(valueSpec.Values[0]) 257 | } 258 | } 259 | } 260 | } 261 | return "", false // Unsupported type 262 | } 263 | 264 | func getOpts(node ast.Node, importAlias string) (int, bool) { 265 | if op, ok := node.(*ast.BasicLit); ok { 266 | // string version of an int, convert to int 267 | opts, err := strconv.Atoi(op.Value) 268 | if err != nil { 269 | log.Printf("unknown constant for options: %s", op.Value) 270 | return 0, false 271 | } 272 | return opts, true 273 | } else if ok, name := isSelector(node, importAlias, optNames...); ok { 274 | //selector, convert to int 275 | return convertOptsNameToInt(name) 276 | } else if bin, ok := node.(*ast.BinaryExpr); ok { 277 | // binary expression, gonna need to split and then do the operator 278 | // converting to a constant 279 | x, ok := getOpts(bin.X, importAlias) 280 | if !ok { 281 | return 0, false 282 | } 283 | y, ok := getOpts(bin.Y, importAlias) 284 | if !ok { 285 | return 0, false 286 | } 287 | switch bin.Op { 288 | case token.ADD: 289 | return x + y, true 290 | case token.AND: 291 | return x & y, true 292 | case token.OR: 293 | return x | y, true 294 | case token.XOR: 295 | return x ^ y, true 296 | case token.SHL: 297 | return x << y, true 298 | case token.SHR: 299 | return x >> y, true 300 | case token.AND_NOT: 301 | return x &^ y, true 302 | default: 303 | log.Printf("unknown operator for options: %s", bin.Op.String()) 304 | } 305 | } else { 306 | log.Printf("unknown ast node type for options: %T %+[1]v", node) 307 | } 308 | 309 | return 0, false 310 | } 311 | 312 | func convertOptsNameToInt(name string) (int, bool) { 313 | idx := slices.Index(optNames, name) 314 | if idx == -1 { 315 | log.Printf("unknown pattern option: %s", name) 316 | return 0, false 317 | } 318 | 319 | return 1 << idx, true 320 | } 321 | 322 | func isSelector(node ast.Node, pkg string, name ...string) (bool, string) { 323 | if sel, ok := node.(*ast.SelectorExpr); ok { 324 | if nm, ok := sel.X.(*ast.Ident); ok && nm.Name == pkg { 325 | if slices.Contains(name, sel.Sel.Name) { 326 | return true, sel.Sel.Name 327 | } 328 | } 329 | } 330 | return false, "" 331 | } 332 | 333 | func importsRegexp2(file *ast.File, alias *string) bool { 334 | *alias = "" 335 | for _, i := range file.Imports { 336 | if i.Path.Value == "\"github.com/dlclark/regexp2\"" { 337 | if i.Name != nil { 338 | *alias = i.Name.Name 339 | } 340 | 341 | return true 342 | } 343 | } 344 | return false 345 | } 346 | -------------------------------------------------------------------------------- /convert.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "crypto/sha256" 6 | "go/format" 7 | "strconv" 8 | 9 | "fmt" 10 | "io" 11 | "reflect" 12 | "slices" 13 | "strings" 14 | "unicode" 15 | 16 | "github.com/dlclark/regexp2/syntax" 17 | "github.com/pkg/errors" 18 | ) 19 | 20 | 
type converter struct { 21 | // buffer for our output 22 | buf *bytes.Buffer 23 | // writer from the consumer 24 | out io.Writer 25 | 26 | data []*regexpData 27 | // global helpers across the package 28 | requiredHelpers map[string]string 29 | 30 | convertedNames map[string]int 31 | 32 | err error 33 | } 34 | 35 | func newConverter(out io.Writer, packageName string) (*converter, error) { 36 | c := &converter{ 37 | buf: &bytes.Buffer{}, 38 | out: out, 39 | requiredHelpers: make(map[string]string), 40 | convertedNames: make(map[string]int), 41 | } 42 | if err := c.addHeader(packageName); err != nil { 43 | return nil, err 44 | } 45 | 46 | return c, nil 47 | } 48 | 49 | func (c *converter) addHeader(packageName string) error { 50 | // TODO: this 51 | // add package and imports 52 | /* 53 | package regexp2codegen 54 | 55 | import ( 56 | "github.com/dlclark/regexp2" 57 | ) 58 | */ 59 | c.writeLineFmt("package %s", packageName) 60 | c.writeLine("import (") 61 | c.writeLine(" \"github.com/dlclark/regexp2\"") 62 | c.writeLine(" \"github.com/dlclark/regexp2/helpers\"") 63 | c.writeLine(" \"github.com/dlclark/regexp2/syntax\"") 64 | c.writeLine(" \"unicode\"") 65 | //c.writeLine(" \"fmt\"") 66 | c.writeLine(")") 67 | 68 | return c.err 69 | } 70 | 71 | func (c *converter) addFooter() error { 72 | /* 73 | func init() { 74 | regexp2.RegisterEngine("ABCD+", regexp2.ECMAScript, &MyPattern_Engine{}) 75 | } 76 | */ 77 | 78 | // emit helpers 79 | for _, val := range c.requiredHelpers { 80 | c.writeLine(val) 81 | } 82 | 83 | // emit init func 84 | c.writeLine("func init() {") 85 | for _, rm := range c.data { 86 | c.writeLineFmt("regexp2.RegisterEngine(%v, %v, &%s_Engine{})", getGoLiteral(rm.Pattern), getOptString(rm.Options), rm.GeneratedName) 87 | } 88 | // emit basic usage of imports so we don't have to deal with import re-writing 89 | c.writeLine("var _ = helpers.Min") 90 | c.writeLine("var _ = syntax.NewCharSetRuntime") 91 | c.writeLine("var _ = unicode.IsDigit") 92 | c.writeLine("}") 93 | 94 | //format the code 95 | origCode := c.buf.Bytes() 96 | fmtOut, err := format.Source(origCode) 97 | 98 | if err != nil { 99 | c.out.Write(origCode) 100 | return err 101 | } 102 | c.out.Write(fmtOut) 103 | 104 | return c.err 105 | } 106 | 107 | type regexpData struct { 108 | SourceLocation string 109 | GeneratedName string 110 | Pattern string 111 | Options syntax.RegexOptions 112 | Tree *syntax.RegexTree 113 | Analysis *analysisResults 114 | 115 | // parsing state 116 | findEndsInAlwaysReturningTrue bool 117 | noMatchFoundLabelNeeded bool 118 | 119 | // In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later. 120 | // To handle that, we build up a collection of all the declarations to include and switch the underlying writer to 121 | // another writer so we can merge at the end 122 | additionalDeclarations []string 123 | 124 | // state during emitExecute 125 | usedNames map[string]int 126 | sliceSpan string 127 | sliceStaticPos int 128 | topLevelDoneLabel string 129 | expressionHasCaptures bool 130 | doneLabel string 131 | 132 | // track our labels since Go doesn't like unused labels, we need to find them and 133 | // remove them as a post-process step 134 | emittedLabels []string 135 | usedLabels []string 136 | //TODO: timeout? 137 | //TODO: string vs rune vs byte? 
138 | } 139 | 140 | func (rm *regexpData) unusedLabels() []string { 141 | var retval []string 142 | 143 | for _, s := range rm.emittedLabels { 144 | if !slices.Contains(rm.usedLabels, s) { 145 | retval = append(retval, s) 146 | } 147 | } 148 | 149 | return retval 150 | } 151 | func (rm *regexpData) addLocalDec(dec string) { 152 | // prevent dupes 153 | if slices.Contains(rm.additionalDeclarations, dec) { 154 | return 155 | } 156 | rm.additionalDeclarations = append(rm.additionalDeclarations, dec) 157 | } 158 | 159 | func (c *converter) addRegexp(sourceLocation, name string, txt string, opt syntax.RegexOptions) error { 160 | // check if already converted 161 | for _, data := range c.data { 162 | // match! we're done here 163 | if data.Pattern == txt && data.Options == opt { 164 | return nil 165 | } 166 | } 167 | 168 | // parse pattern 169 | tree, err := syntax.Parse(txt, opt|syntax.Compiled) 170 | if err != nil { 171 | return errors.Wrap(err, "error parsing regexp") 172 | } 173 | if err := supportsCodeGen(tree); err != nil { 174 | return errors.Wrap(err, "code generation not supported") 175 | } 176 | 177 | // generate unique class name 178 | newName := name 179 | for { 180 | if _, ok := c.convertedNames[newName]; ok { 181 | // name already exists, increment the number on the base name and try again 182 | c.convertedNames[name]++ 183 | val := c.convertedNames[name] 184 | newName = fmt.Sprint(name, "_", val) 185 | } else { 186 | break 187 | } 188 | } 189 | c.convertedNames[newName] = 1 190 | 191 | oldOut := c.buf 192 | c.buf = &bytes.Buffer{} 193 | 194 | c.writeLineFmt("/*\n%s*/", tree.Dump()) 195 | 196 | rm := ®expData{ 197 | SourceLocation: sourceLocation, 198 | GeneratedName: newName, 199 | Pattern: txt, 200 | Options: opt, 201 | Tree: tree, 202 | Analysis: analyze(tree), 203 | } 204 | c.data = append(c.data, rm) 205 | 206 | c.emitRegexStart(rm) 207 | 208 | // we need to emit 2 functions: FindFirstChar() and Execute() 209 | // the C# version has a "scan" function above these that I've omitted here 210 | c.emitFindFirstChar(rm) 211 | c.emitExecute(rm) 212 | 213 | // get our string for final manipulation 214 | output := c.buf.String() 215 | c.buf = oldOut 216 | 217 | // finalize our code 218 | removeUnusedLabels(&output, rm) 219 | 220 | // write our temp out buffer into our saved buffer 221 | c.buf.Write([]byte(output)) 222 | 223 | return c.err 224 | } 225 | 226 | func removeUnusedLabels(output *string, rm *regexpData) { 227 | unusedLabels := rm.unusedLabels() 228 | 229 | // find and remove the unused labels in the output 230 | for _, label := range unusedLabels { 231 | // the label is on its own line with a colon at the end 232 | *output = strings.ReplaceAll(*output, "\n"+label+":\n", "\n") 233 | // or the label could be on its own line with a semicolon at the end 234 | *output = strings.ReplaceAll(*output, "\n"+label+": ;\n", "\n") 235 | } 236 | } 237 | 238 | func (c *converter) emitRegexStart(rm *regexpData) { 239 | 240 | /* 241 | // From ABC.go:120:10 242 | // Pattern: [ABCD]+ 243 | // Options: regexp2.ECMAScript 244 | type MyPattern0_Engine struct{} 245 | 246 | func (MyPattern0_Engine) Caps() map[int]int { return map[int]int{} } 247 | func (MyPattern0_Engine) CapNames() map[string]int { return map[string]int{} } 248 | func (MyPattern0_Engine) CapsList() []string { return []string{} } 249 | func (MyPattern0_Engine) CapSize() int { return 1 } 250 | */ 251 | caps, capsize := getCaps(rm.Tree) 252 | rm.Tree.Caps = caps 253 | rm.Tree.Captop = capsize 254 | 255 | c.writeLineFmt("// From %s", 
rm.SourceLocation) 256 | c.writeLineFmt("// Pattern: %#v", rm.Pattern) 257 | c.writeLineFmt("// Options: %v", getOptString(rm.Options)) 258 | c.writeLineFmt("type %s_Engine struct{}", rm.GeneratedName) 259 | c.writeLineFmt("func (%s_Engine) Caps() map[int]int { return %s }", rm.GeneratedName, getGoLiteral(caps)) 260 | c.writeLineFmt("func (%s_Engine) CapNames() map[string]int { return %s }", rm.GeneratedName, getGoLiteral(rm.Tree.Capnames)) 261 | c.writeLineFmt("func (%s_Engine) CapsList() []string { return %s }", rm.GeneratedName, getGoLiteral(rm.Tree.Caplist)) 262 | c.writeLineFmt("func (%s_Engine) CapSize() int { return %v }", rm.GeneratedName, capsize) 263 | c.writeLine("") 264 | } 265 | 266 | var optNames = []string{ 267 | "IgnoreCase", 268 | "Multiline", 269 | "ExplicitCapture", 270 | "Compiled", 271 | "Singleline", 272 | "IgnorePatternWhitespace", 273 | "RightToLeft", 274 | "Debug", 275 | "ECMAScript", 276 | "RE2", 277 | "Unicode", 278 | } 279 | 280 | func getOptString(opts syntax.RegexOptions) string { 281 | if opts == 0 { 282 | return "regexp2.None" 283 | } 284 | 285 | stringOpts := []string{} 286 | remain := int(opts) 287 | for i, v := range optNames { 288 | //bit := i + 1 289 | mask := 1 << i 290 | // check if this bit is enabled in opts 291 | if remain&mask != 0 { 292 | remain &= ^mask 293 | stringOpts = append(stringOpts, "regexp2."+v) 294 | } 295 | // once we're out of options, stop looping 296 | if remain == 0 { 297 | break 298 | } 299 | } 300 | if remain > 0 { 301 | stringOpts = append(stringOpts, strconv.Itoa(remain)) 302 | } 303 | return strings.Join(stringOpts, "|") 304 | } 305 | 306 | func isNilish(val any) bool { 307 | if val == nil { 308 | return true 309 | } 310 | 311 | v := reflect.ValueOf(val) 312 | k := v.Kind() 313 | switch k { 314 | case reflect.Chan, reflect.Func, reflect.Map, reflect.Pointer, 315 | reflect.UnsafePointer, reflect.Interface, reflect.Slice: 316 | return v.IsNil() 317 | } 318 | 319 | return false 320 | } 321 | 322 | func getGoLiteral(in any) string { 323 | if isNilish(in) { 324 | return "nil" 325 | } 326 | switch in.(type) { 327 | case rune: 328 | return fmt.Sprintf("%q", in) 329 | } 330 | return fmt.Sprintf("%#v", in) 331 | } 332 | 333 | func getCaps(tree *syntax.RegexTree) (caps map[int]int, capSize int) { 334 | if tree.Capnumlist == nil || tree.Captop == len(tree.Capnumlist) { 335 | return nil, tree.Captop 336 | } 337 | 338 | capSize = len(tree.Capnumlist) 339 | caps = tree.Caps 340 | for i := 0; i < len(tree.Capnumlist); i++ { 341 | caps[tree.Capnumlist[i]] = i 342 | } 343 | 344 | return caps, capSize 345 | } 346 | 347 | func getRuneSliceLiteral[T []rune | string](in T) string { 348 | return fmt.Sprintf("[]rune(%#v)", string(in)) 349 | } 350 | 351 | func getRuneLiteralParams(in []rune) string { 352 | if len(in) == 0 { 353 | return "" 354 | } 355 | 356 | buf := &bytes.Buffer{} 357 | sep := "'" 358 | for _, ch := range in { 359 | buf.WriteString(sep) 360 | buf.WriteRune(ch) 361 | sep = "', '" 362 | } 363 | buf.WriteRune('\'') 364 | return buf.String() 365 | } 366 | 367 | // Determines whether its ok to embed the string in the field name. 
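// Letters (Unicode letters included), digits, and underscores are accepted; any other rune disqualifies the string.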
368 | func isValidInFieldName(str string) bool { 369 | for _, c := range str { 370 | if unicode.IsLetter(c) || c == '_' || unicode.IsDigit(c) { 371 | continue 372 | } 373 | return false 374 | } 375 | return true 376 | } 377 | 378 | func getSHA256FieldName(toEncode string) string { 379 | sha := sha256.New() 380 | sha.Write([]byte(toEncode)) 381 | bs := sha.Sum(nil) 382 | return fmt.Sprintf("%x", bs) 383 | } 384 | 385 | func shouldUseSearchValues(chars []rune) bool { 386 | //TODO: perf optimizations will be different for Go 387 | // these are from C# 388 | // IndexOfAny(SearchValues) is faster than a regular IndexOfAny("abcd") if: 389 | // - There are more than 5 characters in the needle, or 390 | // - There are only 4 or 5 characters in the needle and they're all ASCII. 391 | if len(chars) > 5 { 392 | return true 393 | } 394 | 395 | if len(chars) < 4 { 396 | return false 397 | } 398 | 399 | return isAscii(chars) 400 | } 401 | 402 | func isAscii(chars []rune) bool { 403 | for _, c := range chars { 404 | if c > unicode.MaxASCII { 405 | return false 406 | } 407 | } 408 | return true 409 | } 410 | 411 | func (c *converter) emitIndexOfChars(chars []rune, negate bool, spanName string) string { 412 | // We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload. 413 | // 1, 2, 3 have dedicated optimized IndexOfAny overloads 414 | // 4, 5 have dedicated optimized IndexOfAny overloads accessible via the ReadOnlySpan overload, 415 | // but can also be handled via SearchValues 416 | // > 5 can only be handled efficiently via SearchValues 417 | var indexOfAnyName = "IndexOfAny" 418 | if negate { 419 | indexOfAnyName = "IndexOfAnyExcept" 420 | } 421 | 422 | switch len(chars) { 423 | case 1: 424 | return fmt.Sprintf("helpers.%s1(%s, %q)", indexOfAnyName, spanName, chars[0]) 425 | case 2: 426 | return fmt.Sprintf("helpers.%s2(%s, %q, %q)", indexOfAnyName, spanName, chars[0], chars[1]) 427 | case 3: 428 | return fmt.Sprintf("helpers.%s3(%s, %q, %q, %q)", indexOfAnyName, spanName, chars[0], chars[1], chars[2]) 429 | case 4, 5: 430 | if shouldUseSearchValues(chars) { 431 | return fmt.Sprintf("%s.%s(%s)", c.emitSearchValues(chars, ""), indexOfAnyName, spanName) 432 | } else { 433 | return fmt.Sprintf("helpers.%s(%s, %s)", indexOfAnyName, spanName, getRuneSliceLiteral(chars)) 434 | } 435 | } 436 | return fmt.Sprintf("%s.%s(%s)", c.emitSearchValues(chars, ""), indexOfAnyName, spanName) 437 | } 438 | 439 | var emitSearchValueConstNames = map[string]string{ 440 | "FFFFFFFF000000000000000000000080": "svAsciiControl", 441 | "000000000000FF030000000000000000": "svAsciiDigits", 442 | "0000000000000000FEFFFF07FEFFFF07": "svAsciiLetters", 443 | "000000000000FF03FEFFFF07FEFFFF07": "svAsciiLettersAndDigits", 444 | "000000000000FF037E0000007E000000": "svAsciiHexDigits", 445 | "000000000000FF03000000007E000000": "svAsciiHexDigitsLower", 446 | "000000000000FF037E00000000000000": "svAsciiHexDigitsUpper", 447 | "00000000EEF7008C010000B800000028": "svAsciiPunctuation", 448 | "00000000010000000000000000000000": "svAsciiSeparators", 449 | "00000000100800700000004001000050": "svAsciiSymbols", 450 | "003E0000010000000000000000000000": "svAsciiWhiteSpace", 451 | "000000000000FF03FEFFFF87FEFFFF07": "svAsciiWordChars", 452 | 453 | "00000000FFFFFFFFFFFFFFFFFFFFFF7F": "svAsciiExceptControl", 454 | "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF": "svAsciiExceptDigits", 455 | "FFFFFFFFFFFFFFFF010000F8010000F8": "svAsciiExceptLetters", 456 | "FFFFFFFFFFFF00FC010000F8010000F8": "svAsciiExceptLettersAndDigits", 
457 | "FFFFFFFFFFFFFFFFFFFFFFFF010000F8": "svAsciiExceptLower", 458 | "FFFFFFFF1108FF73FEFFFF47FFFFFFD7": "svAsciiExceptPunctuation", 459 | "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF": "svAsciiExceptSeparators", 460 | "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF": "svAsciiExceptSymbols", 461 | "FFFFFFFFFFFFFFFF010000F8FFFFFFFF": "svAsciiExceptUpper", 462 | "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF": "svAsciiExceptWhiteSpace", 463 | "FFFFFFFFFFFF00FC01000078010000F8": "svAsciiExceptWordChars", 464 | } 465 | 466 | func (c *converter) emitSearchValues(chars []rune, fieldName string) string { 467 | slices.Sort(chars) 468 | asciiOnly := isAscii(chars) 469 | if len(fieldName) == 0 { 470 | if asciiOnly { 471 | // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key. 472 | bitmap := make([]byte, 16) 473 | for _, c := range chars { 474 | bitmap[c>>3] |= (byte)(1 << (c & 7)) 475 | } 476 | hexBitmap := fmt.Sprintf("%x", bitmap) 477 | var ok bool 478 | fieldName, ok = emitSearchValueConstNames[hexBitmap] 479 | if !ok { 480 | fieldName = "svAscii" + strings.TrimLeft(hexBitmap, "0") 481 | } 482 | } else { 483 | fieldName = "sNonAscii" + getSHA256FieldName(string(chars)) 484 | } 485 | } 486 | 487 | if _, ok := c.requiredHelpers[fieldName]; !ok { 488 | if asciiOnly { 489 | c.requiredHelpers[fieldName] = fmt.Sprintf(`// Supports searching for the chars in or not in %#v 490 | var %v = helpers.NewAsciiSearchValues(%#v)`, 491 | string(chars), fieldName, string(chars)) 492 | } else { 493 | c.requiredHelpers[fieldName] = fmt.Sprintf(`// Supports searching for the chars in or not in %#v 494 | var %v = helpers.NewRuneSearchValues(%#v)`, 495 | string(chars), fieldName, string(chars)) 496 | } 497 | } 498 | 499 | return fieldName 500 | } 501 | 502 | func (c *converter) emitGoto(label string) { 503 | c.writeLineFmt("goto %s", label) 504 | } 505 | 506 | func (c *converter) emitLabel(label string) { 507 | c.writeLineFmt("%s:", label) 508 | } 509 | 510 | func (c *converter) write(data string) { 511 | _, err := fmt.Fprint(c.buf, data) 512 | if err != nil { 513 | c.err = err 514 | } 515 | } 516 | func (c *converter) writeLine(line string) { 517 | _, err := fmt.Fprintln(c.buf, line) 518 | if err != nil { 519 | c.err = err 520 | } 521 | } 522 | 523 | func (c *converter) writeLineFmt(format string, args ...any) { 524 | _, err := fmt.Fprintf(c.buf, format, args...) 
525 | if err != nil { 526 | c.err = err 527 | } 528 | _, err = c.buf.Write([]byte{'\n'}) 529 | if err != nil { 530 | c.err = err 531 | } 532 | } 533 | 534 | func supportsCodeGen(tree *syntax.RegexTree) error { 535 | //TODO: filter out invalid trees 536 | //https://github.com/dotnet/runtime/blob/main/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs#L296 537 | return nil 538 | } 539 | 540 | // helper to make ident names unique, add nums for dupes 541 | func (rm *regexpData) reserveName(prefix string) string { 542 | num := rm.usedNames[prefix] 543 | rm.usedNames[prefix] = num + 1 544 | if num == 0 { 545 | return prefix 546 | } 547 | return fmt.Sprint(prefix, num) 548 | } 549 | -------------------------------------------------------------------------------- /convert_findfirstchar.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "math" 7 | "unicode" 8 | 9 | "github.com/dlclark/regexp2/syntax" 10 | ) 11 | 12 | func (c *converter) emitFindFirstChar(rm *regexpData) { 13 | c.writeLineFmt("func (%s_Engine) FindFirstChar(r *regexp2.Runner) bool {", rm.GeneratedName) 14 | //c.writeLine(`fmt.Println("FindFirstChar")`) 15 | defer func() { 16 | c.writeLine("}\n") 17 | }() 18 | 19 | rtl := rm.Options&syntax.RightToLeft != 0 20 | root := rm.Tree.Root.Children[0] 21 | 22 | if root.T == syntax.NtEmpty { 23 | // we always match the current char since we match the empty string 24 | c.writeLine("return true") 25 | return 26 | } 27 | if root.T == syntax.NtNothing { 28 | // this never matches anything 29 | c.writeLine("return false") 30 | return 31 | } 32 | 33 | needPosVar := true 34 | oldOut := c.buf 35 | buf := &bytes.Buffer{} 36 | c.buf = buf 37 | defer func() { 38 | // lets clean this up at the end 39 | c.buf = oldOut 40 | 41 | if needPosVar { 42 | c.writeLine("pos := r.Runtextpos") 43 | } 44 | 45 | // write additionalDeclarations 46 | for _, l := range rm.additionalDeclarations { 47 | c.writeLine(l) 48 | } 49 | 50 | //reset 51 | rm.additionalDeclarations = []string{} 52 | 53 | // then write our temp out buffer into our saved buffer 54 | c.buf.Write(buf.Bytes()) 55 | }() 56 | 57 | // Generate length check. If the input isn't long enough to possibly match, fail quickly. 58 | // It's rare for min required length to be 0, so we don't bother special-casing the check, 59 | // especially since we want the "return false" code regardless. 60 | minRequiredLength := rm.Tree.FindOptimizations.MinRequiredLength 61 | endBlock := "" 62 | if minRequiredLength > 0 { 63 | if minRequiredLength == 1 { 64 | c.writeLine("// Empty matches aren't possible") 65 | if !rtl { 66 | c.writeLine("if pos < len(r.Runtext) {") 67 | } else { 68 | c.writeLine("if pos > 1 {") 69 | } 70 | } else { 71 | c.writeLineFmt("// Any possible match is at least %v characters", minRequiredLength) 72 | if !rtl { 73 | c.writeLineFmt("if pos <= len(r.Runtext) - %v {", minRequiredLength) 74 | } else { 75 | c.writeLineFmt("if pos >= %v {", minRequiredLength) 76 | } 77 | } 78 | endBlock = "}" 79 | } 80 | 81 | const NoMatchFound = "NoMatchFound" 82 | 83 | if !c.emitAnchors(rm) { 84 | // Either anchors weren't specified, or they don't completely root all matches to a specific location. 85 | 86 | // Emit the code for whatever find mode has been determined. 
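// As a rough illustration of what lands in buf here (hedged; the exact literal formatting
// comes from getRuneSliceLiteral), the string/set emitters below write a vectorized search
// along the lines of
//
//	if i := helpers.IndexOf(r.Runtext[pos:], []rune("foo")); i >= 0 {
//		r.Runtextpos = pos + i
//		return true
//	}
//
// while the default case writes only "return true", defers all the work to
// TryMatchAtCurrentPosition, and, if nothing else was emitted, skips the pos declaration
// via needPosVar.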
87 | switch rm.Tree.FindOptimizations.FindMode { 88 | case syntax.LeadingString_LeftToRight, syntax.LeadingString_OrdinalIgnoreCase_LeftToRight, syntax.FixedDistanceString_LeftToRight: 89 | c.emitIndexOfString_LeftToRight(rm) 90 | case syntax.LeadingString_RightToLeft: 91 | c.emitIndexOfString_RightToLeft(rm) 92 | case syntax.LeadingStrings_LeftToRight, syntax.LeadingStrings_OrdinalIgnoreCase_LeftToRight: 93 | c.emitIndexOfStrings_LeftToRight(rm) 94 | case syntax.LeadingSet_LeftToRight, syntax.FixedDistanceSets_LeftToRight: 95 | c.emitFixedSet_LeftToRight(rm) 96 | case syntax.LeadingSet_RightToLeft: 97 | c.emitFixedSet_RightToLeft(rm) 98 | case syntax.LiteralAfterLoop_LeftToRight: 99 | c.emitLiteralAfterAtomicLoop(rm) 100 | default: 101 | //there's a special case here where we haven't written anything 102 | // and we don't want to declare the "pos" var 103 | needPosVar = buf.Len() > 0 104 | c.writeLine("return true") 105 | rm.findEndsInAlwaysReturningTrue = true 106 | } 107 | } 108 | 109 | if endBlock != "" { 110 | c.writeLine(endBlock) 111 | } 112 | 113 | // If the main path is guaranteed to end in a "return true;" and nothing is going to 114 | // jump past it, we don't need a "return false;" path. 115 | if minRequiredLength > 0 || !rm.findEndsInAlwaysReturningTrue || rm.noMatchFoundLabelNeeded { 116 | c.writeLine("\n// No match found") 117 | if rm.noMatchFoundLabelNeeded { 118 | c.emitLabel(NoMatchFound) 119 | } 120 | var setPos string 121 | if !rtl { 122 | setPos = "len(r.Runtext)" 123 | } else { 124 | setPos = "0" 125 | } 126 | c.writeLineFmt("r.Runtextpos = %v", setPos) 127 | c.writeLine("return false") 128 | } 129 | } 130 | 131 | func (c *converter) emitAnchors(rm *regexpData) bool { 132 | regexTree := rm.Tree 133 | // Anchors that fully implement TryFindNextPossibleStartingPosition, with a check that leads to immediate success or failure determination. 134 | switch regexTree.FindOptimizations.FindMode { 135 | case syntax.LeadingAnchor_LeftToRight_Beginning: 136 | c.writeLine("// The pattern leads with a beginning (\\A) anchor.") 137 | // If we're at the beginning, we're at a possible match location. Otherwise, 138 | // we'll never be, so fail immediately. 139 | c.writeLine(`if pos == 0 { 140 | return true 141 | }`) 142 | return true 143 | 144 | case syntax.LeadingAnchor_LeftToRight_Start: 145 | case syntax.LeadingAnchor_RightToLeft_Start: 146 | c.write("// The pattern leads with a start (\\G) anchor") 147 | if regexTree.FindOptimizations.FindMode == syntax.LeadingAnchor_RightToLeft_Start { 148 | c.write(" when processed right to left") 149 | } 150 | 151 | // For both left-to-right and right-to-left, if we're currently at the start, 152 | // we're at a possible match location. Otherwise, because we've already moved 153 | // beyond it, we'll never be, so fail immediately. 154 | c.writeLine(` 155 | if (pos == r.Runtextstart) { 156 | return true 157 | } 158 | `) 159 | return true 160 | 161 | case syntax.LeadingAnchor_LeftToRight_EndZ: 162 | // If we're not currently at the end (or a newline just before it), skip ahead 163 | // since nothing until then can possibly match. 164 | c.writeLine(`// The pattern leads with an end (\Z) anchor. 
165 | if pos < len(r.Runtext) - 1 { 166 | r.Runtextpos = len(r.Runtext) - 1 167 | } 168 | return true 169 | `) 170 | rm.findEndsInAlwaysReturningTrue = true 171 | return true 172 | 173 | case syntax.LeadingAnchor_LeftToRight_End: 174 | // If we're not currently at the end (or a newline just before it), skip ahead 175 | // since nothing until then can possibly match. 176 | c.writeLine(`// The pattern leads with an end (\z) anchor. 177 | if pos < len(r.Runtext) { 178 | r.Runtextpos = len(r.Runtext) 179 | } 180 | return true 181 | `) 182 | rm.findEndsInAlwaysReturningTrue = true 183 | return true 184 | 185 | case syntax.LeadingAnchor_RightToLeft_Beginning: 186 | c.writeLine(`// The pattern leads with a beginning (\A) anchor when processed right to left. 187 | if pos != 0 { 188 | r.Runtextpos = 0 189 | } 190 | return true 191 | `) 192 | rm.findEndsInAlwaysReturningTrue = true 193 | return true 194 | 195 | case syntax.LeadingAnchor_RightToLeft_EndZ: 196 | // If we're currently at the end, we're at a valid position to try. Otherwise, 197 | // we'll never be (we're iterating from end to beginning), so fail immediately. 198 | c.writeLine(`// The pattern leads with an end (\Z) anchor when processed right to left. 199 | if pos >= len(r.Runtext) - 1 && (pos >= len(r.Runtext) || r.Runtext[pos] == '\n') { 200 | return true 201 | } 202 | `) 203 | return true 204 | 205 | case syntax.LeadingAnchor_RightToLeft_End: 206 | // If we're currently at the end, we're at a valid position to try. Otherwise, 207 | // we'll never be (we're iterating from end to beginning), so fail immediately. 208 | c.writeLine(`// The pattern leads with an end (\z) anchor when processed right to left. 209 | if pos >= len(r.Runtext) { 210 | return true 211 | } 212 | `) 213 | return true 214 | 215 | case syntax.TrailingAnchor_FixedLength_LeftToRight_EndZ: 216 | // Jump to the end, minus the min required length, which in this case is actually the fixed length, minus 1 (for a possible ending \n). 217 | c.writeLineFmt(`// The pattern has a trailing end (\Z) anchor, and any possible match is exactly %v characters. 218 | if pos < len(r.Runtext) - %v { 219 | r.Runtextpos = len(r.Runtext) - %[2]v 220 | } 221 | return true 222 | `, regexTree.FindOptimizations.MinRequiredLength, regexTree.FindOptimizations.MinRequiredLength+1) 223 | rm.findEndsInAlwaysReturningTrue = true 224 | return true 225 | 226 | case syntax.TrailingAnchor_FixedLength_LeftToRight_End: 227 | // Jump to the end, minus the min required length, which in this case is actually the fixed length. 228 | c.writeLineFmt(`// The pattern has a trailing end (\z) anchor, and any possible match is exactly %v characters. 229 | if pos < len(r.Runtext) - %[1]v { 230 | r.Runtextpos = len(r.Runtext) - %[1]v 231 | } 232 | return true 233 | `, regexTree.FindOptimizations.MinRequiredLength) 234 | rm.findEndsInAlwaysReturningTrue = true 235 | return true 236 | } 237 | 238 | // Now handle anchors that boost the position but may not determine immediate success or failure. 239 | 240 | if regexTree.FindOptimizations.LeadingAnchor == syntax.NtBol { 241 | str1 := ">" 242 | str2 := fmt.Sprint(" - ", regexTree.FindOptimizations.MinRequiredLength) 243 | if regexTree.FindOptimizations.MinRequiredLength == 0 { 244 | str2 = "" 245 | } else if regexTree.FindOptimizations.MinRequiredLength == 1 { 246 | str1 = ">=" 247 | str2 = "" 248 | } 249 | // Optimize the handling of a Beginning-Of-Line (BOL) anchor. 
BOL is special, in that unlike 250 | // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike 251 | // the other anchors, which all skip all subsequent processing if found, with BOL we just use it 252 | // to boost our position to the next line, and then continue normally with any searches. 253 | c.writeLineFmt(`// The pattern has a leading beginning-of-line anchor. 254 | if pos > 0 && r.Runtext[pos-1] != '\n' { 255 | newlinePos := helpers.IndexOfAny1(r.Runtext[pos:], '\n') 256 | if newlinePos > len(r.Runtext) - pos - 1 { 257 | goto NoMatchFound 258 | } 259 | pos += newlinePos + 1 260 | 261 | if pos %v len(r.Runtext)%v { 262 | goto NoMatchFound 263 | } 264 | } 265 | `, str1, str2) 266 | rm.noMatchFoundLabelNeeded = true 267 | } 268 | 269 | // if we have a max len 270 | if regexTree.FindOptimizations.MaxPossibleLength > -1 { 271 | if regexTree.FindOptimizations.TrailingAnchor == syntax.NtEnd { 272 | c.writeLineFmt(`// The pattern has a trailing end (\z) anchor, and any possible match is no more than %v characters. 273 | if pos < len(r.Runtext) - %[1]v { 274 | pos = len(r.Runtext) - %[1]v 275 | } 276 | `, regexTree.FindOptimizations.MaxPossibleLength) 277 | } else if regexTree.FindOptimizations.TrailingAnchor == syntax.NtEndZ { 278 | c.writeLineFmt(`// The pattern has a trailing end (\Z) anchor, and any possible match is no more than %v characters. 279 | if pos < len(r.Runtext) - %[1]v { 280 | pos = len(r.Runtext) - %[1]v 281 | } 282 | `, regexTree.FindOptimizations.MaxPossibleLength+1) 283 | } 284 | 285 | } 286 | 287 | return false 288 | } 289 | 290 | // Emits a case-sensitive left-to-right search for a substring. 291 | func (c *converter) emitIndexOfString_LeftToRight(rm *regexpData) { 292 | opts := rm.Tree.FindOptimizations 293 | 294 | substring, stringComparison, offset, offsetDescription := "", "", "", "" 295 | //ignoreCase := false 296 | switch opts.FindMode { 297 | case syntax.LeadingString_LeftToRight: 298 | substring = opts.LeadingPrefix 299 | offsetDescription = "at the beginning of the pattern" 300 | 301 | case syntax.LeadingString_OrdinalIgnoreCase_LeftToRight: 302 | substring = opts.LeadingPrefix 303 | stringComparison = "IgnoreCase" 304 | offsetDescription = " case-insensitive at the beginning of the pattern" 305 | //ignoreCase = true 306 | 307 | case syntax.FixedDistanceString_LeftToRight: 308 | substring = opts.FixedDistanceLiteral.S 309 | if opts.FixedDistanceLiteral.Distance > 0 { 310 | offset = fmt.Sprint(" + ", opts.FixedDistanceLiteral.Distance) 311 | offsetDescription = fmt.Sprint(" at index ", opts.FixedDistanceLiteral.Distance, " in the pattern") 312 | } 313 | } 314 | 315 | /* 316 | TODO: is this needed? not sure a stringsearch is going to add value here 317 | 318 | substringAndComparison := fmt.Sprint(substring, stringComparison) 319 | fieldName := "sv" 320 | if isValidInFieldName(substring) { 321 | fieldName += substringAndComparison 322 | } else { 323 | fieldName += getSHA256FieldName(substringAndComparison) 324 | } 325 | 326 | if _, ok := c.requiredHelpers[fieldName]; !ok { 327 | c.requiredHelpers[fieldName] = fmt.Sprintf(`// Supports searching for the string %#[1]v 328 | var %[2]v = helpers.NewStringSearchValues(%#[1]v, %#[3]v)`, 329 | []rune(substring), fieldName, ignoreCase) 330 | }*/ 331 | 332 | c.writeLineFmt(`// The pattern has the literal %#v %v. Find the next occurrence. 
333 | // If it can't be found, there's no match 334 | if i := helpers.IndexOf%v(r.Runtext[pos%v:], %s); i >= 0 { 335 | r.Runtextpos = pos + i 336 | return true 337 | }`, substring, offsetDescription, stringComparison, offset, getRuneSliceLiteral(substring)) 338 | } 339 | 340 | // Emits a case-sensitive right-to-left search for a substring. 341 | func (c *converter) emitIndexOfString_RightToLeft(rm *regexpData) { 342 | prefix := rm.Tree.FindOptimizations.LeadingPrefix 343 | 344 | c.writeLineFmt(`// The pattern begins with a literal %#[1]v. Find the next occurrence right-to-left. 345 | // If it can't be found, there's no match. 346 | pos = r.LastIndexOf(r.Runtext, pos, []rune(%#[1]v)) 347 | if pos >= 0 { 348 | r.Runtextpos = pos + %[2]v 349 | return true 350 | } 351 | `, prefix, len(prefix)) 352 | } 353 | 354 | func getRuneSliceSliceLiteral(vals []string) string { 355 | buf := &bytes.Buffer{} 356 | buf.WriteString("[][]rune{") 357 | sep := "" 358 | for i := 0; i < len(vals); i++ { 359 | buf.WriteString(sep) 360 | buf.WriteString(getRuneSliceLiteral(vals[i])) 361 | sep = ", " 362 | } 363 | buf.WriteString("}") 364 | return buf.String() 365 | } 366 | 367 | // Emits a case-sensitive left-to-right search for any one of multiple leading prefixes. 368 | func (c *converter) emitIndexOfStrings_LeftToRight(rm *regexpData) { 369 | opts := rm.Tree.FindOptimizations 370 | 371 | prefixes := getRuneSliceSliceLiteral(opts.LeadingPrefixes) 372 | stringComparison := "" 373 | ignoreCase := false 374 | if opts.FindMode == syntax.LeadingStrings_OrdinalIgnoreCase_LeftToRight { 375 | stringComparison = "_IgnoreCase" 376 | ignoreCase = true 377 | } 378 | fieldName := fmt.Sprint("indexOfAnyStrings", stringComparison, "_", getSHA256FieldName(prefixes)) 379 | 380 | if _, ok := c.requiredHelpers[fieldName]; !ok { 381 | // explicitly using an array in case prefixes is large 382 | c.requiredHelpers[fieldName] = fmt.Sprintf(`// Supports searching for the specified strings 383 | var %v = helpers.NewStringSearchValues(%s, %v)`, 384 | fieldName, prefixes, ignoreCase) 385 | } 386 | 387 | c.writeLineFmt(`// The pattern has multiple strings that could begin the match. Search for any of them. 388 | // If none can be found, there's no match 389 | if i := %v.IndexOfAny(r.Runtext[pos:]); i >= 0 { 390 | r.Runtextpos = pos + i 391 | return true 392 | }`, fieldName) 393 | } 394 | 395 | func (c *converter) emitSetDefinition(set *syntax.CharSet) string { 396 | hash := set.Hash() 397 | vals := string(hash) 398 | 399 | fieldName := fmt.Sprint("set_", getSHA256FieldName(vals)) 400 | 401 | if _, ok := c.requiredHelpers[fieldName]; !ok { 402 | // explicitly using an array in case prefixes is large 403 | c.requiredHelpers[fieldName] = fmt.Sprintf(`// The set %v 404 | var %v = syntax.NewCharSetRuntime(%#v)`, 405 | set.String(), fieldName, vals) 406 | } 407 | 408 | return fieldName 409 | } 410 | 411 | // Emits a search for a set at a fixed position from the start of the pattern, 412 | // and potentially other sets at other fixed positions in the pattern. 
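// As a sketch of the output (assumed, not verbatim; helper names come from emitIndexOfChars
// and emitMatchCharacterClass), a pattern beginning with [ab][0-9] has the primary set {a, b}
// at distance 0 and a secondary set [0-9] at distance 1, and would generate roughly:
//
//	span := r.Runtext[pos:]
//	for i := 0; i < len(span) - 1; i++ {
//		indexOfPos := helpers.IndexOfAny2(span[i:], 'a', 'b')
//		if indexOfPos < 0 {
//			goto NoMatchFound
//		}
//		i += indexOfPos
//		if i+1 >= len(span) {
//			goto NoMatchFound
//		}
//		if helpers.IsBetween(span[i + 1], '0', '9') {
//			r.Runtextpos = pos + i
//			return true
//		}
//	}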
413 | func (c *converter) emitFixedSet_LeftToRight(rm *regexpData) { 414 | sets := rm.Tree.FindOptimizations.FixedDistanceSets 415 | primarySet := sets[0] 416 | 417 | const MaxSets = 4 418 | setsToUse := len(sets) 419 | if setsToUse > MaxSets { 420 | setsToUse = MaxSets 421 | } 422 | 423 | if primarySet.Distance == 0 { 424 | c.writeLineFmt(`// The pattern begins with %v`, primarySet.Set) 425 | } else { 426 | c.writeLineFmt(`// The pattern matches %v at index %v`, primarySet.Set, primarySet.Distance) 427 | } 428 | c.writeLine("// Find the next occurrence. If it can't be found, there's no match.") 429 | 430 | // Use IndexOf{Any} to accelerate the skip loop via vectorization to match the first prefix. 431 | // But we avoid using it for the relatively common case of the starting set being '.', aka anything other than 432 | // a newline, as it's very rare to have long, uninterrupted sequences of newlines. And we avoid using it 433 | // for the case of the starting set being anything (e.g. '.' with SingleLine), as in that case it'll always match 434 | // the first char. 435 | setIndex := 0 436 | canUseIndexOf := !primarySet.Set.Equals(syntax.NotNewLineClass()) && !primarySet.Set.IsAnything() 437 | 438 | needLoop := !canUseIndexOf || setsToUse > 1 439 | 440 | endBlock := "" 441 | if needLoop { 442 | c.writeLine("span := r.Runtext[pos:]") 443 | upperBound := "len(span)" 444 | if setsToUse > 1 || primarySet.Distance != 0 { 445 | upperBound = fmt.Sprint(upperBound, " - ", rm.Tree.FindOptimizations.MinRequiredLength-1) 446 | } 447 | c.writeLineFmt(`for i := 0; i < %v; i++ {`, upperBound) 448 | endBlock = "}" 449 | } 450 | 451 | if canUseIndexOf { 452 | var span string 453 | if needLoop { 454 | if primarySet.Distance == 0 { 455 | span = "span[i:]" 456 | } else { 457 | span = fmt.Sprint("span[i+", primarySet.Distance, ":]") 458 | } 459 | } else { 460 | if primarySet.Distance == 0 { 461 | span = "r.Runtext[pos:]" 462 | } else { 463 | span = fmt.Sprint("r.Runtext[pos+", primarySet.Distance, ":]") 464 | } 465 | } 466 | 467 | // Get the IndexOf* expression to use to perform the search. 468 | var indexOf string 469 | 470 | if len(primarySet.Chars) > 0 { 471 | indexOf = c.emitIndexOfChars(primarySet.Chars, primarySet.Negated, span) 472 | 473 | } else if primarySet.Range != nil { 474 | // We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case, 475 | // where we end up with a set of a single char, we can use IndexOf instead. 476 | if primarySet.Range.First == primarySet.Range.Last { 477 | if primarySet.Negated { 478 | indexOf = fmt.Sprintf("helpers.IndexOfAnyExcept(%v, %q)", span, primarySet.Range.First) 479 | } else { 480 | indexOf = fmt.Sprintf("helpers.IndexOfAny1(%v, %q)", span, primarySet.Range.First) 481 | } 482 | } else { 483 | if primarySet.Negated { 484 | indexOf = fmt.Sprintf("helpers.IndexOfAnyExceptInRange(%v, %q, %q)", span, primarySet.Range.First, primarySet.Range.Last) 485 | } else { 486 | indexOf = fmt.Sprintf("helpers.IndexOfAnyInRange(%v, %q, %q)", span, primarySet.Range.First, primarySet.Range.Last) 487 | } 488 | } 489 | } else if isSmall, setChars, negated, desc := primarySet.Set.IsUnicodeCategoryOfSmallCharCount(); isSmall { 490 | // We have a known set of characters, and we can use the supplied IndexOfAny{Except}(...). 
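// The resulting expression (illustrative; the actual field name depends on the category's
// description) has the shape rsvSetSomeCategory.IndexOfAny(span[i:]), or IndexOfAnyExcept
// when the set is negated.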
491 | fName := "IndexOfAny" 492 | if negated { 493 | fName = "IndexOfAnyExcept" 494 | } 495 | if len(desc) > 0 { 496 | desc = "rsvSet" + desc 497 | } 498 | indexOf = fmt.Sprintf("%v.%v(%v)", c.emitSearchValues(setChars, desc), fName, span) 499 | } else { 500 | // We have an arbitrary set of characters that's really large or otherwise not enumerable. 501 | // We use a custom IndexOfAny helper that will perform the search as efficiently as possible. 502 | indexOf = c.emitIndexOfAnyCustomHelper(rm, primarySet.Set, negated, span) 503 | } 504 | 505 | if needLoop { 506 | c.writeLineFmt(`indexOfPos := %v 507 | if indexOfPos < 0 { 508 | goto NoMatchFound 509 | } 510 | i += indexOfPos 511 | `, indexOf) 512 | rm.noMatchFoundLabelNeeded = true 513 | 514 | if setsToUse > 1 { 515 | // Of the remaining sets we're going to check, find the maximum distance of any of them. 516 | // If it's further than the primary set we checked, we need a bounds check. 517 | maxDistance := sets[1].Distance 518 | for i := 2; i < setsToUse; i++ { 519 | if sets[i].Distance > maxDistance { 520 | maxDistance = sets[i].Distance 521 | } 522 | 523 | if maxDistance > primarySet.Distance { 524 | numRemainingSets := setsToUse - 1 525 | c.writeLineFmt(`// The primary set being searched for was found. %v more set(s) will be checked so as 526 | // to minimize the number of places TryMatchAtCurrentPosition is run unnecessarily. 527 | // Make sure everything fits in the remainder of the input. 528 | if i+%v >= len(span) { 529 | goto NoMatchFound 530 | } 531 | `, numRemainingSets, maxDistance) 532 | rm.noMatchFoundLabelNeeded = true 533 | 534 | } 535 | } 536 | } 537 | } else { 538 | c.writeLineFmt(`i := %v 539 | if i >= 0 { 540 | r.Runtextpos = pos + i 541 | return true 542 | } 543 | `, indexOf) 544 | } 545 | 546 | setIndex = 1 547 | } 548 | 549 | if needLoop { 550 | endBlock2 := "" 551 | if setIndex < setsToUse { 552 | // if (CharInClass(textSpan[i + charClassIndex], prefix[0], "...") && 553 | // ...) 554 | 555 | start := setIndex 556 | for ; setIndex < setsToUse; setIndex++ { 557 | addOn := "" 558 | if sets[setIndex].Distance > 0 { 559 | addOn = fmt.Sprintf(" + %v", sets[setIndex].Distance) 560 | } 561 | spanIndex := fmt.Sprintf("span[i%v]", addOn) 562 | charInClassExpr := c.emitMatchCharacterClass(rm, sets[setIndex].Set, false, spanIndex) 563 | 564 | if setIndex == start { 565 | c.write("if ") 566 | c.write(charInClassExpr) 567 | } else { 568 | c.writeLine(" &&") 569 | c.write(" ") 570 | c.write(charInClassExpr) 571 | } 572 | 573 | } 574 | c.writeLine(` {`) 575 | endBlock2 = "}" 576 | } 577 | c.writeLine(`r.Runtextpos = pos + i 578 | return true`) 579 | c.writeLine(endBlock2) 580 | } 581 | 582 | c.writeLine(endBlock) 583 | 584 | } 585 | 586 | // Emits a right-to-left search for a set at a fixed position from the start of the pattern. 587 | // (Currently that position will always be a distance of 0, meaning the start of the pattern itself.) 588 | func (c *converter) emitFixedSet_RightToLeft(rm *regexpData) { 589 | set := rm.Tree.FindOptimizations.FixedDistanceSets[0] 590 | 591 | c.writeLineFmt(`// The pattern begins with %v 592 | // Find the next occurrence. 
If it can't be found, there's no match.`, set.Set.String()) 593 | 594 | if len(set.Chars) == 1 { 595 | c.writeLineFmt(`pos = r.LastIndexOfRune(0, pos, %q) 596 | if pos >= 0 { 597 | r.Runtextpos = pos + 1 598 | return true 599 | }`, set.Chars[0]) 600 | } else { 601 | c.writeLineFmt(`for pos--; pos < len(r.Runtext); pos-- { 602 | if %v { 603 | r.Runtextpos = pos + 1 604 | return true 605 | } 606 | }`, c.emitMatchCharacterClass(rm, set.Set, false, "r.Runtext[pos]")) 607 | } 608 | } 609 | 610 | // Emits a search for a literal following a leading atomic single-character loop. 611 | func (c *converter) emitLiteralAfterAtomicLoop(rm *regexpData) { 612 | target := rm.Tree.FindOptimizations.LiteralAfterLoop 613 | 614 | targetComment := "" 615 | 616 | if len(target.String) > 0 { 617 | stringComparisonComment := "" 618 | if target.StringIgnoreCase { 619 | stringComparisonComment = "case-insensitive " 620 | } 621 | targetComment = "the " + stringComparisonComment + "string " + target.String 622 | } else if len(target.Chars) > 0 { 623 | targetComment = fmt.Sprintf("one of the characters %#v", string(target.Chars)) 624 | } else { 625 | targetComment = fmt.Sprintf("the character %q", target.Char) 626 | } 627 | 628 | c.writeLineFmt(`// The pattern begins with an atomic loop for %v {DescribeSet(target.LoopNode.Str!)}, followed by %v 629 | // Search for the literal, and then walk backwards to the beginning of the loop.`, 630 | target.LoopNode.Set.String(), targetComment) 631 | 632 | endBlock := "" 633 | 634 | if target.LoopNode.M > 0 { 635 | // If there's no lower bound on the loop, then once we find the literal, we know we have a valid starting position to try. 636 | // If there is a lower bound, then we need a loop, as we could find the literal but it might not be prefixed with enough 637 | // appropriate characters to satisfy the minimum bound. 638 | c.writeLine("for {") 639 | endBlock = "}" 640 | } 641 | 642 | c.writeLine("slice := r.Runtext[pos:]\n") 643 | // Find the literal. If we can't find it, we're done searching. 644 | if len(target.String) > 0 { 645 | // find string 646 | c.writeLineFmt("i := helpers.IndexOf(slice, %s)", getRuneSliceLiteral(target.String)) 647 | } else if len(target.Chars) > 0 { 648 | // find char any 649 | c.writeLineFmt("i := %v", c.emitIndexOfChars(target.Chars, false, "slice")) 650 | } else { 651 | // find char any 652 | c.writeLineFmt("i := %v", c.emitIndexOfChars([]rune{target.Char}, false, "slice")) 653 | } 654 | 655 | endBlock2 := "" 656 | if target.LoopNode.M > 0 { 657 | c.writeLine(`if i < 0 { 658 | break 659 | } 660 | `) 661 | } else { 662 | c.writeLine(`if i >= 0 {`) 663 | endBlock2 = "}" 664 | } 665 | 666 | // We found the literal. Walk backwards from it finding as many matches as we can against the loop. 667 | c.writeLineFmt(`prev := i - 1 668 | for uint(prev) < uint(len(slice)) && %v { 669 | prev-- 670 | } 671 | `, c.emitMatchCharacterClass(rm, target.LoopNode.Set, false, "slice[prev]")) 672 | 673 | if target.LoopNode.M > 0 { 674 | // If we found fewer than needed, loop around to try again. The loop doesn't overlap with the literal, 675 | // so we can start from after the last place the literal matched. 676 | c.writeLineFmt(`if (i - prev - 1) < %v { 677 | pos += i + 1 678 | continue 679 | } 680 | `, target.LoopNode.M) 681 | } 682 | 683 | // We have a winner. The starting position is just after the last position that failed to match the loop. 
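// (Worked example, for illustration: with a loop over \d and the literal "abc", searching
// "xx12abc" from pos 0 gives slice == "xx12abc" and i == 4; prev walks back over '2' and '1'
// and stops at index 1, so the match attempt starts at pos + prev + 1 == 2.)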
684 | // We also store the position after the loop into runtrackpos (an extra, unused field on RegexRunner) in order 685 | // to communicate this position to the match algorithm such that it can skip the loop. 686 | c.writeLine(`r.Runtextpos = pos + prev + 1 687 | r.Runtrackpos = pos + i 688 | return true`) 689 | 690 | c.writeLine(endBlock2) 691 | c.writeLine(endBlock) 692 | } 693 | 694 | func getFuncCallIfEqual(set *syntax.CharSet, negate bool, setB *syntax.CharSet, negSetB *syntax.CharSet, funcName string, chExpr string) (string, bool) { 695 | // example 696 | // if set is a DigitClass, but it's negated then we need to match 697 | // NotDigit and we need to write !isDigit() code 698 | // 699 | // if set is a NotDigitClass, but it's negated then we need to match 700 | // Digit and write isDigit() code 701 | eq := false 702 | if set.Equals(setB) { 703 | eq = true 704 | } else if set.Equals(negSetB) { 705 | eq = true 706 | negate = !negate 707 | } 708 | if !eq { 709 | return "", false 710 | } 711 | if negate { 712 | return fmt.Sprint("!", funcName, "(", chExpr, ")"), true 713 | } 714 | return fmt.Sprint(funcName, "(", chExpr, ")"), true 715 | } 716 | 717 | // Determines whether the 'a' and 'b' values differ by only a single bit, setting that bit in 'mask'. 718 | func differByOneBit(a, b rune) (rune, bool) { 719 | mask := a ^ b 720 | if mask == 0 { 721 | return 0, false 722 | } 723 | return mask, mask&(mask-1) == 0 724 | } 725 | 726 | func (c *converter) emitMatchCharacterClass(rm *regexpData, set *syntax.CharSet, negate bool, chExpr string) string { 727 | //this is in-line and produces an expression that resolves to a bool, 728 | //so anything that requires a new var must call a function 729 | 730 | // We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass), 731 | // but that call is relatively expensive. Before we fall back to it, we try to optimize 732 | // some common cases for which we can do much better, such as known character classes 733 | // for which we can call a dedicated method, or a fast-path for ASCII using a lookup table. 734 | // In some cases, multiple optimizations are possible for a given character class: the checks 735 | // in this method are generally ordered from fastest / simplest to slowest / most complex so 736 | // that we get the best optimization for a given char class. 737 | 738 | // First, see if the char class is a built-in one for which there's a better function 739 | // we can just call directly. 740 | 741 | if set.IsAnything() { 742 | // This assumes chExpr never has side effects. 
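// (If it did have side effects, they would be lost: for the "anything" class the whole
// expression collapses to a constant and chExpr is never emitted at all.)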
743 | if negate { 744 | return "false" 745 | } 746 | return "true" 747 | } 748 | 749 | if val, eq := getFuncCallIfEqual(set, negate, syntax.DigitClass(), syntax.NotDigitClass(), "unicode.IsDigit", chExpr); eq { 750 | return val 751 | } 752 | if val, eq := getFuncCallIfEqual(set, negate, syntax.SpaceClass(), syntax.NotSpaceClass(), "unicode.IsSpace", chExpr); eq { 753 | return val 754 | } 755 | if val, eq := getFuncCallIfEqual(set, negate, syntax.WordClass(), syntax.NotWordClass(), "helpers.IsWordChar", chExpr); eq { 756 | return val 757 | } 758 | /* 759 | TODO: Lots more classes here we don't have right now 760 | if val, eq := getFuncCallIfEqual(set, negate, syntax.ControlClass(), syntax.NotControlClass(), "unicode.IsControl", chExpr); eq { 761 | return val 762 | } 763 | if val, eq := getFuncCallIfEqual(set, negate, syntax.LetterClass(), syntax.NotLetterClass(), "unicode.IsLetter", chExpr); eq { 764 | return val 765 | } 766 | if val, eq := getFuncCallIfEqual(set, negate, syntax.LetterOrDigitClass(), syntax.NotLetterOrDigitClass(), "syntax.IsLetterOrDigit", chExpr); eq { 767 | return val 768 | } 769 | 770 | case RegexCharClass.LowerClass: 771 | case RegexCharClass.NotLowerClass: 772 | negate ^= charClass == RegexCharClass.NotLowerClass; 773 | return $"{(negate ? "!" : "")}char.IsLower({chExpr})"; 774 | 775 | case RegexCharClass.UpperClass: 776 | case RegexCharClass.NotUpperClass: 777 | negate ^= charClass == RegexCharClass.NotUpperClass; 778 | return $"{(negate ? "!" : "")}char.IsUpper({chExpr})"; 779 | 780 | case RegexCharClass.NumberClass: 781 | case RegexCharClass.NotNumberClass: 782 | negate ^= charClass == RegexCharClass.NotNumberClass; 783 | return $"{(negate ? "!" : "")}char.IsNumber({chExpr})"; 784 | 785 | case RegexCharClass.PunctuationClass: 786 | case RegexCharClass.NotPunctuationClass: 787 | negate ^= charClass == RegexCharClass.NotPunctuationClass; 788 | return $"{(negate ? "!" : "")}char.IsPunctuation({chExpr})"; 789 | 790 | case RegexCharClass.SeparatorClass: 791 | case RegexCharClass.NotSeparatorClass: 792 | negate ^= charClass == RegexCharClass.NotSeparatorClass; 793 | return $"{(negate ? "!" : "")}char.IsSeparator({chExpr})"; 794 | 795 | case RegexCharClass.SymbolClass: 796 | case RegexCharClass.NotSymbolClass: 797 | negate ^= charClass == RegexCharClass.NotSymbolClass; 798 | return $"{(negate ? "!" : "")}char.IsSymbol({chExpr})"; 799 | 800 | case RegexCharClass.AsciiLetterClass: 801 | case RegexCharClass.NotAsciiLetterClass: 802 | negate ^= charClass == RegexCharClass.NotAsciiLetterClass; 803 | return $"{(negate ? "!" : "")}char.IsAsciiLetter({chExpr})"; 804 | 805 | case RegexCharClass.AsciiLetterOrDigitClass: 806 | case RegexCharClass.NotAsciiLetterOrDigitClass: 807 | negate ^= charClass == RegexCharClass.NotAsciiLetterOrDigitClass; 808 | return $"{(negate ? "!" : "")}char.IsAsciiLetterOrDigit({chExpr})"; 809 | 810 | case RegexCharClass.HexDigitClass: 811 | case RegexCharClass.NotHexDigitClass: 812 | negate ^= charClass == RegexCharClass.NotHexDigitClass; 813 | return $"{(negate ? "!" : "")}char.IsAsciiHexDigit({chExpr})"; 814 | 815 | case RegexCharClass.HexDigitLowerClass: 816 | case RegexCharClass.NotHexDigitLowerClass: 817 | negate ^= charClass == RegexCharClass.NotHexDigitLowerClass; 818 | return $"{(negate ? "!" : "")}char.IsAsciiHexDigitLower({chExpr})"; 819 | 820 | case RegexCharClass.HexDigitUpperClass: 821 | case RegexCharClass.NotHexDigitUpperClass: 822 | negate ^= charClass == RegexCharClass.NotHexDigitUpperClass; 823 | return $"{(negate ? "!" 
: "")}char.IsAsciiHexDigitUpper({chExpr})"; 824 | }*/ 825 | 826 | // Next, handle simple sets of one range, e.g. [A-Z], [0-9], etc. This includes some built-in classes, like ECMADigitClass. 827 | if rs := set.GetIfNRanges(1); len(rs) == 1 { 828 | r := rs[0] 829 | negate = (negate != set.IsNegated()) 830 | if r.First == r.Last { 831 | // single char 832 | if negate { 833 | return fmt.Sprintf("(%v != %q)", chExpr, r.First) 834 | } 835 | return fmt.Sprintf("(%v == %q)", chExpr, r.First) 836 | } 837 | if negate { 838 | return fmt.Sprintf("!helpers.IsBetween(%s, %q, %q)", chExpr, r.First, r.Last) 839 | } 840 | return fmt.Sprintf("helpers.IsBetween(%s, %q, %q)", chExpr, r.First, r.Last) 841 | } 842 | 843 | // Next, if the character class contains nothing but Unicode categories, we can call char.GetUnicodeCategory and 844 | // compare against it. It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus 845 | // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass, 846 | // but without the optimizations the C# compiler will provide for switches. 847 | cats, neg := set.GetIfOnlyUnicodeCategories() 848 | if len(cats) > 0 { 849 | negate = (negate != neg) 850 | // convert cats to strings 851 | sb := &bytes.Buffer{} 852 | if negate { 853 | sb.WriteString("!") 854 | } 855 | sb.WriteString("unicode.In(") 856 | sb.WriteString(chExpr) 857 | for _, cat := range cats { 858 | sb.WriteString(", unicode.") 859 | sb.WriteString(cat.Cat) 860 | } 861 | sb.WriteString(")") 862 | return sb.String() 863 | } 864 | 865 | // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes), 866 | // it may be cheaper and smaller to compare against each than it is to use a lookup table. We can also special-case 867 | // the very common case with case insensitivity of two characters next to each other being the upper and lowercase 868 | // ASCII variants of each other, in which case we can use bit manipulation to avoid a comparison. 869 | //setChars := make([]rune, 0, 3) 870 | setChars := set.GetSetChars(3) 871 | if len(setChars) == 2 { 872 | negate = (negate != set.IsNegated()) 873 | eqStr := "==" 874 | bitJoin := "||" 875 | if negate { 876 | eqStr = "!=" 877 | bitJoin = "&&" 878 | } 879 | if mask, ok := differByOneBit(setChars[0], setChars[1]); ok { 880 | return fmt.Sprintf("(%s|0x%x %v %q)", chExpr, mask, eqStr, setChars[1]|mask) 881 | } 882 | return fmt.Sprintf("(%s %s %q %s %[1]s %[2]s %[5]q)", chExpr, eqStr, setChars[0], bitJoin, setChars[1]) 883 | } else if len(setChars) == 3 { 884 | negate = (negate != set.IsNegated()) 885 | eqStr := "==" 886 | bitJoin := "||" 887 | if negate { 888 | eqStr = "!=" 889 | bitJoin = "&&" 890 | } 891 | if mask, ok := differByOneBit(setChars[0], setChars[1]); ok { 892 | return fmt.Sprintf("((%s|0x%x %v %q) %s (%[1]s %[3]s %[6]q))", chExpr, mask, eqStr, setChars[1]|mask, bitJoin, setChars[2]) 893 | } 894 | return fmt.Sprintf("(%s %s %q %s %[1]s %[2]s %[5]q %[4]s %[1]s %[2]s %[6]q)", chExpr, eqStr, setChars[0], bitJoin, setChars[1], setChars[2]) 895 | } 896 | 897 | // Next, handle simple sets of two ASCII letter ranges that are cased versions of each other, e.g. [A-Za-z]. 898 | // This can be implemented as if it were a single range, with an additional bitwise operation. 899 | // TODO: the original C# code assumed an order of ranges coming back 900 | // based on char order -- can we assume that here too? does [A-Za-z] and [a-zA-Z] work the same? 
901 | if ranges := set.GetIfNRanges(2); len(ranges) == 2 { 902 | if ranges[1].First <= unicode.MaxASCII && 903 | ranges[1].Last <= unicode.MaxASCII && 904 | ranges[0].First|0x20 == ranges[1].First && 905 | ranges[0].Last|0x20 == ranges[1].Last { 906 | 907 | negate = (negate != set.IsNegated()) 908 | op := "<=" 909 | if negate { 910 | op = ">" 911 | } 912 | return fmt.Sprintf("(uint(%s|0x20 - %q) %s uint(%q - %q))", chExpr, ranges[1].First, op, ranges[1].Last, ranges[1].First) 913 | } 914 | } 915 | 916 | // Analyze the character set more to determine what code to generate. 917 | analysis := set.Analyze() 918 | 919 | // Next, handle sets where the high - low + 1 range is <= 32. In that case, we can emit 920 | // a branchless lookup in a uint that does not rely on loading any objects (e.g. the string-based 921 | // lookup we use later). This nicely handles common sets like [\t\r\n ]. 922 | if analysis.OnlyRanges && (analysis.UpperBoundExclusiveIfOnlyRanges-analysis.LowerBoundInclusiveIfOnlyRanges) <= 32 { 923 | // Create the 32-bit value with 1s at indices corresponding to every character in the set, 924 | // where the bit is computed to be the char value minus the lower bound starting from 925 | // most significant bit downwards. 926 | negatedClass := set.IsNegated() 927 | bitmap := uint32(0) 928 | for i := analysis.LowerBoundInclusiveIfOnlyRanges; i < analysis.UpperBoundExclusiveIfOnlyRanges; i++ { 929 | if set.CharIn(i) != negatedClass { 930 | bitmap |= 1 << (31 - (i - analysis.LowerBoundInclusiveIfOnlyRanges)) 931 | } 932 | } 933 | 934 | // To determine whether a character is in the set, we subtract the lowest char; this subtraction happens before the result is 935 | // zero-extended to uint, meaning that `charMinusLowUInt32` will always have upper 16 bits equal to 0. 936 | // We then left shift the constant with this offset, and apply a bitmask that has the highest 937 | // bit set (the sign bit) if and only if `chExpr` is in the [low, low + 32) range. 938 | // Then we only need to check whether this final result is less than 0: this will only be 939 | // the case if both `charMinusLowUInt32` was in fact the index of a set bit in the constant, and also 940 | // `chExpr` was in the allowed range (this ensures that false positive bit shifts are ignored). 941 | negate = (negate != negatedClass) 942 | negStr := "" 943 | if negate { 944 | negStr = "!" 945 | } 946 | return fmt.Sprintf("%shelpers.IsInMask32(%s-%q, 0x%x)", negStr, chExpr, analysis.LowerBoundInclusiveIfOnlyRanges, bitmap) 947 | } 948 | 949 | // Next, handle sets where the high - low + 1 range is <= 64. As with the 32-bit case above, we can emit 950 | // a branchless lookup in a ulong that does not rely on loading any objects (e.g. the string-based lookup 951 | // we use later). Note that unlike RegexCompiler, the source generator doesn't know whether the code is going 952 | // to be run in a 32-bit or 64-bit process: in a 64-bit process, this is an optimization, but in a 32-bit process, 953 | // it's a deoptimization. In general we optimize for 64-bit perf, so this code remains; it complicates the code 954 | // too much to try to include both this and a fallback for the check. This, however, is why we do the 32-bit 955 | // version and check first, as that variant performs equally well on both 32-bit and 64-bit systems. 
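// (For reference, the 32-bit path above turns the [\t\r\n ] example into roughly
// helpers.IsInMask32(ch-'\t', 0xc8000100); the branch below is the same trick with a
// 64-bit mask via helpers.IsInMask64.)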
956 | if analysis.OnlyRanges && (analysis.UpperBoundExclusiveIfOnlyRanges-analysis.LowerBoundInclusiveIfOnlyRanges) <= 64 { 957 | // Create the 64-bit value with 1s at indices corresponding to every character in the set, 958 | // where the bit is computed to be the char value minus the lower bound starting from 959 | // most significant bit downwards. 960 | negatedClass := set.IsNegated() 961 | bitmap := uint64(0) 962 | for i := analysis.LowerBoundInclusiveIfOnlyRanges; i < analysis.UpperBoundExclusiveIfOnlyRanges; i++ { 963 | if set.CharIn(i) != negatedClass { 964 | bitmap |= 1 << (63 - (i - analysis.LowerBoundInclusiveIfOnlyRanges)) 965 | } 966 | } 967 | 968 | // To determine whether a character is in the set, we subtract the lowest char; this subtraction happens before 969 | // the result is zero-extended to uint, meaning that `charMinusLowUInt64` will always have upper 32 bits equal to 0. 970 | // We then left shift the constant with this offset, and apply a bitmask that has the highest bit set (the sign bit) 971 | // if and only if `chExpr` is in the [low, low + 64) range. Then we only need to check whether this final result is 972 | // less than 0: this will only be the case if both `charMinusLowUInt64` was in fact the index of a set bit in the constant, 973 | // and also `chExpr` was in the allowed range (this ensures that false positive bit shifts are ignored). 974 | negate = (negate != negatedClass) 975 | negStr := "" 976 | if negate { 977 | negStr = "!" 978 | } 979 | return fmt.Sprintf("%shelpers.IsInMask64(%s-%q, 0x%x)", negStr, chExpr, analysis.LowerBoundInclusiveIfOnlyRanges, bitmap) 980 | } 981 | 982 | // All options after this point require a ch local. 983 | // in the C# version this requires assignment statements, which Go doesn't have 984 | // so we just repeat chExpr and let the compiler handle temp var 985 | //rm.addLocalDec("var ch rune") 986 | 987 | // Next, handle simple sets of two ranges, e.g. [\p{IsGreek}\p{IsGreekExtended}]. 988 | if ranges := set.GetIfNRanges(2); len(ranges) == 2 { 989 | negate = (negate != set.IsNegated()) 990 | 991 | op := "||" 992 | if negate { 993 | op = "&&" 994 | } 995 | return fmt.Sprintf("%s %s %s", 996 | getRangeCheckClause(chExpr, ranges[0], negate), 997 | op, 998 | getRangeCheckClause(chExpr, ranges[1], negate)) 999 | } 1000 | 1001 | if analysis.ContainsNoAscii { 1002 | // We determined that the character class contains only non-ASCII, 1003 | // for example if the class were [\u1000-\u2000\u3000-\u4000\u5000-\u6000]. 1004 | // (In the future, we could possibly extend the rm.Analysis to produce a known 1005 | // lower-bound and compare against that rather than always using 128 as the 1006 | // pivot point.) 1007 | return c.emitContainsNoAscii(negate, chExpr, set) 1008 | } 1009 | if analysis.AllAsciiContained { 1010 | // We determined that every ASCII character is in the class, for example 1011 | // if the class were the negated example from case 1 above: 1012 | // [^\p{IsGreek}\p{IsGreekExtended}]. 1013 | return c.emitAllAsciiContained(negate, chExpr, set) 1014 | } 1015 | 1016 | // Now, our big hammer is to generate a lookup table that lets us quickly index by character into a yes/no 1017 | // answer as to whether the character is in the target character class. However, we don't want to store 1018 | // a lookup table for every possible character for every character class in the regular expression; at one 1019 | // bit for each of 65K characters, that would be an 8K bitmap per character class. 
Instead, we handle the 1020 | // common case of ASCII input via such a lookup table, which at one bit for each of 128 characters is only 1021 | // 16 bytes per character class. We of course still need to be able to handle inputs that aren't ASCII, so 1022 | // we check the input against 128, and have a fallback if the input is >= to it. Determining the right 1023 | // fallback could itself be expensive. For example, if it's possible that a value >= 128 could match the 1024 | // character class, we output a call to RegexRunner.CharInClass, but we don't want to have to enumerate the 1025 | // entire character class evaluating every character against it, just to determine whether it's a match. 1026 | // Instead, we employ some quick heuristics that will always ensure we provide a correct answer even if 1027 | // we could have sometimes generated better code to give that answer. 1028 | 1029 | // Generate the lookup table to store 128 answers as bits. We use a const string instead of a byte[] / static 1030 | // data property because it lets IL emit handle all the details for us. 1031 | // String length is 8 chars == 16 bytes == 128 bits. 1032 | bitVector := make([]uint64, 2) 1033 | 1034 | for i := rune(0); i < unicode.MaxASCII; i++ { 1035 | if set.CharIn(i) { 1036 | bitVector[i/64] |= (1 << (i % 64)) 1037 | } 1038 | } 1039 | 1040 | // There's a chance that the class contains either no ASCII characters or all of them, 1041 | // and the analysis could not find it (for example if the class has a subtraction). 1042 | // We optimize away the bit vector in these trivial cases. 1043 | if bitVector[0] == 0 && bitVector[1] == 0 { 1044 | // no ascii at all 1045 | return c.emitContainsNoAscii(negate, chExpr, set) 1046 | } 1047 | if bitVector[0] == math.MaxUint64 && bitVector[1] == math.MaxUint64 { 1048 | // all ascii is included 1049 | return c.emitAllAsciiContained(negate, chExpr, set) 1050 | } 1051 | /* 1052 | // We determined that the character class may contain ASCII, so we 1053 | // output the lookup against the lookup table. 1054 | 1055 | if (analysis.ContainsOnlyAscii) 1056 | { 1057 | // If all inputs that could match are ASCII, we only need the lookup table, guarded 1058 | // by a check for the upper bound (which serves both to limit for what characters 1059 | // we need to access the lookup table and to bounds check the lookup table access). 1060 | return negate ? 1061 | $"((ch = {chExpr}) >= {Literal((char)analysis.UpperBoundExclusiveIfOnlyRanges)} || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : 1062 | $"((ch = {chExpr}) < {Literal((char)analysis.UpperBoundExclusiveIfOnlyRanges)} && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; 1063 | } 1064 | 1065 | if (analysis.AllNonAsciiContained) 1066 | { 1067 | // If every non-ASCII value is considered a match, we can immediately succeed for any 1068 | // non-ASCII inputs, and access the lookup table for the rest. 1069 | return negate ? 1070 | $"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : 1071 | $"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; 1072 | } 1073 | 1074 | // We know that the whole class wasn't ASCII, and we don't know anything about the non-ASCII 1075 | // characters other than that some might be included, for example if the character class 1076 | // were [\w\d], so if ch >= 128, we need to fall back to calling CharInClass. 
For ASCII, we 1077 | // can use a lookup table, but if it's a known set of ASCII characters we can also use a helper. 1078 | string asciiExpr = bitVectorString switch 1079 | { 1080 | "\0\0\0\u03ff\ufffe\u07ff\ufffe\u07ff" => $"{(negate ? "!" : "")}char.IsAsciiLetterOrDigit(ch)", 1081 | 1082 | "\0\0\0\u03FF\0\0\0\0" => $"{(negate ? "!" : "")}char.IsAsciiDigit(ch)", 1083 | 1084 | "\0\0\0\0\ufffe\u07FF\ufffe\u07ff" => $"{(negate ? "!" : "")}char.IsAsciiLetter(ch)", 1085 | "\0\0\0\0\0\0\ufffe\u07ff" => $"{(negate ? "!" : "")}char.IsAsciiLetterLower(ch)", 1086 | "\0\0\0\0\ufffe\u07FF\0\0" => $"{(negate ? "!" : "")}char.IsAsciiLetterUpper(ch)", 1087 | 1088 | "\0\0\0\u03FF\u007E\0\u007E\0" => $"{(negate ? "!" : "")}char.IsAsciiHexDigit(ch)", 1089 | "\0\0\0\u03FF\0\0\u007E\0" => $"{(negate ? "!" : "")}char.IsAsciiHexDigitLower(ch)", 1090 | "\0\0\0\u03FF\u007E\0\0\0" => $"{(negate ? "!" : "")}char.IsAsciiHexDigitUpper(ch)", 1091 | 1092 | _ => $"({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) {(negate ? "=" : "!")}= 0", 1093 | }; 1094 | return $"((ch = {chExpr}) < 128 ? {asciiExpr} : {(negate ? "!" : "")}RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; 1095 | */ 1096 | 1097 | // very base option, not optimized 1098 | setField := c.emitSetDefinition(set) 1099 | if negate { 1100 | return fmt.Sprintf("!%s.CharIn(%s)", setField, chExpr) 1101 | } 1102 | return fmt.Sprintf("%s.CharIn(%s)", setField, chExpr) 1103 | } 1104 | 1105 | func getRangeCheckClause(chExpr string, r syntax.SingleRange, negate bool) string { 1106 | if negate { 1107 | if r.First == r.Last { 1108 | return fmt.Sprintf("%s != %q", chExpr, r.First) 1109 | } else { 1110 | return fmt.Sprintf("%s - %q > %v", chExpr, r.First, r.Last-r.First) 1111 | } 1112 | } 1113 | if r.First == r.Last { 1114 | return fmt.Sprintf("%s == %q", chExpr, r.First) 1115 | } 1116 | return fmt.Sprintf("%s - %q <= %v", chExpr, r.First, r.Last-r.First) 1117 | } 1118 | 1119 | func (c *converter) emitIndexOfAnyCustomHelper(rm *regexpData, set *syntax.CharSet, negate bool, spanName string) string { 1120 | //TODO: see if it's worth it to identify the set and 1121 | //use a dedicated helper for this set 1122 | 1123 | // this is the most general form of the helper 1124 | match := c.emitMatchCharacterClass(rm, set, negate, "ch") 1125 | return fmt.Sprintf("helpers.IndexFunc(%s, func(ch rune) bool { return %s })", spanName, match) 1126 | } 1127 | 1128 | func (c *converter) emitContainsNoAscii(negate bool, chExpr string, set *syntax.CharSet) string { 1129 | setField := c.emitSetDefinition(set) 1130 | if negate { 1131 | return fmt.Sprintf("%s < 128 || !%s.CharIn(%[1]s)", chExpr, setField) 1132 | } 1133 | return fmt.Sprintf("%s >= 128 && %s.CharIn(%[1]s)", chExpr, setField) 1134 | } 1135 | 1136 | func (c *converter) emitAllAsciiContained(negate bool, chExpr string, set *syntax.CharSet) string { 1137 | setField := c.emitSetDefinition(set) 1138 | if negate { 1139 | return fmt.Sprintf("%s >= 128 && !%s.CharIn(%[1]s)", chExpr, setField) 1140 | } 1141 | return fmt.Sprintf("%s < 128 || %s.CharIn(%[1]s)", chExpr, setField) 1142 | } 1143 | --------------------------------------------------------------------------------