├── syntax ├── go.mod ├── pos.go ├── utils.go ├── errors.go ├── README.md ├── ast.go ├── pcre_test.go ├── tokenkind_string.go ├── operation_string.go ├── operation.go ├── lexer_test.go ├── lexer.go ├── parser.go └── parser_test.go ├── README.md └── LICENSE /syntax/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/quasilyte/regex/syntax 2 | 3 | go 1.14 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # regex - [regular expression](https://en.wikipedia.org/wiki/Regular_expression) libraries for Go 2 | 3 | ## Packages 4 | 5 | * [syntax](/syntax) - regexp parser and AST definitions 6 | -------------------------------------------------------------------------------- /syntax/pos.go: -------------------------------------------------------------------------------- 1 | package syntax 2 | 3 | type Position struct { 4 | Begin uint16 5 | End uint16 6 | } 7 | 8 | func combinePos(begin, end Position) Position { 9 | return Position{Begin: begin.Begin, End: end.End} 10 | } 11 | -------------------------------------------------------------------------------- /syntax/utils.go: -------------------------------------------------------------------------------- 1 | package syntax 2 | 3 | func isSpace(ch byte) bool { 4 | switch ch { 5 | case '\r', '\n', '\t', '\f', '\v', ' ': 6 | return true 7 | default: 8 | return false 9 | } 10 | } 11 | 12 | func isAlphanumeric(ch byte) bool { 13 | return (ch >= 'a' && ch <= 'z') || 14 | (ch >= 'A' && ch <= 'Z') || 15 | (ch >= '0' && ch <= '9') 16 | } 17 | 18 | func isDigit(ch byte) bool { 19 | return ch >= '0' && ch <= '9' 20 | } 21 | 22 | func isOctalDigit(ch byte) bool { 23 | return ch >= '0' && ch <= '7' 24 | } 25 | 26 | func isHexDigit(ch byte) bool { 27 | return (ch >= '0' && ch <= '9') || 28 | (ch >= 'a' && ch <= 'f') || 29 | (ch >= 'A' && ch <= 'F') 30 | 
} 31 | -------------------------------------------------------------------------------- /syntax/errors.go: -------------------------------------------------------------------------------- 1 | package syntax 2 | 3 | type ParseError struct { 4 | Pos Position 5 | Message string 6 | } 7 | 8 | func (e ParseError) Error() string { return e.Message } 9 | 10 | func throw(pos Position, message string) { 11 | panic(ParseError{Pos: pos, Message: message}) 12 | } 13 | 14 | func throwExpectedFound(pos Position, expected, found string) { 15 | throw(pos, "expected '"+expected+"', found '"+found+"'") 16 | } 17 | 18 | func throwUnexpectedToken(pos Position, token string) { 19 | throw(pos, "unexpected token: "+token) 20 | } 21 | 22 | func newPos(begin, end int) Position { 23 | return Position{ 24 | Begin: uint16(begin), 25 | End: uint16(end), 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Iskander (Alex) Sharipov / quasilyte 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /syntax/README.md: -------------------------------------------------------------------------------- 1 | # Package `regex/syntax` 2 | 3 | Package `syntax` provides a regular expression parser as well as AST definitions. 4 | 5 | ## Rationale 6 | 7 | The advantages of this package over stdlib [regexp/syntax](https://golang.org/pkg/regexp/syntax/): 8 | 9 | 1. Does no transformations/optimizations during parsing. 10 | The produced parse tree is lossless. 11 | 12 | 2. Simpler AST representation. 13 | 14 | 3. Can parse most PCRE operations in addition to [re2](https://github.com/google/re2/wiki/Syntax) syntax. 15 | It can also handle PHP/Perl style patterns with delimiters. 16 | 17 | 4. This package is easier to extend than something from the standard library. 18 | 19 | This package makes almost no assumptions about how the generated AST is going to be used 20 | so it preserves as much syntax information as possible. 21 | 22 | It's easy to write another intermediate representation on top of it. The main 23 | function of this package is to convert a textual regexp pattern into a more 24 | structured form that can be processed more easily. 
25 | 26 | ## Users 27 | 28 | * [go-critic](https://github.com/go-critic/go-critic) - Go static analyzer 29 | * [NoVerify](https://github.com/VKCOM/noverify) - PHP static analyzer 30 | -------------------------------------------------------------------------------- /syntax/ast.go: -------------------------------------------------------------------------------- 1 | package syntax 2 | 3 | import ( 4 | "strings" 5 | ) 6 | 7 | type Regexp struct { 8 | Pattern string 9 | Expr Expr 10 | } 11 | 12 | type RegexpPCRE struct { 13 | Pattern string 14 | Expr Expr 15 | 16 | Source string 17 | Modifiers string 18 | Delim [2]byte 19 | } 20 | 21 | func (re *RegexpPCRE) HasModifier(mod byte) bool { 22 | return strings.IndexByte(re.Modifiers, mod) >= 0 23 | } 24 | 25 | type Expr struct { 26 | // The operations that this expression performs. See `operation.go`. 27 | Op Operation 28 | 29 | Form Form 30 | 31 | _ [2]byte // Reserved 32 | 33 | // Pos describes a source location inside regexp pattern. 34 | Pos Position 35 | 36 | // Args is a list of sub-expressions of this expression. 37 | // 38 | // See Operation constants documentation to learn how to 39 | // interpret the particular expression args. 40 | Args []Expr 41 | 42 | // Value holds expression textual value. 43 | // 44 | // Usually, that value is identical to src[Begin():End()], 45 | // but this is not true for programmatically generated objects. 46 | Value string 47 | } 48 | 49 | // Begin returns expression leftmost offset. 50 | func (e Expr) Begin() uint16 { return e.Pos.Begin } 51 | 52 | // End returns expression rightmost offset. 53 | func (e Expr) End() uint16 { return e.Pos.End } 54 | 55 | // LastArg returns expression last argument. 56 | // 57 | // Should not be called on expressions that may have 0 arguments. 
58 | func (e Expr) LastArg() Expr { 59 | return e.Args[len(e.Args)-1] 60 | } 61 | 62 | type Operation byte 63 | 64 | type Form byte 65 | -------------------------------------------------------------------------------- /syntax/pcre_test.go: -------------------------------------------------------------------------------- 1 | package syntax 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestParserErrorsPCRE(t *testing.T) { 9 | tests := []struct { 10 | pattern string 11 | want string 12 | }{ 13 | {``, `empty pattern: can't find delimiters`}, 14 | {`aba`, `'a' is not a valid delimiter`}, 15 | {` aa `, `whitespace is not a valid delimiter`}, 16 | {`/abc`, `can't find '/' ending delimiter`}, 17 | {`#abc`, `can't find '#' ending delimiter`}, 18 | } 19 | 20 | p := NewParser(nil) 21 | for _, test := range tests { 22 | _, err := p.ParsePCRE(test.pattern) 23 | have := "" 24 | if err != nil { 25 | have = err.Error() 26 | } 27 | if have != test.want { 28 | t.Errorf("parse(%q):\nhave: %s\nwant: %s", 29 | test.pattern, have, test.want) 30 | } 31 | } 32 | } 33 | 34 | func TestParsePCRE(t *testing.T) { 35 | tests := []struct { 36 | source string 37 | 38 | wantPattern string 39 | wantDelim string 40 | wantModifiers string 41 | }{ 42 | {`@@`, "", "@@", ""}, 43 | {`//i`, "", "//", "i"}, 44 | {`#hello#`, "hello", "##", ""}, 45 | {`{pcre pattern}smi`, "pcre pattern", "{}", "smi"}, 46 | {`ms`, "an[o]ther (example)!", "<>", "ms"}, 47 | {`/clipFrom/([0-9]+)`, "clipFrom", "//", "([0-9]+)"}, 48 | } 49 | 50 | p := NewParser(nil) 51 | for _, test := range tests { 52 | pcre, err := p.ParsePCRE(test.source) 53 | if err != nil { 54 | t.Fatalf("parse(%q): error: %v", test.source, err) 55 | } 56 | if pcre.Pattern != test.wantPattern { 57 | t.Fatalf("parse(%q): pattern mismatch:\nhave: `%s`\nwant: `%s`", 58 | test.source, pcre.Pattern, test.wantPattern) 59 | } 60 | haveDelim := fmt.Sprintf("%c%c", pcre.Delim[0], pcre.Delim[1]) 61 | if haveDelim != test.wantDelim { 62 | 
t.Fatalf("parse(%q): delimiter mismatch:\nhave: `%s`\nwant: `%s`", 63 | test.source, haveDelim, test.wantDelim) 64 | } 65 | if pcre.Modifiers != test.wantModifiers { 66 | t.Fatalf("parse(%q): modifiers mismatch:\nhave: `%s`\nwant: `%s`", 67 | test.source, pcre.Modifiers, test.wantModifiers) 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /syntax/tokenkind_string.go: -------------------------------------------------------------------------------- 1 | // Code generated by "stringer -type=tokenKind -trimprefix=tok -linecomment=true"; DO NOT EDIT. 2 | 3 | package syntax 4 | 5 | import "strconv" 6 | 7 | func _() { 8 | // An "invalid array index" compiler error signifies that the constant values have changed. 9 | // Re-run the stringer command to generate them again. 10 | var x [1]struct{} 11 | _ = x[tokNone-0] 12 | _ = x[tokChar-1] 13 | _ = x[tokGroupFlags-2] 14 | _ = x[tokPosixClass-3] 15 | _ = x[tokConcat-4] 16 | _ = x[tokRepeat-5] 17 | _ = x[tokEscapeChar-6] 18 | _ = x[tokEscapeMeta-7] 19 | _ = x[tokEscapeOctal-8] 20 | _ = x[tokEscapeUni-9] 21 | _ = x[tokEscapeUniFull-10] 22 | _ = x[tokEscapeHex-11] 23 | _ = x[tokEscapeHexFull-12] 24 | _ = x[tokComment-13] 25 | _ = x[tokQ-14] 26 | _ = x[tokMinus-15] 27 | _ = x[tokLbracket-16] 28 | _ = x[tokLbracketCaret-17] 29 | _ = x[tokRbracket-18] 30 | _ = x[tokDollar-19] 31 | _ = x[tokCaret-20] 32 | _ = x[tokQuestion-21] 33 | _ = x[tokDot-22] 34 | _ = x[tokPlus-23] 35 | _ = x[tokStar-24] 36 | _ = x[tokPipe-25] 37 | _ = x[tokLparen-26] 38 | _ = x[tokLparenName-27] 39 | _ = x[tokLparenNameAngle-28] 40 | _ = x[tokLparenNameQuote-29] 41 | _ = x[tokLparenFlags-30] 42 | _ = x[tokLparenAtomic-31] 43 | _ = x[tokLparenPositiveLookahead-32] 44 | _ = x[tokLparenPositiveLookbehind-33] 45 | _ = x[tokLparenNegativeLookahead-34] 46 | _ = x[tokLparenNegativeLookbehind-35] 47 | _ = x[tokRparen-36] 48 | } 49 | 50 | const _tokenKind_name = 
"NoneCharGroupFlagsPosixClassConcatRepeatEscapeCharEscapeMetaEscapeOctalEscapeUniEscapeUniFullEscapeHexEscapeHexFullComment\\Q-[[^]$^?.+*|((?P(?(?'name'(?flags(?>(?=(?<=(?!(?= tokenKind(len(_tokenKind_index)-1) { 56 | return "tokenKind(" + strconv.FormatInt(int64(i), 10) + ")" 57 | } 58 | return _tokenKind_name[_tokenKind_index[i]:_tokenKind_index[i+1]] 59 | } 60 | -------------------------------------------------------------------------------- /syntax/operation_string.go: -------------------------------------------------------------------------------- 1 | // Code generated by "stringer -type=Operation -trimprefix=Op"; DO NOT EDIT. 2 | 3 | package syntax 4 | 5 | import "strconv" 6 | 7 | func _() { 8 | // An "invalid array index" compiler error signifies that the constant values have changed. 9 | // Re-run the stringer command to generate them again. 10 | var x [1]struct{} 11 | _ = x[OpNone-0] 12 | _ = x[OpConcat-1] 13 | _ = x[OpDot-2] 14 | _ = x[OpAlt-3] 15 | _ = x[OpStar-4] 16 | _ = x[OpPlus-5] 17 | _ = x[OpQuestion-6] 18 | _ = x[OpNonGreedy-7] 19 | _ = x[OpPossessive-8] 20 | _ = x[OpCaret-9] 21 | _ = x[OpDollar-10] 22 | _ = x[OpLiteral-11] 23 | _ = x[OpChar-12] 24 | _ = x[OpString-13] 25 | _ = x[OpQuote-14] 26 | _ = x[OpEscapeChar-15] 27 | _ = x[OpEscapeMeta-16] 28 | _ = x[OpEscapeOctal-17] 29 | _ = x[OpEscapeHex-18] 30 | _ = x[OpEscapeUni-19] 31 | _ = x[OpCharClass-20] 32 | _ = x[OpNegCharClass-21] 33 | _ = x[OpCharRange-22] 34 | _ = x[OpPosixClass-23] 35 | _ = x[OpRepeat-24] 36 | _ = x[OpCapture-25] 37 | _ = x[OpNamedCapture-26] 38 | _ = x[OpGroup-27] 39 | _ = x[OpGroupWithFlags-28] 40 | _ = x[OpAtomicGroup-29] 41 | _ = x[OpPositiveLookahead-30] 42 | _ = x[OpNegativeLookahead-31] 43 | _ = x[OpPositiveLookbehind-32] 44 | _ = x[OpNegativeLookbehind-33] 45 | _ = x[OpFlagOnlyGroup-34] 46 | _ = x[OpComment-35] 47 | _ = x[OpNone2-36] 48 | } 49 | 50 | const _Operation_name = 
"NoneConcatDotAltStarPlusQuestionNonGreedyPossessiveCaretDollarLiteralCharStringQuoteEscapeCharEscapeMetaEscapeOctalEscapeHexEscapeUniCharClassNegCharClassCharRangePosixClassRepeatCaptureNamedCaptureGroupGroupWithFlagsAtomicGroupPositiveLookaheadNegativeLookaheadPositiveLookbehindNegativeLookbehindFlagOnlyGroupCommentNone2" 51 | 52 | var _Operation_index = [...]uint16{0, 4, 10, 13, 16, 20, 24, 32, 41, 51, 56, 62, 69, 73, 79, 84, 94, 104, 115, 124, 133, 142, 154, 163, 173, 179, 186, 198, 203, 217, 228, 245, 262, 280, 298, 311, 318, 323} 53 | 54 | func (i Operation) String() string { 55 | if i >= Operation(len(_Operation_index)-1) { 56 | return "Operation(" + strconv.FormatInt(int64(i), 10) + ")" 57 | } 58 | return _Operation_name[_Operation_index[i]:_Operation_index[i+1]] 59 | } 60 | -------------------------------------------------------------------------------- /syntax/operation.go: -------------------------------------------------------------------------------- 1 | package syntax 2 | 3 | //go:generate stringer -type=Operation -trimprefix=Op 4 | const ( 5 | OpNone Operation = iota 6 | 7 | // OpConcat is a concatenation of ops. 8 | // Examples: `xy` `abc\d` `` 9 | // Args - concatenated ops 10 | // 11 | // As a special case, OpConcat with 0 Args is used for "empty" 12 | // set of operations. 13 | OpConcat 14 | 15 | // OpDot is a '.' wildcard. 16 | OpDot 17 | 18 | // OpAlt is x|y alternation of ops. 19 | // Examples: `a|bc` `x(.*?)|y(.*?)` 20 | // Args - union-connected regexp branches 21 | OpAlt 22 | 23 | // OpStar is a shorthand for {0,} repetition. 24 | // Examples: `x*` 25 | // Args[0] - repeated expression 26 | OpStar 27 | 28 | // OpPlus is a shorthand for {1,} repetition. 29 | // Examples: `x+` 30 | // Args[0] - repeated expression 31 | OpPlus 32 | 33 | // OpQuestion is a shorthand for {0,1} repetition. 34 | // Examples: `x?` 35 | // Args[0] - repeated expression 36 | OpQuestion 37 | 38 | // OpNonGreedy makes its operand quantifier non-greedy. 
39 | // Examples: `x??` `x*?` `x+?` 40 | // Args[0] - quantified expression 41 | OpNonGreedy 42 | 43 | // OpPossessive makes its operand quantifier possessive. 44 | // Examples: `x?+` `x*+` `x++` 45 | // Args[0] - quantified expression 46 | OpPossessive 47 | 48 | // OpCaret is ^ anchor. 49 | OpCaret 50 | 51 | // OpDollar is $ anchor. 52 | OpDollar 53 | 54 | // OpLiteral is a collection of consecutive chars. 55 | // Examples: `ab` `10x` 56 | // Args - enclosed characters (OpChar) 57 | OpLiteral 58 | 59 | // OpChar is a single literal pattern character. 60 | // Examples: `a` `6` `ф` 61 | OpChar 62 | 63 | // OpString is an artificial element that is used in other expressions. 64 | OpString 65 | 66 | // OpQuote is a \Q...\E enclosed literal. 67 | // Examples: `\Q.?\E` `\Q?q[]=1` 68 | // FormQuoteUnclosed: `\Qabc` 69 | // Args[0] - literal value (OpString) 70 | OpQuote 71 | 72 | // OpEscapeChar is a single char escape. 73 | // Examples: `\d` `\a` `\n` 74 | // Args[0] - escaped value (OpString) 75 | OpEscapeChar 76 | 77 | // OpEscapeMeta is an escaped meta char. 78 | // Examples: `\(` `\[` `\+` 79 | // Args[0] - escaped value (OpString) 80 | OpEscapeMeta 81 | 82 | // OpEscapeOctal is an octal char code escape (up to 3 digits). 83 | // Examples: `\123` `\12` 84 | // Args[0] - escaped value (OpString) 85 | OpEscapeOctal 86 | 87 | // OpEscapeHex is a hex char code escape. 88 | // Examples: `\x7F` `\xF7` 89 | // FormEscapeHexFull examples: `\x{10FFFF}` `\x{F}`. 90 | // Args[0] - escaped value (OpString) 91 | OpEscapeHex 92 | 93 | // OpEscapeUni is a Unicode char class escape. 94 | // Examples: `\pS` `\pL` `\PL` 95 | // FormEscapeUniFull examples: `\p{Greek}` `\p{Symbol}` `\p{^L}` 96 | // Args[0] - escaped value (OpString) 97 | OpEscapeUni 98 | 99 | // OpCharClass is a char class enclosed in []. 
100 | // Examples: `[abc]` `[a-z0-9\]]` 101 | // Args - char class elements (can include OpCharRange and OpPosixClass) 102 | OpCharClass 103 | 104 | // OpNegCharClass is a negated char class enclosed in []. 105 | // Examples: `[^abc]` `[^a-z0-9\]]` 106 | // Args - char class elements (can include OpCharRange and OpPosixClass) 107 | OpNegCharClass 108 | 109 | // OpCharRange is an inclusive char range inside a char class. 110 | // Examples: `0-9` `A-Z` 111 | // Args[0] - range lower bound 112 | // Args[1] - range upper bound 113 | OpCharRange 114 | 115 | // OpPosixClass is a named ASCII char set inside a char class. 116 | // Examples: `[:alpha:]` `[:blank:]` 117 | OpPosixClass 118 | 119 | // OpRepeat is a {min,max} repetition quantifier. 120 | // Examples: `x{5}` `x{min,max}` `x{min,}` 121 | // Args[0] - repeated expression 122 | // Args[1] - repeat count (OpString) 123 | OpRepeat 124 | 125 | // OpCapture is `(re)` capturing group. 126 | // Examples: `(abc)` `(x|y)` 127 | // Args[0] - enclosed expression 128 | OpCapture 129 | 130 | // OpNamedCapture is `(?P<name>re)` capturing group. 131 | // Examples: `(?P<foo>abc)` `(?P<name>x|y)` 132 | // FormNamedCaptureAngle examples: `(?<foo>abc)` `(?<name>x|y)` 133 | // FormNamedCaptureQuote examples: `(?'foo'abc)` `(?'name'x|y)` 134 | // Args[0] - enclosed expression (OpConcat with 0 args for empty group) 135 | // Args[1] - group name (OpString) 136 | OpNamedCapture 137 | 138 | // OpGroup is `(?:re)` non-capturing group. 139 | // Examples: `(?:abc)` `(?:x|y)` 140 | // Args[0] - enclosed expression (OpConcat with 0 args for empty group) 141 | OpGroup 142 | 143 | // OpGroupWithFlags is `(?flags:re)` non-capturing group. 144 | // Examples: `(?i:abc)` `(?i:x|y)` 145 | // Args[0] - enclosed expression (OpConcat with 0 args for empty group) 146 | // Args[1] - flags (OpString) 147 | OpGroupWithFlags 148 | 149 | // OpAtomicGroup is `(?>re)` non-capturing group without backtracking. 
150 | // Examples: `(?>foo)` `(?>)` 151 | // Args[0] - enclosed expression (OpConcat with 0 args for empty group) 152 | OpAtomicGroup 153 | 154 | // OpPositiveLookahead is `(?=re)` asserts that following text matches re. 155 | // Examples: `(?=foo)` 156 | // Args[0] - enclosed expression (OpConcat with 0 args for empty group) 157 | OpPositiveLookahead 158 | 159 | // OpNegativeLookahead is `(?!re)` asserts that following text doesn't match re. 160 | // Examples: `(?!foo)` 161 | // Args[0] - enclosed expression (OpConcat with 0 args for empty group) 162 | OpNegativeLookahead 163 | 164 | // OpPositiveLookbehind is `(?<=re)` asserts that preceding text matches re. 165 | // Examples: `(?<=foo)` 166 | // Args[0] - enclosed expression (OpConcat with 0 args for empty group) 167 | OpPositiveLookbehind 168 | 169 | // OpNegativeLookbehind is `(?<!re)` asserts that preceding text doesn't match re. 170 | // Examples: `(?)`, `(? )`}, 36 | {`(?'1')`, `(?'name' )`}, 37 | {`(?P<1>)`, `(?P )`}, 38 | {`(?Px)`, `(?P Char )`}, 39 | {`(?x)`, `(? Char )`}, 40 | {`(?'foo'x)`, `(?'name' Char )`}, 41 | {`(?Pxy)`, `(?P Char Concat Char )`}, 42 | {`a(?Px)b`, `Char Concat (?P Char ) Concat Char`}, 43 | {`a(?Pxy)b`, `Char Concat (?P Char Concat Char ) Concat Char`}, 44 | {`a(?xy)b`, `Char Concat (? Char Concat Char ) Concat Char`}, 45 | {`a(?'foo'xy)b`, `Char Concat (?'name' Char Concat Char ) Concat Char`}, 46 | 47 | {`(?#)`, `Comment`}, 48 | {`a(?#test)(?#c2)b`, `Char Concat Comment Concat Comment Concat Char`}, 49 | 50 | {`(?>)`, `(?> )`}, 51 | {`a(?>xy)(?>z)`, `Char Concat (?> Char Concat Char ) Concat (?> Char )`}, 52 | 53 | {`(?=)`, `(?= )`}, 54 | {`(?!)`, `(?! )`}, 55 | {`(?<=)`, `(?<= )`}, 56 | {`(? 51 | tokLparenNameAngle // (? 52 | tokLparenNameQuote // (?'name' 53 | tokLparenFlags // (?flags 54 | tokLparenAtomic // (?> 55 | tokLparenPositiveLookahead // (?= 56 | tokLparenPositiveLookbehind // (?<= 57 | tokLparenNegativeLookahead // (?! 
58 | tokLparenNegativeLookbehind // (?= utf8.RuneSelf { 114 | _, size := utf8.DecodeRuneInString(l.input[l.pos:]) 115 | l.pushTok(tokChar, size) 116 | l.maybeInsertConcat() 117 | continue 118 | } 119 | switch ch { 120 | case '\\': 121 | l.scanEscape(false) 122 | case '.': 123 | l.pushTok(tokDot, 1) 124 | case '+': 125 | l.pushTok(tokPlus, 1) 126 | case '*': 127 | l.pushTok(tokStar, 1) 128 | case '^': 129 | l.pushTok(tokCaret, 1) 130 | case '$': 131 | l.pushTok(tokDollar, 1) 132 | case '?': 133 | l.pushTok(tokQuestion, 1) 134 | case ')': 135 | l.pushTok(tokRparen, 1) 136 | case '|': 137 | l.pushTok(tokPipe, 1) 138 | case '[': 139 | if l.byteAt(l.pos+1) == '^' { 140 | l.pushTok(tokLbracketCaret, 2) 141 | } else { 142 | l.pushTok(tokLbracket, 1) 143 | } 144 | l.scanCharClass() 145 | case '(': 146 | if l.byteAt(l.pos+1) == '?' { 147 | switch { 148 | case l.byteAt(l.pos+2) == '>': 149 | l.pushTok(tokLparenAtomic, len("(?>")) 150 | case l.byteAt(l.pos+2) == '=': 151 | l.pushTok(tokLparenPositiveLookahead, len("(?=")) 152 | case l.byteAt(l.pos+2) == '!': 153 | l.pushTok(tokLparenNegativeLookahead, len("(?!")) 154 | case l.byteAt(l.pos+2) == '<' && l.byteAt(l.pos+3) == '=': 155 | l.pushTok(tokLparenPositiveLookbehind, len("(?<=")) 156 | case l.byteAt(l.pos+2) == '<' && l.byteAt(l.pos+3) == '!': 157 | l.pushTok(tokLparenNegativeLookbehind, len("(?= 0 { 171 | l.pushTok(tokRepeat, len("{")+j) 172 | } else { 173 | l.pushTok(tokChar, 1) 174 | } 175 | default: 176 | l.pushTok(tokChar, 1) 177 | } 178 | l.maybeInsertConcat() 179 | } 180 | } 181 | 182 | func (l *lexer) scanCharClass() { 183 | l.maybeInsertConcat() 184 | 185 | // We need to handle first `]` in a special way. See #3. 
186 | if l.byteAt(l.pos) == ']' { 187 | l.pushTok(tokChar, 1) 188 | } 189 | 190 | for l.pos < len(l.input) { 191 | ch := l.input[l.pos] 192 | if ch >= utf8.RuneSelf { 193 | _, size := utf8.DecodeRuneInString(l.input[l.pos:]) 194 | l.pushTok(tokChar, size) 195 | continue 196 | } 197 | switch ch { 198 | case '\\': 199 | l.scanEscape(true) 200 | case '[': 201 | isPosixClass := false 202 | if l.byteAt(l.pos+1) == ':' { 203 | j := l.stringIndex(l.pos+2, ":]") 204 | if j >= 0 { 205 | isPosixClass = true 206 | l.pushTok(tokPosixClass, j+len("[::]")) 207 | } 208 | } 209 | if !isPosixClass { 210 | l.pushTok(tokChar, 1) 211 | } 212 | case '-': 213 | l.pushTok(tokMinus, 1) 214 | case ']': 215 | l.pushTok(tokRbracket, 1) 216 | return // Stop scanning in the char context 217 | default: 218 | l.pushTok(tokChar, 1) 219 | } 220 | } 221 | } 222 | 223 | func (l *lexer) scanEscape(insideCharClass bool) { 224 | s := l.input 225 | if l.pos+1 >= len(s) { 226 | throw(newPos(l.pos, l.pos+1), `unexpected end of pattern: trailing '\'`) 227 | } 228 | switch { 229 | case s[l.pos+1] == 'p' || s[l.pos+1] == 'P': 230 | if l.pos+2 >= len(s) { 231 | throw(newPos(l.pos, l.pos+2), "unexpected end of pattern: expected uni-class-short or '{'") 232 | } 233 | if s[l.pos+2] == '{' { 234 | j := strings.IndexByte(s[l.pos+2:], '}') 235 | if j < 0 { 236 | throw(newPos(l.pos, l.pos+2), "can't find closing '}'") 237 | } 238 | l.pushTok(tokEscapeUniFull, len(`\p{`)+j) 239 | } else { 240 | l.pushTok(tokEscapeUni, len(`\pL`)) 241 | } 242 | case s[l.pos+1] == 'x': 243 | if l.pos+2 >= len(s) { 244 | throw(newPos(l.pos, l.pos+2), "unexpected end of pattern: expected hex-digit or '{'") 245 | } 246 | if s[l.pos+2] == '{' { 247 | j := strings.IndexByte(s[l.pos+2:], '}') 248 | if j < 0 { 249 | throw(newPos(l.pos, l.pos+2), "can't find closing '}'") 250 | } 251 | l.pushTok(tokEscapeHexFull, len(`\x{`)+j) 252 | } else { 253 | if isHexDigit(l.byteAt(l.pos + 3)) { 254 | l.pushTok(tokEscapeHex, len(`\xFF`)) 255 | } else { 
256 | l.pushTok(tokEscapeHex, len(`\xF`)) 257 | } 258 | } 259 | case isOctalDigit(s[l.pos+1]): 260 | digits := 1 261 | if isOctalDigit(l.byteAt(l.pos + 2)) { 262 | if isOctalDigit(l.byteAt(l.pos + 3)) { 263 | digits = 3 264 | } else { 265 | digits = 2 266 | } 267 | } 268 | l.pushTok(tokEscapeOctal, len(`\`)+digits) 269 | case s[l.pos+1] == 'Q': 270 | size := len(s) - l.pos // Until the pattern ends 271 | j := l.stringIndex(l.pos+2, `\E`) 272 | if j >= 0 { 273 | size = j + len(`\Q\E`) 274 | } 275 | l.pushTok(tokQ, size) 276 | 277 | default: 278 | ch := l.byteAt(l.pos + 1) 279 | if ch >= utf8.RuneSelf { 280 | _, size := utf8.DecodeRuneInString(l.input[l.pos+1:]) 281 | l.pushTok(tokEscapeChar, len(`\`)+size) 282 | return 283 | } 284 | kind := tokEscapeChar 285 | if insideCharClass { 286 | if charClassMetachar[ch] { 287 | kind = tokEscapeMeta 288 | } 289 | } else { 290 | if reMetachar[ch] { 291 | kind = tokEscapeMeta 292 | } 293 | } 294 | l.pushTok(kind, 2) 295 | } 296 | } 297 | 298 | func (l *lexer) maybeInsertConcat() { 299 | if l.isConcatPos() { 300 | last := len(l.tokens) - 1 301 | tok := l.tokens[last] 302 | l.tokens[last].kind = tokConcat 303 | l.tokens = append(l.tokens, tok) 304 | } 305 | } 306 | 307 | func (l *lexer) Init(s string) { 308 | l.pos = 0 309 | l.tokens = l.tokens[:0] 310 | l.input = s 311 | 312 | l.scan() 313 | 314 | l.pos = 0 315 | } 316 | 317 | func (l *lexer) tryScanGroupName(pos int) bool { 318 | tok := tokLparenName 319 | endCh := byte('>') 320 | offset := 1 321 | switch l.byteAt(pos) { 322 | case '\'': 323 | endCh = '\'' 324 | tok = tokLparenNameQuote 325 | case '<': 326 | tok = tokLparenNameAngle 327 | case 'P': 328 | offset = 2 329 | default: 330 | return false 331 | } 332 | if pos+offset >= len(l.input) { 333 | return false 334 | } 335 | end := strings.IndexByte(l.input[pos+offset:], endCh) 336 | if end < 0 { 337 | return false 338 | } 339 | l.pushTok(tok, len("(?")+offset+end+1) 340 | return true 341 | } 342 | 343 | func (l *lexer) 
tryScanGroupFlags(pos int) bool { 344 | colonPos := strings.IndexByte(l.input[pos:], ':') 345 | parenPos := strings.IndexByte(l.input[pos:], ')') 346 | if parenPos < 0 { 347 | return false 348 | } 349 | end := parenPos 350 | if colonPos >= 0 && colonPos < parenPos { 351 | end = colonPos + len(":") 352 | } 353 | l.pushTok(tokLparenFlags, len("(?")+end) 354 | return true 355 | } 356 | 357 | func (l *lexer) tryScanComment(pos int) bool { 358 | if l.byteAt(pos) != '#' { 359 | return false 360 | } 361 | parenPos := strings.IndexByte(l.input[pos:], ')') 362 | if parenPos < 0 { 363 | return false 364 | } 365 | l.pushTok(tokComment, len("(?")+parenPos+len(")")) 366 | return true 367 | } 368 | 369 | func (l *lexer) repeatWidth(pos int) int { 370 | j := pos 371 | for isDigit(l.byteAt(j)) { 372 | j++ 373 | } 374 | if j == pos { 375 | return -1 376 | } 377 | if l.byteAt(j) == '}' { 378 | return (j + len("}")) - pos // {min} 379 | } 380 | if l.byteAt(j) != ',' { 381 | return -1 382 | } 383 | j += len(",") 384 | for isDigit(l.byteAt(j)) { 385 | j++ 386 | } 387 | if l.byteAt(j) == '}' { 388 | return (j + len("}")) - pos // {min,} or {min,max} 389 | } 390 | return -1 391 | } 392 | 393 | func (l *lexer) stringIndex(offset int, s string) int { 394 | if offset < len(l.input) { 395 | return strings.Index(l.input[offset:], s) 396 | } 397 | return -1 398 | } 399 | 400 | func (l *lexer) byteAt(pos int) byte { 401 | if pos >= 0 && pos < len(l.input) { 402 | return l.input[pos] 403 | } 404 | return 0 405 | } 406 | 407 | func (l *lexer) pushTok(kind tokenKind, size int) { 408 | l.tokens = append(l.tokens, token{ 409 | kind: kind, 410 | pos: Position{Begin: uint16(l.pos), End: uint16(l.pos + size)}, 411 | }) 412 | l.pos += size 413 | } 414 | 415 | func (l *lexer) isConcatPos() bool { 416 | if len(l.tokens) < 2 { 417 | return false 418 | } 419 | x := l.tokens[len(l.tokens)-2].kind 420 | if concatTable[x]&concatX != 0 { 421 | return false 422 | } 423 | y := l.tokens[len(l.tokens)-1].kind 424 | 
return concatTable[y]&concatY == 0 425 | } 426 | 427 | const ( 428 | concatX byte = 1 << iota 429 | concatY 430 | ) 431 | 432 | var concatTable = [256]byte{ 433 | tokPipe: concatX | concatY, 434 | 435 | tokLparen: concatX, 436 | tokLparenFlags: concatX, 437 | tokLparenName: concatX, 438 | tokLparenNameAngle: concatX, 439 | tokLparenNameQuote: concatX, 440 | tokLparenAtomic: concatX, 441 | tokLbracket: concatX, 442 | tokLbracketCaret: concatX, 443 | tokLparenPositiveLookahead: concatX, 444 | tokLparenPositiveLookbehind: concatX, 445 | tokLparenNegativeLookahead: concatX, 446 | tokLparenNegativeLookbehind: concatX, 447 | 448 | tokRparen: concatY, 449 | tokRbracket: concatY, 450 | tokPlus: concatY, 451 | tokStar: concatY, 452 | tokQuestion: concatY, 453 | tokRepeat: concatY, 454 | } 455 | -------------------------------------------------------------------------------- /syntax/parser.go: -------------------------------------------------------------------------------- 1 | package syntax 2 | 3 | import ( 4 | "errors" 5 | "strings" 6 | ) 7 | 8 | type ParserOptions struct { 9 | // NoLiterals disables OpChar merging into OpLiteral. 10 | NoLiterals bool 11 | } 12 | 13 | func NewParser(opts *ParserOptions) *Parser { 14 | return newParser(opts) 15 | } 16 | 17 | type Parser struct { 18 | out Regexp 19 | lexer lexer 20 | exprPool []Expr 21 | 22 | prefixParselets [256]prefixParselet 23 | infixParselets [256]infixParselet 24 | 25 | charClass []Expr 26 | allocated uint 27 | 28 | opts ParserOptions 29 | } 30 | 31 | // ParsePCRE parses PHP-style pattern with delimiters. 32 | // An example of such pattern is `/foo/i`. 
33 | func (p *Parser) ParsePCRE(pattern string) (*RegexpPCRE, error) { 34 | pcre, err := p.newPCRE(pattern) 35 | if err != nil { 36 | return nil, err 37 | } 38 | if pcre.HasModifier('x') { 39 | return nil, errors.New("'x' modifier is not supported") 40 | } 41 | re, err := p.Parse(pcre.Pattern) 42 | if re != nil { 43 | pcre.Expr = re.Expr 44 | } 45 | return pcre, err 46 | } 47 | 48 | func (p *Parser) Parse(pattern string) (result *Regexp, err error) { 49 | defer func() { 50 | r := recover() 51 | if r == nil { 52 | return 53 | } 54 | if err2, ok := r.(ParseError); ok { 55 | err = err2 56 | return 57 | } 58 | panic(r) 59 | }() 60 | 61 | p.lexer.Init(pattern) 62 | p.allocated = 0 63 | p.out.Pattern = pattern 64 | if pattern == "" { 65 | p.out.Expr = *p.newExpr(OpConcat, Position{}) 66 | } else { 67 | p.out.Expr = *p.parseExpr(0) 68 | } 69 | 70 | if !p.opts.NoLiterals { 71 | p.mergeChars(&p.out.Expr) 72 | } 73 | p.setValues(&p.out.Expr) 74 | 75 | return &p.out, nil 76 | } 77 | 78 | type prefixParselet func(token) *Expr 79 | 80 | type infixParselet func(*Expr, token) *Expr 81 | 82 | func newParser(opts *ParserOptions) *Parser { 83 | var p Parser 84 | 85 | if opts != nil { 86 | p.opts = *opts 87 | } 88 | p.exprPool = make([]Expr, 256) 89 | 90 | for tok, op := range tok2op { 91 | if op != 0 { 92 | p.prefixParselets[tokenKind(tok)] = p.parsePrefixElementary 93 | } 94 | } 95 | 96 | p.prefixParselets[tokQ] = func(tok token) *Expr { 97 | litPos := tok.pos 98 | litPos.Begin += uint16(len(`\Q`)) 99 | form := FormQuoteUnclosed 100 | if strings.HasSuffix(p.tokenValue(tok), `\E`) { 101 | litPos.End -= uint16(len(`\E`)) 102 | form = FormDefault 103 | } 104 | lit := p.newExpr(OpString, litPos) 105 | return p.newExprForm(OpQuote, form, tok.pos, lit) 106 | } 107 | 108 | p.prefixParselets[tokEscapeHexFull] = func(tok token) *Expr { 109 | litPos := tok.pos 110 | litPos.Begin += uint16(len(`\x{`)) 111 | litPos.End -= uint16(len(`}`)) 112 | lit := p.newExpr(OpString, litPos) 113 | return 
p.newExprForm(OpEscapeHex, FormEscapeHexFull, tok.pos, lit) 114 | } 115 | p.prefixParselets[tokEscapeUniFull] = func(tok token) *Expr { 116 | litPos := tok.pos 117 | litPos.Begin += uint16(len(`\p{`)) 118 | litPos.End -= uint16(len(`}`)) 119 | lit := p.newExpr(OpString, litPos) 120 | return p.newExprForm(OpEscapeUni, FormEscapeUniFull, tok.pos, lit) 121 | } 122 | 123 | p.prefixParselets[tokEscapeHex] = func(tok token) *Expr { return p.parseEscape(OpEscapeHex, `\x`, tok) } 124 | p.prefixParselets[tokEscapeOctal] = func(tok token) *Expr { return p.parseEscape(OpEscapeOctal, `\`, tok) } 125 | p.prefixParselets[tokEscapeChar] = func(tok token) *Expr { return p.parseEscape(OpEscapeChar, `\`, tok) } 126 | p.prefixParselets[tokEscapeMeta] = func(tok token) *Expr { return p.parseEscape(OpEscapeMeta, `\`, tok) } 127 | p.prefixParselets[tokEscapeUni] = func(tok token) *Expr { return p.parseEscape(OpEscapeUni, `\p`, tok) } 128 | 129 | p.prefixParselets[tokLparen] = func(tok token) *Expr { return p.parseGroup(OpCapture, tok) } 130 | p.prefixParselets[tokLparenAtomic] = func(tok token) *Expr { return p.parseGroup(OpAtomicGroup, tok) } 131 | p.prefixParselets[tokLparenPositiveLookahead] = func(tok token) *Expr { return p.parseGroup(OpPositiveLookahead, tok) } 132 | p.prefixParselets[tokLparenNegativeLookahead] = func(tok token) *Expr { return p.parseGroup(OpNegativeLookahead, tok) } 133 | p.prefixParselets[tokLparenPositiveLookbehind] = func(tok token) *Expr { return p.parseGroup(OpPositiveLookbehind, tok) } 134 | p.prefixParselets[tokLparenNegativeLookbehind] = func(tok token) *Expr { return p.parseGroup(OpNegativeLookbehind, tok) } 135 | 136 | p.prefixParselets[tokLparenName] = func(tok token) *Expr { 137 | return p.parseNamedCapture(FormDefault, tok) 138 | } 139 | p.prefixParselets[tokLparenNameAngle] = func(tok token) *Expr { 140 | return p.parseNamedCapture(FormNamedCaptureAngle, tok) 141 | } 142 | p.prefixParselets[tokLparenNameQuote] = func(tok token) *Expr { 143 | return 
p.parseNamedCapture(FormNamedCaptureQuote, tok) 144 | } 145 | 146 | p.prefixParselets[tokLparenFlags] = p.parseGroupWithFlags 147 | 148 | p.prefixParselets[tokPipe] = func(tok token) *Expr { 149 | // We need prefix pipe parselet to handle `(|x)` syntax. 150 | right := p.parseExpr(1) 151 | return p.newExpr(OpAlt, tok.pos, p.newEmpty(tok.pos), right) 152 | } 153 | p.prefixParselets[tokLbracket] = func(tok token) *Expr { 154 | return p.parseCharClass(OpCharClass, tok) 155 | } 156 | p.prefixParselets[tokLbracketCaret] = func(tok token) *Expr { 157 | return p.parseCharClass(OpNegCharClass, tok) 158 | } 159 | 160 | p.infixParselets[tokRepeat] = func(left *Expr, tok token) *Expr { 161 | repeatLit := p.newExpr(OpString, tok.pos) 162 | return p.newExpr(OpRepeat, combinePos(left.Pos, tok.pos), left, repeatLit) 163 | } 164 | p.infixParselets[tokStar] = func(left *Expr, tok token) *Expr { 165 | return p.newExpr(OpStar, combinePos(left.Pos, tok.pos), left) 166 | } 167 | p.infixParselets[tokConcat] = func(left *Expr, tok token) *Expr { 168 | right := p.parseExpr(2) 169 | if left.Op == OpConcat { 170 | left.Args = append(left.Args, *right) 171 | left.Pos.End = right.End() 172 | return left 173 | } 174 | return p.newExpr(OpConcat, combinePos(left.Pos, right.Pos), left, right) 175 | } 176 | p.infixParselets[tokPipe] = p.parseAlt 177 | p.infixParselets[tokMinus] = p.parseMinus 178 | p.infixParselets[tokPlus] = p.parsePlus 179 | p.infixParselets[tokQuestion] = p.parseQuestion 180 | 181 | return &p 182 | } 183 | 184 | func (p *Parser) setValues(e *Expr) { 185 | for i := range e.Args { 186 | p.setValues(&e.Args[i]) 187 | } 188 | e.Value = p.exprValue(e) 189 | } 190 | 191 | func (p *Parser) tokenValue(tok token) string { 192 | return p.out.Pattern[tok.pos.Begin:tok.pos.End] 193 | } 194 | 195 | func (p *Parser) exprValue(e *Expr) string { 196 | return p.out.Pattern[e.Begin():e.End()] 197 | } 198 | 199 | func (p *Parser) mergeChars(e *Expr) { 200 | for i := range e.Args { 201 | 
p.mergeChars(&e.Args[i]) 202 | } 203 | if e.Op != OpConcat || len(e.Args) < 2 { 204 | return 205 | } 206 | 207 | args := e.Args[:0] 208 | i := 0 209 | for i < len(e.Args) { 210 | first := i 211 | chars := 0 212 | for j := i; j < len(e.Args) && e.Args[j].Op == OpChar; j++ { 213 | chars++ 214 | } 215 | if chars > 1 { 216 | c1 := e.Args[first] 217 | c2 := e.Args[first+chars-1] 218 | lit := p.newExpr(OpLiteral, combinePos(c1.Pos, c2.Pos)) 219 | for j := 0; j < chars; j++ { 220 | lit.Args = append(lit.Args, e.Args[first+j]) 221 | } 222 | args = append(args, *lit) 223 | i += chars 224 | } else { 225 | args = append(args, e.Args[i]) 226 | i++ 227 | } 228 | } 229 | if len(args) == 1 { 230 | *e = args[0] // Turn OpConcat into OpLiteral 231 | } else { 232 | e.Args = args 233 | } 234 | } 235 | 236 | func (p *Parser) newEmpty(pos Position) *Expr { 237 | return p.newExpr(OpConcat, pos) 238 | } 239 | 240 | func (p *Parser) newExprForm(op Operation, form Form, pos Position, args ...*Expr) *Expr { 241 | e := p.newExpr(op, pos, args...) 
242 | e.Form = form 243 | return e 244 | } 245 | 246 | func (p *Parser) newExpr(op Operation, pos Position, args ...*Expr) *Expr { 247 | e := p.allocExpr() 248 | *e = Expr{ 249 | Op: op, 250 | Pos: pos, 251 | Args: e.Args[:0], 252 | } 253 | for _, arg := range args { 254 | e.Args = append(e.Args, *arg) 255 | } 256 | return e 257 | } 258 | 259 | func (p *Parser) allocExpr() *Expr { 260 | i := p.allocated 261 | if i < uint(len(p.exprPool)) { 262 | p.allocated++ 263 | return &p.exprPool[i] 264 | } 265 | return &Expr{} 266 | } 267 | 268 | func (p *Parser) expect(kind tokenKind) Position { 269 | tok := p.lexer.NextToken() 270 | if tok.kind != kind { 271 | throwExpectedFound(tok.pos, kind.String(), tok.kind.String()) 272 | } 273 | return tok.pos 274 | } 275 | 276 | func (p *Parser) parseExpr(precedence int) *Expr { 277 | tok := p.lexer.NextToken() 278 | prefix := p.prefixParselets[tok.kind] 279 | if prefix == nil { 280 | throwUnexpectedToken(tok.pos, tok.String()) 281 | } 282 | left := prefix(tok) 283 | 284 | for precedence < p.precedenceOf(p.lexer.Peek()) { 285 | tok := p.lexer.NextToken() 286 | infix := p.infixParselets[tok.kind] 287 | left = infix(left, tok) 288 | } 289 | 290 | return left 291 | } 292 | 293 | func (p *Parser) parsePrefixElementary(tok token) *Expr { 294 | return p.newExpr(tok2op[tok.kind], tok.pos) 295 | } 296 | 297 | func (p *Parser) parseCharClass(op Operation, tok token) *Expr { 298 | var endPos Position 299 | p.charClass = p.charClass[:0] 300 | for { 301 | p.charClass = append(p.charClass, *p.parseExpr(0)) 302 | next := p.lexer.Peek() 303 | if next.kind == tokRbracket { 304 | endPos = next.pos 305 | p.lexer.NextToken() 306 | break 307 | } 308 | if next.kind == tokNone { 309 | throw(tok.pos, "unterminated '['") 310 | } 311 | } 312 | 313 | result := p.newExpr(op, combinePos(tok.pos, endPos)) 314 | result.Args = append(result.Args, p.charClass...) 
315 | return result 316 | } 317 | 318 | func (p *Parser) parseMinus(left *Expr, tok token) *Expr { 319 | if p.isValidCharRangeOperand(left) { 320 | if p.lexer.Peek().kind != tokRbracket { 321 | right := p.parseExpr(2) 322 | return p.newExpr(OpCharRange, combinePos(left.Pos, right.Pos), left, right) 323 | } 324 | } 325 | p.charClass = append(p.charClass, *left) 326 | return p.newExpr(OpChar, tok.pos) 327 | } 328 | 329 | func (p *Parser) isValidCharRangeOperand(e *Expr) bool { 330 | switch e.Op { 331 | case OpEscapeHex, OpEscapeOctal, OpEscapeMeta, OpChar: 332 | return true 333 | case OpEscapeChar: 334 | switch p.exprValue(e) { 335 | case `\\`, `\|`, `\*`, `\+`, `\?`, `\.`, `\[`, `\^`, `\$`, `\(`, `\)`: 336 | return true 337 | } 338 | } 339 | return false 340 | } 341 | 342 | func (p *Parser) parsePlus(left *Expr, tok token) *Expr { 343 | op := OpPlus 344 | switch left.Op { 345 | case OpPlus, OpStar, OpQuestion, OpRepeat: 346 | op = OpPossessive 347 | } 348 | return p.newExpr(op, combinePos(left.Pos, tok.pos), left) 349 | } 350 | 351 | func (p *Parser) parseQuestion(left *Expr, tok token) *Expr { 352 | op := OpQuestion 353 | switch left.Op { 354 | case OpPlus, OpStar, OpQuestion, OpRepeat: 355 | op = OpNonGreedy 356 | } 357 | return p.newExpr(op, combinePos(left.Pos, tok.pos), left) 358 | } 359 | 360 | func (p *Parser) parseAlt(left *Expr, tok token) *Expr { 361 | var right *Expr 362 | switch p.lexer.Peek().kind { 363 | case tokRparen, tokNone: 364 | // This is needed to handle `(x|)` syntax. 
365 | right = p.newEmpty(tok.pos) 366 | default: 367 | right = p.parseExpr(1) 368 | } 369 | if left.Op == OpAlt { 370 | left.Args = append(left.Args, *right) 371 | left.Pos.End = right.End() 372 | return left 373 | } 374 | return p.newExpr(OpAlt, combinePos(left.Pos, right.Pos), left, right) 375 | } 376 | 377 | func (p *Parser) parseGroupItem(tok token) *Expr { 378 | if p.lexer.Peek().kind == tokRparen { 379 | // This is needed to handle `() syntax.` 380 | return p.newEmpty(tok.pos) 381 | } 382 | return p.parseExpr(0) 383 | } 384 | 385 | func (p *Parser) parseGroup(op Operation, tok token) *Expr { 386 | x := p.parseGroupItem(tok) 387 | result := p.newExpr(op, tok.pos, x) 388 | result.Pos.End = p.expect(tokRparen).End 389 | return result 390 | } 391 | 392 | func (p *Parser) parseNamedCapture(form Form, tok token) *Expr { 393 | prefixLen := len("(?<") 394 | if form == FormDefault { 395 | prefixLen = len("(?P<") 396 | } 397 | name := p.newExpr(OpString, Position{ 398 | Begin: tok.pos.Begin + uint16(prefixLen), 399 | End: tok.pos.End - uint16(len(">")), 400 | }) 401 | x := p.parseGroupItem(tok) 402 | result := p.newExprForm(OpNamedCapture, form, tok.pos, x, name) 403 | result.Pos.End = p.expect(tokRparen).End 404 | return result 405 | } 406 | 407 | func (p *Parser) parseGroupWithFlags(tok token) *Expr { 408 | var result *Expr 409 | val := p.out.Pattern[tok.pos.Begin+1 : tok.pos.End] 410 | switch { 411 | case !strings.HasSuffix(val, ":"): 412 | flags := p.newExpr(OpString, Position{ 413 | Begin: tok.pos.Begin + uint16(len("(?")), 414 | End: tok.pos.End, 415 | }) 416 | result = p.newExpr(OpFlagOnlyGroup, tok.pos, flags) 417 | case val == "?:": 418 | x := p.parseGroupItem(tok) 419 | result = p.newExpr(OpGroup, tok.pos, x) 420 | default: 421 | flags := p.newExpr(OpString, Position{ 422 | Begin: tok.pos.Begin + uint16(len("(?")), 423 | End: tok.pos.End - uint16(len(":")), 424 | }) 425 | x := p.parseGroupItem(tok) 426 | result = p.newExpr(OpGroupWithFlags, tok.pos, x, flags) 
427 | } 428 | result.Pos.End = p.expect(tokRparen).End 429 | return result 430 | } 431 | 432 | func (p *Parser) parseEscape(op Operation, prefix string, tok token) *Expr { 433 | litPos := tok.pos 434 | litPos.Begin += uint16(len(prefix)) 435 | lit := p.newExpr(OpString, litPos) 436 | return p.newExpr(op, tok.pos, lit) 437 | } 438 | 439 | func (p *Parser) precedenceOf(tok token) int { 440 | switch tok.kind { 441 | case tokPipe: 442 | return 1 443 | case tokConcat, tokMinus: 444 | return 2 445 | case tokPlus, tokStar, tokQuestion, tokRepeat: 446 | return 3 447 | default: 448 | return 0 449 | } 450 | } 451 | 452 | func (p *Parser) newPCRE(source string) (*RegexpPCRE, error) { 453 | if source == "" { 454 | return nil, errors.New("empty pattern: can't find delimiters") 455 | } 456 | 457 | delim := source[0] 458 | endDelim := delim 459 | switch delim { 460 | case '(': 461 | endDelim = ')' 462 | case '{': 463 | endDelim = '}' 464 | case '[': 465 | endDelim = ']' 466 | case '<': 467 | endDelim = '>' 468 | case '\\': 469 | return nil, errors.New("'\\' is not a valid delimiter") 470 | default: 471 | if isSpace(delim) { 472 | return nil, errors.New("whitespace is not a valid delimiter") 473 | } 474 | if isAlphanumeric(delim) { 475 | return nil, errors.New("'" + string(delim) + "' is not a valid delimiter") 476 | } 477 | } 478 | 479 | const delimLen = 1 480 | j := strings.LastIndexByte(source[delimLen:], endDelim) 481 | if j == -1 { 482 | return nil, errors.New("can't find '" + string(endDelim) + "' ending delimiter") 483 | } 484 | j += delimLen 485 | 486 | pcre := &RegexpPCRE{ 487 | Pattern: source[delimLen:j], 488 | Source: source, 489 | Delim: [2]byte{delim, endDelim}, 490 | Modifiers: source[j+delimLen:], 491 | } 492 | return pcre, nil 493 | } 494 | 495 | var tok2op = [256]Operation{ 496 | tokDollar: OpDollar, 497 | tokCaret: OpCaret, 498 | tokDot: OpDot, 499 | tokChar: OpChar, 500 | tokMinus: OpChar, 501 | tokPosixClass: OpPosixClass, 502 | tokComment: OpComment, 503 | } 
504 | -------------------------------------------------------------------------------- /syntax/parser_test.go: -------------------------------------------------------------------------------- 1 | package syntax 2 | 3 | import ( 4 | "fmt" 5 | "regexp/syntax" 6 | "strings" 7 | "testing" 8 | ) 9 | 10 | func TestParserErrors(t *testing.T) { 11 | tests := []struct { 12 | pattern string 13 | want string 14 | }{ 15 | {`\`, `unexpected end of pattern: trailing '\'`}, 16 | {`\x`, `unexpected end of pattern: expected hex-digit or '{'`}, 17 | {`\x{12`, `can't find closing '}'`}, 18 | {`(abc`, `expected ')', found 'None'`}, 19 | {`[abc`, `unterminated '['`}, 20 | {`[]`, `unterminated '['`}, 21 | {`[^]`, `unterminated '['`}, 22 | {`\p`, `unexpected end of pattern: expected uni-class-short or '{'`}, 23 | {`\p{L`, `can't find closing '}'`}, 24 | {`(?`, `group token is incomplete`}, 25 | {`(?i`, `group token is incomplete`}, 26 | {`(?:`, `group token is incomplete`}, 27 | } 28 | 29 | p := NewParser(nil) 30 | for _, test := range tests { 31 | _, err := p.Parse(test.pattern) 32 | have := "" 33 | if err != nil { 34 | have = err.Error() 35 | } 36 | if have != test.want { 37 | t.Errorf("parse(%q):\nhave: %s\nwant: %s", 38 | test.pattern, have, test.want) 39 | } 40 | } 41 | } 42 | 43 | func writeExpr(t *testing.T, w *strings.Builder, re *Regexp, e Expr) { 44 | assertBeginPos := func(e Expr, begin uint16) { 45 | if e.Begin() != begin { 46 | t.Errorf("`%s`: %s begin pos mismatch:\nhave: `%s` (begin=%d)\nwant: `%s` (begin=%d)", 47 | re.Pattern, e.Op, 48 | re.Pattern[e.Begin():e.End()], e.Begin(), 49 | re.Pattern[begin:e.End()], begin) 50 | } 51 | } 52 | assertEndPos := func(e Expr, end uint16) { 53 | if e.End() != end { 54 | t.Errorf("`%s`: %s end pos mismatch:\nhave: `%s` (end=%d)\nwant: `%s` (end=%d)", 55 | re.Pattern, e.Op, 56 | re.Pattern[e.Begin():e.End()], e.End(), 57 | re.Pattern[e.Begin():end], end) 58 | } 59 | } 60 | 61 | switch e.Op { 62 | case OpChar, OpString, OpPosixClass, 
OpDot, OpCaret, OpDollar, OpComment: 63 | w.WriteString(e.Value) 64 | 65 | case OpQuote: 66 | assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\Q`))) 67 | w.WriteString(`\Q`) 68 | writeExpr(t, w, re, e.Args[0]) 69 | if e.Form != FormQuoteUnclosed { 70 | w.WriteString(`\E`) 71 | } 72 | 73 | case OpEscapeOctal, OpEscapeChar, OpEscapeMeta: 74 | assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\`))) 75 | w.WriteString(`\`) 76 | writeExpr(t, w, re, e.Args[0]) 77 | 78 | case OpEscapeUni: 79 | switch e.Form { 80 | case FormEscapeUniFull: 81 | assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\p{`))) 82 | assertEndPos(e, e.Args[0].End()+uint16(len(`}`))) 83 | w.WriteString(`\p{`) 84 | writeExpr(t, w, re, e.Args[0]) 85 | w.WriteString(`}`) 86 | default: 87 | assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\p`))) 88 | w.WriteString(`\p`) 89 | writeExpr(t, w, re, e.Args[0]) 90 | } 91 | 92 | case OpEscapeHex: 93 | switch e.Form { 94 | case FormEscapeHexFull: 95 | assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\x{`))) 96 | assertEndPos(e, e.Args[0].End()+uint16(len(`}`))) 97 | w.WriteString(`\x{`) 98 | writeExpr(t, w, re, e.Args[0]) 99 | w.WriteString(`}`) 100 | default: 101 | assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\x`))) 102 | w.WriteString(`\x`) 103 | writeExpr(t, w, re, e.Args[0]) 104 | } 105 | 106 | case OpLiteral: 107 | assertBeginPos(e, e.Args[0].Begin()) 108 | assertEndPos(e, e.LastArg().End()) 109 | for _, a := range e.Args { 110 | writeExpr(t, w, re, a) 111 | } 112 | 113 | case OpCharRange: 114 | assertBeginPos(e, e.Args[0].Begin()) 115 | assertEndPos(e, e.Args[1].End()) 116 | writeExpr(t, w, re, e.Args[0]) 117 | w.WriteByte('-') 118 | writeExpr(t, w, re, e.Args[1]) 119 | 120 | case OpNamedCapture: 121 | assertEndPos(e, e.Args[0].End()+1) 122 | switch e.Form { 123 | case FormNamedCaptureAngle: 124 | fmt.Fprintf(w, "(?<%s>", e.Args[1].Value) 125 | case FormNamedCaptureQuote: 126 | fmt.Fprintf(w, "(?'%s'", e.Args[1].Value) 127 | default: 128 | fmt.Fprintf(w, 
"(?P<%s>", e.Args[1].Value) 129 | } 130 | writeExpr(t, w, re, e.Args[0]) 131 | w.WriteByte(')') 132 | 133 | case OpFlagOnlyGroup: 134 | assertEndPos(e, e.Args[0].End()+1) 135 | w.WriteString("(?") 136 | w.WriteString(e.Args[0].Value) 137 | w.WriteByte(')') 138 | 139 | case OpGroupWithFlags: 140 | assertEndPos(e, e.Args[0].End()+1) 141 | w.WriteString("(?") 142 | w.WriteString(e.Args[1].Value) 143 | w.WriteByte(':') 144 | writeExpr(t, w, re, e.Args[0]) 145 | w.WriteByte(')') 146 | 147 | case OpCapture, OpGroup, OpAtomicGroup, OpPositiveLookahead, OpNegativeLookahead, OpPositiveLookbehind, OpNegativeLookbehind: 148 | assertEndPos(e, e.Args[0].End()+1) 149 | w.WriteByte('(') 150 | switch e.Op { 151 | case OpGroup: 152 | w.WriteString("?:") 153 | case OpAtomicGroup: 154 | w.WriteString("?>") 155 | case OpPositiveLookahead: 156 | w.WriteString("?=") 157 | case OpNegativeLookahead: 158 | w.WriteString("?!") 159 | case OpPositiveLookbehind: 160 | w.WriteString("?<=") 161 | case OpNegativeLookbehind: 162 | w.WriteString("? 0 { 187 | assertEndPos(e, e.LastArg().End()) 188 | } 189 | for _, a := range e.Args { 190 | writeExpr(t, w, re, a) 191 | } 192 | 193 | case OpAlt: 194 | assertBeginPos(e, e.Begin()) 195 | assertEndPos(e, e.LastArg().End()) 196 | for i, a := range e.Args { 197 | writeExpr(t, w, re, a) 198 | if i != len(e.Args)-1 { 199 | w.WriteByte('|') 200 | } 201 | } 202 | 203 | case OpNonGreedy, OpPossessive, OpQuestion, OpPlus, OpStar: 204 | assertBeginPos(e, e.Args[0].Begin()) 205 | assertEndPos(e, e.Args[0].End()+1) 206 | writeExpr(t, w, re, e.Args[0]) 207 | switch e.Op { 208 | case OpNonGreedy, OpQuestion: 209 | w.WriteByte('?') 210 | case OpPossessive, OpPlus: 211 | w.WriteByte('+') 212 | case OpStar: 213 | w.WriteByte('*') 214 | } 215 | 216 | default: 217 | panic(fmt.Sprintf("unhandled %s", e.Op)) 218 | } 219 | } 220 | 221 | func TestWriteExpr(t *testing.T) { 222 | // Tests that ensure that we can print the source regexp 223 | // using the parsed AST. 
224 | // They also verify that AST node positions are correct. 225 | 226 | tests := []struct { 227 | pat string 228 | o1 Operation 229 | o2 Operation 230 | }{ 231 | {pat: `(?#?#)$`, o1: OpDollar, o2: OpComment}, 232 | {pat: `(foobar|baz)*+(?#the comment)`, o1: OpPossessive, o2: OpComment}, 233 | {pat: `abc?+`, o1: OpLiteral, o2: OpPossessive}, 234 | {pat: `x{0}`, o1: OpChar, o2: OpString}, 235 | {pat: `a\x{BAD}`, o1: OpLiteral, o2: OpEscapeHex}, 236 | {pat: `(✓x✓x)`, o1: OpLiteral, o2: OpCapture}, 237 | {pat: `[x][]]`, o1: OpCharClass, o2: OpLiteral}, 238 | {pat: `[A-Za-z0-9-]`, o1: OpCharClass, o2: OpCharRange}, 239 | {pat: `x{1}yz`, o1: OpLiteral, o2: OpRepeat}, 240 | {pat: `x{1,2}y*`, o1: OpRepeat, o2: OpStar}, 241 | {pat: `x{11,30}y+`, o1: OpRepeat, o2: OpPlus}, 242 | {pat: `x{1,}$`, o1: OpRepeat, o2: OpDollar}, 243 | {pat: `\p{Cyrillic}\d`, o1: OpEscapeUni, o2: OpEscapeChar}, 244 | {pat: `x\p{Greek}y+?`, o1: OpEscapeUni, o2: OpNonGreedy}, 245 | {pat: `x\p{L}+y`, o1: OpEscapeUni, o2: OpPlus}, 246 | {pat: `^\pL`, o1: OpEscapeUni, o2: OpCaret}, 247 | {pat: `^x\pLy`, o1: OpEscapeUni, o2: OpCaret}, 248 | {pat: `\d?`, o1: OpEscapeChar, o2: OpQuestion}, 249 | {pat: `[\xC0-\xC6]`, o1: OpCharRange, o2: OpEscapeHex}, 250 | {pat: `\01\xff`, o1: OpEscapeOctal, o2: OpEscapeHex}, 251 | {pat: `\111x\Qabc`, o1: OpEscapeOctal, o2: OpQuote}, 252 | {pat: `x\Qabc\E.(?:s:..)`, o1: OpQuote, o2: OpGroupWithFlags}, 253 | {pat: `(?i:foo[[:^alpha:]])`, o1: OpGroupWithFlags, o2: OpPosixClass}, 254 | {pat: `a[[:digit:]\]]`, o1: OpPosixClass, o2: OpEscapeMeta}, 255 | {pat: `(?:fa*)`, o1: OpGroup, o2: OpStar}, 256 | {pat: `(?:x)|(?:y)`, o1: OpGroup, o2: OpAlt}, 257 | {pat: `(foo|ba?r)`, o1: OpAlt, o2: OpQuestion}, 258 | {pat: `(?P<1>xy\x{F})`, o1: OpNamedCapture, o2: OpEscapeHex}, 259 | {pat: `(?P)[^12]+?(?:[^]]x)`, o1: OpNamedCapture, o2: OpNegCharClass}, 260 | {pat: `()\(`, o1: OpCapture, o2: OpEscapeMeta}, 261 | {pat: `x{1,}?.?.`, o1: OpNonGreedy, o2: OpDot}, 262 | {pat: `(?i)f.o`, o1: 
OpFlagOnlyGroup, o2: OpDot}, 263 | {pat: `(?:(?i)[^a-z]o)`, o1: OpFlagOnlyGroup, o2: OpNegCharClass}, 264 | {pat: `(?:(?Px))`, o1: OpString, o2: OpChar}, 265 | {pat: `(?>atomic){2}.(?=x)`, o1: OpAtomicGroup, o2: OpPositiveLookahead}, 266 | {pat: `(?:(?>g2)g1(?=))`, o1: OpAtomicGroup, o2: OpPositiveLookahead}, 267 | {pat: `(?<=a)|([\\w-]+?):\\s+?(?'var_val'.+?);`, o1: OpNamedCapture}, 272 | {pat: `^ *(#{1,6}) *([^\n]+?) *#* *(?:\n|$)`}, 273 | {pat: `^4\d{12}(\d{3})?$`}, 274 | } 275 | 276 | const minTests = 2 277 | toCover := make(map[Operation]int) 278 | for op := OpNone + 1; op < OpNone2; op++ { 279 | switch op { 280 | case OpConcat: 281 | continue 282 | } 283 | toCover[op] = minTests 284 | } 285 | 286 | exprToString := func(re *Regexp) (s string, err error) { 287 | var b strings.Builder 288 | writeExpr(t, &b, re, re.Expr) 289 | return b.String(), nil 290 | } 291 | 292 | p := NewParser(nil) 293 | for _, test := range tests { 294 | pattern := "_" + test.pat + "_" 295 | re, err := p.Parse(pattern) 296 | if err != nil { 297 | t.Fatalf("parse(%q): %v", test.pat, err) 298 | } 299 | have, err := exprToString(re) 300 | if err != nil { 301 | t.Fatalf("stringify(%q): %v", test.pat, err) 302 | } 303 | want := pattern 304 | if have != want { 305 | t.Fatalf("result mismatch:\nhave: `%s`\nwant: `%s`", have, want) 306 | } 307 | if test.o1 != 0 { 308 | toCover[test.o1]-- 309 | } 310 | if test.o2 != 0 { 311 | toCover[test.o2]-- 312 | if test.o2 == test.o1 { 313 | t.Fatalf("%s: o1==o2", test.pat) 314 | } 315 | } 316 | } 317 | 318 | for op, n := range toCover { 319 | if n > 0 { 320 | t.Errorf("not enough tests for %s: want %d, have %d", 321 | op, minTests, minTests-n) 322 | } 323 | } 324 | } 325 | 326 | func TestParser(t *testing.T) { 327 | tests := []struct { 328 | pattern string 329 | want string 330 | }{ 331 | // Empty pattern. 332 | {``, `{}`}, 333 | 334 | // Anchors. 
335 | {`^`, `^`}, 336 | {`^^`, `{^ ^}`}, 337 | {`$`, `$`}, 338 | {`$$`, `{$ $}`}, 339 | 340 | // Simple literals and chars. 341 | {` `, ` `}, 342 | {` `, ` `}, 343 | {`x`, `x`}, 344 | {`abc`, `abc`}, 345 | {`□`, `□`}, 346 | {`✓`, `✓`}, 347 | {`✓✓`, `✓✓`}, 348 | 349 | // Dots and alternations (or). 350 | {`.`, `.`}, 351 | {`..`, `{. .}`}, 352 | {`...`, `{. . .}`}, 353 | {`.|.`, `(or . .)`}, 354 | {`.|✓|.`, `(or . ✓ .)`}, 355 | {`✓.|.`, `(or {✓ .} .)`}, 356 | {`.|✓.`, `(or . {✓ .})`}, 357 | {`..✓|.`, `(or {. . ✓} .)`}, 358 | {`.|..|..✓`, `(or . {. .} {. . ✓})`}, 359 | {`.|...|..`, `(or . {. . .} {. .})`}, 360 | 361 | // Capturing groups. 362 | {`()`, `(capture {})`}, 363 | {`(.)`, `(capture .)`}, 364 | {`(.✓)`, `(capture {. ✓})`}, 365 | {`(x)|(y)`, `(or (capture x) (capture y))`}, 366 | {`(x)(y)`, `{(capture x) (capture y)}`}, 367 | {`✓(x)y`, `{✓ (capture x) y}`}, 368 | {`a(x1|y1)b`, `{a (capture (or x1 y1)) b}`}, 369 | 370 | // Non-capturing groups without flags. 371 | {`x(?:)y`, `{x (group {}) y}`}, 372 | {`x(?:.)y`, `{x (group .) y}`}, 373 | {`x(?:ab)y`, `{x (group ab) y}`}, 374 | {`(?:a|b)`, `(group (or a b))`}, 375 | {`(?:^a|bc)c`, `{(group (or {^ a} bc)) c}`}, 376 | 377 | // Flag-only groups. 378 | {`x(?i)y`, `{x (flags ?i) y}`}, 379 | {`x(?i-m)y`, `{x (flags ?i-m) y}`}, 380 | {`x(?-im)y`, `{x (flags ?-im) y}`}, 381 | 382 | // Non-capturing groups with flags. 383 | {`x(?i:)y`, `{x (group {} ?i) y}`}, 384 | {`x(?im:.)y`, `{x (group . ?im) y}`}, 385 | {`x(?i-m:ab)y`, `{x (group ab ?i-m) y}`}, 386 | 387 | // Named captures. 388 | {`x(?P)y`, `{x (capture {} g) y}`}, 389 | {`x(?P.)y`, `{x (capture . name) y}`}, 390 | {`x(?Pab)y`, `{x (capture ab x1) y}`}, 391 | {`x(?ab)y`, `{x (capture ab x12) y}`}, 392 | {`x(?'x12'ab)y`, `{x (capture ab x12) y}`}, 393 | 394 | // Atomic groups. PCRE-only. 395 | {`(?>)`, `(atomic {})`}, 396 | {`(?>foo)`, `(atomic foo)`}, 397 | 398 | // Comments. PCRE-only. 
399 | {`a(?#)b`, `{a /*(?#)*/ b}`}, 400 | {`a(?#foo\)b`, `{a /*(?#foo\)*/ b}`}, 401 | 402 | // Quantifiers. 403 | {`x+`, `(+ x)`}, 404 | {`x+|y+`, `(or (+ x) (+ y))`}, 405 | {`x+y+`, `{(+ x) (+ y)}`}, 406 | {`x+y+|z+`, `(or {(+ x) (+ y)} (+ z))`}, 407 | {`(ab)+`, `(+ (capture ab))`}, 408 | {`(.b)+`, `(+ (capture {. b}))`}, 409 | {`x+y*z+`, `{(+ x) (* y) (+ z)}`}, 410 | {`abc+`, `{ab (+ c)}`}, 411 | 412 | // Non-greedy modifiers. 413 | {`x+?|y+?`, `(or (non-greedy (+ x)) (non-greedy (+ y)))`}, 414 | {`x*?|y*?`, `(or (non-greedy (* x)) (non-greedy (* y)))`}, 415 | {`x??|y??`, `(or (non-greedy (? x)) (non-greedy (? y)))`}, 416 | 417 | // Possessive modifiers. PCRE-only. 418 | {`x++|x*+`, `(or (possessive (+ x)) (possessive (* x)))`}, 419 | {`[ab]?+|x{2,}+`, `(or (possessive (? [a b])) (possessive (repeat x {2,})))`}, 420 | 421 | // Escapes and escape chars. 422 | {`\d\d+`, `{\d (+ \d)}`}, 423 | {`\..`, `{\. .}`}, 424 | {`\1`, `\1`}, 425 | {`\✓b`, `{\✓ b}`}, 426 | {`\àb`, `{\à b}`}, 427 | 428 | // Short Unicode escapes. 429 | {`\pL+d`, `{(+ \pL) d}`}, 430 | 431 | // Full Unicode escapes. 432 | {`\p{Greek}\p{L}`, `{\p{Greek} \p{L}}`}, 433 | {`\P{Greek}\p{^L}`, `{\P{Greek} \p{^L}}`}, 434 | 435 | // Octal escapes. 436 | {`\0`, `\0`}, 437 | {`\01`, `\01`}, 438 | {`\012`, `\012`}, 439 | {`\777`, `\777`}, 440 | {`\78`, `{\7 8}`}, 441 | {`\778`, `{\77 8}`}, 442 | 443 | // Short hex escapes. 444 | {`\xfff`, `{\xff f}`}, 445 | {`\xab1`, `{\xab 1}`}, 446 | 447 | // This is not a valid syntax for hex escapes, but PHP-PCRE accepts them. 448 | // Regexp validator can report them, if enabled. 449 | {`\x2[\x3\x4]`, `{\x2 [\x3 \x4]}`}, 450 | 451 | // Full hex escapes. 452 | {`\x{}b`, `{\x{} b}`}, 453 | {`\x{1}b`, `{\x{1} b}`}, 454 | {`\x{ABC}b`, `{\x{ABC} b}`}, 455 | 456 | // Char classes. 
457 | {`[1]`, `[1]`}, 458 | {`[1]a`, `{[1] a}`}, 459 | {`[-a]`, `[- a]`}, 460 | {`[a-]`, `[a -]`}, 461 | {`[a-z]a`, `{[a-z] a}`}, 462 | {`[a-z0-9]`, `[a-z 0-9]`}, 463 | {`[0-9-]`, `[0-9 -]`}, 464 | {`[\da-z_A-Z]`, `[\d a-z _ A-Z]`}, 465 | {`[\(-\)ab]`, `[\(-\) a b]`}, 466 | {`[\]\]\d]a`, `{[\] \] \d] a}`}, 467 | {`[[\[]a`, `{[[ \[] a}`}, 468 | {`[a|b]`, `[a | b]`}, 469 | {`[a+b]`, `[a + b]`}, 470 | {`[a*b]`, `[a * b]`}, 471 | {`[x{1}]`, `[x '{' 1 '}']`}, 472 | {`[]]`, `[]]`}, 473 | {`[][]`, `[] []`}, 474 | 475 | // Negated char classes. 476 | {`[^1]a`, `{[^1] a}`}, 477 | {`[^-a]`, `[^- a]`}, 478 | {`[^a-]`, `[^a -]`}, 479 | {`[^a-z]a`, `{[^a-z] a}`}, 480 | {`[^a-z0-9]`, `[^a-z 0-9]`}, 481 | {`[^\da-z_A-Z]`, `[^\d a-z _ A-Z]`}, 482 | {`[^\(-\)ab]`, `[^\(-\) a b]`}, 483 | {`[^\]\]\d]a`, `{[^\] \] \d] a}`}, 484 | {`[^[\[]a`, `{[^[ \[] a}`}, 485 | {`[^1abc]`, `[^1 a b c]`}, 486 | {`[^]]`, `[^]]`}, 487 | {`[^][]`, `[^] []`}, 488 | {`[^\040\041\043-\133\135-\176]`, `[^\040 \041 \043-\133 \135-\176]`}, 489 | 490 | // Char class ranges. 491 | // We parse a-\d and it's something that should be 492 | // handled by post-parsing validator. 493 | {`[\d-a]`, `[\d - a]`}, 494 | {`[a-\d]`, `[a-\d]`}, 495 | {`[\pL0-9]`, `[\pL 0-9]`}, 496 | {`[+--]`, `[+--]`}, 497 | {`[--+]`, `[--+]`}, 498 | {`[---]`, `[---]`}, 499 | {`[-]`, `[-]`}, 500 | {`[\x20-\x7f]`, `[\x20-\x7f]`}, 501 | {`[\x{20}-\x{7f}]`, `[\x{20}-\x{7f}]`}, 502 | {`[\1-\3]`, `[\1-\3]`}, 503 | {`[\10-\20]`, `[\10-\20]`}, 504 | {`[❤-❤a]`, `[❤-❤ a]`}, 505 | 506 | // Char class with meta symbols. 507 | {`[|]`, `[|]`}, 508 | {`[$.+*^?]`, `[$ . + * ^ ?]`}, 509 | {`[^$.+*^?]`, `[^$ . + * ^ ?]`}, 510 | 511 | // Posix char classes. 512 | {`x[:alpha:]y`, `{x [: a l p h a :] y}`}, 513 | {`x[a[:alpha:]]y`, `{x [a [:alpha:]] y}`}, 514 | {`x[[:^alpha:]]y`, `{x [[:^alpha:]] y}`}, 515 | {`x[^[:alpha:]]y`, `{x [^[:alpha:]] y}`}, 516 | {`x[^[:^alpha:]]y`, `{x [^[:^alpha:]] y}`}, 517 | 518 | // Valid repeat expressions. 
519 | {`.{3}`, `(repeat . {3})`}, 520 | {`.{3,}`, `(repeat . {3,})`}, 521 | {`.{3,6}`, `(repeat . {3,6})`}, 522 | {`.{6}?`, `(non-greedy (repeat . {6}))`}, 523 | {`[a-z]{5}`, `(repeat [a-z] {5})`}, 524 | 525 | // Invalid repeat expressions are parsed as normal chars. 526 | {`.{a}`, `{. {a}}`}, 527 | {`.{-1}`, `{. {-1}}`}, 528 | 529 | // \Q...\E escape. 530 | {`\Qa.b\E+z`, `{(+ (q \Qa.b\E)) z}`}, 531 | {`x\Q?\Ey`, `{x (q \Q?\E) y}`}, 532 | {`x\Q\Ey`, `{x (q \Q\E) y}`}, 533 | {`x\Q`, `{x (q \Q)}`}, 534 | {`x\Qy`, `{x (q \Qy)}`}, 535 | {`x\Qyz`, `{x (q \Qyz)}`}, 536 | 537 | // Incomplete `x|` and `|x` expressions are valid. 538 | {`(docker-|)`, `(capture (or docker- {}))`}, 539 | {`x|`, `(or x {})`}, 540 | {`|x`, `(or {} x)`}, 541 | {`(|x|y)`, `(capture (or {} x y))`}, 542 | {`(?:|x)`, `(group (or {} x))`}, 543 | 544 | // More tests for char merging. 545 | {`xy+`, `{x (+ y)}`}, 546 | {`.xy`, `{. xy}`}, 547 | {`foo?|bar`, `(or {fo (? o)} bar)`}, 548 | 549 | // Tests from the patterns found in various GitHub projects. 550 | {`Adm([^i]|$)`, `{Adm (capture (or [^i] $))}`}, 551 | {`\.(com|com\.\w{2})$`, `{\. (capture (or com {com \. 
(repeat \w {2})})) $}`}, 552 | {`(?i)a(?:x|y)b`, `{(flags ?i) a (group (or x y)) b}`}, 553 | } 554 | 555 | p := NewParser(nil) 556 | for _, test := range tests { 557 | re, err := p.Parse(test.pattern) 558 | if err != nil { 559 | t.Fatalf("parse(%q) error: %v", test.pattern, err) 560 | } 561 | have := formatSyntax(re) 562 | if have != test.want { 563 | t.Fatalf("parse(%q):\nhave: %s\nwant: %s", 564 | test.pattern, have, test.want) 565 | } 566 | } 567 | } 568 | 569 | func formatSyntax(re *Regexp) string { 570 | return formatExprSyntax(re, re.Expr) 571 | } 572 | 573 | func formatExprSyntax(re *Regexp, e Expr) string { 574 | switch e.Op { 575 | case OpChar, OpLiteral: 576 | switch e.Value { 577 | case "{": 578 | return "'{'" 579 | case "}": 580 | return "'}'" 581 | default: 582 | return e.Value 583 | } 584 | case OpString, OpEscapeChar, OpEscapeMeta, OpEscapeOctal, OpEscapeUni, OpEscapeHex, OpPosixClass: 585 | return e.Value 586 | case OpRepeat: 587 | return fmt.Sprintf("(repeat %s %s)", formatExprSyntax(re, e.Args[0]), e.Args[1].Value) 588 | case OpCaret: 589 | return "^" 590 | case OpDollar: 591 | return "$" 592 | case OpDot: 593 | return "." 
594 | case OpQuote: 595 | return fmt.Sprintf("(q %s)", e.Value) 596 | case OpCharRange: 597 | return fmt.Sprintf("%s-%s", formatExprSyntax(re, e.Args[0]), formatExprSyntax(re, e.Args[1])) 598 | case OpCharClass: 599 | return fmt.Sprintf("[%s]", formatArgsSyntax(re, e.Args)) 600 | case OpNegCharClass: 601 | return fmt.Sprintf("[^%s]", formatArgsSyntax(re, e.Args)) 602 | case OpConcat: 603 | return fmt.Sprintf("{%s}", formatArgsSyntax(re, e.Args)) 604 | case OpAlt: 605 | return fmt.Sprintf("(or %s)", formatArgsSyntax(re, e.Args)) 606 | case OpCapture: 607 | return fmt.Sprintf("(capture %s)", formatExprSyntax(re, e.Args[0])) 608 | case OpNamedCapture: 609 | return fmt.Sprintf("(capture %s %s)", formatExprSyntax(re, e.Args[0]), e.Args[1].Value) 610 | case OpGroup: 611 | return fmt.Sprintf("(group %s)", formatExprSyntax(re, e.Args[0])) 612 | case OpAtomicGroup: 613 | return fmt.Sprintf("(atomic %s)", formatExprSyntax(re, e.Args[0])) 614 | case OpGroupWithFlags: 615 | return fmt.Sprintf("(group %s ?%s)", formatExprSyntax(re, e.Args[0]), e.Args[1].Value) 616 | case OpFlagOnlyGroup: 617 | return fmt.Sprintf("(flags ?%s)", formatExprSyntax(re, e.Args[0])) 618 | case OpPositiveLookahead: 619 | return fmt.Sprintf("(?= %s)", formatExprSyntax(re, e.Args[0])) 620 | case OpNegativeLookahead: 621 | return fmt.Sprintf("(?! 
%s)", formatExprSyntax(re, e.Args[0])) 622 | case OpPositiveLookbehind: 623 | return fmt.Sprintf("(?<= %s)", formatExprSyntax(re, e.Args[0])) 624 | case OpNegativeLookbehind: 625 | return fmt.Sprintf("(?", e.Op) 640 | } 641 | } 642 | 643 | func formatArgsSyntax(re *Regexp, args []Expr) string { 644 | parts := make([]string, len(args)) 645 | for i, e := range args { 646 | parts[i] = formatExprSyntax(re, e) 647 | } 648 | return strings.Join(parts, " ") 649 | } 650 | 651 | // To run benchmarks: 652 | // $ go-benchrun ParserStdlib ParserPratt -count 5 653 | var benchmarkTests = []*struct { 654 | name string 655 | pattern string 656 | }{ 657 | {`lit`, `\+\.1234foobarbaz✓✓□□`}, 658 | {`alt`, `(x|y|1)|z|$`}, 659 | {`esc`, `\w\d\pL\123\059\p{L}\p{^Greek}`}, 660 | {`charclass`, `[a-z0-9_][^\d][\(-\)][1234][[[][a-][-a]`}, 661 | {`posix`, `[[:alpha:][:blank:][:^word:]][[:^digit:]]`}, 662 | {`meta`, `x+y*z?.*?.+?.??`}, 663 | {`repeat`, `x{3,}\d{1,4}y{5}z{0}`}, 664 | {`group`, `(?:x)(?i:(?i))(x)(?Px)`}, 665 | {`quote`, `\Qhttp://a.b.com/?x[]=1\E`}, 666 | } 667 | 668 | func BenchmarkParserPratt(b *testing.B) { 669 | for _, test := range benchmarkTests { 670 | b.Run(test.name, func(b *testing.B) { 671 | p := NewParser(nil) 672 | b.ResetTimer() 673 | for i := 0; i < b.N; i++ { 674 | _, err := p.Parse(test.pattern) 675 | if err != nil { 676 | b.Fatal(err) 677 | } 678 | } 679 | }) 680 | } 681 | } 682 | 683 | func BenchmarkParserStdlib(b *testing.B) { 684 | for _, test := range benchmarkTests { 685 | b.Run(test.name, func(b *testing.B) { 686 | for i := 0; i < b.N; i++ { 687 | _, err := syntax.Parse(test.pattern, syntax.Perl) 688 | if err != nil { 689 | b.Fatal(err) 690 | } 691 | } 692 | }) 693 | } 694 | } 695 | --------------------------------------------------------------------------------