├── syntax
    ├── go.mod
    ├── pos.go
    ├── utils.go
    ├── errors.go
    ├── README.md
    ├── ast.go
    ├── pcre_test.go
    ├── tokenkind_string.go
    ├── operation_string.go
    ├── operation.go
    ├── lexer_test.go
    ├── lexer.go
    ├── parser.go
    └── parser_test.go
├── README.md
└── LICENSE


/syntax/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/quasilyte/regex/syntax
2 | 
3 | go 1.14
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # regex - [regular expression](https://en.wikipedia.org/wiki/Regular_expression) libraries for Go
2 | 
3 | ## Packages
4 | 
5 | * [syntax](/syntax) - regexp parser and AST definitions
6 | 


--------------------------------------------------------------------------------
/syntax/pos.go:
--------------------------------------------------------------------------------
 1 | package syntax
 2 | 
// Position is a span inside a regexp pattern, stored as a pair of offsets.
//
// Both offsets are uint16, so values above 65535 cannot be represented.
type Position struct {
	// Begin is the offset where the span starts.
	Begin uint16
	// End is the offset where the span ends.
	End   uint16
}
 7 | 
 8 | func combinePos(begin, end Position) Position {
 9 | 	return Position{Begin: begin.Begin, End: end.End}
10 | }
11 | 


--------------------------------------------------------------------------------
/syntax/utils.go:
--------------------------------------------------------------------------------
 1 | package syntax
 2 | 
 3 | func isSpace(ch byte) bool {
 4 | 	switch ch {
 5 | 	case '\r', '\n', '\t', '\f', '\v', ' ':
 6 | 		return true
 7 | 	default:
 8 | 		return false
 9 | 	}
10 | }
11 | 
// isAlphanumeric reports whether ch is an ASCII letter or decimal digit.
func isAlphanumeric(ch byte) bool {
	switch {
	case 'a' <= ch && ch <= 'z':
		return true
	case 'A' <= ch && ch <= 'Z':
		return true
	case '0' <= ch && ch <= '9':
		return true
	}
	return false
}
17 | 
// isDigit reports whether ch is an ASCII decimal digit.
func isDigit(ch byte) bool {
	if ch < '0' {
		return false
	}
	return ch <= '9'
}
21 | 
// isOctalDigit reports whether ch is an ASCII octal digit ('0' through '7').
func isOctalDigit(ch byte) bool {
	outOfRange := ch < '0' || ch > '7'
	return !outOfRange
}
25 | 
// isHexDigit reports whether ch is an ASCII hexadecimal digit in
// either letter case.
func isHexDigit(ch byte) bool {
	switch {
	case '0' <= ch && ch <= '9',
		'a' <= ch && ch <= 'f',
		'A' <= ch && ch <= 'F':
		return true
	default:
		return false
	}
}
31 | 


--------------------------------------------------------------------------------
/syntax/errors.go:
--------------------------------------------------------------------------------
 1 | package syntax
 2 | 
// ParseError is an error value that carries the pattern position it
// relates to. The throw helper in this file panics with a ParseError.
type ParseError struct {
	// Pos is the pattern span associated with the error.
	Pos     Position
	// Message is the error text; Error returns it unchanged.
	Message string
}
 7 | 
 8 | func (e ParseError) Error() string { return e.Message }
 9 | 
10 | func throw(pos Position, message string) {
11 | 	panic(ParseError{Pos: pos, Message: message})
12 | }
13 | 
14 | func throwExpectedFound(pos Position, expected, found string) {
15 | 	throw(pos, "expected '"+expected+"', found '"+found+"'")
16 | }
17 | 
18 | func throwUnexpectedToken(pos Position, token string) {
19 | 	throw(pos, "unexpected token: "+token)
20 | }
21 | 
22 | func newPos(begin, end int) Position {
23 | 	return Position{
24 | 		Begin: uint16(begin),
25 | 		End:   uint16(end),
26 | 	}
27 | }
28 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Iskander (Alex) Sharipov / quasilyte
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/syntax/README.md:
--------------------------------------------------------------------------------
 1 | # Package `regex/syntax`
 2 | 
 3 | Package `syntax` provides a regular expression parser as well as AST definitions.
 4 | 
 5 | ## Rationale
 6 | 
 7 | The advantages of this package over stdlib [regexp/syntax](https://golang.org/pkg/regexp/syntax/):
 8 | 
 9 | 1. Does not perform transformations/optimizations during parsing.
10 |    The produced parse tree is lossless.
11 | 
12 | 2. Simpler AST representation.
13 | 
14 | 3. Can parse most PCRE operations in addition to [re2](https://github.com/google/re2/wiki/Syntax) syntax.
15 |    It can also handle PHP/Perl style patterns with delimiters.
16 | 
17 | 4. This package is easier to extend than something from the standard library.
18 | 
19 | This package makes almost no assumptions about how the generated AST is going to be used,
20 | so it preserves as much syntax information as possible.
21 | 
22 | It's easy to write another intermediate representation on top of it. The main
23 | function of this package is to convert a textual regexp pattern into a more
24 | structured form that can be processed more easily.
25 | 
26 | ## Users
27 | 
28 | * [go-critic](https://github.com/go-critic/go-critic) - Go static analyzer
29 | * [NoVerify](https://github.com/VKCOM/noverify) - PHP static analyzer
30 | 


--------------------------------------------------------------------------------
/syntax/ast.go:
--------------------------------------------------------------------------------
 1 | package syntax
 2 | 
 3 | import (
 4 | 	"strings"
 5 | )
 6 | 
// Regexp pairs a pattern string with its expression tree.
type Regexp struct {
	// Pattern is the regexp text.
	// NOTE(review): presumably the exact string handed to the parser — confirm in parser.go.
	Pattern string
	// Expr is the expression tree for Pattern.
	Expr    Expr
}
11 | 
// RegexpPCRE is a delimited PHP/Perl-style pattern split into its parts.
//
// As exercised by TestParsePCRE, a source such as `{pcre pattern}smi`
// yields Pattern "pcre pattern", Delim "{}" and Modifiers "smi".
type RegexpPCRE struct {
	// Pattern is the text between the delimiters.
	Pattern string
	// Expr is the expression tree for Pattern.
	Expr    Expr

	// Source is presumably the full input including delimiters and
	// modifiers — TODO confirm against the parser.
	Source    string
	// Modifiers is the text that follows the closing delimiter.
	Modifiers string
	// Delim holds the opening and closing delimiter bytes, in that order.
	Delim     [2]byte
}
20 | 
21 | func (re *RegexpPCRE) HasModifier(mod byte) bool {
22 | 	return strings.IndexByte(re.Modifiers, mod) >= 0
23 | }
24 | 
// Expr is a node of the regexp expression tree.
type Expr struct {
	// The operations that this expression performs. See `operation.go`.
	Op Operation

	// Form selects a syntax variant of Op; it is one of the Form
	// constants declared in `operation.go`.
	Form Form

	_ [2]byte // Reserved

	// Pos describes a source location inside regexp pattern.
	Pos Position

	// Args is a list of sub-expressions of this expression.
	//
	// See Operation constants documentation to learn how to
	// interpret the particular expression args.
	Args []Expr

	// Value holds expression textual value.
	//
	// Usually, that value is identical to src[Begin():End()],
	// but this is not true for programmatically generated objects.
	Value string
}
48 | 
49 | // Begin returns expression leftmost offset.
50 | func (e Expr) Begin() uint16 { return e.Pos.Begin }
51 | 
52 | // End returns expression rightmost offset.
53 | func (e Expr) End() uint16 { return e.Pos.End }
54 | 
55 | // LastArg returns expression last argument.
56 | //
57 | // Should not be called on expressions that may have 0 arguments.
58 | func (e Expr) LastArg() Expr {
59 | 	return e.Args[len(e.Args)-1]
60 | }
61 | 
// Operation identifies what an Expr does; its values are the Op*
// constants declared in `operation.go`.
type Operation byte
63 | 
// Form identifies a syntax variant of an operation; its values are
// the Form* constants declared in `operation.go`.
type Form byte
65 | 


--------------------------------------------------------------------------------
/syntax/pcre_test.go:
--------------------------------------------------------------------------------
 1 | package syntax
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func TestParserErrorsPCRE(t *testing.T) {
 9 | 	tests := []struct {
10 | 		pattern string
11 | 		want    string
12 | 	}{
13 | 		{``, `empty pattern: can't find delimiters`},
14 | 		{`aba`, `'a' is not a valid delimiter`},
15 | 		{` aa `, `whitespace is not a valid delimiter`},
16 | 		{`/abc`, `can't find '/' ending delimiter`},
17 | 		{`#abc`, `can't find '#' ending delimiter`},
18 | 	}
19 | 
20 | 	p := NewParser(nil)
21 | 	for _, test := range tests {
22 | 		_, err := p.ParsePCRE(test.pattern)
23 | 		have := "<nil>"
24 | 		if err != nil {
25 | 			have = err.Error()
26 | 		}
27 | 		if have != test.want {
28 | 			t.Errorf("parse(%q):\nhave: %s\nwant: %s",
29 | 				test.pattern, have, test.want)
30 | 		}
31 | 	}
32 | }
33 | 
34 | func TestParsePCRE(t *testing.T) {
35 | 	tests := []struct {
36 | 		source string
37 | 
38 | 		wantPattern   string
39 | 		wantDelim     string
40 | 		wantModifiers string
41 | 	}{
42 | 		{`@@`, "", "@@", ""},
43 | 		{`//i`, "", "//", "i"},
44 | 		{`#hello#`, "hello", "##", ""},
45 | 		{`{pcre pattern}smi`, "pcre pattern", "{}", "smi"},
46 | 		{`<an[o]ther (example)!>ms`, "an[o]ther (example)!", "<>", "ms"},
47 | 		{`/clipFrom/([0-9]+)`, "clipFrom", "//", "([0-9]+)"},
48 | 	}
49 | 
50 | 	p := NewParser(nil)
51 | 	for _, test := range tests {
52 | 		pcre, err := p.ParsePCRE(test.source)
53 | 		if err != nil {
54 | 			t.Fatalf("parse(%q): error: %v", test.source, err)
55 | 		}
56 | 		if pcre.Pattern != test.wantPattern {
57 | 			t.Fatalf("parse(%q): pattern mismatch:\nhave: `%s`\nwant: `%s`",
58 | 				test.source, pcre.Pattern, test.wantPattern)
59 | 		}
60 | 		haveDelim := fmt.Sprintf("%c%c", pcre.Delim[0], pcre.Delim[1])
61 | 		if haveDelim != test.wantDelim {
62 | 			t.Fatalf("parse(%q): delimiter mismatch:\nhave: `%s`\nwant: `%s`",
63 | 				test.source, haveDelim, test.wantDelim)
64 | 		}
65 | 		if pcre.Modifiers != test.wantModifiers {
66 | 			t.Fatalf("parse(%q): modifiers mismatch:\nhave: `%s`\nwant: `%s`",
67 | 				test.source, pcre.Modifiers, test.wantModifiers)
68 | 		}
69 | 	}
70 | }
71 | 


--------------------------------------------------------------------------------
/syntax/tokenkind_string.go:
--------------------------------------------------------------------------------
 1 | // Code generated by "stringer -type=tokenKind -trimprefix=tok -linecomment=true"; DO NOT EDIT.
 2 | 
 3 | package syntax
 4 | 
 5 | import "strconv"
 6 | 
 7 | func _() {
 8 | 	// An "invalid array index" compiler error signifies that the constant values have changed.
 9 | 	// Re-run the stringer command to generate them again.
10 | 	var x [1]struct{}
11 | 	_ = x[tokNone-0]
12 | 	_ = x[tokChar-1]
13 | 	_ = x[tokGroupFlags-2]
14 | 	_ = x[tokPosixClass-3]
15 | 	_ = x[tokConcat-4]
16 | 	_ = x[tokRepeat-5]
17 | 	_ = x[tokEscapeChar-6]
18 | 	_ = x[tokEscapeMeta-7]
19 | 	_ = x[tokEscapeOctal-8]
20 | 	_ = x[tokEscapeUni-9]
21 | 	_ = x[tokEscapeUniFull-10]
22 | 	_ = x[tokEscapeHex-11]
23 | 	_ = x[tokEscapeHexFull-12]
24 | 	_ = x[tokComment-13]
25 | 	_ = x[tokQ-14]
26 | 	_ = x[tokMinus-15]
27 | 	_ = x[tokLbracket-16]
28 | 	_ = x[tokLbracketCaret-17]
29 | 	_ = x[tokRbracket-18]
30 | 	_ = x[tokDollar-19]
31 | 	_ = x[tokCaret-20]
32 | 	_ = x[tokQuestion-21]
33 | 	_ = x[tokDot-22]
34 | 	_ = x[tokPlus-23]
35 | 	_ = x[tokStar-24]
36 | 	_ = x[tokPipe-25]
37 | 	_ = x[tokLparen-26]
38 | 	_ = x[tokLparenName-27]
39 | 	_ = x[tokLparenNameAngle-28]
40 | 	_ = x[tokLparenNameQuote-29]
41 | 	_ = x[tokLparenFlags-30]
42 | 	_ = x[tokLparenAtomic-31]
43 | 	_ = x[tokLparenPositiveLookahead-32]
44 | 	_ = x[tokLparenPositiveLookbehind-33]
45 | 	_ = x[tokLparenNegativeLookahead-34]
46 | 	_ = x[tokLparenNegativeLookbehind-35]
47 | 	_ = x[tokRparen-36]
48 | }
49 | 
50 | const _tokenKind_name = "NoneCharGroupFlagsPosixClassConcatRepeatEscapeCharEscapeMetaEscapeOctalEscapeUniEscapeUniFullEscapeHexEscapeHexFullComment\\Q-[[^]$^?.+*|((?P<name>(?<name>(?'name'(?flags(?>(?=(?<=(?!(?<!)"
51 | 
52 | var _tokenKind_index = [...]uint8{0, 4, 8, 18, 28, 34, 40, 50, 60, 71, 80, 93, 102, 115, 122, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 146, 154, 162, 169, 172, 175, 179, 182, 186, 187}
53 | 
54 | func (i tokenKind) String() string {
55 | 	if i >= tokenKind(len(_tokenKind_index)-1) {
56 | 		return "tokenKind(" + strconv.FormatInt(int64(i), 10) + ")"
57 | 	}
58 | 	return _tokenKind_name[_tokenKind_index[i]:_tokenKind_index[i+1]]
59 | }
60 | 


--------------------------------------------------------------------------------
/syntax/operation_string.go:
--------------------------------------------------------------------------------
 1 | // Code generated by "stringer -type=Operation -trimprefix=Op"; DO NOT EDIT.
 2 | 
 3 | package syntax
 4 | 
 5 | import "strconv"
 6 | 
 7 | func _() {
 8 | 	// An "invalid array index" compiler error signifies that the constant values have changed.
 9 | 	// Re-run the stringer command to generate them again.
10 | 	var x [1]struct{}
11 | 	_ = x[OpNone-0]
12 | 	_ = x[OpConcat-1]
13 | 	_ = x[OpDot-2]
14 | 	_ = x[OpAlt-3]
15 | 	_ = x[OpStar-4]
16 | 	_ = x[OpPlus-5]
17 | 	_ = x[OpQuestion-6]
18 | 	_ = x[OpNonGreedy-7]
19 | 	_ = x[OpPossessive-8]
20 | 	_ = x[OpCaret-9]
21 | 	_ = x[OpDollar-10]
22 | 	_ = x[OpLiteral-11]
23 | 	_ = x[OpChar-12]
24 | 	_ = x[OpString-13]
25 | 	_ = x[OpQuote-14]
26 | 	_ = x[OpEscapeChar-15]
27 | 	_ = x[OpEscapeMeta-16]
28 | 	_ = x[OpEscapeOctal-17]
29 | 	_ = x[OpEscapeHex-18]
30 | 	_ = x[OpEscapeUni-19]
31 | 	_ = x[OpCharClass-20]
32 | 	_ = x[OpNegCharClass-21]
33 | 	_ = x[OpCharRange-22]
34 | 	_ = x[OpPosixClass-23]
35 | 	_ = x[OpRepeat-24]
36 | 	_ = x[OpCapture-25]
37 | 	_ = x[OpNamedCapture-26]
38 | 	_ = x[OpGroup-27]
39 | 	_ = x[OpGroupWithFlags-28]
40 | 	_ = x[OpAtomicGroup-29]
41 | 	_ = x[OpPositiveLookahead-30]
42 | 	_ = x[OpNegativeLookahead-31]
43 | 	_ = x[OpPositiveLookbehind-32]
44 | 	_ = x[OpNegativeLookbehind-33]
45 | 	_ = x[OpFlagOnlyGroup-34]
46 | 	_ = x[OpComment-35]
47 | 	_ = x[OpNone2-36]
48 | }
49 | 
50 | const _Operation_name = "NoneConcatDotAltStarPlusQuestionNonGreedyPossessiveCaretDollarLiteralCharStringQuoteEscapeCharEscapeMetaEscapeOctalEscapeHexEscapeUniCharClassNegCharClassCharRangePosixClassRepeatCaptureNamedCaptureGroupGroupWithFlagsAtomicGroupPositiveLookaheadNegativeLookaheadPositiveLookbehindNegativeLookbehindFlagOnlyGroupCommentNone2"
51 | 
52 | var _Operation_index = [...]uint16{0, 4, 10, 13, 16, 20, 24, 32, 41, 51, 56, 62, 69, 73, 79, 84, 94, 104, 115, 124, 133, 142, 154, 163, 173, 179, 186, 198, 203, 217, 228, 245, 262, 280, 298, 311, 318, 323}
53 | 
54 | func (i Operation) String() string {
55 | 	if i >= Operation(len(_Operation_index)-1) {
56 | 		return "Operation(" + strconv.FormatInt(int64(i), 10) + ")"
57 | 	}
58 | 	return _Operation_name[_Operation_index[i]:_Operation_index[i+1]]
59 | }
60 | 


--------------------------------------------------------------------------------
/syntax/operation.go:
--------------------------------------------------------------------------------
  1 | package syntax
  2 | 
  3 | //go:generate stringer -type=Operation -trimprefix=Op
const (
	// OpNone is the zero value of Operation. Together with OpNone2 it
	// brackets the range of real operations (see OpNone2).
	OpNone Operation = iota

	// OpConcat is a concatenation of ops.
	// Examples: `xy` `abc\d` ``
	// Args - concatenated ops
	//
	// As a special case, OpConcat with 0 Args is used for "empty"
	// set of operations.
	OpConcat

	// OpDot is a '.' wildcard.
	OpDot

	// OpAlt is x|y alternation of ops.
	// Examples: `a|bc` `x(.*?)|y(.*?)`
	// Args - union-connected regexp branches
	OpAlt

	// OpStar is a shorthand for {0,} repetition.
	// Examples: `x*`
	// Args[0] - repeated expression
	OpStar

	// OpPlus is a shorthand for {1,} repetition.
	// Examples: `x+`
	// Args[0] - repeated expression
	OpPlus

	// OpQuestion is a shorthand for {0,1} repetition.
	// Examples: `x?`
	// Args[0] - repeated expression
	OpQuestion

	// OpNonGreedy makes its operand quantifier non-greedy.
	// Examples: `x??` `x*?` `x+?`
	// Args[0] - quantified expression
	OpNonGreedy

	// OpPossessive makes its operand quantifier possessive.
	// Examples: `x?+` `x*+` `x++`
	// Args[0] - quantified expression
	OpPossessive

	// OpCaret is ^ anchor.
	OpCaret

	// OpDollar is $ anchor.
	OpDollar

	// OpLiteral is a collection of consecutive chars.
	// Examples: `ab` `10x`
	// Args - enclosed characters (OpChar)
	OpLiteral

	// OpChar is a single literal pattern character.
	// Examples: `a` `6` `ф`
	OpChar

	// OpString is an artificial element that is used in other expressions.
	OpString

	// OpQuote is a \Q...\E enclosed literal.
	// Examples: `\Q.?\E` `\Q?q[]=1`
	// FormQuoteUnclosed: `\Qabc`
	// Args[0] - literal value (OpString)
	OpQuote

	// OpEscapeChar is a single char escape.
	// Examples: `\d` `\a` `\n`
	// Args[0] - escaped value (OpString)
	OpEscapeChar

	// OpEscapeMeta is an escaped meta char.
	// Examples: `\(` `\[` `\+`
	// Args[0] - escaped value (OpString)
	OpEscapeMeta

	// OpEscapeOctal is an octal char code escape (up to 3 digits).
	// Examples: `\123` `\12`
	// Args[0] - escaped value (OpString)
	OpEscapeOctal

	// OpEscapeHex is a hex char code escape.
	// Examples: `\x7F` `\xF7`
	// FormEscapeHexFull examples: `\x{10FFFF}` `\x{F}`.
	// Args[0] - escaped value (OpString)
	OpEscapeHex

	// OpEscapeUni is a Unicode char class escape.
	// Examples: `\pS` `\pL` `\PL`
	// FormEscapeUniFull examples: `\p{Greek}` `\p{Symbol}` `\p{^L}`
	// Args[0] - escaped value (OpString)
	OpEscapeUni

	// OpCharClass is a char class enclosed in [].
	// Examples: `[abc]` `[a-z0-9\]]`
	// Args - char class elements (can include OpCharRange and OpPosixClass)
	OpCharClass

	// OpNegCharClass is a negated char class enclosed in [].
	// Examples: `[^abc]` `[^a-z0-9\]]`
	// Args - char class elements (can include OpCharRange and OpPosixClass)
	OpNegCharClass

	// OpCharRange is an inclusive char range inside a char class.
	// Examples: `0-9` `A-Z`
	// Args[0] - range lower bound
	// Args[1] - range upper bound
	OpCharRange

	// OpPosixClass is a named ASCII char set inside a char class.
	// Examples: `[:alpha:]` `[:blank:]`
	OpPosixClass

	// OpRepeat is a {min,max} repetition quantifier.
	// Examples: `x{5}` `x{min,max}` `x{min,}`
	// Args[0] - repeated expression
	// Args[1] - repeat count (OpString)
	OpRepeat

	// OpCapture is `(re)` capturing group.
	// Examples: `(abc)` `(x|y)`
	// Args[0] - enclosed expression
	OpCapture

	// OpNamedCapture is `(?P<name>re)` capturing group.
	// Examples: `(?P<foo>abc)` `(?P<name>x|y)`
	// FormNamedCaptureAngle examples: `(?<foo>abc)` `(?<name>x|y)`
	// FormNamedCaptureQuote examples: `(?'foo'abc)` `(?'name'x|y)`
	// Args[0] - enclosed expression (OpConcat with 0 args for empty group)
	// Args[1] - group name (OpString)
	OpNamedCapture

	// OpGroup is `(?:re)` non-capturing group.
	// Examples: `(?:abc)` `(?:x|y)`
	// Args[0] - enclosed expression (OpConcat with 0 args for empty group)
	OpGroup

	// OpGroupWithFlags is `(?flags:re)` non-capturing group.
	// Examples: `(?i:abc)` `(?i:x|y)`
	// Args[0] - enclosed expression (OpConcat with 0 args for empty group)
	// Args[1] - flags (OpString)
	OpGroupWithFlags

	// OpAtomicGroup is `(?>re)` non-capturing group without backtracking.
	// Examples: `(?>foo)` `(?>)`
	// Args[0] - enclosed expression (OpConcat with 0 args for empty group)
	OpAtomicGroup

	// OpPositiveLookahead is `(?=re)` asserts that following text matches re.
	// Examples: `(?=foo)`
	// Args[0] - enclosed expression (OpConcat with 0 args for empty group)
	OpPositiveLookahead

	// OpNegativeLookahead is `(?!re)` asserts that following text doesn't match re.
	// Examples: `(?!foo)`
	// Args[0] - enclosed expression (OpConcat with 0 args for empty group)
	OpNegativeLookahead

	// OpPositiveLookbehind is `(?<=re)` asserts that preceding text matches re.
	// Examples: `(?<=foo)`
	// Args[0] - enclosed expression (OpConcat with 0 args for empty group)
	OpPositiveLookbehind

	// OpNegativeLookbehind is `(?<!re)` asserts that preceding text doesn't match re.
	// Examples: `(?<!foo)`
	// Args[0] - enclosed expression (OpConcat with 0 args for empty group)
	OpNegativeLookbehind

	// OpFlagOnlyGroup is `(?flags)` form that affects current group flags.
	// Examples: `(?i)` `(?i-m)` `(?-im)`
	// Args[0] - flags (OpString)
	OpFlagOnlyGroup

	// OpComment is a group-like regexp comment expression.
	// Examples: `(?#text)` `(?#)`
	OpComment

	// OpNone2 is a sentinel value that is never part of the AST.
	// OpNone and OpNone2 can be used to cover all ops in a range.
	OpNone2
)
187 | 
// Form values select a syntax variant of an operation. The examples
// are taken from the documentation of the Operation constants.
const (
	// FormDefault is the zero value of Form.
	FormDefault Form = iota
	// FormEscapeHexFull is the braced OpEscapeHex form: `\x{10FFFF}` `\x{F}`.
	FormEscapeHexFull
	// FormEscapeUniFull is the braced OpEscapeUni form: `\p{Greek}` `\p{^L}`.
	FormEscapeUniFull
	// FormNamedCaptureAngle is the OpNamedCapture form `(?<name>re)`.
	FormNamedCaptureAngle
	// FormNamedCaptureQuote is the OpNamedCapture form `(?'name're)`.
	FormNamedCaptureQuote
	// FormQuoteUnclosed is an OpQuote without the closing \E: `\Qabc`.
	FormQuoteUnclosed
)
196 | 


--------------------------------------------------------------------------------
/syntax/lexer_test.go:
--------------------------------------------------------------------------------
  1 | package syntax
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"testing"
  6 | )
  7 | 
// TestLexer checks the token stream the lexer produces for each input.
// The expected stream is written as the space-separated String() forms
// of the tokens.
func TestLexer(t *testing.T) {
	tests := []struct {
		input  string
		tokens string
	}{
		{``, ``},

		// Plain chars, dots and the Concat tokens between them.
		{`x`, `Char`},
		{`xx`, `Char Concat Char`},
		{`xxx`, `Char Concat Char Concat Char`},
		{`..`, `. Concat .`},
		{`.x.`, `. Concat Char Concat .`},
		{`✓✓`, `Char Concat Char`},

		// Alternation.
		{`x|x`, `Char | Char`},
		{`x|x|x`, `Char | Char | Char`},
		{`x|xx|xxx`, `Char | Char Concat Char | Char Concat Char Concat Char`},

		// Capturing groups.
		{`()`, `( )`},
		{`(x)`, `( Char )`},
		{`((x))`, `( ( Char ) )`},
		{`(x)|x`, `( Char ) | Char`},
		{`x|(x)`, `Char | ( Char )`},
		{`(x)|(x)`, `( Char ) | ( Char )`},
		{`x(x)`, `Char Concat ( Char )`},
		{`(✓x✓x)`, `( Char Concat Char Concat Char Concat Char )`},

		// Named groups in their three spellings.
		{`(?<1>)`, `(?<name> )`},
		{`(?'1')`, `(?'name' )`},
		{`(?P<1>)`, `(?P<name> )`},
		{`(?P<foo>x)`, `(?P<name> Char )`},
		{`(?<foo>x)`, `(?<name> Char )`},
		{`(?'foo'x)`, `(?'name' Char )`},
		{`(?P<foo>xy)`, `(?P<name> Char Concat Char )`},
		{`a(?P<foo>x)b`, `Char Concat (?P<name> Char ) Concat Char`},
		{`a(?P<foo>xy)b`, `Char Concat (?P<name> Char Concat Char ) Concat Char`},
		{`a(?<foo>xy)b`, `Char Concat (?<name> Char Concat Char ) Concat Char`},
		{`a(?'foo'xy)b`, `Char Concat (?'name' Char Concat Char ) Concat Char`},

		// Comments.
		{`(?#)`, `Comment`},
		{`a(?#test)(?#c2)b`, `Char Concat Comment Concat Comment Concat Char`},

		// Atomic groups.
		{`(?>)`, `(?> )`},
		{`a(?>xy)(?>z)`, `Char Concat (?> Char Concat Char ) Concat (?> Char )`},

		// Lookahead and lookbehind groups.
		{`(?=)`, `(?= )`},
		{`(?!)`, `(?! )`},
		{`(?<=)`, `(?<= )`},
		{`(?<!)`, `(?<! )`},
		{`a(?=xy)(?=z)`, `Char Concat (?= Char Concat Char ) Concat (?= Char )`},
		{`a(?!xy)(?!z)`, `Char Concat (?! Char Concat Char ) Concat (?! Char )`},
		{`a(?<=xy)(?<=z)`, `Char Concat (?<= Char Concat Char ) Concat (?<= Char )`},
		{`a(?<!xy)(?<!z)`, `Char Concat (?<! Char Concat Char ) Concat (?<! Char )`},

		// Flag-only groups.
		{`(?i)`, `(?flags )`},
		{`(?im)`, `(?flags )`},
		{`(?i-m)`, `(?flags )`},
		{`a(?i)b`, `Char Concat (?flags ) Concat Char`},
		{`a(?im)b`, `Char Concat (?flags ) Concat Char`},

		// Non-capturing groups, with and without flags.
		{`(?:)`, `(?flags )`},
		{`(?:xy)`, `(?flags Char Concat Char )`},
		{`(?i:xy)`, `(?flags Char Concat Char )`},
		{`(?im:xy)`, `(?flags Char Concat Char )`},
		{`a(?:)b`, `Char Concat (?flags ) Concat Char`},
		{`a(?:xy)b`, `Char Concat (?flags Char Concat Char ) Concat Char`},
		{`a(?i:xy)b`, `Char Concat (?flags Char Concat Char ) Concat Char`},
		{`a(?-im:xy)b`, `Char Concat (?flags Char Concat Char ) Concat Char`},

		// Escapes outside of char classes.
		{`\(\)`, `EscapeMeta Concat EscapeMeta`},
		{`\\`, `EscapeMeta`},
		{`\a`, `EscapeChar`},
		{`\\d`, `EscapeMeta Concat Char`},
		{`\d`, `EscapeChar`},
		{`\d\a`, `EscapeChar Concat EscapeChar`},
		{`\dd\a`, `EscapeChar Concat Char Concat EscapeChar`},
		{`\D`, `EscapeChar`},
		{`\s\S`, `EscapeChar Concat EscapeChar`},

		// Char classes: no Concat tokens appear between class elements.
		{`-`, `Char`},
		{`[\-]`, `[ EscapeMeta ]`},
		{`a[]a`, `Char Concat [ Char Concat Char`},
		{`[\^a]a`, `[ EscapeChar Char ] Concat Char`},
		{`[^a]a`, `[^ Char ] Concat Char`},
		{`a[^abc]a`, `Char Concat [^ Char Char Char ] Concat Char`},
		{`[[[]a`, `[ Char Char ] Concat Char`},
		{`[\[]a`, `[ EscapeChar ] Concat Char`},
		{`[\]]a`, `[ EscapeMeta ] Concat Char`},
		{`aa[\]1\]]`, `Char Concat Char Concat [ EscapeMeta Char EscapeMeta ]`},
		{`aa[1\]\]2]`, `Char Concat Char Concat [ Char EscapeMeta EscapeMeta Char ]`},
		{`[a-z0-9]a`, `[ Char - Char Char - Char ] Concat Char`},
		{`[0-9-]`, `[ Char - Char - ]`},
		{`[\d-\w]`, `[ EscapeChar - EscapeChar ]`},
		{`[\(-\)]`, `[ EscapeChar - EscapeChar ]`},
		{`[\[-\]]`, `[ EscapeChar - EscapeMeta ]`},

		// Chars that are meta outside of a class are plain chars inside it.
		{`[|]`, `[ Char ]`},
		{`[(-)]`, `[ Char - Char ]`},
		{`[$.+*^?]`, `[ Char Char Char Char Char Char ]`},
		{`[x{1}]`, `[ Char Char Char Char ]`},

		{`[^]`, `[^ Char`},
		{`[^^]`, `[^ Char ]`},

		// POSIX classes are recognized only inside a char class.
		{`[[:alpha:]]`, `[ PosixClass ]`},
		{`[[:alpha:]-[:blank:]]`, `[ PosixClass - PosixClass ]`},
		{`[[:^word:]]`, `[ PosixClass ]`},
		{`[[:bad:]]`, `[ PosixClass ]`},
		{`[:alpha:]`, `[ Char Char Char Char Char Char Char ]`},

		// A stray closing bracket is a plain char.
		{`]`, `Char`},
		{`]]`, `Char Concat Char`},

		// Quantifiers.
		{`x+`, `Char +`},
		{`x+x+`, `Char + Concat Char +`},
		{`x+?`, `Char + ?`},
		{`x??`, `Char ? ?`},

		// Unicode class escapes.
		{`\pL`, `EscapeUni`},
		{`\pLL`, `EscapeUni Concat Char`},
		{`\p{Greek}`, `EscapeUniFull`},
		{`x\p{^Bad}y`, `Char Concat EscapeUniFull Concat Char`},
		{`\PL`, `EscapeUni`},
		{`\P{^L}`, `EscapeUniFull`},

		// Octal escapes.
		{`\0`, `EscapeOctal`},
		{`\01`, `EscapeOctal`},
		{`\012`, `EscapeOctal`},
		{`\777`, `EscapeOctal`},
		{`\78`, `EscapeOctal Concat Char`},
		{`\778`, `EscapeOctal Concat Char`},

		// Hex escapes, short form.
		{`\xFF`, `EscapeHex`},
		{`\xab`, `EscapeHex`},
		{`\x10a`, `EscapeHex Concat Char`},
		{`\x1\x2`, `EscapeHex Concat EscapeHex`},

		// Hex escapes, braced form.
		{`\x{}a`, `EscapeHexFull Concat Char`},
		{`\x{f}a`, `EscapeHexFull Concat Char`},
		{`\x{F1}a`, `EscapeHexFull Concat Char`},

		// Well-formed {min,max} repetitions.
		{`x{10}y`, `Char Repeat Concat Char`},
		{`x{10,}y`, `Char Repeat Concat Char`},
		{`x{10,20}y`, `Char Repeat Concat Char`},
		{`x{1}{2}y`, `Char Repeat Repeat Concat Char`},
		{`ax{10}y`, `Char Concat Char Repeat Concat Char`},
		{`ax{10,}y`, `Char Concat Char Repeat Concat Char`},
		{`ax{10,20}y`, `Char Concat Char Repeat Concat Char`},
		{`ax{1}{2}y`, `Char Concat Char Repeat Repeat Concat Char`},

		// Malformed repetitions are lexed as plain chars.
		{`{}`, `Char Concat Char`},
		{`x{}`, `Char Concat Char Concat Char`},
		{`x{a}`, `Char Concat Char Concat Char Concat Char`},
		{`x{-1}`, `Char Concat Char Concat Char Concat Char Concat Char`},
		{`x{1,b}`, `Char Concat Char Concat Char Concat Char Concat Char Concat Char`},
		{`x{1b}`, `Char Concat Char Concat Char Concat Char Concat Char`},

		// \Q...\E quoting, closed and unclosed.
		{`x\Q`, `Char Concat \Q`},
		{`x\Q.`, `Char Concat \Q`},
		{`x\Q..`, `Char Concat \Q`},
		{`\Q\E`, `\Q`},
		{`\Q..\E`, `\Q`},
		{`x\Q\Ey`, `Char Concat \Q Concat Char`},
		{`x\Q..\Ey`, `Char Concat \Q Concat Char`},
		{`\Q\E\Q\E`, `\Q Concat \Q`},
	}

	// fmt.Sprint renders a slice as "[a b c]"; removeBrackets strips the
	// first and last byte so only the space-separated elements remain.
	removeBrackets := func(s string) string {
		return s[len("[") : len(s)-len("]")]
	}
	// A single lexer value is re-initialized for every test case.
	var l lexer
	for _, test := range tests {
		l.Init(test.input)
		want := test.tokens
		have := removeBrackets(fmt.Sprint(l.tokens))
		if have != want {
			t.Errorf("tokenize(%q):\nhave: %s\nwant: %s",
				test.input, have, want)
		}
	}
}
189 | 


--------------------------------------------------------------------------------
/syntax/lexer.go:
--------------------------------------------------------------------------------
  1 | package syntax
  2 | 
  3 | import (
  4 | 	"strings"
  5 | 	"unicode/utf8"
  6 | )
  7 | 
// token is a single lexeme produced by the lexer.
type token struct {
	// kind is the lexical category of the token.
	kind tokenKind
	// pos is the span of the token inside the lexer input.
	pos  Position
}
 12 | 
 13 | func (tok token) String() string {
 14 | 	return tok.kind.String()
 15 | }
 16 | 
// tokenKind is the lexical category of a token; its values are the
// tok* constants below.
type tokenKind byte
 18 | 
 19 | //go:generate stringer -type=tokenKind -trimprefix=tok -linecomment=true
const (
	// tokNone is the zero value of tokenKind.
	tokNone tokenKind = iota

	// Kinds without a trailing comment are printed by String under
	// their identifier with the "tok" prefix trimmed.
	tokChar
	tokGroupFlags
	tokPosixClass
	tokConcat
	tokRepeat
	tokEscapeChar
	tokEscapeMeta
	tokEscapeOctal
	tokEscapeUni
	tokEscapeUniFull
	tokEscapeHex
	tokEscapeHexFull
	tokComment

	// For the kinds below, the trailing line comment is the text that
	// stringer (-linecomment) emits for String(), so editing one of
	// these comments changes the generated name table.
	tokQ                        // \Q
	tokMinus                    // -
	tokLbracket                 // [
	tokLbracketCaret            // [^
	tokRbracket                 // ]
	tokDollar                   // $
	tokCaret                    // ^
	tokQuestion                 // ?
	tokDot                      // .
	tokPlus                     // +
	tokStar                     // *
	tokPipe                     // |
	tokLparen                   // (
	tokLparenName               // (?P<name>
	tokLparenNameAngle          // (?<name>
	tokLparenNameQuote          // (?'name'
	tokLparenFlags              // (?flags
	tokLparenAtomic             // (?>
	tokLparenPositiveLookahead  // (?=
	tokLparenPositiveLookbehind // (?<=
	tokLparenNegativeLookahead  // (?!
	tokLparenNegativeLookbehind // (?<!
	tokRparen                   // )
)
 61 | 
 62 | // reMetachar is a table of meta chars outside of a char class.
 63 | var reMetachar = [256]bool{
 64 | 	'\\': true,
 65 | 	'|':  true,
 66 | 	'*':  true,
 67 | 	'+':  true,
 68 | 	'?':  true,
 69 | 	'.':  true,
 70 | 	'[':  true,
 71 | 	']':  true,
 72 | 	'^':  true,
 73 | 	'$':  true,
 74 | 	'(':  true,
 75 | 	')':  true,
 76 | }
 77 | 
 78 | // charClassMetachar is a table of meta chars inside char class.
 79 | var charClassMetachar = [256]bool{
 80 | 	'-': true,
 81 | 	']': true,
 82 | }
 83 | 
// lexer turns a pattern string into a token slice and then serves
// those tokens one at a time.
type lexer struct {
	// tokens is the list of tokens produced for input.
	tokens []token
	// pos is a cursor with two roles: scan uses it as a byte offset
	// into input, while HasMoreTokens, NextToken and Peek use it as an
	// index into tokens.
	// NOTE(review): presumably Init resets it between the two phases — confirm.
	pos    int
	// input is the pattern text being tokenized.
	input  string
}
 89 | 
 90 | func (l *lexer) HasMoreTokens() bool {
 91 | 	return l.pos < len(l.tokens)
 92 | }
 93 | 
 94 | func (l *lexer) NextToken() token {
 95 | 	if l.pos < len(l.tokens) {
 96 | 		tok := l.tokens[l.pos]
 97 | 		l.pos++
 98 | 		return tok
 99 | 	}
100 | 	return token{}
101 | }
102 | 
103 | func (l *lexer) Peek() token {
104 | 	if l.pos < len(l.tokens) {
105 | 		return l.tokens[l.pos]
106 | 	}
107 | 	return token{}
108 | }
109 | 
110 | func (l *lexer) scan() {
111 | 	for l.pos < len(l.input) {
112 | 		ch := l.input[l.pos]
113 | 		if ch >= utf8.RuneSelf {
114 | 			_, size := utf8.DecodeRuneInString(l.input[l.pos:])
115 | 			l.pushTok(tokChar, size)
116 | 			l.maybeInsertConcat()
117 | 			continue
118 | 		}
119 | 		switch ch {
120 | 		case '\\':
121 | 			l.scanEscape(false)
122 | 		case '.':
123 | 			l.pushTok(tokDot, 1)
124 | 		case '+':
125 | 			l.pushTok(tokPlus, 1)
126 | 		case '*':
127 | 			l.pushTok(tokStar, 1)
128 | 		case '^':
129 | 			l.pushTok(tokCaret, 1)
130 | 		case '$':
131 | 			l.pushTok(tokDollar, 1)
132 | 		case '?':
133 | 			l.pushTok(tokQuestion, 1)
134 | 		case ')':
135 | 			l.pushTok(tokRparen, 1)
136 | 		case '|':
137 | 			l.pushTok(tokPipe, 1)
138 | 		case '[':
139 | 			if l.byteAt(l.pos+1) == '^' {
140 | 				l.pushTok(tokLbracketCaret, 2)
141 | 			} else {
142 | 				l.pushTok(tokLbracket, 1)
143 | 			}
144 | 			l.scanCharClass()
145 | 		case '(':
146 | 			if l.byteAt(l.pos+1) == '?' {
147 | 				switch {
148 | 				case l.byteAt(l.pos+2) == '>':
149 | 					l.pushTok(tokLparenAtomic, len("(?>"))
150 | 				case l.byteAt(l.pos+2) == '=':
151 | 					l.pushTok(tokLparenPositiveLookahead, len("(?="))
152 | 				case l.byteAt(l.pos+2) == '!':
153 | 					l.pushTok(tokLparenNegativeLookahead, len("(?!"))
154 | 				case l.byteAt(l.pos+2) == '<' && l.byteAt(l.pos+3) == '=':
155 | 					l.pushTok(tokLparenPositiveLookbehind, len("(?<="))
156 | 				case l.byteAt(l.pos+2) == '<' && l.byteAt(l.pos+3) == '!':
157 | 					l.pushTok(tokLparenNegativeLookbehind, len("(?<!"))
158 | 				default:
159 | 					if l.tryScanComment(l.pos + 2) {
160 | 					} else if l.tryScanGroupName(l.pos + 2) {
161 | 					} else if l.tryScanGroupFlags(l.pos + 2) {
162 | 					} else {
163 | 						throw(newPos(l.pos, l.pos+1), "group token is incomplete")
164 | 					}
165 | 				}
166 | 			} else {
167 | 				l.pushTok(tokLparen, 1)
168 | 			}
169 | 		case '{':
170 | 			if j := l.repeatWidth(l.pos + 1); j >= 0 {
171 | 				l.pushTok(tokRepeat, len("{")+j)
172 | 			} else {
173 | 				l.pushTok(tokChar, 1)
174 | 			}
175 | 		default:
176 | 			l.pushTok(tokChar, 1)
177 | 		}
178 | 		l.maybeInsertConcat()
179 | 	}
180 | }
181 | 
182 | func (l *lexer) scanCharClass() {
183 | 	l.maybeInsertConcat()
184 | 
185 | 	// We need to handle first `]` in a special way. See #3.
186 | 	if l.byteAt(l.pos) == ']' {
187 | 		l.pushTok(tokChar, 1)
188 | 	}
189 | 
190 | 	for l.pos < len(l.input) {
191 | 		ch := l.input[l.pos]
192 | 		if ch >= utf8.RuneSelf {
193 | 			_, size := utf8.DecodeRuneInString(l.input[l.pos:])
194 | 			l.pushTok(tokChar, size)
195 | 			continue
196 | 		}
197 | 		switch ch {
198 | 		case '\\':
199 | 			l.scanEscape(true)
200 | 		case '[':
201 | 			isPosixClass := false
202 | 			if l.byteAt(l.pos+1) == ':' {
203 | 				j := l.stringIndex(l.pos+2, ":]")
204 | 				if j >= 0 {
205 | 					isPosixClass = true
206 | 					l.pushTok(tokPosixClass, j+len("[::]"))
207 | 				}
208 | 			}
209 | 			if !isPosixClass {
210 | 				l.pushTok(tokChar, 1)
211 | 			}
212 | 		case '-':
213 | 			l.pushTok(tokMinus, 1)
214 | 		case ']':
215 | 			l.pushTok(tokRbracket, 1)
216 | 			return // Stop scanning in the char context
217 | 		default:
218 | 			l.pushTok(tokChar, 1)
219 | 		}
220 | 	}
221 | }
222 | 
223 | func (l *lexer) scanEscape(insideCharClass bool) {
224 | 	s := l.input
225 | 	if l.pos+1 >= len(s) {
226 | 		throw(newPos(l.pos, l.pos+1), `unexpected end of pattern: trailing '\'`)
227 | 	}
228 | 	switch {
229 | 	case s[l.pos+1] == 'p' || s[l.pos+1] == 'P':
230 | 		if l.pos+2 >= len(s) {
231 | 			throw(newPos(l.pos, l.pos+2), "unexpected end of pattern: expected uni-class-short or '{'")
232 | 		}
233 | 		if s[l.pos+2] == '{' {
234 | 			j := strings.IndexByte(s[l.pos+2:], '}')
235 | 			if j < 0 {
236 | 				throw(newPos(l.pos, l.pos+2), "can't find closing '}'")
237 | 			}
238 | 			l.pushTok(tokEscapeUniFull, len(`\p{`)+j)
239 | 		} else {
240 | 			l.pushTok(tokEscapeUni, len(`\pL`))
241 | 		}
242 | 	case s[l.pos+1] == 'x':
243 | 		if l.pos+2 >= len(s) {
244 | 			throw(newPos(l.pos, l.pos+2), "unexpected end of pattern: expected hex-digit or '{'")
245 | 		}
246 | 		if s[l.pos+2] == '{' {
247 | 			j := strings.IndexByte(s[l.pos+2:], '}')
248 | 			if j < 0 {
249 | 				throw(newPos(l.pos, l.pos+2), "can't find closing '}'")
250 | 			}
251 | 			l.pushTok(tokEscapeHexFull, len(`\x{`)+j)
252 | 		} else {
253 | 			if isHexDigit(l.byteAt(l.pos + 3)) {
254 | 				l.pushTok(tokEscapeHex, len(`\xFF`))
255 | 			} else {
256 | 				l.pushTok(tokEscapeHex, len(`\xF`))
257 | 			}
258 | 		}
259 | 	case isOctalDigit(s[l.pos+1]):
260 | 		digits := 1
261 | 		if isOctalDigit(l.byteAt(l.pos + 2)) {
262 | 			if isOctalDigit(l.byteAt(l.pos + 3)) {
263 | 				digits = 3
264 | 			} else {
265 | 				digits = 2
266 | 			}
267 | 		}
268 | 		l.pushTok(tokEscapeOctal, len(`\`)+digits)
269 | 	case s[l.pos+1] == 'Q':
270 | 		size := len(s) - l.pos // Until the pattern ends
271 | 		j := l.stringIndex(l.pos+2, `\E`)
272 | 		if j >= 0 {
273 | 			size = j + len(`\Q\E`)
274 | 		}
275 | 		l.pushTok(tokQ, size)
276 | 
277 | 	default:
278 | 		ch := l.byteAt(l.pos + 1)
279 | 		if ch >= utf8.RuneSelf {
280 | 			_, size := utf8.DecodeRuneInString(l.input[l.pos+1:])
281 | 			l.pushTok(tokEscapeChar, len(`\`)+size)
282 | 			return
283 | 		}
284 | 		kind := tokEscapeChar
285 | 		if insideCharClass {
286 | 			if charClassMetachar[ch] {
287 | 				kind = tokEscapeMeta
288 | 			}
289 | 		} else {
290 | 			if reMetachar[ch] {
291 | 				kind = tokEscapeMeta
292 | 			}
293 | 		}
294 | 		l.pushTok(kind, 2)
295 | 	}
296 | }
297 | 
298 | func (l *lexer) maybeInsertConcat() {
299 | 	if l.isConcatPos() {
300 | 		last := len(l.tokens) - 1
301 | 		tok := l.tokens[last]
302 | 		l.tokens[last].kind = tokConcat
303 | 		l.tokens = append(l.tokens, tok)
304 | 	}
305 | }
306 | 
307 | func (l *lexer) Init(s string) {
308 | 	l.pos = 0
309 | 	l.tokens = l.tokens[:0]
310 | 	l.input = s
311 | 
312 | 	l.scan()
313 | 
314 | 	l.pos = 0
315 | }
316 | 
317 | func (l *lexer) tryScanGroupName(pos int) bool {
318 | 	tok := tokLparenName
319 | 	endCh := byte('>')
320 | 	offset := 1
321 | 	switch l.byteAt(pos) {
322 | 	case '\'':
323 | 		endCh = '\''
324 | 		tok = tokLparenNameQuote
325 | 	case '<':
326 | 		tok = tokLparenNameAngle
327 | 	case 'P':
328 | 		offset = 2
329 | 	default:
330 | 		return false
331 | 	}
332 | 	if pos+offset >= len(l.input) {
333 | 		return false
334 | 	}
335 | 	end := strings.IndexByte(l.input[pos+offset:], endCh)
336 | 	if end < 0 {
337 | 		return false
338 | 	}
339 | 	l.pushTok(tok, len("(?")+offset+end+1)
340 | 	return true
341 | }
342 | 
343 | func (l *lexer) tryScanGroupFlags(pos int) bool {
344 | 	colonPos := strings.IndexByte(l.input[pos:], ':')
345 | 	parenPos := strings.IndexByte(l.input[pos:], ')')
346 | 	if parenPos < 0 {
347 | 		return false
348 | 	}
349 | 	end := parenPos
350 | 	if colonPos >= 0 && colonPos < parenPos {
351 | 		end = colonPos + len(":")
352 | 	}
353 | 	l.pushTok(tokLparenFlags, len("(?")+end)
354 | 	return true
355 | }
356 | 
357 | func (l *lexer) tryScanComment(pos int) bool {
358 | 	if l.byteAt(pos) != '#' {
359 | 		return false
360 | 	}
361 | 	parenPos := strings.IndexByte(l.input[pos:], ')')
362 | 	if parenPos < 0 {
363 | 		return false
364 | 	}
365 | 	l.pushTok(tokComment, len("(?")+parenPos+len(")"))
366 | 	return true
367 | }
368 | 
369 | func (l *lexer) repeatWidth(pos int) int {
370 | 	j := pos
371 | 	for isDigit(l.byteAt(j)) {
372 | 		j++
373 | 	}
374 | 	if j == pos {
375 | 		return -1
376 | 	}
377 | 	if l.byteAt(j) == '}' {
378 | 		return (j + len("}")) - pos // {min}
379 | 	}
380 | 	if l.byteAt(j) != ',' {
381 | 		return -1
382 | 	}
383 | 	j += len(",")
384 | 	for isDigit(l.byteAt(j)) {
385 | 		j++
386 | 	}
387 | 	if l.byteAt(j) == '}' {
388 | 		return (j + len("}")) - pos // {min,} or {min,max}
389 | 	}
390 | 	return -1
391 | }
392 | 
393 | func (l *lexer) stringIndex(offset int, s string) int {
394 | 	if offset < len(l.input) {
395 | 		return strings.Index(l.input[offset:], s)
396 | 	}
397 | 	return -1
398 | }
399 | 
400 | func (l *lexer) byteAt(pos int) byte {
401 | 	if pos >= 0 && pos < len(l.input) {
402 | 		return l.input[pos]
403 | 	}
404 | 	return 0
405 | }
406 | 
407 | func (l *lexer) pushTok(kind tokenKind, size int) {
408 | 	l.tokens = append(l.tokens, token{
409 | 		kind: kind,
410 | 		pos:  Position{Begin: uint16(l.pos), End: uint16(l.pos + size)},
411 | 	})
412 | 	l.pos += size
413 | }
414 | 
415 | func (l *lexer) isConcatPos() bool {
416 | 	if len(l.tokens) < 2 {
417 | 		return false
418 | 	}
419 | 	x := l.tokens[len(l.tokens)-2].kind
420 | 	if concatTable[x]&concatX != 0 {
421 | 		return false
422 | 	}
423 | 	y := l.tokens[len(l.tokens)-1].kind
424 | 	return concatTable[y]&concatY == 0
425 | }
426 | 
// Bit flags stored in concatTable.
const (
	// concatX marks a token after which no concat is inserted.
	concatX byte = 1 << 0
	// concatY marks a token before which no concat is inserted.
	concatY byte = 1 << 1
)
431 | 
432 | var concatTable = [256]byte{
433 | 	tokPipe: concatX | concatY,
434 | 
435 | 	tokLparen:                   concatX,
436 | 	tokLparenFlags:              concatX,
437 | 	tokLparenName:               concatX,
438 | 	tokLparenNameAngle:          concatX,
439 | 	tokLparenNameQuote:          concatX,
440 | 	tokLparenAtomic:             concatX,
441 | 	tokLbracket:                 concatX,
442 | 	tokLbracketCaret:            concatX,
443 | 	tokLparenPositiveLookahead:  concatX,
444 | 	tokLparenPositiveLookbehind: concatX,
445 | 	tokLparenNegativeLookahead:  concatX,
446 | 	tokLparenNegativeLookbehind: concatX,
447 | 
448 | 	tokRparen:   concatY,
449 | 	tokRbracket: concatY,
450 | 	tokPlus:     concatY,
451 | 	tokStar:     concatY,
452 | 	tokQuestion: concatY,
453 | 	tokRepeat:   concatY,
454 | }
455 | 


--------------------------------------------------------------------------------
/syntax/parser.go:
--------------------------------------------------------------------------------
  1 | package syntax
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"strings"
  6 | )
  7 | 
// ParserOptions configures the behavior of a Parser.
// The zero value enables all default behavior.
type ParserOptions struct {
	// NoLiterals disables OpChar merging into OpLiteral.
	NoLiterals bool
}
 12 | 
// NewParser returns a new Parser configured with opts.
// A nil opts is allowed and leaves every option at its zero value.
func NewParser(opts *ParserOptions) *Parser {
	return newParser(opts)
}
 16 | 
// Parser parses regexp patterns into an AST.
//
// A Parser keeps internal buffers that are reused between Parse calls,
// including the storage of the returned result.
type Parser struct {
	// out is the result storage; Parse returns a pointer to it.
	out Regexp
	// lexer provides the token stream of the pattern being parsed.
	lexer lexer
	// exprPool is a preallocated set of Expr slots handed out by allocExpr.
	exprPool []Expr

	// prefixParselets and infixParselets are indexed by token kind.
	// A nil prefix entry means the token can't start an expression.
	prefixParselets [256]prefixParselet
	infixParselets  [256]infixParselet

	// charClass is a scratch slice that collects the elements of the
	// char class currently being parsed.
	charClass []Expr
	// allocated is the number of exprPool slots used by the current parse.
	allocated uint

	opts ParserOptions
}
 30 | 
 31 | // ParsePCRE parses PHP-style pattern with delimiters.
 32 | // An example of such pattern is `/foo/i`.
 33 | func (p *Parser) ParsePCRE(pattern string) (*RegexpPCRE, error) {
 34 | 	pcre, err := p.newPCRE(pattern)
 35 | 	if err != nil {
 36 | 		return nil, err
 37 | 	}
 38 | 	if pcre.HasModifier('x') {
 39 | 		return nil, errors.New("'x' modifier is not supported")
 40 | 	}
 41 | 	re, err := p.Parse(pcre.Pattern)
 42 | 	if re != nil {
 43 | 		pcre.Expr = re.Expr
 44 | 	}
 45 | 	return pcre, err
 46 | }
 47 | 
 48 | func (p *Parser) Parse(pattern string) (result *Regexp, err error) {
 49 | 	defer func() {
 50 | 		r := recover()
 51 | 		if r == nil {
 52 | 			return
 53 | 		}
 54 | 		if err2, ok := r.(ParseError); ok {
 55 | 			err = err2
 56 | 			return
 57 | 		}
 58 | 		panic(r)
 59 | 	}()
 60 | 
 61 | 	p.lexer.Init(pattern)
 62 | 	p.allocated = 0
 63 | 	p.out.Pattern = pattern
 64 | 	if pattern == "" {
 65 | 		p.out.Expr = *p.newExpr(OpConcat, Position{})
 66 | 	} else {
 67 | 		p.out.Expr = *p.parseExpr(0)
 68 | 	}
 69 | 
 70 | 	if !p.opts.NoLiterals {
 71 | 		p.mergeChars(&p.out.Expr)
 72 | 	}
 73 | 	p.setValues(&p.out.Expr)
 74 | 
 75 | 	return &p.out, nil
 76 | }
 77 | 
// prefixParselet parses an expression that starts with the given,
// already consumed, token.
type prefixParselet func(token) *Expr

// infixParselet continues parsing after the left operand was parsed and
// the given operator token was consumed.
type infixParselet func(*Expr, token) *Expr
 81 | 
// newParser creates a Parser, copies opts into it (when non-nil),
// preallocates the expression pool and registers the prefix and infix
// parselets for every token kind the parser understands.
func newParser(opts *ParserOptions) *Parser {
	var p Parser

	if opts != nil {
		p.opts = *opts
	}
	p.exprPool = make([]Expr, 256)

	// Every token kind with a tok2op entry is parsed as a leaf expression.
	for tok, op := range tok2op {
		if op != 0 {
			p.prefixParselets[tokenKind(tok)] = p.parsePrefixElementary
		}
	}

	// \Q...\E and unclosed \Q...: the quoted text becomes an OpString arg.
	p.prefixParselets[tokQ] = func(tok token) *Expr {
		litPos := tok.pos
		litPos.Begin += uint16(len(`\Q`))
		form := FormQuoteUnclosed
		if strings.HasSuffix(p.tokenValue(tok), `\E`) {
			litPos.End -= uint16(len(`\E`))
			form = FormDefault
		}
		lit := p.newExpr(OpString, litPos)
		return p.newExprForm(OpQuote, form, tok.pos, lit)
	}

	// Braced escapes: the text between the braces becomes an OpString arg.
	p.prefixParselets[tokEscapeHexFull] = func(tok token) *Expr {
		litPos := tok.pos
		litPos.Begin += uint16(len(`\x{`))
		litPos.End -= uint16(len(`}`))
		lit := p.newExpr(OpString, litPos)
		return p.newExprForm(OpEscapeHex, FormEscapeHexFull, tok.pos, lit)
	}
	p.prefixParselets[tokEscapeUniFull] = func(tok token) *Expr {
		litPos := tok.pos
		litPos.Begin += uint16(len(`\p{`))
		litPos.End -= uint16(len(`}`))
		lit := p.newExpr(OpString, litPos)
		return p.newExprForm(OpEscapeUni, FormEscapeUniFull, tok.pos, lit)
	}

	// Short escapes: everything after the given prefix is the OpString arg.
	p.prefixParselets[tokEscapeHex] = func(tok token) *Expr { return p.parseEscape(OpEscapeHex, `\x`, tok) }
	p.prefixParselets[tokEscapeOctal] = func(tok token) *Expr { return p.parseEscape(OpEscapeOctal, `\`, tok) }
	p.prefixParselets[tokEscapeChar] = func(tok token) *Expr { return p.parseEscape(OpEscapeChar, `\`, tok) }
	p.prefixParselets[tokEscapeMeta] = func(tok token) *Expr { return p.parseEscape(OpEscapeMeta, `\`, tok) }
	p.prefixParselets[tokEscapeUni] = func(tok token) *Expr { return p.parseEscape(OpEscapeUni, `\p`, tok) }

	// Groups that differ only in the resulting operation.
	p.prefixParselets[tokLparen] = func(tok token) *Expr { return p.parseGroup(OpCapture, tok) }
	p.prefixParselets[tokLparenAtomic] = func(tok token) *Expr { return p.parseGroup(OpAtomicGroup, tok) }
	p.prefixParselets[tokLparenPositiveLookahead] = func(tok token) *Expr { return p.parseGroup(OpPositiveLookahead, tok) }
	p.prefixParselets[tokLparenNegativeLookahead] = func(tok token) *Expr { return p.parseGroup(OpNegativeLookahead, tok) }
	p.prefixParselets[tokLparenPositiveLookbehind] = func(tok token) *Expr { return p.parseGroup(OpPositiveLookbehind, tok) }
	p.prefixParselets[tokLparenNegativeLookbehind] = func(tok token) *Expr { return p.parseGroup(OpNegativeLookbehind, tok) }

	// Named captures; the form records which syntax was used.
	p.prefixParselets[tokLparenName] = func(tok token) *Expr {
		return p.parseNamedCapture(FormDefault, tok)
	}
	p.prefixParselets[tokLparenNameAngle] = func(tok token) *Expr {
		return p.parseNamedCapture(FormNamedCaptureAngle, tok)
	}
	p.prefixParselets[tokLparenNameQuote] = func(tok token) *Expr {
		return p.parseNamedCapture(FormNamedCaptureQuote, tok)
	}

	p.prefixParselets[tokLparenFlags] = p.parseGroupWithFlags

	p.prefixParselets[tokPipe] = func(tok token) *Expr {
		// We need prefix pipe parselet to handle `(|x)` syntax.
		right := p.parseExpr(1)
		return p.newExpr(OpAlt, tok.pos, p.newEmpty(tok.pos), right)
	}
	p.prefixParselets[tokLbracket] = func(tok token) *Expr {
		return p.parseCharClass(OpCharClass, tok)
	}
	p.prefixParselets[tokLbracketCaret] = func(tok token) *Expr {
		return p.parseCharClass(OpNegCharClass, tok)
	}

	// Infix and postfix operators. Every token kind that precedenceOf
	// gives a non-zero precedence has an entry here.
	p.infixParselets[tokRepeat] = func(left *Expr, tok token) *Expr {
		repeatLit := p.newExpr(OpString, tok.pos)
		return p.newExpr(OpRepeat, combinePos(left.Pos, tok.pos), left, repeatLit)
	}
	p.infixParselets[tokStar] = func(left *Expr, tok token) *Expr {
		return p.newExpr(OpStar, combinePos(left.Pos, tok.pos), left)
	}
	p.infixParselets[tokConcat] = func(left *Expr, tok token) *Expr {
		right := p.parseExpr(2)
		// Extend an existing concat instead of nesting a new one.
		if left.Op == OpConcat {
			left.Args = append(left.Args, *right)
			left.Pos.End = right.End()
			return left
		}
		return p.newExpr(OpConcat, combinePos(left.Pos, right.Pos), left, right)
	}
	p.infixParselets[tokPipe] = p.parseAlt
	p.infixParselets[tokMinus] = p.parseMinus
	p.infixParselets[tokPlus] = p.parsePlus
	p.infixParselets[tokQuestion] = p.parseQuestion

	return &p
}
183 | 
184 | func (p *Parser) setValues(e *Expr) {
185 | 	for i := range e.Args {
186 | 		p.setValues(&e.Args[i])
187 | 	}
188 | 	e.Value = p.exprValue(e)
189 | }
190 | 
191 | func (p *Parser) tokenValue(tok token) string {
192 | 	return p.out.Pattern[tok.pos.Begin:tok.pos.End]
193 | }
194 | 
195 | func (p *Parser) exprValue(e *Expr) string {
196 | 	return p.out.Pattern[e.Begin():e.End()]
197 | }
198 | 
199 | func (p *Parser) mergeChars(e *Expr) {
200 | 	for i := range e.Args {
201 | 		p.mergeChars(&e.Args[i])
202 | 	}
203 | 	if e.Op != OpConcat || len(e.Args) < 2 {
204 | 		return
205 | 	}
206 | 
207 | 	args := e.Args[:0]
208 | 	i := 0
209 | 	for i < len(e.Args) {
210 | 		first := i
211 | 		chars := 0
212 | 		for j := i; j < len(e.Args) && e.Args[j].Op == OpChar; j++ {
213 | 			chars++
214 | 		}
215 | 		if chars > 1 {
216 | 			c1 := e.Args[first]
217 | 			c2 := e.Args[first+chars-1]
218 | 			lit := p.newExpr(OpLiteral, combinePos(c1.Pos, c2.Pos))
219 | 			for j := 0; j < chars; j++ {
220 | 				lit.Args = append(lit.Args, e.Args[first+j])
221 | 			}
222 | 			args = append(args, *lit)
223 | 			i += chars
224 | 		} else {
225 | 			args = append(args, e.Args[i])
226 | 			i++
227 | 		}
228 | 	}
229 | 	if len(args) == 1 {
230 | 		*e = args[0] // Turn OpConcat into OpLiteral
231 | 	} else {
232 | 		e.Args = args
233 | 	}
234 | }
235 | 
// newEmpty returns an expression that stands for "nothing": an OpConcat
// without args, located at pos.
func (p *Parser) newEmpty(pos Position) *Expr {
	return p.newExpr(OpConcat, pos)
}
239 | 
// newExprForm is like newExpr, but also sets the Form of the new expression.
func (p *Parser) newExprForm(op Operation, form Form, pos Position, args ...*Expr) *Expr {
	e := p.newExpr(op, pos, args...)
	e.Form = form
	return e
}
245 | 
246 | func (p *Parser) newExpr(op Operation, pos Position, args ...*Expr) *Expr {
247 | 	e := p.allocExpr()
248 | 	*e = Expr{
249 | 		Op:   op,
250 | 		Pos:  pos,
251 | 		Args: e.Args[:0],
252 | 	}
253 | 	for _, arg := range args {
254 | 		e.Args = append(e.Args, *arg)
255 | 	}
256 | 	return e
257 | }
258 | 
259 | func (p *Parser) allocExpr() *Expr {
260 | 	i := p.allocated
261 | 	if i < uint(len(p.exprPool)) {
262 | 		p.allocated++
263 | 		return &p.exprPool[i]
264 | 	}
265 | 	return &Expr{}
266 | }
267 | 
268 | func (p *Parser) expect(kind tokenKind) Position {
269 | 	tok := p.lexer.NextToken()
270 | 	if tok.kind != kind {
271 | 		throwExpectedFound(tok.pos, kind.String(), tok.kind.String())
272 | 	}
273 | 	return tok.pos
274 | }
275 | 
276 | func (p *Parser) parseExpr(precedence int) *Expr {
277 | 	tok := p.lexer.NextToken()
278 | 	prefix := p.prefixParselets[tok.kind]
279 | 	if prefix == nil {
280 | 		throwUnexpectedToken(tok.pos, tok.String())
281 | 	}
282 | 	left := prefix(tok)
283 | 
284 | 	for precedence < p.precedenceOf(p.lexer.Peek()) {
285 | 		tok := p.lexer.NextToken()
286 | 		infix := p.infixParselets[tok.kind]
287 | 		left = infix(left, tok)
288 | 	}
289 | 
290 | 	return left
291 | }
292 | 
// parsePrefixElementary builds a leaf expression for tok; the operation
// is looked up in tok2op by the token kind.
func (p *Parser) parsePrefixElementary(tok token) *Expr {
	return p.newExpr(tok2op[tok.kind], tok.pos)
}
296 | 
297 | func (p *Parser) parseCharClass(op Operation, tok token) *Expr {
298 | 	var endPos Position
299 | 	p.charClass = p.charClass[:0]
300 | 	for {
301 | 		p.charClass = append(p.charClass, *p.parseExpr(0))
302 | 		next := p.lexer.Peek()
303 | 		if next.kind == tokRbracket {
304 | 			endPos = next.pos
305 | 			p.lexer.NextToken()
306 | 			break
307 | 		}
308 | 		if next.kind == tokNone {
309 | 			throw(tok.pos, "unterminated '['")
310 | 		}
311 | 	}
312 | 
313 | 	result := p.newExpr(op, combinePos(tok.pos, endPos))
314 | 	result.Args = append(result.Args, p.charClass...)
315 | 	return result
316 | }
317 | 
318 | func (p *Parser) parseMinus(left *Expr, tok token) *Expr {
319 | 	if p.isValidCharRangeOperand(left) {
320 | 		if p.lexer.Peek().kind != tokRbracket {
321 | 			right := p.parseExpr(2)
322 | 			return p.newExpr(OpCharRange, combinePos(left.Pos, right.Pos), left, right)
323 | 		}
324 | 	}
325 | 	p.charClass = append(p.charClass, *left)
326 | 	return p.newExpr(OpChar, tok.pos)
327 | }
328 | 
329 | func (p *Parser) isValidCharRangeOperand(e *Expr) bool {
330 | 	switch e.Op {
331 | 	case OpEscapeHex, OpEscapeOctal, OpEscapeMeta, OpChar:
332 | 		return true
333 | 	case OpEscapeChar:
334 | 		switch p.exprValue(e) {
335 | 		case `\\`, `\|`, `\*`, `\+`, `\?`, `\.`, `\[`, `\^`, `\$`, `\(`, `\)`:
336 | 			return true
337 | 		}
338 | 	}
339 | 	return false
340 | }
341 | 
342 | func (p *Parser) parsePlus(left *Expr, tok token) *Expr {
343 | 	op := OpPlus
344 | 	switch left.Op {
345 | 	case OpPlus, OpStar, OpQuestion, OpRepeat:
346 | 		op = OpPossessive
347 | 	}
348 | 	return p.newExpr(op, combinePos(left.Pos, tok.pos), left)
349 | }
350 | 
351 | func (p *Parser) parseQuestion(left *Expr, tok token) *Expr {
352 | 	op := OpQuestion
353 | 	switch left.Op {
354 | 	case OpPlus, OpStar, OpQuestion, OpRepeat:
355 | 		op = OpNonGreedy
356 | 	}
357 | 	return p.newExpr(op, combinePos(left.Pos, tok.pos), left)
358 | }
359 | 
360 | func (p *Parser) parseAlt(left *Expr, tok token) *Expr {
361 | 	var right *Expr
362 | 	switch p.lexer.Peek().kind {
363 | 	case tokRparen, tokNone:
364 | 		// This is needed to handle `(x|)` syntax.
365 | 		right = p.newEmpty(tok.pos)
366 | 	default:
367 | 		right = p.parseExpr(1)
368 | 	}
369 | 	if left.Op == OpAlt {
370 | 		left.Args = append(left.Args, *right)
371 | 		left.Pos.End = right.End()
372 | 		return left
373 | 	}
374 | 	return p.newExpr(OpAlt, combinePos(left.Pos, right.Pos), left, right)
375 | }
376 | 
377 | func (p *Parser) parseGroupItem(tok token) *Expr {
378 | 	if p.lexer.Peek().kind == tokRparen {
379 | 		// This is needed to handle `() syntax.`
380 | 		return p.newEmpty(tok.pos)
381 | 	}
382 | 	return p.parseExpr(0)
383 | }
384 | 
385 | func (p *Parser) parseGroup(op Operation, tok token) *Expr {
386 | 	x := p.parseGroupItem(tok)
387 | 	result := p.newExpr(op, tok.pos, x)
388 | 	result.Pos.End = p.expect(tokRparen).End
389 | 	return result
390 | }
391 | 
392 | func (p *Parser) parseNamedCapture(form Form, tok token) *Expr {
393 | 	prefixLen := len("(?<")
394 | 	if form == FormDefault {
395 | 		prefixLen = len("(?P<")
396 | 	}
397 | 	name := p.newExpr(OpString, Position{
398 | 		Begin: tok.pos.Begin + uint16(prefixLen),
399 | 		End:   tok.pos.End - uint16(len(">")),
400 | 	})
401 | 	x := p.parseGroupItem(tok)
402 | 	result := p.newExprForm(OpNamedCapture, form, tok.pos, x, name)
403 | 	result.Pos.End = p.expect(tokRparen).End
404 | 	return result
405 | }
406 | 
407 | func (p *Parser) parseGroupWithFlags(tok token) *Expr {
408 | 	var result *Expr
409 | 	val := p.out.Pattern[tok.pos.Begin+1 : tok.pos.End]
410 | 	switch {
411 | 	case !strings.HasSuffix(val, ":"):
412 | 		flags := p.newExpr(OpString, Position{
413 | 			Begin: tok.pos.Begin + uint16(len("(?")),
414 | 			End:   tok.pos.End,
415 | 		})
416 | 		result = p.newExpr(OpFlagOnlyGroup, tok.pos, flags)
417 | 	case val == "?:":
418 | 		x := p.parseGroupItem(tok)
419 | 		result = p.newExpr(OpGroup, tok.pos, x)
420 | 	default:
421 | 		flags := p.newExpr(OpString, Position{
422 | 			Begin: tok.pos.Begin + uint16(len("(?")),
423 | 			End:   tok.pos.End - uint16(len(":")),
424 | 		})
425 | 		x := p.parseGroupItem(tok)
426 | 		result = p.newExpr(OpGroupWithFlags, tok.pos, x, flags)
427 | 	}
428 | 	result.Pos.End = p.expect(tokRparen).End
429 | 	return result
430 | }
431 | 
432 | func (p *Parser) parseEscape(op Operation, prefix string, tok token) *Expr {
433 | 	litPos := tok.pos
434 | 	litPos.Begin += uint16(len(prefix))
435 | 	lit := p.newExpr(OpString, litPos)
436 | 	return p.newExpr(op, tok.pos, lit)
437 | }
438 | 
439 | func (p *Parser) precedenceOf(tok token) int {
440 | 	switch tok.kind {
441 | 	case tokPipe:
442 | 		return 1
443 | 	case tokConcat, tokMinus:
444 | 		return 2
445 | 	case tokPlus, tokStar, tokQuestion, tokRepeat:
446 | 		return 3
447 | 	default:
448 | 		return 0
449 | 	}
450 | }
451 | 
452 | func (p *Parser) newPCRE(source string) (*RegexpPCRE, error) {
453 | 	if source == "" {
454 | 		return nil, errors.New("empty pattern: can't find delimiters")
455 | 	}
456 | 
457 | 	delim := source[0]
458 | 	endDelim := delim
459 | 	switch delim {
460 | 	case '(':
461 | 		endDelim = ')'
462 | 	case '{':
463 | 		endDelim = '}'
464 | 	case '[':
465 | 		endDelim = ']'
466 | 	case '<':
467 | 		endDelim = '>'
468 | 	case '\\':
469 | 		return nil, errors.New("'\\' is not a valid delimiter")
470 | 	default:
471 | 		if isSpace(delim) {
472 | 			return nil, errors.New("whitespace is not a valid delimiter")
473 | 		}
474 | 		if isAlphanumeric(delim) {
475 | 			return nil, errors.New("'" + string(delim) + "' is not a valid delimiter")
476 | 		}
477 | 	}
478 | 
479 | 	const delimLen = 1
480 | 	j := strings.LastIndexByte(source[delimLen:], endDelim)
481 | 	if j == -1 {
482 | 		return nil, errors.New("can't find '" + string(endDelim) + "' ending delimiter")
483 | 	}
484 | 	j += delimLen
485 | 
486 | 	pcre := &RegexpPCRE{
487 | 		Pattern:   source[delimLen:j],
488 | 		Source:    source,
489 | 		Delim:     [2]byte{delim, endDelim},
490 | 		Modifiers: source[j+delimLen:],
491 | 	}
492 | 	return pcre, nil
493 | }
494 | 
495 | var tok2op = [256]Operation{
496 | 	tokDollar:     OpDollar,
497 | 	tokCaret:      OpCaret,
498 | 	tokDot:        OpDot,
499 | 	tokChar:       OpChar,
500 | 	tokMinus:      OpChar,
501 | 	tokPosixClass: OpPosixClass,
502 | 	tokComment:    OpComment,
503 | }
504 | 


--------------------------------------------------------------------------------
/syntax/parser_test.go:
--------------------------------------------------------------------------------
  1 | package syntax
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"regexp/syntax"
  6 | 	"strings"
  7 | 	"testing"
  8 | )
  9 | 
 10 | func TestParserErrors(t *testing.T) {
 11 | 	tests := []struct {
 12 | 		pattern string
 13 | 		want    string
 14 | 	}{
 15 | 		{`\`, `unexpected end of pattern: trailing '\'`},
 16 | 		{`\x`, `unexpected end of pattern: expected hex-digit or '{'`},
 17 | 		{`\x{12`, `can't find closing '}'`},
 18 | 		{`(abc`, `expected ')', found 'None'`},
 19 | 		{`[abc`, `unterminated '['`},
 20 | 		{`[]`, `unterminated '['`},
 21 | 		{`[^]`, `unterminated '['`},
 22 | 		{`\p`, `unexpected end of pattern: expected uni-class-short or '{'`},
 23 | 		{`\p{L`, `can't find closing '}'`},
 24 | 		{`(?`, `group token is incomplete`},
 25 | 		{`(?i`, `group token is incomplete`},
 26 | 		{`(?:`, `group token is incomplete`},
 27 | 	}
 28 | 
 29 | 	p := NewParser(nil)
 30 | 	for _, test := range tests {
 31 | 		_, err := p.Parse(test.pattern)
 32 | 		have := "<nil>"
 33 | 		if err != nil {
 34 | 			have = err.Error()
 35 | 		}
 36 | 		if have != test.want {
 37 | 			t.Errorf("parse(%q):\nhave: %s\nwant: %s",
 38 | 				test.pattern, have, test.want)
 39 | 		}
 40 | 	}
 41 | }
 42 | 
// writeExpr prints e back as regexp source text into w and, on the way,
// checks that the positions of e are consistent with the positions of its
// args. Mismatches are reported through t with re.Pattern as the context.
// It panics on an operation it doesn't know how to print.
func writeExpr(t *testing.T, w *strings.Builder, re *Regexp, e Expr) {
	// assertBeginPos reports an error if e doesn't begin at the given offset.
	assertBeginPos := func(e Expr, begin uint16) {
		if e.Begin() != begin {
			t.Errorf("`%s`: %s begin pos mismatch:\nhave: `%s` (begin=%d)\nwant: `%s` (begin=%d)",
				re.Pattern, e.Op,
				re.Pattern[e.Begin():e.End()], e.Begin(),
				re.Pattern[begin:e.End()], begin)
		}
	}
	// assertEndPos reports an error if e doesn't end at the given offset.
	assertEndPos := func(e Expr, end uint16) {
		if e.End() != end {
			t.Errorf("`%s`: %s end pos mismatch:\nhave: `%s` (end=%d)\nwant: `%s` (end=%d)",
				re.Pattern, e.Op,
				re.Pattern[e.Begin():e.End()], e.End(),
				re.Pattern[e.Begin():end], end)
		}
	}

	switch e.Op {
	// Leaf nodes are printed verbatim.
	case OpChar, OpString, OpPosixClass, OpDot, OpCaret, OpDollar, OpComment:
		w.WriteString(e.Value)

	case OpQuote:
		assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\Q`)))
		w.WriteString(`\Q`)
		writeExpr(t, w, re, e.Args[0])
		if e.Form != FormQuoteUnclosed {
			w.WriteString(`\E`)
		}

	case OpEscapeOctal, OpEscapeChar, OpEscapeMeta:
		assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\`)))
		w.WriteString(`\`)
		writeExpr(t, w, re, e.Args[0])

	case OpEscapeUni:
		switch e.Form {
		case FormEscapeUniFull:
			assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\p{`)))
			assertEndPos(e, e.Args[0].End()+uint16(len(`}`)))
			w.WriteString(`\p{`)
			writeExpr(t, w, re, e.Args[0])
			w.WriteString(`}`)
		default:
			assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\p`)))
			w.WriteString(`\p`)
			writeExpr(t, w, re, e.Args[0])
		}

	case OpEscapeHex:
		switch e.Form {
		case FormEscapeHexFull:
			assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\x{`)))
			assertEndPos(e, e.Args[0].End()+uint16(len(`}`)))
			w.WriteString(`\x{`)
			writeExpr(t, w, re, e.Args[0])
			w.WriteString(`}`)
		default:
			assertBeginPos(e, e.Args[0].Begin()-uint16(len(`\x`)))
			w.WriteString(`\x`)
			writeExpr(t, w, re, e.Args[0])
		}

	case OpLiteral:
		assertBeginPos(e, e.Args[0].Begin())
		assertEndPos(e, e.LastArg().End())
		for _, a := range e.Args {
			writeExpr(t, w, re, a)
		}

	case OpCharRange:
		assertBeginPos(e, e.Args[0].Begin())
		assertEndPos(e, e.Args[1].End())
		writeExpr(t, w, re, e.Args[0])
		w.WriteByte('-')
		writeExpr(t, w, re, e.Args[1])

	// For all group-like nodes below, +1 accounts for the closing ')'.
	case OpNamedCapture:
		assertEndPos(e, e.Args[0].End()+1)
		switch e.Form {
		case FormNamedCaptureAngle:
			fmt.Fprintf(w, "(?<%s>", e.Args[1].Value)
		case FormNamedCaptureQuote:
			fmt.Fprintf(w, "(?'%s'", e.Args[1].Value)
		default:
			fmt.Fprintf(w, "(?P<%s>", e.Args[1].Value)
		}
		writeExpr(t, w, re, e.Args[0])
		w.WriteByte(')')

	case OpFlagOnlyGroup:
		assertEndPos(e, e.Args[0].End()+1)
		w.WriteString("(?")
		w.WriteString(e.Args[0].Value)
		w.WriteByte(')')

	case OpGroupWithFlags:
		assertEndPos(e, e.Args[0].End()+1)
		w.WriteString("(?")
		w.WriteString(e.Args[1].Value)
		w.WriteByte(':')
		writeExpr(t, w, re, e.Args[0])
		w.WriteByte(')')

	case OpCapture, OpGroup, OpAtomicGroup, OpPositiveLookahead, OpNegativeLookahead, OpPositiveLookbehind, OpNegativeLookbehind:
		assertEndPos(e, e.Args[0].End()+1)
		w.WriteByte('(')
		// OpCapture has no prefix after the '('.
		switch e.Op {
		case OpGroup:
			w.WriteString("?:")
		case OpAtomicGroup:
			w.WriteString("?>")
		case OpPositiveLookahead:
			w.WriteString("?=")
		case OpNegativeLookahead:
			w.WriteString("?!")
		case OpPositiveLookbehind:
			w.WriteString("?<=")
		case OpNegativeLookbehind:
			w.WriteString("?<!")
		}
		writeExpr(t, w, re, e.Args[0])
		w.WriteByte(')')

	case OpCharClass, OpNegCharClass:
		// +1 accounts for the closing ']'.
		assertEndPos(e, e.LastArg().End()+1)
		w.WriteByte('[')
		if e.Op == OpNegCharClass {
			w.WriteByte('^')
		}
		for _, a := range e.Args {
			writeExpr(t, w, re, a)
		}
		w.WriteByte(']')

	case OpRepeat:
		assertBeginPos(e, e.Args[0].Begin())
		assertEndPos(e, e.Args[1].End())
		writeExpr(t, w, re, e.Args[0])
		writeExpr(t, w, re, e.Args[1])

	case OpConcat:
		// NOTE(review): this compares e.Begin() with itself, so it can
		// never fail; the end pos is the only real check here.
		assertBeginPos(e, e.Begin())
		if len(e.Args) > 0 {
			assertEndPos(e, e.LastArg().End())
		}
		for _, a := range e.Args {
			writeExpr(t, w, re, a)
		}

	case OpAlt:
		// NOTE(review): same as for OpConcat, the begin check is a no-op.
		assertBeginPos(e, e.Begin())
		assertEndPos(e, e.LastArg().End())
		for i, a := range e.Args {
			writeExpr(t, w, re, a)
			if i != len(e.Args)-1 {
				w.WriteByte('|')
			}
		}

	case OpNonGreedy, OpPossessive, OpQuestion, OpPlus, OpStar:
		// +1 accounts for the one-byte quantifier char.
		assertBeginPos(e, e.Args[0].Begin())
		assertEndPos(e, e.Args[0].End()+1)
		writeExpr(t, w, re, e.Args[0])
		switch e.Op {
		case OpNonGreedy, OpQuestion:
			w.WriteByte('?')
		case OpPossessive, OpPlus:
			w.WriteByte('+')
		case OpStar:
			w.WriteByte('*')
		}

	default:
		panic(fmt.Sprintf("unhandled %s", e.Op))
	}
}
220 | 
221 | func TestWriteExpr(t *testing.T) {
222 | 	// Tests that ensure that we can print the source regexp
223 | 	// using the parsed AST.
224 | 	// They also verify that AST node positions are correct.
225 | 
226 | 	tests := []struct {
227 | 		pat string
228 | 		o1  Operation
229 | 		o2  Operation
230 | 	}{
231 | 		{pat: `(?#?#)$`, o1: OpDollar, o2: OpComment},
232 | 		{pat: `(foobar|baz)*+(?#the comment)`, o1: OpPossessive, o2: OpComment},
233 | 		{pat: `abc?+`, o1: OpLiteral, o2: OpPossessive},
234 | 		{pat: `x{0}`, o1: OpChar, o2: OpString},
235 | 		{pat: `a\x{BAD}`, o1: OpLiteral, o2: OpEscapeHex},
236 | 		{pat: `(✓x✓x)`, o1: OpLiteral, o2: OpCapture},
237 | 		{pat: `[x][]]`, o1: OpCharClass, o2: OpLiteral},
238 | 		{pat: `[A-Za-z0-9-]`, o1: OpCharClass, o2: OpCharRange},
239 | 		{pat: `x{1}yz`, o1: OpLiteral, o2: OpRepeat},
240 | 		{pat: `x{1,2}y*`, o1: OpRepeat, o2: OpStar},
241 | 		{pat: `x{11,30}y+`, o1: OpRepeat, o2: OpPlus},
242 | 		{pat: `x{1,}$`, o1: OpRepeat, o2: OpDollar},
243 | 		{pat: `\p{Cyrillic}\d`, o1: OpEscapeUni, o2: OpEscapeChar},
244 | 		{pat: `x\p{Greek}y+?`, o1: OpEscapeUni, o2: OpNonGreedy},
245 | 		{pat: `x\p{L}+y`, o1: OpEscapeUni, o2: OpPlus},
246 | 		{pat: `^\pL`, o1: OpEscapeUni, o2: OpCaret},
247 | 		{pat: `^x\pLy`, o1: OpEscapeUni, o2: OpCaret},
248 | 		{pat: `\d?`, o1: OpEscapeChar, o2: OpQuestion},
249 | 		{pat: `[\xC0-\xC6]`, o1: OpCharRange, o2: OpEscapeHex},
250 | 		{pat: `\01\xff`, o1: OpEscapeOctal, o2: OpEscapeHex},
251 | 		{pat: `\111x\Qabc`, o1: OpEscapeOctal, o2: OpQuote},
252 | 		{pat: `x\Qabc\E.(?:s:..)`, o1: OpQuote, o2: OpGroupWithFlags},
253 | 		{pat: `(?i:foo[[:^alpha:]])`, o1: OpGroupWithFlags, o2: OpPosixClass},
254 | 		{pat: `a[[:digit:]\]]`, o1: OpPosixClass, o2: OpEscapeMeta},
255 | 		{pat: `(?:fa*)`, o1: OpGroup, o2: OpStar},
256 | 		{pat: `(?:x)|(?:y)`, o1: OpGroup, o2: OpAlt},
257 | 		{pat: `(foo|ba?r)`, o1: OpAlt, o2: OpQuestion},
258 | 		{pat: `(?P<1>xy\x{F})`, o1: OpNamedCapture, o2: OpEscapeHex},
259 | 		{pat: `(?P<x>)[^12]+?(?:[^]]x)`, o1: OpNamedCapture, o2: OpNegCharClass},
260 | 		{pat: `()\(`, o1: OpCapture, o2: OpEscapeMeta},
261 | 		{pat: `x{1,}?.?.`, o1: OpNonGreedy, o2: OpDot},
262 | 		{pat: `(?i)f.o`, o1: OpFlagOnlyGroup, o2: OpDot},
263 | 		{pat: `(?:(?i)[^a-z]o)`, o1: OpFlagOnlyGroup, o2: OpNegCharClass},
264 | 		{pat: `(?:(?P<foo>x))`, o1: OpString, o2: OpChar},
265 | 		{pat: `(?>atomic){2}.(?=x)`, o1: OpAtomicGroup, o2: OpPositiveLookahead},
266 | 		{pat: `(?:(?>g2)g1(?=))`, o1: OpAtomicGroup, o2: OpPositiveLookahead},
267 | 		{pat: `(?<=a)|(<!)`, o1: OpPositiveLookbehind, o2: OpNegativeLookbehind},
268 | 		{pat: `(?<=)|(<!a)`, o1: OpPositiveLookbehind, o2: OpNegativeLookbehind},
269 | 		{pat: `\s*\{weight=(\d+)\}\s(?!\s)*`, o1: OpNegativeLookahead},
270 | 		{pat: `(?!x)[.?,!;:@#$%^&*()]+`, o1: OpNegativeLookahead},
271 | 		{pat: `--(?<var_name>[\\w-]+?):\\s+?(?'var_val'.+?);`, o1: OpNamedCapture},
272 | 		{pat: `^ *(#{1,6}) *([^\n]+?) *#* *(?:\n|$)`},
273 | 		{pat: `^4\d{12}(\d{3})?$`},
274 | 	}
275 | 
276 | 	const minTests = 2
277 | 	toCover := make(map[Operation]int)
278 | 	for op := OpNone + 1; op < OpNone2; op++ {
279 | 		switch op {
280 | 		case OpConcat:
281 | 			continue
282 | 		}
283 | 		toCover[op] = minTests
284 | 	}
285 | 
286 | 	exprToString := func(re *Regexp) (s string, err error) {
287 | 		var b strings.Builder
288 | 		writeExpr(t, &b, re, re.Expr)
289 | 		return b.String(), nil
290 | 	}
291 | 
292 | 	p := NewParser(nil)
293 | 	for _, test := range tests {
294 | 		pattern := "_" + test.pat + "_"
295 | 		re, err := p.Parse(pattern)
296 | 		if err != nil {
297 | 			t.Fatalf("parse(%q): %v", test.pat, err)
298 | 		}
299 | 		have, err := exprToString(re)
300 | 		if err != nil {
301 | 			t.Fatalf("stringify(%q): %v", test.pat, err)
302 | 		}
303 | 		want := pattern
304 | 		if have != want {
305 | 			t.Fatalf("result mismatch:\nhave: `%s`\nwant: `%s`", have, want)
306 | 		}
307 | 		if test.o1 != 0 {
308 | 			toCover[test.o1]--
309 | 		}
310 | 		if test.o2 != 0 {
311 | 			toCover[test.o2]--
312 | 			if test.o2 == test.o1 {
313 | 				t.Fatalf("%s: o1==o2", test.pat)
314 | 			}
315 | 		}
316 | 	}
317 | 
318 | 	for op, n := range toCover {
319 | 		if n > 0 {
320 | 			t.Errorf("not enough tests for %s: want %d, have %d",
321 | 				op, minTests, minTests-n)
322 | 		}
323 | 	}
324 | }
325 | 
326 | func TestParser(t *testing.T) {
327 | 	tests := []struct {
328 | 		pattern string
329 | 		want    string
330 | 	}{
331 | 		// Empty pattern.
332 | 		{``, `{}`},
333 | 
334 | 		// Anchors.
335 | 		{`^`, `^`},
336 | 		{`^^`, `{^ ^}`},
337 | 		{`$`, `$`},
338 | 		{`$$`, `{$ $}`},
339 | 
340 | 		// Simple literals and chars.
341 | 		{` `, ` `},
342 | 		{`  `, `  `},
343 | 		{`x`, `x`},
344 | 		{`abc`, `abc`},
345 | 		{`□`, `□`},
346 | 		{`✓`, `✓`},
347 | 		{`✓✓`, `✓✓`},
348 | 
349 | 		// Dots and alternations (or).
350 | 		{`.`, `.`},
351 | 		{`..`, `{. .}`},
352 | 		{`...`, `{. . .}`},
353 | 		{`.|.`, `(or . .)`},
354 | 		{`.|✓|.`, `(or . ✓ .)`},
355 | 		{`✓.|.`, `(or {✓ .} .)`},
356 | 		{`.|✓.`, `(or . {✓ .})`},
357 | 		{`..✓|.`, `(or {. . ✓} .)`},
358 | 		{`.|..|..✓`, `(or . {. .} {. . ✓})`},
359 | 		{`.|...|..`, `(or . {. . .} {. .})`},
360 | 
361 | 		// Capturing groups.
362 | 		{`()`, `(capture {})`},
363 | 		{`(.)`, `(capture .)`},
364 | 		{`(.✓)`, `(capture {. ✓})`},
365 | 		{`(x)|(y)`, `(or (capture x) (capture y))`},
366 | 		{`(x)(y)`, `{(capture x) (capture y)}`},
367 | 		{`✓(x)y`, `{✓ (capture x) y}`},
368 | 		{`a(x1|y1)b`, `{a (capture (or x1 y1)) b}`},
369 | 
370 | 		// Non-capturing groups without flags.
371 | 		{`x(?:)y`, `{x (group {}) y}`},
372 | 		{`x(?:.)y`, `{x (group .) y}`},
373 | 		{`x(?:ab)y`, `{x (group ab) y}`},
374 | 		{`(?:a|b)`, `(group (or a b))`},
375 | 		{`(?:^a|bc)c`, `{(group (or {^ a} bc)) c}`},
376 | 
377 | 		// Flag-only groups.
378 | 		{`x(?i)y`, `{x (flags ?i) y}`},
379 | 		{`x(?i-m)y`, `{x (flags ?i-m) y}`},
380 | 		{`x(?-im)y`, `{x (flags ?-im) y}`},
381 | 
382 | 		// Non-capturing groups with flags.
383 | 		{`x(?i:)y`, `{x (group {} ?i) y}`},
384 | 		{`x(?im:.)y`, `{x (group . ?im) y}`},
385 | 		{`x(?i-m:ab)y`, `{x (group ab ?i-m) y}`},
386 | 
387 | 		// Named captures.
388 | 		{`x(?P<g>)y`, `{x (capture {} g) y}`},
389 | 		{`x(?P<name>.)y`, `{x (capture . name) y}`},
390 | 		{`x(?P<x1>ab)y`, `{x (capture ab x1) y}`},
391 | 		{`x(?<x12>ab)y`, `{x (capture ab x12) y}`},
392 | 		{`x(?'x12'ab)y`, `{x (capture ab x12) y}`},
393 | 
394 | 		// Atomic groups. PCRE-only.
395 | 		{`(?>)`, `(atomic {})`},
396 | 		{`(?>foo)`, `(atomic foo)`},
397 | 
398 | 		// Comments. PCRE-only.
399 | 		{`a(?#)b`, `{a /*(?#)*/ b}`},
400 | 		{`a(?#foo\)b`, `{a /*(?#foo\)*/ b}`},
401 | 
402 | 		// Quantifiers.
403 | 		{`x+`, `(+ x)`},
404 | 		{`x+|y+`, `(or (+ x) (+ y))`},
405 | 		{`x+y+`, `{(+ x) (+ y)}`},
406 | 		{`x+y+|z+`, `(or {(+ x) (+ y)} (+ z))`},
407 | 		{`(ab)+`, `(+ (capture ab))`},
408 | 		{`(.b)+`, `(+ (capture {. b}))`},
409 | 		{`x+y*z+`, `{(+ x) (* y) (+ z)}`},
410 | 		{`abc+`, `{ab (+ c)}`},
411 | 
412 | 		// Non-greedy modifiers.
413 | 		{`x+?|y+?`, `(or (non-greedy (+ x)) (non-greedy (+ y)))`},
414 | 		{`x*?|y*?`, `(or (non-greedy (* x)) (non-greedy (* y)))`},
415 | 		{`x??|y??`, `(or (non-greedy (? x)) (non-greedy (? y)))`},
416 | 
417 | 		// Possessive modifiers. PCRE-only.
418 | 		{`x++|x*+`, `(or (possessive (+ x)) (possessive (* x)))`},
419 | 		{`[ab]?+|x{2,}+`, `(or (possessive (? [a b])) (possessive (repeat x {2,})))`},
420 | 
421 | 		// Escapes and escape chars.
422 | 		{`\d\d+`, `{\d (+ \d)}`},
423 | 		{`\..`, `{\. .}`},
424 | 		{`\1`, `\1`},
425 | 		{`\✓b`, `{\✓ b}`},
426 | 		{`\àb`, `{\à b}`},
427 | 
428 | 		// Short Unicode escapes.
429 | 		{`\pL+d`, `{(+ \pL) d}`},
430 | 
431 | 		// Full Unicode escapes.
432 | 		{`\p{Greek}\p{L}`, `{\p{Greek} \p{L}}`},
433 | 		{`\P{Greek}\p{^L}`, `{\P{Greek} \p{^L}}`},
434 | 
435 | 		// Octal escapes.
436 | 		{`\0`, `\0`},
437 | 		{`\01`, `\01`},
438 | 		{`\012`, `\012`},
439 | 		{`\777`, `\777`},
440 | 		{`\78`, `{\7 8}`},
441 | 		{`\778`, `{\77 8}`},
442 | 
443 | 		// Short hex escapes.
444 | 		{`\xfff`, `{\xff f}`},
445 | 		{`\xab1`, `{\xab 1}`},
446 | 
447 | 		// This is not a valid syntax for hex escapes, but PHP-PCRE accepts them.
448 | 		// Regexp validator can report them, if enabled.
449 | 		{`\x2[\x3\x4]`, `{\x2 [\x3 \x4]}`},
450 | 
451 | 		// Full hex escapes.
452 | 		{`\x{}b`, `{\x{} b}`},
453 | 		{`\x{1}b`, `{\x{1} b}`},
454 | 		{`\x{ABC}b`, `{\x{ABC} b}`},
455 | 
456 | 		// Char classes.
457 | 		{`[1]`, `[1]`},
458 | 		{`[1]a`, `{[1] a}`},
459 | 		{`[-a]`, `[- a]`},
460 | 		{`[a-]`, `[a -]`},
461 | 		{`[a-z]a`, `{[a-z] a}`},
462 | 		{`[a-z0-9]`, `[a-z 0-9]`},
463 | 		{`[0-9-]`, `[0-9 -]`},
464 | 		{`[\da-z_A-Z]`, `[\d a-z _ A-Z]`},
465 | 		{`[\(-\)ab]`, `[\(-\) a b]`},
466 | 		{`[\]\]\d]a`, `{[\] \] \d] a}`},
467 | 		{`[[\[]a`, `{[[ \[] a}`},
468 | 		{`[a|b]`, `[a | b]`},
469 | 		{`[a+b]`, `[a + b]`},
470 | 		{`[a*b]`, `[a * b]`},
471 | 		{`[x{1}]`, `[x '{' 1 '}']`},
472 | 		{`[]]`, `[]]`},
473 | 		{`[][]`, `[] []`},
474 | 
475 | 		// Negated char classes.
476 | 		{`[^1]a`, `{[^1] a}`},
477 | 		{`[^-a]`, `[^- a]`},
478 | 		{`[^a-]`, `[^a -]`},
479 | 		{`[^a-z]a`, `{[^a-z] a}`},
480 | 		{`[^a-z0-9]`, `[^a-z 0-9]`},
481 | 		{`[^\da-z_A-Z]`, `[^\d a-z _ A-Z]`},
482 | 		{`[^\(-\)ab]`, `[^\(-\) a b]`},
483 | 		{`[^\]\]\d]a`, `{[^\] \] \d] a}`},
484 | 		{`[^[\[]a`, `{[^[ \[] a}`},
485 | 		{`[^1abc]`, `[^1 a b c]`},
486 | 		{`[^]]`, `[^]]`},
487 | 		{`[^][]`, `[^] []`},
488 | 		{`[^\040\041\043-\133\135-\176]`, `[^\040 \041 \043-\133 \135-\176]`},
489 | 
490 | 		// Char class ranges.
491 | 		// We parse a-\d and it's something that should be
492 | 		// handled by post-parsing validator.
493 | 		{`[\d-a]`, `[\d - a]`},
494 | 		{`[a-\d]`, `[a-\d]`},
495 | 		{`[\pL0-9]`, `[\pL 0-9]`},
496 | 		{`[+--]`, `[+--]`},
497 | 		{`[--+]`, `[--+]`},
498 | 		{`[---]`, `[---]`},
499 | 		{`[-]`, `[-]`},
500 | 		{`[\x20-\x7f]`, `[\x20-\x7f]`},
501 | 		{`[\x{20}-\x{7f}]`, `[\x{20}-\x{7f}]`},
502 | 		{`[\1-\3]`, `[\1-\3]`},
503 | 		{`[\10-\20]`, `[\10-\20]`},
504 | 		{`[❤-❤a]`, `[❤-❤ a]`},
505 | 
506 | 		// Char class with meta symbols.
507 | 		{`[|]`, `[|]`},
508 | 		{`[$.+*^?]`, `[$ . + * ^ ?]`},
509 | 		{`[^$.+*^?]`, `[^$ . + * ^ ?]`},
510 | 
511 | 		// Posix char classes.
512 | 		{`x[:alpha:]y`, `{x [: a l p h a :] y}`},
513 | 		{`x[a[:alpha:]]y`, `{x [a [:alpha:]] y}`},
514 | 		{`x[[:^alpha:]]y`, `{x [[:^alpha:]] y}`},
515 | 		{`x[^[:alpha:]]y`, `{x [^[:alpha:]] y}`},
516 | 		{`x[^[:^alpha:]]y`, `{x [^[:^alpha:]] y}`},
517 | 
518 | 		// Valid repeat expressions.
519 | 		{`.{3}`, `(repeat . {3})`},
520 | 		{`.{3,}`, `(repeat . {3,})`},
521 | 		{`.{3,6}`, `(repeat . {3,6})`},
522 | 		{`.{6}?`, `(non-greedy (repeat . {6}))`},
523 | 		{`[a-z]{5}`, `(repeat [a-z] {5})`},
524 | 
525 | 		// Invalid repeat expressions are parsed as normal chars.
526 | 		{`.{a}`, `{. {a}}`},
527 | 		{`.{-1}`, `{. {-1}}`},
528 | 
529 | 		// \Q...\E escape.
530 | 		{`\Qa.b\E+z`, `{(+ (q \Qa.b\E)) z}`},
531 | 		{`x\Q?\Ey`, `{x (q \Q?\E) y}`},
532 | 		{`x\Q\Ey`, `{x (q \Q\E) y}`},
533 | 		{`x\Q`, `{x (q \Q)}`},
534 | 		{`x\Qy`, `{x (q \Qy)}`},
535 | 		{`x\Qyz`, `{x (q \Qyz)}`},
536 | 
537 | 		// Incomplete `x|` and `|x` expressions are valid.
538 | 		{`(docker-|)`, `(capture (or docker- {}))`},
539 | 		{`x|`, `(or x {})`},
540 | 		{`|x`, `(or {} x)`},
541 | 		{`(|x|y)`, `(capture (or {} x y))`},
542 | 		{`(?:|x)`, `(group (or {} x))`},
543 | 
544 | 		// More tests for char merging.
545 | 		{`xy+`, `{x (+ y)}`},
546 | 		{`.xy`, `{. xy}`},
547 | 		{`foo?|bar`, `(or {fo (? o)} bar)`},
548 | 
549 | 		// Tests from the patterns found in various GitHub projects.
550 | 		{`Adm([^i]|$)`, `{Adm (capture (or [^i] $))}`},
551 | 		{`\.(com|com\.\w{2})$`, `{\. (capture (or com {com \. (repeat \w {2})})) $}`},
552 | 		{`(?i)a(?:x|y)b`, `{(flags ?i) a (group (or x y)) b}`},
553 | 	}
554 | 
555 | 	p := NewParser(nil)
556 | 	for _, test := range tests {
557 | 		re, err := p.Parse(test.pattern)
558 | 		if err != nil {
559 | 			t.Fatalf("parse(%q) error: %v", test.pattern, err)
560 | 		}
561 | 		have := formatSyntax(re)
562 | 		if have != test.want {
563 | 			t.Fatalf("parse(%q):\nhave: %s\nwant: %s",
564 | 				test.pattern, have, test.want)
565 | 		}
566 | 	}
567 | }
568 | 
569 | func formatSyntax(re *Regexp) string {
570 | 	return formatExprSyntax(re, re.Expr)
571 | }
572 | 
573 | func formatExprSyntax(re *Regexp, e Expr) string {
574 | 	switch e.Op {
575 | 	case OpChar, OpLiteral:
576 | 		switch e.Value {
577 | 		case "{":
578 | 			return "'{'"
579 | 		case "}":
580 | 			return "'}'"
581 | 		default:
582 | 			return e.Value
583 | 		}
584 | 	case OpString, OpEscapeChar, OpEscapeMeta, OpEscapeOctal, OpEscapeUni, OpEscapeHex, OpPosixClass:
585 | 		return e.Value
586 | 	case OpRepeat:
587 | 		return fmt.Sprintf("(repeat %s %s)", formatExprSyntax(re, e.Args[0]), e.Args[1].Value)
588 | 	case OpCaret:
589 | 		return "^"
590 | 	case OpDollar:
591 | 		return "$"
592 | 	case OpDot:
593 | 		return "."
594 | 	case OpQuote:
595 | 		return fmt.Sprintf("(q %s)", e.Value)
596 | 	case OpCharRange:
597 | 		return fmt.Sprintf("%s-%s", formatExprSyntax(re, e.Args[0]), formatExprSyntax(re, e.Args[1]))
598 | 	case OpCharClass:
599 | 		return fmt.Sprintf("[%s]", formatArgsSyntax(re, e.Args))
600 | 	case OpNegCharClass:
601 | 		return fmt.Sprintf("[^%s]", formatArgsSyntax(re, e.Args))
602 | 	case OpConcat:
603 | 		return fmt.Sprintf("{%s}", formatArgsSyntax(re, e.Args))
604 | 	case OpAlt:
605 | 		return fmt.Sprintf("(or %s)", formatArgsSyntax(re, e.Args))
606 | 	case OpCapture:
607 | 		return fmt.Sprintf("(capture %s)", formatExprSyntax(re, e.Args[0]))
608 | 	case OpNamedCapture:
609 | 		return fmt.Sprintf("(capture %s %s)", formatExprSyntax(re, e.Args[0]), e.Args[1].Value)
610 | 	case OpGroup:
611 | 		return fmt.Sprintf("(group %s)", formatExprSyntax(re, e.Args[0]))
612 | 	case OpAtomicGroup:
613 | 		return fmt.Sprintf("(atomic %s)", formatExprSyntax(re, e.Args[0]))
614 | 	case OpGroupWithFlags:
615 | 		return fmt.Sprintf("(group %s ?%s)", formatExprSyntax(re, e.Args[0]), e.Args[1].Value)
616 | 	case OpFlagOnlyGroup:
617 | 		return fmt.Sprintf("(flags ?%s)", formatExprSyntax(re, e.Args[0]))
618 | 	case OpPositiveLookahead:
619 | 		return fmt.Sprintf("(?= %s)", formatExprSyntax(re, e.Args[0]))
620 | 	case OpNegativeLookahead:
621 | 		return fmt.Sprintf("(?! %s)", formatExprSyntax(re, e.Args[0]))
622 | 	case OpPositiveLookbehind:
623 | 		return fmt.Sprintf("(?<= %s)", formatExprSyntax(re, e.Args[0]))
624 | 	case OpNegativeLookbehind:
625 | 		return fmt.Sprintf("(?<! %s)", formatExprSyntax(re, e.Args[0]))
626 | 	case OpPlus:
627 | 		return fmt.Sprintf("(+ %s)", formatExprSyntax(re, e.Args[0]))
628 | 	case OpStar:
629 | 		return fmt.Sprintf("(* %s)", formatExprSyntax(re, e.Args[0]))
630 | 	case OpQuestion:
631 | 		return fmt.Sprintf("(? %s)", formatExprSyntax(re, e.Args[0]))
632 | 	case OpNonGreedy:
633 | 		return fmt.Sprintf("(non-greedy %s)", formatExprSyntax(re, e.Args[0]))
634 | 	case OpPossessive:
635 | 		return fmt.Sprintf("(possessive %s)", formatExprSyntax(re, e.Args[0]))
636 | 	case OpComment:
637 | 		return fmt.Sprintf("/*%s*/", e.Value)
638 | 	default:
639 | 		return fmt.Sprintf("<op=%d>", e.Op)
640 | 	}
641 | }
642 | 
643 | func formatArgsSyntax(re *Regexp, args []Expr) string {
644 | 	parts := make([]string, len(args))
645 | 	for i, e := range args {
646 | 		parts[i] = formatExprSyntax(re, e)
647 | 	}
648 | 	return strings.Join(parts, " ")
649 | }
650 | 
// benchmarkTests lists the named patterns shared by BenchmarkParserPratt
// and BenchmarkParserStdlib; each entry becomes a sub-benchmark.
//
// To run benchmarks:
//	$ go-benchrun ParserStdlib ParserPratt -count 5
var benchmarkTests = []*struct {
	name    string
	pattern string
}{
	{name: `lit`, pattern: `\+\.1234foobarbaz✓✓□□`},
	{name: `alt`, pattern: `(x|y|1)|z|$`},
	{name: `esc`, pattern: `\w\d\pL\123\059\p{L}\p{^Greek}`},
	{name: `charclass`, pattern: `[a-z0-9_][^\d][\(-\)][1234][[[][a-][-a]`},
	{name: `posix`, pattern: `[[:alpha:][:blank:][:^word:]][[:^digit:]]`},
	{name: `meta`, pattern: `x+y*z?.*?.+?.??`},
	{name: `repeat`, pattern: `x{3,}\d{1,4}y{5}z{0}`},
	{name: `group`, pattern: `(?:x)(?i:(?i))(x)(?P<name>x)`},
	{name: `quote`, pattern: `\Qhttp://a.b.com/?x[]=1\E`},
}
667 | 
668 | func BenchmarkParserPratt(b *testing.B) {
669 | 	for _, test := range benchmarkTests {
670 | 		b.Run(test.name, func(b *testing.B) {
671 | 			p := NewParser(nil)
672 | 			b.ResetTimer()
673 | 			for i := 0; i < b.N; i++ {
674 | 				_, err := p.Parse(test.pattern)
675 | 				if err != nil {
676 | 					b.Fatal(err)
677 | 				}
678 | 			}
679 | 		})
680 | 	}
681 | }
682 | 
683 | func BenchmarkParserStdlib(b *testing.B) {
684 | 	for _, test := range benchmarkTests {
685 | 		b.Run(test.name, func(b *testing.B) {
686 | 			for i := 0; i < b.N; i++ {
687 | 				_, err := syntax.Parse(test.pattern, syntax.Perl)
688 | 				if err != nil {
689 | 					b.Fatal(err)
690 | 				}
691 | 			}
692 | 		})
693 | 	}
694 | }
695 | 


--------------------------------------------------------------------------------