├── .gitignore
├── AUTHORS
├── CONTRIBUTORS
├── LICENSE
├── README.md
├── all_test.go
├── backtrack.go
├── example_test.go
├── exec.go
├── exec2_test.go
├── exec_test.go
├── export.sh
├── find_test.go
├── internal
    ├── dfa
    │   ├── dfa.go
    │   ├── dfa_exhaustive_test.go
    │   ├── dfa_test.go
    │   ├── exec_test.go
    │   ├── runerange.go
    │   ├── search.go
    │   ├── state.go
    │   └── workq.go
    └── input
    │   └── input.go
├── onepass.go
├── onepass_test.go
├── regexp.go
├── syntax
    ├── compile.go
    ├── doc.go
    ├── make_perl_groups.pl
    ├── parse.go
    ├── parse_test.go
    ├── perl_groups.go
    ├── prog.go
    ├── prog_test.go
    ├── regexp.go
    ├── simplify.go
    └── simplify_test.go
└── testdata
    ├── README
    ├── basic.dat
    ├── nullsubexpr.dat
    ├── re2-exhaustive.txt.bz2
    ├── re2-search.txt
    ├── repetition.dat
    └── testregex.c


/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | *.[56789ao]
 3 | *.a[56789o]
 4 | *.so
 5 | *.pyc
 6 | ._*
 7 | .nfs.*
 8 | [56789a].out
 9 | *~
10 | *.orig
11 | *.rej
12 | *.exe
13 | .*.swp
14 | core
15 | *.cgo*.go
16 | *.cgo*.c
17 | _cgo_*
18 | _obj
19 | _test
20 | _testmain.go
21 | build.out
22 | test.out
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012 The Go Authors. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 |    * Redistributions of source code must retain the above copyright
 8 | notice, this list of conditions and the following disclaimer.
 9 |    * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 |    * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # go regexp + RE2 DFA port
 2 | 
 3 | `import "matloob.io/regexp"`
 4 | 
 5 | See [golang.org/cl/12081](https://golang.org/cl/12081)
 6 | 
 7 | * The regexp tests pass, though there may still be uncaught bugs.
 8 |   Let me know if you find any of them! No guarantees!
 9 | * regexp/internal/dfa tests are currently failing. I need to fix
10 |   some things there.
11 | * I've got a small change to the DFA that uses package unsafe
12 |   and makes matches 2x faster. I'll try to get it up soon.
13 | * A bunch of cleanup needs to be done all over this package.
14 | 


--------------------------------------------------------------------------------
/all_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2009 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package regexp
  6 | 
  7 | import (
  8 | 	"matloob.io/regexp/syntax"
  9 | 	"reflect"
 10 | 	"strings"
 11 | 	"testing"
 12 | )
 13 | 
// good_re lists patterns that must compile without error.
// TestGoodCompile feeds each entry to compileTest with an empty
// expected-error string.
var good_re = []string{
	``,
	`.`,
	`^.$`,
	`a`,
	`a*`,
	`a+`,
	`a?`,
	`a|b`,
	`a*|b*`,
	`(a*|b)(c*|d)`,
	`[a-z]`,
	`[a-abc-c\-\]\[]`,
	`[a-z]+`,
	`[abc]`,
	`[^1234]`,
	`[^\n]`,
	`\!\\`,
}
 33 | 
// stringError pairs a pattern with the text that compileTest expects to
// find inside the error produced by compiling that pattern.
type stringError struct {
	re  string
	err string
}
 38 | 
// bad_re lists patterns that must fail to compile, each with a substring
// that the resulting error message must contain. TestBadCompile runs
// every entry through compileTest.
var bad_re = []stringError{
	{`*`, "missing argument to repetition operator: `*`"},
	{`+`, "missing argument to repetition operator: `+`"},
	{`?`, "missing argument to repetition operator: `?`"},
	{`(abc`, "missing closing ): `(abc`"},
	{`abc)`, "unexpected ): `abc)`"},
	{`x[a-z`, "missing closing ]: `[a-z`"},
	{`[z-a]`, "invalid character class range: `z-a`"},
	{`abc\`, "trailing backslash at end of expression"},
	{`a**`, "invalid nested repetition operator: `**`"},
	{`a*+`, "invalid nested repetition operator: `*+`"},
	{`\x`, "invalid escape sequence: `\\x`"},
}
 52 | 
 53 | func compileTest(t *testing.T, expr string, error string) *Regexp {
 54 | 	re, err := Compile(expr)
 55 | 	if error == "" && err != nil {
 56 | 		t.Error("compiling `", expr, "`; unexpected error: ", err.Error())
 57 | 	}
 58 | 	if error != "" && err == nil {
 59 | 		t.Error("compiling `", expr, "`; missing error")
 60 | 	} else if error != "" && !strings.Contains(err.Error(), error) {
 61 | 		t.Error("compiling `", expr, "`; wrong error: ", err.Error(), "; want ", error)
 62 | 	}
 63 | 	return re
 64 | }
 65 | 
 66 | func TestGoodCompile(t *testing.T) {
 67 | 	for i := 0; i < len(good_re); i++ {
 68 | 		compileTest(t, good_re[i], "")
 69 | 	}
 70 | }
 71 | 
 72 | func TestBadCompile(t *testing.T) {
 73 | 	for i := 0; i < len(bad_re); i++ {
 74 | 		compileTest(t, bad_re[i].re, bad_re[i].err)
 75 | 	}
 76 | }
 77 | 
 78 | func matchTest(t *testing.T, test *FindTest) {
 79 | 	re := compileTest(t, test.pat, "")
 80 | 	if re == nil {
 81 | 		return
 82 | 	}
 83 | 	m := re.MatchString(test.text)
 84 | 	if m != (len(test.matches) > 0) {
 85 | 		t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0)
 86 | 	}
 87 | 	// now try bytes
 88 | 	m = re.Match([]byte(test.text))
 89 | 	if m != (len(test.matches) > 0) {
 90 | 		t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0)
 91 | 	}
 92 | }
 93 | 
 94 | func TestMatch(t *testing.T) {
 95 | 	for _, test := range findTests {
 96 | 		matchTest(t, &test)
 97 | 	}
 98 | }
 99 | 
100 | func matchFunctionTest(t *testing.T, test *FindTest) {
101 | 	m, err := MatchString(test.pat, test.text)
102 | 	if err == nil {
103 | 		return
104 | 	}
105 | 	if m != (len(test.matches) > 0) {
106 | 		t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0)
107 | 	}
108 | }
109 | 
110 | func TestMatchFunction(t *testing.T) {
111 | 	for _, test := range findTests {
112 | 		matchFunctionTest(t, &test)
113 | 	}
114 | }
115 | 
116 | func copyMatchTest(t *testing.T, test *FindTest) {
117 | 	re := compileTest(t, test.pat, "")
118 | 	if re == nil {
119 | 		return
120 | 	}
121 | 	m1 := re.MatchString(test.text)
122 | 	m2 := re.Copy().MatchString(test.text)
123 | 	if m1 != m2 {
124 | 		t.Errorf("Copied Regexp match failure on %s: original gave %t; copy gave %t; should be %t",
125 | 			test, m1, m2, len(test.matches) > 0)
126 | 	}
127 | }
128 | 
129 | func TestCopyMatch(t *testing.T) {
130 | 	for _, test := range findTests {
131 | 		copyMatchTest(t, &test)
132 | 	}
133 | }
134 | 
// ReplaceTest describes one replacement case: the regexp pattern, the
// replacement text, the input to rewrite, and the expected output.
type ReplaceTest struct {
	pattern, replacement, input, output string
}
138 | 
// replaceTests holds the cases for TestReplaceAll, where "$" in the
// replacement is expanded. TestReplaceAllLiteral reuses the entries whose
// replacement contains no "$".
var replaceTests = []ReplaceTest{
	// Test empty input and/or replacement, with pattern that matches the empty string.
	{"", "", "", ""},
	{"", "x", "", "x"},
	{"", "", "abc", "abc"},
	{"", "x", "abc", "xaxbxcx"},

	// Test empty input and/or replacement, with pattern that does not match the empty string.
	{"b", "", "", ""},
	{"b", "x", "", ""},
	{"b", "", "abc", "ac"},
	{"b", "x", "abc", "axc"},
	{"y", "", "", ""},
	{"y", "x", "", ""},
	{"y", "", "abc", "abc"},
	{"y", "x", "abc", "abc"},

	// Multibyte characters -- verify that we don't try to match in the middle
	// of a character.
	{"[a-c]*", "x", "\u65e5", "x\u65e5x"},
	{"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"},

	// Start and end of a string.
	{"^[a-c]*", "x", "abcdabc", "xdabc"},
	{"[a-c]*$", "x", "abcdabc", "abcdx"},
	{"^[a-c]*$", "x", "abcdabc", "abcdabc"},
	{"^[a-c]*", "x", "abc", "x"},
	{"[a-c]*$", "x", "abc", "x"},
	{"^[a-c]*$", "x", "abc", "x"},
	{"^[a-c]*", "x", "dabce", "xdabce"},
	{"[a-c]*$", "x", "dabce", "dabcex"},
	{"^[a-c]*$", "x", "dabce", "dabce"},
	{"^[a-c]*", "x", "", "x"},
	{"[a-c]*$", "x", "", "x"},
	{"^[a-c]*$", "x", "", "x"},

	{"^[a-c]+", "x", "abcdabc", "xdabc"},
	{"[a-c]+$", "x", "abcdabc", "abcdx"},
	{"^[a-c]+$", "x", "abcdabc", "abcdabc"},
	{"^[a-c]+", "x", "abc", "x"},
	{"[a-c]+$", "x", "abc", "x"},
	{"^[a-c]+$", "x", "abc", "x"},
	{"^[a-c]+", "x", "dabce", "dabce"},
	{"[a-c]+$", "x", "dabce", "dabce"},
	{"^[a-c]+$", "x", "dabce", "dabce"},
	{"^[a-c]+", "x", "", ""},
	{"[a-c]+$", "x", "", ""},
	{"^[a-c]+$", "x", "", ""},

	// Other cases.
	{"abc", "def", "abcdefg", "defdefg"},
	{"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"},
	{"abc", "", "abcdabc", "d"},
	{"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"},
	{"abc", "d", "", ""},
	{"abc", "d", "abc", "d"},
	{".+", "x", "abc", "x"},
	{"[a-c]*", "x", "def", "xdxexfx"},
	{"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"},
	{"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"},

	// Substitutions
	{"a+", "($0)", "banana", "b(a)n(a)n(a)"},
	{"a+", "(${0})", "banana", "b(a)n(a)n(a)"},
	{"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"},
	// NOTE(review): the next entry repeats the one above verbatim.
	{"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"},
	{"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, world"},
	{"hello, (.+)", "goodbye, $1x", "hello, world", "goodbye, "},
	{"hello, (.+)", "goodbye, ${1}x", "hello, world", "goodbye, worldx"},
	{"hello, (.+)", "<$0><$1><$2><$3>", "hello, world", "<hello, world><world><><>"},
	{"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, world!"},
	{"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, world"},
	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "hihihi"},
	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "byebyebye"},
	{"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", ""},
	{"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "hiyz"},
	{"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $x"},
	{"a+", "${oops", "aaa", "${oops"},
	{"a+", "$$", "aaa", "$"},
	{"a+", "$", "aaa", "$"},

	// Substitution when subexpression isn't found
	{"(x)?", "$1", "123", "123"},
	{"abc", "$1", "123", "123"},

	// Substitutions involving a (x){0}
	{"(a)(b){0}(c)", ".$1|$3.", "xacxacx", "x.a|c.x.a|c.x"},
	{"(a)(((b))){0}c", ".$1.", "xacxacx", "x.a.x.a.x"},
	{"((a(b){0}){3}){5}(h)", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"},
	{"((a(b){0}){3}){5}h", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"},
}
230 | 
// replaceLiteralTests holds cases specific to TestReplaceAllLiteral: the
// replacement contains "$" sequences, and the expected output shows them
// copied through verbatim rather than expanded.
var replaceLiteralTests = []ReplaceTest{
	// Substitutions
	{"a+", "($0)", "banana", "b($0)n($0)n($0)"},
	{"a+", "(${0})", "banana", "b(${0})n(${0})n(${0})"},
	{"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"},
	// NOTE(review): the next entry repeats the one above verbatim.
	{"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"},
	{"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, ${1}"},
	{"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, $noun!"},
	{"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, ${noun}"},
	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "$x$x$x"},
	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "$x$x$x"},
	{"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", "$xyz"},
	{"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "${x}yz"},
	{"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $$x"},
	{"a+", "${oops", "aaa", "${oops"},
	{"a+", "$$", "aaa", "$$"},
	{"a+", "$", "aaa", "$"},
}
249 | 
// ReplaceFuncTest describes one function-based replacement case: the
// regexp pattern, the function applied to each match, the input to
// rewrite, and the expected output.
type ReplaceFuncTest struct {
	pattern       string
	replacement   func(string) string
	input, output string
}
255 | 
// replaceFuncTests holds the cases for TestReplaceAllFunc. Each
// replacement function wraps its argument in "x" and "y".
var replaceFuncTests = []ReplaceFuncTest{
	{"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"},
	{"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"},
	{"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"},
}
261 | 
262 | func TestReplaceAll(t *testing.T) {
263 | 	for _, tc := range replaceTests {
264 | 		re, err := Compile(tc.pattern)
265 | 		if err != nil {
266 | 			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
267 | 			continue
268 | 		}
269 | 		actual := re.ReplaceAllString(tc.input, tc.replacement)
270 | 		if actual != tc.output {
271 | 			t.Errorf("%q.ReplaceAllString(%q,%q) = %q; want %q",
272 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
273 | 		}
274 | 		// now try bytes
275 | 		actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement)))
276 | 		if actual != tc.output {
277 | 			t.Errorf("%q.ReplaceAll(%q,%q) = %q; want %q",
278 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
279 | 		}
280 | 	}
281 | }
282 | 
283 | func TestReplaceAllLiteral(t *testing.T) {
284 | 	// Run ReplaceAll tests that do not have $ expansions.
285 | 	for _, tc := range replaceTests {
286 | 		if strings.Contains(tc.replacement, "$") {
287 | 			continue
288 | 		}
289 | 		re, err := Compile(tc.pattern)
290 | 		if err != nil {
291 | 			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
292 | 			continue
293 | 		}
294 | 		actual := re.ReplaceAllLiteralString(tc.input, tc.replacement)
295 | 		if actual != tc.output {
296 | 			t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q",
297 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
298 | 		}
299 | 		// now try bytes
300 | 		actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement)))
301 | 		if actual != tc.output {
302 | 			t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q",
303 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
304 | 		}
305 | 	}
306 | 
307 | 	// Run literal-specific tests.
308 | 	for _, tc := range replaceLiteralTests {
309 | 		re, err := Compile(tc.pattern)
310 | 		if err != nil {
311 | 			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
312 | 			continue
313 | 		}
314 | 		actual := re.ReplaceAllLiteralString(tc.input, tc.replacement)
315 | 		if actual != tc.output {
316 | 			t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q",
317 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
318 | 		}
319 | 		// now try bytes
320 | 		actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement)))
321 | 		if actual != tc.output {
322 | 			t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q",
323 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
324 | 		}
325 | 	}
326 | }
327 | 
328 | func TestReplaceAllFunc(t *testing.T) {
329 | 	for _, tc := range replaceFuncTests {
330 | 		re, err := Compile(tc.pattern)
331 | 		if err != nil {
332 | 			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
333 | 			continue
334 | 		}
335 | 		actual := re.ReplaceAllStringFunc(tc.input, tc.replacement)
336 | 		if actual != tc.output {
337 | 			t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q",
338 | 				tc.pattern, tc.input, actual, tc.output)
339 | 		}
340 | 		// now try bytes
341 | 		actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) }))
342 | 		if actual != tc.output {
343 | 			t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q",
344 | 				tc.pattern, tc.input, actual, tc.output)
345 | 		}
346 | 	}
347 | }
348 | 
// MetaTest is a case shared by TestQuoteMeta and TestLiteralPrefix.
// output is the expected result of QuoteMeta(pattern); literal and
// isLiteral are the expected results of LiteralPrefix on the compiled
// pattern.
type MetaTest struct {
	pattern, output, literal string
	isLiteral                bool
}
353 | 
// metaTests holds the cases used by both TestQuoteMeta and
// TestLiteralPrefix.
var metaTests = []MetaTest{
	{``, ``, ``, true},
	{`foo`, `foo`, `foo`, true},
	{`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator
	{`foo.\$`, `foo\.\\\$`, `foo`, false},     // has escaped operators and real operators
	{`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false},
}
361 | 
// literalPrefixTests holds extra cases that only TestLiteralPrefix runs.
// They cover patterns built from ^ and $ anchors around a literal.
var literalPrefixTests = []MetaTest{
	// See golang.org/issue/11175.
	// output is unused.
	{`^0^0$`, ``, `0`, false},
	{`^0^`, ``, ``, false},
	{`^0$`, ``, `0`, true},
	{`$0^`, ``, ``, false},
	{`$0$`, ``, ``, false},
	{`^^0$$`, ``, ``, false},
	{`^$^$`, ``, ``, false},
	{`$$0^^`, ``, ``, false},
}
374 | 
375 | func TestQuoteMeta(t *testing.T) {
376 | 	for _, tc := range metaTests {
377 | 		// Verify that QuoteMeta returns the expected string.
378 | 		quoted := QuoteMeta(tc.pattern)
379 | 		if quoted != tc.output {
380 | 			t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`",
381 | 				tc.pattern, quoted, tc.output)
382 | 			continue
383 | 		}
384 | 
385 | 		// Verify that the quoted string is in fact treated as expected
386 | 		// by Compile -- i.e. that it matches the original, unquoted string.
387 | 		if tc.pattern != "" {
388 | 			re, err := Compile(quoted)
389 | 			if err != nil {
390 | 				t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err)
391 | 				continue
392 | 			}
393 | 			src := "abc" + tc.pattern + "def"
394 | 			repl := "xyz"
395 | 			replaced := re.ReplaceAllString(src, repl)
396 | 			expected := "abcxyzdef"
397 | 			if replaced != expected {
398 | 				t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`",
399 | 					tc.pattern, src, repl, replaced, expected)
400 | 			}
401 | 		}
402 | 	}
403 | }
404 | 
405 | func TestLiteralPrefix(t *testing.T) {
406 | 	for _, tc := range append(metaTests, literalPrefixTests...) {
407 | 		// Literal method needs to scan the pattern.
408 | 		re := MustCompile(tc.pattern)
409 | 		str, complete := re.LiteralPrefix()
410 | 		if complete != tc.isLiteral {
411 | 			t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral)
412 | 		}
413 | 		if str != tc.literal {
414 | 			t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal)
415 | 		}
416 | 	}
417 | }
418 | 
// subexpCase is a case for TestSubexp: input is the pattern, num is the
// expected NumSubexp result, and names is the expected SubexpNames
// result. A nil names skips the per-name comparison.
type subexpCase struct {
	input string
	num   int
	names []string
}
424 | 
// subexpCases holds the cases for TestSubexp. Every non-nil names slice
// has num+1 entries, one more than the subexpression count.
var subexpCases = []subexpCase{
	{``, 0, nil},
	{`.*`, 0, nil},
	{`abba`, 0, nil},
	{`ab(b)a`, 1, []string{"", ""}},
	{`ab(.*)a`, 1, []string{"", ""}},
	{`(.*)ab(.*)a`, 2, []string{"", "", ""}},
	{`(.*)(ab)(.*)a`, 3, []string{"", "", "", ""}},
	{`(.*)((a)b)(.*)a`, 4, []string{"", "", "", "", ""}},
	{`(.*)(\(ab)(.*)a`, 3, []string{"", "", "", ""}},
	{`(.*)(\(a\)b)(.*)a`, 3, []string{"", "", "", ""}},
	{`(?P<foo>.*)(?P<bar>(a)b)(?P<foo>.*)a`, 4, []string{"", "foo", "bar", "", "foo"}},
}
438 | 
439 | func TestSubexp(t *testing.T) {
440 | 	for _, c := range subexpCases {
441 | 		re := MustCompile(c.input)
442 | 		n := re.NumSubexp()
443 | 		if n != c.num {
444 | 			t.Errorf("%q: NumSubexp = %d, want %d", c.input, n, c.num)
445 | 			continue
446 | 		}
447 | 		names := re.SubexpNames()
448 | 		if len(names) != 1+n {
449 | 			t.Errorf("%q: len(SubexpNames) = %d, want %d", c.input, len(names), n)
450 | 			continue
451 | 		}
452 | 		if c.names != nil {
453 | 			for i := 0; i < 1+n; i++ {
454 | 				if names[i] != c.names[i] {
455 | 					t.Errorf("%q: SubexpNames[%d] = %q, want %q", c.input, i, names[i], c.names[i])
456 | 				}
457 | 			}
458 | 		}
459 | 	}
460 | }
461 | 
// splitTests holds the cases for TestSplit: s is the input string, r the
// pattern to split on, n the count passed to Split, and out the expected
// result.
var splitTests = []struct {
	s   string
	r   string
	n   int
	out []string
}{
	{"foo:and:bar", ":", -1, []string{"foo", "and", "bar"}},
	{"foo:and:bar", ":", 1, []string{"foo:and:bar"}},
	{"foo:and:bar", ":", 2, []string{"foo", "and:bar"}},
	{"foo:and:bar", "foo", -1, []string{"", ":and:bar"}},
	{"foo:and:bar", "bar", -1, []string{"foo:and:", ""}},
	{"foo:and:bar", "baz", -1, []string{"foo:and:bar"}},
	{"baabaab", "a", -1, []string{"b", "", "b", "", "b"}},
	{"baabaab", "a*", -1, []string{"b", "b", "b"}},
	{"baabaab", "ba*", -1, []string{"", "", "", ""}},
	{"foobar", "f*b*", -1, []string{"", "o", "o", "a", "r"}},
	{"foobar", "f+.*b+", -1, []string{"", "ar"}},
	{"foobooboar", "o{2}", -1, []string{"f", "b", "boar"}},
	{"a,b,c,d,e,f", ",", 3, []string{"a", "b", "c,d,e,f"}},
	{"a,b,c,d,e,f", ",", 0, nil},
	{",", ",", -1, []string{"", ""}},
	{",,,", ",", -1, []string{"", "", "", ""}},
	{"", ",", -1, []string{""}},
	{"", ".*", -1, []string{""}},
	{"", ".+", -1, []string{""}},
	{"", "", -1, []string{}},
	{"foobar", "", -1, []string{"f", "o", "o", "b", "a", "r"}},
	{"abaabaccadaaae", "a*", 5, []string{"", "b", "b", "c", "cadaaae"}},
	{":x:y:z:", ":", -1, []string{"", "x", "y", "z", ""}},
}
492 | 
493 | func TestSplit(t *testing.T) {
494 | 	for i, test := range splitTests {
495 | 		re, err := Compile(test.r)
496 | 		if err != nil {
497 | 			t.Errorf("#%d: %q: compile error: %s", i, test.r, err.Error())
498 | 			continue
499 | 		}
500 | 
501 | 		split := re.Split(test.s, test.n)
502 | 		if !reflect.DeepEqual(split, test.out) {
503 | 			t.Errorf("#%d: %q: got %q; want %q", i, test.r, split, test.out)
504 | 		}
505 | 
506 | 		if QuoteMeta(test.r) == test.r {
507 | 			strsplit := strings.SplitN(test.s, test.r, test.n)
508 | 			if !reflect.DeepEqual(split, strsplit) {
509 | 				t.Errorf("#%d: Split(%q, %q, %d): regexp vs strings mismatch\nregexp=%q\nstrings=%q", i, test.s, test.r, test.n, split, strsplit)
510 | 			}
511 | 		}
512 | 	}
513 | }
514 | 
515 | // Check that one-pass cutoff does trigger.
516 | func TestOnePassCutoff(t *testing.T) {
517 | 	re, err := syntax.Parse(`^x{1,1000}y{1,1000}$`, syntax.Perl)
518 | 	if err != nil {
519 | 		t.Fatalf("parse: %v", err)
520 | 	}
521 | 	p, err := syntax.Compile(re.Simplify())
522 | 	if err != nil {
523 | 		t.Fatalf("compile: %v", err)
524 | 	}
525 | 	if compileOnePass(p) != notOnePass {
526 | 		t.Fatalf("makeOnePass succeeded; wanted notOnePass")
527 | 	}
528 | }
529 | 
530 | // Check that the same machine can be used with the standard matcher
531 | // and then the backtracker when there are no captures.
532 | func TestSwitchBacktrack(t *testing.T) {
533 | 	re := MustCompile(`a|b`)
534 | 	long := make([]byte, maxBacktrackVector+1)
535 | 
536 | 	// The following sequence of Match calls used to panic. See issue #10319.
537 | 	re.Match(long)     // triggers standard matcher
538 | 	re.Match(long[:1]) // triggers backtracker
539 | }
540 | 
541 | func BenchmarkLiteral(b *testing.B) {
542 | 	x := strings.Repeat("x", 50) + "y"
543 | 	b.StopTimer()
544 | 	re := MustCompile("y")
545 | 	b.StartTimer()
546 | 	for i := 0; i < b.N; i++ {
547 | 		if !re.MatchString(x) {
548 | 			b.Fatalf("no match!")
549 | 		}
550 | 	}
551 | }
552 | 
553 | func BenchmarkNotLiteral(b *testing.B) {
554 | 	x := strings.Repeat("x", 50) + "y"
555 | 	b.StopTimer()
556 | 	re := MustCompile(".y")
557 | 	b.StartTimer()
558 | 	for i := 0; i < b.N; i++ {
559 | 		if !re.MatchString(x) {
560 | 			b.Fatalf("no match!")
561 | 		}
562 | 	}
563 | }
564 | 
565 | func BenchmarkMatchClass(b *testing.B) {
566 | 	b.StopTimer()
567 | 	x := strings.Repeat("xxxx", 20) + "w"
568 | 	re := MustCompile("[abcdw]")
569 | 	b.StartTimer()
570 | 	for i := 0; i < b.N; i++ {
571 | 		if !re.MatchString(x) {
572 | 			b.Fatalf("no match!")
573 | 		}
574 | 	}
575 | }
576 | 
577 | func BenchmarkMatchClass_InRange(b *testing.B) {
578 | 	b.StopTimer()
579 | 	// 'b' is between 'a' and 'c', so the charclass
580 | 	// range checking is no help here.
581 | 	x := strings.Repeat("bbbb", 20) + "c"
582 | 	re := MustCompile("[ac]")
583 | 	b.StartTimer()
584 | 	for i := 0; i < b.N; i++ {
585 | 		if !re.MatchString(x) {
586 | 			b.Fatalf("no match!")
587 | 		}
588 | 	}
589 | }
590 | 
591 | func BenchmarkReplaceAll(b *testing.B) {
592 | 	x := "abcdefghijklmnopqrstuvwxyz"
593 | 	b.StopTimer()
594 | 	re := MustCompile("[cjrw]")
595 | 	b.StartTimer()
596 | 	for i := 0; i < b.N; i++ {
597 | 		re.ReplaceAllString(x, "")
598 | 	}
599 | }
600 | 
601 | func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) {
602 | 	b.StopTimer()
603 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
604 | 	re := MustCompile("^zbc(d|e)")
605 | 	b.StartTimer()
606 | 	for i := 0; i < b.N; i++ {
607 | 		re.Match(x)
608 | 	}
609 | }
610 | 
611 | func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) {
612 | 	b.StopTimer()
613 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
614 | 	for i := 0; i < 15; i++ {
615 | 		x = append(x, x...)
616 | 	}
617 | 	re := MustCompile("^zbc(d|e)")
618 | 	b.StartTimer()
619 | 	for i := 0; i < b.N; i++ {
620 | 		re.Match(x)
621 | 	}
622 | }
623 | 
624 | func BenchmarkAnchoredShortMatch(b *testing.B) {
625 | 	b.StopTimer()
626 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
627 | 	re := MustCompile("^.bc(d|e)")
628 | 	b.StartTimer()
629 | 	for i := 0; i < b.N; i++ {
630 | 		re.Match(x)
631 | 	}
632 | }
633 | 
634 | func BenchmarkAnchoredLongMatch(b *testing.B) {
635 | 	b.StopTimer()
636 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
637 | 	for i := 0; i < 15; i++ {
638 | 		x = append(x, x...)
639 | 	}
640 | 	re := MustCompile("^.bc(d|e)")
641 | 	b.StartTimer()
642 | 	for i := 0; i < b.N; i++ {
643 | 		re.Match(x)
644 | 	}
645 | }
646 | 
647 | func BenchmarkOnePassShortA(b *testing.B) {
648 | 	b.StopTimer()
649 | 	x := []byte("abcddddddeeeededd")
650 | 	re := MustCompile("^.bc(d|e)*$")
651 | 	b.StartTimer()
652 | 	for i := 0; i < b.N; i++ {
653 | 		re.Match(x)
654 | 	}
655 | }
656 | 
657 | func BenchmarkNotOnePassShortA(b *testing.B) {
658 | 	b.StopTimer()
659 | 	x := []byte("abcddddddeeeededd")
660 | 	re := MustCompile(".bc(d|e)*$")
661 | 	b.StartTimer()
662 | 	for i := 0; i < b.N; i++ {
663 | 		re.Match(x)
664 | 	}
665 | }
666 | 
667 | func BenchmarkOnePassShortB(b *testing.B) {
668 | 	b.StopTimer()
669 | 	x := []byte("abcddddddeeeededd")
670 | 	re := MustCompile("^.bc(?:d|e)*$")
671 | 	b.StartTimer()
672 | 	for i := 0; i < b.N; i++ {
673 | 		re.Match(x)
674 | 	}
675 | }
676 | 
677 | func BenchmarkNotOnePassShortB(b *testing.B) {
678 | 	b.StopTimer()
679 | 	x := []byte("abcddddddeeeededd")
680 | 	re := MustCompile(".bc(?:d|e)*$")
681 | 	b.StartTimer()
682 | 	for i := 0; i < b.N; i++ {
683 | 		re.Match(x)
684 | 	}
685 | }
686 | 
687 | func BenchmarkOnePassLongPrefix(b *testing.B) {
688 | 	b.StopTimer()
689 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
690 | 	re := MustCompile("^abcdefghijklmnopqrstuvwxyz.*$")
691 | 	b.StartTimer()
692 | 	for i := 0; i < b.N; i++ {
693 | 		re.Match(x)
694 | 	}
695 | }
696 | 
697 | func BenchmarkOnePassLongNotPrefix(b *testing.B) {
698 | 	b.StopTimer()
699 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
700 | 	re := MustCompile("^.bcdefghijklmnopqrstuvwxyz.*$")
701 | 	b.StartTimer()
702 | 	for i := 0; i < b.N; i++ {
703 | 		re.Match(x)
704 | 	}
705 | }
706 | 
707 | func BenchmarkMatchParallelShared(b *testing.B) {
708 | 	x := []byte("this is a long line that contains foo bar baz")
709 | 	re := MustCompile("foo (ba+r)? baz")
710 | 	b.ResetTimer()
711 | 	b.RunParallel(func(pb *testing.PB) {
712 | 		for pb.Next() {
713 | 			re.Match(x)
714 | 		}
715 | 	})
716 | }
717 | 
718 | func BenchmarkMatchParallelCopied(b *testing.B) {
719 | 	x := []byte("this is a long line that contains foo bar baz")
720 | 	re := MustCompile("foo (ba+r)? baz")
721 | 	b.ResetTimer()
722 | 	b.RunParallel(func(pb *testing.PB) {
723 | 		re := re.Copy()
724 | 		for pb.Next() {
725 | 			re.Match(x)
726 | 		}
727 | 	})
728 | }
729 | 


--------------------------------------------------------------------------------
/backtrack.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // backtrack is a regular expression search with submatch
  6 | // tracking for small regular expressions and texts. It allocates
  7 | // a bit vector with (length of input) * (length of prog) bits,
  8 | // to make sure it never explores the same (character position, instruction)
  9 | // state multiple times. This limits the search to run in time linear in
 10 | // the length of the text.
 11 | //
 12 | // backtrack is a fast replacement for the NFA code on small
 13 | // regexps when onepass cannot be used.
 14 | 
 15 | package regexp
 16 | 
 17 | import (
 18 | 	"matloob.io/regexp/internal/input"
 19 | 	"matloob.io/regexp/syntax"
 20 | )
 21 | 
// A job is an entry on the backtracker's job stack. It holds
// the instruction pc, the position in the input, and an arg value.
// push treats arg == 0 as a fresh visit that is subject to the
// shouldVisit check, and arg > 0 as the continuation of an earlier visit.
type job struct {
	pc  uint32
	arg int
	pos int
}
 29 | 
// Sizing limits for the backtracker. visitedBits is the number of bits
// in one element of bitState.visited; shouldVisit uses it to find the
// word and bit for a given (pc, pos) pair.
const (
	visitedBits        = 32
	maxBacktrackProg   = 500        // len(prog.Inst) <= max
	maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
)
 35 | 
// bitState holds state for the backtracker.
type bitState struct {
	prog *syntax.Prog // program being run

	end     int      // end position in the input, as given to reset
	cap     []int    // capture slots; reset sets every entry to -1
	jobs    []job    // stack of pending jobs
	visited []uint32 // bit set of (pc, pos) pairs already visited
}
 45 | 
 46 | var notBacktrack *bitState = nil
 47 | 
 48 | // maxBitStateLen returns the maximum length of a string to search with
 49 | // the backtracker using prog.
 50 | func maxBitStateLen(prog *syntax.Prog) int {
 51 | 	if !shouldBacktrack(prog) {
 52 | 		return 0
 53 | 	}
 54 | 	return maxBacktrackVector / len(prog.Inst)
 55 | }
 56 | 
 57 | // newBitState returns a new bitState for the given prog,
 58 | // or notBacktrack if the size of the prog exceeds the maximum size that
 59 | // the backtracker will be run for.
 60 | func newBitState(prog *syntax.Prog) *bitState {
 61 | 	if !shouldBacktrack(prog) {
 62 | 		return notBacktrack
 63 | 	}
 64 | 	return &bitState{
 65 | 		prog: prog,
 66 | 	}
 67 | }
 68 | 
// shouldBacktrack reports whether the program is short enough for the
// backtracker to run, that is, whether it has at most maxBacktrackProg
// instructions.
func shouldBacktrack(prog *syntax.Prog) bool {
	return len(prog.Inst) <= maxBacktrackProg
}
 74 | 
 75 | // reset resets the state of the backtracker.
 76 | // end is the end position in the input.
 77 | // ncap is the number of captures.
 78 | func (b *bitState) reset(end int, ncap int) {
 79 | 	b.end = end
 80 | 
 81 | 	if cap(b.jobs) == 0 {
 82 | 		b.jobs = make([]job, 0, 256)
 83 | 	} else {
 84 | 		b.jobs = b.jobs[:0]
 85 | 	}
 86 | 
 87 | 	visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
 88 | 	if cap(b.visited) < visitedSize {
 89 | 		b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
 90 | 	} else {
 91 | 		b.visited = b.visited[:visitedSize]
 92 | 		for i := range b.visited {
 93 | 			b.visited[i] = 0
 94 | 		}
 95 | 	}
 96 | 
 97 | 	if cap(b.cap) < ncap {
 98 | 		b.cap = make([]int, ncap)
 99 | 	} else {
100 | 		b.cap = b.cap[:ncap]
101 | 	}
102 | 	for i := range b.cap {
103 | 		b.cap[i] = -1
104 | 	}
105 | }
106 | 
107 | // shouldVisit reports whether the combination of (pc, pos) has not
108 | // been visited yet.
109 | func (b *bitState) shouldVisit(pc uint32, pos int) bool {
110 | 	n := uint(int(pc)*(b.end+1) + pos)
111 | 	if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
112 | 		return false
113 | 	}
114 | 	b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
115 | 	return true
116 | }
117 | 
118 | // push pushes (pc, pos, arg) onto the job stack if it should be
119 | // visited.
120 | func (b *bitState) push(pc uint32, pos int, arg int) {
121 | 	if b.prog.Inst[pc].Op == syntax.InstFail {
122 | 		return
123 | 	}
124 | 
125 | 	// Only check shouldVisit when arg == 0.
126 | 	// When arg > 0, we are continuing a previous visit.
127 | 	if arg == 0 && !b.shouldVisit(pc, pos) {
128 | 		return
129 | 	}
130 | 
131 | 	b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
132 | }
133 | 
134 | // tryBacktrack runs a backtracking search starting at pos.
135 | func (m *machine) tryBacktrack(b *bitState, i input.Input, pc uint32, pos int) bool {
136 | 	longest := m.re.longest
137 | 	m.matched = false
138 | 
139 | 	b.push(pc, pos, 0)
140 | 	for len(b.jobs) > 0 {
141 | 		l := len(b.jobs) - 1
142 | 		// Pop job off the stack.
143 | 		pc := b.jobs[l].pc
144 | 		pos := b.jobs[l].pos
145 | 		arg := b.jobs[l].arg
146 | 		b.jobs = b.jobs[:l]
147 | 
148 | 		// Optimization: rather than push and pop,
149 | 		// code that is going to Push and continue
150 | 		// the loop simply updates ip, p, and arg
151 | 		// and jumps to CheckAndLoop. We have to
152 | 		// do the ShouldVisit check that Push
153 | 		// would have, but we avoid the stack
154 | 		// manipulation.
155 | 		goto Skip
156 | 	CheckAndLoop:
157 | 		if !b.shouldVisit(pc, pos) {
158 | 			continue
159 | 		}
160 | 	Skip:
161 | 
162 | 		inst := b.prog.Inst[pc]
163 | 
164 | 		switch inst.Op {
165 | 		default:
166 | 			panic("bad inst")
167 | 		case syntax.InstFail:
168 | 			panic("unexpected InstFail")
169 | 		case syntax.InstAlt:
170 | 			// Cannot just
171 | 			//   b.push(inst.Out, pos, 0)
172 | 			//   b.push(inst.Arg, pos, 0)
173 | 			// If during the processing of inst.Out, we encounter
174 | 			// inst.Arg via another path, we want to process it then.
175 | 			// Pushing it here will inhibit that. Instead, re-push
176 | 			// inst with arg==1 as a reminder to push inst.Arg out
177 | 			// later.
178 | 			switch arg {
179 | 			case 0:
180 | 				b.push(pc, pos, 1)
181 | 				pc = inst.Out
182 | 				goto CheckAndLoop
183 | 			case 1:
184 | 				// Finished inst.Out; try inst.Arg.
185 | 				arg = 0
186 | 				pc = inst.Arg
187 | 				goto CheckAndLoop
188 | 			}
189 | 			panic("bad arg in InstAlt")
190 | 
191 | 		case syntax.InstAltMatch:
192 | 			// One opcode consumes runes; the other leads to match.
193 | 			switch b.prog.Inst[inst.Out].Op {
194 | 			case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
195 | 				// inst.Arg is the match.
196 | 				b.push(inst.Arg, pos, 0)
197 | 				pc = inst.Arg
198 | 				pos = b.end
199 | 				goto CheckAndLoop
200 | 			}
201 | 			// inst.Out is the match - non-greedy
202 | 			b.push(inst.Out, b.end, 0)
203 | 			pc = inst.Out
204 | 			goto CheckAndLoop
205 | 
206 | 		case syntax.InstRune:
207 | 			r, width := i.Step(pos)
208 | 			if !inst.MatchRune(r) {
209 | 				continue
210 | 			}
211 | 			pos += width
212 | 			pc = inst.Out
213 | 			goto CheckAndLoop
214 | 
215 | 		case syntax.InstRune1:
216 | 			r, width := i.Step(pos)
217 | 			if r != inst.Rune[0] {
218 | 				continue
219 | 			}
220 | 			pos += width
221 | 			pc = inst.Out
222 | 			goto CheckAndLoop
223 | 
224 | 		case syntax.InstRuneAnyNotNL:
225 | 			r, width := i.Step(pos)
226 | 			if r == '\n' || r == input.EndOfText {
227 | 				continue
228 | 			}
229 | 			pos += width
230 | 			pc = inst.Out
231 | 			goto CheckAndLoop
232 | 
233 | 		case syntax.InstRuneAny:
234 | 			r, width := i.Step(pos)
235 | 			if r == input.EndOfText {
236 | 				continue
237 | 			}
238 | 			pos += width
239 | 			pc = inst.Out
240 | 			goto CheckAndLoop
241 | 
242 | 		case syntax.InstCapture:
243 | 			switch arg {
244 | 			case 0:
245 | 				if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
246 | 					// Capture pos to register, but save old value.
247 | 					b.push(pc, b.cap[inst.Arg], 1) // come back when we're done.
248 | 					b.cap[inst.Arg] = pos
249 | 				}
250 | 				pc = inst.Out
251 | 				goto CheckAndLoop
252 | 			case 1:
253 | 				// Finished inst.Out; restore the old value.
254 | 				b.cap[inst.Arg] = pos
255 | 				continue
256 | 
257 | 			}
258 | 			panic("bad arg in InstCapture")
259 | 
260 | 		case syntax.InstEmptyWidth:
261 | 			if syntax.EmptyOp(inst.Arg)&^i.Context(pos) != 0 {
262 | 				continue
263 | 			}
264 | 			pc = inst.Out
265 | 			goto CheckAndLoop
266 | 
267 | 		case syntax.InstNop:
268 | 			pc = inst.Out
269 | 			goto CheckAndLoop
270 | 
271 | 		case syntax.InstMatch:
272 | 			// We found a match. If the caller doesn't care
273 | 			// where the match is, no point going further.
274 | 			if len(b.cap) == 0 {
275 | 				m.matched = true
276 | 				return m.matched
277 | 			}
278 | 
279 | 			// Record best match so far.
280 | 			// Only need to check end point, because this entire
281 | 			// call is only considering one start position.
282 | 			if len(b.cap) > 1 {
283 | 				b.cap[1] = pos
284 | 			}
285 | 			if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) {
286 | 				copy(m.matchcap, b.cap)
287 | 			}
288 | 			m.matched = true
289 | 
290 | 			// If going for first match, we're done.
291 | 			if !longest {
292 | 				return m.matched
293 | 			}
294 | 
295 | 			// If we used the entire text, no longer match is possible.
296 | 			if pos == b.end {
297 | 				return m.matched
298 | 			}
299 | 
300 | 			// Otherwise, continue on in hope of a longer match.
301 | 			continue
302 | 		}
303 | 	}
304 | 
305 | 	return m.matched
306 | }
307 | 
// backtrack runs a backtracking search of prog on the input starting at pos.
// end is the end position in the input and ncap is the number of capture
// registers to track; both are handed to the bitState's reset.
// It reports whether a match was found; tryBacktrack leaves the capture
// information for the match in m.matchcap.
// It panics if the input cannot check prefixes (a RuneReader input).
func (m *machine) backtrack(i input.Input, pos int, end int, ncap int) bool {
	if !i.CanCheckPrefix() {
		panic("backtrack called for a RuneReader")
	}

	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
		// Anchored match, past beginning of text.
		return false
	}

	b := m.b
	b.reset(end, ncap)

	// Start with every capture register of the result unset.
	m.matchcap = m.matchcap[:ncap]
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}

	// Anchored search must start at the beginning of the input
	if startCond&syntax.EmptyBeginText != 0 {
		if len(b.cap) > 0 {
			b.cap[0] = pos
		}
		return m.tryBacktrack(b, i, uint32(m.p.Start), pos)
	}

	// Unanchored search, starting from each possible text position.
	// Notice that we have to try the empty string at the end of
	// the text, so the loop condition is pos <= end, not pos < end.
	// This looks like it's quadratic in the size of the text,
	// but we are not clearing visited between calls to tryBacktrack,
	// so no work is duplicated and it ends up still being linear.
	// width starts at -1 so the first iteration always runs; the loop
	// stops once Step reports a width of 0.
	width := -1
	for ; pos <= end && width != 0; pos += width {
		if len(m.re.prefix) > 0 {
			// Match requires literal prefix; fast search for it.
			advance := i.Index(m.re, pos)
			if advance < 0 {
				return false
			}
			pos += advance
		}

		if len(b.cap) > 0 {
			b.cap[0] = pos
		}
		if m.tryBacktrack(b, i, uint32(m.p.Start), pos) {
			// Match must be leftmost; done.
			return true
		}
		_, width = i.Step(pos)
	}
	return false
}
367 | 


--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2013 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package regexp_test
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"matloob.io/regexp"
 10 | )
 11 | 
// Example compiles one anchored pattern and reuses it to check
// several strings with MatchString.
func Example() {
	// Compile the expression once, usually at init time.
	// Use raw strings to avoid having to quote the backslashes.
	var validID = regexp.MustCompile(`^[a-z]+\[[0-9]+\]$`)

	fmt.Println(validID.MatchString("adam[23]"))
	fmt.Println(validID.MatchString("eve[7]"))
	fmt.Println(validID.MatchString("Job[48]"))
	fmt.Println(validID.MatchString("snakey"))
	// Output:
	// true
	// true
	// false
	// false
}
 27 | 
// ExampleMatchString calls the package-level MatchString with a pattern
// that matches, one that does not, and one that fails to parse.
func ExampleMatchString() {
	matched, err := regexp.MatchString("foo.*", "seafood")
	fmt.Println(matched, err)
	matched, err = regexp.MatchString("bar.*", "seafood")
	fmt.Println(matched, err)
	// The unbalanced parenthesis makes this pattern invalid.
	matched, err = regexp.MatchString("a(b", "seafood")
	fmt.Println(matched, err)
	// Output:
	// true <nil>
	// false <nil>
	// false error parsing regexp: missing closing ): `a(b`
}
 40 | 
// ExampleRegexp_FindString prints the text of the first match,
// or the empty string when nothing matches.
func ExampleRegexp_FindString() {
	re := regexp.MustCompile("fo.?")
	fmt.Printf("%q\n", re.FindString("seafood"))
	fmt.Printf("%q\n", re.FindString("meat"))
	// Output:
	// "foo"
	// ""
}
 49 | 
// ExampleRegexp_FindStringIndex prints the start and end offsets of the
// first match, and shows that the result is nil when nothing matches.
func ExampleRegexp_FindStringIndex() {
	re := regexp.MustCompile("ab?")
	fmt.Println(re.FindStringIndex("tablett"))
	fmt.Println(re.FindStringIndex("foo") == nil)
	// Output:
	// [1 3]
	// true
}
 58 | 
// ExampleRegexp_FindStringSubmatch prints the whole match followed by
// the text captured by each parenthesized group.
func ExampleRegexp_FindStringSubmatch() {
	re := regexp.MustCompile("a(x*)b(y|z)c")
	fmt.Printf("%q\n", re.FindStringSubmatch("-axxxbyc-"))
	fmt.Printf("%q\n", re.FindStringSubmatch("-abzc-"))
	// Output:
	// ["axxxbyc" "xxx" "y"]
	// ["abzc" "" "z"]
}
 67 | 
// ExampleRegexp_FindAllString collects successive matches, first with
// no limit (-1) and then limited to two results.
func ExampleRegexp_FindAllString() {
	re := regexp.MustCompile("a.")
	fmt.Println(re.FindAllString("paranormal", -1))
	fmt.Println(re.FindAllString("paranormal", 2))
	fmt.Println(re.FindAllString("graal", -1))
	fmt.Println(re.FindAllString("none", -1))
	// Output:
	// [ar an al]
	// [ar an]
	// [aa]
	// []
}
 80 | 
// ExampleRegexp_FindAllStringSubmatch prints, for every match in the
// text, the matched text together with its captured group.
func ExampleRegexp_FindAllStringSubmatch() {
	re := regexp.MustCompile("a(x*)b")
	fmt.Printf("%q\n", re.FindAllStringSubmatch("-ab-", -1))
	fmt.Printf("%q\n", re.FindAllStringSubmatch("-axxb-", -1))
	fmt.Printf("%q\n", re.FindAllStringSubmatch("-ab-axb-", -1))
	fmt.Printf("%q\n", re.FindAllStringSubmatch("-axxb-ab-", -1))
	// Output:
	// [["ab" ""]]
	// [["axxb" "xx"]]
	// [["ab" ""] ["axb" "x"]]
	// [["axxb" "xx"] ["ab" ""]]
}
 93 | 
// ExampleRegexp_FindAllStringSubmatchIndex prints, for every match, the
// offsets of the match followed by the offsets of its captured group.
func ExampleRegexp_FindAllStringSubmatchIndex() {
	re := regexp.MustCompile("a(x*)b")
	// Indices:
	//    01234567   012345678
	//    -ab-axb-   -axxb-ab-
	fmt.Println(re.FindAllStringSubmatchIndex("-ab-", -1))
	fmt.Println(re.FindAllStringSubmatchIndex("-axxb-", -1))
	fmt.Println(re.FindAllStringSubmatchIndex("-ab-axb-", -1))
	fmt.Println(re.FindAllStringSubmatchIndex("-axxb-ab-", -1))
	fmt.Println(re.FindAllStringSubmatchIndex("-foo-", -1))
	// Output:
	// [[1 3 2 2]]
	// [[1 5 2 4]]
	// [[1 3 2 2] [4 7 5 6]]
	// [[1 5 2 4] [6 8 7 7]]
	// []
}
111 | 
// ExampleRegexp_ReplaceAllLiteralString shows that the replacement text
// is inserted verbatim: $1 and ${1} are not expanded.
func ExampleRegexp_ReplaceAllLiteralString() {
	re := regexp.MustCompile("a(x*)b")
	fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "T"))
	fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "$1"))
	fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "${1}"))
	// Output:
	// -T-T-
	// -$1-$1-
	// -${1}-${1}-
}
122 | 
// ExampleRegexp_ReplaceAllString shows submatch expansion in the
// replacement text. Note the difference between "$1W" and "${1}W"
// in the output below.
func ExampleRegexp_ReplaceAllString() {
	re := regexp.MustCompile("a(x*)b")
	fmt.Println(re.ReplaceAllString("-ab-axxb-", "T"))
	fmt.Println(re.ReplaceAllString("-ab-axxb-", "$1"))
	fmt.Println(re.ReplaceAllString("-ab-axxb-", "$1W"))
	fmt.Println(re.ReplaceAllString("-ab-axxb-", "${1}W"))
	// Output:
	// -T-T-
	// --xx-
	// ---
	// -W-xxW-
}
135 | 
// ExampleRegexp_SubexpNames lists the names of the capture groups and
// uses them to build a replacement template that swaps the two words.
func ExampleRegexp_SubexpNames() {
	re := regexp.MustCompile("(?P<first>[a-zA-Z]+) (?P<last>[a-zA-Z]+)")
	fmt.Println(re.MatchString("Alan Turing"))
	fmt.Printf("%q\n", re.SubexpNames())
	// Build "${last} ${first}" from the group names.
	reversed := fmt.Sprintf("${%s} ${%s}", re.SubexpNames()[2], re.SubexpNames()[1])
	fmt.Println(reversed)
	fmt.Println(re.ReplaceAllString("Alan Turing", reversed))
	// Output:
	// true
	// ["" "first" "last"]
	// ${last} ${first}
	// Turing Alan
}
149 | 
// ExampleRegexp_Split splits two strings around the matches of a
// pattern, using the counts -1, 0, 1 and 2 for the second argument.
func ExampleRegexp_Split() {
	a := regexp.MustCompile("a")
	fmt.Println(a.Split("banana", -1))
	fmt.Println(a.Split("banana", 0))
	fmt.Println(a.Split("banana", 1))
	fmt.Println(a.Split("banana", 2))
	zp := regexp.MustCompile("z+")
	fmt.Println(zp.Split("pizza", -1))
	fmt.Println(zp.Split("pizza", 0))
	fmt.Println(zp.Split("pizza", 1))
	fmt.Println(zp.Split("pizza", 2))
	// Output:
	// [b n n ]
	// []
	// [banana]
	// [b nana]
	// [pi a]
	// []
	// [pizza]
	// [pi a]
}
171 | 


--------------------------------------------------------------------------------
/exec.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package regexp
  6 | 
  7 | import (
  8 | 	"io"
  9 | 	"matloob.io/regexp/internal/input"
 10 | 	"matloob.io/regexp/syntax"
 11 | )
 12 | 
// A queue is a 'sparse array' holding pending threads of execution.
// See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
type queue struct {
	// sparse is indexed by pc and holds the index in dense of the
	// entry for that pc. A slot is meaningful only if the dense entry
	// it points at records the same pc.
	sparse []uint32
	// dense holds the entries in the order they were added.
	dense  []entry
}
 19 | 
// An entry is an entry on a queue.
// It holds both the instruction pc and the actual thread.
// Some queue entries are just place holders so that the machine
// knows it has considered that pc. Such entries have t == nil.
type entry struct {
	pc uint32
	t  *thread
}
 28 | 
// A thread is the state of a single path through the machine:
// an instruction and a corresponding capture array.
// See http://swtch.com/~rsc/regexp/regexp2.html
type thread struct {
	// inst is the instruction the thread is waiting to execute.
	inst *syntax.Inst
	// cap holds the capture positions recorded along this path.
	cap  []int
}
 36 | 
// A machine holds all the state during an NFA simulation for p.
// The same machine also carries the state used by the one-pass and
// backtracking matchers. doExecute obtains a machine with re.get and
// hands it back with re.put once the search is finished.
type machine struct {
	re             *Regexp      // corresponding Regexp
	p              *syntax.Prog // compiled program
	op             *onePassProg // compiled onepass program, or notOnePass
	maxBitStateLen int          // max length of string to search with bitstate
	b              *bitState    // state for backtracker, allocated lazily
	q0, q1         queue        // two queues for runq, nextq
	pool           []*thread    // pool of available threads
	matched        bool         // whether a match was found
	matchcap       []int        // capture information for the match

	// cached inputs, to avoid allocation
	inputBytes  input.InputBytes
	inputString input.InputString
	inputReader input.InputReader
}
 54 | 
 55 | func (m *machine) newInputBytes(b []byte) input.Input {
 56 | 	m.inputBytes.Reset(b)
 57 | 	return &m.inputBytes
 58 | }
 59 | 
 60 | func (m *machine) newInputString(s string) input.Input {
 61 | 	m.inputString.Reset(s)
 62 | 	return &m.inputString
 63 | }
 64 | 
 65 | func (m *machine) newInputReader(r io.RuneReader) input.Input {
 66 | 	m.inputReader.Reset(r)
 67 | 	return &m.inputReader
 68 | }
 69 | 
 70 | // progMachine returns a new machine running the prog p.
 71 | func progMachine(p *syntax.Prog, op *onePassProg) *machine {
 72 | 	m := &machine{p: p, op: op}
 73 | 	n := len(m.p.Inst)
 74 | 	m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
 75 | 	m.q1 = queue{make([]uint32, n), make([]entry, 0, n)}
 76 | 	ncap := p.NumCap
 77 | 	if ncap < 2 {
 78 | 		ncap = 2
 79 | 	}
 80 | 	if op == notOnePass {
 81 | 		m.maxBitStateLen = maxBitStateLen(p)
 82 | 	}
 83 | 	m.matchcap = make([]int, ncap)
 84 | 	return m
 85 | }
 86 | 
 87 | func (m *machine) init(ncap int) {
 88 | 	for _, t := range m.pool {
 89 | 		t.cap = t.cap[:ncap]
 90 | 	}
 91 | 	m.matchcap = m.matchcap[:ncap]
 92 | }
 93 | 
 94 | // alloc allocates a new thread with the given instruction.
 95 | // It uses the free pool if possible.
 96 | func (m *machine) alloc(i *syntax.Inst) *thread {
 97 | 	var t *thread
 98 | 	if n := len(m.pool); n > 0 {
 99 | 		t = m.pool[n-1]
100 | 		m.pool = m.pool[:n-1]
101 | 	} else {
102 | 		t = new(thread)
103 | 		t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
104 | 	}
105 | 	t.inst = i
106 | 	return t
107 | }
108 | 
// match runs the machine over the input starting at pos.
// It reports whether a match was found.
// If so, m.matchcap holds the submatch information.
func (m *machine) match(i input.Input, pos int) bool {
	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	m.matched = false
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}
	runq, nextq := &m.q0, &m.q1
	// r is the rune at pos and r1 the rune after it;
	// width and width1 are their widths.
	r, r1 := input.EndOfText, input.EndOfText
	width, width1 := 0, 0
	r, width = i.Step(pos)
	if r != input.EndOfText {
		r1, width1 = i.Step(pos + width)
	}
	// flag holds the empty-width conditions used when adding threads at pos.
	var flag syntax.EmptyOp
	if pos == 0 {
		flag = syntax.EmptyOpContext(-1, r)
	} else {
		flag = i.Context(pos)
	}
	for {
		if len(runq.dense) == 0 {
			// No threads are running: decide whether to stop or skip ahead.
			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
				// Anchored match, past beginning of text.
				break
			}
			if m.matched {
				// Have match; finished exploring alternatives.
				break
			}
			if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.CanCheckPrefix() {
				// Match requires literal prefix; fast search for it.
				advance := i.Index(m.re, pos)
				if advance < 0 {
					break
				}
				pos += advance
				r, width = i.Step(pos)
				r1, width1 = i.Step(pos + width)
			}
		}
		if !m.matched {
			// Until a match is found, start a new thread at every position.
			if len(m.matchcap) > 0 {
				m.matchcap[0] = pos
			}
			m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil)
		}
		flag = syntax.EmptyOpContext(r, r1)
		m.step(runq, nextq, pos, pos+width, r, flag)
		if width == 0 {
			// Step returned a zero width: nothing more to consume.
			break
		}
		if len(m.matchcap) == 0 && m.matched {
			// Found a match and not paying attention
			// to where it is, so any match will do.
			break
		}
		// Advance one rune and swap the roles of the two queues.
		pos += width
		r, width = r1, width1
		if r != input.EndOfText {
			r1, width1 = i.Step(pos + width)
		}
		runq, nextq = nextq, runq
	}
	m.clear(nextq)
	return m.matched
}
181 | 
182 | // clear frees all threads on the thread queue.
183 | func (m *machine) clear(q *queue) {
184 | 	for _, d := range q.dense {
185 | 		if d.t != nil {
186 | 			m.pool = append(m.pool, d.t)
187 | 		}
188 | 	}
189 | 	q.dense = q.dense[:0]
190 | }
191 | 
// step executes one step of the machine, running each of the threads
// on runq and appending new threads to nextq.
// The step processes the rune c (which may be input.EndOfText),
// which starts at position pos and ends at nextPos.
// nextCond gives the setting for the empty-width flags after c.
// On return runq is empty; threads that were not moved to nextq have
// been returned to the pool.
func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) {
	longest := m.re.longest
	for j := 0; j < len(runq.dense); j++ {
		d := &runq.dense[j]
		t := d.t
		if t == nil {
			// Placeholder entry; no thread to run.
			continue
		}
		if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] {
			// In longest-match mode, a thread that starts after the
			// recorded match's start is discarded.
			m.pool = append(m.pool, t)
			continue
		}
		i := t.inst
		add := false
		switch i.Op {
		default:
			panic("bad inst")

		case syntax.InstMatch:
			// Record this match unless, in longest-match mode, the one
			// already recorded ends at or after pos.
			if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
				t.cap[1] = pos
				copy(m.matchcap, t.cap)
			}
			if !longest {
				// First-match mode: cut off all lower-priority threads.
				for _, d := range runq.dense[j+1:] {
					if d.t != nil {
						m.pool = append(m.pool, d.t)
					}
				}
				runq.dense = runq.dense[:0]
			}
			m.matched = true

		case syntax.InstRune:
			add = i.MatchRune(c)
		case syntax.InstRune1:
			add = c == i.Rune[0]
		case syntax.InstRuneAny:
			add = true
		case syntax.InstRuneAnyNotNL:
			add = c != '\n'
		}
		if add {
			// The thread consumed c; add hands back t only if it was not reused.
			t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t)
		}
		if t != nil {
			m.pool = append(m.pool, t)
		}
	}
	runq.dense = runq.dense[:0]
}
249 | 
// add adds an entry to q for pc, unless the q already has such an entry.
// It also recursively adds an entry for all instructions reachable from pc by following
// empty-width conditions satisfied by cond.  pos gives the current position
// in the input.
// cap holds the capture values to record in any thread that is queued.
// t is a spare thread that may be reused for a queued entry, or nil.
// add returns t if it is still unused, and nil once it has been placed
// on the queue.
func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread {
	if pc == 0 {
		// pc 0 is never queued.
		return t
	}
	if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
		// pc is already on the queue.
		return t
	}

	// Append a placeholder entry for pc (t == nil) so the recursive
	// calls below see pc as already present. It gets a thread only if
	// pc is a match or rune instruction.
	j := len(q.dense)
	q.dense = q.dense[:j+1]
	d := &q.dense[j]
	d.t = nil
	d.pc = pc
	q.sparse[pc] = uint32(j)

	i := &m.p.Inst[pc]
	switch i.Op {
	default:
		panic("unhandled")
	case syntax.InstFail:
		// nothing
	case syntax.InstAlt, syntax.InstAltMatch:
		t = m.add(q, i.Out, pos, cap, cond, t)
		t = m.add(q, i.Arg, pos, cap, cond, t)
	case syntax.InstEmptyWidth:
		if syntax.EmptyOp(i.Arg)&^cond == 0 {
			t = m.add(q, i.Out, pos, cap, cond, t)
		}
	case syntax.InstNop:
		t = m.add(q, i.Out, pos, cap, cond, t)
	case syntax.InstCapture:
		if int(i.Arg) < len(cap) {
			// Record pos in the capture register for the duration of
			// the recursive call, then restore the previous value.
			opos := cap[i.Arg]
			cap[i.Arg] = pos
			m.add(q, i.Out, pos, cap, cond, nil)
			cap[i.Arg] = opos
		} else {
			t = m.add(q, i.Out, pos, cap, cond, t)
		}
	case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		if t == nil {
			t = m.alloc(i)
		} else {
			t.inst = i
		}
		// Skip the copy when cap is the thread's own capture array.
		if len(cap) > 0 && &t.cap[0] != &cap[0] {
			copy(t.cap, cap)
		}
		d.t = t
		t = nil
	}
	return t
}
307 | 
// onepass runs the machine over the input starting at pos.
// It reports whether a match was found.
// If so, m.matchcap holds the submatch information.
// It executes the one-pass program m.op, following a single path
// through it and returning as soon as an instruction fails to match.
func (m *machine) onepass(i input.Input, pos int) bool {
	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	m.matched = false
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}
	// r is the rune at pos and r1 the rune after it;
	// width and width1 are their widths.
	r, r1 := input.EndOfText, input.EndOfText
	width, width1 := 0, 0
	r, width = i.Step(pos)
	if r != input.EndOfText {
		r1, width1 = i.Step(pos + width)
	}
	// flag holds the empty-width conditions at pos.
	var flag syntax.EmptyOp
	if pos == 0 {
		flag = syntax.EmptyOpContext(-1, r)
	} else {
		flag = i.Context(pos)
	}
	pc := m.op.Start
	inst := m.op.Inst[pc]
	// If there is a simple literal prefix, skip over it.
	if pos == 0 && syntax.EmptyOp(inst.Arg)&^flag == 0 &&
		len(m.re.prefix) > 0 && i.CanCheckPrefix() {
		// Match requires literal prefix; fast search for it.
		if i.HasPrefix(m.re) {
			pos += len(m.re.prefix)
			r, width = i.Step(pos)
			r1, width1 = i.Step(pos + width)
			flag = i.Context(pos)
			pc = int(m.re.prefixEnd)
		} else {
			return m.matched
		}
	}
	for {
		inst = m.op.Inst[pc]
		// Default successor; the Alt cases below override it.
		pc = int(inst.Out)
		switch inst.Op {
		default:
			panic("bad inst")
		case syntax.InstMatch:
			m.matched = true
			if len(m.matchcap) > 0 {
				m.matchcap[0] = 0
				m.matchcap[1] = pos
			}
			return m.matched
		case syntax.InstRune:
			if !inst.MatchRune(r) {
				return m.matched
			}
		case syntax.InstRune1:
			if r != inst.Rune[0] {
				return m.matched
			}
		case syntax.InstRuneAny:
			// Nothing
		case syntax.InstRuneAnyNotNL:
			if r == '\n' {
				return m.matched
			}
		// peek at the input rune to see which branch of the Alt to take
		case syntax.InstAlt, syntax.InstAltMatch:
			pc = int(onePassNext(&inst, r))
			continue
		case syntax.InstFail:
			return m.matched
		case syntax.InstNop:
			continue
		case syntax.InstEmptyWidth:
			if syntax.EmptyOp(inst.Arg)&^flag != 0 {
				return m.matched
			}
			continue
		case syntax.InstCapture:
			if int(inst.Arg) < len(m.matchcap) {
				m.matchcap[inst.Arg] = pos
			}
			continue
		}
		// Only the rune instructions fall through to here (the other
		// cases return or continue): advance the input by one rune.
		if width == 0 {
			break
		}
		flag = syntax.EmptyOpContext(r, r1)
		pos += width
		r, width = r1, width1
		if r != input.EndOfText {
			r1, width1 = i.Step(pos + width)
		}
	}
	return m.matched
}
406 | 
// empty is a zero-length slice that is not nil.
// doExecute returns it for a successful match with 0 requested
// captures, which avoids allocating a result for that case.
var empty = []int{}
411 | 
412 | // doExecute finds the leftmost match in the input and returns
413 | // the position of its subexpressions.
414 | func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int) []int {
415 | 	m := re.get()
416 | 	var i input.Input
417 | 	var size int
418 | 	if r != nil {
419 | 		i = m.newInputReader(r)
420 | 	} else if b != nil {
421 | 		i = m.newInputBytes(b)
422 | 		size = len(b)
423 | 	} else {
424 | 		i = m.newInputString(s)
425 | 		size = len(s)
426 | 	}
427 | 	if m.op != notOnePass {
428 | 		if !m.onepass(i, pos) {
429 | 			re.put(m)
430 | 			return nil
431 | 		}
432 | 	} else if size < m.maxBitStateLen && r == nil {
433 | 		if m.b == nil {
434 | 			m.b = newBitState(m.p)
435 | 		}
436 | 		if !m.backtrack(i, pos, size, ncap) {
437 | 			re.put(m)
438 | 			return nil
439 | 		}
440 | 	} else {
441 | 		if ncap <= 2 {
442 | 			matched, err := re.searcher.Search(i, pos, m.re.longest, &m.matchcap, ncap)
443 | 			if err != nil {
444 | 				goto nfa
445 | 			}
446 | 			if !matched {
447 | 				re.put(m)
448 | 				return nil
449 | 			}
450 | 			goto e
451 | 		}
452 | 		nfa:
453 | 		m.init(ncap)
454 | 		if !m.match(i, pos) {
455 | 			re.put(m)
456 | 			return nil
457 | 		}
458 | 		e:
459 | 	}
460 | 	if ncap == 0 {
461 | 		re.put(m)
462 | 		return empty // empty but not nil
463 | 	}
464 | 	cap := make([]int, len(m.matchcap))
465 | 	copy(cap, m.matchcap)
466 | 	re.put(m)
467 | 	return cap
468 | }
469 | 


--------------------------------------------------------------------------------
/exec2_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2013 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // +build !race
 6 | 
 7 | package regexp
 8 | 
 9 | import (
10 | 	"testing"
11 | )
12 | 
// TestRE2Exhaustive runs testRE2 on testdata/re2-exhaustive.txt.bz2.
// This test is excluded when running under the race detector because
// it is a very expensive test and takes too long.
// It is also skipped in short mode.
func TestRE2Exhaustive(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping TestRE2Exhaustive during short test")
	}
	testRE2(t, "testdata/re2-exhaustive.txt.bz2")
}
21 | 


--------------------------------------------------------------------------------
/export.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | FROM="$HOME/src/matloob.io/regexp"
 4 | TO="$HOME/go/src/regexp"
 5 | 
 6 | cp $FROM/*.go $TO/
 7 | cp $FROM/syntax/*.go $TO/syntax/
 8 | cp $FROM/internal/dfa/*.go $TO/internal/dfa
 9 | cp $FROM/internal/input/*.go $TO/internal/input/
10 | 
11 | sed -i .bak -e "s/matloob.io\///g" $TO/*.go $TO/internal/dfa/*.go $TO/internal/input/*.go
12 | rm $TO/*.go.bak $TO/internal/dfa/*.go.bak $TO/internal/input/*.go.bak
13 | 


--------------------------------------------------------------------------------
/find_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2010 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package regexp
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"strings"
 10 | 	"testing"
 11 | )
 12 | 
// For each pattern/text pair, what is the expected output of each function?
// We can derive the textual results from the indexed results, the non-submatch
// results from the submatched results, the single results from the 'all' results,
// and the byte results from the string results. Therefore the table includes
// only the FindAllStringSubmatchIndex result.
type FindTest struct {
	// pat is the regular expression under test.
	pat     string
	// text is the input the pattern is applied to.
	text    string
	// matches is the expected FindAllStringSubmatchIndex result;
	// nil means no match is expected.
	matches [][]int
}
 23 | 
 24 | func (t FindTest) String() string {
 25 | 	return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text)
 26 | }
 27 | 
// findTests is the table shared by the Find* tests below. Each entry gives a
// pattern, the text to search, and the expected FindAllStringSubmatchIndex
// result constructed with build; a nil result means no match is expected.
var findTests = []FindTest{
	{``, ``, build(1, 0, 0)},
	{`^abcdefg`, "abcdefg", build(1, 0, 7)},
	{`a+`, "baaab", build(1, 1, 4)},
	{"abcd..", "abcdef", build(1, 0, 6)},
	{`a`, "a", build(1, 0, 1)},
	{`x`, "y", nil},
	{`b`, "abc", build(1, 1, 2)},
	{`.`, "a", build(1, 0, 1)},
	{`.*`, "abcdef", build(1, 0, 6)},
	{`^`, "abcde", build(1, 0, 0)},
	{`$`, "abcde", build(1, 5, 5)},
	{`^abcd$`, "abcd", build(1, 0, 4)},
	{`^bcd'`, "abcdef", nil},
	{`^abcd$`, "abcde", nil},
	{`a+`, "baaab", build(1, 1, 4)},
	{`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)},
	{`[a-z]+`, "abcd", build(1, 0, 4)},
	{`[^a-z]+`, "ab1234cd", build(1, 2, 6)},
	{`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)},
	{`[^\n]+`, "abcd\n", build(1, 0, 4)},
	{`[日本語]+`, "日本語日本語", build(1, 0, 18)},
	{`日本語+`, "日本語", build(1, 0, 9)},
	{`日本語+`, "日本語語語語", build(1, 0, 18)},
	{`()`, "", build(1, 0, 0, 0, 0)},
	{`(a)`, "a", build(1, 0, 1, 0, 1)},
	{`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)},
	{`(.*)`, "", build(1, 0, 0, 0, 0)},
	{`(.*)`, "abcd", build(1, 0, 4, 0, 4)},
	{`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)},
	{`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)},
	{`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)},
	{`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)},
	{`\a\f\n\r\t\v`, "\a\f\n\r\t\v", build(1, 0, 6)},
	{`[\a\f\n\r\t\v]+`, "\a\f\n\r\t\v", build(1, 0, 6)},

	{`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)},
	{`(.*).*`, "ab", build(1, 0, 2, 0, 2)},
	{`[.]`, ".", build(1, 0, 1)},
	{`/$`, "/abc/", build(1, 4, 5)},
	{`/$`, "/abc", nil},

	// multiple matches
	{`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)},
	{`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)},
	{`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)},
	{`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)},
	{`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)},

	// fixed bugs
	{`ab$`, "cab", build(1, 1, 3)},
	{`axxb$`, "axxcb", nil},
	{`data`, "daXY data", build(1, 5, 9)},
	{`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)},
	{`zx+`, "zzx", build(1, 1, 3)},
	{`ab$`, "abcab", build(1, 3, 5)},
	{`(aa)*$`, "a", build(1, 1, 1, -1, -1)},
	{`(?:.|(?:.a))`, "", nil},
	{`(?:A(?:A|a))`, "Aa", build(1, 0, 2)},
	{`(?:A|(?:A|a))`, "a", build(1, 0, 1)},
	{`(a){0}`, "", build(1, 0, 0, -1, -1)},
	{`(?-s)(?:(?:^).)`, "\n", nil},
	{`(?s)(?:(?:^).)`, "\n", build(1, 0, 1)},
	{`(?:(?:^).)`, "\n", nil},
	{`\b`, "x", build(2, 0, 0, 1, 1)},
	{`\b`, "xx", build(2, 0, 0, 2, 2)},
	{`\b`, "x y", build(4, 0, 0, 1, 1, 2, 2, 3, 3)},
	{`\b`, "xx yy", build(4, 0, 0, 2, 2, 3, 3, 5, 5)},
	{`\B`, "x", nil},
	{`\B`, "xx", build(1, 1, 1)},
	{`\B`, "x y", nil},
	{`\B`, "xx yy", build(2, 1, 1, 4, 4)},

	// RE2 tests
	{`[^\S\s]`, "abcd", nil},
	{`[^\S[:space:]]`, "abcd", nil},
	{`[^\D\d]`, "abcd", nil},
	{`[^\D[:digit:]]`, "abcd", nil},
	{`(?i)\W`, "x", nil},
	{`(?i)\W`, "k", nil},
	{`(?i)\W`, "s", nil},

	// can backslash-escape any punctuation
	{`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`,
		`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
	{`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`,
		`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
	{"\\`", "`", build(1, 0, 1)},
	{"[\\`]+", "`", build(1, 0, 1)},

	// long set of matches (longer than startSize)
	{
		".",
		"qwertyuiopasdfghjklzxcvbnm1234567890",
		build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
			10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20,
			20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30,
			30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36),
	},
}
128 | 
// build is a helper to construct a [][]int by extracting n sequences from x.
// This represents n matches with len(x)/n submatch indices each.
//
// It panics with "invalid build entry" when n is not positive or when x does
// not divide evenly into n runs: either one means the table entry was typed
// incorrectly, and silently truncating it would hide the mistake.
func build(n int, x ...int) [][]int {
	if n <= 0 || len(x)%n != 0 {
		panic("invalid build entry")
	}
	runLength := len(x) / n
	ret := make([][]int, n)
	for i := range ret {
		ret[i] = make([]int, runLength)
		// copy stops after runLength elements because that is len(ret[i]).
		copy(ret[i], x[i*runLength:])
	}
	return ret
}
145 | 
146 | // First the simple cases.
147 | 
148 | func TestFind(t *testing.T) {
149 | 	for _, test := range findTests {
150 | 		re := MustCompile(test.pat)
151 | 		if re.String() != test.pat {
152 | 			t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat)
153 | 		}
154 | 		result := re.Find([]byte(test.text))
155 | 		switch {
156 | 		case len(test.matches) == 0 && len(result) == 0:
157 | 			// ok
158 | 		case test.matches == nil && result != nil:
159 | 			t.Errorf("expected no match; got one: %s", test)
160 | 		case test.matches != nil && result == nil:
161 | 			t.Errorf("expected match; got none: %s", test)
162 | 		case test.matches != nil && result != nil:
163 | 			expect := test.text[test.matches[0][0]:test.matches[0][1]]
164 | 			if expect != string(result) {
165 | 				t.Errorf("expected %q got %q: %s", expect, result, test)
166 | 			}
167 | 		}
168 | 	}
169 | }
170 | 
171 | func TestFindString(t *testing.T) {
172 | 	for _, test := range findTests {
173 | 		result := MustCompile(test.pat).FindString(test.text)
174 | 		switch {
175 | 		case len(test.matches) == 0 && len(result) == 0:
176 | 			// ok
177 | 		case test.matches == nil && result != "":
178 | 			t.Errorf("expected no match; got one: %s", test)
179 | 		case test.matches != nil && result == "":
180 | 			// Tricky because an empty result has two meanings: no match or empty match.
181 | 			if test.matches[0][0] != test.matches[0][1] {
182 | 				t.Errorf("expected match; got none: %s", test)
183 | 			}
184 | 		case test.matches != nil && result != "":
185 | 			expect := test.text[test.matches[0][0]:test.matches[0][1]]
186 | 			if expect != result {
187 | 				t.Errorf("expected %q got %q: %s", expect, result, test)
188 | 			}
189 | 		}
190 | 	}
191 | }
192 | 
193 | func testFindIndex(test *FindTest, result []int, t *testing.T) {
194 | 	switch {
195 | 	case len(test.matches) == 0 && len(result) == 0:
196 | 		// ok
197 | 	case test.matches == nil && result != nil:
198 | 		t.Errorf("expected no match; got one: %s", test)
199 | 	case test.matches != nil && result == nil:
200 | 		t.Errorf("expected match; got none: %s", test)
201 | 	case test.matches != nil && result != nil:
202 | 		expect := test.matches[0]
203 | 		if expect[0] != result[0] || expect[1] != result[1] {
204 | 			t.Errorf("expected %v got %v: %s", expect, result, test)
205 | 		}
206 | 	}
207 | }
208 | 
209 | func TestFindIndex(t *testing.T) {
210 | 	for _, test := range findTests {
211 | 		testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t)
212 | 	}
213 | }
214 | 
215 | func TestFindStringIndex(t *testing.T) {
216 | 	for _, test := range findTests {
217 | 		testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t)
218 | 	}
219 | }
220 | 
221 | func TestFindReaderIndex(t *testing.T) {
222 | 	for _, test := range findTests {
223 | 		testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t)
224 | 	}
225 | }
226 | 
227 | // Now come the simple All cases.
228 | 
229 | func TestFindAll(t *testing.T) {
230 | 	for _, test := range findTests {
231 | 		result := MustCompile(test.pat).FindAll([]byte(test.text), -1)
232 | 		switch {
233 | 		case test.matches == nil && result == nil:
234 | 			// ok
235 | 		case test.matches == nil && result != nil:
236 | 			t.Errorf("expected no match; got one: %s", test)
237 | 		case test.matches != nil && result == nil:
238 | 			t.Fatalf("expected match; got none: %s", test)
239 | 		case test.matches != nil && result != nil:
240 | 			if len(test.matches) != len(result) {
241 | 				t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
242 | 				continue
243 | 			}
244 | 			for k, e := range test.matches {
245 | 				expect := test.text[e[0]:e[1]]
246 | 				if expect != string(result[k]) {
247 | 					t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test)
248 | 				}
249 | 			}
250 | 		}
251 | 	}
252 | }
253 | 
254 | func TestFindAllString(t *testing.T) {
255 | 	for _, test := range findTests {
256 | 		result := MustCompile(test.pat).FindAllString(test.text, -1)
257 | 		switch {
258 | 		case test.matches == nil && result == nil:
259 | 			// ok
260 | 		case test.matches == nil && result != nil:
261 | 			t.Errorf("expected no match; got one: %s", test)
262 | 		case test.matches != nil && result == nil:
263 | 			t.Errorf("expected match; got none: %s", test)
264 | 		case test.matches != nil && result != nil:
265 | 			if len(test.matches) != len(result) {
266 | 				t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
267 | 				continue
268 | 			}
269 | 			for k, e := range test.matches {
270 | 				expect := test.text[e[0]:e[1]]
271 | 				if expect != result[k] {
272 | 					t.Errorf("expected %q got %q: %s", expect, result, test)
273 | 				}
274 | 			}
275 | 		}
276 | 	}
277 | }
278 | 
279 | func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) {
280 | 	switch {
281 | 	case test.matches == nil && result == nil:
282 | 		// ok
283 | 	case test.matches == nil && result != nil:
284 | 		t.Errorf("expected no match; got one: %s", test)
285 | 	case test.matches != nil && result == nil:
286 | 		t.Errorf("expected match; got none: %s", test)
287 | 	case test.matches != nil && result != nil:
288 | 		if len(test.matches) != len(result) {
289 | 			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
290 | 			return
291 | 		}
292 | 		for k, e := range test.matches {
293 | 			if e[0] != result[k][0] || e[1] != result[k][1] {
294 | 				t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test)
295 | 			}
296 | 		}
297 | 	}
298 | }
299 | 
300 | func TestFindAllIndex(t *testing.T) {
301 | 	for _, test := range findTests {
302 | 		testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t)
303 | 	}
304 | }
305 | 
306 | func TestFindAllStringIndex(t *testing.T) {
307 | 	for _, test := range findTests {
308 | 		testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t)
309 | 	}
310 | }
311 | 
312 | // Now come the Submatch cases.
313 | 
314 | func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) {
315 | 	if len(submatches) != len(result)*2 {
316 | 		t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
317 | 		return
318 | 	}
319 | 	for k := 0; k < len(submatches); k += 2 {
320 | 		if submatches[k] == -1 {
321 | 			if result[k/2] != nil {
322 | 				t.Errorf("match %d: expected nil got %q: %s", n, result, test)
323 | 			}
324 | 			continue
325 | 		}
326 | 		expect := test.text[submatches[k]:submatches[k+1]]
327 | 		if expect != string(result[k/2]) {
328 | 			t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
329 | 			return
330 | 		}
331 | 	}
332 | }
333 | 
334 | func TestFindSubmatch(t *testing.T) {
335 | 	for _, test := range findTests {
336 | 		result := MustCompile(test.pat).FindSubmatch([]byte(test.text))
337 | 		switch {
338 | 		case test.matches == nil && result == nil:
339 | 			// ok
340 | 		case test.matches == nil && result != nil:
341 | 			t.Errorf("expected no match; got one: %s", test)
342 | 		case test.matches != nil && result == nil:
343 | 			t.Errorf("expected match; got none: %s", test)
344 | 		case test.matches != nil && result != nil:
345 | 			testSubmatchBytes(&test, 0, test.matches[0], result, t)
346 | 		}
347 | 	}
348 | }
349 | 
350 | func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) {
351 | 	if len(submatches) != len(result)*2 {
352 | 		t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
353 | 		return
354 | 	}
355 | 	for k := 0; k < len(submatches); k += 2 {
356 | 		if submatches[k] == -1 {
357 | 			if result[k/2] != "" {
358 | 				t.Errorf("match %d: expected nil got %q: %s", n, result, test)
359 | 			}
360 | 			continue
361 | 		}
362 | 		expect := test.text[submatches[k]:submatches[k+1]]
363 | 		if expect != result[k/2] {
364 | 			t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
365 | 			return
366 | 		}
367 | 	}
368 | }
369 | 
370 | func TestFindStringSubmatch(t *testing.T) {
371 | 	for _, test := range findTests {
372 | 		result := MustCompile(test.pat).FindStringSubmatch(test.text)
373 | 		switch {
374 | 		case test.matches == nil && result == nil:
375 | 			// ok
376 | 		case test.matches == nil && result != nil:
377 | 			t.Errorf("expected no match; got one: %s", test)
378 | 		case test.matches != nil && result == nil:
379 | 			t.Errorf("expected match; got none: %s", test)
380 | 		case test.matches != nil && result != nil:
381 | 			testSubmatchString(&test, 0, test.matches[0], result, t)
382 | 		}
383 | 	}
384 | }
385 | 
386 | func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) {
387 | 	if len(expect) != len(result) {
388 | 		t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test)
389 | 		return
390 | 	}
391 | 	for k, e := range expect {
392 | 		if e != result[k] {
393 | 			t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test)
394 | 		}
395 | 	}
396 | }
397 | 
398 | func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) {
399 | 	switch {
400 | 	case test.matches == nil && result == nil:
401 | 		// ok
402 | 	case test.matches == nil && result != nil:
403 | 		t.Errorf("expected no match; got one: %s", test)
404 | 	case test.matches != nil && result == nil:
405 | 		t.Errorf("expected match; got none: %s", test)
406 | 	case test.matches != nil && result != nil:
407 | 		testSubmatchIndices(test, 0, test.matches[0], result, t)
408 | 	}
409 | }
410 | 
411 | func TestFindSubmatchIndex(t *testing.T) {
412 | 	for _, test := range findTests {
413 | 		testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t)
414 | 	}
415 | }
416 | 
417 | func TestFindStringSubmatchIndex(t *testing.T) {
418 | 	for _, test := range findTests {
419 | 		testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t)
420 | 	}
421 | }
422 | 
423 | func TestFindReaderSubmatchIndex(t *testing.T) {
424 | 	for _, test := range findTests {
425 | 		testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t)
426 | 	}
427 | }
428 | 
429 | // Now come the monster AllSubmatch cases.
430 | 
431 | func TestFindAllSubmatch(t *testing.T) {
432 | 	for _, test := range findTests {
433 | 		result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1)
434 | 		switch {
435 | 		case test.matches == nil && result == nil:
436 | 			// ok
437 | 		case test.matches == nil && result != nil:
438 | 			t.Errorf("expected no match; got one: %s", test)
439 | 		case test.matches != nil && result == nil:
440 | 			t.Errorf("expected match; got none: %s", test)
441 | 		case len(test.matches) != len(result):
442 | 			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
443 | 		case test.matches != nil && result != nil:
444 | 			for k, match := range test.matches {
445 | 				testSubmatchBytes(&test, k, match, result[k], t)
446 | 			}
447 | 		}
448 | 	}
449 | }
450 | 
451 | func TestFindAllStringSubmatch(t *testing.T) {
452 | 	for _, test := range findTests {
453 | 		result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1)
454 | 		switch {
455 | 		case test.matches == nil && result == nil:
456 | 			// ok
457 | 		case test.matches == nil && result != nil:
458 | 			t.Errorf("expected no match; got one: %s", test)
459 | 		case test.matches != nil && result == nil:
460 | 			t.Errorf("expected match; got none: %s", test)
461 | 		case len(test.matches) != len(result):
462 | 			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
463 | 		case test.matches != nil && result != nil:
464 | 			for k, match := range test.matches {
465 | 				testSubmatchString(&test, k, match, result[k], t)
466 | 			}
467 | 		}
468 | 	}
469 | }
470 | 
471 | func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) {
472 | 	switch {
473 | 	case test.matches == nil && result == nil:
474 | 		// ok
475 | 	case test.matches == nil && result != nil:
476 | 		t.Errorf("expected no match; got one: %s", test)
477 | 	case test.matches != nil && result == nil:
478 | 		t.Errorf("expected match; got none: %s", test)
479 | 	case len(test.matches) != len(result):
480 | 		t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
481 | 	case test.matches != nil && result != nil:
482 | 		for k, match := range test.matches {
483 | 			testSubmatchIndices(test, k, match, result[k], t)
484 | 		}
485 | 	}
486 | }
487 | 
488 | func TestFindAllSubmatchIndex(t *testing.T) {
489 | 	for _, test := range findTests {
490 | 		testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t)
491 | 	}
492 | }
493 | 
494 | func TestFindAllStringSubmatchIndex(t *testing.T) {
495 | 	for _, test := range findTests {
496 | 		testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t)
497 | 	}
498 | }
499 | 


--------------------------------------------------------------------------------
/internal/dfa/dfa_exhaustive_test.go:
--------------------------------------------------------------------------------
  1 | package dfa
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"compress/bzip2"
  6 | 	"io"
  7 | 	"os"
  8 | 	"strconv"
  9 | 	"strings"
 10 | 	"testing"
 11 | )
 12 | 
 13 | func TestDFAZVV(t *testing.T) {
 14 | 	testDFA(t, "../../testdata/re2-search.txt")
 15 | }
 16 | 
 17 | 
 18 | // THIS IS REALLY SLOW
 19 | func xTestDFAExhaustive(t *testing.T) {
 20 | 	testDFA(t, "../../testdata/re2-exhaustive.txt.bz2")
 21 | }
 22 | 
 23 | func testDFA(t *testing.T, file string) {
 24 | 	f, err := os.Open(file)
 25 | 	if err != nil {
 26 | 		t.Fatal(err)
 27 | 	}
 28 | 	defer f.Close()
 29 | 	var txt io.Reader
 30 | 	if strings.HasSuffix(file, ".bz2") {
 31 | 		z := bzip2.NewReader(f)
 32 | 		txt = z
 33 | 		file = file[:len(file)-len(".bz2")] // for error messages
 34 | 	} else {
 35 | 		txt = f
 36 | 	}
 37 | 	lineno := 0
 38 | 	scanner := bufio.NewScanner(txt)
 39 | 	var (
 40 | 		str       []string
 41 | 		input     []string
 42 | 		inStrings bool
 43 | 		q, full   string
 44 | 		nfail     int
 45 | 		ncase     int
 46 | 	)
 47 | 	for lineno := 1; scanner.Scan(); lineno++ {
 48 | 		line := scanner.Text()
 49 | 		switch {
 50 | 		case line == "":
 51 | 			t.Fatalf("%s:%d: unexpected blank line", file, lineno)
 52 | 		case line[0] == '#':
 53 | 			continue
 54 | 		case 'A' <= line[0] && line[0] <= 'Z':
 55 | 			// Test name.
 56 | 			t.Logf("%s\n", line)
 57 | 			continue
 58 | 		case line == "strings":
 59 | 			str = str[:0]
 60 | 			inStrings = true
 61 | 		case line == "regexps":
 62 | 			inStrings = false
 63 | 		case line[0] == '"':
 64 | 			q, err = strconv.Unquote(line)
 65 | 			if err != nil {
 66 | 				// Fatal because we'll get out of sync.
 67 | 				t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err)
 68 | 			}
 69 | 			if inStrings {
 70 | 				str = append(str, q)
 71 | 				continue
 72 | 			}
 73 | 			// Is a regexp.
 74 | 			if len(input) != 0 {
 75 | 				t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q)
 76 | 			}
 77 | 			full = `\A(?:` + q + `)\z`
 78 | 			input = str
 79 | 		case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
 80 | 			// A sequence of match results.
 81 | 			ncase++
 82 | 			if len(input) == 0 {
 83 | 				t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno)
 84 | 			}
 85 | 			var text string
 86 | 			text, input = input[0], input[1:]
 87 | 			if strings.Contains(q, `\C`) || (!isSingleBytes(text) && strings.Contains(q, `\B`)) {
 88 | 				// RE2's \B considers every byte position,
 89 | 				// so it sees 'not word boundary' in the
 90 | 				// middle of UTF-8 sequences. This package
 91 | 				// only considers the positions between runes,
 92 | 				// so it disagrees. Skip those cases.
 93 | 				continue
 94 | 			}
 95 | 			res := strings.Split(line, ";")
 96 | 			if len(res) != len(run) {
 97 | 				t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run))
 98 | 			}
 99 | 			for i := range res {
100 | 				have, suffix := run[i](q, full, text)
101 | 				want := parseResult(t, file, lineno, res[i])
102 | 				if len(want) <= 2 && !same(have, want) {
103 | 					t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, q, suffix, text, have, want)
104 | 					if nfail++; nfail >= 100 {
105 | 						t.Fatalf("stopping after %d errors", nfail)
106 | 					}
107 | 					continue
108 | 				}
109 | 				b, suffix := match[i](q, full, text)
110 | 				if b != (want != nil) {
111 | 					t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, q, suffix, text, b, !b)
112 | 					if nfail++; nfail >= 100 {
113 | 						t.Fatalf("stopping after %d errors", nfail)
114 | 					}
115 | 					continue
116 | 				}
117 | 			}
118 | 
119 | 		default:
120 | 			t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line)
121 | 		}
122 | 	}
123 | 	if err := scanner.Err(); err != nil {
124 | 		t.Fatalf("%s:%d: %v", file, lineno, err)
125 | 	}
126 | 	if len(input) != 0 {
127 | 		t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input))
128 | 	}
129 | 	t.Logf("%d cases tested", ncase)
130 | }
131 | 
// TODO(matloob): This is deceptive because we're not reusing the DFA between
// tests. FIX IT!

// run lists the search variants tried for every test case, in the order of
// the semicolon-separated fields on a result line: full match, partial match,
// full longest match and partial longest match. Each function receives the
// regexp, its fully anchored form and the text, and returns the match
// location together with a suffix used in failure messages.
var run = []func(string, string, string) ([]int, string){
	runFull,
	runPartial,
	runFullLongest,
	runPartialLongest,
}
141 | 
142 | func runFull(re, refull, text string) ([]int, string) {
143 | 	return dfaSubmatchIndex(refull, text, false), "[full]"
144 | }
145 | 
146 | func runPartial(re, refull, text string) ([]int, string) {
147 | 	return dfaSubmatchIndex(re, text, false), ""
148 | }
149 | 
150 | func runFullLongest(re, refull, text string) ([]int, string) {
151 | 	return dfaSubmatchIndex(refull, text, true), "[full,longest]"
152 | }
153 | 
154 | func runPartialLongest(re, refull, text string) ([]int, string) {
155 | 	return dfaSubmatchIndex(re, text, true), "[longest]"
156 | }
157 | 
158 | func dfaSubmatchIndex(re, text string, longest bool) []int {
159 | 	i, j, b, err := matchDFA2(re, text, longest)
160 | 	if err != nil || !b {
161 | 		return nil
162 | 	}
163 | 	return []int{i, j}
164 | }
165 | 
// match lists the boolean-match variants tried for every test case, in the
// same order as run: full, partial, full longest and partial longest. Each
// function receives the regexp, its fully anchored form and the text, and
// returns whether the text matched together with a suffix used in failure
// messages.
var match = []func(string, string, string) (bool, string){
	matchFull,
	matchPartial,
	matchFullLongest,
	matchPartialLongest,
}
172 | 
173 | func matchFull(re, refull, text string) (bool, string) {
174 | 	return dfaMatchString(refull, text, false), "[full]"
175 | }
176 | 
177 | func matchPartial(re, refull, text string) (bool, string) {
178 | 	return dfaMatchString(re, text, false), ""
179 | }
180 | 
181 | func matchFullLongest(re, refull, text string) (bool, string) {
182 | 	return dfaMatchString(refull, text, true), "[full,longest]"
183 | }
184 | 
185 | func matchPartialLongest(re, refull, text string) (bool, string) {
186 | 	return dfaMatchString(re, text, true), "[longest]"
187 | }
188 | 
189 | func dfaMatchString(re, text string, longest bool) bool {
190 | 	_, _, b, err := matchDFA2(re, text, longest)
191 | 	return err == nil && b
192 | }
193 | 


--------------------------------------------------------------------------------
/internal/dfa/dfa_test.go:
--------------------------------------------------------------------------------
  1 | // TODO(matloob): DELETE ME!
  2 | 
  3 | package dfa
  4 | 
  5 | import (
  6 | 	"testing"
  7 | 
  8 | 	"matloob.io/regexp/internal/input"
  9 | 	"matloob.io/regexp/syntax"
 10 | )
 11 | 
 12 | func matchDFA(regexp string, input string) (int, int, bool, error) {
 13 | 	return matchDFA2(regexp, input, false)
 14 | }
 15 | 
 16 | func matchDFA2(regexp string, inputstr string, longest bool) (int, int, bool, error) {
 17 | 	re, err := syntax.Parse(regexp, syntax.Perl)
 18 | 	if err != nil {
 19 | 		return 0, 0, false, err
 20 | 	}
 21 | 	prog, err := syntax.Compile(re)
 22 | 	if err != nil {
 23 | 		return 0, 0, false, err
 24 | 	}
 25 | 
 26 | 	kind := firstMatch
 27 | 	if longest {
 28 | 		kind = longestMatch
 29 | 	}
 30 | 
 31 | 	d := newDFA(prog, kind, 0)
 32 | 
 33 | 	revprog, err := syntax.CompileReversed(re)
 34 | 	if err != nil {
 35 | 		panic("failed to compile reverse prog")
 36 | 	}
 37 | 
 38 | 	reversed := newDFA(revprog, longestMatch, 0)
 39 | 
 40 | 	var i input.InputString
 41 | 	i.Reset(inputstr)
 42 | 	j, k, b, err := search(d, reversed, &i, 0)
 43 | 	return j, k, b, err
 44 | }
 45 | 
 46 | func TestDFA(t *testing.T) {
 47 | 	// These are all anchored matches.
 48 | 	testCases := []struct {
 49 | 		re    string
 50 | 		in    string
 51 | 		wantS int
 52 | 		wantE int
 53 | 		want  bool
 54 | 	}{
 55 | 
 56 | 		{"abc", "abc", 0, 3, true},
 57 | 		{"abc", "ab", -1, -1, false},
 58 | 		{".*(a|z)bc", "eedbcxcee", -1, -1, false},
 59 | 		{"^abc", "xxxabcxxx", -1, -1, false},
 60 | 
 61 | 		{"ab*", "xxxabbxxx", 3, 6, true},
 62 | 		{"abc", "xxxabcxxx", 3, 6, true},
 63 | 
 64 | 		{"(>[^\n]+)?\n", ">One Homo sapiens alu\nGGCCGGGCGCG", 0, 22, true},
 65 | 		{"abc", "abcxxxabc", 0, 3, true},
 66 | 		{"^abcde", "abcde", 0, 5, true},
 67 | 		{"^", "abcde", 0, 0, true},
 68 | 		{"abcde$", "abcde", 0, 5, true},
 69 | 		{"$", "abcde", 5, 5, true},
 70 | 		{"agggtaa[cgt]|[acg]ttaccct", "agggtaag", 0, 8, true},
 71 | 		{"[cgt]gggtaaa|tttaccc[acg]", "xtttacccce", 1, 9, true},
 72 | 		{"[日本語]+", "日本語日本語", 0, len("日本語日本語"), true},
 73 | 		{"a.", "paranormal", 1, 3, true},
 74 | 		{`\B`, "x", -1, -1, false},
 75 | 	}
 76 | 	for _, tc := range testCases {
 77 | 		i, j, got, err := matchDFA(tc.re, tc.in)
 78 | 		if err != nil {
 79 | 			t.Error(err)
 80 | 		}
 81 | 		if got != tc.want || i != tc.wantS || j != tc.wantE {
 82 | 			t.Errorf("matchDFA(%q, %q): got (%v, %v, %v), want (%v, %v, %v)", tc.re, tc.in, i, j, got, tc.wantS, tc.wantE, tc.want)
 83 | 		}
 84 | 	}
 85 | 
 86 | }
 87 | func TestDFA3(t *testing.T) {
 88 | 	// These are all anchored matches.
 89 | 	testCases := []struct {
 90 | 		re    string
 91 | 		in    string
 92 | 		wantS int
 93 | 		wantE int
 94 | 		want  bool
 95 | 	}{
 96 | 		{`\B`, "a0b", 1, 1, true},
 97 | 		//		{"\\B", "x", -1, -1, false},
 98 | 		//		{"\\B", "xx yy", 1,1,true},
 99 | 		//		{`(?:A|(?:A|a))`, "B", -1, -1, true},
100 | 		//		{`(?:A|(?:A|a))`, "B", -1, -1, true},
101 | 	}
102 | 	for _, tc := range testCases {
103 | 		i, j, got, err := matchDFA(tc.re, tc.in)
104 | 		if err != nil {
105 | 			t.Error(err)
106 | 			continue
107 | 		}
108 | 		if got != tc.want || i != tc.wantS || j != tc.wantE {
109 | 			t.Errorf("matchDFA(%q, %q): got (%v, %v, %v), want (%v, %v, %v)", tc.re, tc.in, i, j, got, tc.wantS, tc.wantE, tc.want)
110 | 		}
111 | 	}
112 | }


--------------------------------------------------------------------------------
/internal/dfa/exec_test.go:
--------------------------------------------------------------------------------
 1 | package dfa
 2 | 
 3 | import (
 4 | 	"unicode/utf8"
 5 | 	"testing"
 6 | 	"strconv"
 7 | 	"strings"
 8 | 	)
 9 | 
// isSingleBytes reports whether every rune of s is encoded as a single byte,
// that is, whether s contains no byte at or above utf8.RuneSelf.
func isSingleBytes(s string) bool {
	for i := 0; i < len(s); i++ {
		if s[i] >= utf8.RuneSelf {
			return false
		}
	}
	return true
}
18 | 
// parseResult converts one result field of a test file into a flat slice of
// index pairs.
//
// A lone "-" means no match and yields nil. Otherwise res is a
// space-separated list of pairs, each either "lo-hi" or "-" (no submatch,
// recorded as -1, -1). A malformed pair is fatal to the test and is reported
// with the given file and line number.
func parseResult(t *testing.T, file string, lineno int, res string) []int {
	if res == "-" {
		return nil
	}
	pairs := strings.Split(res, " ")
	out := make([]int, 0, 2*len(pairs))
	for _, pair := range pairs {
		if pair == "-" {
			// - means no submatch.
			out = append(out, -1, -1)
			continue
		}
		sep := strings.Index(pair, "-")
		if sep < 0 {
			t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
		}
		lo, errLo := strconv.Atoi(pair[:sep])
		hi, errHi := strconv.Atoi(pair[sep+1:])
		if errLo != nil || errHi != nil || lo > hi {
			t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
		}
		out = append(out, lo, hi)
	}
	return out
}
60 | 
// same reports whether x and y have the same length and the same elements in
// the same order.
func same(x, y []int) bool {
	if len(y) != len(x) {
		return false
	}
	for i := 0; i < len(x); i++ {
		if x[i] != y[i] {
			return false
		}
	}
	return true
}
72 | 


--------------------------------------------------------------------------------
/internal/dfa/runerange.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2016 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package dfa
  6 | 
  7 | import (
  8 | 	"sort"
  9 | 	"unicode"
 10 | 	"matloob.io/regexp/internal/input"
 11 | 	"matloob.io/regexp/syntax"
 12 | )
 13 | 
// rangeMap maps runes to small integer range indices. It is populated by
// init from the runes and rune ranges that a program's instructions mention.
type rangeMap struct {
	// bytemap holds the range index for each rune below 256, indexed by rune.
	bytemap []int
	// divides holds -1 followed by the marked runes in ascending order;
	// lookup binary-searches it for runes above 255.
	divides []rune
}
 18 | 
 19 | func (rm *rangeMap) lookup(r rune) int {
 20 | 	// Use the trivial byte map for now...
 21 | 	// See ComputeByteMap
 22 | 	if r == input.EndOfText {
 23 | 		return len(rm.divides)
 24 | 	}
 25 | 	if r == input.StartOfText {
 26 | 		return len(rm.divides) + 1
 27 | 	}
 28 | 	if r > 255 {
 29 | 		// binary search for the range
 30 | 		lo, hi := 0, len(rm.divides)
 31 | 		for {
 32 | 			// search rm.divides
 33 | 			center := (lo + hi) / 2
 34 | 			if center == lo {
 35 | 				return lo
 36 | 			}
 37 | 			divcenter := rm.divides[center]
 38 | 			if r >= divcenter {
 39 | 				lo = center
 40 | 			} else {
 41 | 				hi = center
 42 | 			}
 43 | 		}
 44 | 	}
 45 | 	// Faster lookup for runes < 256.
 46 | 	return rm.bytemap[int(r)]
 47 | }
 48 | 
// count returns the number of range indices, including the two extra ones
// that lookup hands out for input.EndOfText and input.StartOfText.
// 0 <= rm.lookup(r) < rm.count() for all runes r.
func (rm *rangeMap) count() int {
	return len(rm.divides) + 2
}
 53 | 
 54 | func (rm *rangeMap) init(prog *syntax.Prog) {
 55 | 	rangemark := make(map[rune]bool)
 56 | 	addRune := func(r rune) {
 57 | 		rangemark[r] = true
 58 | 		rangemark[r+1] = true
 59 | 	}
 60 | 	addRuneRange := func(rl, rh rune) {
 61 | 		rangemark[rl] = true
 62 | 		rangemark[rh+1] = true
 63 | 	}
 64 | 	addRuneFolds := func(r rune) {
 65 | 		for r1 := unicode.SimpleFold(r) ;r1 != r; r1 = unicode.SimpleFold(r1) {
 66 | 			addRune(r1)
 67 | 		}
 68 | 	}
 69 | 	for _, inst := range prog.Inst {
 70 | 		switch inst.Op {
 71 | 		case syntax.InstRune:
 72 | 			if len(inst.Rune) == 1 {
 73 | 				// special case of single rune
 74 | 				r := inst.Rune[0]
 75 | 				addRune(r)
 76 | 				if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
 77 | 					addRuneFolds(r)
 78 | 				}
 79 | 				break
 80 | 			}
 81 | 			// otherwise inst.Rune is a series of ranges
 82 | 			for i := 0; i < len(inst.Rune); i += 2 {
 83 | 				addRuneRange(inst.Rune[i], inst.Rune[i+1])
 84 | 				if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
 85 | 					for r0 := inst.Rune[i]; r0 <= inst.Rune[i+1]; r0++ {
 86 | 						// Range mapping doesn't commute, so we have to
 87 | 						// add folds individually.
 88 | 						addRuneFolds(r0)
 89 | 					}
 90 | 				}
 91 | 			}
 92 | 		case syntax.InstRune1:
 93 | 			r := inst.Rune[0]
 94 | 			addRune(r)
 95 | 			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
 96 | 				addRuneFolds(r)
 97 | 			}
 98 | 		case syntax.InstRuneAnyNotNL:
 99 | 			addRune('\n')
100 | 		case syntax.InstEmptyWidth:
101 | 			switch syntax.EmptyOp(inst.Arg) {
102 | 			case syntax.EmptyBeginLine, syntax.EmptyEndLine:
103 | 				addRune('\n')
104 | 			case syntax.EmptyWordBoundary, syntax.EmptyNoWordBoundary:
105 | 				addRuneRange('A', 'Z')
106 | 				addRuneRange('a', 'Z')
107 | 				addRuneRange('0', '9')
108 | 				addRune('_')
109 | 			}
110 | 		}
111 | 	}
112 | 
113 | 	divides := make([]rune, 0, len(rangemark))
114 | 	divides = append(divides, -1)
115 | 	for r := range rangemark {
116 | 		divides = append(divides, r)
117 | 	}
118 | 	runeSlice(divides).Sort()
119 | 	rm.divides = divides
120 | 	rm.bytemap = make([]int, 256)
121 | 	k := 0
122 | 	for i := range rm.bytemap {
123 | 		if rangemark[rune(i)] {
124 | 			k++
125 | 		}
126 | 		rm.bytemap[i] = k
127 | 	}
128 | }
129 | 
// runeSlice attaches the methods of sort.Interface to []rune so that a
// slice of runes can be sorted in increasing order.
type runeSlice []rune

// Len returns the number of runes in the slice.
func (rs runeSlice) Len() int {
	return len(rs)
}

// Less reports whether the rune at index i sorts before the one at index j.
func (rs runeSlice) Less(i, j int) bool {
	return rs[j] > rs[i]
}

// Swap exchanges the runes at indexes i and j.
func (rs runeSlice) Swap(i, j int) {
	tmp := rs[i]
	rs[i] = rs[j]
	rs[j] = tmp
}

// Sort sorts the slice in place in increasing order.
func (rs runeSlice) Sort() { sort.Sort(rs) }
141 | 


--------------------------------------------------------------------------------
/internal/dfa/search.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2016 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package dfa
  6 | 
  7 | import (
  8 | 	"sync"
  9 | 	"errors"
 10 | 	"math"
 11 | 	"matloob.io/regexp/internal/input"
 12 | 	"matloob.io/regexp/syntax"
 13 | )
 14 | 
 15 | type Searcher struct {
 16 | 	mu sync.Mutex
 17 | 	re               *syntax.Regexp
 18 | 	prog               *syntax.Prog
 19 | 	prefixer input.Prefixer
 20 | 	fdfa, ldfa, revdfa *DFA
 21 | }
 22 | 
 23 | func (s *Searcher) Init(prog *syntax.Prog, expr *syntax.Regexp, p input.Prefixer) {
 24 | 	s.prog = prog
 25 | 	s.re = expr
 26 | 	s.prefixer = p 
 27 | }
 28 | 
// errNotDFA is returned by Search when the DFA cannot be used: either the
// input does not implement input.Rinput or the DFA search reported an error.
var errNotDFA = errors.New("can't use dfa")
 30 | 
 31 | func (s *Searcher) Search(i input.Input, pos int, longest bool, matchcap *[]int, ncap int) (bool, error) {
 32 | 	const budget = (2 << 20)/3
 33 | 	rinput, ok := i.(input.Rinput)
 34 | 	if !ok {
 35 | 		return false, errNotDFA
 36 | 	}
 37 | 	var dfa *DFA
 38 | 	if longest {
 39 | 		s.mu.Lock()
 40 | 		if s.ldfa == nil {
 41 | 			s.ldfa = newDFA(s.prog, longestMatch, budget)
 42 | 			s.ldfa.prefixer = s.prefixer
 43 | 		}
 44 | 		dfa = s.ldfa
 45 | 		s.mu.Unlock()
 46 | 	} else {
 47 | 		s.mu.Lock()
 48 | 		if s.fdfa == nil {
 49 | 			s.fdfa = newDFA(s.prog, firstMatch, budget)
 50 | 			s.fdfa.prefixer = s.prefixer
 51 | 		}
 52 | 		dfa = s.fdfa
 53 | 		s.mu.Unlock()
 54 | 	}
 55 | 	var revdfa *DFA
 56 | 	if s.revdfa == nil {
 57 | 		s.mu.Lock()
 58 | 		revprog, err := syntax.CompileReversed(s.re)
 59 | 		if err != nil {
 60 | 			panic("CompileReversed failed")
 61 | 		}
 62 | 		s.revdfa = newDFA(revprog, longestMatch, budget)
 63 | 		s.mu.Unlock()
 64 | 	}
 65 | 	s.mu.Lock()
 66 | 	revdfa = s.revdfa
 67 | 	s.mu.Unlock()
 68 | 		
 69 | 	var matched bool
 70 | 	*matchcap = (*matchcap)[:ncap]
 71 | 	p, ep, matched, err := search(dfa, revdfa, rinput, pos)
 72 | 	if err != nil {
 73 | 		return false, errNotDFA
 74 | 	}
 75 | 	if ncap > 0 {
 76 | 		(*matchcap)[0], (*matchcap)[1] = p, ep
 77 | 	}
 78 | 	return matched, nil
 79 | }
 80 | 
 81 | type searchParams struct {
 82 | 	input            input.Rinput
 83 | 	startpos          int
 84 | 	anchored          bool
 85 | 	wantEarliestMatch bool
 86 | 	runForward        bool
 87 | 	start             *State
 88 | 	firstbyte         int64 // int64 to be compatible with atomic ops
 89 | 	failed            bool  // "out" parameter: whether search gave up
 90 | 	ep                int   // "out" parameter: end pointer for match
 91 | 
 92 | 	matches []int
 93 | }
 94 | 
 95 | func isanchored(prog *syntax.Prog) bool {
 96 | 	return prog.StartCond() & syntax.EmptyBeginText != 0
 97 | }
 98 | 
 99 | func search(d, reversed *DFA, i input.Rinput, startpos int) (start int, end int, matched bool, err error) {
100 | 	params := searchParams{}
101 | 	params.startpos = startpos
102 | 	params.wantEarliestMatch = false
103 | 	params.input = i
104 | 	params.anchored = isanchored(d.prog)
105 | 	params.runForward = true
106 | 	params.ep = int(math.MaxInt64)
107 | 	if !d.analyzeSearch(&params) {
108 | 		return -1, -1, false, errors.New("analyze search failed on forward DFA")
109 | 	}
110 | 	b := d.searchLoop(&params)
111 | 	if params.failed {
112 | 		return -1, -1, false, errFallBack
113 | 	}
114 | 	if !b {
115 | 		return -1, -1, false, nil
116 | 	}
117 | 	end = params.ep
118 | 
119 | 	params = searchParams{}
120 | 	params.startpos = startpos
121 | 	params.ep = end
122 | 	params.anchored = true
123 | 	params.input = i
124 | 	params.runForward = false
125 | 	if !reversed.analyzeSearch(&params) {
126 | 		return -2, -2, false, errors.New("analyze search failed on reverse DFA")
127 | 	}
128 | 	b = reversed.searchLoop(&params)
129 | 	if DebugDFA {
130 | 		DebugPrintf("\nkind %d\n%v\n", d.kind, d.prog)
131 | 	}
132 | 	if params.failed {
133 | 		return -1, -1, false, errFallBack
134 | 	}
135 | 	return params.ep, end, b, nil
136 | }


--------------------------------------------------------------------------------
/internal/dfa/state.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2016 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package dfa
  6 | 
  7 | // TODO(matloob): rename all the upper-case identifiers to lower-case.
  8 | 
  9 | import (
 10 | 	"bytes"
 11 | 	"strconv"
 12 | 	"sync"
 13 | )
 14 | 
// stateInst is an int-based type for an instruction id.
// NOTE(review): nothing in this file uses it (State.inst is a []int); the
// open question from the original author is whether to drop it and just
// use plain ints.
type stateInst int
 17 | 
// State is a single DFA state: the instructions it stands for, the flags in
// effect on entry, and its outgoing transitions.
type State struct {
	// NOTE(review): mu is not referenced anywhere in this file; confirm
	// what it guards (presumably next) against the code that uses it.
	mu sync.Mutex

	// Instruction pointers in the state.
	// TODO(matloob): Should these have a different type?
	inst []int

	// Empty string bitfield flags in effect on the way
	// into this state, along with flagMatch if this is
	// a matching state.
	flag flag

	// Outgoing arrows from State, one per input byte class.
	next []*State
}
 33 | 
 34 | func (s *State) isMatch() bool {
 35 | 	return s.flag&flagMatch != 0
 36 | }
 37 | 
// flag is the bit field stored in State.flag.
type flag uint32

// Values used with flag.
//
// flagMatch is the bit tested by State.isMatch. The other three are not used
// in this file. NOTE(review): presumably flagEmptyMask selects the
// empty-width bits, flagLastWord records that the previous byte was a word
// character, and flagNeedShift is a shift count rather than a mask — confirm
// against their users.
var (
	flagEmptyMask = flag(0xFFF)
	flagMatch     = flag(0x1000)
	flagLastWord  = flag(0x2000)
	flagNeedShift = flag(16)
)
 46 | 
 47 | // Special "firstbyte" values for a state.  (Values >= 0 denote actual bytes.)
 48 | const (
 49 | 	fbUnknown int64 = -1 // No analysis has been performed.
 50 | 	fbMany int64   = -2 // Many bytes will lead out of this state.
 51 | 	fbNone int64  = -3 // No bytes lead out of this state.
 52 | )
 53 | 
// Start-state indices. The unanchored indices are even and kStartAnchored
// is 1, so adding it selects the slot right after the unanchored one;
// maxStart is one past the largest such index.
const (
	// Indices into start for unanchored searches.
	// Add kStartAnchored for anchored searches.
	startBeginText       = 0
	startBeginLine       = 2
	startWordBoundary    = 4
	startNonWordBoundary = 6
	maxStart             = 8

	kStartAnchored = 1
)
 65 | 
// mark is the sentinel stored in State.inst between groups of instruction
// ids; Dump prints it as "|".
const mark = -1
 67 | 
// deadState and fullMatchState are sentinel states that are recognized by
// pointer identity (see isSpecialState and Dump) rather than by content.
//
// TODO(matloob): in RE2 deadState and fullMatchState are (State*)(1) and (State*)(2)
// respectively. Is it cheaper to compare with those numbers, than these states?
// Do we need to import package unsafe?
var deadState = &State{}
var fullMatchState = &State{}
 73 | 
 74 | func isSpecialState(s *State) bool {
 75 | 	// see above. cc does int comparison because deadState and fullMatchState
 76 | 	// are special numbers, but that's unsafe.
 77 | 	// TODO(matloob): convert states back to numbers. (pointers into state array state(-2) and state(-1))
 78 | 	return s == deadState || s == fullMatchState || s == nil
 79 | }
 80 | 
 81 | func (s *State) Dump() string {
 82 | 	switch s {
 83 | 	case nil:
 84 | 		return "_"
 85 | 	case deadState:
 86 | 		return "X"
 87 | 	case fullMatchState:
 88 | 		return "*"
 89 | 	}
 90 | 	var buf bytes.Buffer
 91 | 	sep := ""
 92 | 	buf.WriteString("(0x<TODO(matloob):state id>)")
 93 | 	// buf.WriteString(fmt.Sprintf("(%p)", s)
 94 | 	for _, inst := range s.inst {
 95 | 		if inst == int(mark) {
 96 | 			buf.WriteString("|")
 97 | 			sep = ""
 98 | 		} else {
 99 | 			buf.WriteString(sep)
100 | 			buf.WriteString(strconv.Itoa(inst))
101 | 			sep = ","
102 | 		}
103 | 	}
104 | 	buf.WriteString("flag=0x")
105 | 	buf.WriteString(strconv.FormatUint(uint64(s.flag), 16))
106 | 	return buf.String()
107 | }
108 | 
109 | type stateSet struct {
110 | 	states []State
111 | 
112 | 	instpool []int
113 | 	instpos  int
114 | 
115 | 	nextpool []*State
116 | 	nextpos  int
117 | }
118 | 
119 | func (s *stateSet) init(budget int, runeRanges int, proglen int, nmark int) {
120 | 	// estimate State size to avoid using unsafe
121 | 	const intsize = 8
122 | 	const slicesize = 3*intsize
123 | 	const statesize = 2 *slicesize+intsize
124 | 
125 | 	// the cost of one state including the inst and next slices
126 | 	onestate := statesize + runeRanges*intsize + (proglen+nmark)*intsize
127 | 	numstates := budget/onestate
128 | 	// TODO(matloob): actually use budget number
129 | 	s.states = make([]State, 0, numstates)
130 | 
131 | 	s.instpool = make([]int, 0, (proglen+nmark)*numstates)
132 | 	s.instpos = 0
133 | 	s.nextpool = make([]*State, 0, runeRanges*numstates)
134 | 	s.nextpos = 0
135 | 
136 | }
137 | 
138 | // clear clears the state cache. Must hold the DFA's cache mutex to call clear.
139 | func (s *stateSet) clear() {
140 | 	s.states = s.states[:0]
141 | 	s.instpool = s.instpool[:0]
142 | 	s.nextpool = s.nextpool[:0]
143 | }
144 | 
145 | func (s *stateSet) find(inst []int, flag flag) *State {
146 | loop:
147 | 	for i := range s.states {
148 | 		if len(s.states[i].inst) != len(inst) {
149 | 			continue
150 | 		}
151 | 		for j := range inst {
152 | 			if s.states[i].inst[j] != inst[j] {
153 | 				continue loop
154 | 			}
155 | 		}
156 | 		if s.states[i].flag != flag {
157 | 			continue
158 | 		}
159 | 		return &s.states[i]
160 | 	}
161 | 	return nil
162 | }
163 | 
164 | func (s *stateSet) size() int {
165 | 	return len(s.states)
166 | }
167 | 
168 | func (s *stateSet) insert(inst []int, flag flag, nextsize int) *State {
169 | 	if len(s.states)+1 > cap(s.states) ||
170 | 		s.instpos+len(inst) > cap(s.instpool) ||
171 | 		s.nextpos+nextsize > cap(s.nextpool) {
172 | 		// state cache is full
173 | 		return nil
174 | 	}
175 | 
176 | 	// TODO(matloob): can we insert?
177 | 	i := len(s.states)
178 | 	s.states = s.states[:i+1]
179 | 	state := &s.states[i]
180 | 
181 | 	instsize := len(inst)
182 | 	state.inst = s.instpool[s.instpos : s.instpos+instsize]
183 | 	s.instpos += instsize
184 | 	copy(state.inst, inst)
185 | 
186 | 	state.flag = flag
187 | 
188 | 	state.next = s.nextpool[s.nextpos : s.nextpos+nextsize]
189 | 	s.nextpos += nextsize
190 | 	for i := range state.next {
191 | 		state.next[i] = nil
192 | 	}
193 | 
194 | 	return state
195 | }
196 | 
// startInfo pairs a start state with its firstbyte value (a byte value, or
// one of the fbUnknown/fbMany/fbNone sentinels).
type startInfo struct {
	start     *State
	firstbyte int64
}
201 | 
202 | type stateSaver struct {
203 | 	dfa       *DFA
204 | 	inst      []int
205 | 	flag      flag
206 | 	isSpecial bool
207 | 	special   *State // if it's a special state special != nil
208 | }
209 | 
210 | func (s *stateSaver) Save(dfa *DFA, state *State) {
211 | 	s.dfa = dfa
212 | 	if isSpecialState(state) {
213 | 		s.inst = nil
214 | 		s.flag = 0
215 | 		s.special = state
216 | 		s.isSpecial = true
217 | 	}
218 | 	s.isSpecial = false
219 | 	s.flag = state.flag
220 | 
221 | 	s.inst = s.inst[:0]
222 | 	s.inst = append(s.inst, state.inst...)
223 | }
224 | 
225 | func (s *stateSaver) Restore() *State {
226 | 	if s.isSpecial {
227 | 		return s.special
228 | 	}
229 | 	s.dfa.mu.Lock()
230 | 	state := s.dfa.cachedState(s.inst, s.flag)
231 | 	s.inst = nil
232 | 	s.dfa.mu.Unlock()
233 | 	return state
234 | }
235 | 


--------------------------------------------------------------------------------
/internal/dfa/workq.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2016 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package dfa
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"strconv"
 10 | )
 11 | 
 12 | type sparseSet struct {
 13 | 	sparseToDense []int
 14 | 	dense         []int
 15 | }
 16 | 
 17 | func makeSparseSet(maxSize int) sparseSet {
 18 | 	// 	s.maxSize = maxSize  // not necessary, right?
 19 | 	return sparseSet{
 20 | 		sparseToDense: make([]int, maxSize),
 21 | 		dense:         make([]int, maxSize),
 22 | 	}
 23 | }
 24 | 
 25 | func (s *sparseSet) resize(newMaxSize int) {
 26 | 	// TODO(matloob): Use slice length instead of size for 'dense'.
 27 | 	// Use cap instead of maxSize for both.
 28 | 	size := len(s.dense)
 29 | 	if size > newMaxSize {
 30 | 		size = newMaxSize
 31 | 	}
 32 | 	if newMaxSize > len(s.sparseToDense) {
 33 | 		a := make([]int, newMaxSize)
 34 | 		if s.sparseToDense != nil {
 35 | 			copy(a, s.sparseToDense)
 36 | 		}
 37 | 		s.sparseToDense = a
 38 | 
 39 | 		a = make([]int, size, newMaxSize)
 40 | 		if s.dense != nil {
 41 | 			copy(a, s.dense)
 42 | 		}
 43 | 		s.dense = a
 44 | 	}
 45 | }
 46 | 
 47 | func (s *sparseSet) maxSize() int {
 48 | 	return cap(s.dense)
 49 | }
 50 | 
 51 | func (s *sparseSet) clear() {
 52 | 	s.dense = s.dense[:0]
 53 | }
 54 | 
 55 | func (s *sparseSet) contains(i int) bool {
 56 | 	if i >= len(s.sparseToDense) {
 57 | 		return false
 58 | 	}
 59 | 	return s.sparseToDense[i] < len(s.dense) && s.dense[s.sparseToDense[i]] == i
 60 | }
 61 | 
 62 | func (s *sparseSet) insert(i int) {
 63 | 	if s.contains(i) {
 64 | 		return
 65 | 	}
 66 | 	s.insertNew(i)
 67 | }
 68 | 
 69 | func (s *sparseSet) insertNew(i int) {
 70 | 	if i >= len(s.sparseToDense) {
 71 | 		return
 72 | 	}
 73 | 	// There's a CHECK here that size < maxSize...
 74 | 
 75 | 	s.sparseToDense[i] = len(s.dense)
 76 | 	s.dense = s.dense[:len(s.dense)+1]
 77 | 	s.dense[len(s.dense)-1] = i
 78 | }
 79 | 
 80 | type workq struct {
 81 | 	s           sparseSet
 82 | 	n           int  // size excluding marks
 83 | 	maxm        int  // maximum number of marks
 84 | 	nextm       int  // id of next mark
 85 | 	lastWasMark bool // last inserted was mark
 86 | }
 87 | 
 88 | func newWorkq(n, maxmark int) *workq {
 89 | 	return &workq{
 90 | 		s:           makeSparseSet(n + maxmark),
 91 | 		n:           n,
 92 | 		maxm:        maxmark,
 93 | 		nextm:       n,
 94 | 		lastWasMark: true,
 95 | 	}
 96 | }
 97 | 
 98 | func (q *workq) isMark(i int) bool { return i >= q.n }
 99 | 
100 | func (q *workq) clear() {
101 | 	q.s.clear()
102 | 	q.nextm = q.n
103 | }
104 | 
105 | func (q *workq) contains(i int) bool {
106 | 	return q.s.contains(i)
107 | }
108 | 
109 | func (q *workq) maxmark() int {
110 | 	return q.maxm
111 | }
112 | 
113 | func (q *workq) mark() {
114 | 	if q.lastWasMark {
115 | 		return
116 | 	}
117 | 	q.lastWasMark = false
118 | 	q.s.insertNew(int(q.nextm))
119 | 	q.nextm++
120 | }
121 | 
122 | func (q *workq) size() int {
123 | 	return q.n + q.maxm
124 | }
125 | 
126 | func (q *workq) insert(id int) {
127 | 	if q.s.contains(id) {
128 | 		return
129 | 	}
130 | 	q.insertNew(id)
131 | }
132 | 
133 | func (q *workq) insertNew(id int) {
134 | 	q.lastWasMark = false
135 | 	q.s.insertNew(id)
136 | }
137 | 
138 | func (q *workq) elements() []int { // should be []stateInst. Should we convert sparseset to use stateInst instead of int??
139 | 	return q.s.dense
140 | }
141 | 
142 | func (q *workq) dump() string {
143 | 	var buf bytes.Buffer
144 | 	sep := ""
145 | 	for _, v := range q.elements() {
146 | 		if q.isMark(v) {
147 | 			buf.WriteString("|")
148 | 			sep = ""
149 | 		} else {
150 | 			buf.WriteString(sep)
151 | 			buf.WriteString(strconv.Itoa(v))
152 | 			sep = ","
153 | 		}
154 | 	}
155 | 	return buf.String()
156 | }
157 | 


--------------------------------------------------------------------------------
/internal/input/input.go:
--------------------------------------------------------------------------------
  1 | package input
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"io"
  6 | 	"matloob.io/regexp/syntax"
  7 | 	"strings"
  8 | 	"unicode/utf8"
  9 | )
 10 | 
// Prefixer supplies the literal prefix that HasPrefix and Index look for,
// as a string (used by InputString) and as a byte slice (used by InputBytes).
type Prefixer interface {
	Prefix() string
	PrefixBytes() []byte
}
 15 | 
// EndOfText and StartOfText are the rune values that Step and Rstep return,
// with width 0, when there is no rune to read at the requested position.
const EndOfText rune = -1
const StartOfText rune = -2
 18 | 
// Input abstracts different representations of the input text. It provides
// one-character lookahead.
type Input interface {
	// Step returns the rune starting at pos and its width. Unless
	// CanCheckPrefix is true, Step should always be called with
	// the current position in the string, which is the sum
	// of the previous pos Step was called with, and the width
	// returned by that call.
	Step(pos int) (r rune, width int)

	// CanCheckPrefix reports whether we can look ahead without losing info.
	CanCheckPrefix() bool

	// HasPrefix reports whether the input has the prefix reported
	// by the Prefixer.
	HasPrefix(p Prefixer) bool

	// Index returns the index of the first occurrence of the
	// prefix following pos, or -1 if it can't be found.
	// The string and byte-slice implementations in this file report
	// the index relative to pos.
	Index(p Prefixer, pos int) int

	// Context returns the EmptyOp flags satisfied by the context at pos.
	Context(pos int) syntax.EmptyOp
}
 43 | 
// Rinput is an Input that can also be read backwards.
type Rinput interface {
	Input

	// Rstep returns the rune ending at pos and its width. The
	// implementations in this file return StartOfText with width 0 when
	// pos is past the end of the text, and EndOfText with width 0 when
	// there is no rune before pos.
	Rstep(pos int) (r rune, width int)
}
 49 | 
 50 | // InputString scans a string.
 51 | type InputString struct {
 52 | 	str string
 53 | }
 54 | 
 55 | // Reset resets the InputString with the given string.
 56 | func (i *InputString) Reset(str string) {
 57 | 	i.str = str
 58 | }
 59 | 
 60 | func (i *InputString) Step(pos int) (rune, int) {
 61 | 	if pos < 0 {
 62 | 		return StartOfText, 0
 63 | 	}
 64 | 	if pos < len(i.str) {
 65 | 		c := i.str[pos]
 66 | 		if c < utf8.RuneSelf {
 67 | 			return rune(c), 1
 68 | 		}
 69 | 		return utf8.DecodeRuneInString(i.str[pos:])
 70 | 	}
 71 | 	return EndOfText, 0
 72 | }
 73 | 
 74 | func (i *InputString) Rstep(pos int) (rune, int) {
 75 | 	if pos > len(i.str) {
 76 | 		return StartOfText, 0
 77 | 	}
 78 | 	if pos >= 0 {
 79 | 		c := i.str[pos-1]
 80 | 		if c < utf8.RuneSelf {
 81 | 			return rune(c), 1
 82 | 		}
 83 | 		return utf8.DecodeLastRuneInString(i.str[:pos])
 84 | 	}
 85 | 	return EndOfText, 0
 86 | }
 87 | 
 88 | func (i *InputString) CanCheckPrefix() bool {
 89 | 	return true
 90 | }
 91 | 
 92 | func (i *InputString) HasPrefix(p Prefixer) bool {
 93 | 	return strings.HasPrefix(i.str, p.Prefix())
 94 | }
 95 | 
 96 | func (i *InputString) Index(p Prefixer, pos int) int {
 97 | 	return strings.Index(i.str[pos:], p.Prefix())
 98 | }
 99 | 
100 | func (i *InputString) Context(pos int) syntax.EmptyOp {
101 | 	r1, r2 := EndOfText, EndOfText
102 | 	if pos > 0 && pos <= len(i.str) {
103 | 		r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
104 | 	}
105 | 	if pos < len(i.str) {
106 | 		r2, _ = utf8.DecodeRuneInString(i.str[pos:])
107 | 	}
108 | 	return syntax.EmptyOpContext(r1, r2)
109 | }
110 | 
111 | // InputBytes scans a byte slice.
112 | type InputBytes struct {
113 | 	str []byte
114 | }
115 | 
116 | // Reset resets the InputBytes with the given byte slice.
117 | func (i *InputBytes) Reset(str []byte) {
118 | 	i.str = str
119 | }
120 | 
121 | func (i *InputBytes) Step(pos int) (rune, int) {
122 | 	if pos < 0 {
123 | 		return StartOfText, 0
124 | 	}
125 | 	if pos < len(i.str) {
126 | 		c := i.str[pos]
127 | 		if c < utf8.RuneSelf {
128 | 			return rune(c), 1
129 | 		}
130 | 		return utf8.DecodeRune(i.str[pos:])
131 | 	}
132 | 	return EndOfText, 0
133 | }
134 | 
135 | func (i *InputBytes) Rstep(pos int) (rune, int) {
136 | 	if pos > len(i.str) {
137 | 		return StartOfText, 0
138 | 	}
139 | 	if pos >= 0 {
140 | 		c := i.str[pos-1]
141 | 		if c < utf8.RuneSelf {
142 | 			return rune(c), 1
143 | 		}
144 | 		return utf8.DecodeLastRune(i.str[:pos]) // This doesn't include pos char?
145 | 	}
146 | 	return EndOfText, 0
147 | }
148 | 
149 | 
150 | func (i *InputBytes) CanCheckPrefix() bool {
151 | 	return true
152 | }
153 | 
154 | func (i *InputBytes) HasPrefix(p Prefixer) bool {
155 | 	return bytes.HasPrefix(i.str, p.PrefixBytes())
156 | }
157 | 
158 | func (i *InputBytes) Index(p Prefixer, pos int) int {
159 | 	if pos > len(i.str) {
160 | 		panic("pos > len i.str")
161 | 	}
162 | 	if i.str == nil {
163 | 		panic("i.str nil")
164 | 	}
165 | 	if p == nil {
166 | 		panic("p is nil")
167 | 	}
168 | 	return bytes.Index(i.str[pos:], p.PrefixBytes())
169 | }
170 | 
171 | func (i *InputBytes) Context(pos int) syntax.EmptyOp {
172 | 	r1, r2 := EndOfText, EndOfText
173 | 	if pos > 0 && pos <= len(i.str) {
174 | 		r1, _ = utf8.DecodeLastRune(i.str[:pos])
175 | 	}
176 | 	if pos < len(i.str) {
177 | 		r2, _ = utf8.DecodeRune(i.str[pos:])
178 | 	}
179 | 	return syntax.EmptyOpContext(r1, r2)
180 | }
181 | 
182 | // InputReader scans a RuneReader.
183 | type InputReader struct {
184 | 	r     io.RuneReader
185 | 	atEOT bool
186 | 	pos   int
187 | }
188 | 
189 | // Reset resets the InputReader with the given RuneReader.
190 | func (i *InputReader) Reset(r io.RuneReader) {
191 | 	i.r = r
192 | 	i.atEOT = false
193 | 	i.pos = 0
194 | }
195 | 
196 | func (i *InputReader) Step(pos int) (rune, int) {
197 | 	if !i.atEOT && pos != i.pos {
198 | 		return EndOfText, 0
199 | 
200 | 	}
201 | 	r, w, err := i.r.ReadRune()
202 | 	if err != nil {
203 | 		i.atEOT = true
204 | 		return EndOfText, 0
205 | 	}
206 | 	i.pos += w
207 | 	return r, w
208 | }
209 | 
210 | func (i *InputReader) CanCheckPrefix() bool {
211 | 	return false
212 | }
213 | 
214 | func (i *InputReader) HasPrefix(p Prefixer) bool {
215 | 	return false
216 | }
217 | 
218 | func (i *InputReader) Index(p Prefixer, pos int) int {
219 | 	return -1
220 | }
221 | 
222 | func (i *InputReader) Context(pos int) syntax.EmptyOp {
223 | 	return 0
224 | }
225 | 


--------------------------------------------------------------------------------
/onepass.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2014 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package regexp
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"matloob.io/regexp/syntax"
 10 | 	"sort"
 11 | 	"unicode"
 12 | )
 13 | 
 14 | // "One-pass" regexp execution.
 15 | // Some regexps can be analyzed to determine that they never need
 16 | // backtracking: they are guaranteed to run in one pass over the string
 17 | // without bothering to save all the usual NFA state.
 18 | // Detect those and execute them more quickly.
 19 | 
// A onePassProg is a compiled one-pass regular expression program.
// It is the same as syntax.Prog except for the use of onePassInst.
// onePassCopy builds one from a syntax.Prog, copying Start and NumCap.
type onePassProg struct {
	Inst   []onePassInst
	Start  int // index of start instruction
	NumCap int // number of InstCapture insts in re
}
 27 | 
// A onePassInst is a single instruction in a one-pass regular expression program.
// It is the same as syntax.Inst except for the new 'Next' field.
type onePassInst struct {
	syntax.Inst
	// Next holds the target instruction for each rune range of the
	// instruction; onePassNext indexes it with the result of MatchRunePos.
	Next []uint32
}
 34 | 
 35 | // OnePassPrefix returns a literal string that all matches for the
 36 | // regexp must start with. Complete is true if the prefix
 37 | // is the entire match. Pc is the index of the last rune instruction
 38 | // in the string. The OnePassPrefix skips over the mandatory
 39 | // EmptyBeginText
 40 | func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
 41 | 	i := &p.Inst[p.Start]
 42 | 	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
 43 | 		return "", i.Op == syntax.InstMatch, uint32(p.Start)
 44 | 	}
 45 | 	pc = i.Out
 46 | 	i = &p.Inst[pc]
 47 | 	for i.Op == syntax.InstNop {
 48 | 		pc = i.Out
 49 | 		i = &p.Inst[pc]
 50 | 	}
 51 | 	// Avoid allocation of buffer if prefix is empty.
 52 | 	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
 53 | 		return "", i.Op == syntax.InstMatch, uint32(p.Start)
 54 | 	}
 55 | 
 56 | 	// Have prefix; gather characters.
 57 | 	var buf bytes.Buffer
 58 | 	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
 59 | 		buf.WriteRune(i.Rune[0])
 60 | 		pc, i = i.Out, &p.Inst[i.Out]
 61 | 	}
 62 | 	if i.Op == syntax.InstEmptyWidth &&
 63 | 		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
 64 | 		p.Inst[i.Out].Op == syntax.InstMatch {
 65 | 		complete = true
 66 | 	}
 67 | 	return buf.String(), complete, pc
 68 | }
 69 | 
 70 | // OnePassNext selects the next actionable state of the prog, based on the input character.
 71 | // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
 72 | // One of the alternates may ultimately lead without input to end of line. If the instruction
 73 | // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
 74 | func onePassNext(i *onePassInst, r rune) uint32 {
 75 | 	next := i.MatchRunePos(r)
 76 | 	if next >= 0 {
 77 | 		return i.Next[next]
 78 | 	}
 79 | 	if i.Op == syntax.InstAltMatch {
 80 | 		return i.Out
 81 | 	}
 82 | 	return 0
 83 | }
 84 | 
 85 | func iop(i *syntax.Inst) syntax.InstOp {
 86 | 	op := i.Op
 87 | 	switch op {
 88 | 	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
 89 | 		op = syntax.InstRune
 90 | 	}
 91 | 	return op
 92 | }
 93 | 
 94 | // Sparse Array implementation is used as a queueOnePass.
 95 | type queueOnePass struct {
 96 | 	sparse          []uint32
 97 | 	dense           []uint32
 98 | 	size, nextIndex uint32
 99 | }
100 | 
101 | func (q *queueOnePass) empty() bool {
102 | 	return q.nextIndex >= q.size
103 | }
104 | 
105 | func (q *queueOnePass) next() (n uint32) {
106 | 	n = q.dense[q.nextIndex]
107 | 	q.nextIndex++
108 | 	return
109 | }
110 | 
111 | func (q *queueOnePass) clear() {
112 | 	q.size = 0
113 | 	q.nextIndex = 0
114 | }
115 | 
116 | func (q *queueOnePass) contains(u uint32) bool {
117 | 	if u >= uint32(len(q.sparse)) {
118 | 		return false
119 | 	}
120 | 	return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
121 | }
122 | 
123 | func (q *queueOnePass) insert(u uint32) {
124 | 	if !q.contains(u) {
125 | 		q.insertNew(u)
126 | 	}
127 | }
128 | 
129 | func (q *queueOnePass) insertNew(u uint32) {
130 | 	if u >= uint32(len(q.sparse)) {
131 | 		return
132 | 	}
133 | 	q.sparse[u] = q.size
134 | 	q.dense[q.size] = u
135 | 	q.size++
136 | }
137 | 
138 | func newQueue(size int) (q *queueOnePass) {
139 | 	return &queueOnePass{
140 | 		sparse: make([]uint32, size),
141 | 		dense:  make([]uint32, size),
142 | 	}
143 | }
144 | 
// mergeFailed is the sentinel that mergeRuneSets puts in its NextIp result
// to report failure.
//
// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
// NextIp array with the single element mergeFailed is returned.
// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
const mergeFailed = uint32(0xffffffff)
151 | 
// noRune and noNext are the pair of values mergeRuneSets returns when the
// merge fails. They are shared between calls, so callers must not modify
// them.
var (
	noRune = []rune{}
	noNext = []uint32{mergeFailed}
)
156 | 
157 | func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
158 | 	leftLen := len(*leftRunes)
159 | 	rightLen := len(*rightRunes)
160 | 	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
161 | 		panic("mergeRuneSets odd length []rune")
162 | 	}
163 | 	var (
164 | 		lx, rx int
165 | 	)
166 | 	merged := make([]rune, 0)
167 | 	next := make([]uint32, 0)
168 | 	ok := true
169 | 	defer func() {
170 | 		if !ok {
171 | 			merged = nil
172 | 			next = nil
173 | 		}
174 | 	}()
175 | 
176 | 	ix := -1
177 | 	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
178 | 		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
179 | 			return false
180 | 		}
181 | 		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
182 | 		*newLow += 2
183 | 		ix += 2
184 | 		next = append(next, pc)
185 | 		return true
186 | 	}
187 | 
188 | 	for lx < leftLen || rx < rightLen {
189 | 		switch {
190 | 		case rx >= rightLen:
191 | 			ok = extend(&lx, leftRunes, leftPC)
192 | 		case lx >= leftLen:
193 | 			ok = extend(&rx, rightRunes, rightPC)
194 | 		case (*rightRunes)[rx] < (*leftRunes)[lx]:
195 | 			ok = extend(&rx, rightRunes, rightPC)
196 | 		default:
197 | 			ok = extend(&lx, leftRunes, leftPC)
198 | 		}
199 | 		if !ok {
200 | 			return noRune, noNext
201 | 		}
202 | 	}
203 | 	return merged, next
204 | }
205 | 
206 | // cleanupOnePass drops working memory, and restores certain shortcut instructions.
207 | func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
208 | 	for ix, instOriginal := range original.Inst {
209 | 		switch instOriginal.Op {
210 | 		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
211 | 		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
212 | 			prog.Inst[ix].Next = nil
213 | 		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
214 | 			prog.Inst[ix].Next = nil
215 | 			prog.Inst[ix] = onePassInst{Inst: instOriginal}
216 | 		}
217 | 	}
218 | }
219 | 
// onePassCopy creates a copy of the original Prog, as we'll be modifying it.
// Each instruction is wrapped in a onePassInst, and some pairs of Alt
// instructions are then rewired in the copy (see below). prog itself is not
// modified.
func onePassCopy(prog *syntax.Prog) *onePassProg {
	p := &onePassProg{
		Start:  prog.Start,
		NumCap: prog.NumCap,
	}
	for _, inst := range prog.Inst {
		p.Inst = append(p.Inst, onePassInst{Inst: inst})
	}

	// rewrites one or more common Prog constructs that enable some otherwise
	// non-onepass Progs to be onepass. A:BC (for example) means an InstAlt at
	// ip A, that points to ips B & C.
	// A:BC + B:DA => A:BC + B:DC (likewise B:AD => B:CD)
	// A:BC + B:DC => A:DC + B:DC
	for pc := range p.Inst {
		switch p.Inst[pc].Op {
		default:
			continue
		case syntax.InstAlt, syntax.InstAltMatch:
			// A:Bx + B:Ay
			// A is the instruction at pc. p_A_Alt will point at the leg of A
			// whose target is another Alt (B), p_A_Other at A's remaining leg.
			p_A_Other := &p.Inst[pc].Out
			p_A_Alt := &p.Inst[pc].Arg
			// make sure a target is another Alt
			// (try Arg first, then swap the roles and try Out)
			instAlt := p.Inst[*p_A_Alt]
			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
				instAlt = p.Inst[*p_A_Alt]
				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
					continue
				}
			}
			instOther := p.Inst[*p_A_Other]
			// Analyzing both legs pointing to Alts is for another day
			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
				// too complicated
				continue
			}
			// simple empty transition loop
			// A:BC + B:DA => A:BC + B:DC
			// If one leg of B points back at A, p_B_Alt is made to point at
			// that leg and it is redirected to A's other target.
			p_B_Alt := &p.Inst[*p_A_Alt].Out
			p_B_Other := &p.Inst[*p_A_Alt].Arg
			patch := false
			if instAlt.Out == uint32(pc) {
				patch = true
			} else if instAlt.Arg == uint32(pc) {
				patch = true
				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
			}
			if patch {
				*p_B_Alt = *p_A_Other
			}

			// empty transition to common target
			// A:BC + B:DC => A:DC + B:DC
			// This also holds right after the patch above, so A then skips
			// B and goes straight to B's other target.
			if *p_A_Other == *p_B_Alt {
				*p_A_Alt = *p_B_Other
			}
		}
	}
	return p
}
282 | 
// runeSlice exists to permit sorting the case-folded rune sets: it
// implements sort.Interface for a []rune, ordering runes by value.
type runeSlice []rune

// Len returns the number of runes.
func (rs runeSlice) Len() int { return len(rs) }

// Less reports whether rs[i] is smaller than rs[j].
func (rs runeSlice) Less(i, j int) bool {
	a, b := rs[i], rs[j]
	return a < b
}

// Swap exchanges rs[i] and rs[j].
func (rs runeSlice) Swap(i, j int) {
	rs[j], rs[i] = rs[i], rs[j]
}

// Sort is a convenience method that sorts the runes in increasing order.
func (rs runeSlice) Sort() { sort.Sort(rs) }
294 | 
// anyRuneNotNL and anyRune are rune sets written as [lo, hi] pairs:
// anyRuneNotNL covers every rune from 0 to unicode.MaxRune except '\n', and
// anyRune covers the whole range.
var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
var anyRune = []rune{0, unicode.MaxRune}
297 | 
298 | // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
299 | // the match engine can always tell which branch to take. The routine may modify
300 | // p if it is turned into a onepass Prog. If it isn't possible for this to be a
301 | // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
302 | // to the size of the Prog.
303 | func makeOnePass(p *onePassProg) *onePassProg {
304 | 	// If the machine is very long, it's not worth the time to check if we can use one pass.
305 | 	if len(p.Inst) >= 1000 {
306 | 		return notOnePass
307 | 	}
308 | 
309 | 	var (
310 | 		instQueue    = newQueue(len(p.Inst))
311 | 		visitQueue   = newQueue(len(p.Inst))
312 | 		check        func(uint32, map[uint32]bool) bool
313 | 		onePassRunes = make([][]rune, len(p.Inst))
314 | 	)
315 | 
316 | 	// check that paths from Alt instructions are unambiguous, and rebuild the new
317 | 	// program as a onepass program
318 | 	check = func(pc uint32, m map[uint32]bool) (ok bool) {
319 | 		ok = true
320 | 		inst := &p.Inst[pc]
321 | 		if visitQueue.contains(pc) {
322 | 			return
323 | 		}
324 | 		visitQueue.insert(pc)
325 | 		switch inst.Op {
326 | 		case syntax.InstAlt, syntax.InstAltMatch:
327 | 			ok = check(inst.Out, m) && check(inst.Arg, m)
328 | 			// check no-input paths to InstMatch
329 | 			matchOut := m[inst.Out]
330 | 			matchArg := m[inst.Arg]
331 | 			if matchOut && matchArg {
332 | 				ok = false
333 | 				break
334 | 			}
335 | 			// Match on empty goes in inst.Out
336 | 			if matchArg {
337 | 				inst.Out, inst.Arg = inst.Arg, inst.Out
338 | 				matchOut, matchArg = matchArg, matchOut
339 | 			}
340 | 			if matchOut {
341 | 				m[pc] = true
342 | 				inst.Op = syntax.InstAltMatch
343 | 			}
344 | 
345 | 			// build a dispatch operator from the two legs of the alt.
346 | 			onePassRunes[pc], inst.Next = mergeRuneSets(
347 | 				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
348 | 			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
349 | 				ok = false
350 | 				break
351 | 			}
352 | 		case syntax.InstCapture, syntax.InstNop:
353 | 			ok = check(inst.Out, m)
354 | 			m[pc] = m[inst.Out]
355 | 			// pass matching runes back through these no-ops.
356 | 			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
357 | 			inst.Next = []uint32{}
358 | 			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
359 | 				inst.Next = append(inst.Next, inst.Out)
360 | 			}
361 | 		case syntax.InstEmptyWidth:
362 | 			ok = check(inst.Out, m)
363 | 			m[pc] = m[inst.Out]
364 | 			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
365 | 			inst.Next = []uint32{}
366 | 			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
367 | 				inst.Next = append(inst.Next, inst.Out)
368 | 			}
369 | 		case syntax.InstMatch, syntax.InstFail:
370 | 			m[pc] = inst.Op == syntax.InstMatch
371 | 			break
372 | 		case syntax.InstRune:
373 | 			m[pc] = false
374 | 			if len(inst.Next) > 0 {
375 | 				break
376 | 			}
377 | 			instQueue.insert(inst.Out)
378 | 			if len(inst.Rune) == 0 {
379 | 				onePassRunes[pc] = []rune{}
380 | 				inst.Next = []uint32{inst.Out}
381 | 				break
382 | 			}
383 | 			runes := make([]rune, 0)
384 | 			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
385 | 				r0 := inst.Rune[0]
386 | 				runes = append(runes, r0, r0)
387 | 				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
388 | 					runes = append(runes, r1, r1)
389 | 				}
390 | 				sort.Sort(runeSlice(runes))
391 | 			} else {
392 | 				runes = append(runes, inst.Rune...)
393 | 			}
394 | 			onePassRunes[pc] = runes
395 | 			inst.Next = []uint32{}
396 | 			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
397 | 				inst.Next = append(inst.Next, inst.Out)
398 | 			}
399 | 			inst.Op = syntax.InstRune
400 | 		case syntax.InstRune1:
401 | 			m[pc] = false
402 | 			if len(inst.Next) > 0 {
403 | 				break
404 | 			}
405 | 			instQueue.insert(inst.Out)
406 | 			runes := []rune{}
407 | 			// expand case-folded runes
408 | 			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
409 | 				r0 := inst.Rune[0]
410 | 				runes = append(runes, r0, r0)
411 | 				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
412 | 					runes = append(runes, r1, r1)
413 | 				}
414 | 				sort.Sort(runeSlice(runes))
415 | 			} else {
416 | 				runes = append(runes, inst.Rune[0], inst.Rune[0])
417 | 			}
418 | 			onePassRunes[pc] = runes
419 | 			inst.Next = []uint32{}
420 | 			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
421 | 				inst.Next = append(inst.Next, inst.Out)
422 | 			}
423 | 			inst.Op = syntax.InstRune
424 | 		case syntax.InstRuneAny:
425 | 			m[pc] = false
426 | 			if len(inst.Next) > 0 {
427 | 				break
428 | 			}
429 | 			instQueue.insert(inst.Out)
430 | 			onePassRunes[pc] = append([]rune{}, anyRune...)
431 | 			inst.Next = []uint32{inst.Out}
432 | 		case syntax.InstRuneAnyNotNL:
433 | 			m[pc] = false
434 | 			if len(inst.Next) > 0 {
435 | 				break
436 | 			}
437 | 			instQueue.insert(inst.Out)
438 | 			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
439 | 			inst.Next = []uint32{}
440 | 			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
441 | 				inst.Next = append(inst.Next, inst.Out)
442 | 			}
443 | 		}
444 | 		return
445 | 	}
446 | 
447 | 	instQueue.clear()
448 | 	instQueue.insert(uint32(p.Start))
449 | 	m := make(map[uint32]bool, len(p.Inst))
450 | 	for !instQueue.empty() {
451 | 		visitQueue.clear()
452 | 		pc := instQueue.next()
453 | 		if !check(pc, m) {
454 | 			p = notOnePass
455 | 			break
456 | 		}
457 | 	}
458 | 	if p != notOnePass {
459 | 		for i := range p.Inst {
460 | 			p.Inst[i].Rune = onePassRunes[i]
461 | 		}
462 | 	}
463 | 	return p
464 | }
465 | 
466 | var notOnePass *onePassProg = nil
467 | 
468 | // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
469 | // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
470 | // Prog cannot be converted. For a one pass prog, the fundamental condition that must
471 | // be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
472 | func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
473 | 	if prog.Start == 0 {
474 | 		return notOnePass
475 | 	}
476 | 	// onepass regexp is anchored
477 | 	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
478 | 		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
479 | 		return notOnePass
480 | 	}
481 | 	// every instruction leading to InstMatch must be EmptyEndText
482 | 	for _, inst := range prog.Inst {
483 | 		opOut := prog.Inst[inst.Out].Op
484 | 		switch inst.Op {
485 | 		default:
486 | 			if opOut == syntax.InstMatch {
487 | 				return notOnePass
488 | 			}
489 | 		case syntax.InstAlt, syntax.InstAltMatch:
490 | 			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
491 | 				return notOnePass
492 | 			}
493 | 		case syntax.InstEmptyWidth:
494 | 			if opOut == syntax.InstMatch {
495 | 				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
496 | 					continue
497 | 				}
498 | 				return notOnePass
499 | 			}
500 | 		}
501 | 	}
502 | 	// Creates a slightly optimized copy of the original Prog
503 | 	// that cleans up some Prog idioms that block valid onepass programs
504 | 	p = onePassCopy(prog)
505 | 
506 | 	// checkAmbiguity on InstAlts, build onepass Prog if possible
507 | 	p = makeOnePass(p)
508 | 
509 | 	if p != notOnePass {
510 | 		cleanupOnePass(p, prog)
511 | 	}
512 | 	return p
513 | }
514 | 


--------------------------------------------------------------------------------
/onepass_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2014 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package regexp
  6 | 
  7 | import (
  8 | 	"reflect"
  9 | 	"matloob.io/regexp/syntax"
 10 | 	"testing"
 11 | )
 12 | 
 13 | var runeMergeTests = []struct {
 14 | 	left, right, merged []rune
 15 | 	next                []uint32
 16 | 	leftPC, rightPC     uint32
 17 | }{
 18 | 	{
 19 | 		// empty rhs
 20 | 		[]rune{69, 69},
 21 | 		[]rune{},
 22 | 		[]rune{69, 69},
 23 | 		[]uint32{1},
 24 | 		1, 2,
 25 | 	},
 26 | 	{
 27 | 		// identical runes, identical targets
 28 | 		[]rune{69, 69},
 29 | 		[]rune{69, 69},
 30 | 		[]rune{},
 31 | 		[]uint32{mergeFailed},
 32 | 		1, 1,
 33 | 	},
 34 | 	{
 35 | 		// identical runes, different targets
 36 | 		[]rune{69, 69},
 37 | 		[]rune{69, 69},
 38 | 		[]rune{},
 39 | 		[]uint32{mergeFailed},
 40 | 		1, 2,
 41 | 	},
 42 | 	{
 43 | 		// append right-first
 44 | 		[]rune{69, 69},
 45 | 		[]rune{71, 71},
 46 | 		[]rune{69, 69, 71, 71},
 47 | 		[]uint32{1, 2},
 48 | 		1, 2,
 49 | 	},
 50 | 	{
 51 | 		// append, left-first
 52 | 		[]rune{71, 71},
 53 | 		[]rune{69, 69},
 54 | 		[]rune{69, 69, 71, 71},
 55 | 		[]uint32{2, 1},
 56 | 		1, 2,
 57 | 	},
 58 | 	{
 59 | 		// successful interleave
 60 | 		[]rune{60, 60, 71, 71, 101, 101},
 61 | 		[]rune{69, 69, 88, 88},
 62 | 		[]rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101},
 63 | 		[]uint32{1, 2, 1, 2, 1},
 64 | 		1, 2,
 65 | 	},
 66 | 	{
 67 | 		// left surrounds right
 68 | 		[]rune{69, 74},
 69 | 		[]rune{71, 71},
 70 | 		[]rune{},
 71 | 		[]uint32{mergeFailed},
 72 | 		1, 2,
 73 | 	},
 74 | 	{
 75 | 		// right surrounds left
 76 | 		[]rune{69, 74},
 77 | 		[]rune{68, 75},
 78 | 		[]rune{},
 79 | 		[]uint32{mergeFailed},
 80 | 		1, 2,
 81 | 	},
 82 | 	{
 83 | 		// overlap at interval begin
 84 | 		[]rune{69, 74},
 85 | 		[]rune{74, 75},
 86 | 		[]rune{},
 87 | 		[]uint32{mergeFailed},
 88 | 		1, 2,
 89 | 	},
 90 | 	{
 91 | 		// overlap ar interval end
 92 | 		[]rune{69, 74},
 93 | 		[]rune{65, 69},
 94 | 		[]rune{},
 95 | 		[]uint32{mergeFailed},
 96 | 		1, 2,
 97 | 	},
 98 | 	{
 99 | 		// overlap from above
100 | 		[]rune{69, 74},
101 | 		[]rune{71, 74},
102 | 		[]rune{},
103 | 		[]uint32{mergeFailed},
104 | 		1, 2,
105 | 	},
106 | 	{
107 | 		// overlap from below
108 | 		[]rune{69, 74},
109 | 		[]rune{65, 71},
110 | 		[]rune{},
111 | 		[]uint32{mergeFailed},
112 | 		1, 2,
113 | 	},
114 | 	{
115 | 		// out of order []rune
116 | 		[]rune{69, 74, 60, 65},
117 | 		[]rune{66, 67},
118 | 		[]rune{},
119 | 		[]uint32{mergeFailed},
120 | 		1, 2,
121 | 	},
122 | }
123 | 
124 | func TestMergeRuneSet(t *testing.T) {
125 | 	for ix, test := range runeMergeTests {
126 | 		merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC)
127 | 		if !reflect.DeepEqual(merged, test.merged) {
128 | 			t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged)
129 | 		}
130 | 		if !reflect.DeepEqual(next, test.next) {
131 | 			t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next)
132 | 		}
133 | 	}
134 | }
135 | 
136 | var onePass = &onePassProg{}
137 | 
138 | var onePassTests = []struct {
139 | 	re      string
140 | 	onePass *onePassProg
141 | }{
142 | 	{`^(?:a|(?:a*))$`, notOnePass},
143 | 	{`^(?:(a)|(?:a*))$`, notOnePass},
144 | 	{`^(?:(?:(?:.(?:$))?))$`, onePass},
145 | 	{`^abcd$`, onePass},
146 | 	{`^(?:(?:a{0,})*?)$`, onePass},
147 | 	{`^(?:(?:a+)*)$`, onePass},
148 | 	{`^(?:(?:a|(?:aa)))$`, onePass},
149 | 	{`^(?:[^\s\S])$`, onePass},
150 | 	{`^(?:(?:a{3,4}){0,})$`, notOnePass},
151 | 	{`^(?:(?:(?:a*)+))$`, onePass},
152 | 	{`^[a-c]+$`, onePass},
153 | 	{`^[a-c]*$`, onePass},
154 | 	{`^(?:a*)$`, onePass},
155 | 	{`^(?:(?:aa)|a)$`, onePass},
156 | 	{`^[a-c]*`, notOnePass},
157 | 	{`^...$`, onePass},
158 | 	{`^(?:a|(?:aa))$`, onePass},
159 | 	{`^a((b))c$`, onePass},
160 | 	{`^a.[l-nA-Cg-j]?e$`, onePass},
161 | 	{`^a((b))$`, onePass},
162 | 	{`^a(?:(b)|(c))c$`, onePass},
163 | 	{`^a(?:(b*)|(c))c$`, notOnePass},
164 | 	{`^a(?:b|c)$`, onePass},
165 | 	{`^a(?:b?|c)$`, onePass},
166 | 	{`^a(?:b?|c?)$`, notOnePass},
167 | 	{`^a(?:b?|c+)$`, onePass},
168 | 	{`^a(?:b+|(bc))d$`, notOnePass},
169 | 	{`^a(?:bc)+$`, onePass},
170 | 	{`^a(?:[bcd])+$`, onePass},
171 | 	{`^a((?:[bcd])+)$`, onePass},
172 | 	{`^a(:?b|c)*d$`, onePass},
173 | 	{`^.bc(d|e)*$`, onePass},
174 | 	{`^(?:(?:aa)|.)$`, notOnePass},
175 | 	{`^(?:(?:a{1,2}){1,2})$`, notOnePass},
176 | }
177 | 
178 | func TestCompileOnePass(t *testing.T) {
179 | 	var (
180 | 		p   *syntax.Prog
181 | 		re  *syntax.Regexp
182 | 		err error
183 | 	)
184 | 	for _, test := range onePassTests {
185 | 		if re, err = syntax.Parse(test.re, syntax.Perl); err != nil {
186 | 			t.Errorf("Parse(%q) got err:%s, want success", test.re, err)
187 | 			continue
188 | 		}
189 | 		// needs to be done before compile...
190 | 		re = re.Simplify()
191 | 		if p, err = syntax.Compile(re); err != nil {
192 | 			t.Errorf("Compile(%q) got err:%s, want success", test.re, err)
193 | 			continue
194 | 		}
195 | 		onePass = compileOnePass(p)
196 | 		if (onePass == notOnePass) != (test.onePass == notOnePass) {
197 | 			t.Errorf("CompileOnePass(%q) got %v, expected %v", test.re, onePass, test.onePass)
198 | 		}
199 | 	}
200 | }
201 | 
// onePassTests1 lists regexps that must compile to a onepass program together
// with an input each of them has to match.
//
// TODO(cespare): Unify with onePassTests and rationalize one-pass test cases.
var onePassTests1 = []struct {
	re    string
	match string
}{
	// golang.org/issue/11905
	{re: `^a(/b+(#c+)*)*$`, match: "a/b#c"},
}
209 | 
210 | func TestRunOnePass(t *testing.T) {
211 | 	for _, test := range onePassTests1 {
212 | 		re, err := Compile(test.re)
213 | 		if err != nil {
214 | 			t.Errorf("Compile(%q): got err: %s", test.re, err)
215 | 			continue
216 | 		}
217 | 		if re.onepass == notOnePass {
218 | 			t.Errorf("Compile(%q): got notOnePass, want one-pass", test.re)
219 | 			continue
220 | 		}
221 | 		if !re.MatchString(test.match) {
222 | 			t.Errorf("onepass %q did not match %q", test.re, test.match)
223 | 		}
224 | 	}
225 | }
226 | 


--------------------------------------------------------------------------------
/syntax/compile.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | import "unicode"
  8 | 
  9 | // A patchList is a list of instruction pointers that need to be filled in (patched).
 10 | // Because the pointers haven't been filled in yet, we can reuse their storage
 11 | // to hold the list. It's kind of sleazy, but works well in practice.
 12 | // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
 13 | //
 14 | // These aren't really pointers: they're integers, so we can reinterpret them
 15 | // this way without using package unsafe. A value l denotes
 16 | // p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1).
 17 | // l == 0 denotes the empty list, okay because we start every program
 18 | // with a fail instruction, so we'll never want to point at its output link.
 19 | type patchList uint32
 20 | 
 21 | func (l patchList) next(p *Prog) patchList {
 22 | 	i := &p.Inst[l>>1]
 23 | 	if l&1 == 0 {
 24 | 		return patchList(i.Out)
 25 | 	}
 26 | 	return patchList(i.Arg)
 27 | }
 28 | 
 29 | func (l patchList) patch(p *Prog, val uint32) {
 30 | 	for l != 0 {
 31 | 		i := &p.Inst[l>>1]
 32 | 		if l&1 == 0 {
 33 | 			l = patchList(i.Out)
 34 | 			i.Out = val
 35 | 		} else {
 36 | 			l = patchList(i.Arg)
 37 | 			i.Arg = val
 38 | 		}
 39 | 	}
 40 | }
 41 | 
 42 | func (l1 patchList) append(p *Prog, l2 patchList) patchList {
 43 | 	if l1 == 0 {
 44 | 		return l2
 45 | 	}
 46 | 	if l2 == 0 {
 47 | 		return l1
 48 | 	}
 49 | 
 50 | 	last := l1
 51 | 	for {
 52 | 		next := last.next(p)
 53 | 		if next == 0 {
 54 | 			break
 55 | 		}
 56 | 		last = next
 57 | 	}
 58 | 
 59 | 	i := &p.Inst[last>>1]
 60 | 	if last&1 == 0 {
 61 | 		i.Out = uint32(l2)
 62 | 	} else {
 63 | 		i.Arg = uint32(l2)
 64 | 	}
 65 | 	return l1
 66 | }
 67 | 
 68 | // A frag represents a compiled program fragment.
 69 | type frag struct {
 70 | 	i   uint32    // index of first instruction
 71 | 	out patchList // where to record end instruction
 72 | }
 73 | 
 74 | type compiler struct {
 75 | 	p *Prog
 76 | 	reversed bool
 77 | }
 78 | 
 79 | // Compile compiles the regexp into a program to be executed.
 80 | // The regexp should have been simplified already (returned from re.Simplify).
 81 | func Compile(re *Regexp) (*Prog, error) {
 82 | 	var c compiler
 83 | 	c.init()
 84 | 	f := c.compile(re)
 85 | 	f.out.patch(c.p, c.inst(InstMatch).i)
 86 | 
 87 | 	c.p.StartUnanchored = int(c.cat(c.star(c.rune(anyRune, 0), true), f).i)
 88 | 	// TODO(matloob): end of area that needs to be cleaned up
 89 | 
 90 | 	c.p.Start = int(f.i)
 91 | 	return c.p, nil
 92 | }
 93 | 
 94 | // CompileReversed compiles the regexp into a reverse program.
 95 | func CompileReversed(re *Regexp) (*Prog, error) {
 96 | 	var c compiler
 97 | 	c.init()
 98 | 	c.reversed = true
 99 | 	re = re.Simplify()
100 | 	f := c.compile(re)
101 | 	f.out.patch(c.p, c.inst(InstMatch).i)
102 | 	c.p.Start = int(f.i)
103 | 	return c.p, nil
104 | }
105 | 
106 | func (c *compiler) init() {
107 | 	c.p = new(Prog)
108 | 	c.p.NumCap = 2 // implicit ( and ) for whole match $0
109 | 	c.inst(InstFail)
110 | }
111 | 
// Rune sets expressed as consecutive (lo, hi) pairs.
var (
	// anyRuneNotNL spans every rune on either side of '\n'.
	anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
	// anyRune spans the whole range from 0 to unicode.MaxRune.
	anyRune = []rune{0, unicode.MaxRune}
)
114 | 
115 | func (c *compiler) compile(re *Regexp) frag {
116 | 	switch re.Op {
117 | 	case OpNoMatch:
118 | 		return c.fail()
119 | 	case OpEmptyMatch:
120 | 		return c.nop()
121 | 	case OpLiteral:
122 | 		if len(re.Rune) == 0 {
123 | 			return c.nop()
124 | 		}
125 | 		var f frag
126 | 		for j := range re.Rune {
127 | 			f1 := c.rune(re.Rune[j:j+1], re.Flags)
128 | 			if j == 0 {
129 | 				f = f1
130 | 			} else {
131 | 				f = c.cat(f, f1)
132 | 			}
133 | 		}
134 | 		return f
135 | 	case OpCharClass:
136 | 		return c.rune(re.Rune, re.Flags)
137 | 	case OpAnyCharNotNL:
138 | 		return c.rune(anyRuneNotNL, 0)
139 | 	case OpAnyChar:
140 | 		return c.rune(anyRune, 0)
141 | 	case OpBeginLine:
142 | 		if c.reversed {
143 | 			return c.empty(EmptyEndLine)
144 | 		}
145 | 		return c.empty(EmptyBeginLine)
146 | 	case OpEndLine:
147 | 		if c.reversed {
148 | 			return c.empty(EmptyBeginLine)
149 | 		}
150 | 		return c.empty(EmptyEndLine)
151 | 	case OpBeginText:
152 | 		if c.reversed {
153 | 			return c.empty(EmptyEndText)
154 | 		}
155 | 		return c.empty(EmptyBeginText)
156 | 	case OpEndText:
157 | 		if c.reversed {
158 | 			return c.empty(EmptyBeginText)
159 | 		}
160 | 		return c.empty(EmptyEndText)
161 | 	case OpWordBoundary:
162 | 		return c.empty(EmptyWordBoundary)
163 | 	case OpNoWordBoundary:
164 | 		return c.empty(EmptyNoWordBoundary)
165 | 	case OpCapture:
166 | 		bra := c.cap(uint32(re.Cap << 1))
167 | 		sub := c.compile(re.Sub[0])
168 | 		ket := c.cap(uint32(re.Cap<<1 | 1))
169 | 		return c.cat(c.cat(bra, sub), ket)
170 | 	case OpStar:
171 | 		return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
172 | 	case OpPlus:
173 | 		return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
174 | 	case OpQuest:
175 | 		return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
176 | 	case OpConcat:
177 | 		if len(re.Sub) == 0 {
178 | 			return c.nop()
179 | 		}
180 | 		var f frag
181 | 		for i, sub := range re.Sub {
182 | 			if i == 0 {
183 | 				f = c.compile(sub)
184 | 			} else {
185 | 				f = c.cat(f, c.compile(sub))
186 | 			}
187 | 		}
188 | 		return f
189 | 	case OpAlternate:
190 | 		var f frag
191 | 		for _, sub := range re.Sub {
192 | 			f = c.alt(f, c.compile(sub))
193 | 		}
194 | 		return f
195 | 	}
196 | 	panic("regexp: unhandled case in compile")
197 | }
198 | 
199 | func (c *compiler) inst(op InstOp) frag {
200 | 	// TODO: impose length limit
201 | 	f := frag{i: uint32(len(c.p.Inst))}
202 | 	c.p.Inst = append(c.p.Inst, Inst{Op: op})
203 | 	return f
204 | }
205 | 
206 | func (c *compiler) nop() frag {
207 | 	f := c.inst(InstNop)
208 | 	f.out = patchList(f.i << 1)
209 | 	return f
210 | }
211 | 
212 | func (c *compiler) fail() frag {
213 | 	return frag{}
214 | }
215 | 
216 | func (c *compiler) cap(arg uint32) frag {
217 | 	f := c.inst(InstCapture)
218 | 	f.out = patchList(f.i << 1)
219 | 	c.p.Inst[f.i].Arg = arg
220 | 
221 | 	if c.p.NumCap < int(arg)+1 {
222 | 		c.p.NumCap = int(arg) + 1
223 | 	}
224 | 	return f
225 | }
226 | 
227 | func (c *compiler) cat(f1, f2 frag) frag {
228 | 	// concat of failure is failure
229 | 	if f1.i == 0 || f2.i == 0 {
230 | 		return frag{}
231 | 	}
232 | 
233 | 	// TODO: elide nop
234 | 
235 | 	if c.reversed {
236 | 		f2.out.patch(c.p, f1.i)
237 | 		return frag{f2.i, f1.out}
238 | 	}
239 | 	f1.out.patch(c.p, f2.i)
240 | 	return frag{f1.i, f2.out}
241 | }
242 | 
243 | func (c *compiler) alt(f1, f2 frag) frag {
244 | 	// alt of failure is other
245 | 	if f1.i == 0 {
246 | 		return f2
247 | 	}
248 | 	if f2.i == 0 {
249 | 		return f1
250 | 	}
251 | 
252 | 	f := c.inst(InstAlt)
253 | 	i := &c.p.Inst[f.i]
254 | 	i.Out = f1.i
255 | 	i.Arg = f2.i
256 | 	f.out = f1.out.append(c.p, f2.out)
257 | 	return f
258 | }
259 | 
260 | func (c *compiler) quest(f1 frag, nongreedy bool) frag {
261 | 	f := c.inst(InstAlt)
262 | 	i := &c.p.Inst[f.i]
263 | 	if nongreedy {
264 | 		i.Arg = f1.i
265 | 		f.out = patchList(f.i << 1)
266 | 	} else {
267 | 		i.Out = f1.i
268 | 		f.out = patchList(f.i<<1 | 1)
269 | 	}
270 | 	f.out = f.out.append(c.p, f1.out)
271 | 	return f
272 | }
273 | 
274 | func (c *compiler) star(f1 frag, nongreedy bool) frag {
275 | 	f := c.inst(InstAlt)
276 | 	i := &c.p.Inst[f.i]
277 | 	if nongreedy {
278 | 		i.Arg = f1.i
279 | 		f.out = patchList(f.i << 1)
280 | 	} else {
281 | 		i.Out = f1.i
282 | 		f.out = patchList(f.i<<1 | 1)
283 | 	}
284 | 	f1.out.patch(c.p, f.i)
285 | 	return f
286 | }
287 | 
288 | func (c *compiler) plus(f1 frag, nongreedy bool) frag {
289 | 	return frag{f1.i, c.star(f1, nongreedy).out}
290 | }
291 | 
292 | func (c *compiler) empty(op EmptyOp) frag {
293 | 	f := c.inst(InstEmptyWidth)
294 | 	c.p.Inst[f.i].Arg = uint32(op)
295 | 	f.out = patchList(f.i << 1)
296 | 	return f
297 | }
298 | 
299 | func (c *compiler) rune(r []rune, flags Flags) frag {
300 | 	f := c.inst(InstRune)
301 | 	i := &c.p.Inst[f.i]
302 | 	i.Rune = r
303 | 	flags &= FoldCase // only relevant flag is FoldCase
304 | 	if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
305 | 		// and sometimes not even that
306 | 		flags &^= FoldCase
307 | 	}
308 | 	i.Arg = uint32(flags)
309 | 	f.out = patchList(f.i << 1)
310 | 
311 | 	// Special cases for exec machine.
312 | 	switch {
313 | 	case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
314 | 		i.Op = InstRune1
315 | 	case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
316 | 		i.Op = InstRuneAny
317 | 	case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
318 | 		i.Op = InstRuneAnyNotNL
319 | 	}
320 | 
321 | 	return f
322 | }
323 | 


--------------------------------------------------------------------------------
/syntax/doc.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2012 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.
  6 | 
  7 | /*
  8 | Package syntax parses regular expressions into parse trees and compiles
  9 | parse trees into programs. Most clients of regular expressions will use the
 10 | facilities of package regexp (such as Compile and Match) instead of this package.
 11 | 
 12 | Syntax
 13 | 
 14 | The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
 15 | Parts of the syntax can be disabled by passing alternate flags to Parse.
 16 | 
 17 | 
 18 | Single characters:
 19 |   .              any character, possibly including newline (flag s=true)
 20 |   [xyz]          character class
 21 |   [^xyz]         negated character class
 22 |   \d             Perl character class
 23 |   \D             negated Perl character class
 24 |   [[:alpha:]]    ASCII character class
 25 |   [[:^alpha:]]   negated ASCII character class
 26 |   \pN            Unicode character class (one-letter name)
 27 |   \p{Greek}      Unicode character class
 28 |   \PN            negated Unicode character class (one-letter name)
 29 |   \P{Greek}      negated Unicode character class
 30 | 
 31 | Composites:
 32 |   xy             x followed by y
 33 |   x|y            x or y (prefer x)
 34 | 
 35 | Repetitions:
 36 |   x*             zero or more x, prefer more
 37 |   x+             one or more x, prefer more
 38 |   x?             zero or one x, prefer one
 39 |   x{n,m}         n or n+1 or ... or m x, prefer more
 40 |   x{n,}          n or more x, prefer more
 41 |   x{n}           exactly n x
 42 |   x*?            zero or more x, prefer fewer
 43 |   x+?            one or more x, prefer fewer
 44 |   x??            zero or one x, prefer zero
 45 |   x{n,m}?        n or n+1 or ... or m x, prefer fewer
 46 |   x{n,}?         n or more x, prefer fewer
 47 |   x{n}?          exactly n x
 48 | 
 49 | Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n}
 50 | reject forms that create a minimum or maximum repetition count above 1000.
 51 | Unlimited repetitions are not subject to this restriction.
 52 | 
 53 | Grouping:
 54 |   (re)           numbered capturing group (submatch)
 55 |   (?P<name>re)   named & numbered capturing group (submatch)
 56 |   (?:re)         non-capturing group
 57 |   (?flags)       set flags within current group; non-capturing
 58 |   (?flags:re)    set flags during re; non-capturing
 59 | 
 60 |   Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:
 61 | 
 62 |   i              case-insensitive (default false)
 63 |   m              multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false)
 64 |   s              let . match \n (default false)
 65 |   U              ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false)
 66 | 
 67 | Empty strings:
 68 |   ^              at beginning of text or line (flag m=true)
 69 |   $              at end of text (like \z not \Z) or line (flag m=true)
 70 |   \A             at beginning of text
 71 |   \b             at ASCII word boundary (\w on one side and \W, \A, or \z on the other)
 72 |   \B             not at ASCII word boundary
 73 |   \z             at end of text
 74 | 
 75 | Escape sequences:
 76 |   \a             bell (== \007)
 77 |   \f             form feed (== \014)
 78 |   \t             horizontal tab (== \011)
 79 |   \n             newline (== \012)
 80 |   \r             carriage return (== \015)
 81 |   \v             vertical tab character (== \013)
 82 |   \*             literal *, for any punctuation character *
 83 |   \123           octal character code (up to three digits)
 84 |   \x7F           hex character code (exactly two digits)
 85 |   \x{10FFFF}     hex character code
 86 |   \Q...\E        literal text ... even if ... has punctuation
 87 | 
 88 | Character class elements:
 89 |   x              single character
 90 |   A-Z            character range (inclusive)
 91 |   \d             Perl character class
 92 |   [:foo:]        ASCII character class foo
 93 |   \p{Foo}        Unicode character class Foo
 94 |   \pF            Unicode character class F (one-letter name)
 95 | 
 96 | Named character classes as character class elements:
 97 |   [\d]           digits (== \d)
 98 |   [^\d]          not digits (== \D)
 99 |   [\D]           not digits (== \D)
100 |   [^\D]          not not digits (== \d)
101 |   [[:name:]]     named ASCII class inside character class (== [:name:])
102 |   [^[:name:]]    named ASCII class inside negated character class (== [:^name:])
103 |   [\p{Name}]     named Unicode property inside character class (== \p{Name})
104 |   [^\p{Name}]    named Unicode property inside negated character class (== \P{Name})
105 | 
106 | Perl character classes (all ASCII-only):
107 |   \d             digits (== [0-9])
108 |   \D             not digits (== [^0-9])
109 |   \s             whitespace (== [\t\n\f\r ])
110 |   \S             not whitespace (== [^\t\n\f\r ])
111 |   \w             word characters (== [0-9A-Za-z_])
112 |   \W             not word characters (== [^0-9A-Za-z_])
113 | 
114 | ASCII character classes:
115 |   [[:alnum:]]    alphanumeric (== [0-9A-Za-z])
116 |   [[:alpha:]]    alphabetic (== [A-Za-z])
117 |   [[:ascii:]]    ASCII (== [\x00-\x7F])
118 |   [[:blank:]]    blank (== [\t ])
119 |   [[:cntrl:]]    control (== [\x00-\x1F\x7F])
120 |   [[:digit:]]    digits (== [0-9])
121 |   [[:graph:]]    graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
122 |   [[:lower:]]    lower case (== [a-z])
123 |   [[:print:]]    printable (== [ -~] == [ [:graph:]])
124 |   [[:punct:]]    punctuation (== [!-/:-@[-`{-~])
125 |   [[:space:]]    whitespace (== [\t\n\v\f\r ])
126 |   [[:upper:]]    upper case (== [A-Z])
127 |   [[:word:]]     word characters (== [0-9A-Za-z_])
128 |   [[:xdigit:]]   hex digit (== [0-9A-Fa-f])
129 | 
130 | */
131 | package syntax
132 | 


--------------------------------------------------------------------------------
/syntax/make_perl_groups.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | # Copyright 2008 The Go Authors.  All rights reserved.
  3 | # Use of this source code is governed by a BSD-style
  4 | # license that can be found in the LICENSE file.
  5 | 
  6 | # Modified version of RE2's make_perl_groups.pl.
  7 | 
  8 | # Generate table entries giving character ranges
  9 | # for POSIX/Perl character classes.  Rather than
 10 | # figure out what the definition is, it is easier to ask
 11 | # Perl about each letter from 0-128 and write down
 12 | # its answer.
 13 | 
 14 | @posixclasses = (
 15 | 	"[:alnum:]",
 16 | 	"[:alpha:]",
 17 | 	"[:ascii:]",
 18 | 	"[:blank:]",
 19 | 	"[:cntrl:]",
 20 | 	"[:digit:]",
 21 | 	"[:graph:]",
 22 | 	"[:lower:]",
 23 | 	"[:print:]",
 24 | 	"[:punct:]",
 25 | 	"[:space:]",
 26 | 	"[:upper:]",
 27 | 	"[:word:]",
 28 | 	"[:xdigit:]",
 29 | );
 30 | 
 31 | @perlclasses = (
 32 | 	"\\d",
 33 | 	"\\s",
 34 | 	"\\w",
 35 | );
 36 | 
 37 | sub ComputeClass($) {
 38 |   my @ranges;
 39 |   my ($class) = @_;
 40 |   my $regexp = "[$class]";
 41 |   my $start = -1;
 42 |   for (my $i=0; $i<=129; $i++) {
 43 |     if ($i == 129) { $i = 256; }
 44 |     if ($i <= 128 && chr($i) =~ $regexp) {
 45 |       if ($start < 0) {
 46 |         $start = $i;
 47 |       }
 48 |     } else {
 49 |       if ($start >= 0) {
 50 |         push @ranges, [$start, $i-1];
 51 |       }
 52 |       $start = -1;
 53 |     }
 54 |   }
 55 |   return @ranges;
 56 | }
 57 | 
 58 | sub PrintClass($$@) {
 59 |   my ($cname, $name, @ranges) = @_;
 60 |   print "var code$cname = []rune{  /* $name */\n";
 61 |   for (my $i=0; $i<@ranges; $i++) {
 62 |     my @a = @{$ranges[$i]};
 63 |     printf "\t0x%x, 0x%x,\n", $a[0], $a[1];
 64 |   }
 65 |   print "}\n\n";
 66 |   my $n = @ranges;
 67 |   $negname = $name;
 68 |   if ($negname =~ /:/) {
 69 |     $negname =~ s/:/:^/;
 70 |   } else {
 71 |     $negname =~ y/a-z/A-Z/;
 72 |   }
 73 |   return "\t`$name`: {+1, code$cname},\n" .
 74 |   	"\t`$negname`: {-1, code$cname},\n";
 75 | }
 76 | 
 77 | my $gen = 0;
 78 | 
 79 | sub PrintClasses($@) {
 80 |   my ($cname, @classes) = @_;
 81 |   my @entries;
 82 |   foreach my $cl (@classes) {
 83 |     my @ranges = ComputeClass($cl);
 84 |     push @entries, PrintClass(++$gen, $cl, @ranges);
 85 |   }
 86 |   print "var ${cname}Group = map[string]charGroup{\n";
 87 |   foreach my $e (@entries) {
 88 |     print $e;
 89 |   }
 90 |   print "}\n";
 91 |   my $count = @entries;
 92 | }
 93 | 
# Emit the fixed header of the generated Go file: license, the
# generated-file notice, and the package clause.
print <<EOF;
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.go

package syntax

EOF

# Emit the Perl-class tables and map first, then the POSIX ones. The order
# matters because the codeN table numbers are assigned sequentially.
PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);
108 | 


--------------------------------------------------------------------------------
/syntax/parse_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"fmt"
 10 | 	"testing"
 11 | 	"unicode"
 12 | )
 13 | 
// parseTest pairs a regular expression with the expected dump of its
// parse tree.
type parseTest struct {
	// Regexp is the source text handed to Parse.
	Regexp string
	// Dump is the expected output of dump for the parsed tree. An empty
	// Dump means the entry only has to parse without error.
	Dump   string
}
 18 | 
// parseTests lists regular expressions together with the expected dump of
// their parse trees. The entries are parsed with testFlags. An entry whose
// Dump is empty only has to parse successfully.
var parseTests = []parseTest{
	// Base cases
	{`a`, `lit{a}`},
	{`a.`, `cat{lit{a}dot{}}`},
	{`a.b`, `cat{lit{a}dot{}lit{b}}`},
	{`ab`, `str{ab}`},
	{`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
	{`abc`, `str{abc}`},
	{`a|^`, `alt{lit{a}bol{}}`},
	{`a|b`, `cc{0x61-0x62}`},
	{`(a)`, `cap{lit{a}}`},
	{`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
	{`a*`, `star{lit{a}}`},
	{`a+`, `plus{lit{a}}`},
	{`a?`, `que{lit{a}}`},
	{`a{2}`, `rep{2,2 lit{a}}`},
	{`a{2,3}`, `rep{2,3 lit{a}}`},
	{`a{2,}`, `rep{2,-1 lit{a}}`},
	{`a*?`, `nstar{lit{a}}`},
	{`a+?`, `nplus{lit{a}}`},
	{`a??`, `nque{lit{a}}`},
	{`a{2}?`, `nrep{2,2 lit{a}}`},
	{`a{2,3}?`, `nrep{2,3 lit{a}}`},
	{`a{2,}?`, `nrep{2,-1 lit{a}}`},
	// Malformed { } are treated as literals.
	{`x{1001`, `str{x{1001}`},
	{`x{9876543210`, `str{x{9876543210}`},
	{`x{9876543210,`, `str{x{9876543210,}`},
	{`x{2,1`, `str{x{2,1}`},
	{`x{1,9876543210`, `str{x{1,9876543210}`},
	{``, `emp{}`},
	{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
	{`|x|`, `alt{emp{}lit{x}emp{}}`},
	{`.`, `dot{}`},
	{`^`, `bol{}`},
	{`$`, `eol{}`},
	{`\|`, `lit{|}`},
	{`\(`, `lit{(}`},
	{`\)`, `lit{)}`},
	{`\*`, `lit{*}`},
	{`\+`, `lit{+}`},
	{`\?`, `lit{?}`},
	{`{`, `lit{{}`},
	{`}`, `lit{}}`},
	{`\.`, `lit{.}`},
	{`\^`, `lit{^}`},
	{`\$`, `lit{$}`},
	{`\\`, `lit{\}`},
	{`[ace]`, `cc{0x61 0x63 0x65}`},
	{`[abc]`, `cc{0x61-0x63}`},
	{`[a-z]`, `cc{0x61-0x7a}`},
	{`[a]`, `lit{a}`},
	{`\-`, `lit{-}`},
	{`-`, `lit{-}`},
	{`\_`, `lit{_}`},
	{`abc`, `str{abc}`},
	{`abc|def`, `alt{str{abc}str{def}}`},
	{`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},

	// Posix and Perl extensions
	{`[[:lower:]]`, `cc{0x61-0x7a}`},
	{`[a-z]`, `cc{0x61-0x7a}`},
	{`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
	{`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
	{`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`\d`, `cc{0x30-0x39}`},
	{`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
	{`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
	{`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
	{`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
	{`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
	{`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
	//	{ `\C`, `byte{}` },  // probably never

	// Unicode, negatives, and a double negative.
	{`\p{Braille}`, `cc{0x2800-0x28ff}`},
	{`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`\P{^Braille}`, `cc{0x2800-0x28ff}`},
	{`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
	{`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
	{`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
	{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
	{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
	{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
	{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
	{`\p{Any}`, `dot{}`},
	{`\p{^Any}`, `cc{}`},

	// Hex, octal.
	{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
	{`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},

	// More interesting regular expressions.
	{`a{,2}`, `str{a{,2}}`},
	{`\.\^\$\\`, `str{.^$\}`},
	{`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
	{`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
	{`a*{`, `cat{star{lit{a}}lit{{}}`},

	// Test precedences
	{`(?:ab)*`, `star{str{ab}}`},
	{`(ab)*`, `star{cap{str{ab}}}`},
	{`ab|cd`, `alt{str{ab}str{cd}}`},
	{`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},

	// Test flattening.
	{`(?:a)`, `lit{a}`},
	{`(?:ab)(?:cd)`, `str{abcd}`},
	{`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
	{`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
	{`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
	{`a|.`, `dot{}`},
	{`.|a`, `dot{}`},
	{`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
	{`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},

	// Test Perl quoted literals
	{`\Q+|*?{[\E`, `str{+|*?{[}`},
	{`\Q+\E+`, `plus{lit{+}}`},
	{`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`},
	{`\Q\\E`, `lit{\}`},
	{`\Q\\\E`, `str{\\}`},

	// Test Perl \A and \z
	{`(?m)^`, `bol{}`},
	{`(?m)$`, `eol{}`},
	{`(?-m)^`, `bot{}`},
	{`(?-m)$`, `eot{}`},
	{`(?m)\A`, `bot{}`},
	{`(?m)\z`, `eot{\z}`},
	{`(?-m)\A`, `bot{}`},
	{`(?-m)\z`, `eot{\z}`},

	// Test named captures
	{`(?P<name>a)`, `cap{name:lit{a}}`},

	// Case-folded literals
	{`[Aa]`, `litfold{A}`},
	{`[\x{100}\x{101}]`, `litfold{Ā}`},
	{`[Δδ]`, `litfold{Δ}`},

	// Strings
	{`abcde`, `str{abcde}`},
	{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},

	// Factoring.
	{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
	{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`},

	// Bug fixes.
	{`(?:.)`, `dot{}`},
	{`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
	{`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
	{`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
	{`(?:A|a)`, `litfold{A}`},
	{`A|(?:A|a)`, `litfold{A}`},
	{`(?s).`, `dot{}`},
	{`(?-s).`, `dnl{}`},
	{`(?:(?:^).)`, `cat{bol{}dot{}}`},
	{`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},

	// RE2 prefix_tests
	{`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
	{`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
	{`abc|abd|aef|bcx|bcy`,
		`alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
			`cat{str{bc}cc{0x78-0x79}}}`},
	{`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
	{`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
	{`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
	{`.c|.d`, `cat{dot{}cc{0x63-0x64}}`},
	{`x{2}|x{2}[0-9]`,
		`cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
	{`x{2}y|x{2}[0-9]y`,
		`cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
	{`a.*?c|a.*?b`,
		`cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`},

	// Valid repetitions.
	{`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``},
	{`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``},
}
210 | 
// testFlags is the flag set used to parse the entries of parseTests.
const testFlags = MatchNL | PerlX | UnicodeGroups

// TestParseSimple checks every parseTests entry against its expected dump.
func TestParseSimple(t *testing.T) {
	testParseDump(t, parseTests, testFlags)
}
216 | 
// foldcaseTests lists expressions and their expected dumps when parsed
// with the FoldCase flag.
var foldcaseTests = []parseTest{
	{`AbCdE`, `strfold{ABCDE}`},
	{`[Aa]`, `litfold{A}`},
	{`a`, `litfold{A}`},

	// 0x17F is an old English long s (looks like an f) and folds to s.
	// 0x212A is the Kelvin symbol and folds to k.
	{`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
	{`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
}

// TestParseFoldCase runs foldcaseTests with the FoldCase flag.
func TestParseFoldCase(t *testing.T) {
	testParseDump(t, foldcaseTests, FoldCase)
}
232 | 
// literalTests lists expressions and their expected dumps when parsed
// with the Literal flag: the metacharacters come back as one literal string.
var literalTests = []parseTest{
	{"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
}

// TestParseLiteral runs literalTests with the Literal flag.
func TestParseLiteral(t *testing.T) {
	testParseDump(t, literalTests, Literal)
}
240 | 
// matchnlTests lists expressions and their expected dumps when parsed
// with the MatchNL flag; `.` and `[^a]` are expected to include newline.
var matchnlTests = []parseTest{
	{`.`, `dot{}`},
	{"\n", "lit{\n}"},
	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
	{`[a\n]`, `cc{0xa 0x61}`},
}

// TestParseMatchNL runs matchnlTests with the MatchNL flag.
func TestParseMatchNL(t *testing.T) {
	testParseDump(t, matchnlTests, MatchNL)
}
251 | 
// nomatchnlTests lists expressions and their expected dumps when parsed
// with no flags; `.` and `[^a]` are expected to exclude newline (0xa).
var nomatchnlTests = []parseTest{
	{`.`, `dnl{}`},
	{"\n", "lit{\n}"},
	{`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
	{`[a\n]`, `cc{0xa 0x61}`},
}

// TestParseNoMatchNL runs nomatchnlTests with no flags set.
func TestParseNoMatchNL(t *testing.T) {
	testParseDump(t, nomatchnlTests, 0)
}
262 | 
263 | // Test Parse -> Dump.
264 | func testParseDump(t *testing.T, tests []parseTest, flags Flags) {
265 | 	for _, tt := range tests {
266 | 		re, err := Parse(tt.Regexp, flags)
267 | 		if err != nil {
268 | 			t.Errorf("Parse(%#q): %v", tt.Regexp, err)
269 | 			continue
270 | 		}
271 | 		if tt.Dump == "" {
272 | 			// It parsed. That's all we care about.
273 | 			continue
274 | 		}
275 | 		d := dump(re)
276 | 		if d != tt.Dump {
277 | 			t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
278 | 		}
279 | 	}
280 | }
281 | 
// dump returns a string representation of the regexp showing
// the structure explicitly. The encoding is the one written by dumpRegexp.
func dump(re *Regexp) string {
	var b bytes.Buffer
	dumpRegexp(&b, re)
	return b.String()
}
289 | 
// opNames maps each Op to the short name dumpRegexp writes for it.
// An Op that is out of range or maps to "" is dumped as "op" followed by
// its number instead.
var opNames = []string{
	OpNoMatch:        "no",
	OpEmptyMatch:     "emp",
	OpLiteral:        "lit",
	OpCharClass:      "cc",
	OpAnyCharNotNL:   "dnl",
	OpAnyChar:        "dot",
	OpBeginLine:      "bol",
	OpEndLine:        "eol",
	OpBeginText:      "bot",
	OpEndText:        "eot",
	OpWordBoundary:   "wb",
	OpNoWordBoundary: "nwb",
	OpCapture:        "cap",
	OpStar:           "star",
	OpPlus:           "plus",
	OpQuest:          "que",
	OpRepeat:         "rep",
	OpConcat:         "cat",
	OpAlternate:      "alt",
}
311 | 
312 | // dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
313 | // It is used during testing to distinguish between parses that might print
314 | // the same using re's String method.
315 | func dumpRegexp(b *bytes.Buffer, re *Regexp) {
316 | 	if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
317 | 		fmt.Fprintf(b, "op%d", re.Op)
318 | 	} else {
319 | 		switch re.Op {
320 | 		default:
321 | 			b.WriteString(opNames[re.Op])
322 | 		case OpStar, OpPlus, OpQuest, OpRepeat:
323 | 			if re.Flags&NonGreedy != 0 {
324 | 				b.WriteByte('n')
325 | 			}
326 | 			b.WriteString(opNames[re.Op])
327 | 		case OpLiteral:
328 | 			if len(re.Rune) > 1 {
329 | 				b.WriteString("str")
330 | 			} else {
331 | 				b.WriteString("lit")
332 | 			}
333 | 			if re.Flags&FoldCase != 0 {
334 | 				for _, r := range re.Rune {
335 | 					if unicode.SimpleFold(r) != r {
336 | 						b.WriteString("fold")
337 | 						break
338 | 					}
339 | 				}
340 | 			}
341 | 		}
342 | 	}
343 | 	b.WriteByte('{')
344 | 	switch re.Op {
345 | 	case OpEndText:
346 | 		if re.Flags&WasDollar == 0 {
347 | 			b.WriteString(`\z`)
348 | 		}
349 | 	case OpLiteral:
350 | 		for _, r := range re.Rune {
351 | 			b.WriteRune(r)
352 | 		}
353 | 	case OpConcat, OpAlternate:
354 | 		for _, sub := range re.Sub {
355 | 			dumpRegexp(b, sub)
356 | 		}
357 | 	case OpStar, OpPlus, OpQuest:
358 | 		dumpRegexp(b, re.Sub[0])
359 | 	case OpRepeat:
360 | 		fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
361 | 		dumpRegexp(b, re.Sub[0])
362 | 	case OpCapture:
363 | 		if re.Name != "" {
364 | 			b.WriteString(re.Name)
365 | 			b.WriteByte(':')
366 | 		}
367 | 		dumpRegexp(b, re.Sub[0])
368 | 	case OpCharClass:
369 | 		sep := ""
370 | 		for i := 0; i < len(re.Rune); i += 2 {
371 | 			b.WriteString(sep)
372 | 			sep = " "
373 | 			lo, hi := re.Rune[i], re.Rune[i+1]
374 | 			if lo == hi {
375 | 				fmt.Fprintf(b, "%#x", lo)
376 | 			} else {
377 | 				fmt.Fprintf(b, "%#x-%#x", lo, hi)
378 | 			}
379 | 		}
380 | 	}
381 | 	b.WriteByte('}')
382 | }
383 | 
384 | func mkCharClass(f func(rune) bool) string {
385 | 	re := &Regexp{Op: OpCharClass}
386 | 	lo := rune(-1)
387 | 	for i := rune(0); i <= unicode.MaxRune; i++ {
388 | 		if f(i) {
389 | 			if lo < 0 {
390 | 				lo = i
391 | 			}
392 | 		} else {
393 | 			if lo >= 0 {
394 | 				re.Rune = append(re.Rune, lo, i-1)
395 | 				lo = -1
396 | 			}
397 | 		}
398 | 	}
399 | 	if lo >= 0 {
400 | 		re.Rune = append(re.Rune, lo, unicode.MaxRune)
401 | 	}
402 | 	return dump(re)
403 | }
404 | 
// isUpperFold reports whether r, or any rune reachable from r by repeated
// unicode.SimpleFold, is an upper case letter.
func isUpperFold(r rune) bool {
	// Walk the SimpleFold orbit of r, starting with r itself, until it
	// cycles back to r.
	for c := r; ; {
		if unicode.IsUpper(c) {
			return true
		}
		c = unicode.SimpleFold(c)
		if c == r {
			return false
		}
	}
}
418 | 
419 | func TestFoldConstants(t *testing.T) {
420 | 	last := rune(-1)
421 | 	for i := rune(0); i <= unicode.MaxRune; i++ {
422 | 		if unicode.SimpleFold(i) == i {
423 | 			continue
424 | 		}
425 | 		if last == -1 && minFold != i {
426 | 			t.Errorf("minFold=%#U should be %#U", minFold, i)
427 | 		}
428 | 		last = i
429 | 	}
430 | 	if maxFold != last {
431 | 		t.Errorf("maxFold=%#U should be %#U", maxFold, last)
432 | 	}
433 | }
434 | 
435 | func TestAppendRangeCollapse(t *testing.T) {
436 | 	// AppendRange should collapse each of the new ranges
437 | 	// into the earlier ones (it looks back two ranges), so that
438 | 	// the slice never grows very large.
439 | 	// Note that we are not calling cleanClass.
440 | 	var r []rune
441 | 	for i := rune('A'); i <= 'Z'; i++ {
442 | 		r = appendRange(r, i, i)
443 | 		r = appendRange(r, i+'a'-'A', i+'a'-'A')
444 | 	}
445 | 	if string(r) != "AZaz" {
446 | 		t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
447 | 	}
448 | }
449 | 
// invalidRegexps lists expressions that Parse is expected to reject under
// both the Perl and the POSIX flag sets.
var invalidRegexps = []string{
	`(`,
	`)`,
	`(a`,
	`a)`,
	`(a))`,
	`(a|b|`,
	`a|b|)`,
	`(a|b|))`,
	`(a|b`,
	`a|b)`,
	`(a|b))`,
	`[a-z`,
	`([a-z)`,
	`[a-z)`,
	`([a-z]))`,
	`x{1001}`,
	`x{9876543210}`,
	`x{2,1}`,
	`x{1,9876543210}`,
	"\xff", // Invalid UTF-8
	"[\xff]",
	"[\\\xff]",
	"\\\xff",
	`(?P<name>a`,
	`(?P<name>`,
	`(?P<name`,
	`(?P<x y>a)`,
	`(?P<>a)`,
	`[a-Z]`,
	`(?i)[a-Z]`,
	`a{100000}`,
	`a{100000,}`,
	"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
	`\Q\E*`,
}
486 | 
// onlyPerl lists expressions that Parse is expected to accept with the
// Perl flags and reject with the POSIX flags.
var onlyPerl = []string{
	`[a-b-c]`,
	`\Qabc\E`,
	`\Q*+?{[\E`,
	`\Q\\E`,
	`\Q\\\E`,
	`\Q\\\\E`,
	`\Q\\\\\E`,
	`(?:a)`,
	`(?P<name>a)`,
}
498 | 
// onlyPOSIX lists expressions that Parse is expected to accept with the
// POSIX flags and reject with the Perl flags.
var onlyPOSIX = []string{
	"a++",
	"a**",
	"a?*",
	"a+*",
	"a{1}*",
	".{1}{2}.{3}",
}
507 | 
508 | func TestParseInvalidRegexps(t *testing.T) {
509 | 	for _, regexp := range invalidRegexps {
510 | 		if re, err := Parse(regexp, Perl); err == nil {
511 | 			t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
512 | 		}
513 | 		if re, err := Parse(regexp, POSIX); err == nil {
514 | 			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
515 | 		}
516 | 	}
517 | 	for _, regexp := range onlyPerl {
518 | 		if _, err := Parse(regexp, Perl); err != nil {
519 | 			t.Errorf("Parse(%#q, Perl): %v", regexp, err)
520 | 		}
521 | 		if re, err := Parse(regexp, POSIX); err == nil {
522 | 			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
523 | 		}
524 | 	}
525 | 	for _, regexp := range onlyPOSIX {
526 | 		if re, err := Parse(regexp, Perl); err == nil {
527 | 			t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
528 | 		}
529 | 		if _, err := Parse(regexp, POSIX); err != nil {
530 | 			t.Errorf("Parse(%#q, POSIX): %v", regexp, err)
531 | 		}
532 | 	}
533 | }
534 | 
535 | func TestToStringEquivalentParse(t *testing.T) {
536 | 	for _, tt := range parseTests {
537 | 		re, err := Parse(tt.Regexp, testFlags)
538 | 		if err != nil {
539 | 			t.Errorf("Parse(%#q): %v", tt.Regexp, err)
540 | 			continue
541 | 		}
542 | 		if tt.Dump == "" {
543 | 			// It parsed. That's all we care about.
544 | 			continue
545 | 		}
546 | 		d := dump(re)
547 | 		if d != tt.Dump {
548 | 			t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
549 | 			continue
550 | 		}
551 | 
552 | 		s := re.String()
553 | 		if s != tt.Regexp {
554 | 			// If ToString didn't return the original regexp,
555 | 			// it must have found one with fewer parens.
556 | 			// Unfortunately we can't check the length here, because
557 | 			// ToString produces "\\{" for a literal brace,
558 | 			// but "{" is a shorter equivalent in some contexts.
559 | 			nre, err := Parse(s, testFlags)
560 | 			if err != nil {
561 | 				t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err)
562 | 				continue
563 | 			}
564 | 			nd := dump(nre)
565 | 			if d != nd {
566 | 				t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd)
567 | 			}
568 | 
569 | 			ns := nre.String()
570 | 			if s != ns {
571 | 				t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns)
572 | 			}
573 | 		}
574 | 	}
575 | }
576 | 


--------------------------------------------------------------------------------
/syntax/perl_groups.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2013 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // GENERATED BY make_perl_groups.pl; DO NOT EDIT.
  6 | // make_perl_groups.pl >perl_groups.go
  7 | 
  8 | package syntax
  9 | 
 10 | var code1 = []rune{ /* \d */
 11 | 	0x30, 0x39,
 12 | }
 13 | 
 14 | var code2 = []rune{ /* \s */
 15 | 	0x9, 0xa,
 16 | 	0xc, 0xd,
 17 | 	0x20, 0x20,
 18 | }
 19 | 
 20 | var code3 = []rune{ /* \w */
 21 | 	0x30, 0x39,
 22 | 	0x41, 0x5a,
 23 | 	0x5f, 0x5f,
 24 | 	0x61, 0x7a,
 25 | }
 26 | 
 27 | var perlGroup = map[string]charGroup{
 28 | 	`\d`: {+1, code1},
 29 | 	`\D`: {-1, code1},
 30 | 	`\s`: {+1, code2},
 31 | 	`\S`: {-1, code2},
 32 | 	`\w`: {+1, code3},
 33 | 	`\W`: {-1, code3},
 34 | }
 35 | var code4 = []rune{ /* [:alnum:] */
 36 | 	0x30, 0x39,
 37 | 	0x41, 0x5a,
 38 | 	0x61, 0x7a,
 39 | }
 40 | 
 41 | var code5 = []rune{ /* [:alpha:] */
 42 | 	0x41, 0x5a,
 43 | 	0x61, 0x7a,
 44 | }
 45 | 
 46 | var code6 = []rune{ /* [:ascii:] */
 47 | 	0x0, 0x7f,
 48 | }
 49 | 
 50 | var code7 = []rune{ /* [:blank:] */
 51 | 	0x9, 0x9,
 52 | 	0x20, 0x20,
 53 | }
 54 | 
 55 | var code8 = []rune{ /* [:cntrl:] */
 56 | 	0x0, 0x1f,
 57 | 	0x7f, 0x7f,
 58 | }
 59 | 
 60 | var code9 = []rune{ /* [:digit:] */
 61 | 	0x30, 0x39,
 62 | }
 63 | 
 64 | var code10 = []rune{ /* [:graph:] */
 65 | 	0x21, 0x7e,
 66 | }
 67 | 
 68 | var code11 = []rune{ /* [:lower:] */
 69 | 	0x61, 0x7a,
 70 | }
 71 | 
 72 | var code12 = []rune{ /* [:print:] */
 73 | 	0x20, 0x7e,
 74 | }
 75 | 
 76 | var code13 = []rune{ /* [:punct:] */
 77 | 	0x21, 0x2f,
 78 | 	0x3a, 0x40,
 79 | 	0x5b, 0x60,
 80 | 	0x7b, 0x7e,
 81 | }
 82 | 
 83 | var code14 = []rune{ /* [:space:] */
 84 | 	0x9, 0xd,
 85 | 	0x20, 0x20,
 86 | }
 87 | 
 88 | var code15 = []rune{ /* [:upper:] */
 89 | 	0x41, 0x5a,
 90 | }
 91 | 
 92 | var code16 = []rune{ /* [:word:] */
 93 | 	0x30, 0x39,
 94 | 	0x41, 0x5a,
 95 | 	0x5f, 0x5f,
 96 | 	0x61, 0x7a,
 97 | }
 98 | 
 99 | var code17 = []rune{ /* [:xdigit:] */
100 | 	0x30, 0x39,
101 | 	0x41, 0x46,
102 | 	0x61, 0x66,
103 | }
104 | 
105 | var posixGroup = map[string]charGroup{
106 | 	`[:alnum:]`:   {+1, code4},
107 | 	`[:^alnum:]`:  {-1, code4},
108 | 	`[:alpha:]`:   {+1, code5},
109 | 	`[:^alpha:]`:  {-1, code5},
110 | 	`[:ascii:]`:   {+1, code6},
111 | 	`[:^ascii:]`:  {-1, code6},
112 | 	`[:blank:]`:   {+1, code7},
113 | 	`[:^blank:]`:  {-1, code7},
114 | 	`[:cntrl:]`:   {+1, code8},
115 | 	`[:^cntrl:]`:  {-1, code8},
116 | 	`[:digit:]`:   {+1, code9},
117 | 	`[:^digit:]`:  {-1, code9},
118 | 	`[:graph:]`:   {+1, code10},
119 | 	`[:^graph:]`:  {-1, code10},
120 | 	`[:lower:]`:   {+1, code11},
121 | 	`[:^lower:]`:  {-1, code11},
122 | 	`[:print:]`:   {+1, code12},
123 | 	`[:^print:]`:  {-1, code12},
124 | 	`[:punct:]`:   {+1, code13},
125 | 	`[:^punct:]`:  {-1, code13},
126 | 	`[:space:]`:   {+1, code14},
127 | 	`[:^space:]`:  {-1, code14},
128 | 	`[:upper:]`:   {+1, code15},
129 | 	`[:^upper:]`:  {-1, code15},
130 | 	`[:word:]`:    {+1, code16},
131 | 	`[:^word:]`:   {-1, code16},
132 | 	`[:xdigit:]`:  {+1, code17},
133 | 	`[:^xdigit:]`: {-1, code17},
134 | }
135 | 


--------------------------------------------------------------------------------
/syntax/prog.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"strconv"
 10 | 	"unicode"
 11 | )
 12 | 
 13 | // Compiled program.
 14 | // May not belong in this package, but convenient for now.
 15 | 
// A Prog is a compiled regular expression program.
type Prog struct {
	// Inst holds the instructions; Start, StartUnanchored and each
	// instruction's Out are indexes into it.
	Inst            []Inst
	Start           int // index of start instruction
	StartUnanchored int // index of start instruction for unanchored search
	NumCap          int // number of InstCapture insts in re
}
 23 | 
// An InstOp is an instruction opcode.
type InstOp uint8

// The opcodes. Their numeric values index instOpNames, so the two lists
// must be kept in the same order.
const (
	InstAlt InstOp = iota
	InstAltMatch
	InstCapture
	InstEmptyWidth
	InstMatch
	InstFail
	InstNop
	InstRune
	InstRune1
	InstRuneAny
	InstRuneAnyNotNL
)
 40 | 
// instOpNames holds the name of each InstOp, indexed by opcode value.
// It is the lookup table behind InstOp.String.
var instOpNames = []string{
	"InstAlt",
	"InstAltMatch",
	"InstCapture",
	"InstEmptyWidth",
	"InstMatch",
	"InstFail",
	"InstNop",
	"InstRune",
	"InstRune1",
	"InstRuneAny",
	"InstRuneAnyNotNL",
}
 54 | 
 55 | func (i InstOp) String() string {
 56 | 	if uint(i) >= uint(len(instOpNames)) {
 57 | 		return ""
 58 | 	}
 59 | 	return instOpNames[i]
 60 | }
 61 | 
// An EmptyOp specifies a kind or mixture of zero-width assertions.
type EmptyOp uint8

// Each assertion is a distinct bit, so several can be combined with |.
const (
	EmptyBeginLine EmptyOp = 1 << iota
	EmptyEndLine
	EmptyBeginText
	EmptyEndText
	EmptyWordBoundary
	EmptyNoWordBoundary
)
 73 | 
 74 | // EmptyOpContext returns the zero-width assertions
 75 | // satisfied at the position between the runes r1 and r2.
 76 | // Passing r1 == -1 indicates that the position is
 77 | // at the beginning of the text.
 78 | // Passing r2 == -1 indicates that the position is
 79 | // at the end of the text.
 80 | func EmptyOpContext(r1, r2 rune) EmptyOp {
 81 | 	var op EmptyOp = EmptyNoWordBoundary
 82 | 	var boundary byte
 83 | 	switch {
 84 | 	case IsWordChar(r1):
 85 | 		boundary = 1
 86 | 	case r1 == '\n':
 87 | 		op |= EmptyBeginLine
 88 | 	case r1 < 0:
 89 | 		op |= EmptyBeginText | EmptyBeginLine
 90 | 	}
 91 | 	switch {
 92 | 	case IsWordChar(r2):
 93 | 		boundary ^= 1
 94 | 	case r2 == '\n':
 95 | 		op |= EmptyEndLine
 96 | 	case r2 < 0:
 97 | 		op |= EmptyEndText | EmptyEndLine
 98 | 	}
 99 | 	if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
100 | 		op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
101 | 	}
102 | 	return op
103 | }
104 | 
// IsWordChar reports whether r is considered a ``word character''
// during the evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z',
		'A' <= r && r <= 'Z',
		'0' <= r && r <= '9',
		r == '_':
		return true
	}
	return false
}
111 | 
// An Inst is a single instruction in a regular expression program.
type Inst struct {
	Op   InstOp
	Out  uint32 // all but InstMatch, InstFail
	Arg  uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
	// Rune is the operand of the rune-matching instructions. MatchRunePos
	// treats a single-element slice as one literal rune and any other
	// slice as a list of lo, hi pairs.
	Rune []rune
}
119 | 
// String returns the textual listing of the program produced by dumpProg,
// one instruction per line.
func (p *Prog) String() string {
	var b bytes.Buffer
	dumpProg(&b, p)
	return b.String()
}
125 | 
126 | // skipNop follows any no-op or capturing instructions
127 | // and returns the resulting pc.
128 | func (p *Prog) skipNop(pc uint32) (*Inst, uint32) {
129 | 	i := &p.Inst[pc]
130 | 	for i.Op == InstNop || i.Op == InstCapture {
131 | 		pc = i.Out
132 | 		i = &p.Inst[pc]
133 | 	}
134 | 	return i, pc
135 | }
136 | 
137 | // op returns i.Op but merges all the Rune special cases into InstRune
138 | func (i *Inst) op() InstOp {
139 | 	op := i.Op
140 | 	switch op {
141 | 	case InstRune1, InstRuneAny, InstRuneAnyNotNL:
142 | 		op = InstRune
143 | 	}
144 | 	return op
145 | }
146 | 
147 | // Prefix returns a literal string that all matches for the
148 | // regexp must start with. Complete is true if the prefix
149 | // is the entire match.
150 | func (p *Prog) Prefix() (prefix string, complete bool) {
151 | 	i, _ := p.skipNop(uint32(p.Start))
152 | 
153 | 	// Avoid allocation of buffer if prefix is empty.
154 | 	if i.op() != InstRune || len(i.Rune) != 1 {
155 | 		return "", i.Op == InstMatch
156 | 	}
157 | 
158 | 	// Have prefix; gather characters.
159 | 	var buf bytes.Buffer
160 | 	for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
161 | 		buf.WriteRune(i.Rune[0])
162 | 		i, _ = p.skipNop(i.Out)
163 | 	}
164 | 	return buf.String(), i.Op == InstMatch
165 | }
166 | 
167 | // StartCond returns the leading empty-width conditions that must
168 | // be true in any match. It returns ^EmptyOp(0) if no matches are possible.
169 | func (p *Prog) StartCond() EmptyOp {
170 | 	var flag EmptyOp
171 | 	pc := uint32(p.Start)
172 | 	i := &p.Inst[pc]
173 | Loop:
174 | 	for {
175 | 		switch i.Op {
176 | 		case InstEmptyWidth:
177 | 			flag |= EmptyOp(i.Arg)
178 | 		case InstFail:
179 | 			return ^EmptyOp(0)
180 | 		case InstCapture, InstNop:
181 | 			// skip
182 | 		default:
183 | 			break Loop
184 | 		}
185 | 		pc = i.Out
186 | 		i = &p.Inst[pc]
187 | 	}
188 | 	return flag
189 | }
190 | 
// noMatch is the value MatchRunePos returns when the rune does not match.
const noMatch = -1
192 | 
// MatchRune reports whether the instruction matches (and consumes) r.
// It should only be called when i.Op == InstRune.
// It is a boolean wrapper around MatchRunePos.
func (i *Inst) MatchRune(r rune) bool {
	return i.MatchRunePos(r) != noMatch
}
198 | 
199 | // MatchRunePos checks whether the instruction matches (and consumes) r.
200 | // If so, MatchRunePos returns the index of the matching rune pair
201 | // (or, when len(i.Rune) == 1, rune singleton).
202 | // If not, MatchRunePos returns -1.
203 | // MatchRunePos should only be called when i.Op == InstRune.
204 | func (i *Inst) MatchRunePos(r rune) int {
205 | 	rune := i.Rune
206 | 
207 | 	// Special case: single-rune slice is from literal string, not char class.
208 | 	if len(rune) == 1 {
209 | 		r0 := rune[0]
210 | 		if r == r0 {
211 | 			return 0
212 | 		}
213 | 		if Flags(i.Arg)&FoldCase != 0 {
214 | 			for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
215 | 				if r == r1 {
216 | 					return 0
217 | 				}
218 | 			}
219 | 		}
220 | 		return noMatch
221 | 	}
222 | 
223 | 	// Peek at the first few pairs.
224 | 	// Should handle ASCII well.
225 | 	for j := 0; j < len(rune) && j <= 8; j += 2 {
226 | 		if r < rune[j] {
227 | 			return noMatch
228 | 		}
229 | 		if r <= rune[j+1] {
230 | 			return j / 2
231 | 		}
232 | 	}
233 | 
234 | 	// Otherwise binary search.
235 | 	lo := 0
236 | 	hi := len(rune) / 2
237 | 	for lo < hi {
238 | 		m := lo + (hi-lo)/2
239 | 		if c := rune[2*m]; c <= r {
240 | 			if r <= rune[2*m+1] {
241 | 				return m
242 | 			}
243 | 			lo = m + 1
244 | 		} else {
245 | 			hi = m
246 | 		}
247 | 	}
248 | 	return noMatch
249 | }
250 | 
// wordRune reports whether r is an ASCII word character ([A-Za-z0-9_]),
// as per re2's Prog::IsWordChar.
// Since we act on runes, it would be easy to support Unicode here.
func wordRune(r rune) bool {
	switch {
	case r == '_':
		return true
	case r >= '0' && r <= '9':
		return true
	case r >= 'A' && r <= 'Z':
		return true
	case r >= 'a' && r <= 'z':
		return true
	}
	return false
}
259 | 
260 | // MatchEmptyWidth reports whether the instruction matches
261 | // an empty string between the runes before and after.
262 | // It should only be called when i.Op == InstEmptyWidth.
263 | func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
264 | 	switch EmptyOp(i.Arg) {
265 | 	case EmptyBeginLine:
266 | 		return before == '\n' || before == -1
267 | 	case EmptyEndLine:
268 | 		return after == '\n' || after == -1
269 | 	case EmptyBeginText:
270 | 		return before == -1
271 | 	case EmptyEndText:
272 | 		return after == -1
273 | 	case EmptyWordBoundary:
274 | 		return wordRune(before) != wordRune(after)
275 | 	case EmptyNoWordBoundary:
276 | 		return wordRune(before) == wordRune(after)
277 | 	}
278 | 	panic("unknown empty width arg")
279 | }
280 | 
// String returns the textual form of the instruction produced by dumpInst.
func (i *Inst) String() string {
	var b bytes.Buffer
	dumpInst(&b, i)
	return b.String()
}
286 | 
// bw appends each of args, in order, to b.
func bw(b *bytes.Buffer, args ...string) {
	for k := 0; k < len(args); k++ {
		b.WriteString(args[k])
	}
}
292 | 
293 | func dumpProg(b *bytes.Buffer, p *Prog) {
294 | 	for j := range p.Inst {
295 | 		i := &p.Inst[j]
296 | 		pc := strconv.Itoa(j)
297 | 		if len(pc) < 3 {
298 | 			b.WriteString("   "[len(pc):])
299 | 		}
300 | 		if j == p.Start {
301 | 			pc += "*"
302 | 		}
303 | 		if j == p.StartUnanchored {
304 | 			pc += "~"
305 | 		}
306 | 		bw(b, pc, "\t")
307 | 		dumpInst(b, i)
308 | 		bw(b, "\n")
309 | 	}
310 | }
311 | 
// u32 formats i as a decimal string.
func u32(i uint32) string {
	// Every uint32 fits in an int64, so the signed formatter is exact.
	return strconv.FormatInt(int64(i), 10)
}
315 | 
316 | func dumpInst(b *bytes.Buffer, i *Inst) {
317 | 	switch i.Op { // an Op without a case below produces no output
318 | 	case InstAlt:
319 | 		bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
320 | 	case InstAltMatch:
321 | 		bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
322 | 	case InstCapture:
323 | 		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
324 | 	case InstEmptyWidth:
325 | 		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
326 | 	case InstMatch:
327 | 		bw(b, "match")
328 | 	case InstFail:
329 | 		bw(b, "fail")
330 | 	case InstNop:
331 | 		bw(b, "nop -> ", u32(i.Out))
332 | 	case InstRune:
333 | 		if i.Rune == nil {
334 | 			bw(b, "rune <nil>") // shouldn't happen; print the marker instead of the rune list
335 | 		} else {
336 | 			bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
337 | 		}
338 | 		if Flags(i.Arg)&FoldCase != 0 {
339 | 			bw(b, "/i")
340 | 		}
341 | 		bw(b, " -> ", u32(i.Out))
342 | 	case InstRune1:
343 | 		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
344 | 	case InstRuneAny:
345 | 		bw(b, "any -> ", u32(i.Out))
346 | 	case InstRuneAnyNotNL:
347 | 		bw(b, "anynotnl -> ", u32(i.Out))
348 | 	}
349 | }
350 | 


--------------------------------------------------------------------------------
/syntax/prog_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | import "testing"
  8 | 
  9 | var compileTests = []struct {
 10 | 	Regexp string // pattern handed to Parse by TestCompile
 11 | 	Prog   string // expected text of the compiled program's String()
 12 | }{
 13 | 	{"a", `  0	fail
 14 |   1*	rune1 "a" -> 2
 15 |   2	match
 16 |   3	any -> 4
 17 |   4~	alt -> 1, 3
 18 | `},
 19 | 	{"[A-M][n-z]", `  0	fail
 20 |   1*	rune "AM" -> 2
 21 |   2	rune "nz" -> 3
 22 |   3	match
 23 |   4	any -> 5
 24 |   5~	alt -> 1, 4
 25 | `},
 26 | 	{"", `  0	fail
 27 |   1*	nop -> 2
 28 |   2	match
 29 |   3	any -> 4
 30 |   4~	alt -> 1, 3
 31 | `},
 32 | 	{"a?", `  0	fail
 33 |   1	rune1 "a" -> 3
 34 |   2*	alt -> 1, 3
 35 |   3	match
 36 |   4	any -> 5
 37 |   5~	alt -> 2, 4
 38 | `},
 39 | 	{"a??", `  0	fail
 40 |   1	rune1 "a" -> 3
 41 |   2*	alt -> 3, 1
 42 |   3	match
 43 |   4	any -> 5
 44 |   5~	alt -> 2, 4
 45 | `},
 46 | 	{"a+", `  0	fail
 47 |   1*	rune1 "a" -> 2
 48 |   2	alt -> 1, 3
 49 |   3	match
 50 |   4	any -> 5
 51 |   5~	alt -> 1, 4
 52 | `},
 53 | 	{"a+?", `  0	fail
 54 |   1*	rune1 "a" -> 2
 55 |   2	alt -> 3, 1
 56 |   3	match
 57 |   4	any -> 5
 58 |   5~	alt -> 1, 4
 59 | `},
 60 | 	{"a*", `  0	fail
 61 |   1	rune1 "a" -> 2
 62 |   2*	alt -> 1, 3
 63 |   3	match
 64 |   4	any -> 5
 65 |   5~	alt -> 2, 4
 66 | `},
 67 | 	{"a*?", `  0	fail
 68 |   1	rune1 "a" -> 2
 69 |   2*	alt -> 3, 1
 70 |   3	match
 71 |   4	any -> 5
 72 |   5~	alt -> 2, 4
 73 | `},
 74 | 	{"a+b+", `  0	fail
 75 |   1*	rune1 "a" -> 2
 76 |   2	alt -> 1, 3
 77 |   3	rune1 "b" -> 4
 78 |   4	alt -> 3, 5
 79 |   5	match
 80 |   6	any -> 7
 81 |   7~	alt -> 1, 6
 82 | `},
 83 | 	{"(a+)(b+)", `  0	fail
 84 |   1*	cap 2 -> 2
 85 |   2	rune1 "a" -> 3
 86 |   3	alt -> 2, 4
 87 |   4	cap 3 -> 5
 88 |   5	cap 4 -> 6
 89 |   6	rune1 "b" -> 7
 90 |   7	alt -> 6, 8
 91 |   8	cap 5 -> 9
 92 |   9	match
 93 |  10	any -> 11
 94 |  11~	alt -> 1, 10
 95 | `},
 96 | 	{"a+|b+", `  0	fail
 97 |   1	rune1 "a" -> 2
 98 |   2	alt -> 1, 6
 99 |   3	rune1 "b" -> 4
100 |   4	alt -> 3, 6
101 |   5*	alt -> 1, 3
102 |   6	match
103 |   7	any -> 8
104 |   8~	alt -> 5, 7
105 | `},
106 | 	{"A[Aa]", `  0	fail
107 |   1*	rune1 "A" -> 2
108 |   2	rune "A"/i -> 3
109 |   3	match
110 |   4	any -> 5
111 |   5~	alt -> 1, 4
112 | `},
113 | 	{"(?:(?:^).)", `  0	fail
114 |   1*	empty 4 -> 2
115 |   2	anynotnl -> 3
116 |   3	match
117 |   4	any -> 5
118 |   5~	alt -> 1, 4
119 | `},
120 | }
121 | 
122 | func TestCompile(t *testing.T) {
123 | 	for _, tt := range compileTests {
124 | 		re, err := Parse(tt.Regexp, Perl)
125 | 		if err != nil {
126 | 			t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
127 | 			continue
128 | 		}
129 | 		p, err := Compile(re)
130 | 		if err != nil {
131 | 			t.Errorf("Compile(%#q) = error %v", tt.Regexp, err)
132 | 			continue
133 | 		}
134 | 		s := p.String() // textual dump compared against the golden listing
135 | 		if s != tt.Prog {
136 | 			t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog)
137 | 		}
138 | 	}
139 | }
132 | 
133 | func BenchmarkEmptyOpContext(b *testing.B) {
134 | 	for i := 0; i < b.N; i++ {
135 | 		var r1 rune = -1 // -1 is passed as the rune before the first position
136 | 		for _, r2 := range "foo, bar, baz\nsome input text.\n" {
137 | 			EmptyOpContext(r1, r2)
138 | 			r1 = r2
139 | 		}
140 | 		EmptyOpContext(r1, -1) // and as the rune after the last position
141 | 	}
142 | }
143 | 


--------------------------------------------------------------------------------
/syntax/regexp.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | // Note to implementers:
  8 | // In this package, re is always a *Regexp and r is always a rune.
  9 | 
 10 | import (
 11 | 	"bytes"
 12 | 	"strconv"
 13 | 	"strings"
 14 | 	"unicode"
 15 | )
 16 | 
 17 | // A Regexp is a node in a regular expression syntax tree.
 18 | type Regexp struct {
 19 | 	Op       Op         // operator
 20 | 	Flags    Flags      // flag bits; code in this file tests FoldCase, NonGreedy and WasDollar
 21 | 	Sub      []*Regexp  // subexpressions, if any
 22 | 	Sub0     [1]*Regexp // storage for short Sub
 23 | 	Rune     []rune     // matched runes, for OpLiteral, OpCharClass
 24 | 	Rune0    [2]rune    // storage for short Rune
 25 | 	Min, Max int        // min, max for OpRepeat
 26 | 	Cap      int        // capturing index, for OpCapture
 27 | 	Name     string     // capturing name, for OpCapture
 28 | }
 29 | 
 30 | // An Op is a single regular expression operator.
 31 | type Op uint8
 32 | 
 33 | // Operators are listed in precedence order, tightest binding to weakest.
 34 | // Character class operators are listed simplest to most complex
 35 | // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
 36 | 
 37 | const (
 38 | 	OpNoMatch        Op = 1 + iota // matches no strings
 39 | 	OpEmptyMatch                   // matches empty string
 40 | 	OpLiteral                      // matches the Rune sequence
 41 | 	OpCharClass                    // matches Rune interpreted as a list of lo, hi range pairs
 42 | 	OpAnyCharNotNL                 // matches any character except newline
 43 | 	OpAnyChar                      // matches any character
 44 | 	OpBeginLine                    // matches empty string at beginning of line; printed as `(?m:^)`
 45 | 	OpEndLine                      // matches empty string at end of line; printed as `(?m:$)`
 46 | 	OpBeginText                    // matches empty string at beginning of text; printed as `\A`
 47 | 	OpEndText                      // matches empty string at end of text; printed as `\z`, or `(?-m:$)` when WasDollar is set
 48 | 	OpWordBoundary                 // matches word boundary `\b`
 49 | 	OpNoWordBoundary               // matches word non-boundary `\B`
 50 | 	OpCapture                      // capturing subexpression with index Cap, optional name Name
 51 | 	OpStar                         // matches Sub[0] zero or more times
 52 | 	OpPlus                         // matches Sub[0] one or more times
 53 | 	OpQuest                        // matches Sub[0] zero or one times
 54 | 	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
 55 | 	OpConcat                       // matches concatenation of Subs
 56 | 	OpAlternate                    // matches alternation of Subs
 57 | )
 58 | 
 59 | const opPseudo Op = 128 // where pseudo-ops start
 60 | 
 61 | // Equal reports whether x and y have identical structure.
 62 | func (x *Regexp) Equal(y *Regexp) bool {
 63 | 	switch {
 64 | 	case x == nil || y == nil:
 65 | 		return x == y // equal only when both are nil
 66 | 	case x.Op != y.Op:
 67 | 		return false
 68 | 	}
 69 | 
 70 | 	// Of the flag bits, only NonGreedy (repetitions) and WasDollar
 71 | 	// (OpEndText) take part in the comparison.
 72 | 	sameGreed := x.Flags&NonGreedy == y.Flags&NonGreedy
 73 | 
 74 | 	switch x.Op {
 75 | 	case OpEndText:
 76 | 		// The parse flags remember whether this is \z or \Z.
 77 | 		return x.Flags&WasDollar == y.Flags&WasDollar
 78 | 
 79 | 	case OpLiteral, OpCharClass:
 80 | 		if len(x.Rune) != len(y.Rune) {
 81 | 			return false
 82 | 		}
 83 | 		// Compare the rune lists element by element.
 84 | 		for i := range x.Rune {
 85 | 			if x.Rune[i] != y.Rune[i] {
 86 | 				return false
 87 | 			}
 88 | 		}
 89 | 
 90 | 	case OpAlternate, OpConcat:
 91 | 		if len(x.Sub) != len(y.Sub) {
 92 | 			return false
 93 | 		}
 94 | 		// Children must match pairwise, in order.
 95 | 		for i := range x.Sub {
 96 | 			if !x.Sub[i].Equal(y.Sub[i]) {
 97 | 				return false
 98 | 			}
 99 | 		}
100 | 
101 | 	case OpStar, OpPlus, OpQuest:
102 | 		return sameGreed && x.Sub[0].Equal(y.Sub[0])
103 | 
104 | 	case OpRepeat:
105 | 		return sameGreed && x.Min == y.Min && x.Max == y.Max && x.Sub[0].Equal(y.Sub[0])
106 | 
107 | 	case OpCapture:
108 | 		return x.Cap == y.Cap && x.Name == y.Name && x.Sub[0].Equal(y.Sub[0])
109 | 	}
110 | 	// Every other op: nothing further is compared here.
111 | 	return true
112 | }
113 | 
114 | // writeRegexp writes the Perl syntax for the regular expression re to b.
115 | func writeRegexp(b *bytes.Buffer, re *Regexp) {
116 | 	switch re.Op {
117 | 	default:
118 | 		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
119 | 	case OpNoMatch:
120 | 		b.WriteString(`[^\x00-\x{10FFFF}]`)
121 | 	case OpEmptyMatch:
122 | 		b.WriteString(`(?:)`)
123 | 	case OpLiteral:
124 | 		if re.Flags&FoldCase != 0 {
125 | 			b.WriteString(`(?i:`)
126 | 		}
127 | 		for _, r := range re.Rune {
128 | 			escape(b, r, false)
129 | 		}
130 | 		if re.Flags&FoldCase != 0 {
131 | 			b.WriteString(`)`)
132 | 		}
133 | 	case OpCharClass:
134 | 		if len(re.Rune)%2 != 0 { // Rune must hold lo, hi pairs
135 | 			b.WriteString(`[invalid char class]`)
136 | 			break
137 | 		}
138 | 		b.WriteRune('[')
139 | 		if len(re.Rune) == 0 {
140 | 			b.WriteString(`^\x00-\x{10FFFF}`)
141 | 		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
142 | 			// Contains 0 and MaxRune. Probably a negated class: print the gaps.
143 | 			// A lone 0-MaxRune range has no gaps and would print as the invalid `[^]`.
144 | 			b.WriteRune('^')
145 | 			for i := 1; i < len(re.Rune)-1; i += 2 {
146 | 				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
147 | 				escape(b, lo, lo == '-')
148 | 				if lo != hi {
149 | 					b.WriteRune('-')
150 | 					escape(b, hi, hi == '-')
151 | 				}
152 | 			}
153 | 		} else {
154 | 			for i := 0; i < len(re.Rune); i += 2 {
155 | 				lo, hi := re.Rune[i], re.Rune[i+1]
156 | 				escape(b, lo, lo == '-')
157 | 				if lo != hi {
158 | 					b.WriteRune('-')
159 | 					escape(b, hi, hi == '-')
160 | 				}
161 | 			}
162 | 		}
163 | 		b.WriteRune(']')
164 | 	case OpAnyCharNotNL:
165 | 		b.WriteString(`(?-s:.)`)
166 | 	case OpAnyChar:
167 | 		b.WriteString(`(?s:.)`)
168 | 	case OpBeginLine:
169 | 		b.WriteString(`(?m:^)`)
170 | 	case OpEndLine:
171 | 		b.WriteString(`(?m:$)`)
172 | 	case OpBeginText:
173 | 		b.WriteString(`\A`)
174 | 	case OpEndText:
175 | 		if re.Flags&WasDollar != 0 {
176 | 			b.WriteString(`(?-m:$)`)
177 | 		} else {
178 | 			b.WriteString(`\z`)
179 | 		}
180 | 	case OpWordBoundary:
181 | 		b.WriteString(`\b`)
182 | 	case OpNoWordBoundary:
183 | 		b.WriteString(`\B`)
184 | 	case OpCapture:
185 | 		if re.Name != "" {
186 | 			b.WriteString(`(?P<`)
187 | 			b.WriteString(re.Name)
188 | 			b.WriteRune('>')
189 | 		} else {
190 | 			b.WriteRune('(')
191 | 		}
192 | 		if re.Sub[0].Op != OpEmptyMatch { // an empty group prints as () rather than ((?:))
193 | 			writeRegexp(b, re.Sub[0])
194 | 		}
195 | 		b.WriteRune(')')
196 | 	case OpStar, OpPlus, OpQuest, OpRepeat:
197 | 		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { // operand needs grouping
198 | 			b.WriteString(`(?:`)
199 | 			writeRegexp(b, sub)
200 | 			b.WriteString(`)`)
201 | 		} else {
202 | 			writeRegexp(b, sub)
203 | 		}
204 | 		switch re.Op {
205 | 		case OpStar:
206 | 			b.WriteRune('*')
207 | 		case OpPlus:
208 | 			b.WriteRune('+')
209 | 		case OpQuest:
210 | 			b.WriteRune('?')
211 | 		case OpRepeat:
212 | 			b.WriteRune('{')
213 | 			b.WriteString(strconv.Itoa(re.Min))
214 | 			if re.Max != re.Min {
215 | 				b.WriteRune(',')
216 | 				if re.Max >= 0 { // a negative Max prints as the open-ended {n,}
217 | 					b.WriteString(strconv.Itoa(re.Max))
218 | 				}
219 | 			}
220 | 			b.WriteRune('}')
221 | 		}
222 | 		if re.Flags&NonGreedy != 0 {
223 | 			b.WriteRune('?')
224 | 		}
225 | 	case OpConcat:
226 | 		for _, sub := range re.Sub {
227 | 			if sub.Op == OpAlternate {
228 | 				b.WriteString(`(?:`)
229 | 				writeRegexp(b, sub)
230 | 				b.WriteString(`)`)
231 | 			} else {
232 | 				writeRegexp(b, sub)
233 | 			}
234 | 		}
235 | 	case OpAlternate:
236 | 		for i, sub := range re.Sub {
237 | 			if i > 0 {
238 | 				b.WriteRune('|')
239 | 			}
240 | 			writeRegexp(b, sub)
241 | 		}
242 | 	}
243 | }
244 | 
245 | func (re *Regexp) String() string {
246 | 	var b bytes.Buffer
247 | 	writeRegexp(&b, re)
248 | 	return b.String()
249 | }
250 | 
251 | const meta = `\.+*?()|[]{}^$` // printable runes that escape always prefixes with a backslash
252 | 
253 | func escape(b *bytes.Buffer, r rune, force bool) {
254 | 	switch {
255 | 	case unicode.IsPrint(r):
256 | 		// Printable runes are written as themselves, backslash-escaped
257 | 		// when they appear in meta or the caller forces it.
258 | 		if force || strings.ContainsRune(meta, r) {
259 | 			b.WriteRune('\\')
260 | 		}
261 | 		b.WriteRune(r)
262 | 
263 | 	// Non-printable runes that have a short backslash escape.
264 | 	case r == '\a':
265 | 		b.WriteString(`\a`)
266 | 	case r == '\f':
267 | 		b.WriteString(`\f`)
268 | 	case r == '\n':
269 | 		b.WriteString(`\n`)
270 | 	case r == '\r':
271 | 		b.WriteString(`\r`)
272 | 	case r == '\t':
273 | 		b.WriteString(`\t`)
274 | 	case r == '\v':
275 | 		b.WriteString(`\v`)
276 | 
277 | 	case r < 0x100:
278 | 		// Hex form \xNN, with a single digit zero-padded to two.
279 | 		digits := strconv.FormatInt(int64(r), 16)
280 | 		if len(digits) == 1 {
281 | 			digits = "0" + digits
282 | 		}
283 | 		b.WriteString(`\x` + digits)
284 | 
285 | 	default:
286 | 		// Everything else is written in the braced hex form \x{N...}.
287 | 		b.WriteString(`\x{` + strconv.FormatInt(int64(r), 16) + `}`)
288 | 	}
289 | }
290 | 
291 | // MaxCap returns the largest capture index in re and its subexpressions, or 0 if there is none.
292 | func (re *Regexp) MaxCap() int {
293 | 	best := 0
294 | 	if re.Op == OpCapture {
295 | 		best = re.Cap
296 | 	}
297 | 	for k := range re.Sub {
298 | 		if c := re.Sub[k].MaxCap(); c > best {
299 | 			best = c
300 | 		}
301 | 	}
302 | 	return best
303 | }
304 | 
305 | // CapNames returns the capture group names of re, indexed by capture index ("" where no name was recorded).
306 | func (re *Regexp) CapNames() []string {
307 | 	list := make([]string, 1+re.MaxCap())
308 | 	re.capNames(list)
309 | 	return list
310 | }
311 | 
312 | func (re *Regexp) capNames(names []string) {
313 | 	if re.Op == OpCapture {
314 | 		names[re.Cap] = re.Name // record this group's name under its index
315 | 	}
316 | 	for k := range re.Sub {
317 | 		re.Sub[k].capNames(names) // then descend into the children
318 | 	}
319 | }
320 | 


--------------------------------------------------------------------------------
/syntax/simplify.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | // Simplify returns a regexp equivalent to re but without counted repetitions
  8 | // and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
  9 | // The resulting regexp will execute correctly but its string representation
 10 | // will not produce the same parse tree, because capturing parentheses
 11 | // may have been duplicated or removed. For example, the simplified form
 12 | // for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
 13 | // The returned regexp may share structure with or be the original.
 14 | func (re *Regexp) Simplify() *Regexp {
 15 | 	if re == nil {
 16 | 		return nil
 17 | 	}
 18 | 	switch re.Op {
 19 | 	case OpCapture, OpConcat, OpAlternate:
 20 | 		// Simplify children, building new Regexp if children change.
 21 | 		nre := re
 22 | 		for i, sub := range re.Sub {
 23 | 			nsub := sub.Simplify()
 24 | 			if nre == re && nsub != sub {
 25 | 				// Start a copy, carrying over the children already visited.
 26 | 				nre = new(Regexp)
 27 | 				*nre = *re
 28 | 				nre.Rune = nil
 29 | 				nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
 30 | 			}
 31 | 			if nre != re {
 32 | 				nre.Sub = append(nre.Sub, nsub)
 33 | 			}
 34 | 		}
 35 | 		return nre
 36 | 
 37 | 	case OpStar, OpPlus, OpQuest:
 38 | 		sub := re.Sub[0].Simplify()
 39 | 		return simplify1(re.Op, re.Flags, sub, re) // passing re lets simplify1 hand it back when nothing changed
 40 | 
 41 | 	case OpRepeat:
 42 | 		// Special case: x{0} matches the empty string
 43 | 		// and doesn't even need to consider x.
 44 | 		if re.Min == 0 && re.Max == 0 {
 45 | 			return &Regexp{Op: OpEmptyMatch}
 46 | 		}
 47 | 
 48 | 		// The fun begins.
 49 | 		sub := re.Sub[0].Simplify()
 50 | 
 51 | 		// x{n,} means at least n matches of x.
 52 | 		if re.Max == -1 {
 53 | 			// Special case: x{0,} is x*.
 54 | 			if re.Min == 0 {
 55 | 				return simplify1(OpStar, re.Flags, sub, nil)
 56 | 			}
 57 | 
 58 | 			// Special case: x{1,} is x+.
 59 | 			if re.Min == 1 {
 60 | 				return simplify1(OpPlus, re.Flags, sub, nil)
 61 | 			}
 62 | 
 63 | 			// General case: x{4,} is xxxx+, i.e. Min-1 copies of x followed by x+.
 64 | 			nre := &Regexp{Op: OpConcat}
 65 | 			nre.Sub = nre.Sub0[:0]
 66 | 			for i := 0; i < re.Min-1; i++ {
 67 | 				nre.Sub = append(nre.Sub, sub)
 68 | 			}
 69 | 			nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
 70 | 			return nre
 71 | 		}
 72 | 
 73 | 		// Special case x{0} handled above.
 74 | 
 75 | 		// Special case: x{1} is just x.
 76 | 		if re.Min == 1 && re.Max == 1 {
 77 | 			return sub
 78 | 		}
 79 | 
 80 | 		// General case: x{n,m} means n copies of x and m-n copies of x?
 81 | 		// The machine will do less work if we nest the final m-n copies,
 82 | 		// so that x{2,5} = xx(x(x(x)?)?)?
 83 | 
 84 | 		// Build leading prefix: xx.
 85 | 		var prefix *Regexp
 86 | 		if re.Min > 0 {
 87 | 			prefix = &Regexp{Op: OpConcat}
 88 | 			prefix.Sub = prefix.Sub0[:0]
 89 | 			for i := 0; i < re.Min; i++ {
 90 | 				prefix.Sub = append(prefix.Sub, sub)
 91 | 			}
 92 | 		}
 93 | 
 94 | 		// Build and attach suffix: (x(x(x)?)?)?
 95 | 		if re.Max > re.Min {
 96 | 			suffix := simplify1(OpQuest, re.Flags, sub, nil)
 97 | 			for i := re.Min + 1; i < re.Max; i++ {
 98 | 				nre2 := &Regexp{Op: OpConcat}
 99 | 				nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
100 | 				suffix = simplify1(OpQuest, re.Flags, nre2, nil)
101 | 			}
102 | 			if prefix == nil {
103 | 				return suffix
104 | 			}
105 | 			prefix.Sub = append(prefix.Sub, suffix)
106 | 		}
107 | 		if prefix != nil {
108 | 			return prefix
109 | 		}
110 | 
111 | 		// Some degenerate case like min > max or min < max < 0.
112 | 		// Handle as impossible match.
113 | 		return &Regexp{Op: OpNoMatch}
114 | 	}
115 | 
116 | 	return re // every other op is returned unchanged
117 | }
118 | 
119 | // simplify1 implements Simplify for the unary OpStar,
120 | // OpPlus, and OpQuest operators. It returns the simple regexp
121 | // equivalent to
122 | //
123 | //	Regexp{Op: op, Flags: flags, Sub: {sub}}
124 | //
125 | // under the assumption that sub is already simple, and
126 | // without first allocating that structure. If the regexp
127 | // to be returned turns out to be equivalent to re, simplify1
128 | // returns re instead.
129 | //
130 | // simplify1 is factored out of Simplify because the implementation
131 | // for other operators generates these unary expressions.
132 | // Letting them call simplify1 makes sure the expressions they
133 | // generate are simple.
134 | func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
135 | 	greedy := flags & NonGreedy
136 | 	switch {
137 | 	case sub.Op == OpEmptyMatch:
138 | 		// Repeating the empty string any number of times is still the empty string.
139 | 		return sub
140 | 	case sub.Op == op && sub.Flags&NonGreedy == greedy:
141 | 		// The operators are idempotent if the flags match.
142 | 		return sub
143 | 	case re != nil && re.Op == op && re.Flags&NonGreedy == greedy && re.Sub[0] == sub:
144 | 		// The result would be equivalent to re, so return re instead of allocating.
145 | 		return re
146 | 	}
147 | 
148 | 	out := &Regexp{Op: op, Flags: flags}
149 | 	out.Sub = append(out.Sub0[:0], sub)
150 | 	return out
151 | }
152 | 


--------------------------------------------------------------------------------
/syntax/simplify_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | import "testing"
  8 | 
  9 | var simplifyTests = []struct {
 10 | 	Regexp string // pattern handed to Parse by TestSimplify
 11 | 	Simple string // expected String() of the simplified pattern
 12 | }{
 13 | 	// Already-simple constructs
 14 | 	{`a`, `a`},
 15 | 	{`ab`, `ab`},
 16 | 	{`a|b`, `[a-b]`},
 17 | 	{`ab|cd`, `ab|cd`},
 18 | 	{`(ab)*`, `(ab)*`},
 19 | 	{`(ab)+`, `(ab)+`},
 20 | 	{`(ab)?`, `(ab)?`},
 21 | 	{`.`, `(?s:.)`},
 22 | 	{`^`, `(?m:^)`},
 23 | 	{`$`, `(?m:$)`},
 24 | 	{`[ac]`, `[ac]`},
 25 | 	{`[^ac]`, `[^ac]`},
 26 | 
 27 | 	// Posix character classes
 28 | 	{`[[:alnum:]]`, `[0-9A-Za-z]`},
 29 | 	{`[[:alpha:]]`, `[A-Za-z]`},
 30 | 	{`[[:blank:]]`, `[\t ]`},
 31 | 	{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
 32 | 	{`[[:digit:]]`, `[0-9]`},
 33 | 	{`[[:graph:]]`, `[!-~]`},
 34 | 	{`[[:lower:]]`, `[a-z]`},
 35 | 	{`[[:print:]]`, `[ -~]`},
 36 | 	{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
 37 | 	{`[[:space:]]`, `[\t-\r ]`},
 38 | 	{`[[:upper:]]`, `[A-Z]`},
 39 | 	{`[[:xdigit:]]`, `[0-9A-Fa-f]`},
 40 | 
 41 | 	// Perl character classes
 42 | 	{`\d`, `[0-9]`},
 43 | 	{`\s`, `[\t-\n\f-\r ]`},
 44 | 	{`\w`, `[0-9A-Z_a-z]`},
 45 | 	{`\D`, `[^0-9]`},
 46 | 	{`\S`, `[^\t-\n\f-\r ]`},
 47 | 	{`\W`, `[^0-9A-Z_a-z]`},
 48 | 	{`[\d]`, `[0-9]`},
 49 | 	{`[\s]`, `[\t-\n\f-\r ]`},
 50 | 	{`[\w]`, `[0-9A-Z_a-z]`},
 51 | 	{`[\D]`, `[^0-9]`},
 52 | 	{`[\S]`, `[^\t-\n\f-\r ]`},
 53 | 	{`[\W]`, `[^0-9A-Z_a-z]`},
 54 | 
 55 | 	// Posix repetitions
 56 | 	{`a{1}`, `a`},
 57 | 	{`a{2}`, `aa`},
 58 | 	{`a{5}`, `aaaaa`},
 59 | 	{`a{0,1}`, `a?`},
 60 | 	// The next three are illegible because Simplify inserts (?:)
 61 | 	// parens instead of () parens to avoid creating extra
 62 | 	// captured subexpressions. The comments show a version with fewer parens.
 63 | 	{`(a){0,2}`, `(?:(a)(a)?)?`},                       //       (aa?)?
 64 | 	{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`},       //   (a(a(aa?)?)?)?
 65 | 	{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
 66 | 	{`a{0,2}`, `(?:aa?)?`},                             //       (aa?)?
 67 | 	{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`},                 //   (a(a(aa?)?)?)?
 68 | 	{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`},               // aa(a(a(aa?)?)?)?
 69 | 	{`a{0,}`, `a*`},
 70 | 	{`a{1,}`, `a+`},
 71 | 	{`a{2,}`, `aa+`},
 72 | 	{`a{5,}`, `aaaaa+`},
 73 | 
 74 | 	// Test that operators simplify their arguments.
 75 | 	{`(?:a{1,}){1,}`, `a+`},
 76 | 	{`(a{1,}b{1,})`, `(a+b+)`},
 77 | 	{`a{1,}|b{1,}`, `a+|b+`},
 78 | 	{`(?:a{1,})*`, `(?:a+)*`},
 79 | 	{`(?:a{1,})+`, `a+`},
 80 | 	{`(?:a{1,})?`, `(?:a+)?`},
 81 | 	{``, `(?:)`},
 82 | 	{`a{0}`, `(?:)`},
 83 | 
 84 | 	// Character class simplification
 85 | 	{`[ab]`, `[a-b]`},
 86 | 	{`[a-za-za-z]`, `[a-z]`},
 87 | 	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
 88 | 	{`[ABCDEFGH]`, `[A-H]`},
 89 | 	{`[AB-CD-EF-GH]`, `[A-H]`},
 90 | 	{`[W-ZP-XE-R]`, `[E-Z]`},
 91 | 	{`[a-ee-gg-m]`, `[a-m]`},
 92 | 	{`[a-ea-ha-m]`, `[a-m]`},
 93 | 	{`[a-ma-ha-e]`, `[a-m]`},
 94 | 	{`[a-zA-Z0-9 -~]`, `[ -~]`},
 95 | 
 96 | 	// Empty character classes
 97 | 	{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
 98 | 
 99 | 	// Full character classes
100 | 	{`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},
101 | 
102 | 	// Unicode case folding.
103 | 	{`(?i)A`, `(?i:A)`},
104 | 	{`(?i)a`, `(?i:A)`},
105 | 	{`(?i)[A]`, `(?i:A)`},
106 | 	{`(?i)[a]`, `(?i:A)`},
107 | 	{`(?i)K`, `(?i:K)`},
108 | 	{`(?i)k`, `(?i:K)`},
109 | 	{`(?i)\x{212a}`, "(?i:K)"},
110 | 	{`(?i)[K]`, "[Kk\u212A]"},
111 | 	{`(?i)[k]`, "[Kk\u212A]"},
112 | 	{`(?i)[\x{212a}]`, "[Kk\u212A]"},
113 | 	{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
114 | 	{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
115 | 	{`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},
116 | 
117 | 	// Empty string as a regular expression.
118 | 	// The empty string must be preserved inside parens in order
119 | 	// to make submatches work right, so these tests are less
120 | 	// interesting than they might otherwise be. String inserts
121 | 	// explicit (?:) in place of non-parenthesized empty strings,
122 | 	// to make them easier to spot for other parsers.
123 | 	{`(a|b|)`, `([a-b]|(?:))`},
124 | 	{`(|)`, `()`},
125 | 	{`a()`, `a()`},
126 | 	{`(()|())`, `(()|())`},
127 | 	{`(a|)`, `(a|(?:))`},
128 | 	{`ab()cd()`, `ab()cd()`},
129 | 	{`()`, `()`},
130 | 	{`()*`, `()*`},
131 | 	{`()+`, `()+`},
132 | 	{`()?`, `()?`},
133 | 	{`(){0}`, `(?:)`},
134 | 	{`(){1}`, `()`},
135 | 	{`(){1,}`, `()+`},
136 | 	{`(){0,2}`, `(?:()()?)?`},
137 | }
138 | 
139 | func TestSimplify(t *testing.T) {
140 | 	flags := MatchNL | Perl&^OneLine
141 | 	for _, tc := range simplifyTests {
142 | 		re, err := Parse(tc.Regexp, flags)
143 | 		if err != nil {
144 | 			t.Errorf("Parse(%#q) = error %v", tc.Regexp, err)
145 | 			continue
146 | 		}
147 | 		if got := re.Simplify().String(); got != tc.Simple {
148 | 			t.Errorf("Simplify(%#q) = %#q, want %#q", tc.Regexp, got, tc.Simple)
149 | 		}
150 | 	}
151 | }
152 | 


--------------------------------------------------------------------------------
/testdata/README:
--------------------------------------------------------------------------------
 1 | AT&T POSIX Test Files
 2 | See testregex.c for copyright + license.
 3 | 
 4 | testregex.c	http://www2.research.att.com/~gsf/testregex/testregex.c
 5 | basic.dat	http://www2.research.att.com/~gsf/testregex/basic.dat
 6 | nullsubexpr.dat	http://www2.research.att.com/~gsf/testregex/nullsubexpr.dat
 7 | repetition.dat	http://www2.research.att.com/~gsf/testregex/repetition.dat
 8 | 
 9 | The test data has been edited to reflect RE2/Go differences:
10 |   * In a star of a possibly empty match like (a*)* matching x,
11 |     the no match case runs the starred subexpression zero times,
12 |     not once.  This is consistent with (a*)* matching a, which
13 |     runs the starred subexpression one time, not twice.
14 |   * The submatch choice is first match, not the POSIX rule.
15 | 
16 | Such changes are marked with 'RE2/Go'.
17 | 
18 | 
19 | RE2 Test Files
20 | 
21 | re2-exhaustive.txt.bz2 and re2-search.txt are built by running
22 | 'make log' in the RE2 distribution https://github.com/google/re2/
23 | 
24 | The exhaustive file is compressed because it is huge.
25 | 


--------------------------------------------------------------------------------
/testdata/basic.dat:
--------------------------------------------------------------------------------
  1 | NOTE	all standard compliant implementations should pass these : 2002-05-31
  2 | 
  3 | BE	abracadabra$	abracadabracadabra	(7,18)
  4 | BE	a...b		abababbb		(2,7)
  5 | BE	XXXXXX		..XXXXXX		(2,8)
  6 | E	\)		()	(1,2)
  7 | BE	a]		a]a	(0,2)
  8 | B	}		}	(0,1)
  9 | E	\}		}	(0,1)
 10 | BE	\]		]	(0,1)
 11 | B	]		]	(0,1)
 12 | E	]		]	(0,1)
 13 | B	{		{	(0,1)
 14 | B	}		}	(0,1)
 15 | BE	^a		ax	(0,1)
 16 | BE	\^a		a^a	(1,3)
 17 | BE	a\^		a^	(0,2)
 18 | BE	a$		aa	(1,2)
 19 | BE	a\$		a$	(0,2)
 20 | BE	^$		NULL	(0,0)
 21 | E	$^		NULL	(0,0)
 22 | E	a($)		aa	(1,2)(2,2)
 23 | E	a*(^a)		aa	(0,1)(0,1)
 24 | E	(..)*(...)*		a	(0,0)
 25 | E	(..)*(...)*		abcd	(0,4)(2,4)
 26 | E	(ab|a)(bc|c)		abc	(0,3)(0,2)(2,3)
 27 | E	(ab)c|abc		abc	(0,3)(0,2)
 28 | E	a{0}b		ab			(1,2)
 29 | E	(a*)(b?)(b+)b{3}	aaabbbbbbb	(0,10)(0,3)(3,4)(4,7)
 30 | E	(a*)(b{0,1})(b{1,})b{3}	aaabbbbbbb	(0,10)(0,3)(3,4)(4,7)
 31 | E	a{9876543210}	NULL	BADBR
 32 | E	((a|a)|a)			a	(0,1)(0,1)(0,1)
 33 | E	(a*)(a|aa)			aaaa	(0,4)(0,3)(3,4)
 34 | E	a*(a.|aa)			aaaa	(0,4)(2,4)
 35 | E	a(b)|c(d)|a(e)f			aef	(0,3)(?,?)(?,?)(1,2)
 36 | E	(a|b)?.*			b	(0,1)(0,1)
 37 | E	(a|b)c|a(b|c)			ac	(0,2)(0,1)
 38 | E	(a|b)c|a(b|c)			ab	(0,2)(?,?)(1,2)
 39 | E	(a|b)*c|(a|ab)*c		abc	(0,3)(1,2)
 40 | E	(a|b)*c|(a|ab)*c		xc	(1,2)
 41 | E	(.a|.b).*|.*(.a|.b)		xa	(0,2)(0,2)
 42 | E	a?(ab|ba)ab			abab	(0,4)(0,2)
 43 | E	a?(ac{0}b|ba)ab			abab	(0,4)(0,2)
 44 | E	ab|abab				abbabab	(0,2)
 45 | E	aba|bab|bba			baaabbbaba	(5,8)
 46 | E	aba|bab				baaabbbaba	(6,9)
 47 | E	(aa|aaa)*|(a|aaaaa)		aa	(0,2)(0,2)
 48 | E	(a.|.a.)*|(a|.a...)		aa	(0,2)(0,2)
 49 | E	ab|a				xabc	(1,3)
 50 | E	ab|a				xxabc	(2,4)
 51 | Ei	(Ab|cD)*			aBcD	(0,4)(2,4)
 52 | BE	[^-]			--a		(2,3)
 53 | BE	[a-]*			--a		(0,3)
 54 | BE	[a-m-]*			--amoma--	(0,4)
 55 | E	:::1:::0:|:::1:1:0:	:::0:::1:::1:::0:	(8,17)
 56 | E	:::1:::0:|:::1:1:1:	:::0:::1:::1:::0:	(8,17)
 57 | {E	[[:upper:]]		A		(0,1)	[[<element>]] not supported
 58 | E	[[:lower:]]+		`az{		(1,3)
 59 | E	[[:upper:]]+		@AZ[		(1,3)
 60 | # No collation in Go
 61 | #BE	[[-]]			[[-]]		(2,4)
 62 | #BE	[[.NIL.]]	NULL	ECOLLATE
 63 | #BE	[[=aleph=]]	NULL	ECOLLATE
 64 | }
 65 | BE$	\n		\n	(0,1)
 66 | BEn$	\n		\n	(0,1)
 67 | BE$	[^a]		\n	(0,1)
 68 | BE$	\na		\na	(0,2)
 69 | E	(a)(b)(c)	abc	(0,3)(0,1)(1,2)(2,3)
 70 | BE	xxx		xxx	(0,3)
 71 | E1	(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)	feb 6,	(0,6)
 72 | E1	(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)	2/7	(0,3)
 73 | E1	(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)	feb 1,Feb 6	(5,11)
 74 | E3	((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))	x	(0,1)(0,1)(0,1)
 75 | E3	((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*	xx	(0,2)(1,2)(1,2)
 76 | E	a?(ab|ba)*	ababababababababababababababababababababababababababababababababababababababababa	(0,81)(79,81)
 77 | E	abaa|abbaa|abbbaa|abbbbaa	ababbabbbabbbabbbbabbbbaa	(18,25)
 78 | E	abaa|abbaa|abbbaa|abbbbaa	ababbabbbabbbabbbbabaa	(18,22)
 79 | E	aaac|aabc|abac|abbc|baac|babc|bbac|bbbc	baaabbbabac	(7,11)
 80 | BE$	.*			\x01\xff	(0,2)
 81 | E	aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll		XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa	(53,57)
 82 | L	aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll		XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa	NOMATCH
 83 | E	a*a*a*a*a*b		aaaaaaaaab	(0,10)
 84 | BE	^			NULL		(0,0)
 85 | BE	$			NULL		(0,0)
 86 | BE	^$			NULL		(0,0)
 87 | BE	^a$			a		(0,1)
 88 | BE	abc			abc		(0,3)
 89 | BE	abc			xabcy		(1,4)
 90 | BE	abc			ababc		(2,5)
 91 | BE	ab*c			abc		(0,3)
 92 | BE	ab*bc			abc		(0,3)
 93 | BE	ab*bc			abbc		(0,4)
 94 | BE	ab*bc			abbbbc		(0,6)
 95 | E	ab+bc			abbc		(0,4)
 96 | E	ab+bc			abbbbc		(0,6)
 97 | E	ab?bc			abbc		(0,4)
 98 | E	ab?bc			abc		(0,3)
 99 | E	ab?c			abc		(0,3)
100 | BE	^abc$			abc		(0,3)
101 | BE	^abc			abcc		(0,3)
102 | BE	abc$			aabc		(1,4)
103 | BE	^			abc		(0,0)
104 | BE	$			abc		(3,3)
105 | BE	a.c			abc		(0,3)
106 | BE	a.c			axc		(0,3)
107 | BE	a.*c			axyzc		(0,5)
108 | BE	a[bc]d			abd		(0,3)
109 | BE	a[b-d]e			ace		(0,3)
110 | BE	a[b-d]			aac		(1,3)
111 | BE	a[-b]			a-		(0,2)
112 | BE	a[b-]			a-		(0,2)
113 | BE	a]			a]		(0,2)
114 | BE	a[]]b			a]b		(0,3)
115 | BE	a[^bc]d			aed		(0,3)
116 | BE	a[^-b]c			adc		(0,3)
117 | BE	a[^]b]c			adc		(0,3)
118 | E	ab|cd			abc		(0,2)
119 | E	ab|cd			abcd		(0,2)
120 | E	a\(b			a(b		(0,3)
121 | E	a\(*b			ab		(0,2)
122 | E	a\(*b			a((b		(0,4)
123 | E	((a))			abc		(0,1)(0,1)(0,1)
124 | E	(a)b(c)			abc		(0,3)(0,1)(2,3)
125 | E	a+b+c			aabbabc		(4,7)
126 | E	a*			aaa		(0,3)
127 | #E	(a*)*			-		(0,0)(0,0)
128 | E	(a*)*			-		(0,0)(?,?)	RE2/Go
129 | E	(a*)+			-		(0,0)(0,0)
130 | #E	(a*|b)*			-		(0,0)(0,0)
131 | E	(a*|b)*			-		(0,0)(?,?)	RE2/Go
132 | E	(a+|b)*			ab		(0,2)(1,2)
133 | E	(a+|b)+			ab		(0,2)(1,2)
134 | E	(a+|b)?			ab		(0,1)(0,1)
135 | BE	[^ab]*			cde		(0,3)
136 | #E	(^)*			-		(0,0)(0,0)
137 | E	(^)*			-		(0,0)(?,?)	RE2/Go
138 | BE	a*			NULL		(0,0)
139 | E	([abc])*d		abbbcd		(0,6)(4,5)
140 | E	([abc])*bcd		abcd		(0,4)(0,1)
141 | E	a|b|c|d|e		e		(0,1)
142 | E	(a|b|c|d|e)f		ef		(0,2)(0,1)
143 | #E	((a*|b))*		-		(0,0)(0,0)(0,0)
144 | E	((a*|b))*		-		(0,0)(?,?)(?,?)	RE2/Go
145 | BE	abcd*efg		abcdefg		(0,7)
146 | BE	ab*			xabyabbbz	(1,3)
147 | BE	ab*			xayabbbz	(1,2)
148 | E	(ab|cd)e		abcde		(2,5)(2,4)
149 | BE	[abhgefdc]ij		hij		(0,3)
150 | E	(a|b)c*d		abcd		(1,4)(1,2)
151 | E	(ab|ab*)bc		abc		(0,3)(0,1)
152 | E	a([bc]*)c*		abc		(0,3)(1,3)
153 | E	a([bc]*)(c*d)		abcd		(0,4)(1,3)(3,4)
154 | E	a([bc]+)(c*d)		abcd		(0,4)(1,3)(3,4)
155 | E	a([bc]*)(c+d)		abcd		(0,4)(1,2)(2,4)
156 | E	a[bcd]*dcdcde		adcdcde		(0,7)
157 | E	(ab|a)b*c		abc		(0,3)(0,2)
158 | E	((a)(b)c)(d)		abcd		(0,4)(0,3)(0,1)(1,2)(3,4)
159 | BE	[A-Za-z_][A-Za-z0-9_]*	alpha		(0,5)
160 | E	^a(bc+|b[eh])g|.h$	abh		(1,3)
161 | E	(bc+d$|ef*g.|h?i(j|k))	effgz		(0,5)(0,5)
162 | E	(bc+d$|ef*g.|h?i(j|k))	ij		(0,2)(0,2)(1,2)
163 | E	(bc+d$|ef*g.|h?i(j|k))	reffgz		(1,6)(1,6)
164 | E	(((((((((a)))))))))	a		(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
165 | BE	multiple words		multiple words yeah	(0,14)
166 | E	(.*)c(.*)		abcde		(0,5)(0,2)(3,5)
167 | BE	abcd			abcd		(0,4)
168 | E	a(bc)d			abcd		(0,4)(1,3)
169 | E	a[-]?c		ac		(0,3)
170 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Qaddafi	(0,15)(?,?)(10,12)
171 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Mo'ammar Gadhafi	(0,16)(?,?)(11,13)
172 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Kaddafi	(0,15)(?,?)(10,12)
173 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Qadhafi	(0,15)(?,?)(10,12)
174 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Gadafi	(0,14)(?,?)(10,11)
175 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Mu'ammar Qadafi	(0,15)(?,?)(11,12)
176 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Moamar Gaddafi	(0,14)(?,?)(9,11)
177 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Mu'ammar Qadhdhafi	(0,18)(?,?)(13,15)
178 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Khaddafi	(0,16)(?,?)(11,13)
179 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Ghaddafy	(0,16)(?,?)(11,13)
180 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Ghadafi	(0,15)(?,?)(11,12)
181 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Ghaddafi	(0,16)(?,?)(11,13)
182 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muamar Kaddafi	(0,14)(?,?)(9,11)
183 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Quathafi	(0,16)(?,?)(11,13)
184 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Gheddafi	(0,16)(?,?)(11,13)
185 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Moammar Khadafy	(0,15)(?,?)(11,12)
186 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Moammar Qudhafi	(0,15)(?,?)(10,12)
187 | E	a+(b|c)*d+		aabcdd			(0,6)(3,4)
188 | E	^.+$			vivi			(0,4)
189 | E	^(.+)$			vivi			(0,4)(0,4)
190 | E	^([^!.]+).att.com!(.+)$	gryphon.att.com!eby	(0,19)(0,7)(16,19)
191 | E	^([^!]+!)?([^!]+)$	bas			(0,3)(?,?)(0,3)
192 | E	^([^!]+!)?([^!]+)$	bar!bas			(0,7)(0,4)(4,7)
193 | E	^([^!]+!)?([^!]+)$	foo!bas			(0,7)(0,4)(4,7)
194 | E	^.+!([^!]+!)([^!]+)$	foo!bar!bas		(0,11)(4,8)(8,11)
195 | E	((foo)|(bar))!bas	bar!bas			(0,7)(0,3)(?,?)(0,3)
196 | E	((foo)|(bar))!bas	foo!bar!bas		(4,11)(4,7)(?,?)(4,7)
197 | E	((foo)|(bar))!bas	foo!bas			(0,7)(0,3)(0,3)
198 | E	((foo)|bar)!bas		bar!bas			(0,7)(0,3)
199 | E	((foo)|bar)!bas		foo!bar!bas		(4,11)(4,7)
200 | E	((foo)|bar)!bas		foo!bas			(0,7)(0,3)(0,3)
201 | E	(foo|(bar))!bas		bar!bas			(0,7)(0,3)(0,3)
202 | E	(foo|(bar))!bas		foo!bar!bas		(4,11)(4,7)(4,7)
203 | E	(foo|(bar))!bas		foo!bas			(0,7)(0,3)
204 | E	(foo|bar)!bas		bar!bas			(0,7)(0,3)
205 | E	(foo|bar)!bas		foo!bar!bas		(4,11)(4,7)
206 | E	(foo|bar)!bas		foo!bas			(0,7)(0,3)
207 | E	^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$	foo!bar!bas	(0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
208 | E	^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$	bas		(0,3)(?,?)(0,3)
209 | E	^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$	bar!bas		(0,7)(0,4)(4,7)
210 | E	^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$	foo!bar!bas	(0,11)(?,?)(?,?)(4,8)(8,11)
211 | E	^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$	foo!bas		(0,7)(0,4)(4,7)
212 | E	^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$	bas		(0,3)(0,3)(?,?)(0,3)
213 | E	^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$	bar!bas		(0,7)(0,7)(0,4)(4,7)
214 | E	^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$	foo!bar!bas	(0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
215 | E	^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$	foo!bas		(0,7)(0,7)(0,4)(4,7)
216 | E	.*(/XXX).*			/XXX			(0,4)(0,4)
217 | E	.*(\\XXX).*			\XXX			(0,4)(0,4)
218 | E	\\XXX				\XXX			(0,4)
219 | E	.*(/000).*			/000			(0,4)(0,4)
220 | E	.*(\\000).*			\000			(0,4)(0,4)
221 | E	\\000				\000			(0,4)
222 | 


--------------------------------------------------------------------------------
/testdata/nullsubexpr.dat:
--------------------------------------------------------------------------------
 1 | NOTE	null subexpression matches : 2002-06-06
 2 | 
 3 | E	(a*)*		a		(0,1)(0,1)
 4 | #E	SAME		x		(0,0)(0,0)
 5 | E	SAME		x		(0,0)(?,?)	RE2/Go
 6 | E	SAME		aaaaaa		(0,6)(0,6)
 7 | E	SAME		aaaaaax		(0,6)(0,6)
 8 | E	(a*)+		a		(0,1)(0,1)
 9 | E	SAME		x		(0,0)(0,0)
10 | E	SAME		aaaaaa		(0,6)(0,6)
11 | E	SAME		aaaaaax		(0,6)(0,6)
12 | E	(a+)*		a		(0,1)(0,1)
13 | E	SAME		x		(0,0)
14 | E	SAME		aaaaaa		(0,6)(0,6)
15 | E	SAME		aaaaaax		(0,6)(0,6)
16 | E	(a+)+		a		(0,1)(0,1)
17 | E	SAME		x		NOMATCH
18 | E	SAME		aaaaaa		(0,6)(0,6)
19 | E	SAME		aaaaaax		(0,6)(0,6)
20 | 
21 | E	([a]*)*		a		(0,1)(0,1)
22 | #E	SAME		x		(0,0)(0,0)
23 | E	SAME		x		(0,0)(?,?)	RE2/Go
24 | E	SAME		aaaaaa		(0,6)(0,6)
25 | E	SAME		aaaaaax		(0,6)(0,6)
26 | E	([a]*)+		a		(0,1)(0,1)
27 | E	SAME		x		(0,0)(0,0)
28 | E	SAME		aaaaaa		(0,6)(0,6)
29 | E	SAME		aaaaaax		(0,6)(0,6)
30 | E	([^b]*)*	a		(0,1)(0,1)
31 | #E	SAME		b		(0,0)(0,0)
32 | E	SAME		b		(0,0)(?,?)	RE2/Go
33 | E	SAME		aaaaaa		(0,6)(0,6)
34 | E	SAME		aaaaaab		(0,6)(0,6)
35 | E	([ab]*)*	a		(0,1)(0,1)
36 | E	SAME		aaaaaa		(0,6)(0,6)
37 | E	SAME		ababab		(0,6)(0,6)
38 | E	SAME		bababa		(0,6)(0,6)
39 | E	SAME		b		(0,1)(0,1)
40 | E	SAME		bbbbbb		(0,6)(0,6)
41 | E	SAME		aaaabcde	(0,5)(0,5)
42 | E	([^a]*)*	b		(0,1)(0,1)
43 | E	SAME		bbbbbb		(0,6)(0,6)
44 | #E	SAME		aaaaaa		(0,0)(0,0)
45 | E	SAME		aaaaaa		(0,0)(?,?)	RE2/Go
46 | E	([^ab]*)*	ccccxx		(0,6)(0,6)
47 | #E	SAME		ababab		(0,0)(0,0)
48 | E	SAME		ababab		(0,0)(?,?)	RE2/Go
49 | 
50 | E	((z)+|a)*	zabcde		(0,2)(1,2)
51 | 
52 | #{E	a+?		aaaaaa		(0,1)	no *? +? minimal match ops
53 | #E	(a)		aaa		(0,1)(0,1)
54 | #E	(a*?)		aaa		(0,0)(0,0)
55 | #E	(a)*?		aaa		(0,0)
56 | #E	(a*?)*?		aaa		(0,0)
57 | #}
58 | 
59 | B	\(a*\)*\(x\)		x	(0,1)(0,0)(0,1)
60 | B	\(a*\)*\(x\)		ax	(0,2)(0,1)(1,2)
61 | B	\(a*\)*\(x\)		axa	(0,2)(0,1)(1,2)
62 | B	\(a*\)*\(x\)\(\1\)	x	(0,1)(0,0)(0,1)(1,1)
63 | B	\(a*\)*\(x\)\(\1\)	ax	(0,2)(1,1)(1,2)(2,2)
64 | B	\(a*\)*\(x\)\(\1\)	axa	(0,3)(0,1)(1,2)(2,3)
65 | B	\(a*\)*\(x\)\(\1\)\(x\)	axax	(0,4)(0,1)(1,2)(2,3)(3,4)
66 | B	\(a*\)*\(x\)\(\1\)\(x\)	axxa	(0,3)(1,1)(1,2)(2,2)(2,3)
67 | 
68 | #E	(a*)*(x)		x	(0,1)(0,0)(0,1)
69 | E	(a*)*(x)		x	(0,1)(?,?)(0,1)	RE2/Go
70 | E	(a*)*(x)		ax	(0,2)(0,1)(1,2)
71 | E	(a*)*(x)		axa	(0,2)(0,1)(1,2)
72 | 
73 | E	(a*)+(x)		x	(0,1)(0,0)(0,1)
74 | E	(a*)+(x)		ax	(0,2)(0,1)(1,2)
75 | E	(a*)+(x)		axa	(0,2)(0,1)(1,2)
76 | 
77 | E	(a*){2}(x)		x	(0,1)(0,0)(0,1)
78 | E	(a*){2}(x)		ax	(0,2)(1,1)(1,2)
79 | E	(a*){2}(x)		axa	(0,2)(1,1)(1,2)
80 | 


--------------------------------------------------------------------------------
/testdata/re2-exhaustive.txt.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloob/regexp/a9296bf6ee794d8726f013c254654bb606da0b7e/testdata/re2-exhaustive.txt.bz2


--------------------------------------------------------------------------------
/testdata/repetition.dat:
--------------------------------------------------------------------------------
  1 | NOTE	implicit vs. explicit repetitions : 2009-02-02
  2 | 
  3 | # Glenn Fowler <gsf@research.att.com>
  4 | # conforming matches (column 4) must match one of the following BREs
  5 | #	NOMATCH
  6 | #	(0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
  7 | #	(0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
  8 | # i.e., each 3-tuple has two identical elements and one (?,?)
  9 | 
 10 | E	((..)|(.))				NULL		NOMATCH
 11 | E	((..)|(.))((..)|(.))			NULL		NOMATCH
 12 | E	((..)|(.))((..)|(.))((..)|(.))		NULL		NOMATCH
 13 | 
 14 | E	((..)|(.)){1}				NULL		NOMATCH
 15 | E	((..)|(.)){2}				NULL		NOMATCH
 16 | E	((..)|(.)){3}				NULL		NOMATCH
 17 | 
 18 | E	((..)|(.))*				NULL		(0,0)
 19 | 
 20 | E	((..)|(.))				a		(0,1)(0,1)(?,?)(0,1)
 21 | E	((..)|(.))((..)|(.))			a		NOMATCH
 22 | E	((..)|(.))((..)|(.))((..)|(.))		a		NOMATCH
 23 | 
 24 | E	((..)|(.)){1}				a		(0,1)(0,1)(?,?)(0,1)
 25 | E	((..)|(.)){2}				a		NOMATCH
 26 | E	((..)|(.)){3}				a		NOMATCH
 27 | 
 28 | E	((..)|(.))*				a		(0,1)(0,1)(?,?)(0,1)
 29 | 
 30 | E	((..)|(.))				aa		(0,2)(0,2)(0,2)(?,?)
 31 | E	((..)|(.))((..)|(.))			aa		(0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
 32 | E	((..)|(.))((..)|(.))((..)|(.))		aa		NOMATCH
 33 | 
 34 | E	((..)|(.)){1}				aa		(0,2)(0,2)(0,2)(?,?)
 35 | E	((..)|(.)){2}				aa		(0,2)(1,2)(?,?)(1,2)
 36 | E	((..)|(.)){3}				aa		NOMATCH
 37 | 
 38 | E	((..)|(.))*				aa		(0,2)(0,2)(0,2)(?,?)
 39 | 
 40 | E	((..)|(.))				aaa		(0,2)(0,2)(0,2)(?,?)
 41 | E	((..)|(.))((..)|(.))			aaa		(0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
 42 | E	((..)|(.))((..)|(.))((..)|(.))		aaa		(0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
 43 | 
 44 | E	((..)|(.)){1}				aaa		(0,2)(0,2)(0,2)(?,?)
 45 | #E	((..)|(.)){2}				aaa		(0,3)(2,3)(?,?)(2,3)
 46 | E	((..)|(.)){2}				aaa		(0,3)(2,3)(0,2)(2,3)	RE2/Go
 47 | E	((..)|(.)){3}				aaa		(0,3)(2,3)(?,?)(2,3)
 48 | 
 49 | #E	((..)|(.))*				aaa		(0,3)(2,3)(?,?)(2,3)
 50 | E	((..)|(.))*				aaa		(0,3)(2,3)(0,2)(2,3)	RE2/Go
 51 | 
 52 | E	((..)|(.))				aaaa		(0,2)(0,2)(0,2)(?,?)
 53 | E	((..)|(.))((..)|(.))			aaaa		(0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
 54 | E	((..)|(.))((..)|(.))((..)|(.))		aaaa		(0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
 55 | 
 56 | E	((..)|(.)){1}				aaaa		(0,2)(0,2)(0,2)(?,?)
 57 | E	((..)|(.)){2}				aaaa		(0,4)(2,4)(2,4)(?,?)
 58 | #E	((..)|(.)){3}				aaaa		(0,4)(3,4)(?,?)(3,4)
 59 | E	((..)|(.)){3}				aaaa		(0,4)(3,4)(0,2)(3,4)	RE2/Go
 60 | 
 61 | E	((..)|(.))*				aaaa		(0,4)(2,4)(2,4)(?,?)
 62 | 
 63 | E	((..)|(.))				aaaaa		(0,2)(0,2)(0,2)(?,?)
 64 | E	((..)|(.))((..)|(.))			aaaaa		(0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
 65 | E	((..)|(.))((..)|(.))((..)|(.))		aaaaa		(0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
 66 | 
 67 | E	((..)|(.)){1}				aaaaa		(0,2)(0,2)(0,2)(?,?)
 68 | E	((..)|(.)){2}				aaaaa		(0,4)(2,4)(2,4)(?,?)
 69 | #E	((..)|(.)){3}				aaaaa		(0,5)(4,5)(?,?)(4,5)
 70 | E	((..)|(.)){3}				aaaaa		(0,5)(4,5)(2,4)(4,5)	RE2/Go
 71 | 
 72 | #E	((..)|(.))*				aaaaa		(0,5)(4,5)(?,?)(4,5)
 73 | E	((..)|(.))*				aaaaa		(0,5)(4,5)(2,4)(4,5)	RE2/Go
 74 | 
 75 | E	((..)|(.))				aaaaaa		(0,2)(0,2)(0,2)(?,?)
 76 | E	((..)|(.))((..)|(.))			aaaaaa		(0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
 77 | E	((..)|(.))((..)|(.))((..)|(.))		aaaaaa		(0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
 78 | 
 79 | E	((..)|(.)){1}				aaaaaa		(0,2)(0,2)(0,2)(?,?)
 80 | E	((..)|(.)){2}				aaaaaa		(0,4)(2,4)(2,4)(?,?)
 81 | E	((..)|(.)){3}				aaaaaa		(0,6)(4,6)(4,6)(?,?)
 82 | 
 83 | E	((..)|(.))*				aaaaaa		(0,6)(4,6)(4,6)(?,?)
 84 | 
 85 | NOTE	additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
 86 | 
 87 | # These test a bug in OS X / FreeBSD / NetBSD, and libtree. 
 88 | # Linux/GLIBC gets the {8,} and {8,8} wrong.
 89 | 
 90 | :HA#100:E	X(.?){0,}Y	X1234567Y	(0,9)(7,8)
 91 | :HA#101:E	X(.?){1,}Y	X1234567Y	(0,9)(7,8)
 92 | :HA#102:E	X(.?){2,}Y	X1234567Y	(0,9)(7,8)
 93 | :HA#103:E	X(.?){3,}Y	X1234567Y	(0,9)(7,8)
 94 | :HA#104:E	X(.?){4,}Y	X1234567Y	(0,9)(7,8)
 95 | :HA#105:E	X(.?){5,}Y	X1234567Y	(0,9)(7,8)
 96 | :HA#106:E	X(.?){6,}Y	X1234567Y	(0,9)(7,8)
 97 | :HA#107:E	X(.?){7,}Y	X1234567Y	(0,9)(7,8)
 98 | :HA#108:E	X(.?){8,}Y	X1234567Y	(0,9)(8,8)
 99 | #:HA#110:E	X(.?){0,8}Y	X1234567Y	(0,9)(7,8)
100 | :HA#110:E	X(.?){0,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
101 | #:HA#111:E	X(.?){1,8}Y	X1234567Y	(0,9)(7,8)
102 | :HA#111:E	X(.?){1,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
103 | #:HA#112:E	X(.?){2,8}Y	X1234567Y	(0,9)(7,8)
104 | :HA#112:E	X(.?){2,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
105 | #:HA#113:E	X(.?){3,8}Y	X1234567Y	(0,9)(7,8)
106 | :HA#113:E	X(.?){3,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
107 | #:HA#114:E	X(.?){4,8}Y	X1234567Y	(0,9)(7,8)
108 | :HA#114:E	X(.?){4,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
109 | #:HA#115:E	X(.?){5,8}Y	X1234567Y	(0,9)(7,8)
110 | :HA#115:E	X(.?){5,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
111 | #:HA#116:E	X(.?){6,8}Y	X1234567Y	(0,9)(7,8)
112 | :HA#116:E	X(.?){6,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
113 | #:HA#117:E	X(.?){7,8}Y	X1234567Y	(0,9)(7,8)
114 | :HA#117:E	X(.?){7,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
115 | :HA#118:E	X(.?){8,8}Y	X1234567Y	(0,9)(8,8)
116 | 
117 | # These test a fixed bug in my regex-tdfa that did not keep the expanded
118 | # form properly grouped, so right association did the wrong thing with
119 | # these ambiguous patterns (crafted just to test my code when I became
120 | # suspicious of my implementation).  The first subexpression should use
121 | # "ab" then "a" then "bcd".
122 | 
123 | # OS X / FreeBSD / NetBSD badly fail many of these, with impossible
124 | # results like (0,6)(4,5)(6,6).
125 | 
126 | :HA#260:E	(a|ab|c|bcd){0,}(d*)	ababcd	(0,6)(3,6)(6,6)
127 | :HA#261:E	(a|ab|c|bcd){1,}(d*)	ababcd	(0,6)(3,6)(6,6)
128 | :HA#262:E	(a|ab|c|bcd){2,}(d*)	ababcd	(0,6)(3,6)(6,6)
129 | :HA#263:E	(a|ab|c|bcd){3,}(d*)	ababcd	(0,6)(3,6)(6,6)
130 | :HA#264:E	(a|ab|c|bcd){4,}(d*)	ababcd	NOMATCH
131 | :HA#265:E	(a|ab|c|bcd){0,10}(d*)	ababcd	(0,6)(3,6)(6,6)
132 | :HA#266:E	(a|ab|c|bcd){1,10}(d*)	ababcd	(0,6)(3,6)(6,6)
133 | :HA#267:E	(a|ab|c|bcd){2,10}(d*)	ababcd	(0,6)(3,6)(6,6)
134 | :HA#268:E	(a|ab|c|bcd){3,10}(d*)	ababcd	(0,6)(3,6)(6,6)
135 | :HA#269:E	(a|ab|c|bcd){4,10}(d*)	ababcd	NOMATCH
136 | :HA#270:E	(a|ab|c|bcd)*(d*)	ababcd	(0,6)(3,6)(6,6)
137 | :HA#271:E	(a|ab|c|bcd)+(d*)	ababcd	(0,6)(3,6)(6,6)
138 | 
139 | # The above worked on Linux/GLIBC but the following often fail.
140 | # They also trip up OS X / FreeBSD / NetBSD:
141 | 
142 | #:HA#280:E	(ab|a|c|bcd){0,}(d*)	ababcd	(0,6)(3,6)(6,6)
143 | :HA#280:E	(ab|a|c|bcd){0,}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
144 | #:HA#281:E	(ab|a|c|bcd){1,}(d*)	ababcd	(0,6)(3,6)(6,6)
145 | :HA#281:E	(ab|a|c|bcd){1,}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
146 | #:HA#282:E	(ab|a|c|bcd){2,}(d*)	ababcd	(0,6)(3,6)(6,6)
147 | :HA#282:E	(ab|a|c|bcd){2,}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
148 | #:HA#283:E	(ab|a|c|bcd){3,}(d*)	ababcd	(0,6)(3,6)(6,6)
149 | :HA#283:E	(ab|a|c|bcd){3,}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
150 | :HA#284:E	(ab|a|c|bcd){4,}(d*)	ababcd	NOMATCH
151 | #:HA#285:E	(ab|a|c|bcd){0,10}(d*)	ababcd	(0,6)(3,6)(6,6)
152 | :HA#285:E	(ab|a|c|bcd){0,10}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
153 | #:HA#286:E	(ab|a|c|bcd){1,10}(d*)	ababcd	(0,6)(3,6)(6,6)
154 | :HA#286:E	(ab|a|c|bcd){1,10}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
155 | #:HA#287:E	(ab|a|c|bcd){2,10}(d*)	ababcd	(0,6)(3,6)(6,6)
156 | :HA#287:E	(ab|a|c|bcd){2,10}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
157 | #:HA#288:E	(ab|a|c|bcd){3,10}(d*)	ababcd	(0,6)(3,6)(6,6)
158 | :HA#288:E	(ab|a|c|bcd){3,10}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
159 | :HA#289:E	(ab|a|c|bcd){4,10}(d*)	ababcd	NOMATCH
160 | #:HA#290:E	(ab|a|c|bcd)*(d*)	ababcd	(0,6)(3,6)(6,6)
161 | :HA#290:E	(ab|a|c|bcd)*(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
162 | #:HA#291:E	(ab|a|c|bcd)+(d*)	ababcd	(0,6)(3,6)(6,6)
163 | :HA#291:E	(ab|a|c|bcd)+(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
164 | 


--------------------------------------------------------------------------------