├── LICENSE
├── all_test.go
├── backtrack.go
├── example_test.go
├── exec.go
├── exec2_test.go
├── exec_test.go
├── find_test.go
├── go.mod
├── onepass.go
├── onepass_test.go
├── regexp.go
├── syntax
    ├── compile.go
    ├── doc.go
    ├── op_string.go
    ├── parse.go
    ├── parse_test.go
    ├── perl_groups.go
    ├── prog.go
    ├── prog_test.go
    ├── regexp.go
    ├── simplify.go
    └── simplify_test.go
└── testdata
    ├── README
    ├── basic.dat
    ├── nullsubexpr.dat
    ├── re2-exhaustive.txt.bz2
    ├── re2-search.txt
    ├── repetition.dat
    └── testregex.c


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2009 The Go Authors. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 |    * Redistributions of source code must retain the above copyright
 8 | notice, this list of conditions and the following disclaimer.
 9 |    * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 |    * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/all_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2009 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package binaryregexp
  6 | 
  7 | import (
  8 | 	"reflect"
  9 | 	"strings"
 10 | 	"testing"
 11 | 	"unicode/utf8"
 12 | 
 13 | 	"rsc.io/binaryregexp/syntax"
 14 | )
 15 | 
 16 | var goodRe = []string{
 17 | 	``,
 18 | 	`.`,
 19 | 	`^.$`,
 20 | 	`a`,
 21 | 	`a*`,
 22 | 	`a+`,
 23 | 	`a?`,
 24 | 	`a|b`,
 25 | 	`a*|b*`,
 26 | 	`(a*|b)(c*|d)`,
 27 | 	`[a-z]`,
 28 | 	`[a-abc-c\-\]\[]`,
 29 | 	`[a-z]+`,
 30 | 	`[abc]`,
 31 | 	`[^1234]`,
 32 | 	`[^\n]`,
 33 | 	`\!\\`,
 34 | }
 35 | 
 36 | type stringError struct {
 37 | 	re  string
 38 | 	err string
 39 | }
 40 | 
 41 | var badRe = []stringError{
 42 | 	{`*`, "missing argument to repetition operator: `*`"},
 43 | 	{`+`, "missing argument to repetition operator: `+`"},
 44 | 	{`?`, "missing argument to repetition operator: `?`"},
 45 | 	{`(abc`, "missing closing ): `(abc`"},
 46 | 	{`abc)`, "unexpected ): `abc)`"},
 47 | 	{`x[a-z`, "missing closing ]: `[a-z`"},
 48 | 	{`[z-a]`, "invalid character class range: `z-a`"},
 49 | 	{`abc\`, "trailing backslash at end of expression"},
 50 | 	{`a**`, "invalid nested repetition operator: `**`"},
 51 | 	{`a*+`, "invalid nested repetition operator: `*+`"},
 52 | 	{`\x`, "invalid escape sequence: `\\x`"},
 53 | }
 54 | 
 55 | // compileTest compiles expr and checks the outcome against error, the
 56 | // expected error text. An empty error means compilation must succeed;
 57 | // a non-empty error means compilation must fail with a message that
 58 | // contains it. Mismatches are reported on t without stopping the test.
 59 | // It returns whatever Compile returned; callers check the result for nil
 60 | // before using it.
 61 | func compileTest(t *testing.T, expr string, error string) *Regexp {
 62 | 	// Attribute failures to the calling test rather than to this helper.
 63 | 	t.Helper()
 64 | 	re, err := Compile(expr)
 65 | 	switch {
 66 | 	case error == "" && err != nil:
 67 | 		t.Error("compiling `", expr, "`; unexpected error: ", err.Error())
 68 | 	case error != "" && err == nil:
 69 | 		t.Error("compiling `", expr, "`; missing error")
 70 | 	case error != "" && !strings.Contains(err.Error(), error):
 71 | 		// err is non-nil here: the previous case handled err == nil.
 72 | 		t.Error("compiling `", expr, "`; wrong error: ", err.Error(), "; want ", error)
 73 | 	}
 74 | 	return re
 75 | }
 67 | 
 68 | // TestGoodCompile checks that every pattern listed in goodRe compiles
 69 | // without error.
 70 | func TestGoodCompile(t *testing.T) {
 71 | 	for _, expr := range goodRe {
 72 | 		compileTest(t, expr, "")
 73 | 	}
 74 | }
 73 | 
 74 | // TestBadCompile checks that every pattern listed in badRe fails to
 75 | // compile and that the error text contains the expected message.
 76 | func TestBadCompile(t *testing.T) {
 77 | 	for _, bad := range badRe {
 78 | 		compileTest(t, bad.re, bad.err)
 79 | 	}
 80 | }
 79 | 
 80 | func matchTest(t *testing.T, test *FindTest) {
 81 | 	re := compileTest(t, test.pat, "")
 82 | 	if re == nil {
 83 | 		return
 84 | 	}
 85 | 	m := re.MatchString(test.text)
 86 | 	if m != (len(test.matches) > 0) {
 87 | 		t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0)
 88 | 	}
 89 | 	// now try bytes
 90 | 	m = re.Match([]byte(test.text))
 91 | 	if m != (len(test.matches) > 0) {
 92 | 		t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0)
 93 | 	}
 94 | }
 95 | 
 96 | func TestMatch(t *testing.T) {
 97 | 	for _, test := range findTests {
 98 | 		matchTest(t, &test)
 99 | 	}
100 | }
101 | 
102 | // matchFunctionTest checks the package-level MatchString function against
103 | // one FindTest: the call must not return an error, and it must report a
104 | // match exactly when the test lists at least one expected match.
105 | func matchFunctionTest(t *testing.T, test *FindTest) {
106 | 	m, err := MatchString(test.pat, test.text)
107 | 	if err != nil {
108 | 		// No usable match result on error; report it and skip the comparison.
109 | 		t.Errorf("MatchString(%q, %q): unexpected error: %v", test.pat, test.text, err)
110 | 		return
111 | 	}
112 | 	if m != (len(test.matches) > 0) {
113 | 		t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0)
114 | 	}
115 | }
111 | 
112 | func TestMatchFunction(t *testing.T) {
113 | 	for _, test := range findTests {
114 | 		matchFunctionTest(t, &test)
115 | 	}
116 | }
117 | 
118 | func copyMatchTest(t *testing.T, test *FindTest) {
119 | 	re := compileTest(t, test.pat, "")
120 | 	if re == nil {
121 | 		return
122 | 	}
123 | 	m1 := re.MatchString(test.text)
124 | 	m2 := re.Copy().MatchString(test.text)
125 | 	if m1 != m2 {
126 | 		t.Errorf("Copied Regexp match failure on %s: original gave %t; copy gave %t; should be %t",
127 | 			test, m1, m2, len(test.matches) > 0)
128 | 	}
129 | }
130 | 
131 | func TestCopyMatch(t *testing.T) {
132 | 	for _, test := range findTests {
133 | 		copyMatchTest(t, &test)
134 | 	}
135 | }
136 | 
137 | type ReplaceTest struct {
138 | 	pattern, replacement, input, output string
139 | }
140 | 
141 | var replaceTests = []ReplaceTest{
142 | 	// Test empty input and/or replacement, with pattern that matches the empty string.
143 | 	{"", "", "", ""},
144 | 	{"", "x", "", "x"},
145 | 	{"", "", "abc", "abc"},
146 | 	{"", "x", "abc", "xaxbxcx"},
147 | 
148 | 	// Test empty input and/or replacement, with pattern that does not match the empty string.
149 | 	{"b", "", "", ""},
150 | 	{"b", "x", "", ""},
151 | 	{"b", "", "abc", "ac"},
152 | 	{"b", "x", "abc", "axc"},
153 | 	{"y", "", "", ""},
154 | 	{"y", "x", "", ""},
155 | 	{"y", "", "abc", "abc"},
156 | 	{"y", "x", "abc", "abc"},
157 | 
158 | 	// Multibyte characters -- verify that we don't try to match in the middle
159 | 	// of a character.
160 | 	{"[a-c]*", "x", "\u65e5", "x\xe6x\x97x\xa5x"},
161 | 	{"[^\u65e5]", "x", "abc\u65e5def", "xxxxxxxxx"},
162 | 
163 | 	// Start and end of a string.
164 | 	{"^[a-c]*", "x", "abcdabc", "xdabc"},
165 | 	{"[a-c]*$", "x", "abcdabc", "abcdx"},
166 | 	{"^[a-c]*$", "x", "abcdabc", "abcdabc"},
167 | 	{"^[a-c]*", "x", "abc", "x"},
168 | 	{"[a-c]*$", "x", "abc", "x"},
169 | 	{"^[a-c]*$", "x", "abc", "x"},
170 | 	{"^[a-c]*", "x", "dabce", "xdabce"},
171 | 	{"[a-c]*$", "x", "dabce", "dabcex"},
172 | 	{"^[a-c]*$", "x", "dabce", "dabce"},
173 | 	{"^[a-c]*", "x", "", "x"},
174 | 	{"[a-c]*$", "x", "", "x"},
175 | 	{"^[a-c]*$", "x", "", "x"},
176 | 
177 | 	{"^[a-c]+", "x", "abcdabc", "xdabc"},
178 | 	{"[a-c]+$", "x", "abcdabc", "abcdx"},
179 | 	{"^[a-c]+$", "x", "abcdabc", "abcdabc"},
180 | 	{"^[a-c]+", "x", "abc", "x"},
181 | 	{"[a-c]+$", "x", "abc", "x"},
182 | 	{"^[a-c]+$", "x", "abc", "x"},
183 | 	{"^[a-c]+", "x", "dabce", "dabce"},
184 | 	{"[a-c]+$", "x", "dabce", "dabce"},
185 | 	{"^[a-c]+$", "x", "dabce", "dabce"},
186 | 	{"^[a-c]+", "x", "", ""},
187 | 	{"[a-c]+$", "x", "", ""},
188 | 	{"^[a-c]+$", "x", "", ""},
189 | 
190 | 	// Other cases.
191 | 	{"abc", "def", "abcdefg", "defdefg"},
192 | 	{"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"},
193 | 	{"abc", "", "abcdabc", "d"},
194 | 	{"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"},
195 | 	{"abc", "d", "", ""},
196 | 	{"abc", "d", "abc", "d"},
197 | 	{".+", "x", "abc", "x"},
198 | 	{"[a-c]*", "x", "def", "xdxexfx"},
199 | 	{"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"},
200 | 	{"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"},
201 | 
202 | 	// Substitutions
203 | 	{"a+", "($0)", "banana", "b(a)n(a)n(a)"},
204 | 	{"a+", "(${0})", "banana", "b(a)n(a)n(a)"},
205 | 	{"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"},
206 | 	{"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"},
207 | 	{"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, world"},
208 | 	{"hello, (.+)", "goodbye, $1x", "hello, world", "goodbye, "},
209 | 	{"hello, (.+)", "goodbye, ${1}x", "hello, world", "goodbye, worldx"},
210 | 	{"hello, (.+)", "<$0><$1><$2><$3>", "hello, world", "<hello, world><world><><>"},
211 | 	{"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, world!"},
212 | 	{"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, world"},
213 | 	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "hihihi"},
214 | 	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "byebyebye"},
215 | 	{"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", ""},
216 | 	{"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "hiyz"},
217 | 	{"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $x"},
218 | 	{"a+", "${oops", "aaa", "${oops"},
219 | 	{"a+", "$$", "aaa", "$"},
220 | 	{"a+", "$", "aaa", "$"},
221 | 
222 | 	// Substitution when subexpression isn't found
223 | 	{"(x)?", "$1", "123", "123"},
224 | 	{"abc", "$1", "123", "123"},
225 | 
226 | 	// Substitutions involving a (x){0}
227 | 	{"(a)(b){0}(c)", ".$1|$3.", "xacxacx", "x.a|c.x.a|c.x"},
228 | 	{"(a)(((b))){0}c", ".$1.", "xacxacx", "x.a.x.a.x"},
229 | 	{"((a(b){0}){3}){5}(h)", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"},
230 | 	{"((a(b){0}){3}){5}h", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"},
231 | }
232 | 
233 | var replaceLiteralTests = []ReplaceTest{
234 | 	// Substitutions
235 | 	{"a+", "($0)", "banana", "b($0)n($0)n($0)"},
236 | 	{"a+", "(${0})", "banana", "b(${0})n(${0})n(${0})"},
237 | 	{"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"},
238 | 	{"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"},
239 | 	{"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, ${1}"},
240 | 	{"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, $noun!"},
241 | 	{"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, ${noun}"},
242 | 	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "$x$x$x"},
243 | 	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "$x$x$x"},
244 | 	{"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", "$xyz"},
245 | 	{"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "${x}yz"},
246 | 	{"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $$x"},
247 | 	{"a+", "${oops", "aaa", "${oops"},
248 | 	{"a+", "$$", "aaa", "$$"},
249 | 	{"a+", "$", "aaa", "$"},
250 | }
251 | 
252 | type ReplaceFuncTest struct {
253 | 	pattern       string
254 | 	replacement   func(string) string
255 | 	input, output string
256 | }
257 | 
258 | var replaceFuncTests = []ReplaceFuncTest{
259 | 	{"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"},
260 | 	{"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"},
261 | 	{"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"},
262 | }
263 | 
264 | func TestReplaceAll(t *testing.T) {
265 | 	for _, tc := range replaceTests {
266 | 		re, err := Compile(tc.pattern)
267 | 		if err != nil {
268 | 			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
269 | 			continue
270 | 		}
271 | 		actual := re.ReplaceAllString(tc.input, tc.replacement)
272 | 		if actual != tc.output {
273 | 			t.Errorf("%q.ReplaceAllString(%q,%q) = %q; want %q",
274 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
275 | 		}
276 | 		// now try bytes
277 | 		actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement)))
278 | 		if actual != tc.output {
279 | 			t.Errorf("%q.ReplaceAll(%q,%q) = %q; want %q",
280 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
281 | 		}
282 | 	}
283 | }
284 | 
285 | func TestReplaceAllLiteral(t *testing.T) {
286 | 	// Run ReplaceAll tests that do not have $ expansions.
287 | 	for _, tc := range replaceTests {
288 | 		if strings.Contains(tc.replacement, "$") {
289 | 			continue
290 | 		}
291 | 		re, err := Compile(tc.pattern)
292 | 		if err != nil {
293 | 			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
294 | 			continue
295 | 		}
296 | 		actual := re.ReplaceAllLiteralString(tc.input, tc.replacement)
297 | 		if actual != tc.output {
298 | 			t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q",
299 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
300 | 		}
301 | 		// now try bytes
302 | 		actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement)))
303 | 		if actual != tc.output {
304 | 			t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q",
305 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
306 | 		}
307 | 	}
308 | 
309 | 	// Run literal-specific tests.
310 | 	for _, tc := range replaceLiteralTests {
311 | 		re, err := Compile(tc.pattern)
312 | 		if err != nil {
313 | 			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
314 | 			continue
315 | 		}
316 | 		actual := re.ReplaceAllLiteralString(tc.input, tc.replacement)
317 | 		if actual != tc.output {
318 | 			t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q",
319 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
320 | 		}
321 | 		// now try bytes
322 | 		actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement)))
323 | 		if actual != tc.output {
324 | 			t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q",
325 | 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
326 | 		}
327 | 	}
328 | }
329 | 
330 | // TestReplaceAllFunc runs each replaceFuncTests case through both
331 | // ReplaceAllStringFunc and ReplaceAllFunc and compares the result with the
332 | // expected output. Failure messages name the method that produced the
333 | // wrong result so the string and byte variants can be told apart.
334 | func TestReplaceAllFunc(t *testing.T) {
335 | 	for _, tc := range replaceFuncTests {
336 | 		re, err := Compile(tc.pattern)
337 | 		if err != nil {
338 | 			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
339 | 			continue
340 | 		}
341 | 		actual := re.ReplaceAllStringFunc(tc.input, tc.replacement)
342 | 		if actual != tc.output {
343 | 			t.Errorf("%q.ReplaceAllStringFunc(%q,fn) = %q; want %q",
344 | 				tc.pattern, tc.input, actual, tc.output)
345 | 		}
346 | 		// now try bytes, adapting the string callback to the []byte signature
347 | 		actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) }))
348 | 		if actual != tc.output {
349 | 			t.Errorf("%q.ReplaceAllFunc(%q,fn) = %q; want %q",
350 | 				tc.pattern, tc.input, actual, tc.output)
351 | 		}
352 | 	}
353 | }
350 | 
351 | type MetaTest struct {
352 | 	pattern, output, literal string
353 | 	isLiteral                bool
354 | }
355 | 
356 | var metaTests = []MetaTest{
357 | 	{``, ``, ``, true},
358 | 	{`foo`, `foo`, `foo`, true},
359 | 	{`ÿ+`, `ÿ\+`, `ÿ`, false},
360 | 	{`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator
361 | 	{`foo.\$`, `foo\.\\\$`, `foo`, false},     // has escaped operators and real operators
362 | 	{`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false},
363 | }
364 | 
365 | var literalPrefixTests = []MetaTest{
366 | 	// See golang.org/issue/11175.
367 | 	// output is unused.
368 | 	{`^0^0$`, ``, `0`, false},
369 | 	{`^0^`, ``, ``, false},
370 | 	{`^0$`, ``, `0`, true},
371 | 	{`$0^`, ``, ``, false},
372 | 	{`$0$`, ``, ``, false},
373 | 	{`^^0$$`, ``, ``, false},
374 | 	{`^$^$`, ``, ``, false},
375 | 	{`$$0^^`, ``, ``, false},
376 | }
377 | 
378 | func TestQuoteMeta(t *testing.T) {
379 | 	for _, tc := range metaTests {
380 | 		// Verify that QuoteMeta returns the expected string.
381 | 		quoted := QuoteMeta(tc.pattern)
382 | 		if quoted != tc.output {
383 | 			t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`",
384 | 				tc.pattern, quoted, tc.output)
385 | 			continue
386 | 		}
387 | 
388 | 		// Verify that the quoted string is in fact treated as expected
389 | 		// by Compile -- i.e. that it matches the original, unquoted string.
390 | 		if tc.pattern != "" {
391 | 			re, err := Compile(quoted)
392 | 			if err != nil {
393 | 				t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err)
394 | 				continue
395 | 			}
396 | 			src := "abc" + toLatin1(tc.pattern) + "def"
397 | 			repl := "xyz"
398 | 			replaced := re.ReplaceAllString(src, repl)
399 | 			expected := "abcxyzdef"
400 | 			if replaced != expected {
401 | 				t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`",
402 | 					tc.pattern, src, repl, replaced, expected)
403 | 			}
404 | 		}
405 | 	}
406 | }
407 | 
408 | // toLatin1 returns a string holding one byte per rune of s, each byte
409 | // being that rune's code point. It panics if s contains a rune above
410 | // 0xff, since such a rune does not fit in a single byte.
411 | func toLatin1(s string) string {
412 | 	codepoints := []rune(s)
413 | 	latin := make([]byte, 0, len(codepoints))
414 | 	for _, cp := range codepoints {
415 | 		if cp > 0xff {
416 | 			panic("cannot toLatin1")
417 | 		}
418 | 		latin = append(latin, byte(cp))
419 | 	}
420 | 	return string(latin)
421 | }
419 | 
420 | func TestLiteralPrefix(t *testing.T) {
421 | 	for _, tc := range append(metaTests, literalPrefixTests...) {
422 | 		// Literal method needs to scan the pattern.
423 | 		re := MustCompile(tc.pattern)
424 | 		str, complete := re.LiteralPrefix()
425 | 		if complete != tc.isLiteral {
426 | 			t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral)
427 | 		}
428 | 		if str != toLatin1(tc.literal) {
429 | 			t.Errorf("LiteralPrefix(`%s`) = %#q; want %#q", tc.pattern, str, toLatin1(tc.literal))
430 | 		}
431 | 	}
432 | }
433 | 
434 | type subexpCase struct {
435 | 	input string
436 | 	num   int
437 | 	names []string
438 | }
439 | 
440 | var subexpCases = []subexpCase{
441 | 	{``, 0, nil},
442 | 	{`.*`, 0, nil},
443 | 	{`abba`, 0, nil},
444 | 	{`ab(b)a`, 1, []string{"", ""}},
445 | 	{`ab(.*)a`, 1, []string{"", ""}},
446 | 	{`(.*)ab(.*)a`, 2, []string{"", "", ""}},
447 | 	{`(.*)(ab)(.*)a`, 3, []string{"", "", "", ""}},
448 | 	{`(.*)((a)b)(.*)a`, 4, []string{"", "", "", "", ""}},
449 | 	{`(.*)(\(ab)(.*)a`, 3, []string{"", "", "", ""}},
450 | 	{`(.*)(\(a\)b)(.*)a`, 3, []string{"", "", "", ""}},
451 | 	{`(?P<foo>.*)(?P<bar>(a)b)(?P<foo>.*)a`, 4, []string{"", "foo", "bar", "", "foo"}},
452 | }
453 | 
454 | // TestSubexp checks NumSubexp and SubexpNames for each entry in
455 | // subexpCases. SubexpNames must have exactly one more element than
456 | // NumSubexp reports; individual names are compared only when the case
457 | // supplies a names slice.
458 | func TestSubexp(t *testing.T) {
459 | 	for _, c := range subexpCases {
460 | 		re := MustCompile(c.input)
461 | 		n := re.NumSubexp()
462 | 		if n != c.num {
463 | 			t.Errorf("%q: NumSubexp = %d, want %d", c.input, n, c.num)
464 | 			continue
465 | 		}
466 | 		names := re.SubexpNames()
467 | 		if len(names) != 1+n {
468 | 			// Report the length actually required (1+n), not n.
469 | 			t.Errorf("%q: len(SubexpNames) = %d, want %d", c.input, len(names), 1+n)
470 | 			continue
471 | 		}
472 | 		if c.names != nil {
473 | 			for i := 0; i < 1+n; i++ {
474 | 				if names[i] != c.names[i] {
475 | 					t.Errorf("%q: SubexpNames[%d] = %q, want %q", c.input, i, names[i], c.names[i])
476 | 				}
477 | 			}
478 | 		}
479 | 	}
480 | }
476 | 
477 | var splitTests = []struct {
478 | 	s   string
479 | 	r   string
480 | 	n   int
481 | 	out []string
482 | }{
483 | 	{"foo:and:bar", ":", -1, []string{"foo", "and", "bar"}},
484 | 	{"foo:and:bar", ":", 1, []string{"foo:and:bar"}},
485 | 	{"foo:and:bar", ":", 2, []string{"foo", "and:bar"}},
486 | 	{"foo:and:bar", "foo", -1, []string{"", ":and:bar"}},
487 | 	{"foo:and:bar", "bar", -1, []string{"foo:and:", ""}},
488 | 	{"foo:and:bar", "baz", -1, []string{"foo:and:bar"}},
489 | 	{"baabaab", "a", -1, []string{"b", "", "b", "", "b"}},
490 | 	{"baabaab", "a*", -1, []string{"b", "b", "b"}},
491 | 	{"baabaab", "ba*", -1, []string{"", "", "", ""}},
492 | 	{"foobar", "f*b*", -1, []string{"", "o", "o", "a", "r"}},
493 | 	{"foobar", "f+.*b+", -1, []string{"", "ar"}},
494 | 	{"foobooboar", "o{2}", -1, []string{"f", "b", "boar"}},
495 | 	{"a,b,c,d,e,f", ",", 3, []string{"a", "b", "c,d,e,f"}},
496 | 	{"a,b,c,d,e,f", ",", 0, nil},
497 | 	{",", ",", -1, []string{"", ""}},
498 | 	{",,,", ",", -1, []string{"", "", "", ""}},
499 | 	{"", ",", -1, []string{""}},
500 | 	{"", ".*", -1, []string{""}},
501 | 	{"", ".+", -1, []string{""}},
502 | 	{"", "", -1, []string{}},
503 | 	{"foobar", "", -1, []string{"f", "o", "o", "b", "a", "r"}},
504 | 	{"abaabaccadaaae", "a*", 5, []string{"", "b", "b", "c", "cadaaae"}},
505 | 	{":x:y:z:", ":", -1, []string{"", "x", "y", "z", ""}},
506 | }
507 | 
508 | func TestSplit(t *testing.T) {
509 | 	for i, test := range splitTests {
510 | 		re, err := Compile(test.r)
511 | 		if err != nil {
512 | 			t.Errorf("#%d: %q: compile error: %s", i, test.r, err.Error())
513 | 			continue
514 | 		}
515 | 
516 | 		split := re.Split(test.s, test.n)
517 | 		if !reflect.DeepEqual(split, test.out) {
518 | 			t.Errorf("#%d: %q: got %q; want %q", i, test.r, split, test.out)
519 | 		}
520 | 
521 | 		if QuoteMeta(test.r) == test.r {
522 | 			strsplit := strings.SplitN(test.s, test.r, test.n)
523 | 			if !reflect.DeepEqual(split, strsplit) {
524 | 				t.Errorf("#%d: Split(%q, %q, %d): regexp vs strings mismatch\nregexp=%q\nstrings=%q", i, test.s, test.r, test.n, split, strsplit)
525 | 			}
526 | 		}
527 | 	}
528 | }
529 | 
530 | // The following sequence of Match calls used to panic. See issue #12980.
531 | func TestParseAndCompile(t *testing.T) {
532 | 	expr := "a$"
533 | 	s := "a\nb"
534 | 
535 | 	for i, tc := range []struct {
536 | 		reFlags  syntax.Flags
537 | 		expMatch bool
538 | 	}{
539 | 		{syntax.Perl | syntax.OneLine, false},
540 | 		{syntax.Perl &^ syntax.OneLine, true},
541 | 	} {
542 | 		parsed, err := syntax.Parse(expr, tc.reFlags)
543 | 		if err != nil {
544 | 			t.Fatalf("%d: parse: %v", i, err)
545 | 		}
546 | 		re, err := Compile(parsed.String())
547 | 		if err != nil {
548 | 			t.Fatalf("%d: compile: %v", i, err)
549 | 		}
550 | 		if match := re.MatchString(s); match != tc.expMatch {
551 | 			t.Errorf("%d: %q.MatchString(%q)=%t; expected=%t", i, re, s, match, tc.expMatch)
552 | 		}
553 | 	}
554 | }
555 | 
556 | // TestOnePassCutoff checks that the one-pass cutoff does trigger: for the
557 | // pattern below, compileOnePass must return nil after the pattern has been
558 | // parsed, simplified and compiled to a program.
559 | func TestOnePassCutoff(t *testing.T) {
560 | 	re, err := syntax.Parse(`^x{1,1000}y{1,1000}$`, syntax.Perl)
561 | 	if err != nil {
562 | 		t.Fatalf("parse: %v", err)
563 | 	}
564 | 	p, err := syntax.Compile(re.Simplify())
565 | 	if err != nil {
566 | 		t.Fatalf("compile: %v", err)
567 | 	}
568 | 	if compileOnePass(p) != nil {
569 | 		t.Fatalf("compileOnePass succeeded; wanted nil")
570 | 	}
571 | }
570 | 
571 | // Check that the same machine can be used with the standard matcher
572 | // and then the backtracker when there are no captures.
573 | func TestSwitchBacktrack(t *testing.T) {
574 | 	re := MustCompile(`a|b`)
575 | 	long := make([]byte, maxBacktrackVector+1)
576 | 
577 | 	// The following sequence of Match calls used to panic. See issue #10319.
578 | 	re.Match(long)     // triggers standard matcher
579 | 	re.Match(long[:1]) // triggers backtracker
580 | }
581 | 
582 | func BenchmarkFind(b *testing.B) {
583 | 	b.StopTimer()
584 | 	re := MustCompile("a+b+")
585 | 	wantSubs := "aaabb"
586 | 	s := []byte("acbb" + wantSubs + "dd")
587 | 	b.StartTimer()
588 | 	b.ReportAllocs()
589 | 	for i := 0; i < b.N; i++ {
590 | 		subs := re.Find(s)
591 | 		if string(subs) != wantSubs {
592 | 			b.Fatalf("Find(%q) = %q; want %q", s, subs, wantSubs)
593 | 		}
594 | 	}
595 | }
596 | 
597 | func BenchmarkFindAllNoMatches(b *testing.B) {
598 | 	re := MustCompile("a+b+")
599 | 	s := []byte("acddee")
600 | 	b.ReportAllocs()
601 | 	b.ResetTimer()
602 | 	for i := 0; i < b.N; i++ {
603 | 		all := re.FindAll(s, -1)
604 | 		if all != nil {
605 | 			b.Fatalf("FindAll(%q) = %q; want nil", s, all)
606 | 		}
607 | 	}
608 | }
609 | 
610 | func BenchmarkFindString(b *testing.B) {
611 | 	b.StopTimer()
612 | 	re := MustCompile("a+b+")
613 | 	wantSubs := "aaabb"
614 | 	s := "acbb" + wantSubs + "dd"
615 | 	b.StartTimer()
616 | 	b.ReportAllocs()
617 | 	for i := 0; i < b.N; i++ {
618 | 		subs := re.FindString(s)
619 | 		if subs != wantSubs {
620 | 			b.Fatalf("FindString(%q) = %q; want %q", s, subs, wantSubs)
621 | 		}
622 | 	}
623 | }
624 | 
625 | func BenchmarkFindSubmatch(b *testing.B) {
626 | 	b.StopTimer()
627 | 	re := MustCompile("a(a+b+)b")
628 | 	wantSubs := "aaabb"
629 | 	s := []byte("acbb" + wantSubs + "dd")
630 | 	b.StartTimer()
631 | 	b.ReportAllocs()
632 | 	for i := 0; i < b.N; i++ {
633 | 		subs := re.FindSubmatch(s)
634 | 		if string(subs[0]) != wantSubs {
635 | 			b.Fatalf("FindSubmatch(%q)[0] = %q; want %q", s, subs[0], wantSubs)
636 | 		}
637 | 		if string(subs[1]) != "aab" {
638 | 			b.Fatalf("FindSubmatch(%q)[1] = %q; want %q", s, subs[1], "aab")
639 | 		}
640 | 	}
641 | }
642 | 
643 | func BenchmarkFindStringSubmatch(b *testing.B) {
644 | 	b.StopTimer()
645 | 	re := MustCompile("a(a+b+)b")
646 | 	wantSubs := "aaabb"
647 | 	s := "acbb" + wantSubs + "dd"
648 | 	b.StartTimer()
649 | 	b.ReportAllocs()
650 | 	for i := 0; i < b.N; i++ {
651 | 		subs := re.FindStringSubmatch(s)
652 | 		if subs[0] != wantSubs {
653 | 			b.Fatalf("FindStringSubmatch(%q)[0] = %q; want %q", s, subs[0], wantSubs)
654 | 		}
655 | 		if subs[1] != "aab" {
656 | 			b.Fatalf("FindStringSubmatch(%q)[1] = %q; want %q", s, subs[1], "aab")
657 | 		}
658 | 	}
659 | }
660 | 
661 | func BenchmarkLiteral(b *testing.B) {
662 | 	x := strings.Repeat("x", 50) + "y"
663 | 	b.StopTimer()
664 | 	re := MustCompile("y")
665 | 	b.StartTimer()
666 | 	for i := 0; i < b.N; i++ {
667 | 		if !re.MatchString(x) {
668 | 			b.Fatalf("no match!")
669 | 		}
670 | 	}
671 | }
672 | 
673 | func BenchmarkNotLiteral(b *testing.B) {
674 | 	x := strings.Repeat("x", 50) + "y"
675 | 	b.StopTimer()
676 | 	re := MustCompile(".y")
677 | 	b.StartTimer()
678 | 	for i := 0; i < b.N; i++ {
679 | 		if !re.MatchString(x) {
680 | 			b.Fatalf("no match!")
681 | 		}
682 | 	}
683 | }
684 | 
685 | func BenchmarkMatchClass(b *testing.B) {
686 | 	b.StopTimer()
687 | 	x := strings.Repeat("xxxx", 20) + "w"
688 | 	re := MustCompile("[abcdw]")
689 | 	b.StartTimer()
690 | 	for i := 0; i < b.N; i++ {
691 | 		if !re.MatchString(x) {
692 | 			b.Fatalf("no match!")
693 | 		}
694 | 	}
695 | }
696 | 
697 | func BenchmarkMatchClass_InRange(b *testing.B) {
698 | 	b.StopTimer()
699 | 	// 'b' is between 'a' and 'c', so the charclass
700 | 	// range checking is no help here.
701 | 	x := strings.Repeat("bbbb", 20) + "c"
702 | 	re := MustCompile("[ac]")
703 | 	b.StartTimer()
704 | 	for i := 0; i < b.N; i++ {
705 | 		if !re.MatchString(x) {
706 | 			b.Fatalf("no match!")
707 | 		}
708 | 	}
709 | }
710 | 
711 | func BenchmarkReplaceAll(b *testing.B) {
712 | 	x := "abcdefghijklmnopqrstuvwxyz"
713 | 	b.StopTimer()
714 | 	re := MustCompile("[cjrw]")
715 | 	b.StartTimer()
716 | 	for i := 0; i < b.N; i++ {
717 | 		re.ReplaceAllString(x, "")
718 | 	}
719 | }
720 | 
721 | func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) {
722 | 	b.StopTimer()
723 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
724 | 	re := MustCompile("^zbc(d|e)")
725 | 	b.StartTimer()
726 | 	for i := 0; i < b.N; i++ {
727 | 		re.Match(x)
728 | 	}
729 | }
730 | 
731 | func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) {
732 | 	b.StopTimer()
733 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
734 | 	for i := 0; i < 15; i++ {
735 | 		x = append(x, x...)
736 | 	}
737 | 	re := MustCompile("^zbc(d|e)")
738 | 	b.StartTimer()
739 | 	for i := 0; i < b.N; i++ {
740 | 		re.Match(x)
741 | 	}
742 | }
743 | 
744 | func BenchmarkAnchoredShortMatch(b *testing.B) {
745 | 	b.StopTimer()
746 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
747 | 	re := MustCompile("^.bc(d|e)")
748 | 	b.StartTimer()
749 | 	for i := 0; i < b.N; i++ {
750 | 		re.Match(x)
751 | 	}
752 | }
753 | 
754 | func BenchmarkAnchoredLongMatch(b *testing.B) {
755 | 	b.StopTimer()
756 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
757 | 	for i := 0; i < 15; i++ {
758 | 		x = append(x, x...)
759 | 	}
760 | 	re := MustCompile("^.bc(d|e)")
761 | 	b.StartTimer()
762 | 	for i := 0; i < b.N; i++ {
763 | 		re.Match(x)
764 | 	}
765 | }
766 | 
767 | func BenchmarkOnePassShortA(b *testing.B) {
768 | 	b.StopTimer()
769 | 	x := []byte("abcddddddeeeededd")
770 | 	re := MustCompile("^.bc(d|e)*$")
771 | 	b.StartTimer()
772 | 	for i := 0; i < b.N; i++ {
773 | 		re.Match(x)
774 | 	}
775 | }
776 | 
777 | func BenchmarkNotOnePassShortA(b *testing.B) {
778 | 	b.StopTimer()
779 | 	x := []byte("abcddddddeeeededd")
780 | 	re := MustCompile(".bc(d|e)*$")
781 | 	b.StartTimer()
782 | 	for i := 0; i < b.N; i++ {
783 | 		re.Match(x)
784 | 	}
785 | }
786 | 
787 | func BenchmarkOnePassShortB(b *testing.B) {
788 | 	b.StopTimer()
789 | 	x := []byte("abcddddddeeeededd")
790 | 	re := MustCompile("^.bc(?:d|e)*$")
791 | 	b.StartTimer()
792 | 	for i := 0; i < b.N; i++ {
793 | 		re.Match(x)
794 | 	}
795 | }
796 | 
797 | func BenchmarkNotOnePassShortB(b *testing.B) {
798 | 	b.StopTimer()
799 | 	x := []byte("abcddddddeeeededd")
800 | 	re := MustCompile(".bc(?:d|e)*$")
801 | 	b.StartTimer()
802 | 	for i := 0; i < b.N; i++ {
803 | 		re.Match(x)
804 | 	}
805 | }
806 | 
807 | func BenchmarkOnePassLongPrefix(b *testing.B) {
808 | 	b.StopTimer()
809 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
810 | 	re := MustCompile("^abcdefghijklmnopqrstuvwxyz.*$")
811 | 	b.StartTimer()
812 | 	for i := 0; i < b.N; i++ {
813 | 		re.Match(x)
814 | 	}
815 | }
816 | 
817 | func BenchmarkOnePassLongNotPrefix(b *testing.B) {
818 | 	b.StopTimer()
819 | 	x := []byte("abcdefghijklmnopqrstuvwxyz")
820 | 	re := MustCompile("^.bcdefghijklmnopqrstuvwxyz.*$")
821 | 	b.StartTimer()
822 | 	for i := 0; i < b.N; i++ {
823 | 		re.Match(x)
824 | 	}
825 | }
826 | 
827 | func BenchmarkMatchParallelShared(b *testing.B) {
828 | 	x := []byte("this is a long line that contains foo bar baz")
829 | 	re := MustCompile("foo (ba+r)? baz")
830 | 	b.ResetTimer()
831 | 	b.RunParallel(func(pb *testing.PB) {
832 | 		for pb.Next() {
833 | 			re.Match(x)
834 | 		}
835 | 	})
836 | }
837 | 
838 | func BenchmarkMatchParallelCopied(b *testing.B) {
839 | 	x := []byte("this is a long line that contains foo bar baz")
840 | 	re := MustCompile("foo (ba+r)? baz")
841 | 	b.ResetTimer()
842 | 	b.RunParallel(func(pb *testing.PB) {
843 | 		re := re.Copy()
844 | 		for pb.Next() {
845 | 			re.Match(x)
846 | 		}
847 | 	})
848 | }
849 | 
850 | var sink string
851 | 
852 | func BenchmarkQuoteMetaAll(b *testing.B) {
853 | 	specials := make([]byte, 0)
854 | 	for i := byte(0); i < utf8.RuneSelf; i++ {
855 | 		if special(i) {
856 | 			specials = append(specials, i)
857 | 		}
858 | 	}
859 | 	s := string(specials)
860 | 	b.SetBytes(int64(len(s)))
861 | 	b.ResetTimer()
862 | 	for i := 0; i < b.N; i++ {
863 | 		sink = QuoteMeta(s)
864 | 	}
865 | }
866 | 
867 | func BenchmarkQuoteMetaNone(b *testing.B) {
868 | 	s := "abcdefghijklmnopqrstuvwxyz"
869 | 	b.SetBytes(int64(len(s)))
870 | 	b.ResetTimer()
871 | 	for i := 0; i < b.N; i++ {
872 | 		sink = QuoteMeta(s)
873 | 	}
874 | }
875 | 
876 | var compileBenchData = []struct{ name, re string }{
877 | 	{"Onepass", `^a.[l-nA-Cg-j]?e$`},
878 | 	{"Medium", `^((a|b|[d-z0-9])*(日){4,5}.)+$`},
879 | 	{"Hard", strings.Repeat(`((abc)*|`, 50) + strings.Repeat(`)`, 50)},
880 | }
881 | 
882 | func BenchmarkCompile(b *testing.B) {
883 | 	for _, data := range compileBenchData {
884 | 		b.Run(data.name, func(b *testing.B) {
885 | 			b.ReportAllocs()
886 | 			for i := 0; i < b.N; i++ {
887 | 				if _, err := Compile(data.re); err != nil {
888 | 					b.Fatal(err)
889 | 				}
890 | 			}
891 | 		})
892 | 	}
893 | }
894 | 
895 | func TestDeepEqual(t *testing.T) {
896 | 	re1 := MustCompile("a.*b.*c.*d")
897 | 	re2 := MustCompile("a.*b.*c.*d")
898 | 	if !reflect.DeepEqual(re1, re2) { // has always been true, since Go 1.
899 | 		t.Errorf("DeepEqual(re1, re2) = false, want true")
900 | 	}
901 | 
902 | 	re1.MatchString("abcdefghijklmn")
903 | 	if !reflect.DeepEqual(re1, re2) {
904 | 		t.Errorf("DeepEqual(re1, re2) = false, want true")
905 | 	}
906 | 
907 | 	re2.MatchString("abcdefghijklmn")
908 | 	if !reflect.DeepEqual(re1, re2) {
909 | 		t.Errorf("DeepEqual(re1, re2) = false, want true")
910 | 	}
911 | 
912 | 	re2.MatchString(strings.Repeat("abcdefghijklmn", 100))
913 | 	if !reflect.DeepEqual(re1, re2) {
914 | 		t.Errorf("DeepEqual(re1, re2) = false, want true")
915 | 	}
916 | }
917 | 
918 | var minInputLenTests = []struct {
919 | 	Regexp string // pattern parsed by TestMinInputLen
920 | 	min    int    // value minInputLen is expected to return for it
921 | }{
922 | 	{``, 0},
923 | 	{`a`, 1},
924 | 	{`aa`, 2},
925 | 	{`(aa)a`, 3},
926 | 	{`(?:aa)a`, 3},
927 | 	{`a?a`, 1},
928 | 	{`(aaa)|(aa)`, 2},
929 | 	{`(aa)+a`, 3},
930 | 	{`(aa)*a`, 1},
931 | 	{`(aa){3,5}`, 6},
932 | 	{`[a-z]`, 1},
933 | 	{`日`, 1},
934 | }
935 | 
936 | func TestMinInputLen(t *testing.T) {
937 | 	for _, tt := range minInputLenTests {
938 | 		if re, err := syntax.Parse(tt.Regexp, syntax.Perl); err != nil {
939 | 			t.Errorf("regexp %#q failed to parse: %v", tt.Regexp, err) // report it; never hand a failed parse to minInputLen
940 | 		} else if m := minInputLen(re); m != tt.min {
941 | 			t.Errorf("regexp %#q has minInputLen %d, should be %d", tt.Regexp, m, tt.min)
942 | 		}
943 | 	}
944 | }
945 | 


--------------------------------------------------------------------------------
/backtrack.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // backtrack is a regular expression search with submatch
  6 | // tracking for small regular expressions and texts. It allocates
  7 | // a bit vector with (length of input) * (length of prog) bits,
  8 | // to make sure it never explores the same (character position, instruction)
  9 | // state multiple times. This limits the search to run in time linear in
 10 | // the length of the text.
 11 | //
 12 | // backtrack is a fast replacement for the NFA code on small
 13 | // regexps when onepass cannot be used.
 14 | 
 15 | package binaryregexp
 16 | 
 17 | import (
 18 | 	"sync"
 19 | 
 20 | 	"rsc.io/binaryregexp/syntax"
 21 | )
 22 | 
 23 | // A job is an entry on the backtracker's job stack. It holds the
 24 | // instruction pc, the position in the input, and a revisit flag.
 25 | type job struct {
 26 | 	pc  uint32 // index into prog.Inst
 27 | 	arg bool   // true when continuing a previous visit of pc
 28 | 	pos int    // input position (InstCapture revisits store the saved capture value here)
 29 | }
 30 | 
 31 | const (
 32 | 	visitedBits        = 32         // bits in each element of bitState.visited
 33 | 	maxBacktrackProg   = 500        // len(prog.Inst) <= max
 34 | 	maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
 35 | )
 36 | 
 37 | // bitState holds state for the backtracker.
 38 | type bitState struct {
 39 | 	end      int      // end position in the input, as passed to reset
 40 | 	cap      []int    // capture registers for the path being explored
 41 | 	matchcap []int    // capture registers copied from cap when a match is recorded
 42 | 	jobs     []job    // stack of pending jobs
 43 | 	visited  []uint32 // bit set of (pc, pos) pairs already explored
 44 | 
 45 | 	inputs inputs // cached input wrappers handed out by inputs.init
 46 | }
 47 | 
 48 | var bitStatePool sync.Pool // holds *bitState values: newBitState takes from it, freeBitState puts back
 49 | 
 50 | func newBitState() *bitState {
 51 | 	// Reuse a pooled state when the pool has one; otherwise allocate.
 52 | 	if pooled, ok := bitStatePool.Get().(*bitState); ok {
 53 | 		return pooled
 54 | 	}
 55 | 	return new(bitState)
 56 | }
 57 | 
 58 | func freeBitState(b *bitState) {
 59 | 	b.inputs.clear() // drop the reference to the searched input before pooling b
 60 | 	bitStatePool.Put(b)
 61 | }
 62 | 
 63 | // maxBitStateLen reports the longest input the backtracker may search
 64 | // with prog, or 0 when prog has more than maxBacktrackProg instructions.
 65 | func maxBitStateLen(prog *syntax.Prog) int {
 66 | 	if n := len(prog.Inst); n <= maxBacktrackProg {
 67 | 		return maxBacktrackVector / n
 68 | 	}
 69 | 	return 0
 70 | }
 71 | 
 72 | // shouldBacktrack reports whether the program is short enough
 73 | // for the backtracker to run (at most maxBacktrackProg instructions).
 74 | func shouldBacktrack(prog *syntax.Prog) bool {
 75 | 	return len(prog.Inst) <= maxBacktrackProg
 76 | }
 77 | 
 78 | // reset prepares b for a new search over prog.
 79 | // end is the end position in the input and ncap is the number of
 80 | // capture registers; cap and matchcap are refilled with -1.
 81 | func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) {
 82 | 	b.end = end
 83 | 
 84 | 	// Keep the job stack's storage if it already has any.
 85 | 	b.jobs = b.jobs[:0]
 86 | 	if cap(b.jobs) == 0 {
 87 | 		b.jobs = make([]job, 0, 256)
 88 | 	}
 89 | 
 90 | 	// One bit per (instruction, position) pair, rounded up to whole words.
 91 | 	words := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
 92 | 	if words <= cap(b.visited) {
 93 | 		b.visited = b.visited[:words]
 94 | 		for w := range b.visited {
 95 | 			b.visited[w] = 0
 96 | 		}
 97 | 	} else {
 98 | 		b.visited = make([]uint32, words, maxBacktrackVector/visitedBits)
 99 | 	}
100 | 
101 | 	if ncap <= cap(b.cap) {
102 | 		b.cap = b.cap[:ncap]
103 | 	} else {
104 | 		b.cap = make([]int, ncap)
105 | 	}
106 | 	if ncap <= cap(b.matchcap) {
107 | 		b.matchcap = b.matchcap[:ncap]
108 | 	} else {
109 | 		b.matchcap = make([]int, ncap)
110 | 	}
111 | 
112 | 	// Mark every capture register as unset.
113 | 	for r := 0; r < ncap; r++ {
114 | 		b.cap[r] = -1
115 | 		b.matchcap[r] = -1
116 | 	}
117 | }
118 | 
119 | // shouldVisit reports whether (pc, pos) has not been visited yet,
120 | // and records the pair as visited.
121 | func (b *bitState) shouldVisit(pc uint32, pos int) bool {
122 | 	bit := uint(int(pc)*(b.end+1) + pos)
123 | 	// Locate the word, and the bit inside it, that stand for this pair.
124 | 	word, mask := &b.visited[bit/visitedBits], uint32(1)<<(bit%visitedBits)
125 | 	seen := *word&mask != 0
126 | 	*word |= mask
127 | 	return !seen
128 | }
129 | 
130 | // push adds (pc, pos, arg) to the job stack. InstFail targets are dropped,
131 | // as are fresh visits (arg == false) of an already visited (pc, pos).
132 | func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) {
133 | 	// arg == true continues an earlier visit, so it skips the visited check.
134 | 	if re.prog.Inst[pc].Op == syntax.InstFail || !(arg || b.shouldVisit(pc, pos)) {
135 | 		return
136 | 	}
137 | 	b.jobs = append(b.jobs, job{pos: pos, pc: pc, arg: arg})
138 | }
139 | 
140 | // tryBacktrack runs a backtracking search from pc at pos. It reports whether a match was found; when b.cap is non-empty the match's captures are left in b.matchcap.
141 | func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
142 | 	longest := re.longest
143 | 
144 | 	b.push(re, pc, pos, false)
145 | 	for len(b.jobs) > 0 {
146 | 		l := len(b.jobs) - 1
147 | 		// Pop job off the stack.
148 | 		pc := b.jobs[l].pc
149 | 		pos := b.jobs[l].pos
150 | 		arg := b.jobs[l].arg
151 | 		b.jobs = b.jobs[:l]
152 | 
153 | 		// Optimization: rather than push and pop,
154 | 		// code that is going to push and continue
155 | 		// the loop simply updates pc, pos, and arg
156 | 		// and jumps to CheckAndLoop. We have to
157 | 		// do the shouldVisit check that push
158 | 		// would have, but we avoid the stack
159 | 		// manipulation.
160 | 		goto Skip // a popped job already went through push's check
161 | 	CheckAndLoop:
162 | 		if !b.shouldVisit(pc, pos) {
163 | 			continue
164 | 		}
165 | 	Skip:
166 | 
167 | 		inst := re.prog.Inst[pc]
168 | 
169 | 		switch inst.Op {
170 | 		default:
171 | 			panic("bad inst")
172 | 		case syntax.InstFail:
173 | 			panic("unexpected InstFail")
174 | 		case syntax.InstAlt:
175 | 			// Cannot just
176 | 			//   b.push(inst.Out, pos, false)
177 | 			//   b.push(inst.Arg, pos, false)
178 | 			// If during the processing of inst.Out, we encounter
179 | 			// inst.Arg via another path, we want to process it then.
180 | 			// Pushing it here will inhibit that. Instead, re-push
181 | 			// inst with arg==true as a reminder to push inst.Arg out
182 | 			// later.
183 | 			if arg {
184 | 				// Finished inst.Out; try inst.Arg.
185 | 				arg = false
186 | 				pc = inst.Arg
187 | 				goto CheckAndLoop
188 | 			} else {
189 | 				b.push(re, pc, pos, true)
190 | 				pc = inst.Out
191 | 				goto CheckAndLoop
192 | 			}
193 | 
194 | 		case syntax.InstAltMatch:
195 | 			// One opcode consumes runes; the other leads to match.
196 | 			switch re.prog.Inst[inst.Out].Op {
197 | 			case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
198 | 				// inst.Arg is the match.
199 | 				b.push(re, inst.Arg, pos, false)
200 | 				pc = inst.Arg
201 | 				pos = b.end
202 | 				goto CheckAndLoop
203 | 			}
204 | 			// inst.Out is the match - non-greedy
205 | 			b.push(re, inst.Out, b.end, false)
206 | 			pc = inst.Out
207 | 			goto CheckAndLoop
208 | 
209 | 		case syntax.InstRune:
210 | 			r, width := i.step(pos)
211 | 			if !inst.MatchRune(r) {
212 | 				continue
213 | 			}
214 | 			pos += width
215 | 			pc = inst.Out
216 | 			goto CheckAndLoop
217 | 
218 | 		case syntax.InstRune1:
219 | 			r, width := i.step(pos)
220 | 			if r != inst.Rune[0] {
221 | 				continue
222 | 			}
223 | 			pos += width
224 | 			pc = inst.Out
225 | 			goto CheckAndLoop
226 | 
227 | 		case syntax.InstRuneAnyNotNL:
228 | 			r, width := i.step(pos)
229 | 			if r == '\n' || r == endOfText {
230 | 				continue
231 | 			}
232 | 			pos += width
233 | 			pc = inst.Out
234 | 			goto CheckAndLoop
235 | 
236 | 		case syntax.InstRuneAny:
237 | 			r, width := i.step(pos)
238 | 			if r == endOfText {
239 | 				continue
240 | 			}
241 | 			pos += width
242 | 			pc = inst.Out
243 | 			goto CheckAndLoop
244 | 
245 | 		case syntax.InstCapture:
246 | 			if arg {
247 | 				// Finished inst.Out; restore the old value (this job's pos field carried it).
248 | 				b.cap[inst.Arg] = pos
249 | 				continue
250 | 			} else {
251 | 				if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
252 | 					// Capture pos to register, but save old value.
253 | 					b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done.
254 | 					b.cap[inst.Arg] = pos
255 | 				}
256 | 				pc = inst.Out
257 | 				goto CheckAndLoop
258 | 			}
259 | 
260 | 		case syntax.InstEmptyWidth:
261 | 			flag := i.context(pos)
262 | 			if !flag.match(syntax.EmptyOp(inst.Arg)) {
263 | 				continue
264 | 			}
265 | 			pc = inst.Out
266 | 			goto CheckAndLoop
267 | 
268 | 		case syntax.InstNop:
269 | 			pc = inst.Out
270 | 			goto CheckAndLoop
271 | 
272 | 		case syntax.InstMatch:
273 | 			// We found a match. If the caller doesn't care
274 | 			// where the match is, no point going further.
275 | 			if len(b.cap) == 0 {
276 | 				return true
277 | 			}
278 | 
279 | 			// Record best match so far.
280 | 			// Only need to check end point, because this entire
281 | 			// call is only considering one start position.
282 | 			if len(b.cap) > 1 {
283 | 				b.cap[1] = pos
284 | 			}
285 | 			if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) {
286 | 				copy(b.matchcap, b.cap)
287 | 			}
288 | 
289 | 			// If going for first match, we're done.
290 | 			if !longest {
291 | 				return true
292 | 			}
293 | 
294 | 			// If we used the entire text, no longer match is possible.
295 | 			if pos == b.end {
296 | 				return true
297 | 			}
298 | 
299 | 			// Otherwise, continue on in hope of a longer match.
300 | 			continue
301 | 		}
302 | 	}
303 | 
304 | 	return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0 // stack exhausted: succeed only if longest mode recorded a match
305 | }
306 | 
307 | // backtrack runs a backtracking search of re.prog on ib (or on is when ib is nil) starting at pos. On a match it returns dstCap with the match's capture positions appended; otherwise it returns nil.
308 | func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int {
309 | 	startCond := re.cond
310 | 	if startCond == ^syntax.EmptyOp(0) { // impossible
311 | 		return nil
312 | 	}
313 | 	if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
314 | 		// Anchored match, past beginning of text.
315 | 		return nil
316 | 	}
317 | 
318 | 	b := newBitState()
319 | 	i, end := b.inputs.init(nil, ib, is)
320 | 	b.reset(re.prog, end, ncap)
321 | 
322 | 	// Anchored search must start at the beginning of the input
323 | 	if startCond&syntax.EmptyBeginText != 0 {
324 | 		if len(b.cap) > 0 {
325 | 			b.cap[0] = pos
326 | 		}
327 | 		if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
328 | 			freeBitState(b)
329 | 			return nil
330 | 		}
331 | 	} else {
332 | 		// Unanchored search, starting from each possible text position.
333 | 		// Notice that we have to try the empty string at the end of
334 | 		// the text, so the loop condition is pos <= end, not pos < end.
335 | 		// This looks like it's quadratic in the size of the text,
336 | 		// but we are not clearing visited between calls to tryBacktrack,
337 | 		// so no work is duplicated and it ends up still being linear.
338 | 		width := -1
339 | 		for ; pos <= end && width != 0; pos += width {
340 | 			if len(re.prefix) > 0 {
341 | 				// Match requires literal prefix; fast search for it.
342 | 				advance := i.index(re, pos)
343 | 				if advance < 0 {
344 | 					freeBitState(b)
345 | 					return nil
346 | 				}
347 | 				pos += advance
348 | 			}
349 | 
350 | 			if len(b.cap) > 0 {
351 | 				b.cap[0] = pos
352 | 			}
353 | 			if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
354 | 				// Match must be leftmost; done.
355 | 				goto Match
356 | 			}
357 | 			_, width = i.step(pos) // width of the rune at pos; a zero width ends the loop
358 | 		}
359 | 		freeBitState(b)
360 | 		return nil
361 | 	}
362 | 
363 | Match:
364 | 	dstCap = append(dstCap, b.matchcap...) // copy out before b goes back to the pool
365 | 	freeBitState(b)
366 | 	return dstCap
367 | }
368 | 


--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2013 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package binaryregexp_test
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"strings"
 10 | 
 11 | 	"rsc.io/binaryregexp"
 12 | )
 13 | 
 14 | func Example() {
 15 | 	// Compile the expression once, usually at init time.
 16 | 	// Use raw strings to avoid having to quote the backslashes.
 17 | 	var validID = binaryregexp.MustCompile(`^[a-z]+\[[0-9]+\]$`)
 18 | 
 19 | 	fmt.Println(validID.MatchString("adam[23]"))
 20 | 	fmt.Println(validID.MatchString("eve[7]"))
 21 | 	fmt.Println(validID.MatchString("Job[48]"))
 22 | 	fmt.Println(validID.MatchString("snakey"))
 23 | 	// Output:
 24 | 	// true
 25 | 	// true
 26 | 	// false
 27 | 	// false
 28 | }
 29 | 
 30 | func ExampleMatch() {
 31 | 	text := []byte(`seafood`)
 32 | 	// The last pattern is malformed and yields a parse error.
 33 | 	for _, expr := range []string{`foo.*`, `bar.*`, `a(b`} {
 34 | 		ok, err := binaryregexp.Match(expr, text)
 35 | 		fmt.Println(ok, err)
 36 | 	}
 37 | 
 38 | 	// Output:
 39 | 	// true <nil>
 40 | 	// false <nil>
 41 | 	// false error parsing regexp: missing closing ): `a(b`
 42 | }
 43 | 
 44 | func ExampleMatchString() {
 45 | 	matched, err := binaryregexp.MatchString(`foo.*`, "seafood")
 46 | 	fmt.Println(matched, err)
 47 | 	matched, err = binaryregexp.MatchString(`bar.*`, "seafood")
 48 | 	fmt.Println(matched, err)
 49 | 	matched, err = binaryregexp.MatchString(`a(b`, "seafood")
 50 | 	fmt.Println(matched, err)
 51 | 	// Output:
 52 | 	// true <nil>
 53 | 	// false <nil>
 54 | 	// false error parsing regexp: missing closing ): `a(b`
 55 | }
 56 | 
 57 | func ExampleQuoteMeta() {
 58 | 	fmt.Printf("%s\n", binaryregexp.QuoteMeta(`Escaping symbols like: .+*?()|[]{}^$`))
 59 | 	// Output:
 60 | 	// Escaping symbols like: \.\+\*\?\(\)\|\[\]\{\}\^\$
 61 | }
 62 | 
 63 | func ExampleRegexp_Find() {
 64 | 	pattern := binaryregexp.MustCompile(`foo.?`)
 65 | 	first := pattern.Find([]byte(`seafood fool`)) // only the leftmost match is returned
 66 | 	fmt.Printf("%q\n", first)
 67 | 	// Output:
 68 | 	// "food"
 69 | }
 70 | 
 71 | func ExampleRegexp_FindAll() {
 72 | 	pattern := binaryregexp.MustCompile(`foo.?`)
 73 | 	all := pattern.FindAll([]byte(`seafood fool`), -1) // every match, in order
 74 | 	fmt.Printf("%q\n", all)
 75 | 	// Output:
 76 | 	// ["food" "fool"]
 77 | }
 78 | 
 79 | func ExampleRegexp_FindAllSubmatch() {
 80 | 	pattern := binaryregexp.MustCompile(`foo(.?)`)
 81 | 	groups := pattern.FindAllSubmatch([]byte(`seafood fool`), -1) // whole match, then group 1
 82 | 	fmt.Printf("%q\n", groups)
 83 | 	// Output:
 84 | 	// [["food" "d"] ["fool" "l"]]
 85 | }
 86 | 
 87 | func ExampleRegexp_FindSubmatch() {
 88 | 	pattern := binaryregexp.MustCompile(`foo(.?)`)
 89 | 	groups := pattern.FindSubmatch([]byte(`seafood fool`)) // whole match, then group 1
 90 | 	fmt.Printf("%q\n", groups)
 91 | 	// Output:
 92 | 	// ["food" "d"]
 93 | }
 94 | 
 95 | func ExampleRegexp_Match() {
 96 | 	pattern := binaryregexp.MustCompile(`foo.?`)
 97 | 	found := pattern.Match([]byte(`seafood fool`))
 98 | 	fmt.Println(found)
 99 | 	// Output:
100 | 	// true
101 | }
102 | 
103 | func ExampleRegexp_FindString() {
104 | 	fooPattern := binaryregexp.MustCompile(`foo.?`)
105 | 	fmt.Printf("%q\n", fooPattern.FindString("seafood fool"))
106 | 	fmt.Printf("%q\n", fooPattern.FindString("meat")) // no match: empty string
107 | 	// Output:
108 | 	// "food"
109 | 	// ""
110 | }
111 | 
112 | func ExampleRegexp_FindStringIndex() {
113 | 	abPattern := binaryregexp.MustCompile(`ab?`)
114 | 	fmt.Println(abPattern.FindStringIndex("tablett"))
115 | 	fmt.Println(nil == abPattern.FindStringIndex("foo")) // no match yields a nil slice
116 | 	// Output:
117 | 	// [1 3]
118 | 	// true
119 | }
120 | 
121 | func ExampleRegexp_FindStringSubmatch() {
122 | 	grouped := binaryregexp.MustCompile(`a(x*)b(y|z)c`)
123 | 	fmt.Printf("%q\n", grouped.FindStringSubmatch("-axxxbyc-"))
124 | 	fmt.Printf("%q\n", grouped.FindStringSubmatch("-abzc-")) // group 1 matches the empty string here
125 | 	// Output:
126 | 	// ["axxxbyc" "xxx" "y"]
127 | 	// ["abzc" "" "z"]
128 | }
129 | 
130 | func ExampleRegexp_FindAllString() {
131 | 	aPlusOne := binaryregexp.MustCompile(`a.`)
132 | 	fmt.Println(aPlusOne.FindAllString("paranormal", -1))
133 | 	fmt.Println(aPlusOne.FindAllString("paranormal", 2)) // at most two matches
134 | 	fmt.Println(aPlusOne.FindAllString("graal", -1))
135 | 	fmt.Println(aPlusOne.FindAllString("none", -1))
136 | 	// Output:
137 | 	// [ar an al]
138 | 	// [ar an]
139 | 	// [aa]
140 | 	// []
141 | }
142 | 
143 | func ExampleRegexp_FindAllStringSubmatch() {
144 | 	pattern := binaryregexp.MustCompile(`a(x*)b`)
145 | 	inputs := []string{"-ab-", "-axxb-", "-ab-axb-", "-axxb-ab-"}
146 | 	for _, text := range inputs {
147 | 		fmt.Printf("%q\n", pattern.FindAllStringSubmatch(text, -1))
148 | 	}
149 | 	// Output:
150 | 	// [["ab" ""]]
151 | 	// [["axxb" "xx"]]
152 | 	// [["ab" ""] ["axb" "x"]]
153 | 	// [["axxb" "xx"] ["ab" ""]]
154 | }
155 | 
156 | func ExampleRegexp_FindAllStringSubmatchIndex() {
157 | 	pattern := binaryregexp.MustCompile(`a(x*)b`)
158 | 	// Indices:
159 | 	//    01234567   012345678
160 | 	//    -ab-axb-   -axxb-ab-
161 | 	// The final input contains no match at all.
162 | 	inputs := []string{"-ab-", "-axxb-", "-ab-axb-", "-axxb-ab-", "-foo-"}
163 | 	for _, text := range inputs {
164 | 		fmt.Println(pattern.FindAllStringSubmatchIndex(text, -1))
165 | 	}
166 | 	// Output:
167 | 	// [[1 3 2 2]]
168 | 	// [[1 5 2 4]]
169 | 	// [[1 3 2 2] [4 7 5 6]]
170 | 	// [[1 5 2 4] [6 8 7 7]]
171 | 	// []
172 | }
173 | 
174 | func ExampleRegexp_MatchString() {
175 | 	pattern := binaryregexp.MustCompile(`(gopher){2}`)
176 | 	for _, text := range []string{"gopher", "gophergopher", "gophergophergopher"} {
177 | 		fmt.Println(pattern.MatchString(text))
178 | 	}
179 | 	// Output:
180 | 	// false
181 | 	// true
182 | 	// true
183 | }
184 | 
185 | func ExampleRegexp_ReplaceAllLiteralString() {
186 | 	pattern := binaryregexp.MustCompile(`a(x*)b`)
187 | 	for _, repl := range []string{"T", "$1", "${1}"} {
188 | 		fmt.Println(pattern.ReplaceAllLiteralString("-ab-axxb-", repl))
189 | 	}
190 | 	// Output:
191 | 	// -T-T-
192 | 	// -$1-$1-
193 | 	// -${1}-${1}-
194 | }
195 | 
196 | func ExampleRegexp_ReplaceAllString() {
197 | 	pattern := binaryregexp.MustCompile(`a(x*)b`)
198 | 	// "$1W" is read as a group named 1W; "${1}W" is group 1 followed by W.
199 | 	for _, repl := range []string{"T", "$1", "$1W", "${1}W"} {
200 | 		fmt.Println(pattern.ReplaceAllString("-ab-axxb-", repl))
201 | 	}
202 | 	// Output:
203 | 	// -T-T-
204 | 	// --xx-
205 | 	// ---
206 | 	// -W-xxW-
207 | }
208 | 
209 | func ExampleRegexp_ReplaceAllStringFunc() {
210 | 	nonVowel := binaryregexp.MustCompile(`[^aeiou]`)
211 | 	fmt.Printf("%s\n", nonVowel.ReplaceAllStringFunc("seafood fool", strings.ToUpper))
212 | 	// Output:
213 | 	// SeaFooD FooL
214 | }
215 | 
216 | func ExampleRegexp_SubexpNames() {
217 | 	fullName := binaryregexp.MustCompile(`(?P<first>[a-zA-Z]+) (?P<last>[a-zA-Z]+)`)
218 | 	fmt.Println(fullName.MatchString("Alan Turing"))
219 | 	fmt.Printf("%q\n", fullName.SubexpNames())
220 | 	swapped := "${" + fullName.SubexpNames()[2] + "} ${" + fullName.SubexpNames()[1] + "}"
221 | 	fmt.Println(swapped)
222 | 	fmt.Println(fullName.ReplaceAllString("Alan Turing", swapped))
223 | 	// Output:
224 | 	// true
225 | 	// ["" "first" "last"]
226 | 	// ${last} ${first}
227 | 	// Turing Alan
228 | }
229 | 
230 | func ExampleRegexp_Split() {
231 | 	onA := binaryregexp.MustCompile(`a`)
232 | 	onZs := binaryregexp.MustCompile(`z+`)
233 | 	// Split each word with the limits -1, 0, 1 and 2 in turn.
234 | 	limits := []int{-1, 0, 1, 2}
235 | 	for _, n := range limits {
236 | 		fmt.Println(onA.Split("banana", n))
237 | 	}
238 | 	for _, n := range limits {
239 | 		fmt.Println(onZs.Split("pizza", n))
240 | 	}
241 | 	// Output:
242 | 	// [b n n ]
243 | 	// []
244 | 	// [banana]
245 | 	// [b nana]
246 | 	// [pi a]
247 | 	// []
248 | 	// [pizza]
249 | 	// [pi a]
250 | }
251 | 
252 | func ExampleRegexp_Expand() {
253 | 	content := []byte(`
254 | 	# comment line
255 | 	option1: value1
256 | 	option2: value2
257 | 
258 | 	# another comment line
259 | 	option3: value3
260 | `)
261 | 
262 | 	// Regex pattern captures "key: value" pair from the content.
263 | 	pattern := binaryregexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`)
264 | 
265 | 	// Template to convert "key: value" to "key=value" by
266 | 	// referencing the values captured by the regex pattern.
267 | 	template := []byte("$key=$value\n")
268 | 
269 | 	result := []byte{}
270 | 
271 | 	// For each match of the regex in the content.
272 | 	for _, submatches := range pattern.FindAllSubmatchIndex(content, -1) {
273 | 		// Apply the captured submatches to the template and append the output
274 | 		// to the result.
275 | 		result = pattern.Expand(result, template, content, submatches)
276 | 	}
277 | 	fmt.Println(string(result))
278 | 	// Output:
279 | 	// option1=value1
280 | 	// option2=value2
281 | 	// option3=value3
282 | }
283 | 
284 | func ExampleRegexp_ExpandString() {
285 | 	content := `
286 | 	# comment line
287 | 	option1: value1
288 | 	option2: value2
289 | 
290 | 	# another comment line
291 | 	option3: value3
292 | `
293 | 
294 | 	// Regex pattern captures "key: value" pair from the content.
295 | 	pattern := binaryregexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`)
296 | 
297 | 	// Template to convert "key: value" to "key=value" by
298 | 	// referencing the values captured by the regex pattern.
299 | 	template := "$key=$value\n"
300 | 
301 | 	result := []byte{}
302 | 
303 | 	// For each match of the regex in the content.
304 | 	for _, submatches := range pattern.FindAllStringSubmatchIndex(content, -1) {
305 | 		// Apply the captured submatches to the template and append the output
306 | 		// to the result.
307 | 		result = pattern.ExpandString(result, template, content, submatches)
308 | 	}
309 | 	fmt.Println(string(result))
310 | 	// Output:
311 | 	// option1=value1
312 | 	// option2=value2
313 | 	// option3=value3
314 | }
315 | 
316 | func ExampleRegexp_FindIndex() {
317 | 	config := []byte(`
318 | 	# comment line
319 | 	option1: value1
320 | 	option2: value2
321 | `)
322 | 	// keyValue matches one "key: value" line of config.
323 | 	keyValue := binaryregexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`)
324 | 	span := keyValue.FindIndex(config) // offsets of the first match
325 | 	begin, end := span[0], span[1]
326 | 	fmt.Println(span)
327 | 	fmt.Println(string(config[begin:end]))
328 | 	// Output:
329 | 	// [18 33]
330 | 	// option1: value1
331 | }
332 | func ExampleRegexp_FindAllSubmatchIndex() {
333 | 	config := []byte(`
334 | 	# comment line
335 | 	option1: value1
336 | 	option2: value2
337 | `)
338 | 	// keyValue matches one "key: value" line of config.
339 | 	keyValue := binaryregexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`)
340 | 	// Each loc holds offset pairs for the whole match, the key and the value.
341 | 	for _, loc := range keyValue.FindAllSubmatchIndex(config, -1) {
342 | 		fmt.Println(loc)
343 | 		for g := 0; g < len(loc); g += 2 {
344 | 			fmt.Println(string(config[loc[g]:loc[g+1]]))
345 | 		}
346 | 	}
347 | 	// Output:
348 | 	// [18 33 18 25 27 33]
349 | 	// option1: value1
350 | 	// option1
351 | 	// value1
352 | 	// [35 50 35 42 44 50]
353 | 	// option2: value2
354 | 	// option2
355 | 	// value2
356 | }
357 | 


--------------------------------------------------------------------------------
/exec.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package binaryregexp
  6 | 
  7 | import (
  8 | 	"io"
  9 | 	"sync"
 10 | 
 11 | 	"rsc.io/binaryregexp/syntax"
 12 | )
 13 | 
 14 | // A queue is a 'sparse array' holding pending threads of execution.
 15 | // See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
 16 | type queue struct {
 17 | 	sparse []uint32 // indexed by pc; gives the candidate index of that pc's entry in dense
 18 | 	dense  []entry  // the queued entries
 19 | }
 20 | 
 21 | // An entry is an entry on a queue.
 22 | // It holds both the instruction pc and the actual thread.
 23 | // Some queue entries are just place holders so that the machine
 24 | // knows it has considered that pc. Such entries have t == nil.
 25 | type entry struct {
 26 | 	pc uint32  // instruction index
 27 | 	t  *thread // nil for a placeholder entry
 28 | }
 29 | 
 30 | // A thread is the state of a single path through the machine:
 31 | // an instruction and a corresponding capture array.
 32 | // See https://swtch.com/~rsc/regexp/regexp2.html
 33 | type thread struct {
 34 | 	inst *syntax.Inst // instruction the thread is positioned at
 35 | 	cap  []int        // capture positions carried along this path
 36 | }
 37 | 
 38 | // A machine holds all the state during an NFA simulation for p.
 39 | type machine struct {
 40 | 	re       *Regexp      // corresponding Regexp
 41 | 	p        *syntax.Prog // compiled program
 42 | 	q0, q1   queue        // two queues for runq, nextq
 43 | 	pool     []*thread    // pool of available threads
 44 | 	matched  bool         // whether a match was found
 45 | 	matchcap []int        // capture information for the match
 46 | 
 47 | 	inputs inputs // cached input wrappers, to avoid allocation
 48 | }
 49 | 
 50 | type inputs struct {
 51 | 	// cached inputs, to avoid allocation
 52 | 	bytes  inputBytes  // returned by newBytes
 53 | 	string inputString // returned by newString
 54 | 	reader inputReader // returned by newReader
 55 | }
 56 | 
 57 | func (i *inputs) newBytes(b []byte) input {
 58 | 	i.bytes.str = b // point the cached byte input at b
 59 | 	return &i.bytes
 60 | }
 61 | 
 62 | func (i *inputs) newString(s string) input {
 63 | 	i.string.str = s // point the cached string input at s
 64 | 	return &i.string
 65 | }
 66 | 
 67 | func (i *inputs) newReader(r io.ByteReader) input {
 68 | 	rd := &i.reader
 69 | 	rd.r = r
 70 | 	rd.atEOT, rd.pos = false, 0 // rewind the cached wrapper for the new reader
 71 | 	return rd
 72 | }
 73 | 
 74 | func (i *inputs) clear() {
 75 | 	// Only one cached input needs clearing; leaving the others alone avoids pointer write barriers.
 76 | 	switch {
 77 | 	case i.bytes.str != nil:
 78 | 		i.bytes.str = nil
 79 | 	case i.reader.r != nil:
 80 | 		i.reader.r = nil
 81 | 	default:
 82 | 		i.string.str = ""
 83 | 	}
 84 | }
 85 | 
 86 | func (i *inputs) init(r io.ByteReader, b []byte, s string) (input, int) {
 87 | 	switch { // a reader wins over bytes, and bytes win over the string
 88 | 	case r != nil:
 89 | 		return i.newReader(r), 0
 90 | 	case b != nil:
 91 | 		return i.newBytes(b), len(b)
 92 | 	}
 93 | 	return i.newString(s), len(s)
 94 | }
 95 | 
 96 | func (m *machine) init(ncap int) {
 97 | 	for idx := range m.pool { // resize every pooled thread's capture slice to ncap
 98 | 		m.pool[idx].cap = m.pool[idx].cap[:ncap]
 99 | 	}
100 | 	m.matchcap = m.matchcap[:ncap]
101 | }
102 | 
103 | // alloc allocates a new thread with the given instruction.
104 | // It uses the free pool if possible.
105 | func (m *machine) alloc(i *syntax.Inst) *thread {
106 | 	var t *thread
107 | 	if n := len(m.pool); n > 0 {
108 | 		t = m.pool[n-1]
109 | 		m.pool = m.pool[:n-1]
110 | 	} else {
111 | 		t = new(thread)
112 | 		t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
113 | 	}
114 | 	t.inst = i
115 | 	return t
116 | }
117 | 
118 | // A lazyFlag is a lazily-evaluated syntax.EmptyOp,
119 | // for checking zero-width flags like ^ $ \A \z \B \b.
120 | // It packs the pair of relevant runes (the first in the high
121 | // 32 bits, the second in the low 32) and does not determine the
122 | // implied flags until absolutely necessary (usually never).
123 | type lazyFlag uint64
124 | 
125 | func newLazyFlag(r1, r2 rune) lazyFlag {
126 | 	return lazyFlag(uint64(r1)<<32 | uint64(uint32(r2))) // r1 in the high half, r2 in the low half
127 | }
128 | 
129 | func (f lazyFlag) match(op syntax.EmptyOp) bool { // reports whether every condition in op holds for the rune pair in f
130 | 	if op == 0 {
131 | 		return true
132 | 	}
133 | 	r1 := rune(f >> 32) // first rune of the pair; negative when there is none
134 | 	if op&syntax.EmptyBeginLine != 0 {
135 | 		if r1 != '\n' && r1 >= 0 {
136 | 			return false
137 | 		}
138 | 		op &^= syntax.EmptyBeginLine
139 | 	}
140 | 	if op&syntax.EmptyBeginText != 0 {
141 | 		if r1 >= 0 {
142 | 			return false
143 | 		}
144 | 		op &^= syntax.EmptyBeginText
145 | 	}
146 | 	if op == 0 {
147 | 		return true
148 | 	}
149 | 	r2 := rune(f) // second rune of the pair; negative when there is none
150 | 	if op&syntax.EmptyEndLine != 0 {
151 | 		if r2 != '\n' && r2 >= 0 {
152 | 			return false
153 | 		}
154 | 		op &^= syntax.EmptyEndLine
155 | 	}
156 | 	if op&syntax.EmptyEndText != 0 {
157 | 		if r2 >= 0 {
158 | 			return false
159 | 		}
160 | 		op &^= syntax.EmptyEndText
161 | 	}
162 | 	if op == 0 {
163 | 		return true
164 | 	}
165 | 	if syntax.IsWordChar(r1) != syntax.IsWordChar(r2) {
166 | 		op &^= syntax.EmptyWordBoundary
167 | 	} else {
168 | 		op &^= syntax.EmptyNoWordBoundary
169 | 	}
170 | 	return op == 0 // any bit still set names a condition that does not hold
171 | }
172 | 
173 | // match runs the machine over the input starting at pos.
174 | // It reports whether a match was found.
175 | // If so, m.matchcap holds the submatch information.
176 | func (m *machine) match(i input, pos int) bool {
177 | 	startCond := m.re.cond
178 | 	if startCond == ^syntax.EmptyOp(0) { // impossible
179 | 		return false
180 | 	}
181 | 	m.matched = false
182 | 	for i := range m.matchcap {
183 | 		m.matchcap[i] = -1
184 | 	}
185 | 	runq, nextq := &m.q0, &m.q1
186 | 	r, r1 := endOfText, endOfText // r is the rune at pos, r1 the rune after it
187 | 	width, width1 := 0, 0
188 | 	r, width = i.step(pos)
189 | 	if r != endOfText {
190 | 		r1, width1 = i.step(pos + width)
191 | 	}
192 | 	var flag lazyFlag
193 | 	if pos == 0 {
194 | 		flag = newLazyFlag(-1, r)
195 | 	} else {
196 | 		flag = i.context(pos)
197 | 	}
198 | 	for {
199 | 		if len(runq.dense) == 0 {
200 | 			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
201 | 				// Anchored match, past beginning of text.
202 | 				break
203 | 			}
204 | 			if m.matched {
205 | 				// Have match; finished exploring alternatives.
206 | 				break
207 | 			}
208 | 			if len(m.re.prefix) > 0 && r1 != rune(m.re.prefix[0]) && i.canCheckPrefix() {
209 | 				// Match requires literal prefix; fast search for it.
210 | 				advance := i.index(m.re, pos)
211 | 				if advance < 0 {
212 | 					break
213 | 				}
214 | 				pos += advance
215 | 				r, width = i.step(pos)
216 | 				r1, width1 = i.step(pos + width)
217 | 			}
218 | 		}
219 | 		if !m.matched {
220 | 			if len(m.matchcap) > 0 {
221 | 				m.matchcap[0] = pos
222 | 			}
223 | 			m.add(runq, uint32(m.p.Start), pos, m.matchcap, &flag, nil) // seed a thread starting at pos
224 | 		}
225 | 		flag = newLazyFlag(r, r1)
226 | 		m.step(runq, nextq, pos, pos+width, r, &flag)
227 | 		if width == 0 { // presumably end of input: the last step consumed nothing
228 | 			break
229 | 		}
230 | 		if len(m.matchcap) == 0 && m.matched {
231 | 			// Found a match and not paying attention
232 | 			// to where it is, so any match will do.
233 | 			break
234 | 		}
235 | 		pos += width
236 | 		r, width = r1, width1
237 | 		if r != endOfText {
238 | 			r1, width1 = i.step(pos + width)
239 | 		}
240 | 		runq, nextq = nextq, runq
241 | 	}
242 | 	m.clear(nextq) // return any threads still queued to the pool
243 | 	return m.matched
244 | }
245 | 
246 | // clear returns every thread still on q to m.pool and empties q.
247 | func (m *machine) clear(q *queue) {
248 | 	for idx := range q.dense {
249 | 		if t := q.dense[idx].t; t != nil {
250 | 			m.pool = append(m.pool, t)
251 | 		}
252 | 	}
253 | 	q.dense = q.dense[:0]
254 | }
255 | 
256 | // step executes one step of the machine, running each of the threads
257 | // on runq and appending new threads to nextq.
258 | // The step processes the rune c (which may be endOfText),
259 | // which starts at position pos and ends at nextPos.
260 | // nextCond gives the setting for the empty-width flags after c.
261 | func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond *lazyFlag) {
262 | 	longest := m.re.longest
263 | 	for j := 0; j < len(runq.dense); j++ {
264 | 		d := &runq.dense[j]
265 | 		t := d.t
266 | 		if t == nil { // placeholder entry; no thread to run
267 | 			continue
268 | 		}
269 | 		if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] { // thread starts after the recorded match; free it
270 | 			m.pool = append(m.pool, t)
271 | 			continue
272 | 		}
273 | 		i := t.inst
274 | 		add := false
275 | 		switch i.Op {
276 | 		default:
277 | 			panic("bad inst")
278 | 
279 | 		case syntax.InstMatch:
280 | 			if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
281 | 				t.cap[1] = pos
282 | 				copy(m.matchcap, t.cap)
283 | 			}
284 | 			if !longest {
285 | 				// First-match mode: cut off all lower-priority threads.
286 | 				for _, d := range runq.dense[j+1:] {
287 | 					if d.t != nil {
288 | 						m.pool = append(m.pool, d.t)
289 | 					}
290 | 				}
291 | 				runq.dense = runq.dense[:0] // also ends the outer loop after this iteration
292 | 			}
293 | 			m.matched = true
294 | 
295 | 		case syntax.InstRune:
296 | 			add = i.MatchRune(c)
297 | 		case syntax.InstRune1:
298 | 			add = c == i.Rune[0]
299 | 		case syntax.InstRuneAny:
300 | 			add = true
301 | 		case syntax.InstRuneAnyNotNL:
302 | 			add = c != '\n'
303 | 		}
304 | 		if add {
305 | 			t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t) // add returns the thread if it was not consumed
306 | 		}
307 | 		if t != nil {
308 | 			m.pool = append(m.pool, t)
309 | 		}
310 | 	}
311 | 	runq.dense = runq.dense[:0]
312 | }
313 | 
// add adds an entry to q for pc, unless the q already has such an entry.
// It also recursively adds an entry for all instructions reachable from pc by following
// empty-width conditions satisfied by cond.  pos gives the current position
// in the input.
//
// cap holds the capture positions copied into any thread that is queued.
// t, if non-nil, is a thread that may be reused instead of allocating a
// new one. The return value is t if it was not placed on the queue, and
// nil once it has been used (or if t was nil to begin with).
func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond *lazyFlag, t *thread) *thread {
Again:
	if pc == 0 {
		// NOTE(review): pc 0 is treated as "nothing to add" — presumably the
		// program's fail instruction; confirm against the compiler.
		return t
	}
	// Sparse-set membership test: pc is already queued if its sparse slot
	// points at a live dense entry that names pc.
	if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
		return t
	}

	// Claim the next dense slot for pc. The reslice relies on q.dense having
	// spare capacity — NOTE(review): presumably sized when the queue is
	// built; confirm there.
	j := len(q.dense)
	q.dense = q.dense[:j+1]
	d := &q.dense[j]
	d.t = nil
	d.pc = pc
	q.sparse[pc] = uint32(j)

	i := &m.p.Inst[pc]
	switch i.Op {
	default:
		panic("unhandled")
	case syntax.InstFail:
		// nothing
	case syntax.InstAlt, syntax.InstAltMatch:
		// Explore the Out branch recursively first, then continue with the
		// Arg branch in this frame.
		t = m.add(q, i.Out, pos, cap, cond, t)
		pc = i.Arg
		goto Again
	case syntax.InstEmptyWidth:
		// Follow the edge only if the empty-width condition holds here.
		if cond.match(syntax.EmptyOp(i.Arg)) {
			pc = i.Out
			goto Again
		}
	case syntax.InstNop:
		pc = i.Out
		goto Again
	case syntax.InstCapture:
		if int(i.Arg) < len(cap) {
			// Tracked capture slot: record pos while exploring what follows,
			// then restore the old value so the caller's cap is unchanged.
			opos := cap[i.Arg]
			cap[i.Arg] = pos
			m.add(q, i.Out, pos, cap, cond, nil)
			cap[i.Arg] = opos
		} else {
			// Slot is not being tracked; treat the capture like a no-op.
			pc = i.Out
			goto Again
		}
	case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		// Instructions that step runs: attach a thread to the queue entry.
		if t == nil {
			t = m.alloc(i)
		} else {
			t.inst = i
		}
		// Skip the copy when cap already aliases the thread's own storage.
		if len(cap) > 0 && &t.cap[0] != &cap[0] {
			copy(t.cap, cap)
		}
		d.t = t
		// The thread now belongs to the queue; report it as consumed.
		t = nil
	}
	return t
}
376 | 
377 | type onePassMachine struct {
378 | 	inputs   inputs
379 | 	matchcap []int
380 | }
381 | 
382 | var onePassPool sync.Pool
383 | 
384 | func newOnePassMachine() *onePassMachine {
385 | 	m, ok := onePassPool.Get().(*onePassMachine)
386 | 	if !ok {
387 | 		m = new(onePassMachine)
388 | 	}
389 | 	return m
390 | }
391 | 
392 | func freeOnePassMachine(m *onePassMachine) {
393 | 	m.inputs.clear()
394 | 	onePassPool.Put(m)
395 | }
396 | 
// doOnePass implements r.doExecute using the one-pass execution engine.
//
// Exactly one of ir, ib and is supplies the input (they are handed to
// inputs.init as given). pos is the starting offset and ncap the number of
// capture slots to track. On a match the capture positions are appended to
// dstCap and the result returned; nil is returned when there is no match.
func (re *Regexp) doOnePass(ir io.ByteReader, ib []byte, is string, pos, ncap int, dstCap []int) []int {
	startCond := re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return nil
	}

	// Borrow a pooled machine and size its capture slice to ncap, reusing
	// the existing backing array when it is large enough.
	m := newOnePassMachine()
	if cap(m.matchcap) < ncap {
		m.matchcap = make([]int, ncap)
	} else {
		m.matchcap = m.matchcap[:ncap]
	}

	matched := false
	// -1 marks a capture slot that has not been set.
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}

	i, _ := m.inputs.init(ir, ib, is)

	// r/width describe the rune at pos; r1/width1 the rune after it.
	r, r1 := endOfText, endOfText
	width, width1 := 0, 0
	r, width = i.step(pos)
	if r != endOfText {
		r1, width1 = i.step(pos + width)
	}
	// flag carries the empty-width context at pos.
	var flag lazyFlag
	if pos == 0 {
		flag = newLazyFlag(-1, r)
	} else {
		flag = i.context(pos)
	}
	pc := re.onepass.Start
	inst := re.onepass.Inst[pc]
	// If there is a simple literal prefix, skip over it.
	if pos == 0 && flag.match(syntax.EmptyOp(inst.Arg)) &&
		len(re.prefix) > 0 && i.canCheckPrefix() {
		// Match requires literal prefix; fast search for it.
		if !i.hasPrefix(re) {
			goto Return
		}
		// Resume after the prefix, at the instruction that follows it.
		pos += len(re.prefix)
		r, width = i.step(pos)
		r1, width1 = i.step(pos + width)
		flag = i.context(pos)
		pc = int(re.prefixEnd)
	}
	for {
		inst = re.onepass.Inst[pc]
		// Default successor; the Alt case below overrides it.
		pc = int(inst.Out)
		switch inst.Op {
		default:
			panic("bad inst")
		case syntax.InstMatch:
			matched = true
			if len(m.matchcap) > 0 {
				// The match start is always recorded as 0.
				// NOTE(review): presumably one-pass programs are anchored at
				// the beginning of the text; confirm in the onepass compiler.
				m.matchcap[0] = 0
				m.matchcap[1] = pos
			}
			goto Return
		case syntax.InstRune:
			if !inst.MatchRune(r) {
				goto Return
			}
		case syntax.InstRune1:
			if r != inst.Rune[0] {
				goto Return
			}
		case syntax.InstRuneAny:
			// Nothing
		case syntax.InstRuneAnyNotNL:
			if r == '\n' {
				goto Return
			}
		// peek at the input rune to see which branch of the Alt to take
		case syntax.InstAlt, syntax.InstAltMatch:
			pc = int(onePassNext(&inst, r))
			continue
		case syntax.InstFail:
			goto Return
		case syntax.InstNop:
			continue
		case syntax.InstEmptyWidth:
			if !flag.match(syntax.EmptyOp(inst.Arg)) {
				goto Return
			}
			continue
		case syntax.InstCapture:
			if int(inst.Arg) < len(m.matchcap) {
				m.matchcap[inst.Arg] = pos
			}
			continue
		}
		// Only the rune-consuming cases reach this point; the others
		// either continue the loop or jump to Return.
		if width == 0 {
			// A zero-width step means there is no more input to consume.
			break
		}
		// Advance one rune and refresh the lookahead and context flags.
		flag = newLazyFlag(r, r1)
		pos += width
		r, width = r1, width1
		if r != endOfText {
			r1, width1 = i.step(pos + width)
		}
	}

Return:
	// The machine goes back to the pool on both the failure and success paths.
	if !matched {
		freeOnePassMachine(m)
		return nil
	}

	dstCap = append(dstCap, m.matchcap...)
	freeOnePassMachine(m)
	return dstCap
}
512 | 
513 | // doMatch reports whether either r, b or s match the regexp.
514 | func (re *Regexp) doMatch(r io.ByteReader, b []byte, s string) bool {
515 | 	return re.doExecute(r, b, s, 0, 0, nil) != nil
516 | }
517 | 
518 | // doExecute finds the leftmost match in the input, appends the position
519 | // of its subexpressions to dstCap and returns dstCap.
520 | //
521 | // nil is returned if no matches are found and non-nil if matches are found.
522 | func (re *Regexp) doExecute(r io.ByteReader, b []byte, s string, pos int, ncap int, dstCap []int) []int {
523 | 	if dstCap == nil {
524 | 		// Make sure 'return dstCap' is non-nil.
525 | 		dstCap = arrayNoInts[:0:0]
526 | 	}
527 | 
528 | 	if r == nil && len(b)+len(s) < re.minInputLen {
529 | 		return nil
530 | 	}
531 | 
532 | 	if re.onepass != nil {
533 | 		return re.doOnePass(r, b, s, pos, ncap, dstCap)
534 | 	}
535 | 	if r == nil && len(b)+len(s) < re.maxBitStateLen {
536 | 		return re.backtrack(b, s, pos, ncap, dstCap)
537 | 	}
538 | 
539 | 	m := re.get()
540 | 	i, _ := m.inputs.init(r, b, s)
541 | 
542 | 	m.init(ncap)
543 | 	if !m.match(i, pos) {
544 | 		re.put(m)
545 | 		return nil
546 | 	}
547 | 
548 | 	dstCap = append(dstCap, m.matchcap...)
549 | 	re.put(m)
550 | 	return dstCap
551 | }
552 | 
553 | // arrayNoInts is returned by doExecute match if nil dstCap is passed
554 | // to it with ncap=0.
555 | var arrayNoInts [0]int
556 | 


--------------------------------------------------------------------------------
/exec2_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2013 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // +build !race
 6 | 
 7 | package binaryregexp
 8 | 
 9 | import (
10 | 	"testing"
11 | )
12 | 
13 | // This test is excluded when running under the race detector because
14 | // it is a very expensive test and takes too long.
15 | func TestRE2Exhaustive(t *testing.T) {
16 | 	if testing.Short() {
17 | 		t.Skip("skipping TestRE2Exhaustive during short test")
18 | 	}
19 | 	testRE2(t, "testdata/re2-exhaustive.txt.bz2")
20 | }
21 | 


--------------------------------------------------------------------------------
/exec_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2010 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package binaryregexp
  6 | 
  7 | import (
  8 | 	"bufio"
  9 | 	"compress/bzip2"
 10 | 	"fmt"
 11 | 	"io"
 12 | 	"os"
 13 | 	"path/filepath"
 14 | 	"strconv"
 15 | 	"strings"
 16 | 	"testing"
 17 | 	"unicode/utf8"
 18 | 
 19 | 	"rsc.io/binaryregexp/syntax"
 20 | )
 21 | 
 22 | // TestRE2 tests this package's regexp API against test cases
 23 | // considered during RE2's exhaustive tests, which run all possible
 24 | // regexps over a given set of atoms and operators, up to a given
 25 | // complexity, over all possible strings over a given alphabet,
 26 | // up to a given size. Rather than try to link with RE2, we read a
 27 | // log file containing the test cases and the expected matches.
 28 | // The log file, re2-exhaustive.txt, is generated by running 'make log'
 29 | // in the open source RE2 distribution https://github.com/google/re2/.
 30 | //
 31 | // The test file format is a sequence of stanzas like:
 32 | //
 33 | //	strings
 34 | //	"abc"
 35 | //	"123x"
 36 | //	regexps
 37 | //	"[a-z]+"
 38 | //	0-3;0-3
 39 | //	-;-
 40 | //	"([0-9])([0-9])([0-9])"
 41 | //	-;-
 42 | //	-;0-3 0-1 1-2 2-3
 43 | //
 44 | // The stanza begins by defining a set of strings, quoted
 45 | // using Go double-quote syntax, one per line. Then the
 46 | // regexps section gives a sequence of regexps to run on
 47 | // the strings. In the block that follows a regexp, each line
 48 | // gives the semicolon-separated match results of running
 49 | // the regexp on the corresponding string.
 50 | // Each match result is either a single -, meaning no match, or a
 51 | // space-separated sequence of pairs giving the match and
 52 | // submatch indices. An unmatched subexpression formats
 53 | // its pair as a single - (not illustrated above).  For now
 54 | // each regexp run produces two match results, one for a
 55 | // ``full match'' that restricts the regexp to matching the entire
 56 | // string or nothing, and one for a ``partial match'' that gives
 57 | // the leftmost first match found in the string.
 58 | //
 59 | // Lines beginning with # are comments. Lines beginning with
 60 | // a capital letter are test names printed during RE2's test suite
 61 | // and are echoed into t but otherwise ignored.
 62 | //
 63 | // At time of writing, re2-exhaustive.txt is 59 MB but compresses to 385 kB,
 64 | // so we store re2-exhaustive.txt.bz2 in the repository and decompress it on the fly.
 65 | //
func TestRE2Search(t *testing.T) {
	// Feed the uncompressed search log to the shared testRE2 driver.
	testRE2(t, "testdata/re2-search.txt")
}
 69 | 
 70 | func testRE2(t *testing.T, file string) {
 71 | 	t.Skip("skipping - RE2 testdata assumes UTF-8")
 72 | 	f, err := os.Open(file)
 73 | 	if err != nil {
 74 | 		t.Fatal(err)
 75 | 	}
 76 | 	defer f.Close()
 77 | 	var txt io.Reader
 78 | 	if strings.HasSuffix(file, ".bz2") {
 79 | 		z := bzip2.NewReader(f)
 80 | 		txt = z
 81 | 		file = file[:len(file)-len(".bz2")] // for error messages
 82 | 	} else {
 83 | 		txt = f
 84 | 	}
 85 | 	lineno := 0
 86 | 	scanner := bufio.NewScanner(txt)
 87 | 	var (
 88 | 		str       []string
 89 | 		input     []string
 90 | 		inStrings bool
 91 | 		re        *Regexp
 92 | 		refull    *Regexp
 93 | 		nfail     int
 94 | 		ncase     int
 95 | 	)
 96 | 	for lineno := 1; scanner.Scan(); lineno++ {
 97 | 		line := scanner.Text()
 98 | 		switch {
 99 | 		case line == "":
100 | 			t.Fatalf("%s:%d: unexpected blank line", file, lineno)
101 | 		case line[0] == '#':
102 | 			continue
103 | 		case 'A' <= line[0] && line[0] <= 'Z':
104 | 			// Test name.
105 | 			t.Logf("%s\n", line)
106 | 			continue
107 | 		case line == "strings":
108 | 			str = str[:0]
109 | 			inStrings = true
110 | 		case line == "regexps":
111 | 			inStrings = false
112 | 		case line[0] == '"':
113 | 			q, err := strconv.Unquote(line)
114 | 			if err != nil {
115 | 				// Fatal because we'll get out of sync.
116 | 				t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err)
117 | 			}
118 | 			if inStrings {
119 | 				str = append(str, q)
120 | 				continue
121 | 			}
122 | 			// Is a regexp.
123 | 			if len(input) != 0 {
124 | 				t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q)
125 | 			}
126 | 			re, err = tryCompile(q)
127 | 			if err != nil {
128 | 				if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" {
129 | 					// We don't and likely never will support \C; keep going.
130 | 					continue
131 | 				}
132 | 				t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err)
133 | 				if nfail++; nfail >= 100 {
134 | 					t.Fatalf("stopping after %d errors", nfail)
135 | 				}
136 | 				continue
137 | 			}
138 | 			full := `\A(?:` + q + `)\z`
139 | 			refull, err = tryCompile(full)
140 | 			if err != nil {
141 | 				// Fatal because q worked, so this should always work.
142 | 				t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err)
143 | 			}
144 | 			input = str
145 | 		case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
146 | 			// A sequence of match results.
147 | 			ncase++
148 | 			if re == nil {
149 | 				// Failed to compile: skip results.
150 | 				continue
151 | 			}
152 | 			if len(input) == 0 {
153 | 				t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno)
154 | 			}
155 | 			var text string
156 | 			text, input = input[0], input[1:]
157 | 			if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) {
158 | 				// RE2's \B considers every byte position,
159 | 				// so it sees 'not word boundary' in the
160 | 				// middle of UTF-8 sequences. This package
161 | 				// only considers the positions between runes,
162 | 				// so it disagrees. Skip those cases.
163 | 				continue
164 | 			}
165 | 			res := strings.Split(line, ";")
166 | 			if len(res) != len(run) {
167 | 				t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run))
168 | 			}
169 | 			for i := range res {
170 | 				have, suffix := run[i](re, refull, text)
171 | 				want := parseResult(t, file, lineno, res[i])
172 | 				if !same(have, want) {
173 | 					t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want)
174 | 					if nfail++; nfail >= 100 {
175 | 						t.Fatalf("stopping after %d errors", nfail)
176 | 					}
177 | 					continue
178 | 				}
179 | 				b, suffix := match[i](re, refull, text)
180 | 				if b != (want != nil) {
181 | 					t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b)
182 | 					if nfail++; nfail >= 100 {
183 | 						t.Fatalf("stopping after %d errors", nfail)
184 | 					}
185 | 					continue
186 | 				}
187 | 			}
188 | 
189 | 		default:
190 | 			t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line)
191 | 		}
192 | 	}
193 | 	if err := scanner.Err(); err != nil {
194 | 		t.Fatalf("%s:%d: %v", file, lineno, err)
195 | 	}
196 | 	if len(input) != 0 {
197 | 		t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input))
198 | 	}
199 | 	t.Logf("%d cases tested", ncase)
200 | }
201 | 
202 | var run = []func(*Regexp, *Regexp, string) ([]int, string){
203 | 	runFull,
204 | 	runPartial,
205 | 	runFullLongest,
206 | 	runPartialLongest,
207 | }
208 | 
209 | func runFull(re, refull *Regexp, text string) ([]int, string) {
210 | 	refull.longest = false
211 | 	return refull.FindStringSubmatchIndex(text), "[full]"
212 | }
213 | 
214 | func runPartial(re, refull *Regexp, text string) ([]int, string) {
215 | 	re.longest = false
216 | 	return re.FindStringSubmatchIndex(text), ""
217 | }
218 | 
219 | func runFullLongest(re, refull *Regexp, text string) ([]int, string) {
220 | 	refull.longest = true
221 | 	return refull.FindStringSubmatchIndex(text), "[full,longest]"
222 | }
223 | 
224 | func runPartialLongest(re, refull *Regexp, text string) ([]int, string) {
225 | 	re.longest = true
226 | 	return re.FindStringSubmatchIndex(text), "[longest]"
227 | }
228 | 
229 | var match = []func(*Regexp, *Regexp, string) (bool, string){
230 | 	matchFull,
231 | 	matchPartial,
232 | 	matchFullLongest,
233 | 	matchPartialLongest,
234 | }
235 | 
236 | func matchFull(re, refull *Regexp, text string) (bool, string) {
237 | 	refull.longest = false
238 | 	return refull.MatchString(text), "[full]"
239 | }
240 | 
241 | func matchPartial(re, refull *Regexp, text string) (bool, string) {
242 | 	re.longest = false
243 | 	return re.MatchString(text), ""
244 | }
245 | 
246 | func matchFullLongest(re, refull *Regexp, text string) (bool, string) {
247 | 	refull.longest = true
248 | 	return refull.MatchString(text), "[full,longest]"
249 | }
250 | 
251 | func matchPartialLongest(re, refull *Regexp, text string) (bool, string) {
252 | 	re.longest = true
253 | 	return re.MatchString(text), "[longest]"
254 | }
255 | 
// isSingleBytes reports whether s consists solely of bytes below
// utf8.RuneSelf, i.e. contains nothing but single-byte characters.
func isSingleBytes(s string) bool {
	for k := 0; k < len(s); k++ {
		if s[k] >= utf8.RuneSelf {
			return false
		}
	}
	return true
}
264 | 
265 | func tryCompile(s string) (re *Regexp, err error) {
266 | 	// Protect against panic during Compile.
267 | 	defer func() {
268 | 		if r := recover(); r != nil {
269 | 			err = fmt.Errorf("panic: %v", r)
270 | 		}
271 | 	}()
272 | 	return Compile(s)
273 | }
274 | 
// parseResult decodes one match result from the RE2 log.
// A lone "-" means no match and yields nil. Anything else is a
// space-separated list of lo-hi pairs, where a pair written as "-" stands
// for an unmatched subexpression and becomes -1, -1. Malformed pairs abort
// the test via t.Fatalf, citing file and lineno.
func parseResult(t *testing.T, file string, lineno int, res string) []int {
	if res == "-" {
		return nil
	}
	pairs := strings.Split(res, " ")
	out := make([]int, 0, 2*len(pairs))
	for _, pair := range pairs {
		if pair == "-" {
			// Subexpression did not participate in the match.
			out = append(out, -1, -1)
			continue
		}
		dash := strings.IndexByte(pair, '-')
		if dash < 0 {
			t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
		}
		lo, errLo := strconv.Atoi(pair[:dash])
		hi, errHi := strconv.Atoi(pair[dash+1:])
		if errLo != nil || errHi != nil || lo > hi {
			t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
		}
		out = append(out, lo, hi)
	}
	return out
}
316 | 
// same reports whether x and y have equal length and equal elements
// at every index.
func same(x, y []int) bool {
	n := len(x)
	if n != len(y) {
		return false
	}
	for k := 0; k < n; k++ {
		if x[k] != y[k] {
			return false
		}
	}
	return true
}
328 | 
329 | // TestFowler runs this package's regexp API against the
330 | // POSIX regular expression tests collected by Glenn Fowler
331 | // at http://www2.research.att.com/~astopen/testregex/testregex.html.
332 | func TestFowler(t *testing.T) {
333 | 	files, err := filepath.Glob("testdata/*.dat")
334 | 	if err != nil {
335 | 		t.Fatal(err)
336 | 	}
337 | 	for _, file := range files {
338 | 		t.Log(file)
339 | 		testFowler(t, file)
340 | 	}
341 | }
342 | 
// notab matches a maximal run of non-tab characters; testFowler uses it
// to split a specification line into its tab-separated fields.
var notab = MustCompilePOSIX(`[^\t]+`)

// testFowler runs every test specification found in file, which is read
// line by line in the testregex input format described below. Only the
// 'E' (extended) and 'L' (literal) modes are executed; other mode letters
// are ignored. Problems are reported through t.Errorf so that one bad line
// does not stop the rest of the file from being checked.
func testFowler(t *testing.T, file string) {
	f, err := os.Open(file)
	if err != nil {
		t.Error(err)
		return
	}
	defer f.Close()
	b := bufio.NewReader(f)
	lineno := 0
	// lastRegexp remembers the previous pattern for the SAME shorthand.
	lastRegexp := ""
Reading:
	for {
		lineno++
		line, err := b.ReadString('\n')
		if err != nil {
			// Any read error, including io.EOF, ends the file; only
			// errors other than io.EOF are reported.
			if err != io.EOF {
				t.Errorf("%s:%d: %v", file, lineno, err)
			}
			break Reading
		}

		// http://www2.research.att.com/~astopen/man/man1/testregex.html
		//
		// INPUT FORMAT
		//   Input lines may be blank, a comment beginning with #, or a test
		//   specification. A specification is five fields separated by one
		//   or more tabs. NULL denotes the empty string and NIL denotes the
		//   0 pointer.
		if line[0] == '#' || line[0] == '\n' {
			continue Reading
		}
		// Drop the trailing newline, then split on runs of tabs.
		line = line[:len(line)-1]
		field := notab.FindAllString(line, -1)
		for i, f := range field {
			if f == "NULL" {
				field[i] = ""
			}
			if f == "NIL" {
				// Lines that use NIL are skipped entirely.
				t.Logf("%s:%d: skip: %s", file, lineno, line)
				continue Reading
			}
		}
		if len(field) == 0 {
			continue Reading
		}

		//   Field 1: the regex(3) flags to apply, one character per REG_feature
		//   flag. The test is skipped if REG_feature is not supported by the
		//   implementation. If the first character is not [BEASKLP] then the
		//   specification is a global control line. One or more of [BEASKLP] may be
		//   specified; the test will be repeated for each mode.
		//
		//     B 	basic			BRE	(grep, ed, sed)
		//     E 	REG_EXTENDED		ERE	(egrep)
		//     A	REG_AUGMENTED		ARE	(egrep with negation)
		//     S	REG_SHELL		SRE	(sh glob)
		//     K	REG_SHELL|REG_AUGMENTED	KRE	(ksh glob)
		//     L	REG_LITERAL		LRE	(fgrep)
		//
		//     a	REG_LEFT|REG_RIGHT	implicit ^...$
		//     b	REG_NOTBOL		lhs does not match ^
		//     c	REG_COMMENT		ignore space and #...\n
		//     d	REG_SHELL_DOT		explicit leading . match
		//     e	REG_NOTEOL		rhs does not match $
		//     f	REG_MULTIPLE		multiple \n separated patterns
		//     g	FNM_LEADING_DIR		testfnmatch only -- match until /
		//     h	REG_MULTIREF		multiple digit backref
		//     i	REG_ICASE		ignore case
		//     j	REG_SPAN		. matches \n
		//     k	REG_ESCAPE		\ to escape [...] delimiter
		//     l	REG_LEFT		implicit ^...
		//     m	REG_MINIMAL		minimal match
		//     n	REG_NEWLINE		explicit \n match
		//     o	REG_ENCLOSED		(|&) magic inside [@|&](...)
		//     p	REG_SHELL_PATH		explicit / match
		//     q	REG_DELIMITED		delimited pattern
		//     r	REG_RIGHT		implicit ...$
		//     s	REG_SHELL_ESCAPED	\ not special
		//     t	REG_MUSTDELIM		all delimiters must be specified
		//     u	standard unspecified behavior -- errors not counted
		//     v	REG_CLASS_ESCAPE	\ special inside [...]
		//     w	REG_NOSUB		no subexpression match array
		//     x	REG_LENIENT		let some errors slide
		//     y	REG_LEFT		regexec() implicit ^...
		//     z	REG_NULL		NULL subexpressions ok
		//     $	                        expand C \c escapes in fields 2 and 3
		//     /	                        field 2 is a regsubcomp() expression
		//     =	                        field 3 is a regdecomp() expression
		//
		//   Field 1 control lines:
		//
		//     C		set LC_COLLATE and LC_CTYPE to locale in field 2
		//
		//     ?test ...	output field 5 if passed and != EXPECTED, silent otherwise
		//     &test ...	output field 5 if current and previous passed
		//     |test ...	output field 5 if current passed and previous failed
		//     ; ...	output field 2 if previous failed
		//     {test ...	skip if failed until }
		//     }		end of skip
		//
		//     : comment		comment copied as output NOTE
		//     :comment:test	:comment: ignored
		//     N[OTE] comment	comment copied as output NOTE
		//     T[EST] comment	comment
		//
		//     number		use number for nmatch (20 by default)
		flag := field[0]
		switch flag[0] {
		case '?', '&', '|', ';', '{', '}':
			// Ignore all the control operators.
			// Just run everything.
			flag = flag[1:]
			if flag == "" {
				continue Reading
			}
		case ':':
			// ":comment:test" form: keep only what follows the second colon.
			i := strings.Index(flag[1:], ":")
			if i < 0 {
				t.Logf("skip: %s", line)
				continue Reading
			}
			flag = flag[1+i+1:]
		case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			// Locale, note, test-comment and nmatch lines are not run.
			t.Logf("skip: %s", line)
			continue Reading
		}

		// Can check field count now that we've handled the myriad comment formats.
		if len(field) < 4 {
			t.Errorf("%s:%d: too few fields: %s", file, lineno, line)
			continue Reading
		}

		// Expand C escapes (a.k.a. Go escapes).
		if strings.Contains(flag, "$") {
			f := `"` + field[1] + `"`
			if field[1], err = strconv.Unquote(f); err != nil {
				t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
			}
			f = `"` + field[2] + `"`
			if field[2], err = strconv.Unquote(f); err != nil {
				t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
			}
		}

		//   Field 2: the regular expression pattern; SAME uses the pattern from
		//     the previous specification.
		//
		if field[1] == "SAME" {
			field[1] = lastRegexp
		}
		lastRegexp = field[1]

		//   Field 3: the string to match.
		text := field[2]

		//   Field 4: the test outcome...
		ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3])
		if !ok {
			t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3])
			continue Reading
		}

		//   Field 5: optional comment appended to the report.

	Testing:
		// Run test once for each specified capital letter mode that we support.
		for _, c := range flag {
			pattern := field[1]
			syn := syntax.POSIX | syntax.ClassNL
			switch c {
			default:
				// Not a mode letter that is run here; try the next one.
				continue Testing
			case 'E':
				// extended regexp (what we support)
			case 'L':
				// literal
				pattern = QuoteMeta(pattern)
			}

			// Apply the modifier flags; only 'i' (fold case) is honored.
			for _, c := range flag {
				switch c {
				case 'i':
					syn |= syntax.FoldCase
				}
			}

			re, err := compile(pattern, syn, true)
			if err != nil {
				if shouldCompile {
					t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern)
				}
				continue Testing
			}
			if !shouldCompile {
				t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern)
				continue Testing
			}
			match := re.MatchString(text)
			if match != shouldMatch {
				t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch)
				continue Testing
			}
			// The boolean and index-returning APIs must agree with each other.
			have := re.FindStringSubmatchIndex(text)
			if (len(have) > 0) != match {
				t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, pattern, text, have)
				continue Testing
			}
			// Compare only as many positions as the expected outcome lists.
			if len(have) > len(pos) {
				have = have[:len(pos)]
			}
			if !same(have, pos) {
				t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos)
			}
		}
	}
}
562 | 
// parseFowlerResult decodes field 4 of a Fowler test specification.
//
//   Field 4: the test outcome. This is either one of the posix error
//     codes (with REG_ omitted) or the match array, a list of (m,n)
//     entries with m and n being first and last+1 positions in the
//     field 3 string, or NULL if REG_NOSUB is in effect and success
//     is expected. Unspecified endpoints (offset -1) are denoted by ?.
//
// ok reports whether s could be parsed, compiled whether the pattern is
// expected to compile, matched whether it is expected to match, and pos
// holds the flattened (m,n) offsets when a match array was given.
func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) {
	if s == "" {
		// Match with no position information.
		return true, true, true, nil
	}
	if s == "NOMATCH" {
		// Match failure.
		return true, true, false, nil
	}
	if first := s[0]; first >= 'A' && first <= 'Z' {
		// All the other error codes are compile errors.
		return true, false, false, nil
	}

	// From here on s must be a sequence of "(m,n)" groups.
	var offsets []int
	for rest := s; rest != ""; {
		// An even count means a new group starts: expect '(' and read up
		// to ','. Otherwise read the second number, up to ')'.
		delim := byte(')')
		if len(offsets)%2 == 0 {
			if rest[0] != '(' {
				return false, true, false, nil
			}
			rest = rest[1:]
			delim = ','
		}
		cut := strings.IndexByte(rest, delim)
		if cut <= 0 {
			// Empty number or missing delimiter.
			return false, true, false, nil
		}
		val := -1
		if tok := rest[:cut]; tok != "?" {
			n, err := strconv.Atoi(tok)
			if err != nil {
				return false, true, false, nil
			}
			val = n
		}
		offsets = append(offsets, val)
		rest = rest[cut+1:]
	}
	if len(offsets)%2 != 0 {
		// Input ended in the middle of a group.
		return false, true, false, nil
	}
	return true, true, true, offsets
}
639 | 
// text caches the most recently generated benchmark input so that
// makeText can hand out prefixes of it without regenerating.
var text []byte

// makeText returns n bytes of deterministic pseudo-random text. If the
// cached text is already at least n bytes long, a prefix of it is
// returned; otherwise the cache is rebuilt at length n. Each byte is
// either a newline or a character in the range 0x20 through 0x7E.
func makeText(n int) []byte {
	if n <= len(text) {
		return text[:n]
	}
	text = make([]byte, n)
	state := ^uint32(0)
	for k := 0; k < n; k++ {
		// Shift-and-xor generator step (doubling wraps modulo 2^32).
		state = state<<1 ^ 1
		if int32(state) < 0 {
			state ^= 0x88888eef
		}
		if state%31 == 0 {
			text[k] = '\n'
			continue
		}
		text[k] = byte(state%(0x7E+1-0x20) + 0x20)
	}
	return text
}
662 | 
663 | func BenchmarkMatch(b *testing.B) {
664 | 	isRaceBuilder := false
665 | 	for _, data := range benchData {
666 | 		r := MustCompile(data.re)
667 | 		for _, size := range benchSizes {
668 | 			if isRaceBuilder && size.n > 1<<10 {
669 | 				continue
670 | 			}
671 | 			t := makeText(size.n)
672 | 			b.Run(data.name+"/"+size.name, func(b *testing.B) {
673 | 				b.SetBytes(int64(size.n))
674 | 				for i := 0; i < b.N; i++ {
675 | 					if r.Match(t) {
676 | 						b.Fatal("match!")
677 | 					}
678 | 				}
679 | 			})
680 | 		}
681 | 	}
682 | }
683 | 
684 | func BenchmarkMatch_onepass_regex(b *testing.B) {
685 | 	isRaceBuilder := false
686 | 	r := MustCompile(`(?s)\A.*\z`)
687 | 	if r.onepass == nil {
688 | 		b.Fatalf("want onepass regex, but %q is not onepass", r)
689 | 	}
690 | 	for _, size := range benchSizes {
691 | 		if isRaceBuilder && size.n > 1<<10 {
692 | 			continue
693 | 		}
694 | 		t := makeText(size.n)
695 | 		b.Run(size.name, func(b *testing.B) {
696 | 			b.SetBytes(int64(size.n))
697 | 			b.ReportAllocs()
698 | 			for i := 0; i < b.N; i++ {
699 | 				if !r.Match(t) {
700 | 					b.Fatal("not match!")
701 | 				}
702 | 			}
703 | 		})
704 | 	}
705 | }
706 | 
// benchData lists the named patterns that BenchmarkMatch compiles and runs
// against the text produced by makeText. name becomes the first component of
// the sub-benchmark name; re is the pattern source.
var benchData = []struct{ name, re string }{
	{"Easy0", "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"},
	{"Easy0i", "(?i)ABCDEFGHIJklmnopqrstuvwxyz$"},
	{"Easy1", "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"},
	{"Medium", "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"},
	{"Hard", "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"},
	{"Hard1", "ABCD|CDEF|EFGH|GHIJ|IJKL|KLMN|MNOP|OPQR|QRST|STUV|UVWX|WXYZ"},
}
715 | 
// benchSizes lists the input lengths used by the match benchmarks. name is
// the label used in the sub-benchmark name and n is the number of bytes
// requested from makeText.
var benchSizes = []struct {
	name string
	n    int
}{
	{"16", 16},
	{"32", 32},
	{"1K", 1 << 10},
	{"32K", 32 << 10},
	{"1M", 1 << 20},
	{"32M", 32 << 20},
}
727 | 
728 | func TestLongest(t *testing.T) {
729 | 	re, err := Compile(`a(|b)`)
730 | 	if err != nil {
731 | 		t.Fatal(err)
732 | 	}
733 | 	if g, w := re.FindString("ab"), "a"; g != w {
734 | 		t.Errorf("first match was %q, want %q", g, w)
735 | 	}
736 | 	re.Longest()
737 | 	if g, w := re.FindString("ab"), "ab"; g != w {
738 | 		t.Errorf("longest match was %q, want %q", g, w)
739 | 	}
740 | }
741 | 
742 | // TestProgramTooLongForBacktrack tests that a regex which is too long
743 | // for the backtracker still executes properly.
744 | func TestProgramTooLongForBacktrack(t *testing.T) {
745 | 	longRegex := MustCompile(`(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|twentyone|twentytwo|twentythree|twentyfour|twentyfive|twentysix|twentyseven|twentyeight|twentynine|thirty|thirtyone|thirtytwo|thirtythree|thirtyfour|thirtyfive|thirtysix|thirtyseven|thirtyeight|thirtynine|forty|fortyone|fortytwo|fortythree|fortyfour|fortyfive|fortysix|fortyseven|fortyeight|fortynine|fifty|fiftyone|fiftytwo|fiftythree|fiftyfour|fiftyfive|fiftysix|fiftyseven|fiftyeight|fiftynine|sixty|sixtyone|sixtytwo|sixtythree|sixtyfour|sixtyfive|sixtysix|sixtyseven|sixtyeight|sixtynine|seventy|seventyone|seventytwo|seventythree|seventyfour|seventyfive|seventysix|seventyseven|seventyeight|seventynine|eighty|eightyone|eightytwo|eightythree|eightyfour|eightyfive|eightysix|eightyseven|eightyeight|eightynine|ninety|ninetyone|ninetytwo|ninetythree|ninetyfour|ninetyfive|ninetysix|ninetyseven|ninetyeight|ninetynine|onehundred)`)
746 | 	if !longRegex.MatchString("two") {
747 | 		t.Errorf("longRegex.MatchString(\"two\") was false, want true")
748 | 	}
749 | 	if longRegex.MatchString("xxx") {
750 | 		t.Errorf("longRegex.MatchString(\"xxx\") was true, want false")
751 | 	}
752 | }
753 | 


--------------------------------------------------------------------------------
/find_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2010 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package binaryregexp
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"strings"
 10 | 	"testing"
 11 | )
 12 | 
 13 | // For each pattern/text pair, what is the expected output of each function?
 14 | // We can derive the textual results from the indexed results, the non-submatch
 15 | // results from the submatched results, the single results from the 'all' results,
 16 | // and the byte results from the string results. Therefore the table includes
 17 | // only the FindAllStringSubmatchIndex result.
 18 | type FindTest struct {
 19 | 	pat     string
 20 | 	text    string
 21 | 	matches [][]int
 22 | }
 23 | 
 24 | func (t FindTest) String() string {
 25 | 	return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text)
 26 | }
 27 | 
// findTests is the table shared by every TestFind* function in this file.
// Each entry gives a pattern, an input text, and the expected matches as
// produced by build (or nil when no match is expected). All offsets are byte
// offsets into text.
var findTests = []FindTest{
	{``, ``, build(1, 0, 0)},
	{`^abcdefg`, "abcdefg", build(1, 0, 7)},
	{`a+`, "baaab", build(1, 1, 4)},
	{"abcd..", "abcdef", build(1, 0, 6)},
	{`a`, "a", build(1, 0, 1)},
	{`x`, "y", nil},
	{`b`, "abc", build(1, 1, 2)},
	{`.`, "a", build(1, 0, 1)},
	{`.*`, "abcdef", build(1, 0, 6)},
	{`^`, "abcde", build(1, 0, 0)},
	{`$`, "abcde", build(1, 5, 5)},
	{`^abcd$`, "abcd", build(1, 0, 4)},
	{`^bcd'`, "abcdef", nil},
	{`^abcd$`, "abcde", nil},
	{`a+`, "baaab", build(1, 1, 4)},
	{`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)},
	{`[a-z]+`, "abcd", build(1, 0, 4)},
	{`[^a-z]+`, "ab1234cd", build(1, 2, 6)},
	{`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)},
	{`[^\n]+`, "abcd\n", build(1, 0, 4)},
	// NOTE(review): the following rows expect no match for patterns made of
	// runes above U+00FF, expect `Æ` to match the single byte \xC6, and expect
	// `.` to consume one byte of the multi-byte "日". That suggests matching is
	// bytewise (Latin-1) rather than UTF-8 — confirm against the package docs.
	{`[日本語]+`, "日本語日本語", nil},
	{`日本語+`, "日本語", nil},
	{`日本語+`, "日本語語語語", nil},
	{`Æ`, "\xC6", build(1, 0, 1)},
	{`ÆÌÓÿ`, "\xC6\xCC\xD3\xFF", build(1, 0, 4)},
	{`()`, "", build(1, 0, 0, 0, 0)},
	{`(a)`, "a", build(1, 0, 1, 0, 1)},
	{`(.)(.)`, "日a", build(2, 0, 2, 0, 1, 1, 2, 2, 4, 2, 3, 3, 4)},
	{`(.*)`, "", build(1, 0, 0, 0, 0)},
	{`(.*)`, "abcd", build(1, 0, 4, 0, 4)},
	{`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)},
	{`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)},
	{`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)},
	{`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)},
	{`\a\f\n\r\t\v`, "\a\f\n\r\t\v", build(1, 0, 6)},
	{`[\a\f\n\r\t\v]+`, "\a\f\n\r\t\v", build(1, 0, 6)},

	{`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)},
	{`(.*).*`, "ab", build(1, 0, 2, 0, 2)},
	{`[.]`, ".", build(1, 0, 1)},
	{`/$`, "/abc/", build(1, 4, 5)},
	{`/$`, "/abc", nil},

	// multiple matches
	{`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)},
	{`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)},
	{`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)},
	{`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)},
	{`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)},

	// fixed bugs
	{`ab$`, "cab", build(1, 1, 3)},
	{`axxb$`, "axxcb", nil},
	{`data`, "daXY data", build(1, 5, 9)},
	{`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)},
	{`zx+`, "zzx", build(1, 1, 3)},
	{`ab$`, "abcab", build(1, 3, 5)},
	{`(aa)*$`, "a", build(1, 1, 1, -1, -1)},
	{`(?:.|(?:.a))`, "", nil},
	{`(?:A(?:A|a))`, "Aa", build(1, 0, 2)},
	{`(?:A|(?:A|a))`, "a", build(1, 0, 1)},
	{`(a){0}`, "", build(1, 0, 0, -1, -1)},
	{`(?-s)(?:(?:^).)`, "\n", nil},
	{`(?s)(?:(?:^).)`, "\n", build(1, 0, 1)},
	{`(?:(?:^).)`, "\n", nil},
	{`\b`, "x", build(2, 0, 0, 1, 1)},
	{`\b`, "xx", build(2, 0, 0, 2, 2)},
	{`\b`, "x y", build(4, 0, 0, 1, 1, 2, 2, 3, 3)},
	{`\b`, "xx yy", build(4, 0, 0, 2, 2, 3, 3, 5, 5)},
	{`\B`, "x", nil},
	{`\B`, "xx", build(1, 1, 1)},
	{`\B`, "x y", nil},
	{`\B`, "xx yy", build(2, 1, 1, 4, 4)},

	// RE2 tests
	{`[^\S\s]`, "abcd", nil},
	{`[^\S[:space:]]`, "abcd", nil},
	{`[^\D\d]`, "abcd", nil},
	{`[^\D[:digit:]]`, "abcd", nil},
	{`(?i)\W`, "x", nil},
	{`(?i)\W`, "k", nil},
	{`(?i)\W`, "s", nil},

	// can backslash-escape any punctuation
	{`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`,
		`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
	{`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`,
		`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
	{"\\`", "`", build(1, 0, 1)},
	{"[\\`]+", "`", build(1, 0, 1)},

	// long set of matches (longer than startSize)
	{
		".",
		"qwertyuiopasdfghjklzxcvbnm1234567890",
		build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
			10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20,
			20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30,
			30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36),
	},
}
130 | 
// build is a helper to construct a [][]int by splitting x into n consecutive
// runs of equal length. The result represents n matches, each holding
// len(x)/n index values (two per submatch).
//
// build panics with "invalid build entry" if n is not positive or if len(x)
// is not a multiple of n, so a mistyped table entry fails loudly instead of
// silently dropping trailing values.
func build(n int, x ...int) [][]int {
	if n <= 0 || len(x)%n != 0 {
		panic("invalid build entry")
	}
	runLength := len(x) / n
	ret := make([][]int, n)
	for i := range ret {
		ret[i] = make([]int, runLength)
		// copy stops at len(ret[i]), so only this run's values are taken.
		copy(ret[i], x[i*runLength:])
	}
	return ret
}
147 | 
148 | // First the simple cases.
149 | 
150 | func TestFind(t *testing.T) {
151 | 	for _, test := range findTests {
152 | 		re := MustCompile(test.pat)
153 | 		if re.String() != test.pat {
154 | 			t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat)
155 | 		}
156 | 		result := re.Find([]byte(test.text))
157 | 		switch {
158 | 		case len(test.matches) == 0 && len(result) == 0:
159 | 			// ok
160 | 		case test.matches == nil && result != nil:
161 | 			t.Errorf("expected no match; got one: %s", test)
162 | 		case test.matches != nil && result == nil:
163 | 			t.Errorf("expected match; got none: %s", test)
164 | 		case test.matches != nil && result != nil:
165 | 			expect := test.text[test.matches[0][0]:test.matches[0][1]]
166 | 			if len(result) != cap(result) {
167 | 				t.Errorf("expected capacity %d got %d: %s", len(result), cap(result), test)
168 | 			}
169 | 			if expect != string(result) {
170 | 				t.Errorf("expected %q got %q: %s", expect, result, test)
171 | 			}
172 | 		}
173 | 	}
174 | }
175 | 
176 | func TestFindString(t *testing.T) {
177 | 	for _, test := range findTests {
178 | 		result := MustCompile(test.pat).FindString(test.text)
179 | 		switch {
180 | 		case len(test.matches) == 0 && len(result) == 0:
181 | 			// ok
182 | 		case test.matches == nil && result != "":
183 | 			t.Errorf("expected no match; got one: %s", test)
184 | 		case test.matches != nil && result == "":
185 | 			// Tricky because an empty result has two meanings: no match or empty match.
186 | 			if test.matches[0][0] != test.matches[0][1] {
187 | 				t.Errorf("expected match; got none: %s", test)
188 | 			}
189 | 		case test.matches != nil && result != "":
190 | 			expect := test.text[test.matches[0][0]:test.matches[0][1]]
191 | 			if expect != result {
192 | 				t.Errorf("expected %q got %q: %s", expect, result, test)
193 | 			}
194 | 		}
195 | 	}
196 | }
197 | 
198 | func testFindIndex(test *FindTest, result []int, t *testing.T) {
199 | 	switch {
200 | 	case len(test.matches) == 0 && len(result) == 0:
201 | 		// ok
202 | 	case test.matches == nil && result != nil:
203 | 		t.Errorf("expected no match; got one: %s", test)
204 | 	case test.matches != nil && result == nil:
205 | 		t.Errorf("expected match; got none: %s", test)
206 | 	case test.matches != nil && result != nil:
207 | 		expect := test.matches[0]
208 | 		if expect[0] != result[0] || expect[1] != result[1] {
209 | 			t.Errorf("expected %v got %v: %s", expect, result, test)
210 | 		}
211 | 	}
212 | }
213 | 
214 | func TestFindIndex(t *testing.T) {
215 | 	for _, test := range findTests {
216 | 		testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t)
217 | 	}
218 | }
219 | 
220 | func TestFindStringIndex(t *testing.T) {
221 | 	for _, test := range findTests {
222 | 		testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t)
223 | 	}
224 | }
225 | 
226 | func TestFindReaderIndex(t *testing.T) {
227 | 	for _, test := range findTests {
228 | 		testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t)
229 | 	}
230 | }
231 | 
232 | // Now come the simple All cases.
233 | 
234 | func TestFindAll(t *testing.T) {
235 | 	for _, test := range findTests {
236 | 		result := MustCompile(test.pat).FindAll([]byte(test.text), -1)
237 | 		switch {
238 | 		case test.matches == nil && result == nil:
239 | 			// ok
240 | 		case test.matches == nil && result != nil:
241 | 			t.Errorf("expected no match; got one: %s", test)
242 | 		case test.matches != nil && result == nil:
243 | 			t.Fatalf("expected match; got none: %s", test)
244 | 		case test.matches != nil && result != nil:
245 | 			if len(test.matches) != len(result) {
246 | 				t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
247 | 				continue
248 | 			}
249 | 			for k, e := range test.matches {
250 | 				got := result[k]
251 | 				if len(got) != cap(got) {
252 | 					t.Errorf("match %d: expected capacity %d got %d: %s", k, len(got), cap(got), test)
253 | 				}
254 | 				expect := test.text[e[0]:e[1]]
255 | 				if expect != string(got) {
256 | 					t.Errorf("match %d: expected %q got %q: %s", k, expect, got, test)
257 | 				}
258 | 			}
259 | 		}
260 | 	}
261 | }
262 | 
263 | func TestFindAllString(t *testing.T) {
264 | 	for _, test := range findTests {
265 | 		result := MustCompile(test.pat).FindAllString(test.text, -1)
266 | 		switch {
267 | 		case test.matches == nil && result == nil:
268 | 			// ok
269 | 		case test.matches == nil && result != nil:
270 | 			t.Errorf("expected no match; got one: %s", test)
271 | 		case test.matches != nil && result == nil:
272 | 			t.Errorf("expected match; got none: %s", test)
273 | 		case test.matches != nil && result != nil:
274 | 			if len(test.matches) != len(result) {
275 | 				t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
276 | 				continue
277 | 			}
278 | 			for k, e := range test.matches {
279 | 				expect := test.text[e[0]:e[1]]
280 | 				if expect != result[k] {
281 | 					t.Errorf("expected %q got %q: %s", expect, result, test)
282 | 				}
283 | 			}
284 | 		}
285 | 	}
286 | }
287 | 
288 | func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) {
289 | 	switch {
290 | 	case test.matches == nil && result == nil:
291 | 		// ok
292 | 	case test.matches == nil && result != nil:
293 | 		t.Errorf("expected no match; got one: %s", test)
294 | 	case test.matches != nil && result == nil:
295 | 		t.Errorf("expected match; got none: %s", test)
296 | 	case test.matches != nil && result != nil:
297 | 		if len(test.matches) != len(result) {
298 | 			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
299 | 			return
300 | 		}
301 | 		for k, e := range test.matches {
302 | 			if e[0] != result[k][0] || e[1] != result[k][1] {
303 | 				t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test)
304 | 			}
305 | 		}
306 | 	}
307 | }
308 | 
309 | func TestFindAllIndex(t *testing.T) {
310 | 	for _, test := range findTests {
311 | 		testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t)
312 | 	}
313 | }
314 | 
315 | func TestFindAllStringIndex(t *testing.T) {
316 | 	for _, test := range findTests {
317 | 		testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t)
318 | 	}
319 | }
320 | 
321 | // Now come the Submatch cases.
322 | 
323 | func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) {
324 | 	if len(submatches) != len(result)*2 {
325 | 		t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
326 | 		return
327 | 	}
328 | 	for k := 0; k < len(submatches); k += 2 {
329 | 		if submatches[k] == -1 {
330 | 			if result[k/2] != nil {
331 | 				t.Errorf("match %d: expected nil got %q: %s", n, result, test)
332 | 			}
333 | 			continue
334 | 		}
335 | 		got := result[k/2]
336 | 		if len(got) != cap(got) {
337 | 			t.Errorf("match %d: expected capacity %d got %d: %s", n, len(got), cap(got), test)
338 | 			return
339 | 		}
340 | 		expect := test.text[submatches[k]:submatches[k+1]]
341 | 		if expect != string(got) {
342 | 			t.Errorf("match %d: expected %q got %q: %s", n, expect, got, test)
343 | 			return
344 | 		}
345 | 	}
346 | }
347 | 
348 | func TestFindSubmatch(t *testing.T) {
349 | 	for _, test := range findTests {
350 | 		result := MustCompile(test.pat).FindSubmatch([]byte(test.text))
351 | 		switch {
352 | 		case test.matches == nil && result == nil:
353 | 			// ok
354 | 		case test.matches == nil && result != nil:
355 | 			t.Errorf("expected no match; got one: %s", test)
356 | 		case test.matches != nil && result == nil:
357 | 			t.Errorf("expected match; got none: %s", test)
358 | 		case test.matches != nil && result != nil:
359 | 			testSubmatchBytes(&test, 0, test.matches[0], result, t)
360 | 		}
361 | 	}
362 | }
363 | 
364 | func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) {
365 | 	if len(submatches) != len(result)*2 {
366 | 		t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
367 | 		return
368 | 	}
369 | 	for k := 0; k < len(submatches); k += 2 {
370 | 		if submatches[k] == -1 {
371 | 			if result[k/2] != "" {
372 | 				t.Errorf("match %d: expected nil got %q: %s", n, result, test)
373 | 			}
374 | 			continue
375 | 		}
376 | 		expect := test.text[submatches[k]:submatches[k+1]]
377 | 		if expect != result[k/2] {
378 | 			t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
379 | 			return
380 | 		}
381 | 	}
382 | }
383 | 
384 | func TestFindStringSubmatch(t *testing.T) {
385 | 	for _, test := range findTests {
386 | 		result := MustCompile(test.pat).FindStringSubmatch(test.text)
387 | 		switch {
388 | 		case test.matches == nil && result == nil:
389 | 			// ok
390 | 		case test.matches == nil && result != nil:
391 | 			t.Errorf("expected no match; got one: %s", test)
392 | 		case test.matches != nil && result == nil:
393 | 			t.Errorf("expected match; got none: %s", test)
394 | 		case test.matches != nil && result != nil:
395 | 			testSubmatchString(&test, 0, test.matches[0], result, t)
396 | 		}
397 | 	}
398 | }
399 | 
400 | func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) {
401 | 	if len(expect) != len(result) {
402 | 		t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test)
403 | 		return
404 | 	}
405 | 	for k, e := range expect {
406 | 		if e != result[k] {
407 | 			t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test)
408 | 		}
409 | 	}
410 | }
411 | 
412 | func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) {
413 | 	switch {
414 | 	case test.matches == nil && result == nil:
415 | 		// ok
416 | 	case test.matches == nil && result != nil:
417 | 		t.Errorf("expected no match; got one: %s", test)
418 | 	case test.matches != nil && result == nil:
419 | 		t.Errorf("expected match; got none: %s", test)
420 | 	case test.matches != nil && result != nil:
421 | 		testSubmatchIndices(test, 0, test.matches[0], result, t)
422 | 	}
423 | }
424 | 
425 | func TestFindSubmatchIndex(t *testing.T) {
426 | 	for _, test := range findTests {
427 | 		testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t)
428 | 	}
429 | }
430 | 
431 | func TestFindStringSubmatchIndex(t *testing.T) {
432 | 	for _, test := range findTests {
433 | 		testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t)
434 | 	}
435 | }
436 | 
437 | func TestFindReaderSubmatchIndex(t *testing.T) {
438 | 	for _, test := range findTests {
439 | 		testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t)
440 | 	}
441 | }
442 | 
443 | // Now come the monster AllSubmatch cases.
444 | 
445 | func TestFindAllSubmatch(t *testing.T) {
446 | 	for _, test := range findTests {
447 | 		result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1)
448 | 		switch {
449 | 		case test.matches == nil && result == nil:
450 | 			// ok
451 | 		case test.matches == nil && result != nil:
452 | 			t.Errorf("expected no match; got one: %s", test)
453 | 		case test.matches != nil && result == nil:
454 | 			t.Errorf("expected match; got none: %s", test)
455 | 		case len(test.matches) != len(result):
456 | 			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
457 | 		case test.matches != nil && result != nil:
458 | 			for k, match := range test.matches {
459 | 				testSubmatchBytes(&test, k, match, result[k], t)
460 | 			}
461 | 		}
462 | 	}
463 | }
464 | 
465 | func TestFindAllStringSubmatch(t *testing.T) {
466 | 	for _, test := range findTests {
467 | 		result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1)
468 | 		switch {
469 | 		case test.matches == nil && result == nil:
470 | 			// ok
471 | 		case test.matches == nil && result != nil:
472 | 			t.Errorf("expected no match; got one: %s", test)
473 | 		case test.matches != nil && result == nil:
474 | 			t.Errorf("expected match; got none: %s", test)
475 | 		case len(test.matches) != len(result):
476 | 			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
477 | 		case test.matches != nil && result != nil:
478 | 			for k, match := range test.matches {
479 | 				testSubmatchString(&test, k, match, result[k], t)
480 | 			}
481 | 		}
482 | 	}
483 | }
484 | 
485 | func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) {
486 | 	switch {
487 | 	case test.matches == nil && result == nil:
488 | 		// ok
489 | 	case test.matches == nil && result != nil:
490 | 		t.Errorf("expected no match; got one: %s", test)
491 | 	case test.matches != nil && result == nil:
492 | 		t.Errorf("expected match; got none: %s", test)
493 | 	case len(test.matches) != len(result):
494 | 		t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
495 | 	case test.matches != nil && result != nil:
496 | 		for k, match := range test.matches {
497 | 			testSubmatchIndices(test, k, match, result[k], t)
498 | 		}
499 | 	}
500 | }
501 | 
502 | func TestFindAllSubmatchIndex(t *testing.T) {
503 | 	for _, test := range findTests {
504 | 		testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t)
505 | 	}
506 | }
507 | 
508 | func TestFindAllStringSubmatchIndex(t *testing.T) {
509 | 	for _, test := range findTests {
510 | 		testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t)
511 | 	}
512 | }
513 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module rsc.io/binaryregexp
2 | 
3 | go 1.12
4 | 


--------------------------------------------------------------------------------
/onepass.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2014 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package binaryregexp
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"sort"
 10 | 	"unicode"
 11 | 
 12 | 	"rsc.io/binaryregexp/syntax"
 13 | )
 14 | 
 15 | // "One-pass" regexp execution.
 16 | // Some regexps can be analyzed to determine that they never need
 17 | // backtracking: they are guaranteed to run in one pass over the string
 18 | // without bothering to save all the usual NFA state.
 19 | // Detect those and execute them more quickly.
 20 | 
// A onePassProg is a compiled one-pass regular expression program.
// It is the same as syntax.Prog except for the use of onePassInst.
// onePassCopy builds one from a syntax.Prog, copying Start and NumCap
// unchanged and wrapping each instruction in a onePassInst.
type onePassProg struct {
	Inst   []onePassInst // instructions, indexed by pc
	Start  int // index of start instruction
	NumCap int // number of InstCapture insts in re
}
 28 | 
// A onePassInst is a single instruction in a one-pass regular expression program.
// It is the same as syntax.Inst except for the new 'Next' field.
type onePassInst struct {
	syntax.Inst
	// Next holds target pcs. onePassNext indexes it with the position
	// returned by MatchRunePos for the input rune.
	Next []uint32
}
 35 | 
 36 | // OnePassPrefix returns a literal string that all matches for the
 37 | // regexp must start with. Complete is true if the prefix
 38 | // is the entire match. Pc is the index of the last rune instruction
 39 | // in the string. The OnePassPrefix skips over the mandatory
 40 | // EmptyBeginText
 41 | func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
 42 | 	i := &p.Inst[p.Start]
 43 | 	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
 44 | 		return "", i.Op == syntax.InstMatch, uint32(p.Start)
 45 | 	}
 46 | 	pc = i.Out
 47 | 	i = &p.Inst[pc]
 48 | 	for i.Op == syntax.InstNop {
 49 | 		pc = i.Out
 50 | 		i = &p.Inst[pc]
 51 | 	}
 52 | 	// Avoid allocation of buffer if prefix is empty.
 53 | 	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
 54 | 		return "", i.Op == syntax.InstMatch, uint32(p.Start)
 55 | 	}
 56 | 
 57 | 	// Have prefix; gather characters.
 58 | 	var buf bytes.Buffer
 59 | 	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && i.Rune[0] <= 0xFF && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
 60 | 		buf.WriteByte(byte(i.Rune[0]))
 61 | 		pc, i = i.Out, &p.Inst[i.Out]
 62 | 	}
 63 | 	if i.Op == syntax.InstEmptyWidth &&
 64 | 		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
 65 | 		p.Inst[i.Out].Op == syntax.InstMatch {
 66 | 		complete = true
 67 | 	}
 68 | 	return buf.String(), complete, pc
 69 | }
 70 | 
 71 | // OnePassNext selects the next actionable state of the prog, based on the input character.
 72 | // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
 73 | // One of the alternates may ultimately lead without input to end of line. If the instruction
 74 | // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
 75 | func onePassNext(i *onePassInst, r rune) uint32 {
 76 | 	next := i.MatchRunePos(r)
 77 | 	if next >= 0 {
 78 | 		return i.Next[next]
 79 | 	}
 80 | 	if i.Op == syntax.InstAltMatch {
 81 | 		return i.Out
 82 | 	}
 83 | 	return 0
 84 | }
 85 | 
 86 | func iop(i *syntax.Inst) syntax.InstOp {
 87 | 	op := i.Op
 88 | 	switch op {
 89 | 	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
 90 | 		op = syntax.InstRune
 91 | 	}
 92 | 	return op
 93 | }
 94 | 
 95 | // Sparse Array implementation is used as a queueOnePass.
 96 | type queueOnePass struct {
 97 | 	sparse          []uint32
 98 | 	dense           []uint32
 99 | 	size, nextIndex uint32
100 | }
101 | 
102 | func (q *queueOnePass) empty() bool {
103 | 	return q.nextIndex >= q.size
104 | }
105 | 
106 | func (q *queueOnePass) next() (n uint32) {
107 | 	n = q.dense[q.nextIndex]
108 | 	q.nextIndex++
109 | 	return
110 | }
111 | 
112 | func (q *queueOnePass) clear() {
113 | 	q.size = 0
114 | 	q.nextIndex = 0
115 | }
116 | 
117 | func (q *queueOnePass) contains(u uint32) bool {
118 | 	if u >= uint32(len(q.sparse)) {
119 | 		return false
120 | 	}
121 | 	return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
122 | }
123 | 
124 | func (q *queueOnePass) insert(u uint32) {
125 | 	if !q.contains(u) {
126 | 		q.insertNew(u)
127 | 	}
128 | }
129 | 
130 | func (q *queueOnePass) insertNew(u uint32) {
131 | 	if u >= uint32(len(q.sparse)) {
132 | 		return
133 | 	}
134 | 	q.sparse[u] = q.size
135 | 	q.dense[q.size] = u
136 | 	q.size++
137 | }
138 | 
139 | func newQueue(size int) (q *queueOnePass) {
140 | 	return &queueOnePass{
141 | 		sparse: make([]uint32, size),
142 | 		dense:  make([]uint32, size),
143 | 	}
144 | }
145 | 
// mergeFailed is the single NextIp value returned by mergeRuneSets when its
// inputs intersect.
const mergeFailed = uint32(0xffffffff)

// noRune and noNext are the results mergeRuneSets returns on failure. They
// are shared package-level values, so callers must not modify them.
var (
	noRune = []rune{}
	noNext = []uint32{mergeFailed}
)

// mergeRuneSets merges two non-intersecting runesets, and returns the merged
// result, and a NextIp array. The idea is that if a rune matches the merged
// runes at index i, NextIp[i/2] is the target: leftPC for pairs taken from
// leftRunes and rightPC for pairs taken from rightRunes.
//
// Both inputs are flat lists of [lo, hi] pairs and are assumed to be ordered
// and non-intersecting within themselves. If a pair's low bound is not
// strictly greater than the previous merged pair's high bound, the sets
// intersect and (noRune, noNext) is returned: an empty runeset and a NextIp
// array with the single element mergeFailed.
//
// mergeRuneSets panics if either input has odd length.
func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
	left, right := *leftRunes, *rightRunes
	if len(left)&0x1 != 0 || len(right)&0x1 != 0 {
		panic("mergeRuneSets odd length []rune")
	}

	// The output sizes are known up front: every input pair is emitted once.
	total := len(left) + len(right)
	merged := make([]rune, 0, total)
	next := make([]uint32, 0, total/2)

	for len(left) > 0 || len(right) > 0 {
		// Take the pair with the smaller low bound; ties go to the left set.
		var (
			lo, hi rune
			pc     uint32
		)
		if len(left) == 0 || (len(right) > 0 && right[0] < left[0]) {
			lo, hi, pc = right[0], right[1], rightPC
			right = right[2:]
		} else {
			lo, hi, pc = left[0], left[1], leftPC
			left = left[2:]
		}

		// The new pair must start strictly after the previous pair ends.
		if n := len(merged); n > 0 && lo <= merged[n-1] {
			return noRune, noNext
		}
		merged = append(merged, lo, hi)
		next = append(next, pc)
	}
	return merged, next
}
206 | 
207 | // cleanupOnePass drops working memory, and restores certain shortcut instructions.
208 | func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
209 | 	for ix, instOriginal := range original.Inst {
210 | 		switch instOriginal.Op {
211 | 		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
212 | 		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
213 | 			prog.Inst[ix].Next = nil
214 | 		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
215 | 			prog.Inst[ix].Next = nil
216 | 			prog.Inst[ix] = onePassInst{Inst: instOriginal}
217 | 		}
218 | 	}
219 | }
220 | 
// onePassCopy creates a copy of the original Prog, as we'll be modifying it.
// Every instruction is wrapped in a onePassInst with an empty Next table;
// Start and NumCap are carried over unchanged. Two patterns of chained Alt
// instructions are then rewritten in the copy (see below). The instruction
// structs of prog itself are not written to.
func onePassCopy(prog *syntax.Prog) *onePassProg {
	p := &onePassProg{
		Start:  prog.Start,
		NumCap: prog.NumCap,
		Inst:   make([]onePassInst, len(prog.Inst)),
	}
	for i, inst := range prog.Inst {
		p.Inst[i] = onePassInst{Inst: inst}
	}

	// rewrites one or more common Prog constructs that enable some otherwise
	// non-onepass Progs to be onepass. A:BC (for example) means an InstAlt at
	// ip A, that points to ips B & C.
	// A:BC + B:DA => A:BC + B:DC
	// A:BC + B:DC => A:DC + B:DC
	for pc := range p.Inst {
		switch p.Inst[pc].Op {
		default:
			continue
		case syntax.InstAlt, syntax.InstAltMatch:
			// A:Bx + B:Ay
			// p_A_Alt points at the leg of A whose target is another Alt (B);
			// p_A_Other points at A's remaining leg. Start by guessing that
			// Arg is the Alt leg and swap below if it is not.
			p_A_Other := &p.Inst[pc].Out
			p_A_Alt := &p.Inst[pc].Arg
			// make sure a target is another Alt
			instAlt := p.Inst[*p_A_Alt]
			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
				instAlt = p.Inst[*p_A_Alt]
				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
					continue
				}
			}
			instOther := p.Inst[*p_A_Other]
			// Analyzing both legs pointing to Alts is for another day
			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
				// too complicated
				continue
			}
			// simple empty transition loop
			// A:BC + B:DA => A:BC + B:DC
			// p_B_Alt ends up pointing at the leg of B that leads back to A
			// (if there is one) and p_B_Other at B's remaining leg. The
			// back-edge is redirected to A's non-Alt target.
			p_B_Alt := &p.Inst[*p_A_Alt].Out
			p_B_Other := &p.Inst[*p_A_Alt].Arg
			patch := false
			if instAlt.Out == uint32(pc) {
				patch = true
			} else if instAlt.Arg == uint32(pc) {
				patch = true
				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
			}
			if patch {
				*p_B_Alt = *p_A_Other
			}

			// empty transition to common target
			// A:BC + B:DC => A:DC + B:DC
			// A and B share the target C, so A can skip B and go straight to
			// B's other target.
			if *p_A_Other == *p_B_Alt {
				*p_A_Alt = *p_B_Other
			}
		}
	}
	return p
}
284 | 
// runeSlice is a []rune with Len, Less and Swap methods so that the
// case-folded rune sets can be passed to sort.Sort. Ordering is ascending by
// rune value.
type runeSlice []rune

// Len returns the number of runes in the slice.
func (rs runeSlice) Len() int {
	return len(rs)
}

// Less reports whether the rune at index a is smaller than the one at index b.
func (rs runeSlice) Less(a, b int) bool {
	return rs[a] < rs[b]
}

// Swap exchanges the runes at indexes a and b.
func (rs runeSlice) Swap(a, b int) {
	rs[a], rs[b] = rs[b], rs[a]
}
291 | 
// Rune sets used for the "any rune" instructions, written as lo, hi pairs.
var (
	// anyRuneNotNL covers every rune except '\n'.
	anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
	// anyRune covers every rune from 0 through unicode.MaxRune.
	anyRune = []rune{0, unicode.MaxRune}
)
294 | 
// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
// the match engine can always tell which branch to take. If it isn't possible
// for this to be a onepass Prog, nil is returned.
//
// The instructions of p are rewritten in place while they are being checked
// (Alt legs may be swapped, Op and Next are updated), so p may already be
// partially modified when nil is returned. On success the same p is returned
// with every instruction's Rune replaced by the rune set computed for it.
//
// Programs with 1000 or more instructions are rejected without being checked.
// The check is recursive, with depth bounded by the size of the Prog.
func makeOnePass(p *onePassProg) *onePassProg {
	// If the machine is very long, it's not worth the time to check if we can use one pass.
	if len(p.Inst) >= 1000 {
		return nil
	}

	var (
		// instQueue holds the pcs still to be checked: the start pc plus
		// the Out target of every rune-consuming instruction reached.
		instQueue = newQueue(len(p.Inst))
		// visitQueue records the pcs seen during the current traversal;
		// it is cleared before each pc taken from instQueue.
		visitQueue = newQueue(len(p.Inst))
		check      func(uint32, []bool) bool
		// onePassRunes[pc] is the rune set computed for pc; it is copied
		// into p.Inst[pc].Rune only if the whole program checks out.
		onePassRunes = make([][]rune, len(p.Inst))
	)

	// check that paths from Alt instructions are unambiguous, and rebuild the new
	// program as a onepass program
	//
	// m[pc] is set to whether pc reaches InstMatch without consuming input.
	// The result is false as soon as an ambiguity is found.
	check = func(pc uint32, m []bool) (ok bool) {
		ok = true
		inst := &p.Inst[pc]
		// Already handled during this traversal: nothing more to do.
		if visitQueue.contains(pc) {
			return
		}
		visitQueue.insert(pc)
		switch inst.Op {
		case syntax.InstAlt, syntax.InstAltMatch:
			ok = check(inst.Out, m) && check(inst.Arg, m)
			// check no-input paths to InstMatch
			matchOut := m[inst.Out]
			matchArg := m[inst.Arg]
			// Both legs can match on empty input: ambiguous.
			if matchOut && matchArg {
				ok = false
				break
			}
			// Match on empty goes in inst.Out
			if matchArg {
				inst.Out, inst.Arg = inst.Arg, inst.Out
				matchOut, matchArg = matchArg, matchOut
			}
			if matchOut {
				m[pc] = true
				inst.Op = syntax.InstAltMatch
			}

			// build a dispatch operator from the two legs of the alt.
			onePassRunes[pc], inst.Next = mergeRuneSets(
				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
			// A leading mergeFailed marks rune sets that could not be merged.
			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
				ok = false
				break
			}
		case syntax.InstCapture, syntax.InstNop:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			// pass matching runes back through these no-ops.
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			// One Next entry per rune pair plus one extra, all pointing at Out.
			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
			for i := range inst.Next {
				inst.Next[i] = inst.Out
			}
		case syntax.InstEmptyWidth:
			// Handled exactly like InstCapture/InstNop above.
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
			for i := range inst.Next {
				inst.Next[i] = inst.Out
			}
		case syntax.InstMatch, syntax.InstFail:
			m[pc] = inst.Op == syntax.InstMatch
		case syntax.InstRune:
			m[pc] = false
			// Next already filled in: this instruction was processed before.
			if len(inst.Next) > 0 {
				break
			}
			// The target is checked later as its own traversal root.
			instQueue.insert(inst.Out)
			if len(inst.Rune) == 0 {
				onePassRunes[pc] = []rune{}
				inst.Next = []uint32{inst.Out}
				break
			}
			runes := make([]rune, 0)
			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				// Expand the single rune into one r, r pair per member of
				// its SimpleFold orbit, then sort the pairs.
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune...)
			}
			onePassRunes[pc] = runes
			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
			for i := range inst.Next {
				inst.Next[i] = inst.Out
			}
			inst.Op = syntax.InstRune
		case syntax.InstRune1:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			runes := []rune{}
			// expand case-folded runes
			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune[0], inst.Rune[0])
			}
			onePassRunes[pc] = runes
			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
			for i := range inst.Next {
				inst.Next[i] = inst.Out
			}
			// The instruction is rewritten to the general InstRune form.
			inst.Op = syntax.InstRune
		case syntax.InstRuneAny:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRune...)
			inst.Next = []uint32{inst.Out}
		case syntax.InstRuneAnyNotNL:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
			for i := range inst.Next {
				inst.Next[i] = inst.Out
			}
		}
		return
	}

	// Check from the start instruction, then from every queued target, until
	// the queue is empty or a check fails.
	instQueue.clear()
	instQueue.insert(uint32(p.Start))
	m := make([]bool, len(p.Inst))
	for !instQueue.empty() {
		visitQueue.clear()
		pc := instQueue.next()
		if !check(pc, m) {
			p = nil
			break
		}
	}
	// Success: install the computed rune sets.
	if p != nil {
		for i := range p.Inst {
			p.Inst[i].Rune = onePassRunes[i]
		}
	}
	return p
}
461 | 
462 | // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
463 | // can be recharacterized as a one-pass regexp program, or syntax.nil if the
464 | // Prog cannot be converted. For a one pass prog, the fundamental condition that must
465 | // be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
466 | func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
467 | 	if prog.Start == 0 {
468 | 		return nil
469 | 	}
470 | 	// onepass regexp is anchored
471 | 	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
472 | 		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
473 | 		return nil
474 | 	}
475 | 	// every instruction leading to InstMatch must be EmptyEndText
476 | 	for _, inst := range prog.Inst {
477 | 		opOut := prog.Inst[inst.Out].Op
478 | 		switch inst.Op {
479 | 		default:
480 | 			if opOut == syntax.InstMatch {
481 | 				return nil
482 | 			}
483 | 		case syntax.InstAlt, syntax.InstAltMatch:
484 | 			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
485 | 				return nil
486 | 			}
487 | 		case syntax.InstEmptyWidth:
488 | 			if opOut == syntax.InstMatch {
489 | 				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
490 | 					continue
491 | 				}
492 | 				return nil
493 | 			}
494 | 		}
495 | 	}
496 | 	// Creates a slightly optimized copy of the original Prog
497 | 	// that cleans up some Prog idioms that block valid onepass programs
498 | 	p = onePassCopy(prog)
499 | 
500 | 	// checkAmbiguity on InstAlts, build onepass Prog if possible
501 | 	p = makeOnePass(p)
502 | 
503 | 	if p != nil {
504 | 		cleanupOnePass(p, prog)
505 | 	}
506 | 	return p
507 | }
508 | 


--------------------------------------------------------------------------------
/onepass_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2014 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package binaryregexp
  6 | 
  7 | import (
  8 | 	"reflect"
  9 | 	"rsc.io/binaryregexp/syntax"
 10 | 	"strings"
 11 | 	"testing"
 12 | )
 13 | 
// runeMergeTests is the table driving TestMergeRuneSet. Each entry passes
// left and right, together with leftPC and rightPC, to mergeRuneSets and
// lists the merged rune set and next slice the call is expected to return.
// Entries whose sets cannot be merged expect an empty merged set and a next
// slice holding only mergeFailed.
var runeMergeTests = []struct {
	left, right, merged []rune
	next                []uint32
	leftPC, rightPC     uint32
}{
	{
		// empty rhs
		[]rune{69, 69},
		[]rune{},
		[]rune{69, 69},
		[]uint32{1},
		1, 2,
	},
	{
		// identical runes, identical targets
		[]rune{69, 69},
		[]rune{69, 69},
		[]rune{},
		[]uint32{mergeFailed},
		1, 1,
	},
	{
		// identical runes, different targets
		[]rune{69, 69},
		[]rune{69, 69},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// append right-first
		[]rune{69, 69},
		[]rune{71, 71},
		[]rune{69, 69, 71, 71},
		[]uint32{1, 2},
		1, 2,
	},
	{
		// append, left-first
		[]rune{71, 71},
		[]rune{69, 69},
		[]rune{69, 69, 71, 71},
		[]uint32{2, 1},
		1, 2,
	},
	{
		// successful interleave
		[]rune{60, 60, 71, 71, 101, 101},
		[]rune{69, 69, 88, 88},
		[]rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101},
		[]uint32{1, 2, 1, 2, 1},
		1, 2,
	},
	{
		// left surrounds right
		[]rune{69, 74},
		[]rune{71, 71},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// right surrounds left
		[]rune{69, 74},
		[]rune{68, 75},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// overlap at interval begin
		[]rune{69, 74},
		[]rune{74, 75},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// overlap at interval end
		[]rune{69, 74},
		[]rune{65, 69},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// overlap from above
		[]rune{69, 74},
		[]rune{71, 74},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// overlap from below
		[]rune{69, 74},
		[]rune{65, 71},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// out of order []rune
		[]rune{69, 74, 60, 65},
		[]rune{66, 67},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
}
124 | 
125 | func TestMergeRuneSet(t *testing.T) {
126 | 	for ix, test := range runeMergeTests {
127 | 		merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC)
128 | 		if !reflect.DeepEqual(merged, test.merged) {
129 | 			t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged)
130 | 		}
131 | 		if !reflect.DeepEqual(next, test.next) {
132 | 			t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next)
133 | 		}
134 | 	}
135 | }
136 | 
// onePassTests is the table driving TestCompileOnePass: re is the pattern to
// parse, simplify and compile, and isOnePass is whether compileOnePass is
// expected to return a non-nil program for it.
var onePassTests = []struct {
	re        string
	isOnePass bool
}{
	{`^(?:a|(?:a*))$`, false},
	{`^(?:(a)|(?:a*))$`, false},
	{`^(?:(?:(?:.(?:$))?))$`, true},
	{`^abcd$`, true},
	{`^(?:(?:a{0,})*?)$`, true},
	{`^(?:(?:a+)*)$`, true},
	{`^(?:(?:a|(?:aa)))$`, true},
	{`^(?:[^\s\S])$`, true},
	{`^(?:(?:a{3,4}){0,})$`, false},
	{`^(?:(?:(?:a*)+))$`, true},
	{`^[a-c]+$`, true},
	{`^[a-c]*$`, true},
	{`^(?:a*)$`, true},
	{`^(?:(?:aa)|a)$`, true},
	{`^[a-c]*`, false}, // no trailing $
	{`^...$`, true},
	{`^(?:a|(?:aa))$`, true},
	{`^a((b))c$`, true},
	{`^a.[l-nA-Cg-j]?e$`, true},
	{`^a((b))$`, true},
	{`^a(?:(b)|(c))c$`, true},
	{`^a(?:(b*)|(c))c$`, false},
	{`^a(?:b|c)$`, true},
	{`^a(?:b?|c)$`, true},
	{`^a(?:b?|c?)$`, false},
	{`^a(?:b?|c+)$`, true},
	{`^a(?:b+|(bc))d$`, false},
	{`^a(?:bc)+$`, true},
	{`^a(?:[bcd])+$`, true},
	{`^a((?:[bcd])+)$`, true},
	{`^a(:?b|c)*d$`, true}, // NOTE(review): "(:?" is a capturing group with an optional ':'; possibly meant "(?:" — confirm
	{`^.bc(d|e)*$`, true},
	{`^(?:(?:aa)|.)$`, false},
	{`^(?:(?:a{1,2}){1,2})$`, false},
	{`^l` + strings.Repeat("o", 2<<8) + `ng$`, true}, // 512 'o's
}
177 | 
178 | func TestCompileOnePass(t *testing.T) {
179 | 	var (
180 | 		p   *syntax.Prog
181 | 		re  *syntax.Regexp
182 | 		err error
183 | 	)
184 | 	for _, test := range onePassTests {
185 | 		if re, err = syntax.Parse(test.re, syntax.Perl); err != nil {
186 | 			t.Errorf("Parse(%q) got err:%s, want success", test.re, err)
187 | 			continue
188 | 		}
189 | 		// needs to be done before compile...
190 | 		re = re.Simplify()
191 | 		if p, err = syntax.Compile(re); err != nil {
192 | 			t.Errorf("Compile(%q) got err:%s, want success", test.re, err)
193 | 			continue
194 | 		}
195 | 		isOnePass := compileOnePass(p) != nil
196 | 		if isOnePass != test.isOnePass {
197 | 			t.Errorf("CompileOnePass(%q) got isOnePass=%v, expected %v", test.re, isOnePass, test.isOnePass)
198 | 		}
199 | 	}
200 | }
201 | 
// onePassTests1 is the table driving TestRunOnePass: re must compile to a
// one-pass program and must match the string in match.
//
// TODO(cespare): Unify with onePassTests and rationalize one-pass test cases.
var onePassTests1 = []struct {
	re    string
	match string
}{
	{`^a(/b+(#c+)*)*$`, "a/b#c"}, // golang.org/issue/11905
}
209 | 
210 | func TestRunOnePass(t *testing.T) {
211 | 	for _, test := range onePassTests1 {
212 | 		re, err := Compile(test.re)
213 | 		if err != nil {
214 | 			t.Errorf("Compile(%q): got err: %s", test.re, err)
215 | 			continue
216 | 		}
217 | 		if re.onepass == nil {
218 | 			t.Errorf("Compile(%q): got nil, want one-pass", test.re)
219 | 			continue
220 | 		}
221 | 		if !re.MatchString(test.match) {
222 | 			t.Errorf("onepass %q did not match %q", test.re, test.match)
223 | 		}
224 | 	}
225 | }
226 | 


--------------------------------------------------------------------------------
/syntax/compile.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | import "unicode"
  8 | 
// A patchList is a list of instruction pointers that need to be filled in (patched).
// Because the pointers haven't been filled in yet, we can reuse their storage
// to hold the list. It's kind of sleazy, but works well in practice.
// See https://swtch.com/~rsc/regexp/regexp1.html for inspiration.
//
// These aren't really pointers: they're integers, so we can reinterpret them
// this way without using package unsafe. A value l denotes
// p.Inst[l>>1].Out (l&1==0) or .Arg (l&1==1).
// l == 0 denotes the empty list, okay because we start every program
// with a fail instruction, so we'll never want to point at its output link.
type patchList uint32
 20 | 
 21 | func (l patchList) next(p *Prog) patchList {
 22 | 	i := &p.Inst[l>>1]
 23 | 	if l&1 == 0 {
 24 | 		return patchList(i.Out)
 25 | 	}
 26 | 	return patchList(i.Arg)
 27 | }
 28 | 
 29 | func (l patchList) patch(p *Prog, val uint32) {
 30 | 	for l != 0 {
 31 | 		i := &p.Inst[l>>1]
 32 | 		if l&1 == 0 {
 33 | 			l = patchList(i.Out)
 34 | 			i.Out = val
 35 | 		} else {
 36 | 			l = patchList(i.Arg)
 37 | 			i.Arg = val
 38 | 		}
 39 | 	}
 40 | }
 41 | 
 42 | func (l1 patchList) append(p *Prog, l2 patchList) patchList {
 43 | 	if l1 == 0 {
 44 | 		return l2
 45 | 	}
 46 | 	if l2 == 0 {
 47 | 		return l1
 48 | 	}
 49 | 
 50 | 	last := l1
 51 | 	for {
 52 | 		next := last.next(p)
 53 | 		if next == 0 {
 54 | 			break
 55 | 		}
 56 | 		last = next
 57 | 	}
 58 | 
 59 | 	i := &p.Inst[last>>1]
 60 | 	if last&1 == 0 {
 61 | 		i.Out = uint32(l2)
 62 | 	} else {
 63 | 		i.Arg = uint32(l2)
 64 | 	}
 65 | 	return l1
 66 | }
 67 | 
// A frag represents a compiled program fragment.
// A frag whose i is 0 is treated as failure by cat and alt.
type frag struct {
	i   uint32    // index of first instruction
	out patchList // where to record end instruction
}
 73 | 
// A compiler holds the program being built by Compile.
type compiler struct {
	p *Prog // program under construction; instructions are appended by inst
}
 77 | 
 78 | // Compile compiles the regexp into a program to be executed.
 79 | // The regexp should have been simplified already (returned from re.Simplify).
 80 | func Compile(re *Regexp) (*Prog, error) {
 81 | 	var c compiler
 82 | 	c.init()
 83 | 	f := c.compile(re)
 84 | 	f.out.patch(c.p, c.inst(InstMatch).i)
 85 | 	c.p.Start = int(f.i)
 86 | 	return c.p, nil
 87 | }
 88 | 
 89 | func (c *compiler) init() {
 90 | 	c.p = new(Prog)
 91 | 	c.p.NumCap = 2 // implicit ( and ) for whole match $0
 92 | 	c.inst(InstFail)
 93 | }
 94 | 
 95 | var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
 96 | var anyRune = []rune{0, unicode.MaxRune}
 97 | 
 98 | func (c *compiler) compile(re *Regexp) frag {
 99 | 	switch re.Op {
100 | 	case OpNoMatch:
101 | 		return c.fail()
102 | 	case OpEmptyMatch:
103 | 		return c.nop()
104 | 	case OpLiteral:
105 | 		if len(re.Rune) == 0 {
106 | 			return c.nop()
107 | 		}
108 | 		var f frag
109 | 		for j := range re.Rune {
110 | 			f1 := c.rune(re.Rune[j:j+1], re.Flags)
111 | 			if j == 0 {
112 | 				f = f1
113 | 			} else {
114 | 				f = c.cat(f, f1)
115 | 			}
116 | 		}
117 | 		return f
118 | 	case OpCharClass:
119 | 		return c.rune(re.Rune, re.Flags)
120 | 	case OpAnyCharNotNL:
121 | 		return c.rune(anyRuneNotNL, 0)
122 | 	case OpAnyChar:
123 | 		return c.rune(anyRune, 0)
124 | 	case OpBeginLine:
125 | 		return c.empty(EmptyBeginLine)
126 | 	case OpEndLine:
127 | 		return c.empty(EmptyEndLine)
128 | 	case OpBeginText:
129 | 		return c.empty(EmptyBeginText)
130 | 	case OpEndText:
131 | 		return c.empty(EmptyEndText)
132 | 	case OpWordBoundary:
133 | 		return c.empty(EmptyWordBoundary)
134 | 	case OpNoWordBoundary:
135 | 		return c.empty(EmptyNoWordBoundary)
136 | 	case OpCapture:
137 | 		bra := c.cap(uint32(re.Cap << 1))
138 | 		sub := c.compile(re.Sub[0])
139 | 		ket := c.cap(uint32(re.Cap<<1 | 1))
140 | 		return c.cat(c.cat(bra, sub), ket)
141 | 	case OpStar:
142 | 		return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
143 | 	case OpPlus:
144 | 		return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
145 | 	case OpQuest:
146 | 		return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
147 | 	case OpConcat:
148 | 		if len(re.Sub) == 0 {
149 | 			return c.nop()
150 | 		}
151 | 		var f frag
152 | 		for i, sub := range re.Sub {
153 | 			if i == 0 {
154 | 				f = c.compile(sub)
155 | 			} else {
156 | 				f = c.cat(f, c.compile(sub))
157 | 			}
158 | 		}
159 | 		return f
160 | 	case OpAlternate:
161 | 		var f frag
162 | 		for _, sub := range re.Sub {
163 | 			f = c.alt(f, c.compile(sub))
164 | 		}
165 | 		return f
166 | 	}
167 | 	panic("regexp: unhandled case in compile")
168 | }
169 | 
170 | func (c *compiler) inst(op InstOp) frag {
171 | 	// TODO: impose length limit
172 | 	f := frag{i: uint32(len(c.p.Inst))}
173 | 	c.p.Inst = append(c.p.Inst, Inst{Op: op})
174 | 	return f
175 | }
176 | 
177 | func (c *compiler) nop() frag {
178 | 	f := c.inst(InstNop)
179 | 	f.out = patchList(f.i << 1)
180 | 	return f
181 | }
182 | 
183 | func (c *compiler) fail() frag {
184 | 	return frag{}
185 | }
186 | 
187 | func (c *compiler) cap(arg uint32) frag {
188 | 	f := c.inst(InstCapture)
189 | 	f.out = patchList(f.i << 1)
190 | 	c.p.Inst[f.i].Arg = arg
191 | 
192 | 	if c.p.NumCap < int(arg)+1 {
193 | 		c.p.NumCap = int(arg) + 1
194 | 	}
195 | 	return f
196 | }
197 | 
198 | func (c *compiler) cat(f1, f2 frag) frag {
199 | 	// concat of failure is failure
200 | 	if f1.i == 0 || f2.i == 0 {
201 | 		return frag{}
202 | 	}
203 | 
204 | 	// TODO: elide nop
205 | 
206 | 	f1.out.patch(c.p, f2.i)
207 | 	return frag{f1.i, f2.out}
208 | }
209 | 
210 | func (c *compiler) alt(f1, f2 frag) frag {
211 | 	// alt of failure is other
212 | 	if f1.i == 0 {
213 | 		return f2
214 | 	}
215 | 	if f2.i == 0 {
216 | 		return f1
217 | 	}
218 | 
219 | 	f := c.inst(InstAlt)
220 | 	i := &c.p.Inst[f.i]
221 | 	i.Out = f1.i
222 | 	i.Arg = f2.i
223 | 	f.out = f1.out.append(c.p, f2.out)
224 | 	return f
225 | }
226 | 
227 | func (c *compiler) quest(f1 frag, nongreedy bool) frag {
228 | 	f := c.inst(InstAlt)
229 | 	i := &c.p.Inst[f.i]
230 | 	if nongreedy {
231 | 		i.Arg = f1.i
232 | 		f.out = patchList(f.i << 1)
233 | 	} else {
234 | 		i.Out = f1.i
235 | 		f.out = patchList(f.i<<1 | 1)
236 | 	}
237 | 	f.out = f.out.append(c.p, f1.out)
238 | 	return f
239 | }
240 | 
241 | func (c *compiler) star(f1 frag, nongreedy bool) frag {
242 | 	f := c.inst(InstAlt)
243 | 	i := &c.p.Inst[f.i]
244 | 	if nongreedy {
245 | 		i.Arg = f1.i
246 | 		f.out = patchList(f.i << 1)
247 | 	} else {
248 | 		i.Out = f1.i
249 | 		f.out = patchList(f.i<<1 | 1)
250 | 	}
251 | 	f1.out.patch(c.p, f.i)
252 | 	return f
253 | }
254 | 
255 | func (c *compiler) plus(f1 frag, nongreedy bool) frag {
256 | 	return frag{f1.i, c.star(f1, nongreedy).out}
257 | }
258 | 
259 | func (c *compiler) empty(op EmptyOp) frag {
260 | 	f := c.inst(InstEmptyWidth)
261 | 	c.p.Inst[f.i].Arg = uint32(op)
262 | 	f.out = patchList(f.i << 1)
263 | 	return f
264 | }
265 | 
266 | func (c *compiler) rune(r []rune, flags Flags) frag {
267 | 	f := c.inst(InstRune)
268 | 	i := &c.p.Inst[f.i]
269 | 	i.Rune = r
270 | 	flags &= FoldCase // only relevant flag is FoldCase
271 | 	if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
272 | 		// and sometimes not even that
273 | 		flags &^= FoldCase
274 | 	}
275 | 	i.Arg = uint32(flags)
276 | 	f.out = patchList(f.i << 1)
277 | 
278 | 	// Special cases for exec machine.
279 | 	switch {
280 | 	case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
281 | 		i.Op = InstRune1
282 | 	case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
283 | 		i.Op = InstRuneAny
284 | 	case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
285 | 		i.Op = InstRuneAnyNotNL
286 | 	}
287 | 
288 | 	return f
289 | }
290 | 


--------------------------------------------------------------------------------
/syntax/doc.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2012 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.
  6 | 
  7 | /*
  8 | Package syntax parses regular expressions into parse trees and compiles
  9 | parse trees into programs. Most clients of regular expressions will use the
 10 | facilities of package regexp (such as Compile and Match) instead of this package.
 11 | 
 12 | Syntax
 13 | 
 14 | The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
 15 | Parts of the syntax can be disabled by passing alternate flags to Parse.
 16 | 
 17 | 
 18 | Single characters:
 19 |   .              any character, possibly including newline (flag s=true)
 20 |   [xyz]          character class
 21 |   [^xyz]         negated character class
 22 |   \d             Perl character class
 23 |   \D             negated Perl character class
 24 |   [[:alpha:]]    ASCII character class
 25 |   [[:^alpha:]]   negated ASCII character class
 26 |   \pN            Unicode character class (one-letter name)
 27 |   \p{Greek}      Unicode character class
 28 |   \PN            negated Unicode character class (one-letter name)
 29 |   \P{Greek}      negated Unicode character class
 30 | 
 31 | Composites:
 32 |   xy             x followed by y
 33 |   x|y            x or y (prefer x)
 34 | 
 35 | Repetitions:
 36 |   x*             zero or more x, prefer more
 37 |   x+             one or more x, prefer more
 38 |   x?             zero or one x, prefer one
 39 |   x{n,m}         n or n+1 or ... or m x, prefer more
 40 |   x{n,}          n or more x, prefer more
 41 |   x{n}           exactly n x
 42 |   x*?            zero or more x, prefer fewer
 43 |   x+?            one or more x, prefer fewer
 44 |   x??            zero or one x, prefer zero
 45 |   x{n,m}?        n or n+1 or ... or m x, prefer fewer
 46 |   x{n,}?         n or more x, prefer fewer
 47 |   x{n}?          exactly n x
 48 | 
 49 | Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n}
 50 | reject forms that create a minimum or maximum repetition count above 1000.
 51 | Unlimited repetitions are not subject to this restriction.
 52 | 
 53 | Grouping:
 54 |   (re)           numbered capturing group (submatch)
 55 |   (?P<name>re)   named & numbered capturing group (submatch)
 56 |   (?:re)         non-capturing group
 57 |   (?flags)       set flags within current group; non-capturing
 58 |   (?flags:re)    set flags during re; non-capturing
 59 | 
 60 |   Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:
 61 | 
 62 |   i              case-insensitive (default false)
 63 |   m              multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false)
 64 |   s              let . match \n (default false)
 65 |   U              ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false)
 66 | 
 67 | Empty strings:
 68 |   ^              at beginning of text or line (flag m=true)
 69 |   $              at end of text (like \z not Perl's \Z) or line (flag m=true)
 70 |   \A             at beginning of text
 71 |   \b             at ASCII word boundary (\w on one side and \W, \A, or \z on the other)
 72 |   \B             not at ASCII word boundary
 73 |   \z             at end of text
 74 | 
 75 | Escape sequences:
 76 |   \a             bell (== \007)
 77 |   \f             form feed (== \014)
 78 |   \t             horizontal tab (== \011)
 79 |   \n             newline (== \012)
 80 |   \r             carriage return (== \015)
 81 |   \v             vertical tab character (== \013)
 82 |   \*             literal *, for any punctuation character *
 83 |   \123           octal character code (up to three digits)
 84 |   \x7F           hex character code (exactly two digits)
 85 |   \x{10FFFF}     hex character code
 86 |   \Q...\E        literal text ... even if ... has punctuation
 87 | 
 88 | Character class elements:
 89 |   x              single character
 90 |   A-Z            character range (inclusive)
 91 |   \d             Perl character class
 92 |   [:foo:]        ASCII character class foo
 93 |   \p{Foo}        Unicode character class Foo
 94 |   \pF            Unicode character class F (one-letter name)
 95 | 
 96 | Named character classes as character class elements:
 97 |   [\d]           digits (== \d)
 98 |   [^\d]          not digits (== \D)
 99 |   [\D]           not digits (== \D)
100 |   [^\D]          not not digits (== \d)
101 |   [[:name:]]     named ASCII class inside character class (== [:name:])
102 |   [^[:name:]]    named ASCII class inside negated character class (== [:^name:])
103 |   [\p{Name}]     named Unicode property inside character class (== \p{Name})
104 |   [^\p{Name}]    named Unicode property inside negated character class (== \P{Name})
105 | 
106 | Perl character classes (all ASCII-only):
107 |   \d             digits (== [0-9])
108 |   \D             not digits (== [^0-9])
109 |   \s             whitespace (== [\t\n\f\r ])
110 |   \S             not whitespace (== [^\t\n\f\r ])
111 |   \w             word characters (== [0-9A-Za-z_])
112 |   \W             not word characters (== [^0-9A-Za-z_])
113 | 
114 | ASCII character classes:
115 |   [[:alnum:]]    alphanumeric (== [0-9A-Za-z])
116 |   [[:alpha:]]    alphabetic (== [A-Za-z])
117 |   [[:ascii:]]    ASCII (== [\x00-\x7F])
118 |   [[:blank:]]    blank (== [\t ])
119 |   [[:cntrl:]]    control (== [\x00-\x1F\x7F])
120 |   [[:digit:]]    digits (== [0-9])
121 |   [[:graph:]]    graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
122 |   [[:lower:]]    lower case (== [a-z])
123 |   [[:print:]]    printable (== [ -~] == [ [:graph:]])
124 |   [[:punct:]]    punctuation (== [!-/:-@[-`{-~])
125 |   [[:space:]]    whitespace (== [\t\n\v\f\r ])
126 |   [[:upper:]]    upper case (== [A-Z])
127 |   [[:word:]]     word characters (== [0-9A-Za-z_])
128 |   [[:xdigit:]]   hex digit (== [0-9A-Fa-f])
129 | 
130 | */
131 | package syntax
132 | 


--------------------------------------------------------------------------------
/syntax/op_string.go:
--------------------------------------------------------------------------------
 1 | // Code generated by "stringer -type Op -trimprefix Op"; DO NOT EDIT.
 2 | 
 3 | package syntax
 4 | 
 5 | import "strconv"
 6 | 
 7 | const (
 8 | 	_Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate"
 9 | 	_Op_name_1 = "opPseudo"
10 | )
11 | 
12 | var (
13 | 	_Op_index_0 = [...]uint8{0, 7, 17, 24, 33, 45, 52, 61, 68, 77, 84, 96, 110, 117, 121, 125, 130, 136, 142, 151}
14 | )
15 | 
16 | func (i Op) String() string {
17 | 	switch {
18 | 	case 1 <= i && i <= 19:
19 | 		i -= 1
20 | 		return _Op_name_0[_Op_index_0[i]:_Op_index_0[i+1]]
21 | 	case i == 128:
22 | 		return _Op_name_1
23 | 	default:
24 | 		return "Op(" + strconv.FormatInt(int64(i), 10) + ")"
25 | 	}
26 | }
27 | 


--------------------------------------------------------------------------------
/syntax/parse_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"fmt"
 10 | 	"testing"
 11 | 	"unicode"
 12 | )
 13 | 
// parseTest pairs a regular expression with the expected dump of its parse
// tree, as used by the parseTests table.
type parseTest struct {
	Regexp string // pattern to parse
	Dump   string // expected dump, e.g. `cat{lit{a}dot{}}` for `a.`
}
 18 | 
 19 | var parseTests = []parseTest{
 20 | 	// Base cases
 21 | 	{`a`, `lit{a}`},
 22 | 	{`a.`, `cat{lit{a}dot{}}`},
 23 | 	{`a.b`, `cat{lit{a}dot{}lit{b}}`},
 24 | 	{`ab`, `str{ab}`},
 25 | 	{`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
 26 | 	{`abc`, `str{abc}`},
 27 | 	{`a|^`, `alt{lit{a}bol{}}`},
 28 | 	{`a|b`, `cc{0x61-0x62}`},
 29 | 	{`(a)`, `cap{lit{a}}`},
 30 | 	{`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
 31 | 	{`a*`, `star{lit{a}}`},
 32 | 	{`a+`, `plus{lit{a}}`},
 33 | 	{`a?`, `que{lit{a}}`},
 34 | 	{`a{2}`, `rep{2,2 lit{a}}`},
 35 | 	{`a{2,3}`, `rep{2,3 lit{a}}`},
 36 | 	{`a{2,}`, `rep{2,-1 lit{a}}`},
 37 | 	{`a*?`, `nstar{lit{a}}`},
 38 | 	{`a+?`, `nplus{lit{a}}`},
 39 | 	{`a??`, `nque{lit{a}}`},
 40 | 	{`a{2}?`, `nrep{2,2 lit{a}}`},
 41 | 	{`a{2,3}?`, `nrep{2,3 lit{a}}`},
 42 | 	{`a{2,}?`, `nrep{2,-1 lit{a}}`},
 43 | 	// Malformed { } are treated as literals.
 44 | 	{`x{1001`, `str{x{1001}`},
 45 | 	{`x{9876543210`, `str{x{9876543210}`},
 46 | 	{`x{9876543210,`, `str{x{9876543210,}`},
 47 | 	{`x{2,1`, `str{x{2,1}`},
 48 | 	{`x{1,9876543210`, `str{x{1,9876543210}`},
 49 | 	{``, `emp{}`},
 50 | 	{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
 51 | 	{`|x|`, `alt{emp{}lit{x}emp{}}`},
 52 | 	{`.`, `dot{}`},
 53 | 	{`^`, `bol{}`},
 54 | 	{`$`, `eol{}`},
 55 | 	{`\|`, `lit{|}`},
 56 | 	{`\(`, `lit{(}`},
 57 | 	{`\)`, `lit{)}`},
 58 | 	{`\*`, `lit{*}`},
 59 | 	{`\+`, `lit{+}`},
 60 | 	{`\?`, `lit{?}`},
 61 | 	{`{`, `lit{{}`},
 62 | 	{`}`, `lit{}}`},
 63 | 	{`\.`, `lit{.}`},
 64 | 	{`\^`, `lit{^}`},
 65 | 	{`\$`, `lit{$}`},
 66 | 	{`\\`, `lit{\}`},
 67 | 	{`[ace]`, `cc{0x61 0x63 0x65}`},
 68 | 	{`[abc]`, `cc{0x61-0x63}`},
 69 | 	{`[a-z]`, `cc{0x61-0x7a}`},
 70 | 	{`[a]`, `lit{a}`},
 71 | 	{`\-`, `lit{-}`},
 72 | 	{`-`, `lit{-}`},
 73 | 	{`\_`, `lit{_}`},
 74 | 	{`abc`, `str{abc}`},
 75 | 	{`abc|def`, `alt{str{abc}str{def}}`},
 76 | 	{`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
 77 | 
 78 | 	// Posix and Perl extensions
 79 | 	{`[[:lower:]]`, `cc{0x61-0x7a}`},
 80 | 	{`[a-z]`, `cc{0x61-0x7a}`},
 81 | 	{`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
 82 | 	{`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
 83 | 	{`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
 84 | 	{`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
 85 | 	{`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
 86 | 	{`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
 87 | 	{`\d`, `cc{0x30-0x39}`},
 88 | 	{`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
 89 | 	{`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
 90 | 	{`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
 91 | 	{`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
 92 | 	{`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
 93 | 	{`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
 94 | 	{`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
 95 | 	{`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
 96 | 	{`\C`, `dot{}`},
 97 | 
 98 | 	// Unicode, negatives, and a double negative.
 99 | 	{`\p{Braille}`, `cc{0x2800-0x28ff}`},
100 | 	{`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
101 | 	{`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
102 | 	{`\P{^Braille}`, `cc{0x2800-0x28ff}`},
103 | 	{`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
104 | 	{`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
105 | 	{`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
106 | 	{`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
107 | 	{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
108 | 	{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
109 | 	{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
110 | 	{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
111 | 	{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
112 | 	{`\p{Any}`, `dot{}`},
113 | 	{`\p{^Any}`, `cc{}`},
114 | 
115 | 	// Hex, octal.
116 | 	{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
117 | 	{`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},
118 | 
119 | 	// More interesting regular expressions.
120 | 	{`a{,2}`, `str{a{,2}}`},
121 | 	{`\.\^\$\\`, `str{.^$\}`},
122 | 	{`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
123 | 	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
124 | 	{`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
125 | 	{`a*{`, `cat{star{lit{a}}lit{{}}`},
126 | 
127 | 	// Test precedences
128 | 	{`(?:ab)*`, `star{str{ab}}`},
129 | 	{`(ab)*`, `star{cap{str{ab}}}`},
130 | 	{`ab|cd`, `alt{str{ab}str{cd}}`},
131 | 	{`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},
132 | 
133 | 	// Test flattening.
134 | 	{`(?:a)`, `lit{a}`},
135 | 	{`(?:ab)(?:cd)`, `str{abcd}`},
136 | 	{`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
137 | 	{`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
138 | 	{`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
139 | 	{`a|.`, `dot{}`},
140 | 	{`.|a`, `dot{}`},
141 | 	{`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
142 | 	{`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},
143 | 
144 | 	// Test Perl quoted literals
145 | 	{`\Q+|*?{[\E`, `str{+|*?{[}`},
146 | 	{`\Q+\E+`, `plus{lit{+}}`},
147 | 	{`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`},
148 | 	{`\Q\\E`, `lit{\}`},
149 | 	{`\Q\\\E`, `str{\\}`},
150 | 
151 | 	// Test Perl \A and \z
152 | 	{`(?m)^`, `bol{}`},
153 | 	{`(?m)$`, `eol{}`},
154 | 	{`(?-m)^`, `bot{}`},
155 | 	{`(?-m)$`, `eot{}`},
156 | 	{`(?m)\A`, `bot{}`},
157 | 	{`(?m)\z`, `eot{\z}`},
158 | 	{`(?-m)\A`, `bot{}`},
159 | 	{`(?-m)\z`, `eot{\z}`},
160 | 
161 | 	// Test named captures
162 | 	{`(?P<name>a)`, `cap{name:lit{a}}`},
163 | 
164 | 	// Case-folded literals
165 | 	{`[Aa]`, `litfold{A}`},
166 | 	{`[\x{100}\x{101}]`, `litfold{Ā}`},
167 | 	{`[Δδ]`, `litfold{Δ}`},
168 | 
169 | 	// Strings
170 | 	{`abcde`, `str{abcde}`},
171 | 	{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
172 | 
173 | 	// Factoring.
174 | 	{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
175 | 	{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`},
176 | 
177 | 	// Bug fixes.
178 | 	{`(?:.)`, `dot{}`},
179 | 	{`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
180 | 	{`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
181 | 	{`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
182 | 	{`(?:A|a)`, `litfold{A}`},
183 | 	{`A|(?:A|a)`, `litfold{A}`},
184 | 	{`(?s).`, `dot{}`},
185 | 	{`(?-s).`, `dnl{}`},
186 | 	{`(?:(?:^).)`, `cat{bol{}dot{}}`},
187 | 	{`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
188 | 
189 | 	// RE2 prefix_tests
190 | 	{`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
191 | 	{`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
192 | 	{`abc|abd|aef|bcx|bcy`,
193 | 		`alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
194 | 			`cat{str{bc}cc{0x78-0x79}}}`},
195 | 	{`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
196 | 	{`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
197 | 	{`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
198 | 	{`.c|.d`, `cat{dot{}cc{0x63-0x64}}`},
199 | 	{`x{2}|x{2}[0-9]`,
200 | 		`cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
201 | 	{`x{2}y|x{2}[0-9]y`,
202 | 		`cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
203 | 	{`a.*?c|a.*?b`,
204 | 		`cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`},
205 | 
206 | 	// Valid repetitions.
207 | 	{`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``},
208 | 	{`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``},
209 | }
210 | 
211 | const testFlags = MatchNL | PerlX | UnicodeGroups
212 | 
213 | func TestParseSimple(t *testing.T) {
214 | 	testParseDump(t, parseTests, testFlags)
215 | }
216 | 
217 | var foldcaseTests = []parseTest{
218 | 	{`AbCdE`, `strfold{ABCDE}`},
219 | 	{`[Aa]`, `litfold{A}`},
220 | 	{`a`, `litfold{A}`},
221 | 
222 | 	// 0x17F is an old English long s (looks like an f) and folds to s.
223 | 	// 0x212A is the Kelvin symbol and folds to k.
224 | 	{`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
225 | 	{`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
226 | 	{`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
227 | }
228 | 
229 | func TestParseFoldCase(t *testing.T) {
230 | 	testParseDump(t, foldcaseTests, FoldCase)
231 | }
232 | 
233 | var literalTests = []parseTest{
234 | 	{"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
235 | }
236 | 
237 | func TestParseLiteral(t *testing.T) {
238 | 	testParseDump(t, literalTests, Literal)
239 | }
240 | 
241 | var matchnlTests = []parseTest{
242 | 	{`.`, `dot{}`},
243 | 	{"\n", "lit{\n}"},
244 | 	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
245 | 	{`[a\n]`, `cc{0xa 0x61}`},
246 | }
247 | 
248 | func TestParseMatchNL(t *testing.T) {
249 | 	testParseDump(t, matchnlTests, MatchNL)
250 | }
251 | 
252 | var nomatchnlTests = []parseTest{
253 | 	{`.`, `dnl{}`},
254 | 	{"\n", "lit{\n}"},
255 | 	{`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
256 | 	{`[a\n]`, `cc{0xa 0x61}`},
257 | }
258 | 
259 | func TestParseNoMatchNL(t *testing.T) {
260 | 	testParseDump(t, nomatchnlTests, 0)
261 | }
262 | 
263 | // Test Parse -> Dump.
264 | func testParseDump(t *testing.T, tests []parseTest, flags Flags) {
265 | 	for _, tt := range tests {
266 | 		re, err := Parse(tt.Regexp, flags)
267 | 		if err != nil {
268 | 			t.Errorf("Parse(%#q): %v", tt.Regexp, err)
269 | 			continue
270 | 		}
271 | 		if tt.Dump == "" {
272 | 			// It parsed. That's all we care about.
273 | 			continue
274 | 		}
275 | 		d := dump(re)
276 | 		if d != tt.Dump {
277 | 			t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
278 | 		}
279 | 	}
280 | }
281 | 
282 | // dump prints a string representation of the regexp showing
283 | // the structure explicitly.
284 | func dump(re *Regexp) string {
285 | 	var b bytes.Buffer
286 | 	dumpRegexp(&b, re)
287 | 	return b.String()
288 | }
289 | 
290 | var opNames = []string{
291 | 	OpNoMatch:        "no",
292 | 	OpEmptyMatch:     "emp",
293 | 	OpLiteral:        "lit",
294 | 	OpCharClass:      "cc",
295 | 	OpAnyCharNotNL:   "dnl",
296 | 	OpAnyChar:        "dot",
297 | 	OpBeginLine:      "bol",
298 | 	OpEndLine:        "eol",
299 | 	OpBeginText:      "bot",
300 | 	OpEndText:        "eot",
301 | 	OpWordBoundary:   "wb",
302 | 	OpNoWordBoundary: "nwb",
303 | 	OpCapture:        "cap",
304 | 	OpStar:           "star",
305 | 	OpPlus:           "plus",
306 | 	OpQuest:          "que",
307 | 	OpRepeat:         "rep",
308 | 	OpConcat:         "cat",
309 | 	OpAlternate:      "alt",
310 | }
311 | 
312 | // dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
313 | // It is used during testing to distinguish between parses that might print
314 | // the same using re's String method.
315 | func dumpRegexp(b *bytes.Buffer, re *Regexp) {
316 | 	if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
317 | 		fmt.Fprintf(b, "op%d", re.Op)
318 | 	} else {
319 | 		switch re.Op {
320 | 		default:
321 | 			b.WriteString(opNames[re.Op])
322 | 		case OpStar, OpPlus, OpQuest, OpRepeat:
323 | 			if re.Flags&NonGreedy != 0 {
324 | 				b.WriteByte('n')
325 | 			}
326 | 			b.WriteString(opNames[re.Op])
327 | 		case OpLiteral:
328 | 			if len(re.Rune) > 1 {
329 | 				b.WriteString("str")
330 | 			} else {
331 | 				b.WriteString("lit")
332 | 			}
333 | 			if re.Flags&FoldCase != 0 {
334 | 				for _, r := range re.Rune {
335 | 					if unicode.SimpleFold(r) != r {
336 | 						b.WriteString("fold")
337 | 						break
338 | 					}
339 | 				}
340 | 			}
341 | 		}
342 | 	}
343 | 	b.WriteByte('{')
344 | 	switch re.Op {
345 | 	case OpEndText:
346 | 		if re.Flags&WasDollar == 0 {
347 | 			b.WriteString(`\z`)
348 | 		}
349 | 	case OpLiteral:
350 | 		for _, r := range re.Rune {
351 | 			b.WriteRune(r)
352 | 		}
353 | 	case OpConcat, OpAlternate:
354 | 		for _, sub := range re.Sub {
355 | 			dumpRegexp(b, sub)
356 | 		}
357 | 	case OpStar, OpPlus, OpQuest:
358 | 		dumpRegexp(b, re.Sub[0])
359 | 	case OpRepeat:
360 | 		fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
361 | 		dumpRegexp(b, re.Sub[0])
362 | 	case OpCapture:
363 | 		if re.Name != "" {
364 | 			b.WriteString(re.Name)
365 | 			b.WriteByte(':')
366 | 		}
367 | 		dumpRegexp(b, re.Sub[0])
368 | 	case OpCharClass:
369 | 		sep := ""
370 | 		for i := 0; i < len(re.Rune); i += 2 {
371 | 			b.WriteString(sep)
372 | 			sep = " "
373 | 			lo, hi := re.Rune[i], re.Rune[i+1]
374 | 			if lo == hi {
375 | 				fmt.Fprintf(b, "%#x", lo)
376 | 			} else {
377 | 				fmt.Fprintf(b, "%#x-%#x", lo, hi)
378 | 			}
379 | 		}
380 | 	}
381 | 	b.WriteByte('}')
382 | }
383 | 
384 | func mkCharClass(f func(rune) bool) string {
385 | 	re := &Regexp{Op: OpCharClass}
386 | 	lo := rune(-1)
387 | 	for i := rune(0); i <= unicode.MaxRune; i++ {
388 | 		if f(i) {
389 | 			if lo < 0 {
390 | 				lo = i
391 | 			}
392 | 		} else {
393 | 			if lo >= 0 {
394 | 				re.Rune = append(re.Rune, lo, i-1)
395 | 				lo = -1
396 | 			}
397 | 		}
398 | 	}
399 | 	if lo >= 0 {
400 | 		re.Rune = append(re.Rune, lo, unicode.MaxRune)
401 | 	}
402 | 	return dump(re)
403 | }
404 | 
// isUpperFold reports whether r itself, or any rune reachable from r by
// repeatedly applying unicode.SimpleFold, is an upper case letter.
func isUpperFold(r rune) bool {
	if unicode.IsUpper(r) {
		return true
	}
	// Walk the folding orbit until it wraps back around to r.
	for alt := unicode.SimpleFold(r); alt != r; alt = unicode.SimpleFold(alt) {
		if unicode.IsUpper(alt) {
			return true
		}
	}
	return false
}
418 | 
419 | func TestFoldConstants(t *testing.T) {
420 | 	last := rune(-1)
421 | 	for i := rune(0); i <= unicode.MaxRune; i++ {
422 | 		if unicode.SimpleFold(i) == i {
423 | 			continue
424 | 		}
425 | 		if last == -1 && minFold != i {
426 | 			t.Errorf("minFold=%#U should be %#U", minFold, i)
427 | 		}
428 | 		last = i
429 | 	}
430 | 	if maxFold != last {
431 | 		t.Errorf("maxFold=%#U should be %#U", maxFold, last)
432 | 	}
433 | }
434 | 
435 | func TestAppendRangeCollapse(t *testing.T) {
436 | 	// AppendRange should collapse each of the new ranges
437 | 	// into the earlier ones (it looks back two ranges), so that
438 | 	// the slice never grows very large.
439 | 	// Note that we are not calling cleanClass.
440 | 	var r []rune
441 | 	for i := rune('A'); i <= 'Z'; i++ {
442 | 		r = appendRange(r, i, i)
443 | 		r = appendRange(r, i+'a'-'A', i+'a'-'A')
444 | 	}
445 | 	if string(r) != "AZaz" {
446 | 		t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
447 | 	}
448 | }
449 | 
450 | var invalidRegexps = []string{
451 | 	`(`,
452 | 	`)`,
453 | 	`(a`,
454 | 	`a)`,
455 | 	`(a))`,
456 | 	`(a|b|`,
457 | 	`a|b|)`,
458 | 	`(a|b|))`,
459 | 	`(a|b`,
460 | 	`a|b)`,
461 | 	`(a|b))`,
462 | 	`[a-z`,
463 | 	`([a-z)`,
464 | 	`[a-z)`,
465 | 	`([a-z]))`,
466 | 	`x{1001}`,
467 | 	`x{9876543210}`,
468 | 	`x{2,1}`,
469 | 	`x{1,9876543210}`,
470 | 	"\xff", // Invalid UTF-8
471 | 	"[\xff]",
472 | 	"[\\\xff]",
473 | 	"\\\xff",
474 | 	`(?P<name>a`,
475 | 	`(?P<name>`,
476 | 	`(?P<name`,
477 | 	`(?P<x y>a)`,
478 | 	`(?P<>a)`,
479 | 	`[a-Z]`,
480 | 	`(?i)[a-Z]`,
481 | 	`a{100000}`,
482 | 	`a{100000,}`,
483 | 	"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
484 | 	`\Q\E*`,
485 | }
486 | 
487 | var onlyPerl = []string{
488 | 	`[a-b-c]`,
489 | 	`\Qabc\E`,
490 | 	`\Q*+?{[\E`,
491 | 	`\Q\\E`,
492 | 	`\Q\\\E`,
493 | 	`\Q\\\\E`,
494 | 	`\Q\\\\\E`,
495 | 	`(?:a)`,
496 | 	`(?P<name>a)`,
497 | }
498 | 
499 | var onlyPOSIX = []string{
500 | 	"a++",
501 | 	"a**",
502 | 	"a?*",
503 | 	"a+*",
504 | 	"a{1}*",
505 | 	".{1}{2}.{3}",
506 | }
507 | 
508 | func TestParseInvalidRegexps(t *testing.T) {
509 | 	for _, regexp := range invalidRegexps {
510 | 		if re, err := Parse(regexp, Perl); err == nil {
511 | 			t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
512 | 		}
513 | 		if re, err := Parse(regexp, POSIX); err == nil {
514 | 			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
515 | 		}
516 | 	}
517 | 	for _, regexp := range onlyPerl {
518 | 		if _, err := Parse(regexp, Perl); err != nil {
519 | 			t.Errorf("Parse(%#q, Perl): %v", regexp, err)
520 | 		}
521 | 		if re, err := Parse(regexp, POSIX); err == nil {
522 | 			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
523 | 		}
524 | 	}
525 | 	for _, regexp := range onlyPOSIX {
526 | 		if re, err := Parse(regexp, Perl); err == nil {
527 | 			t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
528 | 		}
529 | 		if _, err := Parse(regexp, POSIX); err != nil {
530 | 			t.Errorf("Parse(%#q, POSIX): %v", regexp, err)
531 | 		}
532 | 	}
533 | }
534 | 
535 | func TestToStringEquivalentParse(t *testing.T) {
536 | 	for _, tt := range parseTests {
537 | 		re, err := Parse(tt.Regexp, testFlags)
538 | 		if err != nil {
539 | 			t.Errorf("Parse(%#q): %v", tt.Regexp, err)
540 | 			continue
541 | 		}
542 | 		if tt.Dump == "" {
543 | 			// It parsed. That's all we care about.
544 | 			continue
545 | 		}
546 | 		d := dump(re)
547 | 		if d != tt.Dump {
548 | 			t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
549 | 			continue
550 | 		}
551 | 
552 | 		s := re.String()
553 | 		if s != tt.Regexp {
554 | 			// If ToString didn't return the original regexp,
555 | 			// it must have found one with fewer parens.
556 | 			// Unfortunately we can't check the length here, because
557 | 			// ToString produces "\\{" for a literal brace,
558 | 			// but "{" is a shorter equivalent in some contexts.
559 | 			nre, err := Parse(s, testFlags)
560 | 			if err != nil {
561 | 				t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err)
562 | 				continue
563 | 			}
564 | 			nd := dump(nre)
565 | 			if d != nd {
566 | 				t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd)
567 | 			}
568 | 
569 | 			ns := nre.String()
570 | 			if s != ns {
571 | 				t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns)
572 | 			}
573 | 		}
574 | 	}
575 | }
576 | 


--------------------------------------------------------------------------------
/syntax/perl_groups.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2013 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // GENERATED BY make_perl_groups.pl; DO NOT EDIT.
  6 | // make_perl_groups.pl >perl_groups.go
  7 | 
  8 | package syntax
  9 | 
 10 | var code1 = []rune{ /* \d */
 11 | 	0x30, 0x39,
 12 | }
 13 | 
 14 | var code2 = []rune{ /* \s */
 15 | 	0x9, 0xa,
 16 | 	0xc, 0xd,
 17 | 	0x20, 0x20,
 18 | }
 19 | 
 20 | var code3 = []rune{ /* \w */
 21 | 	0x30, 0x39,
 22 | 	0x41, 0x5a,
 23 | 	0x5f, 0x5f,
 24 | 	0x61, 0x7a,
 25 | }
 26 | 
 27 | var perlGroup = map[string]charGroup{
 28 | 	`\d`: {+1, code1},
 29 | 	`\D`: {-1, code1},
 30 | 	`\s`: {+1, code2},
 31 | 	`\S`: {-1, code2},
 32 | 	`\w`: {+1, code3},
 33 | 	`\W`: {-1, code3},
 34 | }
 35 | var code4 = []rune{ /* [:alnum:] */
 36 | 	0x30, 0x39,
 37 | 	0x41, 0x5a,
 38 | 	0x61, 0x7a,
 39 | }
 40 | 
 41 | var code5 = []rune{ /* [:alpha:] */
 42 | 	0x41, 0x5a,
 43 | 	0x61, 0x7a,
 44 | }
 45 | 
 46 | var code6 = []rune{ /* [:ascii:] */
 47 | 	0x0, 0x7f,
 48 | }
 49 | 
 50 | var code7 = []rune{ /* [:blank:] */
 51 | 	0x9, 0x9,
 52 | 	0x20, 0x20,
 53 | }
 54 | 
 55 | var code8 = []rune{ /* [:cntrl:] */
 56 | 	0x0, 0x1f,
 57 | 	0x7f, 0x7f,
 58 | }
 59 | 
 60 | var code9 = []rune{ /* [:digit:] */
 61 | 	0x30, 0x39,
 62 | }
 63 | 
 64 | var code10 = []rune{ /* [:graph:] */
 65 | 	0x21, 0x7e,
 66 | }
 67 | 
 68 | var code11 = []rune{ /* [:lower:] */
 69 | 	0x61, 0x7a,
 70 | }
 71 | 
 72 | var code12 = []rune{ /* [:print:] */
 73 | 	0x20, 0x7e,
 74 | }
 75 | 
 76 | var code13 = []rune{ /* [:punct:] */
 77 | 	0x21, 0x2f,
 78 | 	0x3a, 0x40,
 79 | 	0x5b, 0x60,
 80 | 	0x7b, 0x7e,
 81 | }
 82 | 
 83 | var code14 = []rune{ /* [:space:] */
 84 | 	0x9, 0xd,
 85 | 	0x20, 0x20,
 86 | }
 87 | 
 88 | var code15 = []rune{ /* [:upper:] */
 89 | 	0x41, 0x5a,
 90 | }
 91 | 
 92 | var code16 = []rune{ /* [:word:] */
 93 | 	0x30, 0x39,
 94 | 	0x41, 0x5a,
 95 | 	0x5f, 0x5f,
 96 | 	0x61, 0x7a,
 97 | }
 98 | 
 99 | var code17 = []rune{ /* [:xdigit:] */
100 | 	0x30, 0x39,
101 | 	0x41, 0x46,
102 | 	0x61, 0x66,
103 | }
104 | 
105 | var posixGroup = map[string]charGroup{
106 | 	`[:alnum:]`:   {+1, code4},
107 | 	`[:^alnum:]`:  {-1, code4},
108 | 	`[:alpha:]`:   {+1, code5},
109 | 	`[:^alpha:]`:  {-1, code5},
110 | 	`[:ascii:]`:   {+1, code6},
111 | 	`[:^ascii:]`:  {-1, code6},
112 | 	`[:blank:]`:   {+1, code7},
113 | 	`[:^blank:]`:  {-1, code7},
114 | 	`[:cntrl:]`:   {+1, code8},
115 | 	`[:^cntrl:]`:  {-1, code8},
116 | 	`[:digit:]`:   {+1, code9},
117 | 	`[:^digit:]`:  {-1, code9},
118 | 	`[:graph:]`:   {+1, code10},
119 | 	`[:^graph:]`:  {-1, code10},
120 | 	`[:lower:]`:   {+1, code11},
121 | 	`[:^lower:]`:  {-1, code11},
122 | 	`[:print:]`:   {+1, code12},
123 | 	`[:^print:]`:  {-1, code12},
124 | 	`[:punct:]`:   {+1, code13},
125 | 	`[:^punct:]`:  {-1, code13},
126 | 	`[:space:]`:   {+1, code14},
127 | 	`[:^space:]`:  {-1, code14},
128 | 	`[:upper:]`:   {+1, code15},
129 | 	`[:^upper:]`:  {-1, code15},
130 | 	`[:word:]`:    {+1, code16},
131 | 	`[:^word:]`:   {-1, code16},
132 | 	`[:xdigit:]`:  {+1, code17},
133 | 	`[:^xdigit:]`: {-1, code17},
134 | }
135 | 


--------------------------------------------------------------------------------
/syntax/prog.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"strconv"
 10 | 	"unicode"
 11 | )
 12 | 
 13 | // Compiled program.
 14 | // May not belong in this package, but convenient for now.
 15 | 
 16 | // A Prog is a compiled regular expression program.
 17 | type Prog struct {
 18 | 	Inst   []Inst
 19 | 	Start  int // index of start instruction
 20 | 	NumCap int // number of InstCapture insts in re
 21 | }
 22 | 
 23 | // An InstOp is an instruction opcode.
 24 | type InstOp uint8
 25 | 
 26 | const (
 27 | 	InstAlt InstOp = iota
 28 | 	InstAltMatch
 29 | 	InstCapture
 30 | 	InstEmptyWidth
 31 | 	InstMatch
 32 | 	InstFail
 33 | 	InstNop
 34 | 	InstRune
 35 | 	InstRune1
 36 | 	InstRuneAny
 37 | 	InstRuneAnyNotNL
 38 | )
 39 | 
 40 | var instOpNames = []string{
 41 | 	"InstAlt",
 42 | 	"InstAltMatch",
 43 | 	"InstCapture",
 44 | 	"InstEmptyWidth",
 45 | 	"InstMatch",
 46 | 	"InstFail",
 47 | 	"InstNop",
 48 | 	"InstRune",
 49 | 	"InstRune1",
 50 | 	"InstRuneAny",
 51 | 	"InstRuneAnyNotNL",
 52 | }
 53 | 
 54 | func (i InstOp) String() string {
 55 | 	if uint(i) >= uint(len(instOpNames)) {
 56 | 		return ""
 57 | 	}
 58 | 	return instOpNames[i]
 59 | }
 60 | 
 61 | // An EmptyOp specifies a kind or mixture of zero-width assertions.
 62 | type EmptyOp uint8
 63 | 
 64 | const (
 65 | 	EmptyBeginLine EmptyOp = 1 << iota
 66 | 	EmptyEndLine
 67 | 	EmptyBeginText
 68 | 	EmptyEndText
 69 | 	EmptyWordBoundary
 70 | 	EmptyNoWordBoundary
 71 | )
 72 | 
 73 | // EmptyOpContext returns the zero-width assertions
 74 | // satisfied at the position between the runes r1 and r2.
 75 | // Passing r1 == -1 indicates that the position is
 76 | // at the beginning of the text.
 77 | // Passing r2 == -1 indicates that the position is
 78 | // at the end of the text.
 79 | func EmptyOpContext(r1, r2 rune) EmptyOp {
 80 | 	var op EmptyOp = EmptyNoWordBoundary
 81 | 	var boundary byte
 82 | 	switch {
 83 | 	case IsWordChar(r1):
 84 | 		boundary = 1
 85 | 	case r1 == '\n':
 86 | 		op |= EmptyBeginLine
 87 | 	case r1 < 0:
 88 | 		op |= EmptyBeginText | EmptyBeginLine
 89 | 	}
 90 | 	switch {
 91 | 	case IsWordChar(r2):
 92 | 		boundary ^= 1
 93 | 	case r2 == '\n':
 94 | 		op |= EmptyEndLine
 95 | 	case r2 < 0:
 96 | 		op |= EmptyEndText | EmptyEndLine
 97 | 	}
 98 | 	if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
 99 | 		op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
100 | 	}
101 | 	return op
102 | }
103 | 
// IsWordChar reports whether r is considered a ``word character''
// during the evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z',
		'A' <= r && r <= 'Z',
		'0' <= r && r <= '9':
		return true
	}
	return r == '_'
}
110 | 
111 | // An Inst is a single instruction in a regular expression program.
112 | type Inst struct {
113 | 	Op   InstOp
114 | 	Out  uint32 // all but InstMatch, InstFail
115 | 	Arg  uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
116 | 	Rune []rune
117 | }
118 | 
119 | func (p *Prog) String() string {
120 | 	var b bytes.Buffer
121 | 	dumpProg(&b, p)
122 | 	return b.String()
123 | }
124 | 
125 | // skipNop follows any no-op or capturing instructions.
126 | func (p *Prog) skipNop(pc uint32) *Inst {
127 | 	i := &p.Inst[pc]
128 | 	for i.Op == InstNop || i.Op == InstCapture {
129 | 		i = &p.Inst[i.Out]
130 | 	}
131 | 	return i
132 | }
133 | 
134 | // op returns i.Op but merges all the Rune special cases into InstRune
135 | func (i *Inst) op() InstOp {
136 | 	op := i.Op
137 | 	switch op {
138 | 	case InstRune1, InstRuneAny, InstRuneAnyNotNL:
139 | 		op = InstRune
140 | 	}
141 | 	return op
142 | }
143 | 
144 | // Prefix returns a literal string that all matches for the
145 | // regexp must start with. Complete is true if the prefix
146 | // is the entire match.
147 | func (p *Prog) Prefix() (prefix string, complete bool) {
148 | 	i := p.skipNop(uint32(p.Start))
149 | 
150 | 	// Avoid allocation of buffer if prefix is empty.
151 | 	if i.op() != InstRune || len(i.Rune) != 1 {
152 | 		return "", i.Op == InstMatch
153 | 	}
154 | 
155 | 	// Have prefix; gather characters.
156 | 	var buf bytes.Buffer
157 | 	for i.op() == InstRune && len(i.Rune) == 1 && i.Rune[0] <= 0xFF && Flags(i.Arg)&FoldCase == 0 {
158 | 		buf.WriteByte(byte(i.Rune[0]))
159 | 		i = p.skipNop(i.Out)
160 | 	}
161 | 	return buf.String(), i.Op == InstMatch
162 | }
163 | 
164 | // StartCond returns the leading empty-width conditions that must
165 | // be true in any match. It returns ^EmptyOp(0) if no matches are possible.
166 | func (p *Prog) StartCond() EmptyOp {
167 | 	var flag EmptyOp
168 | 	pc := uint32(p.Start)
169 | 	i := &p.Inst[pc]
170 | Loop:
171 | 	for {
172 | 		switch i.Op {
173 | 		case InstEmptyWidth:
174 | 			flag |= EmptyOp(i.Arg)
175 | 		case InstFail:
176 | 			return ^EmptyOp(0)
177 | 		case InstCapture, InstNop:
178 | 			// skip
179 | 		default:
180 | 			break Loop
181 | 		}
182 | 		pc = i.Out
183 | 		i = &p.Inst[pc]
184 | 	}
185 | 	return flag
186 | }
187 | 
188 | const noMatch = -1
189 | 
190 | // MatchRune reports whether the instruction matches (and consumes) r.
191 | // It should only be called when i.Op == InstRune.
192 | func (i *Inst) MatchRune(r rune) bool {
193 | 	return i.MatchRunePos(r) != noMatch
194 | }
195 | 
196 | // MatchRunePos checks whether the instruction matches (and consumes) r.
197 | // If so, MatchRunePos returns the index of the matching rune pair
198 | // (or, when len(i.Rune) == 1, rune singleton).
199 | // If not, MatchRunePos returns -1.
200 | // MatchRunePos should only be called when i.Op == InstRune.
201 | func (i *Inst) MatchRunePos(r rune) int {
202 | 	rune := i.Rune
203 | 
204 | 	switch len(rune) {
205 | 	case 0:
206 | 		return noMatch
207 | 
208 | 	case 1:
209 | 		// Special case: single-rune slice is from literal string, not char class.
210 | 		r0 := rune[0]
211 | 		if r == r0 {
212 | 			return 0
213 | 		}
214 | 		if Flags(i.Arg)&FoldCase != 0 {
215 | 			for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
216 | 				if r == r1 {
217 | 					return 0
218 | 				}
219 | 			}
220 | 		}
221 | 		return noMatch
222 | 
223 | 	case 2:
224 | 		if r >= rune[0] && r <= rune[1] {
225 | 			return 0
226 | 		}
227 | 		return noMatch
228 | 
229 | 	case 4, 6, 8:
230 | 		// Linear search for a few pairs.
231 | 		// Should handle ASCII well.
232 | 		for j := 0; j < len(rune); j += 2 {
233 | 			if r < rune[j] {
234 | 				return noMatch
235 | 			}
236 | 			if r <= rune[j+1] {
237 | 				return j / 2
238 | 			}
239 | 		}
240 | 		return noMatch
241 | 	}
242 | 
243 | 	// Otherwise binary search.
244 | 	lo := 0
245 | 	hi := len(rune) / 2
246 | 	for lo < hi {
247 | 		m := lo + (hi-lo)/2
248 | 		if c := rune[2*m]; c <= r {
249 | 			if r <= rune[2*m+1] {
250 | 				return m
251 | 			}
252 | 			lo = m + 1
253 | 		} else {
254 | 			hi = m
255 | 		}
256 | 	}
257 | 	return noMatch
258 | }
259 | 
260 | // MatchEmptyWidth reports whether the instruction matches
261 | // an empty string between the runes before and after.
262 | // It should only be called when i.Op == InstEmptyWidth.
263 | func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
264 | 	switch EmptyOp(i.Arg) {
265 | 	case EmptyBeginLine:
266 | 		return before == '\n' || before == -1
267 | 	case EmptyEndLine:
268 | 		return after == '\n' || after == -1
269 | 	case EmptyBeginText:
270 | 		return before == -1
271 | 	case EmptyEndText:
272 | 		return after == -1
273 | 	case EmptyWordBoundary:
274 | 		return IsWordChar(before) != IsWordChar(after)
275 | 	case EmptyNoWordBoundary:
276 | 		return IsWordChar(before) == IsWordChar(after)
277 | 	}
278 | 	panic("unknown empty width arg")
279 | }
280 | 
281 | func (i *Inst) String() string {
282 | 	var b bytes.Buffer
283 | 	dumpInst(&b, i)
284 | 	return b.String()
285 | }
286 | 
// bw appends each of args to b, in order.
func bw(b *bytes.Buffer, args ...string) {
	for idx := range args {
		b.WriteString(args[idx])
	}
}
292 | 
293 | func dumpProg(b *bytes.Buffer, p *Prog) {
294 | 	for j := range p.Inst {
295 | 		i := &p.Inst[j]
296 | 		pc := strconv.Itoa(j)
297 | 		if len(pc) < 3 {
298 | 			b.WriteString("   "[len(pc):])
299 | 		}
300 | 		if j == p.Start {
301 | 			pc += "*"
302 | 		}
303 | 		bw(b, pc, "\t")
304 | 		dumpInst(b, i)
305 | 		bw(b, "\n")
306 | 	}
307 | }
308 | 
// u32 formats i as a base-10 string.
func u32(i uint32) string {
	const base = 10
	wide := uint64(i)
	return strconv.FormatUint(wide, base)
}
312 | 
313 | func dumpInst(b *bytes.Buffer, i *Inst) {
314 | 	switch i.Op {
315 | 	case InstAlt:
316 | 		bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
317 | 	case InstAltMatch:
318 | 		bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
319 | 	case InstCapture:
320 | 		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
321 | 	case InstEmptyWidth:
322 | 		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
323 | 	case InstMatch:
324 | 		bw(b, "match")
325 | 	case InstFail:
326 | 		bw(b, "fail")
327 | 	case InstNop:
328 | 		bw(b, "nop -> ", u32(i.Out))
329 | 	case InstRune:
330 | 		if i.Rune == nil {
331 | 			// shouldn't happen
332 | 			bw(b, "rune <nil>")
333 | 		}
334 | 		bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
335 | 		if Flags(i.Arg)&FoldCase != 0 {
336 | 			bw(b, "/i")
337 | 		}
338 | 		bw(b, " -> ", u32(i.Out))
339 | 	case InstRune1:
340 | 		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
341 | 	case InstRuneAny:
342 | 		bw(b, "any -> ", u32(i.Out))
343 | 	case InstRuneAnyNotNL:
344 | 		bw(b, "anynotnl -> ", u32(i.Out))
345 | 	}
346 | }
347 | 


--------------------------------------------------------------------------------
/syntax/prog_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | import "testing"
  8 | 
// compileTests pairs a regular expression source string (Regexp) with the
// exact text expected from the String method of its compiled program (Prog).
// TestCompile compares the two strings for equality, so the whitespace
// inside each Prog literal (leading spaces, tabs, trailing newline) is
// significant and must not be reformatted.
var compileTests = []struct {
	Regexp string
	Prog   string
}{
	{"a", `  0	fail
  1*	rune1 "a" -> 2
  2	match
`},
	{"[A-M][n-z]", `  0	fail
  1*	rune "AM" -> 2
  2	rune "nz" -> 3
  3	match
`},
	{"", `  0	fail
  1*	nop -> 2
  2	match
`},
	{"a?", `  0	fail
  1	rune1 "a" -> 3
  2*	alt -> 1, 3
  3	match
`},
	{"a??", `  0	fail
  1	rune1 "a" -> 3
  2*	alt -> 3, 1
  3	match
`},
	{"a+", `  0	fail
  1*	rune1 "a" -> 2
  2	alt -> 1, 3
  3	match
`},
	{"a+?", `  0	fail
  1*	rune1 "a" -> 2
  2	alt -> 3, 1
  3	match
`},
	{"a*", `  0	fail
  1	rune1 "a" -> 2
  2*	alt -> 1, 3
  3	match
`},
	{"a*?", `  0	fail
  1	rune1 "a" -> 2
  2*	alt -> 3, 1
  3	match
`},
	{"a+b+", `  0	fail
  1*	rune1 "a" -> 2
  2	alt -> 1, 3
  3	rune1 "b" -> 4
  4	alt -> 3, 5
  5	match
`},
	{"(a+)(b+)", `  0	fail
  1*	cap 2 -> 2
  2	rune1 "a" -> 3
  3	alt -> 2, 4
  4	cap 3 -> 5
  5	cap 4 -> 6
  6	rune1 "b" -> 7
  7	alt -> 6, 8
  8	cap 5 -> 9
  9	match
`},
	{"a+|b+", `  0	fail
  1	rune1 "a" -> 2
  2	alt -> 1, 6
  3	rune1 "b" -> 4
  4	alt -> 3, 6
  5*	alt -> 1, 3
  6	match
`},
	{"A[Aa]", `  0	fail
  1*	rune1 "A" -> 2
  2	rune "A"/i -> 3
  3	match
`},
	{"(?:(?:^).)", `  0	fail
  1*	empty 4 -> 2
  2	anynotnl -> 3
  3	match
`},
}
 93 | 
 94 | func TestCompile(t *testing.T) {
 95 | 	for _, tt := range compileTests {
 96 | 		re, _ := Parse(tt.Regexp, Perl)
 97 | 		p, _ := Compile(re)
 98 | 		s := p.String()
 99 | 		if s != tt.Prog {
100 | 			t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog)
101 | 		}
102 | 	}
103 | }
104 | 
105 | func BenchmarkEmptyOpContext(b *testing.B) {
106 | 	for i := 0; i < b.N; i++ {
107 | 		var r1 rune = -1
108 | 		for _, r2 := range "foo, bar, baz\nsome input text.\n" {
109 | 			EmptyOpContext(r1, r2)
110 | 			r1 = r2
111 | 		}
112 | 		EmptyOpContext(r1, -1)
113 | 	}
114 | }
115 | 


--------------------------------------------------------------------------------
/syntax/regexp.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | // Note to implementers:
  8 | // In this package, re is always a *Regexp and r is always a rune.
  9 | 
 10 | import (
 11 | 	"bytes"
 12 | 	"strconv"
 13 | 	"strings"
 14 | 	"unicode"
 15 | )
 16 | 
// A Regexp is a node in a regular expression syntax tree.
//
// Op selects the kind of node and determines which of the other fields
// are meaningful, as noted on each field. Flags holds per-node flag bits;
// the code in this file consults FoldCase, NonGreedy and WasDollar.
// Sub0 and Rune0 are small inline arrays that Sub and Rune may be sliced
// from (for example append(re.Sub0[:0], sub)), intended to let short
// nodes avoid a separate slice allocation.
type Regexp struct {
	Op       Op // operator
	Flags    Flags
	Sub      []*Regexp  // subexpressions, if any
	Sub0     [1]*Regexp // storage for short Sub
	Rune     []rune     // matched runes, for OpLiteral, OpCharClass
	Rune0    [2]rune    // storage for short Rune
	Min, Max int        // min, max for OpRepeat
	Cap      int        // capturing index, for OpCapture
	Name     string     // capturing name, for OpCapture
}
 29 | 
// The directive below runs stringer over the Op type; -trimprefix Op asks
// it to drop the "Op" prefix from the generated names.

//go:generate stringer -type Op -trimprefix Op

// An Op is a single regular expression operator.
// The named operators start at 1, so the zero Op is not a valid operator.
type Op uint8

// Operators are listed in precedence order, tightest binding to weakest.
// Character class operators are listed simplest to most complex
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).

const (
	OpNoMatch        Op = 1 + iota // matches no strings
	OpEmptyMatch                   // matches empty string
	OpLiteral                      // matches Runes sequence
	OpCharClass                    // matches Runes interpreted as range pair list
	OpAnyCharNotNL                 // matches any character except newline
	OpAnyChar                      // matches any character
	OpBeginLine                    // matches empty string at beginning of line
	OpEndLine                      // matches empty string at end of line
	OpBeginText                    // matches empty string at beginning of text
	OpEndText                      // matches empty string at end of text
	OpWordBoundary                 // matches word boundary `\b`
	OpNoWordBoundary               // matches word non-boundary `\B`
	OpCapture                      // capturing subexpression with index Cap, optional name Name
	OpStar                         // matches Sub[0] zero or more times
	OpPlus                         // matches Sub[0] one or more times
	OpQuest                        // matches Sub[0] zero or one times
	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
	OpConcat                       // matches concatenation of Subs
	OpAlternate                    // matches alternation of Subs
)

// NOTE(review): the pseudo-ops themselves are not declared in this file;
// presumably they live with the parser — confirm before relying on it.
const opPseudo Op = 128 // where pseudo-ops start
 62 | 
 63 | // Equal reports whether x and y have identical structure.
 64 | func (x *Regexp) Equal(y *Regexp) bool {
 65 | 	if x == nil || y == nil {
 66 | 		return x == y
 67 | 	}
 68 | 	if x.Op != y.Op {
 69 | 		return false
 70 | 	}
 71 | 	switch x.Op {
 72 | 	case OpEndText:
 73 | 		// The parse flags remember whether this is \z or \Z.
 74 | 		if x.Flags&WasDollar != y.Flags&WasDollar {
 75 | 			return false
 76 | 		}
 77 | 
 78 | 	case OpLiteral, OpCharClass:
 79 | 		if len(x.Rune) != len(y.Rune) {
 80 | 			return false
 81 | 		}
 82 | 		for i, r := range x.Rune {
 83 | 			if r != y.Rune[i] {
 84 | 				return false
 85 | 			}
 86 | 		}
 87 | 
 88 | 	case OpAlternate, OpConcat:
 89 | 		if len(x.Sub) != len(y.Sub) {
 90 | 			return false
 91 | 		}
 92 | 		for i, sub := range x.Sub {
 93 | 			if !sub.Equal(y.Sub[i]) {
 94 | 				return false
 95 | 			}
 96 | 		}
 97 | 
 98 | 	case OpStar, OpPlus, OpQuest:
 99 | 		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
100 | 			return false
101 | 		}
102 | 
103 | 	case OpRepeat:
104 | 		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
105 | 			return false
106 | 		}
107 | 
108 | 	case OpCapture:
109 | 		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
110 | 			return false
111 | 		}
112 | 	}
113 | 	return true
114 | }
115 | 
116 | // writeRegexp writes the Perl syntax for the regular expression re to b.
117 | func writeRegexp(b *bytes.Buffer, re *Regexp) {
118 | 	switch re.Op {
119 | 	default:
120 | 		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
121 | 	case OpNoMatch:
122 | 		b.WriteString(`[^\x00-\x{10FFFF}]`)
123 | 	case OpEmptyMatch:
124 | 		b.WriteString(`(?:)`)
125 | 	case OpLiteral:
126 | 		if re.Flags&FoldCase != 0 {
127 | 			b.WriteString(`(?i:`)
128 | 		}
129 | 		for _, r := range re.Rune {
130 | 			escape(b, r, false)
131 | 		}
132 | 		if re.Flags&FoldCase != 0 {
133 | 			b.WriteString(`)`)
134 | 		}
135 | 	case OpCharClass:
136 | 		if len(re.Rune)%2 != 0 {
137 | 			b.WriteString(`[invalid char class]`)
138 | 			break
139 | 		}
140 | 		b.WriteRune('[')
141 | 		if len(re.Rune) == 0 {
142 | 			b.WriteString(`^\x00-\x{10FFFF}`)
143 | 		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
144 | 			// Contains 0 and MaxRune. Probably a negated class.
145 | 			// Print the gaps.
146 | 			b.WriteRune('^')
147 | 			for i := 1; i < len(re.Rune)-1; i += 2 {
148 | 				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
149 | 				escape(b, lo, lo == '-')
150 | 				if lo != hi {
151 | 					b.WriteRune('-')
152 | 					escape(b, hi, hi == '-')
153 | 				}
154 | 			}
155 | 		} else {
156 | 			for i := 0; i < len(re.Rune); i += 2 {
157 | 				lo, hi := re.Rune[i], re.Rune[i+1]
158 | 				escape(b, lo, lo == '-')
159 | 				if lo != hi {
160 | 					b.WriteRune('-')
161 | 					escape(b, hi, hi == '-')
162 | 				}
163 | 			}
164 | 		}
165 | 		b.WriteRune(']')
166 | 	case OpAnyCharNotNL:
167 | 		b.WriteString(`(?-s:.)`)
168 | 	case OpAnyChar:
169 | 		b.WriteString(`(?s:.)`)
170 | 	case OpBeginLine:
171 | 		b.WriteString(`(?m:^)`)
172 | 	case OpEndLine:
173 | 		b.WriteString(`(?m:$)`)
174 | 	case OpBeginText:
175 | 		b.WriteString(`\A`)
176 | 	case OpEndText:
177 | 		if re.Flags&WasDollar != 0 {
178 | 			b.WriteString(`(?-m:$)`)
179 | 		} else {
180 | 			b.WriteString(`\z`)
181 | 		}
182 | 	case OpWordBoundary:
183 | 		b.WriteString(`\b`)
184 | 	case OpNoWordBoundary:
185 | 		b.WriteString(`\B`)
186 | 	case OpCapture:
187 | 		if re.Name != "" {
188 | 			b.WriteString(`(?P<`)
189 | 			b.WriteString(re.Name)
190 | 			b.WriteRune('>')
191 | 		} else {
192 | 			b.WriteRune('(')
193 | 		}
194 | 		if re.Sub[0].Op != OpEmptyMatch {
195 | 			writeRegexp(b, re.Sub[0])
196 | 		}
197 | 		b.WriteRune(')')
198 | 	case OpStar, OpPlus, OpQuest, OpRepeat:
199 | 		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
200 | 			b.WriteString(`(?:`)
201 | 			writeRegexp(b, sub)
202 | 			b.WriteString(`)`)
203 | 		} else {
204 | 			writeRegexp(b, sub)
205 | 		}
206 | 		switch re.Op {
207 | 		case OpStar:
208 | 			b.WriteRune('*')
209 | 		case OpPlus:
210 | 			b.WriteRune('+')
211 | 		case OpQuest:
212 | 			b.WriteRune('?')
213 | 		case OpRepeat:
214 | 			b.WriteRune('{')
215 | 			b.WriteString(strconv.Itoa(re.Min))
216 | 			if re.Max != re.Min {
217 | 				b.WriteRune(',')
218 | 				if re.Max >= 0 {
219 | 					b.WriteString(strconv.Itoa(re.Max))
220 | 				}
221 | 			}
222 | 			b.WriteRune('}')
223 | 		}
224 | 		if re.Flags&NonGreedy != 0 {
225 | 			b.WriteRune('?')
226 | 		}
227 | 	case OpConcat:
228 | 		for _, sub := range re.Sub {
229 | 			if sub.Op == OpAlternate {
230 | 				b.WriteString(`(?:`)
231 | 				writeRegexp(b, sub)
232 | 				b.WriteString(`)`)
233 | 			} else {
234 | 				writeRegexp(b, sub)
235 | 			}
236 | 		}
237 | 	case OpAlternate:
238 | 		for i, sub := range re.Sub {
239 | 			if i > 0 {
240 | 				b.WriteRune('|')
241 | 			}
242 | 			writeRegexp(b, sub)
243 | 		}
244 | 	}
245 | }
246 | 
247 | func (re *Regexp) String() string {
248 | 	var b bytes.Buffer
249 | 	writeRegexp(&b, re)
250 | 	return b.String()
251 | }
252 | 
// meta lists the runes that escape always prefixes with a backslash.
const meta = `\.+*?()|[]{}^$`

// escape appends r to b in a form suitable for regular expression source.
//
// Printable runes are written as themselves, preceded by a backslash when
// r is one of the meta characters or when force is set. Non-printable
// runes use a named escape (\a \f \n \r \t \v) when one exists, otherwise
// \xNN for values below 0x100 and \x{N...} for anything larger, with the
// digits in lower-case hexadecimal.
func escape(b *bytes.Buffer, r rune, force bool) {
	if unicode.IsPrint(r) {
		if force || strings.IndexRune(meta, r) >= 0 {
			b.WriteRune('\\')
		}
		b.WriteRune(r)
		return
	}

	// Control characters with a conventional single-letter escape.
	var named string
	switch r {
	case '\a':
		named = `\a`
	case '\f':
		named = `\f`
	case '\n':
		named = `\n`
	case '\r':
		named = `\r`
	case '\t':
		named = `\t`
	case '\v':
		named = `\v`
	}
	if named != "" {
		b.WriteString(named)
		return
	}

	hex := strconv.FormatInt(int64(r), 16)
	if r >= 0x100 {
		b.WriteString(`\x{`)
		b.WriteString(hex)
		b.WriteString(`}`)
		return
	}
	// Short form: pad a lone hex digit to two characters.
	b.WriteString(`\x`)
	if len(hex) == 1 {
		b.WriteRune('0')
	}
	b.WriteString(hex)
}
292 | 
293 | // MaxCap walks the regexp to find the maximum capture index.
294 | func (re *Regexp) MaxCap() int {
295 | 	m := 0
296 | 	if re.Op == OpCapture {
297 | 		m = re.Cap
298 | 	}
299 | 	for _, sub := range re.Sub {
300 | 		if n := sub.MaxCap(); m < n {
301 | 			m = n
302 | 		}
303 | 	}
304 | 	return m
305 | }
306 | 
307 | // CapNames walks the regexp to find the names of capturing groups.
308 | func (re *Regexp) CapNames() []string {
309 | 	names := make([]string, re.MaxCap()+1)
310 | 	re.capNames(names)
311 | 	return names
312 | }
313 | 
314 | func (re *Regexp) capNames(names []string) {
315 | 	if re.Op == OpCapture {
316 | 		names[re.Cap] = re.Name
317 | 	}
318 | 	for _, sub := range re.Sub {
319 | 		sub.capNames(names)
320 | 	}
321 | }
322 | 


--------------------------------------------------------------------------------
/syntax/simplify.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | // Simplify returns a regexp equivalent to re but without counted repetitions
  8 | // and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
  9 | // The resulting regexp will execute correctly but its string representation
 10 | // will not produce the same parse tree, because capturing parentheses
 11 | // may have been duplicated or removed. For example, the simplified form
 12 | // for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
 13 | // The returned regexp may share structure with or be the original.
 14 | func (re *Regexp) Simplify() *Regexp {
 15 | 	if re == nil {
 16 | 		return nil
 17 | 	}
 18 | 	switch re.Op {
 19 | 	case OpCapture, OpConcat, OpAlternate:
 20 | 		// Simplify children, building new Regexp if children change.
 21 | 		nre := re
 22 | 		for i, sub := range re.Sub {
 23 | 			nsub := sub.Simplify()
 24 | 			if nre == re && nsub != sub {
 25 | 				// Start a copy.
 26 | 				nre = new(Regexp)
 27 | 				*nre = *re
 28 | 				nre.Rune = nil
 29 | 				nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
 30 | 			}
 31 | 			if nre != re {
 32 | 				nre.Sub = append(nre.Sub, nsub)
 33 | 			}
 34 | 		}
 35 | 		return nre
 36 | 
 37 | 	case OpStar, OpPlus, OpQuest:
 38 | 		sub := re.Sub[0].Simplify()
 39 | 		return simplify1(re.Op, re.Flags, sub, re)
 40 | 
 41 | 	case OpRepeat:
 42 | 		// Special special case: x{0} matches the empty string
 43 | 		// and doesn't even need to consider x.
 44 | 		if re.Min == 0 && re.Max == 0 {
 45 | 			return &Regexp{Op: OpEmptyMatch}
 46 | 		}
 47 | 
 48 | 		// The fun begins.
 49 | 		sub := re.Sub[0].Simplify()
 50 | 
 51 | 		// x{n,} means at least n matches of x.
 52 | 		if re.Max == -1 {
 53 | 			// Special case: x{0,} is x*.
 54 | 			if re.Min == 0 {
 55 | 				return simplify1(OpStar, re.Flags, sub, nil)
 56 | 			}
 57 | 
 58 | 			// Special case: x{1,} is x+.
 59 | 			if re.Min == 1 {
 60 | 				return simplify1(OpPlus, re.Flags, sub, nil)
 61 | 			}
 62 | 
 63 | 			// General case: x{4,} is xxxx+.
 64 | 			nre := &Regexp{Op: OpConcat}
 65 | 			nre.Sub = nre.Sub0[:0]
 66 | 			for i := 0; i < re.Min-1; i++ {
 67 | 				nre.Sub = append(nre.Sub, sub)
 68 | 			}
 69 | 			nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
 70 | 			return nre
 71 | 		}
 72 | 
 73 | 		// Special case x{0} handled above.
 74 | 
 75 | 		// Special case: x{1} is just x.
 76 | 		if re.Min == 1 && re.Max == 1 {
 77 | 			return sub
 78 | 		}
 79 | 
 80 | 		// General case: x{n,m} means n copies of x and m copies of x?
 81 | 		// The machine will do less work if we nest the final m copies,
 82 | 		// so that x{2,5} = xx(x(x(x)?)?)?
 83 | 
 84 | 		// Build leading prefix: xx.
 85 | 		var prefix *Regexp
 86 | 		if re.Min > 0 {
 87 | 			prefix = &Regexp{Op: OpConcat}
 88 | 			prefix.Sub = prefix.Sub0[:0]
 89 | 			for i := 0; i < re.Min; i++ {
 90 | 				prefix.Sub = append(prefix.Sub, sub)
 91 | 			}
 92 | 		}
 93 | 
 94 | 		// Build and attach suffix: (x(x(x)?)?)?
 95 | 		if re.Max > re.Min {
 96 | 			suffix := simplify1(OpQuest, re.Flags, sub, nil)
 97 | 			for i := re.Min + 1; i < re.Max; i++ {
 98 | 				nre2 := &Regexp{Op: OpConcat}
 99 | 				nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
100 | 				suffix = simplify1(OpQuest, re.Flags, nre2, nil)
101 | 			}
102 | 			if prefix == nil {
103 | 				return suffix
104 | 			}
105 | 			prefix.Sub = append(prefix.Sub, suffix)
106 | 		}
107 | 		if prefix != nil {
108 | 			return prefix
109 | 		}
110 | 
111 | 		// Some degenerate case like min > max or min < max < 0.
112 | 		// Handle as impossible match.
113 | 		return &Regexp{Op: OpNoMatch}
114 | 	}
115 | 
116 | 	return re
117 | }
118 | 
119 | // simplify1 implements Simplify for the unary OpStar,
120 | // OpPlus, and OpQuest operators. It returns the simple regexp
121 | // equivalent to
122 | //
123 | //	Regexp{Op: op, Flags: flags, Sub: {sub}}
124 | //
125 | // under the assumption that sub is already simple, and
126 | // without first allocating that structure. If the regexp
127 | // to be returned turns out to be equivalent to re, simplify1
128 | // returns re instead.
129 | //
130 | // simplify1 is factored out of Simplify because the implementation
131 | // for other operators generates these unary expressions.
132 | // Letting them call simplify1 makes sure the expressions they
133 | // generate are simple.
134 | func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
135 | 	// Special case: repeat the empty string as much as
136 | 	// you want, but it's still the empty string.
137 | 	if sub.Op == OpEmptyMatch {
138 | 		return sub
139 | 	}
140 | 	// The operators are idempotent if the flags match.
141 | 	if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy {
142 | 		return sub
143 | 	}
144 | 	if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] {
145 | 		return re
146 | 	}
147 | 
148 | 	re = &Regexp{Op: op, Flags: flags}
149 | 	re.Sub = append(re.Sub0[:0], sub)
150 | 	return re
151 | }
152 | 


--------------------------------------------------------------------------------
/syntax/simplify_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package syntax
  6 | 
  7 | import "testing"
  8 | 
// simplifyTests pairs a regular expression source string (Regexp) with the
// text expected from Simplify().String() on its parse tree (Simple).
// TestSimplify parses each Regexp with MatchNL|Perl&^OneLine, which is why
// `.`, `^` and `$` print as (?s:.), (?m:^) and (?m:$) below.
var simplifyTests = []struct {
	Regexp string
	Simple string
}{
	// Already-simple constructs
	{`a`, `a`},
	{`ab`, `ab`},
	{`a|b`, `[a-b]`},
	{`ab|cd`, `ab|cd`},
	{`(ab)*`, `(ab)*`},
	{`(ab)+`, `(ab)+`},
	{`(ab)?`, `(ab)?`},
	{`.`, `(?s:.)`},
	{`^`, `(?m:^)`},
	{`$`, `(?m:$)`},
	{`[ac]`, `[ac]`},
	{`[^ac]`, `[^ac]`},

	// Posix character classes
	{`[[:alnum:]]`, `[0-9A-Za-z]`},
	{`[[:alpha:]]`, `[A-Za-z]`},
	{`[[:blank:]]`, `[\t ]`},
	{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
	{`[[:digit:]]`, `[0-9]`},
	{`[[:graph:]]`, `[!-~]`},
	{`[[:lower:]]`, `[a-z]`},
	{`[[:print:]]`, `[ -~]`},
	{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
	{`[[:space:]]`, `[\t-\r ]`},
	{`[[:upper:]]`, `[A-Z]`},
	{`[[:xdigit:]]`, `[0-9A-Fa-f]`},

	// Perl character classes
	{`\d`, `[0-9]`},
	{`\s`, `[\t-\n\f-\r ]`},
	{`\w`, `[0-9A-Z_a-z]`},
	{`\D`, `[^0-9]`},
	{`\S`, `[^\t-\n\f-\r ]`},
	{`\W`, `[^0-9A-Z_a-z]`},
	{`[\d]`, `[0-9]`},
	{`[\s]`, `[\t-\n\f-\r ]`},
	{`[\w]`, `[0-9A-Z_a-z]`},
	{`[\D]`, `[^0-9]`},
	{`[\S]`, `[^\t-\n\f-\r ]`},
	{`[\W]`, `[^0-9A-Z_a-z]`},

	// Posix repetitions
	{`a{1}`, `a`},
	{`a{2}`, `aa`},
	{`a{5}`, `aaaaa`},
	{`a{0,1}`, `a?`},
	// The next three are illegible because Simplify inserts (?:)
	// parens instead of () parens to avoid creating extra
	// captured subexpressions. The comments show a version with fewer parens.
	{`(a){0,2}`, `(?:(a)(a)?)?`},                       //       (aa?)?
	{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`},       //   (a(a(aa?)?)?)?
	{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
	{`a{0,2}`, `(?:aa?)?`},                             //       (aa?)?
	{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`},                 //   (a(a(aa?)?)?)?
	{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`},               // aa(a(a(aa?)?)?)?
	{`a{0,}`, `a*`},
	{`a{1,}`, `a+`},
	{`a{2,}`, `aa+`},
	{`a{5,}`, `aaaaa+`},

	// Test that operators simplify their arguments.
	{`(?:a{1,}){1,}`, `a+`},
	{`(a{1,}b{1,})`, `(a+b+)`},
	{`a{1,}|b{1,}`, `a+|b+`},
	{`(?:a{1,})*`, `(?:a+)*`},
	{`(?:a{1,})+`, `a+`},
	{`(?:a{1,})?`, `(?:a+)?`},
	{``, `(?:)`},
	{`a{0}`, `(?:)`},

	// Character class simplification
	{`[ab]`, `[a-b]`},
	{`[a-za-za-z]`, `[a-z]`},
	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
	{`[ABCDEFGH]`, `[A-H]`},
	{`[AB-CD-EF-GH]`, `[A-H]`},
	{`[W-ZP-XE-R]`, `[E-Z]`},
	{`[a-ee-gg-m]`, `[a-m]`},
	{`[a-ea-ha-m]`, `[a-m]`},
	{`[a-ma-ha-e]`, `[a-m]`},
	{`[a-zA-Z0-9 -~]`, `[ -~]`},

	// Empty character classes
	{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},

	// Full character classes
	{`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},

	// Unicode case folding.
	{`(?i)A`, `(?i:A)`},
	{`(?i)a`, `(?i:A)`},
	{`(?i)[A]`, `(?i:A)`},
	{`(?i)[a]`, `(?i:A)`},
	{`(?i)K`, `(?i:K)`},
	{`(?i)k`, `(?i:K)`},
	{`(?i)\x{212a}`, "(?i:K)"},
	{`(?i)[K]`, "[Kk\u212A]"},
	{`(?i)[k]`, "[Kk\u212A]"},
	{`(?i)[\x{212a}]`, "[Kk\u212A]"},
	{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
	{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
	{`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},

	// Empty string as a regular expression.
	// The empty string must be preserved inside parens in order
	// to make submatches work right, so these tests are less
	// interesting than they might otherwise be. String inserts
	// explicit (?:) in place of non-parenthesized empty strings,
	// to make them easier to spot for other parsers.
	{`(a|b|)`, `([a-b]|(?:))`},
	{`(|)`, `()`},
	{`a()`, `a()`},
	{`(()|())`, `(()|())`},
	{`(a|)`, `(a|(?:))`},
	{`ab()cd()`, `ab()cd()`},
	{`()`, `()`},
	{`()*`, `()*`},
	{`()+`, `()+`},
	{`()?`, `()?`},
	{`(){0}`, `(?:)`},
	{`(){1}`, `()`},
	{`(){1,}`, `()+`},
	{`(){0,2}`, `(?:()()?)?`},
}
138 | 
139 | func TestSimplify(t *testing.T) {
140 | 	for _, tt := range simplifyTests {
141 | 		re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
142 | 		if err != nil {
143 | 			t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
144 | 			continue
145 | 		}
146 | 		s := re.Simplify().String()
147 | 		if s != tt.Simple {
148 | 			t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
149 | 		}
150 | 	}
151 | }
152 | 


--------------------------------------------------------------------------------
/testdata/README:
--------------------------------------------------------------------------------
 1 | AT&T POSIX Test Files
 2 | See testregex.c for copyright + license.
 3 | 
 4 | testregex.c	http://www2.research.att.com/~gsf/testregex/testregex.c
 5 | basic.dat	http://www2.research.att.com/~gsf/testregex/basic.dat
 6 | nullsubexpr.dat	http://www2.research.att.com/~gsf/testregex/nullsubexpr.dat
 7 | repetition.dat	http://www2.research.att.com/~gsf/testregex/repetition.dat
 8 | 
 9 | The test data has been edited to reflect RE2/Go differences:
10 |   * In a star of a possibly empty match like (a*)* matching x,
11 |     the no match case runs the starred subexpression zero times,
12 |     not once.  This is consistent with (a*)* matching a, which
13 |     runs the starred subexpression one time, not twice.
14 |   * The submatch choice is first match, not the POSIX rule.
15 | 
16 | Such changes are marked with 'RE2/Go'.
17 | 
18 | 
19 | RE2 Test Files
20 | 
21 | re2-exhaustive.txt.bz2 and re2-search.txt are built by running
22 | 'make log' in the RE2 distribution https://github.com/google/re2/
23 | 
24 | The exhaustive file is compressed because it is huge.
25 | 


--------------------------------------------------------------------------------
/testdata/basic.dat:
--------------------------------------------------------------------------------
  1 | NOTE	all standard compliant implementations should pass these : 2002-05-31
  2 | 
  3 | BE	abracadabra$	abracadabracadabra	(7,18)
  4 | BE	a...b		abababbb		(2,7)
  5 | BE	XXXXXX		..XXXXXX		(2,8)
  6 | E	\)		()	(1,2)
  7 | BE	a]		a]a	(0,2)
  8 | B	}		}	(0,1)
  9 | E	\}		}	(0,1)
 10 | BE	\]		]	(0,1)
 11 | B	]		]	(0,1)
 12 | E	]		]	(0,1)
 13 | B	{		{	(0,1)
 14 | B	}		}	(0,1)
 15 | BE	^a		ax	(0,1)
 16 | BE	\^a		a^a	(1,3)
 17 | BE	a\^		a^	(0,2)
 18 | BE	a$		aa	(1,2)
 19 | BE	a\$		a$	(0,2)
 20 | BE	^$		NULL	(0,0)
 21 | E	$^		NULL	(0,0)
 22 | E	a($)		aa	(1,2)(2,2)
 23 | E	a*(^a)		aa	(0,1)(0,1)
 24 | E	(..)*(...)*		a	(0,0)
 25 | E	(..)*(...)*		abcd	(0,4)(2,4)
 26 | E	(ab|a)(bc|c)		abc	(0,3)(0,2)(2,3)
 27 | E	(ab)c|abc		abc	(0,3)(0,2)
 28 | E	a{0}b		ab			(1,2)
 29 | E	(a*)(b?)(b+)b{3}	aaabbbbbbb	(0,10)(0,3)(3,4)(4,7)
 30 | E	(a*)(b{0,1})(b{1,})b{3}	aaabbbbbbb	(0,10)(0,3)(3,4)(4,7)
 31 | E	a{9876543210}	NULL	BADBR
 32 | E	((a|a)|a)			a	(0,1)(0,1)(0,1)
 33 | E	(a*)(a|aa)			aaaa	(0,4)(0,3)(3,4)
 34 | E	a*(a.|aa)			aaaa	(0,4)(2,4)
 35 | E	a(b)|c(d)|a(e)f			aef	(0,3)(?,?)(?,?)(1,2)
 36 | E	(a|b)?.*			b	(0,1)(0,1)
 37 | E	(a|b)c|a(b|c)			ac	(0,2)(0,1)
 38 | E	(a|b)c|a(b|c)			ab	(0,2)(?,?)(1,2)
 39 | E	(a|b)*c|(a|ab)*c		abc	(0,3)(1,2)
 40 | E	(a|b)*c|(a|ab)*c		xc	(1,2)
 41 | E	(.a|.b).*|.*(.a|.b)		xa	(0,2)(0,2)
 42 | E	a?(ab|ba)ab			abab	(0,4)(0,2)
 43 | E	a?(ac{0}b|ba)ab			abab	(0,4)(0,2)
 44 | E	ab|abab				abbabab	(0,2)
 45 | E	aba|bab|bba			baaabbbaba	(5,8)
 46 | E	aba|bab				baaabbbaba	(6,9)
 47 | E	(aa|aaa)*|(a|aaaaa)		aa	(0,2)(0,2)
 48 | E	(a.|.a.)*|(a|.a...)		aa	(0,2)(0,2)
 49 | E	ab|a				xabc	(1,3)
 50 | E	ab|a				xxabc	(2,4)
 51 | Ei	(Ab|cD)*			aBcD	(0,4)(2,4)
 52 | BE	[^-]			--a		(2,3)
 53 | BE	[a-]*			--a		(0,3)
 54 | BE	[a-m-]*			--amoma--	(0,4)
 55 | E	:::1:::0:|:::1:1:0:	:::0:::1:::1:::0:	(8,17)
 56 | E	:::1:::0:|:::1:1:1:	:::0:::1:::1:::0:	(8,17)
 57 | {E	[[:upper:]]		A		(0,1)	[[<element>]] not supported
 58 | E	[[:lower:]]+		`az{		(1,3)
 59 | E	[[:upper:]]+		@AZ[		(1,3)
 60 | # No collation in Go
 61 | #BE	[[-]]			[[-]]		(2,4)
 62 | #BE	[[.NIL.]]	NULL	ECOLLATE
 63 | #BE	[[=aleph=]]	NULL	ECOLLATE
 64 | }
 65 | BE$	\n		\n	(0,1)
 66 | BEn$	\n		\n	(0,1)
 67 | BE$	[^a]		\n	(0,1)
 68 | BE$	\na		\na	(0,2)
 69 | E	(a)(b)(c)	abc	(0,3)(0,1)(1,2)(2,3)
 70 | BE	xxx		xxx	(0,3)
 71 | E1	(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)	feb 6,	(0,6)
 72 | E1	(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)	2/7	(0,3)
 73 | E1	(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)	feb 1,Feb 6	(5,11)
 74 | E3	((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))	x	(0,1)(0,1)(0,1)
 75 | E3	((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*	xx	(0,2)(1,2)(1,2)
 76 | E	a?(ab|ba)*	ababababababababababababababababababababababababababababababababababababababababa	(0,81)(79,81)
 77 | E	abaa|abbaa|abbbaa|abbbbaa	ababbabbbabbbabbbbabbbbaa	(18,25)
 78 | E	abaa|abbaa|abbbaa|abbbbaa	ababbabbbabbbabbbbabaa	(18,22)
 79 | E	aaac|aabc|abac|abbc|baac|babc|bbac|bbbc	baaabbbabac	(7,11)
 80 | BE$	.*			\x01\xff	(0,2)
 81 | E	aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll		XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa	(53,57)
 82 | L	aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll		XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa	NOMATCH
 83 | E	a*a*a*a*a*b		aaaaaaaaab	(0,10)
 84 | BE	^			NULL		(0,0)
 85 | BE	$			NULL		(0,0)
 86 | BE	^$			NULL		(0,0)
 87 | BE	^a$			a		(0,1)
 88 | BE	abc			abc		(0,3)
 89 | BE	abc			xabcy		(1,4)
 90 | BE	abc			ababc		(2,5)
 91 | BE	ab*c			abc		(0,3)
 92 | BE	ab*bc			abc		(0,3)
 93 | BE	ab*bc			abbc		(0,4)
 94 | BE	ab*bc			abbbbc		(0,6)
 95 | E	ab+bc			abbc		(0,4)
 96 | E	ab+bc			abbbbc		(0,6)
 97 | E	ab?bc			abbc		(0,4)
 98 | E	ab?bc			abc		(0,3)
 99 | E	ab?c			abc		(0,3)
100 | BE	^abc$			abc		(0,3)
101 | BE	^abc			abcc		(0,3)
102 | BE	abc$			aabc		(1,4)
103 | BE	^			abc		(0,0)
104 | BE	$			abc		(3,3)
105 | BE	a.c			abc		(0,3)
106 | BE	a.c			axc		(0,3)
107 | BE	a.*c			axyzc		(0,5)
108 | BE	a[bc]d			abd		(0,3)
109 | BE	a[b-d]e			ace		(0,3)
110 | BE	a[b-d]			aac		(1,3)
111 | BE	a[-b]			a-		(0,2)
112 | BE	a[b-]			a-		(0,2)
113 | BE	a]			a]		(0,2)
114 | BE	a[]]b			a]b		(0,3)
115 | BE	a[^bc]d			aed		(0,3)
116 | BE	a[^-b]c			adc		(0,3)
117 | BE	a[^]b]c			adc		(0,3)
118 | E	ab|cd			abc		(0,2)
119 | E	ab|cd			abcd		(0,2)
120 | E	a\(b			a(b		(0,3)
121 | E	a\(*b			ab		(0,2)
122 | E	a\(*b			a((b		(0,4)
123 | E	((a))			abc		(0,1)(0,1)(0,1)
124 | E	(a)b(c)			abc		(0,3)(0,1)(2,3)
125 | E	a+b+c			aabbabc		(4,7)
126 | E	a*			aaa		(0,3)
127 | #E	(a*)*			-		(0,0)(0,0)
128 | E	(a*)*			-		(0,0)(?,?)	RE2/Go
129 | E	(a*)+			-		(0,0)(0,0)
130 | #E	(a*|b)*			-		(0,0)(0,0)
131 | E	(a*|b)*			-		(0,0)(?,?)	RE2/Go
132 | E	(a+|b)*			ab		(0,2)(1,2)
133 | E	(a+|b)+			ab		(0,2)(1,2)
134 | E	(a+|b)?			ab		(0,1)(0,1)
135 | BE	[^ab]*			cde		(0,3)
136 | #E	(^)*			-		(0,0)(0,0)
137 | E	(^)*			-		(0,0)(?,?)	RE2/Go
138 | BE	a*			NULL		(0,0)
139 | E	([abc])*d		abbbcd		(0,6)(4,5)
140 | E	([abc])*bcd		abcd		(0,4)(0,1)
141 | E	a|b|c|d|e		e		(0,1)
142 | E	(a|b|c|d|e)f		ef		(0,2)(0,1)
143 | #E	((a*|b))*		-		(0,0)(0,0)(0,0)
144 | E	((a*|b))*		-		(0,0)(?,?)(?,?)	RE2/Go
145 | BE	abcd*efg		abcdefg		(0,7)
146 | BE	ab*			xabyabbbz	(1,3)
147 | BE	ab*			xayabbbz	(1,2)
148 | E	(ab|cd)e		abcde		(2,5)(2,4)
149 | BE	[abhgefdc]ij		hij		(0,3)
150 | E	(a|b)c*d		abcd		(1,4)(1,2)
151 | E	(ab|ab*)bc		abc		(0,3)(0,1)
152 | E	a([bc]*)c*		abc		(0,3)(1,3)
153 | E	a([bc]*)(c*d)		abcd		(0,4)(1,3)(3,4)
154 | E	a([bc]+)(c*d)		abcd		(0,4)(1,3)(3,4)
155 | E	a([bc]*)(c+d)		abcd		(0,4)(1,2)(2,4)
156 | E	a[bcd]*dcdcde		adcdcde		(0,7)
157 | E	(ab|a)b*c		abc		(0,3)(0,2)
158 | E	((a)(b)c)(d)		abcd		(0,4)(0,3)(0,1)(1,2)(3,4)
159 | BE	[A-Za-z_][A-Za-z0-9_]*	alpha		(0,5)
160 | E	^a(bc+|b[eh])g|.h$	abh		(1,3)
161 | E	(bc+d$|ef*g.|h?i(j|k))	effgz		(0,5)(0,5)
162 | E	(bc+d$|ef*g.|h?i(j|k))	ij		(0,2)(0,2)(1,2)
163 | E	(bc+d$|ef*g.|h?i(j|k))	reffgz		(1,6)(1,6)
164 | E	(((((((((a)))))))))	a		(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
165 | BE	multiple words		multiple words yeah	(0,14)
166 | E	(.*)c(.*)		abcde		(0,5)(0,2)(3,5)
167 | BE	abcd			abcd		(0,4)
168 | E	a(bc)d			abcd		(0,4)(1,3)
169 | E	a[-]?c		ac		(0,3)
170 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Qaddafi	(0,15)(?,?)(10,12)
171 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Mo'ammar Gadhafi	(0,16)(?,?)(11,13)
172 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Kaddafi	(0,15)(?,?)(10,12)
173 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Qadhafi	(0,15)(?,?)(10,12)
174 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Gadafi	(0,14)(?,?)(10,11)
175 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Mu'ammar Qadafi	(0,15)(?,?)(11,12)
176 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Moamar Gaddafi	(0,14)(?,?)(9,11)
177 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Mu'ammar Qadhdhafi	(0,18)(?,?)(13,15)
178 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Khaddafi	(0,16)(?,?)(11,13)
179 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Ghaddafy	(0,16)(?,?)(11,13)
180 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Ghadafi	(0,15)(?,?)(11,12)
181 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Ghaddafi	(0,16)(?,?)(11,13)
182 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muamar Kaddafi	(0,14)(?,?)(9,11)
183 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Quathafi	(0,16)(?,?)(11,13)
184 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Muammar Gheddafi	(0,16)(?,?)(11,13)
185 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Moammar Khadafy	(0,15)(?,?)(11,12)
186 | E	M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]	Moammar Qudhafi	(0,15)(?,?)(10,12)
187 | E	a+(b|c)*d+		aabcdd			(0,6)(3,4)
188 | E	^.+$			vivi			(0,4)
189 | E	^(.+)$			vivi			(0,4)(0,4)
190 | E	^([^!.]+).att.com!(.+)$	gryphon.att.com!eby	(0,19)(0,7)(16,19)
191 | E	^([^!]+!)?([^!]+)$	bas			(0,3)(?,?)(0,3)
192 | E	^([^!]+!)?([^!]+)$	bar!bas			(0,7)(0,4)(4,7)
193 | E	^([^!]+!)?([^!]+)$	foo!bas			(0,7)(0,4)(4,7)
194 | E	^.+!([^!]+!)([^!]+)$	foo!bar!bas		(0,11)(4,8)(8,11)
195 | E	((foo)|(bar))!bas	bar!bas			(0,7)(0,3)(?,?)(0,3)
196 | E	((foo)|(bar))!bas	foo!bar!bas		(4,11)(4,7)(?,?)(4,7)
197 | E	((foo)|(bar))!bas	foo!bas			(0,7)(0,3)(0,3)
198 | E	((foo)|bar)!bas		bar!bas			(0,7)(0,3)
199 | E	((foo)|bar)!bas		foo!bar!bas		(4,11)(4,7)
200 | E	((foo)|bar)!bas		foo!bas			(0,7)(0,3)(0,3)
201 | E	(foo|(bar))!bas		bar!bas			(0,7)(0,3)(0,3)
202 | E	(foo|(bar))!bas		foo!bar!bas		(4,11)(4,7)(4,7)
203 | E	(foo|(bar))!bas		foo!bas			(0,7)(0,3)
204 | E	(foo|bar)!bas		bar!bas			(0,7)(0,3)
205 | E	(foo|bar)!bas		foo!bar!bas		(4,11)(4,7)
206 | E	(foo|bar)!bas		foo!bas			(0,7)(0,3)
207 | E	^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$	foo!bar!bas	(0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
208 | E	^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$	bas		(0,3)(?,?)(0,3)
209 | E	^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$	bar!bas		(0,7)(0,4)(4,7)
210 | E	^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$	foo!bar!bas	(0,11)(?,?)(?,?)(4,8)(8,11)
211 | E	^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$	foo!bas		(0,7)(0,4)(4,7)
212 | E	^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$	bas		(0,3)(0,3)(?,?)(0,3)
213 | E	^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$	bar!bas		(0,7)(0,7)(0,4)(4,7)
214 | E	^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$	foo!bar!bas	(0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
215 | E	^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$	foo!bas		(0,7)(0,7)(0,4)(4,7)
216 | E	.*(/XXX).*			/XXX			(0,4)(0,4)
217 | E	.*(\\XXX).*			\XXX			(0,4)(0,4)
218 | E	\\XXX				\XXX			(0,4)
219 | E	.*(/000).*			/000			(0,4)(0,4)
220 | E	.*(\\000).*			\000			(0,4)(0,4)
221 | E	\\000				\000			(0,4)
222 | 


--------------------------------------------------------------------------------
/testdata/nullsubexpr.dat:
--------------------------------------------------------------------------------
 1 | NOTE	null subexpression matches : 2002-06-06
 2 | 
 3 | E	(a*)*		a		(0,1)(0,1)
 4 | #E	SAME		x		(0,0)(0,0)
 5 | E	SAME		x		(0,0)(?,?)	RE2/Go
 6 | E	SAME		aaaaaa		(0,6)(0,6)
 7 | E	SAME		aaaaaax		(0,6)(0,6)
 8 | E	(a*)+		a		(0,1)(0,1)
 9 | E	SAME		x		(0,0)(0,0)
10 | E	SAME		aaaaaa		(0,6)(0,6)
11 | E	SAME		aaaaaax		(0,6)(0,6)
12 | E	(a+)*		a		(0,1)(0,1)
13 | E	SAME		x		(0,0)
14 | E	SAME		aaaaaa		(0,6)(0,6)
15 | E	SAME		aaaaaax		(0,6)(0,6)
16 | E	(a+)+		a		(0,1)(0,1)
17 | E	SAME		x		NOMATCH
18 | E	SAME		aaaaaa		(0,6)(0,6)
19 | E	SAME		aaaaaax		(0,6)(0,6)
20 | 
21 | E	([a]*)*		a		(0,1)(0,1)
22 | #E	SAME		x		(0,0)(0,0)
23 | E	SAME		x		(0,0)(?,?)	RE2/Go
24 | E	SAME		aaaaaa		(0,6)(0,6)
25 | E	SAME		aaaaaax		(0,6)(0,6)
26 | E	([a]*)+		a		(0,1)(0,1)
27 | E	SAME		x		(0,0)(0,0)
28 | E	SAME		aaaaaa		(0,6)(0,6)
29 | E	SAME		aaaaaax		(0,6)(0,6)
30 | E	([^b]*)*	a		(0,1)(0,1)
31 | #E	SAME		b		(0,0)(0,0)
32 | E	SAME		b		(0,0)(?,?)	RE2/Go
33 | E	SAME		aaaaaa		(0,6)(0,6)
34 | E	SAME		aaaaaab		(0,6)(0,6)
35 | E	([ab]*)*	a		(0,1)(0,1)
36 | E	SAME		aaaaaa		(0,6)(0,6)
37 | E	SAME		ababab		(0,6)(0,6)
38 | E	SAME		bababa		(0,6)(0,6)
39 | E	SAME		b		(0,1)(0,1)
40 | E	SAME		bbbbbb		(0,6)(0,6)
41 | E	SAME		aaaabcde	(0,5)(0,5)
42 | E	([^a]*)*	b		(0,1)(0,1)
43 | E	SAME		bbbbbb		(0,6)(0,6)
44 | #E	SAME		aaaaaa		(0,0)(0,0)
45 | E	SAME		aaaaaa		(0,0)(?,?)	RE2/Go
46 | E	([^ab]*)*	ccccxx		(0,6)(0,6)
47 | #E	SAME		ababab		(0,0)(0,0)
48 | E	SAME		ababab		(0,0)(?,?)	RE2/Go
49 | 
50 | E	((z)+|a)*	zabcde		(0,2)(1,2)
51 | 
52 | #{E	a+?		aaaaaa		(0,1)	no *? +? minimal match ops
53 | #E	(a)		aaa		(0,1)(0,1)
54 | #E	(a*?)		aaa		(0,0)(0,0)
55 | #E	(a)*?		aaa		(0,0)
56 | #E	(a*?)*?		aaa		(0,0)
57 | #}
58 | 
59 | B	\(a*\)*\(x\)		x	(0,1)(0,0)(0,1)
60 | B	\(a*\)*\(x\)		ax	(0,2)(0,1)(1,2)
61 | B	\(a*\)*\(x\)		axa	(0,2)(0,1)(1,2)
62 | B	\(a*\)*\(x\)\(\1\)	x	(0,1)(0,0)(0,1)(1,1)
63 | B	\(a*\)*\(x\)\(\1\)	ax	(0,2)(1,1)(1,2)(2,2)
64 | B	\(a*\)*\(x\)\(\1\)	axa	(0,3)(0,1)(1,2)(2,3)
65 | B	\(a*\)*\(x\)\(\1\)\(x\)	axax	(0,4)(0,1)(1,2)(2,3)(3,4)
66 | B	\(a*\)*\(x\)\(\1\)\(x\)	axxa	(0,3)(1,1)(1,2)(2,2)(2,3)
67 | 
68 | #E	(a*)*(x)		x	(0,1)(0,0)(0,1)
69 | E	(a*)*(x)		x	(0,1)(?,?)(0,1)	RE2/Go
70 | E	(a*)*(x)		ax	(0,2)(0,1)(1,2)
71 | E	(a*)*(x)		axa	(0,2)(0,1)(1,2)
72 | 
73 | E	(a*)+(x)		x	(0,1)(0,0)(0,1)
74 | E	(a*)+(x)		ax	(0,2)(0,1)(1,2)
75 | E	(a*)+(x)		axa	(0,2)(0,1)(1,2)
76 | 
77 | E	(a*){2}(x)		x	(0,1)(0,0)(0,1)
78 | E	(a*){2}(x)		ax	(0,2)(1,1)(1,2)
79 | E	(a*){2}(x)		axa	(0,2)(1,1)(1,2)
80 | 


--------------------------------------------------------------------------------
/testdata/re2-exhaustive.txt.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsc/binaryregexp/545cabda89ca36b48b8e681a30d9d769a30b3074/testdata/re2-exhaustive.txt.bz2


--------------------------------------------------------------------------------
/testdata/repetition.dat:
--------------------------------------------------------------------------------
  1 | NOTE	implicit vs. explicit repetitions : 2009-02-02
  2 | 
  3 | # Glenn Fowler <gsf@research.att.com>
  4 | # conforming matches (column 4) must match one of the following BREs
  5 | #	NOMATCH
  6 | #	(0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
  7 | #	(0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
  8 | # i.e., each 3-tuple has two identical elements and one (?,?)
  9 | 
 10 | E	((..)|(.))				NULL		NOMATCH
 11 | E	((..)|(.))((..)|(.))			NULL		NOMATCH
 12 | E	((..)|(.))((..)|(.))((..)|(.))		NULL		NOMATCH
 13 | 
 14 | E	((..)|(.)){1}				NULL		NOMATCH
 15 | E	((..)|(.)){2}				NULL		NOMATCH
 16 | E	((..)|(.)){3}				NULL		NOMATCH
 17 | 
 18 | E	((..)|(.))*				NULL		(0,0)
 19 | 
 20 | E	((..)|(.))				a		(0,1)(0,1)(?,?)(0,1)
 21 | E	((..)|(.))((..)|(.))			a		NOMATCH
 22 | E	((..)|(.))((..)|(.))((..)|(.))		a		NOMATCH
 23 | 
 24 | E	((..)|(.)){1}				a		(0,1)(0,1)(?,?)(0,1)
 25 | E	((..)|(.)){2}				a		NOMATCH
 26 | E	((..)|(.)){3}				a		NOMATCH
 27 | 
 28 | E	((..)|(.))*				a		(0,1)(0,1)(?,?)(0,1)
 29 | 
 30 | E	((..)|(.))				aa		(0,2)(0,2)(0,2)(?,?)
 31 | E	((..)|(.))((..)|(.))			aa		(0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
 32 | E	((..)|(.))((..)|(.))((..)|(.))		aa		NOMATCH
 33 | 
 34 | E	((..)|(.)){1}				aa		(0,2)(0,2)(0,2)(?,?)
 35 | E	((..)|(.)){2}				aa		(0,2)(1,2)(?,?)(1,2)
 36 | E	((..)|(.)){3}				aa		NOMATCH
 37 | 
 38 | E	((..)|(.))*				aa		(0,2)(0,2)(0,2)(?,?)
 39 | 
 40 | E	((..)|(.))				aaa		(0,2)(0,2)(0,2)(?,?)
 41 | E	((..)|(.))((..)|(.))			aaa		(0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
 42 | E	((..)|(.))((..)|(.))((..)|(.))		aaa		(0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
 43 | 
 44 | E	((..)|(.)){1}				aaa		(0,2)(0,2)(0,2)(?,?)
 45 | #E	((..)|(.)){2}				aaa		(0,3)(2,3)(?,?)(2,3)
 46 | E	((..)|(.)){2}				aaa		(0,3)(2,3)(0,2)(2,3)	RE2/Go
 47 | E	((..)|(.)){3}				aaa		(0,3)(2,3)(?,?)(2,3)
 48 | 
 49 | #E	((..)|(.))*				aaa		(0,3)(2,3)(?,?)(2,3)
 50 | E	((..)|(.))*				aaa		(0,3)(2,3)(0,2)(2,3)	RE2/Go
 51 | 
 52 | E	((..)|(.))				aaaa		(0,2)(0,2)(0,2)(?,?)
 53 | E	((..)|(.))((..)|(.))			aaaa		(0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
 54 | E	((..)|(.))((..)|(.))((..)|(.))		aaaa		(0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
 55 | 
 56 | E	((..)|(.)){1}				aaaa		(0,2)(0,2)(0,2)(?,?)
 57 | E	((..)|(.)){2}				aaaa		(0,4)(2,4)(2,4)(?,?)
 58 | #E	((..)|(.)){3}				aaaa		(0,4)(3,4)(?,?)(3,4)
 59 | E	((..)|(.)){3}				aaaa		(0,4)(3,4)(0,2)(3,4)	RE2/Go
 60 | 
 61 | E	((..)|(.))*				aaaa		(0,4)(2,4)(2,4)(?,?)
 62 | 
 63 | E	((..)|(.))				aaaaa		(0,2)(0,2)(0,2)(?,?)
 64 | E	((..)|(.))((..)|(.))			aaaaa		(0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
 65 | E	((..)|(.))((..)|(.))((..)|(.))		aaaaa		(0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
 66 | 
 67 | E	((..)|(.)){1}				aaaaa		(0,2)(0,2)(0,2)(?,?)
 68 | E	((..)|(.)){2}				aaaaa		(0,4)(2,4)(2,4)(?,?)
 69 | #E	((..)|(.)){3}				aaaaa		(0,5)(4,5)(?,?)(4,5)
 70 | E	((..)|(.)){3}				aaaaa		(0,5)(4,5)(2,4)(4,5)	RE2/Go
 71 | 
 72 | #E	((..)|(.))*				aaaaa		(0,5)(4,5)(?,?)(4,5)
 73 | E	((..)|(.))*				aaaaa		(0,5)(4,5)(2,4)(4,5)	RE2/Go
 74 | 
 75 | E	((..)|(.))				aaaaaa		(0,2)(0,2)(0,2)(?,?)
 76 | E	((..)|(.))((..)|(.))			aaaaaa		(0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
 77 | E	((..)|(.))((..)|(.))((..)|(.))		aaaaaa		(0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
 78 | 
 79 | E	((..)|(.)){1}				aaaaaa		(0,2)(0,2)(0,2)(?,?)
 80 | E	((..)|(.)){2}				aaaaaa		(0,4)(2,4)(2,4)(?,?)
 81 | E	((..)|(.)){3}				aaaaaa		(0,6)(4,6)(4,6)(?,?)
 82 | 
 83 | E	((..)|(.))*				aaaaaa		(0,6)(4,6)(4,6)(?,?)
 84 | 
 85 | NOTE	additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
 86 | 
 87 | # These test a bug in OS X / FreeBSD / NetBSD, and libtre.
 88 | # Linux/GLIBC gets the {8,} and {8,8} wrong.
 89 | 
 90 | :HA#100:E	X(.?){0,}Y	X1234567Y	(0,9)(7,8)
 91 | :HA#101:E	X(.?){1,}Y	X1234567Y	(0,9)(7,8)
 92 | :HA#102:E	X(.?){2,}Y	X1234567Y	(0,9)(7,8)
 93 | :HA#103:E	X(.?){3,}Y	X1234567Y	(0,9)(7,8)
 94 | :HA#104:E	X(.?){4,}Y	X1234567Y	(0,9)(7,8)
 95 | :HA#105:E	X(.?){5,}Y	X1234567Y	(0,9)(7,8)
 96 | :HA#106:E	X(.?){6,}Y	X1234567Y	(0,9)(7,8)
 97 | :HA#107:E	X(.?){7,}Y	X1234567Y	(0,9)(7,8)
 98 | :HA#108:E	X(.?){8,}Y	X1234567Y	(0,9)(8,8)
 99 | #:HA#110:E	X(.?){0,8}Y	X1234567Y	(0,9)(7,8)
100 | :HA#110:E	X(.?){0,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
101 | #:HA#111:E	X(.?){1,8}Y	X1234567Y	(0,9)(7,8)
102 | :HA#111:E	X(.?){1,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
103 | #:HA#112:E	X(.?){2,8}Y	X1234567Y	(0,9)(7,8)
104 | :HA#112:E	X(.?){2,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
105 | #:HA#113:E	X(.?){3,8}Y	X1234567Y	(0,9)(7,8)
106 | :HA#113:E	X(.?){3,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
107 | #:HA#114:E	X(.?){4,8}Y	X1234567Y	(0,9)(7,8)
108 | :HA#114:E	X(.?){4,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
109 | #:HA#115:E	X(.?){5,8}Y	X1234567Y	(0,9)(7,8)
110 | :HA#115:E	X(.?){5,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
111 | #:HA#116:E	X(.?){6,8}Y	X1234567Y	(0,9)(7,8)
112 | :HA#116:E	X(.?){6,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
113 | #:HA#117:E	X(.?){7,8}Y	X1234567Y	(0,9)(7,8)
114 | :HA#117:E	X(.?){7,8}Y	X1234567Y	(0,9)(8,8)	RE2/Go
115 | :HA#118:E	X(.?){8,8}Y	X1234567Y	(0,9)(8,8)
116 | 
117 | # These test a fixed bug in my regex-tdfa that did not keep the expanded
118 | # form properly grouped, so right association did the wrong thing with
119 | # these ambiguous patterns (crafted just to test my code when I became
120 | # suspicious of my implementation).  The first subexpression should use
121 | # "ab" then "a" then "bcd".
122 | 
123 | # OS X / FreeBSD / NetBSD badly fail many of these, with impossible
124 | # results like (0,6)(4,5)(6,6).
125 | 
126 | :HA#260:E	(a|ab|c|bcd){0,}(d*)	ababcd	(0,6)(3,6)(6,6)
127 | :HA#261:E	(a|ab|c|bcd){1,}(d*)	ababcd	(0,6)(3,6)(6,6)
128 | :HA#262:E	(a|ab|c|bcd){2,}(d*)	ababcd	(0,6)(3,6)(6,6)
129 | :HA#263:E	(a|ab|c|bcd){3,}(d*)	ababcd	(0,6)(3,6)(6,6)
130 | :HA#264:E	(a|ab|c|bcd){4,}(d*)	ababcd	NOMATCH
131 | :HA#265:E	(a|ab|c|bcd){0,10}(d*)	ababcd	(0,6)(3,6)(6,6)
132 | :HA#266:E	(a|ab|c|bcd){1,10}(d*)	ababcd	(0,6)(3,6)(6,6)
133 | :HA#267:E	(a|ab|c|bcd){2,10}(d*)	ababcd	(0,6)(3,6)(6,6)
134 | :HA#268:E	(a|ab|c|bcd){3,10}(d*)	ababcd	(0,6)(3,6)(6,6)
135 | :HA#269:E	(a|ab|c|bcd){4,10}(d*)	ababcd	NOMATCH
136 | :HA#270:E	(a|ab|c|bcd)*(d*)	ababcd	(0,6)(3,6)(6,6)
137 | :HA#271:E	(a|ab|c|bcd)+(d*)	ababcd	(0,6)(3,6)(6,6)
138 | 
139 | # The above worked on Linux/GLIBC but the following often fail.
140 | # They also trip up OS X / FreeBSD / NetBSD:
141 | 
142 | #:HA#280:E	(ab|a|c|bcd){0,}(d*)	ababcd	(0,6)(3,6)(6,6)
143 | :HA#280:E	(ab|a|c|bcd){0,}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
144 | #:HA#281:E	(ab|a|c|bcd){1,}(d*)	ababcd	(0,6)(3,6)(6,6)
145 | :HA#281:E	(ab|a|c|bcd){1,}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
146 | #:HA#282:E	(ab|a|c|bcd){2,}(d*)	ababcd	(0,6)(3,6)(6,6)
147 | :HA#282:E	(ab|a|c|bcd){2,}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
148 | #:HA#283:E	(ab|a|c|bcd){3,}(d*)	ababcd	(0,6)(3,6)(6,6)
149 | :HA#283:E	(ab|a|c|bcd){3,}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
150 | :HA#284:E	(ab|a|c|bcd){4,}(d*)	ababcd	NOMATCH
151 | #:HA#285:E	(ab|a|c|bcd){0,10}(d*)	ababcd	(0,6)(3,6)(6,6)
152 | :HA#285:E	(ab|a|c|bcd){0,10}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
153 | #:HA#286:E	(ab|a|c|bcd){1,10}(d*)	ababcd	(0,6)(3,6)(6,6)
154 | :HA#286:E	(ab|a|c|bcd){1,10}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
155 | #:HA#287:E	(ab|a|c|bcd){2,10}(d*)	ababcd	(0,6)(3,6)(6,6)
156 | :HA#287:E	(ab|a|c|bcd){2,10}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
157 | #:HA#288:E	(ab|a|c|bcd){3,10}(d*)	ababcd	(0,6)(3,6)(6,6)
158 | :HA#288:E	(ab|a|c|bcd){3,10}(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
159 | :HA#289:E	(ab|a|c|bcd){4,10}(d*)	ababcd	NOMATCH
160 | #:HA#290:E	(ab|a|c|bcd)*(d*)	ababcd	(0,6)(3,6)(6,6)
161 | :HA#290:E	(ab|a|c|bcd)*(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
162 | #:HA#291:E	(ab|a|c|bcd)+(d*)	ababcd	(0,6)(3,6)(6,6)
163 | :HA#291:E	(ab|a|c|bcd)+(d*)	ababcd	(0,6)(4,5)(5,6)	RE2/Go
164 | 


--------------------------------------------------------------------------------