├── .gitignore ├── AUTHORS ├── CONTRIBUTORS ├── LICENSE ├── README.md ├── all_test.go ├── backtrack.go ├── example_test.go ├── exec.go ├── exec2_test.go ├── exec_test.go ├── export.sh ├── find_test.go ├── internal ├── dfa │ ├── dfa.go │ ├── dfa_exhaustive_test.go │ ├── dfa_test.go │ ├── exec_test.go │ ├── runerange.go │ ├── search.go │ ├── state.go │ └── workq.go └── input │ └── input.go ├── onepass.go ├── onepass_test.go ├── regexp.go ├── syntax ├── compile.go ├── doc.go ├── make_perl_groups.pl ├── parse.go ├── parse_test.go ├── perl_groups.go ├── prog.go ├── prog_test.go ├── regexp.go ├── simplify.go └── simplify_test.go └── testdata ├── README ├── basic.dat ├── nullsubexpr.dat ├── re2-exhaustive.txt.bz2 ├── re2-search.txt ├── repetition.dat └── testregex.c /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.[56789ao] 3 | *.a[56789o] 4 | *.so 5 | *.pyc 6 | ._* 7 | .nfs.* 8 | [56789a].out 9 | *~ 10 | *.orig 11 | *.rej 12 | *.exe 13 | .*.swp 14 | core 15 | *.cgo*.go 16 | *.cgo*.c 17 | _cgo_* 18 | _obj 19 | _test 20 | _testmain.go 21 | build.out 22 | test.out 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. 
nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go regexp + RE2 DFA port 2 | 3 | `import "matloob.io/regexp"` 4 | 5 | See [golang.org/cl/12081](https://golang.org/cl/12081) 6 | 7 | * The regexp tests pass. Though there may still be uncaught bugs. 8 | Let me know if you find any of them! No guarantees! 9 | * regexp/internal/dfa tests are currently failing. I need to fix 10 | some things there. 11 | * I've got a small change to the DFA that uses package unsafe 12 | and makes matches 2x faster. I'll try to get it up soon. 13 | * A bunch of cleanup needs to be done all over this package. 14 | -------------------------------------------------------------------------------- /all_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package regexp 6 | 7 | import ( 8 | "matloob.io/regexp/syntax" 9 | "reflect" 10 | "strings" 11 | "testing" 12 | ) 13 | 14 | var good_re = []string{ 15 | ``, 16 | `.`, 17 | `^.$`, 18 | `a`, 19 | `a*`, 20 | `a+`, 21 | `a?`, 22 | `a|b`, 23 | `a*|b*`, 24 | `(a*|b)(c*|d)`, 25 | `[a-z]`, 26 | `[a-abc-c\-\]\[]`, 27 | `[a-z]+`, 28 | `[abc]`, 29 | `[^1234]`, 30 | `[^\n]`, 31 | `\!\\`, 32 | } 33 | 34 | type stringError struct { 35 | re string 36 | err string 37 | } 38 | 39 | var bad_re = []stringError{ 40 | {`*`, "missing argument to repetition operator: `*`"}, 41 | {`+`, "missing argument to repetition operator: `+`"}, 42 | {`?`, "missing argument to repetition operator: `?`"}, 43 | {`(abc`, "missing closing ): `(abc`"}, 44 | {`abc)`, "unexpected ): `abc)`"}, 45 | {`x[a-z`, "missing closing ]: `[a-z`"}, 46 | {`[z-a]`, "invalid character class range: `z-a`"}, 47 | {`abc\`, "trailing backslash at end of expression"}, 48 | {`a**`, "invalid nested repetition operator: `**`"}, 49 | {`a*+`, "invalid nested repetition operator: `*+`"}, 50 | {`\x`, "invalid escape sequence: `\\x`"}, 51 | } 52 | 53 | func compileTest(t *testing.T, expr string, error string) *Regexp { 54 | re, err := Compile(expr) 55 | if error == "" && err != nil { 56 | t.Error("compiling `", expr, "`; unexpected error: ", err.Error()) 57 | } 58 | if error != "" && err == nil { 59 | t.Error("compiling `", expr, "`; missing error") 60 | } else if error != "" && !strings.Contains(err.Error(), error) { 61 | t.Error("compiling `", expr, "`; wrong error: ", err.Error(), "; want ", error) 62 | } 63 | return re 64 | } 65 | 66 | func TestGoodCompile(t *testing.T) { 67 | for i := 0; i < len(good_re); i++ { 68 | compileTest(t, good_re[i], "") 69 | } 70 | } 71 | 72 | func TestBadCompile(t *testing.T) { 73 | for i := 0; i < len(bad_re); i++ { 74 | compileTest(t, bad_re[i].re, bad_re[i].err) 75 | } 76 | } 77 | 
78 | func matchTest(t *testing.T, test *FindTest) { 79 | re := compileTest(t, test.pat, "") 80 | if re == nil { 81 | return 82 | } 83 | m := re.MatchString(test.text) 84 | if m != (len(test.matches) > 0) { 85 | t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0) 86 | } 87 | // now try bytes 88 | m = re.Match([]byte(test.text)) 89 | if m != (len(test.matches) > 0) { 90 | t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) 91 | } 92 | } 93 | 94 | func TestMatch(t *testing.T) { 95 | for _, test := range findTests { 96 | matchTest(t, &test) 97 | } 98 | } 99 | 100 | func matchFunctionTest(t *testing.T, test *FindTest) { 101 | m, err := MatchString(test.pat, test.text) 102 | if err != nil { // MatchString reported an error, so there is no match result to compare 103 | return 104 | } 105 | if m != (len(test.matches) > 0) { 106 | t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) 107 | } 108 | } 109 | 110 | func TestMatchFunction(t *testing.T) { 111 | for _, test := range findTests { 112 | matchFunctionTest(t, &test) 113 | } 114 | } 115 | 116 | func copyMatchTest(t *testing.T, test *FindTest) { 117 | re := compileTest(t, test.pat, "") 118 | if re == nil { 119 | return 120 | } 121 | m1 := re.MatchString(test.text) 122 | m2 := re.Copy().MatchString(test.text) 123 | if m1 != m2 { 124 | t.Errorf("Copied Regexp match failure on %s: original gave %t; copy gave %t; should be %t", 125 | test, m1, m2, len(test.matches) > 0) 126 | } 127 | } 128 | 129 | func TestCopyMatch(t *testing.T) { 130 | for _, test := range findTests { 131 | copyMatchTest(t, &test) 132 | } 133 | } 134 | 135 | type ReplaceTest struct { 136 | pattern, replacement, input, output string 137 | } 138 | 139 | var replaceTests = []ReplaceTest{ 140 | // Test empty input and/or replacement, with pattern that matches the empty string. 
141 | {"", "", "", ""}, 142 | {"", "x", "", "x"}, 143 | {"", "", "abc", "abc"}, 144 | {"", "x", "abc", "xaxbxcx"}, 145 | 146 | // Test empty input and/or replacement, with pattern that does not match the empty string. 147 | {"b", "", "", ""}, 148 | {"b", "x", "", ""}, 149 | {"b", "", "abc", "ac"}, 150 | {"b", "x", "abc", "axc"}, 151 | {"y", "", "", ""}, 152 | {"y", "x", "", ""}, 153 | {"y", "", "abc", "abc"}, 154 | {"y", "x", "abc", "abc"}, 155 | 156 | // Multibyte characters -- verify that we don't try to match in the middle 157 | // of a character. 158 | {"[a-c]*", "x", "\u65e5", "x\u65e5x"}, 159 | {"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"}, 160 | 161 | // Start and end of a string. 162 | {"^[a-c]*", "x", "abcdabc", "xdabc"}, 163 | {"[a-c]*$", "x", "abcdabc", "abcdx"}, 164 | {"^[a-c]*$", "x", "abcdabc", "abcdabc"}, 165 | {"^[a-c]*", "x", "abc", "x"}, 166 | {"[a-c]*$", "x", "abc", "x"}, 167 | {"^[a-c]*$", "x", "abc", "x"}, 168 | {"^[a-c]*", "x", "dabce", "xdabce"}, 169 | {"[a-c]*$", "x", "dabce", "dabcex"}, 170 | {"^[a-c]*$", "x", "dabce", "dabce"}, 171 | {"^[a-c]*", "x", "", "x"}, 172 | {"[a-c]*$", "x", "", "x"}, 173 | {"^[a-c]*$", "x", "", "x"}, 174 | 175 | {"^[a-c]+", "x", "abcdabc", "xdabc"}, 176 | {"[a-c]+$", "x", "abcdabc", "abcdx"}, 177 | {"^[a-c]+$", "x", "abcdabc", "abcdabc"}, 178 | {"^[a-c]+", "x", "abc", "x"}, 179 | {"[a-c]+$", "x", "abc", "x"}, 180 | {"^[a-c]+$", "x", "abc", "x"}, 181 | {"^[a-c]+", "x", "dabce", "dabce"}, 182 | {"[a-c]+$", "x", "dabce", "dabce"}, 183 | {"^[a-c]+$", "x", "dabce", "dabce"}, 184 | {"^[a-c]+", "x", "", ""}, 185 | {"[a-c]+$", "x", "", ""}, 186 | {"^[a-c]+$", "x", "", ""}, 187 | 188 | // Other cases. 
189 | {"abc", "def", "abcdefg", "defdefg"}, 190 | {"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"}, 191 | {"abc", "", "abcdabc", "d"}, 192 | {"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"}, 193 | {"abc", "d", "", ""}, 194 | {"abc", "d", "abc", "d"}, 195 | {".+", "x", "abc", "x"}, 196 | {"[a-c]*", "x", "def", "xdxexfx"}, 197 | {"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"}, 198 | {"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"}, 199 | 200 | // Substitutions 201 | {"a+", "($0)", "banana", "b(a)n(a)n(a)"}, 202 | {"a+", "(${0})", "banana", "b(a)n(a)n(a)"}, 203 | {"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"}, 204 | {"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"}, 205 | {"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, world"}, 206 | {"hello, (.+)", "goodbye, $1x", "hello, world", "goodbye, "}, 207 | {"hello, (.+)", "goodbye, ${1}x", "hello, world", "goodbye, worldx"}, 208 | {"hello, (.+)", "<$0><$1><$2><$3>", "hello, world", "<hello, world><world><><>"}, 209 | {"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, world!"}, 210 | {"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, world"}, 211 | {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "hihihi"}, 212 | {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "byebyebye"}, 213 | {"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", ""}, 214 | {"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "hiyz"}, 215 | {"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $x"}, 216 | {"a+", "${oops", "aaa", "${oops"}, 217 | {"a+", "$$", "aaa", "$"}, 218 | {"a+", "$", "aaa", "$"}, 219 | 220 | // Substitution when subexpression isn't found 221 | {"(x)?", "$1", "123", "123"}, 222 | {"abc", "$1", "123", "123"}, 223 | 224 | // Substitutions involving a (x){0} 225 | {"(a)(b){0}(c)", ".$1|$3.", "xacxacx", "x.a|c.x.a|c.x"}, 226 | {"(a)(((b))){0}c", ".$1.", "xacxacx", "x.a.x.a.x"}, 227 | {"((a(b){0}){3}){5}(h)", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"}, 228 | {"((a(b){0}){3}){5}h", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"}, 229 | } 230 | 231 | var replaceLiteralTests = 
[]ReplaceTest{ 232 | // Substitutions 233 | {"a+", "($0)", "banana", "b($0)n($0)n($0)"}, 234 | {"a+", "(${0})", "banana", "b(${0})n(${0})n(${0})"}, 235 | {"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"}, 236 | {"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"}, 237 | {"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, ${1}"}, 238 | {"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, $noun!"}, 239 | {"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, ${noun}"}, 240 | {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "$x$x$x"}, 241 | {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "$x$x$x"}, 242 | {"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", "$xyz"}, 243 | {"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "${x}yz"}, 244 | {"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $$x"}, 245 | {"a+", "${oops", "aaa", "${oops"}, 246 | {"a+", "$$", "aaa", "$$"}, 247 | {"a+", "$", "aaa", "$"}, 248 | } 249 | 250 | type ReplaceFuncTest struct { 251 | pattern string 252 | replacement func(string) string 253 | input, output string 254 | } 255 | 256 | var replaceFuncTests = []ReplaceFuncTest{ 257 | {"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"}, 258 | {"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"}, 259 | {"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"}, 260 | } 261 | 262 | func TestReplaceAll(t *testing.T) { 263 | for _, tc := range replaceTests { 264 | re, err := Compile(tc.pattern) 265 | if err != nil { 266 | t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) 267 | continue 268 | } 269 | actual := re.ReplaceAllString(tc.input, tc.replacement) 270 | if actual != tc.output { 271 | t.Errorf("%q.ReplaceAllString(%q,%q) = %q; want %q", 272 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 273 | } 274 | // now try bytes 275 | actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement))) 276 | if actual != tc.output { 277 | 
t.Errorf("%q.ReplaceAll(%q,%q) = %q; want %q", 278 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 279 | } 280 | } 281 | } 282 | 283 | func TestReplaceAllLiteral(t *testing.T) { 284 | // Run ReplaceAll tests that do not have $ expansions. 285 | for _, tc := range replaceTests { 286 | if strings.Contains(tc.replacement, "$") { 287 | continue 288 | } 289 | re, err := Compile(tc.pattern) 290 | if err != nil { 291 | t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) 292 | continue 293 | } 294 | actual := re.ReplaceAllLiteralString(tc.input, tc.replacement) 295 | if actual != tc.output { 296 | t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q", 297 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 298 | } 299 | // now try bytes 300 | actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement))) 301 | if actual != tc.output { 302 | t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q", 303 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 304 | } 305 | } 306 | 307 | // Run literal-specific tests. 
308 | for _, tc := range replaceLiteralTests { 309 | re, err := Compile(tc.pattern) 310 | if err != nil { 311 | t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) 312 | continue 313 | } 314 | actual := re.ReplaceAllLiteralString(tc.input, tc.replacement) 315 | if actual != tc.output { 316 | t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q", 317 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 318 | } 319 | // now try bytes 320 | actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement))) 321 | if actual != tc.output { 322 | t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q", 323 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 324 | } 325 | } 326 | } 327 | 328 | func TestReplaceAllFunc(t *testing.T) { 329 | for _, tc := range replaceFuncTests { 330 | re, err := Compile(tc.pattern) 331 | if err != nil { 332 | t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) 333 | continue 334 | } 335 | actual := re.ReplaceAllStringFunc(tc.input, tc.replacement) 336 | if actual != tc.output { 337 | t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q", 338 | tc.pattern, tc.input, actual, tc.output) 339 | } 340 | // now try bytes 341 | actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) })) 342 | if actual != tc.output { 343 | t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q", 344 | tc.pattern, tc.input, actual, tc.output) 345 | } 346 | } 347 | } 348 | 349 | type MetaTest struct { 350 | pattern, output, literal string 351 | isLiteral bool 352 | } 353 | 354 | var metaTests = []MetaTest{ 355 | {``, ``, ``, true}, 356 | {`foo`, `foo`, `foo`, true}, 357 | {`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator 358 | {`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators 359 | {`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false}, 360 | } 361 | 362 | var literalPrefixTests = 
[]MetaTest{ 363 | // See golang.org/issue/11175. 364 | // output is unused. 365 | {`^0^0$`, ``, `0`, false}, 366 | {`^0^`, ``, ``, false}, 367 | {`^0$`, ``, `0`, true}, 368 | {`$0^`, ``, ``, false}, 369 | {`$0$`, ``, ``, false}, 370 | {`^^0$$`, ``, ``, false}, 371 | {`^$^$`, ``, ``, false}, 372 | {`$$0^^`, ``, ``, false}, 373 | } 374 | 375 | func TestQuoteMeta(t *testing.T) { 376 | for _, tc := range metaTests { 377 | // Verify that QuoteMeta returns the expected string. 378 | quoted := QuoteMeta(tc.pattern) 379 | if quoted != tc.output { 380 | t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`", 381 | tc.pattern, quoted, tc.output) 382 | continue 383 | } 384 | 385 | // Verify that the quoted string is in fact treated as expected 386 | // by Compile -- i.e. that it matches the original, unquoted string. 387 | if tc.pattern != "" { 388 | re, err := Compile(quoted) 389 | if err != nil { 390 | t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err) 391 | continue 392 | } 393 | src := "abc" + tc.pattern + "def" 394 | repl := "xyz" 395 | replaced := re.ReplaceAllString(src, repl) 396 | expected := "abcxyzdef" 397 | if replaced != expected { 398 | t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`", 399 | tc.pattern, src, repl, replaced, expected) 400 | } 401 | } 402 | } 403 | } 404 | 405 | func TestLiteralPrefix(t *testing.T) { 406 | for _, tc := range append(metaTests, literalPrefixTests...) { 407 | // Literal method needs to scan the pattern. 
408 | re := MustCompile(tc.pattern) 409 | str, complete := re.LiteralPrefix() 410 | if complete != tc.isLiteral { 411 | t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral) 412 | } 413 | if str != tc.literal { 414 | t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal) 415 | } 416 | } 417 | } 418 | 419 | type subexpCase struct { 420 | input string 421 | num int 422 | names []string 423 | } 424 | 425 | var subexpCases = []subexpCase{ 426 | {``, 0, nil}, 427 | {`.*`, 0, nil}, 428 | {`abba`, 0, nil}, 429 | {`ab(b)a`, 1, []string{"", ""}}, 430 | {`ab(.*)a`, 1, []string{"", ""}}, 431 | {`(.*)ab(.*)a`, 2, []string{"", "", ""}}, 432 | {`(.*)(ab)(.*)a`, 3, []string{"", "", "", ""}}, 433 | {`(.*)((a)b)(.*)a`, 4, []string{"", "", "", "", ""}}, 434 | {`(.*)(\(ab)(.*)a`, 3, []string{"", "", "", ""}}, 435 | {`(.*)(\(a\)b)(.*)a`, 3, []string{"", "", "", ""}}, 436 | {`(?P<foo>.*)(?P<bar>(a)b)(?P<foo>.*)a`, 4, []string{"", "foo", "bar", "", "foo"}}, 437 | } 438 | 439 | func TestSubexp(t *testing.T) { 440 | for _, c := range subexpCases { 441 | re := MustCompile(c.input) 442 | n := re.NumSubexp() 443 | if n != c.num { 444 | t.Errorf("%q: NumSubexp = %d, want %d", c.input, n, c.num) 445 | continue 446 | } 447 | names := re.SubexpNames() 448 | if len(names) != 1+n { // the name list is expected to hold one entry more than NumSubexp 449 | t.Errorf("%q: len(SubexpNames) = %d, want %d", c.input, len(names), 1+n) 450 | continue 451 | } 452 | if c.names != nil { 453 | for i := 0; i < 1+n; i++ { 454 | if names[i] != c.names[i] { 455 | t.Errorf("%q: SubexpNames[%d] = %q, want %q", c.input, i, names[i], c.names[i]) 456 | } 457 | } 458 | } 459 | } 460 | } 461 | 462 | var splitTests = []struct { 463 | s string 464 | r string 465 | n int 466 | out []string 467 | }{ 468 | {"foo:and:bar", ":", -1, []string{"foo", "and", "bar"}}, 469 | {"foo:and:bar", ":", 1, []string{"foo:and:bar"}}, 470 | {"foo:and:bar", ":", 2, []string{"foo", "and:bar"}}, 471 | {"foo:and:bar", "foo", -1, []string{"", ":and:bar"}}, 472 | {"foo:and:bar", 
"bar", -1, []string{"foo:and:", ""}}, 473 | {"foo:and:bar", "baz", -1, []string{"foo:and:bar"}}, 474 | {"baabaab", "a", -1, []string{"b", "", "b", "", "b"}}, 475 | {"baabaab", "a*", -1, []string{"b", "b", "b"}}, 476 | {"baabaab", "ba*", -1, []string{"", "", "", ""}}, 477 | {"foobar", "f*b*", -1, []string{"", "o", "o", "a", "r"}}, 478 | {"foobar", "f+.*b+", -1, []string{"", "ar"}}, 479 | {"foobooboar", "o{2}", -1, []string{"f", "b", "boar"}}, 480 | {"a,b,c,d,e,f", ",", 3, []string{"a", "b", "c,d,e,f"}}, 481 | {"a,b,c,d,e,f", ",", 0, nil}, 482 | {",", ",", -1, []string{"", ""}}, 483 | {",,,", ",", -1, []string{"", "", "", ""}}, 484 | {"", ",", -1, []string{""}}, 485 | {"", ".*", -1, []string{""}}, 486 | {"", ".+", -1, []string{""}}, 487 | {"", "", -1, []string{}}, 488 | {"foobar", "", -1, []string{"f", "o", "o", "b", "a", "r"}}, 489 | {"abaabaccadaaae", "a*", 5, []string{"", "b", "b", "c", "cadaaae"}}, 490 | {":x:y:z:", ":", -1, []string{"", "x", "y", "z", ""}}, 491 | } 492 | 493 | func TestSplit(t *testing.T) { 494 | for i, test := range splitTests { 495 | re, err := Compile(test.r) 496 | if err != nil { 497 | t.Errorf("#%d: %q: compile error: %s", i, test.r, err.Error()) 498 | continue 499 | } 500 | 501 | split := re.Split(test.s, test.n) 502 | if !reflect.DeepEqual(split, test.out) { 503 | t.Errorf("#%d: %q: got %q; want %q", i, test.r, split, test.out) 504 | } 505 | 506 | if QuoteMeta(test.r) == test.r { 507 | strsplit := strings.SplitN(test.s, test.r, test.n) 508 | if !reflect.DeepEqual(split, strsplit) { 509 | t.Errorf("#%d: Split(%q, %q, %d): regexp vs strings mismatch\nregexp=%q\nstrings=%q", i, test.s, test.r, test.n, split, strsplit) 510 | } 511 | } 512 | } 513 | } 514 | 515 | // Check that one-pass cutoff does trigger. 
516 | func TestOnePassCutoff(t *testing.T) { 517 | re, err := syntax.Parse(`^x{1,1000}y{1,1000}$`, syntax.Perl) 518 | if err != nil { 519 | t.Fatalf("parse: %v", err) 520 | } 521 | p, err := syntax.Compile(re.Simplify()) 522 | if err != nil { 523 | t.Fatalf("compile: %v", err) 524 | } 525 | if compileOnePass(p) != notOnePass { 526 | t.Fatalf("makeOnePass succeeded; wanted notOnePass") 527 | } 528 | } 529 | 530 | // Check that the same machine can be used with the standard matcher 531 | // and then the backtracker when there are no captures. 532 | func TestSwitchBacktrack(t *testing.T) { 533 | re := MustCompile(`a|b`) 534 | long := make([]byte, maxBacktrackVector+1) 535 | 536 | // The following sequence of Match calls used to panic. See issue #10319. 537 | re.Match(long) // triggers standard matcher 538 | re.Match(long[:1]) // triggers backtracker 539 | } 540 | 541 | func BenchmarkLiteral(b *testing.B) { 542 | x := strings.Repeat("x", 50) + "y" 543 | b.StopTimer() 544 | re := MustCompile("y") 545 | b.StartTimer() 546 | for i := 0; i < b.N; i++ { 547 | if !re.MatchString(x) { 548 | b.Fatalf("no match!") 549 | } 550 | } 551 | } 552 | 553 | func BenchmarkNotLiteral(b *testing.B) { 554 | x := strings.Repeat("x", 50) + "y" 555 | b.StopTimer() 556 | re := MustCompile(".y") 557 | b.StartTimer() 558 | for i := 0; i < b.N; i++ { 559 | if !re.MatchString(x) { 560 | b.Fatalf("no match!") 561 | } 562 | } 563 | } 564 | 565 | func BenchmarkMatchClass(b *testing.B) { 566 | b.StopTimer() 567 | x := strings.Repeat("xxxx", 20) + "w" 568 | re := MustCompile("[abcdw]") 569 | b.StartTimer() 570 | for i := 0; i < b.N; i++ { 571 | if !re.MatchString(x) { 572 | b.Fatalf("no match!") 573 | } 574 | } 575 | } 576 | 577 | func BenchmarkMatchClass_InRange(b *testing.B) { 578 | b.StopTimer() 579 | // 'b' is between 'a' and 'c', so the charclass 580 | // range checking is no help here. 
581 | x := strings.Repeat("bbbb", 20) + "c" 582 | re := MustCompile("[ac]") 583 | b.StartTimer() 584 | for i := 0; i < b.N; i++ { 585 | if !re.MatchString(x) { 586 | b.Fatalf("no match!") 587 | } 588 | } 589 | } 590 | 591 | func BenchmarkReplaceAll(b *testing.B) { 592 | x := "abcdefghijklmnopqrstuvwxyz" 593 | b.StopTimer() 594 | re := MustCompile("[cjrw]") 595 | b.StartTimer() 596 | for i := 0; i < b.N; i++ { 597 | re.ReplaceAllString(x, "") 598 | } 599 | } 600 | 601 | func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) { 602 | b.StopTimer() 603 | x := []byte("abcdefghijklmnopqrstuvwxyz") 604 | re := MustCompile("^zbc(d|e)") 605 | b.StartTimer() 606 | for i := 0; i < b.N; i++ { 607 | re.Match(x) 608 | } 609 | } 610 | 611 | func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) { 612 | b.StopTimer() 613 | x := []byte("abcdefghijklmnopqrstuvwxyz") 614 | for i := 0; i < 15; i++ { 615 | x = append(x, x...) 616 | } 617 | re := MustCompile("^zbc(d|e)") 618 | b.StartTimer() 619 | for i := 0; i < b.N; i++ { 620 | re.Match(x) 621 | } 622 | } 623 | 624 | func BenchmarkAnchoredShortMatch(b *testing.B) { 625 | b.StopTimer() 626 | x := []byte("abcdefghijklmnopqrstuvwxyz") 627 | re := MustCompile("^.bc(d|e)") 628 | b.StartTimer() 629 | for i := 0; i < b.N; i++ { 630 | re.Match(x) 631 | } 632 | } 633 | 634 | func BenchmarkAnchoredLongMatch(b *testing.B) { 635 | b.StopTimer() 636 | x := []byte("abcdefghijklmnopqrstuvwxyz") 637 | for i := 0; i < 15; i++ { 638 | x = append(x, x...) 
639 | } 640 | re := MustCompile("^.bc(d|e)") 641 | b.StartTimer() 642 | for i := 0; i < b.N; i++ { 643 | re.Match(x) 644 | } 645 | } 646 | 647 | func BenchmarkOnePassShortA(b *testing.B) { 648 | b.StopTimer() 649 | x := []byte("abcddddddeeeededd") 650 | re := MustCompile("^.bc(d|e)*$") 651 | b.StartTimer() 652 | for i := 0; i < b.N; i++ { 653 | re.Match(x) 654 | } 655 | } 656 | 657 | func BenchmarkNotOnePassShortA(b *testing.B) { 658 | b.StopTimer() 659 | x := []byte("abcddddddeeeededd") 660 | re := MustCompile(".bc(d|e)*$") 661 | b.StartTimer() 662 | for i := 0; i < b.N; i++ { 663 | re.Match(x) 664 | } 665 | } 666 | 667 | func BenchmarkOnePassShortB(b *testing.B) { 668 | b.StopTimer() 669 | x := []byte("abcddddddeeeededd") 670 | re := MustCompile("^.bc(?:d|e)*$") 671 | b.StartTimer() 672 | for i := 0; i < b.N; i++ { 673 | re.Match(x) 674 | } 675 | } 676 | 677 | func BenchmarkNotOnePassShortB(b *testing.B) { 678 | b.StopTimer() 679 | x := []byte("abcddddddeeeededd") 680 | re := MustCompile(".bc(?:d|e)*$") 681 | b.StartTimer() 682 | for i := 0; i < b.N; i++ { 683 | re.Match(x) 684 | } 685 | } 686 | 687 | func BenchmarkOnePassLongPrefix(b *testing.B) { 688 | b.StopTimer() 689 | x := []byte("abcdefghijklmnopqrstuvwxyz") 690 | re := MustCompile("^abcdefghijklmnopqrstuvwxyz.*$") 691 | b.StartTimer() 692 | for i := 0; i < b.N; i++ { 693 | re.Match(x) 694 | } 695 | } 696 | 697 | func BenchmarkOnePassLongNotPrefix(b *testing.B) { 698 | b.StopTimer() 699 | x := []byte("abcdefghijklmnopqrstuvwxyz") 700 | re := MustCompile("^.bcdefghijklmnopqrstuvwxyz.*$") 701 | b.StartTimer() 702 | for i := 0; i < b.N; i++ { 703 | re.Match(x) 704 | } 705 | } 706 | 707 | func BenchmarkMatchParallelShared(b *testing.B) { 708 | x := []byte("this is a long line that contains foo bar baz") 709 | re := MustCompile("foo (ba+r)? 
baz") 710 | b.ResetTimer() 711 | b.RunParallel(func(pb *testing.PB) { 712 | for pb.Next() { 713 | re.Match(x) 714 | } 715 | }) 716 | } 717 | 718 | func BenchmarkMatchParallelCopied(b *testing.B) { 719 | x := []byte("this is a long line that contains foo bar baz") 720 | re := MustCompile("foo (ba+r)? baz") 721 | b.ResetTimer() 722 | b.RunParallel(func(pb *testing.PB) { 723 | re := re.Copy() 724 | for pb.Next() { 725 | re.Match(x) 726 | } 727 | }) 728 | } 729 | -------------------------------------------------------------------------------- /backtrack.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // backtrack is a regular expression search with submatch 6 | // tracking for small regular expressions and texts. It allocates 7 | // a bit vector with (length of input) * (length of prog) bits, 8 | // to make sure it never explores the same (character position, instruction) 9 | // state multiple times. This limits the search to run in time linear in 10 | // the length of the text. 11 | // 12 | // backtrack is a fast replacement for the NFA code on small 13 | // regexps when onepass cannot be used. 14 | 15 | package regexp 16 | 17 | import ( 18 | "matloob.io/regexp/internal/input" 19 | "matloob.io/regexp/syntax" 20 | ) 21 | 22 | // A job is an entry on the backtracker's job stack. It holds 23 | // the instruction pc and the position in the input. 24 | type job struct { 25 | pc uint32 26 | arg int 27 | pos int 28 | } 29 | 30 | const ( 31 | visitedBits = 32 32 | maxBacktrackProg = 500 // len(prog.Inst) <= max 33 | maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits) 34 | ) 35 | 36 | // bitState holds state for the backtracker. 
37 | type bitState struct { 38 | prog *syntax.Prog 39 | 40 | end int 41 | cap []int 42 | jobs []job 43 | visited []uint32 44 | } 45 | 46 | var notBacktrack *bitState = nil 47 | 48 | // maxBitStateLen returns the maximum length of a string to search with 49 | // the backtracker using prog. 50 | func maxBitStateLen(prog *syntax.Prog) int { 51 | if !shouldBacktrack(prog) { 52 | return 0 53 | } 54 | return maxBacktrackVector / len(prog.Inst) 55 | } 56 | 57 | // newBitState returns a new bitState for the given prog, 58 | // or notBacktrack if the size of the prog exceeds the maximum size that 59 | // the backtracker will be run for. 60 | func newBitState(prog *syntax.Prog) *bitState { 61 | if !shouldBacktrack(prog) { 62 | return notBacktrack 63 | } 64 | return &bitState{ 65 | prog: prog, 66 | } 67 | } 68 | 69 | // shouldBacktrack reports whether the program is short 70 | // enough (at most maxBacktrackProg instructions) for the backtracker to run. 71 | func shouldBacktrack(prog *syntax.Prog) bool { 72 | return len(prog.Inst) <= maxBacktrackProg 73 | } 74 | 75 | // reset resets the state of the backtracker. 76 | // end is the end position in the input. 77 | // ncap is the number of captures. 78 | func (b *bitState) reset(end int, ncap int) { 79 | b.end = end 80 | 81 | if cap(b.jobs) == 0 { 82 | b.jobs = make([]job, 0, 256) 83 | } else { 84 | b.jobs = b.jobs[:0] 85 | } 86 | 87 | visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits 88 | if cap(b.visited) < visitedSize { 89 | b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits) 90 | } else { 91 | b.visited = b.visited[:visitedSize] 92 | for i := range b.visited { 93 | b.visited[i] = 0 94 | } 95 | } 96 | 97 | if cap(b.cap) < ncap { 98 | b.cap = make([]int, ncap) 99 | } else { 100 | b.cap = b.cap[:ncap] 101 | } 102 | for i := range b.cap { 103 | b.cap[i] = -1 104 | } 105 | } 106 | 107 | // shouldVisit reports whether the combination of (pc, pos) has not 108 | // been visited yet. 
109 | func (b *bitState) shouldVisit(pc uint32, pos int) bool { 110 | n := uint(int(pc)*(b.end+1) + pos) 111 | if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 { 112 | return false 113 | } 114 | b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1)) 115 | return true 116 | } 117 | 118 | // push pushes (pc, pos, arg) onto the job stack if it should be 119 | // visited. 120 | func (b *bitState) push(pc uint32, pos int, arg int) { 121 | if b.prog.Inst[pc].Op == syntax.InstFail { 122 | return 123 | } 124 | 125 | // Only check shouldVisit when arg == 0. 126 | // When arg > 0, we are continuing a previous visit. 127 | if arg == 0 && !b.shouldVisit(pc, pos) { 128 | return 129 | } 130 | 131 | b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos}) 132 | } 133 | 134 | // tryBacktrack runs a backtracking search starting at pos. 135 | func (m *machine) tryBacktrack(b *bitState, i input.Input, pc uint32, pos int) bool { 136 | longest := m.re.longest 137 | m.matched = false 138 | 139 | b.push(pc, pos, 0) 140 | for len(b.jobs) > 0 { 141 | l := len(b.jobs) - 1 142 | // Pop job off the stack. 143 | pc := b.jobs[l].pc 144 | pos := b.jobs[l].pos 145 | arg := b.jobs[l].arg 146 | b.jobs = b.jobs[:l] 147 | 148 | // Optimization: rather than push and pop, 149 | // code that is going to Push and continue 150 | // the loop simply updates ip, p, and arg 151 | // and jumps to CheckAndLoop. We have to 152 | // do the ShouldVisit check that Push 153 | // would have, but we avoid the stack 154 | // manipulation. 
155 | goto Skip 156 | CheckAndLoop: 157 | if !b.shouldVisit(pc, pos) { 158 | continue 159 | } 160 | Skip: 161 | 162 | inst := b.prog.Inst[pc] 163 | 164 | switch inst.Op { 165 | default: 166 | panic("bad inst") 167 | case syntax.InstFail: 168 | panic("unexpected InstFail") 169 | case syntax.InstAlt: 170 | // Cannot just 171 | // b.push(inst.Out, pos, 0) 172 | // b.push(inst.Arg, pos, 0) 173 | // If during the processing of inst.Out, we encounter 174 | // inst.Arg via another path, we want to process it then. 175 | // Pushing it here will inhibit that. Instead, re-push 176 | // inst with arg==1 as a reminder to push inst.Arg out 177 | // later. 178 | switch arg { 179 | case 0: 180 | b.push(pc, pos, 1) 181 | pc = inst.Out 182 | goto CheckAndLoop 183 | case 1: 184 | // Finished inst.Out; try inst.Arg. 185 | arg = 0 186 | pc = inst.Arg 187 | goto CheckAndLoop 188 | } 189 | panic("bad arg in InstAlt") 190 | 191 | case syntax.InstAltMatch: 192 | // One opcode consumes runes; the other leads to match. 193 | switch b.prog.Inst[inst.Out].Op { 194 | case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 195 | // inst.Arg is the match. 
196 | b.push(inst.Arg, pos, 0) 197 | pc = inst.Arg 198 | pos = b.end 199 | goto CheckAndLoop 200 | } 201 | // inst.Out is the match - non-greedy 202 | b.push(inst.Out, b.end, 0) 203 | pc = inst.Out 204 | goto CheckAndLoop 205 | 206 | case syntax.InstRune: 207 | r, width := i.Step(pos) 208 | if !inst.MatchRune(r) { 209 | continue 210 | } 211 | pos += width 212 | pc = inst.Out 213 | goto CheckAndLoop 214 | 215 | case syntax.InstRune1: 216 | r, width := i.Step(pos) 217 | if r != inst.Rune[0] { 218 | continue 219 | } 220 | pos += width 221 | pc = inst.Out 222 | goto CheckAndLoop 223 | 224 | case syntax.InstRuneAnyNotNL: 225 | r, width := i.Step(pos) 226 | if r == '\n' || r == input.EndOfText { 227 | continue 228 | } 229 | pos += width 230 | pc = inst.Out 231 | goto CheckAndLoop 232 | 233 | case syntax.InstRuneAny: 234 | r, width := i.Step(pos) 235 | if r == input.EndOfText { 236 | continue 237 | } 238 | pos += width 239 | pc = inst.Out 240 | goto CheckAndLoop 241 | 242 | case syntax.InstCapture: 243 | switch arg { 244 | case 0: 245 | if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) { 246 | // Capture pos to register, but save old value. 247 | b.push(pc, b.cap[inst.Arg], 1) // come back when we're done. 248 | b.cap[inst.Arg] = pos 249 | } 250 | pc = inst.Out 251 | goto CheckAndLoop 252 | case 1: 253 | // Finished inst.Out; restore the old value. 254 | b.cap[inst.Arg] = pos 255 | continue 256 | 257 | } 258 | panic("bad arg in InstCapture") 259 | 260 | case syntax.InstEmptyWidth: 261 | if syntax.EmptyOp(inst.Arg)&^i.Context(pos) != 0 { 262 | continue 263 | } 264 | pc = inst.Out 265 | goto CheckAndLoop 266 | 267 | case syntax.InstNop: 268 | pc = inst.Out 269 | goto CheckAndLoop 270 | 271 | case syntax.InstMatch: 272 | // We found a match. If the caller doesn't care 273 | // where the match is, no point going further. 274 | if len(b.cap) == 0 { 275 | m.matched = true 276 | return m.matched 277 | } 278 | 279 | // Record best match so far. 
280 | // Only need to check end point, because this entire 281 | // call is only considering one start position. 282 | if len(b.cap) > 1 { 283 | b.cap[1] = pos 284 | } 285 | if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) { 286 | copy(m.matchcap, b.cap) 287 | } 288 | m.matched = true 289 | 290 | // If going for first match, we're done. 291 | if !longest { 292 | return m.matched 293 | } 294 | 295 | // If we used the entire text, no longer match is possible. 296 | if pos == b.end { 297 | return m.matched 298 | } 299 | 300 | // Otherwise, continue on in hope of a longer match. 301 | continue 302 | } 303 | } 304 | 305 | return m.matched 306 | } 307 | 308 | // backtrack runs a backtracking search of prog on the input starting at pos. 309 | func (m *machine) backtrack(i input.Input, pos int, end int, ncap int) bool { 310 | if !i.CanCheckPrefix() { 311 | panic("backtrack called for a RuneReader") 312 | } 313 | 314 | startCond := m.re.cond 315 | if startCond == ^syntax.EmptyOp(0) { // impossible 316 | return false 317 | } 318 | if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 319 | // Anchored match, past beginning of text. 320 | return false 321 | } 322 | 323 | b := m.b 324 | b.reset(end, ncap) 325 | 326 | m.matchcap = m.matchcap[:ncap] 327 | for i := range m.matchcap { 328 | m.matchcap[i] = -1 329 | } 330 | 331 | // Anchored search must start at the beginning of the input 332 | if startCond&syntax.EmptyBeginText != 0 { 333 | if len(b.cap) > 0 { 334 | b.cap[0] = pos 335 | } 336 | return m.tryBacktrack(b, i, uint32(m.p.Start), pos) 337 | } 338 | 339 | // Unanchored search, starting from each possible text position. 340 | // Notice that we have to try the empty string at the end of 341 | // the text, so the loop condition is pos <= end, not pos < end. 342 | // This looks like it's quadratic in the size of the text, 343 | // but we are not clearing visited between calls to TrySearch, 344 | // so no work is duplicated and it ends up still being linear. 
345 | width := -1 346 | for ; pos <= end && width != 0; pos += width { 347 | if len(m.re.prefix) > 0 { 348 | // Match requires literal prefix; fast search for it. 349 | advance := i.Index(m.re, pos) 350 | if advance < 0 { 351 | return false 352 | } 353 | pos += advance 354 | } 355 | 356 | if len(b.cap) > 0 { 357 | b.cap[0] = pos 358 | } 359 | if m.tryBacktrack(b, i, uint32(m.p.Start), pos) { 360 | // Match must be leftmost; done. 361 | return true 362 | } 363 | _, width = i.Step(pos) 364 | } 365 | return false 366 | } 367 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package regexp_test 6 | 7 | import ( 8 | "fmt" 9 | "matloob.io/regexp" 10 | ) 11 | 12 | func Example() { 13 | // Compile the expression once, usually at init time. 14 | // Use raw strings to avoid having to quote the backslashes. 
15 | var validID = regexp.MustCompile(`^[a-z]+\[[0-9]+\]$`) 16 | 17 | fmt.Println(validID.MatchString("adam[23]")) 18 | fmt.Println(validID.MatchString("eve[7]")) 19 | fmt.Println(validID.MatchString("Job[48]")) 20 | fmt.Println(validID.MatchString("snakey")) 21 | // Output: 22 | // true 23 | // true 24 | // false 25 | // false 26 | } 27 | 28 | func ExampleMatchString() { 29 | matched, err := regexp.MatchString("foo.*", "seafood") 30 | fmt.Println(matched, err) 31 | matched, err = regexp.MatchString("bar.*", "seafood") 32 | fmt.Println(matched, err) 33 | matched, err = regexp.MatchString("a(b", "seafood") 34 | fmt.Println(matched, err) 35 | // Output: 36 | // true 37 | // false 38 | // false error parsing regexp: missing closing ): `a(b` 39 | } 40 | 41 | func ExampleRegexp_FindString() { 42 | re := regexp.MustCompile("fo.?") 43 | fmt.Printf("%q\n", re.FindString("seafood")) 44 | fmt.Printf("%q\n", re.FindString("meat")) 45 | // Output: 46 | // "foo" 47 | // "" 48 | } 49 | 50 | func ExampleRegexp_FindStringIndex() { 51 | re := regexp.MustCompile("ab?") 52 | fmt.Println(re.FindStringIndex("tablett")) 53 | fmt.Println(re.FindStringIndex("foo") == nil) 54 | // Output: 55 | // [1 3] 56 | // true 57 | } 58 | 59 | func ExampleRegexp_FindStringSubmatch() { 60 | re := regexp.MustCompile("a(x*)b(y|z)c") 61 | fmt.Printf("%q\n", re.FindStringSubmatch("-axxxbyc-")) 62 | fmt.Printf("%q\n", re.FindStringSubmatch("-abzc-")) 63 | // Output: 64 | // ["axxxbyc" "xxx" "y"] 65 | // ["abzc" "" "z"] 66 | } 67 | 68 | func ExampleRegexp_FindAllString() { 69 | re := regexp.MustCompile("a.") 70 | fmt.Println(re.FindAllString("paranormal", -1)) 71 | fmt.Println(re.FindAllString("paranormal", 2)) 72 | fmt.Println(re.FindAllString("graal", -1)) 73 | fmt.Println(re.FindAllString("none", -1)) 74 | // Output: 75 | // [ar an al] 76 | // [ar an] 77 | // [aa] 78 | // [] 79 | } 80 | 81 | func ExampleRegexp_FindAllStringSubmatch() { 82 | re := regexp.MustCompile("a(x*)b") 83 | fmt.Printf("%q\n", 
re.FindAllStringSubmatch("-ab-", -1)) 84 | fmt.Printf("%q\n", re.FindAllStringSubmatch("-axxb-", -1)) 85 | fmt.Printf("%q\n", re.FindAllStringSubmatch("-ab-axb-", -1)) 86 | fmt.Printf("%q\n", re.FindAllStringSubmatch("-axxb-ab-", -1)) 87 | // Output: 88 | // [["ab" ""]] 89 | // [["axxb" "xx"]] 90 | // [["ab" ""] ["axb" "x"]] 91 | // [["axxb" "xx"] ["ab" ""]] 92 | } 93 | 94 | func ExampleRegexp_FindAllStringSubmatchIndex() { 95 | re := regexp.MustCompile("a(x*)b") 96 | // Indices: 97 | // 01234567 012345678 98 | // -ab-axb- -axxb-ab- 99 | fmt.Println(re.FindAllStringSubmatchIndex("-ab-", -1)) 100 | fmt.Println(re.FindAllStringSubmatchIndex("-axxb-", -1)) 101 | fmt.Println(re.FindAllStringSubmatchIndex("-ab-axb-", -1)) 102 | fmt.Println(re.FindAllStringSubmatchIndex("-axxb-ab-", -1)) 103 | fmt.Println(re.FindAllStringSubmatchIndex("-foo-", -1)) 104 | // Output: 105 | // [[1 3 2 2]] 106 | // [[1 5 2 4]] 107 | // [[1 3 2 2] [4 7 5 6]] 108 | // [[1 5 2 4] [6 8 7 7]] 109 | // [] 110 | } 111 | 112 | func ExampleRegexp_ReplaceAllLiteralString() { 113 | re := regexp.MustCompile("a(x*)b") 114 | fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "T")) 115 | fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "$1")) 116 | fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "${1}")) 117 | // Output: 118 | // -T-T- 119 | // -$1-$1- 120 | // -${1}-${1}- 121 | } 122 | 123 | func ExampleRegexp_ReplaceAllString() { 124 | re := regexp.MustCompile("a(x*)b") 125 | fmt.Println(re.ReplaceAllString("-ab-axxb-", "T")) 126 | fmt.Println(re.ReplaceAllString("-ab-axxb-", "$1")) 127 | fmt.Println(re.ReplaceAllString("-ab-axxb-", "$1W")) 128 | fmt.Println(re.ReplaceAllString("-ab-axxb-", "${1}W")) 129 | // Output: 130 | // -T-T- 131 | // --xx- 132 | // --- 133 | // -W-xxW- 134 | } 135 | 136 | func ExampleRegexp_SubexpNames() { 137 | re := regexp.MustCompile("(?P[a-zA-Z]+) (?P[a-zA-Z]+)") 138 | fmt.Println(re.MatchString("Alan Turing")) 139 | fmt.Printf("%q\n", re.SubexpNames()) 140 | reversed 
:= fmt.Sprintf("${%s} ${%s}", re.SubexpNames()[2], re.SubexpNames()[1]) 141 | fmt.Println(reversed) 142 | fmt.Println(re.ReplaceAllString("Alan Turing", reversed)) 143 | // Output: 144 | // true 145 | // ["" "first" "last"] 146 | // ${last} ${first} 147 | // Turing Alan 148 | } 149 | 150 | func ExampleRegexp_Split() { 151 | a := regexp.MustCompile("a") 152 | fmt.Println(a.Split("banana", -1)) 153 | fmt.Println(a.Split("banana", 0)) 154 | fmt.Println(a.Split("banana", 1)) 155 | fmt.Println(a.Split("banana", 2)) 156 | zp := regexp.MustCompile("z+") 157 | fmt.Println(zp.Split("pizza", -1)) 158 | fmt.Println(zp.Split("pizza", 0)) 159 | fmt.Println(zp.Split("pizza", 1)) 160 | fmt.Println(zp.Split("pizza", 2)) 161 | // Output: 162 | // [b n n ] 163 | // [] 164 | // [banana] 165 | // [b nana] 166 | // [pi a] 167 | // [] 168 | // [pizza] 169 | // [pi a] 170 | } 171 | -------------------------------------------------------------------------------- /exec.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package regexp 6 | 7 | import ( 8 | "io" 9 | "matloob.io/regexp/internal/input" 10 | "matloob.io/regexp/syntax" 11 | ) 12 | 13 | // A queue is a 'sparse array' holding pending threads of execution. 14 | // See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html 15 | type queue struct { 16 | sparse []uint32 17 | dense []entry 18 | } 19 | 20 | // A entry is an entry on a queue. 21 | // It holds both the instruction pc and the actual thread. 22 | // Some queue entries are just place holders so that the machine 23 | // knows it has considered that pc. Such entries have t == nil. 
24 | type entry struct { 25 | pc uint32 26 | t *thread 27 | } 28 | 29 | // A thread is the state of a single path through the machine: 30 | // an instruction and a corresponding capture array. 31 | // See http://swtch.com/~rsc/regexp/regexp2.html 32 | type thread struct { 33 | inst *syntax.Inst 34 | cap []int 35 | } 36 | 37 | // A machine holds all the state during an NFA simulation for p. 38 | type machine struct { 39 | re *Regexp // corresponding Regexp 40 | p *syntax.Prog // compiled program 41 | op *onePassProg // compiled onepass program, or notOnePass 42 | maxBitStateLen int // max length of string to search with bitstate 43 | b *bitState // state for backtracker, allocated lazily 44 | q0, q1 queue // two queues for runq, nextq 45 | pool []*thread // pool of available threads 46 | matched bool // whether a match was found 47 | matchcap []int // capture information for the match 48 | 49 | // cached inputs, to avoid allocation 50 | inputBytes input.InputBytes 51 | inputString input.InputString 52 | inputReader input.InputReader 53 | } 54 | 55 | func (m *machine) newInputBytes(b []byte) input.Input { 56 | m.inputBytes.Reset(b) 57 | return &m.inputBytes 58 | } 59 | 60 | func (m *machine) newInputString(s string) input.Input { 61 | m.inputString.Reset(s) 62 | return &m.inputString 63 | } 64 | 65 | func (m *machine) newInputReader(r io.RuneReader) input.Input { 66 | m.inputReader.Reset(r) 67 | return &m.inputReader 68 | } 69 | 70 | // progMachine returns a new machine running the prog p. 
func progMachine(p *syntax.Prog, op *onePassProg) *machine {
	m := &machine{p: p, op: op}
	n := len(m.p.Inst)
	// Each queue gets one sparse slot and room for one dense entry per instruction.
	m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
	m.q1 = queue{make([]uint32, n), make([]entry, 0, n)}
	ncap := p.NumCap
	// Always leave room for at least the overall match bounds.
	if ncap < 2 {
		ncap = 2
	}
	// The backtracker limit is only computed for programs that are not one-pass.
	if op == notOnePass {
		m.maxBitStateLen = maxBitStateLen(p)
	}
	m.matchcap = make([]int, ncap)
	return m
}

// init reslices the capture arrays of every pooled thread, and
// m.matchcap, to hold ncap entries.
func (m *machine) init(ncap int) {
	for _, t := range m.pool {
		t.cap = t.cap[:ncap]
	}
	m.matchcap = m.matchcap[:ncap]
}

// alloc allocates a new thread with the given instruction.
// It uses the free pool if possible.
func (m *machine) alloc(i *syntax.Inst) *thread {
	var t *thread
	if n := len(m.pool); n > 0 {
		t = m.pool[n-1]
		m.pool = m.pool[:n-1]
	} else {
		t = new(thread)
		// Same length and capacity as matchcap so init can reslice it later.
		t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
	}
	t.inst = i
	return t
}

// match runs the machine over the input starting at pos.
// It reports whether a match was found.
// If so, m.matchcap holds the submatch information.
func (m *machine) match(i input.Input, pos int) bool {
	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	m.matched = false
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}
	runq, nextq := &m.q0, &m.q1
	// r is the rune at pos and r1 the rune after it; width and width1
	// are the values Step returned alongside them.
	r, r1 := input.EndOfText, input.EndOfText
	width, width1 := 0, 0
	r, width = i.Step(pos)
	if r != input.EndOfText {
		r1, width1 = i.Step(pos + width)
	}
	var flag syntax.EmptyOp
	if pos == 0 {
		flag = syntax.EmptyOpContext(-1, r)
	} else {
		flag = i.Context(pos)
	}
	for {
		if len(runq.dense) == 0 {
			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
				// Anchored match, past beginning of text.
				break
			}
			if m.matched {
				// Have match; finished exploring alternatives.
				break
			}
			if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.CanCheckPrefix() {
				// Match requires literal prefix; fast search for it.
				advance := i.Index(m.re, pos)
				if advance < 0 {
					break
				}
				pos += advance
				r, width = i.Step(pos)
				r1, width1 = i.Step(pos + width)
			}
		}
		// Until a match is found, seed a fresh thread at the current position.
		if !m.matched {
			if len(m.matchcap) > 0 {
				m.matchcap[0] = pos
			}
			m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil)
		}
		flag = syntax.EmptyOpContext(r, r1)
		m.step(runq, nextq, pos, pos+width, r, flag)
		// Stop once Step reported a zero width for the rune just processed.
		if width == 0 {
			break
		}
		if len(m.matchcap) == 0 && m.matched {
			// Found a match and not paying attention
			// to where it is, so any match will do.
			break
		}
		// Advance one rune and shift the lookahead.
		pos += width
		r, width = r1, width1
		if r != input.EndOfText {
			r1, width1 = i.Step(pos + width)
		}
		runq, nextq = nextq, runq
	}
	m.clear(nextq)
	return m.matched
}

// clear frees all threads on the thread queue.
func (m *machine) clear(q *queue) {
	for _, d := range q.dense {
		if d.t != nil {
			m.pool = append(m.pool, d.t)
		}
	}
	q.dense = q.dense[:0]
}

// step executes one step of the machine, running each of the threads
// on runq and appending new threads to nextq.
// The step processes the rune c (which may be input.EndOfText),
// which starts at position pos and ends at nextPos.
// nextCond gives the setting for the empty-width flags after c.
197 | func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) { 198 | longest := m.re.longest 199 | for j := 0; j < len(runq.dense); j++ { 200 | d := &runq.dense[j] 201 | t := d.t 202 | if t == nil { 203 | continue 204 | } 205 | if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] { 206 | m.pool = append(m.pool, t) 207 | continue 208 | } 209 | i := t.inst 210 | add := false 211 | switch i.Op { 212 | default: 213 | panic("bad inst") 214 | 215 | case syntax.InstMatch: 216 | if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) { 217 | t.cap[1] = pos 218 | copy(m.matchcap, t.cap) 219 | } 220 | if !longest { 221 | // First-match mode: cut off all lower-priority threads. 222 | for _, d := range runq.dense[j+1:] { 223 | if d.t != nil { 224 | m.pool = append(m.pool, d.t) 225 | } 226 | } 227 | runq.dense = runq.dense[:0] 228 | } 229 | m.matched = true 230 | 231 | case syntax.InstRune: 232 | add = i.MatchRune(c) 233 | case syntax.InstRune1: 234 | add = c == i.Rune[0] 235 | case syntax.InstRuneAny: 236 | add = true 237 | case syntax.InstRuneAnyNotNL: 238 | add = c != '\n' 239 | } 240 | if add { 241 | t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t) 242 | } 243 | if t != nil { 244 | m.pool = append(m.pool, t) 245 | } 246 | } 247 | runq.dense = runq.dense[:0] 248 | } 249 | 250 | // add adds an entry to q for pc, unless the q already has such an entry. 251 | // It also recursively adds an entry for all instructions reachable from pc by following 252 | // empty-width conditions satisfied by cond. pos gives the current position 253 | // in the input. 
254 | func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread { 255 | if pc == 0 { 256 | return t 257 | } 258 | if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { 259 | return t 260 | } 261 | 262 | j := len(q.dense) 263 | q.dense = q.dense[:j+1] 264 | d := &q.dense[j] 265 | d.t = nil 266 | d.pc = pc 267 | q.sparse[pc] = uint32(j) 268 | 269 | i := &m.p.Inst[pc] 270 | switch i.Op { 271 | default: 272 | panic("unhandled") 273 | case syntax.InstFail: 274 | // nothing 275 | case syntax.InstAlt, syntax.InstAltMatch: 276 | t = m.add(q, i.Out, pos, cap, cond, t) 277 | t = m.add(q, i.Arg, pos, cap, cond, t) 278 | case syntax.InstEmptyWidth: 279 | if syntax.EmptyOp(i.Arg)&^cond == 0 { 280 | t = m.add(q, i.Out, pos, cap, cond, t) 281 | } 282 | case syntax.InstNop: 283 | t = m.add(q, i.Out, pos, cap, cond, t) 284 | case syntax.InstCapture: 285 | if int(i.Arg) < len(cap) { 286 | opos := cap[i.Arg] 287 | cap[i.Arg] = pos 288 | m.add(q, i.Out, pos, cap, cond, nil) 289 | cap[i.Arg] = opos 290 | } else { 291 | t = m.add(q, i.Out, pos, cap, cond, t) 292 | } 293 | case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 294 | if t == nil { 295 | t = m.alloc(i) 296 | } else { 297 | t.inst = i 298 | } 299 | if len(cap) > 0 && &t.cap[0] != &cap[0] { 300 | copy(t.cap, cap) 301 | } 302 | d.t = t 303 | t = nil 304 | } 305 | return t 306 | } 307 | 308 | // onepass runs the machine over the input starting at pos. 309 | // It reports whether a match was found. 310 | // If so, m.matchcap holds the submatch information. 
func (m *machine) onepass(i input.Input, pos int) bool {
	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	m.matched = false
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}
	// r is the rune at pos and r1 the rune after it.
	r, r1 := input.EndOfText, input.EndOfText
	width, width1 := 0, 0
	r, width = i.Step(pos)
	if r != input.EndOfText {
		r1, width1 = i.Step(pos + width)
	}
	var flag syntax.EmptyOp
	if pos == 0 {
		flag = syntax.EmptyOpContext(-1, r)
	} else {
		flag = i.Context(pos)
	}
	pc := m.op.Start
	inst := m.op.Inst[pc]
	// If there is a simple literal prefix, skip over it.
	if pos == 0 && syntax.EmptyOp(inst.Arg)&^flag == 0 &&
		len(m.re.prefix) > 0 && i.CanCheckPrefix() {
		// Match requires literal prefix; fast search for it.
		if i.HasPrefix(m.re) {
			pos += len(m.re.prefix)
			r, width = i.Step(pos)
			r1, width1 = i.Step(pos + width)
			flag = i.Context(pos)
			pc = int(m.re.prefixEnd)
		} else {
			return m.matched
		}
	}
	for {
		inst = m.op.Inst[pc]
		// Default successor; the Alt cases below override it.
		pc = int(inst.Out)
		switch inst.Op {
		default:
			panic("bad inst")
		case syntax.InstMatch:
			m.matched = true
			if len(m.matchcap) > 0 {
				m.matchcap[0] = 0
				m.matchcap[1] = pos
			}
			return m.matched
		case syntax.InstRune:
			if !inst.MatchRune(r) {
				return m.matched
			}
		case syntax.InstRune1:
			if r != inst.Rune[0] {
				return m.matched
			}
		case syntax.InstRuneAny:
			// Nothing
		case syntax.InstRuneAnyNotNL:
			if r == '\n' {
				return m.matched
			}
		// peek at the input rune to see which branch of the Alt to take
		case syntax.InstAlt, syntax.InstAltMatch:
			pc = int(onePassNext(&inst, r))
			continue
		case syntax.InstFail:
			return m.matched
		case syntax.InstNop:
			continue
		case syntax.InstEmptyWidth:
			if syntax.EmptyOp(inst.Arg)&^flag != 0 {
				return m.matched
			}
			continue
		case syntax.InstCapture:
			if int(inst.Arg) < len(m.matchcap) {
				m.matchcap[inst.Arg] = pos
			}
			continue
		}
		// Only the rune-consuming cases fall through to here.
		if width == 0 {
			break
		}
		flag = syntax.EmptyOpContext(r, r1)
		pos += width
		r, width = r1, width1
		if r != input.EndOfText {
			r1, width1 = i.Step(pos + width)
		}
	}
	return m.matched
}

// empty is a non-nil 0-element slice,
// so doExecute can avoid an allocation
// when 0 captures are requested from a successful match.
var empty = make([]int, 0)

// doExecute finds the leftmost match in the input and returns
// the position of its subexpressions.
// The input is r if non-nil, else b if non-nil, else s.
// It returns nil when there is no match.
func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int) []int {
	m := re.get()
	var i input.Input
	var size int // stays 0 for reader input
	if r != nil {
		i = m.newInputReader(r)
	} else if b != nil {
		i = m.newInputBytes(b)
		size = len(b)
	} else {
		i = m.newInputString(s)
		size = len(s)
	}
	// Pick an engine: one-pass if the program allows it, the backtracker
	// for short non-reader inputs, otherwise re.searcher and/or the NFA.
	if m.op != notOnePass {
		if !m.onepass(i, pos) {
			re.put(m)
			return nil
		}
	} else if size < m.maxBitStateLen && r == nil {
		if m.b == nil {
			m.b = newBitState(m.p)
		}
		if !m.backtrack(i, pos, size, ncap) {
			re.put(m)
			return nil
		}
	} else {
		if ncap <= 2 {
			// NOTE(review): re.searcher is presumably the DFA from
			// internal/dfa; confirm against its definition.
			matched, err := re.searcher.Search(i, pos, m.re.longest, &m.matchcap, ncap)
			if err != nil {
				// The searcher could not complete; fall back to the NFA.
				goto nfa
			}
			if !matched {
				re.put(m)
				return nil
			}
			goto e
		}
	nfa:
		m.init(ncap)
		if !m.match(i, pos) {
			re.put(m)
			return nil
		}
	e:
	}
	if ncap == 0 {
		re.put(m)
		return empty // empty but not nil
	}
	// Copy the captures out before the machine is returned via re.put.
	cap := make([]int, len(m.matchcap))
	copy(cap, m.matchcap)
	re.put(m)
	return cap
}

--------------------------------------------------------------------------------
/exec2_test.go:
--------------------------------------------------------------------------------
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !race

package regexp

import (
	"testing"
)

// This test is excluded when running under the race detector because
// it is a very expensive test and takes too long.
func TestRE2Exhaustive(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping TestRE2Exhaustive during short test")
	}
	testRE2(t, "testdata/re2-exhaustive.txt.bz2")
}

--------------------------------------------------------------------------------
/export.sh:
--------------------------------------------------------------------------------
#!/bin/sh

FROM="$HOME/src/matloob.io/regexp"
TO="$HOME/go/src/regexp"

cp $FROM/*.go $TO/
cp $FROM/syntax/*.go $TO/syntax/
cp $FROM/internal/dfa/*.go $TO/internal/dfa
cp $FROM/internal/input/*.go $TO/internal/input/

sed -i .bak -e "s/matloob.io\///g" $TO/*.go $TO/internal/dfa/*.go $TO/internal/input/*.go
rm $TO/*.go.bak $TO/internal/dfa/*.go.bak $TO/internal/input/*.go.bak

--------------------------------------------------------------------------------
/find_test.go:
--------------------------------------------------------------------------------
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package regexp

import (
	"fmt"
	"strings"
	"testing"
)

// For each pattern/text pair, what is the expected output of each function?
// We can derive the textual results from the indexed results, the non-submatch
// results from the submatched results, the single results from the 'all' results,
// and the byte results from the string results. Therefore the table includes
// only the FindAllStringSubmatchIndex result.
type FindTest struct {
	pat     string  // regular expression source
	text    string  // text to search
	matches [][]int // expected index slices, one per match; nil means no match
}

// String formats the test case as its pattern and text for error messages.
func (t FindTest) String() string {
	return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text)
}

var findTests = []FindTest{
	{``, ``, build(1, 0, 0)},
	{`^abcdefg`, "abcdefg", build(1, 0, 7)},
	{`a+`, "baaab", build(1, 1, 4)},
	{"abcd..", "abcdef", build(1, 0, 6)},
	{`a`, "a", build(1, 0, 1)},
	{`x`, "y", nil},
	{`b`, "abc", build(1, 1, 2)},
	{`.`, "a", build(1, 0, 1)},
	{`.*`, "abcdef", build(1, 0, 6)},
	{`^`, "abcde", build(1, 0, 0)},
	{`$`, "abcde", build(1, 5, 5)},
	{`^abcd$`, "abcd", build(1, 0, 4)},
	{`^bcd'`, "abcdef", nil},
	{`^abcd$`, "abcde", nil},
	{`a+`, "baaab", build(1, 1, 4)},
	{`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)},
	{`[a-z]+`, "abcd", build(1, 0, 4)},
	{`[^a-z]+`, "ab1234cd", build(1, 2, 6)},
	{`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)},
	{`[^\n]+`, "abcd\n", build(1, 0, 4)},
	{`[日本語]+`, "日本語日本語", build(1, 0, 18)},
	{`日本語+`, "日本語", build(1, 0, 9)},
	{`日本語+`, "日本語語語語", build(1, 0, 18)},
	{`()`, "", build(1, 0, 0, 0, 0)},
	{`(a)`, "a", build(1, 0, 1, 0, 1)},
	{`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)},
	{`(.*)`, "", build(1, 0, 0, 0, 0)},
	{`(.*)`, "abcd", build(1, 0, 4, 0, 4)},
	{`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)},
	{`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)},
	{`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)},
	{`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)},
	{`\a\f\n\r\t\v`, "\a\f\n\r\t\v", build(1, 0, 6)},
	{`[\a\f\n\r\t\v]+`, "\a\f\n\r\t\v", build(1, 0, 6)},

	{`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)},
	{`(.*).*`, "ab", build(1, 0, 2, 0, 2)},
	{`[.]`, ".", build(1, 0, 1)},
	{`/$`, "/abc/", build(1, 4, 5)},
	{`/$`, "/abc", nil},

	// multiple matches
	{`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)},
	{`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)},
	{`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)},
	{`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)},
	{`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)},

	// fixed bugs
	{`ab$`, "cab", build(1, 1, 3)},
	{`axxb$`, "axxcb", nil},
	{`data`, "daXY data", build(1, 5, 9)},
	{`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)},
	{`zx+`, "zzx", build(1, 1, 3)},
	{`ab$`, "abcab", build(1, 3, 5)},
	{`(aa)*$`, "a", build(1, 1, 1, -1, -1)},
	{`(?:.|(?:.a))`, "", nil},
	{`(?:A(?:A|a))`, "Aa", build(1, 0, 2)},
	{`(?:A|(?:A|a))`, "a", build(1, 0, 1)},
	{`(a){0}`, "", build(1, 0, 0, -1, -1)},
	{`(?-s)(?:(?:^).)`, "\n", nil},
	{`(?s)(?:(?:^).)`, "\n", build(1, 0, 1)},
	{`(?:(?:^).)`, "\n", nil},
	{`\b`, "x", build(2, 0, 0, 1, 1)},
	{`\b`, "xx", build(2, 0, 0, 2, 2)},
	{`\b`, "x y", build(4, 0, 0, 1, 1, 2, 2, 3, 3)},
	{`\b`, "xx yy", build(4, 0, 0, 2, 2, 3, 3, 5, 5)},
	{`\B`, "x", nil},
	{`\B`, "xx", build(1, 1, 1)},
	{`\B`, "x y", nil},
	{`\B`, "xx yy", build(2, 1, 1, 4, 4)},

	// RE2 tests
	{`[^\S\s]`, "abcd", nil},
	{`[^\S[:space:]]`, "abcd", nil},
	{`[^\D\d]`, "abcd", nil},
	{`[^\D[:digit:]]`, "abcd", nil},
	{`(?i)\W`, "x", nil},
	{`(?i)\W`, "k", nil},
	{`(?i)\W`, "s", nil},

	// can backslash-escape any punctuation
	{`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`,
		`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
	{`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`,
		`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
	{"\\`", "`", build(1, 0, 1)},
	{"[\\`]+", "`", build(1, 0, 1)},

	// long set of matches (longer than startSize)
	{
		".",
		"qwertyuiopasdfghjklzxcvbnm1234567890",
		build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
			10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20,
			20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30,
			30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36),
	},
}

// build is a helper to construct a [][]int by extracting n sequences from x.
// This represents n matches with len(x)/n submatches each.
// n must be non-zero: it is used as a divisor.
func build(n int, x ...int) [][]int {
	ret := make([][]int, n)
	runLength := len(x) / n
	j := 0 // offset in x of the next run
	for i := range ret {
		ret[i] = make([]int, runLength)
		// copy stops at len(ret[i]), so only runLength values are taken.
		copy(ret[i], x[j:])
		j += runLength
		if j > len(x) {
			panic("invalid build entry")
		}
	}
	return ret
}

// First the simple cases.
147 | 148 | func TestFind(t *testing.T) { 149 | for _, test := range findTests { 150 | re := MustCompile(test.pat) 151 | if re.String() != test.pat { 152 | t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat) 153 | } 154 | result := re.Find([]byte(test.text)) 155 | switch { 156 | case len(test.matches) == 0 && len(result) == 0: 157 | // ok 158 | case test.matches == nil && result != nil: 159 | t.Errorf("expected no match; got one: %s", test) 160 | case test.matches != nil && result == nil: 161 | t.Errorf("expected match; got none: %s", test) 162 | case test.matches != nil && result != nil: 163 | expect := test.text[test.matches[0][0]:test.matches[0][1]] 164 | if expect != string(result) { 165 | t.Errorf("expected %q got %q: %s", expect, result, test) 166 | } 167 | } 168 | } 169 | } 170 | 171 | func TestFindString(t *testing.T) { 172 | for _, test := range findTests { 173 | result := MustCompile(test.pat).FindString(test.text) 174 | switch { 175 | case len(test.matches) == 0 && len(result) == 0: 176 | // ok 177 | case test.matches == nil && result != "": 178 | t.Errorf("expected no match; got one: %s", test) 179 | case test.matches != nil && result == "": 180 | // Tricky because an empty result has two meanings: no match or empty match. 
181 | if test.matches[0][0] != test.matches[0][1] { 182 | t.Errorf("expected match; got none: %s", test) 183 | } 184 | case test.matches != nil && result != "": 185 | expect := test.text[test.matches[0][0]:test.matches[0][1]] 186 | if expect != result { 187 | t.Errorf("expected %q got %q: %s", expect, result, test) 188 | } 189 | } 190 | } 191 | } 192 | 193 | func testFindIndex(test *FindTest, result []int, t *testing.T) { 194 | switch { 195 | case len(test.matches) == 0 && len(result) == 0: 196 | // ok 197 | case test.matches == nil && result != nil: 198 | t.Errorf("expected no match; got one: %s", test) 199 | case test.matches != nil && result == nil: 200 | t.Errorf("expected match; got none: %s", test) 201 | case test.matches != nil && result != nil: 202 | expect := test.matches[0] 203 | if expect[0] != result[0] || expect[1] != result[1] { 204 | t.Errorf("expected %v got %v: %s", expect, result, test) 205 | } 206 | } 207 | } 208 | 209 | func TestFindIndex(t *testing.T) { 210 | for _, test := range findTests { 211 | testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t) 212 | } 213 | } 214 | 215 | func TestFindStringIndex(t *testing.T) { 216 | for _, test := range findTests { 217 | testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t) 218 | } 219 | } 220 | 221 | func TestFindReaderIndex(t *testing.T) { 222 | for _, test := range findTests { 223 | testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t) 224 | } 225 | } 226 | 227 | // Now come the simple All cases. 
228 | 229 | func TestFindAll(t *testing.T) { 230 | for _, test := range findTests { 231 | result := MustCompile(test.pat).FindAll([]byte(test.text), -1) 232 | switch { 233 | case test.matches == nil && result == nil: 234 | // ok 235 | case test.matches == nil && result != nil: 236 | t.Errorf("expected no match; got one: %s", test) 237 | case test.matches != nil && result == nil: 238 | t.Fatalf("expected match; got none: %s", test) 239 | case test.matches != nil && result != nil: 240 | if len(test.matches) != len(result) { 241 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 242 | continue 243 | } 244 | for k, e := range test.matches { 245 | expect := test.text[e[0]:e[1]] 246 | if expect != string(result[k]) { 247 | t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test) 248 | } 249 | } 250 | } 251 | } 252 | } 253 | 254 | func TestFindAllString(t *testing.T) { 255 | for _, test := range findTests { 256 | result := MustCompile(test.pat).FindAllString(test.text, -1) 257 | switch { 258 | case test.matches == nil && result == nil: 259 | // ok 260 | case test.matches == nil && result != nil: 261 | t.Errorf("expected no match; got one: %s", test) 262 | case test.matches != nil && result == nil: 263 | t.Errorf("expected match; got none: %s", test) 264 | case test.matches != nil && result != nil: 265 | if len(test.matches) != len(result) { 266 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 267 | continue 268 | } 269 | for k, e := range test.matches { 270 | expect := test.text[e[0]:e[1]] 271 | if expect != result[k] { 272 | t.Errorf("expected %q got %q: %s", expect, result, test) 273 | } 274 | } 275 | } 276 | } 277 | } 278 | 279 | func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) { 280 | switch { 281 | case test.matches == nil && result == nil: 282 | // ok 283 | case test.matches == nil && result != nil: 284 | t.Errorf("expected no match; got one: %s", test) 285 | case 
test.matches != nil && result == nil: 286 | t.Errorf("expected match; got none: %s", test) 287 | case test.matches != nil && result != nil: 288 | if len(test.matches) != len(result) { 289 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 290 | return 291 | } 292 | for k, e := range test.matches { 293 | if e[0] != result[k][0] || e[1] != result[k][1] { 294 | t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test) 295 | } 296 | } 297 | } 298 | } 299 | 300 | func TestFindAllIndex(t *testing.T) { 301 | for _, test := range findTests { 302 | testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t) 303 | } 304 | } 305 | 306 | func TestFindAllStringIndex(t *testing.T) { 307 | for _, test := range findTests { 308 | testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t) 309 | } 310 | } 311 | 312 | // Now come the Submatch cases. 313 | 314 | func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) { 315 | if len(submatches) != len(result)*2 { 316 | t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) 317 | return 318 | } 319 | for k := 0; k < len(submatches); k += 2 { 320 | if submatches[k] == -1 { 321 | if result[k/2] != nil { 322 | t.Errorf("match %d: expected nil got %q: %s", n, result, test) 323 | } 324 | continue 325 | } 326 | expect := test.text[submatches[k]:submatches[k+1]] 327 | if expect != string(result[k/2]) { 328 | t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) 329 | return 330 | } 331 | } 332 | } 333 | 334 | func TestFindSubmatch(t *testing.T) { 335 | for _, test := range findTests { 336 | result := MustCompile(test.pat).FindSubmatch([]byte(test.text)) 337 | switch { 338 | case test.matches == nil && result == nil: 339 | // ok 340 | case test.matches == nil && result != nil: 341 | t.Errorf("expected no match; got one: %s", test) 342 | case test.matches != 
nil && result == nil: 343 | t.Errorf("expected match; got none: %s", test) 344 | case test.matches != nil && result != nil: 345 | testSubmatchBytes(&test, 0, test.matches[0], result, t) 346 | } 347 | } 348 | } 349 | 350 | func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) { 351 | if len(submatches) != len(result)*2 { 352 | t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) 353 | return 354 | } 355 | for k := 0; k < len(submatches); k += 2 { 356 | if submatches[k] == -1 { 357 | if result[k/2] != "" { 358 | t.Errorf("match %d: expected nil got %q: %s", n, result, test) 359 | } 360 | continue 361 | } 362 | expect := test.text[submatches[k]:submatches[k+1]] 363 | if expect != result[k/2] { 364 | t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) 365 | return 366 | } 367 | } 368 | } 369 | 370 | func TestFindStringSubmatch(t *testing.T) { 371 | for _, test := range findTests { 372 | result := MustCompile(test.pat).FindStringSubmatch(test.text) 373 | switch { 374 | case test.matches == nil && result == nil: 375 | // ok 376 | case test.matches == nil && result != nil: 377 | t.Errorf("expected no match; got one: %s", test) 378 | case test.matches != nil && result == nil: 379 | t.Errorf("expected match; got none: %s", test) 380 | case test.matches != nil && result != nil: 381 | testSubmatchString(&test, 0, test.matches[0], result, t) 382 | } 383 | } 384 | } 385 | 386 | func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) { 387 | if len(expect) != len(result) { 388 | t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test) 389 | return 390 | } 391 | for k, e := range expect { 392 | if e != result[k] { 393 | t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test) 394 | } 395 | } 396 | } 397 | 398 | func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) { 
399 | switch { 400 | case test.matches == nil && result == nil: 401 | // ok 402 | case test.matches == nil && result != nil: 403 | t.Errorf("expected no match; got one: %s", test) 404 | case test.matches != nil && result == nil: 405 | t.Errorf("expected match; got none: %s", test) 406 | case test.matches != nil && result != nil: 407 | testSubmatchIndices(test, 0, test.matches[0], result, t) 408 | } 409 | } 410 | 411 | func TestFindSubmatchIndex(t *testing.T) { 412 | for _, test := range findTests { 413 | testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t) 414 | } 415 | } 416 | 417 | func TestFindStringSubmatchIndex(t *testing.T) { 418 | for _, test := range findTests { 419 | testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t) 420 | } 421 | } 422 | 423 | func TestFindReaderSubmatchIndex(t *testing.T) { 424 | for _, test := range findTests { 425 | testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t) 426 | } 427 | } 428 | 429 | // Now come the monster AllSubmatch cases. 
430 | 431 | func TestFindAllSubmatch(t *testing.T) { 432 | for _, test := range findTests { 433 | result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1) 434 | switch { 435 | case test.matches == nil && result == nil: 436 | // ok 437 | case test.matches == nil && result != nil: 438 | t.Errorf("expected no match; got one: %s", test) 439 | case test.matches != nil && result == nil: 440 | t.Errorf("expected match; got none: %s", test) 441 | case len(test.matches) != len(result): 442 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 443 | case test.matches != nil && result != nil: 444 | for k, match := range test.matches { 445 | testSubmatchBytes(&test, k, match, result[k], t) 446 | } 447 | } 448 | } 449 | } 450 | 451 | func TestFindAllStringSubmatch(t *testing.T) { 452 | for _, test := range findTests { 453 | result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1) 454 | switch { 455 | case test.matches == nil && result == nil: 456 | // ok 457 | case test.matches == nil && result != nil: 458 | t.Errorf("expected no match; got one: %s", test) 459 | case test.matches != nil && result == nil: 460 | t.Errorf("expected match; got none: %s", test) 461 | case len(test.matches) != len(result): 462 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 463 | case test.matches != nil && result != nil: 464 | for k, match := range test.matches { 465 | testSubmatchString(&test, k, match, result[k], t) 466 | } 467 | } 468 | } 469 | } 470 | 471 | func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) { 472 | switch { 473 | case test.matches == nil && result == nil: 474 | // ok 475 | case test.matches == nil && result != nil: 476 | t.Errorf("expected no match; got one: %s", test) 477 | case test.matches != nil && result == nil: 478 | t.Errorf("expected match; got none: %s", test) 479 | case len(test.matches) != len(result): 480 | t.Errorf("expected %d matches; got %d: %s", 
len(test.matches), len(result), test) 481 | case test.matches != nil && result != nil: 482 | for k, match := range test.matches { 483 | testSubmatchIndices(test, k, match, result[k], t) 484 | } 485 | } 486 | } 487 | 488 | func TestFindAllSubmatchIndex(t *testing.T) { 489 | for _, test := range findTests { 490 | testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t) 491 | } 492 | } 493 | 494 | func TestFindAllStringSubmatchIndex(t *testing.T) { 495 | for _, test := range findTests { 496 | testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t) 497 | } 498 | } 499 | -------------------------------------------------------------------------------- /internal/dfa/dfa_exhaustive_test.go: -------------------------------------------------------------------------------- 1 | package dfa 2 | 3 | import ( 4 | "bufio" 5 | "compress/bzip2" 6 | "io" 7 | "os" 8 | "strconv" 9 | "strings" 10 | "testing" 11 | ) 12 | 13 | func TestDFAZVV(t *testing.T) { 14 | testDFA(t, "../../testdata/re2-search.txt") 15 | } 16 | 17 | 18 | // THIS IS REALLY SLOW 19 | func xTestDFAExhaustive(t *testing.T) { 20 | testDFA(t, "../../testdata/re2-exhaustive.txt.bz2") 21 | } 22 | 23 | func testDFA(t *testing.T, file string) { 24 | f, err := os.Open(file) 25 | if err != nil { 26 | t.Fatal(err) 27 | } 28 | defer f.Close() 29 | var txt io.Reader 30 | if strings.HasSuffix(file, ".bz2") { 31 | z := bzip2.NewReader(f) 32 | txt = z 33 | file = file[:len(file)-len(".bz2")] // for error messages 34 | } else { 35 | txt = f 36 | } 37 | lineno := 0 38 | scanner := bufio.NewScanner(txt) 39 | var ( 40 | str []string 41 | input []string 42 | inStrings bool 43 | q, full string 44 | nfail int 45 | ncase int 46 | ) 47 | for lineno := 1; scanner.Scan(); lineno++ { 48 | line := scanner.Text() 49 | switch { 50 | case line == "": 51 | t.Fatalf("%s:%d: unexpected blank line", file, lineno) 52 | case line[0] == '#': 53 | continue 54 | case 
'A' <= line[0] && line[0] <= 'Z': 55 | // Test name. 56 | t.Logf("%s\n", line) 57 | continue 58 | case line == "strings": 59 | str = str[:0] 60 | inStrings = true 61 | case line == "regexps": 62 | inStrings = false 63 | case line[0] == '"': 64 | q, err = strconv.Unquote(line) 65 | if err != nil { 66 | // Fatal because we'll get out of sync. 67 | t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err) 68 | } 69 | if inStrings { 70 | str = append(str, q) 71 | continue 72 | } 73 | // Is a regexp. 74 | if len(input) != 0 { 75 | t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q) 76 | } 77 | full = `\A(?:` + q + `)\z` 78 | input = str 79 | case line[0] == '-' || '0' <= line[0] && line[0] <= '9': 80 | // A sequence of match results. 81 | ncase++ 82 | if len(input) == 0 { 83 | t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno) 84 | } 85 | var text string 86 | text, input = input[0], input[1:] 87 | if strings.Contains(q, `\C`) || (!isSingleBytes(text) && strings.Contains(q, `\B`)) { 88 | // RE2's \B considers every byte position, 89 | // so it sees 'not word boundary' in the 90 | // middle of UTF-8 sequences. This package 91 | // only considers the positions between runes, 92 | // so it disagrees. Skip those cases. 
93 | continue 94 | } 95 | res := strings.Split(line, ";") 96 | if len(res) != len(run) { 97 | t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run)) 98 | } 99 | for i := range res { 100 | have, suffix := run[i](q, full, text) 101 | want := parseResult(t, file, lineno, res[i]) 102 | if len(want) <= 2 && !same(have, want) { 103 | t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, q, suffix, text, have, want) 104 | if nfail++; nfail >= 100 { 105 | t.Fatalf("stopping after %d errors", nfail) 106 | } 107 | continue 108 | } 109 | b, suffix := match[i](q, full, text) 110 | if b != (want != nil) { 111 | t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, q, suffix, text, b, !b) 112 | if nfail++; nfail >= 100 { 113 | t.Fatalf("stopping after %d errors", nfail) 114 | } 115 | continue 116 | } 117 | } 118 | 119 | default: 120 | t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line) 121 | } 122 | } 123 | if err := scanner.Err(); err != nil { 124 | t.Fatalf("%s:%d: %v", file, lineno, err) 125 | } 126 | if len(input) != 0 { 127 | t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input)) 128 | } 129 | t.Logf("%d cases tested", ncase) 130 | } 131 | 132 | // TODO(matloob): This is deceptive because we're not reusing the DFA between 133 | // tests. FIX IT! 
134 | 135 | var run = []func(string, string, string) ([]int, string){ 136 | runFull, 137 | runPartial, 138 | runFullLongest, 139 | runPartialLongest, 140 | } 141 | 142 | func runFull(re, refull, text string) ([]int, string) { 143 | return dfaSubmatchIndex(refull, text, false), "[full]" 144 | } 145 | 146 | func runPartial(re, refull, text string) ([]int, string) { 147 | return dfaSubmatchIndex(re, text, false), "" 148 | } 149 | 150 | func runFullLongest(re, refull, text string) ([]int, string) { 151 | return dfaSubmatchIndex(refull, text, true), "[full,longest]" 152 | } 153 | 154 | func runPartialLongest(re, refull, text string) ([]int, string) { 155 | return dfaSubmatchIndex(re, text, true), "[longest]" 156 | } 157 | 158 | func dfaSubmatchIndex(re, text string, longest bool) []int { 159 | i, j, b, err := matchDFA2(re, text, longest) 160 | if err != nil || !b { 161 | return nil 162 | } 163 | return []int{i, j} 164 | } 165 | 166 | var match = []func(string, string, string) (bool, string){ 167 | matchFull, 168 | matchPartial, 169 | matchFullLongest, 170 | matchPartialLongest, 171 | } 172 | 173 | func matchFull(re, refull, text string) (bool, string) { 174 | return dfaMatchString(refull, text, false), "[full]" 175 | } 176 | 177 | func matchPartial(re, refull, text string) (bool, string) { 178 | return dfaMatchString(re, text, false), "" 179 | } 180 | 181 | func matchFullLongest(re, refull, text string) (bool, string) { 182 | return dfaMatchString(refull, text, true), "[full,longest]" 183 | } 184 | 185 | func matchPartialLongest(re, refull, text string) (bool, string) { 186 | return dfaMatchString(re, text, true), "[longest]" 187 | } 188 | 189 | func dfaMatchString(re, text string, longest bool) bool { 190 | _, _, b, err := matchDFA2(re, text, longest) 191 | return err == nil && b 192 | } 193 | -------------------------------------------------------------------------------- /internal/dfa/dfa_test.go: 
-------------------------------------------------------------------------------- 1 | // TODO(matloob): DELETE ME! 2 | 3 | package dfa 4 | 5 | import ( 6 | "testing" 7 | 8 | "matloob.io/regexp/internal/input" 9 | "matloob.io/regexp/syntax" 10 | ) 11 | 12 | func matchDFA(regexp string, input string) (int, int, bool, error) { 13 | return matchDFA2(regexp, input, false) 14 | } 15 | 16 | func matchDFA2(regexp string, inputstr string, longest bool) (int, int, bool, error) { 17 | re, err := syntax.Parse(regexp, syntax.Perl) 18 | if err != nil { 19 | return 0, 0, false, err 20 | } 21 | prog, err := syntax.Compile(re) 22 | if err != nil { 23 | return 0, 0, false, err 24 | } 25 | 26 | kind := firstMatch 27 | if longest { 28 | kind = longestMatch 29 | } 30 | 31 | d := newDFA(prog, kind, 0) 32 | 33 | revprog, err := syntax.CompileReversed(re) 34 | if err != nil { 35 | panic("failed to compile reverse prog") 36 | } 37 | 38 | reversed := newDFA(revprog, longestMatch, 0) 39 | 40 | var i input.InputString 41 | i.Reset(inputstr) 42 | j, k, b, err := search(d, reversed, &i, 0) 43 | return j, k, b, err 44 | } 45 | 46 | func TestDFA(t *testing.T) { 47 | // These are all anchored matches. 
48 | testCases := []struct { 49 | re string 50 | in string 51 | wantS int 52 | wantE int 53 | want bool 54 | }{ 55 | 56 | {"abc", "abc", 0, 3, true}, 57 | {"abc", "ab", -1, -1, false}, 58 | {".*(a|z)bc", "eedbcxcee", -1, -1, false}, 59 | {"^abc", "xxxabcxxx", -1, -1, false}, 60 | 61 | {"ab*", "xxxabbxxx", 3, 6, true}, 62 | {"abc", "xxxabcxxx", 3, 6, true}, 63 | 64 | {"(>[^\n]+)?\n", ">One Homo sapiens alu\nGGCCGGGCGCG", 0, 22, true}, 65 | {"abc", "abcxxxabc", 0, 3, true}, 66 | {"^abcde", "abcde", 0, 5, true}, 67 | {"^", "abcde", 0, 0, true}, 68 | {"abcde$", "abcde", 0, 5, true}, 69 | {"$", "abcde", 5, 5, true}, 70 | {"agggtaa[cgt]|[acg]ttaccct", "agggtaag", 0, 8, true}, 71 | {"[cgt]gggtaaa|tttaccc[acg]", "xtttacccce", 1, 9, true}, 72 | {"[日本語]+", "日本語日本語", 0, len("日本語日本語"), true}, 73 | {"a.", "paranormal", 1, 3, true}, 74 | {`\B`, "x", -1, -1, false}, 75 | } 76 | for _, tc := range testCases { 77 | i, j, got, err := matchDFA(tc.re, tc.in) 78 | if err != nil { 79 | t.Error(err) 80 | } 81 | if got != tc.want || i != tc.wantS || j != tc.wantE { 82 | t.Errorf("matchDFA(%q, %q): got (%v, %v, %v), want (%v, %v, %v)", tc.re, tc.in, i, j, got, tc.wantS, tc.wantE, tc.want) 83 | } 84 | } 85 | 86 | } 87 | func TestDFA3(t *testing.T) { 88 | // These are all anchored matches. 
89 | testCases := []struct { 90 | re string 91 | in string 92 | wantS int 93 | wantE int 94 | want bool 95 | }{ 96 | {`\B`, "a0b", 1, 1, true}, 97 | // {"\\B", "x", -1, -1, false}, 98 | // {"\\B", "xx yy", 1,1,true}, 99 | // {`(?:A|(?:A|a))`, "B", -1, -1, true}, 100 | // {`(?:A|(?:A|a))`, "B", -1, -1, true}, 101 | } 102 | for _, tc := range testCases { 103 | i, j, got, err := matchDFA(tc.re, tc.in) 104 | if err != nil { 105 | t.Error(err) 106 | continue 107 | } 108 | if got != tc.want || i != tc.wantS || j != tc.wantE { 109 | t.Errorf("matchDFA(%q, %q): got (%v, %v, %v), want (%v, %v, %v)", tc.re, tc.in, i, j, got, tc.wantS, tc.wantE, tc.want) 110 | } 111 | } 112 | } -------------------------------------------------------------------------------- /internal/dfa/exec_test.go: -------------------------------------------------------------------------------- 1 | package dfa 2 | 3 | import ( 4 | "unicode/utf8" 5 | "testing" 6 | "strconv" 7 | "strings" 8 | ) 9 | 10 | func isSingleBytes(s string) bool { 11 | for _, c := range s { 12 | if c >= utf8.RuneSelf { 13 | return false 14 | } 15 | } 16 | return true 17 | } 18 | 19 | func parseResult(t *testing.T, file string, lineno int, res string) []int { 20 | // A single - indicates no match. 21 | if res == "-" { 22 | return nil 23 | } 24 | // Otherwise, a space-separated list of pairs. 25 | n := 1 26 | for j := 0; j < len(res); j++ { 27 | if res[j] == ' ' { 28 | n++ 29 | } 30 | } 31 | out := make([]int, 2*n) 32 | i := 0 33 | n = 0 34 | for j := 0; j <= len(res); j++ { 35 | if j == len(res) || res[j] == ' ' { 36 | // Process a single pair. - means no submatch. 
37 | pair := res[i:j] 38 | if pair == "-" { 39 | out[n] = -1 40 | out[n+1] = -1 41 | } else { 42 | k := strings.Index(pair, "-") 43 | if k < 0 { 44 | t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair) 45 | } 46 | lo, err1 := strconv.Atoi(pair[:k]) 47 | hi, err2 := strconv.Atoi(pair[k+1:]) 48 | if err1 != nil || err2 != nil || lo > hi { 49 | t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair) 50 | } 51 | out[n] = lo 52 | out[n+1] = hi 53 | } 54 | n += 2 55 | i = j + 1 56 | } 57 | } 58 | return out 59 | } 60 | 61 | func same(x, y []int) bool { 62 | if len(x) != len(y) { 63 | return false 64 | } 65 | for i, xi := range x { 66 | if xi != y[i] { 67 | return false 68 | } 69 | } 70 | return true 71 | } 72 | -------------------------------------------------------------------------------- /internal/dfa/runerange.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dfa 6 | 7 | import ( 8 | "sort" 9 | "unicode" 10 | "matloob.io/regexp/internal/input" 11 | "matloob.io/regexp/syntax" 12 | ) 13 | 14 | type rangeMap struct { 15 | bytemap []int 16 | divides []rune 17 | } 18 | 19 | func (rm *rangeMap) lookup(r rune) int { 20 | // Use the trivial byte map for now... 21 | // See ComputeByteMap 22 | if r == input.EndOfText { 23 | return len(rm.divides) 24 | } 25 | if r == input.StartOfText { 26 | return len(rm.divides) + 1 27 | } 28 | if r > 255 { 29 | // binary search for the range 30 | lo, hi := 0, len(rm.divides) 31 | for { 32 | // search rm.divides 33 | center := (lo + hi) / 2 34 | if center == lo { 35 | return lo 36 | } 37 | divcenter := rm.divides[center] 38 | if r >= divcenter { 39 | lo = center 40 | } else { 41 | hi = center 42 | } 43 | } 44 | } 45 | // Faster lookup for runes < 256. 
46 | return rm.bytemap[int(r)] 47 | } 48 | 49 | // count returns the number of ranges. 0 <= rm.count() < rm.lookup(r) for all runes r. 50 | func (rm *rangeMap) count() int { 51 | return len(rm.divides) + 2 52 | } 53 | 54 | func (rm *rangeMap) init(prog *syntax.Prog) { 55 | rangemark := make(map[rune]bool) 56 | addRune := func(r rune) { 57 | rangemark[r] = true 58 | rangemark[r+1] = true 59 | } 60 | addRuneRange := func(rl, rh rune) { 61 | rangemark[rl] = true 62 | rangemark[rh+1] = true 63 | } 64 | addRuneFolds := func(r rune) { 65 | for r1 := unicode.SimpleFold(r) ;r1 != r; r1 = unicode.SimpleFold(r1) { 66 | addRune(r1) 67 | } 68 | } 69 | for _, inst := range prog.Inst { 70 | switch inst.Op { 71 | case syntax.InstRune: 72 | if len(inst.Rune) == 1 { 73 | // special case of single rune 74 | r := inst.Rune[0] 75 | addRune(r) 76 | if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 77 | addRuneFolds(r) 78 | } 79 | break 80 | } 81 | // otherwise inst.Rune is a series of ranges 82 | for i := 0; i < len(inst.Rune); i += 2 { 83 | addRuneRange(inst.Rune[i], inst.Rune[i+1]) 84 | if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 85 | for r0 := inst.Rune[i]; r0 <= inst.Rune[i+1]; r0++ { 86 | // Range mapping doesn't commute, so we have to 87 | // add folds individually. 
88 | addRuneFolds(r0) 89 | } 90 | } 91 | } 92 | case syntax.InstRune1: 93 | r := inst.Rune[0] 94 | addRune(r) 95 | if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 96 | addRuneFolds(r) 97 | } 98 | case syntax.InstRuneAnyNotNL: 99 | addRune('\n') 100 | case syntax.InstEmptyWidth: 101 | switch syntax.EmptyOp(inst.Arg) { 102 | case syntax.EmptyBeginLine, syntax.EmptyEndLine: 103 | addRune('\n') 104 | case syntax.EmptyWordBoundary, syntax.EmptyNoWordBoundary: 105 | addRuneRange('A', 'Z') 106 | addRuneRange('a', 'Z') 107 | addRuneRange('0', '9') 108 | addRune('_') 109 | } 110 | } 111 | } 112 | 113 | divides := make([]rune, 0, len(rangemark)) 114 | divides = append(divides, -1) 115 | for r := range rangemark { 116 | divides = append(divides, r) 117 | } 118 | runeSlice(divides).Sort() 119 | rm.divides = divides 120 | rm.bytemap = make([]int, 256) 121 | k := 0 122 | for i := range rm.bytemap { 123 | if rangemark[rune(i)] { 124 | k++ 125 | } 126 | rm.bytemap[i] = k 127 | } 128 | } 129 | 130 | // runeSlice exists to permit sorting the case-folded rune sets. 131 | type runeSlice []rune 132 | 133 | func (p runeSlice) Len() int { return len(p) } 134 | func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } 135 | func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 136 | 137 | // Sort is a convenience method. 138 | func (p runeSlice) Sort() { 139 | sort.Sort(p) 140 | } 141 | -------------------------------------------------------------------------------- /internal/dfa/search.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package dfa 6 | 7 | import ( 8 | "sync" 9 | "errors" 10 | "math" 11 | "matloob.io/regexp/internal/input" 12 | "matloob.io/regexp/syntax" 13 | ) 14 | 15 | type Searcher struct { 16 | mu sync.Mutex 17 | re *syntax.Regexp 18 | prog *syntax.Prog 19 | prefixer input.Prefixer 20 | fdfa, ldfa, revdfa *DFA 21 | } 22 | 23 | func (s *Searcher) Init(prog *syntax.Prog, expr *syntax.Regexp, p input.Prefixer) { 24 | s.prog = prog 25 | s.re = expr 26 | s.prefixer = p 27 | } 28 | 29 | var errNotDFA = errors.New("can't use dfa") 30 | 31 | func (s *Searcher) Search(i input.Input, pos int, longest bool, matchcap *[]int, ncap int) (bool, error) { 32 | const budget = (2 << 20)/3 33 | rinput, ok := i.(input.Rinput) 34 | if !ok { 35 | return false, errNotDFA 36 | } 37 | var dfa *DFA 38 | if longest { 39 | s.mu.Lock() 40 | if s.ldfa == nil { 41 | s.ldfa = newDFA(s.prog, longestMatch, budget) 42 | s.ldfa.prefixer = s.prefixer 43 | } 44 | dfa = s.ldfa 45 | s.mu.Unlock() 46 | } else { 47 | s.mu.Lock() 48 | if s.fdfa == nil { 49 | s.fdfa = newDFA(s.prog, firstMatch, budget) 50 | s.fdfa.prefixer = s.prefixer 51 | } 52 | dfa = s.fdfa 53 | s.mu.Unlock() 54 | } 55 | var revdfa *DFA 56 | if s.revdfa == nil { 57 | s.mu.Lock() 58 | revprog, err := syntax.CompileReversed(s.re) 59 | if err != nil { 60 | panic("CompileReversed failed") 61 | } 62 | s.revdfa = newDFA(revprog, longestMatch, budget) 63 | s.mu.Unlock() 64 | } 65 | s.mu.Lock() 66 | revdfa = s.revdfa 67 | s.mu.Unlock() 68 | 69 | var matched bool 70 | *matchcap = (*matchcap)[:ncap] 71 | p, ep, matched, err := search(dfa, revdfa, rinput, pos) 72 | if err != nil { 73 | return false, errNotDFA 74 | } 75 | if ncap > 0 { 76 | (*matchcap)[0], (*matchcap)[1] = p, ep 77 | } 78 | return matched, nil 79 | } 80 | 81 | type searchParams struct { 82 | input input.Rinput 83 | startpos int 84 | anchored bool 85 | wantEarliestMatch bool 86 | runForward bool 87 | start *State 88 | firstbyte int64 // int64 to be compatible with atomic ops 89 | failed 
bool // "out" parameter: whether search gave up 90 | ep int // "out" parameter: end pointer for match 91 | 92 | matches []int 93 | } 94 | 95 | func isanchored(prog *syntax.Prog) bool { 96 | return prog.StartCond() & syntax.EmptyBeginText != 0 97 | } 98 | 99 | func search(d, reversed *DFA, i input.Rinput, startpos int) (start int, end int, matched bool, err error) { 100 | params := searchParams{} 101 | params.startpos = startpos 102 | params.wantEarliestMatch = false 103 | params.input = i 104 | params.anchored = isanchored(d.prog) 105 | params.runForward = true 106 | params.ep = int(math.MaxInt64) 107 | if !d.analyzeSearch(¶ms) { 108 | return -1, -1, false, errors.New("analyze search failed on forward DFA") 109 | } 110 | b := d.searchLoop(¶ms) 111 | if params.failed { 112 | return -1, -1, false, errFallBack 113 | } 114 | if !b { 115 | return -1, -1, false, nil 116 | } 117 | end = params.ep 118 | 119 | params = searchParams{} 120 | params.startpos = startpos 121 | params.ep = end 122 | params.anchored = true 123 | params.input = i 124 | params.runForward = false 125 | if !reversed.analyzeSearch(¶ms) { 126 | return -2, -2, false, errors.New("analyze search failed on reverse DFA") 127 | } 128 | b = reversed.searchLoop(¶ms) 129 | if DebugDFA { 130 | DebugPrintf("\nkind %d\n%v\n", d.kind, d.prog) 131 | } 132 | if params.failed { 133 | return -1, -1, false, errFallBack 134 | } 135 | return params.ep, end, b, nil 136 | } -------------------------------------------------------------------------------- /internal/dfa/state.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dfa 6 | 7 | // TODO(matloob): rename all the upper-case identifiers to lower-case. 

import (
	"bytes"
	"strconv"
	"sync"
)

// just use ints instead of stateinst??
type stateInst int

// State is one DFA state.
type State struct {
	mu sync.Mutex

	// Instruction pointers in the state.
	// TODO(matloob): Should these have a different type?
	inst []int

	// Empty string bitfield flags in effect on the way
	// into this state, along with FlagMatch if this is
	// a matching state.
	flag flag

	// Outgoing arrows from State, one per input byte class.
	next []*State
}

// isMatch reports whether the flagMatch bit is set on the state.
func (s *State) isMatch() bool {
	return s.flag&flagMatch != 0
}

type flag uint32

var (
	flagEmptyMask = flag(0xFFF)
	flagMatch     = flag(0x1000)
	flagLastWord  = flag(0x2000)
	flagNeedShift = flag(16)
)

// Special "firstbyte" values for a state.  (Values >= 0 denote actual bytes.)
const (
	fbUnknown int64 = -1 // No analysis has been performed.
	fbMany    int64 = -2 // Many bytes will lead out of this state.
	fbNone    int64 = -3 // No bytes lead out of this state.
)

const (
	// Indices into start for unanchored searches.
	// Add startAnchored for anchored searches.
	startBeginText       = 0
	startBeginLine       = 2
	startWordBoundary    = 4
	startNonWordBoundary = 6
	maxStart             = 8

	kStartAnchored = 1
)

const mark = -1

// TODO(matloob): in RE2 deadState and fullMatchState are (State*)(1) and (State*)(2)
// respectively. Is it cheaper to compare with those numbers, than these states?
// Do we need to import package unsafe?
var deadState = &State{}
var fullMatchState = &State{}

// isSpecialState reports whether s is nil or one of the two sentinel states.
func isSpecialState(s *State) bool {
	// see above. cc does int comparison because deadState and fullMatchState
	// are special numbers, but that's unsafe.
	// TODO(matloob): convert states back to numbers. (pointers into state array state(-2) and state(-1))
	switch s {
	case nil, deadState, fullMatchState:
		return true
	}
	return false
}

// Dump renders the state for debugging: "_" for nil, "X" for the dead state,
// "*" for the full-match state, otherwise the instruction ids (comma
// separated, "|" for each mark) followed by the flag word in hex.
func (s *State) Dump() string {
	if s == nil {
		return "_"
	}
	if s == deadState {
		return "X"
	}
	if s == fullMatchState {
		return "*"
	}

	var out bytes.Buffer
	out.WriteString("(0x)")
	// out.WriteString(fmt.Sprintf("(%p)", s)
	needSep := false
	for idx := range s.inst {
		id := s.inst[idx]
		if id == int(mark) {
			out.WriteString("|")
			needSep = false
			continue
		}
		if needSep {
			out.WriteString(",")
		}
		out.WriteString(strconv.Itoa(id))
		needSep = true
	}
	out.WriteString("flag=0x")
	out.WriteString(strconv.FormatUint(uint64(s.flag), 16))
	return out.String()
}

// stateSet is a fixed-capacity store of states; the inst and next slices of
// every state are carved out of instpool and nextpool.
type stateSet struct {
	states []State

	instpool []int
	instpos  int

	nextpool []*State
	nextpos  int
}

// init sizes the three backing arrays from budget, using a rough per-state
// cost estimate (struct, next slots and inst slots).
func (s *stateSet) init(budget int, runeRanges int, proglen int, nmark int) {
	// estimate State size to avoid using unsafe
	const (
		intsize   = 8
		slicesize = 3 * intsize
		statesize = 2*slicesize + intsize
	)

	// the cost of one state including the inst and next slices
	instPerState := proglen + nmark
	onestate := statesize + runeRanges*intsize + instPerState*intsize
	numstates := budget / onestate

	// TODO(matloob): actually use budget number
	s.states = make([]State, 0, numstates)
	s.instpool, s.instpos = make([]int, 0, instPerState*numstates), 0
	s.nextpool, s.nextpos = make([]*State, 0, runeRanges*numstates), 0
}

// clear clears the state cache. Must hold the DFA's cache mutex to call clear.
139 | func (s *stateSet) clear() { 140 | s.states = s.states[:0] 141 | s.instpool = s.instpool[:0] 142 | s.nextpool = s.nextpool[:0] 143 | } 144 | 145 | func (s *stateSet) find(inst []int, flag flag) *State { 146 | loop: 147 | for i := range s.states { 148 | if len(s.states[i].inst) != len(inst) { 149 | continue 150 | } 151 | for j := range inst { 152 | if s.states[i].inst[j] != inst[j] { 153 | continue loop 154 | } 155 | } 156 | if s.states[i].flag != flag { 157 | continue 158 | } 159 | return &s.states[i] 160 | } 161 | return nil 162 | } 163 | 164 | func (s *stateSet) size() int { 165 | return len(s.states) 166 | } 167 | 168 | func (s *stateSet) insert(inst []int, flag flag, nextsize int) *State { 169 | if len(s.states)+1 > cap(s.states) || 170 | s.instpos+len(inst) > cap(s.instpool) || 171 | s.nextpos+nextsize > cap(s.nextpool) { 172 | // state cache is full 173 | return nil 174 | } 175 | 176 | // TODO(matloob): can we insert? 177 | i := len(s.states) 178 | s.states = s.states[:i+1] 179 | state := &s.states[i] 180 | 181 | instsize := len(inst) 182 | state.inst = s.instpool[s.instpos : s.instpos+instsize] 183 | s.instpos += instsize 184 | copy(state.inst, inst) 185 | 186 | state.flag = flag 187 | 188 | state.next = s.nextpool[s.nextpos : s.nextpos+nextsize] 189 | s.nextpos += nextsize 190 | for i := range state.next { 191 | state.next[i] = nil 192 | } 193 | 194 | return state 195 | } 196 | 197 | type startInfo struct { 198 | start *State 199 | firstbyte int64 200 | } 201 | 202 | type stateSaver struct { 203 | dfa *DFA 204 | inst []int 205 | flag flag 206 | isSpecial bool 207 | special *State // if it's a special state special != nil 208 | } 209 | 210 | func (s *stateSaver) Save(dfa *DFA, state *State) { 211 | s.dfa = dfa 212 | if isSpecialState(state) { 213 | s.inst = nil 214 | s.flag = 0 215 | s.special = state 216 | s.isSpecial = true 217 | } 218 | s.isSpecial = false 219 | s.flag = state.flag 220 | 221 | s.inst = s.inst[:0] 222 | s.inst = append(s.inst, 
state.inst...) 223 | } 224 | 225 | func (s *stateSaver) Restore() *State { 226 | if s.isSpecial { 227 | return s.special 228 | } 229 | s.dfa.mu.Lock() 230 | state := s.dfa.cachedState(s.inst, s.flag) 231 | s.inst = nil 232 | s.dfa.mu.Unlock() 233 | return state 234 | } 235 | -------------------------------------------------------------------------------- /internal/dfa/workq.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dfa 6 | 7 | import ( 8 | "bytes" 9 | "strconv" 10 | ) 11 | 12 | type sparseSet struct { 13 | sparseToDense []int 14 | dense []int 15 | } 16 | 17 | func makeSparseSet(maxSize int) sparseSet { 18 | // s.maxSize = maxSize // not necessary, right? 19 | return sparseSet{ 20 | sparseToDense: make([]int, maxSize), 21 | dense: make([]int, maxSize), 22 | } 23 | } 24 | 25 | func (s *sparseSet) resize(newMaxSize int) { 26 | // TODO(matloob): Use slice length instead of size for 'dense'. 27 | // Use cap instead of maxSize for both. 
28 | size := len(s.dense) 29 | if size > newMaxSize { 30 | size = newMaxSize 31 | } 32 | if newMaxSize > len(s.sparseToDense) { 33 | a := make([]int, newMaxSize) 34 | if s.sparseToDense != nil { 35 | copy(a, s.sparseToDense) 36 | } 37 | s.sparseToDense = a 38 | 39 | a = make([]int, size, newMaxSize) 40 | if s.dense != nil { 41 | copy(a, s.dense) 42 | } 43 | s.dense = a 44 | } 45 | } 46 | 47 | func (s *sparseSet) maxSize() int { 48 | return cap(s.dense) 49 | } 50 | 51 | func (s *sparseSet) clear() { 52 | s.dense = s.dense[:0] 53 | } 54 | 55 | func (s *sparseSet) contains(i int) bool { 56 | if i >= len(s.sparseToDense) { 57 | return false 58 | } 59 | return s.sparseToDense[i] < len(s.dense) && s.dense[s.sparseToDense[i]] == i 60 | } 61 | 62 | func (s *sparseSet) insert(i int) { 63 | if s.contains(i) { 64 | return 65 | } 66 | s.insertNew(i) 67 | } 68 | 69 | func (s *sparseSet) insertNew(i int) { 70 | if i >= len(s.sparseToDense) { 71 | return 72 | } 73 | // There's a CHECK here that size < maxSize... 
74 | 75 | s.sparseToDense[i] = len(s.dense) 76 | s.dense = s.dense[:len(s.dense)+1] 77 | s.dense[len(s.dense)-1] = i 78 | } 79 | 80 | type workq struct { 81 | s sparseSet 82 | n int // size excluding marks 83 | maxm int // maximum number of marks 84 | nextm int // id of next mark 85 | lastWasMark bool // last inserted was mark 86 | } 87 | 88 | func newWorkq(n, maxmark int) *workq { 89 | return &workq{ 90 | s: makeSparseSet(n + maxmark), 91 | n: n, 92 | maxm: maxmark, 93 | nextm: n, 94 | lastWasMark: true, 95 | } 96 | } 97 | 98 | func (q *workq) isMark(i int) bool { return i >= q.n } 99 | 100 | func (q *workq) clear() { 101 | q.s.clear() 102 | q.nextm = q.n 103 | } 104 | 105 | func (q *workq) contains(i int) bool { 106 | return q.s.contains(i) 107 | } 108 | 109 | func (q *workq) maxmark() int { 110 | return q.maxm 111 | } 112 | 113 | func (q *workq) mark() { 114 | if q.lastWasMark { 115 | return 116 | } 117 | q.lastWasMark = false 118 | q.s.insertNew(int(q.nextm)) 119 | q.nextm++ 120 | } 121 | 122 | func (q *workq) size() int { 123 | return q.n + q.maxm 124 | } 125 | 126 | func (q *workq) insert(id int) { 127 | if q.s.contains(id) { 128 | return 129 | } 130 | q.insertNew(id) 131 | } 132 | 133 | func (q *workq) insertNew(id int) { 134 | q.lastWasMark = false 135 | q.s.insertNew(id) 136 | } 137 | 138 | func (q *workq) elements() []int { // should be []stateInst. Should we convert sparseset to use stateInst instead of int?? 
139 | return q.s.dense 140 | } 141 | 142 | func (q *workq) dump() string { 143 | var buf bytes.Buffer 144 | sep := "" 145 | for _, v := range q.elements() { 146 | if q.isMark(v) { 147 | buf.WriteString("|") 148 | sep = "" 149 | } else { 150 | buf.WriteString(sep) 151 | buf.WriteString(strconv.Itoa(v)) 152 | sep = "," 153 | } 154 | } 155 | return buf.String() 156 | } 157 | -------------------------------------------------------------------------------- /internal/input/input.go: -------------------------------------------------------------------------------- 1 | package input 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "matloob.io/regexp/syntax" 7 | "strings" 8 | "unicode/utf8" 9 | ) 10 | 11 | type Prefixer interface { 12 | Prefix() string 13 | PrefixBytes() []byte 14 | } 15 | 16 | const EndOfText rune = -1 17 | const StartOfText rune = -2 18 | 19 | // Input abstracts different representations of the input text. It provides 20 | // one-character lookahead. 21 | type Input interface { 22 | // Step returns the rune starting at pos and its width. Unless 23 | // CanCheckPrefix is true, Step should always be called with 24 | // with the current position in the string, which is the sum 25 | // of the previous pos Step was called with, and the width 26 | // returned by that call. 27 | Step(pos int) (r rune, width int) 28 | 29 | // CanCheckInput reports whether we can look ahead without losing info. 30 | CanCheckPrefix() bool 31 | 32 | // HasPrefix reports whether the input has the prefix reported 33 | // by the Prefixer. 34 | HasPrefix(p Prefixer) bool 35 | 36 | // Index returns the index of the first occurence of the 37 | // prefix following pos, or -1 if it can't be found. 38 | Index(p Prefixer, pos int) int 39 | 40 | // Context returns the EmptyOp flags satisfied by the context at pos. 41 | Context(pos int) syntax.EmptyOp 42 | } 43 | 44 | type Rinput interface { 45 | Input 46 | 47 | Rstep(pos int) (r rune, width int) 48 | } 49 | 50 | // InputString scans a string. 
51 | type InputString struct { 52 | str string 53 | } 54 | 55 | // Reset resets the InputString with the given string. 56 | func (i *InputString) Reset(str string) { 57 | i.str = str 58 | } 59 | 60 | func (i *InputString) Step(pos int) (rune, int) { 61 | if pos < 0 { 62 | return StartOfText, 0 63 | } 64 | if pos < len(i.str) { 65 | c := i.str[pos] 66 | if c < utf8.RuneSelf { 67 | return rune(c), 1 68 | } 69 | return utf8.DecodeRuneInString(i.str[pos:]) 70 | } 71 | return EndOfText, 0 72 | } 73 | 74 | func (i *InputString) Rstep(pos int) (rune, int) { 75 | if pos > len(i.str) { 76 | return StartOfText, 0 77 | } 78 | if pos >= 0 { 79 | c := i.str[pos-1] 80 | if c < utf8.RuneSelf { 81 | return rune(c), 1 82 | } 83 | return utf8.DecodeLastRuneInString(i.str[:pos]) 84 | } 85 | return EndOfText, 0 86 | } 87 | 88 | func (i *InputString) CanCheckPrefix() bool { 89 | return true 90 | } 91 | 92 | func (i *InputString) HasPrefix(p Prefixer) bool { 93 | return strings.HasPrefix(i.str, p.Prefix()) 94 | } 95 | 96 | func (i *InputString) Index(p Prefixer, pos int) int { 97 | return strings.Index(i.str[pos:], p.Prefix()) 98 | } 99 | 100 | func (i *InputString) Context(pos int) syntax.EmptyOp { 101 | r1, r2 := EndOfText, EndOfText 102 | if pos > 0 && pos <= len(i.str) { 103 | r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 104 | } 105 | if pos < len(i.str) { 106 | r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 107 | } 108 | return syntax.EmptyOpContext(r1, r2) 109 | } 110 | 111 | // InputBytes scans a byte slice. 112 | type InputBytes struct { 113 | str []byte 114 | } 115 | 116 | // Reset resets the InputBytes with the given byte slice. 
117 | func (i *InputBytes) Reset(str []byte) { 118 | i.str = str 119 | } 120 | 121 | func (i *InputBytes) Step(pos int) (rune, int) { 122 | if pos < 0 { 123 | return StartOfText, 0 124 | } 125 | if pos < len(i.str) { 126 | c := i.str[pos] 127 | if c < utf8.RuneSelf { 128 | return rune(c), 1 129 | } 130 | return utf8.DecodeRune(i.str[pos:]) 131 | } 132 | return EndOfText, 0 133 | } 134 | 135 | func (i *InputBytes) Rstep(pos int) (rune, int) { 136 | if pos > len(i.str) { 137 | return StartOfText, 0 138 | } 139 | if pos >= 0 { 140 | c := i.str[pos-1] 141 | if c < utf8.RuneSelf { 142 | return rune(c), 1 143 | } 144 | return utf8.DecodeLastRune(i.str[:pos]) // This doesn't include pos char? 145 | } 146 | return EndOfText, 0 147 | } 148 | 149 | 150 | func (i *InputBytes) CanCheckPrefix() bool { 151 | return true 152 | } 153 | 154 | func (i *InputBytes) HasPrefix(p Prefixer) bool { 155 | return bytes.HasPrefix(i.str, p.PrefixBytes()) 156 | } 157 | 158 | func (i *InputBytes) Index(p Prefixer, pos int) int { 159 | if pos > len(i.str) { 160 | panic("pos > len i.str") 161 | } 162 | if i.str == nil { 163 | panic("i.str nil") 164 | } 165 | if p == nil { 166 | panic("p is nil") 167 | } 168 | return bytes.Index(i.str[pos:], p.PrefixBytes()) 169 | } 170 | 171 | func (i *InputBytes) Context(pos int) syntax.EmptyOp { 172 | r1, r2 := EndOfText, EndOfText 173 | if pos > 0 && pos <= len(i.str) { 174 | r1, _ = utf8.DecodeLastRune(i.str[:pos]) 175 | } 176 | if pos < len(i.str) { 177 | r2, _ = utf8.DecodeRune(i.str[pos:]) 178 | } 179 | return syntax.EmptyOpContext(r1, r2) 180 | } 181 | 182 | // InputReader scans a RuneReader. 183 | type InputReader struct { 184 | r io.RuneReader 185 | atEOT bool 186 | pos int 187 | } 188 | 189 | // Reset resets the InputReader with the given RuneReader. 
func (i *InputReader) Reset(r io.RuneReader) {
	i.r = r
	i.atEOT = false
	i.pos = 0
}

// Step reads the next rune from the underlying reader and advances the
// tracked position by its width. It returns (EndOfText, 0) without reading
// when pos differs from the tracked position (and the end has not been
// reached yet), and also when ReadRune fails; a failure additionally sets
// atEOT.
func (i *InputReader) Step(pos int) (rune, int) {
	if !i.atEOT && pos != i.pos {
		return EndOfText, 0

	}
	r, w, err := i.r.ReadRune()
	if err != nil {
		i.atEOT = true
		return EndOfText, 0
	}
	i.pos += w
	return r, w
}

// CanCheckPrefix always reports false for an InputReader.
func (i *InputReader) CanCheckPrefix() bool {
	return false
}

// HasPrefix always reports false for an InputReader.
func (i *InputReader) HasPrefix(p Prefixer) bool {
	return false
}

// Index always returns -1 for an InputReader.
func (i *InputReader) Index(p Prefixer, pos int) int {
	return -1
}

// Context always returns 0 (no EmptyOp flags) for an InputReader.
func (i *InputReader) Context(pos int) syntax.EmptyOp {
	return 0
}

--------------------------------------------------------------------------------
/onepass.go:
--------------------------------------------------------------------------------
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package regexp

import (
	"bytes"
	"matloob.io/regexp/syntax"
	"sort"
	"unicode"
)

// "One-pass" regexp execution.
// Some regexps can be analyzed to determine that they never need
// backtracking: they are guaranteed to run in one pass over the string
// without bothering to save all the usual NFA state.
// Detect those and execute them more quickly.

// A onePassProg is a compiled one-pass regular expression program.
// It is the same as syntax.Prog except for the use of onePassInst.
type onePassProg struct {
	Inst   []onePassInst
	Start  int // index of start instruction
	NumCap int // number of InstCapture insts in re
}

// A onePassInst is a single instruction in a one-pass regular expression program.
29 | // It is the same as syntax.Inst except for the new 'Next' field. 30 | type onePassInst struct { 31 | syntax.Inst 32 | Next []uint32 33 | } 34 | 35 | // OnePassPrefix returns a literal string that all matches for the 36 | // regexp must start with. Complete is true if the prefix 37 | // is the entire match. Pc is the index of the last rune instruction 38 | // in the string. The OnePassPrefix skips over the mandatory 39 | // EmptyBeginText 40 | func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) { 41 | i := &p.Inst[p.Start] 42 | if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 { 43 | return "", i.Op == syntax.InstMatch, uint32(p.Start) 44 | } 45 | pc = i.Out 46 | i = &p.Inst[pc] 47 | for i.Op == syntax.InstNop { 48 | pc = i.Out 49 | i = &p.Inst[pc] 50 | } 51 | // Avoid allocation of buffer if prefix is empty. 52 | if iop(i) != syntax.InstRune || len(i.Rune) != 1 { 53 | return "", i.Op == syntax.InstMatch, uint32(p.Start) 54 | } 55 | 56 | // Have prefix; gather characters. 57 | var buf bytes.Buffer 58 | for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 { 59 | buf.WriteRune(i.Rune[0]) 60 | pc, i = i.Out, &p.Inst[i.Out] 61 | } 62 | if i.Op == syntax.InstEmptyWidth && 63 | syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 && 64 | p.Inst[i.Out].Op == syntax.InstMatch { 65 | complete = true 66 | } 67 | return buf.String(), complete, pc 68 | } 69 | 70 | // OnePassNext selects the next actionable state of the prog, based on the input character. 71 | // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine. 72 | // One of the alternates may ultimately lead without input to end of line. If the instruction 73 | // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next. 
74 | func onePassNext(i *onePassInst, r rune) uint32 { 75 | next := i.MatchRunePos(r) 76 | if next >= 0 { 77 | return i.Next[next] 78 | } 79 | if i.Op == syntax.InstAltMatch { 80 | return i.Out 81 | } 82 | return 0 83 | } 84 | 85 | func iop(i *syntax.Inst) syntax.InstOp { 86 | op := i.Op 87 | switch op { 88 | case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 89 | op = syntax.InstRune 90 | } 91 | return op 92 | } 93 | 94 | // Sparse Array implementation is used as a queueOnePass. 95 | type queueOnePass struct { 96 | sparse []uint32 97 | dense []uint32 98 | size, nextIndex uint32 99 | } 100 | 101 | func (q *queueOnePass) empty() bool { 102 | return q.nextIndex >= q.size 103 | } 104 | 105 | func (q *queueOnePass) next() (n uint32) { 106 | n = q.dense[q.nextIndex] 107 | q.nextIndex++ 108 | return 109 | } 110 | 111 | func (q *queueOnePass) clear() { 112 | q.size = 0 113 | q.nextIndex = 0 114 | } 115 | 116 | func (q *queueOnePass) contains(u uint32) bool { 117 | if u >= uint32(len(q.sparse)) { 118 | return false 119 | } 120 | return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u 121 | } 122 | 123 | func (q *queueOnePass) insert(u uint32) { 124 | if !q.contains(u) { 125 | q.insertNew(u) 126 | } 127 | } 128 | 129 | func (q *queueOnePass) insertNew(u uint32) { 130 | if u >= uint32(len(q.sparse)) { 131 | return 132 | } 133 | q.sparse[u] = q.size 134 | q.dense[q.size] = u 135 | q.size++ 136 | } 137 | 138 | func newQueue(size int) (q *queueOnePass) { 139 | return &queueOnePass{ 140 | sparse: make([]uint32, size), 141 | dense: make([]uint32, size), 142 | } 143 | } 144 | 145 | // mergeRuneSets merges two non-intersecting runesets, and returns the merged result, 146 | // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index 147 | // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a 148 | // NextIp array with the single element mergeFailed is returned. 
// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
const mergeFailed = uint32(0xffffffff)

var (
	noRune = []rune{}
	noNext = []uint32{mergeFailed}
)

// mergeRuneSets interleaves the (lo, hi) pairs of *leftRunes and *rightRunes
// in ascending order of lo. For every emitted pair it records leftPC or
// rightPC, depending on which side the pair came from, at the same index in
// the returned next slice.
//
// If a pair starts at or before the end of the previously emitted pair the
// sets intersect and (noRune, noNext) is returned. It panics if either input
// has odd length.
func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
	left, right := *leftRunes, *rightRunes
	if len(left)&0x1 != 0 || len(right)&0x1 != 0 {
		panic("mergeRuneSets odd length []rune")
	}

	// The output holds at most every input pair, so size it once.
	total := len(left) + len(right)
	merged := make([]rune, 0, total)
	next := make([]uint32, 0, total/2)

	// take appends the pair at src[*pos] and advances *pos past it. It
	// refuses (returns false) when that pair does not start strictly after
	// the upper bound of the last pair already merged.
	take := func(src []rune, pos *int, pc uint32) bool {
		if n := len(merged); n > 0 && src[*pos] <= merged[n-1] {
			return false
		}
		merged = append(merged, src[*pos], src[*pos+1])
		next = append(next, pc)
		*pos += 2
		return true
	}

	var lx, rx int
	for lx < len(left) || rx < len(right) {
		var ok bool
		switch {
		case rx >= len(right):
			ok = take(left, &lx, leftPC)
		case lx >= len(left):
			ok = take(right, &rx, rightPC)
		case right[rx] < left[lx]:
			ok = take(right, &rx, rightPC)
		default:
			ok = take(left, &lx, leftPC)
		}
		if !ok {
			return noRune, noNext
		}
	}
	return merged, next
}

// cleanupOnePass drops working memory, and restores certain shortcut instructions.
func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
	for ix, instOriginal := range original.Inst {
		switch instOriginal.Op {
		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
			// These keep the Next table built by makeOnePass.
		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
			prog.Inst[ix].Next = nil
		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
			// The whole instruction is replaced by the original one (whose
			// Next is the zero value), so the preceding assignment is
			// redundant but harmless.
			prog.Inst[ix].Next = nil
			prog.Inst[ix] = onePassInst{Inst: instOriginal}
		}
	}
}

// onePassCopy creates a copy of the original Prog, as we'll be modifying it.
// Every syntax.Inst is wrapped in a onePassInst, after which a few common
// Alt patterns are rewritten in place (see the comments in the loop).
func onePassCopy(prog *syntax.Prog) *onePassProg {
	p := &onePassProg{
		Start:  prog.Start,
		NumCap: prog.NumCap,
	}
	for _, inst := range prog.Inst {
		p.Inst = append(p.Inst, onePassInst{Inst: inst})
	}

	// rewrites one or more common Prog constructs that enable some otherwise
	// non-onepass Progs to be onepass. A:BC (for example) means an InstAlt at
	// ip A, that points to ips B & C.
	// A:BC + B:DA => A:BC + B:CD
	// A:BC + B:DC => A:DC + B:DC
	for pc := range p.Inst {
		switch p.Inst[pc].Op {
		default:
			continue
		case syntax.InstAlt, syntax.InstAltMatch:
			// A:Bx + B:Ay
			// p_A_Alt / p_A_Other point at the two legs of A; they are
			// swapped below so that p_A_Alt is the leg that targets an Alt.
			p_A_Other := &p.Inst[pc].Out
			p_A_Alt := &p.Inst[pc].Arg
			// make sure a target is another Alt
			instAlt := p.Inst[*p_A_Alt]
			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
				instAlt = p.Inst[*p_A_Alt]
				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
					continue
				}
			}
			instOther := p.Inst[*p_A_Other]
			// Analyzing both legs pointing to Alts is for another day
			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
				// too complicated
				continue
			}
			// simple empty transition loop
			// A:BC + B:DA => A:BC + B:DC
			// instAlt is a copy of B taken above; the p_B_* pointers refer
			// to the live instruction so the patch is visible in p.
			p_B_Alt := &p.Inst[*p_A_Alt].Out
			p_B_Other := &p.Inst[*p_A_Alt].Arg
			patch := false
			if instAlt.Out == uint32(pc) {
				patch = true
			} else if instAlt.Arg == uint32(pc) {
				patch = true
				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
			}
			if patch {
				// Redirect B's back-edge to A at A's non-Alt leg.
				*p_B_Alt = *p_A_Other
			}

			// empty transition to common target
			// A:BC + B:DC => A:DC + B:DC
			if *p_A_Other == *p_B_Alt {
				*p_A_Alt = *p_B_Other
			}
		}
	}
	return p
}

// runeSlice exists to permit sorting the case-folded rune sets.
type runeSlice []rune

// Len, Less and Swap implement sort.Interface, ordering runes ascending.
func (p runeSlice) Len() int           { return len(p) }
func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }

// Sort is a convenience method.
func (p runeSlice) Sort() {
	sort.Sort(p)
}

// Rune-pair tables (lo, hi, lo, hi, ...) used for the "any rune" opcodes.
var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
var anyRune = []rune{0, unicode.MaxRune}

// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
// the match engine can always tell which branch to take. The routine may modify
// p if it is turned into a onepass Prog. If it isn't possible for this to be a
// onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
// to the size of the Prog.
func makeOnePass(p *onePassProg) *onePassProg {
	// If the machine is very long, it's not worth the time to check if we can use one pass.
	if len(p.Inst) >= 1000 {
		return notOnePass
	}

	var (
		// instQueue holds the instructions still to be checked; visitQueue
		// records what the current check call chain has already visited.
		instQueue    = newQueue(len(p.Inst))
		visitQueue   = newQueue(len(p.Inst))
		check        func(uint32, map[uint32]bool) bool
		onePassRunes = make([][]rune, len(p.Inst))
	)

	// check that paths from Alt instructions are unambiguous, and rebuild the new
	// program as a onepass program
	// m[pc] records whether pc can reach InstMatch without consuming input.
	check = func(pc uint32, m map[uint32]bool) (ok bool) {
		ok = true
		inst := &p.Inst[pc]
		if visitQueue.contains(pc) {
			return
		}
		visitQueue.insert(pc)
		switch inst.Op {
		case syntax.InstAlt, syntax.InstAltMatch:
			ok = check(inst.Out, m) && check(inst.Arg, m)
			// check no-input paths to InstMatch
			matchOut := m[inst.Out]
			matchArg := m[inst.Arg]
			if matchOut && matchArg {
				ok = false
				break
			}
			// Match on empty goes in inst.Out
			if matchArg {
				inst.Out, inst.Arg = inst.Arg, inst.Out
				matchOut, matchArg = matchArg, matchOut
			}
			if matchOut {
				m[pc] = true
				inst.Op = syntax.InstAltMatch
			}

			// build a dispatch operator from the two legs of the alt.
			onePassRunes[pc], inst.Next = mergeRuneSets(
				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
				ok = false
				break
			}
		case syntax.InstCapture, syntax.InstNop:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			// pass matching runes back through these no-ops.
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			inst.Next = []uint32{}
			// One Next entry per rune pair, plus one extra (loop runs
			// len/2+1 times); every entry targets inst.Out.
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case syntax.InstEmptyWidth:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case syntax.InstMatch, syntax.InstFail:
			m[pc] = inst.Op == syntax.InstMatch
			break
		case syntax.InstRune:
			m[pc] = false
			// A non-empty Next means this instruction was already processed.
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			if len(inst.Rune) == 0 {
				onePassRunes[pc] = []rune{}
				inst.Next = []uint32{inst.Out}
				break
			}
			runes := make([]rune, 0)
			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				// Expand a single case-folded rune into one (r, r) pair per
				// member of its simple-fold orbit, then sort.
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune...)
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			inst.Op = syntax.InstRune
		case syntax.InstRune1:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			runes := []rune{}
			// expand case-folded runes
			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune[0], inst.Rune[0])
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			// Rewritten as a general InstRune; cleanupOnePass restores
			// the original instruction afterwards.
			inst.Op = syntax.InstRune
		case syntax.InstRuneAny:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRune...)
			inst.Next = []uint32{inst.Out}
		case syntax.InstRuneAnyNotNL:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		}
		return
	}

	// Start at the program entry; check enqueues the target of every
	// rune-consuming instruction it reaches.
	instQueue.clear()
	instQueue.insert(uint32(p.Start))
	m := make(map[uint32]bool, len(p.Inst))
	for !instQueue.empty() {
		visitQueue.clear()
		pc := instQueue.next()
		if !check(pc, m) {
			p = notOnePass
			break
		}
	}
	if p != notOnePass {
		// Install the computed rune tables on the instructions.
		for i := range p.Inst {
			p.Inst[i].Rune = onePassRunes[i]
		}
	}
	return p
}

// notOnePass is the nil sentinel returned when a program cannot be made one-pass.
var notOnePass *onePassProg = nil

// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
// can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
// Prog cannot be converted. For a one pass prog, the fundamental condition that must
// be true is: at any InstAlt, there must be no ambiguity about what branch to take.
472 | func compileOnePass(prog *syntax.Prog) (p *onePassProg) { 473 | if prog.Start == 0 { 474 | return notOnePass 475 | } 476 | // onepass regexp is anchored 477 | if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth || 478 | syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText { 479 | return notOnePass 480 | } 481 | // every instruction leading to InstMatch must be EmptyEndText 482 | for _, inst := range prog.Inst { 483 | opOut := prog.Inst[inst.Out].Op 484 | switch inst.Op { 485 | default: 486 | if opOut == syntax.InstMatch { 487 | return notOnePass 488 | } 489 | case syntax.InstAlt, syntax.InstAltMatch: 490 | if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch { 491 | return notOnePass 492 | } 493 | case syntax.InstEmptyWidth: 494 | if opOut == syntax.InstMatch { 495 | if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText { 496 | continue 497 | } 498 | return notOnePass 499 | } 500 | } 501 | } 502 | // Creates a slightly optimized copy of the original Prog 503 | // that cleans up some Prog idioms that block valid onepass programs 504 | p = onePassCopy(prog) 505 | 506 | // checkAmbiguity on InstAlts, build onepass Prog if possible 507 | p = makeOnePass(p) 508 | 509 | if p != notOnePass { 510 | cleanupOnePass(p, prog) 511 | } 512 | return p 513 | } 514 | -------------------------------------------------------------------------------- /onepass_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package regexp 6 | 7 | import ( 8 | "reflect" 9 | "matloob.io/regexp/syntax" 10 | "testing" 11 | ) 12 | 13 | var runeMergeTests = []struct { 14 | left, right, merged []rune 15 | next []uint32 16 | leftPC, rightPC uint32 17 | }{ 18 | { 19 | // empty rhs 20 | []rune{69, 69}, 21 | []rune{}, 22 | []rune{69, 69}, 23 | []uint32{1}, 24 | 1, 2, 25 | }, 26 | { 27 | // identical runes, identical targets 28 | []rune{69, 69}, 29 | []rune{69, 69}, 30 | []rune{}, 31 | []uint32{mergeFailed}, 32 | 1, 1, 33 | }, 34 | { 35 | // identical runes, different targets 36 | []rune{69, 69}, 37 | []rune{69, 69}, 38 | []rune{}, 39 | []uint32{mergeFailed}, 40 | 1, 2, 41 | }, 42 | { 43 | // append right-first 44 | []rune{69, 69}, 45 | []rune{71, 71}, 46 | []rune{69, 69, 71, 71}, 47 | []uint32{1, 2}, 48 | 1, 2, 49 | }, 50 | { 51 | // append, left-first 52 | []rune{71, 71}, 53 | []rune{69, 69}, 54 | []rune{69, 69, 71, 71}, 55 | []uint32{2, 1}, 56 | 1, 2, 57 | }, 58 | { 59 | // successful interleave 60 | []rune{60, 60, 71, 71, 101, 101}, 61 | []rune{69, 69, 88, 88}, 62 | []rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101}, 63 | []uint32{1, 2, 1, 2, 1}, 64 | 1, 2, 65 | }, 66 | { 67 | // left surrounds right 68 | []rune{69, 74}, 69 | []rune{71, 71}, 70 | []rune{}, 71 | []uint32{mergeFailed}, 72 | 1, 2, 73 | }, 74 | { 75 | // right surrounds left 76 | []rune{69, 74}, 77 | []rune{68, 75}, 78 | []rune{}, 79 | []uint32{mergeFailed}, 80 | 1, 2, 81 | }, 82 | { 83 | // overlap at interval begin 84 | []rune{69, 74}, 85 | []rune{74, 75}, 86 | []rune{}, 87 | []uint32{mergeFailed}, 88 | 1, 2, 89 | }, 90 | { 91 | // overlap ar interval end 92 | []rune{69, 74}, 93 | []rune{65, 69}, 94 | []rune{}, 95 | []uint32{mergeFailed}, 96 | 1, 2, 97 | }, 98 | { 99 | // overlap from above 100 | []rune{69, 74}, 101 | []rune{71, 74}, 102 | []rune{}, 103 | []uint32{mergeFailed}, 104 | 1, 2, 105 | }, 106 | { 107 | // overlap from below 108 | []rune{69, 74}, 109 | []rune{65, 71}, 110 | []rune{}, 111 | []uint32{mergeFailed}, 
112 | 1, 2, 113 | }, 114 | { 115 | // out of order []rune 116 | []rune{69, 74, 60, 65}, 117 | []rune{66, 67}, 118 | []rune{}, 119 | []uint32{mergeFailed}, 120 | 1, 2, 121 | }, 122 | } 123 | 124 | func TestMergeRuneSet(t *testing.T) { 125 | for ix, test := range runeMergeTests { 126 | merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC) 127 | if !reflect.DeepEqual(merged, test.merged) { 128 | t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged) 129 | } 130 | if !reflect.DeepEqual(next, test.next) { 131 | t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next) 132 | } 133 | } 134 | } 135 | 136 | var onePass = &onePassProg{} 137 | 138 | var onePassTests = []struct { 139 | re string 140 | onePass *onePassProg 141 | }{ 142 | {`^(?:a|(?:a*))$`, notOnePass}, 143 | {`^(?:(a)|(?:a*))$`, notOnePass}, 144 | {`^(?:(?:(?:.(?:$))?))$`, onePass}, 145 | {`^abcd$`, onePass}, 146 | {`^(?:(?:a{0,})*?)$`, onePass}, 147 | {`^(?:(?:a+)*)$`, onePass}, 148 | {`^(?:(?:a|(?:aa)))$`, onePass}, 149 | {`^(?:[^\s\S])$`, onePass}, 150 | {`^(?:(?:a{3,4}){0,})$`, notOnePass}, 151 | {`^(?:(?:(?:a*)+))$`, onePass}, 152 | {`^[a-c]+$`, onePass}, 153 | {`^[a-c]*$`, onePass}, 154 | {`^(?:a*)$`, onePass}, 155 | {`^(?:(?:aa)|a)$`, onePass}, 156 | {`^[a-c]*`, notOnePass}, 157 | {`^...$`, onePass}, 158 | {`^(?:a|(?:aa))$`, onePass}, 159 | {`^a((b))c$`, onePass}, 160 | {`^a.[l-nA-Cg-j]?e$`, onePass}, 161 | {`^a((b))$`, onePass}, 162 | {`^a(?:(b)|(c))c$`, onePass}, 163 | {`^a(?:(b*)|(c))c$`, notOnePass}, 164 | {`^a(?:b|c)$`, onePass}, 165 | {`^a(?:b?|c)$`, onePass}, 166 | {`^a(?:b?|c?)$`, notOnePass}, 167 | {`^a(?:b?|c+)$`, onePass}, 168 | {`^a(?:b+|(bc))d$`, notOnePass}, 169 | {`^a(?:bc)+$`, onePass}, 170 | {`^a(?:[bcd])+$`, onePass}, 171 | {`^a((?:[bcd])+)$`, onePass}, 172 | {`^a(:?b|c)*d$`, onePass}, 173 | {`^.bc(d|e)*$`, onePass}, 174 | {`^(?:(?:aa)|.)$`, 
notOnePass}, 175 | {`^(?:(?:a{1,2}){1,2})$`, notOnePass}, 176 | } 177 | 178 | func TestCompileOnePass(t *testing.T) { 179 | var ( 180 | p *syntax.Prog 181 | re *syntax.Regexp 182 | err error 183 | ) 184 | for _, test := range onePassTests { 185 | if re, err = syntax.Parse(test.re, syntax.Perl); err != nil { 186 | t.Errorf("Parse(%q) got err:%s, want success", test.re, err) 187 | continue 188 | } 189 | // needs to be done before compile... 190 | re = re.Simplify() 191 | if p, err = syntax.Compile(re); err != nil { 192 | t.Errorf("Compile(%q) got err:%s, want success", test.re, err) 193 | continue 194 | } 195 | onePass = compileOnePass(p) 196 | if (onePass == notOnePass) != (test.onePass == notOnePass) { 197 | t.Errorf("CompileOnePass(%q) got %v, expected %v", test.re, onePass, test.onePass) 198 | } 199 | } 200 | } 201 | 202 | // TODO(cespare): Unify with onePassTests and rationalize one-pass test cases. 203 | var onePassTests1 = []struct { 204 | re string 205 | match string 206 | }{ 207 | {`^a(/b+(#c+)*)*$`, "a/b#c"}, // golang.org/issue/11905 208 | } 209 | 210 | func TestRunOnePass(t *testing.T) { 211 | for _, test := range onePassTests1 { 212 | re, err := Compile(test.re) 213 | if err != nil { 214 | t.Errorf("Compile(%q): got err: %s", test.re, err) 215 | continue 216 | } 217 | if re.onepass == notOnePass { 218 | t.Errorf("Compile(%q): got notOnePass, want one-pass", test.re) 219 | continue 220 | } 221 | if !re.MatchString(test.match) { 222 | t.Errorf("onepass %q did not match %q", test.re, test.match) 223 | } 224 | } 225 | } 226 | -------------------------------------------------------------------------------- /syntax/compile.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package syntax 6 | 7 | import "unicode" 8 | 9 | // A patchList is a list of instruction pointers that need to be filled in (patched). 10 | // Because the pointers haven't been filled in yet, we can reuse their storage 11 | // to hold the list. It's kind of sleazy, but works well in practice. 12 | // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. 13 | // 14 | // These aren't really pointers: they're integers, so we can reinterpret them 15 | // this way without using package unsafe. A value l denotes 16 | // p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1). 17 | // l == 0 denotes the empty list, okay because we start every program 18 | // with a fail instruction, so we'll never want to point at its output link. 19 | type patchList uint32 20 | 21 | func (l patchList) next(p *Prog) patchList { 22 | i := &p.Inst[l>>1] 23 | if l&1 == 0 { 24 | return patchList(i.Out) 25 | } 26 | return patchList(i.Arg) 27 | } 28 | 29 | func (l patchList) patch(p *Prog, val uint32) { 30 | for l != 0 { 31 | i := &p.Inst[l>>1] 32 | if l&1 == 0 { 33 | l = patchList(i.Out) 34 | i.Out = val 35 | } else { 36 | l = patchList(i.Arg) 37 | i.Arg = val 38 | } 39 | } 40 | } 41 | 42 | func (l1 patchList) append(p *Prog, l2 patchList) patchList { 43 | if l1 == 0 { 44 | return l2 45 | } 46 | if l2 == 0 { 47 | return l1 48 | } 49 | 50 | last := l1 51 | for { 52 | next := last.next(p) 53 | if next == 0 { 54 | break 55 | } 56 | last = next 57 | } 58 | 59 | i := &p.Inst[last>>1] 60 | if last&1 == 0 { 61 | i.Out = uint32(l2) 62 | } else { 63 | i.Arg = uint32(l2) 64 | } 65 | return l1 66 | } 67 | 68 | // A frag represents a compiled program fragment. 69 | type frag struct { 70 | i uint32 // index of first instruction 71 | out patchList // where to record end instruction 72 | } 73 | 74 | type compiler struct { 75 | p *Prog 76 | reversed bool 77 | } 78 | 79 | // Compile compiles the regexp into a program to be executed. 
80 | // The regexp should have been simplified already (returned from re.Simplify). 81 | func Compile(re *Regexp) (*Prog, error) { 82 | var c compiler 83 | c.init() 84 | f := c.compile(re) 85 | f.out.patch(c.p, c.inst(InstMatch).i) 86 | 87 | c.p.StartUnanchored = int(c.cat(c.star(c.rune(anyRune, 0), true), f).i) 88 | // TODO(matloob): end of area that needs to be cleaned up 89 | 90 | c.p.Start = int(f.i) 91 | return c.p, nil 92 | } 93 | 94 | // CompileReversed compiles the regexp into a reverse program. 95 | func CompileReversed(re *Regexp) (*Prog, error) { 96 | var c compiler 97 | c.init() 98 | c.reversed = true 99 | re = re.Simplify() 100 | f := c.compile(re) 101 | f.out.patch(c.p, c.inst(InstMatch).i) 102 | c.p.Start = int(f.i) 103 | return c.p, nil 104 | } 105 | 106 | func (c *compiler) init() { 107 | c.p = new(Prog) 108 | c.p.NumCap = 2 // implicit ( and ) for whole match $0 109 | c.inst(InstFail) 110 | } 111 | 112 | var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} 113 | var anyRune = []rune{0, unicode.MaxRune} 114 | 115 | func (c *compiler) compile(re *Regexp) frag { 116 | switch re.Op { 117 | case OpNoMatch: 118 | return c.fail() 119 | case OpEmptyMatch: 120 | return c.nop() 121 | case OpLiteral: 122 | if len(re.Rune) == 0 { 123 | return c.nop() 124 | } 125 | var f frag 126 | for j := range re.Rune { 127 | f1 := c.rune(re.Rune[j:j+1], re.Flags) 128 | if j == 0 { 129 | f = f1 130 | } else { 131 | f = c.cat(f, f1) 132 | } 133 | } 134 | return f 135 | case OpCharClass: 136 | return c.rune(re.Rune, re.Flags) 137 | case OpAnyCharNotNL: 138 | return c.rune(anyRuneNotNL, 0) 139 | case OpAnyChar: 140 | return c.rune(anyRune, 0) 141 | case OpBeginLine: 142 | if c.reversed { 143 | return c.empty(EmptyEndLine) 144 | } 145 | return c.empty(EmptyBeginLine) 146 | case OpEndLine: 147 | if c.reversed { 148 | return c.empty(EmptyBeginLine) 149 | } 150 | return c.empty(EmptyEndLine) 151 | case OpBeginText: 152 | if c.reversed { 153 | return 
c.empty(EmptyEndText) 154 | } 155 | return c.empty(EmptyBeginText) 156 | case OpEndText: 157 | if c.reversed { 158 | return c.empty(EmptyBeginText) 159 | } 160 | return c.empty(EmptyEndText) 161 | case OpWordBoundary: 162 | return c.empty(EmptyWordBoundary) 163 | case OpNoWordBoundary: 164 | return c.empty(EmptyNoWordBoundary) 165 | case OpCapture: 166 | bra := c.cap(uint32(re.Cap << 1)) 167 | sub := c.compile(re.Sub[0]) 168 | ket := c.cap(uint32(re.Cap<<1 | 1)) 169 | return c.cat(c.cat(bra, sub), ket) 170 | case OpStar: 171 | return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) 172 | case OpPlus: 173 | return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) 174 | case OpQuest: 175 | return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) 176 | case OpConcat: 177 | if len(re.Sub) == 0 { 178 | return c.nop() 179 | } 180 | var f frag 181 | for i, sub := range re.Sub { 182 | if i == 0 { 183 | f = c.compile(sub) 184 | } else { 185 | f = c.cat(f, c.compile(sub)) 186 | } 187 | } 188 | return f 189 | case OpAlternate: 190 | var f frag 191 | for _, sub := range re.Sub { 192 | f = c.alt(f, c.compile(sub)) 193 | } 194 | return f 195 | } 196 | panic("regexp: unhandled case in compile") 197 | } 198 | 199 | func (c *compiler) inst(op InstOp) frag { 200 | // TODO: impose length limit 201 | f := frag{i: uint32(len(c.p.Inst))} 202 | c.p.Inst = append(c.p.Inst, Inst{Op: op}) 203 | return f 204 | } 205 | 206 | func (c *compiler) nop() frag { 207 | f := c.inst(InstNop) 208 | f.out = patchList(f.i << 1) 209 | return f 210 | } 211 | 212 | func (c *compiler) fail() frag { 213 | return frag{} 214 | } 215 | 216 | func (c *compiler) cap(arg uint32) frag { 217 | f := c.inst(InstCapture) 218 | f.out = patchList(f.i << 1) 219 | c.p.Inst[f.i].Arg = arg 220 | 221 | if c.p.NumCap < int(arg)+1 { 222 | c.p.NumCap = int(arg) + 1 223 | } 224 | return f 225 | } 226 | 227 | func (c *compiler) cat(f1, f2 frag) frag { 228 | // concat of failure is failure 229 | if f1.i == 0 || f2.i == 
0 { 230 | return frag{} 231 | } 232 | 233 | // TODO: elide nop 234 | 235 | if c.reversed { 236 | f2.out.patch(c.p, f1.i) 237 | return frag{f2.i, f1.out} 238 | } 239 | f1.out.patch(c.p, f2.i) 240 | return frag{f1.i, f2.out} 241 | } 242 | 243 | func (c *compiler) alt(f1, f2 frag) frag { 244 | // alt of failure is other 245 | if f1.i == 0 { 246 | return f2 247 | } 248 | if f2.i == 0 { 249 | return f1 250 | } 251 | 252 | f := c.inst(InstAlt) 253 | i := &c.p.Inst[f.i] 254 | i.Out = f1.i 255 | i.Arg = f2.i 256 | f.out = f1.out.append(c.p, f2.out) 257 | return f 258 | } 259 | 260 | func (c *compiler) quest(f1 frag, nongreedy bool) frag { 261 | f := c.inst(InstAlt) 262 | i := &c.p.Inst[f.i] 263 | if nongreedy { 264 | i.Arg = f1.i 265 | f.out = patchList(f.i << 1) 266 | } else { 267 | i.Out = f1.i 268 | f.out = patchList(f.i<<1 | 1) 269 | } 270 | f.out = f.out.append(c.p, f1.out) 271 | return f 272 | } 273 | 274 | func (c *compiler) star(f1 frag, nongreedy bool) frag { 275 | f := c.inst(InstAlt) 276 | i := &c.p.Inst[f.i] 277 | if nongreedy { 278 | i.Arg = f1.i 279 | f.out = patchList(f.i << 1) 280 | } else { 281 | i.Out = f1.i 282 | f.out = patchList(f.i<<1 | 1) 283 | } 284 | f1.out.patch(c.p, f.i) 285 | return f 286 | } 287 | 288 | func (c *compiler) plus(f1 frag, nongreedy bool) frag { 289 | return frag{f1.i, c.star(f1, nongreedy).out} 290 | } 291 | 292 | func (c *compiler) empty(op EmptyOp) frag { 293 | f := c.inst(InstEmptyWidth) 294 | c.p.Inst[f.i].Arg = uint32(op) 295 | f.out = patchList(f.i << 1) 296 | return f 297 | } 298 | 299 | func (c *compiler) rune(r []rune, flags Flags) frag { 300 | f := c.inst(InstRune) 301 | i := &c.p.Inst[f.i] 302 | i.Rune = r 303 | flags &= FoldCase // only relevant flag is FoldCase 304 | if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] { 305 | // and sometimes not even that 306 | flags &^= FoldCase 307 | } 308 | i.Arg = uint32(flags) 309 | f.out = patchList(f.i << 1) 310 | 311 | // Special cases for exec machine. 
312 | switch { 313 | case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]): 314 | i.Op = InstRune1 315 | case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune: 316 | i.Op = InstRuneAny 317 | case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune: 318 | i.Op = InstRuneAnyNotNL 319 | } 320 | 321 | return f 322 | } 323 | -------------------------------------------------------------------------------- /syntax/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2012 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution. 6 | 7 | /* 8 | Package syntax parses regular expressions into parse trees and compiles 9 | parse trees into programs. Most clients of regular expressions will use the 10 | facilities of package regexp (such as Compile and Match) instead of this package. 11 | 12 | Syntax 13 | 14 | The regular expression syntax understood by this package when parsing with the Perl flag is as follows. 15 | Parts of the syntax can be disabled by passing alternate flags to Parse. 16 | 17 | 18 | Single characters: 19 | . any character, possibly including newline (flag s=true) 20 | [xyz] character class 21 | [^xyz] negated character class 22 | \d Perl character class 23 | \D negated Perl character class 24 | [[:alpha:]] ASCII character class 25 | [[:^alpha:]] negated ASCII character class 26 | \pN Unicode character class (one-letter name) 27 | \p{Greek} Unicode character class 28 | \PN negated Unicode character class (one-letter name) 29 | \P{Greek} negated Unicode character class 30 | 31 | Composites: 32 | xy x followed by y 33 | x|y x or y (prefer x) 34 | 35 | Repetitions: 36 | x* zero or more x, prefer more 37 | x+ one or more x, prefer more 38 | x? 
zero or one x, prefer one 39 | x{n,m} n or n+1 or ... or m x, prefer more 40 | x{n,} n or more x, prefer more 41 | x{n} exactly n x 42 | x*? zero or more x, prefer fewer 43 | x+? one or more x, prefer fewer 44 | x?? zero or one x, prefer zero 45 | x{n,m}? n or n+1 or ... or m x, prefer fewer 46 | x{n,}? n or more x, prefer fewer 47 | x{n}? exactly n x 48 | 49 | Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n} 50 | reject forms that create a minimum or maximum repetition count above 1000. 51 | Unlimited repetitions are not subject to this restriction. 52 | 53 | Grouping: 54 | (re) numbered capturing group (submatch) 55 | (?P<name>re) named & numbered capturing group (submatch) 56 | (?:re) non-capturing group 57 | (?flags) set flags within current group; non-capturing 58 | (?flags:re) set flags during re; non-capturing 59 | 60 | Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are: 61 | 62 | i case-insensitive (default false) 63 | m multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false) 64 | s let . match \n (default false) 65 | U ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false) 66 | 67 | Empty strings: 68 | ^ at beginning of text or line (flag m=true) 69 | $ at end of text (like \z not \Z) or line (flag m=true) 70 | \A at beginning of text 71 | \b at ASCII word boundary (\w on one side and \W, \A, or \z on the other) 72 | \B not at ASCII word boundary 73 | \z at end of text 74 | 75 | Escape sequences: 76 | \a bell (== \007) 77 | \f form feed (== \014) 78 | \t horizontal tab (== \011) 79 | \n newline (== \012) 80 | \r carriage return (== \015) 81 | \v vertical tab character (== \013) 82 | \* literal *, for any punctuation character * 83 | \123 octal character code (up to three digits) 84 | \x7F hex character code (exactly two digits) 85 | \x{10FFFF} hex character code 86 | \Q...\E literal text ... even if ... 
has punctuation 87 | 88 | Character class elements: 89 | x single character 90 | A-Z character range (inclusive) 91 | \d Perl character class 92 | [:foo:] ASCII character class foo 93 | \p{Foo} Unicode character class Foo 94 | \pF Unicode character class F (one-letter name) 95 | 96 | Named character classes as character class elements: 97 | [\d] digits (== \d) 98 | [^\d] not digits (== \D) 99 | [\D] not digits (== \D) 100 | [^\D] not not digits (== \d) 101 | [[:name:]] named ASCII class inside character class (== [:name:]) 102 | [^[:name:]] named ASCII class inside negated character class (== [:^name:]) 103 | [\p{Name}] named Unicode property inside character class (== \p{Name}) 104 | [^\p{Name}] named Unicode property inside negated character class (== \P{Name}) 105 | 106 | Perl character classes (all ASCII-only): 107 | \d digits (== [0-9]) 108 | \D not digits (== [^0-9]) 109 | \s whitespace (== [\t\n\f\r ]) 110 | \S not whitespace (== [^\t\n\f\r ]) 111 | \w word characters (== [0-9A-Za-z_]) 112 | \W not word characters (== [^0-9A-Za-z_]) 113 | 114 | ASCII character classes: 115 | [[:alnum:]] alphanumeric (== [0-9A-Za-z]) 116 | [[:alpha:]] alphabetic (== [A-Za-z]) 117 | [[:ascii:]] ASCII (== [\x00-\x7F]) 118 | [[:blank:]] blank (== [\t ]) 119 | [[:cntrl:]] control (== [\x00-\x1F\x7F]) 120 | [[:digit:]] digits (== [0-9]) 121 | [[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]) 122 | [[:lower:]] lower case (== [a-z]) 123 | [[:print:]] printable (== [ -~] == [ [:graph:]]) 124 | [[:punct:]] punctuation (== [!-/:-@[-`{-~]) 125 | [[:space:]] whitespace (== [\t\n\v\f\r ]) 126 | [[:upper:]] upper case (== [A-Z]) 127 | [[:word:]] word characters (== [0-9A-Za-z_]) 128 | [[:xdigit:]] hex digit (== [0-9A-Fa-f]) 129 | 130 | */ 131 | package syntax 132 | -------------------------------------------------------------------------------- /syntax/make_perl_groups.pl: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/perl 2 | # Copyright 2008 The Go Authors. All rights reserved. 3 | # Use of this source code is governed by a BSD-style 4 | # license that can be found in the LICENSE file. 5 | 6 | # Modified version of RE2's make_perl_groups.pl. 7 | 8 | # Generate table entries giving character ranges 9 | # for POSIX/Perl character classes. Rather than 10 | # figure out what the definition is, it is easier to ask 11 | # Perl about each letter from 0-128 and write down 12 | # its answer. 13 | 14 | @posixclasses = ( 15 | "[:alnum:]", 16 | "[:alpha:]", 17 | "[:ascii:]", 18 | "[:blank:]", 19 | "[:cntrl:]", 20 | "[:digit:]", 21 | "[:graph:]", 22 | "[:lower:]", 23 | "[:print:]", 24 | "[:punct:]", 25 | "[:space:]", 26 | "[:upper:]", 27 | "[:word:]", 28 | "[:xdigit:]", 29 | ); 30 | 31 | @perlclasses = ( 32 | "\\d", 33 | "\\s", 34 | "\\w", 35 | ); 36 | 37 | sub ComputeClass($) { 38 | my @ranges; 39 | my ($class) = @_; 40 | my $regexp = "[$class]"; 41 | my $start = -1; 42 | for (my $i=0; $i<=129; $i++) { 43 | if ($i == 129) { $i = 256; } 44 | if ($i <= 128 && chr($i) =~ $regexp) { 45 | if ($start < 0) { 46 | $start = $i; 47 | } 48 | } else { 49 | if ($start >= 0) { 50 | push @ranges, [$start, $i-1]; 51 | } 52 | $start = -1; 53 | } 54 | } 55 | return @ranges; 56 | } 57 | 58 | sub PrintClass($$@) { 59 | my ($cname, $name, @ranges) = @_; 60 | print "var code$cname = []rune{ /* $name */\n"; 61 | for (my $i=0; $i<@ranges; $i++) { 62 | my @a = @{$ranges[$i]}; 63 | printf "\t0x%x, 0x%x,\n", $a[0], $a[1]; 64 | } 65 | print "}\n\n"; 66 | my $n = @ranges; 67 | $negname = $name; 68 | if ($negname =~ /:/) { 69 | $negname =~ s/:/:^/; 70 | } else { 71 | $negname =~ y/a-z/A-Z/; 72 | } 73 | return "\t`$name`: {+1, code$cname},\n" . 
74 | "\t`$negname`: {-1, code$cname},\n"; 75 | } 76 | 77 | my $gen = 0; 78 | 79 | sub PrintClasses($@) { 80 | my ($cname, @classes) = @_; 81 | my @entries; 82 | foreach my $cl (@classes) { 83 | my @ranges = ComputeClass($cl); 84 | push @entries, PrintClass(++$gen, $cl, @ranges); 85 | } 86 | print "var ${cname}Group = map[string]charGroup{\n"; 87 | foreach my $e (@entries) { 88 | print $e; 89 | } 90 | print "}\n"; 91 | my $count = @entries; 92 | } 93 | 94 | print <<EOF; 95 | // Copyright 2013 The Go Authors. All rights reserved. 96 | // Use of this source code is governed by a BSD-style 97 | // license that can be found in the LICENSE file. 98 | 99 | // GENERATED BY make_perl_groups.pl; DO NOT EDIT. 100 | // make_perl_groups.pl >perl_groups.go 101 | 102 | package syntax 103 | 104 | EOF 105 | 106 | PrintClasses("perl", @perlclasses); 107 | PrintClasses("posix", @posixclasses); 108 | -------------------------------------------------------------------------------- /syntax/parse_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package syntax 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "testing" 11 | "unicode" 12 | ) 13 | 14 | type parseTest struct { 15 | Regexp string 16 | Dump string 17 | } 18 | 19 | var parseTests = []parseTest{ 20 | // Base cases 21 | {`a`, `lit{a}`}, 22 | {`a.`, `cat{lit{a}dot{}}`}, 23 | {`a.b`, `cat{lit{a}dot{}lit{b}}`}, 24 | {`ab`, `str{ab}`}, 25 | {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`}, 26 | {`abc`, `str{abc}`}, 27 | {`a|^`, `alt{lit{a}bol{}}`}, 28 | {`a|b`, `cc{0x61-0x62}`}, 29 | {`(a)`, `cap{lit{a}}`}, 30 | {`(a)|b`, `alt{cap{lit{a}}lit{b}}`}, 31 | {`a*`, `star{lit{a}}`}, 32 | {`a+`, `plus{lit{a}}`}, 33 | {`a?`, `que{lit{a}}`}, 34 | {`a{2}`, `rep{2,2 lit{a}}`}, 35 | {`a{2,3}`, `rep{2,3 lit{a}}`}, 36 | {`a{2,}`, `rep{2,-1 lit{a}}`}, 37 | {`a*?`, `nstar{lit{a}}`}, 38 | {`a+?`, `nplus{lit{a}}`}, 39 | {`a??`, `nque{lit{a}}`}, 40 | {`a{2}?`, `nrep{2,2 lit{a}}`}, 41 | {`a{2,3}?`, `nrep{2,3 lit{a}}`}, 42 | {`a{2,}?`, `nrep{2,-1 lit{a}}`}, 43 | // Malformed { } are treated as literals. 
44 | {`x{1001`, `str{x{1001}`}, 45 | {`x{9876543210`, `str{x{9876543210}`}, 46 | {`x{9876543210,`, `str{x{9876543210,}`}, 47 | {`x{2,1`, `str{x{2,1}`}, 48 | {`x{1,9876543210`, `str{x{1,9876543210}`}, 49 | {``, `emp{}`}, 50 | {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored 51 | {`|x|`, `alt{emp{}lit{x}emp{}}`}, 52 | {`.`, `dot{}`}, 53 | {`^`, `bol{}`}, 54 | {`$`, `eol{}`}, 55 | {`\|`, `lit{|}`}, 56 | {`\(`, `lit{(}`}, 57 | {`\)`, `lit{)}`}, 58 | {`\*`, `lit{*}`}, 59 | {`\+`, `lit{+}`}, 60 | {`\?`, `lit{?}`}, 61 | {`{`, `lit{{}`}, 62 | {`}`, `lit{}}`}, 63 | {`\.`, `lit{.}`}, 64 | {`\^`, `lit{^}`}, 65 | {`\$`, `lit{$}`}, 66 | {`\\`, `lit{\}`}, 67 | {`[ace]`, `cc{0x61 0x63 0x65}`}, 68 | {`[abc]`, `cc{0x61-0x63}`}, 69 | {`[a-z]`, `cc{0x61-0x7a}`}, 70 | {`[a]`, `lit{a}`}, 71 | {`\-`, `lit{-}`}, 72 | {`-`, `lit{-}`}, 73 | {`\_`, `lit{_}`}, 74 | {`abc`, `str{abc}`}, 75 | {`abc|def`, `alt{str{abc}str{def}}`}, 76 | {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`}, 77 | 78 | // Posix and Perl extensions 79 | {`[[:lower:]]`, `cc{0x61-0x7a}`}, 80 | {`[a-z]`, `cc{0x61-0x7a}`}, 81 | {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, 82 | {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, 83 | {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 84 | {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 85 | {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 86 | {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 87 | {`\d`, `cc{0x30-0x39}`}, 88 | {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`}, 89 | {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`}, 90 | {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`}, 91 | {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`}, 92 | {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`}, 93 | {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`}, 94 | {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 95 | {`[^\\]`, `cc{0x0-0x5b 
0x5d-0x10ffff}`}, 96 | // { `\C`, `byte{}` }, // probably never 97 | 98 | // Unicode, negatives, and a double negative. 99 | {`\p{Braille}`, `cc{0x2800-0x28ff}`}, 100 | {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 101 | {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 102 | {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, 103 | {`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, 104 | {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, 105 | {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 106 | {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 107 | {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, 108 | {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, 109 | {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, 110 | {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, 111 | {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, 112 | {`\p{Any}`, `dot{}`}, 113 | {`\p{^Any}`, `cc{}`}, 114 | 115 | // Hex, octal. 116 | {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`}, 117 | {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`}, 118 | 119 | // More interesting regular expressions. 120 | {`a{,2}`, `str{a{,2}}`}, 121 | {`\.\^\$\\`, `str{.^$\}`}, 122 | {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`}, 123 | {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, 124 | {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8 125 | {`a*{`, `cat{star{lit{a}}lit{{}}`}, 126 | 127 | // Test precedences 128 | {`(?:ab)*`, `star{str{ab}}`}, 129 | {`(ab)*`, `star{cap{str{ab}}}`}, 130 | {`ab|cd`, `alt{str{ab}str{cd}}`}, 131 | {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`}, 132 | 133 | // Test flattening. 
134 | {`(?:a)`, `lit{a}`}, 135 | {`(?:ab)(?:cd)`, `str{abcd}`}, 136 | {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, 137 | {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, 138 | {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`}, 139 | {`a|.`, `dot{}`}, 140 | {`.|a`, `dot{}`}, 141 | {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`}, 142 | {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`}, 143 | 144 | // Test Perl quoted literals 145 | {`\Q+|*?{[\E`, `str{+|*?{[}`}, 146 | {`\Q+\E+`, `plus{lit{+}}`}, 147 | {`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`}, 148 | {`\Q\\E`, `lit{\}`}, 149 | {`\Q\\\E`, `str{\\}`}, 150 | 151 | // Test Perl \A and \z 152 | {`(?m)^`, `bol{}`}, 153 | {`(?m)$`, `eol{}`}, 154 | {`(?-m)^`, `bot{}`}, 155 | {`(?-m)$`, `eot{}`}, 156 | {`(?m)\A`, `bot{}`}, 157 | {`(?m)\z`, `eot{\z}`}, 158 | {`(?-m)\A`, `bot{}`}, 159 | {`(?-m)\z`, `eot{\z}`}, 160 | 161 | // Test named captures 162 | {`(?P<name>a)`, `cap{name:lit{a}}`}, 163 | 164 | // Case-folded literals 165 | {`[Aa]`, `litfold{A}`}, 166 | {`[\x{100}\x{101}]`, `litfold{Ā}`}, 167 | {`[Δδ]`, `litfold{Δ}`}, 168 | 169 | // Strings 170 | {`abcde`, `str{abcde}`}, 171 | {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, 172 | 173 | // Factoring. 174 | {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, 175 | {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`}, 176 | 177 | // Bug fixes. 
178 | {`(?:.)`, `dot{}`}, 179 | {`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`}, 180 | {`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`}, 181 | {`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`}, 182 | {`(?:A|a)`, `litfold{A}`}, 183 | {`A|(?:A|a)`, `litfold{A}`}, 184 | {`(?s).`, `dot{}`}, 185 | {`(?-s).`, `dnl{}`}, 186 | {`(?:(?:^).)`, `cat{bol{}dot{}}`}, 187 | {`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`}, 188 | 189 | // RE2 prefix_tests 190 | {`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`}, 191 | {`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`}, 192 | {`abc|abd|aef|bcx|bcy`, 193 | `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` + 194 | `cat{str{bc}cc{0x78-0x79}}}`}, 195 | {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`}, 196 | {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`}, 197 | {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`}, 198 | {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`}, 199 | {`x{2}|x{2}[0-9]`, 200 | `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`}, 201 | {`x{2}y|x{2}[0-9]y`, 202 | `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`}, 203 | {`a.*?c|a.*?b`, 204 | `cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`}, 205 | 206 | // Valid repetitions. 207 | {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``}, 208 | {`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``}, 209 | } 210 | 211 | const testFlags = MatchNL | PerlX | UnicodeGroups 212 | 213 | func TestParseSimple(t *testing.T) { 214 | testParseDump(t, parseTests, testFlags) 215 | } 216 | 217 | var foldcaseTests = []parseTest{ 218 | {`AbCdE`, `strfold{ABCDE}`}, 219 | {`[Aa]`, `litfold{A}`}, 220 | {`a`, `litfold{A}`}, 221 | 222 | // 0x17F is an old English long s (looks like an f) and folds to s. 223 | // 0x212A is the Kelvin symbol and folds to k. 224 | {`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...] 
225 | {`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 226 | {`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 227 | } 228 | 229 | func TestParseFoldCase(t *testing.T) { 230 | testParseDump(t, foldcaseTests, FoldCase) 231 | } 232 | 233 | var literalTests = []parseTest{ 234 | {"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"}, 235 | } 236 | 237 | func TestParseLiteral(t *testing.T) { 238 | testParseDump(t, literalTests, Literal) 239 | } 240 | 241 | var matchnlTests = []parseTest{ 242 | {`.`, `dot{}`}, 243 | {"\n", "lit{\n}"}, 244 | {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, 245 | {`[a\n]`, `cc{0xa 0x61}`}, 246 | } 247 | 248 | func TestParseMatchNL(t *testing.T) { 249 | testParseDump(t, matchnlTests, MatchNL) 250 | } 251 | 252 | var nomatchnlTests = []parseTest{ 253 | {`.`, `dnl{}`}, 254 | {"\n", "lit{\n}"}, 255 | {`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`}, 256 | {`[a\n]`, `cc{0xa 0x61}`}, 257 | } 258 | 259 | func TestParseNoMatchNL(t *testing.T) { 260 | testParseDump(t, nomatchnlTests, 0) 261 | } 262 | 263 | // Test Parse -> Dump. 264 | func testParseDump(t *testing.T, tests []parseTest, flags Flags) { 265 | for _, tt := range tests { 266 | re, err := Parse(tt.Regexp, flags) 267 | if err != nil { 268 | t.Errorf("Parse(%#q): %v", tt.Regexp, err) 269 | continue 270 | } 271 | if tt.Dump == "" { 272 | // It parsed. That's all we care about. 273 | continue 274 | } 275 | d := dump(re) 276 | if d != tt.Dump { 277 | t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) 278 | } 279 | } 280 | } 281 | 282 | // dump prints a string representation of the regexp showing 283 | // the structure explicitly. 
284 | func dump(re *Regexp) string { 285 | var b bytes.Buffer 286 | dumpRegexp(&b, re) 287 | return b.String() 288 | } 289 | 290 | var opNames = []string{ 291 | OpNoMatch: "no", 292 | OpEmptyMatch: "emp", 293 | OpLiteral: "lit", 294 | OpCharClass: "cc", 295 | OpAnyCharNotNL: "dnl", 296 | OpAnyChar: "dot", 297 | OpBeginLine: "bol", 298 | OpEndLine: "eol", 299 | OpBeginText: "bot", 300 | OpEndText: "eot", 301 | OpWordBoundary: "wb", 302 | OpNoWordBoundary: "nwb", 303 | OpCapture: "cap", 304 | OpStar: "star", 305 | OpPlus: "plus", 306 | OpQuest: "que", 307 | OpRepeat: "rep", 308 | OpConcat: "cat", 309 | OpAlternate: "alt", 310 | } 311 | 312 | // dumpRegexp writes an encoding of the syntax tree for the regexp re to b. 313 | // It is used during testing to distinguish between parses that might print 314 | // the same using re's String method. 315 | func dumpRegexp(b *bytes.Buffer, re *Regexp) { 316 | if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { 317 | fmt.Fprintf(b, "op%d", re.Op) 318 | } else { 319 | switch re.Op { 320 | default: 321 | b.WriteString(opNames[re.Op]) 322 | case OpStar, OpPlus, OpQuest, OpRepeat: 323 | if re.Flags&NonGreedy != 0 { 324 | b.WriteByte('n') 325 | } 326 | b.WriteString(opNames[re.Op]) 327 | case OpLiteral: 328 | if len(re.Rune) > 1 { 329 | b.WriteString("str") 330 | } else { 331 | b.WriteString("lit") 332 | } 333 | if re.Flags&FoldCase != 0 { 334 | for _, r := range re.Rune { 335 | if unicode.SimpleFold(r) != r { 336 | b.WriteString("fold") 337 | break 338 | } 339 | } 340 | } 341 | } 342 | } 343 | b.WriteByte('{') 344 | switch re.Op { 345 | case OpEndText: 346 | if re.Flags&WasDollar == 0 { 347 | b.WriteString(`\z`) 348 | } 349 | case OpLiteral: 350 | for _, r := range re.Rune { 351 | b.WriteRune(r) 352 | } 353 | case OpConcat, OpAlternate: 354 | for _, sub := range re.Sub { 355 | dumpRegexp(b, sub) 356 | } 357 | case OpStar, OpPlus, OpQuest: 358 | dumpRegexp(b, re.Sub[0]) 359 | case OpRepeat: 360 | fmt.Fprintf(b, "%d,%d ", re.Min, 
re.Max) 361 | dumpRegexp(b, re.Sub[0]) 362 | case OpCapture: 363 | if re.Name != "" { 364 | b.WriteString(re.Name) 365 | b.WriteByte(':') 366 | } 367 | dumpRegexp(b, re.Sub[0]) 368 | case OpCharClass: 369 | sep := "" 370 | for i := 0; i < len(re.Rune); i += 2 { 371 | b.WriteString(sep) 372 | sep = " " 373 | lo, hi := re.Rune[i], re.Rune[i+1] 374 | if lo == hi { 375 | fmt.Fprintf(b, "%#x", lo) 376 | } else { 377 | fmt.Fprintf(b, "%#x-%#x", lo, hi) 378 | } 379 | } 380 | } 381 | b.WriteByte('}') 382 | } 383 | 384 | func mkCharClass(f func(rune) bool) string { 385 | re := &Regexp{Op: OpCharClass} 386 | lo := rune(-1) 387 | for i := rune(0); i <= unicode.MaxRune; i++ { 388 | if f(i) { 389 | if lo < 0 { 390 | lo = i 391 | } 392 | } else { 393 | if lo >= 0 { 394 | re.Rune = append(re.Rune, lo, i-1) 395 | lo = -1 396 | } 397 | } 398 | } 399 | if lo >= 0 { 400 | re.Rune = append(re.Rune, lo, unicode.MaxRune) 401 | } 402 | return dump(re) 403 | } 404 | 405 | func isUpperFold(r rune) bool { 406 | if unicode.IsUpper(r) { 407 | return true 408 | } 409 | c := unicode.SimpleFold(r) 410 | for c != r { 411 | if unicode.IsUpper(c) { 412 | return true 413 | } 414 | c = unicode.SimpleFold(c) 415 | } 416 | return false 417 | } 418 | 419 | func TestFoldConstants(t *testing.T) { 420 | last := rune(-1) 421 | for i := rune(0); i <= unicode.MaxRune; i++ { 422 | if unicode.SimpleFold(i) == i { 423 | continue 424 | } 425 | if last == -1 && minFold != i { 426 | t.Errorf("minFold=%#U should be %#U", minFold, i) 427 | } 428 | last = i 429 | } 430 | if maxFold != last { 431 | t.Errorf("maxFold=%#U should be %#U", maxFold, last) 432 | } 433 | } 434 | 435 | func TestAppendRangeCollapse(t *testing.T) { 436 | // AppendRange should collapse each of the new ranges 437 | // into the earlier ones (it looks back two ranges), so that 438 | // the slice never grows very large. 439 | // Note that we are not calling cleanClass. 
440 | var r []rune 441 | for i := rune('A'); i <= 'Z'; i++ { 442 | r = appendRange(r, i, i) 443 | r = appendRange(r, i+'a'-'A', i+'a'-'A') 444 | } 445 | if string(r) != "AZaz" { 446 | t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r)) 447 | } 448 | } 449 | 450 | var invalidRegexps = []string{ 451 | `(`, 452 | `)`, 453 | `(a`, 454 | `a)`, 455 | `(a))`, 456 | `(a|b|`, 457 | `a|b|)`, 458 | `(a|b|))`, 459 | `(a|b`, 460 | `a|b)`, 461 | `(a|b))`, 462 | `[a-z`, 463 | `([a-z)`, 464 | `[a-z)`, 465 | `([a-z]))`, 466 | `x{1001}`, 467 | `x{9876543210}`, 468 | `x{2,1}`, 469 | `x{1,9876543210}`, 470 | "\xff", // Invalid UTF-8 471 | "[\xff]", 472 | "[\\\xff]", 473 | "\\\xff", 474 | `(?Pa`, 475 | `(?P`, 476 | `(?Pa)`, 478 | `(?P<>a)`, 479 | `[a-Z]`, 480 | `(?i)[a-Z]`, 481 | `a{100000}`, 482 | `a{100000,}`, 483 | "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", 484 | `\Q\E*`, 485 | } 486 | 487 | var onlyPerl = []string{ 488 | `[a-b-c]`, 489 | `\Qabc\E`, 490 | `\Q*+?{[\E`, 491 | `\Q\\E`, 492 | `\Q\\\E`, 493 | `\Q\\\\E`, 494 | `\Q\\\\\E`, 495 | `(?:a)`, 496 | `(?Pa)`, 497 | } 498 | 499 | var onlyPOSIX = []string{ 500 | "a++", 501 | "a**", 502 | "a?*", 503 | "a+*", 504 | "a{1}*", 505 | ".{1}{2}.{3}", 506 | } 507 | 508 | func TestParseInvalidRegexps(t *testing.T) { 509 | for _, regexp := range invalidRegexps { 510 | if re, err := Parse(regexp, Perl); err == nil { 511 | t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re)) 512 | } 513 | if re, err := Parse(regexp, POSIX); err == nil { 514 | t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re)) 515 | } 516 | } 517 | for _, regexp := range onlyPerl { 518 | if _, err := Parse(regexp, Perl); err != nil { 519 | t.Errorf("Parse(%#q, Perl): %v", regexp, err) 520 | } 521 | if re, err := Parse(regexp, POSIX); err == nil { 522 | t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re)) 523 | } 524 | } 525 | for _, regexp := range onlyPOSIX { 526 | if re, err := 
Parse(regexp, Perl); err == nil { 527 | t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re)) 528 | } 529 | if _, err := Parse(regexp, POSIX); err != nil { 530 | t.Errorf("Parse(%#q, POSIX): %v", regexp, err) 531 | } 532 | } 533 | } 534 | 535 | func TestToStringEquivalentParse(t *testing.T) { 536 | for _, tt := range parseTests { 537 | re, err := Parse(tt.Regexp, testFlags) 538 | if err != nil { 539 | t.Errorf("Parse(%#q): %v", tt.Regexp, err) 540 | continue 541 | } 542 | if tt.Dump == "" { 543 | // It parsed. That's all we care about. 544 | continue 545 | } 546 | d := dump(re) 547 | if d != tt.Dump { 548 | t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) 549 | continue 550 | } 551 | 552 | s := re.String() 553 | if s != tt.Regexp { 554 | // If ToString didn't return the original regexp, 555 | // it must have found one with fewer parens. 556 | // Unfortunately we can't check the length here, because 557 | // ToString produces "\\{" for a literal brace, 558 | // but "{" is a shorter equivalent in some contexts. 559 | nre, err := Parse(s, testFlags) 560 | if err != nil { 561 | t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err) 562 | continue 563 | } 564 | nd := dump(nre) 565 | if d != nd { 566 | t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd) 567 | } 568 | 569 | ns := nre.String() 570 | if s != ns { 571 | t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns) 572 | } 573 | } 574 | } 575 | } 576 | -------------------------------------------------------------------------------- /syntax/perl_groups.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // GENERATED BY make_perl_groups.pl; DO NOT EDIT. 
6 | // make_perl_groups.pl >perl_groups.go 7 | 8 | package syntax 9 | 10 | var code1 = []rune{ /* \d */ 11 | 0x30, 0x39, 12 | } 13 | 14 | var code2 = []rune{ /* \s */ 15 | 0x9, 0xa, 16 | 0xc, 0xd, 17 | 0x20, 0x20, 18 | } 19 | 20 | var code3 = []rune{ /* \w */ 21 | 0x30, 0x39, 22 | 0x41, 0x5a, 23 | 0x5f, 0x5f, 24 | 0x61, 0x7a, 25 | } 26 | 27 | var perlGroup = map[string]charGroup{ 28 | `\d`: {+1, code1}, 29 | `\D`: {-1, code1}, 30 | `\s`: {+1, code2}, 31 | `\S`: {-1, code2}, 32 | `\w`: {+1, code3}, 33 | `\W`: {-1, code3}, 34 | } 35 | var code4 = []rune{ /* [:alnum:] */ 36 | 0x30, 0x39, 37 | 0x41, 0x5a, 38 | 0x61, 0x7a, 39 | } 40 | 41 | var code5 = []rune{ /* [:alpha:] */ 42 | 0x41, 0x5a, 43 | 0x61, 0x7a, 44 | } 45 | 46 | var code6 = []rune{ /* [:ascii:] */ 47 | 0x0, 0x7f, 48 | } 49 | 50 | var code7 = []rune{ /* [:blank:] */ 51 | 0x9, 0x9, 52 | 0x20, 0x20, 53 | } 54 | 55 | var code8 = []rune{ /* [:cntrl:] */ 56 | 0x0, 0x1f, 57 | 0x7f, 0x7f, 58 | } 59 | 60 | var code9 = []rune{ /* [:digit:] */ 61 | 0x30, 0x39, 62 | } 63 | 64 | var code10 = []rune{ /* [:graph:] */ 65 | 0x21, 0x7e, 66 | } 67 | 68 | var code11 = []rune{ /* [:lower:] */ 69 | 0x61, 0x7a, 70 | } 71 | 72 | var code12 = []rune{ /* [:print:] */ 73 | 0x20, 0x7e, 74 | } 75 | 76 | var code13 = []rune{ /* [:punct:] */ 77 | 0x21, 0x2f, 78 | 0x3a, 0x40, 79 | 0x5b, 0x60, 80 | 0x7b, 0x7e, 81 | } 82 | 83 | var code14 = []rune{ /* [:space:] */ 84 | 0x9, 0xd, 85 | 0x20, 0x20, 86 | } 87 | 88 | var code15 = []rune{ /* [:upper:] */ 89 | 0x41, 0x5a, 90 | } 91 | 92 | var code16 = []rune{ /* [:word:] */ 93 | 0x30, 0x39, 94 | 0x41, 0x5a, 95 | 0x5f, 0x5f, 96 | 0x61, 0x7a, 97 | } 98 | 99 | var code17 = []rune{ /* [:xdigit:] */ 100 | 0x30, 0x39, 101 | 0x41, 0x46, 102 | 0x61, 0x66, 103 | } 104 | 105 | var posixGroup = map[string]charGroup{ 106 | `[:alnum:]`: {+1, code4}, 107 | `[:^alnum:]`: {-1, code4}, 108 | `[:alpha:]`: {+1, code5}, 109 | `[:^alpha:]`: {-1, code5}, 110 | `[:ascii:]`: {+1, code6}, 111 | `[:^ascii:]`: {-1, code6}, 
112 | `[:blank:]`: {+1, code7}, 113 | `[:^blank:]`: {-1, code7}, 114 | `[:cntrl:]`: {+1, code8}, 115 | `[:^cntrl:]`: {-1, code8}, 116 | `[:digit:]`: {+1, code9}, 117 | `[:^digit:]`: {-1, code9}, 118 | `[:graph:]`: {+1, code10}, 119 | `[:^graph:]`: {-1, code10}, 120 | `[:lower:]`: {+1, code11}, 121 | `[:^lower:]`: {-1, code11}, 122 | `[:print:]`: {+1, code12}, 123 | `[:^print:]`: {-1, code12}, 124 | `[:punct:]`: {+1, code13}, 125 | `[:^punct:]`: {-1, code13}, 126 | `[:space:]`: {+1, code14}, 127 | `[:^space:]`: {-1, code14}, 128 | `[:upper:]`: {+1, code15}, 129 | `[:^upper:]`: {-1, code15}, 130 | `[:word:]`: {+1, code16}, 131 | `[:^word:]`: {-1, code16}, 132 | `[:xdigit:]`: {+1, code17}, 133 | `[:^xdigit:]`: {-1, code17}, 134 | } 135 | -------------------------------------------------------------------------------- /syntax/prog.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package syntax 6 | 7 | import ( 8 | "bytes" 9 | "strconv" 10 | "unicode" 11 | ) 12 | 13 | // Compiled program. 14 | // May not belong in this package, but convenient for now. 15 | 16 | // A Prog is a compiled regular expression program. 17 | type Prog struct { 18 | Inst []Inst 19 | Start int // index of start instruction 20 | StartUnanchored int // index of start instruction for unanchored search 21 | NumCap int // number of InstCapture insts in re 22 | } 23 | 24 | // An InstOp is an instruction opcode. 
25 | type InstOp uint8 26 | 27 | const ( 28 | InstAlt InstOp = iota 29 | InstAltMatch 30 | InstCapture 31 | InstEmptyWidth 32 | InstMatch 33 | InstFail 34 | InstNop 35 | InstRune 36 | InstRune1 37 | InstRuneAny 38 | InstRuneAnyNotNL 39 | ) 40 | 41 | var instOpNames = []string{ 42 | "InstAlt", 43 | "InstAltMatch", 44 | "InstCapture", 45 | "InstEmptyWidth", 46 | "InstMatch", 47 | "InstFail", 48 | "InstNop", 49 | "InstRune", 50 | "InstRune1", 51 | "InstRuneAny", 52 | "InstRuneAnyNotNL", 53 | } 54 | 55 | func (i InstOp) String() string { 56 | if uint(i) >= uint(len(instOpNames)) { 57 | return "" 58 | } 59 | return instOpNames[i] 60 | } 61 | 62 | // An EmptyOp specifies a kind or mixture of zero-width assertions. 63 | type EmptyOp uint8 64 | 65 | const ( 66 | EmptyBeginLine EmptyOp = 1 << iota 67 | EmptyEndLine 68 | EmptyBeginText 69 | EmptyEndText 70 | EmptyWordBoundary 71 | EmptyNoWordBoundary 72 | ) 73 | 74 | // EmptyOpContext returns the zero-width assertions 75 | // satisfied at the position between the runes r1 and r2. 76 | // Passing r1 == -1 indicates that the position is 77 | // at the beginning of the text. 78 | // Passing r2 == -1 indicates that the position is 79 | // at the end of the text. 80 | func EmptyOpContext(r1, r2 rune) EmptyOp { 81 | var op EmptyOp = EmptyNoWordBoundary 82 | var boundary byte 83 | switch { 84 | case IsWordChar(r1): 85 | boundary = 1 86 | case r1 == '\n': 87 | op |= EmptyBeginLine 88 | case r1 < 0: 89 | op |= EmptyBeginText | EmptyBeginLine 90 | } 91 | switch { 92 | case IsWordChar(r2): 93 | boundary ^= 1 94 | case r2 == '\n': 95 | op |= EmptyEndLine 96 | case r2 < 0: 97 | op |= EmptyEndText | EmptyEndLine 98 | } 99 | if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2) 100 | op ^= (EmptyWordBoundary | EmptyNoWordBoundary) 101 | } 102 | return op 103 | } 104 | 105 | // IsWordChar reports whether r is consider a ``word character'' 106 | // during the evaluation of the \b and \B zero-width assertions. 
107 | // These assertions are ASCII-only: the word characters are [A-Za-z0-9_]. 108 | func IsWordChar(r rune) bool { 109 | return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' 110 | } 111 | 112 | // An Inst is a single instruction in a regular expression program. 113 | type Inst struct { 114 | Op InstOp 115 | Out uint32 // all but InstMatch, InstFail 116 | Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth 117 | Rune []rune 118 | } 119 | 120 | func (p *Prog) String() string { 121 | var b bytes.Buffer 122 | dumpProg(&b, p) 123 | return b.String() 124 | } 125 | 126 | // skipNop follows any no-op or capturing instructions 127 | // and returns the resulting pc. 128 | func (p *Prog) skipNop(pc uint32) (*Inst, uint32) { 129 | i := &p.Inst[pc] 130 | for i.Op == InstNop || i.Op == InstCapture { 131 | pc = i.Out 132 | i = &p.Inst[pc] 133 | } 134 | return i, pc 135 | } 136 | 137 | // op returns i.Op but merges all the Rune special cases into InstRune 138 | func (i *Inst) op() InstOp { 139 | op := i.Op 140 | switch op { 141 | case InstRune1, InstRuneAny, InstRuneAnyNotNL: 142 | op = InstRune 143 | } 144 | return op 145 | } 146 | 147 | // Prefix returns a literal string that all matches for the 148 | // regexp must start with. Complete is true if the prefix 149 | // is the entire match. 150 | func (p *Prog) Prefix() (prefix string, complete bool) { 151 | i, _ := p.skipNop(uint32(p.Start)) 152 | 153 | // Avoid allocation of buffer if prefix is empty. 154 | if i.op() != InstRune || len(i.Rune) != 1 { 155 | return "", i.Op == InstMatch 156 | } 157 | 158 | // Have prefix; gather characters. 159 | var buf bytes.Buffer 160 | for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 { 161 | buf.WriteRune(i.Rune[0]) 162 | i, _ = p.skipNop(i.Out) 163 | } 164 | return buf.String(), i.Op == InstMatch 165 | } 166 | 167 | // StartCond returns the leading empty-width conditions that must 168 | // be true in any match. 
It returns ^EmptyOp(0) if no matches are possible. 169 | func (p *Prog) StartCond() EmptyOp { 170 | var flag EmptyOp 171 | pc := uint32(p.Start) 172 | i := &p.Inst[pc] 173 | Loop: 174 | for { 175 | switch i.Op { 176 | case InstEmptyWidth: 177 | flag |= EmptyOp(i.Arg) 178 | case InstFail: 179 | return ^EmptyOp(0) 180 | case InstCapture, InstNop: 181 | // skip 182 | default: 183 | break Loop 184 | } 185 | pc = i.Out 186 | i = &p.Inst[pc] 187 | } 188 | return flag 189 | } 190 | 191 | const noMatch = -1 192 | 193 | // MatchRune reports whether the instruction matches (and consumes) r. 194 | // It should only be called when i.Op == InstRune. 195 | func (i *Inst) MatchRune(r rune) bool { 196 | return i.MatchRunePos(r) != noMatch 197 | } 198 | 199 | // MatchRunePos checks whether the instruction matches (and consumes) r. 200 | // If so, MatchRunePos returns the index of the matching rune pair 201 | // (or, when len(i.Rune) == 1, rune singleton). 202 | // If not, MatchRunePos returns -1. 203 | // MatchRunePos should only be called when i.Op == InstRune. 204 | func (i *Inst) MatchRunePos(r rune) int { 205 | rune := i.Rune 206 | 207 | // Special case: single-rune slice is from literal string, not char class. 208 | if len(rune) == 1 { 209 | r0 := rune[0] 210 | if r == r0 { 211 | return 0 212 | } 213 | if Flags(i.Arg)&FoldCase != 0 { 214 | for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 215 | if r == r1 { 216 | return 0 217 | } 218 | } 219 | } 220 | return noMatch 221 | } 222 | 223 | // Peek at the first few pairs. 224 | // Should handle ASCII well. 225 | for j := 0; j < len(rune) && j <= 8; j += 2 { 226 | if r < rune[j] { 227 | return noMatch 228 | } 229 | if r <= rune[j+1] { 230 | return j / 2 231 | } 232 | } 233 | 234 | // Otherwise binary search. 
235 | lo := 0 236 | hi := len(rune) / 2 237 | for lo < hi { 238 | m := lo + (hi-lo)/2 239 | if c := rune[2*m]; c <= r { 240 | if r <= rune[2*m+1] { 241 | return m 242 | } 243 | lo = m + 1 244 | } else { 245 | hi = m 246 | } 247 | } 248 | return noMatch 249 | } 250 | 251 | // As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char. 252 | // Since we act on runes, it would be easy to support Unicode here. 253 | func wordRune(r rune) bool { 254 | return r == '_' || 255 | ('A' <= r && r <= 'Z') || 256 | ('a' <= r && r <= 'z') || 257 | ('0' <= r && r <= '9') 258 | } 259 | 260 | // MatchEmptyWidth reports whether the instruction matches 261 | // an empty string between the runes before and after. 262 | // It should only be called when i.Op == InstEmptyWidth. 263 | func (i *Inst) MatchEmptyWidth(before rune, after rune) bool { 264 | switch EmptyOp(i.Arg) { 265 | case EmptyBeginLine: 266 | return before == '\n' || before == -1 267 | case EmptyEndLine: 268 | return after == '\n' || after == -1 269 | case EmptyBeginText: 270 | return before == -1 271 | case EmptyEndText: 272 | return after == -1 273 | case EmptyWordBoundary: 274 | return wordRune(before) != wordRune(after) 275 | case EmptyNoWordBoundary: 276 | return wordRune(before) == wordRune(after) 277 | } 278 | panic("unknown empty width arg") 279 | } 280 | 281 | func (i *Inst) String() string { 282 | var b bytes.Buffer 283 | dumpInst(&b, i) 284 | return b.String() 285 | } 286 | 287 | func bw(b *bytes.Buffer, args ...string) { 288 | for _, s := range args { 289 | b.WriteString(s) 290 | } 291 | } 292 | 293 | func dumpProg(b *bytes.Buffer, p *Prog) { 294 | for j := range p.Inst { 295 | i := &p.Inst[j] 296 | pc := strconv.Itoa(j) 297 | if len(pc) < 3 { 298 | b.WriteString(" "[len(pc):]) 299 | } 300 | if j == p.Start { 301 | pc += "*" 302 | } 303 | if j == p.StartUnanchored { 304 | pc += "~" 305 | } 306 | bw(b, pc, "\t") 307 | dumpInst(b, i) 308 | bw(b, "\n") 309 | } 310 | } 311 | 312 | func u32(i 
uint32) string { 313 | return strconv.FormatUint(uint64(i), 10) 314 | } 315 | 316 | func dumpInst(b *bytes.Buffer, i *Inst) { 317 | switch i.Op { 318 | case InstAlt: 319 | bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) 320 | case InstAltMatch: 321 | bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) 322 | case InstCapture: 323 | bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) 324 | case InstEmptyWidth: 325 | bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) 326 | case InstMatch: 327 | bw(b, "match") 328 | case InstFail: 329 | bw(b, "fail") 330 | case InstNop: 331 | bw(b, "nop -> ", u32(i.Out)) 332 | case InstRune: 333 | if i.Rune == nil { 334 | // shouldn't happen 335 | bw(b, "rune ") 336 | } 337 | bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) 338 | if Flags(i.Arg)&FoldCase != 0 { 339 | bw(b, "/i") 340 | } 341 | bw(b, " -> ", u32(i.Out)) 342 | case InstRune1: 343 | bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) 344 | case InstRuneAny: 345 | bw(b, "any -> ", u32(i.Out)) 346 | case InstRuneAnyNotNL: 347 | bw(b, "anynotnl -> ", u32(i.Out)) 348 | } 349 | } 350 | -------------------------------------------------------------------------------- /syntax/prog_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package syntax 6 | 7 | import "testing" 8 | 9 | var compileTests = []struct { 10 | Regexp string 11 | Prog string 12 | }{ 13 | {"a", ` 0 fail 14 | 1* rune1 "a" -> 2 15 | 2 match 16 | 3 any -> 4 17 | 4~ alt -> 1, 3 18 | `}, 19 | {"[A-M][n-z]", ` 0 fail 20 | 1* rune "AM" -> 2 21 | 2 rune "nz" -> 3 22 | 3 match 23 | 4 any -> 5 24 | 5~ alt -> 1, 4 25 | `}, 26 | {"", ` 0 fail 27 | 1* nop -> 2 28 | 2 match 29 | 3 any -> 4 30 | 4~ alt -> 1, 3 31 | `}, 32 | {"a?", ` 0 fail 33 | 1 rune1 "a" -> 3 34 | 2* alt -> 1, 3 35 | 3 match 36 | 4 any -> 5 37 | 5~ alt -> 2, 4 38 | `}, 39 | {"a??", ` 0 fail 40 | 1 rune1 "a" -> 3 41 | 2* alt -> 3, 1 42 | 3 match 43 | 4 any -> 5 44 | 5~ alt -> 2, 4 45 | `}, 46 | {"a+", ` 0 fail 47 | 1* rune1 "a" -> 2 48 | 2 alt -> 1, 3 49 | 3 match 50 | 4 any -> 5 51 | 5~ alt -> 1, 4 52 | `}, 53 | {"a+?", ` 0 fail 54 | 1* rune1 "a" -> 2 55 | 2 alt -> 3, 1 56 | 3 match 57 | 4 any -> 5 58 | 5~ alt -> 1, 4 59 | `}, 60 | {"a*", ` 0 fail 61 | 1 rune1 "a" -> 2 62 | 2* alt -> 1, 3 63 | 3 match 64 | 4 any -> 5 65 | 5~ alt -> 2, 4 66 | `}, 67 | {"a*?", ` 0 fail 68 | 1 rune1 "a" -> 2 69 | 2* alt -> 3, 1 70 | 3 match 71 | 4 any -> 5 72 | 5~ alt -> 2, 4 73 | `}, 74 | {"a+b+", ` 0 fail 75 | 1* rune1 "a" -> 2 76 | 2 alt -> 1, 3 77 | 3 rune1 "b" -> 4 78 | 4 alt -> 3, 5 79 | 5 match 80 | 6 any -> 7 81 | 7~ alt -> 1, 6 82 | `}, 83 | {"(a+)(b+)", ` 0 fail 84 | 1* cap 2 -> 2 85 | 2 rune1 "a" -> 3 86 | 3 alt -> 2, 4 87 | 4 cap 3 -> 5 88 | 5 cap 4 -> 6 89 | 6 rune1 "b" -> 7 90 | 7 alt -> 6, 8 91 | 8 cap 5 -> 9 92 | 9 match 93 | 10 any -> 11 94 | 11~ alt -> 1, 10 95 | `}, 96 | {"a+|b+", ` 0 fail 97 | 1 rune1 "a" -> 2 98 | 2 alt -> 1, 6 99 | 3 rune1 "b" -> 4 100 | 4 alt -> 3, 6 101 | 5* alt -> 1, 3 102 | 6 match 103 | 7 any -> 8 104 | 8~ alt -> 5, 7 105 | `}, 106 | {"A[Aa]", ` 0 fail 107 | 1* rune1 "A" -> 2 108 | 2 rune "A"/i -> 3 109 | 3 match 110 | 4 any -> 5 111 | 5~ alt -> 1, 4 112 | `}, 113 | {"(?:(?:^).)", ` 0 fail 114 | 1* empty 4 -> 2 115 | 2 anynotnl -> 3 116 | 
3 match 117 | 4 any -> 5 118 | 5~ alt -> 1, 4 119 | `}, 120 | } 121 | 122 | func TestCompile(t *testing.T) { 123 | for _, tt := range compileTests { 124 | re, _ := Parse(tt.Regexp, Perl) 125 | p, _ := Compile(re) 126 | s := p.String() 127 | if s != tt.Prog { 128 | t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog) 129 | } 130 | } 131 | } 132 | 133 | func BenchmarkEmptyOpContext(b *testing.B) { 134 | for i := 0; i < b.N; i++ { 135 | var r1 rune = -1 136 | for _, r2 := range "foo, bar, baz\nsome input text.\n" { 137 | EmptyOpContext(r1, r2) 138 | r1 = r2 139 | } 140 | EmptyOpContext(r1, -1) 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /syntax/regexp.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package syntax 6 | 7 | // Note to implementers: 8 | // In this package, re is always a *Regexp and r is always a rune. 9 | 10 | import ( 11 | "bytes" 12 | "strconv" 13 | "strings" 14 | "unicode" 15 | ) 16 | 17 | // A Regexp is a node in a regular expression syntax tree. 18 | type Regexp struct { 19 | Op Op // operator 20 | Flags Flags 21 | Sub []*Regexp // subexpressions, if any 22 | Sub0 [1]*Regexp // storage for short Sub 23 | Rune []rune // matched runes, for OpLiteral, OpCharClass 24 | Rune0 [2]rune // storage for short Rune 25 | Min, Max int // min, max for OpRepeat 26 | Cap int // capturing index, for OpCapture 27 | Name string // capturing name, for OpCapture 28 | } 29 | 30 | // An Op is a single regular expression operator. 31 | type Op uint8 32 | 33 | // Operators are listed in precedence order, tightest binding to weakest. 34 | // Character class operators are listed simplest to most complex 35 | // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). 
36 | 37 | const ( 38 | OpNoMatch Op = 1 + iota // matches no strings 39 | OpEmptyMatch // matches empty string 40 | OpLiteral // matches Runes sequence 41 | OpCharClass // matches Runes interpreted as range pair list 42 | OpAnyCharNotNL // matches any character except newline 43 | OpAnyChar // matches any character 44 | OpBeginLine // matches empty string at beginning of line 45 | OpEndLine // matches empty string at end of line 46 | OpBeginText // matches empty string at beginning of text 47 | OpEndText // matches empty string at end of text 48 | OpWordBoundary // matches word boundary `\b` 49 | OpNoWordBoundary // matches word non-boundary `\B` 50 | OpCapture // capturing subexpression with index Cap, optional name Name 51 | OpStar // matches Sub[0] zero or more times 52 | OpPlus // matches Sub[0] one or more times 53 | OpQuest // matches Sub[0] zero or one times 54 | OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) 55 | OpConcat // matches concatenation of Subs 56 | OpAlternate // matches alternation of Subs 57 | ) 58 | 59 | const opPseudo Op = 128 // where pseudo-ops start 60 | 61 | // Equal returns true if x and y have identical structure. 62 | func (x *Regexp) Equal(y *Regexp) bool { 63 | if x == nil || y == nil { 64 | return x == y 65 | } 66 | if x.Op != y.Op { 67 | return false 68 | } 69 | switch x.Op { 70 | case OpEndText: 71 | // The parse flags remember whether this is \z or \Z. 
72 | if x.Flags&WasDollar != y.Flags&WasDollar { 73 | return false 74 | } 75 | 76 | case OpLiteral, OpCharClass: 77 | if len(x.Rune) != len(y.Rune) { 78 | return false 79 | } 80 | for i, r := range x.Rune { 81 | if r != y.Rune[i] { 82 | return false 83 | } 84 | } 85 | 86 | case OpAlternate, OpConcat: 87 | if len(x.Sub) != len(y.Sub) { 88 | return false 89 | } 90 | for i, sub := range x.Sub { 91 | if !sub.Equal(y.Sub[i]) { 92 | return false 93 | } 94 | } 95 | 96 | case OpStar, OpPlus, OpQuest: 97 | if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { 98 | return false 99 | } 100 | 101 | case OpRepeat: 102 | if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { 103 | return false 104 | } 105 | 106 | case OpCapture: 107 | if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { 108 | return false 109 | } 110 | } 111 | return true 112 | } 113 | 114 | // writeRegexp writes the Perl syntax for the regular expression re to b. 115 | func writeRegexp(b *bytes.Buffer, re *Regexp) { 116 | switch re.Op { 117 | default: 118 | b.WriteString("") 119 | case OpNoMatch: 120 | b.WriteString(`[^\x00-\x{10FFFF}]`) 121 | case OpEmptyMatch: 122 | b.WriteString(`(?:)`) 123 | case OpLiteral: 124 | if re.Flags&FoldCase != 0 { 125 | b.WriteString(`(?i:`) 126 | } 127 | for _, r := range re.Rune { 128 | escape(b, r, false) 129 | } 130 | if re.Flags&FoldCase != 0 { 131 | b.WriteString(`)`) 132 | } 133 | case OpCharClass: 134 | if len(re.Rune)%2 != 0 { 135 | b.WriteString(`[invalid char class]`) 136 | break 137 | } 138 | b.WriteRune('[') 139 | if len(re.Rune) == 0 { 140 | b.WriteString(`^\x00-\x{10FFFF}`) 141 | } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune { 142 | // Contains 0 and MaxRune. Probably a negated class. 143 | // Print the gaps. 
144 | b.WriteRune('^') 145 | for i := 1; i < len(re.Rune)-1; i += 2 { 146 | lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 147 | escape(b, lo, lo == '-') 148 | if lo != hi { 149 | b.WriteRune('-') 150 | escape(b, hi, hi == '-') 151 | } 152 | } 153 | } else { 154 | for i := 0; i < len(re.Rune); i += 2 { 155 | lo, hi := re.Rune[i], re.Rune[i+1] 156 | escape(b, lo, lo == '-') 157 | if lo != hi { 158 | b.WriteRune('-') 159 | escape(b, hi, hi == '-') 160 | } 161 | } 162 | } 163 | b.WriteRune(']') 164 | case OpAnyCharNotNL: 165 | b.WriteString(`(?-s:.)`) 166 | case OpAnyChar: 167 | b.WriteString(`(?s:.)`) 168 | case OpBeginLine: 169 | b.WriteString(`(?m:^)`) 170 | case OpEndLine: 171 | b.WriteString(`(?m:$)`) 172 | case OpBeginText: 173 | b.WriteString(`\A`) 174 | case OpEndText: 175 | if re.Flags&WasDollar != 0 { 176 | b.WriteString(`(?-m:$)`) 177 | } else { 178 | b.WriteString(`\z`) 179 | } 180 | case OpWordBoundary: 181 | b.WriteString(`\b`) 182 | case OpNoWordBoundary: 183 | b.WriteString(`\B`) 184 | case OpCapture: 185 | if re.Name != "" { 186 | b.WriteString(`(?P<`) 187 | b.WriteString(re.Name) 188 | b.WriteRune('>') 189 | } else { 190 | b.WriteRune('(') 191 | } 192 | if re.Sub[0].Op != OpEmptyMatch { 193 | writeRegexp(b, re.Sub[0]) 194 | } 195 | b.WriteRune(')') 196 | case OpStar, OpPlus, OpQuest, OpRepeat: 197 | if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { 198 | b.WriteString(`(?:`) 199 | writeRegexp(b, sub) 200 | b.WriteString(`)`) 201 | } else { 202 | writeRegexp(b, sub) 203 | } 204 | switch re.Op { 205 | case OpStar: 206 | b.WriteRune('*') 207 | case OpPlus: 208 | b.WriteRune('+') 209 | case OpQuest: 210 | b.WriteRune('?') 211 | case OpRepeat: 212 | b.WriteRune('{') 213 | b.WriteString(strconv.Itoa(re.Min)) 214 | if re.Max != re.Min { 215 | b.WriteRune(',') 216 | if re.Max >= 0 { 217 | b.WriteString(strconv.Itoa(re.Max)) 218 | } 219 | } 220 | b.WriteRune('}') 221 | } 222 | if re.Flags&NonGreedy != 0 { 223 | b.WriteRune('?') 224 
| } 225 | case OpConcat: 226 | for _, sub := range re.Sub { 227 | if sub.Op == OpAlternate { 228 | b.WriteString(`(?:`) 229 | writeRegexp(b, sub) 230 | b.WriteString(`)`) 231 | } else { 232 | writeRegexp(b, sub) 233 | } 234 | } 235 | case OpAlternate: 236 | for i, sub := range re.Sub { 237 | if i > 0 { 238 | b.WriteRune('|') 239 | } 240 | writeRegexp(b, sub) 241 | } 242 | } 243 | } 244 | 245 | func (re *Regexp) String() string { 246 | var b bytes.Buffer 247 | writeRegexp(&b, re) 248 | return b.String() 249 | } 250 | 251 | const meta = `\.+*?()|[]{}^$` 252 | 253 | func escape(b *bytes.Buffer, r rune, force bool) { 254 | if unicode.IsPrint(r) { 255 | if strings.ContainsRune(meta, r) || force { 256 | b.WriteRune('\\') 257 | } 258 | b.WriteRune(r) 259 | return 260 | } 261 | 262 | switch r { 263 | case '\a': 264 | b.WriteString(`\a`) 265 | case '\f': 266 | b.WriteString(`\f`) 267 | case '\n': 268 | b.WriteString(`\n`) 269 | case '\r': 270 | b.WriteString(`\r`) 271 | case '\t': 272 | b.WriteString(`\t`) 273 | case '\v': 274 | b.WriteString(`\v`) 275 | default: 276 | if r < 0x100 { 277 | b.WriteString(`\x`) 278 | s := strconv.FormatInt(int64(r), 16) 279 | if len(s) == 1 { 280 | b.WriteRune('0') 281 | } 282 | b.WriteString(s) 283 | break 284 | } 285 | b.WriteString(`\x{`) 286 | b.WriteString(strconv.FormatInt(int64(r), 16)) 287 | b.WriteString(`}`) 288 | } 289 | } 290 | 291 | // MaxCap walks the regexp to find the maximum capture index. 292 | func (re *Regexp) MaxCap() int { 293 | m := 0 294 | if re.Op == OpCapture { 295 | m = re.Cap 296 | } 297 | for _, sub := range re.Sub { 298 | if n := sub.MaxCap(); m < n { 299 | m = n 300 | } 301 | } 302 | return m 303 | } 304 | 305 | // CapNames walks the regexp to find the names of capturing groups. 
306 | func (re *Regexp) CapNames() []string { 307 | names := make([]string, re.MaxCap()+1) 308 | re.capNames(names) 309 | return names 310 | } 311 | 312 | func (re *Regexp) capNames(names []string) { 313 | if re.Op == OpCapture { 314 | names[re.Cap] = re.Name 315 | } 316 | for _, sub := range re.Sub { 317 | sub.capNames(names) 318 | } 319 | } 320 | -------------------------------------------------------------------------------- /syntax/simplify.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package syntax 6 | 7 | // Simplify returns a regexp equivalent to re but without counted repetitions 8 | // and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/. 9 | // The resulting regexp will execute correctly but its string representation 10 | // will not produce the same parse tree, because capturing parentheses 11 | // may have been duplicated or removed. For example, the simplified form 12 | // for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1. 13 | // The returned regexp may share structure with or be the original. 14 | func (re *Regexp) Simplify() *Regexp { 15 | if re == nil { 16 | return nil 17 | } 18 | switch re.Op { 19 | case OpCapture, OpConcat, OpAlternate: 20 | // Simplify children, building new Regexp if children change. 21 | nre := re 22 | for i, sub := range re.Sub { 23 | nsub := sub.Simplify() 24 | if nre == re && nsub != sub { 25 | // Start a copy. 26 | nre = new(Regexp) 27 | *nre = *re 28 | nre.Rune = nil 29 | nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...) 
30 | } 31 | if nre != re { 32 | nre.Sub = append(nre.Sub, nsub) 33 | } 34 | } 35 | return nre 36 | 37 | case OpStar, OpPlus, OpQuest: 38 | sub := re.Sub[0].Simplify() 39 | return simplify1(re.Op, re.Flags, sub, re) 40 | 41 | case OpRepeat: 42 | // Special special case: x{0} matches the empty string 43 | // and doesn't even need to consider x. 44 | if re.Min == 0 && re.Max == 0 { 45 | return &Regexp{Op: OpEmptyMatch} 46 | } 47 | 48 | // The fun begins. 49 | sub := re.Sub[0].Simplify() 50 | 51 | // x{n,} means at least n matches of x. 52 | if re.Max == -1 { 53 | // Special case: x{0,} is x*. 54 | if re.Min == 0 { 55 | return simplify1(OpStar, re.Flags, sub, nil) 56 | } 57 | 58 | // Special case: x{1,} is x+. 59 | if re.Min == 1 { 60 | return simplify1(OpPlus, re.Flags, sub, nil) 61 | } 62 | 63 | // General case: x{4,} is xxxx+. 64 | nre := &Regexp{Op: OpConcat} 65 | nre.Sub = nre.Sub0[:0] 66 | for i := 0; i < re.Min-1; i++ { 67 | nre.Sub = append(nre.Sub, sub) 68 | } 69 | nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil)) 70 | return nre 71 | } 72 | 73 | // Special case x{0} handled above. 74 | 75 | // Special case: x{1} is just x. 76 | if re.Min == 1 && re.Max == 1 { 77 | return sub 78 | } 79 | 80 | // General case: x{n,m} means n copies of x and m copies of x? 81 | // The machine will do less work if we nest the final m copies, 82 | // so that x{2,5} = xx(x(x(x)?)?)? 83 | 84 | // Build leading prefix: xx. 85 | var prefix *Regexp 86 | if re.Min > 0 { 87 | prefix = &Regexp{Op: OpConcat} 88 | prefix.Sub = prefix.Sub0[:0] 89 | for i := 0; i < re.Min; i++ { 90 | prefix.Sub = append(prefix.Sub, sub) 91 | } 92 | } 93 | 94 | // Build and attach suffix: (x(x(x)?)?)? 
95 | if re.Max > re.Min { 96 | suffix := simplify1(OpQuest, re.Flags, sub, nil) 97 | for i := re.Min + 1; i < re.Max; i++ { 98 | nre2 := &Regexp{Op: OpConcat} 99 | nre2.Sub = append(nre2.Sub0[:0], sub, suffix) 100 | suffix = simplify1(OpQuest, re.Flags, nre2, nil) 101 | } 102 | if prefix == nil { 103 | return suffix 104 | } 105 | prefix.Sub = append(prefix.Sub, suffix) 106 | } 107 | if prefix != nil { 108 | return prefix 109 | } 110 | 111 | // Some degenerate case like min > max or min < max < 0. 112 | // Handle as impossible match. 113 | return &Regexp{Op: OpNoMatch} 114 | } 115 | 116 | return re 117 | } 118 | 119 | // simplify1 implements Simplify for the unary OpStar, 120 | // OpPlus, and OpQuest operators. It returns the simple regexp 121 | // equivalent to 122 | // 123 | // Regexp{Op: op, Flags: flags, Sub: {sub}} 124 | // 125 | // under the assumption that sub is already simple, and 126 | // without first allocating that structure. If the regexp 127 | // to be returned turns out to be equivalent to re, simplify1 128 | // returns re instead. 129 | // 130 | // simplify1 is factored out of Simplify because the implementation 131 | // for other operators generates these unary expressions. 132 | // Letting them call simplify1 makes sure the expressions they 133 | // generate are simple. 134 | func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp { 135 | // Special case: repeat the empty string as much as 136 | // you want, but it's still the empty string. 137 | if sub.Op == OpEmptyMatch { 138 | return sub 139 | } 140 | // The operators are idempotent if the flags match. 
141 | if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy { 142 | return sub 143 | } 144 | if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] { 145 | return re 146 | } 147 | 148 | re = &Regexp{Op: op, Flags: flags} 149 | re.Sub = append(re.Sub0[:0], sub) 150 | return re 151 | } 152 | -------------------------------------------------------------------------------- /syntax/simplify_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package syntax 6 | 7 | import "testing" 8 | 9 | var simplifyTests = []struct { 10 | Regexp string 11 | Simple string 12 | }{ 13 | // Already-simple constructs 14 | {`a`, `a`}, 15 | {`ab`, `ab`}, 16 | {`a|b`, `[a-b]`}, 17 | {`ab|cd`, `ab|cd`}, 18 | {`(ab)*`, `(ab)*`}, 19 | {`(ab)+`, `(ab)+`}, 20 | {`(ab)?`, `(ab)?`}, 21 | {`.`, `(?s:.)`}, 22 | {`^`, `(?m:^)`}, 23 | {`$`, `(?m:$)`}, 24 | {`[ac]`, `[ac]`}, 25 | {`[^ac]`, `[^ac]`}, 26 | 27 | // Posix character classes 28 | {`[[:alnum:]]`, `[0-9A-Za-z]`}, 29 | {`[[:alpha:]]`, `[A-Za-z]`}, 30 | {`[[:blank:]]`, `[\t ]`}, 31 | {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`}, 32 | {`[[:digit:]]`, `[0-9]`}, 33 | {`[[:graph:]]`, `[!-~]`}, 34 | {`[[:lower:]]`, `[a-z]`}, 35 | {`[[:print:]]`, `[ -~]`}, 36 | {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"}, 37 | {`[[:space:]]`, `[\t-\r ]`}, 38 | {`[[:upper:]]`, `[A-Z]`}, 39 | {`[[:xdigit:]]`, `[0-9A-Fa-f]`}, 40 | 41 | // Perl character classes 42 | {`\d`, `[0-9]`}, 43 | {`\s`, `[\t-\n\f-\r ]`}, 44 | {`\w`, `[0-9A-Z_a-z]`}, 45 | {`\D`, `[^0-9]`}, 46 | {`\S`, `[^\t-\n\f-\r ]`}, 47 | {`\W`, `[^0-9A-Z_a-z]`}, 48 | {`[\d]`, `[0-9]`}, 49 | {`[\s]`, `[\t-\n\f-\r ]`}, 50 | {`[\w]`, `[0-9A-Z_a-z]`}, 51 | {`[\D]`, `[^0-9]`}, 52 | {`[\S]`, `[^\t-\n\f-\r ]`}, 53 | {`[\W]`, `[^0-9A-Z_a-z]`}, 54 | 55 | // Posix repetitions 56 | 
{`a{1}`, `a`}, 57 | {`a{2}`, `aa`}, 58 | {`a{5}`, `aaaaa`}, 59 | {`a{0,1}`, `a?`}, 60 | // The next three are illegible because Simplify inserts (?:) 61 | // parens instead of () parens to avoid creating extra 62 | // captured subexpressions. The comments show a version with fewer parens. 63 | {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)? 64 | {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)? 65 | {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)? 66 | {`a{0,2}`, `(?:aa?)?`}, // (aa?)? 67 | {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)? 68 | {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)? 69 | {`a{0,}`, `a*`}, 70 | {`a{1,}`, `a+`}, 71 | {`a{2,}`, `aa+`}, 72 | {`a{5,}`, `aaaaa+`}, 73 | 74 | // Test that operators simplify their arguments. 75 | {`(?:a{1,}){1,}`, `a+`}, 76 | {`(a{1,}b{1,})`, `(a+b+)`}, 77 | {`a{1,}|b{1,}`, `a+|b+`}, 78 | {`(?:a{1,})*`, `(?:a+)*`}, 79 | {`(?:a{1,})+`, `a+`}, 80 | {`(?:a{1,})?`, `(?:a+)?`}, 81 | {``, `(?:)`}, 82 | {`a{0}`, `(?:)`}, 83 | 84 | // Character class simplification 85 | {`[ab]`, `[a-b]`}, 86 | {`[a-za-za-z]`, `[a-z]`}, 87 | {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, 88 | {`[ABCDEFGH]`, `[A-H]`}, 89 | {`[AB-CD-EF-GH]`, `[A-H]`}, 90 | {`[W-ZP-XE-R]`, `[E-Z]`}, 91 | {`[a-ee-gg-m]`, `[a-m]`}, 92 | {`[a-ea-ha-m]`, `[a-m]`}, 93 | {`[a-ma-ha-e]`, `[a-m]`}, 94 | {`[a-zA-Z0-9 -~]`, `[ -~]`}, 95 | 96 | // Empty character classes 97 | {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`}, 98 | 99 | // Full character classes 100 | {`[[:cntrl:][:^cntrl:]]`, `(?s:.)`}, 101 | 102 | // Unicode case folding. 
103 | {`(?i)A`, `(?i:A)`}, 104 | {`(?i)a`, `(?i:A)`}, 105 | {`(?i)[A]`, `(?i:A)`}, 106 | {`(?i)[a]`, `(?i:A)`}, 107 | {`(?i)K`, `(?i:K)`}, 108 | {`(?i)k`, `(?i:K)`}, 109 | {`(?i)\x{212a}`, "(?i:K)"}, 110 | {`(?i)[K]`, "[Kk\u212A]"}, 111 | {`(?i)[k]`, "[Kk\u212A]"}, 112 | {`(?i)[\x{212a}]`, "[Kk\u212A]"}, 113 | {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"}, 114 | {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"}, 115 | {`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`}, 116 | 117 | // Empty string as a regular expression. 118 | // The empty string must be preserved inside parens in order 119 | // to make submatches work right, so these tests are less 120 | // interesting than they might otherwise be. String inserts 121 | // explicit (?:) in place of non-parenthesized empty strings, 122 | // to make them easier to spot for other parsers. 123 | {`(a|b|)`, `([a-b]|(?:))`}, 124 | {`(|)`, `()`}, 125 | {`a()`, `a()`}, 126 | {`(()|())`, `(()|())`}, 127 | {`(a|)`, `(a|(?:))`}, 128 | {`ab()cd()`, `ab()cd()`}, 129 | {`()`, `()`}, 130 | {`()*`, `()*`}, 131 | {`()+`, `()+`}, 132 | {`()?`, `()?`}, 133 | {`(){0}`, `(?:)`}, 134 | {`(){1}`, `()`}, 135 | {`(){1,}`, `()+`}, 136 | {`(){0,2}`, `(?:()()?)?`}, 137 | } 138 | 139 | func TestSimplify(t *testing.T) { 140 | for _, tt := range simplifyTests { 141 | re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine) 142 | if err != nil { 143 | t.Errorf("Parse(%#q) = error %v", tt.Regexp, err) 144 | continue 145 | } 146 | s := re.Simplify().String() 147 | if s != tt.Simple { 148 | t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple) 149 | } 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /testdata/README: -------------------------------------------------------------------------------- 1 | AT&T POSIX Test Files 2 | See textregex.c for copyright + license. 
3 | 4 | testregex.c http://www2.research.att.com/~gsf/testregex/testregex.c 5 | basic.dat http://www2.research.att.com/~gsf/testregex/basic.dat 6 | nullsubexpr.dat http://www2.research.att.com/~gsf/testregex/nullsubexpr.dat 7 | repetition.dat http://www2.research.att.com/~gsf/testregex/repetition.dat 8 | 9 | The test data has been edited to reflect RE2/Go differences: 10 | * In a star of a possibly empty match like (a*)* matching x, 11 | the no match case runs the starred subexpression zero times, 12 | not once. This is consistent with (a*)* matching a, which 13 | runs the starred subexpression one time, not twice. 14 | * The submatch choice is first match, not the POSIX rule. 15 | 16 | Such changes are marked with 'RE2/Go'. 17 | 18 | 19 | RE2 Test Files 20 | 21 | re2-exhaustive.txt.bz2 and re2-search.txt are built by running 22 | 'make log' in the RE2 distribution https://github.com/google/re2/ 23 | 24 | The exhaustive file is compressed because it is huge. 25 | -------------------------------------------------------------------------------- /testdata/basic.dat: -------------------------------------------------------------------------------- 1 | NOTE all standard compliant implementations should pass these : 2002-05-31 2 | 3 | BE abracadabra$ abracadabracadabra (7,18) 4 | BE a...b abababbb (2,7) 5 | BE XXXXXX ..XXXXXX (2,8) 6 | E \) () (1,2) 7 | BE a] a]a (0,2) 8 | B } } (0,1) 9 | E \} } (0,1) 10 | BE \] ] (0,1) 11 | B ] ] (0,1) 12 | E ] ] (0,1) 13 | B { { (0,1) 14 | B } } (0,1) 15 | BE ^a ax (0,1) 16 | BE \^a a^a (1,3) 17 | BE a\^ a^ (0,2) 18 | BE a$ aa (1,2) 19 | BE a\$ a$ (0,2) 20 | BE ^$ NULL (0,0) 21 | E $^ NULL (0,0) 22 | E a($) aa (1,2)(2,2) 23 | E a*(^a) aa (0,1)(0,1) 24 | E (..)*(...)* a (0,0) 25 | E (..)*(...)* abcd (0,4)(2,4) 26 | E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) 27 | E (ab)c|abc abc (0,3)(0,2) 28 | E a{0}b ab (1,2) 29 | E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 30 | E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 31 | E 
a{9876543210} NULL BADBR 32 | E ((a|a)|a) a (0,1)(0,1)(0,1) 33 | E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) 34 | E a*(a.|aa) aaaa (0,4)(2,4) 35 | E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) 36 | E (a|b)?.* b (0,1)(0,1) 37 | E (a|b)c|a(b|c) ac (0,2)(0,1) 38 | E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) 39 | E (a|b)*c|(a|ab)*c abc (0,3)(1,2) 40 | E (a|b)*c|(a|ab)*c xc (1,2) 41 | E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) 42 | E a?(ab|ba)ab abab (0,4)(0,2) 43 | E a?(ac{0}b|ba)ab abab (0,4)(0,2) 44 | E ab|abab abbabab (0,2) 45 | E aba|bab|bba baaabbbaba (5,8) 46 | E aba|bab baaabbbaba (6,9) 47 | E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) 48 | E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) 49 | E ab|a xabc (1,3) 50 | E ab|a xxabc (2,4) 51 | Ei (Ab|cD)* aBcD (0,4)(2,4) 52 | BE [^-] --a (2,3) 53 | BE [a-]* --a (0,3) 54 | BE [a-m-]* --amoma-- (0,4) 55 | E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) 56 | E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) 57 | {E [[:upper:]] A (0,1) [[]] not supported 58 | E [[:lower:]]+ `az{ (1,3) 59 | E [[:upper:]]+ @AZ[ (1,3) 60 | # No collation in Go 61 | #BE [[-]] [[-]] (2,4) 62 | #BE [[.NIL.]] NULL ECOLLATE 63 | #BE [[=aleph=]] NULL ECOLLATE 64 | } 65 | BE$ \n \n (0,1) 66 | BEn$ \n \n (0,1) 67 | BE$ [^a] \n (0,1) 68 | BE$ \na \na (0,2) 69 | E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) 70 | BE xxx xxx (0,3) 71 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) 72 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) 73 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) 74 | E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) 75 | E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) 76 | E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) 77 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) 78 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) 79 | E 
aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) 80 | BE$ .* \x01\xff (0,2) 81 | E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) 82 | L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH 83 | E a*a*a*a*a*b aaaaaaaaab (0,10) 84 | BE ^ NULL (0,0) 85 | BE $ NULL (0,0) 86 | BE ^$ NULL (0,0) 87 | BE ^a$ a (0,1) 88 | BE abc abc (0,3) 89 | BE abc xabcy (1,4) 90 | BE abc ababc (2,5) 91 | BE ab*c abc (0,3) 92 | BE ab*bc abc (0,3) 93 | BE ab*bc abbc (0,4) 94 | BE ab*bc abbbbc (0,6) 95 | E ab+bc abbc (0,4) 96 | E ab+bc abbbbc (0,6) 97 | E ab?bc abbc (0,4) 98 | E ab?bc abc (0,3) 99 | E ab?c abc (0,3) 100 | BE ^abc$ abc (0,3) 101 | BE ^abc abcc (0,3) 102 | BE abc$ aabc (1,4) 103 | BE ^ abc (0,0) 104 | BE $ abc (3,3) 105 | BE a.c abc (0,3) 106 | BE a.c axc (0,3) 107 | BE a.*c axyzc (0,5) 108 | BE a[bc]d abd (0,3) 109 | BE a[b-d]e ace (0,3) 110 | BE a[b-d] aac (1,3) 111 | BE a[-b] a- (0,2) 112 | BE a[b-] a- (0,2) 113 | BE a] a] (0,2) 114 | BE a[]]b a]b (0,3) 115 | BE a[^bc]d aed (0,3) 116 | BE a[^-b]c adc (0,3) 117 | BE a[^]b]c adc (0,3) 118 | E ab|cd abc (0,2) 119 | E ab|cd abcd (0,2) 120 | E a\(b a(b (0,3) 121 | E a\(*b ab (0,2) 122 | E a\(*b a((b (0,4) 123 | E ((a)) abc (0,1)(0,1)(0,1) 124 | E (a)b(c) abc (0,3)(0,1)(2,3) 125 | E a+b+c aabbabc (4,7) 126 | E a* aaa (0,3) 127 | #E (a*)* - (0,0)(0,0) 128 | E (a*)* - (0,0)(?,?) RE2/Go 129 | E (a*)+ - (0,0)(0,0) 130 | #E (a*|b)* - (0,0)(0,0) 131 | E (a*|b)* - (0,0)(?,?) RE2/Go 132 | E (a+|b)* ab (0,2)(1,2) 133 | E (a+|b)+ ab (0,2)(1,2) 134 | E (a+|b)? ab (0,1)(0,1) 135 | BE [^ab]* cde (0,3) 136 | #E (^)* - (0,0)(0,0) 137 | E (^)* - (0,0)(?,?) 
RE2/Go 138 | BE a* NULL (0,0) 139 | E ([abc])*d abbbcd (0,6)(4,5) 140 | E ([abc])*bcd abcd (0,4)(0,1) 141 | E a|b|c|d|e e (0,1) 142 | E (a|b|c|d|e)f ef (0,2)(0,1) 143 | #E ((a*|b))* - (0,0)(0,0)(0,0) 144 | E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go 145 | BE abcd*efg abcdefg (0,7) 146 | BE ab* xabyabbbz (1,3) 147 | BE ab* xayabbbz (1,2) 148 | E (ab|cd)e abcde (2,5)(2,4) 149 | BE [abhgefdc]ij hij (0,3) 150 | E (a|b)c*d abcd (1,4)(1,2) 151 | E (ab|ab*)bc abc (0,3)(0,1) 152 | E a([bc]*)c* abc (0,3)(1,3) 153 | E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) 154 | E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) 155 | E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) 156 | E a[bcd]*dcdcde adcdcde (0,7) 157 | E (ab|a)b*c abc (0,3)(0,2) 158 | E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) 159 | BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) 160 | E ^a(bc+|b[eh])g|.h$ abh (1,3) 161 | E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) 162 | E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) 163 | E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) 164 | E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) 165 | BE multiple words multiple words yeah (0,14) 166 | E (.*)c(.*) abcde (0,5)(0,2)(3,5) 167 | BE abcd abcd (0,4) 168 | E a(bc)d abcd (0,4)(1,3) 169 | E a[-]?c ac (0,3) 170 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) 171 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) 172 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) 173 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) 174 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) 175 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) 176 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) 177 | E 
M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) 178 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) 179 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) 180 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) 181 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) 182 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) 183 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) 184 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) 185 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) 186 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) 187 | E a+(b|c)*d+ aabcdd (0,6)(3,4) 188 | E ^.+$ vivi (0,4) 189 | E ^(.+)$ vivi (0,4)(0,4) 190 | E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) 191 | E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) 192 | E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) 193 | E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) 194 | E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) 195 | E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) 196 | E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) 197 | E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) 198 | E ((foo)|bar)!bas bar!bas (0,7)(0,3) 199 | E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) 200 | E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) 201 | E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) 202 | E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) 203 | E (foo|(bar))!bas foo!bas (0,7)(0,3) 204 | E (foo|bar)!bas bar!bas (0,7)(0,3) 205 | E (foo|bar)!bas foo!bar!bas (4,11)(4,7) 206 | 
E (foo|bar)!bas foo!bas (0,7)(0,3) 207 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 208 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) 209 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) 210 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) 211 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) 212 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) 213 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) 214 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 215 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) 216 | E .*(/XXX).* /XXX (0,4)(0,4) 217 | E .*(\\XXX).* \XXX (0,4)(0,4) 218 | E \\XXX \XXX (0,4) 219 | E .*(/000).* /000 (0,4)(0,4) 220 | E .*(\\000).* \000 (0,4)(0,4) 221 | E \\000 \000 (0,4) 222 | -------------------------------------------------------------------------------- /testdata/nullsubexpr.dat: -------------------------------------------------------------------------------- 1 | NOTE null subexpression matches : 2002-06-06 2 | 3 | E (a*)* a (0,1)(0,1) 4 | #E SAME x (0,0)(0,0) 5 | E SAME x (0,0)(?,?) RE2/Go 6 | E SAME aaaaaa (0,6)(0,6) 7 | E SAME aaaaaax (0,6)(0,6) 8 | E (a*)+ a (0,1)(0,1) 9 | E SAME x (0,0)(0,0) 10 | E SAME aaaaaa (0,6)(0,6) 11 | E SAME aaaaaax (0,6)(0,6) 12 | E (a+)* a (0,1)(0,1) 13 | E SAME x (0,0) 14 | E SAME aaaaaa (0,6)(0,6) 15 | E SAME aaaaaax (0,6)(0,6) 16 | E (a+)+ a (0,1)(0,1) 17 | E SAME x NOMATCH 18 | E SAME aaaaaa (0,6)(0,6) 19 | E SAME aaaaaax (0,6)(0,6) 20 | 21 | E ([a]*)* a (0,1)(0,1) 22 | #E SAME x (0,0)(0,0) 23 | E SAME x (0,0)(?,?) 
RE2/Go 24 | E SAME aaaaaa (0,6)(0,6) 25 | E SAME aaaaaax (0,6)(0,6) 26 | E ([a]*)+ a (0,1)(0,1) 27 | E SAME x (0,0)(0,0) 28 | E SAME aaaaaa (0,6)(0,6) 29 | E SAME aaaaaax (0,6)(0,6) 30 | E ([^b]*)* a (0,1)(0,1) 31 | #E SAME b (0,0)(0,0) 32 | E SAME b (0,0)(?,?) RE2/Go 33 | E SAME aaaaaa (0,6)(0,6) 34 | E SAME aaaaaab (0,6)(0,6) 35 | E ([ab]*)* a (0,1)(0,1) 36 | E SAME aaaaaa (0,6)(0,6) 37 | E SAME ababab (0,6)(0,6) 38 | E SAME bababa (0,6)(0,6) 39 | E SAME b (0,1)(0,1) 40 | E SAME bbbbbb (0,6)(0,6) 41 | E SAME aaaabcde (0,5)(0,5) 42 | E ([^a]*)* b (0,1)(0,1) 43 | E SAME bbbbbb (0,6)(0,6) 44 | #E SAME aaaaaa (0,0)(0,0) 45 | E SAME aaaaaa (0,0)(?,?) RE2/Go 46 | E ([^ab]*)* ccccxx (0,6)(0,6) 47 | #E SAME ababab (0,0)(0,0) 48 | E SAME ababab (0,0)(?,?) RE2/Go 49 | 50 | E ((z)+|a)* zabcde (0,2)(1,2) 51 | 52 | #{E a+? aaaaaa (0,1) no *? +? mimimal match ops 53 | #E (a) aaa (0,1)(0,1) 54 | #E (a*?) aaa (0,0)(0,0) 55 | #E (a)*? aaa (0,0) 56 | #E (a*?)*? aaa (0,0) 57 | #} 58 | 59 | B \(a*\)*\(x\) x (0,1)(0,0)(0,1) 60 | B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) 61 | B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) 62 | B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) 63 | B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) 64 | B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) 65 | B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) 66 | B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) 67 | 68 | #E (a*)*(x) x (0,1)(0,0)(0,1) 69 | E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go 70 | E (a*)*(x) ax (0,2)(0,1)(1,2) 71 | E (a*)*(x) axa (0,2)(0,1)(1,2) 72 | 73 | E (a*)+(x) x (0,1)(0,0)(0,1) 74 | E (a*)+(x) ax (0,2)(0,1)(1,2) 75 | E (a*)+(x) axa (0,2)(0,1)(1,2) 76 | 77 | E (a*){2}(x) x (0,1)(0,0)(0,1) 78 | E (a*){2}(x) ax (0,2)(1,1)(1,2) 79 | E (a*){2}(x) axa (0,2)(1,1)(1,2) 80 | -------------------------------------------------------------------------------- /testdata/re2-exhaustive.txt.bz2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/matloob/regexp/a9296bf6ee794d8726f013c254654bb606da0b7e/testdata/re2-exhaustive.txt.bz2 -------------------------------------------------------------------------------- /testdata/repetition.dat: -------------------------------------------------------------------------------- 1 | NOTE implicit vs. explicit repetitions : 2009-02-02 2 | 3 | # Glenn Fowler 4 | # conforming matches (column 4) must match one of the following BREs 5 | # NOMATCH 6 | # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* 7 | # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* 8 | # i.e., each 3-tuple has two identical elements and one (?,?) 9 | 10 | E ((..)|(.)) NULL NOMATCH 11 | E ((..)|(.))((..)|(.)) NULL NOMATCH 12 | E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH 13 | 14 | E ((..)|(.)){1} NULL NOMATCH 15 | E ((..)|(.)){2} NULL NOMATCH 16 | E ((..)|(.)){3} NULL NOMATCH 17 | 18 | E ((..)|(.))* NULL (0,0) 19 | 20 | E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) 21 | E ((..)|(.))((..)|(.)) a NOMATCH 22 | E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH 23 | 24 | E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) 25 | E ((..)|(.)){2} a NOMATCH 26 | E ((..)|(.)){3} a NOMATCH 27 | 28 | E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) 29 | 30 | E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) 31 | E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) 32 | E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH 33 | 34 | E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) 35 | E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) 36 | E ((..)|(.)){3} aa NOMATCH 37 | 38 | E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) 39 | 40 | E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) 41 | E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) 42 | E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) 43 | 44 | E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) 
45 | #E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) 46 | E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 47 | E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) 48 | 49 | #E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) 50 | E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 51 | 52 | E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) 53 | E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 54 | E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) 55 | 56 | E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) 57 | E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) 58 | #E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) 59 | E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go 60 | 61 | E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) 62 | 63 | E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) 64 | E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 65 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) 66 | 67 | E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) 68 | E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) 69 | #E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) 70 | E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 71 | 72 | #E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) 73 | E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 74 | 75 | E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) 76 | E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 77 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) 78 | 79 | E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) 80 | E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) 81 | E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) 82 | 83 | E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 84 | 85 | NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 86 | 87 | # These test a bug in OS X / FreeBSD / NetBSD, and libtree. 88 | # Linux/GLIBC gets the {8,} and {8,8} wrong. 
89 | 90 | :HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) 91 | :HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) 92 | :HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) 93 | :HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) 94 | :HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) 95 | :HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) 96 | :HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) 97 | :HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) 98 | :HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) 99 | #:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) 100 | :HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go 101 | #:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) 102 | :HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go 103 | #:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) 104 | :HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go 105 | #:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) 106 | :HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go 107 | #:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) 108 | :HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go 109 | #:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) 110 | :HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go 111 | #:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) 112 | :HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go 113 | #:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) 114 | :HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go 115 | :HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) 116 | 117 | # These test a fixed bug in my regex-tdfa that did not keep the expanded 118 | # form properly grouped, so right association did the wrong thing with 119 | # these ambiguous patterns (crafted just to test my code when I became 120 | # suspicious of my implementation). The first subexpression should use 121 | # "ab" then "a" then "bcd". 122 | 123 | # OS X / FreeBSD / NetBSD badly fail many of these, with impossible 124 | # results like (0,6)(4,5)(6,6). 
125 | 126 | :HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) 127 | :HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) 128 | :HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 129 | :HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 130 | :HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH 131 | :HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) 132 | :HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) 133 | :HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 134 | :HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 135 | :HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH 136 | :HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) 137 | :HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) 138 | 139 | # The above worked on Linux/GLIBC but the following often fail. 140 | # They also trip up OS X / FreeBSD / NetBSD: 141 | 142 | #:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) 143 | :HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 144 | #:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) 145 | :HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 146 | #:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 147 | :HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 148 | #:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 149 | :HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 150 | :HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH 151 | #:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) 152 | :HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 153 | #:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) 154 | :HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 155 | #:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 156 | :HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 157 | #:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 158 | :HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 159 | :HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH 160 | 
#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) 161 | :HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 162 | #:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) 163 | :HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 164 | --------------------------------------------------------------------------------