├── LICENSE ├── all_test.go ├── backtrack.go ├── example_test.go ├── exec.go ├── exec2_test.go ├── exec_test.go ├── find_test.go ├── go.mod ├── onepass.go ├── onepass_test.go ├── regexp.go ├── syntax ├── compile.go ├── doc.go ├── op_string.go ├── parse.go ├── parse_test.go ├── perl_groups.go ├── prog.go ├── prog_test.go ├── regexp.go ├── simplify.go └── simplify_test.go └── testdata ├── README ├── basic.dat ├── nullsubexpr.dat ├── re2-exhaustive.txt.bz2 ├── re2-search.txt ├── repetition.dat └── testregex.c /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /all_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package binaryregexp 6 | 7 | import ( 8 | "reflect" 9 | "strings" 10 | "testing" 11 | "unicode/utf8" 12 | 13 | "rsc.io/binaryregexp/syntax" 14 | ) 15 | 16 | var goodRe = []string{ 17 | ``, 18 | `.`, 19 | `^.$`, 20 | `a`, 21 | `a*`, 22 | `a+`, 23 | `a?`, 24 | `a|b`, 25 | `a*|b*`, 26 | `(a*|b)(c*|d)`, 27 | `[a-z]`, 28 | `[a-abc-c\-\]\[]`, 29 | `[a-z]+`, 30 | `[abc]`, 31 | `[^1234]`, 32 | `[^\n]`, 33 | `\!\\`, 34 | } 35 | 36 | type stringError struct { 37 | re string 38 | err string 39 | } 40 | 41 | var badRe = []stringError{ 42 | {`*`, "missing argument to repetition operator: `*`"}, 43 | {`+`, "missing argument to repetition operator: `+`"}, 44 | {`?`, "missing argument to repetition operator: `?`"}, 45 | {`(abc`, "missing closing ): `(abc`"}, 46 | {`abc)`, "unexpected ): `abc)`"}, 47 | {`x[a-z`, "missing closing ]: `[a-z`"}, 48 | {`[z-a]`, "invalid character class range: `z-a`"}, 49 | {`abc\`, "trailing backslash at end of expression"}, 50 | {`a**`, "invalid nested repetition operator: `**`"}, 51 | {`a*+`, "invalid nested repetition operator: `*+`"}, 52 | {`\x`, "invalid 
escape sequence: `\\x`"}, 53 | } 54 | 55 | func compileTest(t *testing.T, expr string, error string) *Regexp { 56 | re, err := Compile(expr) 57 | if error == "" && err != nil { 58 | t.Error("compiling `", expr, "`; unexpected error: ", err.Error()) 59 | } 60 | if error != "" && err == nil { 61 | t.Error("compiling `", expr, "`; missing error") 62 | } else if error != "" && !strings.Contains(err.Error(), error) { 63 | t.Error("compiling `", expr, "`; wrong error: ", err.Error(), "; want ", error) 64 | } 65 | return re 66 | } 67 | 68 | func TestGoodCompile(t *testing.T) { 69 | for i := 0; i < len(goodRe); i++ { 70 | compileTest(t, goodRe[i], "") 71 | } 72 | } 73 | 74 | func TestBadCompile(t *testing.T) { 75 | for i := 0; i < len(badRe); i++ { 76 | compileTest(t, badRe[i].re, badRe[i].err) 77 | } 78 | } 79 | 80 | func matchTest(t *testing.T, test *FindTest) { 81 | re := compileTest(t, test.pat, "") 82 | if re == nil { 83 | return 84 | } 85 | m := re.MatchString(test.text) 86 | if m != (len(test.matches) > 0) { 87 | t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0) 88 | } 89 | // now try bytes 90 | m = re.Match([]byte(test.text)) 91 | if m != (len(test.matches) > 0) { 92 | t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) 93 | } 94 | } 95 | 96 | func TestMatch(t *testing.T) { 97 | for _, test := range findTests { 98 | matchTest(t, &test) 99 | } 100 | } 101 | // matchFunctionTest checks the package-level MatchString result for test against the expected outcome. 102 | func matchFunctionTest(t *testing.T, test *FindTest) { 103 | m, err := MatchString(test.pat, test.text) 104 | if err != nil { // MatchString reported an error, so m carries no result to compare 105 | return 106 | } 107 | if m != (len(test.matches) > 0) { 108 | t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) 109 | } 110 | } 111 | 112 | func TestMatchFunction(t *testing.T) { 113 | for _, test := range findTests { 114 | matchFunctionTest(t, &test) 115 | } 116 | } 117 | 118 | func copyMatchTest(t *testing.T, test *FindTest) { 119 | re := compileTest(t, test.pat, "") 120 | if re == 
nil { 121 | return 122 | } 123 | m1 := re.MatchString(test.text) 124 | m2 := re.Copy().MatchString(test.text) 125 | if m1 != m2 { 126 | t.Errorf("Copied Regexp match failure on %s: original gave %t; copy gave %t; should be %t", 127 | test, m1, m2, len(test.matches) > 0) 128 | } 129 | } 130 | 131 | func TestCopyMatch(t *testing.T) { 132 | for _, test := range findTests { 133 | copyMatchTest(t, &test) 134 | } 135 | } 136 | 137 | type ReplaceTest struct { 138 | pattern, replacement, input, output string 139 | } 140 | 141 | var replaceTests = []ReplaceTest{ 142 | // Test empty input and/or replacement, with pattern that matches the empty string. 143 | {"", "", "", ""}, 144 | {"", "x", "", "x"}, 145 | {"", "", "abc", "abc"}, 146 | {"", "x", "abc", "xaxbxcx"}, 147 | 148 | // Test empty input and/or replacement, with pattern that does not match the empty string. 149 | {"b", "", "", ""}, 150 | {"b", "x", "", ""}, 151 | {"b", "", "abc", "ac"}, 152 | {"b", "x", "abc", "axc"}, 153 | {"y", "", "", ""}, 154 | {"y", "x", "", ""}, 155 | {"y", "", "abc", "abc"}, 156 | {"y", "x", "abc", "abc"}, 157 | 158 | // Multibyte characters -- verify that we don't try to match in the middle 159 | // of a character. 160 | {"[a-c]*", "x", "\u65e5", "x\xe6x\x97x\xa5x"}, 161 | {"[^\u65e5]", "x", "abc\u65e5def", "xxxxxxxxx"}, 162 | 163 | // Start and end of a string. 
164 | {"^[a-c]*", "x", "abcdabc", "xdabc"}, 165 | {"[a-c]*$", "x", "abcdabc", "abcdx"}, 166 | {"^[a-c]*$", "x", "abcdabc", "abcdabc"}, 167 | {"^[a-c]*", "x", "abc", "x"}, 168 | {"[a-c]*$", "x", "abc", "x"}, 169 | {"^[a-c]*$", "x", "abc", "x"}, 170 | {"^[a-c]*", "x", "dabce", "xdabce"}, 171 | {"[a-c]*$", "x", "dabce", "dabcex"}, 172 | {"^[a-c]*$", "x", "dabce", "dabce"}, 173 | {"^[a-c]*", "x", "", "x"}, 174 | {"[a-c]*$", "x", "", "x"}, 175 | {"^[a-c]*$", "x", "", "x"}, 176 | 177 | {"^[a-c]+", "x", "abcdabc", "xdabc"}, 178 | {"[a-c]+$", "x", "abcdabc", "abcdx"}, 179 | {"^[a-c]+$", "x", "abcdabc", "abcdabc"}, 180 | {"^[a-c]+", "x", "abc", "x"}, 181 | {"[a-c]+$", "x", "abc", "x"}, 182 | {"^[a-c]+$", "x", "abc", "x"}, 183 | {"^[a-c]+", "x", "dabce", "dabce"}, 184 | {"[a-c]+$", "x", "dabce", "dabce"}, 185 | {"^[a-c]+$", "x", "dabce", "dabce"}, 186 | {"^[a-c]+", "x", "", ""}, 187 | {"[a-c]+$", "x", "", ""}, 188 | {"^[a-c]+$", "x", "", ""}, 189 | 190 | // Other cases. 191 | {"abc", "def", "abcdefg", "defdefg"}, 192 | {"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"}, 193 | {"abc", "", "abcdabc", "d"}, 194 | {"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"}, 195 | {"abc", "d", "", ""}, 196 | {"abc", "d", "abc", "d"}, 197 | {".+", "x", "abc", "x"}, 198 | {"[a-c]*", "x", "def", "xdxexfx"}, 199 | {"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"}, 200 | {"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"}, 201 | 202 | // Substitutions 203 | {"a+", "($0)", "banana", "b(a)n(a)n(a)"}, 204 | {"a+", "(${0})", "banana", "b(a)n(a)n(a)"}, 205 | {"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"}, 206 | {"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"}, 207 | {"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, world"}, 208 | {"hello, (.+)", "goodbye, $1x", "hello, world", "goodbye, "}, 209 | {"hello, (.+)", "goodbye, ${1}x", "hello, world", "goodbye, worldx"}, 210 | {"hello, (.+)", "<$0><$1><$2><$3>", "hello, world", "<hello, world><world><><>"}, 211 | {"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, world!"}, 
212 | {"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, world"}, 213 | {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "hihihi"}, 214 | {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "byebyebye"}, 215 | {"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", ""}, 216 | {"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "hiyz"}, 217 | {"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $x"}, 218 | {"a+", "${oops", "aaa", "${oops"}, 219 | {"a+", "$$", "aaa", "$"}, 220 | {"a+", "$", "aaa", "$"}, 221 | 222 | // Substitution when subexpression isn't found 223 | {"(x)?", "$1", "123", "123"}, 224 | {"abc", "$1", "123", "123"}, 225 | 226 | // Substitutions involving a (x){0} 227 | {"(a)(b){0}(c)", ".$1|$3.", "xacxacx", "x.a|c.x.a|c.x"}, 228 | {"(a)(((b))){0}c", ".$1.", "xacxacx", "x.a.x.a.x"}, 229 | {"((a(b){0}){3}){5}(h)", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"}, 230 | {"((a(b){0}){3}){5}h", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"}, 231 | } 232 | 233 | var replaceLiteralTests = []ReplaceTest{ 234 | // Substitutions 235 | {"a+", "($0)", "banana", "b($0)n($0)n($0)"}, 236 | {"a+", "(${0})", "banana", "b(${0})n(${0})n(${0})"}, 237 | {"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"}, 238 | {"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"}, 239 | {"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, ${1}"}, 240 | {"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, $noun!"}, 241 | {"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, ${noun}"}, 242 | {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "$x$x$x"}, 243 | {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "$x$x$x"}, 244 | {"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", "$xyz"}, 245 | {"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "${x}yz"}, 246 | {"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $$x"}, 247 | {"a+", "${oops", "aaa", "${oops"}, 248 | {"a+", "$$", "aaa", "$$"}, 249 | {"a+", "$", "aaa", "$"}, 250 | } 251 | 252 | type ReplaceFuncTest struct { 253 | pattern string 254 | replacement func(string) string 255 | input, output string 256 | } 257 | 258 
| var replaceFuncTests = []ReplaceFuncTest{ 259 | {"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"}, 260 | {"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"}, 261 | {"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"}, 262 | } 263 | 264 | func TestReplaceAll(t *testing.T) { 265 | for _, tc := range replaceTests { 266 | re, err := Compile(tc.pattern) 267 | if err != nil { 268 | t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) 269 | continue 270 | } 271 | actual := re.ReplaceAllString(tc.input, tc.replacement) 272 | if actual != tc.output { 273 | t.Errorf("%q.ReplaceAllString(%q,%q) = %q; want %q", 274 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 275 | } 276 | // now try bytes 277 | actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement))) 278 | if actual != tc.output { 279 | t.Errorf("%q.ReplaceAll(%q,%q) = %q; want %q", 280 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 281 | } 282 | } 283 | } 284 | 285 | func TestReplaceAllLiteral(t *testing.T) { 286 | // Run ReplaceAll tests that do not have $ expansions. 
287 | for _, tc := range replaceTests { 288 | if strings.Contains(tc.replacement, "$") { 289 | continue 290 | } 291 | re, err := Compile(tc.pattern) 292 | if err != nil { 293 | t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) 294 | continue 295 | } 296 | actual := re.ReplaceAllLiteralString(tc.input, tc.replacement) 297 | if actual != tc.output { 298 | t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q", 299 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 300 | } 301 | // now try bytes 302 | actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement))) 303 | if actual != tc.output { 304 | t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q", 305 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 306 | } 307 | } 308 | 309 | // Run literal-specific tests. 310 | for _, tc := range replaceLiteralTests { 311 | re, err := Compile(tc.pattern) 312 | if err != nil { 313 | t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) 314 | continue 315 | } 316 | actual := re.ReplaceAllLiteralString(tc.input, tc.replacement) 317 | if actual != tc.output { 318 | t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q", 319 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 320 | } 321 | // now try bytes 322 | actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement))) 323 | if actual != tc.output { 324 | t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q", 325 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 326 | } 327 | } 328 | } 329 | 330 | func TestReplaceAllFunc(t *testing.T) { 331 | for _, tc := range replaceFuncTests { 332 | re, err := Compile(tc.pattern) 333 | if err != nil { 334 | t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) 335 | continue 336 | } 337 | actual := re.ReplaceAllStringFunc(tc.input, tc.replacement) 338 | if actual != tc.output { 339 | t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q", 340 | tc.pattern, tc.input, actual, tc.output) 341 | } 
342 | // now try bytes 343 | actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) })) 344 | if actual != tc.output { 345 | t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q", 346 | tc.pattern, tc.input, actual, tc.output) 347 | } 348 | } 349 | } 350 | 351 | type MetaTest struct { 352 | pattern, output, literal string 353 | isLiteral bool 354 | } 355 | 356 | var metaTests = []MetaTest{ 357 | {``, ``, ``, true}, 358 | {`foo`, `foo`, `foo`, true}, 359 | {`ÿ+`, `ÿ\+`, `ÿ`, false}, 360 | {`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator 361 | {`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators 362 | {`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false}, 363 | } 364 | 365 | var literalPrefixTests = []MetaTest{ 366 | // See golang.org/issue/11175. 367 | // output is unused. 368 | {`^0^0$`, ``, `0`, false}, 369 | {`^0^`, ``, ``, false}, 370 | {`^0$`, ``, `0`, true}, 371 | {`$0^`, ``, ``, false}, 372 | {`$0$`, ``, ``, false}, 373 | {`^^0$$`, ``, ``, false}, 374 | {`^$^$`, ``, ``, false}, 375 | {`$$0^^`, ``, ``, false}, 376 | } 377 | 378 | func TestQuoteMeta(t *testing.T) { 379 | for _, tc := range metaTests { 380 | // Verify that QuoteMeta returns the expected string. 381 | quoted := QuoteMeta(tc.pattern) 382 | if quoted != tc.output { 383 | t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`", 384 | tc.pattern, quoted, tc.output) 385 | continue 386 | } 387 | 388 | // Verify that the quoted string is in fact treated as expected 389 | // by Compile -- i.e. that it matches the original, unquoted string. 
390 | if tc.pattern != "" { 391 | re, err := Compile(quoted) 392 | if err != nil { 393 | t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err) 394 | continue 395 | } 396 | src := "abc" + toLatin1(tc.pattern) + "def" 397 | repl := "xyz" 398 | replaced := re.ReplaceAllString(src, repl) 399 | expected := "abcxyzdef" 400 | if replaced != expected { 401 | t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`", 402 | tc.pattern, src, repl, replaced, expected) 403 | } 404 | } 405 | } 406 | } 407 | 408 | func toLatin1(s string) string { 409 | runes := []rune(s) 410 | b := make([]byte, len(runes)) 411 | for i, r := range runes { 412 | if r > 0xff { 413 | panic("cannot toLatin1") 414 | } 415 | b[i] = byte(r) 416 | } 417 | return string(b) 418 | } 419 | 420 | func TestLiteralPrefix(t *testing.T) { 421 | for _, tc := range append(metaTests, literalPrefixTests...) { 422 | // Literal method needs to scan the pattern. 423 | re := MustCompile(tc.pattern) 424 | str, complete := re.LiteralPrefix() 425 | if complete != tc.isLiteral { 426 | t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral) 427 | } 428 | if str != toLatin1(tc.literal) { 429 | t.Errorf("LiteralPrefix(`%s`) = %#q; want %#q", tc.pattern, str, toLatin1(tc.literal)) 430 | } 431 | } 432 | } 433 | 434 | type subexpCase struct { 435 | input string 436 | num int 437 | names []string 438 | } 439 | 440 | var subexpCases = []subexpCase{ 441 | {``, 0, nil}, 442 | {`.*`, 0, nil}, 443 | {`abba`, 0, nil}, 444 | {`ab(b)a`, 1, []string{"", ""}}, 445 | {`ab(.*)a`, 1, []string{"", ""}}, 446 | {`(.*)ab(.*)a`, 2, []string{"", "", ""}}, 447 | {`(.*)(ab)(.*)a`, 3, []string{"", "", "", ""}}, 448 | {`(.*)((a)b)(.*)a`, 4, []string{"", "", "", "", ""}}, 449 | {`(.*)(\(ab)(.*)a`, 3, []string{"", "", "", ""}}, 450 | {`(.*)(\(a\)b)(.*)a`, 3, []string{"", "", "", ""}}, 451 | {`(?P<foo>.*)(?P<bar>(a)b)(?P<foo>.*)a`, 4, []string{"", "foo", "bar", "", "foo"}}, 452 | } 453 | 454 | func TestSubexp(t 
*testing.T) { 455 | for _, c := range subexpCases { 456 | re := MustCompile(c.input) 457 | n := re.NumSubexp() 458 | if n != c.num { 459 | t.Errorf("%q: NumSubexp = %d, want %d", c.input, n, c.num) 460 | continue 461 | } 462 | names := re.SubexpNames() 463 | if len(names) != 1+n { 464 | t.Errorf("%q: len(SubexpNames) = %d, want %d", c.input, len(names), n) 465 | continue 466 | } 467 | if c.names != nil { 468 | for i := 0; i < 1+n; i++ { 469 | if names[i] != c.names[i] { 470 | t.Errorf("%q: SubexpNames[%d] = %q, want %q", c.input, i, names[i], c.names[i]) 471 | } 472 | } 473 | } 474 | } 475 | } 476 | 477 | var splitTests = []struct { 478 | s string 479 | r string 480 | n int 481 | out []string 482 | }{ 483 | {"foo:and:bar", ":", -1, []string{"foo", "and", "bar"}}, 484 | {"foo:and:bar", ":", 1, []string{"foo:and:bar"}}, 485 | {"foo:and:bar", ":", 2, []string{"foo", "and:bar"}}, 486 | {"foo:and:bar", "foo", -1, []string{"", ":and:bar"}}, 487 | {"foo:and:bar", "bar", -1, []string{"foo:and:", ""}}, 488 | {"foo:and:bar", "baz", -1, []string{"foo:and:bar"}}, 489 | {"baabaab", "a", -1, []string{"b", "", "b", "", "b"}}, 490 | {"baabaab", "a*", -1, []string{"b", "b", "b"}}, 491 | {"baabaab", "ba*", -1, []string{"", "", "", ""}}, 492 | {"foobar", "f*b*", -1, []string{"", "o", "o", "a", "r"}}, 493 | {"foobar", "f+.*b+", -1, []string{"", "ar"}}, 494 | {"foobooboar", "o{2}", -1, []string{"f", "b", "boar"}}, 495 | {"a,b,c,d,e,f", ",", 3, []string{"a", "b", "c,d,e,f"}}, 496 | {"a,b,c,d,e,f", ",", 0, nil}, 497 | {",", ",", -1, []string{"", ""}}, 498 | {",,,", ",", -1, []string{"", "", "", ""}}, 499 | {"", ",", -1, []string{""}}, 500 | {"", ".*", -1, []string{""}}, 501 | {"", ".+", -1, []string{""}}, 502 | {"", "", -1, []string{}}, 503 | {"foobar", "", -1, []string{"f", "o", "o", "b", "a", "r"}}, 504 | {"abaabaccadaaae", "a*", 5, []string{"", "b", "b", "c", "cadaaae"}}, 505 | {":x:y:z:", ":", -1, []string{"", "x", "y", "z", ""}}, 506 | } 507 | 508 | func TestSplit(t *testing.T) { 
509 | for i, test := range splitTests { 510 | re, err := Compile(test.r) 511 | if err != nil { 512 | t.Errorf("#%d: %q: compile error: %s", i, test.r, err.Error()) 513 | continue 514 | } 515 | 516 | split := re.Split(test.s, test.n) 517 | if !reflect.DeepEqual(split, test.out) { 518 | t.Errorf("#%d: %q: got %q; want %q", i, test.r, split, test.out) 519 | } 520 | 521 | if QuoteMeta(test.r) == test.r { 522 | strsplit := strings.SplitN(test.s, test.r, test.n) 523 | if !reflect.DeepEqual(split, strsplit) { 524 | t.Errorf("#%d: Split(%q, %q, %d): regexp vs strings mismatch\nregexp=%q\nstrings=%q", i, test.s, test.r, test.n, split, strsplit) 525 | } 526 | } 527 | } 528 | } 529 | 530 | // The following sequence of Match calls used to panic. See issue #12980. 531 | func TestParseAndCompile(t *testing.T) { 532 | expr := "a$" 533 | s := "a\nb" 534 | 535 | for i, tc := range []struct { 536 | reFlags syntax.Flags 537 | expMatch bool 538 | }{ 539 | {syntax.Perl | syntax.OneLine, false}, 540 | {syntax.Perl &^ syntax.OneLine, true}, 541 | } { 542 | parsed, err := syntax.Parse(expr, tc.reFlags) 543 | if err != nil { 544 | t.Fatalf("%d: parse: %v", i, err) 545 | } 546 | re, err := Compile(parsed.String()) 547 | if err != nil { 548 | t.Fatalf("%d: compile: %v", i, err) 549 | } 550 | if match := re.MatchString(s); match != tc.expMatch { 551 | t.Errorf("%d: %q.MatchString(%q)=%t; expected=%t", i, re, s, match, tc.expMatch) 552 | } 553 | } 554 | } 555 | 556 | // Check that one-pass cutoff does trigger. 
557 | func TestOnePassCutoff(t *testing.T) { 558 | re, err := syntax.Parse(`^x{1,1000}y{1,1000}$`, syntax.Perl) 559 | if err != nil { 560 | t.Fatalf("parse: %v", err) 561 | } 562 | p, err := syntax.Compile(re.Simplify()) 563 | if err != nil { 564 | t.Fatalf("compile: %v", err) 565 | } 566 | if compileOnePass(p) != nil { 567 | t.Fatalf("makeOnePass succeeded; wanted nil") 568 | } 569 | } 570 | 571 | // Check that the same machine can be used with the standard matcher 572 | // and then the backtracker when there are no captures. 573 | func TestSwitchBacktrack(t *testing.T) { 574 | re := MustCompile(`a|b`) 575 | long := make([]byte, maxBacktrackVector+1) 576 | 577 | // The following sequence of Match calls used to panic. See issue #10319. 578 | re.Match(long) // triggers standard matcher 579 | re.Match(long[:1]) // triggers backtracker 580 | } 581 | 582 | func BenchmarkFind(b *testing.B) { 583 | b.StopTimer() 584 | re := MustCompile("a+b+") 585 | wantSubs := "aaabb" 586 | s := []byte("acbb" + wantSubs + "dd") 587 | b.StartTimer() 588 | b.ReportAllocs() 589 | for i := 0; i < b.N; i++ { 590 | subs := re.Find(s) 591 | if string(subs) != wantSubs { 592 | b.Fatalf("Find(%q) = %q; want %q", s, subs, wantSubs) 593 | } 594 | } 595 | } 596 | 597 | func BenchmarkFindAllNoMatches(b *testing.B) { 598 | re := MustCompile("a+b+") 599 | s := []byte("acddee") 600 | b.ReportAllocs() 601 | b.ResetTimer() 602 | for i := 0; i < b.N; i++ { 603 | all := re.FindAll(s, -1) 604 | if all != nil { 605 | b.Fatalf("FindAll(%q) = %q; want nil", s, all) 606 | } 607 | } 608 | } 609 | 610 | func BenchmarkFindString(b *testing.B) { 611 | b.StopTimer() 612 | re := MustCompile("a+b+") 613 | wantSubs := "aaabb" 614 | s := "acbb" + wantSubs + "dd" 615 | b.StartTimer() 616 | b.ReportAllocs() 617 | for i := 0; i < b.N; i++ { 618 | subs := re.FindString(s) 619 | if subs != wantSubs { 620 | b.Fatalf("FindString(%q) = %q; want %q", s, subs, wantSubs) 621 | } 622 | } 623 | } 624 | 625 | func 
BenchmarkFindSubmatch(b *testing.B) { 626 | b.StopTimer() 627 | re := MustCompile("a(a+b+)b") 628 | wantSubs := "aaabb" 629 | s := []byte("acbb" + wantSubs + "dd") 630 | b.StartTimer() 631 | b.ReportAllocs() 632 | for i := 0; i < b.N; i++ { 633 | subs := re.FindSubmatch(s) 634 | if string(subs[0]) != wantSubs { 635 | b.Fatalf("FindSubmatch(%q)[0] = %q; want %q", s, subs[0], wantSubs) 636 | } 637 | if string(subs[1]) != "aab" { 638 | b.Fatalf("FindSubmatch(%q)[1] = %q; want %q", s, subs[1], "aab") 639 | } 640 | } 641 | } 642 | 643 | func BenchmarkFindStringSubmatch(b *testing.B) { 644 | b.StopTimer() 645 | re := MustCompile("a(a+b+)b") 646 | wantSubs := "aaabb" 647 | s := "acbb" + wantSubs + "dd" 648 | b.StartTimer() 649 | b.ReportAllocs() 650 | for i := 0; i < b.N; i++ { 651 | subs := re.FindStringSubmatch(s) 652 | if subs[0] != wantSubs { 653 | b.Fatalf("FindStringSubmatch(%q)[0] = %q; want %q", s, subs[0], wantSubs) 654 | } 655 | if subs[1] != "aab" { 656 | b.Fatalf("FindStringSubmatch(%q)[1] = %q; want %q", s, subs[1], "aab") 657 | } 658 | } 659 | } 660 | 661 | func BenchmarkLiteral(b *testing.B) { 662 | x := strings.Repeat("x", 50) + "y" 663 | b.StopTimer() 664 | re := MustCompile("y") 665 | b.StartTimer() 666 | for i := 0; i < b.N; i++ { 667 | if !re.MatchString(x) { 668 | b.Fatalf("no match!") 669 | } 670 | } 671 | } 672 | 673 | func BenchmarkNotLiteral(b *testing.B) { 674 | x := strings.Repeat("x", 50) + "y" 675 | b.StopTimer() 676 | re := MustCompile(".y") 677 | b.StartTimer() 678 | for i := 0; i < b.N; i++ { 679 | if !re.MatchString(x) { 680 | b.Fatalf("no match!") 681 | } 682 | } 683 | } 684 | 685 | func BenchmarkMatchClass(b *testing.B) { 686 | b.StopTimer() 687 | x := strings.Repeat("xxxx", 20) + "w" 688 | re := MustCompile("[abcdw]") 689 | b.StartTimer() 690 | for i := 0; i < b.N; i++ { 691 | if !re.MatchString(x) { 692 | b.Fatalf("no match!") 693 | } 694 | } 695 | } 696 | 697 | func BenchmarkMatchClass_InRange(b *testing.B) { 698 | b.StopTimer() 699 | 
// 'b' is between 'a' and 'c', so the charclass 700 | // range checking is no help here. 701 | x := strings.Repeat("bbbb", 20) + "c" 702 | re := MustCompile("[ac]") 703 | b.StartTimer() 704 | for i := 0; i < b.N; i++ { 705 | if !re.MatchString(x) { 706 | b.Fatalf("no match!") 707 | } 708 | } 709 | } 710 | 711 | func BenchmarkReplaceAll(b *testing.B) { 712 | x := "abcdefghijklmnopqrstuvwxyz" 713 | b.StopTimer() 714 | re := MustCompile("[cjrw]") 715 | b.StartTimer() 716 | for i := 0; i < b.N; i++ { 717 | re.ReplaceAllString(x, "") 718 | } 719 | } 720 | 721 | func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) { 722 | b.StopTimer() 723 | x := []byte("abcdefghijklmnopqrstuvwxyz") 724 | re := MustCompile("^zbc(d|e)") 725 | b.StartTimer() 726 | for i := 0; i < b.N; i++ { 727 | re.Match(x) 728 | } 729 | } 730 | 731 | func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) { 732 | b.StopTimer() 733 | x := []byte("abcdefghijklmnopqrstuvwxyz") 734 | for i := 0; i < 15; i++ { 735 | x = append(x, x...) 736 | } 737 | re := MustCompile("^zbc(d|e)") 738 | b.StartTimer() 739 | for i := 0; i < b.N; i++ { 740 | re.Match(x) 741 | } 742 | } 743 | 744 | func BenchmarkAnchoredShortMatch(b *testing.B) { 745 | b.StopTimer() 746 | x := []byte("abcdefghijklmnopqrstuvwxyz") 747 | re := MustCompile("^.bc(d|e)") 748 | b.StartTimer() 749 | for i := 0; i < b.N; i++ { 750 | re.Match(x) 751 | } 752 | } 753 | 754 | func BenchmarkAnchoredLongMatch(b *testing.B) { 755 | b.StopTimer() 756 | x := []byte("abcdefghijklmnopqrstuvwxyz") 757 | for i := 0; i < 15; i++ { 758 | x = append(x, x...) 
759 | } 760 | re := MustCompile("^.bc(d|e)") 761 | b.StartTimer() 762 | for i := 0; i < b.N; i++ { 763 | re.Match(x) 764 | } 765 | } 766 | 767 | func BenchmarkOnePassShortA(b *testing.B) { 768 | b.StopTimer() 769 | x := []byte("abcddddddeeeededd") 770 | re := MustCompile("^.bc(d|e)*$") 771 | b.StartTimer() 772 | for i := 0; i < b.N; i++ { 773 | re.Match(x) 774 | } 775 | } 776 | 777 | func BenchmarkNotOnePassShortA(b *testing.B) { 778 | b.StopTimer() 779 | x := []byte("abcddddddeeeededd") 780 | re := MustCompile(".bc(d|e)*$") 781 | b.StartTimer() 782 | for i := 0; i < b.N; i++ { 783 | re.Match(x) 784 | } 785 | } 786 | 787 | func BenchmarkOnePassShortB(b *testing.B) { 788 | b.StopTimer() 789 | x := []byte("abcddddddeeeededd") 790 | re := MustCompile("^.bc(?:d|e)*$") 791 | b.StartTimer() 792 | for i := 0; i < b.N; i++ { 793 | re.Match(x) 794 | } 795 | } 796 | 797 | func BenchmarkNotOnePassShortB(b *testing.B) { 798 | b.StopTimer() 799 | x := []byte("abcddddddeeeededd") 800 | re := MustCompile(".bc(?:d|e)*$") 801 | b.StartTimer() 802 | for i := 0; i < b.N; i++ { 803 | re.Match(x) 804 | } 805 | } 806 | 807 | func BenchmarkOnePassLongPrefix(b *testing.B) { 808 | b.StopTimer() 809 | x := []byte("abcdefghijklmnopqrstuvwxyz") 810 | re := MustCompile("^abcdefghijklmnopqrstuvwxyz.*$") 811 | b.StartTimer() 812 | for i := 0; i < b.N; i++ { 813 | re.Match(x) 814 | } 815 | } 816 | 817 | func BenchmarkOnePassLongNotPrefix(b *testing.B) { 818 | b.StopTimer() 819 | x := []byte("abcdefghijklmnopqrstuvwxyz") 820 | re := MustCompile("^.bcdefghijklmnopqrstuvwxyz.*$") 821 | b.StartTimer() 822 | for i := 0; i < b.N; i++ { 823 | re.Match(x) 824 | } 825 | } 826 | 827 | func BenchmarkMatchParallelShared(b *testing.B) { 828 | x := []byte("this is a long line that contains foo bar baz") 829 | re := MustCompile("foo (ba+r)? 
baz") 830 | b.ResetTimer() 831 | b.RunParallel(func(pb *testing.PB) { 832 | for pb.Next() { 833 | re.Match(x) 834 | } 835 | }) 836 | } 837 | 838 | func BenchmarkMatchParallelCopied(b *testing.B) { 839 | x := []byte("this is a long line that contains foo bar baz") 840 | re := MustCompile("foo (ba+r)? baz") 841 | b.ResetTimer() 842 | b.RunParallel(func(pb *testing.PB) { 843 | re := re.Copy() 844 | for pb.Next() { 845 | re.Match(x) 846 | } 847 | }) 848 | } 849 | 850 | var sink string 851 | 852 | func BenchmarkQuoteMetaAll(b *testing.B) { 853 | specials := make([]byte, 0) 854 | for i := byte(0); i < utf8.RuneSelf; i++ { 855 | if special(i) { 856 | specials = append(specials, i) 857 | } 858 | } 859 | s := string(specials) 860 | b.SetBytes(int64(len(s))) 861 | b.ResetTimer() 862 | for i := 0; i < b.N; i++ { 863 | sink = QuoteMeta(s) 864 | } 865 | } 866 | 867 | func BenchmarkQuoteMetaNone(b *testing.B) { 868 | s := "abcdefghijklmnopqrstuvwxyz" 869 | b.SetBytes(int64(len(s))) 870 | b.ResetTimer() 871 | for i := 0; i < b.N; i++ { 872 | sink = QuoteMeta(s) 873 | } 874 | } 875 | 876 | var compileBenchData = []struct{ name, re string }{ 877 | {"Onepass", `^a.[l-nA-Cg-j]?e$`}, 878 | {"Medium", `^((a|b|[d-z0-9])*(日){4,5}.)+$`}, 879 | {"Hard", strings.Repeat(`((abc)*|`, 50) + strings.Repeat(`)`, 50)}, 880 | } 881 | 882 | func BenchmarkCompile(b *testing.B) { 883 | for _, data := range compileBenchData { 884 | b.Run(data.name, func(b *testing.B) { 885 | b.ReportAllocs() 886 | for i := 0; i < b.N; i++ { 887 | if _, err := Compile(data.re); err != nil { 888 | b.Fatal(err) 889 | } 890 | } 891 | }) 892 | } 893 | } 894 | 895 | func TestDeepEqual(t *testing.T) { 896 | re1 := MustCompile("a.*b.*c.*d") 897 | re2 := MustCompile("a.*b.*c.*d") 898 | if !reflect.DeepEqual(re1, re2) { // has always been true, since Go 1. 
899 | t.Errorf("DeepEqual(re1, re2) = false, want true") 900 | } 901 | 902 | re1.MatchString("abcdefghijklmn") 903 | if !reflect.DeepEqual(re1, re2) { 904 | t.Errorf("DeepEqual(re1, re2) = false, want true") 905 | } 906 | 907 | re2.MatchString("abcdefghijklmn") 908 | if !reflect.DeepEqual(re1, re2) { 909 | t.Errorf("DeepEqual(re1, re2) = false, want true") 910 | } 911 | 912 | re2.MatchString(strings.Repeat("abcdefghijklmn", 100)) 913 | if !reflect.DeepEqual(re1, re2) { 914 | t.Errorf("DeepEqual(re1, re2) = false, want true") 915 | } 916 | } 917 | 918 | var minInputLenTests = []struct { 919 | Regexp string 920 | min int 921 | }{ 922 | {``, 0}, 923 | {`a`, 1}, 924 | {`aa`, 2}, 925 | {`(aa)a`, 3}, 926 | {`(?:aa)a`, 3}, 927 | {`a?a`, 1}, 928 | {`(aaa)|(aa)`, 2}, 929 | {`(aa)+a`, 3}, 930 | {`(aa)*a`, 1}, 931 | {`(aa){3,5}`, 6}, 932 | {`[a-z]`, 1}, 933 | {`日`, 1}, 934 | } 935 | 936 | func TestMinInputLen(t *testing.T) { 937 | for _, tt := range minInputLenTests { 938 | re, _ := syntax.Parse(tt.Regexp, syntax.Perl) 939 | m := minInputLen(re) 940 | if m != tt.min { 941 | t.Errorf("regexp %#q has minInputLen %d, should be %d", tt.Regexp, m, tt.min) 942 | } 943 | } 944 | } 945 | -------------------------------------------------------------------------------- /backtrack.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // backtrack is a regular expression search with submatch 6 | // tracking for small regular expressions and texts. It allocates 7 | // a bit vector with (length of input) * (length of prog) bits, 8 | // to make sure it never explores the same (character position, instruction) 9 | // state multiple times. This limits the search to run in time linear in 10 | // the length of the test. 
11 | // 12 | // backtrack is a fast replacement for the NFA code on small 13 | // regexps when onepass cannot be used. 14 | 15 | package binaryregexp 16 | 17 | import ( 18 | "sync" 19 | 20 | "rsc.io/binaryregexp/syntax" 21 | ) 22 | 23 | // A job is an entry on the backtracker's job stack. It holds 24 | // the instruction pc and the position in the input. 25 | type job struct { 26 | pc uint32 27 | arg bool 28 | pos int 29 | } 30 | 31 | const ( 32 | visitedBits = 32 33 | maxBacktrackProg = 500 // len(prog.Inst) <= max 34 | maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits) 35 | ) 36 | 37 | // bitState holds state for the backtracker. 38 | type bitState struct { 39 | end int 40 | cap []int 41 | matchcap []int 42 | jobs []job 43 | visited []uint32 44 | 45 | inputs inputs 46 | } 47 | 48 | var bitStatePool sync.Pool 49 | 50 | func newBitState() *bitState { 51 | b, ok := bitStatePool.Get().(*bitState) 52 | if !ok { 53 | b = new(bitState) 54 | } 55 | return b 56 | } 57 | 58 | func freeBitState(b *bitState) { 59 | b.inputs.clear() 60 | bitStatePool.Put(b) 61 | } 62 | 63 | // maxBitStateLen returns the maximum length of a string to search with 64 | // the backtracker using prog. 65 | func maxBitStateLen(prog *syntax.Prog) int { 66 | if !shouldBacktrack(prog) { 67 | return 0 68 | } 69 | return maxBacktrackVector / len(prog.Inst) 70 | } 71 | 72 | // shouldBacktrack reports whether the program is too 73 | // long for the backtracker to run. 74 | func shouldBacktrack(prog *syntax.Prog) bool { 75 | return len(prog.Inst) <= maxBacktrackProg 76 | } 77 | 78 | // reset resets the state of the backtracker. 79 | // end is the end position in the input. 80 | // ncap is the number of captures. 
81 | func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) { 82 | b.end = end 83 | 84 | if cap(b.jobs) == 0 { 85 | b.jobs = make([]job, 0, 256) 86 | } else { 87 | b.jobs = b.jobs[:0] 88 | } 89 | 90 | visitedSize := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits 91 | if cap(b.visited) < visitedSize { 92 | b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits) 93 | } else { 94 | b.visited = b.visited[:visitedSize] 95 | for i := range b.visited { 96 | b.visited[i] = 0 97 | } 98 | } 99 | 100 | if cap(b.cap) < ncap { 101 | b.cap = make([]int, ncap) 102 | } else { 103 | b.cap = b.cap[:ncap] 104 | } 105 | for i := range b.cap { 106 | b.cap[i] = -1 107 | } 108 | 109 | if cap(b.matchcap) < ncap { 110 | b.matchcap = make([]int, ncap) 111 | } else { 112 | b.matchcap = b.matchcap[:ncap] 113 | } 114 | for i := range b.matchcap { 115 | b.matchcap[i] = -1 116 | } 117 | } 118 | 119 | // shouldVisit reports whether the combination of (pc, pos) has not 120 | // been visited yet. 121 | func (b *bitState) shouldVisit(pc uint32, pos int) bool { 122 | n := uint(int(pc)*(b.end+1) + pos) 123 | if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 { 124 | return false 125 | } 126 | b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1)) 127 | return true 128 | } 129 | 130 | // push pushes (pc, pos, arg) onto the job stack if it should be 131 | // visited. 132 | func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) { 133 | // Only check shouldVisit when arg is false. 134 | // When arg is true, we are continuing a previous visit. 135 | if re.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) { 136 | b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos}) 137 | } 138 | } 139 | 140 | // tryBacktrack runs a backtracking search starting at pos. 
141 | func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool { 142 | longest := re.longest 143 | 144 | b.push(re, pc, pos, false) 145 | for len(b.jobs) > 0 { 146 | l := len(b.jobs) - 1 147 | // Pop job off the stack. 148 | pc := b.jobs[l].pc 149 | pos := b.jobs[l].pos 150 | arg := b.jobs[l].arg 151 | b.jobs = b.jobs[:l] 152 | 153 | // Optimization: rather than push and pop, 154 | // code that is going to Push and continue 155 | // the loop simply updates ip, p, and arg 156 | // and jumps to CheckAndLoop. We have to 157 | // do the ShouldVisit check that Push 158 | // would have, but we avoid the stack 159 | // manipulation. 160 | goto Skip 161 | CheckAndLoop: 162 | if !b.shouldVisit(pc, pos) { 163 | continue 164 | } 165 | Skip: 166 | 167 | inst := re.prog.Inst[pc] 168 | 169 | switch inst.Op { 170 | default: 171 | panic("bad inst") 172 | case syntax.InstFail: 173 | panic("unexpected InstFail") 174 | case syntax.InstAlt: 175 | // Cannot just 176 | // b.push(inst.Out, pos, false) 177 | // b.push(inst.Arg, pos, false) 178 | // If during the processing of inst.Out, we encounter 179 | // inst.Arg via another path, we want to process it then. 180 | // Pushing it here will inhibit that. Instead, re-push 181 | // inst with arg==true as a reminder to push inst.Arg out 182 | // later. 183 | if arg { 184 | // Finished inst.Out; try inst.Arg. 185 | arg = false 186 | pc = inst.Arg 187 | goto CheckAndLoop 188 | } else { 189 | b.push(re, pc, pos, true) 190 | pc = inst.Out 191 | goto CheckAndLoop 192 | } 193 | 194 | case syntax.InstAltMatch: 195 | // One opcode consumes runes; the other leads to match. 196 | switch re.prog.Inst[inst.Out].Op { 197 | case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 198 | // inst.Arg is the match. 
199 | b.push(re, inst.Arg, pos, false) 200 | pc = inst.Arg 201 | pos = b.end 202 | goto CheckAndLoop 203 | } 204 | // inst.Out is the match - non-greedy 205 | b.push(re, inst.Out, b.end, false) 206 | pc = inst.Out 207 | goto CheckAndLoop 208 | 209 | case syntax.InstRune: 210 | r, width := i.step(pos) 211 | if !inst.MatchRune(r) { 212 | continue 213 | } 214 | pos += width 215 | pc = inst.Out 216 | goto CheckAndLoop 217 | 218 | case syntax.InstRune1: 219 | r, width := i.step(pos) 220 | if r != inst.Rune[0] { 221 | continue 222 | } 223 | pos += width 224 | pc = inst.Out 225 | goto CheckAndLoop 226 | 227 | case syntax.InstRuneAnyNotNL: 228 | r, width := i.step(pos) 229 | if r == '\n' || r == endOfText { 230 | continue 231 | } 232 | pos += width 233 | pc = inst.Out 234 | goto CheckAndLoop 235 | 236 | case syntax.InstRuneAny: 237 | r, width := i.step(pos) 238 | if r == endOfText { 239 | continue 240 | } 241 | pos += width 242 | pc = inst.Out 243 | goto CheckAndLoop 244 | 245 | case syntax.InstCapture: 246 | if arg { 247 | // Finished inst.Out; restore the old value. 248 | b.cap[inst.Arg] = pos 249 | continue 250 | } else { 251 | if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) { 252 | // Capture pos to register, but save old value. 253 | b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done. 254 | b.cap[inst.Arg] = pos 255 | } 256 | pc = inst.Out 257 | goto CheckAndLoop 258 | } 259 | 260 | case syntax.InstEmptyWidth: 261 | flag := i.context(pos) 262 | if !flag.match(syntax.EmptyOp(inst.Arg)) { 263 | continue 264 | } 265 | pc = inst.Out 266 | goto CheckAndLoop 267 | 268 | case syntax.InstNop: 269 | pc = inst.Out 270 | goto CheckAndLoop 271 | 272 | case syntax.InstMatch: 273 | // We found a match. If the caller doesn't care 274 | // where the match is, no point going further. 275 | if len(b.cap) == 0 { 276 | return true 277 | } 278 | 279 | // Record best match so far. 
280 | // Only need to check end point, because this entire 281 | // call is only considering one start position. 282 | if len(b.cap) > 1 { 283 | b.cap[1] = pos 284 | } 285 | if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) { 286 | copy(b.matchcap, b.cap) 287 | } 288 | 289 | // If going for first match, we're done. 290 | if !longest { 291 | return true 292 | } 293 | 294 | // If we used the entire text, no longer match is possible. 295 | if pos == b.end { 296 | return true 297 | } 298 | 299 | // Otherwise, continue on in hope of a longer match. 300 | continue 301 | } 302 | } 303 | 304 | return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0 305 | } 306 | 307 | // backtrack runs a backtracking search of prog on the input starting at pos. 308 | func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int { 309 | startCond := re.cond 310 | if startCond == ^syntax.EmptyOp(0) { // impossible 311 | return nil 312 | } 313 | if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 314 | // Anchored match, past beginning of text. 315 | return nil 316 | } 317 | 318 | b := newBitState() 319 | i, end := b.inputs.init(nil, ib, is) 320 | b.reset(re.prog, end, ncap) 321 | 322 | // Anchored search must start at the beginning of the input 323 | if startCond&syntax.EmptyBeginText != 0 { 324 | if len(b.cap) > 0 { 325 | b.cap[0] = pos 326 | } 327 | if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) { 328 | freeBitState(b) 329 | return nil 330 | } 331 | } else { 332 | // Unanchored search, starting from each possible text position. 333 | // Notice that we have to try the empty string at the end of 334 | // the text, so the loop condition is pos <= end, not pos < end. 335 | // This looks like it's quadratic in the size of the text, 336 | // but we are not clearing visited between calls to TrySearch, 337 | // so no work is duplicated and it ends up still being linear. 
338 | width := -1 339 | for ; pos <= end && width != 0; pos += width { 340 | if len(re.prefix) > 0 { 341 | // Match requires literal prefix; fast search for it. 342 | advance := i.index(re, pos) 343 | if advance < 0 { 344 | freeBitState(b) 345 | return nil 346 | } 347 | pos += advance 348 | } 349 | 350 | if len(b.cap) > 0 { 351 | b.cap[0] = pos 352 | } 353 | if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) { 354 | // Match must be leftmost; done. 355 | goto Match 356 | } 357 | _, width = i.step(pos) 358 | } 359 | freeBitState(b) 360 | return nil 361 | } 362 | 363 | Match: 364 | dstCap = append(dstCap, b.matchcap...) 365 | freeBitState(b) 366 | return dstCap 367 | } 368 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package binaryregexp_test 6 | 7 | import ( 8 | "fmt" 9 | "strings" 10 | 11 | "rsc.io/binaryregexp" 12 | ) 13 | 14 | func Example() { 15 | // Compile the expression once, usually at init time. 16 | // Use raw strings to avoid having to quote the backslashes. 
17 | var validID = binaryregexp.MustCompile(`^[a-z]+\[[0-9]+\]$`) 18 | 19 | fmt.Println(validID.MatchString("adam[23]")) 20 | fmt.Println(validID.MatchString("eve[7]")) 21 | fmt.Println(validID.MatchString("Job[48]")) 22 | fmt.Println(validID.MatchString("snakey")) 23 | // Output: 24 | // true 25 | // true 26 | // false 27 | // false 28 | } 29 | 30 | func ExampleMatch() { 31 | matched, err := binaryregexp.Match(`foo.*`, []byte(`seafood`)) 32 | fmt.Println(matched, err) 33 | matched, err = binaryregexp.Match(`bar.*`, []byte(`seafood`)) 34 | fmt.Println(matched, err) 35 | matched, err = binaryregexp.Match(`a(b`, []byte(`seafood`)) 36 | fmt.Println(matched, err) 37 | 38 | // Output: 39 | // true 40 | // false 41 | // false error parsing regexp: missing closing ): `a(b` 42 | } 43 | 44 | func ExampleMatchString() { 45 | matched, err := binaryregexp.MatchString(`foo.*`, "seafood") 46 | fmt.Println(matched, err) 47 | matched, err = binaryregexp.MatchString(`bar.*`, "seafood") 48 | fmt.Println(matched, err) 49 | matched, err = binaryregexp.MatchString(`a(b`, "seafood") 50 | fmt.Println(matched, err) 51 | // Output: 52 | // true 53 | // false 54 | // false error parsing regexp: missing closing ): `a(b` 55 | } 56 | 57 | func ExampleQuoteMeta() { 58 | fmt.Println(binaryregexp.QuoteMeta(`Escaping symbols like: .+*?()|[]{}^$`)) 59 | // Output: 60 | // Escaping symbols like: \.\+\*\?\(\)\|\[\]\{\}\^\$ 61 | } 62 | 63 | func ExampleRegexp_Find() { 64 | re := binaryregexp.MustCompile(`foo.?`) 65 | fmt.Printf("%q\n", re.Find([]byte(`seafood fool`))) 66 | 67 | // Output: 68 | // "food" 69 | } 70 | 71 | func ExampleRegexp_FindAll() { 72 | re := binaryregexp.MustCompile(`foo.?`) 73 | fmt.Printf("%q\n", re.FindAll([]byte(`seafood fool`), -1)) 74 | 75 | // Output: 76 | // ["food" "fool"] 77 | } 78 | 79 | func ExampleRegexp_FindAllSubmatch() { 80 | re := binaryregexp.MustCompile(`foo(.?)`) 81 | fmt.Printf("%q\n", re.FindAllSubmatch([]byte(`seafood fool`), -1)) 82 | 83 | // Output: 84 | // 
[["food" "d"] ["fool" "l"]] 85 | } 86 | 87 | func ExampleRegexp_FindSubmatch() { 88 | re := binaryregexp.MustCompile(`foo(.?)`) 89 | fmt.Printf("%q\n", re.FindSubmatch([]byte(`seafood fool`))) 90 | 91 | // Output: 92 | // ["food" "d"] 93 | } 94 | 95 | func ExampleRegexp_Match() { 96 | re := binaryregexp.MustCompile(`foo.?`) 97 | fmt.Println(re.Match([]byte(`seafood fool`))) 98 | 99 | // Output: 100 | // true 101 | } 102 | 103 | func ExampleRegexp_FindString() { 104 | re := binaryregexp.MustCompile(`foo.?`) 105 | fmt.Printf("%q\n", re.FindString("seafood fool")) 106 | fmt.Printf("%q\n", re.FindString("meat")) 107 | // Output: 108 | // "food" 109 | // "" 110 | } 111 | 112 | func ExampleRegexp_FindStringIndex() { 113 | re := binaryregexp.MustCompile(`ab?`) 114 | fmt.Println(re.FindStringIndex("tablett")) 115 | fmt.Println(re.FindStringIndex("foo") == nil) 116 | // Output: 117 | // [1 3] 118 | // true 119 | } 120 | 121 | func ExampleRegexp_FindStringSubmatch() { 122 | re := binaryregexp.MustCompile(`a(x*)b(y|z)c`) 123 | fmt.Printf("%q\n", re.FindStringSubmatch("-axxxbyc-")) 124 | fmt.Printf("%q\n", re.FindStringSubmatch("-abzc-")) 125 | // Output: 126 | // ["axxxbyc" "xxx" "y"] 127 | // ["abzc" "" "z"] 128 | } 129 | 130 | func ExampleRegexp_FindAllString() { 131 | re := binaryregexp.MustCompile(`a.`) 132 | fmt.Println(re.FindAllString("paranormal", -1)) 133 | fmt.Println(re.FindAllString("paranormal", 2)) 134 | fmt.Println(re.FindAllString("graal", -1)) 135 | fmt.Println(re.FindAllString("none", -1)) 136 | // Output: 137 | // [ar an al] 138 | // [ar an] 139 | // [aa] 140 | // [] 141 | } 142 | 143 | func ExampleRegexp_FindAllStringSubmatch() { 144 | re := binaryregexp.MustCompile(`a(x*)b`) 145 | fmt.Printf("%q\n", re.FindAllStringSubmatch("-ab-", -1)) 146 | fmt.Printf("%q\n", re.FindAllStringSubmatch("-axxb-", -1)) 147 | fmt.Printf("%q\n", re.FindAllStringSubmatch("-ab-axb-", -1)) 148 | fmt.Printf("%q\n", re.FindAllStringSubmatch("-axxb-ab-", -1)) 149 | // Output: 150 | 
// [["ab" ""]] 151 | // [["axxb" "xx"]] 152 | // [["ab" ""] ["axb" "x"]] 153 | // [["axxb" "xx"] ["ab" ""]] 154 | } 155 | 156 | func ExampleRegexp_FindAllStringSubmatchIndex() { 157 | re := binaryregexp.MustCompile(`a(x*)b`) 158 | // Indices: 159 | // 01234567 012345678 160 | // -ab-axb- -axxb-ab- 161 | fmt.Println(re.FindAllStringSubmatchIndex("-ab-", -1)) 162 | fmt.Println(re.FindAllStringSubmatchIndex("-axxb-", -1)) 163 | fmt.Println(re.FindAllStringSubmatchIndex("-ab-axb-", -1)) 164 | fmt.Println(re.FindAllStringSubmatchIndex("-axxb-ab-", -1)) 165 | fmt.Println(re.FindAllStringSubmatchIndex("-foo-", -1)) 166 | // Output: 167 | // [[1 3 2 2]] 168 | // [[1 5 2 4]] 169 | // [[1 3 2 2] [4 7 5 6]] 170 | // [[1 5 2 4] [6 8 7 7]] 171 | // [] 172 | } 173 | 174 | func ExampleRegexp_MatchString() { 175 | re := binaryregexp.MustCompile(`(gopher){2}`) 176 | fmt.Println(re.MatchString("gopher")) 177 | fmt.Println(re.MatchString("gophergopher")) 178 | fmt.Println(re.MatchString("gophergophergopher")) 179 | // Output: 180 | // false 181 | // true 182 | // true 183 | } 184 | 185 | func ExampleRegexp_ReplaceAllLiteralString() { 186 | re := binaryregexp.MustCompile(`a(x*)b`) 187 | fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "T")) 188 | fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "$1")) 189 | fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "${1}")) 190 | // Output: 191 | // -T-T- 192 | // -$1-$1- 193 | // -${1}-${1}- 194 | } 195 | 196 | func ExampleRegexp_ReplaceAllString() { 197 | re := binaryregexp.MustCompile(`a(x*)b`) 198 | fmt.Println(re.ReplaceAllString("-ab-axxb-", "T")) 199 | fmt.Println(re.ReplaceAllString("-ab-axxb-", "$1")) 200 | fmt.Println(re.ReplaceAllString("-ab-axxb-", "$1W")) 201 | fmt.Println(re.ReplaceAllString("-ab-axxb-", "${1}W")) 202 | // Output: 203 | // -T-T- 204 | // --xx- 205 | // --- 206 | // -W-xxW- 207 | } 208 | 209 | func ExampleRegexp_ReplaceAllStringFunc() { 210 | re := binaryregexp.MustCompile(`[^aeiou]`) 211 | 
fmt.Println(re.ReplaceAllStringFunc("seafood fool", strings.ToUpper)) 212 | // Output: 213 | // SeaFooD FooL 214 | } 215 | 216 | func ExampleRegexp_SubexpNames() { 217 | re := binaryregexp.MustCompile(`(?P[a-zA-Z]+) (?P[a-zA-Z]+)`) 218 | fmt.Println(re.MatchString("Alan Turing")) 219 | fmt.Printf("%q\n", re.SubexpNames()) 220 | reversed := fmt.Sprintf("${%s} ${%s}", re.SubexpNames()[2], re.SubexpNames()[1]) 221 | fmt.Println(reversed) 222 | fmt.Println(re.ReplaceAllString("Alan Turing", reversed)) 223 | // Output: 224 | // true 225 | // ["" "first" "last"] 226 | // ${last} ${first} 227 | // Turing Alan 228 | } 229 | 230 | func ExampleRegexp_Split() { 231 | a := binaryregexp.MustCompile(`a`) 232 | fmt.Println(a.Split("banana", -1)) 233 | fmt.Println(a.Split("banana", 0)) 234 | fmt.Println(a.Split("banana", 1)) 235 | fmt.Println(a.Split("banana", 2)) 236 | zp := binaryregexp.MustCompile(`z+`) 237 | fmt.Println(zp.Split("pizza", -1)) 238 | fmt.Println(zp.Split("pizza", 0)) 239 | fmt.Println(zp.Split("pizza", 1)) 240 | fmt.Println(zp.Split("pizza", 2)) 241 | // Output: 242 | // [b n n ] 243 | // [] 244 | // [banana] 245 | // [b nana] 246 | // [pi a] 247 | // [] 248 | // [pizza] 249 | // [pi a] 250 | } 251 | 252 | func ExampleRegexp_Expand() { 253 | content := []byte(` 254 | # comment line 255 | option1: value1 256 | option2: value2 257 | 258 | # another comment line 259 | option3: value3 260 | `) 261 | 262 | // Regex pattern captures "key: value" pair from the content. 263 | pattern := binaryregexp.MustCompile(`(?m)(?P\w+):\s+(?P\w+)$`) 264 | 265 | // Template to convert "key: value" to "key=value" by 266 | // referencing the values captured by the regex pattern. 267 | template := []byte("$key=$value\n") 268 | 269 | result := []byte{} 270 | 271 | // For each match of the regex in the content. 272 | for _, submatches := range pattern.FindAllSubmatchIndex(content, -1) { 273 | // Apply the captured submatches to the template and append the output 274 | // to the result. 
275 | result = pattern.Expand(result, template, content, submatches) 276 | } 277 | fmt.Println(string(result)) 278 | // Output: 279 | // option1=value1 280 | // option2=value2 281 | // option3=value3 282 | } 283 | 284 | func ExampleRegexp_ExpandString() { 285 | content := ` 286 | # comment line 287 | option1: value1 288 | option2: value2 289 | 290 | # another comment line 291 | option3: value3 292 | ` 293 | 294 | // Regex pattern captures "key: value" pair from the content. 295 | pattern := binaryregexp.MustCompile(`(?m)(?P\w+):\s+(?P\w+)$`) 296 | 297 | // Template to convert "key: value" to "key=value" by 298 | // referencing the values captured by the regex pattern. 299 | template := "$key=$value\n" 300 | 301 | result := []byte{} 302 | 303 | // For each match of the regex in the content. 304 | for _, submatches := range pattern.FindAllStringSubmatchIndex(content, -1) { 305 | // Apply the captured submatches to the template and append the output 306 | // to the result. 307 | result = pattern.ExpandString(result, template, content, submatches) 308 | } 309 | fmt.Println(string(result)) 310 | // Output: 311 | // option1=value1 312 | // option2=value2 313 | // option3=value3 314 | } 315 | 316 | func ExampleRegexp_FindIndex() { 317 | content := []byte(` 318 | # comment line 319 | option1: value1 320 | option2: value2 321 | `) 322 | // Regex pattern captures "key: value" pair from the content. 323 | pattern := binaryregexp.MustCompile(`(?m)(?P\w+):\s+(?P\w+)$`) 324 | 325 | loc := pattern.FindIndex(content) 326 | fmt.Println(loc) 327 | fmt.Println(string(content[loc[0]:loc[1]])) 328 | // Output: 329 | // [18 33] 330 | // option1: value1 331 | } 332 | func ExampleRegexp_FindAllSubmatchIndex() { 333 | content := []byte(` 334 | # comment line 335 | option1: value1 336 | option2: value2 337 | `) 338 | // Regex pattern captures "key: value" pair from the content. 
339 | pattern := binaryregexp.MustCompile(`(?m)(?P\w+):\s+(?P\w+)$`) 340 | allIndexes := pattern.FindAllSubmatchIndex(content, -1) 341 | for _, loc := range allIndexes { 342 | fmt.Println(loc) 343 | fmt.Println(string(content[loc[0]:loc[1]])) 344 | fmt.Println(string(content[loc[2]:loc[3]])) 345 | fmt.Println(string(content[loc[4]:loc[5]])) 346 | } 347 | // Output: 348 | // [18 33 18 25 27 33] 349 | // option1: value1 350 | // option1 351 | // value1 352 | // [35 50 35 42 44 50] 353 | // option2: value2 354 | // option2 355 | // value2 356 | } 357 | -------------------------------------------------------------------------------- /exec.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package binaryregexp 6 | 7 | import ( 8 | "io" 9 | "sync" 10 | 11 | "rsc.io/binaryregexp/syntax" 12 | ) 13 | 14 | // A queue is a 'sparse array' holding pending threads of execution. 15 | // See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html 16 | type queue struct { 17 | sparse []uint32 18 | dense []entry 19 | } 20 | 21 | // An entry is an entry on a queue. 22 | // It holds both the instruction pc and the actual thread. 23 | // Some queue entries are just place holders so that the machine 24 | // knows it has considered that pc. Such entries have t == nil. 25 | type entry struct { 26 | pc uint32 27 | t *thread 28 | } 29 | 30 | // A thread is the state of a single path through the machine: 31 | // an instruction and a corresponding capture array. 32 | // See https://swtch.com/~rsc/regexp/regexp2.html 33 | type thread struct { 34 | inst *syntax.Inst 35 | cap []int 36 | } 37 | 38 | // A machine holds all the state during an NFA simulation for p. 
39 | type machine struct { 40 | re *Regexp // corresponding Regexp 41 | p *syntax.Prog // compiled program 42 | q0, q1 queue // two queues for runq, nextq 43 | pool []*thread // pool of available threads 44 | matched bool // whether a match was found 45 | matchcap []int // capture information for the match 46 | 47 | inputs inputs 48 | } 49 | 50 | type inputs struct { 51 | // cached inputs, to avoid allocation 52 | bytes inputBytes 53 | string inputString 54 | reader inputReader 55 | } 56 | 57 | func (i *inputs) newBytes(b []byte) input { 58 | i.bytes.str = b 59 | return &i.bytes 60 | } 61 | 62 | func (i *inputs) newString(s string) input { 63 | i.string.str = s 64 | return &i.string 65 | } 66 | 67 | func (i *inputs) newReader(r io.ByteReader) input { 68 | i.reader.r = r 69 | i.reader.atEOT = false 70 | i.reader.pos = 0 71 | return &i.reader 72 | } 73 | 74 | func (i *inputs) clear() { 75 | // We need to clear 1 of these. 76 | // Avoid the expense of clearing the others (pointer write barrier). 77 | if i.bytes.str != nil { 78 | i.bytes.str = nil 79 | } else if i.reader.r != nil { 80 | i.reader.r = nil 81 | } else { 82 | i.string.str = "" 83 | } 84 | } 85 | 86 | func (i *inputs) init(r io.ByteReader, b []byte, s string) (input, int) { 87 | if r != nil { 88 | return i.newReader(r), 0 89 | } 90 | if b != nil { 91 | return i.newBytes(b), len(b) 92 | } 93 | return i.newString(s), len(s) 94 | } 95 | 96 | func (m *machine) init(ncap int) { 97 | for _, t := range m.pool { 98 | t.cap = t.cap[:ncap] 99 | } 100 | m.matchcap = m.matchcap[:ncap] 101 | } 102 | 103 | // alloc allocates a new thread with the given instruction. 104 | // It uses the free pool if possible. 
105 | func (m *machine) alloc(i *syntax.Inst) *thread { 106 | var t *thread 107 | if n := len(m.pool); n > 0 { 108 | t = m.pool[n-1] 109 | m.pool = m.pool[:n-1] 110 | } else { 111 | t = new(thread) 112 | t.cap = make([]int, len(m.matchcap), cap(m.matchcap)) 113 | } 114 | t.inst = i 115 | return t 116 | } 117 | 118 | // A lazyFlag is a lazily-evaluated syntax.EmptyOp, 119 | // for checking zero-width flags like ^ $ \A \z \B \b. 120 | // It records the pair of relevant runes and does not 121 | // determine the implied flags until absolutely necessary 122 | // (most of the time, that means never). 123 | type lazyFlag uint64 124 | 125 | func newLazyFlag(r1, r2 rune) lazyFlag { 126 | return lazyFlag(uint64(r1)<<32 | uint64(uint32(r2))) 127 | } 128 | 129 | func (f lazyFlag) match(op syntax.EmptyOp) bool { 130 | if op == 0 { 131 | return true 132 | } 133 | r1 := rune(f >> 32) 134 | if op&syntax.EmptyBeginLine != 0 { 135 | if r1 != '\n' && r1 >= 0 { 136 | return false 137 | } 138 | op &^= syntax.EmptyBeginLine 139 | } 140 | if op&syntax.EmptyBeginText != 0 { 141 | if r1 >= 0 { 142 | return false 143 | } 144 | op &^= syntax.EmptyBeginText 145 | } 146 | if op == 0 { 147 | return true 148 | } 149 | r2 := rune(f) 150 | if op&syntax.EmptyEndLine != 0 { 151 | if r2 != '\n' && r2 >= 0 { 152 | return false 153 | } 154 | op &^= syntax.EmptyEndLine 155 | } 156 | if op&syntax.EmptyEndText != 0 { 157 | if r2 >= 0 { 158 | return false 159 | } 160 | op &^= syntax.EmptyEndText 161 | } 162 | if op == 0 { 163 | return true 164 | } 165 | if syntax.IsWordChar(r1) != syntax.IsWordChar(r2) { 166 | op &^= syntax.EmptyWordBoundary 167 | } else { 168 | op &^= syntax.EmptyNoWordBoundary 169 | } 170 | return op == 0 171 | } 172 | 173 | // match runs the machine over the input starting at pos. 174 | // It reports whether a match was found. 175 | // If so, m.matchcap holds the submatch information. 
176 | func (m *machine) match(i input, pos int) bool { 177 | startCond := m.re.cond 178 | if startCond == ^syntax.EmptyOp(0) { // impossible 179 | return false 180 | } 181 | m.matched = false 182 | for i := range m.matchcap { 183 | m.matchcap[i] = -1 184 | } 185 | runq, nextq := &m.q0, &m.q1 186 | r, r1 := endOfText, endOfText 187 | width, width1 := 0, 0 188 | r, width = i.step(pos) 189 | if r != endOfText { 190 | r1, width1 = i.step(pos + width) 191 | } 192 | var flag lazyFlag 193 | if pos == 0 { 194 | flag = newLazyFlag(-1, r) 195 | } else { 196 | flag = i.context(pos) 197 | } 198 | for { 199 | if len(runq.dense) == 0 { 200 | if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 201 | // Anchored match, past beginning of text. 202 | break 203 | } 204 | if m.matched { 205 | // Have match; finished exploring alternatives. 206 | break 207 | } 208 | if len(m.re.prefix) > 0 && r1 != rune(m.re.prefix[0]) && i.canCheckPrefix() { 209 | // Match requires literal prefix; fast search for it. 210 | advance := i.index(m.re, pos) 211 | if advance < 0 { 212 | break 213 | } 214 | pos += advance 215 | r, width = i.step(pos) 216 | r1, width1 = i.step(pos + width) 217 | } 218 | } 219 | if !m.matched { 220 | if len(m.matchcap) > 0 { 221 | m.matchcap[0] = pos 222 | } 223 | m.add(runq, uint32(m.p.Start), pos, m.matchcap, &flag, nil) 224 | } 225 | flag = newLazyFlag(r, r1) 226 | m.step(runq, nextq, pos, pos+width, r, &flag) 227 | if width == 0 { 228 | break 229 | } 230 | if len(m.matchcap) == 0 && m.matched { 231 | // Found a match and not paying attention 232 | // to where it is, so any match will do. 233 | break 234 | } 235 | pos += width 236 | r, width = r1, width1 237 | if r != endOfText { 238 | r1, width1 = i.step(pos + width) 239 | } 240 | runq, nextq = nextq, runq 241 | } 242 | m.clear(nextq) 243 | return m.matched 244 | } 245 | 246 | // clear frees all threads on the thread queue. 
247 | func (m *machine) clear(q *queue) { 248 | for _, d := range q.dense { 249 | if d.t != nil { 250 | m.pool = append(m.pool, d.t) 251 | } 252 | } 253 | q.dense = q.dense[:0] 254 | } 255 | 256 | // step executes one step of the machine, running each of the threads 257 | // on runq and appending new threads to nextq. 258 | // The step processes the rune c (which may be endOfText), 259 | // which starts at position pos and ends at nextPos. 260 | // nextCond gives the setting for the empty-width flags after c. 261 | func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond *lazyFlag) { 262 | longest := m.re.longest 263 | for j := 0; j < len(runq.dense); j++ { 264 | d := &runq.dense[j] 265 | t := d.t 266 | if t == nil { 267 | continue 268 | } 269 | if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] { 270 | m.pool = append(m.pool, t) 271 | continue 272 | } 273 | i := t.inst 274 | add := false 275 | switch i.Op { 276 | default: 277 | panic("bad inst") 278 | 279 | case syntax.InstMatch: 280 | if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) { 281 | t.cap[1] = pos 282 | copy(m.matchcap, t.cap) 283 | } 284 | if !longest { 285 | // First-match mode: cut off all lower-priority threads. 286 | for _, d := range runq.dense[j+1:] { 287 | if d.t != nil { 288 | m.pool = append(m.pool, d.t) 289 | } 290 | } 291 | runq.dense = runq.dense[:0] 292 | } 293 | m.matched = true 294 | 295 | case syntax.InstRune: 296 | add = i.MatchRune(c) 297 | case syntax.InstRune1: 298 | add = c == i.Rune[0] 299 | case syntax.InstRuneAny: 300 | add = true 301 | case syntax.InstRuneAnyNotNL: 302 | add = c != '\n' 303 | } 304 | if add { 305 | t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t) 306 | } 307 | if t != nil { 308 | m.pool = append(m.pool, t) 309 | } 310 | } 311 | runq.dense = runq.dense[:0] 312 | } 313 | 314 | // add adds an entry to q for pc, unless the q already has such an entry. 
315 | // It also recursively adds an entry for all instructions reachable from pc by following 316 | // empty-width conditions satisfied by cond. pos gives the current position 317 | // in the input. 318 | func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond *lazyFlag, t *thread) *thread { 319 | Again: 320 | if pc == 0 { 321 | return t 322 | } 323 | if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { 324 | return t 325 | } 326 | 327 | j := len(q.dense) 328 | q.dense = q.dense[:j+1] 329 | d := &q.dense[j] 330 | d.t = nil 331 | d.pc = pc 332 | q.sparse[pc] = uint32(j) 333 | 334 | i := &m.p.Inst[pc] 335 | switch i.Op { 336 | default: 337 | panic("unhandled") 338 | case syntax.InstFail: 339 | // nothing 340 | case syntax.InstAlt, syntax.InstAltMatch: 341 | t = m.add(q, i.Out, pos, cap, cond, t) 342 | pc = i.Arg 343 | goto Again 344 | case syntax.InstEmptyWidth: 345 | if cond.match(syntax.EmptyOp(i.Arg)) { 346 | pc = i.Out 347 | goto Again 348 | } 349 | case syntax.InstNop: 350 | pc = i.Out 351 | goto Again 352 | case syntax.InstCapture: 353 | if int(i.Arg) < len(cap) { 354 | opos := cap[i.Arg] 355 | cap[i.Arg] = pos 356 | m.add(q, i.Out, pos, cap, cond, nil) 357 | cap[i.Arg] = opos 358 | } else { 359 | pc = i.Out 360 | goto Again 361 | } 362 | case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 363 | if t == nil { 364 | t = m.alloc(i) 365 | } else { 366 | t.inst = i 367 | } 368 | if len(cap) > 0 && &t.cap[0] != &cap[0] { 369 | copy(t.cap, cap) 370 | } 371 | d.t = t 372 | t = nil 373 | } 374 | return t 375 | } 376 | 377 | type onePassMachine struct { 378 | inputs inputs 379 | matchcap []int 380 | } 381 | 382 | var onePassPool sync.Pool 383 | 384 | func newOnePassMachine() *onePassMachine { 385 | m, ok := onePassPool.Get().(*onePassMachine) 386 | if !ok { 387 | m = new(onePassMachine) 388 | } 389 | return m 390 | } 391 | 392 | func freeOnePassMachine(m *onePassMachine) { 393 | 
// doOnePass implements r.doExecute using the one-pass execution engine.
//
// Exactly one of ir, ib, is supplies the input (it is handed to m.inputs.init).
// pos is the starting offset and ncap the number of capture slots to record.
// On a match the capture slots are appended to dstCap and the result is
// returned; nil is returned when there is no match.
func (re *Regexp) doOnePass(ir io.ByteReader, ib []byte, is string, pos, ncap int, dstCap []int) []int {
	startCond := re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return nil
	}

	// Borrow a pooled machine and size its capture buffer to ncap, reusing
	// the existing backing array when it is large enough.
	m := newOnePassMachine()
	if cap(m.matchcap) < ncap {
		m.matchcap = make([]int, ncap)
	} else {
		m.matchcap = m.matchcap[:ncap]
	}

	matched := false
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}

	i, _ := m.inputs.init(ir, ib, is)

	// r/width describe the rune at pos; r1/width1 the rune after it.
	r, r1 := endOfText, endOfText
	width, width1 := 0, 0
	r, width = i.step(pos)
	if r != endOfText {
		r1, width1 = i.step(pos + width)
	}
	var flag lazyFlag
	if pos == 0 {
		flag = newLazyFlag(-1, r)
	} else {
		flag = i.context(pos)
	}
	pc := re.onepass.Start
	inst := re.onepass.Inst[pc]
	// If there is a simple literal prefix, skip over it.
	if pos == 0 && flag.match(syntax.EmptyOp(inst.Arg)) &&
		len(re.prefix) > 0 && i.canCheckPrefix() {
		// Match requires literal prefix; fast search for it.
		if !i.hasPrefix(re) {
			goto Return
		}
		pos += len(re.prefix)
		r, width = i.step(pos)
		r1, width1 = i.step(pos + width)
		flag = i.context(pos)
		pc = int(re.prefixEnd)
	}
	for {
		inst = re.onepass.Inst[pc]
		pc = int(inst.Out)
		switch inst.Op {
		default:
			panic("bad inst")
		case syntax.InstMatch:
			matched = true
			if len(m.matchcap) > 0 {
				m.matchcap[0] = 0
				m.matchcap[1] = pos
			}
			goto Return
		case syntax.InstRune:
			if !inst.MatchRune(r) {
				goto Return
			}
		case syntax.InstRune1:
			if r != inst.Rune[0] {
				goto Return
			}
		case syntax.InstRuneAny:
			// Nothing
		case syntax.InstRuneAnyNotNL:
			if r == '\n' {
				goto Return
			}
		// peek at the input rune to see which branch of the Alt to take
		case syntax.InstAlt, syntax.InstAltMatch:
			pc = int(onePassNext(&inst, r))
			continue
		case syntax.InstFail:
			goto Return
		case syntax.InstNop:
			continue
		case syntax.InstEmptyWidth:
			if !flag.match(syntax.EmptyOp(inst.Arg)) {
				goto Return
			}
			continue
		case syntax.InstCapture:
			if int(inst.Arg) < len(m.matchcap) {
				m.matchcap[inst.Arg] = pos
			}
			continue
		}
		// Only the rune-consuming cases fall through to here. A zero width
		// means there is nothing left to consume, so stop.
		if width == 0 {
			break
		}
		// Advance one rune: shift the lookahead into the current slot and
		// fetch a new lookahead unless the input is exhausted.
		flag = newLazyFlag(r, r1)
		pos += width
		r, width = r1, width1
		if r != endOfText {
			r1, width1 = i.step(pos + width)
		}
	}

Return:
	// The machine goes back to the pool on both exits.
	if !matched {
		freeOnePassMachine(m)
		return nil
	}

	dstCap = append(dstCap, m.matchcap...)
	freeOnePassMachine(m)
	return dstCap
}
514 | func (re *Regexp) doMatch(r io.ByteReader, b []byte, s string) bool { 515 | return re.doExecute(r, b, s, 0, 0, nil) != nil 516 | } 517 | 518 | // doExecute finds the leftmost match in the input, appends the position 519 | // of its subexpressions to dstCap and returns dstCap. 520 | // 521 | // nil is returned if no matches are found and non-nil if matches are found. 522 | func (re *Regexp) doExecute(r io.ByteReader, b []byte, s string, pos int, ncap int, dstCap []int) []int { 523 | if dstCap == nil { 524 | // Make sure 'return dstCap' is non-nil. 525 | dstCap = arrayNoInts[:0:0] 526 | } 527 | 528 | if r == nil && len(b)+len(s) < re.minInputLen { 529 | return nil 530 | } 531 | 532 | if re.onepass != nil { 533 | return re.doOnePass(r, b, s, pos, ncap, dstCap) 534 | } 535 | if r == nil && len(b)+len(s) < re.maxBitStateLen { 536 | return re.backtrack(b, s, pos, ncap, dstCap) 537 | } 538 | 539 | m := re.get() 540 | i, _ := m.inputs.init(r, b, s) 541 | 542 | m.init(ncap) 543 | if !m.match(i, pos) { 544 | re.put(m) 545 | return nil 546 | } 547 | 548 | dstCap = append(dstCap, m.matchcap...) 549 | re.put(m) 550 | return dstCap 551 | } 552 | 553 | // arrayNoInts is returned by doExecute match if nil dstCap is passed 554 | // to it with ncap=0. 555 | var arrayNoInts [0]int 556 | -------------------------------------------------------------------------------- /exec2_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // +build !race 6 | 7 | package binaryregexp 8 | 9 | import ( 10 | "testing" 11 | ) 12 | 13 | // This test is excluded when running under the race detector because 14 | // it is a very expensive test and takes too long. 
15 | func TestRE2Exhaustive(t *testing.T) { 16 | if testing.Short() { 17 | t.Skip("skipping TestRE2Exhaustive during short test") 18 | } 19 | testRE2(t, "testdata/re2-exhaustive.txt.bz2") 20 | } 21 | -------------------------------------------------------------------------------- /exec_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2010 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package binaryregexp 6 | 7 | import ( 8 | "bufio" 9 | "compress/bzip2" 10 | "fmt" 11 | "io" 12 | "os" 13 | "path/filepath" 14 | "strconv" 15 | "strings" 16 | "testing" 17 | "unicode/utf8" 18 | 19 | "rsc.io/binaryregexp/syntax" 20 | ) 21 | 22 | // TestRE2 tests this package's regexp API against test cases 23 | // considered during RE2's exhaustive tests, which run all possible 24 | // regexps over a given set of atoms and operators, up to a given 25 | // complexity, over all possible strings over a given alphabet, 26 | // up to a given size. Rather than try to link with RE2, we read a 27 | // log file containing the test cases and the expected matches. 28 | // The log file, re2-exhaustive.txt, is generated by running 'make log' 29 | // in the open source RE2 distribution https://github.com/google/re2/. 30 | // 31 | // The test file format is a sequence of stanzas like: 32 | // 33 | // strings 34 | // "abc" 35 | // "123x" 36 | // regexps 37 | // "[a-z]+" 38 | // 0-3;0-3 39 | // -;- 40 | // "([0-9])([0-9])([0-9])" 41 | // -;- 42 | // -;0-3 0-1 1-2 2-3 43 | // 44 | // The stanza begins by defining a set of strings, quoted 45 | // using Go double-quote syntax, one per line. Then the 46 | // regexps section gives a sequence of regexps to run on 47 | // the strings. In the block that follows a regexp, each line 48 | // gives the semicolon-separated match results of running 49 | // the regexp on the corresponding string. 
50 | // Each match result is either a single -, meaning no match, or a 51 | // space-separated sequence of pairs giving the match and 52 | // submatch indices. An unmatched subexpression formats 53 | // its pair as a single - (not illustrated above). For now 54 | // each regexp run produces two match results, one for a 55 | // ``full match'' that restricts the regexp to matching the entire 56 | // string or nothing, and one for a ``partial match'' that gives 57 | // the leftmost first match found in the string. 58 | // 59 | // Lines beginning with # are comments. Lines beginning with 60 | // a capital letter are test names printed during RE2's test suite 61 | // and are echoed into t but otherwise ignored. 62 | // 63 | // At time of writing, re2-exhaustive.txt is 59 MB but compresses to 385 kB, 64 | // so we store re2-exhaustive.txt.bz2 in the repository and decompress it on the fly. 65 | // 66 | func TestRE2Search(t *testing.T) { 67 | testRE2(t, "testdata/re2-search.txt") 68 | } 69 | 70 | func testRE2(t *testing.T, file string) { 71 | t.Skip("skipping - RE2 testdata assumes UTF-8") 72 | f, err := os.Open(file) 73 | if err != nil { 74 | t.Fatal(err) 75 | } 76 | defer f.Close() 77 | var txt io.Reader 78 | if strings.HasSuffix(file, ".bz2") { 79 | z := bzip2.NewReader(f) 80 | txt = z 81 | file = file[:len(file)-len(".bz2")] // for error messages 82 | } else { 83 | txt = f 84 | } 85 | lineno := 0 86 | scanner := bufio.NewScanner(txt) 87 | var ( 88 | str []string 89 | input []string 90 | inStrings bool 91 | re *Regexp 92 | refull *Regexp 93 | nfail int 94 | ncase int 95 | ) 96 | for lineno := 1; scanner.Scan(); lineno++ { 97 | line := scanner.Text() 98 | switch { 99 | case line == "": 100 | t.Fatalf("%s:%d: unexpected blank line", file, lineno) 101 | case line[0] == '#': 102 | continue 103 | case 'A' <= line[0] && line[0] <= 'Z': 104 | // Test name. 
105 | t.Logf("%s\n", line) 106 | continue 107 | case line == "strings": 108 | str = str[:0] 109 | inStrings = true 110 | case line == "regexps": 111 | inStrings = false 112 | case line[0] == '"': 113 | q, err := strconv.Unquote(line) 114 | if err != nil { 115 | // Fatal because we'll get out of sync. 116 | t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err) 117 | } 118 | if inStrings { 119 | str = append(str, q) 120 | continue 121 | } 122 | // Is a regexp. 123 | if len(input) != 0 { 124 | t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q) 125 | } 126 | re, err = tryCompile(q) 127 | if err != nil { 128 | if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" { 129 | // We don't and likely never will support \C; keep going. 130 | continue 131 | } 132 | t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err) 133 | if nfail++; nfail >= 100 { 134 | t.Fatalf("stopping after %d errors", nfail) 135 | } 136 | continue 137 | } 138 | full := `\A(?:` + q + `)\z` 139 | refull, err = tryCompile(full) 140 | if err != nil { 141 | // Fatal because q worked, so this should always work. 142 | t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err) 143 | } 144 | input = str 145 | case line[0] == '-' || '0' <= line[0] && line[0] <= '9': 146 | // A sequence of match results. 147 | ncase++ 148 | if re == nil { 149 | // Failed to compile: skip results. 150 | continue 151 | } 152 | if len(input) == 0 { 153 | t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno) 154 | } 155 | var text string 156 | text, input = input[0], input[1:] 157 | if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) { 158 | // RE2's \B considers every byte position, 159 | // so it sees 'not word boundary' in the 160 | // middle of UTF-8 sequences. This package 161 | // only considers the positions between runes, 162 | // so it disagrees. Skip those cases. 
163 | continue 164 | } 165 | res := strings.Split(line, ";") 166 | if len(res) != len(run) { 167 | t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run)) 168 | } 169 | for i := range res { 170 | have, suffix := run[i](re, refull, text) 171 | want := parseResult(t, file, lineno, res[i]) 172 | if !same(have, want) { 173 | t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want) 174 | if nfail++; nfail >= 100 { 175 | t.Fatalf("stopping after %d errors", nfail) 176 | } 177 | continue 178 | } 179 | b, suffix := match[i](re, refull, text) 180 | if b != (want != nil) { 181 | t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b) 182 | if nfail++; nfail >= 100 { 183 | t.Fatalf("stopping after %d errors", nfail) 184 | } 185 | continue 186 | } 187 | } 188 | 189 | default: 190 | t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line) 191 | } 192 | } 193 | if err := scanner.Err(); err != nil { 194 | t.Fatalf("%s:%d: %v", file, lineno, err) 195 | } 196 | if len(input) != 0 { 197 | t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input)) 198 | } 199 | t.Logf("%d cases tested", ncase) 200 | } 201 | 202 | var run = []func(*Regexp, *Regexp, string) ([]int, string){ 203 | runFull, 204 | runPartial, 205 | runFullLongest, 206 | runPartialLongest, 207 | } 208 | 209 | func runFull(re, refull *Regexp, text string) ([]int, string) { 210 | refull.longest = false 211 | return refull.FindStringSubmatchIndex(text), "[full]" 212 | } 213 | 214 | func runPartial(re, refull *Regexp, text string) ([]int, string) { 215 | re.longest = false 216 | return re.FindStringSubmatchIndex(text), "" 217 | } 218 | 219 | func runFullLongest(re, refull *Regexp, text string) ([]int, string) { 220 | refull.longest = true 221 | return refull.FindStringSubmatchIndex(text), "[full,longest]" 222 | } 223 | 224 | func runPartialLongest(re, refull *Regexp, text string) ([]int, 
// parseResult converts one match result from the RE2 log into a slice of
// index pairs.
//
// res is either a single "-" (no match, returns nil) or a list of pairs
// separated by single spaces. Each pair is "lo-hi", or "-" for an unmatched
// subexpression, which is recorded as -1, -1. A malformed pair, or one with
// lo > hi, aborts the test via t.Fatalf; file and lineno are used only in
// that message.
func parseResult(t *testing.T, file string, lineno int, res string) []int {
	t.Helper()

	// A single - indicates no match.
	if res == "-" {
		return nil
	}

	// Otherwise, a space-separated list of pairs.
	pairs := strings.Split(res, " ")
	out := make([]int, 0, 2*len(pairs))
	for _, pair := range pairs {
		// - means no submatch.
		if pair == "-" {
			out = append(out, -1, -1)
			continue
		}
		k := strings.Index(pair, "-")
		if k < 0 {
			t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
		}
		lo, err1 := strconv.Atoi(pair[:k])
		hi, err2 := strconv.Atoi(pair[k+1:])
		if err1 != nil || err2 != nil || lo > hi {
			t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
		}
		out = append(out, lo, hi)
	}
	return out
}
NULL denotes the empty string and NIL denotes the 372 | // 0 pointer. 373 | if line[0] == '#' || line[0] == '\n' { 374 | continue Reading 375 | } 376 | line = line[:len(line)-1] 377 | field := notab.FindAllString(line, -1) 378 | for i, f := range field { 379 | if f == "NULL" { 380 | field[i] = "" 381 | } 382 | if f == "NIL" { 383 | t.Logf("%s:%d: skip: %s", file, lineno, line) 384 | continue Reading 385 | } 386 | } 387 | if len(field) == 0 { 388 | continue Reading 389 | } 390 | 391 | // Field 1: the regex(3) flags to apply, one character per REG_feature 392 | // flag. The test is skipped if REG_feature is not supported by the 393 | // implementation. If the first character is not [BEASKLP] then the 394 | // specification is a global control line. One or more of [BEASKLP] may be 395 | // specified; the test will be repeated for each mode. 396 | // 397 | // B basic BRE (grep, ed, sed) 398 | // E REG_EXTENDED ERE (egrep) 399 | // A REG_AUGMENTED ARE (egrep with negation) 400 | // S REG_SHELL SRE (sh glob) 401 | // K REG_SHELL|REG_AUGMENTED KRE (ksh glob) 402 | // L REG_LITERAL LRE (fgrep) 403 | // 404 | // a REG_LEFT|REG_RIGHT implicit ^...$ 405 | // b REG_NOTBOL lhs does not match ^ 406 | // c REG_COMMENT ignore space and #...\n 407 | // d REG_SHELL_DOT explicit leading . match 408 | // e REG_NOTEOL rhs does not match $ 409 | // f REG_MULTIPLE multiple \n separated patterns 410 | // g FNM_LEADING_DIR testfnmatch only -- match until / 411 | // h REG_MULTIREF multiple digit backref 412 | // i REG_ICASE ignore case 413 | // j REG_SPAN . matches \n 414 | // k REG_ESCAPE \ to escape [...] delimiter 415 | // l REG_LEFT implicit ^... 416 | // m REG_MINIMAL minimal match 417 | // n REG_NEWLINE explicit \n match 418 | // o REG_ENCLOSED (|&) magic inside [@|&](...) 
419 | // p REG_SHELL_PATH explicit / match 420 | // q REG_DELIMITED delimited pattern 421 | // r REG_RIGHT implicit ...$ 422 | // s REG_SHELL_ESCAPED \ not special 423 | // t REG_MUSTDELIM all delimiters must be specified 424 | // u standard unspecified behavior -- errors not counted 425 | // v REG_CLASS_ESCAPE \ special inside [...] 426 | // w REG_NOSUB no subexpression match array 427 | // x REG_LENIENT let some errors slide 428 | // y REG_LEFT regexec() implicit ^... 429 | // z REG_NULL NULL subexpressions ok 430 | // $ expand C \c escapes in fields 2 and 3 431 | // / field 2 is a regsubcomp() expression 432 | // = field 3 is a regdecomp() expression 433 | // 434 | // Field 1 control lines: 435 | // 436 | // C set LC_COLLATE and LC_CTYPE to locale in field 2 437 | // 438 | // ?test ... output field 5 if passed and != EXPECTED, silent otherwise 439 | // &test ... output field 5 if current and previous passed 440 | // |test ... output field 5 if current passed and previous failed 441 | // ; ... output field 2 if previous failed 442 | // {test ... skip if failed until } 443 | // } end of skip 444 | // 445 | // : comment comment copied as output NOTE 446 | // :comment:test :comment: ignored 447 | // N[OTE] comment comment copied as output NOTE 448 | // T[EST] comment comment 449 | // 450 | // number use number for nmatch (20 by default) 451 | flag := field[0] 452 | switch flag[0] { 453 | case '?', '&', '|', ';', '{', '}': 454 | // Ignore all the control operators. 455 | // Just run everything. 456 | flag = flag[1:] 457 | if flag == "" { 458 | continue Reading 459 | } 460 | case ':': 461 | i := strings.Index(flag[1:], ":") 462 | if i < 0 { 463 | t.Logf("skip: %s", line) 464 | continue Reading 465 | } 466 | flag = flag[1+i+1:] 467 | case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 468 | t.Logf("skip: %s", line) 469 | continue Reading 470 | } 471 | 472 | // Can check field count now that we've handled the myriad comment formats. 
473 | if len(field) < 4 { 474 | t.Errorf("%s:%d: too few fields: %s", file, lineno, line) 475 | continue Reading 476 | } 477 | 478 | // Expand C escapes (a.k.a. Go escapes). 479 | if strings.Contains(flag, "$") { 480 | f := `"` + field[1] + `"` 481 | if field[1], err = strconv.Unquote(f); err != nil { 482 | t.Errorf("%s:%d: cannot unquote %s", file, lineno, f) 483 | } 484 | f = `"` + field[2] + `"` 485 | if field[2], err = strconv.Unquote(f); err != nil { 486 | t.Errorf("%s:%d: cannot unquote %s", file, lineno, f) 487 | } 488 | } 489 | 490 | // Field 2: the regular expression pattern; SAME uses the pattern from 491 | // the previous specification. 492 | // 493 | if field[1] == "SAME" { 494 | field[1] = lastRegexp 495 | } 496 | lastRegexp = field[1] 497 | 498 | // Field 3: the string to match. 499 | text := field[2] 500 | 501 | // Field 4: the test outcome... 502 | ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3]) 503 | if !ok { 504 | t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3]) 505 | continue Reading 506 | } 507 | 508 | // Field 5: optional comment appended to the report. 509 | 510 | Testing: 511 | // Run test once for each specified capital letter mode that we support. 
// parseFowlerResult decodes field 4 (the test outcome) of a Fowler test line.
//
// The outcome is one of:
//   - "": a match is expected, with no position information;
//   - "NOMATCH": the pattern compiles but must not match;
//   - a word starting with an upper-case letter: treated as a compile error;
//   - a match array such as "(0,3)(?,?)", a list of (m,n) entries where "?"
//     denotes an unspecified endpoint and is recorded as -1.
//
// ok reports whether s could be parsed. compiled and matched give the expected
// compile and match outcomes, and pos holds the flattened match array. When
// the match array is malformed the result is ok=false, compiled=true.
func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) {
	switch {
	case s == "":
		// Match with no position information.
		return true, true, true, nil
	case s == "NOMATCH":
		// Match failure.
		return true, true, false, nil
	case 'A' <= s[0] && s[0] <= 'Z':
		// All the other error codes are compile errors.
		return true, false, false, nil
	}

	// Everything below is a match array, so the pattern is expected to compile.
	var x []int
	for s != "" {
		// Numbers alternate: the first of a pair follows '(' and ends at ',',
		// the second ends at ')'.
		var end byte = ')'
		if len(x)%2 == 0 {
			if s[0] != '(' {
				return false, true, false, nil
			}
			s = s[1:]
			end = ','
		}
		// Reject both a missing terminator (-1) and an empty number (0).
		i := strings.IndexByte(s, end)
		if i <= 0 {
			return false, true, false, nil
		}
		v := -1 // value used for an unspecified endpoint "?"
		if s[:i] != "?" {
			n, err := strconv.Atoi(s[:i])
			if err != nil {
				return false, true, false, nil
			}
			v = n
		}
		x = append(x, v)
		s = s[i+1:]
	}
	if len(x)%2 != 0 {
		return false, true, false, nil
	}
	return true, true, true, x
}
"A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"}, 711 | {"Medium", "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"}, 712 | {"Hard", "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"}, 713 | {"Hard1", "ABCD|CDEF|EFGH|GHIJ|IJKL|KLMN|MNOP|OPQR|QRST|STUV|UVWX|WXYZ"}, 714 | } 715 | 716 | var benchSizes = []struct { 717 | name string 718 | n int 719 | }{ 720 | {"16", 16}, 721 | {"32", 32}, 722 | {"1K", 1 << 10}, 723 | {"32K", 32 << 10}, 724 | {"1M", 1 << 20}, 725 | {"32M", 32 << 20}, 726 | } 727 | 728 | func TestLongest(t *testing.T) { 729 | re, err := Compile(`a(|b)`) 730 | if err != nil { 731 | t.Fatal(err) 732 | } 733 | if g, w := re.FindString("ab"), "a"; g != w { 734 | t.Errorf("first match was %q, want %q", g, w) 735 | } 736 | re.Longest() 737 | if g, w := re.FindString("ab"), "ab"; g != w { 738 | t.Errorf("longest match was %q, want %q", g, w) 739 | } 740 | } 741 | 742 | // TestProgramTooLongForBacktrack tests that a regex which is too long 743 | // for the backtracker still executes properly. 744 | func TestProgramTooLongForBacktrack(t *testing.T) { 745 | longRegex := 
MustCompile(`(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|twentyone|twentytwo|twentythree|twentyfour|twentyfive|twentysix|twentyseven|twentyeight|twentynine|thirty|thirtyone|thirtytwo|thirtythree|thirtyfour|thirtyfive|thirtysix|thirtyseven|thirtyeight|thirtynine|forty|fortyone|fortytwo|fortythree|fortyfour|fortyfive|fortysix|fortyseven|fortyeight|fortynine|fifty|fiftyone|fiftytwo|fiftythree|fiftyfour|fiftyfive|fiftysix|fiftyseven|fiftyeight|fiftynine|sixty|sixtyone|sixtytwo|sixtythree|sixtyfour|sixtyfive|sixtysix|sixtyseven|sixtyeight|sixtynine|seventy|seventyone|seventytwo|seventythree|seventyfour|seventyfive|seventysix|seventyseven|seventyeight|seventynine|eighty|eightyone|eightytwo|eightythree|eightyfour|eightyfive|eightysix|eightyseven|eightyeight|eightynine|ninety|ninetyone|ninetytwo|ninetythree|ninetyfour|ninetyfive|ninetysix|ninetyseven|ninetyeight|ninetynine|onehundred)`) 746 | if !longRegex.MatchString("two") { 747 | t.Errorf("longRegex.MatchString(\"two\") was false, want true") 748 | } 749 | if longRegex.MatchString("xxx") { 750 | t.Errorf("longRegex.MatchString(\"xxx\") was true, want false") 751 | } 752 | } 753 | -------------------------------------------------------------------------------- /find_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2010 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package binaryregexp 6 | 7 | import ( 8 | "fmt" 9 | "strings" 10 | "testing" 11 | ) 12 | 13 | // For each pattern/text pair, what is the expected output of each function? 14 | // We can derive the textual results from the indexed results, the non-submatch 15 | // results from the submatched results, the single results from the 'all' results, 16 | // and the byte results from the string results. 
// FindTest is one row of the find test table: a pattern, the text it is run
// against, and the expected FindAllStringSubmatchIndex result (nil when no
// match is expected).
type FindTest struct {
	pat     string
	text    string
	matches [][]int
}

// String renders the pattern and text of the test case for failure messages.
func (t FindTest) String() string {
	pat := fmt.Sprintf("%#q", t.pat)
	text := fmt.Sprintf("%#q", t.text)
	return "pat: " + pat + " text: " + text
}
".", build(1, 0, 1)}, 69 | {`/$`, "/abc/", build(1, 4, 5)}, 70 | {`/$`, "/abc", nil}, 71 | 72 | // multiple matches 73 | {`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)}, 74 | {`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)}, 75 | {`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)}, 76 | {`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)}, 77 | {`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)}, 78 | 79 | // fixed bugs 80 | {`ab$`, "cab", build(1, 1, 3)}, 81 | {`axxb$`, "axxcb", nil}, 82 | {`data`, "daXY data", build(1, 5, 9)}, 83 | {`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)}, 84 | {`zx+`, "zzx", build(1, 1, 3)}, 85 | {`ab$`, "abcab", build(1, 3, 5)}, 86 | {`(aa)*$`, "a", build(1, 1, 1, -1, -1)}, 87 | {`(?:.|(?:.a))`, "", nil}, 88 | {`(?:A(?:A|a))`, "Aa", build(1, 0, 2)}, 89 | {`(?:A|(?:A|a))`, "a", build(1, 0, 1)}, 90 | {`(a){0}`, "", build(1, 0, 0, -1, -1)}, 91 | {`(?-s)(?:(?:^).)`, "\n", nil}, 92 | {`(?s)(?:(?:^).)`, "\n", build(1, 0, 1)}, 93 | {`(?:(?:^).)`, "\n", nil}, 94 | {`\b`, "x", build(2, 0, 0, 1, 1)}, 95 | {`\b`, "xx", build(2, 0, 0, 2, 2)}, 96 | {`\b`, "x y", build(4, 0, 0, 1, 1, 2, 2, 3, 3)}, 97 | {`\b`, "xx yy", build(4, 0, 0, 2, 2, 3, 3, 5, 5)}, 98 | {`\B`, "x", nil}, 99 | {`\B`, "xx", build(1, 1, 1)}, 100 | {`\B`, "x y", nil}, 101 | {`\B`, "xx yy", build(2, 1, 1, 4, 4)}, 102 | 103 | // RE2 tests 104 | {`[^\S\s]`, "abcd", nil}, 105 | {`[^\S[:space:]]`, "abcd", nil}, 106 | {`[^\D\d]`, "abcd", nil}, 107 | {`[^\D[:digit:]]`, "abcd", nil}, 108 | {`(?i)\W`, "x", nil}, 109 | {`(?i)\W`, "k", nil}, 110 | {`(?i)\W`, "s", nil}, 111 | 112 | // can backslash-escape any punctuation 113 | {`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, 114 | `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, 115 | {`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`, 116 | `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, 117 | {"\\`", "`", build(1, 0, 1)}, 118 | {"[\\`]+", "`", build(1, 0, 1)}, 119 | 120 | // long set of 
matches (longer than startSize) 121 | { 122 | ".", 123 | "qwertyuiopasdfghjklzxcvbnm1234567890", 124 | build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 125 | 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 126 | 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 127 | 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36), 128 | }, 129 | } 130 | 131 | // build is a helper to construct a [][]int by extracting n sequences from x. 132 | // This represents n matches with len(x)/n submatches each. 133 | func build(n int, x ...int) [][]int { 134 | ret := make([][]int, n) 135 | runLength := len(x) / n 136 | j := 0 137 | for i := range ret { 138 | ret[i] = make([]int, runLength) 139 | copy(ret[i], x[j:]) 140 | j += runLength 141 | if j > len(x) { 142 | panic("invalid build entry") 143 | } 144 | } 145 | return ret 146 | } 147 | 148 | // First the simple cases. 149 | 150 | func TestFind(t *testing.T) { 151 | for _, test := range findTests { 152 | re := MustCompile(test.pat) 153 | if re.String() != test.pat { 154 | t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat) 155 | } 156 | result := re.Find([]byte(test.text)) 157 | switch { 158 | case len(test.matches) == 0 && len(result) == 0: 159 | // ok 160 | case test.matches == nil && result != nil: 161 | t.Errorf("expected no match; got one: %s", test) 162 | case test.matches != nil && result == nil: 163 | t.Errorf("expected match; got none: %s", test) 164 | case test.matches != nil && result != nil: 165 | expect := test.text[test.matches[0][0]:test.matches[0][1]] 166 | if len(result) != cap(result) { 167 | t.Errorf("expected capacity %d got %d: %s", len(result), cap(result), test) 168 | } 169 | if expect != string(result) { 170 | t.Errorf("expected %q got %q: %s", expect, result, test) 171 | } 172 | } 173 | } 174 | } 175 | 176 | func TestFindString(t *testing.T) { 177 | for _, test := range findTests { 178 | result := 
MustCompile(test.pat).FindString(test.text) 179 | switch { 180 | case len(test.matches) == 0 && len(result) == 0: 181 | // ok 182 | case test.matches == nil && result != "": 183 | t.Errorf("expected no match; got one: %s", test) 184 | case test.matches != nil && result == "": 185 | // Tricky because an empty result has two meanings: no match or empty match. 186 | if test.matches[0][0] != test.matches[0][1] { 187 | t.Errorf("expected match; got none: %s", test) 188 | } 189 | case test.matches != nil && result != "": 190 | expect := test.text[test.matches[0][0]:test.matches[0][1]] 191 | if expect != result { 192 | t.Errorf("expected %q got %q: %s", expect, result, test) 193 | } 194 | } 195 | } 196 | } 197 | 198 | func testFindIndex(test *FindTest, result []int, t *testing.T) { 199 | switch { 200 | case len(test.matches) == 0 && len(result) == 0: 201 | // ok 202 | case test.matches == nil && result != nil: 203 | t.Errorf("expected no match; got one: %s", test) 204 | case test.matches != nil && result == nil: 205 | t.Errorf("expected match; got none: %s", test) 206 | case test.matches != nil && result != nil: 207 | expect := test.matches[0] 208 | if expect[0] != result[0] || expect[1] != result[1] { 209 | t.Errorf("expected %v got %v: %s", expect, result, test) 210 | } 211 | } 212 | } 213 | 214 | func TestFindIndex(t *testing.T) { 215 | for _, test := range findTests { 216 | testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t) 217 | } 218 | } 219 | 220 | func TestFindStringIndex(t *testing.T) { 221 | for _, test := range findTests { 222 | testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t) 223 | } 224 | } 225 | 226 | func TestFindReaderIndex(t *testing.T) { 227 | for _, test := range findTests { 228 | testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t) 229 | } 230 | } 231 | 232 | // Now come the simple All cases. 
233 | 234 | func TestFindAll(t *testing.T) { 235 | for _, test := range findTests { 236 | result := MustCompile(test.pat).FindAll([]byte(test.text), -1) 237 | switch { 238 | case test.matches == nil && result == nil: 239 | // ok 240 | case test.matches == nil && result != nil: 241 | t.Errorf("expected no match; got one: %s", test) 242 | case test.matches != nil && result == nil: 243 | t.Fatalf("expected match; got none: %s", test) 244 | case test.matches != nil && result != nil: 245 | if len(test.matches) != len(result) { 246 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 247 | continue 248 | } 249 | for k, e := range test.matches { 250 | got := result[k] 251 | if len(got) != cap(got) { 252 | t.Errorf("match %d: expected capacity %d got %d: %s", k, len(got), cap(got), test) 253 | } 254 | expect := test.text[e[0]:e[1]] 255 | if expect != string(got) { 256 | t.Errorf("match %d: expected %q got %q: %s", k, expect, got, test) 257 | } 258 | } 259 | } 260 | } 261 | } 262 | 263 | func TestFindAllString(t *testing.T) { 264 | for _, test := range findTests { 265 | result := MustCompile(test.pat).FindAllString(test.text, -1) 266 | switch { 267 | case test.matches == nil && result == nil: 268 | // ok 269 | case test.matches == nil && result != nil: 270 | t.Errorf("expected no match; got one: %s", test) 271 | case test.matches != nil && result == nil: 272 | t.Errorf("expected match; got none: %s", test) 273 | case test.matches != nil && result != nil: 274 | if len(test.matches) != len(result) { 275 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 276 | continue 277 | } 278 | for k, e := range test.matches { 279 | expect := test.text[e[0]:e[1]] 280 | if expect != result[k] { 281 | t.Errorf("expected %q got %q: %s", expect, result, test) 282 | } 283 | } 284 | } 285 | } 286 | } 287 | 288 | func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) { 289 | switch { 290 | case test.matches == nil && 
result == nil: 291 | // ok 292 | case test.matches == nil && result != nil: 293 | t.Errorf("expected no match; got one: %s", test) 294 | case test.matches != nil && result == nil: 295 | t.Errorf("expected match; got none: %s", test) 296 | case test.matches != nil && result != nil: 297 | if len(test.matches) != len(result) { 298 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 299 | return 300 | } 301 | for k, e := range test.matches { 302 | if e[0] != result[k][0] || e[1] != result[k][1] { 303 | t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test) 304 | } 305 | } 306 | } 307 | } 308 | 309 | func TestFindAllIndex(t *testing.T) { 310 | for _, test := range findTests { 311 | testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t) 312 | } 313 | } 314 | 315 | func TestFindAllStringIndex(t *testing.T) { 316 | for _, test := range findTests { 317 | testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t) 318 | } 319 | } 320 | 321 | // Now come the Submatch cases. 

// testSubmatchBytes checks one match's byte-slice submatches against the
// expected index pairs in submatches. n is the match number used in failure
// messages. A pair starting with -1 means the group did not participate and
// the corresponding result must be nil.
func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) {
	if len(submatches) != len(result)*2 {
		t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
		return
	}
	for k := 0; k < len(submatches); k += 2 {
		if submatches[k] == -1 {
			if result[k/2] != nil {
				t.Errorf("match %d: expected nil got %q: %s", n, result, test)
			}
			continue
		}
		got := result[k/2]
		// Each returned slice must be capacity-limited to its length.
		if len(got) != cap(got) {
			t.Errorf("match %d: expected capacity %d got %d: %s", n, len(got), cap(got), test)
			return
		}
		expect := test.text[submatches[k]:submatches[k+1]]
		if expect != string(got) {
			t.Errorf("match %d: expected %q got %q: %s", n, expect, got, test)
			return
		}
	}
}

// TestFindSubmatch runs the table through FindSubmatch, checking the first
// expected match only.
func TestFindSubmatch(t *testing.T) {
	for _, test := range findTests {
		result := MustCompile(test.pat).FindSubmatch([]byte(test.text))
		switch {
		case test.matches == nil && result == nil:
			// ok
		case test.matches == nil && result != nil:
			t.Errorf("expected no match; got one: %s", test)
		case test.matches != nil && result == nil:
			t.Errorf("expected match; got none: %s", test)
		case test.matches != nil && result != nil:
			testSubmatchBytes(&test, 0, test.matches[0], result, t)
		}
	}
}

// testSubmatchString checks one match's string submatches against the
// expected index pairs in submatches. n is the match number used in failure
// messages. A pair starting with -1 means the group did not participate and
// the corresponding result must be the empty string.
func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) {
	if len(submatches) != len(result)*2 {
		t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
		return
	}
	for k := 0; k < len(submatches); k += 2 {
		if submatches[k] == -1 {
			if result[k/2] != "" {
				t.Errorf("match %d: expected nil got %q: %s", n, result, test)
			}
			continue
		}
		expect := test.text[submatches[k]:submatches[k+1]]
		if expect != result[k/2] {
			t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
			return
		}
	}
}

// TestFindStringSubmatch runs the table through FindStringSubmatch, checking
// the first expected match only.
func TestFindStringSubmatch(t *testing.T) {
	for _, test := range findTests {
		result := MustCompile(test.pat).FindStringSubmatch(test.text)
		switch {
		case test.matches == nil && result == nil:
			// ok
		case test.matches == nil && result != nil:
			t.Errorf("expected no match; got one: %s", test)
		case test.matches != nil && result == nil:
			t.Errorf("expected match; got none: %s", test)
		case test.matches != nil && result != nil:
			testSubmatchString(&test, 0, test.matches[0], result, t)
		}
	}
}

// testSubmatchIndices compares the flat index slice of one match with the
// expected one, element by element. n is the match number used in failure
// messages. An error is reported for every differing element.
func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) {
	if len(expect) != len(result) {
		t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test)
		return
	}
	for k, e := range expect {
		if e != result[k] {
			t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test)
		}
	}
}

// testFindSubmatchIndex checks a single-match index result against the first
// expected match of test.
func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) {
	switch {
	case test.matches == nil && result == nil:
		// ok
	case test.matches == nil && result != nil:
		t.Errorf("expected no match; got one: %s", test)
	case test.matches != nil && result == nil:
		t.Errorf("expected match; got none: %s", test)
	case test.matches != nil && result != nil:
		testSubmatchIndices(test, 0, test.matches[0], result, t)
	}
}

// TestFindSubmatchIndex runs the table through FindSubmatchIndex.
func TestFindSubmatchIndex(t *testing.T) {
	for _, test := range findTests {
		testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t)
	}
}

// TestFindStringSubmatchIndex runs the table through FindStringSubmatchIndex.
func TestFindStringSubmatchIndex(t *testing.T) {
	for _, test := range findTests {
		testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t)
	}
}

// TestFindReaderSubmatchIndex runs the table through FindReaderSubmatchIndex.
func TestFindReaderSubmatchIndex(t *testing.T) {
	for _, test := range findTests {
		testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t)
	}
}

// Now come the monster AllSubmatch cases.

// TestFindAllSubmatch runs the table through FindAllSubmatch and checks
// every match with testSubmatchBytes.
func TestFindAllSubmatch(t *testing.T) {
	for _, test := range findTests {
		result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1)
		switch {
		case test.matches == nil && result == nil:
			// ok
		case test.matches == nil && result != nil:
			t.Errorf("expected no match; got one: %s", test)
		case test.matches != nil && result == nil:
			t.Errorf("expected match; got none: %s", test)
		case len(test.matches) != len(result):
			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
		case test.matches != nil && result != nil:
			for k, match := range test.matches {
				testSubmatchBytes(&test, k, match, result[k], t)
			}
		}
	}
}

// TestFindAllStringSubmatch runs the table through FindAllStringSubmatch and
// checks every match with testSubmatchString.
func TestFindAllStringSubmatch(t *testing.T) {
	for _, test := range findTests {
		result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1)
		switch {
		case test.matches == nil && result == nil:
			// ok
		case test.matches == nil && result != nil:
			t.Errorf("expected no match; got one: %s", test)
		case test.matches != nil && result == nil:
			t.Errorf("expected match; got none: %s", test)
		case len(test.matches) != len(result):
			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
		case test.matches != nil && result != nil:
			for k, match := range test.matches {
				testSubmatchString(&test, k, match, result[k], t)
			}
		}
	}
}

// testFindAllSubmatchIndex checks an all-matches index result against the
// full expected table entry, one match at a time.
func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) {
	switch {
	case test.matches == nil && result == nil:
		// ok
	case test.matches == nil && result != nil:
		t.Errorf("expected no match; got one: %s", test)
	case test.matches != nil && result == nil:
		t.Errorf("expected match; got none: %s", test)
	case len(test.matches) != len(result):
		t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
	case test.matches != nil && result != nil:
		for k, match := range test.matches {
			testSubmatchIndices(test, k, match, result[k], t)
		}
	}
}

// TestFindAllSubmatchIndex runs the table through FindAllSubmatchIndex.
func TestFindAllSubmatchIndex(t *testing.T) {
	for _, test := range findTests {
		testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t)
	}
}

// TestFindAllStringSubmatchIndex runs the table through
// FindAllStringSubmatchIndex.
func TestFindAllStringSubmatchIndex(t *testing.T) {
	for _, test := range findTests {
		testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t)
	}
}

--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
module rsc.io/binaryregexp

go 1.12

--------------------------------------------------------------------------------
/onepass.go:
--------------------------------------------------------------------------------
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package binaryregexp

import (
	"bytes"
	"sort"
	"unicode"

	"rsc.io/binaryregexp/syntax"
)

// "One-pass" regexp execution.
// Some regexps can be analyzed to determine that they never need
// backtracking: they are guaranteed to run in one pass over the string
// without bothering to save all the usual NFA state.
19 | // Detect those and execute them more quickly. 20 | 21 | // A onePassProg is a compiled one-pass regular expression program. 22 | // It is the same as syntax.Prog except for the use of onePassInst. 23 | type onePassProg struct { 24 | Inst []onePassInst 25 | Start int // index of start instruction 26 | NumCap int // number of InstCapture insts in re 27 | } 28 | 29 | // A onePassInst is a single instruction in a one-pass regular expression program. 30 | // It is the same as syntax.Inst except for the new 'Next' field. 31 | type onePassInst struct { 32 | syntax.Inst 33 | Next []uint32 34 | } 35 | 36 | // OnePassPrefix returns a literal string that all matches for the 37 | // regexp must start with. Complete is true if the prefix 38 | // is the entire match. Pc is the index of the last rune instruction 39 | // in the string. The OnePassPrefix skips over the mandatory 40 | // EmptyBeginText 41 | func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) { 42 | i := &p.Inst[p.Start] 43 | if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 { 44 | return "", i.Op == syntax.InstMatch, uint32(p.Start) 45 | } 46 | pc = i.Out 47 | i = &p.Inst[pc] 48 | for i.Op == syntax.InstNop { 49 | pc = i.Out 50 | i = &p.Inst[pc] 51 | } 52 | // Avoid allocation of buffer if prefix is empty. 53 | if iop(i) != syntax.InstRune || len(i.Rune) != 1 { 54 | return "", i.Op == syntax.InstMatch, uint32(p.Start) 55 | } 56 | 57 | // Have prefix; gather characters. 
58 | var buf bytes.Buffer 59 | for iop(i) == syntax.InstRune && len(i.Rune) == 1 && i.Rune[0] <= 0xFF && syntax.Flags(i.Arg)&syntax.FoldCase == 0 { 60 | buf.WriteByte(byte(i.Rune[0])) 61 | pc, i = i.Out, &p.Inst[i.Out] 62 | } 63 | if i.Op == syntax.InstEmptyWidth && 64 | syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 && 65 | p.Inst[i.Out].Op == syntax.InstMatch { 66 | complete = true 67 | } 68 | return buf.String(), complete, pc 69 | } 70 | 71 | // OnePassNext selects the next actionable state of the prog, based on the input character. 72 | // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine. 73 | // One of the alternates may ultimately lead without input to end of line. If the instruction 74 | // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next. 75 | func onePassNext(i *onePassInst, r rune) uint32 { 76 | next := i.MatchRunePos(r) 77 | if next >= 0 { 78 | return i.Next[next] 79 | } 80 | if i.Op == syntax.InstAltMatch { 81 | return i.Out 82 | } 83 | return 0 84 | } 85 | 86 | func iop(i *syntax.Inst) syntax.InstOp { 87 | op := i.Op 88 | switch op { 89 | case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 90 | op = syntax.InstRune 91 | } 92 | return op 93 | } 94 | 95 | // Sparse Array implementation is used as a queueOnePass. 
type queueOnePass struct {
	sparse          []uint32 // sparse[u] is the slot of u in dense, if u is queued
	dense           []uint32 // queued values in insertion order
	size, nextIndex uint32   // number of queued values; read cursor into dense
}

// empty reports whether the read cursor has reached the end of the queue.
func (q *queueOnePass) empty() bool {
	return !(q.nextIndex < q.size)
}

// next returns the value under the read cursor and advances the cursor.
func (q *queueOnePass) next() uint32 {
	v := q.dense[q.nextIndex]
	q.nextIndex++
	return v
}

// clear forgets all queued values and rewinds the read cursor.
func (q *queueOnePass) clear() {
	q.size, q.nextIndex = 0, 0
}

// contains reports whether u is currently queued. Values outside the
// queue's capacity are never contained.
func (q *queueOnePass) contains(u uint32) bool {
	if u < uint32(len(q.sparse)) {
		slot := q.sparse[u]
		return slot < q.size && q.dense[slot] == u
	}
	return false
}

// insert queues u unless it is already present.
func (q *queueOnePass) insert(u uint32) {
	if q.contains(u) {
		return
	}
	q.insertNew(u)
}

// insertNew queues u without checking for duplicates. Values outside the
// queue's capacity are ignored.
func (q *queueOnePass) insertNew(u uint32) {
	if u < uint32(len(q.sparse)) {
		q.sparse[u] = q.size
		q.dense[q.size] = u
		q.size++
	}
}

// newQueue returns an empty queue able to hold the values 0..size-1.
func newQueue(size int) (q *queueOnePass) {
	q = new(queueOnePass)
	q.sparse = make([]uint32, size)
	q.dense = make([]uint32, size)
	return q
}

// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
// NextIp array with the single element mergeFailed is returned.
// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
const mergeFailed = uint32(0xffffffff)

var (
	noRune = []rune{}
	noNext = []uint32{mergeFailed}
)

// mergeRuneSets merges two rune sets, each given as a flat list of
// [lo, hi] pairs, into one list ordered by lo. Alongside it returns a
// slice holding, for every merged pair, the program counter (leftPC or
// rightPC) of the set the pair came from.
//
// A pair is rejected when its lo is not strictly greater than the hi of the
// pair merged before it, which covers both overlapping sets and input that
// is not in ascending order. In that case the function returns noRune and
// noNext, i.e. an empty rune slice and a slice whose only element is
// mergeFailed.
//
// The inputs are not modified. It panics if either input has odd length.
func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
	left, right := *leftRunes, *rightRunes
	if len(left)%2 != 0 || len(right)%2 != 0 {
		panic("mergeRuneSets odd length []rune")
	}

	// The output sizes are bounded by the inputs, so allocate once.
	merged := make([]rune, 0, len(left)+len(right))
	next := make([]uint32, 0, (len(left)+len(right))/2)

	// take moves the pair at the head of src to the output and returns the
	// remainder of src. It reports false, leaving the output untouched,
	// when the pair does not start after the last merged pair ends.
	take := func(src []rune, pc uint32) ([]rune, bool) {
		if n := len(merged); n > 0 && src[0] <= merged[n-1] {
			return src, false
		}
		merged = append(merged, src[0], src[1])
		next = append(next, pc)
		return src[2:], true
	}

	for len(left) > 0 || len(right) > 0 {
		var ok bool
		// Take from the right only when it starts strictly lower; ties go
		// to the left and are then rejected as an overlap.
		if len(left) == 0 || (len(right) > 0 && right[0] < left[0]) {
			right, ok = take(right, rightPC)
		} else {
			left, ok = take(left, leftPC)
		}
		if !ok {
			return noRune, noNext
		}
	}
	return merged, next
}

// cleanupOnePass drops working memory, and restores certain shortcut instructions.
func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
	for ix, instOriginal := range original.Inst {
		switch instOriginal.Op {
		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
			// These keep their Next dispatch table.
		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
			prog.Inst[ix].Next = nil
		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
			// The instruction is replaced wholesale by the original one,
			// which also leaves Next nil.
			prog.Inst[ix].Next = nil
			prog.Inst[ix] = onePassInst{Inst: instOriginal}
		}
	}
}

// onePassCopy creates a copy of the original Prog, as we'll be modifying it.
// The copy has the same Start and NumCap, and each instruction is wrapped in
// a onePassInst. Some Alt instructions in the copy are then re-targeted as
// described in the body; prog itself is not written to.
func onePassCopy(prog *syntax.Prog) *onePassProg {
	p := &onePassProg{
		Start:  prog.Start,
		NumCap: prog.NumCap,
		Inst:   make([]onePassInst, len(prog.Inst)),
	}
	for i, inst := range prog.Inst {
		p.Inst[i] = onePassInst{Inst: inst}
	}

	// rewrites one or more common Prog constructs that enable some otherwise
	// non-onepass Progs to be onepass. A:BC (for example) means an InstAlt at
	// ip A, that points to ips B & C.
	// A:BC + B:DA => A:BC + B:CD
	// A:BC + B:DC => A:DC + B:DC
	for pc := range p.Inst {
		switch p.Inst[pc].Op {
		default:
			continue
		case syntax.InstAlt, syntax.InstAltMatch:
			// A:Bx + B:Ay
			// p_A_Alt is meant to address the leg of A that targets another
			// Alt (B); p_A_Other addresses A's remaining leg. Start by
			// guessing that Arg is the Alt leg and swap below if it is not.
			p_A_Other := &p.Inst[pc].Out
			p_A_Alt := &p.Inst[pc].Arg
			// make sure a target is another Alt
			instAlt := p.Inst[*p_A_Alt]
			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
				instAlt = p.Inst[*p_A_Alt]
				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
					continue
				}
			}
			instOther := p.Inst[*p_A_Other]
			// Analyzing both legs pointing to Alts is for another day
			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
				// too complicated
				continue
			}
			// simple empty transition loop
			// A:BC + B:DA => A:BC + B:DC
			// instAlt is a copy of B taken before any patching, so the
			// comparisons below see B's original targets, while the
			// p_B_* pointers write into the live instruction.
			p_B_Alt := &p.Inst[*p_A_Alt].Out
			p_B_Other := &p.Inst[*p_A_Alt].Arg
			patch := false
			if instAlt.Out == uint32(pc) {
				patch = true
			} else if instAlt.Arg == uint32(pc) {
				patch = true
				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
			}
			if patch {
				// B's leg that pointed back at A now points at A's other target.
				*p_B_Alt = *p_A_Other
			}

			// empty transition to common target
			// A:BC + B:DC => A:DC + B:DC
			if *p_A_Other == *p_B_Alt {
				*p_A_Alt = *p_B_Other
			}
		}
	}
	return p
}

// runeSlice exists to permit sorting the case-folded rune sets.
// It implements sort.Interface in ascending rune order.
type runeSlice []rune

func (p runeSlice) Len() int           { return len(p) }
func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }

// anyRuneNotNL is the [lo, hi] pair list covering every rune except '\n'.
var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}

// anyRune is the [lo, hi] pair list covering every rune.
var anyRune = []rune{0, unicode.MaxRune}

// makeOnePass creates a onepass Prog, if possible.
// It is possible if at any alt,
// the match engine can always tell which branch to take. The routine may modify
// p if it is turned into a onepass Prog. If it isn't possible for this to be a
// onepass Prog, the Prog nil is returned. makeOnePass is recursive
// to the size of the Prog.
func makeOnePass(p *onePassProg) *onePassProg {
	// If the machine is very long, it's not worth the time to check if we can use one pass.
	if len(p.Inst) >= 1000 {
		return nil
	}

	var (
		// instQueue holds instructions still to be checked; visitQueue
		// records the instructions seen during the current check walk.
		instQueue  = newQueue(len(p.Inst))
		visitQueue = newQueue(len(p.Inst))
		check      func(uint32, []bool) bool
		// onePassRunes[pc] is the [lo, hi] pair list computed for pc; it
		// is stored into p.Inst[pc].Rune at the end on success.
		onePassRunes = make([][]rune, len(p.Inst))
	)

	// check that paths from Alt instructions are unambiguous, and rebuild the new
	// program as a onepass program
	// m[pc] records whether pc reaches InstMatch without consuming input.
	check = func(pc uint32, m []bool) (ok bool) {
		ok = true
		inst := &p.Inst[pc]
		if visitQueue.contains(pc) {
			return
		}
		visitQueue.insert(pc)
		switch inst.Op {
		case syntax.InstAlt, syntax.InstAltMatch:
			ok = check(inst.Out, m) && check(inst.Arg, m)
			// check no-input paths to InstMatch
			matchOut := m[inst.Out]
			matchArg := m[inst.Arg]
			if matchOut && matchArg {
				ok = false
				break
			}
			// Match on empty goes in inst.Out
			if matchArg {
				inst.Out, inst.Arg = inst.Arg, inst.Out
				matchOut, matchArg = matchArg, matchOut
			}
			if matchOut {
				m[pc] = true
				inst.Op = syntax.InstAltMatch
			}

			// build a dispatch operator from the two legs of the alt.
			onePassRunes[pc], inst.Next = mergeRuneSets(
				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
				ok = false
				break
			}
		case syntax.InstCapture, syntax.InstNop:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			// pass matching runes back through these no-ops.
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
			for i := range inst.Next {
				inst.Next[i] = inst.Out
			}
		case syntax.InstEmptyWidth:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
			for i := range inst.Next {
				inst.Next[i] = inst.Out
			}
		case syntax.InstMatch, syntax.InstFail:
			m[pc] = inst.Op == syntax.InstMatch
		case syntax.InstRune:
			m[pc] = false
			// A non-empty Next means this instruction was already rebuilt.
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			if len(inst.Rune) == 0 {
				onePassRunes[pc] = []rune{}
				inst.Next = []uint32{inst.Out}
				break
			}
			runes := make([]rune, 0)
			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				// Expand a case-folded single rune into one [r, r] pair
				// per member of its fold orbit, then sort.
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune...)
			}
			onePassRunes[pc] = runes
			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
			for i := range inst.Next {
				inst.Next[i] = inst.Out
			}
			inst.Op = syntax.InstRune
		case syntax.InstRune1:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			runes := []rune{}
			// expand case-folded runes
			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune[0], inst.Rune[0])
			}
			onePassRunes[pc] = runes
			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
			for i := range inst.Next {
				inst.Next[i] = inst.Out
			}
			inst.Op = syntax.InstRune
		case syntax.InstRuneAny:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRune...)
			inst.Next = []uint32{inst.Out}
		case syntax.InstRuneAnyNotNL:
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			instQueue.insert(inst.Out)
			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
			for i := range inst.Next {
				inst.Next[i] = inst.Out
			}
		}
		return
	}

	// Walk the program from Start; rune instructions enqueue their
	// successor, so each consuming step starts a fresh check walk.
	instQueue.clear()
	instQueue.insert(uint32(p.Start))
	m := make([]bool, len(p.Inst))
	for !instQueue.empty() {
		visitQueue.clear()
		pc := instQueue.next()
		if !check(pc, m) {
			p = nil
			break
		}
	}
	if p != nil {
		for i := range p.Inst {
			p.Inst[i].Rune = onePassRunes[i]
		}
	}
	return p
}

// compileOnePass returns a new *onePassProg suitable for onePass execution if the original Prog
// can be recharacterized as a one-pass regexp program, or nil if the
// Prog cannot be converted. For a one pass prog, the fundamental condition that must
// be true is: at any InstAlt, there must be no ambiguity about what branch to take.
func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
	if prog.Start == 0 {
		return nil
	}
	// onepass regexp is anchored
	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
		return nil
	}
	// every instruction leading to InstMatch must be EmptyEndText
	for _, inst := range prog.Inst {
		opOut := prog.Inst[inst.Out].Op
		switch inst.Op {
		default:
			if opOut == syntax.InstMatch {
				return nil
			}
		case syntax.InstAlt, syntax.InstAltMatch:
			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
				return nil
			}
		case syntax.InstEmptyWidth:
			if opOut == syntax.InstMatch {
				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
					continue
				}
				return nil
			}
		}
	}
	// Creates a slightly optimized copy of the original Prog
	// that cleans up some Prog idioms that block valid onepass programs
	p = onePassCopy(prog)

	// checkAmbiguity on InstAlts, build onepass Prog if possible
	p = makeOnePass(p)

	if p != nil {
		cleanupOnePass(p, prog)
	}
	return p
}

--------------------------------------------------------------------------------
/onepass_test.go:
--------------------------------------------------------------------------------
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package binaryregexp

import (
	"reflect"
	"rsc.io/binaryregexp/syntax"
	"strings"
	"testing"
)

// runeMergeTests lists mergeRuneSets inputs (left, right, leftPC, rightPC)
// with the merged pair list and next table they must produce. Failed merges
// expect an empty merged slice and next == {mergeFailed}.
var runeMergeTests = []struct {
	left, right, merged []rune
	next                []uint32
	leftPC, rightPC     uint32
}{
	{
		// empty rhs
		[]rune{69, 69},
		[]rune{},
		[]rune{69, 69},
		[]uint32{1},
		1, 2,
	},
	{
		// identical runes, identical targets
		[]rune{69, 69},
		[]rune{69, 69},
		[]rune{},
		[]uint32{mergeFailed},
		1, 1,
	},
	{
		// identical runes, different targets
		[]rune{69, 69},
		[]rune{69, 69},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// append right-first
		[]rune{69, 69},
		[]rune{71, 71},
		[]rune{69, 69, 71, 71},
		[]uint32{1, 2},
		1, 2,
	},
	{
		// append, left-first
		[]rune{71, 71},
		[]rune{69, 69},
		[]rune{69, 69, 71, 71},
		[]uint32{2, 1},
		1, 2,
	},
	{
		// successful interleave
		[]rune{60, 60, 71, 71, 101, 101},
		[]rune{69, 69, 88, 88},
		[]rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101},
		[]uint32{1, 2, 1, 2, 1},
		1, 2,
	},
	{
		// left surrounds right
		[]rune{69, 74},
		[]rune{71, 71},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// right surrounds left
		[]rune{69, 74},
		[]rune{68, 75},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// overlap at interval begin
		[]rune{69, 74},
		[]rune{74, 75},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// overlap at interval end
		[]rune{69, 74},
		[]rune{65, 69},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// overlap from above
		[]rune{69, 74},
		[]rune{71, 74},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// overlap from below
		[]rune{69, 74},
		[]rune{65, 71},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
	{
		// out of order []rune
		[]rune{69, 74, 60, 65},
		[]rune{66, 67},
		[]rune{},
		[]uint32{mergeFailed},
		1, 2,
	},
}

// TestMergeRuneSet runs mergeRuneSets over runeMergeTests and compares both
// return values with reflect.DeepEqual.
func TestMergeRuneSet(t *testing.T) {
	for ix, test := range runeMergeTests {
		merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC)
		if !reflect.DeepEqual(merged, test.merged) {
			t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged)
		}
		if !reflect.DeepEqual(next, test.next) {
			t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next)
		}
	}
}

// onePassTests pairs a regular expression with whether compileOnePass is
// expected to accept it.
var onePassTests = []struct {
	re        string
	isOnePass bool
}{
	{`^(?:a|(?:a*))$`, false},
	{`^(?:(a)|(?:a*))$`, false},
	{`^(?:(?:(?:.(?:$))?))$`, true},
	{`^abcd$`, true},
	{`^(?:(?:a{0,})*?)$`, true},
	{`^(?:(?:a+)*)$`, true},
	{`^(?:(?:a|(?:aa)))$`, true},
	{`^(?:[^\s\S])$`, true},
	{`^(?:(?:a{3,4}){0,})$`, false},
	{`^(?:(?:(?:a*)+))$`, true},
	{`^[a-c]+$`, true},
	{`^[a-c]*$`, true},
	{`^(?:a*)$`, true},
	{`^(?:(?:aa)|a)$`, true},
	{`^[a-c]*`, false},
	{`^...$`, true},
	{`^(?:a|(?:aa))$`, true},
	{`^a((b))c$`, true},
	{`^a.[l-nA-Cg-j]?e$`, true},
	{`^a((b))$`, true},
	{`^a(?:(b)|(c))c$`, true},
	{`^a(?:(b*)|(c))c$`, false},
	{`^a(?:b|c)$`, true},
	{`^a(?:b?|c)$`, true},
	{`^a(?:b?|c?)$`, false},
	{`^a(?:b?|c+)$`, true},
	{`^a(?:b+|(bc))d$`, false},
	{`^a(?:bc)+$`, true},
	{`^a(?:[bcd])+$`, true},
	{`^a((?:[bcd])+)$`, true},
	{`^a(:?b|c)*d$`, true},
	{`^.bc(d|e)*$`, true},
	{`^(?:(?:aa)|.)$`, false},
	{`^(?:(?:a{1,2}){1,2})$`, false},
	{`^l` + strings.Repeat("o", 2<<8) + `ng$`, true},
}

// TestCompileOnePass parses, simplifies and compiles every onePassTests
// pattern, then checks whether compileOnePass returns a program.
func TestCompileOnePass(t *testing.T) {
	var (
		p   *syntax.Prog
		re  *syntax.Regexp
		err error
	)
	for _, test := range onePassTests {
		if re, err = syntax.Parse(test.re, syntax.Perl); err != nil {
			t.Errorf("Parse(%q) got err:%s, want success", test.re, err)
			continue
		}
		// needs to be done before compile...
		re = re.Simplify()
		if p, err = syntax.Compile(re); err != nil {
			t.Errorf("Compile(%q) got err:%s, want success", test.re, err)
			continue
		}
		isOnePass := compileOnePass(p) != nil
		if isOnePass != test.isOnePass {
			t.Errorf("CompileOnePass(%q) got isOnePass=%v, expected %v", test.re, isOnePass, test.isOnePass)
		}
	}
}

// TODO(cespare): Unify with onePassTests and rationalize one-pass test cases.
203 | var onePassTests1 = []struct { 204 | re string 205 | match string 206 | }{ 207 | {`^a(/b+(#c+)*)*$`, "a/b#c"}, // golang.org/issue/11905 208 | } 209 | 210 | func TestRunOnePass(t *testing.T) { 211 | for _, test := range onePassTests1 { 212 | re, err := Compile(test.re) 213 | if err != nil { 214 | t.Errorf("Compile(%q): got err: %s", test.re, err) 215 | continue 216 | } 217 | if re.onepass == nil { 218 | t.Errorf("Compile(%q): got nil, want one-pass", test.re) 219 | continue 220 | } 221 | if !re.MatchString(test.match) { 222 | t.Errorf("onepass %q did not match %q", test.re, test.match) 223 | } 224 | } 225 | } 226 | -------------------------------------------------------------------------------- /syntax/compile.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package syntax 6 | 7 | import "unicode" 8 | 9 | // A patchList is a list of instruction pointers that need to be filled in (patched). 10 | // Because the pointers haven't been filled in yet, we can reuse their storage 11 | // to hold the list. It's kind of sleazy, but works well in practice. 12 | // See https://swtch.com/~rsc/regexp/regexp1.html for inspiration. 13 | // 14 | // These aren't really pointers: they're integers, so we can reinterpret them 15 | // this way without using package unsafe. A value l denotes 16 | // p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1). 17 | // l == 0 denotes the empty list, okay because we start every program 18 | // with a fail instruction, so we'll never want to point at its output link. 
19 | type patchList uint32 20 | 21 | func (l patchList) next(p *Prog) patchList { 22 | i := &p.Inst[l>>1] 23 | if l&1 == 0 { 24 | return patchList(i.Out) 25 | } 26 | return patchList(i.Arg) 27 | } 28 | 29 | func (l patchList) patch(p *Prog, val uint32) { 30 | for l != 0 { 31 | i := &p.Inst[l>>1] 32 | if l&1 == 0 { 33 | l = patchList(i.Out) 34 | i.Out = val 35 | } else { 36 | l = patchList(i.Arg) 37 | i.Arg = val 38 | } 39 | } 40 | } 41 | 42 | func (l1 patchList) append(p *Prog, l2 patchList) patchList { 43 | if l1 == 0 { 44 | return l2 45 | } 46 | if l2 == 0 { 47 | return l1 48 | } 49 | 50 | last := l1 51 | for { 52 | next := last.next(p) 53 | if next == 0 { 54 | break 55 | } 56 | last = next 57 | } 58 | 59 | i := &p.Inst[last>>1] 60 | if last&1 == 0 { 61 | i.Out = uint32(l2) 62 | } else { 63 | i.Arg = uint32(l2) 64 | } 65 | return l1 66 | } 67 | 68 | // A frag represents a compiled program fragment. 69 | type frag struct { 70 | i uint32 // index of first instruction 71 | out patchList // where to record end instruction 72 | } 73 | 74 | type compiler struct { 75 | p *Prog 76 | } 77 | 78 | // Compile compiles the regexp into a program to be executed. 79 | // The regexp should have been simplified already (returned from re.Simplify). 
80 | func Compile(re *Regexp) (*Prog, error) { 81 | var c compiler 82 | c.init() 83 | f := c.compile(re) 84 | f.out.patch(c.p, c.inst(InstMatch).i) 85 | c.p.Start = int(f.i) 86 | return c.p, nil 87 | } 88 | 89 | func (c *compiler) init() { 90 | c.p = new(Prog) 91 | c.p.NumCap = 2 // implicit ( and ) for whole match $0 92 | c.inst(InstFail) 93 | } 94 | 95 | var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} 96 | var anyRune = []rune{0, unicode.MaxRune} 97 | 98 | func (c *compiler) compile(re *Regexp) frag { 99 | switch re.Op { 100 | case OpNoMatch: 101 | return c.fail() 102 | case OpEmptyMatch: 103 | return c.nop() 104 | case OpLiteral: 105 | if len(re.Rune) == 0 { 106 | return c.nop() 107 | } 108 | var f frag 109 | for j := range re.Rune { 110 | f1 := c.rune(re.Rune[j:j+1], re.Flags) 111 | if j == 0 { 112 | f = f1 113 | } else { 114 | f = c.cat(f, f1) 115 | } 116 | } 117 | return f 118 | case OpCharClass: 119 | return c.rune(re.Rune, re.Flags) 120 | case OpAnyCharNotNL: 121 | return c.rune(anyRuneNotNL, 0) 122 | case OpAnyChar: 123 | return c.rune(anyRune, 0) 124 | case OpBeginLine: 125 | return c.empty(EmptyBeginLine) 126 | case OpEndLine: 127 | return c.empty(EmptyEndLine) 128 | case OpBeginText: 129 | return c.empty(EmptyBeginText) 130 | case OpEndText: 131 | return c.empty(EmptyEndText) 132 | case OpWordBoundary: 133 | return c.empty(EmptyWordBoundary) 134 | case OpNoWordBoundary: 135 | return c.empty(EmptyNoWordBoundary) 136 | case OpCapture: 137 | bra := c.cap(uint32(re.Cap << 1)) 138 | sub := c.compile(re.Sub[0]) 139 | ket := c.cap(uint32(re.Cap<<1 | 1)) 140 | return c.cat(c.cat(bra, sub), ket) 141 | case OpStar: 142 | return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) 143 | case OpPlus: 144 | return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) 145 | case OpQuest: 146 | return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) 147 | case OpConcat: 148 | if len(re.Sub) == 0 { 149 | return c.nop() 150 | } 151 | var f frag 
152 | for i, sub := range re.Sub { 153 | if i == 0 { 154 | f = c.compile(sub) 155 | } else { 156 | f = c.cat(f, c.compile(sub)) 157 | } 158 | } 159 | return f 160 | case OpAlternate: 161 | var f frag 162 | for _, sub := range re.Sub { 163 | f = c.alt(f, c.compile(sub)) 164 | } 165 | return f 166 | } 167 | panic("regexp: unhandled case in compile") 168 | } 169 | 170 | func (c *compiler) inst(op InstOp) frag { 171 | // TODO: impose length limit 172 | f := frag{i: uint32(len(c.p.Inst))} 173 | c.p.Inst = append(c.p.Inst, Inst{Op: op}) 174 | return f 175 | } 176 | 177 | func (c *compiler) nop() frag { 178 | f := c.inst(InstNop) 179 | f.out = patchList(f.i << 1) 180 | return f 181 | } 182 | 183 | func (c *compiler) fail() frag { 184 | return frag{} 185 | } 186 | 187 | func (c *compiler) cap(arg uint32) frag { 188 | f := c.inst(InstCapture) 189 | f.out = patchList(f.i << 1) 190 | c.p.Inst[f.i].Arg = arg 191 | 192 | if c.p.NumCap < int(arg)+1 { 193 | c.p.NumCap = int(arg) + 1 194 | } 195 | return f 196 | } 197 | 198 | func (c *compiler) cat(f1, f2 frag) frag { 199 | // concat of failure is failure 200 | if f1.i == 0 || f2.i == 0 { 201 | return frag{} 202 | } 203 | 204 | // TODO: elide nop 205 | 206 | f1.out.patch(c.p, f2.i) 207 | return frag{f1.i, f2.out} 208 | } 209 | 210 | func (c *compiler) alt(f1, f2 frag) frag { 211 | // alt of failure is other 212 | if f1.i == 0 { 213 | return f2 214 | } 215 | if f2.i == 0 { 216 | return f1 217 | } 218 | 219 | f := c.inst(InstAlt) 220 | i := &c.p.Inst[f.i] 221 | i.Out = f1.i 222 | i.Arg = f2.i 223 | f.out = f1.out.append(c.p, f2.out) 224 | return f 225 | } 226 | 227 | func (c *compiler) quest(f1 frag, nongreedy bool) frag { 228 | f := c.inst(InstAlt) 229 | i := &c.p.Inst[f.i] 230 | if nongreedy { 231 | i.Arg = f1.i 232 | f.out = patchList(f.i << 1) 233 | } else { 234 | i.Out = f1.i 235 | f.out = patchList(f.i<<1 | 1) 236 | } 237 | f.out = f.out.append(c.p, f1.out) 238 | return f 239 | } 240 | 241 | func (c *compiler) star(f1 frag, 
nongreedy bool) frag { 242 | f := c.inst(InstAlt) 243 | i := &c.p.Inst[f.i] 244 | if nongreedy { 245 | i.Arg = f1.i 246 | f.out = patchList(f.i << 1) 247 | } else { 248 | i.Out = f1.i 249 | f.out = patchList(f.i<<1 | 1) 250 | } 251 | f1.out.patch(c.p, f.i) 252 | return f 253 | } 254 | 255 | func (c *compiler) plus(f1 frag, nongreedy bool) frag { 256 | return frag{f1.i, c.star(f1, nongreedy).out} 257 | } 258 | 259 | func (c *compiler) empty(op EmptyOp) frag { 260 | f := c.inst(InstEmptyWidth) 261 | c.p.Inst[f.i].Arg = uint32(op) 262 | f.out = patchList(f.i << 1) 263 | return f 264 | } 265 | 266 | func (c *compiler) rune(r []rune, flags Flags) frag { 267 | f := c.inst(InstRune) 268 | i := &c.p.Inst[f.i] 269 | i.Rune = r 270 | flags &= FoldCase // only relevant flag is FoldCase 271 | if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] { 272 | // and sometimes not even that 273 | flags &^= FoldCase 274 | } 275 | i.Arg = uint32(flags) 276 | f.out = patchList(f.i << 1) 277 | 278 | // Special cases for exec machine. 279 | switch { 280 | case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]): 281 | i.Op = InstRune1 282 | case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune: 283 | i.Op = InstRuneAny 284 | case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune: 285 | i.Op = InstRuneAnyNotNL 286 | } 287 | 288 | return f 289 | } 290 | -------------------------------------------------------------------------------- /syntax/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2012 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution. 6 | 7 | /* 8 | Package syntax parses regular expressions into parse trees and compiles 9 | parse trees into programs. 
Most clients of regular expressions will use the 10 | facilities of package regexp (such as Compile and Match) instead of this package. 11 | 12 | Syntax 13 | 14 | The regular expression syntax understood by this package when parsing with the Perl flag is as follows. 15 | Parts of the syntax can be disabled by passing alternate flags to Parse. 16 | 17 | 18 | Single characters: 19 | . any character, possibly including newline (flag s=true) 20 | [xyz] character class 21 | [^xyz] negated character class 22 | \d Perl character class 23 | \D negated Perl character class 24 | [[:alpha:]] ASCII character class 25 | [[:^alpha:]] negated ASCII character class 26 | \pN Unicode character class (one-letter name) 27 | \p{Greek} Unicode character class 28 | \PN negated Unicode character class (one-letter name) 29 | \P{Greek} negated Unicode character class 30 | 31 | Composites: 32 | xy x followed by y 33 | x|y x or y (prefer x) 34 | 35 | Repetitions: 36 | x* zero or more x, prefer more 37 | x+ one or more x, prefer more 38 | x? zero or one x, prefer one 39 | x{n,m} n or n+1 or ... or m x, prefer more 40 | x{n,} n or more x, prefer more 41 | x{n} exactly n x 42 | x*? zero or more x, prefer fewer 43 | x+? one or more x, prefer fewer 44 | x?? zero or one x, prefer zero 45 | x{n,m}? n or n+1 or ... or m x, prefer fewer 46 | x{n,}? n or more x, prefer fewer 47 | x{n}? exactly n x 48 | 49 | Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n} 50 | reject forms that create a minimum or maximum repetition count above 1000. 51 | Unlimited repetitions are not subject to this restriction. 52 | 53 | Grouping: 54 | (re) numbered capturing group (submatch) 55 | (?P<name>re) named & numbered capturing group (submatch) 56 | (?:re) non-capturing group 57 | (?flags) set flags within current group; non-capturing 58 | (?flags:re) set flags during re; non-capturing 59 | 60 | Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z).
The flags are: 61 | 62 | i case-insensitive (default false) 63 | m multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false) 64 | s let . match \n (default false) 65 | U ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false) 66 | 67 | Empty strings: 68 | ^ at beginning of text or line (flag m=true) 69 | $ at end of text (like \z not Perl's \Z) or line (flag m=true) 70 | \A at beginning of text 71 | \b at ASCII word boundary (\w on one side and \W, \A, or \z on the other) 72 | \B not at ASCII word boundary 73 | \z at end of text 74 | 75 | Escape sequences: 76 | \a bell (== \007) 77 | \f form feed (== \014) 78 | \t horizontal tab (== \011) 79 | \n newline (== \012) 80 | \r carriage return (== \015) 81 | \v vertical tab character (== \013) 82 | \* literal *, for any punctuation character * 83 | \123 octal character code (up to three digits) 84 | \x7F hex character code (exactly two digits) 85 | \x{10FFFF} hex character code 86 | \Q...\E literal text ... even if ... 
has punctuation 87 | 88 | Character class elements: 89 | x single character 90 | A-Z character range (inclusive) 91 | \d Perl character class 92 | [:foo:] ASCII character class foo 93 | \p{Foo} Unicode character class Foo 94 | \pF Unicode character class F (one-letter name) 95 | 96 | Named character classes as character class elements: 97 | [\d] digits (== \d) 98 | [^\d] not digits (== \D) 99 | [\D] not digits (== \D) 100 | [^\D] not not digits (== \d) 101 | [[:name:]] named ASCII class inside character class (== [:name:]) 102 | [^[:name:]] named ASCII class inside negated character class (== [:^name:]) 103 | [\p{Name}] named Unicode property inside character class (== \p{Name}) 104 | [^\p{Name}] named Unicode property inside negated character class (== \P{Name}) 105 | 106 | Perl character classes (all ASCII-only): 107 | \d digits (== [0-9]) 108 | \D not digits (== [^0-9]) 109 | \s whitespace (== [\t\n\f\r ]) 110 | \S not whitespace (== [^\t\n\f\r ]) 111 | \w word characters (== [0-9A-Za-z_]) 112 | \W not word characters (== [^0-9A-Za-z_]) 113 | 114 | ASCII character classes: 115 | [[:alnum:]] alphanumeric (== [0-9A-Za-z]) 116 | [[:alpha:]] alphabetic (== [A-Za-z]) 117 | [[:ascii:]] ASCII (== [\x00-\x7F]) 118 | [[:blank:]] blank (== [\t ]) 119 | [[:cntrl:]] control (== [\x00-\x1F\x7F]) 120 | [[:digit:]] digits (== [0-9]) 121 | [[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]) 122 | [[:lower:]] lower case (== [a-z]) 123 | [[:print:]] printable (== [ -~] == [ [:graph:]]) 124 | [[:punct:]] punctuation (== [!-/:-@[-`{-~]) 125 | [[:space:]] whitespace (== [\t\n\v\f\r ]) 126 | [[:upper:]] upper case (== [A-Z]) 127 | [[:word:]] word characters (== [0-9A-Za-z_]) 128 | [[:xdigit:]] hex digit (== [0-9A-Fa-f]) 129 | 130 | */ 131 | package syntax 132 | -------------------------------------------------------------------------------- /syntax/op_string.go: -------------------------------------------------------------------------------- 1 | // Code 
generated by "stringer -type Op -trimprefix Op"; DO NOT EDIT. 2 | 3 | package syntax 4 | 5 | import "strconv" 6 | 7 | const ( 8 | _Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate" 9 | _Op_name_1 = "opPseudo" 10 | ) 11 | 12 | var ( 13 | _Op_index_0 = [...]uint8{0, 7, 17, 24, 33, 45, 52, 61, 68, 77, 84, 96, 110, 117, 121, 125, 130, 136, 142, 151} 14 | ) 15 | 16 | func (i Op) String() string { 17 | switch { 18 | case 1 <= i && i <= 19: 19 | i -= 1 20 | return _Op_name_0[_Op_index_0[i]:_Op_index_0[i+1]] 21 | case i == 128: 22 | return _Op_name_1 23 | default: 24 | return "Op(" + strconv.FormatInt(int64(i), 10) + ")" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /syntax/parse_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package syntax 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "testing" 11 | "unicode" 12 | ) 13 | 14 | type parseTest struct { 15 | Regexp string 16 | Dump string 17 | } 18 | 19 | var parseTests = []parseTest{ 20 | // Base cases 21 | {`a`, `lit{a}`}, 22 | {`a.`, `cat{lit{a}dot{}}`}, 23 | {`a.b`, `cat{lit{a}dot{}lit{b}}`}, 24 | {`ab`, `str{ab}`}, 25 | {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`}, 26 | {`abc`, `str{abc}`}, 27 | {`a|^`, `alt{lit{a}bol{}}`}, 28 | {`a|b`, `cc{0x61-0x62}`}, 29 | {`(a)`, `cap{lit{a}}`}, 30 | {`(a)|b`, `alt{cap{lit{a}}lit{b}}`}, 31 | {`a*`, `star{lit{a}}`}, 32 | {`a+`, `plus{lit{a}}`}, 33 | {`a?`, `que{lit{a}}`}, 34 | {`a{2}`, `rep{2,2 lit{a}}`}, 35 | {`a{2,3}`, `rep{2,3 lit{a}}`}, 36 | {`a{2,}`, `rep{2,-1 lit{a}}`}, 37 | {`a*?`, `nstar{lit{a}}`}, 38 | {`a+?`, `nplus{lit{a}}`}, 39 | {`a??`, `nque{lit{a}}`}, 40 | {`a{2}?`, `nrep{2,2 lit{a}}`}, 41 | {`a{2,3}?`, `nrep{2,3 lit{a}}`}, 42 | {`a{2,}?`, `nrep{2,-1 lit{a}}`}, 43 | // Malformed { } are treated as literals. 
44 | {`x{1001`, `str{x{1001}`}, 45 | {`x{9876543210`, `str{x{9876543210}`}, 46 | {`x{9876543210,`, `str{x{9876543210,}`}, 47 | {`x{2,1`, `str{x{2,1}`}, 48 | {`x{1,9876543210`, `str{x{1,9876543210}`}, 49 | {``, `emp{}`}, 50 | {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored 51 | {`|x|`, `alt{emp{}lit{x}emp{}}`}, 52 | {`.`, `dot{}`}, 53 | {`^`, `bol{}`}, 54 | {`$`, `eol{}`}, 55 | {`\|`, `lit{|}`}, 56 | {`\(`, `lit{(}`}, 57 | {`\)`, `lit{)}`}, 58 | {`\*`, `lit{*}`}, 59 | {`\+`, `lit{+}`}, 60 | {`\?`, `lit{?}`}, 61 | {`{`, `lit{{}`}, 62 | {`}`, `lit{}}`}, 63 | {`\.`, `lit{.}`}, 64 | {`\^`, `lit{^}`}, 65 | {`\$`, `lit{$}`}, 66 | {`\\`, `lit{\}`}, 67 | {`[ace]`, `cc{0x61 0x63 0x65}`}, 68 | {`[abc]`, `cc{0x61-0x63}`}, 69 | {`[a-z]`, `cc{0x61-0x7a}`}, 70 | {`[a]`, `lit{a}`}, 71 | {`\-`, `lit{-}`}, 72 | {`-`, `lit{-}`}, 73 | {`\_`, `lit{_}`}, 74 | {`abc`, `str{abc}`}, 75 | {`abc|def`, `alt{str{abc}str{def}}`}, 76 | {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`}, 77 | 78 | // Posix and Perl extensions 79 | {`[[:lower:]]`, `cc{0x61-0x7a}`}, 80 | {`[a-z]`, `cc{0x61-0x7a}`}, 81 | {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, 82 | {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, 83 | {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 84 | {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 85 | {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 86 | {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 87 | {`\d`, `cc{0x30-0x39}`}, 88 | {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`}, 89 | {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`}, 90 | {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`}, 91 | {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`}, 92 | {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`}, 93 | {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`}, 94 | {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, 95 | {`[^\\]`, `cc{0x0-0x5b 
0x5d-0x10ffff}`}, 96 | {`\C`, `dot{}`}, 97 | 98 | // Unicode, negatives, and a double negative. 99 | {`\p{Braille}`, `cc{0x2800-0x28ff}`}, 100 | {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 101 | {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 102 | {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, 103 | {`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, 104 | {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, 105 | {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 106 | {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, 107 | {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, 108 | {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, 109 | {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, 110 | {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, 111 | {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, 112 | {`\p{Any}`, `dot{}`}, 113 | {`\p{^Any}`, `cc{}`}, 114 | 115 | // Hex, octal. 116 | {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`}, 117 | {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`}, 118 | 119 | // More interesting regular expressions. 120 | {`a{,2}`, `str{a{,2}}`}, 121 | {`\.\^\$\\`, `str{.^$\}`}, 122 | {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`}, 123 | {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, 124 | {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8 125 | {`a*{`, `cat{star{lit{a}}lit{{}}`}, 126 | 127 | // Test precedences 128 | {`(?:ab)*`, `star{str{ab}}`}, 129 | {`(ab)*`, `star{cap{str{ab}}}`}, 130 | {`ab|cd`, `alt{str{ab}str{cd}}`}, 131 | {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`}, 132 | 133 | // Test flattening. 
134 | {`(?:a)`, `lit{a}`}, 135 | {`(?:ab)(?:cd)`, `str{abcd}`}, 136 | {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, 137 | {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, 138 | {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`}, 139 | {`a|.`, `dot{}`}, 140 | {`.|a`, `dot{}`}, 141 | {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`}, 142 | {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`}, 143 | 144 | // Test Perl quoted literals 145 | {`\Q+|*?{[\E`, `str{+|*?{[}`}, 146 | {`\Q+\E+`, `plus{lit{+}}`}, 147 | {`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`}, 148 | {`\Q\\E`, `lit{\}`}, 149 | {`\Q\\\E`, `str{\\}`}, 150 | 151 | // Test Perl \A and \z 152 | {`(?m)^`, `bol{}`}, 153 | {`(?m)$`, `eol{}`}, 154 | {`(?-m)^`, `bot{}`}, 155 | {`(?-m)$`, `eot{}`}, 156 | {`(?m)\A`, `bot{}`}, 157 | {`(?m)\z`, `eot{\z}`}, 158 | {`(?-m)\A`, `bot{}`}, 159 | {`(?-m)\z`, `eot{\z}`}, 160 | 161 | // Test named captures 162 | {`(?Pa)`, `cap{name:lit{a}}`}, 163 | 164 | // Case-folded literals 165 | {`[Aa]`, `litfold{A}`}, 166 | {`[\x{100}\x{101}]`, `litfold{Ā}`}, 167 | {`[Δδ]`, `litfold{Δ}`}, 168 | 169 | // Strings 170 | {`abcde`, `str{abcde}`}, 171 | {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, 172 | 173 | // Factoring. 174 | {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, 175 | {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`}, 176 | 177 | // Bug fixes. 
178 | {`(?:.)`, `dot{}`}, 179 | {`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`}, 180 | {`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`}, 181 | {`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`}, 182 | {`(?:A|a)`, `litfold{A}`}, 183 | {`A|(?:A|a)`, `litfold{A}`}, 184 | {`(?s).`, `dot{}`}, 185 | {`(?-s).`, `dnl{}`}, 186 | {`(?:(?:^).)`, `cat{bol{}dot{}}`}, 187 | {`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`}, 188 | 189 | // RE2 prefix_tests 190 | {`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`}, 191 | {`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`}, 192 | {`abc|abd|aef|bcx|bcy`, 193 | `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` + 194 | `cat{str{bc}cc{0x78-0x79}}}`}, 195 | {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`}, 196 | {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`}, 197 | {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`}, 198 | {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`}, 199 | {`x{2}|x{2}[0-9]`, 200 | `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`}, 201 | {`x{2}y|x{2}[0-9]y`, 202 | `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`}, 203 | {`a.*?c|a.*?b`, 204 | `cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`}, 205 | 206 | // Valid repetitions. 207 | {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``}, 208 | {`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``}, 209 | } 210 | 211 | const testFlags = MatchNL | PerlX | UnicodeGroups 212 | 213 | func TestParseSimple(t *testing.T) { 214 | testParseDump(t, parseTests, testFlags) 215 | } 216 | 217 | var foldcaseTests = []parseTest{ 218 | {`AbCdE`, `strfold{ABCDE}`}, 219 | {`[Aa]`, `litfold{A}`}, 220 | {`a`, `litfold{A}`}, 221 | 222 | // 0x17F is an old English long s (looks like an f) and folds to s. 223 | // 0x212A is the Kelvin symbol and folds to k. 224 | {`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...] 
225 | {`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 226 | {`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, 227 | } 228 | 229 | func TestParseFoldCase(t *testing.T) { 230 | testParseDump(t, foldcaseTests, FoldCase) 231 | } 232 | 233 | var literalTests = []parseTest{ 234 | {"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"}, 235 | } 236 | 237 | func TestParseLiteral(t *testing.T) { 238 | testParseDump(t, literalTests, Literal) 239 | } 240 | 241 | var matchnlTests = []parseTest{ 242 | {`.`, `dot{}`}, 243 | {"\n", "lit{\n}"}, 244 | {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, 245 | {`[a\n]`, `cc{0xa 0x61}`}, 246 | } 247 | 248 | func TestParseMatchNL(t *testing.T) { 249 | testParseDump(t, matchnlTests, MatchNL) 250 | } 251 | 252 | var nomatchnlTests = []parseTest{ 253 | {`.`, `dnl{}`}, 254 | {"\n", "lit{\n}"}, 255 | {`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`}, 256 | {`[a\n]`, `cc{0xa 0x61}`}, 257 | } 258 | 259 | func TestParseNoMatchNL(t *testing.T) { 260 | testParseDump(t, nomatchnlTests, 0) 261 | } 262 | 263 | // Test Parse -> Dump. 264 | func testParseDump(t *testing.T, tests []parseTest, flags Flags) { 265 | for _, tt := range tests { 266 | re, err := Parse(tt.Regexp, flags) 267 | if err != nil { 268 | t.Errorf("Parse(%#q): %v", tt.Regexp, err) 269 | continue 270 | } 271 | if tt.Dump == "" { 272 | // It parsed. That's all we care about. 273 | continue 274 | } 275 | d := dump(re) 276 | if d != tt.Dump { 277 | t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) 278 | } 279 | } 280 | } 281 | 282 | // dump prints a string representation of the regexp showing 283 | // the structure explicitly. 
284 | func dump(re *Regexp) string { 285 | var b bytes.Buffer 286 | dumpRegexp(&b, re) 287 | return b.String() 288 | } 289 | 290 | var opNames = []string{ 291 | OpNoMatch: "no", 292 | OpEmptyMatch: "emp", 293 | OpLiteral: "lit", 294 | OpCharClass: "cc", 295 | OpAnyCharNotNL: "dnl", 296 | OpAnyChar: "dot", 297 | OpBeginLine: "bol", 298 | OpEndLine: "eol", 299 | OpBeginText: "bot", 300 | OpEndText: "eot", 301 | OpWordBoundary: "wb", 302 | OpNoWordBoundary: "nwb", 303 | OpCapture: "cap", 304 | OpStar: "star", 305 | OpPlus: "plus", 306 | OpQuest: "que", 307 | OpRepeat: "rep", 308 | OpConcat: "cat", 309 | OpAlternate: "alt", 310 | } 311 | 312 | // dumpRegexp writes an encoding of the syntax tree for the regexp re to b. 313 | // It is used during testing to distinguish between parses that might print 314 | // the same using re's String method. 315 | func dumpRegexp(b *bytes.Buffer, re *Regexp) { 316 | if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { 317 | fmt.Fprintf(b, "op%d", re.Op) 318 | } else { 319 | switch re.Op { 320 | default: 321 | b.WriteString(opNames[re.Op]) 322 | case OpStar, OpPlus, OpQuest, OpRepeat: 323 | if re.Flags&NonGreedy != 0 { 324 | b.WriteByte('n') 325 | } 326 | b.WriteString(opNames[re.Op]) 327 | case OpLiteral: 328 | if len(re.Rune) > 1 { 329 | b.WriteString("str") 330 | } else { 331 | b.WriteString("lit") 332 | } 333 | if re.Flags&FoldCase != 0 { 334 | for _, r := range re.Rune { 335 | if unicode.SimpleFold(r) != r { 336 | b.WriteString("fold") 337 | break 338 | } 339 | } 340 | } 341 | } 342 | } 343 | b.WriteByte('{') 344 | switch re.Op { 345 | case OpEndText: 346 | if re.Flags&WasDollar == 0 { 347 | b.WriteString(`\z`) 348 | } 349 | case OpLiteral: 350 | for _, r := range re.Rune { 351 | b.WriteRune(r) 352 | } 353 | case OpConcat, OpAlternate: 354 | for _, sub := range re.Sub { 355 | dumpRegexp(b, sub) 356 | } 357 | case OpStar, OpPlus, OpQuest: 358 | dumpRegexp(b, re.Sub[0]) 359 | case OpRepeat: 360 | fmt.Fprintf(b, "%d,%d ", re.Min, 
re.Max) 361 | dumpRegexp(b, re.Sub[0]) 362 | case OpCapture: 363 | if re.Name != "" { 364 | b.WriteString(re.Name) 365 | b.WriteByte(':') 366 | } 367 | dumpRegexp(b, re.Sub[0]) 368 | case OpCharClass: 369 | sep := "" 370 | for i := 0; i < len(re.Rune); i += 2 { 371 | b.WriteString(sep) 372 | sep = " " 373 | lo, hi := re.Rune[i], re.Rune[i+1] 374 | if lo == hi { 375 | fmt.Fprintf(b, "%#x", lo) 376 | } else { 377 | fmt.Fprintf(b, "%#x-%#x", lo, hi) 378 | } 379 | } 380 | } 381 | b.WriteByte('}') 382 | } 383 | 384 | func mkCharClass(f func(rune) bool) string { 385 | re := &Regexp{Op: OpCharClass} 386 | lo := rune(-1) 387 | for i := rune(0); i <= unicode.MaxRune; i++ { 388 | if f(i) { 389 | if lo < 0 { 390 | lo = i 391 | } 392 | } else { 393 | if lo >= 0 { 394 | re.Rune = append(re.Rune, lo, i-1) 395 | lo = -1 396 | } 397 | } 398 | } 399 | if lo >= 0 { 400 | re.Rune = append(re.Rune, lo, unicode.MaxRune) 401 | } 402 | return dump(re) 403 | } 404 | 405 | func isUpperFold(r rune) bool { 406 | if unicode.IsUpper(r) { 407 | return true 408 | } 409 | c := unicode.SimpleFold(r) 410 | for c != r { 411 | if unicode.IsUpper(c) { 412 | return true 413 | } 414 | c = unicode.SimpleFold(c) 415 | } 416 | return false 417 | } 418 | 419 | func TestFoldConstants(t *testing.T) { 420 | last := rune(-1) 421 | for i := rune(0); i <= unicode.MaxRune; i++ { 422 | if unicode.SimpleFold(i) == i { 423 | continue 424 | } 425 | if last == -1 && minFold != i { 426 | t.Errorf("minFold=%#U should be %#U", minFold, i) 427 | } 428 | last = i 429 | } 430 | if maxFold != last { 431 | t.Errorf("maxFold=%#U should be %#U", maxFold, last) 432 | } 433 | } 434 | 435 | func TestAppendRangeCollapse(t *testing.T) { 436 | // AppendRange should collapse each of the new ranges 437 | // into the earlier ones (it looks back two ranges), so that 438 | // the slice never grows very large. 439 | // Note that we are not calling cleanClass. 
440 | var r []rune 441 | for i := rune('A'); i <= 'Z'; i++ { 442 | r = appendRange(r, i, i) 443 | r = appendRange(r, i+'a'-'A', i+'a'-'A') 444 | } 445 | if string(r) != "AZaz" { 446 | t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r)) 447 | } 448 | } 449 | 450 | var invalidRegexps = []string{ 451 | `(`, 452 | `)`, 453 | `(a`, 454 | `a)`, 455 | `(a))`, 456 | `(a|b|`, 457 | `a|b|)`, 458 | `(a|b|))`, 459 | `(a|b`, 460 | `a|b)`, 461 | `(a|b))`, 462 | `[a-z`, 463 | `([a-z)`, 464 | `[a-z)`, 465 | `([a-z]))`, 466 | `x{1001}`, 467 | `x{9876543210}`, 468 | `x{2,1}`, 469 | `x{1,9876543210}`, 470 | "\xff", // Invalid UTF-8 471 | "[\xff]", 472 | "[\\\xff]", 473 | "\\\xff", 474 | `(?Pa`, 475 | `(?P`, 476 | `(?Pa)`, 478 | `(?P<>a)`, 479 | `[a-Z]`, 480 | `(?i)[a-Z]`, 481 | `a{100000}`, 482 | `a{100000,}`, 483 | "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", 484 | `\Q\E*`, 485 | } 486 | 487 | var onlyPerl = []string{ 488 | `[a-b-c]`, 489 | `\Qabc\E`, 490 | `\Q*+?{[\E`, 491 | `\Q\\E`, 492 | `\Q\\\E`, 493 | `\Q\\\\E`, 494 | `\Q\\\\\E`, 495 | `(?:a)`, 496 | `(?Pa)`, 497 | } 498 | 499 | var onlyPOSIX = []string{ 500 | "a++", 501 | "a**", 502 | "a?*", 503 | "a+*", 504 | "a{1}*", 505 | ".{1}{2}.{3}", 506 | } 507 | 508 | func TestParseInvalidRegexps(t *testing.T) { 509 | for _, regexp := range invalidRegexps { 510 | if re, err := Parse(regexp, Perl); err == nil { 511 | t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re)) 512 | } 513 | if re, err := Parse(regexp, POSIX); err == nil { 514 | t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re)) 515 | } 516 | } 517 | for _, regexp := range onlyPerl { 518 | if _, err := Parse(regexp, Perl); err != nil { 519 | t.Errorf("Parse(%#q, Perl): %v", regexp, err) 520 | } 521 | if re, err := Parse(regexp, POSIX); err == nil { 522 | t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re)) 523 | } 524 | } 525 | for _, regexp := range onlyPOSIX { 526 | if re, err := 
Parse(regexp, Perl); err == nil { 527 | t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re)) 528 | } 529 | if _, err := Parse(regexp, POSIX); err != nil { 530 | t.Errorf("Parse(%#q, POSIX): %v", regexp, err) 531 | } 532 | } 533 | } 534 | 535 | func TestToStringEquivalentParse(t *testing.T) { 536 | for _, tt := range parseTests { 537 | re, err := Parse(tt.Regexp, testFlags) 538 | if err != nil { 539 | t.Errorf("Parse(%#q): %v", tt.Regexp, err) 540 | continue 541 | } 542 | if tt.Dump == "" { 543 | // It parsed. That's all we care about. 544 | continue 545 | } 546 | d := dump(re) 547 | if d != tt.Dump { 548 | t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) 549 | continue 550 | } 551 | 552 | s := re.String() 553 | if s != tt.Regexp { 554 | // If ToString didn't return the original regexp, 555 | // it must have found one with fewer parens. 556 | // Unfortunately we can't check the length here, because 557 | // ToString produces "\\{" for a literal brace, 558 | // but "{" is a shorter equivalent in some contexts. 559 | nre, err := Parse(s, testFlags) 560 | if err != nil { 561 | t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err) 562 | continue 563 | } 564 | nd := dump(nre) 565 | if d != nd { 566 | t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd) 567 | } 568 | 569 | ns := nre.String() 570 | if s != ns { 571 | t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns) 572 | } 573 | } 574 | } 575 | } 576 | -------------------------------------------------------------------------------- /syntax/perl_groups.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // GENERATED BY make_perl_groups.pl; DO NOT EDIT. 
6 | // make_perl_groups.pl >perl_groups.go 7 | 8 | package syntax 9 | 10 | var code1 = []rune{ /* \d */ 11 | 0x30, 0x39, 12 | } 13 | 14 | var code2 = []rune{ /* \s */ 15 | 0x9, 0xa, 16 | 0xc, 0xd, 17 | 0x20, 0x20, 18 | } 19 | 20 | var code3 = []rune{ /* \w */ 21 | 0x30, 0x39, 22 | 0x41, 0x5a, 23 | 0x5f, 0x5f, 24 | 0x61, 0x7a, 25 | } 26 | 27 | var perlGroup = map[string]charGroup{ 28 | `\d`: {+1, code1}, 29 | `\D`: {-1, code1}, 30 | `\s`: {+1, code2}, 31 | `\S`: {-1, code2}, 32 | `\w`: {+1, code3}, 33 | `\W`: {-1, code3}, 34 | } 35 | var code4 = []rune{ /* [:alnum:] */ 36 | 0x30, 0x39, 37 | 0x41, 0x5a, 38 | 0x61, 0x7a, 39 | } 40 | 41 | var code5 = []rune{ /* [:alpha:] */ 42 | 0x41, 0x5a, 43 | 0x61, 0x7a, 44 | } 45 | 46 | var code6 = []rune{ /* [:ascii:] */ 47 | 0x0, 0x7f, 48 | } 49 | 50 | var code7 = []rune{ /* [:blank:] */ 51 | 0x9, 0x9, 52 | 0x20, 0x20, 53 | } 54 | 55 | var code8 = []rune{ /* [:cntrl:] */ 56 | 0x0, 0x1f, 57 | 0x7f, 0x7f, 58 | } 59 | 60 | var code9 = []rune{ /* [:digit:] */ 61 | 0x30, 0x39, 62 | } 63 | 64 | var code10 = []rune{ /* [:graph:] */ 65 | 0x21, 0x7e, 66 | } 67 | 68 | var code11 = []rune{ /* [:lower:] */ 69 | 0x61, 0x7a, 70 | } 71 | 72 | var code12 = []rune{ /* [:print:] */ 73 | 0x20, 0x7e, 74 | } 75 | 76 | var code13 = []rune{ /* [:punct:] */ 77 | 0x21, 0x2f, 78 | 0x3a, 0x40, 79 | 0x5b, 0x60, 80 | 0x7b, 0x7e, 81 | } 82 | 83 | var code14 = []rune{ /* [:space:] */ 84 | 0x9, 0xd, 85 | 0x20, 0x20, 86 | } 87 | 88 | var code15 = []rune{ /* [:upper:] */ 89 | 0x41, 0x5a, 90 | } 91 | 92 | var code16 = []rune{ /* [:word:] */ 93 | 0x30, 0x39, 94 | 0x41, 0x5a, 95 | 0x5f, 0x5f, 96 | 0x61, 0x7a, 97 | } 98 | 99 | var code17 = []rune{ /* [:xdigit:] */ 100 | 0x30, 0x39, 101 | 0x41, 0x46, 102 | 0x61, 0x66, 103 | } 104 | 105 | var posixGroup = map[string]charGroup{ 106 | `[:alnum:]`: {+1, code4}, 107 | `[:^alnum:]`: {-1, code4}, 108 | `[:alpha:]`: {+1, code5}, 109 | `[:^alpha:]`: {-1, code5}, 110 | `[:ascii:]`: {+1, code6}, 111 | `[:^ascii:]`: {-1, code6}, 
112 | `[:blank:]`: {+1, code7}, 113 | `[:^blank:]`: {-1, code7}, 114 | `[:cntrl:]`: {+1, code8}, 115 | `[:^cntrl:]`: {-1, code8}, 116 | `[:digit:]`: {+1, code9}, 117 | `[:^digit:]`: {-1, code9}, 118 | `[:graph:]`: {+1, code10}, 119 | `[:^graph:]`: {-1, code10}, 120 | `[:lower:]`: {+1, code11}, 121 | `[:^lower:]`: {-1, code11}, 122 | `[:print:]`: {+1, code12}, 123 | `[:^print:]`: {-1, code12}, 124 | `[:punct:]`: {+1, code13}, 125 | `[:^punct:]`: {-1, code13}, 126 | `[:space:]`: {+1, code14}, 127 | `[:^space:]`: {-1, code14}, 128 | `[:upper:]`: {+1, code15}, 129 | `[:^upper:]`: {-1, code15}, 130 | `[:word:]`: {+1, code16}, 131 | `[:^word:]`: {-1, code16}, 132 | `[:xdigit:]`: {+1, code17}, 133 | `[:^xdigit:]`: {-1, code17}, 134 | } 135 | -------------------------------------------------------------------------------- /syntax/prog.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package syntax 6 | 7 | import ( 8 | "bytes" 9 | "strconv" 10 | "unicode" 11 | ) 12 | 13 | // Compiled program. 14 | // May not belong in this package, but convenient for now. 15 | 16 | // A Prog is a compiled regular expression program. 17 | type Prog struct { 18 | Inst []Inst 19 | Start int // index of start instruction 20 | NumCap int // number of InstCapture insts in re 21 | } 22 | 23 | // An InstOp is an instruction opcode. 
24 | type InstOp uint8 25 | 26 | const ( 27 | InstAlt InstOp = iota 28 | InstAltMatch 29 | InstCapture 30 | InstEmptyWidth 31 | InstMatch 32 | InstFail 33 | InstNop 34 | InstRune 35 | InstRune1 36 | InstRuneAny 37 | InstRuneAnyNotNL 38 | ) 39 | 40 | var instOpNames = []string{ 41 | "InstAlt", 42 | "InstAltMatch", 43 | "InstCapture", 44 | "InstEmptyWidth", 45 | "InstMatch", 46 | "InstFail", 47 | "InstNop", 48 | "InstRune", 49 | "InstRune1", 50 | "InstRuneAny", 51 | "InstRuneAnyNotNL", 52 | } 53 | 54 | func (i InstOp) String() string { 55 | if uint(i) >= uint(len(instOpNames)) { 56 | return "" 57 | } 58 | return instOpNames[i] 59 | } 60 | 61 | // An EmptyOp specifies a kind or mixture of zero-width assertions. 62 | type EmptyOp uint8 63 | 64 | const ( 65 | EmptyBeginLine EmptyOp = 1 << iota 66 | EmptyEndLine 67 | EmptyBeginText 68 | EmptyEndText 69 | EmptyWordBoundary 70 | EmptyNoWordBoundary 71 | ) 72 | 73 | // EmptyOpContext returns the zero-width assertions 74 | // satisfied at the position between the runes r1 and r2. 75 | // Passing r1 == -1 indicates that the position is 76 | // at the beginning of the text. 77 | // Passing r2 == -1 indicates that the position is 78 | // at the end of the text. 79 | func EmptyOpContext(r1, r2 rune) EmptyOp { 80 | var op EmptyOp = EmptyNoWordBoundary 81 | var boundary byte 82 | switch { 83 | case IsWordChar(r1): 84 | boundary = 1 85 | case r1 == '\n': 86 | op |= EmptyBeginLine 87 | case r1 < 0: 88 | op |= EmptyBeginText | EmptyBeginLine 89 | } 90 | switch { 91 | case IsWordChar(r2): 92 | boundary ^= 1 93 | case r2 == '\n': 94 | op |= EmptyEndLine 95 | case r2 < 0: 96 | op |= EmptyEndText | EmptyEndLine 97 | } 98 | if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2) 99 | op ^= (EmptyWordBoundary | EmptyNoWordBoundary) 100 | } 101 | return op 102 | } 103 | 104 | // IsWordChar reports whether r is consider a ``word character'' 105 | // during the evaluation of the \b and \B zero-width assertions. 
// IsWordChar reports whether r is considered a ``word character''
// during the evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z',
		'A' <= r && r <= 'Z',
		'0' <= r && r <= '9',
		r == '_':
		return true
	}
	return false
}
166 | func (p *Prog) StartCond() EmptyOp { 167 | var flag EmptyOp 168 | pc := uint32(p.Start) 169 | i := &p.Inst[pc] 170 | Loop: 171 | for { 172 | switch i.Op { 173 | case InstEmptyWidth: 174 | flag |= EmptyOp(i.Arg) 175 | case InstFail: 176 | return ^EmptyOp(0) 177 | case InstCapture, InstNop: 178 | // skip 179 | default: 180 | break Loop 181 | } 182 | pc = i.Out 183 | i = &p.Inst[pc] 184 | } 185 | return flag 186 | } 187 | 188 | const noMatch = -1 189 | 190 | // MatchRune reports whether the instruction matches (and consumes) r. 191 | // It should only be called when i.Op == InstRune. 192 | func (i *Inst) MatchRune(r rune) bool { 193 | return i.MatchRunePos(r) != noMatch 194 | } 195 | 196 | // MatchRunePos checks whether the instruction matches (and consumes) r. 197 | // If so, MatchRunePos returns the index of the matching rune pair 198 | // (or, when len(i.Rune) == 1, rune singleton). 199 | // If not, MatchRunePos returns -1. 200 | // MatchRunePos should only be called when i.Op == InstRune. 201 | func (i *Inst) MatchRunePos(r rune) int { 202 | rune := i.Rune 203 | 204 | switch len(rune) { 205 | case 0: 206 | return noMatch 207 | 208 | case 1: 209 | // Special case: single-rune slice is from literal string, not char class. 210 | r0 := rune[0] 211 | if r == r0 { 212 | return 0 213 | } 214 | if Flags(i.Arg)&FoldCase != 0 { 215 | for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 216 | if r == r1 { 217 | return 0 218 | } 219 | } 220 | } 221 | return noMatch 222 | 223 | case 2: 224 | if r >= rune[0] && r <= rune[1] { 225 | return 0 226 | } 227 | return noMatch 228 | 229 | case 4, 6, 8: 230 | // Linear search for a few pairs. 231 | // Should handle ASCII well. 232 | for j := 0; j < len(rune); j += 2 { 233 | if r < rune[j] { 234 | return noMatch 235 | } 236 | if r <= rune[j+1] { 237 | return j / 2 238 | } 239 | } 240 | return noMatch 241 | } 242 | 243 | // Otherwise binary search. 
244 | lo := 0 245 | hi := len(rune) / 2 246 | for lo < hi { 247 | m := lo + (hi-lo)/2 248 | if c := rune[2*m]; c <= r { 249 | if r <= rune[2*m+1] { 250 | return m 251 | } 252 | lo = m + 1 253 | } else { 254 | hi = m 255 | } 256 | } 257 | return noMatch 258 | } 259 | 260 | // MatchEmptyWidth reports whether the instruction matches 261 | // an empty string between the runes before and after. 262 | // It should only be called when i.Op == InstEmptyWidth. 263 | func (i *Inst) MatchEmptyWidth(before rune, after rune) bool { 264 | switch EmptyOp(i.Arg) { 265 | case EmptyBeginLine: 266 | return before == '\n' || before == -1 267 | case EmptyEndLine: 268 | return after == '\n' || after == -1 269 | case EmptyBeginText: 270 | return before == -1 271 | case EmptyEndText: 272 | return after == -1 273 | case EmptyWordBoundary: 274 | return IsWordChar(before) != IsWordChar(after) 275 | case EmptyNoWordBoundary: 276 | return IsWordChar(before) == IsWordChar(after) 277 | } 278 | panic("unknown empty width arg") 279 | } 280 | 281 | func (i *Inst) String() string { 282 | var b bytes.Buffer 283 | dumpInst(&b, i) 284 | return b.String() 285 | } 286 | 287 | func bw(b *bytes.Buffer, args ...string) { 288 | for _, s := range args { 289 | b.WriteString(s) 290 | } 291 | } 292 | 293 | func dumpProg(b *bytes.Buffer, p *Prog) { 294 | for j := range p.Inst { 295 | i := &p.Inst[j] 296 | pc := strconv.Itoa(j) 297 | if len(pc) < 3 { 298 | b.WriteString(" "[len(pc):]) 299 | } 300 | if j == p.Start { 301 | pc += "*" 302 | } 303 | bw(b, pc, "\t") 304 | dumpInst(b, i) 305 | bw(b, "\n") 306 | } 307 | } 308 | 309 | func u32(i uint32) string { 310 | return strconv.FormatUint(uint64(i), 10) 311 | } 312 | 313 | func dumpInst(b *bytes.Buffer, i *Inst) { 314 | switch i.Op { 315 | case InstAlt: 316 | bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) 317 | case InstAltMatch: 318 | bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) 319 | case InstCapture: 320 | bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) 321 | 
case InstEmptyWidth: 322 | bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) 323 | case InstMatch: 324 | bw(b, "match") 325 | case InstFail: 326 | bw(b, "fail") 327 | case InstNop: 328 | bw(b, "nop -> ", u32(i.Out)) 329 | case InstRune: 330 | if i.Rune == nil { 331 | // shouldn't happen 332 | bw(b, "rune ") 333 | } 334 | bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) 335 | if Flags(i.Arg)&FoldCase != 0 { 336 | bw(b, "/i") 337 | } 338 | bw(b, " -> ", u32(i.Out)) 339 | case InstRune1: 340 | bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) 341 | case InstRuneAny: 342 | bw(b, "any -> ", u32(i.Out)) 343 | case InstRuneAnyNotNL: 344 | bw(b, "anynotnl -> ", u32(i.Out)) 345 | } 346 | } 347 | -------------------------------------------------------------------------------- /syntax/prog_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package syntax 6 | 7 | import "testing" 8 | 9 | var compileTests = []struct { 10 | Regexp string 11 | Prog string 12 | }{ 13 | {"a", ` 0 fail 14 | 1* rune1 "a" -> 2 15 | 2 match 16 | `}, 17 | {"[A-M][n-z]", ` 0 fail 18 | 1* rune "AM" -> 2 19 | 2 rune "nz" -> 3 20 | 3 match 21 | `}, 22 | {"", ` 0 fail 23 | 1* nop -> 2 24 | 2 match 25 | `}, 26 | {"a?", ` 0 fail 27 | 1 rune1 "a" -> 3 28 | 2* alt -> 1, 3 29 | 3 match 30 | `}, 31 | {"a??", ` 0 fail 32 | 1 rune1 "a" -> 3 33 | 2* alt -> 3, 1 34 | 3 match 35 | `}, 36 | {"a+", ` 0 fail 37 | 1* rune1 "a" -> 2 38 | 2 alt -> 1, 3 39 | 3 match 40 | `}, 41 | {"a+?", ` 0 fail 42 | 1* rune1 "a" -> 2 43 | 2 alt -> 3, 1 44 | 3 match 45 | `}, 46 | {"a*", ` 0 fail 47 | 1 rune1 "a" -> 2 48 | 2* alt -> 1, 3 49 | 3 match 50 | `}, 51 | {"a*?", ` 0 fail 52 | 1 rune1 "a" -> 2 53 | 2* alt -> 3, 1 54 | 3 match 55 | `}, 56 | {"a+b+", ` 0 fail 57 | 1* rune1 "a" -> 2 58 | 2 alt -> 1, 3 59 | 3 rune1 "b" -> 4 60 | 4 alt -> 3, 5 61 | 5 match 62 | `}, 63 | {"(a+)(b+)", ` 0 fail 64 | 1* cap 2 -> 2 65 | 2 rune1 "a" -> 3 66 | 3 alt -> 2, 4 67 | 4 cap 3 -> 5 68 | 5 cap 4 -> 6 69 | 6 rune1 "b" -> 7 70 | 7 alt -> 6, 8 71 | 8 cap 5 -> 9 72 | 9 match 73 | `}, 74 | {"a+|b+", ` 0 fail 75 | 1 rune1 "a" -> 2 76 | 2 alt -> 1, 6 77 | 3 rune1 "b" -> 4 78 | 4 alt -> 3, 6 79 | 5* alt -> 1, 3 80 | 6 match 81 | `}, 82 | {"A[Aa]", ` 0 fail 83 | 1* rune1 "A" -> 2 84 | 2 rune "A"/i -> 3 85 | 3 match 86 | `}, 87 | {"(?:(?:^).)", ` 0 fail 88 | 1* empty 4 -> 2 89 | 2 anynotnl -> 3 90 | 3 match 91 | `}, 92 | } 93 | 94 | func TestCompile(t *testing.T) { 95 | for _, tt := range compileTests { 96 | re, _ := Parse(tt.Regexp, Perl) 97 | p, _ := Compile(re) 98 | s := p.String() 99 | if s != tt.Prog { 100 | t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog) 101 | } 102 | } 103 | } 104 | 105 | func BenchmarkEmptyOpContext(b *testing.B) { 106 | for i := 0; i < b.N; i++ { 107 | var r1 rune = -1 108 | for _, r2 := range "foo, bar, baz\nsome input 
text.\n" { 109 | EmptyOpContext(r1, r2) 110 | r1 = r2 111 | } 112 | EmptyOpContext(r1, -1) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /syntax/regexp.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package syntax 6 | 7 | // Note to implementers: 8 | // In this package, re is always a *Regexp and r is always a rune. 9 | 10 | import ( 11 | "bytes" 12 | "strconv" 13 | "strings" 14 | "unicode" 15 | ) 16 | 17 | // A Regexp is a node in a regular expression syntax tree. 18 | type Regexp struct { 19 | Op Op // operator 20 | Flags Flags 21 | Sub []*Regexp // subexpressions, if any 22 | Sub0 [1]*Regexp // storage for short Sub 23 | Rune []rune // matched runes, for OpLiteral, OpCharClass 24 | Rune0 [2]rune // storage for short Rune 25 | Min, Max int // min, max for OpRepeat 26 | Cap int // capturing index, for OpCapture 27 | Name string // capturing name, for OpCapture 28 | } 29 | 30 | //go:generate stringer -type Op -trimprefix Op 31 | 32 | // An Op is a single regular expression operator. 33 | type Op uint8 34 | 35 | // Operators are listed in precedence order, tightest binding to weakest. 36 | // Character class operators are listed simplest to most complex 37 | // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). 
38 | 39 | const ( 40 | OpNoMatch Op = 1 + iota // matches no strings 41 | OpEmptyMatch // matches empty string 42 | OpLiteral // matches Runes sequence 43 | OpCharClass // matches Runes interpreted as range pair list 44 | OpAnyCharNotNL // matches any character except newline 45 | OpAnyChar // matches any character 46 | OpBeginLine // matches empty string at beginning of line 47 | OpEndLine // matches empty string at end of line 48 | OpBeginText // matches empty string at beginning of text 49 | OpEndText // matches empty string at end of text 50 | OpWordBoundary // matches word boundary `\b` 51 | OpNoWordBoundary // matches word non-boundary `\B` 52 | OpCapture // capturing subexpression with index Cap, optional name Name 53 | OpStar // matches Sub[0] zero or more times 54 | OpPlus // matches Sub[0] one or more times 55 | OpQuest // matches Sub[0] zero or one times 56 | OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) 57 | OpConcat // matches concatenation of Subs 58 | OpAlternate // matches alternation of Subs 59 | ) 60 | 61 | const opPseudo Op = 128 // where pseudo-ops start 62 | 63 | // Equal reports whether x and y have identical structure. 64 | func (x *Regexp) Equal(y *Regexp) bool { 65 | if x == nil || y == nil { 66 | return x == y 67 | } 68 | if x.Op != y.Op { 69 | return false 70 | } 71 | switch x.Op { 72 | case OpEndText: 73 | // The parse flags remember whether this is \z or \Z. 
74 | if x.Flags&WasDollar != y.Flags&WasDollar { 75 | return false 76 | } 77 | 78 | case OpLiteral, OpCharClass: 79 | if len(x.Rune) != len(y.Rune) { 80 | return false 81 | } 82 | for i, r := range x.Rune { 83 | if r != y.Rune[i] { 84 | return false 85 | } 86 | } 87 | 88 | case OpAlternate, OpConcat: 89 | if len(x.Sub) != len(y.Sub) { 90 | return false 91 | } 92 | for i, sub := range x.Sub { 93 | if !sub.Equal(y.Sub[i]) { 94 | return false 95 | } 96 | } 97 | 98 | case OpStar, OpPlus, OpQuest: 99 | if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { 100 | return false 101 | } 102 | 103 | case OpRepeat: 104 | if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { 105 | return false 106 | } 107 | 108 | case OpCapture: 109 | if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { 110 | return false 111 | } 112 | } 113 | return true 114 | } 115 | 116 | // writeRegexp writes the Perl syntax for the regular expression re to b. 117 | func writeRegexp(b *bytes.Buffer, re *Regexp) { 118 | switch re.Op { 119 | default: 120 | b.WriteString("") 121 | case OpNoMatch: 122 | b.WriteString(`[^\x00-\x{10FFFF}]`) 123 | case OpEmptyMatch: 124 | b.WriteString(`(?:)`) 125 | case OpLiteral: 126 | if re.Flags&FoldCase != 0 { 127 | b.WriteString(`(?i:`) 128 | } 129 | for _, r := range re.Rune { 130 | escape(b, r, false) 131 | } 132 | if re.Flags&FoldCase != 0 { 133 | b.WriteString(`)`) 134 | } 135 | case OpCharClass: 136 | if len(re.Rune)%2 != 0 { 137 | b.WriteString(`[invalid char class]`) 138 | break 139 | } 140 | b.WriteRune('[') 141 | if len(re.Rune) == 0 { 142 | b.WriteString(`^\x00-\x{10FFFF}`) 143 | } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune { 144 | // Contains 0 and MaxRune. Probably a negated class. 145 | // Print the gaps. 
146 | b.WriteRune('^') 147 | for i := 1; i < len(re.Rune)-1; i += 2 { 148 | lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 149 | escape(b, lo, lo == '-') 150 | if lo != hi { 151 | b.WriteRune('-') 152 | escape(b, hi, hi == '-') 153 | } 154 | } 155 | } else { 156 | for i := 0; i < len(re.Rune); i += 2 { 157 | lo, hi := re.Rune[i], re.Rune[i+1] 158 | escape(b, lo, lo == '-') 159 | if lo != hi { 160 | b.WriteRune('-') 161 | escape(b, hi, hi == '-') 162 | } 163 | } 164 | } 165 | b.WriteRune(']') 166 | case OpAnyCharNotNL: 167 | b.WriteString(`(?-s:.)`) 168 | case OpAnyChar: 169 | b.WriteString(`(?s:.)`) 170 | case OpBeginLine: 171 | b.WriteString(`(?m:^)`) 172 | case OpEndLine: 173 | b.WriteString(`(?m:$)`) 174 | case OpBeginText: 175 | b.WriteString(`\A`) 176 | case OpEndText: 177 | if re.Flags&WasDollar != 0 { 178 | b.WriteString(`(?-m:$)`) 179 | } else { 180 | b.WriteString(`\z`) 181 | } 182 | case OpWordBoundary: 183 | b.WriteString(`\b`) 184 | case OpNoWordBoundary: 185 | b.WriteString(`\B`) 186 | case OpCapture: 187 | if re.Name != "" { 188 | b.WriteString(`(?P<`) 189 | b.WriteString(re.Name) 190 | b.WriteRune('>') 191 | } else { 192 | b.WriteRune('(') 193 | } 194 | if re.Sub[0].Op != OpEmptyMatch { 195 | writeRegexp(b, re.Sub[0]) 196 | } 197 | b.WriteRune(')') 198 | case OpStar, OpPlus, OpQuest, OpRepeat: 199 | if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { 200 | b.WriteString(`(?:`) 201 | writeRegexp(b, sub) 202 | b.WriteString(`)`) 203 | } else { 204 | writeRegexp(b, sub) 205 | } 206 | switch re.Op { 207 | case OpStar: 208 | b.WriteRune('*') 209 | case OpPlus: 210 | b.WriteRune('+') 211 | case OpQuest: 212 | b.WriteRune('?') 213 | case OpRepeat: 214 | b.WriteRune('{') 215 | b.WriteString(strconv.Itoa(re.Min)) 216 | if re.Max != re.Min { 217 | b.WriteRune(',') 218 | if re.Max >= 0 { 219 | b.WriteString(strconv.Itoa(re.Max)) 220 | } 221 | } 222 | b.WriteRune('}') 223 | } 224 | if re.Flags&NonGreedy != 0 { 225 | b.WriteRune('?') 226 
// meta is the set of characters that escape always backslash-quotes.
const meta = `\.+*?()|[]{}^$`

// escape appends to b a regexp-source spelling of the rune r.
// Printable runes are written as themselves, preceded by a backslash when
// they appear in meta or when force is set. Non-printable runes are written
// as one of the short escapes \a \f \n \r \t \v, or otherwise in hex:
// \xNN (at least two digits) below 0x100 and \x{N...} from 0x100 up.
func escape(b *bytes.Buffer, r rune, force bool) {
	if unicode.IsPrint(r) {
		if force || strings.ContainsRune(meta, r) {
			b.WriteRune('\\')
		}
		b.WriteRune(r)
		return
	}

	// Short named escapes for the common control characters.
	var named string
	switch r {
	case '\a':
		named = `\a`
	case '\f':
		named = `\f`
	case '\n':
		named = `\n`
	case '\r':
		named = `\r`
	case '\t':
		named = `\t`
	case '\v':
		named = `\v`
	}
	if named != "" {
		b.WriteString(named)
		return
	}

	// Everything else is spelled in hexadecimal.
	hex := strconv.FormatInt(int64(r), 16)
	if r >= 0x100 {
		b.WriteString(`\x{`)
		b.WriteString(hex)
		b.WriteString(`}`)
		return
	}
	b.WriteString(`\x`)
	if len(hex) == 1 {
		b.WriteRune('0')
	}
	b.WriteString(hex)
}
308 | func (re *Regexp) CapNames() []string { 309 | names := make([]string, re.MaxCap()+1) 310 | re.capNames(names) 311 | return names 312 | } 313 | 314 | func (re *Regexp) capNames(names []string) { 315 | if re.Op == OpCapture { 316 | names[re.Cap] = re.Name 317 | } 318 | for _, sub := range re.Sub { 319 | sub.capNames(names) 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /syntax/simplify.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package syntax 6 | 7 | // Simplify returns a regexp equivalent to re but without counted repetitions 8 | // and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/. 9 | // The resulting regexp will execute correctly but its string representation 10 | // will not produce the same parse tree, because capturing parentheses 11 | // may have been duplicated or removed. For example, the simplified form 12 | // for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1. 13 | // The returned regexp may share structure with or be the original. 14 | func (re *Regexp) Simplify() *Regexp { 15 | if re == nil { 16 | return nil 17 | } 18 | switch re.Op { 19 | case OpCapture, OpConcat, OpAlternate: 20 | // Simplify children, building new Regexp if children change. 21 | nre := re 22 | for i, sub := range re.Sub { 23 | nsub := sub.Simplify() 24 | if nre == re && nsub != sub { 25 | // Start a copy. 26 | nre = new(Regexp) 27 | *nre = *re 28 | nre.Rune = nil 29 | nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...) 
30 | } 31 | if nre != re { 32 | nre.Sub = append(nre.Sub, nsub) 33 | } 34 | } 35 | return nre 36 | 37 | case OpStar, OpPlus, OpQuest: 38 | sub := re.Sub[0].Simplify() 39 | return simplify1(re.Op, re.Flags, sub, re) 40 | 41 | case OpRepeat: 42 | // Special special case: x{0} matches the empty string 43 | // and doesn't even need to consider x. 44 | if re.Min == 0 && re.Max == 0 { 45 | return &Regexp{Op: OpEmptyMatch} 46 | } 47 | 48 | // The fun begins. 49 | sub := re.Sub[0].Simplify() 50 | 51 | // x{n,} means at least n matches of x. 52 | if re.Max == -1 { 53 | // Special case: x{0,} is x*. 54 | if re.Min == 0 { 55 | return simplify1(OpStar, re.Flags, sub, nil) 56 | } 57 | 58 | // Special case: x{1,} is x+. 59 | if re.Min == 1 { 60 | return simplify1(OpPlus, re.Flags, sub, nil) 61 | } 62 | 63 | // General case: x{4,} is xxxx+. 64 | nre := &Regexp{Op: OpConcat} 65 | nre.Sub = nre.Sub0[:0] 66 | for i := 0; i < re.Min-1; i++ { 67 | nre.Sub = append(nre.Sub, sub) 68 | } 69 | nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil)) 70 | return nre 71 | } 72 | 73 | // Special case x{0} handled above. 74 | 75 | // Special case: x{1} is just x. 76 | if re.Min == 1 && re.Max == 1 { 77 | return sub 78 | } 79 | 80 | // General case: x{n,m} means n copies of x and m copies of x? 81 | // The machine will do less work if we nest the final m copies, 82 | // so that x{2,5} = xx(x(x(x)?)?)? 83 | 84 | // Build leading prefix: xx. 85 | var prefix *Regexp 86 | if re.Min > 0 { 87 | prefix = &Regexp{Op: OpConcat} 88 | prefix.Sub = prefix.Sub0[:0] 89 | for i := 0; i < re.Min; i++ { 90 | prefix.Sub = append(prefix.Sub, sub) 91 | } 92 | } 93 | 94 | // Build and attach suffix: (x(x(x)?)?)? 
95 | if re.Max > re.Min { 96 | suffix := simplify1(OpQuest, re.Flags, sub, nil) 97 | for i := re.Min + 1; i < re.Max; i++ { 98 | nre2 := &Regexp{Op: OpConcat} 99 | nre2.Sub = append(nre2.Sub0[:0], sub, suffix) 100 | suffix = simplify1(OpQuest, re.Flags, nre2, nil) 101 | } 102 | if prefix == nil { 103 | return suffix 104 | } 105 | prefix.Sub = append(prefix.Sub, suffix) 106 | } 107 | if prefix != nil { 108 | return prefix 109 | } 110 | 111 | // Some degenerate case like min > max or min < max < 0. 112 | // Handle as impossible match. 113 | return &Regexp{Op: OpNoMatch} 114 | } 115 | 116 | return re 117 | } 118 | 119 | // simplify1 implements Simplify for the unary OpStar, 120 | // OpPlus, and OpQuest operators. It returns the simple regexp 121 | // equivalent to 122 | // 123 | // Regexp{Op: op, Flags: flags, Sub: {sub}} 124 | // 125 | // under the assumption that sub is already simple, and 126 | // without first allocating that structure. If the regexp 127 | // to be returned turns out to be equivalent to re, simplify1 128 | // returns re instead. 129 | // 130 | // simplify1 is factored out of Simplify because the implementation 131 | // for other operators generates these unary expressions. 132 | // Letting them call simplify1 makes sure the expressions they 133 | // generate are simple. 134 | func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp { 135 | // Special case: repeat the empty string as much as 136 | // you want, but it's still the empty string. 137 | if sub.Op == OpEmptyMatch { 138 | return sub 139 | } 140 | // The operators are idempotent if the flags match. 
141 | if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy { 142 | return sub 143 | } 144 | if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] { 145 | return re 146 | } 147 | 148 | re = &Regexp{Op: op, Flags: flags} 149 | re.Sub = append(re.Sub0[:0], sub) 150 | return re 151 | } 152 | -------------------------------------------------------------------------------- /syntax/simplify_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package syntax 6 | 7 | import "testing" 8 | 9 | var simplifyTests = []struct { 10 | Regexp string 11 | Simple string 12 | }{ 13 | // Already-simple constructs 14 | {`a`, `a`}, 15 | {`ab`, `ab`}, 16 | {`a|b`, `[a-b]`}, 17 | {`ab|cd`, `ab|cd`}, 18 | {`(ab)*`, `(ab)*`}, 19 | {`(ab)+`, `(ab)+`}, 20 | {`(ab)?`, `(ab)?`}, 21 | {`.`, `(?s:.)`}, 22 | {`^`, `(?m:^)`}, 23 | {`$`, `(?m:$)`}, 24 | {`[ac]`, `[ac]`}, 25 | {`[^ac]`, `[^ac]`}, 26 | 27 | // Posix character classes 28 | {`[[:alnum:]]`, `[0-9A-Za-z]`}, 29 | {`[[:alpha:]]`, `[A-Za-z]`}, 30 | {`[[:blank:]]`, `[\t ]`}, 31 | {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`}, 32 | {`[[:digit:]]`, `[0-9]`}, 33 | {`[[:graph:]]`, `[!-~]`}, 34 | {`[[:lower:]]`, `[a-z]`}, 35 | {`[[:print:]]`, `[ -~]`}, 36 | {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"}, 37 | {`[[:space:]]`, `[\t-\r ]`}, 38 | {`[[:upper:]]`, `[A-Z]`}, 39 | {`[[:xdigit:]]`, `[0-9A-Fa-f]`}, 40 | 41 | // Perl character classes 42 | {`\d`, `[0-9]`}, 43 | {`\s`, `[\t-\n\f-\r ]`}, 44 | {`\w`, `[0-9A-Z_a-z]`}, 45 | {`\D`, `[^0-9]`}, 46 | {`\S`, `[^\t-\n\f-\r ]`}, 47 | {`\W`, `[^0-9A-Z_a-z]`}, 48 | {`[\d]`, `[0-9]`}, 49 | {`[\s]`, `[\t-\n\f-\r ]`}, 50 | {`[\w]`, `[0-9A-Z_a-z]`}, 51 | {`[\D]`, `[^0-9]`}, 52 | {`[\S]`, `[^\t-\n\f-\r ]`}, 53 | {`[\W]`, `[^0-9A-Z_a-z]`}, 54 | 55 | // Posix repetitions 56 | 
{`a{1}`, `a`}, 57 | {`a{2}`, `aa`}, 58 | {`a{5}`, `aaaaa`}, 59 | {`a{0,1}`, `a?`}, 60 | // The next three are illegible because Simplify inserts (?:) 61 | // parens instead of () parens to avoid creating extra 62 | // captured subexpressions. The comments show a version with fewer parens. 63 | {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)? 64 | {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)? 65 | {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)? 66 | {`a{0,2}`, `(?:aa?)?`}, // (aa?)? 67 | {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)? 68 | {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)? 69 | {`a{0,}`, `a*`}, 70 | {`a{1,}`, `a+`}, 71 | {`a{2,}`, `aa+`}, 72 | {`a{5,}`, `aaaaa+`}, 73 | 74 | // Test that operators simplify their arguments. 75 | {`(?:a{1,}){1,}`, `a+`}, 76 | {`(a{1,}b{1,})`, `(a+b+)`}, 77 | {`a{1,}|b{1,}`, `a+|b+`}, 78 | {`(?:a{1,})*`, `(?:a+)*`}, 79 | {`(?:a{1,})+`, `a+`}, 80 | {`(?:a{1,})?`, `(?:a+)?`}, 81 | {``, `(?:)`}, 82 | {`a{0}`, `(?:)`}, 83 | 84 | // Character class simplification 85 | {`[ab]`, `[a-b]`}, 86 | {`[a-za-za-z]`, `[a-z]`}, 87 | {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, 88 | {`[ABCDEFGH]`, `[A-H]`}, 89 | {`[AB-CD-EF-GH]`, `[A-H]`}, 90 | {`[W-ZP-XE-R]`, `[E-Z]`}, 91 | {`[a-ee-gg-m]`, `[a-m]`}, 92 | {`[a-ea-ha-m]`, `[a-m]`}, 93 | {`[a-ma-ha-e]`, `[a-m]`}, 94 | {`[a-zA-Z0-9 -~]`, `[ -~]`}, 95 | 96 | // Empty character classes 97 | {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`}, 98 | 99 | // Full character classes 100 | {`[[:cntrl:][:^cntrl:]]`, `(?s:.)`}, 101 | 102 | // Unicode case folding. 
103 | {`(?i)A`, `(?i:A)`}, 104 | {`(?i)a`, `(?i:A)`}, 105 | {`(?i)[A]`, `(?i:A)`}, 106 | {`(?i)[a]`, `(?i:A)`}, 107 | {`(?i)K`, `(?i:K)`}, 108 | {`(?i)k`, `(?i:K)`}, 109 | {`(?i)\x{212a}`, "(?i:K)"}, 110 | {`(?i)[K]`, "[Kk\u212A]"}, 111 | {`(?i)[k]`, "[Kk\u212A]"}, 112 | {`(?i)[\x{212a}]`, "[Kk\u212A]"}, 113 | {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"}, 114 | {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"}, 115 | {`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`}, 116 | 117 | // Empty string as a regular expression. 118 | // The empty string must be preserved inside parens in order 119 | // to make submatches work right, so these tests are less 120 | // interesting than they might otherwise be. String inserts 121 | // explicit (?:) in place of non-parenthesized empty strings, 122 | // to make them easier to spot for other parsers. 123 | {`(a|b|)`, `([a-b]|(?:))`}, 124 | {`(|)`, `()`}, 125 | {`a()`, `a()`}, 126 | {`(()|())`, `(()|())`}, 127 | {`(a|)`, `(a|(?:))`}, 128 | {`ab()cd()`, `ab()cd()`}, 129 | {`()`, `()`}, 130 | {`()*`, `()*`}, 131 | {`()+`, `()+`}, 132 | {`()?`, `()?`}, 133 | {`(){0}`, `(?:)`}, 134 | {`(){1}`, `()`}, 135 | {`(){1,}`, `()+`}, 136 | {`(){0,2}`, `(?:()()?)?`}, 137 | } 138 | 139 | func TestSimplify(t *testing.T) { 140 | for _, tt := range simplifyTests { 141 | re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine) 142 | if err != nil { 143 | t.Errorf("Parse(%#q) = error %v", tt.Regexp, err) 144 | continue 145 | } 146 | s := re.Simplify().String() 147 | if s != tt.Simple { 148 | t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple) 149 | } 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /testdata/README: -------------------------------------------------------------------------------- 1 | AT&T POSIX Test Files 2 | See textregex.c for copyright + license. 
3 | 4 | testregex.c http://www2.research.att.com/~gsf/testregex/testregex.c 5 | basic.dat http://www2.research.att.com/~gsf/testregex/basic.dat 6 | nullsubexpr.dat http://www2.research.att.com/~gsf/testregex/nullsubexpr.dat 7 | repetition.dat http://www2.research.att.com/~gsf/testregex/repetition.dat 8 | 9 | The test data has been edited to reflect RE2/Go differences: 10 | * In a star of a possibly empty match like (a*)* matching x, 11 | the no match case runs the starred subexpression zero times, 12 | not once. This is consistent with (a*)* matching a, which 13 | runs the starred subexpression one time, not twice. 14 | * The submatch choice is first match, not the POSIX rule. 15 | 16 | Such changes are marked with 'RE2/Go'. 17 | 18 | 19 | RE2 Test Files 20 | 21 | re2-exhaustive.txt.bz2 and re2-search.txt are built by running 22 | 'make log' in the RE2 distribution https://github.com/google/re2/ 23 | 24 | The exhaustive file is compressed because it is huge. 25 | -------------------------------------------------------------------------------- /testdata/basic.dat: -------------------------------------------------------------------------------- 1 | NOTE all standard compliant implementations should pass these : 2002-05-31 2 | 3 | BE abracadabra$ abracadabracadabra (7,18) 4 | BE a...b abababbb (2,7) 5 | BE XXXXXX ..XXXXXX (2,8) 6 | E \) () (1,2) 7 | BE a] a]a (0,2) 8 | B } } (0,1) 9 | E \} } (0,1) 10 | BE \] ] (0,1) 11 | B ] ] (0,1) 12 | E ] ] (0,1) 13 | B { { (0,1) 14 | B } } (0,1) 15 | BE ^a ax (0,1) 16 | BE \^a a^a (1,3) 17 | BE a\^ a^ (0,2) 18 | BE a$ aa (1,2) 19 | BE a\$ a$ (0,2) 20 | BE ^$ NULL (0,0) 21 | E $^ NULL (0,0) 22 | E a($) aa (1,2)(2,2) 23 | E a*(^a) aa (0,1)(0,1) 24 | E (..)*(...)* a (0,0) 25 | E (..)*(...)* abcd (0,4)(2,4) 26 | E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) 27 | E (ab)c|abc abc (0,3)(0,2) 28 | E a{0}b ab (1,2) 29 | E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 30 | E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 31 | E 
a{9876543210} NULL BADBR 32 | E ((a|a)|a) a (0,1)(0,1)(0,1) 33 | E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) 34 | E a*(a.|aa) aaaa (0,4)(2,4) 35 | E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) 36 | E (a|b)?.* b (0,1)(0,1) 37 | E (a|b)c|a(b|c) ac (0,2)(0,1) 38 | E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) 39 | E (a|b)*c|(a|ab)*c abc (0,3)(1,2) 40 | E (a|b)*c|(a|ab)*c xc (1,2) 41 | E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) 42 | E a?(ab|ba)ab abab (0,4)(0,2) 43 | E a?(ac{0}b|ba)ab abab (0,4)(0,2) 44 | E ab|abab abbabab (0,2) 45 | E aba|bab|bba baaabbbaba (5,8) 46 | E aba|bab baaabbbaba (6,9) 47 | E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) 48 | E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) 49 | E ab|a xabc (1,3) 50 | E ab|a xxabc (2,4) 51 | Ei (Ab|cD)* aBcD (0,4)(2,4) 52 | BE [^-] --a (2,3) 53 | BE [a-]* --a (0,3) 54 | BE [a-m-]* --amoma-- (0,4) 55 | E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) 56 | E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) 57 | {E [[:upper:]] A (0,1) [[]] not supported 58 | E [[:lower:]]+ `az{ (1,3) 59 | E [[:upper:]]+ @AZ[ (1,3) 60 | # No collation in Go 61 | #BE [[-]] [[-]] (2,4) 62 | #BE [[.NIL.]] NULL ECOLLATE 63 | #BE [[=aleph=]] NULL ECOLLATE 64 | } 65 | BE$ \n \n (0,1) 66 | BEn$ \n \n (0,1) 67 | BE$ [^a] \n (0,1) 68 | BE$ \na \na (0,2) 69 | E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) 70 | BE xxx xxx (0,3) 71 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) 72 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) 73 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) 74 | E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) 75 | E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) 76 | E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) 77 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) 78 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) 79 | E 
aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) 80 | BE$ .* \x01\xff (0,2) 81 | E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) 82 | L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH 83 | E a*a*a*a*a*b aaaaaaaaab (0,10) 84 | BE ^ NULL (0,0) 85 | BE $ NULL (0,0) 86 | BE ^$ NULL (0,0) 87 | BE ^a$ a (0,1) 88 | BE abc abc (0,3) 89 | BE abc xabcy (1,4) 90 | BE abc ababc (2,5) 91 | BE ab*c abc (0,3) 92 | BE ab*bc abc (0,3) 93 | BE ab*bc abbc (0,4) 94 | BE ab*bc abbbbc (0,6) 95 | E ab+bc abbc (0,4) 96 | E ab+bc abbbbc (0,6) 97 | E ab?bc abbc (0,4) 98 | E ab?bc abc (0,3) 99 | E ab?c abc (0,3) 100 | BE ^abc$ abc (0,3) 101 | BE ^abc abcc (0,3) 102 | BE abc$ aabc (1,4) 103 | BE ^ abc (0,0) 104 | BE $ abc (3,3) 105 | BE a.c abc (0,3) 106 | BE a.c axc (0,3) 107 | BE a.*c axyzc (0,5) 108 | BE a[bc]d abd (0,3) 109 | BE a[b-d]e ace (0,3) 110 | BE a[b-d] aac (1,3) 111 | BE a[-b] a- (0,2) 112 | BE a[b-] a- (0,2) 113 | BE a] a] (0,2) 114 | BE a[]]b a]b (0,3) 115 | BE a[^bc]d aed (0,3) 116 | BE a[^-b]c adc (0,3) 117 | BE a[^]b]c adc (0,3) 118 | E ab|cd abc (0,2) 119 | E ab|cd abcd (0,2) 120 | E a\(b a(b (0,3) 121 | E a\(*b ab (0,2) 122 | E a\(*b a((b (0,4) 123 | E ((a)) abc (0,1)(0,1)(0,1) 124 | E (a)b(c) abc (0,3)(0,1)(2,3) 125 | E a+b+c aabbabc (4,7) 126 | E a* aaa (0,3) 127 | #E (a*)* - (0,0)(0,0) 128 | E (a*)* - (0,0)(?,?) RE2/Go 129 | E (a*)+ - (0,0)(0,0) 130 | #E (a*|b)* - (0,0)(0,0) 131 | E (a*|b)* - (0,0)(?,?) RE2/Go 132 | E (a+|b)* ab (0,2)(1,2) 133 | E (a+|b)+ ab (0,2)(1,2) 134 | E (a+|b)? ab (0,1)(0,1) 135 | BE [^ab]* cde (0,3) 136 | #E (^)* - (0,0)(0,0) 137 | E (^)* - (0,0)(?,?) 
RE2/Go 138 | BE a* NULL (0,0) 139 | E ([abc])*d abbbcd (0,6)(4,5) 140 | E ([abc])*bcd abcd (0,4)(0,1) 141 | E a|b|c|d|e e (0,1) 142 | E (a|b|c|d|e)f ef (0,2)(0,1) 143 | #E ((a*|b))* - (0,0)(0,0)(0,0) 144 | E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go 145 | BE abcd*efg abcdefg (0,7) 146 | BE ab* xabyabbbz (1,3) 147 | BE ab* xayabbbz (1,2) 148 | E (ab|cd)e abcde (2,5)(2,4) 149 | BE [abhgefdc]ij hij (0,3) 150 | E (a|b)c*d abcd (1,4)(1,2) 151 | E (ab|ab*)bc abc (0,3)(0,1) 152 | E a([bc]*)c* abc (0,3)(1,3) 153 | E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) 154 | E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) 155 | E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) 156 | E a[bcd]*dcdcde adcdcde (0,7) 157 | E (ab|a)b*c abc (0,3)(0,2) 158 | E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) 159 | BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) 160 | E ^a(bc+|b[eh])g|.h$ abh (1,3) 161 | E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) 162 | E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) 163 | E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) 164 | E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) 165 | BE multiple words multiple words yeah (0,14) 166 | E (.*)c(.*) abcde (0,5)(0,2)(3,5) 167 | BE abcd abcd (0,4) 168 | E a(bc)d abcd (0,4)(1,3) 169 | E a[-]?c ac (0,3) 170 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) 171 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) 172 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) 173 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) 174 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) 175 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) 176 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) 177 | E 
M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) 178 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) 179 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) 180 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) 181 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) 182 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) 183 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) 184 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) 185 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) 186 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) 187 | E a+(b|c)*d+ aabcdd (0,6)(3,4) 188 | E ^.+$ vivi (0,4) 189 | E ^(.+)$ vivi (0,4)(0,4) 190 | E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) 191 | E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) 192 | E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) 193 | E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) 194 | E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) 195 | E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) 196 | E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) 197 | E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) 198 | E ((foo)|bar)!bas bar!bas (0,7)(0,3) 199 | E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) 200 | E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) 201 | E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) 202 | E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) 203 | E (foo|(bar))!bas foo!bas (0,7)(0,3) 204 | E (foo|bar)!bas bar!bas (0,7)(0,3) 205 | E (foo|bar)!bas foo!bar!bas (4,11)(4,7) 206 | 
E (foo|bar)!bas foo!bas (0,7)(0,3) 207 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 208 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) 209 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) 210 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) 211 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) 212 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) 213 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) 214 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 215 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) 216 | E .*(/XXX).* /XXX (0,4)(0,4) 217 | E .*(\\XXX).* \XXX (0,4)(0,4) 218 | E \\XXX \XXX (0,4) 219 | E .*(/000).* /000 (0,4)(0,4) 220 | E .*(\\000).* \000 (0,4)(0,4) 221 | E \\000 \000 (0,4) 222 | -------------------------------------------------------------------------------- /testdata/nullsubexpr.dat: -------------------------------------------------------------------------------- 1 | NOTE null subexpression matches : 2002-06-06 2 | 3 | E (a*)* a (0,1)(0,1) 4 | #E SAME x (0,0)(0,0) 5 | E SAME x (0,0)(?,?) RE2/Go 6 | E SAME aaaaaa (0,6)(0,6) 7 | E SAME aaaaaax (0,6)(0,6) 8 | E (a*)+ a (0,1)(0,1) 9 | E SAME x (0,0)(0,0) 10 | E SAME aaaaaa (0,6)(0,6) 11 | E SAME aaaaaax (0,6)(0,6) 12 | E (a+)* a (0,1)(0,1) 13 | E SAME x (0,0) 14 | E SAME aaaaaa (0,6)(0,6) 15 | E SAME aaaaaax (0,6)(0,6) 16 | E (a+)+ a (0,1)(0,1) 17 | E SAME x NOMATCH 18 | E SAME aaaaaa (0,6)(0,6) 19 | E SAME aaaaaax (0,6)(0,6) 20 | 21 | E ([a]*)* a (0,1)(0,1) 22 | #E SAME x (0,0)(0,0) 23 | E SAME x (0,0)(?,?) 
RE2/Go 24 | E SAME aaaaaa (0,6)(0,6) 25 | E SAME aaaaaax (0,6)(0,6) 26 | E ([a]*)+ a (0,1)(0,1) 27 | E SAME x (0,0)(0,0) 28 | E SAME aaaaaa (0,6)(0,6) 29 | E SAME aaaaaax (0,6)(0,6) 30 | E ([^b]*)* a (0,1)(0,1) 31 | #E SAME b (0,0)(0,0) 32 | E SAME b (0,0)(?,?) RE2/Go 33 | E SAME aaaaaa (0,6)(0,6) 34 | E SAME aaaaaab (0,6)(0,6) 35 | E ([ab]*)* a (0,1)(0,1) 36 | E SAME aaaaaa (0,6)(0,6) 37 | E SAME ababab (0,6)(0,6) 38 | E SAME bababa (0,6)(0,6) 39 | E SAME b (0,1)(0,1) 40 | E SAME bbbbbb (0,6)(0,6) 41 | E SAME aaaabcde (0,5)(0,5) 42 | E ([^a]*)* b (0,1)(0,1) 43 | E SAME bbbbbb (0,6)(0,6) 44 | #E SAME aaaaaa (0,0)(0,0) 45 | E SAME aaaaaa (0,0)(?,?) RE2/Go 46 | E ([^ab]*)* ccccxx (0,6)(0,6) 47 | #E SAME ababab (0,0)(0,0) 48 | E SAME ababab (0,0)(?,?) RE2/Go 49 | 50 | E ((z)+|a)* zabcde (0,2)(1,2) 51 | 52 | #{E a+? aaaaaa (0,1) no *? +? mimimal match ops 53 | #E (a) aaa (0,1)(0,1) 54 | #E (a*?) aaa (0,0)(0,0) 55 | #E (a)*? aaa (0,0) 56 | #E (a*?)*? aaa (0,0) 57 | #} 58 | 59 | B \(a*\)*\(x\) x (0,1)(0,0)(0,1) 60 | B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) 61 | B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) 62 | B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) 63 | B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) 64 | B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) 65 | B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) 66 | B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) 67 | 68 | #E (a*)*(x) x (0,1)(0,0)(0,1) 69 | E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go 70 | E (a*)*(x) ax (0,2)(0,1)(1,2) 71 | E (a*)*(x) axa (0,2)(0,1)(1,2) 72 | 73 | E (a*)+(x) x (0,1)(0,0)(0,1) 74 | E (a*)+(x) ax (0,2)(0,1)(1,2) 75 | E (a*)+(x) axa (0,2)(0,1)(1,2) 76 | 77 | E (a*){2}(x) x (0,1)(0,0)(0,1) 78 | E (a*){2}(x) ax (0,2)(1,1)(1,2) 79 | E (a*){2}(x) axa (0,2)(1,1)(1,2) 80 | -------------------------------------------------------------------------------- /testdata/re2-exhaustive.txt.bz2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rsc/binaryregexp/545cabda89ca36b48b8e681a30d9d769a30b3074/testdata/re2-exhaustive.txt.bz2 -------------------------------------------------------------------------------- /testdata/repetition.dat: -------------------------------------------------------------------------------- 1 | NOTE implicit vs. explicit repetitions : 2009-02-02 2 | 3 | # Glenn Fowler 4 | # conforming matches (column 4) must match one of the following BREs 5 | # NOMATCH 6 | # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* 7 | # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* 8 | # i.e., each 3-tuple has two identical elements and one (?,?) 9 | 10 | E ((..)|(.)) NULL NOMATCH 11 | E ((..)|(.))((..)|(.)) NULL NOMATCH 12 | E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH 13 | 14 | E ((..)|(.)){1} NULL NOMATCH 15 | E ((..)|(.)){2} NULL NOMATCH 16 | E ((..)|(.)){3} NULL NOMATCH 17 | 18 | E ((..)|(.))* NULL (0,0) 19 | 20 | E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) 21 | E ((..)|(.))((..)|(.)) a NOMATCH 22 | E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH 23 | 24 | E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) 25 | E ((..)|(.)){2} a NOMATCH 26 | E ((..)|(.)){3} a NOMATCH 27 | 28 | E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) 29 | 30 | E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) 31 | E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) 32 | E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH 33 | 34 | E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) 35 | E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) 36 | E ((..)|(.)){3} aa NOMATCH 37 | 38 | E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) 39 | 40 | E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) 41 | E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) 42 | E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) 43 | 44 | E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) 
45 | #E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) 46 | E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 47 | E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) 48 | 49 | #E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) 50 | E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 51 | 52 | E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) 53 | E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 54 | E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) 55 | 56 | E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) 57 | E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) 58 | #E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) 59 | E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go 60 | 61 | E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) 62 | 63 | E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) 64 | E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 65 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) 66 | 67 | E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) 68 | E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) 69 | #E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) 70 | E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 71 | 72 | #E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) 73 | E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 74 | 75 | E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) 76 | E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 77 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) 78 | 79 | E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) 80 | E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) 81 | E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) 82 | 83 | E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 84 | 85 | NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 86 | 87 | # These test a bug in OS X / FreeBSD / NetBSD, and libtree. 88 | # Linux/GLIBC gets the {8,} and {8,8} wrong. 
89 | 90 | :HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) 91 | :HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) 92 | :HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) 93 | :HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) 94 | :HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) 95 | :HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) 96 | :HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) 97 | :HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) 98 | :HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) 99 | #:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) 100 | :HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go 101 | #:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) 102 | :HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go 103 | #:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) 104 | :HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go 105 | #:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) 106 | :HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go 107 | #:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) 108 | :HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go 109 | #:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) 110 | :HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go 111 | #:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) 112 | :HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go 113 | #:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) 114 | :HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go 115 | :HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) 116 | 117 | # These test a fixed bug in my regex-tdfa that did not keep the expanded 118 | # form properly grouped, so right association did the wrong thing with 119 | # these ambiguous patterns (crafted just to test my code when I became 120 | # suspicious of my implementation). The first subexpression should use 121 | # "ab" then "a" then "bcd". 122 | 123 | # OS X / FreeBSD / NetBSD badly fail many of these, with impossible 124 | # results like (0,6)(4,5)(6,6). 
125 | 126 | :HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) 127 | :HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) 128 | :HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 129 | :HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 130 | :HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH 131 | :HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) 132 | :HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) 133 | :HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 134 | :HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 135 | :HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH 136 | :HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) 137 | :HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) 138 | 139 | # The above worked on Linux/GLIBC but the following often fail. 140 | # They also trip up OS X / FreeBSD / NetBSD: 141 | 142 | #:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) 143 | :HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 144 | #:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) 145 | :HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 146 | #:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 147 | :HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 148 | #:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 149 | :HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 150 | :HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH 151 | #:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) 152 | :HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 153 | #:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) 154 | :HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 155 | #:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 156 | :HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 157 | #:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 158 | :HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 159 | :HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH 160 | 
#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) 161 | :HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 162 | #:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) 163 | :HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 164 | --------------------------------------------------------------------------------