├── AUTHOR ├── LICENSE ├── README.md ├── VERSION ├── all_test.go ├── bstorm └── reg.go ├── chelper.c ├── chelper.h ├── constants.go ├── find_test.go ├── quotemeta.go └── regex.go /AUTHOR: -------------------------------------------------------------------------------- 1 | Zhigang Chen 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2011 by Zhigang Chen 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rubex : Super Fast Regexp for Go # 2 | by Zhigang Chen (zhigang.chen@moovweb.com or zhigangc@gmail.com) 3 | 4 | ***ONLY USE go1 BRANCH*** 5 | 6 | A simple regular expression library that supports Ruby's regexp syntax. It implements all the public functions of Go's Regexp package, except LiteralPrefix. By the benchmark tests in Regexp, the library is 40% to 10X faster than Regexp on all but one test. Unlike Go's Regexp, this library supports named capture groups and also allows "\\1" and "\\k<name>" in replacement strings. 7 | 8 | The library calls the Oniguruma regex library (5.9.2, the latest release as of now) for regex pattern searching. All replacement code is done in Go. This library can be easily adapted to support the regex syntax used by other programming languages or tools, like Java, Perl, grep, and emacs. 9 | 10 | ## Installation ## 11 | 12 | First, ensure you have Oniguruma installed. On OS X with brew, it's as simple as 13 | 14 | brew install oniguruma 15 | 16 | On Ubuntu... 17 | 18 | sudo apt-get install libonig2 19 | 20 | Now that we've got Oniguruma installed, we can install Rubex! 21 | 22 | go install github.com/moovweb/rubex 23 | 24 | ## Example Usage ## 25 | 26 | import "rubex" 27 | 28 | rxp, err := rubex.Compile("[a-z]*") 29 | if err != nil { 30 | // whoops 31 | } 32 | result := rxp.FindString("a me my") 33 | if result != "" { 34 | // FOUND A STRING!! YAY! 
// runParallel starts concurrency goroutines that each run testFunc and blocks
// until all of them have reported completion on the shared done channel.
//
// Contract with testFunc: every invocation must send exactly two values on
// done (TestGoodCompile sends one on entry and one on exit). runParallel
// receives two values per goroutine, so a callback that sends fewer values
// blocks this function forever.
//
// GOMAXPROCS is raised to 4 for the duration of the run and then restored to
// whatever it was before the call, instead of being forced down to 1, so the
// setting used by the tests and benchmarks that run afterwards is not
// clobbered.
func runParallel(testFunc func(chan bool), concurrency int) {
	prev := runtime.GOMAXPROCS(4)
	defer runtime.GOMAXPROCS(prev)

	// Buffered so that each goroutine's first send never has to wait for the
	// receive loop below to start.
	done := make(chan bool, concurrency)
	for i := 0; i < concurrency; i++ {
		go testFunc(done)
	}
	// Two receives per goroutine: see the contract above.
	for i := 0; i < 2*concurrency; i++ {
		<-done
	}
}
TestMatchFunction(t *testing.T) { 130 | for _, test := range findTests { 131 | matchFunctionTest(t, &test) 132 | } 133 | } 134 | 135 | type ReplaceTest struct { 136 | pattern, replacement, input, output string 137 | } 138 | 139 | var replaceTests = []ReplaceTest{ 140 | // Test empty input and/or replacement, with pattern that matches the empty string. 141 | {"", "", "", ""}, 142 | {"", "x", "", "x"}, 143 | {"", "", "abc", "abc"}, 144 | {"", "x", "abc", "xaxbxcx"}, 145 | 146 | // Test empty input and/or replacement, with pattern that does not match the empty string. 147 | {"b", "", "", ""}, 148 | {"b", "x", "", ""}, 149 | {"b", "", "abc", "ac"}, 150 | {"b", "x", "abc", "axc"}, 151 | {"y", "", "", ""}, 152 | {"y", "x", "", ""}, 153 | {"y", "", "abc", "abc"}, 154 | {"y", "x", "abc", "abc"}, 155 | 156 | // Multibyte characters -- verify that we don't try to match in the middle 157 | // of a character. 158 | {"[a-c]*", "x", "\u65e5", "x\u65e5x"}, 159 | {"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"}, 160 | 161 | // Start and end of a string. 
162 | {"^[a-c]*", "x", "abcdabc", "xdabc"}, 163 | {"[a-c]*$", "x", "abcdabc", "abcdxx"}, 164 | {"^[a-c]*$", "x", "abcdabc", "abcdabc"}, 165 | {"^[a-c]*", "x", "abc", "x"}, 166 | {"[a-c]*$", "x", "abc", "xx"}, 167 | {"^[a-c]*$", "x", "abc", "x"}, 168 | {"^[a-c]*", "x", "dabce", "xdabce"}, 169 | {"[a-c]*$", "x", "dabce", "dabcex"}, 170 | {"^[a-c]*$", "x", "dabce", "dabce"}, 171 | {"^[a-c]*", "x", "", "x"}, 172 | {"[a-c]*$", "x", "", "x"}, 173 | {"^[a-c]*$", "x", "", "x"}, 174 | 175 | {"^[a-c]+", "x", "abcdabc", "xdabc"}, 176 | {"[a-c]+$", "x", "abcdabc", "abcdx"}, 177 | {"^[a-c]+$", "x", "abcdabc", "abcdabc"}, 178 | {"^[a-c]+", "x", "abc", "x"}, 179 | {"[a-c]+$", "x", "abc", "x"}, 180 | {"^[a-c]+$", "x", "abc", "x"}, 181 | {"^[a-c]+", "x", "dabce", "dabce"}, 182 | {"[a-c]+$", "x", "dabce", "dabce"}, 183 | {"^[a-c]+$", "x", "dabce", "dabce"}, 184 | {"^[a-c]+", "x", "", ""}, 185 | {"[a-c]+$", "x", "", ""}, 186 | {"^[a-c]+$", "x", "", ""}, 187 | 188 | // Other cases. 189 | {"abc", "def", "abcdefg", "defdefg"}, 190 | {"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"}, 191 | {"abc", "", "abcdabc", "d"}, 192 | {"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"}, 193 | {"abc", "d", "", ""}, 194 | {"abc", "d", "abc", "d"}, 195 | {".+", "x", "abc", "x"}, 196 | {"[a-c]*", "x", "def", "xdxexfx"}, 197 | {"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"}, 198 | {"[a-c]*", "x", "abcbcdcdedef", "xxdxxdxexdxexfx"}, 199 | {"(foo)*bar(s)", "\\1", "bars", ""}, 200 | } 201 | 202 | type ReplaceFuncTest struct { 203 | pattern string 204 | replacement func(string) string 205 | input, output string 206 | } 207 | 208 | var replaceFuncTests = []ReplaceFuncTest{ 209 | {"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"}, 210 | {"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"}, 211 | {"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcyxydxyexyfxy"}, 212 | } 213 | 214 | func TestReplaceAll(t *testing.T) { 
215 | for _, tc := range replaceTests { 216 | re, err := Compile(tc.pattern) 217 | 218 | if err != nil { 219 | t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) 220 | continue 221 | } 222 | 223 | actual := re.ReplaceAllString(tc.input, tc.replacement) 224 | 225 | if actual != tc.output { 226 | t.Errorf("%q.Replace(%q,%q) = %q; want %q", 227 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 228 | } 229 | 230 | // now try bytes 231 | 232 | actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement))) 233 | if actual != tc.output { 234 | t.Errorf("%q.Replace(%q,%q) = %q; want %q", 235 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 236 | } 237 | 238 | } 239 | } 240 | 241 | func TestReplaceAllFunc(t *testing.T) { 242 | for _, tc := range replaceFuncTests { 243 | re, err := Compile(tc.pattern) 244 | if err != nil { 245 | t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) 246 | continue 247 | } 248 | actual := re.ReplaceAllStringFunc(tc.input, tc.replacement) 249 | if actual != tc.output { 250 | t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q", 251 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 252 | } 253 | // now try bytes 254 | actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) })) 255 | if actual != tc.output { 256 | t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q", 257 | tc.pattern, tc.input, tc.replacement, actual, tc.output) 258 | } 259 | } 260 | } 261 | 262 | /* 263 | * "hallo".gsub(/h(.*)llo/, "e") 264 | */ 265 | func TestGsub1(t *testing.T) { 266 | input := "hallo" 267 | pattern := "h(.*)llo" 268 | expected := "e" 269 | re, err := Compile(pattern) 270 | if err != nil { 271 | t.Errorf("Unexpected error compiling %q: %v", pattern, err) 272 | return 273 | } 274 | actual := re.Gsub(input, "e") 275 | if actual != expected { 276 | t.Errorf("expected %q, actual %q\n", expected, actual) 277 | } 278 | } 279 | 280 | /* 281 | * 
"hallo".gsub(/h(?.*)llo/, "\\k") 282 | */ 283 | func TestGsubNamedCapture1(t *testing.T) { 284 | input := "hallo" 285 | pattern := "h(?.*)llo" 286 | expected := "a" 287 | re, err := Compile(pattern) 288 | if err != nil { 289 | t.Errorf("Unexpected error compiling %q: %v", pattern, err) 290 | return 291 | } 292 | actual := re.Gsub(input, "\\k") 293 | if actual != expected { 294 | t.Errorf("expected %q, actual %q\n", expected, actual) 295 | } 296 | } 297 | 298 | /* 299 | * "hallo".gsub(/h(?.*)ll(?.*)/, "\\k\\k\\k") 300 | */ 301 | func TestGsubNamedCapture2(t *testing.T) { 302 | input := "hallo" 303 | pattern := "h(?.*)ll(?.*)" 304 | expected := "aoa" 305 | re, err := Compile(pattern) 306 | if err != nil { 307 | t.Errorf("Unexpected error compiling %q: %v", pattern, err) 308 | return 309 | } 310 | actual := re.Gsub(input, "\\k\\k\\k") 311 | if actual != expected { 312 | t.Errorf("expected %q, actual %q\n", expected, actual) 313 | } 314 | } 315 | 316 | /* 317 | * "hallo".gsub(/h(?.*)(l*)(?.*)/, "\\k\\k\\k\\1") 318 | */ 319 | func TestGsubNamedCapture3(t *testing.T) { 320 | input := "hallo" 321 | pattern := "h(?.*)(l*)(?.*)" 322 | expected := "alloallo" 323 | re, err := Compile(pattern) 324 | if err != nil { 325 | t.Errorf("Unexpected error compiling %q: %v", pattern, err) 326 | return 327 | } 328 | actual := re.Gsub(input, "\\k\\k\\k\\1") 329 | if actual != expected { 330 | t.Errorf("expected %q, actual %q\n", expected, actual) 331 | } 332 | } 333 | 334 | /* 335 | * "hallo".gsub(/h(?.*)(l*)(?.*)/, "\\k\\k\\k\\1") 336 | */ 337 | func TestGsubNamedCapture4(t *testing.T) { 338 | input := "The lamb was sure to go." 339 | pattern := "(?[^\\s\\.]+)(?\\s)" 340 | expected := "They lamby wasy surey toy go." 
341 | re, err := Compile(pattern) 342 | if err != nil { 343 | t.Errorf("Unexpected error compiling %q: %v", pattern, err) 344 | return 345 | } 346 | 347 | actual := re.GsubFunc(input, func(_ string, captures map[string]string) string { 348 | return captures["word"] + "y" + captures["white_space"] 349 | }) 350 | if actual != expected { 351 | t.Errorf("expected %q, actual %q\n", expected, actual) 352 | } 353 | 354 | } 355 | 356 | /* 357 | * "hallo".gsub(/h(.*)llo/) { |match| 358 | * "e" 359 | * } 360 | */ 361 | func TestGsubFunc1(t *testing.T) { 362 | input := "hallo" 363 | pattern := "h(.*)llo" 364 | expected := "e" 365 | re, err := Compile(pattern) 366 | if err != nil { 367 | t.Errorf("Unexpected error compiling %q: %v", pattern, err) 368 | return 369 | } 370 | actual := re.GsubFunc(input, func(match string, captures map[string]string) string { 371 | return "e" 372 | }) 373 | if actual != expected { 374 | t.Errorf("expected %q, actual %q\n", expected, actual) 375 | } 376 | } 377 | 378 | /* 379 | * @env = {} 380 | * "hallo".gsub(/h(.*)llo/) { |match| 381 | * $~.captures.each_with_index do |arg, index| 382 | * @env["#{index + 1}"] = arg 383 | * "abcd".gsub(/(d)/) do 384 | * env["1"] 385 | * end 386 | * end 387 | * } 388 | */ 389 | func TestGsubFunc2(t *testing.T) { 390 | input := "hallo" 391 | pattern := "h(.*)llo" 392 | expected := "abca" 393 | env := make(map[string]string) 394 | re, err := Compile(pattern) 395 | if err != nil { 396 | t.Errorf("Unexpected error compiling %q: %v", pattern, err) 397 | return 398 | } 399 | actual := re.GsubFunc(input, func(_ string, captures map[string]string) string { 400 | for name, capture := range captures { 401 | env[name] = capture 402 | } 403 | re1 := MustCompile("(d)") 404 | return re1.GsubFunc("abcd", func(_ string, captures2 map[string]string) string { 405 | return env["1"] 406 | }) 407 | }) 408 | if actual != expected { 409 | t.Errorf("expected %q, actual %q\n", expected, actual) 410 | } 411 | } 412 | 413 | /* how to match 
// MetaTest is one QuoteMeta test case.
type MetaTest struct {
	// pattern is the raw input; output is the expected QuoteMeta(pattern);
	// literal is the expected literal prefix of pattern.
	pattern, output, literal string
	// isLiteral reports whether the whole pattern is expected to be literal.
	isLiteral bool
}

// metaTests drives TestQuoteMeta. For every row, removing the escaping
// backslashes from output must give back pattern, because TestQuoteMeta
// compiles output and expects it to match pattern verbatim. The last row
// therefore needs `\(\)` for the parentheses in its pattern.
var metaTests = []MetaTest{
	{``, ``, ``, true},
	{`foo`, `foo`, `foo`, true},
	{`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator
	{`foo.\$`, `foo\.\\\$`, `foo`, false},     // has escaped operators and real operators
	{`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[{\]}\\\|,<\.>/\?~`, `!@#`, false},
}
473 | if tc.pattern != "" { 474 | re, err := Compile(quoted) 475 | if err != nil { 476 | t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err) 477 | continue 478 | } 479 | src := "abc" + tc.pattern + "def" 480 | repl := "xyz" 481 | replaced := re.ReplaceAllString(src, repl) 482 | expected := "abcxyzdef" 483 | if replaced != expected { 484 | t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`", 485 | tc.pattern, src, repl, replaced, expected) 486 | } 487 | } 488 | } 489 | } 490 | 491 | /* 492 | * LiteralPrefix is not supported by rubex 493 | * 494 | //LiteralPrefix 495 | func TestLiteralPrefix(t *testing.T) { 496 | for _, tc := range metaTests { 497 | // Literal method needs to scan the pattern. 498 | re := MustCompile(tc.pattern) 499 | str, complete := re.LiteralPrefix() 500 | if complete != tc.isLiteral { 501 | t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral) 502 | } 503 | if str != tc.literal { 504 | t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal) 505 | } 506 | } 507 | } 508 | */ 509 | type numSubexpCase struct { 510 | input string 511 | expected int 512 | } 513 | 514 | var numSubexpCases = []numSubexpCase{ 515 | {``, 0}, 516 | {`.*`, 0}, 517 | {`abba`, 0}, 518 | {`ab(b)a`, 1}, 519 | {`ab(.*)a`, 1}, 520 | {`(.*)ab(.*)a`, 2}, 521 | {`(.*)(ab)(.*)a`, 3}, 522 | {`(.*)((a)b)(.*)a`, 4}, 523 | {`(.*)($ab)(.*)a`, 3}, 524 | {`(.*)(\(a$b)(.*)a`, 3}, 525 | } 526 | 527 | func TestNumSubexp(t *testing.T) { 528 | for _, c := range numSubexpCases { 529 | re := MustCompile(c.input) 530 | n := re.NumSubexp() 531 | if n != c.expected { 532 | t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected) 533 | } 534 | } 535 | } 536 | 537 | func BenchmarkLiteral(b *testing.B) { 538 | x := strings.Repeat("x", 50) + "y" 539 | b.StopTimer() 540 | re := MustCompile("y") 541 | b.StartTimer() 542 | for i := 0; i < b.N; i++ { 543 | if !re.MatchString(x) { 544 | println("no 
match!") 545 | break 546 | } 547 | } 548 | } 549 | 550 | func BenchmarkNotLiteral(b *testing.B) { 551 | x := strings.Repeat("x", 50) + "y" 552 | b.StopTimer() 553 | re := MustCompile(".y") 554 | b.StartTimer() 555 | for i := 0; i < b.N; i++ { 556 | if !re.MatchString(x) { 557 | println("no match!") 558 | break 559 | } 560 | } 561 | } 562 | 563 | func BenchmarkMatchClass(b *testing.B) { 564 | b.StopTimer() 565 | x := strings.Repeat("xxxx", 20) + "w" 566 | re := MustCompile("[abcdw]") 567 | b.StartTimer() 568 | for i := 0; i < b.N; i++ { 569 | if !re.MatchString(x) { 570 | println("no match!") 571 | break 572 | } 573 | } 574 | } 575 | 576 | func BenchmarkMatchClass_InRange(b *testing.B) { 577 | b.StopTimer() 578 | // 'b' is between 'a' and 'c', so the charclass 579 | // range checking is no help here. 580 | x := strings.Repeat("bbbb", 20) + "c" 581 | re := MustCompile("[ac]") 582 | b.StartTimer() 583 | for i := 0; i < b.N; i++ { 584 | if !re.MatchString(x) { 585 | println("no match!") 586 | break 587 | } 588 | } 589 | } 590 | 591 | func BenchmarkReplaceAll(b *testing.B) { 592 | x := "abcdefghijklmnopqrstuvwxyz" 593 | b.StopTimer() 594 | re := MustCompile("[cjrw]") 595 | b.StartTimer() 596 | for i := 0; i < b.N; i++ { 597 | re.ReplaceAllString(x, "") 598 | } 599 | } 600 | 601 | func BenchmarkFindAllStringSubmatchIndex(b *testing.B) { 602 | x := "abcdefghijklmnopqrstuvwxyz" 603 | b.StopTimer() 604 | re := MustCompile("[cjrw]") 605 | b.StartTimer() 606 | for i := 0; i < b.N; i++ { 607 | re.FindAllStringSubmatchIndex(x, 0) 608 | } 609 | } 610 | 611 | func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) { 612 | b.StopTimer() 613 | x := []byte("abcdefghijklmnopqrstuvwxyz") 614 | re := MustCompile("^zbc(d|e)") 615 | b.StartTimer() 616 | for i := 0; i < b.N; i++ { 617 | re.Match(x) 618 | } 619 | } 620 | 621 | func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) { 622 | b.StopTimer() 623 | x := []byte("abcdefghijklmnopqrstuvwxyz") 624 | for i := 0; i < 15; i++ { 625 | 
x = append(x, x...) 626 | } 627 | re := MustCompile("^zbc(d|e)") 628 | b.StartTimer() 629 | for i := 0; i < b.N; i++ { 630 | re.Match(x) 631 | } 632 | } 633 | 634 | func BenchmarkAnchoredShortMatch(b *testing.B) { 635 | b.StopTimer() 636 | x := []byte("abcdefghijklmnopqrstuvwxyz") 637 | re := MustCompile("^.bc(d|e)") 638 | b.StartTimer() 639 | for i := 0; i < b.N; i++ { 640 | re.Match(x) 641 | } 642 | } 643 | 644 | func BenchmarkAnchoredLongMatch(b *testing.B) { 645 | b.StopTimer() 646 | x := []byte("abcdefghijklmnopqrstuvwxyz") 647 | for i := 0; i < 15; i++ { 648 | x = append(x, x...) 649 | } 650 | re := MustCompile("^.bc(d|e)") 651 | b.StartTimer() 652 | for i := 0; i < b.N; i++ { 653 | re.Match(x) 654 | } 655 | } 656 | -------------------------------------------------------------------------------- /bstorm/reg.go: -------------------------------------------------------------------------------- 1 | // Comparing the speeds of the golang native regex library and rubex. 2 | // The numbers show a dramatic difference, with rubex being nearly 400 3 | // times slower than the native go libraries. Unfortunately for us, 4 | // the native go libraries have a different regex behavior than rubex, 5 | // so we'll have to hack at it a bit to fit our needs if we decide to use it. 6 | // (which we should, I mean, come on, 400 times faster? That's mad wins.) 
7 | 8 | package main 9 | 10 | import "fmt" 11 | import re "github.com/moovweb/rubex" 12 | import "time" 13 | import "regexp" 14 | import "runtime" 15 | import "os" 16 | import "strconv" 17 | import "sync" 18 | 19 | var mu sync.Mutex 20 | var count = 0 21 | var re1 []Matcher 22 | var re2 []Matcher 23 | 24 | const NUM = 100 25 | const NNN = 1000 26 | const CCC = 100000 27 | 28 | var STR = "abcdabc" 29 | 30 | type Matcher interface { 31 | MatchString(string) bool 32 | } 33 | 34 | type Task struct { 35 | str string 36 | m Matcher 37 | t time.Time 38 | } 39 | 40 | var TaskChann chan *Task 41 | 42 | func init() { 43 | re1 = make([]Matcher, NUM) 44 | re2 = make([]Matcher, NUM) 45 | for i := 0; i < NUM; i++ { 46 | re1[i] = regexp.MustCompile("[a-c]*$") 47 | re2[i] = re.MustCompile("[a-c]*$") 48 | } 49 | TaskChann = make(chan *Task, 100) 50 | for i := 0; i < 10; i++ { 51 | STR += STR 52 | } 53 | fmt.Println("len:", len(STR)) 54 | } 55 | 56 | func render_pages(name string, marray []Matcher, num_routines, num_renders int) { 57 | for i := 0; i < num_routines; i++ { 58 | m := marray[i] 59 | go func() { 60 | runtime.LockOSThread() 61 | for j := 0; j < num_renders; j++ { 62 | var totalDuration int64 = 0 63 | for i := 0; i < NNN; i++ { 64 | t := time.Now() 65 | mu.Lock() 66 | if count > CCC { 67 | mu.Unlock() 68 | return 69 | } 70 | count += 1 71 | m.MatchString(STR) 72 | mu.Unlock() 73 | totalDuration += time.Since(t).Nanoseconds() 74 | } 75 | fmt.Println(name+"-average: ", totalDuration/int64(1000*NNN), "us") 76 | } 77 | }() 78 | } 79 | } 80 | 81 | func render_pages2(name string, marray []Matcher, num_routines, num_renders int) { 82 | go func() { 83 | for i := 0; i < CCC; i++ { 84 | t := &Task{str: STR, m: marray[0], t: time.Now()} 85 | TaskChann <- t 86 | } 87 | }() 88 | for i := 0; i < num_routines; i++ { 89 | m := marray[i] 90 | go func() { 91 | runtime.LockOSThread() 92 | for j := 0; j < num_renders; j++ { 93 | var totalDuration int64 = 0 94 | for i := 0; i < NNN; i++ { 95 | 
task := <-TaskChann 96 | m.MatchString(task.str) 97 | totalDuration += time.Since(task.t).Nanoseconds() 98 | } 99 | fmt.Println(name+"-average: ", totalDuration/int64(1000*NNN), "us") 100 | } 101 | }() 102 | } 103 | } 104 | 105 | func main() { 106 | cpu, _ := strconv.Atoi(os.Args[1]) 107 | lib := os.Args[2] 108 | method := os.Args[3] 109 | fmt.Println("using CPUs:", cpu) 110 | runtime.GOMAXPROCS(cpu) 111 | num_routines := 6 112 | num_renders := 20 113 | 114 | if method == "chan" { 115 | if lib == "rubex" { 116 | render_pages2("rubex", re2, num_routines, num_renders) 117 | } else { 118 | render_pages2("regexp", re1, num_routines, num_renders) 119 | } 120 | } else { 121 | if lib == "rubex" { 122 | render_pages("rubex", re2, num_routines, num_renders) 123 | } else { 124 | render_pages("regexp", re1, num_routines, num_renders) 125 | } 126 | 127 | } 128 | d, _ := time.ParseDuration("5s") 129 | for i := 0; i < 100; i++ { 130 | fmt.Println("goroutine:", runtime.NumGoroutine()) 131 | time.Sleep(d) 132 | 133 | } 134 | fmt.Println("Done") 135 | } 136 | -------------------------------------------------------------------------------- /chelper.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #ifdef BENCHMARK_CHELP 5 | #include 6 | #endif 7 | #include "chelper.h" 8 | 9 | int NewOnigRegex( char *pattern, int pattern_length, int option, 10 | OnigRegex *regex, OnigRegion **region, OnigErrorInfo **error_info, char **error_buffer) { 11 | int ret = ONIG_NORMAL; 12 | int error_msg_len = 0; 13 | 14 | OnigUChar *pattern_start = (OnigUChar *) pattern; 15 | OnigUChar *pattern_end = (OnigUChar *) (pattern + pattern_length); 16 | 17 | *error_info = (OnigErrorInfo *) malloc(sizeof(OnigErrorInfo)); 18 | memset(*error_info, 0, sizeof(OnigErrorInfo)); 19 | 20 | *error_buffer = (char*) malloc(ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char)); 21 | 22 | memset(*error_buffer, 0, ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char)); 23 | 
24 | *region = onig_region_new(); 25 | 26 | ret = onig_new_default(regex, pattern_start, pattern_end, (OnigOptionType)(option), *error_info); 27 | 28 | if (ret != ONIG_NORMAL) { 29 | error_msg_len = onig_error_code_to_str((unsigned char*)(*error_buffer), ret, *error_info); 30 | if (error_msg_len >= ONIG_MAX_ERROR_MESSAGE_LEN) { 31 | error_msg_len = ONIG_MAX_ERROR_MESSAGE_LEN - 1; 32 | } 33 | (*error_buffer)[error_msg_len] = '\0'; 34 | } 35 | return ret; 36 | } 37 | 38 | int SearchOnigRegex( void *str, int str_length, int offset, int option, 39 | OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures) { 40 | int ret = ONIG_MISMATCH; 41 | int error_msg_len = 0; 42 | #ifdef BENCHMARK_CHELP 43 | struct timeval tim1, tim2; 44 | long t; 45 | #endif 46 | 47 | OnigUChar *str_start = (OnigUChar *) str; 48 | OnigUChar *str_end = (OnigUChar *) (str_start + str_length); 49 | OnigUChar *search_start = (OnigUChar *)(str_start + offset); 50 | OnigUChar *search_end = str_end; 51 | 52 | #ifdef BENCHMARK_CHELP 53 | gettimeofday(&tim1, NULL); 54 | #endif 55 | 56 | ret = onig_search(regex, str_start, str_end, search_start, search_end, region, option); 57 | if (ret < 0 && error_buffer != NULL) { 58 | error_msg_len = onig_error_code_to_str((unsigned char*)(error_buffer), ret, error_info); 59 | if (error_msg_len >= ONIG_MAX_ERROR_MESSAGE_LEN) { 60 | error_msg_len = ONIG_MAX_ERROR_MESSAGE_LEN - 1; 61 | } 62 | error_buffer[error_msg_len] = '\0'; 63 | } 64 | else if (captures != NULL) { 65 | int i; 66 | int count = 0; 67 | for (i = 0; i < region->num_regs; i++) { 68 | captures[2*count] = region->beg[i]; 69 | captures[2*count+1] = region->end[i]; 70 | count ++; 71 | } 72 | *numCaptures = count; 73 | } 74 | 75 | #ifdef BENCHMARK_CHELP 76 | gettimeofday(&tim2, NULL); 77 | t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; 78 | printf("%ld microseconds elapsed\n", t); 79 | #endif 80 | return ret; 81 | } 82 | 83 | 
int MatchOnigRegex(void *str, int str_length, int offset, int option, 84 | OnigRegex regex, OnigRegion *region) { 85 | int ret = ONIG_MISMATCH; 86 | int error_msg_len = 0; 87 | #ifdef BENCHMARK_CHELP 88 | struct timeval tim1, tim2; 89 | long t; 90 | #endif 91 | 92 | OnigUChar *str_start = (OnigUChar *) str; 93 | OnigUChar *str_end = (OnigUChar *) (str_start + str_length); 94 | OnigUChar *search_start = (OnigUChar *)(str_start + offset); 95 | 96 | #ifdef BENCHMARK_CHELP 97 | gettimeofday(&tim1, NULL); 98 | #endif 99 | ret = onig_match(regex, str_start, str_end, search_start, region, option); 100 | #ifdef BENCHMARK_CHELP 101 | gettimeofday(&tim2, NULL); 102 | t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; 103 | printf("%ld microseconds elapsed\n", t); 104 | #endif 105 | return ret; 106 | } 107 | 108 | int LookupOnigCaptureByName(char *name, int name_length, 109 | OnigRegex regex, OnigRegion *region) { 110 | int ret = ONIGERR_UNDEFINED_NAME_REFERENCE; 111 | #ifdef BENCHMARK_CHELP 112 | struct timeval tim1, tim2; 113 | long t; 114 | #endif 115 | OnigUChar *name_start = (OnigUChar *) name; 116 | OnigUChar *name_end = (OnigUChar *) (name_start + name_length); 117 | #ifdef BENCHMARK_CHELP 118 | gettimeofday(&tim1, NULL); 119 | #endif 120 | ret = onig_name_to_backref_number(regex, name_start, name_end, region); 121 | #ifdef BENCHMARK_CHELP 122 | gettimeofday(&tim2, NULL); 123 | t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; 124 | printf("%ld microseconds elapsed\n", t); 125 | #endif 126 | return ret; 127 | } 128 | 129 | typedef struct { 130 | char *nameBuffer; 131 | int bufferOffset; 132 | int bufferSize; 133 | int *numbers; 134 | int numIndex; 135 | } group_info_t; 136 | 137 | int name_callback(const UChar* name, const UChar* name_end, 138 | int ngroup_num, int* group_nums, 139 | regex_t* reg, void* arg) 140 | { 141 | int nameLen, offset, newOffset; 142 | group_info_t *groupInfo; 143 | 144 | groupInfo = (group_info_t*) 
arg; 145 | offset = groupInfo->bufferOffset; 146 | nameLen = name_end - name; 147 | newOffset = offset + nameLen; 148 | 149 | //if there are already names, add a ";" 150 | if (offset > 0) { 151 | newOffset += 1; 152 | } 153 | 154 | if (newOffset <= groupInfo->bufferSize) { 155 | if (offset > 0) { 156 | groupInfo->nameBuffer[offset] = ';'; 157 | offset += 1; 158 | } 159 | strncpy(&groupInfo->nameBuffer[offset], name, nameLen); 160 | } 161 | groupInfo->bufferOffset = newOffset; 162 | if (ngroup_num > 0) { 163 | groupInfo->numbers[groupInfo->numIndex] = group_nums[ngroup_num-1]; 164 | } else { 165 | groupInfo->numbers[groupInfo->numIndex] = -1; 166 | } 167 | groupInfo->numIndex += 1; 168 | return 0; /* 0: continue */ 169 | } 170 | 171 | int GetCaptureNames(OnigRegex reg, void *buffer, int bufferSize, int* groupNumbers) { 172 | int ret; 173 | group_info_t groupInfo; 174 | groupInfo.nameBuffer = (char*)buffer; 175 | groupInfo.bufferOffset = 0; 176 | groupInfo.bufferSize = bufferSize; 177 | groupInfo.numbers = groupNumbers; 178 | groupInfo.numIndex = 0; 179 | onig_foreach_name(reg, name_callback, (void* )&groupInfo); 180 | return groupInfo.bufferOffset; 181 | } 182 | 183 | -------------------------------------------------------------------------------- /chelper.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | extern int NewOnigRegex( char *pattern, int pattern_length, int option, 4 | OnigRegex *regex, OnigRegion **region, OnigErrorInfo **error_info, char **error_buffer); 5 | 6 | extern int SearchOnigRegex( void *str, int str_length, int offset, int option, 7 | OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures); 8 | 9 | extern int MatchOnigRegex( void *str, int str_length, int offset, int option, 10 | OnigRegex regex, OnigRegion *region); 11 | 12 | extern int LookupOnigCaptureByName(char *name, int name_length, OnigRegex regex, OnigRegion *region); 13 
| 14 | extern int GetCaptureNames(OnigRegex regex, void *buffer, int bufferSize, int* groupNumbers); 15 | -------------------------------------------------------------------------------- /constants.go: -------------------------------------------------------------------------------- 1 | package rubex 2 | 3 | const ( 4 | ONIG_OPTION_DEFAULT = ONIG_OPTION_NONE 5 | /* options */ 6 | ONIG_OPTION_NONE = 0 7 | ONIG_OPTION_IGNORECASE = 1 8 | ONIG_OPTION_EXTEND = (ONIG_OPTION_IGNORECASE << 1) 9 | ONIG_OPTION_MULTILINE = (ONIG_OPTION_EXTEND << 1) 10 | ONIG_OPTION_SINGLELINE = (ONIG_OPTION_MULTILINE << 1) 11 | ONIG_OPTION_FIND_LONGEST = (ONIG_OPTION_SINGLELINE << 1) 12 | ONIG_OPTION_FIND_NOT_EMPTY = (ONIG_OPTION_FIND_LONGEST << 1) 13 | ONIG_OPTION_NEGATE_SINGLELINE = (ONIG_OPTION_FIND_NOT_EMPTY << 1) 14 | ONIG_OPTION_DONT_CAPTURE_GROUP = (ONIG_OPTION_NEGATE_SINGLELINE << 1) 15 | ONIG_OPTION_CAPTURE_GROUP = (ONIG_OPTION_DONT_CAPTURE_GROUP << 1) 16 | /* options (search time) */ 17 | ONIG_OPTION_NOTBOL = (ONIG_OPTION_CAPTURE_GROUP << 1) 18 | ONIG_OPTION_NOTEOL = (ONIG_OPTION_NOTBOL << 1) 19 | ONIG_OPTION_POSIX_REGION = (ONIG_OPTION_NOTEOL << 1) 20 | ONIG_OPTION_MAXBIT = ONIG_OPTION_POSIX_REGION /* limit */ 21 | 22 | ONIG_NORMAL = 0 23 | ONIG_MISMATCH = -1 24 | 25 | ONIG_MISMATCH_STR = "mismatch" 26 | ONIGERR_UNDEFINED_NAME_REFERENCE = -217 27 | ) 28 | -------------------------------------------------------------------------------- /find_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2010 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package rubex 6 | 7 | import ( 8 | "fmt" 9 | "strings" 10 | "testing" 11 | ) 12 | 13 | // For each pattern/text pair, what is the expected output of each function? 
14 | // We can derive the textual results from the indexed results, the non-submatch 15 | // results from the submatched results, the single results from the 'all' results, 16 | // and the byte results from the string results. Therefore the table includes 17 | // only the FindAllStringSubmatchIndex result. 18 | type FindTest struct { 19 | pat string 20 | text string 21 | matches [][]int 22 | } 23 | 24 | func (t FindTest) String() string { 25 | return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text) 26 | } 27 | 28 | var findTests = []FindTest{ 29 | {``, ``, build(1, 0, 0)}, 30 | {`^abcdefg`, "abcdefg", build(1, 0, 7)}, 31 | {`a+`, "baaab", build(1, 1, 4)}, 32 | {"abcd..", "abcdef", build(1, 0, 6)}, 33 | {`a`, "a", build(1, 0, 1)}, 34 | {`x`, "y", nil}, 35 | {`b`, "abc", build(1, 1, 2)}, 36 | {`.`, "a", build(1, 0, 1)}, 37 | {`.*`, "abcdef", build(2, 0, 6, 6, 6)}, 38 | {`^`, "abcde", build(1, 0, 0)}, 39 | {`$`, "abcde", build(1, 5, 5)}, 40 | {`^abcd$`, "abcd", build(1, 0, 4)}, 41 | {`^bcd'`, "abcdef", nil}, 42 | {`^abcd$`, "abcde", nil}, 43 | {`a+`, "baaab", build(1, 1, 4)}, 44 | {`a*`, "baaab", build(4, 0, 0, 1, 4, 4, 4, 5, 5)}, 45 | {`[a-z]+`, "abcd", build(1, 0, 4)}, 46 | {`[^a-z]+`, "ab1234cd", build(1, 2, 6)}, 47 | {`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)}, 48 | {`[^\n]+`, "abcd\n", build(1, 0, 4)}, 49 | {`[日本語]+`, "日本語日本語", build(1, 0, 18)}, 50 | {`日本語+`, "日本語", build(1, 0, 9)}, 51 | {`a*`, "日本語", build(4, 0, 0, 3, 3, 6, 6, 9, 9)}, 52 | {`日本語+`, "日本語語語語", build(1, 0, 18)}, 53 | {`()`, "", build(1, 0, 0, 0, 0)}, 54 | {`(a)`, "a", build(1, 0, 1, 0, 1)}, 55 | {`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)}, 56 | {`(.*)`, "", build(1, 0, 0, 0, 0)}, 57 | {`(.*)`, "abcd", build(2, 0, 4, 0, 4, 4, 4, 4, 4)}, 58 | {`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)}, 59 | {`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)}, 60 | {`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)}, 61 | {`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)}, 
62 | {"\a\b\f\n\r\t\v", "\a\b\f\n\r\t\v", build(1, 0, 7)}, 63 | {`[\a\b\f\n\r\t\v]+`, "\a\b\f\n\r\t\v", build(1, 0, 7)}, 64 | 65 | //{`a*(|(b))c*`, "aacc", build(2, 0, 4, 4, 4)}, 66 | {`(.*).*`, "ab", build(2, 0, 2, 0, 2, 2, 2, 2, 2)}, 67 | {`[.]`, ".", build(1, 0, 1)}, 68 | {`/$`, "/abc/", build(1, 4, 5)}, 69 | {`/$`, "/abc", nil}, 70 | 71 | // multiple matches 72 | {`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)}, 73 | {`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)}, 74 | {`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)}, 75 | {`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)}, 76 | {`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)}, 77 | 78 | // fixed bugs 79 | {`ab$`, "cab", build(1, 1, 3)}, 80 | {`axxb$`, "axxcb", nil}, 81 | {`data`, "daXY data", build(1, 5, 9)}, 82 | {`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)}, 83 | {`zx+`, "zzx", build(1, 1, 3)}, 84 | 85 | // can backslash-escape any punctuation 86 | {`\!\"\#\$\%\&\'\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, 87 | `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, 88 | {`[\!\"\#\$\%\&\'\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`, 89 | `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, 90 | {"\\`", "`", build(1, 0, 1)}, 91 | {"[\\`]+", "`", build(1, 0, 1)}, 92 | 93 | // long set of matches (longer than startSize) 94 | { 95 | ".", 96 | "qwertyuiopasdfghjklzxcvbnm1234567890", 97 | build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 98 | 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 99 | 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 100 | 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36), 101 | }, 102 | } 103 | 104 | // build is a helper to construct a [][]int by extracting n sequences from x. 105 | // This represents n matches with len(x)/n submatches each. 
106 | func build(n int, x ...int) [][]int { 107 | ret := make([][]int, n) 108 | runLength := len(x) / n 109 | j := 0 110 | for i := range ret { 111 | ret[i] = make([]int, runLength) 112 | copy(ret[i], x[j:]) 113 | j += runLength 114 | if j > len(x) { 115 | panic("invalid build entry") 116 | } 117 | } 118 | return ret 119 | } 120 | 121 | // First the simple cases. 122 | 123 | func TestFind(t *testing.T) { 124 | for _, test := range findTests { 125 | re := MustCompile(test.pat) 126 | if re.String() != test.pat { 127 | t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat) 128 | } 129 | result := re.Find([]byte(test.text)) 130 | switch { 131 | case len(test.matches) == 0 && len(result) == 0: 132 | // ok 133 | case test.matches == nil && result != nil: 134 | t.Errorf("expected no match; got one: %s", test) 135 | case test.matches != nil && result == nil: 136 | t.Errorf("expected match; got none: %s", test) 137 | case test.matches != nil && result != nil: 138 | expect := test.text[test.matches[0][0]:test.matches[0][1]] 139 | if expect != string(result) { 140 | t.Errorf("expected %q got %q: %s", expect, result, test) 141 | } 142 | } 143 | } 144 | } 145 | 146 | func TestFindString(t *testing.T) { 147 | for _, test := range findTests { 148 | result := MustCompile(test.pat).FindString(test.text) 149 | switch { 150 | case len(test.matches) == 0 && len(result) == 0: 151 | // ok 152 | case test.matches == nil && result != "": 153 | t.Errorf("expected no match; got one: %s", test) 154 | case test.matches != nil && result == "": 155 | // Tricky because an empty result has two meanings: no match or empty match. 
156 | if test.matches[0][0] != test.matches[0][1] { 157 | t.Errorf("expected match; got none: %s", test) 158 | } 159 | case test.matches != nil && result != "": 160 | expect := test.text[test.matches[0][0]:test.matches[0][1]] 161 | if expect != result { 162 | t.Errorf("expected %q got %q: %s", expect, result, test) 163 | } 164 | } 165 | } 166 | } 167 | 168 | func testFindIndex(test *FindTest, result []int, t *testing.T) { 169 | switch { 170 | case len(test.matches) == 0 && len(result) == 0: 171 | // ok 172 | case test.matches == nil && result != nil: 173 | t.Errorf("expected no match; got one: %s", test) 174 | case test.matches != nil && result == nil: 175 | t.Errorf("expected match; got none: %s", test) 176 | case test.matches != nil && result != nil: 177 | expect := test.matches[0] 178 | if expect[0] != result[0] || expect[1] != result[1] { 179 | t.Errorf("expected %v got %v: %s", expect, result, test) 180 | } 181 | } 182 | } 183 | 184 | func TestFindIndex(t *testing.T) { 185 | for _, test := range findTests { 186 | testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t) 187 | } 188 | } 189 | 190 | func TestFindStringIndex(t *testing.T) { 191 | for _, test := range findTests { 192 | testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t) 193 | } 194 | } 195 | 196 | func TestFindStringContentType(t *testing.T) { 197 | pattern := `text/(.*);\s*charset\s*=\s*(.*)` 198 | regex := MustCompile(pattern) 199 | 200 | data1 := "text/html; charset=utf8" 201 | data2 := "text/;charset=iso-8859-1" 202 | data3 := "image/png" 203 | matches := regex.FindStringSubmatch(data1) 204 | if matches[1] != "html" || matches[2] != "utf8" { 205 | t.Errorf("does not match content-type 1") 206 | } 207 | matches = regex.FindStringSubmatch(data2) 208 | if matches[1] != "" || matches[2] != "iso-8859-1" { 209 | println(matches[1]) 210 | println(matches[2]) 211 | t.Errorf("does not match content-type 2") 212 | } 213 | matches = regex.FindStringSubmatch(data3) 
214 | if len(matches) != 0 { 215 | t.Errorf("does not match content-type 3") 216 | } 217 | } 218 | 219 | func TestFindReaderIndex(t *testing.T) { 220 | for _, test := range findTests { 221 | testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t) 222 | } 223 | } 224 | 225 | // Now come the simple All cases. 226 | 227 | func TestFindAll(t *testing.T) { 228 | for _, test := range findTests { 229 | result := MustCompile(test.pat).FindAll([]byte(test.text), -1) 230 | switch { 231 | case test.matches == nil && result == nil: 232 | // ok 233 | case test.matches == nil && result != nil: 234 | t.Errorf("expected no match; got one: %s", test) 235 | case test.matches != nil && result == nil: 236 | t.Errorf("expected match; got none: %s", test) 237 | case test.matches != nil && result != nil: 238 | if len(test.matches) != len(result) { 239 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 240 | continue 241 | } 242 | for k, e := range test.matches { 243 | expect := test.text[e[0]:e[1]] 244 | if expect != string(result[k]) { 245 | t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test) 246 | } 247 | } 248 | } 249 | } 250 | } 251 | 252 | func TestFindAllString(t *testing.T) { 253 | for _, test := range findTests { 254 | result := MustCompile(test.pat).FindAllString(test.text, -1) 255 | switch { 256 | case test.matches == nil && result == nil: 257 | // ok 258 | case test.matches == nil && result != nil: 259 | t.Errorf("expected no match; got one: %s", test) 260 | case test.matches != nil && result == nil: 261 | t.Errorf("expected match; got none: %s", test) 262 | case test.matches != nil && result != nil: 263 | if len(test.matches) != len(result) { 264 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 265 | continue 266 | } 267 | for k, e := range test.matches { 268 | expect := test.text[e[0]:e[1]] 269 | if expect != result[k] { 270 | t.Errorf("expected %q 
got %q: %s", expect, result, test) 271 | } 272 | } 273 | } 274 | } 275 | } 276 | 277 | func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) { 278 | switch { 279 | case test.matches == nil && result == nil: 280 | // ok 281 | case test.matches == nil && result != nil: 282 | t.Errorf("expected no match; got one: %s", test) 283 | case test.matches != nil && result == nil: 284 | t.Errorf("expected match; got none: %s", test) 285 | case test.matches != nil && result != nil: 286 | if len(test.matches) != len(result) { 287 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 288 | return 289 | } 290 | for k, e := range test.matches { 291 | if e[0] != result[k][0] || e[1] != result[k][1] { 292 | t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test) 293 | } 294 | } 295 | } 296 | } 297 | 298 | func TestFindAllIndex(t *testing.T) { 299 | for _, test := range findTests { 300 | testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t) 301 | } 302 | } 303 | 304 | func TestFindAllStringIndex(t *testing.T) { 305 | for _, test := range findTests { 306 | testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t) 307 | } 308 | } 309 | 310 | // Now come the Submatch cases. 
311 | 312 | func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) { 313 | if len(submatches) != len(result)*2 { 314 | t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) 315 | return 316 | } 317 | for k := 0; k < len(submatches); k += 2 { 318 | if submatches[k] == -1 { 319 | if result[k/2] != nil { 320 | t.Errorf("match %d: expected nil got %q: %s", n, result, test) 321 | } 322 | continue 323 | } 324 | expect := test.text[submatches[k]:submatches[k+1]] 325 | if expect != string(result[k/2]) { 326 | t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) 327 | return 328 | } 329 | } 330 | } 331 | 332 | func TestFindSubmatch(t *testing.T) { 333 | for _, test := range findTests { 334 | result := MustCompile(test.pat).FindSubmatch([]byte(test.text)) 335 | switch { 336 | case test.matches == nil && result == nil: 337 | // ok 338 | case test.matches == nil && result != nil: 339 | t.Errorf("expected no match; got one: %s", test) 340 | case test.matches != nil && result == nil: 341 | t.Errorf("expected match; got none: %s", test) 342 | case test.matches != nil && result != nil: 343 | testSubmatchBytes(&test, 0, test.matches[0], result, t) 344 | } 345 | } 346 | } 347 | 348 | func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) { 349 | if len(submatches) != len(result)*2 { 350 | t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) 351 | return 352 | } 353 | for k := 0; k < len(submatches); k += 2 { 354 | if submatches[k] == -1 { 355 | if result[k/2] != "" { 356 | t.Errorf("match %d: expected nil got %q: %s", n, result, test) 357 | } 358 | continue 359 | } 360 | expect := test.text[submatches[k]:submatches[k+1]] 361 | if expect != result[k/2] { 362 | t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) 363 | return 364 | } 365 | } 366 | } 367 | 368 | func 
TestFindStringSubmatch(t *testing.T) { 369 | for _, test := range findTests { 370 | result := MustCompile(test.pat).FindStringSubmatch(test.text) 371 | switch { 372 | case test.matches == nil && result == nil: 373 | // ok 374 | case test.matches == nil && result != nil: 375 | t.Errorf("expected no match; got one: %s", test) 376 | case test.matches != nil && result == nil: 377 | t.Errorf("expected match; got none: %s", test) 378 | case test.matches != nil && result != nil: 379 | testSubmatchString(&test, 0, test.matches[0], result, t) 380 | } 381 | } 382 | } 383 | 384 | func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) { 385 | if len(expect) != len(result) { 386 | t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test) 387 | return 388 | } 389 | for k, e := range expect { 390 | if e != result[k] { 391 | t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test) 392 | } 393 | } 394 | } 395 | 396 | func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) { 397 | switch { 398 | case test.matches == nil && result == nil: 399 | // ok 400 | case test.matches == nil && result != nil: 401 | t.Errorf("expected no match; got one: %s", test) 402 | case test.matches != nil && result == nil: 403 | t.Errorf("expected match; got none: %s", test) 404 | case test.matches != nil && result != nil: 405 | testSubmatchIndices(test, 0, test.matches[0], result, t) 406 | } 407 | } 408 | 409 | func TestFindSubmatchIndex(t *testing.T) { 410 | for _, test := range findTests { 411 | testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t) 412 | } 413 | } 414 | 415 | func TestFindStringSubmatchIndex(t *testing.T) { 416 | for _, test := range findTests { 417 | testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t) 418 | } 419 | } 420 | 421 | func TestFindReaderSubmatchIndex(t *testing.T) { 422 | for _, test := range 
findTests { 423 | testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t) 424 | } 425 | } 426 | 427 | // Now come the monster AllSubmatch cases. 428 | 429 | func TestFindAllSubmatch(t *testing.T) { 430 | for _, test := range findTests { 431 | result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1) 432 | switch { 433 | case test.matches == nil && result == nil: 434 | // ok 435 | case test.matches == nil && result != nil: 436 | t.Errorf("expected no match; got one: %s", test) 437 | case test.matches != nil && result == nil: 438 | t.Errorf("expected match; got none: %s", test) 439 | case len(test.matches) != len(result): 440 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 441 | case test.matches != nil && result != nil: 442 | for k, match := range test.matches { 443 | testSubmatchBytes(&test, k, match, result[k], t) 444 | } 445 | } 446 | } 447 | } 448 | 449 | func TestFindAllStringSubmatch(t *testing.T) { 450 | for _, test := range findTests { 451 | result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1) 452 | switch { 453 | case test.matches == nil && result == nil: 454 | // ok 455 | case test.matches == nil && result != nil: 456 | t.Errorf("expected no match; got one: %s", test) 457 | case test.matches != nil && result == nil: 458 | t.Errorf("expected match; got none: %s", test) 459 | case len(test.matches) != len(result): 460 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 461 | case test.matches != nil && result != nil: 462 | for k, match := range test.matches { 463 | testSubmatchString(&test, k, match, result[k], t) 464 | } 465 | } 466 | } 467 | } 468 | 469 | func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) { 470 | switch { 471 | case test.matches == nil && result == nil: 472 | // ok 473 | case test.matches == nil && result != nil: 474 | t.Errorf("expected no match; got one: %s", test) 
475 | case test.matches != nil && result == nil: 476 | t.Errorf("expected match; got none: %s", test) 477 | case len(test.matches) != len(result): 478 | t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) 479 | case test.matches != nil && result != nil: 480 | for k, match := range test.matches { 481 | testSubmatchIndices(test, k, match, result[k], t) 482 | } 483 | } 484 | } 485 | 486 | func TestFindAllSubmatchIndex(t *testing.T) { 487 | for _, test := range findTests { 488 | testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t) 489 | } 490 | } 491 | 492 | func TestFindAllStringSubmatchIndex(t *testing.T) { 493 | for _, test := range findTests { 494 | testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t) 495 | } 496 | } 497 | -------------------------------------------------------------------------------- /quotemeta.go: -------------------------------------------------------------------------------- 1 | // Use of this source code is governed by a BSD-style 2 | // license that can be found in the LICENSE file. 3 | 4 | // Package regexp implements a simple regular expression library. 5 | 6 | // QuoteMeta func is copied here to avoid linking the entire Regexp library. 7 | 8 | package rubex 9 | 10 | func special(c int) bool { 11 | for _, r := range `\.+*?()|[]^$` { 12 | if c == int(r) { 13 | return true 14 | } 15 | } 16 | return false 17 | } 18 | 19 | // QuoteMeta returns a string that quotes all regular expression metacharacters 20 | // inside the argument text; the returned string is a regular expression matching 21 | // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. 22 | func QuoteMeta(s string) string { 23 | b := make([]byte, 2*len(s)) 24 | 25 | // A byte loop is correct because all metacharacters are ASCII. 
26 | j := 0 27 | for i := 0; i < len(s); i++ { 28 | if special(int(s[i])) { 29 | b[j] = '\\' 30 | j++ 31 | } 32 | b[j] = s[i] 33 | j++ 34 | } 35 | return string(b[0:j]) 36 | } 37 | -------------------------------------------------------------------------------- /regex.go: -------------------------------------------------------------------------------- 1 | package rubex 2 | 3 | /* 4 | #cgo CFLAGS: -I${SRCDIR}/../../../../../clibs/include 5 | #cgo LDFLAGS: -L${SRCDIR}/../../../../../clibs/lib -lonig 6 | #include 7 | #include 8 | #include "chelper.h" 9 | */ 10 | import "C" 11 | 12 | import ( 13 | "bytes" 14 | "errors" 15 | "fmt" 16 | "io" 17 | "log" 18 | //"runtime" 19 | "strconv" 20 | "sync" 21 | "unicode/utf8" 22 | "unsafe" 23 | ) 24 | 25 | type strRange []int 26 | 27 | const numMatchStartSize = 4 28 | const numReadBufferStartSize = 256 29 | 30 | var mutex sync.Mutex 31 | 32 | type MatchData struct { 33 | count int 34 | indexes [][]int32 35 | } 36 | 37 | type NamedGroupInfo map[string]int 38 | 39 | type Regexp struct { 40 | pattern string 41 | regex C.OnigRegex 42 | region *C.OnigRegion 43 | errorInfo *C.OnigErrorInfo 44 | errorBuf *C.char 45 | matchData *MatchData 46 | namedGroupInfo NamedGroupInfo 47 | } 48 | 49 | func NewRegexp(pattern string, option int) (re *Regexp, err error) { 50 | re = &Regexp{pattern: pattern} 51 | patternCharPtr := C.CString(pattern) 52 | defer C.free(unsafe.Pointer(patternCharPtr)) 53 | 54 | mutex.Lock() 55 | defer mutex.Unlock() 56 | error_code := C.NewOnigRegex(patternCharPtr, C.int(len(pattern)), C.int(option), &re.regex, &re.region, &re.errorInfo, &re.errorBuf) 57 | if error_code != C.ONIG_NORMAL { 58 | err = errors.New(C.GoString(re.errorBuf)) 59 | } else { 60 | err = nil 61 | numCapturesInPattern := int(C.onig_number_of_captures(re.regex)) + 1 62 | re.matchData = &MatchData{} 63 | re.matchData.indexes = make([][]int32, numMatchStartSize) 64 | for i := 0; i < numMatchStartSize; i++ { 65 | re.matchData.indexes[i] = make([]int32, 
numCapturesInPattern*2) 66 | } 67 | re.namedGroupInfo = re.getNamedGroupInfo() 68 | //runtime.SetFinalizer(re, (*Regexp).Free) 69 | } 70 | return re, err 71 | } 72 | 73 | func Compile(str string) (*Regexp, error) { 74 | return NewRegexp(str, ONIG_OPTION_DEFAULT) 75 | } 76 | 77 | func MustCompile(str string) *Regexp { 78 | regexp, error := NewRegexp(str, ONIG_OPTION_DEFAULT) 79 | if error != nil { 80 | panic("regexp: compiling " + str + ": " + error.Error()) 81 | } 82 | return regexp 83 | } 84 | 85 | func CompileWithOption(str string, option int) (*Regexp, error) { 86 | return NewRegexp(str, option) 87 | } 88 | 89 | func MustCompileWithOption(str string, option int) *Regexp { 90 | regexp, error := NewRegexp(str, option) 91 | if error != nil { 92 | panic("regexp: compiling " + str + ": " + error.Error()) 93 | } 94 | return regexp 95 | } 96 | 97 | func (re *Regexp) Free() { 98 | mutex.Lock() 99 | if re.regex != nil { 100 | C.onig_free(re.regex) 101 | re.regex = nil 102 | } 103 | if re.region != nil { 104 | C.onig_region_free(re.region, 1) 105 | re.region = nil 106 | } 107 | mutex.Unlock() 108 | if re.errorInfo != nil { 109 | C.free(unsafe.Pointer(re.errorInfo)) 110 | re.errorInfo = nil 111 | } 112 | if re.errorBuf != nil { 113 | C.free(unsafe.Pointer(re.errorBuf)) 114 | re.errorBuf = nil 115 | } 116 | } 117 | 118 | func (re *Regexp) getNamedGroupInfo() (namedGroupInfo NamedGroupInfo) { 119 | numNamedGroups := int(C.onig_number_of_names(re.regex)) 120 | //when any named capture exisits, there is no numbered capture even if there are unnamed captures 121 | if numNamedGroups > 0 { 122 | namedGroupInfo = make(map[string]int) 123 | //try to get the names 124 | bufferSize := len(re.pattern) * 2 125 | nameBuffer := make([]byte, bufferSize) 126 | groupNumbers := make([]int32, numNamedGroups) 127 | bufferPtr := unsafe.Pointer(&nameBuffer[0]) 128 | numbersPtr := unsafe.Pointer(&groupNumbers[0]) 129 | length := int(C.GetCaptureNames(re.regex, bufferPtr, (C.int)(bufferSize), 
(*C.int)(numbersPtr))) 130 | if length > 0 { 131 | namesAsBytes := bytes.Split(nameBuffer[:length], ([]byte)(";")) 132 | if len(namesAsBytes) != numNamedGroups { 133 | log.Fatalf("the number of named groups (%d) does not match the number names found (%d)\n", numNamedGroups, len(namesAsBytes)) 134 | } 135 | for i, nameAsBytes := range namesAsBytes { 136 | name := string(nameAsBytes) 137 | namedGroupInfo[name] = int(groupNumbers[i]) 138 | } 139 | } else { 140 | log.Fatalf("could not get the capture group names from %q", re.String()) 141 | } 142 | } 143 | return 144 | } 145 | 146 | func (re *Regexp) groupNameToId(name string) (id int) { 147 | if re.namedGroupInfo == nil { 148 | id = ONIGERR_UNDEFINED_NAME_REFERENCE 149 | } else { 150 | id = re.namedGroupInfo[name] 151 | } 152 | return 153 | } 154 | 155 | func (re *Regexp) processMatch(numCaptures int) (match []int32) { 156 | if numCaptures <= 0 { 157 | panic("cannot have 0 captures when processing a match") 158 | } 159 | matchData := re.matchData 160 | return matchData.indexes[matchData.count][:numCaptures*2] 161 | } 162 | 163 | func (re *Regexp) ClearMatchData() { 164 | matchData := re.matchData 165 | matchData.count = 0 166 | } 167 | 168 | func (re *Regexp) find(b []byte, n int, offset int) (match []int) { 169 | if n == 0 { 170 | b = []byte{0} 171 | } 172 | ptr := unsafe.Pointer(&b[0]) 173 | matchData := re.matchData 174 | capturesPtr := unsafe.Pointer(&(matchData.indexes[matchData.count][0])) 175 | numCaptures := int32(0) 176 | numCapturesPtr := unsafe.Pointer(&numCaptures) 177 | pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(capturesPtr), (*C.int)(numCapturesPtr))) 178 | if pos >= 0 { 179 | if numCaptures <= 0 { 180 | panic("cannot have 0 captures when processing a match") 181 | } 182 | match2 := matchData.indexes[matchData.count][:numCaptures*2] 183 | match = make([]int, len(match2)) 184 | for i := range match2 { 
185 | match[i] = int(match2[i]) 186 | } 187 | numCapturesInPattern := int32(C.onig_number_of_captures(re.regex)) + 1 188 | if numCapturesInPattern != numCaptures { 189 | log.Fatalf("expected %d captures but got %d\n", numCapturesInPattern, numCaptures) 190 | } 191 | } 192 | return 193 | } 194 | 195 | func getCapture(b []byte, beg int, end int) []byte { 196 | if beg < 0 || end < 0 { 197 | return nil 198 | } 199 | return b[beg:end] 200 | } 201 | 202 | func (re *Regexp) match(b []byte, n int, offset int) bool { 203 | re.ClearMatchData() 204 | if n == 0 { 205 | b = []byte{0} 206 | } 207 | ptr := unsafe.Pointer(&b[0]) 208 | pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(nil), (*C.int)(nil))) 209 | return pos >= 0 210 | } 211 | 212 | func (re *Regexp) findAll(b []byte, n int) (matches [][]int) { 213 | re.ClearMatchData() 214 | 215 | if n < 0 { 216 | n = len(b) 217 | } 218 | matchData := re.matchData 219 | offset := 0 220 | for offset <= n { 221 | if matchData.count >= len(matchData.indexes) { 222 | length := len(matchData.indexes[0]) 223 | matchData.indexes = append(matchData.indexes, make([]int32, length)) 224 | } 225 | if match := re.find(b, n, offset); len(match) > 0 { 226 | matchData.count += 1 227 | //move offset to the ending index of the current match and prepare to find the next non-overlapping match 228 | offset = match[1] 229 | //if match[0] == match[1], it means the current match does not advance the search. we need to exit the loop to avoid getting stuck here. 
230 | if match[0] == match[1] { 231 | if offset < n && offset >= 0 { 232 | //there are more bytes, so move offset by a word 233 | _, width := utf8.DecodeRune(b[offset:]) 234 | offset += width 235 | } else { 236 | //search is over, exit loop 237 | break 238 | } 239 | } 240 | } else { 241 | break 242 | } 243 | } 244 | matches2 := matchData.indexes[:matchData.count] 245 | matches = make([][]int, len(matches2)) 246 | for i, v := range matches2 { 247 | matches[i] = make([]int, len(v)) 248 | for j, v2 := range v { 249 | matches[i][j] = int(v2) 250 | } 251 | } 252 | return 253 | } 254 | 255 | func (re *Regexp) FindIndex(b []byte) []int { 256 | re.ClearMatchData() 257 | match := re.find(b, len(b), 0) 258 | if len(match) == 0 { 259 | return nil 260 | } 261 | return match[:2] 262 | } 263 | 264 | func (re *Regexp) Find(b []byte) []byte { 265 | loc := re.FindIndex(b) 266 | if loc == nil { 267 | return nil 268 | } 269 | return getCapture(b, loc[0], loc[1]) 270 | } 271 | 272 | func (re *Regexp) FindString(s string) string { 273 | b := []byte(s) 274 | mb := re.Find(b) 275 | if mb == nil { 276 | return "" 277 | } 278 | return string(mb) 279 | } 280 | 281 | func (re *Regexp) FindStringIndex(s string) []int { 282 | b := []byte(s) 283 | return re.FindIndex(b) 284 | } 285 | 286 | func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 287 | matches := re.findAll(b, n) 288 | if len(matches) == 0 { 289 | return nil 290 | } 291 | return matches 292 | } 293 | 294 | func (re *Regexp) FindAll(b []byte, n int) [][]byte { 295 | matches := re.FindAllIndex(b, n) 296 | if matches == nil { 297 | return nil 298 | } 299 | matchBytes := make([][]byte, 0, len(matches)) 300 | for _, match := range matches { 301 | matchBytes = append(matchBytes, getCapture(b, match[0], match[1])) 302 | } 303 | return matchBytes 304 | } 305 | 306 | func (re *Regexp) FindAllString(s string, n int) []string { 307 | b := []byte(s) 308 | matches := re.FindAllIndex(b, n) 309 | if matches == nil { 310 | return nil 311 | } 
312 | matchStrings := make([]string, 0, len(matches)) 313 | for _, match := range matches { 314 | m := getCapture(b, match[0], match[1]) 315 | if m == nil { 316 | matchStrings = append(matchStrings, "") 317 | } else { 318 | matchStrings = append(matchStrings, string(m)) 319 | } 320 | } 321 | return matchStrings 322 | 323 | } 324 | 325 | func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 326 | b := []byte(s) 327 | return re.FindAllIndex(b, n) 328 | } 329 | 330 | func (re *Regexp) findSubmatchIndex(b []byte) (match []int) { 331 | re.ClearMatchData() 332 | match = re.find(b, len(b), 0) 333 | return 334 | } 335 | 336 | func (re *Regexp) FindSubmatchIndex(b []byte) []int { 337 | match := re.findSubmatchIndex(b) 338 | if len(match) == 0 { 339 | return nil 340 | } 341 | return match 342 | } 343 | 344 | func (re *Regexp) FindSubmatch(b []byte) [][]byte { 345 | match := re.findSubmatchIndex(b) 346 | if match == nil { 347 | return nil 348 | } 349 | length := len(match) / 2 350 | if length == 0 { 351 | return nil 352 | } 353 | results := make([][]byte, 0, length) 354 | for i := 0; i < length; i++ { 355 | results = append(results, getCapture(b, match[2*i], match[2*i+1])) 356 | } 357 | return results 358 | } 359 | 360 | func (re *Regexp) FindStringSubmatch(s string) []string { 361 | b := []byte(s) 362 | match := re.findSubmatchIndex(b) 363 | if match == nil { 364 | return nil 365 | } 366 | length := len(match) / 2 367 | if length == 0 { 368 | return nil 369 | } 370 | 371 | results := make([]string, 0, length) 372 | for i := 0; i < length; i++ { 373 | cap := getCapture(b, match[2*i], match[2*i+1]) 374 | if cap == nil { 375 | results = append(results, "") 376 | } else { 377 | results = append(results, string(cap)) 378 | } 379 | } 380 | return results 381 | } 382 | 383 | func (re *Regexp) FindStringSubmatchIndex(s string) []int { 384 | b := []byte(s) 385 | return re.FindSubmatchIndex(b) 386 | } 387 | 388 | func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) 
[][]int { 389 | matches := re.findAll(b, n) 390 | if len(matches) == 0 { 391 | return nil 392 | } 393 | return matches 394 | } 395 | 396 | func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 397 | matches := re.findAll(b, n) 398 | if len(matches) == 0 { 399 | return nil 400 | } 401 | allCapturedBytes := make([][][]byte, 0, len(matches)) 402 | for _, match := range matches { 403 | length := len(match) / 2 404 | capturedBytes := make([][]byte, 0, length) 405 | for i := 0; i < length; i++ { 406 | capturedBytes = append(capturedBytes, getCapture(b, match[2*i], match[2*i+1])) 407 | } 408 | allCapturedBytes = append(allCapturedBytes, capturedBytes) 409 | } 410 | 411 | return allCapturedBytes 412 | } 413 | 414 | func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 415 | b := []byte(s) 416 | matches := re.findAll(b, n) 417 | if len(matches) == 0 { 418 | return nil 419 | } 420 | allCapturedStrings := make([][]string, 0, len(matches)) 421 | for _, match := range matches { 422 | length := len(match) / 2 423 | capturedStrings := make([]string, 0, length) 424 | for i := 0; i < length; i++ { 425 | cap := getCapture(b, match[2*i], match[2*i+1]) 426 | if cap == nil { 427 | capturedStrings = append(capturedStrings, "") 428 | } else { 429 | capturedStrings = append(capturedStrings, string(cap)) 430 | } 431 | } 432 | allCapturedStrings = append(allCapturedStrings, capturedStrings) 433 | } 434 | return allCapturedStrings 435 | } 436 | 437 | func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 438 | b := []byte(s) 439 | return re.FindAllSubmatchIndex(b, n) 440 | } 441 | 442 | func (re *Regexp) Match(b []byte) bool { 443 | return re.match(b, len(b), 0) 444 | } 445 | 446 | func (re *Regexp) MatchString(s string) bool { 447 | b := []byte(s) 448 | return re.Match(b) 449 | } 450 | 451 | func (re *Regexp) NumSubexp() int { 452 | return (int)(C.onig_number_of_captures(re.regex)) 453 | } 454 | 455 | func (re *Regexp) getNamedCapture(name []byte, 
// fillCapturedValues expands the capture references in the replacement
// template repl and returns the expanded bytes. The second parameter is
// unused; it exists so the function fits the callback signature that
// replaceAll expects.
//
// Recognised references:
//   - \1 through \9 are replaced by capturedBytes["1"] .. capturedBytes["9"];
//   - \k<name> is replaced by capturedBytes["name"].
//
// A reference with no entry in capturedBytes expands to nothing. Any other
// backslash sequence (including \0 and \\) is copied through unchanged,
// backslash included.
//
// Two inputs used to lose text silently and are now copied through verbatim:
// a lone backslash at the very end of repl, and a \k<name reference that is
// never closed with '>'.
func fillCapturedValues(repl []byte, _ []byte, capturedBytes map[string][]byte) []byte {
	replLen := len(repl)
	newRepl := make([]byte, 0, replLen*3)
	inEscapeMode := false
	inGroupNameMode := false
	groupStart := 0 // index of the backslash that opened the current \k<name>
	groupName := make([]byte, 0, replLen)
	for index := 0; index < replLen; index++ {
		ch := repl[index]
		switch {
		case inGroupNameMode && ch == '<':
			// A stray '<' inside a group name is ignored.
		case inGroupNameMode && ch == '>':
			// End of \k<name>: substitute the named capture.
			inGroupNameMode = false
			newRepl = append(newRepl, capturedBytes[string(groupName)]...)
			groupName = groupName[:0] // reset for the next reference
		case inGroupNameMode:
			groupName = append(groupName, ch)
		case inEscapeMode && '1' <= ch && ch <= '9':
			// Numbered reference; captures are keyed by the digit as a string.
			newRepl = append(newRepl, capturedBytes[string(ch)]...)
		case inEscapeMode && ch == 'k' && index+1 < replLen && repl[index+1] == '<':
			inGroupNameMode = true
			inEscapeMode = false
			groupStart = index - 1
			index++ // skip the '<' that follows the 'k'
		case inEscapeMode:
			// Not a reference: keep the escape sequence as written.
			newRepl = append(newRepl, '\\', ch)
		case ch != '\\':
			newRepl = append(newRepl, ch)
		}
		// A backslash arms escape mode for the next byte only; whatever byte
		// follows disarms it again.
		if ch == '\\' || inEscapeMode {
			inEscapeMode = !inEscapeMode
		}
	}

	switch {
	case inGroupNameMode:
		// Unterminated \k<name: emit the raw text instead of dropping it.
		newRepl = append(newRepl, repl[groupStart:]...)
	case inEscapeMode:
		// Trailing lone backslash: keep it.
		newRepl = append(newRepl, '\\')
	}
	return newRepl
}
544 | } 545 | return dest 546 | } 547 | 548 | func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 549 | return re.replaceAll(src, repl, fillCapturedValues) 550 | } 551 | 552 | func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 553 | return re.replaceAll(src, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte { 554 | return repl(matchBytes) 555 | }) 556 | } 557 | 558 | func (re *Regexp) ReplaceAllString(src, repl string) string { 559 | return string(re.ReplaceAll([]byte(src), []byte(repl))) 560 | } 561 | 562 | func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 563 | srcB := []byte(src) 564 | destB := re.replaceAll(srcB, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte { 565 | return []byte(repl(string(matchBytes))) 566 | }) 567 | return string(destB) 568 | } 569 | 570 | func (re *Regexp) String() string { 571 | return re.pattern 572 | } 573 | 574 | func grow_buffer(b []byte, offset int, n int) []byte { 575 | if offset+n > cap(b) { 576 | buf := make([]byte, 2*cap(b)+n) 577 | copy(buf, b[:offset]) 578 | return buf 579 | } 580 | return b 581 | } 582 | 583 | func fromReader(r io.RuneReader) []byte { 584 | b := make([]byte, numReadBufferStartSize) 585 | offset := 0 586 | var err error = nil 587 | for err == nil { 588 | rune, runeWidth, err := r.ReadRune() 589 | if err == nil { 590 | b = grow_buffer(b, offset, runeWidth) 591 | writeWidth := utf8.EncodeRune(b[offset:], rune) 592 | if runeWidth != writeWidth { 593 | panic("reading rune width not equal to the written rune width") 594 | } 595 | offset += writeWidth 596 | } else { 597 | break 598 | } 599 | } 600 | return b[:offset] 601 | } 602 | 603 | func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { 604 | b := fromReader(r) 605 | return re.FindIndex(b) 606 | } 607 | 608 | func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 609 | b := fromReader(r) 610 | return re.FindSubmatchIndex(b) 611 
| } 612 | 613 | func (re *Regexp) MatchReader(r io.RuneReader) bool { 614 | b := fromReader(r) 615 | return re.Match(b) 616 | } 617 | 618 | func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 619 | //no easy way to implement this 620 | return "", false 621 | } 622 | 623 | func MatchString(pattern string, s string) (matched bool, error error) { 624 | re, err := Compile(pattern) 625 | if err != nil { 626 | return false, err 627 | } 628 | return re.MatchString(s), nil 629 | } 630 | 631 | func (re *Regexp) Gsub(src, repl string) string { 632 | srcBytes := ([]byte)(src) 633 | replBytes := ([]byte)(repl) 634 | replaced := re.replaceAll(srcBytes, replBytes, fillCapturedValues) 635 | return string(replaced) 636 | } 637 | 638 | func (re *Regexp) GsubFunc(src string, replFunc func(string, map[string]string) string) string { 639 | srcBytes := ([]byte)(src) 640 | replaced := re.replaceAll(srcBytes, nil, func(_ []byte, matchBytes []byte, capturedBytes map[string][]byte) []byte { 641 | capturedStrings := make(map[string]string) 642 | for name, capBytes := range capturedBytes { 643 | capturedStrings[name] = string(capBytes) 644 | } 645 | matchString := string(matchBytes) 646 | return ([]byte)(replFunc(matchString, capturedStrings)) 647 | }) 648 | return string(replaced) 649 | } 650 | --------------------------------------------------------------------------------