├── .gitignore ├── Makefile ├── api_test.go ├── ast-match.go ├── ast.go ├── benchmark_test.go ├── charclass.go ├── charclass_test.go ├── cmd └── yarexgen │ └── main.go ├── codefragments.go ├── compiledmatch.go ├── export_test.go ├── go.mod ├── gogenerate.go ├── gogenerate_charclass.go ├── match_test.go ├── matchcontext.go ├── opcompile.go ├── opmatchcontext.go ├── optimize-ast.go ├── optree.go ├── optreematch.go ├── parse.go ├── parse_test.go └── yarex.go /.gitignore: -------------------------------------------------------------------------------- 1 | *_yarex.go 2 | *_yarex_test.go 3 | cmd/yarexgen/yarexgen 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: *.go cmd/yarexgen/main.go 2 | cd cmd/yarexgen; go build . 3 | 4 | test: all cmd/yarexgen/yarexgen 5 | go generate 6 | go test 7 | 8 | clean: 9 | rm cmd/yarexgen/yarexgen 10 | rm *_yarex_test.go 11 | -------------------------------------------------------------------------------- /api_test.go: -------------------------------------------------------------------------------- 1 | package yarex_test 2 | 3 | import ( 4 | "reflect" 5 | "regexp" 6 | "testing" 7 | 8 | "github.com/Maki-Daisuke/go-yarex" 9 | ) 10 | 11 | func testAPIs(t *testing.T, restr string, tests []string) { 12 | stdRe := regexp.MustCompile(restr) 13 | opRe := yarex.MustCompileOp(restr) 14 | compRe := yarex.MustCompile(restr) 15 | for _, str := range tests { 16 | r := stdRe.FindString(str) 17 | if opRe.FindString(str) != r { 18 | t.Errorf("(OpTree) %v.FindString(%q) returned %q, but expected %q", opRe, str, opRe.FindString(str), r) 19 | } 20 | if compRe.FindString(str) != r { 21 | t.Errorf("(Compiled) %v.FindString(%q) returned %q, but expected %q", opRe, str, compRe.FindString(str), r) 22 | } 23 | loc := stdRe.FindStringIndex(str) 24 | if !reflect.DeepEqual(opRe.FindStringIndex(str), loc) { 25 | t.Errorf("(OpTree) 
%v.FindStringIndex(%q) returned %v, but expected %v", opRe, str, opRe.FindStringIndex(str), loc) 26 | } 27 | if !reflect.DeepEqual(compRe.FindStringIndex(str), loc) { 28 | t.Errorf("(Compiled) %v.FindStringIndex(%q) returned %v, but expected %v", opRe, str, compRe.FindStringIndex(str), loc) 29 | } 30 | } 31 | } 32 | 33 | func TestAPI(t *testing.T) { 34 | re := "foo bar" //yarexgen 35 | testAPIs(t, re, []string{ 36 | "foo bar", 37 | "foo bar", 38 | "hogefoo barfuga", 39 | "foo barf", 40 | "Afoo bar", 41 | "foo ba", 42 | }) 43 | 44 | re = "foo|bar" //yarexgen 45 | testAPIs(t, re, []string{ 46 | "foo bar", 47 | "hogefoo barfuga", 48 | "foo baz", 49 | "bar f", 50 | "foba", 51 | "", 52 | }) 53 | 54 | re = "(?:foo|fo)oh" //yarexgen 55 | testAPIs(t, re, []string{ 56 | "fooh", 57 | "foooh", 58 | "foh", 59 | "fooooooooooh", 60 | "fooooooooofoooh", 61 | "", 62 | }) 63 | 64 | re = "fo*oh" //yarexgen 65 | testAPIs(t, re, []string{ 66 | "fooh", 67 | "foh", 68 | "fh", 69 | "fooooooooooh", 70 | "fooooooooofoooh", 71 | "", 72 | "fo", 73 | "oh", 74 | }) 75 | 76 | re = "fo+oh" //yarexgen 77 | testAPIs(t, re, []string{ 78 | "fooh", 79 | "foh", 80 | "fh", 81 | "fooooooooooh", 82 | "fooooooooofoooh", 83 | "", 84 | "fo", 85 | "oh", 86 | }) 87 | 88 | re = "fo{2,5}oh" //yarexgen 89 | testAPIs(t, re, []string{ 90 | "fooh", 91 | "foh", 92 | "fh", 93 | "fooooooooooh", 94 | "fooooooooofoooh", 95 | "", 96 | "fo", 97 | "oh", 98 | }) 99 | 100 | re = "fo?oh" //yarexgen 101 | testAPIs(t, re, []string{ 102 | "fooh", 103 | "foh", 104 | "fh", 105 | "fooooooooooh", 106 | "fooooooooofooh", 107 | "", 108 | "fo", 109 | "oh", 110 | }) 111 | 112 | re = "fo*oh?" //yarexgen 113 | testAPIs(t, re, []string{ 114 | "ABfooh", 115 | "foo", 116 | "fh", 117 | "foooohoooooo", 118 | "foooooooooooCD", 119 | "", 120 | "fo", 121 | "oh", 122 | }) 123 | 124 | re = "." 
//yarexgen 125 | testAPIs(t, re, []string{ 126 | "aiueo", 127 | "\n", 128 | "", 129 | " ", 130 | "\b", 131 | }) 132 | 133 | re = ".+x" //yarexgen 134 | testAPIs(t, re, []string{ 135 | "", 136 | "x", 137 | "xx", 138 | "aaaaax", 139 | "\nx", 140 | "xx\nx", 141 | "xxxxxa", 142 | }) 143 | 144 | re = "^foo bar" //yarexgen 145 | testAPIs(t, re, []string{ 146 | "foo bar", 147 | "foo bar", 148 | "hogefoo barfuga", 149 | "foo barf", 150 | "Afoo bar", 151 | "foo ba", 152 | "\nfoo bar", 153 | }) 154 | 155 | re = "(^|A)*foo bar" //yarexgen 156 | testAPIs(t, re, []string{ 157 | "foo bar", 158 | "foo bar", 159 | "hogefoo barfuga", 160 | "foo barf", 161 | "Afoo bar", 162 | "AAfoo bar", 163 | "AAAAfoo bar", 164 | "AABAAfoo bar", 165 | }) 166 | 167 | re = "[0aB]" //yarexgen 168 | testAPIs(t, re, []string{ 169 | "foo", // false 170 | "foo bar", // true 171 | "FOO BAR", // true 172 | "AAAAAA", // false 173 | "012345", // true 174 | "\000hoge", // false 175 | "\000hage", // true 176 | }) 177 | 178 | re = "[A-Z0-9][a-z]" //yarexgen 179 | testAPIs(t, re, []string{ 180 | "absksdjhasd", 181 | "alsdAAA", 182 | "asl;k3as7djj", 183 | "Aiiiiiiii9", 184 | "foo BAR", 185 | "FOO bar", 186 | "FOObar", 187 | "fooBARbaz", 188 | }) 189 | 190 | re = `^["]{0,1}([^"]*)["]{0,1}[ ]*<(sip|tel|sips):(([^@]*)@){0,1}([^>^:]*|\[[a-fA-F0-9:]*\]):{0,1}([0-9]*){0,1}>(;.*){0,1}$` //yarexgen 191 | testAPIs(t, re, []string{ 192 | "\"display_name\";user=phone;hogehoge", 193 | "", 194 | "\"display_name\"", 195 | ";user=phone", 196 | "\"0333334444\";user=phone", 197 | }) 198 | } 199 | -------------------------------------------------------------------------------- /ast-match.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | import ( 4 | "strings" 5 | "unicode/utf8" 6 | "unsafe" 7 | ) 8 | 9 | func astMatch(re Ast, s string) bool { 10 | stack := make([]stackFrame, initialStackSize, initialStackSize) // We need to make stack here, instead of inside makeContext, 
since matchContext only contains uintptr to stack. GC does not count it as a valid reference. 11 | getStack := func() []stackFrame { 12 | return stack 13 | } 14 | setStack := func(s []stackFrame) { 15 | stack = s 16 | } 17 | c0 := makeContext(&s, &getStack, &setStack) 18 | if re.match(c0.push(0, 0), 0, func(c matchContext, _ int) *matchContext { return &c }) != nil { 19 | return true 20 | } 21 | if canOnlyMatchAtBegining(re) { 22 | return false 23 | } 24 | for i := 1; i < len(s); i++ { 25 | if re.match(c0.push(0, i), i, func(c matchContext, _ int) *matchContext { return &c }) != nil { 26 | return true 27 | } 28 | } 29 | return false 30 | } 31 | 32 | func (re AstLit) match(c matchContext, p int, k Continuation) *matchContext { 33 | str := *(*string)(unsafe.Pointer(c.str)) 34 | lit := string(re) 35 | if !strings.HasPrefix(str[p:], lit) { 36 | return nil 37 | } 38 | return k(c, p+len(lit)) 39 | } 40 | 41 | func (re AstNotNewline) match(c matchContext, p int, k Continuation) *matchContext { 42 | str := *(*string)(unsafe.Pointer(c.str)) 43 | if !(p < len(str)) || str[0] == '\n' { 44 | return nil 45 | } 46 | return k(c, p+1) 47 | } 48 | 49 | func (r *AstSeq) match(c matchContext, p int, k Continuation) *matchContext { 50 | seq := r.seq 51 | var loop func(int) Continuation 52 | loop = func(i int) func(c matchContext, p int) *matchContext { 53 | return func(c matchContext, p int) *matchContext { 54 | if i < len(seq) { 55 | return seq[i].match(c, p, loop(i+1)) 56 | } 57 | return k(c, p) 58 | } 59 | } 60 | return loop(0)(c, p) 61 | } 62 | 63 | func (r *AstAlt) match(c matchContext, p int, k Continuation) *matchContext { 64 | for _, re := range r.opts { 65 | if c1 := re.match(c, p, k); c1 != nil { 66 | return c1 67 | } 68 | } 69 | return nil 70 | } 71 | 72 | func (r *AstRepeat) match(c matchContext, p int, k Continuation) *matchContext { 73 | str := *(*string)(unsafe.Pointer(c.str)) 74 | switch re := r.re.(type) { 75 | case AstLit: 76 | s := string(re) 77 | width := len(s) 78 
| if width == 0 { 79 | return k(c, p) 80 | } 81 | p1 := p 82 | i := 0 83 | if r.max < 0 { 84 | for strings.HasPrefix(str[p1:], s) { 85 | i++ 86 | p1 += width 87 | } 88 | } else { 89 | for i < r.max && strings.HasPrefix(str[p1:], s) { 90 | i++ 91 | p1 += width 92 | } 93 | } 94 | for i >= r.min { // Try backtrack 95 | if ret := k(c, p1); ret != nil { 96 | return ret 97 | } 98 | p1 -= width 99 | i-- 100 | } 101 | return nil 102 | case AstCharClass: 103 | cc := re.CharClass 104 | stack := make([]int, 0, 64) 105 | stack = append(stack, p) 106 | p1 := p 107 | i := 0 108 | if r.max < 0 { 109 | for p1 < len(str) { 110 | r, size := utf8.DecodeRuneInString(str[p1:]) 111 | if cc.Contains(r) { 112 | p1 += size 113 | i++ 114 | stack = append(stack, p1) 115 | } else { 116 | break 117 | } 118 | } 119 | } else { 120 | for i < r.max && p1 < len(str) { 121 | r, size := utf8.DecodeRuneInString(str[p1:]) 122 | if cc.Contains(r) { 123 | p1 += size 124 | i++ 125 | stack = append(stack, p1) 126 | } else { 127 | break 128 | } 129 | } 130 | } 131 | for i >= r.min { // Try backtrack 132 | if ret := k(c, stack[i]); ret != nil { 133 | return ret 134 | } 135 | i-- 136 | } 137 | return nil 138 | default: 139 | prev := -1 // initial value must be a number which never equal to any position (i.e. positive integer) 140 | var loop func(count int) Continuation 141 | loop = func(count int) Continuation { 142 | return func(c matchContext, p int) *matchContext { 143 | if prev == p { // Matched zero-length assertion. So, move ahead the next pattern. 
144 | return k(c, p) 145 | } 146 | prev = p 147 | if count < r.min { 148 | return re.match(c, p, loop(count+1)) 149 | } 150 | if count == r.max { 151 | return k(c, p) 152 | } 153 | c1 := re.match(c, p, loop(count+1)) 154 | if c1 != nil { 155 | return c1 156 | } 157 | return k(c, p) 158 | } 159 | } 160 | return loop(0)(c, p) 161 | } 162 | } 163 | 164 | func (r *AstCap) match(c matchContext, p int, k Continuation) *matchContext { 165 | c = c.push(r.index, p) 166 | return r.re.match(c, p, func(c matchContext, p1 int) *matchContext { 167 | c = c.push(r.index, p1) 168 | return k(c, p1) 169 | }) 170 | } 171 | 172 | func (r AstBackRef) match(c matchContext, p int, k Continuation) *matchContext { 173 | cap, ok := c.GetCaptured(uint(r)) 174 | if !ok { 175 | return nil 176 | } 177 | return AstLit(cap).match(c, p, k) 178 | } 179 | 180 | func (re AstAssertBegin) match(c matchContext, p int, k Continuation) *matchContext { 181 | if p != 0 { 182 | return nil 183 | } 184 | return k(c, p) 185 | } 186 | 187 | func (re AstAssertEnd) match(c matchContext, p int, k Continuation) *matchContext { 188 | str := *(*string)(unsafe.Pointer(c.str)) 189 | if p != len(str) { 190 | return nil 191 | } 192 | return k(c, p) 193 | } 194 | 195 | func (re AstCharClass) match(c matchContext, p int, k Continuation) *matchContext { 196 | str := *(*string)(unsafe.Pointer(c.str)) 197 | if len(str) < p+1 { 198 | return nil 199 | } 200 | r, size := utf8.DecodeRuneInString(str[p:]) 201 | if !re.Contains(r) { 202 | return nil 203 | } 204 | return k(c, p+size) 205 | } 206 | -------------------------------------------------------------------------------- /ast.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | ) 7 | 8 | // Here, we use uintpointer to pass *matchContext 9 | // to avoid from allocating the parameter in heap 10 | type Continuation = func(matchContext, int) *matchContext 11 | 12 | type Ast interface { 13 | 
//Compile() 14 | String() string 15 | match(matchContext, int, Continuation) *matchContext // This implements an interpreter-based regex engine. 16 | } 17 | 18 | type AstLit string 19 | 20 | func (re AstLit) String() string { 21 | return string(re) 22 | } 23 | 24 | type AstSeq struct { 25 | seq []Ast 26 | } 27 | 28 | func (re *AstSeq) String() string { 29 | b := bytes.NewBufferString("(?:") 30 | for _, r := range re.seq { 31 | fmt.Fprint(b, r.String()) 32 | } 33 | fmt.Fprint(b, ")") 34 | return b.String() 35 | } 36 | 37 | type AstAlt struct { 38 | opts []Ast 39 | } 40 | 41 | func (re *AstAlt) String() string { 42 | b := bytes.NewBufferString("(?:") 43 | fmt.Fprint(b, re.opts[0].String()) 44 | for _, r := range re.opts[1:] { 45 | fmt.Fprint(b, "|") 46 | fmt.Fprint(b, r.String()) 47 | } 48 | fmt.Fprint(b, ")") 49 | return b.String() 50 | } 51 | 52 | type AstNotNewline struct{} 53 | 54 | func (re AstNotNewline) String() string { 55 | return "." 56 | } 57 | 58 | type AstRepeat struct { 59 | re Ast 60 | min, max int // -1 means unlimited 61 | } 62 | 63 | func (re *AstRepeat) String() string { 64 | if re.min == 0 && re.max == 1 { 65 | return re.re.String() + "?" 
66 | } 67 | if re.min == 0 && re.max < 0 { 68 | return re.re.String() + "*" 69 | } 70 | if re.min == 1 && re.max < 0 { 71 | return re.re.String() + "+" 72 | } 73 | if re.min == re.max { 74 | return fmt.Sprintf("%s{%d}", re.re.String(), re.min) 75 | } 76 | if re.max < 0 { 77 | return fmt.Sprintf("%s{%d,}", re.re.String(), re.min) 78 | } 79 | return fmt.Sprintf("%s{%d,%d}", re.re.String(), re.min, re.max) 80 | } 81 | 82 | type AstCap struct { 83 | index uint 84 | re Ast 85 | } 86 | 87 | func (re *AstCap) String() string { 88 | return fmt.Sprintf("(%s)", re.re) 89 | } 90 | 91 | type AstBackRef uint 92 | 93 | func (re AstBackRef) String() string { 94 | return fmt.Sprintf("\\%d", uint(re)) 95 | } 96 | 97 | type AstAssertBegin struct{} 98 | 99 | func (re AstAssertBegin) String() string { 100 | return "^" 101 | } 102 | 103 | type AstAssertEnd struct{} 104 | 105 | func (re AstAssertEnd) String() string { 106 | return "$" 107 | } 108 | 109 | type AstCharClass struct { 110 | CharClass 111 | str string 112 | } 113 | 114 | func (re AstCharClass) String() string { 115 | return "[" + re.CharClass.String() + "]" 116 | } 117 | -------------------------------------------------------------------------------- /benchmark_test.go: -------------------------------------------------------------------------------- 1 | package yarex_test 2 | 3 | import ( 4 | "os" 5 | "regexp" 6 | "testing" 7 | 8 | "github.com/Maki-Daisuke/go-yarex" 9 | ) 10 | 11 | //go:generate cmd/yarexgen/yarexgen benchmark_test.go 12 | 13 | // This benchmark was barrowed from https://qiita.com/tj8000rpm/items/b92d7617883639a3e714 14 | var sipPattern = `^["]{0,1}([^"]*)["]{0,1}[ ]*<(sip|tel|sips):(([^@]*)@){0,1}([^>^:]*|\[[a-fA-F0-9:]*\]):{0,1}([0-9]*){0,1}>(;.*){0,1}$` //yarexgen 15 | var sipReStd = regexp.MustCompile(sipPattern) 16 | var sipReAst, _ = yarex.Parse(sipPattern) 17 | var sipReOpt = yarex.OptimizeAst(sipReAst) 18 | var sipReOp = yarex.MustCompileOp(sipPattern) 19 | 20 | // Initialize sipReComp in TestMain, 
// AsciiMaskClass is a character class stored as a 128-bit bitmap: bit r of Lo
// stands for rune r when r is in 0..63, and bit r-64 of Hi stands for rune r
// when r is in 64..127.
type AsciiMaskClass struct {
	Hi uint64
	Lo uint64
}

// Contains reports whether r's bit is set in the mask. Runes outside 0..127
// are never contained. Negative runes are rejected explicitly, because
// shifting by a negative count panics at run time.
func (c AsciiMaskClass) Contains(r rune) bool {
	if r < 0 || r > 127 {
		return false
	}
	if r < 64 {
		return c.Lo&(uint64(1)<<uint(r)) != 0
	}
	return c.Hi&(uint64(1)<<uint(r-64)) != 0
}

// String returns a fixed placeholder rather than the actual members.
func (AsciiMaskClass) String() string {
	// tentative implementation
	return ":AsciiMask:"
}

// CompAsciiMaskClass is the complement of the embedded AsciiMaskClass.
type CompAsciiMaskClass struct {
	AsciiMaskClass
}

// Contains reports whether r's bit is NOT set in the embedded mask. Every
// rune outside 0..127, including negative values, is contained.
func (c CompAsciiMaskClass) Contains(r rune) bool {
	if r < 0 || r > 127 {
		return true
	}
	if r < 64 {
		return c.Lo&(uint64(1)<<uint(r)) == 0
	}
	return c.Hi&(uint64(1)<<uint(r-64)) == 0
}

// String returns a fixed placeholder rather than the actual members.
func (CompAsciiMaskClass) String() string {
	// tentative implementation
	return ":NegAsciiMask:"
}
// negateRangeTable returns the complement of in. It can only negate
// RangeTables in which every range has Stride == 1, and returns nil as soon as
// it meets a range with any other Stride.
//
// The R16 part of the result is the complement within 0..0xFFFF. The R32 part
// is the complement within 0x10000..0xFFFFFFFF, because unicode.RangeTable
// requires R32 to hold only values >= 0x10000. LatinOffset is recomputed for
// the result.
func negateRangeTable(in *unicode.RangeTable) *unicode.RangeTable {
	out := &unicode.RangeTable{R16: []unicode.Range16{}, R32: []unicode.Range32{}, LatinOffset: 0}
	// Note: Range16 and Range32 both represent characters from Lo to Hi *inclusively*.
	// start is the first code point not yet covered by either the input or
	// the output; it is a uint64 so that Hi+1 cannot wrap around.
	var start uint64
	for _, r := range in.R16 {
		if r.Stride != 1 {
			return nil
		}
		if start < uint64(r.Lo) {
			out.R16 = append(out.R16, unicode.Range16{Lo: uint16(start), Hi: r.Lo - 1, Stride: 1})
		}
		start = uint64(r.Hi) + 1
	}
	if start <= 0xFFFF {
		out.R16 = append(out.R16, unicode.Range16{Lo: uint16(start), Hi: 0xFFFF, Stride: 1})
	}
	for _, r := range out.R16 {
		if r.Hi <= unicode.MaxLatin1 {
			out.LatinOffset++
		}
	}
	// The 32-bit part starts at 0x10000, not 0. Starting at 0 would emit an
	// R32 range covering the 16-bit area, which unicode.Is consults whenever
	// the rune lies above the last R16 range, wrongly reporting runes of the
	// input as members of the complement.
	start = 0x10000
	for _, r := range in.R32 {
		if r.Stride != 1 {
			return nil
		}
		if start < uint64(r.Lo) {
			out.R32 = append(out.R32, unicode.Range32{Lo: uint32(start), Hi: r.Lo - 1, Stride: 1})
		}
		start = uint64(r.Hi) + 1
	}
	if start <= 0xFFFFFFFF {
		out.R32 = append(out.R32, unicode.Range32{Lo: uint32(start), Hi: 0xFFFFFFFF, Stride: 1})
	}
	return out
}
// mergeRangeTable returns the union of left and right. It assumes the ranges
// of each input are sorted by Lo. It can only merge ranges with Stride == 1:
// it returns nil when it has to combine a range with an earlier one and either
// of them has a different Stride.
//
// Overlapping or adjoining ranges are coalesced, and LatinOffset is recomputed
// for the result.
func mergeRangeTable(left, right *unicode.RangeTable) *unicode.RangeTable {
	out := &unicode.RangeTable{R16: []unicode.Range16{}, R32: []unicode.Range32{}}

	// Merge the two sorted R16 lists, always taking the range with the
	// smaller Lo next.
	i, j := 0, 0
	for i < len(left.R16) || j < len(right.R16) {
		var next unicode.Range16
		if j >= len(right.R16) || (i < len(left.R16) && left.R16[i].Lo <= right.R16[j].Lo) {
			next = left.R16[i]
			i++
		} else {
			next = right.R16[j]
			j++
		}
		if len(out.R16) == 0 {
			out.R16 = append(out.R16, next)
			continue
		}
		last := &out.R16[len(out.R16)-1]
		if next.Stride != 1 || last.Stride != 1 { // If either Stride is not 1, give up merging.
			return nil
		}
		// Compare in a wider type: last.Hi+1 wraps to 0 when Hi == 0xFFFF,
		// which would make an overlapping range look disjoint.
		if uint32(next.Lo) <= uint32(last.Hi)+1 { // Overlapping or adjoining the previous range: coalesce.
			if next.Hi > last.Hi {
				last.Hi = next.Hi
			}
		} else { // Otherwise, just append the next one.
			out.R16 = append(out.R16, next)
		}
	}

	// Recalculate LatinOffset.
	out.LatinOffset = 0
	for _, r := range out.R16 {
		if r.Hi <= unicode.MaxLatin1 {
			out.LatinOffset++
		}
	}

	// Do the same for R32.
	i, j = 0, 0
	for i < len(left.R32) || j < len(right.R32) {
		var next unicode.Range32
		if j >= len(right.R32) || (i < len(left.R32) && left.R32[i].Lo <= right.R32[j].Lo) {
			next = left.R32[i]
			i++
		} else {
			next = right.R32[j]
			j++
		}
		if len(out.R32) == 0 {
			out.R32 = append(out.R32, next)
			continue
		}
		// Check the previous R32 range here. Indexing out.R16 would test the
		// wrong slice and panic when the result has no 16-bit ranges.
		last := &out.R32[len(out.R32)-1]
		if next.Stride != 1 || last.Stride != 1 { // If either Stride is not 1, give up merging.
			return nil
		}
		if uint64(next.Lo) <= uint64(last.Hi)+1 { // Overlapping or adjoining the previous range: coalesce.
			if next.Hi > last.Hi {
				last.Hi = next.Hi
			}
		} else { // Otherwise, just append the next one.
			out.R32 = append(out.R32, next)
		}
	}
	return out
}
-------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | import ( 4 | "testing" 5 | "unicode" 6 | ) 7 | 8 | var classLowerAlpha = (*RangeTableClass)(&unicode.RangeTable{ 9 | R16: []unicode.Range16{ 10 | {'a', 'z', 1}, 11 | }, 12 | LatinOffset: 1, 13 | }) 14 | 15 | var classAlpha = (*RangeTableClass)(&unicode.RangeTable{ 16 | R16: []unicode.Range16{ 17 | {'A', 'Z', 1}, 18 | {'a', 'z', 1}, 19 | }, 20 | LatinOffset: 1, 21 | }) 22 | 23 | var classDigit = (*RangeTableClass)(&unicode.RangeTable{ 24 | R16: []unicode.Range16{ 25 | {'0', '9', 1}, 26 | }, 27 | LatinOffset: 1, 28 | }) 29 | 30 | func TestRangeTableClass(t *testing.T) { 31 | aB0 := (*RangeTableClass)(&unicode.RangeTable{[]unicode.Range16{{'0', '0', '\x01'}, {'B', 'B', '\x01'}, {'a', 'a', '\x01'}}, []unicode.Range32{}, 3}) 32 | if aB0.String() != "0Ba" { 33 | t.Errorf("expect %q, but got %q", "0Ba", aB0) 34 | } 35 | for i := '\000'; i <= 0xFFFFF; i++ { // Test only up to 0xFFFFF due to long-running test 36 | switch i { 37 | case 'a', 'B', '0': 38 | if !aB0.Contains(i) { 39 | t.Errorf("aB0.Contains(%q) should be true, but actually false", i) 40 | } 41 | break 42 | default: 43 | if aB0.Contains(i) { 44 | t.Errorf("aB0.Contains(%q) should be false, but actually true", i) 45 | } 46 | } 47 | } 48 | } 49 | 50 | func TestNegateCharClass_LowerAlpha(t *testing.T) { 51 | notLowerAlpha := NegateCharClass(classLowerAlpha) 52 | if _, ok := notLowerAlpha.(*RangeTableClass); !ok { 53 | t.Errorf("notLowerAlpha should be of type *rangeTableClass, but actually of type %T", notLowerAlpha) 54 | } 55 | for i := '\000'; i <= 0xFFFFF; i++ { // Test only up to 0xFFFFF due to long-running test 56 | if notLowerAlpha.Contains(i) != !unicode.Is((*unicode.RangeTable)(classLowerAlpha), i) { 57 | t.Errorf("notLowerAlpha.Contains(0x%x) should be %t, but actually not", i, !unicode.Is((*unicode.RangeTable)(classLowerAlpha), i)) 58 | } 59 | } 60 | } 61 | 62 | func TestNegateCharClass_Alpha(t 
*testing.T) { 63 | notAlpha := NegateCharClass(classAlpha) 64 | if _, ok := notAlpha.(*RangeTableClass); !ok { 65 | t.Errorf("notAlpha should be of type *rangeTableClass, but actually of type %T", notAlpha) 66 | } 67 | for i := '\000'; i <= 0xFFFFF; i++ { // Test only up to 0xFFFFF due to long-running test 68 | if notAlpha.Contains(i) != !unicode.Is((*unicode.RangeTable)(classAlpha), i) { 69 | t.Errorf("notAlpha.Contains(0x%x) should be %t, but actually not", i, !unicode.Is((*unicode.RangeTable)(classAlpha), i)) 70 | } 71 | } 72 | } 73 | 74 | func TestNegateCharClass_Lm(t *testing.T) { 75 | notLm := NegateCharClass((*RangeTableClass)(unicode.Lm)) 76 | if _, ok := notLm.(CompClass); !ok { 77 | t.Errorf("notLm should be of type negClass, but actually of type %T", notLm) 78 | } 79 | if notLm.String()[0] != '^' { 80 | t.Errorf("expect %q, but got %q", "^...", notLm.String()) 81 | } 82 | for i := '\000'; i <= 0xFFFFF; i++ { // Test only up to 0xFFFFF due to long-running test 83 | if notLm.Contains(i) != !unicode.Is(unicode.Lm, i) { 84 | t.Errorf("notLm.Contains(0x%x) should be %t, but actually not", i, !unicode.Is(unicode.Lm, i)) 85 | } 86 | } 87 | } 88 | 89 | func TestNegateCharClass_Nd(t *testing.T) { 90 | // Nd should be negated by RangeTable level, because it only contains ranges with Stride = 1 91 | notNd := NegateCharClass((*RangeTableClass)(unicode.Nd)) 92 | if _, ok := notNd.(*RangeTableClass); !ok { 93 | t.Errorf("notNd should be of type *rangeTableClass, but actually of type %T", notNd) 94 | } 95 | for i := '\000'; i <= 0xFFFFF; i++ { // Test only up to 0xFFFFF due to long-running test 96 | if notNd.Contains(i) != !unicode.Is(unicode.Nd, i) { 97 | t.Errorf("notNd.Contains(0x%x) should be %t, but actually not", i, !unicode.Is(unicode.Nd, i)) 98 | } 99 | } 100 | } 101 | 102 | func TestMergeCharClass_AlphaNum(t *testing.T) { 103 | alphanum := MergeCharClass(classLowerAlpha, classAlpha, classDigit) 104 | if _, ok := alphanum.(*RangeTableClass); !ok { 105 | 
t.Errorf("alphanum should be of type *rangeTableClass, but actually of type %T", alphanum) 106 | } 107 | if alphanum.String() != "0-9A-Za-z" { 108 | t.Errorf("expect %q, but got %q", "0-9A-Za-z", alphanum) 109 | } 110 | for i := '\000'; i <= 0xFFFFF; i++ { // Test only up to 0xFFFFF due to long-running test 111 | if alphanum.Contains(i) != ('A' <= i && i <= 'Z' || 'a' <= i && i <= 'z' || '0' <= i && i <= '9') { 112 | t.Errorf("alphanum.Contains(0x%x) should be %t, but actually not", i, ('A' <= i && i <= 'Z' || 'a' <= i && i <= 'z' || '0' <= i && i <= '9')) 113 | } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /cmd/yarexgen/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "go/ast" 5 | "go/constant" 6 | "go/parser" 7 | "go/token" 8 | "log" 9 | "os" 10 | "regexp" 11 | 12 | "github.com/Maki-Daisuke/go-yarex" 13 | ) 14 | 15 | var reDirective = regexp.MustCompile(`^yarexgen\s*$`) 16 | 17 | func main() { 18 | if len(os.Args) < 2 { 19 | log.Println("Specify a file name.") 20 | os.Exit(1) 21 | } 22 | filename := os.Args[1] 23 | 24 | fset := token.NewFileSet() 25 | file, err := parser.ParseFile(fset, filename, nil, parser.ParseComments) 26 | if err != nil { 27 | log.Println("Error: ", err) 28 | os.Exit(1) 29 | } 30 | 31 | generator := yarex.NewGoGenerator(filename, file.Name.Name) 32 | LOOP: 33 | for n, cg := range ast.NewCommentMap(fset, file, file.Comments) { 34 | for _, c := range cg { 35 | if reDirective.MatchString(c.Text()) { 36 | strs := findRegex(n) 37 | if strs != nil { 38 | generator.Add(strs...) 
39 | } else { 40 | log.Printf("couldn't find regexp string in %s at line %d\n", filename, fset.File(c.Pos()).Line(c.Pos())) 41 | } 42 | continue LOOP 43 | } 44 | } 45 | } 46 | 47 | re := regexp.MustCompile(`(?i:(_test)?\.go$)`) 48 | out := re.ReplaceAllString(filename, "_yarex$0") 49 | outfile, err := os.Create(out) 50 | if err != nil { 51 | panic(err) 52 | } 53 | defer outfile.Close() 54 | _, err = generator.WriteTo(outfile) 55 | if err != nil { 56 | panic(err) 57 | } 58 | } 59 | 60 | // find string litetal 61 | func findRegex(n ast.Node) (out []string) { 62 | ast.Inspect(n, func(n ast.Node) bool { 63 | lit, ok := n.(*ast.BasicLit) 64 | if !ok { 65 | return true 66 | } 67 | if lit.Kind != token.STRING { 68 | return true 69 | } 70 | v := constant.MakeFromLiteral(lit.Value, lit.Kind, 0) 71 | out = append(out, constant.StringVal(v)) 72 | return true 73 | }) 74 | return out 75 | } 76 | -------------------------------------------------------------------------------- /codefragments.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | import ( 4 | "io" 5 | ) 6 | 7 | type codeFragments struct { 8 | minReq int // minimum number of characters to match this code fragmant 9 | code string // fragment of Go code 10 | follower *codeFragments 11 | } 12 | 13 | func (cf *codeFragments) codeLength() int { 14 | if cf == nil { 15 | return 0 16 | } 17 | return len(cf.code) + cf.follower.codeLength() 18 | } 19 | 20 | func (cf *codeFragments) prepend(s string) *codeFragments { 21 | return &codeFragments{ 22 | minReq: cf.minReq, 23 | code: s, 24 | follower: cf, 25 | } 26 | } 27 | 28 | func (cf *codeFragments) WriteTo(w io.Writer) (int64, error) { 29 | var acc int64 30 | for i := cf; i != nil; i = i.follower { 31 | n, err := io.WriteString(w, i.code) 32 | acc += int64(n) 33 | if err != nil { 34 | return acc, err 35 | } 36 | } 37 | return acc, nil 38 | } 39 | 
// exec runs the compiled matcher function against str, trying start positions
// from pos onwards, and returns true as soon as exe.fun reports success.
// onSuccess is passed through to exe.fun unchanged.
func (exe *compiledExecer) exec(str string, pos int, onSuccess func(MatchContext)) bool {
	headOnly := exe.headOnly
	minReq := exe.minReq
	// With headOnly set, only start position 0 is ever attempted.
	if headOnly && pos != 0 {
		return false
	}
	// Fewer than minReq bytes remain, so no attempt is made.
	if minReq > len(str)-pos {
		return false
	}
	// Borrow a stack from the pool. The deferred closure returns whatever
	// slice `stack` refers to at exit, which may have been replaced through
	// setter in the meantime.
	stack := *(opStackPool.Get().(*[]opStackFrame))
	defer func() { opStackPool.Put(&stack) }()
	getter := func() []opStackFrame { return stack }
	setter := func(s []opStackFrame) { stack = s }
	ctx0 := makeOpMatchContext(&str, &getter, &setter)
	// NOTE(review): the leading 0 is presumably the initial state of the
	// generated matcher, and ContextKey{'c', 0} presumably records the start
	// of capture group 0 — confirm against the generator.
	if exe.fun(0, ctx0.Push(ContextKey{'c', 0}, pos), pos, onSuccess) {
		return true
	}
	if headOnly {
		return false
	}
	// Try each later start position while at least minReq bytes remain; with
	// minReq == 0 this includes i == len(str).
	for i := pos + 1; minReq <= len(str)-i; i++ {
		if exe.fun(0, ctx0.Push(ContextKey{'c', 0}, i), i, onSuccess) {
			return true
		}
	}
	return false
}
private functions only for test 9 | var ( 10 | Parse = parse 11 | OptimizeAst = optimizeAst 12 | AstMatch = astMatch 13 | ) 14 | 15 | // MustCompileOp is identical to MustCompile, but ignores compiled version of regexp 16 | // and returns OpTree version. 17 | func MustCompileOp(ptn string) *Regexp { 18 | ast, err := parse(ptn) 19 | if err != nil { 20 | panic(err) 21 | } 22 | ast = optimizeAst(ast) 23 | op := opCompile(ast) 24 | return &Regexp{ptn, opExecer{op}} 25 | } 26 | 27 | func IsOpMatcher(r *Regexp) bool { 28 | _, ok := r.exe.(opExecer) 29 | return ok 30 | } 31 | 32 | func IsCompiledMatcher(r *Regexp) bool { 33 | _, ok := r.exe.(*compiledExecer) 34 | return ok 35 | } 36 | 37 | func DumpAst(re Ast) string { 38 | var buf strings.Builder 39 | dumpAux(re, 0, &buf) 40 | return buf.String() 41 | } 42 | 43 | func indent(n int) string { 44 | return strings.Repeat(" ", n) 45 | } 46 | 47 | func dumpAux(re Ast, n int, buf *strings.Builder) { 48 | switch v := re.(type) { 49 | case AstLit: 50 | fmt.Fprintf(buf, "%sLit{%q}", indent(n), string(v)) 51 | return 52 | case *AstSeq: 53 | if len(v.seq) == 0 { 54 | fmt.Fprintf(buf, "%sSeq{ }", indent(n)) 55 | return 56 | } 57 | fmt.Fprintf(buf, "%sSeq{\n", indent(n)) 58 | for _, r := range v.seq { 59 | dumpAux(r, n+1, buf) 60 | buf.WriteRune('\n') 61 | } 62 | fmt.Fprintf(buf, "%s}", indent(n)) 63 | return 64 | case *AstAlt: 65 | if len(v.opts) == 0 { 66 | fmt.Fprintf(buf, "%sAlt{ }", indent(n)) 67 | return 68 | } 69 | fmt.Fprintf(buf, "%sAlt{\n", indent(n)) 70 | for _, r := range v.opts { 71 | dumpAux(r, n+1, buf) 72 | buf.WriteRune('\n') 73 | } 74 | fmt.Fprintf(buf, "%s}", indent(n)) 75 | return 76 | case AstNotNewline: 77 | fmt.Fprintf(buf, "%sNotNewLine", indent(n)) 78 | return 79 | case *AstRepeat: 80 | fmt.Fprintf(buf, "%sRepeat(min=%d,max=%d){\n", indent(n), v.min, v.max) 81 | dumpAux(v.re, n+1, buf) 82 | fmt.Fprintf(buf, "\n%s}", indent(n)) 83 | return 84 | case *AstCap: 85 | fmt.Fprintf(buf, "%sCapture(index=%d){\n", 
// reNotWord matches every non-word character; NewGoGenerator uses it to turn
// a file name into something usable inside a Go identifier.
var reNotWord = regexp.MustCompile(`\W`)

// charClassResult is the generated form of one character class: the
// identifier of the package-level variable and the expression assigned to it.
type charClassResult struct {
	id   string
	code *codeFragments
}

// GoGenerator accumulates regexps and renders them as Go source code.
type GoGenerator struct {
	pkgname      string                     // package name written in the generated file's package clause
	useUtf8      bool                       // whether the generated file imports "unicode/utf8"
	stateCount   uint                       // last state number handed out by newState
	idPrefix     string                     // prefix of every identifier produced by newId
	idCount      uint                       // last identifier number handed out by newId
	repeatCount  uint                       // last repeat ID handed out by newRepeatID
	funcs        map[string]*codeFragments  // regexp source -> generated function code
	charClasses  map[string]charClassResult // character class pattern -> generated variable
	useCharClass bool                       // whether the function being generated needs the r/size variables
	useSmallLoop bool                       // whether the function being generated needs the small-loop variables
}

// NewGoGenerator returns a generator that emits code for package pkg.
// Generated identifiers are prefixed with "yarexGen_" followed by file with
// every non-word character replaced by "_".
func NewGoGenerator(file string, pkg string) *GoGenerator {
	gg := &GoGenerator{}
	gg.pkgname = pkg
	gg.idPrefix = fmt.Sprintf("yarexGen_%s", reNotWord.ReplaceAllString(file, "_"))
	gg.funcs = map[string]*codeFragments{}
	gg.charClasses = map[string]charClassResult{}
	return gg
}
48 | ast = optimizeAst(ast) 49 | code := gg.generateFunc(r, ast) 50 | gg.funcs[r] = code 51 | } 52 | return nil 53 | } 54 | 55 | func (gg *GoGenerator) WriteTo(w io.Writer) (int64, error) { 56 | var acc int64 57 | importUtf8 := "" 58 | if gg.useUtf8 { 59 | importUtf8 = `"unicode/utf8"` 60 | } 61 | n, err := fmt.Fprintf(w, `package %s 62 | 63 | import ( 64 | "strconv" 65 | "unsafe" 66 | %s 67 | "github.com/Maki-Daisuke/go-yarex" 68 | ) 69 | 70 | `, gg.pkgname, importUtf8) 71 | acc += int64(n) 72 | if err != nil { 73 | return acc, err 74 | } 75 | 76 | for _, cr := range gg.charClasses { 77 | n, err := fmt.Fprintf(w, "var %s = ", cr.id) 78 | acc += int64(n) 79 | if err != nil { 80 | return acc, err 81 | } 82 | m, err := cr.code.WriteTo(w) 83 | acc += m 84 | if err != nil { 85 | return acc, err 86 | } 87 | n, err = fmt.Fprintf(w, "\n") 88 | acc += int64(n) 89 | if err != nil { 90 | return acc, err 91 | } 92 | } 93 | 94 | for _, f := range gg.funcs { 95 | n, err := f.WriteTo(w) 96 | acc += n 97 | if err != nil { 98 | return acc, err 99 | } 100 | } 101 | 102 | return acc, nil 103 | } 104 | 105 | func (gg *GoGenerator) newState() uint { 106 | gg.stateCount++ 107 | return gg.stateCount 108 | } 109 | 110 | func (gg *GoGenerator) newId() string { 111 | gg.idCount++ 112 | return fmt.Sprintf("%s%d", gg.idPrefix, gg.idCount) 113 | } 114 | 115 | func (gg *GoGenerator) newRepeatID() uint { 116 | gg.repeatCount++ 117 | return gg.repeatCount 118 | } 119 | 120 | func (gg *GoGenerator) generateFunc(re string, ast Ast) *codeFragments { 121 | funcID := gg.newId() 122 | gg.stateCount = 0 123 | gg.useCharClass = false 124 | gg.useSmallLoop = false 125 | follower := gg.generateAst(funcID, ast, &codeFragments{0, fmt.Sprintf(` 126 | onSuccess(ctx.Push(yarex.ContextKey{'c', 0}, p)) 127 | return true 128 | default: 129 | // This should not happen. 
130 | panic("state" + strconv.Itoa(state) + "is not defined") 131 | } 132 | } 133 | } 134 | var _ = yarex.RegisterCompiledRegexp(%q, %t, %d, %s) 135 | `, re, canOnlyMatchAtBegining(ast), minRequiredLengthOfAst(ast), funcID), nil}) 136 | 137 | varDecl := "" 138 | if gg.useCharClass { 139 | varDecl = ` 140 | var ( 141 | r rune 142 | size int 143 | ) 144 | ` 145 | } 146 | if gg.useSmallLoop { 147 | varDecl += ` 148 | var ( 149 | localStack [16]int 150 | heapStack *[]int 151 | endPos int 152 | n int 153 | ) 154 | ` 155 | } 156 | 157 | return follower.prepend(fmt.Sprintf(` 158 | func %s (state int, ctx yarex.MatchContext, p int, onSuccess func(yarex.MatchContext)) bool { 159 | %s 160 | str := *(*string)(unsafe.Pointer(ctx.Str)) 161 | for{ 162 | switch state { 163 | case 0: 164 | `, funcID, varDecl)) 165 | } 166 | 167 | func (gg *GoGenerator) generateAst(funcID string, re Ast, follower *codeFragments) *codeFragments { 168 | switch r := re.(type) { 169 | case AstLit: 170 | return gg.generateLit(string(r), follower) 171 | case AstNotNewline: 172 | gg.useUtf8 = true 173 | return &codeFragments{follower.minReq + 1, fmt.Sprintf(` 174 | if len(str)-p < %d { 175 | return false 176 | } 177 | r, size := utf8.DecodeRuneInString(str[p:]) 178 | if size == 0 || r == utf8.RuneError { 179 | return false 180 | } 181 | if r == '\n' { 182 | return false 183 | } 184 | p += size 185 | `, follower.minReq+1), follower} 186 | case *AstSeq: 187 | return gg.generateSeq(funcID, r.seq, follower) 188 | case *AstAlt: 189 | return gg.generateAlt(funcID, r.opts, follower) 190 | case *AstRepeat: 191 | return gg.generateRepeat(funcID, r.re, r.min, r.max, follower) 192 | case *AstCap: 193 | return gg.compileCapture(funcID, r.re, r.index, follower) 194 | case AstBackRef: 195 | return gg.compileBackRef(uint(r), follower) 196 | case AstAssertBegin: 197 | return follower.prepend(` 198 | if p != 0 { 199 | return false 200 | } 201 | `) 202 | case AstAssertEnd: 203 | return follower.prepend(` 204 | if p != 
len(str) { 205 | return false 206 | } 207 | `) 208 | case AstCharClass: 209 | return gg.generateCharClass(r.str, r.CharClass, follower) 210 | default: 211 | panic(fmt.Errorf("Please implement compiler for %T", re)) 212 | } 213 | } 214 | 215 | func (gg *GoGenerator) generateLit(str string, follower *codeFragments) *codeFragments { 216 | if len(str) == 0 { 217 | return follower 218 | } 219 | minReq := follower.minReq + len(str) 220 | var buf strings.Builder 221 | fmt.Fprintf(&buf, `if len(str)-p < %d { 222 | return false 223 | } 224 | `, minReq) 225 | fmt.Fprintf(&buf, "if !(str[p] == %d", str[0]) 226 | for i := 1; i < len(str); i++ { 227 | fmt.Fprintf(&buf, "&& str[p+%d] == %d", i, str[i]) 228 | } 229 | fmt.Fprintf(&buf, `) { 230 | return false 231 | } 232 | p += %d 233 | `, len(str)) 234 | return &codeFragments{minReq, buf.String(), follower} 235 | } 236 | 237 | func (gg *GoGenerator) generateSeq(funcID string, seq []Ast, follower *codeFragments) *codeFragments { 238 | if len(seq) == 0 { 239 | return follower 240 | } 241 | follower = gg.generateSeq(funcID, seq[1:], follower) 242 | return gg.generateAst(funcID, seq[0], follower) 243 | } 244 | 245 | func (gg *GoGenerator) generateAlt(funcID string, opts []Ast, follower *codeFragments) *codeFragments { 246 | switch len(opts) { 247 | case 0: 248 | return follower 249 | case 1: 250 | return gg.generateSeq(funcID, opts, follower) 251 | } 252 | 253 | origMinReq := follower.minReq 254 | 255 | followerState := gg.newState() 256 | follower = follower.prepend(fmt.Sprintf(` 257 | fallthrough 258 | case %d: 259 | `, followerState)) 260 | follower = gg.generateAst(funcID, opts[len(opts)-1], follower) 261 | minReq := follower.minReq 262 | stateLastOpt := gg.newState() 263 | follower = follower.prepend(fmt.Sprintf("case %d:\n", stateLastOpt)) 264 | 265 | states := make([]uint, len(opts)-1) 266 | for i := len(opts) - 2; i >= 0; i-- { 267 | follower = follower.prepend(fmt.Sprintf("state = %d\n", followerState)) 268 | follower.minReq 
= origMinReq 269 | follower = gg.generateAst(funcID, opts[i], follower) 270 | if follower.minReq < minReq { 271 | minReq = follower.minReq 272 | } 273 | s := gg.newState() 274 | follower = follower.prepend(fmt.Sprintf("case %d:\n", s)) 275 | states[i] = s 276 | } 277 | 278 | tries := make([]string, len(states)) 279 | for i, s := range states { 280 | tries[i] = fmt.Sprintf(`%s(%d, ctx, p, onSuccess)`, funcID, s) 281 | } 282 | follower = follower.prepend(fmt.Sprintf(` 283 | if %s { 284 | return true 285 | } 286 | state = %d 287 | `, strings.Join(tries, " || "), stateLastOpt)) 288 | follower.minReq = minReq 289 | return follower 290 | } 291 | 292 | func (gg *GoGenerator) generateRepeat(funcID string, re Ast, min, max int, follower *codeFragments) *codeFragments { 293 | switch r := re.(type) { 294 | case AstLit: 295 | return gg.generateRepeatLit(funcID, string(r), min, max, follower) 296 | case AstCharClass: 297 | return gg.generateRepeatCharClass(funcID, r, min, max, follower) 298 | } 299 | if min > 0 { 300 | return gg.generateAst(funcID, re, gg.generateRepeat(funcID, re, min-1, max-1, follower)) 301 | } 302 | if max == 0 { 303 | return follower 304 | } 305 | if max > 0 { 306 | follower = gg.generateRepeat(funcID, re, 0, max-1, follower) 307 | followerState := gg.newState() 308 | follower = follower.prepend(fmt.Sprintf(` 309 | fallthrough 310 | case %d: 311 | `, followerState)) 312 | minReq := follower.minReq 313 | follower = gg.generateAst(funcID, re, follower) 314 | altState := gg.newState() 315 | follower = follower.prepend(fmt.Sprintf(` 316 | if %s(%d, ctx, p, onSuccess) { 317 | return true 318 | } 319 | state = %d 320 | case %d: 321 | `, funcID, altState, followerState, altState)) 322 | follower.minReq = minReq 323 | return follower 324 | } 325 | // Here, we need to compile infinite-loop regexp 326 | startState := gg.newState() 327 | repeatState := gg.newState() 328 | followerState := gg.newState() 329 | follower = follower.prepend(fmt.Sprintf(` 330 | state = %d 
331 | case %d: 332 | `, startState, followerState)) 333 | minReq := follower.minReq 334 | follower = gg.generateAst(funcID, re, follower) 335 | if canMatchZeroWidth(re) { // If re can matches zero-width string, we need zero-width check 336 | repeatID := gg.newRepeatID() 337 | follower = follower.prepend(fmt.Sprintf(` 338 | prev := ctx.FindVal(yarex.ContextKey{'r', %d}) 339 | if prev == p { // This means zero-width matching occurs. 340 | state = %d // So, terminate repeating. 341 | continue 342 | } 343 | ctx2 := ctx.Push(yarex.ContextKey{'r', %d}, p) 344 | if %s(%d, ctx2, p, onSuccess) { 345 | return true 346 | } 347 | state = %d 348 | case %d: 349 | `, repeatID, followerState, repeatID, funcID, repeatState, followerState, repeatState)) 350 | } else { // We can skip zero-width check for optimization 351 | follower = follower.prepend(fmt.Sprintf(` 352 | if %s(%d, ctx, p, onSuccess) { 353 | return true 354 | } 355 | state = %d 356 | case %d: 357 | `, funcID, repeatState, followerState, repeatState)) 358 | } 359 | follower.minReq = minReq 360 | return follower.prepend(fmt.Sprintf(` 361 | fallthrough 362 | case %d: 363 | `, startState)) 364 | } 365 | 366 | func (gg *GoGenerator) generateRepeatLit(funcID string, lit string, min, max int, follower *codeFragments) *codeFragments { 367 | gg.useSmallLoop = true 368 | followerState := gg.newState() 369 | minReq := follower.minReq + len(lit) 370 | maxCond := "" 371 | if max >= 0 { 372 | maxCond = fmt.Sprintf(`n < %d && `, max) 373 | } 374 | conds := []string{} 375 | for i := 0; i < len(lit); i++ { 376 | conds = append(conds, fmt.Sprintf(`str[p+%d] != %d`, i, lit[i])) 377 | } 378 | condition := strings.Join(conds, " || ") 379 | return follower.prepend(fmt.Sprintf(` 380 | endPos = len(str) - %d 381 | n = 0 382 | for %s p <= endPos { 383 | if %s { 384 | break 385 | } 386 | if len(localStack) == n { 387 | goto LABEL_HEAP_STACK%d 388 | } 389 | localStack[n] = p 390 | n++ 391 | p += %d 392 | } 393 | for n > %d { // try backtrack 394 
| if %s(%d, ctx, p, onSuccess) { 395 | return true 396 | } 397 | n-- 398 | p = localStack[n] 399 | } 400 | goto LABEL_END%d 401 | LABEL_HEAP_STACK%d: 402 | heapStack = (yarex.IntStackPool.Get().(*[]int)) 403 | copy(*heapStack, localStack[:]) 404 | (*heapStack)[n] = p 405 | n++ 406 | p += %d 407 | for %s p <= endPos { 408 | if %s { 409 | break 410 | } 411 | if len(*heapStack) == n { 412 | *heapStack = append(*heapStack, p) 413 | *heapStack = (*heapStack)[:cap(*heapStack)] 414 | } else { 415 | (*heapStack)[n] = p 416 | } 417 | n++ 418 | p += %d 419 | } 420 | for n > %d { // try backtrack 421 | if %s(%d, ctx, p, onSuccess) { 422 | yarex.IntStackPool.Put(heapStack) 423 | return true 424 | } 425 | n-- 426 | p = (*heapStack)[n] 427 | } 428 | yarex.IntStackPool.Put(heapStack) 429 | LABEL_END%d: 430 | fallthrough 431 | case %d: 432 | `, minReq, maxCond, condition, followerState, len(lit), min, funcID, followerState, followerState, followerState, len(lit), maxCond, condition, len(lit), min, funcID, followerState, followerState, followerState)) 433 | } 434 | 435 | func (gg *GoGenerator) generateRepeatCharClass(funcID string, re AstCharClass, min, max int, follower *codeFragments) *codeFragments { 436 | gg.generateCharClass(re.str, re.CharClass, follower) // Compile and register CharClass 437 | ccId := gg.charClasses[re.str].id // Get CharClass's identifier 438 | followerState := gg.newState() 439 | minReq := follower.minReq + 1 440 | maxCond := "" 441 | if max >= 0 { 442 | maxCond = fmt.Sprintf(`n < %d && `, max) 443 | } 444 | return follower.prepend(fmt.Sprintf(` 445 | endPos = len(str) - %d 446 | n = 0 447 | for %s p <= endPos { 448 | r, size = utf8.DecodeRuneInString(str[p:]) 449 | if size == 0 || r == utf8.RuneError { 450 | break 451 | } 452 | if !%s.Contains(r) { 453 | break 454 | } 455 | if len(localStack) == n { 456 | goto LABEL_HEAP_STACK%d 457 | } 458 | localStack[n] = p 459 | n++ 460 | p += size 461 | } 462 | for n > %d { // try backtrack 463 | if %s(%d, ctx, p, 
onSuccess) { 464 | return true 465 | } 466 | n-- 467 | p = localStack[n] 468 | } 469 | goto LABEL_END%d 470 | LABEL_HEAP_STACK%d: 471 | heapStack = (yarex.IntStackPool.Get().(*[]int)) 472 | copy(*heapStack, localStack[:]) 473 | (*heapStack)[n] = p 474 | n++ 475 | p += size 476 | for %s p <= endPos { 477 | r, size = utf8.DecodeRuneInString(str[p:]) 478 | if size == 0 || r == utf8.RuneError { 479 | break 480 | } 481 | if !%s.Contains(r) { 482 | break 483 | } 484 | if len(*heapStack) == n { 485 | *heapStack = append(*heapStack, p) 486 | *heapStack = (*heapStack)[:cap(*heapStack)] 487 | } else { 488 | (*heapStack)[n] = p 489 | } 490 | n++ 491 | p += size 492 | } 493 | for n > %d { // try backtrack 494 | if %s(%d, ctx, p, onSuccess) { 495 | yarex.IntStackPool.Put(heapStack) 496 | return true 497 | } 498 | n-- 499 | p = (*heapStack)[n] 500 | } 501 | yarex.IntStackPool.Put(heapStack) 502 | LABEL_END%d: 503 | fallthrough 504 | case %d: 505 | `, minReq, maxCond, ccId, followerState, min, funcID, followerState, followerState, followerState, maxCond, ccId, min, funcID, followerState, followerState, followerState)) 506 | } 507 | 508 | func (gg *GoGenerator) compileCapture(funcID string, re Ast, index uint, follower *codeFragments) *codeFragments { 509 | follower = follower.prepend(fmt.Sprintf(` 510 | ctx = ctx.Push(yarex.ContextKey{'c', %d}, p) 511 | `, index)) 512 | follower = gg.generateAst(funcID, re, follower) 513 | return follower.prepend(fmt.Sprintf(` 514 | ctx = ctx.Push(yarex.ContextKey{'c', %d}, p) 515 | `, index)) 516 | } 517 | 518 | func (gg *GoGenerator) compileBackRef(index uint, follower *codeFragments) *codeFragments { 519 | return follower.prepend(fmt.Sprintf(` 520 | s, ok := ctx.GetCaptured(yarex.ContextKey{'c', %d}) 521 | if !ok { // There is no captured string with the index. So, failed matching. 
522 | return false 523 | } 524 | l := len(s) 525 | if len(str)-p < l { 526 | return false 527 | } 528 | for i := 0; i < l; i++ { 529 | if str[p+i] != s[i] { 530 | return false 531 | } 532 | } 533 | p += l 534 | `, index)) 535 | } 536 | 537 | func (gg *GoGenerator) generateCharClass(ptn string, c CharClass, follower *codeFragments) *codeFragments { 538 | var id string 539 | if r, ok := gg.charClasses[ptn]; ok { 540 | id = r.id 541 | } else { 542 | id = gg.newId() 543 | gg.charClasses[ptn] = charClassResult{ 544 | id: id, 545 | code: gg.generateCharClassAux(c, nil), 546 | } 547 | } 548 | gg.useCharClass = true 549 | return &codeFragments{follower.minReq + 1, fmt.Sprintf(` 550 | if len(str)-p < %d { 551 | return false 552 | } 553 | r, size = utf8.DecodeRuneInString(str[p:]) 554 | if size == 0 || r == utf8.RuneError { 555 | return false 556 | } 557 | if !%s.Contains(r) { 558 | return false 559 | } 560 | p += size 561 | `, follower.minReq+1, id), follower} 562 | } 563 | -------------------------------------------------------------------------------- /gogenerate_charclass.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "unicode" 7 | ) 8 | 9 | func (gg *GoGenerator) generateCharClassAux(cc CharClass, follower *codeFragments) *codeFragments { 10 | switch c := cc.(type) { 11 | case AsciiMaskClass: 12 | return gg.generateAsciiMaskClass(c, follower) 13 | case CompAsciiMaskClass: 14 | return gg.generateCompAsciiMaskClass(c, follower) 15 | case *RangeTableClass: 16 | return gg.generateRangeTableClass(c, follower) 17 | case CompClass: 18 | return gg.generateCompClass(c, follower) 19 | case CompositeClass: 20 | } 21 | panic(fmt.Errorf("Please implement compiler for %T", cc)) 22 | } 23 | 24 | func (gg *GoGenerator) generateAsciiMaskClass(c AsciiMaskClass, follower *codeFragments) *codeFragments { 25 | return &codeFragments{1, fmt.Sprintf(`yarex.AsciiMaskClass{Hi: 0x%X, Lo: 0x%X}`, 
c.Hi, c.Lo), follower} 26 | } 27 | 28 | func (gg *GoGenerator) generateCompAsciiMaskClass(c CompAsciiMaskClass, follower *codeFragments) *codeFragments { 29 | return &codeFragments{1, fmt.Sprintf(`yarex.CompAsciiMaskClass{yarex.AsciiMaskClass{Hi: 0x%X, Lo: 0x%X}}`, c.AsciiMaskClass.Hi, c.AsciiMaskClass.Lo), follower} 30 | } 31 | 32 | func (gg *GoGenerator) generateRangeTableClass(c *RangeTableClass, follower *codeFragments) *codeFragments { 33 | rt := (*unicode.RangeTable)(c) 34 | var buf strings.Builder 35 | buf.WriteString("(*yarex.RangeTableClass)(*unicode.RangeTable{\n") 36 | if rt.R16 != nil { 37 | buf.WriteString(" R16: []Range16{\n") 38 | for _, r := range rt.R16 { 39 | fmt.Fprintf(&buf, " {0x%04x, 0x%04x, %d},\n", r.Lo, r.Hi, r.Stride) 40 | } 41 | buf.WriteString(" },\n") 42 | } 43 | if rt.R32 != nil { 44 | buf.WriteString(" R32: []Range32{\n") 45 | for _, r := range rt.R32 { 46 | fmt.Fprintf(&buf, " {0x%x, 0x%x, %d},\n", r.Lo, r.Hi, r.Stride) 47 | } 48 | buf.WriteString(" },\n") 49 | } 50 | if rt.LatinOffset != 0 { 51 | fmt.Fprintf(&buf, " LatinOffset: %d,\n", rt.LatinOffset) 52 | } 53 | buf.WriteString("})\n") 54 | return &codeFragments{1, buf.String(), follower} 55 | } 56 | 57 | func (gg *GoGenerator) generateCompClass(c CompClass, follower *codeFragments) *codeFragments { 58 | return &codeFragments{ 59 | 1, 60 | `yarex.CompClass{`, 61 | gg.generateCharClassAux(c.CharClass, follower.prepend("}")), 62 | } 63 | } 64 | 65 | func (gg *GoGenerator) generateCompositeClass(c CompositeClass, follower *codeFragments) *codeFragments { 66 | follower = follower.prepend(")") 67 | cs := ([]CharClass)(c) 68 | follower = gg.generateCharClassAux(cs[len(cs)-1], follower) 69 | for _, c := range cs[0 : len(cs)-1] { 70 | follower = follower.prepend(", ") 71 | follower = gg.generateCharClassAux(c, follower) 72 | } 73 | return follower.prepend("yarex.ComopsiteClass(") 74 | } 75 | -------------------------------------------------------------------------------- /match_test.go: 
-------------------------------------------------------------------------------- 1 | package yarex_test 2 | 3 | //go:generate cmd/yarexgen/yarexgen match_test.go 4 | 5 | import ( 6 | "regexp" 7 | "testing" 8 | 9 | "github.com/Maki-Daisuke/go-yarex" 10 | ) 11 | 12 | func testMatchStrings(t *testing.T, restr string, tests []string) { 13 | ast, err := yarex.Parse(restr) 14 | if err != nil { 15 | t.Fatalf("want nil, but got %s", err) 16 | } 17 | stdRe := regexp.MustCompile(restr) 18 | ast = yarex.OptimizeAst(ast) 19 | opRe := yarex.MustCompileOp(restr) 20 | compRe := yarex.MustCompile(restr) 21 | if !yarex.IsCompiledMatcher(compRe) { 22 | t.Errorf("%v should be Compiled matcher, but isn't", compRe) 23 | } 24 | for _, str := range tests { 25 | match := stdRe.MatchString(str) 26 | if yarex.AstMatch(ast, str) != match { 27 | if match { 28 | t.Errorf("(Interp) %v should match against %q, but didn't", ast, str) 29 | } else { 30 | t.Errorf("(Interp) %v shouldn't match against %q, but did", ast, str) 31 | } 32 | } 33 | if opRe.MatchString(str) != match { 34 | if match { 35 | t.Errorf("(OpTree) %v should match against %q, but didn't", opRe, str) 36 | } else { 37 | t.Errorf("(OpTree) %v shouldn't match against %q, but did", opRe, str) 38 | } 39 | } 40 | if compRe.MatchString(str) != match { 41 | if match { 42 | t.Errorf("(Compiled) %v should match against %q, but didn't", compRe, str) 43 | } else { 44 | t.Errorf("(Compiled) %v shouldn't match against %q, but did", compRe, str) 45 | } 46 | } 47 | } 48 | } 49 | 50 | func TestMatchFooBar(t *testing.T) { 51 | re := "foo bar" //yarexgen 52 | testMatchStrings(t, re, []string{ 53 | "foo bar", 54 | "foo bar", 55 | "hogefoo barfuga", 56 | "foo barf", 57 | "Afoo bar", 58 | "foo ba", 59 | }) 60 | } 61 | 62 | func TestMatchFooOrBar(t *testing.T) { 63 | re := "foo|bar" //yarexgen 64 | testMatchStrings(t, re, []string{ 65 | "foo bar", 66 | "hogefoo barfuga", 67 | "foo baz", 68 | "bar f", 69 | "foba", 70 | "", 71 | }) 72 | } 73 | 74 | func 
TestMatchBacktracking(t *testing.T) { 75 | re := "(?:foo|fo)oh" //yarexgen 76 | testMatchStrings(t, re, []string{ 77 | "fooh", 78 | "foooh", 79 | "foh", 80 | "fooooooooooh", 81 | "fooooooooofoooh", 82 | "", 83 | }) 84 | } 85 | 86 | func TestMatchZeroOrMore(t *testing.T) { 87 | re := "fo*oh" //yarexgen 88 | testMatchStrings(t, re, []string{ 89 | "fooh", 90 | "foh", 91 | "fh", 92 | "fooooooooooh", 93 | "fooooooooofoooh", 94 | "", 95 | "fo", 96 | "oh", 97 | }) 98 | } 99 | 100 | func TestMatchOneOrMore(t *testing.T) { 101 | re := "fo+oh" //yarexgen 102 | testMatchStrings(t, re, []string{ 103 | "fooh", 104 | "foh", 105 | "fh", 106 | "fooooooooooh", 107 | "fooooooooofoooh", 108 | "", 109 | "fo", 110 | "oh", 111 | }) 112 | } 113 | 114 | func TestMatchQuantifier(t *testing.T) { 115 | re := "fo{2,5}oh" //yarexgen 116 | testMatchStrings(t, re, []string{ 117 | "fooh", 118 | "foh", 119 | "fh", 120 | "fooooooooooh", 121 | "fooooooooofoooh", 122 | "", 123 | "fo", 124 | "oh", 125 | }) 126 | } 127 | 128 | func TestMatchOpt(t *testing.T) { 129 | re := "fo?oh" //yarexgen 130 | testMatchStrings(t, re, []string{ 131 | "fooh", 132 | "foh", 133 | "fh", 134 | "fooooooooooh", 135 | "fooooooooofooh", 136 | "", 137 | "fo", 138 | "oh", 139 | }) 140 | re = "fo*oh?" //yarexgen 141 | testMatchStrings(t, re, []string{ 142 | "ABfooh", 143 | "foo", 144 | "fh", 145 | "foooohoooooo", 146 | "foooooooooooCD", 147 | "", 148 | "fo", 149 | "oh", 150 | }) 151 | } 152 | 153 | func TestMatchWildcard(t *testing.T) { 154 | re := "." 
//yarexgen 155 | testMatchStrings(t, re, []string{ 156 | "aiueo", 157 | "\n", 158 | "", 159 | " ", 160 | "\b", 161 | }) 162 | re = ".+x" //yarexgen 163 | testMatchStrings(t, re, []string{ 164 | "", 165 | "x", 166 | "xx", 167 | "aaaaax", 168 | "\nx", 169 | "xx\nx", 170 | "xxxxxa", 171 | }) 172 | } 173 | 174 | func TestMatchBegin(t *testing.T) { 175 | re := "^foo bar" //yarexgen 176 | testMatchStrings(t, re, []string{ 177 | "foo bar", 178 | "foo bar", 179 | "hogefoo barfuga", 180 | "foo barf", 181 | "Afoo bar", 182 | "foo ba", 183 | "\nfoo bar", 184 | }) 185 | re = "(^|A)*foo bar" //yarexgen 186 | testMatchStrings(t, re, []string{ 187 | "foo bar", 188 | "foo bar", 189 | "hogefoo barfuga", 190 | "foo barf", 191 | "Afoo bar", 192 | "AAfoo bar", 193 | "AAAAfoo bar", 194 | "AABAAfoo bar", 195 | }) 196 | } 197 | 198 | func TestMatchBackRef(t *testing.T) { 199 | // Here, we cannot use testMatchStrings, because Go's regexp does not 200 | // support back-reference. 201 | tests := []struct { 202 | str string 203 | result bool 204 | }{ 205 | {"hogehogefuga", true}, 206 | {"AAAhogehogefugaBBB", true}, 207 | {"hogefuga", false}, 208 | {"hoge", false}, 209 | {"fuga", false}, 210 | } 211 | pattern := `(hoge)\1fuga` //yarexgen 212 | ast, err := yarex.Parse(pattern) 213 | if err != nil { 214 | t.Fatalf("want nil, but got %s", err) 215 | } 216 | ast = yarex.OptimizeAst(ast) 217 | opRe := yarex.MustCompileOp(pattern) 218 | compRe := yarex.MustCompileOp(pattern) 219 | for _, test := range tests { 220 | if yarex.AstMatch(ast, test.str) != test.result { 221 | if test.result { 222 | t.Errorf("(Interp) %v should match against %q, but didn't", ast, test.str) 223 | } else { 224 | t.Errorf("(Interp) %v shouldn't match against %q, but did", ast, test.str) 225 | } 226 | } 227 | if opRe.MatchString(test.str) != test.result { 228 | if test.result { 229 | t.Errorf("(OpTree) %v should match against %q, but didn't", opRe, test.str) 230 | } else { 231 | t.Errorf("(OpTree) %v shouldn't match against 
%q, but did", opRe, test.str) 232 | } 233 | } 234 | if compRe.MatchString(test.str) != test.result { 235 | if test.result { 236 | t.Errorf("(Compiled) %v should match against %q, but didn't", compRe, test.str) 237 | } else { 238 | t.Errorf("(Compiled) %v shouldn't match against %q, but did", compRe, test.str) 239 | } 240 | } 241 | } 242 | } 243 | 244 | func TestMatchClass(t *testing.T) { 245 | re := "[0aB]" //yarexgen 246 | testMatchStrings(t, re, []string{ 247 | "foo", // false 248 | "foo bar", // true 249 | "FOO BAR", // true 250 | "AAAAAA", // false 251 | "012345", // true 252 | "\000hoge", // false 253 | "\000hage", // true 254 | }) 255 | re = "[A-Z0-9][a-z]" //yarexgen 256 | testMatchStrings(t, re, []string{ 257 | "absksdjhasd", 258 | "alsdAAA", 259 | "asl;k3as7djj", 260 | "Aiiiiiiii9", 261 | "foo BAR", 262 | "FOO bar", 263 | "FOObar", 264 | "fooBARbaz", 265 | }) 266 | } 267 | 268 | func TestSipAddress(t *testing.T) { 269 | re := `^["]{0,1}([^"]*)["]{0,1}[ ]*<(sip|tel|sips):(([^@]*)@){0,1}([^>^:]*|\[[a-fA-F0-9:]*\]):{0,1}([0-9]*){0,1}>(;.*){0,1}$` //yarexgen 270 | testMatchStrings(t, re, []string{ 271 | "\"display_name\";user=phone;hogehoge", 272 | "", 273 | "\"display_name\"", 274 | ";user=phone", 275 | "\"0333334444\";user=phone", 276 | }) 277 | } 278 | -------------------------------------------------------------------------------- /matchcontext.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | import "unsafe" 4 | 5 | const initialStackSize = 64 6 | 7 | type stackFrame struct { 8 | index uint 9 | pos int 10 | } 11 | 12 | type matchContext struct { 13 | str uintptr // *string // string being matched 14 | getStack uintptr // *func() []stackFrame // Accessors to stack to record capturing positions. 15 | setStack uintptr // *func([]stackFrame) // We use uintptr to avoid leaking param. 
16 | stackTop int // stack top 17 | } 18 | 19 | func makeContext(str *string, getter *func() []stackFrame, setter *func([]stackFrame)) matchContext { 20 | return matchContext{uintptr(unsafe.Pointer(str)), uintptr(unsafe.Pointer(getter)), uintptr(unsafe.Pointer(setter)), 0} 21 | } 22 | 23 | func (c matchContext) push(i uint, p int) matchContext { 24 | st := (*(*func() []stackFrame)(unsafe.Pointer(c.getStack)))() // == c.getStack() 25 | sf := stackFrame{i, p} 26 | if len(st) <= c.stackTop { 27 | st = append(st, sf) 28 | (*(*func([]stackFrame))(unsafe.Pointer(c.setStack)))(st) // == c.setStack(st) 29 | } else { 30 | st[c.stackTop] = sf 31 | } 32 | c.stackTop++ 33 | return c 34 | } 35 | 36 | // GetOffset returns (-1, -1) when it cannot find specified index. 37 | func (c matchContext) GetOffset(idx uint) (start int, end int) { 38 | st := (*(*func() []stackFrame)(unsafe.Pointer(c.getStack)))() // == c.getStack() 39 | i := c.stackTop - 1 40 | for ; ; i-- { 41 | if i == 0 { 42 | return -1, -1 43 | } 44 | if st[i].index == idx { 45 | end = st[i].pos 46 | break 47 | } 48 | } 49 | i-- 50 | for ; i >= 0; i-- { 51 | if st[i].index == idx { 52 | start = st[i].pos 53 | return 54 | } 55 | } 56 | // This should not happen. 
57 | panic("Undetermined capture") 58 | } 59 | 60 | func (c matchContext) GetCaptured(i uint) (string, bool) { 61 | start, end := c.GetOffset(i) 62 | if start < 0 { 63 | return "", false 64 | } 65 | str := *(*string)(unsafe.Pointer(c.str)) 66 | return str[start:end], true 67 | } 68 | -------------------------------------------------------------------------------- /opcompile.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | func newOpAlt(left, right OpTree) *OpAlt { 4 | min := left.minimumReq() 5 | if right.minimumReq() < min { 6 | min = right.minimumReq() 7 | } 8 | return &OpAlt{ 9 | OpBase: OpBase{ 10 | minReq: min, 11 | follower: left, 12 | }, 13 | alt: right, 14 | } 15 | } 16 | 17 | func opCompile(re Ast) OpTree { 18 | return (&opCompiler{}).compile(re, OpSuccess{}) 19 | } 20 | 21 | type opCompiler struct { 22 | repeatCount uint 23 | } 24 | 25 | func (oc *opCompiler) compile(re Ast, follower OpTree) OpTree { 26 | switch r := re.(type) { 27 | case AstLit: 28 | str := string(r) 29 | return &OpStr{ 30 | OpBase: OpBase{ 31 | minReq: follower.minimumReq() + len(str), 32 | follower: follower, 33 | }, 34 | str: str, 35 | } 36 | case *AstSeq: 37 | return oc.compileSeq(r.seq, follower) 38 | case *AstAlt: 39 | return oc.compileAlt(r.opts, follower) 40 | case AstNotNewline: 41 | return &OpNotNewLine{ 42 | OpBase: OpBase{ 43 | minReq: follower.minimumReq() + 1, 44 | follower: follower, 45 | }, 46 | } 47 | case *AstRepeat: 48 | return oc.compileRepeat(r.re, r.min, r.max, follower) 49 | case *AstCap: 50 | return oc.compileCapture(r.re, r.index, follower) 51 | case AstBackRef: 52 | return &OpBackRef{ 53 | OpBase: OpBase{ 54 | minReq: follower.minimumReq(), 55 | follower: follower, 56 | }, 57 | key: ContextKey{'c', uint(r)}, 58 | } 59 | case AstAssertBegin: 60 | return &OpAssertBegin{ 61 | OpBase: OpBase{ 62 | minReq: follower.minimumReq(), 63 | follower: follower, 64 | }, 65 | } 66 | case AstAssertEnd: 67 | return 
&OpAssertEnd{ 68 | OpBase: OpBase{ 69 | minReq: follower.minimumReq(), 70 | follower: follower, 71 | }, 72 | } 73 | case AstCharClass: 74 | return &OpClass{ 75 | OpBase: OpBase{ 76 | minReq: follower.minimumReq() + 1, 77 | follower: follower, 78 | }, 79 | cls: (CharClass)(r), 80 | } 81 | } 82 | panic("EXECUTION SHOULD NOT REACH HERE") 83 | } 84 | 85 | func (oc *opCompiler) compileSeq(seq []Ast, follower OpTree) OpTree { 86 | if len(seq) == 0 { 87 | return follower 88 | } 89 | follower = oc.compileSeq(seq[1:], follower) 90 | return oc.compile(seq[0], follower) 91 | } 92 | 93 | func (oc *opCompiler) compileAlt(opts []Ast, follower OpTree) OpTree { 94 | if len(opts) == 0 { 95 | panic("THIS SHOULD NOT HAPPEN") 96 | } 97 | left := oc.compile(opts[0], follower) 98 | if len(opts) == 1 { 99 | return left 100 | } 101 | right := oc.compileAlt(opts[1:], follower) 102 | return newOpAlt(left, right) 103 | } 104 | 105 | func (oc *opCompiler) compileRepeat(re Ast, min, max int, follower OpTree) OpTree { 106 | if min > 0 { 107 | return oc.compile(re, oc.compileRepeat(re, min-1, max-1, follower)) 108 | } 109 | if max == 0 { 110 | return follower 111 | } 112 | switch r := re.(type) { 113 | // Optimization for repeating fixed-length patterns 114 | case AstLit: 115 | return &OpRepeatLit{ 116 | OpBase: OpBase{ 117 | follower: follower, 118 | minReq: follower.minimumReq(), 119 | }, 120 | lit: string(r), 121 | max: max, 122 | } 123 | case AstCharClass: 124 | return &OpRepeatClass{ 125 | OpBase: OpBase{ 126 | follower: follower, 127 | minReq: follower.minimumReq(), 128 | }, 129 | CharClass: r.CharClass, 130 | max: max, 131 | } 132 | } 133 | if max > 0 { 134 | left := oc.compile(re, oc.compileRepeat(re, 0, max-1, follower)) 135 | return newOpAlt(left, follower) 136 | } 137 | // If you are here max < 0, which means infinite repeat 138 | if !canMatchZeroWidth(re) { // If re does not match zero-width string, we can optimize by skipping zero-width check 139 | self := &OpAlt{ 140 | OpBase: 
OpBase{ 141 | minReq: follower.minimumReq(), 142 | }, 143 | alt: follower, 144 | } 145 | self.follower = oc.compile(re, self) // self-reference makes infinite loop 146 | return self 147 | } 148 | oc.repeatCount++ 149 | self := &OpRepeat{ 150 | OpBase: OpBase{ 151 | minReq: follower.minimumReq(), 152 | }, 153 | key: ContextKey{'r', oc.repeatCount}, 154 | alt: follower, 155 | } 156 | self.follower = oc.compile(re, self) // self-reference makes infinite loop 157 | return self 158 | } 159 | 160 | func canMatchZeroWidth(re Ast) bool { 161 | switch r := re.(type) { 162 | case AstBackRef, AstAssertBegin, AstAssertEnd: 163 | return true 164 | case AstNotNewline, AstCharClass: 165 | return false 166 | case AstLit: 167 | return len(string(r)) == 0 168 | case *AstSeq: 169 | for _, s := range r.seq { 170 | if !canMatchZeroWidth(s) { 171 | return false 172 | } 173 | } 174 | return true 175 | case *AstAlt: 176 | for _, o := range r.opts { 177 | if canMatchZeroWidth(o) { 178 | return true 179 | } 180 | } 181 | return false 182 | case *AstRepeat: 183 | return r.min == 0 || canMatchZeroWidth(r.re) 184 | case *AstCap: 185 | return canMatchZeroWidth(r.re) 186 | } 187 | panic("EXECUTION SHOULD NOT REACH HERE") 188 | } 189 | 190 | func (oc *opCompiler) compileCapture(re Ast, index uint, follower OpTree) OpTree { 191 | follower = &OpCaptureEnd{ 192 | OpBase: OpBase{ 193 | minReq: follower.minimumReq(), 194 | follower: follower, 195 | }, 196 | key: ContextKey{'c', index}, 197 | } 198 | follower = oc.compile(re, follower) 199 | return &OpCaptureStart{ 200 | OpBase: OpBase{ 201 | minReq: follower.minimumReq(), 202 | follower: follower, 203 | }, 204 | key: ContextKey{'c', index}, 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /opmatchcontext.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | import ( 4 | "sync" 5 | "unsafe" 6 | ) 7 | 8 | type ContextKey struct { 9 | Kind rune 
10 | Index uint 11 | } 12 | 13 | type opStackFrame struct { 14 | Key ContextKey 15 | Pos int 16 | } 17 | 18 | var opStackPool = sync.Pool{ 19 | New: func() interface{} { 20 | b := make([]opStackFrame, initialStackSize) 21 | return &b 22 | }, 23 | } 24 | 25 | type MatchContext struct { 26 | Str uintptr // *string // string being matched 27 | getStack uintptr // *func() []opStackFrame // Accessors to stack to record capturing positions. 28 | setStack uintptr // *func([]opStackFrame) // We use uintptr to avoid leaking param. 29 | stackTop int // stack top 30 | } 31 | 32 | func makeOpMatchContext(str *string, getter *func() []opStackFrame, setter *func([]opStackFrame)) MatchContext { 33 | return MatchContext{uintptr(unsafe.Pointer(str)), uintptr(unsafe.Pointer(getter)), uintptr(unsafe.Pointer(setter)), 0} 34 | } 35 | 36 | func (c MatchContext) Push(k ContextKey, p int) MatchContext { 37 | st := (*(*func() []opStackFrame)(unsafe.Pointer(c.getStack)))() // c.getStack() 38 | sf := opStackFrame{k, p} 39 | if len(st) <= c.stackTop { 40 | st = append(st, sf) 41 | st = st[:cap(st)] 42 | (*(*func([]opStackFrame))(unsafe.Pointer(c.setStack)))(st) // c.setStack(st) 43 | } else { 44 | st[c.stackTop] = sf 45 | } 46 | c.stackTop++ 47 | return c 48 | } 49 | 50 | func (c MatchContext) GetCaptured(k ContextKey) (string, bool) { 51 | loc := c.GetCapturedIndex(k) 52 | if loc == nil { 53 | return "", false 54 | } 55 | return (*(*string)(unsafe.Pointer(c.Str)))[loc[0]:loc[1]], true 56 | } 57 | 58 | func (c MatchContext) GetCapturedIndex(k ContextKey) []int { 59 | var start, end int 60 | st := (*(*func() []opStackFrame)(unsafe.Pointer(c.getStack)))() // c.getStack() 61 | i := c.stackTop - 1 62 | for ; ; i-- { 63 | if i == 0 { 64 | return nil 65 | } 66 | if st[i].Key == k { 67 | end = st[i].Pos 68 | break 69 | } 70 | } 71 | i-- 72 | for ; i >= 0; i-- { 73 | if st[i].Key == k { 74 | start = st[i].Pos 75 | return []int{start, end} 76 | } 77 | } 78 | // This should not happen. 
79 | panic("Undetermined capture") 80 | } 81 | 82 | func (c MatchContext) FindVal(k ContextKey) int { 83 | st := (*(*func() []opStackFrame)(unsafe.Pointer(c.getStack)))() // c.getStack() 84 | for i := c.stackTop - 1; i >= 0; i-- { 85 | if st[i].Key == k { 86 | return st[i].Pos 87 | } 88 | } 89 | return -1 90 | } 91 | -------------------------------------------------------------------------------- /optimize-ast.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | import "fmt" 4 | 5 | func optimizeAst(re Ast) Ast { 6 | re = optimizeAstFlattenSeqAndAlt(re) 7 | re = optimizeAstUnwrapSingletonSeqAndAlt(re) 8 | return re 9 | } 10 | 11 | // optimizeAstFlattenSeqAndAlt flattens nested AstSeqs and AstAlts 12 | func optimizeAstFlattenSeqAndAlt(re Ast) Ast { 13 | switch v := re.(type) { 14 | case *AstSeq: 15 | out := make([]Ast, 0, len(v.seq)) 16 | for _, r := range v.seq { 17 | r = optimizeAstFlattenSeqAndAlt(r) 18 | if as, ok := r.(*AstSeq); ok { 19 | out = append(out, as.seq...) 20 | } else { 21 | out = append(out, r) 22 | } 23 | } 24 | return &AstSeq{out} 25 | case *AstAlt: 26 | out := make([]Ast, 0, len(v.opts)) 27 | for _, r := range v.opts { 28 | r = optimizeAstFlattenSeqAndAlt(r) 29 | if aa, ok := r.(*AstAlt); ok { 30 | out = append(out, aa.opts...) 
31 | } else { 32 | out = append(out, r) 33 | } 34 | } 35 | return &AstAlt{out} 36 | case *AstRepeat: 37 | out := *v 38 | out.re = optimizeAstFlattenSeqAndAlt(v.re) 39 | return &out 40 | case *AstCap: 41 | out := *v 42 | out.re = optimizeAstFlattenSeqAndAlt(v.re) 43 | return &out 44 | case AstLit, AstNotNewline, AstAssertBegin, AstAssertEnd, AstBackRef, AstCharClass: 45 | return v 46 | default: 47 | panic(fmt.Errorf("IMPLEMENT optimizeAstFlattenSeqAndAlt for %T", re)) 48 | } 49 | } 50 | 51 | // optimizeAstUnwrapSingletonSeqAndAlt joins adjacent literals, and unwrap seqs and alts 52 | // containing a single re as much as possible 53 | func optimizeAstUnwrapSingletonSeqAndAlt(re Ast) Ast { 54 | switch v := re.(type) { 55 | case *AstSeq: 56 | out := make([]Ast, 0, len(v.seq)) 57 | var acc *string = nil 58 | for _, r := range v.seq { 59 | r = optimizeAstUnwrapSingletonSeqAndAlt(r) 60 | if lit, ok := r.(AstLit); ok { 61 | if acc == nil { 62 | s := string(lit) 63 | acc = &s 64 | } else { 65 | *acc = *acc + string(lit) 66 | } 67 | } else { 68 | if acc != nil { 69 | out = append(out, AstLit(*acc)) 70 | acc = nil 71 | } 72 | out = append(out, r) 73 | } 74 | } 75 | if acc != nil { 76 | out = append(out, AstLit(*acc)) 77 | } 78 | switch len(out) { 79 | case 0: 80 | return AstLit("") 81 | case 1: 82 | return out[0] 83 | } 84 | return &AstSeq{out} 85 | case *AstAlt: 86 | out := make([]Ast, len(v.opts), len(v.opts)) 87 | for i, r := range v.opts { 88 | out[i] = optimizeAstUnwrapSingletonSeqAndAlt(r) 89 | } 90 | switch len(out) { 91 | case 0: 92 | return AstLit("") 93 | case 1: 94 | return out[0] 95 | } 96 | return &AstAlt{out} 97 | case *AstRepeat: 98 | out := *v 99 | out.re = optimizeAstUnwrapSingletonSeqAndAlt(v.re) 100 | return &out 101 | case *AstCap: 102 | out := *v 103 | out.re = optimizeAstUnwrapSingletonSeqAndAlt(v.re) 104 | return &out 105 | default: 106 | return v 107 | } 108 | } 109 | 110 | func canOnlyMatchAtBegining(re Ast) bool { 111 | switch v := re.(type) { 112 | 
case AstAssertBegin: 113 | return true 114 | case *AstSeq: 115 | if len(v.seq) == 0 { 116 | return false 117 | } 118 | return canOnlyMatchAtBegining(v.seq[0]) 119 | case *AstAlt: 120 | if len(v.opts) == 0 { 121 | return false 122 | } 123 | for _, r := range v.opts { 124 | if !canOnlyMatchAtBegining(r) { 125 | return false 126 | } 127 | } 128 | return true 129 | case *AstRepeat: 130 | if v.min == 0 { 131 | return false 132 | } 133 | return canOnlyMatchAtBegining(v.re) 134 | case *AstCap: 135 | return canOnlyMatchAtBegining(v.re) 136 | default: 137 | return false 138 | } 139 | } 140 | 141 | func minRequiredLengthOfAst(re Ast) int { 142 | switch v := re.(type) { 143 | case AstAssertBegin, AstAssertEnd, AstBackRef: 144 | return 0 145 | case AstNotNewline, AstCharClass: 146 | return 1 147 | case AstLit: 148 | return len(string(v)) 149 | case *AstSeq: 150 | acc := 0 151 | for _, r := range v.seq { 152 | acc += minRequiredLengthOfAst(r) 153 | } 154 | return acc 155 | case *AstAlt: 156 | min := minRequiredLengthOfAst(v.opts[0]) 157 | for _, r := range v.opts[1:] { 158 | m := minRequiredLengthOfAst(r) 159 | if m < min { 160 | min = m 161 | } 162 | } 163 | return min 164 | case *AstRepeat: 165 | return minRequiredLengthOfAst(v.re) * v.min 166 | case *AstCap: 167 | return minRequiredLengthOfAst(v.re) 168 | default: 169 | panic(fmt.Errorf("IMPLEMENT optimizeAstFlattenSeqAndAlt for %T", re)) 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /optree.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | type OpTree interface { 4 | minimumReq() int 5 | } 6 | 7 | type OpBase struct { 8 | minReq int 9 | follower OpTree 10 | } 11 | 12 | func (op OpBase) minimumReq() int { 13 | return op.minReq 14 | } 15 | 16 | type OpSuccess struct{} 17 | 18 | func (_ OpSuccess) minimumReq() int { 19 | return 0 20 | } 21 | 22 | type OpStr struct { 23 | OpBase 24 | str string 25 | } 26 | 27 | 
type OpAlt struct { 28 | OpBase 29 | alt OpTree 30 | } 31 | 32 | type OpRepeat struct { 33 | OpBase 34 | alt OpTree 35 | key ContextKey 36 | } 37 | 38 | // OpRepeat optimized for literal 39 | type OpRepeatLit struct { 40 | OpBase 41 | lit string 42 | max int 43 | } 44 | 45 | // OpRepeat optimized for char class 46 | type OpRepeatClass struct { 47 | OpBase 48 | CharClass 49 | max int 50 | } 51 | 52 | type OpClass struct { 53 | OpBase 54 | cls CharClass 55 | } 56 | 57 | type OpNotNewLine struct { 58 | OpBase 59 | } 60 | 61 | type OpCaptureStart struct { 62 | OpBase 63 | key ContextKey 64 | } 65 | 66 | type OpCaptureEnd struct { // actuallly, this is identical to OpCaptureStart 67 | OpBase 68 | key ContextKey 69 | } 70 | 71 | type OpBackRef struct { 72 | OpBase 73 | key ContextKey 74 | } 75 | 76 | type OpAssertBegin struct { 77 | OpBase 78 | } 79 | 80 | type OpAssertEnd struct { 81 | OpBase 82 | } 83 | -------------------------------------------------------------------------------- /optreematch.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | import ( 4 | "strings" 5 | "unicode/utf8" 6 | "unsafe" 7 | ) 8 | 9 | type opExecer struct { 10 | op OpTree 11 | } 12 | 13 | func (oe opExecer) exec(str string, pos int, onSuccess func(MatchContext)) bool { 14 | op := oe.op 15 | _, headOnly := op.(*OpAssertBegin) 16 | if headOnly && pos != 0 { 17 | return false 18 | } 19 | minReq := op.minimumReq() 20 | if minReq > len(str)-pos { 21 | return false 22 | } 23 | stack := *(opStackPool.Get().(*[]opStackFrame)) 24 | defer func() { opStackPool.Put(&stack) }() 25 | getter := func() []opStackFrame { return stack } 26 | setter := func(s []opStackFrame) { stack = s } 27 | ctx0 := makeOpMatchContext(&str, &getter, &setter) 28 | if opTreeExec(op, ctx0.Push(ContextKey{'c', 0}, 0), pos, onSuccess) { 29 | return true 30 | } 31 | if headOnly { 32 | return false 33 | } 34 | for i := pos + 1; minReq <= len(str)-i; i++ { 35 | if 
opTreeExec(op, ctx0.Push(ContextKey{'c', 0}, i), i, onSuccess) { 36 | return true 37 | } 38 | } 39 | return false 40 | } 41 | 42 | func opTreeExec(next OpTree, ctx MatchContext, p int, onSuccess func(MatchContext)) bool { 43 | str := *(*string)(unsafe.Pointer(ctx.Str)) 44 | var ( 45 | localStack [16]int 46 | heapStack *[]int 47 | ) 48 | for { 49 | switch op := next.(type) { 50 | case OpSuccess: 51 | ctx = ctx.Push(ContextKey{'c', 0}, p) 52 | onSuccess(ctx) 53 | return true 54 | case *OpStr: 55 | if len(str)-p < op.minReq { 56 | return false 57 | } 58 | for i := 0; i < len(op.str); i++ { 59 | if str[p+i] != op.str[i] { 60 | return false 61 | } 62 | } 63 | next = op.follower 64 | p += len(op.str) 65 | case *OpAlt: 66 | if opTreeExec(op.follower, ctx, p, onSuccess) { 67 | return true 68 | } 69 | next = op.alt 70 | case *OpRepeat: 71 | prev := ctx.FindVal(op.key) 72 | if prev == p { // This means zero-width matching occurs. 73 | next = op.alt // So, terminate repeating. 74 | continue 75 | } 76 | ctx2 := ctx.Push(op.key, p) 77 | if opTreeExec(op.follower, ctx2, p, onSuccess) { 78 | return true 79 | } 80 | next = op.alt 81 | case *OpRepeatLit: 82 | endPos := len(str) - op.minReq - len(op.lit) 83 | n := 0 84 | for (op.max < 0 || n < op.max) && p <= endPos { 85 | if str[p:p+len(op.lit)] != op.lit { 86 | break 87 | } 88 | if len(localStack) == n { 89 | goto OpRepeatLit_HEAP_STACK 90 | } 91 | localStack[n] = p 92 | n++ 93 | p += len(op.lit) 94 | } 95 | for n > 0 { // try backtrack 96 | if opTreeExec(op.follower, ctx, p, onSuccess) { 97 | return true 98 | } 99 | n-- 100 | p = localStack[n] 101 | } 102 | next = op.follower 103 | break 104 | OpRepeatLit_HEAP_STACK: 105 | heapStack = (IntStackPool.Get().(*[]int)) 106 | copy(*heapStack, localStack[:]) 107 | (*heapStack)[n] = p 108 | n++ 109 | p += len(op.lit) 110 | for (op.max < 0 || n < op.max) && p <= endPos { 111 | if str[p:p+len(op.lit)] != op.lit { 112 | break 113 | } 114 | if len(*heapStack) == n { 115 | *heapStack = 
append(*heapStack, p) 116 | *heapStack = (*heapStack)[:cap(*heapStack)] 117 | } else { 118 | (*heapStack)[n] = p 119 | } 120 | n++ 121 | p += len(op.lit) 122 | } 123 | for n > 0 { // try backtrack 124 | if opTreeExec(op.follower, ctx, p, onSuccess) { 125 | IntStackPool.Put(heapStack) 126 | return true 127 | } 128 | n-- 129 | p = (*heapStack)[n] 130 | } 131 | IntStackPool.Put(heapStack) 132 | next = op.follower 133 | case *OpRepeatClass: 134 | endPos := len(str) - op.minReq - 1 135 | size := 0 136 | n := 0 137 | for (op.max < 0 || n < op.max) && p <= endPos { 138 | r, size := utf8.DecodeRuneInString(str[p:]) 139 | if size == 0 || r == utf8.RuneError { 140 | break 141 | } 142 | if !op.CharClass.Contains(r) { 143 | break 144 | } 145 | if len(localStack) == n { 146 | goto OpRepeatClass_HEAP_STACK 147 | } 148 | localStack[n] = p 149 | n++ 150 | p += size 151 | } 152 | for n > 0 { // try backtrack 153 | if opTreeExec(op.follower, ctx, p, onSuccess) { 154 | return true 155 | } 156 | n-- 157 | p = localStack[n] 158 | } 159 | next = op.follower 160 | break 161 | OpRepeatClass_HEAP_STACK: 162 | heapStack = (IntStackPool.Get().(*[]int)) 163 | copy(*heapStack, localStack[:]) 164 | (*heapStack)[n] = p 165 | n++ 166 | p += size 167 | for (op.max < 0 || n < op.max) && p <= endPos { 168 | r, size := utf8.DecodeRuneInString(str[p:]) 169 | if size == 0 || r == utf8.RuneError { 170 | break 171 | } 172 | if !op.CharClass.Contains(r) { 173 | break 174 | } 175 | if len(*heapStack) == n { 176 | *heapStack = append(*heapStack, p) 177 | *heapStack = (*heapStack)[:cap(*heapStack)] 178 | } else { 179 | (*heapStack)[n] = p 180 | } 181 | n++ 182 | p += size 183 | } 184 | for n > 0 { // try backtrack 185 | if opTreeExec(op.follower, ctx, p, onSuccess) { 186 | IntStackPool.Put(heapStack) 187 | return true 188 | } 189 | n-- 190 | p = (*heapStack)[n] 191 | } 192 | IntStackPool.Put(heapStack) 193 | next = op.follower 194 | case *OpClass: 195 | if len(str)-p < op.minReq { 196 | return false 197 | } 
198 | r, size := utf8.DecodeRuneInString(str[p:]) 199 | if size == 0 || r == utf8.RuneError { 200 | return false 201 | } 202 | if !op.cls.Contains(r) { 203 | return false 204 | } 205 | next = op.follower 206 | p += size 207 | case *OpNotNewLine: 208 | if len(str)-p < op.minReq { 209 | return false 210 | } 211 | r, size := utf8.DecodeRuneInString(str[p:]) 212 | if size == 0 || r == utf8.RuneError { 213 | return false 214 | } 215 | if r == '\n' { 216 | return false 217 | } 218 | next = op.follower 219 | p += size 220 | case *OpCaptureStart: 221 | return opTreeExec(op.follower, ctx.Push(op.key, p), p, onSuccess) 222 | case *OpCaptureEnd: 223 | return opTreeExec(op.follower, ctx.Push(op.key, p), p, onSuccess) 224 | case *OpBackRef: 225 | s, ok := ctx.GetCaptured(op.key) 226 | if !ok || !strings.HasPrefix(str[p:], s) { 227 | return false 228 | } 229 | next = op.follower 230 | p += len(s) 231 | case *OpAssertBegin: 232 | if p != 0 { 233 | return false 234 | } 235 | next = op.follower 236 | case *OpAssertEnd: 237 | if p != len(str) { 238 | return false 239 | } 240 | next = op.follower 241 | } 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /parse.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | "unicode" 7 | ) 8 | 9 | func parse(s string) (re Ast, err error) { 10 | defer func() { 11 | if e := recover(); e != nil { 12 | err = e.(error) 13 | re = nil 14 | } 15 | }() 16 | re, remain := (&(parser{})).parseAlt([]rune(s)) 17 | if len(remain) > 0 { 18 | return nil, fmt.Errorf("Unknown context: %q", remain) 19 | } 20 | return re, nil 21 | } 22 | 23 | type parser struct { 24 | openCaptures uint 25 | closeCaptures uint 26 | } 27 | 28 | func (*parser) parseLit(str []rune) (Ast, []rune) { 29 | if len(str) == 0 { 30 | panic(fmt.Errorf("Literal is expected, but reached end-of-string unexpectedly")) 31 | } 32 | switch str[0] { 33 | 
case '$', '^', '*', '(', ')', '+', '[', ']', '{', '}', '|', '\\', '.', '?': 34 | panic(fmt.Errorf("Literal is expected, but cannot find: %q", string(str))) 35 | } 36 | return AstLit(str[0:1]), str[1:] 37 | } 38 | 39 | func (p *parser) parseSeq(str []rune) (Ast, []rune) { 40 | seq := make([]Ast, 0, 8) 41 | LOOP: 42 | for len(str) > 0 { 43 | var re Ast 44 | switch str[0] { 45 | case '^': 46 | re = AstAssertBegin{} 47 | str = str[1:] 48 | case '$': 49 | re = AstAssertEnd{} 50 | str = str[1:] 51 | case '.': 52 | re = AstNotNewline{} 53 | str = str[1:] 54 | case '\\': 55 | re, str = p.parseEscape(str) 56 | case '[': 57 | re, str = p.parseClass(str) 58 | case '(': 59 | re, str = p.parseGroup(str) 60 | case ')', '|': 61 | break LOOP 62 | default: 63 | re, str = p.parseLit(str) 64 | } 65 | re, str = p.parseQuantifier(str, re) 66 | seq = append(seq, re) 67 | } 68 | if len(seq) == 1 { 69 | return seq[0], str 70 | } else { 71 | return &AstSeq{seq}, str 72 | } 73 | } 74 | 75 | func (p *parser) parseAlt(str []rune) (Ast, []rune) { 76 | re, str := p.parseSeq(str) 77 | opts := []Ast{re} 78 | LOOP: 79 | for len(str) > 0 { 80 | switch str[0] { 81 | case '|': 82 | var re Ast 83 | re, str = p.parseAlt(str[1:]) 84 | opts = append(opts, re) 85 | case ')': 86 | break LOOP 87 | default: 88 | panic(fmt.Errorf("Unknown context: %q", string(str))) 89 | } 90 | } 91 | if len(opts) == 1 { 92 | return opts[0], str 93 | } else { 94 | return &AstAlt{opts}, str 95 | } 96 | } 97 | 98 | func (p *parser) parseGroup(str []rune) (Ast, []rune) { 99 | if str[0] != '(' { 100 | panic(fmt.Errorf("'(' is expected, but cannot find: %q", string(str))) 101 | } 102 | if len(str) < 2 { 103 | panic(fmt.Errorf("Unmatched '(' : %q", string(str))) 104 | } 105 | if str[1] != '?' 
{ 106 | return p.parseCapture(str[1:]) 107 | } 108 | if str[2] != ':' { 109 | panic(fmt.Errorf("Unknown extended pattern syntax: %q", string(str))) 110 | } 111 | re, remain := p.parseAlt(str[3:]) 112 | if remain[0] != ')' { 113 | panic(fmt.Errorf("Unmatched '(' : %q", string(str))) 114 | } 115 | return re, remain[1:] 116 | } 117 | 118 | func (p *parser) parseCapture(str []rune) (Ast, []rune) { 119 | p.openCaptures++ 120 | index := p.openCaptures 121 | re, remain := p.parseAlt(str) 122 | if remain[0] != ')' { 123 | panic(fmt.Errorf("Unmatched '(' : %q", string(str))) 124 | } 125 | p.closeCaptures++ 126 | return &AstCap{index, re}, remain[1:] 127 | } 128 | 129 | func (p *parser) parseQuantifier(str []rune, re Ast) (Ast, []rune) { 130 | if len(str) == 0 { 131 | return re, str 132 | } 133 | switch str[0] { 134 | case '*': 135 | return &AstRepeat{re, 0, -1}, str[1:] 136 | case '+': 137 | return &AstRepeat{re, 1, -1}, str[1:] 138 | case '?': 139 | return &AstRepeat{re, 0, 1}, str[1:] 140 | case '{': 141 | start, remain := p.parseInt(str[1:]) 142 | if remain == nil { 143 | panic(fmt.Errorf(`Invalid quantifier: %q`, string(str))) 144 | } 145 | switch remain[0] { 146 | case '}': 147 | return &AstRepeat{re, start, start}, remain[1:] 148 | case ',': 149 | end, remain := p.parseInt(remain[1:]) 150 | if remain == nil { 151 | panic(fmt.Errorf(`Invalid quantifier: %q`, string(str))) 152 | } 153 | if remain[0] != '}' { 154 | panic(fmt.Errorf("Unmatched '{' : %q", string(str))) 155 | } 156 | return &AstRepeat{re, start, end}, remain[1:] 157 | default: 158 | panic(fmt.Errorf("Unmatched '{' : %q", string(str))) 159 | } 160 | } 161 | return re, str 162 | } 163 | 164 | // parseInt returns (0, nil) if it cannot find any integer at the head of str 165 | func (p *parser) parseInt(str []rune) (int, []rune) { 166 | i := 0 167 | for ; i < len(str); i++ { 168 | if str[i] < '0' || '9' < str[i] { 169 | break 170 | } 171 | } 172 | if i == 0 { 173 | return 0, nil 174 | } 175 | x, err := 
strconv.ParseInt(string(str[0:i]), 10, 32) 176 | if err != nil { 177 | panic(fmt.Errorf(`(THIS SHOULD NOT HAPPEN) can't parse int: %q`, string(str[0:i]))) 178 | } 179 | return int(x), str[i:] 180 | } 181 | 182 | func (p *parser) parseEscape(str []rune) (Ast, []rune) { 183 | return p.parseEscapeAux(str, false) 184 | } 185 | 186 | func (p *parser) parseEscapeAux(str []rune, inClass bool) (Ast, []rune) { 187 | if str[0] != '\\' { 188 | panic(fmt.Errorf("'\\' is expected, but cannot find: %q", string(str))) 189 | } 190 | if len(str) < 2 { 191 | panic(fmt.Errorf("Trailing '\\' in regex: %q", string(str))) 192 | } 193 | switch str[1] { 194 | case ' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', 195 | '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~': 196 | return AstLit(str[1:2]), str[2:] 197 | case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 198 | if str[2] < '0' || '9' < str[2] { 199 | if str[1] == '0' { 200 | return AstLit([]rune{0}), str[2:] 201 | } 202 | if !inClass { 203 | return AstBackRef(str[1] - '0'), str[2:] 204 | } 205 | panic(fmt.Errorf("invalid character %q in octal escape: %q", str[2], string(str))) 206 | } 207 | if str[3] < '0' || '9' < str[3] { 208 | panic(fmt.Errorf("invalid character %q in octal escape: %q", str[3], string(str))) 209 | } 210 | oct, err := strconv.ParseUint(string(str[1:4]), 8, 8) 211 | if err != nil { 212 | panic(fmt.Errorf("can't parse octal escape in %q: %w", string(str), err)) 213 | } 214 | return AstLit([]rune{rune(oct)}), str[4:] 215 | default: 216 | panic(fmt.Errorf("Unknown escape sequence: %q", string(str))) 217 | } 218 | } 219 | 220 | func (p *parser) parseClass(str []rune) (Ast, []rune) { 221 | if str[0] != '[' { 222 | panic(fmt.Errorf("'[' is expected, but cannot find: %q", string(str))) 223 | } 224 | origStr := str 225 | str = str[1:] 226 | rangeTable := &unicode.RangeTable{} 227 | ccs := []CharClass{} 228 | isNegate := false 229 | if str[0] == '^' { 
230 | isNegate = true 231 | str = str[1:] 232 | } 233 | if str[0] == ']' || str[0] == '-' { 234 | rangeTable = mergeRangeTable(rangeTable, rangeTableFromTo(str[0], str[0])) 235 | str = str[1:] 236 | } 237 | LOOP: 238 | for { 239 | if len(str) == 0 { 240 | panic(fmt.Errorf("unmatched '[' here: %q", string(origStr))) 241 | } 242 | if str[0] == ']' { 243 | str = str[1:] 244 | break LOOP 245 | } 246 | var from rune 247 | if str[0] == '\\' { 248 | var re Ast 249 | re, str = p.parseEscapeAux(str, true) 250 | from = ([]rune)(re.(AstLit))[0] // This must work at least now 251 | } else { 252 | from = str[0] 253 | str = str[1:] 254 | } 255 | if str[0] != '-' { 256 | rangeTable = mergeRangeTable(rangeTable, rangeTableFromTo(from, from)) 257 | continue LOOP 258 | } 259 | switch str[1] { // In the case of character range, i.e. "X-Y" 260 | case ']': 261 | rangeTable = mergeRangeTable(rangeTable, rangeTableFromTo(from, from)) 262 | rangeTable = mergeRangeTable(rangeTable, rangeTableFromTo('-', '-')) 263 | str = str[2:] 264 | break LOOP 265 | case '\\': 266 | var re Ast 267 | re, str = p.parseEscapeAux(str[1:], true) 268 | to := ([]rune)(re.(AstLit))[0] // This must work at least now 269 | rangeTable = mergeRangeTable(rangeTable, rangeTableFromTo(from, to)) 270 | break 271 | default: 272 | rangeTable = mergeRangeTable(rangeTable, rangeTableFromTo(from, str[1])) 273 | str = str[2:] 274 | } 275 | } 276 | strRep := string(origStr[0 : len(origStr)-len(str)]) 277 | if rangeTable.R16 != nil || rangeTable.R32 != nil { 278 | ccs = append(ccs, (*RangeTableClass)(rangeTable)) 279 | } 280 | var out CharClass 281 | if len(ccs) == 1 { 282 | out = ccs[0] 283 | } else { 284 | out = CompositeClass(ccs) 285 | } 286 | // Return AstLit if it contains only a single character 287 | if c, ok := out.(*RangeTableClass); ok && !isNegate { 288 | if r, ok := c.HasOnlySingleChar(); ok { 289 | return AstLit(string(r)), str 290 | } 291 | } 292 | out = toAsciiMaskClass(out) // this returns the input as-is if 
impossible to convert to asciiMaskClass 293 | if isNegate { 294 | out = NegateCharClass(out) 295 | } 296 | return AstCharClass{out, strRep}, str 297 | } 298 | -------------------------------------------------------------------------------- /parse_test.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | import "testing" 4 | 5 | func TestParseFooBar(t *testing.T) { 6 | re, err := parse("foo bar") 7 | if err != nil { 8 | t.Errorf("want %v, but got %v", nil, err) 9 | } 10 | seq, ok := re.(*AstSeq) 11 | if !ok { 12 | t.Fatalf("want *AstSeq, but got %T", re) 13 | } 14 | if seq.String() != "(?:foo bar)" { 15 | t.Errorf("want %q, but got %q", "(?:foo bar)", seq) 16 | } 17 | } 18 | 19 | func TestParseFooOrBar(t *testing.T) { 20 | re, err := parse("foo|bar") 21 | if err != nil { 22 | t.Errorf("want %v, but got %v", nil, err) 23 | } 24 | alt, ok := re.(*AstAlt) 25 | if !ok { 26 | t.Fatalf("want *AstAlt, but got %v of type %T", re, re) 27 | } 28 | seq, ok := (alt.opts[0]).(*AstSeq) 29 | if !ok { 30 | t.Fatalf("want *AstSeq, but got %v of type %T", alt.opts[0], alt.opts[0]) 31 | } 32 | if seq.String() != "(?:foo)" { 33 | t.Errorf("want %q, but got %q", "(?:foo)", seq) 34 | } 35 | seq, ok = (alt.opts[1]).(*AstSeq) 36 | if !ok { 37 | t.Fatalf("want *AstSeq, but got %v of type %T", alt.opts[1], alt.opts[1]) 38 | } 39 | if seq.String() != "(?:bar)" { 40 | t.Errorf("want %q, but got %q", "(?:bar)", seq) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /yarex.go: -------------------------------------------------------------------------------- 1 | package yarex 2 | 3 | type execer interface { 4 | exec(str string, pos int, onSuccess func(MatchContext)) bool 5 | } 6 | 7 | type Regexp struct { 8 | str string 9 | exe execer 10 | } 11 | 12 | func Compile(ptn string) (*Regexp, error) { 13 | if r, ok := compiledRegexps[ptn]; ok { 14 | return r, nil 15 | } 16 | ast, err := parse(ptn) 
17 | if err != nil { 18 | return nil, err 19 | } 20 | ast = optimizeAst(ast) 21 | op := opCompile(ast) 22 | return &Regexp{ptn, opExecer{op}}, nil 23 | } 24 | 25 | func MustCompile(ptn string) *Regexp { 26 | r, err := Compile(ptn) 27 | if err != nil { 28 | panic(err) 29 | } 30 | return r 31 | } 32 | 33 | func (re Regexp) String() string { 34 | return re.str 35 | } 36 | 37 | func (re Regexp) MatchString(s string) bool { 38 | return re.exe.exec(s, 0, func(_ MatchContext) {}) 39 | } 40 | 41 | func (re Regexp) FindString(s string) string { 42 | matched := "" 43 | re.exe.exec(s, 0, func(c MatchContext) { 44 | matched, _ = c.GetCaptured(ContextKey{'c', 0}) 45 | }) 46 | return matched 47 | } 48 | 49 | func (re Regexp) FindStringIndex(s string) (loc []int) { 50 | re.exe.exec(s, 0, func(c MatchContext) { 51 | loc = c.GetCapturedIndex(ContextKey{'c', 0}) 52 | }) 53 | return loc 54 | } 55 | --------------------------------------------------------------------------------