├── .editorconfig ├── .gitignore ├── LICENSE ├── README.md ├── bench ├── edit.go ├── java.go └── main.go ├── capture_test.go ├── charset ├── charset.go └── charset_test.go ├── cmd └── gpeg │ └── main.go ├── go.mod ├── gpeg_test.go ├── grammars ├── arith.peg ├── c.peg ├── java.peg ├── java_memo.peg ├── json.peg ├── json_memo.peg ├── lpeg.peg ├── peg.peg └── re.peg ├── incremental_test.go ├── input ├── input.go ├── input_test.go ├── linerope │ ├── .gitignore │ ├── LICENSE │ ├── line.go │ ├── rope.go │ ├── rope_test.go │ └── util.go ├── reader.go └── reader_test.go ├── isa ├── checker.go └── isa.go ├── memo ├── capture.go ├── edit.go ├── entry.go ├── interval │ ├── interval_test.go │ ├── lazy │ │ ├── LICENSE-AVL │ │ ├── array.go │ │ ├── interval.go │ │ ├── interval_test.go │ │ └── tree.go │ ├── lazylog │ │ ├── interval.go │ │ └── tree.go │ └── map.go ├── none.go ├── table.go └── tree.go ├── pattern ├── compile.go ├── nodes.go ├── optimize.go ├── pattern.go └── string.go ├── re ├── grammar.go └── re.go ├── re_test.go ├── recover_test.go ├── rxconv ├── rxconv.go └── rxconv_test.go ├── testdata ├── ScriptRuntime.java ├── bible.txt ├── test.java └── test.json └── vm ├── code.go ├── code_test.go ├── op.go ├── stack.go └── vm.go /.editorconfig: -------------------------------------------------------------------------------- 1 | # See http://editorconfig.org 2 | 3 | # In Go files we indent with tabs but still 4 | # set indent_size to control the GitHub web viewer. 5 | [*.go] 6 | indent_size=4 7 | 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.out 2 | *.test 3 | main/ 4 | todo.txt 5 | capt.txt 6 | bench/bench 7 | gpeg 8 | !cmd/gpeg 9 | *.pdf 10 | *.java 11 | /flare/flare 12 | *.svg 13 | *.dat 14 | *.so 15 | /benchmarks/apply 16 | /benchmarks/apply_gpeg 17 | /benchmarks/fullparse 18 | /benchmarks/reparse 19 | /bench/bench 20 | testdata/* 21 | !testdata/ScriptRuntime.java 22 | !testdata/bible.txt 23 | !testdata/test.java 24 | !testdata/test.json 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020: Zachary Yedidia. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPeg 2 | 3 | [![Documentation](https://godoc.org/github.com/zyedidia/gpeg?status.svg)](http://godoc.org/github.com/zyedidia/gpeg) 4 | [![Go Report Card](https://goreportcard.com/badge/github.com/zyedidia/gpeg)](https://goreportcard.com/report/github.com/zyedidia/gpeg) 5 | [![MIT License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/zyedidia/gpeg/blob/master/LICENSE) 6 | 7 | GPeg is a tool for working with parsing expression grammars (PEGs). It is 8 | built with three primary goals in mind: 9 | 10 | * Efficient parsing for two use-cases. 11 | * Language grammars with AST construction (where PEGs serve as a CFG 12 | alternative) 13 | * Patterns (where PEGs serve as a regex alternative). 14 | * Incremental parsing. 15 | * Support for dynamically loading grammars (meaning parsers can be generated 16 | and used at runtime). 17 | 18 | GPeg uses the same general parsing techniques as Lua's LPeg library and is 19 | heavily inspired by LPeg. 20 | 21 | # Features 22 | 23 | * Fast incremental parsing. 24 | * Parsing virtual machine (parsers can be dynamically generated). 25 | * Pattern compiler with optimizations. 26 | * Support for the original PEG syntax with some extensions. 27 | * Parse more complex string data structures (via ReaderAt interface). 28 | * Support for back-references (context-sensitivity). 29 | * Can convert most Go regular expressions to PEGs (see the `rxconv` package). 30 | * Basic error recovery. 31 | * Syntax highlighting library ([zyedidia/flare](https://github.com/zyedidia/flare)). 32 | * Tools for visualizing grammars, ASTs, and memo tables ([zyedidia/gpeg-extra](https://github.com/zyedidia/gpeg-extra)). 33 | 34 | # Publications 35 | 36 | * Zachary Yedidia and Stephen Chong. "Fast Incremental PEG Parsing." Proceedings of the 14th ACM SIGPLAN International Conference on Software Language Engineering (SLE), October 2021. [Link](https://zyedidia.github.io/preprints/gpeg_sle21.pdf). 37 | * Zachary Yedidia. "Incremental PEG Parsing." Bachelor's thesis. [Link](https://zyedidia.github.io/notes/yedidia_thesis.pdf). 38 | 39 | # Related work 40 | 41 | * Ford, Bryan. "Parsing expression grammars: a recognition-based syntactic foundation." Proceedings of the 31st ACM SIGPLAN-SIGACT symposium on Principles of programming languages. 2004. [Link](https://bford.info/pub/lang/peg.pdf). 42 | * [LPeg](http://www.inf.puc-rio.br/~roberto/lpeg/). 43 | * Ierusalimschy, Roberto. "A text pattern‐matching tool based on Parsing 44 | Expression Grammars." Software: Practice and Experience 39.3 (2009): 45 | 221-258. [Link](http://www.inf.puc-rio.br/~roberto/docs/peg.pdf). 46 | * Medeiros, Sérgio, and Fabio Mascarenhas. "Syntax error recovery in 47 | parsing expression grammars." Proceedings of the 33rd Annual ACM 48 | Symposium on Applied Computing. 2018. 49 | [Link](https://arxiv.org/pdf/1806.11150.pdf). 50 | * Medeiros, Sérgio, Fabio Mascarenhas, and Roberto Ierusalimschy. "Left 51 | recursion in parsing expression grammars." Science of Computer 52 | Programming 96 (2014): 177-190. 53 | [Link](https://arxiv.org/pdf/1207.0443.pdf). 54 | * [NPeg](https://github.com/zevv/npeg). 55 | * [Papa Carlo](https://lakhin.com/projects/papa-carlo/). 56 | * Dubroy, Patrick, and Alessandro Warth. "Incremental packrat parsing." 
57 | Proceedings of the 10th ACM SIGPLAN International Conference on Software 58 | Language Engineering. 2017. 59 | [Link](https://ohmlang.github.io/pubs/sle2017/incremental-packrat-parsing.pdf). 60 | * Marcelo Oikawa, Roberto Ierusalimschy, Ana Lucia de Moura. "Converting regexes to Parsing Expression Grammars." [Link](http://www.inf.puc-rio.br/~roberto/docs/ry10-01.pdf). 61 | * [Tree Sitter](https://tree-sitter.github.io/tree-sitter/). 62 | 63 | -------------------------------------------------------------------------------- /bench/edit.go: -------------------------------------------------------------------------------- 1 | package bench 2 | 3 | import ( 4 | "math/rand" 5 | 6 | "github.com/zyedidia/gpeg/input/linerope" 7 | "github.com/zyedidia/gpeg/memo" 8 | p "github.com/zyedidia/gpeg/pattern" 9 | "github.com/zyedidia/gpeg/vm" 10 | ) 11 | 12 | type Edit struct { 13 | Start, End int 14 | Text []byte 15 | } 16 | 17 | func EditToEdits(e Edit) []Edit { 18 | var edits []Edit 19 | 20 | for i := e.Start; i < e.End; i++ { 21 | edits = append(edits, Edit{ 22 | Start: e.Start, 23 | End: e.Start + 1, 24 | Text: nil, 25 | }) 26 | } 27 | 28 | for i := 0; i < len(e.Text); i++ { 29 | edits = append(edits, Edit{ 30 | Start: e.Start + i, 31 | End: e.Start + i, 32 | Text: []byte{e.Text[i]}, 33 | }) 34 | } 35 | 36 | return edits 37 | } 38 | 39 | func ToSingleEdits(edits []Edit) []Edit { 40 | single := make([]Edit, 0) 41 | 42 | for _, e := range edits { 43 | single = append(single, EditToEdits(e)...) 44 | } 45 | 46 | return single 47 | } 48 | 49 | // strategies for generating edits to a Java file: 50 | // * insert newline at start of line 51 | // * change contents of comment 52 | // * delete single-line comment 53 | // * change function name 54 | // * change function qualifier (e.g., from 'private' to 'public') 55 | // * change contents of string 56 | 57 | type EditType int 58 | 59 | const ( 60 | EditInsertNewline EditType = iota 61 | EditRemoveNewline 62 | EditWhitespace 63 | EditChangeComment 64 | EditRemoveComment 65 | EditChangeFunc 66 | EditChangeFuncQual 67 | EditChangeString 68 | ) 69 | 70 | var editTypes = []EditType{ 71 | EditInsertNewline, 72 | EditRemoveNewline, 73 | EditChangeComment, 74 | EditRemoveComment, 75 | EditChangeFunc, 76 | EditChangeFuncQual, 77 | EditChangeString, 78 | } 79 | 80 | func GenerateEdits(data []byte, nedits int) []Edit { 81 | r := linerope.New(data) 82 | edits := make([]Edit, 0, nedits) 83 | 84 | prog := p.MustCompile(grammar) 85 | java := vm.Encode(prog) 86 | tbl := memo.NewTreeTable(512) 87 | 88 | for i := 0; i < nedits; { 89 | _, _, ast, _ := java.Exec(r, tbl) 90 | 91 | var e Edit 92 | typ := editTypes[rand.Intn(len(editTypes))] 93 | 94 | switch typ { 95 | case EditInsertNewline: 96 | line := rand.Intn(r.NumLines()) 97 | off := r.OffsetAt(line, 0) 98 | e = Edit{ 99 | Start: off, 100 | End: off, 101 | Text: []byte{'\n'}, 102 | } 103 | case EditRemoveNewline: 104 | candidates := make([]*memo.Capture, 0) 105 | it := ast.ChildIterator(0) 106 | for ch := it(); ch != nil; ch = it() { 107 | if ch.Id() == capNewline { 108 | candidates = append(candidates, ch) 109 | } 110 | } 111 | if len(candidates) == 0 { 112 | continue 113 | } 114 | ch := candidates[rand.Intn(len(candidates))] 115 | e = Edit{ 116 | Start: ch.Start(), 117 | End: ch.Start() + ch.Len(), 118 | Text: nil, 119 | } 120 | case EditRemoveComment: 121 | candidates := make([]*memo.Capture, 0) 122 | it := ast.ChildIterator(0) 123 | for ch := it(); ch != nil; ch = it() { 124 | if ch.Id() == capLineComment { 125 | 
candidates = append(candidates, ch) 126 | } 127 | } 128 | if len(candidates) == 0 { 129 | continue 130 | } 131 | ch := candidates[rand.Intn(len(candidates))] 132 | line, _ := r.LineColAt(ch.Start()) 133 | e = Edit{ 134 | Start: ch.Start(), 135 | End: r.OffsetAt(line+1, 0), 136 | Text: nil, 137 | } 138 | case EditChangeFunc: 139 | candidates := make([]*memo.Capture, 0) 140 | it := ast.ChildIterator(0) 141 | for ch := it(); ch != nil; ch = it() { 142 | if ch.Id() == capFuncName { 143 | candidates = append(candidates, ch) 144 | } 145 | } 146 | if len(candidates) == 0 { 147 | continue 148 | } 149 | ch := candidates[rand.Intn(len(candidates))] 150 | e = Edit{ 151 | Start: ch.Start(), 152 | End: ch.Start() + ch.Len(), 153 | Text: randID(rand.Intn(5) + 4), 154 | } 155 | case EditChangeFuncQual: 156 | candidates := make([]*memo.Capture, 0) 157 | it := ast.ChildIterator(0) 158 | for ch := it(); ch != nil; ch = it() { 159 | if ch.Id() == capFuncQual { 160 | candidates = append(candidates, ch) 161 | } 162 | } 163 | if len(candidates) == 0 { 164 | continue 165 | } 166 | ch := candidates[rand.Intn(len(candidates))] 167 | modifiers := []string{ 168 | "protected", 169 | "public", 170 | "private", 171 | } 172 | e = Edit{ 173 | Start: ch.Start(), 174 | End: ch.Start() + ch.Len(), 175 | Text: []byte(modifiers[rand.Intn(len(modifiers))]), 176 | } 177 | default: 178 | continue 179 | } 180 | 181 | r.Remove(e.Start, e.End) 182 | r.Insert(e.Start, e.Text) 183 | tbl.ApplyEdit(memo.Edit{ 184 | Start: e.Start, 185 | End: e.End, 186 | Len: len(e.Text), 187 | }) 188 | 189 | edits = append(edits, e) 190 | i++ 191 | } 192 | 193 | // r.WriteTo(os.Stdout) 194 | 195 | return edits 196 | } 197 | 198 | var rbytes = []byte("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") 199 | 200 | func randID(n int) []byte { 201 | id := make([]byte, n) 202 | for i := range id { 203 | id[i] = rbytes[rand.Intn(len(rbytes))] 204 | } 205 | return id 206 | } 207 | -------------------------------------------------------------------------------- /bench/java.go: -------------------------------------------------------------------------------- 1 | package bench 2 | 3 | import ( 4 | "github.com/zyedidia/gpeg/charset" 5 | "github.com/zyedidia/gpeg/isa" 6 | p "github.com/zyedidia/gpeg/pattern" 7 | ) 8 | 9 | var ( 10 | alpha = p.Set(charset.Range('A', 'Z').Add(charset.Range('a', 'z'))) 11 | alnum = p.Set(charset.Range('A', 'Z').Add(charset.Range('a', 'z')).Add(charset.Range('0', '9'))) 12 | 13 | word = p.Concat( 14 | p.Or(alpha, p.Literal("_")), 15 | p.Star(p.Or(alnum, p.Literal("_"))), 16 | ) 17 | ) 18 | 19 | func BlockPatt(start, end string, escape p.Pattern) p.Pattern { 20 | if escape != nil { 21 | return p.Concat( 22 | p.Literal(start), 23 | p.Star( 24 | p.Or( 25 | escape, 26 | p.Concat( 27 | p.Not(p.Literal(end)), 28 | p.Any(1), 29 | ), 30 | ), 31 | ), 32 | p.Literal(end), 33 | ) 34 | } 35 | 36 | return p.Concat( 37 | p.Literal(start), 38 | p.Star(p.Concat( 39 | p.Not(p.Literal(end)), 40 | p.Any(1), 41 | )), 42 | p.Literal(end), 43 | ) 44 | } 45 | 46 | func WordMatch(words ...string) p.Pattern { 47 | m := make(map[string]struct{}) 48 | 49 | for _, w := range words { 50 | m[w] = struct{}{} 51 | } 52 | 53 | return p.Check(word, isa.MapChecker(m)) 54 | } 55 | 56 | // mini java grammar for picking out important pieces 57 | 58 | var grammar = p.Grammar("S", map[string]p.Pattern{ 59 | "S": p.Star(p.Memo(p.Or( 60 | p.NonTerm("Token"), 61 | p.Concat( 62 | p.Any(1), 63 | p.Star(p.Concat( 64 | p.Not(p.NonTerm("Token")), 65 | p.Any(1), 66 | )), 67 | ), 
68 | ))), 69 | "Token": p.Or( 70 | p.NonTerm("Comment"), 71 | p.NonTerm("FuncQual"), 72 | p.NonTerm("FuncName"), 73 | p.NonTerm("String"), 74 | p.NonTerm("Newline"), 75 | ), 76 | "Comment": p.Or(p.NonTerm("LineComment"), p.NonTerm("LongComment")), 77 | "LineComment": p.Cap(BlockPatt("//", "\n", nil), capLineComment), 78 | "LongComment": BlockPatt("/*", "*/", nil), 79 | 80 | "FuncQual": p.Cap(WordMatch("public", "protected", "private"), capFuncQual), 81 | 82 | "FuncName": p.Concat( 83 | p.Cap(p.NonTerm("Identifier"), capFuncName), 84 | p.Literal("("), 85 | ), 86 | "Identifier": word, 87 | 88 | "String": p.Cap( 89 | BlockPatt("\"", "\"", p.NonTerm("Escape")), 90 | capString, 91 | ), 92 | "Escape": p.Concat( 93 | p.Literal("\\"), 94 | p.Set(charset.New([]byte{'\'', '"', 't', 'n', 'b', 'f', 'r', '\\'})), 95 | ), 96 | 97 | "Newline": p.Cap(p.Literal("\n"), capNewline), 98 | }) 99 | 100 | const ( 101 | capLineComment = iota 102 | capFuncName 103 | capFuncQual 104 | capString 105 | capNewline 106 | ) 107 | -------------------------------------------------------------------------------- /bench/main.go: -------------------------------------------------------------------------------- 1 | // +build ignore 2 | 3 | package main 4 | 5 | import ( 6 | "flag" 7 | "fmt" 8 | "io/ioutil" 9 | "log" 10 | "strconv" 11 | 12 | "github.com/zyedidia/gpeg/bench" 13 | ) 14 | 15 | func main() { 16 | flag.Parse() 17 | 18 | data, err := ioutil.ReadFile(flag.Args()[0]) 19 | if err != nil { 20 | log.Fatal(err) 21 | } 22 | 23 | edits := bench.GenerateEdits(data, 100) 24 | 25 | for _, e := range edits { 26 | fmt.Printf("(%d, %d): %s\n", e.Start, e.End, strconv.Quote(string(e.Text))) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /capture_test.go: -------------------------------------------------------------------------------- 1 | package gpeg 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | "github.com/zyedidia/gpeg/charset" 8 | "github.com/zyedidia/gpeg/memo" 9 | . "github.com/zyedidia/gpeg/pattern" 10 | "github.com/zyedidia/gpeg/vm" 11 | ) 12 | 13 | func TestCaptures(t *testing.T) { 14 | const ( 15 | digit = iota 16 | num 17 | ) 18 | 19 | p := Star(Memo(Concat( 20 | Cap(Plus( 21 | Cap(Set(charset.Range('0', '9')), digit), 22 | ), num), 23 | Optional(Literal(" ")), 24 | ))) 25 | code := vm.Encode(MustCompile(p)) 26 | r := strings.NewReader("12 34 56 78 9") 27 | _, _, ast, _ := code.Exec(r, memo.NoneTable{}) 28 | 29 | expect := [][2]int{ 30 | {0, 2}, 31 | {3, 2}, 32 | {6, 2}, 33 | {9, 2}, 34 | {12, 1}, 35 | } 36 | 37 | it := ast.ChildIterator(0) 38 | i := 0 39 | for ch := it(); ch != nil; ch = it() { 40 | if expect[i][0] != ch.Start() || expect[i][1] != ch.Len() { 41 | t.Fatal(ch.Start(), ch.Len()) 42 | } 43 | i++ 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /charset/charset.go: -------------------------------------------------------------------------------- 1 | // Package charset provides data types and functions for managing sets of 2 | // characters. 3 | package charset 4 | 5 | import ( 6 | "math/bits" 7 | "strconv" 8 | ) 9 | 10 | const log2WordSize = 6 11 | const wordSize = 64 12 | 13 | // A Set represents a set of chars. 14 | type Set struct { 15 | // Bits is the bit array for indicating which chars are in the set. 16 | // We have 256 bits because a char can have 256 different values. 17 | Bits [4]uint64 18 | } 19 | 20 | // A SmallSet is the same as a Set but can only represent 128 possible chars. 
21 | // This is an optimization, since in the common case, only ASCII bytes are 22 | // used, which are <128. The full Set is only necessary when bytes outside 23 | // the ASCII range (>=128) must be matched. 24 | type SmallSet struct { 25 | Bits [2]uint64 26 | } 27 | 28 | // Size returns the number of chars matched by this SmallSet. 29 | func (c SmallSet) Size() int { 30 | return bits.OnesCount64(c.Bits[0]) + bits.OnesCount64(c.Bits[1]) 31 | } 32 | 33 | // Has checks if a charset accepts a character. 34 | // Pointer receiver is for performance. 35 | func (c *SmallSet) Has(r byte) bool { 36 | return c.Bits[r>>log2WordSize]&(uint64(1)<<(r&(wordSize-1))) != 0 37 | } 38 | 39 | // IsSmall returns true if this set can be converted to a small set. In other 40 | // words, if this set only matches bytes <128. 41 | func (c Set) IsSmall() bool { 42 | return c.Bits[2] == 0 && c.Bits[3] == 0 43 | } 44 | 45 | // SmallSet converts this Set to a SmallSet. 46 | func (c Set) SmallSet() SmallSet { 47 | return SmallSet{ 48 | Bits: [2]uint64{c.Bits[0], c.Bits[1]}, 49 | } 50 | } 51 | 52 | // New returns a charset which accepts all chars in 'chars'. Any byte 53 | // value (0-255) may appear in 'chars'. 54 | func New(chars []byte) Set { 55 | var set Set 56 | for _, r := range chars { 57 | switch { 58 | case r < 64: 59 | bit := uint64(1) << r 60 | set.Bits[0] |= bit 61 | case r < 128: 62 | bit := uint64(1) << (r - 64) 63 | set.Bits[1] |= bit 64 | case r < 192: 65 | bit := uint64(1) << (r - 128) 66 | set.Bits[2] |= bit 67 | default: 68 | bit := uint64(1) << (r - 192) 69 | set.Bits[3] |= bit 70 | } 71 | } 72 | 73 | return set 74 | } 75 | 76 | // Range returns a charset matching all characters between `low` and 77 | // `high` inclusive. 78 | func Range(low, high byte) Set { 79 | var set Set 80 | for c := int(low); c <= int(high); c++ { 81 | switch { 82 | case c < 64: 83 | bit := uint64(1) << c 84 | set.Bits[0] |= bit 85 | case c < 128: 86 | bit := uint64(1) << (c - 64) 87 | set.Bits[1] |= bit 88 | case c < 192: 89 | bit := uint64(1) << (c - 128) 90 | set.Bits[2] |= bit 91 | default: 92 | bit := uint64(1) << (c - 192) 93 | set.Bits[3] |= bit 94 | } 95 | } 96 | 97 | return set 98 | } 99 | 100 | // Complement returns a charset that matches all characters except for those 101 | // matched by `c`. 102 | func (c Set) Complement() Set { 103 | return Set{ 104 | Bits: [4]uint64{^c.Bits[0], ^c.Bits[1], ^c.Bits[2], ^c.Bits[3]}, 105 | } 106 | } 107 | 108 | // Add returns the union of the characters matched by the two charsets. 109 | func (c Set) Add(c1 Set) Set { 110 | return Set{ 111 | Bits: [4]uint64{c1.Bits[0] | c.Bits[0], c1.Bits[1] | c.Bits[1], c1.Bits[2] | c.Bits[2], c1.Bits[3] | c.Bits[3]}, 112 | } 113 | } 114 | 115 | // Sub removes from 'c' any characters in 'c1'. 116 | func (c Set) Sub(c1 Set) Set { 117 | return Set{ 118 | Bits: [4]uint64{^c1.Bits[0] & c.Bits[0], ^c1.Bits[1] & c.Bits[1], ^c1.Bits[2] & c.Bits[2], ^c1.Bits[3] & c.Bits[3]}, 119 | } 120 | } 121 | 122 | // Size returns the number of chars matched by this Set. 123 | func (c Set) Size() int { 124 | return bits.OnesCount64(c.Bits[0]) + bits.OnesCount64(c.Bits[1]) + bits.OnesCount64(c.Bits[2]) + bits.OnesCount64(c.Bits[3]) 125 | } 126 | 127 | // Has checks if a charset accepts a character. 128 | // Pointer receiver is for performance. 129 | func (c *Set) Has(r byte) bool { 130 | return c.Bits[r>>log2WordSize]&(uint64(1)<<(r&(wordSize-1))) != 0 131 | } 132 | 133 | // String returns the string representation of the charset.
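// Runs of consecutive characters are collapsed into ranges, so a set containing the digits and '_' prints as {'0'..'9','_'}.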
134 | func (c Set) String() string { 135 | s := "" 136 | inRange := false 137 | for b := int(0); b <= 255; b++ { 138 | if c.Has(byte(b)) && b == 255 { 139 | s += strconv.QuoteRuneToASCII(rune(b)) 140 | } else if c.Has(byte(b)) && !inRange { 141 | inRange = true 142 | if c.Has(byte(b + 1)) { 143 | s += strconv.QuoteRuneToASCII(rune(b)) + ".." 144 | } 145 | } else if !c.Has(byte(b)) && inRange { 146 | inRange = false 147 | s += strconv.QuoteRuneToASCII(rune(b-1)) + "," 148 | } 149 | } 150 | if s != "" && s[len(s)-1] == ',' { 151 | s = s[:len(s)-1] 152 | } 153 | s = "{" + s + "}" 154 | return s 155 | } 156 | -------------------------------------------------------------------------------- /charset/charset_test.go: -------------------------------------------------------------------------------- 1 | package charset_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/zyedidia/gpeg/charset" 7 | ) 8 | 9 | func inSet(set charset.Set, in, notin []byte, t *testing.T) { 10 | for _, r := range in { 11 | if !set.Has(r) { 12 | t.Errorf("Error: %c returned 'not in set'", r) 13 | } 14 | } 15 | 16 | for _, r := range notin { 17 | if set.Has(r) { 18 | t.Errorf("Error: %c returned 'in set'", r) 19 | } 20 | } 21 | } 22 | 23 | func TestSet(t *testing.T) { 24 | in := []byte{'a', 'b', 'c', 'd', '{', '}'} 25 | notin := []byte{'x', 'y', 'z', '[', ']'} 26 | 27 | set := charset.New(in) 28 | 29 | inSet(set, in, notin, t) 30 | } 31 | 32 | func TestRangeUnion(t *testing.T) { 33 | set := charset.Range('a', 'z').Add(charset.Range('A', 'Z')) 34 | 35 | in := []byte{'a', 'b', 'c', 'd', 'z', 'y', 'A', 'Z', 'B'} 36 | notin := []byte{'0', '1', '2', 0} 37 | 38 | inSet(set, in, notin, t) 39 | } 40 | 41 | func TestComplement(t *testing.T) { 42 | in := []byte{'a', 'b', 'c', 'd', '{', '}'} 43 | notin := []byte{'x', 'y', 'z', '[', ']'} 44 | 45 | set := charset.New(in).Complement() 46 | 47 | inSet(set, notin, in, t) 48 | } 49 | 50 | func TestBigSet(t *testing.T) { 51 | in := []byte{200, 201, 203} 52 | notin := []byte{0, 1, 2} 53 | 54 | set := charset.Range(128, '\xff') 55 | 56 | inSet(set, in, notin, t) 57 | } 58 | -------------------------------------------------------------------------------- /cmd/gpeg/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "io" 7 | "log" 8 | "os" 9 | "regexp/syntax" 10 | 11 | "github.com/zyedidia/gpeg/pattern" 12 | "github.com/zyedidia/gpeg/re" 13 | "github.com/zyedidia/gpeg/rxconv" 14 | ) 15 | 16 | var regex = flag.Bool("regex", false, "compile regex instead of PEG") 17 | 18 | func main() { 19 | flag.Parse() 20 | 21 | args := flag.Args() 22 | 23 | var in io.Reader 24 | if len(args) <= 0 { 25 | in = os.Stdin 26 | } else { 27 | f, err := os.Open(args[0]) 28 | if err != nil { 29 | log.Fatal(err) 30 | } 31 | defer f.Close() 32 | in = f 33 | } 34 | 35 | bytes, err := io.ReadAll(in) 36 | if err != nil { 37 | log.Fatal(err) 38 | } 39 | var patt pattern.Pattern 40 | 41 | if *regex { 42 | patt, err = rxconv.FromRegexp(string(bytes), syntax.Perl) 43 | } else { 44 | patt, err = re.Compile(string(bytes)) 45 | } 46 | if err != nil { 47 | log.Fatal(err) 48 | } 49 | prog, err := pattern.Compile(patt) 50 | if err != nil { 51 | log.Fatal(err) 52 | } 53 | fmt.Println(prog) 54 | } 55 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/zyedidia/gpeg 2 | 3 | go 1.16 4 | 
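A minimal usage sketch, not a file in this repository, tying the pieces above together. It relies only on APIs that appear elsewhere in this archive: re.Compile (cmd/gpeg/main.go), pattern.MustCompile, vm.Encode and Exec (the tests below), and memo.NewTreeTable, memo.Edit and the linerope package (incremental_test.go). The grammar text is grammars/arith.peg inlined; the expected offset 10 matches TestArithmeticGrammar, and 11 after the edit is simply the length of the edited input.

package main

import (
	"fmt"
	"log"

	"github.com/zyedidia/gpeg/input/linerope"
	"github.com/zyedidia/gpeg/memo"
	"github.com/zyedidia/gpeg/pattern"
	"github.com/zyedidia/gpeg/re"
	"github.com/zyedidia/gpeg/vm"
)

func main() {
	// The arithmetic grammar from grammars/arith.peg, inlined.
	patt, err := re.Compile(`
Expr <- Factor ([+\-] Factor)*
Factor <- Term ([*/] Term)*
Term <- Number / '(' Expr ')'
Number <- [0-9]+
`)
	if err != nil {
		log.Fatal(err)
	}
	// Compile the pattern and encode it for the parsing virtual machine.
	code := vm.Encode(pattern.MustCompile(patt))

	// Parse once, memoizing results in a tree table.
	// 512 is the same value the repo's tests pass to NewTreeTable.
	doc := linerope.New([]byte("13+(22-15)"))
	tbl := memo.NewTreeTable(512)
	match, off, _, _ := code.Exec(doc, tbl)
	fmt.Println(match, off) // true 10

	// Replace "22" with "100", report the edit to the memo table, and reparse.
	// On large inputs with memoized rules (e.g. grammars/java_memo.peg), only
	// the damaged region is re-examined.
	doc.Remove(4, 6)
	doc.Insert(4, []byte("100"))
	tbl.ApplyEdit(memo.Edit{Start: 4, End: 6, Len: 3})
	match, off, _, _ = code.Exec(doc, tbl)
	fmt.Println(match, off) // true 11
}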
-------------------------------------------------------------------------------- /gpeg_test.go: -------------------------------------------------------------------------------- 1 | package gpeg 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io/ioutil" 7 | "os" 8 | "strconv" 9 | "strings" 10 | "testing" 11 | 12 | "github.com/zyedidia/gpeg/charset" 13 | "github.com/zyedidia/gpeg/input" 14 | "github.com/zyedidia/gpeg/isa" 15 | "github.com/zyedidia/gpeg/memo" 16 | . "github.com/zyedidia/gpeg/pattern" 17 | "github.com/zyedidia/gpeg/vm" 18 | ) 19 | 20 | type PatternTest struct { 21 | in string 22 | match int 23 | } 24 | 25 | func check(p Pattern, tests []PatternTest, t *testing.T) { 26 | code := vm.Encode(MustCompile(p)) 27 | for _, tt := range tests { 28 | name := tt.in[:min(10, len(tt.in))] 29 | t.Run(name, func(t *testing.T) { 30 | match, off, _, _ := code.Exec(strings.NewReader(tt.in), memo.NoneTable{}) 31 | if tt.match == -1 && match || tt.match != -1 && !match || tt.match != -1 && tt.match != off { 32 | t.Errorf("%s: got: (%t, %d), but expected (%d)\n", tt.in, match, off, tt.match) 33 | } 34 | }) 35 | } 36 | } 37 | 38 | func TestConcat(t *testing.T) { 39 | p := Concat( 40 | Literal("ana"), 41 | Literal("hi"), 42 | ) 43 | 44 | tests := []PatternTest{ 45 | {"ana", -1}, 46 | {"hi", -1}, 47 | {"anahi", 5}, 48 | {"anah", -1}, 49 | } 50 | 51 | check(p, tests, t) 52 | } 53 | 54 | type uint8Checker struct{} 55 | 56 | // only allows integers between 0 and 256 57 | func (uint8Checker) Check(b []byte, src *input.Input, id, flag int) int { 58 | i, err := strconv.Atoi(string(b)) 59 | if err != nil { 60 | return -1 61 | } 62 | if i >= 0 && i < 256 { 63 | return 0 64 | } 65 | return -1 66 | } 67 | 68 | func TestChecker(t *testing.T) { 69 | p := Check(Plus(Set(charset.Range('0', '9'))), uint8Checker{}) 70 | 71 | tests := []PatternTest{ 72 | {"123", 3}, 73 | {"256", -1}, 74 | {"foo", -1}, 75 | {"0", 1}, 76 | } 77 | 78 | check(p, tests, t) 79 | } 80 | 81 | func TestOr(t *testing.T) { 82 | p := Or(Literal("ana"), Literal("hi")) 83 | 84 | tests := []PatternTest{ 85 | {"ana", 3}, 86 | {"hi", 2}, 87 | {"an", -1}, 88 | {"anahi", 3}, 89 | } 90 | 91 | check(p, tests, t) 92 | } 93 | 94 | func TestRepeat(t *testing.T) { 95 | p := Star(Literal("ana")) 96 | tests := []PatternTest{ 97 | {"", 0}, 98 | {"ana", 3}, 99 | {"anaanaana", 9}, 100 | {"hiana", 0}, 101 | {"anaanaan", 6}, 102 | {"an", 0}, 103 | } 104 | check(p, tests, t) 105 | 106 | p = Plus(Literal("hi")) 107 | tests = []PatternTest{ 108 | {"", -1}, 109 | {"hi", 2}, 110 | {"hihihi", 6}, 111 | {"hihiana", 4}, 112 | {"h", -1}, 113 | } 114 | check(p, tests, t) 115 | 116 | p = Concat(Plus(Set(charset.New([]byte{'0', '1'}))), Star(Set(charset.New([]byte{'a', 'b', 'c'})))) 117 | tests = []PatternTest{ 118 | {"01", 2}, 119 | {"01abaabbc", 9}, 120 | {"abc", -1}, 121 | {"5a", -1}, 122 | {"1z", 1}, 123 | } 124 | check(p, tests, t) 125 | } 126 | 127 | func TestPredicate(t *testing.T) { 128 | p := Not(Literal("ana")) 129 | tests := []PatternTest{ 130 | {"ana", -1}, 131 | {"hi", 0}, 132 | {"an", 0}, 133 | } 134 | check(p, tests, t) 135 | 136 | p1 := Not(Not(Literal("ana"))) 137 | p2 := And(Literal("ana")) 138 | tests = []PatternTest{ 139 | {"ana", 0}, 140 | {"hi", -1}, 141 | {"an", -1}, 142 | } 143 | check(p1, tests, t) 144 | check(p2, tests, t) 145 | } 146 | 147 | func TestAny(t *testing.T) { 148 | p := Concat(Any(5), Literal("ana")) 149 | tests := []PatternTest{ 150 | {"helloana", 8}, 151 | {"hiana", -1}, 152 | {"anaanana", 8}, 153 | } 154 | check(p, tests, t) 155 | } 156 | 
157 | func TestOptional(t *testing.T) { 158 | p := Concat(Literal("ana"), Optional(Literal("hello"))) 159 | tests := []PatternTest{ 160 | {"ana", 3}, 161 | {"anahe", 3}, 162 | {"hello", -1}, 163 | {"anahello", 8}, 164 | } 165 | check(p, tests, t) 166 | } 167 | 168 | func TestSet(t *testing.T) { 169 | p := Plus(Set(charset.Range('0', '9'))) 170 | tests := []PatternTest{ 171 | {"hi", -1}, 172 | {"1002", 4}, 173 | {"10.02", 2}, 174 | {"9", 1}, 175 | } 176 | check(p, tests, t) 177 | } 178 | 179 | func TestGrammar(t *testing.T) { 180 | // grammar: 181 | // S <- B / (![()] .)+ 182 | // B <- '(' S ')' 183 | S := Or(NonTerm("B"), Plus(Concat(Not(Set(charset.New([]byte{'(', ')'}))), Any(1)))) 184 | B := Concat(Concat(Literal("("), NonTerm("S")), Literal(")")) 185 | 186 | p := Grammar("S", map[string]Pattern{ 187 | "S": S, 188 | "B": B, 189 | }) 190 | tests := []PatternTest{ 191 | {"(hello)", 7}, 192 | {"(hello", -1}, 193 | {"((inside))", 10}, 194 | {"((inside)", -1}, 195 | } 196 | check(p, tests, t) 197 | } 198 | 199 | func TestTailCall(t *testing.T) { 200 | p := Grammar("X", map[string]Pattern{ 201 | "X": Or(Literal("ana"), Concat(Any(1), NonTerm("X"))), 202 | }) 203 | tests := []PatternTest{ 204 | {"asdf", -1}, 205 | {"ana hello", 3}, 206 | {"hello ana", 9}, 207 | {"anaana", 3}, 208 | } 209 | check(p, tests, t) 210 | } 211 | 212 | func TestUnionSet(t *testing.T) { 213 | p := Plus(Or(Set(charset.Range('a', 'z')), Set(charset.Range('A', 'Z')))) 214 | tests := []PatternTest{ 215 | {"Hello", 5}, 216 | {"123", -1}, 217 | {"Hello1", 5}, 218 | } 219 | check(p, tests, t) 220 | } 221 | 222 | func TestSearch(t *testing.T) { 223 | p := Search( 224 | Concat( 225 | Literal("ana"), 226 | ), 227 | ) 228 | tests := []PatternTest{ 229 | {"hello ana hello", 9}, 230 | {"hello", -1}, 231 | {"hello ana ana ana", 9}, 232 | } 233 | check(p, tests, t) 234 | 235 | // search for last occurrence 236 | p = Plus(Search(Literal("ana"))) 237 | tests = []PatternTest{ 238 | {"hello ana hello", 9}, 239 | {"hello", -1}, 240 | {"hello ana ana ana hello", 17}, 241 | } 242 | check(p, tests, t) 243 | } 244 | 245 | func TestArithmeticGrammar(t *testing.T) { 246 | // grammar: 247 | // Expr <- Factor ([+-] Factor)* 248 | // Factor <- Term ([*/] Term)* 249 | // Term <- Number / '(' Expr ')' 250 | // Number <- [0-9]+ 251 | p := Grammar("Expr", map[string]Pattern{ 252 | "Expr": Concat(NonTerm("Factor"), Star(Concat(Set(charset.New([]byte{'+', '-'})), NonTerm("Factor")))), 253 | "Factor": Concat(NonTerm("Term"), Star(Concat(Set(charset.New([]byte{'*', '/'})), NonTerm("Term")))), 254 | "Term": Or(NonTerm("Number"), Concat(Concat(Literal("("), NonTerm("Expr")), Literal(")"))), 255 | "Number": Plus(Set(charset.Range('0', '9'))), 256 | }) 257 | tests := []PatternTest{ 258 | {"13+(22-15)", 10}, 259 | {"24*5+3", 6}, 260 | {"word 5*3", -1}, 261 | {"10*(43", 2}, 262 | } 263 | check(p, tests, t) 264 | } 265 | 266 | func TestBackReference(t *testing.T) { 267 | word := Plus(Literal("/")) 268 | br := isa.NewBackRef() 269 | p := Concat( 270 | CheckFlags(word, br, 0, int(isa.RefDef)), 271 | Star(Concat( 272 | Not(CheckFlags(&EmptyNode{}, br, 0, int(isa.RefUse))), 273 | Any(1), 274 | )), 275 | CheckFlags(&EmptyNode{}, br, 0, int(isa.RefUse)), 276 | ) 277 | tests := []PatternTest{ 278 | {"/// hello world ///", 19}, 279 | {"// hello world //", 17}, 280 | {"/// hello world //", -1}, 281 | } 282 | check(p, tests, t) 283 | } 284 | 285 | // ************** 286 | // * Benchmarks * 287 | // ************** 288 | // These require `bible.txt` in the testdata directory.
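// The match result is stored in a package-level variable so the compiler cannot optimize the benchmarked Exec calls away.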
289 | 290 | var match bool 291 | var bible *bytes.Reader 292 | 293 | func TestMain(m *testing.M) { 294 | data, err := ioutil.ReadFile("testdata/bible.txt") 295 | if err != nil { 296 | fmt.Println("Warning:", err) 297 | } 298 | bible = bytes.NewReader(data) 299 | os.Exit(m.Run()) 300 | } 301 | 302 | func BenchmarkBibleSearchFirstEartt(b *testing.B) { 303 | code := vm.Encode(MustCompile(Search(Literal("eartt")))) 304 | 305 | b.ResetTimer() 306 | for i := 0; i < b.N; i++ { 307 | match, _, _, _ = code.Exec(bible, memo.NoneTable{}) 308 | } 309 | } 310 | 311 | func BenchmarkBibleSearchFirstAbram(b *testing.B) { 312 | abram := Concat(Plus(Set(charset.Range('a', 'z').Add(charset.Range('A', 'Z')))), Literal(" Abram")) 313 | code := vm.Encode(MustCompile(Search(abram))) 314 | 315 | b.ResetTimer() 316 | for i := 0; i < b.N; i++ { 317 | match, _, _, _ = code.Exec(bible, memo.NoneTable{}) 318 | } 319 | } 320 | 321 | func BenchmarkBibleSearchLastAbram(b *testing.B) { 322 | abram := Concat(Plus(Set(charset.Range('a', 'z').Add(charset.Range('A', 'Z')))), Literal(" Abram")) 323 | code := vm.Encode(MustCompile(Star(Search(abram)))) 324 | 325 | b.ResetTimer() 326 | for i := 0; i < b.N; i++ { 327 | match, _, _, _ = code.Exec(bible, memo.NoneTable{}) 328 | } 329 | } 330 | 331 | func BenchmarkBibleSearchLastTubalcain(b *testing.B) { 332 | code := vm.Encode(MustCompile(Star(Search(Literal("Tubalcain"))))) 333 | 334 | b.ResetTimer() 335 | for i := 0; i < b.N; i++ { 336 | match, _, _, _ = code.Exec(bible, memo.NoneTable{}) 337 | } 338 | } 339 | 340 | func BenchmarkBibleOmegaPattern(b *testing.B) { 341 | omega := Concat(Star(Concat(Not(Literal("Omega")), Any(1))), Literal("Omega")) 342 | code := vm.Encode(MustCompile(omega)) 343 | 344 | b.ResetTimer() 345 | for i := 0; i < b.N; i++ { 346 | match, _, _, _ = code.Exec(bible, memo.NoneTable{}) 347 | } 348 | } 349 | 350 | func BenchmarkBibleOmegaGrammar(b *testing.B) { 351 | omega := Grammar("S", map[string]Pattern{ 352 | "S": Concat(Star(Concat(Not(NonTerm("P")), Any(1))), NonTerm("P")), 353 | "P": Literal("Omega"), 354 | }) 355 | code := vm.Encode(MustCompile(omega)) 356 | 357 | b.ResetTimer() 358 | for i := 0; i < b.N; i++ { 359 | match, _, _, _ = code.Exec(bible, memo.NoneTable{}) 360 | } 361 | } 362 | 363 | func min(a, b int) int { 364 | if a < b { 365 | return a 366 | } 367 | return b 368 | } 369 | -------------------------------------------------------------------------------- /grammars/arith.peg: -------------------------------------------------------------------------------- 1 | Expr <- Factor ([+\-] Factor)* 2 | Factor <- Term ([*/] Term)* 3 | Term <- Number / '(' Expr ')' 4 | Number <- [0-9]+ 5 | -------------------------------------------------------------------------------- /grammars/json.peg: -------------------------------------------------------------------------------- 1 | doc <- JSON !. 2 | JSON <- S_ (Number / Object / Array / String / True / False / Null) S_ 3 | Object <- '{' (String ':' JSON (',' String ':' JSON)* / S_) '}' 4 | Array <- '[' (JSON (',' JSON)* / S_) ']' 5 | StringBody <- Escape? ((!["\\\00-\37] .)+ Escape*)* 6 | String <- S_ '"' StringBody '"' S_ 7 | Escape <- '\\' (["{|\\bfnrt] / UnicodeEscape) 8 | UnicodeEscape <- 'u' [0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f] 9 | Number <- Minus? IntPart FractPart? ExpPart? 10 | Minus <- '-' 11 | IntPart <- '0' / [1-9][0-9]* 12 | FractPart <- '.' [0-9]+ 13 | ExpPart <- [eE] [+\-]? 
[0-9]+ 14 | True <- 'true' 15 | False <- 'false' 16 | Null <- 'null' 17 | S_ <- [\11-\15\40]* 18 | -------------------------------------------------------------------------------- /grammars/json_memo.peg: -------------------------------------------------------------------------------- 1 | doc <- JSON !. 2 | JSON <- S_ (Number / Object / Array / String / True / False / Null) S_ 3 | Object <- '{' (String ':' JSON (',' String ':' JSON)* / S_) '}' 4 | Array <- '[' (JSON ({{',' JSON}})* / S_) ']' 5 | StringBody <- Escape? ((!["\\\00-\37] .)+ Escape*)* 6 | String <- S_ '"' StringBody '"' S_ 7 | Escape <- '\\' (["{|\\bfnrt] / UnicodeEscape) 8 | UnicodeEscape <- 'u' [0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f] 9 | Number <- Minus? IntPart FractPart? ExpPart? 10 | Minus <- '-' 11 | IntPart <- '0' / [1-9][0-9]* 12 | FractPart <- '.' [0-9]+ 13 | ExpPart <- [eE] [+\-]? [0-9]+ 14 | True <- 'true' 15 | False <- 'false' 16 | Null <- 'null' 17 | S_ <- [\11-\15\40]* 18 | -------------------------------------------------------------------------------- /grammars/lpeg.peg: -------------------------------------------------------------------------------- 1 | pattern <- exp !. 2 | exp <- S (alternative / grammar) 3 | 4 | alternative <- seq ('/' S seq)* 5 | seq <- prefix* 6 | prefix <- '&' S prefix / '!' S prefix / suffix 7 | suffix <- primary S (([+*?] 8 | / '^' [+\-]? num 9 | / '->' S (string / '{}' / name) 10 | / '=>' S name) S)* 11 | 12 | primary <- '(' exp ')' / string / class / defined 13 | / '{:' (name ':')? exp ':}' 14 | / '=' name 15 | / '{*' exp '*}' # bare capture 16 | / '{~' exp '~}' # substitution capture 17 | / '{|' exp '|}' # table capture 18 | / '{+' exp '+}' # memoization expression 19 | / '{' exp '}' # string capture 20 | / '.' 21 | / name S !arrow 22 | 23 | grammar <- definition+ 24 | definition <- name S arrow exp 25 | 26 | class <- '[' '^'? item (!']' item)* ']' 27 | item <- defined / range / . 28 | range <- . '-' (!']' .) 29 | 30 | S <- (space / comment)* # spaces and comments 31 | name <- [A-Za-z][A-Za-z0-9_]* 32 | arrow <- '<-' 33 | num <- [0-9]+ 34 | string <- '"' (!'"' .)* '"' / "'" (!"'" .)* "'" 35 | defined <- '%' name 36 | 37 | comment <- '#' (!eol .)* eol 38 | space <- ' ' / '\t' / eol 39 | eol <- '\r\n' / '\n' / '\r' 40 | -------------------------------------------------------------------------------- /grammars/peg.peg: -------------------------------------------------------------------------------- 1 | # Hierarchical syntax 2 | Grammar <- Spacing_ Definition+ EndOfFile_ 3 | Definition <- Identifier LEFTARROW_ Expression 4 | 5 | Expression <- Sequence (SLASH_ Sequence)* 6 | Sequence <- Prefix* 7 | Prefix <- (AND / NOT)? Suffix 8 | Suffix <- Primary (QUESTION / STAR / PLUS)? 9 | Primary <- Identifier !LEFTARROW_ 10 | / OPEN_ Expression CLOSE_ 11 | / Literal / Class / DOT 12 | 13 | # Lexical syntax 14 | Identifier <- IdentStart IdentCont* Spacing_ 15 | IdentStart <- [a-zA-Z_] 16 | IdentCont <- IdentStart / [0-9] 17 | 18 | Literal <- ['] (!['] Char)* ['] Spacing_ 19 | / ["] (!["] Char)* ["] Spacing_ 20 | Class <- '[' (!']' Range)* ']' Spacing_ 21 | Range <- Char '-' Char / Char 22 | Char <- '\\' [nrt'"\[\]\\] 23 | / '\\' [0-2][0-7][0-7] 24 | / '\\' [0-7][0-7]? 25 | / !'\\' . 26 | 27 | LEFTARROW_ <- '<-' Spacing_ 28 | SLASH_ <- '/' Spacing_ 29 | AND <- '&' Spacing_ 30 | NOT <- '!' Spacing_ 31 | QUESTION <- '?' Spacing_ 32 | STAR <- '*' Spacing_ 33 | PLUS <- '+' Spacing_ 34 | OPEN_ <- '(' Spacing_ 35 | CLOSE_ <- ')' Spacing_ 36 | DOT <- '.' 
Spacing_ 37 | 38 | Spacing_ <- (Space_ / Comment_)* 39 | Comment_ <- '#' (!EndOfLine_ .)* EndOfLine_ 40 | Space_ <- ' ' / '\t' / EndOfLine_ 41 | EndOfLine_ <- '\r\n' / '\n' / '\r' 42 | EndOfFile_ <- !. 43 | -------------------------------------------------------------------------------- /grammars/re.peg: -------------------------------------------------------------------------------- 1 | Pattern <- Spacing_ (Expression / Grammar) EndOfFile_ 2 | Grammar <- Definition+ 3 | Definition <- Identifier '<-' Expression 4 | 5 | Expression <- Sequence ('/' Sequence)* 6 | Sequence <- Prefix* 7 | Prefix <- (AND / NOT)? Suffix 8 | Suffix <- Primary (QUESTION / STAR / PLUS)? 9 | Primary <- Identifier !'<-' 10 | / '(' Expression ')' 11 | / Literal / Class 12 | / '{' Expression '}' 13 | / '{+' Expression '+}' 14 | / DOT 15 | 16 | # Lexical syntax 17 | Identifier <- IdentStart IdentCont* Spacing_ 18 | IdentStart <- [a-zA-Z_] 19 | IdentCont <- IdentStart / [0-9] 20 | 21 | Literal <- ['] (!['] Char)* ['] Spacing_ 22 | / ["] (!["] Char)* ["] Spacing_ 23 | Class <- '[' CARAT? (!']' Range)* ']' Spacing_ 24 | Range <- Char '-' Char / Char 25 | Char <- '\\' [nrt'"\[\]\\] 26 | / '\\' [0-2][0-7][0-7] 27 | / '\\' [0-7][0-7]? 28 | / !'\\' . 29 | 30 | AND <- '&' Spacing_ 31 | NOT <- '!' Spacing_ 32 | QUESTION <- '?' Spacing_ 33 | STAR <- '*' Spacing_ 34 | PLUS <- '+' Spacing_ 35 | DOT <- '.' Spacing_ 36 | CARAT <- '^' Spacing_ 37 | 38 | Spacing_ <- (Space_ / Comment_)* 39 | Comment_ <- '#' (!EndOfLine_ .)* EndOfLine_ 40 | Space_ <- ' ' / '\t' / EndOfLine_ 41 | EndOfLine_ <- '\r\n' / '\n' / '\r' 42 | EndOfFile_ <- !. 43 | -------------------------------------------------------------------------------- /incremental_test.go: -------------------------------------------------------------------------------- 1 | package gpeg 2 | 3 | import ( 4 | "io/ioutil" 5 | "math/rand" 6 | "testing" 7 | 8 | "github.com/zyedidia/gpeg/bench" 9 | "github.com/zyedidia/gpeg/input/linerope" 10 | "github.com/zyedidia/gpeg/memo" 11 | "github.com/zyedidia/gpeg/pattern" 12 | "github.com/zyedidia/gpeg/re" 13 | "github.com/zyedidia/gpeg/vm" 14 | ) 15 | 16 | // Open a 250k java file and apply some edits and verify that after each edit 17 | // the incremental result is the same as doing a full parse. 
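// Each generated edit is expanded into single-byte insertions and deletions (ToSingleEdits), so the memo table's ApplyEdit path is exercised once per changed byte.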
18 | func TestIncrementalJava(t *testing.T) { 19 | rand.Seed(42) 20 | 21 | peg, err := ioutil.ReadFile("grammars/java_memo.peg") 22 | if err != nil { 23 | t.Error(err) 24 | } 25 | p := re.MustCompile(string(peg)) 26 | 27 | java, err := ioutil.ReadFile("testdata/ScriptRuntime.java") 28 | if err != nil { 29 | t.Error(err) 30 | } 31 | 32 | edits := bench.GenerateEdits(java, 100) 33 | edits = bench.ToSingleEdits(edits) 34 | 35 | tbl := memo.NewTreeTable(512) 36 | prog := pattern.MustCompile(p) 37 | code := vm.Encode(prog) 38 | 39 | r := linerope.New(java) 40 | 41 | for _, e := range edits { 42 | start := e.Start 43 | end := e.End 44 | 45 | r.Remove(start, end) 46 | r.Insert(start, []byte(e.Text)) 47 | 48 | // st := time.Now() 49 | tbl.ApplyEdit(memo.Edit{ 50 | Start: start, 51 | End: end, 52 | Len: len(e.Text), 53 | }) 54 | 55 | code.Exec(r, tbl) 56 | // fmt.Println("reparse", time.Since(st), match, off) 57 | // st = time.Now() 58 | // nmatch, noff, _, _ := code.Exec(r, memo.NoneTable{}) 59 | // fmt.Println("full parse", time.Since(st)) 60 | 61 | // if match != nmatch || off != noff { 62 | // t.Fatal(i, match, nmatch, off, noff) 63 | // } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /input/input.go: -------------------------------------------------------------------------------- 1 | // Package input defines data types and functions for managing input data. 2 | package input 3 | 4 | import ( 5 | "io" 6 | ) 7 | 8 | const bufsz = 4096 9 | 10 | // Input represents the input data and is an efficient wrapper of io.ReaderAt 11 | // which provides a nicer API, avoids repeated interface function calls, and 12 | // uses a cache for buffered reading. 13 | // An Input also tracks the index of the furthest byte that has been read. 14 | type Input struct { 15 | r io.ReaderAt 16 | 17 | // cached data. 18 | chunk [bufsz]byte 19 | b [1]byte 20 | // size of the cache. 21 | nchunk int 22 | 23 | // the position within the reader that the chunk starts at. 24 | base int 25 | // the offset within the chunk we are reading at. 26 | coff int 27 | // the furthest position we have read. 28 | furthest int 29 | } 30 | 31 | // NewInput creates a new Input wrapper for the io.ReaderAt. 32 | func NewInput(r io.ReaderAt) *Input { 33 | i := &Input{ 34 | r: r, 35 | } 36 | i.refill(i.base) 37 | return i 38 | } 39 | 40 | func (i *Input) refill(pos int) { 41 | i.base = pos 42 | i.coff = 0 43 | i.nchunk, _ = i.r.ReadAt(i.chunk[:], int64(i.base)) 44 | } 45 | 46 | // Peek returns the next byte in the stream or 'false' if there are no more 47 | // bytes. Successive calls to Peek will return the same value unless there is a 48 | // call to SeekTo or Advance in between. 49 | func (i *Input) Peek() (byte, bool) { 50 | pos := i.base + i.coff 51 | if pos > i.furthest { 52 | i.furthest = pos 53 | } 54 | 55 | return i.chunk[i.coff], i.nchunk != 0 56 | } 57 | 58 | func (i *Input) PeekBefore() (byte, bool) { 59 | if i.base+i.coff-1 < 0 { 60 | return 0, false 61 | } 62 | if i.coff >= 1 { 63 | return i.chunk[i.coff-1], i.nchunk != 0 64 | } 65 | n, _ := i.r.ReadAt(i.b[:], int64(i.base+i.coff-1)) 66 | return i.b[0], n == 1 67 | } 68 | 69 | // SeekTo moves the current read position to the desired read position. Returns 70 | // true if the seek went to a valid location within the reader, and false 71 | // otherwise. In other words, if seek returns true the next call to Peek will 72 | // return a valid byte. 
73 | func (i *Input) SeekTo(pos int) bool { 74 | // check if the seek position in within the current chunk and if so just 75 | // update the internal offset. 76 | chunkEnd := i.base + i.nchunk 77 | if pos < chunkEnd && pos >= i.base { 78 | i.coff = pos - i.base 79 | return true 80 | } 81 | 82 | // refill the cache (moves the base) 83 | i.refill(pos) 84 | return i.nchunk != 0 85 | } 86 | 87 | // Advance moves the offset forward by 'n' bytes. Returns true if the advance 88 | // was successful (n chars were successfully skipped) and false otherwise. Note 89 | // that even if Advance returns true the next call to Peek may return false if 90 | // the advance went to the exact end of the data. 91 | func (i *Input) Advance(n int) bool { 92 | if i.nchunk == 0 { 93 | return false 94 | } 95 | 96 | i.coff += n 97 | if i.coff > i.nchunk { 98 | i.refill(i.base + i.coff) 99 | return false 100 | } else if i.coff == i.nchunk { 101 | i.refill(i.base + i.coff) 102 | } 103 | return true 104 | } 105 | 106 | func (i *Input) ReadAt(b []byte, pos int64) (n int, err error) { 107 | return i.r.ReadAt(b, pos) 108 | } 109 | 110 | // Slice returns a slice of the reader corresponding to the range [low:high). 111 | func (i *Input) Slice(low, high int) []byte { 112 | return Slice(i.r, low, high) 113 | } 114 | 115 | // Pos returns the current read position. 116 | func (i *Input) Pos() int { 117 | return i.base + i.coff 118 | } 119 | 120 | // Furthest returns the furthest read position. 121 | func (i *Input) Furthest() int { 122 | return i.furthest 123 | } 124 | 125 | // ResetFurthest resets the furthest read tracker to zero. 126 | func (i *Input) ResetFurthest() { 127 | i.furthest = 0 128 | } 129 | -------------------------------------------------------------------------------- /input/input_test.go: -------------------------------------------------------------------------------- 1 | package input_test 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/zyedidia/gpeg/input" 8 | ) 9 | 10 | func TestInput(t *testing.T) { 11 | b := bytes.NewReader([]byte("foo bar baz")) 12 | i := input.NewInput(b) 13 | 14 | if b, _ := i.Peek(); b != 'f' { 15 | t.Error("incorrect peek, got", string(b)) 16 | } 17 | i.Advance(1) 18 | if b, _ := i.Peek(); b != 'o' { 19 | t.Error("incorrect peek, got", string(b)) 20 | } 21 | i.Advance(1) 22 | if b, _ := i.Peek(); b != 'o' { 23 | t.Error("incorrect peek, got", string(b)) 24 | } 25 | 26 | slice := i.Slice(4, 7) 27 | if string(slice) != "bar" { 28 | t.Error("incorrect slice, got", string(slice)) 29 | } 30 | 31 | success := i.Advance(9) 32 | if !success { 33 | t.Error("incorrect: couldn't advance by 9") 34 | } 35 | 36 | if b, ok := i.Peek(); ok { 37 | t.Errorf("peek past end of buffer should return false, got %c", b) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /input/linerope/.gitignore: -------------------------------------------------------------------------------- 1 | /main 2 | -------------------------------------------------------------------------------- /input/linerope/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021: Zachary Yedidia. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /input/linerope/line.go: -------------------------------------------------------------------------------- 1 | package linerope 2 | 3 | import ( 4 | "bytes" 5 | ) 6 | 7 | type loc struct { 8 | line int 9 | col int 10 | } 11 | 12 | var lzero = loc{0, 0} 13 | 14 | func llen(b, sep []byte) loc { 15 | lines := bytes.Count(b, sep) 16 | 17 | if lines != 0 { 18 | last := bytes.LastIndex(b, sep) + len(sep) 19 | return loc{ 20 | line: lines, 21 | col: len(b) - last, 22 | } 23 | } 24 | return loc{ 25 | line: 0, 26 | col: len(b), 27 | } 28 | } 29 | 30 | func addlocs(a, b loc) loc { 31 | if a.line != 0 && b.line != 0 { 32 | return loc{ 33 | line: a.line + b.line, 34 | col: b.col, 35 | } 36 | } else if a.line != 0 { 37 | return loc{ 38 | line: a.line, 39 | col: b.col + a.col, 40 | } 41 | } else if b.line != 0 { 42 | return loc{ 43 | line: b.line, 44 | col: b.col, 45 | } 46 | } 47 | return loc{ 48 | line: 0, 49 | col: a.col + b.col, 50 | } 51 | } 52 | 53 | func sublocs(a, b loc) loc { 54 | if a.line == b.line { 55 | return loc{ 56 | line: a.line - b.line, 57 | col: a.col - b.col, 58 | } 59 | } 60 | 61 | return loc{ 62 | line: a.line - b.line, 63 | col: a.col, 64 | } 65 | } 66 | 67 | func (l loc) cmp(other loc) int { 68 | if l.line == other.line { 69 | if l.col < other.col { 70 | return -1 71 | } else if l.col > other.col { 72 | return 1 73 | } 74 | return 0 75 | } else if l.line < other.line { 76 | return -1 77 | } 78 | return 1 79 | } 80 | 81 | func minloc(a, b loc) loc { 82 | if a.cmp(b) < 0 { 83 | return a 84 | } 85 | return b 86 | } 87 | 88 | func maxloc(a, b loc) loc { 89 | if a.cmp(b) > 0 { 90 | return a 91 | } 92 | return b 93 | } 94 | 95 | func sliceloc(b, sep []byte, start, end loc) []byte { 96 | soff := indexN(b, sep, start.line) + len(sep) + start.col 97 | eoff := indexN(b, sep, end.line) + len(sep) + end.col 98 | return b[soff:eoff] 99 | } 100 | -------------------------------------------------------------------------------- /input/linerope/rope.go: -------------------------------------------------------------------------------- 1 | package linerope 2 | 3 | import ( 4 | "io" 5 | "runtime" 6 | "sync" 7 | ) 8 | 9 | var DefaultOptions = Options{ 10 | SplitLen: 4096, 11 | JoinLen: 2048, 12 | RebalanceRatio: 1.2, 13 | LineSep: []byte{'\n'}, 14 | } 15 | 16 | type Options struct { 17 | // SplitLen is the threshold above which 
slices will be split into separate 18 | // nodes. 19 | SplitLen int 20 | // JoinLen is the threshold below which nodes will be merged into slices. 21 | JoinLen int 22 | // RebalanceRatio is the threshold used to trigger a rebuild during a 23 | // rebalance operation. 24 | RebalanceRatio float64 25 | // LineSep is the newline byte sequence (usually '\n' or '\r\n'). 26 | LineSep []byte 27 | } 28 | 29 | type nodeType byte 30 | 31 | const ( 32 | tLeaf nodeType = iota 33 | tNode 34 | ) 35 | 36 | // A Node in the rope structure. If the kind is tLeaf, only the value and 37 | // length are valid, and if the kind is tNode, only length, left, right are 38 | // valid. 39 | type Node struct { 40 | kind nodeType 41 | value []byte 42 | length int 43 | llength loc 44 | left, right *Node 45 | opts Options 46 | } 47 | 48 | // New returns a new rope node from the given byte slice. The underlying 49 | // data is not copied so the user should ensure that it is okay to insert and 50 | // delete from the input slice. 51 | func New(b []byte) *Node { 52 | return NewWithOpts(b, DefaultOptions) 53 | } 54 | 55 | // NewWithOpts constructs a rope with the given options. 56 | func NewWithOpts(b []byte, opts Options) *Node { 57 | // We build the tree from the bottom up for extra efficiency. This avoids 58 | // counting duplicate newlines a logarithmic number of times (for each 59 | // level of the tree). 60 | // 61 | // We make the chunk size equal to SplitLength which means a node will be 62 | // split when the first edit is made. Since most nodes will never be 63 | // edited, it makes sense to fill them all up to avoid wasting space, even 64 | // if it means inserting will require a split the first time a node is 65 | // edited. 66 | chunksz := opts.SplitLen 67 | nchunks := len(b) / chunksz 68 | nodes := make([]*Node, nchunks, nchunks+1) 69 | 70 | // For even better performance, we load the chunks in parallel. Chunk 71 | // loading is distributed among the cores available on the machine. 72 | var nthreads = runtime.NumCPU() 73 | var wg sync.WaitGroup 74 | wg.Add(nthreads) 75 | for t := 0; t < nthreads; t++ { 76 | go func(t int) { 77 | start := t * (nchunks / nthreads) 78 | end := t*(nchunks/nthreads) + (nchunks / nthreads) 79 | if t == nthreads-1 { 80 | end = nchunks 81 | } 82 | for i := start; i < end; i++ { 83 | j := i * chunksz 84 | // triple index slice notation allows a sort of copy-on-write behavior 85 | // which is extremely beneficial to us because it's likely that this 86 | // slice is backed by a memory-mapped file. 
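// Capping the capacity at j+chunksz means a later append on this leaf allocates a new array instead of writing into the shared backing slice b.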
87 | slc := b[j : j+chunksz : j+chunksz] 88 | nodes[i] = &Node{ 89 | kind: tLeaf, 90 | value: slc, 91 | length: len(slc), 92 | llength: llen(slc, opts.LineSep), 93 | opts: opts, 94 | } 95 | } 96 | wg.Done() 97 | }(t) 98 | } 99 | wg.Wait() 100 | // load any extra bytes 101 | slc := b[nchunks*chunksz : len(b) : len(b)] 102 | nodes = append(nodes, &Node{ 103 | kind: tLeaf, 104 | value: slc, 105 | length: len(slc), 106 | llength: llen(slc, opts.LineSep), 107 | opts: opts, 108 | }) 109 | return buildTree(nodes) 110 | } 111 | 112 | // recursively creates parent nodes 113 | func buildTree(nodes []*Node) *Node { 114 | if len(nodes) == 1 { 115 | return nodes[0] 116 | } 117 | if len(nodes)%2 != 0 { 118 | l := len(nodes) 119 | nodes[l-2] = join(nodes[l-2], nodes[l-1]) 120 | nodes = nodes[:l-1] 121 | } 122 | 123 | newnodes := make([]*Node, 0, len(nodes)/2+1) 124 | for i := 0; i < len(nodes); i += 2 { 125 | newnodes = append(newnodes, join(nodes[i], nodes[i+1])) 126 | } 127 | return buildTree(newnodes) 128 | } 129 | 130 | // Len returns the number of elements stored in the rope. 131 | func (n *Node) Len() int { 132 | return n.length 133 | } 134 | 135 | // LLen returns the line/col location one byte beyond the last position in the 136 | // file. 137 | func (n *Node) LLen() (lines, cols int) { 138 | return n.llength.line, n.llength.col 139 | } 140 | 141 | func (n *Node) NumLines() int { 142 | return n.llength.line 143 | } 144 | 145 | func (n *Node) adjust() { 146 | switch n.kind { 147 | case tLeaf: 148 | if n.length > n.opts.SplitLen { 149 | divide := n.length / 2 150 | n.left = NewWithOpts(n.value[:divide], n.opts) 151 | n.right = NewWithOpts(n.value[divide:], n.opts) 152 | n.value = nil 153 | n.kind = tNode 154 | n.length = n.left.length + n.right.length 155 | n.llength = addlocs(n.left.llength, n.right.llength) 156 | } 157 | default: // case tNode 158 | if n.length < n.opts.JoinLen { 159 | n.value = n.Value() 160 | n.left = nil 161 | n.right = nil 162 | n.kind = tLeaf 163 | n.length = len(n.value) 164 | n.llength = llen(n.value, n.opts.LineSep) 165 | } 166 | } 167 | } 168 | 169 | // Value returns the elements of this node concatenated into a slice. May 170 | // return the underlying slice without copying, so do not modify the returned 171 | // slice. 172 | func (n *Node) Value() []byte { 173 | switch n.kind { 174 | case tLeaf: 175 | return n.value 176 | default: // case tNode 177 | return concat(n.left.Value(), n.right.Value()) 178 | } 179 | } 180 | 181 | // Remove deletes the range [start:end) (exclusive bound) from the rope. 182 | func (n *Node) Remove(start, end int) { 183 | switch n.kind { 184 | case tLeaf: 185 | // slice tricks delete 186 | n.value = remove(n.value, start, end) 187 | n.length = len(n.value) 188 | n.llength = llen(n.value, n.opts.LineSep) 189 | default: // case tNode 190 | leftLength := n.left.length 191 | leftStart := min(start, leftLength) 192 | leftEnd := min(end, leftLength) 193 | rightLength := n.right.length 194 | rightStart := max(0, min(start-leftLength, rightLength)) 195 | rightEnd := max(0, min(end-leftLength, rightLength)) 196 | if leftStart < leftLength { 197 | n.left.Remove(leftStart, leftEnd) 198 | } 199 | if rightEnd > 0 { 200 | n.right.Remove(rightStart, rightEnd) 201 | } 202 | n.length = n.left.length + n.right.length 203 | n.llength = addlocs(n.left.llength, n.right.llength) 204 | } 205 | n.adjust() 206 | } 207 | 208 | // Insert inserts the given value at pos. 
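// If the insertion grows a leaf beyond SplitLen, the adjust call at the end of the method splits it.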
209 | func (n *Node) Insert(pos int, value []byte) { 210 | switch n.kind { 211 | case tLeaf: 212 | // slice tricks insert 213 | n.value = insert(n.value, pos, value) 214 | n.length = len(n.value) 215 | n.llength = llen(n.value, n.opts.LineSep) 216 | default: // case tNode 217 | leftLength := n.left.length 218 | if pos < leftLength { 219 | n.left.Insert(pos, value) 220 | } else { 221 | n.right.Insert(pos-leftLength, value) 222 | } 223 | n.length = n.left.length + n.right.length 224 | n.llength = addlocs(n.left.llength, n.right.llength) 225 | } 226 | n.adjust() 227 | } 228 | 229 | // slice returns the range of the rope from [start:end). 230 | func (n *Node) slice(start, end int) []byte { 231 | if start >= end { 232 | return []byte{} 233 | } 234 | 235 | switch n.kind { 236 | case tLeaf: 237 | return n.value[start:end] 238 | default: // case tNode 239 | leftLength := n.left.length 240 | leftStart := min(start, leftLength) 241 | leftEnd := min(end, leftLength) 242 | rightLength := n.right.length 243 | rightStart := max(0, min(start-leftLength, rightLength)) 244 | rightEnd := max(0, min(end-leftLength, rightLength)) 245 | 246 | if leftStart != leftEnd { 247 | if rightStart != rightEnd { 248 | return concat(n.left.slice(leftStart, leftEnd), n.right.slice(rightStart, rightEnd)) 249 | } else { 250 | return n.left.slice(leftStart, leftEnd) 251 | } 252 | } else { 253 | if rightStart != rightEnd { 254 | return n.right.slice(rightStart, rightEnd) 255 | } else { 256 | return []byte{} 257 | } 258 | } 259 | } 260 | } 261 | 262 | // OffsetAt returns the absolute character offset of a line/col position. 263 | func (n *Node) OffsetAt(line, col int) int { 264 | pos := loc{line, col} 265 | switch n.kind { 266 | case tLeaf: 267 | return indexN(n.value, n.opts.LineSep, line) + len(n.opts.LineSep) + col 268 | default: // case tNode 269 | leftLength := n.left.llength 270 | if pos.cmp(leftLength) < 0 { 271 | return n.left.OffsetAt(line, col) 272 | } else { 273 | l := sublocs(pos, leftLength) 274 | return n.left.length + n.right.OffsetAt(l.line, l.col) 275 | } 276 | } 277 | } 278 | 279 | // LineColAt returns the line/col position of an absolute character offset. 280 | func (n *Node) LineColAt(pos int) (line, col int) { 281 | l := n.lineColAt(pos) 282 | return l.line, l.col 283 | } 284 | 285 | func (n *Node) lineColAt(pos int) loc { 286 | switch n.kind { 287 | case tLeaf: 288 | return lineCol(n.value, n.opts.LineSep, pos) 289 | default: // case tNode 290 | leftLength := n.left.length 291 | if pos < leftLength { 292 | return n.left.lineColAt(pos) 293 | } else { 294 | return addlocs(n.left.llength, n.right.lineColAt(pos-leftLength)) 295 | } 296 | } 297 | } 298 | 299 | // SliceLC is the same as Slice but uses line/col positions for start and end. 
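// For example, with a "\n" line separator, n.SliceLC(0, 0, 1, 0) returns the
// first line of the rope including its trailing newline (assuming the rope
// contains at least one newline).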
300 | func (n *Node) SliceLC(startl, startc, endl, endc int) []byte { 301 | return n.sliceLC(loc{startl, startc}, loc{endl, endc}) 302 | } 303 | 304 | func (n *Node) sliceLC(start, end loc) []byte { 305 | if start.cmp(end) >= 0 { 306 | return []byte{} 307 | } 308 | 309 | switch n.kind { 310 | case tLeaf: 311 | return sliceloc(n.value, n.opts.LineSep, start, end) 312 | default: // case tNode 313 | leftLength := n.left.llength 314 | leftStart := minloc(start, leftLength) 315 | leftEnd := minloc(end, leftLength) 316 | rightLength := n.right.llength 317 | rightStart := maxloc(lzero, minloc(sublocs(start, leftLength), rightLength)) 318 | rightEnd := maxloc(lzero, minloc(sublocs(end, leftLength), rightLength)) 319 | 320 | if leftStart != leftEnd { 321 | if rightStart != rightEnd { 322 | return concat(n.left.sliceLC(leftStart, leftEnd), n.right.sliceLC(rightStart, rightEnd)) 323 | } else { 324 | return n.left.sliceLC(leftStart, leftEnd) 325 | } 326 | } else { 327 | if rightStart != rightEnd { 328 | return n.right.sliceLC(rightStart, rightEnd) 329 | } else { 330 | return []byte{} 331 | } 332 | } 333 | } 334 | } 335 | 336 | // At returns the element at the given position. 337 | func (n *Node) At(pos int) byte { 338 | s := n.slice(pos, pos+1) 339 | return s[0] 340 | } 341 | 342 | // SplitAt splits the node at the given index and returns two new ropes 343 | // corresponding to the left and right portions of the split. 344 | func (n *Node) SplitAt(i int) (*Node, *Node) { 345 | switch n.kind { 346 | case tLeaf: 347 | return NewWithOpts(n.value[:i], n.opts), NewWithOpts(n.value[i:], n.opts) 348 | default: // case tNode 349 | m := n.left.length 350 | if i == m { 351 | return n.left, n.right 352 | } else if i < m { 353 | l, r := n.left.SplitAt(i) 354 | return l, join(r, n.right) 355 | } 356 | l, r := n.right.SplitAt(i - m) 357 | return join(n.left, l), r 358 | } 359 | } 360 | 361 | func join(l, r *Node) *Node { 362 | n := &Node{ 363 | left: l, 364 | right: r, 365 | length: l.length + r.length, 366 | llength: addlocs(l.llength, r.llength), 367 | kind: tNode, 368 | opts: l.opts, 369 | } 370 | n.adjust() 371 | return n 372 | } 373 | 374 | // Join merges all the given ropes together into one rope. 375 | func Join(a, b *Node, more ...*Node) *Node { 376 | s := join(a, b) 377 | for _, n := range more { 378 | s = join(s, n) 379 | } 380 | return s 381 | } 382 | 383 | // Rebuild rebuilds the entire rope structure, resulting in a balanced tree. 384 | func (n *Node) Rebuild() { 385 | switch n.kind { 386 | case tNode: 387 | n.value = concat(n.left.Value(), n.right.Value()) 388 | n.left = nil 389 | n.right = nil 390 | n.adjust() 391 | } 392 | } 393 | 394 | // Rebalance finds unbalanced nodes and rebuilds them. 395 | func (n *Node) Rebalance() { 396 | switch n.kind { 397 | case tNode: 398 | lratio := float64(n.left.length) / float64(n.right.length) 399 | rratio := float64(n.right.length) / float64(n.left.length) 400 | if lratio > n.opts.RebalanceRatio || rratio > n.opts.RebalanceRatio { 401 | n.Rebuild() 402 | } else { 403 | n.left.Rebalance() 404 | n.right.Rebalance() 405 | } 406 | } 407 | } 408 | 409 | // Each applies the given function to every node in the rope. 410 | func (n *Node) Each(fn func(n *Node)) { 411 | fn(n) 412 | if n.kind == tNode { 413 | n.left.Each(fn) 414 | n.right.Each(fn) 415 | } 416 | } 417 | 418 | // EachLeaf applies the given function to every leaf node in order. 
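// The callback may stop the traversal early by returning true; EachLeaf
// reports whether the traversal was stopped (WriteTo relies on this to halt
// on a write error).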
419 | func (n *Node) EachLeaf(fn func(n *Node) bool) bool { 420 | switch n.kind { 421 | case tLeaf: 422 | return fn(n) 423 | default: // case tNode 424 | if n.left.EachLeaf(fn) { 425 | return true 426 | } 427 | return n.right.EachLeaf(fn) 428 | } 429 | } 430 | 431 | // ReadAt implements the io.ReaderAt interface. 432 | func (n *Node) ReadAt(p []byte, off int64) (nread int, err error) { 433 | if off > int64(n.length) { 434 | return 0, io.EOF 435 | } 436 | 437 | end := off + int64(len(p)) 438 | if end >= int64(n.length) { 439 | end = int64(n.length) 440 | err = io.EOF 441 | } 442 | b := n.slice(int(off), int(end)) 443 | nread = copy(p, b) 444 | return nread, err 445 | } 446 | 447 | // WriteTo implements the io.WriterTo interface. 448 | func (n *Node) WriteTo(w io.Writer) (int64, error) { 449 | var err error 450 | var ntotal int64 451 | n.EachLeaf(func(it *Node) bool { 452 | var nwritten int 453 | nwritten, err = w.Write(it.Value()) 454 | ntotal += int64(nwritten) 455 | return err != nil 456 | }) 457 | return ntotal, err 458 | } 459 | 460 | func min(a, b int) int { 461 | if a < b { 462 | return a 463 | } 464 | return b 465 | } 466 | 467 | func max(a, b int) int { 468 | if a > b { 469 | return a 470 | } 471 | return b 472 | } 473 | 474 | // from slice tricks 475 | func insert(s []byte, k int, vs []byte) []byte { 476 | if n := len(s) + len(vs); n <= cap(s) { 477 | s2 := s[:n] 478 | copy(s2[k+len(vs):], s[k:]) 479 | copy(s2[k:], vs) 480 | return s2 481 | } 482 | s2 := make([]byte, len(s)+len(vs)) 483 | copy(s2, s[:k]) 484 | copy(s2[k:], vs) 485 | copy(s2[k+len(vs):], s[k:]) 486 | return s2 487 | } 488 | 489 | func concat(a, b []byte) []byte { 490 | c := make([]byte, 0, len(a)+len(b)) 491 | c = append(c, a...) 492 | c = append(c, b...) 493 | return c 494 | } 495 | 496 | func remove(s []byte, start, end int) []byte { 497 | if len(s) == cap(s) { 498 | // "copy-on-write" for slices where len == cap. 499 | ns := make([]byte, len(s)-(end-start), cap(s)) 500 | copy(ns, s[:start]) 501 | copy(ns[start:], s[end:]) 502 | return ns 503 | } 504 | return append(s[:start], s[end:]...) 
505 | } 506 | -------------------------------------------------------------------------------- /input/linerope/rope_test.go: -------------------------------------------------------------------------------- 1 | package linerope_test 2 | 3 | import ( 4 | "bytes" 5 | "math/rand" 6 | "testing" 7 | 8 | "github.com/zyedidia/gpeg/input/linerope" 9 | ) 10 | 11 | func check(r *linerope.Node, b *basicText, t *testing.T) { 12 | if !bytes.Equal(r.Value(), b.value()) { 13 | t.Errorf("incorrect bytes: %s %s", string(r.Value()), string(b.value())) 14 | } 15 | if r.Len() != b.length() { 16 | t.Errorf("incorrect length: %d %d", r.Len(), b.length()) 17 | } 18 | if r.NumLines() != b.NumLines() { 19 | t.Errorf("incorrect line count: %d %d", r.NumLines(), b.NumLines()) 20 | } 21 | 22 | const ncheck = 100 23 | for i := 0; i < ncheck; i++ { 24 | pos := rand.Intn(r.Len()) 25 | rline, rcol := r.LineColAt(pos) 26 | bline, bcol := b.lineColAt(pos) 27 | if rline != bline || rcol != bcol { 28 | t.Errorf("incorrect offset conversion: %d, want (%d, %d), got (%d, %d)", pos, bline, bcol, rline, rcol) 29 | } 30 | 31 | off := r.OffsetAt(rline, rcol) 32 | if off != pos { 33 | t.Errorf("incorrect line/col conversion: (%d, %d), want %d, got %d", rline, rcol, pos, off) 34 | } 35 | } 36 | } 37 | 38 | const datasz = 5000 39 | 40 | func data() (*linerope.Node, *basicText) { 41 | data := randbytes(datasz) 42 | r := linerope.New(data) 43 | b := newBasicText(data) 44 | return r, b 45 | } 46 | 47 | func randrange(high int) (int, int) { 48 | i1 := rand.Intn(high) 49 | i2 := rand.Intn(high) 50 | return min(i1, i2), max(i1, i2) 51 | } 52 | 53 | var letters = []byte("\nabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") 54 | 55 | func randbytes(n int) []byte { 56 | b := make([]byte, n) 57 | for i := range b { 58 | b[i] = letters[rand.Intn(len(letters))] 59 | } 60 | return b 61 | } 62 | 63 | func TestConstruction(t *testing.T) { 64 | r, b := data() 65 | check(r, b, t) 66 | } 67 | 68 | func TestInsertRemove(t *testing.T) { 69 | r, b := data() 70 | 71 | const nedit = 100 72 | const strlen = 20 73 | for i := 0; i < nedit; i++ { 74 | low, high := randrange(r.Len()) 75 | r.Remove(low, high) 76 | b.remove(low, high) 77 | check(r, b, t) 78 | bstr := randbytes(strlen) 79 | r.Insert(low, bstr) 80 | b.insert(low, bstr) 81 | check(r, b, t) 82 | } 83 | check(r, b, t) 84 | } 85 | 86 | func TestReadAt(t *testing.T) { 87 | r, b := data() 88 | 89 | const nslice = 100 90 | length := r.Len() 91 | for i := 0; i < nslice; i++ { 92 | low, high := randrange(length) 93 | 94 | rb := make([]byte, high-low) 95 | r.ReadAt(rb, int64(low)) 96 | bb := b.slice(low, high) 97 | if !bytes.Equal(rb, bb) { 98 | t.Errorf("slice not equal: %s %s", string(rb), string(bb)) 99 | } 100 | } 101 | } 102 | 103 | func TestSplit(t *testing.T) { 104 | r, b := data() 105 | 106 | const nsplit = 10 107 | for i := 0; i < nsplit; i++ { 108 | splitidx := rand.Intn(r.Len()) 109 | left, right := r.SplitAt(splitidx) 110 | 111 | lb := b.slice(0, splitidx) 112 | rb := b.slice(splitidx, b.length()) 113 | if !bytes.Equal(left.Value(), lb) { 114 | t.Errorf("%d: left slice not equal: %s %s", splitidx, string(left.Value()), string(lb)) 115 | } 116 | if !bytes.Equal(right.Value(), rb) { 117 | t.Errorf("%d: right slice not equal: %s %s", splitidx, string(right.Value()), string(rb)) 118 | } 119 | r = linerope.Join(left, right) 120 | check(r, b, t) 121 | } 122 | } 123 | 124 | type basicText struct { 125 | data []byte 126 | } 127 | 128 | func newBasicText(b []byte) *basicText { 129 | data := make([]byte, 
len(b)) 130 | copy(data, b) 131 | return &basicText{ 132 | data: data, 133 | } 134 | } 135 | 136 | func (b *basicText) length() int { 137 | return len(b.data) 138 | } 139 | 140 | func (b *basicText) value() []byte { 141 | return b.data 142 | } 143 | 144 | func (b *basicText) remove(start, end int) { 145 | b.data = append(b.data[:start], b.data[end:]...) 146 | } 147 | 148 | func (b *basicText) insert(pos int, val []byte) { 149 | b.data = insert(b.data, pos, val) 150 | } 151 | 152 | func (b *basicText) slice(start, end int) []byte { 153 | return b.data[start:end] 154 | } 155 | 156 | func (b *basicText) lineColAt(pos int) (line, col int) { 157 | var last int 158 | for i, c := range b.data { 159 | if c == '\n' { 160 | if i >= pos { 161 | return line, pos - last 162 | } 163 | last = i + 1 164 | line++ 165 | } 166 | } 167 | return line, pos - last 168 | } 169 | 170 | func (b *basicText) NumLines() int { 171 | return bytes.Count(b.data, []byte{'\n'}) 172 | } 173 | 174 | func min(a, b int) int { 175 | if a < b { 176 | return a 177 | } 178 | return b 179 | } 180 | 181 | func max(a, b int) int { 182 | if a > b { 183 | return a 184 | } 185 | return b 186 | } 187 | 188 | // from slice tricks 189 | func insert(s []byte, k int, vs []byte) []byte { 190 | if n := len(s) + len(vs); n <= cap(s) { 191 | s2 := s[:n] 192 | copy(s2[k+len(vs):], s[k:]) 193 | copy(s2[k:], vs) 194 | return s2 195 | } 196 | s2 := make([]byte, len(s)+len(vs)) 197 | copy(s2, s[:k]) 198 | copy(s2[k:], vs) 199 | copy(s2[k+len(vs):], s[k:]) 200 | return s2 201 | } 202 | -------------------------------------------------------------------------------- /input/linerope/util.go: -------------------------------------------------------------------------------- 1 | package linerope 2 | 3 | import ( 4 | "bytes" 5 | ) 6 | 7 | // indexN finds the index of n-th sep in b. 8 | func indexN(b, sep []byte, n int) (index int) { 9 | index, idx, sepLen := 0, -1, len(sep) 10 | for i := 0; i < n; i++ { 11 | if idx = bytes.Index(b, sep); idx == -1 { 12 | break 13 | } 14 | b = b[idx+sepLen:] 15 | index += idx 16 | } 17 | 18 | if idx == -1 { 19 | index = -1 20 | } else { 21 | index += (n - 1) * sepLen 22 | } 23 | 24 | return 25 | } 26 | 27 | // lineCol converts an absolute position to a line/col pair by scanning b. 28 | func lineCol(b, sep []byte, pos int) loc { 29 | var line, last int 30 | for { 31 | idx := bytes.Index(b[last:], sep) 32 | if idx < 0 { 33 | break 34 | } else if last+idx >= pos { 35 | return loc{line, pos - last} 36 | } 37 | last += idx + len(sep) 38 | line++ 39 | } 40 | return loc{line, pos - last} 41 | } 42 | -------------------------------------------------------------------------------- /input/reader.go: -------------------------------------------------------------------------------- 1 | package input 2 | 3 | import ( 4 | "io" 5 | "sync" 6 | ) 7 | 8 | // readerWrapper implements a io.ReaderAt from an io.Reader. The readerWrapper 9 | // works by storing every byte read from the reader, and using that to read 10 | // data that has been read before. 11 | type readerWrapper struct { 12 | reader io.Reader 13 | buf []byte 14 | lock sync.Mutex 15 | } 16 | 17 | // FromReader converts an io.Reader to an io.ReaderAt. 18 | func FromReader(r io.Reader) io.ReaderAt { 19 | return &readerWrapper{ 20 | reader: r, 21 | } 22 | } 23 | 24 | // ReadAt implements the io.ReaderAt interface to wrap an io.Reader. Note that 25 | // calls to ReadAt may change the offset within the wrapped io.Reader (since 26 | // Read is called on the wrapped io.Reader to fetch data). 
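// For example (adapted from reader_test.go):
//
//	rat := FromReader(bytes.NewBufferString("foo bar baz"))
//	b := make([]byte, 3)
//	rat.ReadAt(b, 4) // b now holds "bar"
//	rat.ReadAt(b, 0) // previously read data is buffered: b now holds "foo"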
27 | func (r *readerWrapper) ReadAt(b []byte, off int64) (n int, err error) { 28 | r.lock.Lock() 29 | defer r.lock.Unlock() 30 | 31 | blen := int64(len(b)) 32 | // if there is enough space to fill up b already in the buffer, just copy 33 | // the data and return it. 34 | if int64(len(r.buf))-off >= blen { 35 | return copy(b, r.buf[off:]), nil 36 | } 37 | 38 | // otherwise read data until there is enough or there is an error. 39 | tmp := make([]byte, bufsz) 40 | for int64(len(r.buf))-off < blen { 41 | n, err = r.reader.Read(tmp) 42 | r.buf = append(r.buf, tmp[:n]...) 43 | if err != nil { 44 | break 45 | } 46 | } 47 | if off >= int64(len(r.buf)) { 48 | return 0, err 49 | } 50 | 51 | return copy(b, r.buf[off:]), err 52 | } 53 | 54 | // Slice returns the slice [low:high) in the given ReaderAt. 55 | func Slice(r io.ReaderAt, low, high int) []byte { 56 | buf := make([]byte, high-low) 57 | n, _ := r.ReadAt(buf, int64(low)) 58 | return buf[:n] 59 | } 60 | -------------------------------------------------------------------------------- /input/reader_test.go: -------------------------------------------------------------------------------- 1 | package input_test 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "testing" 7 | 8 | "github.com/zyedidia/gpeg/input" 9 | ) 10 | 11 | func TestReaderWrapper(t *testing.T) { 12 | r := bytes.NewBufferString("foo bar baz") 13 | rat := input.FromReader(r) 14 | b := make([]byte, 3) 15 | 16 | rat.ReadAt(b, 4) 17 | if string(b) != "bar" { 18 | t.Errorf("want %s, got %s", "bar", string(b)) 19 | } 20 | 21 | rat.ReadAt(b, 0) 22 | if string(b) != "foo" { 23 | t.Errorf("want %s, got %s", "foo", string(b)) 24 | } 25 | 26 | n, err := rat.ReadAt(b, 9) 27 | if string(b[:n]) != "az" { 28 | t.Errorf("want %s, got %s", "az", string(b)) 29 | } 30 | if n != 2 || err != io.EOF { 31 | t.Errorf("incorrect, n: %v, err: %v", n, err) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /isa/checker.go: -------------------------------------------------------------------------------- 1 | package isa 2 | 3 | import ( 4 | "github.com/zyedidia/gpeg/input" 5 | ) 6 | 7 | // A Checker is used so the user can perform additional custom validation of 8 | // parse results. For example, you might want to parse only 8-bit integers by 9 | // matching [0-9]+ and then using a checker to ensure the matched integer is in 10 | // the range 0-256. 
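// (Judging from the implementations in this file, Check returns a negative
// value to fail the match, and otherwise the number of additional bytes to
// consume: MapChecker returns 0 or -1, and BackReference returns the length
// of the matched back-reference.)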
11 | type Checker interface { 12 | Check(b []byte, src *input.Input, id, flag int) int 13 | } 14 | 15 | type MapChecker map[string]struct{} 16 | 17 | func NewMapChecker(strs []string) MapChecker { 18 | m := make(map[string]struct{}) 19 | for _, s := range strs { 20 | m[s] = struct{}{} 21 | } 22 | return m 23 | } 24 | 25 | func (m MapChecker) Check(b []byte, src *input.Input, id, flag int) int { 26 | if _, ok := m[string(b)]; ok { 27 | return 0 28 | } 29 | return -1 30 | } 31 | 32 | type RefKind uint8 33 | 34 | const ( 35 | RefDef RefKind = iota 36 | RefUse 37 | RefBlock 38 | ) 39 | 40 | type BackReference struct { 41 | Symbols map[int]string 42 | } 43 | 44 | func NewBackRef() *BackReference { 45 | return &BackReference{ 46 | Symbols: make(map[int]string), 47 | } 48 | } 49 | 50 | func (r *BackReference) Check(b []byte, src *input.Input, id, flag int) int { 51 | switch RefKind(flag) { 52 | case RefDef: 53 | r.Symbols[id] = string(b) 54 | return 0 55 | case RefUse: 56 | back := r.Symbols[id] 57 | buf := make([]byte, len(back)) 58 | n, _ := src.ReadAt(buf, int64(src.Pos())) 59 | if n == len(buf) && string(buf) == back { 60 | return n 61 | } 62 | return -1 63 | case RefBlock: 64 | } 65 | return 0 66 | } 67 | -------------------------------------------------------------------------------- /isa/isa.go: -------------------------------------------------------------------------------- 1 | // Package isa provides types for all instructions in the GPeg virtual machine. 2 | package isa 3 | 4 | import ( 5 | "fmt" 6 | "regexp/syntax" 7 | "strconv" 8 | 9 | "github.com/zyedidia/gpeg/charset" 10 | ) 11 | 12 | // Insn represents the interface for an instruction in the ISA 13 | type Insn interface { 14 | insn() 15 | } 16 | 17 | // A Program is a sequence of instructions 18 | type Program []Insn 19 | 20 | // Size returns the number of instructions in a program ignoring labels and 21 | // nops. 22 | func (p Program) Size() int { 23 | var sz int 24 | for _, i := range p { 25 | switch i.(type) { 26 | case Label, Nop: 27 | continue 28 | default: 29 | sz++ 30 | } 31 | } 32 | return sz 33 | } 34 | 35 | // A JumpType instruction is any instruction that refers to a Label. 36 | type JumpType interface { 37 | jumpt() 38 | } 39 | 40 | var uniqId int 41 | 42 | // Label is used for marking a location in the instruction code with 43 | // a unique ID 44 | type Label struct { 45 | Id int 46 | basic 47 | } 48 | 49 | // NewLabel returns a new label with a unique ID 50 | func NewLabel() Label { 51 | uniqId++ 52 | return Label{ 53 | Id: uniqId, 54 | } 55 | } 56 | 57 | // Char consumes the next byte of the subject if it matches Byte and 58 | // fails otherwise. 59 | type Char struct { 60 | Byte byte 61 | basic 62 | } 63 | 64 | // Jump jumps to Lbl. 65 | type Jump struct { 66 | Lbl Label 67 | jump 68 | } 69 | 70 | // Choice pushes Lbl to the stack and if there is a failure the label will 71 | // be popped from the stack and jumped to. 72 | type Choice struct { 73 | Lbl Label 74 | jump 75 | } 76 | 77 | // Call pushes the next instruction to the stack as a return address and jumps 78 | // to Lbl. 79 | type Call struct { 80 | Lbl Label 81 | jump 82 | } 83 | 84 | // Commit jumps to Lbl and removes the top entry from the stack 85 | type Commit struct { 86 | Lbl Label 87 | jump 88 | } 89 | 90 | // Return pops a return address off the stack and jumps to it. 91 | type Return struct { 92 | basic 93 | } 94 | 95 | // Fail causes the instruction pointer to go to the fail state. 
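// (Illustrative note, following the LPeg-style compilation scheme GPeg is
// based on: an ordered choice p1 / p2 is typically compiled as
//
//	    Choice L1
//	    <code for p1>
//	    Commit L2
//	L1: <code for p2>
//	L2: ...
//
// so a failure while matching p1 pops the Choice entry and resumes at L1.)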
96 | type Fail struct { 97 | basic 98 | } 99 | 100 | // Set consumes the next byte of input if it is in the set of chars defined 101 | // by Chars. 102 | type Set struct { 103 | Chars charset.Set 104 | basic 105 | } 106 | 107 | // Any consumes the next N bytes and fails if that is not possible. 108 | type Any struct { 109 | N byte 110 | basic 111 | } 112 | 113 | // PartialCommit modifies the backtrack entry on the top of the stack to 114 | // point to the current subject offset, and jumps to Lbl. 115 | type PartialCommit struct { 116 | Lbl Label 117 | jump 118 | } 119 | 120 | // Span consumes zero or more bytes in the set Chars. This instruction 121 | // never fails. 122 | type Span struct { 123 | Chars charset.Set 124 | basic 125 | } 126 | 127 | // BackCommit pops a backtrack entry off the stack, goes to the subject 128 | // position in the entry, and jumps to Lbl. 129 | type BackCommit struct { 130 | Lbl Label 131 | jump 132 | } 133 | 134 | // FailTwice pops an entry off the stack and sets the instruction pointer to 135 | // the fail state. 136 | type FailTwice struct { 137 | basic 138 | } 139 | 140 | // Empty makes a zero-width assertion according to the Op option. We use the 141 | // same zero-width assertions that are supported by Go's regexp package. 142 | type Empty struct { 143 | Op syntax.EmptyOp 144 | basic 145 | } 146 | 147 | // TestChar consumes the next byte if it matches Byte and jumps to Lbl 148 | // otherwise. If the consumption is possible, a backtrack entry referring 149 | // to Lbl and the subject position from before consumption is pushed to the 150 | // stack. 151 | type TestChar struct { 152 | Byte byte 153 | Lbl Label 154 | jump 155 | } 156 | 157 | // TestCharNoChoice consumes the next byte if it matches Byte and jumps to Lbl 158 | // otherwise. No backtrack entry is pushed to the stack. 159 | type TestCharNoChoice struct { 160 | Byte byte 161 | Lbl Label 162 | jump 163 | } 164 | 165 | // TestSet consumes the next byte if it is in the set Chars and jumps to 166 | // Lbl otherwise. If the consumption is possible, a backtrack entry referring 167 | // to Lbl and the subject position from before consumption is pushed to the 168 | // stack. 169 | type TestSet struct { 170 | Chars charset.Set 171 | Lbl Label 172 | jump 173 | } 174 | 175 | // TestSetNoChoice is the same as TestSet but no backtrack entry is pushed to 176 | // the stack. 177 | type TestSetNoChoice struct { 178 | Chars charset.Set 179 | Lbl Label 180 | jump 181 | } 182 | 183 | // TestAny consumes the next N bytes and jumps to Lbl if that is not possible. 184 | // If the consumption is possible, a backtrack entry referring to Lbl and 185 | // the subject position from before consumption is pushed to the stack. 186 | type TestAny struct { 187 | N byte 188 | Lbl Label 189 | jump 190 | } 191 | 192 | // End immediately completes the pattern as a match. 193 | type End struct { 194 | basic 195 | Fail bool 196 | } 197 | 198 | // Nop does nothing. 199 | type Nop struct { 200 | basic 201 | } 202 | 203 | // MemoOpen begins a memo entry at this position. It marks the pattern that is 204 | // being memoized with a unique ID for that pattern, and stores a label to 205 | // jump to if the pattern is found in the memoization table. 206 | type MemoOpen struct { 207 | Lbl Label 208 | Id int 209 | jump 210 | } 211 | 212 | // MemoClose completes a memoization entry and adds the entry into the memo 213 | // table if it meets certain conditions (size, or other heuristics). 
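// On a later parse, when the VM reaches the corresponding MemoOpen and finds
// an entry for that position and pattern ID in the table, it can jump to the
// stored label and skip re-parsing the memoized pattern (see MemoOpen above).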
214 | type MemoClose struct { 215 | basic 216 | } 217 | 218 | // MemoTreeOpen starts a memoization tree repetition routine. 219 | type MemoTreeOpen struct { 220 | Lbl Label 221 | Id int 222 | jump 223 | } 224 | 225 | // MemoTreeInsert performs insertion into the memoization table for the tree 226 | // memoization strategy. 227 | type MemoTreeInsert struct { 228 | basic 229 | } 230 | 231 | // MemoTree "tree-ifies" the current memoization entries on the stack. 232 | type MemoTree struct { 233 | basic 234 | } 235 | 236 | // MemoTreeClose completes the tree memoization routine. 237 | type MemoTreeClose struct { 238 | Id int 239 | basic 240 | } 241 | 242 | // CaptureBegin begins capturing the given ID. 243 | type CaptureBegin struct { 244 | Id int 245 | basic 246 | } 247 | 248 | // CaptureLate begins capturing the given ID at the current subject position 249 | // minus Back. 250 | type CaptureLate struct { 251 | Back byte 252 | Id int 253 | basic 254 | } 255 | 256 | // CaptureEnd completes an active capture. 257 | type CaptureEnd struct { 258 | Id int 259 | basic 260 | } 261 | 262 | // CaptureFull begins a capture for the given ID at the current subject 263 | // position minus Back, and immediately completes the capture. This is 264 | // equivalent to CaptureLate Back ID; CaptureEnd. 265 | type CaptureFull struct { 266 | Back byte 267 | Id int 268 | basic 269 | } 270 | 271 | // CheckBegin marks the beginning position for a checker. 272 | type CheckBegin struct { 273 | Id int 274 | Flag int 275 | basic 276 | } 277 | 278 | // CheckEnd records the end position of a checker and applies the checker to 279 | // determine if the match should fail. 280 | type CheckEnd struct { 281 | Checker Checker 282 | basic 283 | } 284 | 285 | // Error logs an error message at the current position. 286 | type Error struct { 287 | basic 288 | Message string 289 | } 290 | 291 | type basic struct{} 292 | 293 | func (b basic) insn() {} 294 | 295 | type jump struct { 296 | basic 297 | } 298 | 299 | func (j jump) jumpt() {} 300 | 301 | // String returns the string representation of this instruction. 302 | func (i Label) String() string { 303 | return fmt.Sprintf("L%v", i.Id) 304 | } 305 | 306 | // String returns the string representation of this instruction. 307 | func (i Char) String() string { 308 | return fmt.Sprintf("Char %v", strconv.QuoteRune(rune(i.Byte))) 309 | } 310 | 311 | // String returns the string representation of this instruction. 312 | func (i Jump) String() string { 313 | return fmt.Sprintf("Jump %v", i.Lbl) 314 | } 315 | 316 | // String returns the string representation of this instruction. 317 | func (i Choice) String() string { 318 | return fmt.Sprintf("Choice %v", i.Lbl) 319 | } 320 | 321 | // String returns the string representation of this instruction. 322 | func (i Call) String() string { 323 | return fmt.Sprintf("Call %v", i.Lbl) 324 | } 325 | 326 | // String returns the string representation of this instruction. 327 | func (i Commit) String() string { 328 | return fmt.Sprintf("Commit %v", i.Lbl) 329 | } 330 | 331 | // String returns the string representation of this instruction. 332 | func (i Return) String() string { 333 | return "Return" 334 | } 335 | 336 | // String returns the string representation of this instruction. 337 | func (i Fail) String() string { 338 | return "Fail" 339 | } 340 | 341 | // String returns the string representation of this instruction. 
342 | func (i Set) String() string { 343 | return fmt.Sprintf("Set %v", i.Chars) 344 | } 345 | 346 | // String returns the string representation of this instruction. 347 | func (i Any) String() string { 348 | return fmt.Sprintf("Any %v", i.N) 349 | } 350 | 351 | // String returns the string representation of this instruction. 352 | func (i PartialCommit) String() string { 353 | return fmt.Sprintf("PartialCommit %v", i.Lbl) 354 | } 355 | 356 | // String returns the string representation of this instruction. 357 | func (i Span) String() string { 358 | return fmt.Sprintf("Span %v", i.Chars) 359 | } 360 | 361 | // String returns the string representation of this instruction. 362 | func (i BackCommit) String() string { 363 | return fmt.Sprintf("BackCommit %v", i.Lbl) 364 | } 365 | 366 | // String returns the string representation of this instruction. 367 | func (i FailTwice) String() string { 368 | return "FailTwice" 369 | } 370 | 371 | // String returns the string representation of this instruction. 372 | func (i TestChar) String() string { 373 | return fmt.Sprintf("TestChar %v %v", strconv.QuoteRune(rune(i.Byte)), i.Lbl) 374 | } 375 | 376 | // String returns the string representation of this instruction. 377 | func (i TestCharNoChoice) String() string { 378 | return fmt.Sprintf("TestCharNoChoice %v %v", strconv.QuoteRune(rune(i.Byte)), i.Lbl) 379 | } 380 | 381 | // String returns the string representation of this instruction. 382 | func (i TestSet) String() string { 383 | return fmt.Sprintf("TestSet %v %v", i.Chars, i.Lbl) 384 | } 385 | 386 | // String returns the string representation of this instruction. 387 | func (i TestSetNoChoice) String() string { 388 | return fmt.Sprintf("TestSetNoChoice %v %v", i.Chars, i.Lbl) 389 | } 390 | 391 | // String returns the string representation of this instruction. 392 | func (i TestAny) String() string { 393 | return fmt.Sprintf("TestAny %v %v", i.N, i.Lbl) 394 | } 395 | 396 | // String returns the string representation of this instruction. 397 | func (i End) String() string { 398 | var result string 399 | if i.Fail { 400 | result = "Fail" 401 | } else { 402 | result = "Success" 403 | } 404 | return fmt.Sprintf("End %s", result) 405 | } 406 | 407 | // String returns the string representation of this instruction. 408 | func (i Nop) String() string { 409 | return "Nop" 410 | } 411 | 412 | // String returns the string representation of this instruction. 413 | func (i CheckBegin) String() string { 414 | return "CheckBegin" 415 | } 416 | 417 | // String returns the string representation of this instruction. 418 | func (i CheckEnd) String() string { 419 | return fmt.Sprintf("CheckEnd %v", i.Checker) 420 | } 421 | 422 | // String returns the string representation of this instruction. 423 | func (i MemoOpen) String() string { 424 | return fmt.Sprintf("MemoOpen %v %v", i.Lbl, i.Id) 425 | } 426 | 427 | // String returns the string representation of this instruction. 428 | func (i MemoClose) String() string { 429 | return "MemoClose" 430 | } 431 | 432 | // String returns the string representation of this instruction. 433 | func (i MemoTreeOpen) String() string { 434 | return fmt.Sprintf("MemoTreeOpen %v %v", i.Lbl, i.Id) 435 | } 436 | 437 | // String returns the string representation of this instruction. 438 | func (i MemoTreeInsert) String() string { 439 | return "MemoTreeInsert" 440 | } 441 | 442 | // String returns the string representation of this instruction. 
443 | func (i MemoTree) String() string { 444 | return "MemoTree" 445 | } 446 | 447 | // String returns the string representation of this instruction. 448 | func (i MemoTreeClose) String() string { 449 | return fmt.Sprintf("MemoTreeClose %v", i.Id) 450 | } 451 | 452 | // String returns the string representation of this instruction. 453 | func (i CaptureBegin) String() string { 454 | return fmt.Sprintf("Capture begin %v", i.Id) 455 | } 456 | 457 | // String returns the string representation of this instruction. 458 | func (i CaptureLate) String() string { 459 | return fmt.Sprintf("Capture late %v %v", i.Back, i.Id) 460 | } 461 | 462 | // String returns the string representation of this instruction. 463 | func (i CaptureEnd) String() string { 464 | return "Capture end" 465 | } 466 | 467 | // String returns the string representation of this instruction. 468 | func (i CaptureFull) String() string { 469 | return fmt.Sprintf("Capture full %v %v", i.Back, i.Id) 470 | } 471 | 472 | // String returns the string representation of this instruction. 473 | func (i Error) String() string { 474 | return fmt.Sprintf("Error %s", strconv.QuoteToASCII(i.Message)) 475 | } 476 | 477 | // String returns the string representation of this instruction. 478 | func (i Empty) String() string { 479 | return fmt.Sprintf("Empty %s", emptyToString(i.Op)) 480 | } 481 | 482 | // String returns the string representation of the program. 483 | func (p Program) String() string { 484 | s := "" 485 | var last Insn 486 | for _, insn := range p { 487 | switch insn.(type) { 488 | case Nop: 489 | continue 490 | case Label: 491 | if _, ok := last.(Label); ok { 492 | s += fmt.Sprintf("\n%v:", insn) 493 | } else { 494 | s += fmt.Sprintf("%v:", insn) 495 | } 496 | default: 497 | s += fmt.Sprintf("\t%v\n", insn) 498 | } 499 | last = insn 500 | } 501 | s += "\n" 502 | return s 503 | } 504 | 505 | func emptyToString(op syntax.EmptyOp) string { 506 | switch op { 507 | case syntax.EmptyBeginLine: 508 | return "BeginLine" 509 | case syntax.EmptyEndLine: 510 | return "EndLine" 511 | case syntax.EmptyBeginText: 512 | return "BeginText" 513 | case syntax.EmptyEndText: 514 | return "EndText" 515 | case syntax.EmptyWordBoundary: 516 | return "WordBoundary" 517 | case syntax.EmptyNoWordBoundary: 518 | return "NoWordBoundary" 519 | } 520 | return "Unknown" 521 | } 522 | -------------------------------------------------------------------------------- /memo/capture.go: -------------------------------------------------------------------------------- 1 | package memo 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | ) 7 | 8 | const ( 9 | tNode = iota 10 | tDummy 11 | ) 12 | 13 | type Capture struct { 14 | id int32 15 | typ int32 16 | 17 | off int 18 | length int 19 | ment *Entry 20 | children []*Capture 21 | } 22 | 23 | func NewCaptureNode(id int, start, length int, children []*Capture) *Capture { 24 | c := &Capture{ 25 | id: int32(id), 26 | typ: tNode, 27 | off: start, 28 | length: length, 29 | children: children, 30 | } 31 | return c 32 | } 33 | 34 | func NewCaptureDummy(start, length int, children []*Capture) *Capture { 35 | c := &Capture{ 36 | id: 0, 37 | typ: tDummy, 38 | off: start, 39 | length: length, 40 | children: children, 41 | } 42 | return c 43 | } 44 | 45 | func (c *Capture) ChildIterator(start int) func() *Capture { 46 | i := 0 47 | var subit, ret func() *Capture 48 | ret = func() *Capture { 49 | if i >= len(c.children) { 50 | return nil 51 | } 52 | ch := c.children[i] 53 | if ch.Dummy() && subit == nil { 54 | subit = ch.ChildIterator(ch.off) 55 
| } 56 | if subit != nil { 57 | ch = subit() 58 | } else { 59 | i++ 60 | } 61 | if ch == nil { 62 | subit = nil 63 | i++ 64 | return ret() 65 | } 66 | return ch 67 | } 68 | return ret 69 | } 70 | 71 | func (c *Capture) Child(n int) *Capture { 72 | it := c.ChildIterator(0) 73 | i := 0 74 | for ch := it(); ch != nil; ch = it() { 75 | if i == n { 76 | return ch 77 | } 78 | i++ 79 | } 80 | return nil 81 | } 82 | 83 | func (c *Capture) NumChildren() int { 84 | nchild := 0 85 | for _, ch := range c.children { 86 | if ch.Dummy() { 87 | nchild += ch.NumChildren() 88 | } else { 89 | nchild++ 90 | } 91 | } 92 | return nchild 93 | } 94 | 95 | func (c *Capture) Start() int { 96 | if c.ment != nil { 97 | return c.ment.pos.Pos() + c.off 98 | } 99 | return c.off 100 | } 101 | 102 | func (c *Capture) Len() int { 103 | return c.length 104 | } 105 | 106 | func (c *Capture) End() int { 107 | return c.Start() + c.length 108 | } 109 | 110 | func (c *Capture) Dummy() bool { 111 | return c.typ == tDummy 112 | } 113 | 114 | func (c *Capture) Id() int { 115 | return int(c.id) 116 | } 117 | 118 | func (c *Capture) setMEnt(e *Entry) { 119 | if c.ment != nil { 120 | return 121 | } 122 | 123 | c.ment = e 124 | c.off = c.off - e.pos.Pos() 125 | 126 | for _, c := range c.children { 127 | c.setMEnt(e) 128 | } 129 | } 130 | 131 | // String returns a readable string representation of this node, showing the ID 132 | // of this node and its children. 133 | func (c *Capture) String() string { 134 | buf := &bytes.Buffer{} 135 | for i, c := range c.children { 136 | buf.WriteString(c.String()) 137 | if i != len(c.children)-1 { 138 | buf.WriteString(", ") 139 | } 140 | } 141 | return fmt.Sprintf("{%d, [%s]}", c.id, buf.String()) 142 | } 143 | -------------------------------------------------------------------------------- /memo/edit.go: -------------------------------------------------------------------------------- 1 | package memo 2 | 3 | // An Edit represents a modification to the subject string where the interval 4 | // [Start, End) is modified to be Len bytes. If Len = 0, this is equivalent 5 | // to deleting the interval, and if Start = End this is an insertion. 6 | type Edit struct { 7 | Start, End int 8 | Len int 9 | } 10 | -------------------------------------------------------------------------------- /memo/entry.go: -------------------------------------------------------------------------------- 1 | package memo 2 | 3 | import ( 4 | "github.com/zyedidia/gpeg/memo/interval" 5 | ) 6 | 7 | // An Entry represents a memoized parse result. It stores the non-terminal 8 | // memoized, the start position of the parse result, the length, and the number 9 | // of characters examined to make the parse determination. If the length is -1, 10 | // the non-terminal failed to match at this location (but still may have 11 | // examined a non-zero number of characters). 12 | type Entry struct { 13 | length int 14 | examined int 15 | count int 16 | captures []*Capture 17 | pos interval.Pos 18 | } 19 | 20 | func (e *Entry) setPos(pos interval.Pos) { 21 | e.pos = pos 22 | for i := range e.captures { 23 | e.captures[i].setMEnt(e) 24 | } 25 | } 26 | 27 | // Pos returns this entry's starting position. 28 | func (e *Entry) Pos() int { 29 | return e.pos.Pos() 30 | } 31 | 32 | // Length returns the number of characters memoized by this entry. 33 | func (e *Entry) Length() int { 34 | return e.length 35 | } 36 | 37 | // Captures returns the captures that occurred within this memoized parse 38 | // result. 
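// The Start and End of each returned capture are absolute subject offsets:
// captures are stored relative to the entry and shifted by the entry's
// current position on access, so they remain correct after the entry moves.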
39 | func (e *Entry) Captures() []*Capture { 40 | return e.captures 41 | } 42 | 43 | func (e *Entry) Count() int { 44 | return e.count 45 | } 46 | 47 | func (e *Entry) Examined() int { 48 | return e.examined 49 | } 50 | -------------------------------------------------------------------------------- /memo/interval/interval_test.go: -------------------------------------------------------------------------------- 1 | package interval_test 2 | 3 | import ( 4 | "math/rand" 5 | "testing" 6 | 7 | "github.com/zyedidia/gpeg/memo/interval" 8 | "github.com/zyedidia/gpeg/memo/interval/lazy" 9 | "github.com/zyedidia/gpeg/memo/interval/lazylog" 10 | ) 11 | 12 | func randrange(max int) (int, int) { 13 | low := rand.Intn(max) 14 | high := low + rand.Intn(1000) 15 | if low == high { 16 | high = low + 1 17 | } 18 | return low, high 19 | } 20 | 21 | func randint(min, max int) int { 22 | return rand.Intn(max-min) + min 23 | } 24 | 25 | func TestTree(t *testing.T) { 26 | it := &lazy.Array{} 27 | ia := &lazylog.Tree{} 28 | 29 | const ( 30 | opAdd = iota 31 | opFind 32 | opRemoveAndShift 33 | opPos 34 | 35 | nops = 300000 36 | maxidx = 10 37 | maxid = 10 38 | maxshamt = 50 39 | ) 40 | 41 | var pt, pa interval.Pos 42 | var length int 43 | var haspt bool 44 | 45 | for i := 0; i < nops; i++ { 46 | op := rand.Intn(4) 47 | switch op { 48 | case opAdd: 49 | id := rand.Intn(maxid) 50 | low, high := randrange(maxidx) 51 | pt = it.Add(id, low, high, i) 52 | pa = ia.Add(id, low, high, i) 53 | length = high - low 54 | haspt = true 55 | case opFind: 56 | id := rand.Intn(maxid) 57 | pos := rand.Intn(maxidx) 58 | 59 | vt := it.FindLargest(id, pos) 60 | va := ia.FindLargest(id, pos) 61 | 62 | if vt == nil && va == nil { 63 | continue 64 | } 65 | 66 | if vt == nil && va != nil || va == nil && vt != nil { 67 | t.Fatalf("Find (%d, %d): %v != %v", id, pos, vt, va) 68 | } 69 | 70 | if vt.(int) != va.(int) { 71 | t.Fatalf("Find (%d, %d): %d != %d", id, pos, vt.(int), va.(int)) 72 | } 73 | case opRemoveAndShift: 74 | low, high := randrange(maxidx) 75 | amt := randint(-maxshamt, maxshamt) 76 | 77 | if haspt { 78 | ptpos := pt.Pos() 79 | if lazy.Overlaps(lazy.Interval{ 80 | Low: low, 81 | High: high, 82 | }, lazy.Interval{ 83 | Low: ptpos, 84 | High: ptpos + length, 85 | }) { 86 | haspt = false 87 | } 88 | } 89 | 90 | it.RemoveAndShift(low, high, amt) 91 | ia.RemoveAndShift(low, high, amt) 92 | case opPos: 93 | if haspt && pt.Pos() != pa.Pos() { 94 | t.Fatalf("%d != %d", pt.Pos(), pa.Pos()) 95 | } 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /memo/interval/lazy/LICENSE-AVL: -------------------------------------------------------------------------------- 1 | This license applies to the file: tree.go 2 | 3 | MIT License 4 | 5 | Copyright (c) 2017 Kostas Karasavvas, 2021 Zachary Yedidia 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /memo/interval/lazy/array.go: -------------------------------------------------------------------------------- 1 | package lazy 2 | 3 | import "github.com/zyedidia/gpeg/memo/interval" 4 | 5 | // An Array is another implementation of the interval.Set backed by an array 6 | // rather than an AVL tree. This implementation is naive and ineffecient, but 7 | // provides a good point of comparison for benchmarking and testing. 8 | type Array struct { 9 | slots []slot 10 | } 11 | 12 | type slot struct { 13 | *ivalue 14 | id int 15 | } 16 | 17 | func (iv *ivalue) Pos() int { 18 | return iv.interval.Low 19 | } 20 | 21 | func (a *Array) FindLargest(id, pos int) interval.Value { 22 | var max int 23 | maxi := -1 24 | for i, in := range a.slots { 25 | if in.interval.Low == pos && in.id == id && in.interval.High > max { 26 | maxi = i 27 | max = in.interval.High 28 | } 29 | } 30 | if maxi == -1 || maxi >= len(a.slots) { 31 | return nil 32 | } 33 | 34 | return a.slots[maxi].value 35 | } 36 | 37 | func (a *Array) Add(id, low, high int, val interval.Value) interval.Pos { 38 | iv := &ivalue{ 39 | interval: Interval{low, high}, 40 | value: val, 41 | } 42 | a.slots = append(a.slots, slot{ 43 | id: id, 44 | ivalue: iv, 45 | }) 46 | return iv 47 | } 48 | 49 | func (a *Array) RemoveAndShift(low, high, amt int) { 50 | for i := 0; i < len(a.slots); { 51 | if Overlaps(a.slots[i].interval, Interval{low, high}) { 52 | a.slots[i] = a.slots[len(a.slots)-1] 53 | a.slots = a.slots[:len(a.slots)-1] 54 | } else { 55 | i++ 56 | } 57 | } 58 | 59 | if amt == 0 { 60 | return 61 | } 62 | 63 | for i := range a.slots { 64 | if a.slots[i].interval.Low >= low { 65 | a.slots[i].interval = a.slots[i].interval.Shift(amt) 66 | } 67 | } 68 | } 69 | 70 | func (a *Array) Size() int { 71 | return len(a.slots) 72 | } 73 | -------------------------------------------------------------------------------- /memo/interval/lazy/interval.go: -------------------------------------------------------------------------------- 1 | package lazy 2 | 3 | import "fmt" 4 | 5 | type Interval struct { 6 | Low, High int 7 | } 8 | 9 | func (i Interval) String() string { 10 | return fmt.Sprintf("[%d, %d)", i.Low, i.High) 11 | } 12 | 13 | func (i Interval) Shift(amt int) Interval { 14 | return Interval{ 15 | Low: i.Low + amt, 16 | High: i.High + amt, 17 | } 18 | } 19 | 20 | func (i Interval) Len() int { 21 | return i.High - i.Low 22 | } 23 | 24 | func Overlaps(i1, i2 Interval) bool { 25 | return i1.Low < i2.High && i1.High > i2.Low 26 | } 27 | -------------------------------------------------------------------------------- /memo/interval/lazy/interval_test.go: -------------------------------------------------------------------------------- 1 | package lazy 2 | 3 | import ( 4 | "math/rand" 5 | "testing" 6 | 7 | "github.com/zyedidia/gpeg/memo/interval" 8 | ) 9 | 10 | func randrange(max int) (int, int) { 11 | low := rand.Intn(max) 12 | high := low + rand.Intn(1000) 13 | if low == high { 14 | 
high = low + 1 15 | } 16 | return low, high 17 | } 18 | 19 | func randint(min, max int) int { 20 | return rand.Intn(max-min) + min 21 | } 22 | 23 | func checkParents(n *node, t *testing.T) { 24 | if n == nil { 25 | return 26 | } 27 | if n.left != nil && n.left.parent != n { 28 | t.Fatalf("Incorrect parent n: %p, n.left.parent: %p", n, n.left.parent) 29 | } 30 | if n.right != nil && n.right.parent != n { 31 | t.Fatalf("Incorrect parent n: %p, n.right.parent: %p", n, n.right.parent) 32 | } 33 | checkParents(n.left, t) 34 | checkParents(n.right, t) 35 | } 36 | 37 | func TestTree(t *testing.T) { 38 | it := &Tree{} 39 | ia := &Array{} 40 | 41 | const ( 42 | opAdd = iota 43 | opFind 44 | opRemoveAndShift 45 | opPos 46 | 47 | nops = 300000 48 | maxidx = 10 49 | maxid = 10 50 | maxshamt = 50 51 | ) 52 | 53 | var pt, pa interval.Pos 54 | var length int 55 | var haspt bool 56 | 57 | for i := 0; i < nops; i++ { 58 | op := rand.Intn(4) 59 | switch op { 60 | case opAdd: 61 | id := rand.Intn(maxid) 62 | low, high := randrange(maxidx) 63 | pt = it.Add(id, low, high, i) 64 | pa = ia.Add(id, low, high, i) 65 | length = high - low 66 | haspt = true 67 | case opFind: 68 | id := rand.Intn(maxid) 69 | pos := rand.Intn(maxidx) 70 | 71 | vt := it.FindLargest(id, pos) 72 | va := ia.FindLargest(id, pos) 73 | 74 | if vt == nil && va == nil { 75 | continue 76 | } 77 | 78 | if vt == nil && va != nil || va == nil && vt != nil { 79 | t.Fatalf("Find (%d, %d): %v != %v", id, pos, vt, va) 80 | } 81 | 82 | if vt.(int) != va.(int) { 83 | t.Fatalf("Find (%d, %d): %d != %d", id, pos, vt.(int), va.(int)) 84 | } 85 | case opRemoveAndShift: 86 | low, high := randrange(maxidx) 87 | amt := randint(-maxshamt, maxshamt) 88 | 89 | if haspt { 90 | ptpos := pt.Pos() 91 | if Overlaps(Interval{ 92 | Low: low, 93 | High: high, 94 | }, Interval{ 95 | Low: ptpos, 96 | High: ptpos + length, 97 | }) { 98 | haspt = false 99 | } 100 | } 101 | 102 | it.RemoveAndShift(low, high, amt) 103 | ia.RemoveAndShift(low, high, amt) 104 | case opPos: 105 | if haspt && pt.Pos() != pa.Pos() { 106 | t.Fatalf("%d != %d", pt.Pos(), pa.Pos()) 107 | } 108 | } 109 | checkParents(it.root, t) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /memo/interval/lazy/tree.go: -------------------------------------------------------------------------------- 1 | // Package lazy provides an interval tree backed by an AVL tree. In addition, 2 | // the interval tree supports a lazy shifting algorithm. 3 | package lazy 4 | 5 | import "github.com/zyedidia/gpeg/memo/interval" 6 | 7 | type key struct { 8 | id int 9 | pos int 10 | } 11 | 12 | // compare orders keys by pos and then id. 13 | func (k key) compare(other key) int { 14 | if k.pos < other.pos { 15 | return -1 16 | } else if k.pos > other.pos { 17 | return 1 18 | } else if k.id < other.id { 19 | return -1 20 | } else if k.id > other.id { 21 | return 1 22 | } 23 | return 0 24 | } 25 | 26 | type Tree struct { 27 | root *node 28 | } 29 | 30 | // Adds the given interval to the tree. An id can also be given to the interval 31 | // to separate different types of intervals. 32 | func (t *Tree) Add(id, low, high int, val interval.Value) (pos interval.Pos) { 33 | t.root, pos = t.root.add(key{id, low}, high, val, nil) 34 | return pos 35 | } 36 | 37 | // FindLargest returns the largest interval associated with (id, pos). 
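// Here "largest" means the interval with the greatest High endpoint among
// those added with this id at exactly this start position; nil is returned if
// no such interval exists.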
38 | func (t *Tree) FindLargest(id, pos int) interval.Value { 39 | n := t.root.search(key{id, pos}) 40 | if n == nil || len(n.iv.ivs) == 0 { 41 | return nil 42 | } 43 | 44 | var max, maxi int 45 | for i := range n.iv.ivs { 46 | if n.iv.ivs[i].interval.High > max { 47 | max = n.iv.ivs[i].interval.High 48 | maxi = i 49 | } 50 | } 51 | return n.iv.ivs[maxi].value 52 | } 53 | 54 | // RemoveAndShift removes all entries that overlap with [low, high) and then shifts 55 | // all entries greater than low by amt. 56 | func (t *Tree) RemoveAndShift(low, high, amt int) { 57 | t.root = t.root.removeOverlaps(low, high, nil) 58 | if amt != 0 { 59 | t.root.addShift(shift{low, amt}) 60 | } 61 | } 62 | 63 | // Size returns the number of intervals in the tree. 64 | func (t *Tree) Size() int { 65 | return t.root.getSize() 66 | } 67 | 68 | type ivalues struct { 69 | ivs []ivalue 70 | node *node 71 | } 72 | 73 | func (iv *ivalues) Pos() int { 74 | iv.node.applyAllShifts() 75 | return iv.node.key.pos 76 | } 77 | 78 | type ivalue struct { 79 | interval Interval 80 | value interval.Value 81 | } 82 | 83 | // A shift of intervals in the tree. The shift starts at idx and moves 84 | // intervals after idx by amt. Shifts are lazily applied in the tree to avoid 85 | // frequent linear time costs. 86 | type shift struct { 87 | idx int 88 | amt int 89 | } 90 | 91 | type node struct { 92 | key key 93 | max int 94 | iv *ivalues 95 | shifts []shift 96 | 97 | // height counts nodes (not edges) 98 | height int 99 | left *node 100 | right *node 101 | parent *node 102 | } 103 | 104 | func (n *node) addShift(sh shift) { 105 | if n == nil { 106 | return 107 | } 108 | 109 | n.shifts = append(n.shifts, sh) 110 | } 111 | 112 | func (n *node) applyShifts() { 113 | if n == nil { 114 | return 115 | } 116 | for _, sh := range n.shifts { 117 | if n.max >= sh.idx { 118 | if n.key.pos >= sh.idx { 119 | n.key.pos += sh.amt 120 | for i, iv := range n.iv.ivs { 121 | n.iv.ivs[i].interval = iv.interval.Shift(sh.amt) 122 | } 123 | } 124 | n.max += sh.amt 125 | // n.updateMax() 126 | } 127 | 128 | n.left.addShift(sh) 129 | n.right.addShift(sh) 130 | } 131 | n.shifts = nil 132 | } 133 | 134 | func (n *node) add(key key, high int, value interval.Value, parent *node) (*node, *ivalues) { 135 | if n == nil { 136 | n = new(node) 137 | *n = node{ 138 | key: key, 139 | max: high, 140 | iv: &ivalues{ 141 | ivs: []ivalue{{ 142 | interval: Interval{key.pos, high}, 143 | value: value, 144 | }}, 145 | node: n, 146 | }, 147 | height: 1, 148 | left: nil, 149 | right: nil, 150 | parent: parent, 151 | } 152 | return n, n.iv 153 | } 154 | n.applyShifts() 155 | 156 | var iv *ivalues 157 | if key.compare(n.key) < 0 { 158 | n.left, iv = n.left.add(key, high, value, n) 159 | } else if key.compare(n.key) > 0 { 160 | n.right, iv = n.right.add(key, high, value, n) 161 | } else { 162 | // if same key exists update value 163 | n.iv.ivs = append(n.iv.ivs, ivalue{ 164 | interval: Interval{key.pos, high}, 165 | value: value, 166 | }) 167 | iv = n.iv 168 | } 169 | return n.rebalanceTree(parent), iv 170 | } 171 | 172 | func (n *node) calcMax() int { 173 | max := 0 174 | for _, iv := range n.iv.ivs { 175 | if iv.interval.High > max { 176 | max = iv.interval.High 177 | } 178 | } 179 | return max 180 | } 181 | 182 | func (n *node) updateMax() { 183 | if n != nil { 184 | if n.right != nil { 185 | n.max = max(n.max, n.right.max) 186 | } 187 | if n.left != nil { 188 | n.max = max(n.max, n.left.max) 189 | } 190 | n.max = max(n.max, n.calcMax()) 191 | } 192 | } 193 | 194 | func (n 
*node) remove(key key, parent *node) *node { 195 | if n == nil { 196 | return nil 197 | } 198 | n.applyShifts() 199 | if key.compare(n.key) < 0 { 200 | n.left = n.left.remove(key, n) 201 | } else if key.compare(n.key) > 0 { 202 | n.right = n.right.remove(key, n) 203 | } else { 204 | if n.left != nil && n.right != nil { 205 | n.left.applyShifts() 206 | n.right.applyShifts() 207 | // node to delete found with both children; 208 | // replace values with smallest node of the right sub-tree 209 | rightMinNode := n.right.findSmallest() 210 | n.key = rightMinNode.key 211 | n.iv = rightMinNode.iv 212 | n.iv.node = n 213 | n.shifts = rightMinNode.shifts 214 | // delete smallest node that we replaced 215 | n.right = n.right.remove(rightMinNode.key, n) 216 | } else if n.left != nil { 217 | n.left.applyShifts() 218 | // node only has left child 219 | n = n.left 220 | } else if n.right != nil { 221 | n.right.applyShifts() 222 | // node only has right child 223 | n = n.right 224 | } else { 225 | // node has no children 226 | n = nil 227 | return n 228 | } 229 | 230 | } 231 | n.parent = parent 232 | return n.rebalanceTree(parent) 233 | } 234 | 235 | func (n *node) search(key key) *node { 236 | if n == nil { 237 | return nil 238 | } 239 | n.applyShifts() 240 | if key.compare(n.key) < 0 { 241 | return n.left.search(key) 242 | } else if key.compare(n.key) > 0 { 243 | return n.right.search(key) 244 | } else { 245 | return n 246 | } 247 | } 248 | 249 | func (n *node) overlaps(low, high int, result []interval.Value) []interval.Value { 250 | if n == nil { 251 | return result 252 | } 253 | 254 | n.applyShifts() 255 | 256 | if low >= n.max { 257 | return result 258 | } 259 | 260 | result = n.left.overlaps(low, high, result) 261 | 262 | for _, iv := range n.iv.ivs { 263 | if Overlaps(iv.interval, Interval{low, high}) { 264 | result = append(result, iv.value) 265 | } 266 | } 267 | 268 | if high <= n.key.pos { 269 | return result 270 | } 271 | 272 | result = n.right.overlaps(low, high, result) 273 | return result 274 | } 275 | 276 | func (n *node) removeOverlaps(low, high int, parent *node) *node { 277 | if n == nil { 278 | return n 279 | } 280 | 281 | n.applyShifts() 282 | 283 | if low >= n.max { 284 | return n 285 | } 286 | 287 | n.left = n.left.removeOverlaps(low, high, n) 288 | 289 | for i := 0; i < len(n.iv.ivs); { 290 | if Overlaps(n.iv.ivs[i].interval, Interval{low, high}) { 291 | n.iv.ivs[i] = n.iv.ivs[len(n.iv.ivs)-1] 292 | n.iv.ivs[len(n.iv.ivs)-1] = ivalue{} 293 | n.iv.ivs = n.iv.ivs[:len(n.iv.ivs)-1] 294 | } else { 295 | i++ 296 | } 297 | } 298 | 299 | if len(n.iv.ivs) == 0 { 300 | doright := high > n.key.pos 301 | n = n.remove(n.key, parent) 302 | if doright { 303 | return n.removeOverlaps(low, high, parent) 304 | } 305 | return n 306 | } 307 | 308 | if high <= n.key.pos { 309 | return n 310 | } 311 | 312 | n.right = n.right.removeOverlaps(low, high, n) 313 | return n 314 | } 315 | 316 | func (n *node) getHeight() int { 317 | if n == nil { 318 | return 0 319 | } 320 | return n.height 321 | } 322 | 323 | func (n *node) getSize() int { 324 | if n == nil { 325 | return 0 326 | } 327 | return n.left.getSize() + n.right.getSize() + 1 328 | } 329 | 330 | func (n *node) updateHeightAndMax() { 331 | n.height = 1 + max(n.left.getHeight(), n.right.getHeight()) 332 | n.updateMax() 333 | } 334 | 335 | // Checks if node is balanced and rebalance 336 | func (n *node) rebalanceTree(parent *node) *node { 337 | if n == nil { 338 | return n 339 | } 340 | n.updateHeightAndMax() 341 | 342 | // check balance factor and 
rotateLeft if right-heavy and rotateRight if left-heavy 343 | balanceFactor := n.left.getHeight() - n.right.getHeight() 344 | if balanceFactor == -2 { 345 | // check if child is left-heavy and rotateRight first 346 | if n.right.left.getHeight() > n.right.right.getHeight() { 347 | n.right = n.right.rotateRight(n) 348 | } 349 | return n.rotateLeft(parent) 350 | } else if balanceFactor == 2 { 351 | // check if child is right-heavy and rotateLeft first 352 | if n.left.right.getHeight() > n.left.left.getHeight() { 353 | n.left = n.left.rotateLeft(n) 354 | } 355 | return n.rotateRight(parent) 356 | } 357 | return n 358 | } 359 | 360 | // Rotate nodes left to balance node 361 | func (n *node) rotateLeft(newParent *node) *node { 362 | n.applyShifts() 363 | if n.right != nil { 364 | n.right.applyShifts() 365 | } 366 | 367 | newRoot := n.right 368 | n.right = newRoot.left 369 | if newRoot.left != nil { 370 | newRoot.left.parent = n 371 | } 372 | newRoot.left = n 373 | n.parent = newRoot 374 | newRoot.parent = newParent 375 | 376 | n.updateHeightAndMax() 377 | newRoot.updateHeightAndMax() 378 | return newRoot 379 | } 380 | 381 | // Rotate nodes right to balance node 382 | func (n *node) rotateRight(newParent *node) *node { 383 | n.applyShifts() 384 | if n.left != nil { 385 | n.left.applyShifts() 386 | } 387 | 388 | newRoot := n.left 389 | n.left = newRoot.right 390 | if newRoot.right != nil { 391 | newRoot.right.parent = n 392 | } 393 | newRoot.right = n 394 | n.parent = newRoot 395 | newRoot.parent = newParent 396 | 397 | n.updateHeightAndMax() 398 | newRoot.updateHeightAndMax() 399 | return newRoot 400 | } 401 | 402 | // Finds the smallest child (based on the key) for the current node 403 | func (n *node) findSmallest() *node { 404 | if n.left != nil { 405 | n.left.applyShifts() 406 | return n.left.findSmallest() 407 | } else { 408 | return n 409 | } 410 | } 411 | 412 | func (n *node) applyAllShifts() { 413 | if n.parent != nil && n.parent != n { 414 | n.parent.applyAllShifts() 415 | } 416 | n.applyShifts() 417 | } 418 | 419 | func max(a int, b int) int { 420 | if a > b { 421 | return a 422 | } 423 | return b 424 | } 425 | -------------------------------------------------------------------------------- /memo/interval/lazylog/interval.go: -------------------------------------------------------------------------------- 1 | package lazylog 2 | 3 | import "fmt" 4 | 5 | type interval struct { 6 | low, high int 7 | value interface{} 8 | } 9 | 10 | func (i *interval) Low() int { 11 | return i.low 12 | } 13 | 14 | func (i *interval) High() int { 15 | return i.high 16 | } 17 | 18 | func (i *interval) length() int { 19 | return i.High() - i.Low() 20 | } 21 | 22 | func (i *interval) String() string { 23 | return fmt.Sprintf("[%d, %d)", i.low, i.high) 24 | } 25 | 26 | // returns true if i1 overlaps with the interval [low:high) 27 | func overlaps(i1 interval, low, high int) bool { 28 | return i1.Low() <= high && i1.High() >= low 29 | } 30 | -------------------------------------------------------------------------------- /memo/interval/lazylog/tree.go: -------------------------------------------------------------------------------- 1 | // Package lazylog provides an interval tree backed by an AVL tree. In addition, 2 | // the interval tree supports shifting intervals in amortized constant time 3 | // using lazy shifts. 4 | package lazylog 5 | 6 | import ( 7 | intval "github.com/zyedidia/gpeg/memo/interval" 8 | ) 9 | 10 | // ShiftThreshold is the number of shifts to accumulate before applying all 11 | // shifts. 
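// A value of -1 disables the threshold, so shifts are only applied lazily as
// individual nodes are visited.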
12 | const ShiftThreshold = -1 13 | 14 | // A key stores the start position of an interval, and a unique ID if you would 15 | // like to store multiple intervals starting from the same position. The key is 16 | // used for uniquely identifying a particular interval when searching or 17 | // removing from the tree. 18 | type key struct { 19 | pos int 20 | id int 21 | } 22 | 23 | // compare orders keys by pos and then id. 24 | func (k key) compare(other key) int { 25 | if k.pos < other.pos { 26 | return -1 27 | } else if k.pos > other.pos { 28 | return 1 29 | } else if k.id < other.id { 30 | return -1 31 | } else if k.id > other.id { 32 | return 1 33 | } 34 | return 0 35 | } 36 | 37 | // A shift of intervals in the tree. The shift starts at idx and moves 38 | // intervals after idx by amt. Shifts are lazily applied in the tree to avoid 39 | // linear time costs. 40 | type shift struct { 41 | idx int 42 | amt int 43 | tstamp uint64 44 | } 45 | 46 | type Tree struct { 47 | root *node 48 | shifts []shift // list of non-applied shifts 49 | tstamp uint64 // most recent timestamp 50 | } 51 | 52 | // Adds the given interval to the tree. An id should also be given to the 53 | // interval to uniquely identify it if any other intervals begin at the same 54 | // location. 55 | func (t *Tree) Add(id, low, high int, value intval.Value) intval.Pos { 56 | var loc intval.Pos 57 | t.root, loc = t.root.add(t, key{ 58 | pos: low, 59 | id: id, 60 | }, interval{ 61 | low: low, 62 | high: high, 63 | value: value, 64 | }) 65 | return loc 66 | } 67 | 68 | // Search for the interval starting at pos with the given id. Returns nil if no 69 | // such interval exists. 70 | func (t *Tree) FindLargest(id, pos int) intval.Value { 71 | n := t.root.search(key{ 72 | pos: pos, 73 | id: id, 74 | }) 75 | if n != nil { 76 | if len(n.interval.ins) == 0 { 77 | return nil 78 | } 79 | 80 | max := 0 81 | for i, in := range n.interval.ins[1:] { 82 | if in.length() > n.interval.ins[max].length() { 83 | max = i + 1 84 | } 85 | } 86 | 87 | return n.interval.ins[max].value 88 | } 89 | return nil 90 | } 91 | 92 | func (t *Tree) RemoveAndShift(low, high, amt int) { 93 | t.root = t.root.removeOverlaps(low, high) 94 | if amt != 0 { 95 | t.shift(low, amt) 96 | } 97 | } 98 | 99 | func (t *Tree) AllValues() []intval.Value { 100 | var vals []intval.Value 101 | return t.root.allvals(vals) 102 | } 103 | 104 | // Shift all intervals in the tree after idx by amt. The shift idx should not 105 | // lie inside an interval. This could conceivably be implemented, but is not 106 | // currently. If a negative shift is performed, ensure that there is space for 107 | // all intervals to be shifted left without overlapping with another interval. 108 | func (t *Tree) shift(idx, amt int) { 109 | if amt == 0 { 110 | return 111 | } 112 | 113 | t.tstamp++ 114 | t.shifts = append(t.shifts, shift{ 115 | idx: idx, 116 | amt: amt, 117 | tstamp: t.tstamp, 118 | }) 119 | if ShiftThreshold != -1 && len(t.shifts) >= ShiftThreshold { 120 | t.applyAllShifts() 121 | } 122 | } 123 | 124 | func (t *Tree) applyAllShifts() { 125 | t.root.applyAllShifts() 126 | t.shifts = nil 127 | } 128 | 129 | // Size returns the total number of intervals stored in the tree. 
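// Note that intervals which share the same key (position and id) are stored in a single node and therefore contribute one to this count.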
130 | func (t *Tree) Size() int { 131 | return t.root.size() 132 | } 133 | 134 | type node struct { 135 | key key 136 | max int 137 | interval *lazyInterval 138 | tstamp uint64 // timestamp to determine which shifts to apply 139 | tree *Tree 140 | 141 | // height counts nodes (not edges) 142 | height int 143 | left *node 144 | right *node 145 | } 146 | 147 | // Adds a new node 148 | func (n *node) add(tree *Tree, key key, value interval) (*node, *lazyInterval) { 149 | if n == nil { 150 | nn := &node{ 151 | tree: tree, 152 | key: key, 153 | max: value.High(), 154 | height: 1, 155 | left: nil, 156 | right: nil, 157 | tstamp: tree.tstamp, 158 | } 159 | nn.interval = &lazyInterval{ 160 | ins: []interval{value}, 161 | n: nn, 162 | } 163 | return nn, nn.interval 164 | } 165 | n.applyShifts() 166 | 167 | var loc *lazyInterval 168 | if key.compare(n.key) < 0 { 169 | n.left, loc = n.left.add(tree, key, value) 170 | } else if key.compare(n.key) > 0 { 171 | n.right, loc = n.right.add(tree, key, value) 172 | } else { 173 | // if same key exists update value 174 | n.interval.ins = append(n.interval.ins, value) 175 | n.tstamp = tree.tstamp 176 | loc = n.interval 177 | } 178 | return n.rebalanceTree(), loc 179 | } 180 | 181 | func (n *node) updateMax() { 182 | if n != nil { 183 | if n.right != nil { 184 | n.max = max(n.max, n.right.max) 185 | } 186 | if n.left != nil { 187 | n.max = max(n.max, n.left.max) 188 | } 189 | n.max = max(n.max, n.interval.High()) 190 | } 191 | } 192 | 193 | // Removes a node 194 | func (n *node) remove(key key) *node { 195 | if n == nil { 196 | return nil 197 | } 198 | n.applyShifts() 199 | if key.compare(n.key) < 0 { 200 | n.left = n.left.remove(key) 201 | } else if key.compare(n.key) > 0 { 202 | n.right = n.right.remove(key) 203 | } else { 204 | if n.left != nil && n.right != nil { 205 | n.left.applyShifts() 206 | n.right.applyShifts() 207 | // node to delete found with both children; 208 | // replace values with smallest node of the right sub-tree 209 | rightMinNode := n.right.findSmallest() 210 | 211 | n.key = rightMinNode.key 212 | copy(n.interval.ins, rightMinNode.interval.ins) 213 | n.interval.n = n 214 | n.tstamp = rightMinNode.tstamp 215 | // delete smallest node that we replaced 216 | n.right = n.right.remove(rightMinNode.key) 217 | } else if n.left != nil { 218 | n.left.applyShifts() 219 | // node only has left child 220 | n = n.left 221 | } else if n.right != nil { 222 | n.right.applyShifts() 223 | // node only has right child 224 | n = n.right 225 | } else { 226 | // node has no children 227 | n = nil 228 | return n 229 | } 230 | 231 | } 232 | return n.rebalanceTree() 233 | } 234 | 235 | // Searches for a node 236 | func (n *node) search(key key) *node { 237 | if n == nil { 238 | return nil 239 | } 240 | n.applyShifts() 241 | if key.compare(n.key) < 0 { 242 | return n.left.search(key) 243 | } else if key.compare(n.key) > 0 { 244 | return n.right.search(key) 245 | } else { 246 | return n 247 | } 248 | } 249 | 250 | func (n *node) removeOverlaps(low, high int) *node { 251 | if n == nil { 252 | return n 253 | } 254 | 255 | n.applyShifts() 256 | 257 | if low > n.max { 258 | return n 259 | } 260 | 261 | n.left = n.left.removeOverlaps(low, high) 262 | 263 | for i := 0; i < len(n.interval.ins); { 264 | if overlaps(n.interval.ins[i], low, high) { 265 | n.interval.ins[i] = n.interval.ins[len(n.interval.ins)-1] 266 | n.interval.ins[len(n.interval.ins)-1] = interval{} 267 | n.interval.ins = n.interval.ins[:len(n.interval.ins)-1] 268 | } else { 269 | i++ 270 | } 271 | } 272 | 
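	// If every interval stored at this node was removed, delete the node itself; if the removal range may extend past this key, continue removing overlaps in the resulting subtree.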
273 | if len(n.interval.ins) == 0 { 274 | doright := high >= n.key.pos 275 | n = n.remove(n.key) 276 | if doright { 277 | return n.removeOverlaps(low, high) 278 | } 279 | return n 280 | } 281 | 282 | if high < n.key.pos { 283 | return n 284 | } 285 | n.right = n.right.removeOverlaps(low, high) 286 | return n 287 | } 288 | 289 | func (n *node) allvals(vals []intval.Value) []intval.Value { 290 | if n == nil { 291 | return vals 292 | } 293 | 294 | vals = n.left.allvals(vals) 295 | 296 | for _, in := range n.interval.ins { 297 | vals = append(vals, in.value) 298 | } 299 | 300 | vals = n.right.allvals(vals) 301 | 302 | return vals 303 | } 304 | 305 | func (n *node) getHeight() int { 306 | if n == nil { 307 | return 0 308 | } 309 | return n.height 310 | } 311 | 312 | func (n *node) size() int { 313 | if n == nil { 314 | return 0 315 | } 316 | return n.left.size() + n.right.size() + 1 317 | } 318 | 319 | func (n *node) recalculateHeight() { 320 | n.height = 1 + max(n.left.getHeight(), n.right.getHeight()) 321 | } 322 | 323 | // Checks if node is balanced and rebalance 324 | func (n *node) rebalanceTree() *node { 325 | if n == nil { 326 | return n 327 | } 328 | n.recalculateHeight() 329 | n.updateMax() 330 | 331 | // check balance factor and rotateLeft if right-heavy and rotateRight if left-heavy 332 | balanceFactor := n.left.getHeight() - n.right.getHeight() 333 | if balanceFactor <= -2 { 334 | // check if child is left-heavy and rotateRight first 335 | if n.right.left.getHeight() > n.right.right.getHeight() { 336 | n.right = n.right.rotateRight() 337 | } 338 | return n.rotateLeft() 339 | } else if balanceFactor >= 2 { 340 | // check if child is right-heavy and rotateLeft first 341 | if n.left.right.getHeight() > n.left.left.getHeight() { 342 | n.left = n.left.rotateLeft() 343 | } 344 | return n.rotateRight() 345 | } 346 | return n 347 | } 348 | 349 | // Rotate nodes left to balance node 350 | func (n *node) rotateLeft() *node { 351 | n.applyShifts() 352 | if n.right != nil { 353 | n.right.applyShifts() 354 | } 355 | 356 | newRoot := n.right 357 | n.right = newRoot.left 358 | newRoot.left = n 359 | 360 | n.recalculateHeight() 361 | n.updateMax() 362 | newRoot.recalculateHeight() 363 | newRoot.updateMax() 364 | return newRoot 365 | } 366 | 367 | // Rotate nodes right to balance node 368 | func (n *node) rotateRight() *node { 369 | n.applyShifts() 370 | if n.left != nil { 371 | n.left.applyShifts() 372 | } 373 | 374 | newRoot := n.left 375 | n.left = newRoot.right 376 | newRoot.right = n 377 | 378 | n.recalculateHeight() 379 | n.updateMax() 380 | newRoot.recalculateHeight() 381 | newRoot.updateMax() 382 | return newRoot 383 | } 384 | 385 | // Finds the smallest child (based on the key) for the current node 386 | func (n *node) findSmallest() *node { 387 | if n.left != nil { 388 | n.left.applyShifts() 389 | return n.left.findSmallest() 390 | } else { 391 | return n 392 | } 393 | } 394 | 395 | func (n *node) applyShift(s *shift) { 396 | if n.tstamp >= s.tstamp { 397 | // this shift is outdated and we have already applied it 398 | return 399 | } 400 | 401 | n.tstamp = s.tstamp 402 | if n.max < s.idx { 403 | return 404 | } 405 | n.max += s.amt 406 | if n.key.pos >= s.idx { 407 | n.key.pos += s.amt 408 | n.interval.Shift(s.amt) 409 | } 410 | n.updateMax() 411 | } 412 | 413 | func (n *node) applyShifts() { 414 | // optimization: first check if we are completely up-to-date and if so 415 | // there is nothing to do. 
416 | if len(n.tree.shifts) == 0 || n.tstamp >= n.tree.shifts[len(n.tree.shifts)-1].tstamp { 417 | return 418 | } 419 | // optimization: search backwards to find the starting point. Alternatively 420 | // we could binary search? not sure which is faster. 421 | var j int 422 | for j = len(n.tree.shifts) - 1; j > 0; j-- { 423 | if n.tstamp >= n.tree.shifts[j].tstamp { 424 | j = j + 1 425 | break 426 | } 427 | } 428 | for i := range n.tree.shifts[j:] { 429 | n.applyShift(&n.tree.shifts[j+i]) 430 | } 431 | } 432 | 433 | func (n *node) applyAllShifts() { 434 | if n == nil { 435 | return 436 | } 437 | 438 | n.left.applyAllShifts() 439 | n.right.applyAllShifts() 440 | n.applyShifts() 441 | } 442 | 443 | func (n *node) eachNode(fn func(*node)) { 444 | if n == nil { 445 | return 446 | } 447 | 448 | n.left.eachNode(fn) 449 | n.applyShifts() 450 | fn(n) 451 | n.right.eachNode(fn) 452 | } 453 | 454 | type lazyInterval struct { 455 | ins []interval 456 | n *node 457 | } 458 | 459 | func (i *lazyInterval) Pos() int { 460 | i.n.applyShifts() 461 | return i.n.key.pos 462 | } 463 | 464 | func (i *lazyInterval) High() int { 465 | high := 0 466 | for _, in := range i.ins { 467 | if in.High() > high { 468 | high = in.High() 469 | } 470 | } 471 | return high 472 | } 473 | 474 | func (i *lazyInterval) Shift(amt int) { 475 | for j := range i.ins { 476 | i.ins[j].low += amt 477 | i.ins[j].high += amt 478 | } 479 | } 480 | 481 | // Returns max number 482 | func max(a int, b int) int { 483 | if a > b { 484 | return a 485 | } 486 | return b 487 | } 488 | -------------------------------------------------------------------------------- /memo/interval/map.go: -------------------------------------------------------------------------------- 1 | package interval 2 | 3 | type Value interface{} 4 | 5 | type Pos interface { 6 | Pos() int 7 | } 8 | 9 | // An interval map is a key-value data structure that maps intervals to 10 | // values. Every value is associated with an interval [low, high) and an id. 11 | // Values may be looked up, added, removed, and queried for overlapping 12 | // intervals. The tree also supports efficient shifting of intervals via 13 | // a lazy shift propagation mechanism. 14 | type Map interface { 15 | // Returns the value associated with the largest interval at (id, pos). 16 | FindLargest(id, pos int) Value 17 | // Adds a new value with 'id' and interval [low, high). Returns a value 18 | // that can be used to locate the inserted value even after shifts have 19 | // occurred (you may want to associate the Pos with your value). 20 | Add(id, low, high int, val Value) Pos 21 | // Removes all values with intervals that overlap [low, high) and then 22 | // performs a shift of size amt at idx. 23 | RemoveAndShift(low, high, amt int) 24 | // AllValues returns all values in the tree. 25 | AllValues() []Value 26 | // Returns the number of values in the tree. 27 | Size() int 28 | } 29 | -------------------------------------------------------------------------------- /memo/none.go: -------------------------------------------------------------------------------- 1 | package memo 2 | 3 | // NoneTable implements a memoization table that does nothing. 
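// It satisfies the Table interface and can be supplied wherever a Table is expected in order to disable memoization entirely (the re package, for example, parses 're' patterns with a NoneTable).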
4 | type NoneTable struct{} 5 | 6 | // Get always returns 'not found' 7 | func (t NoneTable) Get(id, pos int) (*Entry, bool) { 8 | return nil, false 9 | } 10 | 11 | func (t NoneTable) Put(id, start, length, examined, count int, captures []*Capture) {} 12 | func (t NoneTable) ApplyEdit(e Edit) {} 13 | func (t NoneTable) Overlaps(low, high int) []*Entry { return nil } 14 | func (t NoneTable) Size() int { return 0 } 15 | func (t NoneTable) AllValues() []*Entry { return nil } 16 | -------------------------------------------------------------------------------- /memo/table.go: -------------------------------------------------------------------------------- 1 | package memo 2 | 3 | // A Table is an interface for a memoization table data structure. The 4 | // memoization table tracks memoized parse results corresponding to a 5 | // non-terminal parsed at a certain location. The table interface defines the 6 | // ApplyEdit function which is crucial for incremental parsing. 7 | type Table interface { 8 | // Get returns the entry associated with the given position and ID. If 9 | // there are multiple entries with the same ID at that position, the 10 | // largest entry is returned (determined by matched length). 11 | Get(id, pos int) (*Entry, bool) 12 | 13 | // Put adds a new entry to the table. 14 | Put(id, start, length, examined, count int, captures []*Capture) 15 | 16 | // ApplyEdit updates the table as necessary when an edit occurs. This 17 | // operation invalidates all entries within the range of the edit and 18 | // shifts entries that are to the right of the edit as necessary. 19 | ApplyEdit(Edit) 20 | 21 | AllValues() []*Entry 22 | 23 | // Size returns the number of entries in the table. 24 | Size() int 25 | } 26 | -------------------------------------------------------------------------------- /memo/tree.go: -------------------------------------------------------------------------------- 1 | package memo 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/zyedidia/gpeg/memo/interval" 7 | "github.com/zyedidia/gpeg/memo/interval/lazylog" 8 | ) 9 | 10 | // TreeTable implements a memoization table using an interval tree (augmented 11 | // to support efficient shifting). 
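// Entries whose examined length is smaller than the configured threshold are not stored at all (see Put), so very small parse results are never memoized.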
12 | type TreeTable struct { 13 | interval.Map 14 | threshold int 15 | lock sync.Mutex 16 | } 17 | 18 | func NewTreeTable(threshold int) *TreeTable { 19 | return &TreeTable{ 20 | Map: &lazylog.Tree{}, 21 | threshold: threshold, 22 | } 23 | } 24 | 25 | func (t *TreeTable) Get(id, pos int) (*Entry, bool) { 26 | t.lock.Lock() 27 | entry := t.Map.FindLargest(id, pos) 28 | t.lock.Unlock() 29 | e, ok := entry.(*Entry) 30 | return e, ok 31 | } 32 | 33 | func (t *TreeTable) Put(id, start, length, examined, count int, captures []*Capture) { 34 | if examined < t.threshold || length == 0 { 35 | return 36 | } 37 | 38 | examined = max(examined, length) 39 | 40 | e := &Entry{ 41 | length: length, 42 | examined: examined, 43 | count: count, 44 | captures: captures, 45 | } 46 | t.lock.Lock() 47 | e.setPos(t.Map.Add(id, start, start+examined, e)) 48 | t.lock.Unlock() 49 | } 50 | 51 | func (t *TreeTable) ApplyEdit(e Edit) { 52 | low, high := e.Start, e.End 53 | if low == high { 54 | high = low + 1 55 | } 56 | amt := e.Len - (e.End - e.Start) 57 | 58 | t.lock.Lock() 59 | t.Map.RemoveAndShift(low, high, amt) 60 | t.lock.Unlock() 61 | } 62 | 63 | func (t *TreeTable) AllValues() []*Entry { 64 | vals := t.Map.AllValues() 65 | entries := make([]*Entry, len(vals)) 66 | for i, v := range vals { 67 | entries[i] = v.(*Entry) 68 | } 69 | return entries 70 | } 71 | 72 | func max(a, b int) int { 73 | if a > b { 74 | return a 75 | } 76 | return b 77 | } 78 | -------------------------------------------------------------------------------- /pattern/compile.go: -------------------------------------------------------------------------------- 1 | package pattern 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/zyedidia/gpeg/charset" 7 | "github.com/zyedidia/gpeg/isa" 8 | ) 9 | 10 | // A NotFoundError means a a non-terminal was not found during grammar 11 | // compilation. 12 | type NotFoundError struct { 13 | Name string 14 | } 15 | 16 | // Error returns the error message. 17 | func (e *NotFoundError) Error() string { return "non-terminal " + e.Name + ": not found" } 18 | 19 | // Compile takes an input pattern and returns the result of compiling it into a 20 | // parsing program, and optimizing the program. 21 | func Compile(p Pattern) (isa.Program, error) { 22 | c, err := p.Compile() 23 | if err != nil { 24 | return nil, err 25 | } 26 | 27 | Optimize(c) 28 | return c, nil 29 | } 30 | 31 | // MustCompile is the same as Compile but panics if there is an error during 32 | // compilation. 33 | func MustCompile(p Pattern) isa.Program { 34 | c, err := Compile(p) 35 | if err != nil { 36 | panic(err) 37 | } 38 | return c 39 | } 40 | 41 | // openCall is a dummy instruction for resolving recursive function calls in 42 | // grammars. 43 | type openCall struct { 44 | name string 45 | isa.Nop 46 | } 47 | 48 | func (i openCall) String() string { 49 | return fmt.Sprintf("OpenCall %v", i.name) 50 | } 51 | 52 | // Compile this node. 
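// In the general case an alternation `p1 / p2` compiles to
//
//	Choice L1
//	<p1>
//	Commit L2
//	L1:
//	<p2>
//	L2:
//
// with a single charset instruction emitted instead when both alternatives can be combined into one set, and with the TestCharNoChoice/TestSetNoChoice head-fail variants used when the two alternatives are provably disjoint.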
53 | func (p *AltNode) Compile() (isa.Program, error) { 54 | // optimization: if Left and Right are charsets/single chars, return the union 55 | set, ok := combine(Get(p.Left), Get(p.Right)) 56 | if ok { 57 | return isa.Program{ 58 | isa.Set{Chars: set}, 59 | }, nil 60 | } 61 | 62 | l, err1 := Get(p.Left).Compile() 63 | r, err2 := Get(p.Right).Compile() 64 | if err1 != nil { 65 | return nil, err1 66 | } 67 | if err2 != nil { 68 | return nil, err2 69 | } 70 | 71 | L1 := isa.NewLabel() 72 | 73 | // optimization: if the right and left nodes are disjoint, we can use 74 | // NoChoice variants of the head-fail optimization instructions. 75 | var disjoint bool 76 | var testinsn isa.Insn 77 | linsn, okl := nextInsn(l) 78 | rinsn, okr := nextInsn(r) 79 | if okl && okr { 80 | switch lt := linsn.(type) { 81 | case isa.Set: 82 | switch rt := rinsn.(type) { 83 | case isa.Char: 84 | disjoint = !lt.Chars.Has(rt.Byte) 85 | } 86 | testinsn = isa.TestSetNoChoice{Chars: lt.Chars, Lbl: L1} 87 | case isa.Char: 88 | switch rt := rinsn.(type) { 89 | case isa.Char: 90 | disjoint = lt.Byte != rt.Byte 91 | case isa.Set: 92 | disjoint = !rt.Chars.Has(lt.Byte) 93 | } 94 | testinsn = isa.TestCharNoChoice{Byte: lt.Byte, Lbl: L1} 95 | } 96 | } 97 | 98 | L2 := isa.NewLabel() 99 | code := make(isa.Program, 0, len(l)+len(r)+5) 100 | if disjoint { 101 | code = append(code, testinsn) 102 | code = append(code, l[1:]...) 103 | code = append(code, isa.Jump{Lbl: L2}) 104 | } else { 105 | code = append(code, isa.Choice{Lbl: L1}) 106 | code = append(code, l...) 107 | code = append(code, isa.Commit{Lbl: L2}) 108 | } 109 | code = append(code, L1) 110 | code = append(code, r...) 111 | code = append(code, L2) 112 | return code, nil 113 | } 114 | 115 | // Compile this node. 116 | func (p *SeqNode) Compile() (isa.Program, error) { 117 | l, err1 := Get(p.Left).Compile() 118 | r, err2 := Get(p.Right).Compile() 119 | if err1 != nil { 120 | return nil, err1 121 | } 122 | if err2 != nil { 123 | return nil, err2 124 | } 125 | 126 | return append(l, r...), nil 127 | } 128 | 129 | // Compile this node. 130 | func (p *StarNode) Compile() (isa.Program, error) { 131 | switch t := Get(p.Patt).(type) { 132 | case *ClassNode: 133 | // optimization: repeating a charset uses the dedicated instruction 'span' 134 | return isa.Program{ 135 | isa.Span{Chars: t.Chars}, 136 | }, nil 137 | case *MemoNode: 138 | // optimization: if the pattern we are repeating is a memoization 139 | // entry, we should use special instructions to memoize it as a tree to 140 | // get logarithmic saving when reparsing. 141 | sub, err := Get(t.Patt).Compile() 142 | code := make(isa.Program, 0, len(sub)+7) 143 | L1 := isa.NewLabel() 144 | L2 := isa.NewLabel() 145 | L3 := isa.NewLabel() 146 | NoJump := isa.NewLabel() 147 | memoId++ 148 | 149 | code = append(code, L1) 150 | code = append(code, isa.MemoTreeOpen{Id: memoId, Lbl: L3}) 151 | code = append(code, isa.Choice{Lbl: L2}) 152 | code = append(code, sub...) 
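		// On success: pop the choice frame (Commit to the next instruction), record the match with MemoTreeInsert, and jump back to L1 for the next repetition; a failure inside the subpattern lands at L2, where the open memo tree entries are closed.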
153 | code = append(code, isa.Commit{Lbl: NoJump}) 154 | code = append(code, NoJump) 155 | code = append(code, isa.MemoTreeInsert{}) 156 | code = append(code, L3) 157 | code = append(code, isa.MemoTree{}) 158 | code = append(code, isa.Jump{Lbl: L1}) 159 | code = append(code, L2) 160 | code = append(code, isa.MemoTreeClose{Id: memoId}) 161 | return code, err 162 | } 163 | 164 | sub, err := Get(p.Patt).Compile() 165 | code := make(isa.Program, 0, len(sub)+4) 166 | 167 | L1 := isa.NewLabel() 168 | L2 := isa.NewLabel() 169 | code = append(code, isa.Choice{Lbl: L2}) 170 | code = append(code, L1) 171 | code = append(code, sub...) 172 | code = append(code, isa.PartialCommit{Lbl: L1}) 173 | code = append(code, L2) 174 | return code, err 175 | } 176 | 177 | // Compile this node. 178 | func (p *PlusNode) Compile() (isa.Program, error) { 179 | starp := Star(Get(p.Patt)) 180 | star, err1 := starp.Compile() 181 | sub, err2 := Get(p.Patt).Compile() 182 | if err1 != nil { 183 | return nil, err1 184 | } 185 | if err2 != nil { 186 | return nil, err2 187 | } 188 | 189 | code := make(isa.Program, 0, len(sub)+len(star)) 190 | code = append(code, sub...) 191 | code = append(code, star...) 192 | return code, nil 193 | } 194 | 195 | // Compile this node. 196 | func (p *OptionalNode) Compile() (isa.Program, error) { 197 | // optimization: if the pattern is a class node or single char literal, we 198 | // can use the Test*NoChoice instructions. 199 | switch t := Get(p.Patt).(type) { 200 | case *LiteralNode: 201 | if len(t.Str) == 1 { 202 | L1 := isa.NewLabel() 203 | return isa.Program{ 204 | isa.TestCharNoChoice{Byte: t.Str[0], Lbl: L1}, 205 | L1, 206 | }, nil 207 | } 208 | case *ClassNode: 209 | L1 := isa.NewLabel() 210 | prog := isa.Program{ 211 | isa.TestSetNoChoice{Chars: t.Chars, Lbl: L1}, 212 | L1, 213 | } 214 | return prog, nil 215 | } 216 | 217 | a := AltNode{ 218 | Left: Get(p.Patt), 219 | Right: &EmptyNode{}, 220 | } 221 | return a.Compile() 222 | } 223 | 224 | // Compile this node. 225 | func (p *NotNode) Compile() (isa.Program, error) { 226 | sub, err := Get(p.Patt).Compile() 227 | L1 := isa.NewLabel() 228 | code := make(isa.Program, 0, len(sub)+3) 229 | code = append(code, isa.Choice{Lbl: L1}) 230 | code = append(code, sub...) 231 | code = append(code, isa.FailTwice{}) 232 | code = append(code, L1) 233 | return code, err 234 | } 235 | 236 | // Compile this node. 237 | func (p *AndNode) Compile() (isa.Program, error) { 238 | sub, err := Get(p.Patt).Compile() 239 | code := make(isa.Program, 0, len(sub)+5) 240 | L1 := isa.NewLabel() 241 | L2 := isa.NewLabel() 242 | 243 | code = append(code, isa.Choice{Lbl: L1}) 244 | code = append(code, sub...) 245 | code = append(code, isa.BackCommit{Lbl: L2}) 246 | code = append(code, L1) 247 | code = append(code, isa.Fail{}) 248 | code = append(code, L2) 249 | return code, err 250 | } 251 | 252 | // Compile this node. 253 | func (p *CapNode) Compile() (isa.Program, error) { 254 | sub, err := Get(p.Patt).Compile() 255 | if err != nil { 256 | return nil, err 257 | } 258 | code := make(isa.Program, 0, len(sub)+2) 259 | 260 | i := 0 261 | back := 0 262 | loop: 263 | for _, insn := range sub { 264 | switch t := insn.(type) { 265 | case isa.Char, isa.Set: 266 | back++ 267 | case isa.Any: 268 | back += int(t.N) 269 | default: 270 | break loop 271 | } 272 | i++ 273 | } 274 | 275 | if i == 0 || back >= 256 { 276 | code = append(code, isa.CaptureBegin{Id: p.Id}) 277 | i = 0 278 | } else if i == len(sub) && back < 256 { 279 | code = append(code, sub...) 
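		// The whole subpattern is a fixed-size run of byte-consuming instructions shorter than 256 bytes, so a single CaptureFull emitted after it can record the capture by backing up 'back' bytes.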
280 | code = append(code, isa.CaptureFull{Back: byte(back), Id: p.Id}) 281 | return code, nil 282 | } else { 283 | code = append(code, sub[:i]...) 284 | code = append(code, isa.CaptureLate{Back: byte(back), Id: p.Id}) 285 | } 286 | code = append(code, sub[i:]...) 287 | code = append(code, isa.CaptureEnd{}) 288 | return code, nil 289 | } 290 | 291 | // Compile this node. 292 | func (p *MemoNode) Compile() (isa.Program, error) { 293 | L1 := isa.NewLabel() 294 | sub, err := Get(p.Patt).Compile() 295 | code := make(isa.Program, 0, len(sub)+3) 296 | code = append(code, isa.MemoOpen{Lbl: L1, Id: p.Id}) 297 | code = append(code, sub...) 298 | code = append(code, isa.MemoClose{}) 299 | code = append(code, L1) 300 | return code, err 301 | } 302 | 303 | // Compile this node. 304 | func (p *CheckNode) Compile() (isa.Program, error) { 305 | L1 := isa.NewLabel() 306 | sub, err := Get(p.Patt).Compile() 307 | code := make(isa.Program, 0, len(sub)+3) 308 | code = append(code, isa.CheckBegin{ 309 | Id: p.Id, 310 | Flag: p.Flag, 311 | }) 312 | code = append(code, sub...) 313 | code = append(code, isa.CheckEnd{Checker: p.Checker}) 314 | code = append(code, L1) 315 | return code, err 316 | } 317 | 318 | // Compile this node. 319 | func (p *SearchNode) Compile() (isa.Program, error) { 320 | var rsearch Pattern 321 | var set charset.Set 322 | opt := false 323 | 324 | sub, err := Get(p.Patt).Compile() 325 | if err != nil { 326 | return nil, err 327 | } 328 | 329 | next, ok := nextInsn(sub) 330 | if ok { 331 | switch t := next.(type) { 332 | case isa.Char: 333 | set = charset.New([]byte{t.Byte}).Complement() 334 | opt = true 335 | case isa.Set: 336 | // Heuristic: if the set is smaller than 10 chars, it 337 | // is unlikely enough to match that we should consume all 338 | // chars from the complement before continuing the search. 339 | // The number 10 was arbitrarily chosen. 340 | if t.Chars.Size() < 10 { 341 | set = t.Chars.Complement() 342 | opt = true 343 | } 344 | } 345 | } 346 | 347 | if opt { 348 | rsearch = Concat(Star(Set(set)), NonTerm("S")) 349 | } else { 350 | rsearch = NonTerm("S") 351 | } 352 | 353 | return Grammar("S", map[string]Pattern{ 354 | "S": Or(Get(p.Patt), Concat(Any(1), rsearch)), 355 | }).Compile() 356 | } 357 | 358 | // Compile this node. 359 | func (p *EmptyOpNode) Compile() (isa.Program, error) { 360 | return isa.Program{ 361 | isa.Empty{ 362 | Op: p.Op, 363 | }, 364 | }, nil 365 | } 366 | 367 | // Compile this node. 368 | func (p *GrammarNode) Compile() (isa.Program, error) { 369 | p.Inline() 370 | 371 | used := make(map[string]bool) 372 | for _, v := range p.Defs { 373 | WalkPattern(v, true, func(sub Pattern) { 374 | switch t := sub.(type) { 375 | case *NonTermNode: 376 | if t.Inlined == nil { 377 | used[t.Name] = true 378 | } 379 | } 380 | }) 381 | } 382 | 383 | if len(used) == 0 { 384 | return p.Defs[p.Start].Compile() 385 | } 386 | 387 | code := make(isa.Program, 0) 388 | LEnd := isa.NewLabel() 389 | code = append(code, openCall{name: p.Start}, isa.Jump{Lbl: LEnd}) 390 | 391 | labels := make(map[string]isa.Label) 392 | for k, v := range p.Defs { 393 | if k != p.Start && !used[k] { 394 | continue 395 | } 396 | label := isa.NewLabel() 397 | labels[k] = label 398 | fn, err := v.Compile() 399 | if err != nil { 400 | return nil, err 401 | } 402 | code = append(code, label) 403 | code = append(code, fn...) 
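		// Each non-terminal's code ends with a Return so it can be entered with Call; the resolution pass below rewrites Call+Return pairs into Jumps (tail call optimization).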
404 | code = append(code, isa.Return{}) 405 | } 406 | 407 | // resolve calls to openCall and do tail call optimization 408 | for i := 0; i < len(code); i++ { 409 | insn := code[i] 410 | if oc, ok := insn.(openCall); ok { 411 | lbl, ok := labels[oc.name] 412 | if !ok { 413 | return nil, &NotFoundError{ 414 | Name: oc.name, 415 | } 416 | } 417 | 418 | // replace this placeholder instruction with a normal call 419 | var replace isa.Insn = isa.Call{Lbl: lbl} 420 | // if a call is immediately followed by a return, optimize to 421 | // a jump for tail call optimization. 422 | next, ok := nextInsn(code[i+1:]) 423 | if ok { 424 | switch next.(type) { 425 | case isa.Return: 426 | replace = isa.Jump{Lbl: lbl} 427 | // remove the return instruction if there is no label referring to it 428 | retidx, hadlbl := nextInsnLabel(code[i+1:]) 429 | if !hadlbl { 430 | code[i+1+retidx] = isa.Nop{} 431 | } 432 | } 433 | } 434 | 435 | // perform the replacement of the opencall by either a call or jump 436 | code[i] = replace 437 | } 438 | } 439 | 440 | code = append(code, LEnd) 441 | 442 | return code, nil 443 | } 444 | 445 | // Compile this node. 446 | func (p *ClassNode) Compile() (isa.Program, error) { 447 | return isa.Program{ 448 | isa.Set{Chars: p.Chars}, 449 | }, nil 450 | } 451 | 452 | // Compile this node. 453 | func (p *LiteralNode) Compile() (isa.Program, error) { 454 | code := make(isa.Program, len(p.Str)) 455 | for i := 0; i < len(p.Str); i++ { 456 | code[i] = isa.Char{Byte: p.Str[i]} 457 | } 458 | return code, nil 459 | } 460 | 461 | // Compile this node. 462 | func (p *NonTermNode) Compile() (isa.Program, error) { 463 | if p.Inlined != nil { 464 | return p.Inlined.Compile() 465 | } 466 | return isa.Program{ 467 | openCall{name: p.Name}, 468 | }, nil 469 | } 470 | 471 | // Compile this node. 472 | func (p *DotNode) Compile() (isa.Program, error) { 473 | return isa.Program{ 474 | isa.Any{N: p.N}, 475 | }, nil 476 | } 477 | 478 | // Compile this node. 479 | func (p *ErrorNode) Compile() (isa.Program, error) { 480 | var recovery isa.Program 481 | var err error 482 | 483 | if p.Recover == nil { 484 | recovery = isa.Program{ 485 | isa.End{Fail: true}, 486 | } 487 | } else { 488 | recovery, err = Get(p.Recover).Compile() 489 | } 490 | 491 | code := make(isa.Program, 0, len(recovery)+1) 492 | code = append(code, isa.Error{Message: p.Message}) 493 | code = append(code, recovery...) 494 | return code, err 495 | } 496 | 497 | // Compile this node. 498 | func (p *EmptyNode) Compile() (isa.Program, error) { 499 | return isa.Program{}, nil 500 | } 501 | -------------------------------------------------------------------------------- /pattern/nodes.go: -------------------------------------------------------------------------------- 1 | package pattern 2 | 3 | import ( 4 | "regexp/syntax" 5 | 6 | "github.com/zyedidia/gpeg/charset" 7 | "github.com/zyedidia/gpeg/isa" 8 | ) 9 | 10 | // A Pattern is an object that can be compiled into a parsing program. 11 | type Pattern interface { 12 | Compile() (isa.Program, error) 13 | } 14 | 15 | // AltNode is the binary operator for alternation. 16 | type AltNode struct { 17 | Left, Right Pattern 18 | } 19 | 20 | // SeqNode is the binary operator for sequences. 21 | type SeqNode struct { 22 | Left, Right Pattern 23 | } 24 | 25 | // StarNode is the operator for the Kleene star. 26 | type StarNode struct { 27 | Patt Pattern 28 | } 29 | 30 | // PlusNode is the operator for the Kleene plus. 
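// A PlusNode compiles to its subpattern followed by the Kleene star of that subpattern (p p*).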
31 | type PlusNode struct { 32 | Patt Pattern 33 | } 34 | 35 | // OptionalNode is the operator for making a pattern optional. 36 | type OptionalNode struct { 37 | Patt Pattern 38 | } 39 | 40 | // NotNode is the not predicate. 41 | type NotNode struct { 42 | Patt Pattern 43 | } 44 | 45 | // AndNode is the and predicate. 46 | type AndNode struct { 47 | Patt Pattern 48 | } 49 | 50 | // CapNode marks a pattern to be captured with a certain ID. 51 | type CapNode struct { 52 | Patt Pattern 53 | Id int 54 | } 55 | 56 | // MemoNode marks a pattern to be memoized with a certain ID. 57 | type MemoNode struct { 58 | Patt Pattern 59 | Id int 60 | } 61 | 62 | // CheckNode marks a pattern to be checked by a certain checker. 63 | type CheckNode struct { 64 | Patt Pattern 65 | Checker isa.Checker 66 | Id, Flag int 67 | } 68 | 69 | // GrammarNode represents a grammar of non-terminals and their associated 70 | // patterns. The Grammar must also have an entry non-terminal. 71 | type GrammarNode struct { 72 | Defs map[string]Pattern 73 | Start string 74 | } 75 | 76 | // SearchNode represents a search for a certain pattern. 77 | type SearchNode struct { 78 | Patt Pattern 79 | } 80 | 81 | // RepeatNode represents the repetition of a pattern a constant number of 82 | // times. 83 | type RepeatNode struct { 84 | Patt Pattern 85 | N int 86 | } 87 | 88 | // ClassNode represents a character set. 89 | type ClassNode struct { 90 | Chars charset.Set 91 | } 92 | 93 | // LiteralNode represents a literal string. 94 | type LiteralNode struct { 95 | Str string 96 | } 97 | 98 | // NonTermNode represents the use of a non-terminal. If this non-terminal is 99 | // inlined during compilation, the `Inlined` field will point to the pattern 100 | // that is inlined. 101 | type NonTermNode struct { 102 | Name string 103 | Inlined Pattern 104 | } 105 | 106 | // DotNode represents the pattern to match any byte. 107 | type DotNode struct { 108 | N uint8 109 | } 110 | 111 | // ErrorNode represents a pattern that fails with a certain error message. 112 | type ErrorNode struct { 113 | Message string 114 | Recover Pattern 115 | } 116 | 117 | // EmptyOpNode is a node that performs a zero-width assertion. 118 | type EmptyOpNode struct { 119 | Op syntax.EmptyOp 120 | } 121 | 122 | // EmptyNode represents the empty pattern. 123 | type EmptyNode struct { 124 | } 125 | 126 | // WalkFunc is a function that takes a pattern. 127 | type WalkFunc func(sub Pattern) 128 | 129 | // CountSubPatterns returns the number of subpatterns that exist in the given 130 | // pattern. 131 | func CountSubPatterns(p Pattern) int { 132 | count := 0 133 | WalkPattern(p, true, func(sub Pattern) { 134 | count++ 135 | }) 136 | return count 137 | } 138 | 139 | // WalkPattern calls fn for every subpattern contained in p. If followInline 140 | // is true, WalkPattern will walk over inlined patterns as well.
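// The traversal is pre-order: fn is invoked on a pattern before any of its subpatterns.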
141 | func WalkPattern(p Pattern, followInline bool, fn WalkFunc) { 142 | fn(p) 143 | switch t := p.(type) { 144 | case *AltNode: 145 | WalkPattern(t.Left, followInline, fn) 146 | WalkPattern(t.Right, followInline, fn) 147 | case *SeqNode: 148 | WalkPattern(t.Left, followInline, fn) 149 | WalkPattern(t.Right, followInline, fn) 150 | case *StarNode: 151 | WalkPattern(t.Patt, followInline, fn) 152 | case *PlusNode: 153 | WalkPattern(t.Patt, followInline, fn) 154 | case *OptionalNode: 155 | WalkPattern(t.Patt, followInline, fn) 156 | case *NotNode: 157 | WalkPattern(t.Patt, followInline, fn) 158 | case *AndNode: 159 | WalkPattern(t.Patt, followInline, fn) 160 | case *CapNode: 161 | WalkPattern(t.Patt, followInline, fn) 162 | case *MemoNode: 163 | WalkPattern(t.Patt, followInline, fn) 164 | case *SearchNode: 165 | WalkPattern(t.Patt, followInline, fn) 166 | case *CheckNode: 167 | WalkPattern(t.Patt, followInline, fn) 168 | case *ErrorNode: 169 | WalkPattern(t.Recover, followInline, fn) 170 | case *GrammarNode: 171 | for _, p := range t.Defs { 172 | WalkPattern(p, followInline, fn) 173 | } 174 | case *NonTermNode: 175 | if t.Inlined != nil && followInline { 176 | WalkPattern(t.Inlined, followInline, fn) 177 | } 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /pattern/optimize.go: -------------------------------------------------------------------------------- 1 | package pattern 2 | 3 | import ( 4 | "github.com/zyedidia/gpeg/charset" 5 | "github.com/zyedidia/gpeg/isa" 6 | ) 7 | 8 | // Nodes with trees larger than this size will not be inlined. 9 | var InlineThreshold = 100 10 | 11 | // Inline performs inlining passes until the inliner reaches a steady-state. 12 | func (p *GrammarNode) Inline() { 13 | for p.inline() { 14 | } 15 | } 16 | 17 | // Get returns a possibly optimized version of this pattern. Always use this 18 | // function to read a pattern, especially if you will be using the types of the 19 | // underlying nodes. This function performs optimizations like collapsing an 20 | // alternation of two class nodes into one class node. 21 | func Get(p Pattern) Pattern { 22 | switch t := p.(type) { 23 | case *NonTermNode: 24 | // Return the inlined pattern for a non-terminal that has been inlined. 25 | if t.Inlined != nil { 26 | return t.Inlined 27 | } 28 | case *AltNode: 29 | l, r := Get(t.Left), Get(t.Right) 30 | if n, emptyL := l.(*EmptyNode); emptyL { 31 | return n 32 | } 33 | if _, emptyR := r.(*EmptyNode); emptyR { 34 | return Get(Optional(l)) 35 | } 36 | 37 | // Combine the left and right sides of an alternation into a class node 38 | // if possible. 39 | set, ok := combine(l, r) 40 | if ok { 41 | return &ClassNode{Chars: set} 42 | } 43 | case *OptionalNode: 44 | // Optional of a Kleene star is unnecessary and we can remove the 45 | // optional. 46 | star, ok := Get(t.Patt).(*StarNode) 47 | if ok { 48 | return star 49 | } 50 | case *SeqNode: 51 | // optimize use of empty: `a ""` and `"" a` are just `a`. 52 | l, r := Get(t.Left), Get(t.Right) 53 | if _, emptyR := r.(*EmptyNode); emptyR { 54 | return l 55 | } 56 | if _, emptyL := l.(*EmptyNode); emptyL { 57 | return r 58 | } 59 | 60 | // This optimizes patterns like `![a-z] .`. Instead of using a not 61 | // predicate in this case, we can just complement the set and use a 62 | // class node. 
63 | nn, okl := l.(*NotNode) 64 | if !okl { 65 | break 66 | } 67 | 68 | var set charset.Set 69 | switch lt := Get(nn.Patt).(type) { 70 | case *LiteralNode: 71 | if len(lt.Str) != 1 { 72 | return p 73 | } 74 | set = charset.New([]byte{lt.Str[0]}) 75 | case *ClassNode: 76 | set = lt.Chars 77 | default: 78 | return p 79 | } 80 | 81 | switch rt := r.(type) { 82 | case *DotNode: 83 | if rt.N == 1 { 84 | return &ClassNode{ 85 | Chars: set.Complement(), 86 | } 87 | } 88 | case *ClassNode: 89 | return &ClassNode{ 90 | Chars: rt.Chars.Sub(set), 91 | } 92 | case *LiteralNode: 93 | if len(rt.Str) == 1 { 94 | return &ClassNode{ 95 | Chars: charset.New([]byte{rt.Str[0]}).Sub(set), 96 | } 97 | } 98 | } 99 | } 100 | return p 101 | } 102 | 103 | // Performs inlining on a grammar node. 104 | func (p *GrammarNode) inline() bool { 105 | sizes := make(map[string]int) 106 | leaves := make(map[string]bool) 107 | for n, sub := range p.Defs { 108 | size := 0 109 | leaf := true 110 | WalkPattern(sub, true, func(s Pattern) { 111 | switch t := s.(type) { 112 | case *NonTermNode: 113 | if t.Inlined == nil { 114 | leaf = false 115 | } 116 | } 117 | size++ 118 | }) 119 | sizes[n] = size 120 | leaves[n] = leaf 121 | } 122 | 123 | didInline := false 124 | WalkPattern(p, true, func(sub Pattern) { 125 | switch t := sub.(type) { 126 | case *NonTermNode: 127 | if sz, ok := sizes[t.Name]; ok && t.Inlined == nil { 128 | // We only inline nodes if they are small enough and don't use 129 | // any non-terminals themselves. 130 | if sz < InlineThreshold && leaves[t.Name] { 131 | didInline = true 132 | t.Inlined = p.Defs[t.Name] 133 | } 134 | } 135 | } 136 | }) 137 | return didInline 138 | } 139 | 140 | // If the bytes matched by p1 and p2 can be matched by a single charset, then 141 | // that single combined charset is returned. 142 | func combine(p1 Pattern, p2 Pattern) (charset.Set, bool) { 143 | var set charset.Set 144 | switch t1 := p1.(type) { 145 | case *LiteralNode: 146 | if len(t1.Str) != 1 { 147 | return set, false 148 | } 149 | switch t2 := p2.(type) { 150 | case *ClassNode: 151 | return t2.Chars.Add(charset.New([]byte{t1.Str[0]})), true 152 | case *LiteralNode: 153 | if len(t2.Str) != 1 { 154 | return set, false 155 | } 156 | return charset.New([]byte{t1.Str[0], t2.Str[0]}), true 157 | } 158 | case *ClassNode: 159 | switch t2 := p2.(type) { 160 | case *ClassNode: 161 | return t2.Chars.Add(t1.Chars), true 162 | case *LiteralNode: 163 | if len(t2.Str) != 1 { 164 | return set, false 165 | } 166 | return t1.Chars.Add(charset.New([]byte{t2.Str[0]})), true 167 | } 168 | } 169 | return set, false 170 | } 171 | 172 | // Returns the next instruction in p, skipping labels and nops. 173 | // If false is returned, there is no next instruction. 174 | func nextInsn(p isa.Program) (isa.Insn, bool) { 175 | for i := 0; i < len(p); i++ { 176 | switch p[i].(type) { 177 | case isa.Label, isa.Nop: 178 | continue 179 | default: 180 | return p[i], true 181 | } 182 | } 183 | 184 | return isa.Nop{}, false 185 | } 186 | 187 | // Returns the index of the next instruction and if there was a label before 188 | // it. 189 | func nextInsnLabel(p isa.Program) (int, bool) { 190 | hadLabel := false 191 | for i := 0; i < len(p); i++ { 192 | switch p[i].(type) { 193 | case isa.Nop: 194 | continue 195 | case isa.Label: 196 | hadLabel = true 197 | default: 198 | return i, hadLabel 199 | } 200 | } 201 | 202 | return -1, hadLabel 203 | } 204 | 205 | // Optimize performs some optimization passes on the code in p. 
In particular 206 | // it performs head-fail optimization and jump replacement. 207 | func Optimize(p isa.Program) { 208 | // map from label to index in code 209 | labels := make(map[isa.Label]int) 210 | for i, insn := range p { 211 | switch l := insn.(type) { 212 | case isa.Label: 213 | labels[l] = i 214 | } 215 | } 216 | 217 | for i, insn := range p { 218 | // head-fail optimization: if we find a choice instruction immediately 219 | // followed (no label) by Char/Set/Any, we can replace with the 220 | // dedicated instruction TestChar/TestSet/TestAny. 221 | if ch, ok := insn.(isa.Choice); ok && i < len(p)-1 { 222 | next := p[i+1] 223 | switch t := next.(type) { 224 | case isa.Char: 225 | p[i] = isa.TestChar{ 226 | Byte: t.Byte, 227 | Lbl: ch.Lbl, 228 | } 229 | p[i+1] = isa.Nop{} 230 | case isa.Set: 231 | p[i] = isa.TestSet{ 232 | Chars: t.Chars, 233 | Lbl: ch.Lbl, 234 | } 235 | p[i+1] = isa.Nop{} 236 | case isa.Any: 237 | p[i] = isa.TestAny{ 238 | N: t.N, 239 | Lbl: ch.Lbl, 240 | } 241 | p[i+1] = isa.Nop{} 242 | } 243 | } 244 | 245 | // jump optimization: if we find a jump to another control flow 246 | // instruction, we can replace the current jump directly with the 247 | // target instruction. 248 | if j, ok := insn.(isa.Jump); ok { 249 | next, ok := nextInsn(p[labels[j.Lbl]:]) 250 | if ok { 251 | switch next.(type) { 252 | case isa.PartialCommit, isa.BackCommit, isa.Commit, 253 | isa.Jump, isa.Return, isa.Fail, isa.FailTwice, isa.End: 254 | p[i] = next 255 | } 256 | } 257 | } 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /pattern/pattern.go: -------------------------------------------------------------------------------- 1 | // Package pattern provides data types and functions for compiling patterns 2 | // into GPeg VM programs. 3 | package pattern 4 | 5 | import ( 6 | "regexp/syntax" 7 | 8 | "github.com/zyedidia/gpeg/charset" 9 | "github.com/zyedidia/gpeg/isa" 10 | ) 11 | 12 | // Cap marks a pattern to be captured. 13 | func Cap(p Pattern, id int) Pattern { 14 | return &CapNode{ 15 | Patt: p, 16 | Id: id, 17 | } 18 | } 19 | 20 | // Check marks a pattern to be checked with the given checker. 21 | func Check(p Pattern, c isa.Checker) Pattern { 22 | return &CheckNode{ 23 | Patt: p, 24 | Checker: c, 25 | } 26 | } 27 | 28 | func CheckFlags(p Pattern, c isa.Checker, id, flag int) Pattern { 29 | return &CheckNode{ 30 | Patt: p, 31 | Checker: c, 32 | Id: id, 33 | Flag: flag, 34 | } 35 | } 36 | 37 | var memoId = 0 38 | 39 | // MemoId marks a pattern as memoizable with a particular ID. 40 | func MemoId(p Pattern, id int) Pattern { 41 | m := &MemoNode{ 42 | Patt: p, 43 | Id: id, 44 | } 45 | memoId = max(memoId, id) + 1 46 | return m 47 | } 48 | 49 | // Memo marks a pattern as memoizable. 50 | func Memo(p Pattern) Pattern { 51 | m := &MemoNode{ 52 | Patt: p, 53 | Id: memoId, 54 | } 55 | memoId++ 56 | return m 57 | } 58 | 59 | // Literal matches a given string literal. 60 | func Literal(s string) Pattern { 61 | return &LiteralNode{ 62 | Str: s, 63 | } 64 | } 65 | 66 | // Set matches any character in the given set. 67 | func Set(chars charset.Set) Pattern { 68 | return &ClassNode{ 69 | Chars: chars, 70 | } 71 | } 72 | 73 | // Any consumes n characters, and only fails if there 74 | // aren't enough input characters left. 
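// Any(1) is the PEG '.' expression (the re package compiles DOT to Any(1)).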
75 | func Any(n uint8) Pattern { 76 | return &DotNode{ 77 | N: n, 78 | } 79 | } 80 | 81 | // Repeat matches p exactly n times 82 | func Repeat(p Pattern, n int) Pattern { 83 | if n <= 0 { 84 | return &EmptyNode{} 85 | } 86 | 87 | acc := p 88 | for i := 1; i < n; i++ { 89 | acc = &SeqNode{ 90 | Left: acc, 91 | Right: p, 92 | } 93 | } 94 | return acc 95 | } 96 | 97 | // Concat concatenates n patterns: `p1 p2 p3...`. 98 | func Concat(patts ...Pattern) Pattern { 99 | if len(patts) <= 0 { 100 | return &EmptyNode{} 101 | } 102 | 103 | acc := patts[0] 104 | for _, p := range patts[1:] { 105 | acc = &SeqNode{ 106 | Left: acc, 107 | Right: p, 108 | } 109 | } 110 | 111 | return acc 112 | } 113 | 114 | // Or returns the ordered choice between n patterns: `p1 / p2 / p3...`. 115 | func Or(patts ...Pattern) Pattern { 116 | if len(patts) <= 0 { 117 | return &EmptyNode{} 118 | } 119 | 120 | // optimization: make or right associative 121 | acc := patts[len(patts)-1] 122 | for i := len(patts) - 2; i >= 0; i-- { 123 | acc = &AltNode{ 124 | Left: patts[i], 125 | Right: acc, 126 | } 127 | } 128 | 129 | return acc 130 | } 131 | 132 | // Star returns the Kleene star repetition of a pattern: `p*`. 133 | // This matches zero or more occurrences of p. 134 | func Star(p Pattern) Pattern { 135 | return &StarNode{ 136 | Patt: p, 137 | } 138 | } 139 | 140 | // Plus returns the Kleene plus repetition of a pattern: `p+`. 141 | // This matches one or more occurrences of p. 142 | func Plus(p Pattern) Pattern { 143 | return &PlusNode{ 144 | Patt: p, 145 | } 146 | } 147 | 148 | // Optional matches at most 1 occurrence of p: `p?`. 149 | func Optional(p Pattern) Pattern { 150 | return &OptionalNode{ 151 | Patt: p, 152 | } 153 | } 154 | 155 | // Not returns the not predicate applied to a pattern: `!p`. 156 | // The not predicate succeeds if matching `p` at the current position 157 | // fails, and does not consume any input. 158 | func Not(p Pattern) Pattern { 159 | return &NotNode{ 160 | Patt: p, 161 | } 162 | } 163 | 164 | // And returns the and predicate applied to a pattern: `&p`. 165 | // The and predicate succeeds if matching `p` at the current position 166 | // succeeds and does not consume any input. 167 | // This is equivalent to `!!p`. 168 | func And(p Pattern) Pattern { 169 | return &AndNode{ 170 | Patt: p, 171 | } 172 | } 173 | 174 | // Search is a dedicated operator for creating searches. It will match 175 | // the first occurrence of the given pattern. Use Star(Search(p)) to match 176 | // the last occurrence (for a non-overlapping pattern). 177 | func Search(p Pattern) Pattern { 178 | return &SearchNode{ 179 | Patt: p, 180 | } 181 | } 182 | 183 | func EmptyOp(op syntax.EmptyOp) Pattern { 184 | return &EmptyOpNode{ 185 | Op: op, 186 | } 187 | } 188 | 189 | // NonTerm builds an unresolved non-terminal with a given name. 190 | // NonTerms should be used together with `Grammar` to build a recursive 191 | // grammar. 192 | func NonTerm(name string) Pattern { 193 | return &NonTermNode{ 194 | Name: name, 195 | } 196 | } 197 | 198 | // Grammar builds a grammar from a map of non-terminal patterns. 199 | // Any unresolved non-terminals are resolved with their definitions 200 | // in the map. 201 | func Grammar(start string, nonterms map[string]Pattern) Pattern { 202 | return &GrammarNode{ 203 | Defs: nonterms, 204 | Start: start, 205 | } 206 | } 207 | 208 | // CapGrammar builds a grammar, but all values are automatically captured. The 209 | // capture IDs are returned in the 'ids' map. 
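// The caller provides the ids map; one automatically assigned capture ID is recorded in it per non-terminal name.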
210 | func CapGrammar(start string, nonterms map[string]Pattern, ids map[string]int) Pattern { 211 | m := make(map[string]Pattern) 212 | id := 0 213 | for k, v := range nonterms { 214 | m[k] = Cap(v, id) 215 | ids[k] = id 216 | id++ 217 | } 218 | return Grammar(start, m) 219 | } 220 | 221 | // Error is a pattern that throws an error with the given message. 222 | func Error(msg string, recovery Pattern) Pattern { 223 | return &ErrorNode{ 224 | Message: msg, 225 | Recover: recovery, 226 | } 227 | } 228 | 229 | func max(a, b int) int { 230 | if a > b { 231 | return a 232 | } 233 | return b 234 | } 235 | -------------------------------------------------------------------------------- /pattern/string.go: -------------------------------------------------------------------------------- 1 | package pattern 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | ) 7 | 8 | func Prettify(p Pattern) string { 9 | switch t := Get(p).(type) { 10 | case *LiteralNode: 11 | return strconv.Quote(t.Str) 12 | case *ClassNode: 13 | return fmt.Sprintf("[%s]", t.Chars.String()) 14 | case *DotNode: 15 | return "." 16 | case *EmptyNode: 17 | return "\"\"" 18 | case *AltNode: 19 | return fmt.Sprintf("(%s / %s)", Prettify(Get(t.Left)), Prettify(Get(t.Right))) 20 | case *SeqNode: 21 | return fmt.Sprintf("(%s %s)", Prettify(Get(t.Left)), Prettify(Get(t.Right))) 22 | case *StarNode: 23 | return fmt.Sprintf("%s*", Prettify(Get(t.Patt))) 24 | case *PlusNode: 25 | return fmt.Sprintf("%s+", Prettify(Get(t.Patt))) 26 | case *OptionalNode: 27 | return fmt.Sprintf("%s?", Prettify(Get(t.Patt))) 28 | case *NotNode: 29 | return fmt.Sprintf("!%s", Prettify(Get(t.Patt))) 30 | case *AndNode: 31 | return fmt.Sprintf("&%s", Prettify(Get(t.Patt))) 32 | case *CapNode: 33 | return fmt.Sprintf("{ %s }", Prettify(Get(t.Patt))) 34 | case *MemoNode: 35 | return fmt.Sprintf("{{ %s }}", Prettify(Get(t.Patt))) 36 | case *SearchNode: 37 | return fmt.Sprintf("search(%s)", Prettify(Get(t.Patt))) 38 | case *CheckNode: 39 | return fmt.Sprintf("check(%s)", Prettify(Get(t.Patt))) 40 | case *ErrorNode: 41 | return fmt.Sprintf("err(%s, %s)", t.Message, Prettify(Get(t.Recover))) 42 | case *EmptyOpNode: 43 | return fmt.Sprintf("empty(%v)", t.Op) 44 | case *GrammarNode: 45 | s := fmt.Sprintf("%s\n", t.Start) 46 | t.Inline() 47 | for name, patt := range t.Defs { 48 | s += fmt.Sprintf("%s <- %s\n", name, Prettify(Get(patt))) 49 | } 50 | return s 51 | case *NonTermNode: 52 | if t.Inlined != nil { 53 | return Prettify(Get(t.Inlined)) 54 | } 55 | return t.Name 56 | } 57 | 58 | return "" 59 | } 60 | -------------------------------------------------------------------------------- /re/grammar.go: -------------------------------------------------------------------------------- 1 | package re 2 | 3 | import ( 4 | "github.com/zyedidia/gpeg/charset" 5 | p "github.com/zyedidia/gpeg/pattern" 6 | ) 7 | 8 | // Pattern <- Spacing_ (Grammar / Expression) EndOfFile_ 9 | // Grammar <- Definition+ 10 | // Definition <- Identifier LEFTARROW Expression 11 | // 12 | // Expression <- Sequence (SLASH Sequence)* 13 | // Sequence <- Prefix* 14 | // Prefix <- (AND / NOT)? Suffix 15 | // Suffix <- Primary (QUESTION / STAR / PLUS)? 
16 | // Primary <- Identifier !LEFTARROW 17 | // / '(' Expression ')' 18 | // / Literal / Class 19 | // / BRACEPO Expression BRACEPC 20 | // / BRACEO Expression BRACEC 21 | // / DOT 22 | // 23 | // Identifier <- IdentStart IdentCont* Spacing_ 24 | // IdentStart <- [a-zA-Z_] 25 | // IdentCont <- IdentStart / [0-9] 26 | // 27 | // Literal <- ['] (!['] Char)* ['] Spacing_ 28 | // / ["] (!["] Char)* ["] Spacing_ 29 | // Class <- '[' CARAT? (!']' Range)* ']' Spacing_ 30 | // Range <- Char '-' Char / Char 31 | // Char <- '\\' [nrt'"\[\]\\\-] 32 | // / '\\' [0-2][0-7][0-7] 33 | // / '\\' [0-7][0-7]? 34 | // / !'\\' . 35 | // 36 | // AND <- '&' Spacing_ 37 | // NOT <- '!' Spacing_ 38 | // QUESTION <- '?' Spacing_ 39 | // STAR <- '*' Spacing_ 40 | // PLUS <- '+' Spacing_ 41 | // DOT <- '.' Spacing_ 42 | // CARAT <- '^' Spacing_ 43 | // BRACEO <- '{' Spacing_ 44 | // BRACEC <- '}' Spacing_ 45 | // BRACEPO <- '{{' Spacing_ 46 | // BRACEPC <- '}}' Spacing_ 47 | // LEFTARROW <- '<-' Spacing_ 48 | // OPEN <- '(' Spacing_ 49 | // CLOSE <- ')' Spacing_ 50 | // SLASH <- '/' Spacing_ 51 | // 52 | // Spacing_ <- (Space_ / Comment_)* 53 | // Comment_ <- '#' (!EndOfLine_ .)* EndOfLine_ 54 | // Space_ <- ' ' / '\t' / EndOfLine_ 55 | // EndOfLine_ <- '\r\n' / '\n' / '\r' 56 | // EndOfFile_ <- !. 57 | 58 | const ( 59 | idPattern = iota 60 | idGrammar 61 | idDefinition 62 | idExpression 63 | idSequence 64 | idPrefix 65 | idSuffix 66 | idPrimary 67 | idLiteral 68 | idRange 69 | idClass 70 | idIdentifier 71 | idIdentStart 72 | idIdentCont 73 | idChar 74 | idAND 75 | idNOT 76 | idQUESTION 77 | idSTAR 78 | idPLUS 79 | idDOT 80 | idCARAT 81 | idOPEN 82 | idBRACEO 83 | idBRACEPO 84 | ) 85 | 86 | var grammar = map[string]p.Pattern{ 87 | "Pattern": p.Cap(p.Concat( 88 | p.NonTerm("Spacing"), 89 | p.Or( 90 | p.NonTerm("Grammar"), 91 | p.NonTerm("Expression"), 92 | ), 93 | p.NonTerm("EndOfFile"), 94 | ), idPattern), 95 | "Grammar": p.Cap(p.Plus(p.NonTerm("Definition")), idGrammar), 96 | "Definition": p.Cap(p.Concat( 97 | p.NonTerm("Identifier"), 98 | p.NonTerm("LEFTARROW"), 99 | p.NonTerm("Expression"), 100 | ), idDefinition), 101 | 102 | "Expression": p.Cap(p.Concat( 103 | p.NonTerm("Sequence"), 104 | p.Star(p.Concat( 105 | p.NonTerm("SLASH"), 106 | p.NonTerm("Sequence"), 107 | )), 108 | ), idExpression), 109 | "Sequence": p.Cap(p.Star(p.NonTerm("Prefix")), idSequence), 110 | "Prefix": p.Cap(p.Concat( 111 | p.Optional(p.Or( 112 | p.NonTerm("AND"), 113 | p.NonTerm("NOT"), 114 | )), 115 | p.NonTerm("Suffix"), 116 | ), idPrefix), 117 | "Suffix": p.Cap(p.Concat( 118 | p.NonTerm("Primary"), 119 | p.Optional(p.Or( 120 | p.NonTerm("QUESTION"), 121 | p.NonTerm("STAR"), 122 | p.NonTerm("PLUS"), 123 | )), 124 | ), idSuffix), 125 | "Primary": p.Cap(p.Or( 126 | p.Concat( 127 | p.NonTerm("Identifier"), 128 | p.Not(p.NonTerm("LEFTARROW")), 129 | ), 130 | p.Concat( 131 | p.NonTerm("OPEN"), 132 | p.NonTerm("Expression"), 133 | p.NonTerm("CLOSE"), 134 | ), 135 | p.Concat( 136 | p.NonTerm("BRACEPO"), 137 | p.NonTerm("Expression"), 138 | p.NonTerm("BRACEPC"), 139 | ), 140 | p.Concat( 141 | p.NonTerm("BRACEO"), 142 | p.NonTerm("Expression"), 143 | p.NonTerm("BRACEC"), 144 | ), 145 | p.NonTerm("Literal"), 146 | p.NonTerm("Class"), 147 | p.NonTerm("DOT"), 148 | ), idPrimary), 149 | 150 | "Identifier": p.Cap(p.Concat( 151 | p.NonTerm("IdentStart"), 152 | p.Star(p.NonTerm("IdentCont")), 153 | p.NonTerm("Spacing"), 154 | ), idIdentifier), 155 | "IdentStart": p.Cap( 156 | p.Set(charset.Range('a', 'z'). 157 | Add(charset.Range('A', 'Z')). 
158 | Add(charset.New([]byte{'_'})), 159 | ), idIdentStart), 160 | "IdentCont": p.Cap(p.Or( 161 | p.NonTerm("IdentStart"), 162 | p.Set(charset.Range('0', '9')), 163 | ), idIdentCont), 164 | 165 | "Literal": p.Cap(p.Or( 166 | p.Concat( 167 | p.Literal("'"), 168 | p.Star(p.Concat( 169 | p.Not(p.Literal("'")), 170 | p.NonTerm("Char"), 171 | )), 172 | p.Literal("'"), 173 | p.NonTerm("Spacing"), 174 | ), 175 | p.Concat( 176 | p.Literal("\""), 177 | p.Star(p.Concat( 178 | p.Not(p.Literal("\"")), 179 | p.NonTerm("Char"), 180 | )), 181 | p.Literal("\""), 182 | p.NonTerm("Spacing"), 183 | ), 184 | ), idLiteral), 185 | "Class": p.Cap(p.Concat( 186 | p.Literal("["), 187 | p.Optional(p.NonTerm("CARAT")), 188 | p.Star(p.Concat( 189 | p.Not(p.Literal("]")), 190 | p.NonTerm("Range"), 191 | )), 192 | p.Literal("]"), 193 | p.NonTerm("Spacing"), 194 | ), idClass), 195 | "Range": p.Cap(p.Or( 196 | p.Concat( 197 | p.NonTerm("Char"), 198 | p.Literal("-"), 199 | p.NonTerm("Char"), 200 | ), 201 | p.NonTerm("Char"), 202 | ), idRange), 203 | "Char": p.Cap(p.Or( 204 | p.Concat( 205 | p.Literal("\\"), 206 | p.Set(charset.New([]byte{'n', 'r', 't', '\'', '"', '[', ']', '\\', '-'})), 207 | ), 208 | p.Concat( 209 | p.Literal("\\"), 210 | p.Set(charset.Range('0', '2')), 211 | p.Set(charset.Range('0', '7')), 212 | p.Set(charset.Range('0', '7')), 213 | ), 214 | p.Concat( 215 | p.Literal("\\"), 216 | p.Set(charset.Range('0', '7')), 217 | p.Optional(p.Set(charset.Range('0', '7'))), 218 | ), 219 | p.Concat( 220 | p.Not(p.Literal("\\")), 221 | p.Any(1), 222 | ), 223 | ), idChar), 224 | 225 | "AND": p.Cap(p.Concat( 226 | p.Literal("&"), 227 | p.NonTerm("Spacing"), 228 | ), idAND), 229 | "NOT": p.Cap(p.Concat( 230 | p.Literal("!"), 231 | p.NonTerm("Spacing"), 232 | ), idNOT), 233 | "QUESTION": p.Cap(p.Concat( 234 | p.Literal("?"), 235 | p.NonTerm("Spacing"), 236 | ), idQUESTION), 237 | "STAR": p.Cap(p.Concat( 238 | p.Literal("*"), 239 | p.NonTerm("Spacing"), 240 | ), idSTAR), 241 | "PLUS": p.Cap(p.Concat( 242 | p.Literal("+"), 243 | p.NonTerm("Spacing"), 244 | ), idPLUS), 245 | "DOT": p.Cap(p.Concat( 246 | p.Literal("."), 247 | p.NonTerm("Spacing"), 248 | ), idDOT), 249 | "CARAT": p.Cap(p.Concat( 250 | p.Literal("^"), 251 | p.NonTerm("Spacing"), 252 | ), idCARAT), 253 | "OPEN": p.Cap(p.Concat( 254 | p.Literal("("), 255 | p.NonTerm("Spacing"), 256 | ), idOPEN), 257 | "CLOSE": p.Concat( 258 | p.Literal(")"), 259 | p.NonTerm("Spacing"), 260 | ), 261 | "BRACEO": p.Cap(p.Concat( 262 | p.Literal("{"), 263 | p.NonTerm("Spacing"), 264 | ), idBRACEO), 265 | "BRACEC": p.Concat( 266 | p.Literal("}"), 267 | p.NonTerm("Spacing"), 268 | ), 269 | "BRACEPO": p.Cap(p.Concat( 270 | p.Literal("{{"), 271 | p.NonTerm("Spacing"), 272 | ), idBRACEPO), 273 | "BRACEPC": p.Concat( 274 | p.Literal("}}"), 275 | p.NonTerm("Spacing"), 276 | ), 277 | "SLASH": p.Concat( 278 | p.Literal("/"), 279 | p.NonTerm("Spacing"), 280 | ), 281 | "LEFTARROW": p.Concat( 282 | p.Literal("<-"), 283 | p.NonTerm("Spacing"), 284 | ), 285 | 286 | "Spacing": p.Star(p.Or( 287 | p.NonTerm("Space"), 288 | p.NonTerm("Comment"), 289 | )), 290 | "Comment": p.Concat( 291 | p.Literal("#"), 292 | p.Star(p.Concat( 293 | p.Not(p.NonTerm("EndOfLine")), 294 | p.Any(1), 295 | )), 296 | p.NonTerm("EndOfLine"), 297 | ), 298 | "Space": p.Or( 299 | p.Set(charset.New([]byte{' ', '\t'})), 300 | p.NonTerm("EndOfLine"), 301 | ), 302 | "EndOfLine": p.Or( 303 | p.Literal("\r\n"), 304 | p.Literal("\n"), 305 | p.Literal("\r"), 306 | ), 307 | "EndOfFile": p.Not(p.Any(1)), 308 | } 309 | 
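// Example (illustrative): the grammar above accepts definitions written in the
// PEG syntax documented at the top of this file, e.g.
//
//	Expr   <- Term (('+' / '-') Term)*
//	Term   <- Factor (('*' / '/') Factor)*
//	Factor <- [0-9]+ / '(' Expr ')'
//
// re.Compile (in re.go, below) parses such input with this grammar and builds
// the corresponding pattern.Pattern.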
-------------------------------------------------------------------------------- /re/re.go: -------------------------------------------------------------------------------- 1 | // Package re provides functions for compiling 're' patterns (given as strings) 2 | // into standard patterns. 3 | package re 4 | 5 | import ( 6 | "bytes" 7 | "fmt" 8 | "strconv" 9 | "strings" 10 | 11 | "github.com/zyedidia/gpeg/charset" 12 | "github.com/zyedidia/gpeg/memo" 13 | "github.com/zyedidia/gpeg/pattern" 14 | "github.com/zyedidia/gpeg/vm" 15 | ) 16 | 17 | var parser vm.Code 18 | 19 | func init() { 20 | prog := pattern.MustCompile(pattern.Grammar("Pattern", grammar)) 21 | parser = vm.Encode(prog) 22 | } 23 | 24 | func compile(root *memo.Capture, s string, capg bool, ids map[string]int) pattern.Pattern { 25 | var p pattern.Pattern 26 | switch root.Id() { 27 | case idPattern: 28 | p = compile(root.Child(0), s, capg, ids) 29 | case idGrammar: 30 | nonterms := make(map[string]pattern.Pattern) 31 | var first string 32 | it := root.ChildIterator(0) 33 | for c := it(); c != nil; c = it() { 34 | k, v := compileDef(c, s, capg, ids) 35 | if first == "" { 36 | first = k 37 | } 38 | nonterms[k] = v 39 | } 40 | if capg { 41 | p = pattern.CapGrammar(first, nonterms, ids) 42 | } else { 43 | p = pattern.Grammar(first, nonterms) 44 | } 45 | case idExpression: 46 | alternations := make([]pattern.Pattern, 0, root.NumChildren()) 47 | it := root.ChildIterator(0) 48 | for c := it(); c != nil; c = it() { 49 | alternations = append(alternations, compile(c, s, capg, ids)) 50 | } 51 | p = pattern.Or(alternations...) 52 | case idSequence: 53 | concats := make([]pattern.Pattern, 0, root.NumChildren()) 54 | it := root.ChildIterator(0) 55 | for c := it(); c != nil; c = it() { 56 | concats = append(concats, compile(c, s, capg, ids)) 57 | } 58 | p = pattern.Concat(concats...) 
59 | case idPrefix:
60 | c := root.Child(0)
61 | switch c.Id() {
62 | case idAND:
63 | p = pattern.And(compile(root.Child(1), s, capg, ids))
64 | case idNOT:
65 | p = pattern.Not(compile(root.Child(1), s, capg, ids))
66 | default:
67 | p = compile(root.Child(0), s, capg, ids)
68 | }
69 | case idSuffix:
70 | if root.NumChildren() == 2 {
71 | c := root.Child(1)
72 | switch c.Id() {
73 | case idQUESTION:
74 | p = pattern.Optional(compile(root.Child(0), s, capg, ids))
75 | case idSTAR:
76 | p = pattern.Star(compile(root.Child(0), s, capg, ids))
77 | case idPLUS:
78 | p = pattern.Plus(compile(root.Child(0), s, capg, ids))
79 | }
80 | } else {
81 | p = compile(root.Child(0), s, capg, ids)
82 | }
83 | case idPrimary:
84 | switch root.Child(0).Id() {
85 | case idIdentifier, idLiteral, idClass:
86 | p = compile(root.Child(0), s, capg, ids)
87 | case idOPEN:
88 | p = compile(root.Child(1), s, capg, ids)
89 | case idBRACEPO:
90 | p = pattern.Memo(compile(root.Child(1), s, capg, ids))
91 | case idDOT:
92 | p = pattern.Any(1)
93 | }
94 | case idLiteral:
95 | lit := &bytes.Buffer{}
96 | it := root.ChildIterator(0)
97 | for c := it(); c != nil; c = it() {
98 | lit.WriteByte(parseChar(s[c.Start():c.End()]))
99 | }
100 | p = pattern.Literal(lit.String())
101 | case idClass:
102 | var set charset.Set
103 | if root.NumChildren() <= 0 {
104 | break
105 | }
106 | complement := false
107 | if root.Child(0).Id() == idCARAT {
108 | complement = true
109 | }
110 | it := root.ChildIterator(0)
111 | i := 0
112 | for c := it(); c != nil; c = it() {
113 | if i == 0 && complement {
114 | i++
115 | continue
116 | }
117 | set = set.Add(compileSet(c, s))
118 | }
119 | if complement {
120 | set = set.Complement()
121 | }
122 | p = pattern.Set(set)
123 | case idIdentifier:
124 | p = pattern.NonTerm(parseId(root, s))
125 | }
126 | return p
127 | }
128 | 
129 | var special = map[byte]byte{
130 | 'n': '\n',
131 | 'r': '\r',
132 | 't': '\t',
133 | '\'': '\'',
134 | '"': '"',
135 | '[': '[',
136 | ']': ']',
137 | '\\': '\\',
138 | '-': '-',
139 | }
140 | 
141 | func parseChar(char string) byte {
142 | switch char[0] {
143 | case '\\':
144 | for k, v := range special {
145 | if char[1] == k {
146 | return v
147 | }
148 | }
149 | 
150 | i, _ := strconv.ParseUint(char[1:], 8, 8) // parse unsigned so octal escapes above \177 still fit in a byte
151 | return byte(i)
152 | default:
153 | return char[0]
154 | }
155 | }
156 | 
157 | func parseId(root *memo.Capture, s string) string {
158 | ident := &bytes.Buffer{}
159 | it := root.ChildIterator(0)
160 | for c := it(); c != nil; c = it() {
161 | ident.WriteString(s[c.Start():c.End()])
162 | }
163 | return ident.String()
164 | }
165 | 
166 | func compileDef(root *memo.Capture, s string, capg bool, ids map[string]int) (string, pattern.Pattern) {
167 | id := root.Child(0)
168 | exp := root.Child(1)
169 | return parseId(id, s), compile(exp, s, capg, ids)
170 | }
171 | 
172 | func compileSet(root *memo.Capture, s string) charset.Set {
173 | switch root.NumChildren() {
174 | case 1:
175 | c := root.Child(0)
176 | return charset.New([]byte{parseChar(s[c.Start():c.End()])})
177 | case 2:
178 | c1, c2 := root.Child(0), root.Child(1)
179 | return charset.Range(parseChar(s[c1.Start():c1.End()]), parseChar(s[c2.Start():c2.End()]))
180 | }
181 | return charset.Set{}
182 | }
183 | 
184 | func Compile(s string) (pattern.Pattern, error) {
185 | match, n, ast, errs := parser.Exec(strings.NewReader(s), memo.NoneTable{})
186 | if len(errs) != 0 {
187 | return nil, errs[0]
188 | }
189 | if !match {
190 | return nil, fmt.Errorf("Invalid PEG: failed at %d", n)
191 | }
192 | 
193 | return compile(ast.Child(0), s, false, nil), nil 194 | } 195 | 196 | func MustCompile(s string) pattern.Pattern { 197 | p, err := Compile(s) 198 | if err != nil { 199 | panic(err) 200 | } 201 | return p 202 | } 203 | 204 | func CompileCap(s string, ids map[string]int) (pattern.Pattern, error) { 205 | match, n, ast, errs := parser.Exec(strings.NewReader(s), memo.NoneTable{}) 206 | if len(errs) != 0 { 207 | return nil, errs[0] 208 | } 209 | if !match { 210 | return nil, fmt.Errorf("Invalid PEG: failed at %d", n) 211 | } 212 | 213 | return compile(ast.Child(0), s, true, ids), nil 214 | } 215 | 216 | func MustCompileCap(s string, ids map[string]int) pattern.Pattern { 217 | p, err := CompileCap(s, ids) 218 | if err != nil { 219 | panic(err) 220 | } 221 | return p 222 | } 223 | -------------------------------------------------------------------------------- /re_test.go: -------------------------------------------------------------------------------- 1 | package gpeg 2 | 3 | import ( 4 | "io/ioutil" 5 | "testing" 6 | 7 | "github.com/zyedidia/gpeg/re" 8 | ) 9 | 10 | func TestRe(t *testing.T) { 11 | p := re.MustCompile("ID <- [a-zA-Z][a-zA-Z0-9_]*") 12 | tests := []PatternTest{ 13 | {"hello", 5}, 14 | {"test_1", 6}, 15 | {"_not_allowed", -1}, 16 | {"123", -1}, 17 | } 18 | check(p, tests, t) 19 | } 20 | 21 | func TestReExtra(t *testing.T) { 22 | p := re.MustCompile("[^a-zA-Z]*") 23 | tests := []PatternTest{ 24 | {"hello", 0}, 25 | {"123", 3}, 26 | {"_*&##@0abc", 7}, 27 | } 28 | check(p, tests, t) 29 | } 30 | 31 | func TestJson(t *testing.T) { 32 | peg, err := ioutil.ReadFile("grammars/json.peg") 33 | if err != nil { 34 | t.Error(err) 35 | } 36 | p := re.MustCompile(string(peg)) 37 | 38 | json, err := ioutil.ReadFile("testdata/test.json") 39 | if err != nil { 40 | t.Error(err) 41 | } 42 | 43 | tests := []PatternTest{ 44 | {string(json), len(json)}, 45 | } 46 | 47 | check(p, tests, t) 48 | } 49 | 50 | func TestJava(t *testing.T) { 51 | peg, err := ioutil.ReadFile("grammars/java.peg") 52 | if err != nil { 53 | t.Error(err) 54 | } 55 | p := re.MustCompile(string(peg)) 56 | 57 | java, err := ioutil.ReadFile("testdata/test.java") 58 | if err != nil { 59 | t.Error(err) 60 | } 61 | 62 | tests := []PatternTest{ 63 | {string(java), len(java)}, 64 | } 65 | 66 | check(p, tests, t) 67 | } 68 | -------------------------------------------------------------------------------- /recover_test.go: -------------------------------------------------------------------------------- 1 | package gpeg 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | "github.com/zyedidia/gpeg/charset" 8 | "github.com/zyedidia/gpeg/memo" 9 | . 
"github.com/zyedidia/gpeg/pattern" 10 | "github.com/zyedidia/gpeg/vm" 11 | ) 12 | 13 | func sync(p Pattern) Pattern { 14 | return Star(Concat(Not(p), Any(1))) 15 | } 16 | 17 | func TestRecover(t *testing.T) { 18 | id := Plus(Set(charset.Range('a', 'z'))) 19 | p := Grammar("S", map[string]Pattern{ 20 | "S": Or(NonTerm("List"), Concat(Any(1), Error("expecting a list of identifiers", NonTerm("ErrList")))), 21 | "List": Concat( 22 | NonTerm("Id"), 23 | Star(Concat(And(Any(1)), 24 | NonTerm("Comma"), 25 | Or(NonTerm("Id"), 26 | Error("expecting an identifier", NonTerm("ErrId")))), 27 | ), 28 | ), 29 | "Id": Concat(NonTerm("Sp"), id), 30 | "Comma": Or(Concat(NonTerm("Sp"), Literal(",")), Error("expecting ','", NonTerm("ErrComma"))), 31 | "Sp": Star(Set(charset.New([]byte{' ', '\n', '\t'}))), 32 | "ErrId": sync(Literal(",")), 33 | "ErrComma": sync(id), 34 | "ErrList": sync(Not(Any(1))), 35 | }) 36 | 37 | peg := MustCompile(p) 38 | code := vm.Encode(peg) 39 | in := strings.NewReader("one two three,") 40 | _, _, _, errs := code.Exec(in, memo.NoneTable{}) 41 | 42 | if len(errs) != 3 { 43 | t.Error("Incorrect list of errors:", errs) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /rxconv/rxconv.go: -------------------------------------------------------------------------------- 1 | // Package rxconv provides functions to convert a Go regexp into a PEG so that 2 | // it can be used for incremental parsing. 3 | package rxconv 4 | 5 | import ( 6 | "fmt" 7 | "regexp/syntax" 8 | "strconv" 9 | 10 | "github.com/zyedidia/gpeg/charset" 11 | p "github.com/zyedidia/gpeg/pattern" 12 | ) 13 | 14 | var num = 0 15 | 16 | func uniq() string { 17 | num++ 18 | return "a" + strconv.Itoa(num) 19 | } 20 | 21 | func star(r *syntax.Regexp, k p.Pattern) p.Pattern { 22 | nterm := uniq() 23 | nonterms := make(map[string]p.Pattern) 24 | nonterms[nterm] = p.Or(pi(r, p.NonTerm(nterm)), k) 25 | return p.Grammar(nterm, nonterms) 26 | } 27 | 28 | // continuation-based conversion 29 | func pi(e *syntax.Regexp, k p.Pattern) p.Pattern { 30 | switch e.Op { 31 | case syntax.OpEmptyMatch: 32 | return k 33 | case syntax.OpLiteral: 34 | return p.Concat(p.Literal(string(e.Rune)), k) 35 | case syntax.OpCharClass: 36 | lits := make([]p.Pattern, 0, len(e.Rune)) 37 | for i := 0; i < len(e.Rune); i += 2 { 38 | start := e.Rune[i] 39 | end := e.Rune[i+1] 40 | var patt p.Pattern 41 | if start < 256 && end < 256 { 42 | patt = p.Set(charset.Range(byte(start), byte(end))) 43 | lits = append(lits, p.Concat(patt, k)) 44 | } else { 45 | for ; start <= end; start++ { 46 | lits = append(lits, p.Concat(p.Literal(string(start)), k)) 47 | } 48 | } 49 | } 50 | return p.Or(lits...) 51 | case syntax.OpAnyChar: 52 | // TODO: unicode 53 | return p.Concat(p.Any(1), k) 54 | case syntax.OpAnyCharNotNL: 55 | return p.Concat(p.Set(charset.New([]byte{'\n'}).Complement()), k) 56 | case syntax.OpConcat: 57 | patt := k 58 | for i := len(e.Sub) - 1; i >= 0; i-- { 59 | patt = pi(e.Sub[i], patt) 60 | } 61 | return patt 62 | case syntax.OpAlternate: 63 | alts := make([]p.Pattern, 0, len(e.Sub)) 64 | for _, s := range e.Sub { 65 | alts = append(alts, pi(s, k)) 66 | } 67 | return p.Or(alts...) 
68 | case syntax.OpCapture: 69 | return pi(e.Sub[0], k) 70 | case syntax.OpStar: 71 | return star(e.Sub[0], k) 72 | case syntax.OpPlus: 73 | return pi(e.Sub[0], star(e.Sub[0], k)) 74 | case syntax.OpQuest: 75 | return p.Or(pi(e.Sub[0], k), k) 76 | case syntax.OpBeginLine: 77 | return p.Concat(p.EmptyOp(syntax.EmptyBeginLine), k) 78 | case syntax.OpEndLine: 79 | return p.Concat(p.EmptyOp(syntax.EmptyEndLine), k) 80 | case syntax.OpBeginText: 81 | return p.Concat(p.EmptyOp(syntax.EmptyBeginText), k) 82 | case syntax.OpEndText: 83 | return p.Concat(p.EmptyOp(syntax.EmptyEndText), k) 84 | case syntax.OpWordBoundary: 85 | return p.Concat(p.EmptyOp(syntax.EmptyWordBoundary), k) 86 | case syntax.OpNoWordBoundary: 87 | return p.Concat(p.EmptyOp(syntax.EmptyNoWordBoundary), k) 88 | } 89 | panic(fmt.Sprintf("unimplemented %s", e.Op)) 90 | } 91 | 92 | func convert(r *syntax.Regexp) p.Pattern { 93 | return pi(r, &p.EmptyNode{}) 94 | } 95 | 96 | func FromRegexp(s string, flags syntax.Flags) (p.Pattern, error) { 97 | re, err := syntax.Parse(s, flags) 98 | if err != nil { 99 | return nil, err 100 | } 101 | if !verify(re) { 102 | return nil, fmt.Errorf("invalid regexp (repeat not supported)") 103 | } 104 | return p.Search(p.Cap(convert(re), 0)), nil 105 | } 106 | 107 | func verify(e *syntax.Regexp) bool { 108 | switch e.Op { 109 | case syntax.OpEmptyMatch, syntax.OpLiteral, syntax.OpCharClass, 110 | syntax.OpAnyChar, syntax.OpAnyCharNotNL, syntax.OpBeginLine, 111 | syntax.OpEndLine, syntax.OpBeginText, syntax.OpEndText, 112 | syntax.OpWordBoundary, syntax.OpNoWordBoundary: 113 | return true 114 | case syntax.OpConcat, syntax.OpAlternate: 115 | yes := true 116 | for _, s := range e.Sub { 117 | yes = yes && verify(s) 118 | } 119 | return yes 120 | case syntax.OpCapture, syntax.OpStar, syntax.OpPlus, syntax.OpQuest: 121 | return verify(e.Sub[0]) 122 | } 123 | return false 124 | } 125 | -------------------------------------------------------------------------------- /rxconv/rxconv_test.go: -------------------------------------------------------------------------------- 1 | package rxconv_test 2 | 3 | import ( 4 | "regexp/syntax" 5 | "strings" 6 | "testing" 7 | 8 | "github.com/zyedidia/gpeg/memo" 9 | . 
"github.com/zyedidia/gpeg/pattern" 10 | "github.com/zyedidia/gpeg/rxconv" 11 | "github.com/zyedidia/gpeg/vm" 12 | ) 13 | 14 | type PatternTest struct { 15 | in string 16 | match int 17 | } 18 | 19 | func check(p Pattern, tests []PatternTest, t *testing.T) { 20 | code := vm.Encode(MustCompile(p)) 21 | for _, tt := range tests { 22 | name := tt.in[:min(10, len(tt.in))] 23 | t.Run(name, func(t *testing.T) { 24 | match, off, _, _ := code.Exec(strings.NewReader(tt.in), memo.NoneTable{}) 25 | if tt.match == -1 && match || tt.match != -1 && !match || tt.match != -1 && tt.match != off { 26 | t.Errorf("%s: got: (%t, %d), but expected (%d)\n", tt.in, match, off, tt.match) 27 | } 28 | }) 29 | } 30 | } 31 | 32 | func TestSimple(t *testing.T) { 33 | peg, err := rxconv.FromRegexp("(a|ab)c", syntax.Perl) 34 | if err != nil { 35 | t.Fatal(err) 36 | } 37 | 38 | tests := []PatternTest{ 39 | {"abc", 3}, 40 | {"ac", 2}, 41 | {"ab", -1}, 42 | } 43 | check(peg, tests, t) 44 | } 45 | 46 | func TestStar(t *testing.T) { 47 | peg, err := rxconv.FromRegexp("(ba|a)*a", syntax.Perl) 48 | if err != nil { 49 | t.Fatal(err) 50 | } 51 | 52 | tests := []PatternTest{ 53 | {"abaabaa", 7}, 54 | } 55 | 56 | check(peg, tests, t) 57 | } 58 | 59 | func TestMultiOr(t *testing.T) { 60 | peg, err := rxconv.FromRegexp("aa|bb|dd|ff", syntax.Perl) 61 | if err != nil { 62 | t.Fatal(err) 63 | } 64 | tests := []PatternTest{ 65 | {"aa", 2}, 66 | {"bb", 2}, 67 | {"af", -1}, 68 | {"ff", 2}, 69 | } 70 | 71 | check(peg, tests, t) 72 | } 73 | 74 | func TestCharClass(t *testing.T) { 75 | peg, err := rxconv.FromRegexp("[a-z0-9]+", syntax.Perl) 76 | if err != nil { 77 | t.Fatal(err) 78 | } 79 | tests := []PatternTest{ 80 | {"", -1}, 81 | {"hello123", 8}, 82 | {"foo", 3}, 83 | {"123", 3}, 84 | {"_&_", -1}, 85 | } 86 | check(peg, tests, t) 87 | } 88 | 89 | func TestEmptyOp(t *testing.T) { 90 | peg, err := rxconv.FromRegexp("^foo", syntax.Perl) 91 | if err != nil { 92 | t.Fatal(err) 93 | } 94 | tests := []PatternTest{ 95 | {"foohello", 3}, 96 | {" foo ", -1}, 97 | } 98 | check(peg, tests, t) 99 | 100 | peg, err = rxconv.FromRegexp("\\bfoo\\b", syntax.Perl) 101 | if err != nil { 102 | t.Fatal(err) 103 | } 104 | tests = []PatternTest{ 105 | {"foohello", -1}, 106 | {" foo ", 4}, 107 | } 108 | check(peg, tests, t) 109 | } 110 | 111 | func min(a, b int) int { 112 | if a < b { 113 | return a 114 | } 115 | return b 116 | } 117 | -------------------------------------------------------------------------------- /testdata/test.java: -------------------------------------------------------------------------------- 1 | public class Hello { 2 | public static void main(String[] args) { 3 | System.out.println("Hello world"); 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /vm/code.go: -------------------------------------------------------------------------------- 1 | package vm 2 | 3 | import ( 4 | "bytes" 5 | "compress/gzip" 6 | "encoding/binary" 7 | "encoding/gob" 8 | "encoding/json" 9 | "fmt" 10 | 11 | "github.com/zyedidia/gpeg/charset" 12 | "github.com/zyedidia/gpeg/isa" 13 | ) 14 | 15 | // Code is the representation of VM bytecode. 16 | type Code struct { 17 | data code 18 | } 19 | 20 | type code struct { 21 | // list of charsets 22 | Sets []charset.Set 23 | // list of error messages 24 | Errors []string 25 | // list of checker functions 26 | Checkers []isa.Checker 27 | 28 | // the encoded instructions 29 | Insns []byte 30 | } 31 | 32 | // Size returns the size of the encoded instructions. 
33 | func (c *Code) Size() int { 34 | return len(c.data.Insns) 35 | } 36 | 37 | func init() { 38 | gob.Register(isa.MapChecker{}) 39 | gob.Register(isa.BackReference{}) 40 | } 41 | 42 | // ToBytes serializes and compresses this Code. 43 | func (c *Code) ToBytes() ([]byte, error) { 44 | var buf bytes.Buffer 45 | fz := gzip.NewWriter(&buf) 46 | enc := gob.NewEncoder(fz) 47 | err := enc.Encode(c.data) 48 | fz.Close() 49 | return buf.Bytes(), err 50 | } 51 | 52 | // FromBytes loads a Code from a compressed and serialized object. 53 | func FromBytes(b []byte) (Code, error) { 54 | var c code 55 | fz, err := gzip.NewReader(bytes.NewReader(b)) 56 | if err != nil { 57 | return Code{}, err 58 | } 59 | dec := gob.NewDecoder(fz) 60 | err = dec.Decode(&c) 61 | fz.Close() 62 | return Code{ 63 | data: c, 64 | }, err 65 | } 66 | 67 | // ToJson returns this Code serialized to JSON form. 68 | func (c *Code) ToJson() ([]byte, error) { 69 | return json.Marshal(c.data) 70 | } 71 | 72 | // FromJson returns a Code loaded from JSON form. 73 | func FromJson(b []byte) (Code, error) { 74 | var c code 75 | err := json.Unmarshal(b, &c) 76 | return Code{ 77 | data: c, 78 | }, err 79 | } 80 | 81 | // Encode transforms a program into VM bytecode. 82 | func Encode(insns isa.Program) Code { 83 | code := Code{ 84 | data: code{ 85 | Sets: make([]charset.Set, 0), 86 | Insns: make([]byte, 0), 87 | }, 88 | } 89 | 90 | var bcount uint 91 | labels := make(map[isa.Label]uint) 92 | for _, insn := range insns { 93 | switch t := insn.(type) { 94 | case isa.Nop: 95 | continue 96 | case isa.Label: 97 | labels[t] = bcount 98 | continue 99 | default: 100 | bcount += size(insn) 101 | } 102 | } 103 | 104 | for _, insn := range insns { 105 | var op byte 106 | var args []byte 107 | 108 | switch t := insn.(type) { 109 | case isa.Label, isa.Nop: 110 | continue 111 | case isa.Char: 112 | op = opChar 113 | args = []byte{t.Byte} 114 | case isa.Jump: 115 | op = opJump 116 | args = encodeLabel(labels[t.Lbl]) 117 | case isa.Choice: 118 | op = opChoice 119 | args = encodeLabel(labels[t.Lbl]) 120 | case isa.Call: 121 | op = opCall 122 | args = encodeLabel(labels[t.Lbl]) 123 | case isa.Commit: 124 | op = opCommit 125 | args = encodeLabel(labels[t.Lbl]) 126 | case isa.Return: 127 | op = opReturn 128 | case isa.Fail: 129 | op = opFail 130 | case isa.Set: 131 | op = opSet 132 | args = encodeU8(addSet(&code, t.Chars)) 133 | case isa.Any: 134 | op = opAny 135 | args = []byte{t.N} 136 | case isa.PartialCommit: 137 | op = opPartialCommit 138 | args = encodeLabel(labels[t.Lbl]) 139 | case isa.Span: 140 | op = opSpan 141 | args = encodeU8(addSet(&code, t.Chars)) 142 | case isa.BackCommit: 143 | op = opBackCommit 144 | args = encodeLabel(labels[t.Lbl]) 145 | case isa.FailTwice: 146 | op = opFailTwice 147 | case isa.Empty: 148 | op = opEmpty 149 | args = []byte{uint8(t.Op)} 150 | case isa.TestChar: 151 | op = opTestChar 152 | args = append([]byte{t.Byte}, encodeLabel(labels[t.Lbl])...) 153 | case isa.TestCharNoChoice: 154 | op = opTestCharNoChoice 155 | args = append([]byte{t.Byte}, encodeLabel(labels[t.Lbl])...) 156 | case isa.TestSet: 157 | op = opTestSet 158 | args = append(encodeU8(addSet(&code, t.Chars)), encodeLabel(labels[t.Lbl])...) 159 | case isa.TestSetNoChoice: 160 | op = opTestSetNoChoice 161 | args = append(encodeU8(addSet(&code, t.Chars)), encodeLabel(labels[t.Lbl])...) 162 | case isa.TestAny: 163 | op = opTestAny 164 | args = append([]byte{t.N}, encodeLabel(labels[t.Lbl])...) 
165 | case isa.CaptureBegin:
166 | op = opCaptureBegin
167 | args = encodeI16(int(t.Id))
168 | case isa.CaptureEnd:
169 | op = opCaptureEnd
170 | case isa.CaptureLate:
171 | op = opCaptureLate
172 | args = append([]byte{t.Back}, encodeI16(int(t.Id))...)
173 | case isa.CaptureFull:
174 | op = opCaptureFull
175 | args = append([]byte{t.Back}, encodeI16(int(t.Id))...)
176 | case isa.MemoOpen:
177 | op = opMemoOpen
178 | args = append(encodeLabel(labels[t.Lbl]), encodeI16(int(t.Id))...)
179 | case isa.MemoClose:
180 | op = opMemoClose
181 | case isa.MemoTreeOpen:
182 | op = opMemoTreeOpen
183 | args = append(encodeLabel(labels[t.Lbl]), encodeI16(int(t.Id))...)
184 | case isa.MemoTreeInsert:
185 | op = opMemoTreeInsert
186 | case isa.MemoTree:
187 | op = opMemoTree
188 | case isa.MemoTreeClose:
189 | op = opMemoTreeClose
190 | args = encodeI16(int(t.Id))
191 | case isa.CheckBegin:
192 | op = opCheckBegin
193 | args = append(encodeI16(t.Id), encodeI16(t.Flag)...)
194 | case isa.CheckEnd:
195 | op = opCheckEnd
196 | args = encodeU24(addChecker(&code, t.Checker))
197 | case isa.Error:
198 | op = opError
199 | args = encodeU24(addError(&code, t.Message))
200 | case isa.End:
201 | op = opEnd
202 | args = encodeBool(t.Fail)
203 | default:
204 | panic(fmt.Sprintf("invalid instruction during encoding: %v", t))
205 | }
206 | 
207 | code.data.Insns = append(code.data.Insns, op)
208 | 
209 | // pad with a zero byte when the args have an even length so that every encoded instruction occupies a whole number of 16-bit words
210 | if len(args)%2 == 0 {
211 | code.data.Insns = append(code.data.Insns, 0)
212 | }
213 | 
214 | code.data.Insns = append(code.data.Insns, args...)
215 | }
216 | code.data.Insns = append(code.data.Insns, opEnd, 0)
217 | 
218 | return code
219 | }
220 | 
221 | func encodeU8(x uint) []byte {
222 | if x >= 256 {
223 | panic("U8 out of bounds")
224 | }
225 | 
226 | return []byte{uint8(x)}
227 | }
228 | 
229 | func encodeI8(x int) []byte {
230 | if x < -128 || x >= 128 {
231 | panic("I8 out of bounds")
232 | }
233 | 
234 | return []byte{byte(x)}
235 | }
236 | 
237 | func encodeU16(x uint) []byte {
238 | if x >= (1 << 16) {
239 | panic("U16 out of bounds")
240 | }
241 | 
242 | b := make([]byte, 2)
243 | binary.LittleEndian.PutUint16(b[0:], uint16(x))
244 | return b
245 | }
246 | 
247 | func encodeI16(x int) []byte {
248 | if x < -(1<<15) || x >= (1<<15) {
249 | panic("I16 out of bounds")
250 | }
251 | 
252 | b := make([]byte, 2)
253 | binary.LittleEndian.PutUint16(b[0:], uint16(x))
254 | return b
255 | }
256 | 
257 | func encodeU24(x uint) []byte {
258 | if x >= (1 << 24) {
259 | panic("U24 out of bounds")
260 | }
261 | 
262 | b := make([]byte, 4)
263 | i1 := uint16((x >> 16) & 0xff)
264 | i2 := uint16(x)
265 | 
266 | binary.BigEndian.PutUint16(b[0:], i1)
267 | binary.LittleEndian.PutUint16(b[2:], i2)
268 | return b[1:4]
269 | }
270 | 
271 | func encodeLabel(x uint) []byte {
272 | return encodeU24(x)
273 | }
274 | 
275 | func encodeBool(b bool) []byte {
276 | if b {
277 | return []byte{1}
278 | }
279 | return []byte{0}
280 | }
281 | 
282 | // addSet adds the given charset to the code's list of charsets and returns the
283 | // index it was stored at. If an identical charset is already in the list, its
284 | // existing index is returned and no duplicate is inserted.
285 | func addSet(code *Code, set charset.Set) uint { 286 | for i, s := range code.data.Sets { 287 | if set == s { 288 | return uint(i) 289 | } 290 | } 291 | 292 | code.data.Sets = append(code.data.Sets, set) 293 | return uint(len(code.data.Sets) - 1) 294 | } 295 | 296 | func addError(code *Code, msg string) uint { 297 | for i, s := range code.data.Errors { 298 | if msg == s { 299 | return uint(i) 300 | } 301 | } 302 | 303 | code.data.Errors = append(code.data.Errors, msg) 304 | return uint(len(code.data.Errors) - 1) 305 | } 306 | 307 | func addChecker(code *Code, checker isa.Checker) uint { 308 | code.data.Checkers = append(code.data.Checkers, checker) 309 | return uint(len(code.data.Checkers) - 1) 310 | } 311 | -------------------------------------------------------------------------------- /vm/code_test.go: -------------------------------------------------------------------------------- 1 | package vm 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/zyedidia/gpeg/charset" 7 | . "github.com/zyedidia/gpeg/pattern" 8 | ) 9 | 10 | func TestBytes(t *testing.T) { 11 | p := Grammar("Expr", map[string]Pattern{ 12 | "Expr": Concat(NonTerm("Factor"), Star(Concat(Set(charset.New([]byte{'+', '-'})), NonTerm("Factor")))), 13 | "Factor": Concat(NonTerm("Term"), Star(Concat(Set(charset.New([]byte{'*', '/'})), NonTerm("Term")))), 14 | "Term": Or(NonTerm("Number"), Concat(Concat(Literal("("), NonTerm("Expr")), Literal(")"))), 15 | "Number": Plus(Set(charset.Range('0', '9'))), 16 | }) 17 | 18 | code := Encode(MustCompile(p)) 19 | b, err := code.ToBytes() 20 | if err != nil { 21 | t.Error(err) 22 | } 23 | load, err := FromBytes(b) 24 | if err != nil { 25 | t.Error(err) 26 | } 27 | 28 | if load.Size() != code.Size() { 29 | t.Error("Saved and loaded code not equivalent") 30 | } 31 | 32 | for i := range code.data.Insns { 33 | if load.data.Insns[i] != code.data.Insns[i] { 34 | t.Errorf("Code byte %d does not match", i) 35 | } 36 | } 37 | } 38 | 39 | func TestJson(t *testing.T) { 40 | p := Grammar("Expr", map[string]Pattern{ 41 | "Expr": Concat(NonTerm("Factor"), Star(Concat(Set(charset.New([]byte{'+', '-'})), NonTerm("Factor")))), 42 | "Factor": Concat(NonTerm("Term"), Star(Concat(Set(charset.New([]byte{'*', '/'})), NonTerm("Term")))), 43 | "Term": Or(NonTerm("Number"), Concat(Concat(Literal("("), NonTerm("Expr")), Literal(")"))), 44 | "Number": Plus(Set(charset.Range('0', '9'))), 45 | }) 46 | 47 | code := Encode(MustCompile(p)) 48 | b, err := code.ToJson() 49 | if err != nil { 50 | t.Error(err) 51 | } 52 | load, err := FromJson(b) 53 | if err != nil { 54 | t.Error(err) 55 | } 56 | 57 | if load.Size() != code.Size() { 58 | t.Error("Saved and loaded code not equivalent") 59 | } 60 | 61 | for i := range code.data.Insns { 62 | if load.data.Insns[i] != code.data.Insns[i] { 63 | t.Errorf("Code byte %d does not match", i) 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /vm/op.go: -------------------------------------------------------------------------------- 1 | package vm 2 | 3 | import ( 4 | "github.com/zyedidia/gpeg/isa" 5 | ) 6 | 7 | const ( 8 | // base instruction set 9 | opChar byte = iota 10 | opJump 11 | opChoice 12 | opCall 13 | opCommit 14 | opReturn 15 | opFail 16 | opSet 17 | opAny 18 | opPartialCommit 19 | opSpan 20 | opBackCommit 21 | opFailTwice 22 | opEmpty 23 | opTestChar 24 | opTestCharNoChoice 25 | opTestSet 26 | opTestSetNoChoice 27 | opTestAny 28 | opEnd 29 | opNop 30 | opCaptureBegin 31 | opCaptureLate 32 | opCaptureEnd 33 | 
opCaptureFull 34 | opCheckBegin 35 | opCheckEnd 36 | opMemoOpen 37 | opMemoClose 38 | opMemoTreeOpen 39 | opMemoTreeInsert 40 | opMemoTree 41 | opMemoTreeClose 42 | opError 43 | ) 44 | 45 | // instruction sizes 46 | const ( 47 | // base instruction set 48 | szChar = 2 49 | szReturn = 2 50 | szFail = 2 51 | szSet = 2 52 | szAny = 2 53 | szSpan = 2 54 | szFailTwice = 2 55 | szEnd = 2 56 | szNop = 0 57 | szEmpty = 2 58 | szCaptureBegin = 4 59 | szCaptureLate = 4 60 | szCaptureEnd = 2 61 | szCaptureFull = 4 62 | szMemoClose = 2 63 | szMemoTreeInsert = 2 64 | szMemoTree = 2 65 | szMemoTreeClose = 4 66 | szCheckBegin = 6 67 | szCheckEnd = 4 68 | szError = 4 69 | 70 | // jumps 71 | szJump = 4 72 | szChoice = 4 73 | szCall = 4 74 | szCommit = 4 75 | szPartialCommit = 4 76 | szBackCommit = 4 77 | szTestChar = 6 78 | szTestCharNoChoice = 6 79 | szTestSet = 6 80 | szTestSetNoChoice = 6 81 | szTestAny = 6 82 | szMemoOpen = 6 83 | szMemoTreeOpen = 6 84 | ) 85 | 86 | // returns the size in bytes of the encoded version of this instruction 87 | func size(insn isa.Insn) uint { 88 | var sz uint 89 | switch insn.(type) { 90 | case isa.Label, isa.Nop: 91 | return 0 92 | case isa.JumpType, isa.CheckBegin: 93 | sz += 4 94 | default: 95 | sz += 2 96 | } 97 | 98 | // handle instructions with extra args 99 | switch insn.(type) { 100 | case isa.MemoOpen, isa.MemoTreeOpen, isa.MemoTreeClose, isa.CaptureBegin, isa.CaptureLate, 101 | isa.CaptureFull, isa.TestChar, isa.TestCharNoChoice, isa.TestSet, 102 | isa.TestSetNoChoice, isa.TestAny, isa.Error, isa.CheckBegin, isa.CheckEnd: 103 | sz += 2 104 | } 105 | 106 | return sz 107 | } 108 | 109 | var names = map[byte]string{ 110 | opChar: "Char", 111 | opJump: "Jump", 112 | opChoice: "Choice", 113 | opCall: "Call", 114 | opCommit: "Commit", 115 | opReturn: "Return", 116 | opFail: "Fail", 117 | opSet: "Set", 118 | opAny: "Any", 119 | opPartialCommit: "PartialCommit", 120 | opSpan: "Span", 121 | opBackCommit: "BackCommit", 122 | opFailTwice: "FailTwice", 123 | opTestChar: "TestChar", 124 | opTestCharNoChoice: "TestCharNoChoice", 125 | opTestSet: "TestSet", 126 | opTestSetNoChoice: "TestSetNoChoice", 127 | opTestAny: "TestAny", 128 | opEnd: "End", 129 | opNop: "Nop", 130 | opCaptureBegin: "CaptureBegin", 131 | opCaptureLate: "CaptureLate", 132 | opCaptureEnd: "CaptureEnd", 133 | opCaptureFull: "CaptureFull", 134 | opCheckBegin: "CheckBegin", 135 | opCheckEnd: "CheckEnd", 136 | opMemoOpen: "MemoOpen", 137 | opMemoClose: "MemoClose", 138 | opMemoTreeOpen: "MemoTreeOpen", 139 | opMemoTreeInsert: "MemoTreeInsert", 140 | opMemoTree: "MemoTree", 141 | opMemoTreeClose: "MemoTreeClose", 142 | opError: "Error", 143 | opEmpty: "Empty", 144 | } 145 | 146 | func opstr(op byte) string { 147 | return names[op] 148 | } 149 | -------------------------------------------------------------------------------- /vm/stack.go: -------------------------------------------------------------------------------- 1 | package vm 2 | 3 | import ( 4 | "github.com/zyedidia/gpeg/memo" 5 | ) 6 | 7 | type stack struct { 8 | entries []stackEntry 9 | capt []*memo.Capture 10 | } 11 | 12 | func (s *stack) addCapt(capt ...*memo.Capture) { 13 | if len(s.entries) == 0 { 14 | s.capt = append(s.capt, capt...) 
15 | } else {
16 | s.entries[len(s.entries)-1].addCapt(capt)
17 | }
18 | }
19 | 
20 | func (s *stack) propCapt() {
21 | if len(s.entries) == 0 {
22 | return
23 | }
24 | 
25 | top := s.entries[len(s.entries)-1]
26 | if top.capt != nil && len(top.capt) > 0 {
27 | if len(s.entries) == 1 {
28 | s.capt = append(s.capt, top.capt...)
29 | } else {
30 | s.entries[len(s.entries)-2].addCapt(top.capt)
31 | }
32 | }
33 | }
34 | 
35 | const (
36 | stRet = iota
37 | stBtrack
38 | stMemo
39 | stMemoTree
40 | stCapt
41 | stCheck
42 | )
43 | 
44 | type stackEntry struct {
45 | stype byte
46 | // we could use a union to avoid the space cost but I have found this
47 | // doesn't impact performance and the space cost itself is quite small
48 | // because the stack is usually small.
49 | ret stackRet // stackRet is reused for stCheck
50 | btrack stackBacktrack
51 | memo stackMemo // stackMemo is reused for stCapt
52 | 
53 | capt []*memo.Capture
54 | }
55 | 
56 | func (se *stackEntry) addCapt(capt []*memo.Capture) {
57 | if len(capt) == 0 {
58 | return
59 | }
60 | if len(se.capt) == 0 {
61 | se.capt = capt
62 | } else {
63 | se.capt = append(se.capt, capt...)
64 | }
65 | }
66 | 
67 | type stackRet int
68 | 
69 | type stackBacktrack struct {
70 | ip int
71 | off int
72 | }
73 | 
74 | type stackMemo struct {
75 | id int16
76 | pos int
77 | count int
78 | }
79 | 
80 | func newStack() *stack {
81 | return &stack{
82 | entries: make([]stackEntry, 0, 4),
83 | capt: make([]*memo.Capture, 0),
84 | }
85 | }
86 | 
87 | func (s *stack) reset() {
88 | s.capt = nil
89 | // need to completely remake the slice so that the underlying captures can be
90 | // released to the garbage collector if the user has no references to them
91 | // (unused stack entries shouldn't keep references to those captures).
92 | s.entries = make([]stackEntry, 0, 4)
93 | }
94 | 
95 | func (s *stack) push(ent stackEntry) {
96 | s.entries = append(s.entries, ent)
97 | }
98 | 
99 | // propagate marks whether captures should be propagated up the stack.
100 | func (s *stack) pop(propagate bool) *stackEntry {
101 | if len(s.entries) == 0 {
102 | return nil
103 | }
104 | 
105 | ret := &s.entries[len(s.entries)-1]
106 | s.entries = s.entries[:len(s.entries)-1]
107 | // For non-capture entries, propagate the captures upward.
108 | // For capture entries, we create a new node with the corresponding
109 | // children, and this is manually handled by the caller.
110 | if propagate && ret.capt != nil {
111 | s.addCapt(ret.capt...)
112 | } 113 | return ret 114 | } 115 | 116 | func (s *stack) peek() *stackEntry { 117 | return s.peekn(0) 118 | } 119 | 120 | func (s *stack) peekn(n int) *stackEntry { 121 | if len(s.entries) <= n { 122 | return nil 123 | } 124 | return &s.entries[len(s.entries)-n-1] 125 | } 126 | 127 | func (s *stack) pushRet(r stackRet) { 128 | s.push(stackEntry{ 129 | stype: stRet, 130 | ret: r, 131 | }) 132 | } 133 | 134 | func (s *stack) pushBacktrack(b stackBacktrack) { 135 | s.push(stackEntry{ 136 | stype: stBtrack, 137 | btrack: b, 138 | }) 139 | } 140 | 141 | func (s *stack) pushMemo(m stackMemo) { 142 | s.push(stackEntry{ 143 | stype: stMemo, 144 | memo: m, 145 | }) 146 | } 147 | 148 | func (s *stack) pushMemoTree(m stackMemo) { 149 | s.push(stackEntry{ 150 | stype: stMemoTree, 151 | memo: m, 152 | }) 153 | } 154 | 155 | func (s *stack) pushCapt(m stackMemo) { 156 | s.push(stackEntry{ 157 | stype: stCapt, 158 | memo: m, 159 | }) 160 | } 161 | 162 | func (s *stack) pushCheck(m stackMemo) { 163 | s.push(stackEntry{ 164 | stype: stCheck, 165 | memo: m, 166 | }) 167 | } 168 | -------------------------------------------------------------------------------- /vm/vm.go: -------------------------------------------------------------------------------- 1 | // Package vm implements the GPeg virtual machine. 2 | package vm 3 | 4 | import ( 5 | "encoding/binary" 6 | "fmt" 7 | "io" 8 | "regexp/syntax" 9 | 10 | "github.com/zyedidia/gpeg/charset" 11 | "github.com/zyedidia/gpeg/input" 12 | "github.com/zyedidia/gpeg/memo" 13 | ) 14 | 15 | type ParseError struct { 16 | Message string 17 | Pos int 18 | } 19 | 20 | type Interval struct { 21 | Low, High int 22 | } 23 | 24 | func (e ParseError) Error() string { 25 | return fmt.Sprintf("%v: %s", e.Pos, e.Message) 26 | } 27 | 28 | // Exec executes the parsing program this virtual machine was created with. It 29 | // returns whether the parse was a match, the last position in the subject 30 | // string that was matched, and any captures that were created. 31 | func (vm *Code) Exec(r io.ReaderAt, memtbl memo.Table) (bool, int, *memo.Capture, []ParseError) { 32 | ip := 0 33 | st := newStack() 34 | src := input.NewInput(r) 35 | 36 | // parse in parallel? 37 | // if memtbl.Size() == 0 { 38 | // srccopy := input.NewInput(r) 39 | // srccopy.SeekTo(1000000) 40 | // go vm.exec(0, newStack(), srccopy, memtbl) 41 | // } 42 | 43 | return vm.exec(ip, st, src, memtbl, nil) 44 | } 45 | 46 | func (vm *Code) ExecInterval(r io.ReaderAt, memtbl memo.Table, intrvl *Interval) (bool, int, *memo.Capture, []ParseError) { 47 | ip := 0 48 | st := newStack() 49 | src := input.NewInput(r) 50 | 51 | return vm.exec(ip, st, src, memtbl, intrvl) 52 | } 53 | 54 | func (vm *Code) exec(ip int, st *stack, src *input.Input, memtbl memo.Table, intrvl *Interval) (bool, int, *memo.Capture, []ParseError) { 55 | idata := vm.data.Insns 56 | 57 | if ip < 0 || ip >= len(idata) { 58 | return true, 0, memo.NewCaptureDummy(0, 0, nil), nil 59 | } 60 | 61 | var caprange Interval 62 | 63 | if intrvl != nil { 64 | caprange = *intrvl 65 | // Apply an edit that clears all memoized entries in the interval 66 | // we are capturing. This ensures that we find all captures in the 67 | // requested interval. 
68 | memtbl.ApplyEdit(memo.Edit{ 69 | Start: intrvl.Low, 70 | End: intrvl.High, 71 | Len: intrvl.High - intrvl.Low, 72 | }) 73 | } 74 | 75 | memoize := func(id, pos, mlen, count int, capt []*memo.Capture) { 76 | if intrvl != nil { 77 | capt = nil 78 | } 79 | mexam := max(src.Furthest(), src.Pos()) - pos + 1 80 | memtbl.Put(id, pos, mlen, mexam, count, capt) 81 | } 82 | 83 | success := true 84 | var errs []ParseError = nil 85 | 86 | loop: 87 | for { 88 | op := idata[ip] 89 | switch op { 90 | case opChar: 91 | b := decodeU8(idata[ip+1:]) 92 | in, ok := src.Peek() 93 | if ok && b == in { 94 | src.Advance(1) 95 | ip += szChar 96 | } else { 97 | goto fail 98 | } 99 | case opJump: 100 | lbl := decodeU24(idata[ip+1:]) 101 | ip = int(lbl) 102 | case opChoice: 103 | lbl := decodeU24(idata[ip+1:]) 104 | st.pushBacktrack(stackBacktrack{int(lbl), src.Pos()}) 105 | ip += szChoice 106 | case opCall: 107 | lbl := decodeU24(idata[ip+1:]) 108 | st.pushRet(stackRet(ip + szCall)) 109 | ip = int(lbl) 110 | case opCommit: 111 | lbl := decodeU24(idata[ip+1:]) 112 | st.pop(true) 113 | ip = int(lbl) 114 | case opReturn: 115 | ent := st.pop(true) 116 | if ent != nil && ent.stype == stRet { 117 | ip = int(ent.ret) 118 | } else { 119 | panic("Return failed") 120 | } 121 | case opFail: 122 | goto fail 123 | case opSet: 124 | set := decodeSet(idata[ip+1:], vm.data.Sets) 125 | in, ok := src.Peek() 126 | if ok && set.Has(in) { 127 | src.Advance(1) 128 | ip += szSet 129 | } else { 130 | goto fail 131 | } 132 | case opAny: 133 | n := decodeU8(idata[ip+1:]) 134 | ok := src.Advance(int(n)) 135 | if ok { 136 | ip += szAny 137 | } else { 138 | goto fail 139 | } 140 | case opPartialCommit: 141 | lbl := decodeU24(idata[ip+1:]) 142 | ent := st.peek() 143 | if ent != nil && ent.stype == stBtrack { 144 | ent.btrack.off = src.Pos() 145 | st.propCapt() 146 | ent.capt = nil 147 | ip = int(lbl) 148 | } else { 149 | panic("PartialCommit failed") 150 | } 151 | case opSpan: 152 | set := decodeSet(idata[ip+1:], vm.data.Sets) 153 | in, ok := src.Peek() 154 | for ok && set.Has(in) { 155 | src.Advance(1) 156 | in, ok = src.Peek() 157 | } 158 | ip += szSpan 159 | case opBackCommit: 160 | lbl := decodeU24(idata[ip+1:]) 161 | ent := st.pop(true) 162 | if ent != nil && ent.stype == stBtrack { 163 | src.SeekTo(ent.btrack.off) 164 | ip = int(lbl) 165 | } else { 166 | panic("BackCommit failed") 167 | } 168 | case opFailTwice: 169 | st.pop(false) 170 | goto fail 171 | case opEmpty: 172 | op := syntax.EmptyOp(decodeU8(idata[ip+1:])) 173 | r1, r2 := rune(-1), rune(-1) 174 | // TODO: PeekBefore may cause problems with incremental parsing 175 | b1, ok := src.PeekBefore() 176 | if ok { 177 | r1 = rune(b1) 178 | } 179 | b2, ok := src.Peek() 180 | if ok { 181 | r2 = rune(b2) 182 | } 183 | sat := syntax.EmptyOpContext(r1, r2) 184 | if (sat & op) != 0 { 185 | ip += szEmpty 186 | } else { 187 | goto fail 188 | } 189 | case opTestChar: 190 | b := decodeU8(idata[ip+2:]) 191 | lbl := decodeU24(idata[ip+3:]) 192 | in, ok := src.Peek() 193 | if ok && in == b { 194 | st.pushBacktrack(stackBacktrack{int(lbl), src.Pos()}) 195 | src.Advance(1) 196 | ip += szTestChar 197 | } else { 198 | ip = int(lbl) 199 | } 200 | case opTestCharNoChoice: 201 | b := decodeU8(idata[ip+2:]) 202 | in, ok := src.Peek() 203 | if ok && in == b { 204 | src.Advance(1) 205 | ip += szTestCharNoChoice 206 | } else { 207 | lbl := decodeU24(idata[ip+3:]) 208 | ip = int(lbl) 209 | } 210 | case opTestSet: 211 | lbl := decodeU24(idata[ip+3:]) 212 | set := decodeSet(idata[ip+2:], vm.data.Sets) 213 | 
in, ok := src.Peek() 214 | if ok && set.Has(in) { 215 | st.pushBacktrack(stackBacktrack{int(lbl), src.Pos()}) 216 | src.Advance(1) 217 | ip += szTestSet 218 | } else { 219 | ip = int(lbl) 220 | } 221 | case opTestSetNoChoice: 222 | set := decodeSet(idata[ip+2:], vm.data.Sets) 223 | in, ok := src.Peek() 224 | if ok && set.Has(in) { 225 | src.Advance(1) 226 | ip += szTestSetNoChoice 227 | } else { 228 | lbl := decodeU24(idata[ip+3:]) 229 | ip = int(lbl) 230 | } 231 | case opTestAny: 232 | n := decodeU8(idata[ip+2:]) 233 | lbl := decodeU24(idata[ip+3:]) 234 | ent := stackBacktrack{int(lbl), src.Pos()} 235 | ok := src.Advance(int(n)) 236 | if ok { 237 | st.pushBacktrack(ent) 238 | ip += szTestAny 239 | } else { 240 | ip = int(lbl) 241 | } 242 | case opCaptureBegin: 243 | id := decodeI16(idata[ip+2:]) 244 | st.pushCapt(stackMemo{ 245 | id: id, 246 | pos: src.Pos(), 247 | }) 248 | ip += szCaptureBegin 249 | case opCaptureLate: 250 | back := decodeU8(idata[ip+1:]) 251 | id := decodeI16(idata[ip+2:]) 252 | st.pushCapt(stackMemo{ 253 | id: id, 254 | pos: src.Pos() - int(back), 255 | }) 256 | ip += szCaptureLate 257 | case opCaptureFull: 258 | back := int(decodeU8(idata[ip+1:])) 259 | id := decodeI16(idata[ip+2:]) 260 | pos := src.Pos() 261 | 262 | if overlaps(intrvl, pos-back, pos) { 263 | caprange.Low = min(caprange.Low, pos-back) 264 | caprange.High = max(caprange.High, pos) 265 | capt := memo.NewCaptureNode(int(id), pos-back, back, nil) 266 | st.addCapt(capt) 267 | } 268 | 269 | ip += szCaptureFull 270 | case opCaptureEnd: 271 | ent := st.pop(false) 272 | 273 | if ent == nil || ent.stype != stCapt { 274 | panic("CaptureEnd did not find capture entry") 275 | } 276 | 277 | end := src.Pos() 278 | if overlaps(intrvl, ent.memo.pos, end) { 279 | caprange.Low = min(caprange.Low, ent.memo.pos) 280 | caprange.High = max(caprange.High, end) 281 | capt := memo.NewCaptureNode(int(ent.memo.id), ent.memo.pos, end-ent.memo.pos, ent.capt) 282 | st.addCapt(capt) 283 | } 284 | ip += szCaptureEnd 285 | case opEnd: 286 | fail := decodeU8(idata[ip+1:]) 287 | success = fail != 1 288 | break loop 289 | case opMemoOpen: 290 | lbl := decodeU24(idata[ip+1:]) 291 | id := decodeI16(idata[ip+4:]) 292 | 293 | ment, ok := memtbl.Get(int(id), src.Pos()) 294 | if ok { 295 | if ment.Length() == -1 { 296 | goto fail 297 | } 298 | capt := ment.Captures() 299 | if capt != nil { 300 | st.addCapt(capt...) 301 | } 302 | src.Advance(ment.Length()) 303 | ip = int(lbl) 304 | } else { 305 | st.pushMemo(stackMemo{ 306 | id: id, 307 | pos: src.Pos(), 308 | }) 309 | ip += szMemoOpen 310 | } 311 | case opMemoClose: 312 | ent := st.pop(true) 313 | if ent != nil && ent.stype == stMemo { 314 | mlen := src.Pos() - ent.memo.pos 315 | memoize(int(ent.memo.id), ent.memo.pos, mlen, 1, ent.capt) 316 | } else { 317 | panic("memo close failed") 318 | } 319 | ip += szMemoClose 320 | case opMemoTreeOpen: 321 | lbl := decodeU24(idata[ip+1:]) 322 | id := decodeI16(idata[ip+4:]) 323 | 324 | ment, ok := memtbl.Get(int(id), src.Pos()) 325 | if ok { 326 | if ment.Length() == -1 { 327 | goto fail 328 | } 329 | st.pushMemoTree(stackMemo{ 330 | id: id, 331 | pos: src.Pos(), 332 | count: ment.Count(), 333 | }) 334 | capt := ment.Captures() 335 | if capt != nil { 336 | st.addCapt(capt...) 
337 | } 338 | src.Advance(ment.Length()) 339 | src.Peek() 340 | ip = int(lbl) 341 | } else { 342 | st.pushMemoTree(stackMemo{ 343 | id: id, 344 | pos: src.Pos(), 345 | }) 346 | ip += szMemoTreeOpen 347 | } 348 | case opMemoTreeClose: 349 | id := decodeI16(idata[ip+2:]) 350 | for p := st.peek(); p != nil && p.stype == stMemoTree && p.memo.id == id; p = st.peek() { 351 | st.pop(true) 352 | } 353 | ip += szMemoTreeClose 354 | case opMemoTreeInsert: 355 | ent := st.peek() 356 | if ent == nil || ent.stype != stMemoTree { 357 | panic("no memo entry on stack") 358 | } 359 | mlen := src.Pos() - ent.memo.pos 360 | ent.memo.count++ 361 | memoize(int(ent.memo.id), ent.memo.pos, mlen, ent.memo.count, ent.capt) 362 | ip += szMemoTreeInsert 363 | case opMemoTree: 364 | seen := 0 365 | accum := 0 366 | for { 367 | top := st.peekn(seen) 368 | next := st.peekn(seen + 1) 369 | 370 | if top == nil || next == nil || top.stype != stMemoTree || next.stype != stMemoTree { 371 | break 372 | } 373 | 374 | seen++ 375 | accum += top.memo.count 376 | 377 | if accum < next.memo.count { 378 | continue 379 | } 380 | 381 | for i := 0; i < seen-1; i++ { 382 | st.pop(true) 383 | } 384 | ent := st.pop(false) // next is now top of stack 385 | 386 | if len(ent.capt) > 0 && intrvl == nil { 387 | dummy := memo.NewCaptureDummy(ent.memo.pos, src.Pos()-ent.memo.pos, ent.capt) 388 | st.addCapt(dummy) 389 | } else if len(ent.capt) > 0 { 390 | st.addCapt(ent.capt...) 391 | } 392 | 393 | next.memo.count = accum + next.memo.count 394 | mlen := src.Pos() - next.memo.pos 395 | memoize(int(next.memo.id), next.memo.pos, mlen, next.memo.count, next.capt) 396 | 397 | accum = 0 398 | seen = 0 399 | } 400 | 401 | ip += szMemoTree 402 | case opCheckBegin: 403 | id := decodeI16(idata[ip+2:]) 404 | flag := decodeI16(idata[ip+4:]) 405 | st.pushCheck(stackMemo{ 406 | id: id, 407 | count: int(flag), 408 | pos: src.Pos(), 409 | }) 410 | ip += szCheckBegin 411 | case opCheckEnd: 412 | ent := st.pop(true) 413 | if ent == nil || ent.stype != stCheck { 414 | panic("check end needs check stack entry") 415 | } 416 | checkid := decodeU24(idata[ip+1:]) 417 | checker := vm.data.Checkers[checkid] 418 | 419 | id := int(ent.memo.id) 420 | flag := ent.memo.count 421 | n := checker.Check(src.Slice(int(ent.memo.pos), src.Pos()), src, id, flag) 422 | if n == -1 { 423 | goto fail 424 | } else { 425 | src.Advance(n) 426 | } 427 | 428 | ip += szCheckEnd 429 | case opError: 430 | errid := decodeU24(idata[ip+1:]) 431 | msg := vm.data.Errors[errid] 432 | errs = append(errs, ParseError{ 433 | Pos: src.Pos(), 434 | Message: msg, 435 | }) 436 | ip += szError 437 | default: 438 | panic("Invalid opcode") 439 | } 440 | } 441 | 442 | if intrvl != nil { 443 | return success, src.Pos(), memo.NewCaptureDummy(caprange.Low, caprange.High-caprange.Low, st.capt), errs 444 | } 445 | return success, src.Pos(), memo.NewCaptureDummy(0, src.Pos(), st.capt), errs 446 | 447 | fail: 448 | ent := st.pop(false) 449 | if ent == nil { 450 | // match failed 451 | return false, src.Pos(), nil, errs 452 | } 453 | 454 | switch ent.stype { 455 | case stBtrack: 456 | ip = ent.btrack.ip 457 | src.SeekTo(ent.btrack.off) 458 | ent.capt = nil 459 | case stMemo: 460 | // Mark this position in the memoTable as a failed match 461 | memoize(int(ent.memo.id), ent.memo.pos, -1, 0, nil) 462 | ent.capt = nil 463 | goto fail 464 | case stRet, stCapt, stCheck: 465 | ent.capt = nil 466 | goto fail 467 | } 468 | 469 | goto loop 470 | } 471 | 472 | func decodeU8(b []byte) byte { 473 | return b[0] 474 | } 475 | 476 | 
func decodeI8(b []byte) int8 { 477 | return int8(b[0]) 478 | } 479 | 480 | func decodeU16(b []byte) uint16 { 481 | return binary.LittleEndian.Uint16(b[0:]) 482 | } 483 | 484 | func decodeI16(b []byte) int16 { 485 | return int16(binary.LittleEndian.Uint16(b[0:])) 486 | } 487 | 488 | func decodeU24(b []byte) uint32 { 489 | i1 := uint32(decodeU8(b)) 490 | i2 := uint32(decodeU16(b[1:])) 491 | i := (i1 << 16) | i2 492 | return i 493 | } 494 | 495 | func decodeSet(b []byte, sets []charset.Set) charset.Set { 496 | i := decodeU8(b) 497 | return sets[i] 498 | } 499 | 500 | func overlaps(i *Interval, low2, high2 int) bool { 501 | if i == nil { 502 | return true 503 | } 504 | return i.Low < high2 && i.High > low2 505 | } 506 | 507 | func min(a, b int) int { 508 | if a < b { 509 | return a 510 | } 511 | return b 512 | } 513 | func max(a, b int) int { 514 | if a > b { 515 | return a 516 | } 517 | return b 518 | } 519 | --------------------------------------------------------------------------------
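The vm package above is also what enables dynamically loaded grammars: compiled bytecode can be serialized with Code.ToBytes and restored with vm.FromBytes (both exercised in vm/code_test.go). A minimal, illustrative round-trip sketch (not a repository file), using only exported functions that appear in this dump:

// loadgrammar.go — illustrative only, not part of the repository.
package main

import (
	"fmt"
	"strings"

	"github.com/zyedidia/gpeg/memo"
	"github.com/zyedidia/gpeg/pattern"
	"github.com/zyedidia/gpeg/re"
	"github.com/zyedidia/gpeg/vm"
)

func main() {
	code := vm.Encode(pattern.MustCompile(re.MustCompile("Num <- [0-9]+")))

	// Serialize the bytecode (gob, gzip-compressed); in a real application
	// this could be written to disk or shipped over the network.
	b, err := code.ToBytes()
	if err != nil {
		panic(err)
	}

	// Later, or in a different process, load the grammar back and run it
	// without recompiling the pattern.
	loaded, err := vm.FromBytes(b)
	if err != nil {
		panic(err)
	}
	match, n, _, _ := loaded.Exec(strings.NewReader("12345"), memo.NoneTable{})
	fmt.Println(match, n) // should print: true 5
}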