├── go.mod ├── testdata ├── ungrammar.ungrammar ├── exprlang.ungrammar └── rust.ungrammar ├── .github └── workflows │ └── go.yml ├── .gitignore ├── errorlist.go ├── example_test.go ├── LICENSE ├── README.md ├── cmd └── ungrammar2json │ └── ungrammar2json.go ├── lexer_test.go ├── ungrammar.go ├── lexer.go ├── parser.go └── parser_test.go /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/eliben/go-ungrammar 2 | 3 | go 1.22.2 4 | -------------------------------------------------------------------------------- /testdata/ungrammar.ungrammar: -------------------------------------------------------------------------------- 1 | /// ungrammar for ungrammar 2 | /// copied from https://github.com/rust-analyzer/ungrammar/ 3 | 4 | Grammar = 5 | Node * 6 | 7 | Node = 8 | name:'ident' '=' Rule 9 | 10 | Rule = 11 | 'ident' 12 | | 'token_ident' 13 | | Rule * 14 | | Rule ( '|' Rule) * 15 | | Rule '?' 16 | | Rule '*' 17 | | '(' Rule ')' 18 | | label:'ident' ':' Rule 19 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Run Go tests 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | 16 | - name: Set up Go 17 | uses: actions/setup-go@v3 18 | with: 19 | go-version: "1.22.2" 20 | 21 | - name: Test 22 | run: go test -v ./... 
23 | -------------------------------------------------------------------------------- /testdata/exprlang.ungrammar: -------------------------------------------------------------------------------- 1 | // Ungrammar for a simple expression language 2 | 3 | Program = Stmt* 4 | 5 | Stmt = AssignStmt | Expr 6 | 7 | AssignStmt = 'set' 'ident' '=' Expr 8 | 9 | Expr = 10 | Literal 11 | | UnaryExpr 12 | | ParenExpr 13 | | BinExpr 14 | 15 | UnaryExpr = op:('+' | '-') Expr 16 | 17 | ParenExpr = '(' Expr ')' 18 | 19 | BinExpr = lhs:Expr op:('+' | '-' | '*' | '/' | '%') rhs:Expr 20 | 21 | Literal = 'int_literal' | 'ident' 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | -------------------------------------------------------------------------------- /errorlist.go: -------------------------------------------------------------------------------- 1 | // go-ungrammar: ErrorList type 2 | // 3 | // Eli Bendersky [https://eli.thegreenplace.net] 4 | // This code is in the public domain. 5 | 6 | package ungrammar 7 | 8 | import "fmt" 9 | 10 | // ErrorList represents multiple parse errors reported by the parser on a given 11 | // source. It's loosely modeled on scanner.ErrorList in the Go standard library. 12 | // ErrorList implements the error interface. 
13 | type ErrorList []error 14 | 15 | func (el *ErrorList) Add(err error) { 16 | *el = append(*el, err) 17 | } 18 | 19 | func (el ErrorList) Error() string { 20 | if len(el) == 0 { 21 | return "no errors" 22 | } else if len(el) == 1 { 23 | return el[0].Error() 24 | } else { 25 | return fmt.Sprintf("%s (and %d more errors)", el[0], len(el)-1) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | // go-ungrammar: basic usage example. 2 | // 3 | // Eli Bendersky [https://eli.thegreenplace.net] 4 | // This code is in the public domain. 5 | 6 | package ungrammar_test 7 | 8 | import ( 9 | "fmt" 10 | 11 | "github.com/eliben/go-ungrammar" 12 | ) 13 | 14 | func ExampleParser() { 15 | input := ` 16 | Foo = Bar Baz 17 | Baz = ( Kay Jay )* | 'id'` 18 | 19 | // Create an Ungrammar parser and parse input. 20 | p := ungrammar.NewParser(input) 21 | ungram, err := p.ParseGrammar() 22 | if err != nil { 23 | panic(err) 24 | } 25 | 26 | // Display the string representation of the parsed ungrammar. 27 | fmt.Println(ungram.Rules["Foo"].String()) 28 | fmt.Println(ungram.Rules["Baz"].String()) 29 | // Output: 30 | // Seq(Bar, Baz) 31 | // Alt(Rep(Seq(Kay, Jay)), 'id') 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. 
We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-ungrammar 2 | 3 | Ungrammar implementation and API in Go. [Blog post for background](https://eli.thegreenplace.net/2023/ungrammar-in-go-and-resilient-parsing/). 4 | 5 | Ungrammar is a DSL for 6 | [concrete syntax trees (CST)](https://en.wikipedia.org/wiki/Parse_tree). This implementation is based on the original 7 | [ungrammar crate](https://github.com/rust-analyzer/ungrammar/), also borrowing 8 | some test files from it. 9 | 10 | ## Ungrammar syntax 11 | 12 | The syntax of Ungrammar files is very simple: 13 | 14 | ``` 15 | // -- comment 16 | Name = -- non-terminal definition 17 | 'ident' -- token (terminal) 18 | A B -- sequence 19 | A | B -- alternation 20 | A* -- repetition (zero or more) 21 | A? -- optional (zero or one) 22 | (A B) -- grouping elements for precedence control 23 | label:A -- label hint for naming 24 | ``` 25 | 26 | For some concrete examples, look at files in the `testdata` directory. 
27 | 28 | ## Usage 29 | 30 | [![Go Reference](https://pkg.go.dev/badge/github.com/eliben/go-ungrammar.svg)](https://pkg.go.dev/github.com/eliben/go-ungrammar) 31 | 32 | Usage example: 33 | 34 | https://github.com/eliben/go-ungrammar/blob/229d0dd20660980d5069ed676c5c728a9fda5723/example_test.go#L13-L31 35 | 36 | For somewhat more sophisticated usage, see the `cmd/ungrammar2json` command. 37 | -------------------------------------------------------------------------------- /cmd/ungrammar2json/ungrammar2json.go: -------------------------------------------------------------------------------- 1 | // This program parses an ungrammar file and dumps the ungrammar into JSON 2 | // format that any tool/language can read. 3 | // 4 | // It reads stdin and writes to stdout. 5 | // 6 | // The emitted JSON is has minimal whitespace and is not formatted; pipe through 7 | // `jq .` for a pretty/formatted output. 8 | // 9 | // Eli Bendersky [https://eli.thegreenplace.net] 10 | // This code is in the public domain. 11 | 12 | package main 13 | 14 | import ( 15 | "encoding/json" 16 | "io" 17 | "log" 18 | "os" 19 | 20 | "github.com/eliben/go-ungrammar" 21 | ) 22 | 23 | func main() { 24 | if len(os.Args) != 1 { 25 | log.Fatal("Usage: ungrammar2json < input.ungram") 26 | } 27 | 28 | stdinBytes, err := io.ReadAll(os.Stdin) 29 | if err != nil { 30 | log.Fatal(err) 31 | } 32 | 33 | p := ungrammar.NewParser(string(stdinBytes)) 34 | grammar, err := p.ParseGrammar() 35 | if err != nil { 36 | log.Fatal("Error parsing ungrammar:", err) 37 | } 38 | 39 | grammarObj := make(object) 40 | for name, rule := range grammar.Rules { 41 | grammarObj[name] = ruleToObj(rule) 42 | } 43 | 44 | enc := json.NewEncoder(os.Stdout) 45 | if err := enc.Encode(grammarObj); err != nil { 46 | log.Fatal("Error encoding to JSON:", err) 47 | } 48 | } 49 | 50 | // object is a map with arbitrary values suitable for JSON encoding. 
51 | type object map[string]any 52 | 53 | func ruleToObj(r ungrammar.Rule) object { 54 | switch rr := r.(type) { 55 | case *ungrammar.Labeled: 56 | return object{"label": rr.Label, "rule": ruleToObj(rr.Rule)} 57 | case *ungrammar.Node: 58 | return object{"node": rr.Name} 59 | case *ungrammar.Token: 60 | return object{"token": rr.Value} 61 | case *ungrammar.Rep: 62 | return object{"rep": ruleToObj(rr.Rule)} 63 | case *ungrammar.Opt: 64 | return object{"opt": ruleToObj(rr.Rule)} 65 | case *ungrammar.Seq: 66 | var subRules []object 67 | for _, sr := range rr.Rules { 68 | subRules = append(subRules, ruleToObj(sr)) 69 | } 70 | return object{"seq": subRules} 71 | case *ungrammar.Alt: 72 | var subRules []object 73 | for _, sr := range rr.Rules { 74 | subRules = append(subRules, ruleToObj(sr)) 75 | } 76 | return object{"alt": subRules} 77 | default: 78 | return nil 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /lexer_test.go: -------------------------------------------------------------------------------- 1 | // Eli Bendersky [https://eli.thegreenplace.net] 2 | // This code is in the public domain. 3 | 4 | package ungrammar 5 | 6 | import ( 7 | "testing" 8 | ) 9 | 10 | func TestLexer(t *testing.T) { 11 | const input = ` 12 | someid 13 | : ? 
anotherid 'sometok' 14 | // comment 15 | ( idmore 'tt tt' ) // doc 16 | 'tt\'q' 'tt\\s' 17 | | 18 | ` 19 | 20 | lex := newLexer(input) 21 | var toks []token 22 | 23 | for { 24 | t := lex.nextToken() 25 | toks = append(toks, t) 26 | if t.name == EOF { 27 | break 28 | } 29 | } 30 | 31 | wantToks := []token{ 32 | token{NODE, "someid", location{2, 1}}, 33 | token{COLON, ":", location{3, 1}}, 34 | token{QMARK, "?", location{3, 3}}, 35 | token{NODE, "anotherid", location{3, 5}}, 36 | token{TOKEN, "sometok", location{3, 15}}, 37 | token{LPAREN, "(", location{5, 26}}, 38 | token{NODE, "idmore", location{5, 28}}, 39 | token{TOKEN, "tt tt", location{5, 35}}, 40 | token{RPAREN, ")", location{5, 43}}, 41 | token{TOKEN, `tt'q`, location{6, 1}}, 42 | token{TOKEN, `tt\s`, location{6, 9}}, 43 | token{PIPE, "|", location{7, 1}}, 44 | token{EOF, "", location{8, 0}}, 45 | } 46 | 47 | if len(wantToks) != len(toks) { 48 | t.Fatalf("length mismatch wantToks=%v, toks=%v", len(wantToks), len(toks)) 49 | } 50 | for i := 0; i < len(wantToks); i++ { 51 | if wantToks[i] != toks[i] { 52 | t.Errorf("mismatch at index %2v: got %v, want %v", i, toks[i], wantToks[i]) 53 | } 54 | } 55 | } 56 | 57 | func TestLexerEOF(t *testing.T) { 58 | // Test that we get as many EOF tokens at the end of the input as we ask for. 
59 | const input = `: ` 60 | lex := newLexer(input) 61 | 62 | if tok := lex.nextToken(); tok.name != COLON { 63 | t.Errorf("got %v, want COLON", tok) 64 | } 65 | for i := 0; i < 10; i++ { 66 | if tok := lex.nextToken(); tok.name != EOF { 67 | t.Errorf("got %v, want EOF", tok) 68 | } 69 | } 70 | } 71 | 72 | func allTokens(lex *lexer) []token { 73 | var toks []token 74 | for { 75 | t := lex.nextToken() 76 | toks = append(toks, t) 77 | if t.name == EOF { 78 | break 79 | } 80 | } 81 | return toks 82 | } 83 | 84 | func TestLexerError(t *testing.T) { 85 | var tests = []struct { 86 | input string 87 | errorIndex int 88 | errorValue string 89 | errorLocation location 90 | }{ 91 | {`hello $ bye`, 1, `unknown token starting with '$'`, location{1, 7}}, 92 | {`hello | $no`, 2, `unknown token starting with '$'`, location{1, 9}}, 93 | {`hello | $no @`, 4, `unknown token starting with '@'`, location{1, 13}}, 94 | {`he '202020`, 1, `unterminated token literal`, location{1, 4}}, 95 | } 96 | 97 | for _, tt := range tests { 98 | t.Run(tt.input, func(t *testing.T) { 99 | lex := newLexer(tt.input) 100 | toks := allTokens(lex) 101 | gotTok := toks[tt.errorIndex] 102 | if gotTok.name != ERROR || gotTok.value != tt.errorValue || gotTok.loc != tt.errorLocation { 103 | t.Errorf("got token %s, want ERROR with value=%q loc=%v", gotTok, tt.errorValue, tt.errorLocation) 104 | } 105 | }) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /ungrammar.go: -------------------------------------------------------------------------------- 1 | // go-ungrammar: Ungrammar Concrete Syntax Tree (CST). 2 | // 3 | // Eli Bendersky [https://eli.thegreenplace.net] 4 | // This code is in the public domain. 5 | 6 | // package ungrammar provides a parser and representation for Ungrammar 7 | // concrete syntax trees. 8 | package ungrammar 9 | 10 | import ( 11 | "fmt" 12 | "strings" 13 | ) 14 | 15 | // Grammar represents a parsed Ungrammar file. 
The input is represented as 16 | // a mapping between strings (rule names on the left-hand-side of Ungrammar 17 | // rules) and rules (CST). 18 | // For example, if we have a rule like "Foo = Bar Baz", the Rules map will 19 | // contain a mapping between the string "Foo" and the CST 20 | // Seq(Node(Bar), Node(Baz)). 21 | type Grammar struct { 22 | // Rules maps ruleName --> Rule 23 | Rules map[string]Rule 24 | 25 | // NameLoc maps ruleName --> its location in the input, for accurate error 26 | // reporting. Rules carry their own locations, but since names are just 27 | // strings, locations are kept here. 28 | NameLoc map[string]location 29 | } 30 | 31 | // Rule is the interface defining an Ungrammar CST subtree. At runtime, a value 32 | // implemeting the Rule interface will have a concrete type which is one of the 33 | // exported types in this file. 34 | type Rule interface { 35 | Location() location 36 | String() string 37 | } 38 | 39 | type Labeled struct { 40 | Label string 41 | Rule Rule 42 | labelLoc location 43 | } 44 | 45 | type Node struct { 46 | Name string 47 | nameLoc location 48 | } 49 | 50 | type Token struct { 51 | Value string 52 | valueLoc location 53 | } 54 | 55 | type Seq struct { 56 | Rules []Rule 57 | } 58 | 59 | type Alt struct { 60 | Rules []Rule 61 | } 62 | 63 | type Opt struct { 64 | Rule Rule 65 | } 66 | 67 | type Rep struct { 68 | Rule Rule 69 | } 70 | 71 | // Location methods 72 | 73 | func (seq *Seq) Location() location { 74 | return seq.Rules[0].Location() 75 | } 76 | 77 | func (tok *Token) Location() location { 78 | return tok.valueLoc 79 | } 80 | 81 | func (node *Node) Location() location { 82 | return node.nameLoc 83 | } 84 | 85 | func (alt *Alt) Location() location { 86 | return alt.Rules[0].Location() 87 | } 88 | 89 | func (lbl *Labeled) Location() location { 90 | return lbl.labelLoc 91 | } 92 | 93 | func (opt *Opt) Location() location { 94 | return opt.Rule.Location() 95 | } 96 | 97 | func (rep *Rep) Location() location { 98 | 
return rep.Rule.Location() 99 | } 100 | 101 | // String methods 102 | 103 | func (g *Grammar) String() string { 104 | var sb strings.Builder 105 | for name, rule := range g.Rules { 106 | fmt.Fprintf(&sb, "%s: %s\n", name, ruleString(rule)) 107 | } 108 | return sb.String() 109 | } 110 | 111 | func (lbl *Labeled) String() string { 112 | return fmt.Sprintf("%s:%s", lbl.Label, ruleString(lbl.Rule)) 113 | } 114 | 115 | func (node *Node) String() string { 116 | return node.Name 117 | } 118 | 119 | func (tok *Token) String() string { 120 | return fmt.Sprintf("'%s'", tok.Value) 121 | } 122 | 123 | func (seq *Seq) String() string { 124 | var parts []string 125 | for _, r := range seq.Rules { 126 | parts = append(parts, ruleString(r)) 127 | } 128 | return fmt.Sprintf("Seq(%v)", strings.Join(parts, ", ")) 129 | } 130 | 131 | func (alt *Alt) String() string { 132 | var parts []string 133 | for _, r := range alt.Rules { 134 | parts = append(parts, ruleString(r)) 135 | } 136 | return fmt.Sprintf("Alt(%v)", strings.Join(parts, ", ")) 137 | } 138 | 139 | func (opt *Opt) String() string { 140 | return fmt.Sprintf("Opt(%s)", ruleString(opt.Rule)) 141 | } 142 | 143 | func (rep *Rep) String() string { 144 | return fmt.Sprintf("Rep(%s)", ruleString(rep.Rule)) 145 | } 146 | 147 | // ruleString returns a Rule's String() representation, or if r == nil. 148 | func ruleString(r Rule) string { 149 | if r == nil { 150 | return "" 151 | } else { 152 | return r.String() 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /lexer.go: -------------------------------------------------------------------------------- 1 | // go-ungrammar: lexical analyzer. 2 | // 3 | // Eli Bendersky [https://eli.thegreenplace.net] 4 | // This code is in the public domain. 
5 | 6 | package ungrammar 7 | 8 | import ( 9 | "fmt" 10 | "strings" 11 | "unicode/utf8" 12 | ) 13 | 14 | // token represents a Ungrammar language token - it has a name (one of the 15 | // constants declared below), string value and a location. 16 | // 17 | // The term "token" is slightly overloaded in this file; in Ungrammar, a quoted 18 | // string literal is also called a "Token" -- this is just one of the kinds of 19 | // tokens this lexer returns. 20 | type token struct { 21 | name tokenName 22 | value string 23 | loc location 24 | } 25 | 26 | type location struct { 27 | line int 28 | column int 29 | } 30 | 31 | func (loc location) String() string { 32 | return fmt.Sprintf("%v:%v", loc.line, loc.column) 33 | } 34 | 35 | type tokenName int 36 | 37 | const ( 38 | // Special tokens 39 | ERROR tokenName = iota 40 | EOF 41 | 42 | NODE 43 | TOKEN 44 | 45 | EQ 46 | STAR 47 | PIPE 48 | QMARK 49 | COLON 50 | LPAREN 51 | RPAREN 52 | ) 53 | 54 | var tokenNames = [...]string{ 55 | ERROR: "ERROR", 56 | EOF: "EOF", 57 | 58 | NODE: "NODE", 59 | TOKEN: "TOKEN", 60 | 61 | EQ: "EQ", 62 | STAR: "STAR", 63 | PIPE: "PIPE", 64 | QMARK: "QMARK", 65 | COLON: "COLON", 66 | LPAREN: "LPAREN", 67 | RPAREN: "RPAREN", 68 | } 69 | 70 | func (tok token) String() string { 71 | return fmt.Sprintf("token{%s, '%s', %s}", tokenNames[tok.name], tok.value, tok.loc) 72 | } 73 | 74 | // lexer provides lexical scanning of text into Ungrammar tokens. 75 | // 76 | // Create a new lexer with newLexer and then call nextToken repeatedly to get 77 | // tokens from the stream. The lexer will return an EOF token when done. 78 | type lexer struct { 79 | buf string 80 | 81 | // Current rune. 82 | r rune 83 | 84 | // Offset of the current rune in buf. 85 | rpos int 86 | 87 | // Offset of the next rune in buf. 88 | nextpos int 89 | 90 | // location of r 91 | loc location 92 | } 93 | 94 | // newLexer creates a new lexer for the given string. 
95 | func newLexer(buf string) *lexer { 96 | lex := lexer{ 97 | buf: buf, 98 | r: -1, 99 | rpos: 0, 100 | nextpos: 0, 101 | 102 | // column starts at 0 since advace() always increments it before we have 103 | // the first rune in r 104 | loc: location{1, 0}, 105 | } 106 | 107 | lex.advance() 108 | return &lex 109 | } 110 | 111 | // nextToken returns the next token in the input string. 112 | func (lex *lexer) nextToken() token { 113 | lex.skipNontokens() 114 | 115 | rloc := lex.loc 116 | if lex.r < 0 { 117 | return token{EOF, "", rloc} 118 | } else if isIdChar(lex.r) { 119 | return lex.scanNode() 120 | } 121 | 122 | switch lex.r { 123 | case '\'': 124 | return lex.scanQuoted() 125 | case '=': 126 | lex.advance() 127 | return token{EQ, "=", rloc} 128 | case '*': 129 | lex.advance() 130 | return token{STAR, "*", rloc} 131 | case '?': 132 | lex.advance() 133 | return token{QMARK, "?", rloc} 134 | case '(': 135 | lex.advance() 136 | return token{LPAREN, "(", rloc} 137 | case ')': 138 | lex.advance() 139 | return token{RPAREN, ")", rloc} 140 | case '|': 141 | lex.advance() 142 | return token{PIPE, "|", rloc} 143 | case ':': 144 | lex.advance() 145 | return token{COLON, ":", rloc} 146 | default: 147 | errtok := lex.emitError(fmt.Sprintf("unknown token starting with %q", lex.r), rloc) 148 | lex.advance() 149 | return errtok 150 | } 151 | } 152 | 153 | // advance the lexer's internal state to point to the next rune in the 154 | // input. advance is responsible for maintaining the main invariant of the 155 | // lexer: at any point after advance has been called at least once, lex.r 156 | // is the current token the lexer is looking at; lex.rpos is its offset 157 | // the string and lex.loc is its location. lex.nextpost is the offset of the 158 | // next token in the input. When the end of the input is reached, lex.r 159 | // becomes EOF. 
160 | func (lex *lexer) advance() { 161 | if lex.nextpos < len(lex.buf) { 162 | lex.rpos = lex.nextpos 163 | r, w := rune(lex.buf[lex.nextpos]), 1 164 | 165 | if r >= utf8.RuneSelf { 166 | r, w = utf8.DecodeRuneInString(lex.buf[lex.nextpos:]) 167 | } 168 | 169 | lex.nextpos += w 170 | lex.r = r 171 | lex.loc.column += 1 172 | } else { 173 | lex.rpos = len(lex.buf) 174 | lex.r = -1 // EOF 175 | } 176 | } 177 | 178 | // peekNext looks at the next rune in the input, after lex.r. It only works 179 | // correctly for rune values < 128. 180 | func (lex *lexer) peekNext() rune { 181 | if lex.nextpos < len(lex.buf) { 182 | return rune(lex.buf[lex.nextpos]) 183 | } else { 184 | return -1 185 | } 186 | } 187 | 188 | func (lex *lexer) emitError(msg string, loc location) token { 189 | return token{ 190 | name: ERROR, 191 | value: msg, 192 | loc: loc, 193 | } 194 | } 195 | 196 | func (lex *lexer) skipNontokens() { 197 | for { 198 | switch lex.r { 199 | case ' ', '\t', '\r': 200 | lex.advance() 201 | case '\n': 202 | lex.loc.line++ 203 | // Set column to 0 because advance() immediately increments it 204 | lex.loc.column = 0 205 | lex.advance() 206 | case '/': 207 | if lex.peekNext() == '/' { 208 | lex.skipLineComment() 209 | } else { /* A lone '/' is not a comment start; return so nextToken reports it as an unknown token instead of looping here forever (nothing advances past it). */ return } 210 | default: 211 | return 212 | } 213 | } 214 | } 215 | 216 | func (lex *lexer) skipLineComment() { 217 | for lex.r != '\n' && lex.r > 0 { 218 | lex.advance() 219 | } 220 | } 221 | 222 | func (lex *lexer) scanNode() token { 223 | startloc := lex.loc 224 | startpos := lex.rpos 225 | for isIdChar(lex.r) { 226 | lex.advance() 227 | } 228 | return token{NODE, lex.buf[startpos:lex.rpos], startloc} 229 | } 230 | 231 | func (lex *lexer) scanQuoted() token { 232 | startloc := lex.loc 233 | lex.advance() // skip leading quote 234 | var tokbuf strings.Builder 235 | for { 236 | if lex.r == '\'' { 237 | lex.advance() 238 | return token{TOKEN, tokbuf.String(), startloc} 239 | } else if lex.r == -1 { 240 | return lex.emitError("unterminated token literal", startloc)
241 | } else if lex.r == '\\' { 242 | // Skip the backslash and write the rune following it into the buffer. 243 | lex.advance() 244 | tokbuf.WriteRune(lex.r) 245 | } else { 246 | tokbuf.WriteRune(lex.r) 247 | } 248 | lex.advance() 249 | } 250 | } 251 | 252 | func isIdChar(r rune) bool { 253 | if r >= 256 { 254 | return false 255 | } 256 | 257 | const mask = 0 | 258 | (1<<26-1)<<'A' | 259 | (1<<26-1)<<'a' | 260 | 1<<'_' 261 | 262 | b := byte(r) 263 | return (uint64(1)<>64) != 0 264 | } 265 | -------------------------------------------------------------------------------- /parser.go: -------------------------------------------------------------------------------- 1 | // go-ungrammar: parser. 2 | // 3 | // Eli Bendersky [https://eli.thegreenplace.net] 4 | // This code is in the public domain. 5 | 6 | package ungrammar 7 | 8 | import "fmt" 9 | 10 | // Parser parses ungrammar syntax into a Grammar. Create a new parser with 11 | // NewParser, and then call its ParseGrammar method. 12 | type Parser struct { 13 | lex *lexer 14 | 15 | tok token 16 | nextTok token 17 | 18 | errs ErrorList 19 | } 20 | 21 | // NewParser creates a new parser with the given string input. 22 | func NewParser(buf string) *Parser { 23 | p := &Parser{ 24 | lex: newLexer(buf), 25 | errs: nil, 26 | } 27 | 28 | p.tok = p.lex.nextToken() 29 | p.nextTok = p.lex.nextToken() 30 | return p 31 | } 32 | 33 | // ParseGrammar takes the input the Parser was initialized with and parses it 34 | // into a Grammar. It returns an ErrorList which collects all the errors 35 | // encountered during parsing, and in case of errors the returned Grammar may be 36 | // partial. 
37 | func (p *Parser) ParseGrammar() (*Grammar, error) { 38 | rules := make(map[string]Rule) 39 | locs := make(map[string]location) 40 | for !p.eof() { 41 | name, location, rule := p.parseNamedRule() 42 | if rule != nil { 43 | if _, found := rules[name]; found { 44 | p.emitError(location, fmt.Sprintf("duplicate rule name %v", name)) 45 | } 46 | rules[name] = rule 47 | locs[name] = location 48 | } 49 | } 50 | 51 | grammar := &Grammar{ 52 | Rules: rules, 53 | NameLoc: locs, 54 | } 55 | 56 | if len(p.errs) > 0 { 57 | return grammar, p.errs 58 | } else { 59 | return grammar, nil 60 | } 61 | } 62 | 63 | // advance returns the current token and consumes it (the next call to advance 64 | // will return the next token in the stream, etc.) 65 | func (p *Parser) advance() token { 66 | tok := p.tok 67 | if tok.name == EOF { 68 | return tok 69 | } 70 | 71 | // Shift the lookahead "buffer" 72 | p.tok = p.nextTok 73 | p.nextTok = p.lex.nextToken() 74 | return tok 75 | } 76 | 77 | func (p *Parser) eof() bool { 78 | return p.tok.name == EOF 79 | } 80 | 81 | // parseNamedRule parses a top-level named rule: Node '=' , and returns 82 | // its name, the location of the name and the rule itself. It returns an empty 83 | // name and rule if the parser doesn't currently point to a rule. 84 | func (p *Parser) parseNamedRule() (string, location, Rule) { 85 | tok := p.tok 86 | if tok.name == NODE { 87 | p.advance() 88 | if p.tok.name == EQ { 89 | p.advance() 90 | rule := p.parseAlt() 91 | return tok.value, tok.loc, rule 92 | } 93 | } 94 | 95 | // If we're here, a named rule was not found. 96 | p.emitError(tok.loc, fmt.Sprintf("expected named rule, got %v", tok.value)) 97 | p.synchronize() 98 | return "", location{}, nil 99 | } 100 | 101 | // parseAlt parses a top-level rule, the LHS of Node '=' . It's 102 | // potentially a '|'-seprated alternation of sequences. 
103 | func (p *Parser) parseAlt() Rule { 104 | alts := []Rule{p.parseSeq()} 105 | for p.tok.name == PIPE { 106 | p.advance() 107 | alts = append(alts, p.parseSeq()) 108 | } 109 | if len(alts) == 1 { 110 | return alts[0] 111 | } else { 112 | return &Alt{alts} 113 | } 114 | } 115 | 116 | // parseSeq parses a sequence of single rules. 117 | func (p *Parser) parseSeq() Rule { 118 | sr := p.parseSingleRule() 119 | if sr == nil { 120 | p.emitError(p.tok.loc, fmt.Sprintf("expected rule, got %v", p.tok.value)) 121 | p.synchronize() 122 | return nil 123 | } 124 | seq := []Rule{sr} 125 | 126 | for { 127 | sr = p.parseSingleRule() 128 | if sr == nil { 129 | break 130 | } 131 | seq = append(seq, sr) 132 | } 133 | if len(seq) == 1 { 134 | return seq[0] 135 | } else { 136 | return &Seq{seq} 137 | } 138 | } 139 | 140 | // parseSingleRule parses a single rule atom that's potentially followed by 141 | // a '?' or '*' quantifier. It can return nil if there are no more single 142 | // rules to parse. 143 | // 144 | // The Ungrammar grammar contains an ambiguity, since named rules are not 145 | // terminated explicitly, consider: 146 | // 147 | // Foo = Bar Baz 148 | // Bob = Rob 149 | // 150 | // After "Foo =" we parse a sequence of Bar, Baz, but then we see Bob, which 151 | // shouldn't be in the sequence, but rather start a new named rule. When we 152 | // parse a single rule, we look ahead for a '=' and bail if it's found, leaving 153 | // "Bob =" to a higher-level parser. In that case, nil is returned. 154 | func (p *Parser) parseSingleRule() Rule { 155 | atom := p.parseSingleRuleAtom() 156 | if atom == nil { 157 | return nil 158 | } 159 | if p.tok.name == QMARK { 160 | p.advance() 161 | return &Opt{atom} 162 | } else if p.tok.name == STAR { 163 | p.advance() 164 | return &Rep{atom} 165 | } 166 | return atom 167 | } 168 | 169 | // parseSingleRuleAtom parses a single rule atom - either a node, token, a 170 | // labeled rule, or a rule in parentheses. 
See the comment on parseSingleRule 171 | // for the grammar ambiguity this has to handle. 172 | func (p *Parser) parseSingleRuleAtom() Rule { 173 | switch p.tok.name { 174 | case NODE: 175 | // Lookahead to see if this is actually the beginning of the next top-level 176 | // rule definition, and bail if yes. 177 | if p.nextTok.name == EQ { 178 | return nil 179 | } else if p.nextTok.name == COLON { 180 | labelTok := p.advance() 181 | // This is a labeled rule and the label is now in labelTok. 182 | // Skip the colon. 183 | p.advance() 184 | r := p.parseSingleRule() 185 | if r == nil { 186 | p.emitError(p.tok.loc, fmt.Sprintf("expected rule after label, got %v", p.tok.value)) 187 | p.synchronize() 188 | } 189 | return &Labeled{ 190 | Label: labelTok.value, 191 | Rule: r, 192 | labelLoc: labelTok.loc, 193 | } 194 | } else { 195 | tok := p.tok 196 | p.advance() 197 | return &Node{ 198 | Name: tok.value, 199 | nameLoc: tok.loc, 200 | } 201 | } 202 | case TOKEN: 203 | tok := p.tok 204 | p.advance() 205 | return &Token{ 206 | Value: tok.value, 207 | valueLoc: tok.loc, 208 | } 209 | case LPAREN: 210 | // Consume '(' and parse the full rule 211 | p.advance() 212 | r := p.parseAlt() 213 | 214 | // Expect closing ')', but return the rule anyway if we don't find it. 215 | if p.tok.name != RPAREN { 216 | p.emitError(p.tok.loc, fmt.Sprintf("expected ')', got %v", p.tok.value)) 217 | p.synchronize() 218 | return r 219 | } 220 | 221 | // Consume ')' 222 | p.advance() 223 | return r 224 | case ERROR: 225 | p.emitError(p.tok.loc, p.tok.value) 226 | p.synchronize() 227 | } 228 | return nil 229 | } 230 | 231 | // synchronize consumes tokens until it finds a safe place to restart parsing. 232 | // It tries to find the next Node '=' where a new named rule can be defined. 
func (p *Parser) synchronize() {
	for !p.eof() {
		// A NODE followed by '=' is where a new top-level rule definition
		// begins -- a safe point to resume parsing after an error.
		if p.tok.name == NODE && p.nextTok.name == EQ {
			return
		}
		p.advance()
	}
}

// emitError records a parse error at the given location. Parsing continues
// after the error is recorded.
func (p *Parser) emitError(loc location, msg string) {
	p.errs.Add(fmt.Errorf("%s: %s", loc, msg))
}
--------------------------------------------------------------------------------
/parser_test.go:
--------------------------------------------------------------------------------
// Eli Bendersky [https://eli.thegreenplace.net]
// This code is in the public domain.

package ungrammar

import (
	"fmt"
	"os"
	"path/filepath"
	"slices"
	"sort"
	"strings"
	"testing"
)

// Tests parsing without errors
func TestParserTable(t *testing.T) {
	var tests = []struct {
		input     string
		wantRules []string
	}{
		// Basic rules
		{`x = mynode`, []string{`x: mynode`}},
		{`x = (mynode)`, []string{`x: mynode`}},
		{`x = mynode*`, []string{`x: Rep(mynode)`}},
		{`x = mynode?`, []string{`x: Opt(mynode)`}},
		{`x = 'atok'`, []string{`x: 'atok'`}},
		{`x = lab:mynode`, []string{`x: lab:mynode`}},
		{`x = node 'tok'`, []string{`x: Seq(node, 'tok')`}},
		{`x = foo | bar`, []string{`x: Alt(foo, bar)`}},

		// Multiple alts/seqs
		{`x = a | b | c | d | e | f`, []string{`x: Alt(a, b, c, d, e, f)`}},
		{`x = a b c d e f`, []string{`x: Seq(a, b, c, d, e, f)`}},

		// Precedence between Seq and Alt and using (...)
		{`x = n | t p`, []string{`x: Alt(n, Seq(t, p))`}},
		{`x = n i | t p | i b`, []string{`x: Alt(Seq(n, i), Seq(t, p), Seq(i, b))`}},
		{`x = (n | t) p`, []string{`x: Seq(Alt(n, t), p)`}},
		{`x = (n | t) p v w | y`, []string{`x: Alt(Seq(Alt(n, t), p, v, w), y)`}},
		{`x = (n | t)? p`, []string{`x: Seq(Opt(Alt(n, t)), p)`}},
		{`x = (n | t)? p *`, []string{`x: Seq(Opt(Alt(n, t)), Rep(p))`}},

		// Misc. nesting
		{`x = (lab:Path '::')? labb:Seg`, []string{`x: Seq(Opt(Seq(lab:Path, '::')), labb:Seg)`}},
		{`x = '=='? 't' (n (',' n)* ','?)? 't'`, []string{`x: Seq(Opt('=='), 't', Opt(Seq(n, Rep(Seq(',', n)), Opt(','))), 't')`}},

		// Multiple rules
		{`x = a b y = d`, []string{`x: Seq(a, b)`, `y: d`}},
		{`x = a b c
		y = d | t
		z = 'tok'`,
			[]string{`x: Seq(a, b, c)`, `y: Alt(d, t)`, `z: 'tok'`}},
		{`x =
		lab:Rule 'tok'

		Rule =
		'tok'
		| Rule '*'`,
			[]string{`x: Seq(lab:Rule, 'tok')`, `Rule: Alt('tok', Seq(Rule, '*'))`}},

		// Expected parsing of ungrammar.ungrammar
		{
			readFileOrPanic(filepath.Join("testdata", "ungrammar.ungrammar")),
			[]string{
				`Grammar: Rep(Node)`,
				`Node: Seq(name:'ident', '=', Rule)`,
				`Rule: Alt('ident', 'token_ident', Rep(Rule), Seq(Rule, Rep(Seq('|', Rule))), Seq(Rule, '?'), Seq(Rule, '*'), Seq('(', Rule, ')'), Seq(label:'ident', ':', Rule))`,
			},
		},

		{
			readFileOrPanic(filepath.Join("testdata", "exprlang.ungrammar")),
			[]string{
				`AssignStmt: Seq('set', 'ident', '=', Expr)`,
				`BinExpr: Seq(lhs:Expr, op:Alt('+', '-', '*', '/', '%'), rhs:Expr)`,
				`Expr: Alt(Literal, UnaryExpr, ParenExpr, BinExpr)`,
				`Literal: Alt('int_literal', 'ident')`,
				`ParenExpr: Seq('(', Expr, ')')`,
				`Program: Rep(Stmt)`,
				`Stmt: Alt(AssignStmt, Expr)`,
				`UnaryExpr: Seq(op:Alt('+', '-'), Expr)`,
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.input, func(t *testing.T) {
			p := NewParser(tt.input)
			g, err := p.ParseGrammar()
			if err != nil {
				t.Error(err)
			}
			gotRules := grammarToStrings(g)

			// grammarToStrings returns sorted output, so sort wantRules to match.
			sort.Strings(tt.wantRules)
			if !slices.Equal(gotRules, tt.wantRules) {
				t.Errorf("mismatch got != want:\n%v", displaySliceDiff(gotRules, tt.wantRules))
			}
		})
	}
}

// Check that we can read/parse the full rust.ungrammar without errors, and
// perform basic sanity checking.
checking. 106 | func TestRustUngrammarFile(t *testing.T) { 107 | contents := readFileOrPanic(filepath.Join("testdata", "rust.ungrammar")) 108 | p := NewParser(string(contents)) 109 | g, err := p.ParseGrammar() 110 | if err != nil { 111 | t.Error(err) 112 | } 113 | rules := grammarToStrings(g) 114 | 115 | // Sanity check: the expected number of rules, and the first and last rules 116 | // match (note that they are first/last in string-sorted order). 117 | if len(rules) != 143 { 118 | t.Errorf("grammar got %v rules, want 143", len(g.Rules)) 119 | } 120 | 121 | want0 := `Abi: Seq('extern', Opt('string'))` 122 | if rules[0] != want0 { 123 | t.Errorf("rule 0 got %v, want %v", rules[0], want0) 124 | } 125 | want142 := `YieldExpr: Seq(Rep(Attr), 'yield', Opt(Expr))` 126 | if rules[142] != want142 { 127 | t.Errorf("rule 142 got %v, want %v", rules[142], want142) 128 | } 129 | } 130 | 131 | func TestLocations(t *testing.T) { 132 | input := ` 133 | x = foo | bar 134 | y = a b?` 135 | 136 | p := NewParser(input) 137 | g, err := p.ParseGrammar() 138 | if err != nil { 139 | t.Error(err) 140 | } 141 | 142 | xrule := g.Rules["x"] 143 | xalt := xrule.(*Alt) 144 | yrule := g.Rules["y"] 145 | yseq := yrule.(*Seq) 146 | yseq1opt := yseq.Rules[1].(*Opt) 147 | 148 | var tests = []struct { 149 | name string 150 | loc location 151 | wantLocString string 152 | }{ 153 | {"x name", g.NameLoc["x"], "2:1"}, 154 | {"x rule", xrule.Location(), "2:5"}, 155 | {"y name", g.NameLoc["y"], "3:1"}, 156 | {"x alt 0", xalt.Rules[0].Location(), "2:5"}, 157 | {"x alt 1", xalt.Rules[1].Location(), "2:11"}, 158 | {"y seq 0", yseq.Rules[0].Location(), "3:5"}, 159 | {"y seq 1", yseq.Rules[1].Location(), "3:7"}, 160 | {"y seq 1 opt", yseq1opt.Location(), "3:7"}, 161 | {"y seq 1 opt rule", yseq1opt.Rule.Location(), "3:7"}, 162 | } 163 | 164 | for _, tt := range tests { 165 | t.Run(tt.name, func(t *testing.T) { 166 | if tt.loc.String() != tt.wantLocString { 167 | t.Errorf("got %v, want %v", tt.loc.String(), 
tt.wantLocString) 168 | } 169 | }) 170 | } 171 | } 172 | 173 | // Test error handling and parser recovery. The parser will try to make progress 174 | // even in face of errors, returning partial results while errors persist. 175 | func TestParseErrors(t *testing.T) { 176 | var tests = []struct { 177 | input string 178 | wantRules []string 179 | wantErrors []string 180 | }{ 181 | // Missing a named rule 182 | {`foo bar`, []string{}, []string{"1:1: expected named rule, got foo"}}, 183 | 184 | // Missing alternation content, partial tree created with error 185 | {`x = a | | b`, []string{`x: Alt(a, )`}, []string{"1:9: expected rule, got |"}}, 186 | 187 | // Missing closing ')' before new rule, but both rules created 188 | {`x = ( a b t = foo`, []string{`t: foo`, `x: Seq(a, b)`}, []string{"1:11: expected ')', got t"}}, 189 | 190 | // Recovery after spurious '=' 191 | {`x = = foo`, []string{}, []string{"1:5: expected rule, got ="}}, 192 | {`x = = foo = y`, []string{`foo: y`}, []string{"1:5: expected rule, got ="}}, 193 | 194 | // Duplicate rule name 195 | {`x = a b x = y z`, []string{`x: Seq(y, z)`}, []string{`1:11: duplicate rule name x`}}, 196 | 197 | // Lexer errors 198 | {`x = a @ y = t`, []string{`x: a`, `y: t`}, []string{"1:7: unknown token starting with '@'"}}, 199 | {`x = a b 'two y = t`, []string{`x: Seq(a, b)`}, []string{"1:9: unterminated token literal"}}, 200 | 201 | // Multiple errors 202 | {`x = a @ y = t z = ( k`, []string{`x: a`, `y: t`, `z: k`}, []string{`1:7: unknown token starting with '@'`, `1:21: expected ')', got `}}, 203 | } 204 | 205 | for _, tt := range tests { 206 | t.Run(tt.input, func(t *testing.T) { 207 | p := NewParser(tt.input) 208 | g, err := p.ParseGrammar() 209 | gotRules := grammarToStrings(g) 210 | 211 | sort.Strings(tt.wantRules) 212 | if !slices.Equal(gotRules, tt.wantRules) { 213 | t.Errorf("rules mismatch got != want:\n%v", displaySliceDiff(gotRules, tt.wantRules)) 214 | } 215 | 216 | if err == nil { 217 | t.Error("expected errors, 
got nil") 218 | } 219 | errlist := err.(ErrorList) 220 | var gotErrors []string 221 | for _, err := range errlist { 222 | gotErrors = append(gotErrors, err.Error()) 223 | } 224 | 225 | if !slices.Equal(gotErrors, tt.wantErrors) { 226 | fmt.Println(gotErrors, tt.wantErrors) 227 | t.Errorf("errors mismatch got != want:\n%v", displaySliceDiff(gotErrors, tt.wantErrors)) 228 | } 229 | }) 230 | } 231 | } 232 | 233 | // Test the message received when multiple errors are present 234 | func TestMultipleErrorsMessage(t *testing.T) { 235 | // This has two errors: 236 | // - encountering the first | 237 | // - unterminated '(' 238 | input := ` 239 | foo = | 240 | bar = ( joe 241 | x = y` 242 | 243 | p := NewParser(input) 244 | _, err := p.ParseGrammar() 245 | wantErr := "2:7: expected rule, got | (and 1 more errors)" 246 | if err.Error() != wantErr { 247 | t.Errorf("got %v, want %v", err.Error(), wantErr) 248 | } 249 | } 250 | 251 | // A single isolated test useful for debugging the parser. 252 | func TestIsolated(t *testing.T) { 253 | input := `x = = foo = x` 254 | p := NewParser(input) 255 | g, err := p.ParseGrammar() 256 | 257 | if len(g.Rules) != 1 { 258 | t.Errorf("got %v rules, want 1", len(g.Rules)) 259 | } 260 | if err == nil { 261 | t.Error("got no error, want error") 262 | } 263 | } 264 | 265 | func TestIsolatedErrors(t *testing.T) { 266 | input := ` 267 | foo = @ 268 | bar = ( joe 269 | x = y` 270 | p := NewParser(input) 271 | g, err := p.ParseGrammar() 272 | 273 | gotRules := grammarToStrings(g) 274 | 275 | if len(gotRules) != 2 { 276 | t.Errorf("got %v rules, want 2", len(gotRules)) 277 | } 278 | errlist := err.(ErrorList) 279 | var gotErrors []string 280 | for _, err := range errlist { 281 | gotErrors = append(gotErrors, err.Error()) 282 | } 283 | if len(errlist) != 3 { 284 | t.Errorf("got %v errors, want 3", len(errlist)) 285 | } 286 | } 287 | 288 | // grammarToStrings takes a Grammar's string representation and splits it into 289 | // a sorted slice of strings 
(one per top-level rule) suitable for testing. 290 | func grammarToStrings(g *Grammar) []string { 291 | if len(g.String()) == 0 { 292 | return []string{} 293 | } 294 | ss := strings.Split(strings.TrimRight(g.String(), "\n"), "\n") 295 | sort.Strings(ss) 296 | return ss 297 | } 298 | 299 | // readFileOrPanic reads the given file's contents and returns them as a string. 300 | // In case of an error, it panics. 301 | func readFileOrPanic(filename string) string { 302 | contents, err := os.ReadFile(filename) 303 | if err != nil { 304 | panic(err) 305 | } 306 | return string(contents) 307 | } 308 | 309 | // displaySliceDiff displays a diff between two slices in a way that's 310 | // readable in test output. 311 | func displaySliceDiff[T any](got []T, want []T) string { 312 | maxLen := 0 313 | for _, g := range got { 314 | gs := fmt.Sprintf("%v", g) 315 | maxLen = max(maxLen + 1, len(gs)) 316 | } 317 | 318 | var sb strings.Builder 319 | fmt.Fprintf(&sb, "%-*v %v\n", maxLen, "got", "want") 320 | 321 | for i := 0; i < max(len(got), len(want)); i++ { 322 | var sgot string 323 | if i < len(got) { 324 | sgot = fmt.Sprintf("%v", got[i]) 325 | } 326 | 327 | var swant string 328 | if i < len(want) { 329 | swant = fmt.Sprintf("%v", want[i]) 330 | } 331 | 332 | sign := " " 333 | if swant != sgot { 334 | sign = "!=" 335 | } 336 | fmt.Fprintf(&sb, "%-*v %v %v\n", maxLen, sgot, sign, swant) 337 | } 338 | return sb.String() 339 | } 340 | -------------------------------------------------------------------------------- /testdata/rust.ungrammar: -------------------------------------------------------------------------------- 1 | /// copied from https://github.com/rust-analyzer/ungrammar/ 2 | 3 | // Rust Un-Grammar. 4 | // 5 | // This grammar specifies the structure of Rust's concrete syntax tree. 6 | // It does not specify parsing rules (ambiguities, precedence, etc are out of scope). 7 | // Tokens are processed -- contextual keywords are recognised, compound operators glued. 
8 | // 9 | // Legend: 10 | // 11 | // // -- comment 12 | // Name = -- non-terminal definition 13 | // 'ident' -- token (terminal) 14 | // A B -- sequence 15 | // A | B -- alternation 16 | // A* -- zero or more repetition 17 | // A? -- zero or one repetition 18 | // (A) -- same as A 19 | // label:A -- suggested name for field of AST node 20 | 21 | //*************************// 22 | // Names, Paths and Macros // 23 | //*************************// 24 | 25 | Name = 26 | 'ident' | 'self' 27 | 28 | NameRef = 29 | 'ident' | 'int_number' | 'self' | 'super' | 'crate' | 'Self' 30 | 31 | Lifetime = 32 | 'lifetime_ident' 33 | 34 | Path = 35 | (qualifier:Path '::')? segment:PathSegment 36 | 37 | PathSegment = 38 | '::'? NameRef 39 | | NameRef GenericArgList? 40 | | NameRef ParamList RetType? 41 | | '<' PathType ('as' PathType)? '>' 42 | 43 | GenericArgList = 44 | '::'? '<' (GenericArg (',' GenericArg)* ','?)? '>' 45 | 46 | GenericArg = 47 | TypeArg 48 | | AssocTypeArg 49 | | LifetimeArg 50 | | ConstArg 51 | 52 | TypeArg = 53 | Type 54 | 55 | AssocTypeArg = 56 | NameRef GenericParamList? (':' TypeBoundList | '=' Type) 57 | 58 | LifetimeArg = 59 | Lifetime 60 | 61 | ConstArg = 62 | Expr 63 | 64 | MacroCall = 65 | Attr* Path '!' TokenTree ';'? 66 | 67 | TokenTree = 68 | '(' ')' 69 | | '{' '}' 70 | | '[' ']' 71 | 72 | MacroItems = 73 | Item* 74 | 75 | MacroStmts = 76 | statements:Stmt* 77 | Expr? 78 | 79 | //*************************// 80 | // Items // 81 | //*************************// 82 | 83 | SourceFile = 84 | 'shebang'? 85 | Attr* 86 | Item* 87 | 88 | Item = 89 | Const 90 | | Enum 91 | | ExternBlock 92 | | ExternCrate 93 | | Fn 94 | | Impl 95 | | MacroCall 96 | | MacroRules 97 | | MacroDef 98 | | Module 99 | | Static 100 | | Struct 101 | | Trait 102 | | TypeAlias 103 | | Union 104 | | Use 105 | 106 | MacroRules = 107 | Attr* Visibility? 108 | 'macro_rules' '!' Name 109 | TokenTree 110 | 111 | MacroDef = 112 | Attr* Visibility? 113 | 'macro' Name args:TokenTree? 
114 | body:TokenTree 115 | 116 | Module = 117 | Attr* Visibility? 118 | 'mod' Name 119 | (ItemList | ';') 120 | 121 | ItemList = 122 | '{' Attr* Item* '}' 123 | 124 | ExternCrate = 125 | Attr* Visibility? 126 | 'extern' 'crate' NameRef Rename? ';' 127 | 128 | Rename = 129 | 'as' (Name | '_') 130 | 131 | Use = 132 | Attr* Visibility? 133 | 'use' UseTree ';' 134 | 135 | UseTree = 136 | (Path? '::')? ('*' | UseTreeList) 137 | | Path Rename? 138 | 139 | UseTreeList = 140 | '{' (UseTree (',' UseTree)* ','?)? '}' 141 | 142 | Fn = 143 | Attr* Visibility? 144 | 'default'? 'const'? 'async'? 'unsafe'? Abi? 145 | 'fn' Name GenericParamList? ParamList RetType? WhereClause? 146 | (body:BlockExpr | ';') 147 | 148 | Abi = 149 | 'extern' 'string'? 150 | 151 | ParamList = 152 | '('( 153 | SelfParam 154 | | (SelfParam ',')? (Param (',' Param)* ','?)? 155 | )')' 156 | | '|' (Param (',' Param)* ','?)? '|' 157 | 158 | SelfParam = 159 | Attr* ( 160 | ('&' Lifetime?)? 'mut'? Name 161 | | 'mut'? Name ':' Type 162 | ) 163 | 164 | Param = 165 | Attr* ( 166 | Pat (':' Type)? 167 | | Type 168 | | '...' 169 | ) 170 | 171 | RetType = 172 | '->' Type 173 | 174 | TypeAlias = 175 | Attr* Visibility? 176 | 'default'? 177 | 'type' Name GenericParamList? (':' TypeBoundList?)? WhereClause? 178 | ('=' Type)? ';' 179 | 180 | Struct = 181 | Attr* Visibility? 182 | 'struct' Name GenericParamList? ( 183 | WhereClause? (RecordFieldList | ';') 184 | | TupleFieldList WhereClause? ';' 185 | ) 186 | 187 | RecordFieldList = 188 | '{' fields:(RecordField (',' RecordField)* ','?)? '}' 189 | 190 | RecordField = 191 | Attr* Visibility? 192 | Name ':' Type 193 | 194 | TupleFieldList = 195 | '(' fields:(TupleField (',' TupleField)* ','?)? ')' 196 | 197 | TupleField = 198 | Attr* Visibility? 199 | Type 200 | 201 | FieldList = 202 | RecordFieldList 203 | | TupleFieldList 204 | 205 | Enum = 206 | Attr* Visibility? 207 | 'enum' Name GenericParamList? WhereClause? 
208 | VariantList 209 | 210 | VariantList = 211 | '{' (Variant (',' Variant)* ','?)? '}' 212 | 213 | Variant = 214 | Attr* Visibility? 215 | Name FieldList? ('=' Expr)? 216 | 217 | Union = 218 | Attr* Visibility? 219 | 'union' Name GenericParamList? WhereClause? 220 | RecordFieldList 221 | 222 | // A Data Type. 223 | // 224 | // Not used directly in the grammar, but handy to have anyway. 225 | Adt = 226 | Enum 227 | | Struct 228 | | Union 229 | 230 | Const = 231 | Attr* Visibility? 232 | 'default'? 233 | 'const' (Name | '_') ':' Type 234 | ('=' body:Expr)? ';' 235 | 236 | Static = 237 | Attr* Visibility? 238 | 'static' 'mut'? Name ':' Type 239 | ('=' body:Expr)? ';' 240 | 241 | Trait = 242 | Attr* Visibility? 243 | 'unsafe'? 'auto'? 244 | 'trait' Name GenericParamList? (':' TypeBoundList?)? WhereClause? 245 | AssocItemList 246 | 247 | AssocItemList = 248 | '{' Attr* AssocItem* '}' 249 | 250 | AssocItem = 251 | Const 252 | | Fn 253 | | MacroCall 254 | | TypeAlias 255 | 256 | Impl = 257 | Attr* Visibility? 258 | 'default'? 'unsafe'? 259 | 'impl' GenericParamList? ('const'? '!'? trait:Type 'for')? self_ty:Type WhereClause? 260 | AssocItemList 261 | 262 | ExternBlock = 263 | Attr* 'unsafe'? Abi ExternItemList 264 | 265 | ExternItemList = 266 | '{' Attr* ExternItem* '}' 267 | 268 | ExternItem = 269 | Fn 270 | | MacroCall 271 | | Static 272 | | TypeAlias 273 | 274 | GenericParamList = 275 | '<' (GenericParam (',' GenericParam)* ','?)? '>' 276 | 277 | GenericParam = 278 | ConstParam 279 | | LifetimeParam 280 | | TypeParam 281 | 282 | TypeParam = 283 | Attr* Name (':' TypeBoundList?)? 284 | ('=' default_type:Type)? 285 | 286 | ConstParam = 287 | Attr* 'const' Name ':' Type 288 | ('=' default_val:Expr)? 289 | 290 | LifetimeParam = 291 | Attr* Lifetime (':' TypeBoundList?)? 292 | 293 | WhereClause = 294 | 'where' predicates:(WherePred (',' WherePred)* ','?) 295 | 296 | WherePred = 297 | ('for' GenericParamList)? (Lifetime | Type) ':' TypeBoundList? 
298 | 299 | Visibility = 300 | 'pub' ('(' 'in'? Path ')')? 301 | 302 | Attr = 303 | '#' '!'? '[' Meta ']' 304 | 305 | Meta = 306 | Path ('=' Expr | TokenTree)? 307 | 308 | //****************************// 309 | // Statements and Expressions // 310 | //****************************// 311 | 312 | Stmt = 313 | ';' 314 | | ExprStmt 315 | | Item 316 | | LetStmt 317 | 318 | LetStmt = 319 | Attr* 'let' Pat (':' Type)? 320 | '=' initializer:Expr 321 | LetElse? 322 | ';' 323 | 324 | LetElse = 325 | 'else' BlockExpr 326 | 327 | ExprStmt = 328 | Expr ';'? 329 | 330 | Expr = 331 | ArrayExpr 332 | | AwaitExpr 333 | | BinExpr 334 | | BlockExpr 335 | | BoxExpr 336 | | BreakExpr 337 | | CallExpr 338 | | CastExpr 339 | | ClosureExpr 340 | | ContinueExpr 341 | | FieldExpr 342 | | ForExpr 343 | | IfExpr 344 | | IndexExpr 345 | | Literal 346 | | LoopExpr 347 | | MacroCall 348 | | MacroStmts 349 | | MatchExpr 350 | | MethodCallExpr 351 | | ParenExpr 352 | | PathExpr 353 | | PrefixExpr 354 | | RangeExpr 355 | | RecordExpr 356 | | RefExpr 357 | | ReturnExpr 358 | | TryExpr 359 | | TupleExpr 360 | | WhileExpr 361 | | YieldExpr 362 | | LetExpr 363 | | UnderscoreExpr 364 | 365 | Literal = 366 | Attr* value:( 367 | 'int_number' | 'float_number' 368 | | 'string' | 'raw_string' 369 | | 'byte_string' | 'raw_byte_string' 370 | | 'true' | 'false' 371 | | 'char' | 'byte' 372 | ) 373 | 374 | PathExpr = 375 | Attr* Path 376 | 377 | StmtList = 378 | '{' 379 | Attr* 380 | statements:Stmt* 381 | tail_expr:Expr? 382 | '}' 383 | 384 | RefExpr = 385 | Attr* '&' ('raw' | 'mut' | 'const') Expr 386 | 387 | TryExpr = 388 | Attr* Expr '?' 389 | 390 | BlockExpr = 391 | Attr* Label? ('try' | 'unsafe' | 'async' | 'const') StmtList 392 | 393 | PrefixExpr = 394 | Attr* op:('-' | '!' 
| '*') Expr 395 | 396 | BinExpr = 397 | Attr* 398 | lhs:Expr 399 | op:( 400 | '||' | '&&' 401 | | '==' | '!=' | '<=' | '>=' | '<' | '>' 402 | | '+' | '*' | '-' | '/' | '%' | '<<' | '>>' | '^' | '|' | '&' 403 | | '=' | '+=' | '/=' | '*=' | '%=' | '>>=' | '<<=' | '-=' | '|=' | '&=' | '^=' 404 | ) 405 | rhs:Expr 406 | 407 | CastExpr = 408 | Attr* Expr 'as' Type 409 | 410 | ParenExpr = 411 | Attr* '(' Attr* Expr ')' 412 | 413 | ArrayExpr = 414 | Attr* '[' Attr* ( 415 | (Expr (',' Expr)* ','?)? 416 | | Expr ';' Expr 417 | ) ']' 418 | 419 | IndexExpr = 420 | Attr* base:Expr '[' index:Expr ']' 421 | 422 | TupleExpr = 423 | Attr* '(' Attr* fields:(Expr (',' Expr)* ','?)? ')' 424 | 425 | RecordExpr = 426 | Path RecordExprFieldList 427 | 428 | RecordExprFieldList = 429 | '{' 430 | Attr* 431 | fields:(RecordExprField (',' RecordExprField)* ','?)? 432 | ('..' spread:Expr?)? 433 | '}' 434 | 435 | RecordExprField = 436 | Attr* (NameRef ':')? Expr 437 | 438 | CallExpr = 439 | Attr* Expr ArgList 440 | 441 | ArgList = 442 | '(' args:(Expr (',' Expr)* ','?)? ')' 443 | 444 | MethodCallExpr = 445 | Attr* receiver:Expr '.' NameRef GenericArgList? ArgList 446 | 447 | FieldExpr = 448 | Attr* Expr '.' NameRef 449 | 450 | ClosureExpr = 451 | Attr* 'static'? 'async'? 'move'? ParamList RetType? 452 | body:Expr 453 | 454 | IfExpr = 455 | Attr* 'if' condition:Expr then_branch:BlockExpr 456 | ('else' else_branch:(IfExpr | BlockExpr))? 457 | 458 | LoopExpr = 459 | Attr* Label? 'loop' 460 | loop_body:BlockExpr 461 | 462 | ForExpr = 463 | Attr* Label? 'for' Pat 'in' iterable:Expr 464 | loop_body:BlockExpr 465 | 466 | WhileExpr = 467 | Attr* Label? 'while' condition:Expr 468 | loop_body:BlockExpr 469 | 470 | Label = 471 | Lifetime ':' 472 | 473 | BreakExpr = 474 | Attr* 'break' Lifetime? Expr? 475 | 476 | ContinueExpr = 477 | Attr* 'continue' Lifetime? 478 | 479 | RangeExpr = 480 | Attr* start:Expr? op:('..' | '..=') end:Expr? 
481 | 482 | MatchExpr = 483 | Attr* 'match' Expr MatchArmList 484 | 485 | MatchArmList = 486 | '{' 487 | Attr* 488 | arms:MatchArm* 489 | '}' 490 | 491 | MatchArm = 492 | Attr* Pat guard:MatchGuard? '=>' Expr ','? 493 | 494 | MatchGuard = 495 | 'if' condition:Expr 496 | 497 | ReturnExpr = 498 | Attr* 'return' Expr? 499 | 500 | YieldExpr = 501 | Attr* 'yield' Expr? 502 | 503 | LetExpr = 504 | Attr* 'let' Pat '=' Expr 505 | 506 | UnderscoreExpr = 507 | Attr* '_' 508 | 509 | AwaitExpr = 510 | Attr* Expr '.' 'await' 511 | 512 | BoxExpr = 513 | Attr* 'box' Expr 514 | 515 | //*************************// 516 | // Types // 517 | //*************************// 518 | 519 | Type = 520 | ArrayType 521 | | DynTraitType 522 | | FnPtrType 523 | | ForType 524 | | ImplTraitType 525 | | InferType 526 | | MacroType 527 | | NeverType 528 | | ParenType 529 | | PathType 530 | | PtrType 531 | | RefType 532 | | SliceType 533 | | TupleType 534 | 535 | ParenType = 536 | '(' Type ')' 537 | 538 | NeverType = 539 | '!' 540 | 541 | MacroType = 542 | MacroCall 543 | 544 | PathType = 545 | Path 546 | 547 | TupleType = 548 | '(' fields:(Type (',' Type)* ','?)? ')' 549 | 550 | PtrType = 551 | '*' ('const' | 'mut') Type 552 | 553 | RefType = 554 | '&' Lifetime? 'mut'? Type 555 | 556 | ArrayType = 557 | '[' Type ';' Expr ']' 558 | 559 | SliceType = 560 | '[' Type ']' 561 | 562 | InferType = 563 | '_' 564 | 565 | FnPtrType = 566 | 'const'? 'async'? 'unsafe'? Abi? 'fn' ParamList RetType? 567 | 568 | ForType = 569 | 'for' GenericParamList Type 570 | 571 | ImplTraitType = 572 | 'impl' TypeBoundList 573 | 574 | DynTraitType = 575 | 'dyn' TypeBoundList 576 | 577 | TypeBoundList = 578 | bounds:(TypeBound ('+' TypeBound)* '+'?) 579 | 580 | TypeBound = 581 | Lifetime 582 | | ('?' | '~' 'const')? 
Type 583 | 584 | //************************// 585 | // Patterns // 586 | //************************// 587 | 588 | Pat = 589 | IdentPat 590 | | BoxPat 591 | | RestPat 592 | | LiteralPat 593 | | MacroPat 594 | | OrPat 595 | | ParenPat 596 | | PathPat 597 | | WildcardPat 598 | | RangePat 599 | | RecordPat 600 | | RefPat 601 | | SlicePat 602 | | TuplePat 603 | | TupleStructPat 604 | | ConstBlockPat 605 | 606 | LiteralPat = 607 | Literal 608 | 609 | IdentPat = 610 | Attr* 'ref'? 'mut'? Name ('@' Pat)? 611 | 612 | WildcardPat = 613 | '_' 614 | 615 | RangePat = 616 | // 1.. 617 | start:Pat op:('..' | '..=') 618 | // 1..2 619 | | start:Pat op:('..' | '..=') end:Pat 620 | // ..2 621 | | op:('..' | '..=') end:Pat 622 | 623 | RefPat = 624 | '&' 'mut'? Pat 625 | 626 | RecordPat = 627 | Path RecordPatFieldList 628 | 629 | RecordPatFieldList = 630 | '{' 631 | fields:(RecordPatField (',' RecordPatField)* ','?)? 632 | RestPat? 633 | '}' 634 | 635 | RecordPatField = 636 | Attr* (NameRef ':')? Pat 637 | 638 | TupleStructPat = 639 | Path '(' fields:(Pat (',' Pat)* ','?)? ')' 640 | 641 | TuplePat = 642 | '(' fields:(Pat (',' Pat)* ','?)? ')' 643 | 644 | ParenPat = 645 | '(' Pat ')' 646 | 647 | SlicePat = 648 | '[' (Pat (',' Pat)* ','?)? ']' 649 | 650 | PathPat = 651 | Path 652 | 653 | OrPat = 654 | (Pat ('|' Pat)* '|'?) 655 | 656 | BoxPat = 657 | 'box' Pat 658 | 659 | RestPat = 660 | Attr* '..' 661 | 662 | MacroPat = 663 | MacroCall 664 | 665 | ConstBlockPat = 666 | 'const' BlockExpr 667 | --------------------------------------------------------------------------------