├── doc
├── .gitignore
├── book.toml
└── src
│ ├── libsandtools.md
│ ├── thirdparty.md
│ ├── othertools.md
│ ├── lrpar.md
│ ├── README.md
│ ├── lrtable.md
│ ├── cfgrammar.md
│ ├── lexing.md
│ ├── parsing.md
│ ├── yaccextensions.md
│ ├── SUMMARY.md
│ ├── editions.md
│ ├── lrlex.md
│ ├── lexextensions.md
│ ├── lexcompatibility.md
│ ├── actioncode.md
│ ├── start_states.md
│ ├── manuallexer.md
│ ├── yacccompatibility.md
│ ├── ast_example.md
│ ├── nimbleparse.md
│ └── parsing_idioms.md
├── .gitignore
├── lrpar
├── examples
│ ├── calc_ast
│ │ ├── src
│ │ │ ├── input.txt
│ │ │ ├── calc.l
│ │ │ ├── calc.y
│ │ │ └── main.rs
│ │ ├── Cargo.toml
│ │ ├── README.md
│ │ └── build.rs
│ ├── calc_actions
│ │ ├── src
│ │ │ ├── input.txt
│ │ │ ├── calc.l
│ │ │ ├── calc.y
│ │ │ └── main.rs
│ │ ├── Cargo.toml
│ │ ├── README.md
│ │ └── build.rs
│ ├── calc_ast_arena
│ │ ├── src
│ │ │ ├── input.txt
│ │ │ ├── calc.l
│ │ │ ├── calc.y
│ │ │ └── main.rs
│ │ ├── Cargo.toml
│ │ └── build.rs
│ ├── calc_parsetree
│ │ ├── src
│ │ │ ├── input.txt
│ │ │ ├── calc.l
│ │ │ ├── calc.y
│ │ │ └── main.rs
│ │ ├── Cargo.toml
│ │ ├── README.md
│ │ └── build.rs
│ ├── clone_param
│ │ ├── src
│ │ │ ├── input.txt
│ │ │ ├── param.l
│ │ │ ├── param.y
│ │ │ └── main.rs
│ │ ├── Cargo.toml
│ │ ├── build.rs
│ │ └── README.md
│ └── start_states
│ │ ├── src
│ │ ├── input.txt
│ │ ├── comment.y
│ │ ├── comment.l
│ │ └── main.rs
│ │ ├── Cargo.toml
│ │ ├── README.md
│ │ └── build.rs
├── cttests
│ ├── src
│ │ ├── storaget.l
│ │ ├── multi_start.l
│ │ ├── multi_start.y
│ │ ├── epp.test
│ │ ├── expect.test
│ │ ├── ctfails
│ │ │ ├── warnings.test
│ │ │ ├── missing.test
│ │ │ ├── warnings_flags.test
│ │ │ ├── test_files2.test
│ │ │ ├── test_files3.test
│ │ │ ├── test_files1.test
│ │ │ └── calc_bad_input.test
│ │ ├── expectrr.test
│ │ ├── lexer_lifetime.test
│ │ ├── parseparam.test
│ │ ├── regex_opt.test
│ │ ├── warnings.test
│ │ ├── quoting.test
│ │ ├── lex_flags.test
│ │ ├── storaget.y
│ │ ├── multitypes.test
│ │ ├── typeparams.test
│ │ ├── passthrough.test
│ │ ├── calc_noactions.test
│ │ ├── parseparam_copy.test
│ │ ├── calc_nodefault_yacckind.test
│ │ ├── calc_recoverer_none.test
│ │ ├── calc_multitypes.test
│ │ ├── calc_recoverer_cpctplus.test
│ │ ├── span.test
│ │ ├── calc_actiontype.test
│ │ ├── calc_unsafeaction.test
│ │ ├── calc_input.test
│ │ ├── calc_wasm.test
│ │ ├── calc_wasm.rs
│ │ ├── grmtools_section.test
│ │ └── cgen_helper.rs
│ ├── Cargo.toml
│ └── build.rs
├── build.rs
├── cttests_macro
│ ├── Cargo.toml
│ └── src
│ │ └── lib.rs
├── Cargo.toml
├── src
│ └── lib
│ │ ├── test_utils.rs
│ │ ├── dijkstra.rs
│ │ └── lex_api.rs
└── README.md
├── lrlex
├── build.rs
├── examples
│ ├── calclex
│ │ ├── src
│ │ │ ├── calc.l
│ │ │ └── main.rs
│ │ ├── build.rs
│ │ ├── Cargo.toml
│ │ └── README.md
│ └── calc_manual_lex
│ │ ├── README.md
│ │ ├── Cargo.toml
│ │ ├── build.rs
│ │ └── src
│ │ ├── calc.y
│ │ └── main.rs
├── README.md
├── Cargo.toml
└── src
│ ├── lib
│ └── defaults.rs
│ └── main.rs
├── deny.toml
├── .cargo
└── config.toml
├── cfgrammar
├── README.md
├── Cargo.toml
└── src
│ └── lib
│ ├── idxnewtype.rs
│ ├── yacc
│ └── mod.rs
│ ├── span.rs
│ └── mod.rs
├── lrtable
├── README.md
├── Cargo.toml
└── src
│ └── lib
│ └── mod.rs
├── .buildbot_dockerfile_debian
├── .github
└── workflows
│ └── sdci.yml
├── LICENSE-APACHE
├── nimbleparse
├── Cargo.toml
└── README.md
├── COPYRIGHT
├── LICENSE-MIT
├── Cargo.toml
├── .buildbot.sh
└── README.md
/doc/.gitignore:
--------------------------------------------------------------------------------
1 | book
2 | 
release 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | *.swp 3 | Cargo.lock 4 | -------------------------------------------------------------------------------- /lrpar/examples/calc_ast/src/input.txt: -------------------------------------------------------------------------------- 1 | 5 + 4 * 3 -------------------------------------------------------------------------------- /lrpar/examples/calc_actions/src/input.txt: -------------------------------------------------------------------------------- 1 | 5 + 4 * 3 -------------------------------------------------------------------------------- /lrpar/examples/calc_ast_arena/src/input.txt: -------------------------------------------------------------------------------- 1 | 5 + 4 * 3 -------------------------------------------------------------------------------- /lrpar/examples/calc_parsetree/src/input.txt: -------------------------------------------------------------------------------- 1 | 5 + 4 * 3 -------------------------------------------------------------------------------- /lrpar/examples/clone_param/src/input.txt: -------------------------------------------------------------------------------- 1 | 0++++--- 2 | -------------------------------------------------------------------------------- /doc/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | src = "src" 3 | title = "grmtools" 4 | -------------------------------------------------------------------------------- /lrpar/cttests/src/storaget.l: -------------------------------------------------------------------------------- 1 | %% 2 | , "," 3 | [a-zA-Z]+ "word" 4 | [\n\t ]+ ; 5 | -------------------------------------------------------------------------------- /lrpar/examples/start_states/src/input.txt: -------------------------------------------------------------------------------- 1 | coment /* */ 2 | nested comment /* /* */ */ 3 | -------------------------------------------------------------------------------- /lrpar/cttests/src/multi_start.l: -------------------------------------------------------------------------------- 1 | %% 2 | A+ 'A' 3 | B+ 'B' 4 | C+ 'C' 5 | ; ';' 6 | : ':' 7 | , ',' 8 | [ \n\t] ; -------------------------------------------------------------------------------- /lrpar/examples/calc_actions/src/calc.l: -------------------------------------------------------------------------------- 1 | %% 2 | [0-9]+ "INT" 3 | \+ "+" 4 | \* "*" 5 | \( "(" 6 | \) ")" 7 | [\t\n ]+ ; 8 | -------------------------------------------------------------------------------- /lrpar/examples/calc_parsetree/src/calc.l: -------------------------------------------------------------------------------- 1 | %% 2 | [0-9]+ "INT" 3 | \+ "+" 4 | \* "*" 5 | \( "(" 6 | \) ")" 7 | [\t\n ]+ ; 8 | -------------------------------------------------------------------------------- /lrpar/examples/clone_param/src/param.l: -------------------------------------------------------------------------------- 1 | %% 2 | (\-?)[0-9]+ "INT" 3 | \- "Decr" 4 | \+ "Incr" 5 | [\n\t\ ] ; 6 | . 
'UNMATCHED' 7 | -------------------------------------------------------------------------------- /lrlex/build.rs: -------------------------------------------------------------------------------- 1 | use vergen::EmitBuilder; 2 | 3 | fn main() { 4 | EmitBuilder::builder().build_timestamp().emit().unwrap(); 5 | } 6 | -------------------------------------------------------------------------------- /lrpar/examples/calc_ast/src/calc.l: -------------------------------------------------------------------------------- 1 | %% 2 | [0-9]+ "INT" 3 | \+ "+" 4 | \* "*" 5 | \( "(" 6 | \) ")" 7 | [\t\n ]+ ; 8 | . "UNMATCHED" 9 | -------------------------------------------------------------------------------- /lrpar/examples/calc_ast_arena/src/calc.l: -------------------------------------------------------------------------------- 1 | %% 2 | [0-9]+ "INT" 3 | \+ "+" 4 | \* "*" 5 | \( "(" 6 | \) ")" 7 | [\t\n ]+ ; 8 | . "UNMATCHED" 9 | -------------------------------------------------------------------------------- /lrlex/examples/calclex/src/calc.l: -------------------------------------------------------------------------------- 1 | %% 2 | ([0-9]+\.[0-9]*)|([0-9]*\.[0-9]+) "FLOAT" 3 | [0-9]+ "INT" 4 | \+ "+" 5 | \* "*" 6 | - "-" 7 | / "/" 8 | \( "(" 9 | \) ")" 10 | [\t ]+ ; 11 | -------------------------------------------------------------------------------- /deny.toml: -------------------------------------------------------------------------------- 1 | [licenses] 2 | confidence-threshold = 1.0 3 | allow = [ 4 | "Apache-2.0", 5 | "MIT", 6 | "BSD-3-Clause", 7 | "Unicode-3.0", 8 | "Zlib", 9 | ] 10 | -------------------------------------------------------------------------------- /lrpar/build.rs: -------------------------------------------------------------------------------- 1 | use vergen::EmitBuilder; 2 | 3 | fn main() { 4 | println!("cargo::rustc-check-cfg=cfg(grmtools_extra_checks)"); 5 | EmitBuilder::builder().build_timestamp().emit().unwrap(); 6 | } 7 | -------------------------------------------------------------------------------- /doc/src/libsandtools.md: -------------------------------------------------------------------------------- 1 | # The individual libraries and tools 2 | 3 | [grmtools](https://github.com/softdevteam/grmtools/) consists of several 4 | libraries and command-line tools. The following sections describe each. 5 | -------------------------------------------------------------------------------- /lrpar/examples/start_states/src/comment.y: -------------------------------------------------------------------------------- 1 | %grmtools{ 2 | yacckind: Original(GenericParseTree), 3 | test_files: ["input*.txt"], 4 | } 5 | %start Expr 6 | %% 7 | Expr: Expr Text | ; 8 | 9 | Text: 'TEXT'; 10 | -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.wasm32-wasip2] 2 | runner = "workspace_runner --target wasm32-wasip2 --" 3 | 4 | [target.wasm32-unknown-unknown] 5 | # Provided by the crate wasm-bindgen-cli.
6 | runner = "wasm-bindgen-test-runner" 7 | -------------------------------------------------------------------------------- /lrlex/examples/calclex/build.rs: -------------------------------------------------------------------------------- 1 | use lrlex::CTLexerBuilder; 2 | 3 | fn main() { 4 | CTLexerBuilder::new() 5 | .lexer_in_src_dir("calc.l") 6 | .unwrap() 7 | .build() 8 | .unwrap(); 9 | } 10 | -------------------------------------------------------------------------------- /lrpar/cttests/src/multi_start.y: -------------------------------------------------------------------------------- 1 | %grmtools{yacckind: Grmtools} 2 | %start AStart 3 | %token A B C 4 | %% 5 | 6 | AStart -> () 7 | : A ':' BStart ';' {} 8 | ; 9 | 10 | BStart -> () 11 | : B ',' C {} 12 | | C ',' B {} 13 | ; 14 | -------------------------------------------------------------------------------- /lrpar/examples/start_states/src/comment.l: -------------------------------------------------------------------------------- 1 | %x COMMENT 2 | %% 3 | . "TEXT" 4 | /\* <+COMMENT>; 5 | <COMMENT>. ; 6 | <COMMENT>\n ; 7 | <COMMENT>\*/ <-COMMENT>; -------------------------------------------------------------------------------- /lrpar/cttests/src/epp.test: -------------------------------------------------------------------------------- 1 | name: Test %epp string 2 | yacckind: Original(YaccOriginalActionKind::GenericParseTree) 3 | grammar: | 4 | %start A 5 | %epp a '"\"a"' 6 | %% 7 | A : 'a'; 8 | lexer: | 9 | %% 10 | a 'a' 11 | -------------------------------------------------------------------------------- /cfgrammar/README.md: -------------------------------------------------------------------------------- 1 | # `cfgrammar` 2 | 3 | `cfgrammar` reads in grammar files, processes them, and provides a convenient 4 | API for operating with them. It may be of interest to those manipulating 5 | grammars directly, or who wish to use custom types of parsers. 6 | -------------------------------------------------------------------------------- /lrtable/README.md: -------------------------------------------------------------------------------- 1 | # `lrtable` 2 | 3 | `lrtable` takes in grammars from [`cfgrammar`](cfgrammar.html) and creates LR 4 | state tables from them. Few users will be interested in its functionality 5 | directly, except those doing advanced forms of grammar analysis. 6 | -------------------------------------------------------------------------------- /lrpar/cttests/src/expect.test: -------------------------------------------------------------------------------- 1 | name: Test %expect 2 | yacckind: Original(YaccOriginalActionKind::NoAction) 3 | grammar: | 4 | %start A 5 | %expect 1 6 | %% 7 | A: 'a' 'b' | B 'b'; 8 | B: 'a'; 9 | lexer: | 10 | %% 11 | a 'a' 12 | b 'b' 13 | -------------------------------------------------------------------------------- /.buildbot_dockerfile_debian: -------------------------------------------------------------------------------- 1 | FROM debian:latest 2 | ARG CI_UID 3 | RUN useradd -m -u ${CI_UID} ci 4 | RUN apt-get update && \ 5 | apt-get -y install build-essential curl procps file 6 | WORKDIR /ci 7 | RUN chown ${CI_UID}:${CI_UID} . 8 | COPY --chown=${CI_UID}:${CI_UID} . . 9 | CMD sh -x .buildbot.sh 10 | -------------------------------------------------------------------------------- /lrpar/cttests/src/ctfails/warnings.test: -------------------------------------------------------------------------------- 1 | name: Test warnings are treated as errors by default.
2 | yacckind: Original(YaccOriginalActionKind::GenericParseTree) 3 | grammar: | 4 | %start A 5 | %token b 6 | %% 7 | A : 'a'; 8 | B : 'b'; 9 | lexer: | 10 | %% 11 | a 'a' 12 | b 'b' 13 | -------------------------------------------------------------------------------- /.github/workflows/sdci.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | merge_group: 4 | 5 | # This is required to silence emails about the workflow having no jobs. 6 | # We simply define a dummy job that does nothing much. 7 | jobs: 8 | dummy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - run: /usr/bin/true 12 | -------------------------------------------------------------------------------- /lrpar/cttests/src/ctfails/missing.test: -------------------------------------------------------------------------------- 1 | name: Test missing terms in lexer and parser 2 | yacckind: Original(YaccOriginalActionKind::NoAction) 3 | lex_flags: [ '!allow_missing_terms_in_lexer', '!allow_missing_tokens_in_parser' ] 4 | grammar: | 5 | %% 6 | S: 'B'; 7 | 8 | lexer: | 9 | %% 10 | A "A" 11 | -------------------------------------------------------------------------------- /lrpar/cttests/src/expectrr.test: -------------------------------------------------------------------------------- 1 | name: Test %expect 2 | yacckind: Original(YaccOriginalActionKind::NoAction) 3 | grammar: | 4 | %start A 5 | %expect 1 6 | %expect-rr 1 7 | %% 8 | A : 'a' 'b' | B 'b'; 9 | B : 'a' | C; 10 | C : 'a'; 11 | lexer: | 12 | %% 13 | a 'a' 14 | b 'b' 15 | -------------------------------------------------------------------------------- /lrpar/cttests/src/lexer_lifetime.test: -------------------------------------------------------------------------------- 1 | name: Test that the lexer does not have to outlive the input's lifetime 2 | yacckind: Grmtools 3 | grammar: | 4 | %start T 5 | %% 6 | T -> &'input str: 7 | "ID" { $lexer.span_str($1.unwrap().span()) } 8 | ; 9 | lexer: | 10 | %% 11 | [a-z] "ID" -------------------------------------------------------------------------------- /doc/src/thirdparty.md: -------------------------------------------------------------------------------- 1 | # Libraries and tools developed by third parties 2 | 3 | The following items, developed by third parties, use grmtools to extend or 4 | augment its functionality; they may be useful to people developing parsers 5 | with grmtools.
6 | 7 | - [nimbleparse_lsp](https://github.com/ratmice/nimbleparse_lsp) 8 | -------------------------------------------------------------------------------- /lrpar/cttests/src/parseparam.test: -------------------------------------------------------------------------------- 1 | name: Test %parse-param 2 | yacckind: Grmtools 3 | grammar: | 4 | %start S 5 | %parse-param p: &u64 6 | %% 7 | S -> u64: 8 | 'INT' { *p + $lexer.span_str($1.unwrap().span()).parse::<u64>().unwrap() } 9 | ; 10 | %% 11 | lexer: | 12 | %% 13 | [0-9]+ 'INT' 14 | -------------------------------------------------------------------------------- /lrpar/examples/calc_parsetree/src/calc.y: -------------------------------------------------------------------------------- 1 | %grmtools{ 2 | yacckind: Original(GenericParseTree), 3 | test_files: ["input*.txt"], 4 | } 5 | %start Expr 6 | %avoid_insert "INT" 7 | %% 8 | Expr: Expr '+' Term 9 | | Term ; 10 | 11 | Term: Term '*' Factor 12 | | Factor ; 13 | 14 | Factor: '(' Expr ')' 15 | | 'INT'; 16 | -------------------------------------------------------------------------------- /lrlex/examples/calc_manual_lex/README.md: -------------------------------------------------------------------------------- 1 | # Parsing a simple calculator language 2 | 3 | This directory contains a very simple example of a calculator evaluator that 4 | uses a hand-written lexer alongside an `lrpar` parser. 5 | 6 | Look at `build.rs` and `src/main.rs` to see how `lrlex` can make it easier to 7 | use a hand-written lexer with `lrpar`. 8 | -------------------------------------------------------------------------------- /lrpar/cttests/src/ctfails/warnings_flags.test: -------------------------------------------------------------------------------- 1 | name: Test enabling warnings are errors. 2 | yacckind: Original(YaccOriginalActionKind::GenericParseTree) 3 | yacc_flags: [ warnings_are_errors ] 4 | grammar: | 5 | %start A 6 | %token b 7 | %% 8 | A : 'a'; 9 | B : 'b'; 10 | lexer: | 11 | %% 12 | a 'a' 13 | b 'b' 14 | -------------------------------------------------------------------------------- /lrpar/cttests/src/regex_opt.test: -------------------------------------------------------------------------------- 1 | name: Test regex options via builder. 2 | yacckind: Original(YaccOriginalActionKind::NoAction) 3 | lex_flags: ['!dot_matches_new_line', 'octal'] 4 | grammar: | 5 | %start Start 6 | %% 7 | Start: 'ANY' | 'a' | 'NL'; 8 | 9 | lexer: | 10 | %% 11 | \141 'a' 12 | . 'ANY' 13 | [\n] 'NL' 14 | -------------------------------------------------------------------------------- /lrpar/cttests/src/warnings.test: -------------------------------------------------------------------------------- 1 | name: Test disabling warnings are errors.
2 | yacckind: Original(YaccOriginalActionKind::GenericParseTree) 3 | yacc_flags: [ '!warnings_are_errors', '!show_warnings' ] 4 | grammar: | 5 | %start A 6 | %token b 7 | %% 8 | A : 'a'; 9 | B : 'b'; 10 | lexer: | 11 | %% 12 | a 'a' 13 | b 'b' -------------------------------------------------------------------------------- /lrpar/cttests/src/quoting.test: -------------------------------------------------------------------------------- 1 | name: Test NoAction using the calculator grammar 2 | yacckind: Original(YaccOriginalActionKind::NoAction) 3 | grammar: | 4 | %start S 5 | %% 6 | S: '\' | '"' | '<' | '+' '🦀' ; 7 | 8 | lexer: | 9 | %% 10 | " '"' 11 | \< '<' 12 | \\ '\' 13 | \+ '+' 14 | 🦀 '🦀' 15 | [\t ]+ ; 16 | -------------------------------------------------------------------------------- /lrpar/cttests/src/lex_flags.test: -------------------------------------------------------------------------------- 1 | name: Lex flags in the grmtools section 2 | grammar: | 3 | %grmtools{yacckind: Original(NoAction)} 4 | %start Start 5 | %% 6 | Start: 'ANY' | 'a' | 'NL'; 7 | 8 | lexer: | 9 | %grmtools{!dot_matches_new_line, octal, size_limit: 1048576} 10 | %% 11 | \141 'a' 12 | . 'ANY' 13 | [\n] 'NL' 14 | -------------------------------------------------------------------------------- /doc/src/othertools.md: -------------------------------------------------------------------------------- 1 | # Other tools 2 | 3 | When parsing text in Rust, you should also evaluate the following tools to see 4 | if they are more suitable for your purposes: 5 | 6 | * [LALRPOP](http://lalrpop.github.io/lalrpop/) 7 | * [nom](https://crates.io/crates/nom) 8 | * [pest](https://pest.rs/) 9 | * [rust-peg](https://github.com/kevinmehall/rust-peg) 10 | -------------------------------------------------------------------------------- /lrpar/cttests/src/storaget.y: -------------------------------------------------------------------------------- 1 | %grmtools{yacckind: Grmtools} 2 | %% 3 | word_seq -> Vec<String> 4 | : "word" {vec![$lexer.span_str($1.as_ref().unwrap().span()).to_string()] 5 | } 6 | | word_seq "," "word" { 7 | let w: String = $lexer.span_str($3.as_ref().unwrap().span()).to_string(); 8 | $1.push(w); 9 | $1 10 | } 11 | ; 12 | %% 13 | -------------------------------------------------------------------------------- /lrpar/cttests/src/multitypes.test: -------------------------------------------------------------------------------- 1 | name: Test multiple types 2 | yacckind: Grmtools 3 | grammar: | 4 | %start S 5 | %% 6 | S -> Vec<A>: 7 | A { vec![$1] } 8 | | S A { 9 | $1.push($2); 10 | $1 11 | } 12 | ; 13 | A -> A: 'a' { A } ; 14 | %% 15 | pub struct A; 16 | lexer: | 17 | %% 18 | a 'a' 19 | -------------------------------------------------------------------------------- /doc/src/lrpar.md: -------------------------------------------------------------------------------- 1 | # `lrpar` 2 | 3 | `lrpar` ([crate](https://crates.io/crates/lrpar); 4 | [source](https://github.com/softdevteam/grmtools/tree/master/lrpar)) is the LR 5 | parser library aspect of grmtools. It takes in streams of lexemes (using a 6 | lexer of the user's choice) and parses them, determining if they successfully 7 | match a grammar or not; if not, it can optionally recover from errors.
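
The following sketch shows the typical way `lrpar` is driven from a `main.rs`, in the style of this repository's examples; it assumes a `calc.l`/`calc.y` pair processed at build time (as in `lrpar/examples/calc_actions`) whose grammar actions evaluate to a `u64`:

```rust
use lrlex::lrlex_mod;
use lrpar::lrpar_mod;

// Bring the lexer and parser generated from `calc.l`/`calc.y` into scope.
lrlex_mod!("calc.l");
lrpar_mod!("calc.y");

fn main() {
    let lexerdef = calc_l::lexerdef();
    // Each lexer instance lexes exactly one input in its lifetime.
    let lexer = lexerdef.lexer("2 + 3 * 4");
    // `parse` runs the grammar's action code and collects lexing/parsing
    // errors; because of error recovery, a result and errors can coexist.
    let (res, errs) = calc_y::parse(&lexer);
    for e in errs {
        println!("{}", e.pp(&lexer, &calc_y::token_epp));
    }
    if let Some(Ok(r)) = res {
        println!("Result: {}", r);
    }
}
```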
8 | -------------------------------------------------------------------------------- /lrpar/cttests/src/typeparams.test: -------------------------------------------------------------------------------- 1 | name: Test %parse-param 2 | yacckind: Grmtools 3 | grammar: | 4 | %start S 5 | %parse-generics 'a, T: Into<u64> + Copy, R: From<u64> 6 | %parse-param p: &'a T 7 | %% 8 | S -> R: 9 | 'INT' { From::from((*p).into() + $lexer.span_str($1.unwrap().span()).parse::<u64>().unwrap()) } 10 | ; 11 | %% 12 | lexer: | 13 | %% 14 | [0-9]+ 'INT' 15 | -------------------------------------------------------------------------------- /doc/src/README.md: -------------------------------------------------------------------------------- 1 | # grmtools 2 | 3 | [grmtools](https://github.com/softdevteam/grmtools/) is a suite of Rust 4 | libraries and binaries for parsing text, both at compile-time, and run-time. 5 | Most users will probably be interested in the compile-time Yacc feature, which 6 | allows traditional `.y` files to be used mostly unchanged in Rust. See the 7 | [Quickstart Guide](quickstart.md) for a quick introduction to this feature. 8 | -------------------------------------------------------------------------------- /lrpar/cttests/src/passthrough.test: -------------------------------------------------------------------------------- 1 | name: Test that $$ is passed through correctly. 2 | yacckind: Grmtools 3 | grammar: | 4 | %start Expr 5 | %avoid_insert "INT" 6 | %% 7 | Expr -> Result<String, ()>: 8 | Num { $1 } 9 | ; 10 | Num -> Result<String, ()>: 11 | "INT" { Ok(format!("$${}", $lexer.span_str($1.unwrap().span()))) } 12 | ; 13 | lexer: | 14 | %% 15 | [0-9]+ "INT" 16 | 17 | -------------------------------------------------------------------------------- /lrpar/cttests_macro/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cttests_macro" 3 | version = "0.1.0" 4 | edition = "2024" 5 | license = "Apache-2.0/MIT" 6 | 7 | [lib] 8 | proc-macro = true 9 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 10 | 11 | [dependencies] 12 | glob.workspace = true 13 | quote.workspace = true 14 | proc-macro2 = { version = "1.0", features=["proc-macro"]} 15 | syn.workspace = true 16 | -------------------------------------------------------------------------------- /lrpar/cttests/src/ctfails/test_files2.test: -------------------------------------------------------------------------------- 1 | name: Test non-string in array of %grmtools{test_files} 2 | grammar: | 3 | %grmtools { 4 | yacckind: Original(YaccOriginalActionKind::UserAction), 5 | recoverer: RecoveryKind::None, 6 | test_files: [ShouldBeAString] 7 | } 8 | %start Expr 9 | %actiontype () 10 | %% 11 | Expr: '(' ')' { () } ; 12 | lexer: | 13 | %% 14 | \( "(" 15 | \) ")" 16 | [\t\n ]+ ; 17 | -------------------------------------------------------------------------------- /lrlex/examples/calclex/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "calclex" 3 | version = "0.1.0" 4 | authors = ["Laurence Tratt "] 5 | edition = "2024" 6 | license = "Apache-2.0/MIT" 7 | 8 | [[bin]] 9 | doc = false 10 | name = "calclex" 11 | 12 | [build-dependencies] 13 | lrlex = { path = "../.." } 14 | 15 | [dependencies] 16 | cfgrammar = { path = "../../../cfgrammar" } 17 | lrlex = { path = "../.."
} 18 | lrpar = { path = "../../../lrpar" } 19 | -------------------------------------------------------------------------------- /lrpar/cttests/src/ctfails/test_files3.test: -------------------------------------------------------------------------------- 1 | name: Test empty matchless glob in array of %grmtools{test_files} 2 | grammar: | 3 | %grmtools { 4 | yacckind: Original(YaccOriginalActionKind::UserAction), 5 | recoverer: RecoveryKind::None, 6 | test_files: ["*.nonexistent"] 7 | } 8 | %start Expr 9 | %actiontype () 10 | %% 11 | Expr: '(' ')' { () } ; 12 | lexer: | 13 | %% 14 | \( "(" 15 | \) ")" 16 | [\t\n ]+ ; 17 | -------------------------------------------------------------------------------- /lrpar/cttests/src/ctfails/test_files1.test: -------------------------------------------------------------------------------- 1 | name: Test string value type instead of array in %grmtools{test_files} 2 | grammar: | 3 | %grmtools { 4 | yacckind: Original(YaccOriginalActionKind::UserAction), 5 | recoverer: RecoveryKind::None, 6 | test_files: "should_be_an_array" 7 | } 8 | %start Expr 9 | %actiontype () 10 | %% 11 | Expr: '(' ')' { () } ; 12 | lexer: | 13 | %% 14 | \( "(" 15 | \) ")" 16 | [\t\n ]+ ; 17 | -------------------------------------------------------------------------------- /lrpar/examples/clone_param/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "clone_param" 3 | version = "0.1.0" 4 | edition = "2024" 5 | license = "Apache-2.0/MIT" 6 | 7 | [[bin]] 8 | doc = false 9 | name = "clone_param" 10 | 11 | [build-dependencies] 12 | cfgrammar = { path="../../../cfgrammar" } 13 | lrlex = { path="../../../lrlex" } 14 | lrpar = { path="../.." } 15 | 16 | [dependencies] 17 | cfgrammar = { path="../../../cfgrammar" } 18 | lrlex = { path="../../../lrlex" } 19 | lrpar = { path="../.." } 20 | -------------------------------------------------------------------------------- /lrpar/examples/calc_ast/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "calc_ast" 3 | version = "0.1.0" 4 | authors = ["Laurence Tratt "] 5 | edition = "2024" 6 | license = "Apache-2.0/MIT" 7 | 8 | [[bin]] 9 | doc = false 10 | name = "calc_ast" 11 | 12 | [build-dependencies] 13 | cfgrammar = { path="../../../cfgrammar" } 14 | lrlex = { path="../../../lrlex" } 15 | lrpar = { path="../.." } 16 | 17 | [dependencies] 18 | cfgrammar = { path="../../../cfgrammar" } 19 | lrlex = { path="../../../lrlex" } 20 | lrpar = { path="../.." } 21 | -------------------------------------------------------------------------------- /lrpar/examples/start_states/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "start_states" 3 | version = "0.1.0" 4 | authors = ["Simon Martin "] 5 | edition = "2024" 6 | license = "Apache-2.0/MIT" 7 | 8 | [[bin]] 9 | doc = false 10 | name = "start_states" 11 | 12 | [build-dependencies] 13 | cfgrammar = { path="../../../cfgrammar" } 14 | lrlex = { path="../../../lrlex" } 15 | lrpar = { path="../.." } 16 | 17 | [dependencies] 18 | cfgrammar = { path="../../../cfgrammar" } 19 | lrlex = { path="../../../lrlex" } 20 | lrpar = { path="../.." 
} 21 | -------------------------------------------------------------------------------- /lrpar/examples/calc_actions/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "calc_actions" 3 | version = "0.1.0" 4 | authors = ["Laurence Tratt "] 5 | edition = "2024" 6 | license = "Apache-2.0/MIT" 7 | 8 | [[bin]] 9 | doc = false 10 | name = "calc_actions" 11 | 12 | [build-dependencies] 13 | cfgrammar = { path="../../../cfgrammar" } 14 | lrlex = { path="../../../lrlex" } 15 | lrpar = { path="../.." } 16 | 17 | [dependencies] 18 | cfgrammar = { path="../../../cfgrammar" } 19 | lrlex = { path="../../../lrlex" } 20 | lrpar = { path="../.." } 21 | -------------------------------------------------------------------------------- /lrpar/cttests/src/calc_noactions.test: -------------------------------------------------------------------------------- 1 | name: Test NoAction using the calculator grammar 2 | yacckind: Original(YaccOriginalActionKind::NoAction) 3 | grammar: | 4 | %start Expr 5 | %avoid_insert 'INT' 6 | %% 7 | Expr: Expr '+' Term 8 | | Term 9 | ; 10 | 11 | Term: Term '*' Factor 12 | | Factor 13 | ; 14 | 15 | Factor: '(' Expr ')' 16 | | 'INT' 17 | ; 18 | 19 | lexer: | 20 | %% 21 | [0-9]+ "INT" 22 | \+ "+" 23 | \* "*" 24 | \( "(" 25 | \) ")" 26 | [\t ]+ ; 27 | -------------------------------------------------------------------------------- /lrpar/examples/calc_parsetree/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "calc_parsetree" 3 | version = "0.1.0" 4 | authors = ["Laurence Tratt "] 5 | edition = "2024" 6 | license = "Apache-2.0/MIT" 7 | 8 | [[bin]] 9 | doc = false 10 | name = "calc_parsetree" 11 | 12 | [build-dependencies] 13 | cfgrammar = { path="../../../cfgrammar" } 14 | lrlex = { path="../../../lrlex" } 15 | lrpar = { path="../.." } 16 | 17 | [dependencies] 18 | cfgrammar = { path="../../../cfgrammar" } 19 | lrlex = { path="../../../lrlex" } 20 | lrpar = { path="../.." } 21 | -------------------------------------------------------------------------------- /lrlex/examples/calc_manual_lex/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "calc_manual_lex" 3 | version = "0.1.0" 4 | authors = ["Laurence Tratt "] 5 | edition = "2024" 6 | license = "Apache-2.0/MIT" 7 | 8 | [[bin]] 9 | doc = false 10 | name = "calc_manual_lex" 11 | 12 | [build-dependencies] 13 | cfgrammar = { path="../../../cfgrammar" } 14 | lrlex = { path="../../" } 15 | lrpar = { path="../../../lrpar" } 16 | 17 | [dependencies] 18 | cfgrammar = { path="../../../cfgrammar" } 19 | lrlex = { path="../.." } 20 | lrpar = { path="../../../lrpar" } 21 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use 2 | this file except in compliance with the License. You may obtain a copy of the 3 | License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software distributed 8 | under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 9 | CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | specific language governing permissions and limitations under the License. 
11 | -------------------------------------------------------------------------------- /lrpar/examples/calc_ast_arena/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "calc_ast_arena" 3 | version = "0.1.0" 4 | authors = ["Laurence Tratt "] 5 | edition = "2024" 6 | license = "Apache-2.0/MIT" 7 | 8 | [[bin]] 9 | doc = false 10 | name = "calc_ast_arena" 11 | 12 | [build-dependencies] 13 | cfgrammar = { path="../../../cfgrammar" } 14 | lrlex = { path="../../../lrlex" } 15 | lrpar = { path="../.." } 16 | 17 | [dependencies] 18 | cfgrammar = { path="../../../cfgrammar" } 19 | lrlex = { path="../../../lrlex" } 20 | lrpar = { path="../.." } 21 | bumpalo = "3" 22 | -------------------------------------------------------------------------------- /lrpar/examples/calc_parsetree/README.md: -------------------------------------------------------------------------------- 1 | # Parsing a simple calculator language 2 | 3 | This directory contains a very simple example of a calculator in `lrpar` that 4 | uses the generic parse tree output of `lrpar`. `cargo build` processes 5 | `src/calc.l` and `src/calc.y` at compile-time. The compiled program then takes 6 | input from stdin. You can type anything in here (though you'll only get useful 7 | output for valid input!) -- parsing and lexing errors are reported. 8 | 9 | Look at `build.rs`, `src/calc.y`, and `src/main.rs` to see how to use `lrpar` in 10 | your project. 11 | -------------------------------------------------------------------------------- /lrlex/examples/calclex/README.md: -------------------------------------------------------------------------------- 1 | # Lexing a simple calculator language 2 | 3 | This directory contains a very simple example of a calculator in `lrlex`. 4 | Executing `cargo run` processes `src/calc.l` at compile-time; the resulting 5 | binary then takes input from stdin. Each line should be a sequence of calculator 6 | lexemes (note that, since this is a lexer example, there is no notion of lexeme 7 | ordering: i.e. `1 2 +` is a valid sequence of lexemes as is `1 8 | + 2`). 9 | 10 | Look at `build.rs`, `src/calc.l`, and `src/main.rs` to see how to use `lrlex` in 11 | your project. 12 | -------------------------------------------------------------------------------- /lrpar/examples/calc_actions/README.md: -------------------------------------------------------------------------------- 1 | # Parsing a simple calculator language 2 | 3 | This directory contains a very simple example of a calculator in `lrpar` that 4 | executes user-specified actions as parsing is undertaken. `cargo build` 5 | processes `src/calc.l` and `src/calc.y` at compile-time. The compiled program 6 | then takes input from stdin. You can type anything in here (though you'll only 7 | get useful output for valid input!) -- parsing and lexing errors are reported. 8 | 9 | Look at `build.rs`, `src/calc.y`, and `src/main.rs` to see how to use `lrpar` in 10 | your project. 11 | -------------------------------------------------------------------------------- /lrpar/examples/calc_ast/README.md: -------------------------------------------------------------------------------- 1 | # Parsing a simple calculator language 2 | 3 | This directory contains a very simple example of a calculator in `lrpar` that 4 | builds up an AST and evaluates it once the AST is completely built. `cargo 5 | build` processes `src/calc.l` and `src/calc.y` at compile-time. The compiled 6 | program then takes input from stdin. 
You can type anything in here (though 7 | you'll only get useful output for valid input!) -- parsing and lexing errors 8 | are reported. 9 | 10 | Look at `build.rs`, `src/calc.y`, and `src/main.rs` to see how to use `lrpar` in 11 | your project. 12 | -------------------------------------------------------------------------------- /lrpar/examples/start_states/README.md: -------------------------------------------------------------------------------- 1 | # Parsing C-style block comments 2 | 3 | This directory contains a very simple example of (non-nested) comment removal in `lrpar` that 4 | uses the generic parse tree output of `lrpar`. `cargo build` processes 5 | `src/comment.l` and `src/comment.y` at compile-time. The compiled program then takes 6 | input from stdin. You can type anything in here (though you'll only get useful 7 | output for valid input!) -- parsing and lexing errors are reported. 8 | 9 | Look at `build.rs`, `src/comment.l`, `src/comment.y`, and `src/main.rs` to see how to use `lrpar` 10 | in your project. 11 | -------------------------------------------------------------------------------- /lrpar/examples/calc_parsetree/build.rs: -------------------------------------------------------------------------------- 1 | use lrlex::CTLexerBuilder; 2 | 3 | fn main() { 4 | // Since we're using both lrlex and lrpar, we use lrlex's `lrpar_config` convenience function 5 | // that makes it easy to a) create a lexer and parser and b) link them together. 6 | CTLexerBuilder::new() 7 | .rust_edition(lrlex::RustEdition::Rust2021) 8 | .lrpar_config(|ctp| { 9 | ctp.rust_edition(lrpar::RustEdition::Rust2021) 10 | .grammar_in_src_dir("calc.y") 11 | .unwrap() 12 | }) 13 | .lexer_in_src_dir("calc.l") 14 | .unwrap() 15 | .build() 16 | .unwrap(); 17 | } 18 | -------------------------------------------------------------------------------- /lrpar/examples/start_states/build.rs: -------------------------------------------------------------------------------- 1 | use lrlex::CTLexerBuilder; 2 | 3 | fn main() { 4 | // Since we're using both lrlex and lrpar, we use lrlex's `lrpar_config` convenience function 5 | // that makes it easy to a) create a lexer and parser and b) link them together. 
6 | CTLexerBuilder::new() 7 | .rust_edition(lrlex::RustEdition::Rust2021) 8 | .lrpar_config(|ctp| { 9 | ctp.rust_edition(lrpar::RustEdition::Rust2021) 10 | .grammar_in_src_dir("comment.y") 11 | .unwrap() 12 | }) 13 | .lexer_in_src_dir("comment.l") 14 | .unwrap() 15 | .build() 16 | .unwrap(); 17 | } 18 | -------------------------------------------------------------------------------- /nimbleparse/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nimbleparse" 3 | description = "Simple Yacc grammar debugging tool" 4 | repository = "https://github.com/softdevteam/grmtools" 5 | version = "0.14.0" 6 | edition = "2024" 7 | readme = "README.md" 8 | license = "Apache-2.0/MIT" 9 | categories = ["parsing"] 10 | 11 | [[bin]] 12 | doc = false 13 | name = "nimbleparse" 14 | 15 | [dependencies] 16 | cfgrammar = { path="../cfgrammar", version="0.14" } 17 | lrlex = { path="../lrlex", version="0.14" } 18 | lrpar = { path="../lrpar", version="0.14" } 19 | lrtable = { path="../lrtable", version="0.14" } 20 | 21 | getopts.workspace = true 22 | num-traits.workspace = true 23 | glob.workspace = true 24 | -------------------------------------------------------------------------------- /lrpar/examples/calc_ast/build.rs: -------------------------------------------------------------------------------- 1 | #![deny(rust_2018_idioms)] 2 | use lrlex::CTLexerBuilder; 3 | 4 | fn main() { 5 | // Since we're using both lrlex and lrpar, we use lrlex's `lrpar_config` convenience function 6 | // that makes it easy to a) create a lexer and parser and b) link them together. 7 | CTLexerBuilder::new() 8 | .rust_edition(lrlex::RustEdition::Rust2021) 9 | .lrpar_config(|ctp| { 10 | ctp.rust_edition(lrpar::RustEdition::Rust2021) 11 | .grammar_in_src_dir("calc.y") 12 | .unwrap() 13 | }) 14 | .lexer_in_src_dir("calc.l") 15 | .unwrap() 16 | .build() 17 | .unwrap(); 18 | } 19 | -------------------------------------------------------------------------------- /lrpar/examples/calc_actions/build.rs: -------------------------------------------------------------------------------- 1 | #![deny(rust_2018_idioms)] 2 | use lrlex::CTLexerBuilder; 3 | 4 | fn main() { 5 | // Since we're using both lrlex and lrpar, we use lrlex's `lrpar_config` convenience function 6 | // that makes it easy to a) create a lexer and parser and b) link them together. 7 | CTLexerBuilder::new() 8 | .rust_edition(lrlex::RustEdition::Rust2021) 9 | .lrpar_config(|ctp| { 10 | ctp.rust_edition(lrpar::RustEdition::Rust2021) 11 | .grammar_in_src_dir("calc.y") 12 | .unwrap() 13 | }) 14 | .lexer_in_src_dir("calc.l") 15 | .unwrap() 16 | .build() 17 | .unwrap(); 18 | } 19 | -------------------------------------------------------------------------------- /lrpar/examples/calc_ast_arena/build.rs: -------------------------------------------------------------------------------- 1 | #![deny(rust_2018_idioms)] 2 | use lrlex::CTLexerBuilder; 3 | 4 | fn main() { 5 | // Since we're using both lrlex and lrpar, we use lrlex's `lrpar_config` convenience function 6 | // that makes it easy to a) create a lexer and parser and b) link them together. 
7 | CTLexerBuilder::new() 8 | .rust_edition(lrlex::RustEdition::Rust2021) 9 | .lrpar_config(|ctp| { 10 | ctp.rust_edition(lrpar::RustEdition::Rust2021) 11 | .grammar_in_src_dir("calc.y") 12 | .unwrap() 13 | }) 14 | .lexer_in_src_dir("calc.l") 15 | .unwrap() 16 | .build() 17 | .unwrap(); 18 | } 19 | -------------------------------------------------------------------------------- /lrpar/examples/clone_param/build.rs: -------------------------------------------------------------------------------- 1 | #![deny(rust_2018_idioms)] 2 | use lrlex::CTLexerBuilder; 3 | 4 | fn main() { 5 | // Since we're using both lrlex and lrpar, we use lrlex's `lrpar_config` convenience function 6 | // that makes it easy to a) create a lexer and parser and b) link them together. 7 | CTLexerBuilder::new() 8 | .rust_edition(lrlex::RustEdition::Rust2021) 9 | .lrpar_config(|ctp| { 10 | ctp.rust_edition(lrpar::RustEdition::Rust2021) 11 | .grammar_in_src_dir("param.y") 12 | .unwrap() 13 | }) 14 | .lexer_in_src_dir("param.l") 15 | .unwrap() 16 | .build() 17 | .unwrap(); 18 | } 19 | -------------------------------------------------------------------------------- /lrlex/examples/calc_manual_lex/build.rs: -------------------------------------------------------------------------------- 1 | use lrlex::{CTTokenMapBuilder, DefaultLexerTypes}; 2 | use lrpar::CTParserBuilder; 3 | 4 | // Some of the token names in the parser do not lead to valid Rust identifiers, so we map them to 5 | // valid identifier names here. 6 | const TOKENS_MAP: &[(&str, &str)] = &[ 7 | ("+", "PLUS"), 8 | ("*", "STAR"), 9 | ("(", "LBRACK"), 10 | (")", "RBRACK"), 11 | ]; 12 | 13 | fn main() { 14 | let ctp = CTParserBuilder::<DefaultLexerTypes<u8>>::new() 15 | .grammar_in_src_dir("calc.y") 16 | .unwrap() 17 | .build() 18 | .unwrap(); 19 | CTTokenMapBuilder::<u8>::new("token_map", ctp.token_map()) 20 | .rename_map(Some(TOKENS_MAP)) 21 | .build() 22 | .unwrap(); 23 | } 24 | -------------------------------------------------------------------------------- /lrpar/cttests/src/parseparam_copy.test: -------------------------------------------------------------------------------- 1 | name: Test %parse-param copy 2 | yacckind: Grmtools 3 | grammar: | 4 | %start S 5 | %parse-param p: u64 6 | %% 7 | S -> u64: 8 | // Previously %parse-param required a `Copy` bounds. 9 | // Since then we relaxed the bounds to require `Clone`. 10 | // This tests backwards compatibility of actions that 11 | // rely on the older copy bounds. 12 | 'INT' { 13 | #[allow(clippy::redundant_closure_call)] 14 | (move |_| {})(p); 15 | check_copy(p); 16 | p + $lexer.span_str($1.unwrap().span()).parse::<u64>().unwrap() 17 | } 18 | ; 19 | %% 20 | fn check_copy<T: Copy>(_: T){} 21 | lexer: | 22 | %% 23 | [0-9]+ 'INT' 24 | -------------------------------------------------------------------------------- /lrpar/examples/clone_param/README.md: -------------------------------------------------------------------------------- 1 | # `clone_param` 2 | 3 | ## Description 4 | Example which shows how to use interior mutability with the `%parse-param` directive. 5 | As a parameter the parse function accepts a `Rc<RefCell<i64>>`. 6 | 7 | ## Input 8 | For input the parser accepts a positive or negative integer e.g. `-1`, `42`, etc. followed 9 | by any sequence of `+` or `-` characters. Except for the initial `-` on a negative integer, 10 | `+` or `-` are treated as `Increment` and `Decrement` operators. 11 | 12 | ## Evaluation 13 | Rather than building an AST, the param is directly mutated by the actions.
14 | As such, an input sequence like `-3++-` will evaluate to `-2`. 15 | 16 | ## Example 17 | ``` 18 | >>> -3++- 19 | Evaluated: RefCell { value: -2 } 20 | ``` 21 | -------------------------------------------------------------------------------- /doc/src/lrtable.md: -------------------------------------------------------------------------------- 1 | # `lrtable` 2 | 3 | `lrtable` ([crate](https://crates.io/crates/lrtable); 4 | [source](https://github.com/softdevteam/grmtools/tree/master/lrtable)) takes in 5 | grammars from [`cfgrammar`](cfgrammar.html) and creates LR state tables from 6 | them. Few users will be interested in its functionality directly, except those 7 | doing advanced forms of grammar analysis. 8 | 9 | One, admittedly fairly advanced, aspect worth noting is that 10 | `lrtable` uses [Pager's 11 | algorithm](https://link.springer.com/article/10.1007/BF00290336) to compress the 12 | resulting LR state tables. In rare cases this can provide surprising results: 13 | see [Denny and Malloy's 14 | paper](https://www.sciencedirect.com/science/article/pii/S0167642309001191) for 15 | more. 16 | -------------------------------------------------------------------------------- /doc/src/cfgrammar.md: -------------------------------------------------------------------------------- 1 | # `cfgrammar` 2 | 3 | `cfgrammar` ([crate](https://crates.io/crates/cfgrammar); 4 | [source](https://github.com/softdevteam/grmtools/tree/master/cfgrammar)) reads 5 | in grammar files, processes them, and provides a convenient API for operating 6 | with them. Most users only need to think about `cfgrammar` to the 7 | extent that they are required to use it to specify what Yacc variant they wish 8 | to use. 9 | 10 | `cfgrammar` may also be of interest to those manipulating grammars directly, or 11 | who wish to use custom types of parsers. Note that `cfgrammar`'s API should be 12 | considered semi-stable at best. As the needs of other parts of grmtools change, 13 | `cfgrammar` tends to have to change too. Since it is unlikely to have many direct 14 | users, the consequences of changing the API are relatively slight. 15 | -------------------------------------------------------------------------------- /cfgrammar/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cfgrammar" 3 | description = "Grammar manipulation" 4 | repository = "https://github.com/softdevteam/grmtools" 5 | version = "0.14.0" 6 | edition = "2024" 7 | readme = "README.md" 8 | license = "Apache-2.0/MIT" 9 | categories = ["parsing"] 10 | keywords = ["yacc", "grammar"] 11 | 12 | [features] 13 | serde = ["dep:serde", "serde/derive", "vob/serde"] 14 | bincode = ["dep:bincode", "vob/bincode"] 15 | 16 | [lib] 17 | name = "cfgrammar" 18 | path = "src/lib/mod.rs" 19 | 20 | [dependencies] 21 | bincode = { workspace = true, optional = true, features = ["derive"] } 22 | indexmap.workspace = true 23 | num-traits.workspace = true 24 | regex.workspace = true 25 | serde = { workspace = true, optional = true } 26 | vob = { workspace = true } 27 | quote.workspace = true 28 | proc-macro2.workspace = true 29 | -------------------------------------------------------------------------------- /doc/src/lexing.md: -------------------------------------------------------------------------------- 1 | # Lexing 2 | 3 | Lexing is the act of taking in an input stream and splitting it into lexemes. 4 | Colloquially, lexing is often described as splitting input into words.
In 5 | `grmtools`, a Lexeme has a type (e.g. "INT", "ID"), a value (e.g. "23", 6 | "xyz"), and knows which part of the user's input matched (e.g. "the input 7 | starting at index 7 to index 10"). There is also a simple mechanism to 8 | differentiate lexemes of zero length (e.g. `DEDENT` tokens in Python) from 9 | lexemes inserted by [error recovery](errorrecovery.md). 10 | 11 | `lrpar` provides a generic lexing interface that any lexer can plug into. 12 | Many lexing tasks can be carried out more easily by [`lrlex`](lrlex.md), a 13 | `lex` replacement. `lrlex` also provides helper functions which make it [easier 14 | to hand-write lexers](manuallexer.md). 15 | -------------------------------------------------------------------------------- /lrtable/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lrtable" 3 | description = "LR grammar table generation" 4 | repository = "https://github.com/softdevteam/grmtools" 5 | version = "0.14.0" 6 | edition = "2024" 7 | readme = "README.md" 8 | license = "Apache-2.0/MIT" 9 | categories = ["parsing"] 10 | 11 | [features] 12 | bincode = ["dep:bincode", "sparsevec/bincode", "cfgrammar/bincode"] 13 | serde = ["dep:serde", "sparsevec/serde", "cfgrammar/serde"] 14 | 15 | [lib] 16 | name = "lrtable" 17 | path = "src/lib/mod.rs" 18 | 19 | [dependencies] 20 | cfgrammar = { path="../cfgrammar", version="0.14" } 21 | 22 | bincode = { workspace = true, features = ["derive"], optional = true } 23 | fnv.workspace = true 24 | num-traits.workspace = true 25 | serde = { workspace = true, features = ["derive"], optional = true } 26 | vob.workspace = true 27 | sparsevec.workspace = true 28 | -------------------------------------------------------------------------------- /doc/src/parsing.md: -------------------------------------------------------------------------------- 1 | # Parsing 2 | 3 | Parsing is the act of checking whether a stream of lexemes matches a grammar. 4 | Since a simple "yes/no" answer is rarely useful, it is common to execute 5 | user-defined *actions* during parsing. 6 | 7 | `grmtools` contains libraries ([`cfgrammar`](cfgrammar.md) and 8 | [`lrtable`](lrtable.md)) which allow users to build their own LR parsers in 9 | whatever fashion they want. However, for 99% of cases, the [`lrpar`](lrpar.md) 10 | library is what users want and need: a (largely) Yacc-compatible parser. Roughly 11 | speaking, the core parts of grammars work identically in Yacc and `lrpar`, but 12 | some other parts of the system have been modernised (e.g. to avoid the use of 13 | global variables) and given a more idiomatic Rust feel. Notably, `lrpar` is 14 | built from the ground-up to have a powerful, flexible approach to [error 15 | recovery](errorrecovery.md).
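
To make the idea of actions concrete, here is a minimal sketch in the style of this repository's calculator grammars (the `Grmtools` yacckind and the `u64` return type are assumptions for illustration; actions can return any Rust type):

```
%grmtools{yacckind: Grmtools}
%start Expr
%%
Expr -> u64:
      Expr '+' Term { $1 + $3 }
    | Term { $1 }
    ;

Term -> u64:
      'INT' { $lexer.span_str($1.unwrap().span()).parse::<u64>().unwrap() }
    ;
```

Here `$1`, `$3`, etc. refer to the values of the corresponding symbols in the production: for rule references they are the value the sub-rule's action returned, while for tokens they are the matched lexeme (wrapped in a `Result` to account for error recovery).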
16 | -------------------------------------------------------------------------------- /lrpar/examples/clone_param/src/param.y: -------------------------------------------------------------------------------- 1 | %grmtools { 2 | yacckind: Grmtools, 3 | test_files: ["input*.txt"], 4 | } 5 | %expect-unused Unmatched "UNMATCHED" 6 | %token Incr Decr 7 | %parse-param val: Rc<RefCell<i64>> 8 | %% 9 | Expr -> () : "INT" Ops { 10 | *val.borrow_mut() += parse_int($lexer.span_str($1.map_err(|_| "").unwrap().span())).unwrap() 11 | }; 12 | Ops -> (): 13 | %empty {} 14 | | Ops Incr { *val.borrow_mut() += 1; } 15 | | Ops Decr { *val.borrow_mut() -= 1; } 16 | ; 17 | Unmatched -> (): 18 | "UNMATCHED" { } 19 | ; 20 | %% 21 | use std::{ rc::Rc, cell::RefCell, error::Error }; 22 | 23 | fn parse_int(s: &str) -> Result<i64, Box<dyn Error>> { 24 | match s.parse::<i64>() { 25 | Ok(val) => Ok(val), 26 | Err(_) => { 27 | Err(Box::from(format!("{} cannot be represented as a i64", s))) 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /doc/src/yaccextensions.md: -------------------------------------------------------------------------------- 1 | # Yacc Extensions 2 | 3 | At the beginning of a `.y` file is a `%grmtools{}` section; by default this section is required, 4 | but a default can be set or forced by using a `YaccKindResolver`. 5 | 6 | | Flag | Value | Required | 7 | |------------------|-------------------------------------------------|--------------| 8 | | `yacckind` | [YaccKind](yacccompatibility.md#yacckinds) | ✓ | 9 | | `recoverykind` | [RecoveryKind](errorrecovery.md#recoverykinds) | ✗ | 10 | | `test_files`[^†] | Array of string values | ✗ | 11 | 12 | [^†]: Strings containing globs are resolved relative to the yacc `.y` source file. 13 | `test_files` is currently experimental. 14 | 15 | ## Example 16 | 17 | ``` 18 | %grmtools{yacckind: Grmtools} 19 | %% 20 | Start: ; 21 | ``` 22 | -------------------------------------------------------------------------------- /doc/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | - [grmtools](README.md) 4 | - [Quickstart Guide](quickstart.md) 5 | - [Lexing](lexing.md) 6 | - [Lex compatibility](lexcompatibility.md) 7 | - [Extensions](lexextensions.md) 8 | - [Hand-written lexers](manuallexer.md) 9 | - [Start States](start_states.md) 10 | - [Parsing](parsing.md) 11 | - [Yacc compatibility](yacccompatibility.md) 12 | - [Extensions](yaccextensions.md) 13 | - [Return types and action code](actioncode.md) 14 | - [grmtools parsing idioms](parsing_idioms.md) 15 | - [Error recovery](errorrecovery.md) 16 | - [An AST evaluator](ast_example.md) 17 | - [Rust Editions](editions.md) 18 | - [The individual libraries and tools](libsandtools.md) 19 | - [lrpar](lrpar.md) 20 | - [lrlex](lrlex.md) 21 | - [nimbleparse](nimbleparse.md) 22 | - [cfgrammar](cfgrammar.md) 23 | - [lrtable](lrtable.md) 24 | - [third party](thirdparty.md) 25 | - [Other Rust parsing tools](othertools.md) 26 | -------------------------------------------------------------------------------- /lrlex/README.md: -------------------------------------------------------------------------------- 1 | # `lrlex` 2 | 3 | `lrlex` is a partial replacement for 4 | [`lex`](http://dinosaur.compilertools.net/lex/index.html) / 5 | [`flex`](https://westes.github.io/flex/manual/). It takes an input string and 6 | splits it into *lexemes* based on a `.l` file.
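For example, a small `.l` file in the same format as the calculator examples elsewhere in this repository looks as follows (one rule per line: a regular expression, then either a token name or `;` to discard the match):

```
%%
[0-9]+ "INT"
\+ "+"
\* "*"
\( "("
\) ")"
[\t\n ]+ ;
```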
Unfortunately, many real-world 7 | languages have corner cases which exceed the power that `lrlex` can provide. 8 | However, when it is suitable, it is a very convenient way of expressing lexing. 9 | 10 | `lrlex` also has a simple command-line interface, allowing you to check whether 11 | your lexing rules are working as expected: 12 | 13 | ```ignore 14 | $ cat C.java 15 | class C { 16 | int x = 0; 17 | } 18 | $ cargo run --bin lrlex java.l /tmp/C.java 19 | Finished dev [unoptimized + debuginfo] target(s) in 0.18s 20 | Running `target/debug/lrlex ../grammars/java7/java.l /tmp/C.java` 21 | CLASS class 22 | IDENTIFIER C 23 | LBRACE { 24 | INT int 25 | IDENTIFIER x 26 | EQ = 27 | INTEGER_LITERAL 0 28 | SEMICOLON ; 29 | RBRACE } 30 | ``` 31 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | Except as otherwise noted (below and/or in individual files), this project is 2 | licensed under the Apache License, Version 2.0 3 | or the MIT license 4 | , at your option. 5 | 6 | Copyright is retained by contributors and/or the organisations they 7 | represent(ed) -- this project does not require copyright assignment. Please see 8 | version control history for a full list of contributors. Note that some files 9 | may include explicit copyright and/or licensing notices. 10 | 11 | The following contributors wish to explicitly make it known that the copyright 12 | of their contributions is retained by an organisation: 13 | 14 | Lukas Diekmann : copyright retained by 15 | King's College London 16 | Sarah Mount : copyright retained by 17 | King's College London 18 | Laurence Tratt : copyright retained by 19 | King's College London 20 | -------------------------------------------------------------------------------- /lrlex/examples/calclex/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, BufRead, Write}; 2 | 3 | use lrlex::lrlex_mod; 4 | use lrpar::Lexer; 5 | 6 | // Using `lrlex_mod!` brings the lexer for `calc.l` into scope. By default the module name will be 7 | // `calc_l` (i.e. the file name, minus any extensions, with a suffix of `_l`). 8 | lrlex_mod!("calc.l"); 9 | 10 | fn main() { 11 | // Get the `LexerDef` for the `calc` language. 12 | let lexerdef = calc_l::lexerdef(); 13 | let stdin = io::stdin(); 14 | loop { 15 | print!(">>> "); 16 | io::stdout().flush().ok(); 17 | match stdin.lock().lines().next() { 18 | Some(Ok(ref l)) => { 19 | // Now we create a lexer with the `lexer` method with which we can lex an input. 20 | // Note that each lexer can only lex one input in its lifetime.
21 | let lexer = lexerdef.lexer(l); 22 | println!("{:?}", lexer.iter().collect::>()); 23 | } 24 | _ => break, 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /lrpar/cttests/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lrpar-tests" 3 | version = "0.1.0" 4 | authors = ["test"] 5 | edition = "2024" 6 | license = "Apache-2.0/MIT" 7 | build = "build.rs" 8 | 9 | [lib] 10 | crate-type = ["cdylib"] 11 | 12 | [build-dependencies] 13 | cfgrammar = { path = "../../cfgrammar" } 14 | lrlex = { path = "../../lrlex" } 15 | lrpar = { path = "../", features = ["_unstable_api"] } 16 | glob.workspace = true 17 | yaml-rust2.workspace = true 18 | cfg_aliases = "0.2.1" 19 | 20 | [dependencies] 21 | cfgrammar = { path = "../../cfgrammar" } 22 | glob.workspace = true 23 | lrlex = { path = "../../lrlex" } 24 | lrpar = { path = "../" } 25 | yaml-rust2.workspace = true 26 | 27 | [dev-dependencies] 28 | cttests_macro = { path = "../cttests_macro" } 29 | 30 | [target.'cfg(all(target_arch = "wasm32", target_os="unknown", target_vendor="unknown"))'.dependencies] 31 | wasm-bindgen = {version = "0.2.100", default-features = false} 32 | 33 | [target.'cfg(all(target_arch = "wasm32", target_os="unknown", target_vendor="unknown"))'.dev-dependencies] 34 | wasm-bindgen-test = "0.3.50" 35 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining a copy of 2 | this software and associated documentation files (the "Software"), to deal in 3 | the Software without restriction, including without limitation the rights to 4 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 5 | of the Software, and to permit persons to whom the Software is furnished to do 6 | so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all 9 | copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 17 | SOFTWARE. 18 | -------------------------------------------------------------------------------- /lrlex/examples/calc_manual_lex/src/calc.y: -------------------------------------------------------------------------------- 1 | %grmtools{yacckind: Grmtools} 2 | %start Expr 3 | %avoid_insert "INT" 4 | %expect-unused Unmatched "UNMATCHED" 5 | %% 6 | Expr -> Result: 7 | Expr '+' Term { 8 | Ok(Expr::Add{ span: $span, lhs: Box::new($1?), rhs: Box::new($3?) }) 9 | } 10 | | Term { $1 } 11 | ; 12 | 13 | Term -> Result: 14 | Term '*' Factor { 15 | Ok(Expr::Mul{ span: $span, lhs: Box::new($1?), rhs: Box::new($3?) 
 })
16 | }
17 |   | Factor { $1 }
18 |   ;
19 | 
20 | Factor -> Result<Expr, ()>:
21 |     '(' Expr ')' { $2 }
22 |   | 'INT' { Ok(Expr::Number{ span: $span }) }
23 |   ;
24 | 
25 | Unmatched -> ():
26 |     "UNMATCHED" { }
27 |   ;
28 | %%
29 | 
30 | use cfgrammar::Span;
31 | 
32 | #[derive(Debug)]
33 | pub enum Expr {
34 |     Add {
35 |         span: Span,
36 |         lhs: Box<Expr>,
37 |         rhs: Box<Expr>,
38 |     },
39 |     Mul {
40 |         span: Span,
41 |         lhs: Box<Expr>,
42 |         rhs: Box<Expr>,
43 |     },
44 |     Number {
45 |         span: Span
46 |     }
47 | }
48 | 
--------------------------------------------------------------------------------
/doc/src/editions.md:
--------------------------------------------------------------------------------
 1 | # Rust Editions
 2 | 
 3 | The [edition](https://doc.rust-lang.org/edition-guide/rust-2021/index.html)
 4 | of rust used by `grmtools` updates as the rust language evolves. We try to
 5 | keep code generated by `CTParserBuilder` and `CTLexerBuilder` building with
 6 | older versions of rust, so that downstream users can use the edition that
 7 | suits their requirements.
 8 | 
 9 | ## Controlling edition used during code generation
10 | 
11 | `CTLexerBuilder` and `CTParserBuilder` both have a `rust_edition()` function,
12 | accepting an `lrlex::RustEdition` and an `lrpar::RustEdition` respectively.
13 | 
14 | ## Known edition incompatibility in the book
15 | 
16 | While there is a preference for keeping the code in this manual working with all
17 | editions, exceptions may be made for clarity.
18 | 
19 | * In [An AST evaluator](ast_example.md), the rust_2018_idioms lint deprecates
20 | some behavior which was previously accepted by the 2015 edition. The `eval` function has
21 | an elided lifetime that must be given explicitly as `lexer: &dyn NonStreamingLexer<'_, DefaultLexeme, u32>`.
--------------------------------------------------------------------------------
/lrlex/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "lrlex"
 3 | description = "Simple lexer generator"
 4 | repository = "https://github.com/softdevteam/grmtools"
 5 | version = "0.14.0"
 6 | edition = "2024"
 7 | readme = "README.md"
 8 | license = "Apache-2.0/MIT"
 9 | categories = ["parsing"]
10 | 
11 | [[bin]]
12 | doc = false
13 | name = "lrlex"
14 | 
15 | [lib]
16 | name = "lrlex"
17 | path = "src/lib/mod.rs"
18 | 
19 | [features]
20 | _unstable_api = []
21 | _unsealed_unstable_traits = ["_unstable_api"]
22 | 
23 | [build-dependencies]
24 | vergen = { version = "8", default-features = false, features = ["build"] }
25 | 
26 | [dependencies]
27 | cfgrammar = { path = "../cfgrammar", version = "0.14" }
28 | lrpar = { path = "../lrpar", version = "0.14" }
29 | 
30 | getopts.workspace = true
31 | regex.workspace = true
32 | regex-syntax.workspace = true
33 | num-traits.workspace = true
34 | proc-macro2.workspace = true
35 | quote.workspace = true
36 | bincode.workspace = true
37 | serde = { workspace = true, optional = true }
38 | prettyplease.workspace = true
39 | syn.workspace = true
40 | glob.workspace = true
41 | 
--------------------------------------------------------------------------------
/lrpar/examples/calc_ast/src/calc.y:
--------------------------------------------------------------------------------
 1 | %grmtools {
 2 |     yacckind: Grmtools,
 3 |     test_files: ["input*.txt"],
 4 | }
 5 | %start Expr
 6 | %avoid_insert "INT"
 7 | %expect-unused Unmatched "UNMATCHED"
 8 | %%
 9 | Expr -> Result<Expr, ()>:
10 |     Expr '+' Term {
11 |         Ok(Expr::Add{ span: $span, lhs: Box::new($1?), rhs: Box::new($3?)
}) 12 | } 13 | | Term { $1 } 14 | ; 15 | 16 | Term -> Result: 17 | Term '*' Factor { 18 | Ok(Expr::Mul{ span: $span, lhs: Box::new($1?), rhs: Box::new($3?) }) 19 | } 20 | | Factor { $1 } 21 | ; 22 | 23 | Factor -> Result: 24 | '(' Expr ')' { $2 } 25 | | 'INT' { Ok(Expr::Number{ span: $span }) } 26 | ; 27 | 28 | Unmatched -> (): 29 | "UNMATCHED" { } 30 | ; 31 | %% 32 | 33 | use cfgrammar::Span; 34 | 35 | #[derive(Debug)] 36 | pub enum Expr { 37 | Add { 38 | span: Span, 39 | lhs: Box, 40 | rhs: Box, 41 | }, 42 | Mul { 43 | span: Span, 44 | lhs: Box, 45 | rhs: Box, 46 | }, 47 | Number { 48 | span: Span 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /doc/src/lrlex.md: -------------------------------------------------------------------------------- 1 | # `lrlex` 2 | 3 | `lrlex` ([crate](https://crates.io/crates/lrlex); 4 | [source](https://github.com/softdevteam/grmtools/tree/master/lrlex)) is a 5 | partial replacement for [`lex`](https://web.archive.org/web/20220402195947/dinosaur.compilertools.net/lex/index.html) / 6 | [`flex`](https://westes.github.io/flex/manual/). It takes an input string and 7 | splits it into *lexemes* based on a `.l` file. Unfortunately, many real-world 8 | languages have corner cases which exceed the power that `lrlex` can provide. 9 | However, when it is suitable, it is a very convenient way of expressing lexing. 10 | 11 | `lrlex` also has a simple command-line interface, allowing you to check whether 12 | your lexing rules are working as expected: 13 | 14 | ``` 15 | $ cat C.java 16 | class C { 17 | int x = 0; 18 | } 19 | $ cargo run --lrlex java.l /tmp/C.java 20 | Finished dev [unoptimized + debuginfo] target(s) in 0.18s 21 | Running `target/debug/lrlex ../grammars/java7/java.l /tmp/C.java` 22 | CLASS class 23 | IDENTIFIER C 24 | LBRACE { 25 | INT int 26 | IDENTIFIER x 27 | EQ = 28 | INTEGER_LITERAL 0 29 | SEMICOLON ; 30 | RBRACE } 31 | ``` 32 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members=[ 3 | "cfgrammar", 4 | "lrlex", 5 | "lrlex/examples/calclex", 6 | "lrlex/examples/calc_manual_lex", 7 | "lrpar", 8 | "lrpar/cttests", 9 | "lrpar/cttests_macro", 10 | "lrpar/examples/calc_actions", 11 | "lrpar/examples/calc_ast", 12 | "lrpar/examples/calc_parsetree", 13 | "lrpar/examples/calc_ast_arena", 14 | "lrpar/examples/start_states", 15 | "lrpar/examples/clone_param", 16 | "lrtable", 17 | "nimbleparse", 18 | ] 19 | resolver = "2" 20 | 21 | [profile.release] 22 | opt-level = 3 23 | debug = false 24 | rpath = false 25 | lto = true 26 | debug-assertions = false 27 | codegen-units = 1 28 | panic = 'unwind' 29 | 30 | [workspace.dependencies] 31 | bincode = "2.0" 32 | cactus = "1.0" 33 | filetime = "0.2" 34 | fnv = "1.0" 35 | getopts = "0.2" 36 | glob = "0.3" 37 | indexmap = "2" 38 | num-traits = "0.2" 39 | packedvec = "1.2" 40 | quote = "1.0" 41 | regex = "1.3" 42 | regex-syntax = "0.8" 43 | serde = "1.0" 44 | sparsevec = "0.2.2" 45 | unicode-width = "0.1.11" 46 | vob = "3.0.4" 47 | proc-macro2 = "1.0" 48 | prettyplease = "0.2.31" 49 | syn = "2.0" 50 | yaml-rust2 = "0.10.1" 51 | -------------------------------------------------------------------------------- /lrpar/examples/calc_actions/src/calc.y: -------------------------------------------------------------------------------- 1 | %grmtools { 2 | yacckind: Grmtools, 3 | test_files: ["input*.txt"], 4 | } 5 | %start Expr 6 | 
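/* A note on the declaration below: `%avoid_insert "INT"` asks lrpar's error
   recovery to prefer repair sequences that do not insert an `INT` token,
   since an inserted `INT` has no underlying input text for actions to read. */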
%avoid_insert "INT" 7 | %% 8 | Expr -> Result>: 9 | Expr '+' Term { 10 | $1?.checked_add($3?) 11 | .ok_or_else(|| Box::::from("Overflow detected.")) 12 | } 13 | | Term { $1 } 14 | ; 15 | 16 | Term -> Result>: 17 | Term '*' Factor { 18 | $1?.checked_mul($3?) 19 | .ok_or_else(|| Box::::from("Overflow detected.")) 20 | } 21 | | Factor { $1 } 22 | ; 23 | 24 | Factor -> Result>: 25 | '(' Expr ')' { $2 } 26 | | 'INT' { 27 | parse_int($lexer.span_str($1.map_err(|_| "")?.span())) 28 | } 29 | ; 30 | %% 31 | // Any imports here are in scope for all the grammar actions above. 32 | 33 | use std::error::Error; 34 | 35 | fn parse_int(s: &str) -> Result> { 36 | match s.parse::() { 37 | Ok(val) => Ok(val), 38 | Err(_) => { 39 | Err(Box::from(format!("{} cannot be represented as a u64", s))) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /lrpar/cttests/src/calc_nodefault_yacckind.test: -------------------------------------------------------------------------------- 1 | name: Test specification of yacckind in %grmtools section 2 | grammar: | 3 | %grmtools {yacckind: Original(UserAction)} 4 | %start Expr 5 | %actiontype Result 6 | %avoid_insert 'INT' 7 | %% 8 | Expr: Expr '+' Term { Ok($1? + $3?) } 9 | | Term { $1 } 10 | ; 11 | 12 | Term: Term '*' Factor { Ok($1? * $3?) } 13 | | Factor { $1 } 14 | ; 15 | 16 | Factor: '(' Expr ')' { $2 } 17 | | 'INT' { 18 | let l = $1.map_err(|_| ())?; 19 | match $lexer.span_str(l.span()).parse::() { 20 | Ok(v) => Ok(v), 21 | Err(_) => { 22 | let ((_, col), _) = $lexer.line_col(l.span()); 23 | eprintln!("Error at column {}: '{}' cannot be represented as a u64", 24 | col, 25 | $lexer.span_str(l.span())); 26 | Err(()) 27 | } 28 | } 29 | } 30 | ; 31 | 32 | lexer: | 33 | %% 34 | [0-9]+ "INT" 35 | \+ "+" 36 | \* "*" 37 | \( "(" 38 | \) ")" 39 | [\t ]+ ; 40 | -------------------------------------------------------------------------------- /lrpar/cttests/src/calc_recoverer_none.test: -------------------------------------------------------------------------------- 1 | vname: Test %grmtools section RecoveryKind::None 2 | grammar: | 3 | %grmtools {yacckind: Original(UserAction), recoverer: RecoveryKind::None} 4 | %start Expr 5 | %actiontype Result 6 | %avoid_insert 'INT' 7 | %% 8 | Expr: Expr '+' Term { Ok($1? + $3?) } 9 | | Term { $1 } 10 | ; 11 | 12 | Term: Term '*' Factor { Ok($1? * $3?) } 13 | | Factor { $1 } 14 | ; 15 | 16 | Factor: '(' Expr ')' { $2 } 17 | | 'INT' { 18 | let l = $1.map_err(|_| ())?; 19 | match $lexer.span_str(l.span()).parse::() { 20 | Ok(v) => Ok(v), 21 | Err(_) => { 22 | let ((_, col), _) = $lexer.line_col(l.span()); 23 | eprintln!("Error at column {}: '{}' cannot be represented as a u64", 24 | col, 25 | $lexer.span_str(l.span())); 26 | Err(()) 27 | } 28 | } 29 | } 30 | ; 31 | 32 | lexer: | 33 | %% 34 | [0-9]+ "INT" 35 | \+ "+" 36 | \* "*" 37 | \( "(" 38 | \) ")" 39 | [\t ]+ ; 40 | -------------------------------------------------------------------------------- /lrpar/cttests/src/calc_multitypes.test: -------------------------------------------------------------------------------- 1 | name: Test basic user actions using the calculator grammar (Grmtools yacckind) 2 | yacckind: Grmtools 3 | recoverer: RecoveryKind::CPCTPlus 4 | grammar: | 5 | %start Expr 6 | %avoid_insert "INT" 7 | %% 8 | Expr -> Result: 9 | Expr '+' Term { Ok($1? + $3?) } 10 | | Term { $1 } 11 | ; 12 | 13 | Term -> Result: 14 | Term '*' Factor { Ok($1? * $3?) 
} 15 | | Factor { $1 } 16 | ; 17 | 18 | Factor -> Result: 19 | '(' Expr ')' { $2 } 20 | | 'INT' 21 | { 22 | let v = $1.map_err(|_| ())?; 23 | parse_int($lexer.span_str(v.span())) 24 | } 25 | ; 26 | %% 27 | // Any functions here are in scope for all the grammar actions above. 28 | 29 | fn parse_int(s: &str) -> Result { 30 | match s.parse::() { 31 | Ok(val) => Ok(val), 32 | Err(_) => { 33 | eprintln!("{} cannot be represented as a u64", s); 34 | Err(()) 35 | } 36 | } 37 | } 38 | lexer: | 39 | %% 40 | [0-9]+ "INT" 41 | \+ "+" 42 | \* "*" 43 | \( "(" 44 | \) ")" 45 | [\t ]+ ; 46 | -------------------------------------------------------------------------------- /lrpar/cttests/src/calc_recoverer_cpctplus.test: -------------------------------------------------------------------------------- 1 | name: Test multiple values in %grmtools section 2 | grammar: | 3 | %grmtools {yacckind: Original(UserAction), recoverer: RecoveryKind::CPCTPlus} 4 | %start Expr 5 | %actiontype Result 6 | %avoid_insert 'INT' 7 | %% 8 | Expr: Expr '+' Term { Ok($1? + $3?) } 9 | | Term { $1 } 10 | ; 11 | 12 | Term: Term '*' Factor { Ok($1? * $3?) } 13 | | Factor { $1 } 14 | ; 15 | 16 | Factor: '(' Expr ')' { $2 } 17 | | 'INT' { 18 | let l = $1.map_err(|_| ())?; 19 | match $lexer.span_str(l.span()).parse::() { 20 | Ok(v) => Ok(v), 21 | Err(_) => { 22 | let ((_, col), _) = $lexer.line_col(l.span()); 23 | eprintln!("Error at column {}: '{}' cannot be represented as a u64", 24 | col, 25 | $lexer.span_str(l.span())); 26 | Err(()) 27 | } 28 | } 29 | } 30 | ; 31 | 32 | lexer: | 33 | %% 34 | [0-9]+ "INT" 35 | \+ "+" 36 | \* "*" 37 | \( "(" 38 | \) ")" 39 | [\t ]+ ; 40 | -------------------------------------------------------------------------------- /lrpar/cttests/src/span.test: -------------------------------------------------------------------------------- 1 | name: Test the span variable 2 | yacckind: Grmtools 3 | grammar: | 4 | %start Expr 5 | %avoid_insert "INT" 6 | %% 7 | Expr -> Vec<::cfgrammar::Span>: 8 | Expr '+' Term { 9 | let mut spans = $1; 10 | spans.extend($3); 11 | spans.push($span); 12 | spans 13 | } 14 | | Term { 15 | let mut spans = $1; 16 | spans.push($span); 17 | spans 18 | } 19 | ; 20 | 21 | Term -> Vec<::cfgrammar::Span>: 22 | Term '*' Factor { 23 | let mut spans = $1; 24 | spans.extend($3); 25 | spans.push($span); 26 | spans 27 | } 28 | | Factor { 29 | let mut spans = $1; 30 | spans.push($span); 31 | spans 32 | } 33 | ; 34 | 35 | Factor -> Vec<::cfgrammar::Span>: 36 | '(' Expr ')' { 37 | let mut spans = $2; 38 | spans.push($span); 39 | spans 40 | } 41 | | 'INT' { vec![$span] } 42 | ; 43 | lexer: | 44 | %% 45 | [0-9]+ "INT" 46 | \+ "+" 47 | \* "*" 48 | \( "(" 49 | \) ")" 50 | [\t ]+ ; 51 | -------------------------------------------------------------------------------- /lrpar/examples/calc_ast_arena/src/calc.y: -------------------------------------------------------------------------------- 1 | %grmtools { 2 | yacckind: Grmtools, 3 | test_files: ["input*.txt"], 4 | } 5 | %start Expr 6 | %avoid_insert "INT" 7 | %expect-unused Unmatched "UNMATCHED" 8 | %parse-generics 'ast 9 | %parse-param arena: &'ast Bump 10 | %% 11 | Expr -> Result, ()>: 12 | Expr '+' Term { 13 | Ok(Expr::Add{ span: $span, lhs: arena.alloc($1?), rhs: arena.alloc($3?) }) 14 | } 15 | | Term { $1 } 16 | ; 17 | 18 | Term -> Result, ()>: 19 | Term '*' Factor { 20 | Ok(Expr::Mul{ span: $span, lhs: arena.alloc($1?), rhs: arena.alloc($3?) 
}) 21 | } 22 | | Factor { $1 } 23 | ; 24 | 25 | Factor -> Result, ()>: 26 | '(' Expr ')' { $2 } 27 | | 'INT' { Ok(Expr::Number{ span: $span }) } 28 | ; 29 | 30 | Unmatched -> (): 31 | "UNMATCHED" { } 32 | ; 33 | %% 34 | 35 | use cfgrammar::Span; 36 | use bumpalo::Bump; 37 | 38 | #[derive(Debug)] 39 | pub enum Expr<'ast> { 40 | Add { 41 | span: Span, 42 | lhs: &'ast Expr<'ast>, 43 | rhs: &'ast Expr<'ast>, 44 | }, 45 | Mul { 46 | span: Span, 47 | lhs: &'ast Expr<'ast>, 48 | rhs: &'ast Expr<'ast>, 49 | }, 50 | Number { 51 | span: Span 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /lrpar/cttests/src/calc_actiontype.test: -------------------------------------------------------------------------------- 1 | name: Test basic user actions using the calculator grammar (Original yacckind) 2 | yacckind: Original(YaccOriginalActionKind::UserAction) 3 | recoverer: RecoveryKind::None 4 | grammar: | 5 | %start Expr 6 | %actiontype Result 7 | %avoid_insert 'INT' 8 | %% 9 | Expr: Expr '+' Term { Ok($1? + $3?) } 10 | | Term { $1 } 11 | ; 12 | 13 | Term: Term '*' Factor { Ok($1? * $3?) } 14 | | Factor { $1 } 15 | ; 16 | 17 | Factor: '(' Expr ')' { $2 } 18 | | 'INT' { 19 | let l = $1.map_err(|_| ())?; 20 | match $lexer.span_str(l.span()).parse::() { 21 | Ok(v) => Ok(v), 22 | Err(_) => { 23 | let ((_, col), _) = $lexer.line_col(l.span()); 24 | eprintln!("Error at column {}: '{}' cannot be represented as a u64", 25 | col, 26 | $lexer.span_str(l.span())); 27 | Err(()) 28 | } 29 | } 30 | } 31 | ; 32 | 33 | lexer: | 34 | %% 35 | [0-9]+ "INT" 36 | \+ "+" 37 | \* "*" 38 | \( "(" 39 | \) ")" 40 | [\t ]+ ; 41 | -------------------------------------------------------------------------------- /lrpar/cttests/src/calc_unsafeaction.test: -------------------------------------------------------------------------------- 1 | name: Test unsafe user actions using the calculator grammar 2 | yacckind: Original(YaccOriginalActionKind::UserAction) 3 | grammar: | 4 | %start Expr 5 | %actiontype Result 6 | %avoid_insert 'INT' 7 | %% 8 | Expr: Expr '+' Term { unsafe { unsafe_ok($1? + $3?) } } 9 | | Term { $1 } 10 | ; 11 | 12 | Term: Term '*' Factor { unsafe { unsafe_ok($1? * $3?) } } 13 | | Factor { $1 } 14 | ; 15 | 16 | Factor: '(' Expr ')' { $2 } 17 | | 'INT' { 18 | let l = $1.map_err(|_| ())?; 19 | match $lexer.span_str(l.span()).parse::() { 20 | Ok(v) => unsafe { unsafe_ok(v) }, 21 | Err(_) => { 22 | let ((_, col), _) = $lexer.line_col(l.span()); 23 | eprintln!("Error at column {}: '{}' cannot be represented as a u64", 24 | col, 25 | $lexer.span_str(l.span())); 26 | Err(()) 27 | } 28 | } 29 | } 30 | ; 31 | %% 32 | // Just check that unsafe blocks work in actions. 33 | unsafe fn unsafe_ok(x:T) -> Result { 34 | Ok(x) 35 | } 36 | 37 | lexer: | 38 | %% 39 | [0-9]+ "INT" 40 | \+ "+" 41 | \* "*" 42 | \( "(" 43 | \) ")" 44 | [\t ]+ ; 45 | -------------------------------------------------------------------------------- /lrpar/cttests/src/ctfails/calc_bad_input.test: -------------------------------------------------------------------------------- 1 | name: Test calculator with malformed input from %grmtools{test_files} 2 | grammar: | 3 | %grmtools { 4 | yacckind: Original(YaccOriginalActionKind::UserAction), 5 | recoverer: RecoveryKind::None, 6 | test_files: ["*.valid_input", "*.bad_input"] 7 | } 8 | %start Expr 9 | %actiontype Result 10 | %avoid_insert 'INT' 11 | %% 12 | Expr: Expr '+' Term { Ok($1? + $3?) } 13 | | Term { $1 } 14 | ; 15 | 16 | Term: Term '*' Factor { Ok($1? * $3?) 
} 17 | | Factor { $1 } 18 | ; 19 | 20 | Factor: '(' Expr ')' { $2 } 21 | | 'INT' { 22 | let l = $1.map_err(|_| ())?; 23 | match $lexer.span_str(l.span()).parse::() { 24 | Ok(v) => Ok(v), 25 | Err(_) => { 26 | let ((_, col), _) = $lexer.line_col(l.span()); 27 | eprintln!("Error at column {}: '{}' cannot be represented as a u64", 28 | col, 29 | $lexer.span_str(l.span())); 30 | Err(()) 31 | } 32 | } 33 | } 34 | ; 35 | 36 | lexer: | 37 | %% 38 | [0-9]+ "INT" 39 | \+ "+" 40 | \* "*" 41 | \( "(" 42 | \) ")" 43 | [\t\n ]+ ; 44 | extra_files: 45 | input1.valid_input: | 46 | (1 + 2) * 3 47 | input1.bad_input: | 48 | (1 + 2 * 3 49 | -------------------------------------------------------------------------------- /lrpar/cttests/src/calc_input.test: -------------------------------------------------------------------------------- 1 | name: Test with calculator input from %grmtools{test_files} 2 | grammar: | 3 | %grmtools { 4 | yacckind: Original(YaccOriginalActionKind::UserAction), 5 | recoverer: RecoveryKind::None, 6 | test_files: ["*.calc_input", "*.calc_input2"], 7 | } 8 | %start Expr 9 | %actiontype Result 10 | %avoid_insert 'INT' 11 | %% 12 | Expr: Expr '+' Term { Ok($1? + $3?) } 13 | | Term { $1 } 14 | ; 15 | 16 | Term: Term '*' Factor { Ok($1? * $3?) } 17 | | Factor { $1 } 18 | ; 19 | 20 | Factor: '(' Expr ')' { $2 } 21 | | 'INT' { 22 | let l = $1.map_err(|_| ())?; 23 | match $lexer.span_str(l.span()).parse::() { 24 | Ok(v) => Ok(v), 25 | Err(_) => { 26 | let ((_, col), _) = $lexer.line_col(l.span()); 27 | eprintln!("Error at column {}: '{}' cannot be represented as a u64", 28 | col, 29 | $lexer.span_str(l.span())); 30 | Err(()) 31 | } 32 | } 33 | } 34 | ; 35 | 36 | lexer: | 37 | %% 38 | [0-9]+ "INT" 39 | \+ "+" 40 | \* "*" 41 | \( "(" 42 | \) ")" 43 | [\t\n ]+ ; 44 | extra_files: 45 | input1.calc_input: | 46 | 1 + 2 * 3 47 | input2.calc_input: | 48 | (1 + 2) * 3 49 | input1.calc_input2: | 50 | 2 * 3 51 | 52 | -------------------------------------------------------------------------------- /lrpar/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lrpar" 3 | description = "Yacc-compatible parser generator" 4 | repository = "https://github.com/softdevteam/grmtools" 5 | version = "0.14.0" 6 | edition = "2024" 7 | readme = "README.md" 8 | license = "Apache-2.0/MIT" 9 | build = "build.rs" 10 | categories = ["parsing"] 11 | keywords = ["parser", "LR", "yacc", "grammar"] 12 | 13 | [lib] 14 | name = "lrpar" 15 | path = "src/lib/mod.rs" 16 | 17 | [features] 18 | serde = ["dep:serde", "cfgrammar/serde", "lrtable/serde"] 19 | _unstable_api = [] 20 | _unsealed_unstable_traits = ["_unstable_api"] 21 | 22 | [build-dependencies] 23 | vergen = { version = "8", default-features = false, features = ["build"] } 24 | 25 | [dependencies] 26 | cfgrammar = { path="../cfgrammar", version = "0.14", features = ["bincode"] } 27 | lrtable = { path="../lrtable", version = "0.14", features = ["bincode"] } 28 | 29 | bincode = { workspace = true, features = ["derive"] } 30 | cactus.workspace = true 31 | filetime.workspace = true 32 | indexmap.workspace = true 33 | num-traits.workspace = true 34 | packedvec.workspace = true 35 | proc-macro2.workspace = true 36 | quote.workspace = true 37 | regex.workspace = true 38 | serde = { workspace = true, features = ["derive"], optional = true } 39 | vob.workspace = true 40 | syn.workspace = true 41 | prettyplease.workspace = true 42 | unicode-width.workspace = true 43 | 44 | [target.'cfg(target_arch = 
"wasm32")'.dependencies] 45 | web-time = "1.1.0" 46 | 47 | [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] 48 | tempfile = "3.0" 49 | -------------------------------------------------------------------------------- /lrpar/cttests/src/calc_wasm.test: -------------------------------------------------------------------------------- 1 | name: Test running on wasm targets 2 | grammar: | 3 | %grmtools {yacckind: Grmtools} 4 | %start Expr 5 | %avoid_insert "INT" 6 | %expect-unused Unmatched "UNMATCHED" 7 | %epp INT "Int" 8 | %% 9 | Expr -> Result>: 10 | Expr '+' Term { 11 | $1?.checked_add($3?) 12 | .ok_or_else(|| Box::::from("Overflow detected.")) 13 | } 14 | | Term { $1 } 15 | ; 16 | 17 | Term -> Result>: 18 | Term '*' Factor { 19 | $1?.checked_mul($3?) 20 | .ok_or_else(|| Box::::from("Overflow detected.")) 21 | } 22 | | Factor { $1 } 23 | ; 24 | 25 | Factor -> Result>: 26 | '(' Expr ')' { $2 } 27 | | 'INT' { 28 | parse_int($lexer.span_str($1.map_err(|_| "")?.span())) 29 | } 30 | ; 31 | Unmatched -> (): "UNMATCHED" { }; 32 | %% 33 | // Any imports here are in scope for all the grammar actions above. 34 | 35 | use std::error::Error; 36 | 37 | fn parse_int(s: &str) -> Result> { 38 | match s.parse::() { 39 | Ok(val) => Ok(val), 40 | Err(_) => { 41 | Err(Box::from(format!("{} cannot be represented as a u64", s))) 42 | } 43 | } 44 | } 45 | lexer: | 46 | %% 47 | [0-9]+ "INT" 48 | \+ "+" 49 | \* "*" 50 | \( "(" 51 | \) ")" 52 | [\t ]+ ; 53 | . "UNMATCHED" 54 | 55 | -------------------------------------------------------------------------------- /lrpar/examples/clone_param/src/main.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::unnecessary_wraps)] 2 | 3 | use lrlex::lrlex_mod; 4 | use lrpar::lrpar_mod; 5 | use std::io::{self, BufRead, Write}; 6 | use std::{cell::RefCell, rc::Rc}; 7 | 8 | // Using `lrlex_mod!` brings the lexer for `param.l` into scope. By default the module name will be 9 | // `param_l` (i.e. the file name, minus any extensions, with a suffix of `_l`). 10 | lrlex_mod!("param.l"); 11 | // Using `lrpar_mod!` brings the parser for `param.y` into scope. By default the module name will be 12 | // `param_y` (i.e. the file name, minus any extensions, with a suffix of `_y`). 13 | lrpar_mod!("param.y"); 14 | 15 | fn main() { 16 | // Get the `LexerDef` for the `param` language. 17 | let lexerdef = param_l::lexerdef(); 18 | let stdin = io::stdin(); 19 | loop { 20 | print!(">>> "); 21 | io::stdout().flush().ok(); 22 | match stdin.lock().lines().next() { 23 | Some(Ok(ref l)) => { 24 | if l.trim().is_empty() { 25 | continue; 26 | } 27 | // Now we create a lexer with the `lexer` method with which we can lex an input. 28 | let lexer = lexerdef.lexer(l); 29 | let param = Rc::new(RefCell::new(0)); 30 | // Pass the lexer to the parser and lex and parse the input. 31 | let (_opt, errs) = param_y::parse(&lexer, param.clone()); 32 | for e in errs { 33 | println!("{}", e.pp(&lexer, ¶m_y::token_epp)); 34 | } 35 | println!("Evaluated: {:?}", ¶m); 36 | } 37 | _ => break, 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /lrpar/examples/calc_actions/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, BufRead, Write}; 2 | 3 | use lrlex::lrlex_mod; 4 | use lrpar::lrpar_mod; 5 | 6 | // Using `lrlex_mod!` brings the lexer for `calc.l` into scope. By default the module name will be 7 | // `calc_l` (i.e. 
the file name, minus any extensions, with a suffix of `_l`). 8 | lrlex_mod!("calc.l"); 9 | // Using `lrpar_mod!` brings the parser for `calc.y` into scope. By default the module name will be 10 | // `calc_y` (i.e. the file name, minus any extensions, with a suffix of `_y`). 11 | lrpar_mod!("calc.y"); 12 | 13 | fn main() { 14 | // Get the `LexerDef` for the `calc` language. 15 | let lexerdef = calc_l::lexerdef(); 16 | let stdin = io::stdin(); 17 | loop { 18 | print!(">>> "); 19 | io::stdout().flush().ok(); 20 | match stdin.lock().lines().next() { 21 | Some(Ok(ref l)) => { 22 | if l.trim().is_empty() { 23 | continue; 24 | } 25 | // Now we create a lexer with the `lexer` method with which we can lex an input. 26 | let lexer = lexerdef.lexer(l); 27 | // Pass the lexer to the parser and lex and parse the input. 28 | let (res, errs) = calc_y::parse(&lexer); 29 | for e in errs { 30 | println!("{}", e.pp(&lexer, &calc_y::token_epp)); 31 | } 32 | match res { 33 | Some(Ok(r)) => println!("Result: {}", r), 34 | Some(Err(e)) => eprintln!("{}", e), 35 | _ => eprintln!("Unable to evaluate expression."), 36 | } 37 | } 38 | _ => break, 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /lrpar/cttests_macro/src/lib.rs: -------------------------------------------------------------------------------- 1 | extern crate proc_macro; 2 | use glob::glob; 3 | use proc_macro::TokenStream; 4 | use proc_macro2::Span; 5 | use quote::quote; 6 | use syn::{Ident, LitStr, parse_macro_input}; 7 | #[proc_macro] 8 | pub fn generate_codegen_fail_tests(item: TokenStream) -> TokenStream { 9 | let mut out = Vec::new(); 10 | let test_glob_str: LitStr = parse_macro_input!(item); 11 | // Not env!("CARGO_MANIFEST_DIR"), which would be relative to the cttests_macro crate. 12 | // An absolute path which may contain non-utf8 characters. 13 | let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap(); 14 | let cwd = std::env::current_dir().unwrap(); 15 | // We want a relative path to the glob from the working directory 16 | // such as: lrpar/cttests/ with any potentially non-utf8 leading characters removed. 17 | let manifest_dir = std::path::Path::new(&manifest_dir) 18 | .strip_prefix(cwd) 19 | .unwrap(); 20 | let test_glob_path = manifest_dir.join(test_glob_str.value()); 21 | let test_glob_str = test_glob_path.into_os_string().into_string().unwrap(); 22 | let test_files = glob(&test_glob_str).unwrap(); 23 | for file in test_files { 24 | let file = file.unwrap(); 25 | // Remove potentially non-utf8 leading characters again. 26 | // This time relative to the manifest dir e.g. `src/ctfails/foo.test` 27 | let file = file.as_path().strip_prefix(manifest_dir).unwrap(); 28 | // Need to convert to string, because `PathBuf` lacks 29 | // an impl for `ToTokens` a bounds given by `quote!`. 30 | let path = file.display().to_string(); 31 | let stem = file.file_stem().unwrap().to_string_lossy(); 32 | let ident = Ident::new(&format!("codegen_fail_{}", stem), Span::call_site()); 33 | out.push(quote! 
{ 34 | #[should_panic] 35 | #[test] 36 | fn #ident(){ 37 | run_test_path(#path).unwrap(); 38 | } 39 | }); 40 | } 41 | out.into_iter().collect::().into() 42 | } 43 | -------------------------------------------------------------------------------- /lrpar/src/lib/test_utils.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::len_without_is_empty)] 2 | #![allow(unused)] 3 | 4 | use std::{error::Error, fmt, hash::Hash}; 5 | 6 | use cfgrammar::Span; 7 | 8 | use crate::{LexError, Lexeme, LexerTypes}; 9 | 10 | type StorageT = u16; 11 | 12 | #[derive(Debug, Clone)] 13 | pub(crate) struct TestLexerTypes(); 14 | 15 | impl LexerTypes for TestLexerTypes { 16 | type LexemeT = TestLexeme; 17 | type StorageT = u16; 18 | type LexErrorT = TestLexError; 19 | } 20 | 21 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] 22 | pub(crate) struct TestLexeme { 23 | start: usize, 24 | len: usize, 25 | faulty: bool, 26 | tok_id: u16, 27 | } 28 | 29 | impl Lexeme for TestLexeme { 30 | fn new(tok_id: StorageT, start: usize, len: usize) -> Self { 31 | TestLexeme { 32 | start, 33 | len, 34 | faulty: false, 35 | tok_id, 36 | } 37 | } 38 | 39 | fn new_faulty(tok_id: StorageT, start: usize, len: usize) -> Self { 40 | TestLexeme { 41 | start, 42 | len, 43 | faulty: true, 44 | tok_id, 45 | } 46 | } 47 | 48 | fn tok_id(&self) -> StorageT { 49 | self.tok_id 50 | } 51 | 52 | fn span(&self) -> Span { 53 | Span::new(self.start, self.start + self.len) 54 | } 55 | 56 | fn faulty(&self) -> bool { 57 | self.faulty 58 | } 59 | } 60 | 61 | impl fmt::Display for TestLexeme { 62 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 63 | write!( 64 | f, 65 | "TestLexeme[{}..{}]", 66 | self.span().start(), 67 | self.span().end() 68 | ) 69 | } 70 | } 71 | 72 | impl Error for TestLexeme {} 73 | 74 | #[derive(Debug)] 75 | pub(crate) struct TestLexError {} 76 | 77 | impl LexError for TestLexError { 78 | fn span(&self) -> Span { 79 | unreachable!() 80 | } 81 | } 82 | 83 | impl Error for TestLexError {} 84 | 85 | impl fmt::Display for TestLexError { 86 | fn fmt(&self, _: &mut fmt::Formatter) -> fmt::Result { 87 | unreachable!(); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /lrlex/src/lib/defaults.rs: -------------------------------------------------------------------------------- 1 | use std::{cmp, error::Error, fmt, hash::Hash, marker}; 2 | 3 | use cfgrammar::Span; 4 | use lrpar::{Lexeme, LexerTypes}; 5 | use num_traits::{AsPrimitive, PrimInt, Unsigned}; 6 | 7 | use crate::LRLexError; 8 | 9 | /// lrlex's standard [LexerTypes] `struct`, provided as a convenience. 10 | #[derive(Debug, Clone)] 11 | pub struct DefaultLexerTypes 12 | where 13 | T: 'static + fmt::Debug + Hash + PrimInt + Unsigned, 14 | usize: AsPrimitive, 15 | { 16 | phantom: std::marker::PhantomData, 17 | } 18 | 19 | impl LexerTypes for DefaultLexerTypes 20 | where 21 | usize: AsPrimitive, 22 | T: 'static + fmt::Debug + Hash + PrimInt + Unsigned, 23 | { 24 | type LexemeT = DefaultLexeme; 25 | type StorageT = T; 26 | type LexErrorT = LRLexError; 27 | } 28 | 29 | /// lrlex's standard lexeme struct, provided as a convenience. 
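// A usage sketch, based on the `Lexeme` impl below: `DefaultLexeme::new(3u16, 10, 4)`
// has `tok_id() == 3`, a `span()` covering bytes 10..14 and `faulty() == false`,
// while `new_faulty` builds a lexeme with `faulty() == true` (the kind lrpar's
// error recovery inserts when repairing input).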
30 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] 31 | pub struct DefaultLexeme { 32 | start: usize, 33 | len: usize, 34 | faulty: bool, 35 | tok_id: StorageT, 36 | } 37 | 38 | impl Lexeme for DefaultLexeme { 39 | fn new(tok_id: StorageT, start: usize, len: usize) -> Self { 40 | DefaultLexeme { 41 | start, 42 | len, 43 | faulty: false, 44 | tok_id, 45 | } 46 | } 47 | 48 | fn new_faulty(tok_id: StorageT, start: usize, len: usize) -> Self { 49 | DefaultLexeme { 50 | start, 51 | len, 52 | faulty: true, 53 | tok_id, 54 | } 55 | } 56 | 57 | fn tok_id(&self) -> StorageT { 58 | self.tok_id 59 | } 60 | 61 | fn span(&self) -> Span { 62 | Span::new(self.start, self.start + self.len) 63 | } 64 | 65 | fn faulty(&self) -> bool { 66 | self.faulty 67 | } 68 | } 69 | 70 | impl fmt::Display 71 | for DefaultLexeme 72 | { 73 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 74 | write!( 75 | f, 76 | "DefaultLexeme[{}..{}]", 77 | self.span().start(), 78 | self.span().end() 79 | ) 80 | } 81 | } 82 | 83 | impl Error 84 | for DefaultLexeme 85 | { 86 | } 87 | -------------------------------------------------------------------------------- /cfgrammar/src/lib/idxnewtype.rs: -------------------------------------------------------------------------------- 1 | // This macro generates a struct which exposes a u32 API (but which may, internally, use a smaller 2 | // storage size). 3 | 4 | use std::mem::size_of; 5 | 6 | #[cfg(feature = "bincode")] 7 | use bincode::{Decode, Encode}; 8 | use num_traits::{PrimInt, Unsigned}; 9 | #[cfg(feature = "serde")] 10 | use serde::{Deserialize, Serialize}; 11 | 12 | macro_rules! IdxNewtype { 13 | ($(#[$attr:meta])* $n: ident) => { 14 | $(#[$attr])* 15 | #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] 16 | #[cfg_attr(feature="serde", derive(Serialize, Deserialize))] 17 | #[cfg_attr(feature="bincode", derive(Encode, Decode))] 18 | pub struct $n(pub T); 19 | 20 | impl From<$n> for usize { 21 | fn from($n(st): $n) -> Self { 22 | debug_assert!(size_of::() >= size_of::()); 23 | num_traits::cast(st).unwrap() 24 | } 25 | } 26 | 27 | impl From<$n> for u32 { 28 | fn from($n(st): $n) -> Self { 29 | debug_assert!(size_of::() >= size_of::()); 30 | num_traits::cast(st).unwrap() 31 | } 32 | } 33 | 34 | impl $n { 35 | pub fn as_storaget(&self) -> T { 36 | let $n(st) = self; 37 | *st 38 | } 39 | } 40 | } 41 | } 42 | 43 | IdxNewtype!( 44 | /// A type specifically for rule indices. 45 | /// 46 | /// It is guaranteed that `RIdx` can be converted, without loss of precision, to `usize` with 47 | /// the idiom `usize::from(...)`. 48 | RIdx 49 | ); 50 | IdxNewtype!( 51 | /// A type specifically for production indices (e.g. a rule `E::=A|B` would 52 | /// have two productions for the single rule `E`). 53 | /// 54 | /// It is guaranteed that `PIdx` can be converted, without loss of precision, to `usize` with 55 | /// the idiom `usize::from(...)`. 56 | PIdx 57 | ); 58 | IdxNewtype!( 59 | /// A type specifically for symbol indices (within a production). 60 | /// 61 | /// It is guaranteed that `SIdx` can be converted, without loss of precision, to `usize` with 62 | /// the idiom `usize::from(...)`. 63 | SIdx 64 | ); 65 | IdxNewtype!( 66 | /// A type specifically for token indices. 67 | /// 68 | /// It is guaranteed that `TIdx` can be converted, without loss of precision, to `usize` with 69 | /// the idiom `usize::from(...)`. 
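// A worked example of that idiom (a sketch): with `let tidx = TIdx(5u8);`,
// both `usize::from(tidx)` and `u32::from(tidx)` convert losslessly via the
// `From` impls generated above, and `tidx.as_storaget()` returns the raw `5u8`.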
70 | TIdx 71 | ); 72 | -------------------------------------------------------------------------------- /lrtable/src/lib/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::cognitive_complexity)] 2 | #![allow(clippy::too_many_arguments)] 3 | #![allow(clippy::type_complexity)] 4 | #![forbid(unsafe_code)] 5 | #![deny(unreachable_pub)] 6 | 7 | use std::{hash::Hash, mem::size_of}; 8 | 9 | #[cfg(feature = "bincode")] 10 | use bincode::{Decode, Encode}; 11 | use num_traits::{AsPrimitive, PrimInt, Unsigned}; 12 | #[cfg(feature = "serde")] 13 | use serde::{Deserialize, Serialize}; 14 | 15 | mod itemset; 16 | mod pager; 17 | mod stategraph; 18 | pub mod statetable; 19 | 20 | pub use crate::{ 21 | stategraph::StateGraph, 22 | statetable::{Action, StateTable, StateTableError, StateTableErrorKind}, 23 | }; 24 | use cfgrammar::yacc::YaccGrammar; 25 | 26 | macro_rules! IdxNewtype { 27 | ($(#[$attr:meta])* $n: ident) => { 28 | $(#[$attr])* 29 | #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] 30 | #[cfg_attr(feature="serde", derive(Serialize, Deserialize))] 31 | #[cfg_attr(feature="bincode", derive(Encode, Decode))] 32 | pub struct $n(pub T); 33 | 34 | impl From<$n> for usize { 35 | fn from($n(st): $n) -> Self { 36 | debug_assert!(size_of::() >= size_of::()); 37 | num_traits::cast(st).unwrap() 38 | } 39 | } 40 | 41 | impl From<$n> for u32 { 42 | fn from($n(st): $n) -> Self { 43 | debug_assert!(size_of::() >= size_of::()); 44 | num_traits::cast(st).unwrap() 45 | } 46 | } 47 | 48 | impl $n { 49 | pub fn as_storaget(&self) -> T { 50 | let $n(st) = self; 51 | *st 52 | } 53 | } 54 | } 55 | } 56 | 57 | IdxNewtype!( 58 | /// A type specifically for state table indices. 59 | /// 60 | /// It is guaranteed that `StIdx` can be converted, without loss of precision, to `usize` with 61 | /// the idiom `usize::from(...)`. 62 | StIdx 63 | ); 64 | 65 | #[derive(Clone, Copy)] 66 | pub enum Minimiser { 67 | Pager, 68 | } 69 | 70 | pub fn from_yacc( 71 | grm: &YaccGrammar, 72 | m: Minimiser, 73 | ) -> Result<(StateGraph, StateTable), StateTableError> 74 | where 75 | usize: AsPrimitive, 76 | { 77 | match m { 78 | Minimiser::Pager => { 79 | let sg = pager::pager_stategraph(grm); 80 | let st = StateTable::new(grm, &sg)?; 81 | Ok((sg, st)) 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /doc/src/lexextensions.md: -------------------------------------------------------------------------------- 1 | # Lex extensions 2 | 3 | Flags can be specified at compile time through `LexFlags` or at `.l` file parse time using 4 | a `%grmtools{ }` section. At compile time these flags can be enabled using 5 | [`CTLexerBuilder`](https://docs.rs/lrlex/latest/lrlex/struct.CTLexerBuilder.html) methods. 6 | 7 | Flags commonly affect the parsing of the lex file, the interpretation regular expressions, 8 | and set limits. 9 | 10 | Boolean flags are specified by their name, and can be negated by prefixing with `!` 11 | other flags should specify their value immediately after the flag name. 12 | 13 | 14 | ## Example 15 | 16 | ``` 17 | %grmtools { 18 | allow_wholeline_comments, 19 | !octal, 20 | size_limit: 1024, 21 | } 22 | %% 23 | . 
"rule" 24 | ``` 25 | 26 | 27 | ## List of flags: 28 | 29 | | Flag | Value | Required | Regex[^regex] | 30 | |-------------------------------|-----------|----------|---------------| 31 | | `lexerkind` | [LexerKind](lexcompatibility.md#lexerkinds) | ✗ | ✗ | 32 | | `posix_escapes`[^†] | bool | ✗ | ✗ | 33 | | `allow_wholeline_comment`[^‡] | bool | ✗ | ✗ | 34 | | `case_insensitive` | bool | ✗ | ✓ | 35 | | `dot_matches_new_line` | bool | ✗ | ✓ | 36 | | `multi_line` | bool | ✗ | ✓ | 37 | | `octal` | bool | ✗ | ✓ | 38 | | `swap_greed` | bool | ✗ | ✓ | 39 | | `ignore_whitespace` | bool | ✗ | ✓ | 40 | | `unicode` | bool | ✗ | ✓ | 41 | | `size_limit` | usize | ✗ | ✓ | 42 | | `dfa_size_limit` | usize | ✗ | ✓ | 43 | | `nest_limit` | u32 | ✗ | ✓ | 44 | 45 | [^†]: Enable compatibility with posix escape sequences. 46 | [^‡]: Enables rust style `// comments` at the start of lines. 47 | Which requires escaping of `/` when used in a regex. 48 | [^regex]: ✓ Flag gets passed directly to `regex::RegexBuilder`. 49 | 50 | 51 | ## Flags affecting Posix compatibility 52 | 53 | As discussed in [Lex compatibility](lexcompatibility.md) the default behaviors of grmtools and rust's regex 54 | library have differed from that of posix lex. 55 | 56 | The following flags can change the behavior to match posix lex more closely. 57 | 58 | ``` 59 | %grmtools { 60 | !dot_matches_new_line, 61 | posix_escapes 62 | } 63 | %% 64 | ... 65 | ``` 66 | -------------------------------------------------------------------------------- /cfgrammar/src/lib/yacc/mod.rs: -------------------------------------------------------------------------------- 1 | #![deny(unreachable_pub)] 2 | 3 | pub mod ast; 4 | pub mod firsts; 5 | pub mod follows; 6 | pub mod grammar; 7 | pub mod parser; 8 | 9 | pub use self::{ 10 | grammar::{AssocKind, Precedence, SentenceGenerator, YaccGrammar}, 11 | parser::{YaccGrammarError, YaccGrammarErrorKind, YaccGrammarWarning, YaccGrammarWarningKind}, 12 | }; 13 | use proc_macro2::TokenStream; 14 | use quote::quote; 15 | 16 | #[cfg(feature = "serde")] 17 | use serde::{Deserialize, Serialize}; 18 | 19 | /// The particular Yacc variant this grammar makes use of. 20 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] 21 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 22 | #[non_exhaustive] 23 | pub enum YaccKind { 24 | /// The original Yacc style as documented by 25 | /// [Johnson](http://dinosaur.compilertools.net/yacc/index.html), 26 | Original(YaccOriginalActionKind), 27 | /// Similar to the original Yacc style, but allowing individual rules' actions to have their 28 | /// own return type. 29 | Grmtools, 30 | /// The variant used in the [Eco language composition editor](http://soft-dev.org/src/eco/) 31 | Eco, 32 | } 33 | 34 | impl quote::ToTokens for YaccKind { 35 | fn to_tokens(&self, tokens: &mut TokenStream) { 36 | tokens.extend(match *self { 37 | YaccKind::Grmtools => quote!(::cfgrammar::yacc::YaccKind::Grmtools), 38 | YaccKind::Original(action_kind) => { 39 | quote!(::cfgrammar::yacc::YaccKind::Original(#action_kind)) 40 | } 41 | YaccKind::Eco => quote!(::cfgrammar::yacc::YaccKind::Eco), 42 | }) 43 | } 44 | } 45 | 46 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] 47 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 48 | pub enum YaccOriginalActionKind { 49 | /// Execute user-specified actions attached to each production; also requires a %actiontype 50 | /// declaration. 51 | UserAction, 52 | /// Automatically create a parse tree instead of user-specified actions. 
53 | GenericParseTree, 54 | /// Do not do execute actions of any sort. 55 | NoAction, 56 | } 57 | 58 | impl quote::ToTokens for YaccOriginalActionKind { 59 | fn to_tokens(&self, tokens: &mut TokenStream) { 60 | tokens.extend(match *self { 61 | YaccOriginalActionKind::UserAction => { 62 | quote!(::cfgrammar::yacc::YaccOriginalActionKind::UserAction) 63 | } 64 | YaccOriginalActionKind::GenericParseTree => { 65 | quote!(::cfgrammar::yacc::YaccOriginalActionKind::GenericParseTree) 66 | } 67 | YaccOriginalActionKind::NoAction => { 68 | quote!(::cfgrammar::yacc::YaccOriginalActionKind::NoAction) 69 | } 70 | }) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /cfgrammar/src/lib/span.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "bincode")] 2 | use bincode::{Decode, Encode}; 3 | use proc_macro2::TokenStream; 4 | use quote::{ToTokens, TokenStreamExt, quote}; 5 | #[cfg(feature = "serde")] 6 | use serde::{Deserialize, Serialize}; 7 | 8 | /// A `Span` records what portion of the user's input something (e.g. a lexeme or production) 9 | /// references (i.e. the `Span` doesn't hold a reference / copy of the actual input). 10 | #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] 11 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 12 | #[cfg_attr(feature = "bincode", derive(Encode, Decode))] 13 | pub struct Span { 14 | start: usize, 15 | end: usize, 16 | } 17 | 18 | impl Span { 19 | /// Create a new span starting at byte `start` and ending at byte `end`. 20 | /// 21 | /// # Panics 22 | /// 23 | /// If `end` is less than `start`. 24 | pub fn new(start: usize, end: usize) -> Self { 25 | if end < start { 26 | panic!("Span starts ({}) after it ends ({})!", start, end); 27 | } 28 | Span { start, end } 29 | } 30 | 31 | /// Byte offset of the start of the span. 32 | pub fn start(&self) -> usize { 33 | self.start 34 | } 35 | 36 | /// Byte offset of the end of the span. 37 | pub fn end(&self) -> usize { 38 | self.end 39 | } 40 | 41 | /// Length in bytes of the span. 42 | pub fn len(&self) -> usize { 43 | self.end - self.start 44 | } 45 | 46 | /// Returns `true` if this `Span` covers 0 bytes, or `false` otherwise. 47 | pub fn is_empty(&self) -> bool { 48 | self.len() == 0 49 | } 50 | } 51 | 52 | /// Implemented for errors and warnings to provide access to their spans. 53 | pub trait Spanned: std::fmt::Display { 54 | /// Returns the spans associated with the error, always containing at least 1 span. 55 | /// 56 | /// Refer to [SpansKind](crate::yacc::parser::SpansKind) via [spanskind](Self::spanskind) 57 | /// for the meaning and interpretation of spans and their ordering. 58 | fn spans(&self) -> &[Span]; 59 | /// Returns the `SpansKind` associated with this error. 60 | fn spanskind(&self) -> crate::yacc::parser::SpansKind; 61 | } 62 | 63 | impl ToTokens for Span { 64 | fn to_tokens(&self, tokens: &mut TokenStream) { 65 | let Span { start, end } = self; 66 | tokens.append_all(quote! {::cfgrammar::Span::new(#start, #end)}); 67 | } 68 | } 69 | 70 | /// A possibly inexact location which could either be a `Span`, 71 | /// a command-line option, or some other location described textually. 
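// A usage sketch: `Location::from(Span::new(0, 3))` produces `Location::Span(..)`
// via the `From` impl below, whereas an error arising from e.g. a command-line
// flag rather than from input text can be reported as `Location::CommandLine`.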
72 | #[derive(Clone, Debug, Eq, PartialEq)] 73 | pub enum Location { 74 | Span(Span), 75 | CommandLine, 76 | Other(String), 77 | } 78 | 79 | impl From for Location { 80 | fn from(span: Span) -> Location { 81 | Location::Span(span) 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /lrpar/cttests/src/calc_wasm.rs: -------------------------------------------------------------------------------- 1 | #[cfg(wasm32_unknown)] 2 | use wasm_bindgen::prelude::*; 3 | 4 | use lrlex::lrlex_mod; 5 | use lrpar::lrpar_mod; 6 | 7 | // Using `lrlex_mod!` brings the lexer for `calc.l` into scope. By default the module name will be 8 | // `calc_l` (i.e. the file name, minus any extensions, with a suffix of `_l`). 9 | lrlex_mod!("calc_wasm.l"); 10 | // Using `lrpar_mod!` brings the parser for `calc.y` into scope. By default the module name will be 11 | // `calc_y` (i.e. the file name, minus any extensions, with a suffix of `_y`). 12 | lrpar_mod!("calc_wasm.y"); 13 | 14 | #[cfg_attr(wasm32_unknown, wasm_bindgen)] 15 | #[allow(unused)] 16 | pub fn calculate(l: &str) -> Result { 17 | // Get the `LexerDef` for the `calc` language. 18 | let lexerdef = calc_wasm_l::lexerdef(); 19 | if l.trim().is_empty() { 20 | return Err("input is empty".to_string()); 21 | } 22 | // Now we create a lexer with the `lexer` method with which we can lex an input. 23 | let lexer = lexerdef.lexer(l); 24 | // Pass the lexer to the parser and lex and parse the input. 25 | let (res, errs) = calc_wasm_y::parse(&lexer); 26 | if !errs.is_empty() { 27 | let mut ret = String::new(); 28 | for e in errs { 29 | use lrpar::LexParseError; 30 | match e { 31 | LexParseError::ParseError(e) => { 32 | let repairs_flag = !e.repairs().is_empty(); 33 | ret.push_str(&format!("Error: {}\n Repairs: {}", e, repairs_flag)); 34 | } 35 | e => ret.push_str(&format!("{}\n", e)), 36 | }; 37 | } 38 | if let Some(Err(e)) = res { 39 | ret.push_str(&format!("{}\n", e)); 40 | } 41 | return Err(ret); 42 | } 43 | match res { 44 | Some(Ok(r)) => Ok(r), 45 | Some(Err(e)) => Err(e.to_string()), 46 | None => Err("Unable to parse".to_string()), 47 | } 48 | } 49 | 50 | #[cfg(test)] 51 | mod test { 52 | use super::calculate; 53 | #[cfg(wasm32_unknown)] 54 | use wasm_bindgen_test::*; 55 | 56 | #[cfg_attr(wasm32_unknown, wasm_bindgen_test)] 57 | #[test] 58 | fn test_calc_14() { 59 | assert_eq!(calculate("2 + 3 * 4").unwrap(), 14); 60 | } 61 | 62 | #[cfg_attr(wasm32_unknown, wasm_bindgen_test)] 63 | #[test] 64 | fn test_lex_error() { 65 | assert!(calculate("#1 + #2").is_err()); 66 | } 67 | 68 | #[cfg_attr(wasm32_unknown, wasm_bindgen_test)] 69 | #[test] 70 | fn test_recovery() { 71 | // We really want to test this recovery path, since it contains 72 | // calls to `Instant::now()` which panics on `std` 73 | // Thus we need to check that the `web_time` crate is working. 74 | let x = calculate("1+"); 75 | match x { 76 | Err(e) => assert!(e.contains("Repairs: true")), 77 | Ok(e) => panic!("unexpectedly parsed {}", e), 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /lrpar/examples/calc_ast/src/main.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::unnecessary_wraps)] 2 | 3 | use std::io::{self, BufRead, Write}; 4 | 5 | use cfgrammar::Span; 6 | use lrlex::{DefaultLexerTypes, lrlex_mod}; 7 | use lrpar::{NonStreamingLexer, lrpar_mod}; 8 | 9 | // Using `lrlex_mod!` brings the lexer for `calc.l` into scope. 
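// (Compare `calc_wasm.rs` above, where `lrlex_mod!("calc_wasm.l")` brings a
// module named `calc_wasm_l` into scope; the naming convention is described
// next.)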
By default the module name will be 10 | // `calc_l` (i.e. the file name, minus any extensions, with a suffix of `_l`). 11 | lrlex_mod!("calc.l"); 12 | // Using `lrpar_mod!` brings the parser for `calc.y` into scope. By default the module name will be 13 | // `calc_y` (i.e. the file name, minus any extensions, with a suffix of `_y`). 14 | lrpar_mod!("calc.y"); 15 | 16 | use calc_y::Expr; 17 | 18 | fn main() { 19 | // Get the `LexerDef` for the `calc` language. 20 | let lexerdef = calc_l::lexerdef(); 21 | let stdin = io::stdin(); 22 | loop { 23 | print!(">>> "); 24 | io::stdout().flush().ok(); 25 | match stdin.lock().lines().next() { 26 | Some(Ok(ref l)) => { 27 | if l.trim().is_empty() { 28 | continue; 29 | } 30 | // Now we create a lexer with the `lexer` method with which we can lex an input. 31 | let lexer = lexerdef.lexer(l); 32 | // Pass the lexer to the parser and lex and parse the input. 33 | let (res, errs) = calc_y::parse(&lexer); 34 | for e in errs { 35 | println!("{}", e.pp(&lexer, &calc_y::token_epp)); 36 | } 37 | if let Some(Ok(r)) = res { 38 | match eval(&lexer, r) { 39 | Ok(i) => println!("Result: {}", i), 40 | Err((span, msg)) => { 41 | let ((line, col), _) = lexer.line_col(span); 42 | eprintln!( 43 | "Evaluation error at line {} column {}, '{}' {}.", 44 | line, 45 | col, 46 | lexer.span_str(span), 47 | msg 48 | ) 49 | } 50 | } 51 | } 52 | } 53 | _ => break, 54 | } 55 | } 56 | } 57 | 58 | fn eval( 59 | lexer: &dyn NonStreamingLexer>, 60 | e: Expr, 61 | ) -> Result { 62 | match e { 63 | Expr::Add { span, lhs, rhs } => eval(lexer, *lhs)? 64 | .checked_add(eval(lexer, *rhs)?) 65 | .ok_or((span, "overflowed")), 66 | Expr::Mul { span, lhs, rhs } => eval(lexer, *lhs)? 67 | .checked_mul(eval(lexer, *rhs)?) 68 | .ok_or((span, "overflowed")), 69 | Expr::Number { span } => lexer 70 | .span_str(span) 71 | .parse::() 72 | .map_err(|_| (span, "cannot be represented as a u64")), 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /lrpar/examples/calc_ast_arena/src/main.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::unnecessary_wraps)] 2 | 3 | use std::io::{self, BufRead, Write}; 4 | 5 | use cfgrammar::Span; 6 | use lrlex::{DefaultLexerTypes, lrlex_mod}; 7 | use lrpar::{NonStreamingLexer, lrpar_mod}; 8 | 9 | // Using `lrlex_mod!` brings the lexer for `calc.l` into scope. By default the module name will be 10 | // `calc_l` (i.e. the file name, minus any extensions, with a suffix of `_l`). 11 | lrlex_mod!("calc.l"); 12 | // Using `lrpar_mod!` brings the parser for `calc.y` into scope. By default the module name will be 13 | // `calc_y` (i.e. the file name, minus any extensions, with a suffix of `_y`). 14 | lrpar_mod!("calc.y"); 15 | 16 | use calc_y::Expr; 17 | 18 | fn main() { 19 | // Get the `LexerDef` for the `calc` language. 20 | let lexerdef = calc_l::lexerdef(); 21 | let stdin = io::stdin(); 22 | loop { 23 | print!(">>> "); 24 | io::stdout().flush().ok(); 25 | match stdin.lock().lines().next() { 26 | Some(Ok(ref l)) => { 27 | if l.trim().is_empty() { 28 | continue; 29 | } 30 | // Now we create a lexer with the `lexer` method with which we can lex an input. 31 | let lexer = lexerdef.lexer(l); 32 | let arena = bumpalo::Bump::new(); 33 | // Pass the lexer to the parser and lex and parse the input. 
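// The extra `&arena` argument below comes from the grammar's
// `%parse-param arena: &'ast Bump` declaration in `calc.y` above: the
// parse-param is threaded through to every action, letting them allocate
// `Expr` nodes in the arena instead of boxing them.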
34 | let (res, errs) = calc_y::parse(&lexer, &arena); 35 | for e in errs { 36 | println!("{}", e.pp(&lexer, &calc_y::token_epp)); 37 | } 38 | if let Some(Ok(r)) = res { 39 | match eval(&lexer, &r) { 40 | Ok(i) => println!("Result: {}", i), 41 | Err((span, msg)) => { 42 | let ((line, col), _) = lexer.line_col(span); 43 | eprintln!( 44 | "Evaluation error at line {} column {}, '{}' {}.", 45 | line, 46 | col, 47 | lexer.span_str(span), 48 | msg 49 | ) 50 | } 51 | } 52 | } 53 | } 54 | _ => break, 55 | } 56 | } 57 | } 58 | 59 | fn eval( 60 | lexer: &dyn NonStreamingLexer>, 61 | e: &Expr, 62 | ) -> Result { 63 | match e { 64 | Expr::Add { span, lhs, rhs } => eval(lexer, lhs)? 65 | .checked_add(eval(lexer, rhs)?) 66 | .ok_or((*span, "overflowed")), 67 | Expr::Mul { span, lhs, rhs } => eval(lexer, lhs)? 68 | .checked_mul(eval(lexer, rhs)?) 69 | .ok_or((*span, "overflowed")), 70 | Expr::Number { span } => lexer 71 | .span_str(*span) 72 | .parse::() 73 | .map_err(|_| (*span, "cannot be represented as a u64")), 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /lrpar/src/lib/dijkstra.rs: -------------------------------------------------------------------------------- 1 | use std::{fmt::Debug, hash::Hash}; 2 | 3 | use indexmap::{ 4 | indexmap, 5 | map::{Entry, IndexMap}, 6 | }; 7 | 8 | /// Starting at `start_node`, return, in arbitrary order, all least-cost success nodes. 9 | /// 10 | /// * `neighbours` takes a node `n` and returns an iterator consisting of all `n`'s neighbouring 11 | /// nodes. 12 | /// * `success` takes a node `n` and returns `true` if it is a success node or `false` otherwise. 13 | /// 14 | /// The name of this function isn't entirely accurate: this isn't Dijkstra's original algorithm or 15 | /// one of its well-known variants. However, unlike the astar_all function it doesn't expect a 16 | /// heuristic and it also filters out some duplicates. 17 | pub(crate) fn dijkstra( 18 | start_node: N, 19 | neighbours: FN, 20 | merge: FM, 21 | success: FS, 22 | ) -> Vec 23 | where 24 | N: Debug + Clone + Hash + Eq + PartialEq, 25 | FN: Fn(bool, &N, &mut Vec<(u16, N)>) -> bool, 26 | FM: Fn(&mut N, N), 27 | FS: Fn(&N) -> bool, 28 | { 29 | let mut scs_nodes = Vec::new(); 30 | let mut todo: Vec> = vec![indexmap![start_node.clone() => start_node]]; 31 | let mut c: u16 = 0; 32 | let mut next = Vec::new(); 33 | loop { 34 | if todo[usize::from(c)].is_empty() { 35 | c = c.checked_add(1).unwrap(); 36 | if usize::from(c) == todo.len() { 37 | return Vec::new(); 38 | } 39 | continue; 40 | } 41 | 42 | let (_, n) = todo[usize::from(c)].pop().unwrap(); 43 | if success(&n) { 44 | scs_nodes.push(n); 45 | break; 46 | } 47 | 48 | if !neighbours(true, &n, &mut next) { 49 | return Vec::new(); 50 | } 51 | for (nbr_cost, nbr) in next.drain(..) { 52 | let off = usize::from(nbr_cost); 53 | todo.resize(todo.len() + off + 1, IndexMap::new()); 54 | match todo[off].entry(nbr.clone()) { 55 | Entry::Vacant(e) => { 56 | e.insert(nbr); 57 | } 58 | Entry::Occupied(mut e) => { 59 | merge(e.get_mut(), nbr); 60 | } 61 | } 62 | } 63 | } 64 | 65 | let mut scs_todo = todo 66 | .drain(usize::from(c)..usize::from(c) + 1) 67 | .next() 68 | .unwrap(); 69 | while let Some((_, n)) = scs_todo.pop() { 70 | if success(&n) { 71 | scs_nodes.push(n); 72 | continue; 73 | } 74 | if !neighbours(false, &n, &mut next) { 75 | return Vec::new(); 76 | } 77 | for (nbr_cost, nbr) in next.drain(..) 
{ 78 | if nbr_cost == c { 79 | match scs_todo.entry(nbr.clone()) { 80 | Entry::Vacant(e) => { 81 | e.insert(nbr); 82 | } 83 | Entry::Occupied(mut e) => { 84 | merge(e.get_mut(), nbr); 85 | } 86 | } 87 | } 88 | } 89 | } 90 | 91 | scs_nodes 92 | } 93 | -------------------------------------------------------------------------------- /lrpar/examples/start_states/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, BufRead, Write}; 2 | 3 | use cfgrammar::RIdx; 4 | use lrlex::{DefaultLexeme, lrlex_mod}; 5 | use lrpar::{Lexeme, lrpar_mod}; 6 | 7 | // Using `lrlex_mod!` brings the lexer for `comment.l` into scope. By default the module name will be 8 | // `comment_l` (i.e. the file name, minus any extensions, with a suffix of `_l`). 9 | lrlex_mod!("comment.l"); 10 | // Using `lrpar_mod!` brings the parser for `comment.y` into scope. By default the module name will be 11 | // `comment_y` (i.e. the file name, minus any extensions, with a suffix of `_y`). 12 | lrpar_mod!("comment.y"); 13 | 14 | use comment_y::Node; 15 | 16 | fn main() { 17 | // Get the `LexerDef` for the `comment` language. 18 | let lexerdef = comment_l::lexerdef(); 19 | let stdin = io::stdin(); 20 | loop { 21 | print!(">>> "); 22 | io::stdout().flush().ok(); 23 | match stdin.lock().lines().next() { 24 | Some(Ok(ref l)) => { 25 | if l.trim().is_empty() { 26 | continue; 27 | } 28 | // Now we create a lexer with the `lexer` method with which we can lex an input. 29 | let lexer = lexerdef.lexer(l); 30 | // Pass the lexer to the parser and lex and parse the input. 31 | let (pt, errs) = comment_y::parse(&lexer); 32 | for e in errs { 33 | println!("{}", e.pp(&lexer, &comment_y::token_epp)); 34 | } 35 | if let Some(pt) = pt { 36 | // Success! We parsed the input and created a parse tree. 37 | println!("Result: {}", Eval::new(l).eval(&pt)); 38 | } 39 | } 40 | _ => break, 41 | } 42 | } 43 | } 44 | 45 | struct Eval<'a> { 46 | s: &'a str, 47 | } 48 | 49 | impl<'a> Eval<'a> { 50 | fn new(s: &'a str) -> Self { 51 | Eval { s } 52 | } 53 | 54 | fn eval(&self, n: &Node<DefaultLexeme<u32>, u32>) -> String { 55 | match *n { 56 | Node::Nonterm { 57 | ridx: RIdx(ridx), 58 | ref nodes, 59 | } if ridx == comment_y::R_EXPR => { 60 | let mut s = String::new(); 61 | for node in nodes { 62 | s.push_str(&self.eval(node)); 63 | } 64 | s 65 | } 66 | Node::Nonterm { 67 | ridx: RIdx(ridx), 68 | ref nodes, 69 | } if ridx == comment_y::R_TEXT => { 70 | if nodes.len() == 1 { 71 | if let Node::Term { lexeme } = nodes[0] { 72 | self.s[lexeme.span().start()..lexeme.span().end()].to_string() 73 | } else { 74 | unreachable!(); 75 | } 76 | } else { 77 | let mut s = String::new(); 78 | for node in nodes { 79 | s.push_str(&self.eval(node)); 80 | } 81 | s 82 | } 83 | } 84 | _ => unreachable!(), 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /lrpar/examples/calc_parsetree/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, BufRead, Write}; 2 | 3 | use cfgrammar::RIdx; 4 | use lrlex::{DefaultLexeme, lrlex_mod}; 5 | use lrpar::{Lexeme, lrpar_mod}; 6 | 7 | // Using `lrlex_mod!` brings the lexer for `calc.l` into scope. By default the module name will be 8 | // `calc_l` (i.e. the file name, minus any extensions, with a suffix of `_l`). 9 | lrlex_mod!("calc.l"); 10 | // Using `lrpar_mod!` brings the parser for `calc.y` into scope. By default the module name will be 11 | // `calc_y` (i.e.
the file name, minus any extensions, with a suffix of `_y`). 12 | lrpar_mod!("calc.y"); 13 | 14 | use calc_y::Node; 15 | 16 | fn main() { 17 | // Get the `LexerDef` for the `calc` language. 18 | let lexerdef = calc_l::lexerdef(); 19 | let stdin = io::stdin(); 20 | loop { 21 | print!(">>> "); 22 | io::stdout().flush().ok(); 23 | match stdin.lock().lines().next() { 24 | Some(Ok(ref l)) => { 25 | if l.trim().is_empty() { 26 | continue; 27 | } 28 | // Now we create a lexer with the `lexer` method with which we can lex an input. 29 | let lexer = lexerdef.lexer(l); 30 | // Pass the lexer to the parser and lex and parse the input. 31 | let (pt, errs) = calc_y::parse(&lexer); 32 | for e in errs { 33 | println!("{}", e.pp(&lexer, &calc_y::token_epp)); 34 | } 35 | if let Some(pt) = pt { 36 | // Success! We parsed the input and created a parse tree. 37 | println!("Result: {}", Eval::new(l).eval(&pt)); 38 | } 39 | } 40 | _ => break, 41 | } 42 | } 43 | } 44 | 45 | struct Eval<'a> { 46 | s: &'a str, 47 | } 48 | 49 | impl<'a> Eval<'a> { 50 | fn new(s: &'a str) -> Self { 51 | Eval { s } 52 | } 53 | 54 | fn eval(&self, n: &Node<DefaultLexeme<u32>, u32>) -> i64 { 55 | match *n { 56 | Node::Nonterm { 57 | ridx: RIdx(ridx), 58 | ref nodes, 59 | } if ridx == calc_y::R_EXPR => { 60 | if nodes.len() == 1 { 61 | self.eval(&nodes[0]) 62 | } else { 63 | debug_assert_eq!(nodes.len(), 3); 64 | self.eval(&nodes[0]) + self.eval(&nodes[2]) 65 | } 66 | } 67 | Node::Nonterm { 68 | ridx: RIdx(ridx), 69 | ref nodes, 70 | } if ridx == calc_y::R_TERM => { 71 | if nodes.len() == 1 { 72 | self.eval(&nodes[0]) 73 | } else { 74 | debug_assert_eq!(nodes.len(), 3); 75 | self.eval(&nodes[0]) * self.eval(&nodes[2]) 76 | } 77 | } 78 | Node::Nonterm { 79 | ridx: RIdx(ridx), 80 | ref nodes, 81 | } if ridx == calc_y::R_FACTOR => { 82 | if nodes.len() == 1 { 83 | if let Node::Term { lexeme } = nodes[0] { 84 | self.s[lexeme.span().start()..lexeme.span().end()] 85 | .parse() 86 | .unwrap() 87 | } else { 88 | unreachable!(); 89 | } 90 | } else { 91 | debug_assert_eq!(nodes.len(), 3); 92 | self.eval(&nodes[1]) 93 | } 94 | } 95 | _ => unreachable!(), 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /cfgrammar/src/lib/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::cognitive_complexity)] 2 | #![allow(clippy::many_single_char_names)] 3 | #![allow(clippy::new_without_default)] 4 | #![allow(clippy::unnecessary_wraps)] 5 | #![allow(clippy::upper_case_acronyms)] 6 | #![forbid(unsafe_code)] 7 | #![deny(unreachable_pub)] 8 | 9 | //! A library for manipulating Context Free Grammars (CFG). It is impractical to fully homogenise 10 | //! all the types of grammars out there, so the aim is for different grammar types 11 | //! to have completely separate implementations. Code that wants to be generic over more than one 12 | //! grammar type can then use an "adapter" to homogenise the particular grammar types of interest. 13 | //! Currently this is a little academic, since only Yacc-style grammars are supported (albeit 14 | //! several variants of Yacc grammars). 15 | //! 16 | //! Unfortunately, CFG terminology is something of a mess. Some people use different terms for the 17 | //! same concept interchangeably; some use different terms to convey subtle differences of meaning 18 | //! (but without complete uniformity). "Token", "terminal", and "lexeme" are examples of this: they 19 | //! are synonyms in some tools and papers, but not in others. 20 | //!
21 | //! In order to make this library somewhat coherent, we therefore use some basic terminology 22 | //! guidelines for major concepts (acknowledging that this will cause clashes with some grammar 23 | //! types). 24 | //! 25 | //! * A *grammar* is an ordered sequence of *productions*. 26 | //! * A *production* is an ordered sequence of *symbols*. 27 | //! * A *rule* maps a name to one or more productions. 28 | //! * A *token* is the name of a syntactic element. 29 | //! 30 | //! For example, in the following Yacc grammar: 31 | //! 32 | //! R1: "a" "b" | R2; 33 | //! R2: "c"; 34 | //! 35 | //! the following statements are true: 36 | //! 37 | //! * There are 3 productions. 1: ["a", "b"] 2: ["R2"] 3: ["c"] 38 | //! * There are two rules: R1 and R2. The mapping to productions is {R1: {1, 2}, R2: {3}} 39 | //! * There are three tokens: a, b, and c. 40 | //! 41 | //! cfgrammar makes the following guarantees about grammars: 42 | //! 43 | //! * Productions are numbered from `0` to `prods_len() - 1` (inclusive). 44 | //! * Rules are numbered from `0` to `rules_len() - 1` (inclusive). 45 | //! * Tokens are numbered from `0` to `toks_len() - 1` (inclusive). 46 | //! * The StorageT type used to store productions, rules, and token indices can be infallibly 47 | //! converted into usize (see [`TIdx`](struct.TIdx.html) and friends for more details). 48 | //! 49 | //! For most current uses, the main function to investigate is 50 | //! [`YaccGrammar::new()`](yacc/grammar/struct.YaccGrammar.html#method.new) and/or 51 | //! [`YaccGrammar::new_with_storaget()`](yacc/grammar/struct.YaccGrammar.html#method.new_with_storaget) 52 | //! which take as input a Yacc grammar. 53 | 54 | #[cfg(feature = "bincode")] 55 | use bincode::{Decode, Encode}; 56 | #[cfg(feature = "serde")] 57 | use serde::{Deserialize, Serialize}; 58 | 59 | #[doc(hidden)] 60 | pub mod header; 61 | mod idxnewtype; 62 | #[doc(hidden)] 63 | pub mod markmap; 64 | pub mod newlinecache; 65 | pub mod span; 66 | pub mod yacc; 67 | 68 | pub use newlinecache::NewlineCache; 69 | pub use span::{Location, Span, Spanned}; 70 | 71 | /// A type specifically for rule indices. 72 | pub use crate::idxnewtype::{PIdx, RIdx, SIdx, TIdx}; 73 | 74 | #[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)] 75 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 76 | #[cfg_attr(feature = "bincode", derive(Encode, Decode))] 77 | pub enum Symbol<StorageT> { 78 | Rule(RIdx<StorageT>), 79 | Token(TIdx<StorageT>), 80 | } 81 | -------------------------------------------------------------------------------- /doc/src/lexcompatibility.md: -------------------------------------------------------------------------------- 1 | # Lex compatibility 2 | 3 | grmtools currently supports one common use of Lex, which is to produce a 4 | sequence of tokens. All Lex files require at least some porting to grmtools, 5 | though in many cases this is fairly trivial. Nevertheless, aspects such as 6 | the longest match rule are identical to Lex, and we assume familiarity with Lex 7 | syntax and its major features: the [Lex 8 | manual](https://web.archive.org/web/20220402195947/dinosaur.compilertools.net/lex/index.html) is recommended 9 | reading. 10 | 11 | 12 | ## Major differences 13 | 14 | There are several major differences between Lex and grmtools: 15 | 16 | * Lex has its own regular expression language whereas grmtools uses the well 17 | known Rust [regex crate](https://crates.io/crates/regex) for regular 18 | expressions.
These two regular expression languages are very similar, but 19 | complex regular expressions might not be supported under one or the other. 20 | 21 | * Lex files consist of a sequence of regular expressions and an action for each. 22 | grmtools lex files consist of a sequence of regular expressions and a token 23 | name. Actions are not currently supported (and, by extension, nor are 24 | special action expressions such as `ECHO` and `REJECT`). 25 | 26 | * Both Lex and grmtools lex files support start conditions as an optional prefix 27 | to regular expressions, listing necessary states for the input expression to 28 | be considered for matching against the input. Lex uses a special action 29 | expression `BEGIN(state)` to switch to the named `state`. Start states in grmtools 30 | are described in [start_states](start_states.md). 31 | 32 | * Character sets, and changes to internal array sizes, are not supported by grmtools. 33 | 34 | * Escape sequences: 35 | 36 | In addition to the escape sequences involved in the escaping of regular expressions, 37 | Lex and grmtools support the escape sequences `\123` (octal), `\x1234` (hexadecimal), 38 | and the ASCII escape sequences `\\` `\a` `\f` `\n` `\r` `\t` `\v`. 39 | 40 | Lex also interprets the escape sequence `\b` as backspace, while regex treats `\b` 41 | as a word boundary; consequently grmtools does too. The Lex behavior can be enabled 42 | using [posix_escapes](lexextensions.md). 43 | 44 | Additional escape sequences supported by regex: 45 | 46 | The `\u1234` and `\U12345678` escape sequences for unicode characters, 47 | the `\p`,`\P` unicode character classes, as well as the `\d` `\D` `\s` `\S` 48 | `\w` `\W` perl character classes, and `\A` `\b` `\B` `\z` escape sequences. 49 | 50 | Both Lex and grmtools support escaping arbitrary characters: for all other characters 51 | besides those listed above, when given an escaped character `\c` it will be passed to 52 | the regex engine as the character `c`. This is useful when a character is used within 53 | the lex format. 54 | 55 | An example of this is when the character `<` is used at the beginning of a regex. Both Lex 56 | and grmtools interpret this as the beginning of a start condition prefix, which can be 57 | escaped with `\<` to ensure it is treated as the start of a regular expression. 58 | 59 | The set of characters to which this behavior applies is affected by the escape sequence 60 | differences listed above. 61 | 62 | * Lex treats lines in the rules section beginning with whitespace as code to be copied verbatim 63 | into the generated lexer source. Grmtools lex does not support these and produces an error. 64 | 65 | ## LexerKinds 66 | 67 | ### LRNonStreamingLexerKind 68 | 69 | Currently lrlex only supports a single `LexKind::LRNonStreamingLexerKind`, which is the default if unspecified. 70 | -------------------------------------------------------------------------------- /doc/src/actioncode.md: -------------------------------------------------------------------------------- 1 | # Action code and return types 2 | 3 | ## Action code 4 | 5 | Action code is normal Rust code with the addition of the following special variables: 6 | 7 | * `$1` ... `$n` refer to the respective symbol in the production, numbered 8 | from 1 (i.e. `$1` refers to the first symbol in the production). If the 9 | symbol references a rule `R` then an instance of `R`'s type will be stored 10 | in the `$i` variable.
If the symbol references a lexeme then a 11 | `Result<Lexeme<StorageT>, Lexeme<StorageT>>` instance is returned where the 12 | `Ok` variant is used for lexemes that are directly derived from the user's 13 | input and the `Err` variant is used for lexemes that have been inserted by 14 | [error recovery](errorrecovery.md). 15 | 16 | * `$lexer` allows access to the lexer and its [various 17 | functions](https://softdevteam.github.io/grmtools/master/api/lrpar/trait.Lexer.html). 18 | The most commonly used of these is the `span_str` function, which allows us 19 | to extract `&'input str`s from a `Span` (e.g. to extract the string 20 | represented by a `Lexeme`, we would use `$lexer.span_str(lexeme.span())`). 21 | As this may suggest, actions may also reference the special lifetime 22 | `'input` (without any `$` prefix), which allows strings to be returned / 23 | stored by the grammar without copying memory. 24 | 25 | * `$span` is a 26 | [`cfgrammar::Span`](https://softdevteam.github.io/grmtools/master/api/cfgrammar/struct.Span.html) 27 | which captures how much of the user's input the current production matched. 28 | 29 | * `$$` is equivalent to `$` in normal Rust code. 30 | 31 | Any other variables beginning with `$` are treated as errors. 32 | 33 | 34 | ## Return types 35 | 36 | Productions' return types can be any arbitrary Rust type. You may in addition 37 | make use of the following: 38 | 39 | * The generic parameter `StorageT` references the type of lexemes and is 40 | typically used with the 41 | [`Lexeme`](https://softdevteam.github.io/grmtools/master/api/lrpar/struct.Lexeme.html) 42 | type i.e. `Lexeme<StorageT>`. This allows you to return lexemes from rules. 43 | 44 | * The lifetime `'input` allows you to extract strings whose lifetime is tied 45 | to the lexer and return them from rules / store them in structs without 46 | copying. `Lexer::span_str` returns such strings and the typical idiom of use 47 | is `&'input str`. 48 | 49 | 50 | ## Additional parse parameter 51 | 52 | A single extra parameter can be passed to action functions if the `%parse-param 53 | <var>: <type>` declaration is used. The variable `<var>` is then visible in all 54 | action code. `<type>` must implement the [`Clone` 55 | trait](https://doc.rust-lang.org/stable/std/clone/trait.Clone.html) (note that `Copy` 56 | bounds imply `Clone`, and `&` references implement `Copy`). 57 | 58 | For example if a grammar has a declaration: 59 | 60 | ``` 61 | %parse-param p: u64 62 | ``` 63 | 64 | then the statically generated `parse` function will take two parameters 65 | `(lexer: &..., p: u64)` and the variable `p` can be used in action code e.g.: 66 | 67 | ``` 68 | R -> ...: 69 | 'ID' { format!("{}{}", p, ...) } 70 | ; 71 | ``` 72 | 73 | # Generic parse parameter 74 | 75 | If `%parse-param` needs to be generic, additional type variables and lifetimes 76 | can be specified in the `%parse-generics T1, T2, ...` declaration. 77 | 78 | For example, if a grammar has following declarations: 79 | 80 | ``` 81 | %parse-generics T: FromStr 82 | %parse-param p: T 83 | ``` 84 | 85 | then the `parse` function will take an additional parameter of type `T`. 86 | 87 | This can be used, for example, [to allocate AST nodes in a memory arena.](https://github.com/softdevteam/grmtools/tree/master/lrpar/examples/calc_ast_arena).
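As a minimal sketch of what `%parse-param` looks like at the call site (the module name `r_y` and the value `41` are purely illustrative, not from a shipped example):

```rust
// With `%parse-param p: u64` in the grammar, the generated `parse`
// function takes the parameter after the lexer argument.
let lexer = lexerdef.lexer("some input");
let (res, errs) = r_y::parse(&lexer, 41);
```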
88 | -------------------------------------------------------------------------------- /doc/src/start_states.md: -------------------------------------------------------------------------------- 1 | # Start States 2 | 3 | The following explains the syntax and semantics of Start States in lrlex.
4 | A working example can be found in the repository at [lrpar/examples/start_states][1] 5 | 6 | [1]: https://github.com/softdevteam/grmtools/tree/master/lrpar/examples/start_states 7 | ## Motivation 8 | 9 | Start states are a feature from lex which can be used for context sensitive lexing. 10 | For instance, they can be used to implement nested comments (see the example in the repository), 11 | such that the start/end markers of tokens maintain balance. 12 | 13 | This is achieved by making rules which are qualified to match only when the lexer is in a 14 | particular state. Additionally the lexer has a stack of states, and matching rules perform actions 15 | which modify the stack. 16 | 17 | ## The INITIAL start state 18 | Unless specified otherwise all lex rules are members of the *INITIAL* start state. 19 | 20 | ``` 21 | %% 22 | <INITIAL>a "A" 23 | <INITIAL>[\t \n]+ ; 24 | ``` 25 | 26 | This is equivalent to the lex file below with no start states specified. 27 | 28 | ``` 29 | %% 30 | a "A" 31 | [\t \n]+ ; 32 | ``` 33 | 34 | ## Rules matching multiple states 35 | 36 | Rules can be matched in multiple states: just separate the states a rule should match in with commas. 37 | The following matches the `a` character when in either of the states `FirstState` or `SecondState`. 38 | 39 | ``` 40 | <FirstState, SecondState>a "A" 41 | ``` 42 | 43 | ## Differences from POSIX lex 44 | 45 | In POSIX lex start states are entered via code in the action, through either `BEGIN(STATE)` or 46 | calling combinations of `yy_push_state` and `yy_pop_state`. 47 | 48 | Because lrlex is actionless and does not support code actions, we instead have operators to 49 | perform the common modifications to the stack of start states. 50 | 51 | ### Push 52 | The push operator is given by adding '+' to the target state on the right hand side within 53 | angle brackets. When the following regex matches in *CURRENT_STATE*, it pushes *TARGET_STATE* onto 54 | the top of the stack of states. 55 | 56 | ``` 57 | <CURRENT_STATE>Regex <+TARGET_STATE>; 58 | ``` 59 | 60 | ### Pop 61 | The pop operator is given by adding '-' to the target state on the right hand side within angle 62 | brackets. When in the current state, the following pops the current state off of the 63 | stack of states, similarly to calling `yy_pop_state` from action code. 64 | ``` 65 | <CURRENT_STATE>Regex <-CURRENT_STATE>; 66 | ``` 67 | 68 | ### ReplaceStack 69 | The ReplaceStack operator is given by naming the target state within angle brackets. 70 | The ReplaceStack op clears the entire stack of states, then pushes the target state. 71 | 72 | ``` 73 | <CURRENT_STATE>Regex <TARGET_STATE>; 74 | ``` 75 | 76 | ### Returning a token while performing an operator. 77 | Start state operators can be combined with returning a token, for example: 78 | 79 | ``` 80 | Regex <+TARGET_STATE>"TOKEN" 81 | ``` 82 | 83 | ## Adding a start state 84 | Start states come in two forms, *exclusive* and *inclusive*. These are given by `%x` and `%s` 85 | respectively. 86 | 87 | ### Exclusive states 88 | In an exclusive state, a rule can be matched *only* if it is prefixed with the state specified. 89 | In the following, because `ExclState` is *exclusive*, the `#=` rule is only matched during the 90 | `INITIAL` state, while the `a` and `=#` characters are only matched while in the `ExclState`. 91 | 92 | ``` 93 | %x ExclState 94 | %% 95 | 96 | #= <+ExclState>; 97 | <ExclState>a "A" 98 | <ExclState>=# <-ExclState>; 99 | ``` 100 | 101 | ### Inclusive states 102 | 103 | Inclusive states are added to the set of rules to be matched when the start state is unspecified.
104 | 105 | ``` 106 | %s InclusiveState 107 | %% 108 | 109 | a "A" 110 | <InclusiveState>b "B" 111 | #= <+InclusiveState>; 112 | <InclusiveState>=# <-InclusiveState>; 113 | ``` 114 | 115 | Is equivalent to the following using exclusive states. 116 | 117 | ``` 118 | %x Excl 119 | %% 120 | 121 | <INITIAL, Excl>a "A" 122 | <Excl>b "B" 123 | <INITIAL, Excl>#= <+Excl>; 124 | <Excl>=# <-Excl>; 125 | ``` 126 | -------------------------------------------------------------------------------- /.buildbot.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | set -e 4 | 5 | export CARGO_HOME="`pwd`/.cargo_install" 6 | export RUSTUP_HOME="`pwd`/.rustup" 7 | export WASMTIME_HOME="`pwd`/.wasmtime" 8 | export NVM_DIR="`pwd`/.nodejs" 9 | export RUSTFLAGS="--cfg grmtools_extra_checks" 10 | 11 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rustup.sh 12 | sh rustup.sh --default-host x86_64-unknown-linux-gnu --default-toolchain stable -y --no-modify-path 13 | 14 | export PATH=`pwd`/.cargo_install/bin/:$WASMTIME_HOME/bin:$PATH 15 | 16 | # Install wasmtime; once debian trixie is stabilized 17 | # we can likely just use rust-wasmtime. 18 | # 19 | # Needed for wasm32-wasip2 20 | touch .wasmtime_profile 21 | if [ "X`which wasmtime`" = "X" ]; then 22 | PROFILE=".wasmtime_profile" bash -c 'curl https://wasmtime.dev/install.sh -sSf | bash' 23 | fi 24 | . ./.wasmtime_profile 25 | 26 | # Needed for wasm32-unknown-unknown 27 | mkdir -p $NVM_DIR 28 | PROFILE=/dev/null bash -c 'curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.2/install.sh | bash' 29 | . "$NVM_DIR/nvm.sh" 30 | # Download and install Node.js: 31 | nvm install 22 32 | 33 | cargo fmt --all -- --check 34 | 35 | rustup toolchain install stable 36 | rustup default stable 37 | 38 | # Later on we are going to need to install cargo-deny and mdbook. We kick the 39 | # install jobs off now so that at least some work (e.g. downloading crates) can 40 | # happen in parallel, speeding up the overall process. 41 | 42 | cargo_deny_mdbook_tmp=$(mktemp) 43 | ( cargo install --locked cargo-deny ; cargo install --locked mdbook ) \ 44 | >"${cargo_deny_mdbook_tmp}" 2>&1 & 45 | cargo_deny_mdbook_pid=$!
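# (The tests below deliberately run while the backgrounded cargo-deny/mdbook installs above complete; the script only `wait`s on them near the end.)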
46 | 47 | cargo test 48 | cargo test --release 49 | 50 | rustup target add wasm32-unknown-unknown 51 | cargo install wasm-bindgen-cli 52 | cargo test --target wasm32-unknown-unknown 53 | 54 | rustup target add wasm32-wasip2 55 | cargo install workspace_runner 56 | cargo test --target wasm32-wasip2 57 | 58 | cargo test --lib cfgrammar --features serde 59 | cargo test --lib lrpar --features serde 60 | 61 | root=`pwd` 62 | cd $root/lrlex/examples/calc_manual_lex 63 | echo "2 + 3 * 4" | cargo run | grep "Result: 14" 64 | # Touching these files shouldn't invalidate the cache (via --cfg grmtools_extra_checks) 65 | touch src/main.rs && CACHE_EXPECTED=y cargo build 66 | cd $root/lrpar/examples/calc_actions 67 | echo "2 + 3 * 4" | cargo run --package nimbleparse -- src/calc.l src/calc.y - 68 | # Invoke `%grmtools{test_files}` 69 | cargo run --package nimbleparse -- src/calc.l src/calc.y 70 | echo "2 + 3 * 4" | cargo run | grep "Result: 14" 71 | touch src/main.rs && CACHE_EXPECTED=y cargo build 72 | cd $root/lrpar/examples/calc_ast 73 | echo "2 + 3 * 4" | cargo run --package nimbleparse -- src/calc.l src/calc.y - 74 | # Invoke `%grmtools{test_files}` 75 | cargo run --package nimbleparse -- src/calc.l src/calc.y 76 | echo "2 + 3 * 4" | cargo run | grep "Result: 14" 77 | cd $root/lrpar/examples/calc_ast_arena 78 | echo "2 + 3 * 4" | cargo run --package nimbleparse -- src/calc.l src/calc.y - 79 | # Invoke `%grmtools{test_files}` 80 | cargo run --package nimbleparse -- src/calc.l src/calc.y 81 | echo "2 + 3 * 4" | cargo run | grep "Result: 14" 82 | touch src/main.rs && CACHE_EXPECTED=y cargo build 83 | cd $root/lrpar/examples/calc_parsetree 84 | echo "2 + 3 * 4" | cargo run --package nimbleparse -- src/calc.l src/calc.y - 85 | # Invoke `%grmtools{test_files}` 86 | cargo run --package nimbleparse -- src/calc.l src/calc.y 87 | echo "2 + 3 * 4" | cargo run | grep "Result: 14" 88 | touch src/main.rs && CACHE_EXPECTED=y cargo build 89 | cd $root/lrpar/examples/clone_param 90 | echo "1+++" | cargo run --package nimbleparse -- src/param.l src/param.y - 91 | # Invoke `%grmtools{test_files}` 92 | cargo run --package nimbleparse -- src/param.l src/param.y 93 | cd $root/lrpar/examples/start_states 94 | echo "/* /* commented out */ */ uncommented text /* */" | cargo run --package nimbleparse -- src/comment.l src/comment.y - 95 | # Invoke `%grmtools{test_files}` 96 | cargo run --package nimbleparse -- src/comment.l src/comment.y 97 | cd $root 98 | 99 | RUSTDOCFLAGS="-Dwarnings" cargo doc --no-deps 100 | 101 | # Check licenses. 102 | wait "${cargo_deny_mdbook_pid}" || ( cat "${cargo_deny_mdbook_tmp}" && exit 1 ) 103 | cargo-deny check license 104 | 105 | # Build the docs 106 | cd $root/doc 107 | mdbook build 108 | test -d book 109 | cd .. 110 | -------------------------------------------------------------------------------- /doc/src/manuallexer.md: -------------------------------------------------------------------------------- 1 | # Hand-written lexers 2 | 3 | `lrpar` provides a generic lexing interface into which any lexer can plug. 4 | Users can provide 5 | one or both of a custom lexeme type -- conforming to 6 | [`lrpar::Lexeme`](https://softdevteam.github.io/grmtools/master/api/lrpar/trait.Lexeme.html) 7 | -- and a custom lexing type -- conforming to 8 | [`lrpar::NonStreamingLexer`](https://softdevteam.github.io/grmtools/master/api/lrpar/trait.NonStreamingLexer.html).
9 | If you wish to use a custom lexer, you will need to instantiate `lrpar` 10 | appropriately (both 11 | [`CTParserBuilder`](https://softdevteam.github.io/grmtools/master/api/lrpar/struct.CTParserBuilder.html) 12 | and 13 | [`RTParserBuilder`](https://softdevteam.github.io/grmtools/master/api/lrpar/struct.RTParserBuilder.html)). 14 | 15 | For many purposes, the low-level control and performance that `lrpar` gives you is unneeded, 16 | and the boiler-plate that comes with it unwanted. Fortunately, `lrlex` provides the following convenience mechanisms to make it easier to use a hand-written lexer with `lrpar`: 17 | 18 | 1. `lrlex`'s normal `LRNonStreamingLexer` struct can be instantiated by an 19 | end-user with an input stream, a list of lexemes created from that 20 | input stream, and the newlines encountered while lexing that input 21 | stream. This saves having to define a custom instance of the 22 | [`lrpar::NonStreamingLexer`](https://softdevteam.github.io/grmtools/master/api/lrpar/trait.NonStreamingLexer.html) 23 | trait. 24 | 25 | 2. `lrlex`'s [`DefaultLexeme`](https://softdevteam.github.io/grmtools/master/api/lrlex/struct.DefaultLexeme.html) 26 | struct can also be instantiated by end-users, saving having to define a 27 | custom instance of the 28 | [`lrpar::Lexeme`](https://softdevteam.github.io/grmtools/master/api/lrpar/trait.Lexeme.html) 29 | trait. 30 | 31 | 3. `lrlex` exposes 32 | [`CTTokenMapBuilder`](https://softdevteam.github.io/grmtools/master/api/lrlex/struct.CTTokenMapBuilder.html) 33 | to be used from `build.rs` scripts which automatically produces a 34 | Rust module with one constant per token ID. It is explicitly 35 | designed to be easy to use with `lrpar`'s compile-time building. 36 | 37 | Putting these together is then relatively easy. First a `build.rs` file for a 38 | hand-written lexer will look roughly as follows: 39 | 40 | ```rust 41 | use lrlex::{CTTokenMapBuilder, DefaultLexerTypes}; 42 | use lrpar::CTParserBuilder; 43 | 44 | fn main() { 45 | let ctp = CTParserBuilder::<DefaultLexerTypes<u8>>::new() 46 | .grammar_in_src_dir("grammar.y") 47 | .unwrap() 48 | .build() 49 | .unwrap(); 50 | CTTokenMapBuilder::<u8>::new("token_map", ctp.token_map()).build().unwrap() 51 | } 52 | ``` 53 | 54 | This produces a module that can be imported with `lrlex_mod!("token_map")`. The 55 | module will contain one constant, prefixed with `T_`, per token identifier in the 56 | grammar. For example, for the following grammar excerpt: 57 | 58 | ```rust,noplaypen 59 | Expr -> Result<u64, ()>: 60 | Expr 'PLUS' Term { Ok($1? + $3?) } 61 | | Term { $1 } 62 | ; 63 | ``` 64 | 65 | the module will contain `const T_PLUS: u8 = ...;`. 66 | 67 | Since Yacc grammars can contain token identifiers which are not valid Rust 68 | identifiers, `CTTokenMapBuilder` allows you to provide a map from the token 69 | identifier to a "Rust friendly" variant. For example, for the following grammar 70 | excerpt: 71 | 72 | ```rust,noplaypen 73 | Expr -> Result<u64, ()>: 74 | Expr '+' Term { Ok($1? + $3?) } 75 | | Term { $1 } 76 | ; 77 | ``` 78 | 79 | we would provide a map `'+' => 'PLUS'` leading, again, to a constant `T_PLUS` 80 | being defined.
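To make the mapping concrete, such a map can be expressed as ordinary Rust data. Note that this is only a sketch: the precise `CTTokenMapBuilder` method that accepts the map depends on your `lrlex` version, so consult its API docs.

```rust
use std::collections::HashMap;

// Illustrative rename table: grammar token identifiers on the left,
// Rust-friendly constant suffixes (yielding e.g. `T_PLUS`) on the right.
let rename_map = HashMap::from([("+", "PLUS"), ("*", "STAR")]);
```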
81 | 82 | One can then write a simple custom lexer which lexes all the input in one go 83 | and returns an `LRNonStreamingLexer` as follows: 84 | 85 | ```rust 86 | use cfgrammar::NewlineCache; 87 | use lrlex::{lrlex_mod, DefaultLexeme, DefaultLexerTypes, LRNonStreamingLexer}; 88 | use lrpar::{lrpar_mod, Lexeme, NonStreamingLexer, Span}; 89 | 90 | lrlex_mod!("token_map"); 91 | use token_map::*; 92 | 93 | fn lex(s: &str) -> LRNonStreamingLexer<DefaultLexerTypes<u8>> { 94 | let mut lexemes = Vec::new(); 95 | let mut newlines = NewlineCache::new(); 96 | let mut i = 0; 97 | while i < s.len() { 98 | if i == ... { 99 | lexemes.push(DefaultLexeme::new(T_PLUS, i, ...)); 100 | } else { 101 | ... 102 | } 103 | } 104 | LRNonStreamingLexer::new(s, lexemes, newlines) 105 | } 106 | ``` 107 | -------------------------------------------------------------------------------- /doc/src/yacccompatibility.md: -------------------------------------------------------------------------------- 1 | # Yacc compatibility 2 | 3 | grmtools supports most major Yacc features, to the extent that many Yacc 4 | grammars can be used unchanged with grmtools. In this book we assume 5 | familiarity with Yacc syntax and its major features: the 6 | [Yacc manual](https://web.archive.org/web/20220830093827/dinosaur.compilertools.net/yacc/index.html) is recommended 7 | reading. 8 | 9 | 10 | ## Major differences 11 | 12 | There are several differences between Yacc and grmtools including: 13 | 14 | * grmtools has no equivalent of any of the `yy*` functions (e.g. `yyerror`, 15 | `yylex`, `yylval`, `yyparse` and so on). This means, for example, that 16 | grammar actions cannot currently influence the lexer in any way. 17 | 18 | * grmtools has an entirely different approach to [error 19 | recovery](errorrecovery.md). The token `error` and the special action 20 | expressions `yyerrok` and `yyclearin` are not supported. In general, users 21 | can simply remove alternatives that consist solely of `error`. 22 | 23 | * `%union` can be mapped to `%actiontype` in grmtools, though this is rarely 24 | the best way of using a Yacc grammar in Rust. See the [Grmtools Yacc 25 | variant](#grmtools) below for the most common way of making grammars do 26 | something useful; in a limited number of cases (e.g. if you just want to 27 | build a parse tree), you may find the ["Original" Yacc 28 | variant](#original-yacc) useful. 29 | 30 | * grmtools allows both Yacc's `%expect` and Bison's `%expect-rr` declarations 31 | in its base "Yacc" mode. 32 | 33 | * Bison's `%parse-param` can take multiple arguments. grmtools' `%parse-param` 34 | takes a single argument which can be a tuple, thus emulating multiple 35 | arguments while integrating naturally into Rust's type system. 36 | 37 | * Although rare, it is possible to generate accept/reduce conflicts (e.g. for 38 | a grammar with the sole rule `A: A;`). grmtools considers accept/reduce 39 | conflicts to be a hard error, and refuses to generate anything for the 40 | resulting grammar, whereas Yacc allows them through (with unclear 41 | consequences). Bison also appears to consider accept/reduce conflicts a hard 42 | error, though it appears to detect them in a more generic way (reporting 43 | such rules as "not generating any sentences"). 44 | 45 | 46 | ## YaccKinds 47 | 48 | ### Grmtools 49 | 50 | `YaccKind::Grmtools` is grmtools' own variant of Yacc syntax, and the one that 51 | most users will want to use.
The most significant difference to "normal" Yacc 52 | is that rules are annotated with a Rust type to which all of their productions' 53 | actions must adhere. Note that whilst a rule's productions must all adhere 54 | to a single type, different rules can have different types. Consider the 55 | following snippet: 56 | 57 | ```rust,noplaypen 58 | R1 -> Result<u64, ()>: 59 | 'a' { Ok(5) } 60 | | 'b' { Err(()) } 61 | ; 62 | 63 | R2 -> u64: 64 | | { 0 } 65 | ; 66 | ``` 67 | 68 | Here the rule `R1` has a Rust return type of `Result<u64, ()>` (between `->` and 69 | `:`). Both of its productions adhere to this type, the first by instantiating 70 | `Ok(5)` and the second `Err(())`. The rule `R2` has a return type of `u64`. 71 | 72 | 73 | ### “Original” Yacc 74 | 75 | Although the name is not fully accurate (grmtools supports a slightly disjoint 76 | subset of original Yacc's input), this mode allows users to most easily test 77 | externally created Yacc files. Several sub-variants are allowed: 78 | 79 | * `YaccKind::Original(YaccOriginalActionKind::GenericParseTree)` does not 80 | execute user actions, but instead creates a generic parse tree, where elements 81 | are instances of the `lrpar::parser::Node` enum. This is useful for quickly 82 | testing whether a parser is accepting the intended language. 83 | 84 | * `YaccKind::Original(YaccOriginalActionKind::NoAction)` parses input and 85 | reports errors but does not execute any user actions. This is useful if you 86 | are trying to find out whether a corpus of input parses successfully against 87 | your grammar or not. 88 | 89 | * `YaccKind::Original(YaccOriginalActionKind::UserAction)` models original Yacc 90 | most closely but, in a Rust setting, is probably of little use beyond simple 91 | calculator like languages. Instead of Yacc's `%union` directive, users can 92 | specify `%actiontype` which is a Rust type to which every production's actions 93 | in the grammar must adhere to. Unless all actions happen to naturally return 94 | the same type, this quickly becomes cumbersome to use. For most use cases, 95 | `YaccKind::Grmtools` is a superior alternative. 96 | -------------------------------------------------------------------------------- /lrlex/examples/calc_manual_lex/src/main.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::unnecessary_wraps)] 2 | 3 | use std::io::{self, BufRead, Write}; 4 | 5 | use cfgrammar::{NewlineCache, Span}; 6 | use lrlex::{DefaultLexeme, DefaultLexerTypes, LRNonStreamingLexer, lrlex_mod}; 7 | use lrpar::{Lexeme, NonStreamingLexer, lrpar_mod}; 8 | 9 | lrlex_mod!("token_map"); 10 | // Using `lrpar_mod!` brings the parser for `calc.y` into scope. By default the module name will be 11 | // `calc_y` (i.e. the file name, minus any extensions, with a suffix of `_y`). 12 | lrpar_mod!("calc.y"); 13 | 14 | use calc_y::Expr; 15 | use token_map::*; 16 | 17 | fn main() { 18 | let stdin = io::stdin(); 19 | loop { 20 | print!(">>> "); 21 | io::stdout().flush().ok(); 22 | match stdin.lock().lines().next() { 23 | Some(Ok(ref l)) => { 24 | if l.trim().is_empty() { 25 | continue; 26 | } 27 | let lexer = lex(l); 28 | // Pass the lexer to the parser and lex and parse the input.
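// (`lex` is the hand-written lexer defined further down this file; it returns an `LRNonStreamingLexer`, so the generated parser consumes it exactly as it would an `lrlex`-generated lexer.)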
29 | let (res, errs) = calc_y::parse(&lexer); 30 | for e in errs { 31 | println!("{}", e.pp(&lexer, &calc_y::token_epp)); 32 | } 33 | if let Some(Ok(r)) = res { 34 | match eval(&lexer, r) { 35 | Ok(i) => println!("Result: {}", i), 36 | Err((span, msg)) => { 37 | let ((line, col), _) = lexer.line_col(span); 38 | eprintln!( 39 | "Evaluation error at line {} column {}, '{}' {}.", 40 | line, 41 | col, 42 | lexer.span_str(span), 43 | msg 44 | ) 45 | } 46 | } 47 | } 48 | } 49 | _ => break, 50 | } 51 | } 52 | } 53 | 54 | fn lex(s: &str) -> LRNonStreamingLexer<'_, '_, DefaultLexerTypes<u8>> { 55 | let mut lexemes = Vec::new(); 56 | let mut i = 0; 57 | while i < s.len() { 58 | // Skip whitespace 59 | i += s[i..] 60 | .chars() 61 | .take_while(|c| c.is_whitespace()) 62 | .map(|c| c.len_utf8()) 63 | .sum::<usize>(); 64 | if i == s.len() { 65 | break; 66 | } 67 | match s[i..].chars().next().unwrap() { 68 | '+' => { 69 | lexemes.push(Ok(DefaultLexeme::new(T_PLUS, i, 1))); 70 | i += 1; 71 | } 72 | '*' => { 73 | lexemes.push(Ok(DefaultLexeme::new(T_STAR, i, 1))); 74 | i += 1; 75 | } 76 | '(' => { 77 | lexemes.push(Ok(DefaultLexeme::new(T_LBRACK, i, 1))); 78 | i += 1; 79 | } 80 | ')' => { 81 | lexemes.push(Ok(DefaultLexeme::new(T_RBRACK, i, 1))); 82 | i += 1; 83 | } 84 | _ => { 85 | let old_i = i; 86 | while let Some('0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') = 87 | s[i..].chars().next() 88 | { 89 | i += 1; 90 | } 91 | if i > old_i { 92 | lexemes.push(Ok(DefaultLexeme::new(T_INT, old_i, i - old_i))); 93 | } else { 94 | let c_len = s[i..].chars().next().unwrap().len_utf8(); 95 | lexemes.push(Ok(DefaultLexeme::new(T_UNMATCHED, i, c_len))); 96 | i += c_len; 97 | } 98 | } 99 | } 100 | } 101 | LRNonStreamingLexer::new(s, lexemes, NewlineCache::new()) 102 | } 103 | 104 | fn eval( 105 | lexer: &dyn NonStreamingLexer<DefaultLexerTypes<u8>>, 106 | e: Expr, 107 | ) -> Result<u64, (Span, &'static str)> { 108 | match e { 109 | Expr::Add { span, lhs, rhs } => eval(lexer, *lhs)? 110 | .checked_add(eval(lexer, *rhs)?) 111 | .ok_or((span, "overflowed")), 112 | Expr::Mul { span, lhs, rhs } => eval(lexer, *lhs)? 113 | .checked_mul(eval(lexer, *rhs)?) 114 | .ok_or((span, "overflowed")), 115 | Expr::Number { span } => lexer 116 | .span_str(span) 117 | .parse::<u64>() 118 | .map_err(|_| (span, "cannot be represented as a u64")), 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /lrpar/src/lib/lex_api.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::len_without_is_empty)] 2 | 3 | use std::{cmp, error::Error, fmt, hash::Hash, marker}; 4 | 5 | use cfgrammar::Span; 6 | use num_traits::{AsPrimitive, PrimInt, Unsigned}; 7 | 8 | pub trait LexerTypes: fmt::Debug + Clone 9 | where 10 | usize: AsPrimitive<Self::StorageT>, 11 | { 12 | type LexemeT: Lexeme<Self::StorageT>; 13 | type StorageT: 'static + fmt::Debug + Hash + PrimInt + Unsigned; 14 | type LexErrorT: LexError; 15 | } 16 | 17 | /// The base trait which all lexers which want to interact with `lrpar` must implement. 18 | pub trait Lexer<LexerTypesT: LexerTypes> 19 | where 20 | usize: AsPrimitive<LexerTypesT::StorageT>, 21 | { 22 | /// Iterate over all the lexemes in this lexer. Note that: 23 | /// * The lexer may or may not stop after the first [LexError] is encountered. 24 | /// * There are no guarantees about what happens if this function is called more than once. 25 | /// For example, a streaming lexer may only produce [Lexeme]s on the first call.
26 | fn iter<'a>( 27 | &'a self, 28 | ) -> Box<dyn Iterator<Item = Result<LexerTypesT::LexemeT, LexerTypesT::LexErrorT>> + 'a>; 29 | } 30 | 31 | /// A `NonStreamingLexer` is one that takes input in one go, and is then able to hand out 32 | /// substrings to that input and calculate line and column numbers from a [Span]. 33 | pub trait NonStreamingLexer<'input, LexerTypesT: LexerTypes>: Lexer<LexerTypesT> 34 | where 35 | usize: AsPrimitive<LexerTypesT::StorageT>, 36 | { 37 | /// Return the user input associated with a [Span]. 38 | /// 39 | /// The [Span] must be well formed: 40 | /// * The start/end byte indexes must be valid UTF-8 character indexes. 41 | /// * The end byte index must not exceed the input's length. 42 | /// 43 | /// If these requirements are not respected this function may panic or return unexpected 44 | /// portions of the input. 45 | fn span_str(&self, span: Span) -> &'input str; 46 | 47 | /// Return the lines containing the input at `span` (including *all* the text on the lines 48 | /// that `span` starts and ends on). 49 | /// 50 | /// The [Span] must be well formed: 51 | /// * The start/end byte indexes must be valid UTF-8 character indexes. 52 | /// * The end byte index must not exceed the input's length. 53 | /// 54 | /// If these requirements are not respected this function may panic or return unexpected 55 | /// portions of the input. 56 | fn span_lines_str(&self, span: Span) -> &'input str; 57 | 58 | /// Return `((start line, start column), (end line, end column))` for `span`. Note that column 59 | /// *characters* (not bytes) are returned. 60 | /// 61 | /// The [Span] must be well formed: 62 | /// * The start/end byte indexes must be valid UTF-8 character indexes. 63 | /// * The end byte index must not exceed the input's length. 64 | /// 65 | /// If these requirements are not respected this function may panic or return unexpected 66 | /// portions of the input. 67 | fn line_col(&self, span: Span) -> ((usize, usize), (usize, usize)); 68 | } 69 | 70 | /// A lexeme represents a segment of the user's input that conforms to a known type: this trait 71 | /// captures the common behaviour of all lexeme structs. 72 | /// 73 | /// Lexemes are assumed to have a definition which describes all possible correct lexemes (e.g. the 74 | /// regular expression `[0-9]+` defines all integer lexemes). This trait also allows "faulty" 75 | /// lexemes to be represented -- that is, lexemes that have resulted from error recovery of some 76 | /// sort. Faulty lexemes can violate the lexeme's type definition in any possible way (e.g. they 77 | /// might span more or less input than the definition would suggest is possible). 78 | pub trait Lexeme<StorageT>: fmt::Debug + fmt::Display + cmp::Eq + Hash + marker::Copy { 79 | /// Create a new lexeme with ID `tok_id`, a starting position in the input `start`, and length 80 | /// `len`. 81 | /// 82 | /// Lexemes created using this function are expected to be "correct" in the sense that they 83 | /// fully respect the lexeme's definition semantics. To create faulty lexemes, use 84 | /// [new_faulty](Lexeme::new_faulty). 85 | fn new(tok_id: StorageT, start: usize, len: usize) -> Self 86 | where 87 | Self: Sized; 88 | 89 | /// Create a new faulty lexeme with ID `tok_id` and a starting position in the input `start`. 90 | fn new_faulty(tok_id: StorageT, start: usize, len: usize) -> Self 91 | where 92 | Self: Sized; 93 | 94 | /// The token ID. 95 | fn tok_id(&self) -> StorageT; 96 | 97 | /// Obtain this `Lexeme`'s [Span]. 98 | fn span(&self) -> Span; 99 | 100 | /// Returns `true` if this lexeme is "faulty" i.e. is the result of error recovery in some way.
101 | /// If `true`, note that the lexeme's span may be greater or less than you may expect from the 102 | /// lexeme's definition. 103 | fn faulty(&self) -> bool; 104 | } 105 | 106 | /// A lexing error. 107 | pub trait LexError: Error { 108 | /// Return the span associated with this error. 109 | fn span(&self) -> Span; 110 | } 111 | -------------------------------------------------------------------------------- /lrpar/cttests/build.rs: -------------------------------------------------------------------------------- 1 | use cfgrammar::yacc::ast::ASTWithValidityInfo; 2 | use glob::glob; 3 | #[path = "src/cgen_helper.rs"] 4 | mod cgen_helper; 5 | use cfg_aliases::cfg_aliases; 6 | use cgen_helper::run_test_path; 7 | use lrlex::{CTLexerBuilder, DefaultLexerTypes}; 8 | 9 | // Compiles the `*.test` files within `src`. Test files are written in Yaml syntax and have 4 10 | // mandatory sections: name (describing what the test does), yacckind (defining the grammar type 11 | // used), grammar (the grammar rules), and lexer (the lexing rules). The tests are compiled into 12 | // two modules `<test name>_y` and `<test name>_l`, which we can then import into src/lib.rs and 13 | // write tests for. 14 | fn main() -> Result<(), Box<dyn std::error::Error>> { 15 | for src in glob("src/*.rs")? { 16 | println!("cargo::rerun-if-changed={}", src?.display()); 17 | } 18 | for entry in glob("src/*.test")? { 19 | run_test_path(entry.unwrap())?; 20 | } 21 | 22 | cfg_aliases! { 23 | // Platforms 24 | wasm32_unknown: { all(target_arch = "wasm32", target_os="unknown", target_vendor="unknown") }, 25 | } 26 | 27 | // The generic `src/*.test` testing all use a `u32` StorageT 28 | // In this block we test `storaget.l`, and `storaget.y` with a `u8` instead. 29 | { 30 | // Because we're modifying the `StorageT` this isn't something `run_test_path` can do, 31 | // Since it modifies the type of the builder. 32 | CTLexerBuilder::<DefaultLexerTypes<u8>>::new_with_lexemet() 33 | .rust_edition(lrlex::RustEdition::Rust2021) 34 | .output_path(format!( 35 | "{}/storaget.l.rs", 36 | std::env::var("OUT_DIR").unwrap() 37 | )) 38 | .lrpar_config(|ctp| { 39 | ctp.rust_edition(lrpar::RustEdition::Rust2021) 40 | .output_path(format!( 41 | "{}/storaget.y.rs", 42 | std::env::var("OUT_DIR").unwrap() 43 | )) 44 | .grammar_in_src_dir("storaget.y") 45 | .unwrap() 46 | }) 47 | .lexer_in_src_dir("storaget.l") 48 | .unwrap() 49 | .build() 50 | .unwrap(); 51 | } 52 | println!("cargo::rerun-if-changed=src/storaget.l"); 53 | println!( 54 | "cargo::rerun-if-changed={}/storaget.l.rs", 55 | std::env::var("OUT_DIR").unwrap() 56 | ); 57 | println!("cargo::rerun-if-changed=src/storaget.y"); 58 | println!( 59 | "cargo::rerun-if-changed={}/storaget.y.rs", 60 | std::env::var("OUT_DIR").unwrap() 61 | ); 62 | 63 | // This block specific to `multi_start.test` 64 | // 65 | // We use `clone_and_change_start_rule` to generate multiple parsers with 66 | // different start rules from a single grammar source. 67 | { 68 | use lrpar::unstable_api::UnstableApi; 69 | // In this case we'll be building multiple grammars 70 | // 71 | // 1. Parse multi_start_rule.y into an AST 72 | // 2. Clone the original and change the start rule. 73 | // 3. Build a grammar for `multi_start_rule.y` unchanged. 74 | // 4. Build the modified grammar.
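// NB: `grammar_ast`/`with_grammar_src` (and `clone_and_change_start_rule` above) are gated behind `lrpar::unstable_api::UnstableApi`, hence the marker value threaded through the builder calls below.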
75 | let grammar_path = &std::env::current_dir().unwrap().join("src/multi_start.y"); 76 | let grammar_src = std::fs::read_to_string(grammar_path).unwrap(); 77 | let grammar_src_clone = grammar_src.clone(); 78 | let valid_ast = ASTWithValidityInfo::new(cfgrammar::yacc::YaccKind::Grmtools, &grammar_src); 79 | eprintln!("rules {:?}", valid_ast.ast().rules); 80 | let bstart_rule = valid_ast.ast().get_rule("BStart").unwrap().clone(); 81 | let modified_ast = valid_ast.clone_and_change_start_rule(bstart_rule).unwrap(); 82 | CTLexerBuilder::new() 83 | .lrpar_config(move |ctp| { 84 | ctp.grammar_ast(valid_ast.clone(), UnstableApi) 85 | .with_grammar_src(grammar_src.clone(), UnstableApi) 86 | .grammar_in_src_dir("multi_start.y") 87 | .unwrap() 88 | .mod_name("ast_unmodified_y") 89 | .output_path(format!( 90 | "{}/ast_unmodified.y.rs", 91 | std::env::var("OUT_DIR").unwrap() 92 | )) 93 | }) 94 | .lexer_in_src_dir("multi_start.l") 95 | .unwrap() 96 | .output_path(format!( 97 | "{}/ast_unmodified.l.rs", 98 | std::env::var("OUT_DIR").unwrap() 99 | )) 100 | .mod_name("ast_unmodified_l") 101 | .build() 102 | .unwrap(); 103 | CTLexerBuilder::new() 104 | .lrpar_config(move |ctp| { 105 | ctp.grammar_ast(modified_ast.clone(), UnstableApi) 106 | .with_grammar_src(grammar_src_clone.clone(), UnstableApi) 107 | .grammar_in_src_dir("multi_start.y") 108 | .unwrap() 109 | .mod_name("ast_modified_y") 110 | .output_path(format!( 111 | "{}/ast_modified.y.rs", 112 | std::env::var("OUT_DIR").unwrap() 113 | )) 114 | // We still need to disable these because they are checked after ast validation. 115 | .warnings_are_errors(false) 116 | .show_warnings(false) 117 | }) 118 | .lexer_in_src_dir("multi_start.l") 119 | .unwrap() 120 | .mod_name("ast_modified_l") 121 | .output_path(format!( 122 | "{}/ast_modified.l.rs", 123 | std::env::var("OUT_DIR").unwrap() 124 | )) 125 | .build() 126 | .unwrap(); 127 | } 128 | Ok(()) 129 | } 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grammar and parsing libraries for Rust 2 | 3 | [![Bors enabled](https://bors.tech/images/badge_small.svg)](https://app.bors.tech/repositories/22484) [![lrpar on crates.io](https://img.shields.io/crates/v/lrpar.svg?label=lrpar)](https://crates.io/crates/lrpar) [![lrlex on crates.io](https://img.shields.io/crates/v/lrlex.svg?label=lrlex)](https://crates.io/crates/lrlex) [![lrtable on crates.io](https://img.shields.io/crates/v/lrtable.svg?label=lrtable)](https://crates.io/crates/lrtable) [![cfgrammar on crates.io](https://img.shields.io/crates/v/cfgrammar.svg?label=cfgrammar)](https://crates.io/crates/cfgrammar) 4 | 5 | grmtools is a suite of Rust libraries and binaries for parsing text, both at 6 | compile-time, and run-time. Most users will probably be interested in the 7 | compile-time Yacc feature, which allows traditional `.y` files to be used 8 | (mostly) unchanged in Rust. 9 | 10 | ## Quickstart 11 | 12 | A minimal example using this library consists of two files (in addition to the 13 | grammar and lexing definitions). 
First we need to create a file `build.rs` in 14 | the root of our project with the following content: 15 | 16 | ```rust 17 | use lrlex::CTLexerBuilder; 18 | 19 | fn main() { 20 | CTLexerBuilder::new() 21 | .lrpar_config(|ctp| { 22 | ctp.grammar_in_src_dir("calc.y") 23 | .unwrap() 24 | }) 25 | .lexer_in_src_dir("calc.l") 26 | .unwrap() 27 | .build() 28 | .unwrap(); 29 | } 30 | ``` 31 | 32 | This will generate and compile a parser and lexer, where the definitions for the 33 | lexer can be found in `src/calc.l`: 34 | 35 | ```rust 36 | %% 37 | [0-9]+ "INT" 38 | \+ "+" 39 | \* "*" 40 | \( "(" 41 | \) ")" 42 | [\t ]+ ; 43 | ``` 44 | 45 | and where the definitions for the parser can be found in `src/calc.y`: 46 | 47 | ```rust 48 | %grmtools{yacckind: Grmtools} 49 | %start Expr 50 | %avoid_insert "INT" 51 | %% 52 | Expr -> Result<u64, ()>: 53 | Expr '+' Term { Ok($1? + $3?) } 54 | | Term { $1 } 55 | ; 56 | 57 | Term -> Result<u64, ()>: 58 | Term '*' Factor { Ok($1? * $3?) } 59 | | Factor { $1 } 60 | ; 61 | 62 | Factor -> Result<u64, ()>: 63 | '(' Expr ')' { $2 } 64 | | 'INT' 65 | { 66 | let v = $1.map_err(|_| ())?; 67 | parse_int($lexer.span_str(v.span())) 68 | } 69 | ; 70 | %% 71 | // Any functions here are in scope for all the grammar actions above. 72 | 73 | fn parse_int(s: &str) -> Result<u64, ()> { 74 | match s.parse::<u64>() { 75 | Ok(val) => Ok(val), 76 | Err(_) => { 77 | eprintln!("{} cannot be represented as a u64", s); 78 | Err(()) 79 | } 80 | } 81 | } 82 | ``` 83 | 84 | We can then use the generated lexer and parser within our `src/main.rs` file as 85 | follows: 86 | 87 | ```rust 88 | use std::env; 89 | 90 | use lrlex::lrlex_mod; 91 | use lrpar::lrpar_mod; 92 | 93 | // Using `lrlex_mod!` brings the lexer for `calc.l` into scope. By default the 94 | // module name will be `calc_l` (i.e. the file name, minus any extensions, 95 | // with a suffix of `_l`). 96 | lrlex_mod!("calc.l"); 97 | // Using `lrpar_mod!` brings the parser for `calc.y` into scope. By default the 98 | // module name will be `calc_y` (i.e. the file name, minus any extensions, 99 | // with a suffix of `_y`). 100 | lrpar_mod!("calc.y"); 101 | 102 | fn main() { 103 | // Get the `LexerDef` for the `calc` language. 104 | let lexerdef = calc_l::lexerdef(); 105 | let args: Vec<String> = env::args().collect(); 106 | // Now we create a lexer with the `lexer` method with which we can lex an 107 | // input. 108 | let lexer = lexerdef.lexer(&args[1]); 109 | // Pass the lexer to the parser and lex and parse the input. 110 | let (res, errs) = calc_y::parse(&lexer); 111 | for e in errs { 112 | println!("{}", e.pp(&lexer, &calc_y::token_epp)); 113 | } 114 | match res { 115 | Some(r) => println!("Result: {:?}", r), 116 | _ => eprintln!("Unable to evaluate expression.") 117 | } 118 | } 119 | ``` 120 | 121 | For more information on how to use this library please refer to the [grmtools 122 | book](https://softdevteam.github.io/grmtools/master/book/), which also includes 123 | a more detailed [quickstart 124 | guide](https://softdevteam.github.io/grmtools/master/book/quickstart.html).
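With those three files in place, a session looks roughly like this (output abbreviated; note that `res` is printed with `{:?}`, so a successful parse shows the `Ok` wrapper):

```
$ cargo run '2 + 3 * 4'
Result: Ok(14)
```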
125 | 126 | ## Examples 127 | 128 | [lrpar](https://github.com/softdevteam/grmtools/tree/master/lrpar/examples) 129 | contains several examples on how to use the `lrpar`/`lrlex` libraries, showing 130 | how to generate [parse 131 | trees](https://github.com/softdevteam/grmtools/tree/master/lrpar/examples/calc_parsetree) 132 | and 133 | [ASTs](https://github.com/softdevteam/grmtools/tree/master/lrpar/examples/calc_ast), use 134 | [start conditions/states](https://github.com/softdevteam/grmtools/tree/master/lrpar/examples/start_states) 135 | or [execute 136 | code](https://github.com/softdevteam/grmtools/tree/master/lrpar/examples/calc_actions) 137 | while parsing. 138 | 139 | ## Documentation 140 | 141 | | Latest release | master | 142 | |-----------------------------------------|--------| 143 | | [grmtools book](https://softdevteam.github.io/grmtools/latest_release/book/) | [grmtools book](https://softdevteam.github.io/grmtools/master/book) | 144 | | [cfgrammar](https://docs.rs/cfgrammar/) | [cfgrammar](https://softdevteam.github.io/grmtools/master/api/cfgrammar/) | 145 | | [lrpar](https://docs.rs/lrpar/) | [lrpar](https://softdevteam.github.io/grmtools/master/api/lrpar/) | 146 | | [lrlex](https://docs.rs/lrlex/) | [lrlex](https://softdevteam.github.io/grmtools/master/api/lrlex/) | 147 | | [lrtable](https://docs.rs/lrtable/) | [lrtable](https://softdevteam.github.io/grmtools/master/api/lrtable/) | 148 | 149 | [Documentation for all past and present releases](https://softdevteam.github.io/grmtools/) 150 | -------------------------------------------------------------------------------- /lrpar/cttests/src/grmtools_section.test: -------------------------------------------------------------------------------- 1 | grammar: | 2 | %grmtools{ 3 | yacckind: Grmtools, 4 | recoverer: RecoveryKind::CPCTPlus, 5 | test_files: ["*.input_grmtools_section"] 6 | } 7 | %token MAGIC IDENT NUM STRING 8 | %epp MAGIC "%grmtools" 9 | %% 10 | start -> Result<Header<Span>, Vec<HeaderError<Span>>> 11 | : MAGIC '{' contents '}' { $3 } 12 | ; 13 | 14 | contents -> Result<Header<Span>, Vec<HeaderError<Span>>> 15 | : %empty { Ok(Header::new()) } 16 | | val_seq comma_opt { $1 } 17 | ; 18 | 19 | val_seq -> Result<Header<Span>, Vec<HeaderError<Span>>> 20 | : valbind { 21 | let ((key, key_loc), val) = $1; 22 | let mut ret = Header::<Span>::new(); 23 | match ret.entry(key) { 24 | Entry::Occupied(orig) => { 25 | let HeaderValue(orig_loc, _) : &HeaderValue<Span> = orig.get(); 26 | // One difference between the manually written parser and this 27 | // is we don't try return multiple errors, or coalesce them. 28 | return Err(vec![HeaderError { 29 | kind: HeaderErrorKind::DuplicateEntry, 30 | locations: vec![*orig_loc, key_loc] 31 | }]); 32 | } 33 | Entry::Vacant(entry) => { 34 | entry.insert(HeaderValue(key_loc, val)); 35 | } 36 | } 37 | Ok(ret) 38 | } 39 | | val_seq ',' valbind { 40 | let ((key, key_loc), val) = $3; 41 | let mut ret = $1?; 42 | match ret.entry(key) { 43 | Entry::Occupied(orig) => { 44 | let HeaderValue(orig_loc, _): &HeaderValue<Span> = orig.get(); 45 | // One difference between the manually written parser and this 46 | // is we don't try return multiple errors, or coalesce them.
47 | return Err(vec![HeaderError { 48 | kind: HeaderErrorKind::DuplicateEntry, 49 | locations: vec![*orig_loc, key_loc] 50 | }]); 51 | } 52 | Entry::Vacant(entry) => { 53 | entry.insert(HeaderValue(key_loc, val)); 54 | } 55 | } 56 | Ok(ret) 57 | } 58 | ; 59 | 60 | namespaced -> Namespaced<Span> 61 | : IDENT { 62 | let ident_span = $1.as_ref().unwrap().span(); 63 | let ident = $lexer.span_str(ident_span).to_string().to_lowercase(); 64 | Namespaced{ 65 | namespace: None, 66 | member: (ident, ident_span) 67 | } 68 | } 69 | | IDENT '::' IDENT { 70 | let namespace_span = $1.as_ref().unwrap().span(); 71 | let namespace = $lexer.span_str(namespace_span).to_string().to_lowercase(); 72 | 73 | let ident_span = $3.as_ref().unwrap().span(); 74 | let ident = $lexer.span_str(ident_span).to_string().to_lowercase(); 75 | Namespaced { 76 | namespace: Some((namespace, namespace_span)), 77 | member: (ident, ident_span) 78 | } 79 | } 80 | ; 81 | 82 | valbind -> ((String, Span), Value<Span>) 83 | : IDENT ':' val { 84 | let key_span = $1.as_ref().unwrap().span(); 85 | let key = $lexer.span_str(key_span).to_string().to_lowercase(); 86 | ((key, key_span), Value::Setting($3)) 87 | } 88 | | IDENT { 89 | let key_span = $1.as_ref().unwrap().span(); 90 | let key = $lexer.span_str(key_span).to_string().to_lowercase(); 91 | ((key, key_span), Value::Flag(true, key_span)) 92 | } 93 | | '!' IDENT { 94 | let bang_span = $1.as_ref().unwrap().span(); 95 | let key_span = $2.as_ref().unwrap().span(); 96 | let key = $lexer.span_str(key_span).to_string().to_lowercase(); 97 | ((key, key_span), Value::Flag(false, Span::new(bang_span.start(), key_span.end()))) 98 | } 99 | ; 100 | 101 | val -> Setting<Span> 102 | : namespaced { Setting::Unitary($1) } 103 | | NUM { 104 | let num_span = $1.as_ref().unwrap().span(); 105 | let n = str::parse::<u64>($lexer.span_str(num_span)); 106 | Setting::Num(n.expect("convertible"), num_span) 107 | } 108 | | STRING { 109 | let string_span = $1.as_ref().unwrap().span(); 110 | // Trim the leading and trailing " characters. 111 | let string_span = Span::new(string_span.start() + 1, string_span.end() - 1); 112 | let s = $lexer.span_str(string_span).to_string(); 113 | Setting::String(s, string_span) 114 | } 115 | | namespaced '(' namespaced ')' { Setting::Constructor{ctor: $1, arg: $3} } 116 | | '[' array_seq ']' { Setting::Array($2, $1.as_ref().unwrap().span(), $3.as_ref().unwrap().span()) } 117 | ; 118 | 119 | array_seq -> Vec<Setting<Span>> 120 | : %empty { Vec::new() } 121 | | val { 122 | vec![$1] 123 | } 124 | | array_seq ',' val { 125 | $1.push($3); 126 | $1 127 | } 128 | ; 129 | comma_opt -> () 130 | : %empty { } 131 | | ',' { } 132 | ; 133 | %% 134 | #![allow(dead_code)] 135 | #![allow(unused)] 136 | 137 | use cfgrammar::{ 138 | Span, 139 | header::{ 140 | Value, 141 | Setting, 142 | HeaderError, 143 | HeaderErrorKind, 144 | Namespaced, 145 | Header, 146 | HeaderValue, 147 | }, 148 | markmap::Entry, 149 | }; 150 | 151 | lexer: | 152 | %grmtools{case_insensitive} 153 | %% 154 | %grmtools 'MAGIC' 155 | ! '!'
156 | [A-Z][A-Z_]* 'IDENT' 157 | [0-9]+ 'NUM' 158 | , ',' 159 | \{ '{' 160 | \} '}' 161 | \( '(' 162 | \) ')' 163 | \[ '[' 164 | \] ']' 165 | :: '::' 166 | : ':' 167 | \"(\\.|[^"\\])*\" 'STRING' 168 | \p{Pattern_White_Space} ; 169 | extra_files: 170 | test.input_grmtools_section: | 171 | %grmtools{yacckind: Grmtools, !b, !a} 172 | -------------------------------------------------------------------------------- /lrlex/src/main.rs: -------------------------------------------------------------------------------- 1 | use getopts::Options; 2 | use std::{ 3 | env, 4 | error::Error, 5 | fmt, 6 | fs::File, 7 | io::{Read, Write, stderr, stdin}, 8 | path::Path, 9 | process, 10 | }; 11 | 12 | use cfgrammar::header::{GrmtoolsSectionParser, HeaderValue}; 13 | use lrlex::{DefaultLexerTypes, LRNonStreamingLexerDef, LexFlags, LexerDef, LexerKind}; 14 | use lrpar::{ 15 | Lexeme, Lexer, 16 | diagnostics::{DiagnosticFormatter, SpannedDiagnosticFormatter}, 17 | }; 18 | 19 | const ERROR: &str = "[Error]"; 20 | 21 | /// A string which uses `Display` for its `Debug` impl. 22 | struct ErrorString(String); 23 | impl fmt::Display for ErrorString { 24 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 25 | let ErrorString(s) = self; 26 | write!(f, "{}", s) 27 | } 28 | } 29 | impl fmt::Debug for ErrorString { 30 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 31 | let ErrorString(s) = self; 32 | write!(f, "{}", s) 33 | } 34 | } 35 | impl Error for ErrorString {} 36 | 37 | fn usage(prog: &str, msg: &str) { 38 | let path = Path::new(prog); 39 | let leaf = match path.file_name() { 40 | Some(m) => m.to_str().unwrap(), 41 | None => "lrlex", 42 | }; 43 | if !msg.is_empty() { 44 | writeln!(stderr(), "{}", msg).ok(); 45 | } 46 | writeln!(stderr(), "Usage: {} <lexer.l> <input file>", leaf).ok(); 47 | process::exit(1); 48 | } 49 | 50 | fn read_file(path: &str) -> String { 51 | let mut s = String::new(); 52 | if path == "-" { 53 | stdin().read_to_string(&mut s).unwrap(); 54 | return s; 55 | } 56 | let mut f = match File::open(path) { 57 | Ok(r) => r, 58 | Err(e) => { 59 | writeln!(stderr(), "Can't open file {}: {}", path, e).ok(); 60 | process::exit(1); 61 | } 62 | }; 63 | f.read_to_string(&mut s).unwrap(); 64 | s 65 | } 66 | 67 | fn main() -> Result<(), Box<dyn Error>> { 68 | let args: Vec<String> = env::args().collect(); 69 | let prog = args[0].clone(); 70 | let matches = match Options::new().optflag("h", "help", "").parse(&args[1..]) { 71 | Ok(m) => m, 72 | Err(f) => { 73 | usage(&prog, f.to_string().as_str()); 74 | return Ok(()); 75 | } 76 | }; 77 | if matches.opt_present("h") || matches.free.len() != 2 { 78 | usage(&prog, ""); 79 | return Ok(()); 80 | } 81 | 82 | let lex_l_path = &matches.free[0]; 83 | let lex_src = read_file(lex_l_path); 84 | let lex_diag = SpannedDiagnosticFormatter::new(&lex_src, Path::new(lex_l_path)); 85 | let (mut header, _) = match GrmtoolsSectionParser::new(&lex_src, false).parse() { 86 | Ok(x) => x, 87 | Err(es) => { 88 | eprintln!( 89 | "\n{ERROR}{}", 90 | lex_diag.file_location_msg(" parsing the `%grmtools` section", None) 91 | ); 92 | for e in es { 93 | eprintln!( 94 | "{}", 95 | &indent(" ", &lex_diag.format_error(e).to_string()) 96 | ); 97 | } 98 | process::exit(1); 99 | } 100 | }; 101 | header.mark_used(&"lexerkind".to_string()); 102 | let lexerkind = if let Some(HeaderValue(_, lk_val)) = header.get("lexerkind") { 103 | LexerKind::try_from(lk_val)?
104 | } else { 105 | LexerKind::LRNonStreamingLexer 106 | }; 107 | 108 | let lexerdef = match lexerkind { 109 | LexerKind::LRNonStreamingLexer => { 110 | let lex_flags = LexFlags::try_from(&mut header)?; 111 | match LRNonStreamingLexerDef::<DefaultLexerTypes<u32>>::new_with_options( 112 | &lex_src, lex_flags, 113 | ) { 114 | Ok(x) => x, 115 | Err(errs) => { 116 | eprintln!("\n{ERROR}{}", lex_diag.file_location_msg("", None)); 117 | for e in errs { 118 | eprintln!( 119 | "{}", 120 | &indent(" ", &lex_diag.format_error(e).to_string()) 121 | ); 122 | } 123 | process::exit(1); 124 | } 125 | } 126 | } 127 | _ => { 128 | return Err(ErrorString("Unrecognized lexer kind".to_string()))?; 129 | } 130 | }; 131 | { 132 | let unused_header_values = header.unused(); 133 | if !unused_header_values.is_empty() { 134 | Err(ErrorString(format!( 135 | "Unused header values: {}", 136 | unused_header_values.join(", ") 137 | )))? 138 | } 139 | } 140 | let input = &read_file(&matches.free[1]); 141 | for r in lexerdef.lexer(input).iter() { 142 | match r { 143 | Ok(l) => println!( 144 | "{} {}", 145 | lexerdef.get_rule_by_id(l.tok_id()).name().unwrap(), 146 | &input[l.span().start()..l.span().end()] 147 | ), 148 | Err(e) => { 149 | println!("{:?}", e); 150 | process::exit(1); 151 | } 152 | } 153 | } 154 | Ok(()) 155 | } 156 | 157 | /// Indents a multi-line string and trims any trailing newline. 158 | /// This currently assumes that indentation on blank lines does not matter. 159 | /// 160 | /// The algorithm used by this function is: 161 | /// 1. Prefix `s` with the indentation, indenting the first line. 162 | /// 2. Trim any trailing newlines. 163 | /// 3. Replace all newlines with `\n{indent}` to indent all lines after the first. 164 | /// 165 | /// It is plausible that we should add a step 4, but currently do not: 166 | /// 4. Replace all `\n{indent}\n` with `\n\n` 167 | fn indent(indent: &str, s: &str) -> String { 168 | format!("{indent}{}\n", s.trim_end_matches('\n')).replace('\n', &format!("\n{}", indent)) 169 | } 170 | -------------------------------------------------------------------------------- /lrpar/README.md: -------------------------------------------------------------------------------- 1 | # `lrpar` 2 | 3 | `lrpar` provides a Yacc-compatible parser (where grammars can be generated at 4 | compile-time or run-time). It can take in traditional `.y` files and convert 5 | them into an idiomatic Rust parser. 6 | 7 | If you're new to `lrpar`, please read the "quick start guide". The "grmtools 8 | book" and API reference have more detailed information. You can find the 9 | appropriate documentation for the version of lrpar you are using here: 10 | 11 | | Latest release | master | 12 | |-----------------------------------------|--------| 13 | | [Quickstart guide](https://softdevteam.github.io/grmtools/latest_release/book/quickstart.html) | [Quickstart guide](https://softdevteam.github.io/grmtools/master/book/quickstart.html) | 14 | | [grmtools book](https://softdevteam.github.io/grmtools/latest_release/book/) | [grmtools book](https://softdevteam.github.io/grmtools/master/book) | 15 | | [lrpar API](https://docs.rs/lrpar/) | [lrpar API](https://softdevteam.github.io/grmtools/master/api/lrpar/) | 16 | 17 | [Documentation for all past and present releases](https://softdevteam.github.io/grmtools/) 18 | 19 | 20 | ## Example 21 | 22 | Let's assume we want to statically generate a parser for a simple calculator 23 | language (and let's also assume we are able to use 24 | [`lrlex`](https://crates.io/crates/lrlex) for the lexer).
We need to add a 25 | `build.rs` file to our project which statically compiles both the lexer and 26 | parser. While we can perform both steps individually, it's easiest to use 27 | `lrlex` which does both jobs for us in one go. Our `build.rs` file thus looks 28 | as follows: 29 | 30 | ```rust 31 | use cfgrammar::yacc::YaccKind; 32 | use lrlex::CTLexerBuilder; 33 | 34 | fn main() { 35 | CTLexerBuilder::new() 36 | .lrpar_config(|ctp| { 37 | ctp.yacckind(YaccKind::Grmtools) 38 | .grammar_in_src_dir("calc.y") 39 | .unwrap() 40 | }) 41 | .lexer_in_src_dir("calc.l") 42 | .unwrap() 43 | .build() 44 | .unwrap(); 45 | } 46 | ``` 47 | 48 | where `src/calc.l` is as follows: 49 | 50 | ``` 51 | %% 52 | [0-9]+ "INT" 53 | \+ "+" 54 | \* "*" 55 | \( "(" 56 | \) ")" 57 | [\t ]+ ; 58 | ``` 59 | 60 | and `src/calc.y` is as follows: 61 | 62 | ``` 63 | %start Expr 64 | %avoid_insert "INT" 65 | %% 66 | Expr -> Result<u64, ()>: 67 | Expr '+' Term { Ok($1? + $3?) } 68 | | Term { $1 } 69 | ; 70 | 71 | Term -> Result<u64, ()>: 72 | Term '*' Factor { Ok($1? * $3?) } 73 | | Factor { $1 } 74 | ; 75 | 76 | Factor -> Result<u64, ()>: 77 | '(' Expr ')' { $2 } 78 | | 'INT' 79 | { 80 | let v = $1.map_err(|_| ())?; 81 | parse_int($lexer.span_str(v.span())) 82 | } 83 | ; 84 | %% 85 | // Any functions here are in scope for all the grammar actions above. 86 | 87 | fn parse_int(s: &str) -> Result<u64, ()> { 88 | match s.parse::<u64>() { 89 | Ok(val) => Ok(val), 90 | Err(_) => { 91 | eprintln!("{} cannot be represented as a u64", s); 92 | Err(()) 93 | } 94 | } 95 | } 96 | ``` 97 | 98 | Because we specified that our Yacc file is in `Grmtools` format, each rule has a 99 | separate Rust type to which all its functions conform (in this case, all the 100 | rules have the same type, but that's not a requirement). 101 | 102 | A simple `src/main.rs` is as follows: 103 | 104 | ```rust 105 | use std::io::{self, BufRead, Write}; 106 | 107 | use lrlex::lrlex_mod; 108 | use lrpar::lrpar_mod; 109 | 110 | // Using `lrlex_mod!` brings the lexer for `calc.l` into scope. 111 | lrlex_mod!("calc.l"); 112 | // Using `lrpar_mod!` brings the parser for `calc.y` into scope. 113 | lrpar_mod!("calc.y"); 114 | 115 | fn main() { 116 | // Get the `LexerDef` for the `calc` language. 117 | let lexerdef = calc_l::lexerdef(); 118 | let stdin = io::stdin(); 119 | loop { 120 | print!(">>> "); 121 | io::stdout().flush().ok(); 122 | match stdin.lock().lines().next() { 123 | Some(Ok(ref l)) => { 124 | if l.trim().is_empty() { 125 | continue; 126 | } 127 | // Now we create a lexer with the `lexer` method with which 128 | // we can lex an input. 129 | let lexer = lexerdef.lexer(l); 130 | // Pass the lexer to the parser and lex and parse the input. 131 | let (res, errs) = calc_y::parse(&lexer); 132 | for e in errs { 133 | println!("{}", e.pp(&lexer, &calc_y::token_epp)); 134 | } 135 | match res { 136 | Some(Ok(r)) => println!("Result: {}", r), 137 | _ => eprintln!("Unable to evaluate expression.") 138 | } 139 | } 140 | _ => break 141 | } 142 | } 143 | } 144 | ``` 145 | 146 | We can now `cargo run` our project and evaluate simple expressions: 147 | 148 | ``` 149 | >>> 2 + 3 150 | Result: 5 151 | >>> 2 + 3 * 4 152 | Result: 14 153 | >>> (2 + 3) * 4 154 | Result: 20 155 | ``` 156 | 157 | `lrpar` also comes with advanced [error 158 | recovery](https://softdevteam.github.io/grmtools/master/book/errorrecovery.html) built-in: 159 | 160 | ``` 161 | >>> 2 + + 3 162 | Parsing error at line 1 column 5.
Repair sequences found: 163 | 1: Delete + 164 | 2: Insert INT 165 | Result: 5 166 | >>> 2 + 3 3 167 | Parsing error at line 1 column 7. Repair sequences found: 168 | 1: Insert * 169 | 2: Insert + 170 | 3: Delete 3 171 | Result: 11 172 | >>> 2 + 3 4 5 173 | Parsing error at line 1 column 7. Repair sequences found: 174 | 1: Insert *, Delete 4 175 | 2: Insert +, Delete 4 176 | 3: Delete 4, Delete 5 177 | 4: Insert +, Shift 4, Delete 5 178 | 5: Insert +, Shift 4, Insert + 179 | 6: Insert *, Shift 4, Delete 5 180 | 7: Insert *, Shift 4, Insert * 181 | 8: Insert *, Shift 4, Insert + 182 | 9: Insert +, Shift 4, Insert * 183 | Result: 17 184 | ``` 185 | -------------------------------------------------------------------------------- /doc/src/ast_example.md: -------------------------------------------------------------------------------- 1 | # An AST evaluator 2 | 3 | We now know enough to put together a more sophisticated version of our simple 4 | calculator example that builds an Abstract Syntax Tree (AST) while parsing, 5 | which is then evaluated separately. This models a common way of building real 6 | compilers. The full example code can be found at 7 | [https://github.com/softdevteam/grmtools/tree/master/lrpar/examples/calc_ast](https://github.com/softdevteam/grmtools/tree/master/lrpar/examples/calc_ast). 8 | 9 | The `calc.l` file remains unchanged from that in the [Quickstart 10 | guide](quickstart.md). However, the `calc.y` file is changed as follows: 11 | 12 | 13 | ```rust,noplaypen 14 | %start Expr 15 | %avoid_insert "INT" 16 | %% 17 | Expr -> Result<Expr, ()>: 18 | Expr '+' Term { Ok(Expr::Add{ span: $span, lhs: Box::new($1?), rhs: Box::new($3?) }) } 19 | | Term { $1 } 20 | ; 21 | 22 | Term -> Result<Expr, ()>: 23 | Term '*' Factor { Ok(Expr::Mul{ span: $span, lhs: Box::new($1?), rhs: Box::new($3?) }) } 24 | | Factor { $1 } 25 | ; 26 | 27 | Factor -> Result<Expr, ()>: 28 | '(' Expr ')' { $2 } 29 | | 'INT' { Ok(Expr::Number{ span: $span }) } 30 | ; 31 | %% 32 | 33 | use cfgrammar::Span; 34 | 35 | #[derive(Debug)] 36 | pub enum Expr { 37 | Add { 38 | span: Span, 39 | lhs: Box<Expr>, 40 | rhs: Box<Expr>, 41 | }, 42 | Mul { 43 | span: Span, 44 | lhs: Box<Expr>, 45 | rhs: Box<Expr>, 46 | }, 47 | Number { 48 | span: Span 49 | } 50 | } 51 | ``` 52 | 53 | The most obvious difference here is that we have defined a simple `enum` `Expr`, 54 | with three variants, for our AST. Each AST variant also records a `Span` which 55 | records how much input the AST element covers. By using the 56 | [`$span`](actioncode.md) variable we can ensure that AST elements record their 57 | relationship to portions of the user's input that span multiple tokens (e.g. 58 | for the expression `1 + 2` the resulting `Expr::Add` will have a `Span` 59 | starting at byte index 0 and ending at byte index 5 -- in other words covering 60 | the complete input string in this case). 61 | 62 | After parsing, we thus end up with a `Result<Expr, ()>`. In the case of a 63 | successful parse, this will give us an arbitrarily deeply nested `Expr`.
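To make the AST shape concrete, the following sketch (not part of the example's code) shows the value the parser produces for the input `2 + 3 * 4`; the `Span::new(start, end)` values are the byte ranges each node covers:

```rust,noplaypen
// A sketch of the AST for "2 + 3 * 4" (spans are byte offsets into the input).
Expr::Add {
    span: Span::new(0, 9),                                     // "2 + 3 * 4"
    lhs: Box::new(Expr::Number { span: Span::new(0, 1) }),     // "2"
    rhs: Box::new(Expr::Mul {
        span: Span::new(4, 9),                                 // "3 * 4"
        lhs: Box::new(Expr::Number { span: Span::new(4, 5) }), // "3"
        rhs: Box::new(Expr::Number { span: Span::new(8, 9) }), // "4"
    }),
}
```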
64 | 65 | Our `main.rs` file then looks as follows: 66 | 67 | ```rust,noplaypen 68 | use std::io::{self, BufRead, Write}; 69 | 70 | use lrlex::{lrlex_mod, DefaultLexeme, LRLexError}; 71 | use lrpar::{lrpar_mod, NonStreamingLexer, Span}; 72 | 73 | lrlex_mod!("calc.l"); 74 | lrpar_mod!("calc.y"); 75 | 76 | use calc_y::Expr; 77 | 78 | fn main() { 79 | let lexerdef = calc_l::lexerdef(); 80 | let stdin = io::stdin(); 81 | loop { 82 | print!(">>> "); 83 | io::stdout().flush().ok(); 84 | match stdin.lock().lines().next() { 85 | Some(Ok(ref l)) => { 86 | if l.trim().is_empty() { 87 | continue; 88 | } 89 | let lexer = lexerdef.lexer(l); 90 | let (res, errs) = calc_y::parse(&lexer); 91 | for e in errs { 92 | println!("{}", e.pp(&lexer, &calc_y::token_epp)); 93 | } 94 | if let Some(Ok(r)) = res { 95 | // We have a successful parse. 96 | match eval(&lexer, r) { 97 | Ok(i) => println!("Result: {}", i), 98 | Err((span, msg)) => { 99 | let ((line, col), _) = lexer.line_col(span); 100 | eprintln!( 101 | "Evaluation error at line {} column {}, '{}' {}.", 102 | line, 103 | col, 104 | lexer.span_str(span), 105 | msg 106 | ) 107 | } 108 | } 109 | } 110 | } 111 | _ => break 112 | } 113 | } 114 | } 115 | 116 | fn eval( 117 | lexer: &dyn NonStreamingLexer<DefaultLexeme<u32>, u32, LRLexError>, 118 | e: Expr) 119 | -> Result<u64, (Span, &'static str)> 120 | { 121 | match e { 122 | Expr::Add { span, lhs, rhs } => eval(lexer, *lhs)? 123 | .checked_add(eval(lexer, *rhs)?) 124 | .ok_or((span, "overflowed")), 125 | Expr::Mul { span, lhs, rhs } => eval(lexer, *lhs)? 126 | .checked_mul(eval(lexer, *rhs)?) 127 | .ok_or((span, "overflowed")), 128 | Expr::Number { span } => lexer 129 | .span_str(span) 130 | .parse::<u64>() 131 | .map_err(|_| (span, "cannot be represented as a u64")) 132 | } 133 | } 134 | ``` 135 | 136 | Let's start by running this and seeing what happens: 137 | 138 | ``` 139 | >>> 2+3*4 140 | Result: 14 141 | >>> 2++3*4 142 | Parsing error at line 1 column 3. Repair sequences found: 143 | 1: Delete + 144 | 2: Insert INT 145 | Result: 14 146 | >>> 999999*888888 + 777777*666666 147 | Result: 1407404592594 148 | >>> 9999999999*8888888888 + 7777777777*6666666666 149 | Evaluation error at line 1 column 6, '9999999999*8888888888' overflowed. 150 | ``` 151 | 152 | The first three expressions evaluate just as before. However, the fourth is 153 | interesting: we have explicitly captured the fact that the result of 154 | `9999999999*8888888888` is too big to fit into a `u64`; not only have we 155 | told the user at which character of the input the error starts, but we've printed out the 156 | precise sub-part of the input which caused that error. This works even when 157 | it's in the middle of the input: 158 | 159 | ``` 160 | >>> 10 + 9999999999*8888888888 + 20 161 | Evaluation error at line 1 column 6, '9999999999*8888888888' overflowed. 162 | ``` 163 | 164 | The key to this is that each AST element knows the `$span` of the production it 165 | is related to; and the user's input for the resulting `Span` can be extracted with 166 | `lexer.span_str(span)`. 167 | 168 | Happily, this facility composes nicely with error recovery: 169 | 170 | ``` 171 | >>> 10 ++ 9999999999*8888888888 + 20 172 | Parsing error at line 1 column 5. Repair sequences found: 173 | 1: Delete + 174 | 2: Insert INT 175 | Evaluation error at line 1 column 7, '9999999999*8888888888' overflowed.
176 | ``` 177 | -------------------------------------------------------------------------------- /nimbleparse/README.md: -------------------------------------------------------------------------------- 1 | # `nimbleparse` 2 | 3 | `nimbleparse` is a simple grammar debugging aid. It takes as input a Lex 4 | specification, a Yacc specification, and an input file and prints any warnings 5 | about the specifications (e.g. shift/reduce errors) as well as the resulting 6 | parse tree to stdout. If the parse is unsuccessful it will report parsing 7 | errors and, when possible, fixes. If parsing is successful, `nimbleparse` exits 8 | with 0; if an error is detected it exits with 1. 9 | 10 | The full command-line specification is as follows: 11 | 12 | ``` 13 | nimbleparse [-r <cpctplus|none>] [-y <eco|grmtools|original>] [-q] <lexer.l> <parser.y> <input file> 14 | ``` 15 | 16 | where: 17 | 18 | * `-r` selects the recovery algorithm to be used. Defaults to `cpctplus`. 19 | * `-y` selects the Yacc variant to be used. Defaults to `original`. 20 | * `-q` prevents warnings (e.g. shift/reduce errors) from being reported. 21 | 22 | You can use your own Lex/Yacc files. A small repository of example grammars can 23 | be found at https://github.com/softdevteam/grammars/. 24 | 25 | An example invocation is as follows: 26 | 27 | ``` 28 | $ cat Hello.java 29 | class Hello { 30 | public static void main(String[] args) { 31 | System.out.println("Hello world"); 32 | } 33 | } 34 | $ nimbleparse java7.l java7.y Hello.java 35 | goal 36 | compilation_unit 37 | type_declarations_opt 38 | type_declarations 39 | type_declaration 40 | class_declaration 41 | modifiers_opt 42 | CLASS class 43 | IDENTIFIER Hello 44 | type_parameters_opt 45 | super_opt 46 | interfaces_opt 47 | class_body 48 | LBRACE { 49 | class_body_declarations_opt 50 | class_body_declarations 51 | class_body_declaration 52 | class_member_declaration 53 | method_declaration 54 | method_header 55 | modifiers_opt 56 | modifiers 57 | modifiers 58 | modifier 59 | PUBLIC public 60 | modifier 61 | STATIC static 62 | VOID void 63 | method_declarator 64 | IDENTIFIER main 65 | LPAREN ( 66 | formal_parameter_list_opt 67 | formal_parameter_list 68 | formal_parameter 69 | type 70 | reference_type 71 | array_type 72 | name 73 | simple_name 74 | IDENTIFIER String 75 | dims 76 | LBRACK [ 77 | RBRACK ] 78 | variable_declarator_id 79 | IDENTIFIER args 80 | RPAREN ) 81 | throws_opt 82 | method_body 83 | block 84 | LBRACE { 85 | block_statements_opt 86 | block_statements 87 | block_statement 88 | statement 89 | statement_without_trailing_substatement 90 | expression_statement 91 | statement_expression 92 | method_invocation 93 | qualified_name 94 | name 95 | qualified_name 96 | name 97 | simple_name 98 | IDENTIFIER System 99 | DOT . 100 | IDENTIFIER out 101 | DOT .
102 | IDENTIFIER println 103 | LPAREN ( 104 | argument_list_opt 105 | argument_list 106 | expression 107 | assignment_expression 108 | conditional_expression 109 | conditional_or_expression 110 | conditional_and_expression 111 | inclusive_or_expression 112 | exclusive_or_expression 113 | and_expression 114 | equality_expression 115 | instanceof_expression 116 | relational_expression 117 | shift_expression 118 | additive_expression 119 | multiplicative_expression 120 | unary_expression 121 | unary_expression_not_plus_minus 122 | postfix_expression 123 | primary 124 | primary_no_new_array 125 | literal 126 | STRING_LITERAL "Hello world" 127 | RPAREN ) 128 | SEMICOLON ; 129 | RBRACE } 130 | RBRACE } 131 | $ cat SyntaxError.java 132 | class SyntaxError { 133 | int x y; 134 | } 135 | $ nimbleparse java7.l java7.y SyntaxError.java 136 | goal 137 | compilation_unit 138 | type_declarations_opt 139 | type_declarations 140 | type_declaration 141 | class_declaration 142 | modifiers_opt 143 | CLASS class 144 | IDENTIFIER SyntaxError 145 | type_parameters_opt 146 | super_opt 147 | interfaces_opt 148 | class_body 149 | LBRACE { 150 | class_body_declarations_opt 151 | class_body_declarations 152 | class_body_declaration 153 | class_member_declaration 154 | field_declaration 155 | modifiers_opt 156 | type 157 | primitive_type 158 | numeric_type 159 | integral_type 160 | INT int 161 | variable_declarators 162 | variable_declarators 163 | variable_declarator 164 | variable_declarator_id 165 | IDENTIFIER x 166 | COMMA 167 | variable_declarator 168 | variable_declarator_id 169 | IDENTIFIER y 170 | SEMICOLON ; 171 | RBRACE } 172 | 173 | Parsing error at line 2 column 11. Repair sequences found: 174 | 1: Insert , 175 | 2: Insert = 176 | 3: Delete y 177 | ``` 178 | -------------------------------------------------------------------------------- /doc/src/nimbleparse.md: -------------------------------------------------------------------------------- 1 | # nimbleparse 2 | 3 | `nimbleparse` is a simple grammar debugging aid. It takes as input a Lex 4 | specification, a Yacc specification, and an input file and prints any warnings 5 | about the specifications (e.g. shift/reduce errors) as well as the resulting 6 | parse tree to stdout. If the parse is unsuccessful it will report parsing 7 | errors and, when possible, fixes. If parsing is successful, `nimbleparse` exits 8 | with 0; if an error is detected it exits with 1. 9 | 10 | The full command-line specification is as follows: 11 | 12 | ``` 13 | nimbleparse [-r <cpctplus|none>] [-y <eco|grmtools|original>] [-q] <lexer.l> <parser.y> <input file> 14 | ``` 15 | 16 | where: 17 | 18 | * `-r` selects the recovery algorithm to be used. Defaults to `cpctplus`. 19 | * `-y` selects the Yacc variant to be used. Defaults to `original`. 20 | * `-q` prevents warnings (e.g. shift/reduce errors) from being reported. 21 | 22 | You can use your own Lex/Yacc files. A small repository of example grammars can 23 | be found at [https://github.com/softdevteam/grammars/](https://github.com/softdevteam/grammars/).
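For instance, to check a grammar written in grmtools's own Yacc variant with error recovery switched off, an invocation might look as follows (a hypothetical example: `calc.l`, `calc.y`, and `input.txt` stand in for your own files):

```
$ nimbleparse -y grmtools -r none calc.l calc.y input.txt
```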
24 | 25 | An example invocation is as follows: 26 | 27 | ``` 28 | $ cat Hello.java 29 | class Hello { 30 | public static void main(String[] args) { 31 | System.out.println("Hello world"); 32 | } 33 | } 34 | $ nimbleparse java7.l java7.y Hello.java 35 | goal 36 | compilation_unit 37 | type_declarations_opt 38 | type_declarations 39 | type_declaration 40 | class_declaration 41 | modifiers_opt 42 | CLASS class 43 | IDENTIFIER Hello 44 | type_parameters_opt 45 | super_opt 46 | interfaces_opt 47 | class_body 48 | LBRACE { 49 | class_body_declarations_opt 50 | class_body_declarations 51 | class_body_declaration 52 | class_member_declaration 53 | method_declaration 54 | method_header 55 | modifiers_opt 56 | modifiers 57 | modifiers 58 | modifier 59 | PUBLIC public 60 | modifier 61 | STATIC static 62 | VOID void 63 | method_declarator 64 | IDENTIFIER main 65 | LPAREN ( 66 | formal_parameter_list_opt 67 | formal_parameter_list 68 | formal_parameter 69 | type 70 | reference_type 71 | array_type 72 | name 73 | simple_name 74 | IDENTIFIER String 75 | dims 76 | LBRACK [ 77 | RBRACK ] 78 | variable_declarator_id 79 | IDENTIFIER args 80 | RPAREN ) 81 | throws_opt 82 | method_body 83 | block 84 | LBRACE { 85 | block_statements_opt 86 | block_statements 87 | block_statement 88 | statement 89 | statement_without_trailing_substatement 90 | expression_statement 91 | statement_expression 92 | method_invocation 93 | qualified_name 94 | name 95 | qualified_name 96 | name 97 | simple_name 98 | IDENTIFIER System 99 | DOT . 100 | IDENTIFIER out 101 | DOT . 102 | IDENTIFIER println 103 | LPAREN ( 104 | argument_list_opt 105 | argument_list 106 | expression 107 | assignment_expression 108 | conditional_expression 109 | conditional_or_expression 110 | conditional_and_expression 111 | inclusive_or_expression 112 | exclusive_or_expression 113 | and_expression 114 | equality_expression 115 | instanceof_expression 116 | relational_expression 117 | shift_expression 118 | additive_expression 119 | multiplicative_expression 120 | unary_expression 121 | unary_expression_not_plus_minus 122 | postfix_expression 123 | primary 124 | primary_no_new_array 125 | literal 126 | STRING_LITERAL "Hello world" 127 | RPAREN ) 128 | SEMICOLON ; 129 | RBRACE } 130 | RBRACE } 131 | $ cat SyntaxError.java 132 | class SyntaxError { 133 | int x y; 134 | } 135 | $ nimbleparse java7.l java7.y SyntaxError.java 136 | goal 137 | compilation_unit 138 | type_declarations_opt 139 | type_declarations 140 | type_declaration 141 | class_declaration 142 | modifiers_opt 143 | CLASS class 144 | IDENTIFIER SyntaxError 145 | type_parameters_opt 146 | super_opt 147 | interfaces_opt 148 | class_body 149 | LBRACE { 150 | class_body_declarations_opt 151 | class_body_declarations 152 | class_body_declaration 153 | class_member_declaration 154 | field_declaration 155 | modifiers_opt 156 | type 157 | primitive_type 158 | numeric_type 159 | integral_type 160 | INT int 161 | variable_declarators 162 | variable_declarators 163 | variable_declarator 164 | variable_declarator_id 165 | IDENTIFIER x 166 | COMMA 167 | variable_declarator 168 | variable_declarator_id 169 | IDENTIFIER y 170 | SEMICOLON ; 171 | RBRACE } 172 | 173 | Parsing error at line 2 column 11.
Repair sequences found: 174 | 1: Insert , 175 | 2: Insert = 176 | 3: Delete y 177 | ``` 178 | -------------------------------------------------------------------------------- /doc/src/parsing_idioms.md: -------------------------------------------------------------------------------- 1 | # grmtools parsing idioms 2 | 3 | grmtools is a flexible tool and can be used in many ways. However, for those 4 | using the `Grmtools` format, the simple idioms below can often make life easier. 5 | 6 | 7 | ## Return `Span`s when possible 8 | 9 | When executing grammar actions one is often building up an Abstract Syntax Tree 10 | (AST) or equivalent. For example consider a simple language with assignments: 11 | 12 | ``` 13 | Assign: "ID" "=" Expr; 14 | ``` 15 | 16 | Perhaps the "obvious" way to build this into an AST is to extract the string 17 | representing the identifier as follows: 18 | 19 | ```rust,noplaypen 20 | Assign -> ASTAssign: "ID" "=" Expr 21 | { 22 | let id = $lexer.span_str($1.as_ref().unwrap().span()).to_string(); 23 | ASTAssign::new(id, $3) 24 | } 25 | 26 | %% 27 | 28 | struct ASTAssign { 29 | id: String, expr: Expr 30 | } 31 | 32 | impl ASTAssign { 33 | fn new(id: String, expr: Expr) -> Self { 34 | ASTAssign { id, expr } 35 | } 36 | } 37 | ``` 38 | 39 | This approach is easy to work with, but isn't as performant as may be desired: 40 | the `to_string` call allocates memory and copies part of the user's input into 41 | it. It also loses information about the part of the user's input that the 42 | string relates to. 43 | 44 | An alternative approach is not to convert the lexeme into a `String` during 45 | parsing, but simply to return a 46 | [`Span`](https://docs.rs/lrpar/~0/lrpar/struct.Span.html). An outline of this 47 | is as follows: 48 | 49 | ```rust,noplaypen 50 | Assign -> ASTAssign: "ID" "=" Expr 51 | { 52 | ASTAssign { id: $1.as_ref().unwrap().span(), expr: Box::new($3) } 53 | } 54 | 55 | %% 56 | 57 | type StorageT = u32; 58 | 59 | struct ASTAssign { 60 | id: Span, 61 | expr: Box<Expr> 62 | } 63 | 64 | enum Expr { ... } 65 | ``` 66 | 67 | If this is not quite what you want to do, you can use largely the same trick with 68 | the [`Lexeme`](https://docs.rs/lrpar/~0/lrpar/lex/struct.Lexeme.html) `struct`. 69 | Working with `Lexeme`s has the advantage that you can tell what the type of the 70 | lexeme in question is, though generally this is entirely clear from AST 71 | context, and `Lexeme`'s type parameter makes it marginally more fiddly to work 72 | with than `Span`. 73 | 74 | Alternatively, if you really want to extract strings during parsing, consider 75 | using the `'input` lifetime to extract `&str`s, since this does not 76 | cause any additional memory to be allocated. 77 | 78 | 79 | ## Have rules return a `Result` type 80 | 81 | As described in the [error recovery 82 | section](errorrecovery.html#a-rule-of-thumb-have-rules-return-a-result-type), it 83 | is generally a good idea to give rules a `Result` return type as this allows 84 | you to easily stop, or change, action code execution if you encounter 85 | "important" inserted lexemes. There are many ways that you can use this, but 86 | many simple cases work well using either: 87 | 88 | * `Err(())` works well if you are creating a parse tree and simply want to 89 | stop creating the tree when you encounter an important inserted lexeme. 90 | 91 | * `Err(Box<dyn Error>)` works well if you are performing more detailed 92 | evaluation while parsing and wish to explain to the user why you stopped 93 | evaluating when you encountered an important inserted lexeme.
94 | 95 | 96 | ### Using `Err(())` 97 | 98 | The idea here is that we stop evaluating normal action code by returning 99 | `Err(())`. However, this can lead to endless instances of the following 100 | `map_err` idiom: 101 | 102 | ```rust,noplaypen 103 | R -> Result<..., ()>: 104 | "ID" { $1.map_err(|_| ())? } 105 | ; 106 | ``` 107 | 108 | It can be helpful to define a custom `map_err` function which hides some of this 109 | mess for you: 110 | 111 | ```rust,noplaypen 112 | R -> Result<Lexeme<StorageT>, ()>: 113 | "ID" { map_err($1)? } 114 | ; 115 | 116 | %% 117 | 118 | fn map_err<StorageT>(r: Result<Lexeme<StorageT>, Lexeme<StorageT>>) 119 | -> Result<Lexeme<StorageT>, ()> 120 | { 121 | r.map_err(|_| ()) 122 | } 123 | ``` 124 | 125 | 126 | ### Using `Err(Box<dyn Error>)` 127 | 128 | The idea here is that we both stop evaluating normal action code, and explain 129 | why, by returning `Err(Box<dyn Error>)`. Although `Box<dyn Error>` is something 130 | of a mouthful, it allows you significant flexibility in *what* you return in 131 | error situations. If you want to quickly experiment, then this is convenient 132 | because the token type `Result<Lexeme<StorageT>, Lexeme<StorageT>>` can be 133 | automatically coerced to `Box<dyn Error>` (e.g. `$1?` in action code will 134 | return the `Err` variant without additional code). You can also return 135 | strings-as-errors with `Box::<dyn Error>::from("...")`. 136 | 137 | Using this idiom we can change our calculator example to deal with many more 138 | possible sources of error: 139 | 140 | ```rust,noplaypen 141 | 142 | %start Expr 143 | %avoid_insert "INT" 144 | %% 145 | Expr -> Result<u64, Box<dyn Error>>: 146 | Expr '+' Term 147 | { 148 | Ok($1?.checked_add($3?) 149 | .ok_or(Box::<dyn Error>::from("Overflow detected."))?) 150 | } 151 | | Term { $1 } 152 | ; 153 | 154 | Term -> Result<u64, Box<dyn Error>>: 155 | Term '*' Factor 156 | { 157 | Ok($1?.checked_mul($3?) 158 | .ok_or(Box::<dyn Error>::from("Overflow detected."))?) 159 | } 160 | | Factor { $1 } 161 | ; 162 | 163 | Factor -> Result<u64, Box<dyn Error>>: 164 | '(' Expr ')' { $2 } 165 | | 'INT' 166 | { 167 | parse_int( 168 | $lexer.span_str( 169 | $1.map_err(|_| "")?.span())) 170 | } 171 | ; 172 | %% 173 | // Any imports here are in scope for all the grammar actions above. 174 | 175 | use std::error::Error; 176 | 177 | fn parse_int(s: &str) -> Result<u64, Box<dyn Error>> { 178 | match s.parse::<u64>() { 179 | Ok(val) => Ok(val), 180 | Err(_) => { 181 | Err(Box::from( 182 | format!("{} cannot be represented as a u64", s))) 183 | } 184 | } 185 | } 186 | ``` 187 | 188 | 189 | ## Define a `flatten` function 190 | 191 | Yacc grammars make specifying sequences of things something of a bore. A common 192 | idiom is thus: 193 | 194 | ```rust,noplaypen 195 | ListOfAs -> Result<Vec<A>, ()>: 196 | A { Ok(vec![$1?]) } 197 | | ListOfAs A 198 | { 199 | let mut lst = $1?; 200 | lst.push($2?); 201 | Ok(lst) 202 | } 203 | ; 204 | 205 | A -> Result<A, ()>: ... ; 206 | ``` 207 | Since this idiom is often present multiple times in a grammar, it's generally 208 | worth adding a `flatten` function to hide some of this: 209 | 210 | ```rust,noplaypen 211 | ListOfAs -> Result<Vec<A>, ()>: 212 | A { Ok(vec![$1?]) } 213 | | ListOfAs A { flatten($1, $2) } 214 | ; 215 | 216 | A -> Result<A, ()>: ... ; 217 | %% 218 | 219 | fn flatten<T>(lhs: Result<Vec<T>, ()>, rhs: Result<T, ()>) 220 | -> Result<Vec<T>, ()> 221 | { 222 | let mut flt = lhs?; 223 | flt.push(rhs?); 224 | Ok(flt) 225 | } 226 | ``` 227 | 228 | Note that `flatten` is generic with respect to `T` so that it can be used in 229 | multiple places in the grammar. 230 | 231 | 232 | ## Composing idioms 233 | 234 | 235 | The above idioms compose well together.
For example, `flatten`, `map_err`, and 236 | `Lexeme` can be used together as shown in the following example: 237 | 238 | ```rust,noplaypen 239 | ListOfIds -> Result<Vec<Lexeme<StorageT>>, ()>: 240 | "ID" { Ok(vec![map_err($1)?]) } 241 | | ListOfIds "ID" { flatten($1, map_err($2)) } 242 | ; 243 | 244 | %% 245 | 246 | type StorageT = u32; 247 | 248 | fn map_err(r: Result<Lexeme<StorageT>, Lexeme<StorageT>>) 249 | -> Result<Lexeme<StorageT>, ()> 250 | { 251 | r.map_err(|_| ()) 252 | } 253 | 254 | fn flatten<T>(lhs: Result<Vec<T>, ()>, rhs: Result<T, ()>) 255 | -> Result<Vec<T>, ()> 256 | { 257 | let mut flt = lhs?; 258 | flt.push(rhs?); 259 | Ok(flt) 260 | } 261 | ``` 262 | -------------------------------------------------------------------------------- /lrpar/cttests/src/cgen_helper.rs: -------------------------------------------------------------------------------- 1 | use cfgrammar::yacc::{YaccKind, YaccOriginalActionKind}; 2 | use lrlex::CTLexerBuilder; 3 | use lrpar::RecoveryKind; 4 | use std::{ 5 | env, fs, 6 | path::{Path, PathBuf}, 7 | }; 8 | use yaml_rust2::YamlLoader; 9 | 10 | #[allow(dead_code)] 11 | pub(crate) fn run_test_path<P: AsRef<Path>>(path: P) -> Result<(), Box<dyn std::error::Error>> { 12 | let out_dir = env::var("OUT_DIR").unwrap(); 13 | if path.as_ref().is_file() { 14 | println!("cargo::rerun-if-changed={}", path.as_ref().display()); 15 | // Parse test file 16 | let s = fs::read_to_string(path.as_ref()).unwrap(); 17 | let docs = YamlLoader::load_from_str(&s).unwrap(); 18 | let grm = &docs[0]["grammar"].as_str().unwrap(); 19 | let lex = &docs[0]["lexer"].as_str().unwrap(); 20 | let yacckind = match docs[0]["yacckind"].as_str() { 21 | Some("Original(YaccOriginalActionKind::NoAction)") => { 22 | Some(YaccKind::Original(YaccOriginalActionKind::NoAction)) 23 | } 24 | Some("Original(YaccOriginalActionKind::UserAction)") => { 25 | Some(YaccKind::Original(YaccOriginalActionKind::UserAction)) 26 | } 27 | Some("Grmtools") => Some(YaccKind::Grmtools), 28 | Some("Original(YaccOriginalActionKind::GenericParseTree)") => { 29 | Some(YaccKind::Original(YaccOriginalActionKind::GenericParseTree)) 30 | } 31 | Some(s) => panic!("YaccKind '{}' not supported", s), 32 | None => None, 33 | }; 34 | let recoverer = match docs[0]["recoverer"].as_str() { 35 | Some("RecoveryKind::CPCTPlus") => Some(RecoveryKind::CPCTPlus), 36 | Some("RecoveryKind::None") => Some(RecoveryKind::None), 37 | _ => None, 38 | }; 39 | let (negative_lex_flags, positive_lex_flags) = &docs[0]["lex_flags"] 40 | .as_vec() 41 | .map(|flags_vec| { 42 | flags_vec 43 | .iter() 44 | .partition(|flag| flag.as_str().unwrap().starts_with('!')) 45 | }) 46 | .unwrap_or_else(|| (Vec::new(), Vec::new())); 47 | let negative_lex_flags = negative_lex_flags 48 | .iter() 49 | .map(|flag| { 50 | let flag = flag.as_str().unwrap(); 51 | flag.strip_prefix('!').unwrap() 52 | }) 53 | .collect::<Vec<_>>(); 54 | let positive_lex_flags = positive_lex_flags 55 | .iter() 56 | .map(|flag| flag.as_str().unwrap()) 57 | .collect::<Vec<_>>(); 58 | let lex_flags = (&positive_lex_flags, &negative_lex_flags); 59 | 60 | // The code below, in essence, replicates lrlex and lrpar's internal / undocumented 61 | // filename conventions. If those change, this code will also have to change.
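// For orientation, the YAML `.test` files parsed above look roughly like the
// following sketch (not a verbatim test file; see e.g. `calc_input.test` in
// this directory for real examples). Only `grammar` and `lexer` are required
// (they are `unwrap`ped above); the other keys are optional and map onto the
// builder calls below:
//
//   yacckind: Grmtools
//   recoverer: RecoveryKind::CPCTPlus
//   yacc_flags: ["warnings_are_errors", "!show_warnings"]
//   lex_flags: ["case_insensitive", "!octal"]
//   grammar: |
//     %start Expr
//     ...
//   lexer: |
//     %%
//     ...
//   extra_files:
//     some.input: |
//       ...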
62 | 63 | // Create grammar files 64 | let base = path.as_ref().file_stem().unwrap().to_str().unwrap(); 65 | let mut pg = PathBuf::from(&out_dir); 66 | pg.push(format!("{}.test.y", base)); 67 | fs::write(&pg, grm).unwrap(); 68 | let mut pl = PathBuf::from(&out_dir); 69 | pl.push(format!("{}.test.l", base)); 70 | fs::write(&pl, lex).unwrap(); 71 | 72 | if let Some(extra_files) = docs[0]["extra_files"].as_hash() { 73 | for (filename, contents) in extra_files.iter() { 74 | let mut out_file = PathBuf::from(&out_dir); 75 | let filename = filename.as_str().unwrap(); 76 | out_file.push(filename); 77 | let contents = contents.as_str().unwrap(); 78 | fs::write(&out_file, contents).unwrap(); 79 | } 80 | } 81 | 82 | // Build parser and lexer 83 | let mut outl = PathBuf::from(&out_dir); 84 | outl.push(format!("{}.l.rs", base)); 85 | outl.set_extension("rs"); 86 | let mut cl_build = CTLexerBuilder::new() 87 | .lrpar_config(|mut cp_build| { 88 | let mut outp = PathBuf::from(&out_dir); 89 | outp.push(format!("{}.y.rs", base)); 90 | outp.set_extension("rs"); 91 | let (negative_yacc_flags, positive_yacc_flags) = &docs[0]["yacc_flags"] 92 | .as_vec() 93 | .map(|flags_vec| { 94 | flags_vec 95 | .iter() 96 | .partition(|flag| flag.as_str().unwrap().starts_with('!')) 97 | }) 98 | .unwrap_or_else(|| (Vec::new(), Vec::new())); 99 | let positive_yacc_flags = positive_yacc_flags 100 | .iter() 101 | .map(|flag| flag.as_str().unwrap()) 102 | .collect::<Vec<_>>(); 103 | let negative_yacc_flags = negative_yacc_flags 104 | .iter() 105 | .map(|flag| { 106 | let flag = flag.as_str().unwrap(); 107 | flag.strip_prefix('!').unwrap() 108 | }) 109 | .collect::<Vec<_>>(); 110 | let yacc_flags = (&positive_yacc_flags, &negative_yacc_flags); 111 | if let Some(yacckind) = yacckind { 112 | cp_build = cp_build.yacckind(yacckind); 113 | } 114 | if let Some(recoverer) = recoverer { 115 | cp_build = cp_build.recoverer(recoverer) 116 | } 117 | cp_build = cp_build 118 | .grammar_path(pg.to_str().unwrap()) 119 | .output_path(&outp); 120 | if let Some(flag) = check_flag(yacc_flags, "error_on_conflicts") { 121 | cp_build = cp_build.error_on_conflicts(flag) 122 | } 123 | if let Some(flag) = check_flag(yacc_flags, "warnings_are_errors") { 124 | cp_build = cp_build.warnings_are_errors(flag) 125 | } 126 | if let Some(flag) = check_flag(yacc_flags, "show_warnings") { 127 | cp_build = cp_build.show_warnings(flag) 128 | }; 129 | cp_build 130 | }) 131 | .lexer_path(pl.to_str().unwrap()) 132 | .output_path(&outl); 133 | if let Some(flag) = check_flag(lex_flags, "allow_missing_terms_in_lexer") { 134 | cl_build = cl_build.allow_missing_terms_in_lexer(flag) 135 | } 136 | if let Some(flag) = check_flag(lex_flags, "allow_missing_tokens_in_parser") { 137 | cl_build = cl_build.allow_missing_tokens_in_parser(flag) 138 | } 139 | if let Some(flag) = check_flag(lex_flags, "dot_matches_new_line") { 140 | cl_build = cl_build.dot_matches_new_line(flag) 141 | } 142 | if let Some(flag) = check_flag(lex_flags, "case_insensitive") { 143 | cl_build = cl_build.case_insensitive(flag) 144 | } 145 | if let Some(flag) = check_flag(lex_flags, "multi_line") { 146 | cl_build = cl_build.multi_line(flag) 147 | } 148 | if let Some(flag) = check_flag(lex_flags, "swap_greed") { 149 | cl_build = cl_build.swap_greed(flag) 150 | } 151 | if let Some(flag) = check_flag(lex_flags, "ignore_whitespace") { 152 | cl_build = cl_build.ignore_whitespace(flag) 153 | } 154 | if let Some(flag) = check_flag(lex_flags, "unicode") { 155 | cl_build = cl_build.unicode(flag) 156 | } 157 | if let Some(flag) =
check_flag(lex_flags, "octal") { 158 | cl_build = cl_build.octal(flag) 159 | } 160 | cl_build.build()?; 161 | } 162 | Ok(()) 163 | } 164 | 165 | fn check_flag((positive, negative): (&Vec<&str>, &Vec<&str>), flag: &str) -> Option<bool> { 166 | assert_eq!( 167 | positive.contains(&flag) | negative.contains(&flag), 168 | positive.contains(&flag) ^ negative.contains(&flag) 169 | ); 170 | if positive.contains(&flag) { 171 | Some(true) 172 | } else if negative.contains(&flag) { 173 | Some(false) 174 | } else { 175 | None 176 | } 177 | } 178 | --------------------------------------------------------------------------------