├── .github └── workflows │ └── rust.yml ├── .gitignore ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── examples └── demo.rs ├── src ├── lexer.rs ├── lib.rs └── parser.rs └── tests ├── ui.rs └── ui ├── if_else_shift_reduce.rs ├── if_else_shift_reduce.stderr ├── reduce_reduce.rs └── reduce_reduce.stderr /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Build 20 | run: cargo build --verbose 21 | - name: Run tests 22 | run: cargo test --verbose 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | *.swp 3 | Cargo.lock 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | 3 | name = "plex" 4 | version = "0.3.1" 5 | edition = "2018" 6 | authors = ["Geoffry Song "] 7 | 8 | description = "A syntax extension for writing lexers and parsers." 9 | homepage = "https://github.com/goffrie/plex" 10 | repository = "https://github.com/goffrie/plex" 11 | documentation = "https://docs.rs/plex" 12 | license = "MIT OR Apache-2.0" 13 | keywords = ["tokenizer", "scanner", "lexer", "parser", "generator"] 14 | 15 | [lib] 16 | 17 | name = "plex" 18 | proc-macro = true 19 | 20 | [features] 21 | 22 | default = ["lexer", "parser"] 23 | lexer = ["redfa"] 24 | parser = ["lalr"] 25 | 26 | [dependencies] 27 | 28 | lalr = { version = "0.0.2", optional = true } 29 | redfa = { version = "0.0.3", optional = true } 30 | syn = { version = "2.0", features = ["extra-traits", "full"] } 31 | proc-macro2 = { version = "1.0" } 32 | quote = "1.0" 33 | 34 | [dev-dependencies] 35 | trybuild = "1.0.85" 36 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015-2018 Geoffry Song 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## plex, a parser and lexer generator 2 | 3 | This crate provides a couple of syntax extensions: 4 | 5 | - `lexer!`, which creates a DFA-based lexer that uses maximal munch. It works 6 | a bit like the `lex` tool. You write regular expressions defining your 7 | tokens, together with Rust expressions that create your tokens from slices of 8 | input. 9 | - `parser!`, which creates an LALR(1) parser. It works a bit like `yacc`. You 10 | write a context-free grammar, together with expressions for each rule. You 11 | give each nonterminal a Rust type, allowing you to build an AST recursively. 12 | It also supports spans, giving you convenient source location reporting. 13 | 14 | You can find a demo in `examples/demo.rs`. 15 | 16 | ## Usage 17 | 18 | First, include the `plex` macros. 19 | 20 | ```rust 21 | use plex::{lexer, parser}; 22 | ``` 23 | 24 | ### Creating a lexer 25 | 26 | To define a lexer, use the `lexer!` macro. 27 | 28 | ```rust 29 | lexer! { 30 | fn take_token(tok: 'a) -> Token<'a>; 31 | ``` 32 | 33 | First declare the name of the function, the name of the token you will be able 34 | to access within the lexer, and the return type of your lexer. You can also 35 | optionally declare a lifetime for the strings you accept (here, `'a`). 36 | 37 | Note that this will declare a function with the actual signature 38 | `fn take_token<'a>(text: &'a str) -> Option<(Token<'a>, &'a str)>`. On a match, the 39 | lexer returns the token together with the remaining, unconsumed input. This is designed to make 40 | it easier to create an iterator of `Token`s out of a string slice.
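For example, the generated function can be wrapped in a hand-written `Iterator`. This is a minimal sketch (the `Tokens` type is ours, not part of `plex`; it assumes the `take_token` declaration above — `examples/demo.rs` does the same thing with span tracking added):

```rust
// Sketch: driving the generated `take_token` as an iterator.
struct Tokens<'a> {
    remaining: &'a str,
}

impl<'a> Iterator for Tokens<'a> {
    type Item = Token<'a>;
    fn next(&mut self) -> Option<Token<'a>> {
        // `take_token` yields the matched token plus the rest of the input.
        let (token, rest) = take_token(self.remaining)?;
        self.remaining = rest;
        Some(token)
    }
}
```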
41 | 42 | ```rust 43 | r"[ \t\r\n]" => Token::Whitespace, 44 | "[0-9]+" => Token::IntegerLiteral(tok.parse().unwrap()), 45 | r#""[^"]*""# => Token::StringLiteral(&tok[1..tok.len()-1]), 46 | ``` 47 | 48 | The rest of your lexer should consist of rules. The left hand side should be a 49 | literal string (raw string literals are OK) corresponding to a regular 50 | expression. You can use the typical regular expression syntax, including 51 | parentheses for grouping, square brackets for character classes, and the usual 52 | `.`, `|`, `*`, and `+`. (`?` is currently not supported.) You can also use 53 | some extra operators, like `~` for negation and `&` for conjunction: 54 | 55 | ```rust 56 | r"/\*~(.*\*/.*)\*/" => Token::Comment(tok), 57 | ``` 58 | 59 | The above regular expression will match a C-style comment with `/* */` 60 | delimiters, but won't allow `*/` to appear inside the comment. (`.*\*/.*` 61 | matches any string containing `*/`, `~(.*\*/.*)` matches any string that does 62 | not.) This is important because the lexer uses maximal munch. If you had 63 | written simply `r"/\*.*\*/"`, then the lexer would consume the longest matching 64 | substring. That would interpret `/* comment */ not comment? /* comment */` as 65 | one large comment. 66 | 67 | ```rust 68 | "let" => Token::Let, 69 | "[a-zA-Z]+" => Token::Ident(tok), 70 | "." => panic!("unexpected character"), 71 | } 72 | ``` 73 | 74 | Note that if multiple rules could apply, the one declared first wins. This lets 75 | you declare keywords (which have precedence over identifiers) by putting them 76 | first. 77 | 78 | ### Creating a parser 79 | 80 | `plex` uses the LALR(1) construction for parsers. This section, and `plex` in 81 | general, will assume you understand LR parsing, along with its associated 82 | vocabulary. 83 | 84 | To define a parser, use the `parser!` macro. 85 | 86 | ```rust 87 | parser! { 88 | fn parse(Token, Span); 89 | ``` 90 | 91 | This declares the name of the parser (in this case, `parse`) and the input 92 | types that it takes. In this case, `parse` will take any iterator of pairs 93 | `(Token, Span)`. The token type must be an `enum` whose variants are in scope. 94 | (This is a current limitation of `plex` that might be fixed later.) Those 95 | variants are the terminals of your grammar. `plex`-generated parsers also keep 96 | track of source locations ("spans") that are fed into them, so you'll need to 97 | mention your span type here. If you don't want to keep track of source 98 | locations, you can use the unit type `()`. 99 | 100 | Next, tell `plex` how to combine two spans: 101 | 102 | ```rust 103 | (a, b) { 104 | Span { 105 | lo: a.lo, 106 | hi: b.hi, 107 | } 108 | } 109 | ``` 110 | 111 | Here, `a` and `b` are `Span`s. In this case we've defined `Span` as a 112 | structure with two fields, `lo` and `hi`, representing the byte offsets of the 113 | beginning and end of the span. Note that the extra braces are necessary here: 114 | the body of the function has to be a block.
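For concreteness, here is one possible `Span` type for the snippet above; this matches the definition used in `examples/demo.rs`:

```rust
// Byte offsets into the input string.
#[derive(Debug, Clone, Copy)]
pub struct Span {
    pub lo: usize, // offset of the first byte of the spanned text
    pub hi: usize, // offset one past the last byte
}
```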
115 | 116 | Now you write your grammar. For each nonterminal, write its name, together with 117 | its type. This indicates the kind of data that the nonterminal parses into. 118 | 119 | ```rust 120 | statements: Vec<Expr> { 121 | ``` 122 | 123 | Note that the first nonterminal is special: it's the start symbol of your 124 | grammar, and its type is the return type (more or less) of the parser. 125 | 126 | Then write the rules for this nonterminal. (The left-hand side of each rule is 127 | implied to be `statements`.) 128 | 129 | ```rust 130 | statements[mut st] expr[e] Semi => { 131 | st.push(e); 132 | st 133 | } 134 | ``` 135 | 136 | Write the rule's right-hand side, an arrow `=>`, and the code to handle this 137 | rule. The right-hand side is a sequence of nonterminals or terminals to match. 138 | Here, `statements` and `expr` are nonterminals. Square brackets assign a pattern 139 | to the result of a nonterminal, allowing us to use the data returned by that 140 | nonterminal. Terminals must be enum variants brought in scope. The expression 141 | must evaluate to the type of the left-hand side: in this case, `Vec<Expr>`. 142 | 143 | ```rust 144 | => vec![], 145 | } 146 | ``` 147 | 148 | Empty rules are allowed: just don't write anything before the arrow. 149 | 150 | If a terminal (i.e. a token) is a tuple-like enum variant, and so holds data, 151 | you should destructure it using round brackets: 152 | 153 | ```rust 154 | expr: Expr { 155 | Ident(s) => Expr::Var(span!(), s) 156 | } 157 | } 158 | ``` 159 | 160 | Inside a rule, the `span!()` macro evaluates to the span of the current 161 | right-hand side. However, this only works if at least one token was matched. If 162 | the rule matched an empty sequence, `span!()` will panic, so avoid using it in 163 | nullable rules. 164 | 165 | The return type of this parser is 166 | `Result<Vec<Expr>, (Option<(Token, Span)>, &'static str)>`. The error type is a 167 | pair consisting of the unexpected token, or `None` for EOF, and a message 168 | describing the tokens that were expected.
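A call site can then match on that result to report errors. A minimal sketch (assuming the `parse`, `Token`, and `Span` names from above, with `Debug` implementations available):

```rust
// Sketch: handling the generated parser's Result.
match parse(tokens) {
    Ok(stmts) => println!("parsed {} statements", stmts.len()),
    Err((Some((token, span)), expected)) => {
        // `expected` reads like "expected `Semi` or `Plus`".
        eprintln!("syntax error at {:?} on {:?}: {}", span, token, expected);
    }
    Err((None, expected)) => {
        eprintln!("syntax error at end of input: {}", expected);
    }
}
```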
169 | -------------------------------------------------------------------------------- /examples/demo.rs: -------------------------------------------------------------------------------- 1 | use std::io::Read; 2 | 3 | mod lexer { 4 | use plex::lexer; 5 | 6 | #[derive(Debug, Clone)] 7 | pub enum Token { 8 | Ident(String), 9 | 10 | Print, 11 | 12 | Integer(i64), 13 | Equals, 14 | Plus, 15 | Minus, 16 | Star, 17 | Slash, 18 | LParen, 19 | RParen, 20 | Semi, 21 | 22 | Whitespace, 23 | Comment, 24 | } 25 | 26 | lexer! { 27 | fn next_token(text: 'a) -> Token; 28 | 29 | r#"[ \t\r\n]+"# => Token::Whitespace, 30 | // "C-style" comments (/* .. */) - can't contain "*/" 31 | r#"/[*](~(.*[*]/.*))[*]/"# => Token::Comment, 32 | // "C++-style" comments (// ...) 33 | r#"//[^\n]*"# => Token::Comment, 34 | 35 | r#"print"# => Token::Print, 36 | 37 | r#"[0-9]+"# => { 38 | if let Ok(i) = text.parse() { 39 | Token::Integer(i) 40 | } else { 41 | panic!("integer {} is out of range", text) 42 | } 43 | } 44 | 45 | r#"[a-zA-Z_][a-zA-Z0-9_]*"# => Token::Ident(text.to_owned()), 46 | 47 | r#"="# => Token::Equals, 48 | r#"\+"# => Token::Plus, 49 | r#"-"# => Token::Minus, 50 | r#"\*"# => Token::Star, 51 | r#"/"# => Token::Slash, 52 | r#"\("# => Token::LParen, 53 | r#"\)"# => Token::RParen, 54 | r#";"# => Token::Semi, 55 | 56 | r#"."# => panic!("unexpected character: {}", text), 57 | } 58 | 59 | pub struct Lexer<'a> { 60 | original: &'a str, 61 | remaining: &'a str, 62 | } 63 | 64 | impl<'a> Lexer<'a> { 65 | pub fn new(s: &'a str) -> Lexer<'a> { 66 | Lexer { 67 | original: s, 68 | remaining: s, 69 | } 70 | } 71 | } 72 | 73 | #[derive(Debug, Clone, Copy)] 74 | pub struct Span { 75 | pub lo: usize, 76 | pub hi: usize, 77 | } 78 | 79 | impl<'a> Iterator for Lexer<'a> { 80 | type Item = (Token, Span); 81 | fn next(&mut self) -> Option<(Token, Span)> { 82 | loop { 83 | let (tok, span) = if let Some((tok, new_remaining)) = next_token(self.remaining) { 84 | let lo = self.original.len() - self.remaining.len(); 85 | let hi = self.original.len() - new_remaining.len(); 86 | self.remaining = new_remaining; 87 | (tok, Span { lo, hi }) 88 | } else { 89 | return None; 90 | }; 91 | match tok { 92 | Token::Whitespace | Token::Comment => { 93 | continue; 94 | } 95 | tok => { 96 | return Some((tok, span)); 97 | } 98 | } 99 | } 100 | } 101 | } 102 | } 103 | 104 | mod ast { 105 | use crate::lexer::Span; 106 | 107 | #[derive(Debug)] 108 | pub struct Program { 109 | pub stmts: Vec<Expr>, 110 | } 111 | 112 | #[derive(Debug)] 113 | pub struct Expr { 114 | pub span: Span, 115 | pub node: Expr_, 116 | } 117 | 118 | #[derive(Debug)] 119 | pub enum Expr_ { 120 | Add(Box<Expr>, Box<Expr>), 121 | Sub(Box<Expr>, Box<Expr>), 122 | Mul(Box<Expr>, Box<Expr>), 123 | Div(Box<Expr>, Box<Expr>), 124 | Var(String), 125 | Assign(String, Box<Expr>), 126 | Print(Box<Expr>), 127 | Literal(i64), 128 | } 129 | } 130 | 131 | mod parser { 132 | use crate::ast::*; 133 | use crate::lexer::Token::*; 134 | use crate::lexer::*; 135 | use plex::parser; 136 | parser!
{ 137 | fn parse_(Token, Span); 138 | 139 | // combine two spans 140 | (a, b) { 141 | Span { 142 | lo: a.lo, 143 | hi: b.hi, 144 | } 145 | } 146 | 147 | program: Program { 148 | statements[s] => Program { stmts: s } 149 | } 150 | 151 | statements: Vec<Expr> { 152 | => vec![], 153 | statements[mut st] assign[e] Semi => { 154 | st.push(e); 155 | st 156 | } 157 | } 158 | 159 | assign: Expr { 160 | Print assign[a] => Expr { 161 | span: span!(), 162 | node: Expr_::Print(Box::new(a)), 163 | }, 164 | Ident(var) Equals assign[rhs] => Expr { 165 | span: span!(), 166 | node: Expr_::Assign(var, Box::new(rhs)), 167 | }, 168 | term[t] => t, 169 | } 170 | 171 | term: Expr { 172 | term[lhs] Plus fact[rhs] => Expr { 173 | span: span!(), 174 | node: Expr_::Add(Box::new(lhs), Box::new(rhs)), 175 | }, 176 | term[lhs] Minus fact[rhs] => Expr { 177 | span: span!(), 178 | node: Expr_::Sub(Box::new(lhs), Box::new(rhs)), 179 | }, 180 | fact[x] => x 181 | } 182 | 183 | fact: Expr { 184 | fact[lhs] Star atom[rhs] => Expr { 185 | span: span!(), 186 | node: Expr_::Mul(Box::new(lhs), Box::new(rhs)), 187 | }, 188 | fact[lhs] Slash atom[rhs] => Expr { 189 | span: span!(), 190 | node: Expr_::Div(Box::new(lhs), Box::new(rhs)), 191 | }, 192 | atom[x] => x 193 | } 194 | 195 | atom: Expr { 196 | // round brackets to destructure tokens 197 | Ident(i) => Expr { 198 | span: span!(), 199 | node: Expr_::Var(i), 200 | }, 201 | Integer(i) => Expr { 202 | span: span!(), 203 | node: Expr_::Literal(i), 204 | }, 205 | LParen assign[a] RParen => a 206 | } 207 | } 208 | 209 | pub fn parse<I: Iterator<Item = (Token, Span)>>( 210 | i: I, 211 | ) -> Result<Program, (Option<(Token, Span)>, &'static str)> { 212 | parse_(i) 213 | } 214 | } 215 | 216 | mod interp { 217 | use crate::ast::*; 218 | use std::collections::HashMap; 219 | 220 | pub fn interp<'a>(p: &'a Program) { 221 | let mut env = HashMap::new(); 222 | for expr in &p.stmts { 223 | interp_expr(&mut env, expr); 224 | } 225 | } 226 | fn interp_expr<'a>(env: &mut HashMap<&'a str, i64>, expr: &'a Expr) -> i64 { 227 | use crate::ast::Expr_::*; 228 | match expr.node { 229 | Add(ref a, ref b) => interp_expr(env, a) + interp_expr(env, b), 230 | Sub(ref a, ref b) => interp_expr(env, a) - interp_expr(env, b), 231 | Mul(ref a, ref b) => interp_expr(env, a) * interp_expr(env, b), 232 | Div(ref a, ref b) => interp_expr(env, a) / interp_expr(env, b), 233 | Assign(ref var, ref b) => { 234 | let val = interp_expr(env, b); 235 | env.insert(var, val); 236 | val 237 | } 238 | Var(ref var) => *env.get(&var[..]).unwrap(), 239 | Literal(lit) => lit, 240 | Print(ref e) => { 241 | let val = interp_expr(env, e); 242 | println!("{}", val); 243 | val 244 | } 245 | } 246 | } 247 | } 248 | 249 | fn main() { 250 | let mut s = String::new(); 251 | std::io::stdin().read_to_string(&mut s).unwrap(); 252 | let lexer = lexer::Lexer::new(&s).inspect(|tok| eprintln!("tok: {:?}", tok)); 253 | let program = parser::parse(lexer).unwrap(); 254 | interp::interp(&program); 255 | }
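// Example session (illustrative; not part of the original demo): feeding
//     x = 3; y = x + 2; print y * 2;
// to this program on stdin prints `10`, after echoing each token to stderr
// via the `inspect` call above.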
256 | -------------------------------------------------------------------------------- /src/lexer.rs: -------------------------------------------------------------------------------- 1 | use std::char; 2 | use std::collections::{BTreeSet, VecDeque}; 3 | 4 | use redfa::regex::Regex; 5 | use redfa::Dfa; 6 | 7 | use proc_macro2::{Span, TokenStream}; 8 | use quote::quote; 9 | use syn::parse::{Parse, ParseStream}; 10 | use syn::{ 11 | parenthesized, parse_macro_input, token, Error, Expr, Ident, Lifetime, LitStr, Token, Type, 12 | Visibility, 13 | }; 14 | 15 | fn dfa_fn<T>( 16 | dfa: &Dfa<char, T>, 17 | state_enum: Ident, 18 | state_paths: &[TokenStream], 19 | fn_name: Ident, 20 | ) -> TokenStream { 21 | let mut arms = vec![]; 22 | for (tr, state_name) in dfa.states.iter().zip(state_paths.iter().cloned()) { 23 | let mut subarms = vec![]; 24 | let mut iter = tr.by_char.iter().peekable(); 25 | while let Some((&ch, &target)) = iter.next() { 26 | let mut end = ch; 27 | while let Some(&(&nextc, &nextt)) = iter.peek() { 28 | if nextc as u32 != (end as u32) + 1 || nextt != target { 29 | break; 30 | } 31 | end = nextc; 32 | iter.next(); 33 | } 34 | let pat = if ch == end { 35 | quote!(#ch) 36 | } else { 37 | quote!(#ch ... #end) 38 | }; 39 | let body = state_paths[target as usize].clone(); 40 | subarms.push(quote!(#pat => #body)); 41 | } 42 | let default_state = state_paths[tr.default as usize].clone(); 43 | arms.push(quote!(#state_name => match ch { 44 | #(#subarms,)* 45 | _ => #default_state 46 | })); 47 | } 48 | quote! { 49 | fn #fn_name(state: #state_enum, ch: char) -> #state_enum { 50 | match state { 51 | #(#arms,)* 52 | } 53 | } 54 | } 55 | } 56 | 57 | fn first_nullable<T>(vec: &[Regex<T>]) -> Option<usize> { 58 | vec.iter().position(Regex::nullable) 59 | } 60 | 61 | fn dfa_make_names<V>(dfa: &Dfa<char, V>) -> Vec<String> { 62 | let mut names = vec![String::new(); dfa.states.len()]; 63 | let mut seen = BTreeSet::new(); 64 | seen.insert(0); 65 | let mut worklist = VecDeque::new(); 66 | worklist.push_back((0, "S_".into())); 67 | while let Some((i, name)) = worklist.pop_front() { 68 | for (&c, &next) in &dfa.states[i as usize].by_char { 69 | if seen.insert(next) { 70 | let new_name = if c.is_alphanumeric() { 71 | format!("{}{}", name, c) 72 | } else { 73 | format!("{}_{:x}_", name, c as u32) 74 | }; 75 | worklist.push_back((next, new_name)); 76 | } 77 | } 78 | let default = dfa.states[i as usize].default; 79 | if seen.insert(default) { 80 | let new_name = format!("{}__", name); 81 | worklist.push_back((default, new_name)); 82 | } 83 | names[i as usize] = name; 84 | } 85 | names 86 | } 87 | 88 | struct Rule { 89 | pattern: LitStr, 90 | expr: Expr, 91 | } 92 | 93 | fn parse_rules(input: ParseStream) -> syn::Result<Vec<Rule>> { 94 | let mut rules = vec![]; 95 | while !input.is_empty() { 96 | // FIXME: Make some nicer error messages. 97 | let pattern = input.parse()?; 98 | input.parse::<Token![=>]>()?; 99 | // Like in a `match` expression, a braced block doesn't require a comma before the next rule. 100 | let optional_comma = input.peek(token::Brace); 101 | let expr = input.parse()?; 102 | rules.push(Rule { pattern, expr }); 103 | match input.parse::<Token![,]>() { 104 | Ok(_) => {} 105 | Err(e) => { 106 | if !input.is_empty() && !optional_comma { 107 | return Err(e); 108 | } 109 | } 110 | } 111 | } 112 | Ok(rules) 113 | } 114 | 115 | struct Lexer { 116 | vis: Visibility, 117 | name: Ident, 118 | input: Ident, 119 | lifetime: Option<Lifetime>, 120 | return_type: Type, 121 | rules: Vec<Rule>, 122 | } 123 | 124 | impl Parse for Lexer { 125 | fn parse(input: ParseStream) -> syn::Result<Self> { 126 | let lifetime; 127 | Ok(Lexer { 128 | vis: input.parse()?, 129 | name: { 130 | input.parse::<Token![fn]>()?; 131 | input.parse()?
132 | }, 133 | input: { 134 | let inner; 135 | parenthesized!(inner in input); 136 | let lexer_input = inner.parse()?; 137 | if !inner.is_empty() { 138 | inner.parse::<Token![:]>()?; 139 | lifetime = Some(inner.parse()?); 140 | if !inner.is_empty() { 141 | return Err(inner.error("unexpected token after input lifetime")); 142 | } 143 | } else { 144 | lifetime = None; 145 | } 146 | lexer_input 147 | }, 148 | lifetime, 149 | return_type: { 150 | input.parse::<Token![->]>()?; 151 | let t = input.parse()?; 152 | input.parse::<Token![;]>()?; 153 | t 154 | }, 155 | rules: parse_rules(input)?, 156 | }) 157 | } 158 | } 159 | 160 | pub fn lexer(input: proc_macro::TokenStream) -> proc_macro::TokenStream { 161 | let Lexer { 162 | vis, 163 | name, 164 | input, 165 | lifetime, 166 | return_type, 167 | rules, 168 | } = parse_macro_input!(input as Lexer); 169 | 170 | let mut errors = vec![]; 171 | let (re_vec, actions): (Vec<Regex<char>>, Vec<Expr>) = rules 172 | .into_iter() 173 | .map(|Rule { pattern, expr }| { 174 | let re = match pattern.value().parse() { 175 | Ok(r) => r, 176 | Err(e) => { 177 | errors.push(Error::new_spanned( 178 | &pattern, 179 | format!("invalid regular expression: {}", e), 180 | )); 181 | Regex::Null // dummy 182 | } 183 | }; 184 | if re.nullable() { 185 | errors.push(Error::new_spanned( 186 | &pattern, 187 | "token must not match the empty string", 188 | )); 189 | } 190 | (re, expr) 191 | }) 192 | .unzip(); 193 | 194 | let (dfa, _) = Dfa::from_derivatives(vec![re_vec]); 195 | let dfa = dfa.map(|vec| first_nullable(&vec)).minimize().map(|&x| x); 196 | let error_state_ix = dfa.states.iter().enumerate().position(|(ix, state)| { 197 | state.value.is_none() && state.by_char.is_empty() && state.default as usize == ix 198 | }); 199 | // TODO: consider making this opt-out 200 | if error_state_ix.is_none() { 201 | errors.push(Error::new( 202 | Span::call_site(), 203 | "this DFA has no error state; it will always scan the entire input", 204 | )); 205 | } 206 | 207 | // Construct "human-readable" names for each of the DFA states. 208 | // This is purely to make the generated code nicer. 209 | let mut names: Vec<Ident> = dfa_make_names(&dfa) 210 | .into_iter() 211 | .map(|n| Ident::new(&n, Span::call_site())) 212 | .collect(); 213 | // If we've identified an error state, give it the special name "Error". 214 | if let Some(ix) = error_state_ix { 215 | names[ix] = Ident::new("Error", Span::call_site()); 216 | } 217 | // The full paths to each of the state names (e.g. `State::Error`). 218 | let state_paths: Vec<TokenStream> = names.iter().map(|name| quote!(State::#name)).collect(); 219 | 220 | let initial_state = state_paths[0].clone(); 221 | let error_state = error_state_ix.map(|ix| state_paths[ix].clone()); 222 | 223 | // Construct the actual DFA transition function, which, given a `State` and the next character, returns the next `State`.
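// Illustrative only: for the single rule r"ab", the emitted function would look
// roughly like this (state names come from `dfa_make_names` above):
//
//     fn transition(state: State, ch: char) -> State {
//         match state {
//             State::S_ => match ch { 'a' => State::S_a, _ => State::Error },
//             State::S_a => match ch { 'b' => State::S_ab, _ => State::Error },
//             State::S_ab => match ch { _ => State::Error },
//             State::Error => match ch { _ => State::Error },
//         }
//     }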
224 | let transition_fn = dfa_fn( 225 | &dfa, 226 | Ident::new("State", Span::call_site()), 227 | &state_paths, 228 | Ident::new("transition", Span::call_site()), 229 | ); 230 | 231 | let accepting_fn = { 232 | let arms = dfa 233 | .states 234 | .iter() 235 | .map(|state| state.value) 236 | .zip(&state_paths) 237 | .filter_map(|(maybe_act, state)| { 238 | maybe_act.map(|act| { 239 | let act = act as u32; 240 | quote!(#state => Some(#act)) 241 | }) 242 | }); 243 | quote!(fn accepting(state: State) -> Option<u32> { 244 | match state { 245 | #(#arms,)* 246 | _ => None 247 | } 248 | }) 249 | }; 250 | 251 | let compute_result = { 252 | let compute_arms = actions.into_iter().enumerate().map(|(i, expr)| { 253 | let i = i as u32; 254 | quote!(#i => #expr) 255 | }); 256 | quote!(match which { 257 | #(#compute_arms,)* 258 | _ => unreachable!() 259 | }) 260 | }; 261 | 262 | let l1 = lifetime.iter(); 263 | let l2 = lifetime.iter(); 264 | let l3 = lifetime.iter(); 265 | 266 | let error_state = error_state.iter(); 267 | let errors = errors.into_iter().map(|e| e.into_compile_error()).collect::<TokenStream>(); 268 | 269 | quote!( 270 | #errors 271 | #vis fn #name #(<#l1>)* (input: &#(#l2)* str) -> Option<(#return_type, &#(#l3)* str)> { 272 | #[derive(Copy, Clone)] 273 | #[allow(non_camel_case_types)] 274 | enum State { 275 | #(#names,)* 276 | } 277 | #transition_fn 278 | #accepting_fn 279 | let mut state = #initial_state; 280 | let mut remaining = input.char_indices(); 281 | let mut last_match = None; 282 | loop { 283 | if let Some(which) = accepting(state) { 284 | last_match = Some((which, remaining.clone())); 285 | } 286 | #( // only produce this if `error_state` exists. 287 | if let #error_state = state { 288 | break; 289 | } 290 | )* 291 | if let Some((_, ch)) = remaining.next() { 292 | state = transition(state, ch); 293 | } else { 294 | break; 295 | } 296 | } 297 | if let Some((which, mut remaining)) = last_match { 298 | let ix = if let Some((ix, _)) = remaining.next() { 299 | ix 300 | } else { 301 | input.len() 302 | }; 303 | let #input = &input[..ix]; 304 | let rule_result = #compute_result; 305 | Some((rule_result, &input[ix..])) 306 | } else { 307 | None 308 | } 309 | } 310 | ) 311 | .into() 312 | } 313 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![recursion_limit = "128"] 2 | #![warn(unused_extern_crates)] 3 | 4 | //! # plex, a parser and lexer generator 5 | //! See README.md for documentation. 6 | 7 | #[cfg(feature = "lexer")] 8 | mod lexer; 9 | #[cfg(feature = "parser")] 10 | mod parser; 11 | 12 | use proc_macro::TokenStream; 13 | 14 | /// Defines a lexer. 15 | #[cfg(feature = "lexer")] 16 | #[proc_macro] 17 | pub fn lexer(tok: TokenStream) -> TokenStream { 18 | lexer::lexer(tok) 19 | } 20 | 21 | /// Defines a parser.
22 | #[cfg(feature = "parser")] 23 | #[proc_macro] 24 | pub fn parser(tok: TokenStream) -> TokenStream { 25 | parser::parser(tok) 26 | } 27 | -------------------------------------------------------------------------------- /src/parser.rs: -------------------------------------------------------------------------------- 1 | use std::collections::btree_map::Entry; 2 | use std::collections::{BTreeMap, BTreeSet}; 3 | use std::fmt::{self, Write}; 4 | 5 | use lalr::*; 6 | 7 | use proc_macro2::{Span, TokenStream}; 8 | use quote::{quote, quote_spanned, ToTokens}; 9 | use syn::parse::{Parse, ParseStream}; 10 | use syn::punctuated::Punctuated; 11 | use syn::spanned::Spanned; 12 | use syn::token::{Bracket, Paren}; 13 | use syn::{ 14 | self, braced, bracketed, parenthesized, parse_macro_input, token, Attribute, Block, Error, 15 | Expr, Ident, Meta, Pat, Token, Type, Visibility, 16 | }; 17 | 18 | /// Return the most frequent item in the given iterator, or None if it is empty. 19 | /// Picks an arbitrary item in case of a tie. 20 | fn most_frequent<T: Ord, I: Iterator<Item = T>>(it: I) -> Option<T> { 21 | let mut freq = BTreeMap::new(); 22 | for x in it { 23 | *freq.entry(x).or_insert(0) += 1; 24 | } 25 | freq.into_iter().max_by_key(|&(_, f)| f).map(|(x, _)| x) 26 | } 27 | 28 | fn expected_one_of<S: fmt::Display>(xs: &[S]) -> String { 29 | let mut err_msg: String = "expected".to_string(); 30 | for (i, x) in xs.iter().enumerate() { 31 | if i == 0 { 32 | let _ = write!(&mut err_msg, " "); 33 | } else if i == xs.len() - 1 { 34 | if i == 1 { 35 | let _ = write!(&mut err_msg, " or "); 36 | } else { 37 | let _ = write!(&mut err_msg, ", or "); 38 | } 39 | } else { 40 | let _ = write!(&mut err_msg, ", "); 41 | } 42 | let _ = write!(&mut err_msg, "{}", x); 43 | } 44 | err_msg 45 | } 46 | 47 | pub fn lr1_machine<'a, T, N, A, FM, FA, FR, FO>( 48 | grammar: &'a Grammar<T, N, A>, 49 | types: &BTreeMap<N, Type>, 50 | token_ty: Type, 51 | span_ty: Type, 52 | range_fn: Option<(Ident, Ident, Block)>, 53 | vis: Visibility, 54 | name: Ident, 55 | mut to_pat: FM, 56 | mut to_expr: FA, 57 | reduce_on: FR, 58 | priority_of: FO, 59 | ) -> Result<TokenStream, LR1Conflict<'a, T, N, A>> 60 | where 61 | T: Ord + fmt::Debug + fmt::Display, 62 | N: Ord + fmt::Debug, 63 | A: fmt::Debug, 64 | FM: FnMut(&T) -> TokenStream, 65 | FA: FnMut(&N, &A, &[Symbol<T, N>]) -> (TokenStream, Vec<Option<TokenStream>>, Span), 66 | FR: FnMut(&Rhs<T, N, A>, Option<&T>) -> bool, 67 | FO: FnMut(&Rhs<T, N, A>, Option<&T>) -> i32, 68 | { 69 | let actual_start = match grammar 70 | .rules 71 | .get(&grammar.start) 72 | .expect("Grammar didn't contain its start nonterminal")[0] 73 | .syms[0] 74 | { 75 | Terminal(_) => panic!("bad grammar"), 76 | Nonterminal(ref x) => x, 77 | }; 78 | let any_ty = quote!(::std::any::Any); 79 | let start_ty = &types[actual_start]; 80 | let table: LR1ParseTable<'a, T, N, A> = grammar.lalr1(reduce_on, priority_of)?; 81 | let rule_fn_ids: BTreeMap<_, _> = grammar 82 | .rules 83 | .iter() 84 | .filter(|&(lhs, _)| *lhs != grammar.start) 85 | .flat_map(|(_, rhss)| { 86 | // Identify rules by their RHS, which should have unique addresses 87 | // FIXME: maybe not the best idea 88 | rhss.iter().map(|rhs| rhs as *const _) 89 | }) 90 | .enumerate() 91 | .map(|(i, k)| (k, Ident::new(&format!("reduce_{}", i), Span::call_site()))) 92 | .collect(); 93 | let goto_fn_ids: BTreeMap<_, _> = grammar 94 | .rules 95 | .keys() 96 | .filter(|&lhs| *lhs != grammar.start) 97 | .enumerate() 98 | .map(|(i, lhs)| (lhs, Ident::new(&format!("goto_{}", i), Span::call_site()))) 99 | .collect(); 100 | 101 | let mut stmts = Vec::new(); 102 | 103 | stmts.push(if let Some((a, b, body)) = range_fn { 104 | quote!(fn range(#a:
#span_ty, #b: #span_ty) -> #span_ty { 105 | #body 106 | }) 107 | } else { 108 | quote!( 109 | fn range(_a: (), _b: ()) {} 110 | ) 111 | }); 112 | stmts.push( 113 | quote!(fn range_array(x: &[Option<#span_ty>]) -> Option<#span_ty> { 114 | if let Some(lo) = x.iter().filter_map(|&x| x).next() { 115 | let hi = x.iter().rev().filter_map(|&x| x).next().unwrap(); 116 | Some(range(lo, hi)) 117 | } else { 118 | None 119 | } 120 | }), 121 | ); 122 | // The `lhs`s which are actually unreachable; as such, their associated code should not be generated. 123 | let mut lhs_unreachable = BTreeSet::new(); 124 | for (lhs, id) in &goto_fn_ids { 125 | let expr = if let Some(&most_freq) = 126 | most_frequent(table.states.iter().filter_map(|state| state.goto.get(lhs))) 127 | { 128 | let most_freq = most_freq as u32; 129 | let mut pats_by_dest = BTreeMap::new(); 130 | for (ix, state) in table.states.iter().enumerate() { 131 | let ix = ix as u32; 132 | if let Some(&dest) = state.goto.get(lhs) { 133 | let dest = dest as u32; 134 | if dest != most_freq { 135 | pats_by_dest.entry(dest).or_insert(vec![]).push(quote!(#ix)); 136 | } 137 | } 138 | } 139 | let mut arms: Vec<_> = pats_by_dest 140 | .into_iter() 141 | .map(|(dest, pats)| quote!(#(#pats)|* => #dest,)) 142 | .collect(); 143 | arms.push(quote!(_ => #most_freq,)); 144 | quote!(match state { #(#arms)* }) 145 | } else { 146 | // This shouldn't normally happen, but it can when `lhs` is unused in the 147 | // grammar. 148 | lhs_unreachable.insert(lhs); 149 | continue; 150 | }; 151 | stmts.push(quote!(fn #id(state: u32) -> u32 { 152 | #expr 153 | })); 154 | } 155 | for (lhs, rhss) in &grammar.rules { 156 | if *lhs == grammar.start || lhs_unreachable.contains(&lhs) { 157 | continue; 158 | } 159 | let goto_fn = goto_fn_ids[lhs].clone(); 160 | let lhs_ty = &types[lhs]; 161 | for rhs in rhss.iter() { 162 | let (result, arg_pats, rhs_span) = to_expr(lhs, &rhs.act, &rhs.syms); 163 | let len = rhs.syms.len(); 164 | let current_span_stmt = if rhs.syms.len() > 0 { 165 | // Make the current_span available to the user by exposing it through a macro whose name is unhygienic. 166 | let current_span_ident = quote_spanned!(rhs_span => current_span); 167 | let span_macro = quote!( 168 | #[allow(unused_macros)] 169 | macro_rules! 
span { 170 | () => { #current_span_ident.unwrap() } 171 | } 172 | ); 173 | quote_spanned!(rhs_span => 174 | let current_span: Option<#span_ty> = { 175 | let sp = range_array(&span_stack[(span_stack.len() - #len)..]); 176 | let newlen = span_stack.len() - #len; 177 | span_stack.truncate(newlen); 178 | sp 179 | }; 180 | #span_macro 181 | ) 182 | } else { 183 | quote_spanned!(rhs_span => 184 | let current_span: Option<#span_ty> = None; 185 | ) 186 | }; 187 | let mut reduce_stmts = vec![current_span_stmt]; 188 | reduce_stmts.extend(rhs.syms.iter().zip(arg_pats.iter().cloned()).rev().map( 189 | |(sym, maybe_pat)| match maybe_pat { 190 | // TODO: maybe use an even more precise span 191 | Some(pat) => { 192 | let ty = match *sym { 193 | Terminal(_) => token_ty.clone(), 194 | Nonterminal(ref n) => types[n].clone(), 195 | }; 196 | quote_spanned!(rhs_span => 197 | let #pat: #ty = *stack.pop().unwrap().downcast().unwrap(); 198 | ) 199 | } 200 | None => quote_spanned!(rhs_span => stack.pop();), 201 | }, 202 | )); 203 | if rhs.syms.len() > 1 { 204 | let len_minus_one = rhs.syms.len() - 1; 205 | // XXX: Annoying syntax :( 206 | reduce_stmts.push( 207 | quote_spanned!(rhs_span => match state_stack.len() - #len_minus_one { x => state_stack.truncate(x) };), 208 | ); 209 | } else if rhs.syms.len() == 0 { 210 | reduce_stmts.push(quote_spanned!(rhs_span => state_stack.push(*state);)); 211 | } 212 | reduce_stmts.push(quote_spanned!( 213 | rhs_span => 214 | *state = #goto_fn(*state_stack.last().unwrap()); 215 | let result: #lhs_ty = ( || -> #lhs_ty { #result } )(); 216 | stack.push(Box::new(result) as Box<#any_ty>); 217 | span_stack.push(current_span); 218 | )); 219 | 220 | let fn_id = rule_fn_ids.get(&(rhs as *const _)).unwrap().clone(); 221 | stmts.push(quote_spanned!( 222 | rhs_span => 223 | fn #fn_id( 224 | stack: &mut Vec<Box<#any_ty>>, 225 | span_stack: &mut Vec<Option<#span_ty>>, 226 | state_stack: &mut Vec<u32>, 227 | state: &mut u32, 228 | ) { 229 | #(#reduce_stmts)* 230 | } 231 | )); 232 | } 233 | } 234 | stmts.push(quote!( 235 | let mut stack: Vec<Box<#any_ty>> = Vec::new(); 236 | let mut span_stack = Vec::new(); 237 | let mut state_stack = Vec::new(); 238 | let mut state: u32 = 0; 239 | let mut token_span = it.next(); 240 | )); 241 | stmts.push({ 242 | let state_arms = table.states.iter().enumerate().map(|(ix, state)| { 243 | let mut arms = vec![]; 244 | let mut reduce_arms = BTreeMap::new(); 245 | let mut expected = vec![]; 246 | for (&tok, action) in &state.lookahead { 247 | expected.push(format!("`{}`", tok)); 248 | let tok_pat = to_pat(tok); 249 | let pat = quote!(Some((#tok_pat, _))); 250 | let arm_expr = match *action { 251 | LRAction::Shift(dest) => dest as u32, 252 | LRAction::Reduce(_, rhs) => { 253 | reduce_arms 254 | .entry(rhs as *const _) 255 | .or_insert(vec![]) 256 | .push(pat); 257 | continue; 258 | } 259 | LRAction::Accept => unreachable!(), 260 | }; 261 | arms.push(quote!(#pat => #arm_expr,)); 262 | } 263 | if let Some(ref action) = state.eof { 264 | expected.push("end of file".into()); 265 | let pat = quote!(None); 266 | match *action { 267 | LRAction::Shift(_) => unreachable!(), 268 | LRAction::Reduce(_, rhs) => { 269 | reduce_arms 270 | .entry(rhs as *const _) 271 | .or_insert(vec![]) 272 | .push(pat); 273 | } 274 | LRAction::Accept => { 275 | arms.push( 276 | quote!(#pat => return Ok(*stack.pop().unwrap().downcast::<#start_ty>().unwrap()),), 277 | ); 278 | } 279 | }; 280 | } 281 | for (rhs_ptr, pats) in reduce_arms.into_iter() { 282 | let reduce_fn = rule_fn_ids[&rhs_ptr].clone(); 283 | arms.push(quote!(#(#pats)|* => {
#reduce_fn(&mut stack, &mut span_stack, &mut state_stack, &mut state); 285 | continue 286 | })); 287 | } 288 | let err_msg = expected_one_of(&expected); 289 | arms.push(quote!(_ => return Err((token_span, #err_msg)),)); 290 | let ix = ix as u32; 291 | quote!(#ix => match token_span { #(#arms)* }) 292 | }); 293 | quote!( 294 | loop { 295 | let next_state = match state { 296 | #(#state_arms)* 297 | _ => unreachable!(), 298 | }; 299 | match token_span { 300 | Some((token, span)) => { 301 | stack.push(Box::new(token) as Box<#any_ty>); 302 | span_stack.push(Some(span)); 303 | } 304 | None => unreachable!(), 305 | }; 306 | state_stack.push(state); 307 | token_span = it.next(); 308 | state = next_state; 309 | } 310 | ) 311 | }); 312 | // `quote` bug: can't quote `'static`, so use `&'a str` for any `'a`. hopefully this is fine. 313 | Ok(quote!( 314 | #vis fn #name<'a, I: Iterator<Item = (#token_ty, #span_ty)>>(mut it: I) -> Result<#start_ty, (Option<(#token_ty, #span_ty)>, &'a str)> { 315 | #(#stmts)* 316 | } 317 | )) 318 | } 319 | 320 | #[derive(Debug)] 321 | enum RuleRhsItem { 322 | Symbol(Ident), 323 | SymbolPat(Ident, Bracket, Pat), 324 | Destructure(Ident, Paren, Punctuated<Pat, Token![,]>), 325 | } 326 | 327 | impl ToTokens for RuleRhsItem { 328 | fn to_tokens(&self, tokens: &mut TokenStream) { 329 | match self { 330 | RuleRhsItem::Symbol(ident) => ident.to_tokens(tokens), 331 | RuleRhsItem::SymbolPat(ident, bracket, pat) => { 332 | ident.to_tokens(tokens); 333 | bracket.surround(tokens, |tokens| pat.to_tokens(tokens)); 334 | } 335 | RuleRhsItem::Destructure(ident, paren, pats) => { 336 | ident.to_tokens(tokens); 337 | paren.surround(tokens, |tokens| pats.to_tokens(tokens)); 338 | } 339 | } 340 | } 341 | } 342 | 343 | impl RuleRhsItem { 344 | fn ident(&self) -> &Ident { 345 | match *self { 346 | RuleRhsItem::Symbol(ref ident) 347 | | RuleRhsItem::SymbolPat(ref ident, _, _) 348 | | RuleRhsItem::Destructure(ref ident, _, _) => ident, 349 | } 350 | } 351 | } 352 | 353 | #[derive(Debug)] 354 | struct Rule { 355 | rhs: Vec<RuleRhsItem>, 356 | arrow: Token![=>], 357 | action: TokenStream, 358 | exclusions: BTreeSet<Ident>, 359 | exclude_eof: bool, 360 | priority: i32, 361 | } 362 | 363 | impl Rule { 364 | fn head(&self) -> proc_macro2::TokenStream { 365 | let rhs = &self.rhs; 366 | let arrow = &self.arrow; 367 | quote!(#(#rhs)* #arrow) 368 | } 369 | } 370 | 371 | fn parse_rules(input: ParseStream) -> syn::Result<Vec<Rule>> { 372 | let mut rules = vec![]; 373 | while !input.is_empty() { 374 | // FIXME: Make some nicer error messages.
375 | let mut exclusions = BTreeSet::new(); 376 | let mut exclude_eof = false; 377 | let mut priority = 0; 378 | let attrs = Attribute::parse_outer(input)?; 379 | for attr in attrs { 380 | match attr.meta { 381 | Meta::List(_) if attr.path().is_ident("no_reduce") => { 382 | attr.parse_nested_meta(|nested| { 383 | if let Some(ident) = nested.path.get_ident() { 384 | if ident == "EOF" { 385 | exclude_eof = true; 386 | } else { 387 | exclusions.insert(ident.clone()); 388 | } 389 | Ok(()) 390 | } else { 391 | Err(nested.error("invalid syntax: no_reduce list includes a non-token")) 392 | } 393 | })?; 394 | } 395 | Meta::Path(_) if attr.path().is_ident("overriding") => { 396 | priority = 1; 397 | } 398 | _ => { 399 | return Err(Error::new_spanned(attr, "unknown attribute")); 400 | } 401 | } 402 | } 403 | let mut rhs = vec![]; 404 | while !input.peek(Token![=>]) { 405 | let ident: Ident = input.parse()?; 406 | rhs.push(if input.peek(token::Bracket) { 407 | let inner; 408 | let bracket = bracketed!(inner in input); 409 | let pat = Pat::parse_single(&inner)?; 410 | if !inner.is_empty() { 411 | return Err(inner.error("unexpected token after pattern")); 412 | } 413 | RuleRhsItem::SymbolPat(ident, bracket, pat) 414 | } else if input.peek(token::Paren) { 415 | let inner; 416 | let paren = parenthesized!(inner in input); 417 | let pats = inner.parse_terminated(Pat::parse_single, Token![,])?; 418 | RuleRhsItem::Destructure( 419 | ident, 420 | paren, 421 | pats.into_pairs().map(|p| p.into_value()).collect(), 422 | ) 423 | } else { 424 | RuleRhsItem::Symbol(ident) 425 | }); 426 | } 427 | let arrow = input.parse::<Token![=>]>()?; 428 | // Like in a `match` expression, a braced block doesn't require a comma before the next rule. 429 | let optional_comma = input.peek(token::Brace); 430 | let action: Expr = input.parse()?; 431 | rules.push(Rule { 432 | rhs, 433 | arrow, 434 | action: action.into_token_stream(), 435 | exclusions, 436 | exclude_eof, 437 | priority, 438 | }); 439 | match <Token![,]>::parse(input) { 440 | Ok(_) => {} 441 | Err(e) => { 442 | if !input.is_empty() && !optional_comma { 443 | return Err(e); 444 | } 445 | } 446 | } 447 | } 448 | Ok(rules) 449 | } 450 | 451 | struct RuleSet { 452 | lhs: Ident, 453 | return_ty: Type, 454 | rules: Vec<Rule>, 455 | } 456 | 457 | impl Parse for RuleSet { 458 | fn parse(input: ParseStream) -> syn::Result<Self> { 459 | Ok(RuleSet { 460 | lhs: input.parse()?, 461 | return_ty: { 462 | input.parse::<Token![:]>()?; 463 | input.parse()? 464 | }, 465 | rules: { 466 | let rule_content; 467 | braced!(rule_content in input); 468 | parse_rules(&rule_content)? 469 | }, 470 | }) 471 | } 472 | } 473 | 474 | struct Parser { 475 | vis: Visibility, 476 | name: Ident, 477 | token_ty: Type, 478 | span_ty: Type, 479 | range_fn: Option<(Ident, Ident, Block)>, 480 | rule_sets: Vec<RuleSet>, 481 | } 482 | 483 | impl Parse for Parser { 484 | fn parse(input: ParseStream) -> syn::Result<Self> { 485 | let span_ty; 486 | Ok(Parser { 487 | vis: input.parse()?, 488 | name: { 489 | input.parse::<Token![fn]>()?; 490 | input.parse()?
491 | }, 492 | token_ty: { 493 | let inner; 494 | parenthesized!(inner in input); 495 | let token = inner.parse()?; 496 | inner.parse::<Token![,]>()?; 497 | span_ty = inner.parse()?; 498 | if !inner.is_empty() { 499 | return Err(inner.error("unexpected token after span type")); 500 | } 501 | input.parse::<Token![;]>()?; 502 | token 503 | }, 504 | span_ty, 505 | range_fn: { 506 | if input.peek(token::Paren) { 507 | let inner; 508 | parenthesized!(inner in input); 509 | let a = inner.parse()?; 510 | inner.parse::<Token![,]>()?; 511 | let b = inner.parse()?; 512 | if !inner.is_empty() { 513 | return Err(inner.error("unexpected token after second range")); 514 | } 515 | let body = input.parse::<Block>()?; 516 | Some((a, b, body)) 517 | } else { 518 | None 519 | } 520 | }, 521 | rule_sets: { 522 | let mut r = vec![]; 523 | while !input.is_empty() { 524 | r.push(input.parse()?); 525 | } 526 | r 527 | }, 528 | }) 529 | } 530 | } 531 | 532 | fn pretty_rule(lhs: Ident, syms: &[Symbol<Ident, Ident>]) -> String { 533 | let mut r = String::new(); 534 | let _ = write!(&mut r, "{} ->", lhs); 535 | for sym in syms.iter() { 536 | let _ = write!(&mut r, " {}", sym); 537 | } 538 | r 539 | } 540 | 541 | // Pretty-print an item set, for error messages. 542 | fn pretty(x: &ItemSet<Ident, Ident, &Rule>, pad: &str) -> String { 543 | let mut r = String::new(); 544 | let mut first = true; 545 | for item in x.items.iter() { 546 | if first { 547 | first = false; 548 | } else { 549 | let _ = write!(&mut r, "\n{}", pad); 550 | } 551 | let _ = write!(&mut r, "{} ->", item.lhs); 552 | for j in 0..item.pos { 553 | let _ = write!(&mut r, " {}", item.rhs.syms[j]); 554 | } 555 | let _ = write!(&mut r, " •"); 556 | for j in item.pos..item.rhs.syms.len() { 557 | let _ = write!(&mut r, " {}", item.rhs.syms[j]); 558 | } 559 | } 560 | r 561 | } 562 | 563 | pub fn parser(input: proc_macro::TokenStream) -> proc_macro::TokenStream { 564 | let parser = parse_macro_input!(input as Parser); 565 | let fake_rule; // N.B.
must go before `rules` to appease dropck 566 | let mut rules = BTreeMap::new(); 567 | let mut types = BTreeMap::new(); 568 | let mut start = None; 569 | let mut errors = vec![]; 570 | for rule_set in &parser.rule_sets { 571 | // parse "LHS: Type {" 572 | let lhs = &rule_set.lhs; 573 | if start.is_none() { 574 | start = Some(lhs.clone()); 575 | } 576 | match rules.entry(lhs.clone()) { 577 | Entry::Occupied(ent) => { 578 | errors.push(Error::new_spanned(lhs, "duplicate nonterminal")); 579 | errors.push(Error::new_spanned( 580 | ent.key(), 581 | "the first definition is here", 582 | )); 583 | } 584 | Entry::Vacant(ent) => { 585 | types.insert(lhs.clone(), rule_set.return_ty.clone()); 586 | ent.insert(&rule_set.rules); 587 | } 588 | } 589 | } 590 | let start = if let Some(start) = start { 591 | start 592 | } else { 593 | return Error::new(Span::call_site(), "at least one nonterminal is required") 594 | .into_compile_error() 595 | .into(); 596 | }; 597 | let mut rules: BTreeMap<Ident, Vec<Rhs<Ident, Ident, &Rule>>> = rules 598 | .into_iter() 599 | .map(|(lhs, rules)| { 600 | let rhss = rules 601 | .iter() 602 | .map(|rule| { 603 | // figure out which symbols in `rule` are nonterminals vs terminals 604 | let syms = rule 605 | .rhs 606 | .iter() 607 | .map(|tok| { 608 | let ident = tok.ident().clone(); 609 | if types.contains_key(&ident) { 610 | Nonterminal(ident) 611 | } else { 612 | Terminal(ident) 613 | } 614 | }) 615 | .collect(); 616 | Rhs { 617 | syms: syms, 618 | act: rule, 619 | } 620 | }) 621 | .collect(); 622 | (lhs, rhss) 623 | }) 624 | .collect(); 625 | let fake_start = Ident::new("__FIXME__start", Span::call_site()); 626 | fake_rule = Rule { 627 | rhs: vec![], 628 | arrow: Default::default(), 629 | action: quote!(), 630 | exclusions: BTreeSet::new(), 631 | exclude_eof: false, 632 | priority: -1, 633 | }; 634 | rules.insert( 635 | fake_start.clone(), 636 | vec![Rhs { 637 | syms: vec![Nonterminal(start)], 638 | act: &fake_rule, 639 | }], 640 | ); 641 | let grammar = Grammar { 642 | rules: rules, 643 | start: fake_start, 644 | }; 645 | let result = lr1_machine( 646 | &grammar, 647 | &types, 648 | parser.token_ty, 649 | parser.span_ty, 650 | parser.range_fn, 651 | parser.vis, 652 | parser.name, 653 | |ident| quote!(#ident { .. }), 654 | |lhs, act, syms| { 655 | let mut expr = act.action.clone().into_token_stream(); 656 | let mut args = vec![]; 657 | debug_assert_eq!(syms.len(), act.rhs.len()); 658 | for (i, (sym, x)) in syms.iter().zip(&act.rhs).enumerate() { 659 | args.push(match *x { 660 | RuleRhsItem::SymbolPat(_, _, ref pat) => Some(pat.clone().into_token_stream()), 661 | RuleRhsItem::Destructure(ref ident, _, ref pats) => { 662 | let id = Ident::new(&format!("s{}", i), Span::call_site()); 663 | let terminal = match *sym { 664 | Nonterminal(_) => { 665 | errors.push(Error::new_spanned( 666 | x, 667 | "can't bind enum case to a nonterminal", 668 | )); 669 | Ident::new("__error", Span::call_site()) 670 | } 671 | Terminal(ref x) => { 672 | debug_assert_eq!(*x, ident.to_string()); 673 | x.clone() 674 | } 675 | }; 676 | expr = quote_spanned!(act.head().span() => 677 | { 678 | // force a by-move capture 679 | match {#id} { 680 | #terminal(#pats) => #expr, 681 | _ => unreachable!(), 682 | } 683 | }); 684 | Some(id.into_token_stream()) 685 | } 686 | RuleRhsItem::Symbol(_) => None, 687 | }); 688 | } 689 | 690 | // XXX: should be a cargo feature (?)
691 | if false { 692 | let rule_str = pretty_rule(lhs.clone(), syms); 693 | expr = quote!({ 694 | println!("reduce by {}", #rule_str); 695 | #expr 696 | }); 697 | } 698 | 699 | (expr, args, act.head().span()) 700 | }, 701 | |rhs, token| match token { 702 | Some(id) => !rhs.act.exclusions.contains(&id), 703 | None => !rhs.act.exclude_eof, 704 | }, 705 | |rhs, _| rhs.act.priority, 706 | ) 707 | .unwrap_or_else(|conflict| { 708 | match conflict { 709 | LR1Conflict::ReduceReduce { 710 | state, 711 | token, 712 | r1, 713 | r2, 714 | } => { 715 | let mut error = Error::new_spanned( 716 | r1.1.act.head(), 717 | format!( 718 | "reduce-reduce conflict: 719 | state: {} 720 | token: {}", 721 | pretty(&state, " "), 722 | match token { 723 | Some(id) => id.to_string(), 724 | None => "EOF".to_string(), 725 | } 726 | ), 727 | ); 728 | error.combine(Error::new_spanned( 729 | r2.1.act.head(), 730 | "this is the conflicting rule", 731 | )); 732 | error 733 | } 734 | LR1Conflict::ShiftReduce { state, token, rule } => Error::new_spanned( 735 | rule.1.act.head(), 736 | format!( 737 | "shift-reduce conflict: 738 | state: {} 739 | token: {}", 740 | pretty(&state, " "), 741 | match token { 742 | Some(id) => id.to_string(), 743 | None => "EOF".to_string(), 744 | } 745 | ), 746 | ), 747 | } 748 | .into_compile_error() 749 | }); 750 | 751 | let errors = errors.into_iter().map(|e| e.into_compile_error()).collect::<TokenStream>(); 752 | quote!(#errors #result).into() 753 | } 754 | -------------------------------------------------------------------------------- /tests/ui.rs: -------------------------------------------------------------------------------- 1 | #[test] 2 | fn ui() { 3 | let t = trybuild::TestCases::new(); 4 | t.compile_fail("tests/ui/*.rs"); 5 | } 6 | -------------------------------------------------------------------------------- /tests/ui/if_else_shift_reduce.rs: -------------------------------------------------------------------------------- 1 | //! This test demonstrates the classic if/else ambiguity in C, 2 | //! which results in a shift-reduce conflict in the LR(1) grammar. 3 | 4 | enum Token { 5 | If, 6 | Else, 7 | Var(i32), 8 | Semi, 9 | LParen, 10 | RParen, 11 | } 12 | type Span = (); 13 | enum Expr { 14 | Var(i32), 15 | } 16 | enum Statement { 17 | If(Expr, Box<Statement>), 18 | IfElse(Expr, Box<Statement>, Box<Statement>), 19 | } 20 | #[allow(unused_imports)] 21 | use self::Token::*; 22 | plex::parser! { 23 | fn parse_(Token, Span); 24 | 25 | statements: Vec<Statement> { 26 | => vec![], 27 | statements[mut st] statement[e] Semi => { 28 | st.push(e); 29 | st 30 | } 31 | } 32 | 33 | statement: Statement { 34 | // The shift-reduce conflict applies here.
35 | // To resolve this conflict, add the following attribute, which effectively causes the ambiguity 36 | // to be resolved in favour of attaching the "else" to the inner "if" statement: 37 | // #[no_reduce(Else)] 38 | If LParen expr[head] RParen statement[then] => Statement::If( 39 | head, 40 | Box::new(then), 41 | ), 42 | If LParen expr[head] RParen statement[then] Else statement[else_] => Statement::IfElse( 43 | head, 44 | Box::new(then), 45 | Box::new(else_), 46 | ), 47 | } 48 | 49 | expr: Expr { 50 | Var(i) => Expr::Var(i), 51 | } 52 | } 53 | 54 | fn main() { 55 | } 56 | -------------------------------------------------------------------------------- /tests/ui/if_else_shift_reduce.stderr: -------------------------------------------------------------------------------- 1 | error: shift-reduce conflict: 2 | state: statement -> If LParen expr RParen statement • 3 | statement -> If LParen expr RParen statement • Else statement 4 | token: Else 5 | --> tests/ui/if_else_shift_reduce.rs:38:9 6 | | 7 | 38 | If LParen expr[head] RParen statement[then] => Statement::If( 8 | | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 9 | -------------------------------------------------------------------------------- /tests/ui/reduce_reduce.rs: -------------------------------------------------------------------------------- 1 | //! This test demonstrates a reduce-reduce conflict. 2 | 3 | enum Token { 4 | Lit(i32), 5 | Var(i32), 6 | Minus, 7 | } 8 | type Span = (); 9 | enum Expr { 10 | Lit(i32), 11 | Var(i32), 12 | Negate(Box<Expr>), 13 | } 14 | #[allow(unused_imports)] 15 | use self::Token::*; 16 | plex::parser! { 17 | fn parse_(Token, Span); 18 | 19 | expr: Expr { 20 | Var(i) => Expr::Var(i), 21 | Lit(i) => Expr::Lit(i), 22 | // These rules induce a reduce-reduce conflict. 23 | // The intent here is to create a special case where `Minus` followed by a `Lit` is parsed as a negative literal, 24 | // rather than a negation expression. 25 | // To implement that, use the attribute: 26 | // #[overriding] 27 | Minus Lit(i) => Expr::Lit(-i), 28 | Minus expr[e] => Expr::Negate(Box::new(e)), 29 | } 30 | } 31 | 32 | fn main() {} 33 | --------------------------------------------------------------------------------