├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .vim └── coc-settings.json ├── CHANGELOG.md ├── Cargo.toml ├── LICENSE ├── README.md └── crates ├── char_range_gen ├── Cargo.toml └── src │ └── main.rs ├── lexgen ├── Cargo.toml ├── benches │ └── benchmarks.rs ├── src │ ├── ast.rs │ ├── builtin.rs │ ├── char_ranges.rs │ ├── collections.rs │ ├── dfa.rs │ ├── dfa │ │ ├── backtrack.rs │ │ ├── codegen.rs │ │ ├── codegen │ │ │ ├── ctx.rs │ │ │ └── search_table.rs │ │ ├── simplify.rs │ │ └── simulate.rs │ ├── display.rs │ ├── lib.rs │ ├── nfa.rs │ ├── nfa │ │ └── simulate.rs │ ├── nfa_to_dfa.rs │ ├── range_map.rs │ ├── regex_to_nfa.rs │ ├── right_ctx.rs │ ├── semantic_action_table.rs │ └── tests.rs └── tests │ ├── bugs.rs │ ├── lua_5_1.rs │ ├── right_ctx.rs │ ├── test_data │ ├── test_utils.rs │ └── tests.rs ├── lexgen_lalrpop_example ├── Cargo.toml ├── build.rs └── src │ ├── interpolation.lalrpop │ └── lib.rs └── lexgen_util ├── Cargo.toml ├── README.md └── src └── lib.rs /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | name: 'Build and test' 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | 12 | - name: Get stable toolchain 13 | uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: stable 16 | override: true 17 | 18 | - name: Build 19 | run: cargo build --verbose 20 | 21 | - name: Test 22 | run: cargo test 23 | 24 | formatting: 25 | name: 'Check formatting' 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v2 29 | 30 | - name: Get Rust stable toolchain 31 | uses: actions-rs/toolchain@v1 32 | with: 33 | toolchain: stable 34 | components: rustfmt 35 | override: true 36 | 37 | - name: Check formatting 38 | run: cargo fmt --all -- --check 39 | 40 | clippy: 41 | name: 'Check lints' 42 | runs-on: ubuntu-latest 43 | steps: 44 | - uses: actions/checkout@v2 45 | 46 | - name: Get Rust stable toolchain 47 | uses: actions-rs/toolchain@v1 48 | with: 49 | toolchain: stable 50 | override: true 51 | 52 | - name: Check lints 53 | run: cargo clippy --all-targets --all 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /char_range_gen/target 3 | Cargo.lock 4 | -------------------------------------------------------------------------------- /.vim/coc-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "rust-analyzer.checkOnSave.allTargets": false 3 | } 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 2024/12/24: 0.16.0 2 | 3 | - Update lexgen rustc-hash dependency from 1.1.0 to 2.0.0, lexgen_util 4 | unicode-width dependency from 0.1.10 to 0.2.0. 5 | 6 | - Lexers can now use `pub(crate)` visibility, and other visibilities supported 7 | by Rust and the `syn` crate. Previously only `pub` was supported. 8 | 9 | - Eliminate redundant `backtrack` calls in generated code, improving code size 10 | and runtime performance. Runtime performance improved 13% in a benchmark. 11 | (#69) 12 | 13 | # 2023/09/03: 0.15.0 14 | 15 | - Lexer type declarations can now have outer attributes other than just 16 | `#[derive(Clone)]`. Example: 17 | ```rust 18 | lexer! { 19 | /// A lexer for Rust. 
20 | #[derive(Debug, Clone)] 21 | pub RustLexer(LexerState) -> RustToken; 22 | 23 | ... 24 | } 25 | ``` 26 | The attributes are directly copied to the generated `struct`. In the example, 27 | the documentation and `derive` attribute will be copied to the generated 28 | `struct`: 29 | ```rust 30 | /// A lexer for Rust. 31 | #[derive(Debug, Clone)] 32 | pub struct RustLexer<...>(...); 33 | ``` 34 | 35 | - `lexgen_util::Lexer` type now derives `Debug` (in addition to `Clone`). This 36 | makes it possible to derive `Debug` in generated lexers. 37 | 38 | - `syn` dependency updated to version 2. 39 | 40 | # 2023/04/23: 0.14.0 41 | 42 | - **Breaking change:** Rules without a right-hand side (e.g. `$$whitespace,`) 43 | now always reset the current match. Previously such rules would only reset 44 | the current match in `Init`. (#12) 45 | 46 | To migrate, add a semantic action to your rule that just calls `continue_()` 47 | on the lexer. For example, if you have `$$whitespace,`, replace it with: 48 | 49 | ```rust 50 | $$whitespace => |lexer| lexer.continue_(), 51 | ``` 52 | 53 | - `clippy::manual_is_ascii_check` violations are now ignored in generated code. 54 | 55 | # 2023/04/10: 0.13.0 56 | 57 | - Fix more `manual_range_contains` lints in generated code. 58 | 59 | - `let` bindings can now appear inside `rule`s. Previously `let`s were only 60 | allowed at the top-level. (#28) 61 | 62 | - You can now add `#[derive(Clone)]` before the lexer type name to implement 63 | `Clone` for the lexer type. This can be used to implement backtracking 64 | parsers. Example: 65 | ```rust 66 | lexer! { 67 | #[derive(Clone)] 68 | pub Lexer -> Token; 69 | // The struct `Lexer` will implement `Clone` 70 | 71 | ... 72 | } 73 | ``` 74 | 75 | # 2022/08/12: 0.12.0 76 | 77 | - Fix `double_comparison`, `manual_range_contains` lints in generated code. 78 | (0ecb0b1) 79 | 80 | - Lexer constructors `new_with_state` and `new_from_iter_with_state` no longer 81 | require user state to implement `Default`. (#54) 82 | 83 | - User state can now have lifetime parameters other than `'input`. (#53) 84 | 85 | # 2022/05/15: 0.11.0 86 | 87 | - Lexer state is now reset on failure. (#48) 88 | 89 | # 2022/02/20: 0.10.0 90 | 91 | - Generated lexers now have two new constructors: 92 | 93 | - `new_from_iter + Clone>(iter: I) -> Self` 94 | - `new_from_iter_with_state + Clone, S>(iter: I, user_state: S) -> Self` 95 | 96 | These constructors allow running a lexer on a character iterator instead of a 97 | string slice. Generated lexers work exactly the same way, except the `match_` 98 | method panics when called. 99 | 100 | Locations of matches can be obtained with the `match_loc(&self) -> (Loc, 101 | Loc)` method. 102 | 103 | These constructors are useful when the input is not a flat unicode string, 104 | but something like a rope, gap array, zipper, etc. (#41) 105 | 106 | - `lexgen_util::Loc` now implements `Default`. This makes it easier to use 107 | lexgen with [LALRPOP]. (#44) 108 | 109 | [LALRPOP]: https://github.com/lalrpop/lalrpop 110 | 111 | # 2022/01/31: 0.9.0 112 | 113 | - New regex syntax `#` added for character set difference, e.g. `re1 # re2` 114 | matches characters in `re1` that are not in `re2`. `re1` and `re2` need to be 115 | "character sets", i.e. `*`, `+`, `?`, `"..."`, `$`, and concatenation are not 116 | allowed. 117 | 118 | - **Breaking change:** `LexerError` type is refactored to add location 119 | information to all errors, not just `InvalidToken`. 
Previously the type was: 120 | 121 | ```rust 122 | #[derive(Debug, Clone, PartialEq, Eq)] 123 | pub enum LexerError { 124 | InvalidToken { 125 | location: Loc, 126 | }, 127 | 128 | /// Custom error, raised by a semantic action 129 | Custom(E), 130 | } 131 | ``` 132 | 133 | with this change, it is now: 134 | 135 | ```rust 136 | #[derive(Debug, Clone, PartialEq, Eq)] 137 | pub struct LexerError { 138 | pub location: Loc, 139 | pub kind: LexerErrorKind, 140 | } 141 | 142 | #[derive(Debug, Clone, PartialEq, Eq)] 143 | pub enum LexerErrorKind { 144 | /// Lexer error, raised by lexgen-generated code 145 | InvalidToken, 146 | 147 | /// Custom error, raised by a semantic action 148 | Custom(E), 149 | } 150 | ``` 151 | 152 | - A new syntax added for right contexts. A right context is basically 153 | lookahead, but can only be used in rules and cannot be nested inside regexes. 154 | See README for details. (#29) 155 | 156 | # 2021/11/30: 0.8.1 157 | 158 | New version published to fix broken README pages for lexgen and lexgen_util in 159 | crates.io. 160 | 161 | # 2021/10/30: 0.8.0 162 | 163 | - **Breaking change:** Starting with this release, lexgen-generated lexers now 164 | depend on `lexgen_util` package of the same version. If you are using lexgen 165 | version 0.8 or newer, make sure to add `lexgen_util = "..."` to your 166 | `Cargo.toml`, using the same version number as `lexgen`. 167 | 168 | - Common code in generated code is moved to a new crate `lexgen_util`. 169 | lexgen-generated lexers now depend on `lexgen_util`. 170 | 171 | - **Breaking change:** Line and column tracking implemented. Iterator 172 | implementation now yields `(Loc, Token, Loc)`, where `Loc` is defined in 173 | `lexgen_util` as `struct Loc { line: u32, col: u32, byte_idx: usize }`. 174 | 175 | - Fixed a bug when initial state of a rule does not have any transitions (rule 176 | is empty). (#27, 001ea51) 177 | 178 | - Fixed a bug in codegen that caused accidental backtracking in some cases. 179 | (#27, 001ea51) 180 | 181 | - Fixed a bug that caused incorrect lexing when a lexer state has both range 182 | and any (`_`) transitions. (#31) 183 | 184 | # 2021/10/21: 0.7.0 185 | 186 | - Regex syntax updated to include "any character" (`_`) and "end of input" 187 | (`$`). 188 | 189 | Previously "any character" (`_`) could be used as a rule left-hand side, but 190 | was not allowed in regexes. 191 | 192 | - Semantic action functions that use user state (`state` method of the lexer 193 | handle) no longer need `mut` modifier in the handle argument. 194 | 195 | This will generate warnings in old code with semantic actions that take a 196 | `mut` argument. 197 | 198 | - New lexer method `reset_match` implemented to reset the current match. 199 | 200 | # 2021/10/19: 0.6.0 201 | 202 | - Fixed precedences of concatenation (juxtaposition) and alternation (`|`). 203 | 204 | - Fixed lexing in lexers that require backtracking to implement longest match 205 | rule. (#16) 206 | 207 | # 2021/10/07: 0.5.0 208 | 209 | - Accepting states without transitions are now simplified in compile time and 210 | semantic actions of such states are inlined in the states that make a 211 | transition to such accepting states. In Lua 5.1 lexer this reduces a 212 | benchmark's runtime by 14.9%. (#7) 213 | 214 | Note that this potentially duplicates a lot of code in the generated code 215 | when some states have large semantic action codes and lots of incoming edges 216 | in the DFA. However in practice I haven't observed this yet. 
(#8) 217 | 218 | - DFA states with one predecessor are now inlined in the predecessor states. 219 | This reduces code size and improves runtime performance. (33547ec) 220 | 221 | - We now reset the current match after returning a token (with `return_` and 222 | `switch_and_return`). (#11) 223 | 224 | # 2021/05/30: 0.4.0 225 | 226 | - lexgen now comes with a set of built-in regular expressions for matching 227 | Unicode alphanumerics, uppercases, whitespaces etc. See README for details. 228 | 229 | - Fixed a few issues with end-of-stream handling (cbaabe2) 230 | 231 | # 2021/05/28: 0.3.0 232 | 233 | - Fixed handling of overlapping ranges in a single NFA/DFA state. (#3) 234 | 235 | # 2021/05/16: 0.2.2 236 | 237 | - `LexerError` type now implements `Clone` and `Copy`. 238 | 239 | # 2021/05/06: 0.2.1 240 | 241 | - Fixed various bugs in `_` pattern handling. 242 | 243 | # 2021/05/05: 0.2.0 244 | 245 | - It is now possible to use the special lifetime `'input` in your token types 246 | to borrow from the input string. Example: 247 | 248 | ```rust 249 | enum Token<'input> { 250 | Id(&'input str), 251 | } 252 | 253 | lexer! { 254 | Lexer -> Token<'input>; 255 | 256 | rule Init { 257 | [' ' '\t' '\n']; // skip whitespace 258 | 259 | ['a'-'z']+ => |lexer| { 260 | let match_ = lexer.match_(); 261 | lexer.return_(Token::Id(match_)) 262 | }, 263 | } 264 | } 265 | ``` 266 | 267 | See also the Lua 5.1 lexer example, which is updated to use this feature. 268 | 269 | - The `rule Init { ... }` syntax can now be omitted when you don't need named 270 | rule sets. For example, the example in the previous changelog entry can be 271 | simplified as: 272 | 273 | ```rust 274 | lexer! { 275 | Lexer -> Token<'input>; 276 | 277 | [' ' '\t' '\n'], // skip whitespace 278 | 279 | ['a'-'z']+ => |lexer| { 280 | let match_ = lexer.match_(); 281 | lexer.return_(Token::Id(match_)) 282 | }, 283 | } 284 | ``` 285 | 286 | - `pub` keyword before a lexer name now generates the type as `pub`. Useful for 287 | using the generated lexer in other modules. Example: 288 | 289 | ```rust 290 | lexer! { 291 | pub Lexer -> Token; 292 | 293 | ... 294 | } 295 | ``` 296 | 297 | - Two new action kinds: "fallible" and "simple" added. The old ones defined 298 | with `=>` are now called "infallible". 299 | 300 | - "fallible" actions are defined with `=?` instead of `=>`. The difference 301 | from infallible actions is the return type is `Result`, 302 | instead of `Token`, where `UserError` is defined using `type Error = ...;` 303 | syntax. LHS can have a `<'input>` lifetime parameter when borrowing from 304 | the user input in the error values. When a user error type is defined, the 305 | lexer error struct becomes an enum, with two variants: 306 | 307 | ```rust 308 | enum LexerError { 309 | LexerError { char_idx: usize }, 310 | UserError(UserError), 311 | } 312 | ``` 313 | 314 | - "simple" actions are defined with `=` instead of `=>`. The RHS needs to be a 315 | value for a token, instead of a closure for a lexer action. This rule kind is 316 | useful when matching keywords and other simple tokens in a language. Example: 317 | 318 | ```rust 319 | lexer! { 320 | Lexer -> Token; 321 | 322 | '(' = Token::LParen, 323 | ')' = Token::RParen, 324 | ... 325 | } 326 | ``` 327 | 328 | The syntax ` = ` is syntactic sugar for ` => |lexer| 329 | lexer.return_()`. 
330 | 331 | # 2021/04/22: 0.1.0 332 | 333 | - Initial release 334 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = [ 4 | "crates/char_range_gen", 5 | "crates/lexgen", 6 | "crates/lexgen_lalrpop_example", 7 | "crates/lexgen_util", 8 | ] 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021 Ömer Sinan Ağacan 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lexgen: A fully-featured lexer generator, implemented as a proc macro 2 | 3 | ```rust 4 | use lexgen::lexer; 5 | use lexgen_util::Loc; 6 | 7 | lexer! { 8 | // First line specifies name of the lexer and the token type returned by 9 | // semantic actions 10 | Lexer -> Token; 11 | 12 | // Regular expressions can be named with `let` syntax 13 | let init = ['a'-'z']; 14 | let subseq = $init | ['A'-'Z' '0'-'9' '-' '_']; 15 | 16 | // Rule sets have names. Each rule set is compiled to a separate DFA. 17 | // Switching between rule sets is done explicitly in semantic actions. 18 | rule Init { 19 | // Rules without a right-hand side for skipping whitespace, 20 | // comments, etc. 21 | [' ' '\t' '\n']+, 22 | 23 | // Rule for matching identifiers 24 | $init $subseq* => |lexer| { 25 | let token = Token::Id(lexer.match_().to_owned()); 26 | lexer.return_(token) 27 | }, 28 | } 29 | } 30 | 31 | // The token type 32 | #[derive(Debug, PartialEq, Eq)] 33 | enum Token { 34 | // An identifier 35 | Id(String), 36 | } 37 | 38 | // Generated lexers are initialized with a `&str` for the input 39 | let mut lexer = Lexer::new(" abc123Q-t z9_9"); 40 | 41 | // Lexers implement `Iterator>`, 42 | // where `T` is the token type specified in the lexer definition (`Token` in 43 | // this case), and `Loc`s indicate line, column, and byte indices of 44 | // beginning and end of the lexemes. 
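// (As the asserts below show, `line` and `col` in `Loc` are 0-based, and `byte_idx` is the byte offset into the input.)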
45 | assert_eq!( 46 | lexer.next(), 47 | Some(Ok(( 48 | Loc { line: 0, col: 1, byte_idx: 1 }, 49 | Token::Id("abc123Q-t".to_owned()), 50 | Loc { line: 0, col: 10, byte_idx: 10 } 51 | ))) 52 | ); 53 | assert_eq!( 54 | lexer.next(), 55 | Some(Ok(( 56 | Loc { line: 0, col: 12, byte_idx: 12 }, 57 | Token::Id("z9_9".to_owned()), 58 | Loc { line: 0, col: 16, byte_idx: 16 } 59 | ))) 60 | ); 61 | assert_eq!(lexer.next(), None); 62 | ``` 63 | 64 | See also: 65 | 66 | - [Simple lexer definitions in tests][1] 67 | - [A full Lua 5.1 lexer][2] 68 | - [An example that uses lexgen with LALRPOP][3] 69 | - [A lexer for a simpler version of OCaml][4] 70 | - [A Rust lexer][5] 71 | - [A parse event generator][6] 72 | 73 | ## Motivation 74 | 75 | Implementing lexing is often (along with parsing) the most tedious part of 76 | implementing a language. Lexer generators make this much easier, but in Rust 77 | existing lexer generators miss essential features for practical use, and/or 78 | require a pre-processing step when building. 79 | 80 | My goal with lexgen is to have a feature-complete and easy-to-use lexer 81 | generator. 82 | 83 | ## Usage 84 | 85 | lexgen doesn't require a build step. Add the same versions of `lexgen` and 86 | `lexgen_util` as dependencies in your `Cargo.toml`. 87 | 88 | ## Lexer syntax 89 | 90 | lexgen lexers start with the name of the generated lexer struct, an optional user 91 | state part, and the token type (the type of values returned by semantic actions). 92 | Example: 93 | 94 | ```rust 95 | lexer! { 96 | Lexer(LexerState) -> Token; 97 | ... 98 | } 99 | ``` 100 | 101 | Here the generated lexer type will be named `Lexer`. The user state type is 102 | `LexerState` (this type should be defined by the user). The token type is 103 | `Token`. 104 | 105 | After the lexer name and the user state and token types, we define the rules: 106 | 107 | ```rust 108 | rule Init { 109 | ... 110 | } 111 | 112 | rule SomeOtherRule { 113 | ... 114 | } 115 | ``` 116 | 117 | The first rule set defines the initial state of the lexer and needs to 118 | be named `Init`. 119 | 120 | In the body of a `rule` block we define the rules for that lexer state. The 121 | syntax for a rule is `<regex> => <semantic action>,`. Regex syntax is described 122 | below. A semantic action is any Rust code with the type `fn(LexerHandle) -> 123 | LexerAction` where `LexerHandle` and `LexerAction` are generated names derived 124 | from the lexer name (`Lexer` in our example). More on these types below. 125 | 126 | Regular expressions can be named with `let <name> = <regex>;` syntax. Example: 127 | 128 | ```rust 129 | let init = ['a'-'z']; 130 | let subseq = $init | ['A'-'Z' '0'-'9' '-' '_']; 131 | 132 | // Named regexes can be used with the `$` prefix 133 | $init $subseq* => |lexer| { ... } 134 | ``` 135 | 136 | You can omit the `rule Init { ... }` part and have all of your rules at the top 137 | level if you don't need rule sets. 138 | 139 | In summary: 140 | 141 | - The first line is in the form `<lexer name>(<user state type>) -> <token type>`. 142 | The `(<user state type>)` part can be omitted for stateless lexers. 143 | 144 | - Next are the rule sets. There should be at least one rule set with the name 145 | `Init`, which is the name of the initial state. 146 | 147 | - `let` bindings can be added at the top level or in `rule`s. 148 | 149 | ## Regex syntax 150 | 151 | Regex syntax can be used on the right-hand side of `let` bindings and the left-hand side 152 | of rules. The syntax is: 153 | 154 | - `$var` for variables defined in the `let` binding section. Variables need to be 155 | defined before they are used.
156 | - `$$var` for built-in regexes (see "Built-in regular expressions" section 157 | below). 158 | - Rust character syntax for characters, e.g. `'a'`. 159 | - Rust string syntax for strings, e.g. `"abc"`. 160 | - `[...]` for character sets. Inside the brackets you can have one or more of: 161 | 162 | - Characters 163 | - Character ranges: e.g. `'a'-'z'` 164 | 165 | Here's an example character set for ASCII alphanumerics: `['a'-'z' 'A'-'Z' 166 | '0'-'9']` 167 | - `_` for matching any character 168 | - `$` for matching end-of-input 169 | - `<regex>*` for zero or more repetitions of `<regex>` 170 | - `<regex>+` for one or more repetitions of `<regex>` 171 | - `<regex>?` for zero or one repetition of `<regex>` 172 | - `<regex> <regex>` for concatenation 173 | - `<regex> | <regex>` for alternation: match the first one, or the second one. 174 | - `<regex> # <regex>` for difference: match characters in the first regex that 175 | are not in the second regex. Note that the regexes on the left and right of `#` 176 | should be "character sets", i.e. `*`, `+`, `?`, `"..."`, `$`, and 177 | concatenation are not allowed. Variables that are bound to character sets are 178 | allowed. 179 | 180 | Binding powers (precedences), from highest to lowest: 181 | 182 | - `*`, `+`, `?` 183 | - `#` 184 | - Concatenation 185 | - `|` 186 | 187 | You can use parentheses for grouping, e.g. `('a' | 'b')*`. 188 | 189 | Example: `'a' 'b' | 'c'+` is the same as `(('a' 'b') | ('c'+))`. 190 | 191 | ## Right context (lookahead) 192 | 193 | A rule in a rule set can be followed by another regex using `> <regex>` syntax, 194 | for right context. Right contexts are a limited form of lookahead: they 195 | can only appear after a top-level regex for a rule. They cannot be nested 196 | inside a regex. 197 | 198 | For example, the rule left-hand side `'a' > (_ # 'b')` matches `'a'` as long as 199 | it's not followed by `'b'`. 200 | 201 | See also [right context tests] for more examples. 202 | 203 | [right context tests]: https://github.com/osa1/lexgen/blob/main/crates/lexgen/tests/right_ctx.rs 204 | 205 | ## Built-in regular expressions 206 | 207 | lexgen comes with a set of built-in regular expressions. The regular 208 | expressions listed below match the same set of characters as their Rust 209 | counterparts. For example, `$$alphabetic` matches the same set of characters as 210 | Rust's [`char::is_alphabetic`]: 211 | 212 | - `$$alphabetic` 213 | - `$$alphanumeric` 214 | - `$$ascii` 215 | - `$$ascii_alphabetic` 216 | - `$$ascii_alphanumeric` 217 | - `$$ascii_control` 218 | - `$$ascii_digit` 219 | - `$$ascii_graphic` 220 | - `$$ascii_hexdigit` 221 | - `$$ascii_lowercase` 222 | - `$$ascii_punctuation` 223 | - `$$ascii_uppercase` 224 | - `$$ascii_whitespace` 225 | - `$$control` 226 | - `$$lowercase` 227 | - `$$numeric` 228 | - `$$uppercase` 229 | - `$$whitespace` 230 | 231 | (Note that in the generated code we don't use Rust `char` methods. For simple 232 | cases like `$$ascii` we generate simple range checks. For more complicated 233 | cases like `$$lowercase` we generate a binary search table and run a binary 234 | search when checking a character.) 235 | 236 | In addition, these two built-in regular expressions match Unicode [XID_Start and 237 | XID_Continue]: 238 | 239 | - `$$XID_Start` 240 | - `$$XID_Continue` 241 | 242 | [`char::is_alphabetic`]: https://doc.rust-lang.org/std/primitive.char.html#method.is_alphabetic 243 | [XID_Start and XID_Continue]: http://www.unicode.org/reports/tr31/ 244 | 245 | ## Rule syntax 246 | 247 | - `<regex> => <semantic action>,`: `<regex>` syntax is as described above.
248 | `<semantic action>` is any Rust code with type `fn(&mut Lexer) -> 249 | SemanticActionResult`. More on the `SemanticActionResult` type in the next 250 | section. 251 | 252 | - `<regex> =? <semantic action>,`: fallible actions. This syntax is similar to 253 | the syntax above, except `<semantic action>` has type `fn(&mut Lexer) -> 254 | LexerAction<Result<Token, UserError>>`. When using rules of this kind, the 255 | error type needs to be declared at the beginning of the lexer with the `type 256 | Error = UserError;` syntax. 257 | 258 | When a rule of this kind returns an error, the error is returned to the 259 | caller of the lexer's `next` method. 260 | 261 | - `<regex>,`: Syntactic sugar for `<regex> => |lexer| { lexer.reset_match(); 262 | lexer.continue_() },`. Useful for skipping characters (e.g. whitespace). 263 | 264 | - `<regex> = <token>,`: Syntactic sugar for `<regex> => |lexer| 265 | lexer.return_(<token>),`. Useful for matching keywords, punctuation 266 | (operators) and delimiters (parens, brackets). 267 | 268 | ## End-of-input handling in rule sets 269 | 270 | The `Init` rule set terminates lexing successfully on end-of-input (i.e. 271 | `lexer.next()` returns `None`). Other rule sets fail on end-of-input (i.e. 272 | return `Some(Err(...))`). This is because, generally, the states other than the 273 | initial one are for complicated tokens (strings, raw strings, multi-line 274 | comments) that need to be terminated and handled, and end-of-input in those 275 | states usually means the token did not terminate properly. 276 | 277 | (To handle end-of-input in a rule set you can use `$` as described in section 278 | "Regex syntax" above.) 279 | 280 | ## Handle, rule, error, and action types 281 | 282 | The `lexer` macro generates a struct with the name specified by the user in the 283 | first line of the lexer definition. In the example at the beginning (`Lexer -> 284 | Token;`), the name of the struct is `Lexer`. 285 | 286 | A `mut` reference to this type is passed to semantic action functions. In the 287 | implementation of a semantic action, you should use one of the methods below to 288 | drive the lexer and return tokens: 289 | 290 | - `fn match_(&self) -> &str`: returns the current match. Note that when the 291 | lexer is constructed with `new_from_iter` or `new_from_iter_with_state`, this 292 | method panics. It should only be called when the lexer is initialized with 293 | `new` or `new_with_state`. 294 | - `fn match_loc(&self) -> (lexgen_util::Loc, lexgen_util::Loc)`: returns the 295 | bounds of the current match. 296 | - `fn peek(&mut self) -> Option<char>`: looks ahead one character. 297 | - `fn state(&mut self) -> &mut <user state type>`: returns a mutable reference 298 | to the user state. 299 | - `fn return_(&self, token: <user token type>) -> SemanticActionResult`: 300 | returns the passed token as a match. 301 | - `fn continue_(&self) -> SemanticActionResult`: ignores the current match and 302 | continues lexing in the same lexer state. Useful for skipping characters. 303 | - `fn switch(&mut self, rule: LexerRule) -> SemanticActionResult`: used for 304 | switching between lexer states. The `LexerRule` (where the `Lexer` part is the 305 | name of the lexer as specified by the user) is an enum with a variant for 306 | each rule set name, for example, `LexerRule::Init`. See the stateful lexer 307 | example below. 308 | - `fn switch_and_return(&mut self, rule: LexerRule, token: <user token type>) 309 | -> SemanticActionResult`: switches to the given lexer state and returns the 310 | given token. 311 | - `fn reset_match(&mut self)`: resets the current match. E.g. if you call 312 | `match_()` right after `reset_match()`, it will return an empty string.
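To see how these methods combine, here is a minimal sketch (the `Token` type, its `Str` variant, and the `String` rule set name are inventions for this example) of lexing double-quoted strings without escape sequences:

```rust
use lexgen::lexer;

#[derive(Debug, PartialEq, Eq)]
enum Token {
    Str(String),
}

lexer! {
    Lexer -> Token;

    rule Init {
        [' ' '\t' '\n']+,

        '"' => |lexer| {
            // Drop the opening quote from the match, then lex the rest of the
            // string in the `String` rule set.
            lexer.reset_match();
            lexer.switch(LexerRule::String)
        },
    }

    rule String {
        // Closing quote: the current match is the string contents plus this
        // quote, so trim the quote, return a token, and switch back to `Init`.
        '"' => |lexer| {
            let s = lexer.match_();
            let s = s[..s.len() - 1].to_owned();
            lexer.switch_and_return(LexerRule::Init, Token::Str(s))
        },

        // Any character other than `"`: keep extending the current match.
        _ # '"' => |lexer| lexer.continue_(),
    }
}
```

Since `String` is not the `Init` rule set, end-of-input inside a string is reported as an error, per the end-of-input behavior described above.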
313 | 314 | Semantic action functions should return a `SemanticActionResult` value obtained 315 | from one of the methods listed above. 316 | 317 | ## Initializing lexers 318 | 319 | lexgen generates 4 constructors: 320 | 321 | - `fn new(input: &str) -> Self`: Used when the lexer does not have user state, 322 | or the user state implements `Default`. 323 | 324 | - `fn new_with_state(input: &str, user_state: S) -> Self`: Used when the lexer 325 | has user state that does not implement `Default`, or you want to initialize 326 | the state with something other than the default. `S` is the user state type 327 | specified in the lexer definition. See the stateful lexer example below. 328 | 329 | - `fn new_from_iter<I: Iterator<Item = char> + Clone>(iter: I) -> Self`: Used 330 | when the input isn't a flat string, but something like a rope or zipper. Note 331 | that the `match_` method panics when this constructor is used. Instead use 332 | `match_loc` to get the location of the current match. 333 | 334 | - `fn new_from_iter_with_state<I: Iterator<Item = char> + Clone, S>(iter: I, 335 | user_state: S) -> Self`: Same as above, but doesn't require user state to 336 | implement `Default`. 337 | 338 | ## Stateful lexer example 339 | 340 | Here's an example lexer that counts the number of `=`s appearing between two `[`s: 341 | 342 | ```rust 343 | lexer! { 344 | // `usize` in parentheses is the user state type, `usize` after the arrow 345 | // is the token type 346 | Lexer(usize) -> usize; 347 | 348 | rule Init { 349 | $$ascii_whitespace, // line 7 350 | 351 | '[' => |lexer| { 352 | *lexer.state() = 0; // line 10 353 | lexer.switch(LexerRule::Count) // line 11 354 | }, 355 | } 356 | 357 | rule Count { 358 | '=' => |lexer| { 359 | *lexer.state() += 1; // line 17 360 | lexer.continue_() // line 18 361 | }, 362 | 363 | '[' => |lexer| { 364 | let n = *lexer.state(); 365 | lexer.switch_and_return(LexerRule::Init, n) // line 23 366 | }, 367 | } 368 | } 369 | 370 | let mut lexer = Lexer::new("[[ [=[ [==["); 371 | assert_eq!( 372 | lexer.next(), 373 | Some(Ok(( 374 | Loc { line: 0, col: 0, byte_idx: 0 }, 375 | 0, 376 | Loc { line: 0, col: 2, byte_idx: 2 }, 377 | ))) 378 | ); 379 | assert_eq!( 380 | lexer.next(), 381 | Some(Ok(( 382 | Loc { line: 0, col: 3, byte_idx: 3 }, 383 | 1, 384 | Loc { line: 0, col: 6, byte_idx: 6 }, 385 | ))) 386 | ); 387 | assert_eq!( 388 | lexer.next(), 389 | Some(Ok(( 390 | Loc { line: 0, col: 7, byte_idx: 7 }, 391 | 2, 392 | Loc { line: 0, col: 11, byte_idx: 11 }, 393 | ))) 394 | ); 395 | assert_eq!(lexer.next(), None); 396 | ``` 397 | 398 | Initially (in the `Init` rule set) we skip spaces (line 7). When we see a `[` we 399 | initialize the user state (line 10) and switch to the `Count` state (line 11). 400 | In `Count`, each `=` increments the user state by one (line 17) and skips the 401 | match (line 18). A `[` in the `Count` state returns the current number and switches 402 | to the `Init` state (line 23).
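Relatedly, here is a minimal sketch of the iterator-based constructors from "Initializing lexers" above (the `Token` type and the input fragments are made up for this example). The semantic action uses `match_loc` rather than `match_`, since `match_` panics in lexers constructed with `new_from_iter`:

```rust
use lexgen::lexer;

#[derive(Debug, PartialEq, Eq)]
enum Token {
    Word,
}

lexer! {
    Lexer -> Token;

    [' ' '\t' '\n']+,

    ['a'-'z']+ => |lexer| {
        // `match_` would panic here for an iterator-based lexer; `match_loc`
        // gives the bounds of the lexeme instead.
        let (_start, _end) = lexer.match_loc();
        lexer.return_(Token::Word)
    },
}

// Any `Iterator<Item = char> + Clone` works as input, e.g. characters drawn
// from several string fragments:
let fragments = ["foo ", "bar"];
let chars = fragments.iter().flat_map(|s| s.chars());

for next in Lexer::new_from_iter(chars) {
    let (start, token, end) = next.unwrap();
    println!("{:?} at {:?}..{:?}", token, start, end);
}
```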
403 | 404 | [1]: https://github.com/osa1/lexgen/blob/main/crates/lexgen/tests/tests.rs 405 | [2]: https://github.com/osa1/lexgen/blob/main/crates/lexgen/tests/lua_5_1.rs 406 | [3]: https://github.com/osa1/lexgen/tree/main/crates/lexgen_lalrpop_example 407 | [4]: https://github.com/osa1/mincaml/blob/master/src/lexer.rs 408 | [5]: https://github.com/osa1/lexgen_rust/blob/main/crates/lexgen_rust/src/lib.rs 409 | [6]: https://github.com/osa1/how-to-parse/blob/4f40236b1f9eca5b67d2193ef0f55fffdc06bffb/src/lexgen_event_parser.rs 410 | -------------------------------------------------------------------------------- /crates/char_range_gen/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "char_range_gen" 3 | version = "0.1.0" 4 | authors = ["Ömer Sinan Ağacan "] 5 | edition = "2021" 6 | 7 | [dependencies] 8 | unicode-xid = "0.2.2" 9 | -------------------------------------------------------------------------------- /crates/char_range_gen/src/main.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::type_complexity)] 2 | 3 | use std::convert::TryFrom; 4 | 5 | fn main() { 6 | for (f, name) in FNS.iter() { 7 | let ranges = generate_char_fn_ranges(*f); 8 | println!("pub static {}: [(u32, u32); {}] = [", name, ranges.len()); 9 | for range in ranges { 10 | println!(" ({}, {}),", range.0, range.1); 11 | } 12 | println!("];"); 13 | } 14 | } 15 | 16 | fn generate_char_fn_ranges(f: fn(char) -> bool) -> Vec<(u32, u32)> { 17 | let mut ranges: Vec<(u32, u32)> = vec![]; 18 | let mut current_range_start: Option = None; 19 | 20 | for i in 0..=u32::from(char::MAX) { 21 | let c = match char::try_from(i) { 22 | Err(_) => continue, 23 | Ok(c) => c, 24 | }; 25 | 26 | if f(c) { 27 | if current_range_start.is_none() { 28 | current_range_start = Some(i); 29 | } 30 | } else if let Some(current_range_start) = current_range_start.take() { 31 | ranges.push((current_range_start, i - 1)); 32 | } 33 | } 34 | 35 | ranges 36 | } 37 | 38 | macro_rules! 
ascii_fn { 39 | ($x:ident) => { 40 | fn $x(c: char) -> bool { 41 | char::$x(&c) 42 | } 43 | }; 44 | } 45 | 46 | ascii_fn!(is_ascii); 47 | ascii_fn!(is_ascii_alphabetic); 48 | ascii_fn!(is_ascii_alphanumeric); 49 | ascii_fn!(is_ascii_control); 50 | ascii_fn!(is_ascii_digit); 51 | ascii_fn!(is_ascii_graphic); 52 | ascii_fn!(is_ascii_hexdigit); 53 | ascii_fn!(is_ascii_lowercase); 54 | ascii_fn!(is_ascii_punctuation); 55 | ascii_fn!(is_ascii_uppercase); 56 | ascii_fn!(is_ascii_whitespace); 57 | 58 | static FNS: [(fn(char) -> bool, &str); 20] = [ 59 | (char::is_alphabetic, "ALPHABETIC"), 60 | (char::is_alphanumeric, "ALPHANUMERIC"), 61 | (is_ascii, "ASCII"), 62 | (is_ascii_alphabetic, "ASCII_ALPHABETIC"), 63 | (is_ascii_alphanumeric, "ASCII_ALPHANUMERIC"), 64 | (is_ascii_control, "ASCII_CONTROL"), 65 | (is_ascii_digit, "ASCII_DIGIT"), 66 | (is_ascii_graphic, "ASCII_GRAPHIC"), 67 | (is_ascii_hexdigit, "ASCII_HEXDIGIT"), 68 | (is_ascii_lowercase, "ASCII_LOWERCASE"), 69 | (is_ascii_punctuation, "ASCII_PUNCTUATION"), 70 | (is_ascii_uppercase, "ASCII_UPPERCASE"), 71 | (is_ascii_whitespace, "ASCII_WHITESPACE"), 72 | (char::is_control, "CONTROL"), 73 | (char::is_lowercase, "LOWERCASE"), 74 | (char::is_numeric, "NUMERIC"), 75 | (char::is_uppercase, "UPPERCASE"), 76 | (char::is_whitespace, "WHITESPACE"), 77 | (::is_xid_start, "XID_START"), 78 | ( 79 | ::is_xid_continue, 80 | "XID_CONTINUE", 81 | ), 82 | ]; 83 | -------------------------------------------------------------------------------- /crates/lexgen/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lexgen" 3 | version = "0.16.0" 4 | authors = ["Ömer Sinan Ağacan "] 5 | description = "A fully-featured lexer generator implemented as a proc macro" 6 | edition = "2021" 7 | license = "MIT" 8 | homepage = "https://github.com/osa1/lexgen" 9 | categories = ["compilers", "development-tools", "parsing"] 10 | readme = "../../README.md" 11 | repository = "https://github.com/osa1/lexgen" 12 | 13 | [lib] 14 | proc-macro = true 15 | 16 | [dependencies] 17 | proc-macro2 = "1.0" 18 | quote = "1.0" 19 | rustc-hash = "2.0.0" 20 | syn = { version = "2.0.30", features = ["extra-traits", "fold", "full", "visit"] } 21 | 22 | [dev-dependencies] 23 | criterion = "0.3" 24 | lexgen_util = { path = "../lexgen_util" } 25 | 26 | [[bench]] 27 | name = "benchmarks" 28 | harness = false 29 | -------------------------------------------------------------------------------- /crates/lexgen/benches/benchmarks.rs: -------------------------------------------------------------------------------- 1 | // Hacky, but this is the only way I could find to share the lexer in both tests and benchmarks 2 | include!("../tests/lua_5_1.rs"); 3 | 4 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 5 | 6 | #[inline(never)] 7 | fn lex_lua(s: &str) { 8 | let lexer = Lexer::new(s); 9 | for _ in lexer {} 10 | } 11 | 12 | fn lexer_bench(c: &mut Criterion) { 13 | let mut str = String::new(); 14 | str.push_str(&std::fs::read_to_string("tests/test_data").unwrap()); 15 | 16 | for _ in 0..5 { 17 | let str_ = str.clone(); 18 | str.push_str(&str_); 19 | } 20 | 21 | c.bench_function("Lex Lua files", |b| b.iter(|| lex_lua(black_box(&str)))); 22 | } 23 | 24 | criterion_group!(benches, lexer_bench); 25 | criterion_main!(benches); 26 | -------------------------------------------------------------------------------- /crates/lexgen/src/ast.rs: -------------------------------------------------------------------------------- 1 | 
//! Proc macro AST definition and parser implementations 2 | 3 | use crate::semantic_action_table::{SemanticActionIdx, SemanticActionTable}; 4 | 5 | use syn::parse::discouraged::Speculative; 6 | use syn::parse::ParseStream; 7 | 8 | use std::fmt; 9 | 10 | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] 11 | pub struct Var(pub String); 12 | 13 | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] 14 | pub struct Builtin(pub String); 15 | 16 | #[derive(Debug)] 17 | pub struct Lexer { 18 | /// Attributes like `#[derive(...)]` and `/// ...` attached to the lexer type declaration. 19 | /// These attributes are copied to the generated lexer struct. 20 | pub attrs: Vec<syn::Attribute>, 21 | pub visibility: Option<syn::Visibility>, 22 | pub type_name: syn::Ident, 23 | pub user_state_type: Option<syn::Type>, 24 | pub token_type: syn::Type, 25 | pub rules: Vec<Rule>, 26 | } 27 | 28 | pub enum Rule { 29 | /// `type Error = UserError;` 30 | ErrorType { 31 | /// Type on the RHS, e.g. `UserError<'input>` 32 | ty: syn::Type, 33 | }, 34 | 35 | /// A top-level binding or unnamed rule 36 | RuleOrBinding(RuleOrBinding), 37 | 38 | /// A list of named rules at the top level: `rule <Name> { <rules> },` 39 | RuleSet { 40 | name: syn::Ident, 41 | rules: Vec<RuleOrBinding>, 42 | }, 43 | } 44 | 45 | pub enum RuleOrBinding { 46 | Rule(SingleRule), 47 | Binding(Binding), 48 | } 49 | 50 | pub struct SingleRule { 51 | pub lhs: RegexCtx, 52 | pub rhs: SemanticActionIdx, 53 | } 54 | 55 | /// A named regex binding: `let <name> = <regex>;`. 56 | #[derive(Debug)] 57 | pub struct Binding { 58 | pub var: Var, 59 | pub re: Regex, 60 | } 61 | 62 | /// Regular expression with optional right context (lookahead) 63 | #[derive(Debug, Clone)] 64 | pub struct RegexCtx { 65 | pub re: Regex, 66 | pub right_ctx: Option<Regex>, 67 | } 68 | 69 | #[derive(Debug, Clone)] 70 | pub enum RuleRhs { 71 | None, 72 | Rhs { expr: syn::Expr, kind: RuleKind }, 73 | } 74 | 75 | #[derive(Debug, Copy, Clone)] 76 | pub enum RuleKind { 77 | /// Defined with `=`. RHS is not passed a `LexerHandle`, returns `Token`. 78 | Simple, 79 | 80 | /// Defined with `=?`. RHS is passed a `LexerHandle`, returns `LexerAction<Result<Token, 81 | /// UserError>>`. 82 | Fallible, 83 | 84 | /// Defined with `=>`.
RHS is passed a `LexerHandle`, returns `LexerAction` 85 | Infallible, 86 | } 87 | 88 | impl fmt::Debug for Rule { 89 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 90 | match self { 91 | Rule::RuleOrBinding(rule_or_binding) => rule_or_binding.fmt(f), 92 | Rule::RuleSet { name, rules } => f 93 | .debug_struct("Rule::RuleSet") 94 | .field("name", &name.to_string()) 95 | .field("rules", rules) 96 | .finish(), 97 | Rule::ErrorType { ty } => f.debug_struct("Rule::ErrorType").field("ty", ty).finish(), 98 | } 99 | } 100 | } 101 | 102 | impl fmt::Debug for RuleOrBinding { 103 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 104 | match self { 105 | RuleOrBinding::Rule(rule) => rule.fmt(f), 106 | RuleOrBinding::Binding(binding) => binding.fmt(f), 107 | } 108 | } 109 | } 110 | 111 | impl fmt::Debug for SingleRule { 112 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 113 | f.debug_struct("SingleRule") 114 | .field("lhs", &self.lhs) 115 | .field("rhs", &"...") 116 | .finish() 117 | } 118 | } 119 | 120 | #[derive(Debug, Clone)] 121 | pub enum Regex { 122 | Builtin(Builtin), 123 | Var(Var), 124 | Char(char), 125 | String(String), 126 | CharSet(CharSet), 127 | ZeroOrMore(Box), 128 | OneOrMore(Box), 129 | ZeroOrOne(Box), 130 | Concat(Box, Box), 131 | Or(Box, Box), 132 | Any, // any character 133 | EndOfInput, 134 | 135 | /// Difference, or exclusion: characters in the first regex, excluding characters in the second 136 | /// regex. 137 | Diff(Box, Box), 138 | } 139 | 140 | #[derive(Debug, Clone)] 141 | pub struct CharSet(pub Vec); 142 | 143 | #[derive(Debug, Clone, Copy)] 144 | pub enum CharOrRange { 145 | Char(char), 146 | Range(char, char), 147 | } 148 | 149 | /// Parses a regex with optional right context: `re_ctx -> re [> re]` 150 | fn parse_regex_ctx(input: ParseStream) -> syn::Result { 151 | let re = parse_regex(input)?; 152 | if input.peek(syn::token::Gt) { 153 | input.parse::()?; 154 | let right_ctx = parse_regex(input)?; 155 | Ok(RegexCtx { 156 | re, 157 | right_ctx: Some(right_ctx), 158 | }) 159 | } else { 160 | Ok(RegexCtx { 161 | re, 162 | right_ctx: None, 163 | }) 164 | } 165 | } 166 | 167 | /// Parses a regex 168 | fn parse_regex(input: ParseStream) -> syn::Result { 169 | parse_regex_0(input) 170 | } 171 | 172 | // re_0 -> re_1 | re_0 `|` re_1 (alternation) 173 | fn parse_regex_0(input: ParseStream) -> syn::Result { 174 | let mut re = parse_regex_1(input)?; 175 | 176 | while input.peek(syn::token::Or) { 177 | let _ = input.parse::()?; 178 | let re2 = parse_regex_1(input)?; 179 | re = Regex::Or(Box::new(re), Box::new(re2)); // left associative 180 | } 181 | 182 | Ok(re) 183 | } 184 | 185 | // re_1 -> re_2 | re_1 re_2 (concatenation) 186 | fn parse_regex_1(input: ParseStream) -> syn::Result { 187 | let mut re = parse_regex_2(input)?; 188 | 189 | // Parse concatenations 190 | while input.peek(syn::token::Paren) 191 | || input.peek(syn::token::Dollar) 192 | || input.peek(syn::LitChar) 193 | || input.peek(syn::LitStr) 194 | || input.peek(syn::token::Bracket) 195 | || input.peek(syn::token::Underscore) 196 | { 197 | let re2 = parse_regex_2(input)?; 198 | re = Regex::Concat(Box::new(re), Box::new(re2)); // left associative 199 | } 200 | 201 | Ok(re) 202 | } 203 | 204 | // re_2 -> re_3 | re_3* | re_3? 
| re_3+ 205 | fn parse_regex_2(input: ParseStream) -> syn::Result { 206 | let mut re = parse_regex_3(input)?; 207 | 208 | loop { 209 | if input.peek(syn::token::Star) { 210 | let _ = input.parse::()?; 211 | re = Regex::ZeroOrMore(Box::new(re)); 212 | } else if input.peek(syn::token::Question) { 213 | let _ = input.parse::()?; 214 | re = Regex::ZeroOrOne(Box::new(re)); 215 | } else if input.peek(syn::token::Plus) { 216 | let _ = input.parse::()?; 217 | re = Regex::OneOrMore(Box::new(re)); 218 | } else { 219 | break; 220 | } 221 | } 222 | 223 | Ok(re) 224 | } 225 | 226 | // re_3 -> re_4 | re_4 # re_4 (left associative) 227 | fn parse_regex_3(input: ParseStream) -> syn::Result { 228 | let mut re = parse_regex_4(input)?; 229 | 230 | while input.peek(syn::token::Pound) { 231 | let _ = input.parse::()?; 232 | let re_2 = parse_regex_4(input)?; 233 | re = Regex::Diff(Box::new(re), Box::new(re_2)); 234 | } 235 | 236 | Ok(re) 237 | } 238 | 239 | // re_4 -> ( re_0 ) | $ | $x | $$x | _ | 'x' | "..." | [...] 240 | fn parse_regex_4(input: ParseStream) -> syn::Result { 241 | if input.peek(syn::token::Paren) { 242 | let parenthesized; 243 | syn::parenthesized!(parenthesized in input); 244 | parse_regex(&parenthesized) // no right ctx 245 | } else if input.peek(syn::token::Dollar) { 246 | let _ = input.parse::()?; 247 | if input.parse::().is_ok() { 248 | let ident = input.parse::()?; 249 | Ok(Regex::Builtin(Builtin(ident.to_string()))) 250 | } else { 251 | match input.parse::() { 252 | Ok(ident) => Ok(Regex::Var(Var(ident.to_string()))), 253 | Err(_) => Ok(Regex::EndOfInput), 254 | } 255 | } 256 | } else if input.peek(syn::LitChar) { 257 | let char = input.parse::()?; 258 | Ok(Regex::Char(char.value())) 259 | } else if input.peek(syn::LitStr) { 260 | let str = input.parse::()?; 261 | Ok(Regex::String(str.value())) 262 | } else if input.peek(syn::token::Bracket) { 263 | let bracketed; 264 | syn::bracketed!(bracketed in input); 265 | let char_set = parse_charset(&bracketed)?; 266 | Ok(Regex::CharSet(char_set)) 267 | } else if input.parse::().is_ok() { 268 | Ok(Regex::Any) 269 | } else { 270 | Err(syn::Error::new( 271 | proc_macro2::Span::call_site(), 272 | "Unable to parse regex", 273 | )) 274 | } 275 | } 276 | 277 | /// Parse a sequence of `` or `-`. 278 | fn parse_charset(input: ParseStream) -> syn::Result { 279 | let mut chars = vec![]; 280 | while !input.is_empty() { 281 | chars.push(parse_char_or_range(input)?); 282 | } 283 | Ok(CharSet(chars)) 284 | } 285 | 286 | /// Parse a `` or `-`. 
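/// For example, `'a'` parses as `CharOrRange::Char('a')` and `'a'-'z'` parses as `CharOrRange::Range('a', 'z')`.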
287 | fn parse_char_or_range(input: ParseStream) -> syn::Result { 288 | let char = input.parse::()?.value(); 289 | if input.peek(syn::token::Minus) { 290 | let _ = input.parse::()?; 291 | let char2 = input.parse::()?.value(); 292 | Ok(CharOrRange::Range(char, char2)) 293 | } else { 294 | Ok(CharOrRange::Char(char)) 295 | } 296 | } 297 | 298 | fn parse_rule_or_binding( 299 | input: ParseStream, 300 | semantic_action_table: &mut SemanticActionTable, 301 | ) -> syn::Result { 302 | if input.peek(syn::token::Let) { 303 | // Let binding 304 | input.parse::()?; 305 | let var = input.parse::()?; 306 | input.parse::()?; 307 | let re = parse_regex(input)?; 308 | input.parse::()?; 309 | Ok(RuleOrBinding::Binding(Binding { 310 | var: Var(var.to_string()), 311 | re, 312 | })) 313 | } else { 314 | // Rule 315 | let lhs = parse_regex_ctx(input)?; 316 | 317 | let rhs = if input.parse::().is_ok() { 318 | RuleRhs::None 319 | } else if input.parse::().is_ok() { 320 | let expr = input.parse::()?; 321 | input.parse::()?; 322 | RuleRhs::Rhs { 323 | expr, 324 | kind: RuleKind::Infallible, 325 | } 326 | } else if input.parse::().is_ok() { 327 | let kind = if input.peek(syn::token::Question) { 328 | let _ = input.parse::(); 329 | RuleKind::Fallible 330 | } else { 331 | RuleKind::Simple 332 | }; 333 | let expr = input.parse::()?; 334 | input.parse::()?; 335 | RuleRhs::Rhs { expr, kind } 336 | } else { 337 | panic!("Expected one of `,`, `=>`, `=?`, or `=` after a regex"); 338 | }; 339 | 340 | let rhs = semantic_action_table.add(rhs); 341 | 342 | Ok(RuleOrBinding::Rule(SingleRule { lhs, rhs })) 343 | } 344 | } 345 | 346 | fn parse_rule( 347 | input: ParseStream, 348 | semantic_action_table: &mut SemanticActionTable, 349 | ) -> syn::Result { 350 | if input.peek(syn::Ident) { 351 | // Named rules 352 | let ident = input.parse::()?; 353 | if ident != "rule" { 354 | return Err(syn::Error::new( 355 | ident.span(), 356 | "Unknown identifier, expected \"rule\", \"let\", or a regex", 357 | )); 358 | } 359 | let name = input.parse::()?; 360 | let braced; 361 | syn::braced!(braced in input); 362 | let mut rules = vec![]; 363 | while !braced.is_empty() { 364 | rules.push(parse_rule_or_binding(&braced, semantic_action_table)?); 365 | } 366 | // Consume optional trailing comma 367 | let _ = input.parse::(); 368 | Ok(Rule::RuleSet { name, rules }) 369 | } else if input.parse::().is_ok() { 370 | let ident = input.parse::()?; 371 | if ident != "Error" { 372 | panic!("Error type syntax is: `type Error = ...;`"); 373 | } 374 | input.parse::()?; 375 | let ty = input.parse::()?; 376 | input.parse::()?; 377 | Ok(Rule::ErrorType { ty }) 378 | } else { 379 | Ok(Rule::RuleOrBinding(parse_rule_or_binding( 380 | input, 381 | semantic_action_table, 382 | )?)) 383 | } 384 | } 385 | 386 | pub fn make_lexer_parser( 387 | semantic_action_table: &mut SemanticActionTable, 388 | ) -> impl FnOnce(ParseStream) -> Result + '_ { 389 | |input: ParseStream| { 390 | let attrs = input.call(syn::Attribute::parse_outer)?; 391 | 392 | let forked = input.fork(); 393 | let visibility = match forked.parse::() { 394 | Ok(visibility) => { 395 | input.advance_to(&forked); 396 | Some(visibility) 397 | } 398 | Err(_) => None, 399 | }; 400 | 401 | let type_name = input.parse::()?; 402 | 403 | let user_state_type = if input.peek(syn::token::Paren) { 404 | let parenthesized; 405 | syn::parenthesized!(parenthesized in input); 406 | Some(parenthesized.parse::()?) 
407 | } else { 408 | None 409 | }; 410 | 411 | input.parse::()?; 412 | let token_type = input.parse::()?; 413 | input.parse::()?; 414 | 415 | let mut rules = vec![]; 416 | while !input.is_empty() { 417 | rules.push(parse_rule(input, semantic_action_table)?); 418 | } 419 | 420 | Ok(Lexer { 421 | attrs, 422 | visibility, 423 | type_name, 424 | user_state_type, 425 | token_type, 426 | rules, 427 | }) 428 | } 429 | } 430 | -------------------------------------------------------------------------------- /crates/lexgen/src/builtin.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_camel_case_types)] 2 | 3 | // NB. We use this type instead of storing `&'static [...]`s directly to make debugging easier. 4 | #[derive(Debug, Clone, Copy)] 5 | pub enum BuiltinCharRange { 6 | Alphabetic, 7 | Alphanumeric, 8 | Ascii, 9 | AsciiAlphabetic, 10 | AsciiAlphanumeric, 11 | AsciiControl, 12 | AsciiDigit, 13 | AsciiGraphic, 14 | AsciiHexdigit, 15 | AsciiLowercase, 16 | AsciiPunctuation, 17 | AsciiUppercase, 18 | AsciiWhitespace, 19 | Control, 20 | Lowercase, 21 | Numeric, 22 | Uppercase, 23 | Whitespace, 24 | XID_Start, 25 | XID_Continue, 26 | } 27 | 28 | pub static BUILTIN_RANGES: [(&str, BuiltinCharRange); 20] = [ 29 | ("alphabetic", BuiltinCharRange::Alphabetic), 30 | ("alphanumeric", BuiltinCharRange::Alphanumeric), 31 | ("ascii", BuiltinCharRange::Ascii), 32 | ("ascii_alphabetic", BuiltinCharRange::AsciiAlphabetic), 33 | ("ascii_alphanumeric", BuiltinCharRange::AsciiAlphanumeric), 34 | ("ascii_control", BuiltinCharRange::AsciiControl), 35 | ("ascii_digit", BuiltinCharRange::AsciiDigit), 36 | ("ascii_graphic", BuiltinCharRange::AsciiGraphic), 37 | ("ascii_hexdigit", BuiltinCharRange::AsciiHexdigit), 38 | ("ascii_lowercase", BuiltinCharRange::AsciiLowercase), 39 | ("ascii_punctuation", BuiltinCharRange::AsciiPunctuation), 40 | ("ascii_uppercase", BuiltinCharRange::AsciiUppercase), 41 | ("ascii_whitespace", BuiltinCharRange::AsciiWhitespace), 42 | ("control", BuiltinCharRange::Control), 43 | ("lowercase", BuiltinCharRange::Lowercase), 44 | ("numeric", BuiltinCharRange::Numeric), 45 | ("uppercase", BuiltinCharRange::Uppercase), 46 | ("whitespace", BuiltinCharRange::Whitespace), 47 | ("XID_Start", BuiltinCharRange::XID_Start), 48 | ("XID_Continue", BuiltinCharRange::XID_Continue), 49 | ]; 50 | 51 | impl BuiltinCharRange { 52 | pub fn get_ranges(&self) -> &'static [(u32, u32)] { 53 | use crate::char_ranges::*; 54 | 55 | match self { 56 | BuiltinCharRange::Alphabetic => &ALPHABETIC, 57 | BuiltinCharRange::Alphanumeric => &ALPHANUMERIC, 58 | BuiltinCharRange::Ascii => &ASCII, 59 | BuiltinCharRange::AsciiAlphabetic => &ASCII_ALPHABETIC, 60 | BuiltinCharRange::AsciiAlphanumeric => &ASCII_ALPHANUMERIC, 61 | BuiltinCharRange::AsciiControl => &ASCII_CONTROL, 62 | BuiltinCharRange::AsciiDigit => &ASCII_DIGIT, 63 | BuiltinCharRange::AsciiGraphic => &ASCII_GRAPHIC, 64 | BuiltinCharRange::AsciiHexdigit => &ASCII_HEXDIGIT, 65 | BuiltinCharRange::AsciiLowercase => &ASCII_LOWERCASE, 66 | BuiltinCharRange::AsciiPunctuation => &ASCII_PUNCTUATION, 67 | BuiltinCharRange::AsciiUppercase => &ASCII_UPPERCASE, 68 | BuiltinCharRange::AsciiWhitespace => &ASCII_WHITESPACE, 69 | BuiltinCharRange::Control => &CONTROL, 70 | BuiltinCharRange::Lowercase => &LOWERCASE, 71 | BuiltinCharRange::Numeric => &NUMERIC, 72 | BuiltinCharRange::Uppercase => &UPPERCASE, 73 | BuiltinCharRange::Whitespace => &WHITESPACE, 74 | BuiltinCharRange::XID_Start => &XID_START, 75 | BuiltinCharRange::XID_Continue 
=> &XID_CONTINUE, 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /crates/lexgen/src/collections.rs: -------------------------------------------------------------------------------- 1 | use rustc_hash::{FxHashMap, FxHashSet}; 2 | 3 | pub type Set<K> = FxHashSet<K>; 4 | 5 | pub type Map<K, V> = FxHashMap<K, V>; 6 | -------------------------------------------------------------------------------- /crates/lexgen/src/dfa.rs: -------------------------------------------------------------------------------- 1 | mod backtrack; 2 | pub mod codegen; 3 | pub mod simplify; 4 | 5 | #[cfg(test)] 6 | pub mod simulate; 7 | 8 | use crate::collections::{Map, Set}; 9 | use crate::nfa::AcceptingState; 10 | use crate::range_map::{Range, RangeMap}; 11 | pub(crate) use backtrack::update_backtracks; 12 | 13 | use std::convert::TryFrom; 14 | use std::iter::{FromIterator, IntoIterator}; 15 | 16 | /// Deterministic finite automaton, parameterized on values of accepting states. 17 | #[derive(Debug)] 18 | pub struct DFA<T, A> { 19 | // Indexed by `StateIdx` 20 | states: Vec<State<T, A>>, 21 | } 22 | 23 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 24 | pub struct StateIdx(usize); 25 | 26 | impl StateIdx { 27 | fn map<F>(&self, f: F) -> StateIdx 28 | where 29 | F: Fn(usize) -> usize, 30 | { 31 | StateIdx(f(self.0)) 32 | } 33 | } 34 | 35 | #[derive(Debug)] 36 | pub struct State<T, A> { 37 | /// Whether the state is the initial state of a rule set. This is used when inlining states 38 | /// with single predecessors. Initial states cannot be inlined as there may be references to 39 | /// these states in semantic actions. 40 | initial: bool, 41 | char_transitions: Map<char, T>, 42 | range_transitions: RangeMap<T>, 43 | any_transition: Option<T>, 44 | end_of_input_transition: Option<T>, 45 | accepting: Vec<AcceptingState<A>>, 46 | /// Predecessors of the state, used to inline code for a state with one predecessor in the 47 | /// predecessor's code.
48 | predecessors: Set<StateIdx>, 49 | backtrack: bool, 50 | } 51 | 52 | impl<T, A> State<T, A> { 53 | fn new() -> State<T, A> { 54 | State { 55 | initial: false, 56 | char_transitions: Default::default(), 57 | range_transitions: Default::default(), 58 | any_transition: None, 59 | end_of_input_transition: None, 60 | accepting: vec![], 61 | predecessors: Default::default(), 62 | backtrack: false, 63 | } 64 | } 65 | 66 | fn has_no_transitions(&self) -> bool { 67 | self.char_transitions.is_empty() 68 | && self.range_transitions.is_empty() 69 | && self.any_transition.is_none() 70 | && self.end_of_input_transition.is_none() 71 | } 72 | } 73 | 74 | impl<A> DFA<StateIdx, A> { 75 | pub fn new() -> (DFA<StateIdx, A>, StateIdx) { 76 | let mut initial_state = State::new(); 77 | initial_state.initial = true; 78 | ( 79 | DFA { 80 | states: vec![initial_state], 81 | }, 82 | StateIdx(0), 83 | ) 84 | } 85 | 86 | pub fn initial_state(&self) -> StateIdx { 87 | StateIdx(0) 88 | } 89 | 90 | pub fn make_state_accepting(&mut self, state: StateIdx, accept: AcceptingState<A>) { 91 | self.states[state.0].accepting.push(accept); 92 | } 93 | 94 | pub fn new_state(&mut self) -> StateIdx { 95 | let new_state_idx = StateIdx(self.states.len()); 96 | self.states.push(State::new()); 97 | new_state_idx 98 | } 99 | 100 | pub fn is_accepting_state(&self, state: StateIdx) -> bool { 101 | !self.states[state.0].accepting.is_empty() 102 | } 103 | 104 | pub fn add_char_transition(&mut self, state: StateIdx, char: char, next: StateIdx) { 105 | let old = self.states[state.0].char_transitions.insert(char, next); 106 | assert!( 107 | old.is_none(), 108 | "state={:?}, char={:?}, old={:?}, new={:?}", 109 | state, 110 | char, 111 | old, 112 | next 113 | ); 114 | 115 | self.states[next.0].predecessors.insert(state); 116 | } 117 | 118 | pub fn set_range_transitions(&mut self, state: StateIdx, range_map: RangeMap<StateIdx>) { 119 | assert!(self.states[state.0].range_transitions.is_empty()); 120 | 121 | for range in range_map.iter() { 122 | self.states[range.value.0].predecessors.insert(state); 123 | } 124 | 125 | self.states[state.0].range_transitions = range_map; 126 | } 127 | 128 | pub fn set_any_transition(&mut self, state: StateIdx, next: StateIdx) { 129 | assert!(self.states[state.0].any_transition.is_none()); 130 | self.states[state.0].any_transition = Some(next); 131 | self.states[next.0].predecessors.insert(state); 132 | } 133 | 134 | pub fn set_end_of_input_transition(&mut self, state: StateIdx, next: StateIdx) { 135 | assert!(self.states[state.0].end_of_input_transition.is_none()); 136 | self.states[state.0].end_of_input_transition = Some(next); 137 | self.states[next.0].predecessors.insert(state); 138 | } 139 | } 140 | 141 | impl<T, A> DFA<T, A> { 142 | fn from_states(states: Vec<State<T, A>>) -> DFA<T, A> { 143 | DFA { states } 144 | } 145 | 146 | pub fn into_state_indices(self) -> impl Iterator<Item = (StateIdx, State<T, A>)> { 147 | self.states 148 | .into_iter() 149 | .enumerate() 150 | .map(|(state_idx, state)| (StateIdx(state_idx), state)) 151 | } 152 | } 153 | 154 | impl<T, A> FromIterator<(StateIdx, State<T, A>)> for DFA<T, A> { 155 | fn from_iter<I>(iter: I) -> Self 156 | where 157 | I: IntoIterator<Item = (StateIdx, State<T, A>)>, 158 | { 159 | let mut states: Vec<(StateIdx, State<T, A>)> = iter.into_iter().collect(); 160 | states.sort_by_key(|&(state_idx, _)| state_idx); 161 | 162 | DFA { 163 | states: states.into_iter().map(|(_, state)| state).collect(), 164 | } 165 | } 166 | } 167 | 168 | impl<A> DFA<StateIdx, A> { 169 | /// Extend the current DFA with another DFA. The extended DFA's states will be renumbered. This 170 | /// does not add any transitions from the original DFA states to the extension.
Accepting 171 | /// states of the extension is preserved. 172 | /// 173 | /// Returns initial state for the extension in the new DFA. 174 | pub fn add_dfa(&mut self, other: DFA) -> StateIdx { 175 | let n_current_states = self.states.len(); 176 | 177 | for State { 178 | initial, 179 | char_transitions, 180 | range_transitions, 181 | any_transition, 182 | end_of_input_transition, 183 | accepting, 184 | predecessors, 185 | backtrack, 186 | } in other.states 187 | { 188 | let mut new_char_transitions: Map = Default::default(); 189 | let mut new_any_transition: Option = None; 190 | let mut new_end_of_input_transition: Option = None; 191 | 192 | for (char, next) in char_transitions { 193 | new_char_transitions.insert(char, StateIdx(next.0 + n_current_states)); 194 | } 195 | 196 | let new_range_transitions = 197 | range_transitions.map(|state_idx| StateIdx(state_idx.0 + n_current_states)); 198 | 199 | if let Some(next) = any_transition { 200 | new_any_transition = Some(StateIdx(next.0 + n_current_states)); 201 | } 202 | 203 | if let Some(next) = end_of_input_transition { 204 | new_end_of_input_transition = Some(StateIdx(next.0 + n_current_states)); 205 | } 206 | 207 | let predecessors = predecessors 208 | .into_iter() 209 | .map(|pred| StateIdx(pred.0 + n_current_states)) 210 | .collect(); 211 | 212 | self.states.push(State { 213 | initial, 214 | char_transitions: new_char_transitions, 215 | range_transitions: new_range_transitions, 216 | any_transition: new_any_transition, 217 | end_of_input_transition: new_end_of_input_transition, 218 | accepting, 219 | predecessors, 220 | backtrack, 221 | }); 222 | } 223 | 224 | StateIdx(n_current_states) 225 | } 226 | } 227 | 228 | use std::fmt::{self, Display, Formatter}; 229 | 230 | impl Display for StateIdx { 231 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 232 | self.0.fmt(f) 233 | } 234 | } 235 | 236 | impl Display for DFA { 237 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 238 | for (state_idx, state) in self.states.iter().enumerate() { 239 | let State { 240 | initial, 241 | char_transitions, 242 | range_transitions, 243 | any_transition, 244 | end_of_input_transition, 245 | accepting, 246 | predecessors: _, 247 | backtrack, 248 | } = state; 249 | 250 | if !accepting.is_empty() { 251 | if *initial { 252 | write!(f, "{:>5}:", format!("i*{}", state_idx))?; 253 | } else { 254 | write!(f, "{:>5}:", format!("*{}", state_idx))?; 255 | } 256 | } else { 257 | if *initial { 258 | write!(f, "{:>5}:", format!("i{}", state_idx))?; 259 | } else { 260 | write!(f, "{:>5}:", state_idx)?; 261 | } 262 | } 263 | 264 | let mut first = true; 265 | 266 | for (char, next) in char_transitions.iter() { 267 | if !first { 268 | write!(f, " ")?; 269 | } else { 270 | first = false; 271 | } 272 | 273 | writeln!(f, "{:?} -> {}", char, next)?; 274 | } 275 | 276 | for Range { start, end, value } in range_transitions.iter() { 277 | if !first { 278 | write!(f, " ")?; 279 | } else { 280 | first = false; 281 | } 282 | 283 | writeln!( 284 | f, 285 | "{:?} - {:?} -> {}", 286 | char::try_from(*start).unwrap(), 287 | char::try_from(*end).unwrap(), 288 | value, 289 | )?; 290 | } 291 | 292 | if let Some(next) = any_transition { 293 | if !first { 294 | write!(f, " ")?; 295 | } else { 296 | first = false; 297 | } 298 | 299 | writeln!(f, "_ -> {}", next)?; 300 | } 301 | 302 | if let Some(next) = end_of_input_transition { 303 | if !first { 304 | write!(f, " ")?; 305 | } 306 | 307 | writeln!(f, "$ -> {}", next)?; 308 | } 309 | 310 | if *backtrack { 311 | if !first { 312 | 
--------------------------------------------------------------------------------
/crates/lexgen/src/dfa/backtrack.rs:
--------------------------------------------------------------------------------
1 | use crate::collections::Map;
2 | use crate::dfa::{StateIdx, DFA};
3 | 
4 | use std::collections::hash_map::Entry;
5 | 
6 | pub(crate) fn update_backtracks<A>(dfa: &mut DFA<StateIdx, A>) {
7 |     // Work list of states, paired with whether the state should backtrack when stuck, i.e. whether an accepting state was visited on the way to it.
8 |     let mut work_list: Vec<(StateIdx, bool)> = dfa
9 |         .states
10 |         .iter()
11 |         .enumerate()
12 |         .filter_map(|(state_idx, state)| {
13 |             if state.initial {
14 |                 Some((StateIdx(state_idx), false))
15 |             } else {
16 |                 None
17 |             }
18 |         })
19 |         .collect();
20 | 
21 |     // Set of visited nodes, with their backtrack state when visited. If a state's backtrack
22 |     // property changes, we visit it again to make its successors backtrack.
23 |     let mut visited: Map<StateIdx, bool> = Default::default();
24 | 
25 |     while let Some((state, backtrack)) = work_list.pop() {
26 |         // Did we visit the state, with the right backtrack state?
27 |         match visited.entry(state) {
28 |             Entry::Occupied(mut entry) => {
29 |                 if *entry.get() == backtrack {
30 |                     continue;
31 |                 }
32 |                 entry.insert(backtrack);
33 |             }
34 |             Entry::Vacant(entry) => {
35 |                 entry.insert(backtrack);
36 |             }
37 |         }
38 | 
39 |         // Whether the successor states should backtrack.
40 |         let successor_backtrack = backtrack || dfa.is_accepting_state(state);
41 | 
42 |         for next in dfa.states[state.0].char_transitions.values() {
43 |             work_list.push((*next, successor_backtrack));
44 |         }
45 | 
46 |         for next_range in dfa.states[state.0].range_transitions.iter() {
47 |             work_list.push((next_range.value, successor_backtrack));
48 |         }
49 | 
50 |         if let Some(next) = dfa.states[state.0].any_transition {
51 |             work_list.push((next, successor_backtrack));
52 |         }
53 | 
54 |         if let Some(next) = dfa.states[state.0].end_of_input_transition {
55 |             work_list.push((next, successor_backtrack));
56 |         }
57 |     }
58 | 
59 |     assert_eq!(visited.len(), dfa.states.len());
60 | 
61 |     for (state, backtrack) in visited {
62 |         dfa.states[state.0].backtrack = backtrack;
63 |     }
64 | }
65 | 
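// A worked example (editorial sketch, not part of the original source): with the rules
// `'a'+ 'b'` and `'a'`, the DFA accepts right after the first 'a'. Every state reachable
// from that accepting state (those matching further 'a's or the trailing 'b') is marked
// `backtrack = true`: when the generated lexer gets stuck in one of them, it backs up to
// the last accepted match of `'a'` instead of failing. The pass is a fixpoint worklist:
// a state is revisited whenever its `backtrack` flag flips, so states shared between
// accepting and non-accepting paths still end up marked.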
--------------------------------------------------------------------------------
/crates/lexgen/src/dfa/codegen/ctx.rs:
--------------------------------------------------------------------------------
1 | use super::search_table::SearchTableSet;
2 | use super::StateIdx;
3 | use super::DFA;
4 | use crate::ast::RuleRhs;
5 | use crate::collections::Map;
6 | use crate::dfa::simplify::Trans;
7 | use crate::semantic_action_table::{SemanticActionIdx, SemanticActionTable};
8 | 
9 | /// Code generation state
10 | pub struct CgCtx {
11 |     /// Maps semantic action indices to expressions. Used to generate semantic action functions.
12 |     semantic_action_table: SemanticActionTable,
13 | 
14 |     /// Name of the lexer: `MyLexer` in `lexer! { MyLexer -> MyToken; }`
15 |     lexer_name: syn::Ident,
16 | 
17 |     /// Type of the values the lexer will produce: `MyToken` in `lexer! { MyLexer -> MyToken; }`
18 |     token_type: syn::Type,
19 | 
20 |     /// Type of the user error, when available: the type declared with `type Error = ...;`.
21 |     user_error_type: Option<syn::Type>,
22 | 
23 |     /// Maps user-written rule names (e.g. `rule MyRule { ... }`) to their initial states in the
24 |     /// final DFA.
25 |     rule_states: Map<String, StateIdx>,
26 | 
27 |     /// Sorted vector of states with only one predecessor. These states will be inlined in the
28 |     /// predecessor states and won't appear in the final code. Inlining these states significantly
29 |     /// improves code size and runtime performance.
30 |     ///
31 |     /// This vector is used to map non-inlined states to their final state indices in the generated
32 |     /// code. For example, if this vector is `[5]`, state 5 is skipped, and states after 5 are
33 |     /// decremented by 1, so state 6 becomes 5, etc.
34 |     inlined_states: Vec<StateIdx>,
35 | 
36 |     /// Mutable parts of the codegen state
37 |     codegen_state: CgState,
38 | }
39 | 
40 | struct CgState {
41 |     /// Binary search tables generated so far
42 |     search_tables: SearchTableSet,
43 | }
44 | 
45 | impl CgCtx {
46 |     pub fn new(
47 |         dfa: &DFA<Trans<SemanticActionIdx>, SemanticActionIdx>,
48 |         semantic_action_table: SemanticActionTable,
49 |         lexer_name: syn::Ident,
50 |         token_type: syn::Type,
51 |         user_error_type: Option<syn::Type>,
52 |         rule_states: Map<String, StateIdx>,
53 |     ) -> CgCtx {
54 |         let inlined_states: Vec<StateIdx> = dfa
55 |             .states
56 |             .iter()
57 |             .enumerate()
58 |             .filter_map(|(state_idx, state)| {
59 |                 if state.predecessors.len() == 1 {
60 |                     Some(StateIdx(state_idx))
61 |                 } else {
62 |                     None
63 |                 }
64 |             })
65 |             .collect();
66 | 
67 |         CgCtx {
68 |             semantic_action_table,
69 |             lexer_name,
70 |             token_type,
71 |             user_error_type,
72 |             rule_states,
73 |             inlined_states,
74 |             codegen_state: CgState {
75 |                 search_tables: SearchTableSet::new(),
76 |             },
77 |         }
78 |     }
79 | 
80 |     pub fn lexer_name(&self) -> &syn::Ident {
81 |         &self.lexer_name
82 |     }
83 | 
84 |     /// Renumber a state index, taking inlined states into account.
85 |     pub fn renumber_state(&self, state: StateIdx) -> StateIdx {
86 |         match self.inlined_states.binary_search(&state) {
87 |             Ok(idx) | Err(idx) => state.map(|state_idx| state_idx - idx),
88 |         }
89 |     }
90 | 
91 |     pub fn n_inlined_states(&self) -> usize {
92 |         self.inlined_states.len()
93 |     }
94 | 
95 |     pub fn token_type(&self) -> &syn::Type {
96 |         &self.token_type
97 |     }
98 | 
99 |     pub fn user_error_type(&self) -> Option<&syn::Type> {
100 |         self.user_error_type.as_ref()
101 |     }
102 | 
103 |     pub fn add_search_table(&mut self, ranges: Vec<(char, char)>) -> syn::Ident {
104 |         self.codegen_state.search_tables.add_table(ranges)
105 |     }
106 | 
107 |     pub fn take_search_tables(&mut self) -> SearchTableSet {
108 |         std::mem::replace(&mut self.codegen_state.search_tables, SearchTableSet::new())
109 |     }
110 | 
111 |     pub fn rule_states(&self) -> &Map<String, StateIdx> {
112 |         &self.rule_states
113 |     }
114 | 
115 |     pub fn iter_semantic_actions(&self) -> impl Iterator<Item = (SemanticActionIdx, &RuleRhs)> {
116 |         self.semantic_action_table.iter()
117 |     }
118 | 
119 |     pub fn semantic_action_fn_ident(&self, action: SemanticActionIdx) -> syn::Ident {
120 |         syn::Ident::new(
121 |             &format!("{}_ACTION_{}", self.lexer_name, action.as_usize()),
122 |             self.lexer_name.span(),
123 |         )
124 |     }
125 | }
126 | 
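// A worked example (editorial sketch, not part of the original source) of how
// `renumber_state` uses `binary_search` on the sorted `inlined_states`. With
// `inlined_states = [2, 5]`:
//
//     renumber_state(StateIdx(1))   // Err(0) -> 1 - 0 = StateIdx(1)
//     renumber_state(StateIdx(4))   // Err(1) -> 4 - 1 = StateIdx(3)
//     renumber_state(StateIdx(7))   // Err(2) -> 7 - 2 = StateIdx(5)
//
// i.e. each surviving state is shifted down by the number of inlined states that
// precede it, which is exactly the `idx` returned by the binary search.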
--------------------------------------------------------------------------------
/crates/lexgen/src/dfa/codegen/search_table.rs:
--------------------------------------------------------------------------------
1 | use crate::collections::Map;
2 | 
3 | use std::collections::hash_map::Entry;
4 | 
5 | pub struct SearchTableSet {
6 |     tables: Map<Vec<(char, char)>, syn::Ident>,
7 | }
8 | 
9 | impl SearchTableSet {
10 |     pub fn new() -> SearchTableSet {
11 |         SearchTableSet {
12 |             tables: Default::default(),
13 |         }
14 |     }
15 | 
16 |     pub fn add_table(&mut self, ranges: Vec<(char, char)>) -> syn::Ident {
17 |         let n_tables = self.tables.len();
18 |         match self.tables.entry(ranges) {
19 |             Entry::Occupied(entry) => entry.get().clone(),
20 |             Entry::Vacant(entry) => {
21 |                 let ident = syn::Ident::new(
22 |                     &format!("RANGE_TABLE_{}", n_tables),
23 |                     proc_macro2::Span::call_site(),
24 |                 );
25 |                 entry.insert(ident.clone());
26 |                 ident
27 |             }
28 |         }
29 |     }
30 | 
31 |     pub fn iter(&self) -> impl Iterator<Item = (&[(char, char)], &syn::Ident)> {
32 |         self.tables
33 |             .iter()
34 |             .map(|(ranges, ident)| (ranges.as_slice(), ident))
35 |     }
36 | 
37 |     pub fn is_empty(&self) -> bool {
38 |         self.tables.is_empty()
39 |     }
40 | }
41 | 
--------------------------------------------------------------------------------
/crates/lexgen/src/dfa/simplify.rs:
--------------------------------------------------------------------------------
1 | use super::{State, StateIdx, DFA};
2 | use crate::collections::Map;
3 | use crate::nfa::AcceptingState;
4 | 
5 | #[derive(Debug)]
6 | pub enum Trans<A> {
7 |     Accept(Vec<AcceptingState<A>>),
8 |     Trans(StateIdx),
9 | }
10 | 
11 | /// Removes accepting states with no outgoing transitions, and turns the transitions into those states into accepting transitions (`Trans::Accept`).
12 | pub fn simplify<A: Clone>(
13 |     dfa: DFA<StateIdx, A>,
14 |     dfa_state_indices: &mut Map<String, StateIdx>,
15 | ) -> DFA<Trans<A>, A> {
16 |     let mut empty_states: Vec<(StateIdx, Vec<AcceptingState<A>>)> = vec![];
17 | 
18 |     let mut non_empty_states: Vec<(StateIdx, State<StateIdx, A>)> = vec![];
19 | 
20 |     for (state_idx, state) in dfa.into_state_indices() {
21 |         if state.has_no_transitions() && !state.initial {
22 |             empty_states.push((state_idx, state.accepting));
23 |         } else {
24 |             non_empty_states.push((state_idx, state));
25 |         }
26 |     }
27 | 
28 |     for (_, t) in dfa_state_indices.iter_mut() {
29 |         let idx = match empty_states.binary_search_by(|(state_idx, _)| state_idx.cmp(t)) {
30 |             Ok(idx) | Err(idx) => idx,
31 |         };
32 |         *t = t.map(|i| i - idx);
33 |     }
34 | 
35 |     let map_transition = |t: StateIdx| -> Trans<A> {
36 |         match empty_states.binary_search_by(|(state_idx, _action)| state_idx.cmp(&t)) {
37 |             Ok(idx) => Trans::Accept(empty_states[idx].1.clone()),
38 |             Err(idx) => Trans::Trans(t.map(|i| i - idx)),
39 |         }
40 |     };
41 | 
42 |     let new_states: Vec<State<Trans<A>, A>> = non_empty_states
43 |         .into_iter()
44 |         .map(|(_state_idx, state)| {
45 |             let State {
46 |                 initial,
47 |                 char_transitions,
48 |                 range_transitions,
49 |                 any_transition,
50 |                 end_of_input_transition,
51 |                 accepting,
52 |                 predecessors,
53 |                 backtrack,
54 |             } = state;
55 | 
56 |             let char_transitions = char_transitions
57 |                 .into_iter()
58 |                 .map(|(char, next)| (char, map_transition(next)))
59 |                 .collect();
60 | 
61 |             let range_transitions = range_transitions.map(map_transition);
62 | 
63 |             let any_transition = any_transition.map(map_transition);
64 | 
65 |             let end_of_input_transition = end_of_input_transition.map(map_transition);
66 | 
67 |             let predecessors = predecessors
68 |                 .into_iter()
69 |                 .map(|pred| match map_transition(pred) {
70 |                     Trans::Trans(pred) => pred,
71 |                     _ => {
72 |                         // This pass should only remove nodes without successors, so it's a bug if
73 |                         // we remove a predecessor
74 |                         panic!("Predecessor of a state is removed in simplification")
75 |                     }
76 |                 })
77 |                 .collect();
78 | 
79 |             State {
80 |                 initial,
81 |                 char_transitions,
82 |                 range_transitions,
83 |                 any_transition,
84 |                 end_of_input_transition,
85 |                 accepting,
86 |                 predecessors,
87 |                 backtrack,
88 |             }
89 |         })
90 |         .collect();
91 | 
92 |     DFA::from_states(new_states)
93 | }
94 | 
--------------------------------------------------------------------------------
/crates/lexgen/src/dfa/simulate.rs:
--------------------------------------------------------------------------------
1 | use super::{StateIdx, DFA};
2 | 
3 | pub 
use crate::nfa::simulate::{ErrorLoc, Matches}; 4 | use crate::nfa::AcceptingState; 5 | use crate::range_map::Range; 6 | use crate::right_ctx::RightCtxDFAs; 7 | 8 | impl DFA { 9 | pub fn simulate<'input>( 10 | &self, 11 | input: &'input str, 12 | right_ctx_dfas: &RightCtxDFAs, 13 | ) -> (Matches<'input, A>, Option) { 14 | let mut values: Matches<'input, A> = vec![]; 15 | 16 | // Current state 17 | let mut state = StateIdx(0); 18 | 19 | // See comments for the same variable in NFA simulation 20 | let mut last_match: Option<(usize, A, usize)> = None; 21 | 22 | let mut char_indices = input.char_indices(); 23 | 24 | // Where the current match starts 25 | let mut match_start = 0; 26 | 27 | // Index of current character in input string 28 | let mut char_idx: usize; 29 | 30 | 'outer: loop { 31 | while let Some((char_idx_, char)) = char_indices.next() { 32 | char_idx = match_start + char_idx_; 33 | 34 | match next(self, state, char) { 35 | None => { 36 | match last_match.take() { 37 | None => { 38 | // We're stuck and can't backtrack, raise an error 39 | return (values, Some(match_start)); 40 | } 41 | Some((last_match_start, last_match_value, last_match_end)) => { 42 | // Backtrack to the previous accepting state 43 | match_start = last_match_end; 44 | char_indices = input[match_start..].char_indices(); 45 | 46 | // Accept the previous match 47 | values.push(( 48 | &input[last_match_start..last_match_end], 49 | last_match_value, 50 | )); 51 | 52 | // Restart state machine 53 | state = StateIdx(0); 54 | } 55 | } 56 | } 57 | Some(next_state) => { 58 | state = next_state; 59 | 60 | // Check for accepting state 61 | for AcceptingState { value, right_ctx } in &self.states[state.0].accepting { 62 | match right_ctx { 63 | None => { 64 | last_match = 65 | Some((match_start, *value, char_idx + char.len_utf8())); 66 | break; 67 | } 68 | Some(right_ctx_idx) => { 69 | let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); 70 | if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { 71 | last_match = 72 | Some((match_start, *value, char_idx + char.len_utf8())); 73 | break; 74 | } 75 | } 76 | } 77 | } 78 | } 79 | } 80 | } 81 | 82 | // Reached EOF, take EOF transition, check for accepting states 83 | if let Some(next) = next_end_of_input(self, state) { 84 | // Check for accepting state 85 | state = next; 86 | for AcceptingState { value, right_ctx } in &self.states[state.0].accepting { 87 | match right_ctx { 88 | None => { 89 | values.push((&input[match_start..], *value)); 90 | break 'outer; 91 | } 92 | Some(right_ctx_idx) => { 93 | let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); 94 | if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { 95 | values.push((&input[match_start..], *value)); 96 | break 'outer; 97 | } 98 | } 99 | } 100 | } 101 | } 102 | 103 | // Reached EOF but cannot accept input, backtrack if possible, otherwise raise an error 104 | match last_match.take() { 105 | Some((last_match_start, last_match_value, last_match_end)) => { 106 | values.push((&input[last_match_start..last_match_end], last_match_value)); 107 | 108 | if last_match_end == input.len() { 109 | break 'outer; 110 | } else { 111 | // Backtrack 112 | match_start = last_match_end; 113 | char_indices = input[match_start..].char_indices(); 114 | 115 | // Restart state machine 116 | state = StateIdx(0); 117 | } 118 | } 119 | None => { 120 | // We're stuck and can't backtrack, raise an error 121 | return (values, Some(match_start)); 122 | } 123 | } 124 | } 125 | 126 | (values, None) 127 | } 128 | } 129 | 130 | fn next(dfa: 
&DFA, state: StateIdx, char: char) -> Option { 131 | let state = &dfa.states[state.0]; 132 | 133 | if let Some(next) = state.char_transitions.get(&char) { 134 | return Some(*next); 135 | } 136 | 137 | for range in state.range_transitions.iter() { 138 | let Range { start, end, value } = range; 139 | if char as u32 >= *start && char as u32 <= *end { 140 | return Some(*value); 141 | } 142 | } 143 | 144 | if let Some(next) = state.any_transition { 145 | return Some(next); 146 | } 147 | 148 | None 149 | } 150 | 151 | fn next_end_of_input(dfa: &DFA, state: StateIdx) -> Option { 152 | dfa.states[state.0].end_of_input_transition 153 | } 154 | 155 | // Similar to `simulate`, but does not keep track of the last match as we don't need "longest 156 | // match" semantics and backtracking 157 | pub fn simulate_right_ctx(dfa: &DFA, char_indices: std::str::CharIndices) -> bool { 158 | let mut state = dfa.initial_state(); 159 | 160 | if dfa.is_accepting_state(state) { 161 | return true; 162 | } 163 | 164 | for (_, char) in char_indices { 165 | match next(dfa, state, char) { 166 | None => { 167 | // Stuck 168 | return false; 169 | } 170 | Some(next_state) => { 171 | if dfa.is_accepting_state(next_state) { 172 | return true; 173 | } 174 | 175 | state = next_state; 176 | } 177 | } 178 | } 179 | 180 | match next_end_of_input(dfa, state) { 181 | None => false, 182 | Some(next_state) => dfa.is_accepting_state(next_state), 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /crates/lexgen/src/display.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeSet, HashSet}; 2 | use std::fmt::{self, Display, Formatter}; 3 | 4 | pub struct BTreeSetDisplay<'a, A: Display>(pub &'a BTreeSet); 5 | 6 | pub struct HashSetDisplay<'a, A: Display, S>(pub &'a HashSet); 7 | 8 | impl Display for BTreeSetDisplay<'_, A> { 9 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 10 | display_set(f, self.0.len(), &mut self.0.iter()) 11 | } 12 | } 13 | 14 | impl Display for HashSetDisplay<'_, A, S> { 15 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 16 | display_set(f, self.0.len(), &mut self.0.iter()) 17 | } 18 | } 19 | 20 | fn display_set( 21 | f: &mut Formatter<'_>, 22 | n_elems: usize, 23 | elems: &mut dyn Iterator, 24 | ) -> fmt::Result { 25 | write!(f, "{{")?; 26 | 27 | for (elem_idx, elem) in elems.enumerate() { 28 | write!(f, "{}", elem)?; 29 | if elem_idx != n_elems - 1 { 30 | write!(f, ", ")?; 31 | } 32 | } 33 | 34 | write!(f, "}}") 35 | } 36 | -------------------------------------------------------------------------------- /crates/lexgen/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Please see the [project README][1] for usage. 2 | //! 3 | //! 
[1]: https://github.com/osa1/lexgen 4 | 5 | #![allow( 6 | clippy::collapsible_else_if, 7 | clippy::enum_variant_names, 8 | clippy::too_many_arguments, 9 | clippy::upper_case_acronyms, 10 | clippy::large_enum_variant 11 | )] 12 | 13 | mod ast; 14 | mod builtin; 15 | mod char_ranges; 16 | mod collections; 17 | mod dfa; 18 | mod display; 19 | mod nfa; 20 | mod nfa_to_dfa; 21 | mod range_map; 22 | mod regex_to_nfa; 23 | mod right_ctx; 24 | mod semantic_action_table; 25 | 26 | #[cfg(test)] 27 | mod tests; 28 | 29 | use ast::{Binding, Lexer, Regex, RegexCtx, Rule, RuleOrBinding, SingleRule, Var}; 30 | use collections::Map; 31 | use dfa::{StateIdx as DfaStateIdx, DFA}; 32 | use nfa::NFA; 33 | use nfa_to_dfa::nfa_to_dfa; 34 | use right_ctx::RightCtxDFAs; 35 | use semantic_action_table::{SemanticActionIdx, SemanticActionTable}; 36 | 37 | use std::collections::hash_map::Entry; 38 | 39 | use proc_macro::TokenStream; 40 | use syn::parse::Parser; 41 | 42 | #[proc_macro] 43 | pub fn lexer(input: TokenStream) -> TokenStream { 44 | let mut semantic_action_table = SemanticActionTable::new(); 45 | 46 | let Lexer { 47 | attrs, 48 | visibility, 49 | type_name, 50 | user_state_type, 51 | token_type, 52 | rules: top_level_rules, 53 | } = match ast::make_lexer_parser(&mut semantic_action_table).parse(input) { 54 | Ok(lexer) => lexer, 55 | Err(error) => return TokenStream::from(error.to_compile_error()), 56 | }; 57 | 58 | // Maps DFA names to their initial states in the final DFA 59 | let mut dfas: Map = Default::default(); 60 | 61 | // DFAs generated for right contexts 62 | let mut right_ctx_dfas = RightCtxDFAs::new(); 63 | 64 | let mut bindings: Map = Default::default(); 65 | 66 | let mut init_dfa: Option> = None; 67 | 68 | let mut user_error_type: Option = None; 69 | 70 | let mut unnamed_nfa: NFA = NFA::new(); 71 | 72 | // Mixing named and unnamed rules is not allowed 73 | { 74 | let mut named = false; 75 | let mut unnamed = false; 76 | for rule in &top_level_rules { 77 | match rule { 78 | Rule::RuleOrBinding(RuleOrBinding::Rule { .. }) => unnamed = true, 79 | Rule::RuleSet { .. } => named = true, 80 | _ => {} 81 | } 82 | } 83 | if named && unnamed { 84 | panic!( 85 | "Unnamed rules cannot be mixed with named rules. Make sure to either \ 86 | have all your rules in `rule ... {} ... 
{}` syntax, or remove `rule`s \ 87 | entirely and have your rules at the top-level.", 88 | '{', '}', 89 | ); 90 | } 91 | } 92 | 93 | for rule in top_level_rules { 94 | match rule { 95 | Rule::ErrorType { ty } => match user_error_type { 96 | None => { 97 | user_error_type = Some(ty); 98 | } 99 | Some(_) => panic!("Error type defined multiple times"), 100 | }, 101 | 102 | Rule::RuleOrBinding(RuleOrBinding::Binding(Binding { var, re })) => { 103 | match bindings.entry(var) { 104 | Entry::Occupied(entry) => { 105 | panic!("Variable {:?} is defined multiple times", entry.key().0); 106 | } 107 | Entry::Vacant(entry) => { 108 | entry.insert(re); 109 | } 110 | } 111 | } 112 | 113 | Rule::RuleOrBinding(RuleOrBinding::Rule(SingleRule { lhs, rhs })) => { 114 | compile_single_rule(&mut unnamed_nfa, lhs, rhs, &bindings, &mut right_ctx_dfas); 115 | } 116 | 117 | Rule::RuleSet { name, rules } => { 118 | let dfa_idx = if name == "Init" { 119 | let dfa = init_dfa.insert(compile_rule_set( 120 | rules, 121 | bindings.clone(), 122 | &mut right_ctx_dfas, 123 | )); 124 | 125 | dfa.initial_state() 126 | } else { 127 | let dfa = init_dfa 128 | .as_mut() 129 | .expect("First rule set should be named \"Init\""); 130 | 131 | let dfa_ = compile_rule_set(rules, bindings.clone(), &mut right_ctx_dfas); 132 | 133 | dfa.add_dfa(dfa_) 134 | }; 135 | 136 | if dfas.insert(name.to_string(), dfa_idx).is_some() { 137 | panic!("Rule set {:?} is defined multiple times", name.to_string()); 138 | } 139 | } 140 | } 141 | } 142 | 143 | let mut dfa = match init_dfa { 144 | Some(init_dfa) => init_dfa, 145 | None => nfa_to_dfa(&unnamed_nfa), 146 | }; 147 | 148 | dfa::update_backtracks(&mut dfa); 149 | 150 | let dfa = dfa::simplify::simplify(dfa, &mut dfas); 151 | 152 | dfa::codegen::generate( 153 | dfa, 154 | &right_ctx_dfas, 155 | semantic_action_table, 156 | user_state_type, 157 | user_error_type, 158 | dfas, 159 | type_name, 160 | token_type, 161 | visibility, 162 | attrs, 163 | ) 164 | .into() 165 | } 166 | 167 | fn compile_single_rule( 168 | nfa: &mut NFA, 169 | lhs: RegexCtx, 170 | rhs: SemanticActionIdx, 171 | bindings: &Map, 172 | right_ctx_dfas: &mut RightCtxDFAs, 173 | ) { 174 | let RegexCtx { re, right_ctx } = lhs; 175 | 176 | let right_ctx = right_ctx 177 | .as_ref() 178 | .map(|right_ctx| right_ctx_dfas.new_right_ctx(bindings, right_ctx)); 179 | 180 | nfa.add_regex(bindings, &re, right_ctx, rhs); 181 | } 182 | 183 | fn compile_rule_set( 184 | rules: Vec, 185 | mut bindings: Map, 186 | right_ctx_dfas: &mut RightCtxDFAs, 187 | ) -> DFA { 188 | let mut nfa: NFA = NFA::new(); 189 | 190 | for rule in rules { 191 | match rule { 192 | RuleOrBinding::Rule(SingleRule { lhs, rhs }) => { 193 | compile_single_rule(&mut nfa, lhs, rhs, &bindings, right_ctx_dfas); 194 | } 195 | RuleOrBinding::Binding(Binding { var, re }) => match bindings.entry(var) { 196 | Entry::Occupied(entry) => { 197 | panic!("Variable {:?} is defined multiple times", entry.key().0); 198 | } 199 | Entry::Vacant(entry) => { 200 | entry.insert(re); 201 | } 202 | }, 203 | } 204 | } 205 | 206 | nfa_to_dfa(&nfa) 207 | } 208 | -------------------------------------------------------------------------------- /crates/lexgen/src/nfa.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | pub mod simulate; 3 | 4 | use crate::ast::{Regex, Var}; 5 | use crate::collections::{Map, Set}; 6 | use crate::display::HashSetDisplay; 7 | use crate::range_map::{Range, RangeMap}; 8 | use crate::regex_to_nfa; 9 | use 
crate::right_ctx::RightCtxIdx;
10 | 
11 | /// Non-deterministic finite automaton, parameterized on values of accepting states.
12 | #[derive(Debug)]
13 | pub struct NFA<A> {
14 |     // Indexed by `StateIdx`
15 |     states: Vec<State<A>>,
16 | }
17 | 
18 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
19 | pub struct StateIdx(usize);
20 | 
21 | #[derive(Debug)]
22 | struct State<A> {
23 |     char_transitions: Map<char, Set<StateIdx>>,
24 |     range_transitions: RangeMap<Set<StateIdx>>,
25 |     empty_transitions: Set<StateIdx>,
26 |     any_transitions: Set<StateIdx>,
27 |     end_of_input_transitions: Set<StateIdx>,
28 |     accepting: Option<AcceptingState<A>>,
29 | }
30 | 
31 | #[derive(Debug, Clone, Copy)]
32 | pub struct AcceptingState<A> {
33 |     pub value: A,
34 |     pub right_ctx: Option<RightCtxIdx>,
35 | }
36 | 
37 | impl<A> State<A> {
38 |     fn new() -> State<A> {
39 |         State {
40 |             char_transitions: Default::default(),
41 |             range_transitions: Default::default(),
42 |             empty_transitions: Default::default(),
43 |             any_transitions: Default::default(),
44 |             end_of_input_transitions: Default::default(),
45 |             accepting: None,
46 |         }
47 |     }
48 | }
49 | 
50 | impl<A> NFA<A> {
51 |     pub fn new() -> NFA<A> {
52 |         NFA {
53 |             states: vec![State::new()],
54 |         }
55 |     }
56 | 
57 |     pub fn initial_state(&self) -> StateIdx {
58 |         StateIdx(0)
59 |     }
60 | 
61 |     pub fn get_accepting_state(&self, state: StateIdx) -> Option<&AcceptingState<A>> {
62 |         self.states[state.0].accepting.as_ref()
63 |     }
64 | 
65 |     pub fn char_transitions(
66 |         &self,
67 |         state: StateIdx,
68 |     ) -> impl Iterator<Item = (&char, &Set<StateIdx>)> {
69 |         self.states[state.0].char_transitions.iter()
70 |     }
71 | 
72 |     pub fn range_transitions(
73 |         &self,
74 |         state: StateIdx,
75 |     ) -> impl Iterator<Item = &Range<Set<StateIdx>>> {
76 |         self.states[state.0].range_transitions.iter()
77 |     }
78 | 
79 |     pub fn any_transitions(&self, state: StateIdx) -> impl Iterator<Item = StateIdx> + '_ {
80 |         self.states[state.0].any_transitions.iter().copied()
81 |     }
82 | 
83 |     pub fn end_of_input_transitions(&self, state: StateIdx) -> impl Iterator<Item = StateIdx> + '_ {
84 |         self.states[state.0]
85 |             .end_of_input_transitions
86 |             .iter()
87 |             .copied()
88 |     }
89 | 
90 |     pub fn new_state(&mut self) -> StateIdx {
91 |         let new_state_idx = StateIdx(self.states.len());
92 |         self.states.push(State::new());
93 |         new_state_idx
94 |     }
95 | 
96 |     pub fn add_regex(
97 |         &mut self,
98 |         bindings: &Map<Var, Regex>,
99 |         re: &Regex,
100 |         right_ctx: Option<RightCtxIdx>,
101 |         value: A,
102 |     ) {
103 |         let re_accepting_state = self.new_state();
104 | 
105 |         self.make_state_accepting(re_accepting_state, value, right_ctx);
106 | 
107 |         let re_initial_state = self.new_state();
108 |         let nfa_initial_state = self.initial_state();
109 | 
110 |         self.add_empty_transition(nfa_initial_state, re_initial_state);
111 | 
112 |         regex_to_nfa::add_re(self, bindings, re, re_initial_state, re_accepting_state);
113 |     }
114 | 
115 |     pub fn add_char_transition(&mut self, state: StateIdx, char: char, next: StateIdx) {
116 |         let not_exists = self.states[state.0]
117 |             .char_transitions
118 |             .entry(char)
119 |             .or_default()
120 |             .insert(next);
121 | 
122 |         assert!(not_exists, "add_char_transition");
123 |     }
124 | 
125 |     pub fn add_range_transition(
126 |         &mut self,
127 |         state: StateIdx,
128 |         range_start: char,
129 |         range_end: char,
130 |         next: StateIdx,
131 |     ) {
132 |         let mut set: Set<StateIdx> = Default::default();
133 |         set.insert(next);
134 |         self.states[state.0].range_transitions.insert(
135 |             range_start as u32,
136 |             range_end as u32,
137 |             set,
138 |             |values_1, values_2| values_1.extend(values_2),
139 |         );
140 |     }
141 | 
142 |     pub fn add_range_transitions(&mut self, state: StateIdx, ranges: RangeMap<()>, next: StateIdx) {
143 |         let mut set: Set<StateIdx> = Default::default();
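// Descriptive note (editorial): the singleton target set built here is cloned into every
// range of `ranges` below, and `insert_ranges` merges overlapping ranges by extending
// their target-state sets -- an NFA state may step to several states on the same character.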
144 | set.insert(next); 145 | 146 | let ranges = ranges.map(|()| set.clone()); 147 | 148 | self.states[state.0] 149 | .range_transitions 150 | .insert_ranges(ranges.into_iter(), |values_1, values_2| { 151 | values_1.extend(values_2) 152 | }); 153 | } 154 | 155 | pub fn add_empty_transition(&mut self, state: StateIdx, next: StateIdx) { 156 | let not_exists = self.states[state.0].empty_transitions.insert(next); 157 | 158 | assert!(not_exists, "add_empty_transition"); 159 | } 160 | 161 | pub fn add_any_transition(&mut self, state: StateIdx, next: StateIdx) { 162 | let not_exists = self.states[state.0].any_transitions.insert(next); 163 | 164 | assert!(not_exists, "add_any_transition"); 165 | } 166 | 167 | pub fn add_end_of_input_transition(&mut self, state: StateIdx, next: StateIdx) { 168 | let not_exists = self.states[state.0].end_of_input_transitions.insert(next); 169 | 170 | assert!(not_exists, "add_end_of_input_transition"); 171 | } 172 | 173 | fn make_state_accepting(&mut self, state: StateIdx, value: A, right_ctx: Option) { 174 | let old = self.states[state.0] 175 | .accepting 176 | .replace(AcceptingState { value, right_ctx }); 177 | 178 | assert!(old.is_none(), "make_state_accepting"); 179 | } 180 | 181 | pub fn compute_state_closure(&self, states: &Set) -> Set { 182 | let mut worklist: Vec = states.iter().copied().collect(); 183 | let mut closure: Set = states.clone(); 184 | 185 | while let Some(work) = worklist.pop() { 186 | for next_state in self.next_empty_states(work) { 187 | if closure.insert(*next_state) { 188 | worklist.push(*next_state); 189 | } 190 | } 191 | } 192 | 193 | closure 194 | } 195 | 196 | fn next_empty_states(&self, state: StateIdx) -> &Set { 197 | let state = &self.states[state.0]; 198 | &state.empty_transitions 199 | } 200 | } 201 | 202 | use std::fmt::{self, Display, Formatter}; 203 | 204 | impl Display for StateIdx { 205 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 206 | self.0.fmt(f) 207 | } 208 | } 209 | 210 | impl Display for NFA { 211 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 212 | for (state_idx, state) in self.states.iter().enumerate() { 213 | let State { 214 | char_transitions, 215 | range_transitions, 216 | empty_transitions, 217 | any_transitions, 218 | end_of_input_transitions, 219 | accepting, 220 | } = state; 221 | 222 | match accepting { 223 | Some(AcceptingState { 224 | value: _, 225 | right_ctx, 226 | }) => match right_ctx { 227 | Some(right_ctx_idx) => { 228 | write!(f, "{:>4}", format!("*{}", state_idx),)?; 229 | write!(f, " (ctx {})", right_ctx_idx.as_usize())?; 230 | } 231 | None => { 232 | write!(f, "{:>4}", format!("*{}", state_idx))?; 233 | } 234 | }, 235 | None => { 236 | write!(f, "{:>4}:", state_idx)?; 237 | } 238 | } 239 | 240 | let mut first = true; 241 | 242 | if !empty_transitions.is_empty() { 243 | if !first { 244 | write!(f, " ")?; 245 | } else { 246 | first = false; 247 | } 248 | 249 | writeln!(f, "e -> {}", HashSetDisplay(empty_transitions))?; 250 | } 251 | 252 | for (char, next) in char_transitions.iter() { 253 | if !first { 254 | write!(f, " ")?; 255 | } else { 256 | first = false; 257 | } 258 | 259 | writeln!(f, "{:?} -> {}", char, HashSetDisplay(next))?; 260 | } 261 | 262 | for range in range_transitions.iter() { 263 | if !first { 264 | write!(f, " ")?; 265 | } else { 266 | first = false; 267 | } 268 | 269 | writeln!( 270 | f, 271 | "{:?} - {:?} -> {}", 272 | range.start, 273 | range.end, 274 | HashSetDisplay(&range.value) 275 | )?; 276 | } 277 | 278 | if !any_transitions.is_empty() { 279 | if !first 
{ 280 | write!(f, " ")?; 281 | } else { 282 | first = false; 283 | } 284 | 285 | writeln!(f, "_ -> {}", HashSetDisplay(any_transitions))?; 286 | } 287 | 288 | if !end_of_input_transitions.is_empty() { 289 | if !first { 290 | write!(f, " ")?; 291 | } 292 | 293 | writeln!(f, "$ -> {}", HashSetDisplay(end_of_input_transitions))?; 294 | } 295 | 296 | if empty_transitions.is_empty() 297 | && char_transitions.is_empty() 298 | && range_transitions.is_empty() 299 | && any_transitions.is_empty() 300 | && end_of_input_transitions.is_empty() 301 | { 302 | writeln!(f)?; 303 | } 304 | } 305 | 306 | Ok(()) 307 | } 308 | } 309 | -------------------------------------------------------------------------------- /crates/lexgen/src/nfa/simulate.rs: -------------------------------------------------------------------------------- 1 | use super::{AcceptingState, StateIdx, NFA}; 2 | use crate::collections::Set; 3 | use crate::dfa::simulate::simulate_right_ctx; 4 | use crate::dfa::StateIdx as DfaStateIdx; 5 | use crate::right_ctx::RightCtxDFAs; 6 | 7 | pub type Matches<'input, A> = Vec<(&'input str, A)>; 8 | 9 | pub type ErrorLoc = usize; 10 | 11 | impl NFA { 12 | pub fn simulate<'input>( 13 | &self, 14 | input: &'input str, 15 | right_ctx_dfas: &RightCtxDFAs, 16 | ) -> (Matches<'input, A>, Option) { 17 | let mut values: Matches<'input, A> = vec![]; 18 | 19 | // If we skipped an accepting state because we were able to make progress with the next 20 | // character, this state holds the previous match. If we get stuck we return this match. 21 | // 22 | // This implements backtracking in regexes like: 23 | // 24 | // - aaaaaab 25 | // - a 26 | // 27 | // in an input like "aaaa". 28 | let mut last_match: Option<(usize, A, usize)> = None; 29 | 30 | let mut states: Set = Default::default(); 31 | states.insert(StateIdx(0)); 32 | states = self.compute_state_closure(&states); 33 | 34 | let mut char_indices = input.char_indices(); 35 | 36 | // Where the current match starts 37 | let mut match_start: usize = 0; 38 | 39 | // Index of current character in input string 40 | let mut char_idx; 41 | 42 | 'outer: loop { 43 | while let Some((char_idx_, char)) = char_indices.next() { 44 | char_idx = match_start + char_idx_; 45 | 46 | states = next(self, &states, char); 47 | 48 | // When stuck check if we skipped an accepting state 49 | if states.is_empty() { 50 | match last_match.take() { 51 | None => { 52 | // We're stuck and can't backtrack, raise an error 53 | return (values, Some(match_start)); 54 | } 55 | Some((last_match_start, last_match_value, last_match_end)) => { 56 | // Backtrack to the previous accepting state 57 | match_start = last_match_end; 58 | char_indices = input[match_start..].char_indices(); 59 | 60 | // Accept the previous match 61 | values 62 | .push((&input[last_match_start..last_match_end], last_match_value)); 63 | 64 | // Restart state machine 65 | states.insert(StateIdx(0)); 66 | states = self.compute_state_closure(&states); 67 | } 68 | } 69 | } else { 70 | // Check for accepting states. Sort states to pick the one that comes first in 71 | // the program. 
72 | let mut states_sorted: Vec = states.iter().copied().collect(); 73 | states_sorted.sort(); 74 | for state in states_sorted { 75 | if let Some(AcceptingState { value, right_ctx }) = 76 | &self.states[state.0].accepting 77 | { 78 | match right_ctx { 79 | None => { 80 | last_match = 81 | Some((match_start, *value, char_idx + char.len_utf8())); 82 | break; 83 | } 84 | Some(right_ctx_idx) => { 85 | let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); 86 | if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { 87 | last_match = 88 | Some((match_start, *value, char_idx + char.len_utf8())); 89 | break; 90 | } 91 | } 92 | } 93 | } 94 | } 95 | } 96 | } 97 | 98 | // Reached EOF, take EOF transitions, check for accepting states 99 | states = next_end_of_input(self, &states); 100 | 101 | { 102 | let mut states_sorted: Vec = states.iter().copied().collect(); 103 | states_sorted.sort(); 104 | 105 | for state in states_sorted { 106 | if let Some(AcceptingState { value, right_ctx }) = 107 | &self.states[state.0].accepting 108 | { 109 | match right_ctx { 110 | None => { 111 | values.push((&input[match_start..], *value)); 112 | break 'outer; 113 | } 114 | Some(right_ctx_idx) => { 115 | let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); 116 | if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { 117 | values.push((&input[match_start..], *value)); 118 | break 'outer; 119 | } 120 | } 121 | } 122 | } 123 | } 124 | } 125 | 126 | // Reached EOF but cannot accept input, backtrack if possible, otherwise raise an error 127 | match last_match.take() { 128 | Some((last_match_start, last_match_value, last_match_end)) => { 129 | values.push((&input[last_match_start..last_match_end], last_match_value)); 130 | 131 | if last_match_end == input.len() { 132 | break 'outer; 133 | } else { 134 | // Backtrack 135 | match_start = last_match_end; 136 | char_indices = input[match_start..].char_indices(); 137 | 138 | // Restart state machine 139 | states.insert(StateIdx(0)); 140 | states = self.compute_state_closure(&states); 141 | } 142 | } 143 | None => { 144 | // We're stuck and can't backtrack, raise an error 145 | return (values, Some(match_start)); 146 | } 147 | } 148 | } 149 | 150 | (values, None) 151 | } 152 | } 153 | 154 | fn next(nfa: &NFA, states: &Set, char: char) -> Set { 155 | let mut next_states: Set = Default::default(); 156 | 157 | for state in states { 158 | // Char transitions 159 | if let Some(char_nexts) = nfa.states[state.0].char_transitions.get(&char) { 160 | next_states.extend(char_nexts.iter()); 161 | } 162 | 163 | // Range transitions 164 | for range in nfa.states[state.0].range_transitions.iter() { 165 | if char as u32 >= range.start && char as u32 <= range.end { 166 | next_states.extend(range.value.clone()); 167 | } 168 | } 169 | 170 | // Any transitions 171 | next_states.extend(nfa.states[state.0].any_transitions.iter().copied()); 172 | } 173 | 174 | nfa.compute_state_closure(&next_states) 175 | } 176 | 177 | fn next_end_of_input(nfa: &NFA, states: &Set) -> Set { 178 | let mut next_states: Set = Default::default(); 179 | 180 | for state in states { 181 | next_states.extend(nfa.states[state.0].end_of_input_transitions.iter().copied()); 182 | } 183 | 184 | nfa.compute_state_closure(&next_states) 185 | } 186 | -------------------------------------------------------------------------------- /crates/lexgen/src/nfa_to_dfa.rs: -------------------------------------------------------------------------------- 1 | use crate::collections::{Map, Set}; 2 | use crate::dfa::DFA; 3 | use 
crate::nfa::NFA;
4 | use crate::range_map::{Range, RangeMap};
5 | 
6 | use crate::dfa::StateIdx as DfaStateIdx;
7 | use crate::nfa::StateIdx as NfaStateIdx;
8 | 
9 | use std::collections::hash_map::Entry;
10 | use std::collections::BTreeSet;
11 | 
12 | pub fn nfa_to_dfa<A: Clone>(nfa: &NFA<A>) -> DFA<DfaStateIdx, A> {
13 |     let initial_state = nfa.initial_state();
14 | 
15 |     let initial_states: BTreeSet<NfaStateIdx> = {
16 |         let mut initial_states: Set<NfaStateIdx> = Default::default();
17 |         initial_states.insert(initial_state);
18 | 
19 |         nfa.compute_state_closure(&initial_states)
20 |             .into_iter()
21 |             .collect()
22 |     };
23 | 
24 |     let (mut dfa, dfa_initial_state): (DFA<DfaStateIdx, A>, DfaStateIdx) = DFA::new();
25 | 
26 |     // Maps sets of NFA states to their states in the DFA
27 |     let mut state_map: Map<BTreeSet<NfaStateIdx>, DfaStateIdx> = Default::default();
28 |     state_map.insert(initial_states.clone(), dfa_initial_state);
29 | 
30 |     let mut work_list: Vec<BTreeSet<NfaStateIdx>> = vec![initial_states];
31 |     let mut finished_dfa_states: Set<DfaStateIdx> = Default::default();
32 | 
33 |     while let Some(current_nfa_states) = work_list.pop() {
34 |         let current_dfa_state = match state_map.get(&current_nfa_states) {
35 |             None => {
36 |                 let dfa_state = dfa.new_state();
37 |                 state_map.insert(current_nfa_states.clone(), dfa_state);
38 |                 dfa_state
39 |             }
40 |             Some(dfa_state) => *dfa_state,
41 |         };
42 | 
43 |         if finished_dfa_states.contains(&current_dfa_state) {
44 |             continue;
45 |         }
46 | 
47 |         finished_dfa_states.insert(current_dfa_state);
48 | 
49 |         let mut char_transitions: Map<char, Set<NfaStateIdx>> = Default::default();
50 |         let mut range_transitions: RangeMap<Set<NfaStateIdx>> = Default::default();
51 |         let mut any_transitions: Set<NfaStateIdx> = Default::default();
52 |         let mut end_of_input_transitions: Set<NfaStateIdx> = Default::default();
53 | 
54 |         for nfa_state in current_nfa_states.iter().copied() {
55 |             if let Some(value) = nfa.get_accepting_state(nfa_state) {
56 |                 dfa.make_state_accepting(current_dfa_state, value.clone());
57 |             }
58 | 
59 |             // Collect char transitions
60 |             for (char, next_states) in nfa.char_transitions(nfa_state) {
61 |                 char_transitions
62 |                     .entry(*char)
63 |                     .or_default()
64 |                     .extend(next_states.iter().copied());
65 |             }
66 | 
67 |             // Collect range transitions
68 |             for range in nfa.range_transitions(nfa_state) {
69 |                 range_transitions.insert(
70 |                     range.start,
71 |                     range.end,
72 |                     range.value.clone(),
73 |                     |states_1, states_2| states_1.extend(states_2.into_iter()),
74 |                 );
75 |             }
76 | 
77 |             // Collect any transitions
78 |             any_transitions.extend(nfa.any_transitions(nfa_state));
79 | 
80 |             // Collect end-of-input transitions
81 |             end_of_input_transitions.extend(nfa.end_of_input_transitions(nfa_state));
82 |         }
83 | 
84 |         // Compute closures of transition targets and add transitions to DFA
85 |         for (char, mut char_states) in char_transitions.into_iter() {
86 |             // For ranges that also cover the char we need to add the range transitions to the char
87 |             // transition
88 |             for range in range_transitions.iter() {
89 |                 if range.contains(char) {
90 |                     for range_state in &range.value {
91 |                         char_states.insert(*range_state);
92 |                     }
93 |                 }
94 |             }
95 | 
96 |             // Same for '_' (match any character) transitions
97 |             for any_next in &any_transitions {
98 |                 char_states.insert(*any_next);
99 |             }
100 | 
101 |             let closure: BTreeSet<NfaStateIdx> = nfa
102 |                 .compute_state_closure(&char_states)
103 |                 .into_iter()
104 |                 .collect();
105 |             let dfa_state = dfa_state_of_nfa_states(&mut dfa, &mut state_map, closure.clone());
106 |             dfa.add_char_transition(current_dfa_state, char, dfa_state);
107 | 
108 |             work_list.push(closure);
109 |         }
110 | 
111 |         let mut dfa_range_transitions: Vec<Range<DfaStateIdx>> =
112 | 
Vec::with_capacity(range_transitions.len()); 113 | 114 | for range in range_transitions.into_iter() { 115 | let mut range_states: Set = range.value; 116 | 117 | for any_next in &any_transitions { 118 | range_states.insert(*any_next); 119 | } 120 | 121 | let closure: BTreeSet = nfa 122 | .compute_state_closure(&range_states) 123 | .into_iter() 124 | .collect(); 125 | 126 | let dfa_state = dfa_state_of_nfa_states(&mut dfa, &mut state_map, closure.clone()); 127 | 128 | dfa_range_transitions.push(Range { 129 | start: range.start, 130 | end: range.end, 131 | value: dfa_state, 132 | }); 133 | 134 | work_list.push(closure); 135 | } 136 | 137 | dfa.set_range_transitions( 138 | current_dfa_state, 139 | RangeMap::from_non_overlapping_sorted_ranges(dfa_range_transitions), 140 | ); 141 | 142 | { 143 | let closure: BTreeSet = nfa 144 | .compute_state_closure(&any_transitions) 145 | .into_iter() 146 | .collect(); 147 | 148 | if !closure.is_empty() { 149 | let dfa_state = dfa_state_of_nfa_states(&mut dfa, &mut state_map, closure.clone()); 150 | dfa.set_any_transition(current_dfa_state, dfa_state); 151 | work_list.push(closure); 152 | } 153 | } 154 | 155 | { 156 | let closure: BTreeSet = nfa 157 | .compute_state_closure(&end_of_input_transitions) 158 | .into_iter() 159 | .collect(); 160 | 161 | if !closure.is_empty() { 162 | let dfa_state = dfa_state_of_nfa_states(&mut dfa, &mut state_map, closure.clone()); 163 | dfa.set_end_of_input_transition(current_dfa_state, dfa_state); 164 | work_list.push(closure); 165 | } 166 | } 167 | } 168 | 169 | dfa 170 | } 171 | 172 | fn dfa_state_of_nfa_states( 173 | dfa: &mut DFA, 174 | state_map: &mut Map, DfaStateIdx>, 175 | states: BTreeSet, 176 | ) -> DfaStateIdx { 177 | match state_map.entry(states) { 178 | Entry::Occupied(entry) => *entry.get(), 179 | Entry::Vacant(entry) => { 180 | let dfa_state = dfa.new_state(); 181 | entry.insert(dfa_state); 182 | dfa_state 183 | } 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /crates/lexgen/src/regex_to_nfa.rs: -------------------------------------------------------------------------------- 1 | use crate::ast::{Builtin, CharOrRange, Regex, Var}; 2 | use crate::builtin::{BuiltinCharRange, BUILTIN_RANGES}; 3 | use crate::collections::Map; 4 | use crate::nfa::{StateIdx, NFA}; 5 | use crate::range_map::{Range, RangeMap}; 6 | 7 | pub fn add_re( 8 | nfa: &mut NFA, 9 | bindings: &Map, 10 | re: &Regex, 11 | current: StateIdx, 12 | cont: StateIdx, 13 | ) { 14 | match re { 15 | Regex::Builtin(builtin_name) => { 16 | let builtin = get_builtin_regex(builtin_name); 17 | 18 | let ranges: Vec> = builtin 19 | .get_ranges() 20 | .iter() 21 | .copied() 22 | .map(|(start, end)| Range { 23 | start, 24 | end, 25 | value: (), 26 | }) 27 | .collect(); 28 | 29 | let map = RangeMap::from_non_overlapping_sorted_ranges(ranges); 30 | 31 | nfa.add_range_transitions(current, map, cont); 32 | } 33 | 34 | Regex::Var(var) => { 35 | let re = bindings 36 | .get(var) 37 | .unwrap_or_else(|| panic!("Unbound variable {:?}", var.0)); 38 | 39 | add_re(nfa, bindings, re, current, cont); 40 | } 41 | 42 | Regex::Char(char) => { 43 | nfa.add_char_transition(current, *char, cont); 44 | } 45 | 46 | Regex::String(str) => { 47 | let mut iter = str.chars().peekable(); 48 | let mut current = current; 49 | while let Some(char) = iter.next() { 50 | let next = if iter.peek().is_some() { 51 | nfa.new_state() 52 | } else { 53 | cont 54 | }; 55 | nfa.add_char_transition(current, char, next); 56 | current = next; 57 | } 58 | } 59 
| 60 | Regex::CharSet(set) => { 61 | for char in &set.0 { 62 | match char { 63 | CharOrRange::Char(char) => { 64 | nfa.add_char_transition(current, *char, cont); 65 | } 66 | CharOrRange::Range(range_start, range_end) => { 67 | nfa.add_range_transition(current, *range_start, *range_end, cont); 68 | } 69 | } 70 | } 71 | } 72 | 73 | Regex::ZeroOrMore(re) => { 74 | let re_init = nfa.new_state(); 75 | let re_cont = nfa.new_state(); 76 | add_re(nfa, bindings, re, re_init, re_cont); 77 | nfa.add_empty_transition(current, cont); 78 | nfa.add_empty_transition(current, re_init); 79 | nfa.add_empty_transition(re_cont, cont); 80 | nfa.add_empty_transition(re_cont, re_init); 81 | } 82 | 83 | Regex::OneOrMore(re) => { 84 | let re_init = nfa.new_state(); 85 | let re_cont = nfa.new_state(); 86 | add_re(nfa, bindings, re, re_init, re_cont); 87 | nfa.add_empty_transition(current, re_init); 88 | nfa.add_empty_transition(re_cont, cont); 89 | nfa.add_empty_transition(re_cont, re_init); 90 | } 91 | 92 | Regex::ZeroOrOne(re) => { 93 | let re_init = nfa.new_state(); 94 | add_re(nfa, bindings, re, re_init, cont); 95 | nfa.add_empty_transition(current, cont); 96 | nfa.add_empty_transition(current, re_init); 97 | } 98 | 99 | Regex::Concat(re1, re2) => { 100 | let re1_cont = nfa.new_state(); 101 | add_re(nfa, bindings, re1, current, re1_cont); 102 | add_re(nfa, bindings, re2, re1_cont, cont); 103 | } 104 | 105 | Regex::Or(re1, re2) => { 106 | let re1_init = nfa.new_state(); 107 | let re2_init = nfa.new_state(); 108 | add_re(nfa, bindings, re1, re1_init, cont); 109 | add_re(nfa, bindings, re2, re2_init, cont); 110 | nfa.add_empty_transition(current, re1_init); 111 | nfa.add_empty_transition(current, re2_init); 112 | } 113 | 114 | Regex::Any => { 115 | nfa.add_any_transition(current, cont); 116 | } 117 | 118 | Regex::EndOfInput => { 119 | nfa.add_end_of_input_transition(current, cont); 120 | } 121 | 122 | Regex::Diff(_, _) => { 123 | let map = regex_to_range_map(bindings, re); 124 | nfa.add_range_transitions(current, map, cont); 125 | } 126 | } 127 | } 128 | 129 | fn get_builtin_regex(builtin: &Builtin) -> BuiltinCharRange { 130 | BUILTIN_RANGES 131 | .iter() 132 | .find_map(|(name, builtin_)| { 133 | if *name == builtin.0 { 134 | Some(*builtin_) 135 | } else { 136 | None 137 | } 138 | }) 139 | .unwrap_or_else(|| panic!("Unknown builtin regex: {}", builtin.0)) 140 | } 141 | 142 | fn regex_to_range_map(bindings: &Map, re: &Regex) -> RangeMap<()> { 143 | match re { 144 | Regex::Builtin(builtin) => { 145 | let builtin = get_builtin_regex(builtin); 146 | let ranges: Vec> = builtin 147 | .get_ranges() 148 | .iter() 149 | .copied() 150 | .map(|(start, end)| Range { 151 | start, 152 | end, 153 | value: (), 154 | }) 155 | .collect(); 156 | RangeMap::from_non_overlapping_sorted_ranges(ranges) 157 | } 158 | 159 | Regex::Var(var) => { 160 | let re = bindings 161 | .get(var) 162 | .unwrap_or_else(|| panic!("Unbound variable {:?}", var.0)); 163 | 164 | regex_to_range_map(bindings, re) 165 | } 166 | 167 | Regex::Char(char) => { 168 | let mut map = RangeMap::new(); 169 | map.insert(*char as u32, *char as u32, (), merge_values); 170 | map 171 | } 172 | 173 | Regex::String(_) => panic!("strings cannot be used in char sets (`#`)"), 174 | 175 | Regex::CharSet(char_set) => { 176 | let mut map = RangeMap::new(); 177 | 178 | // TODO: Quadratic behavior below, `RangeMap::insert` is O(number of ranges) 179 | for char_or_range in char_set.0.iter() { 180 | match char_or_range { 181 | CharOrRange::Char(char) => { 182 | map.insert(*char as u32, 
*char as u32, (), merge_values);
183 |                     }
184 |                     CharOrRange::Range(start, end) => {
185 |                         map.insert(*start as u32, *end as u32, (), merge_values);
186 |                     }
187 |                 }
188 |             }
189 | 
190 |             map
191 |         }
192 | 
193 |         Regex::ZeroOrMore(_) => {
194 |             panic!("`*` cannot be used in char sets (`#`)");
195 |         }
196 | 
197 |         Regex::OneOrMore(_) => {
198 |             panic!("`+` cannot be used in char sets (`#`)");
199 |         }
200 | 
201 |         Regex::ZeroOrOne(_) => {
202 |             panic!("`?` cannot be used in char sets (`#`)");
203 |         }
204 | 
205 |         Regex::Concat(_, _) => {
206 |             panic!("concatenation (` `) cannot be used in char sets (`#`)");
207 |         }
208 | 
209 |         Regex::Or(re1, re2) => {
210 |             let mut map1 = regex_to_range_map(bindings, re1);
211 |             let map2 = regex_to_range_map(bindings, re2);
212 | 
213 |             map1.insert_ranges(map2.into_iter(), merge_values);
214 | 
215 |             map1
216 |         }
217 | 
218 |         Regex::Any => {
219 |             let mut map = RangeMap::new();
220 |             map.insert(0, char::MAX as u32, (), merge_values);
221 |             map
222 |         }
223 | 
224 |         Regex::EndOfInput => panic!("`$` cannot be used in char sets (`#`)"),
225 | 
226 |         Regex::Diff(re1, re2) => {
227 |             let mut map1 = regex_to_range_map(bindings, re1);
228 |             let map2 = regex_to_range_map(bindings, re2);
229 |             map1.remove_ranges(&map2);
230 |             map1
231 |         }
232 |     }
233 | }
234 | 
235 | fn merge_values(_val1: &mut (), _val2: ()) {}
236 | 
--------------------------------------------------------------------------------
/crates/lexgen/src/right_ctx.rs:
--------------------------------------------------------------------------------
1 | //! Stuff related to right contexts
2 | //!
3 | //! A right context is a limited version of lookahead. A rule can have at most one right context.
4 | //! When a rule has a right context, after the regex for the rule matches, we run the DFA for the
5 | //! right context on a clone of the input stream. Only if the right context also matches do we
6 | //! consider the rule a match. This provides simple "lookahead" support, which should be good
7 | //! enough when lexing programming languages.
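//!
//! A minimal sketch (editorial, assuming the `re1 > re2` right-context surface syntax from the
//! project README): a rule such as
//!
//!     "if" > ($$whitespace | '('),
//!
//! matches the keyword `if` only when the next character is whitespace or `(`. The lookahead
//! characters are matched on a clone of the input, so they are not consumed by the rule.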
8 | 9 | use crate::ast::{Regex, Var}; 10 | use crate::collections::Map; 11 | // use crate::dfa::simplify::{simplify, Trans}; 12 | use crate::dfa::{StateIdx, DFA}; 13 | use crate::nfa::NFA; 14 | use crate::nfa_to_dfa::nfa_to_dfa; 15 | 16 | #[derive(Debug)] 17 | pub struct RightCtxDFAs { 18 | dfas: Vec>, 19 | } 20 | 21 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] 22 | pub struct RightCtxIdx(usize); 23 | 24 | impl RightCtxIdx { 25 | pub fn as_usize(&self) -> usize { 26 | self.0 27 | } 28 | } 29 | 30 | impl RightCtxDFAs { 31 | pub fn new() -> Self { 32 | RightCtxDFAs { dfas: vec![] } 33 | } 34 | 35 | pub fn iter(&self) -> impl Iterator)> { 36 | self.dfas 37 | .iter() 38 | .enumerate() 39 | .map(|(i, dfa)| (RightCtxIdx(i), dfa)) 40 | } 41 | } 42 | 43 | impl RightCtxDFAs { 44 | pub fn new_right_ctx(&mut self, bindings: &Map, right_ctx: &Regex) -> RightCtxIdx { 45 | let idx = self.dfas.len(); 46 | 47 | let mut nfa: NFA<()> = NFA::new(); 48 | nfa.add_regex(bindings, right_ctx, None, ()); 49 | 50 | let dfa = nfa_to_dfa(&nfa); 51 | self.dfas.push(dfa); 52 | 53 | RightCtxIdx(idx) 54 | } 55 | 56 | #[cfg(test)] 57 | pub fn get(&self, right_ctx: &RightCtxIdx) -> &DFA { 58 | &self.dfas[right_ctx.as_usize()] 59 | } 60 | 61 | /* 62 | pub fn simplify(self) -> RightCtxDFAs> { 63 | RightCtxDFAs { 64 | dfas: self 65 | .dfas 66 | .into_iter() 67 | .map(|dfa| simplify::<(), ()>(dfa, &mut Default::default())) 68 | .collect(), 69 | } 70 | } 71 | */ 72 | } 73 | -------------------------------------------------------------------------------- /crates/lexgen/src/semantic_action_table.rs: -------------------------------------------------------------------------------- 1 | use crate::ast::RuleRhs; 2 | 3 | pub struct SemanticActionTable { 4 | table: Vec, 5 | } 6 | 7 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 8 | pub struct SemanticActionIdx(usize); 9 | 10 | impl SemanticActionTable { 11 | pub fn new() -> Self { 12 | Self { table: vec![] } 13 | } 14 | 15 | pub fn add(&mut self, action: RuleRhs) -> SemanticActionIdx { 16 | let idx = self.table.len(); 17 | self.table.push(action); 18 | SemanticActionIdx(idx) 19 | } 20 | 21 | pub fn iter(&self) -> impl Iterator { 22 | self.table 23 | .iter() 24 | .enumerate() 25 | .map(|(idx, expr)| (SemanticActionIdx(idx), expr)) 26 | } 27 | } 28 | 29 | impl SemanticActionIdx { 30 | pub fn as_usize(&self) -> usize { 31 | self.0 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /crates/lexgen/src/tests.rs: -------------------------------------------------------------------------------- 1 | use crate::ast::{CharOrRange, CharSet, Regex, Var}; 2 | use crate::collections::Map; 3 | use crate::dfa::StateIdx as DfaStateIdx; 4 | use crate::nfa::simulate::{ErrorLoc, Matches}; 5 | use crate::nfa::NFA; 6 | use crate::nfa_to_dfa::nfa_to_dfa; 7 | use crate::right_ctx::RightCtxDFAs; 8 | 9 | fn test_simulate<'input, A: Copy + std::fmt::Debug + Eq>( 10 | nfa: &NFA, 11 | test_cases: Vec<(&'input str, Matches<'input, A>, Option)>, 12 | ) { 13 | test_simulate_right_ctx(nfa, &RightCtxDFAs::new(), test_cases) 14 | } 15 | 16 | fn test_simulate_right_ctx<'input, A: Copy + std::fmt::Debug + Eq>( 17 | nfa: &NFA, 18 | right_ctx_dfas: &RightCtxDFAs, 19 | test_cases: Vec<(&'input str, Matches<'input, A>, Option)>, 20 | ) { 21 | println!("NFA=\n{}", nfa); 22 | 23 | let dfa = nfa_to_dfa(nfa); 24 | 25 | println!("DFA=\n{}", dfa); 26 | 27 | for (str, expected_matches, expected_error) in test_cases { 28 | let expected = 
(expected_matches, expected_error); 29 | 30 | assert_eq!( 31 | &nfa.simulate(str, right_ctx_dfas), 32 | &expected, 33 | "NFA simulation failed for string: {:?}", 34 | str 35 | ); 36 | 37 | assert_eq!( 38 | dfa.simulate(str, right_ctx_dfas), 39 | expected, 40 | "DFA simulation failed for string: {:?}", 41 | str 42 | ); 43 | } 44 | } 45 | 46 | #[test] 47 | fn simulate_backtracking() { 48 | let mut nfa: NFA = NFA::new(); 49 | 50 | nfa.add_regex( 51 | &Default::default(), 52 | &Regex::Concat( 53 | Box::new(Regex::OneOrMore(Box::new(Regex::Char('a')))), 54 | Box::new(Regex::Char('b')), 55 | ), 56 | None, 57 | 1, 58 | ); 59 | 60 | nfa.add_regex(&Default::default(), &Regex::Char('a'), None, 2); 61 | 62 | test_simulate( 63 | &nfa, 64 | vec![ 65 | ("a", vec![("a", 2)], None), 66 | ("aa", vec![("a", 2), ("a", 2)], None), 67 | ("aab", vec![("aab", 1)], None), 68 | ], 69 | ); 70 | } 71 | 72 | #[test] 73 | fn issue_16() { 74 | let mut nfa: NFA = NFA::new(); 75 | 76 | nfa.add_regex( 77 | &Default::default(), 78 | &Regex::String("xyzxyz".to_owned()), 79 | None, 80 | 1, 81 | ); 82 | nfa.add_regex( 83 | &Default::default(), 84 | &Regex::String("xyz".to_owned()), 85 | None, 86 | 2, 87 | ); 88 | nfa.add_regex( 89 | &Default::default(), 90 | &Regex::String("xya".to_owned()), 91 | None, 92 | 3, 93 | ); 94 | 95 | test_simulate( 96 | &nfa, 97 | vec![ 98 | ("xyzxya", vec![("xyz", 2), ("xya", 3)], None), 99 | ("xyzxyz", vec![("xyzxyz", 1)], None), 100 | ], 101 | ); 102 | } 103 | 104 | #[test] 105 | fn stuck_1() { 106 | let nfa: NFA = NFA::new(); 107 | test_simulate(&nfa, vec![("a", vec![], Some(0))]); 108 | } 109 | 110 | #[test] 111 | fn stuck_2() { 112 | let mut nfa: NFA = NFA::new(); 113 | 114 | nfa.add_regex( 115 | &Default::default(), 116 | &Regex::String("ab".to_owned()), 117 | None, 118 | 1, 119 | ); 120 | 121 | test_simulate(&nfa, vec![("aba", vec![("ab", 1)], Some(2))]); 122 | } 123 | 124 | #[test] 125 | fn stuck_3() { 126 | let mut nfa: NFA = NFA::new(); 127 | 128 | nfa.add_regex( 129 | &Default::default(), 130 | &Regex::String("aaab".to_owned()), 131 | None, 132 | 1, 133 | ); 134 | nfa.add_regex(&Default::default(), &Regex::String("a".to_owned()), None, 2); 135 | 136 | test_simulate(&nfa, vec![("aaabb", vec![("aaab", 1)], Some(4))]); 137 | } 138 | 139 | #[test] 140 | fn simulate_char() { 141 | let re = Regex::Char('a'); 142 | let mut nfa: NFA = NFA::new(); 143 | nfa.add_regex(&Default::default(), &re, None, 1); 144 | 145 | test_simulate( 146 | &nfa, 147 | vec![ 148 | ("aa", vec![("a", 1), ("a", 1)], None), 149 | ("b", vec![], Some(0)), 150 | ], 151 | ); 152 | } 153 | 154 | #[test] 155 | fn simulate_string() { 156 | let re = Regex::String("ab".to_owned()); 157 | let mut nfa: NFA = NFA::new(); 158 | nfa.add_regex(&Default::default(), &re, None, 1); 159 | 160 | test_simulate( 161 | &nfa, 162 | vec![ 163 | ("a", vec![], Some(0)), 164 | ("ab", vec![("ab", 1)], None), 165 | ("abc", vec![("ab", 1)], Some(2)), 166 | ], 167 | ); 168 | } 169 | 170 | #[test] 171 | fn simulate_char_set_char() { 172 | let re = Regex::CharSet(CharSet(vec![ 173 | CharOrRange::Char('a'), 174 | CharOrRange::Char('b'), 175 | ])); 176 | let mut nfa: NFA = NFA::new(); 177 | nfa.add_regex(&Default::default(), &re, None, 1); 178 | 179 | test_simulate( 180 | &nfa, 181 | vec![ 182 | ("a", vec![("a", 1)], None), 183 | ("b", vec![("b", 1)], None), 184 | ("ab", vec![("a", 1), ("b", 1)], None), 185 | ("ba", vec![("b", 1), ("a", 1)], None), 186 | ], 187 | ); 188 | } 189 | 190 | #[test] 191 | fn simulate_char_set_range() { 192 | let re = 
Regex::CharSet(CharSet(vec![ 193 | CharOrRange::Char('a'), 194 | CharOrRange::Char('b'), 195 | CharOrRange::Range('0', '9'), 196 | ])); 197 | let mut nfa: NFA = NFA::new(); 198 | nfa.add_regex(&Default::default(), &re, None, 1); 199 | 200 | test_simulate( 201 | &nfa, 202 | vec![("ab09", vec![("a", 1), ("b", 1), ("0", 1), ("9", 1)], None)], 203 | ); 204 | } 205 | 206 | #[test] 207 | fn simulate_zero_or_more() { 208 | let re = Regex::ZeroOrMore(Box::new(Regex::Char('a'))); 209 | let mut nfa: NFA = NFA::new(); 210 | nfa.add_regex(&Default::default(), &re, None, 1); 211 | 212 | test_simulate( 213 | &nfa, 214 | vec![ 215 | // TODO 216 | // ("", vec![], None), 217 | ("a", vec![("a", 1)], None), 218 | ("aa", vec![("aa", 1)], None), 219 | ("aab", vec![("aa", 1)], Some(2)), 220 | ], 221 | ); 222 | } 223 | 224 | #[test] 225 | fn simulate_one_or_more() { 226 | let re = Regex::OneOrMore(Box::new(Regex::Char('a'))); 227 | let mut nfa: NFA = NFA::new(); 228 | nfa.add_regex(&Default::default(), &re, None, 1); 229 | 230 | test_simulate( 231 | &nfa, 232 | vec![ 233 | ("", vec![], Some(0)), 234 | ("a", vec![("a", 1)], None), 235 | ("aa", vec![("aa", 1)], None), 236 | ("aab", vec![("aa", 1)], Some(2)), 237 | ], 238 | ); 239 | } 240 | 241 | #[test] 242 | fn simulate_zero_or_one() { 243 | let re = Regex::ZeroOrOne(Box::new(Regex::Char('a'))); 244 | let mut nfa: NFA = NFA::new(); 245 | nfa.add_regex(&Default::default(), &re, None, 1); 246 | 247 | test_simulate( 248 | &nfa, 249 | vec![ 250 | ("", vec![], Some(0)), 251 | ("a", vec![("a", 1)], None), 252 | ("aa", vec![("a", 1), ("a", 1)], None), 253 | ("aab", vec![("a", 1), ("a", 1)], Some(2)), 254 | ], 255 | ); 256 | } 257 | 258 | #[test] 259 | fn simulate_concat() { 260 | let re = Regex::Concat(Box::new(Regex::Char('a')), Box::new(Regex::Char('b'))); 261 | let mut nfa: NFA = NFA::new(); 262 | nfa.add_regex(&Default::default(), &re, None, 1); 263 | 264 | test_simulate( 265 | &nfa, 266 | vec![ 267 | ("a", vec![], Some(0)), 268 | ("ab", vec![("ab", 1)], None), 269 | ("aba", vec![("ab", 1)], Some(2)), 270 | ], 271 | ); 272 | } 273 | 274 | #[test] 275 | fn simulate_or() { 276 | let re = Regex::Or(Box::new(Regex::Char('a')), Box::new(Regex::Char('b'))); 277 | let mut nfa: NFA = NFA::new(); 278 | nfa.add_regex(&Default::default(), &re, None, 1); 279 | 280 | test_simulate( 281 | &nfa, 282 | vec![ 283 | ("a", vec![("a", 1)], None), 284 | ("b", vec![("b", 1)], None), 285 | ("ab", vec![("a", 1), ("b", 1)], None), 286 | ], 287 | ); 288 | } 289 | 290 | #[test] 291 | fn simulate_or_one_or_more_char() { 292 | let re = Regex::Or( 293 | Box::new(Regex::OneOrMore(Box::new(Regex::Char('a')))), 294 | Box::new(Regex::Char('b')), 295 | ); 296 | let mut nfa: NFA = NFA::new(); 297 | nfa.add_regex(&Default::default(), &re, None, 1); 298 | 299 | test_simulate( 300 | &nfa, 301 | vec![ 302 | ("a", vec![("a", 1)], None), 303 | ("b", vec![("b", 1)], None), 304 | ("aa", vec![("aa", 1)], None), 305 | ], 306 | ); 307 | } 308 | 309 | #[test] 310 | fn simulate_multiple_accepting_states_1() { 311 | let re1 = Regex::String("aaaa".to_owned()); 312 | let re2 = Regex::String("aaab".to_owned()); 313 | let mut nfa: NFA = NFA::new(); 314 | nfa.add_regex(&Default::default(), &re1, None, 1); 315 | nfa.add_regex(&Default::default(), &re2, None, 2); 316 | 317 | test_simulate( 318 | &nfa, 319 | vec![ 320 | ("aaaa", vec![("aaaa", 1)], None), 321 | ("aaab", vec![("aaab", 2)], None), 322 | ("aaac", vec![], Some(0)), 323 | ], 324 | ); 325 | } 326 | 327 | #[test] 328 | fn multiple_accepting_states_2() { 329 | let 
330 |         Box::new(Regex::OneOrMore(Box::new(Regex::Char('a')))),
331 |         Box::new(Regex::Char('b')),
332 |     );
333 |     let re2 = Regex::CharSet(CharSet(vec![CharOrRange::Range('0', '9')]));
334 |     let mut nfa: NFA<usize> = NFA::new();
335 |     nfa.add_regex(&Default::default(), &re1, None, 1);
336 |     nfa.add_regex(&Default::default(), &re2, None, 2);
337 | 
338 |     test_simulate(
339 |         &nfa,
340 |         vec![
341 |             ("b", vec![("b", 1)], None),
342 |             ("a", vec![("a", 1)], None),
343 |             ("aa", vec![("aa", 1)], None),
344 |             ("0", vec![("0", 2)], None),
345 |         ],
346 |     );
347 | }
348 | 
349 | #[test]
350 | fn simulate_variables() {
351 |     let mut bindings: Map<Var, Regex> = Default::default();
352 |     bindings.insert(
353 |         Var("initial".to_owned()),
354 |         Regex::CharSet(CharSet(vec![CharOrRange::Range('a', 'z')])),
355 |     );
356 |     bindings.insert(
357 |         Var("subsequent".to_owned()),
358 |         Regex::CharSet(CharSet(vec![
359 |             CharOrRange::Range('a', 'z'),
360 |             CharOrRange::Range('A', 'Z'),
361 |             CharOrRange::Range('0', '9'),
362 |             CharOrRange::Char('-'),
363 |             CharOrRange::Char('_'),
364 |         ])),
365 |     );
366 |     let re = Regex::Concat(
367 |         Box::new(Regex::Var(Var("initial".to_owned()))),
368 |         Box::new(Regex::ZeroOrMore(Box::new(Regex::Var(Var(
369 |             "subsequent".to_owned()
370 |         ))))),
371 |     );
372 |     let mut nfa: NFA<usize> = NFA::new();
373 |     nfa.add_regex(&bindings, &re, None, 1);
374 | 
375 |     test_simulate(
376 |         &nfa,
377 |         vec![
378 |             ("a", vec![("a", 1)], None),
379 |             ("aA", vec![("aA", 1)], None),
380 |             ("aA123-a", vec![("aA123-a", 1)], None),
381 |         ],
382 |     );
383 | }
384 | 
385 | #[test]
386 | fn zero_or_more_concat_confusion_1() {
387 |     let mut nfa: NFA<usize> = NFA::new();
388 | 
389 |     let re = Regex::Concat(
390 |         Box::new(Regex::ZeroOrMore(Box::new(Regex::Char('a')))),
391 |         Box::new(Regex::Char('a')),
392 |     );
393 | 
394 |     nfa.add_regex(&Default::default(), &re, None, 1);
395 | 
396 |     test_simulate(
397 |         &nfa,
398 |         vec![("a", vec![("a", 1)], None), ("aa", vec![("aa", 1)], None)],
399 |     );
400 | }
401 | 
402 | #[test]
403 | fn zero_or_more_concat_confusion_2() {
404 |     let mut nfa: NFA<usize> = NFA::new();
405 | 
406 |     let re = Regex::Concat(
407 |         Box::new(Regex::ZeroOrMore(Box::new(Regex::Char('a')))),
408 |         Box::new(Regex::String("ab".to_owned())),
409 |     );
410 | 
411 |     nfa.add_regex(&Default::default(), &re, None, 1);
412 | 
413 |     test_simulate(
414 |         &nfa,
415 |         vec![
416 |             ("ab", vec![("ab", 1)], None),
417 |             ("aab", vec![("aab", 1)], None),
418 |         ],
419 |     );
420 | }
421 | 
422 | #[test]
423 | fn zero_or_more_concat_confusion_3() {
424 |     let mut nfa: NFA<usize> = NFA::new();
425 | 
426 |     let re = Regex::Concat(
427 |         Box::new(Regex::Concat(
428 |             Box::new(Regex::Char('a')),
429 |             Box::new(Regex::ZeroOrMore(Box::new(Regex::Char('a')))),
430 |         )),
431 |         Box::new(Regex::Char('a')),
432 |     );
433 | 
434 |     nfa.add_regex(&Default::default(), &re, None, 1);
435 | 
436 |     test_simulate(
437 |         &nfa,
438 |         vec![
439 |             ("a", vec![], Some(0)),
440 |             ("aa", vec![("aa", 1)], None),
441 |             ("aaa", vec![("aaa", 1)], None),
442 |         ],
443 |     );
444 | }
445 | 
446 | #[test]
447 | fn simulate_any_1() {
448 |     let mut nfa: NFA<usize> = NFA::new();
449 | 
450 |     nfa.add_regex(
451 |         &Default::default(),
452 |         &Regex::String("ab".to_owned()),
453 |         None,
454 |         1,
455 |     );
456 |     nfa.add_regex(&Default::default(), &Regex::Any, None, 2);
457 | 
458 |     test_simulate(
459 |         &nfa,
460 |         vec![
461 |             ("a", vec![("a", 2)], None),
462 |             ("ab", vec![("ab", 1)], None),
463 |             ("abc", vec![("ab", 1), ("c", 2)], None),
464 |         ],
465 |     );
466 | }
467 | 
468 | #[test]
469 | fn simulate_any_2() {
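    // `Regex::Any` matches exactly one character. The regex built below is
    // '\'' _ '\'' in lexer syntax: any single character between two single
    // quotes.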
470 |     let mut nfa: NFA<usize> = NFA::new();
471 | 
472 |     nfa.add_regex(
473 |         &Default::default(),
474 |         &Regex::Concat(
475 |             Box::new(Regex::Char('\'')),
476 |             Box::new(Regex::Concat(
477 |                 Box::new(Regex::Any),
478 |                 Box::new(Regex::Char('\'')),
479 |             )),
480 |         ),
481 |         None,
482 |         1,
483 |     );
484 | 
485 |     test_simulate(&nfa, vec![("'a'", vec![("'a'", 1)], None)]);
486 | }
487 | 
488 | #[test]
489 | fn simulate_end_of_input_1() {
490 |     let mut nfa: NFA<usize> = NFA::new();
491 | 
492 |     // C-style single-line comment syntax: "//" _* ('\n' | $)
493 |     nfa.add_regex(
494 |         &Default::default(),
495 |         &Regex::Concat(
496 |             Box::new(Regex::String("//".to_owned())),
497 |             Box::new(Regex::Concat(
498 |                 Box::new(Regex::ZeroOrMore(Box::new(Regex::Any))),
499 |                 Box::new(Regex::Or(
500 |                     Box::new(Regex::Char('\n')),
501 |                     Box::new(Regex::EndOfInput),
502 |                 )),
503 |             )),
504 |         ),
505 |         None,
506 |         1,
507 |     );
508 | 
509 |     test_simulate(
510 |         &nfa,
511 |         vec![
512 |             ("//", vec![("//", 1)], None),
513 |             ("// \n", vec![("// \n", 1)], None),
514 |             ("// ", vec![("// ", 1)], None),
515 |         ],
516 |     );
517 | }
518 | 
519 | #[test]
520 | fn simulate_end_of_input_2() {
521 |     let mut nfa: NFA<usize> = NFA::new();
522 | 
523 |     nfa.add_regex(&Default::default(), &Regex::EndOfInput, None, 1);
524 |     nfa.add_regex(
525 |         &Default::default(),
526 |         &Regex::ZeroOrMore(Box::new(Regex::Any)),
527 |         None,
528 |         2,
529 |     );
530 | 
531 |     // TODO: EndOfInput never matches?
532 |     test_simulate(&nfa, vec![("a", vec![("a", 2)], None)]);
533 | }
534 | 
535 | #[test]
536 | fn simulate_multiple_accepting_states_3() {
537 |     let mut nfa: NFA<usize> = NFA::new();
538 | 
539 |     nfa.add_regex(
540 |         &Default::default(),
541 |         &Regex::String("aaa".to_owned()),
542 |         None,
543 |         1,
544 |     );
545 |     nfa.add_regex(
546 |         &Default::default(),
547 |         &Regex::String("aaa".to_owned()),
548 |         None,
549 |         2,
550 |     );
551 |     nfa.add_regex(
552 |         &Default::default(),
553 |         &Regex::String("aa".to_owned()),
554 |         None,
555 |         3,
556 |     );
557 | 
558 |     test_simulate(
559 |         &nfa,
560 |         vec![
561 |             ("aaa", vec![("aaa", 1)], None),
562 |             ("aa", vec![("aa", 3)], None),
563 |         ],
564 |     );
565 | }
566 | 
567 | #[test]
568 | fn range_and_char_confusion() {
569 |     let mut nfa: NFA<usize> = NFA::new();
570 | 
571 |     nfa.add_regex(
572 |         &Default::default(),
573 |         &Regex::String("ab".to_owned()),
574 |         None,
575 |         1,
576 |     );
577 |     nfa.add_regex(
578 |         &Default::default(),
579 |         &Regex::OneOrMore(Box::new(Regex::CharSet(CharSet(vec![CharOrRange::Range(
580 |             'a', 'z',
581 |         )])))),
582 |         None,
583 |         2,
584 |     );
585 | 
586 |     test_simulate(
587 |         &nfa,
588 |         vec![("ab", vec![("ab", 1)], None), ("ac", vec![("ac", 2)], None)],
589 |     );
590 | }
591 | 
592 | #[test]
593 | fn overlapping_ranges() {
594 |     let mut nfa: NFA<usize> = NFA::new();
595 | 
596 |     nfa.add_regex(
597 |         &Default::default(),
598 |         &Regex::Concat(
599 |             Box::new(Regex::CharSet(CharSet(vec![CharOrRange::Range('a', 'b')]))),
600 |             Box::new(Regex::Char('1')),
601 |         ),
602 |         None,
603 |         1,
604 |     );
605 |     nfa.add_regex(
606 |         &Default::default(),
607 |         &Regex::Concat(
608 |             Box::new(Regex::CharSet(CharSet(vec![CharOrRange::Range('a', 'c')]))),
609 |             Box::new(Regex::Char('2')),
610 |         ),
611 |         None,
612 |         2,
613 |     );
614 | 
615 |     test_simulate(
616 |         &nfa,
617 |         vec![("a1", vec![("a1", 1)], None), ("a2", vec![("a2", 2)], None)],
618 |     );
619 | }
620 | 
621 | #[test]
622 | fn right_context_1() {
623 |     let mut nfa: NFA<usize> = NFA::new();
624 |     let mut right_ctxs = RightCtxDFAs::new();
625 | 
626 |     let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::Char('a'));
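    // 'a' with right context 'a': the lookahead must match for the rule to
    // fire, but it is not consumed, so only the first 'a' is part of the
    // match.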
627 |     nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1);
628 | 
629 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("aa", vec![("a", 1)], Some(1))]);
630 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("ab", vec![], Some(0))]);
631 | }
632 | 
633 | #[test]
634 | fn right_context_2() {
635 |     let mut nfa: NFA<usize> = NFA::new();
636 |     let mut right_ctxs = RightCtxDFAs::new();
637 | 
638 |     let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::Any);
639 |     nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1);
640 | 
641 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("aa", vec![("a", 1)], Some(1))]);
642 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("ab", vec![("a", 1)], Some(1))]);
643 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("a", vec![], Some(0))]);
644 | }
645 | 
646 | #[test]
647 | fn right_context_3() {
648 |     let mut nfa: NFA<usize> = NFA::new();
649 |     let mut right_ctxs = RightCtxDFAs::new();
650 | 
651 |     let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::EndOfInput);
652 |     nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1);
653 | 
654 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("a", vec![("a", 1)], None)]);
655 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("ab", vec![], Some(0))]);
656 | }
657 | 
658 | #[test]
659 | fn right_context_4() {
660 |     let mut nfa: NFA<usize> = NFA::new();
661 |     let mut right_ctxs = RightCtxDFAs::new();
662 | 
663 |     let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::Char('a'));
664 |     nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1);
665 | 
666 |     let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::EndOfInput);
667 |     nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 2);
668 | 
669 |     test_simulate_right_ctx(
670 |         &nfa,
671 |         &right_ctxs,
672 |         vec![("aa", vec![("a", 1), ("a", 2)], None)],
673 |     );
674 | }
675 | 
--------------------------------------------------------------------------------
/crates/lexgen/tests/bugs.rs:
--------------------------------------------------------------------------------
1 | mod test_utils;
2 | 
3 | use lexgen::lexer;
4 | use lexgen_util::{LexerError, LexerErrorKind};
5 | use test_utils::{loc, next};
6 | 
7 | #[test]
8 | fn failure_confusion_1() {
9 |     // The bug: in the lexer below, when the input is "\\\"", the first backslash would be pushed
10 |     // to the string buffer by the catch-all (now called "failure") case. The correct behaviour
11 |     // is that the failure case should only run if none of the other rules match to completion.
12 | 
13 |     #[derive(Debug, Default)]
14 |     struct LexerState {
15 |         buf: String,
16 |     }
17 | 
18 |     lexer!
{ 19 | Lexer(LexerState) -> String; 20 | 21 | let whitespace = [' ' '\t' '\n']; 22 | 23 | '"' => |lexer| { 24 | println!("matched a double quote"); 25 | let str = std::mem::take(&mut lexer.state().buf); 26 | lexer.return_(str) 27 | }, 28 | 29 | "\\\"" => |lexer| { 30 | println!("matched an escaped double quote"); 31 | lexer.state().buf.push('"'); 32 | lexer.continue_() 33 | }, 34 | 35 | _ => |lexer| { 36 | let char = lexer.match_().chars().next_back().unwrap(); 37 | println!("wildcard matched {:?}", char); 38 | lexer.state().buf.push(char); 39 | lexer.continue_() 40 | }, 41 | } 42 | 43 | let mut lexer = Lexer::new("test\""); 44 | assert_eq!(next(&mut lexer), Some(Ok("test".to_owned()))); 45 | assert_eq!(next(&mut lexer), None); 46 | 47 | let mut lexer = Lexer::new("\\\"\""); 48 | assert_eq!(next(&mut lexer), Some(Ok("\"".to_owned()))); 49 | assert_eq!(next(&mut lexer), None); 50 | } 51 | 52 | #[test] 53 | fn failure_confusion_2() { 54 | // Similar to the bug above: the failure case should run if none of the other rules match to 55 | // completion. 56 | 57 | #[derive(Debug, Default)] 58 | struct LexerState { 59 | comment_depth: usize, 60 | } 61 | 62 | lexer! { 63 | Lexer(LexerState) -> (); 64 | 65 | 66 | rule Init { 67 | ' ', 68 | 69 | "(*" => |lexer| { 70 | lexer.state().comment_depth = 1; 71 | lexer.switch(LexerRule::Comment) 72 | }, 73 | } 74 | 75 | rule Comment { 76 | "(*" => |lexer| { 77 | let depth = &mut lexer.state().comment_depth; 78 | *depth += 1; 79 | lexer.continue_() 80 | }, 81 | 82 | "*)" => |lexer| { 83 | let depth = &mut lexer.state().comment_depth; 84 | if *depth == 1 { 85 | lexer.switch(LexerRule::Init) 86 | } else { 87 | *depth -= 1; 88 | lexer.continue_() 89 | } 90 | }, 91 | 92 | _, 93 | } 94 | } 95 | 96 | let mut lexer = Lexer::new("(* * *) (* (* ** *) *)"); 97 | assert_eq!(lexer.next(), None); 98 | } 99 | 100 | #[test] 101 | fn failure_confusion_3_1() { 102 | lexer! { 103 | Lexer -> usize; 104 | 105 | ' ' = 0, 106 | "ab" = 1, 107 | _ = 2, 108 | } 109 | 110 | let mut lexer = Lexer::new("a ab abc"); 111 | assert_eq!(next(&mut lexer), Some(Ok(2))); 112 | assert_eq!(next(&mut lexer), Some(Ok(0))); 113 | assert_eq!(next(&mut lexer), Some(Ok(1))); 114 | assert_eq!(next(&mut lexer), Some(Ok(0))); 115 | assert_eq!(next(&mut lexer), Some(Ok(1))); 116 | assert_eq!(next(&mut lexer), Some(Ok(2))); 117 | assert_eq!(next(&mut lexer), None); 118 | } 119 | 120 | #[test] 121 | fn failure_confusion_3_2() { 122 | // In practice the case we test in the previous test happens when lexing single-letter 123 | // identifiers in a lexer that allows multi-letter identifiers (i.e. practically all language 124 | // lexers). Here's a more realistic example: 125 | lexer! { 126 | Lexer -> usize; 127 | 128 | $$ascii_lowercase+ = 1, 129 | ',' = 2, 130 | } 131 | 132 | let mut lexer = Lexer::new("f,"); 133 | assert_eq!(next(&mut lexer), Some(Ok(1))); 134 | assert_eq!(next(&mut lexer), Some(Ok(2))); 135 | assert_eq!(next(&mut lexer), None); 136 | } 137 | 138 | #[test] 139 | fn failure_confusion_4() { 140 | lexer! { 141 | Lexer -> u32; 142 | 143 | ' ', 144 | "aaa" = 1, 145 | "aa" = 2, 146 | _ = 3, 147 | } 148 | 149 | let mut lexer = Lexer::new("aaa aa a"); 150 | 151 | assert_eq!(next(&mut lexer), Some(Ok(1))); 152 | assert_eq!(next(&mut lexer), Some(Ok(2))); 153 | assert_eq!(next(&mut lexer), Some(Ok(3))); 154 | assert_eq!(next(&mut lexer), None); 155 | } 156 | 157 | #[test] 158 | fn continue_confusion_1() { 159 | lexer! 
{
160 |         Lexer -> u32;
161 | 
162 |         _,
163 |     }
164 | 
165 |     let mut lexer = Lexer::new("");
166 |     assert_eq!(lexer.next(), None);
167 | 
168 |     let mut lexer = Lexer::new("a");
169 |     assert_eq!(lexer.next(), None);
170 | 
171 |     let mut lexer = Lexer::new("aaa");
172 |     assert_eq!(lexer.next(), None);
173 | }
174 | 
175 | #[test]
176 | fn continue_confusion_2() {
177 |     lexer! {
178 |         Lexer -> u32;
179 | 
180 |         rule Init {
181 |             _ => |lexer| lexer.switch(LexerRule::Test),
182 |         }
183 | 
184 |         // Previously failure code would run on end-of-stream, which resets the state to `Test` and
185 |         // continues, causing a loop.
186 |         //
187 |         // This issue does not exist in `Init` as we explicitly handle EOF there, to stop the main
188 |         // loop.
189 |         //
190 |         // Instead end-of-stream in a state other than `Init` should fail with "unexpected EOF".
191 |         rule Test {
192 |             _,
193 |         }
194 |     }
195 | 
196 |     let mut lexer = Lexer::new("a");
197 |     assert!(matches!(lexer.next(), Some(Err(_))));
198 | 
199 |     let mut lexer = Lexer::new("aa");
200 |     assert!(matches!(lexer.next(), Some(Err(_))));
201 | }
202 | 
203 | #[test]
204 | fn return_should_reset_match() {
205 |     lexer! {
206 |         Lexer -> &'input str;
207 | 
208 |         rule Init {
209 |             "aaa" => |lexer| {
210 |                 let match_ = lexer.match_();
211 |                 lexer.switch_and_return(LexerRule::State1, match_)
212 |             },
213 |         }
214 | 
215 |         rule State1 {
216 |             "bbb" => |lexer| {
217 |                 let match_ = lexer.match_();
218 |                 lexer.switch_and_return(LexerRule::Init, match_)
219 |             },
220 |         }
221 |     }
222 | 
223 |     let mut lexer = Lexer::new("aaabbb");
224 |     assert_eq!(next(&mut lexer), Some(Ok("aaa")));
225 |     assert_eq!(next(&mut lexer), Some(Ok("bbb")));
226 |     assert_eq!(next(&mut lexer), None);
227 | }
228 | 
229 | #[test]
230 | fn issue_16_backtracking_1() {
231 |     lexer! {
232 |         Lexer -> &'input str;
233 | 
234 |         'a'+ 'b' => |lexer| {
235 |             let match_ = lexer.match_();
236 |             lexer.return_(match_)
237 |         },
238 | 
239 |         'a' => |lexer| {
240 |             let match_ = lexer.match_();
241 |             lexer.return_(match_)
242 |         },
243 |     }
244 | 
245 |     let mut lexer = Lexer::new("aaaab");
246 |     assert_eq!(next(&mut lexer), Some(Ok("aaaab")));
247 |     assert_eq!(next(&mut lexer), None);
248 | 
249 |     let mut lexer = Lexer::new("aaaa");
250 |     assert_eq!(next(&mut lexer), Some(Ok("a")));
251 |     assert_eq!(next(&mut lexer), Some(Ok("a")));
252 |     assert_eq!(next(&mut lexer), Some(Ok("a")));
253 |     assert_eq!(next(&mut lexer), Some(Ok("a")));
254 |     assert_eq!(next(&mut lexer), None);
255 | }
256 | 
257 | #[test]
258 | fn issue_16_backtracking_2() {
259 |     fn return_match<'input, I: Iterator<Item = char> + Clone>(
260 |         lexer: &mut Lexer<'input, I>,
261 |     ) -> lexgen_util::SemanticActionResult<&'input str> {
262 |         let match_ = lexer.match_();
263 |         lexer.return_(match_)
264 |     }
265 | 
266 |     lexer! {
267 |         Lexer -> &'input str;
268 | 
269 |         "xyzxyz" => return_match,
270 |         "xyz" => return_match,
271 |         "xya" => return_match,
272 |     }
273 | 
274 |     let mut lexer = Lexer::new("xyzxya");
275 |     assert_eq!(next(&mut lexer), Some(Ok("xyz")));
276 |     assert_eq!(next(&mut lexer), Some(Ok("xya")));
277 |     assert_eq!(next(&mut lexer), None);
278 | }
279 | 
280 | #[test]
281 | fn end_of_input_handling() {
282 |     lexer! {
283 |         Lexer -> (usize, &'input str);
284 | 
285 |         rule Init {
286 |             'a' => |lexer| {
287 |                 let match_ = lexer.match_();
288 |                 lexer.switch_and_return(LexerRule::Rule1, (0, match_))
289 |             },
290 |         }
291 | 
292 |         rule Rule1 {
293 |             $,
294 | 
295 |             'a' => |lexer| {
296 |                 let match_ = lexer.match_();
297 |                 lexer.return_((1, match_))
298 |             },
299 |         }
300 |     }
301 | 
302 |     let mut lexer = Lexer::new("aa");
303 |     assert_eq!(
304 |         lexer.next(),
305 |         Some(Ok((loc(0, 0, 0), (0, "a"), loc(0, 1, 1))))
306 |     );
307 |     assert_eq!(
308 |         lexer.next(),
309 |         Some(Ok((loc(0, 1, 1), (1, "a"), loc(0, 2, 2))))
310 |     );
311 |     assert_eq!(lexer.next(), None);
312 | }
313 | 
314 | #[test]
315 | fn empty_rule_simplification_issue_27() {
316 |     // Tests that:
317 |     //
318 |     // 1. Simplifier doesn't eliminate empty (i.e. no outgoing transitions) initial states without
319 |     //    incoming transitions. Since initial states can be switched to in semantic actions we
320 |     //    cannot know that we won't ever switch to them, so we cannot eliminate them.
321 |     //
322 |     // 2. When running a semantic action we reset `last_match` so if the next state is empty we
323 |     //    fail, instead of backtracking.
324 | 
325 |     lexer! {
326 |         Lexer -> &'input str;
327 | 
328 |         rule Init {
329 |             "0x" => |lexer| lexer.switch(LexerRule::HexInt),
330 |             '0' => |lexer| lexer.switch(LexerRule::DecInt),
331 |         }
332 | 
333 |         rule DecInt {
334 |             _ => |lexer| lexer.return_("wat"),
335 |         }
336 | 
337 |         rule HexInt {}
338 |     }
339 | 
340 |     let mut lexer = Lexer::new("0xff");
341 | 
342 |     // This used to return `Some("wat")` with the bug
343 |     assert_eq!(
344 |         next(&mut lexer),
345 |         Some(Err(LexerError {
346 |             location: loc(0, 0, 0),
347 |             kind: LexerErrorKind::InvalidToken,
348 |         }))
349 |     );
350 | }
351 | 
352 | #[test]
353 | fn range_any_overlap_issue_31() {
354 |     lexer! {
355 |         Lexer -> usize;
356 | 
357 |         "'" _ "'" = 1,
358 |         "'" ['a'-'z']+ = 2,
359 |     }
360 | 
361 |     let input = "'a'";
362 |     let mut lexer = Lexer::new(input);
363 |     assert_eq!(lexer.next(), Some(Ok((loc(0, 0, 0), 1, loc(0, 3, 3)))));
364 |     assert_eq!(lexer.next(), None);
365 | }
366 | 
367 | #[test]
368 | fn failure_should_reset_state_issue_48() {
369 |     lexer! {
370 |         Lexer -> &'input str;
371 | 
372 |         rule Init {
373 |             's' => |lexer|
374 |                 lexer.switch_and_return(LexerRule::InString, lexer.match_()),
375 |         }
376 | 
377 |         rule InString {
378 |             'a' => |lexer|
379 |                 lexer.switch_and_return(LexerRule::Init, lexer.match_()),
380 |         }
381 |     }
382 | 
383 |     let input = "sxasa";
384 |     let mut lexer = Lexer::new(input);
385 | 
386 |     assert_eq!(lexer.next(), Some(Ok((loc(0, 0, 0), "s", loc(0, 1, 1)))));
387 |     assert_eq!(
388 |         lexer.next(),
389 |         Some(Err(LexerError {
390 |             location: loc(0, 1, 1),
391 |             kind: LexerErrorKind::InvalidToken
392 |         }))
393 |     );
394 |     assert_eq!(
395 |         lexer.next(),
396 |         Some(Err(LexerError {
397 |             location: loc(0, 2, 2),
398 |             kind: LexerErrorKind::InvalidToken
399 |         }))
400 |     );
401 |     assert_eq!(lexer.next(), Some(Ok((loc(0, 3, 3), "s", loc(0, 4, 4)))));
402 |     assert_eq!(lexer.next(), Some(Ok((loc(0, 4, 4), "a", loc(0, 5, 5)))));
403 |     assert_eq!(lexer.next(), None);
404 | }
405 | 
406 | #[test]
407 | fn new_methods_no_default() {
408 |     // #54: `new_with_state` and `new_from_iter_with_state` shouldn't require state to implement
409 |     // `Default`
410 | 
411 |     struct UserState {}
412 | 
413 |     lexer!
{ 414 | Lexer(UserState) -> (); 415 | 416 | $ = (), 417 | } 418 | 419 | Lexer::new_with_state("", UserState {}); 420 | Lexer::new_from_iter_with_state(std::iter::empty(), UserState {}); 421 | } 422 | 423 | #[test] 424 | fn new_methods_default() { 425 | // #54: `new` and `new_from_iter` should work with user state that implements `Default` 426 | 427 | #[derive(Default)] 428 | struct UserState {} 429 | 430 | lexer! { 431 | Lexer(UserState) -> (); 432 | 433 | $ = (), 434 | } 435 | 436 | Lexer::new(""); 437 | Lexer::new_from_iter(std::iter::empty()); 438 | } 439 | -------------------------------------------------------------------------------- /crates/lexgen/tests/lua_5_1.rs: -------------------------------------------------------------------------------- 1 | // A Lua 5.1 lexer. We use this as 2 | // 3 | // - An example: this file is linked from README 4 | // 5 | // - A test: `test_data` contains all Lua files in Lua 5.1 source distribution, we lex it using 6 | // this lexer as a test. 7 | // 8 | // - A benchmark: We also use `test_data` lexing time as a runtime benchmark. 9 | 10 | use lexgen::lexer; 11 | 12 | //////////////////////////////////////////////////////////////////////////////// 13 | // // 14 | // Lexer definition and tests // 15 | // // 16 | //////////////////////////////////////////////////////////////////////////////// 17 | 18 | #[derive(Debug, PartialEq, Eq, Clone)] 19 | enum Token<'input> { 20 | Plus, 21 | Minus, 22 | Star, 23 | Slash, 24 | Percent, 25 | Caret, 26 | Hash, 27 | EqEq, 28 | TildeEq, 29 | LtEq, 30 | GtEq, 31 | Lt, 32 | Gt, 33 | Eq, 34 | LParen, 35 | RParen, 36 | LBrace, 37 | RBrace, 38 | LBracket, 39 | RBracket, 40 | Semicolon, 41 | Colon, 42 | Comma, 43 | Dot, 44 | DotDot, 45 | DotDotDot, 46 | Keyword(Keyword), 47 | String(StringToken<'input>), 48 | Var(&'input str), 49 | Number(&'input str), // uninterpreted 50 | } 51 | 52 | /// Raw string tokens are borrowed from the input string. Interpreted strings are copied and owned. 53 | #[derive(Debug, PartialEq, Eq, Clone)] 54 | enum StringToken<'input> { 55 | Raw(&'input str), 56 | Interpreted(String), 57 | } 58 | 59 | #[derive(Debug, PartialEq, Eq, Clone)] 60 | enum Keyword { 61 | And, 62 | Break, 63 | Do, 64 | Else, 65 | ElseIf, 66 | End, 67 | False, 68 | For, 69 | Function, 70 | If, 71 | In, 72 | Local, 73 | Nil, 74 | Not, 75 | Or, 76 | Repeat, 77 | Return, 78 | Then, 79 | True, 80 | Until, 81 | While, 82 | } 83 | 84 | #[derive(Debug, Default, Clone)] 85 | struct LexerState { 86 | /// Number of opening `=`s seen when parsing a long string 87 | long_string_opening_eqs: usize, 88 | /// Number of closing `=`s seen when parsing a long string 89 | long_string_closing_eqs: usize, 90 | /// When parsing a short string, whether it's started with a double or single quote 91 | short_string_delim: Quote, 92 | /// Buffer for strings 93 | string_buf: String, 94 | /// When parsing a long string, whether we're inside a comment or not. When inside a comment we 95 | /// don't return a token. Otherwise we return a string. 96 | in_comment: bool, 97 | } 98 | 99 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] 100 | enum Quote { 101 | #[default] // arbitrary 102 | Single, 103 | Double, 104 | } 105 | 106 | lexer! 
{ 107 | Lexer(LexerState) -> Token<'input>; 108 | 109 | let whitespace = [' ' '\t' '\n'] | "\r\n"; 110 | 111 | rule Init { 112 | $whitespace, 113 | 114 | "+" = Token::Plus, 115 | "-" = Token::Minus, 116 | "*" = Token::Star, 117 | "/" = Token::Slash, 118 | "%" = Token::Percent, 119 | "^" = Token::Caret, 120 | "#" = Token::Hash, 121 | "==" = Token::EqEq, 122 | "~=" = Token::TildeEq, 123 | "<=" = Token::LtEq, 124 | ">=" = Token::GtEq, 125 | "<" = Token::Lt, 126 | ">" = Token::Gt, 127 | "=" = Token::Eq, 128 | "(" = Token::LParen, 129 | ")" = Token::RParen, 130 | "{" = Token::LBrace, 131 | "}" = Token::RBrace, 132 | "]" = Token::RBracket, 133 | ";" = Token::Semicolon, 134 | ":" = Token::Colon, 135 | "," = Token::Comma, 136 | "." = Token::Dot, 137 | ".." = Token::DotDot, 138 | "..." = Token::DotDotDot, 139 | "and" = Token::Keyword(Keyword::And), 140 | "break" = Token::Keyword(Keyword::Break), 141 | "do" = Token::Keyword(Keyword::Do), 142 | "else" = Token::Keyword(Keyword::Else), 143 | "elseif" = Token::Keyword(Keyword::ElseIf), 144 | "end" = Token::Keyword(Keyword::End), 145 | "false" = Token::Keyword(Keyword::False), 146 | "for" = Token::Keyword(Keyword::For), 147 | "function" = Token::Keyword(Keyword::Function), 148 | "if" = Token::Keyword(Keyword::If), 149 | "in" = Token::Keyword(Keyword::In), 150 | "local" = Token::Keyword(Keyword::Local), 151 | "nil" = Token::Keyword(Keyword::Nil), 152 | "not" = Token::Keyword(Keyword::Not), 153 | "or" = Token::Keyword(Keyword::Or), 154 | "repeat" = Token::Keyword(Keyword::Repeat), 155 | "return" = Token::Keyword(Keyword::Return), 156 | "then" = Token::Keyword(Keyword::Then), 157 | "true" = Token::Keyword(Keyword::True), 158 | "until" = Token::Keyword(Keyword::Until), 159 | "while" = Token::Keyword(Keyword::While), 160 | 161 | '"' => |lexer| { 162 | lexer.state().short_string_delim = Quote::Double; 163 | lexer.state().string_buf.clear(); 164 | lexer.switch(LexerRule::String) 165 | }, 166 | 167 | '\'' => |lexer| { 168 | lexer.state().short_string_delim = Quote::Single; 169 | lexer.state().string_buf.clear(); 170 | lexer.switch(LexerRule::String) 171 | }, 172 | 173 | "[" => |lexer| { 174 | match lexer.peek() { 175 | Some('[') | Some('=') => { 176 | lexer.state().long_string_opening_eqs = 0; 177 | lexer.state().in_comment = false; 178 | lexer.switch(LexerRule::LongStringBracketLeft) 179 | } 180 | _ => lexer.return_(Token::LBracket), 181 | } 182 | }, 183 | 184 | "--" => |lexer| { 185 | lexer.switch(LexerRule::EnterComment) 186 | }, 187 | 188 | // > Names (also called identifiers) in Lua can be any string of letters, digits, and 189 | // > underscores, not beginning with a digit. This coincides with the definition of names 190 | // > in most languages. (The definition of letter depends on the current locale: any 191 | // > character considered alphabetic by the current locale can be used in an identifier.) 192 | let var_init = ['a'-'z' 'A'-'Z' '_']; 193 | let var_subseq = $var_init | ['0'-'9']; 194 | 195 | $var_init $var_subseq* => |lexer| { 196 | let match_ = lexer.match_(); 197 | lexer.return_(Token::Var(match_)) 198 | }, 199 | 200 | let digit = ['0'-'9']; 201 | let hex_digit = ['a'-'f' 'A'-'F' '0'-'9']; 202 | 203 | $digit+ ('.'? $digit+ (('e' | 'E') ('+'|'-')? $digit+)?)? 
=> |lexer| { 204 | let match_ = lexer.match_(); 205 | lexer.return_(Token::Number(match_)) 206 | }, 207 | 208 | "0x" $hex_digit+ => |lexer| { 209 | let match_ = lexer.match_(); 210 | lexer.return_(Token::Number(match_)) 211 | }, 212 | } 213 | 214 | rule LongStringBracketLeft { 215 | '=' => |lexer| { 216 | lexer.state().long_string_opening_eqs += 1; 217 | lexer.continue_() 218 | }, 219 | 220 | '[' => |lexer| lexer.switch(LexerRule::LongString), 221 | } 222 | 223 | rule LongString { 224 | ']' => |lexer| { 225 | lexer.state().long_string_closing_eqs = 0; 226 | lexer.switch(LexerRule::LongStringBracketRight) 227 | }, 228 | 229 | _ => |lexer| lexer.continue_(), 230 | } 231 | 232 | rule LongStringBracketRight { 233 | '=' => |lexer| { 234 | lexer.state().long_string_closing_eqs += 1; 235 | lexer.continue_() 236 | }, 237 | 238 | ']' => |lexer| { 239 | let state = lexer.state(); 240 | let in_comment = state.in_comment; 241 | let left_eqs = state.long_string_opening_eqs; 242 | let right_eqs = state.long_string_closing_eqs; 243 | if left_eqs == right_eqs { 244 | if in_comment { 245 | lexer.switch(LexerRule::Init) 246 | } else { 247 | let match_ = &lexer.match_()[left_eqs + 2..lexer.match_().len() - right_eqs - 2]; 248 | lexer.switch_and_return(LexerRule::Init, Token::String(StringToken::Raw(match_))) 249 | } 250 | } else { 251 | lexer.state().long_string_closing_eqs = 0; 252 | lexer.continue_() 253 | } 254 | }, 255 | 256 | _ => |lexer| lexer.switch(LexerRule::LongString), 257 | } 258 | 259 | rule String { 260 | '"' => |lexer| { 261 | if lexer.state().short_string_delim == Quote::Double { 262 | let str = lexer.state().string_buf.clone(); 263 | lexer.switch_and_return(LexerRule::Init, Token::String(StringToken::Interpreted(str))) 264 | } else { 265 | lexer.state().string_buf.push('"'); 266 | lexer.continue_() 267 | } 268 | }, 269 | 270 | "'" => |lexer| { 271 | if lexer.state().short_string_delim == Quote::Single { 272 | let str = lexer.state().string_buf.clone(); 273 | lexer.switch_and_return(LexerRule::Init, Token::String(StringToken::Interpreted(str))) 274 | } else { 275 | lexer.state().string_buf.push('\''); 276 | lexer.continue_() 277 | } 278 | }, 279 | 280 | "\\a" => |lexer| { 281 | lexer.state().string_buf.push('\u{7}'); 282 | lexer.continue_() 283 | }, 284 | 285 | "\\b" => |lexer| { 286 | lexer.state().string_buf.push('\u{8}'); 287 | lexer.continue_() 288 | }, 289 | 290 | "\\f" => |lexer| { 291 | lexer.state().string_buf.push('\u{c}'); 292 | lexer.continue_() 293 | }, 294 | 295 | "\\n" => |lexer| { 296 | lexer.state().string_buf.push('\n'); 297 | lexer.continue_() 298 | }, 299 | 300 | "\\r" => |lexer| { 301 | lexer.state().string_buf.push('\r'); 302 | lexer.continue_() 303 | }, 304 | 305 | "\\t" => |lexer| { 306 | lexer.state().string_buf.push('\t'); 307 | lexer.continue_() 308 | }, 309 | 310 | "\\v" => |lexer| { 311 | lexer.state().string_buf.push('\u{b}'); 312 | lexer.continue_() 313 | }, 314 | 315 | "\\\\" => |lexer| { 316 | lexer.state().string_buf.push('\\'); 317 | lexer.continue_() 318 | }, 319 | 320 | "\\\"" => |lexer| { 321 | lexer.state().string_buf.push('"'); 322 | lexer.continue_() 323 | }, 324 | 325 | "\\'" => |lexer| { 326 | lexer.state().string_buf.push('\''); 327 | lexer.continue_() 328 | }, 329 | 330 | "\\\n" => |lexer| { 331 | lexer.state().string_buf.push('\n'); 332 | lexer.continue_() 333 | }, 334 | 335 | _ => |lexer| { 336 | let char = lexer.match_().chars().next_back().unwrap(); 337 | lexer.state().string_buf.push(char); 338 | lexer.continue_() 339 | }, 340 | } 341 | 342 | 
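    // Reached from Init after seeing "--". One character of lookahead decides
    // the kind of comment: "--[[", "--[=[", etc. start a long (bracketed)
    // comment, handled by the long string rules with `in_comment` set, while
    // anything else starts a single-line comment that runs until a newline.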
    rule EnterComment {
343 |         '[' => |lexer| {
344 |             match lexer.peek() {
345 |                 Some('[') | Some('=') => {
346 |                     lexer.state().long_string_opening_eqs = 0;
347 |                     lexer.state().in_comment = true;
348 |                     lexer.switch(LexerRule::LongStringBracketLeft)
349 |                 }
350 |                 _ =>
351 |                     lexer.switch(LexerRule::Comment),
352 |             }
353 |         },
354 | 
355 |         _ => |lexer| lexer.switch(LexerRule::Comment),
356 |     }
357 | 
358 |     rule Comment {
359 |         '\n' => |lexer| lexer.switch(LexerRule::Init),
360 | 
361 |         _ => |lexer| lexer.continue_(),
362 |     }
363 | }
364 | 
365 | #[allow(dead_code)]
366 | fn ignore_pos<A, E>(ret: Option<Result<(lexgen_util::Loc, A, lexgen_util::Loc), E>>) -> Option<Result<A, E>> {
367 |     ret.map(|res| res.map(|(_, a, _)| a))
368 | }
369 | 
370 | #[test]
371 | fn lex_lua_number() {
372 |     let mut lexer = Lexer::new("3 3.0 3.1416 314.16e-2 0.31416E1 0xff 0x56");
373 | 
374 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Number("3"))));
375 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Number("3.0"))));
376 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Number("3.1416"))));
377 |     assert_eq!(
378 |         ignore_pos(lexer.next()),
379 |         Some(Ok(Token::Number("314.16e-2")))
380 |     );
381 |     assert_eq!(
382 |         ignore_pos(lexer.next()),
383 |         Some(Ok(Token::Number("0.31416E1")))
384 |     );
385 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Number("0xff"))));
386 | 
387 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Number("0x56"))));
388 | }
389 | 
390 | #[test]
391 | fn lex_lua_string() {
392 |     let str = "
393 | \"test\"
394 | \"\\
395 | test'\\\"\"
396 | ";
397 |     let mut lexer = Lexer::new(str);
398 | 
399 |     assert_eq!(
400 |         ignore_pos(lexer.next()),
401 |         Some(Ok(Token::String(StringToken::Interpreted(
402 |             "test".to_owned()
403 |         ))))
404 |     );
405 |     assert_eq!(
406 |         ignore_pos(lexer.next()),
407 |         Some(Ok(Token::String(StringToken::Interpreted(
408 |             "\ntest'\"".to_owned()
409 |         ))))
410 |     );
411 | }
412 | 
413 | #[test]
414 | fn lex_lua_long_string() {
415 |     let mut lexer = Lexer::new("[[ ]] [=[test]=] [=[ ]]");
416 |     assert_eq!(
417 |         ignore_pos(lexer.next()),
418 |         Some(Ok(Token::String(StringToken::Raw(" "))))
419 |     );
420 |     assert_eq!(
421 |         ignore_pos(lexer.next()),
422 |         Some(Ok(Token::String(StringToken::Raw("test")))),
423 |     );
424 |     assert!(matches!(lexer.next(), Some(Err(_))));
425 | }
426 | 
427 | #[test]
428 | fn lex_lua_comment() {
429 |     let mut lexer = Lexer::new(
430 |         "-- test
431 | +
432 | --[[test
433 | test]]+
434 | --[===[
435 | ]=]===]
436 | +
437 | --[===[
438 | ]
439 | ]===]
440 | +
441 | ",
442 |     );
443 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
444 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
445 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
446 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
447 |     assert_eq!(ignore_pos(lexer.next()), None);
448 | }
449 | 
450 | #[test]
451 | fn lex_lua_var() {
452 |     let str = "ab ab1 ab_1_2 Aab";
453 |     let mut lexer = Lexer::new(str);
454 | 
455 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Var("ab"))));
456 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Var("ab1"))));
457 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Var("ab_1_2"))));
458 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Var("Aab"))));
459 | }
460 | 
461 | #[test]
462 | fn lex_lua_simple() {
463 |     let lexer = Lexer::new(
464 |         "+ - * / % ^ # == ~= <= >= < > = ( ) { } [ ] \
465 |          ; : , . .. ... and break do else elseif end \
466 |          false for function if in local nil not or repeat \
467 |          return then true until while n",
468 |     );
469 | 
470 |     let mut tokens: Vec<Token> = vec![];
471 |     for token in lexer {
472 |         tokens.push(token.unwrap().1);
473 |     }
474 | 
475 |     assert_eq!(
476 |         tokens,
477 |         vec![
478 |             Token::Plus,
479 |             Token::Minus,
480 |             Token::Star,
481 |             Token::Slash,
482 |             Token::Percent,
483 |             Token::Caret,
484 |             Token::Hash,
485 |             Token::EqEq,
486 |             Token::TildeEq,
487 |             Token::LtEq,
488 |             Token::GtEq,
489 |             Token::Lt,
490 |             Token::Gt,
491 |             Token::Eq,
492 |             Token::LParen,
493 |             Token::RParen,
494 |             Token::LBrace,
495 |             Token::RBrace,
496 |             Token::LBracket,
497 |             Token::RBracket,
498 |             Token::Semicolon,
499 |             Token::Colon,
500 |             Token::Comma,
501 |             Token::Dot,
502 |             Token::DotDot,
503 |             Token::DotDotDot,
504 |             Token::Keyword(Keyword::And),
505 |             Token::Keyword(Keyword::Break),
506 |             Token::Keyword(Keyword::Do),
507 |             Token::Keyword(Keyword::Else),
508 |             Token::Keyword(Keyword::ElseIf),
509 |             Token::Keyword(Keyword::End),
510 |             Token::Keyword(Keyword::False),
511 |             Token::Keyword(Keyword::For),
512 |             Token::Keyword(Keyword::Function),
513 |             Token::Keyword(Keyword::If),
514 |             Token::Keyword(Keyword::In),
515 |             Token::Keyword(Keyword::Local),
516 |             Token::Keyword(Keyword::Nil),
517 |             Token::Keyword(Keyword::Not),
518 |             Token::Keyword(Keyword::Or),
519 |             Token::Keyword(Keyword::Repeat),
520 |             Token::Keyword(Keyword::Return),
521 |             Token::Keyword(Keyword::Then),
522 |             Token::Keyword(Keyword::True),
523 |             Token::Keyword(Keyword::Until),
524 |             Token::Keyword(Keyword::While),
525 |             Token::Var("n"),
526 |         ]
527 |     );
528 | }
529 | 
530 | #[test]
531 | fn lex_lua_windows_line_ending() {
532 |     let mut lexer = Lexer::new("+\r\n+");
533 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
534 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
535 |     assert_eq!(ignore_pos(lexer.next()), None);
536 | }
537 | 
538 | #[test]
539 | fn lex_lua_files() {
540 |     let str = std::fs::read_to_string("tests/test_data").unwrap();
541 |     let lexer = Lexer::new(&str);
542 |     let mut i = 0;
543 |     for tok in lexer {
544 |         assert!(tok.is_ok());
545 |         i += 1;
546 |     }
547 |     println!("{} tokens", i);
548 | }
549 | 
--------------------------------------------------------------------------------
/crates/lexgen/tests/right_ctx.rs:
--------------------------------------------------------------------------------
1 | mod test_utils;
2 | 
3 | use lexgen::lexer;
4 | use lexgen_util::{LexerError, LexerErrorKind};
5 | use test_utils::{loc, next};
6 | 
7 | #[test]
8 | fn right_ctx_1() {
9 |     lexer! {
10 |         Lexer -> u32;
11 | 
12 |         'a' > 'a' = 1,
13 |     }
14 | 
15 |     let mut lexer = Lexer::new("aa");
16 |     assert_eq!(next(&mut lexer), Some(Ok(1)));
17 |     assert_eq!(
18 |         next(&mut lexer),
19 |         Some(Err(LexerError {
20 |             location: loc(0, 1, 1),
21 |             kind: LexerErrorKind::InvalidToken,
22 |         }))
23 |     );
24 | 
25 |     let mut lexer = Lexer::new("ab");
26 |     assert_eq!(
27 |         next(&mut lexer),
28 |         Some(Err(LexerError {
29 |             location: loc(0, 0, 0),
30 |             kind: LexerErrorKind::InvalidToken,
31 |         }))
32 |     );
33 | }
34 | 
35 | #[test]
36 | fn right_ctx_2() {
37 |     lexer!
{ 38 | Lexer -> u32; 39 | 40 | 'a' > _ = 1, 41 | } 42 | 43 | let mut lexer = Lexer::new("aa"); 44 | assert_eq!(next(&mut lexer), Some(Ok(1))); 45 | assert_eq!( 46 | next(&mut lexer), 47 | Some(Err(LexerError { 48 | location: loc(0, 1, 1), 49 | kind: LexerErrorKind::InvalidToken, 50 | })) 51 | ); 52 | 53 | let mut lexer = Lexer::new("ab"); 54 | assert_eq!(next(&mut lexer), Some(Ok(1))); 55 | assert_eq!( 56 | next(&mut lexer), 57 | Some(Err(LexerError { 58 | location: loc(0, 1, 1), 59 | kind: LexerErrorKind::InvalidToken, 60 | })) 61 | ); 62 | 63 | let mut lexer = Lexer::new("a"); 64 | assert_eq!( 65 | next(&mut lexer), 66 | Some(Err(LexerError { 67 | location: loc(0, 0, 0), 68 | kind: LexerErrorKind::InvalidToken, 69 | })) 70 | ); 71 | } 72 | 73 | #[test] 74 | fn right_ctx_3() { 75 | lexer! { 76 | Lexer -> u32; 77 | 78 | 'a' > $ = 1, 79 | } 80 | 81 | let mut lexer = Lexer::new("a"); 82 | assert_eq!(next(&mut lexer), Some(Ok(1))); 83 | assert_eq!(next(&mut lexer), None); 84 | 85 | let mut lexer = Lexer::new("ab"); 86 | assert_eq!( 87 | next(&mut lexer), 88 | Some(Err(LexerError { 89 | location: loc(0, 0, 0), 90 | kind: LexerErrorKind::InvalidToken, 91 | })) 92 | ); 93 | } 94 | 95 | #[test] 96 | fn right_ctx_4() { 97 | lexer! { 98 | Lexer -> u32; 99 | 100 | 'a' > 'a' = 1, 101 | 'a' > $ = 2, 102 | } 103 | 104 | let mut lexer = Lexer::new("a"); 105 | assert_eq!(next(&mut lexer), Some(Ok(2))); 106 | assert_eq!(next(&mut lexer), None); 107 | 108 | let mut lexer = Lexer::new("aa"); 109 | assert_eq!(next(&mut lexer), Some(Ok(1))); 110 | assert_eq!(next(&mut lexer), Some(Ok(2))); 111 | assert_eq!(next(&mut lexer), None); 112 | } 113 | 114 | #[test] 115 | fn rust_single_line_comment() { 116 | lexer! { 117 | Lexer -> &'input str; 118 | 119 | rule Init { 120 | $$ascii_whitespace, 121 | 122 | "//" => |lexer| lexer.switch(LexerRule::SinglelineComment), 123 | } 124 | 125 | rule SinglelineComment { 126 | (_ # '\n')* > ('\n' | $) => |lexer| { 127 | let comment = lexer.match_(); 128 | lexer.switch_and_return(LexerRule::Init, comment) 129 | }, 130 | } 131 | } 132 | 133 | // Terminated at the end of input (no newline) 134 | let input = "// / "; 135 | let mut lexer = Lexer::new(input); 136 | assert_eq!(next(&mut lexer), Some(Ok(input))); 137 | assert_eq!(next(&mut lexer), None); 138 | 139 | // Terminated with newlines 140 | let input = "// / \n"; 141 | let mut lexer = Lexer::new(input); 142 | assert_eq!(next(&mut lexer), Some(Ok("// / "))); 143 | assert_eq!(next(&mut lexer), None); 144 | 145 | // Empty comment, terminated with eof 146 | let input = "//"; 147 | let mut lexer = Lexer::new(input); 148 | assert_eq!(next(&mut lexer), Some(Ok("//"))); 149 | assert_eq!(next(&mut lexer), None); 150 | 151 | // Empty comment, terminated with eol 152 | let input = "//\n"; 153 | let mut lexer = Lexer::new(input); 154 | assert_eq!(next(&mut lexer), Some(Ok("//"))); 155 | assert_eq!(next(&mut lexer), None); 156 | } 157 | 158 | #[test] 159 | fn rust_float() { 160 | #[derive(Debug, PartialEq, Eq)] 161 | enum Token<'input> { 162 | Float(&'input str), 163 | Int(&'input str), 164 | Range, 165 | } 166 | 167 | lexer! { 168 | Lexer -> Token<'input>; 169 | 170 | ['0'-'9']+ '.' > (_ # ('.' | '_' | $$XID_Start) | $) => |lexer| { 171 | let match_ = lexer.match_(); 172 | lexer.return_(Token::Float(match_)) 173 | }, 174 | 175 | ['0'-'9']+ => |lexer| { 176 | let match_ = lexer.match_(); 177 | lexer.return_(Token::Int(match_)) 178 | }, 179 | 180 | ".." 
= Token::Range,
181 |     }
182 | 
183 |     let mut lexer = Lexer::new("1.");
184 |     assert_eq!(next(&mut lexer), Some(Ok(Token::Float("1."))));
185 |     assert_eq!(next(&mut lexer), None);
186 | 
187 |     let mut lexer = Lexer::new("1..");
188 |     assert_eq!(next(&mut lexer), Some(Ok(Token::Int("1"))));
189 |     assert_eq!(next(&mut lexer), Some(Ok(Token::Range)));
190 |     assert_eq!(next(&mut lexer), None);
191 | }
192 | 
193 | #[test]
194 | fn ligature_shaping() {
195 |     #[derive(Debug, PartialEq, Eq)]
196 |     enum Token<'input> {
197 |         Lig(&'input str),
198 |         NotLig(&'input str),
199 |     }
200 | 
201 |     lexer! {
202 |         Lexer -> Token<'input>;
203 | 
204 |         "---" > ((_ # '-') | $) => |lexer| {
205 |             let match_ = lexer.match_();
206 |             lexer.return_(Token::Lig(match_))
207 |         },
208 | 
209 |         _+ => |lexer| {
210 |             let match_ = lexer.match_();
211 |             lexer.return_(Token::NotLig(match_))
212 |         },
213 |     }
214 | 
215 |     let mut lexer = Lexer::new("--");
216 |     assert_eq!(next(&mut lexer), Some(Ok(Token::NotLig("--"))));
217 |     assert_eq!(next(&mut lexer), None);
218 | 
219 |     let mut lexer = Lexer::new("---");
220 |     assert_eq!(next(&mut lexer), Some(Ok(Token::Lig("---"))));
221 |     assert_eq!(next(&mut lexer), None);
222 | 
223 |     let mut lexer = Lexer::new("----");
224 |     assert_eq!(next(&mut lexer), Some(Ok(Token::NotLig("----"))));
225 |     assert_eq!(next(&mut lexer), None);
226 | }
227 | 
--------------------------------------------------------------------------------
/crates/lexgen/tests/test_data:
--------------------------------------------------------------------------------
1 | -- This file contains a concatenation of Lua files in the Lua 5.1 source
2 | -- distribution, used to test the Lua lexer (and lexgen) and as a runtime
3 | -- benchmark.
4 | 
5 | -- Copyright (C) 1994-2012 Lua.org, PUC-Rio.
6 | --
7 | -- Permission is hereby granted, free of charge, to any person obtaining a copy
8 | -- of this software and associated documentation files (the "Software"), to deal
9 | -- in the Software without restriction, including without limitation the rights
10 | -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | -- copies of the Software, and to permit persons to whom the Software is
12 | -- furnished to do so, subject to the following conditions:
13 | --
14 | -- The above copyright notice and this permission notice shall be included in
15 | -- all copies or substantial portions of the Software.
16 | --
17 | -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 | -- THE SOFTWARE.
24 | 25 | delta=1e-6 -- tolerance 26 | 27 | function bisect(f,a,b,fa,fb) 28 | local c=(a+b)/2 29 | io.write(n," c=",c," a=",a," b=",b,"\n") 30 | if c==a or c==b or math.abs(a-b) posted to lua-l 181 | -- modified to use ANSI terminal escape sequences 182 | -- modified to use for instead of while 183 | 184 | local write=io.write 185 | 186 | ALIVE="¥" DEAD="þ" 187 | ALIVE="O" DEAD="-" 188 | 189 | function delay() -- NOTE: SYSTEM-DEPENDENT, adjust as necessary 190 | for i=1,10000 do end 191 | -- local i=os.clock()+1 while(os.clock() 0 do 220 | local xm1,x,xp1,xi=self.w-1,self.w,1,self.w 221 | while xi > 0 do 222 | local sum = self[ym1][xm1] + self[ym1][x] + self[ym1][xp1] + 223 | self[y][xm1] + self[y][xp1] + 224 | self[yp1][xm1] + self[yp1][x] + self[yp1][xp1] 225 | next[y][x] = ((sum==2) and self[y][x]) or ((sum==3) and 1) or 0 226 | xm1,x,xp1,xi = x,xp1,xp1+1,xi-1 227 | end 228 | ym1,y,yp1,yi = y,yp1,yp1+1,yi-1 229 | end 230 | end 231 | 232 | -- output the array to screen 233 | function _CELLS:draw() 234 | local out="" -- accumulate to reduce flicker 235 | for y=1,self.h do 236 | for x=1,self.w do 237 | out=out..(((self[y][x]>0) and ALIVE) or DEAD) 238 | end 239 | out=out.."\n" 240 | end 241 | write(out) 242 | end 243 | 244 | -- constructor 245 | function CELLS(w,h) 246 | local c = ARRAY2D(w,h) 247 | c.spawn = _CELLS.spawn 248 | c.evolve = _CELLS.evolve 249 | c.draw = _CELLS.draw 250 | return c 251 | end 252 | 253 | -- 254 | -- shapes suitable for use with spawn() above 255 | -- 256 | HEART = { 1,0,1,1,0,1,1,1,1; w=3,h=3 } 257 | GLIDER = { 0,0,1,1,0,1,0,1,1; w=3,h=3 } 258 | EXPLODE = { 0,1,0,1,1,1,1,0,1,0,1,0; w=3,h=4 } 259 | FISH = { 0,1,1,1,1,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0; w=5,h=4 } 260 | BUTTERFLY = { 1,0,0,0,1,0,1,1,1,0,1,0,0,0,1,1,0,1,0,1,1,0,0,0,1; w=5,h=5 } 261 | 262 | -- the main routine 263 | function LIFE(w,h) 264 | -- create two arrays 265 | local thisgen = CELLS(w,h) 266 | local nextgen = CELLS(w,h) 267 | 268 | -- create some life 269 | -- about 1000 generations of fun, then a glider steady-state 270 | thisgen:spawn(GLIDER,5,4) 271 | thisgen:spawn(EXPLODE,25,10) 272 | thisgen:spawn(FISH,4,12) 273 | 274 | -- run until break 275 | local gen=1 276 | write("\027[2J") -- ANSI clear screen 277 | while 1 do 278 | thisgen:evolve(nextgen) 279 | thisgen,nextgen = nextgen,thisgen 280 | write("\027[H") -- ANSI home cursor 281 | thisgen:draw() 282 | write("Life - generation ",gen,"\n") 283 | gen=gen+1 284 | if gen>2000 then break end 285 | --delay() -- no delay 286 | end 287 | end 288 | 289 | LIFE(40,20) 290 | -- bare-bones luac in Lua 291 | -- usage: lua luac.lua file.lua 292 | 293 | assert(arg[1]~=nil and arg[2]==nil,"usage: lua luac.lua file.lua") 294 | f=assert(io.open("luac.out","wb")) 295 | assert(f:write(string.dump(assert(loadfile(arg[1]))))) 296 | assert(f:close()) 297 | -- an implementation of printf 298 | 299 | function printf(...) 
300 | io.write(string.format(...)) 301 | end 302 | 303 | printf("Hello %s from %s on %s\n",os.getenv"USER" or "there",_VERSION,os.date()) 304 | -- make global variables readonly 305 | 306 | local f=function (t,i) error("cannot redefine global variable `"..i.."'",2) end 307 | local g={} 308 | local G=getfenv() 309 | setmetatable(g,{__index=G,__newindex=f}) 310 | setfenv(1,g) 311 | 312 | -- an example 313 | rawset(g,"x",3) 314 | x=2 315 | y=1 -- cannot redefine `y' 316 | -- the sieve of of Eratosthenes programmed with coroutines 317 | -- typical usage: lua -e N=1000 sieve.lua | column 318 | 319 | -- generate all the numbers from 2 to n 320 | function gen (n) 321 | return coroutine.wrap(function () 322 | for i=2,n do coroutine.yield(i) end 323 | end) 324 | end 325 | 326 | -- filter the numbers generated by `g', removing multiples of `p' 327 | function filter (p, g) 328 | return coroutine.wrap(function () 329 | while 1 do 330 | local n = g() 331 | if n == nil then return end 332 | if math.mod(n, p) ~= 0 then coroutine.yield(n) end 333 | end 334 | end) 335 | end 336 | 337 | N=N or 1000 -- from command line 338 | x = gen(N) -- generate primes up to N 339 | while 1 do 340 | local n = x() -- pick a number until done 341 | if n == nil then break end 342 | print(n) -- must be a prime number 343 | x = filter(n, x) -- now remove its multiples 344 | end 345 | -- two implementations of a sort function 346 | -- this is an example only. Lua has now a built-in function "sort" 347 | 348 | -- extracted from Programming Pearls, page 110 349 | function qsort(x,l,u,f) 350 | if ly end) 402 | show("after reverse selection sort",x) 403 | qsort(x,1,n,function (x,y) return x>> ",string.rep(" ",level)) 431 | if t~=nil and t.currentline>=0 then io.write(t.short_src,":",t.currentline," ") end 432 | t=debug.getinfo(2) 433 | if event=="call" then 434 | level=level+1 435 | else 436 | level=level-1 if level<0 then level=0 end 437 | end 438 | if t.what=="main" then 439 | if event=="call" then 440 | io.write("begin ",t.short_src) 441 | else 442 | io.write("end ",t.short_src) 443 | end 444 | elseif t.what=="Lua" then 445 | -- table.foreach(t,print) 446 | io.write(event," ",t.name or "(Lua)"," <",t.linedefined,":",t.short_src,">") 447 | else 448 | io.write(event," ",t.name or "(C)"," [",t.what,"] ") 449 | end 450 | io.write("\n") 451 | end 452 | 453 | debug.sethook(hook,"cr") 454 | level=0 455 | -- trace assigments to global variables 456 | 457 | do 458 | -- a tostring that quotes strings. note the use of the original tostring. 
local _tostring=tostring
460 | local tostring=function(a)
461 | if type(a)=="string" then
462 | return string.format("%q",a)
463 | else
464 | return _tostring(a)
465 | end
466 | end
467 | 
468 | local log=function (name,old,new)
469 | local t=debug.getinfo(3,"Sl")
470 | local line=t.currentline
471 | io.write(t.short_src)
472 | if line>=0 then io.write(":",line) end
473 | io.write(": ",name," is now ",tostring(new)," (was ",tostring(old),")","\n")
474 | end
475 | 
476 | local g={}
477 | local set=function (t,name,value)
478 | log(name,g[name],value)
479 | g[name]=value
480 | end
481 | setmetatable(getfenv(),{__index=g,__newindex=set})
482 | end
483 | 
484 | -- an example
485 | 
486 | a=1
487 | b=2
488 | a=10
489 | b=20
490 | b=nil
491 | b=200
492 | print(a,b,c)
493 | -- hex dump
494 | -- usage: lua xd.lua < file
495 | 
496 | local offset=0
497 | while true do
498 | local s=io.read(16)
499 | if s==nil then return end
500 | io.write(string.format("%08X ",offset))
501 | string.gsub(s,"(.)",
502 | function (c) io.write(string.format("%02X ",string.byte(c))) end)
503 | io.write(string.rep(" ",3*(16-string.len(s))))
504 | io.write(" ",string.gsub(s,"%c","."),"\n")
505 | offset=offset+16
506 | end
507 | 
--------------------------------------------------------------------------------
/crates/lexgen/tests/test_utils.rs:
--------------------------------------------------------------------------------
1 | use lexgen_util::Loc;
2 | 
3 | pub fn ignore_pos<A, E>(ret: Option<Result<(Loc, A, Loc), E>>) -> Option<Result<A, E>> {
4 |     ret.map(|res| res.map(|(_, a, _)| a))
5 | }
6 | 
7 | pub fn next<A, E>(iter: &mut dyn Iterator<Item = Result<(Loc, A, Loc), E>>) -> Option<Result<A, E>> {
8 |     ignore_pos(iter.next())
9 | }
10 | 
11 | pub fn loc(line: u32, col: u32, byte_idx: usize) -> Loc {
12 |     Loc {
13 |         line,
14 |         col,
15 |         byte_idx,
16 |     }
17 | }
18 | 
--------------------------------------------------------------------------------
/crates/lexgen_lalrpop_example/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "lexgen_lalrpop_example"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | [dependencies]
7 | lalrpop-util = "0.19.9"
8 | lexgen = { path = "../lexgen" }
9 | lexgen_util = { path = "../lexgen_util" }
10 | 
11 | [build-dependencies]
12 | lalrpop = "0.19.9"
13 | 
--------------------------------------------------------------------------------
/crates/lexgen_lalrpop_example/build.rs:
--------------------------------------------------------------------------------
1 | fn main() {
2 |     lalrpop::process_root().unwrap();
3 | }
4 | 
--------------------------------------------------------------------------------
/crates/lexgen_lalrpop_example/src/interpolation.lalrpop:
--------------------------------------------------------------------------------
1 | use super::{
2 |     ast::{Expression, StringFragment},
3 |     lexer::{LexerError, Loc, Token},
4 | };
5 | 
6 | grammar<'input>;
7 | 
8 | pub Expression: Expression<'input> = {
9 |     Term,
10 |     <lhs:Expression> "+" <rhs:Term> => Expression::Concat(Box::new(lhs), Box::new(rhs)),
11 | }
12 | 
13 | Term: Expression<'input> = {
14 |     "(" <Expression> ")",
15 |     StringStart <StringInner*> StringEnd => Expression::String(<>),
16 | }
17 | 
18 | StringInner: StringFragment<'input> = {
19 |     StringFragment => StringFragment::String(<>),
20 |     InterpolationStart <Expression> InterpolationEnd => StringFragment::Expression(<>),
21 | }
22 | 
23 | 
24 | extern {
25 |     type Location = Loc;
26 |     type Error = LexerError;
27 | 
28 |     enum Token<'input> {
29 |         "+" => Token::Plus,
30 |         "(" => Token::LParen,
31 |         ")" => Token::RParen,
32 |         StringStart => Token::StringStart,
33 |         StringFragment => Token::StringFragment(<&'input str>),
34 |         StringEnd => Token::StringEnd,
35 |         InterpolationStart => Token::InterpolationStart,
36 |         InterpolationEnd => Token::InterpolationEnd,
37 |     }
38 | }
39 | 
40 | 
--------------------------------------------------------------------------------
/crates/lexgen_lalrpop_example/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! This example shows how to use the generated lexer with [lalrpop](https://docs.rs/lalrpop/latest/lalrpop/)
2 | //! by implementing an evaluator of an example language.
3 | //!
4 | //! The language has three types of expressions. The first type is a string expression, which
5 | //! starts and ends with `"`. The other types are string concatenation, denoted by `+`, and
6 | //! parenthesized expressions. Inside a string expression you can write characters as usual, and
7 | //! can also interpolate another expression by surrounding the expression with `\(` and `)`.
8 | 
9 | use lalrpop_util::lalrpop_mod;
10 | 
11 | pub mod ast {
12 |     #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
13 |     pub enum StringFragment<'input> {
14 |         /// Represents a sequence of normal characters, or a string consisting of a single
15 |         /// escaped character, in a string literal.
16 |         String(&'input str),
17 |         /// Represents an interpolated expression in a string literal.
18 |         Expression(Expression<'input>),
19 |     }
20 | 
21 |     #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
22 |     pub enum Expression<'input> {
23 |         /// Represents a string literal.
24 |         String(Vec<StringFragment<'input>>),
25 |         /// Represents `lhs + rhs`. It's possible to desugar this into
26 |         /// `"\(lhs)\(rhs)"` instead of having this variant.
27 |         Concat(Box<Expression<'input>>, Box<Expression<'input>>),
28 |     }
29 | 
30 |     impl StringFragment<'_> {
31 |         fn eval_to(&self, w: &mut impl std::fmt::Write) -> std::fmt::Result {
32 |             match self {
33 |                 StringFragment::String(s) => w.write_str(s),
34 |                 StringFragment::Expression(e) => e.eval_to(w),
35 |             }
36 |         }
37 |     }
38 | 
39 |     impl Expression<'_> {
40 |         fn eval_to(&self, w: &mut impl std::fmt::Write) -> std::fmt::Result {
41 |             match self {
42 |                 Expression::String(v) => v.iter().try_for_each(|f| f.eval_to(w)),
43 |                 Expression::Concat(l, r) => [l, r].iter().try_for_each(|e| e.eval_to(w)),
44 |             }
45 |         }
46 | 
47 |         pub fn eval(&self) -> String {
48 |             let mut ret = String::new();
49 |             self.eval_to(&mut ret)
50 |                 .expect("Format into String shouldn't fail");
51 |             ret
52 |         }
53 |     }
54 | }
55 | 
56 | #[allow(clippy::manual_range_contains)]
57 | pub mod lexer {
58 |     use lexgen::lexer;
59 |     pub type LexerError = lexgen_util::LexerError<String>;
60 |     pub type Loc = lexgen_util::Loc;
61 | 
62 |     #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
63 |     pub enum Token<'input> {
64 |         /// Represents `+` (outside of string literals).
65 |         Plus,
66 |         /// Represents `(` of a parenthesized expression (outside of string literals).
67 |         LParen,
68 |         /// Represents `)` of a parenthesized expression (outside of string literals).
69 |         RParen,
70 |         /// Represents `"` at the beginning of a string literal.
71 |         StringStart,
72 |         /// Represents a non-interpolated part of a string literal:
73 |         /// either a sequence of characters exactly the same as a part of the input,
74 |         /// or an un-escaped character from the input.
75 |         StringFragment(&'input str),
76 |         /// Represents `"` at the end of a string literal.
77 |         StringEnd,
78 |         /// Represents `\(` that starts interpolation in a string literal.
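        /// For example, lexing `"a\("b")"` produces this token for the `\(` part.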
79 |         InterpolationStart,
80 |         /// Represents `)` that ends interpolation in a string literal.
81 |         InterpolationEnd,
82 |     }
83 | 
84 |     pub struct LexerState {
85 |         /// For each interpolation, we want to lex the `)` that ends the interpolation differently.
86 |         /// To do so, we keep track of the balance of parentheses in the expression, and treat
87 |         /// the first `)` that over-closes the expression as the interpolation end marker.
88 |         /// Since we can nest string interpolations like `"\("\("a")")"`, we use a stack to keep
89 |         /// this balance.
90 |         paren_nest: Vec<usize>,
91 |     }
92 | 
93 |     impl Default for LexerState {
94 |         fn default() -> Self {
95 |             Self {
96 |                 paren_nest: vec![0],
97 |             }
98 |         }
99 |     }
100 | 
101 |     lexer! {
102 |         pub Lexer(LexerState) -> Token<'input>;
103 |         type Error = String;
104 | 
105 |         let ws = [' ' '\t' '\n'] | "\r\n";
106 | 
107 |         rule Init {
108 |             $ws,
109 |             '+' = Token::Plus,
110 |             '"' => |lexer| lexer.switch_and_return(LexerRule::InString, Token::StringStart),
111 |             '(' =? |lexer| {
112 |                 match lexer.state().paren_nest.last_mut() {
113 |                     Some(x) => {
114 |                         *x += 1;
115 |                         lexer.return_(Ok(Token::LParen))
116 |                     },
117 |                     None => {
118 |                         lexer.return_(Err("Invalid state, maybe already failed?".to_string()))
119 |                     }
120 |                 }
121 |             },
122 |             ')' =? |lexer| {
123 |                 match lexer.state().paren_nest.last_mut() {
124 |                     Some(0) => {
125 |                         lexer.state().paren_nest.pop();
126 |                         if lexer.state().paren_nest.is_empty() {
127 |                             lexer.return_(Err("Too many close parens".to_string()))
128 |                         } else {
129 |                             lexer.switch_and_return(LexerRule::InString, Ok(Token::InterpolationEnd))
130 |                         }
131 |                     },
132 |                     Some(x) => {
133 |                         *x -= 1;
134 |                         lexer.return_(Ok(Token::RParen))
135 |                     },
136 |                     None => {
137 |                         lexer.return_(Err("Invalid state, maybe already failed?".to_string()))
138 |                     }
139 |                 }
140 |             },
141 |         }
142 | 
143 |         rule InString {
144 |             "\\\"" = Token::StringFragment("\""),
145 |             "\\n" = Token::StringFragment("\n"),
146 |             "\\r" = Token::StringFragment("\r"),
147 |             "\\t" = Token::StringFragment("\t"),
148 |             "\\\\" = Token::StringFragment("\\"),
149 |             '"' => |lexer| lexer.switch_and_return(LexerRule::Init, Token::StringEnd),
150 |             "\\(" => |lexer| {
151 |                 lexer.state().paren_nest.push(0);
152 |                 lexer.switch_and_return(LexerRule::Init, Token::InterpolationStart)
153 |             },
154 |             (_ # ['\\' '"'])+ => |lexer| lexer.return_(Token::StringFragment(lexer.match_())),
155 |         }
156 |     }
157 | }
158 | 
159 | lalrpop_mod!(#[allow(unused_imports, clippy::all)] pub parser, "/interpolation.rs");
160 | 
161 | #[cfg(test)]
162 | mod test {
163 |     use super::{
164 |         ast::Expression,
165 |         lexer::{Lexer, LexerError, Loc, Token},
166 |         parser::ExpressionParser,
167 |     };
168 | 
169 |     type Result<'input, T> =
170 |         std::result::Result<T, lalrpop_util::ParseError<Loc, Token<'input>, LexerError>>;
171 | 
172 |     fn parse(code: &str) -> Result<Expression> {
173 |         let lexer = Lexer::new(code);
174 |         ExpressionParser::new().parse(lexer)
175 |     }
176 | 
177 |     fn parse_and_eval(code: &str) -> Result<String> {
178 |         parse(code).map(|e| e.eval())
179 |     }
180 | 
181 |     #[test]
182 |     fn test_basic() -> Result<'static, ()> {
183 |         assert_eq!(parse_and_eval(r#""a" + "b" + "c""#)?, "abc");
184 |         assert_eq!(parse_and_eval(r#""\n\t\\(" + "b" + "c""#)?, "\n\t\\(bc");
185 |         Ok(())
186 |     }
187 | 
188 |     #[test]
189 |     fn test_invalid() {
190 |         assert!(parse(r#""a" +"#).is_err());
191 |         assert!(parse(r#""a" + ""#).is_err());
192 |         assert!(parse(r#"("a" + "b" "#).is_err());
193 |         assert!(parse(r#""a\(""#).is_err());
194 |         assert!(parse(r#""a\(")""#).is_err());
195 |         assert!(parse(r#""a\())""#).is_err());
196 |         assert!(parse(r#"("a\())""#).is_err());
197 |         assert!(parse(r#")"#).is_err());
198 |     }
199 |
200 |     #[test]
201 |     fn test_associativity() -> Result<'static, ()> {
202 |         assert_eq!(
203 |             parse(r#""a" + "b" + "c" + "d""#)?,
204 |             parse(r#"(("a" + "b") + "c") + "d""#)?
205 |         );
206 |         Ok(())
207 |     }
208 |
209 |     #[test]
210 |     fn test_interpolation() -> Result<'static, ()> {
211 |         assert_eq!(parse_and_eval(r#""ab\("c" + "d")""#)?, "abcd");
212 |         assert_eq!(parse_and_eval(r#""ab\(("c") + ("d"))""#)?, "abcd");
213 |         assert_eq!(parse_and_eval(r#""ab\(("c\("d")"))""#)?, "abcd");
214 |         Ok(())
215 |     }
216 | }
217 |
--------------------------------------------------------------------------------
/crates/lexgen_util/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "lexgen_util"
3 | version = "0.16.0"
4 | authors = ["Ömer Sinan Ağacan <omeragacan@gmail.com>"]
5 | description = "Runtime library for lexers generated by lexgen"
6 | edition = "2021"
7 | license = "MIT"
8 | homepage = "https://github.com/osa1/lexgen"
9 | repository = "https://github.com/osa1/lexgen"
10 |
11 | [dependencies]
12 | unicode-width = "0.2.0"
--------------------------------------------------------------------------------
/crates/lexgen_util/README.md:
--------------------------------------------------------------------------------
1 | # lexgen_util: Runtime library for lexers generated by lexgen
2 |
3 | This library is used by lexgen-generated lexers. See the [lexgen crate] or
4 | the [lexgen homepage].
5 |
6 | [lexgen crate]: https://crates.io/crates/lexgen
7 | [lexgen homepage]: https://github.com/osa1/lexgen
--------------------------------------------------------------------------------
/crates/lexgen_util/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![allow(clippy::should_implement_trait, clippy::type_complexity)]
2 |
3 | use std::iter::Peekable;
4 | use std::str::Chars;
5 |
6 | use unicode_width::UnicodeWidthChar;
7 |
8 | #[derive(Debug, Clone, PartialEq, Eq)]
9 | pub struct LexerError<E> {
10 |     pub location: Loc,
11 |     pub kind: LexerErrorKind<E>,
12 | }
13 |
14 | #[derive(Debug, Clone, PartialEq, Eq)]
15 | pub enum LexerErrorKind<E> {
16 |     /// A lexer error, returned by lexgen.
17 |     InvalidToken,
18 |
19 |     /// A custom error, returned by a semantic action.
20 |     Custom(E),
21 | }
22 |
23 | /// A location in an input.
24 | #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
25 | pub struct Loc {
26 |     /// Zero-based line number in the input.
27 |     pub line: u32,
28 |
29 |     /// Zero-based column of this location in its line (in terms of display width).
30 |     pub col: u32,
31 |
32 |     /// Zero-based UTF-8 byte index of this location in the input.
33 |     pub byte_idx: usize,
34 | }
35 |
36 | impl Loc {
37 |     const ZERO: Loc = Loc {
38 |         line: 0,
39 |         col: 0,
40 |         byte_idx: 0,
41 |     };
42 | }
43 |
44 | /// **Do not use**
45 | // Possible outcomes of a semantic action
46 | pub enum SemanticActionResult<T> {
47 |     // Semantic action did not return a token, continue with lexing
48 |     Continue,
49 |     // Semantic action returned a token, return it
50 |     Return(T),
51 | }
52 |
53 | impl<T> SemanticActionResult<T> {
54 |     pub fn map_token<T1, F>(self, f: F) -> SemanticActionResult<T1>
55 |     where
56 |         F: Fn(T) -> T1,
57 |     {
58 |         match self {
59 |             SemanticActionResult::Continue => SemanticActionResult::Continue,
60 |             SemanticActionResult::Return(t) => SemanticActionResult::Return(f(t)),
61 |         }
62 |     }
63 | }
64 |
65 | /// Common parts in lexers generated by lexgen.
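///
/// `Iter` is the input character iterator, `Token` the token type, `State` the user state, and
/// `Error` the user error type. `Wrapper` is the lexer struct emitted by the `lexer!` macro,
/// which embeds this type; semantic action functions take the wrapper (see
/// `set_accepting_state` below).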
66 | ///
67 | /// **Fields are used by lexgen-generated code and should not be used directly.**
68 | #[derive(Debug, Clone)]
69 | pub struct Lexer<'input, Iter: Iterator<Item = char> + Clone, Token, State, Error, Wrapper> {
70 |     // Current lexer state
71 |     pub __state: usize,
72 |
73 |     // Set after end-of-input is handled by a rule, or by default in `Init` rule
74 |     pub __done: bool,
75 |
76 |     // Which lexer state to switch to on successful match
77 |     pub __initial_state: usize,
78 |
79 |     user_state: State,
80 |
81 |     // User-provided input string. Does not change after initialization.
82 |     input: &'input str,
83 |
84 |     // Start location of `__iter`. We update this as we backtrack and update `__iter`.
85 |     iter_loc: Loc,
86 |
87 |     // Character iterator. `Peekable` is used in the handler's `peek` method. Note that we can't
88 |     // use a byte index from this iterator directly, as we re-initialize this field when backtracking.
89 |     // Add `iter_loc.byte_idx` to the byte index before using. When resetting, update `iter_loc`.
90 |     pub __iter: Peekable<Iter>,
91 |
92 |     // Start of the current match
93 |     current_match_start: Loc,
94 |
95 |     // End of the current match
96 |     current_match_end: Loc,
97 |
98 |     // If we skipped an accepting state, this holds:
99 |     //
100 |     // - Start and end (exclusive) locations of the skipped match
101 |     // - The character iterator, cloned at the end of the skipped match
102 |     // - Semantic action of the skipped match (a function name)
103 |     last_match: Option<(
104 |         Loc,
105 |         Peekable<Iter>,
106 |         for<'lexer> fn(&'lexer mut Wrapper) -> SemanticActionResult<Result<Token, Error>>,
107 |         Loc,
108 |     )>,
109 | }
110 |
111 | impl<I: Iterator<Item = char> + Clone, T, S: Default, E, W> Lexer<'static, I, T, S, E, W> {
112 |     pub fn new_from_iter(iter: I) -> Self {
113 |         Self::new_from_iter_with_state(iter, Default::default())
114 |     }
115 | }
116 |
117 | impl<I: Iterator<Item = char> + Clone, T, S, E, W> Lexer<'static, I, T, S, E, W> {
118 |     pub fn new_from_iter_with_state(iter: I, state: S) -> Self {
119 |         Self {
120 |             __state: 0,
121 |             __done: false,
122 |             __initial_state: 0,
123 |             user_state: state,
124 |             input: "",
125 |             iter_loc: Loc::ZERO,
126 |             __iter: iter.peekable(),
127 |             current_match_start: Loc::ZERO,
128 |             current_match_end: Loc::ZERO,
129 |             last_match: None,
130 |         }
131 |     }
132 | }
133 |
134 | impl<'input, T, S: Default, E, W> Lexer<'input, Chars<'input>, T, S, E, W> {
135 |     pub fn new(input: &'input str) -> Self {
136 |         Self::new_with_state(input, Default::default())
137 |     }
138 | }
139 |
140 | impl<'input, T, S, E, W> Lexer<'input, Chars<'input>, T, S, E, W> {
141 |     pub fn new_with_state(input: &'input str, state: S) -> Self {
142 |         Self {
143 |             __state: 0,
144 |             __done: false,
145 |             __initial_state: 0,
146 |             user_state: state,
147 |             input,
148 |             iter_loc: Loc::ZERO,
149 |             __iter: input.chars().peekable(),
150 |             current_match_start: Loc::ZERO,
151 |             current_match_end: Loc::ZERO,
152 |             last_match: None,
153 |         }
154 |     }
155 | }
156 |
157 | impl<'input, I: Iterator<Item = char> + Clone, T, S, E, W> Lexer<'input, I, T, S, E, W> {
158 |     // Read the next character
159 |     pub fn next(&mut self) -> Option<char> {
160 |         match self.__iter.next() {
161 |             None => None,
162 |             Some(char) => {
163 |                 self.current_match_end.byte_idx += char.len_utf8();
164 |                 if char == '\n' {
165 |                     self.current_match_end.line += 1;
166 |                     self.current_match_end.col = 0;
167 |                 } else if char == '\t' {
168 |                     self.current_match_end.col += 4; // TODO: Make this configurable?
169 |                 } else {
170 |                     self.current_match_end.col += UnicodeWidthChar::width(char).unwrap_or(1) as u32; // `None` width counts as 1
171 |                 }
172 |                 Some(char)
173 |             }
174 |         }
175 |     }
176 |
177 |     pub fn peek(&mut self) -> Option<char> {
178 |         self.__iter.peek().copied()
179 |     }
180 |
181 |     // On success, returns the semantic action function for the last match
182 |     pub fn backtrack(
183 |         &mut self,
184 |     ) -> Result<for<'lexer> fn(&'lexer mut W) -> SemanticActionResult<Result<T, E>>, LexerError<E>>
185 |     {
186 |         match self.last_match.take() {
187 |             None => {
188 |                 self.__state = 0;
189 |                 Err(LexerError {
190 |                     location: self.current_match_start,
191 |                     kind: LexerErrorKind::InvalidToken,
192 |                 })
193 |             }
194 |             Some((match_start, iter, semantic_action, match_end)) => {
195 |                 self.__done = false;
196 |                 self.current_match_start = match_start;
197 |                 self.current_match_end = match_end;
198 |                 self.__iter = iter;
199 |                 self.iter_loc = match_end;
200 |                 Ok(semantic_action)
201 |             }
202 |         }
203 |     }
204 |
205 |     pub fn reset_accepting_state(&mut self) {
206 |         self.last_match = None;
207 |     }
208 |
209 |     pub fn set_accepting_state(
210 |         &mut self,
211 |         semantic_action_fn: for<'lexer> fn(&'lexer mut W) -> SemanticActionResult<Result<T, E>>,
212 |     ) {
213 |         self.last_match = Some((
214 |             self.current_match_start,
215 |             self.__iter.clone(),
216 |             semantic_action_fn,
217 |             self.current_match_end,
218 |         ));
219 |     }
220 |
221 |     pub fn reset_match(&mut self) {
222 |         self.current_match_start = self.current_match_end;
223 |     }
224 |
225 |     pub fn match_(&self) -> &'input str {
226 |         &self.input[self.current_match_start.byte_idx..self.current_match_end.byte_idx]
227 |     }
228 |
229 |     pub fn match_loc(&self) -> (Loc, Loc) {
230 |         (self.current_match_start, self.current_match_end)
231 |     }
232 |
233 |     pub fn state(&mut self) -> &mut S {
234 |         &mut self.user_state
235 |     }
236 | }
237 |
--------------------------------------------------------------------------------
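A minimal usage sketch for the interpolation lexer above, assuming lexgen's generated iterator interface (items of type `Result<(Loc, Token, Loc), LexerError>`); the driver function itself is hypothetical and not part of the repository:

```rust
use lexgen_lalrpop_example::lexer::Lexer;

fn main() {
    // A generated lexer iterates over `Result<(Loc, Token, Loc), LexerError>`,
    // where the two `Loc`s are the start and (exclusive) end of each match.
    for result in Lexer::new(r#""ab\("c" + "d")""#) {
        let (start, token, end) = result.expect("example input should lex");
        println!("{}:{}-{}:{} {:?}", start.line, start.col, end.line, end.col, token);
    }
}
```

For this input the loop should print the `StringStart` .. `StringEnd` token sequence described in the `Token` docs, and the same iterator can be fed directly to `ExpressionParser::new().parse(...)`, as the tests above do.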