├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .vim └── coc-settings.json ├── CHANGELOG.md ├── Cargo.toml ├── LICENSE ├── README.md └── crates ├── char_range_gen ├── Cargo.toml └── src │ └── main.rs ├── lexgen ├── Cargo.toml ├── benches │ └── benchmarks.rs ├── src │ ├── ast.rs │ ├── builtin.rs │ ├── char_ranges.rs │ ├── collections.rs │ ├── dfa.rs │ ├── dfa │ │ ├── backtrack.rs │ │ ├── codegen.rs │ │ ├── codegen │ │ │ ├── ctx.rs │ │ │ └── search_table.rs │ │ ├── simplify.rs │ │ └── simulate.rs │ ├── display.rs │ ├── lib.rs │ ├── nfa.rs │ ├── nfa │ │ └── simulate.rs │ ├── nfa_to_dfa.rs │ ├── range_map.rs │ ├── regex_to_nfa.rs │ ├── right_ctx.rs │ ├── semantic_action_table.rs │ └── tests.rs └── tests │ ├── bugs.rs │ ├── lua_5_1.rs │ ├── right_ctx.rs │ ├── test_data │ ├── test_utils.rs │ └── tests.rs ├── lexgen_lalrpop_example ├── Cargo.toml ├── build.rs └── src │ ├── interpolation.lalrpop │ └── lib.rs └── lexgen_util ├── Cargo.toml ├── README.md └── src └── lib.rs /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | name: 'Build and test' 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | 12 | - name: Get stable toolchain 13 | uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: stable 16 | override: true 17 | 18 | - name: Build 19 | run: cargo build --verbose 20 | 21 | - name: Test 22 | run: cargo test 23 | 24 | formatting: 25 | name: 'Check formatting' 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v2 29 | 30 | - name: Get Rust stable toolchain 31 | uses: actions-rs/toolchain@v1 32 | with: 33 | toolchain: stable 34 | components: rustfmt 35 | override: true 36 | 37 | - name: Check formatting 38 | run: cargo fmt --all -- --check 39 | 40 | clippy: 41 | name: 'Check lints' 42 | runs-on: ubuntu-latest 43 | steps: 44 | - uses: actions/checkout@v2 45 | 46 | - name: Get Rust stable toolchain 47 | uses: actions-rs/toolchain@v1 48 | with: 49 | toolchain: stable 50 | override: true 51 | 52 | - name: Check lints 53 | run: cargo clippy --all-targets --all 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /char_range_gen/target 3 | Cargo.lock 4 | -------------------------------------------------------------------------------- /.vim/coc-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "rust-analyzer.checkOnSave.allTargets": false 3 | } 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 2024/12/24: 0.16.0 2 | 3 | - Update lexgen rustc-hash dependency from 1.1.0 to 2.0.0, lexgen_util 4 | unicode-width dependency from 0.1.10 to 0.2.0. 5 | 6 | - Lexers can now use `pub(crate)` visibility, and other visibilities supported 7 | by Rust and the `syn` crate. Previously only `pub` was supported. 8 | 9 | - Eliminate redundant `backtrack` calls in generated code, improving code size 10 | and runtime performance. Runtime performance improved 13% in a benchmark. 11 | (#69) 12 | 13 | # 2023/09/03: 0.15.0 14 | 15 | - Lexer type declarations can now have outer attributes other than just 16 | `#[derive(Clone)]`. Example: 17 | ```rust 18 | lexer! { 19 | /// A lexer for Rust. 
20 | #[derive(Debug, Clone)] 21 | pub RustLexer(LexerState) -> RustToken; 22 | 23 | ... 24 | } 25 | ``` 26 | The attributes are directly copied to the generated `struct`. In the example, 27 | the documentation and `derive` attribute will be copied to the generated 28 | `struct`: 29 | ```rust 30 | /// A lexer for Rust. 31 | #[derive(Debug, Clone)] 32 | pub struct RustLexer<...>(...); 33 | ``` 34 | 35 | - `lexgen_util::Lexer` type now derives `Debug` (in addition to `Clone`). This 36 | makes it possible to derive `Debug` in generated lexers. 37 | 38 | - `syn` dependency updated to version 2. 39 | 40 | # 2023/04/23: 0.14.0 41 | 42 | - **Breaking change:** Rules without a right-hand side (e.g. `$$whitespace,`) 43 | now always reset the current match. Previously such rules would only reset 44 | the current match in `Init`. (#12) 45 | 46 | To migrate, add a semantic action to your rule that just calls `continue_()` 47 | on the lexer. For example, if you have `$$whitespace,`, replace it with: 48 | 49 | ```rust 50 | $$whitespace => |lexer| lexer.continue_(), 51 | ``` 52 | 53 | - `clippy::manual_is_ascii_check` violations are now ignored in generated code. 54 | 55 | # 2023/04/10: 0.13.0 56 | 57 | - Fix more `manual_range_contains` lints in generated code. 58 | 59 | - `let` bindings can now appear inside `rule`s. Previously `let`s were only 60 | allowed at the top-level. (#28) 61 | 62 | - You can now add `#[derive(Clone)]` before the lexer type name to implement 63 | `Clone` for the lexer type. This can be used to implement backtracking 64 | parsers. Example: 65 | ```rust 66 | lexer! { 67 | #[derive(Clone)] 68 | pub Lexer -> Token; 69 | // The struct `Lexer` will implement `Clone` 70 | 71 | ... 72 | } 73 | ``` 74 | 75 | # 2022/08/12: 0.12.0 76 | 77 | - Fix `double_comparison`, `manual_range_contains` lints in generated code. 78 | (0ecb0b1) 79 | 80 | - Lexer constructors `new_with_state` and `new_from_iter_with_state` no longer 81 | require user state to implement `Default`. (#54) 82 | 83 | - User state can now have lifetime parameters other than `'input`. (#53) 84 | 85 | # 2022/05/15: 0.11.0 86 | 87 | - Lexer state is now reset on failure. (#48) 88 | 89 | # 2022/02/20: 0.10.0 90 | 91 | - Generated lexers now have two new constructors: 92 | 93 | - `new_from_iter + Clone>(iter: I) -> Self` 94 | - `new_from_iter_with_state + Clone, S>(iter: I, user_state: S) -> Self` 95 | 96 | These constructors allow running a lexer on a character iterator instead of a 97 | string slice. Generated lexers work exactly the same way, except the `match_` 98 | method panics when called. 99 | 100 | Locations of matches can be obtained with the `match_loc(&self) -> (Loc, 101 | Loc)` method. 102 | 103 | These constructors are useful when the input is not a flat unicode string, 104 | but something like a rope, gap array, zipper, etc. (#41) 105 | 106 | - `lexgen_util::Loc` now implements `Default`. This makes it easier to use 107 | lexgen with [LALRPOP]. (#44) 108 | 109 | [LALRPOP]: https://github.com/lalrpop/lalrpop 110 | 111 | # 2022/01/31: 0.9.0 112 | 113 | - New regex syntax `#` added for character set difference, e.g. `re1 # re2` 114 | matches characters in `re1` that are not in `re2`. `re1` and `re2` need to be 115 | "character sets", i.e. `*`, `+`, `?`, `"..."`, `$`, and concatenation are not 116 | allowed. 117 | 118 | - **Breaking change:** `LexerError` type is refactored to add location 119 | information to all errors, not just `InvalidToken`. 
Previously the type was: 120 | 121 | ```rust 122 | #[derive(Debug, Clone, PartialEq, Eq)] 123 | pub enum LexerError { 124 | InvalidToken { 125 | location: Loc, 126 | }, 127 | 128 | /// Custom error, raised by a semantic action 129 | Custom(E), 130 | } 131 | ``` 132 | 133 | with this change, it is now: 134 | 135 | ```rust 136 | #[derive(Debug, Clone, PartialEq, Eq)] 137 | pub struct LexerError { 138 | pub location: Loc, 139 | pub kind: LexerErrorKind, 140 | } 141 | 142 | #[derive(Debug, Clone, PartialEq, Eq)] 143 | pub enum LexerErrorKind { 144 | /// Lexer error, raised by lexgen-generated code 145 | InvalidToken, 146 | 147 | /// Custom error, raised by a semantic action 148 | Custom(E), 149 | } 150 | ``` 151 | 152 | - A new syntax added for right contexts. A right context is basically 153 | lookahead, but can only be used in rules and cannot be nested inside regexes. 154 | See README for details. (#29) 155 | 156 | # 2021/11/30: 0.8.1 157 | 158 | New version published to fix broken README pages for lexgen and lexgen_util in 159 | crates.io. 160 | 161 | # 2021/10/30: 0.8.0 162 | 163 | - **Breaking change:** Starting with this release, lexgen-generated lexers now 164 | depend on `lexgen_util` package of the same version. If you are using lexgen 165 | version 0.8 or newer, make sure to add `lexgen_util = "..."` to your 166 | `Cargo.toml`, using the same version number as `lexgen`. 167 | 168 | - Common code in generated code is moved to a new crate `lexgen_util`. 169 | lexgen-generated lexers now depend on `lexgen_util`. 170 | 171 | - **Breaking change:** Line and column tracking implemented. Iterator 172 | implementation now yields `(Loc, Token, Loc)`, where `Loc` is defined in 173 | `lexgen_util` as `struct Loc { line: u32, col: u32, byte_idx: usize }`. 174 | 175 | - Fixed a bug when initial state of a rule does not have any transitions (rule 176 | is empty). (#27, 001ea51) 177 | 178 | - Fixed a bug in codegen that caused accidental backtracking in some cases. 179 | (#27, 001ea51) 180 | 181 | - Fixed a bug that caused incorrect lexing when a lexer state has both range 182 | and any (`_`) transitions. (#31) 183 | 184 | # 2021/10/21: 0.7.0 185 | 186 | - Regex syntax updated to include "any character" (`_`) and "end of input" 187 | (`$`). 188 | 189 | Previously "any character" (`_`) could be used as a rule left-hand side, but 190 | was not allowed in regexes. 191 | 192 | - Semantic action functions that use user state (`state` method of the lexer 193 | handle) no longer need `mut` modifier in the handle argument. 194 | 195 | This will generate warnings in old code with semantic actions that take a 196 | `mut` argument. 197 | 198 | - New lexer method `reset_match` implemented to reset the current match. 199 | 200 | # 2021/10/19: 0.6.0 201 | 202 | - Fixed precedences of concatenation (juxtaposition) and alternation (`|`). 203 | 204 | - Fixed lexing in lexers that require backtracking to implement longest match 205 | rule. (#16) 206 | 207 | # 2021/10/07: 0.5.0 208 | 209 | - Accepting states without transitions are now simplified in compile time and 210 | semantic actions of such states are inlined in the states that make a 211 | transition to such accepting states. In Lua 5.1 lexer this reduces a 212 | benchmark's runtime by 14.9%. (#7) 213 | 214 | Note that this potentially duplicates a lot of code in the generated code 215 | when some states have large semantic action codes and lots of incoming edges 216 | in the DFA. However in practice I haven't observed this yet. 
(#8) 217 | 218 | - DFA states with one predecessor are now inlined in the predecessor states. 219 | This reduces code size and improves runtime performance. (33547ec) 220 | 221 | - We now reset the current match after returning a token (with `return_` and 222 | `switch_and_return`). (#11) 223 | 224 | # 2021/05/30: 0.4.0 225 | 226 | - lexgen now comes with a set of built-in regular expressions for matching 227 | Unicode alphanumerics, uppercases, whitespaces etc. See README for details. 228 | 229 | - Fixed a few issues with end-of-stream handling (cbaabe2) 230 | 231 | # 2021/05/28: 0.3.0 232 | 233 | - Fixed handling of overlapping ranges in a single NFA/DFA state. (#3) 234 | 235 | # 2021/05/16: 0.2.2 236 | 237 | - `LexerError` type now implements `Clone` and `Copy`. 238 | 239 | # 2021/05/06: 0.2.1 240 | 241 | - Fixed various bugs in `_` pattern handling. 242 | 243 | # 2021/05/05: 0.2.0 244 | 245 | - It is now possible to use the special lifetime `'input` in your token types 246 | to borrow from the input string. Example: 247 | 248 | ```rust 249 | enum Token<'input> { 250 | Id(&'input str), 251 | } 252 | 253 | lexer! { 254 | Lexer -> Token<'input>; 255 | 256 | rule Init { 257 | [' ' '\t' '\n']; // skip whitespace 258 | 259 | ['a'-'z']+ => |lexer| { 260 | let match_ = lexer.match_(); 261 | lexer.return_(Token::Id(match_)) 262 | }, 263 | } 264 | } 265 | ``` 266 | 267 | See also the Lua 5.1 lexer example, which is updated to use this feature. 268 | 269 | - The `rule Init { ... }` syntax can now be omitted when you don't need named 270 | rule sets. For example, the example in the previous changelog entry can be 271 | simplified as: 272 | 273 | ```rust 274 | lexer! { 275 | Lexer -> Token<'input>; 276 | 277 | [' ' '\t' '\n'], // skip whitespace 278 | 279 | ['a'-'z']+ => |lexer| { 280 | let match_ = lexer.match_(); 281 | lexer.return_(Token::Id(match_)) 282 | }, 283 | } 284 | ``` 285 | 286 | - `pub` keyword before a lexer name now generates the type as `pub`. Useful for 287 | using the generated lexer in other modules. Example: 288 | 289 | ```rust 290 | lexer! { 291 | pub Lexer -> Token; 292 | 293 | ... 294 | } 295 | ``` 296 | 297 | - Two new action kinds: "fallible" and "simple" added. The old ones defined 298 | with `=>` are now called "infallible". 299 | 300 | - "fallible" actions are defined with `=?` instead of `=>`. The difference 301 | from infallible actions is the return type is `Result`, 302 | instead of `Token`, where `UserError` is defined using `type Error = ...;` 303 | syntax. LHS can have a `<'input>` lifetime parameter when borrowing from 304 | the user input in the error values. When a user error type is defined, the 305 | lexer error struct becomes an enum, with two variants: 306 | 307 | ```rust 308 | enum LexerError { 309 | LexerError { char_idx: usize }, 310 | UserError(UserError), 311 | } 312 | ``` 313 | 314 | - "simple" actions are defined with `=` instead of `=>`. The RHS needs to be a 315 | value for a token, instead of a closure for a lexer action. This rule kind is 316 | useful when matching keywords and other simple tokens in a language. Example: 317 | 318 | ```rust 319 | lexer! { 320 | Lexer -> Token; 321 | 322 | '(' = Token::LParen, 323 | ')' = Token::RParen, 324 | ... 325 | } 326 | ``` 327 | 328 | The syntax ` = ` is syntactic sugar for ` => |lexer| 329 | lexer.return_()`. 
330 | 331 | # 2021/04/22: 0.1.0 332 | 333 | - Initial release 334 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = [ 4 | "crates/char_range_gen", 5 | "crates/lexgen", 6 | "crates/lexgen_lalrpop_example", 7 | "crates/lexgen_util", 8 | ] 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021 Ömer Sinan Ağacan 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lexgen: A fully-featured lexer generator, implemented as a proc macro 2 | 3 | ```rust 4 | use lexgen::lexer; 5 | use lexgen_util::Loc; 6 | 7 | lexer! { 8 | // First line specifies name of the lexer and the token type returned by 9 | // semantic actions 10 | Lexer -> Token; 11 | 12 | // Regular expressions can be named with `let` syntax 13 | let init = ['a'-'z']; 14 | let subseq = $init | ['A'-'Z' '0'-'9' '-' '_']; 15 | 16 | // Rule sets have names. Each rule set is compiled to a separate DFA. 17 | // Switching between rule sets is done explicitly in semantic actions. 18 | rule Init { 19 | // Rules without a right-hand side for skipping whitespace, 20 | // comments, etc. 21 | [' ' '\t' '\n']+, 22 | 23 | // Rule for matching identifiers 24 | $init $subseq* => |lexer| { 25 | let token = Token::Id(lexer.match_().to_owned()); 26 | lexer.return_(token) 27 | }, 28 | } 29 | } 30 | 31 | // The token type 32 | #[derive(Debug, PartialEq, Eq)] 33 | enum Token { 34 | // An identifier 35 | Id(String), 36 | } 37 | 38 | // Generated lexers are initialized with a `&str` for the input 39 | let mut lexer = Lexer::new(" abc123Q-t z9_9"); 40 | 41 | // Lexers implement `Iterator>`, 42 | // where `T` is the token type specified in the lexer definition (`Token` in 43 | // this case), and `Loc`s indicate line, column, and byte indices of 44 | // beginning and end of the lexemes. 
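// (As the asserts below show, `line` and `col` in `Loc` are 0-based, and `byte_idx` is the byte offset into the input.)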
45 | assert_eq!( 46 | lexer.next(), 47 | Some(Ok(( 48 | Loc { line: 0, col: 1, byte_idx: 1 }, 49 | Token::Id("abc123Q-t".to_owned()), 50 | Loc { line: 0, col: 10, byte_idx: 10 } 51 | ))) 52 | ); 53 | assert_eq!( 54 | lexer.next(), 55 | Some(Ok(( 56 | Loc { line: 0, col: 12, byte_idx: 12 }, 57 | Token::Id("z9_9".to_owned()), 58 | Loc { line: 0, col: 16, byte_idx: 16 } 59 | ))) 60 | ); 61 | assert_eq!(lexer.next(), None); 62 | ``` 63 | 64 | See also: 65 | 66 | - [Simple lexer definitions in tests][1] 67 | - [A full Lua 5.1 lexer][2] 68 | - [An example that uses lexgen with LALRPOP][3] 69 | - [A lexer for a simpler version of OCaml][4] 70 | - [A Rust lexer][5] 71 | - [A parse event generator][6] 72 | 73 | ## Motivation 74 | 75 | Implementing lexing is often (along with parsing) the most tedious part of 76 | implementing a language. Lexer generators make this much easier, but in Rust 77 | existing lexer generators miss essential features for practical use, and/or 78 | require a pre-processing step when building. 79 | 80 | My goal with lexgen is to have a feature-complete and easy-to-use lexer 81 | generator. 82 | 83 | ## Usage 84 | 85 | lexgen doesn't require a build step. Add the same versions of `lexgen` and 86 | `lexgen_util` as dependencies in your `Cargo.toml`. 87 | 88 | ## Lexer syntax 89 | 90 | lexgen lexers start with the name of the generated lexer struct, an optional user 91 | state part, and the token type (the type of values returned by semantic actions). 92 | Example: 93 | 94 | ```rust 95 | lexer! { 96 | Lexer(LexerState) -> Token; 97 | ... 98 | } 99 | ``` 100 | 101 | Here the generated lexer type will be named `Lexer`. The user state type is 102 | `LexerState` (this type should be defined by the user). The token type is 103 | `Token`. 104 | 105 | After the lexer name and the user state and token types, we define the rules: 106 | 107 | ```rust 108 | rule Init { 109 | ... 110 | } 111 | 112 | rule SomeOtherRule { 113 | ... 114 | } 115 | ``` 116 | 117 | The first rule set defines the initial state of the lexer and needs to 118 | be named `Init`. 119 | 120 | In the body of a `rule` block we define the rules for that lexer state. The 121 | syntax for a rule is `<regex> => <semantic action>,`. Regex syntax is described 122 | below. A semantic action is any Rust code with the type `fn(LexerHandle) -> 123 | LexerAction` where `LexerHandle` and `LexerAction` are generated names derived 124 | from the lexer name (`Lexer` in our example). More on these types below. 125 | 126 | Regular expressions can be named with `let <name> = <regex>;` syntax. Example: 127 | 128 | ```rust 129 | let init = ['a'-'z']; 130 | let subseq = $init | ['A'-'Z' '0'-'9' '-' '_']; 131 | 132 | // Named regexes can be used with the `$` prefix 133 | $init $subseq* => |lexer| { ... } 134 | ``` 135 | 136 | You can omit the `rule Init { ... }` part and have all of your rules at the top 137 | level if you don't need rule sets. 138 | 139 | In summary: 140 | 141 | - The first line is in the form `<lexer name>(<user state type>) -> <token type>`. 142 | The `(<user state type>)` part can be omitted for stateless lexers. 143 | 144 | - Next are the rule sets. There should be at least one rule set with the name 145 | `Init`, which is the name of the initial state. 146 | 147 | - `let` bindings can be added at the top level or in `rule`s. 148 | 149 | ## Regex syntax 150 | 151 | Regex syntax can be used on the right-hand side of `let` bindings and the left-hand side 152 | of rules. The syntax is: 153 | 154 | - `$var` for variables defined in the `let` binding section. Variables need to be 155 | defined before they are used.
156 | - `$$var` for built-in regexes (see "Built-in regular expressions" section 157 | below). 158 | - Rust character syntax for characters, e.g. `'a'`. 159 | - Rust string syntax for strings, e.g. `"abc"`. 160 | - `[...]` for character sets. Inside the brackets you can have one or more of: 161 | 162 | - Characters 163 | - Character ranges: e.g. `'a'-'z'` 164 | 165 | Here's an example character set for ASCII alphanumerics: `['a'-'z' 'A'-'Z' 166 | '0'-'9']` 167 | - `_` for matching any character 168 | - `$` for matching end-of-input 169 | - `<regex>*` for zero or more repetitions of `<regex>` 170 | - `<regex>+` for one or more repetitions of `<regex>` 171 | - `<regex>?` for zero or one repetition of `<regex>` 172 | - `<regex> <regex>` for concatenation 173 | - `<regex> | <regex>` for alternation: match the first one, or the second one. 174 | - `<regex> # <regex>` for difference: match characters in the first regex that 175 | are not in the second regex. Note that the regexes on the left and right of `#` 176 | should be "character sets", i.e. `*`, `+`, `?`, `"..."`, `$`, and 177 | concatenation are not allowed. Variables that are bound to character sets are 178 | allowed. 179 | 180 | Binding powers (precedences), from highest to lowest: 181 | 182 | - `*`, `+`, `?` 183 | - `#` 184 | - Concatenation 185 | - `|` 186 | 187 | You can use parentheses for grouping, e.g. `('a' | 'b')*`. 188 | 189 | Example: `'a' 'b' | 'c'+` is the same as `(('a' 'b') | ('c'+))`. 190 | 191 | ## Right context (lookahead) 192 | 193 | A rule in a rule set can be followed by another regex using `> <regex>` syntax, 194 | for right context. Right contexts are a limited form of lookahead: they 195 | can only appear after a top-level regex for a rule. They cannot be nested 196 | inside a regex. 197 | 198 | For example, the rule left-hand side `'a' > (_ # 'b')` matches `'a'` as long as 199 | it's not followed by `'b'`. 200 | 201 | See also [right context tests] for more examples. 202 | 203 | [right context tests]: https://github.com/osa1/lexgen/blob/main/crates/lexgen/tests/right_ctx.rs 204 | 205 | ## Built-in regular expressions 206 | 207 | lexgen comes with a set of built-in regular expressions. The regular 208 | expressions listed below match the same set of characters as their Rust 209 | counterparts. For example, `$$alphabetic` matches the same set of characters as 210 | Rust's [`char::is_alphabetic`]: 211 | 212 | - `$$alphabetic` 213 | - `$$alphanumeric` 214 | - `$$ascii` 215 | - `$$ascii_alphabetic` 216 | - `$$ascii_alphanumeric` 217 | - `$$ascii_control` 218 | - `$$ascii_digit` 219 | - `$$ascii_graphic` 220 | - `$$ascii_hexdigit` 221 | - `$$ascii_lowercase` 222 | - `$$ascii_punctuation` 223 | - `$$ascii_uppercase` 224 | - `$$ascii_whitespace` 225 | - `$$control` 226 | - `$$lowercase` 227 | - `$$numeric` 228 | - `$$uppercase` 229 | - `$$whitespace` 230 | 231 | (Note that in the generated code we don't use Rust `char` methods. For simple 232 | cases like `$$ascii` we generate simple range checks. For more complicated 233 | cases like `$$lowercase` we generate a binary search table and run a binary 234 | search when checking a character.) 235 | 236 | In addition, these two built-in regular expressions match Unicode [XID_Start and 237 | XID_Continue]: 238 | 239 | - `$$XID_Start` 240 | - `$$XID_Continue` 241 | 242 | [`char::is_alphabetic`]: https://doc.rust-lang.org/std/primitive.char.html#method.is_alphabetic 243 | [XID_Start and XID_Continue]: http://www.unicode.org/reports/tr31/ 244 | 245 | ## Rule syntax 246 | 247 | - `<regex> => <semantic action>,`: `<regex>` syntax is as described above.
248 | `<semantic action>` is any Rust code with type `fn(&mut Lexer) -> 249 | SemanticActionResult`. More on the `SemanticActionResult` type in the next 250 | section. 251 | 252 | - `<regex> =? <semantic action>,`: fallible actions. This syntax is similar to 253 | the syntax above, except `<semantic action>` has type `fn(&mut Lexer) -> 254 | LexerAction<Result<Token, UserError>>`. When using rules of this kind, the 255 | error type needs to be declared at the beginning of the lexer with the `type 256 | Error = UserError;` syntax. 257 | 258 | When a rule of this kind returns an error, the error is returned to the 259 | caller of the lexer's `next` method. 260 | 261 | - `<regex>,`: Syntactic sugar for `<regex> => |lexer| { lexer.reset_match(); 262 | lexer.continue_() },`. Useful for skipping characters (e.g. whitespace). 263 | 264 | - `<regex> = <token>,`: Syntactic sugar for `<regex> => |lexer| 265 | lexer.return_(<token>),`. Useful for matching keywords, punctuation 266 | (operators) and delimiters (parens, brackets). 267 | 268 | ## End-of-input handling in rule sets 269 | 270 | The `Init` rule set terminates lexing successfully on end-of-input (i.e. 271 | `lexer.next()` returns `None`). Other rule sets fail on end-of-input (i.e. 272 | return `Some(Err(...))`). This is because, generally, the states other than the 273 | initial one are for complicated tokens (strings, raw strings, multi-line 274 | comments) that need to be terminated and handled, and end-of-input in those 275 | states usually means the token did not terminate properly. 276 | 277 | (To handle end-of-input in a rule set you can use `$` as described in section 278 | "Regex syntax" above.) 279 | 280 | ## Handle, rule, error, and action types 281 | 282 | The `lexer` macro generates a struct with the name specified by the user in the 283 | first line of the lexer definition. In the example at the beginning (`Lexer -> 284 | Token;`), the name of the struct is `Lexer`. 285 | 286 | A `mut` reference to this type is passed to semantic action functions. In the 287 | implementation of a semantic action, you should use one of the methods below to 288 | drive the lexer and return tokens: 289 | 290 | - `fn match_(&self) -> &str`: returns the current match. Note that when the 291 | lexer is constructed with `new_from_iter` or `new_from_iter_with_state`, this 292 | method panics. It should only be called when the lexer is initialized with 293 | `new` or `new_with_state`. 294 | - `fn match_loc(&self) -> (lexgen_util::Loc, lexgen_util::Loc)`: returns the 295 | bounds of the current match. 296 | - `fn peek(&mut self) -> Option<char>`: looks ahead one character. 297 | - `fn state(&mut self) -> &mut <user state type>`: returns a mutable reference 298 | to the user state. 299 | - `fn return_(&self, token: <user token type>) -> SemanticActionResult`: 300 | returns the passed token as a match. 301 | - `fn continue_(&self) -> SemanticActionResult`: ignores the current match and 302 | continues lexing in the same lexer state. Useful for skipping characters. 303 | - `fn switch(&mut self, rule: LexerRule) -> SemanticActionResult`: used for 304 | switching between lexer states. The `LexerRule` (where the `Lexer` part is the 305 | name of the lexer as specified by the user) is an enum with a variant for 306 | each rule set name, for example, `LexerRule::Init`. See the stateful lexer 307 | example below. 308 | - `fn switch_and_return(&mut self, rule: LexerRule, token: <user token type>) 309 | -> SemanticActionResult`: switches to the given lexer state and returns the 310 | given token. 311 | - `fn reset_match(&mut self)`: resets the current match. E.g. if you call 312 | `match_()` right after `reset_match()`, it will return an empty string.
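To see how these methods combine, here is a minimal sketch (the `Token` type, its `Str` variant, and the `String` rule set name are inventions for this example) of lexing double-quoted strings without escape sequences:

```rust
use lexgen::lexer;

#[derive(Debug, PartialEq, Eq)]
enum Token {
    Str(String),
}

lexer! {
    Lexer -> Token;

    rule Init {
        [' ' '\t' '\n']+,

        '"' => |lexer| {
            // Drop the opening quote from the match, then lex the rest of the
            // string in the `String` rule set.
            lexer.reset_match();
            lexer.switch(LexerRule::String)
        },
    }

    rule String {
        // Closing quote: the current match is the string contents plus this
        // quote, so trim the quote, return a token, and switch back to `Init`.
        '"' => |lexer| {
            let s = lexer.match_();
            let s = s[..s.len() - 1].to_owned();
            lexer.switch_and_return(LexerRule::Init, Token::Str(s))
        },

        // Any character other than `"`: keep extending the current match.
        _ # '"' => |lexer| lexer.continue_(),
    }
}
```

Since `String` is not the `Init` rule set, end-of-input inside a string is reported as an error, per the end-of-input behavior described above.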
313 | 314 | Semantic action functions should return a `SemanticActionResult` value obtained 315 | from one of the methods listed above. 316 | 317 | ## Initializing lexers 318 | 319 | lexgen generates 4 constructors: 320 | 321 | - `fn new(input: &str) -> Self`: Used when the lexer does not have user state, 322 | or the user state implements `Default`. 323 | 324 | - `fn new_with_state(input: &str, user_state: S) -> Self`: Used when the lexer 325 | has user state that does not implement `Default`, or you want to initialize 326 | the state with something other than the default. `S` is the user state type 327 | specified in the lexer definition. See the stateful lexer example below. 328 | 329 | - `fn new_from_iter<I: Iterator<Item = char> + Clone>(iter: I) -> Self`: Used 330 | when the input isn't a flat string, but something like a rope or zipper. Note 331 | that the `match_` method panics when this constructor is used. Instead use 332 | `match_loc` to get the location of the current match. 333 | 334 | - `fn new_from_iter_with_state<I: Iterator<Item = char> + Clone, S>(iter: I, 335 | user_state: S) -> Self`: Same as above, but doesn't require user state to 336 | implement `Default`. 337 | 338 | ## Stateful lexer example 339 | 340 | Here's an example lexer that counts the number of `=`s appearing between two `[`s: 341 | 342 | ```rust 343 | lexer! { 344 | // `usize` in parentheses is the user state type, `usize` after the arrow 345 | // is the token type 346 | Lexer(usize) -> usize; 347 | 348 | rule Init { 349 | $$ascii_whitespace, // line 7 350 | 351 | '[' => |lexer| { 352 | *lexer.state() = 0; // line 10 353 | lexer.switch(LexerRule::Count) // line 11 354 | }, 355 | } 356 | 357 | rule Count { 358 | '=' => |lexer| { 359 | *lexer.state() += 1; // line 17 360 | lexer.continue_() // line 18 361 | }, 362 | 363 | '[' => |lexer| { 364 | let n = *lexer.state(); 365 | lexer.switch_and_return(LexerRule::Init, n) // line 23 366 | }, 367 | } 368 | } 369 | 370 | let mut lexer = Lexer::new("[[ [=[ [==["); 371 | assert_eq!( 372 | lexer.next(), 373 | Some(Ok(( 374 | Loc { line: 0, col: 0, byte_idx: 0 }, 375 | 0, 376 | Loc { line: 0, col: 2, byte_idx: 2 }, 377 | ))) 378 | ); 379 | assert_eq!( 380 | lexer.next(), 381 | Some(Ok(( 382 | Loc { line: 0, col: 3, byte_idx: 3 }, 383 | 1, 384 | Loc { line: 0, col: 6, byte_idx: 6 }, 385 | ))) 386 | ); 387 | assert_eq!( 388 | lexer.next(), 389 | Some(Ok(( 390 | Loc { line: 0, col: 7, byte_idx: 7 }, 391 | 2, 392 | Loc { line: 0, col: 11, byte_idx: 11 }, 393 | ))) 394 | ); 395 | assert_eq!(lexer.next(), None); 396 | ``` 397 | 398 | Initially (in the `Init` rule set) we skip spaces (line 7). When we see a `[` we 399 | initialize the user state (line 10) and switch to the `Count` state (line 11). 400 | In `Count`, each `=` increments the user state by one (line 17) and skips the 401 | match (line 18). A `[` in the `Count` state returns the current number and switches 402 | to the `Init` state (line 23).
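Relatedly, here is a minimal sketch of the iterator-based constructors from "Initializing lexers" above (the `Token` type and the input fragments are made up for this example). The semantic action uses `match_loc` rather than `match_`, since `match_` panics in lexers constructed with `new_from_iter`:

```rust
use lexgen::lexer;

#[derive(Debug, PartialEq, Eq)]
enum Token {
    Word,
}

lexer! {
    Lexer -> Token;

    [' ' '\t' '\n']+,

    ['a'-'z']+ => |lexer| {
        // `match_` would panic here for an iterator-based lexer; `match_loc`
        // gives the bounds of the lexeme instead.
        let (_start, _end) = lexer.match_loc();
        lexer.return_(Token::Word)
    },
}

// Any `Iterator<Item = char> + Clone` works as input, e.g. characters drawn
// from several string fragments:
let fragments = ["foo ", "bar"];
let chars = fragments.iter().flat_map(|s| s.chars());

for next in Lexer::new_from_iter(chars) {
    let (start, token, end) = next.unwrap();
    println!("{:?} at {:?}..{:?}", token, start, end);
}
```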
403 | 404 | [1]: https://github.com/osa1/lexgen/blob/main/crates/lexgen/tests/tests.rs 405 | [2]: https://github.com/osa1/lexgen/blob/main/crates/lexgen/tests/lua_5_1.rs 406 | [3]: https://github.com/osa1/lexgen/tree/main/crates/lexgen_lalrpop_example 407 | [4]: https://github.com/osa1/mincaml/blob/master/src/lexer.rs 408 | [5]: https://github.com/osa1/lexgen_rust/blob/main/crates/lexgen_rust/src/lib.rs 409 | [6]: https://github.com/osa1/how-to-parse/blob/4f40236b1f9eca5b67d2193ef0f55fffdc06bffb/src/lexgen_event_parser.rs 410 | -------------------------------------------------------------------------------- /crates/char_range_gen/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "char_range_gen" 3 | version = "0.1.0" 4 | authors = ["Ömer Sinan Ağacan "] 5 | edition = "2021" 6 | 7 | [dependencies] 8 | unicode-xid = "0.2.2" 9 | -------------------------------------------------------------------------------- /crates/char_range_gen/src/main.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::type_complexity)] 2 | 3 | use std::convert::TryFrom; 4 | 5 | fn main() { 6 | for (f, name) in FNS.iter() { 7 | let ranges = generate_char_fn_ranges(*f); 8 | println!("pub static {}: [(u32, u32); {}] = [", name, ranges.len()); 9 | for range in ranges { 10 | println!(" ({}, {}),", range.0, range.1); 11 | } 12 | println!("];"); 13 | } 14 | } 15 | 16 | fn generate_char_fn_ranges(f: fn(char) -> bool) -> Vec<(u32, u32)> { 17 | let mut ranges: Vec<(u32, u32)> = vec![]; 18 | let mut current_range_start: Option = None; 19 | 20 | for i in 0..=u32::from(char::MAX) { 21 | let c = match char::try_from(i) { 22 | Err(_) => continue, 23 | Ok(c) => c, 24 | }; 25 | 26 | if f(c) { 27 | if current_range_start.is_none() { 28 | current_range_start = Some(i); 29 | } 30 | } else if let Some(current_range_start) = current_range_start.take() { 31 | ranges.push((current_range_start, i - 1)); 32 | } 33 | } 34 | 35 | ranges 36 | } 37 | 38 | macro_rules! 
ascii_fn { 39 | ($x:ident) => { 40 | fn $x(c: char) -> bool { 41 | char::$x(&c) 42 | } 43 | }; 44 | } 45 | 46 | ascii_fn!(is_ascii); 47 | ascii_fn!(is_ascii_alphabetic); 48 | ascii_fn!(is_ascii_alphanumeric); 49 | ascii_fn!(is_ascii_control); 50 | ascii_fn!(is_ascii_digit); 51 | ascii_fn!(is_ascii_graphic); 52 | ascii_fn!(is_ascii_hexdigit); 53 | ascii_fn!(is_ascii_lowercase); 54 | ascii_fn!(is_ascii_punctuation); 55 | ascii_fn!(is_ascii_uppercase); 56 | ascii_fn!(is_ascii_whitespace); 57 | 58 | static FNS: [(fn(char) -> bool, &str); 20] = [ 59 | (char::is_alphabetic, "ALPHABETIC"), 60 | (char::is_alphanumeric, "ALPHANUMERIC"), 61 | (is_ascii, "ASCII"), 62 | (is_ascii_alphabetic, "ASCII_ALPHABETIC"), 63 | (is_ascii_alphanumeric, "ASCII_ALPHANUMERIC"), 64 | (is_ascii_control, "ASCII_CONTROL"), 65 | (is_ascii_digit, "ASCII_DIGIT"), 66 | (is_ascii_graphic, "ASCII_GRAPHIC"), 67 | (is_ascii_hexdigit, "ASCII_HEXDIGIT"), 68 | (is_ascii_lowercase, "ASCII_LOWERCASE"), 69 | (is_ascii_punctuation, "ASCII_PUNCTUATION"), 70 | (is_ascii_uppercase, "ASCII_UPPERCASE"), 71 | (is_ascii_whitespace, "ASCII_WHITESPACE"), 72 | (char::is_control, "CONTROL"), 73 | (char::is_lowercase, "LOWERCASE"), 74 | (char::is_numeric, "NUMERIC"), 75 | (char::is_uppercase, "UPPERCASE"), 76 | (char::is_whitespace, "WHITESPACE"), 77 | (::is_xid_start, "XID_START"), 78 | ( 79 | ::is_xid_continue, 80 | "XID_CONTINUE", 81 | ), 82 | ]; 83 | -------------------------------------------------------------------------------- /crates/lexgen/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lexgen" 3 | version = "0.16.0" 4 | authors = ["Ömer Sinan Ağacan "] 5 | description = "A fully-featured lexer generator implemented as a proc macro" 6 | edition = "2021" 7 | license = "MIT" 8 | homepage = "https://github.com/osa1/lexgen" 9 | categories = ["compilers", "development-tools", "parsing"] 10 | readme = "../../README.md" 11 | repository = "https://github.com/osa1/lexgen" 12 | 13 | [lib] 14 | proc-macro = true 15 | 16 | [dependencies] 17 | proc-macro2 = "1.0" 18 | quote = "1.0" 19 | rustc-hash = "2.0.0" 20 | syn = { version = "2.0.30", features = ["extra-traits", "fold", "full", "visit"] } 21 | 22 | [dev-dependencies] 23 | criterion = "0.3" 24 | lexgen_util = { path = "../lexgen_util" } 25 | 26 | [[bench]] 27 | name = "benchmarks" 28 | harness = false 29 | -------------------------------------------------------------------------------- /crates/lexgen/benches/benchmarks.rs: -------------------------------------------------------------------------------- 1 | // Hacky, but this is the only way I could find to share the lexer in both tests and benchmarks 2 | include!("../tests/lua_5_1.rs"); 3 | 4 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 5 | 6 | #[inline(never)] 7 | fn lex_lua(s: &str) { 8 | let lexer = Lexer::new(s); 9 | for _ in lexer {} 10 | } 11 | 12 | fn lexer_bench(c: &mut Criterion) { 13 | let mut str = String::new(); 14 | str.push_str(&std::fs::read_to_string("tests/test_data").unwrap()); 15 | 16 | for _ in 0..5 { 17 | let str_ = str.clone(); 18 | str.push_str(&str_); 19 | } 20 | 21 | c.bench_function("Lex Lua files", |b| b.iter(|| lex_lua(black_box(&str)))); 22 | } 23 | 24 | criterion_group!(benches, lexer_bench); 25 | criterion_main!(benches); 26 | -------------------------------------------------------------------------------- /crates/lexgen/src/ast.rs: -------------------------------------------------------------------------------- 1 | 
//! Proc macro AST definition and parser implementations 2 | 3 | use crate::semantic_action_table::{SemanticActionIdx, SemanticActionTable}; 4 | 5 | use syn::parse::discouraged::Speculative; 6 | use syn::parse::ParseStream; 7 | 8 | use std::fmt; 9 | 10 | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] 11 | pub struct Var(pub String); 12 | 13 | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] 14 | pub struct Builtin(pub String); 15 | 16 | #[derive(Debug)] 17 | pub struct Lexer { 18 | /// Attributes like `#[derive(...)]` and `/// ...` attached to the lexer type declaration. 19 | /// These attributes are copied to the generated lexer struct. 20 | pub attrs: Vec<syn::Attribute>, 21 | pub visibility: Option<syn::Visibility>, 22 | pub type_name: syn::Ident, 23 | pub user_state_type: Option<syn::Type>, 24 | pub token_type: syn::Type, 25 | pub rules: Vec<Rule>, 26 | } 27 | 28 | pub enum Rule { 29 | /// `type Error = UserError;` 30 | ErrorType { 31 | /// Type on the RHS, e.g. `UserError<'input>` 32 | ty: syn::Type, 33 | }, 34 | 35 | /// A top-level binding or unnamed rule 36 | RuleOrBinding(RuleOrBinding), 37 | 38 | /// A list of named rules at the top level: `rule <Name> { <rules> },` 39 | RuleSet { 40 | name: syn::Ident, 41 | rules: Vec<RuleOrBinding>, 42 | }, 43 | } 44 | 45 | pub enum RuleOrBinding { 46 | Rule(SingleRule), 47 | Binding(Binding), 48 | } 49 | 50 | pub struct SingleRule { 51 | pub lhs: RegexCtx, 52 | pub rhs: SemanticActionIdx, 53 | } 54 | 55 | /// A named regex binding: `let <name> = <regex>;`. 56 | #[derive(Debug)] 57 | pub struct Binding { 58 | pub var: Var, 59 | pub re: Regex, 60 | } 61 | 62 | /// Regular expression with optional right context (lookahead) 63 | #[derive(Debug, Clone)] 64 | pub struct RegexCtx { 65 | pub re: Regex, 66 | pub right_ctx: Option<Regex>, 67 | } 68 | 69 | #[derive(Debug, Clone)] 70 | pub enum RuleRhs { 71 | None, 72 | Rhs { expr: syn::Expr, kind: RuleKind }, 73 | } 74 | 75 | #[derive(Debug, Copy, Clone)] 76 | pub enum RuleKind { 77 | /// Defined with `=`. RHS is not passed a `LexerHandle`, returns `Token`. 78 | Simple, 79 | 80 | /// Defined with `=?`. RHS is passed a `LexerHandle`, returns `LexerAction<Result<Token, 81 | /// UserError>>`. 82 | Fallible, 83 | 84 | /// Defined with `=>`.
RHS is passed a `LexerHandle`, returns `LexerAction` 85 | Infallible, 86 | } 87 | 88 | impl fmt::Debug for Rule { 89 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 90 | match self { 91 | Rule::RuleOrBinding(rule_or_binding) => rule_or_binding.fmt(f), 92 | Rule::RuleSet { name, rules } => f 93 | .debug_struct("Rule::RuleSet") 94 | .field("name", &name.to_string()) 95 | .field("rules", rules) 96 | .finish(), 97 | Rule::ErrorType { ty } => f.debug_struct("Rule::ErrorType").field("ty", ty).finish(), 98 | } 99 | } 100 | } 101 | 102 | impl fmt::Debug for RuleOrBinding { 103 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 104 | match self { 105 | RuleOrBinding::Rule(rule) => rule.fmt(f), 106 | RuleOrBinding::Binding(binding) => binding.fmt(f), 107 | } 108 | } 109 | } 110 | 111 | impl fmt::Debug for SingleRule { 112 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 113 | f.debug_struct("SingleRule") 114 | .field("lhs", &self.lhs) 115 | .field("rhs", &"...") 116 | .finish() 117 | } 118 | } 119 | 120 | #[derive(Debug, Clone)] 121 | pub enum Regex { 122 | Builtin(Builtin), 123 | Var(Var), 124 | Char(char), 125 | String(String), 126 | CharSet(CharSet), 127 | ZeroOrMore(Box), 128 | OneOrMore(Box), 129 | ZeroOrOne(Box), 130 | Concat(Box, Box), 131 | Or(Box, Box), 132 | Any, // any character 133 | EndOfInput, 134 | 135 | /// Difference, or exclusion: characters in the first regex, excluding characters in the second 136 | /// regex. 137 | Diff(Box, Box), 138 | } 139 | 140 | #[derive(Debug, Clone)] 141 | pub struct CharSet(pub Vec); 142 | 143 | #[derive(Debug, Clone, Copy)] 144 | pub enum CharOrRange { 145 | Char(char), 146 | Range(char, char), 147 | } 148 | 149 | /// Parses a regex with optional right context: `re_ctx -> re [> re]` 150 | fn parse_regex_ctx(input: ParseStream) -> syn::Result { 151 | let re = parse_regex(input)?; 152 | if input.peek(syn::token::Gt) { 153 | input.parse::()?; 154 | let right_ctx = parse_regex(input)?; 155 | Ok(RegexCtx { 156 | re, 157 | right_ctx: Some(right_ctx), 158 | }) 159 | } else { 160 | Ok(RegexCtx { 161 | re, 162 | right_ctx: None, 163 | }) 164 | } 165 | } 166 | 167 | /// Parses a regex 168 | fn parse_regex(input: ParseStream) -> syn::Result { 169 | parse_regex_0(input) 170 | } 171 | 172 | // re_0 -> re_1 | re_0 `|` re_1 (alternation) 173 | fn parse_regex_0(input: ParseStream) -> syn::Result { 174 | let mut re = parse_regex_1(input)?; 175 | 176 | while input.peek(syn::token::Or) { 177 | let _ = input.parse::()?; 178 | let re2 = parse_regex_1(input)?; 179 | re = Regex::Or(Box::new(re), Box::new(re2)); // left associative 180 | } 181 | 182 | Ok(re) 183 | } 184 | 185 | // re_1 -> re_2 | re_1 re_2 (concatenation) 186 | fn parse_regex_1(input: ParseStream) -> syn::Result { 187 | let mut re = parse_regex_2(input)?; 188 | 189 | // Parse concatenations 190 | while input.peek(syn::token::Paren) 191 | || input.peek(syn::token::Dollar) 192 | || input.peek(syn::LitChar) 193 | || input.peek(syn::LitStr) 194 | || input.peek(syn::token::Bracket) 195 | || input.peek(syn::token::Underscore) 196 | { 197 | let re2 = parse_regex_2(input)?; 198 | re = Regex::Concat(Box::new(re), Box::new(re2)); // left associative 199 | } 200 | 201 | Ok(re) 202 | } 203 | 204 | // re_2 -> re_3 | re_3* | re_3? 
| re_3+ 205 | fn parse_regex_2(input: ParseStream) -> syn::Result { 206 | let mut re = parse_regex_3(input)?; 207 | 208 | loop { 209 | if input.peek(syn::token::Star) { 210 | let _ = input.parse::()?; 211 | re = Regex::ZeroOrMore(Box::new(re)); 212 | } else if input.peek(syn::token::Question) { 213 | let _ = input.parse::()?; 214 | re = Regex::ZeroOrOne(Box::new(re)); 215 | } else if input.peek(syn::token::Plus) { 216 | let _ = input.parse::()?; 217 | re = Regex::OneOrMore(Box::new(re)); 218 | } else { 219 | break; 220 | } 221 | } 222 | 223 | Ok(re) 224 | } 225 | 226 | // re_3 -> re_4 | re_4 # re_4 (left associative) 227 | fn parse_regex_3(input: ParseStream) -> syn::Result { 228 | let mut re = parse_regex_4(input)?; 229 | 230 | while input.peek(syn::token::Pound) { 231 | let _ = input.parse::()?; 232 | let re_2 = parse_regex_4(input)?; 233 | re = Regex::Diff(Box::new(re), Box::new(re_2)); 234 | } 235 | 236 | Ok(re) 237 | } 238 | 239 | // re_4 -> ( re_0 ) | $ | $x | $$x | _ | 'x' | "..." | [...] 240 | fn parse_regex_4(input: ParseStream) -> syn::Result { 241 | if input.peek(syn::token::Paren) { 242 | let parenthesized; 243 | syn::parenthesized!(parenthesized in input); 244 | parse_regex(&parenthesized) // no right ctx 245 | } else if input.peek(syn::token::Dollar) { 246 | let _ = input.parse::()?; 247 | if input.parse::().is_ok() { 248 | let ident = input.parse::()?; 249 | Ok(Regex::Builtin(Builtin(ident.to_string()))) 250 | } else { 251 | match input.parse::() { 252 | Ok(ident) => Ok(Regex::Var(Var(ident.to_string()))), 253 | Err(_) => Ok(Regex::EndOfInput), 254 | } 255 | } 256 | } else if input.peek(syn::LitChar) { 257 | let char = input.parse::()?; 258 | Ok(Regex::Char(char.value())) 259 | } else if input.peek(syn::LitStr) { 260 | let str = input.parse::()?; 261 | Ok(Regex::String(str.value())) 262 | } else if input.peek(syn::token::Bracket) { 263 | let bracketed; 264 | syn::bracketed!(bracketed in input); 265 | let char_set = parse_charset(&bracketed)?; 266 | Ok(Regex::CharSet(char_set)) 267 | } else if input.parse::().is_ok() { 268 | Ok(Regex::Any) 269 | } else { 270 | Err(syn::Error::new( 271 | proc_macro2::Span::call_site(), 272 | "Unable to parse regex", 273 | )) 274 | } 275 | } 276 | 277 | /// Parse a sequence of `` or `-`. 278 | fn parse_charset(input: ParseStream) -> syn::Result { 279 | let mut chars = vec![]; 280 | while !input.is_empty() { 281 | chars.push(parse_char_or_range(input)?); 282 | } 283 | Ok(CharSet(chars)) 284 | } 285 | 286 | /// Parse a `` or `-`. 
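/// For example, `'a'` parses as `CharOrRange::Char('a')` and `'a'-'z'` parses as `CharOrRange::Range('a', 'z')`.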
287 | fn parse_char_or_range(input: ParseStream) -> syn::Result { 288 | let char = input.parse::()?.value(); 289 | if input.peek(syn::token::Minus) { 290 | let _ = input.parse::()?; 291 | let char2 = input.parse::()?.value(); 292 | Ok(CharOrRange::Range(char, char2)) 293 | } else { 294 | Ok(CharOrRange::Char(char)) 295 | } 296 | } 297 | 298 | fn parse_rule_or_binding( 299 | input: ParseStream, 300 | semantic_action_table: &mut SemanticActionTable, 301 | ) -> syn::Result { 302 | if input.peek(syn::token::Let) { 303 | // Let binding 304 | input.parse::()?; 305 | let var = input.parse::()?; 306 | input.parse::()?; 307 | let re = parse_regex(input)?; 308 | input.parse::()?; 309 | Ok(RuleOrBinding::Binding(Binding { 310 | var: Var(var.to_string()), 311 | re, 312 | })) 313 | } else { 314 | // Rule 315 | let lhs = parse_regex_ctx(input)?; 316 | 317 | let rhs = if input.parse::().is_ok() { 318 | RuleRhs::None 319 | } else if input.parse::().is_ok() { 320 | let expr = input.parse::()?; 321 | input.parse::()?; 322 | RuleRhs::Rhs { 323 | expr, 324 | kind: RuleKind::Infallible, 325 | } 326 | } else if input.parse::().is_ok() { 327 | let kind = if input.peek(syn::token::Question) { 328 | let _ = input.parse::(); 329 | RuleKind::Fallible 330 | } else { 331 | RuleKind::Simple 332 | }; 333 | let expr = input.parse::()?; 334 | input.parse::()?; 335 | RuleRhs::Rhs { expr, kind } 336 | } else { 337 | panic!("Expected one of `,`, `=>`, `=?`, or `=` after a regex"); 338 | }; 339 | 340 | let rhs = semantic_action_table.add(rhs); 341 | 342 | Ok(RuleOrBinding::Rule(SingleRule { lhs, rhs })) 343 | } 344 | } 345 | 346 | fn parse_rule( 347 | input: ParseStream, 348 | semantic_action_table: &mut SemanticActionTable, 349 | ) -> syn::Result { 350 | if input.peek(syn::Ident) { 351 | // Named rules 352 | let ident = input.parse::()?; 353 | if ident != "rule" { 354 | return Err(syn::Error::new( 355 | ident.span(), 356 | "Unknown identifier, expected \"rule\", \"let\", or a regex", 357 | )); 358 | } 359 | let name = input.parse::()?; 360 | let braced; 361 | syn::braced!(braced in input); 362 | let mut rules = vec![]; 363 | while !braced.is_empty() { 364 | rules.push(parse_rule_or_binding(&braced, semantic_action_table)?); 365 | } 366 | // Consume optional trailing comma 367 | let _ = input.parse::(); 368 | Ok(Rule::RuleSet { name, rules }) 369 | } else if input.parse::().is_ok() { 370 | let ident = input.parse::()?; 371 | if ident != "Error" { 372 | panic!("Error type syntax is: `type Error = ...;`"); 373 | } 374 | input.parse::()?; 375 | let ty = input.parse::()?; 376 | input.parse::()?; 377 | Ok(Rule::ErrorType { ty }) 378 | } else { 379 | Ok(Rule::RuleOrBinding(parse_rule_or_binding( 380 | input, 381 | semantic_action_table, 382 | )?)) 383 | } 384 | } 385 | 386 | pub fn make_lexer_parser( 387 | semantic_action_table: &mut SemanticActionTable, 388 | ) -> impl FnOnce(ParseStream) -> Result + '_ { 389 | |input: ParseStream| { 390 | let attrs = input.call(syn::Attribute::parse_outer)?; 391 | 392 | let forked = input.fork(); 393 | let visibility = match forked.parse::() { 394 | Ok(visibility) => { 395 | input.advance_to(&forked); 396 | Some(visibility) 397 | } 398 | Err(_) => None, 399 | }; 400 | 401 | let type_name = input.parse::()?; 402 | 403 | let user_state_type = if input.peek(syn::token::Paren) { 404 | let parenthesized; 405 | syn::parenthesized!(parenthesized in input); 406 | Some(parenthesized.parse::()?) 
407 | } else { 408 | None 409 | }; 410 | 411 | input.parse::()?; 412 | let token_type = input.parse::()?; 413 | input.parse::()?; 414 | 415 | let mut rules = vec![]; 416 | while !input.is_empty() { 417 | rules.push(parse_rule(input, semantic_action_table)?); 418 | } 419 | 420 | Ok(Lexer { 421 | attrs, 422 | visibility, 423 | type_name, 424 | user_state_type, 425 | token_type, 426 | rules, 427 | }) 428 | } 429 | } 430 | -------------------------------------------------------------------------------- /crates/lexgen/src/builtin.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_camel_case_types)] 2 | 3 | // NB. We use this type instead of storing `&'static [...]`s directly to make debugging easier. 4 | #[derive(Debug, Clone, Copy)] 5 | pub enum BuiltinCharRange { 6 | Alphabetic, 7 | Alphanumeric, 8 | Ascii, 9 | AsciiAlphabetic, 10 | AsciiAlphanumeric, 11 | AsciiControl, 12 | AsciiDigit, 13 | AsciiGraphic, 14 | AsciiHexdigit, 15 | AsciiLowercase, 16 | AsciiPunctuation, 17 | AsciiUppercase, 18 | AsciiWhitespace, 19 | Control, 20 | Lowercase, 21 | Numeric, 22 | Uppercase, 23 | Whitespace, 24 | XID_Start, 25 | XID_Continue, 26 | } 27 | 28 | pub static BUILTIN_RANGES: [(&str, BuiltinCharRange); 20] = [ 29 | ("alphabetic", BuiltinCharRange::Alphabetic), 30 | ("alphanumeric", BuiltinCharRange::Alphanumeric), 31 | ("ascii", BuiltinCharRange::Ascii), 32 | ("ascii_alphabetic", BuiltinCharRange::AsciiAlphabetic), 33 | ("ascii_alphanumeric", BuiltinCharRange::AsciiAlphanumeric), 34 | ("ascii_control", BuiltinCharRange::AsciiControl), 35 | ("ascii_digit", BuiltinCharRange::AsciiDigit), 36 | ("ascii_graphic", BuiltinCharRange::AsciiGraphic), 37 | ("ascii_hexdigit", BuiltinCharRange::AsciiHexdigit), 38 | ("ascii_lowercase", BuiltinCharRange::AsciiLowercase), 39 | ("ascii_punctuation", BuiltinCharRange::AsciiPunctuation), 40 | ("ascii_uppercase", BuiltinCharRange::AsciiUppercase), 41 | ("ascii_whitespace", BuiltinCharRange::AsciiWhitespace), 42 | ("control", BuiltinCharRange::Control), 43 | ("lowercase", BuiltinCharRange::Lowercase), 44 | ("numeric", BuiltinCharRange::Numeric), 45 | ("uppercase", BuiltinCharRange::Uppercase), 46 | ("whitespace", BuiltinCharRange::Whitespace), 47 | ("XID_Start", BuiltinCharRange::XID_Start), 48 | ("XID_Continue", BuiltinCharRange::XID_Continue), 49 | ]; 50 | 51 | impl BuiltinCharRange { 52 | pub fn get_ranges(&self) -> &'static [(u32, u32)] { 53 | use crate::char_ranges::*; 54 | 55 | match self { 56 | BuiltinCharRange::Alphabetic => &ALPHABETIC, 57 | BuiltinCharRange::Alphanumeric => &ALPHANUMERIC, 58 | BuiltinCharRange::Ascii => &ASCII, 59 | BuiltinCharRange::AsciiAlphabetic => &ASCII_ALPHABETIC, 60 | BuiltinCharRange::AsciiAlphanumeric => &ASCII_ALPHANUMERIC, 61 | BuiltinCharRange::AsciiControl => &ASCII_CONTROL, 62 | BuiltinCharRange::AsciiDigit => &ASCII_DIGIT, 63 | BuiltinCharRange::AsciiGraphic => &ASCII_GRAPHIC, 64 | BuiltinCharRange::AsciiHexdigit => &ASCII_HEXDIGIT, 65 | BuiltinCharRange::AsciiLowercase => &ASCII_LOWERCASE, 66 | BuiltinCharRange::AsciiPunctuation => &ASCII_PUNCTUATION, 67 | BuiltinCharRange::AsciiUppercase => &ASCII_UPPERCASE, 68 | BuiltinCharRange::AsciiWhitespace => &ASCII_WHITESPACE, 69 | BuiltinCharRange::Control => &CONTROL, 70 | BuiltinCharRange::Lowercase => &LOWERCASE, 71 | BuiltinCharRange::Numeric => &NUMERIC, 72 | BuiltinCharRange::Uppercase => &UPPERCASE, 73 | BuiltinCharRange::Whitespace => &WHITESPACE, 74 | BuiltinCharRange::XID_Start => &XID_START, 75 | BuiltinCharRange::XID_Continue 
=> &XID_CONTINUE, 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /crates/lexgen/src/collections.rs: -------------------------------------------------------------------------------- 1 | use rustc_hash::{FxHashMap, FxHashSet}; 2 | 3 | pub type Set<K> = FxHashSet<K>; 4 | 5 | pub type Map<K, V> = FxHashMap<K, V>; 6 | -------------------------------------------------------------------------------- /crates/lexgen/src/dfa.rs: -------------------------------------------------------------------------------- 1 | mod backtrack; 2 | pub mod codegen; 3 | pub mod simplify; 4 | 5 | #[cfg(test)] 6 | pub mod simulate; 7 | 8 | use crate::collections::{Map, Set}; 9 | use crate::nfa::AcceptingState; 10 | use crate::range_map::{Range, RangeMap}; 11 | pub(crate) use backtrack::update_backtracks; 12 | 13 | use std::convert::TryFrom; 14 | use std::iter::{FromIterator, IntoIterator}; 15 | 16 | /// Deterministic finite automaton, parameterized on values of accepting states. 17 | #[derive(Debug)] 18 | pub struct DFA<T, A> { 19 | // Indexed by `StateIdx` 20 | states: Vec<State<T, A>>, 21 | } 22 | 23 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 24 | pub struct StateIdx(usize); 25 | 26 | impl StateIdx { 27 | fn map<F>(&self, f: F) -> StateIdx 28 | where 29 | F: Fn(usize) -> usize, 30 | { 31 | StateIdx(f(self.0)) 32 | } 33 | } 34 | 35 | #[derive(Debug)] 36 | pub struct State<T, A> { 37 | /// Whether the state is the initial state of a rule set. This is used when inlining states 38 | /// with single predecessors. Initial states cannot be inlined as there may be references to 39 | /// these states in semantic actions. 40 | initial: bool, 41 | char_transitions: Map<char, T>, 42 | range_transitions: RangeMap<T>, 43 | any_transition: Option<T>, 44 | end_of_input_transition: Option<T>, 45 | accepting: Vec<AcceptingState<A>>, 46 | /// Predecessors of the state, used to inline code for a state with one predecessor in the 47 | /// predecessor's code.
48 | predecessors: Set<StateIdx>, 49 | backtrack: bool, 50 | } 51 | 52 | impl<T, A> State<T, A> { 53 | fn new() -> State<T, A> { 54 | State { 55 | initial: false, 56 | char_transitions: Default::default(), 57 | range_transitions: Default::default(), 58 | any_transition: None, 59 | end_of_input_transition: None, 60 | accepting: vec![], 61 | predecessors: Default::default(), 62 | backtrack: false, 63 | } 64 | } 65 | 66 | fn has_no_transitions(&self) -> bool { 67 | self.char_transitions.is_empty() 68 | && self.range_transitions.is_empty() 69 | && self.any_transition.is_none() 70 | && self.end_of_input_transition.is_none() 71 | } 72 | } 73 | 74 | impl<A> DFA<StateIdx, A> { 75 | pub fn new() -> (DFA<StateIdx, A>, StateIdx) { 76 | let mut initial_state = State::new(); 77 | initial_state.initial = true; 78 | ( 79 | DFA { 80 | states: vec![initial_state], 81 | }, 82 | StateIdx(0), 83 | ) 84 | } 85 | 86 | pub fn initial_state(&self) -> StateIdx { 87 | StateIdx(0) 88 | } 89 | 90 | pub fn make_state_accepting(&mut self, state: StateIdx, accept: AcceptingState<A>) { 91 | self.states[state.0].accepting.push(accept); 92 | } 93 | 94 | pub fn new_state(&mut self) -> StateIdx { 95 | let new_state_idx = StateIdx(self.states.len()); 96 | self.states.push(State::new()); 97 | new_state_idx 98 | } 99 | 100 | pub fn is_accepting_state(&self, state: StateIdx) -> bool { 101 | !self.states[state.0].accepting.is_empty() 102 | } 103 | 104 | pub fn add_char_transition(&mut self, state: StateIdx, char: char, next: StateIdx) { 105 | let old = self.states[state.0].char_transitions.insert(char, next); 106 | assert!( 107 | old.is_none(), 108 | "state={:?}, char={:?}, old={:?}, new={:?}", 109 | state, 110 | char, 111 | old, 112 | next 113 | ); 114 | 115 | self.states[next.0].predecessors.insert(state); 116 | } 117 | 118 | pub fn set_range_transitions(&mut self, state: StateIdx, range_map: RangeMap<StateIdx>) { 119 | assert!(self.states[state.0].range_transitions.is_empty()); 120 | 121 | for range in range_map.iter() { 122 | self.states[range.value.0].predecessors.insert(state); 123 | } 124 | 125 | self.states[state.0].range_transitions = range_map; 126 | } 127 | 128 | pub fn set_any_transition(&mut self, state: StateIdx, next: StateIdx) { 129 | assert!(self.states[state.0].any_transition.is_none()); 130 | self.states[state.0].any_transition = Some(next); 131 | self.states[next.0].predecessors.insert(state); 132 | } 133 | 134 | pub fn set_end_of_input_transition(&mut self, state: StateIdx, next: StateIdx) { 135 | assert!(self.states[state.0].end_of_input_transition.is_none()); 136 | self.states[state.0].end_of_input_transition = Some(next); 137 | self.states[next.0].predecessors.insert(state); 138 | } 139 | } 140 | 141 | impl<T, A> DFA<T, A> { 142 | fn from_states(states: Vec<State<T, A>>) -> DFA<T, A> { 143 | DFA { states } 144 | } 145 | 146 | pub fn into_state_indices(self) -> impl Iterator<Item = (StateIdx, State<T, A>)> { 147 | self.states 148 | .into_iter() 149 | .enumerate() 150 | .map(|(state_idx, state)| (StateIdx(state_idx), state)) 151 | } 152 | } 153 | 154 | impl<T, A> FromIterator<(StateIdx, State<T, A>)> for DFA<T, A> { 155 | fn from_iter<I>(iter: I) -> Self 156 | where 157 | I: IntoIterator<Item = (StateIdx, State<T, A>)>, 158 | { 159 | let mut states: Vec<(StateIdx, State<T, A>)> = iter.into_iter().collect(); 160 | states.sort_by_key(|&(state_idx, _)| state_idx); 161 | 162 | DFA { 163 | states: states.into_iter().map(|(_, state)| state).collect(), 164 | } 165 | } 166 | } 167 | 168 | impl<A> DFA<StateIdx, A> { 169 | /// Extend the current DFA with another DFA. The extended DFA's states will be renumbered. This 170 | /// does not add any transitions from the original DFA states to the extension.
Accepting 171 | /// states of the extension is preserved. 172 | /// 173 | /// Returns initial state for the extension in the new DFA. 174 | pub fn add_dfa(&mut self, other: DFA) -> StateIdx { 175 | let n_current_states = self.states.len(); 176 | 177 | for State { 178 | initial, 179 | char_transitions, 180 | range_transitions, 181 | any_transition, 182 | end_of_input_transition, 183 | accepting, 184 | predecessors, 185 | backtrack, 186 | } in other.states 187 | { 188 | let mut new_char_transitions: Map = Default::default(); 189 | let mut new_any_transition: Option = None; 190 | let mut new_end_of_input_transition: Option = None; 191 | 192 | for (char, next) in char_transitions { 193 | new_char_transitions.insert(char, StateIdx(next.0 + n_current_states)); 194 | } 195 | 196 | let new_range_transitions = 197 | range_transitions.map(|state_idx| StateIdx(state_idx.0 + n_current_states)); 198 | 199 | if let Some(next) = any_transition { 200 | new_any_transition = Some(StateIdx(next.0 + n_current_states)); 201 | } 202 | 203 | if let Some(next) = end_of_input_transition { 204 | new_end_of_input_transition = Some(StateIdx(next.0 + n_current_states)); 205 | } 206 | 207 | let predecessors = predecessors 208 | .into_iter() 209 | .map(|pred| StateIdx(pred.0 + n_current_states)) 210 | .collect(); 211 | 212 | self.states.push(State { 213 | initial, 214 | char_transitions: new_char_transitions, 215 | range_transitions: new_range_transitions, 216 | any_transition: new_any_transition, 217 | end_of_input_transition: new_end_of_input_transition, 218 | accepting, 219 | predecessors, 220 | backtrack, 221 | }); 222 | } 223 | 224 | StateIdx(n_current_states) 225 | } 226 | } 227 | 228 | use std::fmt::{self, Display, Formatter}; 229 | 230 | impl Display for StateIdx { 231 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 232 | self.0.fmt(f) 233 | } 234 | } 235 | 236 | impl Display for DFA { 237 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 238 | for (state_idx, state) in self.states.iter().enumerate() { 239 | let State { 240 | initial, 241 | char_transitions, 242 | range_transitions, 243 | any_transition, 244 | end_of_input_transition, 245 | accepting, 246 | predecessors: _, 247 | backtrack, 248 | } = state; 249 | 250 | if !accepting.is_empty() { 251 | if *initial { 252 | write!(f, "{:>5}:", format!("i*{}", state_idx))?; 253 | } else { 254 | write!(f, "{:>5}:", format!("*{}", state_idx))?; 255 | } 256 | } else { 257 | if *initial { 258 | write!(f, "{:>5}:", format!("i{}", state_idx))?; 259 | } else { 260 | write!(f, "{:>5}:", state_idx)?; 261 | } 262 | } 263 | 264 | let mut first = true; 265 | 266 | for (char, next) in char_transitions.iter() { 267 | if !first { 268 | write!(f, " ")?; 269 | } else { 270 | first = false; 271 | } 272 | 273 | writeln!(f, "{:?} -> {}", char, next)?; 274 | } 275 | 276 | for Range { start, end, value } in range_transitions.iter() { 277 | if !first { 278 | write!(f, " ")?; 279 | } else { 280 | first = false; 281 | } 282 | 283 | writeln!( 284 | f, 285 | "{:?} - {:?} -> {}", 286 | char::try_from(*start).unwrap(), 287 | char::try_from(*end).unwrap(), 288 | value, 289 | )?; 290 | } 291 | 292 | if let Some(next) = any_transition { 293 | if !first { 294 | write!(f, " ")?; 295 | } else { 296 | first = false; 297 | } 298 | 299 | writeln!(f, "_ -> {}", next)?; 300 | } 301 | 302 | if let Some(next) = end_of_input_transition { 303 | if !first { 304 | write!(f, " ")?; 305 | } 306 | 307 | writeln!(f, "$ -> {}", next)?; 308 | } 309 | 310 | if *backtrack { 311 | if !first { 312 | 
--------------------------------------------------------------------------------
/crates/lexgen/src/dfa/backtrack.rs:
--------------------------------------------------------------------------------
1 | use crate::collections::Map;
2 | use crate::dfa::{StateIdx, DFA};
3 | 
4 | use std::collections::hash_map::Entry;
5 | 
6 | pub(crate) fn update_backtracks<A>(dfa: &mut DFA<StateIdx, A>) {
7 |     // Work list of states, paired with whether the state should backtrack when stuck, i.e. whether an accepting state was visited on the way to it.
8 |     let mut work_list: Vec<(StateIdx, bool)> = dfa
9 |         .states
10 |         .iter()
11 |         .enumerate()
12 |         .filter_map(|(state_idx, state)| {
13 |             if state.initial {
14 |                 Some((StateIdx(state_idx), false))
15 |             } else {
16 |                 None
17 |             }
18 |         })
19 |         .collect();
20 | 
21 |     // Set of visited nodes, with their backtrack state when visited. If a state's backtrack
22 |     // property changes, we visit it again to make its successors backtrack.
23 |     let mut visited: Map<StateIdx, bool> = Default::default();
24 | 
25 |     while let Some((state, backtrack)) = work_list.pop() {
26 |         // Did we visit the state, with the right backtrack state?
27 |         match visited.entry(state) {
28 |             Entry::Occupied(mut entry) => {
29 |                 if *entry.get() == backtrack {
30 |                     continue;
31 |                 }
32 |                 entry.insert(backtrack);
33 |             }
34 |             Entry::Vacant(entry) => {
35 |                 entry.insert(backtrack);
36 |             }
37 |         }
38 | 
39 |         // Whether the successor states should backtrack.
40 |         let successor_backtrack = backtrack || dfa.is_accepting_state(state);
41 | 
42 |         for next in dfa.states[state.0].char_transitions.values() {
43 |             work_list.push((*next, successor_backtrack));
44 |         }
45 | 
46 |         for next_range in dfa.states[state.0].range_transitions.iter() {
47 |             work_list.push((next_range.value, successor_backtrack));
48 |         }
49 | 
50 |         if let Some(next) = dfa.states[state.0].any_transition {
51 |             work_list.push((next, successor_backtrack));
52 |         }
53 | 
54 |         if let Some(next) = dfa.states[state.0].end_of_input_transition {
55 |             work_list.push((next, successor_backtrack));
56 |         }
57 |     }
58 | 
59 |     assert_eq!(visited.len(), dfa.states.len());
60 | 
61 |     for (state, backtrack) in visited {
62 |         dfa.states[state.0].backtrack = backtrack;
63 |     }
64 | }
65 | 
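// A worked example (editorial sketch, not part of the original source): with the rules
// `'a'+ 'b'` and `'a'`, the DFA accepts right after the first 'a'. Every state reachable
// from that accepting state (those matching further 'a's or the trailing 'b') is marked
// `backtrack = true`: when the generated lexer gets stuck in one of them, it backs up to
// the last accepted match of `'a'` instead of failing. The pass is a fixpoint worklist:
// a state is revisited whenever its `backtrack` flag flips, so states shared between
// accepting and non-accepting paths still end up marked.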
--------------------------------------------------------------------------------
/crates/lexgen/src/dfa/codegen/ctx.rs:
--------------------------------------------------------------------------------
1 | use super::search_table::SearchTableSet;
2 | use super::StateIdx;
3 | use super::DFA;
4 | use crate::ast::RuleRhs;
5 | use crate::collections::Map;
6 | use crate::dfa::simplify::Trans;
7 | use crate::semantic_action_table::{SemanticActionIdx, SemanticActionTable};
8 | 
9 | /// Code generation state
10 | pub struct CgCtx {
11 |     /// Maps semantic action indices to expressions. Used to generate semantic action functions.
12 |     semantic_action_table: SemanticActionTable,
13 | 
14 |     /// Name of the lexer: `MyLexer` in `lexer! { MyLexer -> MyToken; }`
15 |     lexer_name: syn::Ident,
16 | 
17 |     /// Type of the values the lexer will produce: `MyToken` in `lexer! { MyLexer -> MyToken; }`
18 |     token_type: syn::Type,
19 | 
20 |     /// Type of the user error, when available: the type declared with `type Error = ...;`.
21 |     user_error_type: Option<syn::Type>,
22 | 
23 |     /// Maps user-written rule names (e.g. `rule MyRule { ... }`) to their initial states in the
24 |     /// final DFA.
25 |     rule_states: Map<String, StateIdx>,
26 | 
27 |     /// Sorted vector of states with only one predecessor. These states will be inlined in the
28 |     /// predecessor states and won't appear in the final code. Inlining these states significantly
29 |     /// improves code size and runtime performance.
30 |     ///
31 |     /// This vector is used to map non-inlined states to their final state indices in the generated
32 |     /// code. For example, if this vector is `[5]`, state 5 is skipped, and states after 5 are
33 |     /// decremented by 1, so state 6 becomes 5, etc.
34 |     inlined_states: Vec<StateIdx>,
35 | 
36 |     /// Mutable parts of the codegen state
37 |     codegen_state: CgState,
38 | }
39 | 
40 | struct CgState {
41 |     /// Binary search tables generated so far
42 |     search_tables: SearchTableSet,
43 | }
44 | 
45 | impl CgCtx {
46 |     pub fn new(
47 |         dfa: &DFA<Trans<SemanticActionIdx>, SemanticActionIdx>,
48 |         semantic_action_table: SemanticActionTable,
49 |         lexer_name: syn::Ident,
50 |         token_type: syn::Type,
51 |         user_error_type: Option<syn::Type>,
52 |         rule_states: Map<String, StateIdx>,
53 |     ) -> CgCtx {
54 |         let inlined_states: Vec<StateIdx> = dfa
55 |             .states
56 |             .iter()
57 |             .enumerate()
58 |             .filter_map(|(state_idx, state)| {
59 |                 if state.predecessors.len() == 1 {
60 |                     Some(StateIdx(state_idx))
61 |                 } else {
62 |                     None
63 |                 }
64 |             })
65 |             .collect();
66 | 
67 |         CgCtx {
68 |             semantic_action_table,
69 |             lexer_name,
70 |             token_type,
71 |             user_error_type,
72 |             rule_states,
73 |             inlined_states,
74 |             codegen_state: CgState {
75 |                 search_tables: SearchTableSet::new(),
76 |             },
77 |         }
78 |     }
79 | 
80 |     pub fn lexer_name(&self) -> &syn::Ident {
81 |         &self.lexer_name
82 |     }
83 | 
84 |     /// Renumber a state index, taking inlined states into account.
85 |     pub fn renumber_state(&self, state: StateIdx) -> StateIdx {
86 |         match self.inlined_states.binary_search(&state) {
87 |             Ok(idx) | Err(idx) => state.map(|state_idx| state_idx - idx),
88 |         }
89 |     }
90 | 
91 |     pub fn n_inlined_states(&self) -> usize {
92 |         self.inlined_states.len()
93 |     }
94 | 
95 |     pub fn token_type(&self) -> &syn::Type {
96 |         &self.token_type
97 |     }
98 | 
99 |     pub fn user_error_type(&self) -> Option<&syn::Type> {
100 |         self.user_error_type.as_ref()
101 |     }
102 | 
103 |     pub fn add_search_table(&mut self, ranges: Vec<(char, char)>) -> syn::Ident {
104 |         self.codegen_state.search_tables.add_table(ranges)
105 |     }
106 | 
107 |     pub fn take_search_tables(&mut self) -> SearchTableSet {
108 |         std::mem::replace(&mut self.codegen_state.search_tables, SearchTableSet::new())
109 |     }
110 | 
111 |     pub fn rule_states(&self) -> &Map<String, StateIdx> {
112 |         &self.rule_states
113 |     }
114 | 
115 |     pub fn iter_semantic_actions(&self) -> impl Iterator<Item = (SemanticActionIdx, &RuleRhs)> {
116 |         self.semantic_action_table.iter()
117 |     }
118 | 
119 |     pub fn semantic_action_fn_ident(&self, action: SemanticActionIdx) -> syn::Ident {
120 |         syn::Ident::new(
121 |             &format!("{}_ACTION_{}", self.lexer_name, action.as_usize()),
122 |             self.lexer_name.span(),
123 |         )
124 |     }
125 | }
126 | 
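// A worked example (editorial sketch, not part of the original source) of how
// `renumber_state` uses `binary_search` on the sorted `inlined_states`. With
// `inlined_states = [2, 5]`:
//
//     renumber_state(StateIdx(1))   // Err(0) -> 1 - 0 = StateIdx(1)
//     renumber_state(StateIdx(4))   // Err(1) -> 4 - 1 = StateIdx(3)
//     renumber_state(StateIdx(7))   // Err(2) -> 7 - 2 = StateIdx(5)
//
// i.e. each surviving state is shifted down by the number of inlined states that
// precede it, which is exactly the `idx` returned by the binary search.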
--------------------------------------------------------------------------------
/crates/lexgen/src/dfa/codegen/search_table.rs:
--------------------------------------------------------------------------------
1 | use crate::collections::Map;
2 | 
3 | use std::collections::hash_map::Entry;
4 | 
5 | pub struct SearchTableSet {
6 |     tables: Map<Vec<(char, char)>, syn::Ident>,
7 | }
8 | 
9 | impl SearchTableSet {
10 |     pub fn new() -> SearchTableSet {
11 |         SearchTableSet {
12 |             tables: Default::default(),
13 |         }
14 |     }
15 | 
16 |     pub fn add_table(&mut self, ranges: Vec<(char, char)>) -> syn::Ident {
17 |         let n_tables = self.tables.len();
18 |         match self.tables.entry(ranges) {
19 |             Entry::Occupied(entry) => entry.get().clone(),
20 |             Entry::Vacant(entry) => {
21 |                 let ident = syn::Ident::new(
22 |                     &format!("RANGE_TABLE_{}", n_tables),
23 |                     proc_macro2::Span::call_site(),
24 |                 );
25 |                 entry.insert(ident.clone());
26 |                 ident
27 |             }
28 |         }
29 |     }
30 | 
31 |     pub fn iter(&self) -> impl Iterator<Item = (&[(char, char)], &syn::Ident)> {
32 |         self.tables
33 |             .iter()
34 |             .map(|(ranges, ident)| (ranges.as_slice(), ident))
35 |     }
36 | 
37 |     pub fn is_empty(&self) -> bool {
38 |         self.tables.is_empty()
39 |     }
40 | }
41 | 
--------------------------------------------------------------------------------
/crates/lexgen/src/dfa/simplify.rs:
--------------------------------------------------------------------------------
1 | use super::{State, StateIdx, DFA};
2 | use crate::collections::Map;
3 | use crate::nfa::AcceptingState;
4 | 
5 | #[derive(Debug)]
6 | pub enum Trans<A> {
7 |     Accept(Vec<AcceptingState<A>>),
8 |     Trans(StateIdx),
9 | }
10 | 
11 | /// Removes accepting states with no outgoing transitions, and turns the transitions into those states into accepting transitions (`Trans::Accept`).
12 | pub fn simplify<A: Clone>(
13 |     dfa: DFA<StateIdx, A>,
14 |     dfa_state_indices: &mut Map<String, StateIdx>,
15 | ) -> DFA<Trans<A>, A> {
16 |     let mut empty_states: Vec<(StateIdx, Vec<AcceptingState<A>>)> = vec![];
17 | 
18 |     let mut non_empty_states: Vec<(StateIdx, State<StateIdx, A>)> = vec![];
19 | 
20 |     for (state_idx, state) in dfa.into_state_indices() {
21 |         if state.has_no_transitions() && !state.initial {
22 |             empty_states.push((state_idx, state.accepting));
23 |         } else {
24 |             non_empty_states.push((state_idx, state));
25 |         }
26 |     }
27 | 
28 |     for (_, t) in dfa_state_indices.iter_mut() {
29 |         let idx = match empty_states.binary_search_by(|(state_idx, _)| state_idx.cmp(t)) {
30 |             Ok(idx) | Err(idx) => idx,
31 |         };
32 |         *t = t.map(|i| i - idx);
33 |     }
34 | 
35 |     let map_transition = |t: StateIdx| -> Trans<A> {
36 |         match empty_states.binary_search_by(|(state_idx, _action)| state_idx.cmp(&t)) {
37 |             Ok(idx) => Trans::Accept(empty_states[idx].1.clone()),
38 |             Err(idx) => Trans::Trans(t.map(|i| i - idx)),
39 |         }
40 |     };
41 | 
42 |     let new_states: Vec<State<Trans<A>, A>> = non_empty_states
43 |         .into_iter()
44 |         .map(|(_state_idx, state)| {
45 |             let State {
46 |                 initial,
47 |                 char_transitions,
48 |                 range_transitions,
49 |                 any_transition,
50 |                 end_of_input_transition,
51 |                 accepting,
52 |                 predecessors,
53 |                 backtrack,
54 |             } = state;
55 | 
56 |             let char_transitions = char_transitions
57 |                 .into_iter()
58 |                 .map(|(char, next)| (char, map_transition(next)))
59 |                 .collect();
60 | 
61 |             let range_transitions = range_transitions.map(map_transition);
62 | 
63 |             let any_transition = any_transition.map(map_transition);
64 | 
65 |             let end_of_input_transition = end_of_input_transition.map(map_transition);
66 | 
67 |             let predecessors = predecessors
68 |                 .into_iter()
69 |                 .map(|pred| match map_transition(pred) {
70 |                     Trans::Trans(pred) => pred,
71 |                     _ => {
72 |                         // This pass should only remove nodes without successors, so it's a bug if
73 |                         // we remove a predecessor
74 |                         panic!("Predecessor of a state is removed in simplification")
75 |                     }
76 |                 })
77 |                 .collect();
78 | 
79 |             State {
80 |                 initial,
81 |                 char_transitions,
82 |                 range_transitions,
83 |                 any_transition,
84 |                 end_of_input_transition,
85 |                 accepting,
86 |                 predecessors,
87 |                 backtrack,
88 |             }
89 |         })
90 |         .collect();
91 | 
92 |     DFA::from_states(new_states)
93 | }
94 | 
--------------------------------------------------------------------------------
/crates/lexgen/src/dfa/simulate.rs:
--------------------------------------------------------------------------------
1 | use super::{StateIdx, DFA};
2 | 
3 | pub 
use crate::nfa::simulate::{ErrorLoc, Matches}; 4 | use crate::nfa::AcceptingState; 5 | use crate::range_map::Range; 6 | use crate::right_ctx::RightCtxDFAs; 7 | 8 | impl DFA { 9 | pub fn simulate<'input>( 10 | &self, 11 | input: &'input str, 12 | right_ctx_dfas: &RightCtxDFAs, 13 | ) -> (Matches<'input, A>, Option) { 14 | let mut values: Matches<'input, A> = vec![]; 15 | 16 | // Current state 17 | let mut state = StateIdx(0); 18 | 19 | // See comments for the same variable in NFA simulation 20 | let mut last_match: Option<(usize, A, usize)> = None; 21 | 22 | let mut char_indices = input.char_indices(); 23 | 24 | // Where the current match starts 25 | let mut match_start = 0; 26 | 27 | // Index of current character in input string 28 | let mut char_idx: usize; 29 | 30 | 'outer: loop { 31 | while let Some((char_idx_, char)) = char_indices.next() { 32 | char_idx = match_start + char_idx_; 33 | 34 | match next(self, state, char) { 35 | None => { 36 | match last_match.take() { 37 | None => { 38 | // We're stuck and can't backtrack, raise an error 39 | return (values, Some(match_start)); 40 | } 41 | Some((last_match_start, last_match_value, last_match_end)) => { 42 | // Backtrack to the previous accepting state 43 | match_start = last_match_end; 44 | char_indices = input[match_start..].char_indices(); 45 | 46 | // Accept the previous match 47 | values.push(( 48 | &input[last_match_start..last_match_end], 49 | last_match_value, 50 | )); 51 | 52 | // Restart state machine 53 | state = StateIdx(0); 54 | } 55 | } 56 | } 57 | Some(next_state) => { 58 | state = next_state; 59 | 60 | // Check for accepting state 61 | for AcceptingState { value, right_ctx } in &self.states[state.0].accepting { 62 | match right_ctx { 63 | None => { 64 | last_match = 65 | Some((match_start, *value, char_idx + char.len_utf8())); 66 | break; 67 | } 68 | Some(right_ctx_idx) => { 69 | let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); 70 | if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { 71 | last_match = 72 | Some((match_start, *value, char_idx + char.len_utf8())); 73 | break; 74 | } 75 | } 76 | } 77 | } 78 | } 79 | } 80 | } 81 | 82 | // Reached EOF, take EOF transition, check for accepting states 83 | if let Some(next) = next_end_of_input(self, state) { 84 | // Check for accepting state 85 | state = next; 86 | for AcceptingState { value, right_ctx } in &self.states[state.0].accepting { 87 | match right_ctx { 88 | None => { 89 | values.push((&input[match_start..], *value)); 90 | break 'outer; 91 | } 92 | Some(right_ctx_idx) => { 93 | let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); 94 | if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { 95 | values.push((&input[match_start..], *value)); 96 | break 'outer; 97 | } 98 | } 99 | } 100 | } 101 | } 102 | 103 | // Reached EOF but cannot accept input, backtrack if possible, otherwise raise an error 104 | match last_match.take() { 105 | Some((last_match_start, last_match_value, last_match_end)) => { 106 | values.push((&input[last_match_start..last_match_end], last_match_value)); 107 | 108 | if last_match_end == input.len() { 109 | break 'outer; 110 | } else { 111 | // Backtrack 112 | match_start = last_match_end; 113 | char_indices = input[match_start..].char_indices(); 114 | 115 | // Restart state machine 116 | state = StateIdx(0); 117 | } 118 | } 119 | None => { 120 | // We're stuck and can't backtrack, raise an error 121 | return (values, Some(match_start)); 122 | } 123 | } 124 | } 125 | 126 | (values, None) 127 | } 128 | } 129 | 130 | fn next(dfa: 
&DFA, state: StateIdx, char: char) -> Option { 131 | let state = &dfa.states[state.0]; 132 | 133 | if let Some(next) = state.char_transitions.get(&char) { 134 | return Some(*next); 135 | } 136 | 137 | for range in state.range_transitions.iter() { 138 | let Range { start, end, value } = range; 139 | if char as u32 >= *start && char as u32 <= *end { 140 | return Some(*value); 141 | } 142 | } 143 | 144 | if let Some(next) = state.any_transition { 145 | return Some(next); 146 | } 147 | 148 | None 149 | } 150 | 151 | fn next_end_of_input(dfa: &DFA, state: StateIdx) -> Option { 152 | dfa.states[state.0].end_of_input_transition 153 | } 154 | 155 | // Similar to `simulate`, but does not keep track of the last match as we don't need "longest 156 | // match" semantics and backtracking 157 | pub fn simulate_right_ctx(dfa: &DFA, char_indices: std::str::CharIndices) -> bool { 158 | let mut state = dfa.initial_state(); 159 | 160 | if dfa.is_accepting_state(state) { 161 | return true; 162 | } 163 | 164 | for (_, char) in char_indices { 165 | match next(dfa, state, char) { 166 | None => { 167 | // Stuck 168 | return false; 169 | } 170 | Some(next_state) => { 171 | if dfa.is_accepting_state(next_state) { 172 | return true; 173 | } 174 | 175 | state = next_state; 176 | } 177 | } 178 | } 179 | 180 | match next_end_of_input(dfa, state) { 181 | None => false, 182 | Some(next_state) => dfa.is_accepting_state(next_state), 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /crates/lexgen/src/display.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeSet, HashSet}; 2 | use std::fmt::{self, Display, Formatter}; 3 | 4 | pub struct BTreeSetDisplay<'a, A: Display>(pub &'a BTreeSet); 5 | 6 | pub struct HashSetDisplay<'a, A: Display, S>(pub &'a HashSet); 7 | 8 | impl Display for BTreeSetDisplay<'_, A> { 9 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 10 | display_set(f, self.0.len(), &mut self.0.iter()) 11 | } 12 | } 13 | 14 | impl Display for HashSetDisplay<'_, A, S> { 15 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 16 | display_set(f, self.0.len(), &mut self.0.iter()) 17 | } 18 | } 19 | 20 | fn display_set( 21 | f: &mut Formatter<'_>, 22 | n_elems: usize, 23 | elems: &mut dyn Iterator, 24 | ) -> fmt::Result { 25 | write!(f, "{{")?; 26 | 27 | for (elem_idx, elem) in elems.enumerate() { 28 | write!(f, "{}", elem)?; 29 | if elem_idx != n_elems - 1 { 30 | write!(f, ", ")?; 31 | } 32 | } 33 | 34 | write!(f, "}}") 35 | } 36 | -------------------------------------------------------------------------------- /crates/lexgen/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Please see the [project README][1] for usage. 2 | //! 3 | //! 
[1]: https://github.com/osa1/lexgen 4 | 5 | #![allow( 6 | clippy::collapsible_else_if, 7 | clippy::enum_variant_names, 8 | clippy::too_many_arguments, 9 | clippy::upper_case_acronyms, 10 | clippy::large_enum_variant 11 | )] 12 | 13 | mod ast; 14 | mod builtin; 15 | mod char_ranges; 16 | mod collections; 17 | mod dfa; 18 | mod display; 19 | mod nfa; 20 | mod nfa_to_dfa; 21 | mod range_map; 22 | mod regex_to_nfa; 23 | mod right_ctx; 24 | mod semantic_action_table; 25 | 26 | #[cfg(test)] 27 | mod tests; 28 | 29 | use ast::{Binding, Lexer, Regex, RegexCtx, Rule, RuleOrBinding, SingleRule, Var}; 30 | use collections::Map; 31 | use dfa::{StateIdx as DfaStateIdx, DFA}; 32 | use nfa::NFA; 33 | use nfa_to_dfa::nfa_to_dfa; 34 | use right_ctx::RightCtxDFAs; 35 | use semantic_action_table::{SemanticActionIdx, SemanticActionTable}; 36 | 37 | use std::collections::hash_map::Entry; 38 | 39 | use proc_macro::TokenStream; 40 | use syn::parse::Parser; 41 | 42 | #[proc_macro] 43 | pub fn lexer(input: TokenStream) -> TokenStream { 44 | let mut semantic_action_table = SemanticActionTable::new(); 45 | 46 | let Lexer { 47 | attrs, 48 | visibility, 49 | type_name, 50 | user_state_type, 51 | token_type, 52 | rules: top_level_rules, 53 | } = match ast::make_lexer_parser(&mut semantic_action_table).parse(input) { 54 | Ok(lexer) => lexer, 55 | Err(error) => return TokenStream::from(error.to_compile_error()), 56 | }; 57 | 58 | // Maps DFA names to their initial states in the final DFA 59 | let mut dfas: Map = Default::default(); 60 | 61 | // DFAs generated for right contexts 62 | let mut right_ctx_dfas = RightCtxDFAs::new(); 63 | 64 | let mut bindings: Map = Default::default(); 65 | 66 | let mut init_dfa: Option> = None; 67 | 68 | let mut user_error_type: Option = None; 69 | 70 | let mut unnamed_nfa: NFA = NFA::new(); 71 | 72 | // Mixing named and unnamed rules is not allowed 73 | { 74 | let mut named = false; 75 | let mut unnamed = false; 76 | for rule in &top_level_rules { 77 | match rule { 78 | Rule::RuleOrBinding(RuleOrBinding::Rule { .. }) => unnamed = true, 79 | Rule::RuleSet { .. } => named = true, 80 | _ => {} 81 | } 82 | } 83 | if named && unnamed { 84 | panic!( 85 | "Unnamed rules cannot be mixed with named rules. Make sure to either \ 86 | have all your rules in `rule ... {} ... 
{}` syntax, or remove `rule`s \ 87 | entirely and have your rules at the top-level.", 88 | '{', '}', 89 | ); 90 | } 91 | } 92 | 93 | for rule in top_level_rules { 94 | match rule { 95 | Rule::ErrorType { ty } => match user_error_type { 96 | None => { 97 | user_error_type = Some(ty); 98 | } 99 | Some(_) => panic!("Error type defined multiple times"), 100 | }, 101 | 102 | Rule::RuleOrBinding(RuleOrBinding::Binding(Binding { var, re })) => { 103 | match bindings.entry(var) { 104 | Entry::Occupied(entry) => { 105 | panic!("Variable {:?} is defined multiple times", entry.key().0); 106 | } 107 | Entry::Vacant(entry) => { 108 | entry.insert(re); 109 | } 110 | } 111 | } 112 | 113 | Rule::RuleOrBinding(RuleOrBinding::Rule(SingleRule { lhs, rhs })) => { 114 | compile_single_rule(&mut unnamed_nfa, lhs, rhs, &bindings, &mut right_ctx_dfas); 115 | } 116 | 117 | Rule::RuleSet { name, rules } => { 118 | let dfa_idx = if name == "Init" { 119 | let dfa = init_dfa.insert(compile_rule_set( 120 | rules, 121 | bindings.clone(), 122 | &mut right_ctx_dfas, 123 | )); 124 | 125 | dfa.initial_state() 126 | } else { 127 | let dfa = init_dfa 128 | .as_mut() 129 | .expect("First rule set should be named \"Init\""); 130 | 131 | let dfa_ = compile_rule_set(rules, bindings.clone(), &mut right_ctx_dfas); 132 | 133 | dfa.add_dfa(dfa_) 134 | }; 135 | 136 | if dfas.insert(name.to_string(), dfa_idx).is_some() { 137 | panic!("Rule set {:?} is defined multiple times", name.to_string()); 138 | } 139 | } 140 | } 141 | } 142 | 143 | let mut dfa = match init_dfa { 144 | Some(init_dfa) => init_dfa, 145 | None => nfa_to_dfa(&unnamed_nfa), 146 | }; 147 | 148 | dfa::update_backtracks(&mut dfa); 149 | 150 | let dfa = dfa::simplify::simplify(dfa, &mut dfas); 151 | 152 | dfa::codegen::generate( 153 | dfa, 154 | &right_ctx_dfas, 155 | semantic_action_table, 156 | user_state_type, 157 | user_error_type, 158 | dfas, 159 | type_name, 160 | token_type, 161 | visibility, 162 | attrs, 163 | ) 164 | .into() 165 | } 166 | 167 | fn compile_single_rule( 168 | nfa: &mut NFA, 169 | lhs: RegexCtx, 170 | rhs: SemanticActionIdx, 171 | bindings: &Map, 172 | right_ctx_dfas: &mut RightCtxDFAs, 173 | ) { 174 | let RegexCtx { re, right_ctx } = lhs; 175 | 176 | let right_ctx = right_ctx 177 | .as_ref() 178 | .map(|right_ctx| right_ctx_dfas.new_right_ctx(bindings, right_ctx)); 179 | 180 | nfa.add_regex(bindings, &re, right_ctx, rhs); 181 | } 182 | 183 | fn compile_rule_set( 184 | rules: Vec, 185 | mut bindings: Map, 186 | right_ctx_dfas: &mut RightCtxDFAs, 187 | ) -> DFA { 188 | let mut nfa: NFA = NFA::new(); 189 | 190 | for rule in rules { 191 | match rule { 192 | RuleOrBinding::Rule(SingleRule { lhs, rhs }) => { 193 | compile_single_rule(&mut nfa, lhs, rhs, &bindings, right_ctx_dfas); 194 | } 195 | RuleOrBinding::Binding(Binding { var, re }) => match bindings.entry(var) { 196 | Entry::Occupied(entry) => { 197 | panic!("Variable {:?} is defined multiple times", entry.key().0); 198 | } 199 | Entry::Vacant(entry) => { 200 | entry.insert(re); 201 | } 202 | }, 203 | } 204 | } 205 | 206 | nfa_to_dfa(&nfa) 207 | } 208 | -------------------------------------------------------------------------------- /crates/lexgen/src/nfa.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | pub mod simulate; 3 | 4 | use crate::ast::{Regex, Var}; 5 | use crate::collections::{Map, Set}; 6 | use crate::display::HashSetDisplay; 7 | use crate::range_map::{Range, RangeMap}; 8 | use crate::regex_to_nfa; 9 | use 
crate::right_ctx::RightCtxIdx;
10 | 
11 | /// Non-deterministic finite automaton, parameterized on values of accepting states.
12 | #[derive(Debug)]
13 | pub struct NFA<A> {
14 |     // Indexed by `StateIdx`
15 |     states: Vec<State<A>>,
16 | }
17 | 
18 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
19 | pub struct StateIdx(usize);
20 | 
21 | #[derive(Debug)]
22 | struct State<A> {
23 |     char_transitions: Map<char, Set<StateIdx>>,
24 |     range_transitions: RangeMap<Set<StateIdx>>,
25 |     empty_transitions: Set<StateIdx>,
26 |     any_transitions: Set<StateIdx>,
27 |     end_of_input_transitions: Set<StateIdx>,
28 |     accepting: Option<AcceptingState<A>>,
29 | }
30 | 
31 | #[derive(Debug, Clone, Copy)]
32 | pub struct AcceptingState<A> {
33 |     pub value: A,
34 |     pub right_ctx: Option<RightCtxIdx>,
35 | }
36 | 
37 | impl<A> State<A> {
38 |     fn new() -> State<A> {
39 |         State {
40 |             char_transitions: Default::default(),
41 |             range_transitions: Default::default(),
42 |             empty_transitions: Default::default(),
43 |             any_transitions: Default::default(),
44 |             end_of_input_transitions: Default::default(),
45 |             accepting: None,
46 |         }
47 |     }
48 | }
49 | 
50 | impl<A> NFA<A> {
51 |     pub fn new() -> NFA<A> {
52 |         NFA {
53 |             states: vec![State::new()],
54 |         }
55 |     }
56 | 
57 |     pub fn initial_state(&self) -> StateIdx {
58 |         StateIdx(0)
59 |     }
60 | 
61 |     pub fn get_accepting_state(&self, state: StateIdx) -> Option<&AcceptingState<A>> {
62 |         self.states[state.0].accepting.as_ref()
63 |     }
64 | 
65 |     pub fn char_transitions(
66 |         &self,
67 |         state: StateIdx,
68 |     ) -> impl Iterator<Item = (&char, &Set<StateIdx>)> {
69 |         self.states[state.0].char_transitions.iter()
70 |     }
71 | 
72 |     pub fn range_transitions(
73 |         &self,
74 |         state: StateIdx,
75 |     ) -> impl Iterator<Item = &Range<Set<StateIdx>>> {
76 |         self.states[state.0].range_transitions.iter()
77 |     }
78 | 
79 |     pub fn any_transitions(&self, state: StateIdx) -> impl Iterator<Item = StateIdx> + '_ {
80 |         self.states[state.0].any_transitions.iter().copied()
81 |     }
82 | 
83 |     pub fn end_of_input_transitions(&self, state: StateIdx) -> impl Iterator<Item = StateIdx> + '_ {
84 |         self.states[state.0]
85 |             .end_of_input_transitions
86 |             .iter()
87 |             .copied()
88 |     }
89 | 
90 |     pub fn new_state(&mut self) -> StateIdx {
91 |         let new_state_idx = StateIdx(self.states.len());
92 |         self.states.push(State::new());
93 |         new_state_idx
94 |     }
95 | 
96 |     pub fn add_regex(
97 |         &mut self,
98 |         bindings: &Map<Var, Regex>,
99 |         re: &Regex,
100 |         right_ctx: Option<RightCtxIdx>,
101 |         value: A,
102 |     ) {
103 |         let re_accepting_state = self.new_state();
104 | 
105 |         self.make_state_accepting(re_accepting_state, value, right_ctx);
106 | 
107 |         let re_initial_state = self.new_state();
108 |         let nfa_initial_state = self.initial_state();
109 | 
110 |         self.add_empty_transition(nfa_initial_state, re_initial_state);
111 | 
112 |         regex_to_nfa::add_re(self, bindings, re, re_initial_state, re_accepting_state);
113 |     }
114 | 
115 |     pub fn add_char_transition(&mut self, state: StateIdx, char: char, next: StateIdx) {
116 |         let not_exists = self.states[state.0]
117 |             .char_transitions
118 |             .entry(char)
119 |             .or_default()
120 |             .insert(next);
121 | 
122 |         assert!(not_exists, "add_char_transition");
123 |     }
124 | 
125 |     pub fn add_range_transition(
126 |         &mut self,
127 |         state: StateIdx,
128 |         range_start: char,
129 |         range_end: char,
130 |         next: StateIdx,
131 |     ) {
132 |         let mut set: Set<StateIdx> = Default::default();
133 |         set.insert(next);
134 |         self.states[state.0].range_transitions.insert(
135 |             range_start as u32,
136 |             range_end as u32,
137 |             set,
138 |             |values_1, values_2| values_1.extend(values_2),
139 |         );
140 |     }
141 | 
142 |     pub fn add_range_transitions(&mut self, state: StateIdx, ranges: RangeMap<()>, next: StateIdx) {
143 |         let mut set: Set<StateIdx> = Default::default();
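// Descriptive note (editorial): the singleton target set built here is cloned into every
// range of `ranges` below, and `insert_ranges` merges overlapping ranges by extending
// their target-state sets -- an NFA state may step to several states on the same character.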
144 | set.insert(next); 145 | 146 | let ranges = ranges.map(|()| set.clone()); 147 | 148 | self.states[state.0] 149 | .range_transitions 150 | .insert_ranges(ranges.into_iter(), |values_1, values_2| { 151 | values_1.extend(values_2) 152 | }); 153 | } 154 | 155 | pub fn add_empty_transition(&mut self, state: StateIdx, next: StateIdx) { 156 | let not_exists = self.states[state.0].empty_transitions.insert(next); 157 | 158 | assert!(not_exists, "add_empty_transition"); 159 | } 160 | 161 | pub fn add_any_transition(&mut self, state: StateIdx, next: StateIdx) { 162 | let not_exists = self.states[state.0].any_transitions.insert(next); 163 | 164 | assert!(not_exists, "add_any_transition"); 165 | } 166 | 167 | pub fn add_end_of_input_transition(&mut self, state: StateIdx, next: StateIdx) { 168 | let not_exists = self.states[state.0].end_of_input_transitions.insert(next); 169 | 170 | assert!(not_exists, "add_end_of_input_transition"); 171 | } 172 | 173 | fn make_state_accepting(&mut self, state: StateIdx, value: A, right_ctx: Option) { 174 | let old = self.states[state.0] 175 | .accepting 176 | .replace(AcceptingState { value, right_ctx }); 177 | 178 | assert!(old.is_none(), "make_state_accepting"); 179 | } 180 | 181 | pub fn compute_state_closure(&self, states: &Set) -> Set { 182 | let mut worklist: Vec = states.iter().copied().collect(); 183 | let mut closure: Set = states.clone(); 184 | 185 | while let Some(work) = worklist.pop() { 186 | for next_state in self.next_empty_states(work) { 187 | if closure.insert(*next_state) { 188 | worklist.push(*next_state); 189 | } 190 | } 191 | } 192 | 193 | closure 194 | } 195 | 196 | fn next_empty_states(&self, state: StateIdx) -> &Set { 197 | let state = &self.states[state.0]; 198 | &state.empty_transitions 199 | } 200 | } 201 | 202 | use std::fmt::{self, Display, Formatter}; 203 | 204 | impl Display for StateIdx { 205 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 206 | self.0.fmt(f) 207 | } 208 | } 209 | 210 | impl Display for NFA { 211 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 212 | for (state_idx, state) in self.states.iter().enumerate() { 213 | let State { 214 | char_transitions, 215 | range_transitions, 216 | empty_transitions, 217 | any_transitions, 218 | end_of_input_transitions, 219 | accepting, 220 | } = state; 221 | 222 | match accepting { 223 | Some(AcceptingState { 224 | value: _, 225 | right_ctx, 226 | }) => match right_ctx { 227 | Some(right_ctx_idx) => { 228 | write!(f, "{:>4}", format!("*{}", state_idx),)?; 229 | write!(f, " (ctx {})", right_ctx_idx.as_usize())?; 230 | } 231 | None => { 232 | write!(f, "{:>4}", format!("*{}", state_idx))?; 233 | } 234 | }, 235 | None => { 236 | write!(f, "{:>4}:", state_idx)?; 237 | } 238 | } 239 | 240 | let mut first = true; 241 | 242 | if !empty_transitions.is_empty() { 243 | if !first { 244 | write!(f, " ")?; 245 | } else { 246 | first = false; 247 | } 248 | 249 | writeln!(f, "e -> {}", HashSetDisplay(empty_transitions))?; 250 | } 251 | 252 | for (char, next) in char_transitions.iter() { 253 | if !first { 254 | write!(f, " ")?; 255 | } else { 256 | first = false; 257 | } 258 | 259 | writeln!(f, "{:?} -> {}", char, HashSetDisplay(next))?; 260 | } 261 | 262 | for range in range_transitions.iter() { 263 | if !first { 264 | write!(f, " ")?; 265 | } else { 266 | first = false; 267 | } 268 | 269 | writeln!( 270 | f, 271 | "{:?} - {:?} -> {}", 272 | range.start, 273 | range.end, 274 | HashSetDisplay(&range.value) 275 | )?; 276 | } 277 | 278 | if !any_transitions.is_empty() { 279 | if !first 
{ 280 | write!(f, " ")?; 281 | } else { 282 | first = false; 283 | } 284 | 285 | writeln!(f, "_ -> {}", HashSetDisplay(any_transitions))?; 286 | } 287 | 288 | if !end_of_input_transitions.is_empty() { 289 | if !first { 290 | write!(f, " ")?; 291 | } 292 | 293 | writeln!(f, "$ -> {}", HashSetDisplay(end_of_input_transitions))?; 294 | } 295 | 296 | if empty_transitions.is_empty() 297 | && char_transitions.is_empty() 298 | && range_transitions.is_empty() 299 | && any_transitions.is_empty() 300 | && end_of_input_transitions.is_empty() 301 | { 302 | writeln!(f)?; 303 | } 304 | } 305 | 306 | Ok(()) 307 | } 308 | } 309 | -------------------------------------------------------------------------------- /crates/lexgen/src/nfa/simulate.rs: -------------------------------------------------------------------------------- 1 | use super::{AcceptingState, StateIdx, NFA}; 2 | use crate::collections::Set; 3 | use crate::dfa::simulate::simulate_right_ctx; 4 | use crate::dfa::StateIdx as DfaStateIdx; 5 | use crate::right_ctx::RightCtxDFAs; 6 | 7 | pub type Matches<'input, A> = Vec<(&'input str, A)>; 8 | 9 | pub type ErrorLoc = usize; 10 | 11 | impl NFA { 12 | pub fn simulate<'input>( 13 | &self, 14 | input: &'input str, 15 | right_ctx_dfas: &RightCtxDFAs, 16 | ) -> (Matches<'input, A>, Option) { 17 | let mut values: Matches<'input, A> = vec![]; 18 | 19 | // If we skipped an accepting state because we were able to make progress with the next 20 | // character, this state holds the previous match. If we get stuck we return this match. 21 | // 22 | // This implements backtracking in regexes like: 23 | // 24 | // - aaaaaab 25 | // - a 26 | // 27 | // in an input like "aaaa". 28 | let mut last_match: Option<(usize, A, usize)> = None; 29 | 30 | let mut states: Set = Default::default(); 31 | states.insert(StateIdx(0)); 32 | states = self.compute_state_closure(&states); 33 | 34 | let mut char_indices = input.char_indices(); 35 | 36 | // Where the current match starts 37 | let mut match_start: usize = 0; 38 | 39 | // Index of current character in input string 40 | let mut char_idx; 41 | 42 | 'outer: loop { 43 | while let Some((char_idx_, char)) = char_indices.next() { 44 | char_idx = match_start + char_idx_; 45 | 46 | states = next(self, &states, char); 47 | 48 | // When stuck check if we skipped an accepting state 49 | if states.is_empty() { 50 | match last_match.take() { 51 | None => { 52 | // We're stuck and can't backtrack, raise an error 53 | return (values, Some(match_start)); 54 | } 55 | Some((last_match_start, last_match_value, last_match_end)) => { 56 | // Backtrack to the previous accepting state 57 | match_start = last_match_end; 58 | char_indices = input[match_start..].char_indices(); 59 | 60 | // Accept the previous match 61 | values 62 | .push((&input[last_match_start..last_match_end], last_match_value)); 63 | 64 | // Restart state machine 65 | states.insert(StateIdx(0)); 66 | states = self.compute_state_closure(&states); 67 | } 68 | } 69 | } else { 70 | // Check for accepting states. Sort states to pick the one that comes first in 71 | // the program. 
72 | let mut states_sorted: Vec = states.iter().copied().collect(); 73 | states_sorted.sort(); 74 | for state in states_sorted { 75 | if let Some(AcceptingState { value, right_ctx }) = 76 | &self.states[state.0].accepting 77 | { 78 | match right_ctx { 79 | None => { 80 | last_match = 81 | Some((match_start, *value, char_idx + char.len_utf8())); 82 | break; 83 | } 84 | Some(right_ctx_idx) => { 85 | let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); 86 | if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { 87 | last_match = 88 | Some((match_start, *value, char_idx + char.len_utf8())); 89 | break; 90 | } 91 | } 92 | } 93 | } 94 | } 95 | } 96 | } 97 | 98 | // Reached EOF, take EOF transitions, check for accepting states 99 | states = next_end_of_input(self, &states); 100 | 101 | { 102 | let mut states_sorted: Vec = states.iter().copied().collect(); 103 | states_sorted.sort(); 104 | 105 | for state in states_sorted { 106 | if let Some(AcceptingState { value, right_ctx }) = 107 | &self.states[state.0].accepting 108 | { 109 | match right_ctx { 110 | None => { 111 | values.push((&input[match_start..], *value)); 112 | break 'outer; 113 | } 114 | Some(right_ctx_idx) => { 115 | let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx); 116 | if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) { 117 | values.push((&input[match_start..], *value)); 118 | break 'outer; 119 | } 120 | } 121 | } 122 | } 123 | } 124 | } 125 | 126 | // Reached EOF but cannot accept input, backtrack if possible, otherwise raise an error 127 | match last_match.take() { 128 | Some((last_match_start, last_match_value, last_match_end)) => { 129 | values.push((&input[last_match_start..last_match_end], last_match_value)); 130 | 131 | if last_match_end == input.len() { 132 | break 'outer; 133 | } else { 134 | // Backtrack 135 | match_start = last_match_end; 136 | char_indices = input[match_start..].char_indices(); 137 | 138 | // Restart state machine 139 | states.insert(StateIdx(0)); 140 | states = self.compute_state_closure(&states); 141 | } 142 | } 143 | None => { 144 | // We're stuck and can't backtrack, raise an error 145 | return (values, Some(match_start)); 146 | } 147 | } 148 | } 149 | 150 | (values, None) 151 | } 152 | } 153 | 154 | fn next(nfa: &NFA, states: &Set, char: char) -> Set { 155 | let mut next_states: Set = Default::default(); 156 | 157 | for state in states { 158 | // Char transitions 159 | if let Some(char_nexts) = nfa.states[state.0].char_transitions.get(&char) { 160 | next_states.extend(char_nexts.iter()); 161 | } 162 | 163 | // Range transitions 164 | for range in nfa.states[state.0].range_transitions.iter() { 165 | if char as u32 >= range.start && char as u32 <= range.end { 166 | next_states.extend(range.value.clone()); 167 | } 168 | } 169 | 170 | // Any transitions 171 | next_states.extend(nfa.states[state.0].any_transitions.iter().copied()); 172 | } 173 | 174 | nfa.compute_state_closure(&next_states) 175 | } 176 | 177 | fn next_end_of_input(nfa: &NFA, states: &Set) -> Set { 178 | let mut next_states: Set = Default::default(); 179 | 180 | for state in states { 181 | next_states.extend(nfa.states[state.0].end_of_input_transitions.iter().copied()); 182 | } 183 | 184 | nfa.compute_state_closure(&next_states) 185 | } 186 | -------------------------------------------------------------------------------- /crates/lexgen/src/nfa_to_dfa.rs: -------------------------------------------------------------------------------- 1 | use crate::collections::{Map, Set}; 2 | use crate::dfa::DFA; 3 | use 
crate::nfa::NFA;
4 | use crate::range_map::{Range, RangeMap};
5 | 
6 | use crate::dfa::StateIdx as DfaStateIdx;
7 | use crate::nfa::StateIdx as NfaStateIdx;
8 | 
9 | use std::collections::hash_map::Entry;
10 | use std::collections::BTreeSet;
11 | 
12 | pub fn nfa_to_dfa<A: Clone>(nfa: &NFA<A>) -> DFA<DfaStateIdx, A> {
13 |     let initial_state = nfa.initial_state();
14 | 
15 |     let initial_states: BTreeSet<NfaStateIdx> = {
16 |         let mut initial_states: Set<NfaStateIdx> = Default::default();
17 |         initial_states.insert(initial_state);
18 | 
19 |         nfa.compute_state_closure(&initial_states)
20 |             .into_iter()
21 |             .collect()
22 |     };
23 | 
24 |     let (mut dfa, dfa_initial_state): (DFA<DfaStateIdx, A>, DfaStateIdx) = DFA::new();
25 | 
26 |     // Maps sets of NFA states to their states in the DFA
27 |     let mut state_map: Map<BTreeSet<NfaStateIdx>, DfaStateIdx> = Default::default();
28 |     state_map.insert(initial_states.clone(), dfa_initial_state);
29 | 
30 |     let mut work_list: Vec<BTreeSet<NfaStateIdx>> = vec![initial_states];
31 |     let mut finished_dfa_states: Set<DfaStateIdx> = Default::default();
32 | 
33 |     while let Some(current_nfa_states) = work_list.pop() {
34 |         let current_dfa_state = match state_map.get(&current_nfa_states) {
35 |             None => {
36 |                 let dfa_state = dfa.new_state();
37 |                 state_map.insert(current_nfa_states.clone(), dfa_state);
38 |                 dfa_state
39 |             }
40 |             Some(dfa_state) => *dfa_state,
41 |         };
42 | 
43 |         if finished_dfa_states.contains(&current_dfa_state) {
44 |             continue;
45 |         }
46 | 
47 |         finished_dfa_states.insert(current_dfa_state);
48 | 
49 |         let mut char_transitions: Map<char, Set<NfaStateIdx>> = Default::default();
50 |         let mut range_transitions: RangeMap<Set<NfaStateIdx>> = Default::default();
51 |         let mut any_transitions: Set<NfaStateIdx> = Default::default();
52 |         let mut end_of_input_transitions: Set<NfaStateIdx> = Default::default();
53 | 
54 |         for nfa_state in current_nfa_states.iter().copied() {
55 |             if let Some(value) = nfa.get_accepting_state(nfa_state) {
56 |                 dfa.make_state_accepting(current_dfa_state, value.clone());
57 |             }
58 | 
59 |             // Collect char transitions
60 |             for (char, next_states) in nfa.char_transitions(nfa_state) {
61 |                 char_transitions
62 |                     .entry(*char)
63 |                     .or_default()
64 |                     .extend(next_states.iter().copied());
65 |             }
66 | 
67 |             // Collect range transitions
68 |             for range in nfa.range_transitions(nfa_state) {
69 |                 range_transitions.insert(
70 |                     range.start,
71 |                     range.end,
72 |                     range.value.clone(),
73 |                     |states_1, states_2| states_1.extend(states_2.into_iter()),
74 |                 );
75 |             }
76 | 
77 |             // Collect any transitions
78 |             any_transitions.extend(nfa.any_transitions(nfa_state));
79 | 
80 |             // Collect end-of-input transitions
81 |             end_of_input_transitions.extend(nfa.end_of_input_transitions(nfa_state));
82 |         }
83 | 
84 |         // Compute closures of transition targets and add transitions to DFA
85 |         for (char, mut char_states) in char_transitions.into_iter() {
86 |             // For ranges that also cover the char we need to add the range transitions to the char
87 |             // transition
88 |             for range in range_transitions.iter() {
89 |                 if range.contains(char) {
90 |                     for range_state in &range.value {
91 |                         char_states.insert(*range_state);
92 |                     }
93 |                 }
94 |             }
95 | 
96 |             // Same for '_' (match any character) transitions
97 |             for any_next in &any_transitions {
98 |                 char_states.insert(*any_next);
99 |             }
100 | 
101 |             let closure: BTreeSet<NfaStateIdx> = nfa
102 |                 .compute_state_closure(&char_states)
103 |                 .into_iter()
104 |                 .collect();
105 |             let dfa_state = dfa_state_of_nfa_states(&mut dfa, &mut state_map, closure.clone());
106 |             dfa.add_char_transition(current_dfa_state, char, dfa_state);
107 | 
108 |             work_list.push(closure);
109 |         }
110 | 
111 |         let mut dfa_range_transitions: Vec<Range<DfaStateIdx>> =
112 | 
Vec::with_capacity(range_transitions.len()); 113 | 114 | for range in range_transitions.into_iter() { 115 | let mut range_states: Set = range.value; 116 | 117 | for any_next in &any_transitions { 118 | range_states.insert(*any_next); 119 | } 120 | 121 | let closure: BTreeSet = nfa 122 | .compute_state_closure(&range_states) 123 | .into_iter() 124 | .collect(); 125 | 126 | let dfa_state = dfa_state_of_nfa_states(&mut dfa, &mut state_map, closure.clone()); 127 | 128 | dfa_range_transitions.push(Range { 129 | start: range.start, 130 | end: range.end, 131 | value: dfa_state, 132 | }); 133 | 134 | work_list.push(closure); 135 | } 136 | 137 | dfa.set_range_transitions( 138 | current_dfa_state, 139 | RangeMap::from_non_overlapping_sorted_ranges(dfa_range_transitions), 140 | ); 141 | 142 | { 143 | let closure: BTreeSet = nfa 144 | .compute_state_closure(&any_transitions) 145 | .into_iter() 146 | .collect(); 147 | 148 | if !closure.is_empty() { 149 | let dfa_state = dfa_state_of_nfa_states(&mut dfa, &mut state_map, closure.clone()); 150 | dfa.set_any_transition(current_dfa_state, dfa_state); 151 | work_list.push(closure); 152 | } 153 | } 154 | 155 | { 156 | let closure: BTreeSet = nfa 157 | .compute_state_closure(&end_of_input_transitions) 158 | .into_iter() 159 | .collect(); 160 | 161 | if !closure.is_empty() { 162 | let dfa_state = dfa_state_of_nfa_states(&mut dfa, &mut state_map, closure.clone()); 163 | dfa.set_end_of_input_transition(current_dfa_state, dfa_state); 164 | work_list.push(closure); 165 | } 166 | } 167 | } 168 | 169 | dfa 170 | } 171 | 172 | fn dfa_state_of_nfa_states( 173 | dfa: &mut DFA, 174 | state_map: &mut Map, DfaStateIdx>, 175 | states: BTreeSet, 176 | ) -> DfaStateIdx { 177 | match state_map.entry(states) { 178 | Entry::Occupied(entry) => *entry.get(), 179 | Entry::Vacant(entry) => { 180 | let dfa_state = dfa.new_state(); 181 | entry.insert(dfa_state); 182 | dfa_state 183 | } 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /crates/lexgen/src/regex_to_nfa.rs: -------------------------------------------------------------------------------- 1 | use crate::ast::{Builtin, CharOrRange, Regex, Var}; 2 | use crate::builtin::{BuiltinCharRange, BUILTIN_RANGES}; 3 | use crate::collections::Map; 4 | use crate::nfa::{StateIdx, NFA}; 5 | use crate::range_map::{Range, RangeMap}; 6 | 7 | pub fn add_re( 8 | nfa: &mut NFA, 9 | bindings: &Map, 10 | re: &Regex, 11 | current: StateIdx, 12 | cont: StateIdx, 13 | ) { 14 | match re { 15 | Regex::Builtin(builtin_name) => { 16 | let builtin = get_builtin_regex(builtin_name); 17 | 18 | let ranges: Vec> = builtin 19 | .get_ranges() 20 | .iter() 21 | .copied() 22 | .map(|(start, end)| Range { 23 | start, 24 | end, 25 | value: (), 26 | }) 27 | .collect(); 28 | 29 | let map = RangeMap::from_non_overlapping_sorted_ranges(ranges); 30 | 31 | nfa.add_range_transitions(current, map, cont); 32 | } 33 | 34 | Regex::Var(var) => { 35 | let re = bindings 36 | .get(var) 37 | .unwrap_or_else(|| panic!("Unbound variable {:?}", var.0)); 38 | 39 | add_re(nfa, bindings, re, current, cont); 40 | } 41 | 42 | Regex::Char(char) => { 43 | nfa.add_char_transition(current, *char, cont); 44 | } 45 | 46 | Regex::String(str) => { 47 | let mut iter = str.chars().peekable(); 48 | let mut current = current; 49 | while let Some(char) = iter.next() { 50 | let next = if iter.peek().is_some() { 51 | nfa.new_state() 52 | } else { 53 | cont 54 | }; 55 | nfa.add_char_transition(current, char, next); 56 | current = next; 57 | } 58 | } 59 
| 60 | Regex::CharSet(set) => { 61 | for char in &set.0 { 62 | match char { 63 | CharOrRange::Char(char) => { 64 | nfa.add_char_transition(current, *char, cont); 65 | } 66 | CharOrRange::Range(range_start, range_end) => { 67 | nfa.add_range_transition(current, *range_start, *range_end, cont); 68 | } 69 | } 70 | } 71 | } 72 | 73 | Regex::ZeroOrMore(re) => { 74 | let re_init = nfa.new_state(); 75 | let re_cont = nfa.new_state(); 76 | add_re(nfa, bindings, re, re_init, re_cont); 77 | nfa.add_empty_transition(current, cont); 78 | nfa.add_empty_transition(current, re_init); 79 | nfa.add_empty_transition(re_cont, cont); 80 | nfa.add_empty_transition(re_cont, re_init); 81 | } 82 | 83 | Regex::OneOrMore(re) => { 84 | let re_init = nfa.new_state(); 85 | let re_cont = nfa.new_state(); 86 | add_re(nfa, bindings, re, re_init, re_cont); 87 | nfa.add_empty_transition(current, re_init); 88 | nfa.add_empty_transition(re_cont, cont); 89 | nfa.add_empty_transition(re_cont, re_init); 90 | } 91 | 92 | Regex::ZeroOrOne(re) => { 93 | let re_init = nfa.new_state(); 94 | add_re(nfa, bindings, re, re_init, cont); 95 | nfa.add_empty_transition(current, cont); 96 | nfa.add_empty_transition(current, re_init); 97 | } 98 | 99 | Regex::Concat(re1, re2) => { 100 | let re1_cont = nfa.new_state(); 101 | add_re(nfa, bindings, re1, current, re1_cont); 102 | add_re(nfa, bindings, re2, re1_cont, cont); 103 | } 104 | 105 | Regex::Or(re1, re2) => { 106 | let re1_init = nfa.new_state(); 107 | let re2_init = nfa.new_state(); 108 | add_re(nfa, bindings, re1, re1_init, cont); 109 | add_re(nfa, bindings, re2, re2_init, cont); 110 | nfa.add_empty_transition(current, re1_init); 111 | nfa.add_empty_transition(current, re2_init); 112 | } 113 | 114 | Regex::Any => { 115 | nfa.add_any_transition(current, cont); 116 | } 117 | 118 | Regex::EndOfInput => { 119 | nfa.add_end_of_input_transition(current, cont); 120 | } 121 | 122 | Regex::Diff(_, _) => { 123 | let map = regex_to_range_map(bindings, re); 124 | nfa.add_range_transitions(current, map, cont); 125 | } 126 | } 127 | } 128 | 129 | fn get_builtin_regex(builtin: &Builtin) -> BuiltinCharRange { 130 | BUILTIN_RANGES 131 | .iter() 132 | .find_map(|(name, builtin_)| { 133 | if *name == builtin.0 { 134 | Some(*builtin_) 135 | } else { 136 | None 137 | } 138 | }) 139 | .unwrap_or_else(|| panic!("Unknown builtin regex: {}", builtin.0)) 140 | } 141 | 142 | fn regex_to_range_map(bindings: &Map, re: &Regex) -> RangeMap<()> { 143 | match re { 144 | Regex::Builtin(builtin) => { 145 | let builtin = get_builtin_regex(builtin); 146 | let ranges: Vec> = builtin 147 | .get_ranges() 148 | .iter() 149 | .copied() 150 | .map(|(start, end)| Range { 151 | start, 152 | end, 153 | value: (), 154 | }) 155 | .collect(); 156 | RangeMap::from_non_overlapping_sorted_ranges(ranges) 157 | } 158 | 159 | Regex::Var(var) => { 160 | let re = bindings 161 | .get(var) 162 | .unwrap_or_else(|| panic!("Unbound variable {:?}", var.0)); 163 | 164 | regex_to_range_map(bindings, re) 165 | } 166 | 167 | Regex::Char(char) => { 168 | let mut map = RangeMap::new(); 169 | map.insert(*char as u32, *char as u32, (), merge_values); 170 | map 171 | } 172 | 173 | Regex::String(_) => panic!("strings cannot be used in char sets (`#`)"), 174 | 175 | Regex::CharSet(char_set) => { 176 | let mut map = RangeMap::new(); 177 | 178 | // TODO: Quadratic behavior below, `RangeMap::insert` is O(number of ranges) 179 | for char_or_range in char_set.0.iter() { 180 | match char_or_range { 181 | CharOrRange::Char(char) => { 182 | map.insert(*char as u32, 
*char as u32, (), merge_values);
183 |                     }
184 |                     CharOrRange::Range(start, end) => {
185 |                         map.insert(*start as u32, *end as u32, (), merge_values);
186 |                     }
187 |                 }
188 |             }
189 | 
190 |             map
191 |         }
192 | 
193 |         Regex::ZeroOrMore(_) => {
194 |             panic!("`*` cannot be used in char sets (`#`)");
195 |         }
196 | 
197 |         Regex::OneOrMore(_) => {
198 |             panic!("`+` cannot be used in char sets (`#`)");
199 |         }
200 | 
201 |         Regex::ZeroOrOne(_) => {
202 |             panic!("`?` cannot be used in char sets (`#`)");
203 |         }
204 | 
205 |         Regex::Concat(_, _) => {
206 |             panic!("concatenation (` `) cannot be used in char sets (`#`)");
207 |         }
208 | 
209 |         Regex::Or(re1, re2) => {
210 |             let mut map1 = regex_to_range_map(bindings, re1);
211 |             let map2 = regex_to_range_map(bindings, re2);
212 | 
213 |             map1.insert_ranges(map2.into_iter(), merge_values);
214 | 
215 |             map1
216 |         }
217 | 
218 |         Regex::Any => {
219 |             let mut map = RangeMap::new();
220 |             map.insert(0, char::MAX as u32, (), merge_values);
221 |             map
222 |         }
223 | 
224 |         Regex::EndOfInput => panic!("`$` cannot be used in char sets (`#`)"),
225 | 
226 |         Regex::Diff(re1, re2) => {
227 |             let mut map1 = regex_to_range_map(bindings, re1);
228 |             let map2 = regex_to_range_map(bindings, re2);
229 |             map1.remove_ranges(&map2);
230 |             map1
231 |         }
232 |     }
233 | }
234 | 
235 | fn merge_values(_val1: &mut (), _val2: ()) {}
236 | 
--------------------------------------------------------------------------------
/crates/lexgen/src/right_ctx.rs:
--------------------------------------------------------------------------------
1 | //! Stuff related to right contexts
2 | //!
3 | //! A right context is a limited version of lookahead. A rule can have at most one right context.
4 | //! When a rule has a right context, after the regex for the rule matches, we run the DFA for the
5 | //! right context on a clone of the input stream. Only if the right context also matches do we
6 | //! consider the rule a match. This provides simple "lookahead" support, which should be good
7 | //! enough when lexing programming languages.
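//!
//! A minimal sketch (editorial, assuming the `re1 > re2` right-context surface syntax from the
//! project README): a rule such as
//!
//!     "if" > ($$whitespace | '('),
//!
//! matches the keyword `if` only when the next character is whitespace or `(`. The lookahead
//! characters are matched on a clone of the input, so they are not consumed by the rule.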
8 | 9 | use crate::ast::{Regex, Var}; 10 | use crate::collections::Map; 11 | // use crate::dfa::simplify::{simplify, Trans}; 12 | use crate::dfa::{StateIdx, DFA}; 13 | use crate::nfa::NFA; 14 | use crate::nfa_to_dfa::nfa_to_dfa; 15 | 16 | #[derive(Debug)] 17 | pub struct RightCtxDFAs { 18 | dfas: Vec>, 19 | } 20 | 21 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] 22 | pub struct RightCtxIdx(usize); 23 | 24 | impl RightCtxIdx { 25 | pub fn as_usize(&self) -> usize { 26 | self.0 27 | } 28 | } 29 | 30 | impl RightCtxDFAs { 31 | pub fn new() -> Self { 32 | RightCtxDFAs { dfas: vec![] } 33 | } 34 | 35 | pub fn iter(&self) -> impl Iterator)> { 36 | self.dfas 37 | .iter() 38 | .enumerate() 39 | .map(|(i, dfa)| (RightCtxIdx(i), dfa)) 40 | } 41 | } 42 | 43 | impl RightCtxDFAs { 44 | pub fn new_right_ctx(&mut self, bindings: &Map, right_ctx: &Regex) -> RightCtxIdx { 45 | let idx = self.dfas.len(); 46 | 47 | let mut nfa: NFA<()> = NFA::new(); 48 | nfa.add_regex(bindings, right_ctx, None, ()); 49 | 50 | let dfa = nfa_to_dfa(&nfa); 51 | self.dfas.push(dfa); 52 | 53 | RightCtxIdx(idx) 54 | } 55 | 56 | #[cfg(test)] 57 | pub fn get(&self, right_ctx: &RightCtxIdx) -> &DFA { 58 | &self.dfas[right_ctx.as_usize()] 59 | } 60 | 61 | /* 62 | pub fn simplify(self) -> RightCtxDFAs> { 63 | RightCtxDFAs { 64 | dfas: self 65 | .dfas 66 | .into_iter() 67 | .map(|dfa| simplify::<(), ()>(dfa, &mut Default::default())) 68 | .collect(), 69 | } 70 | } 71 | */ 72 | } 73 | -------------------------------------------------------------------------------- /crates/lexgen/src/semantic_action_table.rs: -------------------------------------------------------------------------------- 1 | use crate::ast::RuleRhs; 2 | 3 | pub struct SemanticActionTable { 4 | table: Vec, 5 | } 6 | 7 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 8 | pub struct SemanticActionIdx(usize); 9 | 10 | impl SemanticActionTable { 11 | pub fn new() -> Self { 12 | Self { table: vec![] } 13 | } 14 | 15 | pub fn add(&mut self, action: RuleRhs) -> SemanticActionIdx { 16 | let idx = self.table.len(); 17 | self.table.push(action); 18 | SemanticActionIdx(idx) 19 | } 20 | 21 | pub fn iter(&self) -> impl Iterator { 22 | self.table 23 | .iter() 24 | .enumerate() 25 | .map(|(idx, expr)| (SemanticActionIdx(idx), expr)) 26 | } 27 | } 28 | 29 | impl SemanticActionIdx { 30 | pub fn as_usize(&self) -> usize { 31 | self.0 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /crates/lexgen/src/tests.rs: -------------------------------------------------------------------------------- 1 | use crate::ast::{CharOrRange, CharSet, Regex, Var}; 2 | use crate::collections::Map; 3 | use crate::dfa::StateIdx as DfaStateIdx; 4 | use crate::nfa::simulate::{ErrorLoc, Matches}; 5 | use crate::nfa::NFA; 6 | use crate::nfa_to_dfa::nfa_to_dfa; 7 | use crate::right_ctx::RightCtxDFAs; 8 | 9 | fn test_simulate<'input, A: Copy + std::fmt::Debug + Eq>( 10 | nfa: &NFA, 11 | test_cases: Vec<(&'input str, Matches<'input, A>, Option)>, 12 | ) { 13 | test_simulate_right_ctx(nfa, &RightCtxDFAs::new(), test_cases) 14 | } 15 | 16 | fn test_simulate_right_ctx<'input, A: Copy + std::fmt::Debug + Eq>( 17 | nfa: &NFA, 18 | right_ctx_dfas: &RightCtxDFAs, 19 | test_cases: Vec<(&'input str, Matches<'input, A>, Option)>, 20 | ) { 21 | println!("NFA=\n{}", nfa); 22 | 23 | let dfa = nfa_to_dfa(nfa); 24 | 25 | println!("DFA=\n{}", dfa); 26 | 27 | for (str, expected_matches, expected_error) in test_cases { 28 | let expected = 
(expected_matches, expected_error); 29 | 30 | assert_eq!( 31 | &nfa.simulate(str, right_ctx_dfas), 32 | &expected, 33 | "NFA simulation failed for string: {:?}", 34 | str 35 | ); 36 | 37 | assert_eq!( 38 | dfa.simulate(str, right_ctx_dfas), 39 | expected, 40 | "DFA simulation failed for string: {:?}", 41 | str 42 | ); 43 | } 44 | } 45 | 46 | #[test] 47 | fn simulate_backtracking() { 48 | let mut nfa: NFA = NFA::new(); 49 | 50 | nfa.add_regex( 51 | &Default::default(), 52 | &Regex::Concat( 53 | Box::new(Regex::OneOrMore(Box::new(Regex::Char('a')))), 54 | Box::new(Regex::Char('b')), 55 | ), 56 | None, 57 | 1, 58 | ); 59 | 60 | nfa.add_regex(&Default::default(), &Regex::Char('a'), None, 2); 61 | 62 | test_simulate( 63 | &nfa, 64 | vec![ 65 | ("a", vec![("a", 2)], None), 66 | ("aa", vec![("a", 2), ("a", 2)], None), 67 | ("aab", vec![("aab", 1)], None), 68 | ], 69 | ); 70 | } 71 | 72 | #[test] 73 | fn issue_16() { 74 | let mut nfa: NFA = NFA::new(); 75 | 76 | nfa.add_regex( 77 | &Default::default(), 78 | &Regex::String("xyzxyz".to_owned()), 79 | None, 80 | 1, 81 | ); 82 | nfa.add_regex( 83 | &Default::default(), 84 | &Regex::String("xyz".to_owned()), 85 | None, 86 | 2, 87 | ); 88 | nfa.add_regex( 89 | &Default::default(), 90 | &Regex::String("xya".to_owned()), 91 | None, 92 | 3, 93 | ); 94 | 95 | test_simulate( 96 | &nfa, 97 | vec![ 98 | ("xyzxya", vec![("xyz", 2), ("xya", 3)], None), 99 | ("xyzxyz", vec![("xyzxyz", 1)], None), 100 | ], 101 | ); 102 | } 103 | 104 | #[test] 105 | fn stuck_1() { 106 | let nfa: NFA = NFA::new(); 107 | test_simulate(&nfa, vec![("a", vec![], Some(0))]); 108 | } 109 | 110 | #[test] 111 | fn stuck_2() { 112 | let mut nfa: NFA = NFA::new(); 113 | 114 | nfa.add_regex( 115 | &Default::default(), 116 | &Regex::String("ab".to_owned()), 117 | None, 118 | 1, 119 | ); 120 | 121 | test_simulate(&nfa, vec![("aba", vec![("ab", 1)], Some(2))]); 122 | } 123 | 124 | #[test] 125 | fn stuck_3() { 126 | let mut nfa: NFA = NFA::new(); 127 | 128 | nfa.add_regex( 129 | &Default::default(), 130 | &Regex::String("aaab".to_owned()), 131 | None, 132 | 1, 133 | ); 134 | nfa.add_regex(&Default::default(), &Regex::String("a".to_owned()), None, 2); 135 | 136 | test_simulate(&nfa, vec![("aaabb", vec![("aaab", 1)], Some(4))]); 137 | } 138 | 139 | #[test] 140 | fn simulate_char() { 141 | let re = Regex::Char('a'); 142 | let mut nfa: NFA = NFA::new(); 143 | nfa.add_regex(&Default::default(), &re, None, 1); 144 | 145 | test_simulate( 146 | &nfa, 147 | vec![ 148 | ("aa", vec![("a", 1), ("a", 1)], None), 149 | ("b", vec![], Some(0)), 150 | ], 151 | ); 152 | } 153 | 154 | #[test] 155 | fn simulate_string() { 156 | let re = Regex::String("ab".to_owned()); 157 | let mut nfa: NFA = NFA::new(); 158 | nfa.add_regex(&Default::default(), &re, None, 1); 159 | 160 | test_simulate( 161 | &nfa, 162 | vec![ 163 | ("a", vec![], Some(0)), 164 | ("ab", vec![("ab", 1)], None), 165 | ("abc", vec![("ab", 1)], Some(2)), 166 | ], 167 | ); 168 | } 169 | 170 | #[test] 171 | fn simulate_char_set_char() { 172 | let re = Regex::CharSet(CharSet(vec![ 173 | CharOrRange::Char('a'), 174 | CharOrRange::Char('b'), 175 | ])); 176 | let mut nfa: NFA = NFA::new(); 177 | nfa.add_regex(&Default::default(), &re, None, 1); 178 | 179 | test_simulate( 180 | &nfa, 181 | vec![ 182 | ("a", vec![("a", 1)], None), 183 | ("b", vec![("b", 1)], None), 184 | ("ab", vec![("a", 1), ("b", 1)], None), 185 | ("ba", vec![("b", 1), ("a", 1)], None), 186 | ], 187 | ); 188 | } 189 | 190 | #[test] 191 | fn simulate_char_set_range() { 192 | let re = 
Regex::CharSet(CharSet(vec![ 193 | CharOrRange::Char('a'), 194 | CharOrRange::Char('b'), 195 | CharOrRange::Range('0', '9'), 196 | ])); 197 | let mut nfa: NFA = NFA::new(); 198 | nfa.add_regex(&Default::default(), &re, None, 1); 199 | 200 | test_simulate( 201 | &nfa, 202 | vec![("ab09", vec![("a", 1), ("b", 1), ("0", 1), ("9", 1)], None)], 203 | ); 204 | } 205 | 206 | #[test] 207 | fn simulate_zero_or_more() { 208 | let re = Regex::ZeroOrMore(Box::new(Regex::Char('a'))); 209 | let mut nfa: NFA = NFA::new(); 210 | nfa.add_regex(&Default::default(), &re, None, 1); 211 | 212 | test_simulate( 213 | &nfa, 214 | vec![ 215 | // TODO 216 | // ("", vec![], None), 217 | ("a", vec![("a", 1)], None), 218 | ("aa", vec![("aa", 1)], None), 219 | ("aab", vec![("aa", 1)], Some(2)), 220 | ], 221 | ); 222 | } 223 | 224 | #[test] 225 | fn simulate_one_or_more() { 226 | let re = Regex::OneOrMore(Box::new(Regex::Char('a'))); 227 | let mut nfa: NFA = NFA::new(); 228 | nfa.add_regex(&Default::default(), &re, None, 1); 229 | 230 | test_simulate( 231 | &nfa, 232 | vec![ 233 | ("", vec![], Some(0)), 234 | ("a", vec![("a", 1)], None), 235 | ("aa", vec![("aa", 1)], None), 236 | ("aab", vec![("aa", 1)], Some(2)), 237 | ], 238 | ); 239 | } 240 | 241 | #[test] 242 | fn simulate_zero_or_one() { 243 | let re = Regex::ZeroOrOne(Box::new(Regex::Char('a'))); 244 | let mut nfa: NFA = NFA::new(); 245 | nfa.add_regex(&Default::default(), &re, None, 1); 246 | 247 | test_simulate( 248 | &nfa, 249 | vec![ 250 | ("", vec![], Some(0)), 251 | ("a", vec![("a", 1)], None), 252 | ("aa", vec![("a", 1), ("a", 1)], None), 253 | ("aab", vec![("a", 1), ("a", 1)], Some(2)), 254 | ], 255 | ); 256 | } 257 | 258 | #[test] 259 | fn simulate_concat() { 260 | let re = Regex::Concat(Box::new(Regex::Char('a')), Box::new(Regex::Char('b'))); 261 | let mut nfa: NFA = NFA::new(); 262 | nfa.add_regex(&Default::default(), &re, None, 1); 263 | 264 | test_simulate( 265 | &nfa, 266 | vec![ 267 | ("a", vec![], Some(0)), 268 | ("ab", vec![("ab", 1)], None), 269 | ("aba", vec![("ab", 1)], Some(2)), 270 | ], 271 | ); 272 | } 273 | 274 | #[test] 275 | fn simulate_or() { 276 | let re = Regex::Or(Box::new(Regex::Char('a')), Box::new(Regex::Char('b'))); 277 | let mut nfa: NFA = NFA::new(); 278 | nfa.add_regex(&Default::default(), &re, None, 1); 279 | 280 | test_simulate( 281 | &nfa, 282 | vec![ 283 | ("a", vec![("a", 1)], None), 284 | ("b", vec![("b", 1)], None), 285 | ("ab", vec![("a", 1), ("b", 1)], None), 286 | ], 287 | ); 288 | } 289 | 290 | #[test] 291 | fn simulate_or_one_or_more_char() { 292 | let re = Regex::Or( 293 | Box::new(Regex::OneOrMore(Box::new(Regex::Char('a')))), 294 | Box::new(Regex::Char('b')), 295 | ); 296 | let mut nfa: NFA = NFA::new(); 297 | nfa.add_regex(&Default::default(), &re, None, 1); 298 | 299 | test_simulate( 300 | &nfa, 301 | vec![ 302 | ("a", vec![("a", 1)], None), 303 | ("b", vec![("b", 1)], None), 304 | ("aa", vec![("aa", 1)], None), 305 | ], 306 | ); 307 | } 308 | 309 | #[test] 310 | fn simulate_multiple_accepting_states_1() { 311 | let re1 = Regex::String("aaaa".to_owned()); 312 | let re2 = Regex::String("aaab".to_owned()); 313 | let mut nfa: NFA = NFA::new(); 314 | nfa.add_regex(&Default::default(), &re1, None, 1); 315 | nfa.add_regex(&Default::default(), &re2, None, 2); 316 | 317 | test_simulate( 318 | &nfa, 319 | vec![ 320 | ("aaaa", vec![("aaaa", 1)], None), 321 | ("aaab", vec![("aaab", 2)], None), 322 | ("aaac", vec![], Some(0)), 323 | ], 324 | ); 325 | } 326 | 327 | #[test] 328 | fn multiple_accepting_states_2() { 329 | let 
330 |         Box::new(Regex::OneOrMore(Box::new(Regex::Char('a')))),
331 |         Box::new(Regex::Char('b')),
332 |     );
333 |     let re2 = Regex::CharSet(CharSet(vec![CharOrRange::Range('0', '9')]));
334 |     let mut nfa: NFA<usize> = NFA::new();
335 |     nfa.add_regex(&Default::default(), &re1, None, 1);
336 |     nfa.add_regex(&Default::default(), &re2, None, 2);
337 | 
338 |     test_simulate(
339 |         &nfa,
340 |         vec![
341 |             ("b", vec![("b", 1)], None),
342 |             ("a", vec![("a", 1)], None),
343 |             ("aa", vec![("aa", 1)], None),
344 |             ("0", vec![("0", 2)], None),
345 |         ],
346 |     );
347 | }
348 | 
349 | #[test]
350 | fn simulate_variables() {
351 |     let mut bindings: Map<Var, Regex> = Default::default();
352 |     bindings.insert(
353 |         Var("initial".to_owned()),
354 |         Regex::CharSet(CharSet(vec![CharOrRange::Range('a', 'z')])),
355 |     );
356 |     bindings.insert(
357 |         Var("subsequent".to_owned()),
358 |         Regex::CharSet(CharSet(vec![
359 |             CharOrRange::Range('a', 'z'),
360 |             CharOrRange::Range('A', 'Z'),
361 |             CharOrRange::Range('0', '9'),
362 |             CharOrRange::Char('-'),
363 |             CharOrRange::Char('_'),
364 |         ])),
365 |     );
366 |     let re = Regex::Concat(
367 |         Box::new(Regex::Var(Var("initial".to_owned()))),
368 |         Box::new(Regex::ZeroOrMore(Box::new(Regex::Var(Var(
369 |             "subsequent".to_owned()
370 |         ))))),
371 |     );
372 |     let mut nfa: NFA<usize> = NFA::new();
373 |     nfa.add_regex(&bindings, &re, None, 1);
374 | 
375 |     test_simulate(
376 |         &nfa,
377 |         vec![
378 |             ("a", vec![("a", 1)], None),
379 |             ("aA", vec![("aA", 1)], None),
380 |             ("aA123-a", vec![("aA123-a", 1)], None),
381 |         ],
382 |     );
383 | }
384 | 
385 | #[test]
386 | fn zero_or_more_concat_confusion_1() {
387 |     let mut nfa: NFA<usize> = NFA::new();
388 | 
389 |     let re = Regex::Concat(
390 |         Box::new(Regex::ZeroOrMore(Box::new(Regex::Char('a')))),
391 |         Box::new(Regex::Char('a')),
392 |     );
393 | 
394 |     nfa.add_regex(&Default::default(), &re, None, 1);
395 | 
396 |     test_simulate(
397 |         &nfa,
398 |         vec![("a", vec![("a", 1)], None), ("aa", vec![("aa", 1)], None)],
399 |     );
400 | }
401 | 
402 | #[test]
403 | fn zero_or_more_concat_confusion_2() {
404 |     let mut nfa: NFA<usize> = NFA::new();
405 | 
406 |     let re = Regex::Concat(
407 |         Box::new(Regex::ZeroOrMore(Box::new(Regex::Char('a')))),
408 |         Box::new(Regex::String("ab".to_owned())),
409 |     );
410 | 
411 |     nfa.add_regex(&Default::default(), &re, None, 1);
412 | 
413 |     test_simulate(
414 |         &nfa,
415 |         vec![
416 |             ("ab", vec![("ab", 1)], None),
417 |             ("aab", vec![("aab", 1)], None),
418 |         ],
419 |     );
420 | }
421 | 
422 | #[test]
423 | fn zero_or_more_concat_confusion_3() {
424 |     let mut nfa: NFA<usize> = NFA::new();
425 | 
426 |     let re = Regex::Concat(
427 |         Box::new(Regex::Concat(
428 |             Box::new(Regex::Char('a')),
429 |             Box::new(Regex::ZeroOrMore(Box::new(Regex::Char('a')))),
430 |         )),
431 |         Box::new(Regex::Char('a')),
432 |     );
433 | 
434 |     nfa.add_regex(&Default::default(), &re, None, 1);
435 | 
436 |     test_simulate(
437 |         &nfa,
438 |         vec![
439 |             ("a", vec![], Some(0)),
440 |             ("aa", vec![("aa", 1)], None),
441 |             ("aaa", vec![("aaa", 1)], None),
442 |         ],
443 |     );
444 | }
445 | 
446 | #[test]
447 | fn simulate_any_1() {
448 |     let mut nfa: NFA<usize> = NFA::new();
449 | 
450 |     nfa.add_regex(
451 |         &Default::default(),
452 |         &Regex::String("ab".to_owned()),
453 |         None,
454 |         1,
455 |     );
456 |     nfa.add_regex(&Default::default(), &Regex::Any, None, 2);
457 | 
458 |     test_simulate(
459 |         &nfa,
460 |         vec![
461 |             ("a", vec![("a", 2)], None),
462 |             ("ab", vec![("ab", 1)], None),
463 |             ("abc", vec![("ab", 1), ("c", 2)], None),
464 |         ],
465 |     );
466 | }
467 | 
468 | #[test]
469 | fn simulate_any_2() {
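    // `Regex::Any` matches exactly one character. The regex built below is
    // '\'' _ '\'' in lexer syntax: any single character between two single
    // quotes.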
470 |     let mut nfa: NFA<usize> = NFA::new();
471 | 
472 |     nfa.add_regex(
473 |         &Default::default(),
474 |         &Regex::Concat(
475 |             Box::new(Regex::Char('\'')),
476 |             Box::new(Regex::Concat(
477 |                 Box::new(Regex::Any),
478 |                 Box::new(Regex::Char('\'')),
479 |             )),
480 |         ),
481 |         None,
482 |         1,
483 |     );
484 | 
485 |     test_simulate(&nfa, vec![("'a'", vec![("'a'", 1)], None)]);
486 | }
487 | 
488 | #[test]
489 | fn simulate_end_of_input_1() {
490 |     let mut nfa: NFA<usize> = NFA::new();
491 | 
492 |     // C-style single-line comment syntax: "//" _* ('\n' | $)
493 |     nfa.add_regex(
494 |         &Default::default(),
495 |         &Regex::Concat(
496 |             Box::new(Regex::String("//".to_owned())),
497 |             Box::new(Regex::Concat(
498 |                 Box::new(Regex::ZeroOrMore(Box::new(Regex::Any))),
499 |                 Box::new(Regex::Or(
500 |                     Box::new(Regex::Char('\n')),
501 |                     Box::new(Regex::EndOfInput),
502 |                 )),
503 |             )),
504 |         ),
505 |         None,
506 |         1,
507 |     );
508 | 
509 |     test_simulate(
510 |         &nfa,
511 |         vec![
512 |             ("//", vec![("//", 1)], None),
513 |             ("// \n", vec![("// \n", 1)], None),
514 |             ("// ", vec![("// ", 1)], None),
515 |         ],
516 |     );
517 | }
518 | 
519 | #[test]
520 | fn simulate_end_of_input_2() {
521 |     let mut nfa: NFA<usize> = NFA::new();
522 | 
523 |     nfa.add_regex(&Default::default(), &Regex::EndOfInput, None, 1);
524 |     nfa.add_regex(
525 |         &Default::default(),
526 |         &Regex::ZeroOrMore(Box::new(Regex::Any)),
527 |         None,
528 |         2,
529 |     );
530 | 
531 |     // TODO: EndOfInput never matches?
532 |     test_simulate(&nfa, vec![("a", vec![("a", 2)], None)]);
533 | }
534 | 
535 | #[test]
536 | fn simulate_multiple_accepting_states_3() {
537 |     let mut nfa: NFA<usize> = NFA::new();
538 | 
539 |     nfa.add_regex(
540 |         &Default::default(),
541 |         &Regex::String("aaa".to_owned()),
542 |         None,
543 |         1,
544 |     );
545 |     nfa.add_regex(
546 |         &Default::default(),
547 |         &Regex::String("aaa".to_owned()),
548 |         None,
549 |         2,
550 |     );
551 |     nfa.add_regex(
552 |         &Default::default(),
553 |         &Regex::String("aa".to_owned()),
554 |         None,
555 |         3,
556 |     );
557 | 
558 |     test_simulate(
559 |         &nfa,
560 |         vec![
561 |             ("aaa", vec![("aaa", 1)], None),
562 |             ("aa", vec![("aa", 3)], None),
563 |         ],
564 |     );
565 | }
566 | 
567 | #[test]
568 | fn range_and_char_confusion() {
569 |     let mut nfa: NFA<usize> = NFA::new();
570 | 
571 |     nfa.add_regex(
572 |         &Default::default(),
573 |         &Regex::String("ab".to_owned()),
574 |         None,
575 |         1,
576 |     );
577 |     nfa.add_regex(
578 |         &Default::default(),
579 |         &Regex::OneOrMore(Box::new(Regex::CharSet(CharSet(vec![CharOrRange::Range(
580 |             'a', 'z',
581 |         )])))),
582 |         None,
583 |         2,
584 |     );
585 | 
586 |     test_simulate(
587 |         &nfa,
588 |         vec![("ab", vec![("ab", 1)], None), ("ac", vec![("ac", 2)], None)],
589 |     );
590 | }
591 | 
592 | #[test]
593 | fn overlapping_ranges() {
594 |     let mut nfa: NFA<usize> = NFA::new();
595 | 
596 |     nfa.add_regex(
597 |         &Default::default(),
598 |         &Regex::Concat(
599 |             Box::new(Regex::CharSet(CharSet(vec![CharOrRange::Range('a', 'b')]))),
600 |             Box::new(Regex::Char('1')),
601 |         ),
602 |         None,
603 |         1,
604 |     );
605 |     nfa.add_regex(
606 |         &Default::default(),
607 |         &Regex::Concat(
608 |             Box::new(Regex::CharSet(CharSet(vec![CharOrRange::Range('a', 'c')]))),
609 |             Box::new(Regex::Char('2')),
610 |         ),
611 |         None,
612 |         2,
613 |     );
614 | 
615 |     test_simulate(
616 |         &nfa,
617 |         vec![("a1", vec![("a1", 1)], None), ("a2", vec![("a2", 2)], None)],
618 |     );
619 | }
620 | 
621 | #[test]
622 | fn right_context_1() {
623 |     let mut nfa: NFA<usize> = NFA::new();
624 |     let mut right_ctxs = RightCtxDFAs::new();
625 | 
626 |     let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::Char('a'));
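    // 'a' with right context 'a': the lookahead must match for the rule to
    // fire, but it is not consumed, so only the first 'a' is part of the
    // match.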
627 |     nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1);
628 | 
629 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("aa", vec![("a", 1)], Some(1))]);
630 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("ab", vec![], Some(0))]);
631 | }
632 | 
633 | #[test]
634 | fn right_context_2() {
635 |     let mut nfa: NFA<usize> = NFA::new();
636 |     let mut right_ctxs = RightCtxDFAs::new();
637 | 
638 |     let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::Any);
639 |     nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1);
640 | 
641 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("aa", vec![("a", 1)], Some(1))]);
642 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("ab", vec![("a", 1)], Some(1))]);
643 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("a", vec![], Some(0))]);
644 | }
645 | 
646 | #[test]
647 | fn right_context_3() {
648 |     let mut nfa: NFA<usize> = NFA::new();
649 |     let mut right_ctxs = RightCtxDFAs::new();
650 | 
651 |     let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::EndOfInput);
652 |     nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1);
653 | 
654 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("a", vec![("a", 1)], None)]);
655 |     test_simulate_right_ctx(&nfa, &right_ctxs, vec![("ab", vec![], Some(0))]);
656 | }
657 | 
658 | #[test]
659 | fn right_context_4() {
660 |     let mut nfa: NFA<usize> = NFA::new();
661 |     let mut right_ctxs = RightCtxDFAs::new();
662 | 
663 |     let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::Char('a'));
664 |     nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 1);
665 | 
666 |     let right_ctx = right_ctxs.new_right_ctx(&Default::default(), &Regex::EndOfInput);
667 |     nfa.add_regex(&Default::default(), &Regex::Char('a'), Some(right_ctx), 2);
668 | 
669 |     test_simulate_right_ctx(
670 |         &nfa,
671 |         &right_ctxs,
672 |         vec![("aa", vec![("a", 1), ("a", 2)], None)],
673 |     );
674 | }
675 | 
--------------------------------------------------------------------------------
/crates/lexgen/tests/bugs.rs:
--------------------------------------------------------------------------------
1 | mod test_utils;
2 | 
3 | use lexgen::lexer;
4 | use lexgen_util::{LexerError, LexerErrorKind};
5 | use test_utils::{loc, next};
6 | 
7 | #[test]
8 | fn failure_confusion_1() {
9 |     // The bug: in the lexer below, when the input is "\\\"", the first backslash would be pushed
10 |     // to the string buffer by the catch-all (now called "failure") case. The correct behaviour
11 |     // is that the failure case should only run if none of the other rules match to completion.
12 | 
13 |     #[derive(Debug, Default)]
14 |     struct LexerState {
15 |         buf: String,
16 |     }
17 | 
18 |     lexer!
{ 19 | Lexer(LexerState) -> String; 20 | 21 | let whitespace = [' ' '\t' '\n']; 22 | 23 | '"' => |lexer| { 24 | println!("matched a double quote"); 25 | let str = std::mem::take(&mut lexer.state().buf); 26 | lexer.return_(str) 27 | }, 28 | 29 | "\\\"" => |lexer| { 30 | println!("matched an escaped double quote"); 31 | lexer.state().buf.push('"'); 32 | lexer.continue_() 33 | }, 34 | 35 | _ => |lexer| { 36 | let char = lexer.match_().chars().next_back().unwrap(); 37 | println!("wildcard matched {:?}", char); 38 | lexer.state().buf.push(char); 39 | lexer.continue_() 40 | }, 41 | } 42 | 43 | let mut lexer = Lexer::new("test\""); 44 | assert_eq!(next(&mut lexer), Some(Ok("test".to_owned()))); 45 | assert_eq!(next(&mut lexer), None); 46 | 47 | let mut lexer = Lexer::new("\\\"\""); 48 | assert_eq!(next(&mut lexer), Some(Ok("\"".to_owned()))); 49 | assert_eq!(next(&mut lexer), None); 50 | } 51 | 52 | #[test] 53 | fn failure_confusion_2() { 54 | // Similar to the bug above: the failure case should run if none of the other rules match to 55 | // completion. 56 | 57 | #[derive(Debug, Default)] 58 | struct LexerState { 59 | comment_depth: usize, 60 | } 61 | 62 | lexer! { 63 | Lexer(LexerState) -> (); 64 | 65 | 66 | rule Init { 67 | ' ', 68 | 69 | "(*" => |lexer| { 70 | lexer.state().comment_depth = 1; 71 | lexer.switch(LexerRule::Comment) 72 | }, 73 | } 74 | 75 | rule Comment { 76 | "(*" => |lexer| { 77 | let depth = &mut lexer.state().comment_depth; 78 | *depth += 1; 79 | lexer.continue_() 80 | }, 81 | 82 | "*)" => |lexer| { 83 | let depth = &mut lexer.state().comment_depth; 84 | if *depth == 1 { 85 | lexer.switch(LexerRule::Init) 86 | } else { 87 | *depth -= 1; 88 | lexer.continue_() 89 | } 90 | }, 91 | 92 | _, 93 | } 94 | } 95 | 96 | let mut lexer = Lexer::new("(* * *) (* (* ** *) *)"); 97 | assert_eq!(lexer.next(), None); 98 | } 99 | 100 | #[test] 101 | fn failure_confusion_3_1() { 102 | lexer! { 103 | Lexer -> usize; 104 | 105 | ' ' = 0, 106 | "ab" = 1, 107 | _ = 2, 108 | } 109 | 110 | let mut lexer = Lexer::new("a ab abc"); 111 | assert_eq!(next(&mut lexer), Some(Ok(2))); 112 | assert_eq!(next(&mut lexer), Some(Ok(0))); 113 | assert_eq!(next(&mut lexer), Some(Ok(1))); 114 | assert_eq!(next(&mut lexer), Some(Ok(0))); 115 | assert_eq!(next(&mut lexer), Some(Ok(1))); 116 | assert_eq!(next(&mut lexer), Some(Ok(2))); 117 | assert_eq!(next(&mut lexer), None); 118 | } 119 | 120 | #[test] 121 | fn failure_confusion_3_2() { 122 | // In practice the case we test in the previous test happens when lexing single-letter 123 | // identifiers in a lexer that allows multi-letter identifiers (i.e. practically all language 124 | // lexers). Here's a more realistic example: 125 | lexer! { 126 | Lexer -> usize; 127 | 128 | $$ascii_lowercase+ = 1, 129 | ',' = 2, 130 | } 131 | 132 | let mut lexer = Lexer::new("f,"); 133 | assert_eq!(next(&mut lexer), Some(Ok(1))); 134 | assert_eq!(next(&mut lexer), Some(Ok(2))); 135 | assert_eq!(next(&mut lexer), None); 136 | } 137 | 138 | #[test] 139 | fn failure_confusion_4() { 140 | lexer! { 141 | Lexer -> u32; 142 | 143 | ' ', 144 | "aaa" = 1, 145 | "aa" = 2, 146 | _ = 3, 147 | } 148 | 149 | let mut lexer = Lexer::new("aaa aa a"); 150 | 151 | assert_eq!(next(&mut lexer), Some(Ok(1))); 152 | assert_eq!(next(&mut lexer), Some(Ok(2))); 153 | assert_eq!(next(&mut lexer), Some(Ok(3))); 154 | assert_eq!(next(&mut lexer), None); 155 | } 156 | 157 | #[test] 158 | fn continue_confusion_1() { 159 | lexer! 
{
160 |         Lexer -> u32;
161 | 
162 |         _,
163 |     }
164 | 
165 |     let mut lexer = Lexer::new("");
166 |     assert_eq!(lexer.next(), None);
167 | 
168 |     let mut lexer = Lexer::new("a");
169 |     assert_eq!(lexer.next(), None);
170 | 
171 |     let mut lexer = Lexer::new("aaa");
172 |     assert_eq!(lexer.next(), None);
173 | }
174 | 
175 | #[test]
176 | fn continue_confusion_2() {
177 |     lexer! {
178 |         Lexer -> u32;
179 | 
180 |         rule Init {
181 |             _ => |lexer| lexer.switch(LexerRule::Test),
182 |         }
183 | 
184 |         // Previously failure code would run on end-of-stream, which resets the state to `Test` and
185 |         // continues, causing a loop.
186 |         //
187 |         // This issue does not exist in `Init` as we explicitly handle EOF there, to stop the main
188 |         // loop.
189 |         //
190 |         // Instead end-of-stream in a state other than `Init` should fail with "unexpected EOF".
191 |         rule Test {
192 |             _,
193 |         }
194 |     }
195 | 
196 |     let mut lexer = Lexer::new("a");
197 |     assert!(matches!(lexer.next(), Some(Err(_))));
198 | 
199 |     let mut lexer = Lexer::new("aa");
200 |     assert!(matches!(lexer.next(), Some(Err(_))));
201 | }
202 | 
203 | #[test]
204 | fn return_should_reset_match() {
205 |     lexer! {
206 |         Lexer -> &'input str;
207 | 
208 |         rule Init {
209 |             "aaa" => |lexer| {
210 |                 let match_ = lexer.match_();
211 |                 lexer.switch_and_return(LexerRule::State1, match_)
212 |             },
213 |         }
214 | 
215 |         rule State1 {
216 |             "bbb" => |lexer| {
217 |                 let match_ = lexer.match_();
218 |                 lexer.switch_and_return(LexerRule::Init, match_)
219 |             },
220 |         }
221 |     }
222 | 
223 |     let mut lexer = Lexer::new("aaabbb");
224 |     assert_eq!(next(&mut lexer), Some(Ok("aaa")));
225 |     assert_eq!(next(&mut lexer), Some(Ok("bbb")));
226 |     assert_eq!(next(&mut lexer), None);
227 | }
228 | 
229 | #[test]
230 | fn issue_16_backtracking_1() {
231 |     lexer! {
232 |         Lexer -> &'input str;
233 | 
234 |         'a'+ 'b' => |lexer| {
235 |             let match_ = lexer.match_();
236 |             lexer.return_(match_)
237 |         },
238 | 
239 |         'a' => |lexer| {
240 |             let match_ = lexer.match_();
241 |             lexer.return_(match_)
242 |         },
243 |     }
244 | 
245 |     let mut lexer = Lexer::new("aaaab");
246 |     assert_eq!(next(&mut lexer), Some(Ok("aaaab")));
247 |     assert_eq!(next(&mut lexer), None);
248 | 
249 |     let mut lexer = Lexer::new("aaaa");
250 |     assert_eq!(next(&mut lexer), Some(Ok("a")));
251 |     assert_eq!(next(&mut lexer), Some(Ok("a")));
252 |     assert_eq!(next(&mut lexer), Some(Ok("a")));
253 |     assert_eq!(next(&mut lexer), Some(Ok("a")));
254 |     assert_eq!(next(&mut lexer), None);
255 | }
256 | 
257 | #[test]
258 | fn issue_16_backtracking_2() {
259 |     fn return_match<'input, I: Iterator<Item = char> + Clone>(
260 |         lexer: &mut Lexer<'input, I>,
261 |     ) -> lexgen_util::SemanticActionResult<&'input str> {
262 |         let match_ = lexer.match_();
263 |         lexer.return_(match_)
264 |     }
265 | 
266 |     lexer! {
267 |         Lexer -> &'input str;
268 | 
269 |         "xyzxyz" => return_match,
270 |         "xyz" => return_match,
271 |         "xya" => return_match,
272 |     }
273 | 
274 |     let mut lexer = Lexer::new("xyzxya");
275 |     assert_eq!(next(&mut lexer), Some(Ok("xyz")));
276 |     assert_eq!(next(&mut lexer), Some(Ok("xya")));
277 |     assert_eq!(next(&mut lexer), None);
278 | }
279 | 
280 | #[test]
281 | fn end_of_input_handling() {
282 |     lexer! {
283 |         Lexer -> (usize, &'input str);
284 | 
285 |         rule Init {
286 |             'a' => |lexer| {
287 |                 let match_ = lexer.match_();
288 |                 lexer.switch_and_return(LexerRule::Rule1, (0, match_))
289 |             },
290 |         }
291 | 
292 |         rule Rule1 {
293 |             $,
294 | 
295 |             'a' => |lexer| {
296 |                 let match_ = lexer.match_();
297 |                 lexer.return_((1, match_))
298 |             },
299 |         }
300 |     }
301 | 
302 |     let mut lexer = Lexer::new("aa");
303 |     assert_eq!(
304 |         lexer.next(),
305 |         Some(Ok((loc(0, 0, 0), (0, "a"), loc(0, 1, 1))))
306 |     );
307 |     assert_eq!(
308 |         lexer.next(),
309 |         Some(Ok((loc(0, 1, 1), (1, "a"), loc(0, 2, 2))))
310 |     );
311 |     assert_eq!(lexer.next(), None);
312 | }
313 | 
314 | #[test]
315 | fn empty_rule_simplification_issue_27() {
316 |     // Tests that:
317 |     //
318 |     // 1. Simplifier doesn't eliminate empty (i.e. no outgoing transitions) initial states without
319 |     //    incoming transitions. Since initial states can be switched to in semantic actions we
320 |     //    cannot know that we won't ever switch to them, so we cannot eliminate them.
321 |     //
322 |     // 2. When running a semantic action we reset `last_match` so if the next state is empty we
323 |     //    fail, instead of backtracking.
324 | 
325 |     lexer! {
326 |         Lexer -> &'input str;
327 | 
328 |         rule Init {
329 |             "0x" => |lexer| lexer.switch(LexerRule::HexInt),
330 |             '0' => |lexer| lexer.switch(LexerRule::DecInt),
331 |         }
332 | 
333 |         rule DecInt {
334 |             _ => |lexer| lexer.return_("wat"),
335 |         }
336 | 
337 |         rule HexInt {}
338 |     }
339 | 
340 |     let mut lexer = Lexer::new("0xff");
341 | 
342 |     // This used to return `Some("wat")` with the bug
343 |     assert_eq!(
344 |         next(&mut lexer),
345 |         Some(Err(LexerError {
346 |             location: loc(0, 0, 0),
347 |             kind: LexerErrorKind::InvalidToken,
348 |         }))
349 |     );
350 | }
351 | 
352 | #[test]
353 | fn range_any_overlap_issue_31() {
354 |     lexer! {
355 |         Lexer -> usize;
356 | 
357 |         "'" _ "'" = 1,
358 |         "'" ['a'-'z']+ = 2,
359 |     }
360 | 
361 |     let input = "'a'";
362 |     let mut lexer = Lexer::new(input);
363 |     assert_eq!(lexer.next(), Some(Ok((loc(0, 0, 0), 1, loc(0, 3, 3)))));
364 |     assert_eq!(lexer.next(), None);
365 | }
366 | 
367 | #[test]
368 | fn failure_should_reset_state_issue_48() {
369 |     lexer! {
370 |         Lexer -> &'input str;
371 | 
372 |         rule Init {
373 |             's' => |lexer|
374 |                 lexer.switch_and_return(LexerRule::InString, lexer.match_()),
375 |         }
376 | 
377 |         rule InString {
378 |             'a' => |lexer|
379 |                 lexer.switch_and_return(LexerRule::Init, lexer.match_()),
380 |         }
381 |     }
382 | 
383 |     let input = "sxasa";
384 |     let mut lexer = Lexer::new(input);
385 | 
386 |     assert_eq!(lexer.next(), Some(Ok((loc(0, 0, 0), "s", loc(0, 1, 1)))));
387 |     assert_eq!(
388 |         lexer.next(),
389 |         Some(Err(LexerError {
390 |             location: loc(0, 1, 1),
391 |             kind: LexerErrorKind::InvalidToken
392 |         }))
393 |     );
394 |     assert_eq!(
395 |         lexer.next(),
396 |         Some(Err(LexerError {
397 |             location: loc(0, 2, 2),
398 |             kind: LexerErrorKind::InvalidToken
399 |         }))
400 |     );
401 |     assert_eq!(lexer.next(), Some(Ok((loc(0, 3, 3), "s", loc(0, 4, 4)))));
402 |     assert_eq!(lexer.next(), Some(Ok((loc(0, 4, 4), "a", loc(0, 5, 5)))));
403 |     assert_eq!(lexer.next(), None);
404 | }
405 | 
406 | #[test]
407 | fn new_methods_no_default() {
408 |     // #54: `new_with_state` and `new_from_iter_with_state` shouldn't require state to implement
409 |     // `Default`
410 | 
411 |     struct UserState {}
412 | 
413 |     lexer!
{ 414 | Lexer(UserState) -> (); 415 | 416 | $ = (), 417 | } 418 | 419 | Lexer::new_with_state("", UserState {}); 420 | Lexer::new_from_iter_with_state(std::iter::empty(), UserState {}); 421 | } 422 | 423 | #[test] 424 | fn new_methods_default() { 425 | // #54: `new` and `new_from_iter` should work with user state that implements `Default` 426 | 427 | #[derive(Default)] 428 | struct UserState {} 429 | 430 | lexer! { 431 | Lexer(UserState) -> (); 432 | 433 | $ = (), 434 | } 435 | 436 | Lexer::new(""); 437 | Lexer::new_from_iter(std::iter::empty()); 438 | } 439 | -------------------------------------------------------------------------------- /crates/lexgen/tests/lua_5_1.rs: -------------------------------------------------------------------------------- 1 | // A Lua 5.1 lexer. We use this as 2 | // 3 | // - An example: this file is linked from README 4 | // 5 | // - A test: `test_data` contains all Lua files in Lua 5.1 source distribution, we lex it using 6 | // this lexer as a test. 7 | // 8 | // - A benchmark: We also use `test_data` lexing time as a runtime benchmark. 9 | 10 | use lexgen::lexer; 11 | 12 | //////////////////////////////////////////////////////////////////////////////// 13 | // // 14 | // Lexer definition and tests // 15 | // // 16 | //////////////////////////////////////////////////////////////////////////////// 17 | 18 | #[derive(Debug, PartialEq, Eq, Clone)] 19 | enum Token<'input> { 20 | Plus, 21 | Minus, 22 | Star, 23 | Slash, 24 | Percent, 25 | Caret, 26 | Hash, 27 | EqEq, 28 | TildeEq, 29 | LtEq, 30 | GtEq, 31 | Lt, 32 | Gt, 33 | Eq, 34 | LParen, 35 | RParen, 36 | LBrace, 37 | RBrace, 38 | LBracket, 39 | RBracket, 40 | Semicolon, 41 | Colon, 42 | Comma, 43 | Dot, 44 | DotDot, 45 | DotDotDot, 46 | Keyword(Keyword), 47 | String(StringToken<'input>), 48 | Var(&'input str), 49 | Number(&'input str), // uninterpreted 50 | } 51 | 52 | /// Raw string tokens are borrowed from the input string. Interpreted strings are copied and owned. 53 | #[derive(Debug, PartialEq, Eq, Clone)] 54 | enum StringToken<'input> { 55 | Raw(&'input str), 56 | Interpreted(String), 57 | } 58 | 59 | #[derive(Debug, PartialEq, Eq, Clone)] 60 | enum Keyword { 61 | And, 62 | Break, 63 | Do, 64 | Else, 65 | ElseIf, 66 | End, 67 | False, 68 | For, 69 | Function, 70 | If, 71 | In, 72 | Local, 73 | Nil, 74 | Not, 75 | Or, 76 | Repeat, 77 | Return, 78 | Then, 79 | True, 80 | Until, 81 | While, 82 | } 83 | 84 | #[derive(Debug, Default, Clone)] 85 | struct LexerState { 86 | /// Number of opening `=`s seen when parsing a long string 87 | long_string_opening_eqs: usize, 88 | /// Number of closing `=`s seen when parsing a long string 89 | long_string_closing_eqs: usize, 90 | /// When parsing a short string, whether it's started with a double or single quote 91 | short_string_delim: Quote, 92 | /// Buffer for strings 93 | string_buf: String, 94 | /// When parsing a long string, whether we're inside a comment or not. When inside a comment we 95 | /// don't return a token. Otherwise we return a string. 96 | in_comment: bool, 97 | } 98 | 99 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] 100 | enum Quote { 101 | #[default] // arbitrary 102 | Single, 103 | Double, 104 | } 105 | 106 | lexer! 
{ 107 | Lexer(LexerState) -> Token<'input>; 108 | 109 | let whitespace = [' ' '\t' '\n'] | "\r\n"; 110 | 111 | rule Init { 112 | $whitespace, 113 | 114 | "+" = Token::Plus, 115 | "-" = Token::Minus, 116 | "*" = Token::Star, 117 | "/" = Token::Slash, 118 | "%" = Token::Percent, 119 | "^" = Token::Caret, 120 | "#" = Token::Hash, 121 | "==" = Token::EqEq, 122 | "~=" = Token::TildeEq, 123 | "<=" = Token::LtEq, 124 | ">=" = Token::GtEq, 125 | "<" = Token::Lt, 126 | ">" = Token::Gt, 127 | "=" = Token::Eq, 128 | "(" = Token::LParen, 129 | ")" = Token::RParen, 130 | "{" = Token::LBrace, 131 | "}" = Token::RBrace, 132 | "]" = Token::RBracket, 133 | ";" = Token::Semicolon, 134 | ":" = Token::Colon, 135 | "," = Token::Comma, 136 | "." = Token::Dot, 137 | ".." = Token::DotDot, 138 | "..." = Token::DotDotDot, 139 | "and" = Token::Keyword(Keyword::And), 140 | "break" = Token::Keyword(Keyword::Break), 141 | "do" = Token::Keyword(Keyword::Do), 142 | "else" = Token::Keyword(Keyword::Else), 143 | "elseif" = Token::Keyword(Keyword::ElseIf), 144 | "end" = Token::Keyword(Keyword::End), 145 | "false" = Token::Keyword(Keyword::False), 146 | "for" = Token::Keyword(Keyword::For), 147 | "function" = Token::Keyword(Keyword::Function), 148 | "if" = Token::Keyword(Keyword::If), 149 | "in" = Token::Keyword(Keyword::In), 150 | "local" = Token::Keyword(Keyword::Local), 151 | "nil" = Token::Keyword(Keyword::Nil), 152 | "not" = Token::Keyword(Keyword::Not), 153 | "or" = Token::Keyword(Keyword::Or), 154 | "repeat" = Token::Keyword(Keyword::Repeat), 155 | "return" = Token::Keyword(Keyword::Return), 156 | "then" = Token::Keyword(Keyword::Then), 157 | "true" = Token::Keyword(Keyword::True), 158 | "until" = Token::Keyword(Keyword::Until), 159 | "while" = Token::Keyword(Keyword::While), 160 | 161 | '"' => |lexer| { 162 | lexer.state().short_string_delim = Quote::Double; 163 | lexer.state().string_buf.clear(); 164 | lexer.switch(LexerRule::String) 165 | }, 166 | 167 | '\'' => |lexer| { 168 | lexer.state().short_string_delim = Quote::Single; 169 | lexer.state().string_buf.clear(); 170 | lexer.switch(LexerRule::String) 171 | }, 172 | 173 | "[" => |lexer| { 174 | match lexer.peek() { 175 | Some('[') | Some('=') => { 176 | lexer.state().long_string_opening_eqs = 0; 177 | lexer.state().in_comment = false; 178 | lexer.switch(LexerRule::LongStringBracketLeft) 179 | } 180 | _ => lexer.return_(Token::LBracket), 181 | } 182 | }, 183 | 184 | "--" => |lexer| { 185 | lexer.switch(LexerRule::EnterComment) 186 | }, 187 | 188 | // > Names (also called identifiers) in Lua can be any string of letters, digits, and 189 | // > underscores, not beginning with a digit. This coincides with the definition of names 190 | // > in most languages. (The definition of letter depends on the current locale: any 191 | // > character considered alphabetic by the current locale can be used in an identifier.) 192 | let var_init = ['a'-'z' 'A'-'Z' '_']; 193 | let var_subseq = $var_init | ['0'-'9']; 194 | 195 | $var_init $var_subseq* => |lexer| { 196 | let match_ = lexer.match_(); 197 | lexer.return_(Token::Var(match_)) 198 | }, 199 | 200 | let digit = ['0'-'9']; 201 | let hex_digit = ['a'-'f' 'A'-'F' '0'-'9']; 202 | 203 | $digit+ ('.'? $digit+ (('e' | 'E') ('+'|'-')? $digit+)?)? 
=> |lexer| { 204 | let match_ = lexer.match_(); 205 | lexer.return_(Token::Number(match_)) 206 | }, 207 | 208 | "0x" $hex_digit+ => |lexer| { 209 | let match_ = lexer.match_(); 210 | lexer.return_(Token::Number(match_)) 211 | }, 212 | } 213 | 214 | rule LongStringBracketLeft { 215 | '=' => |lexer| { 216 | lexer.state().long_string_opening_eqs += 1; 217 | lexer.continue_() 218 | }, 219 | 220 | '[' => |lexer| lexer.switch(LexerRule::LongString), 221 | } 222 | 223 | rule LongString { 224 | ']' => |lexer| { 225 | lexer.state().long_string_closing_eqs = 0; 226 | lexer.switch(LexerRule::LongStringBracketRight) 227 | }, 228 | 229 | _ => |lexer| lexer.continue_(), 230 | } 231 | 232 | rule LongStringBracketRight { 233 | '=' => |lexer| { 234 | lexer.state().long_string_closing_eqs += 1; 235 | lexer.continue_() 236 | }, 237 | 238 | ']' => |lexer| { 239 | let state = lexer.state(); 240 | let in_comment = state.in_comment; 241 | let left_eqs = state.long_string_opening_eqs; 242 | let right_eqs = state.long_string_closing_eqs; 243 | if left_eqs == right_eqs { 244 | if in_comment { 245 | lexer.switch(LexerRule::Init) 246 | } else { 247 | let match_ = &lexer.match_()[left_eqs + 2..lexer.match_().len() - right_eqs - 2]; 248 | lexer.switch_and_return(LexerRule::Init, Token::String(StringToken::Raw(match_))) 249 | } 250 | } else { 251 | lexer.state().long_string_closing_eqs = 0; 252 | lexer.continue_() 253 | } 254 | }, 255 | 256 | _ => |lexer| lexer.switch(LexerRule::LongString), 257 | } 258 | 259 | rule String { 260 | '"' => |lexer| { 261 | if lexer.state().short_string_delim == Quote::Double { 262 | let str = lexer.state().string_buf.clone(); 263 | lexer.switch_and_return(LexerRule::Init, Token::String(StringToken::Interpreted(str))) 264 | } else { 265 | lexer.state().string_buf.push('"'); 266 | lexer.continue_() 267 | } 268 | }, 269 | 270 | "'" => |lexer| { 271 | if lexer.state().short_string_delim == Quote::Single { 272 | let str = lexer.state().string_buf.clone(); 273 | lexer.switch_and_return(LexerRule::Init, Token::String(StringToken::Interpreted(str))) 274 | } else { 275 | lexer.state().string_buf.push('\''); 276 | lexer.continue_() 277 | } 278 | }, 279 | 280 | "\\a" => |lexer| { 281 | lexer.state().string_buf.push('\u{7}'); 282 | lexer.continue_() 283 | }, 284 | 285 | "\\b" => |lexer| { 286 | lexer.state().string_buf.push('\u{8}'); 287 | lexer.continue_() 288 | }, 289 | 290 | "\\f" => |lexer| { 291 | lexer.state().string_buf.push('\u{c}'); 292 | lexer.continue_() 293 | }, 294 | 295 | "\\n" => |lexer| { 296 | lexer.state().string_buf.push('\n'); 297 | lexer.continue_() 298 | }, 299 | 300 | "\\r" => |lexer| { 301 | lexer.state().string_buf.push('\r'); 302 | lexer.continue_() 303 | }, 304 | 305 | "\\t" => |lexer| { 306 | lexer.state().string_buf.push('\t'); 307 | lexer.continue_() 308 | }, 309 | 310 | "\\v" => |lexer| { 311 | lexer.state().string_buf.push('\u{b}'); 312 | lexer.continue_() 313 | }, 314 | 315 | "\\\\" => |lexer| { 316 | lexer.state().string_buf.push('\\'); 317 | lexer.continue_() 318 | }, 319 | 320 | "\\\"" => |lexer| { 321 | lexer.state().string_buf.push('"'); 322 | lexer.continue_() 323 | }, 324 | 325 | "\\'" => |lexer| { 326 | lexer.state().string_buf.push('\''); 327 | lexer.continue_() 328 | }, 329 | 330 | "\\\n" => |lexer| { 331 | lexer.state().string_buf.push('\n'); 332 | lexer.continue_() 333 | }, 334 | 335 | _ => |lexer| { 336 | let char = lexer.match_().chars().next_back().unwrap(); 337 | lexer.state().string_buf.push(char); 338 | lexer.continue_() 339 | }, 340 | } 341 | 342 | 
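    // Reached from Init after seeing "--". One character of lookahead decides
    // the kind of comment: "--[[", "--[=[", etc. start a long (bracketed)
    // comment, handled by the long string rules with `in_comment` set, while
    // anything else starts a single-line comment that runs until a newline.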
    rule EnterComment {
343 |         '[' => |lexer| {
344 |             match lexer.peek() {
345 |                 Some('[') | Some('=') => {
346 |                     lexer.state().long_string_opening_eqs = 0;
347 |                     lexer.state().in_comment = true;
348 |                     lexer.switch(LexerRule::LongStringBracketLeft)
349 |                 }
350 |                 _ =>
351 |                     lexer.switch(LexerRule::Comment),
352 |             }
353 |         },
354 | 
355 |         _ => |lexer| lexer.switch(LexerRule::Comment),
356 |     }
357 | 
358 |     rule Comment {
359 |         '\n' => |lexer| lexer.switch(LexerRule::Init),
360 | 
361 |         _ => |lexer| lexer.continue_(),
362 |     }
363 | }
364 | 
365 | #[allow(dead_code)]
366 | fn ignore_pos<A, E>(ret: Option<Result<(lexgen_util::Loc, A, lexgen_util::Loc), E>>) -> Option<Result<A, E>> {
367 |     ret.map(|res| res.map(|(_, a, _)| a))
368 | }
369 | 
370 | #[test]
371 | fn lex_lua_number() {
372 |     let mut lexer = Lexer::new("3 3.0 3.1416 314.16e-2 0.31416E1 0xff 0x56");
373 | 
374 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Number("3"))));
375 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Number("3.0"))));
376 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Number("3.1416"))));
377 |     assert_eq!(
378 |         ignore_pos(lexer.next()),
379 |         Some(Ok(Token::Number("314.16e-2")))
380 |     );
381 |     assert_eq!(
382 |         ignore_pos(lexer.next()),
383 |         Some(Ok(Token::Number("0.31416E1")))
384 |     );
385 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Number("0xff"))));
386 | 
387 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Number("0x56"))));
388 | }
389 | 
390 | #[test]
391 | fn lex_lua_string() {
392 |     let str = "
393 | \"test\"
394 | \"\\
395 | test'\\\"\"
396 | ";
397 |     let mut lexer = Lexer::new(str);
398 | 
399 |     assert_eq!(
400 |         ignore_pos(lexer.next()),
401 |         Some(Ok(Token::String(StringToken::Interpreted(
402 |             "test".to_owned()
403 |         ))))
404 |     );
405 |     assert_eq!(
406 |         ignore_pos(lexer.next()),
407 |         Some(Ok(Token::String(StringToken::Interpreted(
408 |             "\ntest'\"".to_owned()
409 |         ))))
410 |     );
411 | }
412 | 
413 | #[test]
414 | fn lex_lua_long_string() {
415 |     let mut lexer = Lexer::new("[[ ]] [=[test]=] [=[ ]]");
416 |     assert_eq!(
417 |         ignore_pos(lexer.next()),
418 |         Some(Ok(Token::String(StringToken::Raw(" "))))
419 |     );
420 |     assert_eq!(
421 |         ignore_pos(lexer.next()),
422 |         Some(Ok(Token::String(StringToken::Raw("test")))),
423 |     );
424 |     assert!(matches!(lexer.next(), Some(Err(_))));
425 | }
426 | 
427 | #[test]
428 | fn lex_lua_comment() {
429 |     let mut lexer = Lexer::new(
430 |         "-- test
431 | +
432 | --[[test
433 | test]]+
434 | --[===[
435 | ]=]===]
436 | +
437 | --[===[
438 | ]
439 | ]===]
440 | +
441 | ",
442 |     );
443 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
444 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
445 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
446 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
447 |     assert_eq!(ignore_pos(lexer.next()), None);
448 | }
449 | 
450 | #[test]
451 | fn lex_lua_var() {
452 |     let str = "ab ab1 ab_1_2 Aab";
453 |     let mut lexer = Lexer::new(str);
454 | 
455 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Var("ab"))));
456 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Var("ab1"))));
457 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Var("ab_1_2"))));
458 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Var("Aab"))));
459 | }
460 | 
461 | #[test]
462 | fn lex_lua_simple() {
463 |     let lexer = Lexer::new(
464 |         "+ - * / % ^ # == ~= <= >= < > = ( ) { } [ ] \
465 |          ; : , . .. ... and break do else elseif end \
466 |          false for function if in local nil not or repeat \
467 |          return then true until while n",
468 |     );
469 | 
470 |     let mut tokens: Vec<Token> = vec![];
471 |     for token in lexer {
472 |         tokens.push(token.unwrap().1);
473 |     }
474 | 
475 |     assert_eq!(
476 |         tokens,
477 |         vec![
478 |             Token::Plus,
479 |             Token::Minus,
480 |             Token::Star,
481 |             Token::Slash,
482 |             Token::Percent,
483 |             Token::Caret,
484 |             Token::Hash,
485 |             Token::EqEq,
486 |             Token::TildeEq,
487 |             Token::LtEq,
488 |             Token::GtEq,
489 |             Token::Lt,
490 |             Token::Gt,
491 |             Token::Eq,
492 |             Token::LParen,
493 |             Token::RParen,
494 |             Token::LBrace,
495 |             Token::RBrace,
496 |             Token::LBracket,
497 |             Token::RBracket,
498 |             Token::Semicolon,
499 |             Token::Colon,
500 |             Token::Comma,
501 |             Token::Dot,
502 |             Token::DotDot,
503 |             Token::DotDotDot,
504 |             Token::Keyword(Keyword::And),
505 |             Token::Keyword(Keyword::Break),
506 |             Token::Keyword(Keyword::Do),
507 |             Token::Keyword(Keyword::Else),
508 |             Token::Keyword(Keyword::ElseIf),
509 |             Token::Keyword(Keyword::End),
510 |             Token::Keyword(Keyword::False),
511 |             Token::Keyword(Keyword::For),
512 |             Token::Keyword(Keyword::Function),
513 |             Token::Keyword(Keyword::If),
514 |             Token::Keyword(Keyword::In),
515 |             Token::Keyword(Keyword::Local),
516 |             Token::Keyword(Keyword::Nil),
517 |             Token::Keyword(Keyword::Not),
518 |             Token::Keyword(Keyword::Or),
519 |             Token::Keyword(Keyword::Repeat),
520 |             Token::Keyword(Keyword::Return),
521 |             Token::Keyword(Keyword::Then),
522 |             Token::Keyword(Keyword::True),
523 |             Token::Keyword(Keyword::Until),
524 |             Token::Keyword(Keyword::While),
525 |             Token::Var("n"),
526 |         ]
527 |     );
528 | }
529 | 
530 | #[test]
531 | fn lex_lua_windows_line_ending() {
532 |     let mut lexer = Lexer::new("+\r\n+");
533 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
534 |     assert_eq!(ignore_pos(lexer.next()), Some(Ok(Token::Plus)));
535 |     assert_eq!(ignore_pos(lexer.next()), None);
536 | }
537 | 
538 | #[test]
539 | fn lex_lua_files() {
540 |     let str = std::fs::read_to_string("tests/test_data").unwrap();
541 |     let lexer = Lexer::new(&str);
542 |     let mut i = 0;
543 |     for tok in lexer {
544 |         assert!(tok.is_ok());
545 |         i += 1;
546 |     }
547 |     println!("{} tokens", i);
548 | }
549 | 
--------------------------------------------------------------------------------
/crates/lexgen/tests/right_ctx.rs:
--------------------------------------------------------------------------------
1 | mod test_utils;
2 | 
3 | use lexgen::lexer;
4 | use lexgen_util::{LexerError, LexerErrorKind};
5 | use test_utils::{loc, next};
6 | 
7 | #[test]
8 | fn right_ctx_1() {
9 |     lexer! {
10 |         Lexer -> u32;
11 | 
12 |         'a' > 'a' = 1,
13 |     }
14 | 
15 |     let mut lexer = Lexer::new("aa");
16 |     assert_eq!(next(&mut lexer), Some(Ok(1)));
17 |     assert_eq!(
18 |         next(&mut lexer),
19 |         Some(Err(LexerError {
20 |             location: loc(0, 1, 1),
21 |             kind: LexerErrorKind::InvalidToken,
22 |         }))
23 |     );
24 | 
25 |     let mut lexer = Lexer::new("ab");
26 |     assert_eq!(
27 |         next(&mut lexer),
28 |         Some(Err(LexerError {
29 |             location: loc(0, 0, 0),
30 |             kind: LexerErrorKind::InvalidToken,
31 |         }))
32 |     );
33 | }
34 | 
35 | #[test]
36 | fn right_ctx_2() {
37 |     lexer!
{ 38 | Lexer -> u32; 39 | 40 | 'a' > _ = 1, 41 | } 42 | 43 | let mut lexer = Lexer::new("aa"); 44 | assert_eq!(next(&mut lexer), Some(Ok(1))); 45 | assert_eq!( 46 | next(&mut lexer), 47 | Some(Err(LexerError { 48 | location: loc(0, 1, 1), 49 | kind: LexerErrorKind::InvalidToken, 50 | })) 51 | ); 52 | 53 | let mut lexer = Lexer::new("ab"); 54 | assert_eq!(next(&mut lexer), Some(Ok(1))); 55 | assert_eq!( 56 | next(&mut lexer), 57 | Some(Err(LexerError { 58 | location: loc(0, 1, 1), 59 | kind: LexerErrorKind::InvalidToken, 60 | })) 61 | ); 62 | 63 | let mut lexer = Lexer::new("a"); 64 | assert_eq!( 65 | next(&mut lexer), 66 | Some(Err(LexerError { 67 | location: loc(0, 0, 0), 68 | kind: LexerErrorKind::InvalidToken, 69 | })) 70 | ); 71 | } 72 | 73 | #[test] 74 | fn right_ctx_3() { 75 | lexer! { 76 | Lexer -> u32; 77 | 78 | 'a' > $ = 1, 79 | } 80 | 81 | let mut lexer = Lexer::new("a"); 82 | assert_eq!(next(&mut lexer), Some(Ok(1))); 83 | assert_eq!(next(&mut lexer), None); 84 | 85 | let mut lexer = Lexer::new("ab"); 86 | assert_eq!( 87 | next(&mut lexer), 88 | Some(Err(LexerError { 89 | location: loc(0, 0, 0), 90 | kind: LexerErrorKind::InvalidToken, 91 | })) 92 | ); 93 | } 94 | 95 | #[test] 96 | fn right_ctx_4() { 97 | lexer! { 98 | Lexer -> u32; 99 | 100 | 'a' > 'a' = 1, 101 | 'a' > $ = 2, 102 | } 103 | 104 | let mut lexer = Lexer::new("a"); 105 | assert_eq!(next(&mut lexer), Some(Ok(2))); 106 | assert_eq!(next(&mut lexer), None); 107 | 108 | let mut lexer = Lexer::new("aa"); 109 | assert_eq!(next(&mut lexer), Some(Ok(1))); 110 | assert_eq!(next(&mut lexer), Some(Ok(2))); 111 | assert_eq!(next(&mut lexer), None); 112 | } 113 | 114 | #[test] 115 | fn rust_single_line_comment() { 116 | lexer! { 117 | Lexer -> &'input str; 118 | 119 | rule Init { 120 | $$ascii_whitespace, 121 | 122 | "//" => |lexer| lexer.switch(LexerRule::SinglelineComment), 123 | } 124 | 125 | rule SinglelineComment { 126 | (_ # '\n')* > ('\n' | $) => |lexer| { 127 | let comment = lexer.match_(); 128 | lexer.switch_and_return(LexerRule::Init, comment) 129 | }, 130 | } 131 | } 132 | 133 | // Terminated at the end of input (no newline) 134 | let input = "// / "; 135 | let mut lexer = Lexer::new(input); 136 | assert_eq!(next(&mut lexer), Some(Ok(input))); 137 | assert_eq!(next(&mut lexer), None); 138 | 139 | // Terminated with newlines 140 | let input = "// / \n"; 141 | let mut lexer = Lexer::new(input); 142 | assert_eq!(next(&mut lexer), Some(Ok("// / "))); 143 | assert_eq!(next(&mut lexer), None); 144 | 145 | // Empty comment, terminated with eof 146 | let input = "//"; 147 | let mut lexer = Lexer::new(input); 148 | assert_eq!(next(&mut lexer), Some(Ok("//"))); 149 | assert_eq!(next(&mut lexer), None); 150 | 151 | // Empty comment, terminated with eol 152 | let input = "//\n"; 153 | let mut lexer = Lexer::new(input); 154 | assert_eq!(next(&mut lexer), Some(Ok("//"))); 155 | assert_eq!(next(&mut lexer), None); 156 | } 157 | 158 | #[test] 159 | fn rust_float() { 160 | #[derive(Debug, PartialEq, Eq)] 161 | enum Token<'input> { 162 | Float(&'input str), 163 | Int(&'input str), 164 | Range, 165 | } 166 | 167 | lexer! { 168 | Lexer -> Token<'input>; 169 | 170 | ['0'-'9']+ '.' > (_ # ('.' | '_' | $$XID_Start) | $) => |lexer| { 171 | let match_ = lexer.match_(); 172 | lexer.return_(Token::Float(match_)) 173 | }, 174 | 175 | ['0'-'9']+ => |lexer| { 176 | let match_ = lexer.match_(); 177 | lexer.return_(Token::Int(match_)) 178 | }, 179 | 180 | ".." 
= Token::Range,
181 |     }
182 | 
183 |     let mut lexer = Lexer::new("1.");
184 |     assert_eq!(next(&mut lexer), Some(Ok(Token::Float("1."))));
185 |     assert_eq!(next(&mut lexer), None);
186 | 
187 |     let mut lexer = Lexer::new("1..");
188 |     assert_eq!(next(&mut lexer), Some(Ok(Token::Int("1"))));
189 |     assert_eq!(next(&mut lexer), Some(Ok(Token::Range)));
190 |     assert_eq!(next(&mut lexer), None);
191 | }
192 | 
193 | #[test]
194 | fn ligature_shaping() {
195 |     #[derive(Debug, PartialEq, Eq)]
196 |     enum Token<'input> {
197 |         Lig(&'input str),
198 |         NotLig(&'input str),
199 |     }
200 | 
201 |     lexer! {
202 |         Lexer -> Token<'input>;
203 | 
204 |         "---" > ((_ # '-') | $) => |lexer| {
205 |             let match_ = lexer.match_();
206 |             lexer.return_(Token::Lig(match_))
207 |         },
208 | 
209 |         _+ => |lexer| {
210 |             let match_ = lexer.match_();
211 |             lexer.return_(Token::NotLig(match_))
212 |         },
213 |     }
214 | 
215 |     let mut lexer = Lexer::new("--");
216 |     assert_eq!(next(&mut lexer), Some(Ok(Token::NotLig("--"))));
217 |     assert_eq!(next(&mut lexer), None);
218 | 
219 |     let mut lexer = Lexer::new("---");
220 |     assert_eq!(next(&mut lexer), Some(Ok(Token::Lig("---"))));
221 |     assert_eq!(next(&mut lexer), None);
222 | 
223 |     let mut lexer = Lexer::new("----");
224 |     assert_eq!(next(&mut lexer), Some(Ok(Token::NotLig("----"))));
225 |     assert_eq!(next(&mut lexer), None);
226 | }
227 | 
--------------------------------------------------------------------------------
/crates/lexgen/tests/test_data:
--------------------------------------------------------------------------------
1 | -- This file contains a concatenation of Lua files in the Lua 5.1 source
2 | -- distribution, used to test the Lua lexer (and lexgen) and as a runtime
3 | -- benchmark.
4 | 
5 | -- Copyright (C) 1994-2012 Lua.org, PUC-Rio.
6 | --
7 | -- Permission is hereby granted, free of charge, to any person obtaining a copy
8 | -- of this software and associated documentation files (the "Software"), to deal
9 | -- in the Software without restriction, including without limitation the rights
10 | -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | -- copies of the Software, and to permit persons to whom the Software is
12 | -- furnished to do so, subject to the following conditions:
13 | --
14 | -- The above copyright notice and this permission notice shall be included in
15 | -- all copies or substantial portions of the Software.
16 | --
17 | -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 | -- THE SOFTWARE.
24 | 25 | delta=1e-6 -- tolerance 26 | 27 | function bisect(f,a,b,fa,fb) 28 | local c=(a+b)/2 29 | io.write(n," c=",c," a=",a," b=",b,"\n") 30 | if c==a or c==b or math.abs(a-b) posted to lua-l 181 | -- modified to use ANSI terminal escape sequences 182 | -- modified to use for instead of while 183 | 184 | local write=io.write 185 | 186 | ALIVE="¥" DEAD="þ" 187 | ALIVE="O" DEAD="-" 188 | 189 | function delay() -- NOTE: SYSTEM-DEPENDENT, adjust as necessary 190 | for i=1,10000 do end 191 | -- local i=os.clock()+1 while(os.clock() 0 do 220 | local xm1,x,xp1,xi=self.w-1,self.w,1,self.w 221 | while xi > 0 do 222 | local sum = self[ym1][xm1] + self[ym1][x] + self[ym1][xp1] + 223 | self[y][xm1] + self[y][xp1] + 224 | self[yp1][xm1] + self[yp1][x] + self[yp1][xp1] 225 | next[y][x] = ((sum==2) and self[y][x]) or ((sum==3) and 1) or 0 226 | xm1,x,xp1,xi = x,xp1,xp1+1,xi-1 227 | end 228 | ym1,y,yp1,yi = y,yp1,yp1+1,yi-1 229 | end 230 | end 231 | 232 | -- output the array to screen 233 | function _CELLS:draw() 234 | local out="" -- accumulate to reduce flicker 235 | for y=1,self.h do 236 | for x=1,self.w do 237 | out=out..(((self[y][x]>0) and ALIVE) or DEAD) 238 | end 239 | out=out.."\n" 240 | end 241 | write(out) 242 | end 243 | 244 | -- constructor 245 | function CELLS(w,h) 246 | local c = ARRAY2D(w,h) 247 | c.spawn = _CELLS.spawn 248 | c.evolve = _CELLS.evolve 249 | c.draw = _CELLS.draw 250 | return c 251 | end 252 | 253 | -- 254 | -- shapes suitable for use with spawn() above 255 | -- 256 | HEART = { 1,0,1,1,0,1,1,1,1; w=3,h=3 } 257 | GLIDER = { 0,0,1,1,0,1,0,1,1; w=3,h=3 } 258 | EXPLODE = { 0,1,0,1,1,1,1,0,1,0,1,0; w=3,h=4 } 259 | FISH = { 0,1,1,1,1,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0; w=5,h=4 } 260 | BUTTERFLY = { 1,0,0,0,1,0,1,1,1,0,1,0,0,0,1,1,0,1,0,1,1,0,0,0,1; w=5,h=5 } 261 | 262 | -- the main routine 263 | function LIFE(w,h) 264 | -- create two arrays 265 | local thisgen = CELLS(w,h) 266 | local nextgen = CELLS(w,h) 267 | 268 | -- create some life 269 | -- about 1000 generations of fun, then a glider steady-state 270 | thisgen:spawn(GLIDER,5,4) 271 | thisgen:spawn(EXPLODE,25,10) 272 | thisgen:spawn(FISH,4,12) 273 | 274 | -- run until break 275 | local gen=1 276 | write("\027[2J") -- ANSI clear screen 277 | while 1 do 278 | thisgen:evolve(nextgen) 279 | thisgen,nextgen = nextgen,thisgen 280 | write("\027[H") -- ANSI home cursor 281 | thisgen:draw() 282 | write("Life - generation ",gen,"\n") 283 | gen=gen+1 284 | if gen>2000 then break end 285 | --delay() -- no delay 286 | end 287 | end 288 | 289 | LIFE(40,20) 290 | -- bare-bones luac in Lua 291 | -- usage: lua luac.lua file.lua 292 | 293 | assert(arg[1]~=nil and arg[2]==nil,"usage: lua luac.lua file.lua") 294 | f=assert(io.open("luac.out","wb")) 295 | assert(f:write(string.dump(assert(loadfile(arg[1]))))) 296 | assert(f:close()) 297 | -- an implementation of printf 298 | 299 | function printf(...) 
300 | io.write(string.format(...)) 301 | end 302 | 303 | printf("Hello %s from %s on %s\n",os.getenv"USER" or "there",_VERSION,os.date()) 304 | -- make global variables readonly 305 | 306 | local f=function (t,i) error("cannot redefine global variable `"..i.."'",2) end 307 | local g={} 308 | local G=getfenv() 309 | setmetatable(g,{__index=G,__newindex=f}) 310 | setfenv(1,g) 311 | 312 | -- an example 313 | rawset(g,"x",3) 314 | x=2 315 | y=1 -- cannot redefine `y' 316 | -- the sieve of of Eratosthenes programmed with coroutines 317 | -- typical usage: lua -e N=1000 sieve.lua | column 318 | 319 | -- generate all the numbers from 2 to n 320 | function gen (n) 321 | return coroutine.wrap(function () 322 | for i=2,n do coroutine.yield(i) end 323 | end) 324 | end 325 | 326 | -- filter the numbers generated by `g', removing multiples of `p' 327 | function filter (p, g) 328 | return coroutine.wrap(function () 329 | while 1 do 330 | local n = g() 331 | if n == nil then return end 332 | if math.mod(n, p) ~= 0 then coroutine.yield(n) end 333 | end 334 | end) 335 | end 336 | 337 | N=N or 1000 -- from command line 338 | x = gen(N) -- generate primes up to N 339 | while 1 do 340 | local n = x() -- pick a number until done 341 | if n == nil then break end 342 | print(n) -- must be a prime number 343 | x = filter(n, x) -- now remove its multiples 344 | end 345 | -- two implementations of a sort function 346 | -- this is an example only. Lua has now a built-in function "sort" 347 | 348 | -- extracted from Programming Pearls, page 110 349 | function qsort(x,l,u,f) 350 | if ly end) 402 | show("after reverse selection sort",x) 403 | qsort(x,1,n,function (x,y) return x>> ",string.rep(" ",level)) 431 | if t~=nil and t.currentline>=0 then io.write(t.short_src,":",t.currentline," ") end 432 | t=debug.getinfo(2) 433 | if event=="call" then 434 | level=level+1 435 | else 436 | level=level-1 if level<0 then level=0 end 437 | end 438 | if t.what=="main" then 439 | if event=="call" then 440 | io.write("begin ",t.short_src) 441 | else 442 | io.write("end ",t.short_src) 443 | end 444 | elseif t.what=="Lua" then 445 | -- table.foreach(t,print) 446 | io.write(event," ",t.name or "(Lua)"," <",t.linedefined,":",t.short_src,">") 447 | else 448 | io.write(event," ",t.name or "(C)"," [",t.what,"] ") 449 | end 450 | io.write("\n") 451 | end 452 | 453 | debug.sethook(hook,"cr") 454 | level=0 455 | -- trace assigments to global variables 456 | 457 | do 458 | -- a tostring that quotes strings. note the use of the original tostring. 
local _tostring=tostring
460 | local tostring=function(a)
461 | if type(a)=="string" then
462 | return string.format("%q",a)
463 | else
464 | return _tostring(a)
465 | end
466 | end
467 | 
468 | local log=function (name,old,new)
469 | local t=debug.getinfo(3,"Sl")
470 | local line=t.currentline
471 | io.write(t.short_src)
472 | if line>=0 then io.write(":",line) end
473 | io.write(": ",name," is now ",tostring(new)," (was ",tostring(old),")","\n")
474 | end
475 | 
476 | local g={}
477 | local set=function (t,name,value)
478 | log(name,g[name],value)
479 | g[name]=value
480 | end
481 | setmetatable(getfenv(),{__index=g,__newindex=set})
482 | end
483 | 
484 | -- an example
485 | 
486 | a=1
487 | b=2
488 | a=10
489 | b=20
490 | b=nil
491 | b=200
492 | print(a,b,c)
493 | -- hex dump
494 | -- usage: lua xd.lua < file
495 | 
496 | local offset=0
497 | while true do
498 | local s=io.read(16)
499 | if s==nil then return end
500 | io.write(string.format("%08X ",offset))
501 | string.gsub(s,"(.)",
502 | function (c) io.write(string.format("%02X ",string.byte(c))) end)
503 | io.write(string.rep(" ",3*(16-string.len(s))))
504 | io.write(" ",string.gsub(s,"%c","."),"\n")
505 | offset=offset+16
506 | end
507 | 
--------------------------------------------------------------------------------
/crates/lexgen/tests/test_utils.rs:
--------------------------------------------------------------------------------
1 | use lexgen_util::Loc;
2 | 
3 | pub fn ignore_pos<A, E>(ret: Option<Result<(Loc, A, Loc), E>>) -> Option<Result<A, E>> {
4 |     ret.map(|res| res.map(|(_, a, _)| a))
5 | }
6 | 
7 | pub fn next<A, E>(iter: &mut dyn Iterator<Item = Result<(Loc, A, Loc), E>>) -> Option<Result<A, E>> {
8 |     ignore_pos(iter.next())
9 | }
10 | 
11 | pub fn loc(line: u32, col: u32, byte_idx: usize) -> Loc {
12 |     Loc {
13 |         line,
14 |         col,
15 |         byte_idx,
16 |     }
17 | }
18 | 
--------------------------------------------------------------------------------
/crates/lexgen_lalrpop_example/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "lexgen_lalrpop_example"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | [dependencies]
7 | lalrpop-util = "0.19.9"
8 | lexgen = { path = "../lexgen" }
9 | lexgen_util = { path = "../lexgen_util" }
10 | 
11 | [build-dependencies]
12 | lalrpop = "0.19.9"
13 | 
--------------------------------------------------------------------------------
/crates/lexgen_lalrpop_example/build.rs:
--------------------------------------------------------------------------------
1 | fn main() {
2 |     lalrpop::process_root().unwrap();
3 | }
4 | 
--------------------------------------------------------------------------------
/crates/lexgen_lalrpop_example/src/interpolation.lalrpop:
--------------------------------------------------------------------------------
1 | use super::{
2 |     ast::{Expression, StringFragment},
3 |     lexer::{LexerError, Loc, Token},
4 | };
5 | 
6 | grammar<'input>;
7 | 
8 | pub Expression: Expression<'input> = {
9 |     Term,
10 |     <lhs:Expression> "+" <rhs:Term> => Expression::Concat(Box::new(lhs), Box::new(rhs)),
11 | }
12 | 
13 | Term: Expression<'input> = {
14 |     "(" <Expression> ")",
15 |     StringStart <StringInner*> StringEnd => Expression::String(<>),
16 | }
17 | 
18 | StringInner: StringFragment<'input> = {
19 |     StringFragment => StringFragment::String(<>),
20 |     InterpolationStart <Expression> InterpolationEnd => StringFragment::Expression(<>),
21 | }
22 | 
23 | 
24 | extern {
25 |     type Location = Loc;
26 |     type Error = LexerError;
27 | 
28 |     enum Token<'input> {
29 |         "+" => Token::Plus,
30 |         "(" => Token::LParen,
31 |         ")" => Token::RParen,
32 |         StringStart => Token::StringStart,
33 |         StringFragment => Token::StringFragment(<&'input str>),
34 |         StringEnd => Token::StringEnd,
35 |         InterpolationStart => Token::InterpolationStart,
36 |         InterpolationEnd => Token::InterpolationEnd,
37 |     }
38 | }
39 | 
40 | 
--------------------------------------------------------------------------------
/crates/lexgen_lalrpop_example/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! This example shows how to use the generated lexer with [lalrpop](https://docs.rs/lalrpop/latest/lalrpop/)
2 | //! by implementing an evaluator of an example language.
3 | //!
4 | //! The language has three types of expressions. The first type is a string expression, which
5 | //! starts and ends with `"`. The other types are string concatenation, denoted by `+`, and
6 | //! parenthesized expressions. Inside a string expression you can write characters as usual, and
7 | //! can also interpolate another expression by surrounding the expression with `\(` and `)`.
8 | 
9 | use lalrpop_util::lalrpop_mod;
10 | 
11 | pub mod ast {
12 |     #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
13 |     pub enum StringFragment<'input> {
14 |         /// Represents a sequence of normal characters, or a string consisting of a single
15 |         /// escaped character, in a string literal.
16 |         String(&'input str),
17 |         /// Represents an interpolated expression in a string literal.
18 |         Expression(Expression<'input>),
19 |     }
20 | 
21 |     #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
22 |     pub enum Expression<'input> {
23 |         /// Represents a string literal.
24 |         String(Vec<StringFragment<'input>>),
25 |         /// Represents `lhs + rhs`. It's possible to desugar this into
26 |         /// `"\(lhs)\(rhs)"` instead of having this variant.
27 |         Concat(Box<Expression<'input>>, Box<Expression<'input>>),
28 |     }
29 | 
30 |     impl StringFragment<'_> {
31 |         fn eval_to(&self, w: &mut impl std::fmt::Write) -> std::fmt::Result {
32 |             match self {
33 |                 StringFragment::String(s) => w.write_str(s),
34 |                 StringFragment::Expression(e) => e.eval_to(w),
35 |             }
36 |         }
37 |     }
38 | 
39 |     impl Expression<'_> {
40 |         fn eval_to(&self, w: &mut impl std::fmt::Write) -> std::fmt::Result {
41 |             match self {
42 |                 Expression::String(v) => v.iter().try_for_each(|f| f.eval_to(w)),
43 |                 Expression::Concat(l, r) => [l, r].iter().try_for_each(|e| e.eval_to(w)),
44 |             }
45 |         }
46 | 
47 |         pub fn eval(&self) -> String {
48 |             let mut ret = String::new();
49 |             self.eval_to(&mut ret)
50 |                 .expect("Format into String shouldn't fail");
51 |             ret
52 |         }
53 |     }
54 | }
55 | 
56 | #[allow(clippy::manual_range_contains)]
57 | pub mod lexer {
58 |     use lexgen::lexer;
59 |     pub type LexerError = lexgen_util::LexerError<String>;
60 |     pub type Loc = lexgen_util::Loc;
61 | 
62 |     #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
63 |     pub enum Token<'input> {
64 |         /// Represents `+` (outside of string literals).
65 |         Plus,
66 |         /// Represents `(` of a parenthesized expression (outside of string literals).
67 |         LParen,
68 |         /// Represents `)` of a parenthesized expression (outside of string literals).
69 |         RParen,
70 |         /// Represents `"` at the beginning of a string literal.
71 |         StringStart,
72 |         /// Represents a non-interpolated part of a string literal:
73 |         /// either a sequence of characters exactly the same as a part of the input,
74 |         /// or an un-escaped character from the input.
75 |         StringFragment(&'input str),
76 |         /// Represents `"` at the end of a string literal.
77 |         StringEnd,
78 |         /// Represents `\(` that starts interpolation in a string literal.
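        /// For example, lexing `"a\("b")"` produces this token for the `\(` part.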
79 |         InterpolationStart,
80 |         /// Represents `)` that ends interpolation in a string literal.
81 |         InterpolationEnd,
82 |     }
83 | 
84 |     pub struct LexerState {
85 |         /// For each interpolation, we want to lex the `)` that ends the interpolation differently.
86 |         /// To do so, we keep track of the balance of parentheses in the expression, and treat
87 |         /// the first `)` that over-closes the expression as the interpolation end marker.
88 |         /// Since we can nest string interpolations like `"\("\("a")")"`, we use a stack to keep
89 |         /// this balance.
90 |         paren_nest: Vec<usize>,
91 |     }
92 | 
93 |     impl Default for LexerState {
94 |         fn default() -> Self {
95 |             Self {
96 |                 paren_nest: vec![0],
97 |             }
98 |         }
99 |     }
100 | 
101 |     lexer! {
102 |         pub Lexer(LexerState) -> Token<'input>;
103 |         type Error = String;
104 | 
105 |         let ws = [' ' '\t' '\n'] | "\r\n";
106 | 
107 |         rule Init {
108 |             $ws,
109 |             '+' = Token::Plus,
110 |             '"' => |lexer| lexer.switch_and_return(LexerRule::InString, Token::StringStart),
111 |             '(' =? |lexer| {
112 |                 match lexer.state().paren_nest.last_mut() {
113 |                     Some(x) => {
114 |                         *x += 1;
115 |                         lexer.return_(Ok(Token::LParen))
116 |                     },
117 |                     None => {
118 |                         lexer.return_(Err("Invalid state, maybe already failed?".to_string()))
119 |                     }
120 |                 }
121 |             },
122 |             ')' =? |lexer| {
123 |                 match lexer.state().paren_nest.last_mut() {
124 |                     Some(0) => {
125 |                         lexer.state().paren_nest.pop();
126 |                         if lexer.state().paren_nest.is_empty() {
127 |                             lexer.return_(Err("Too many close parens".to_string()))
128 |                         } else {
129 |                             lexer.switch_and_return(LexerRule::InString, Ok(Token::InterpolationEnd))
130 |                         }
131 |                     },
132 |                     Some(x) => {
133 |                         *x -= 1;
134 |                         lexer.return_(Ok(Token::RParen))
135 |                     },
136 |                     None => {
137 |                         lexer.return_(Err("Invalid state, maybe already failed?".to_string()))
138 |                     }
139 |                 }
140 |             },
141 |         }
142 | 
143 |         rule InString {
144 |             "\\\"" = Token::StringFragment("\""),
145 |             "\\n" = Token::StringFragment("\n"),
146 |             "\\r" = Token::StringFragment("\r"),
147 |             "\\t" = Token::StringFragment("\t"),
148 |             "\\\\" = Token::StringFragment("\\"),
149 |             '"' => |lexer| lexer.switch_and_return(LexerRule::Init, Token::StringEnd),
150 |             "\\(" => |lexer| {
151 |                 lexer.state().paren_nest.push(0);
152 |                 lexer.switch_and_return(LexerRule::Init, Token::InterpolationStart)
153 |             },
154 |             (_ # ['\\' '"'])+ => |lexer| lexer.return_(Token::StringFragment(lexer.match_())),
155 |         }
156 |     }
157 | }
158 | 
159 | lalrpop_mod!(#[allow(unused_imports, clippy::all)] pub parser, "/interpolation.rs");
160 | 
161 | #[cfg(test)]
162 | mod test {
163 |     use super::{
164 |         ast::Expression,
165 |         lexer::{Lexer, LexerError, Loc, Token},
166 |         parser::ExpressionParser,
167 |     };
168 | 
169 |     type Result<'input, T> =
170 |         std::result::Result<T, lalrpop_util::ParseError<Loc, Token<'input>, LexerError>>;
171 | 
172 |     fn parse(code: &str) -> Result<Expression> {
173 |         let lexer = Lexer::new(code);
174 |         ExpressionParser::new().parse(lexer)
175 |     }
176 | 
177 |     fn parse_and_eval(code: &str) -> Result<String> {
178 |         parse(code).map(|e| e.eval())
179 |     }
180 | 
181 |     #[test]
182 |     fn test_basic() -> Result<'static, ()> {
183 |         assert_eq!(parse_and_eval(r#""a" + "b" + "c""#)?, "abc");
184 |         assert_eq!(parse_and_eval(r#""\n\t\\(" + "b" + "c""#)?, "\n\t\\(bc");
185 |         Ok(())
186 |     }
187 | 
188 |     #[test]
189 |     fn test_invalid() {
190 |         assert!(parse(r#""a" +"#).is_err());
191 |         assert!(parse(r#""a" + ""#).is_err());
192 |         assert!(parse(r#"("a" + "b" "#).is_err());
193 |         assert!(parse(r#""a\(""#).is_err());
194 |         assert!(parse(r#""a\(")""#).is_err());
195 |         assert!(parse(r#""a\())""#).is_err());
196 |         assert!(parse(r#"("a\())""#).is_err());
197 |         assert!(parse(r#")"#).is_err());
198 |     }
199 |
200 |     #[test]
201 |     fn test_associativity() -> Result<'static, ()> {
202 |         assert_eq!(
203 |             parse(r#""a" + "b" + "c" + "d""#)?,
204 |             parse(r#"(("a" + "b") + "c") + "d""#)?
205 |         );
206 |         Ok(())
207 |     }
208 |
209 |     #[test]
210 |     fn test_interpolation() -> Result<'static, ()> {
211 |         assert_eq!(parse_and_eval(r#""ab\("c" + "d")""#)?, "abcd");
212 |         assert_eq!(parse_and_eval(r#""ab\(("c") + ("d"))""#)?, "abcd");
213 |         assert_eq!(parse_and_eval(r#""ab\(("c\("d")"))""#)?, "abcd");
214 |         Ok(())
215 |     }
216 | }
217 |
--------------------------------------------------------------------------------
/crates/lexgen_util/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "lexgen_util"
3 | version = "0.16.0"
4 | authors = ["Ömer Sinan Ağacan <omeragacan@gmail.com>"]
5 | description = "Runtime library for lexers generated by lexgen"
6 | edition = "2021"
7 | license = "MIT"
8 | homepage = "https://github.com/osa1/lexgen"
9 | repository = "https://github.com/osa1/lexgen"
10 |
11 | [dependencies]
12 | unicode-width = "0.2.0"
--------------------------------------------------------------------------------
/crates/lexgen_util/README.md:
--------------------------------------------------------------------------------
1 | # lexgen_util: Runtime library for lexers generated by lexgen
2 |
3 | This library is used by lexgen-generated lexers. See the [lexgen crate] or
4 | the [lexgen homepage].
5 |
6 | [lexgen crate]: https://crates.io/crates/lexgen
7 | [lexgen homepage]: https://github.com/osa1/lexgen
--------------------------------------------------------------------------------
/crates/lexgen_util/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![allow(clippy::should_implement_trait, clippy::type_complexity)]
2 |
3 | use std::iter::Peekable;
4 | use std::str::Chars;
5 |
6 | use unicode_width::UnicodeWidthChar;
7 |
8 | #[derive(Debug, Clone, PartialEq, Eq)]
9 | pub struct LexerError<E> {
10 |     pub location: Loc,
11 |     pub kind: LexerErrorKind<E>,
12 | }
13 |
14 | #[derive(Debug, Clone, PartialEq, Eq)]
15 | pub enum LexerErrorKind<E> {
16 |     /// A lexer error, returned by lexgen.
17 |     InvalidToken,
18 |
19 |     /// A custom error, returned by a semantic action.
20 |     Custom(E),
21 | }
22 |
23 | /// A location in an input.
24 | #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
25 | pub struct Loc {
26 |     /// Zero-based line number in the input.
27 |     pub line: u32,
28 |
29 |     /// Zero-based column of this location in its line (in terms of display width).
30 |     pub col: u32,
31 |
32 |     /// Zero-based UTF-8 byte index of this location in the input.
33 |     pub byte_idx: usize,
34 | }
35 |
36 | impl Loc {
37 |     const ZERO: Loc = Loc {
38 |         line: 0,
39 |         col: 0,
40 |         byte_idx: 0,
41 |     };
42 | }
43 |
44 | /// **Do not use**
45 | // Possible outcomes of a semantic action
46 | pub enum SemanticActionResult<T> {
47 |     // Semantic action did not return a token, continue with lexing
48 |     Continue,
49 |     // Semantic action returned a token, return it
50 |     Return(T),
51 | }
52 |
53 | impl<T> SemanticActionResult<T> {
54 |     pub fn map_token<T1, F>(self, f: F) -> SemanticActionResult<T1>
55 |     where
56 |         F: Fn(T) -> T1,
57 |     {
58 |         match self {
59 |             SemanticActionResult::Continue => SemanticActionResult::Continue,
60 |             SemanticActionResult::Return(t) => SemanticActionResult::Return(f(t)),
61 |         }
62 |     }
63 | }
64 |
65 | /// Common parts in lexers generated by lexgen.
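///
/// `Iter` is the input character iterator, `Token` the token type, `State` the user state, and
/// `Error` the user error type. `Wrapper` is the lexer struct emitted by the `lexer!` macro,
/// which embeds this type; semantic action functions take the wrapper (see
/// `set_accepting_state` below).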
66 | ///
67 | /// **Fields are used by lexgen-generated code and should not be used directly.**
68 | #[derive(Debug, Clone)]
69 | pub struct Lexer<'input, Iter: Iterator<Item = char> + Clone, Token, State, Error, Wrapper> {
70 |     // Current lexer state
71 |     pub __state: usize,
72 |
73 |     // Set after end-of-input is handled by a rule, or by default in `Init` rule
74 |     pub __done: bool,
75 |
76 |     // Which lexer state to switch to on successful match
77 |     pub __initial_state: usize,
78 |
79 |     user_state: State,
80 |
81 |     // User-provided input string. Does not change after initialization.
82 |     input: &'input str,
83 |
84 |     // Start location of `__iter`. We update this as we backtrack and update `__iter`.
85 |     iter_loc: Loc,
86 |
87 |     // Character iterator. `Peekable` is used in the handler's `peek` method. Note that we can't
88 |     // use a byte index from this iterator directly, as we re-initialize this field when backtracking.
89 |     // Add `iter_loc.byte_idx` to the byte index before using. When resetting, update `iter_loc`.
90 |     pub __iter: Peekable<Iter>,
91 |
92 |     // Start of the current match
93 |     current_match_start: Loc,
94 |
95 |     // End of the current match
96 |     current_match_end: Loc,
97 |
98 |     // If we skipped an accepting state, this holds:
99 |     //
100 |     // - Start and end (exclusive) locations of the skipped match
101 |     // - The character iterator, cloned at the end of the skipped match
102 |     // - Semantic action of the skipped match (a function name)
103 |     last_match: Option<(
104 |         Loc,
105 |         Peekable<Iter>,
106 |         for<'lexer> fn(&'lexer mut Wrapper) -> SemanticActionResult<Result<Token, Error>>,
107 |         Loc,
108 |     )>,
109 | }
110 |
111 | impl<I: Iterator<Item = char> + Clone, T, S: Default, E, W> Lexer<'static, I, T, S, E, W> {
112 |     pub fn new_from_iter(iter: I) -> Self {
113 |         Self::new_from_iter_with_state(iter, Default::default())
114 |     }
115 | }
116 |
117 | impl<I: Iterator<Item = char> + Clone, T, S, E, W> Lexer<'static, I, T, S, E, W> {
118 |     pub fn new_from_iter_with_state(iter: I, state: S) -> Self {
119 |         Self {
120 |             __state: 0,
121 |             __done: false,
122 |             __initial_state: 0,
123 |             user_state: state,
124 |             input: "",
125 |             iter_loc: Loc::ZERO,
126 |             __iter: iter.peekable(),
127 |             current_match_start: Loc::ZERO,
128 |             current_match_end: Loc::ZERO,
129 |             last_match: None,
130 |         }
131 |     }
132 | }
133 |
134 | impl<'input, T, S: Default, E, W> Lexer<'input, Chars<'input>, T, S, E, W> {
135 |     pub fn new(input: &'input str) -> Self {
136 |         Self::new_with_state(input, Default::default())
137 |     }
138 | }
139 |
140 | impl<'input, T, S, E, W> Lexer<'input, Chars<'input>, T, S, E, W> {
141 |     pub fn new_with_state(input: &'input str, state: S) -> Self {
142 |         Self {
143 |             __state: 0,
144 |             __done: false,
145 |             __initial_state: 0,
146 |             user_state: state,
147 |             input,
148 |             iter_loc: Loc::ZERO,
149 |             __iter: input.chars().peekable(),
150 |             current_match_start: Loc::ZERO,
151 |             current_match_end: Loc::ZERO,
152 |             last_match: None,
153 |         }
154 |     }
155 | }
156 |
157 | impl<'input, I: Iterator<Item = char> + Clone, T, S, E, W> Lexer<'input, I, T, S, E, W> {
158 |     // Read the next character
159 |     pub fn next(&mut self) -> Option<char> {
160 |         match self.__iter.next() {
161 |             None => None,
162 |             Some(char) => {
163 |                 self.current_match_end.byte_idx += char.len_utf8();
164 |                 if char == '\n' {
165 |                     self.current_match_end.line += 1;
166 |                     self.current_match_end.col = 0;
167 |                 } else if char == '\t' {
168 |                     self.current_match_end.col += 4; // TODO: Make this configurable?
169 |                 } else {
170 |                     self.current_match_end.col += UnicodeWidthChar::width(char).unwrap_or(1) as u32; // `None` width counts as 1
171 |                 }
172 |                 Some(char)
173 |             }
174 |         }
175 |     }
176 |
177 |     pub fn peek(&mut self) -> Option<char> {
178 |         self.__iter.peek().copied()
179 |     }
180 |
181 |     // On success, returns the semantic action function for the last match
182 |     pub fn backtrack(
183 |         &mut self,
184 |     ) -> Result<for<'lexer> fn(&'lexer mut W) -> SemanticActionResult<Result<T, E>>, LexerError<E>>
185 |     {
186 |         match self.last_match.take() {
187 |             None => {
188 |                 self.__state = 0;
189 |                 Err(LexerError {
190 |                     location: self.current_match_start,
191 |                     kind: LexerErrorKind::InvalidToken,
192 |                 })
193 |             }
194 |             Some((match_start, iter, semantic_action, match_end)) => {
195 |                 self.__done = false;
196 |                 self.current_match_start = match_start;
197 |                 self.current_match_end = match_end;
198 |                 self.__iter = iter;
199 |                 self.iter_loc = match_end;
200 |                 Ok(semantic_action)
201 |             }
202 |         }
203 |     }
204 |
205 |     pub fn reset_accepting_state(&mut self) {
206 |         self.last_match = None;
207 |     }
208 |
209 |     pub fn set_accepting_state(
210 |         &mut self,
211 |         semantic_action_fn: for<'lexer> fn(&'lexer mut W) -> SemanticActionResult<Result<T, E>>,
212 |     ) {
213 |         self.last_match = Some((
214 |             self.current_match_start,
215 |             self.__iter.clone(),
216 |             semantic_action_fn,
217 |             self.current_match_end,
218 |         ));
219 |     }
220 |
221 |     pub fn reset_match(&mut self) {
222 |         self.current_match_start = self.current_match_end;
223 |     }
224 |
225 |     pub fn match_(&self) -> &'input str {
226 |         &self.input[self.current_match_start.byte_idx..self.current_match_end.byte_idx]
227 |     }
228 |
229 |     pub fn match_loc(&self) -> (Loc, Loc) {
230 |         (self.current_match_start, self.current_match_end)
231 |     }
232 |
233 |     pub fn state(&mut self) -> &mut S {
234 |         &mut self.user_state
235 |     }
236 | }
237 |
--------------------------------------------------------------------------------
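A minimal usage sketch for the interpolation lexer above, assuming lexgen's generated iterator interface (items of type `Result<(Loc, Token, Loc), LexerError>`); the driver function itself is hypothetical and not part of the repository:

```rust
use lexgen_lalrpop_example::lexer::Lexer;

fn main() {
    // A generated lexer iterates over `Result<(Loc, Token, Loc), LexerError>`,
    // where the two `Loc`s are the start and (exclusive) end of each match.
    for result in Lexer::new(r#""ab\("c" + "d")""#) {
        let (start, token, end) = result.expect("example input should lex");
        println!("{}:{}-{}:{} {:?}", start.line, start.col, end.line, end.col, token);
    }
}
```

For this input the loop should print the `StringStart` .. `StringEnd` token sequence described in the `Token` docs, and the same iterator can be fed directly to `ExpressionParser::new().parse(...)`, as the tests above do.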