├── .github
│   └── workflows
│       └── build-test.yml
├── .gitignore
├── .rustfmt.toml
├── CHANGELOG.md
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
├── benches
│   ├── c_lexer.rs
│   ├── long.c
│   ├── sort.c
│   └── spaces.c
├── examples
│   ├── calc
│   │   ├── README.md
│   │   └── main.rs
│   ├── clike
│   │   ├── README.md
│   │   ├── fib.c
│   │   ├── main.rs
│   │   └── sort.c
│   ├── json
│   │   ├── README.md
│   │   ├── example.json
│   │   └── main.rs
│   └── sexp
│       ├── README.md
│       ├── example.sexp
│       └── main.rs
├── laps_macros
│   ├── Cargo.toml
│   ├── README.md
│   └── src
│       ├── lib.rs
│       ├── parse.rs
│       ├── spanned.rs
│       ├── token_ast.rs
│       ├── token_kind.rs
│       ├── tokenize.rs
│       └── utils.rs
├── laps_regex
│   ├── Cargo.toml
│   ├── README.md
│   └── src
│       ├── dfa.rs
│       ├── fa.rs
│       ├── lib.rs
│       ├── mir.rs
│       ├── nfa.rs
│       ├── re.rs
│       └── table.rs
└── src
    ├── ast.rs
    ├── input.rs
    ├── lexer.rs
    ├── lib.rs
    ├── parse.rs
    ├── reader.rs
    ├── span.rs
    └── token.rs

--------------------------------------------------------------------------------
/.github/workflows/build-test.yml:
--------------------------------------------------------------------------------
name: Build and Test

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build_test:
    name: Build and Test
    runs-on: ubuntu-latest
    if: "!contains(github.event.head_commit.message, 'skip-ci')"
    timeout-minutes: 30
    steps:
    - name: Checkout laps
      uses: actions/checkout@v2

    - name: Build
      run: cargo check --workspace &&
           cargo check --workspace --all-features

    - name: Clippy
      run: cargo clippy --workspace -- -D warnings &&
           cargo clippy --workspace --all-targets --all-features -- -D warnings

    - name: Test
      run: cargo test --workspace &&
           cargo test --workspace --all-features

  build_examples:
    name: Build Examples
    runs-on: ubuntu-latest
    if: "!contains(github.event.head_commit.message, 'skip-ci')"
    timeout-minutes: 30
    strategy:
      matrix:
        example-name: ['sexp', 'calc', 'json', 'clike']

    steps:
    - name: Checkout laps
      uses: actions/checkout@v2

    - name: Build
      run: cargo check --example ${{matrix.example-name}} --features macros

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# macOS
*.DS_Store

# Cargo
/target
/Cargo.lock

# VS Code
.vscode

# Debugging
debug

--------------------------------------------------------------------------------
/.rustfmt.toml:
--------------------------------------------------------------------------------
tab_spaces = 2

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
# Changelog

All notable changes to `laps` will be documented in this file.

## Unreleased

### Added

* AST types `NonEmptyOptSepList` and `OptSepList`.

## 0.1.7 - 2023-12-30

### Added

* AST type `TokenPrefix`.

### Fixed

* Issue about method `maybe` of ASTs generated by macro `token_ast`.

## 0.1.6 - 2023-12-24

### Added

* Implement `Parse` and `Spanned` traits for tuples.
* AST types `OptPrefix`, `OptTokenPrefix`, `OptSepSeq` and `NonEmptyOptSepSeq`.
* Trait `TrySpan`.
* Attribute `try_span` for derive macro `Spanned`.

### Changed

* Improve performance of minimizing DFA.
* Mark AST `Quoted` as deprecated.
* Derived `PartialOrd` and `Ord` traits for AST types (except `Quoted`).

### Fixed

* Issue about parsing if-guard in `token_ast`.

## 0.1.5 - 2023-12-17

### Added

* Method `file_type` for `Span`.

### Changed

* Improve performance of compiling regular expressions again.

## 0.1.4 - 2023-12-13

### Added

* Method `inner` and `inner_mut` for `TokenBuffer`.

### Fixed

* Return type of method `Lexer::input_mut`.

## 0.1.3 - 2023-12-10

### Added

* `laps::lexer::signed_int_literal` for parsing integer literals with an optional sign.
* If-guard support in `kind` pattern of `token_ast` macro.
* Method `new` for `Reader` and `ByteReader`.
* Method `set_line_col` for trait `InputStream`, for supporting the C preprocessor.
* Method `input` and `input_mut` for `Lexer`.

### Changed

* Some documentation comments.
* Improve performance of compiling regular expressions.
* Bumped dependency `colored` to version 2.1.0.

### Fixed

* Issues about printing line information in `Span`.

## 0.1.2 - 2023-07-13

### Changed

* Made `Span` fully thread-safe (embarrassed).

## 0.1.1 - 2023-07-13

### Changed

* Made `Span` thread-safe.
* Enabled LTO for release mode.
* Supported transition table compression.

## 0.1.0 - 2023-06-17

### Added

* Sub-crate `laps_regex` for generating state-transition tables for multiple regular expressions.
* Trait and derive macro `Tokenize`, allowing users to get a lexer by deriving `Tokenize` for a token kind.

### Changed

* New and more intuitive syntax for macro `token_ast`.
* `Span` and `InputStream` now support generic character types.
* Removed trait `TokenBuilder` and struct `Ident`.
* Removed some lexing methods in trait `InputStream`.

## 0.0.2 - 2023-01-13

### Added

* `derive` syntax for macro `token_ast`.
* More examples, including `sexp`, `calc` and `json`.
* More documentation comments.
* Module `prelude` for some common traits and macros.
* `token_kind` now implements `Clone`, `TryFrom<Kind>` and `TryFrom<&Kind>` for token kinds.
* `token_ast` now implements `unwrap` and `unwrap_ref` methods for token ASTs.

### Changed

* Updated version of some dependencies.
* Feature `no-logger` to default feature `logger`.
* License to either Apache 2.0 or MIT.

### Fixed

* Fault about byte buffer offset (`byte_buf_offset`) in `Reader`.
* Fault about namespace of some Rust preludes in procedural macros.
* Fault about error message in method `TokenStream::expect`.
* Fault about string width calculation in `Span`'s error logging related methods.

## 0.0.1 - 2022-10-25

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[workspace]
members = [
  "laps_macros",
  "laps_regex",
]

[package]
name = "laps"
version = "0.1.7"
authors = ["MaxXing "]
edition = "2021"
description = "Build lexers and parsers by deriving traits."
13 | repository = "https://github.com/MaxXSoft/laps" 14 | documentation = "https://docs.rs/laps" 15 | categories = ["parsing", "text-processing"] 16 | keywords = ["parser", "lexer", "ast"] 17 | readme = "README.md" 18 | license = "MIT OR Apache-2.0" 19 | 20 | [package.metadata.docs.rs] 21 | all-features = true 22 | rustdoc-args = ["--cfg", "docsrs"] 23 | 24 | [features] 25 | default = ["logger"] 26 | # Enable the front-end logger instead of returning error messages as strings. 27 | logger = ["dep:colored"] 28 | # Enable additional macros, such as derive macros, etc. 29 | macros = ["dep:laps_macros"] 30 | 31 | [dependencies] 32 | colored = { version = "2.1.0", optional = true } 33 | laps_macros = { path = "./laps_macros", version = "0.1.5", optional = true } 34 | unicode-width = "0.1.10" 35 | 36 | [dev-dependencies] 37 | criterion = "0.5.1" 38 | 39 | [profile] 40 | release = { lto = true } 41 | bench = { lto = true } 42 | 43 | [[example]] 44 | name = "sexp" 45 | required-features = ["macros"] 46 | 47 | [[example]] 48 | name = "calc" 49 | required-features = ["macros"] 50 | 51 | [[example]] 52 | name = "json" 53 | required-features = ["macros"] 54 | 55 | [[example]] 56 | name = "clike" 57 | required-features = ["macros"] 58 | 59 | [[bench]] 60 | name = "c_lexer" 61 | harness = false 62 | required-features = ["macros"] 63 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 MaxXing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # laps 2 | 3 | [github](https://github.com/MaxXSoft/laps) 4 | [crates.io](https://crates.io/crates/laps) 5 | [docs.rs](https://docs.rs/laps) 6 | [build status](https://github.com/MaxXSoft/laps/actions?query=branch%3Amaster) 7 | 8 | Lexer and parser collections. 

With `laps`, you can build lexers/parsers by just defining tokens/ASTs and deriving `Tokenize`/`Parse` traits for them.

## Usage

Add `laps` to your project by running `cargo add`:

```
cargo add laps --features macros
```

## Example

Implement a lexer for [S-expressions](https://en.wikipedia.org/wiki/S-expression):

```rust
use laps::prelude::*;

#[token_kind]
#[derive(Debug, Tokenize)]
enum TokenKind {
  // This token will be skipped.
  #[skip(r"\s+")]
  _Skip,
  /// Parentheses.
  #[regex(r"[()]")]
  Paren(char),
  /// Atom.
  #[regex(r"[^\s()]+")]
  Atom(String),
  /// End-of-file.
  #[eof]
  Eof,
}
```

And the parser and [ASTs](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (or actually [CSTs](https://en.wikipedia.org/wiki/Parse_tree)):

```rust
type Token = laps::token::Token<TokenKind>;

token_ast! {
  macro Token {
    [atom] => { kind: TokenKind::Atom(_), prompt: "atom" },
    [lpr] => { kind: TokenKind::Paren('(') },
    [rpr] => { kind: TokenKind::Paren(')') },
    [eof] => { kind: TokenKind::Eof },
  }
}

#[derive(Parse)]
#[token(Token)]
enum Statement {
  Elem(Elem),
  End(Token![eof]),
}

#[derive(Parse)]
#[token(Token)]
struct SExp(Token![lpr], Vec<Elem>, Token![rpr]);

#[derive(Parse)]
#[token(Token)]
enum Elem {
  Atom(Token![atom]),
  SExp(SExp),
}
```

The above implementation is very close in form to the corresponding EBNF representation of the S-expression:

```ebnf
Statement ::= Elem | EOF;
SExp ::= "(" {Elem} ")";
Elem ::= ATOM | SExp;
```

## More Examples

See the [`examples` directory](examples), which contains the following examples:

* [`sexp`](examples/sexp): an [S-expression](https://en.wikipedia.org/wiki/S-expression) parser.
* [`calc`](examples/calc): a simple expression calculator.
* [`json`](examples/json): a simple JSON parser.
* [`clike`](examples/clike): an interpreter for a C-like programming language.

## Accelerating Code Completion for IDEs

By default, Cargo does not enable optimizations for procedural macros, which may result in slower code completion if you are using `laps` to generate lexers. To avoid this, you can add the following configuration to `Cargo.toml`:

```toml
[profile.dev.build-override]
opt-level = 3
```

You can also try to manually enable/disable parallelization for lexer generation by adding:

```rust
#[derive(Tokenize)]
#[enable_par(true)] // or #[enable_par(false)]
enum TokenKind {
  // ...
}
```

The parallelization setting only affects compilation speed and has no effect at runtime; by default it is set automatically by `laps`.

## Changelog

See [CHANGELOG.md](CHANGELOG.md).

## License

Copyright (C) 2022-2023 MaxXing. Licensed under either of [Apache 2.0](LICENSE-APACHE) or [MIT](LICENSE-MIT) at your option.
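
As a reference for wiring these pieces together, the following sketch (adapted from [`examples/sexp/main.rs`](examples/sexp/main.rs) further down in this dump, and assuming the AST types above additionally derive `Debug` so they can be printed) reads S-expressions from stdin and prints them:

```rust
use laps::{prelude::*, reader::Reader, span::Result, token::TokenBuffer};

fn main() -> Result<()> {
  // Tokenize standard input with the derived lexer.
  let lexer = TokenKind::lexer(Reader::from_stdin());
  // Buffer tokens so the parser can perform look-ahead.
  let mut tokens = TokenBuffer::new(lexer);
  // Parse and print statements until end-of-file.
  loop {
    match tokens.parse::<Statement>()? {
      Statement::End(_) => break Ok(()),
      stmt => println!("{stmt:#?}"),
    }
  }
}
```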

--------------------------------------------------------------------------------
/benches/c_lexer.rs:
--------------------------------------------------------------------------------
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use laps::{lexer::int_literal, prelude::*, reader::Reader};
use std::{fmt, fs::read_to_string, str::FromStr};

#[token_kind]
#[derive(Debug, Tokenize)]
enum TokenKind {
  #[skip(r"\s+")]
  _Skip,
  /// Keyword.
  #[regex(r"int|void|if|else|while|break|continue|return")]
  Keyword(Keyword),
  /// Identifier.
  #[regex(r"[_a-zA-Z][_a-zA-Z0-9]*")]
  Ident(String),
  /// Integer-literal.
  #[regex(r"[0-9]|[1-9][0-9]+|0x[0-9a-fA-F]+", int_literal)]
  Int(u64),
  /// Operator.
  #[regex(r"\+|-|\*|/|%|<|>|<=|>=|==|!=|&&|\|\||!|=")]
  Operator(Operator),
  /// Other character.
  #[regex(r".")]
  Other(char),
  /// End-of-file.
  #[eof]
  Eof,
}

#[derive(Clone, Copy, Debug, PartialEq)]
enum Keyword {
  Int,
  Void,
  If,
  Else,
  While,
  Break,
  Continue,
  Return,
}

impl FromStr for Keyword {
  type Err = ();

  fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
    match s {
      "int" => Ok(Keyword::Int),
      "void" => Ok(Keyword::Void),
      "if" => Ok(Keyword::If),
      "else" => Ok(Keyword::Else),
      "while" => Ok(Keyword::While),
      "break" => Ok(Keyword::Break),
      "continue" => Ok(Keyword::Continue),
      "return" => Ok(Keyword::Return),
      _ => Err(()),
    }
  }
}

impl fmt::Display for Keyword {
  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    match self {
      Self::Int => write!(f, "int"),
      Self::Void => write!(f, "void"),
      Self::If => write!(f, "if"),
      Self::Else => write!(f, "else"),
      Self::While => write!(f, "while"),
      Self::Break => write!(f, "break"),
      Self::Continue => write!(f, "continue"),
      Self::Return => write!(f, "return"),
    }
  }
}

#[derive(Clone, Copy, Debug, PartialEq)]
enum Operator {
  Add,
  Sub,
  Mul,
  Div,
  Mod,
  Lt,
  Gt,
  Le,
  Ge,
  Eq,
  Ne,
  And,
  Or,
  Not,
  Assign,
}

impl FromStr for Operator {
  type Err = ();

  fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
    match s {
      "+" => Ok(Self::Add),
      "-" => Ok(Self::Sub),
      "*" => Ok(Self::Mul),
      "/" => Ok(Self::Div),
      "%" => Ok(Self::Mod),
      "<" => Ok(Self::Lt),
      ">" => Ok(Self::Gt),
      "<=" => Ok(Self::Le),
      ">=" => Ok(Self::Ge),
      "==" => Ok(Self::Eq),
      "!=" => Ok(Self::Ne),
      "&&" => Ok(Self::And),
      "||" => Ok(Self::Or),
      "!"
=> Ok(Self::Not), 113 | "=" => Ok(Self::Assign), 114 | _ => Err(()), 115 | } 116 | } 117 | } 118 | 119 | impl fmt::Display for Operator { 120 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 121 | match self { 122 | Self::Add => write!(f, "+"), 123 | Self::Sub => write!(f, "-"), 124 | Self::Mul => write!(f, "*"), 125 | Self::Div => write!(f, "/"), 126 | Self::Mod => write!(f, "%"), 127 | Self::Lt => write!(f, "<"), 128 | Self::Gt => write!(f, ">"), 129 | Self::Le => write!(f, "<="), 130 | Self::Ge => write!(f, ">="), 131 | Self::Eq => write!(f, "=="), 132 | Self::Ne => write!(f, "!="), 133 | Self::And => write!(f, "&&"), 134 | Self::Or => write!(f, "||"), 135 | Self::Not => write!(f, "!"), 136 | Self::Assign => write!(f, "="), 137 | } 138 | } 139 | } 140 | 141 | fn tokenize(s: &str) { 142 | let mut lexer = TokenKind::lexer(Reader::from(s)); 143 | loop { 144 | let token = lexer.next_token().unwrap(); 145 | match token.kind { 146 | TokenKind::Eof => break, 147 | t => black_box(t), 148 | }; 149 | } 150 | } 151 | 152 | fn bench_tokenize(c: &mut Criterion) { 153 | let mut group = c.benchmark_group("c_lexer"); 154 | for src in ["sort", "spaces", "long"] { 155 | let input = read_to_string(format!("benches/{src}.c")).unwrap(); 156 | group.throughput(Throughput::Bytes(input.as_bytes().len() as u64)); 157 | group.bench_with_input(src, &input, |b, s| b.iter(|| tokenize(s))); 158 | } 159 | group.finish(); 160 | } 161 | 162 | criterion_group!(benches, bench_tokenize); 163 | criterion_main!(benches); 164 | -------------------------------------------------------------------------------- /benches/sort.c: -------------------------------------------------------------------------------- 1 | int buf[2][100]; 2 | 3 | // sort [l, r) 4 | void merge_sort(int l, int r) 5 | { 6 | if (l + 1 >= r) 7 | return; 8 | 9 | int mid = (l + r) / 2; 10 | merge_sort(l, mid); 11 | merge_sort(mid, r); 12 | 13 | int i = l, j = mid, k = l; 14 | while (i < mid && j < r) { 15 | if (buf[0][i] < buf[0][j]) { 16 | buf[1][k] = buf[0][i]; 17 | i = i + 1; 18 | } else { 19 | buf[1][k] = buf[0][j]; 20 | j = j + 1; 21 | } 22 | k = k + 1; 23 | } 24 | while (i < mid) { 25 | buf[1][k] = buf[0][i]; 26 | i = i + 1; 27 | k = k + 1; 28 | } 29 | while (j < r) { 30 | buf[1][k] = buf[0][j]; 31 | j = j + 1; 32 | k = k + 1; 33 | } 34 | 35 | while (l < r) { 36 | buf[0][l] = buf[1][l]; 37 | l = l + 1; 38 | } 39 | } 40 | 41 | int main() 42 | { 43 | int n = getarray(buf[0]); 44 | merge_sort(0, n); 45 | putarray(n, buf[0]); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /benches/spaces.c: -------------------------------------------------------------------------------- 1 | int __HELLO [ 2 | 3 | 4 | 100 5 | ] 6 | = { 7 | 87, 101, 108, 99, 8 | 111, 109, 101, 9 | 32, 116, 111, 32, 10 | 116, 104, 11 | 101, 32, 74, 12 | 97, 13 | 14 | 112, 97, 15 | 16 | 114, 105, 32, 80, 97, 17 | 18 | 19 | 20 | 21 | 114, 107, 33, 10 }; /* Names of 22 | kemono 23 | friends */ int N4__mE___[6][50] = { { 83, 97, 97, 98, 24 | 97, 25 | 114, 26 | 117 }, { 75, 97, 98, 27 | 28 | 97, 110 29 | 30 | }, { 31 | 32 | 33 | 72, 34 | 35 | 97, 36 | 115, 104, 105, 37 | 98, 105, 114, 111, 38 | 39 | 40 | 41 | 42 | 43 | 44 | 107, 45 | 111, 46 | 47 | 48 | 117 49 | 50 | }, { 65, 114, 51 | 52 | 97, 53 | 54 | 105, 55 | 103, 56 | 57 | 117, 58 | 109, 59 | 60 | 61 | 97 }, 62 | { 72, 117, 63 | 110, 98, 111, 114, 64 | 117, 65 | 66 | 116, 111, 32, 80, 67 | 101, 110, 68 | 69 | 103, 105, 110 70 | }, 71 | { 84, 97, 105, 114, 105, 107, 117, 32, 
79, 72 | 73 | 74 | 111, 107, 75 | 97, 76 | 109, 77 | 78 | 79 | 80 | 81 | 82 | 83 | 105 } }; 84 | int 85 | 86 | saY_HeI10_To[40] = { 32, 87 | 115, 97, 121, 88 | 89 | 115, 90 | 91 | 32, 92 | 104, 93 | 94 | 101, 108, 108, 111, 95 | 96 | 32, 97 | 98 | 99 | 116, 111, 100 | 32 }; int 101 | RET[5] 102 | = 103 | {10}; int putstr( 104 | int str[ ] ) { 105 | int 106 | 107 | iNd__1X ; iNd__1X = 0 ; while ( str 108 | [ iNd__1X 109 | ] ) { 110 | 111 | putch ( 112 | 113 | str[ iNd__1X 114 | ] 115 | ) ; iNd__1X 116 | = 117 | iNd__1X 118 | 119 | + 1 120 | 121 | ; } return iNd__1X 122 | ; } int main( /* no param */ ) { 123 | putstr( 124 | __HELLO ) ; int i = 125 | 0 ; /* say 126 | 127 | 128 | hello to 129 | kemono friends 130 | ~ */ while ( 131 | 132 | 1 ) { 133 | 134 | int _ 135 | = i 136 | 137 | / 6 138 | 139 | ; int __ 140 | = 141 | i % 6 142 | 143 | ; 144 | 145 | if 146 | ( 147 | 148 | _ 149 | 150 | != 151 | 152 | 153 | __ ) 154 | { putstr( 155 | 156 | N4__mE___ 157 | 158 | [ _ 159 | ] ) 160 | ; putstr( 161 | saY_HeI10_To ) ; 162 | putstr( 163 | N4__mE___ [ 164 | 165 | 166 | 167 | __ ] ) 168 | 169 | 170 | ; 171 | 172 | putstr( 173 | RET 174 | ) ; 175 | } 176 | /* 177 | do 178 | 179 | linear 180 | modulo 181 | to find the next pair of friends */ i = ( i 182 | * 183 | 184 | 185 | 17 186 | 187 | + 23 188 | ) 189 | % 190 | 191 | 192 | 32 193 | 194 | 195 | ; 196 | if ( i 197 | == 198 | 0 ) { break ; } 199 | 200 | 201 | } return 0; } -------------------------------------------------------------------------------- /examples/calc/README.md: -------------------------------------------------------------------------------- 1 | # calc 2 | 3 | A simple expression calculator, with a front-end built with `laps`. 4 | 5 | Supporting addition, subtraction, multiplication, division, modulo and brackets. 6 | 7 | ## Usage 8 | 9 | Run in the repository root: 10 | 11 | ``` 12 | echo '-10 * (2 + 5) * 2 - 5.3' | cargo run --example calc --features=macros 13 | ``` 14 | 15 | The output will be: 16 | 17 | ``` 18 | -145.3 19 | ``` 20 | -------------------------------------------------------------------------------- /examples/calc/main.rs: -------------------------------------------------------------------------------- 1 | use laps::{ast::NonEmptySepList, prelude::*, reader::Reader, span::Result, token::TokenBuffer}; 2 | 3 | /// Kinds of the token. 4 | /// 5 | /// The tokenizer (lexer) will read user input and turn it into a stream of 6 | /// tokens based on regular expressions. 7 | #[token_kind] 8 | #[derive(Tokenize)] 9 | enum TokenKind { 10 | // This token will be skipped. 11 | #[skip(r"\s+")] 12 | _Skip, 13 | /// Floating-point number. 14 | #[regex(r"[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?")] 15 | Float(f64), 16 | /// Other character. 17 | #[regex(r".")] 18 | Other(char), 19 | /// End-of-file. 20 | #[eof] 21 | Eof, 22 | } 23 | 24 | /// Type of token. 25 | /// 26 | /// [`laps::token::Token`] has two fields, one is the token kind and 27 | /// the other is the span of this token, representing the location of 28 | /// the token in the input. 29 | type Token = laps::token::Token; 30 | 31 | token_ast! { 32 | /// Macro for referencing ASTs corresponding to tokens. 33 | /// 34 | /// The [`token_ast`] macro defines ASTs for tokens, and automatically 35 | /// implements methods for parsing them. 
  macro Token {
    [float] => { kind: TokenKind::Float(_), prompt: "floating-point" },
    [+] => { kind: TokenKind::Other('+') },
    [-] => { kind: TokenKind::Other('-') },
    [*] => { kind: TokenKind::Other('*') },
    [/] => { kind: TokenKind::Other('/') },
    [%] => { kind: TokenKind::Other('%') },
    [lpr] => { kind: TokenKind::Other('(') },
    [rpr] => { kind: TokenKind::Other(')') },
    [eof] => { kind: TokenKind::Eof },
  }
}

// EBNF of arithmetic expression:
//
// Expr ::= AddExpr EOF;
// AddExpr ::= MulExpr {AddOps MulExpr};
// AddOps ::= "+" | "-";
// MulExpr ::= Value {MulOps Value};
// MulOps ::= "*" | "/" | "%";
// Value ::= FLOAT | "-" Value | "(" AddExpr ")";
//
// So we define the following ASTs, and implement their parsers by deriving
// the `Parse` trait.

#[derive(Parse)]
#[token(Token)]
struct Expr {
  add: AddExpr,
  _eof: Token![eof],
}

type AddExpr = NonEmptySepList<MulExpr, AddOps>;

#[derive(Parse)]
#[token(Token)]
enum AddOps {
  Add(Token![+]),
  Sub(Token![-]),
}

type MulExpr = NonEmptySepList<Value, MulOps>;

#[derive(Parse)]
#[token(Token)]
enum MulOps {
  Mul(Token![*]),
  Div(Token![/]),
  Mod(Token![%]),
}

#[derive(Parse)]
#[token(Token)]
enum Value {
  Num(Token![float]),
  Neg(Token![-], Box<Self>),
  Paren(Token![lpr], Box<AddExpr>, Token![rpr]),
}

// Some implementations for calculating the parsed expression.

trait Calculate {
  fn calc(&self) -> Result<f64>;
}

impl Calculate for Expr {
  fn calc(&self) -> Result<f64> {
    self.add.calc()
  }
}

impl Calculate for AddExpr {
  fn calc(&self) -> Result<f64> {
    match self {
      Self::One(e) => e.calc(),
      Self::More(l, op, r) => {
        let (l, r) = (l.calc()?, r.calc()?);
        Ok(match op {
          AddOps::Add(_) => l + r,
          AddOps::Sub(_) => l - r,
        })
      }
    }
  }
}

impl Calculate for MulExpr {
  fn calc(&self) -> Result<f64> {
    match self {
      Self::One(e) => e.calc(),
      Self::More(l, op, r) => {
        let (l, r) = (l.calc()?, r.calc()?);
        Ok(match op {
          MulOps::Mul(_) => l * r,
          MulOps::Div(_) => l / r,
          MulOps::Mod(_) => l % r,
        })
      }
    }
  }
}

impl Calculate for Value {
  fn calc(&self) -> Result<f64> {
    match self {
      Self::Num(num) => Ok(*num.unwrap_ref::<&f64, _>()),
      Self::Neg(_, value) => Ok(-value.calc()?),
      Self::Paren(_, add, _) => add.calc(),
    }
  }
}

fn main() -> Result<()> {
  // Create a reader and a lexer.
  let reader = Reader::from_stdin();
  let lexer = TokenKind::lexer(reader);
  // Create a token buffer for parsing.
  // Token buffer can temporarily hold tokens to help the parser perform
  // some look-ahead operations.
  let mut tokens = TokenBuffer::new(lexer);
  // Parse and calculate expression, and print the result.
  println!("{}", tokens.parse::<Expr>()?.calc()?);
  Ok(())
}

--------------------------------------------------------------------------------
/examples/clike/README.md:
--------------------------------------------------------------------------------
# clike

Interpreter for a C-like programming language, with a front-end built with `laps`.

## Usage

Run in the repository root:

```
cargo run --example clike --features=macros -- examples/clike/fib.c
```

Input:

```
10
```

Output:

```
55
```

--------------------------------------------------------------------------------
/examples/clike/fib.c:
--------------------------------------------------------------------------------
int fib(int n) {
  if (n <= 2) {
    return 1;
  } else {
    return fib(n - 1) + fib(n - 2);
  }
}

int main() {
  return putint(fib(getint()));
}

--------------------------------------------------------------------------------
/examples/clike/sort.c:
--------------------------------------------------------------------------------
int arr[20];

int qsort(int l, int r) {
  int i = l;
  int j = r;
  int p = arr[(l + r) / 2];
  while (i <= j) {
    while (arr[i] < p) i = i + 1;
    while (arr[j] > p) j = j - 1;
    if (i > j) break;
    int u = arr[i];
    arr[i] = arr[j];
    arr[j] = u;
    i = i + 1;
    j = j - 1;
  }
  if (i < r) qsort(i, r);
  if (j > l) qsort(l, j);
  return 0;
}

int main() {
  int i = 0;
  while (i < 20) {
    arr[i] = getint();
    i = i + 1;
  }
  qsort(0, 19);
  i = 0;
  while (i < 20) {
    putint(arr[i]);
    i = i + 1;
  }
  return 0;
}

--------------------------------------------------------------------------------
/examples/json/README.md:
--------------------------------------------------------------------------------
# json

A simple JSON parser, with a front-end built with `laps`.

## Usage

Run in the repository root:

```
cargo run --example json --features=macros -- examples/json/example.json
```

The structure of the parsed JSON will be printed.

--------------------------------------------------------------------------------
/examples/json/main.rs:
--------------------------------------------------------------------------------
use laps::ast::SepSeq;
use laps::prelude::*;
use laps::reader::Reader;
use laps::return_error;
use laps::token::TokenBuffer;
use std::{collections::HashMap, env, fmt, io::Read, process, str::FromStr};

// ==============================
// Token definitions.
// ==============================

type Token = laps::token::Token<TokenKind>;

#[token_kind]
#[derive(Tokenize)]
enum TokenKind {
  #[skip(r"[ \r\n\t]+")]
  _Skip,
  /// Keyword.
  #[regex(r"true|false|null")]
  Keyword(Keyword),
  /// Number.
  #[regex(r"-?([0-9]|[1-9][0-9]+)(\.[0-9]+)?([Ee][+-]?[0-9]+)?")]
  Number(f64),
  /// String.
  #[regex(r#""([^\x00-\x1f"\\]|\\(["\\/bfnrt]|u[0-9a-fA-F]{4}))*""#, json_str)]
  String(String),
  /// Other character.
  #[regex(r".")]
  Other(char),
  /// End-of-file.
  #[eof]
  Eof,
}

#[derive(Clone, PartialEq)]
enum Keyword {
  True,
  False,
  Null,
}

impl FromStr for Keyword {
  type Err = ();

  fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
    match s {
      "true" => Ok(Self::True),
      "false" => Ok(Self::False),
      "null" => Ok(Self::Null),
      _ => Err(()),
    }
  }
}

impl fmt::Display for Keyword {
  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    match self {
      Self::True => write!(f, "true"),
      Self::False => write!(f, "false"),
      Self::Null => write!(f, "null"),
    }
  }
}

/// Converts a matched JSON string literal to its unescaped value.
fn json_str(s: &str) -> Option<String> {
  let mut buf = String::new();
  let mut escape = false;
  let mut hex_num = 0;
  let mut hex = 0;
  // skip the surrounding quotes
  for c in s[1..s.len() - 1].chars() {
    if escape {
      // accept the hex digits of a `\uXXXX` escape
      if hex_num > 0 && c.is_ascii_hexdigit() {
        hex = hex * 16 + c.to_digit(16)?;
        hex_num -= 1;
        if hex_num == 0 {
          buf.push(char::from_u32(hex)?);
          hex = 0;
          escape = false;
        }
      } else if c == 'u' {
        hex_num = 4;
      } else {
        match c {
          '"' => buf.push('"'),
          '\\' => buf.push('\\'),
          '/' => buf.push('/'),
          'b' => buf.push('\x08'),
          'f' => buf.push('\x0c'),
          'n' => buf.push('\n'),
          'r' => buf.push('\r'),
          't' => buf.push('\t'),
          _ => return None,
        }
        escape = false;
      }
    } else {
      match c {
        '\\' => escape = true,
        c => buf.push(c),
      }
    }
  }
  Some(buf)
}

// ==============================
// AST definitions.
// ==============================

token_ast! {
  macro Token {
    [true] => { kind: TokenKind::Keyword(Keyword::True) },
    [false] => { kind: TokenKind::Keyword(Keyword::False) },
    [null] => { kind: TokenKind::Keyword(Keyword::Null) },
    [num] => { kind: TokenKind::Number(_), prompt: "number" },
    [str] => { kind: TokenKind::String(_), prompt: "string" },
    [:] => { kind: TokenKind::Other(':') },
    [,] => { kind: TokenKind::Other(',') },
    [lbk] => { kind: TokenKind::Other('{') },
    [rbk] => { kind: TokenKind::Other('}') },
    [lbc] => { kind: TokenKind::Other('[') },
    [rbc] => { kind: TokenKind::Other(']') },
    [eof] => { kind: TokenKind::Eof },
  }
}

#[derive(Parse)]
#[token(Token)]
struct JsonDef {
  value: ValueDef,
  _eof: Token![eof],
}

#[derive(Parse)]
#[token(Token)]
enum ValueDef {
  ObjectDef(ObjectDef),
  ArrayDef(ArrayDef),
  String(Token![str]),
  Number(Token![num]),
  True(Token![true]),
  False(Token![false]),
  Null(Token![null]),
}

#[derive(Parse)]
#[token(Token)]
struct ObjectDef {
  _lbk: Token![lbk],
  members: SepSeq<Member, Token![,]>,
  _rbk: Token![rbk],
}

#[derive(Parse)]
#[token(Token)]
struct Member {
  name: Token![str],
  _colon: Token![:],
  value: ValueDef,
}

#[derive(Parse)]
#[token(Token)]
struct ArrayDef {
  _lbc: Token![lbc],
  values: SepSeq<ValueDef, Token![,]>,
  _rbc: Token![rbc],
}

// ==============================
// Converter.
// ==============================

#[derive(Debug)]
enum Value {
  Object(HashMap<String, Value>),
  Array(Vec<Value>),
  String(String),
  Number(f64),
  Bool(bool),
  Null,
}

impl From<JsonDef> for Value {
  fn from(json: JsonDef) -> Self {
    json.value.into()
  }
}

impl From<ValueDef> for Value {
  fn from(value: ValueDef) -> Self {
    match value {
      ValueDef::ObjectDef(obj) => obj.into(),
      ValueDef::ArrayDef(arr) => arr.into(),
      ValueDef::String(s) => Self::String(s.unwrap()),
      ValueDef::Number(n) => Self::Number(n.unwrap()),
      ValueDef::True(_) => Self::Bool(true),
      ValueDef::False(_) => Self::Bool(false),
      ValueDef::Null(_) => Self::Null,
    }
  }
}

impl From<ObjectDef> for Value {
  fn from(obj: ObjectDef) -> Self {
    Self::Object(
      obj
        .members
        .into_iter()
        .map(|Member { name, value, .. }| (name.unwrap(), value.into()))
        .collect(),
    )
  }
}

impl From<ArrayDef> for Value {
  fn from(arr: ArrayDef) -> Self {
    Self::Array(arr.values.into_iter().map(From::from).collect())
  }
}

fn main() {
  let mut args = env::args();
  args.next();
  match args.next() {
    Some(path) => parse_and_dump(Reader::from_path(path).expect("invalid path")),
    None => parse_and_dump(Reader::from_stdin()),
  }
}

fn parse_and_dump<T>(reader: Reader<T>)
where
  T: Read,
{
  let span = reader.span().clone();
  let lexer = TokenKind::lexer(reader);
  let mut tokens = TokenBuffer::new(lexer);
  if let Ok(json) = tokens.parse::<JsonDef>() {
    let value = Value::from(json);
    println!("{value:#?}");
  } else {
    span.log_summary();
    process::exit(span.error_num() as i32);
  }
}

--------------------------------------------------------------------------------
/examples/sexp/README.md:
--------------------------------------------------------------------------------
# sexp

An [S-expression](https://en.wikipedia.org/wiki/S-expression) parser built with `laps`.

## Usage

Run in the repository root:

```
cat examples/sexp/example.sexp | cargo run --example sexp --features=macros
```

The structure of the parsed S-expression AST will be printed.

--------------------------------------------------------------------------------
/examples/sexp/example.sexp:
--------------------------------------------------------------------------------
atom

()

(() (()))

(The (quick (brown fox))
     jumps over ((the) lazy) dog.)

(defun factorial (x)
  (if (zerop x)
      1
      (* x (factorial (- x 1)))))

--------------------------------------------------------------------------------
/examples/sexp/main.rs:
--------------------------------------------------------------------------------
use laps::{prelude::*, reader::Reader, span::Result, token::TokenBuffer};

/// Kinds of the token.
///
/// The tokenizer (lexer) will read user input and turn it into a stream of
/// tokens based on regular expressions.
#[token_kind]
#[derive(Debug, Tokenize)]
enum TokenKind {
  // This token will be skipped.
  #[skip(r"\s+")]
  _Skip,
  /// Parentheses.
  #[regex(r"[()]")]
  Paren(char),
  /// Atom.
  #[regex(r"[^\s()]+")]
  Atom(String),
  /// End-of-file.
  #[eof]
  Eof,
}

/// Type of token.
///
/// [`laps::token::Token`] has two fields, one is the token kind and
/// the other is the span of this token, representing the location of
/// the token in the input.
type Token = laps::token::Token<TokenKind>;

token_ast! {
  /// Macro for referencing ASTs corresponding to tokens.
  ///
  /// The [`token_ast`] macro defines ASTs for tokens, and automatically
  /// implements methods for parsing them.
  #[derive(Clone, Debug, PartialEq)]
  macro Token {
    [atom] => { kind: TokenKind::Atom(_), prompt: "atom" },
    [lpr] => { kind: TokenKind::Paren('(') },
    [rpr] => { kind: TokenKind::Paren(')') },
    [eof] => { kind: TokenKind::Eof },
  }
}

// EBNF of S-expression:
//
// Statement ::= Elem | EOF;
// SExp ::= "(" {Elem} ")";
// Elem ::= ATOM | SExp;
//
// So we define the following ASTs, and implement their parsers by deriving
// the `Parse` trait.

#[derive(Parse, Debug)]
#[token(Token)]
enum Statement {
  Elem(Elem),
  End(Token![eof]),
}

#[derive(Parse, Debug)]
#[token(Token)]
struct SExp(Token![lpr], Vec<Elem>, Token![rpr]);

#[derive(Parse, Debug)]
#[token(Token)]
enum Elem {
  Atom(Token![atom]),
  SExp(SExp),
}

fn main() -> Result<()> {
  // Create a reader and a lexer.
  let reader = Reader::from_stdin();
  let lexer = TokenKind::lexer(reader);
  // Create a token buffer for parsing.
  // Token buffer can temporarily hold tokens to help the parser perform
  // some look-ahead operations.
  let mut tokens = TokenBuffer::new(lexer);
  // Parse S-expressions and print them until the end of the input.
  loop {
    match tokens.parse::<Statement>()? {
      Statement::End(_) => break Ok(()),
      stmt => println!("{stmt:#?}"),
    }
  }
}

--------------------------------------------------------------------------------
/laps_macros/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "laps_macros"
version = "0.1.5"
authors = ["MaxXing "]
edition = "2021"
description = "Macros for crate `laps`."
repository = "https://github.com/MaxXSoft/laps"
documentation = "https://docs.rs/laps_macros"
categories = ["parsing", "development-tools::procedural-macro-helpers"]
keywords = ["laps", "parser", "lexer", "derive", "proc_macro"]
readme = "README.md"
license = "MIT OR Apache-2.0"

[lib]
proc-macro = true

[dependencies]
proc-macro2 = "1.0"
quote = "1.0"
syn = { version = "2.0", features = ["full"] }
laps_regex = { path = "../laps_regex", version = "0.1.1" }

--------------------------------------------------------------------------------
/laps_macros/README.md:
--------------------------------------------------------------------------------
# laps_macros

Macros for crate [`laps`](https://crates.io/crates/laps), including derive macros and other helper macros.

## License

Copyright (C) 2022-2023 MaxXing. Licensed under either of Apache 2.0 or MIT at your option.

--------------------------------------------------------------------------------
/laps_macros/src/lib.rs:
--------------------------------------------------------------------------------
//! Macros for crate [`laps`](https://crates.io/crates/laps),
//! including derive macros and other helper macros.

mod parse;
mod spanned;
mod token_ast;
mod token_kind;
mod tokenize;
mod utils;

use proc_macro::TokenStream;
use utils::result_to_tokens;

/// Generates the `Parse` trait implementation.
///
/// # Helper Attributes
///
/// * `#[token(type)]`: implements the `Parse` trait for token streams that
///   produce tokens of the given type.
/// * `#[starts_with(token_ast0, token_ast1, ...)]`: specifies which tokens
///   the current AST may start with. This will affect the implementation of
///   method `maybe` of the `Parse` trait.
#[proc_macro_derive(Parse, attributes(token, starts_with))]
pub fn derive_parse(item: TokenStream) -> TokenStream {
  result_to_tokens!(parse::derive_parse(item))
}

/// Generates the `Spanned` trait implementation.
///
/// # `#[try_span]`
///
/// Tells the macro that a field implements the `TrySpan` trait.
/// This may be helpful when:
///
/// ```
/// # use laps_macros::Spanned;
/// # mod laps {
/// #   pub mod span {
/// #     pub type Result<T> = std::result::Result<T, ()>;
/// #     pub struct Span;
/// #     impl Span {
/// #       pub fn into_end_updated(self, span: Self) -> Self { todo!() }
/// #     }
/// #     pub trait Spanned {
/// #       fn span(&self) -> Span;
/// #     }
/// #     pub trait TrySpan {
/// #       fn try_span(&self) -> Option<Span>;
/// #     }
/// #     impl<T> TrySpan for T where T: Spanned {
/// #       fn try_span(&self) -> Option<Span> { todo!() }
/// #     }
/// #     impl<T> TrySpan for Option<T> where T: TrySpan {
/// #       fn try_span(&self) -> Option<Span> { todo!() }
/// #     }
/// #   }
/// # }
/// # struct Atom;
/// # impl laps::span::Spanned for Atom {
/// #   fn span(&self) -> laps::span::Span { todo!() }
/// # }
/// # type ReturnKeyword = Atom;
/// # type Value = Atom;
/// #[derive(Spanned)]
/// struct Return {
///   ret: ReturnKeyword,
///   #[try_span]
///   value: Option<Value>,
/// }
/// ```
///
/// The following deriving fails to compile:
///
/// ```compile_fail
/// # use laps_macros::Spanned;
/// # mod laps {
/// #   pub mod span {
/// #     pub type Result<T> = std::result::Result<T, ()>;
/// #     pub struct Span;
/// #     impl Span {
/// #       pub fn into_end_updated(self, span: Self) -> Self { todo!() }
/// #     }
/// #     pub trait Spanned {
/// #       fn span(&self) -> Span;
/// #     }
/// #     pub trait TrySpan {
/// #       fn try_span(&self) -> Option<Span>;
/// #     }
/// #     impl<T> TrySpan for T where T: Spanned {
/// #       fn try_span(&self) -> Option<Span> { todo!() }
/// #     }
/// #     impl<T> TrySpan for Option<T> where T: TrySpan {
/// #       fn try_span(&self) -> Option<Span> { todo!() }
/// #     }
/// #   }
/// # }
/// # struct Atom;
/// # impl laps::span::Spanned for Atom {
/// #   fn span(&self) -> laps::span::Span { todo!() }
/// # }
/// # type ReturnKeyword = Atom;
/// # type Value = Atom;
/// #[derive(Spanned)]
/// struct Return {
///   ret: ReturnKeyword,
///   value: Option<Value>,
/// }
/// ```
#[proc_macro_derive(Spanned, attributes(try_span))]
pub fn derive_spanned(item: TokenStream) -> TokenStream {
  result_to_tokens!(spanned::derive_spanned(item))
}

/// Generates the `Tokenize` trait implementation for token kinds. This macro
/// can only be applied to `enum`s.
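///
/// # Examples
///
/// A minimal sketch (mirroring the `sexp` example in this repository; it
/// compiles against the `laps` crate, so it is not run as a doctest here):
///
/// ```ignore
/// use laps::prelude::*;
///
/// #[token_kind]
/// #[derive(Tokenize)]
/// enum TokenKind {
///   // This token will be skipped.
///   #[skip(r"\s+")]
///   _Skip,
///   /// Parentheses.
///   #[regex(r"[()]")]
///   Paren(char),
///   /// Atom.
///   #[regex(r"[^\s()]+")]
///   Atom(String),
///   /// End-of-file.
///   #[eof]
///   Eof,
/// }
/// ```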
///
/// # Helper Attributes
///
/// * `#[char_type(type)]`: optional, specifies `CharType` of `Tokenize` trait.
///   Defaults to [`char`], and can only be [`char`] or [`u8`].
/// * `#[enable_par(true/false)]`: optional, set to `true` to generate the lexer
///   in parallel, `false` to disable parallelization. Defaults to automatic
///   selection.
/// * `#[regex(regex [, parser])]`: marks an enum variant that can be matched by
///   the given regular expression. The `parser` parameter is optional, which is
///   a function that converts a &[str] (`char_type` = [`char`]) or
///   a &[[u8]] (`char_type` = [`u8`]) to [`Option<T>`], where `T`
///   is the type of the tuple field of this variant. If `parser` is omitted,
///   [`std::str::FromStr`] will be called.
/// * `#[skip(regex)]`: marks an enum variant that can be matched by the
///   given regular expression, and should be skipped.
/// * `#[eof]`: marks an enum variant that should be returned when the tokenizer
///   encounters end-of-file.
///
/// # Notes
///
/// The variants that appear first will be matched first.
#[proc_macro_derive(Tokenize, attributes(char_type, enable_par, regex, skip, eof))]
pub fn derive_tokenize(item: TokenStream) -> TokenStream {
  result_to_tokens!(tokenize::derive_tokenize(item))
}

/// Implements [`From`], [`TryFrom`] and [`Display`](std::fmt::Display)
/// traits for token kind enums.
///
/// The [`From`] and [`TryFrom`] traits will only be implemented for variants
/// with a single unnamed field.
///
/// # Examples
///
/// ```
/// # use laps_macros::token_kind;
/// #[token_kind]
/// enum TokenKind {
///   /// String literal.
///   Str(String),
///   /// Integer literal.
///   Int(i32),
///   /// End-of-file.
///   Eof,
/// }
/// ```
///
/// will be expanded to:
///
/// ```
/// #[derive(Clone, PartialEq)]
/// enum TokenKind {
///   // ...
///   # Str(String),
///   # Int(i32),
///   # Eof,
/// }
///
/// impl From<String> for TokenKind {
///   fn from(s: String) -> Self {
///     Self::Str(s)
///   }
/// }
///
/// impl TryFrom<TokenKind> for String {
///   type Error = ();
///   fn try_from(kind: TokenKind) -> Result<Self, Self::Error> {
///     match kind {
///       TokenKind::Str(s) => Ok(s),
///       _ => Err(()),
///     }
///   }
/// }
///
/// impl<'a> TryFrom<&'a TokenKind> for &'a String {
///   type Error = ();
///   fn try_from(kind: &'a TokenKind) -> Result<Self, Self::Error> {
///     match kind {
///       TokenKind::Str(s) => Ok(s),
///       _ => Err(()),
///     }
///   }
/// }
///
/// // Same for `TokenKind::Int`.
/// // ...
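/// //
/// // Spelled out, the `Int` impls follow the same pattern as the `Str` impls
/// // above (shown here for illustration):
/// impl From<i32> for TokenKind {
///   fn from(i: i32) -> Self {
///     Self::Int(i)
///   }
/// }
///
/// impl TryFrom<TokenKind> for i32 {
///   type Error = ();
///   fn try_from(kind: TokenKind) -> Result<Self, Self::Error> {
///     match kind {
///       TokenKind::Int(i) => Ok(i),
///       _ => Err(()),
///     }
///   }
/// }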
///
/// impl std::fmt::Display for TokenKind {
///   fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
///     match self {
///       Self::Str(s) => write!(f, "string literal `{s}`"),
///       Self::Int(i) => write!(f, "integer literal `{i}`"),
///       Self::Eof => write!(f, "end-of-file"),
///     }
///   }
/// }
/// ```
#[proc_macro_attribute]
pub fn token_kind(attr: TokenStream, item: TokenStream) -> TokenStream {
  result_to_tokens!(token_kind::token_kind(attr, item))
}

/// Generates ASTs for tokens, and also generates a macro
/// for referencing AST types.
///
/// The generated ASTs can be parsed from token streams that produce
/// `laps::token::Token` with the given type as its kind.
///
/// # Examples
///
/// ```
/// # use laps_macros::token_ast;
/// # mod laps {
/// #   pub mod span {
/// #     pub type Result<T> = std::result::Result<T, ()>;
/// #     pub struct Span;
/// #     pub trait Spanned {
/// #       fn span(&self) -> Span;
/// #     }
/// #   }
/// #   pub mod token {
/// #     #[derive(Clone, Debug, PartialEq, Eq, Hash)]
/// #     pub struct Token<Kind> {
/// #       pub kind: Kind,
/// #       pub span: (),
/// #     }
/// #     impl<Kind> super::span::Spanned for Token<Kind> {
/// #       fn span(&self) -> super::span::Span { super::span::Span }
/// #     }
/// #     impl<Kind> AsRef<Kind> for Token<Kind> {
/// #       fn as_ref(&self) -> &Kind {
/// #         &self.kind
/// #       }
/// #     }
/// #     pub trait TokenStream {
/// #       type Token;
/// #       fn next_token(&mut self) -> super::span::Result<Self::Token>;
/// #       fn peek(&mut self) -> super::span::Result<Self::Token>;
/// #       fn expect<T>(&mut self, _: T) -> super::span::Result<Self::Token>;
/// #     }
/// #   }
/// #   pub mod parse {
/// #     pub trait Parse<TS>: Sized {
/// #       fn parse(_: &mut TS) -> super::span::Result<Self>;
/// #       fn maybe(_: &mut TS) -> super::span::Result<bool>;
/// #     }
/// #   }
/// #   macro_rules! return_error {
/// #     ($span:expr, $($arg:tt)+) => {
/// #       return Err(())
/// #     };
/// #   }
/// #   pub(crate) use return_error;
/// # }
/// # fn main() {}
/// #[derive(Clone, Debug, PartialEq, Eq, Hash)]
/// enum TokenKind {
///   /// String literal.
///   Str(String),
///   /// Integer literal.
///   Int(i32),
///   /// Other character.
///   Other(char),
///   /// End-of-file.
///   Eof,
/// }
///
/// // Declare ASTs and their names, and define macro `Token` for referencing ASTs.
/// // You can use `Token![..]` to represent the generated ASTs,
/// // such as `Token![str]`, `Token![+]`, ...
/// // All of the generated ASTs are single-field structures; you can access
/// // the inner token by using `ast.0`.
/// token_ast! {
///   // optional, all derives will be applied to the generated AST structures.
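///   // (each bracketed name below defines an AST struct that can later be
///   // referenced as `Token![name]`)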
291 | ///   #[derive(Debug, PartialEq)]
292 | ///   pub(crate) macro Token<TokenKind> {
293 | ///     // pattern, and prompt for error messages
294 | ///     [str] => { kind: TokenKind::Str(_), prompt: "string literal" },
295 | ///     [int] => { kind: TokenKind::Int(_), prompt: "integer literal" },
296 | ///     [0] => { kind: TokenKind::Int(i) if *i == 0, prompt: "zero" },
297 | ///     // use the default prompt of the token kind
298 | ///     [+] => { kind: TokenKind::Other('+') },
299 | ///     [-] => { kind: TokenKind::Other('-') },
300 | ///     [*] => { kind: TokenKind::Other('*') },
301 | ///     [/] => { kind: TokenKind::Other('/') },
302 | ///     [eof] => { kind: TokenKind::Eof },
303 | ///   }
304 | /// }
305 | /// ```
306 | #[proc_macro]
307 | pub fn token_ast(item: TokenStream) -> TokenStream {
308 |   result_to_tokens!(token_ast::token_ast(item))
309 | }
310 | 
-------------------------------------------------------------------------------- /laps_macros/src/parse.rs: --------------------------------------------------------------------------------
1 | use crate::utils::{ident, match_attr, return_error};
2 | use proc_macro::TokenStream;
3 | use proc_macro2::{Ident, TokenStream as TokenStream2};
4 | use quote::{quote, ToTokens, TokenStreamExt};
5 | use std::iter;
6 | use syn::{
7 |   parse::Parser, punctuated::Punctuated, AttrStyle, Attribute, Data, DataEnum, DataStruct,
8 |   DeriveInput, Expr, Field, Fields, GenericParam, Generics, Path, PredicateType, Result, Token,
9 |   Type, TypePath, WhereClause, WherePredicate,
10 | };
11 | 
12 | /// Entry function of `#[derive(Parse)]`.
13 | pub fn derive_parse(item: TokenStream) -> Result<TokenStream> {
14 |   // parse input tokens and check
15 |   let input: DeriveInput = syn::parse(item)?;
16 |   if !matches!(&input.data, Data::Struct(_) | Data::Enum(_)) {
17 |     return_error!("`#[derive(Parse)]` only supports structs and enums");
18 |   }
19 |   // parse attributes
20 |   let token = parse_token(&input.attrs)?;
21 |   let starts_with = parse_starts_with(&input.attrs)?;
22 |   // get generic related stuffs
23 |   let ts_type = ident("__LAPS_MACROS_TS");
24 |   let (_, ty_generics, where_clause) = input.generics.split_for_impl();
25 |   let impl_generics = gen_impl_generics(&input.generics, &ts_type);
26 |   let where_clause = gen_where_clause(&ts_type, token, where_clause)?;
27 |   // get method implementations
28 |   let (parse, maybe) = match &input.data {
29 |     Data::Struct(s) => gen_struct_methods(s, &ts_type, starts_with),
30 |     Data::Enum(e) => gen_enum_methods(e, &ts_type, starts_with),
31 |     _ => unreachable!(),
32 |   }?;
33 |   // generate implementations
34 |   let name = input.ident;
35 |   Ok(TokenStream::from(quote! {
36 |     impl #impl_generics laps::parse::Parse<#ts_type>
37 |       for #name #ty_generics #where_clause {
38 |       #parse
39 |       #maybe
40 |     }
41 |   }))
42 | }
43 | 
44 | /// Parses attribute `#[token(...)]`.
45 | fn parse_token(attrs: &Vec<Attribute>) -> Result<Option<Type>> {
46 |   let mut token = None;
47 |   match_attr! {
48 |     for meta in attrs if "token" && token.is_none() => {
49 |       token = Some(syn::parse2(meta.tokens.clone())?);
50 |     }
51 |   }
52 |   Ok(token)
53 | }
54 | 
55 | /// Parses attribute `#[starts_with(...)]`.
56 | fn parse_starts_with(attrs: &Vec<Attribute>) -> Result<Vec<Expr>> {
57 |   let mut starts_with = Vec::new();
58 |   match_attr! {
59 |     for meta in attrs if "starts_with" && starts_with.is_empty() => {
60 |       let exprs: Punctuated<Expr, Token![,]> = Punctuated::parse_separated_nonempty.parse2(meta.tokens.clone())?;
61 |       starts_with = exprs.into_iter().collect();
62 |     }
63 |   }
64 |   Ok(starts_with)
65 | }
66 | 
67 | /// Generates `impl` generics.
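   | ///
   | /// For example (an illustrative sketch, not in the original source):
   | /// given `struct Foo<'a, T: Bound, const N: usize>`, this produces the
   | /// tokens `<'a, T: Bound, const N: usize, __LAPS_MACROS_TS>`, i.e. the
   | /// original parameters with the token stream type appended last.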
68 | fn gen_impl_generics(generics: &Generics, ts_type: &Ident) -> TokenStream2 {
69 |   let mut tokens = TokenStream2::new();
70 |   <Token![<]>::default().to_tokens(&mut tokens);
71 |   // generate lifetimes
72 |   for param in &generics.params {
73 |     if let GenericParam::Lifetime(_) = param {
74 |       param.to_tokens(&mut tokens);
75 |       <Token![,]>::default().to_tokens(&mut tokens);
76 |     }
77 |   }
78 |   // generate other parameters
79 |   let is_outer = |attr: &&Attribute| matches!(attr.style, AttrStyle::Outer);
80 |   for param in &generics.params {
81 |     match param {
82 |       GenericParam::Lifetime(_) => continue,
83 |       GenericParam::Type(param) => {
84 |         tokens.append_all(param.attrs.iter().filter(is_outer));
85 |         param.ident.to_tokens(&mut tokens);
86 |         if !param.bounds.is_empty() {
87 |           <Token![:]>::default().to_tokens(&mut tokens);
88 |           param.bounds.to_tokens(&mut tokens);
89 |         }
90 |       }
91 |       GenericParam::Const(param) => {
92 |         tokens.append_all(param.attrs.iter().filter(is_outer));
93 |         param.const_token.to_tokens(&mut tokens);
94 |         param.ident.to_tokens(&mut tokens);
95 |         param.colon_token.to_tokens(&mut tokens);
96 |         param.ty.to_tokens(&mut tokens);
97 |       }
98 |     }
99 |     <Token![,]>::default().to_tokens(&mut tokens);
100 |   }
101 |   // generate token stream type name
102 |   ts_type.to_tokens(&mut tokens);
103 |   <Token![>]>::default().to_tokens(&mut tokens);
104 |   tokens
105 | }
106 | 
107 | /// Generates `where` clause.
108 | fn gen_where_clause(
109 |   ts_type: &Ident,
110 |   token: Option<Type>,
111 |   where_clause: Option<&WhereClause>,
112 | ) -> Result<WhereClause> {
113 |   // `TokenStream` trait bound
114 |   let mut ts_trait = Punctuated::new();
115 |   let ts_trait_tokens = match token {
116 |     Some(token) => quote!(laps::token::TokenStream<Token = #token>),
117 |     None => quote!(laps::token::TokenStream),
118 |   };
119 |   ts_trait.push(syn::parse2(ts_trait_tokens).unwrap());
120 |   // generate where predicates for token stream type
121 |   let param_ty = Type::Path(TypePath {
122 |     qself: None,
123 |     path: ts_type.clone().into(),
124 |   });
125 |   let pred = WherePredicate::Type(PredicateType {
126 |     lifetimes: None,
127 |     bounded_ty: param_ty,
128 |     colon_token: Default::default(),
129 |     bounds: ts_trait,
130 |   });
131 |   // create where clause
132 |   let mut predicates = Punctuated::new();
133 |   if let Some(wc) = where_clause {
134 |     predicates.extend(wc.predicates.iter().cloned());
135 |   }
136 |   predicates.push(pred);
137 |   Ok(WhereClause {
138 |     where_token: Default::default(),
139 |     predicates,
140 |   })
141 | }
142 | 
143 | /// Generates trait methods for the given struct data.
144 | fn gen_struct_methods(
145 |   data: &DataStruct,
146 |   ts_type: &Ident,
147 |   starts_with: Vec<Expr>,
148 | ) -> Result<(TokenStream2, TokenStream2)> {
149 |   // generate `parse` method
150 |   let constructor = gen_constructor(&data.fields);
151 |   let parse = quote! {
152 |     fn parse(tokens: &mut #ts_type) -> laps::span::Result<Self> {
153 |       std::result::Result::Ok(Self #constructor)
154 |     }
155 |   };
156 |   // generate `maybe` method
157 |   let result = if !starts_with.is_empty() {
158 |     gen_maybe(starts_with)
159 |   } else if let Some(Field { ty, .. }) = first_field(&data.fields) {
160 |     quote!(<#ty>::maybe(tokens))
161 |   } else {
162 |     quote!(std::result::Result::Ok(true))
163 |   };
164 |   let maybe = quote! {
165 |     fn maybe(tokens: &mut #ts_type) -> laps::span::Result<bool> {
166 |       #result
167 |     }
168 |   };
169 |   Ok((parse, maybe))
170 | }
171 | 
172 | /// Generates trait methods for the given enum data.
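   | ///
   | /// Roughly (an illustrative sketch, assuming `enum E { A(X), B }`), the
   | /// generated `parse` body has the shape
   | /// `if <X>::maybe(tokens)? { Self::A(tokens.parse()?) } else { Self::B }`,
   | /// trying each variant in order and falling back to the last one.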
173 | fn gen_enum_methods(
174 |   data: &DataEnum,
175 |   ts_type: &Ident,
176 |   starts_with: Vec<Expr>,
177 | ) -> Result<(TokenStream2, TokenStream2)> {
178 |   // generate `parse` method
179 |   let mut branches = TokenStream2::new();
180 |   for (i, variant) in data.variants.iter().enumerate() {
181 |     if i != 0 {
182 |       <Token![else]>::default().to_tokens(&mut branches);
183 |     }
184 |     if i != data.variants.len() - 1 {
185 |       <Token![if]>::default().to_tokens(&mut branches);
186 |       branches.append_all(match first_field(&variant.fields) {
187 |         Some(Field { ty, .. }) => quote!(<#ty>::maybe(tokens)?),
188 |         None => quote!(true),
189 |       });
190 |     }
191 |     let ident = &variant.ident;
192 |     let constructor = gen_constructor(&variant.fields);
193 |     branches.append_all(quote!({ Self::#ident #constructor }));
194 |   }
195 |   let parse = quote! {
196 |     fn parse(tokens: &mut #ts_type) -> laps::span::Result<Self> {
197 |       std::result::Result::Ok(#branches)
198 |     }
199 |   };
200 |   // generate `maybe` method
201 |   let result = if !starts_with.is_empty() {
202 |     gen_maybe(starts_with)
203 |   } else if data.variants.is_empty() {
204 |     quote!(std::result::Result::Ok(true))
205 |   } else {
206 |     let mut tokens = TokenStream2::new();
207 |     for (i, variant) in data.variants.iter().enumerate() {
208 |       if i != 0 {
209 |         <Token![||]>::default().to_tokens(&mut tokens);
210 |       }
211 |       tokens.append_all(match first_field(&variant.fields) {
212 |         Some(Field { ty, .. }) => quote!(<#ty>::maybe(tokens)?),
213 |         None => quote!(true),
214 |       });
215 |     }
216 |     quote!(std::result::Result::Ok(#tokens))
217 |   };
218 |   let maybe = quote! {
219 |     fn maybe(tokens: &mut #ts_type) -> laps::span::Result<bool> {
220 |       #result
221 |     }
222 |   };
223 |   Ok((parse, maybe))
224 | }
225 | 
226 | /// Generates the constructor for the given fields.
227 | fn gen_constructor(fields: &Fields) -> TokenStream2 {
228 |   match fields {
229 |     Fields::Named(f) => {
230 |       let fields = f
231 |         .named
232 |         .iter()
233 |         .map(|Field { ident, .. }| quote!(#ident: tokens.parse()?,));
234 |       quote!({#(#fields)*})
235 |     }
236 |     Fields::Unnamed(f) => {
237 |       let fields = iter::repeat(quote!(tokens.parse()?,)).take(f.unnamed.len());
238 |       quote!((#(#fields)*))
239 |     }
240 |     Fields::Unit => quote!(),
241 |   }
242 | }
243 | 
244 | /// Generates the body of the `maybe` method from the given tokens.
245 | fn gen_maybe(starts_with: Vec<Expr>) -> TokenStream2 {
246 |   let maybe_chain: TokenStream2 = starts_with
247 |     .into_iter()
248 |     .flat_map(|expr| quote!(.maybe(#expr)?))
249 |     .collect();
250 |   quote!(tokens.lookahead()#maybe_chain.result())
251 | }
252 | 
253 | /// Returns the first field of the given fields.
254 | fn first_field(fields: &Fields) -> Option<&Field> {
255 |   match fields {
256 |     Fields::Named(f) => f.named.first(),
257 |     Fields::Unnamed(f) => f.unnamed.first(),
258 |     Fields::Unit => None,
259 |   }
260 | }
261 | 
-------------------------------------------------------------------------------- /laps_macros/src/spanned.rs: --------------------------------------------------------------------------------
1 | use crate::utils::{error, return_error};
2 | use proc_macro::TokenStream;
3 | use proc_macro2::{Ident, Literal, TokenStream as TokenStream2};
4 | use quote::quote;
5 | use syn::{
6 |   punctuated::Punctuated, spanned::Spanned, Attribute, Data, DataEnum, DataStruct, DeriveInput,
7 |   Field, Fields, Meta, Result, Token, Variant,
8 | };
9 | 
10 | /// Entry function of `#[derive(Spanned)]`.
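   | ///
   | /// The generated implementation (sketched here, assuming a struct with
   | /// fields `a` and `b`) spans from the first field to the last:
   | ///
   | /// ```ignore
   | /// fn span(&self) -> laps::span::Span {
   | ///   use laps::span::TrySpan;
   | ///   match self {
   | ///     Self { a: _a, b: _b } => _a.span().into_end_updated(_b.span()),
   | ///   }
   | /// }
   | /// ```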
11 | pub fn derive_spanned(item: TokenStream) -> Result { 12 | // parse input tokens 13 | let input: DeriveInput = syn::parse(item)?; 14 | // generate trait implementation 15 | let name = &input.ident; 16 | let (impl_generics, ty_generics, where_clause) = input.generics.split_for_impl(); 17 | let body = match &input.data { 18 | Data::Struct(DataStruct { 19 | fields: Fields::Named(f), 20 | .. 21 | }) if !f.named.is_empty() => gen_struct_body(&f.named)?, 22 | Data::Struct(DataStruct { 23 | fields: Fields::Unnamed(f), 24 | .. 25 | }) if !f.unnamed.is_empty() => gen_struct_body(&f.unnamed)?, 26 | Data::Enum(DataEnum { variants, .. }) if !variants.is_empty() => gen_enum_body(variants)?, 27 | _ => { 28 | return_error!("`#[derive(Spanned)]` only supports non-unit and non-empty structs and enums"); 29 | } 30 | }; 31 | Ok(TokenStream::from(quote! { 32 | impl #impl_generics laps::span::Spanned 33 | for #name #ty_generics #where_clause { 34 | fn span(&self) -> laps::span::Span { 35 | use laps::span::TrySpan; 36 | #body 37 | } 38 | } 39 | })) 40 | } 41 | 42 | /// Generates body of the `span` method for struct fields. 43 | fn gen_struct_body(fields: &Punctuated) -> Result { 44 | let arm = gen_fields_span(quote!(Self), fields)?; 45 | Ok(quote!(match self { #arm })) 46 | } 47 | 48 | /// Generates body of the `span` method for enum variants. 49 | fn gen_enum_body(variants: &Punctuated) -> Result { 50 | let mut arms = TokenStream2::new(); 51 | for variant in variants { 52 | let name = &variant.ident; 53 | let name = quote!(Self::#name); 54 | let arm = match &variant.fields { 55 | Fields::Named(f) if !f.named.is_empty() => gen_fields_span(name, &f.named)?, 56 | Fields::Unnamed(f) if !f.unnamed.is_empty() => gen_fields_span(name, &f.unnamed)?, 57 | _ => return_error!( 58 | variant.span(), 59 | "`#[derive(Spanned)]` only supports non-unit and non-empty variants in enums" 60 | ), 61 | }; 62 | arms.extend(arm); 63 | } 64 | Ok(quote!(match self { #arms })) 65 | } 66 | 67 | /// Generates span of the given fields. 68 | fn gen_fields_span( 69 | name: TokenStream2, 70 | fields: &Punctuated, 71 | ) -> Result { 72 | let (exts, ts_ids) = gen_fields_extract(name, fields)?; 73 | let first = gen_first_span(ts_ids.iter())?; 74 | let last = gen_first_span(ts_ids.iter().rev())?; 75 | Ok(quote!(#exts => #first.into_end_updated(#last),)) 76 | } 77 | 78 | /// Generates the extraction of the given fields. 79 | fn gen_fields_extract( 80 | name: TokenStream2, 81 | fields: &Punctuated, 82 | ) -> Result<(TokenStream2, Vec<(bool, Ident)>)> { 83 | let mut exts = TokenStream2::new(); 84 | let mut ts_ids = Vec::new(); 85 | for (i, field) in fields.iter().enumerate() { 86 | let ts = has_try_span(&field.attrs)?; 87 | let span = field.span(); 88 | let (ext, ts_id) = if let Some(id) = &field.ident { 89 | let new_id = Ident::new(&format!("_{id}"), span); 90 | (quote!(#id: #new_id,), (ts, new_id)) 91 | } else { 92 | let index = Literal::usize_unsuffixed(i); 93 | let id = Ident::new(&format!("_f{i}"), span); 94 | (quote!(#index: #id,), (ts, id)) 95 | }; 96 | exts.extend(ext); 97 | ts_ids.push(ts_id); 98 | } 99 | Ok((quote!(#name { #exts }), ts_ids)) 100 | } 101 | 102 | /// Returns `true` if the given attributes contains `try_span`. 
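   | ///
   | /// A field marked with `#[try_span]` is queried through `TrySpan` and
   | /// skipped when it yields no span; e.g. (illustrative)
   | /// `struct S { #[try_span] maybe: Option<Expr>, rest: Expr }` falls back
   | /// to `rest`'s span when `maybe` has none.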
103 | fn has_try_span(attrs: &[Attribute]) -> Result { 104 | let mut result = false; 105 | for attr in attrs { 106 | match &attr.meta { 107 | Meta::Path(path) if path.is_ident("try_span") => { 108 | if result { 109 | return_error!(attr.span(), "attribute `try_span` is bound more than once"); 110 | } 111 | result = true; 112 | } 113 | _ => {} 114 | } 115 | } 116 | Ok(result) 117 | } 118 | 119 | /// Generates the first span of the given iterator of `try_span` flag 120 | /// and identifier. 121 | fn gen_first_span<'a, I>(mut ts_ids: I) -> Result 122 | where 123 | I: Iterator, 124 | { 125 | let (ts, id) = ts_ids.next().ok_or(error!( 126 | "attribute `try_span` can not be applied to all the fields" 127 | ))?; 128 | Ok(if *ts { 129 | let span = gen_first_span(ts_ids)?; 130 | quote!(match #id.try_span() { 131 | std::option::Option::Some(span) => span, 132 | std::option::Option::None => #span, 133 | }) 134 | } else { 135 | quote!(#id.span()) 136 | }) 137 | } 138 | -------------------------------------------------------------------------------- /laps_macros/src/token_ast.rs: -------------------------------------------------------------------------------- 1 | use crate::utils::{ident, return_error}; 2 | use proc_macro::TokenStream; 3 | use proc_macro2::{Ident, TokenStream as TokenStream2}; 4 | use quote::quote; 5 | use syn::{ 6 | braced, bracketed, 7 | parse::{Parse, ParseStream}, 8 | punctuated::{Pair, Punctuated}, 9 | spanned::Spanned, 10 | Attribute, Expr, GenericArgument, LitStr, Meta, Pat, Path, PathArguments, PathSegment, Result, 11 | Token, Type, Visibility, 12 | }; 13 | 14 | struct TokenAst { 15 | attrs: Vec, 16 | derives: Vec, 17 | vis: Visibility, 18 | current_mod: Path, 19 | name: Ident, 20 | token_kind: Type, 21 | arms: Punctuated, 22 | } 23 | 24 | impl Parse for TokenAst { 25 | fn parse(input: ParseStream) -> Result { 26 | // parse attributes and derives 27 | let (derives, attrs) = input 28 | .call(Attribute::parse_outer)? 
29 | .into_iter() 30 | .partition(|attr| matches!(&attr.meta, Meta::List(l) if l.path.is_ident("derive"))); 31 | // parse visibility and `macro` 32 | let vis = input.parse()?; 33 | input.parse::()?; 34 | // parse current module, name and token kind 35 | let mut current_mod: Path = input.parse()?; 36 | let (name, token_kind) = match current_mod.segments.pop() { 37 | Some(Pair::End(PathSegment { 38 | ident, 39 | arguments: PathArguments::AngleBracketed(mut a), 40 | })) => match a.args.pop() { 41 | Some(Pair::End(GenericArgument::Type(ty))) if a.args.is_empty() => (ident, ty), 42 | _ => return_error!(a.span(), "must have only one type parameter"), 43 | }, 44 | _ => return_error!(current_mod.span(), "invalid path"), 45 | }; 46 | // parse arms 47 | let brace_content; 48 | braced!(brace_content in input); 49 | let arms = Punctuated::parse_terminated(&brace_content)?; 50 | Ok(Self { 51 | attrs, 52 | derives, 53 | vis, 54 | current_mod, 55 | name, 56 | token_kind, 57 | arms, 58 | }) 59 | } 60 | } 61 | 62 | struct TokenAstArm { 63 | token: TokenStream2, 64 | pat: Pat, 65 | guard: Option, 66 | prompt: Option, 67 | } 68 | 69 | impl Parse for TokenAstArm { 70 | fn parse(input: ParseStream) -> Result { 71 | // parse token 72 | let bracket_content; 73 | bracketed!(bracket_content in input); 74 | let token = bracket_content.parse()?; 75 | // parse arm 76 | input.parse::]>()?; 77 | let brace_content; 78 | braced!(brace_content in input); 79 | // parse `kind:` 80 | let kind: Ident = brace_content.parse()?; 81 | if kind != "kind" { 82 | return_error!(kind.span(), "must be `kind`"); 83 | } 84 | brace_content.parse::()?; 85 | // parse pattern 86 | let pat = Pat::parse_multi_with_leading_vert(&brace_content)?; 87 | // parse if guard 88 | let guard = if brace_content.peek(Token![if]) { 89 | brace_content.parse::()?; 90 | Some(brace_content.parse()?) 91 | } else { 92 | None 93 | }; 94 | // parse the optional prompt part 95 | let prompt = if brace_content.peek(Token![,]) && brace_content.peek2(syn::Ident) { 96 | brace_content.parse::()?; 97 | // parse `prompt:` 98 | let prompt_ident: Ident = brace_content.parse()?; 99 | if prompt_ident != "prompt" { 100 | return_error!(prompt_ident.span(), "must be `prompt`"); 101 | } 102 | brace_content.parse::()?; 103 | // parse prompt 104 | let prompt = brace_content.parse()?; 105 | // parse the optional comma 106 | if brace_content.peek(Token![,]) { 107 | brace_content.parse::()?; 108 | } 109 | Some(prompt) 110 | } else { 111 | None 112 | }; 113 | Ok(Self { 114 | token, 115 | pat, 116 | guard, 117 | prompt, 118 | }) 119 | } 120 | } 121 | 122 | /// Entry function of `token_ast`. 123 | pub fn token_ast(item: TokenStream) -> Result { 124 | // parse macro input 125 | let input: TokenAst = syn::parse(item)?; 126 | // generate AST definitions 127 | let (ast_defs, ast_names) = gen_ast_defs(&input)?; 128 | // generate macro definition 129 | let macro_def = gen_macro_def(&input, ast_names); 130 | Ok(TokenStream::from(quote!(#ast_defs #macro_def))) 131 | } 132 | 133 | /// Generates AST definitions. 134 | /// 135 | /// Returns definitions and AST names. 
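   | ///
   | /// Each arm becomes a single-field tuple struct (illustrative shape):
   | /// arm `i` yields `struct TokenI(laps::token::Token<Kind>)` plus `Parse`
   | /// and `Spanned` implementations, placed in a hidden module.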
136 | fn gen_ast_defs(input: &TokenAst) -> Result<(TokenStream2, Vec)> { 137 | // generate AST names 138 | let names = (0..input.arms.len()).map(|i| ident(&format!("Token{i}"))); 139 | // generate AST definitions 140 | let kind = &input.token_kind; 141 | let field_vis = match &input.vis { 142 | Visibility::Inherited => quote!(pub(super)), 143 | Visibility::Restricted(res) => { 144 | let path = res.path.as_ref(); 145 | match path.segments.first() { 146 | Some(p) if p.arguments.is_none() && path.leading_colon.is_none() => { 147 | if p.ident == "self" { 148 | quote!(pub(super)) 149 | } else if p.ident == "crate" { 150 | quote!(pub(in #path)) 151 | } else { 152 | quote!(pub(in super::#path)) 153 | } 154 | } 155 | _ => return_error!(path.span(), "invalid path in visibility"), 156 | } 157 | } 158 | vis => quote!(#vis), 159 | }; 160 | let token = quote!(laps::token::Token<#kind>); 161 | let derive = if input.derives.is_empty() { 162 | quote!(#[derive(PartialEq)]) 163 | } else { 164 | let derives = &input.derives; 165 | quote!(#(#derives)*) 166 | }; 167 | let defs: Vec<_> = names 168 | .clone() 169 | .zip(&input.arms) 170 | .map(|(name, TokenAstArm { pat, guard, prompt, .. })| { 171 | let if_guard = guard.as_ref().map(|e| quote!(if #e)); 172 | let parse_body = match prompt { 173 | Some(prompt) => quote! { 174 | let token = tokens.next_token()?; 175 | match &token.kind { 176 | #[allow(unused_parens)] 177 | #pat #if_guard => std::result::Result::Ok(Self(token)), 178 | _ => laps::return_error!(token.span, std::concat!("expected ", #prompt, ", found {}"), token), 179 | } 180 | }, 181 | None => match if_guard { 182 | Some(e) => return_error!(e.span(), "if-guard must be used with `prompt`"), 183 | None => quote!(tokens.expect(#pat).map(Self)), 184 | }, 185 | }; 186 | Ok(quote! { 187 | #derive 188 | pub struct #name(#field_vis #token); 189 | impl #name { 190 | /// Unwraps the inner token kind and returns its value. 191 | /// 192 | /// # Panics 193 | /// 194 | /// Panics if the inner token kind does not contain a value of 195 | /// the type `T`. 196 | #field_vis fn unwrap(self) -> T 197 | where 198 | T: std::convert::TryFrom<#kind, Error = E>, 199 | E: std::fmt::Debug, 200 | { 201 | self.0.kind.try_into().unwrap() 202 | } 203 | 204 | /// Unwraps the inner token kind and returns its value. 205 | /// 206 | /// # Panics 207 | /// 208 | /// Panics if the inner token kind does not contain a value of 209 | /// the type `T`. 210 | #field_vis fn unwrap_ref<'a, T, E>(&'a self) -> T 211 | where 212 | T: std::convert::TryFrom<&'a #kind, Error = E>, 213 | E: std::fmt::Debug, 214 | { 215 | self.0.as_ref().try_into().unwrap() 216 | } 217 | } 218 | impl laps::parse::Parse for #name 219 | where 220 | TS: laps::token::TokenStream 221 | { 222 | fn parse(tokens: &mut TS) -> laps::span::Result { 223 | #parse_body 224 | } 225 | fn maybe(tokens: &mut TS) -> laps::span::Result { 226 | #[allow(unused_parens)] 227 | std::result::Result::Ok(matches!(&tokens.peek()?.kind, #pat #if_guard)) 228 | } 229 | } 230 | impl laps::span::Spanned for #name { 231 | fn span(&self) -> laps::span::Span { 232 | self.0.span() 233 | } 234 | } 235 | }) 236 | }) 237 | .collect::>()?; 238 | let vis = &input.vis; 239 | let mod_name = ident(&format!("__token_ast_{}", input.name)); 240 | let ast_defs = quote! 
{ 241 | #[doc(hidden)] 242 | #[allow(non_snake_case)] 243 | #vis mod #mod_name { 244 | use super::*; 245 | #(#defs)* 246 | } 247 | }; 248 | // generate full paths for all ASTs 249 | let current_mod = &input.current_mod; 250 | let ast_names = names.map(|ident| quote!(#current_mod #mod_name::#ident)); 251 | Ok((ast_defs, ast_names.collect())) 252 | } 253 | 254 | /// Generates the macro definition. 255 | fn gen_macro_def(input: &TokenAst, ast_names: Vec) -> TokenStream2 { 256 | // generate arms 257 | let arms = ast_names 258 | .into_iter() 259 | .zip(&input.arms) 260 | .map(|(name, TokenAstArm { token, .. })| quote!([#token] => {#name};)); 261 | // generate definition 262 | let attrs = &input.attrs; 263 | let name = &input.name; 264 | let macro_def = quote! { 265 | #(#attrs)* 266 | macro_rules! #name { 267 | #(#arms)* 268 | } 269 | }; 270 | // generate definition with visibility 271 | match &input.vis { 272 | Visibility::Inherited => quote!(#macro_def), 273 | Visibility::Public(_) => quote!(#[macro_export] #macro_def), 274 | vis => quote! { 275 | #macro_def 276 | #vis use #name; 277 | }, 278 | } 279 | } 280 | -------------------------------------------------------------------------------- /laps_macros/src/token_kind.rs: -------------------------------------------------------------------------------- 1 | use crate::utils::{camel_to_lower, parse_doc_comments, return_error}; 2 | use proc_macro::TokenStream; 3 | use proc_macro2::TokenStream as TokenStream2; 4 | use quote::quote; 5 | use syn::{Fields, ItemEnum, Result}; 6 | 7 | /// Entry function of `#[token_kind]`. 8 | pub fn token_kind(attr: TokenStream, item: TokenStream) -> Result { 9 | // parse input 10 | if !attr.is_empty() { 11 | return_error!("only `#[token_kind]` can be used"); 12 | } 13 | let input = syn::parse(item)?; 14 | // generate trait implementations 15 | let froms = gen_from_impls(&input); 16 | let display = gen_display_impl(&input); 17 | Ok(TokenStream::from(quote! { 18 | #[derive(Clone, PartialEq)] 19 | #input 20 | #froms #display 21 | })) 22 | } 23 | 24 | /// Generates `From` and `TryFrom` trait implementations. 25 | fn gen_from_impls(input: &ItemEnum) -> TokenStream2 { 26 | let mut impls = TokenStream2::new(); 27 | let ident = &input.ident; 28 | // for all variants 29 | for variant in &input.variants { 30 | let variant_name = &variant.ident; 31 | // check if is unnamed, and has only one field 32 | match &variant.fields { 33 | Fields::Unnamed(f) if f.unnamed.len() == 1 => { 34 | let ty = &f.unnamed.first().unwrap().ty; 35 | impls.extend(quote! { 36 | impl std::convert::From<#ty> for #ident { 37 | fn from(v: #ty) -> Self { 38 | Self::#variant_name(v) 39 | } 40 | } 41 | impl std::convert::TryFrom<#ident> for #ty { 42 | type Error = (); 43 | fn try_from(v: #ident) -> std::result::Result { 44 | match v { 45 | #ident::#variant_name(v) => std::result::Result::Ok(v), 46 | _ => std::result::Result::Err(()), 47 | } 48 | } 49 | } 50 | impl<'a> std::convert::TryFrom<&'a #ident> for &'a #ty { 51 | type Error = (); 52 | fn try_from(v: &'a #ident) -> std::result::Result { 53 | match v { 54 | #ident::#variant_name(v) => std::result::Result::Ok(v), 55 | _ => std::result::Result::Err(()), 56 | } 57 | } 58 | } 59 | }); 60 | } 61 | _ => {} 62 | } 63 | } 64 | impls 65 | } 66 | 67 | /// Generates `Display` trait implementation. 
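   | ///
   | /// The prompt comes from the variant's doc comment (lowercased, trailing
   | /// period stripped), or from its camel-case name; e.g. (illustrative)
   | /// `/// String literal.` on `Str(String)` displays as the prompt plus
   | /// the inner value in backticks.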
68 | fn gen_display_impl(input: &ItemEnum) -> TokenStream2 { 69 | let ident = &input.ident; 70 | // generate match arms 71 | let mut arms = TokenStream2::new(); 72 | for variant in &input.variants { 73 | let ident = &variant.ident; 74 | let prompt = parse_doc_comments(&variant.attrs).map_or_else( 75 | || camel_to_lower(ident.to_string()), 76 | |mut p| { 77 | p.make_ascii_lowercase(); 78 | if p.ends_with('.') { 79 | p.pop(); 80 | } 81 | p 82 | }, 83 | ); 84 | arms.extend(match &variant.fields { 85 | Fields::Unnamed(f) if f.unnamed.len() == 1 => { 86 | let prompt = prompt + " `{}`"; 87 | quote!(Self::#ident(v) => std::write!(f, #prompt, v),) 88 | } 89 | Fields::Named(_) => quote!(Self::#ident { .. } => std::write!(f, #prompt),), 90 | Fields::Unnamed(_) => quote!(Self::#ident(..) => std::write!(f, #prompt),), 91 | Fields::Unit => quote!(Self::#ident => std::write!(f, #prompt),), 92 | }); 93 | } 94 | quote! { 95 | impl std::fmt::Display for #ident { 96 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 97 | match self { 98 | #arms 99 | } 100 | } 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /laps_macros/src/utils.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::{Ident, Span}; 2 | use syn::parse::{Parse, ParseStream}; 3 | use syn::{parenthesized, Attribute, Expr, ExprLit, Lit, Meta, MetaNameValue, Result}; 4 | 5 | /// Generates a compile error. 6 | macro_rules! error { 7 | ($msg:expr) => { 8 | syn::Error::new(proc_macro2::Span::call_site(), $msg) 9 | }; 10 | ($span:expr, $msg:expr) => { 11 | syn::Error::new($span, $msg) 12 | }; 13 | } 14 | pub(crate) use error; 15 | 16 | /// Generates a compile error and returns. 17 | macro_rules! return_error { 18 | ($msg:expr) => { 19 | return Err(crate::utils::error!($msg)) 20 | }; 21 | ($span:expr, $msg:expr) => { 22 | return Err(crate::utils::error!($span, $msg)) 23 | }; 24 | } 25 | pub(crate) use return_error; 26 | 27 | /// Converts `Result` to `TokenStream`. 28 | macro_rules! result_to_tokens { 29 | ($result:expr) => { 30 | match $result { 31 | Ok(data) => data, 32 | Err(err) => err.to_compile_error().into(), 33 | } 34 | }; 35 | } 36 | pub(crate) use result_to_tokens; 37 | 38 | /// Helper macro for handling attributes like `#[name(...)]`. 39 | macro_rules! match_attr { 40 | (for $id:ident in $attrs:ident if $name:literal && $cond:expr => $body:block) => { 41 | for $id in $attrs { 42 | match &$id.meta { 43 | syn::Meta::List($id) if $id.path.is_ident($name) => { 44 | if $cond $body else { 45 | use syn::spanned::Spanned; 46 | crate::utils::return_error!( 47 | $id.span(), 48 | concat!("attribute `", $name, "` is bound more than once") 49 | ); 50 | } 51 | } 52 | _ => {} 53 | } 54 | } 55 | }; 56 | } 57 | pub(crate) use match_attr; 58 | 59 | /// Data of `(...)`. 60 | pub struct Parenthesized(pub T); 61 | 62 | impl Parse for Parenthesized { 63 | fn parse(input: ParseStream) -> Result { 64 | let content; 65 | parenthesized!(content in input); 66 | Ok(Self(content.parse()?)) 67 | } 68 | } 69 | 70 | /// Creates a new identifier by the given string. 71 | pub fn ident(s: &str) -> Ident { 72 | Ident::new(s, Span::call_site()) 73 | } 74 | 75 | /// Parses doc comments. 76 | pub fn parse_doc_comments(attrs: &[Attribute]) -> Option { 77 | attrs 78 | .iter() 79 | .filter_map(|attr| match &attr.meta { 80 | Meta::NameValue(MetaNameValue { 81 | path, 82 | value: Expr::Lit(ExprLit { 83 | lit: Lit::Str(s), .. 84 | }), 85 | .. 
86 | }) if path.is_ident("doc") => Some(s.value().trim().to_string()), 87 | _ => None, 88 | }) 89 | .reduce(|mut s, cur| { 90 | s.reserve(cur.len() + 1); 91 | s.push(' '); 92 | s.push_str(&cur); 93 | s 94 | }) 95 | .and_then(|s| { 96 | let s = s.trim().to_string(); 97 | (!s.is_empty()).then_some(s) 98 | }) 99 | } 100 | 101 | /// Converts the given camel case string to lower case space-delimited string. 102 | pub fn camel_to_lower(s: String) -> String { 103 | let mut ans = String::new(); 104 | for c in s.chars() { 105 | if c.is_ascii_uppercase() { 106 | if !ans.is_empty() { 107 | ans.push(' '); 108 | } 109 | ans.push(c.to_ascii_lowercase()); 110 | } else { 111 | ans.push(c); 112 | } 113 | } 114 | ans 115 | } 116 | -------------------------------------------------------------------------------- /laps_regex/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "laps_regex" 3 | version = "0.1.1" 4 | authors = ["MaxXing "] 5 | edition = "2021" 6 | description = "Tools for generating NFAs, DFAs and state-transition tables from regular expressions." 7 | repository = "https://github.com/MaxXSoft/laps" 8 | documentation = "https://docs.rs/laps_regex" 9 | categories = ["parsing", "text-processing"] 10 | keywords = ["laps", "regex", "lexer", "parser", "automaton"] 11 | readme = "README.md" 12 | license = "MIT OR Apache-2.0" 13 | 14 | [dependencies] 15 | rayon = "1.8.0" 16 | regex-syntax = "0.7.2" 17 | -------------------------------------------------------------------------------- /laps_regex/README.md: -------------------------------------------------------------------------------- 1 | # laps_regex 2 | 3 | Tools for generating NFAs, DFAs and state-transition tables from regular expressions. 4 | 5 | This library is built for crate [`laps`](https://crates.io/crates/laps). 6 | 7 | ## Example: Matching UTF-8 Strings 8 | 9 | ```rust 10 | use laps_regex::re::{RegexBuilder, CharsMatcher}; 11 | 12 | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] 13 | enum Token { 14 | Keyword, 15 | Identifier, 16 | Number, 17 | } 18 | 19 | let matcher: CharsMatcher<_> = RegexBuilder::new() 20 | .add("if|else|while", Token::Keyword) 21 | .add("[_a-zA-Z][_a-zA-Z0-9]*", Token::Identifier) 22 | .add("[0-9]|[1-9][0-9]+", Token::Number) 23 | .build() 24 | .unwrap(); 25 | 26 | assert_eq!(matcher.is_str_match("if"), Some(&Token::Keyword)); 27 | assert_eq!(matcher.is_str_match("while1"), Some(&Token::Identifier)); 28 | assert_eq!(matcher.is_str_match("42"), Some(&Token::Number)); 29 | assert_eq!(matcher.is_str_match("?"), None); 30 | ``` 31 | 32 | ## Example: Matching Bytes 33 | 34 | ```rust 35 | use laps_regex::re::{RegexBuilder, BytesMatcher}; 36 | 37 | let matcher: BytesMatcher<_> = RegexBuilder::new() 38 | .add("hello|hi", 0) 39 | .add("goodbye|bye", 1) 40 | .build_bytes() 41 | .unwrap(); 42 | 43 | assert_eq!(matcher.is_match(b"hello"), Some(&0)); 44 | assert_eq!(matcher.is_match(&[0x62, 0x79, 0x65]), Some(&1)); 45 | ``` 46 | 47 | ## License 48 | 49 | Copyright (C) 2022-2023 MaxXing. Licensed under either of Apache 2.0 or MIT at your option. 50 | -------------------------------------------------------------------------------- /laps_regex/src/dfa.rs: -------------------------------------------------------------------------------- 1 | //! Deterministic finite automaton ([`DFA`]) related implementations. 2 | //! 3 | //! A DFA can be built from a nondeterministic finite automaton ([`NFA`]). 
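   | //!
   | //! A typical conversion (illustrative sketch, using only items from this
   | //! module):
   | //!
   | //! ```ignore
   | //! let dfa = DFA::new(nfa, None); // powerset construction + minimization
   | //! let (fa, tags) = dfa.into_fa_tags();
   | //! ```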
4 | 
5 | use crate::fa::{CachedClosures, Closure, ClosureBuilder, DenseFA, State};
6 | use crate::nfa::NFA;
7 | use std::collections::{BTreeSet, HashMap, HashSet, VecDeque};
8 | use std::hash::Hash;
9 | use std::{fmt, io};
10 | 
11 | /// Helper macro for finding the first matching tag of the given states.
12 | macro_rules! first_tag {
13 |   ($nfa_tags:expr, $states:expr) => {
14 |     $nfa_tags
15 |       .iter()
16 |       .find_map(|(tag, id)| $states.contains(id).then(|| tag.clone()))
17 |   };
18 | }
19 | 
20 | /// A deterministic finite automaton (DFA)
21 | /// with symbol type `S` and tag type `T`.
22 | #[derive(Debug)]
23 | pub struct DFA<S, T> {
24 |   fa: DenseFA<Vec<(S, S)>>,
25 |   tags: HashMap<usize, T>,
26 | }
27 | 
28 | impl<S, T> DFA<S, T> {
29 |   /// Creates a new DFA from the given [`NFA`].
30 |   ///
31 |   /// Set `enable_par` to [`Some(true)`] to construct the DFA in parallel,
32 |   /// [`Some(false)`] to disable parallelization, and [`None`] to choose
33 |   /// automatically.
34 |   pub fn new(nfa: NFA<S, T>, enable_par: Option<bool>) -> Self
35 |   where
36 |     S: Clone + Hash + Eq + Ord + Sync,
37 |     T: Clone + Hash + Eq + Ord,
38 |   {
39 |     let (dfa, syms) = Self::new_from_nfa(nfa, enable_par);
40 |     let partition = Self::minimize(&dfa, &syms);
41 |     Self::rebuild(dfa, syms, partition)
42 |   }
43 | 
44 |   /// Creates a new DFA from the given [`NFA`]. Returns the created DFA
45 |   /// and its symbol set.
46 |   ///
47 |   /// The created DFA is not minimal.
48 |   fn new_from_nfa(nfa: NFA<S, T>, enable_par: Option<bool>) -> (Self, Vec<Vec<(S, S)>>)
49 |   where
50 |     S: Clone + Hash + Eq + Sync,
51 |     T: Clone + Ord,
52 |   {
53 |     let (nfa, nfa_tags) = nfa.into_fa_tags();
54 |     // helpers for maintaining tag mappings between NFA and DFA
55 |     let mut nfa_tags: Vec<_> = nfa_tags.into_iter().map(|(id, tag)| (tag, id)).collect();
56 |     nfa_tags.sort_unstable();
57 |     // create DFA, update the initial state
58 |     let mut init_cached = CachedClosures::new();
59 |     let init_id = nfa.init_id();
60 |     let cb = ClosureBuilder::from(nfa);
61 |     let init = cb.epsilon_closure(&mut init_cached, [init_id]);
62 |     let mut fa = DenseFA::new();
63 |     let mut tags = HashMap::new();
64 |     if let Some(tag) = first_tag!(nfa_tags, init) {
65 |       fa.set_final_state(fa.init_id());
66 |       tags.insert(fa.init_id(), tag);
67 |     }
68 |     // create other states
69 |     let syms: Vec<_> = cb.symbol_set().into_iter().collect();
70 |     let constructor = Constructor {
71 |       nfa_tags,
72 |       cb,
73 |       tags,
74 |       states: vec![init.clone()],
75 |       ids: HashMap::from([(init, fa.init_id())]),
76 |       fa,
77 |       enable_par,
78 |     };
79 |     (constructor.construct(init_cached, &syms).into_dfa(), syms)
80 |   }
81 | 
82 |   /// Creates a minimal DFA from the given DFA and symbol set.
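   | ///
   | /// This is a partition-refinement minimization: states start grouped by
   | /// tag (plus one group of non-final states), and each group is then
   | /// repeatedly split by where its members go on every symbol, until the
   | /// partition stops changing.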
83 | fn minimize(dfa: &Self, syms: &[Vec<(S, S)>]) -> VecDeque> 84 | where 85 | S: Ord + Hash, 86 | T: Hash + Eq, 87 | { 88 | let Self { fa, tags } = dfa; 89 | // get the initial partition 90 | let mut partition = tags 91 | .iter() 92 | .fold( 93 | HashMap::new(), 94 | |mut m: HashMap<_, HashSet<_>>, (id, tag)| { 95 | m.entry(tag).or_default().insert(*id); 96 | m 97 | }, 98 | ) 99 | .into_values() 100 | .collect::>(); 101 | let others: HashSet<_> = fa 102 | .states() 103 | .keys() 104 | .filter_map(|id| (!fa.finals().contains(id)).then_some(*id)) 105 | .collect(); 106 | if !others.is_empty() { 107 | partition.push_back(others); 108 | } 109 | // get new partition until there are no changes 110 | let mut num_states = partition.len(); 111 | loop { 112 | // create mapping from state IDs to partition index 113 | let index_map: HashMap<_, _> = partition 114 | .iter() 115 | .enumerate() 116 | .flat_map(|(i, ids)| ids.iter().map(move |id| (*id, i))) 117 | .collect(); 118 | for _ in 0..num_states { 119 | let states = partition.pop_front().unwrap(); 120 | // check if can be divided 121 | if states.len() <= 1 { 122 | partition.push_back(states); 123 | continue; 124 | } 125 | // get a new division 126 | let mut division: HashMap<_, HashSet> = HashMap::new(); 127 | for id in states { 128 | // get division ID set 129 | let div_id: BTreeSet<_> = syms 130 | .iter() 131 | .filter_map(|s| { 132 | // get the next state after accepting symbol `s` 133 | let next = fa.state(id).unwrap().next_state(s); 134 | // get partition index of the next state 135 | let index = next.and_then(|next| index_map.get(&next).copied()); 136 | index.map(|i| (s, i)) 137 | }) 138 | .collect(); 139 | // update division 140 | division.entry(div_id).or_default().insert(id); 141 | } 142 | // add to the partition 143 | partition.extend(division.into_values()); 144 | } 145 | // check and update the number of states 146 | if partition.len() == num_states { 147 | break; 148 | } 149 | num_states = partition.len(); 150 | } 151 | partition 152 | } 153 | 154 | /// Rebuilds a DFA by the given partition. 
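   | ///
   | /// Each equivalence class of the partition collapses into a single state
   | /// of the new automaton; edges are copied from class members, and a class
   | /// containing a tagged final state keeps that tag.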
155 | fn rebuild(dfa: Self, syms: Vec>, partition: VecDeque>) -> Self 156 | where 157 | S: Clone + Eq + Hash, 158 | T: Clone, 159 | { 160 | let Self { 161 | fa: dfa, 162 | tags: dfa_tags, 163 | } = dfa; 164 | let mut fa = DenseFA::new(); 165 | // rebuild mapping of states 166 | let mut tags = HashMap::new(); 167 | let partition: Vec<_> = partition 168 | .into_iter() 169 | .map(|ids| { 170 | // add new state 171 | let id = if ids.contains(&dfa.init_id()) { 172 | fa.init_id() 173 | } else { 174 | fa.add_state() 175 | }; 176 | // check if is a final state 177 | if let Some(tag) = ids.iter().find_map(|id| dfa_tags.get(id)) { 178 | fa.set_final_state(id); 179 | tags.insert(id, tag.clone()); 180 | } 181 | (ids, id) 182 | }) 183 | .collect(); 184 | let states: HashMap<_, _> = partition 185 | .iter() 186 | .flat_map(|(ids, cur_id)| ids.iter().map(|id| (*id, *cur_id))) 187 | .collect(); 188 | // rebuild edges 189 | for (ids, cur_id) in &partition { 190 | let state = fa.state_mut(*cur_id).unwrap(); 191 | let mut added_edges = HashSet::new(); 192 | for id in ids { 193 | for s in &syms { 194 | if added_edges.contains(s) { 195 | continue; 196 | } 197 | // get the next state after accepting symbol `s` 198 | let next = dfa.state(*id).unwrap().next_state(s); 199 | if let Some(next) = next { 200 | // add a new edge 201 | state.add(s.clone(), states[&next]); 202 | added_edges.insert(s.clone()); 203 | } 204 | } 205 | } 206 | } 207 | Self { fa, tags } 208 | } 209 | 210 | /// Converts the current NFA into a 211 | /// [`FiniteAutomaton`](crate::fa::FiniteAutomaton) and a tag set. 212 | pub fn into_fa_tags(self) -> FATags { 213 | (self.fa, self.tags) 214 | } 215 | 216 | /// Dumps the current finite automaton to the given writer as Graphviz. 217 | pub fn dump(&self, writer: &mut W) -> io::Result<()> 218 | where 219 | S: fmt::Debug, 220 | W: io::Write, 221 | { 222 | self.fa.dump(writer) 223 | } 224 | } 225 | 226 | impl From> for DFA 227 | where 228 | S: Clone + Hash + Eq + Ord + Sync, 229 | T: Clone + Hash + Eq + Ord, 230 | { 231 | fn from(nfa: NFA) -> Self { 232 | Self::new(nfa, None) 233 | } 234 | } 235 | 236 | /// A pair of [`DFA`]'s internal finite automaton and the tag map. 237 | /// 238 | /// Used by method [`into_fa_tags`](DFA#method.into_fa_tags) of [`DFA`]. 239 | pub type FATags = (DenseFA>, HashMap); 240 | 241 | /// A [`NFA`] to [`DFA`] constructor. 242 | struct Constructor { 243 | nfa_tags: Vec<(T, usize)>, 244 | cb: ClosureBuilder>, 245 | fa: DenseFA>, 246 | tags: HashMap, 247 | states: Vec, 248 | ids: HashMap, 249 | enable_par: Option, 250 | } 251 | 252 | impl Constructor 253 | where 254 | S: Clone + Hash + Eq + Sync, 255 | T: Clone, 256 | { 257 | /// Consumes the current constructor, constructs a [`DFA`] using 258 | /// the powerset construction algorithm. 259 | fn construct(self, cached: CachedClosures, syms: &[Vec<(S, S)>]) -> Self { 260 | let enable_par = self.enable_par.unwrap_or_else(|| { 261 | let parallelism = std::thread::available_parallelism() 262 | .map(Into::into) 263 | .unwrap_or(1); 264 | parallelism > 1 && syms.len() > parallelism * 8 265 | }); 266 | if enable_par { 267 | self.construct_par(cached, syms) 268 | } else { 269 | self.construct_normal(cached, syms) 270 | } 271 | } 272 | 273 | /// Consumes the current constructor, constructs a [`DFA`] using 274 | /// the powerset construction algorithm. 275 | /// 276 | /// This method runs serially. 
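   | ///
   | /// Schematically (illustrative): pop an unprocessed state-set from the
   | /// worklist, compute its successor closure for every symbol, and register
   | /// any newly seen closure as a fresh DFA state, until the worklist is
   | /// empty.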
277 |   fn construct_normal(mut self, mut cached: CachedClosures, syms: &[Vec<(S, S)>]) -> Self {
278 |     while let Some(cur) = self.states.pop() {
279 |       let cur_id = self.ids[&cur];
280 |       for s in syms {
281 |         // get the next states
282 |         let next = self.cb.state_closure(&mut cached, &cur, s);
283 |         if next.is_empty() {
284 |           continue;
285 |         }
286 |         self.add_to_fa(cur_id, s.clone(), next);
287 |       }
288 |     }
289 |     self
290 |   }
291 | 
292 |   /// Consumes the current constructor, constructs a [`DFA`] using
293 |   /// the powerset construction algorithm.
294 |   ///
295 |   /// This method runs in parallel.
296 |   fn construct_par(mut self, cached: CachedClosures, syms: &[Vec<(S, S)>]) -> Self {
297 |     use rayon::prelude::*;
298 |     let mut nexts = Vec::new();
299 |     let mut cached_epsilons = vec![cached; syms.len()];
300 |     while let Some(cur) = self.states.pop() {
301 |       let cur_id = self.ids[&cur];
302 |       // get next states in parallel
303 |       syms
304 |         .par_iter()
305 |         .zip(&mut cached_epsilons)
306 |         .map(|(s, c)| self.cb.state_closure(c, &cur, s))
307 |         .collect_into_vec(&mut nexts);
308 |       // add to the finite automaton
309 |       for (s, next) in syms.iter().zip(nexts.drain(..)) {
310 |         if next.is_empty() {
311 |           continue;
312 |         }
313 |         self.add_to_fa(cur_id, s.clone(), next);
314 |       }
315 |     }
316 |     self
317 |   }
318 | 
319 |   fn add_to_fa(&mut self, cur_id: usize, s: Vec<(S, S)>, next: Closure) {
320 |     // get the ID of the next state
321 |     let id = if let Some(id) = self.ids.get(&next) {
322 |       *id
323 |     } else {
324 |       // add a new state
325 |       let id = if let Some(tag) = first_tag!(self.nfa_tags, next) {
326 |         let id = self.fa.add_final_state();
327 |         self.tags.insert(id, tag);
328 |         id
329 |       } else {
330 |         self.fa.add_state()
331 |       };
332 |       // update states and ID map
333 |       self.states.push(next.clone());
334 |       self.ids.insert(next, id);
335 |       id
336 |     };
337 |     // add an edge to the next state
338 |     self.fa.state_mut(cur_id).unwrap().add(s, id);
339 |   }
340 | 
341 |   /// Converts the current constructor into a [`DFA`].
342 |   fn into_dfa(self) -> DFA<S, T> {
343 |     DFA {
344 |       fa: self.fa,
345 |       tags: self.tags,
346 |     }
347 |   }
348 | }
349 | 
-------------------------------------------------------------------------------- /laps_regex/src/fa.rs: --------------------------------------------------------------------------------
1 | //! Finite automaton representations.
2 | //!
3 | //! This module contains [`FiniteAutomaton`], which is a simple finite
4 | //! automaton implementation, and [`State`], which represents a state in
5 | //! the automaton.
6 | 
7 | use std::collections::{BTreeSet, HashMap, HashSet};
8 | use std::hash::Hash;
9 | use std::marker::PhantomData;
10 | use std::sync::{Mutex, MutexGuard, OnceLock};
11 | use std::{fmt, io};
12 | 
13 | /// The next state ID.
14 | static NEXT_STATE_ID: OnceLock<Mutex<usize>> = OnceLock::new();
15 | 
16 | /// Acquires and returns the next state ID.
17 | fn next_state_id() -> MutexGuard<'static, usize> {
18 |   NEXT_STATE_ID
19 |     .get_or_init(|| Mutex::new(0))
20 |     .lock()
21 |     .expect("failed to acquire the next state ID")
22 | }
23 | 
24 | /// Returns a new state ID and updates the ID counter.
25 | fn get_and_update_state_id() -> usize {
26 |   let mut id = next_state_id();
27 |   let cur = *id;
28 |   *id += 1;
29 |   cur
30 | }
31 | 
32 | /// Trait for states of a finite automaton.
33 | pub trait State<S> {
34 |   /// Creates a new empty state.
35 |   fn new() -> Self;
36 | 
37 |   /// Adds a new edge to the current state.
38 | fn add(&mut self, sym: S, state: usize); 39 | 40 | /// Dumps the current state to the given writer as Graphviz. 41 | fn dump(&self, writer: &mut W, id: usize) -> io::Result<()> 42 | where 43 | S: fmt::Debug, 44 | W: io::Write; 45 | } 46 | 47 | /// A state of the finite automaton with symbol type `S`. 48 | /// 49 | /// This state uses [`Vec`] to store edges internally. 50 | #[derive(Debug)] 51 | pub struct DenseState { 52 | outs: Vec<(S, usize)>, 53 | } 54 | 55 | impl DenseState { 56 | /// Returns the output edges. 57 | pub fn outs(&self) -> &[(S, usize)] { 58 | &self.outs 59 | } 60 | 61 | /// Returns ID of the next state after accepting the given symbol `sym`. 62 | /// 63 | /// This method will return only the first matching state. 64 | /// Returns [`None`] if no matching state. 65 | pub fn next_state(&self, sym: &S) -> Option 66 | where 67 | S: PartialEq, 68 | { 69 | self 70 | .outs 71 | .iter() 72 | .find_map(|(s, id)| (s == sym).then_some(*id)) 73 | } 74 | } 75 | 76 | impl State for DenseState { 77 | fn new() -> Self { 78 | Self { outs: Vec::new() } 79 | } 80 | 81 | fn add(&mut self, sym: S, state: usize) { 82 | self.outs.push((sym, state)); 83 | } 84 | 85 | fn dump(&self, writer: &mut W, id: usize) -> io::Result<()> 86 | where 87 | S: fmt::Debug, 88 | W: io::Write, 89 | { 90 | for (s, to) in &self.outs { 91 | writeln!(writer, " {id} -> {to} [label = \"{s:?}\"]")?; 92 | } 93 | Ok(()) 94 | } 95 | } 96 | 97 | /// A state of the finite automaton with symbol type `S`. 98 | /// 99 | /// This state uses [`HashMap>`] to store edges 100 | /// and all their output states. 101 | #[derive(Debug)] 102 | pub struct MultiState { 103 | outs: HashMap>, 104 | } 105 | 106 | impl MultiState { 107 | /// Returns the map of output edges. 108 | pub fn outs(&self) -> &HashMap> { 109 | &self.outs 110 | } 111 | } 112 | 113 | impl State for MultiState 114 | where 115 | S: Eq + Hash, 116 | { 117 | fn new() -> Self { 118 | Self { 119 | outs: HashMap::new(), 120 | } 121 | } 122 | 123 | fn add(&mut self, sym: S, state: usize) { 124 | self.outs.entry(sym).or_default().insert(state); 125 | } 126 | 127 | fn dump(&self, writer: &mut W, id: usize) -> io::Result<()> 128 | where 129 | S: fmt::Debug, 130 | W: io::Write, 131 | { 132 | for (s, to_ids) in &self.outs { 133 | for to in to_ids { 134 | writeln!(writer, " {id} -> {to} [label = \"{s:?}\"]")?; 135 | } 136 | } 137 | Ok(()) 138 | } 139 | } 140 | 141 | /// A finite automaton with symbol type `S`. 142 | #[derive(Debug)] 143 | pub struct FiniteAutomaton> { 144 | states: HashMap, 145 | init: usize, 146 | finals: HashSet, 147 | sym: PhantomData, 148 | } 149 | 150 | impl> FiniteAutomaton { 151 | /// Creates an empty finite automaton. 152 | pub fn new() -> Self { 153 | let init = get_and_update_state_id(); 154 | Self { 155 | states: [(init, State::new())].into(), 156 | init, 157 | finals: HashSet::new(), 158 | sym: PhantomData, 159 | } 160 | } 161 | 162 | /// Creates a new state in the current finite automaton. 163 | /// 164 | /// Returns the state ID. 165 | pub fn add_state(&mut self) -> usize { 166 | let id = get_and_update_state_id(); 167 | self.states.insert(id, State::new()); 168 | id 169 | } 170 | 171 | /// Creates a new final state in the current finite automaton. 172 | /// 173 | /// Returns the state ID. 174 | pub fn add_final_state(&mut self) -> usize { 175 | let id = self.add_state(); 176 | self.finals.insert(id); 177 | id 178 | } 179 | 180 | /// Sets the given state as a final state. 181 | /// 182 | /// Returns [`false`](bool) if the given state does not exist. 
183 |   pub fn set_final_state(&mut self, id: usize) -> bool {
184 |     if self.states.contains_key(&id) {
185 |       self.finals.insert(id);
186 |       true
187 |     } else {
188 |       false
189 |     }
190 |   }
191 | 
192 |   /// Sets the given state as a normal state.
193 |   ///
194 |   /// Returns [`false`](bool) if the given state does not exist.
195 |   pub fn set_normal_state(&mut self, id: usize) -> bool {
196 |     if self.states.contains_key(&id) {
197 |       self.finals.remove(&id);
198 |       true
199 |     } else {
200 |       false
201 |     }
202 |   }
203 | 
204 |   /// Unions the current finite automaton with the given finite automaton.
205 |   ///
206 |   /// The initial state of the given finite automaton will be added to
207 |   /// the current finite automaton as a normal state. All final states of
208 |   /// the given finite automaton will be kept.
209 |   pub fn union(&mut self, fa: Self) {
210 |     self.states.extend(fa.states);
211 |     self.finals.extend(fa.finals);
212 |   }
213 | 
214 |   /// Returns a reference to the state map.
215 |   pub fn states(&self) -> &HashMap<usize, State> {
216 |     &self.states
217 |   }
218 | 
219 |   /// Returns a reference to the given state.
220 |   ///
221 |   /// Returns [`None`] if the given state does not exist.
222 |   pub fn state(&self, id: usize) -> Option<&State> {
223 |     self.states.get(&id)
224 |   }
225 | 
226 |   /// Returns a mutable reference to the given state.
227 |   ///
228 |   /// Returns [`None`] if the given state does not exist.
229 |   pub fn state_mut(&mut self, id: usize) -> Option<&mut State> {
230 |     self.states.get_mut(&id)
231 |   }
232 | 
233 |   /// Returns a reference to the initial state.
234 |   pub fn init(&self) -> &State {
235 |     self.states.get(&self.init).unwrap()
236 |   }
237 | 
238 |   /// Returns a mutable reference to the initial state.
239 |   pub fn init_mut(&mut self) -> &mut State {
240 |     self.states.get_mut(&self.init).unwrap()
241 |   }
242 | 
243 |   /// Returns the ID of the initial state.
244 |   pub fn init_id(&self) -> usize {
245 |     self.init
246 |   }
247 | 
248 |   /// Returns a reference to the ID set of the final states.
249 |   pub fn finals(&self) -> &HashSet<usize> {
250 |     &self.finals
251 |   }
252 | 
253 |   /// Returns the ID of the final state.
254 |   ///
255 |   /// Returns [`None`] if there is no final state or more than one final state.
256 |   pub fn final_id(&self) -> Option<usize> {
257 |     if self.finals().len() > 1 {
258 |       None
259 |     } else {
260 |       self.finals().iter().next().copied()
261 |     }
262 |   }
263 | 
264 |   /// Dumps the current finite automaton to the given writer as Graphviz.
265 |   pub fn dump<W>(&self, writer: &mut W) -> io::Result<()>
266 |   where
267 |     Sym: fmt::Debug,
268 |     W: io::Write,
269 |   {
270 |     writeln!(writer, "digraph finite_automaton {{")?;
271 |     writeln!(writer, "  rankdir = LR")?;
272 |     writeln!(writer, "  node [shape = doublecircle];")?;
273 |     write!(writer, " ")?;
274 |     for id in &self.finals {
275 |       write!(writer, " {id}")?;
276 |     }
277 |     writeln!(writer, ";")?;
278 |     writeln!(writer, "  node [shape = circle];")?;
279 |     for (id, state) in &self.states {
280 |       state.dump(writer, *id)?;
281 |     }
282 |     writeln!(writer, "}}")?;
283 |     Ok(())
284 |   }
285 | }
286 | 
287 | impl<Sym, State: self::State<Sym>> Default for FiniteAutomaton<Sym, State> {
288 |   fn default() -> Self {
289 |     Self::new()
290 |   }
291 | }
292 | 
293 | /// Finite automaton whose state type is [`DenseState`].
294 | pub type DenseFA<S> = FiniteAutomaton<S, DenseState<S>>;
295 | 
296 | /// Finite automaton whose state type is [`MultiState`].
297 | pub type MultiFA<S> = FiniteAutomaton<S, MultiState<S>>;
298 | 
299 | /// Builder for calculating closures from a finite automaton.
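   | ///
   | /// For example (illustrative): with empty edges `0 -> 1` and `1 -> 2`,
   | /// the epsilon closure of `{0}` is `{0, 1, 2}`; `state_closure` first
   | /// collects all successors on a symbol, then takes their epsilon closure.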
300 | pub struct ClosureBuilder { 301 | empty_edges: HashMap>, 302 | normal_edges: HashMap>, 303 | } 304 | 305 | impl From>> for ClosureBuilder 306 | where 307 | S: Eq + Hash, 308 | { 309 | fn from(fa: MultiFA>) -> Self { 310 | let mut empty_edges = HashMap::new(); 311 | let mut normal_edges: HashMap<_, MultiState> = HashMap::new(); 312 | for (id, s) in fa.states { 313 | for (s, to) in s.outs { 314 | match s { 315 | Some(s) => normal_edges 316 | .entry(id) 317 | .or_insert_with(|| State::new()) 318 | .outs 319 | .insert(s, to), 320 | None => empty_edges.insert(id, to), 321 | }; 322 | } 323 | } 324 | Self { 325 | empty_edges, 326 | normal_edges, 327 | } 328 | } 329 | } 330 | 331 | impl ClosureBuilder { 332 | /// Returns the symbol set of the current finite automaton. 333 | pub fn symbol_set(&self) -> HashSet 334 | where 335 | S: Clone + Eq + Hash, 336 | { 337 | self 338 | .normal_edges 339 | .values() 340 | .flat_map(|s| s.outs().keys().cloned()) 341 | .collect() 342 | } 343 | 344 | /// Returns the epsilon closure of the given state. 345 | pub fn epsilon_closure(&self, cached: &mut CachedClosures, ids: Ids) -> Closure 346 | where 347 | Ids: Into, 348 | { 349 | let mut closure = ids.into(); 350 | if closure.is_empty() { 351 | closure 352 | } else if let Some(c) = cached.get(&closure) { 353 | c.clone() 354 | } else { 355 | let ids = closure.clone(); 356 | let mut next_ids: Vec<_> = closure.iter().copied().collect(); 357 | while let Some(id) = next_ids.pop() { 358 | if let Some(to_ids) = self.empty_edges.get(&id) { 359 | for id in to_ids { 360 | if closure.insert(*id) { 361 | next_ids.push(*id); 362 | } 363 | } 364 | } 365 | } 366 | cached.insert(ids, closure.clone()); 367 | closure 368 | } 369 | } 370 | 371 | /// Returns a set of all possible states that can be reached 372 | /// after accepting symbol `s` on the given states. 373 | pub fn state_closure(&self, cached: &mut CachedClosures, states: &Closure, s: &S) -> Closure 374 | where 375 | S: Eq + Hash, 376 | { 377 | let mut next_states = Closure::new(); 378 | for id in states { 379 | if let Some(ids) = self.normal_edges.get(id).and_then(|st| st.outs().get(s)) { 380 | next_states.extend(ids); 381 | } 382 | } 383 | self.epsilon_closure(cached, next_states) 384 | } 385 | } 386 | 387 | /// Closure of a state of finite automaton. 388 | pub type Closure = BTreeSet; 389 | 390 | /// Cached closures. 391 | pub type CachedClosures = HashMap; 392 | -------------------------------------------------------------------------------- /laps_regex/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! # laps_regex 2 | //! 3 | //! Tools for generating NFAs, DFAs and state-transition tables from 4 | //! regular expressions. 5 | //! 6 | //! This library is built for crate [`laps`](https://crates.io/crates/laps). 7 | //! 8 | //! ## Example: Matching UTF-8 Strings 9 | //! 10 | //! ``` 11 | //! use laps_regex::re::{RegexBuilder, CharsMatcher}; 12 | //! 13 | //! #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] 14 | //! enum Token { 15 | //! Keyword, 16 | //! Identifier, 17 | //! Number, 18 | //! } 19 | //! 20 | //! let matcher: CharsMatcher<_> = RegexBuilder::new() 21 | //! .add("if|else|while", Token::Keyword) 22 | //! .add("[_a-zA-Z][_a-zA-Z0-9]*", Token::Identifier) 23 | //! .add("[0-9]|[1-9][0-9]+", Token::Number) 24 | //! .build() 25 | //! .unwrap(); 26 | //! 27 | //! assert_eq!(matcher.is_str_match("if"), Some(&Token::Keyword)); 28 | //! 
assert_eq!(matcher.is_str_match("while1"), Some(&Token::Identifier));
29 | //! assert_eq!(matcher.is_str_match("42"), Some(&Token::Number));
30 | //! assert_eq!(matcher.is_str_match("?"), None);
31 | //! ```
32 | //!
33 | //! ## Example: Matching Bytes
34 | //!
35 | //! ```
36 | //! use laps_regex::re::{RegexBuilder, BytesMatcher};
37 | //!
38 | //! let matcher: BytesMatcher<_> = RegexBuilder::new()
39 | //!   .add("hello|hi", 0)
40 | //!   .add("goodbye|bye", 1)
41 | //!   .build_bytes()
42 | //!   .unwrap();
43 | //!
44 | //! assert_eq!(matcher.is_match("hello".as_bytes()), Some(&0));
45 | //! assert_eq!(matcher.is_match(&[0x62, 0x79, 0x65]), Some(&1));
46 | //! ```
47 | 
48 | pub mod dfa;
49 | pub mod fa;
50 | pub mod mir;
51 | pub mod nfa;
52 | pub mod re;
53 | pub mod table;
54 | 
-------------------------------------------------------------------------------- /laps_regex/src/nfa.rs: --------------------------------------------------------------------------------
1 | //! Nondeterministic finite automaton ([`NFA`]) related implementations.
2 | //!
3 | //! An NFA can be built from a mid-level intermediate representation ([`Mir`]).
4 | 
5 | use crate::fa::{MultiFA, State};
6 | use crate::mir::Mir;
7 | use std::collections::HashMap;
8 | use std::hash::Hash;
9 | use std::{fmt, io};
10 | 
11 | /// A nondeterministic finite automaton (NFA)
12 | /// with symbol type `S` and tag type `T`.
13 | #[derive(Debug)]
14 | pub struct NFA<S, T>
15 | where
16 |   S: Eq + Hash,
17 | {
18 |   fa: MultiFA<Option<Vec<(S, S)>>>,
19 |   tags: HashMap<usize, T>,
20 | }
21 | 
22 | impl<S, T> NFA<S, T>
23 | where
24 |   S: Eq + Hash,
25 | {
26 |   /// Creates a new NFA from [`Mir`].
27 |   pub fn new(mir: Mir<S, T>) -> Self {
28 |     match mir {
29 |       Mir::Empty => Self::new_nfa_with_symbol(None),
30 |       Mir::Ranges(rs) => Self::new_nfa_with_symbol(Some(rs)),
31 |       Mir::Concat(c) => c.into_iter().map(Self::new).reduce(Self::concat).unwrap(),
32 |       Mir::Alter(mut a) => {
33 |         if a.len() == 1 {
34 |           let (mir, tag) = a.swap_remove(0);
35 |           let mut nfa = Self::new(mir);
36 |           if let Some(tag) = tag {
37 |             let fs = nfa.normalize();
38 |             nfa.fa.set_final_state(fs);
39 |             nfa.tags.insert(fs, tag);
40 |           }
41 |           nfa
42 |         } else {
43 |           a.into_iter()
44 |             .map(|(mir, tag)| (Self::new(mir), tag))
45 |             .reduce(Self::alter)
46 |             .unwrap()
47 |             .0
48 |         }
49 |       }
50 |       Mir::Kleene(k) => {
51 |         // create NFA and normalize
52 |         let mut nfa = Self::new(*k);
53 |         let id = nfa.normalize();
54 |         // create an edge to the initial state
55 |         let init = nfa.fa.init_id();
56 |         nfa.fa.state_mut(id).unwrap().add(None, init);
57 |         // set the initial state as a final state
58 |         nfa.fa.set_final_state(init);
59 |         nfa
60 |       }
61 |     }
62 |   }
63 | 
64 |   /// Creates a new NFA which matches the given symbol.
65 |   fn new_nfa_with_symbol(sym: Option<Vec<(S, S)>>) -> Self {
66 |     let mut fa = MultiFA::new();
67 |     let fs = fa.add_final_state();
68 |     fa.init_mut().add(sym, fs);
69 |     Self {
70 |       fa,
71 |       tags: HashMap::new(),
72 |     }
73 |   }
74 | 
75 |   /// Creates an alternation of the given two NFA-tag pairs.
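   | ///
   | /// In Thompson-construction terms (illustrative): the second NFA is glued
   | /// to the first by an empty edge from the first initial state, so the
   | /// result accepts either language while per-branch tags stay on their own
   | /// final states.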
76 | fn alter( 77 | (mut nfa1, tag1): (Self, Option), 78 | (mut nfa2, tag2): (Self, Option), 79 | ) -> (Self, Option) { 80 | // create final state and tag mapping for `nfa1` 81 | let fs1 = nfa1.normalize(); 82 | nfa1.fa.set_final_state(fs1); 83 | if let Some(tag1) = tag1 { 84 | nfa1.tags.insert(fs1, tag1); 85 | } 86 | // add empty edge to the initial state of `nfa2` 87 | nfa1.fa.init_mut().add(None, nfa2.fa.init_id()); 88 | // create final state and tag mapping for `nfa2` if it has a tag 89 | if let Some(tag2) = tag2 { 90 | let fs2 = nfa2.normalize(); 91 | nfa2.fa.set_final_state(fs2); 92 | nfa1.tags.insert(fs2, tag2); 93 | } 94 | // union states and tags of two NFAs 95 | nfa1.fa.union(nfa2.fa); 96 | nfa1.tags.extend(nfa2.tags); 97 | (nfa1, None) 98 | } 99 | 100 | /// Concatenates the given two NFAs into a new NFA. 101 | fn concat(mut nfa1: Self, nfa2: Self) -> Self { 102 | let fs1 = nfa1.normalize(); 103 | nfa1.fa.state_mut(fs1).unwrap().add(None, nfa2.fa.init_id()); 104 | nfa1.fa.union(nfa2.fa); 105 | nfa1.tags.extend(nfa2.tags); 106 | nfa1 107 | } 108 | 109 | /// Normalizes the current NFA. 110 | /// 111 | /// Keeps only final states with tags, set all other final states as 112 | /// normal states, and route them to a new normal state with an empty edge. 113 | /// 114 | /// Returns the normal state ID. 115 | fn normalize(&mut self) -> usize { 116 | // try to get an untagged final state 117 | let untagged = self 118 | .fa 119 | .finals() 120 | .iter() 121 | .copied() 122 | .find(|id| !self.tags.contains_key(id)); 123 | // get the target state id 124 | let target = if let Some(untagged) = untagged { 125 | self.fa.set_normal_state(untagged); 126 | untagged 127 | } else { 128 | self.fa.add_state() 129 | }; 130 | // add edges to the target state 131 | for id in self.fa.finals().clone() { 132 | if id != target { 133 | self.fa.state_mut(id).unwrap().add(None, target); 134 | if !self.tags.contains_key(&id) { 135 | self.fa.set_normal_state(id); 136 | } 137 | } 138 | } 139 | target 140 | } 141 | 142 | /// Converts the current NFA into a 143 | /// [`FiniteAutomaton`](crate::fa::FiniteAutomaton) and a tag set. 144 | pub fn into_fa_tags(self) -> FATags { 145 | (self.fa, self.tags) 146 | } 147 | 148 | /// Dumps the current finite automaton to the given writer as Graphviz. 149 | pub fn dump(&self, writer: &mut W) -> io::Result<()> 150 | where 151 | S: fmt::Debug, 152 | W: io::Write, 153 | { 154 | self.fa.dump(writer) 155 | } 156 | } 157 | 158 | impl From> for NFA 159 | where 160 | S: Eq + Hash, 161 | { 162 | fn from(mir: Mir) -> Self { 163 | Self::new(mir) 164 | } 165 | } 166 | 167 | /// A pair of [`NFA`]'s internal finite automaton and the tag map. 168 | /// 169 | /// Used by method [`into_fa_tags`](NFA#method.into_fa_tags) of [`NFA`]. 170 | pub type FATags = (MultiFA>>, HashMap); 171 | -------------------------------------------------------------------------------- /laps_regex/src/re.rs: -------------------------------------------------------------------------------- 1 | //! User interfaces for building and matching regular expressions. 2 | //! 3 | //! This module contains the regular expression builder [`RegexBuilder`] 4 | //! and the regular expression matcher [`RegexMatcher`]. 
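//!
//! A minimal sketch of the streaming API (the one-shot
//! [`RegexMatcher::is_match`] and [`CharsMatcher::is_str_match`] methods
//! are often more convenient):
//!
//! ```
//! use laps_regex::re::{CharsMatcher, RegexBuilder};
//!
//! let mut matcher: CharsMatcher<_> = RegexBuilder::new()
//!   .add("[0-9]+", "number")
//!   .build()
//!   .unwrap();
//! // feed one symbol at a time; `is_accept` advances the internal state
//! for c in "2023".chars() {
//!   assert!(matcher.is_accept(&c));
//! }
//! // the matcher now rests in a final state tagged "number"
//! assert_eq!(matcher.is_final(), Some(&"number"));
//! ```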
5 | 6 | use crate::dfa::DFA; 7 | use crate::mir::{Error as MirError, Mir, MirBuilder, SymbolOp}; 8 | use crate::nfa::NFA; 9 | use crate::table::StateTransTable; 10 | use regex_syntax::hir::Hir; 11 | use regex_syntax::{parse, Error as RegexError, ParserBuilder}; 12 | use std::fmt; 13 | use std::hash::Hash; 14 | 15 | /// A builder for regular expressions with tag type `T`. 16 | pub struct RegexBuilder { 17 | re_tags: Vec<(String, T)>, 18 | enable_par: Option, 19 | } 20 | 21 | impl RegexBuilder { 22 | /// Creates a new regular expression builder. 23 | pub fn new() -> Self { 24 | Self { 25 | re_tags: Vec::new(), 26 | enable_par: None, 27 | } 28 | } 29 | 30 | /// Adds a new regular expression to the builder, with the given tag. 31 | pub fn add(mut self, re: &str, tag: T) -> Self { 32 | self.re_tags.push((re.into(), tag)); 33 | self 34 | } 35 | 36 | /// Sets to [`Some(true)`] to construct the DFA in parallel, 37 | /// [`Some(false)`] to disable parallelization, and [`None`] to 38 | /// choose automatically. 39 | /// 40 | /// Defaults to [`None`]. 41 | pub fn enable_par(mut self, enable_par: Option) -> Self { 42 | self.enable_par = enable_par; 43 | self 44 | } 45 | } 46 | 47 | impl RegexBuilder 48 | where 49 | T: Clone + Hash + Eq + Ord, 50 | { 51 | /// Builds all regular expressions in the current builder as UTF-8 mode. 52 | /// 53 | /// Returns a [`RegexMatcher`], or an error. 54 | pub fn build(self) -> Result, Error> 55 | where 56 | S: Hash + Eq + Clone + Ord + SymbolOp + Sync + Send, 57 | Mir: MirBuilder, 58 | { 59 | self.build_impl(parse) 60 | } 61 | 62 | /// Builds all regular expressions in the current builder as bytes mode. 63 | /// 64 | /// Returns a [`RegexMatcher`], or an error. 65 | pub fn build_bytes(self) -> Result, Error> 66 | where 67 | S: Hash + Eq + Clone + Ord + SymbolOp + Sync + Send, 68 | Mir: MirBuilder, 69 | { 70 | self.build_impl(|re| ParserBuilder::new().utf8(false).build().parse(re)) 71 | } 72 | 73 | /// Implementation of all building methods. 74 | fn build_impl(self, re_parse: R) -> Result, Error> 75 | where 76 | R: Fn(&str) -> Result, 77 | S: Hash + Eq + Clone + Ord + SymbolOp + Sync + Send, 78 | Mir: MirBuilder, 79 | { 80 | if self.re_tags.is_empty() { 81 | Err(Error::EmptyBuilder) 82 | } else { 83 | Mir::Alter( 84 | self 85 | .re_tags 86 | .into_iter() 87 | .map(|(re, tag)| { 88 | re_parse(&re) 89 | .map_err(|e| Error::Regex(Box::new(e), tag.clone())) 90 | .and_then(|hir| Mir::new(hir).map_err(Error::Mir)) 91 | .map(|mir| (mir, Some(tag))) 92 | }) 93 | .collect::>()?, 94 | ) 95 | .optimize() 96 | .map(|mir| { 97 | RegexMatcher::new(StateTransTable::new(DFA::new( 98 | NFA::new(mir), 99 | self.enable_par, 100 | ))) 101 | }) 102 | .map_err(Error::Mir) 103 | } 104 | } 105 | } 106 | 107 | impl Default for RegexBuilder { 108 | fn default() -> Self { 109 | Self::new() 110 | } 111 | } 112 | 113 | /// Possible errors in building of regular expressions with tag type `T`. 114 | #[derive(Debug)] 115 | pub enum Error { 116 | /// There is no regular expressions in [`RegexBuilder`]. 117 | EmptyBuilder, 118 | /// An error occurred during parsing the regular expression with the tag `T`. 119 | Regex(Box, T), 120 | /// An error occurred during compiling or optimizing regular expressions. 
121 | Mir(MirError), 122 | } 123 | 124 | impl fmt::Display for Error { 125 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 126 | match self { 127 | Self::EmptyBuilder => write!(f, "no regular expressions in the builder"), 128 | Self::Regex(e, _) => write!(f, "{e}"), 129 | Self::Mir(e) => write!(f, "{e}"), 130 | } 131 | } 132 | } 133 | 134 | /// A matcher for matching regular expressions. 135 | #[derive(Debug)] 136 | pub struct RegexMatcher { 137 | table: StateTransTable, 138 | state: usize, 139 | } 140 | 141 | impl RegexMatcher { 142 | /// Creates a new matcher from the given [`StateTransTable`]. 143 | fn new(table: StateTransTable) -> Self { 144 | Self { 145 | state: table.init_id(), 146 | table, 147 | } 148 | } 149 | 150 | /// Returns the current state ID. 151 | pub fn state(&self) -> usize { 152 | self.state 153 | } 154 | 155 | /// Checks if the given bytes can be matched. 156 | /// If so, returns a reference to the corresponding tag. 157 | /// Otherwise, returns [`None`]. 158 | /// 159 | /// Smaller tags have higher precedence. 160 | pub fn is_match(&self, seq: &[S]) -> Option<&T> 161 | where 162 | S: Ord, 163 | { 164 | let mut id = self.table.init_id(); 165 | for s in seq { 166 | if let Some(next) = self.table.next_state(id, s) { 167 | id = next; 168 | } else { 169 | return None; 170 | } 171 | } 172 | self.table.is_final(id) 173 | } 174 | 175 | /// Returns true if the given symbol can be accepted. 176 | /// 177 | /// This method will update the internal state. 178 | pub fn is_accept(&mut self, s: &S) -> bool 179 | where 180 | S: Ord, 181 | { 182 | if let Some(next) = self.table.next_state(self.state, s) { 183 | self.state = next; 184 | true 185 | } else { 186 | false 187 | } 188 | } 189 | 190 | /// Checks if the current state is a final state. 191 | /// If so, returns a reference to the corresponding tag. 192 | /// Otherwise, returns [`None`]. 193 | /// 194 | /// Smaller tags have higher precedence. 195 | pub fn is_final(&self) -> Option<&T> { 196 | self.table.is_final(self.state) 197 | } 198 | 199 | /// Checks if the given state is a final state. 200 | /// If so, returns a reference to the corresponding tag. 201 | /// Otherwise, returns [`None`]. 202 | /// 203 | /// Smaller tags have higher precedence. 204 | pub fn is_state_final(&self, id: usize) -> Option<&T> { 205 | self.table.is_final(id) 206 | } 207 | 208 | /// Resets the internal state of the current matcher to initial state. 209 | pub fn reset(&mut self) { 210 | self.state = self.table.init_id(); 211 | } 212 | } 213 | 214 | impl From> for StateTransTable { 215 | fn from(matcher: RegexMatcher) -> Self { 216 | matcher.table 217 | } 218 | } 219 | 220 | /// A regular expression matcher for matching characters. 221 | pub type CharsMatcher = RegexMatcher; 222 | 223 | impl CharsMatcher { 224 | /// Checks if the given string can be matched. 225 | /// If so, returns a reference to the corresponding tag. 226 | /// Otherwise, returns [`None`]. 227 | /// 228 | /// Smaller tags have higher precedence. 229 | pub fn is_str_match(&self, s: &str) -> Option<&T> { 230 | let mut id = self.table.init_id(); 231 | for c in s.chars() { 232 | if let Some(next) = self.table.next_state(id, &c) { 233 | id = next; 234 | } else { 235 | return None; 236 | } 237 | } 238 | self.table.is_final(id) 239 | } 240 | } 241 | 242 | /// A regular expression matcher for matching bytes. 243 | pub type BytesMatcher = RegexMatcher; 244 | 245 | impl BytesMatcher { 246 | /// Checks if the given string can be matched. 
247 | /// If so, returns a reference to the corresponding tag. 248 | /// Otherwise, returns [`None`]. 249 | /// 250 | /// Smaller tags have higher precedence. 251 | pub fn is_str_match(&self, s: &str) -> Option<&T> { 252 | let mut id = self.table.init_id(); 253 | for c in s.bytes() { 254 | if let Some(next) = self.table.next_state(id, &c) { 255 | id = next; 256 | } else { 257 | return None; 258 | } 259 | } 260 | self.table.is_final(id) 261 | } 262 | } 263 | 264 | #[cfg(test)] 265 | mod test { 266 | use super::*; 267 | use Token::*; 268 | 269 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 270 | enum Token { 271 | Keyword, 272 | Identifier, 273 | Number, 274 | Str, 275 | Operator, 276 | Skip, 277 | Other, 278 | } 279 | 280 | #[test] 281 | fn match_string() { 282 | let matcher: CharsMatcher<_> = RegexBuilder::new() 283 | .add("if|else|while", Keyword) 284 | .add("[_a-zA-Z][_a-zA-Z0-9]*", Identifier) 285 | .add("[0-9]|[1-9][0-9]+", Number) 286 | .build() 287 | .unwrap(); 288 | assert_eq!(matcher.is_str_match("if"), Some(&Keyword)); 289 | assert_eq!(matcher.is_str_match("else"), Some(&Keyword)); 290 | assert_eq!(matcher.is_str_match("while"), Some(&Keyword)); 291 | assert_eq!(matcher.is_str_match("ifi"), Some(&Identifier)); 292 | assert_eq!(matcher.is_str_match("else1"), Some(&Identifier)); 293 | assert_eq!(matcher.is_str_match("_while"), Some(&Identifier)); 294 | assert_eq!(matcher.is_str_match("a_8"), Some(&Identifier)); 295 | assert_eq!(matcher.is_str_match("_"), Some(&Identifier)); 296 | assert_eq!(matcher.is_str_match("A_good_id"), Some(&Identifier)); 297 | assert_eq!(matcher.is_str_match("A_b@d_id"), None); 298 | assert_eq!(matcher.is_str_match("0"), Some(&Number)); 299 | assert_eq!(matcher.is_str_match("5"), Some(&Number)); 300 | assert_eq!(matcher.is_str_match("12450"), Some(&Number)); 301 | assert_eq!(matcher.is_str_match("10"), Some(&Number)); 302 | assert_eq!(matcher.is_str_match("01"), None); 303 | assert_eq!(matcher.is_str_match(""), None); 304 | assert_eq!(matcher.is_str_match("?"), None); 305 | } 306 | 307 | #[test] 308 | fn match_bytes() { 309 | let matcher: BytesMatcher<_> = RegexBuilder::new() 310 | .add("hello|hi", 0) 311 | .add("goodbye|bye", 1) 312 | .build_bytes() 313 | .unwrap(); 314 | assert_eq!(matcher.is_str_match("hello"), Some(&0)); 315 | assert_eq!(matcher.is_match(b"hello"), Some(&0)); 316 | assert_eq!(matcher.is_match(b"hi"), Some(&0)); 317 | assert_eq!(matcher.is_match(b"goodbye"), Some(&1)); 318 | assert_eq!(matcher.is_match(&[0x62, 0x79, 0x65]), Some(&1)); 319 | } 320 | 321 | #[test] 322 | fn match_stream() { 323 | use std::io::{Cursor, Read}; 324 | 325 | struct Lexer { 326 | reader: R, 327 | matcher: CharsMatcher, 328 | last_char: Option, 329 | } 330 | 331 | impl Lexer { 332 | fn new(reader: R) -> Self { 333 | Self { 334 | reader, 335 | matcher: RegexBuilder::new() 336 | .add("if|else|while", Keyword) 337 | .add("[_a-zA-Z][_a-zA-Z0-9]*", Identifier) 338 | .add("[0-9]|[1-9][0-9]+", Number) 339 | .add("\"[^\"\r\n]*\"", Str) 340 | .add(r"==|>|-=|\+=", Operator) 341 | .add(r"\s+", Skip) 342 | .add(".", Other) 343 | .build() 344 | .unwrap(), 345 | last_char: None, 346 | } 347 | } 348 | 349 | fn unread(&mut self, c: char) { 350 | self.last_char = Some(c); 351 | } 352 | } 353 | 354 | impl Lexer 355 | where 356 | R: Read, 357 | { 358 | fn read(&mut self) -> Option { 359 | let mut buf = [0]; 360 | match self.last_char.take() { 361 | None => match self.reader.read(&mut buf) { 362 | Ok(1) => Some(buf[0] as char), 363 | _ => None, 364 | }, 365 | c 
=> c, 366 | } 367 | } 368 | 369 | fn next_token_impl(&mut self) -> Option<(Token, String)> { 370 | let mut last_state; 371 | let mut buf = String::new(); 372 | self.matcher.reset(); 373 | loop { 374 | let c = self.read()?; 375 | last_state = self.matcher.state(); 376 | if !self.matcher.is_accept(&c) { 377 | self.unread(c); 378 | break; 379 | } 380 | buf.push(c); 381 | } 382 | self.matcher.is_state_final(last_state).map(|t| (*t, buf)) 383 | } 384 | 385 | fn next_token(&mut self) -> Option<(Token, String)> { 386 | loop { 387 | let ts = self.next_token_impl(); 388 | if !matches!(ts, Some((Skip, _))) { 389 | return ts; 390 | } 391 | } 392 | } 393 | } 394 | 395 | let mut lexer = Lexer::new(Cursor::new( 396 | r#" 397 | while (test(b) =="hello!") { 398 | if (b> 5){ 399 | b-=1; 400 | } else { 401 | b += 2; 402 | } 403 | } 404 | "#, 405 | )); 406 | 407 | assert_eq!(lexer.next_token(), Some((Keyword, "while".into()))); 408 | assert_eq!(lexer.next_token(), Some((Other, "(".into()))); 409 | assert_eq!(lexer.next_token(), Some((Identifier, "test".into()))); 410 | assert_eq!(lexer.next_token(), Some((Other, "(".into()))); 411 | assert_eq!(lexer.next_token(), Some((Identifier, "b".into()))); 412 | assert_eq!(lexer.next_token(), Some((Other, ")".into()))); 413 | assert_eq!(lexer.next_token(), Some((Operator, "==".into()))); 414 | assert_eq!(lexer.next_token(), Some((Str, "\"hello!\"".into()))); 415 | assert_eq!(lexer.next_token(), Some((Other, ")".into()))); 416 | assert_eq!(lexer.next_token(), Some((Other, "{".into()))); 417 | assert_eq!(lexer.next_token(), Some((Keyword, "if".into()))); 418 | assert_eq!(lexer.next_token(), Some((Other, "(".into()))); 419 | assert_eq!(lexer.next_token(), Some((Identifier, "b".into()))); 420 | assert_eq!(lexer.next_token(), Some((Operator, ">".into()))); 421 | assert_eq!(lexer.next_token(), Some((Number, "5".into()))); 422 | assert_eq!(lexer.next_token(), Some((Other, ")".into()))); 423 | assert_eq!(lexer.next_token(), Some((Other, "{".into()))); 424 | assert_eq!(lexer.next_token(), Some((Identifier, "b".into()))); 425 | assert_eq!(lexer.next_token(), Some((Operator, "-=".into()))); 426 | assert_eq!(lexer.next_token(), Some((Number, "1".into()))); 427 | assert_eq!(lexer.next_token(), Some((Other, ";".into()))); 428 | assert_eq!(lexer.next_token(), Some((Other, "}".into()))); 429 | assert_eq!(lexer.next_token(), Some((Keyword, "else".into()))); 430 | assert_eq!(lexer.next_token(), Some((Other, "{".into()))); 431 | assert_eq!(lexer.next_token(), Some((Identifier, "b".into()))); 432 | assert_eq!(lexer.next_token(), Some((Operator, "+=".into()))); 433 | assert_eq!(lexer.next_token(), Some((Number, "2".into()))); 434 | assert_eq!(lexer.next_token(), Some((Other, ";".into()))); 435 | assert_eq!(lexer.next_token(), Some((Other, "}".into()))); 436 | assert_eq!(lexer.next_token(), Some((Other, "}".into()))); 437 | assert_eq!(lexer.next_token(), None); 438 | } 439 | 440 | #[test] 441 | fn match_word() { 442 | let matcher: CharsMatcher<_> = RegexBuilder::new().add(r"\w+", 0).build().unwrap(); 443 | assert_eq!(matcher.is_str_match("if"), Some(&0)); 444 | assert_eq!(matcher.is_str_match("hello"), Some(&0)); 445 | assert_eq!(matcher.is_str_match(".hello"), None); 446 | assert_eq!(matcher.is_str_match("??"), None); 447 | } 448 | 449 | #[test] 450 | fn match_xeno_tokens() { 451 | let res = [ 452 | r"\s+", 453 | r"(0b[01]+|0o[0-7]+|0x[0-9a-fA-F]+|[0-9]+)([iIuU](8|16|32|64))?", 454 | r"[0-9]+(\.([0-9]+([eE][+-]?[0-9]+)?([fF](32|64))?)?|([eE][+-]?[0-9]+)([fF](32|64))?|([fF](32|64)))", 455 | 
r#"'([^'\\\n\r\t]|\\'|\\"|\\x[0-7][0-9a-fA-F]|\\n|\\r|\\t|\\\\|\\0|\\u\{[0-9a-fA-F]{1,6}\})'"#, 456 | r#"b'([\x20-\x26\x28-\x5b\x5d-\x7e]|\\x[0-9a-fA-F]{2}|\\n|\\r|\\t|\\\\|\\0|\\'|\\")'"#, 457 | r#""([^'\\\n\r\t]|\\'|\\"|\\x[0-7][0-9a-fA-F]|\\n|\\r|\\t|\\\\|\\0|\\u\{[0-9a-fA-F]{1,6}\})*""#, 458 | r####"r"[^"]*"|r#"([^"]|"[^#])*"#|r##"([^"]|"[^#]|"#[^#])*"##|r###"([^"]|"[^#]|"#[^#]|"##[^#])*"###"####, 459 | r#"b"([\x20-\x26\x28-\x5b\x5d-\x7e]|\\x[0-9a-fA-F]{2}|\\n|\\r|\\t|\\\\|\\0|\\'|\\")*""#, 460 | r"\+|-|\*|/|%|&|\||!|\^|<<|>>|&&|\|\||==|!=|<|<=|>|>=|=|\+=|-=|\*=|/=|%=|&=|\|=|\^=|<<=|>>=|\(|\)|\[|\]|\{|\}|\.|\.\.|\.\.\.|->|,|:|@|_|\?", 461 | r"[~!@#$%^&*()_\-+={}\[\]|\\:;<,>.?/]+", 462 | r#"[^\s~!@#$%^&*()_\-+={}\[\]|\\:;<,>.?/0-9][^\s~!@#$%^&*()\-+={}\[\]|\\:;<,>.?/]*"#, 463 | ]; 464 | let matcher: CharsMatcher<_> = res 465 | .iter() 466 | .enumerate() 467 | .fold(RegexBuilder::new(), |b, (i, re)| b.add(re, i)) 468 | .build() 469 | .unwrap(); 470 | assert_eq!(matcher.is_str_match("123"), Some(&1)); 471 | } 472 | } 473 | -------------------------------------------------------------------------------- /laps_regex/src/table.rs: -------------------------------------------------------------------------------- 1 | //! State-transition table ([`StateTransTable`]) related implementations. 2 | //! 3 | //! A state-transition table can be built from a deterministic finite 4 | //! automaton ([`DFA`]). 5 | 6 | use crate::dfa::DFA; 7 | use crate::mir::SymbolOp; 8 | use std::collections::{BTreeMap, HashMap}; 9 | use std::hash::Hash; 10 | 11 | /// A state-transition table with symbol type `S` and tag type `T`. 12 | #[derive(Debug)] 13 | pub struct StateTransTable { 14 | /// State-transition table, which is a `num_equivs * num_states` 2d array. 15 | table: Box<[usize]>, 16 | /// Initial state ID. 17 | init_id: usize, 18 | /// Number of states. 19 | num_states: usize, 20 | /// Mapping between symbol ranges and equivalence class ID. 21 | /// 22 | /// The key of the map is the right bound of the range, and 23 | /// the value is `(left_bound, equiv_id)`. 24 | sym_map: BTreeMap, 25 | /// Mapping between state IDs and tags. 26 | /// 27 | /// Only the state presents in this map are final states. 28 | tags: HashMap, 29 | } 30 | 31 | impl StateTransTable { 32 | /// Creates a new state-transition table from the given [`DFA`]. 33 | pub fn new(dfa: DFA) -> Self 34 | where 35 | S: Clone + Hash + Eq + Ord + SymbolOp, 36 | { 37 | let (equivs, trans_table, init_id, tags) = TempTable::new(dfa).into_optimized(); 38 | // get number of states 39 | let num_states = trans_table[0].len(); 40 | // get the final table 41 | let table = trans_table 42 | .into_iter() 43 | .flat_map(|s| s.into_iter()) 44 | .collect::>() 45 | .into_boxed_slice(); 46 | // get symbol map 47 | let sym_map = equivs 48 | .into_iter() 49 | .enumerate() 50 | .flat_map(|(i, es)| es.into_iter().map(move |(l, r)| (r, (l, i)))) 51 | .collect(); 52 | Self { 53 | table, 54 | init_id, 55 | num_states, 56 | sym_map, 57 | tags, 58 | } 59 | } 60 | 61 | /// Returns a reference to the internal transition table, 62 | /// which is a `num_equivs * num_states` 2d array. 63 | pub fn table(&self) -> &[usize] { 64 | &self.table 65 | } 66 | 67 | /// Returns the ID of the initial state. 68 | pub fn init_id(&self) -> usize { 69 | self.init_id 70 | } 71 | 72 | /// Returns number of states. 73 | pub fn num_states(&self) -> usize { 74 | self.num_states 75 | } 76 | 77 | /// Returns a reference to the mapping between symbol ranges 78 | /// and equivalence class ID. 
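/// This is the same encoding that `next_state` consults (via a range
/// query on the right bounds) to map an input symbol to a column of the
/// transition table.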
79 | /// 80 | /// The key of the map is the right bound of the range, and 81 | /// the value is `(left_bound, equiv_id)`. 82 | pub fn sym_map(&self) -> &BTreeMap { 83 | &self.sym_map 84 | } 85 | 86 | /// Returns a reference to the mapping between state IDs and tags. 87 | /// 88 | /// Only the state presents in this map are final states. 89 | pub fn tags(&self) -> &HashMap { 90 | &self.tags 91 | } 92 | 93 | /// Returns the ID of the next state after 94 | /// accepting symbol `s` on the given state. 95 | /// 96 | /// Returns [`None`] if the given state ID is invalid, 97 | /// or the given state can not accept symbol `s`. 98 | pub fn next_state(&self, id: usize, s: &S) -> Option 99 | where 100 | S: Ord, 101 | { 102 | // check if the ID is valid 103 | if id >= self.num_states { 104 | return None; 105 | } 106 | // get equivalence class ID 107 | let equiv = match self.sym_map.range(s..).next() { 108 | Some((_, (l, id))) if s >= l => *id, 109 | _ => return None, 110 | }; 111 | // get the next state 112 | let next = self.table[equiv * self.num_states + id]; 113 | (next < self.num_states).then_some(next) 114 | } 115 | 116 | /// Checks if the given state ID corresponds to a final state. 117 | /// 118 | /// Returns [`Some(tag)`] which `tag` corresponds to a user-input 119 | /// regular expression, otherwise returns [`None`]. 120 | pub fn is_final(&self, id: usize) -> Option<&T> { 121 | self.tags.get(&id) 122 | } 123 | } 124 | 125 | impl From> for StateTransTable 126 | where 127 | S: Clone + Hash + Eq + Ord + SymbolOp, 128 | { 129 | fn from(dfa: DFA) -> Self { 130 | Self::new(dfa) 131 | } 132 | } 133 | 134 | /// A temporary state-transition table. 135 | /// 136 | /// This structure will be constructed during the creation of 137 | /// [`StateTransTable`]. 138 | struct TempTable { 139 | table: HashMap, Vec>, 140 | tags: HashMap, 141 | init_id: usize, 142 | } 143 | 144 | impl TempTable { 145 | /// Creates a new temporary state-transition table from the given [`DFA`]. 146 | fn new(dfa: DFA) -> Self 147 | where 148 | S: Clone + Hash + Eq, 149 | { 150 | let (fa, tags) = dfa.into_fa_tags(); 151 | let num_states = fa.states().len(); 152 | // assign IDs for all states 153 | let mut ids = HashMap::new(); 154 | for id in fa.states().keys() { 155 | let next_id = ids.len(); 156 | ids.insert(*id, next_id); 157 | } 158 | // build the table 159 | let mut table = HashMap::new(); 160 | for (id, state) in fa.states() { 161 | let id = ids[id]; 162 | for (sym, next) in state.outs() { 163 | // create or get a state table 164 | let states = table.entry(sym.clone()).or_insert_with(|| { 165 | let mut v = Vec::new(); 166 | v.resize(num_states, num_states); 167 | v 168 | }); 169 | // update it 170 | states[id] = ids[next]; 171 | } 172 | } 173 | // build the tag map 174 | let tags = tags.into_iter().map(|(id, tag)| (ids[&id], tag)).collect(); 175 | Self { 176 | table, 177 | tags, 178 | init_id: ids[&fa.init_id()], 179 | } 180 | } 181 | 182 | /// Optimizes the current table. 183 | /// 184 | /// Returns equivalence classes, state-transition table, 185 | /// initial state ID and tags. 
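///
/// Identical columns of the temporary table are merged into a single
/// equivalence class, and adjacent symbol ranges within a class are
/// coalesced whenever one range's right bound immediately precedes the
/// next range's left bound. Illustratively (assuming `SymbolOp::next`
/// steps to the adjacent symbol): if `a..=m` and `n..=z` map to the same
/// next-state column, they collapse into the single class `a..=z`.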
186 | fn into_optimized(self) -> OptimizedTable 187 | where 188 | S: Ord + SymbolOp, 189 | { 190 | // sort the table 191 | let mut table: Vec<_> = self.table.into_iter().map(|(s, t)| (t, s)).collect(); 192 | table.sort_unstable(); 193 | // get equivalence classes and the state-transition table 194 | let mut equivs: Vec> = Vec::new(); 195 | let mut trans_table = Vec::new(); 196 | for (states, sym) in table { 197 | match trans_table.last() { 198 | Some(t) if t == &states => { 199 | // get the last equivalence classes 200 | let equiv = equivs.last_mut().unwrap(); 201 | // get the last symbol of the last equivalence classes 202 | // and the first symbol of the current range 203 | let (_, last_r) = equiv.last_mut().unwrap(); 204 | let mut iter = sym.into_iter(); 205 | let first_sym = iter.next().unwrap(); 206 | // check if the current symbol can be merged into the last one 207 | if last_r.next().as_ref() == Some(&first_sym.0) { 208 | *last_r = first_sym.1; 209 | } else { 210 | equiv.push(first_sym); 211 | } 212 | // add the rest symbols 213 | equiv.extend(iter); 214 | } 215 | _ => { 216 | equivs.push(sym); 217 | trans_table.push(states); 218 | } 219 | } 220 | } 221 | (equivs, trans_table, self.init_id, self.tags) 222 | } 223 | } 224 | 225 | /// Intermediate result of an optimized state-transition table. 226 | /// 227 | /// Contains equivalence classes, optimized state-transition table, 228 | /// initial state ID and tags. 229 | type OptimizedTable = (Vec>, Vec>, usize, HashMap); 230 | -------------------------------------------------------------------------------- /src/ast.rs: -------------------------------------------------------------------------------- 1 | //! Some common predefined AST structures that can be used in parser. 2 | 3 | use crate::parse::Parse; 4 | use crate::span::{Result, Span, Spanned, TrySpan}; 5 | use crate::token::TokenStream; 6 | use std::marker::PhantomData; 7 | use std::slice::{Iter, IterMut}; 8 | use std::vec::IntoIter; 9 | 10 | /// Implements [`IntoIterator`] trait for the given wrapper type. 11 | macro_rules! impl_into_iterator { 12 | ($t:ident<$($generic:ident),+>, $item:ident) => { 13 | impl<'a, $($generic),+> IntoIterator for &'a $t<$($generic),+> { 14 | type Item = &'a $item; 15 | type IntoIter = Iter<'a, $item>; 16 | fn into_iter(self) -> Self::IntoIter { 17 | self.0.as_slice().into_iter() 18 | } 19 | } 20 | impl<'a, $($generic),+> IntoIterator for &'a mut $t<$($generic),+> { 21 | type Item = &'a mut $item; 22 | type IntoIter = IterMut<'a, $item>; 23 | fn into_iter(self) -> Self::IntoIter { 24 | self.0.as_mut_slice().into_iter() 25 | } 26 | } 27 | impl<$($generic),+> IntoIterator for $t<$($generic),+> { 28 | type Item = $item; 29 | type IntoIter = IntoIter<$item>; 30 | fn into_iter(self) -> Self::IntoIter { 31 | self.0.into_iter() 32 | } 33 | } 34 | }; 35 | } 36 | 37 | /// A non-empty sequence of AST `T`, which `T` can occur one or more times, 38 | /// like `T`, `T T`, `T T T`, ... 39 | /// 40 | /// The inner [`Vec`] is guaranteed not to be empty. 41 | #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] 42 | pub struct NonEmptySeq(pub Vec); 43 | impl_into_iterator!(NonEmptySeq, T); 44 | 45 | impl Parse for NonEmptySeq 46 | where 47 | TS: TokenStream, 48 | T: Parse, 49 | { 50 | fn parse(tokens: &mut TS) -> Result { 51 | let mut ts = vec![tokens.parse()?]; 52 | while T::maybe(tokens)? 
{ 53 | ts.push(tokens.parse()?); 54 | } 55 | Ok(Self(ts)) 56 | } 57 | 58 | fn maybe(tokens: &mut TS) -> Result { 59 | T::maybe(tokens) 60 | } 61 | } 62 | 63 | impl Spanned for NonEmptySeq 64 | where 65 | T: Spanned, 66 | { 67 | fn span(&self) -> Span { 68 | if self.0.len() == 1 { 69 | self.0.first().unwrap().span() 70 | } else { 71 | self 72 | .0 73 | .first() 74 | .unwrap() 75 | .span() 76 | .into_end_updated(self.0.last().unwrap().span()) 77 | } 78 | } 79 | } 80 | 81 | /// A sequence of AST `T`, separated by AST `S`, 82 | /// like ``, `T`, `T S T`, `T S T S T`, ... 83 | /// 84 | /// The delimiter will not be stored. 85 | #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] 86 | pub struct SepSeq(pub Vec, PhantomData); 87 | impl_into_iterator!(SepSeq, T); 88 | 89 | impl Parse for SepSeq 90 | where 91 | TS: TokenStream, 92 | T: Parse, 93 | S: Parse, 94 | { 95 | fn parse(tokens: &mut TS) -> Result { 96 | let mut ts = Vec::new(); 97 | if T::maybe(tokens)? { 98 | loop { 99 | ts.push(tokens.parse()?); 100 | if !S::maybe(tokens)? { 101 | break; 102 | } 103 | S::parse(tokens)?; 104 | } 105 | } 106 | Ok(Self(ts, PhantomData)) 107 | } 108 | 109 | fn maybe(_: &mut TS) -> Result { 110 | Ok(true) 111 | } 112 | } 113 | 114 | impl TrySpan for SepSeq 115 | where 116 | T: TrySpan, 117 | { 118 | fn try_span(&self) -> Option { 119 | self.0.try_span() 120 | } 121 | } 122 | 123 | /// A non-empty sequence of AST `T`, separated by AST `S`, 124 | /// like `T`, `T S T`, `T S T S T`, ... 125 | /// 126 | /// The delimiter will not be stored, and the inner [`Vec`] 127 | /// is guaranteed not to be empty. 128 | #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] 129 | pub struct NonEmptySepSeq(pub Vec, PhantomData); 130 | impl_into_iterator!(NonEmptySepSeq, T); 131 | 132 | impl Parse for NonEmptySepSeq 133 | where 134 | TS: TokenStream, 135 | T: Parse, 136 | S: Parse, 137 | { 138 | fn parse(tokens: &mut TS) -> Result { 139 | let mut ts = vec![tokens.parse()?]; 140 | while S::maybe(tokens)? { 141 | S::parse(tokens)?; 142 | ts.push(tokens.parse()?); 143 | } 144 | Ok(Self(ts, PhantomData)) 145 | } 146 | 147 | fn maybe(tokens: &mut TS) -> Result { 148 | T::maybe(tokens) 149 | } 150 | } 151 | 152 | impl Spanned for NonEmptySepSeq 153 | where 154 | T: Spanned, 155 | { 156 | fn span(&self) -> Span { 157 | let span = self.0.first().unwrap().span(); 158 | if self.0.len() == 1 { 159 | span 160 | } else { 161 | span.into_end_updated(self.0.last().unwrap().span()) 162 | } 163 | } 164 | } 165 | 166 | /// A sequence of AST `T`, separated by AST `S`, ending with an optional `S`, 167 | /// like ``, `T`, `T S`, `T S T`, `T S T S`, `T S T S T`, ... 168 | /// 169 | /// The delimiter will not be stored. 170 | #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] 171 | pub struct OptSepSeq(pub Vec, PhantomData); 172 | impl_into_iterator!(OptSepSeq, T); 173 | 174 | impl Parse for OptSepSeq 175 | where 176 | TS: TokenStream, 177 | T: Parse, 178 | S: Parse, 179 | { 180 | fn parse(tokens: &mut TS) -> Result { 181 | let mut ts = Vec::new(); 182 | while T::maybe(tokens)? { 183 | ts.push(tokens.parse()?); 184 | if !S::maybe(tokens)? 
{ 185 | break; 186 | } 187 | S::parse(tokens)?; 188 | } 189 | Ok(Self(ts, PhantomData)) 190 | } 191 | 192 | fn maybe(_: &mut TS) -> Result { 193 | Ok(true) 194 | } 195 | } 196 | 197 | impl TrySpan for OptSepSeq 198 | where 199 | T: TrySpan, 200 | { 201 | fn try_span(&self) -> Option { 202 | self.0.try_span() 203 | } 204 | } 205 | 206 | /// A non-empty sequence of AST `T`, separated by AST `S`, ending with an 207 | /// optional `S`, like `T`, `T S`, `T S T`, `T S T S`, `T S T S T`, ... 208 | /// 209 | /// The delimiter will not be stored, and the inner [`Vec`] 210 | /// is guaranteed not to be empty. 211 | #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] 212 | pub struct NonEmptyOptSepSeq(pub Vec, PhantomData); 213 | impl_into_iterator!(NonEmptyOptSepSeq, T); 214 | 215 | impl Parse for NonEmptyOptSepSeq 216 | where 217 | TS: TokenStream, 218 | T: Parse, 219 | S: Parse, 220 | { 221 | fn parse(tokens: &mut TS) -> Result { 222 | let mut ts = vec![tokens.parse()?]; 223 | while S::maybe(tokens)? { 224 | S::parse(tokens)?; 225 | if !T::maybe(tokens)? { 226 | break; 227 | } 228 | ts.push(tokens.parse()?); 229 | } 230 | Ok(Self(ts, PhantomData)) 231 | } 232 | 233 | fn maybe(tokens: &mut TS) -> Result { 234 | T::maybe(tokens) 235 | } 236 | } 237 | 238 | impl Spanned for NonEmptyOptSepSeq 239 | where 240 | T: Spanned, 241 | { 242 | fn span(&self) -> Span { 243 | let span = self.0.first().unwrap().span(); 244 | if self.0.len() == 1 { 245 | span 246 | } else { 247 | span.into_end_updated(self.0.last().unwrap().span()) 248 | } 249 | } 250 | } 251 | 252 | /// A non-empty linked list of AST `T`, separated by AST `S`, 253 | /// like `T`, `T S T`, `T S T S T`, ... 254 | /// 255 | /// The delimiter will be stored. 256 | #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] 257 | pub enum NonEmptySepList { 258 | /// One element. 259 | One(T), 260 | /// More than one element. 261 | More(T, S, Box), 262 | } 263 | 264 | impl Parse for NonEmptySepList 265 | where 266 | TS: TokenStream, 267 | T: Parse, 268 | S: Parse, 269 | { 270 | fn parse(tokens: &mut TS) -> Result { 271 | let t = tokens.parse()?; 272 | Ok(if S::maybe(tokens)? { 273 | Self::More(t, tokens.parse()?, tokens.parse()?) 274 | } else { 275 | Self::One(t) 276 | }) 277 | } 278 | 279 | fn maybe(tokens: &mut TS) -> Result { 280 | T::maybe(tokens) 281 | } 282 | } 283 | 284 | impl Spanned for NonEmptySepList 285 | where 286 | T: Spanned, 287 | { 288 | fn span(&self) -> Span { 289 | match self { 290 | Self::One(t) => t.span(), 291 | Self::More(t, _, l) => t.span().into_end_updated(l.span()), 292 | } 293 | } 294 | } 295 | 296 | /// A linked list of AST `T`, separated by AST `S`, 297 | /// like ``, `T`, `T S T`, `T S T S T`, ... 298 | /// 299 | /// The delimiter will be stored. 300 | pub type SepList = Option>; 301 | 302 | /// A non-empty linked list of AST `T`, separated by AST `S`, ending with 303 | /// an optional `S`, like `T`, `T S`, `T S T`, `T S T S`, `T S T S T`, ... 304 | /// 305 | /// The delimiter will be stored. 306 | #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] 307 | pub enum NonEmptyOptSepList { 308 | /// One element. 309 | One(T), 310 | /// One element with a separator. 311 | OneWithSep(T, S), 312 | /// More than one element. 313 | More(T, S, Box), 314 | } 315 | 316 | impl Parse for NonEmptyOptSepList 317 | where 318 | TS: TokenStream, 319 | T: Parse, 320 | S: Parse, 321 | { 322 | fn parse(tokens: &mut TS) -> Result { 323 | let t = tokens.parse()?; 324 | Ok(if S::maybe(tokens)? 
{ 325 | let s = tokens.parse()?; 326 | if T::maybe(tokens)? { 327 | Self::More(t, s, tokens.parse()?) 328 | } else { 329 | Self::OneWithSep(t, s) 330 | } 331 | } else { 332 | Self::One(t) 333 | }) 334 | } 335 | 336 | fn maybe(tokens: &mut TS) -> Result { 337 | T::maybe(tokens) 338 | } 339 | } 340 | 341 | impl Spanned for NonEmptyOptSepList 342 | where 343 | T: Spanned, 344 | S: Spanned, 345 | { 346 | fn span(&self) -> Span { 347 | match self { 348 | Self::One(t) => t.span(), 349 | Self::OneWithSep(t, s) => t.span().into_end_updated(s.span()), 350 | Self::More(t, _, l) => t.span().into_end_updated(l.span()), 351 | } 352 | } 353 | } 354 | 355 | /// A linked list of AST `T`, separated by AST `S`, ending with 356 | /// an optional `S`, like ``, `T`, `T S`, `T S T`, `T S T S`, 357 | /// `T S T S T`, ... 358 | /// 359 | /// The delimiter will be stored. 360 | pub type OptSepList = Option>; 361 | 362 | /// An AST `T` quoted by AST `L` and AST `R`, like `L T R`. 363 | #[deprecated( 364 | since = "0.1.6", 365 | note = "will be removed in 0.2.0, please use tuple `(L, T, R)` instead" 366 | )] 367 | #[derive(Clone, Debug, PartialEq, Eq, Hash)] 368 | pub struct Quoted(pub L, pub T, pub R); 369 | 370 | #[allow(deprecated)] 371 | impl Parse for Quoted 372 | where 373 | TS: TokenStream, 374 | L: Parse, 375 | T: Parse, 376 | R: Parse, 377 | { 378 | fn parse(tokens: &mut TS) -> Result { 379 | Ok(Self(tokens.parse()?, tokens.parse()?, tokens.parse()?)) 380 | } 381 | 382 | fn maybe(tokens: &mut TS) -> Result { 383 | L::maybe(tokens) 384 | } 385 | } 386 | 387 | #[allow(deprecated)] 388 | impl Spanned for Quoted 389 | where 390 | L: Spanned, 391 | R: Spanned, 392 | { 393 | fn span(&self) -> Span { 394 | self.0.span().into_end_updated(self.2.span()) 395 | } 396 | } 397 | 398 | /// An AST `T` with an optional prefix `P`, like `T` or `P T`. 399 | /// 400 | /// The `maybe` method of AST returns `true` when either `P::maybe` returns 401 | /// `true` or `T::maybe` returns `true`. 
This may not work in the following 402 | /// example: 403 | /// 404 | /// ``` 405 | /// # use laps::{prelude::*, span::Result, ast::OptPrefix, token::{Tokenizer, TokenBuffer}}; 406 | /// # struct Prefix; 407 | /// # impl Parse for Prefix 408 | /// # where 409 | /// # TS: TokenStream, 410 | /// # { 411 | /// # fn parse(_: &mut TS) -> Result { Ok(Self) } 412 | /// # fn maybe(_: &mut TS) -> Result { Ok(true) } 413 | /// # } 414 | /// # struct Item1; 415 | /// # impl Parse for Item1 416 | /// # where 417 | /// # TS: TokenStream, 418 | /// # { 419 | /// # fn parse(_: &mut TS) -> Result { Ok(Self) } 420 | /// # fn maybe(_: &mut TS) -> Result { Ok(true) } 421 | /// # } 422 | /// # struct Item2; 423 | /// # impl Parse for Item2 424 | /// # where 425 | /// # TS: TokenStream, 426 | /// # { 427 | /// # fn parse(_: &mut TS) -> Result { Ok(Self) } 428 | /// # fn maybe(_: &mut TS) -> Result { Ok(true) } 429 | /// # } 430 | /// # struct Lexer; 431 | /// # impl Tokenizer for Lexer { 432 | /// # type Token = (); 433 | /// # fn next_token(&mut self) -> Result<()> { Ok(()) } 434 | /// # } 435 | /// # let mut tokens = TokenBuffer::new(Lexer); 436 | /// # impl Parse for Items 437 | /// # where 438 | /// # TS: TokenStream, 439 | /// # { 440 | /// # fn parse(_: &mut TS) -> Result { Ok(Self::Item1(OptPrefix(None, Item1))) } 441 | /// # fn maybe(_: &mut TS) -> Result { Ok(true) } 442 | /// # } 443 | /// enum Items { 444 | /// Item1(OptPrefix), 445 | /// Item2(OptPrefix), 446 | /// } 447 | /// 448 | /// let items: Items = tokens.parse().unwrap(); 449 | /// ``` 450 | /// 451 | /// The `items` may always be `Items::Item1` whether the input is 452 | /// `Prefix Item1` or `Prefix Item2` with a naive implementation of trait 453 | /// `Parse` for `Items` (like `#[derive(Parse)]`). 454 | /// 455 | /// For more precise implementation of `maybe` method, please use 456 | /// [`OptTokenPrefix`] if possible. 457 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] 458 | pub struct OptPrefix(pub Option
<P>
, pub T); 459 | 460 | impl Parse for OptPrefix 461 | where 462 | TS: TokenStream, 463 | P: Parse, 464 | T: Parse, 465 | { 466 | fn parse(tokens: &mut TS) -> Result { 467 | Ok(Self(tokens.parse()?, tokens.parse()?)) 468 | } 469 | 470 | fn maybe(tokens: &mut TS) -> Result { 471 | Ok(P::maybe(tokens)? || T::maybe(tokens)?) 472 | } 473 | } 474 | 475 | impl Spanned for OptPrefix 476 | where 477 | P: Spanned, 478 | T: Spanned, 479 | { 480 | fn span(&self) -> Span { 481 | match &self.0 { 482 | Some(p) => p.span().into_end_updated(self.1.span()), 483 | None => self.1.span(), 484 | } 485 | } 486 | } 487 | 488 | /// An AST `T` with an optional prefix `P`, like `T` or `P T`. 489 | /// 490 | /// The `maybe` method of AST treats `P` as a single token, and returns 491 | /// `true` if both `P::maybe` returns `true` and `T::maybe` returns `true`, 492 | /// otherwise returns the result of `T::maybe`. 493 | /// 494 | /// # Notes 495 | /// 496 | /// Do not use this AST type if `P` is not a single token. 497 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] 498 | pub struct OptTokenPrefix(pub Option
<P>
, pub T); 499 | 500 | impl Parse for OptTokenPrefix 501 | where 502 | TS: TokenStream, 503 | P: Parse, 504 | T: Parse, 505 | { 506 | fn parse(tokens: &mut TS) -> Result { 507 | Ok(Self(tokens.parse()?, tokens.parse()?)) 508 | } 509 | 510 | fn maybe(tokens: &mut TS) -> Result { 511 | if P::maybe(tokens)? { 512 | let token = tokens.next_token()?; 513 | let result = T::maybe(tokens)?; 514 | tokens.unread(token); 515 | Ok(result) 516 | } else { 517 | T::maybe(tokens) 518 | } 519 | } 520 | } 521 | 522 | impl Spanned for OptTokenPrefix 523 | where 524 | P: Spanned, 525 | T: Spanned, 526 | { 527 | fn span(&self) -> Span { 528 | match &self.0 { 529 | Some(p) => p.span().into_end_updated(self.1.span()), 530 | None => self.1.span(), 531 | } 532 | } 533 | } 534 | 535 | /// An AST `T` with a prefix `P`, like `T` or `P T`. 536 | /// 537 | /// The `maybe` method of AST treats `P` as a single token, and returns 538 | /// `true` if both `P::maybe` returns `true` and `T::maybe` returns `true`. 539 | /// 540 | /// # Notes 541 | /// 542 | /// Do not use this AST type if `P` is not a single token. 543 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] 544 | pub struct TokenPrefix(pub P, pub T); 545 | 546 | impl Parse for TokenPrefix 547 | where 548 | TS: TokenStream, 549 | P: Parse, 550 | T: Parse, 551 | { 552 | fn parse(tokens: &mut TS) -> Result { 553 | Ok(Self(tokens.parse()?, tokens.parse()?)) 554 | } 555 | 556 | fn maybe(tokens: &mut TS) -> Result { 557 | if P::maybe(tokens)? { 558 | let token = tokens.next_token()?; 559 | let result = T::maybe(tokens)?; 560 | tokens.unread(token); 561 | Ok(result) 562 | } else { 563 | Ok(false) 564 | } 565 | } 566 | } 567 | 568 | impl Spanned for TokenPrefix 569 | where 570 | P: Spanned, 571 | T: Spanned, 572 | { 573 | fn span(&self) -> Span { 574 | self.0.span().into_end_updated(self.1.span()) 575 | } 576 | } 577 | -------------------------------------------------------------------------------- /src/input.rs: -------------------------------------------------------------------------------- 1 | //! Utilities for constructing lexers. 2 | //! 3 | //! This module conntains the [`InputStream`] trait, which can be 4 | //! implemented for input streams, i.e. streams that return characters. 5 | //! This trait has already been implemented for 6 | //! [`Reader`](crate::reader::Reader) and 7 | //! [`ByteReader`](crate::reader::ByteReader). 8 | //! 9 | //! The [`InputStream`] trait provides many useful utility methods for 10 | //! reading characters and the corresponding [`Span`]s from the input stream. 11 | 12 | use crate::span::{Location, Result, Span}; 13 | 14 | /// Trait for input streams. 15 | pub trait InputStream { 16 | /// The type of the character produced by the input stream. 17 | type CharType; 18 | 19 | /// Reads the next character from the input stream. 20 | /// 21 | /// Returns the character and the last location (location before reading 22 | /// the character) if successful, or [Ok]([None]) 23 | /// if EOF was encountered, or [`Err`] if something wrong. 24 | fn next_char_loc(&mut self) -> Result<(Option, Location)>; 25 | 26 | /// Unreads the given character and the last location 27 | /// and put it back to the input stream. 28 | fn unread(&mut self, last: (Option, Location)); 29 | 30 | /// Returns a reference to the current span in the lexer. 31 | fn span(&self) -> &Span; 32 | 33 | /// Sets the line and column of the current span. 
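///
/// A usage sketch (assuming a line-marker directive such as `# 37 "file.c"`
/// has already been lexed, and that columns are 1-based as in `Span`'s
/// display format):
///
/// ```
/// # use laps::input::InputStream;
/// # use laps::reader::Reader;
/// let mut reader = Reader::from("int x;");
/// // make spans of subsequently read characters report line 37, column 1
/// reader.set_line_col(37, 1);
/// ```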
34 | /// 35 | /// This could be useful if something like the C preprocessor 36 | /// is to be supported: 37 | /// 38 | /// ```text 39 | /// # 37 "" 40 | /// ... 41 | /// ``` 42 | fn set_line_col(&mut self, line: u32, col: u32); 43 | 44 | /// Reads the next character from the input stream. 45 | /// 46 | /// Returns the character if successful, 47 | /// or [Ok]([None]) if EOF was encountered, 48 | /// or [`Err`] if something wrong. 49 | fn next_char(&mut self) -> Result> { 50 | self.next_char_loc().map(|(c, _)| c) 51 | } 52 | 53 | /// Reads the next character from the input stream. 54 | /// 55 | /// Returns the character and its span if successful, 56 | /// or [Ok](([None], _)) if EOF was encountered, 57 | /// or [`Err`] if something wrong. 58 | fn next_char_span(&mut self) -> Result<(Option, Span)> { 59 | self.next_char_loc().map(|(c, _)| (c, self.span().clone())) 60 | } 61 | 62 | /// Reads the next character from the input stream. 63 | /// 64 | /// Returns a reference to the span of the read character if successful, 65 | /// or [`Err`] if something wrong. 66 | fn next_span(&mut self) -> Result<&Span> { 67 | self.next_char_loc()?; 68 | Ok(self.span()) 69 | } 70 | 71 | /// Peeks the next character from the input stream. 72 | /// 73 | /// Does not advance the position of the input stream. 74 | fn peek(&mut self) -> Result> 75 | where 76 | Self::CharType: Clone, 77 | { 78 | let (c, loc) = self.next_char_loc()?; 79 | self.unread((c.clone(), loc)); 80 | Ok(c) 81 | } 82 | 83 | /// Peeks the next character from the input stream. 84 | /// Returns the peeked character and its span. 85 | /// 86 | /// Does not advance the position of the input stream. 87 | fn peek_with_span(&mut self) -> Result<(Option, Span)> 88 | where 89 | Self::CharType: Clone, 90 | { 91 | let (c, loc) = self.next_char_loc()?; 92 | let span = self.span().clone(); 93 | self.unread((c.clone(), loc)); 94 | Ok((c, span)) 95 | } 96 | 97 | /// Skips characters until a character specified by the predicate is encountered. 98 | fn skip_until(&mut self, mut f: F) -> Result<()> 99 | where 100 | Self::CharType: Clone, 101 | F: FnMut(Self::CharType) -> bool, 102 | { 103 | while self.peek()?.map_or(false, |c| !f(c)) { 104 | self.next_char()?; 105 | } 106 | Ok(()) 107 | } 108 | 109 | /// Collects characters into a vector until a character specified by the 110 | /// predicate is encountered. 111 | fn collect_until(&mut self, mut f: F) -> Result> 112 | where 113 | Self::CharType: Clone, 114 | F: FnMut(&Self::CharType) -> bool, 115 | { 116 | let mut v = Vec::new(); 117 | while let Some(c) = self.peek()? { 118 | if f(&c) { 119 | break; 120 | } 121 | v.push(c); 122 | self.next_char()?; 123 | } 124 | Ok(v) 125 | } 126 | 127 | /// Collects characters into a vector until a character specified by the 128 | /// predicate is encountered. 129 | /// 130 | /// Returns the collected vector and its span. 131 | fn collect_with_span_until(&mut self, mut f: F) -> Result<(Vec, Span)> 132 | where 133 | Self::CharType: Clone, 134 | F: FnMut(&Self::CharType) -> bool, 135 | { 136 | let mut v = Vec::new(); 137 | let mut span = match self.peek_with_span()? { 138 | (Some(c), span) if !f(&c) => span, 139 | (_, span) => return Ok((v, span)), 140 | }; 141 | while let Some(c) = self.peek()? 
{ 142 | if f(&c) { 143 | break; 144 | } 145 | v.push(c); 146 | span.update_end(self.next_span()?); 147 | } 148 | Ok((v, span)) 149 | } 150 | } 151 | 152 | #[cfg(test)] 153 | mod test { 154 | use super::*; 155 | use crate::reader::Reader; 156 | 157 | #[test] 158 | fn next_char_or_span() { 159 | let mut reader = Reader::from("123 abc"); 160 | assert_eq!(reader.next_char(), Ok(Some('1'))); 161 | assert_eq!(reader.next_char(), Ok(Some('2'))); 162 | let (c, span) = reader.next_char_span().unwrap(); 163 | assert_eq!(c, Some('3')); 164 | assert_eq!(format!("{span}"), "1:3-1:3"); 165 | let (c, span) = reader.next_char_span().unwrap(); 166 | assert_eq!(c, Some(' ')); 167 | assert_eq!(format!("{span}"), "1:4-1:4"); 168 | assert_eq!(format!("{}", reader.next_span().unwrap()), "1:5-1:5"); 169 | assert_eq!(format!("{}", reader.next_span().unwrap()), "1:6-1:6"); 170 | assert_eq!(reader.next_char(), Ok(Some('c'))); 171 | assert_eq!(reader.next_char(), Ok(None)); 172 | assert_eq!(reader.next_char(), Ok(None)); 173 | } 174 | 175 | #[test] 176 | fn skip_until() { 177 | let mut reader = Reader::from("123 abc"); 178 | assert_eq!(reader.skip_until(|c| c.is_whitespace()), Ok(())); 179 | assert_eq!(reader.next_char(), Ok(Some(' '))); 180 | assert_eq!(reader.next_char(), Ok(Some(' '))); 181 | assert_eq!(reader.next_char(), Ok(Some('a'))); 182 | assert_eq!(reader.next_char(), Ok(Some('b'))); 183 | assert_eq!(reader.next_char(), Ok(Some('c'))); 184 | assert_eq!(reader.next_char(), Ok(None)); 185 | assert_eq!(reader.next_char(), Ok(None)); 186 | } 187 | 188 | #[test] 189 | fn collect_until() { 190 | let mut reader = Reader::from("123 abc"); 191 | assert_eq!(reader.collect_until(|c| *c == '1'), Ok(vec![])); 192 | assert_eq!( 193 | reader.collect_with_span_until(|c| *c == '1').unwrap().0, 194 | vec![] 195 | ); 196 | assert_eq!( 197 | reader.collect_until(|c| c.is_whitespace()), 198 | Ok("123".chars().collect()) 199 | ); 200 | assert_eq!(reader.next_char(), Ok(Some(' '))); 201 | let (s, span) = reader.collect_with_span_until(|_| false).unwrap(); 202 | assert_eq!(s, "abc".chars().collect::>()); 203 | assert_eq!(format!("{span}"), "1:5-1:7"); 204 | assert_eq!(reader.next_char(), Ok(None)); 205 | assert_eq!(reader.next_char(), Ok(None)); 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /src/lexer.rs: -------------------------------------------------------------------------------- 1 | //! Implementations for constructing lexers. 2 | //! 3 | //! This module contains: 4 | //! 5 | //! * [`Tokenize`]: trait for tokenizing token kinds. With feature `macros` 6 | //! enabled, you can derive this trait for token kinds. 7 | //! * [`Lexer`]: a lexer implementation for token kinds that implemented 8 | //! [`Tokenize`] trait. 9 | //! * Some helper functions for constructing lexers. 10 | 11 | use crate::input::InputStream; 12 | use crate::token::{Token, Tokenizer}; 13 | use std::marker::PhantomData; 14 | use std::num::ParseIntError; 15 | 16 | #[cfg(feature = "macros")] 17 | pub use laps_macros::Tokenize; 18 | 19 | /// Trait for token kinds that can be tokenized from an input stream. 20 | pub trait Tokenize: Sized { 21 | /// The type of the character produced by the input stream. 22 | type CharType; 23 | 24 | /// Reads the next token from the given input stream. 25 | /// 26 | /// Returns the token ([`Token`]) if successful, otherwise [`Err`]. 
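///
/// A hand-written implementation sketch (with the `macros` feature this
/// trait is usually derived instead; `MyKind` is a made-up token kind,
/// and `Token::new` is assumed to take a kind and a span):
///
/// ```
/// use laps::input::InputStream;
/// use laps::lexer::Tokenize;
/// use laps::span::Result;
/// use laps::token::Token;
///
/// enum MyKind {
///   Char(char),
///   Eof,
/// }
///
/// impl Tokenize for MyKind {
///   type CharType = char;
///
///   fn next_token<I>(input: &mut I) -> Result<Token<Self>>
///   where
///     I: InputStream<CharType = Self::CharType>,
///   {
///     // one character per token; a real lexer would branch on `c` here
///     let (c, span) = input.next_char_span()?;
///     match c {
///       Some(c) => Ok(Token::new(MyKind::Char(c), span)),
///       None => Ok(Token::new(MyKind::Eof, span)),
///     }
///   }
/// }
/// ```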
27 | fn next_token(input: &mut I) -> crate::span::Result> 28 | where 29 | I: InputStream; 30 | 31 | /// Creates a lexer from the given input stream that 32 | /// produces the current token kind. 33 | fn lexer(input: I) -> Lexer { 34 | Lexer { 35 | input, 36 | token: PhantomData, 37 | } 38 | } 39 | } 40 | 41 | /// A lexer with input stream type `I` and token kind type `K`. 42 | /// 43 | /// This lexer will produce tokens of type [`Token`]. 44 | pub struct Lexer { 45 | input: I, 46 | token: PhantomData, 47 | } 48 | 49 | impl Lexer { 50 | /// Converts the lexer into its inner input stream. 51 | pub fn into_input(self) -> I { 52 | self.input 53 | } 54 | 55 | /// Returns a reference to the inner input stream. 56 | pub fn input(&self) -> &I { 57 | &self.input 58 | } 59 | 60 | /// Returns a mutable reference to the inner input stream. 61 | pub fn input_mut(&mut self) -> &mut I { 62 | &mut self.input 63 | } 64 | } 65 | 66 | impl Tokenizer for Lexer 67 | where 68 | I: InputStream, 69 | K: Tokenize, 70 | { 71 | type Token = Token; 72 | 73 | fn next_token(&mut self) -> crate::span::Result { 74 | K::next_token(&mut self.input) 75 | } 76 | } 77 | 78 | /// Parses integer literals from the given string. 79 | /// Supports decimal, binary, hexadecimal and octal. 80 | /// 81 | /// Returns the integer if successful, otherwise returns [`None`]. 82 | /// 83 | /// # Examples 84 | /// 85 | /// ``` 86 | /// use laps::lexer::int_literal; 87 | /// 88 | /// assert_eq!(int_literal("0"), Some(0)); 89 | /// assert_eq!(int_literal("00"), Some(0)); 90 | /// assert_eq!(int_literal("42"), Some(42)); 91 | /// assert_eq!(int_literal("0x1a"), Some(26)); 92 | /// assert_eq!(int_literal("0b0110"), Some(6)); 93 | /// assert_eq!(int_literal("0o777"), Some(511)); 94 | /// assert_eq!(int_literal::("z"), None); 95 | /// assert_eq!(int_literal::("0f"), None); 96 | /// assert_eq!(int_literal::("0b777"), None); 97 | /// ``` 98 | pub fn int_literal(s: &str) -> Option 99 | where 100 | T: IntLiteral, 101 | { 102 | // check if is a valid integer literal 103 | let mut chars = s.chars(); 104 | let (radix, starts_from) = match (chars.next(), chars.next()) { 105 | (Some('0'), Some(c)) if "box".contains(c) => ( 106 | match c { 107 | 'b' => 2, 108 | 'o' => 8, 109 | 'x' => 16, 110 | _ => unreachable!(), 111 | }, 112 | 2, 113 | ), 114 | (Some(c), None) if c.is_ascii_digit() => (10, 0), 115 | (Some(c1), Some(c2)) if c1.is_ascii_digit() && c2.is_ascii_digit() => (10, 0), 116 | _ => return None, 117 | }; 118 | if !chars.all(|c| c.is_digit(radix)) { 119 | return None; 120 | } 121 | // convert to integer 122 | T::from_str_radix(&s[starts_from..], radix).ok() 123 | } 124 | 125 | /// Parses integer literals with an optional sign from the given string. 126 | /// Supports decimal, binary, hexadecimal and octal. 127 | /// 128 | /// Returns the integer if successful, otherwise returns [`None`]. 
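///
/// For unsigned target types, a leading `-` is applied with wrapping
/// (two's-complement) negation rather than rejected, as the `u32` example
/// below shows.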
129 | /// 130 | /// # Examples 131 | /// 132 | /// ``` 133 | /// use laps::lexer::signed_int_literal; 134 | /// 135 | /// assert_eq!(signed_int_literal("0"), Some(0)); 136 | /// assert_eq!(signed_int_literal("+00"), Some(0)); 137 | /// assert_eq!(signed_int_literal("-42"), Some(-42)); 138 | /// assert_eq!(signed_int_literal("-0x1a"), Some(-26)); 139 | /// assert_eq!(signed_int_literal("0b0110"), Some(6)); 140 | /// assert_eq!(signed_int_literal("+0o777"), Some(511)); 141 | /// assert_eq!(signed_int_literal::("-1"), Some(u32::MAX)); 142 | /// assert_eq!(signed_int_literal::("+"), None); 143 | /// assert_eq!(signed_int_literal::("--1"), None); 144 | /// assert_eq!(signed_int_literal::("-0b777"), None); 145 | /// ``` 146 | pub fn signed_int_literal(s: &str) -> Option 147 | where 148 | T: IntLiteral, 149 | { 150 | let first = s.chars().next()?; 151 | if first == '+' || first == '-' { 152 | int_literal(&s[1..]).map(|n: T| if first == '-' { n.wrapping_neg() } else { n }) 153 | } else { 154 | int_literal(s) 155 | } 156 | } 157 | 158 | /// A helper trait for function [`int_literal`]. 159 | /// 160 | /// Users are not allowed to implement this trait for other types. 161 | pub trait IntLiteral: Sized + sealed_traits::SealedIntLiteral { 162 | /// Converts a string slice in a given base to an integer. 163 | /// 164 | /// This is identical to `from_str_radix` method of primitive integer types, 165 | /// such as [`i32::from_str_radix`](i32#method.from_str_radix). 166 | fn from_str_radix(s: &str, radix: u32) -> Result; 167 | 168 | /// Wrapping negates the current number. 169 | fn wrapping_neg(self) -> Self; 170 | } 171 | 172 | /// Helper macro for implementing `IntLiteral` for integers. 173 | macro_rules! impl_int_literal { 174 | ($ty:ty) => { 175 | impl IntLiteral for $ty { 176 | fn from_str_radix(s: &str, radix: u32) -> Result { 177 | <$ty>::from_str_radix(s, radix) 178 | } 179 | 180 | fn wrapping_neg(self) -> Self { 181 | self.wrapping_neg() 182 | } 183 | } 184 | }; 185 | } 186 | 187 | impl_int_literal!(i8); 188 | impl_int_literal!(i16); 189 | impl_int_literal!(i32); 190 | impl_int_literal!(i64); 191 | impl_int_literal!(i128); 192 | impl_int_literal!(isize); 193 | impl_int_literal!(u8); 194 | impl_int_literal!(u16); 195 | impl_int_literal!(u32); 196 | impl_int_literal!(u64); 197 | impl_int_literal!(u128); 198 | impl_int_literal!(usize); 199 | 200 | /// Sealed trait for trait `IntLiteral`. 201 | mod sealed_traits { 202 | pub trait SealedIntLiteral {} 203 | impl SealedIntLiteral for i8 {} 204 | impl SealedIntLiteral for i16 {} 205 | impl SealedIntLiteral for i32 {} 206 | impl SealedIntLiteral for i64 {} 207 | impl SealedIntLiteral for i128 {} 208 | impl SealedIntLiteral for isize {} 209 | impl SealedIntLiteral for u8 {} 210 | impl SealedIntLiteral for u16 {} 211 | impl SealedIntLiteral for u32 {} 212 | impl SealedIntLiteral for u64 {} 213 | impl SealedIntLiteral for u128 {} 214 | impl SealedIntLiteral for usize {} 215 | } 216 | 217 | /// Parses string literals (`"..."`) from the given string. 218 | /// 219 | /// Supported escapes: 220 | /// * `\r`, `\n`, `\t`, `\0`, `\\`. 221 | /// * `\'`, `\"`. 222 | /// * `\x00`-`\xff` (`\xFF`). 223 | /// * `\u{0}`-`\u{d7ff}` and `\u{e000}`-`\u{10ffff}` (`\u{10FFFF}`). 224 | /// 225 | /// Returns the string if successful, otherwise returns [`None`]. 
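///
/// Note that `\xNN` escapes are interpreted as the byte value cast to
/// `char`, i.e. the Unicode code point `U+00NN`, so `"\xff"` parses to
/// `"\u{ff}"`.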
226 | /// 227 | /// # Examples 228 | /// 229 | /// ``` 230 | /// use laps::lexer::str_literal; 231 | /// 232 | /// assert_eq!(str_literal(r#""hello""#), Some("hello".into())); 233 | /// assert_eq!(str_literal(r#""你好""#), Some("你好".into())); 234 | /// assert_eq!(str_literal(r#""""#), Some("".into())); 235 | /// assert_eq!(str_literal(r#""\"\n\t\\""#), Some("\"\n\t\\".into())); 236 | /// assert_eq!(str_literal(r#""#), None); 237 | /// assert_eq!(str_literal(r#""hello"#), None); 238 | /// ``` 239 | pub fn str_literal(s: &str) -> Option { 240 | let mut chars = s.chars(); 241 | // check the first quote 242 | if chars.next()? != '"' { 243 | return None; 244 | } 245 | // get string literal 246 | let mut s = String::new(); 247 | loop { 248 | match parse_char_literal(&mut chars, '"') { 249 | ParseResult::Char(c) => s.push(c), 250 | ParseResult::Quote => break, 251 | ParseResult::Error => return None, 252 | } 253 | } 254 | // check the last quote 255 | chars.next().is_none().then_some(s) 256 | } 257 | 258 | /// Parses character literals (`'...'`) from the given string. 259 | /// 260 | /// Supported escapes: 261 | /// * `\r`, `\n`, `\t`, `\0`, `\\`. 262 | /// * `\'`, `\"`. 263 | /// * `\x00`-`\xff` (`\xFF`). 264 | /// * `\u{0}`-`\u{d7ff}` and `\u{e000}`-`\u{10ffff}` (`\u{10FFFF}`). 265 | /// 266 | /// Returns the character if successful, otherwise returns [`None`]. 267 | /// 268 | /// # Examples 269 | /// 270 | /// ``` 271 | /// use laps::lexer::char_literal; 272 | /// 273 | /// assert_eq!(char_literal(r#"'a'"#), Some('a')); 274 | /// assert_eq!(char_literal(r#"'😂'"#), Some('😂')); 275 | /// assert_eq!(char_literal(r#"'\n'"#), Some('\n')); 276 | /// assert_eq!(char_literal(r#""#), None); 277 | /// assert_eq!(char_literal(r#"''"#), None); 278 | /// assert_eq!(char_literal(r#"'a"#), None); 279 | /// ``` 280 | pub fn char_literal(s: &str) -> Option { 281 | let mut chars = s.chars(); 282 | // check the first quote 283 | if chars.next()? != '\'' { 284 | return None; 285 | } 286 | // get character literal 287 | match parse_char_literal(&mut chars, '\'') { 288 | ParseResult::Char(c) => ((chars.next(), chars.next()) == (Some('\''), None)).then_some(c), 289 | _ => None, 290 | } 291 | } 292 | 293 | /// Parses a char literal (do not include quotes) 294 | /// from the given character iterator. 
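///
/// Returns `ParseResult::Char` for an ordinary or successfully escaped
/// character, `ParseResult::Quote` when the unescaped `quote` character
/// itself is read, and `ParseResult::Error` on malformed escapes, bare
/// `\r`/`\n`/`\t`, or end of input.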
257 | 
258 | /// Parses character literals (`'...'`) from the given string.
259 | ///
260 | /// Supported escapes:
261 | /// * `\r`, `\n`, `\t`, `\0`, `\\`.
262 | /// * `\'`, `\"`.
263 | /// * `\x00`-`\xff` (`\xFF`).
264 | /// * `\u{0}`-`\u{d7ff}` and `\u{e000}`-`\u{10ffff}` (`\u{10FFFF}`).
265 | ///
266 | /// Returns the character if successful, otherwise returns [`None`].
267 | ///
268 | /// # Examples
269 | ///
270 | /// ```
271 | /// use laps::lexer::char_literal;
272 | ///
273 | /// assert_eq!(char_literal(r#"'a'"#), Some('a'));
274 | /// assert_eq!(char_literal(r#"'😂'"#), Some('😂'));
275 | /// assert_eq!(char_literal(r#"'\n'"#), Some('\n'));
276 | /// assert_eq!(char_literal(r#""#), None);
277 | /// assert_eq!(char_literal(r#"''"#), None);
278 | /// assert_eq!(char_literal(r#"'a"#), None);
279 | /// ```
280 | pub fn char_literal(s: &str) -> Option<char> {
281 |   let mut chars = s.chars();
282 |   // check the first quote
283 |   if chars.next()? != '\'' {
284 |     return None;
285 |   }
286 |   // get character literal
287 |   match parse_char_literal(&mut chars, '\'') {
288 |     ParseResult::Char(c) => ((chars.next(), chars.next()) == (Some('\''), None)).then_some(c),
289 |     _ => None,
290 |   }
291 | }
292 | 
293 | /// Parses a char literal (without the surrounding quotes)
294 | /// from the given character iterator.
295 | fn parse_char_literal<I>(iter: &mut I, quote: char) -> ParseResult
296 | where
297 |   I: Iterator<Item = char>,
298 | {
299 |   match iter.next() {
300 |     Some('\n') | Some('\r') | Some('\t') => ParseResult::Error,
301 |     Some('\\') => match iter.next() {
302 |       Some('r') => ParseResult::Char('\r'),
303 |       Some('n') => ParseResult::Char('\n'),
304 |       Some('t') => ParseResult::Char('\t'),
305 |       Some('0') => ParseResult::Char('\0'),
306 |       Some('\\') => ParseResult::Char('\\'),
307 |       Some('\'') => ParseResult::Char('\''),
308 |       Some('\"') => ParseResult::Char('\"'),
309 |       Some('x') => {
310 |         // get escaped char
311 |         let c = iter
312 |           .next()
313 |           .and_then(|c| c.to_digit(16))
314 |           .zip(iter.next().and_then(|c| c.to_digit(16)))
315 |           .map(|(c1, c2)| (c1 * 16 + c2) as u8 as char);
316 |         match c {
317 |           Some(c) => ParseResult::Char(c),
318 |           None => ParseResult::Error,
319 |         }
320 |       }
321 |       Some('u') => {
322 |         // check '{'
323 |         if iter.next() != Some('{') {
324 |           return ParseResult::Error;
325 |         }
326 |         // get hex value
327 |         let mut hex = 0u32;
328 |         for c in iter {
329 |           match c.to_digit(16) {
330 |             Some(h) => match hex.checked_mul(16) {
331 |               Some(h16) => hex = h16 + h,
332 |               None => break,
333 |             },
334 |             None if c == '}' => match char::from_u32(hex) {
335 |               Some(c) => return ParseResult::Char(c),
336 |               None => break,
337 |             },
338 |             None => break,
339 |           }
340 |         }
341 |         ParseResult::Error
342 |       }
343 |       _ => ParseResult::Error,
344 |     },
345 |     Some(c) if c == quote => ParseResult::Quote,
346 |     Some(c) => ParseResult::Char(c),
347 |     None => ParseResult::Error,
348 |   }
349 | }
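The `\u{...}` arm above leans on two standard-library guarantees: `checked_mul` catches accumulated values that overflow `u32`, and `char::from_u32` rejects surrogates and out-of-range code points. A standalone illustration:

```rust
fn main() {
  // Valid scalar values convert; surrogates and values above U+10FFFF do
  // not, which is exactly why `\u{d800}` fails in `parse_char_literal`.
  assert_eq!(char::from_u32(0xd7ff), Some('\u{d7ff}'));
  assert_eq!(char::from_u32(0xd800), None);
  assert_eq!(char::from_u32(0x110000), None);
  // Nine hex digits (as in `\u{111111111}`) overflow `u32` while being
  // accumulated; `checked_mul` reports the overflow.
  assert_eq!(0xfffffff0u32.checked_mul(16), None);
}
```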
350 | 
351 | /// Result type of `parse_char_literal`.
352 | enum ParseResult {
353 |   Char(char),
354 |   Quote,
355 |   Error,
356 | }
357 | 
358 | #[cfg(test)]
359 | mod test {
360 |   use super::*;
361 | 
362 |   #[test]
363 |   fn parse_int() {
364 |     assert_eq!(int_literal("123"), Some(123));
365 |     assert_eq!(int_literal("0"), Some(0));
366 |     assert_eq!(int_literal("000"), Some(0));
367 |     assert_eq!(int_literal("0x0"), Some(0x0));
368 |     assert_eq!(int_literal("0xFf"), Some(0xff));
369 |     assert_eq!(int_literal("0b110"), Some(0b110));
370 |     assert_eq!(int_literal("0o765"), Some(0o765));
371 |     assert_eq!(int_literal::<i32>(""), None);
372 |     assert_eq!(int_literal::<i32>("?"), None);
373 |     assert_eq!(int_literal::<i32>("0x?"), None);
374 |     assert_eq!(int_literal::<i32>("99999999999999999999999999999999"), None);
375 |   }
376 | 
377 |   #[test]
378 |   fn parse_str() {
379 |     assert_eq!(str_literal(r#""""#), Some("".into()));
380 |     assert_eq!(str_literal(r#""a""#), Some("a".into()));
381 |     assert_eq!(str_literal(r#""🤡👈""#), Some("🤡👈".into()));
382 |     assert_eq!(str_literal(r#""\t""#), Some("\t".into()));
383 |     assert_eq!(str_literal(r#""\n""#), Some("\n".into()));
384 |     assert_eq!(str_literal(r#""\r""#), Some("\r".into()));
385 |     assert_eq!(str_literal(r#""\\r""#), Some("\\r".into()));
386 |     assert_eq!(str_literal(r#""\'""#), Some("\'".into()));
387 |     assert_eq!(str_literal(r#""\"""#), Some("\"".into()));
388 |     assert_eq!(str_literal(r#""\x4a""#), Some("\x4a".into()));
389 |     assert_eq!(str_literal(r#""\u{1234}""#), Some("\u{1234}".into()));
390 |     assert_eq!(
391 |       str_literal(r#""\u{1234}\u{5678}""#),
392 |       Some("\u{1234}\u{5678}".into())
393 |     );
394 |     assert_eq!(str_literal(r#""\u{10ffff}""#), Some("\u{10ffff}".into()));
395 |     assert_eq!(str_literal(r#""a\x4aa""#), Some("a\x4aa".into()));
396 |     assert_eq!(str_literal(r#""'""#), Some("'".into()));
397 |     assert_eq!(str_literal(r#"?"#), None);
398 |     assert_eq!(str_literal(r#"""#), None);
399 |     assert_eq!(str_literal(r#""aa"#), None);
400 |     assert_eq!(str_literal(r#""\"#), None);
401 |     assert_eq!(
402 |       str_literal(
403 |         r#""
404 | ""#
405 |       ),
406 |       None,
407 |     );
408 |     assert_eq!(
409 |       str_literal(
410 |         r#""aa
411 | ""#
412 |       ),
413 |       None,
414 |     );
415 |     assert_eq!(str_literal(r#""\?""#), None);
416 |     assert_eq!(str_literal(r#""\x""#), None);
417 |     assert_eq!(str_literal(r#""\x4""#), None);
418 |     assert_eq!(str_literal(r#""\u""#), None);
419 |     assert_eq!(str_literal(r#""\u{""#), None);
420 |     assert_eq!(str_literal(r#""\u{111111111""#), None);
421 |     assert_eq!(str_literal(r#""\u{111111111}""#), None);
422 |     assert_eq!(str_literal(r#""\u{d800}""#), None);
423 |     assert_eq!(str_literal(r#""\u{dfff}""#), None);
424 |   }
425 | 
426 |   #[test]
427 |   fn parse_char() {
428 |     assert_eq!(char_literal("'a'"), Some('a'));
429 |     assert_eq!(char_literal("'🤔'"), Some('🤔'));
430 |     assert_eq!(char_literal(r"'\t'"), Some('\t'));
431 |     assert_eq!(char_literal(r"'\n'"), Some('\n'));
432 |     assert_eq!(char_literal(r"'\r'"), Some('\r'));
433 |     assert_eq!(char_literal(r"'\\'"), Some('\\'));
434 |     assert_eq!(char_literal(r"'\''"), Some('\''));
435 |     assert_eq!(char_literal(r#"'\"'"#), Some('\"'));
436 |     assert_eq!(char_literal(r"'\x4a'"), Some('\x4a'));
437 |     assert_eq!(char_literal(r"'\u{1234}'"), Some('\u{1234}'));
438 |     assert_eq!(char_literal(r"'\u{10ffff}'"), Some('\u{10ffff}'));
439 |     assert_eq!(char_literal(r#"'"'"#), Some('"'));
440 |     assert_eq!(char_literal("?"), None);
441 |     assert_eq!(char_literal("'"), None);
442 |     assert_eq!(char_literal("''"), None);
443 |     assert_eq!(char_literal("'a"), None);
444 |     assert_eq!(char_literal("'ab"), None);
445 |     assert_eq!(char_literal("'ab'"), None);
446 |     assert_eq!(
447 |       char_literal(
448 |         r#"'
449 | '"#
450 |       ),
451 |       None,
452 |     );
453 |     assert_eq!(
454 |       char_literal(
455 |         r#"'a
456 | '"#
457 |       ),
458 |       None,
459 |     );
460 |     assert_eq!(char_literal(r"'\'"), None);
461 |     assert_eq!(char_literal(r"'\?'"), None);
462 |     assert_eq!(char_literal(r"'\x'"), None);
463 |     assert_eq!(char_literal(r"'\x4'"), None);
464 |     assert_eq!(char_literal(r"'\u'"), None);
465 |     assert_eq!(char_literal(r"'\u{'"), None);
466 |     assert_eq!(char_literal(r"'\u{111111111'"), None);
467 |     assert_eq!(char_literal(r"'\u{111111111}'"), None);
468 |     assert_eq!(char_literal(r"'\u{d800}'"), None);
469 |     assert_eq!(char_literal(r"'\u{dfff}'"), None);
470 |   }
471 | }
472 | 
--------------------------------------------------------------------------------
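These helpers are designed to be plugged into a `Tokenize` derive as converter functions, in the same way the crate's `clike` example uses `str_literal`. A sketch (the token kind and regexes here are simplified and illustrative, not taken from the source above):

```rust
use laps::lexer::{int_literal, str_literal};
use laps::prelude::*;

#[token_kind]
#[derive(Debug, Tokenize)]
enum TokenKind {
  #[skip(r"\s+")]
  _Skip,
  /// Integer literal, converted from the matched text by `int_literal`.
  #[regex(r"[0-9]+|0x[0-9a-fA-F]+", int_literal)]
  Int(u64),
  /// String literal, converted from the matched text by `str_literal`.
  #[regex(r#""[^"\\]*(\\.[^"\\]*)*""#, str_literal)]
  Str(String),
  /// End-of-file.
  #[eof]
  Eof,
}
```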
/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![cfg_attr(docsrs, feature(doc_auto_cfg))]
2 | 
3 | //! Lexer and parser collections.
4 | //!
5 | //! With `laps`, you can build lexers/parsers by just defining tokens/ASTs
6 | //! and deriving the [`Tokenize`](lexer::Tokenize)/[`Parse`](parse::Parse)
7 | //! trait for them.
8 | //!
9 | //! # Example
10 | //!
11 | //! Implement a lexer for
12 | //! [S-expression](https://en.wikipedia.org/wiki/S-expression):
13 | //!
14 | #![cfg_attr(not(feature = "macros"), doc = " ```ignore")]
15 | #![cfg_attr(feature = "macros", doc = " ```")]
16 | //! # fn main() {}
17 | //! use laps::prelude::*;
18 | //!
19 | //! #[token_kind]
20 | //! #[derive(Debug, Tokenize)]
21 | //! enum TokenKind {
22 | //!   // This token will be skipped.
23 | //!   #[skip(r"\s+")]
24 | //!   _Skip,
25 | //!   /// Parentheses.
26 | //!   #[regex(r"[()]")]
27 | //!   Paren(char),
28 | //!   /// Atom.
29 | //!   #[regex(r"[^\s()]+")]
30 | //!   Atom(String),
31 | //!   /// End-of-file.
32 | //!   #[eof]
33 | //!   Eof,
34 | //! }
35 | //! ```
36 | //!
37 | //! And the parser and [ASTs](https://en.wikipedia.org/wiki/Abstract_syntax_tree)
38 | //! (or actually [CSTs](https://en.wikipedia.org/wiki/Parse_tree)):
39 | //!
40 | #![cfg_attr(not(feature = "macros"), doc = " ```ignore")]
41 | #![cfg_attr(feature = "macros", doc = " ```")]
42 | //! # fn main() {}
43 | //! # use laps::prelude::*;
44 | //! # #[token_kind]
45 | //! # #[derive(Debug, Tokenize)]
46 | //! # enum TokenKind {
47 | //! #   // This token will be skipped.
48 | //! #   #[skip(r"\s+")]
49 | //! #   _Skip,
50 | //! #   /// Parentheses.
51 | //! #   #[regex(r"[()]")]
52 | //! #   Paren(char),
53 | //! #   /// Atom.
54 | //! #   #[regex(r"[^\s()]+")]
55 | //! #   Atom(String),
56 | //! #   /// End-of-file.
57 | //! #   #[eof]
58 | //! #   Eof,
59 | //! # }
60 | //! type Token = laps::token::Token<TokenKind>;
61 | //!
62 | //! token_ast! {
63 | //!   macro Token<TokenKind> {
64 | //!     [atom] => { kind: TokenKind::Atom(_), prompt: "atom" },
65 | //!     [lpr] => { kind: TokenKind::Paren('(') },
66 | //!     [rpr] => { kind: TokenKind::Paren(')') },
67 | //!     [eof] => { kind: TokenKind::Eof },
68 | //!   }
69 | //! }
70 | //!
71 | //! #[derive(Parse)]
72 | //! #[token(Token)]
73 | //! enum Statement {
74 | //!   Elem(Elem),
75 | //!   End(Token![eof]),
76 | //! }
77 | //!
78 | //! #[derive(Parse)]
79 | //! #[token(Token)]
80 | //! struct SExp(Token![lpr], Vec<Elem>, Token![rpr]);
81 | //!
82 | //! #[derive(Parse)]
83 | //! #[token(Token)]
84 | //! enum Elem {
85 | //!   Atom(Token![atom]),
86 | //!   SExp(SExp),
87 | //! }
88 | //! ```
89 | //!
90 | //! The above implementation is very close in form to the corresponding
91 | //! EBNF representation of the S-expression:
92 | //!
93 | //! ```text
94 | //! Statement ::= Elem | EOF;
95 | //! SExp ::= "(" {Elem} ")";
96 | //! Elem ::= ATOM | SExp;
97 | //! ```
98 | //!
99 | //! # More Examples
100 | //!
101 | //! See the
102 | //! [`examples` directory](https://github.com/MaxXSoft/laps/tree/master/examples),
103 | //! which contains the following examples:
104 | //!
105 | //! * [`sexp`](https://github.com/MaxXSoft/laps/tree/master/examples/sexp):
106 | //!   an [S-expression](https://en.wikipedia.org/wiki/S-expression) parser.
107 | //! * [`calc`](https://github.com/MaxXSoft/laps/tree/master/examples/calc):
108 | //!   a simple expression calculator.
109 | //! * [`json`](https://github.com/MaxXSoft/laps/tree/master/examples/json):
110 | //!   a simple JSON parser.
111 | //! * [`clike`](https://github.com/MaxXSoft/laps/tree/master/examples/clike):
112 | //!   an interpreter for a C-like programming language.
113 | //!
114 | //! # Accelerating Code Completion for IDEs
115 | //!
116 | //! By default, Cargo does not enable optimizations for procedural macros,
117 | //! which may result in slower code completion if you are using `laps` to
118 | //! generate lexers. To avoid this, you can add the following configuration
119 | //! to `Cargo.toml`:
120 | //!
121 | //! ```toml
122 | //! [profile.dev.build-override]
123 | //! opt-level = 3
124 | //! ```
125 | //!
126 | //! You can also try to manually enable/disable parallelization for lexer
127 | //! generation by adding:
128 | //!
129 | #![cfg_attr(not(feature = "macros"), doc = " ```ignore")]
130 | #![cfg_attr(feature = "macros", doc = " ```")]
131 | //! # fn main() {}
132 | //! # use laps::prelude::*;
133 | //! #[derive(Tokenize)]
134 | //! #[enable_par(true)] // or #[enable_par(false)]
135 | //! enum TokenKind {
136 | //!   // ...
137 | //!   # #[regex(r"[^\s()]+")]
138 | //!   # Atom(String),
139 | //!   # #[eof]
140 | //!   # Eof,
141 | //! }
142 | //! ```
143 | 
144 | pub mod ast;
145 | pub mod input;
146 | pub mod lexer;
147 | pub mod parse;
148 | pub mod reader;
149 | pub mod span;
150 | pub mod token;
151 | 
152 | /// A prelude of some common traits and macros (if feature `macros` is
153 | /// enabled) in [`laps`](crate).
154 | ///
155 | /// ```
156 | /// use laps::prelude::*;
157 | /// ```
158 | pub mod prelude {
159 |   pub use crate::input::InputStream;
160 |   pub use crate::lexer::Tokenize;
161 |   pub use crate::parse::Parse;
162 |   pub use crate::span::Spanned;
163 |   pub use crate::token::{TokenStream, Tokenizer};
164 | 
165 |   #[cfg(feature = "macros")]
166 |   pub use crate::token::{token_ast, token_kind};
167 | }
--------------------------------------------------------------------------------
/src/parse.rs:
--------------------------------------------------------------------------------
1 | //! Implementations for constructing parsers.
2 | //!
3 | //! This module contains the [`Parse`] trait, which can be implemented
4 | //! for all types that can be parsed from a token stream, such as ASTs.
5 | 
6 | use crate::span::Result;
7 | use crate::token::TokenStream;
8 | 
9 | #[cfg(feature = "macros")]
10 | pub use laps_macros::Parse;
11 | 
12 | /// Parsing trait for all types that can be parsed from a token stream.
13 | pub trait Parse<TS>: Sized
14 | where
15 |   TS: TokenStream,
16 | {
17 |   /// Parses an instance of the current type from the given token stream.
18 |   fn parse(tokens: &mut TS) -> Result<Self>;
19 | 
20 |   /// Checks if the current type may be parsed from the given token stream.
21 |   ///
22 |   /// Does not advance the position of the token stream.
23 |   fn maybe(tokens: &mut TS) -> Result<bool>;
24 | }
25 | 
26 | impl<TS, T> Parse<TS> for Box<T>
27 | where
28 |   TS: TokenStream,
29 |   T: Parse<TS>,
30 | {
31 |   fn parse(tokens: &mut TS) -> Result<Self> {
32 |     tokens.parse().map(Box::new)
33 |   }
34 | 
35 |   fn maybe(tokens: &mut TS) -> Result<bool> {
36 |     T::maybe(tokens)
37 |   }
38 | }
39 | 
40 | impl<TS, T> Parse<TS> for Option<T>
41 | where
42 |   TS: TokenStream,
43 |   T: Parse<TS>,
44 | {
45 |   fn parse(tokens: &mut TS) -> Result<Self> {
46 |     T::maybe(tokens)?.then(|| tokens.parse()).transpose()
47 |   }
48 | 
49 |   fn maybe(_: &mut TS) -> Result<bool> {
50 |     Ok(true)
51 |   }
52 | }
53 | 
54 | impl<TS, T> Parse<TS> for Vec<T>
55 | where
56 |   TS: TokenStream,
57 |   T: Parse<TS>,
58 | {
59 |   fn parse(tokens: &mut TS) -> Result<Self> {
60 |     let mut ts = Vec::new();
61 |     while T::maybe(tokens)? {
62 |       ts.push(tokens.parse()?);
63 |     }
64 |     Ok(ts)
65 |   }
66 | 
67 |   fn maybe(_: &mut TS) -> Result<bool> {
68 |     Ok(true)
69 |   }
70 | }
71 | 
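Beyond these blanket impls, `Parse` can also be implemented by hand when the derive macro is not a good fit. A sketch of a hypothetical generic pair node, following the same shape as the impls above:

```rust
use laps::parse::Parse;
use laps::span::Result;
use laps::token::TokenStream;

/// Two consecutive `T`s (a hypothetical AST helper, not part of `laps`).
struct Pair<T>(T, T);

impl<TS, T> Parse<TS> for Pair<T>
where
  TS: TokenStream,
  T: Parse<TS>,
{
  fn parse(tokens: &mut TS) -> Result<Self> {
    // parse the two elements in order
    Ok(Self(tokens.parse()?, tokens.parse()?))
  }

  fn maybe(tokens: &mut TS) -> Result<bool> {
    // a pair may start wherever its first element may start
    T::maybe(tokens)
  }
}
```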
impl_for_tuple { 74 | ($t:ident $($ts:ident)*) => { 75 | impl Parse for ($t, $($ts,)*) 76 | where 77 | TS: TokenStream, 78 | $t: Parse, 79 | $($ts: Parse,)* 80 | { 81 | fn parse(tokens: &mut TS) -> Result { 82 | Ok((tokens.parse()?, $(tokens.parse::<$ts>()?,)*)) 83 | } 84 | 85 | fn maybe(tokens: &mut TS) -> Result { 86 | $t::maybe(tokens) 87 | } 88 | } 89 | }; 90 | } 91 | 92 | impl_for_tuple!(A); 93 | impl_for_tuple!(A B); 94 | impl_for_tuple!(A B C); 95 | impl_for_tuple!(A B C D); 96 | impl_for_tuple!(A B C D E); 97 | impl_for_tuple!(A B C D E F); 98 | impl_for_tuple!(A B C D E F G); 99 | impl_for_tuple!(A B C D E F G H); 100 | impl_for_tuple!(A B C D E F G H I); 101 | impl_for_tuple!(A B C D E F G H I J); 102 | impl_for_tuple!(A B C D E F G H I J K); 103 | impl_for_tuple!(A B C D E F G H I J K L); 104 | impl_for_tuple!(A B C D E F G H I J K L M); 105 | impl_for_tuple!(A B C D E F G H I J K L M N); 106 | impl_for_tuple!(A B C D E F G H I J K L M N O); 107 | impl_for_tuple!(A B C D E F G H I J K L M N O P); 108 | impl_for_tuple!(A B C D E F G H I J K L M N O P Q); 109 | impl_for_tuple!(A B C D E F G H I J K L M N O P Q R); 110 | impl_for_tuple!(A B C D E F G H I J K L M N O P Q R S); 111 | impl_for_tuple!(A B C D E F G H I J K L M N O P Q R S T); 112 | impl_for_tuple!(A B C D E F G H I J K L M N O P Q R S T U); 113 | impl_for_tuple!(A B C D E F G H I J K L M N O P Q R S T U V); 114 | impl_for_tuple!(A B C D E F G H I J K L M N O P Q R S T U V W); 115 | impl_for_tuple!(A B C D E F G H I J K L M N O P Q R S T U V W X); 116 | impl_for_tuple!(A B C D E F G H I J K L M N O P Q R S T U V W X Y); 117 | impl_for_tuple!(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z); 118 | -------------------------------------------------------------------------------- /src/reader.rs: -------------------------------------------------------------------------------- 1 | //! Reader related implementations for lexers. 2 | //! 3 | //! Reader implements [`InputStream`] trait, and it can read and buffer 4 | //! characters and their corresponding spans from any types that implement 5 | //! the [`Read`] trait. 6 | //! 7 | //! This module contains two kinds of readers: [`Reader`] will try to read 8 | //! UTF-8 characters from the stream, and will report fatal error if there are 9 | //! no valid UTF-8 characters. [`ByteReader`] will read bytes from the stream. 10 | 11 | use crate::input::InputStream; 12 | use crate::log_raw_fatal_error; 13 | use crate::span::{FileType, Location, Result, Span}; 14 | use std::fs::File; 15 | use std::io::{self, stdin, Cursor, Read, Stdin}; 16 | use std::path::Path; 17 | use std::str::{from_utf8, from_utf8_unchecked}; 18 | 19 | /// Size of the byte buffer. 20 | const BYTE_BUFFER_SIZE: usize = 1024; 21 | 22 | /// A generic UTF-8 character reader for lexers. 23 | /// 24 | /// The generic parameter `BUFFER_SIZE` specifies the size of the internal 25 | /// buffer of [`Reader`]. 26 | /// 27 | /// [`Reader`] always tries to read UTF-8 characters from the stream. 28 | /// If there are no valid UTF-8 characters, [`Reader`] will return a 29 | /// fatal error ([`Error::Fatal`](crate::span::Error::Fatal)). 30 | pub struct Reader { 31 | reader: T, 32 | span: Span, 33 | 34 | // Buffers in `Reader`: 35 | // Read bytes to buffer `byte_buf`, start at offset `byte_buf_offset`, 36 | // then convert bytes to UTF-8 characters and stores them into `char_buf`. 37 | // If there are some remaining bytes can not be converted, move them to the 38 | // begining of the `byte_buf`, and update `byte_buf_offset`. 
/src/reader.rs:
--------------------------------------------------------------------------------
1 | //! Reader related implementations for lexers.
2 | //!
3 | //! Readers implement the [`InputStream`] trait, and can read and buffer
4 | //! characters and their corresponding spans from any type that implements
5 | //! the [`Read`] trait.
6 | //!
7 | //! This module contains two kinds of readers: [`Reader`] will try to read
8 | //! UTF-8 characters from the stream, and will report a fatal error if there
9 | //! are no valid UTF-8 characters. [`ByteReader`] will read bytes from the stream.
10 | 
11 | use crate::input::InputStream;
12 | use crate::log_raw_fatal_error;
13 | use crate::span::{FileType, Location, Result, Span};
14 | use std::fs::File;
15 | use std::io::{self, stdin, Cursor, Read, Stdin};
16 | use std::path::Path;
17 | use std::str::{from_utf8, from_utf8_unchecked};
18 | 
19 | /// Size of the byte buffer.
20 | const BYTE_BUFFER_SIZE: usize = 1024;
21 | 
22 | /// A generic UTF-8 character reader for lexers.
23 | ///
24 | /// The generic parameter `BUFFER_SIZE` specifies the size of the internal
25 | /// buffer of [`Reader`].
26 | ///
27 | /// [`Reader`] always tries to read UTF-8 characters from the stream.
28 | /// If there are no valid UTF-8 characters, [`Reader`] will return a
29 | /// fatal error ([`Error::Fatal`](crate::span::Error::Fatal)).
30 | pub struct Reader<T, const BUFFER_SIZE: usize = BYTE_BUFFER_SIZE> {
31 |   reader: T,
32 |   span: Span,
33 | 
34 |   // Buffers in `Reader`:
35 |   // Read bytes into buffer `byte_buf`, starting at offset `byte_buf_offset`,
36 |   // then convert the bytes to UTF-8 characters and store them in `char_buf`.
37 |   // If some remaining bytes cannot be converted yet, move them to the
38 |   // beginning of `byte_buf` and update `byte_buf_offset`.
39 |   char_buf: Vec<char>,
40 |   byte_buf: Box<[u8; BUFFER_SIZE]>,
41 |   byte_buf_offset: usize,
42 | }
43 | 
44 | impl<T, const BUFFER_SIZE: usize> Reader<T, BUFFER_SIZE> {
45 |   /// Creates a new reader with the given inner reader and file type.
46 |   pub fn new(reader: T, file_type: FileType) -> Self {
47 |     Self {
48 |       reader,
49 |       span: Span::new(file_type),
50 |       char_buf: Vec::new(),
51 |       byte_buf: Box::new([0; BUFFER_SIZE]),
52 |       byte_buf_offset: 0,
53 |     }
54 |   }
55 | 
56 |   /// Returns the next character and the last location from the reader.
57 |   fn next_char_loc_from_reader(&mut self) -> Result<(Option<char>, Location)>
58 |   where
59 |     T: Read,
60 |   {
61 |     // get the current location
62 |     let loc = self.span.start();
63 |     // read bytes to buffer
64 |     let count = self
65 |       .reader
66 |       .read(&mut self.byte_buf[self.byte_buf_offset..])
67 |       .map_err(|e| log_raw_fatal_error!(self.span, "{e}"))?
68 |       + self.byte_buf_offset;
69 |     // handle EOF
70 |     if count == 0 {
71 |       return Ok((None, loc));
72 |     }
73 |     // convert bytes to a UTF-8 string
74 |     let (s, end) = match from_utf8(&self.byte_buf[..count]) {
75 |       Ok(s) => (s, None),
76 |       Err(e) => {
77 |         let end = e.valid_up_to();
78 |         // safe due to the above check
79 |         let s = unsafe { from_utf8_unchecked(&self.byte_buf[..end]) };
80 |         (s, Some(end))
81 |       }
82 |     };
83 |     // get the character and fill the char buffer
84 |     let mut chars = s.chars();
85 |     let c = if let Some(c) = chars.next() {
86 |       self.char_buf.extend(chars.rev());
87 |       c
88 |     } else {
89 |       return log_raw_fatal_error!(self.span, "invalid UTF-8 character").into();
90 |     };
91 |     // update byte buffer and its offset
92 |     if let Some(end) = end {
93 |       self.byte_buf.copy_within(end..count, 0);
94 |       self.byte_buf_offset = count - end;
95 |     } else {
96 |       self.byte_buf_offset = 0;
97 |     }
98 |     // update the span
99 |     self.span.update(&c);
100 |     Ok((Some(c), loc))
101 |   }
102 | }
103 | 
104 | /// A generic byte reader for lexers.
105 | ///
106 | /// The generic parameter `BUFFER_SIZE` specifies the size of the internal
107 | /// buffer of [`ByteReader`].
108 | pub struct ByteReader<T, const BUFFER_SIZE: usize = BYTE_BUFFER_SIZE> {
109 |   reader: T,
110 |   span: Span,
111 |   char_buf: Vec<u8>,
112 | }
113 | 
114 | impl<T, const BUFFER_SIZE: usize> ByteReader<T, BUFFER_SIZE> {
115 |   /// Creates a new reader with the given inner reader and file type.
116 |   pub fn new(reader: T, file_type: FileType) -> Self {
117 |     Self {
118 |       reader,
119 |       span: Span::new(file_type),
120 |       char_buf: Vec::new(),
121 |     }
122 |   }
123 | 
124 |   /// Returns the next byte and the last location from the reader.
125 |   fn next_char_loc_from_reader(&mut self) -> Result<(Option<u8>, Location)>
126 |   where
127 |     T: Read,
128 |   {
129 |     // get the current location
130 |     let loc = self.span.start();
131 |     // read bytes to buffer
132 |     let mut buf = [0; BUFFER_SIZE];
133 |     let count = self
134 |       .reader
135 |       .read(&mut buf)
136 |       .map_err(|e| log_raw_fatal_error!(self.span, "{e}"))?;
137 |     // handle EOF
138 |     if count == 0 {
139 |       return Ok((None, loc));
140 |     }
141 |     // get the byte and fill the char buffer
142 |     let b = buf[0];
143 |     self.char_buf.extend(buf[1..count].iter().rev());
144 |     // update the span
145 |     self.span.update(&b);
146 |     Ok((Some(b), loc))
147 |   }
148 | }
149 | 
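The carry-over logic in `Reader::next_char_loc_from_reader` hinges on `Utf8Error::valid_up_to`: a read that ends in the middle of a multi-byte character keeps the trailing bytes for the next round. A standalone illustration:

```rust
use std::str::from_utf8;

fn main() {
  // 'a' followed by the first 2 of the 3 bytes of '好' (0xe5 0xa5 0xbd):
  let buf = [b'a', 0xe5, 0xa5];
  let err = from_utf8(&buf).unwrap_err();
  // Only "a" is complete UTF-8; the 2 trailing bytes would be moved to the
  // front of `byte_buf` and completed by the next `read`.
  assert_eq!(err.valid_up_to(), 1);
}
```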

150 | /// Implements necessary methods for the given reader.
151 | macro_rules! impl_reader {
152 |   ($name:ident, $char:ty) => {
153 |     impl<T, const BUFFER_SIZE: usize> $name<T, BUFFER_SIZE> {
154 |       /// Converts the reader into its inner reader.
155 |       pub fn into_inner(self) -> T {
156 |         self.reader
157 |       }
158 |     }
159 | 
160 |     impl<const BUFFER_SIZE: usize> $name<File, BUFFER_SIZE> {
161 |       /// Creates a new reader from the file at the given path.
162 |       pub fn from_path<P>(path: P) -> io::Result<Self>
163 |       where
164 |         P: AsRef<Path> + Clone,
165 |       {
166 |         File::open(path.clone()).map(|f| Self::new(f, FileType::File(Box::from(path.as_ref()))))
167 |       }
168 |     }
169 | 
170 |     impl<const BUFFER_SIZE: usize> $name<Stdin, BUFFER_SIZE> {
171 |       /// Creates a new reader from the standard input.
172 |       pub fn from_stdin() -> Self {
173 |         stdin().into()
174 |       }
175 |     }
176 | 
177 |     impl<const BUFFER_SIZE: usize> From<Stdin> for $name<Stdin, BUFFER_SIZE> {
178 |       /// Creates a new reader from the standard input.
179 |       fn from(stdin: Stdin) -> Self {
180 |         Self::new(stdin, FileType::Stdin)
181 |       }
182 |     }
183 | 
184 |     impl<const BUFFER_SIZE: usize> From<String> for $name<Cursor<String>, BUFFER_SIZE> {
185 |       /// Creates a new reader from the given [`String`].
186 |       fn from(s: String) -> Self {
187 |         Self::new(Cursor::new(s), FileType::Buffer)
188 |       }
189 |     }
190 | 
191 |     impl<'a, const BUFFER_SIZE: usize> From<&'a str> for $name<Cursor<&'a str>, BUFFER_SIZE> {
192 |       /// Creates a new reader from the given &[str].
193 |       fn from(s: &'a str) -> Self {
194 |         Self::new(Cursor::new(s), FileType::Buffer)
195 |       }
196 |     }
197 | 
198 |     impl<'a, const BUFFER_SIZE: usize> From<&'a [u8]> for $name<&'a [u8], BUFFER_SIZE> {
199 |       /// Creates a new reader from the given &[[u8]].
200 |       fn from(b: &'a [u8]) -> Self {
201 |         Self::new(b, FileType::Buffer)
202 |       }
203 |     }
204 | 
205 |     impl<T, const BUFFER_SIZE: usize> InputStream for $name<T, BUFFER_SIZE>
206 |     where
207 |       T: Read,
208 |     {
209 |       type CharType = $char;
210 | 
211 |       fn next_char_loc(&mut self) -> Result<(Option<$char>, Location)> {
212 |         if let Some(c) = self.char_buf.pop() {
213 |           let loc = self.span.start();
214 |           self.span.update(&c);
215 |           Ok((Some(c), loc))
216 |         } else {
217 |           self.next_char_loc_from_reader()
218 |         }
219 |       }
220 | 
221 |       fn unread(&mut self, last: (Option<$char>, Location)) {
222 |         self.span.update_loc(last.1);
223 |         if let Some(c) = last.0 {
224 |           self.char_buf.push(c);
225 |         }
226 |       }
227 | 
228 |       fn span(&self) -> &Span {
229 |         &self.span
230 |       }
231 | 
232 |       fn set_line_col(&mut self, line: u32, col: u32) {
233 |         self.span.update_loc(Location { line, col });
234 |       }
235 | 
236 |       fn peek(&mut self) -> Result<Option<$char>> {
237 |         if let Some(c) = self.char_buf.last() {
238 |           Ok(Some(*c))
239 |         } else {
240 |           let char_loc = self.next_char_loc_from_reader()?;
241 |           self.unread(char_loc);
242 |           Ok(char_loc.0)
243 |         }
244 |       }
245 | 
246 |       fn peek_with_span(&mut self) -> Result<(Option<$char>, Span)> {
247 |         if let Some(c) = self.char_buf.last() {
248 |           Ok((Some(*c), self.span.clone().into_updated(c)))
249 |         } else {
250 |           let char_loc = self.next_char_loc_from_reader()?;
251 |           let span = self.span.clone();
252 |           self.unread(char_loc);
253 |           Ok((char_loc.0, span))
254 |         }
255 |       }
256 |     }
257 |   };
258 | }
259 | 
260 | impl_reader!(Reader, char);
261 | impl_reader!(ByteReader, u8);
262 | 
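Construction therefore works uniformly across input sources. A usage sketch (the file path is hypothetical):

```rust
use laps::input::InputStream;
use laps::reader::Reader;

fn main() -> std::io::Result<()> {
  // from an in-memory string, via the `From<&str>` impl above
  let mut reader = Reader::from("(a b)");
  assert_eq!(reader.next_char().unwrap(), Some('('));
  // from a file or the standard input:
  // let reader = Reader::from_path("input.sexp")?;
  // let reader = Reader::from_stdin();
  Ok(())
}
```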
263 | #[cfg(test)]
264 | mod test {
265 |   use super::*;
266 | 
267 |   #[test]
268 |   fn next_char_loc_unread() {
269 |     let mut reader = Reader::from("123 abc");
270 |     assert_eq!(reader.next_char_loc().unwrap().0, Some('1'));
271 |     let last = reader.next_char_loc().unwrap();
272 |     assert_eq!(last.0, Some('2'));
273 |     reader.unread(last);
274 |     let loc = last.1;
275 |     assert_eq!(reader.next_char_loc().unwrap(), (Some('2'), loc));
276 |     assert_eq!(reader.next_char_loc().unwrap().0, Some('3'));
277 |     assert_eq!(reader.next_char_loc().unwrap().0, Some(' '));
278 |     assert_eq!(reader.next_char_loc().unwrap().0, Some('a'));
279 |     assert_eq!(reader.next_char_loc().unwrap().0, Some('b'));
280 |     assert_eq!(reader.next_char_loc().unwrap().0, Some('c'));
281 |     let last = reader.next_char_loc().unwrap();
282 |     assert_eq!(last.0, None);
283 |     reader.unread(last);
284 |     let loc = last.1;
285 |     assert_eq!(reader.next_char_loc().unwrap(), (None, loc));
286 |     assert_eq!(reader.next_char_loc().unwrap().0, None);
287 |   }
288 | 
289 |   #[test]
290 |   fn peek_span() {
291 |     let mut reader = Reader::from("123 abc");
292 |     assert_eq!(reader.peek(), Ok(Some('1')));
293 |     assert_eq!(format!("{}", reader.span()), "1:0-1:0");
294 |     assert_eq!(reader.peek(), Ok(Some('1')));
295 |     assert_eq!(format!("{}", reader.span()), "1:0-1:0");
296 |     reader.next_char_loc().unwrap();
297 |     assert_eq!(reader.peek(), Ok(Some('2')));
298 |     assert_eq!(format!("{}", reader.span()), "1:1-1:1");
299 |   }
300 | 
301 |   #[test]
302 |   fn peek_with_span() {
303 |     let mut reader = Reader::from("123 abc");
304 |     let (c, span) = reader.peek_with_span().unwrap();
305 |     assert_eq!(c, Some('1'));
306 |     assert_eq!(format!("{span}"), "1:1-1:1");
307 |     let (c, span) = reader.peek_with_span().unwrap();
308 |     assert_eq!(c, Some('1'));
309 |     assert_eq!(format!("{span}"), "1:1-1:1");
310 |     reader.next_char_loc().unwrap();
311 |     let (c, span) = reader.peek_with_span().unwrap();
312 |     assert_eq!(c, Some('2'));
313 |     assert_eq!(format!("{span}"), "1:2-1:2");
314 |   }
315 | 
316 |   #[test]
317 |   fn unicode_chars() {
318 |     let mut bytes: Vec<_> = "你好, abc✨".into();
319 |     bytes.push(0xff);
320 |     bytes.push(b'z');
321 |     let mut reader = Reader::from(bytes.as_slice());
322 |     assert_eq!(reader.next_char(), Ok(Some('你')));
323 |     assert_eq!(reader.next_char(), Ok(Some('好')));
324 |     assert_eq!(reader.next_char(), Ok(Some(',')));
325 |     assert_eq!(reader.next_char(), Ok(Some(' ')));
326 |     assert_eq!(reader.next_char(), Ok(Some('a')));
327 |     assert_eq!(reader.next_char(), Ok(Some('b')));
328 |     assert_eq!(reader.next_char(), Ok(Some('c')));
329 |     assert_eq!(reader.next_char(), Ok(Some('✨')));
330 |     assert!(reader.next_char().is_err());
331 |     assert!(reader.next_char().is_err());
332 |     let mut reader = ByteReader::from(bytes.as_slice());
333 |     assert_eq!(reader.next_char(), Ok(Some(0xe4)));
334 |     assert_eq!(reader.next_char(), Ok(Some(0xbd)));
335 |     assert_eq!(reader.next_char(), Ok(Some(0xa0)));
336 |     assert_eq!(reader.next_char(), Ok(Some(0xe5)));
337 |     assert_eq!(reader.next_char(), Ok(Some(0xa5)));
338 |     assert_eq!(reader.next_char(), Ok(Some(0xbd)));
339 |     assert_eq!(reader.next_char(), Ok(Some(b',')));
340 |     assert_eq!(reader.next_char(), Ok(Some(b' ')));
341 |     assert_eq!(reader.next_char(), Ok(Some(b'a')));
342 |     assert_eq!(reader.next_char(), Ok(Some(b'b')));
343 |     assert_eq!(reader.next_char(), Ok(Some(b'c')));
344 |     assert_eq!(reader.next_char(), Ok(Some(0xe2)));
345 |     assert_eq!(reader.next_char(), Ok(Some(0x9c)));
346 |     assert_eq!(reader.next_char(), Ok(Some(0xa8)));
347 |     assert_eq!(reader.next_char(), Ok(Some(0xff)));
348 |     assert_eq!(reader.next_char(), Ok(Some(b'z')));
349 |     assert_eq!(reader.next_char(), Ok(None));
350 |     assert_eq!(reader.next_char(), Ok(None));
351 |   }
352 | }
353 | 
--------------------------------------------------------------------------------
/src/token.rs:
--------------------------------------------------------------------------------
1 | //! Token ([`Token`]) related implementations, including
2 | //! tokenizer ([`Tokenizer`]) and token stream ([`TokenStream`]).
3 | //!
4 | //! All of these implementations can be used in lexers and parsers,
5 | //! specifically:
6 | //!
7 | //! * [`Token`]: generic token representation, can be produced by lexers.
8 | //! * [`Tokenizer`]: trait for tokenizers (structures that can produce
9 | //!   tokens), all lexers should implement this trait.
10 | //! * [`TokenStream`]: a tokenizer wrapper trait, provides several helper
11 | //!   methods for parsing, can be used in parsers.
12 | //! * [`TokenBuffer`]: a structure that implements the [`TokenStream`] trait,
13 | //!   can be used in parsers.
14 | 
15 | use crate::log_error;
16 | use crate::parse::Parse;
17 | use crate::span::{Result, Span, Spanned};
18 | use std::borrow::{Borrow, BorrowMut};
19 | use std::collections::VecDeque;
20 | use std::{fmt, hash};
21 | 
22 | #[cfg(feature = "macros")]
23 | pub use laps_macros::{token_ast, token_kind};
24 | 
25 | /// A generic token.
26 | #[derive(Clone, Debug)]
27 | pub struct Token<Kind> {
28 |   /// Kind of the token.
29 |   pub kind: Kind,
30 |   /// Span of the token.
31 |   pub span: Span,
32 | }
33 | 
34 | impl<Kind> Token<Kind> {
35 |   /// Creates a new token from the given value and span.
36 |   pub fn new<T>(value: T, span: Span) -> Self
37 |   where
38 |     Kind: From<T>,
39 |   {
40 |     Self {
41 |       kind: value.into(),
42 |       span,
43 |     }
44 |   }
45 | }
46 | 
47 | impl<Kind> Spanned for Token<Kind> {
48 |   fn span(&self) -> Span {
49 |     self.span.clone()
50 |   }
51 | }
52 | 
53 | impl<Kind> PartialEq<Kind> for Token<Kind>
54 | where
55 |   Kind: PartialEq,
56 | {
57 |   fn eq(&self, other: &Kind) -> bool {
58 |     self.kind.eq(other)
59 |   }
60 | }
61 | 
62 | impl<Kind> PartialEq for Token<Kind>
63 | where
64 |   Kind: PartialEq,
65 | {
66 |   fn eq(&self, other: &Self) -> bool {
67 |     self.kind.eq(&other.kind)
68 |   }
69 | }
70 | 
71 | impl<Kind> Eq for Token<Kind> where Kind: Eq {}
72 | 
73 | impl<Kind> hash::Hash for Token<Kind>
74 | where
75 |   Kind: hash::Hash,
76 | {
77 |   fn hash<H: hash::Hasher>(&self, state: &mut H) {
78 |     self.kind.hash(state)
79 |   }
80 | }
81 | 
82 | impl<Kind> fmt::Display for Token<Kind>
83 | where
84 |   Kind: fmt::Display,
85 | {
86 |   fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
87 |     self.kind.fmt(f)
88 |   }
89 | }
90 | 
91 | impl<Kind> Borrow<Kind> for Token<Kind> {
92 |   fn borrow(&self) -> &Kind {
93 |     &self.kind
94 |   }
95 | }
96 | 
97 | impl<Kind> BorrowMut<Kind> for Token<Kind> {
98 |   fn borrow_mut(&mut self) -> &mut Kind {
99 |     &mut self.kind
100 |   }
101 | }
102 | 
103 | impl<Kind> AsRef<Kind> for Token<Kind> {
104 |   fn as_ref(&self) -> &Kind {
105 |     &self.kind
106 |   }
107 | }
108 | 
109 | impl<Kind> AsMut<Kind> for Token<Kind> {
110 |   fn as_mut(&mut self) -> &mut Kind {
111 |     &mut self.kind
112 |   }
113 | }
114 | 
115 | /// Trait for tokenizers.
116 | pub trait Tokenizer {
117 |   /// Type of the token produced by the tokenizer.
118 |   type Token;
119 | 
120 |   /// Reads the next token from the token stream.
121 |   ///
122 |   /// Returns the token if successful, otherwise [`Err`].
123 |   fn next_token(&mut self) -> Result<Self::Token>;
124 | }
125 | 
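Any type can act as a lexer by implementing `Tokenizer`. A minimal hand-written sketch that emits one token per character (assuming `Span::new` is public, as it is used in `reader.rs` above):

```rust
use laps::span::{FileType, Result, Span};
use laps::token::{Token, Tokenizer};

/// A toy tokenizer: yields `Some(char)` per character, then `None` at EOF.
struct CharTokenizer {
  chars: std::vec::IntoIter<char>,
  span: Span,
}

impl CharTokenizer {
  fn new(s: &str) -> Self {
    Self {
      chars: s.chars().collect::<Vec<_>>().into_iter(),
      span: Span::new(FileType::Buffer),
    }
  }
}

impl Tokenizer for CharTokenizer {
  type Token = Token<Option<char>>;

  fn next_token(&mut self) -> Result<Self::Token> {
    // `Option<char>` doubles as the token kind; `None` marks end-of-file.
    Ok(Token::new(self.chars.next(), self.span.clone()))
  }
}
```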
126 | /// Trait for token streams.
127 | pub trait TokenStream: Tokenizer {
128 |   /// Unreads the given token and puts it back into the token stream.
129 |   fn unread(&mut self, token: Self::Token);
130 | 
131 |   /// Parses an AST of type `T` from the token stream.
132 |   fn parse<T>(&mut self) -> Result<T>
133 |   where
134 |     T: Parse<Self>,
135 |     Self: Sized,
136 |   {
137 |     T::parse(self)
138 |   }
139 | 
140 |   /// Peeks the next token from the token stream.
141 |   ///
142 |   /// Does not advance the position of the token stream.
143 |   fn peek(&mut self) -> Result<Self::Token>
144 |   where
145 |     Self::Token: Clone,
146 |   {
147 |     let token = self.next_token()?;
148 |     self.unread(token.clone());
149 |     Ok(token)
150 |   }
151 | 
152 |   /// Peeks the next 2 tokens from the token stream.
153 |   ///
154 |   /// Does not advance the position of the token stream.
155 |   fn peek2(&mut self) -> Result<(Self::Token, Self::Token)>
156 |   where
157 |     Self::Token: Clone,
158 |   {
159 |     let token1 = self.next_token()?;
160 |     let token2 = self.next_token()?;
161 |     self.unread(token2.clone());
162 |     self.unread(token1.clone());
163 |     Ok((token1, token2))
164 |   }
165 | 
166 |   /// Peeks the next N tokens from the token stream.
167 |   ///
168 |   /// Does not advance the position of the token stream.
169 |   fn peek_n(&mut self, n: usize) -> Result<Vec<Self::Token>>
170 |   where
171 |     Self::Token: Clone,
172 |   {
173 |     let v = (0..n)
174 |       .map(|_| self.next_token())
175 |       .collect::<Result<Vec<_>>>()?;
176 |     v.iter().rev().for_each(|t| self.unread(t.clone()));
177 |     Ok(v)
178 |   }
179 | 
180 |   /// Skips tokens until a token specified by the predicate is encountered.
181 |   fn skip_until<F>(&mut self, mut f: F) -> Result<()>
182 |   where
183 |     F: FnMut(&Self::Token) -> bool,
184 |   {
185 |     loop {
186 |       let token = self.next_token()?;
187 |       if f(&token) {
188 |         self.unread(token);
189 |         break Ok(());
190 |       }
191 |     }
192 |   }
193 | 
194 |   /// Collects tokens into a [`Vec`] until a token specified by the predicate
195 |   /// is encountered.
196 |   fn collect_until<F>(&mut self, mut f: F) -> Result<Vec<Self::Token>>
197 |   where
198 |     F: FnMut(&Self::Token) -> bool,
199 |   {
200 |     let mut v = Vec::new();
201 |     loop {
202 |       let token = self.next_token()?;
203 |       if f(&token) {
204 |         self.unread(token);
205 |         break Ok(v);
206 |       }
207 |       v.push(token);
208 |     }
209 |   }
210 | 
211 |   /// Checks if the next token is the same as the given token,
212 |   /// and returns the token if it is, otherwise returns an error.
213 |   fn expect<T>(&mut self, token: T) -> Result<Self::Token>
214 |   where
215 |     Self::Token: PartialEq<T> + Spanned + fmt::Display,
216 |     T: fmt::Display,
217 |   {
218 |     let next = self.next_token()?;
219 |     if next == token {
220 |       Ok(next)
221 |     } else {
222 |       let err = log_error!(next.span(), "expected {token}, found {next}");
223 |       self.unread(next);
224 |       Err(err)
225 |     }
226 |   }
227 | 
228 |   /// Constructs a helper for peeking a sequence of tokens.
229 |   fn lookahead(&mut self) -> Lookahead<Self, Self::Token>
230 |   where
231 |     Self: Sized,
232 |   {
233 |     Lookahead {
234 |       tokens: self,
235 |       buf: Vec::new(),
236 |       #[cfg(feature = "macros")]
237 |       last_result: true,
238 |     }
239 |   }
240 | }
241 | 
242 | /// Support for checking token sequences without
243 | /// advancing the position of the token stream.
244 | pub struct Lookahead<'ts, TS, T>
245 | where
246 |   TS: TokenStream<Token = T>,
247 | {
248 |   tokens: &'ts mut TS,
249 |   buf: Vec<T>,
250 |   #[cfg(feature = "macros")]
251 |   last_result: bool,
252 | }
253 | 
254 | impl<'ts, TS, T> Lookahead<'ts, TS, T>
255 | where
256 |   TS: TokenStream<Token = T>,
257 | {
258 |   /// Peeks the next token from the token stream.
259 |   pub fn peek_next(&mut self) -> Result<T>
260 |   where
261 |     T: Clone,
262 |   {
263 |     let token = self.tokens.next_token()?;
264 |     self.buf.push(token.clone());
265 |     Ok(token)
266 |   }
267 | 
268 |   #[cfg(feature = "macros")]
269 |   /// Checks if the next token may be the given token.
270 |   ///
271 |   /// Accepts token AST types only, see [`token_ast`].
272 |   pub fn maybe<F, TA>(mut self, _: F) -> Result<Self>
273 |   where
274 |     F: FnOnce(T) -> TA,
275 |     TA: Parse<TS>,
276 |   {
277 |     if self.last_result {
278 |       self.last_result = TA::maybe(self.tokens)?;
279 |     }
280 |     self.buf.push(self.tokens.next_token()?);
281 |     Ok(self)
282 |   }
283 | 
284 |   #[cfg(feature = "macros")]
285 |   /// Consumes and returns the final result of the
286 |   /// [`maybe`](Lookahead#method.maybe) chain.
287 |   pub fn result(self) -> Result<bool> {
288 |     Ok(self.last_result)
289 |   }
290 | }
291 | 
292 | impl<'ts, TS, T> Drop for Lookahead<'ts, TS, T>
293 | where
294 |   TS: TokenStream<Token = T>,
295 | {
296 |   fn drop(&mut self) {
297 |     while let Some(token) = self.buf.pop() {
298 |       self.tokens.unread(token)
299 |     }
300 |   }
301 | }
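Because the buffered tokens are unread when the `Lookahead` guard drops, a `lookahead` chain never consumes input regardless of its outcome. A usage sketch, assuming the S-expression `Token` and `Token![...]` ASTs from the `lib.rs` docs above (the exact form of the `maybe` argument follows the token-AST constructors generated by `token_ast`):

```rust
fn starts_sexp<TS>(tokens: &mut TS) -> laps::span::Result<bool>
where
  TS: laps::token::TokenStream<Token = Token>,
{
  // Both peeked tokens are unread again once the chain finishes.
  tokens
    .lookahead()
    .maybe(Token![lpr])?
    .maybe(Token![atom])?
    .result()
}
```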
302 | 
303 | /// A token buffer that implements trait [`TokenStream`].
304 | ///
305 | /// Contains a tokenizer of type `TN`, produces tokens of type `T`.
306 | pub struct TokenBuffer<TN, T> {
307 |   tokenizer: TN,
308 |   token_buf: VecDeque<T>,
309 | }
310 | 
311 | impl<TN, T> TokenBuffer<TN, T> {
312 |   /// Creates a new token buffer from the given tokenizer.
313 |   pub fn new(tokenizer: TN) -> Self {
314 |     Self {
315 |       tokenizer,
316 |       token_buf: VecDeque::new(),
317 |     }
318 |   }
319 | 
320 |   /// Converts the token buffer into the inner tokenizer.
321 |   pub fn into_inner(self) -> TN {
322 |     self.tokenizer
323 |   }
324 | 
325 |   /// Returns a reference to the inner tokenizer.
326 |   pub fn inner(&self) -> &TN {
327 |     &self.tokenizer
328 |   }
329 | 
330 |   /// Returns a mutable reference to the inner tokenizer.
331 |   pub fn inner_mut(&mut self) -> &mut TN {
332 |     &mut self.tokenizer
333 |   }
334 | 
335 |   /// Extends the token buffer by `n` new tokens.
336 |   fn extend_by(&mut self, n: usize) -> Result<()>
337 |   where
338 |     TN: Tokenizer<Token = T>,
339 |   {
340 |     for _ in 0..n {
341 |       self.token_buf.push_back(self.tokenizer.next_token()?);
342 |     }
343 |     Ok(())
344 |   }
345 | }
346 | 
347 | impl<TN, T> From<TN> for TokenBuffer<TN, T> {
348 |   /// Converts the given tokenizer to a token buffer.
349 |   fn from(tokenizer: TN) -> Self {
350 |     Self::new(tokenizer)
351 |   }
352 | }
353 | 
354 | impl<TN, T> Tokenizer for TokenBuffer<TN, T>
355 | where
356 |   TN: Tokenizer<Token = T>,
357 | {
358 |   type Token = T;
359 | 
360 |   fn next_token(&mut self) -> Result<Self::Token> {
361 |     match self.token_buf.pop_front() {
362 |       Some(t) => Ok(t),
363 |       None => self.tokenizer.next_token(),
364 |     }
365 |   }
366 | }
367 | 
368 | impl<TN, T> TokenStream for TokenBuffer<TN, T>
369 | where
370 |   TN: Tokenizer<Token = T>,
371 | {
372 |   fn unread(&mut self, token: Self::Token) {
373 |     self.token_buf.push_front(token)
374 |   }
375 | 
376 |   fn peek(&mut self) -> Result<Self::Token>
377 |   where
378 |     Self::Token: Clone,
379 |   {
380 |     if let Some(t) = self.token_buf.front() {
381 |       Ok(t.clone())
382 |     } else {
383 |       let t = self.tokenizer.next_token()?;
384 |       self.token_buf.push_front(t.clone());
385 |       Ok(t)
386 |     }
387 |   }
388 | 
389 |   fn peek2(&mut self) -> Result<(Self::Token, Self::Token)>
390 |   where
391 |     Self::Token: Clone,
392 |   {
393 |     if self.token_buf.len() < 2 {
394 |       self.extend_by(2 - self.token_buf.len())?;
395 |     }
396 |     let mut iter = self.token_buf.iter();
397 |     match (iter.next(), iter.next()) {
398 |       (Some(t1), Some(t2)) => Ok((t1.clone(), t2.clone())),
399 |       _ => unreachable!(),
400 |     }
401 |   }
402 | 
403 |   fn peek_n(&mut self, n: usize) -> Result<Vec<Self::Token>>
404 |   where
405 |     Self::Token: Clone,
406 |   {
407 |     if self.token_buf.len() < n {
408 |       self.extend_by(n - self.token_buf.len())?;
409 |     }
410 |     Ok(self.token_buf.iter().take(n).cloned().collect())
411 |   }
412 | }
413 | 
--------------------------------------------------------------------------------
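Putting the pieces together, a typical pipeline feeds a `Reader` into a derived lexer and wraps it in a `TokenBuffer` so the parser can unread tokens. A sketch reusing the S-expression `TokenKind` and `Statement` types from the `lib.rs` docs above (and assuming the `lexer` constructor provided by the `Tokenize` derive, as used in the crate's examples):

```rust
fn parse_all() -> laps::span::Result<()> {
  let reader = laps::reader::Reader::from("(a (b c))");
  let lexer = TokenKind::lexer(reader);
  let mut tokens = laps::token::TokenBuffer::new(lexer);
  // parse statements until end-of-file
  loop {
    match tokens.parse::<Statement>()? {
      Statement::End(_) => return Ok(()),
      Statement::Elem(_elem) => { /* handle the element here */ }
    }
  }
}
```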