├── .github
│   └── workflows
│       └── main.yml
├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── compiler-tools-derive
│   ├── Cargo.toml
│   ├── src
│   │   ├── gen
│   │   │   ├── class_match.rs
│   │   │   ├── display.rs
│   │   │   ├── full_regex.rs
│   │   │   ├── mod.rs
│   │   │   └── simple_regex.rs
│   │   ├── lib.rs
│   │   ├── lit_table.rs
│   │   └── simple_regex
│   │       ├── dfa.rs
│   │       ├── generate.rs
│   │       ├── matching.rs
│   │       ├── mod.rs
│   │       ├── nfa.rs
│   │       └── parse.rs
│   └── tests
│       ├── integration.rs
│       └── regex_bench.rs
├── compiler-tools
│   ├── Cargo.toml
│   └── src
│       ├── lib.rs
│       ├── misc.rs
│       ├── span.rs
│       ├── tokenizer.rs
│       └── util.rs
└── rustfmt.toml

/.github/workflows/main.yml:
--------------------------------------------------------------------------------
name: CI

on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  tests:
    name: Run tests - Rust (${{ matrix.rust }}) on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - { rust: stable, os: ubuntu-22.04 }
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
        with:
          toolchain: ${{ matrix.rust }}
          components: rustfmt
      - name: Build with all features
        run: cargo build --all-features
      - name: Build
        run: cargo build --workspace --verbose
      - name: Run tests
        run: cargo test --workspace --all-features
      - name: Clean
        run: cargo clean

  tests_min_compat:
    name: Run min rust version tests - Rust (${{ matrix.rust }}) on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - { rust: 1.75.0, os: ubuntu-22.04 }
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: dtolnay/rust-toolchain@stable
        with:
          toolchain: ${{ matrix.rust }}
          components: rustfmt
      - name: Build with all features
        run: cargo build --all-features
      - name: Build
        run: cargo build --workspace --verbose
      - name: Run tests
        run: cargo test --workspace --all-features
      - name: Clean
        run: cargo clean

  rustfmt:
    name: Run rustfmt - Rust (${{ matrix.rust }}) on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - { rust: nightly-2024-03-31, os: ubuntu-22.04 }
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: dtolnay/rust-toolchain@stable
        with:
          toolchain: ${{ matrix.rust }}
          components: rustfmt
      - name: Check format
        run: cargo +${{ matrix.rust }} fmt --all -- --check
      # TODO: Need examples
      # - name: Check examples format
      #   working-directory: ./examples
      #   run: cargo +${{ matrix.rust }} fmt --all -- --check

  clippy:
    name: Run clippy - Rust (${{ matrix.rust }}) on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - { rust: stable, os: ubuntu-22.04 }
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: dtolnay/rust-toolchain@stable
        with:
          toolchain: ${{ matrix.rust }}
          components: clippy
      - name: Check with clippy
        run: cargo clippy --all

# TODO: Need examples
#  examples:
#    name: Build examples - Rust (${{ matrix.rust }}) on ${{ matrix.os }}
#    runs-on: ${{ matrix.os }}
#    strategy:
#      fail-fast: false
#      matrix:
#        include:
#          - { rust: stable, os: ubuntu-22.04 }
#    steps:
#      - name: Checkout
#        uses: actions/checkout@v4
#        with:
#          submodules: true
#      - uses: dtolnay/rust-toolchain@stable
#        with:
#          toolchain: ${{ matrix.rust }}
#          components: clippy, rustfmt
#      - name: Check examples with clippy
#        run: cargo clippy --all
#        working-directory: ./examples
#      - name: Build examples
#        run: cargo build --workspace --verbose
#        working-directory: ./examples
#      - name: Clean examples
#        run: cargo clean
#        working-directory: ./examples

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/target

--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
 "memchr",
]

[[package]]
name = "compiler-tools"
version = "0.2.0"
dependencies = [
 "regex",
 "serde",
]

[[package]]
name = "compiler-tools-derive"
version = "0.2.0"
dependencies = [
 "compiler-tools",
 "indexmap",
 "proc-macro2",
 "quote",
 "regex",
 "syn",
]

[[package]]
name = "equivalent"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"

[[package]]
name = "hashbrown"
version = "0.14.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"

[[package]]
name = "indexmap"
version = "2.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
dependencies = [
 "equivalent",
 "hashbrown",
]

[[package]]
name = "memchr"
version = "2.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"

[[package]]
name = "proc-macro2"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e"
dependencies = [
 "unicode-ident",
]

[[package]]
name = "quote"
version = "1.0.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
dependencies = [
 "proc-macro2",
]

[[package]]
name = "regex"
version = "1.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
dependencies = [
 "aho-corasick",
 "memchr",
 "regex-automata",
 "regex-syntax",
]

[[package]]
name = "regex-automata"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
dependencies = [
 "aho-corasick",
 "memchr",
 "regex-syntax",
]

[[package]]
name = "regex-syntax"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56"

[[package]]
name = "serde"
version = "1.0.197"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2"
dependencies = [
 "serde_derive",
]

[[package]]
name = "serde_derive"
version = "1.0.197"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "syn"
version = "2.0.57"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11a6ae1e52eb25aab8f3fb9fca13be982a373b8f1157ca14b897a825ba4a2d35"
dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
]

[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[workspace]
resolver = "2"
members = [
    "compiler-tools",
    "compiler-tools-derive",
]

--------------------------------------------------------------------------------
/compiler-tools-derive/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "compiler-tools-derive"
version = "0.2.0"
edition = "2021"
authors = ["Protryon "]
license = "MIT OR Apache-2.0"
repository = "https://github.com/Protryon/compiler-tools"
description = "A proc-macro for deriving powerful and fast tokenizers with compile-time regex"
keywords = [ "compiler", "parser", "generator" ]
rust-version = "1.75.0"

[lib]
proc-macro = true

[dependencies]
syn = { version = "2.0", features = ["extra-traits", "full"] }
quote = "1.0"
proc-macro2 = "1.0"
indexmap = "2.2"
regex = { version = "1.10" }

[dev-dependencies]
compiler-tools = { version = "0.2.0", path = "../compiler-tools" }

[features]

--------------------------------------------------------------------------------
/compiler-tools-derive/src/gen/class_match.rs:
--------------------------------------------------------------------------------
use proc_macro2::{Ident, TokenStream};
use quote::quote;

use crate::{flatten, TokenParseData};

pub(crate) fn gen_class_match(tokens_to_parse: &[TokenParseData], enum_ident: &Ident) -> TokenStream {
    let mut matches = vec![];
    for info in tokens_to_parse {
        let ident = &info.ident;

        if info.has_target {
            //todo: what to do when the parsed target doesn't impl Display?
            matches.push(quote! {
                (#enum_ident::#ident(_), #enum_ident::#ident(_)) => true,
            })
        } else {
            matches.push(quote! {
                (#enum_ident::#ident, #enum_ident::#ident) => true,
            })
        }
    }
    matches.push(quote! {
        _ => false,
    });
    flatten(matches)
}
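For illustration, given a hypothetical two-variant enum `enum Token<'a> { Plus, Int(&'a str) }`, the arms built above assemble into a variant-only comparison; this is a sketch of what the derive splices into the `match (self, other)` emitted in lib.rs, not code from this repository:

enum Token<'a> {
    Plus,
    Int(&'a str),
}

// sketch of the generated comparison
fn matches_class(a: &Token, b: &Token) -> bool {
    match (a, b) {
        (Token::Plus, Token::Plus) => true,
        // variants with a payload compare by variant ("class") only
        (Token::Int(_), Token::Int(_)) => true,
        _ => false,
    }
}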
--------------------------------------------------------------------------------
/compiler-tools-derive/src/gen/display.rs:
--------------------------------------------------------------------------------
use proc_macro2::{Ident, TokenStream};
use quote::quote;

use crate::{flatten, TokenParseData};

pub(crate) fn gen_display(tokens_to_parse: &[TokenParseData], enum_ident: &Ident) -> TokenStream {
    let mut display_fields = vec![];
    for info in tokens_to_parse {
        let ident = &info.ident;

        if info.has_target {
            //todo: what to do when the parsed target doesn't impl Display?
            display_fields.push(quote! {
                #enum_ident::#ident(x) => write!(f, "{}", x),
            })
        } else if !info.literals.is_empty() {
            let target = info.literals.first().unwrap().replace("\n", "\\n");
            display_fields.push(quote! {
                #enum_ident::#ident => write!(f, "{}", #target),
            })
        } else {
            let ident_str = format!("{}", ident);
            display_fields.push(quote! {
                #enum_ident::#ident => write!(f, "{}", #ident_str),
            })
        }
    }
    flatten(display_fields)
}

--------------------------------------------------------------------------------
/compiler-tools-derive/src/gen/full_regex.rs:
--------------------------------------------------------------------------------
use std::collections::BTreeMap;

use proc_macro2::{Ident, TokenStream};
use quote::{format_ident, quote};

use crate::{construct_variant, flatten, TokenParseData};

pub(crate) fn gen_full_regex(
    tokens_to_parse: &[TokenParseData],
    conflicts: &BTreeMap<(Ident, String), Vec<(Ident, String)>>,
    enum_ident: &Ident,
    parse_fns: &mut BTreeMap<usize, Vec<TokenStream>>,
) -> Result<(), TokenStream> {
    for (token_index, item) in tokens_to_parse.iter().enumerate() {
        for regex in &item.regexes {
            let key = (item.ident.clone(), regex.clone());
            let regex = format!("\\A(?:{})", regex);

            let fn_ident = format_ident!("parse_r_{}", item.ident);
            let regex_fn = quote! {
                fn #fn_ident(from: &str) -> Option<(&str, &str)> {
                    static REGEX: ::std::sync::OnceLock<::compiler_tools::regex::Regex> = ::std::sync::OnceLock::new();
                    let regex = REGEX.get_or_init(|| ::compiler_tools::regex::Regex::new(#regex).unwrap());
                    if let Some(matching) = regex.find(from) {
                        assert_eq!(matching.start(), 0);
                        Some((&from[..matching.end()], &from[matching.end()..]))
                    } else {
                        None
                    }
                }
            };

            let constructed = construct_variant(item, enum_ident);

            let span = quote! {
                ::compiler_tools::Span {
                    line_start: self.line,
                    col_start: self.col,
                    line_stop: {
                        self.line += passed.chars().filter(|x| *x == '\n').count() as u64;
                        self.line
                    },
                    //todo: handle utf8 better with newline seeking here
                    col_stop: if let Some(newline_offset) = passed.as_bytes().iter().rev().position(|x| *x == b'\n') {
                        let newline_offset = passed.len() - newline_offset;
                        self.col = (newline_offset as u64).saturating_sub(1);
                        self.col
                    } else {
                        self.col += passed.len() as u64;
                        self.col
                    },
                }
            };

            let conflicts = conflicts.get(&key).cloned().unwrap_or_default();
            let mut conflict_resolutions = vec![];
            for (ident, literal) in conflicts {
                let subitem = tokens_to_parse.iter().find(|x| x.ident == ident).expect("missing subitem");
                let constructed = construct_variant(subitem, enum_ident);

                conflict_resolutions.push(quote! {
                    #literal => return Some(::compiler_tools::Spanned {
                        token: #constructed,
                        span,
                    }),
                })
            }
            let conflict_resolutions = flatten(conflict_resolutions);

            parse_fns.entry(token_index).or_default().push(quote! {
                {
                    #regex_fn
                    if let Some((passed, remaining)) = #fn_ident(self.inner) {
                        let span = #span;
                        self.inner = remaining;
                        match passed {
                            #conflict_resolutions
                            passed => return Some(::compiler_tools::Spanned {
                                token: #constructed,
                                span,
                            }),
                        }
                    }
                }
            });
        }
    }
    Ok(())
}
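To make that expansion concrete: for a hypothetical variant `Int(&'a str)` annotated with `regex_full = "[0-9]+"`, the quoted helper above instantiates to roughly the following (a sketch; the generated code reaches the `regex` crate through the `::compiler_tools::regex` re-export rather than a direct dependency):

fn parse_r_Int(from: &str) -> Option<(&str, &str)> {
    static REGEX: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
    // compiled once on first use; "\A(?:...)" anchors the match to the start of the input,
    // so any successful find starts at offset 0
    let regex = REGEX.get_or_init(|| regex::Regex::new("\\A(?:[0-9]+)").unwrap());
    if let Some(matching) = regex.find(from) {
        assert_eq!(matching.start(), 0);
        // (matched prefix, remaining input)
        Some((&from[..matching.end()], &from[matching.end()..]))
    } else {
        None
    }
}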
--------------------------------------------------------------------------------
/compiler-tools-derive/src/gen/mod.rs:
--------------------------------------------------------------------------------
pub(super) mod class_match;
pub(super) mod display;
pub(super) mod simple_regex;

pub(super) mod full_regex;

--------------------------------------------------------------------------------
/compiler-tools-derive/src/gen/simple_regex.rs:
--------------------------------------------------------------------------------
use std::collections::BTreeMap;

use proc_macro2::{Ident, TokenStream};
use quote::{format_ident, quote};

use crate::{construct_variant, flatten, SimpleRegexData, TokenParseData};

pub(crate) fn gen_simple_regex(
    tokens_to_parse: &[TokenParseData],
    parsed: &BTreeMap<(Ident, String), SimpleRegexData>,
    conflicts: &BTreeMap<(Ident, String), Vec<(Ident, String)>>,
    enum_ident: &Ident,
    parse_fns: &mut BTreeMap<usize, Vec<TokenStream>>,
) -> Result<(), TokenStream> {
    for (token_index, item) in tokens_to_parse.iter().enumerate() {
        for simple_regex in &item.simple_regexes {
            let key = (item.ident.clone(), simple_regex.clone());
            let parsed = &parsed.get(&key).unwrap().regex;
            let fn_ident = format_ident!("parse_sr_{}", item.ident);
            let parse_fn = parsed.generate_parser(fn_ident.clone());

            let constructed = construct_variant(item, enum_ident);

            let span = if parsed.could_capture_newline() {
                quote! {
                    ::compiler_tools::Span {
                        line_start: self.line,
                        col_start: self.col,
                        line_stop: {
                            self.line += passed.chars().filter(|x| *x == '\n').count() as u64;
                            self.line
                        },
                        //todo: handle utf8 better with newline seeking here
                        col_stop: if let Some(newline_offset) = passed.as_bytes().iter().rev().position(|x| *x == b'\n') {
                            let newline_offset = passed.len() - newline_offset;
                            self.col = (newline_offset as u64).saturating_sub(1);
                            self.col
                        } else {
                            self.col += passed.len() as u64;
                            self.col
                        },
                    }
                }
            } else {
                quote! {
                    ::compiler_tools::Span {
                        line_start: self.line,
                        col_start: self.col,
                        line_stop: self.line,
                        col_stop: {
                            self.col += passed.len() as u64;
                            self.col
                        },
                    }
                }
            };

            let conflicts = conflicts.get(&key).cloned().unwrap_or_default();
            let mut conflict_resolutions = vec![];
            for (ident, literal) in conflicts {
                let subitem = tokens_to_parse.iter().find(|x| x.ident == ident).expect("missing subitem");
                let constructed = construct_variant(subitem, enum_ident);

                conflict_resolutions.push(quote! {
                    #literal => return Some(::compiler_tools::Spanned {
                        token: #constructed,
                        span,
                    }),
                })
            }
            let conflict_resolutions = flatten(conflict_resolutions);

            parse_fns.entry(token_index).or_default().push(quote! {
                {
                    #parse_fn
                    if let Some((passed, remaining)) = #fn_ident(self.inner) {
                        let span = #span;
                        self.inner = remaining;
                        match passed {
                            #conflict_resolutions
                            passed => return Some(::compiler_tools::Spanned {
                                token: #constructed,
                                span,
                            }),
                        }
                    }
                }
            });
        }
    }

    Ok(())
}

--------------------------------------------------------------------------------
/compiler-tools-derive/src/lib.rs:
--------------------------------------------------------------------------------
use std::collections::{BTreeMap, HashSet};

use indexmap::IndexMap;
use proc_macro::TokenStream;
use proc_macro2::{Ident, TokenStream as TokenStream2, TokenTree};
use quote::{format_ident, quote, quote_spanned, ToTokens, TokenStreamExt};
use regex::Regex;
use syn::{parse_macro_input, spanned::Spanned, DeriveInput, Expr, ExprLit, ExprPath, Fields, FieldsUnnamed, Lifetime, Lit, Meta, Type};

use crate::{gen::class_match::gen_class_match, lit_table::LitTable, simple_regex::SimpleRegex};

mod gen;
mod lit_table;
mod simple_regex;

#[proc_macro_attribute]
pub fn token_parse(_metadata: TokenStream, input: TokenStream) -> TokenStream {
    let ast = parse_macro_input!(input as DeriveInput);
    impl_token_parse(&ast).into()
}

fn flatten<I: ToTokens, T: IntoIterator<Item = I>>(iter: T) -> TokenStream2 {
    let mut out = quote! {};
    out.append_all(iter);
    out
}

struct TokenParseData {
    has_target: bool,
    target_needs_parse: bool,
    is_illegal: bool,
    literals: Vec<String>,
    simple_regexes: Vec<String>,
    regexes: Vec<String>,
    parse_fn: Option<String>,
    ident: Ident,
}

fn parse_attributes(input: TokenStream2) -> Option<IndexMap<String, Option<String>>> {
    let mut tokens = input.into_iter();

    let mut attributes = IndexMap::<String, Option<String>>::new();

    loop {
        let name = match tokens.next() {
            None => break,
            Some(TokenTree::Ident(ident)) => ident,
            _ => return None,
        };
        match tokens.next() {
            None => {
                attributes.insert(name.to_string(), None);
                break;
            }
            Some(TokenTree::Punct(p)) if p.as_char() == ',' => {
                attributes.insert(name.to_string(), None);
            }
            Some(TokenTree::Punct(p)) if p.as_char() == '=' => {
                let value = if let TokenTree::Literal(literal) = tokens.next()? {
                    let lit = Lit::new(literal);
                    Some(match lit {
                        Lit::Str(s) => s.value(),
                        Lit::Char(c) => c.value().to_string(),
                        _ => return None,
                    })
                } else {
                    return None;
                };
                attributes.insert(name.to_string(), value);
            }
            _ => return None,
        }
    }
    Some(attributes)
}

fn construct_variant(item: &TokenParseData, enum_ident: &Ident) -> TokenStream2 {
    let variant = &item.ident;
    if item.has_target {
        if item.target_needs_parse {
            //TODO: emit better error for parsefail
            quote! {
                #enum_ident::#variant(passed.parse().ok()?)
            }
        } else {
            quote! {
                #enum_ident::#variant(passed)
            }
        }
    } else {
        quote! {
            #enum_ident::#variant
        }
    }
}

struct SimpleRegexData {
    pub token_index: usize,
    pub regex: SimpleRegex,
}

struct RegexData {
    pub token_index: usize,
    pub regex: Regex,
}

fn impl_token_parse(input: &DeriveInput) -> proc_macro2::TokenStream {
    if input.generics.params.len() > 1 || !matches!(input.generics.params.first(), None | Some(syn::GenericParam::Lifetime(_))) {
        return quote_spanned! {
            input.generics.span() =>
            compile_error!("TokenParse can only have a single lifetime type parameter");
        };
    }
    let has_lifetime_param = input.generics.params.len() == 1;
    let original_lifetime_param = if let Some(syn::GenericParam::Lifetime(lifetime)) = input.generics.params.first() {
        Some(lifetime.lifetime.ident.clone())
    } else {
        None
    };

    let items = match &input.data {
        syn::Data::Enum(items) => items,
        _ => {
            return quote_spanned! {
                input.span() =>
                compile_error!("TokenParse can only be derived on enums");
            }
        }
    };

    let mut tokens_to_parse = vec![];
    let mut has_illegal = false;
    for variant in &items.variants {
        let mut parse_data = TokenParseData {
            has_target: false,
            target_needs_parse: false,
            is_illegal: false,
            literals: vec![],
            simple_regexes: vec![],
            regexes: vec![],
            parse_fn: None,
            ident: variant.ident.clone(),
        };

        for attribute in &variant.attrs {
            if !attribute.path().is_ident("token") {
                continue;
            }

            let Meta::List(meta) = &attribute.meta else {
                continue;
            };

            let attributes = match parse_attributes(meta.tokens.clone()) {
                Some(x) => x,
                None => {
                    return quote_spanned! {
                        attribute.span() =>
                        compile_error!("invalid attribute syntax");
                    }
                }
            };
            for (name, value) in attributes {
                if name != "illegal" && value.is_none() {
                    return quote_spanned! {
                        attribute.span() =>
                        compile_error!("missing attribute value");
                    };
                }

                match &*name {
                    "literal" => {
                        parse_data.literals.push(value.unwrap());
                    }
                    "regex" => {
                        parse_data.simple_regexes.push(value.unwrap());
                    }
                    "regex_full" => {
                        parse_data.regexes.push(value.unwrap());
                    }
                    "parse_fn" => {
                        if parse_data.parse_fn.is_some() {
                            return quote_spanned! {
                                attribute.span() =>
                                compile_error!("redefined 'parse_fn' attribute");
                            };
                        }
                        parse_data.parse_fn = Some(value.unwrap());
                    }
                    "illegal" => {
                        if value.is_some() {
                            return quote_spanned! {
                                attribute.span() =>
                                compile_error!("unexpected attribute value");
                            };
                        }
                        if parse_data.is_illegal || has_illegal {
                            return quote_spanned! {
                                attribute.span() =>
                                compile_error!("redefined 'illegal' attribute");
                            };
                        }
                        parse_data.is_illegal = true;
                        has_illegal = true;
                    }
                    _ => {
                        return quote_spanned! {
                            attribute.span() =>
                            compile_error!("unknown attribute");
                        }
                    }
                }
            }
        }
        if let Some((_, discriminant)) = &variant.discriminant {
            if let Expr::Lit(ExprLit {
                lit: Lit::Str(lit_str),
                ..
            }) = discriminant
            {
                parse_data.literals.push(lit_str.value());
            } else {
                return quote_spanned! {
                    input.span() =>
                    compile_error!("TokenParse enums cannot have non-string discriminants");
                };
            }
        }
        if parse_data.parse_fn.is_some() && (!parse_data.literals.is_empty() || !parse_data.simple_regexes.is_empty() || !parse_data.regexes.is_empty()) {
            return quote_spanned! {
                input.span() =>
                compile_error!("cannot have a 'parse_fn' attribute and a 'literal', 'regex', or 'regex_full' attribute");
            };
        }
        let has_anything =
            parse_data.parse_fn.is_some() || !parse_data.literals.is_empty() || !parse_data.simple_regexes.is_empty() || !parse_data.regexes.is_empty();
        if parse_data.is_illegal && has_anything {
            return quote_spanned! {
                input.span() =>
                compile_error!("cannot have an 'illegal' attribute and a 'literal', 'regex', 'regex_full', or 'parse_fn' attribute");
            };
        } else if !parse_data.is_illegal && !has_anything {
            return quote_spanned! {
                input.span() =>
                compile_error!("must have an enum discriminant or 'illegal', 'literal', 'regex', 'regex_full', or 'parse_fn' attribute");
            };
        }

        match &variant.fields {
            Fields::Named(_) => {
                return quote_spanned! {
                    variant.fields.span() =>
                    compile_error!("cannot have enum struct in TokenParse variant");
                };
            }
            Fields::Unnamed(FieldsUnnamed {
                unnamed,
                ..
            }) => {
                if unnamed.len() != 1 {
                    return quote_spanned! {
                        unnamed.span() =>
                        compile_error!("must have single target type in TokenParse variant");
                    };
                }
                let field = unnamed.first().unwrap();
                match &field.ty {
                    Type::Reference(ty) => {
                        if ty.mutability.is_some() {
                            return quote_spanned! {
                                unnamed.span() =>
                                compile_error!("cannot have `&mut` in TokenParse variant");
                            };
                        }
                        if !matches!(&ty.lifetime, Some(Lifetime { ident, ..}) if Some(ident) == original_lifetime_param.as_ref()) {
                            return quote_spanned! {
                                unnamed.span() =>
                                compile_error!("unexpected lifetime in TokenParse variant (use the same one as defined in enum declaration)");
                            };
                        }
                        if let Type::Path(path) = &*ty.elem {
                            if path.qself.is_some() || path.path.segments.len() != 1 || path.path.segments.first().unwrap().ident != "str" {
                                return quote_spanned! {
                                    unnamed.span() =>
                                    compile_error!("invalid type in reference for TokenParse (only &str allowed)");
                                };
                            }
                        } else {
                            return quote_spanned! {
                                unnamed.span() =>
                                compile_error!("invalid type in reference for TokenParse (only &str allowed)");
                            };
                        }
                        parse_data.has_target = true;
                    }
                    _ => {
                        parse_data.has_target = true;
                        parse_data.target_needs_parse = true;
                    }
                }
            }
            // no target
            Fields::Unit => {
                if parse_data.is_illegal {
                    return quote_spanned! {
                        variant.span() =>
                        compile_error!("'illegal' attributed tokens must have a single field (usually 'char' or '&str')");
                    };
                }
            }
        }

        tokens_to_parse.push(parse_data)
    }

    let mut simple_regexes = BTreeMap::new();
    for (token_index, item) in tokens_to_parse.iter().enumerate() {
        for simple_regex in &item.simple_regexes {
            let parsed = match SimpleRegex::parse(simple_regex) {
                Some(x) => x,
                None => {
                    return quote_spanned! {
                        item.ident.span() =>
                        compile_error!("invalid simple regex");
                    }
                }
            };
            simple_regexes.insert(
                (item.ident.clone(), simple_regex.clone()),
                SimpleRegexData {
                    token_index,
                    regex: parsed,
                },
            );
        }
    }

    let mut regexes = BTreeMap::new();
    for (token_index, item) in tokens_to_parse.iter().enumerate() {
        for regex in &item.regexes {
            let modified_regex = format!("^{}", regex);
            let parsed = match Regex::new(&*modified_regex) {
                Ok(x) => x,
                Err(_) => {
                    return quote_spanned! {
                        item.ident.span() =>
                        compile_error!("invalid regex");
                    }
                }
            };
            regexes.insert(
                (item.ident.clone(), regex.clone()),
                RegexData {
                    token_index,
                    regex: parsed,
                },
            );
        }
    }

    // (regex ident, raw regex) => (literal ident, literal)
    let mut simple_regex_ident_conflicts: BTreeMap<(Ident, String), Vec<(Ident, String)>> = BTreeMap::new();
    let mut regex_ident_conflicts: BTreeMap<(Ident, String), Vec<(Ident, String)>> = BTreeMap::new();
    let mut known_literals = HashSet::new();

    let mut parse_fns: BTreeMap<usize, Vec<TokenStream2>> = BTreeMap::new();

    let mut lit_table = LitTable::default();
    for (token_index, item) in tokens_to_parse.iter().enumerate() {
        for literal in &item.literals {
            if !known_literals.insert(literal.clone()) {
                return quote_spanned! {
                    item.ident.span() =>
                    compile_error!("conflicting literals");
                };
            }
            let mut any_matched = false;
            for ((ident, raw_regex), regex) in &simple_regexes {
                if regex.token_index > token_index && regex.regex.matches(&**literal) {
                    simple_regex_ident_conflicts
                        .entry((ident.clone(), raw_regex.clone()))
                        .or_default()
                        .push((item.ident.clone(), literal.clone()));
                    any_matched = true;
                }
            }
            if any_matched {
                continue;
            }
            for ((ident, raw_regex), regex) in &regexes {
                if regex.token_index > token_index && regex.regex.is_match(&**literal) {
                    regex_ident_conflicts
                        .entry((ident.clone(), raw_regex.clone()))
                        .or_default()
                        .push((item.ident.clone(), literal.clone()));
                    any_matched = true;
                }
            }
            if any_matched {
                continue;
            }
            lit_table.push(item, &**literal, &mut literal.chars());
        }
    }

    let lit_table_name = format_ident!("parse_lits");
    let lit_table = lit_table.emit(&lit_table_name, &input.ident);

    if let Err(e) = gen::simple_regex::gen_simple_regex(&tokens_to_parse[..], &simple_regexes, &simple_regex_ident_conflicts, &input.ident, &mut parse_fns) {
        return e;
    }
    if let Err(e) = gen::full_regex::gen_full_regex(&tokens_to_parse[..], &regex_ident_conflicts, &input.ident, &mut parse_fns) {
        return e;
    }

    let lifetime_param = if has_lifetime_param {
        quote! { <'a> }
    } else {
        quote! {}
    };
    let ident_raw = input.ident.to_string();
    let tokenizer_ident = if ident_raw.contains("Token") {
        format_ident!("{}", ident_raw.replace("Token", "Tokenizer"))
    } else {
        format_ident!("{}Tokenizer", ident_raw)
    };
    let token_ident = &input.ident;
    let vis = &input.vis;

    let display_fields = gen::display::gen_display(&tokens_to_parse[..], &input.ident);

    let illegal_emission = if let Some(illegal) = tokens_to_parse.iter().find(|x| x.is_illegal) {
        let constructor = construct_variant(illegal, &input.ident);
        quote! {
            if let Some(value) = self.inner.chars().next() {
                let span = ::compiler_tools::Span {
                    line_start: self.line,
                    col_start: self.col,
                    // a newline advances the line counter; any other char only advances the column
                    line_stop: if value != '\n' {
                        self.line
                    } else {
                        self.line += 1;
                        self.line
                    },
                    col_stop: if value != '\n' {
                        self.col += value.len_utf8() as u64;
                        self.col
                    } else {
                        self.col = 0;
                        self.col
                    },
                };
                let passed = &self.inner[..value.len_utf8()];
                self.inner = &self.inner[value.len_utf8()..];
                return Some(::compiler_tools::Spanned {
                    token: #constructor,
                    span,
                })
            } else {
                None
            }
        }
    } else {
        quote! {
            None
        }
    };

    let reinput = {
        let attrs = flatten(&input.attrs);
        let vis = &input.vis;
        let ident = &input.ident;
        let generics = &input.generics;
        let mut variants = vec![];
        for variant in &items.variants {
            let attrs = flatten(
                variant
                    .attrs
                    .iter()
                    .filter(|a| a.path().segments.len() != 1 || a.path().segments.first().unwrap().ident != "token"),
            );
            let ident = &variant.ident;
            let fields = &variant.fields;
            // discriminant ignored
            variants.push(quote! {
                #attrs
                #ident #fields,
            });
        }
        let variants = flatten(variants);
        quote! {
            #attrs
            #vis enum #ident #generics {
                #variants
            }
        }
    };

    let class_matches = gen_class_match(&tokens_to_parse[..], &input.ident);

    for (token_index, token) in tokens_to_parse.iter().enumerate() {
        if let Some(parse_fn) = &token.parse_fn {
            let path_expr: ExprPath = match syn::parse_str(&parse_fn) {
                Ok(x) => x,
                Err(_e) => {
                    parse_fns
                        .entry(token_index)
                        .or_default()
                        .push(quote! { compile_error!("can't parse path for parse_fn"); });
                    continue;
                }
            };
            let constructed = construct_variant(token, &input.ident);
            parse_fns.entry(token_index).or_default().push(quote! {
                {
                    if let Some((passed, remaining)) = #path_expr(self.inner) {
                        let span = ::compiler_tools::Span {
                            line_start: self.line,
                            col_start: self.col,
                            line_stop: {
                                self.line += passed.chars().filter(|x| *x == '\n').count() as u64;
                                self.line
                            },
                            //todo: handle utf8 better with newline seeking here
                            col_stop: if let Some(newline_offset) = passed.as_bytes().iter().rev().position(|x| *x == b'\n') {
                                let newline_offset = passed.len() - newline_offset;
                                self.col = (newline_offset as u64).saturating_sub(1);
                                self.col
                            } else {
                                self.col += passed.len() as u64;
                                self.col
                            },
                        };
                        self.inner = remaining;
                        match passed {
                            passed => return Some(::compiler_tools::Spanned {
                                token: #constructed,
                                span,
                            }),
                        }
                    }
                }
            });
        }
    }

    let lit_table_parse = quote! {
        match #lit_table_name(self.inner) {
            Some((token, remaining, newlines)) => {
                let span = ::compiler_tools::Span {
                    line_start: self.line,
                    col_start: self.col,
                    line_stop: if newlines == 0 {
                        self.line
                    } else {
                        self.line += newlines;
                        self.line
                    },
                    col_stop: if newlines == 0 {
                        self.col += (self.inner.len() - remaining.len()) as u64;
                        self.col
                    } else {
                        //todo: handle utf8 better with newline seeking here
                        let newline_offset = self.inner[..self.inner.len() - remaining.len()].as_bytes().iter().rev().position(|x| *x == b'\n').expect("malformed newline state");
                        let newline_offset = (self.inner.len() - remaining.len()) - newline_offset;
                        self.col = (newline_offset as u64).saturating_sub(1);
                        self.col
                    },
                };
                self.inner = remaining;
                return Some(::compiler_tools::Spanned {
                    token,
                    span,
                });
            },
            None => (),
        }
    };

    if let Some(lit_table_index) = tokens_to_parse.iter().position(|x| !x.literals.is_empty()) {
        parse_fns.entry(lit_table_index).or_default().push(lit_table_parse);
    }

    let parse_fns = flatten(parse_fns.into_values().flatten());

    quote! {
        #reinput

        impl #lifetime_param ::core::fmt::Display for #token_ident #lifetime_param {
            fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
                match self {
                    #display_fields
                }
            }
        }

        impl #lifetime_param ::compiler_tools::TokenExt for #token_ident #lifetime_param {
            fn matches_class(&self, other: &Self) -> bool {
                match (self, other) {
                    #class_matches
                }
            }
        }

        #vis struct #tokenizer_ident<'a> {
            line: u64,
            col: u64,
            inner: &'a str,
        }

        impl<'a> #tokenizer_ident<'a> {
            pub fn new(input: &'a str) -> Self {
                Self {
                    line: 0,
                    col: 0,
                    inner: input,
                }
            }
        }

        impl<'a> ::compiler_tools::TokenParse<'a> for #tokenizer_ident<'a> {
            type Token = #token_ident #lifetime_param;

            #[allow(non_snake_case, unreachable_patterns, unreachable_code)]
            fn next(&mut self) -> Option<::compiler_tools::Spanned<Self::Token>> {
                #lit_table
                #parse_fns
                #illegal_emission
            }
        }
    }
}
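Putting the whole expansion together, usage mirrors the integration test further down: the attribute re-emits the enum (minus the `token` attributes and discriminants) and generates a tokenizer whose name is the enum name with `Token` replaced by `Tokenizer`. A minimal sketch:

use compiler_tools::TokenParse;
use compiler_tools_derive::token_parse;

#[token_parse]
#[derive(PartialEq, Clone, Copy, Debug)]
pub enum Token<'a> {
    Plus = "+",
    #[token(regex = "[0-9]+")]
    Int(&'a str),
    #[token(illegal)]
    Illegal(char),
}

#[test]
fn minimal_usage() {
    // `Tokenizer` is generated from `Token` by the attribute above
    let mut tokenizer = Tokenizer::new("1+2");
    while let Some(next) = tokenizer.next() {
        println!("{:?}", next);
    }
}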
--------------------------------------------------------------------------------
/compiler-tools-derive/src/lit_table.rs:
--------------------------------------------------------------------------------
use std::collections::BTreeMap;

use proc_macro2::{Ident, TokenStream};
use quote::quote;

use crate::{flatten, TokenParseData};

#[derive(Default)]
pub(super) struct LitTable {
    table: BTreeMap<char, LitTable>,
    token: Option<LitTableToken>,
}

pub(super) struct LitTableToken {
    ident: Ident,
    has_target: bool,
    target_needs_parse: bool,
    literal: String,
}

impl LitTableToken {
    fn emit(&self, enum_ident: &Ident) -> TokenStream {
        let variant = &self.ident;
        let len = self.literal.len();
        let newlines = self.literal.chars().filter(|c| *c == '\n').count() as u64;

        if self.has_target {
            if self.target_needs_parse {
                //TODO: emit better error for parsefail
                quote! {
                    {
                        let (before, after) = from.split_at(#len);
                        from = after;
                        (#enum_ident::#variant(before.parse().ok()?), #newlines)
                    }
                }
            } else {
                quote! {
                    {
                        let (before, after) = from.split_at(#len);
                        from = after;
                        (#enum_ident::#variant(before), #newlines)
                    }
                }
            }
        } else {
            quote! {
                {
                    from = &from[#len..];
                    (#enum_ident::#variant, #newlines)
                }
            }
        }
    }
}

impl LitTable {
    pub(super) fn push(&mut self, item: &TokenParseData, literal: &str, remaining: &mut impl Iterator<Item = char>) {
        match remaining.next() {
            Some(c) => {
                let entry = self.table.entry(c).or_default();
                entry.push(item, literal, remaining);
            }
            None => {
                self.token = Some(LitTableToken {
                    ident: item.ident.clone(),
                    has_target: item.has_target,
                    target_needs_parse: item.target_needs_parse,
                    literal: literal.to_string(),
                })
            }
        }
    }

    fn emit_internal(&self, enum_ident: &Ident, pending_default: Option<&LitTableToken>) -> TokenStream {
        if self.table.is_empty() {
            if let Some(token) = &self.token {
                let emitted = token.emit(enum_ident);
                quote! {
                    Some(#emitted)
                }
            } else {
                quote! {
                    None
                }
            }
        } else {
            let default = self.token.as_ref().or(pending_default);
            let mut entries = vec![];
            for (c, table) in &self.table {
                let internal = table.emit_internal(enum_ident, default);
                entries.push(quote! {
                    Some(#c) => #internal,
                });
            }
            if let Some(token) = default {
                let emitted = token.emit(enum_ident);
                entries.push(quote! {
                    _ => Some(#emitted),
                });
            } else {
                entries.push(quote! {
                    _ => None,
                });
            }
            let entries = flatten(entries);

            quote! {
                match iter.next() {
                    #entries
                }
            }
        }
    }

    //TODO: straightshot optimization
    pub(super) fn emit(&self, fn_name: &Ident, enum_ident: &Ident) -> TokenStream {
        let internal = self.emit_internal(&enum_ident, None);
        // println!("{}", internal);
        quote! {
            // returns (token, remaining, newlines_skipped)
            #[inline]
            fn #fn_name(mut from: &str) -> Option<(#enum_ident, &str, u64)> {
                let start = from;
                let mut iter = from.chars();
                let (token, newlines) = #internal?;
                Some((token, from, newlines))
            }
        }
    }
}
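As a sketch of what `emit` produces: for just the two literals "+" (variant `Plus`) and "async" (variant `Async`) on a hypothetical `Token` enum, the nested `emit_internal` calls unfold into a character trie roughly like the following. Neither literal contains '\n', so `newlines` is always 0 here; a literal that is a prefix of another would additionally show up as a `_ => Some(...)` default arm at the deeper levels.

enum Token {
    Plus,
    Async,
}

#[inline]
fn parse_lits(mut from: &str) -> Option<(Token, &str, u64)> {
    let mut iter = from.chars();
    let (token, newlines) = match iter.next() {
        Some('+') => {
            from = &from[1..];
            Some((Token::Plus, 0u64))
        }
        Some('a') => match iter.next() {
            Some('s') => match iter.next() {
                Some('y') => match iter.next() {
                    Some('n') => match iter.next() {
                        Some('c') => {
                            from = &from[5..];
                            Some((Token::Async, 0u64))
                        }
                        _ => None,
                    },
                    _ => None,
                },
                _ => None,
            },
            _ => None,
        },
        _ => None,
    }?;
    Some((token, from, newlines))
}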
--------------------------------------------------------------------------------
/compiler-tools-derive/src/simple_regex/dfa.rs:
--------------------------------------------------------------------------------
use std::collections::{BTreeMap, HashMap, HashSet};

use super::nfa::{Nfa, TransitionEvent};

#[derive(Debug)]
pub struct Dfa {
    // state => [(event, state)]
    pub transitions: BTreeMap<u32, Vec<(TransitionEvent, u32)>>,
    pub final_state: u32,
}

impl Dfa {
    pub fn build(nfa: &Nfa) -> Self {
        let mut self_ = Self {
            transitions: Default::default(),
            final_state: nfa.final_state,
        };

        let mut state_set: Vec<(u32, Vec<(TransitionEvent, u32)>)> = vec![(0u32, vec![])];
        let mut observed = HashSet::new();
        while let Some((state, shadows)) = state_set.pop() {
            // println!("oshadow {} {:?}", state, shadows);
            if state == nfa.final_state {
                continue;
            }
            let mut epsilon_closure = nfa.epsilon_closure(state);
            epsilon_closure.extend(shadows);

            let mut epsilon_closure = epsilon_closure.into_iter().enumerate().collect::<BTreeMap<_, _>>();

            let mut shadowed_closures: HashMap<usize, Vec<usize>> = HashMap::new();
            for (i, (transition1, _)) in epsilon_closure.iter() {
                for (j, (transition2, _)) in epsilon_closure.iter() {
                    if j == i {
                        continue;
                    }
                    if transition1.completely_shadows(transition2) {
                        shadowed_closures.entry(*j).or_default().push(*i);
                    }
                }
            }
            let mut original_shadowed_closures: HashMap<u32, Vec<(TransitionEvent, u32)>> = shadowed_closures
                .iter()
                .map(|(shadowed, shadows)| {
                    (
                        epsilon_closure.get(shadowed).unwrap().clone().1,
                        shadows.into_iter().map(|shadow| epsilon_closure.get(shadow).unwrap().clone()).collect(),
                    )
                })
                .collect();

            let total = epsilon_closure.len();
            let mut emitted_closures = vec![];
            while emitted_closures.len() < total {
                for i in 0..total {
                    if !epsilon_closure.contains_key(&i) {
                        continue;
                    }
                    if !shadowed_closures.contains_key(&i) || shadowed_closures.get(&i).unwrap().is_empty() {
                        emitted_closures.push(epsilon_closure.remove(&i).unwrap());
                        for (_shadowed, shadowing) in shadowed_closures.iter_mut() {
                            shadowing.retain(|x| *x != i);
                        }
                    }
                }
            }
            emitted_closures.reverse();

            // println!("eps closure {} = {:?}", state, emitted_closures);
            for (_, target) in &emitted_closures {
                if observed.contains(target) {
                    continue;
                }
                observed.insert(*target);
                state_set.push((*target, original_shadowed_closures.remove(target).unwrap_or_default()));
            }
            self_.transitions.insert(state, emitted_closures);
        }

        self_
    }
}

#[cfg(test)]
mod tests {
    use crate::simple_regex::SimpleRegexAst;

    use super::*;

    #[test]
    fn test_dfa() {
        let regex = SimpleRegexAst::parse("/\\*.*\\*/").unwrap();
        let nfa = Nfa::build(&regex);
        println!("{:?}", nfa);
        let dfa = Dfa::build(&nfa);
        println!("{:?}", dfa);
    }

    #[test]
    fn test_dfa_ident() {
        let regex = SimpleRegexAst::parse("[a-z][a-zA-Z0-9_]*").unwrap();
        let nfa = Nfa::build(&regex);
        println!("{:?}", nfa);
        let dfa = Dfa::build(&nfa);
        println!("{:?}", dfa);
    }
}

--------------------------------------------------------------------------------
/compiler-tools-derive/src/simple_regex/generate.rs:
--------------------------------------------------------------------------------
use quote::format_ident;

use super::*;

impl SimpleRegex {
    pub fn generate_parser(&self, fn_name: Ident) -> TokenStream {
        let mut state_fns = vec![];
        let mut state_matches = vec![];
        for (state, transitions) in &self.dfa.transitions {
            let state_fn = format_ident!("state_{}", state);
            let mut transition_matches = vec![];
            for (transition, target) in transitions {
                let match_expr = match transition {
                    nfa::TransitionEvent::Epsilon => unreachable!(),
                    nfa::TransitionEvent::End => quote! { _ => ::compiler_tools::MatchResult::MatchedEmpty(#target), },
                    nfa::TransitionEvent::Char(c) => quote! { Some(#c) => ::compiler_tools::MatchResult::Matched(#target), },
                    nfa::TransitionEvent::Chars(inverted, group) => {
                        let mut matching = vec![];
                        for entry in group {
                            match entry {
                                GroupEntry::Char(c) => {
                                    if !matching.is_empty() {
                                        matching.push(quote! { | })
                                    }
                                    matching.push(quote! { #c });
                                }
                                GroupEntry::Range(start, end) => {
                                    if !matching.is_empty() {
                                        matching.push(quote! { | })
                                    }
                                    matching.push(quote! { #start ..= #end });
                                }
                            }
                        }
                        let matching_empty = matching.is_empty();

                        let matching = flatten(matching);
                        if *inverted {
                            if matching_empty {
                                quote! {
                                    _ => ::compiler_tools::MatchResult::Matched(#target),
                                }
                            } else {
                                quote! {
                                    Some(c) if !matches!(c, #matching) => ::compiler_tools::MatchResult::Matched(#target),
                                }
                            }
                        } else {
                            quote! {
                                Some(c) if matches!(c, #matching) => ::compiler_tools::MatchResult::Matched(#target),
                            }
                        }
                    }
                };
                transition_matches.push(match_expr);
            }
            let transition_matches = flatten(transition_matches);

            state_fns.push(quote! {
                #[inline]
                fn #state_fn(target: Option<char>) -> ::compiler_tools::MatchResult {
                    match target {
                        #transition_matches
                        _ => ::compiler_tools::MatchResult::NoMatch,
                    }
                }
            });
            state_matches.push(quote! {
                #state => #state_fn(c),
            });
        }
        /*
        let mut state = 0u32;
        let mut chars = from.chars();
        while let Some(char) = chars.next() {
            for (transition, target) in self.dfa.transitions.get(&state).unwrap() {
                if transition.matches(char) {
                    state = *target;
                    if state == self.dfa.final_state {
                        return true;
                    }
                }
            }
        }
        false
        */
        let state_fns = flatten(state_fns);
        let state_matches = flatten(state_matches);
        let final_state = self.dfa.final_state;
        quote! {
            fn #fn_name(from: &str) -> Option<(&str, &str)> {
                #state_fns
                let mut counter = 0usize;
                let mut state = 0u32;
                let mut chars = from.chars();
                loop {
                    let c = chars.next();
                    let next_state = match state {
                        #state_matches
                        _ => ::compiler_tools::MatchResult::NoMatch,
                    };
                    match next_state {
                        ::compiler_tools::MatchResult::Matched(next_state) => {
                            state = next_state;
                            if let Some(c) = c {
                                counter += c.len_utf8();
                            }
                            if next_state == #final_state {
                                return Some((&from[..counter], &from[counter..]));
                            }
                        },
                        ::compiler_tools::MatchResult::MatchedEmpty(next_state) => {
                            state = next_state;
                            //TODO: backtrack iterator (but this only occurs at End sequence right now)
                            if next_state == #final_state {
                                return Some((&from[..counter], &from[counter..]));
                            }
                        },
                        ::compiler_tools::MatchResult::NoMatch => return None,
                    }
                }
                None
            }
        }
    }
}

--------------------------------------------------------------------------------
/compiler-tools-derive/src/simple_regex/matching.rs:
--------------------------------------------------------------------------------
use super::*;

impl SimpleRegex {
    pub fn could_capture_newline(&self) -> bool {
        for atom in &self.ast.atoms {
            match &atom.atom {
                Atom::Literal(lit) => {
                    if lit.contains('\n') {
                        return true;
                    }
                }
                Atom::Group(inverted, entries) => {
                    if *inverted {
                        let excludes_newline = entries.iter().any(|entry| match entry {
                            GroupEntry::Char(c) => *c == '\n',
                            GroupEntry::Range(start, end) => *start <= '\n' && *end >= '\n',
                        });
                        // an inverted group matches '\n' only if nothing in it excludes '\n';
                        // if it does exclude '\n', keep scanning the remaining atoms
                        if !excludes_newline {
                            return true;
                        }
                    } else {
                        for entry in entries {
                            let matched = match entry {
                                GroupEntry::Char(c) => *c == '\n',
                                GroupEntry::Range(start, end) => *start <= '\n' && *end >= '\n',
                            };
                            if matched {
                                return true;
                            }
                        }
                    }
                }
            }
        }
        false
    }

    pub fn matches(&self, from: &str) -> bool {
        let mut state = 0u32;
        let mut chars = from.chars();
        while let Some(char) = chars.next() {
            for (transition, target) in self.dfa.transitions.get(&state).unwrap() {
                if transition.matches(char) {
                    state = *target;
                    if state == self.dfa.final_state {
                        return true;
                    }
                }
            }
        }
        false
    }
}
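Two concrete cases of `could_capture_newline`, written as a unit test that could sit in this module (the patterns are taken from the integration tests further down):

#[cfg(test)]
mod newline_tests {
    use super::*;

    #[test]
    fn test_could_capture_newline() {
        // "[ \n]+" has a non-inverted group containing '\n', so it can match a newline
        assert!(SimpleRegex::parse("[ \n]+").unwrap().could_capture_newline());
        // "//[^\n]*" is a literal followed by an inverted group that excludes '\n',
        // so no atom can ever consume a newline
        assert!(!SimpleRegex::parse("//[^\n]*").unwrap().could_capture_newline());
    }
}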
--------------------------------------------------------------------------------
/compiler-tools-derive/src/simple_regex/mod.rs:
--------------------------------------------------------------------------------
use proc_macro2::{Ident, TokenStream};
use quote::quote;

use crate::flatten;

use self::{dfa::Dfa, nfa::Nfa};

mod dfa;
mod generate;
mod matching;
mod nfa;
mod parse;

#[derive(Debug)]
pub enum Repeat {
    Once,
    ZeroOrOnce,
    OnceOrMore,
    ZeroOrMore,
}

// TODO: we should support classes (i.e. unicode ident_start)
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub enum GroupEntry {
    Char(char),
    Range(char, char),
}

#[derive(Debug)]
pub enum Atom {
    Literal(String),
    // (inverted, items)
    Group(bool, Vec<GroupEntry>),
}

#[derive(Debug)]
pub struct AtomRepeat {
    pub atom: Atom,
    pub repeat: Repeat,
}

pub struct SimpleRegexAst {
    pub atoms: Vec<AtomRepeat>,
}

pub struct SimpleRegex {
    pub ast: SimpleRegexAst,
    pub dfa: Dfa,
}

impl SimpleRegex {
    pub fn parse(from: &str) -> Option<Self> {
        let parsed = SimpleRegexAst::parse(from)?;
        let nfa = Nfa::build(&parsed);
        Some(SimpleRegex {
            ast: parsed,
            dfa: Dfa::build(&nfa),
        })
    }
}

--------------------------------------------------------------------------------
/compiler-tools-derive/src/simple_regex/nfa.rs:
--------------------------------------------------------------------------------
use std::collections::{BTreeMap, BTreeSet};

use super::{Atom, GroupEntry, Repeat, SimpleRegexAst};

#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum TransitionEvent {
    Epsilon,
    Char(char),
    // (inverted, set)
    Chars(bool, Vec<GroupEntry>),
    End,
}

impl TransitionEvent {
    pub fn matches(&self, target: char) -> bool {
        match self {
            TransitionEvent::Epsilon => false,
            TransitionEvent::Char(c) => *c == target,
            TransitionEvent::Chars(inverted, group) => {
                for entry in group {
                    match entry {
                        GroupEntry::Char(c) => {
                            if *c == target {
                                return !*inverted;
                            }
                        }
                        GroupEntry::Range(start, end) => {
                            if *start <= target && *end >= target {
                                return !*inverted;
                            }
                        }
                    }
                }
                *inverted
            }
            TransitionEvent::End => true,
        }
    }

    pub fn completely_shadows(&self, other: &TransitionEvent) -> bool {
        match (self, other) {
            (TransitionEvent::Epsilon, _) | (_, TransitionEvent::Epsilon) | (_, TransitionEvent::End) => false,
            (TransitionEvent::End, _) => true,
            (TransitionEvent::Char(c1), TransitionEvent::Char(c2)) => c1 == c2,
            //TODO: this could be true, investigate
            (TransitionEvent::Char(_), TransitionEvent::Chars(_, _)) => false,
            (e1, TransitionEvent::Char(c2)) => e1.matches(*c2),
            //TODO: this could be true, investigate
            (TransitionEvent::Chars(_, _), TransitionEvent::Chars(_, _)) => false,
        }
    }
}

#[derive(Debug)]
pub struct Nfa {
    // state => [(event, state)]
    pub transitions: BTreeMap<u32, Vec<(TransitionEvent, u32)>>,
    pub final_state: u32,
}

fn event_from_atom(atom: &Atom) -> Vec<TransitionEvent> {
    match atom {
        Atom::Literal(l) => l.chars().map(|x| TransitionEvent::Char(x)).collect(),
        Atom::Group(inverted, entries) => {
            vec![TransitionEvent::Chars(*inverted, entries.clone())]
        }
    }
}

impl Nfa {
    pub fn epsilon_closure(&self, state: u32) -> Vec<(TransitionEvent, u32)> {
        let mut output: Vec<(TransitionEvent, u32)> = vec![];
        let mut stack = vec![state];
        let mut observed = BTreeSet::new();
        while let Some(state) = stack.pop() {
            if state == self.final_state {
                output.push((TransitionEvent::End, state));
                continue;
            }
            let transitions = self.transitions.get(&state).expect("invalid state");
            for (event, target) in transitions {
                if observed.contains(&(event, target)) {
                    continue;
                }
                observed.insert((event, target));

                match event {
                    TransitionEvent::Epsilon => stack.push(*target),
                    transition => {
                        output.push((transition.clone(), *target));
                    }
                }
            }
        }
        output
    }

    pub fn build(from: &SimpleRegexAst) -> Self {
        let mut self_ = Self {
            transitions: Default::default(),
            final_state: 0,
        };

        let mut current_state = 0u32;
        for atom in &from.atoms {
            let mut events = event_from_atom(&atom.atom);

            match atom.repeat {
                Repeat::Once => {
                    for event in events {
                        self_.transitions.insert(current_state, vec![(event, current_state + 1)]);
                        current_state += 1;
                    }
                }
                Repeat::ZeroOrOnce => {
                    let first_event = events.remove(0);
                    self_.transitions.insert(
                        current_state,
                        vec![
                            (first_event, current_state + 1),
                            (TransitionEvent::Epsilon, current_state + events.len() as u32 + 1),
                        ],
                    );
                    current_state += 1;

                    for event in events {
                        self_.transitions.insert(current_state, vec![(event, current_state + 1)]);
                        current_state += 1;
                    }
                }
                Repeat::OnceOrMore => {
                    for event in events.iter().cloned() {
                        self_.transitions.insert(current_state, vec![(event, current_state + 1)]);
                        current_state += 1;
                    }

                    let first_event = events.remove(0);
                    let initialization = current_state;
                    let next_state = if events.is_empty() { initialization } else { current_state + 1 };
                    self_
                        .transitions
                        .insert(current_state, vec![(first_event, next_state), (TransitionEvent::Epsilon, current_state + events.len() as u32 + 1)]);
                    current_state += 1;

                    let len = events.len();
                    for (i, event) in events.into_iter().enumerate() {
                        let next_state = if i + 1 == len { initialization } else { current_state + 1 };
                        self_.transitions.insert(current_state, vec![(event, next_state)]);
                        current_state += 1;
                    }
                }
                Repeat::ZeroOrMore => {
                    let first_event = events.remove(0);
                    let initialization = current_state;
                    let next_state = if events.is_empty() { initialization } else { current_state + 1 };
                    self_
                        .transitions
                        .insert(current_state, vec![(first_event, next_state), (TransitionEvent::Epsilon, current_state + events.len() as u32 + 1)]);
                    current_state += 1;

                    let len = events.len();
                    for (i, event) in events.into_iter().enumerate() {
                        let next_state = if i + 1 == len { initialization } else { current_state + 1 };
                        self_.transitions.insert(current_state, vec![(event, next_state)]);
                        current_state += 1;
                    }
                }
            }
        }
        self_.final_state = current_state;

        self_
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_nfa() {
        let regex = SimpleRegexAst::parse("[a-z][a-zA-Z0-9_]*").unwrap();
        let nfa = Nfa::build(&regex);
        println!("{:?}", nfa);
    }
}
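As a worked example of the construction above: for the pattern `ab*` the AST is `Literal("a")` matched once followed by `Literal("b")` matched zero-or-more, which yields `0 --'a'--> 1`, a `'b'` self-loop on state 1, and an epsilon edge from state 1 to the final state 2. A hypothetical test capturing that (written as if it lived in the `tests` module above):

#[test]
fn test_nfa_ab_star() {
    let regex = SimpleRegexAst::parse("ab*").unwrap();
    let nfa = Nfa::build(&regex);
    // state 0 consumes 'a'
    assert_eq!(nfa.transitions[&0], vec![(TransitionEvent::Char('a'), 1)]);
    // state 1 loops on 'b' or epsilon-skips to the final state
    assert_eq!(
        nfa.transitions[&1],
        vec![(TransitionEvent::Char('b'), 1), (TransitionEvent::Epsilon, 2)]
    );
    assert_eq!(nfa.final_state, 2);
}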
--------------------------------------------------------------------------------
/compiler-tools-derive/src/simple_regex/parse.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 | 
3 | //todo: unit tests
4 | fn parse_group(iter: &mut impl Iterator<Item = char>) -> Option<Atom> {
5 |     let mut group_entries = vec![];
6 |     let mut escaped = false;
7 |     let mut in_range = false;
8 |     let mut inverted = false;
9 |     let mut first = true;
10 |     loop {
11 |         match iter.next() {
12 |             None => return None,
13 |             Some(']') if !escaped => break,
14 |             Some('\\') if !escaped => {
15 |                 escaped = !escaped;
16 |             }
17 |             Some('-') if !escaped && matches!(group_entries.last(), Some(GroupEntry::Char(_))) => {
18 |                 group_entries.push(GroupEntry::Char('-'));
19 |                 in_range = true;
20 |             }
21 |             Some('^') if !escaped && first => {
22 |                 inverted = true;
23 |             }
24 |             Some(c) => {
25 |                 if in_range {
26 |                     assert_eq!(group_entries.pop(), Some(GroupEntry::Char('-')));
27 |                     let start = group_entries.pop().expect("malformed state during group formation");
28 |                     let start = if let GroupEntry::Char(c) = start {
29 |                         c
30 |                     } else {
31 |                         panic!("malformed state during group formation");
32 |                     };
33 |                     in_range = false;
34 |                     group_entries.push(GroupEntry::Range(start, c))
35 |                 } else {
36 |                     group_entries.push(GroupEntry::Char(c));
37 |                 }
38 |                 escaped = false;
39 |             }
40 |         }
41 |         first = false;
42 |     }
43 |     if escaped {
44 |         if in_range {
45 |             assert_eq!(group_entries.pop(), Some(GroupEntry::Char('-')));
46 |             let start = group_entries.pop().expect("malformed state during group formation");
47 |             let start = if let GroupEntry::Char(c) = start {
48 |                 c
49 |             } else {
50 |                 panic!("malformed state during group formation");
51 |             };
52 |             group_entries.push(GroupEntry::Range(start, '\\'))
53 |         } else {
54 |             group_entries.push(GroupEntry::Char('\\'));
55 |         }
56 |     }
57 | 
58 |     Some(Atom::Group(inverted, group_entries))
59 | }
60 | 
61 | impl SimpleRegexAst {
62 |     pub fn parse(from: &str) -> Option<Self> {
63 |         let mut iter = from.chars();
64 |         let mut atoms = vec![];
65 |         let mut escaped = false;
66 |         let push_lit = |atoms: &mut Vec<AtomRepeat>, c: char| {
67 |             if let Some(AtomRepeat {
68 |                 atom: Atom::Literal(literal),
69 |                 repeat: Repeat::Once,
70 |             }) = atoms.last_mut()
71 |             {
72 |                 literal.push(c);
73 |             } else {
74 |                 atoms.push(AtomRepeat {
75 |                     atom: Atom::Literal(c.to_string()),
76 |                     repeat: Repeat::Once,
77 |                 });
78 |             }
79 |         };
80 |         while let Some(next) = iter.next() {
81 |             match next {
82 |                 '\\' if !escaped => {
83 |                     escaped = !escaped;
84 |                 }
85 |                 '[' if !escaped => {
86 |                     atoms.push(AtomRepeat {
87 |                         atom: parse_group(&mut iter)?,
88 |                         repeat: Repeat::Once,
89 |                     });
90 |                 }
91 |                 '*' if !escaped && !atoms.is_empty() => {
92 |                     let last_atom = atoms.last_mut().unwrap();
93 |                     if !matches!(last_atom.repeat, Repeat::Once) {
94 |                         push_lit(&mut atoms, '*');
95 |                         continue;
96 |                     }
97 |                     let atom = match &mut last_atom.atom {
98 |                         Atom::Literal(lit) => Atom::Literal(lit.pop().unwrap().to_string()),
99 |                         Atom::Group(..) => atoms.pop().unwrap().atom,
100 |                     };
101 |                     atoms.push(AtomRepeat {
102 |                         atom,
103 |                         repeat: Repeat::ZeroOrMore,
104 |                     })
105 |                 }
106 |                 '+' if !escaped && !atoms.is_empty() => {
107 |                     let last_atom = atoms.last_mut().unwrap();
108 |                     if !matches!(last_atom.repeat, Repeat::Once) {
109 |                         push_lit(&mut atoms, '+');
110 |                         continue;
111 |                     }
112 |                     let atom = match &mut last_atom.atom {
113 |                         Atom::Literal(lit) => Atom::Literal(lit.pop().unwrap().to_string()),
114 |                         Atom::Group(..) => atoms.pop().unwrap().atom,
115 |                     };
116 |                     atoms.push(AtomRepeat {
117 |                         atom,
118 |                         repeat: Repeat::OnceOrMore,
119 |                     })
120 |                 }
121 |                 '?' if !escaped && !atoms.is_empty() => {
122 |                     let last_atom = atoms.last_mut().unwrap();
123 |                     if !matches!(last_atom.repeat, Repeat::Once) {
124 |                         push_lit(&mut atoms, '?');
125 |                         continue;
126 |                     }
127 |                     let atom = match &mut last_atom.atom {
128 |                         Atom::Literal(lit) => Atom::Literal(lit.pop().unwrap().to_string()),
129 |                         Atom::Group(..) => atoms.pop().unwrap().atom,
130 |                     };
131 |                     atoms.push(AtomRepeat {
132 |                         atom,
133 |                         repeat: Repeat::ZeroOrOnce,
134 |                     })
135 |                 }
136 |                 '.' if !escaped => atoms.push(AtomRepeat {
137 |                     atom: Atom::Group(true, vec![]),
138 |                     repeat: Repeat::Once,
139 |                 }),
140 |                 c => {
141 |                     push_lit(&mut atoms, c);
142 |                     escaped = false;
143 |                 }
144 |             }
145 |         }
146 |         if escaped {
147 |             push_lit(&mut atoms, '\\');
148 |         }
149 |         Some(SimpleRegexAst {
150 |             atoms,
151 |         })
152 |     }
153 | }
154 | 
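
One subtlety worth calling out: consecutive plain characters merge into a single literal atom, and a trailing `*`, `+`, or `?` applies only to the final character, which `parse` peels off into its own atom. A worked example (hypothetical, not in the repository):

    let ast = SimpleRegexAst::parse("ab*").unwrap();
    // atoms[0]: Literal("a"), Repeat::Once
    // atoms[1]: Literal("b"), Repeat::ZeroOrMore
    assert_eq!(ast.atoms.len(), 2);
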
--------------------------------------------------------------------------------
/compiler-tools-derive/tests/integration.rs:
--------------------------------------------------------------------------------
1 | use compiler_tools::TokenParse;
2 | use compiler_tools_derive::token_parse;
3 | 
4 | #[token_parse]
5 | #[derive(PartialEq, Clone, Copy, Debug)]
6 | pub enum Token<'a> {
7 |     #[token(regex = "%[0-9]+")]
8 |     PercentInt(&'a str),
9 |     Async = "async",
10 |     Await = "await",
11 |     AwaitYe = "awaitye",
12 |     Percent = "%",
13 |     Plus = "+",
14 |     Minus = "-",
15 |     #[token(parse_fn = "compiler_tools::util::parse_str::<'\\''>")]
16 |     String(&'a str),
17 |     #[token(regex = "[0-9]+")]
18 |     Int(i32),
19 |     #[token(regex = "awa[a-z]+")]
20 |     AwaIdent(&'a str),
21 |     #[token(regex = "[a-z][a-zA-Z0-9_]*")]
22 |     Ident(&'a str),
23 |     #[token(regex = "//[^\n]*")]
24 |     Comment(&'a str),
25 |     #[token(regex = "/\\*.*\\*/")]
26 |     CommentBlock(&'a str),
27 |     #[token(regex = "[ \n]+")]
28 |     Whitespace,
29 |     #[token(illegal)]
30 |     Illegal(char),
31 | }
32 | 
33 | #[test]
34 | fn test_token() {
35 |     let mut tokenizer = Tokenizer::new(
36 |         r#"async%+await+
37 | +%awaitye
38 | await
39 | awaye
40 | %234
41 | test_ident+
42 | awaityematies
43 | //test comment
44 | 1234
45 | -1234
46 | 'test str'
47 | 'test '' str'
48 | 'test \d str'
49 | 'test \' str'
50 | /* test
51 | *
52 | block */
53 | new_ident
54 | //comment end"#,
55 |     );
56 |     while let Some(next) = tokenizer.next() {
57 |         println!("{:?}", next);
58 |     }
59 | }
60 | 
61 | #[test]
62 | fn test_token_illegal() {
63 |     let mut tokenizer = Tokenizer::new(
64 |         r#"async%+await+
65 | +%awaitye
66 | ^
67 | *
68 | 1234
69 | -1234
70 | async
71 | await
72 | "#,
73 |     );
74 |     while let Some(next) = tokenizer.next() {
75 |         println!("{:?}", next);
76 |     }
77 | }
--------------------------------------------------------------------------------
/compiler-tools-derive/tests/regex_bench.rs:
--------------------------------------------------------------------------------
1 | use std::time::{Duration, Instant};
2 | 
3 | use compiler_tools::TokenParse;
4 | use compiler_tools_derive::token_parse;
5 | 
6 | #[token_parse]
7 | #[derive(PartialEq, Clone, Copy, Debug)]
8 | pub enum TokenSimple<'a> {
9 |     Async = "async",
10 |     Plus = "+",
11 |     #[token(regex = "(?i)\\*|\\+")]
12 |     OrTest(&'a str),
13 |     #[token(regex = "[a-z][a-zA-Z0-9_]*")]
14 |     Ident(&'a str),
15 |     #[token(regex = "/\\*.*\\*/")]
16 |     CommentBlock(&'a str),
17 | }
18 | 
19 | #[token_parse]
20 | #[derive(PartialEq, Clone, Copy, Debug)]
21 | pub enum TokenFull<'a> {
22 |     Async = "async",
23 |     Plus = "+",
24 |     #[token(regex_full = "[a-z][a-zA-Z0-9_]*")]
25 |     Ident(&'a str),
26 |     #[token(regex_full = "/\\*.*?\\*/")]
27 |     CommentBlock(&'a str),
28 | }
29 | 
30 | fn duration_ms(duration: Duration) -> f64 {
31 |     duration.as_secs_f64() * 1000.0
32 | }
33 | 
34 | const TEST_COUNT: usize = 100000;
35 | 
36 | #[test]
37 | fn test_regex_or() {
38 |     let str = "sdf+sdfsdf*+*sdfsdf+*sdfsdf+*sdf";
39 | 
40 |     let mut tokenizer = TokenizerSimple::new(str);
41 |     while tokenizer.next().is_some() {}
42 | }
43 | 
44 | // cargo test --release --package compiler-tools-derive --test regex_bench -- bench_simple --exact --nocapture
45 | // took 1.21 ms for 100000 idents @ 0.0000 ms/ident
46 | // took 4.99 ms for 100000 idents @ 0.0000 ms/ident
47 | #[test]
48 | fn bench_simple() {
49 |     let idents = "test_ide123nt+".repeat(TEST_COUNT);
50 | 
51 |     let mut tokenizer = TokenizerSimple::new(&*idents);
52 |     let start = Instant::now();
53 |     for _ in 0..TEST_COUNT {
54 |         assert!(tokenizer.next().is_some());
55 |     }
56 |     let elapsed = duration_ms(start.elapsed());
57 |     println!("took {:.02} ms for {} idents @ {:.04} ms/ident", elapsed, TEST_COUNT, elapsed / TEST_COUNT as f64);
58 | 
59 |     let idents = "/* test * block */+".repeat(TEST_COUNT);
60 | 
61 |     let mut tokenizer = TokenizerSimple::new(&*idents);
62 |     let start = Instant::now();
63 |     for _ in 0..TEST_COUNT {
64 |         assert!(tokenizer.next().is_some());
65 |     }
66 |     let elapsed = duration_ms(start.elapsed());
67 |     println!("took {:.02} ms for {} idents @ {:.04} ms/ident", elapsed, TEST_COUNT, elapsed / TEST_COUNT as f64);
68 | }
69 | 
70 | // cargo test --release --package compiler-tools-derive --test regex_bench -- bench_full --exact --nocapture
71 | // took 10.61 ms for 100000 idents @ 0.0001 ms/ident
72 | // took 14.23 ms for 100000 idents @ 0.0001 ms/ident
73 | #[test]
74 | fn bench_full() {
75 |     let idents = "test_ide123nt+".repeat(TEST_COUNT);
76 | 
77 |     let mut tokenizer = TokenizerFull::new(&*idents);
78 |     let start = Instant::now();
79 |     for _ in 0..TEST_COUNT {
80 |         assert!(tokenizer.next().is_some());
81 |     }
82 |     let elapsed = duration_ms(start.elapsed());
83 |     println!("took {:.02} ms for {} idents @ {:.04} ms/ident", elapsed, TEST_COUNT, elapsed / TEST_COUNT as f64);
84 | 
85 |     let idents = "/* test * block */+".repeat(TEST_COUNT);
86 | 
87 |     let mut tokenizer = TokenizerFull::new(&*idents);
88 |     let start = Instant::now();
89 |     for _ in 0..TEST_COUNT {
90 |         assert!(tokenizer.next().is_some());
91 |     }
92 |     let elapsed = duration_ms(start.elapsed());
93 |     println!("took {:.02} ms for {} idents @ {:.04} ms/ident", elapsed, TEST_COUNT, elapsed / TEST_COUNT as f64);
94 | }
95 | 
--------------------------------------------------------------------------------
/compiler-tools/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "compiler-tools"
3 | version = "0.2.0"
4 | edition = "2021"
5 | authors = ["Protryon "]
6 | license = "MIT OR Apache-2.0"
7 | repository = "https://github.com/Protryon/compiler-tools"
8 | description = "A proc-macro for deriving powerful and fast tokenizers with compile-time regex"
9 | keywords = [ "compiler", "parser", "generator" ]
10 | rust-version = "1.75.0"
11 | 
12 | [dependencies]
13 | serde = { version = "1.0", optional = true, features = ["derive"] }
14 | regex = { version = "1.10", optional = true }
15 | 
16 | [features]
17 | default = ["serde", "use_regex"]
18 | use_regex = ["regex"]
--------------------------------------------------------------------------------
/compiler-tools/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod tokenizer;
2 | pub use tokenizer::*;
3 | 
4 | pub mod span;
5 | pub use span::*;
6 | 
7 | pub mod misc;
8 | pub use misc::*;
9 | 
10 | pub mod util;
11 | 
12 | #[cfg(feature = "use_regex")]
13 | #[doc(hidden)]
14 | pub use regex;
15 | 
--------------------------------------------------------------------------------
/compiler-tools/src/misc.rs:
--------------------------------------------------------------------------------
1 | /// Used by simple_regex generated code
2 | pub enum MatchResult {
3 |     Matched(u32),
4 |     MatchedEmpty(u32),
5 |     NoMatch,
6 | }
7 | 
--------------------------------------------------------------------------------
/compiler-tools/src/span.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 |     fmt::{self, Debug, Display},
3 |     ops::{Deref, DerefMut},
4 | };
5 | 
6 | #[derive(Clone, Debug, Copy, Default)]
7 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
8 | pub struct Span {
9 |     pub line_start: u64,
10 |     pub line_stop: u64,
11 |     pub col_start: u64,
12 |     pub col_stop: u64,
13 | }
14 | 
15 | impl PartialEq for Span {
16 |     fn eq(&self, _other: &Span) -> bool {
17 |         true
18 |     }
19 | }
20 | 
21 | impl std::hash::Hash for Span {
22 |     fn hash<H: std::hash::Hasher>(&self, _state: &mut H) {}
23 | }
24 | 
25 | impl fmt::Display for Span {
26 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
27 |         if self.line_start == self.line_stop {
28 |             write!(f, "{}:{}-{}", self.line_start, self.col_start, self.col_stop)
29 |         } else {
30 |             write!(f, "{}:{}-{}:{}", self.line_start, self.col_start, self.line_stop, self.col_stop)
31 |         }
32 |     }
33 | }
34 | 
35 | impl std::ops::Add for Span {
36 |     type Output = Self;
37 | 
38 |     fn add(self, other: Self) -> Self {
39 |         if self.line_start == other.line_stop {
40 |             Span {
41 |                 line_start: self.line_start,
42 |                 line_stop: self.line_stop,
43 |                 col_start: self.col_start.min(other.col_start),
44 |                 col_stop: self.col_stop.max(other.col_stop),
45 |             }
46 |         } else if self.line_start < other.line_start {
47 |             Span {
48 |                 line_start: self.line_start,
49 |                 line_stop: other.line_stop,
50 |                 col_start: self.col_start,
51 |                 col_stop: other.col_stop,
52 |             }
53 |         } else {
54 |             Span {
55 |                 line_start: other.line_start,
56 |                 line_stop: self.line_stop,
57 |                 col_start: other.col_start,
58 |                 col_stop: self.col_stop,
59 |             }
60 |         }
61 |     }
62 | }
63 | 
64 | impl std::ops::AddAssign for Span {
65 |     fn add_assign(&mut self, rhs: Self) {
66 |         *self = *self + rhs;
67 |     }
68 | }
69 | 
70 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
71 | #[derive(Clone, Copy)]
72 | pub struct Spanned<T> {
73 |     pub token: T,
74 |     pub span: Span,
75 | }
76 | 
77 | impl<T> Deref for Spanned<T> {
78 |     type Target = T;
79 | 
80 |     fn deref(&self) -> &Self::Target {
81 |         &self.token
82 |     }
83 | }
84 | 
85 | impl<T> DerefMut for Spanned<T> {
86 |     fn deref_mut(&mut self) -> &mut Self::Target {
87 |         &mut self.token
88 |     }
89 | }
90 | 
91 | impl<T: Display> Display for Spanned<T> {
92 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
93 |         write!(f, "'{}' @ {}", self.token, self.span)
94 |     }
95 | }
96 | 
97 | impl<T: Debug> Debug for Spanned<T> {
98 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
99 |         write!(f, "'{:?}' @ {}", self.token, self.span)
100 |     }
101 | }
102 | 
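
A small sketch (not in the repository) of the `Add` behavior defined above: merging two spans on the same line widens the column range, and `Display` renders the single-line form. Note also that `PartialEq` and `Hash` for `Span` deliberately ignore position, so any two spans compare equal and carrying a span never perturbs comparisons of structures that embed it.

    let a = Span { line_start: 1, line_stop: 1, col_start: 0, col_stop: 5 };
    let b = Span { line_start: 1, line_stop: 1, col_start: 8, col_stop: 12 };
    assert_eq!((a + b).to_string(), "1:0-12");
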
--------------------------------------------------------------------------------
/compiler-tools/src/tokenizer.rs:
--------------------------------------------------------------------------------
1 | use std::marker::PhantomData;
2 | 
3 | use crate::span::Spanned;
4 | 
5 | pub trait TokenExt: Clone + Copy + PartialEq {
6 |     fn matches_class(&self, other: &Self) -> bool;
7 | }
8 | 
9 | pub trait TokenParse<'a> {
10 |     type Token: TokenExt + 'a;
11 | 
12 |     fn next(&mut self) -> Option<Spanned<Self::Token>>;
13 | }
14 | 
15 | pub struct TokenizerWrap<'a, T: TokenParse<'a>> {
16 |     inner: T,
17 |     peeked: Option<Spanned<T::Token>>,
18 |     tokens_to_ignore: Vec<T::Token>,
19 |     _lifetime: PhantomData<&'a ()>,
20 | }
21 | 
22 | impl<'a, T: TokenParse<'a>> TokenizerWrap<'a, T> {
23 |     pub fn new(inner: T, tokens_to_ignore: impl IntoIterator<Item = T::Token>) -> Self {
24 |         Self {
25 |             inner,
26 |             tokens_to_ignore: tokens_to_ignore.into_iter().collect(),
27 |             peeked: None,
28 |             _lifetime: PhantomData,
29 |         }
30 |     }
31 | 
32 |     pub fn next(&mut self) -> Option<Spanned<T::Token>> {
33 |         if let Some(peeked) = self.peeked.take() {
34 |             Some(peeked)
35 |         } else {
36 |             loop {
37 |                 let next = self.inner.next()?;
38 |                 if self.tokens_to_ignore.iter().all(|x| !x.matches_class(&*next)) {
39 |                     break Some(next);
40 |                 }
41 |             }
42 |         }
43 |     }
44 | 
45 |     pub fn peek(&mut self) -> Option<&Spanned<T::Token>> {
46 |         if self.peeked.is_none() {
47 |             self.peeked = self.next();
48 |         }
49 |         self.peeked.as_ref()
50 |     }
51 | 
52 |     pub fn eat(&mut self, token: T::Token) -> Option<Spanned<T::Token>> {
53 |         let next = self.next()?;
54 |         if next.matches_class(&token) {
55 |             Some(next)
56 |         } else {
57 |             self.peeked = Some(next);
58 |             None
59 |         }
60 |     }
61 | 
62 |     pub fn eat_any(&mut self, tokens: &[T::Token]) -> Option<Spanned<T::Token>> {
63 |         let next = self.next()?;
64 |         for token in tokens {
65 |             if next.matches_class(token) {
66 |                 return Some(next);
67 |             }
68 |         }
69 |         self.peeked = Some(next);
70 |         None
71 |     }
72 | }
73 | 
--------------------------------------------------------------------------------
/compiler-tools/src/util.rs:
--------------------------------------------------------------------------------
1 | /// Simple parse function for a string token with an arbitrary delimiter
2 | pub fn parse_str<const DELIMITER: char>(input: &str) -> Option<(&str, &str)> {
3 |     if !input.starts_with(DELIMITER) {
4 |         return None;
5 |     }
6 |     let mut escaped = false;
7 |     let mut iter = input.char_indices().skip(1);
8 |     while let Some((i, c)) = iter.next() {
9 |         if escaped {
10 |             escaped = false;
11 |             continue;
12 |         }
13 |         if c == '\\' {
14 |             escaped = true;
15 |             continue;
16 |         }
17 |         if c == DELIMITER {
18 |             if let Some((_, c)) = iter.next() {
19 |                 if c == DELIMITER {
20 |                     continue;
21 |                 }
22 |             }
23 |             return Some((&input[..i + c.len_utf8()], &input[i + c.len_utf8()..]));
24 |         }
25 |     }
26 | 
27 |     None
28 | }
29 | 
--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | edition = "2021"
2 | 
3 | max_width = 160
4 | # rustfmt breaks the send_request macros
5 | attr_fn_like_width = 160
6 | fn_call_width = 120
7 | struct_lit_width = 0
--------------------------------------------------------------------------------
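
Closing the loop on `parse_str` in util.rs above: a hypothetical use with the same single-quote delimiter the integration test supplies via `parse_fn`. A doubled delimiter is treated as an escape and stays inside the token; the function returns the token text and the remaining input:

    assert_eq!(
        compiler_tools::util::parse_str::<'\''>("'it''s' + rest"),
        Some(("'it''s'", " + rest")),
    );
    // A missing closing delimiter yields None:
    assert_eq!(compiler_tools::util::parse_str::<'\''>("'unterminated"), None);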