├── .gitignore ├── fuzz ├── .gitignore ├── Cargo.toml └── src │ └── check.rs ├── examples └── regex.rs ├── src ├── dfa │ ├── mod.rs │ ├── to_tokens │ │ ├── mod.rs │ │ ├── binary_search.rs │ │ └── lookup_table.rs │ └── nfa_to_dfa.rs ├── macro_input.rs ├── lib.rs ├── character.rs └── nfa │ ├── repetition.rs │ └── mod.rs ├── Cargo.toml ├── tests ├── class.rs ├── regex.rs ├── standard.rs └── repetition.rs ├── LICENSE ├── .github └── workflows │ └── ci.yml ├── benches └── compare.rs └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | hfuzz_target 2 | hfuzz_workspace 3 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fuzz" 3 | version = "0.1.0" 4 | authors = ["LinkTed "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | regex = "~1.4.6" 9 | honggfuzz = "~0.5.54" 10 | 11 | [dependencies.proc-macro-regex] 12 | path = ".." 
13 | 14 | [[bin]] 15 | name = "check" 16 | path = "src/check.rs" 17 | test = false 18 | doc = false 19 | -------------------------------------------------------------------------------- /examples/regex.rs: -------------------------------------------------------------------------------- 1 | use proc_macro_regex::regex; 2 | 3 | regex!(example_1 "abc"); 4 | regex!(example_2 "abc" 256); 5 | regex!(pub example_3 "abc"); 6 | regex!(example_4 b"abc"); 7 | 8 | fn main() { 9 | println!("example_1 == {}", example_1("abc")); 10 | println!("example_2 == {}", example_2("abc")); 11 | println!("example_3 == {}", example_3("abc")); 12 | println!("example_4 == {}", example_4(b"abc")); 13 | } 14 | -------------------------------------------------------------------------------- /src/dfa/mod.rs: -------------------------------------------------------------------------------- 1 | mod nfa_to_dfa; 2 | mod to_tokens; 3 | 4 | pub(super) use crate::dfa::to_tokens::DfaToTokens; 5 | use crate::{character::Character, dfa::nfa_to_dfa::NfaToDfaIter, nfa::Nfa}; 6 | use std::{ 7 | collections::{BTreeMap, BTreeSet}, 8 | convert::From, 9 | fmt::Debug, 10 | }; 11 | 12 | #[derive(Debug)] 13 | pub(crate) struct Dfa 14 | where 15 | T: Character, 16 | { 17 | states: BTreeSet, 18 | transitions: BTreeMap>, 19 | accept_states: BTreeSet, 20 | start_text: bool, 21 | end_text: bool, 22 | } 23 | 24 | impl From> for Dfa 25 | where 26 | T: Character + Copy, 27 | { 28 | fn from(nfa: Nfa) -> Self { 29 | let nfa_to_dfa = NfaToDfaIter::new(nfa); 30 | Dfa::from(nfa_to_dfa) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "proc-macro-regex" 3 | version = "1.1.0" 4 | authors = ["LinkTed "] 5 | license = "BSD-3-Clause" 6 | readme = "README.md" 7 | description = "A proc macro regex library" 8 | keywords = ["regex", "proc-marco"] 9 | edition = "2021" 10 | 
include = [ 11 | "src/**/*.rs", 12 | "tests/*.rs", 13 | "examples/*.rs", 14 | "Cargo.toml", 15 | "README.md", 16 | "LICENSE", 17 | ] 18 | repository = "https://github.com/LinkTed/proc-macro-regex" 19 | categories = ["text-processing"] 20 | 21 | [lib] 22 | proc-macro = true 23 | 24 | [dependencies] 25 | regex-syntax = "~0.6.26" 26 | proc-macro2 = "~1.0.36" 27 | quote = "~1.0.18" 28 | thiserror = "~1.0.31" 29 | 30 | [dependencies.syn] 31 | version = "~1.0.96" 32 | features = ["extra-traits"] 33 | 34 | [dev-dependencies] 35 | criterion = "~0.3.5" 36 | regex = "~1.5.6" 37 | 38 | [[bench]] 39 | name = "compare" 40 | harness = false 41 | path = "benches/compare.rs" 42 | -------------------------------------------------------------------------------- /tests/class.rs: -------------------------------------------------------------------------------- 1 | use proc_macro_regex::regex; 2 | 3 | #[test] 4 | fn character_class_regex() { 5 | regex!(character_class "[xyz]"); 6 | assert!(character_class("x")); 7 | assert!(!character_class("a")); 8 | } 9 | 10 | #[test] 11 | fn character_class_except_regex() { 12 | regex!(character_class_except b"[^x]"); 13 | assert!(character_class_except(b"a")); 14 | assert!(!character_class_except(b"x")); 15 | } 16 | 17 | #[test] 18 | fn character_class_range_regex() { 19 | regex!(character_class_range "[a-c]"); 20 | assert!(character_class_range("a")); 21 | assert!(character_class_range("c")); 22 | assert!(!character_class_range("x")); 23 | } 24 | 25 | #[test] 26 | fn character_class_alpha_regex() { 27 | regex!(character_class_alpha "[[:alpha:]]"); 28 | assert!(character_class_alpha("a")); 29 | assert!(character_class_alpha("Z")); 30 | assert!(!character_class_alpha("1")); 31 | } 32 | 33 | #[test] 34 | fn character_class_nested_regex() { 35 | regex!(character_class_nested b"[x[^xyz]]"); 36 | assert!(character_class_nested(b"x")); 37 | assert!(!character_class_nested(b"y")); 38 | } 39 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, LinkTed 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | rustfmt: 14 | name: Job rustfmt 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Install toolchain with rustfmt 18 | uses: actions-rs/toolchain@v1 19 | with: 20 | toolchain: stable 21 | components: rustfmt 22 | - uses: actions/checkout@v2 23 | - name: Run rustfmt 24 | run: cargo fmt --all -- --check 25 | 26 | audit: 27 | name: Job audit 28 | runs-on: ubuntu-latest 29 | steps: 30 | - uses: actions/checkout@v1 31 | - name: Run audit 32 | uses: actions-rs/audit-check@v1 33 | with: 34 | token: ${{ secrets.GITHUB_TOKEN }} 35 | 36 | clippy: 37 | name: Job clippy 38 | needs: rustfmt 39 | runs-on: ubuntu-latest 40 | steps: 41 | - name: Install toolchain with clippy 42 | uses: actions-rs/toolchain@v1 43 | with: 44 | toolchain: stable 45 | components: clippy 46 | - uses: actions/checkout@v2 47 | - name: Run clippy 48 | uses: actions-rs/clippy-check@v1 49 | with: 50 | token: ${{ secrets.GITHUB_TOKEN }} 51 | args: --all-features --all-targets -- --deny warnings -A clippy::unknown-clippy-lints 52 | 53 | tests: 54 | name: Job tests 55 | needs: clippy 56 | strategy: 57 | matrix: 58 | os: [ubuntu-latest, macos-latest, windows-latest] 59 | rust_channel: [stable, nightly] 60 | runs-on: ${{ matrix.os }} 61 | steps: 62 | - name: Install toolchain ${{ matrix.rust_channel }} on ${{ matrix.os }} 63 | uses: actions-rs/toolchain@v1 64 | with: 65 | toolchain: ${{ matrix.rust_channel }} 66 | - uses: actions/checkout@v2 67 | - name: Run cargo test 68 | uses: actions-rs/cargo@v1 69 | with: 70 | command: test 71 | args: --all-features 72 | 
-------------------------------------------------------------------------------- /tests/regex.rs: -------------------------------------------------------------------------------- 1 | use proc_macro_regex::regex; 2 | 3 | #[test] 4 | fn ipv4_regex() { 5 | // source https://stackoverflow.com/questions/53497/regular-expression-that-matches-valid-ipv6-addresses 6 | regex!(ipv4 r"^((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])$"); 7 | assert!(ipv4("127.0.0.1")); 8 | assert!(!ipv4("127.0.0.256")); 9 | } 10 | 11 | #[test] 12 | fn ipv6_regex() { 13 | // source https://stackoverflow.com/questions/53497/regular-expression-that-matches-valid-ipv6-addresses 14 | regex!(ipv6 r"^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))$" 1048576); 15 | assert!(ipv6("fe80::1ff:fe23:4567:890a")); 16 | assert!(!ipv6("fe80::1ff::fe23:4567:890a")); 17 | } 18 | 19 | #[test] 20 | fn ipv6_pattern_regex() { 21 | // source https://stackoverflow.com/questions/53497/regular-expression-that-matches-valid-ipv6-addresses 22 | regex!(ipv6 
r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))"); 23 | assert!(ipv6("Pattern fe80::1ff:fe23:4567:890a")); 24 | } 25 | 26 | #[test] 27 | fn email_regex() { 28 | regex!(email "^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$" 0); 29 | assert!(email("example@example.org")); 30 | assert!(!email("example@example@org")); 31 | } 32 | 33 | #[test] 34 | fn url_http() { 35 | // source https://gist.github.com/jacksonfdam/3000275 36 | regex!(url_http r"^http(s)?://(([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)|(([0-9A-Za-z-]+\.)+([a-z,A-Z][0-9A-Za-z_-]*)))(:[1-9][0-9]*)?(/([0-9A-Za-z_./:%+@&=-]+[0-9A-Za-z_ ./?:%+@&=-]*)?)?(#([\t\n\v\f\r ]*))?$"); 37 | assert!(url_http("http://127.0.0.1/page?param=value")); 38 | assert!(url_http("http://www.example.org/page?param=value")); 39 | assert!(!url_http("htt://www.example.org/")); 40 | } 41 | -------------------------------------------------------------------------------- /src/macro_input.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::Span; 2 | use syn::{ 3 | parse::{Parse, ParseStream, Result as ParseResult}, 4 | spanned::Spanned, 5 | Ident, LitByteStr, LitInt, LitStr, Visibility, 6 | }; 7 | 8 | const DEFAULT_LIMIT: usize = 65536; 9 | 10 | pub enum Regex { 11 | LitStr(LitStr), 12 | LitByteStr(LitByteStr), 13 | } 14 | 15 | impl Regex { 16 | fn is_str(&self) -> bool { 17 | match self { 
18 | Regex::LitStr(_) => true, 19 | Regex::LitByteStr(_) => false, 20 | } 21 | } 22 | 23 | fn get_regex(&self) -> String { 24 | match self { 25 | Regex::LitStr(lit_str) => lit_str.value(), 26 | Regex::LitByteStr(lit_byte_str) => { 27 | let bytes = lit_byte_str.value(); 28 | String::from_utf8(bytes).unwrap() 29 | } 30 | } 31 | } 32 | } 33 | 34 | impl Parse for Regex { 35 | fn parse(input: ParseStream) -> ParseResult { 36 | let lookahead = input.lookahead1(); 37 | let result = if lookahead.peek(LitStr) { 38 | Regex::LitStr(input.parse()?) 39 | } else { 40 | Regex::LitByteStr(input.parse()?) 41 | }; 42 | Ok(result) 43 | } 44 | } 45 | 46 | impl Spanned for Regex { 47 | fn span(&self) -> proc_macro2::Span { 48 | match self { 49 | Regex::LitStr(lit_str) => lit_str.span(), 50 | Regex::LitByteStr(lit_byte_str) => lit_byte_str.span(), 51 | } 52 | } 53 | } 54 | 55 | pub struct MacroInput { 56 | visibility: Visibility, 57 | name: Ident, 58 | regex: Regex, 59 | threshold: usize, 60 | } 61 | 62 | impl Parse for MacroInput { 63 | fn parse(input: ParseStream) -> ParseResult { 64 | let visibility: Visibility = input.parse()?; 65 | let name: Ident = input.parse()?; 66 | let regex = input.parse()?; 67 | let lookahead = input.lookahead1(); 68 | let threshold = if lookahead.peek(LitInt) { 69 | let threshold: LitInt = input.parse()?; 70 | threshold.base10_parse()? 
71 | } else { 72 | DEFAULT_LIMIT 73 | }; 74 | Ok(MacroInput { 75 | visibility, 76 | name, 77 | regex, 78 | threshold, 79 | }) 80 | } 81 | } 82 | 83 | impl MacroInput { 84 | pub fn is_str(&self) -> bool { 85 | self.regex.is_str() 86 | } 87 | 88 | pub fn get_regex(&self) -> String { 89 | self.regex.get_regex() 90 | } 91 | 92 | pub fn get_regex_span(&self) -> Span { 93 | self.regex.span() 94 | } 95 | 96 | pub fn get_name(&self) -> &Ident { 97 | &self.name 98 | } 99 | 100 | pub fn get_visibility(&self) -> &Visibility { 101 | &self.visibility 102 | } 103 | 104 | pub fn get_threshold(&self) -> usize { 105 | self.threshold 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /fuzz/src/check.rs: -------------------------------------------------------------------------------- 1 | use honggfuzz::fuzz; 2 | use proc_macro_regex::regex; 3 | use regex::{Regex, RegexBuilder}; 4 | 5 | fn build_regex(regex: &str) -> Regex { 6 | RegexBuilder::new(regex).unicode(false).build().unwrap() 7 | } 8 | 9 | fn check(string: &str, regex: &Regex, proc_macro_regex: fn(&str) -> bool) { 10 | let result_regex = regex.is_match(string); 11 | let result_proc_macro_regex = proc_macro_regex(string); 12 | if result_regex != result_proc_macro_regex { 13 | panic!( 14 | "{} != {}: {}", 15 | result_regex, result_proc_macro_regex, string 16 | ); 17 | } 18 | } 19 | 20 | fn main() { 21 | let regex_email = build_regex(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"); 22 | regex!(proc_macro_regex_email r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"); 23 | 24 | let regex_url = build_regex( 25 | r"^http(s)?://((\d+\.\d+\.\d+\.\d+)|(([\w-]+\.)+([a-z,A-Z][\w-]*)))(:[1-9][0-9]*)?(/([\w./:%+@&=-]+[\w ./?:%+@&=-]*)?)?(#(\s*))?$", 26 | ); 27 | regex!(proc_macro_regex_url r"^http(s)?://((\d+\.\d+\.\d+\.\d+)|(([\w-]+\.)+([a-z,A-Z][\w-]*)))(:[1-9][0-9]*)?(/([\w./:%+@&=-]+[\w ./?:%+@&=-]*)?)?(#(\s*))?$"); 28 | 29 | let regex_ipv6 = build_regex( 30 | 
r"^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))$", 31 | ); 32 | regex!(proc_macro_regex_ipv6 r"^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))$"); 33 | 34 | let regex_test = build_regex("^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4})$"); 35 | regex!(proc_macro_regex_test "^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4})$"); 36 | 37 | loop { 38 | fuzz!(|string: &str| { 39 | check(string, ®ex_email, proc_macro_regex_email); 40 | check(string, ®ex_url, proc_macro_regex_url); 41 | check(string, ®ex_ipv6, proc_macro_regex_ipv6); 42 | check(string, ®ex_test, proc_macro_regex_test); 43 | }); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/lib.rs: 
-------------------------------------------------------------------------------- 1 | mod character; 2 | mod dfa; 3 | mod macro_input; 4 | mod nfa; 5 | 6 | use crate::{ 7 | dfa::{Dfa, DfaToTokens}, 8 | macro_input::MacroInput, 9 | nfa::Nfa, 10 | }; 11 | use proc_macro::TokenStream; 12 | use quote::quote; 13 | use syn::parse_macro_input; 14 | 15 | /// The macro creates a function which returns `true` if the argument matches the regex. 16 | /// 17 | /// If the first argument is an identifier (name), then this is the name of the function, which 18 | /// would be generated. Example: 19 | /// ```rust 20 | /// use proc_macro_regex::regex; 21 | /// 22 | /// regex!(the_name_of_the_function "the regex to check"); 23 | /// ``` 24 | /// 25 | /// Alternative, if the first argument is a visibility keyword, then this is the visibility of the 26 | /// function. Otherwise, the function is private. Example: 27 | /// ```rust 28 | /// # use proc_macro_regex::regex; 29 | /// regex!(pub public_function "the function is public"); 30 | /// regex!(private_function "the function is private"); 31 | /// ``` 32 | /// 33 | /// The next argument is a string of the regex, which the function should check. Alternative, a 34 | /// byte string can be given, if the input should be a byte array (`&[u8]`). otherwise a string is 35 | /// taken. 36 | /// ```rust 37 | /// # use proc_macro_regex::regex; 38 | /// regex!(string_function "This function takes a string"); 39 | /// regex!(bytes_function "This function takes a byte array"); 40 | /// ``` 41 | /// 42 | /// At the end, a positive number can be given to set the limit of the lookup table 43 | /// (see `README.md`). 44 | /// ```rust 45 | /// # use proc_macro_regex::regex; 46 | /// regex!(limit_function "The limit is set to 100 bytes" 100); 47 | /// ``` 48 | /// 49 | /// # Syntax 50 | /// The given regex works the same as in the [regex](https://crates.io/crates/regex) crate. 
51 | /// * If the `^` is at the beginning of the regex, then it is checked if the input is match at the 52 | /// beginning of the text. 53 | /// * If the `$` is at the end, then it is checked if the input is match at the end of the text. 54 | /// * If both are present then the whole input is checked. 55 | /// * Otherwise, is check if the string contains the regex. 56 | #[proc_macro] 57 | pub fn regex(input: TokenStream) -> TokenStream { 58 | let input = parse_macro_input!(input as MacroInput); 59 | let visibility = input.get_visibility(); 60 | let name = input.get_name(); 61 | let threshold = input.get_threshold(); 62 | let (argument_type, body) = if input.is_str() { 63 | let nfa = Nfa::::try_from(&input).unwrap(); 64 | let dfa = Dfa::from(nfa); 65 | ( 66 | quote! { 67 | str 68 | }, 69 | DfaToTokens::new(dfa, threshold).get_token_streams(), 70 | ) 71 | } else { 72 | let nfa = Nfa::::try_from(&input).unwrap(); 73 | let dfa = Dfa::from(nfa); 74 | ( 75 | quote! { 76 | [u8] 77 | }, 78 | DfaToTokens::new(dfa, threshold).get_token_streams(), 79 | ) 80 | }; 81 | let function = quote! 
{ 82 | #visibility fn #name(s: &#argument_type) -> bool { 83 | #body 84 | } 85 | }; 86 | function.into() 87 | } 88 | -------------------------------------------------------------------------------- /src/dfa/to_tokens/mod.rs: -------------------------------------------------------------------------------- 1 | mod binary_search; 2 | mod lookup_table; 3 | 4 | use crate::{character::Character, dfa::Dfa, nfa::START_STATE}; 5 | use proc_macro2::{Span, TokenStream}; 6 | use quote::{quote, ToTokens}; 7 | use std::collections::BTreeSet; 8 | use syn::LitInt; 9 | 10 | fn usize_to_lit_int(i: usize) -> LitInt { 11 | let s = format!("{}", i); 12 | LitInt::new(&s, Span::call_site()) 13 | } 14 | 15 | pub(crate) struct DfaToTokens 16 | where 17 | T: Character, 18 | { 19 | dfa: Dfa, 20 | threshold: usize, 21 | required_states: BTreeSet, 22 | is_byte: bool, 23 | } 24 | 25 | impl DfaToTokens 26 | where 27 | T: Character, 28 | { 29 | /// If `self.end_text` is `true` then only no accept-states have to be implemented. 30 | /// Because if the state machine reaches an accept-state, then it stops. 
31 | fn get_required_states(dfa: &Dfa) -> BTreeSet { 32 | if dfa.end_text { 33 | dfa.states.clone() 34 | } else { 35 | let mut required_states = BTreeSet::new(); 36 | for state in dfa.states.iter() { 37 | if !dfa.accept_states.contains(state) { 38 | required_states.insert(*state); 39 | } 40 | } 41 | required_states 42 | } 43 | } 44 | 45 | fn is_byte(dfa: &Dfa) -> bool { 46 | for (_, transitions) in dfa.transitions.iter() { 47 | for (ch, _) in transitions.iter() { 48 | if !ch.is_byte() { 49 | return false; 50 | } 51 | } 52 | } 53 | true 54 | } 55 | 56 | fn returns_true(&self) -> bool { 57 | if self.dfa.accept_states.contains(&START_STATE) { 58 | if self.dfa.end_text { 59 | !self.dfa.start_text && self.dfa.states.len() == 1 60 | } else { 61 | true 62 | } 63 | } else { 64 | false 65 | } 66 | } 67 | } 68 | 69 | impl DfaToTokens 70 | where 71 | T: Character + ToTokens + Copy + Into, 72 | { 73 | pub(crate) fn new(dfa: Dfa, threshold: usize) -> DfaToTokens { 74 | let required_states = DfaToTokens::get_required_states(&dfa); 75 | let is_byte = DfaToTokens::is_byte(&dfa); 76 | DfaToTokens { 77 | dfa, 78 | required_states, 79 | threshold, 80 | is_byte, 81 | } 82 | } 83 | 84 | fn last_check(&self) -> TokenStream { 85 | if self.dfa.end_text { 86 | let accept_states: Vec = self 87 | .dfa 88 | .accept_states 89 | .iter() 90 | .map(|u| usize_to_lit_int(*u)) 91 | .collect(); 92 | quote! { 93 | match state { 94 | #(#accept_states => true,)* 95 | _ => false, 96 | } 97 | } 98 | } else { 99 | quote! { 100 | false 101 | } 102 | } 103 | } 104 | 105 | fn for_each(&self) -> TokenStream { 106 | if let Some(for_each_lookup_table) = self.for_each_lookup_table() { 107 | for_each_lookup_table 108 | } else { 109 | self.for_each_binary_search() 110 | } 111 | } 112 | 113 | pub fn get_token_streams(&self) -> TokenStream { 114 | if self.returns_true() { 115 | quote! {true} 116 | } else { 117 | let for_each = self.for_each(); 118 | let last_check = self.last_check(); 119 | quote! 
{ 120 | #for_each 121 | 122 | #last_check 123 | } 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /benches/compare.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; 2 | use proc_macro_regex::regex; 3 | use regex::Regex; 4 | 5 | const INPUT_EMAIL: &str = "example@example.org"; 6 | const INPUT_URL: &str = "https://www.example.org/page?param=value"; 7 | const INPUT_IPV6: &str = "fe80::1ff:fe23:4567:890a"; 8 | 9 | fn regex(c: &mut Criterion) { 10 | let mut group = c.benchmark_group("regex"); 11 | 12 | let regex_email = Regex::new("^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$").unwrap(); 13 | let throughput = Throughput::Bytes(INPUT_EMAIL.len() as u64); 14 | let benchmark_id = BenchmarkId::new("email", INPUT_EMAIL.len()); 15 | group.throughput(throughput); 16 | group.bench_with_input(benchmark_id, INPUT_EMAIL, |b, input| { 17 | b.iter(|| regex_email.is_match(input)) 18 | }); 19 | 20 | let regex_url = Regex::new( 21 | r"^http(s)?://(([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)|(([0-9A-Za-z-]+\.)+([a-z,A-Z][0-9A-Za-z_-]*)))(:[1-9][0-9]*)?(/([0-9A-Za-z_./:%+@&=-]+[0-9A-Za-z_ ./?:%+@&=-]*)?)?(#([\t\n\v\f\r ]*))?$", 22 | ).unwrap(); 23 | let throughput = Throughput::Bytes(INPUT_URL.len() as u64); 24 | let benchmark_id = BenchmarkId::new("url", INPUT_URL.len()); 25 | group.throughput(throughput); 26 | group.bench_with_input(benchmark_id, INPUT_URL, |b, input| { 27 | b.iter(|| regex_url.is_match(input)) 28 | }); 29 | 30 | let regex_ipv6 = Regex::new( 31 | 
r"^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))$", 32 | ).unwrap(); 33 | let throughput = Throughput::Bytes(INPUT_IPV6.len() as u64); 34 | let benchmark_id = BenchmarkId::new("ipv6", INPUT_IPV6.len()); 35 | group.throughput(throughput); 36 | group.bench_with_input(benchmark_id, INPUT_IPV6, |b, input| { 37 | b.iter(|| regex_ipv6.is_match(input)) 38 | }); 39 | } 40 | 41 | fn proc_macro_regex(c: &mut Criterion) { 42 | let mut group = c.benchmark_group("proc-macro-regex"); 43 | 44 | regex!(regex_email "^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$"); 45 | let throughput = Throughput::Bytes(INPUT_EMAIL.len() as u64); 46 | let benchmark_id = BenchmarkId::new("email", INPUT_EMAIL.len()); 47 | group.throughput(throughput); 48 | group.bench_with_input(benchmark_id, INPUT_EMAIL, |b, input| { 49 | b.iter(|| regex_email(input)) 50 | }); 51 | 52 | regex!(regex_url r"^http(s)?://(([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)|(([0-9A-Za-z-]+\.)+([a-z,A-Z][0-9A-Za-z_-]*)))(:[1-9][0-9]*)?(/([0-9A-Za-z_./:%+@&=-]+[0-9A-Za-z_ ./?:%+@&=-]*)?)?(#([\t\n\v\f\r ]*))?$"); 53 | let throughput = Throughput::Bytes(INPUT_URL.len() as u64); 54 | let benchmark_id = BenchmarkId::new("url", INPUT_URL.len()); 55 | group.throughput(throughput); 56 | group.bench_with_input(benchmark_id, INPUT_URL, |b, input| { 57 | b.iter(|| regex_url(input)) 58 | }); 59 | 60 | regex!(regex_ipv6 
r"^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))$" 1048576); 61 | let throughput = Throughput::Bytes(INPUT_IPV6.len() as u64); 62 | let benchmark_id = BenchmarkId::new("ipv6", INPUT_IPV6.len()); 63 | group.throughput(throughput); 64 | group.bench_with_input(benchmark_id, INPUT_IPV6, |b, input| { 65 | b.iter(|| regex_ipv6(input)) 66 | }); 67 | } 68 | 69 | criterion_group! { 70 | name = benches; 71 | config = Criterion::default(); 72 | targets = regex, proc_macro_regex 73 | } 74 | criterion_main!(benches); 75 | -------------------------------------------------------------------------------- /tests/standard.rs: -------------------------------------------------------------------------------- 1 | use proc_macro_regex::regex; 2 | 3 | #[test] 4 | fn empty() { 5 | regex!(regex ""); 6 | assert!(regex("")); 7 | assert!(regex("a")); 8 | } 9 | 10 | #[test] 11 | fn literal_1() { 12 | regex!(regex "a"); 13 | assert!(!regex("")); 14 | assert!(regex("a")); 15 | assert!(regex("ab")); 16 | assert!(regex("ba")); 17 | } 18 | 19 | #[test] 20 | fn literal_2() { 21 | regex!(regex "^a"); 22 | assert!(!regex("")); 23 | assert!(regex("a")); 24 | assert!(regex("ab")); 25 | assert!(!regex("ba")); 26 | } 27 | 28 | #[test] 29 | fn literal_3() { 30 | regex!(regex "a$"); 31 | assert!(!regex("")); 32 | assert!(regex("a")); 33 | assert!(!regex("ab")); 34 | assert!(regex("ba")); 35 | } 36 | 37 | 
#[test] 38 | fn literal_4() { 39 | regex!(regex "^a$"); 40 | assert!(!regex("")); 41 | assert!(regex("a")); 42 | assert!(!regex("ab")); 43 | assert!(!regex("ba")); 44 | } 45 | 46 | #[test] 47 | fn class_1() { 48 | regex!(regex "[ab]"); 49 | assert!(!regex("")); 50 | assert!(regex("a")); 51 | assert!(regex("ab")); 52 | assert!(regex("abc")); 53 | assert!(regex("ba")); 54 | assert!(regex("cba")); 55 | assert!(regex("b")); 56 | assert!(!regex("c")); 57 | } 58 | 59 | #[test] 60 | fn class_2() { 61 | regex!(regex "^[ab]"); 62 | assert!(!regex("")); 63 | assert!(regex("a")); 64 | assert!(regex("ab")); 65 | assert!(regex("abc")); 66 | assert!(regex("ba")); 67 | assert!(!regex("cba")); 68 | assert!(regex("b")); 69 | assert!(!regex("c")); 70 | } 71 | 72 | #[test] 73 | fn class_3() { 74 | regex!(regex "[ab]$"); 75 | assert!(!regex("")); 76 | assert!(regex("a")); 77 | assert!(regex("ab")); 78 | assert!(!regex("abc")); 79 | assert!(regex("ba")); 80 | assert!(regex("cba")); 81 | assert!(regex("b")); 82 | assert!(!regex("c")); 83 | } 84 | 85 | #[test] 86 | fn class_4() { 87 | regex!(regex "^[ab]$"); 88 | assert!(!regex("")); 89 | assert!(regex("a")); 90 | assert!(!regex("ab")); 91 | assert!(!regex("abc")); 92 | assert!(!regex("ba")); 93 | assert!(!regex("cba")); 94 | assert!(regex("b")); 95 | assert!(!regex("c")); 96 | } 97 | 98 | #[test] 99 | fn alternation_1() { 100 | regex!(regex "ab|cb"); 101 | assert!(!regex("")); 102 | assert!(!regex("a")); 103 | assert!(regex("ab")); 104 | assert!(regex("cb")); 105 | assert!(regex("abc")); 106 | assert!(regex("cab")); 107 | assert!(!regex("ba")); 108 | assert!(regex("cba")); 109 | assert!(!regex("b")); 110 | assert!(!regex("c")); 111 | } 112 | 113 | #[test] 114 | fn alternation_2() { 115 | regex!(regex "^(ab|cb)"); 116 | assert!(!regex("")); 117 | assert!(!regex("a")); 118 | assert!(regex("ab")); 119 | assert!(regex("cb")); 120 | assert!(regex("abc")); 121 | assert!(!regex("cab")); 122 | assert!(!regex("ba")); 123 | assert!(regex("cba")); 
124 | assert!(!regex("b")); 125 | assert!(!regex("c")); 126 | } 127 | 128 | #[test] 129 | fn alternation_3() { 130 | regex!(regex "(ab|cb)$"); 131 | assert!(!regex("")); 132 | assert!(!regex("a")); 133 | assert!(regex("ab")); 134 | assert!(regex("cb")); 135 | assert!(!regex("abc")); 136 | assert!(regex("cab")); 137 | assert!(!regex("ba")); 138 | assert!(!regex("cba")); 139 | assert!(!regex("b")); 140 | assert!(!regex("c")); 141 | } 142 | 143 | #[test] 144 | fn alternation_4() { 145 | regex!(regex "^(ab|cb)$"); 146 | assert!(!regex("")); 147 | assert!(!regex("a")); 148 | assert!(regex("ab")); 149 | assert!(regex("cb")); 150 | assert!(!regex("abc")); 151 | assert!(!regex("cab")); 152 | assert!(!regex("ba")); 153 | assert!(!regex("cba")); 154 | assert!(!regex("b")); 155 | assert!(!regex("c")); 156 | } 157 | 158 | #[test] 159 | fn concat_1() { 160 | regex!(regex "ab"); 161 | assert!(!regex("")); 162 | assert!(!regex("a")); 163 | assert!(regex("ab")); 164 | assert!(regex("abc")); 165 | assert!(regex("cab")); 166 | assert!(!regex("ba")); 167 | assert!(!regex("cba")); 168 | assert!(!regex("b")); 169 | assert!(!regex("c")); 170 | } 171 | 172 | #[test] 173 | fn concat_2() { 174 | regex!(regex "^ab"); 175 | assert!(!regex("")); 176 | assert!(!regex("a")); 177 | assert!(regex("ab")); 178 | assert!(regex("abc")); 179 | assert!(!regex("cab")); 180 | assert!(!regex("ba")); 181 | assert!(!regex("cba")); 182 | assert!(!regex("b")); 183 | assert!(!regex("c")); 184 | } 185 | 186 | #[test] 187 | fn concat_3() { 188 | regex!(regex "ab$"); 189 | assert!(!regex("")); 190 | assert!(!regex("a")); 191 | assert!(regex("ab")); 192 | assert!(!regex("abc")); 193 | assert!(regex("cab")); 194 | assert!(!regex("ba")); 195 | assert!(!regex("cba")); 196 | assert!(!regex("b")); 197 | assert!(!regex("c")); 198 | } 199 | 200 | #[test] 201 | fn concat_4() { 202 | regex!(regex "^ab$"); 203 | assert!(!regex("")); 204 | assert!(!regex("a")); 205 | assert!(regex("ab")); 206 | assert!(!regex("abc")); 207 | 
assert!(!regex("cab")); 208 | assert!(!regex("ba")); 209 | assert!(!regex("cba")); 210 | assert!(!regex("b")); 211 | assert!(!regex("c")); 212 | } 213 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # proc-macro-regex 2 | A proc macro regex library to match an arbitrary string or byte array to a regular expression. 3 | [![Build status](https://github.com/LinkTed/proc-macro-regex/workflows/Continuous%20Integration/badge.svg)](https://github.com/LinkTed/proc-macro-regex/actions?query=workflow%3A%22Continuous+Integration%22) 4 | [![Latest version](https://img.shields.io/crates/v/proc-macro-regex.svg)](https://crates.io/crates/proc-macro-regex) 5 | [![Dependency status](https://deps.rs/repo/github/linkted/proc-macro-regex/status.svg)](https://deps.rs/repo/github/linkted/proc-macro-regex) 6 | [![License](https://img.shields.io/crates/l/proc-macro-regex.svg)](https://opensource.org/licenses/BSD-3-Clause) 7 | 8 | ## Usage 9 | Add this to your `Cargo.toml`: 10 | ```toml 11 | [dependencies] 12 | proc-macro-regex = "~1.1.0" 13 | ``` 14 | 15 | ## Example 16 | The macro `regex!` creates a function of the given name which takes a string or byte array and 17 | returns `true` if the argument matches the regex, otherwise `false`. 18 | ```rust 19 | use proc_macro_regex::regex; 20 | 21 | /// Create the function with the signature: 22 | /// fn regex_email(s: &str) -> bool; 23 | regex!(regex_email "^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$"); 24 | 25 | fn main () { 26 | println!("Returns true == {}", regex_email("example@example.org")); 27 | println!("Returns false == {}", regex_email("example.example.org")); 28 | } 29 | ``` 30 | 31 | The given regex works the same as in the [regex](https://crates.io/crates/regex) crate. 
If the `^` 32 | is at the beginning of the regex and `$` at the end, then the whole string is checked; otherwise it is 33 | checked whether the string contains the regex. 34 | 35 | ## How it works 36 | The macro creates a *deterministic finite automaton* (DFA), which parses the given input. 37 | Depending on the size of the DFA or the characters of the regex, a lookup table or a code-based 38 | implementation (binary search) is generated. If the size of the lookup table would be bigger than 39 | 65536 bytes (can be changed) then a code-based implementation (binary search) is used. Additionally, 40 | if the regex contains any Unicode (non-ASCII) character then a code-based implementation 41 | (binary search) is used, too. 42 | 43 | The following macro generates the following code: 44 | ```rust 45 | regex!(example_1 "abc"); 46 | ``` 47 | Generates: 48 | ```rust 49 | fn example_1(s: &str) -> bool { 50 | static TABLE: [[u8; 256]; 3usize] = [ ... ]; 51 | let mut state = 0; 52 | for c in s.bytes() { 53 | state = TABLE[state as usize][c as usize]; 54 | if state == u8::MAX { 55 | return true; 56 | } 57 | } 58 | false 59 | } 60 | ``` 61 | 62 | To tell the macro that the lookup table is not allowed to be bigger than 256 bytes, a third 63 | argument can be given. As a result, a code-based implementation (binary search) of the DFA is 64 | generated. 65 | ```rust 66 | regex!(example_2 "abc" 256); 67 | ``` 68 | Generates: 69 | ```rust 70 | fn example_2(s: &str) -> bool { 71 | let mut state = 0; 72 | for c in s.bytes() { 73 | state = if state < 1usize { 74 | match c { 75 | 97u8 => 1usize, 76 | _ => 0usize, 77 | } 78 | } else { 79 | if state == 1usize { 80 | match c { 81 | 97u8 => 1usize, 82 | 98u8 => 2usize, 83 | _ => 0usize, 84 | } 85 | } else { 86 | match c { 87 | 97u8 => 1usize, 88 | 99u8 => return true, 89 | _ => 0usize, 90 | } 91 | } 92 | }; 93 | } 94 | false 95 | } 96 | ``` 97 | 98 | To change the visibility of the function, add the keywords at the beginning of the arguments.
99 | ```rust 100 | regex!(pub example_3 "abc"); 101 | ``` 102 | Generates: 103 | ```rust 104 | pub fn example_3(s: &str) -> bool { 105 | // same as in example_1 (see above) 106 | } 107 | ``` 108 | 109 | To parse a byte array instead of a string, pass a byte string. 110 | ```rust 111 | regex!(example_4 b"abc"); 112 | ``` 113 | Generates: 114 | ```rust 115 | fn example_4(s: &[u8]) -> bool { 116 | // same as in example_1 (see above) 117 | } 118 | ``` 119 | 120 | The generated code should work with `#![no_std]`, too. 121 | 122 | ## proc-macro-regex vs regex 123 | Advantages: 124 | * Compile-time (no runtime initialization, no lazy-static) 125 | * Generated code that does not contain any dependencies 126 | * No heap allocation 127 | * Approximately 12%-68% faster for non-trivial regexes [^1] 128 | 129 | [^1]: It was tested against the regex crate in `benches/compare.rs`. For pattern/word matching it is slower 130 | because the [regex](https://crates.io/crates/regex) library uses 131 | [aho-corasick](https://crates.io/crates/aho-corasick/). (See Performance) 132 | 133 | Disadvantages: 134 | * Currently, no group captures 135 | * No runtime regex generation 136 | 137 | ### Performance 138 | This is the performance comparison between this crate and the regex crate. If you want to test it 139 | yourself, run `cargo bench --bench compare`. 140 | 141 | | Name | `proc-macro-regex` | `regex` | Result | 142 | |--------|--------------:|-------------:|--------:| 143 | | E-Mail | 743.95 MiB/s | 441.67 MiB/s | 68.44 % | 144 | | URL | 584.62 MiB/s | 519.00 MiB/s | 12.64 % | 145 | | IPv6 | 746.92 MiB/s | 473.38 MiB/s | 57.78 % | 146 | 147 | This was compiled with `rustc 1.53.0-nightly (392ba2ba1 2021-04-17)`. 148 | 149 | ## License 150 | This project is licensed under the [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause) 151 | license.
152 | 153 | ### Contribution 154 | Any contribution intentionally submitted for inclusion in `proc-macro-regex` by you, shall 155 | be licensed as [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause), without any additional 156 | terms or conditions. 157 | -------------------------------------------------------------------------------- /src/character.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::{Ident, Span, TokenStream}; 2 | use quote::quote; 3 | use regex_syntax::hir::{Class, ClassBytes, ClassUnicode, Literal}; 4 | use std::collections::BTreeSet; 5 | use thiserror::Error; 6 | 7 | fn to_byte(c: char) -> CharacterResult { 8 | if c.len_utf8() == 1 { 9 | let mut bytes = [0; 1]; 10 | c.encode_utf8(&mut bytes); 11 | Ok(bytes[0]) 12 | } else { 13 | Err(CharacterError::Unicode(c)) 14 | } 15 | } 16 | 17 | fn to_char(b: u8) -> CharacterResult { 18 | match char::try_from(b) { 19 | Ok(c) => Ok(c), 20 | Err(_) => Err(CharacterError::Byte(b)), 21 | } 22 | } 23 | 24 | #[derive(Debug, Error)] 25 | pub enum CharacterError { 26 | #[error("got byte: {0}")] 27 | Byte(u8), 28 | #[error("got class bytes: {0:?}")] 29 | ClassBytes(ClassBytes), 30 | #[error("got unicode: {0}")] 31 | Unicode(char), 32 | #[error("got class unicode: {0:?}")] 33 | ClassUnicode(ClassUnicode), 34 | } 35 | 36 | pub type CharacterResult = Result; 37 | 38 | pub trait Character: Sized + Ord + TryFrom + Into { 39 | fn new_line() -> Self; 40 | 41 | fn from_literal(literal: Literal) -> CharacterResult; 42 | 43 | fn from_class(class: Class) -> CharacterResult>; 44 | 45 | fn to_byte(&self) -> Option; 46 | 47 | fn is_byte(&self) -> bool; 48 | 49 | fn is_next(&self, other: &Self) -> bool; 50 | 51 | fn get_iterator_function(is_byte: bool) -> Ident; 52 | 53 | fn to_usize(element: Ident, is_byte: bool) -> TokenStream; 54 | 55 | fn allow_invalid_utf8() -> bool; 56 | 57 | fn unicode() -> bool; 58 | } 59 | 60 | impl Character for char { 61 | fn new_line() -> 
Self { 62 | '\n' 63 | } 64 | 65 | fn from_literal(literal: Literal) -> CharacterResult { 66 | match literal { 67 | Literal::Unicode(c) => Ok(c), 68 | Literal::Byte(b) => to_char(b), 69 | } 70 | } 71 | 72 | fn from_class(class: Class) -> CharacterResult> { 73 | let mut cs = BTreeSet::new(); 74 | match class { 75 | Class::Unicode(class_unicode) => { 76 | for class_unicode_range in class_unicode.iter() { 77 | let start = class_unicode_range.start(); 78 | let end = class_unicode_range.end(); 79 | for c in start..=end { 80 | cs.insert(c); 81 | } 82 | } 83 | } 84 | Class::Bytes(class_bytes) => { 85 | for class_bytes_range in class_bytes.iter() { 86 | let start = class_bytes_range.start(); 87 | let end = class_bytes_range.end(); 88 | for b in start..=end { 89 | let c = to_char(b)?; 90 | cs.insert(c); 91 | } 92 | } 93 | } 94 | } 95 | Ok(cs) 96 | } 97 | 98 | fn to_byte(&self) -> Option { 99 | to_byte(*self).ok() 100 | } 101 | 102 | fn is_byte(&self) -> bool { 103 | self.len_utf8() == 1 104 | } 105 | 106 | fn is_next(&self, other: &Self) -> bool { 107 | let self_u32: u32 = *self as u32; 108 | if let Some(next) = self_u32.checked_add(1) { 109 | let other_u32 = *other as u32; 110 | next == other_u32 111 | } else { 112 | false 113 | } 114 | } 115 | 116 | fn get_iterator_function(is_byte: bool) -> Ident { 117 | if is_byte { 118 | Ident::new("bytes", Span::call_site()) 119 | } else { 120 | Ident::new("chars", Span::call_site()) 121 | } 122 | } 123 | 124 | fn to_usize(element: Ident, _is_byte: bool) -> TokenStream { 125 | quote! 
{ 126 | #element as usize 127 | } 128 | } 129 | 130 | fn allow_invalid_utf8() -> bool { 131 | false 132 | } 133 | 134 | fn unicode() -> bool { 135 | true 136 | } 137 | } 138 | 139 | impl Character for u8 { 140 | fn new_line() -> Self { 141 | b'\n' 142 | } 143 | 144 | fn from_literal(literal: Literal) -> CharacterResult { 145 | match literal { 146 | Literal::Unicode(c) => to_byte(c), 147 | Literal::Byte(b) => Ok(b), 148 | } 149 | } 150 | 151 | fn from_class(class: Class) -> CharacterResult> { 152 | let mut bs = BTreeSet::new(); 153 | match class { 154 | Class::Unicode(class_unicode) => { 155 | for class_unicode_range in class_unicode.iter() { 156 | let start = class_unicode_range.start(); 157 | let end = class_unicode_range.end(); 158 | for c in start..=end { 159 | let b = to_byte(c)?; 160 | bs.insert(b); 161 | } 162 | } 163 | } 164 | Class::Bytes(class_bytes) => { 165 | for class_bytes_range in class_bytes.iter() { 166 | let start = class_bytes_range.start(); 167 | let end = class_bytes_range.end(); 168 | for b in start..=end { 169 | bs.insert(b); 170 | } 171 | } 172 | } 173 | } 174 | Ok(bs) 175 | } 176 | 177 | fn to_byte(&self) -> Option { 178 | Some(*self) 179 | } 180 | 181 | fn is_byte(&self) -> bool { 182 | true 183 | } 184 | 185 | fn is_next(&self, other: &u8) -> bool { 186 | if let Some(next) = other.checked_add(1) { 187 | next == *other 188 | } else { 189 | false 190 | } 191 | } 192 | 193 | fn get_iterator_function(_is_byte: bool) -> Ident { 194 | Ident::new("into_iter", Span::call_site()) 195 | } 196 | 197 | fn to_usize(element: Ident, _is_byte: bool) -> TokenStream { 198 | quote! 
{ 199 | *#element as usize 200 | } 201 | } 202 | 203 | fn allow_invalid_utf8() -> bool { 204 | true 205 | } 206 | 207 | fn unicode() -> bool { 208 | false 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /src/dfa/to_tokens/binary_search.rs: -------------------------------------------------------------------------------- 1 | use crate::{character::Character, dfa::to_tokens::DfaToTokens}; 2 | use proc_macro2::TokenStream; 3 | use quote::{quote, ToTokens}; 4 | use std::collections::{BTreeMap, BTreeSet}; 5 | 6 | fn transition_condition_to_tokens(start: T, end: T) -> TokenStream 7 | where 8 | T: ToTokens + Ord, 9 | { 10 | if start == end { 11 | quote! { 12 | #start 13 | } 14 | } else { 15 | quote! { 16 | #start..=#end 17 | } 18 | } 19 | } 20 | 21 | impl DfaToTokens 22 | where 23 | T: Character + ToTokens + Ord, 24 | { 25 | fn transition_condition(&self, start: T, end: T) -> TokenStream { 26 | if self.is_byte { 27 | if let Some(start) = start.to_byte() { 28 | if let Some(end) = end.to_byte() { 29 | return transition_condition_to_tokens::(start, end); 30 | } 31 | } 32 | } 33 | 34 | transition_condition_to_tokens(start, end) 35 | } 36 | } 37 | 38 | impl DfaToTokens 39 | where 40 | T: Character + ToTokens + Copy, 41 | { 42 | fn transitions_inverse(transitions: &BTreeSet<(T, usize)>) -> BTreeMap> { 43 | let mut result: BTreeMap> = BTreeMap::new(); 44 | for (c, t) in transitions { 45 | if let Some(set) = result.get_mut(t) { 46 | set.insert(*c); 47 | } else { 48 | let mut set = BTreeSet::new(); 49 | set.insert(*c); 50 | result.insert(*t, set); 51 | } 52 | } 53 | result 54 | } 55 | 56 | fn transitions_inverse_pack( 57 | transitions_inverse: BTreeMap>, 58 | ) -> BTreeMap> { 59 | let mut result = BTreeMap::new(); 60 | for (t, cs) in transitions_inverse { 61 | let mut ranges = BTreeSet::new(); 62 | let mut start = None; 63 | let mut prev: Option = None; 64 | for character in cs { 65 | if let Some(prev) = prev { 66 | if 
!prev.is_next(&character) { 67 | ranges.insert((start.unwrap(), prev)); 68 | start = Some(character); 69 | } 70 | } else { 71 | start = Some(character); 72 | } 73 | prev = Some(character); 74 | } 75 | if let Some(start) = start { 76 | if let Some(prev) = prev { 77 | ranges.insert((start, prev)); 78 | } else { 79 | panic!() 80 | } 81 | } 82 | result.insert(t, ranges); 83 | } 84 | result 85 | } 86 | 87 | fn transitions_inverse_condition(&self, ranges: BTreeSet<(T, T)>) -> TokenStream { 88 | let mut conditions = Vec::new(); 89 | for (start, end) in ranges { 90 | let condition = self.transition_condition(start, end); 91 | conditions.push(condition); 92 | } 93 | quote! { 94 | #(#conditions )|* 95 | } 96 | } 97 | 98 | fn transitions_default(&self) -> TokenStream { 99 | if self.dfa.start_text { 100 | quote! { 101 | return false; 102 | } 103 | } else { 104 | quote! { 105 | 0usize 106 | } 107 | } 108 | } 109 | 110 | fn transitions_binary_search_match_inner(&self, state: usize) -> TokenStream { 111 | let default = self.transitions_default(); 112 | if let Some(transitions) = self.dfa.transitions.get(&state) { 113 | let transitions_inverse = DfaToTokens::::transitions_inverse(transitions); 114 | let transitions_inverse_pack = 115 | DfaToTokens::::transitions_inverse_pack(transitions_inverse); 116 | let mut arms = Vec::new(); 117 | for (t, ranges) in transitions_inverse_pack { 118 | let condition = self.transitions_inverse_condition(ranges); 119 | let arm = if !self.dfa.end_text && self.dfa.accept_states.contains(&t) { 120 | quote! { 121 | #condition => return true 122 | } 123 | } else { 124 | quote! { 125 | #condition => #t 126 | } 127 | }; 128 | arms.push(arm); 129 | } 130 | 131 | quote! 
{ 132 | match c { 133 | #(#arms,)* 134 | _ => { 135 | #default 136 | }, 137 | } 138 | } 139 | } else { 140 | default 141 | } 142 | } 143 | 144 | fn transitions_binary_search_recursive( 145 | &self, 146 | states: &[usize], 147 | start: usize, 148 | len: usize, 149 | ) -> TokenStream { 150 | if len == 1 { 151 | self.transitions_binary_search_match_inner(states[start]) 152 | } else if len == 2 { 153 | let left_state = states[start]; 154 | let right_state = states[start + 1]; 155 | let left = self.transitions_binary_search_match_inner(left_state); 156 | let right = self.transitions_binary_search_match_inner(right_state); 157 | quote! { 158 | if state == #left_state { 159 | #left 160 | } else { 161 | #right 162 | } 163 | } 164 | } else { 165 | let new_len = len / 2; 166 | let remain = len % 2; 167 | let new_start = start + new_len; 168 | let new_state = states[new_start]; 169 | let left = self.transitions_binary_search_recursive(states, start, new_len); 170 | let right = 171 | self.transitions_binary_search_recursive(states, new_start, new_len + remain); 172 | quote! { 173 | if state < #new_state { 174 | #left 175 | } else { 176 | #right 177 | } 178 | } 179 | } 180 | } 181 | 182 | pub(super) fn for_each_binary_search(&self) -> TokenStream { 183 | let states: Vec = self.required_states.iter().copied().collect(); 184 | let iterator = T::get_iterator_function(self.is_byte); 185 | let transitions = self.transitions_binary_search_recursive(&states[..], 0, states.len()); 186 | quote! 
{ 187 | let mut state = 0; 188 | 189 | for c in s.#iterator() { 190 | state = #transitions; 191 | } 192 | } 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /src/dfa/nfa_to_dfa.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | character::Character, 3 | dfa::Dfa, 4 | nfa::{Nfa, START_STATE}, 5 | }; 6 | use std::{ 7 | collections::{BTreeMap, BTreeSet}, 8 | convert::From, 9 | fmt::Debug, 10 | }; 11 | 12 | type State = BTreeSet; 13 | type Transition = (State, T, State); 14 | 15 | #[derive(Debug)] 16 | pub(crate) struct NfaToDfaIter 17 | where 18 | T: Character + Copy, 19 | { 20 | nfa: Nfa, 21 | states: BTreeSet, 22 | new_states: BTreeSet, 23 | transitions: BTreeSet>, 24 | accept_states: BTreeSet, 25 | } 26 | 27 | impl NfaToDfaIter 28 | where 29 | T: Character + Copy, 30 | { 31 | pub(super) fn new(nfa: Nfa) -> NfaToDfaIter { 32 | let mut start_state = BTreeSet::new(); 33 | start_state.insert(START_STATE.to_owned()); 34 | 35 | // The start state is always there. 36 | let mut states: BTreeSet> = BTreeSet::new(); 37 | states.insert(start_state); 38 | 39 | // The start state is an accept-state and the end_text is false. This means that we are 40 | // already on a accept-state and we do not have to parse all the string. As a result, the 41 | // DFA is always true. 42 | let new_states = if nfa.is_accept_state(START_STATE) && !nfa.is_end_text() { 43 | BTreeSet::new() 44 | } else { 45 | states.clone() 46 | }; 47 | 48 | let accept_states = if nfa.is_accept_state(START_STATE) { 49 | states.clone() 50 | } else { 51 | BTreeSet::new() 52 | }; 53 | 54 | NfaToDfaIter { 55 | nfa, 56 | states, 57 | new_states, 58 | transitions: BTreeSet::new(), 59 | accept_states, 60 | } 61 | } 62 | 63 | /// Returns a set of all character the given state has a transition as source state. 
64 | fn characters(&self, state: &State) -> BTreeSet { 65 | let mut characters = BTreeSet::new(); 66 | for s in state { 67 | self.nfa.chars(*s, &mut characters); 68 | } 69 | characters 70 | } 71 | 72 | /// 73 | fn simulate(&self, state: &State, c: T) -> State { 74 | let mut new_state = BTreeSet::new(); 75 | for s in state { 76 | self.nfa.simulate(*s, c, &mut new_state); 77 | } 78 | new_state 79 | } 80 | 81 | fn is_accept_state(&self, state: &State) -> bool { 82 | for s in state { 83 | if self.nfa.is_accept_state(*s) { 84 | return true; 85 | } 86 | } 87 | false 88 | } 89 | 90 | fn next_step(&mut self) { 91 | let mut new_states = BTreeSet::new(); 92 | for state in self.new_states.iter() { 93 | let chars = self.characters(state); 94 | for c in chars { 95 | let mut new_state = self.simulate(state, c); 96 | if !self.nfa.is_start_text() { 97 | new_state.insert(START_STATE.to_owned()); 98 | } 99 | 100 | if !self.states.contains(&new_state) { 101 | self.states.insert(new_state.clone()); 102 | new_states.insert(new_state.clone()); 103 | 104 | if self.is_accept_state(&new_state) { 105 | self.accept_states.insert(new_state.clone()); 106 | } 107 | } 108 | self.transitions.insert((state.clone(), c, new_state)); 109 | } 110 | } 111 | self.new_states = new_states; 112 | } 113 | } 114 | 115 | impl Iterator for &mut NfaToDfaIter 116 | where 117 | T: Character + Copy, 118 | { 119 | type Item = usize; 120 | 121 | fn next(&mut self) -> Option { 122 | if self.new_states.is_empty() { 123 | return None; 124 | } 125 | 126 | self.next_step(); 127 | 128 | match self.new_states.len() { 129 | 0 => None, 130 | len => Some(len), 131 | } 132 | } 133 | } 134 | 135 | impl From> for Dfa 136 | where 137 | T: Character + Copy, 138 | { 139 | fn from(mut nfa_to_dfa: NfaToDfaIter) -> Self { 140 | for _ in &mut nfa_to_dfa {} 141 | 142 | let mut states = BTreeSet::new(); 143 | let mut accept_states = BTreeSet::new(); 144 | let mut mapping = BTreeMap::new(); 145 | 146 | let mut start_state = 
BTreeSet::new(); 147 | start_state.insert(START_STATE); 148 | 149 | // It has to be ensured that the start-state is mapped to zero. 150 | // Therefore, the start-state has to be removed. 151 | nfa_to_dfa.states.remove(&start_state); 152 | states.insert(START_STATE); 153 | if nfa_to_dfa.accept_states.remove(&start_state) { 154 | accept_states.insert(START_STATE); 155 | } 156 | mapping.insert(start_state, START_STATE); 157 | 158 | // First map all non accept-states. 159 | for state in nfa_to_dfa.states { 160 | if !nfa_to_dfa.accept_states.contains(&state) { 161 | states.insert(mapping.len()); 162 | mapping.insert(state, mapping.len()); 163 | } 164 | } 165 | 166 | // Then map all accept-states. 167 | // Because if `end_text` equals true then the accept states are implemented differently so 168 | // all accept-states should be at the end of the mapping. 169 | for accept_state in nfa_to_dfa.accept_states.iter() { 170 | states.insert(mapping.len()); 171 | mapping.insert(accept_state.clone(), mapping.len()); 172 | } 173 | 174 | // Convert the transitions according the mapping. 175 | let mut transitions: BTreeMap> = BTreeMap::new(); 176 | for (s, c, t) in nfa_to_dfa.transitions { 177 | let s = mapping.get(&s).unwrap(); 178 | let t = mapping.get(&t).unwrap(); 179 | if let Some(state_transitions) = transitions.get_mut(s) { 180 | state_transitions.insert((c, *t)); 181 | } else { 182 | let mut state_transitions = BTreeSet::new(); 183 | state_transitions.insert((c, *t)); 184 | transitions.insert(*s, state_transitions); 185 | } 186 | } 187 | 188 | // Convert the accept states according the mapping. 
189 | for s in nfa_to_dfa.accept_states { 190 | let s = mapping.get(&s).unwrap(); 191 | accept_states.insert(*s); 192 | } 193 | 194 | Dfa { 195 | states, 196 | transitions, 197 | accept_states, 198 | start_text: nfa_to_dfa.nfa.is_start_text(), 199 | end_text: nfa_to_dfa.nfa.is_end_text(), 200 | } 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /src/dfa/to_tokens/lookup_table.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | character::Character, 3 | dfa::to_tokens::{usize_to_lit_int, DfaToTokens}, 4 | }; 5 | use proc_macro2::{Span, TokenStream}; 6 | use quote::{quote, ToTokens}; 7 | use std::{ 8 | collections::{BTreeMap, BTreeSet}, 9 | mem::size_of, 10 | }; 11 | use syn::Ident; 12 | 13 | impl DfaToTokens 14 | where 15 | T: Character + ToTokens + Copy + Into, 16 | { 17 | fn lookup_table_u8_row_map(transitions: &BTreeSet<(T, usize)>) -> Option> { 18 | let mut transitions_u8 = BTreeMap::new(); 19 | for (ch, t) in transitions.iter() { 20 | let ch = ch.to_byte()?; 21 | transitions_u8.insert(ch, *t); 22 | } 23 | Some(transitions_u8) 24 | } 25 | 26 | fn lookup_table_row_no_transition_default(&self, int_type: &Ident) -> TokenStream { 27 | if self.dfa.start_text { 28 | quote! { 29 | #int_type::MAX 30 | } 31 | } else { 32 | quote! { 33 | 0 34 | } 35 | } 36 | } 37 | 38 | fn lookup_table_row_accept_transition_end(&self, int_type: &Ident) -> TokenStream { 39 | if self.dfa.start_text { 40 | quote! { 41 | #int_type::MAX - 1 42 | } 43 | } else { 44 | quote! 
{ 45 | #int_type::MAX 46 | } 47 | } 48 | } 49 | 50 | fn lookup_table_row( 51 | &self, 52 | transitions_u8: &BTreeMap, 53 | int_type: &Ident, 54 | ) -> Vec { 55 | let no_transition_default = self.lookup_table_row_no_transition_default(int_type); 56 | let accept_transition_end = self.lookup_table_row_accept_transition_end(int_type); 57 | let mut row = Vec::with_capacity(256); 58 | for i in 0..=u8::MAX { 59 | let new_state = if let Some(t) = transitions_u8.get(&i) { 60 | if !self.dfa.end_text && self.dfa.accept_states.contains(t) { 61 | accept_transition_end.clone() 62 | } else { 63 | let new_state = usize_to_lit_int(*t); 64 | quote! { 65 | #new_state 66 | } 67 | } 68 | } else { 69 | no_transition_default.clone() 70 | }; 71 | row.push(new_state); 72 | } 73 | row 74 | } 75 | 76 | fn lookup_table_row_default(&self, int_type: &Ident) -> Vec { 77 | let no_transition_default = self.lookup_table_row_no_transition_default(int_type); 78 | vec![no_transition_default; 256] 79 | } 80 | 81 | fn transitions_lookup_table(&self, int_type: &Ident) -> Option { 82 | let mut table = Vec::new(); 83 | for state in self.required_states.iter() { 84 | let row = if let Some(transitions) = self.dfa.transitions.get(state) { 85 | let transitions_u8 = DfaToTokens::::lookup_table_u8_row_map(transitions)?; 86 | self.lookup_table_row(&transitions_u8, int_type) 87 | } else { 88 | self.lookup_table_row_default(int_type) 89 | }; 90 | table.push(quote! { 91 | [#(#row),*] 92 | }); 93 | } 94 | let len = table.len(); 95 | let transitions = quote! { 96 | static TABLE: [[#int_type; 256]; #len] = [#(#table),*] 97 | }; 98 | Some(transitions) 99 | } 100 | 101 | fn for_each_lookup_table_check(&self, int_type: &Ident) -> TokenStream { 102 | match (self.dfa.start_text, self.dfa.end_text) { 103 | (false, false) => quote! { 104 | if state == #int_type::MAX { 105 | return true; 106 | } 107 | }, 108 | (false, true) => quote! {}, 109 | (true, false) => quote! 
{ 110 | if state == #int_type::MAX { 111 | return false; 112 | } else if state == #int_type::MAX - 1 { 113 | return true; 114 | } 115 | }, 116 | (true, true) => quote! { 117 | if state == #int_type::MAX { 118 | return false; 119 | } 120 | }, 121 | } 122 | } 123 | 124 | pub(super) fn for_each_lookup_table(&self) -> Option { 125 | if !self.is_byte { 126 | return None; 127 | } 128 | 129 | let int_type = self.get_int_type()?; 130 | let transitions_lookup_table = self.transitions_lookup_table(&int_type)?; 131 | let iterator = T::get_iterator_function(self.is_byte); 132 | let c_to_usize = T::to_usize(Ident::new("c", Span::call_site()), self.is_byte); 133 | let check = self.for_each_lookup_table_check(&int_type); 134 | let for_each = quote! { 135 | #transitions_lookup_table; 136 | let mut state = 0; 137 | 138 | for c in s.#iterator() { 139 | state = TABLE[state as usize][#c_to_usize]; 140 | 141 | #check 142 | } 143 | }; 144 | Some(for_each) 145 | } 146 | 147 | fn lookup_table_states(&self) -> Option { 148 | let mut additional_state = 0; 149 | 150 | if self.dfa.start_text { 151 | additional_state += 1; 152 | } 153 | 154 | if !self.dfa.end_text { 155 | additional_state += 1; 156 | } 157 | 158 | self.required_states.len().checked_add(additional_state) 159 | } 160 | 161 | fn lookup_table_size(&self) -> Option<(usize, Ident)> { 162 | let states = self.lookup_table_states()?; 163 | let states_character = states.checked_mul(256)?; 164 | let ret = if states < (u8::MAX) as usize { 165 | ( 166 | states_character.checked_mul(size_of::())?, 167 | Ident::new("u8", Span::call_site()), 168 | ) 169 | } else if states < (u16::MAX) as usize { 170 | ( 171 | states_character.checked_mul(size_of::())?, 172 | Ident::new("u16", Span::call_site()), 173 | ) 174 | } else if states < (u32::MAX) as usize { 175 | ( 176 | states_character.checked_mul(size_of::())?, 177 | Ident::new("u32", Span::call_site()), 178 | ) 179 | } else { 180 | ( 181 | states_character.checked_mul(size_of::())?, 182 | 
Ident::new("u64", Span::call_site()), 183 | ) 184 | }; 185 | Some(ret) 186 | } 187 | 188 | pub(super) fn get_int_type(&self) -> Option { 189 | let (lookup_table_size, int_type) = self.lookup_table_size()?; 190 | if lookup_table_size <= self.threshold { 191 | Some(int_type) 192 | } else { 193 | None 194 | } 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /src/nfa/repetition.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | character::Character, 3 | nfa::{NFAResult, Nfa}, 4 | }; 5 | use regex_syntax::hir::{Hir, Repetition, RepetitionKind, RepetitionRange}; 6 | use std::collections::{BTreeMap, BTreeSet}; 7 | 8 | impl Nfa 9 | where 10 | T: Character + Copy, 11 | { 12 | fn repetition_range_exactly(&mut self, hir: Hir, exactly: u32) -> NFAResult<()> { 13 | for _ in 0..exactly { 14 | let nfa = self.sub(hir.clone())?; 15 | self.append_states(&nfa)?; 16 | self.accept_states = nfa.accept_states; 17 | } 18 | Ok(()) 19 | } 20 | 21 | fn repetition_range_at_least(&mut self, hir: Hir, at_least: u32) -> NFAResult<()> { 22 | for _ in 0..at_least { 23 | let nfa = self.sub(hir.clone())?; 24 | self.append_states(&nfa)?; 25 | self.accept_states = nfa.accept_states; 26 | } 27 | self.repetition_zero_or_more(hir) 28 | } 29 | 30 | fn repetition_range_bounded(&mut self, hir: Hir, m: u32, n: u32) -> NFAResult<()> { 31 | if m != 0 { 32 | self.repetition_range_exactly(hir.clone(), m)?; 33 | } 34 | 35 | let mut accept_states = self.accept_states.clone(); 36 | for _ in m..n { 37 | let nfa = self.sub(hir.clone())?; 38 | self.append_states(&nfa)?; 39 | accept_states.extend(nfa.accept_states.clone()); 40 | self.accept_states = nfa.accept_states; 41 | } 42 | self.accept_states = accept_states; 43 | 44 | Ok(()) 45 | } 46 | 47 | fn repetition_range(&mut self, hir: Hir, repetition_range: RepetitionRange) -> NFAResult<()> { 48 | match repetition_range { 49 | RepetitionRange::Exactly(exactly) => 
self.repetition_range_exactly(hir, exactly), 50 | RepetitionRange::AtLeast(at_least) => self.repetition_range_at_least(hir, at_least), 51 | RepetitionRange::Bounded(m, n) => self.repetition_range_bounded(hir, m, n), 52 | } 53 | } 54 | 55 | fn repetition_zero_or_one(&mut self, hir: Hir) -> NFAResult<()> { 56 | let nfa = self.sub(hir)?; 57 | self.append_states(&nfa)?; 58 | self.accept_states.extend(nfa.accept_states); 59 | Ok(()) 60 | } 61 | 62 | fn repetition_zero_or_more(&mut self, hir: Hir) -> NFAResult<()> { 63 | let nfa = self.sub(hir)?; 64 | for state in nfa.states { 65 | if !nfa.accept_states.contains(&state) { 66 | self.add_state(state)?; 67 | } 68 | } 69 | 70 | for (source_state, characters_to_targets) in nfa.transitions { 71 | for (character, targets) in characters_to_targets { 72 | for target_state in targets { 73 | let s_accept = nfa.accept_states.contains(&source_state); 74 | let t_accept = nfa.accept_states.contains(&target_state); 75 | match (s_accept, t_accept) { 76 | (true, true) => { 77 | for source_state in self.accept_states.iter() { 78 | for target_state in self.accept_states.iter() { 79 | Nfa::add_transition( 80 | &mut self.transitions, 81 | *target_state, 82 | character, 83 | *source_state, 84 | ); 85 | } 86 | } 87 | } 88 | (true, false) => { 89 | for source_state in self.accept_states.iter() { 90 | Nfa::add_transition( 91 | &mut self.transitions, 92 | *source_state, 93 | character, 94 | target_state, 95 | ); 96 | } 97 | } 98 | (false, true) => { 99 | for target_state in self.accept_states.iter() { 100 | Nfa::add_transition( 101 | &mut self.transitions, 102 | source_state, 103 | character, 104 | *target_state, 105 | ); 106 | } 107 | } 108 | (false, false) => { 109 | Nfa::add_transition( 110 | &mut self.transitions, 111 | source_state, 112 | character, 113 | target_state, 114 | ); 115 | } 116 | } 117 | } 118 | } 119 | } 120 | 121 | Ok(()) 122 | } 123 | 124 | fn repetition_one_or_more(&mut self, hir: Hir) -> NFAResult<()> { 125 | let mut nfa = 
self.sub(hir)?; 126 | let mut backwards_characters_to_targets: BTreeMap> = BTreeMap::new(); 127 | for accept_state in self.accept_states.iter() { 128 | if let Some(characters_to_targets) = nfa.transitions.get(accept_state) { 129 | for (character, targets) in characters_to_targets.iter() { 130 | if let Some(backwards_targets) = 131 | backwards_characters_to_targets.get_mut(character) 132 | { 133 | for target in targets { 134 | backwards_targets.insert(*target); 135 | } 136 | } else { 137 | backwards_characters_to_targets.insert(*character, targets.clone()); 138 | } 139 | } 140 | } 141 | } 142 | 143 | for (character, targets) in backwards_characters_to_targets { 144 | for target_state in targets { 145 | for accept_state in nfa.accept_states.iter() { 146 | Nfa::add_transition( 147 | &mut nfa.transitions, 148 | *accept_state, 149 | character, 150 | target_state, 151 | ); 152 | } 153 | } 154 | } 155 | 156 | self.append_states(&nfa)?; 157 | self.accept_states = nfa.accept_states; 158 | Ok(()) 159 | } 160 | 161 | pub(super) fn repetition(&mut self, repetition: Repetition) -> NFAResult<()> { 162 | match repetition.kind { 163 | RepetitionKind::ZeroOrOne => self.repetition_zero_or_one(*repetition.hir), 164 | RepetitionKind::ZeroOrMore => self.repetition_zero_or_more(*repetition.hir), 165 | RepetitionKind::OneOrMore => self.repetition_one_or_more(*repetition.hir), 166 | RepetitionKind::Range(repetition_range) => { 167 | self.repetition_range(*repetition.hir, repetition_range) 168 | } 169 | } 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /tests/repetition.rs: -------------------------------------------------------------------------------- 1 | use proc_macro_regex::regex; 2 | 3 | #[test] 4 | fn zero_or_more_1() { 5 | regex!(regex "a*"); 6 | assert!(regex("")); 7 | assert!(regex("a")); 8 | assert!(regex("aa")); 9 | assert!(regex("b")); 10 | assert!(regex("ab")); 11 | assert!(regex("ba")); 12 | } 13 | 14 | #[test] 15 | fn 
zero_or_more_2() { 16 | regex!(regex "^a*"); 17 | assert!(regex("")); 18 | assert!(regex("a")); 19 | assert!(regex("aa")); 20 | assert!(regex("ab")); 21 | assert!(regex("ba")); 22 | } 23 | 24 | #[test] 25 | fn zero_or_more_3() { 26 | regex!(regex "a*$"); 27 | assert!(regex("")); 28 | assert!(regex("a")); 29 | assert!(regex("aa")); 30 | assert!(regex("ab")); 31 | assert!(regex("ba")); 32 | } 33 | 34 | #[test] 35 | fn zero_or_more_4() { 36 | regex!(regex "^a*$"); 37 | assert!(regex("")); 38 | assert!(regex("a")); 39 | assert!(regex("aa")); 40 | assert!(!regex("b")); 41 | assert!(!regex("ab")); 42 | assert!(!regex("ba")); 43 | } 44 | 45 | #[test] 46 | fn zero_or_one_1() { 47 | regex!(regex "a?"); 48 | assert!(regex("")); 49 | assert!(regex("a")); 50 | assert!(regex("aa")); 51 | assert!(regex("b")); 52 | assert!(regex("ab")); 53 | assert!(regex("ba")); 54 | } 55 | 56 | #[test] 57 | fn zero_or_one_2() { 58 | regex!(regex "^a?"); 59 | assert!(regex("")); 60 | assert!(regex("a")); 61 | assert!(regex("aa")); 62 | assert!(regex("b")); 63 | assert!(regex("ab")); 64 | assert!(regex("ba")); 65 | } 66 | 67 | #[test] 68 | fn zero_or_one_3() { 69 | regex!(regex "a?$"); 70 | assert!(regex("")); 71 | assert!(regex("a")); 72 | assert!(regex("aa")); 73 | assert!(regex("b")); 74 | assert!(regex("ab")); 75 | assert!(regex("ba")); 76 | } 77 | 78 | #[test] 79 | fn zero_or_one_4() { 80 | regex!(regex "^a?$"); 81 | assert!(regex("")); 82 | assert!(regex("a")); 83 | assert!(!regex("aa")); 84 | assert!(!regex("b")); 85 | assert!(!regex("ab")); 86 | assert!(!regex("ba")); 87 | } 88 | 89 | #[test] 90 | fn one_or_more_1() { 91 | regex!(regex "a+"); 92 | assert!(!regex("")); 93 | assert!(regex("a")); 94 | assert!(regex("aa")); 95 | assert!(!regex("b")); 96 | assert!(regex("ab")); 97 | assert!(regex("ba")); 98 | } 99 | 100 | #[test] 101 | fn one_or_more_2() { 102 | regex!(regex "^a+"); 103 | assert!(!regex("")); 104 | assert!(regex("a")); 105 | assert!(regex("aa")); 106 | assert!(!regex("b")); 
107 | assert!(regex("ab")); 108 | assert!(!regex("ba")); 109 | } 110 | 111 | #[test] 112 | fn one_or_more_3() { 113 | regex!(regex "a+$"); 114 | assert!(!regex("")); 115 | assert!(regex("a")); 116 | assert!(regex("aa")); 117 | assert!(!regex("b")); 118 | assert!(!regex("ab")); 119 | assert!(regex("ba")); 120 | } 121 | 122 | #[test] 123 | fn one_or_more_4() { 124 | regex!(regex "^a+$"); 125 | assert!(!regex("")); 126 | assert!(regex("a")); 127 | assert!(regex("aa")); 128 | assert!(!regex("b")); 129 | assert!(!regex("ab")); 130 | assert!(!regex("ba")); 131 | } 132 | 133 | #[test] 134 | fn range_exactly_1() { 135 | regex!(regex "a{2}"); 136 | assert!(!regex("")); 137 | assert!(!regex("a")); 138 | assert!(regex("aa")); 139 | assert!(regex("aaa")); 140 | assert!(!regex("b")); 141 | assert!(!regex("ab")); 142 | assert!(regex("aab")); 143 | assert!(regex("aaab")); 144 | assert!(!regex("ba")); 145 | assert!(regex("baa")); 146 | assert!(regex("baaa")); 147 | } 148 | 149 | #[test] 150 | fn range_exactly_2() { 151 | regex!(regex "^a{2}"); 152 | assert!(!regex("")); 153 | assert!(!regex("a")); 154 | assert!(regex("aa")); 155 | assert!(regex("aaa")); 156 | assert!(!regex("b")); 157 | assert!(!regex("ab")); 158 | assert!(regex("aab")); 159 | assert!(regex("aaab")); 160 | assert!(!regex("ba")); 161 | assert!(!regex("baa")); 162 | assert!(!regex("baaa")); 163 | } 164 | 165 | #[test] 166 | fn range_exactly_3() { 167 | regex!(regex "a{2}$"); 168 | assert!(!regex("")); 169 | assert!(!regex("a")); 170 | assert!(regex("aa")); 171 | assert!(regex("aaa")); 172 | assert!(!regex("b")); 173 | assert!(!regex("ab")); 174 | assert!(!regex("aab")); 175 | assert!(!regex("aaab")); 176 | assert!(!regex("ba")); 177 | assert!(regex("baa")); 178 | assert!(regex("baaa")); 179 | } 180 | 181 | #[test] 182 | fn range_exactly_4() { 183 | regex!(regex "^a{2}$"); 184 | assert!(!regex("")); 185 | assert!(!regex("a")); 186 | assert!(regex("aa")); 187 | assert!(!regex("aaa")); 188 | assert!(!regex("b")); 189 | 
assert!(!regex("ab")); 190 | assert!(!regex("aab")); 191 | assert!(!regex("aaab")); 192 | assert!(!regex("ba")); 193 | assert!(!regex("baa")); 194 | assert!(!regex("baaa")); 195 | } 196 | 197 | #[test] 198 | fn range_at_least_1() { 199 | regex!(regex "a{2,}"); 200 | assert!(!regex("")); 201 | assert!(!regex("a")); 202 | assert!(regex("aa")); 203 | assert!(regex("aaa")); 204 | assert!(!regex("b")); 205 | assert!(!regex("ab")); 206 | assert!(regex("aab")); 207 | assert!(regex("aaab")); 208 | assert!(!regex("ba")); 209 | assert!(regex("baa")); 210 | assert!(regex("baaa")); 211 | } 212 | 213 | #[test] 214 | fn range_at_least_2() { 215 | regex!(regex "^a{2,}"); 216 | assert!(!regex("")); 217 | assert!(!regex("a")); 218 | assert!(regex("aa")); 219 | assert!(regex("aaa")); 220 | assert!(!regex("b")); 221 | assert!(!regex("ab")); 222 | assert!(regex("aab")); 223 | assert!(regex("aaab")); 224 | assert!(!regex("ba")); 225 | assert!(!regex("baa")); 226 | assert!(!regex("baaa")); 227 | } 228 | 229 | #[test] 230 | fn range_at_least_3() { 231 | regex!(regex "a{2,}$"); 232 | assert!(!regex("")); 233 | assert!(!regex("a")); 234 | assert!(regex("aa")); 235 | assert!(regex("aaa")); 236 | assert!(!regex("b")); 237 | assert!(!regex("ab")); 238 | assert!(!regex("aab")); 239 | assert!(!regex("aaab")); 240 | assert!(!regex("ba")); 241 | assert!(regex("baa")); 242 | assert!(regex("baaa")); 243 | } 244 | 245 | #[test] 246 | fn range_at_least_4() { 247 | regex!(regex "^a{2,}$"); 248 | assert!(!regex("")); 249 | assert!(!regex("a")); 250 | assert!(regex("aa")); 251 | assert!(regex("aaa")); 252 | assert!(!regex("b")); 253 | assert!(!regex("ab")); 254 | assert!(!regex("aab")); 255 | assert!(!regex("aaab")); 256 | assert!(!regex("ba")); 257 | assert!(!regex("baa")); 258 | assert!(!regex("baaa")); 259 | } 260 | 261 | #[test] 262 | fn range_bounded_1() { 263 | regex!(regex "a{1,4}"); 264 | assert!(!regex("")); 265 | assert!(regex("a")); 266 | assert!(regex("aa")); 267 | assert!(regex("aaa")); 268 
| assert!(regex("aaaa")); 269 | assert!(regex("aaaaa")); 270 | assert!(!regex("b")); 271 | assert!(regex("ab")); 272 | assert!(regex("aab")); 273 | assert!(regex("aaab")); 274 | assert!(regex("aaaab")); 275 | assert!(regex("aaaaab")); 276 | assert!(regex("ba")); 277 | assert!(regex("baa")); 278 | assert!(regex("baaa")); 279 | assert!(regex("baaaa")); 280 | assert!(regex("baaaaa")); 281 | } 282 | 283 | #[test] 284 | fn range_bounded_2() { 285 | regex!(regex "^a{1,4}"); 286 | assert!(!regex("")); 287 | assert!(regex("a")); 288 | assert!(regex("aa")); 289 | assert!(regex("aaa")); 290 | assert!(regex("aaaa")); 291 | assert!(regex("aaaaa")); 292 | assert!(!regex("b")); 293 | assert!(regex("ab")); 294 | assert!(regex("aab")); 295 | assert!(regex("aaab")); 296 | assert!(regex("aaaab")); 297 | assert!(regex("aaaaab")); 298 | assert!(!regex("ba")); 299 | assert!(!regex("baa")); 300 | assert!(!regex("baaa")); 301 | assert!(!regex("baaaa")); 302 | assert!(!regex("baaaaa")); 303 | } 304 | 305 | #[test] 306 | fn range_bounded_3() { 307 | regex!(regex "a{1,4}$"); 308 | assert!(!regex("")); 309 | assert!(regex("a")); 310 | assert!(regex("aa")); 311 | assert!(regex("aaa")); 312 | assert!(regex("aaaa")); 313 | assert!(regex("aaaaa")); 314 | assert!(!regex("b")); 315 | assert!(!regex("ab")); 316 | assert!(!regex("aab")); 317 | assert!(!regex("aaab")); 318 | assert!(!regex("aaaab")); 319 | assert!(!regex("aaaaab")); 320 | assert!(regex("ba")); 321 | assert!(regex("baa")); 322 | assert!(regex("baaa")); 323 | assert!(regex("baaaaa")); 324 | } 325 | 326 | #[test] 327 | fn range_bounded_4() { 328 | regex!(regex "^a{1,4}$"); 329 | assert!(!regex("")); 330 | assert!(regex("a")); 331 | assert!(regex("aa")); 332 | assert!(regex("aaa")); 333 | assert!(regex("aaaa")); 334 | assert!(!regex("aaaaa")); 335 | assert!(!regex("b")); 336 | assert!(!regex("ab")); 337 | assert!(!regex("aab")); 338 | assert!(!regex("aaab")); 339 | assert!(!regex("aaaab")); 340 | assert!(!regex("aaaaab")); 341 | 
assert!(!regex("ba")); 342 | assert!(!regex("baa")); 343 | assert!(!regex("baaa")); 344 | assert!(!regex("baaaa")); 345 | assert!(!regex("baaaaa")); 346 | } 347 | -------------------------------------------------------------------------------- /src/nfa/mod.rs: -------------------------------------------------------------------------------- 1 | mod repetition; 2 | 3 | use crate::{ 4 | character::{Character, CharacterError}, 5 | macro_input::MacroInput, 6 | }; 7 | use regex_syntax::{ 8 | hir::{Anchor, Class, Group, Hir, HirKind, Literal, WordBoundary}, 9 | ParserBuilder, 10 | }; 11 | use std::{ 12 | cmp::max, 13 | collections::{BTreeMap, BTreeSet}, 14 | fmt::Debug, 15 | }; 16 | use syn::{Error as SynError, Result as SynResult}; 17 | use thiserror::Error; 18 | 19 | pub const START_STATE: usize = 0; 20 | 21 | type Transition = BTreeMap>>; 22 | 23 | pub type NFAResult = Result; 24 | 25 | fn to_hir(input: &MacroInput) -> SynResult 26 | where 27 | T: Character, 28 | { 29 | let mut parser = ParserBuilder::new() 30 | .unicode(T::unicode()) 31 | .allow_invalid_utf8(T::allow_invalid_utf8()) 32 | .build(); 33 | match parser.parse(&input.get_regex()) { 34 | Ok(hir) => Ok(hir), 35 | Err(e) => Err(SynError::new( 36 | input.get_regex_span(), 37 | format!("Could not parse regex: {:?}", e), 38 | )), 39 | } 40 | } 41 | 42 | #[derive(Debug, Error)] 43 | pub enum NFAError { 44 | #[error("alternation has zero length")] /* fixed typo: was "lenght" */ 45 | AlternationZeroLen, 46 | #[error("CharacterError: {0}")] 47 | CharacterError(#[from] CharacterError), 48 | #[error("Start text was not at the beginning of the regex")] 49 | StartTextError, 50 | #[error("End text was not at the end of the text")] 51 | EndTextError, 52 | } 53 | 54 | #[derive(Debug)] 55 | pub struct Nfa 56 | where 57 | T: Character + Copy, 58 | { 59 | states: BTreeSet, 60 | transitions: Transition, 61 | accept_states: BTreeSet, 62 | state_count: usize, 63 | start_text: bool, 64 | end_text: bool, 65 | } 66 | 67 | impl Nfa 68 | where 69 | T: Character + 
Copy, 70 | { 71 | fn add_transition( 72 | transitions: &mut Transition, 73 | source_state: usize, 74 | character: T, 75 | target_state: usize, 76 | ) { 77 | if let Some(characters_to_targets) = transitions.get_mut(&source_state) { 78 | if let Some(targets) = characters_to_targets.get_mut(&character) { 79 | targets.insert(target_state); 80 | } else { 81 | let mut targets = BTreeSet::new(); 82 | targets.insert(target_state); 83 | 84 | characters_to_targets.insert(character, targets); 85 | } 86 | } else { 87 | let mut targets = BTreeSet::new(); 88 | targets.insert(target_state); 89 | 90 | let mut characters_to_targets = BTreeMap::new(); 91 | characters_to_targets.insert(character, targets); 92 | 93 | transitions.insert(source_state, characters_to_targets); 94 | } 95 | } 96 | 97 | fn extend_transitions(d: &mut Transition, s: &Transition) { 98 | for (new_source_state, new_characters_to_targets) in s.iter() { 99 | if let Some(characters_to_targets) = d.get_mut(new_source_state) { 100 | for (new_character, new_targets) in new_characters_to_targets.iter() { 101 | if let Some(targets) = characters_to_targets.get_mut(new_character) { 102 | for new_target_state in new_targets { 103 | targets.insert(*new_target_state); 104 | } 105 | } else { 106 | characters_to_targets.insert(*new_character, new_targets.clone()); 107 | } 108 | } 109 | } else { 110 | d.insert(*new_source_state, new_characters_to_targets.clone()); 111 | } 112 | } 113 | } 114 | 115 | fn add_state(&mut self, new_state: usize) -> NFAResult<()> { 116 | if self.end_text { 117 | return Err(NFAError::EndTextError); 118 | } 119 | 120 | let assert = self.states.insert(new_state); 121 | debug_assert!(assert); 122 | self.state_count = max(new_state, self.state_count); 123 | Ok(()) 124 | } 125 | 126 | fn append_states(&mut self, nfa: &Nfa) -> NFAResult<()> { 127 | self.set_start_text(nfa.start_text)?; 128 | if !nfa.states.is_empty() { 129 | if self.end_text { 130 | return Err(NFAError::EndTextError); 131 | } 132 | 133 | for 
new_state in nfa.states.iter() { 134 | let assert = self.states.insert(*new_state); 135 | debug_assert!(assert); 136 | self.state_count = max(*new_state, self.state_count); 137 | } 138 | Nfa::extend_transitions(&mut self.transitions, &nfa.transitions); 139 | } 140 | self.end_text = nfa.end_text; 141 | Ok(()) 142 | } 143 | 144 | fn next_state_count(&mut self) -> usize { 145 | self.state_count += 1; 146 | self.state_count 147 | } 148 | 149 | fn next_state(&mut self) -> NFAResult { 150 | let new_state = self.next_state_count(); 151 | self.add_state(new_state)?; 152 | Ok(new_state) 153 | } 154 | 155 | fn sub(&mut self, hir: Hir) -> NFAResult> { 156 | let mut nfa = Nfa { 157 | states: BTreeSet::new(), 158 | transitions: Transition::new(), 159 | accept_states: self.accept_states.clone(), 160 | state_count: self.next_state_count(), 161 | start_text: false, 162 | end_text: self.end_text, 163 | }; 164 | nfa.hir(hir)?; 165 | Ok(nfa) 166 | } 167 | 168 | fn new() -> Nfa { 169 | let mut states = BTreeSet::new(); 170 | states.insert(START_STATE); 171 | 172 | Nfa { 173 | states: states.clone(), 174 | transitions: Transition::new(), 175 | accept_states: states, 176 | state_count: START_STATE, 177 | start_text: false, 178 | end_text: false, 179 | } 180 | } 181 | 182 | fn char(&mut self, c: T) -> NFAResult<()> { 183 | let state = self.next_state()?; 184 | for s in self.accept_states.iter() { 185 | Nfa::add_transition(&mut self.transitions, *s, c, state); 186 | } 187 | self.accept_states = BTreeSet::new(); 188 | self.accept_states.insert(state); 189 | Ok(()) 190 | } 191 | 192 | fn literal(&mut self, literal: Literal) -> NFAResult<()> { 193 | let c = T::from_literal(literal)?; 194 | self.char(c) 195 | } 196 | 197 | fn class(&mut self, class: Class) -> NFAResult<()> { 198 | let state = self.next_state()?; 199 | let cs = T::from_class(class)?; 200 | for c in cs { 201 | for s in &self.accept_states { 202 | Nfa::add_transition(&mut self.transitions, *s, c, state); 203 | } 204 | } 205 | 
self.accept_states = BTreeSet::new(); 206 | self.accept_states.insert(state); 207 | Ok(()) 208 | } 209 | 210 | fn alternation(&mut self, alternation: Vec) -> NFAResult<()> { 211 | if alternation.is_empty() { 212 | return Err(NFAError::AlternationZeroLen); 213 | } 214 | 215 | let mut accept_states = BTreeSet::new(); 216 | for hir in alternation { 217 | let nfa = self.sub(hir)?; 218 | self.append_states(&nfa)?; 219 | accept_states.extend(nfa.accept_states) 220 | } 221 | self.accept_states = accept_states; 222 | Ok(()) 223 | } 224 | 225 | fn concat(&mut self, concat: Vec) -> NFAResult<()> { /* renamed from misspelled `conact`; builds the NFA for each sub-HIR in sequence */ 226 | for hir in concat { 227 | self.hir(hir)?; 228 | } 229 | Ok(()) 230 | } 231 | 232 | fn group(&mut self, group: Group) -> NFAResult<()> { 233 | self.hir(*group.hir) 234 | } 235 | 236 | fn word_boundary(&mut self, _word_boundary: WordBoundary) -> NFAResult<()> { 237 | unimplemented!(); 238 | } 239 | 240 | fn set_start_text(&mut self, start_text: bool) -> NFAResult<()> { 241 | if start_text { 242 | if self.state_count == 0 { 243 | self.start_text = true; 244 | } else { 245 | return Err(NFAError::StartTextError); 246 | } 247 | } 248 | 249 | Ok(()) 250 | } 251 | 252 | fn anchor(&mut self, anchor: Anchor) -> NFAResult<()> { 253 | match anchor { 254 | Anchor::StartLine => self.char(T::new_line()), 255 | Anchor::EndLine => self.char(T::new_line()), 256 | Anchor::StartText => self.set_start_text(true), 257 | Anchor::EndText => { 258 | self.end_text = true; 259 | Ok(()) 260 | } 261 | } 262 | } 263 | 264 | fn hir(&mut self, hir: Hir) -> NFAResult<()> { 265 | match hir.into_kind() { 266 | HirKind::Empty => Ok(()), 267 | HirKind::Literal(literal) => self.literal(literal), 268 | HirKind::Class(class) => self.class(class), 269 | HirKind::Alternation(alternation) => self.alternation(alternation), 270 | HirKind::Concat(concat) => self.concat(concat), 271 | HirKind::Repetition(repetition) => self.repetition(repetition), 272 | HirKind::Group(group) => self.group(group), 273 | 
HirKind::WordBoundary(word_boundary) => self.word_boundary(word_boundary), 274 | HirKind::Anchor(anchor) => self.anchor(anchor), 275 | } 276 | } 277 | 278 | pub fn chars(&self, source_state: usize, characters: &mut BTreeSet) { 279 | if let Some(characters_to_targets) = self.transitions.get(&source_state) { 280 | characters.extend(characters_to_targets.keys()); 281 | } 282 | } 283 | 284 | pub fn simulate(&self, source_state: usize, character: T, new_targets: &mut BTreeSet) { 285 | if let Some(characters_to_targets) = self.transitions.get(&source_state) { 286 | if let Some(targets) = characters_to_targets.get(&character) { 287 | for target_state in targets { 288 | new_targets.insert(*target_state); 289 | } 290 | } 291 | } 292 | } 293 | 294 | pub fn is_accept_state(&self, state: usize) -> bool { 295 | self.accept_states.contains(&state) 296 | } 297 | 298 | pub fn is_start_text(&self) -> bool { 299 | self.start_text 300 | } 301 | 302 | pub fn is_end_text(&self) -> bool { 303 | self.end_text 304 | } 305 | } 306 | 307 | impl TryFrom<&MacroInput> for Nfa 308 | where 309 | T: Character + Copy, 310 | { 311 | type Error = SynError; 312 | 313 | fn try_from(input: &MacroInput) -> SynResult { 314 | let hir = to_hir::(input)?; 315 | 316 | let mut nfa = Nfa::new(); 317 | match nfa.hir(hir) { 318 | Ok(_) => Ok(nfa), 319 | Err(e) => Err(SynError::new( 320 | input.get_regex_span(), 321 | format!("Error creating the NFA: {:?}", e), /* fixed grammar: was "Error create the NFA" */ 322 | )), 323 | } 324 | } 325 | } 326 | --------------------------------------------------------------------------------