├── .gitignore ├── fuzz ├── .gitignore ├── Cargo.toml └── src │ └── check.rs ├── examples └── regex.rs ├── src ├── dfa │ ├── mod.rs │ ├── to_tokens │ │ ├── mod.rs │ │ ├── binary_search.rs │ │ └── lookup_table.rs │ └── nfa_to_dfa.rs ├── macro_input.rs ├── lib.rs ├── character.rs └── nfa │ ├── repetition.rs │ └── mod.rs ├── Cargo.toml ├── tests ├── class.rs ├── regex.rs ├── standard.rs └── repetition.rs ├── LICENSE ├── .github └── workflows │ └── ci.yml ├── benches └── compare.rs └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | hfuzz_target 2 | hfuzz_workspace 3 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fuzz" 3 | version = "0.1.0" 4 | authors = ["LinkTed "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | regex = "~1.4.6" 9 | honggfuzz = "~0.5.54" 10 | 11 | [dependencies.proc-macro-regex] 12 | path = ".." 
13 | 14 | [[bin]] 15 | name = "check" 16 | path = "src/check.rs" 17 | test = false 18 | doc = false 19 | -------------------------------------------------------------------------------- /examples/regex.rs: -------------------------------------------------------------------------------- 1 | use proc_macro_regex::regex; 2 | 3 | regex!(example_1 "abc"); 4 | regex!(example_2 "abc" 256); 5 | regex!(pub example_3 "abc"); 6 | regex!(example_4 b"abc"); 7 | 8 | fn main() { 9 | println!("example_1 == {}", example_1("abc")); 10 | println!("example_2 == {}", example_2("abc")); 11 | println!("example_3 == {}", example_3("abc")); 12 | println!("example_4 == {}", example_4(b"abc")); 13 | } 14 | -------------------------------------------------------------------------------- /src/dfa/mod.rs: -------------------------------------------------------------------------------- 1 | mod nfa_to_dfa; 2 | mod to_tokens; 3 | 4 | pub(super) use crate::dfa::to_tokens::DfaToTokens; 5 | use crate::{character::Character, dfa::nfa_to_dfa::NfaToDfaIter, nfa::Nfa}; 6 | use std::{ 7 | collections::{BTreeMap, BTreeSet}, 8 | convert::From, 9 | fmt::Debug, 10 | }; 11 | 12 | #[derive(Debug)] 13 | pub(crate) struct Dfa 14 | where 15 | T: Character, 16 | { 17 | states: BTreeSet, 18 | transitions: BTreeMap>, 19 | accept_states: BTreeSet, 20 | start_text: bool, 21 | end_text: bool, 22 | } 23 | 24 | impl From> for Dfa 25 | where 26 | T: Character + Copy, 27 | { 28 | fn from(nfa: Nfa) -> Self { 29 | let nfa_to_dfa = NfaToDfaIter::new(nfa); 30 | Dfa::from(nfa_to_dfa) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "proc-macro-regex" 3 | version = "1.1.0" 4 | authors = ["LinkTed "] 5 | license = "BSD-3-Clause" 6 | readme = "README.md" 7 | description = "A proc macro regex library" 8 | keywords = ["regex", "proc-marco"] 9 | edition = "2021" 10 | 
include = [ 11 | "src/**/*.rs", 12 | "tests/*.rs", 13 | "examples/*.rs", 14 | "Cargo.toml", 15 | "README.md", 16 | "LICENSE", 17 | ] 18 | repository = "https://github.com/LinkTed/proc-macro-regex" 19 | categories = ["text-processing"] 20 | 21 | [lib] 22 | proc-macro = true 23 | 24 | [dependencies] 25 | regex-syntax = "~0.6.26" 26 | proc-macro2 = "~1.0.36" 27 | quote = "~1.0.18" 28 | thiserror = "~1.0.31" 29 | 30 | [dependencies.syn] 31 | version = "~1.0.96" 32 | features = ["extra-traits"] 33 | 34 | [dev-dependencies] 35 | criterion = "~0.3.5" 36 | regex = "~1.5.6" 37 | 38 | [[bench]] 39 | name = "compare" 40 | harness = false 41 | path = "benches/compare.rs" 42 | -------------------------------------------------------------------------------- /tests/class.rs: -------------------------------------------------------------------------------- 1 | use proc_macro_regex::regex; 2 | 3 | #[test] 4 | fn character_class_regex() { 5 | regex!(character_class "[xyz]"); 6 | assert!(character_class("x")); 7 | assert!(!character_class("a")); 8 | } 9 | 10 | #[test] 11 | fn character_class_except_regex() { 12 | regex!(character_class_except b"[^x]"); 13 | assert!(character_class_except(b"a")); 14 | assert!(!character_class_except(b"x")); 15 | } 16 | 17 | #[test] 18 | fn character_class_range_regex() { 19 | regex!(character_class_range "[a-c]"); 20 | assert!(character_class_range("a")); 21 | assert!(character_class_range("c")); 22 | assert!(!character_class_range("x")); 23 | } 24 | 25 | #[test] 26 | fn character_class_alpha_regex() { 27 | regex!(character_class_alpha "[[:alpha:]]"); 28 | assert!(character_class_alpha("a")); 29 | assert!(character_class_alpha("Z")); 30 | assert!(!character_class_alpha("1")); 31 | } 32 | 33 | #[test] 34 | fn character_class_nested_regex() { 35 | regex!(character_class_nested b"[x[^xyz]]"); 36 | assert!(character_class_nested(b"x")); 37 | assert!(!character_class_nested(b"y")); 38 | } 39 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, LinkTed 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | rustfmt: 14 | name: Job rustfmt 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Install toolchain with rustfmt 18 | uses: actions-rs/toolchain@v1 19 | with: 20 | toolchain: stable 21 | components: rustfmt 22 | - uses: actions/checkout@v2 23 | - name: Run rustfmt 24 | run: cargo fmt --all -- --check 25 | 26 | audit: 27 | name: Job audit 28 | runs-on: ubuntu-latest 29 | steps: 30 | - uses: actions/checkout@v1 31 | - name: Run audit 32 | uses: actions-rs/audit-check@v1 33 | with: 34 | token: ${{ secrets.GITHUB_TOKEN }} 35 | 36 | clippy: 37 | name: Job clippy 38 | needs: rustfmt 39 | runs-on: ubuntu-latest 40 | steps: 41 | - name: Install toolchain with clippy 42 | uses: actions-rs/toolchain@v1 43 | with: 44 | toolchain: stable 45 | components: clippy 46 | - uses: actions/checkout@v2 47 | - name: Run clippy 48 | uses: actions-rs/clippy-check@v1 49 | with: 50 | token: ${{ secrets.GITHUB_TOKEN }} 51 | args: --all-features --all-targets -- --deny warnings -A clippy::unknown-clippy-lints 52 | 53 | tests: 54 | name: Job tests 55 | needs: clippy 56 | strategy: 57 | matrix: 58 | os: [ubuntu-latest, macos-latest, windows-latest] 59 | rust_channel: [stable, nightly] 60 | runs-on: ${{ matrix.os }} 61 | steps: 62 | - name: Install toolchain ${{ matrix.rust_channel }} on ${{ matrix.os }} 63 | uses: actions-rs/toolchain@v1 64 | with: 65 | toolchain: ${{ matrix.rust_channel }} 66 | - uses: actions/checkout@v2 67 | - name: Run cargo test 68 | uses: actions-rs/cargo@v1 69 | with: 70 | command: test 71 | args: --all-features 72 | 
-------------------------------------------------------------------------------- /tests/regex.rs: -------------------------------------------------------------------------------- 1 | use proc_macro_regex::regex; 2 | 3 | #[test] 4 | fn ipv4_regex() { 5 | // source https://stackoverflow.com/questions/53497/regular-expression-that-matches-valid-ipv6-addresses 6 | regex!(ipv4 r"^((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])$"); 7 | assert!(ipv4("127.0.0.1")); 8 | assert!(!ipv4("127.0.0.256")); 9 | } 10 | 11 | #[test] 12 | fn ipv6_regex() { 13 | // source https://stackoverflow.com/questions/53497/regular-expression-that-matches-valid-ipv6-addresses 14 | regex!(ipv6 r"^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))$" 1048576); 15 | assert!(ipv6("fe80::1ff:fe23:4567:890a")); 16 | assert!(!ipv6("fe80::1ff::fe23:4567:890a")); 17 | } 18 | 19 | #[test] 20 | fn ipv6_pattern_regex() { 21 | // source https://stackoverflow.com/questions/53497/regular-expression-that-matches-valid-ipv6-addresses 22 | regex!(ipv6 
r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))"); 23 | assert!(ipv6("Pattern fe80::1ff:fe23:4567:890a")); 24 | } 25 | 26 | #[test] 27 | fn email_regex() { 28 | regex!(email "^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$" 0); 29 | assert!(email("example@example.org")); 30 | assert!(!email("example@example@org")); 31 | } 32 | 33 | #[test] 34 | fn url_http() { 35 | // source https://gist.github.com/jacksonfdam/3000275 36 | regex!(url_http r"^http(s)?://(([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)|(([0-9A-Za-z-]+\.)+([a-z,A-Z][0-9A-Za-z_-]*)))(:[1-9][0-9]*)?(/([0-9A-Za-z_./:%+@&=-]+[0-9A-Za-z_ ./?:%+@&=-]*)?)?(#([\t\n\v\f\r ]*))?$"); 37 | assert!(url_http("http://127.0.0.1/page?param=value")); 38 | assert!(url_http("http://www.example.org/page?param=value")); 39 | assert!(!url_http("htt://www.example.org/")); 40 | } 41 | -------------------------------------------------------------------------------- /src/macro_input.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::Span; 2 | use syn::{ 3 | parse::{Parse, ParseStream, Result as ParseResult}, 4 | spanned::Spanned, 5 | Ident, LitByteStr, LitInt, LitStr, Visibility, 6 | }; 7 | 8 | const DEFAULT_LIMIT: usize = 65536; 9 | 10 | pub enum Regex { 11 | LitStr(LitStr), 12 | LitByteStr(LitByteStr), 13 | } 14 | 15 | impl Regex { 16 | fn is_str(&self) -> bool { 17 | match self { 
18 | Regex::LitStr(_) => true, 19 | Regex::LitByteStr(_) => false, 20 | } 21 | } 22 | 23 | fn get_regex(&self) -> String { 24 | match self { 25 | Regex::LitStr(lit_str) => lit_str.value(), 26 | Regex::LitByteStr(lit_byte_str) => { 27 | let bytes = lit_byte_str.value(); 28 | String::from_utf8(bytes).unwrap() 29 | } 30 | } 31 | } 32 | } 33 | 34 | impl Parse for Regex { 35 | fn parse(input: ParseStream) -> ParseResult { 36 | let lookahead = input.lookahead1(); 37 | let result = if lookahead.peek(LitStr) { 38 | Regex::LitStr(input.parse()?) 39 | } else { 40 | Regex::LitByteStr(input.parse()?) 41 | }; 42 | Ok(result) 43 | } 44 | } 45 | 46 | impl Spanned for Regex { 47 | fn span(&self) -> proc_macro2::Span { 48 | match self { 49 | Regex::LitStr(lit_str) => lit_str.span(), 50 | Regex::LitByteStr(lit_byte_str) => lit_byte_str.span(), 51 | } 52 | } 53 | } 54 | 55 | pub struct MacroInput { 56 | visibility: Visibility, 57 | name: Ident, 58 | regex: Regex, 59 | threshold: usize, 60 | } 61 | 62 | impl Parse for MacroInput { 63 | fn parse(input: ParseStream) -> ParseResult { 64 | let visibility: Visibility = input.parse()?; 65 | let name: Ident = input.parse()?; 66 | let regex = input.parse()?; 67 | let lookahead = input.lookahead1(); 68 | let threshold = if lookahead.peek(LitInt) { 69 | let threshold: LitInt = input.parse()?; 70 | threshold.base10_parse()? 
71 | } else { 72 | DEFAULT_LIMIT 73 | }; 74 | Ok(MacroInput { 75 | visibility, 76 | name, 77 | regex, 78 | threshold, 79 | }) 80 | } 81 | } 82 | 83 | impl MacroInput { 84 | pub fn is_str(&self) -> bool { 85 | self.regex.is_str() 86 | } 87 | 88 | pub fn get_regex(&self) -> String { 89 | self.regex.get_regex() 90 | } 91 | 92 | pub fn get_regex_span(&self) -> Span { 93 | self.regex.span() 94 | } 95 | 96 | pub fn get_name(&self) -> &Ident { 97 | &self.name 98 | } 99 | 100 | pub fn get_visibility(&self) -> &Visibility { 101 | &self.visibility 102 | } 103 | 104 | pub fn get_threshold(&self) -> usize { 105 | self.threshold 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /fuzz/src/check.rs: -------------------------------------------------------------------------------- 1 | use honggfuzz::fuzz; 2 | use proc_macro_regex::regex; 3 | use regex::{Regex, RegexBuilder}; 4 | 5 | fn build_regex(regex: &str) -> Regex { 6 | RegexBuilder::new(regex).unicode(false).build().unwrap() 7 | } 8 | 9 | fn check(string: &str, regex: &Regex, proc_macro_regex: fn(&str) -> bool) { 10 | let result_regex = regex.is_match(string); 11 | let result_proc_macro_regex = proc_macro_regex(string); 12 | if result_regex != result_proc_macro_regex { 13 | panic!( 14 | "{} != {}: {}", 15 | result_regex, result_proc_macro_regex, string 16 | ); 17 | } 18 | } 19 | 20 | fn main() { 21 | let regex_email = build_regex(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"); 22 | regex!(proc_macro_regex_email r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"); 23 | 24 | let regex_url = build_regex( 25 | r"^http(s)?://((\d+\.\d+\.\d+\.\d+)|(([\w-]+\.)+([a-z,A-Z][\w-]*)))(:[1-9][0-9]*)?(/([\w./:%+@&=-]+[\w ./?:%+@&=-]*)?)?(#(\s*))?$", 26 | ); 27 | regex!(proc_macro_regex_url r"^http(s)?://((\d+\.\d+\.\d+\.\d+)|(([\w-]+\.)+([a-z,A-Z][\w-]*)))(:[1-9][0-9]*)?(/([\w./:%+@&=-]+[\w ./?:%+@&=-]*)?)?(#(\s*))?$"); 28 | 29 | let regex_ipv6 = build_regex( 30 | 
r"^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))$", 31 | ); 32 | regex!(proc_macro_regex_ipv6 r"^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))$"); 33 | 34 | let regex_test = build_regex("^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4})$"); 35 | regex!(proc_macro_regex_test "^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4})$"); 36 | 37 | loop { 38 | fuzz!(|string: &str| { 39 | check(string, ®ex_email, proc_macro_regex_email); 40 | check(string, ®ex_url, proc_macro_regex_url); 41 | check(string, ®ex_ipv6, proc_macro_regex_ipv6); 42 | check(string, ®ex_test, proc_macro_regex_test); 43 | }); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/lib.rs: 
-------------------------------------------------------------------------------- 1 | mod character; 2 | mod dfa; 3 | mod macro_input; 4 | mod nfa; 5 | 6 | use crate::{ 7 | dfa::{Dfa, DfaToTokens}, 8 | macro_input::MacroInput, 9 | nfa::Nfa, 10 | }; 11 | use proc_macro::TokenStream; 12 | use quote::quote; 13 | use syn::parse_macro_input; 14 | 15 | /// The macro creates a function which returns `true` if the argument matches the regex. 16 | /// 17 | /// If the first argument is an identifier (name), then this is the name of the function, which 18 | /// would be generated. Example: 19 | /// ```rust 20 | /// use proc_macro_regex::regex; 21 | /// 22 | /// regex!(the_name_of_the_function "the regex to check"); 23 | /// ``` 24 | /// 25 | /// Alternative, if the first argument is a visibility keyword, then this is the visibility of the 26 | /// function. Otherwise, the function is private. Example: 27 | /// ```rust 28 | /// # use proc_macro_regex::regex; 29 | /// regex!(pub public_function "the function is public"); 30 | /// regex!(private_function "the function is private"); 31 | /// ``` 32 | /// 33 | /// The next argument is a string of the regex, which the function should check. Alternative, a 34 | /// byte string can be given, if the input should be a byte array (`&[u8]`). otherwise a string is 35 | /// taken. 36 | /// ```rust 37 | /// # use proc_macro_regex::regex; 38 | /// regex!(string_function "This function takes a string"); 39 | /// regex!(bytes_function "This function takes a byte array"); 40 | /// ``` 41 | /// 42 | /// At the end, a positive number can be given to set the limit of the lookup table 43 | /// (see `README.md`). 44 | /// ```rust 45 | /// # use proc_macro_regex::regex; 46 | /// regex!(limit_function "The limit is set to 100 bytes" 100); 47 | /// ``` 48 | /// 49 | /// # Syntax 50 | /// The given regex works the same as in the [regex](https://crates.io/crates/regex) crate. 
51 | /// * If the `^` is at the beginning of the regex, then it is checked if the input is match at the 52 | /// beginning of the text. 53 | /// * If the `$` is at the end, then it is checked if the input is match at the end of the text. 54 | /// * If both are present then the whole input is checked. 55 | /// * Otherwise, is check if the string contains the regex. 56 | #[proc_macro] 57 | pub fn regex(input: TokenStream) -> TokenStream { 58 | let input = parse_macro_input!(input as MacroInput); 59 | let visibility = input.get_visibility(); 60 | let name = input.get_name(); 61 | let threshold = input.get_threshold(); 62 | let (argument_type, body) = if input.is_str() { 63 | let nfa = Nfa::::try_from(&input).unwrap(); 64 | let dfa = Dfa::from(nfa); 65 | ( 66 | quote! { 67 | str 68 | }, 69 | DfaToTokens::new(dfa, threshold).get_token_streams(), 70 | ) 71 | } else { 72 | let nfa = Nfa::::try_from(&input).unwrap(); 73 | let dfa = Dfa::from(nfa); 74 | ( 75 | quote! { 76 | [u8] 77 | }, 78 | DfaToTokens::new(dfa, threshold).get_token_streams(), 79 | ) 80 | }; 81 | let function = quote! 
{ 82 | #visibility fn #name(s: &#argument_type) -> bool { 83 | #body 84 | } 85 | }; 86 | function.into() 87 | } 88 | -------------------------------------------------------------------------------- /src/dfa/to_tokens/mod.rs: -------------------------------------------------------------------------------- 1 | mod binary_search; 2 | mod lookup_table; 3 | 4 | use crate::{character::Character, dfa::Dfa, nfa::START_STATE}; 5 | use proc_macro2::{Span, TokenStream}; 6 | use quote::{quote, ToTokens}; 7 | use std::collections::BTreeSet; 8 | use syn::LitInt; 9 | 10 | fn usize_to_lit_int(i: usize) -> LitInt { 11 | let s = format!("{}", i); 12 | LitInt::new(&s, Span::call_site()) 13 | } 14 | 15 | pub(crate) struct DfaToTokens 16 | where 17 | T: Character, 18 | { 19 | dfa: Dfa, 20 | threshold: usize, 21 | required_states: BTreeSet, 22 | is_byte: bool, 23 | } 24 | 25 | impl DfaToTokens 26 | where 27 | T: Character, 28 | { 29 | /// If `self.end_text` is `true` then only no accept-states have to be implemented. 30 | /// Because if the state machine reaches an accept-state, then it stops. 
31 | fn get_required_states(dfa: &Dfa) -> BTreeSet { 32 | if dfa.end_text { 33 | dfa.states.clone() 34 | } else { 35 | let mut required_states = BTreeSet::new(); 36 | for state in dfa.states.iter() { 37 | if !dfa.accept_states.contains(state) { 38 | required_states.insert(*state); 39 | } 40 | } 41 | required_states 42 | } 43 | } 44 | 45 | fn is_byte(dfa: &Dfa) -> bool { 46 | for (_, transitions) in dfa.transitions.iter() { 47 | for (ch, _) in transitions.iter() { 48 | if !ch.is_byte() { 49 | return false; 50 | } 51 | } 52 | } 53 | true 54 | } 55 | 56 | fn returns_true(&self) -> bool { 57 | if self.dfa.accept_states.contains(&START_STATE) { 58 | if self.dfa.end_text { 59 | !self.dfa.start_text && self.dfa.states.len() == 1 60 | } else { 61 | true 62 | } 63 | } else { 64 | false 65 | } 66 | } 67 | } 68 | 69 | impl DfaToTokens 70 | where 71 | T: Character + ToTokens + Copy + Into, 72 | { 73 | pub(crate) fn new(dfa: Dfa, threshold: usize) -> DfaToTokens { 74 | let required_states = DfaToTokens::get_required_states(&dfa); 75 | let is_byte = DfaToTokens::is_byte(&dfa); 76 | DfaToTokens { 77 | dfa, 78 | required_states, 79 | threshold, 80 | is_byte, 81 | } 82 | } 83 | 84 | fn last_check(&self) -> TokenStream { 85 | if self.dfa.end_text { 86 | let accept_states: Vec = self 87 | .dfa 88 | .accept_states 89 | .iter() 90 | .map(|u| usize_to_lit_int(*u)) 91 | .collect(); 92 | quote! { 93 | match state { 94 | #(#accept_states => true,)* 95 | _ => false, 96 | } 97 | } 98 | } else { 99 | quote! { 100 | false 101 | } 102 | } 103 | } 104 | 105 | fn for_each(&self) -> TokenStream { 106 | if let Some(for_each_lookup_table) = self.for_each_lookup_table() { 107 | for_each_lookup_table 108 | } else { 109 | self.for_each_binary_search() 110 | } 111 | } 112 | 113 | pub fn get_token_streams(&self) -> TokenStream { 114 | if self.returns_true() { 115 | quote! {true} 116 | } else { 117 | let for_each = self.for_each(); 118 | let last_check = self.last_check(); 119 | quote! 
{ 120 | #for_each 121 | 122 | #last_check 123 | } 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /benches/compare.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; 2 | use proc_macro_regex::regex; 3 | use regex::Regex; 4 | 5 | const INPUT_EMAIL: &str = "example@example.org"; 6 | const INPUT_URL: &str = "https://www.example.org/page?param=value"; 7 | const INPUT_IPV6: &str = "fe80::1ff:fe23:4567:890a"; 8 | 9 | fn regex(c: &mut Criterion) { 10 | let mut group = c.benchmark_group("regex"); 11 | 12 | let regex_email = Regex::new("^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$").unwrap(); 13 | let throughput = Throughput::Bytes(INPUT_EMAIL.len() as u64); 14 | let benchmark_id = BenchmarkId::new("email", INPUT_EMAIL.len()); 15 | group.throughput(throughput); 16 | group.bench_with_input(benchmark_id, INPUT_EMAIL, |b, input| { 17 | b.iter(|| regex_email.is_match(input)) 18 | }); 19 | 20 | let regex_url = Regex::new( 21 | r"^http(s)?://(([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)|(([0-9A-Za-z-]+\.)+([a-z,A-Z][0-9A-Za-z_-]*)))(:[1-9][0-9]*)?(/([0-9A-Za-z_./:%+@&=-]+[0-9A-Za-z_ ./?:%+@&=-]*)?)?(#([\t\n\v\f\r ]*))?$", 22 | ).unwrap(); 23 | let throughput = Throughput::Bytes(INPUT_URL.len() as u64); 24 | let benchmark_id = BenchmarkId::new("url", INPUT_URL.len()); 25 | group.throughput(throughput); 26 | group.bench_with_input(benchmark_id, INPUT_URL, |b, input| { 27 | b.iter(|| regex_url.is_match(input)) 28 | }); 29 | 30 | let regex_ipv6 = Regex::new( 31 | 
r"^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))$", 32 | ).unwrap(); 33 | let throughput = Throughput::Bytes(INPUT_IPV6.len() as u64); 34 | let benchmark_id = BenchmarkId::new("ipv6", INPUT_IPV6.len()); 35 | group.throughput(throughput); 36 | group.bench_with_input(benchmark_id, INPUT_IPV6, |b, input| { 37 | b.iter(|| regex_ipv6.is_match(input)) 38 | }); 39 | } 40 | 41 | fn proc_macro_regex(c: &mut Criterion) { 42 | let mut group = c.benchmark_group("proc-macro-regex"); 43 | 44 | regex!(regex_email "^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$"); 45 | let throughput = Throughput::Bytes(INPUT_EMAIL.len() as u64); 46 | let benchmark_id = BenchmarkId::new("email", INPUT_EMAIL.len()); 47 | group.throughput(throughput); 48 | group.bench_with_input(benchmark_id, INPUT_EMAIL, |b, input| { 49 | b.iter(|| regex_email(input)) 50 | }); 51 | 52 | regex!(regex_url r"^http(s)?://(([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)|(([0-9A-Za-z-]+\.)+([a-z,A-Z][0-9A-Za-z_-]*)))(:[1-9][0-9]*)?(/([0-9A-Za-z_./:%+@&=-]+[0-9A-Za-z_ ./?:%+@&=-]*)?)?(#([\t\n\v\f\r ]*))?$"); 53 | let throughput = Throughput::Bytes(INPUT_URL.len() as u64); 54 | let benchmark_id = BenchmarkId::new("url", INPUT_URL.len()); 55 | group.throughput(throughput); 56 | group.bench_with_input(benchmark_id, INPUT_URL, |b, input| { 57 | b.iter(|| regex_url(input)) 58 | }); 59 | 60 | regex!(regex_ipv6 
r"^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))$" 1048576); 61 | let throughput = Throughput::Bytes(INPUT_IPV6.len() as u64); 62 | let benchmark_id = BenchmarkId::new("ipv6", INPUT_IPV6.len()); 63 | group.throughput(throughput); 64 | group.bench_with_input(benchmark_id, INPUT_IPV6, |b, input| { 65 | b.iter(|| regex_ipv6(input)) 66 | }); 67 | } 68 | 69 | criterion_group! { 70 | name = benches; 71 | config = Criterion::default(); 72 | targets = regex, proc_macro_regex 73 | } 74 | criterion_main!(benches); 75 | -------------------------------------------------------------------------------- /tests/standard.rs: -------------------------------------------------------------------------------- 1 | use proc_macro_regex::regex; 2 | 3 | #[test] 4 | fn empty() { 5 | regex!(regex ""); 6 | assert!(regex("")); 7 | assert!(regex("a")); 8 | } 9 | 10 | #[test] 11 | fn literal_1() { 12 | regex!(regex "a"); 13 | assert!(!regex("")); 14 | assert!(regex("a")); 15 | assert!(regex("ab")); 16 | assert!(regex("ba")); 17 | } 18 | 19 | #[test] 20 | fn literal_2() { 21 | regex!(regex "^a"); 22 | assert!(!regex("")); 23 | assert!(regex("a")); 24 | assert!(regex("ab")); 25 | assert!(!regex("ba")); 26 | } 27 | 28 | #[test] 29 | fn literal_3() { 30 | regex!(regex "a$"); 31 | assert!(!regex("")); 32 | assert!(regex("a")); 33 | assert!(!regex("ab")); 34 | assert!(regex("ba")); 35 | } 36 | 37 | 
#[test] 38 | fn literal_4() { 39 | regex!(regex "^a$"); 40 | assert!(!regex("")); 41 | assert!(regex("a")); 42 | assert!(!regex("ab")); 43 | assert!(!regex("ba")); 44 | } 45 | 46 | #[test] 47 | fn class_1() { 48 | regex!(regex "[ab]"); 49 | assert!(!regex("")); 50 | assert!(regex("a")); 51 | assert!(regex("ab")); 52 | assert!(regex("abc")); 53 | assert!(regex("ba")); 54 | assert!(regex("cba")); 55 | assert!(regex("b")); 56 | assert!(!regex("c")); 57 | } 58 | 59 | #[test] 60 | fn class_2() { 61 | regex!(regex "^[ab]"); 62 | assert!(!regex("")); 63 | assert!(regex("a")); 64 | assert!(regex("ab")); 65 | assert!(regex("abc")); 66 | assert!(regex("ba")); 67 | assert!(!regex("cba")); 68 | assert!(regex("b")); 69 | assert!(!regex("c")); 70 | } 71 | 72 | #[test] 73 | fn class_3() { 74 | regex!(regex "[ab]$"); 75 | assert!(!regex("")); 76 | assert!(regex("a")); 77 | assert!(regex("ab")); 78 | assert!(!regex("abc")); 79 | assert!(regex("ba")); 80 | assert!(regex("cba")); 81 | assert!(regex("b")); 82 | assert!(!regex("c")); 83 | } 84 | 85 | #[test] 86 | fn class_4() { 87 | regex!(regex "^[ab]$"); 88 | assert!(!regex("")); 89 | assert!(regex("a")); 90 | assert!(!regex("ab")); 91 | assert!(!regex("abc")); 92 | assert!(!regex("ba")); 93 | assert!(!regex("cba")); 94 | assert!(regex("b")); 95 | assert!(!regex("c")); 96 | } 97 | 98 | #[test] 99 | fn alternation_1() { 100 | regex!(regex "ab|cb"); 101 | assert!(!regex("")); 102 | assert!(!regex("a")); 103 | assert!(regex("ab")); 104 | assert!(regex("cb")); 105 | assert!(regex("abc")); 106 | assert!(regex("cab")); 107 | assert!(!regex("ba")); 108 | assert!(regex("cba")); 109 | assert!(!regex("b")); 110 | assert!(!regex("c")); 111 | } 112 | 113 | #[test] 114 | fn alternation_2() { 115 | regex!(regex "^(ab|cb)"); 116 | assert!(!regex("")); 117 | assert!(!regex("a")); 118 | assert!(regex("ab")); 119 | assert!(regex("cb")); 120 | assert!(regex("abc")); 121 | assert!(!regex("cab")); 122 | assert!(!regex("ba")); 123 | assert!(regex("cba")); 
124 | assert!(!regex("b")); 125 | assert!(!regex("c")); 126 | } 127 | 128 | #[test] 129 | fn alternation_3() { 130 | regex!(regex "(ab|cb)$"); 131 | assert!(!regex("")); 132 | assert!(!regex("a")); 133 | assert!(regex("ab")); 134 | assert!(regex("cb")); 135 | assert!(!regex("abc")); 136 | assert!(regex("cab")); 137 | assert!(!regex("ba")); 138 | assert!(!regex("cba")); 139 | assert!(!regex("b")); 140 | assert!(!regex("c")); 141 | } 142 | 143 | #[test] 144 | fn alternation_4() { 145 | regex!(regex "^(ab|cb)$"); 146 | assert!(!regex("")); 147 | assert!(!regex("a")); 148 | assert!(regex("ab")); 149 | assert!(regex("cb")); 150 | assert!(!regex("abc")); 151 | assert!(!regex("cab")); 152 | assert!(!regex("ba")); 153 | assert!(!regex("cba")); 154 | assert!(!regex("b")); 155 | assert!(!regex("c")); 156 | } 157 | 158 | #[test] 159 | fn concat_1() { 160 | regex!(regex "ab"); 161 | assert!(!regex("")); 162 | assert!(!regex("a")); 163 | assert!(regex("ab")); 164 | assert!(regex("abc")); 165 | assert!(regex("cab")); 166 | assert!(!regex("ba")); 167 | assert!(!regex("cba")); 168 | assert!(!regex("b")); 169 | assert!(!regex("c")); 170 | } 171 | 172 | #[test] 173 | fn concat_2() { 174 | regex!(regex "^ab"); 175 | assert!(!regex("")); 176 | assert!(!regex("a")); 177 | assert!(regex("ab")); 178 | assert!(regex("abc")); 179 | assert!(!regex("cab")); 180 | assert!(!regex("ba")); 181 | assert!(!regex("cba")); 182 | assert!(!regex("b")); 183 | assert!(!regex("c")); 184 | } 185 | 186 | #[test] 187 | fn concat_3() { 188 | regex!(regex "ab$"); 189 | assert!(!regex("")); 190 | assert!(!regex("a")); 191 | assert!(regex("ab")); 192 | assert!(!regex("abc")); 193 | assert!(regex("cab")); 194 | assert!(!regex("ba")); 195 | assert!(!regex("cba")); 196 | assert!(!regex("b")); 197 | assert!(!regex("c")); 198 | } 199 | 200 | #[test] 201 | fn concat_4() { 202 | regex!(regex "^ab$"); 203 | assert!(!regex("")); 204 | assert!(!regex("a")); 205 | assert!(regex("ab")); 206 | assert!(!regex("abc")); 207 | 
assert!(!regex("cab")); 208 | assert!(!regex("ba")); 209 | assert!(!regex("cba")); 210 | assert!(!regex("b")); 211 | assert!(!regex("c")); 212 | } 213 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # proc-macro-regex 2 | A proc macro regex library to match an arbitrary string or byte array to a regular expression. 3 | [![Build status](https://github.com/LinkTed/proc-macro-regex/workflows/Continuous%20Integration/badge.svg)](https://github.com/LinkTed/proc-macro-regex/actions?query=workflow%3A%22Continuous+Integration%22) 4 | [![Latest version](https://img.shields.io/crates/v/proc-macro-regex.svg)](https://crates.io/crates/proc-macro-regex) 5 | [![Dependency status](https://deps.rs/repo/github/linkted/proc-macro-regex/status.svg)](https://deps.rs/repo/github/linkted/proc-macro-regex) 6 | [![License](https://img.shields.io/crates/l/proc-macro-regex.svg)](https://opensource.org/licenses/BSD-3-Clause) 7 | 8 | ## Usage 9 | Add this to your `Cargo.toml`: 10 | ```toml 11 | [dependencies] 12 | proc-macro-regex = "~1.1.0" 13 | ``` 14 | 15 | ## Example 16 | The macro `regex!` creates a function of the given name which takes a string or byte array and 17 | returns `true` if the argument matches the regex, otherwise `false`. 18 | ```rust 19 | use proc_macro_regex::regex; 20 | 21 | /// Create the function with the signature: 22 | /// fn regex_email(s: &str) -> bool; 23 | regex!(regex_email "^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$"); 24 | 25 | fn main () { 26 | println!("Returns true == {}", regex_email("example@example.org")); 27 | println!("Returns false == {}", regex_email("example.example.org")); 28 | } 29 | ``` 30 | 31 | The given regex works the same as in the [regex](https://crates.io/crates/regex) crate. 
If the `^` 32 | is at the beginning of the regex and `$` at the end, then the whole string is checked; otherwise it is 33 | checked whether the string contains the regex. 34 | 35 | ## How it works 36 | The macro creates a *deterministic finite automaton* (DFA), which parses the given input. 37 | Depending on the size of the DFA or the characters of the regex, a lookup table or a code-based 38 | implementation (binary search) is generated. If the size of the lookup table would be bigger than 39 | 65536 bytes (can be changed) then a code-based implementation (binary search) is used. Additionally, 40 | if the regex contains any Unicode (non-ASCII) character then a code-based implementation 41 | (binary search) is used, too. 42 | 43 | The following macro generates the following code: 44 | ```rust 45 | regex!(example_1 "abc"); 46 | ``` 47 | Generates: 48 | ```rust 49 | fn example_1(s: &str) -> bool { 50 | static TABLE: [[u8; 256]; 3usize] = [ ... ]; 51 | let mut state = 0; 52 | for c in s.bytes() { 53 | state = TABLE[state as usize][c as usize]; 54 | if state == u8::MAX { 55 | return true; 56 | } 57 | } 58 | false 59 | } 60 | ``` 61 | 62 | To tell the macro that the lookup table is not allowed to be bigger than 256 bytes, a third 63 | argument can be given. As a result, a code-based implementation (binary search) of the DFA is 64 | generated. 65 | ```rust 66 | regex!(example_2 "abc" 256); 67 | ``` 68 | Generates: 69 | ```rust 70 | fn example_2(s: &str) -> bool { 71 | let mut state = 0; 72 | for c in s.bytes() { 73 | state = if state < 1usize { 74 | match c { 75 | 97u8 => 1usize, 76 | _ => 0usize, 77 | } 78 | } else { 79 | if state == 1usize { 80 | match c { 81 | 97u8 => 1usize, 82 | 98u8 => 2usize, 83 | _ => 0usize, 84 | } 85 | } else { 86 | match c { 87 | 97u8 => 1usize, 88 | 99u8 => return true, 89 | _ => 0usize, 90 | } 91 | } 92 | }; 93 | } 94 | false 95 | } 96 | ``` 97 | 98 | To change the visibility of the function, add the keywords at the beginning of the arguments.
99 | ```rust 100 | regex!(pub example_3 "abc"); 101 | ``` 102 | Generates: 103 | ```rust 104 | pub fn example_3(s: &str) -> bool { 105 | // same as in example_1 (see above) 106 | } 107 | ``` 108 | 109 | To parse a byte array instead of a string, pass a byte string. 110 | ```rust 111 | regex!(example_4 b"abc"); 112 | ``` 113 | Generates: 114 | ```rust 115 | fn example_4(s: &[u8]) -> bool { 116 | // same as in example_1 (see above) 117 | } 118 | ``` 119 | 120 | The generated code should work with `#![no_std]`, too. 121 | 122 | ## proc-macro-regex vs regex 123 | Advantages: 124 | * Compile-time (no runtime initialization, no lazy-static) 125 | * Generated code that does not contain any dependencies 126 | * No heap allocation 127 | * Approximately 12%-68% faster for non-trivial regexes [^1] 128 | 129 | [^1]: It was tested against the regex crate in `benches/compare.rs`. For pattern/word matching it is slower 130 | because the [regex](https://crates.io/crates/regex) library uses 131 | [aho-corasick](https://crates.io/crates/aho-corasick/). (See Performance) 132 | 133 | Disadvantages: 134 | * Currently, no group captures 135 | * No runtime regex generation 136 | 137 | ### Performance 138 | This is the performance comparison between this crate and the regex crate. If you want to test it 139 | yourself, run `cargo bench --bench compare`. 140 | 141 | | Name | `proc-macro-regex` | `regex` | Result | 142 | |--------|--------------:|-------------:|--------:| 143 | | E-Mail | 743.95 MiB/s | 441.67 MiB/s | 68.44 % | 144 | | URL | 584.62 MiB/s | 519.00 MiB/s | 12.64 % | 145 | | IPv6 | 746.92 MiB/s | 473.38 MiB/s | 57.78 % | 146 | 147 | This was compiled with `rustc 1.53.0-nightly (392ba2ba1 2021-04-17)`. 148 | 149 | ## License 150 | This project is licensed under the [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause) 151 | license.
152 | 153 | ### Contribution 154 | Any contribution intentionally submitted for inclusion in `proc-macro-regex` by you, shall 155 | be licensed as [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause), without any additional 156 | terms or conditions. 157 | -------------------------------------------------------------------------------- /src/character.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::{Ident, Span, TokenStream}; 2 | use quote::quote; 3 | use regex_syntax::hir::{Class, ClassBytes, ClassUnicode, Literal}; 4 | use std::collections::BTreeSet; 5 | use thiserror::Error; 6 | 7 | fn to_byte(c: char) -> CharacterResult { 8 | if c.len_utf8() == 1 { 9 | let mut bytes = [0; 1]; 10 | c.encode_utf8(&mut bytes); 11 | Ok(bytes[0]) 12 | } else { 13 | Err(CharacterError::Unicode(c)) 14 | } 15 | } 16 | 17 | fn to_char(b: u8) -> CharacterResult { 18 | match char::try_from(b) { 19 | Ok(c) => Ok(c), 20 | Err(_) => Err(CharacterError::Byte(b)), 21 | } 22 | } 23 | 24 | #[derive(Debug, Error)] 25 | pub enum CharacterError { 26 | #[error("got byte: {0}")] 27 | Byte(u8), 28 | #[error("got class bytes: {0:?}")] 29 | ClassBytes(ClassBytes), 30 | #[error("got unicode: {0}")] 31 | Unicode(char), 32 | #[error("got class unicode: {0:?}")] 33 | ClassUnicode(ClassUnicode), 34 | } 35 | 36 | pub type CharacterResult = Result; 37 | 38 | pub trait Character: Sized + Ord + TryFrom + Into { 39 | fn new_line() -> Self; 40 | 41 | fn from_literal(literal: Literal) -> CharacterResult; 42 | 43 | fn from_class(class: Class) -> CharacterResult>; 44 | 45 | fn to_byte(&self) -> Option; 46 | 47 | fn is_byte(&self) -> bool; 48 | 49 | fn is_next(&self, other: &Self) -> bool; 50 | 51 | fn get_iterator_function(is_byte: bool) -> Ident; 52 | 53 | fn to_usize(element: Ident, is_byte: bool) -> TokenStream; 54 | 55 | fn allow_invalid_utf8() -> bool; 56 | 57 | fn unicode() -> bool; 58 | } 59 | 60 | impl Character for char { 61 | fn new_line() -> 
Self { 62 | '\n' 63 | } 64 | 65 | fn from_literal(literal: Literal) -> CharacterResult { 66 | match literal { 67 | Literal::Unicode(c) => Ok(c), 68 | Literal::Byte(b) => to_char(b), 69 | } 70 | } 71 | 72 | fn from_class(class: Class) -> CharacterResult> { 73 | let mut cs = BTreeSet::new(); 74 | match class { 75 | Class::Unicode(class_unicode) => { 76 | for class_unicode_range in class_unicode.iter() { 77 | let start = class_unicode_range.start(); 78 | let end = class_unicode_range.end(); 79 | for c in start..=end { 80 | cs.insert(c); 81 | } 82 | } 83 | } 84 | Class::Bytes(class_bytes) => { 85 | for class_bytes_range in class_bytes.iter() { 86 | let start = class_bytes_range.start(); 87 | let end = class_bytes_range.end(); 88 | for b in start..=end { 89 | let c = to_char(b)?; 90 | cs.insert(c); 91 | } 92 | } 93 | } 94 | } 95 | Ok(cs) 96 | } 97 | 98 | fn to_byte(&self) -> Option { 99 | to_byte(*self).ok() 100 | } 101 | 102 | fn is_byte(&self) -> bool { 103 | self.len_utf8() == 1 104 | } 105 | 106 | fn is_next(&self, other: &Self) -> bool { 107 | let self_u32: u32 = *self as u32; 108 | if let Some(next) = self_u32.checked_add(1) { 109 | let other_u32 = *other as u32; 110 | next == other_u32 111 | } else { 112 | false 113 | } 114 | } 115 | 116 | fn get_iterator_function(is_byte: bool) -> Ident { 117 | if is_byte { 118 | Ident::new("bytes", Span::call_site()) 119 | } else { 120 | Ident::new("chars", Span::call_site()) 121 | } 122 | } 123 | 124 | fn to_usize(element: Ident, _is_byte: bool) -> TokenStream { 125 | quote! 
{ 126 | #element as usize 127 | } 128 | } 129 | 130 | fn allow_invalid_utf8() -> bool { 131 | false 132 | } 133 | 134 | fn unicode() -> bool { 135 | true 136 | } 137 | } 138 | 139 | impl Character for u8 { 140 | fn new_line() -> Self { 141 | b'\n' 142 | } 143 | 144 | fn from_literal(literal: Literal) -> CharacterResult { 145 | match literal { 146 | Literal::Unicode(c) => to_byte(c), 147 | Literal::Byte(b) => Ok(b), 148 | } 149 | } 150 | 151 | fn from_class(class: Class) -> CharacterResult> { 152 | let mut bs = BTreeSet::new(); 153 | match class { 154 | Class::Unicode(class_unicode) => { 155 | for class_unicode_range in class_unicode.iter() { 156 | let start = class_unicode_range.start(); 157 | let end = class_unicode_range.end(); 158 | for c in start..=end { 159 | let b = to_byte(c)?; 160 | bs.insert(b); 161 | } 162 | } 163 | } 164 | Class::Bytes(class_bytes) => { 165 | for class_bytes_range in class_bytes.iter() { 166 | let start = class_bytes_range.start(); 167 | let end = class_bytes_range.end(); 168 | for b in start..=end { 169 | bs.insert(b); 170 | } 171 | } 172 | } 173 | } 174 | Ok(bs) 175 | } 176 | 177 | fn to_byte(&self) -> Option { 178 | Some(*self) 179 | } 180 | 181 | fn is_byte(&self) -> bool { 182 | true 183 | } 184 | 185 | fn is_next(&self, other: &u8) -> bool { 186 | if let Some(next) = other.checked_add(1) { 187 | next == *other 188 | } else { 189 | false 190 | } 191 | } 192 | 193 | fn get_iterator_function(_is_byte: bool) -> Ident { 194 | Ident::new("into_iter", Span::call_site()) 195 | } 196 | 197 | fn to_usize(element: Ident, _is_byte: bool) -> TokenStream { 198 | quote! 
{ 199 | *#element as usize 200 | } 201 | } 202 | 203 | fn allow_invalid_utf8() -> bool { 204 | true 205 | } 206 | 207 | fn unicode() -> bool { 208 | false 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /src/dfa/to_tokens/binary_search.rs: -------------------------------------------------------------------------------- 1 | use crate::{character::Character, dfa::to_tokens::DfaToTokens}; 2 | use proc_macro2::TokenStream; 3 | use quote::{quote, ToTokens}; 4 | use std::collections::{BTreeMap, BTreeSet}; 5 | 6 | fn transition_condition_to_tokens(start: T, end: T) -> TokenStream 7 | where 8 | T: ToTokens + Ord, 9 | { 10 | if start == end { 11 | quote! { 12 | #start 13 | } 14 | } else { 15 | quote! { 16 | #start..=#end 17 | } 18 | } 19 | } 20 | 21 | impl DfaToTokens 22 | where 23 | T: Character + ToTokens + Ord, 24 | { 25 | fn transition_condition(&self, start: T, end: T) -> TokenStream { 26 | if self.is_byte { 27 | if let Some(start) = start.to_byte() { 28 | if let Some(end) = end.to_byte() { 29 | return transition_condition_to_tokens::(start, end); 30 | } 31 | } 32 | } 33 | 34 | transition_condition_to_tokens(start, end) 35 | } 36 | } 37 | 38 | impl DfaToTokens 39 | where 40 | T: Character + ToTokens + Copy, 41 | { 42 | fn transitions_inverse(transitions: &BTreeSet<(T, usize)>) -> BTreeMap> { 43 | let mut result: BTreeMap> = BTreeMap::new(); 44 | for (c, t) in transitions { 45 | if let Some(set) = result.get_mut(t) { 46 | set.insert(*c); 47 | } else { 48 | let mut set = BTreeSet::new(); 49 | set.insert(*c); 50 | result.insert(*t, set); 51 | } 52 | } 53 | result 54 | } 55 | 56 | fn transitions_inverse_pack( 57 | transitions_inverse: BTreeMap>, 58 | ) -> BTreeMap> { 59 | let mut result = BTreeMap::new(); 60 | for (t, cs) in transitions_inverse { 61 | let mut ranges = BTreeSet::new(); 62 | let mut start = None; 63 | let mut prev: Option = None; 64 | for character in cs { 65 | if let Some(prev) = prev { 66 | if 
!prev.is_next(&character) { 67 | ranges.insert((start.unwrap(), prev)); 68 | start = Some(character); 69 | } 70 | } else { 71 | start = Some(character); 72 | } 73 | prev = Some(character); 74 | } 75 | if let Some(start) = start { 76 | if let Some(prev) = prev { 77 | ranges.insert((start, prev)); 78 | } else { 79 | panic!() 80 | } 81 | } 82 | result.insert(t, ranges); 83 | } 84 | result 85 | } 86 | 87 | fn transitions_inverse_condition(&self, ranges: BTreeSet<(T, T)>) -> TokenStream { 88 | let mut conditions = Vec::new(); 89 | for (start, end) in ranges { 90 | let condition = self.transition_condition(start, end); 91 | conditions.push(condition); 92 | } 93 | quote! { 94 | #(#conditions )|* 95 | } 96 | } 97 | 98 | fn transitions_default(&self) -> TokenStream { 99 | if self.dfa.start_text { 100 | quote! { 101 | return false; 102 | } 103 | } else { 104 | quote! { 105 | 0usize 106 | } 107 | } 108 | } 109 | 110 | fn transitions_binary_search_match_inner(&self, state: usize) -> TokenStream { 111 | let default = self.transitions_default(); 112 | if let Some(transitions) = self.dfa.transitions.get(&state) { 113 | let transitions_inverse = DfaToTokens::::transitions_inverse(transitions); 114 | let transitions_inverse_pack = 115 | DfaToTokens::::transitions_inverse_pack(transitions_inverse); 116 | let mut arms = Vec::new(); 117 | for (t, ranges) in transitions_inverse_pack { 118 | let condition = self.transitions_inverse_condition(ranges); 119 | let arm = if !self.dfa.end_text && self.dfa.accept_states.contains(&t) { 120 | quote! { 121 | #condition => return true 122 | } 123 | } else { 124 | quote! { 125 | #condition => #t 126 | } 127 | }; 128 | arms.push(arm); 129 | } 130 | 131 | quote! 
{ 132 | match c { 133 | #(#arms,)* 134 | _ => { 135 | #default 136 | }, 137 | } 138 | } 139 | } else { 140 | default 141 | } 142 | } 143 | 144 | fn transitions_binary_search_recursive( 145 | &self, 146 | states: &[usize], 147 | start: usize, 148 | len: usize, 149 | ) -> TokenStream { 150 | if len == 1 { 151 | self.transitions_binary_search_match_inner(states[start]) 152 | } else if len == 2 { 153 | let left_state = states[start]; 154 | let right_state = states[start + 1]; 155 | let left = self.transitions_binary_search_match_inner(left_state); 156 | let right = self.transitions_binary_search_match_inner(right_state); 157 | quote! { 158 | if state == #left_state { 159 | #left 160 | } else { 161 | #right 162 | } 163 | } 164 | } else { 165 | let new_len = len / 2; 166 | let remain = len % 2; 167 | let new_start = start + new_len; 168 | let new_state = states[new_start]; 169 | let left = self.transitions_binary_search_recursive(states, start, new_len); 170 | let right = 171 | self.transitions_binary_search_recursive(states, new_start, new_len + remain); 172 | quote! { 173 | if state < #new_state { 174 | #left 175 | } else { 176 | #right 177 | } 178 | } 179 | } 180 | } 181 | 182 | pub(super) fn for_each_binary_search(&self) -> TokenStream { 183 | let states: Vec = self.required_states.iter().copied().collect(); 184 | let iterator = T::get_iterator_function(self.is_byte); 185 | let transitions = self.transitions_binary_search_recursive(&states[..], 0, states.len()); 186 | quote! 
{ 187 | let mut state = 0; 188 | 189 | for c in s.#iterator() { 190 | state = #transitions; 191 | } 192 | } 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /src/dfa/nfa_to_dfa.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | character::Character, 3 | dfa::Dfa, 4 | nfa::{Nfa, START_STATE}, 5 | }; 6 | use std::{ 7 | collections::{BTreeMap, BTreeSet}, 8 | convert::From, 9 | fmt::Debug, 10 | }; 11 | 12 | type State = BTreeSet; 13 | type Transition = (State, T, State); 14 | 15 | #[derive(Debug)] 16 | pub(crate) struct NfaToDfaIter 17 | where 18 | T: Character + Copy, 19 | { 20 | nfa: Nfa, 21 | states: BTreeSet, 22 | new_states: BTreeSet, 23 | transitions: BTreeSet>, 24 | accept_states: BTreeSet, 25 | } 26 | 27 | impl NfaToDfaIter 28 | where 29 | T: Character + Copy, 30 | { 31 | pub(super) fn new(nfa: Nfa) -> NfaToDfaIter { 32 | let mut start_state = BTreeSet::new(); 33 | start_state.insert(START_STATE.to_owned()); 34 | 35 | // The start state is always there. 36 | let mut states: BTreeSet> = BTreeSet::new(); 37 | states.insert(start_state); 38 | 39 | // The start state is an accept-state and the end_text is false. This means that we are 40 | // already on a accept-state and we do not have to parse all the string. As a result, the 41 | // DFA is always true. 42 | let new_states = if nfa.is_accept_state(START_STATE) && !nfa.is_end_text() { 43 | BTreeSet::new() 44 | } else { 45 | states.clone() 46 | }; 47 | 48 | let accept_states = if nfa.is_accept_state(START_STATE) { 49 | states.clone() 50 | } else { 51 | BTreeSet::new() 52 | }; 53 | 54 | NfaToDfaIter { 55 | nfa, 56 | states, 57 | new_states, 58 | transitions: BTreeSet::new(), 59 | accept_states, 60 | } 61 | } 62 | 63 | /// Returns a set of all character the given state has a transition as source state. 
64 | fn characters(&self, state: &State) -> BTreeSet { 65 | let mut characters = BTreeSet::new(); 66 | for s in state { 67 | self.nfa.chars(*s, &mut characters); 68 | } 69 | characters 70 | } 71 | 72 | /// 73 | fn simulate(&self, state: &State, c: T) -> State { 74 | let mut new_state = BTreeSet::new(); 75 | for s in state { 76 | self.nfa.simulate(*s, c, &mut new_state); 77 | } 78 | new_state 79 | } 80 | 81 | fn is_accept_state(&self, state: &State) -> bool { 82 | for s in state { 83 | if self.nfa.is_accept_state(*s) { 84 | return true; 85 | } 86 | } 87 | false 88 | } 89 | 90 | fn next_step(&mut self) { 91 | let mut new_states = BTreeSet::new(); 92 | for state in self.new_states.iter() { 93 | let chars = self.characters(state); 94 | for c in chars { 95 | let mut new_state = self.simulate(state, c); 96 | if !self.nfa.is_start_text() { 97 | new_state.insert(START_STATE.to_owned()); 98 | } 99 | 100 | if !self.states.contains(&new_state) { 101 | self.states.insert(new_state.clone()); 102 | new_states.insert(new_state.clone()); 103 | 104 | if self.is_accept_state(&new_state) { 105 | self.accept_states.insert(new_state.clone()); 106 | } 107 | } 108 | self.transitions.insert((state.clone(), c, new_state)); 109 | } 110 | } 111 | self.new_states = new_states; 112 | } 113 | } 114 | 115 | impl Iterator for &mut NfaToDfaIter 116 | where 117 | T: Character + Copy, 118 | { 119 | type Item = usize; 120 | 121 | fn next(&mut self) -> Option { 122 | if self.new_states.is_empty() { 123 | return None; 124 | } 125 | 126 | self.next_step(); 127 | 128 | match self.new_states.len() { 129 | 0 => None, 130 | len => Some(len), 131 | } 132 | } 133 | } 134 | 135 | impl From> for Dfa 136 | where 137 | T: Character + Copy, 138 | { 139 | fn from(mut nfa_to_dfa: NfaToDfaIter) -> Self { 140 | for _ in &mut nfa_to_dfa {} 141 | 142 | let mut states = BTreeSet::new(); 143 | let mut accept_states = BTreeSet::new(); 144 | let mut mapping = BTreeMap::new(); 145 | 146 | let mut start_state = 
BTreeSet::new(); 147 | start_state.insert(START_STATE); 148 | 149 | // It has to be ensured that the start-state is mapped to zero. 150 | // Therefore, the start-state has to be removed. 151 | nfa_to_dfa.states.remove(&start_state); 152 | states.insert(START_STATE); 153 | if nfa_to_dfa.accept_states.remove(&start_state) { 154 | accept_states.insert(START_STATE); 155 | } 156 | mapping.insert(start_state, START_STATE); 157 | 158 | // First map all non accept-states. 159 | for state in nfa_to_dfa.states { 160 | if !nfa_to_dfa.accept_states.contains(&state) { 161 | states.insert(mapping.len()); 162 | mapping.insert(state, mapping.len()); 163 | } 164 | } 165 | 166 | // Then map all accept-states. 167 | // Because if `end_text` equals true then the accept states are implemented differently so 168 | // all accept-states should be at the end of the mapping. 169 | for accept_state in nfa_to_dfa.accept_states.iter() { 170 | states.insert(mapping.len()); 171 | mapping.insert(accept_state.clone(), mapping.len()); 172 | } 173 | 174 | // Convert the transitions according the mapping. 175 | let mut transitions: BTreeMap> = BTreeMap::new(); 176 | for (s, c, t) in nfa_to_dfa.transitions { 177 | let s = mapping.get(&s).unwrap(); 178 | let t = mapping.get(&t).unwrap(); 179 | if let Some(state_transitions) = transitions.get_mut(s) { 180 | state_transitions.insert((c, *t)); 181 | } else { 182 | let mut state_transitions = BTreeSet::new(); 183 | state_transitions.insert((c, *t)); 184 | transitions.insert(*s, state_transitions); 185 | } 186 | } 187 | 188 | // Convert the accept states according the mapping. 
189 | for s in nfa_to_dfa.accept_states { 190 | let s = mapping.get(&s).unwrap(); 191 | accept_states.insert(*s); 192 | } 193 | 194 | Dfa { 195 | states, 196 | transitions, 197 | accept_states, 198 | start_text: nfa_to_dfa.nfa.is_start_text(), 199 | end_text: nfa_to_dfa.nfa.is_end_text(), 200 | } 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /src/dfa/to_tokens/lookup_table.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | character::Character, 3 | dfa::to_tokens::{usize_to_lit_int, DfaToTokens}, 4 | }; 5 | use proc_macro2::{Span, TokenStream}; 6 | use quote::{quote, ToTokens}; 7 | use std::{ 8 | collections::{BTreeMap, BTreeSet}, 9 | mem::size_of, 10 | }; 11 | use syn::Ident; 12 | 13 | impl DfaToTokens 14 | where 15 | T: Character + ToTokens + Copy + Into, 16 | { 17 | fn lookup_table_u8_row_map(transitions: &BTreeSet<(T, usize)>) -> Option> { 18 | let mut transitions_u8 = BTreeMap::new(); 19 | for (ch, t) in transitions.iter() { 20 | let ch = ch.to_byte()?; 21 | transitions_u8.insert(ch, *t); 22 | } 23 | Some(transitions_u8) 24 | } 25 | 26 | fn lookup_table_row_no_transition_default(&self, int_type: &Ident) -> TokenStream { 27 | if self.dfa.start_text { 28 | quote! { 29 | #int_type::MAX 30 | } 31 | } else { 32 | quote! { 33 | 0 34 | } 35 | } 36 | } 37 | 38 | fn lookup_table_row_accept_transition_end(&self, int_type: &Ident) -> TokenStream { 39 | if self.dfa.start_text { 40 | quote! { 41 | #int_type::MAX - 1 42 | } 43 | } else { 44 | quote! 
{ 45 | #int_type::MAX 46 | } 47 | } 48 | } 49 | 50 | fn lookup_table_row( 51 | &self, 52 | transitions_u8: &BTreeMap, 53 | int_type: &Ident, 54 | ) -> Vec { 55 | let no_transition_default = self.lookup_table_row_no_transition_default(int_type); 56 | let accept_transition_end = self.lookup_table_row_accept_transition_end(int_type); 57 | let mut row = Vec::with_capacity(256); 58 | for i in 0..=u8::MAX { 59 | let new_state = if let Some(t) = transitions_u8.get(&i) { 60 | if !self.dfa.end_text && self.dfa.accept_states.contains(t) { 61 | accept_transition_end.clone() 62 | } else { 63 | let new_state = usize_to_lit_int(*t); 64 | quote! { 65 | #new_state 66 | } 67 | } 68 | } else { 69 | no_transition_default.clone() 70 | }; 71 | row.push(new_state); 72 | } 73 | row 74 | } 75 | 76 | fn lookup_table_row_default(&self, int_type: &Ident) -> Vec { 77 | let no_transition_default = self.lookup_table_row_no_transition_default(int_type); 78 | vec![no_transition_default; 256] 79 | } 80 | 81 | fn transitions_lookup_table(&self, int_type: &Ident) -> Option { 82 | let mut table = Vec::new(); 83 | for state in self.required_states.iter() { 84 | let row = if let Some(transitions) = self.dfa.transitions.get(state) { 85 | let transitions_u8 = DfaToTokens::::lookup_table_u8_row_map(transitions)?; 86 | self.lookup_table_row(&transitions_u8, int_type) 87 | } else { 88 | self.lookup_table_row_default(int_type) 89 | }; 90 | table.push(quote! { 91 | [#(#row),*] 92 | }); 93 | } 94 | let len = table.len(); 95 | let transitions = quote! { 96 | static TABLE: [[#int_type; 256]; #len] = [#(#table),*] 97 | }; 98 | Some(transitions) 99 | } 100 | 101 | fn for_each_lookup_table_check(&self, int_type: &Ident) -> TokenStream { 102 | match (self.dfa.start_text, self.dfa.end_text) { 103 | (false, false) => quote! { 104 | if state == #int_type::MAX { 105 | return true; 106 | } 107 | }, 108 | (false, true) => quote! {}, 109 | (true, false) => quote! 
{ 110 | if state == #int_type::MAX { 111 | return false; 112 | } else if state == #int_type::MAX - 1 { 113 | return true; 114 | } 115 | }, 116 | (true, true) => quote! { 117 | if state == #int_type::MAX { 118 | return false; 119 | } 120 | }, 121 | } 122 | } 123 | 124 | pub(super) fn for_each_lookup_table(&self) -> Option { 125 | if !self.is_byte { 126 | return None; 127 | } 128 | 129 | let int_type = self.get_int_type()?; 130 | let transitions_lookup_table = self.transitions_lookup_table(&int_type)?; 131 | let iterator = T::get_iterator_function(self.is_byte); 132 | let c_to_usize = T::to_usize(Ident::new("c", Span::call_site()), self.is_byte); 133 | let check = self.for_each_lookup_table_check(&int_type); 134 | let for_each = quote! { 135 | #transitions_lookup_table; 136 | let mut state = 0; 137 | 138 | for c in s.#iterator() { 139 | state = TABLE[state as usize][#c_to_usize]; 140 | 141 | #check 142 | } 143 | }; 144 | Some(for_each) 145 | } 146 | 147 | fn lookup_table_states(&self) -> Option { 148 | let mut additional_state = 0; 149 | 150 | if self.dfa.start_text { 151 | additional_state += 1; 152 | } 153 | 154 | if !self.dfa.end_text { 155 | additional_state += 1; 156 | } 157 | 158 | self.required_states.len().checked_add(additional_state) 159 | } 160 | 161 | fn lookup_table_size(&self) -> Option<(usize, Ident)> { 162 | let states = self.lookup_table_states()?; 163 | let states_character = states.checked_mul(256)?; 164 | let ret = if states < (u8::MAX) as usize { 165 | ( 166 | states_character.checked_mul(size_of::())?, 167 | Ident::new("u8", Span::call_site()), 168 | ) 169 | } else if states < (u16::MAX) as usize { 170 | ( 171 | states_character.checked_mul(size_of::())?, 172 | Ident::new("u16", Span::call_site()), 173 | ) 174 | } else if states < (u32::MAX) as usize { 175 | ( 176 | states_character.checked_mul(size_of::())?, 177 | Ident::new("u32", Span::call_site()), 178 | ) 179 | } else { 180 | ( 181 | states_character.checked_mul(size_of::())?, 182 | 
Ident::new("u64", Span::call_site()), 183 | ) 184 | }; 185 | Some(ret) 186 | } 187 | 188 | pub(super) fn get_int_type(&self) -> Option { 189 | let (lookup_table_size, int_type) = self.lookup_table_size()?; 190 | if lookup_table_size <= self.threshold { 191 | Some(int_type) 192 | } else { 193 | None 194 | } 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /src/nfa/repetition.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | character::Character, 3 | nfa::{NFAResult, Nfa}, 4 | }; 5 | use regex_syntax::hir::{Hir, Repetition, RepetitionKind, RepetitionRange}; 6 | use std::collections::{BTreeMap, BTreeSet}; 7 | 8 | impl Nfa 9 | where 10 | T: Character + Copy, 11 | { 12 | fn repetition_range_exactly(&mut self, hir: Hir, exactly: u32) -> NFAResult<()> { 13 | for _ in 0..exactly { 14 | let nfa = self.sub(hir.clone())?; 15 | self.append_states(&nfa)?; 16 | self.accept_states = nfa.accept_states; 17 | } 18 | Ok(()) 19 | } 20 | 21 | fn repetition_range_at_least(&mut self, hir: Hir, at_least: u32) -> NFAResult<()> { 22 | for _ in 0..at_least { 23 | let nfa = self.sub(hir.clone())?; 24 | self.append_states(&nfa)?; 25 | self.accept_states = nfa.accept_states; 26 | } 27 | self.repetition_zero_or_more(hir) 28 | } 29 | 30 | fn repetition_range_bounded(&mut self, hir: Hir, m: u32, n: u32) -> NFAResult<()> { 31 | if m != 0 { 32 | self.repetition_range_exactly(hir.clone(), m)?; 33 | } 34 | 35 | let mut accept_states = self.accept_states.clone(); 36 | for _ in m..n { 37 | let nfa = self.sub(hir.clone())?; 38 | self.append_states(&nfa)?; 39 | accept_states.extend(nfa.accept_states.clone()); 40 | self.accept_states = nfa.accept_states; 41 | } 42 | self.accept_states = accept_states; 43 | 44 | Ok(()) 45 | } 46 | 47 | fn repetition_range(&mut self, hir: Hir, repetition_range: RepetitionRange) -> NFAResult<()> { 48 | match repetition_range { 49 | RepetitionRange::Exactly(exactly) => 
self.repetition_range_exactly(hir, exactly), 50 | RepetitionRange::AtLeast(at_least) => self.repetition_range_at_least(hir, at_least), 51 | RepetitionRange::Bounded(m, n) => self.repetition_range_bounded(hir, m, n), 52 | } 53 | } 54 | 55 | fn repetition_zero_or_one(&mut self, hir: Hir) -> NFAResult<()> { 56 | let nfa = self.sub(hir)?; 57 | self.append_states(&nfa)?; 58 | self.accept_states.extend(nfa.accept_states); 59 | Ok(()) 60 | } 61 | 62 | fn repetition_zero_or_more(&mut self, hir: Hir) -> NFAResult<()> { 63 | let nfa = self.sub(hir)?; 64 | for state in nfa.states { 65 | if !nfa.accept_states.contains(&state) { 66 | self.add_state(state)?; 67 | } 68 | } 69 | 70 | for (source_state, characters_to_targets) in nfa.transitions { 71 | for (character, targets) in characters_to_targets { 72 | for target_state in targets { 73 | let s_accept = nfa.accept_states.contains(&source_state); 74 | let t_accept = nfa.accept_states.contains(&target_state); 75 | match (s_accept, t_accept) { 76 | (true, true) => { 77 | for source_state in self.accept_states.iter() { 78 | for target_state in self.accept_states.iter() { 79 | Nfa::add_transition( 80 | &mut self.transitions, 81 | *target_state, 82 | character, 83 | *source_state, 84 | ); 85 | } 86 | } 87 | } 88 | (true, false) => { 89 | for source_state in self.accept_states.iter() { 90 | Nfa::add_transition( 91 | &mut self.transitions, 92 | *source_state, 93 | character, 94 | target_state, 95 | ); 96 | } 97 | } 98 | (false, true) => { 99 | for target_state in self.accept_states.iter() { 100 | Nfa::add_transition( 101 | &mut self.transitions, 102 | source_state, 103 | character, 104 | *target_state, 105 | ); 106 | } 107 | } 108 | (false, false) => { 109 | Nfa::add_transition( 110 | &mut self.transitions, 111 | source_state, 112 | character, 113 | target_state, 114 | ); 115 | } 116 | } 117 | } 118 | } 119 | } 120 | 121 | Ok(()) 122 | } 123 | 124 | fn repetition_one_or_more(&mut self, hir: Hir) -> NFAResult<()> { 125 | let mut nfa = 
self.sub(hir)?; 126 | let mut backwards_characters_to_targets: BTreeMap> = BTreeMap::new(); 127 | for accept_state in self.accept_states.iter() { 128 | if let Some(characters_to_targets) = nfa.transitions.get(accept_state) { 129 | for (character, targets) in characters_to_targets.iter() { 130 | if let Some(backwards_targets) = 131 | backwards_characters_to_targets.get_mut(character) 132 | { 133 | for target in targets { 134 | backwards_targets.insert(*target); 135 | } 136 | } else { 137 | backwards_characters_to_targets.insert(*character, targets.clone()); 138 | } 139 | } 140 | } 141 | } 142 | 143 | for (character, targets) in backwards_characters_to_targets { 144 | for target_state in targets { 145 | for accept_state in nfa.accept_states.iter() { 146 | Nfa::add_transition( 147 | &mut nfa.transitions, 148 | *accept_state, 149 | character, 150 | target_state, 151 | ); 152 | } 153 | } 154 | } 155 | 156 | self.append_states(&nfa)?; 157 | self.accept_states = nfa.accept_states; 158 | Ok(()) 159 | } 160 | 161 | pub(super) fn repetition(&mut self, repetition: Repetition) -> NFAResult<()> { 162 | match repetition.kind { 163 | RepetitionKind::ZeroOrOne => self.repetition_zero_or_one(*repetition.hir), 164 | RepetitionKind::ZeroOrMore => self.repetition_zero_or_more(*repetition.hir), 165 | RepetitionKind::OneOrMore => self.repetition_one_or_more(*repetition.hir), 166 | RepetitionKind::Range(repetition_range) => { 167 | self.repetition_range(*repetition.hir, repetition_range) 168 | } 169 | } 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /tests/repetition.rs: -------------------------------------------------------------------------------- 1 | use proc_macro_regex::regex; 2 | 3 | #[test] 4 | fn zero_or_more_1() { 5 | regex!(regex "a*"); 6 | assert!(regex("")); 7 | assert!(regex("a")); 8 | assert!(regex("aa")); 9 | assert!(regex("b")); 10 | assert!(regex("ab")); 11 | assert!(regex("ba")); 12 | } 13 | 14 | #[test] 15 | fn 
zero_or_more_2() { 16 | regex!(regex "^a*"); 17 | assert!(regex("")); 18 | assert!(regex("a")); 19 | assert!(regex("aa")); 20 | assert!(regex("ab")); 21 | assert!(regex("ba")); 22 | } 23 | 24 | #[test] 25 | fn zero_or_more_3() { 26 | regex!(regex "a*$"); 27 | assert!(regex("")); 28 | assert!(regex("a")); 29 | assert!(regex("aa")); 30 | assert!(regex("ab")); 31 | assert!(regex("ba")); 32 | } 33 | 34 | #[test] 35 | fn zero_or_more_4() { 36 | regex!(regex "^a*$"); 37 | assert!(regex("")); 38 | assert!(regex("a")); 39 | assert!(regex("aa")); 40 | assert!(!regex("b")); 41 | assert!(!regex("ab")); 42 | assert!(!regex("ba")); 43 | } 44 | 45 | #[test] 46 | fn zero_or_one_1() { 47 | regex!(regex "a?"); 48 | assert!(regex("")); 49 | assert!(regex("a")); 50 | assert!(regex("aa")); 51 | assert!(regex("b")); 52 | assert!(regex("ab")); 53 | assert!(regex("ba")); 54 | } 55 | 56 | #[test] 57 | fn zero_or_one_2() { 58 | regex!(regex "^a?"); 59 | assert!(regex("")); 60 | assert!(regex("a")); 61 | assert!(regex("aa")); 62 | assert!(regex("b")); 63 | assert!(regex("ab")); 64 | assert!(regex("ba")); 65 | } 66 | 67 | #[test] 68 | fn zero_or_one_3() { 69 | regex!(regex "a?$"); 70 | assert!(regex("")); 71 | assert!(regex("a")); 72 | assert!(regex("aa")); 73 | assert!(regex("b")); 74 | assert!(regex("ab")); 75 | assert!(regex("ba")); 76 | } 77 | 78 | #[test] 79 | fn zero_or_one_4() { 80 | regex!(regex "^a?$"); 81 | assert!(regex("")); 82 | assert!(regex("a")); 83 | assert!(!regex("aa")); 84 | assert!(!regex("b")); 85 | assert!(!regex("ab")); 86 | assert!(!regex("ba")); 87 | } 88 | 89 | #[test] 90 | fn one_or_more_1() { 91 | regex!(regex "a+"); 92 | assert!(!regex("")); 93 | assert!(regex("a")); 94 | assert!(regex("aa")); 95 | assert!(!regex("b")); 96 | assert!(regex("ab")); 97 | assert!(regex("ba")); 98 | } 99 | 100 | #[test] 101 | fn one_or_more_2() { 102 | regex!(regex "^a+"); 103 | assert!(!regex("")); 104 | assert!(regex("a")); 105 | assert!(regex("aa")); 106 | assert!(!regex("b")); 
107 | assert!(regex("ab")); 108 | assert!(!regex("ba")); 109 | } 110 | 111 | #[test] 112 | fn one_or_more_3() { 113 | regex!(regex "a+$"); 114 | assert!(!regex("")); 115 | assert!(regex("a")); 116 | assert!(regex("aa")); 117 | assert!(!regex("b")); 118 | assert!(!regex("ab")); 119 | assert!(regex("ba")); 120 | } 121 | 122 | #[test] 123 | fn one_or_more_4() { 124 | regex!(regex "^a+$"); 125 | assert!(!regex("")); 126 | assert!(regex("a")); 127 | assert!(regex("aa")); 128 | assert!(!regex("b")); 129 | assert!(!regex("ab")); 130 | assert!(!regex("ba")); 131 | } 132 | 133 | #[test] 134 | fn range_exactly_1() { 135 | regex!(regex "a{2}"); 136 | assert!(!regex("")); 137 | assert!(!regex("a")); 138 | assert!(regex("aa")); 139 | assert!(regex("aaa")); 140 | assert!(!regex("b")); 141 | assert!(!regex("ab")); 142 | assert!(regex("aab")); 143 | assert!(regex("aaab")); 144 | assert!(!regex("ba")); 145 | assert!(regex("baa")); 146 | assert!(regex("baaa")); 147 | } 148 | 149 | #[test] 150 | fn range_exactly_2() { 151 | regex!(regex "^a{2}"); 152 | assert!(!regex("")); 153 | assert!(!regex("a")); 154 | assert!(regex("aa")); 155 | assert!(regex("aaa")); 156 | assert!(!regex("b")); 157 | assert!(!regex("ab")); 158 | assert!(regex("aab")); 159 | assert!(regex("aaab")); 160 | assert!(!regex("ba")); 161 | assert!(!regex("baa")); 162 | assert!(!regex("baaa")); 163 | } 164 | 165 | #[test] 166 | fn range_exactly_3() { 167 | regex!(regex "a{2}$"); 168 | assert!(!regex("")); 169 | assert!(!regex("a")); 170 | assert!(regex("aa")); 171 | assert!(regex("aaa")); 172 | assert!(!regex("b")); 173 | assert!(!regex("ab")); 174 | assert!(!regex("aab")); 175 | assert!(!regex("aaab")); 176 | assert!(!regex("ba")); 177 | assert!(regex("baa")); 178 | assert!(regex("baaa")); 179 | } 180 | 181 | #[test] 182 | fn range_exactly_4() { 183 | regex!(regex "^a{2}$"); 184 | assert!(!regex("")); 185 | assert!(!regex("a")); 186 | assert!(regex("aa")); 187 | assert!(!regex("aaa")); 188 | assert!(!regex("b")); 189 | 
assert!(!regex("ab")); 190 | assert!(!regex("aab")); 191 | assert!(!regex("aaab")); 192 | assert!(!regex("ba")); 193 | assert!(!regex("baa")); 194 | assert!(!regex("baaa")); 195 | } 196 | 197 | #[test] 198 | fn range_at_least_1() { 199 | regex!(regex "a{2,}"); 200 | assert!(!regex("")); 201 | assert!(!regex("a")); 202 | assert!(regex("aa")); 203 | assert!(regex("aaa")); 204 | assert!(!regex("b")); 205 | assert!(!regex("ab")); 206 | assert!(regex("aab")); 207 | assert!(regex("aaab")); 208 | assert!(!regex("ba")); 209 | assert!(regex("baa")); 210 | assert!(regex("baaa")); 211 | } 212 | 213 | #[test] 214 | fn range_at_least_2() { 215 | regex!(regex "^a{2,}"); 216 | assert!(!regex("")); 217 | assert!(!regex("a")); 218 | assert!(regex("aa")); 219 | assert!(regex("aaa")); 220 | assert!(!regex("b")); 221 | assert!(!regex("ab")); 222 | assert!(regex("aab")); 223 | assert!(regex("aaab")); 224 | assert!(!regex("ba")); 225 | assert!(!regex("baa")); 226 | assert!(!regex("baaa")); 227 | } 228 | 229 | #[test] 230 | fn range_at_least_3() { 231 | regex!(regex "a{2,}$"); 232 | assert!(!regex("")); 233 | assert!(!regex("a")); 234 | assert!(regex("aa")); 235 | assert!(regex("aaa")); 236 | assert!(!regex("b")); 237 | assert!(!regex("ab")); 238 | assert!(!regex("aab")); 239 | assert!(!regex("aaab")); 240 | assert!(!regex("ba")); 241 | assert!(regex("baa")); 242 | assert!(regex("baaa")); 243 | } 244 | 245 | #[test] 246 | fn range_at_least_4() { 247 | regex!(regex "^a{2,}$"); 248 | assert!(!regex("")); 249 | assert!(!regex("a")); 250 | assert!(regex("aa")); 251 | assert!(regex("aaa")); 252 | assert!(!regex("b")); 253 | assert!(!regex("ab")); 254 | assert!(!regex("aab")); 255 | assert!(!regex("aaab")); 256 | assert!(!regex("ba")); 257 | assert!(!regex("baa")); 258 | assert!(!regex("baaa")); 259 | } 260 | 261 | #[test] 262 | fn range_bounded_1() { 263 | regex!(regex "a{1,4}"); 264 | assert!(!regex("")); 265 | assert!(regex("a")); 266 | assert!(regex("aa")); 267 | assert!(regex("aaa")); 268 
| assert!(regex("aaaa")); 269 | assert!(regex("aaaaa")); 270 | assert!(!regex("b")); 271 | assert!(regex("ab")); 272 | assert!(regex("aab")); 273 | assert!(regex("aaab")); 274 | assert!(regex("aaaab")); 275 | assert!(regex("aaaaab")); 276 | assert!(regex("ba")); 277 | assert!(regex("baa")); 278 | assert!(regex("baaa")); 279 | assert!(regex("baaaa")); 280 | assert!(regex("baaaaa")); 281 | } 282 | 283 | #[test] 284 | fn range_bounded_2() { 285 | regex!(regex "^a{1,4}"); 286 | assert!(!regex("")); 287 | assert!(regex("a")); 288 | assert!(regex("aa")); 289 | assert!(regex("aaa")); 290 | assert!(regex("aaaa")); 291 | assert!(regex("aaaaa")); 292 | assert!(!regex("b")); 293 | assert!(regex("ab")); 294 | assert!(regex("aab")); 295 | assert!(regex("aaab")); 296 | assert!(regex("aaaab")); 297 | assert!(regex("aaaaab")); 298 | assert!(!regex("ba")); 299 | assert!(!regex("baa")); 300 | assert!(!regex("baaa")); 301 | assert!(!regex("baaaa")); 302 | assert!(!regex("baaaaa")); 303 | } 304 | 305 | #[test] 306 | fn range_bounded_3() { 307 | regex!(regex "a{1,4}$"); 308 | assert!(!regex("")); 309 | assert!(regex("a")); 310 | assert!(regex("aa")); 311 | assert!(regex("aaa")); 312 | assert!(regex("aaaa")); 313 | assert!(regex("aaaaa")); 314 | assert!(!regex("b")); 315 | assert!(!regex("ab")); 316 | assert!(!regex("aab")); 317 | assert!(!regex("aaab")); 318 | assert!(!regex("aaaab")); 319 | assert!(!regex("aaaaab")); 320 | assert!(regex("ba")); 321 | assert!(regex("baa")); 322 | assert!(regex("baaa")); 323 | assert!(regex("baaaaa")); 324 | } 325 | 326 | #[test] 327 | fn range_bounded_4() { 328 | regex!(regex "^a{1,4}$"); 329 | assert!(!regex("")); 330 | assert!(regex("a")); 331 | assert!(regex("aa")); 332 | assert!(regex("aaa")); 333 | assert!(regex("aaaa")); 334 | assert!(!regex("aaaaa")); 335 | assert!(!regex("b")); 336 | assert!(!regex("ab")); 337 | assert!(!regex("aab")); 338 | assert!(!regex("aaab")); 339 | assert!(!regex("aaaab")); 340 | assert!(!regex("aaaaab")); 341 | 
assert!(!regex("ba")); 342 | assert!(!regex("baa")); 343 | assert!(!regex("baaa")); 344 | assert!(!regex("baaaa")); 345 | assert!(!regex("baaaaa")); 346 | } 347 | -------------------------------------------------------------------------------- /src/nfa/mod.rs: -------------------------------------------------------------------------------- 1 | mod repetition; 2 | 3 | use crate::{ 4 | character::{Character, CharacterError}, 5 | macro_input::MacroInput, 6 | }; 7 | use regex_syntax::{ 8 | hir::{Anchor, Class, Group, Hir, HirKind, Literal, WordBoundary}, 9 | ParserBuilder, 10 | }; 11 | use std::{ 12 | cmp::max, 13 | collections::{BTreeMap, BTreeSet}, 14 | fmt::Debug, 15 | }; 16 | use syn::{Error as SynError, Result as SynResult}; 17 | use thiserror::Error; 18 | 19 | pub const START_STATE: usize = 0; 20 | 21 | type Transition = BTreeMap>>; 22 | 23 | pub type NFAResult = Result; 24 | 25 | fn to_hir(input: &MacroInput) -> SynResult 26 | where 27 | T: Character, 28 | { 29 | let mut parser = ParserBuilder::new() 30 | .unicode(T::unicode()) 31 | .allow_invalid_utf8(T::allow_invalid_utf8()) 32 | .build(); 33 | match parser.parse(&input.get_regex()) { 34 | Ok(hir) => Ok(hir), 35 | Err(e) => Err(SynError::new( 36 | input.get_regex_span(), 37 | format!("Could not parse regex: {:?}", e), 38 | )), 39 | } 40 | } 41 | 42 | #[derive(Debug, Error)] 43 | pub enum NFAError { 44 | #[error("alternation has zero length")] /* fixed typo: was "lenght" */ 45 | AlternationZeroLen, 46 | #[error("CharacterError: {0}")] 47 | CharacterError(#[from] CharacterError), 48 | #[error("Start text was not at the beginning of the regex")] 49 | StartTextError, 50 | #[error("End text was not at the end of the text")] 51 | EndTextError, 52 | } 53 | 54 | #[derive(Debug)] 55 | pub struct Nfa 56 | where 57 | T: Character + Copy, 58 | { 59 | states: BTreeSet, 60 | transitions: Transition, 61 | accept_states: BTreeSet, 62 | state_count: usize, 63 | start_text: bool, 64 | end_text: bool, 65 | } 66 | 67 | impl Nfa 68 | where 69 | T: Character + 
Copy, 70 | { 71 | fn add_transition( 72 | transitions: &mut Transition, 73 | source_state: usize, 74 | character: T, 75 | target_state: usize, 76 | ) { 77 | if let Some(characters_to_targets) = transitions.get_mut(&source_state) { 78 | if let Some(targets) = characters_to_targets.get_mut(&character) { 79 | targets.insert(target_state); 80 | } else { 81 | let mut targets = BTreeSet::new(); 82 | targets.insert(target_state); 83 | 84 | characters_to_targets.insert(character, targets); 85 | } 86 | } else { 87 | let mut targets = BTreeSet::new(); 88 | targets.insert(target_state); 89 | 90 | let mut characters_to_targets = BTreeMap::new(); 91 | characters_to_targets.insert(character, targets); 92 | 93 | transitions.insert(source_state, characters_to_targets); 94 | } 95 | } 96 | 97 | fn extend_transitions(d: &mut Transition, s: &Transition) { 98 | for (new_source_state, new_characters_to_targets) in s.iter() { 99 | if let Some(characters_to_targets) = d.get_mut(new_source_state) { 100 | for (new_character, new_targets) in new_characters_to_targets.iter() { 101 | if let Some(targets) = characters_to_targets.get_mut(new_character) { 102 | for new_target_state in new_targets { 103 | targets.insert(*new_target_state); 104 | } 105 | } else { 106 | characters_to_targets.insert(*new_character, new_targets.clone()); 107 | } 108 | } 109 | } else { 110 | d.insert(*new_source_state, new_characters_to_targets.clone()); 111 | } 112 | } 113 | } 114 | 115 | fn add_state(&mut self, new_state: usize) -> NFAResult<()> { 116 | if self.end_text { 117 | return Err(NFAError::EndTextError); 118 | } 119 | 120 | let assert = self.states.insert(new_state); 121 | debug_assert!(assert); 122 | self.state_count = max(new_state, self.state_count); 123 | Ok(()) 124 | } 125 | 126 | fn append_states(&mut self, nfa: &Nfa) -> NFAResult<()> { 127 | self.set_start_text(nfa.start_text)?; 128 | if !nfa.states.is_empty() { 129 | if self.end_text { 130 | return Err(NFAError::EndTextError); 131 | } 132 | 133 | for 
new_state in nfa.states.iter() { 134 | let assert = self.states.insert(*new_state); 135 | debug_assert!(assert); 136 | self.state_count = max(*new_state, self.state_count); 137 | } 138 | Nfa::extend_transitions(&mut self.transitions, &nfa.transitions); 139 | } 140 | self.end_text = nfa.end_text; 141 | Ok(()) 142 | } 143 | 144 | fn next_state_count(&mut self) -> usize { 145 | self.state_count += 1; 146 | self.state_count 147 | } 148 | 149 | fn next_state(&mut self) -> NFAResult { 150 | let new_state = self.next_state_count(); 151 | self.add_state(new_state)?; 152 | Ok(new_state) 153 | } 154 | 155 | fn sub(&mut self, hir: Hir) -> NFAResult> { 156 | let mut nfa = Nfa { 157 | states: BTreeSet::new(), 158 | transitions: Transition::new(), 159 | accept_states: self.accept_states.clone(), 160 | state_count: self.next_state_count(), 161 | start_text: false, 162 | end_text: self.end_text, 163 | }; 164 | nfa.hir(hir)?; 165 | Ok(nfa) 166 | } 167 | 168 | fn new() -> Nfa { 169 | let mut states = BTreeSet::new(); 170 | states.insert(START_STATE); 171 | 172 | Nfa { 173 | states: states.clone(), 174 | transitions: Transition::new(), 175 | accept_states: states, 176 | state_count: START_STATE, 177 | start_text: false, 178 | end_text: false, 179 | } 180 | } 181 | 182 | fn char(&mut self, c: T) -> NFAResult<()> { 183 | let state = self.next_state()?; 184 | for s in self.accept_states.iter() { 185 | Nfa::add_transition(&mut self.transitions, *s, c, state); 186 | } 187 | self.accept_states = BTreeSet::new(); 188 | self.accept_states.insert(state); 189 | Ok(()) 190 | } 191 | 192 | fn literal(&mut self, literal: Literal) -> NFAResult<()> { 193 | let c = T::from_literal(literal)?; 194 | self.char(c) 195 | } 196 | 197 | fn class(&mut self, class: Class) -> NFAResult<()> { 198 | let state = self.next_state()?; 199 | let cs = T::from_class(class)?; 200 | for c in cs { 201 | for s in &self.accept_states { 202 | Nfa::add_transition(&mut self.transitions, *s, c, state); 203 | } 204 | } 205 | 
self.accept_states = BTreeSet::new(); 206 | self.accept_states.insert(state); 207 | Ok(()) 208 | } 209 | 210 | fn alternation(&mut self, alternation: Vec) -> NFAResult<()> { 211 | if alternation.is_empty() { 212 | return Err(NFAError::AlternationZeroLen); 213 | } 214 | 215 | let mut accept_states = BTreeSet::new(); 216 | for hir in alternation { 217 | let nfa = self.sub(hir)?; 218 | self.append_states(&nfa)?; 219 | accept_states.extend(nfa.accept_states) 220 | } 221 | self.accept_states = accept_states; 222 | Ok(()) 223 | } 224 | 225 | fn concat(&mut self, concat: Vec) -> NFAResult<()> { /* renamed from misspelled `conact`; builds the NFA for each sub-HIR in sequence */ 226 | for hir in concat { 227 | self.hir(hir)?; 228 | } 229 | Ok(()) 230 | } 231 | 232 | fn group(&mut self, group: Group) -> NFAResult<()> { 233 | self.hir(*group.hir) 234 | } 235 | 236 | fn word_boundary(&mut self, _word_boundary: WordBoundary) -> NFAResult<()> { 237 | unimplemented!(); 238 | } 239 | 240 | fn set_start_text(&mut self, start_text: bool) -> NFAResult<()> { 241 | if start_text { 242 | if self.state_count == 0 { 243 | self.start_text = true; 244 | } else { 245 | return Err(NFAError::StartTextError); 246 | } 247 | } 248 | 249 | Ok(()) 250 | } 251 | 252 | fn anchor(&mut self, anchor: Anchor) -> NFAResult<()> { 253 | match anchor { 254 | Anchor::StartLine => self.char(T::new_line()), 255 | Anchor::EndLine => self.char(T::new_line()), 256 | Anchor::StartText => self.set_start_text(true), 257 | Anchor::EndText => { 258 | self.end_text = true; 259 | Ok(()) 260 | } 261 | } 262 | } 263 | 264 | fn hir(&mut self, hir: Hir) -> NFAResult<()> { 265 | match hir.into_kind() { 266 | HirKind::Empty => Ok(()), 267 | HirKind::Literal(literal) => self.literal(literal), 268 | HirKind::Class(class) => self.class(class), 269 | HirKind::Alternation(alternation) => self.alternation(alternation), 270 | HirKind::Concat(concat) => self.concat(concat), 271 | HirKind::Repetition(repetition) => self.repetition(repetition), 272 | HirKind::Group(group) => self.group(group), 273 | 
HirKind::WordBoundary(word_boundary) => self.word_boundary(word_boundary), 274 | HirKind::Anchor(anchor) => self.anchor(anchor), 275 | } 276 | } 277 | 278 | pub fn chars(&self, source_state: usize, characters: &mut BTreeSet) { 279 | if let Some(characters_to_targets) = self.transitions.get(&source_state) { 280 | characters.extend(characters_to_targets.keys()); 281 | } 282 | } 283 | 284 | pub fn simulate(&self, source_state: usize, character: T, new_targets: &mut BTreeSet) { 285 | if let Some(characters_to_targets) = self.transitions.get(&source_state) { 286 | if let Some(targets) = characters_to_targets.get(&character) { 287 | for target_state in targets { 288 | new_targets.insert(*target_state); 289 | } 290 | } 291 | } 292 | } 293 | 294 | pub fn is_accept_state(&self, state: usize) -> bool { 295 | self.accept_states.contains(&state) 296 | } 297 | 298 | pub fn is_start_text(&self) -> bool { 299 | self.start_text 300 | } 301 | 302 | pub fn is_end_text(&self) -> bool { 303 | self.end_text 304 | } 305 | } 306 | 307 | impl TryFrom<&MacroInput> for Nfa 308 | where 309 | T: Character + Copy, 310 | { 311 | type Error = SynError; 312 | 313 | fn try_from(input: &MacroInput) -> SynResult { 314 | let hir = to_hir::(input)?; 315 | 316 | let mut nfa = Nfa::new(); 317 | match nfa.hir(hir) { 318 | Ok(_) => Ok(nfa), 319 | Err(e) => Err(SynError::new( 320 | input.get_regex_span(), 321 | format!("Error creating the NFA: {:?}", e), /* fixed grammar: was "Error create the NFA" */ 322 | )), 323 | } 324 | } 325 | } 326 | --------------------------------------------------------------------------------