├── .github └── workflows │ └── ci.yml ├── .gitignore ├── COPYING ├── Cargo.toml ├── LICENSE-MIT ├── PLANS.md ├── README.md ├── TODO ├── UNLICENSE ├── bench ├── .gitignore ├── Cargo.toml ├── data │ ├── opensubtitles2018-en-huge-ascii.txt │ ├── opensubtitles2018-en-small-ascii.txt │ ├── opensubtitles2018-en-tiny-ascii.txt │ ├── opensubtitles2018-ru-huge-utf8.txt │ ├── opensubtitles2018-ru-small-utf8.txt │ ├── opensubtitles2018-ru-tiny-utf8.txt │ ├── opensubtitles2018-zh-huge-utf8.txt │ ├── opensubtitles2018-zh-small-utf8.txt │ ├── opensubtitles2018-zh-tiny-utf8.txt │ ├── sherlock-holmes-huge.txt │ ├── sherlock-holmes-small.txt │ └── sherlock-holmes-tiny.txt └── src │ ├── bench.rs │ ├── inputs.rs │ └── lib.rs ├── examples ├── Cargo.toml └── fst.rs ├── regex-cli ├── Cargo.toml └── src │ ├── app.rs │ ├── cmd │ ├── debug.rs │ ├── find.rs │ └── mod.rs │ ├── config.rs │ ├── escape.rs │ ├── main.rs │ └── util.rs ├── regex-test ├── COPYING ├── Cargo.toml ├── LICENSE-MIT ├── UNLICENSE └── src │ ├── escape.rs │ └── lib.rs ├── rustfmt.toml ├── scripts ├── fowler-to-toml └── generate-fowler-tests ├── src ├── dfa │ ├── accel.rs │ ├── automaton.rs │ ├── dense.rs │ ├── determinize.rs │ ├── error.rs │ ├── minimize.rs │ ├── mod.rs │ ├── regex.rs │ ├── search.rs │ ├── search_unsafe.rs │ ├── sparse.rs │ ├── special.rs │ └── transducer.rs ├── hybrid │ ├── dfa.rs │ ├── error.rs │ ├── id.rs │ ├── mod.rs │ ├── regex.rs │ └── search.rs ├── lib.rs ├── macros.rs ├── nfa │ ├── mod.rs │ └── thompson │ │ ├── compiler.rs │ │ ├── error.rs │ │ ├── map.rs │ │ ├── mod.rs │ │ ├── pikevm.rs │ │ └── range_trie.rs └── util │ ├── alphabet.rs │ ├── bytes.rs │ ├── determinize │ ├── mod.rs │ └── state.rs │ ├── id.rs │ ├── lazy.rs │ ├── matchtypes.rs │ ├── mod.rs │ ├── prefilter.rs │ ├── sparse_set.rs │ ├── start.rs │ └── syntax.rs └── tests ├── data ├── bytes.toml ├── crazy.toml ├── earliest.toml ├── empty.toml ├── expensive.toml ├── flags.toml ├── fowler │ ├── basic.toml │ ├── dat │ │ ├── README │ │ ├── basic.dat │ │ ├── nullsubexpr.dat │ │ ├── repetition-expensive.dat │ │ └── repetition.dat │ ├── nullsubexpr.toml │ ├── repetition-expensive.toml │ ├── repetition-long.toml │ └── repetition.toml ├── iter.toml ├── misc.toml ├── multiline.toml ├── no-unicode.toml ├── overlapping.toml ├── regression.toml ├── set.toml ├── unicode.toml └── word-boundary.toml ├── dfa ├── api.rs ├── mod.rs └── suite.rs ├── hybrid ├── api.rs ├── mod.rs └── suite.rs ├── nfa ├── mod.rs └── thompson │ ├── mod.rs │ └── pikevm │ ├── api.rs │ ├── mod.rs │ └── suite.rs ├── regression.rs ├── tests.rs └── util.rs /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - master 7 | schedule: 8 | - cron: '00 01 * * *' 9 | jobs: 10 | test: 11 | name: test 12 | env: 13 | # For some builds, we use cross to test on 32-bit and big-endian 14 | # systems. 15 | CARGO: cargo 16 | # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`. 
17 | TARGET: 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | matrix: 21 | build: 22 | - pinned 23 | - stable 24 | - stable-32 25 | - stable-mips 26 | - stable-thumb 27 | - beta 28 | - nightly 29 | - macos 30 | - win-msvc 31 | - win-gnu 32 | include: 33 | - build: pinned 34 | os: ubuntu-18.04 35 | rust: 1.41.1 36 | - build: stable 37 | os: ubuntu-18.04 38 | rust: stable 39 | - build: stable-32 40 | os: ubuntu-18.04 41 | rust: stable 42 | target: i686-unknown-linux-gnu 43 | - build: stable-mips 44 | os: ubuntu-18.04 45 | rust: stable 46 | target: mips64-unknown-linux-gnuabi64 47 | - build: stable-thumb 48 | os: ubuntu-18.04 49 | rust: stable 50 | target: thumbv7em-none-eabihf 51 | - build: beta 52 | os: ubuntu-18.04 53 | rust: beta 54 | - build: nightly 55 | os: ubuntu-18.04 56 | rust: nightly 57 | - build: macos 58 | os: macos-latest 59 | rust: stable 60 | - build: win-msvc 61 | os: windows-2019 62 | rust: stable 63 | - build: win-gnu 64 | os: windows-2019 65 | rust: stable-x86_64-gnu 66 | steps: 67 | - name: Checkout repository 68 | uses: actions/checkout@v1 69 | with: 70 | fetch-depth: 1 71 | 72 | - name: Install Rust 73 | uses: actions-rs/toolchain@v1 74 | with: 75 | toolchain: ${{ matrix.rust }} 76 | profile: minimal 77 | override: true 78 | 79 | - name: Use Cross 80 | if: matrix.target != '' 81 | run: | 82 | cargo install cross 83 | echo "CARGO=cross" >> $GITHUB_ENV 84 | echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV 85 | 86 | - name: Show command used for Cargo 87 | run: | 88 | echo "cargo command is: ${{ env.CARGO }}" 89 | echo "target flag is: ${{ env.TARGET }}" 90 | 91 | - name: Build 92 | if: matrix.build != 'stable-thumb' 93 | run: ${{ env.CARGO }} build --verbose ${{ env.TARGET }} 94 | 95 | - name: Build docs 96 | if: matrix.build != 'stable-thumb' 97 | run: ${{ env.CARGO }} doc --verbose ${{ env.TARGET }} 98 | 99 | # Our dev dependencies are increasing their MSRV more quickly than we want 100 | # to, so the following are only run on non-pinned targets. 101 | 102 | - name: Build examples 103 | if: matrix.build != 'pinned' && matrix.build != 'stable-thumb' 104 | run: ${{ env.CARGO }} build --manifest-path examples/Cargo.toml --examples 105 | 106 | - name: Run tests 107 | if: matrix.build != 'pinned' && matrix.build != 'stable-thumb' && matrix.build != 'stable-mips' 108 | run: ${{ env.CARGO }} test --verbose --features transducer ${{ env.TARGET }} 109 | 110 | # The mips test runner is quite sluggish, so don't run the full test 111 | # suite there. Unfortunate, but CI times balloon otherwise.
112 | - name: Run tests 113 | if: matrix.build == 'stable-mips' 114 | run: ${{ env.CARGO }} test --verbose --features transducer --lib ${{ env.TARGET }} 115 | 116 | - name: Build without default features 117 | if: matrix.build != 'pinned' 118 | run: ${{ env.CARGO }} build --verbose --no-default-features ${{ env.TARGET }} 119 | 120 | - name: Build docs without default features 121 | if: matrix.build != 'pinned' 122 | run: ${{ env.CARGO }} doc --verbose --lib --no-default-features ${{ env.TARGET }} 123 | 124 | - name: Run tests without default features 125 | if: matrix.build != 'pinned' && matrix.build != 'stable-thumb' 126 | run: ${{ env.CARGO }} test --verbose --lib --no-default-features ${{ env.TARGET }} 127 | 128 | - name: Compile debug tool 129 | if: matrix.build != 'pinned' && matrix.build != 'stable-thumb' 130 | run: ${{ env.CARGO }} build --verbose --manifest-path regex-cli/Cargo.toml ${{ env.TARGET }} 131 | 132 | - name: Test benchmarks 133 | if: matrix.build != 'pinned' && matrix.build != 'stable-thumb' 134 | run: ${{ env.CARGO }} bench --manifest-path bench/Cargo.toml --verbose ${{ env.TARGET }} -- --test 135 | 136 | rustfmt: 137 | name: rustfmt 138 | runs-on: ubuntu-18.04 139 | steps: 140 | - name: Checkout repository 141 | uses: actions/checkout@v1 142 | with: 143 | fetch-depth: 1 144 | - name: Install Rust 145 | uses: actions-rs/toolchain@v1 146 | with: 147 | toolchain: stable 148 | override: true 149 | profile: minimal 150 | components: rustfmt 151 | - name: Check formatting 152 | run: | 153 | cargo fmt --all -- --check 154 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /tmp 2 | /target 3 | /examples/target 4 | /regex-automata-debug/target 5 | /regex-cli/target 6 | /regex-test/target 7 | tags 8 | /Cargo.lock 9 | /examples/Cargo.lock 10 | BREADCRUMBS 11 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | This project is dual-licensed under the Unlicense and MIT licenses. 2 | 3 | You may use this code under the terms of either license. 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "regex-automata" 3 | version = "0.2.0" #:version 4 | authors = ["Andrew Gallant <jamslam@gmail.com>"] 5 | description = "Automata construction and matching using regular expressions." 6 | documentation = "https://docs.rs/regex-automata" 7 | homepage = "https://github.com/BurntSushi/regex-automata" 8 | repository = "https://github.com/BurntSushi/regex-automata" 9 | readme = "README.md" 10 | keywords = ["regex", "dfa", "automata", "automaton", "nfa"] 11 | license = "Unlicense/MIT" 12 | categories = ["text-processing"] 13 | exclude = [ 14 | "/.github", "/scripts/*", "/regex-cli", "/regex-test", 15 | ] 16 | autotests = false 17 | autoexamples = false 18 | edition = "2018" 19 | resolver = "2" 20 | 21 | [workspace] 22 | members = ["bench", "examples", "regex-cli", "regex-test"] 23 | 24 | [lib] 25 | bench = false 26 | 27 | [features] 28 | # WARNING: The features below were assembled quickly without much thought. 29 | # They might not work as you expect. The safest configuration is the default 30 | # configuration.
31 | default = ["std", "alloc", "syntax"] 32 | std = [] 33 | alloc = ["syntax"] 34 | transducer = ["fst"] 35 | logging = ["log"] 36 | syntax = ["regex-syntax"] 37 | 38 | # WARNING: The features below are in a very rough draft form, which is why 39 | # they are all commented out. I'm still working through the crate feature 40 | # design, planned for the regex-automata 0.3 release. 41 | 42 | # TODO: These features need to be fleshed out more, actually implemented and 43 | # then tested. Also, add 'alloc' and 'std' features to regex-syntax before 44 | # doing so. 45 | #default = ["std", "dfa", "syntax", "unicode", "regex-syntax/default"] 46 | #std = ["alloc", "memchr/std"] 47 | # TODO: Should this also imply regex-syntax/alloc? Will that turn into a no-op 48 | # if regex-syntax isn't enabled as a dependency? Do we need a separate 49 | # 'alloc_nosyntax' feature to enable alloc features without bringing in 50 | # regex-syntax? Sigh. 51 | #alloc = [] 52 | #logging = ["log"] 53 | #transducer = ["fst"] 54 | 55 | # When enabled, the 'dfa' sub-module will be available. Note that if 'dfa' is 56 | # enabled but 'alloc' is not, then only DFA deserialization and search will be 57 | # available. DFA construction requires the 'alloc' and 'syntax' features to be 58 | # enabled. 59 | #dfa = [] 60 | #syntax = ["regex-syntax"] 61 | 62 | ## Enables all Unicode features. This expands if new Unicode features are added. 63 | #unicode = [ 64 | # "unicode-age", 65 | # "unicode-bool", 66 | # "unicode-case", 67 | # "unicode-gencat", 68 | # "unicode-perl", 69 | # "unicode-script", 70 | # "unicode-segment", 71 | # "regex-syntax/unicode", 72 | #] 73 | ## Enables use of the `Age` property, e.g., `\p{Age:3.0}`. 74 | #unicode-age = ["regex-syntax/unicode-age"] 75 | ## Enables use of a smattering of boolean properties, e.g., `\p{Emoji}`. 76 | #unicode-bool = ["regex-syntax/unicode-bool"] 77 | ## Enables Unicode-aware case insensitive matching, e.g., `(?i)β`. 78 | #unicode-case = ["regex-syntax/unicode-case"] 79 | ## Enables Unicode general categories, e.g., `\p{Letter}` or `\pL`. 80 | #unicode-gencat = ["regex-syntax/unicode-gencat"] 81 | ## Enables Unicode-aware Perl classes corresponding to `\w`, `\s` and `\d`. 82 | #unicode-perl = ["regex-syntax/unicode-perl"] 83 | ## Enables Unicode scripts and script extensions, e.g., `\p{Greek}`. 84 | #unicode-script = ["regex-syntax/unicode-script"] 85 | ## Enables Unicode segmentation properties, e.g., `\p{gcb=Extend}`. 86 | #unicode-segment = ["regex-syntax/unicode-segment"] 87 | 88 | [dependencies] 89 | fst = { version = "0.4.5", optional = true } 90 | log = { version = "0.4.14", optional = true } 91 | memchr = { version = "2.4.0", default-features = false } 92 | regex-syntax = { version = "0.6.24", optional = true } 93 | 94 | [dev-dependencies] 95 | bstr = { version = "0.2.16", default-features = false, features = ["std"] } 96 | quickcheck = { version = "1.0.3", default-features = false } 97 | regex-syntax = "0.6.16" 98 | regex-test = { version = "*", path = "regex-test" } 99 | 100 | [[test]] 101 | path = "tests/tests.rs" 102 | name = "integration" 103 | 104 | [profile.dev] 105 | # Running tests takes too long in debug mode, so we forcefully always build 106 | # with optimizations. Unfortunate, but, ¯\_(ツ)_/¯. 107 | # 108 | # It's counter-intuitive that this needs to be set on dev *and* test, but 109 | # it's because the tests that take a long time to run are run as integration 110 | # tests in a separate crate. 
The test.opt-level setting won't apply there, so 111 | # we need to set the opt-level across the entire build. 112 | opt-level = 3 113 | debug = true 114 | 115 | [profile.test] 116 | opt-level = 3 117 | debug = true 118 | 119 | [profile.release] 120 | debug = true 121 | 122 | [profile.bench] 123 | debug = true 124 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Andrew Gallant 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /PLANS.md: -------------------------------------------------------------------------------- 1 | pattern_limit should not be defined inside nfa::thompson, but rather at the 2 | top-level. 3 | 4 | ----- 5 | 6 | Main problem right now is exemplified by the set60 and set70 failing tests. In 7 | particular, when finding the starting position while matching multiple regexes 8 | simultaneously, the reverse search is messed up. The reverse search doesn't 9 | depend on which regex matched in the forward direction, which means it won't 10 | always find the correct starting location. Unfortunately, the only way to 11 | fix this, as far as I can tell, is to add a group of start states for every 12 | regex in the DFA. Then once we do the reverse search, we need to choose the 13 | correct start state based on which regex matched in the forward direction. 14 | 15 | This is a nasty change. 16 | 17 | So it looks like this only applies when doing an overlapping search in reverse 18 | to find the start of a match. That means we should make this configurable 19 | but enable it by default for the reverse automata. It should be configurable 20 | so that folks can construct a regex that doesn't have the ability to do 21 | overlapping searches correctly. If an overlapping search is attempted with 22 | a reverse automaton that lacks starting states for each pattern, then the 23 | implementation should panic. 24 | 25 | BUT! It is also convenient to provide this option in general for folks that 26 | want a DFA that can match any pattern while also being able to match a specific 27 | pattern. 28 | 29 | Straw man: 30 | 31 | * Update dense::Config to have a `starts_for_each_pattern` option. It should 32 | be disabled by default.
33 | * In `RegexBuilder::build_many_with_size` tweak the reverse DFA configuration 34 | to have the aforementioned option enabled. 35 | * It would be interesting to add new APIs to `Regex` that support matching 36 | specific patterns, but I think this is a complication. If we did want to do 37 | this, then we should just add it to the `_at` variants and leave the rest of 38 | the API untouched. 39 | * Add a `pattern_id: Option<PatternID>` parameter to each of the five 40 | `*_at` methods on the `dfa::Automaton` trait. A value of `None` retains the 41 | existing behavior. A `Some` value means that the starting state for that 42 | specific pattern must be chosen, which in turn implies an anchored search. 43 | (This means `starts_for_each_pattern` has utility for single-pattern DFAs 44 | since it makes it possible to build a DFA that can do both unanchored and 45 | anchored searches.) 46 | * Thread this new parameter down into the various functions in `dfa::search` 47 | all the way down into `init_fwd` and `init_rev`. These functions will then 48 | pass it to `dfa.start_state_{forward,reverse}`. 49 | * This is where things get gruesome since we now need to completely re-work how 50 | start states are represented in dense and sparse DFAs _and_ it needs to be 51 | configurable. It looks like the `Start` type from `dfa::automaton` can 52 | basically remain unchanged, since it still represents one of the four 53 | possible starting states that will need to be applied for every pattern. 54 | * For `dfa::dense`, change `StartList` to `StartTable`. Currently, its only 55 | header is the state ID count, which is always 4. We'll want to change this 56 | to the stride and add a new header value that encodes the number of patterns. 57 | When the number of patterns is zero, then existing behavior is preserved and 58 | represents the case where `starts_for_each_pattern` is disabled (or in the 59 | case of an empty DFA). When non-zero, a table of starting state IDs is 60 | encoded with each row corresponding to the 4 starting states for each 61 | pattern. Before this table (even if it's empty), the 4 starting states for 62 | the entire DFA are encoded. 63 | * For `dfa::sparse`, do the same as above. They are essentially the same right 64 | now anyway, with the only difference being that sparse DFAs use `&[u8]` 65 | instead of `&[S]` (because sparse DFAs don't have any alignment 66 | requirements). 67 | * Modify `DFA::empty` to accept a `starts_for_each_pattern` bool that, when 68 | true, creates a start table with the header, the start states for the entire 69 | DFA and a row of start states for each pattern. When false, no rows are 70 | added. 71 | * Expose whether there are starting states for each pattern via a predicate 72 | on the DFA. 73 | * Modify the determinizer's `add_starts` method to basically do what it does, 74 | but also do it for each pattern when the DFA is configured for it. It should 75 | continue to reuse states as appropriate or not generate new states if they 76 | aren't needed. This will want to use the `NFA::start_pattern` method, which 77 | provides the starting NFA state ID for the given pattern. 78 | * Fix the dense->sparse conversion. At this point, this piece should be fairly 79 | straight-forward since the sparse representation of starting states is 80 | basically identical to the dense representation. 81 | 82 | At this point, I think the bug should resolve itself. 83 | 84 | ^^^^ DONE! IT WORKS!
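For posterity, here is a rough sketch of how the finished feature is meant to
be used. This is a plan-level sketch, not a committed API: the
`starts_for_each_pattern` knob and the `pattern_id` parameter are the straw-man
names from above, and the exact shape of the `*_at` methods (e.g., the
prefilter argument below) may still change.

    use regex_automata::{dfa::{dense, Automaton}, PatternID};

    fn example() -> Result<(), Box<dyn std::error::Error>> {
        // Build a multi-pattern DFA with a group of start states per
        // pattern (the straw-man config knob from above).
        let dfa = dense::Builder::new()
            .configure(dense::Config::new().starts_for_each_pattern(true))
            .build_many(&[r"[a-z]+", r"[0-9]+"])?;
        let haystack = b"abc 123";
        // Passing `Some(..)` for the pattern ID picks that pattern's start
        // state, which in turn implies an anchored search for it.
        let m = dfa
            .find_leftmost_fwd_at(
                None, // no prefilter scanner
                Some(PatternID::must(1)),
                haystack,
                4,
                haystack.len(),
            )?
            .expect("pattern 1 should match '123'");
        assert_eq!(1, m.pattern().as_usize());
        assert_eq!(7, m.offset()); // a half match: only the end offset
        Ok(())
    }

This is also the machinery that lets the overlapping reverse search pick the
start state for whichever pattern matched in the forward direction.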
85 | 86 | ----- 87 | 88 | 89 | Add top-level SyntaxConfig (or some such) that has all of the regex-syntax 90 | options forwarded, but with automata oriented docs. Then use this for all of 91 | the engines instead of having to repeat every option for every builder. 92 | 93 | ----- 94 | 95 | These produce different results. PCRE2 looks correct. Basically, we should be 96 | using the context around the `at` position correctly, which we aren't doing 97 | right now. Seems tricky to get right, particularly when confirming the match 98 | with a reverse DFA. 99 | 100 | Maybe our 'at' functions need to take a full range... Sigh. This is indeed what 101 | RE2 does. GAH. 102 | 103 | fn main() { 104 | let re = regex::Regex::new(r"(?-u)\b\sbar").unwrap(); 105 | let s = "foo bar baz"; 106 | println!("{:?}", re.find_at(s, 3).map(|m| m.as_str())); 107 | 108 | let re = pcre2::bytes::Regex::new(r"\b\sbar").unwrap(); 109 | let s = "foo bar baz"; 110 | println!("{:?}", re.find_at(s.as_bytes(), 3).unwrap()); 111 | } 112 | 113 | ^^^^ This is fixed now, but we still need to find a way to add test coverage 114 | for "context" searches. It'd be nice to do this automatically, but we'll 115 | probably just add a new 'context = [start, end]' option. 116 | 117 | ----- 118 | 119 | 120 | * Create regex-test crate, based on glob-test. Try to anticipate the needs for 121 | the full regex test suite. 122 | * See if we can clean up tests. 123 | * Provide a way to mark a test as expensive. 124 | * Provide a way to test is_match_at and find_at. 125 | * Test shortest_match_at too? Huge pain. Add tests for it. 126 | * Port ALL tests from the regex crate. Will probably need a way to mark a 127 | test as skipped. 128 | * Document tests better. 129 | * Find a way to remove byteorder dependency. 130 | * Reorganize crate API: 131 | * Have errors contain `Box` instead of `String`. 132 | * Make errors non-exhaustive. 133 | * Audit `StateID` trait for safety. 134 | * Brainstorm hard about `DFA` trait and the fact that DenseDFA and SparseDFA 135 | have inefficient implementations of some methods. Maybe use multiple 136 | traits? Answer: get rid of premultiply/classes knobs and just enable 137 | them by default. Should remove a huge amount of code. 138 | * Check whether `unsafe` is really needed to eliminate bounds checks. Use 139 | micro-benchmarks and bigger CLI workloads using `regex-automata-debug`. 140 | * Re-write module docs for `dfa` as they are no longer top-level. Keep most. 141 | * Retain any pertinent top-level crate docs, but don't rewrite yet. 142 | * Clean up builders if we can. e.g., Determinizer, minimizer, it's all a mess 143 | right now. 144 | * Clean up and add 'always_match' and 'never_match' constructors for every 145 | regex engine. 146 | * See about supporting ^, $, \A, \z, \b and \B in DFAs. Do the non-Unicode 147 | version of \b unfortunately. Carefully scrutinize how the regex crate's 148 | lazy DFA does it and try to make it comprehensible. Done! Except for the 149 | part about making it comprehensible. 150 | * Rethink prefilters? 151 | * Add `regex-automata-generate` CLI tool. This should just be a copy of 152 | the `ucd-generate dfa` and `ucd-generate regex` commands. 153 | 154 | Then build new public `nfa` sub-module. 155 | * For Unicode \b, generate \w DFA (forwards and reverse) and embed it into 156 | source for fast checking. That way, we don't need to ever do explicit UTF-8 157 | decoding anywhere. Yay. 158 | 159 | Then `lazy` sub-module. 160 | 161 | Then `onepass`. 162 | 163 | Then `jit`.
164 | 165 | ... and beyond? CRAZY. But it can be done! Build STRONG base layers. 166 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | * Consider refactoring the NFA representation such that it can be instantly 2 | loaded from a `&[u8]`, just like a sparse DFA. Main downside is that this 3 | could negatively impact using the NFA with deserialization costs. Before 4 | doing this, we should write PikeVM and backtracking implementations so that 5 | they can be benchmarked. 6 | * Add captures to NFA. 7 | * Once we're happy, re-organize the public API such that NFAs are exported 8 | and usable on their own. 9 | 10 | * Investigate why NFA shrinking seems to produce bigger DFAs after 11 | determinization, even though it makes determinization substantially 12 | faster. This might be because of its use of sparse NFA states, which have 13 | a lower constant overhead associated with them. 14 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <http://unlicense.org/> 25 | -------------------------------------------------------------------------------- /bench/.gitignore: -------------------------------------------------------------------------------- 1 | log 2 | /target 3 | -------------------------------------------------------------------------------- /bench/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "regex-automata-bench" 4 | version = "0.0.1" 5 | authors = ["Andrew Gallant <jamslam@gmail.com>"] 6 | description = "Criterion benchmark suite for regex-automata." 7 | homepage = "https://github.com/BurntSushi/regex-automata" 8 | repository = "https://github.com/BurntSushi/regex-automata" 9 | license = "Unlicense/MIT" 10 | workspace = ".." 11 | edition = "2018" 12 | 13 | [lib] 14 | bench = false 15 | 16 | [[bench]] 17 | name = "regex-automata" 18 | harness = false 19 | path = "src/bench.rs" 20 | 21 | [dependencies] 22 | criterion = "0.3.4" 23 | regex-automata = { version = "*", path = ".."
} 24 | -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-en-small-ascii.txt: -------------------------------------------------------------------------------- 1 | Presented by IM Pictures 2 | Produced by Shin Cine 3 | In association with MVP Venture Capital and Cinema Service 4 | Jeon Ji-hyun Cha Tae-hyun 5 | My Sassy Girl 6 | Exactly two years ago today, she and I buried a time capsule here. 7 | We promised to meet here two years later, but she hasn't come yet. 8 | I'm going to wait. 9 | Here we go. 10 | Please, don't move. 11 | One, two... 12 | Wait a minute. 13 | Hello? 14 | Oh, auntie. 15 | Sorry, I'm on my way. 16 | I'm really sorry. 17 | Yes, I'm coming. 18 | I'm having my photo taken. 19 | Bye. 20 | Are you ready? 21 | Here we go. 22 | One, two... 23 | My parents wanted a daughter, so they raised me like one. 24 | So I thought I was a girl until I was seven. 25 | I had to go to the women's public bath, too. 26 | The older I got, 27 | I thought my penis would get smaller and disappear. 28 | But it was the opposite. 29 | First Half 30 | He hasn't changed at all. 31 | No, I'm a real man now. 32 | Hey, asshole. 33 | Think clerical work in the army makes you a man? 34 | You irritate me! 35 | Give me a break, asshole. 36 | My job was tougher than you could imagine. 37 | Hey! 38 | I worked near the DMZ. 39 | Who are you kid -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-en-tiny-ascii.txt: -------------------------------------------------------------------------------- 1 | Presented by IM Pictures 2 | Produced by Shi -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-ru-small-utf8.txt: -------------------------------------------------------------------------------- 1 | Рэй МИЛЛАНД, Энтони КУИН, Дебра ПАЖЕТ в фильме БЕРЕГ РЕКИ 2 | в фильме также снимались: 3 | Гарри КЭРИ-мл., Чабби ДЖОНСОН, Байрон ФУЛДЖЕ, Том МакКи, Фрэнк ГЕРСТЛ сценарий Гарольда Джэкоба СМИТА и Джэймса ЛЕЙСЕСТЕРА по рассказу Гарольда Джэкоба СМИТА "Самая высокая гора" 4 | режиссер Аллан ДВАН 5 | - А вы выбрали жаркий денек, мистер. 6 | - Я всегда так делаю. 7 | - Полный бак? 8 | - Еще бы! 9 | А у вас мощная "тачка", как я погляжу. 10 | - Могу продать ее вам. 11 | - Нет, спасибо! 12 | - Собираетесь немного поохотиться? 13 | - Ну, я надеюсь на это. 14 | Вы знаете, не проиживает тут поблизости парень по имени Кэмеро -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-ru-tiny-utf8.txt: -------------------------------------------------------------------------------- 1 | Рэй МИЛЛАНД, Энтони КУ -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-zh-small-utf8.txt: -------------------------------------------------------------------------------- 1 | 我去拜托旅馆的人 2 | 出去喝就行了 3 | 我去拜托长井找工作 4 | 他说帮我问问他哥哥的公司 5 | 不知道会否成事 6 | 既然他肯答应,一定有结果的 7 | 真羡慕他至今还是优哉悠哉 8 | 叔叔,你老是偷听人家拉琴 9 | 小缝,你的颤音有进步了 10 | 我才不理你 11 | 叔叔你知道 12 | 爷爷找你来谈什么吗? 13 | 不知道 14 | 你的亲事 15 | 我去看看 16 | 走好 17 | 加油 18 | 你已经30岁了吧? 19 | 是的 20 | 身体健壮吧? 21 | 两三年来没有感冒 22 | 脑袋还算不笨吧? 23 | 是的 24 | 游手好闲太可惜了 25 | 他叫什么名字呢... 26 | 那个常去找你聊天的男人 27 | 我曾经见过他一两次 28 | 平冈吗? 29 | 那个人不算上乘人材... 30 | 听说帝大毕业后就去了外地 31 | 如今因为失败而回来 32 | 为什么? 33 | 想要为了温饱而工作吧 34 | 你在这里 35 | 我的梳子好像掉在这附近 36 | 你还是一样迷迷糊糊 37 | 坐吧,我陪你聊聊天 38 | 天气不错 39 | 去赏花如何? 
40 | 等你真的想去再说 41 | -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-zh-tiny-utf8.txt: -------------------------------------------------------------------------------- 1 | 你突然来信说最近要搬到这里 2 | -------------------------------------------------------------------------------- /bench/data/sherlock-holmes-small.txt: -------------------------------------------------------------------------------- 1 | Mr. Sherlock Holmes, who was usually very late in the mornings, save 2 | upon those not infrequent occasions when he was up all night, was seated 3 | at the breakfast table. I stood upon the hearth-rug and picked up the 4 | stick which our visitor had left behind him the night before. It was a 5 | fine, thick piece of wood, bulbous-headed, of the sort which is known as 6 | a "Penang lawyer." Just under the head was a broad silver band nearly 7 | an inch across. "To James Mortimer, M.R.C.S., from his friends of the 8 | C.C.H.," was engraved upon it, with the date "1884." It was just such a 9 | stick as the old-fashioned family practitioner used to carry--dignified, 10 | solid, and reassuring. 11 | -------------------------------------------------------------------------------- /bench/data/sherlock-holmes-tiny.txt: -------------------------------------------------------------------------------- 1 | Mr. Sherlock Holmes, who was usually very late in the mornings, save 2 | -------------------------------------------------------------------------------- /bench/src/bench.rs: -------------------------------------------------------------------------------- 1 | use criterion::{ 2 | criterion_group, criterion_main, Bencher, Criterion, Throughput, 3 | }; 4 | use regex_automata::dfa::{dense, regex}; 5 | use regex_automata::nfa::thompson; 6 | 7 | use crate::inputs::*; 8 | 9 | mod inputs; 10 | 11 | fn is_match(c: &mut Criterion) { 12 | let corpus = SHERLOCK_HUGE; 13 | define(c, "is-match", "sherlock-huge", corpus, move |b| { 14 | let re = regex::Builder::new().build(r"\p{Greek}").unwrap(); 15 | // let re = re.forward().to_sparse().unwrap(); 16 | b.iter(|| { 17 | assert!(!re.is_match(corpus)); 18 | }); 19 | }); 20 | 21 | // let corpus = OPEN_ZH_SMALL; 22 | let corpus = SHERLOCK_SMALL; 23 | define(c, "is-match", "sherlock-small", corpus, move |b| { 24 | let re = regex::Builder::new().build(r"\p{Greek}").unwrap(); 25 | // let re = re.forward().to_sparse().unwrap(); 26 | b.iter(|| { 27 | assert!(!re.is_match(corpus)); 28 | }); 29 | }); 30 | 31 | let corpus = SHERLOCK_TINY; 32 | define(c, "is-match", "sherlock-tiny", corpus, move |b| { 33 | let re = regex::Builder::new().build(r"\p{Greek}").unwrap(); 34 | b.iter(|| { 35 | assert!(!re.is_match(corpus)); 36 | }); 37 | }); 38 | 39 | let corpus = EMPTY; 40 | define(c, "is-match", "empty", corpus, move |b| { 41 | let re = regex::Builder::new().build(r"\p{Greek}").unwrap(); 42 | b.iter(|| { 43 | assert!(!re.is_match(corpus)); 44 | }); 45 | }); 46 | } 47 | 48 | // \w has 128,640 codepoints. 
49 | fn compile_unicode_word(c: &mut Criterion) { 50 | define_compile(c, "unicode-word", r"\w"); 51 | define_compile_reverse(c, "unicode-word", r"\w"); 52 | } 53 | 54 | // \p{Other_Math} has 1,362 codepoints 55 | fn compile_unicode_other_math(c: &mut Criterion) { 56 | define_compile(c, "unicode-other-math", r"\p{Other_Math}"); 57 | } 58 | 59 | // \p{Other_Uppercase} has 120 codepoints 60 | fn compile_unicode_other_uppercase(c: &mut Criterion) { 61 | define_compile( 62 | c, 63 | "unicode-other-uppercase", 64 | r"\p{any}*?\p{Other_Uppercase}", 65 | ); 66 | } 67 | 68 | fn compile_muammar(c: &mut Criterion) { 69 | define_compile( 70 | c, 71 | "muammar", 72 | r"\p{any}*?M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", 73 | ); 74 | } 75 | 76 | fn define_compile(c: &mut Criterion, group_name: &str, pattern: &'static str) { 77 | let group = format!("fwd-compile/{}", group_name); 78 | define(c, &group, "default", &[], move |b| { 79 | b.iter(|| { 80 | let result = dense::Builder::new() 81 | .configure(dense::Config::new().anchored(true)) 82 | .build(pattern); 83 | assert!(result.is_ok()); 84 | }); 85 | }); 86 | } 87 | 88 | fn define_compile_reverse( 89 | c: &mut Criterion, 90 | group_name: &str, 91 | pattern: &'static str, 92 | ) { 93 | let group = format!("rev-compile/{}", group_name); 94 | define(c, &group, "default", &[], move |b| { 95 | b.iter(|| { 96 | let result = dense::Builder::new() 97 | .configure(dense::Config::new().anchored(true)) 98 | .thompson(thompson::Config::new().reverse(true)) 99 | .build(pattern); 100 | assert!(result.is_ok()); 101 | }); 102 | }); 103 | } 104 | 105 | fn define( 106 | c: &mut Criterion, 107 | group_name: &str, 108 | bench_name: &str, 109 | corpus: &[u8], 110 | bench: impl FnMut(&mut Bencher) + 'static, 111 | ) { 112 | c.benchmark_group(group_name) 113 | .throughput(Throughput::Bytes(corpus.len() as u64)) 114 | .sample_size(15) 115 | .warm_up_time(std::time::Duration::from_millis(500)) 116 | .measurement_time(std::time::Duration::from_secs(2)) 117 | .bench_function(bench_name, bench); 118 | } 119 | 120 | criterion_group!(g1, is_match); 121 | criterion_group!(g2, compile_unicode_other_math); 122 | criterion_group!(g3, compile_unicode_other_uppercase); 123 | criterion_group!(g4, compile_muammar); 124 | criterion_group!(g5, compile_unicode_word); 125 | criterion_main!(g1, g2, g3, g4, g5); 126 | -------------------------------------------------------------------------------- /bench/src/inputs.rs: -------------------------------------------------------------------------------- 1 | pub const EMPTY: &'static [u8] = b""; 2 | 3 | pub const SHERLOCK_HUGE: &'static [u8] = 4 | include_bytes!("../data/sherlock-holmes-huge.txt"); 5 | pub const SHERLOCK_SMALL: &'static [u8] = 6 | include_bytes!("../data/sherlock-holmes-small.txt"); 7 | pub const SHERLOCK_TINY: &'static [u8] = 8 | include_bytes!("../data/sherlock-holmes-tiny.txt"); 9 | 10 | // pub const OPEN_ZH_SMALL: &'static [u8] = 11 | // include_bytes!("../data/opensubtitles2018-zh-small-utf8.txt"); 12 | -------------------------------------------------------------------------------- /bench/src/lib.rs: -------------------------------------------------------------------------------- 1 | // This is purposely empty. See src/bench.rs instead. We use src/bench.rs 2 | // to avoid including the same file in multiple build targets. 
3 | -------------------------------------------------------------------------------- /examples/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "regex-automata-examples" 4 | version = "0.0.0" #:version 5 | edition = "2018" 6 | 7 | [dev-dependencies] 8 | fst = "0.4.0" 9 | regex-automata = { version = "*", path = "..", features = ["transducer"] } 10 | 11 | [[example]] 12 | name = "fst" 13 | path = "fst.rs" 14 | -------------------------------------------------------------------------------- /examples/fst.rs: -------------------------------------------------------------------------------- 1 | // To run this example, use: 2 | // 3 | // cargo run --manifest-path examples/Cargo.toml --example fst 4 | 5 | use fst::{IntoStreamer, Set}; 6 | use regex_automata::dfa::dense; 7 | 8 | fn main() -> Result<(), Box<dyn std::error::Error>> { 9 | let set = Set::from_iter(&["FoO", "Foo", "fOO", "foo"])?; 10 | let pattern = r"(?i)foo"; 11 | let config = dense::Config::new().anchored(true); 12 | let dfa = dense::Builder::new().configure(config).build(pattern)?; 13 | 14 | let keys = set.search(&dfa).into_stream().into_strs()?; 15 | assert_eq!(keys, vec!["FoO", "Foo", "fOO", "foo"]); 16 | println!("{:?}", keys); 17 | Ok(()) 18 | } 19 | -------------------------------------------------------------------------------- /regex-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "regex-cli" 4 | version = "0.0.1" 5 | authors = ["Andrew Gallant <jamslam@gmail.com>"] 6 | description = """ 7 | A command line tool for debugging, benchmarking and generating regular 8 | expressions. 9 | """ 10 | documentation = "https://docs.rs/regex-cli" 11 | repository = "https://github.com/BurntSushi/regex-automata" 12 | keywords = ["regex", "cli", "debug", "nfa", "dfa"] 13 | license = "Unlicense/MIT" 14 | categories = ["text-processing"] 15 | autotests = false 16 | edition = "2018" 17 | 18 | [[bin]] 19 | name = "regex-cli" 20 | 21 | [dependencies] 22 | anyhow = "1.0.28" 23 | bstr = { version = "0.2.16", default-features = false, features = ["std"] } 24 | clap = { version = "2.33.0", default-features = false } 25 | memmap2 = "0.3.0" 26 | regex = "1.5.4" 27 | syntax = { package = "regex-syntax", version = "0.6.17" } 28 | tabwriter = "1.2.1" 29 | unicode-width = "0.1.7" 30 | 31 | [dependencies.automata] 32 | package = "regex-automata" 33 | path = ".."
34 | features = ["logging"] 35 | 36 | [dependencies.env_logger] 37 | version = "0.8.4" 38 | default-features = false 39 | features = ["atty", "humantime", "termcolor"] 40 | -------------------------------------------------------------------------------- /regex-cli/src/app.rs: -------------------------------------------------------------------------------- 1 | use crate::cmd; 2 | 3 | const TEMPLATE_ROOT: &'static str = "\ 4 | {bin} {version} 5 | {author} 6 | {about} 7 | USAGE: 8 | {usage} 9 | 10 | TIP: 11 | use -h for short docs and --help for long docs 12 | 13 | SUBCOMMANDS: 14 | {subcommands} 15 | 16 | OPTIONS: 17 | {unified}"; 18 | 19 | const TEMPLATE_SUBCOMMAND: &'static str = "\ 20 | USAGE: 21 | {usage} 22 | 23 | TIP: 24 | use -h for short docs and --help for long docs 25 | 26 | SUBCOMMANDS: 27 | {subcommands} 28 | 29 | OPTIONS: 30 | {unified}"; 31 | 32 | const TEMPLATE_LEAF: &'static str = "\ 33 | USAGE: 34 | {usage} 35 | 36 | TIP: 37 | use -h for short docs and --help for long docs 38 | 39 | ARGS: 40 | {positionals} 41 | 42 | OPTIONS: 43 | {unified}"; 44 | 45 | const ABOUT: &'static str = " 46 | regex-cli is a tool for interacting with regular expressions on the command 47 | line. It is useful as a debugging aide, an ad hoc benchmarking tool and as a 48 | way to conveniently pre-compile and embed regular expressions into Rust 49 | code. 50 | "; 51 | 52 | /// Convenience type alias for the Clap app type that we use. 53 | pub type App = clap::App<'static, 'static>; 54 | 55 | /// Convenience type alias for the Clap argument result type that we use. 56 | pub type Args = clap::ArgMatches<'static>; 57 | 58 | /// Convenience function for creating a new Clap sub-command. 59 | /// 60 | /// This should be used for sub-commands that contain other sub-commands. 61 | pub fn command(name: &'static str) -> App { 62 | clap::SubCommand::with_name(name) 63 | .author(clap::crate_authors!()) 64 | .version(clap::crate_version!()) 65 | .template(TEMPLATE_SUBCOMMAND) 66 | .setting(clap::AppSettings::UnifiedHelpMessage) 67 | } 68 | 69 | /// Convenience function for creating a new Clap sub-command. 70 | /// 71 | /// This should be used for sub-commands that do NOT contain other 72 | /// sub-commands. 73 | pub fn leaf(name: &'static str) -> App { 74 | clap::SubCommand::with_name(name) 75 | .author(clap::crate_authors!()) 76 | .version(clap::crate_version!()) 77 | .template(TEMPLATE_LEAF) 78 | .setting(clap::AppSettings::UnifiedHelpMessage) 79 | } 80 | 81 | /// Convenience function for defining a Clap positional argument with the 82 | /// given name. 83 | pub fn arg(name: &'static str) -> clap::Arg { 84 | clap::Arg::with_name(name) 85 | } 86 | 87 | /// Convenience function for defining a Clap argument with a long flag name 88 | /// that accepts a single value. 89 | pub fn flag(name: &'static str) -> clap::Arg { 90 | clap::Arg::with_name(name).long(name).takes_value(true) 91 | } 92 | 93 | /// Convenience function for defining a Clap argument with a long flag name 94 | /// that accepts no values. i.e., It is a boolean switch. 95 | pub fn switch(name: &'static str) -> clap::Arg { 96 | clap::Arg::with_name(name).long(name) 97 | } 98 | 99 | /// Build the main Clap application. 
100 | pub fn root() -> App { 101 | clap::App::new("regex-cli") 102 | .author(clap::crate_authors!()) 103 | .version(clap::crate_version!()) 104 | .about(ABOUT) 105 | .template(TEMPLATE_ROOT) 106 | .max_term_width(100) 107 | .setting(clap::AppSettings::UnifiedHelpMessage) 108 | .arg(switch("quiet").short("q").global(true).help("Show less output.")) 109 | .subcommand(cmd::debug::define()) 110 | .subcommand(cmd::find::define()) 111 | } 112 | -------------------------------------------------------------------------------- /regex-cli/src/cmd/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod debug; 2 | pub mod find; 3 | -------------------------------------------------------------------------------- /regex-cli/src/main.rs: -------------------------------------------------------------------------------- 1 | #![allow(warnings)] 2 | 3 | mod app; 4 | mod cmd; 5 | mod config; 6 | mod escape; 7 | mod util; 8 | 9 | fn main() -> anyhow::Result<()> { 10 | env_logger::init(); 11 | let args = app::root().get_matches(); 12 | util::run_subcommand(&args, app::root, |cmd, args| match cmd { 13 | "debug" => cmd::debug::run(args), 14 | "find" => cmd::find::run(args), 15 | _ => Err(util::UnrecognizedCommandError.into()), 16 | }) 17 | } 18 | -------------------------------------------------------------------------------- /regex-cli/src/util.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Write}; 2 | 3 | use unicode_width::UnicodeWidthStr; 4 | 5 | use crate::app::{App, Args}; 6 | 7 | /// An error that indicates that a sub-command was seen that was not 8 | /// recognized. 9 | /// 10 | /// This is a sentinel error that is always converted to a panic via 11 | /// run_subcommand. Namely, not handling a defined sub-command is a programmer 12 | /// error. 13 | #[derive(Debug)] 14 | pub struct UnrecognizedCommandError; 15 | 16 | impl std::error::Error for UnrecognizedCommandError {} 17 | 18 | impl std::fmt::Display for UnrecognizedCommandError { 19 | fn fmt(&self, _: &mut std::fmt::Formatter) -> std::fmt::Result { 20 | unreachable!() 21 | } 22 | } 23 | 24 | /// Choose the sub-command of 'args' to run with 'run'. If the sub-command 25 | /// wasn't recognized, then an error is returned. 26 | pub fn run_subcommand( 27 | args: &Args, 28 | app: impl FnOnce() -> App, 29 | run: impl FnOnce(&str, &Args) -> anyhow::Result<()>, 30 | ) -> anyhow::Result<()> { 31 | let (name, args) = args.subcommand(); 32 | if name.is_empty() || args.is_none() { 33 | app().print_help()?; 34 | writeln!(std::io::stdout(), "")?; 35 | return Ok(()); 36 | } 37 | let err = match run(name, args.unwrap()) { 38 | Ok(()) => return Ok(()), 39 | Err(err) => err, 40 | }; 41 | if err.is::<UnrecognizedCommandError>() { 42 | // The programmer should handle all defined sub-commands. 43 | unreachable!("unrecognized command: {}", name); 44 | } 45 | Err(err) 46 | } 47 | 48 | /// Time an arbitrary operation. 49 | pub fn timeit<T>(run: impl FnOnce() -> T) -> (T, std::time::Duration) { 50 | let start = std::time::Instant::now(); 51 | let t = run(); 52 | (t, start.elapsed()) 53 | } 54 | 55 | /// Conveniently time an operation that returns a result by packing the 56 | /// duration into the `Ok` variant. 57 | pub fn timeitr<T, E>( 58 | run: impl FnOnce() -> Result<T, E>, 59 | ) -> Result<(T, std::time::Duration), E> { 60 | let (result, time) = timeit(run); 61 | let t = result?; 62 | Ok((t, time)) 63 | } 64 | 65 | /// Print the given text with an ASCII art underline beneath it.
66 | /// 67 | /// If the given text is empty, then '<empty>' is printed. 68 | pub fn print_with_underline<W: Write>( 69 | mut wtr: W, 70 | text: &str, 71 | ) -> io::Result<()> { 72 | let toprint = if text.is_empty() { "<empty>" } else { text }; 73 | writeln!(wtr, "{}", toprint)?; 74 | writeln!(wtr, "{}", "-".repeat(toprint.width()))?; 75 | Ok(()) 76 | } 77 | 78 | #[derive(Debug)] 79 | pub struct Table { 80 | pairs: Vec<(&'static str, Box<dyn std::fmt::Debug>)>, 81 | } 82 | 83 | impl Table { 84 | pub fn empty() -> Table { 85 | Table { pairs: vec![] } 86 | } 87 | 88 | pub fn add<D: std::fmt::Debug + 'static>( 89 | &mut self, 90 | label: &'static str, 91 | value: D, 92 | ) { 93 | self.pairs.push((label, Box::new(value))); 94 | } 95 | 96 | pub fn print<W: Write>(&self, wtr: W) -> io::Result<()> { 97 | let mut wtr = tabwriter::TabWriter::new(wtr) 98 | .alignment(tabwriter::Alignment::Right); 99 | for (label, value) in self.pairs.iter() { 100 | writeln!(wtr, "{}:\t{:?}", label, value)?; 101 | } 102 | wtr.flush() 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /regex-test/COPYING: -------------------------------------------------------------------------------- 1 | This project is dual-licensed under the Unlicense and MIT licenses. 2 | 3 | You may use this code under the terms of either license. 4 | -------------------------------------------------------------------------------- /regex-test/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "regex-test" 4 | version = "0.0.0" #:version 5 | authors = ["Andrew Gallant <jamslam@gmail.com>"] 6 | description = """ 7 | Infrastructure for testing regexes. 8 | 9 | You probably don't want to use this crate unless you're working on a regex 10 | implementation. 11 | """ 12 | documentation = "https://docs.rs/regex-test" 13 | repository = "https://github.com/BurntSushi/regex-automata/tree/master/regex-test" 14 | readme = "README.md" 15 | keywords = ["regex", "regexp", "dfa", "automata", "test"] 16 | license = "Unlicense/MIT" 17 | edition = "2018" 18 | 19 | [lib] 20 | name = "regex_test" 21 | bench = false 22 | 23 | [dependencies] 24 | anyhow = "1.0.27" 25 | bstr = { version = "0.2.16", default-features = false, features = ["std", "serde1"] } 26 | serde = { version = "1.0.105", features = ["derive"] } 27 | toml = "0.5.6" 28 | -------------------------------------------------------------------------------- /regex-test/LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Andrew Gallant 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /regex-test/UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <http://unlicense.org/> 25 | -------------------------------------------------------------------------------- /regex-test/src/escape.rs: -------------------------------------------------------------------------------- 1 | #![allow(dead_code)] 2 | 3 | use std::ascii; 4 | use std::str; 5 | 6 | use bstr::{ByteSlice, ByteVec}; 7 | 8 | pub fn nice_raw_bytes(bytes: &[u8]) -> String { 9 | match str::from_utf8(bytes) { 10 | Ok(s) => s.to_string(), 11 | Err(_) => escape_bytes(bytes), 12 | } 13 | } 14 | 15 | pub fn escape_bytes(bytes: &[u8]) -> String { 16 | let escaped = bytes 17 | .iter() 18 | .flat_map(|&b| ascii::escape_default(b)) 19 | .collect::<Vec<u8>>(); 20 | String::from_utf8(escaped).unwrap() 21 | } 22 | 23 | pub fn hex_bytes(bytes: &[u8]) -> String { 24 | bytes.iter().map(|&b| format!(r"\x{:02X}", b)).collect() 25 | } 26 | 27 | pub fn escape_default(s: &str) -> String { 28 | s.chars().flat_map(|c| c.escape_default()).collect() 29 | } 30 | 31 | pub fn escape(bytes: &[u8]) -> String { 32 | let mut escaped = String::new(); 33 | for (s, e, ch) in bytes.char_indices() { 34 | if ch == '\u{FFFD}' { 35 | for b in bytes[s..e].bytes() { 36 | escape_byte(b, &mut escaped); 37 | } 38 | } else { 39 | escape_char(ch, &mut escaped); 40 | } 41 | } 42 | escaped 43 | } 44 | 45 | pub fn unescape<B: AsRef<str>>(s: B) -> Vec<u8> { 46 | #[derive(Clone, Copy, Eq, PartialEq)] 47 | enum State { 48 | /// The state after seeing a `\`. 49 | Escape, 50 | /// The state after seeing a `\x`. 51 | HexFirst, 52 | /// The state after seeing a `\x[0-9A-Fa-f]`. 53 | HexSecond(char), 54 | /// Default state.
55 | Literal, 56 | } 57 | 58 | let mut bytes = vec![]; 59 | let mut state = State::Literal; 60 | for c in s.as_ref().chars() { 61 | match state { 62 | State::Escape => match c { 63 | '\\' => { 64 | bytes.push(b'\\'); 65 | state = State::Literal; 66 | } 67 | 'n' => { 68 | bytes.push(b'\n'); 69 | state = State::Literal; 70 | } 71 | 'r' => { 72 | bytes.push(b'\r'); 73 | state = State::Literal; 74 | } 75 | 't' => { 76 | bytes.push(b'\t'); 77 | state = State::Literal; 78 | } 79 | 'x' => { 80 | state = State::HexFirst; 81 | } 82 | c => { 83 | bytes.push_char('\\'); 84 | bytes.push_char(c); 85 | state = State::Literal; 86 | } 87 | }, 88 | State::HexFirst => match c { 89 | '0'..='9' | 'A'..='F' | 'a'..='f' => { 90 | state = State::HexSecond(c); 91 | } 92 | c => { 93 | bytes.push_char('\\'); 94 | bytes.push_char('x'); 95 | bytes.push_char(c); 96 | state = State::Literal; 97 | } 98 | }, 99 | State::HexSecond(first) => match c { 100 | '0'..='9' | 'A'..='F' | 'a'..='f' => { 101 | let ordinal = format!("{}{}", first, c); 102 | let byte = u8::from_str_radix(&ordinal, 16).unwrap(); 103 | bytes.push_byte(byte); 104 | state = State::Literal; 105 | } 106 | c => { 107 | bytes.push_char('\\'); 108 | bytes.push_char('x'); 109 | bytes.push_char(first); 110 | bytes.push_char(c); 111 | state = State::Literal; 112 | } 113 | }, 114 | State::Literal => match c { 115 | '\\' => { 116 | state = State::Escape; 117 | } 118 | c => { 119 | bytes.push_char(c); 120 | } 121 | }, 122 | } 123 | } 124 | match state { 125 | State::Escape => bytes.push_char('\\'), 126 | State::HexFirst => bytes.push_str("\\x"), 127 | State::HexSecond(c) => { 128 | bytes.push_char('\\'); 129 | bytes.push_char('x'); 130 | bytes.push_char(c); 131 | } 132 | State::Literal => {} 133 | } 134 | bytes 135 | } 136 | 137 | /// Adds the given codepoint to the given string, escaping it if necessary. 138 | fn escape_char(cp: char, into: &mut String) { 139 | if cp.is_ascii() { 140 | escape_byte(cp as u8, into); 141 | } else { 142 | into.push(cp); 143 | } 144 | } 145 | 146 | /// Adds the given byte to the given string, escaping it if necessary. 
147 | fn escape_byte(byte: u8, into: &mut String) { 148 | match byte { 149 | 0x21..=0x5B | 0x5D..=0x7D => into.push(byte as char), 150 | b'\n' => into.push_str(r"\n"), 151 | b'\r' => into.push_str(r"\r"), 152 | b'\t' => into.push_str(r"\t"), 153 | b'\\' => into.push_str(r"\\"), 154 | _ => into.push_str(&format!(r"\x{:02X}", byte)), 155 | } 156 | } 157 | 158 | #[cfg(test)] 159 | mod tests { 160 | use super::{escape, unescape}; 161 | 162 | fn b(bytes: &'static [u8]) -> Vec<u8> { 163 | bytes.to_vec() 164 | } 165 | 166 | #[test] 167 | fn empty() { 168 | assert_eq!(b(b""), unescape(r"")); 169 | assert_eq!(r"", escape(b"")); 170 | } 171 | 172 | #[test] 173 | fn backslash() { 174 | assert_eq!(b(b"\\"), unescape(r"\\")); 175 | assert_eq!(r"\\", escape(b"\\")); 176 | } 177 | 178 | #[test] 179 | fn nul() { 180 | assert_eq!(b(b"\x00"), unescape(r"\x00")); 181 | assert_eq!(r"\x00", escape(b"\x00")); 182 | } 183 | 184 | #[test] 185 | fn nl() { 186 | assert_eq!(b(b"\n"), unescape(r"\n")); 187 | assert_eq!(r"\n", escape(b"\n")); 188 | } 189 | 190 | #[test] 191 | fn tab() { 192 | assert_eq!(b(b"\t"), unescape(r"\t")); 193 | assert_eq!(r"\t", escape(b"\t")); 194 | } 195 | 196 | #[test] 197 | fn carriage() { 198 | assert_eq!(b(b"\r"), unescape(r"\r")); 199 | assert_eq!(r"\r", escape(b"\r")); 200 | } 201 | 202 | #[test] 203 | fn nothing_simple() { 204 | assert_eq!(b(b"\\a"), unescape(r"\a")); 205 | assert_eq!(b(b"\\a"), unescape(r"\\a")); 206 | assert_eq!(r"\\a", escape(b"\\a")); 207 | } 208 | 209 | #[test] 210 | fn nothing_hex0() { 211 | assert_eq!(b(b"\\x"), unescape(r"\x")); 212 | assert_eq!(b(b"\\x"), unescape(r"\\x")); 213 | assert_eq!(r"\\x", escape(b"\\x")); 214 | } 215 | 216 | #[test] 217 | fn nothing_hex1() { 218 | assert_eq!(b(b"\\xz"), unescape(r"\xz")); 219 | assert_eq!(b(b"\\xz"), unescape(r"\\xz")); 220 | assert_eq!(r"\\xz", escape(b"\\xz")); 221 | } 222 | 223 | #[test] 224 | fn nothing_hex2() { 225 | assert_eq!(b(b"\\xzz"), unescape(r"\xzz")); 226 | assert_eq!(b(b"\\xzz"), unescape(r"\\xzz")); 227 | assert_eq!(r"\\xzz", escape(b"\\xzz")); 228 | } 229 | 230 | #[test] 231 | fn invalid_utf8() { 232 | assert_eq!(r"\xFF", escape(b"\xFF")); 233 | assert_eq!(r"a\xFFb", escape(b"a\xFFb")); 234 | } 235 | 236 | #[test] 237 | fn trailing_incomplete() { 238 | assert_eq!(b(b"\\xA"), unescape(r"\xA")); 239 | } 240 | } 241 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 79 2 | use_small_heuristics = "max" 3 | -------------------------------------------------------------------------------- /scripts/fowler-to-toml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import, division, print_function 4 | import argparse 5 | import os.path as path 6 | 7 | 8 | def read_tests(f): 9 | basename, _ = path.splitext(path.basename(f)) 10 | tests = [] 11 | prev_pattern = None 12 | 13 | for lineno, line in enumerate(open(f), 1): 14 | fields = list(filter(None, map(str.strip, line.split('\t')))) 15 | if not (4 <= len(fields) <= 5) \ 16 | or 'E' not in fields[0] or fields[0][0] == '#': 17 | continue 18 | 19 | terse_opts, pat, text, sgroups = fields[0:4] 20 | groups = [] # groups as integer ranges 21 | if sgroups == 'NOMATCH': 22 | groups = [] 23 | elif ',' in sgroups: 24 | noparen = map(lambda s: s.strip('()'), sgroups.split(')(')) 25 | for g in noparen: 26 | s, e = map(str.strip,
g.split(',')) 27 | if s == '?' and e == '?': 28 | groups.append([]) 29 | else: 30 | groups.append([int(s), int(e)]) 31 | else: 32 | # This skips tests that should result in an error. 33 | # There aren't many, so I think we can just capture those 34 | # manually. Possibly fix this in future. 35 | continue 36 | 37 | case_insensitive = False 38 | if text == "NULL": 39 | text = "" 40 | if pat == 'SAME': 41 | pat = prev_pattern 42 | if '$' in terse_opts: 43 | pat = pat.encode('utf-8').decode('unicode_escape') 44 | text = text.encode('utf-8').decode('unicode_escape') 45 | text = text.encode('unicode_escape').decode('utf-8') 46 | else: 47 | text = text.encode('unicode_escape').decode('utf-8') 48 | if 'i' in terse_opts: 49 | case_insensitive = True 50 | 51 | pat = pat.encode('unicode_escape').decode('utf-8') 52 | pat = pat.replace('\\\\', '\\') 53 | if len(groups) > 0: 54 | captures = '[' + str(groups) + ']' 55 | else: 56 | captures = '[]' 57 | test = { 58 | 'name': '"%s%d"' % (basename, lineno), 59 | 'regex': "'''%s'''" % pat, 60 | 'input': "'''%s'''" % text, 61 | 'captures': captures, 62 | 'match_limit': '1', 63 | 'unescape': 'true', 64 | } 65 | if case_insensitive: 66 | test['case_insensitive'] = 'true' 67 | tests.append(test) 68 | prev_pattern = pat 69 | return tests 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser( 74 | description='Generate match tests from an AT&T POSIX test file.', 75 | ) 76 | aa = parser.add_argument 77 | aa('outdir', help='Directory to write generated TOML files.') 78 | aa('datfile', help='A dat AT&T POSIX test file.', nargs='+') 79 | args = parser.parse_args() 80 | 81 | for datfile in args.datfile: 82 | tests = read_tests(datfile) 83 | filename = path.basename(datfile) 84 | name, _ = path.splitext(filename) 85 | toml_path = path.join(args.outdir, f'{name}.toml') 86 | 87 | with open(toml_path, 'w+') as out: 88 | print(''' 89 | # !!! DO NOT EDIT !!! 90 | # Automatically generated by scripts/fowler-to-toml. 91 | # Numbers in the test names correspond to the line number of the test from 92 | # the original dat file. 93 | '''.strip(), file=out) 94 | print(file=out) 95 | for t in tests: 96 | print('[[tests]]', file=out) 97 | for k, v in t.items(): 98 | print(f'{k} = {v}', file=out) 99 | print(file=out) 100 | -------------------------------------------------------------------------------- /scripts/generate-fowler-tests: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import, division, print_function 4 | import argparse 5 | import datetime 6 | import os.path as path 7 | 8 | 9 | def print_tests(tests): 10 | print('\n'.join([test_tostr(t) for t in tests])) 11 | 12 | 13 | def read_tests(f): 14 | basename, _ = path.splitext(path.basename(f)) 15 | tests = [] 16 | for lineno, line in enumerate(open(f), 1): 17 | fields = list(filter(None, map(str.strip, line.split('\t')))) 18 | if not (4 <= len(fields) <= 5) \ 19 | or 'E' not in fields[0] or fields[0][0] == '#': 20 | continue 21 | 22 | opts, pat, text, sgroups = fields[0:4] 23 | groups = [] # groups as integer ranges 24 | if sgroups == 'NOMATCH': 25 | groups = [None] 26 | elif ',' in sgroups: 27 | noparen = map(lambda s: s.strip('()'), sgroups.split(')(')) 28 | for g in noparen: 29 | s, e = map(str.strip, g.split(',')) 30 | if s == '?' and e == '?': 31 | groups.append(None) 32 | else: 33 | groups.append((int(s), int(e))) 34 | else: 35 | # This skips tests that should result in an error. 
36 | # There aren't many, so I think we can just capture those 37 | # manually. Possibly fix this in future. 38 | continue 39 | 40 | if pat == 'SAME': 41 | pat = tests[-1][1] 42 | if '$' in opts: 43 | pat = pat.encode('utf-8').decode('unicode_escape') 44 | text = text.encode('utf-8').decode('unicode_escape') 45 | if 'i' in opts: 46 | pat = '(?i)%s' % pat 47 | 48 | name = '%s_%d' % (basename, lineno) 49 | tests.append((name, pat, text, groups)) 50 | return tests 51 | 52 | 53 | def test_tostr(t): 54 | lineno, pat, text, groups = t 55 | options = map(group_tostr, groups) 56 | pat = pat.encode('unicode_escape').decode('utf-8') 57 | if text == 'NULL': 58 | text = '' 59 | else: 60 | text = text.encode('unicode_escape').decode('utf-8') 61 | return ' ("%s", "%s", b"%s", &[%s]),' \ 62 | % (lineno, pat, text, ', '.join(options)) 63 | 64 | 65 | def group_tostr(g): 66 | if g is None: 67 | return 'None' 68 | else: 69 | return 'Some((%d, %d))' % (g[0], g[1]) 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser( 74 | description='Generate match tests from an AT&T POSIX test file.') 75 | aa = parser.add_argument 76 | aa('files', nargs='+', 77 | help='A list of dat AT&T POSIX test files. See src/testdata') 78 | args = parser.parse_args() 79 | 80 | tests = [] 81 | for f in args.files: 82 | tests += read_tests(f) 83 | 84 | tpl = '''// !!! DO NOT EDIT !!! 85 | // Automatically generated by 'scripts/generate-fowler-tests' on {date}. 86 | ''' 87 | print(tpl.format(date=str(datetime.datetime.now()))) 88 | 89 | strty = "&'static str" 90 | capty = "&'static [Option<(usize, usize)>]" 91 | elty = "({str}, {str}, &'static [u8], {cap})".format(str=strty, cap=capty) 92 | print('pub const TESTS: &\'static [%s] = &[' % elty, end='') 93 | for f in args.files: 94 | print('') 95 | print(' // tests from %s' % path.basename(f)) 96 | print_tests(read_tests(f)) 97 | print('];') 98 | -------------------------------------------------------------------------------- /src/dfa/error.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | nfa, 3 | util::{ 4 | id::{PatternID, StateID}, 5 | start::Start, 6 | }, 7 | }; 8 | 9 | /// An error that occurred during the construction of a DFA. 10 | /// 11 | /// This error does not provide many introspection capabilities. There are 12 | /// generally only two things you can do with it: 13 | /// 14 | /// * Obtain a human readable message via its `std::fmt::Display` impl. 15 | /// * Access an underlying [`nfa::thompson::Error`] type from its `source` 16 | /// method via the `std::error::Error` trait. This error only occurs when using 17 | /// convenience routines for building a DFA directly from a pattern string. 18 | /// 19 | /// When the `std` feature is enabled, this implements the `std::error::Error` 20 | /// trait. 21 | #[derive(Clone, Debug)] 22 | pub struct Error { 23 | kind: ErrorKind, 24 | } 25 | 26 | /// The kind of error that occurred during the construction of a DFA. 27 | /// 28 | /// Note that this error is non-exhaustive. Adding new variants is not 29 | /// considered a breaking change. 30 | #[derive(Clone, Debug)] 31 | enum ErrorKind { 32 | /// An error that occurred while constructing an NFA as a precursor step 33 | /// before a DFA is compiled. 34 | NFA(nfa::thompson::Error), 35 | /// An error that occurred because an unsupported regex feature was used. 36 | /// The message string describes which unsupported feature was used.
37 | /// 38 | /// The primary regex feature that is unsupported by DFAs is the Unicode 39 | /// word boundary look-around assertion (`\b`). This can be worked around 40 | /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling the 41 | /// [`dense::Builder::allow_unicode_word_boundary`](dense/struct.Builder.html#method.allow_unicode_word_boundary) 42 | /// option when building a DFA. 43 | Unsupported(&'static str), 44 | /// An error that occurs if too many states are produced while building a 45 | /// DFA. 46 | TooManyStates, 47 | /// An error that occurs if too many start states are needed while building 48 | /// a DFA. 49 | /// 50 | /// This is a kind of oddball error that occurs when building a DFA with 51 | /// start states enabled for each pattern and enough patterns to cause 52 | /// the table of start states to overflow `usize`. 53 | TooManyStartStates, 54 | /// This is another oddball error that can occur if there are too many 55 | /// patterns spread out across too many match states. 56 | TooManyMatchPatternIDs, 57 | /// An error that occurs if the DFA got too big during determinization. 58 | DFAExceededSizeLimit { limit: usize }, 59 | /// An error that occurs if auxiliary storage (not the DFA) used during 60 | /// determinization got too big. 61 | DeterminizeExceededSizeLimit { limit: usize }, 62 | } 63 | 64 | impl Error { 65 | /// Return the kind of this error. 66 | fn kind(&self) -> &ErrorKind { 67 | &self.kind 68 | } 69 | 70 | pub(crate) fn nfa(err: nfa::thompson::Error) -> Error { 71 | Error { kind: ErrorKind::NFA(err) } 72 | } 73 | 74 | pub(crate) fn unsupported_dfa_word_boundary_unicode() -> Error { 75 | let msg = "cannot build DFAs for regexes with Unicode word \ 76 | boundaries; switch to ASCII word boundaries, or \ 77 | heuristically enable Unicode word boundaries or use a \ 78 | different regex engine"; 79 | Error { kind: ErrorKind::Unsupported(msg) } 80 | } 81 | 82 | pub(crate) fn too_many_states() -> Error { 83 | Error { kind: ErrorKind::TooManyStates } 84 | } 85 | 86 | pub(crate) fn too_many_start_states() -> Error { 87 | Error { kind: ErrorKind::TooManyStartStates } 88 | } 89 | 90 | pub(crate) fn too_many_match_pattern_ids() -> Error { 91 | Error { kind: ErrorKind::TooManyMatchPatternIDs } 92 | } 93 | 94 | pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> Error { 95 | Error { kind: ErrorKind::DFAExceededSizeLimit { limit } } 96 | } 97 | 98 | pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> Error { 99 | Error { kind: ErrorKind::DeterminizeExceededSizeLimit { limit } } 100 | } 101 | } 102 | 103 | #[cfg(feature = "std")] 104 | impl std::error::Error for Error { 105 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 106 | match self.kind() { 107 | ErrorKind::NFA(ref err) => Some(err), 108 | ErrorKind::Unsupported(_) => None, 109 | ErrorKind::TooManyStates => None, 110 | ErrorKind::TooManyStartStates => None, 111 | ErrorKind::TooManyMatchPatternIDs => None, 112 | ErrorKind::DFAExceededSizeLimit { .. } => None, 113 | ErrorKind::DeterminizeExceededSizeLimit { .. 
} => None, 114 | } 115 | } 116 | } 117 | 118 | impl core::fmt::Display for Error { 119 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 120 | match self.kind() { 121 | ErrorKind::NFA(_) => write!(f, "error building NFA"), 122 | ErrorKind::Unsupported(ref msg) => { 123 | write!(f, "unsupported regex feature for DFAs: {}", msg) 124 | } 125 | ErrorKind::TooManyStates => write!( 126 | f, 127 | "number of DFA states exceeds limit of {}", 128 | StateID::LIMIT, 129 | ), 130 | ErrorKind::TooManyStartStates => { 131 | let stride = Start::count(); 132 | // The start table has `stride` entries for starting states for 133 | // the entire DFA, and then `stride` entries for each pattern 134 | // if start states for each pattern are enabled (which is the 135 | // only way this error can occur). Thus, the total number of 136 | // patterns that can fit in the table is `stride` less than 137 | // what we can allocate. 138 | let limit = ((core::isize::MAX as usize) - stride) / stride; 139 | write!( 140 | f, 141 | "compiling DFA with start states exceeds pattern \ 142 | limit of {}", 143 | limit, 144 | ) 145 | } 146 | ErrorKind::TooManyMatchPatternIDs => write!( 147 | f, 148 | "compiling DFA with total patterns in all match states \ 149 | exceeds limit of {}", 150 | PatternID::LIMIT, 151 | ), 152 | ErrorKind::DFAExceededSizeLimit { limit } => write!( 153 | f, 154 | "DFA exceeded size limit of {:?} during determinization", 155 | limit, 156 | ), 157 | ErrorKind::DeterminizeExceededSizeLimit { limit } => { 158 | write!(f, "determinization exceeded size limit of {:?}", limit) 159 | } 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/dfa/transducer.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | dfa::{automaton::Automaton, dense, sparse}, 3 | util::id::StateID, 4 | }; 5 | 6 | impl<T: AsRef<[u32]>> fst::Automaton for dense::DFA<T> { 7 | type State = StateID; 8 | 9 | #[inline] 10 | fn start(&self) -> StateID { 11 | self.start_state_forward(None, &[], 0, 0) 12 | } 13 | 14 | #[inline] 15 | fn is_match(&self, state: &StateID) -> bool { 16 | self.is_match_state(*state) 17 | } 18 | 19 | #[inline] 20 | fn accept(&self, state: &StateID, byte: u8) -> StateID { 21 | if fst::Automaton::is_match(self, state) { 22 | return *state; 23 | } 24 | self.next_state(*state, byte) 25 | } 26 | 27 | #[inline] 28 | fn accept_eof(&self, state: &StateID) -> Option<StateID> { 29 | if fst::Automaton::is_match(self, state) { 30 | return Some(*state); 31 | } 32 | Some(self.next_eoi_state(*state)) 33 | } 34 | 35 | #[inline] 36 | fn can_match(&self, state: &StateID) -> bool { 37 | !self.is_dead_state(*state) 38 | } 39 | } 40 | 41 | impl<T: AsRef<[u8]>> fst::Automaton for sparse::DFA<T> { 42 | type State = StateID; 43 | 44 | #[inline] 45 | fn start(&self) -> StateID { 46 | self.start_state_forward(None, &[], 0, 0) 47 | } 48 | 49 | #[inline] 50 | fn is_match(&self, state: &StateID) -> bool { 51 | self.is_match_state(*state) 52 | } 53 | 54 | #[inline] 55 | fn accept(&self, state: &StateID, byte: u8) -> StateID { 56 | if fst::Automaton::is_match(self, state) { 57 | return *state; 58 | } 59 | self.next_state(*state, byte) 60 | } 61 | 62 | #[inline] 63 | fn accept_eof(&self, state: &StateID) -> Option<StateID> { 64 | if fst::Automaton::is_match(self, state) { 65 | return Some(*state); 66 | } 67 | Some(self.next_eoi_state(*state)) 68 | } 69 | 70 | #[inline] 71 | fn can_match(&self, state: &StateID) -> bool { 72 | !self.is_dead_state(*state) 73 | } 74 | }
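Taken together, these two impls let a DFA drive a search through an fst set: the fst machinery walks keys byte by byte while `can_match` prunes subtrees that lead to a dead state. Note the early returns in `accept` and `accept_eof`: once the DFA enters a match state it never leaves it, so every key extending a matching prefix is also accepted. A minimal usage sketch, modeled on the tests below (the key set and pattern here are made up for illustration, and this assumes the fst integration is compiled in):

```rust
use fst::{IntoStreamer, Set, Streamer};
use regex_automata::dfa::dense;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // fst sets must be built from keys in lexicographic order.
    let set = Set::from_iter(&["August", "July", "June"])?;

    // Anchoring the DFA forces keys to match from their first byte.
    let dfa = dense::Builder::new()
        .configure(dense::Config::new().anchored(true))
        .build("Ju[a-z]*")?;

    // Passing `&dfa` works because `fst::Automaton` is implemented for
    // references to automatons; keys stream back in sorted order.
    let mut stream = set.search(&dfa).into_stream();
    let mut matched = vec![];
    while let Some(key) = stream.next() {
        matched.push(key.to_vec());
    }
    assert_eq!(matched, vec![b"July".to_vec(), b"June".to_vec()]);
    Ok(())
}
```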
75 | 76 | #[cfg(test)] 77 | mod tests { 78 | use bstr::BString; 79 | use fst::{Automaton, IntoStreamer, Set, Streamer}; 80 | 81 | use crate::dfa::{dense, sparse}; 82 | 83 | fn search<A: Automaton, D: AsRef<[u8]>>( 84 | set: &Set<D>, 85 | aut: A, 86 | ) -> Vec<BString> { 87 | let mut stream = set.search(aut).into_stream(); 88 | 89 | let mut results = vec![]; 90 | while let Some(key) = stream.next() { 91 | results.push(BString::from(key)); 92 | } 93 | results 94 | } 95 | 96 | #[test] 97 | fn dense_anywhere() { 98 | let set = 99 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 100 | .unwrap(); 101 | let dfa = dense::DFA::new("ba.*").unwrap(); 102 | let got = search(&set, &dfa); 103 | assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]); 104 | } 105 | 106 | #[test] 107 | fn dense_anchored() { 108 | let set = 109 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 110 | .unwrap(); 111 | let dfa = dense::Builder::new() 112 | .configure(dense::Config::new().anchored(true)) 113 | .build("ba.*") 114 | .unwrap(); 115 | let got = search(&set, &dfa); 116 | assert_eq!(got, vec!["bar", "baz"]); 117 | } 118 | 119 | #[test] 120 | fn dense_assertions_start() { 121 | let set = 122 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 123 | .unwrap(); 124 | let dfa = dense::Builder::new().build("^ba.*").unwrap(); 125 | let got = search(&set, &dfa); 126 | assert_eq!(got, vec!["bar", "baz"]); 127 | } 128 | 129 | #[test] 130 | fn dense_assertions_end() { 131 | let set = 132 | Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"]) 133 | .unwrap(); 134 | let dfa = dense::Builder::new().build(".*x$").unwrap(); 135 | let got = search(&set, &dfa); 136 | assert_eq!(got, vec!["bax", "xbax"]); 137 | } 138 | 139 | #[test] 140 | fn dense_assertions_word() { 141 | let set = 142 | Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap(); 143 | let dfa = dense::Builder::new().build(r"(?-u)\bfoo\b").unwrap(); 144 | let got = search(&set, &dfa); 145 | assert_eq!(got, vec!["foo", "zzz foo zzz"]); 146 | } 147 | 148 | #[test] 149 | fn sparse_anywhere() { 150 | let set = 151 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 152 | .unwrap(); 153 | let dfa = sparse::DFA::new("ba.*").unwrap(); 154 | let got = search(&set, &dfa); 155 | assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]); 156 | } 157 | 158 | #[test] 159 | fn sparse_anchored() { 160 | let set = 161 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 162 | .unwrap(); 163 | let dfa = dense::Builder::new() 164 | .configure(dense::Config::new().anchored(true)) 165 | .build("ba.*") 166 | .unwrap() 167 | .to_sparse() 168 | .unwrap(); 169 | let got = search(&set, &dfa); 170 | assert_eq!(got, vec!["bar", "baz"]); 171 | } 172 | 173 | #[test] 174 | fn sparse_assertions_start() { 175 | let set = 176 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 177 | .unwrap(); 178 | let dfa = 179 | dense::Builder::new().build("^ba.*").unwrap().to_sparse().unwrap(); 180 | let got = search(&set, &dfa); 181 | assert_eq!(got, vec!["bar", "baz"]); 182 | } 183 | 184 | #[test] 185 | fn sparse_assertions_end() { 186 | let set = 187 | Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"]) 188 | .unwrap(); 189 | let dfa = 190 | dense::Builder::new().build(".*x$").unwrap().to_sparse().unwrap(); 191 | let got = search(&set, &dfa); 192 | assert_eq!(got, vec!["bax", "xbax"]); 193 | } 194 | 195 | #[test] 196 | fn sparse_assertions_word() { 197 | let set = 198 | Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap(); 199 | let
dfa = dense::Builder::new() 200 | .build(r"(?-u)\bfoo\b") 201 | .unwrap() 202 | .to_sparse() 203 | .unwrap(); 204 | let got = search(&set, &dfa); 205 | assert_eq!(got, vec!["foo", "zzz foo zzz"]); 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /src/hybrid/error.rs: -------------------------------------------------------------------------------- 1 | use crate::{hybrid::id::LazyStateIDError, nfa}; 2 | 3 | /// An error that occurs when initial construction of a lazy DFA fails. 4 | /// 5 | /// A build error can occur when insufficient cache capacity is configured or 6 | /// if something about the NFA is unsupported. (For example, if one attempts 7 | /// to build a lazy DFA without heuristic Unicode support but with an NFA that 8 | /// contains a Unicode word boundary.) 9 | /// 10 | /// When the `std` feature is enabled, this implements the `std::error::Error` 11 | /// trait. 12 | #[derive(Clone, Debug)] 13 | pub struct BuildError { 14 | kind: BuildErrorKind, 15 | } 16 | 17 | #[derive(Clone, Debug)] 18 | enum BuildErrorKind { 19 | NFA(nfa::thompson::Error), 20 | InsufficientCacheCapacity { minimum: usize, given: usize }, 21 | InsufficientStateIDCapacity { err: LazyStateIDError }, 22 | Unsupported(&'static str), 23 | } 24 | 25 | impl BuildError { 26 | fn kind(&self) -> &BuildErrorKind { 27 | &self.kind 28 | } 29 | 30 | pub(crate) fn nfa(err: nfa::thompson::Error) -> BuildError { 31 | BuildError { kind: BuildErrorKind::NFA(err) } 32 | } 33 | 34 | pub(crate) fn insufficient_cache_capacity( 35 | minimum: usize, 36 | given: usize, 37 | ) -> BuildError { 38 | BuildError { 39 | kind: BuildErrorKind::InsufficientCacheCapacity { minimum, given }, 40 | } 41 | } 42 | 43 | pub(crate) fn insufficient_state_id_capacity( 44 | err: LazyStateIDError, 45 | ) -> BuildError { 46 | BuildError { 47 | kind: BuildErrorKind::InsufficientStateIDCapacity { err }, 48 | } 49 | } 50 | 51 | pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError { 52 | let msg = "cannot build lazy DFAs for regexes with Unicode word \ 53 | boundaries; switch to ASCII word boundaries, or \ 54 | heuristically enable Unicode word boundaries or use a \ 55 | different regex engine"; 56 | BuildError { kind: BuildErrorKind::Unsupported(msg) } 57 | } 58 | } 59 | 60 | #[cfg(feature = "std")] 61 | impl std::error::Error for BuildError { 62 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 63 | match self.kind() { 64 | BuildErrorKind::NFA(ref err) => Some(err), 65 | BuildErrorKind::InsufficientCacheCapacity { .. } => None, 66 | // LazyStateIDError is an implementation detail, don't expose it. 67 | BuildErrorKind::InsufficientStateIDCapacity { .. } => None, 68 | BuildErrorKind::Unsupported(_) => None, 69 | } 70 | } 71 | } 72 | 73 | impl core::fmt::Display for BuildError { 74 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 75 | match self.kind() { 76 | BuildErrorKind::NFA(_) => write!(f, "error building NFA"), 77 | BuildErrorKind::InsufficientCacheCapacity { minimum, given } => { 78 | write!( 79 | f, 80 | "given cache capacity ({}) is smaller than \ 81 | minimum required ({})", 82 | given, minimum, 83 | ) 84 | } 85 | BuildErrorKind::InsufficientStateIDCapacity { ref err } => { 86 | err.fmt(f) 87 | } 88 | BuildErrorKind::Unsupported(ref msg) => { 89 | write!(f, "unsupported regex feature for DFAs: {}", msg) 90 | } 91 | } 92 | } 93 | } 94 | 95 | /// An error that occurs when cache usage has become inefficient. 
96 | /// 97 | /// One of the weaknesses of a lazy DFA is that it may need to clear its 98 | /// cache repeatedly if it's not big enough. If this happens too much, then it 99 | /// can slow searching down significantly. A mitigation to this is to use 100 | /// heuristics to detect whether the cache is being used efficiently or not. 101 | /// If not, then a lazy DFA can return a `CacheError`. 102 | /// 103 | /// The default configuration of a lazy DFA in this crate is 104 | /// set such that a `CacheError` will never occur. Instead, 105 | /// callers must opt into this behavior with settings like 106 | /// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count). 107 | /// 108 | /// When the `std` feature is enabled, this implements the `std::error::Error` 109 | /// trait. 110 | #[derive(Clone, Debug)] 111 | pub struct CacheError(()); 112 | 113 | impl CacheError { 114 | pub(crate) fn too_many_cache_clears() -> CacheError { 115 | CacheError(()) 116 | } 117 | } 118 | 119 | #[cfg(feature = "std")] 120 | impl std::error::Error for CacheError { 121 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 122 | None 123 | } 124 | } 125 | 126 | impl core::fmt::Display for CacheError { 127 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 128 | write!(f, "lazy DFA cache has been cleared too many times") 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/hybrid/mod.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | A module for building and searching with lazy deterministic finite automata 3 | (DFAs). 4 | 5 | Like other modules in this crate, lazy DFAs support a rich regex syntax with 6 | Unicode features. The key feature of a lazy DFA is that it builds itself 7 | incrementally during search, and never uses more than a configured capacity of 8 | memory. Thus, when searching with a lazy DFA, one must supply a mutable "cache" 9 | in which the actual DFA's transition table is stored. 10 | 11 | If you're looking for fully compiled DFAs, then please see the top-level 12 | [`dfa` module](crate::dfa). 13 | 14 | # Overview 15 | 16 | This section gives a brief overview of the primary types in this module: 17 | 18 | * A [`regex::Regex`] provides a way to search for matches of a regular 19 | expression using lazy DFAs. This includes iterating over matches with both the 20 | start and end positions of each match. 21 | * A [`dfa::DFA`] provides direct low level access to a lazy DFA. 22 | 23 | # Example: basic regex searching 24 | 25 | This example shows how to compile a regex using the default configuration 26 | and then use it to find matches in a byte string: 27 | 28 | ``` 29 | use regex_automata::{hybrid::regex::Regex, MultiMatch}; 30 | 31 | let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; 32 | let mut cache = re.create_cache(); 33 | 34 | let text = b"2018-12-24 2016-10-08"; 35 | let matches: Vec<MultiMatch> = 36 | re.find_leftmost_iter(&mut cache, text).collect(); 37 | assert_eq!(matches, vec![ 38 | MultiMatch::must(0, 0, 10), 39 | MultiMatch::must(0, 11, 21), 40 | ]); 41 | # Ok::<(), Box<dyn std::error::Error>>(()) 42 | ``` 43 | 44 | # Example: searching with regex sets 45 | 46 | The lazy DFAs in this module all fully support searching with multiple regexes 47 | simultaneously.
You can use this support with standard leftmost-first style 48 | searching to find non-overlapping matches: 49 | 50 | ``` 51 | use regex_automata::{hybrid::regex::Regex, MultiMatch}; 52 | 53 | let re = Regex::new_many(&[r"\w+", r"\S+"])?; 54 | let mut cache = re.create_cache(); 55 | 56 | let text = b"@foo bar"; 57 | let matches: Vec<MultiMatch> = 58 | re.find_leftmost_iter(&mut cache, text).collect(); 59 | assert_eq!(matches, vec![ 60 | MultiMatch::must(1, 0, 4), 61 | MultiMatch::must(0, 5, 8), 62 | ]); 63 | # Ok::<(), Box<dyn std::error::Error>>(()) 64 | ``` 65 | 66 | Or use overlapping style searches to find all possible occurrences: 67 | 68 | ``` 69 | use regex_automata::{hybrid::{dfa, regex::Regex}, MatchKind, MultiMatch}; 70 | 71 | // N.B. For overlapping searches, we need the underlying lazy DFA to report all 72 | // possible matches. 73 | let re = Regex::builder() 74 | .dfa(dfa::Config::new().match_kind(MatchKind::All)) 75 | .build_many(&[r"\w{3}", r"\S{3}"])?; 76 | let mut cache = re.create_cache(); 77 | 78 | let text = b"@foo bar"; 79 | let matches: Vec<MultiMatch> = 80 | re.find_overlapping_iter(&mut cache, text).collect(); 81 | assert_eq!(matches, vec![ 82 | MultiMatch::must(1, 0, 3), 83 | MultiMatch::must(0, 1, 4), 84 | MultiMatch::must(1, 1, 4), 85 | MultiMatch::must(0, 5, 8), 86 | MultiMatch::must(1, 5, 8), 87 | ]); 88 | # Ok::<(), Box<dyn std::error::Error>>(()) 89 | ``` 90 | 91 | # When should I use this? 92 | 93 | Generally speaking, if you can abide the use of mutable state during search, 94 | and you don't need things like capturing groups or Unicode word boundary 95 | support in non-ASCII text, then a lazy DFA is likely a robust choice with 96 | respect to both search speed and memory usage. Note however that its speed 97 | may be worse than a general purpose regex engine if you don't select a good 98 | [prefilter](crate::util::prefilter). 99 | 100 | If you know ahead of time that your pattern would result in a very large DFA 101 | if it was fully compiled, it may be better to use an NFA simulation instead 102 | of a lazy DFA. Either that, or increase the cache capacity of your lazy DFA 103 | to something that is big enough to hold the state machine (likely through 104 | experimentation). The issue here is that if the cache is too small, then it 105 | could wind up being reset too frequently and this might decrease searching 106 | speed significantly. 107 | 108 | # Differences with fully compiled DFAs 109 | 110 | A [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) and a 111 | [`dfa::regex::Regex`](crate::dfa::regex::Regex) both have the same capabilities 112 | (and similarly for their underlying DFAs), but they achieve them through 113 | different means. The main difference is that a hybrid or "lazy" regex builds 114 | its DFA lazily during search, whereas a fully compiled regex will build its 115 | DFA at construction time. While building a DFA at search time might sound like 116 | it's slow, it tends to work out because most bytes seen during a search will 117 | reuse pre-built parts of the DFA and thus can be almost as fast as a fully 118 | compiled DFA. The main downside is that searching requires mutable space to 119 | store the DFA, and, in the worst case, a search can result in a new state being 120 | created for each byte seen, which would make searching quite a bit slower. 121 | 122 | A fully compiled DFA never has to worry about searches being slower once 123 | it's built. (Aside from, say, the transition table being so large that it 124 | is subject to harsh CPU cache effects.)
However, of course, building a full 125 | DFA can be quite time consuming and memory hungry, particularly since it is 126 | so easy to build large DFAs when Unicode mode is enabled. 127 | 128 | A lazy DFA strikes a nice balance _in practice_, particularly in the 129 | presence of Unicode mode, by only building what is needed. It avoids the 130 | worst case exponential time complexity of DFA compilation by guaranteeing that 131 | it will build at most one new state per byte searched. While the worst 132 | case here can lead to a very high constant, it will never be exponential. 133 | 134 | # Syntax 135 | 136 | This module supports the same syntax as the `regex` crate, since they share the 137 | same parser. You can find an exhaustive list of supported syntax in the 138 | [documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax). 139 | 140 | There are two things that are not supported by the lazy DFAs in this module: 141 | 142 | * Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top 143 | of them) can only find the offsets of an entire match, but cannot resolve 144 | the offsets of each capturing group. This is because DFAs do not have the 145 | expressive power necessary. 146 | * Unicode word boundaries. These present particularly difficult challenges for 147 | DFA construction and would result in an explosion in the number of states. 148 | One can enable [`dfa::Config::unicode_word_boundary`] though, which provides 149 | heuristic support for Unicode word boundaries that only works on ASCII text. 150 | Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work 151 | on any input. 152 | 153 | There are no plans to lift either of these limitations. 154 | 155 | Note that these restrictions are identical to the restrictions on fully 156 | compiled DFAs. 157 | 158 | # Support for `alloc`-only 159 | 160 | This crate comes with `alloc` and `std` features that are enabled by default. 161 | One can disable the `std` feature and still use the full API of a lazy DFA. 162 | (You should use `std` when possible, since it permits providing implementations 163 | of the `std::error::Error` trait, and does enable some minor internal 164 | optimizations.) 165 | 166 | This module does require at least the `alloc` feature though. It is not 167 | available in any capacity without `alloc`. 168 | */ 169 | 170 | pub use self::{ 171 | error::{BuildError, CacheError}, 172 | id::{LazyStateID, OverlappingState}, 173 | }; 174 | 175 | pub mod dfa; 176 | mod error; 177 | mod id; 178 | pub mod regex; 179 | mod search; 180 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | This crate provides an "expert" API for executing regular expressions using 3 | finite automata. 4 | 5 | **WARNING**: This `0.2` release of `regex-automata` was published 6 | before it was ready in order to unblock work elsewhere that needed some 7 | of the new APIs in this release. At the time of writing, it is 8 | strongly preferred that you continue using the 9 | [`regex-automata 0.1`](https://docs.rs/regex-automata/0.1/regex_automata/) 10 | release. Since this release represents an unfinished state, please do not 11 | create issues for this release unless it's for a critical bug.
12 | */ 13 | 14 | #![allow(warnings)] 15 | // #![deny(missing_docs)] 16 | #![cfg_attr(not(feature = "std"), no_std)] 17 | 18 | #[cfg(not(any( 19 | target_pointer_width = "16", 20 | target_pointer_width = "32", 21 | target_pointer_width = "64" 22 | )))] 23 | compile_error!("regex-automata currently not supported on non-{16,32,64}"); 24 | 25 | #[cfg(feature = "alloc")] 26 | extern crate alloc; 27 | 28 | #[doc(inline)] 29 | pub use crate::util::id::PatternID; 30 | #[cfg(feature = "alloc")] 31 | pub use crate::util::syntax::SyntaxConfig; 32 | pub use crate::util::{ 33 | bytes::{DeserializeError, SerializeError}, 34 | matchtypes::{HalfMatch, Match, MatchError, MatchKind, MultiMatch}, 35 | }; 36 | 37 | #[macro_use] 38 | mod macros; 39 | 40 | pub mod dfa; 41 | #[cfg(feature = "alloc")] 42 | pub mod hybrid; 43 | #[doc(hidden)] 44 | #[cfg(feature = "alloc")] 45 | pub mod nfa; 46 | #[doc(hidden)] 47 | pub mod util; 48 | -------------------------------------------------------------------------------- /src/macros.rs: -------------------------------------------------------------------------------- 1 | /// A simple macro for defining bitfield accessors/mutators. 2 | #[cfg(feature = "alloc")] 3 | macro_rules! define_bool { 4 | ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { 5 | fn $is_fn_name(&self) -> bool { 6 | self.bools & (0b1 << $bit) > 0 7 | } 8 | 9 | fn $set_fn_name(&mut self, yes: bool) { 10 | if yes { 11 | self.bools |= 1 << $bit; 12 | } else { 13 | self.bools &= !(1 << $bit); 14 | } 15 | } 16 | }; 17 | } 18 | 19 | macro_rules! log { 20 | ($($tt:tt)*) => { 21 | #[cfg(feature = "logging")] 22 | { 23 | $($tt)* 24 | } 25 | } 26 | } 27 | 28 | macro_rules! trace { 29 | ($($tt:tt)*) => { log!(log::trace!($($tt)*)) } 30 | } 31 | -------------------------------------------------------------------------------- /src/nfa/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod thompson; 2 | -------------------------------------------------------------------------------- /src/nfa/thompson/error.rs: -------------------------------------------------------------------------------- 1 | use crate::util::id::{PatternID, StateID}; 2 | 3 | /// An error that can occur during the construction of a Thompson NFA. 4 | /// 5 | /// This error does not provide many introspection capabilities. There are 6 | /// generally only two things you can do with it: 7 | /// 8 | /// * Obtain a human readable message via its `std::fmt::Display` impl. 9 | /// * Access an underlying [`regex_syntax::Error`] type from its `source` 10 | /// method via the `std::error::Error` trait. This error only occurs when using 11 | /// convenience routines for building an NFA directly from a pattern string. 12 | /// 13 | /// Otherwise, errors typically occur when a limit has been breached. For 14 | /// example, if the total heap usage of the compiled NFA exceeds the limit 15 | /// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then 16 | /// building the NFA will fail. 17 | #[derive(Clone, Debug)] 18 | pub struct Error { 19 | kind: ErrorKind, 20 | } 21 | 22 | /// The kind of error that occurred during the construction of a Thompson NFA. 23 | #[derive(Clone, Debug)] 24 | enum ErrorKind { 25 | /// An error that occurred while parsing a regular expression. Note that 26 | /// this error may be printed over multiple lines, and is generally 27 | /// intended to be end user readable on its own.
28 | Syntax(regex_syntax::Error), 29 | /// An error that occurs if too many patterns were given to the NFA 30 | /// compiler. 31 | TooManyPatterns { 32 | /// The number of patterns given, which exceeds the limit. 33 | given: usize, 34 | /// The limit on the number of patterns. 35 | limit: usize, 36 | }, 37 | /// An error that occurs if too many states are produced while building an NFA. 38 | TooManyStates { 39 | /// The minimum number of states that are desired, which exceeds the 40 | /// limit. 41 | given: usize, 42 | /// The limit on the number of states. 43 | limit: usize, 44 | }, 45 | /// An error that occurs when NFA compilation exceeds a configured heap 46 | /// limit. 47 | ExceededSizeLimit { 48 | /// The configured limit, in bytes. 49 | limit: usize, 50 | }, 51 | /// An error that occurs when an invalid capture group index is added to 52 | /// the NFA. An "invalid" index can be one that is too big (e.g., results 53 | /// in an integer overflow) or one that is discontinuous from previous 54 | /// capture group indices added. 55 | InvalidCaptureIndex { 56 | /// The invalid index that was given. 57 | index: usize, 58 | }, 59 | /// An error that occurs when an NFA contains a Unicode word boundary, but 60 | /// where the crate was compiled without the necessary data for dealing 61 | /// with Unicode word boundaries. 62 | UnicodeWordUnavailable, 63 | } 64 | 65 | impl Error { 66 | fn kind(&self) -> &ErrorKind { 67 | &self.kind 68 | } 69 | 70 | pub(crate) fn syntax(err: regex_syntax::Error) -> Error { 71 | Error { kind: ErrorKind::Syntax(err) } 72 | } 73 | 74 | pub(crate) fn too_many_patterns(given: usize) -> Error { 75 | let limit = PatternID::LIMIT; 76 | Error { kind: ErrorKind::TooManyPatterns { given, limit } } 77 | } 78 | 79 | pub(crate) fn too_many_states(given: usize) -> Error { 80 | let limit = StateID::LIMIT; 81 | Error { kind: ErrorKind::TooManyStates { given, limit } } 82 | } 83 | 84 | pub(crate) fn exceeded_size_limit(limit: usize) -> Error { 85 | Error { kind: ErrorKind::ExceededSizeLimit { limit } } 86 | } 87 | 88 | pub(crate) fn invalid_capture_index(index: usize) -> Error { 89 | Error { kind: ErrorKind::InvalidCaptureIndex { index } } 90 | } 91 | 92 | pub(crate) fn unicode_word_unavailable() -> Error { 93 | Error { kind: ErrorKind::UnicodeWordUnavailable } 94 | } 95 | } 96 | 97 | #[cfg(feature = "std")] 98 | impl std::error::Error for Error { 99 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 100 | match self.kind() { 101 | ErrorKind::Syntax(ref err) => Some(err), 102 | ErrorKind::TooManyPatterns { .. } => None, 103 | ErrorKind::TooManyStates { .. } => None, 104 | ErrorKind::ExceededSizeLimit { .. } => None, 105 | ErrorKind::InvalidCaptureIndex { ..
} => None, 106 | ErrorKind::UnicodeWordUnavailable => None, 107 | } 108 | } 109 | } 110 | 111 | impl core::fmt::Display for Error { 112 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 113 | match self.kind() { 114 | ErrorKind::Syntax(_) => write!(f, "error parsing regex"), 115 | ErrorKind::TooManyPatterns { given, limit } => write!( 116 | f, 117 | "attempted to compile {} patterns, \ 118 | which exceeds the limit of {}", 119 | given, limit, 120 | ), 121 | ErrorKind::TooManyStates { given, limit } => write!( 122 | f, 123 | "attempted to compile {} NFA states, \ 124 | which exceeds the limit of {}", 125 | given, limit, 126 | ), 127 | ErrorKind::ExceededSizeLimit { limit } => write!( 128 | f, 129 | "heap usage during NFA compilation exceeded limit of {}", 130 | limit, 131 | ), 132 | ErrorKind::InvalidCaptureIndex { index } => write!( 133 | f, 134 | "capture group index {} is invalid (too big or discontinuous)", 135 | index, 136 | ), 137 | ErrorKind::UnicodeWordUnavailable => write!( 138 | f, 139 | "crate has been compiled without Unicode word boundary \ 140 | support, but the NFA contains Unicode word boundary \ 141 | assertions", 142 | ), 143 | } 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/util/lazy.rs: -------------------------------------------------------------------------------- 1 | use core::{ 2 | cell::Cell, 3 | ptr, 4 | sync::atomic::{AtomicPtr, Ordering}, 5 | }; 6 | 7 | use alloc::{boxed::Box, vec::Vec}; 8 | 9 | #[inline(always)] 10 | pub(crate) fn get_or_init<T>( 11 | location: &'static AtomicPtr<T>, 12 | init: impl FnOnce() -> T, 13 | ) -> &'static T { 14 | let mut ptr = location.load(Ordering::Acquire); 15 | if ptr.is_null() { 16 | let new_dfa = Box::new(init()); 17 | ptr = Box::into_raw(new_dfa); 18 | let result = location.compare_exchange( 19 | ptr::null_mut(), 20 | ptr, 21 | Ordering::AcqRel, 22 | Ordering::Acquire, 23 | ); 24 | if let Err(old) = result { 25 | let redundant = unsafe { Box::from_raw(ptr) }; 26 | drop(redundant); 27 | ptr = old; 28 | } 29 | } 30 | unsafe { &*ptr } 31 | } 32 | -------------------------------------------------------------------------------- /src/util/sparse_set.rs: -------------------------------------------------------------------------------- 1 | use alloc::{boxed::Box, vec, vec::Vec}; 2 | 3 | use crate::util::id::StateID; 4 | 5 | /// A pair of sparse sets. 6 | /// 7 | /// This is useful when one needs to compute NFA epsilon closures from a 8 | /// previous set of states derived from an epsilon closure. One set can be the 9 | /// starting states whereas the other set can be the destination states after 10 | /// following the transitions for a particular byte of input. 11 | /// 12 | /// There is no significance to 'set1' or 'set2'. They are both sparse sets of 13 | /// the same size. 14 | /// 15 | /// The members of this struct are exposed so that callers may borrow 'set1' 16 | /// and 'set2' individually without being forced to borrow both at the same 17 | /// time. 18 | #[derive(Clone, Debug)] 19 | pub(crate) struct SparseSets { 20 | pub(crate) set1: SparseSet, 21 | pub(crate) set2: SparseSet, 22 | } 23 | 24 | impl SparseSets { 25 | /// Create a new pair of sparse sets where each set has the given capacity. 26 | /// 27 | /// This panics if the capacity given is bigger than `StateID::LIMIT`.
28 | pub(crate) fn new(capacity: usize) -> SparseSets { 29 | SparseSets { 30 | set1: SparseSet::new(capacity), 31 | set2: SparseSet::new(capacity), 32 | } 33 | } 34 | 35 | /// Resizes these sparse sets to have the new capacity given. 36 | /// 37 | /// The sets are automatically cleared. 38 | /// 39 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. 40 | #[inline] 41 | pub(crate) fn resize(&mut self, new_capacity: usize) { 42 | self.set1.resize(new_capacity); 43 | self.set2.resize(new_capacity); 44 | } 45 | 46 | /// Clear both sparse sets. 47 | pub(crate) fn clear(&mut self) { 48 | self.set1.clear(); 49 | self.set2.clear(); 50 | } 51 | 52 | /// Swap set1 with set2. 53 | pub(crate) fn swap(&mut self) { 54 | core::mem::swap(&mut self.set1, &mut self.set2); 55 | } 56 | 57 | /// Returns the memory usage, in bytes, used by this pair of sparse sets. 58 | pub(crate) fn memory_usage(&self) -> usize { 59 | self.set1.memory_usage() + self.set2.memory_usage() 60 | } 61 | } 62 | 63 | /// A sparse set used for representing ordered NFA states. 64 | /// 65 | /// This supports constant time addition and membership testing. Clearing an 66 | /// entire set can also be done in constant time. Iteration yields elements 67 | /// in the order in which they were inserted. 68 | /// 69 | /// The data structure is based on: https://research.swtch.com/sparse 70 | /// Note though that we don't actually use uninitialized memory. We generally 71 | /// reuse sparse sets, so the initial allocation cost is bearable. However, its 72 | /// other properties listed above are extremely useful. 73 | #[derive(Clone)] 74 | pub(crate) struct SparseSet { 75 | /// The number of elements currently in this set. 76 | len: usize, 77 | /// Dense contains the ids in the order in which they were inserted. 78 | dense: Vec<StateID>, 79 | /// Sparse maps ids to their location in dense. 80 | /// 81 | /// A state ID is in the set if and only if 82 | /// sparse[id] < dense.len() && id == dense[sparse[id]]. 83 | sparse: Vec<StateID>, 84 | } 85 | 86 | impl SparseSet { 87 | /// Create a new sparse set with the given capacity. 88 | /// 89 | /// Sparse sets have a fixed size and they cannot grow. Attempting to 90 | /// insert more distinct elements than the total capacity of the set will 91 | /// result in a panic. 92 | /// 93 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. 94 | #[inline] 95 | pub(crate) fn new(capacity: usize) -> SparseSet { 96 | let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; 97 | set.resize(capacity); 98 | set 99 | } 100 | 101 | /// Resizes this sparse set to have the new capacity given. 102 | /// 103 | /// This set is automatically cleared. 104 | /// 105 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. 106 | #[inline] 107 | pub(crate) fn resize(&mut self, new_capacity: usize) { 108 | assert!( 109 | new_capacity <= StateID::LIMIT, 110 | "sparse set capacity cannot exceed {:?}", 111 | StateID::LIMIT 112 | ); 113 | self.clear(); 114 | self.dense.resize(new_capacity, StateID::ZERO); 115 | self.sparse.resize(new_capacity, StateID::ZERO); 116 | } 117 | 118 | /// Returns the capacity of this set. 119 | /// 120 | /// The capacity represents a fixed limit on the number of distinct 121 | /// elements that are allowed in this set. The capacity cannot be changed. 122 | #[inline] 123 | pub(crate) fn capacity(&self) -> usize { 124 | self.dense.len() 125 | } 126 | 127 | /// Returns the number of elements in this set.
128 | #[inline] 129 | pub(crate) fn len(&self) -> usize { 130 | self.len 131 | } 132 | 133 | /// Returns true if and only if this set is empty. 134 | #[inline] 135 | pub(crate) fn is_empty(&self) -> bool { 136 | self.len() == 0 137 | } 138 | 139 | /// Insert the state ID value into this set and return true if the given 140 | /// state ID was not previously in this set. 141 | /// 142 | /// This operation is idempotent. If the given value is already in this 143 | /// set, then this is a no-op. 144 | /// 145 | /// If more than `capacity` ids are inserted, then this panics. 146 | /// 147 | /// This is marked as inline(always) since the compiler won't inline it 148 | /// otherwise, and it's a fairly hot piece of code in DFA determinization. 149 | #[inline(always)] 150 | pub(crate) fn insert(&mut self, value: StateID) -> bool { 151 | if self.contains(value) { 152 | return false; 153 | } 154 | 155 | let i = self.len(); 156 | assert!( 157 | i < self.capacity(), 158 | "{:?} exceeds capacity of {:?} when inserting {:?}", 159 | i, 160 | self.capacity(), 161 | value, 162 | ); 163 | // OK since i < self.capacity() and self.capacity() is guaranteed to 164 | // be <= StateID::LIMIT. 165 | let id = StateID::new_unchecked(i); 166 | self.dense[id] = value; 167 | self.sparse[value] = id; 168 | self.len += 1; 169 | true 170 | } 171 | 172 | /// Returns true if and only if this set contains the given value. 173 | #[inline] 174 | pub(crate) fn contains(&self, value: StateID) -> bool { 175 | let i = self.sparse[value]; 176 | i.as_usize() < self.len() && self.dense[i] == value 177 | } 178 | 179 | /// Returns the ith inserted element from this set. 180 | /// 181 | /// Panics when i >= self.len(). 182 | #[inline] 183 | pub(crate) fn get(&self, i: usize) -> StateID { 184 | self.dense[i] 185 | } 186 | 187 | /// Clear this set such that it has no members. 188 | #[inline] 189 | pub(crate) fn clear(&mut self) { 190 | self.len = 0; 191 | } 192 | 193 | /// Returns the heap memory usage, in bytes, used by this sparse set. 194 | #[inline] 195 | pub(crate) fn memory_usage(&self) -> usize { 196 | 2 * self.dense.len() * StateID::SIZE 197 | } 198 | } 199 | 200 | impl core::fmt::Debug for SparseSet { 201 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 202 | let elements: Vec<StateID> = self.into_iter().collect(); 203 | f.debug_tuple("SparseSet").field(&elements).finish() 204 | } 205 | } 206 | 207 | /// An iterator over all elements in a sparse set. 208 | /// 209 | /// The lifetime `'a` refers to the lifetime of the set being iterated over. 210 | #[derive(Debug)] 211 | pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); 212 | 213 | impl<'a> IntoIterator for &'a SparseSet { 214 | type Item = StateID; 215 | type IntoIter = SparseSetIter<'a>; 216 | 217 | fn into_iter(self) -> Self::IntoIter { 218 | SparseSetIter(self.dense[..self.len()].iter()) 219 | } 220 | } 221 | 222 | impl<'a> Iterator for SparseSetIter<'a> { 223 | type Item = StateID; 224 | 225 | #[inline(always)] 226 | fn next(&mut self) -> Option<StateID> { 227 | self.0.next().map(|value| *value) 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /src/util/start.rs: -------------------------------------------------------------------------------- 1 | /// Represents the four possible starting configurations of a DFA search. 2 | /// 3 | /// The starting configuration is determined by inspecting the beginning of 4 | /// the haystack (up to 1 byte).
Ultimately, this along with a pattern ID (if 5 | /// specified) is what selects the start state to use in a DFA. 6 | /// 7 | /// A DFA that doesn't have starting states for each pattern will have a 8 | /// maximum of four DFA start states. If the DFA was compiled with start 9 | /// states for each pattern, then it will have a maximum of four DFA start 10 | /// states for searching for any pattern, and then another maximum of four DFA 11 | /// start states for executing an anchored search for each pattern. 12 | /// 13 | /// This ends up being represented as a table in the DFA (whether lazy or fully 14 | /// built) where the stride of that table is 4, and each entry is an index into 15 | /// the state transition table. Note though that multiple entries in the table 16 | /// might point to the same state if the states would otherwise be equivalent. 17 | /// (This is guaranteed by DFA minimization and may even be accomplished by 18 | /// normal determinization, since it attempts to reuse equivalent states too.) 19 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] 20 | pub(crate) enum Start { 21 | /// This occurs when the starting position is not any of the ones below. 22 | NonWordByte = 0, 23 | /// This occurs when the byte immediately preceding the start of the search 24 | /// is an ASCII word byte. 25 | WordByte = 1, 26 | /// This occurs when the starting position of the search corresponds to the 27 | /// beginning of the haystack. 28 | Text = 2, 29 | /// This occurs when the byte immediately preceding the start of the search 30 | /// is a line terminator. Specifically, `\n`. 31 | Line = 3, 32 | } 33 | 34 | impl Start { 35 | /// Return the starting state corresponding to the given integer. If no 36 | /// starting state exists for the given integer, then None is returned. 37 | pub(crate) fn from_usize(n: usize) -> Option<Start> { 38 | match n { 39 | 0 => Some(Start::NonWordByte), 40 | 1 => Some(Start::WordByte), 41 | 2 => Some(Start::Text), 42 | 3 => Some(Start::Line), 43 | _ => None, 44 | } 45 | } 46 | 47 | /// Returns the total number of starting state configurations. 48 | pub(crate) fn count() -> usize { 49 | 4 50 | } 51 | 52 | /// Returns the starting state configuration for the given search 53 | /// parameters. If the given offset range is not valid, then this panics. 54 | #[inline(always)] 55 | pub(crate) fn from_position_fwd( 56 | bytes: &[u8], 57 | start: usize, 58 | end: usize, 59 | ) -> Start { 60 | assert!( 61 | bytes.get(start..end).is_some(), 62 | "{}..{} is invalid", 63 | start, 64 | end 65 | ); 66 | if start == 0 { 67 | Start::Text 68 | } else if bytes[start - 1] == b'\n' { 69 | Start::Line 70 | } else if crate::util::is_word_byte(bytes[start - 1]) { 71 | Start::WordByte 72 | } else { 73 | Start::NonWordByte 74 | } 75 | } 76 | 77 | /// Returns the starting state configuration for a reverse search with the 78 | /// given search parameters. If the given offset range is not valid, then 79 | /// this panics. 80 | #[inline(always)] 81 | pub(crate) fn from_position_rev( 82 | bytes: &[u8], 83 | start: usize, 84 | end: usize, 85 | ) -> Start { 86 | assert!( 87 | bytes.get(start..end).is_some(), 88 | "{}..{} is invalid", 89 | start, 90 | end 91 | ); 92 | if end == bytes.len() { 93 | Start::Text 94 | } else if bytes[end] == b'\n' { 95 | Start::Line 96 | } else if crate::util::is_word_byte(bytes[end]) { 97 | Start::WordByte 98 | } else { 99 | Start::NonWordByte 100 | } 101 | } 102 | 103 | /// Return this starting configuration as an integer.
It is guaranteed to 104 | /// be less than `Start::count()`. 105 | #[inline(always)] 106 | pub(crate) fn as_usize(&self) -> usize { 107 | *self as usize 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /tests/data/bytes.toml: -------------------------------------------------------------------------------- 1 | # These are tests specifically crafted for regexes that can match arbitrary 2 | # bytes. In some cases, we also test the Unicode variant as well, just because 3 | # it's good sense to do so. But also, these tests aren't really about Unicode, 4 | # but whether matches are only reported at valid UTF-8 boundaries. For most 5 | # tests in this entire collection, utf8 = true. But for these tests, we use 6 | # utf8 = false. 7 | 8 | [[tests]] 9 | name = "word-boundary-ascii" 10 | regex = ' \b' 11 | input = " δ" 12 | matches = [] 13 | unicode = false 14 | utf8 = false 15 | 16 | [[tests]] 17 | name = "word-boundary-unicode" 18 | regex = ' \b' 19 | input = " δ" 20 | matches = [[0, 1]] 21 | unicode = true 22 | utf8 = false 23 | 24 | [[tests]] 25 | name = "word-boundary-ascii-not" 26 | regex = ' \B' 27 | input = " δ" 28 | matches = [[0, 1]] 29 | unicode = false 30 | utf8 = false 31 | 32 | [[tests]] 33 | name = "word-boundary-unicode-not" 34 | regex = ' \B' 35 | input = " δ" 36 | matches = [] 37 | unicode = true 38 | utf8 = false 39 | 40 | [[tests]] 41 | name = "perl-word-ascii" 42 | regex = '\w+' 43 | input = "aδ" 44 | matches = [[0, 1]] 45 | unicode = false 46 | utf8 = false 47 | 48 | [[tests]] 49 | name = "perl-word-unicode" 50 | regex = '\w+' 51 | input = "aδ" 52 | matches = [[0, 3]] 53 | unicode = true 54 | utf8 = false 55 | 56 | [[tests]] 57 | name = "perl-decimal-ascii" 58 | regex = '\d+' 59 | input = "1२३9" 60 | matches = [[0, 1], [7, 8]] 61 | unicode = false 62 | utf8 = false 63 | 64 | [[tests]] 65 | name = "perl-decimal-unicode" 66 | regex = '\d+' 67 | input = "1२३9" 68 | matches = [[0, 8]] 69 | unicode = true 70 | utf8 = false 71 | 72 | [[tests]] 73 | name = "perl-whitespace-ascii" 74 | regex = '\s+' 75 | input = " \u1680" 76 | matches = [[0, 1]] 77 | unicode = false 78 | utf8 = false 79 | 80 | [[tests]] 81 | name = "perl-whitespace-unicode" 82 | regex = '\s+' 83 | input = " \u1680" 84 | matches = [[0, 4]] 85 | unicode = true 86 | utf8 = false 87 | 88 | # The first `(.+)` matches two Unicode codepoints, but can't match the 5th 89 | # byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and 90 | # matches. 
91 | [[tests]] 92 | name = "mixed-dot" 93 | regex = '(.+)(?-u)(.+)' 94 | input = '\xCE\x93\xCE\x94\xFF' 95 | captures = [ 96 | [[0, 5], [0, 4], [4, 5]], 97 | ] 98 | unescape = true 99 | unicode = true 100 | utf8 = false 101 | 102 | [[tests]] 103 | name = "case-one-ascii" 104 | regex = 'a' 105 | input = "A" 106 | matches = [[0, 1]] 107 | case_insensitive = true 108 | unicode = false 109 | utf8 = false 110 | 111 | [[tests]] 112 | name = "case-one-unicode" 113 | regex = 'a' 114 | input = "A" 115 | matches = [[0, 1]] 116 | case_insensitive = true 117 | unicode = true 118 | utf8 = false 119 | 120 | [[tests]] 121 | name = "case-class-simple-ascii" 122 | regex = '[a-z]+' 123 | input = "AaAaA" 124 | matches = [[0, 5]] 125 | case_insensitive = true 126 | unicode = false 127 | utf8 = false 128 | 129 | [[tests]] 130 | name = "case-class-ascii" 131 | regex = '[a-z]+' 132 | input = "aA\u212AaA" 133 | matches = [[0, 2], [5, 7]] 134 | case_insensitive = true 135 | unicode = false 136 | utf8 = false 137 | 138 | [[tests]] 139 | name = "case-class-unicode" 140 | regex = '[a-z]+' 141 | input = "aA\u212AaA" 142 | matches = [[0, 7]] 143 | case_insensitive = true 144 | unicode = true 145 | utf8 = false 146 | 147 | [[tests]] 148 | name = "negate-ascii" 149 | regex = '[^a]' 150 | input = "δ" 151 | matches = [[0, 1], [1, 2]] 152 | unicode = false 153 | utf8 = false 154 | 155 | [[tests]] 156 | name = "negate-unicode" 157 | regex = '[^a]' 158 | input = "δ" 159 | matches = [[0, 2]] 160 | unicode = true 161 | utf8 = false 162 | 163 | # When utf8=true, this won't match, because the implicit '.*?' prefix is 164 | # Unicode aware and will refuse to match through invalid UTF-8 bytes. 165 | [[tests]] 166 | name = "dotstar-prefix-ascii" 167 | regex = 'a' 168 | input = '\xFFa' 169 | matches = [[1, 2]] 170 | unescape = true 171 | unicode = false 172 | utf8 = false 173 | 174 | [[tests]] 175 | name = "dotstar-prefix-unicode" 176 | regex = 'a' 177 | input = '\xFFa' 178 | matches = [[1, 2]] 179 | unescape = true 180 | unicode = true 181 | utf8 = false 182 | 183 | [[tests]] 184 | name = "null-bytes" 185 | regex = '(?P[^\x00]+)\x00' 186 | input = 'foo\x00' 187 | captures = [ 188 | [[0, 4], [0, 3]], 189 | ] 190 | unescape = true 191 | unicode = false 192 | utf8 = false 193 | 194 | [[tests]] 195 | name = "invalid-utf8-anchor-100" 196 | regex = '\xCC?^' 197 | input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4' 198 | matches = [[0, 0]] 199 | unescape = true 200 | unicode = false 201 | utf8 = false 202 | 203 | [[tests]] 204 | name = "invalid-utf8-anchor-200" 205 | regex = '^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$' 206 | input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4' 207 | matches = [[22, 22]] 208 | unescape = true 209 | unicode = false 210 | utf8 = false 211 | 212 | [[tests]] 213 | name = "invalid-utf8-anchor-300" 214 | regex = '^|ddp\xff\xffdddddlQd@\x80' 215 | input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4' 216 | matches = [[0, 0]] 217 | unescape = true 218 | unicode = false 219 | utf8 = false 220 | 221 | [[tests]] 222 | name = "word-boundary-ascii-100" 223 | regex = '\Bx\B' 224 | input = "áxβ" 225 | matches = [] 226 | unicode = false 227 | utf8 = false 228 | 229 | [[tests]] 230 | name = "word-boundary-ascii-200" 231 | regex = '\B' 232 | input = "0\U0007EF5E" 233 | matches = [[2, 2], [3, 3], [4, 4], [5, 5]] 234 | unicode = false 235 | utf8 = false 236 | 
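Many of the tests above set `unescape = true`, which means the `input` string is run through the `\x`-style unescaper shown in regex-test/src/escape.rs before the search, so that invalid UTF-8 can be written in plain TOML. A small sketch of that convention (the `regex_test::escape::unescape` import path is an assumption about how the crate exports it):

```rust
use regex_test::escape::unescape; // assumed export path

fn main() {
    // `\xFF` becomes the single (invalid UTF-8) byte 0xFF, so the
    // dotstar-prefix tests above really do search the bytes [0xFF, b'a'].
    assert_eq!(unescape(r"\xFFa"), vec![0xFF, b'a']);

    // An incomplete trailing escape is passed through literally, matching
    // the `trailing_incomplete` unit test for the unescaper.
    assert_eq!(unescape(r"\xA"), b"\\xA".to_vec());
}
```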
-------------------------------------------------------------------------------- /tests/data/crazy.toml: -------------------------------------------------------------------------------- 1 | # TODO: There are still a couple of manually written tests in crazy.rs. 2 | 3 | [[tests]] 4 | name = "ranges" 5 | regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b' 6 | input = "num: 255" 7 | matches = [[5, 8]] 8 | 9 | [[tests]] 10 | name = "ranges-not" 11 | regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b' 12 | input = "num: 256" 13 | matches = [] 14 | 15 | [[tests]] 16 | name = "float1" 17 | regex = '[-+]?[0-9]*\.?[0-9]+' 18 | input = "0.1" 19 | matches = [[0, 3]] 20 | 21 | [[tests]] 22 | name = "float2" 23 | regex = '[-+]?[0-9]*\.?[0-9]+' 24 | input = "0.1.2" 25 | matches = [[0, 3]] 26 | match_limit = 1 27 | 28 | [[tests]] 29 | name = "float3" 30 | regex = '[-+]?[0-9]*\.?[0-9]+' 31 | input = "a1.2" 32 | matches = [[1, 4]] 33 | 34 | [[tests]] 35 | name = "float4" 36 | regex = '[-+]?[0-9]*\.?[0-9]+' 37 | input = "1.a" 38 | matches = [[0, 1]] 39 | 40 | [[tests]] 41 | name = "float5" 42 | regex = '^[-+]?[0-9]*\.?[0-9]+$' 43 | input = "1.a" 44 | matches = [] 45 | 46 | [[tests]] 47 | name = "email" 48 | regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b' 49 | input = "mine is jam.slam@gmail.com " 50 | matches = [[8, 26]] 51 | 52 | [[tests]] 53 | name = "email-not" 54 | regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b' 55 | input = "mine is jam.slam@gmail " 56 | matches = [] 57 | 58 | [[tests]] 59 | name = "email-big" 60 | regex = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''' 61 | input = "mine is jam.slam@gmail.com " 62 | matches = [[8, 26]] 63 | 64 | [[tests]] 65 | name = "date1" 66 | regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$' 67 | input = "1900-01-01" 68 | matches = [[0, 10]] 69 | 70 | [[tests]] 71 | name = "date2" 72 | regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$' 73 | input = "1900-00-01" 74 | matches = [] 75 | 76 | [[tests]] 77 | name = "date3" 78 | regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$' 79 | input = "1900-13-01" 80 | matches = [] 81 | 82 | [[tests]] 83 | name = "start-end-empty" 84 | regex = '^$' 85 | input = "" 86 | matches = [[0, 0]] 87 | 88 | [[tests]] 89 | name = "start-end-empty-rev" 90 | regex = '$^' 91 | input = "" 92 | matches = [[0, 0]] 93 | 94 | [[tests]] 95 | name = "start-end-empty-many-1" 96 | regex = '^$^$^$' 97 | input = "" 98 | matches = [[0, 0]] 99 | 100 | [[tests]] 101 | name = "start-end-empty-many-2" 102 | regex = '^^^$$$' 103 | input = "" 104 | matches = [[0, 0]] 105 | 106 | [[tests]] 107 | name = "start-end-empty-rep" 108 | regex = '(?:^$)*' 109 | input = "a\nb\nc" 110 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] 111 | 112 | [[tests]] 113 | name = "start-end-empty-rep-rev" 114 | regex = '(?:$^)*' 115 | input = "a\nb\nc" 116 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] 117 | 118 | [[tests]] 119 | name = "neg-class-letter" 120 | regex = '[^ac]' 121 | input = "acx" 122 | matches = [[2, 3]] 123 | 124 | [[tests]] 125 | name = "neg-class-letter-comma" 126 | regex = '[^a,]' 127 | input = "a,x" 128 | matches = [[2, 3]] 129 | 130 | [[tests]] 131 | name = "neg-class-letter-space" 132 | regex = '[^a[:space:]]' 133 | input = "a x" 134 | matches = [[2, 3]] 135 | 136 | [[tests]] 137 | name = 
"neg-class-comma" 138 | regex = '[^,]' 139 | input = ",,x" 140 | matches = [[2, 3]] 141 | 142 | [[tests]] 143 | name = "neg-class-space" 144 | regex = '[^[:space:]]' 145 | input = " a" 146 | matches = [[1, 2]] 147 | 148 | [[tests]] 149 | name = "neg-class-space-comma" 150 | regex = '[^,[:space:]]' 151 | input = ", a" 152 | matches = [[2, 3]] 153 | 154 | [[tests]] 155 | name = "neg-class-comma-space" 156 | regex = '[^[:space:],]' 157 | input = " ,a" 158 | matches = [[2, 3]] 159 | 160 | [[tests]] 161 | name = "neg-class-ascii" 162 | regex = '[^[:alpha:]Z]' 163 | input = "A1" 164 | matches = [[1, 2]] 165 | 166 | [[tests]] 167 | name = "lazy-many-many" 168 | regex = '((?:.*)*?)=' 169 | input = "a=b" 170 | matches = [[0, 2]] 171 | 172 | [[tests]] 173 | name = "lazy-many-optional" 174 | regex = '((?:.?)*?)=' 175 | input = "a=b" 176 | matches = [[0, 2]] 177 | 178 | [[tests]] 179 | name = "lazy-one-many-many" 180 | regex = '((?:.*)+?)=' 181 | input = "a=b" 182 | matches = [[0, 2]] 183 | 184 | [[tests]] 185 | name = "lazy-one-many-optional" 186 | regex = '((?:.?)+?)=' 187 | input = "a=b" 188 | matches = [[0, 2]] 189 | 190 | [[tests]] 191 | name = "lazy-range-min-many" 192 | regex = '((?:.*){1,}?)=' 193 | input = "a=b" 194 | matches = [[0, 2]] 195 | 196 | [[tests]] 197 | name = "lazy-range-many" 198 | regex = '((?:.*){1,2}?)=' 199 | input = "a=b" 200 | matches = [[0, 2]] 201 | 202 | [[tests]] 203 | name = "greedy-many-many" 204 | regex = '((?:.*)*)=' 205 | input = "a=b" 206 | matches = [[0, 2]] 207 | 208 | [[tests]] 209 | name = "greedy-many-optional" 210 | regex = '((?:.?)*)=' 211 | input = "a=b" 212 | matches = [[0, 2]] 213 | 214 | [[tests]] 215 | name = "greedy-one-many-many" 216 | regex = '((?:.*)+)=' 217 | input = "a=b" 218 | matches = [[0, 2]] 219 | 220 | [[tests]] 221 | name = "greedy-one-many-optional" 222 | regex = '((?:.?)+)=' 223 | input = "a=b" 224 | matches = [[0, 2]] 225 | 226 | [[tests]] 227 | name = "greedy-range-min-many" 228 | regex = '((?:.*){1,})=' 229 | input = "a=b" 230 | matches = [[0, 2]] 231 | 232 | [[tests]] 233 | name = "greedy-range-many" 234 | regex = '((?:.*){1,2})=' 235 | input = "a=b" 236 | matches = [[0, 2]] 237 | 238 | [[tests]] 239 | name = "empty1" 240 | regex = '' 241 | input = "" 242 | matches = [[0, 0]] 243 | 244 | [[tests]] 245 | name = "empty2" 246 | regex = '' 247 | input = "abc" 248 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 249 | 250 | [[tests]] 251 | name = "empty3" 252 | regex = '()' 253 | input = "abc" 254 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 255 | 256 | [[tests]] 257 | name = "empty4" 258 | regex = '()*' 259 | input = "abc" 260 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 261 | 262 | [[tests]] 263 | name = "empty5" 264 | regex = '()+' 265 | input = "abc" 266 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 267 | 268 | [[tests]] 269 | name = "empty6" 270 | regex = '()?' 
271 | input = "abc" 272 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 273 | 274 | [[tests]] 275 | name = "empty7" 276 | regex = '()()' 277 | input = "abc" 278 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 279 | 280 | [[tests]] 281 | name = "empty8" 282 | regex = '()+|z' 283 | input = "abc" 284 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 285 | 286 | [[tests]] 287 | name = "empty9" 288 | regex = 'z|()+' 289 | input = "abc" 290 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 291 | 292 | [[tests]] 293 | name = "empty10" 294 | regex = '()+|b' 295 | input = "abc" 296 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 297 | 298 | [[tests]] 299 | name = "empty11" 300 | regex = 'b|()+' 301 | input = "abc" 302 | matches = [[0, 0], [1, 2], [3, 3]] 303 | -------------------------------------------------------------------------------- /tests/data/earliest.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "no-greedy-100" 3 | regex = 'a+' 4 | input = "aaa" 5 | matches = [[0, 1], [1, 2], [2, 3]] 6 | search_kind = "earliest" 7 | 8 | [[tests]] 9 | name = "no-greedy-200" 10 | regex = 'abc+' 11 | input = "zzzabccc" 12 | matches = [[3, 6]] 13 | search_kind = "earliest" 14 | 15 | [[tests]] 16 | name = "is-ungreedy" 17 | regex = 'a+?' 18 | input = "aaa" 19 | matches = [[0, 1], [1, 2], [2, 3]] 20 | search_kind = "earliest" 21 | 22 | [[tests]] 23 | name = "look-start-test" 24 | regex = '^(abc|a)' 25 | input = "abc" 26 | matches = [[0, 1]] 27 | search_kind = "earliest" 28 | 29 | [[tests]] 30 | name = "look-end-test" 31 | regex = '(abc|a)$' 32 | input = "abc" 33 | matches = [[0, 3]] 34 | search_kind = "earliest" 35 | 36 | [[tests]] 37 | name = "no-leftmost-first-100" 38 | regex = 'abc|a' 39 | input = "abc" 40 | matches = [[0, 1]] 41 | search_kind = "earliest" 42 | 43 | [[tests]] 44 | name = "no-leftmost-first-200" 45 | regex = 'aba|a' 46 | input = "aba" 47 | matches = [[0, 1], [2, 3]] 48 | search_kind = "earliest" 49 | -------------------------------------------------------------------------------- /tests/data/empty.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "100" 3 | regex = "|b" 4 | input = "abc" 5 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 6 | 7 | [[tests]] 8 | name = "110" 9 | regex = "b|" 10 | input = "abc" 11 | matches = [[0, 0], [1, 2], [3, 3]] 12 | 13 | [[tests]] 14 | name = "120" 15 | regex = "|z" 16 | input = "abc" 17 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 18 | 19 | [[tests]] 20 | name = "130" 21 | regex = "z|" 22 | input = "abc" 23 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 24 | 25 | [[tests]] 26 | name = "200" 27 | regex = "|" 28 | input = "abc" 29 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 30 | 31 | [[tests]] 32 | name = "210" 33 | regex = "||" 34 | input = "abc" 35 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 36 | 37 | [[tests]] 38 | name = "220" 39 | regex = "||b" 40 | input = "abc" 41 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 42 | 43 | [[tests]] 44 | name = "230" 45 | regex = "b||" 46 | input = "abc" 47 | matches = [[0, 0], [1, 2], [3, 3]] 48 | 49 | [[tests]] 50 | name = "240" 51 | regex = "||z" 52 | input = "abc" 53 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 54 | 55 | [[tests]] 56 | name = "300" 57 | regex = "(?:)|b" 58 | input = "abc" 59 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 60 | 61 | [[tests]] 62 | name = "310" 63 | regex = "b|(?:)" 64 | input = "abc" 65 | matches = [[0, 0], [1, 2], [3, 3]] 66 | 67 | [[tests]] 68 | name = "320" 69 | regex = "(?:|)" 70 | 
input = "abc" 71 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 72 | 73 | [[tests]] 74 | name = "330" 75 | regex = "(?:|)|z" 76 | input = "abc" 77 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 78 | 79 | [[tests]] 80 | name = "400" 81 | regex = "a(?:)|b" 82 | input = "abc" 83 | matches = [[0, 1], [1, 2]] 84 | 85 | [[tests]] 86 | name = "500" 87 | regex = "" 88 | input = "" 89 | matches = [[0, 0]] 90 | 91 | [[tests]] 92 | name = "510" 93 | regex = "" 94 | input = "a" 95 | matches = [[0, 0], [1, 1]] 96 | 97 | [[tests]] 98 | name = "520" 99 | regex = "" 100 | input = "abc" 101 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 102 | 103 | [[tests]] 104 | name = "600" 105 | regex = '(|a)*' 106 | input = "aaa" 107 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 108 | 109 | [[tests]] 110 | name = "610" 111 | regex = '(|a)+' 112 | input = "aaa" 113 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 114 | -------------------------------------------------------------------------------- /tests/data/expensive.toml: -------------------------------------------------------------------------------- 1 | # These represent tests that may be expensive to run on some regex engines. For 2 | # example, tests that build a full DFA ahead of time and minimize it can take a 3 | # horrendously long time on regexes that are large (or result in an explosion 4 | # in the number of states). We group these tests together so that such engines 5 | # can simply skip these tests. 6 | 7 | # See: https://github.com/rust-lang/regex/issues/98 8 | [[tests]] 9 | name = "regression-many-repeat-no-stack-overflow" 10 | regex = '^.{1,2500}' 11 | input = "a" 12 | matches = [[0, 1]] 13 | -------------------------------------------------------------------------------- /tests/data/flags.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "1" 3 | regex = "(?i)abc" 4 | input = "ABC" 5 | matches = [[0, 3]] 6 | 7 | [[tests]] 8 | name = "2" 9 | regex = "(?i)a(?-i)bc" 10 | input = "Abc" 11 | matches = [[0, 3]] 12 | 13 | [[tests]] 14 | name = "3" 15 | regex = "(?i)a(?-i)bc" 16 | input = "ABC" 17 | matches = [] 18 | 19 | [[tests]] 20 | name = "4" 21 | regex = "(?is)a." 22 | input = "A\n" 23 | matches = [[0, 2]] 24 | 25 | [[tests]] 26 | name = "5" 27 | regex = "(?is)a.(?-is)a." 28 | input = "A\nab" 29 | matches = [[0, 4]] 30 | 31 | [[tests]] 32 | name = "6" 33 | regex = "(?is)a.(?-is)a." 34 | input = "A\na\n" 35 | matches = [] 36 | 37 | [[tests]] 38 | name = "7" 39 | regex = "(?is)a.(?-is:a.)?" 40 | input = "A\na\n" 41 | matches = [[0, 2]] 42 | match_limit = 1 43 | 44 | [[tests]] 45 | name = "8" 46 | regex = "(?U)a+" 47 | input = "aa" 48 | matches = [[0, 1]] 49 | match_limit = 1 50 | 51 | [[tests]] 52 | name = "9" 53 | regex = "(?U)a+?" 54 | input = "aa" 55 | matches = [[0, 2]] 56 | 57 | [[tests]] 58 | name = "10" 59 | regex = "(?U)(?-U)a+" 60 | input = "aa" 61 | matches = [[0, 2]] 62 | 63 | [[tests]] 64 | name = "11" 65 | regex = '(?m)(?:^\d+$\n?)+' 66 | input = "123\n456\n789" 67 | matches = [[0, 11]] 68 | -------------------------------------------------------------------------------- /tests/data/fowler/dat/README: -------------------------------------------------------------------------------- 1 | Test data was taken from the Go distribution, which was in turn taken from the 2 | testregex test suite: 3 | 4 | http://www2.research.att.com/~astopen/testregex/testregex.html 5 | 6 | Unfortunately, the above link is now dead, but the test data lives on. 
7 | 8 | The LICENSE in this directory corresponds to the LICENSE that the data was 9 | originally released under. 10 | 11 | The tests themselves were modified for RE2/Go. A couple were modified further 12 | by me (Andrew Gallant), but only in repetition.dat, so that RE2/Go would pass 13 | them. (Yes, it seems like RE2/Go includes failing test cases.) This may or may 14 | not have been a bad idea, but I think being consistent with an established 15 | regex library is worth something. 16 | 17 | After some number of years, these tests were transformed into a TOML format 18 | using the fowler-to-toml script in the 'scripts' directory. To re-generate the 19 | TOML files, run the following from the root of this repository: 20 | 21 | ./scripts/fowler-to-toml tests/data/fowler tests/data/fowler/dat/*.dat 22 | 23 | which brings them into a sensible structured format in which other tests can 24 | be written. 25 | -------------------------------------------------------------------------------- /tests/data/fowler/dat/nullsubexpr.dat: -------------------------------------------------------------------------------- 1 | NOTE null subexpression matches : 2002-06-06 2 | 3 | E (a*)* a (0,1)(0,1) 4 | #E SAME x (0,0)(0,0) 5 | E SAME x (0,0)(?,?) RE2/Go 6 | E SAME aaaaaa (0,6)(0,6) 7 | E SAME aaaaaax (0,6)(0,6) 8 | E (a*)+ a (0,1)(0,1) 9 | E SAME x (0,0)(0,0) 10 | E SAME aaaaaa (0,6)(0,6) 11 | E SAME aaaaaax (0,6)(0,6) 12 | E (a+)* a (0,1)(0,1) 13 | E SAME x (0,0) 14 | E SAME aaaaaa (0,6)(0,6) 15 | E SAME aaaaaax (0,6)(0,6) 16 | E (a+)+ a (0,1)(0,1) 17 | E SAME x NOMATCH 18 | E SAME aaaaaa (0,6)(0,6) 19 | E SAME aaaaaax (0,6)(0,6) 20 | 21 | E ([a]*)* a (0,1)(0,1) 22 | #E SAME x (0,0)(0,0) 23 | E SAME x (0,0)(?,?) RE2/Go 24 | E SAME aaaaaa (0,6)(0,6) 25 | E SAME aaaaaax (0,6)(0,6) 26 | E ([a]*)+ a (0,1)(0,1) 27 | E SAME x (0,0)(0,0) 28 | E SAME aaaaaa (0,6)(0,6) 29 | E SAME aaaaaax (0,6)(0,6) 30 | E ([^b]*)* a (0,1)(0,1) 31 | #E SAME b (0,0)(0,0) 32 | E SAME b (0,0)(?,?) RE2/Go 33 | E SAME aaaaaa (0,6)(0,6) 34 | E SAME aaaaaab (0,6)(0,6) 35 | E ([ab]*)* a (0,1)(0,1) 36 | E SAME aaaaaa (0,6)(0,6) 37 | E SAME ababab (0,6)(0,6) 38 | E SAME bababa (0,6)(0,6) 39 | E SAME b (0,1)(0,1) 40 | E SAME bbbbbb (0,6)(0,6) 41 | E SAME aaaabcde (0,5)(0,5) 42 | E ([^a]*)* b (0,1)(0,1) 43 | E SAME bbbbbb (0,6)(0,6) 44 | #E SAME aaaaaa (0,0)(0,0) 45 | E SAME aaaaaa (0,0)(?,?) RE2/Go 46 | E ([^ab]*)* ccccxx (0,6)(0,6) 47 | #E SAME ababab (0,0)(0,0) 48 | E SAME ababab (0,0)(?,?) RE2/Go 49 | 50 | E ((z)+|a)* zabcde (0,2)(1,2) 51 | 52 | #{E a+? aaaaaa (0,1) no *? +? minimal match ops 53 | #E (a) aaa (0,1)(0,1) 54 | #E (a*?) aaa (0,0)(0,0) 55 | #E (a)*? aaa (0,0) 56 | #E (a*?)*?
aaa (0,0) 57 | #} 58 | 59 | B \(a*\)*\(x\) x (0,1)(0,0)(0,1) 60 | B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) 61 | B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) 62 | B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) 63 | B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) 64 | B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) 65 | B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) 66 | B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) 67 | 68 | #E (a*)*(x) x (0,1)(0,0)(0,1) 69 | E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go 70 | E (a*)*(x) ax (0,2)(0,1)(1,2) 71 | E (a*)*(x) axa (0,2)(0,1)(1,2) 72 | 73 | E (a*)+(x) x (0,1)(0,0)(0,1) 74 | E (a*)+(x) ax (0,2)(0,1)(1,2) 75 | E (a*)+(x) axa (0,2)(0,1)(1,2) 76 | 77 | E (a*){2}(x) x (0,1)(0,0)(0,1) 78 | E (a*){2}(x) ax (0,2)(1,1)(1,2) 79 | E (a*){2}(x) axa (0,2)(1,1)(1,2) 80 | -------------------------------------------------------------------------------- /tests/data/fowler/dat/repetition-expensive.dat: -------------------------------------------------------------------------------- 1 | NOTE implicit vs. explicit repetitions : 2009-02-02 2 | 3 | # Glenn Fowler 4 | # conforming matches (column 4) must match one of the following BREs 5 | # NOMATCH 6 | # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* 7 | # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* 8 | # i.e., each 3-tuple has two identical elements and one (?,?) 9 | 10 | NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 11 | 12 | :HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) 13 | :HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) 14 | :HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) 15 | :HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) 16 | :HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) 17 | :HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) 18 | :HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) 19 | :HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) 20 | :HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) 21 | #:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) 22 | :HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go 23 | #:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) 24 | :HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go 25 | #:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) 26 | :HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go 27 | #:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) 28 | :HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go 29 | #:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) 30 | :HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go 31 | #:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) 32 | :HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go 33 | #:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) 34 | :HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go 35 | #:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) 36 | :HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go 37 | :HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) 38 | 39 | # These test a fixed bug in my regex-tdfa that did not keep the expanded 40 | # form properly grouped, so right association did the wrong thing with 41 | # these ambiguous patterns (crafted just to test my code when I became 42 | # suspicious of my implementation). The first subexpression should use 43 | # "ab" then "a" then "bcd". 44 | 45 | # OS X / FreeBSD / NetBSD badly fail many of these, with impossible 46 | # results like (0,6)(4,5)(6,6). 
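# Note on the expectations below: under the leftmost-first match semantics # used by RE2/Go (and this crate), branch order inside an alternation matters. # On 'ababcd', (a|ab|c|bcd)* commits to 'a' at offset 0, no branch matches at # offset 1, and the overall match stops at (0,1). The reordered (ab|a|c|bcd)* # instead consumes 'ab', 'ab', 'c' and reaches (0,6).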
47 | 48 | :HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) 49 | :HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) 50 | :HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 51 | :HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 52 | :HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH 53 | :HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) 54 | :HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) 55 | :HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 56 | :HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 57 | :HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH 58 | :HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) 59 | :HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) 60 | 61 | # The above worked on Linux/GLIBC but the following often fail. 62 | # They also trip up OS X / FreeBSD / NetBSD: 63 | 64 | #:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) 65 | :HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 66 | #:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) 67 | :HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 68 | #:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 69 | :HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 70 | #:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 71 | :HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 72 | :HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH 73 | #:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) 74 | :HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 75 | #:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) 76 | :HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 77 | #:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 78 | :HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 79 | #:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 80 | :HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 81 | :HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH 82 | #:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) 83 | :HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 84 | #:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) 85 | :HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 86 | -------------------------------------------------------------------------------- /tests/data/fowler/dat/repetition.dat: -------------------------------------------------------------------------------- 1 | NOTE implicit vs. explicit repetitions : 2009-02-02 2 | 3 | # Glenn Fowler 4 | # conforming matches (column 4) must match one of the following BREs 5 | # NOMATCH 6 | # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* 7 | # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* 8 | # i.e., each 3-tuple has two identical elements and one (?,?) 9 | 10 | E ((..)|(.)) NULL NOMATCH 11 | E ((..)|(.))((..)|(.)) NULL NOMATCH 12 | E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH 13 | 14 | E ((..)|(.)){1} NULL NOMATCH 15 | E ((..)|(.)){2} NULL NOMATCH 16 | E ((..)|(.)){3} NULL NOMATCH 17 | 18 | E ((..)|(.))* NULL (0,0) 19 | 20 | E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) 21 | E ((..)|(.))((..)|(.)) a NOMATCH 22 | E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH 23 | 24 | E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) 25 | E ((..)|(.)){2} a NOMATCH 26 | E ((..)|(.)){3} a NOMATCH 27 | 28 | E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) 29 | 30 | E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) 31 | E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) 32 | E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH 33 | 34 | E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) 
35 | E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) 36 | E ((..)|(.)){3} aa NOMATCH 37 | 38 | E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) 39 | 40 | E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) 41 | E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) 42 | E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) 43 | 44 | E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) 45 | #E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) 46 | E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 47 | E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) 48 | 49 | #E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) 50 | E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 51 | 52 | E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) 53 | E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 54 | E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) 55 | 56 | E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) 57 | E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) 58 | #E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) 59 | E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go 60 | 61 | E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) 62 | 63 | E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) 64 | E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 65 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) 66 | 67 | E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) 68 | E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) 69 | #E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) 70 | E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 71 | 72 | #E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) 73 | E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 74 | 75 | E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) 76 | E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 77 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) 78 | 79 | E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) 80 | E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) 81 | E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) 82 | 83 | E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 84 | -------------------------------------------------------------------------------- /tests/data/fowler/nullsubexpr.toml: -------------------------------------------------------------------------------- 1 | # !!! DO NOT EDIT !!! 2 | # Automatically generated by scripts/fowler-to-toml. 3 | # Numbers in the test names correspond to the line number of the test from 4 | # the original dat file.
5 | 6 | [[tests]] 7 | name = "nullsubexpr3" 8 | regex = '''(a*)*''' 9 | input = '''a''' 10 | captures = [[[0, 1], [0, 1]]] 11 | match_limit = 1 12 | unescape = true 13 | 14 | [[tests]] 15 | name = "nullsubexpr5" 16 | regex = '''(a*)*''' 17 | input = '''x''' 18 | captures = [[[0, 0], []]] 19 | match_limit = 1 20 | unescape = true 21 | 22 | [[tests]] 23 | name = "nullsubexpr6" 24 | regex = '''(a*)*''' 25 | input = '''aaaaaa''' 26 | captures = [[[0, 6], [0, 6]]] 27 | match_limit = 1 28 | unescape = true 29 | 30 | [[tests]] 31 | name = "nullsubexpr7" 32 | regex = '''(a*)*''' 33 | input = '''aaaaaax''' 34 | captures = [[[0, 6], [0, 6]]] 35 | match_limit = 1 36 | unescape = true 37 | 38 | [[tests]] 39 | name = "nullsubexpr8" 40 | regex = '''(a*)+''' 41 | input = '''a''' 42 | captures = [[[0, 1], [0, 1]]] 43 | match_limit = 1 44 | unescape = true 45 | 46 | [[tests]] 47 | name = "nullsubexpr9" 48 | regex = '''(a*)+''' 49 | input = '''x''' 50 | captures = [[[0, 0], [0, 0]]] 51 | match_limit = 1 52 | unescape = true 53 | 54 | [[tests]] 55 | name = "nullsubexpr10" 56 | regex = '''(a*)+''' 57 | input = '''aaaaaa''' 58 | captures = [[[0, 6], [0, 6]]] 59 | match_limit = 1 60 | unescape = true 61 | 62 | [[tests]] 63 | name = "nullsubexpr11" 64 | regex = '''(a*)+''' 65 | input = '''aaaaaax''' 66 | captures = [[[0, 6], [0, 6]]] 67 | match_limit = 1 68 | unescape = true 69 | 70 | [[tests]] 71 | name = "nullsubexpr12" 72 | regex = '''(a+)*''' 73 | input = '''a''' 74 | captures = [[[0, 1], [0, 1]]] 75 | match_limit = 1 76 | unescape = true 77 | 78 | [[tests]] 79 | name = "nullsubexpr13" 80 | regex = '''(a+)*''' 81 | input = '''x''' 82 | captures = [[[0, 0]]] 83 | match_limit = 1 84 | unescape = true 85 | 86 | [[tests]] 87 | name = "nullsubexpr14" 88 | regex = '''(a+)*''' 89 | input = '''aaaaaa''' 90 | captures = [[[0, 6], [0, 6]]] 91 | match_limit = 1 92 | unescape = true 93 | 94 | [[tests]] 95 | name = "nullsubexpr15" 96 | regex = '''(a+)*''' 97 | input = '''aaaaaax''' 98 | captures = [[[0, 6], [0, 6]]] 99 | match_limit = 1 100 | unescape = true 101 | 102 | [[tests]] 103 | name = "nullsubexpr16" 104 | regex = '''(a+)+''' 105 | input = '''a''' 106 | captures = [[[0, 1], [0, 1]]] 107 | match_limit = 1 108 | unescape = true 109 | 110 | [[tests]] 111 | name = "nullsubexpr17" 112 | regex = '''(a+)+''' 113 | input = '''x''' 114 | captures = [] 115 | match_limit = 1 116 | unescape = true 117 | 118 | [[tests]] 119 | name = "nullsubexpr18" 120 | regex = '''(a+)+''' 121 | input = '''aaaaaa''' 122 | captures = [[[0, 6], [0, 6]]] 123 | match_limit = 1 124 | unescape = true 125 | 126 | [[tests]] 127 | name = "nullsubexpr19" 128 | regex = '''(a+)+''' 129 | input = '''aaaaaax''' 130 | captures = [[[0, 6], [0, 6]]] 131 | match_limit = 1 132 | unescape = true 133 | 134 | [[tests]] 135 | name = "nullsubexpr21" 136 | regex = '''([a]*)*''' 137 | input = '''a''' 138 | captures = [[[0, 1], [0, 1]]] 139 | match_limit = 1 140 | unescape = true 141 | 142 | [[tests]] 143 | name = "nullsubexpr23" 144 | regex = '''([a]*)*''' 145 | input = '''x''' 146 | captures = [[[0, 0], []]] 147 | match_limit = 1 148 | unescape = true 149 | 150 | [[tests]] 151 | name = "nullsubexpr24" 152 | regex = '''([a]*)*''' 153 | input = '''aaaaaa''' 154 | captures = [[[0, 6], [0, 6]]] 155 | match_limit = 1 156 | unescape = true 157 | 158 | [[tests]] 159 | name = "nullsubexpr25" 160 | regex = '''([a]*)*''' 161 | input = '''aaaaaax''' 162 | captures = [[[0, 6], [0, 6]]] 163 | match_limit = 1 164 | unescape = true 165 | 166 | [[tests]] 167 | name = 
"nullsubexpr26" 168 | regex = '''([a]*)+''' 169 | input = '''a''' 170 | captures = [[[0, 1], [0, 1]]] 171 | match_limit = 1 172 | unescape = true 173 | 174 | [[tests]] 175 | name = "nullsubexpr27" 176 | regex = '''([a]*)+''' 177 | input = '''x''' 178 | captures = [[[0, 0], [0, 0]]] 179 | match_limit = 1 180 | unescape = true 181 | 182 | [[tests]] 183 | name = "nullsubexpr28" 184 | regex = '''([a]*)+''' 185 | input = '''aaaaaa''' 186 | captures = [[[0, 6], [0, 6]]] 187 | match_limit = 1 188 | unescape = true 189 | 190 | [[tests]] 191 | name = "nullsubexpr29" 192 | regex = '''([a]*)+''' 193 | input = '''aaaaaax''' 194 | captures = [[[0, 6], [0, 6]]] 195 | match_limit = 1 196 | unescape = true 197 | 198 | [[tests]] 199 | name = "nullsubexpr30" 200 | regex = '''([^b]*)*''' 201 | input = '''a''' 202 | captures = [[[0, 1], [0, 1]]] 203 | match_limit = 1 204 | unescape = true 205 | 206 | [[tests]] 207 | name = "nullsubexpr32" 208 | regex = '''([^b]*)*''' 209 | input = '''b''' 210 | captures = [[[0, 0], []]] 211 | match_limit = 1 212 | unescape = true 213 | 214 | [[tests]] 215 | name = "nullsubexpr33" 216 | regex = '''([^b]*)*''' 217 | input = '''aaaaaa''' 218 | captures = [[[0, 6], [0, 6]]] 219 | match_limit = 1 220 | unescape = true 221 | 222 | [[tests]] 223 | name = "nullsubexpr34" 224 | regex = '''([^b]*)*''' 225 | input = '''aaaaaab''' 226 | captures = [[[0, 6], [0, 6]]] 227 | match_limit = 1 228 | unescape = true 229 | 230 | [[tests]] 231 | name = "nullsubexpr35" 232 | regex = '''([ab]*)*''' 233 | input = '''a''' 234 | captures = [[[0, 1], [0, 1]]] 235 | match_limit = 1 236 | unescape = true 237 | 238 | [[tests]] 239 | name = "nullsubexpr36" 240 | regex = '''([ab]*)*''' 241 | input = '''aaaaaa''' 242 | captures = [[[0, 6], [0, 6]]] 243 | match_limit = 1 244 | unescape = true 245 | 246 | [[tests]] 247 | name = "nullsubexpr37" 248 | regex = '''([ab]*)*''' 249 | input = '''ababab''' 250 | captures = [[[0, 6], [0, 6]]] 251 | match_limit = 1 252 | unescape = true 253 | 254 | [[tests]] 255 | name = "nullsubexpr38" 256 | regex = '''([ab]*)*''' 257 | input = '''bababa''' 258 | captures = [[[0, 6], [0, 6]]] 259 | match_limit = 1 260 | unescape = true 261 | 262 | [[tests]] 263 | name = "nullsubexpr39" 264 | regex = '''([ab]*)*''' 265 | input = '''b''' 266 | captures = [[[0, 1], [0, 1]]] 267 | match_limit = 1 268 | unescape = true 269 | 270 | [[tests]] 271 | name = "nullsubexpr40" 272 | regex = '''([ab]*)*''' 273 | input = '''bbbbbb''' 274 | captures = [[[0, 6], [0, 6]]] 275 | match_limit = 1 276 | unescape = true 277 | 278 | [[tests]] 279 | name = "nullsubexpr41" 280 | regex = '''([ab]*)*''' 281 | input = '''aaaabcde''' 282 | captures = [[[0, 5], [0, 5]]] 283 | match_limit = 1 284 | unescape = true 285 | 286 | [[tests]] 287 | name = "nullsubexpr42" 288 | regex = '''([^a]*)*''' 289 | input = '''b''' 290 | captures = [[[0, 1], [0, 1]]] 291 | match_limit = 1 292 | unescape = true 293 | 294 | [[tests]] 295 | name = "nullsubexpr43" 296 | regex = '''([^a]*)*''' 297 | input = '''bbbbbb''' 298 | captures = [[[0, 6], [0, 6]]] 299 | match_limit = 1 300 | unescape = true 301 | 302 | [[tests]] 303 | name = "nullsubexpr45" 304 | regex = '''([^a]*)*''' 305 | input = '''aaaaaa''' 306 | captures = [[[0, 0], []]] 307 | match_limit = 1 308 | unescape = true 309 | 310 | [[tests]] 311 | name = "nullsubexpr46" 312 | regex = '''([^ab]*)*''' 313 | input = '''ccccxx''' 314 | captures = [[[0, 6], [0, 6]]] 315 | match_limit = 1 316 | unescape = true 317 | 318 | [[tests]] 319 | name = "nullsubexpr48" 320 | regex = 
'''([^ab]*)*''' 321 | input = '''ababab''' 322 | captures = [[[0, 0], []]] 323 | match_limit = 1 324 | unescape = true 325 | 326 | [[tests]] 327 | name = "nullsubexpr50" 328 | regex = '''((z)+|a)*''' 329 | input = '''zabcde''' 330 | captures = [[[0, 2], [1, 2]]] 331 | match_limit = 1 332 | unescape = true 333 | 334 | [[tests]] 335 | name = "nullsubexpr69" 336 | regex = '''(a*)*(x)''' 337 | input = '''x''' 338 | captures = [[[0, 1], [], [0, 1]]] 339 | match_limit = 1 340 | unescape = true 341 | 342 | [[tests]] 343 | name = "nullsubexpr70" 344 | regex = '''(a*)*(x)''' 345 | input = '''ax''' 346 | captures = [[[0, 2], [0, 1], [1, 2]]] 347 | match_limit = 1 348 | unescape = true 349 | 350 | [[tests]] 351 | name = "nullsubexpr71" 352 | regex = '''(a*)*(x)''' 353 | input = '''axa''' 354 | captures = [[[0, 2], [0, 1], [1, 2]]] 355 | match_limit = 1 356 | unescape = true 357 | 358 | [[tests]] 359 | name = "nullsubexpr73" 360 | regex = '''(a*)+(x)''' 361 | input = '''x''' 362 | captures = [[[0, 1], [0, 0], [0, 1]]] 363 | match_limit = 1 364 | unescape = true 365 | 366 | [[tests]] 367 | name = "nullsubexpr74" 368 | regex = '''(a*)+(x)''' 369 | input = '''ax''' 370 | captures = [[[0, 2], [0, 1], [1, 2]]] 371 | match_limit = 1 372 | unescape = true 373 | 374 | [[tests]] 375 | name = "nullsubexpr75" 376 | regex = '''(a*)+(x)''' 377 | input = '''axa''' 378 | captures = [[[0, 2], [0, 1], [1, 2]]] 379 | match_limit = 1 380 | unescape = true 381 | 382 | [[tests]] 383 | name = "nullsubexpr77" 384 | regex = '''(a*){2}(x)''' 385 | input = '''x''' 386 | captures = [[[0, 1], [0, 0], [0, 1]]] 387 | match_limit = 1 388 | unescape = true 389 | 390 | [[tests]] 391 | name = "nullsubexpr78" 392 | regex = '''(a*){2}(x)''' 393 | input = '''ax''' 394 | captures = [[[0, 2], [1, 1], [1, 2]]] 395 | match_limit = 1 396 | unescape = true 397 | 398 | [[tests]] 399 | name = "nullsubexpr79" 400 | regex = '''(a*){2}(x)''' 401 | input = '''axa''' 402 | captures = [[[0, 2], [1, 1], [1, 2]]] 403 | match_limit = 1 404 | unescape = true 405 | 406 | -------------------------------------------------------------------------------- /tests/data/fowler/repetition-expensive.toml: -------------------------------------------------------------------------------- 1 | # !!! DO NOT EDIT !!! 2 | # Automatically generated by scripts/fowler-to-toml. 3 | # Numbers in the test names correspond to the line number of the test from 4 | # the original dat file.
5 | 6 | [[tests]] 7 | name = "repetition-expensive12" 8 | regex = '''X(.?){0,}Y''' 9 | input = '''X1234567Y''' 10 | captures = [[[0, 9], [7, 8]]] 11 | match_limit = 1 12 | unescape = true 13 | 14 | [[tests]] 15 | name = "repetition-expensive13" 16 | regex = '''X(.?){1,}Y''' 17 | input = '''X1234567Y''' 18 | captures = [[[0, 9], [7, 8]]] 19 | match_limit = 1 20 | unescape = true 21 | 22 | [[tests]] 23 | name = "repetition-expensive14" 24 | regex = '''X(.?){2,}Y''' 25 | input = '''X1234567Y''' 26 | captures = [[[0, 9], [7, 8]]] 27 | match_limit = 1 28 | unescape = true 29 | 30 | [[tests]] 31 | name = "repetition-expensive15" 32 | regex = '''X(.?){3,}Y''' 33 | input = '''X1234567Y''' 34 | captures = [[[0, 9], [7, 8]]] 35 | match_limit = 1 36 | unescape = true 37 | 38 | [[tests]] 39 | name = "repetition-expensive16" 40 | regex = '''X(.?){4,}Y''' 41 | input = '''X1234567Y''' 42 | captures = [[[0, 9], [7, 8]]] 43 | match_limit = 1 44 | unescape = true 45 | 46 | [[tests]] 47 | name = "repetition-expensive17" 48 | regex = '''X(.?){5,}Y''' 49 | input = '''X1234567Y''' 50 | captures = [[[0, 9], [7, 8]]] 51 | match_limit = 1 52 | unescape = true 53 | 54 | [[tests]] 55 | name = "repetition-expensive18" 56 | regex = '''X(.?){6,}Y''' 57 | input = '''X1234567Y''' 58 | captures = [[[0, 9], [7, 8]]] 59 | match_limit = 1 60 | unescape = true 61 | 62 | [[tests]] 63 | name = "repetition-expensive19" 64 | regex = '''X(.?){7,}Y''' 65 | input = '''X1234567Y''' 66 | captures = [[[0, 9], [7, 8]]] 67 | match_limit = 1 68 | unescape = true 69 | 70 | [[tests]] 71 | name = "repetition-expensive20" 72 | regex = '''X(.?){8,}Y''' 73 | input = '''X1234567Y''' 74 | captures = [[[0, 9], [8, 8]]] 75 | match_limit = 1 76 | unescape = true 77 | 78 | [[tests]] 79 | name = "repetition-expensive22" 80 | regex = '''X(.?){0,8}Y''' 81 | input = '''X1234567Y''' 82 | captures = [[[0, 9], [8, 8]]] 83 | match_limit = 1 84 | unescape = true 85 | 86 | [[tests]] 87 | name = "repetition-expensive24" 88 | regex = '''X(.?){1,8}Y''' 89 | input = '''X1234567Y''' 90 | captures = [[[0, 9], [8, 8]]] 91 | match_limit = 1 92 | unescape = true 93 | 94 | [[tests]] 95 | name = "repetition-expensive26" 96 | regex = '''X(.?){2,8}Y''' 97 | input = '''X1234567Y''' 98 | captures = [[[0, 9], [8, 8]]] 99 | match_limit = 1 100 | unescape = true 101 | 102 | [[tests]] 103 | name = "repetition-expensive28" 104 | regex = '''X(.?){3,8}Y''' 105 | input = '''X1234567Y''' 106 | captures = [[[0, 9], [8, 8]]] 107 | match_limit = 1 108 | unescape = true 109 | 110 | [[tests]] 111 | name = "repetition-expensive30" 112 | regex = '''X(.?){4,8}Y''' 113 | input = '''X1234567Y''' 114 | captures = [[[0, 9], [8, 8]]] 115 | match_limit = 1 116 | unescape = true 117 | 118 | [[tests]] 119 | name = "repetition-expensive32" 120 | regex = '''X(.?){5,8}Y''' 121 | input = '''X1234567Y''' 122 | captures = [[[0, 9], [8, 8]]] 123 | match_limit = 1 124 | unescape = true 125 | 126 | [[tests]] 127 | name = "repetition-expensive34" 128 | regex = '''X(.?){6,8}Y''' 129 | input = '''X1234567Y''' 130 | captures = [[[0, 9], [8, 8]]] 131 | match_limit = 1 132 | unescape = true 133 | 134 | [[tests]] 135 | name = "repetition-expensive36" 136 | regex = '''X(.?){7,8}Y''' 137 | input = '''X1234567Y''' 138 | captures = [[[0, 9], [8, 8]]] 139 | match_limit = 1 140 | unescape = true 141 | 142 | [[tests]] 143 | name = "repetition-expensive37" 144 | regex = '''X(.?){8,8}Y''' 145 | input = '''X1234567Y''' 146 | captures = [[[0, 9], [8, 8]]] 147 | match_limit = 1 148 | unescape = true 149 | 150 | [[tests]] 151 | 
name = "repetition-expensive48" 152 | regex = '''(a|ab|c|bcd){0,}(d*)''' 153 | input = '''ababcd''' 154 | captures = [[[0, 1], [0, 1], [1, 1]]] 155 | match_limit = 1 156 | unescape = true 157 | 158 | [[tests]] 159 | name = "repetition-expensive49" 160 | regex = '''(a|ab|c|bcd){1,}(d*)''' 161 | input = '''ababcd''' 162 | captures = [[[0, 1], [0, 1], [1, 1]]] 163 | match_limit = 1 164 | unescape = true 165 | 166 | [[tests]] 167 | name = "repetition-expensive50" 168 | regex = '''(a|ab|c|bcd){2,}(d*)''' 169 | input = '''ababcd''' 170 | captures = [[[0, 6], [3, 6], [6, 6]]] 171 | match_limit = 1 172 | unescape = true 173 | 174 | [[tests]] 175 | name = "repetition-expensive51" 176 | regex = '''(a|ab|c|bcd){3,}(d*)''' 177 | input = '''ababcd''' 178 | captures = [[[0, 6], [3, 6], [6, 6]]] 179 | match_limit = 1 180 | unescape = true 181 | 182 | [[tests]] 183 | name = "repetition-expensive52" 184 | regex = '''(a|ab|c|bcd){4,}(d*)''' 185 | input = '''ababcd''' 186 | captures = [] 187 | match_limit = 1 188 | unescape = true 189 | 190 | [[tests]] 191 | name = "repetition-expensive53" 192 | regex = '''(a|ab|c|bcd){0,10}(d*)''' 193 | input = '''ababcd''' 194 | captures = [[[0, 1], [0, 1], [1, 1]]] 195 | match_limit = 1 196 | unescape = true 197 | 198 | [[tests]] 199 | name = "repetition-expensive54" 200 | regex = '''(a|ab|c|bcd){1,10}(d*)''' 201 | input = '''ababcd''' 202 | captures = [[[0, 1], [0, 1], [1, 1]]] 203 | match_limit = 1 204 | unescape = true 205 | 206 | [[tests]] 207 | name = "repetition-expensive55" 208 | regex = '''(a|ab|c|bcd){2,10}(d*)''' 209 | input = '''ababcd''' 210 | captures = [[[0, 6], [3, 6], [6, 6]]] 211 | match_limit = 1 212 | unescape = true 213 | 214 | [[tests]] 215 | name = "repetition-expensive56" 216 | regex = '''(a|ab|c|bcd){3,10}(d*)''' 217 | input = '''ababcd''' 218 | captures = [[[0, 6], [3, 6], [6, 6]]] 219 | match_limit = 1 220 | unescape = true 221 | 222 | [[tests]] 223 | name = "repetition-expensive57" 224 | regex = '''(a|ab|c|bcd){4,10}(d*)''' 225 | input = '''ababcd''' 226 | captures = [] 227 | match_limit = 1 228 | unescape = true 229 | 230 | [[tests]] 231 | name = "repetition-expensive58" 232 | regex = '''(a|ab|c|bcd)*(d*)''' 233 | input = '''ababcd''' 234 | captures = [[[0, 1], [0, 1], [1, 1]]] 235 | match_limit = 1 236 | unescape = true 237 | 238 | [[tests]] 239 | name = "repetition-expensive59" 240 | regex = '''(a|ab|c|bcd)+(d*)''' 241 | input = '''ababcd''' 242 | captures = [[[0, 1], [0, 1], [1, 1]]] 243 | match_limit = 1 244 | unescape = true 245 | 246 | [[tests]] 247 | name = "repetition-expensive65" 248 | regex = '''(ab|a|c|bcd){0,}(d*)''' 249 | input = '''ababcd''' 250 | captures = [[[0, 6], [4, 5], [5, 6]]] 251 | match_limit = 1 252 | unescape = true 253 | 254 | [[tests]] 255 | name = "repetition-expensive67" 256 | regex = '''(ab|a|c|bcd){1,}(d*)''' 257 | input = '''ababcd''' 258 | captures = [[[0, 6], [4, 5], [5, 6]]] 259 | match_limit = 1 260 | unescape = true 261 | 262 | [[tests]] 263 | name = "repetition-expensive69" 264 | regex = '''(ab|a|c|bcd){2,}(d*)''' 265 | input = '''ababcd''' 266 | captures = [[[0, 6], [4, 5], [5, 6]]] 267 | match_limit = 1 268 | unescape = true 269 | 270 | [[tests]] 271 | name = "repetition-expensive71" 272 | regex = '''(ab|a|c|bcd){3,}(d*)''' 273 | input = '''ababcd''' 274 | captures = [[[0, 6], [4, 5], [5, 6]]] 275 | match_limit = 1 276 | unescape = true 277 | 278 | [[tests]] 279 | name = "repetition-expensive72" 280 | regex = '''(ab|a|c|bcd){4,}(d*)''' 281 | input = '''ababcd''' 282 | captures = [] 283 | match_limit = 1 
284 | unescape = true 285 | 286 | [[tests]] 287 | name = "repetition-expensive74" 288 | regex = '''(ab|a|c|bcd){0,10}(d*)''' 289 | input = '''ababcd''' 290 | captures = [[[0, 6], [4, 5], [5, 6]]] 291 | match_limit = 1 292 | unescape = true 293 | 294 | [[tests]] 295 | name = "repetition-expensive76" 296 | regex = '''(ab|a|c|bcd){1,10}(d*)''' 297 | input = '''ababcd''' 298 | captures = [[[0, 6], [4, 5], [5, 6]]] 299 | match_limit = 1 300 | unescape = true 301 | 302 | [[tests]] 303 | name = "repetition-expensive78" 304 | regex = '''(ab|a|c|bcd){2,10}(d*)''' 305 | input = '''ababcd''' 306 | captures = [[[0, 6], [4, 5], [5, 6]]] 307 | match_limit = 1 308 | unescape = true 309 | 310 | [[tests]] 311 | name = "repetition-expensive80" 312 | regex = '''(ab|a|c|bcd){3,10}(d*)''' 313 | input = '''ababcd''' 314 | captures = [[[0, 6], [4, 5], [5, 6]]] 315 | match_limit = 1 316 | unescape = true 317 | 318 | [[tests]] 319 | name = "repetition-expensive81" 320 | regex = '''(ab|a|c|bcd){4,10}(d*)''' 321 | input = '''ababcd''' 322 | captures = [] 323 | match_limit = 1 324 | unescape = true 325 | 326 | [[tests]] 327 | name = "repetition-expensive83" 328 | regex = '''(ab|a|c|bcd)*(d*)''' 329 | input = '''ababcd''' 330 | captures = [[[0, 6], [4, 5], [5, 6]]] 331 | match_limit = 1 332 | unescape = true 333 | 334 | [[tests]] 335 | name = "repetition-expensive85" 336 | regex = '''(ab|a|c|bcd)+(d*)''' 337 | input = '''ababcd''' 338 | captures = [[[0, 6], [4, 5], [5, 6]]] 339 | match_limit = 1 340 | unescape = true 341 | 342 | -------------------------------------------------------------------------------- /tests/data/fowler/repetition-long.toml: -------------------------------------------------------------------------------- 1 | # !!! DO NOT EDIT !!! 2 | # Automatically generated by scripts/fowler-to-toml. 3 | # Numbers in the test names correspond to the line number of the test from 4 | # the original dat file.
5 | 6 | [[tests]] 7 | name = "repetition-long12" 8 | regex = '''X(.?){0,}Y''' 9 | input = '''X1234567Y''' 10 | captures = [[[0, 9], [7, 8]]] 11 | match_limit = 1 12 | unescape = true 13 | 14 | [[tests]] 15 | name = "repetition-long13" 16 | regex = '''X(.?){1,}Y''' 17 | input = '''X1234567Y''' 18 | captures = [[[0, 9], [7, 8]]] 19 | match_limit = 1 20 | unescape = true 21 | 22 | [[tests]] 23 | name = "repetition-long14" 24 | regex = '''X(.?){2,}Y''' 25 | input = '''X1234567Y''' 26 | captures = [[[0, 9], [7, 8]]] 27 | match_limit = 1 28 | unescape = true 29 | 30 | [[tests]] 31 | name = "repetition-long15" 32 | regex = '''X(.?){3,}Y''' 33 | input = '''X1234567Y''' 34 | captures = [[[0, 9], [7, 8]]] 35 | match_limit = 1 36 | unescape = true 37 | 38 | [[tests]] 39 | name = "repetition-long16" 40 | regex = '''X(.?){4,}Y''' 41 | input = '''X1234567Y''' 42 | captures = [[[0, 9], [7, 8]]] 43 | match_limit = 1 44 | unescape = true 45 | 46 | [[tests]] 47 | name = "repetition-long17" 48 | regex = '''X(.?){5,}Y''' 49 | input = '''X1234567Y''' 50 | captures = [[[0, 9], [7, 8]]] 51 | match_limit = 1 52 | unescape = true 53 | 54 | [[tests]] 55 | name = "repetition-long18" 56 | regex = '''X(.?){6,}Y''' 57 | input = '''X1234567Y''' 58 | captures = [[[0, 9], [7, 8]]] 59 | match_limit = 1 60 | unescape = true 61 | 62 | [[tests]] 63 | name = "repetition-long19" 64 | regex = '''X(.?){7,}Y''' 65 | input = '''X1234567Y''' 66 | captures = [[[0, 9], [7, 8]]] 67 | match_limit = 1 68 | unescape = true 69 | 70 | [[tests]] 71 | name = "repetition-long20" 72 | regex = '''X(.?){8,}Y''' 73 | input = '''X1234567Y''' 74 | captures = [[[0, 9], [8, 8]]] 75 | match_limit = 1 76 | unescape = true 77 | 78 | [[tests]] 79 | name = "repetition-long22" 80 | regex = '''X(.?){0,8}Y''' 81 | input = '''X1234567Y''' 82 | captures = [[[0, 9], [8, 8]]] 83 | match_limit = 1 84 | unescape = true 85 | 86 | [[tests]] 87 | name = "repetition-long24" 88 | regex = '''X(.?){1,8}Y''' 89 | input = '''X1234567Y''' 90 | captures = [[[0, 9], [8, 8]]] 91 | match_limit = 1 92 | unescape = true 93 | 94 | [[tests]] 95 | name = "repetition-long26" 96 | regex = '''X(.?){2,8}Y''' 97 | input = '''X1234567Y''' 98 | captures = [[[0, 9], [8, 8]]] 99 | match_limit = 1 100 | unescape = true 101 | 102 | [[tests]] 103 | name = "repetition-long28" 104 | regex = '''X(.?){3,8}Y''' 105 | input = '''X1234567Y''' 106 | captures = [[[0, 9], [8, 8]]] 107 | match_limit = 1 108 | unescape = true 109 | 110 | [[tests]] 111 | name = "repetition-long30" 112 | regex = '''X(.?){4,8}Y''' 113 | input = '''X1234567Y''' 114 | captures = [[[0, 9], [8, 8]]] 115 | match_limit = 1 116 | unescape = true 117 | 118 | [[tests]] 119 | name = "repetition-long32" 120 | regex = '''X(.?){5,8}Y''' 121 | input = '''X1234567Y''' 122 | captures = [[[0, 9], [8, 8]]] 123 | match_limit = 1 124 | unescape = true 125 | 126 | [[tests]] 127 | name = "repetition-long34" 128 | regex = '''X(.?){6,8}Y''' 129 | input = '''X1234567Y''' 130 | captures = [[[0, 9], [8, 8]]] 131 | match_limit = 1 132 | unescape = true 133 | 134 | [[tests]] 135 | name = "repetition-long36" 136 | regex = '''X(.?){7,8}Y''' 137 | input = '''X1234567Y''' 138 | captures = [[[0, 9], [8, 8]]] 139 | match_limit = 1 140 | unescape = true 141 | 142 | [[tests]] 143 | name = "repetition-long37" 144 | regex = '''X(.?){8,8}Y''' 145 | input = '''X1234567Y''' 146 | captures = [[[0, 9], [8, 8]]] 147 | match_limit = 1 148 | unescape = true 149 | 150 | [[tests]] 151 | name = "repetition-long48" 152 | regex = '''(a|ab|c|bcd){0,}(d*)''' 153 | input = 
'''ababcd''' 154 | captures = [[[0, 1], [0, 1], [1, 1]]] 155 | match_limit = 1 156 | unescape = true 157 | 158 | [[tests]] 159 | name = "repetition-long49" 160 | regex = '''(a|ab|c|bcd){1,}(d*)''' 161 | input = '''ababcd''' 162 | captures = [[[0, 1], [0, 1], [1, 1]]] 163 | match_limit = 1 164 | unescape = true 165 | 166 | [[tests]] 167 | name = "repetition-long50" 168 | regex = '''(a|ab|c|bcd){2,}(d*)''' 169 | input = '''ababcd''' 170 | captures = [[[0, 6], [3, 6], [6, 6]]] 171 | match_limit = 1 172 | unescape = true 173 | 174 | [[tests]] 175 | name = "repetition-long51" 176 | regex = '''(a|ab|c|bcd){3,}(d*)''' 177 | input = '''ababcd''' 178 | captures = [[[0, 6], [3, 6], [6, 6]]] 179 | match_limit = 1 180 | unescape = true 181 | 182 | [[tests]] 183 | name = "repetition-long52" 184 | regex = '''(a|ab|c|bcd){4,}(d*)''' 185 | input = '''ababcd''' 186 | captures = [] 187 | match_limit = 1 188 | unescape = true 189 | 190 | [[tests]] 191 | name = "repetition-long53" 192 | regex = '''(a|ab|c|bcd){0,10}(d*)''' 193 | input = '''ababcd''' 194 | captures = [[[0, 1], [0, 1], [1, 1]]] 195 | match_limit = 1 196 | unescape = true 197 | 198 | [[tests]] 199 | name = "repetition-long54" 200 | regex = '''(a|ab|c|bcd){1,10}(d*)''' 201 | input = '''ababcd''' 202 | captures = [[[0, 1], [0, 1], [1, 1]]] 203 | match_limit = 1 204 | unescape = true 205 | 206 | [[tests]] 207 | name = "repetition-long55" 208 | regex = '''(a|ab|c|bcd){2,10}(d*)''' 209 | input = '''ababcd''' 210 | captures = [[[0, 6], [3, 6], [6, 6]]] 211 | match_limit = 1 212 | unescape = true 213 | 214 | [[tests]] 215 | name = "repetition-long56" 216 | regex = '''(a|ab|c|bcd){3,10}(d*)''' 217 | input = '''ababcd''' 218 | captures = [[[0, 6], [3, 6], [6, 6]]] 219 | match_limit = 1 220 | unescape = true 221 | 222 | [[tests]] 223 | name = "repetition-long57" 224 | regex = '''(a|ab|c|bcd){4,10}(d*)''' 225 | input = '''ababcd''' 226 | captures = [] 227 | match_limit = 1 228 | unescape = true 229 | 230 | [[tests]] 231 | name = "repetition-long58" 232 | regex = '''(a|ab|c|bcd)*(d*)''' 233 | input = '''ababcd''' 234 | captures = [[[0, 1], [0, 1], [1, 1]]] 235 | match_limit = 1 236 | unescape = true 237 | 238 | [[tests]] 239 | name = "repetition-long59" 240 | regex = '''(a|ab|c|bcd)+(d*)''' 241 | input = '''ababcd''' 242 | captures = [[[0, 1], [0, 1], [1, 1]]] 243 | match_limit = 1 244 | unescape = true 245 | 246 | [[tests]] 247 | name = "repetition-long65" 248 | regex = '''(ab|a|c|bcd){0,}(d*)''' 249 | input = '''ababcd''' 250 | captures = [[[0, 6], [4, 5], [5, 6]]] 251 | match_limit = 1 252 | unescape = true 253 | 254 | [[tests]] 255 | name = "repetition-long67" 256 | regex = '''(ab|a|c|bcd){1,}(d*)''' 257 | input = '''ababcd''' 258 | captures = [[[0, 6], [4, 5], [5, 6]]] 259 | match_limit = 1 260 | unescape = true 261 | 262 | [[tests]] 263 | name = "repetition-long69" 264 | regex = '''(ab|a|c|bcd){2,}(d*)''' 265 | input = '''ababcd''' 266 | captures = [[[0, 6], [4, 5], [5, 6]]] 267 | match_limit = 1 268 | unescape = true 269 | 270 | [[tests]] 271 | name = "repetition-long71" 272 | regex = '''(ab|a|c|bcd){3,}(d*)''' 273 | input = '''ababcd''' 274 | captures = [[[0, 6], [4, 5], [5, 6]]] 275 | match_limit = 1 276 | unescape = true 277 | 278 | [[tests]] 279 | name = "repetition-long72" 280 | regex = '''(ab|a|c|bcd){4,}(d*)''' 281 | input = '''ababcd''' 282 | captures = [] 283 | match_limit = 1 284 | unescape = true 285 | 286 | [[tests]] 287 | name = "repetition-long74" 288 | regex = '''(ab|a|c|bcd){0,10}(d*)''' 289 | input = '''ababcd''' 290 | captures = 
[[[0, 6], [4, 5], [5, 6]]] 291 | match_limit = 1 292 | unescape = true 293 | 294 | [[tests]] 295 | name = "repetition-long76" 296 | regex = '''(ab|a|c|bcd){1,10}(d*)''' 297 | input = '''ababcd''' 298 | captures = [[[0, 6], [4, 5], [5, 6]]] 299 | match_limit = 1 300 | unescape = true 301 | 302 | [[tests]] 303 | name = "repetition-long78" 304 | regex = '''(ab|a|c|bcd){2,10}(d*)''' 305 | input = '''ababcd''' 306 | captures = [[[0, 6], [4, 5], [5, 6]]] 307 | match_limit = 1 308 | unescape = true 309 | 310 | [[tests]] 311 | name = "repetition-long80" 312 | regex = '''(ab|a|c|bcd){3,10}(d*)''' 313 | input = '''ababcd''' 314 | captures = [[[0, 6], [4, 5], [5, 6]]] 315 | match_limit = 1 316 | unescape = true 317 | 318 | [[tests]] 319 | name = "repetition-long81" 320 | regex = '''(ab|a|c|bcd){4,10}(d*)''' 321 | input = '''ababcd''' 322 | captures = [] 323 | match_limit = 1 324 | unescape = true 325 | 326 | [[tests]] 327 | name = "repetition-long83" 328 | regex = '''(ab|a|c|bcd)*(d*)''' 329 | input = '''ababcd''' 330 | captures = [[[0, 6], [4, 5], [5, 6]]] 331 | match_limit = 1 332 | unescape = true 333 | 334 | [[tests]] 335 | name = "repetition-long85" 336 | regex = '''(ab|a|c|bcd)+(d*)''' 337 | input = '''ababcd''' 338 | captures = [[[0, 6], [4, 5], [5, 6]]] 339 | match_limit = 1 340 | unescape = true 341 | 342 | -------------------------------------------------------------------------------- /tests/data/iter.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "1" 3 | regex = "a" 4 | input = "aaa" 5 | matches = [[0, 1], [1, 2], [2, 3]] 6 | 7 | [[tests]] 8 | name = "2" 9 | regex = "a" 10 | input = "aba" 11 | matches = [[0, 1], [2, 3]] 12 | 13 | [[tests]] 14 | name = "empty1" 15 | regex = '' 16 | input = '' 17 | matches = [[0, 0]] 18 | 19 | [[tests]] 20 | name = "empty2" 21 | regex = '' 22 | input = 'abc' 23 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 24 | 25 | [[tests]] 26 | name = "empty3" 27 | regex = '()' 28 | input = 'abc' 29 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 30 | 31 | [[tests]] 32 | name = "empty4" 33 | regex = '()*' 34 | input = 'abc' 35 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 36 | 37 | [[tests]] 38 | name = "empty5" 39 | regex = '()+' 40 | input = 'abc' 41 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 42 | 43 | [[tests]] 44 | name = "empty6" 45 | regex = '()?' 46 | input = 'abc' 47 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 48 | 49 | [[tests]] 50 | name = "empty7" 51 | regex = '()()' 52 | input = 'abc' 53 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 54 | 55 | [[tests]] 56 | name = "empty8" 57 | regex = '()+|z' 58 | input = 'abc' 59 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 60 | 61 | [[tests]] 62 | name = "empty9" 63 | regex = 'z|()+' 64 | input = 'abc' 65 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 66 | 67 | [[tests]] 68 | name = "empty10" 69 | regex = '()+|b' 70 | input = 'abc' 71 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 72 | 73 | [[tests]] 74 | name = "empty11" 75 | regex = 'b|()+' 76 | input = 'abc' 77 | matches = [[0, 0], [1, 2], [3, 3]] 78 | 79 | [[tests]] 80 | name = "start1" 81 | regex = "^a" 82 | input = "a" 83 | matches = [[0, 1]] 84 | 85 | [[tests]] 86 | name = "start2" 87 | regex = "^a" 88 | input = "aa" 89 | matches = [[0, 1]] 90 | 91 | [[tests]] 92 | name = "anchored1" 93 | regex = "a" 94 | input = "a" 95 | matches = [[0, 1]] 96 | anchored = true 97 | 98 | # This test is pretty subtle. It demonstrates the crucial difference between 99 | # '^a' and 'a' compiled in 'anchored' mode. 
The former regex exclusively 100 | # matches at the start of a haystack and nowhere else. The latter regex has 101 | # no such restriction, but its automaton is constructed such that it lacks a 102 | # `.*?` prefix. So it can actually produce matches at multiple locations. 103 | # The anchored3 test drives this point home. 104 | [[tests]] 105 | name = "anchored2" 106 | regex = "a" 107 | input = "aa" 108 | matches = [[0, 1], [1, 2]] 109 | anchored = true 110 | 111 | # Unlike anchored2, this test stops matching anything after it sees `b` 112 | # since it lacks a `.*?` prefix. Because it is looking for 'a' but sees 'b', it 113 | # determines that there are no remaining matches. 114 | [[tests]] 115 | name = "anchored3" 116 | regex = "a" 117 | input = "aaba" 118 | matches = [[0, 1], [1, 2]] 119 | anchored = true 120 | -------------------------------------------------------------------------------- /tests/data/misc.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "ascii-literal" 3 | regex = "a" 4 | input = "a" 5 | matches = [[0, 1]] 6 | 7 | [[tests]] 8 | name = "ascii-literal-not" 9 | regex = "a" 10 | input = "z" 11 | matches = [] 12 | 13 | [[tests]] 14 | name = "ascii-literal-anchored" 15 | regex = "a" 16 | input = "a" 17 | matches = [[0, 1]] 18 | anchored = true 19 | 20 | [[tests]] 21 | name = "ascii-literal-anchored-not" 22 | regex = "a" 23 | input = "z" 24 | matches = [] 25 | anchored = true 26 | 27 | [[tests]] 28 | name = "anchor-start-end-line" 29 | regex = '(?m)^bar$' 30 | input = "foo\nbar\nbaz" 31 | matches = [[4, 7]] 32 | 33 | [[tests]] 34 | name = "prefix-literal-match" 35 | regex = '^abc' 36 | input = "abc" 37 | matches = [[0, 3]] 38 | 39 | [[tests]] 40 | name = "prefix-literal-match-ascii" 41 | regex = '^abc' 42 | input = "abc" 43 | matches = [[0, 3]] 44 | unicode = false 45 | utf8 = false 46 | 47 | [[tests]] 48 | name = "prefix-literal-no-match" 49 | regex = '^abc' 50 | input = "zabc" 51 | matches = [] 52 | 53 | [[tests]] 54 | name = "one-literal-edge" 55 | regex = 'abc' 56 | input = "xxxxxab" 57 | matches = [] 58 | 59 | [[tests]] 60 | name = "terminates" 61 | regex = 'a$' 62 | input = "a" 63 | matches = [[0, 1]] 64 | 65 | [[tests]] 66 | name = "suffix-100" 67 | regex = '.*abcd' 68 | input = "abcd" 69 | matches = [[0, 4]] 70 | 71 | [[tests]] 72 | name = "suffix-200" 73 | regex = '.*(?:abcd)+' 74 | input = "abcd" 75 | matches = [[0, 4]] 76 | 77 | [[tests]] 78 | name = "suffix-300" 79 | regex = '.*(?:abcd)+' 80 | input = "abcdabcd" 81 | matches = [[0, 8]] 82 | 83 | [[tests]] 84 | name = "suffix-400" 85 | regex = '.*(?:abcd)+' 86 | input = "abcdxabcd" 87 | matches = [[0, 9]] 88 | 89 | [[tests]] 90 | name = "suffix-500" 91 | regex = '.*x(?:abcd)+' 92 | input = "abcdxabcd" 93 | matches = [[0, 9]] 94 | 95 | [[tests]] 96 | name = "suffix-600" 97 | regex = '[^abcd]*x(?:abcd)+' 98 | input = "abcdxabcd" 99 | matches = [[4, 9]] 100 | -------------------------------------------------------------------------------- /tests/data/multiline.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "basic1" 3 | regex = '(?m)^[a-z]+$' 4 | input = "abc\ndef\nxyz" 5 | matches = [[0, 3], [4, 7], [8, 11]] 6 | 7 | [[tests]] 8 | name = "basic2" 9 | regex = '(?m)^$' 10 | input = "abc\ndef\nxyz" 11 | matches = [] 12 | 13 | [[tests]] 14 | name = "basic3" 15 | regex = '(?m)^' 16 | input = "abc\ndef\nxyz" 17 | matches = [[0, 0], [4, 4], [8, 8]] 18 | 19 | [[tests]] 20 | name = "basic4" 21 | regex
= '(?m)$' 22 | input = "abc\ndef\nxyz" 23 | matches = [[3, 3], [7, 7], [11, 11]] 24 | 25 | [[tests]] 26 | name = "basic5" 27 | regex = '(?m)^[a-z]' 28 | input = "abc\ndef\nxyz" 29 | matches = [[0, 1], [4, 5], [8, 9]] 30 | 31 | [[tests]] 32 | name = "basic6" 33 | regex = '(?m)[a-z]^' 34 | input = "abc\ndef\nxyz" 35 | matches = [] 36 | 37 | [[tests]] 38 | name = "basic7" 39 | regex = '(?m)[a-z]$' 40 | input = "abc\ndef\nxyz" 41 | matches = [[2, 3], [6, 7], [10, 11]] 42 | 43 | [[tests]] 44 | name = "basic8" 45 | regex = '(?m)$[a-z]' 46 | input = "abc\ndef\nxyz" 47 | matches = [] 48 | 49 | [[tests]] 50 | name = "basic9" 51 | regex = '(?m)^$' 52 | input = "" 53 | matches = [[0, 0]] 54 | 55 | [[tests]] 56 | name = "repeat1" 57 | regex = '(?m)(?:^$)*' 58 | input = "a\nb\nc" 59 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] 60 | 61 | [[tests]] 62 | name = "repeat1-no-multi" 63 | regex = '(?:^$)*' 64 | input = "a\nb\nc" 65 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] 66 | 67 | [[tests]] 68 | name = "repeat2" 69 | regex = '(?m)(?:^|a)+' 70 | input = "a\naaa\n" 71 | matches = [[0, 0], [2, 2], [3, 5], [6, 6]] 72 | 73 | [[tests]] 74 | name = "repeat100" 75 | regex = '(?m)(?:^|a)+' 76 | input = "a\naaa\n" 77 | matches = [[0, 0], [2, 2], [3, 5], [6, 6]] 78 | 79 | [[tests]] 80 | name = "repeat2-no-multi" 81 | regex = '(?:^|a)+' 82 | input = "a\naaa\n" 83 | matches = [[0, 0], [2, 5]] 84 | 85 | [[tests]] 86 | name = "repeat3" 87 | regex = '(?m)(?:^|a)*' 88 | input = "a\naaa\n" 89 | matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]] 90 | 91 | [[tests]] 92 | name = "repeat3-no-multi" 93 | regex = '(?:^|a)*' 94 | input = "a\naaa\n" 95 | matches = [[0, 0], [1, 1], [2, 5], [6, 6]] 96 | 97 | [[tests]] 98 | name = "repeat4" 99 | regex = '(?m)(?:^|a+)' 100 | input = "a\naaa\n" 101 | matches = [[0, 0], [2, 2], [3, 5], [6, 6]] 102 | 103 | [[tests]] 104 | name = "repeat4-no-multi" 105 | regex = '(?:^|a+)' 106 | input = "a\naaa\n" 107 | matches = [[0, 0], [2, 5]] 108 | 109 | [[tests]] 110 | name = "repeat5" 111 | regex = '(?m)(?:^|a*)' 112 | input = "a\naaa\n" 113 | matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]] 114 | 115 | [[tests]] 116 | name = "repeat5-no-multi" 117 | regex = '(?:^|a*)' 118 | input = "a\naaa\n" 119 | matches = [[0, 0], [1, 1], [2, 5], [6, 6]] 120 | 121 | [[tests]] 122 | name = "repeat6" 123 | regex = '(?m)(?:^[a-z])+' 124 | input = "abc\ndef\nxyz" 125 | matches = [[0, 1], [4, 5], [8, 9]] 126 | 127 | [[tests]] 128 | name = "repeat6-no-multi" 129 | regex = '(?:^[a-z])+' 130 | input = "abc\ndef\nxyz" 131 | matches = [[0, 1]] 132 | 133 | [[tests]] 134 | name = "repeat7" 135 | regex = '(?m)(?:^[a-z]{3}\n?)+' 136 | input = "abc\ndef\nxyz" 137 | matches = [[0, 11]] 138 | 139 | [[tests]] 140 | name = "repeat7-no-multi" 141 | regex = '(?:^[a-z]{3}\n?)+' 142 | input = "abc\ndef\nxyz" 143 | matches = [[0, 4]] 144 | 145 | [[tests]] 146 | name = "repeat8" 147 | regex = '(?m)(?:^[a-z]{3}\n?)*' 148 | input = "abc\ndef\nxyz" 149 | matches = [[0, 11]] 150 | 151 | [[tests]] 152 | name = "repeat8-no-multi" 153 | regex = '(?:^[a-z]{3}\n?)*' 154 | input = "abc\ndef\nxyz" 155 | matches = [[0, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11]] 156 | 157 | [[tests]] 158 | name = "repeat9" 159 | regex = '(?m)(?:\n?[a-z]{3}$)+' 160 | input = "abc\ndef\nxyz" 161 | matches = [[0, 11]] 162 | 163 | [[tests]] 164 | name = "repeat9-no-multi" 165 | regex = '(?:\n?[a-z]{3}$)+' 166 | input = "abc\ndef\nxyz" 167 | matches = [[7, 11]] 168 | 169 | [[tests]] 170 | name = "repeat10" 171 | 
regex = '(?m)(?:\n?[a-z]{3}$)*' 172 | input = "abc\ndef\nxyz" 173 | matches = [[0, 11]] 174 | 175 | [[tests]] 176 | name = "repeat10-no-multi" 177 | regex = '(?:\n?[a-z]{3}$)*' 178 | input = "abc\ndef\nxyz" 179 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 11]] 180 | 181 | [[tests]] 182 | name = "repeat11" 183 | regex = '(?m)^*' 184 | input = "\naa\n" 185 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] 186 | 187 | [[tests]] 188 | name = "repeat11-no-multi" 189 | regex = '^*' 190 | input = "\naa\n" 191 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] 192 | 193 | [[tests]] 194 | name = "repeat12" 195 | regex = '(?m)^+' 196 | input = "\naa\n" 197 | matches = [[0, 0], [1, 1], [4, 4]] 198 | 199 | [[tests]] 200 | name = "repeat12-no-multi" 201 | regex = '^+' 202 | input = "\naa\n" 203 | matches = [[0, 0]] 204 | 205 | [[tests]] 206 | name = "repeat13" 207 | regex = '(?m)$*' 208 | input = "\naa\n" 209 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] 210 | 211 | [[tests]] 212 | name = "repeat13-no-multi" 213 | regex = '$*' 214 | input = "\naa\n" 215 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] 216 | 217 | [[tests]] 218 | name = "repeat14" 219 | regex = '(?m)$+' 220 | input = "\naa\n" 221 | matches = [[0, 0], [3, 3], [4, 4]] 222 | 223 | [[tests]] 224 | name = "repeat14-no-multi" 225 | regex = '$+' 226 | input = "\naa\n" 227 | matches = [[4, 4]] 228 | 229 | [[tests]] 230 | name = "repeat15" 231 | regex = '(?m)(?:$\n)+' 232 | input = "\n\naaa\n\n" 233 | matches = [[0, 2], [5, 7]] 234 | 235 | [[tests]] 236 | name = "repeat15-no-multi" 237 | regex = '(?:$\n)+' 238 | input = "\n\naaa\n\n" 239 | matches = [] 240 | 241 | [[tests]] 242 | name = "repeat16" 243 | regex = '(?m)(?:$\n)*' 244 | input = "\n\naaa\n\n" 245 | matches = [[0, 2], [3, 3], [4, 4], [5, 7]] 246 | 247 | [[tests]] 248 | name = "repeat16-no-multi" 249 | regex = '(?:$\n)*' 250 | input = "\n\naaa\n\n" 251 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]] 252 | 253 | [[tests]] 254 | name = "repeat17" 255 | regex = '(?m)(?:$\n^)+' 256 | input = "\n\naaa\n\n" 257 | matches = [[0, 2], [5, 7]] 258 | 259 | [[tests]] 260 | name = "repeat17-no-multi" 261 | regex = '(?:$\n^)+' 262 | input = "\n\naaa\n\n" 263 | matches = [] 264 | 265 | [[tests]] 266 | name = "repeat18" 267 | regex = '(?m)(?:^|$)+' 268 | input = "\n\naaa\n\n" 269 | matches = [[0, 0], [1, 1], [2, 2], [5, 5], [6, 6], [7, 7]] 270 | 271 | [[tests]] 272 | name = "repeat18-no-multi" 273 | regex = '(?:^|$)+' 274 | input = "\n\naaa\n\n" 275 | matches = [[0, 0], [7, 7]] 276 | -------------------------------------------------------------------------------- /tests/data/no-unicode.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "invalid-utf8-literal1" 3 | regex = '\xFF' 4 | input = '\xFF' 5 | matches = [[0, 1]] 6 | unicode = false 7 | utf8 = false 8 | unescape = true 9 | 10 | 11 | [[tests]] 12 | name = "mixed" 13 | regex = '(.+)(?-u)(.+)' 14 | input = '\xCE\x93\xCE\x94\xFF' 15 | matches = [[0, 5]] 16 | utf8 = false 17 | unescape = true 18 | 19 | 20 | [[tests]] 21 | name = "case1" 22 | regex = "a" 23 | input = "A" 24 | matches = [[0, 1]] 25 | case_insensitive = true 26 | unicode = false 27 | 28 | [[tests]] 29 | name = "case2" 30 | regex = "[a-z]+" 31 | input = "AaAaA" 32 | matches = [[0, 5]] 33 | case_insensitive = true 34 | unicode = false 35 | 36 | [[tests]] 37 | name = "case3" 38 | regex = "[a-z]+" 39 | input = "aA\u212AaA" 40 | matches = [[0, 7]] 41 | 
case_insensitive = true 42 | 43 | [[tests]] 44 | name = "case4" 45 | regex = "[a-z]+" 46 | input = "aA\u212AaA" 47 | matches = [[0, 2], [5, 7]] 48 | case_insensitive = true 49 | unicode = false 50 | 51 | 52 | [[tests]] 53 | name = "negate1" 54 | regex = "[^a]" 55 | input = "δ" 56 | matches = [[0, 2]] 57 | 58 | [[tests]] 59 | name = "negate2" 60 | regex = "[^a]" 61 | input = "δ" 62 | matches = [[0, 1], [1, 2]] 63 | unicode = false 64 | utf8 = false 65 | 66 | 67 | [[tests]] 68 | name = "dotstar-prefix1" 69 | regex = "a" 70 | input = '\xFFa' 71 | matches = [[1, 2]] 72 | unicode = false 73 | utf8 = false 74 | unescape = true 75 | 76 | [[tests]] 77 | name = "dotstar-prefix2" 78 | regex = "a" 79 | input = '\xFFa' 80 | matches = [[1, 2]] 81 | utf8 = false 82 | unescape = true 83 | 84 | 85 | [[tests]] 86 | name = "null-bytes1" 87 | regex = '[^\x00]+\x00' 88 | input = 'foo\x00' 89 | matches = [[0, 4]] 90 | unicode = false 91 | utf8 = false 92 | unescape = true 93 | 94 | 95 | [[tests]] 96 | name = "word-ascii" 97 | regex = '\w+' 98 | input = "aδ" 99 | matches = [[0, 1]] 100 | unicode = false 101 | 102 | [[tests]] 103 | name = "word-unicode" 104 | regex = '\w+' 105 | input = "aδ" 106 | matches = [[0, 3]] 107 | 108 | [[tests]] 109 | name = "decimal-ascii" 110 | regex = '\d+' 111 | input = "1२३9" 112 | matches = [[0, 1], [7, 8]] 113 | unicode = false 114 | 115 | [[tests]] 116 | name = "decimal-unicode" 117 | regex = '\d+' 118 | input = "1२३9" 119 | matches = [[0, 8]] 120 | 121 | [[tests]] 122 | name = "space-ascii" 123 | regex = '\s+' 124 | input = " \u1680" 125 | matches = [[0, 1]] 126 | unicode = false 127 | 128 | [[tests]] 129 | name = "space-unicode" 130 | regex = '\s+' 131 | input = " \u1680" 132 | matches = [[0, 4]] 133 | 134 | 135 | [[tests]] 136 | # See: https://github.com/rust-lang/regex/issues/484 137 | name = "iter1-bytes" 138 | regex = '' 139 | input = "☃" 140 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 141 | utf8 = false 142 | 143 | [[tests]] 144 | # See: https://github.com/rust-lang/regex/issues/484 145 | name = "iter1-utf8" 146 | regex = '' 147 | input = "☃" 148 | matches = [[0, 0], [3, 3]] 149 | 150 | [[tests]] 151 | # See: https://github.com/rust-lang/regex/issues/484 152 | # Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8. 
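# (With 'unescape = true', the input below is the three bytes b, \xFF, r,
# so the empty regex matches at every byte offset from 0 through 3, on
# both sides of the invalid UTF-8 byte.)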
153 | name = "iter2-bytes" 154 | regex = '' 155 | input = 'b\xFFr' 156 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 157 | unescape = true 158 | utf8 = false 159 | -------------------------------------------------------------------------------- /tests/data/overlapping.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "repetition-plus-leftmost-first-100" 3 | regex = 'a+' 4 | input = "aaa" 5 | matches = [[0, 1], [0, 2], [0, 3]] 6 | match_kind = "leftmost-first" 7 | search_kind = "overlapping" 8 | 9 | [[tests]] 10 | name = "repetition-plus-all-100" 11 | regex = 'a+' 12 | input = "aaa" 13 | matches = [[0, 1], [0, 2], [0, 3]] 14 | match_kind = "all" 15 | search_kind = "overlapping" 16 | 17 | [[tests]] 18 | name = "repetition-plus-leftmost-first-200" 19 | regex = '(abc)+' 20 | input = "zzabcabczzabc" 21 | matches = [[2, 5], [2, 8]] 22 | match_kind = "leftmost-first" 23 | search_kind = "overlapping" 24 | 25 | [[tests]] 26 | name = "repetition-plus-all-200" 27 | regex = '(abc)+' 28 | input = "zzabcabczzabc" 29 | matches = [[2, 5], [2, 8], [10, 13]] 30 | match_kind = "all" 31 | search_kind = "overlapping" 32 | 33 | [[tests]] 34 | name = "repetition-star-leftmost-first-100" 35 | regex = 'a*' 36 | input = "aaa" 37 | matches = [[0, 0], [0, 1], [0, 2], [0, 3]] 38 | match_kind = "leftmost-first" 39 | search_kind = "overlapping" 40 | 41 | [[tests]] 42 | name = "repetition-star-all-100" 43 | regex = 'a*' 44 | input = "aaa" 45 | matches = [[0, 0], [0, 1], [0, 2], [0, 3]] 46 | match_kind = "all" 47 | search_kind = "overlapping" 48 | 49 | [[tests]] 50 | name = "repetition-star-leftmost-first-200" 51 | regex = '(abc)*' 52 | input = "zzabcabczzabc" 53 | matches = [[0, 0]] 54 | match_kind = "leftmost-first" 55 | search_kind = "overlapping" 56 | 57 | [[tests]] 58 | name = "repetition-star-all-200" 59 | regex = '(abc)*' 60 | input = "zzabcabczzabc" 61 | matches = [ 62 | [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], 63 | [2, 5], 64 | [6, 6], [7, 7], 65 | [2, 8], 66 | [9, 9], [10, 10], [11, 11], [12, 12], 67 | [10, 13], 68 | ] 69 | match_kind = "all" 70 | search_kind = "overlapping" 71 | 72 | [[tests]] 73 | name = "start-end-rep-leftmost-first" 74 | regex = '(^$)*' 75 | input = "abc" 76 | matches = [[0, 0]] 77 | match_kind = "leftmost-first" 78 | search_kind = "overlapping" 79 | 80 | [[tests]] 81 | name = "start-end-rep-all" 82 | regex = '(^$)*' 83 | input = "abc" 84 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 85 | match_kind = "all" 86 | search_kind = "overlapping" 87 | 88 | [[tests]] 89 | name = "alt-leftmost-first-100" 90 | regex = 'abc|a' 91 | input = "zzabcazzaabc" 92 | matches = [[2, 3], [2, 5]] 93 | match_kind = "leftmost-first" 94 | search_kind = "overlapping" 95 | 96 | [[tests]] 97 | name = "alt-all-100" 98 | regex = 'abc|a' 99 | input = "zzabcazzaabc" 100 | matches = [[2, 3], [2, 5], [5, 6], [8, 9], [9, 10], [9, 12]] 101 | match_kind = "all" 102 | search_kind = "overlapping" 103 | 104 | [[tests]] 105 | name = "empty-000" 106 | regex = "" 107 | input = "abc" 108 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 109 | match_kind = "all" 110 | search_kind = "overlapping" 111 | 112 | [[tests]] 113 | name = "empty-alt-000" 114 | regex = "|b" 115 | input = "abc" 116 | matches = [[0, 0], [1, 1], [1, 2], [3, 3]] 117 | match_kind = "all" 118 | search_kind = "overlapping" 119 | 120 | [[tests]] 121 | name = "empty-alt-010" 122 | regex = "b|" 123 | input = "abc" 124 | matches = [[0, 0], [1, 1], [1, 2], [3, 3]] 125 | match_kind = "all" 126 | search_kind = 
"overlapping" 127 | -------------------------------------------------------------------------------- /tests/dfa/api.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | 3 | use regex_automata::{ 4 | dfa::{dense, regex::Regex, Automaton, OverlappingState}, 5 | nfa::thompson, 6 | HalfMatch, MatchError, MatchKind, MultiMatch, 7 | }; 8 | 9 | use crate::util::{BunkPrefilter, SubstringPrefilter}; 10 | 11 | // Tests that quit bytes in the forward direction work correctly. 12 | #[test] 13 | fn quit_fwd() -> Result<(), Box> { 14 | let dfa = dense::Builder::new() 15 | .configure(dense::Config::new().quit(b'x', true)) 16 | .build("[[:word:]]+$")?; 17 | 18 | assert_eq!( 19 | dfa.find_earliest_fwd(b"abcxyz"), 20 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 21 | ); 22 | assert_eq!( 23 | dfa.find_leftmost_fwd(b"abcxyz"), 24 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 25 | ); 26 | assert_eq!( 27 | dfa.find_overlapping_fwd(b"abcxyz", &mut OverlappingState::start()), 28 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 29 | ); 30 | 31 | Ok(()) 32 | } 33 | 34 | // Tests that quit bytes in the reverse direction work correctly. 35 | #[test] 36 | fn quit_rev() -> Result<(), Box> { 37 | let dfa = dense::Builder::new() 38 | .configure(dense::Config::new().quit(b'x', true)) 39 | .thompson(thompson::Config::new().reverse(true)) 40 | .build("^[[:word:]]+")?; 41 | 42 | assert_eq!( 43 | dfa.find_earliest_rev(b"abcxyz"), 44 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 45 | ); 46 | assert_eq!( 47 | dfa.find_leftmost_rev(b"abcxyz"), 48 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 49 | ); 50 | 51 | Ok(()) 52 | } 53 | 54 | // Tests that if we heuristically enable Unicode word boundaries but then 55 | // instruct that a non-ASCII byte should NOT be a quit byte, then the builder 56 | // will panic. 57 | #[test] 58 | #[should_panic] 59 | fn quit_panics() { 60 | dense::Config::new().unicode_word_boundary(true).quit(b'\xFF', false); 61 | } 62 | 63 | // Tests that if we attempt an overlapping search using a regex without a 64 | // reverse DFA compiled with 'starts_for_each_pattern', then we get a panic. 65 | #[test] 66 | #[should_panic] 67 | fn incorrect_config_overlapping_search_panics() { 68 | let forward = dense::DFA::new(r"abca").unwrap(); 69 | let reverse = dense::Builder::new() 70 | .configure( 71 | dense::Config::new() 72 | .anchored(true) 73 | .match_kind(MatchKind::All) 74 | .starts_for_each_pattern(false), 75 | ) 76 | .thompson(thompson::Config::new().reverse(true)) 77 | .build(r"abca") 78 | .unwrap(); 79 | 80 | let re = Regex::builder().build_from_dfas(forward, reverse); 81 | let haystack = "bar abcabcabca abca foo".as_bytes(); 82 | re.find_overlapping(haystack, &mut OverlappingState::start()); 83 | } 84 | 85 | // This tests an intesting case where even if the Unicode word boundary option 86 | // is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode 87 | // word boundaries to be enabled. 88 | #[test] 89 | fn unicode_word_implicitly_works() -> Result<(), Box> { 90 | let mut config = dense::Config::new(); 91 | for b in 0x80..=0xFF { 92 | config = config.quit(b, true); 93 | } 94 | let dfa = dense::Builder::new().configure(config).build(r"\b")?; 95 | let expected = HalfMatch::must(0, 1); 96 | assert_eq!(dfa.find_leftmost_fwd(b" a"), Ok(Some(expected))); 97 | Ok(()) 98 | } 99 | 100 | // Tests that we can provide a prefilter to a Regex, and the search reports 101 | // correct results. 
102 | #[test] 103 | fn prefilter_works() -> Result<(), Box<dyn Error>> { 104 | let re = Regex::new(r"a[0-9]+") 105 | .unwrap() 106 | .with_prefilter(SubstringPrefilter::new("a")); 107 | let text = b"foo abc foo a1a2a3 foo a123 bar aa456"; 108 | let matches: Vec<(usize, usize)> = 109 | re.find_leftmost_iter(text).map(|m| (m.start(), m.end())).collect(); 110 | assert_eq!( 111 | matches, 112 | vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] 113 | ); 114 | Ok(()) 115 | } 116 | 117 | // This test confirms that a prefilter is active by using a prefilter that 118 | // reports false negatives. 119 | #[test] 120 | fn prefilter_is_active() -> Result<(), Box<dyn Error>> { 121 | let text = b"za123"; 122 | let re = Regex::new(r"a[0-9]+") 123 | .unwrap() 124 | .with_prefilter(SubstringPrefilter::new("a")); 125 | assert_eq!(re.find_leftmost(b"za123"), Some(MultiMatch::must(0, 1, 5))); 126 | assert_eq!(re.find_leftmost(b"a123"), Some(MultiMatch::must(0, 0, 4))); 127 | let re = re.with_prefilter(BunkPrefilter::new()); 128 | assert_eq!(re.find_leftmost(b"za123"), None); 129 | // This checks that the prefilter is used when first starting the search, 130 | // instead of waiting until at least one transition has occurred. 131 | assert_eq!(re.find_leftmost(b"a123"), None); 132 | Ok(()) 133 | } 134 | -------------------------------------------------------------------------------- /tests/dfa/mod.rs: -------------------------------------------------------------------------------- 1 | mod api; 2 | mod suite; 3 | -------------------------------------------------------------------------------- /tests/hybrid/api.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | 3 | use regex_automata::{ 4 | hybrid::{ 5 | dfa::{self, DFA}, 6 | regex::Regex, 7 | OverlappingState, 8 | }, 9 | nfa::thompson, 10 | HalfMatch, MatchError, MatchKind, MultiMatch, 11 | }; 12 | 13 | use crate::util::{BunkPrefilter, SubstringPrefilter}; 14 | 15 | // Tests that too many cache resets cause the lazy DFA to quit. 16 | // 17 | // We only test this on 64-bit because the test is gingerly crafted based on 18 | // implementation details of cache sizes. It's not a great test because of 19 | // that, but it does check some interesting properties around how positions are 20 | // reported when a search "gives up." 21 | #[test] 22 | #[cfg(target_pointer_width = "64")] 23 | fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> { 24 | // This is a carefully chosen regex. The idea is to pick one that requires 25 | // some decent number of states (hence the bounded repetition). But we 26 | // specifically choose to create a class with an ASCII letter and a 27 | // non-ASCII letter so that we can check that no new states are created 28 | // once the cache is full. Namely, if we fill up the cache on a haystack 29 | // of 'a's, then in order to match one 'β', a new state will need to be 30 | // created since a 'β' is encoded with multiple bytes. Since there's no 31 | // room for this state, the search should quit at the very first position. 32 | let pattern = r"[aβ]{100}"; 33 | let dfa = DFA::builder() 34 | .configure( 35 | // Configure it so that we have the minimum cache capacity 36 | // possible. And that if any resets occur, the search quits.
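// (A capacity of 0 is below the normal minimum, hence the explicit
// skip of the capacity check, and a clear count of Some(0) turns
// the first needed cache clearing into a 'GaveUp' error.)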
37 | DFA::config() 38 | .skip_cache_capacity_check(true) 39 | .cache_capacity(0) 40 | .minimum_cache_clear_count(Some(0)), 41 | ) 42 | .build(pattern)?; 43 | let mut cache = dfa.create_cache(); 44 | 45 | let haystack = "a".repeat(101).into_bytes(); 46 | let err = MatchError::GaveUp { offset: 25 }; 47 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone())); 48 | assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone())); 49 | assert_eq!( 50 | dfa.find_overlapping_fwd( 51 | &mut cache, 52 | &haystack, 53 | &mut OverlappingState::start() 54 | ), 55 | Err(err.clone()) 56 | ); 57 | 58 | let haystack = "β".repeat(101).into_bytes(); 59 | let err = MatchError::GaveUp { offset: 0 }; 60 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 61 | // no need to test that other find routines quit, since we did that above 62 | 63 | // OK, if we reset the cache, then we should be able to create more states 64 | // and make more progress with searching for betas. 65 | cache.reset(&dfa); 66 | let err = MatchError::GaveUp { offset: 26 }; 67 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 68 | 69 | // ... switching back to ASCII still makes progress since it just needs to 70 | // set transitions on existing states! 71 | let haystack = "a".repeat(101).into_bytes(); 72 | let err = MatchError::GaveUp { offset: 13 }; 73 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 74 | 75 | Ok(()) 76 | } 77 | 78 | // Tests that quit bytes in the forward direction work correctly. 79 | #[test] 80 | fn quit_fwd() -> Result<(), Box<dyn Error>> { 81 | let dfa = DFA::builder() 82 | .configure(DFA::config().quit(b'x', true)) 83 | .build("[[:word:]]+$")?; 84 | let mut cache = dfa.create_cache(); 85 | 86 | assert_eq!( 87 | dfa.find_earliest_fwd(&mut cache, b"abcxyz"), 88 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 89 | ); 90 | assert_eq!( 91 | dfa.find_leftmost_fwd(&mut cache, b"abcxyz"), 92 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 93 | ); 94 | assert_eq!( 95 | dfa.find_overlapping_fwd( 96 | &mut cache, 97 | b"abcxyz", 98 | &mut OverlappingState::start() 99 | ), 100 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 101 | ); 102 | 103 | Ok(()) 104 | } 105 | 106 | // Tests that quit bytes in the reverse direction work correctly. 107 | #[test] 108 | fn quit_rev() -> Result<(), Box<dyn Error>> { 109 | let dfa = DFA::builder() 110 | .configure(DFA::config().quit(b'x', true)) 111 | .thompson(thompson::Config::new().reverse(true)) 112 | .build("^[[:word:]]+")?; 113 | let mut cache = dfa.create_cache(); 114 | 115 | assert_eq!( 116 | dfa.find_earliest_rev(&mut cache, b"abcxyz"), 117 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 118 | ); 119 | assert_eq!( 120 | dfa.find_leftmost_rev(&mut cache, b"abcxyz"), 121 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 122 | ); 123 | 124 | Ok(()) 125 | } 126 | 127 | // Tests that if we heuristically enable Unicode word boundaries but then 128 | // instruct that a non-ASCII byte should NOT be a quit byte, then the builder 129 | // will panic. 130 | #[test] 131 | #[should_panic] 132 | fn quit_panics() { 133 | DFA::config().unicode_word_boundary(true).quit(b'\xFF', false); 134 | } 135 | 136 | // This tests an interesting case where even if the Unicode word boundary option 137 | // is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode 138 | // word boundaries to be enabled.
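// That is, once every non-ASCII byte is a quit byte, the DFA only ever
// needs \b to be correct on ASCII input, which it can guarantee, so the
// stricter Unicode semantics come along for free.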
139 | #[test] 140 | fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> { 141 | let mut config = DFA::config(); 142 | for b in 0x80..=0xFF { 143 | config = config.quit(b, true); 144 | } 145 | let dfa = DFA::builder().configure(config).build(r"\b")?; 146 | let mut cache = dfa.create_cache(); 147 | let expected = HalfMatch::must(0, 1); 148 | assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected))); 149 | Ok(()) 150 | } 151 | 152 | // Tests that we can provide a prefilter to a Regex, and the search reports 153 | // correct results. 154 | #[test] 155 | fn prefilter_works() -> Result<(), Box<dyn Error>> { 156 | let mut re = Regex::new(r"a[0-9]+").unwrap(); 157 | re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); 158 | let mut cache = re.create_cache(); 159 | 160 | let text = b"foo abc foo a1a2a3 foo a123 bar aa456"; 161 | let matches: Vec<(usize, usize)> = re 162 | .find_leftmost_iter(&mut cache, text) 163 | .map(|m| (m.start(), m.end())) 164 | .collect(); 165 | assert_eq!( 166 | matches, 167 | vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] 168 | ); 169 | Ok(()) 170 | } 171 | 172 | // This test confirms that a prefilter is active by using a prefilter that 173 | // reports false negatives. 174 | #[test] 175 | fn prefilter_is_active() -> Result<(), Box<dyn Error>> { 176 | let text = b"za123"; 177 | let mut re = Regex::new(r"a[0-9]+").unwrap(); 178 | let mut cache = re.create_cache(); 179 | 180 | re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); 181 | assert_eq!( 182 | re.find_leftmost(&mut cache, b"za123"), 183 | Some(MultiMatch::must(0, 1, 5)) 184 | ); 185 | assert_eq!( 186 | re.find_leftmost(&mut cache, b"a123"), 187 | Some(MultiMatch::must(0, 0, 4)) 188 | ); 189 | re.set_prefilter(Some(Box::new(BunkPrefilter::new()))); 190 | assert_eq!(re.find_leftmost(&mut cache, b"za123"), None); 191 | // This checks that the prefilter is used when first starting the search, 192 | // instead of waiting until at least one transition has occurred. 193 | assert_eq!(re.find_leftmost(&mut cache, b"a123"), None); 194 | Ok(()) 195 | } 196 | -------------------------------------------------------------------------------- /tests/hybrid/mod.rs: -------------------------------------------------------------------------------- 1 | mod api; 2 | mod suite; 3 | -------------------------------------------------------------------------------- /tests/hybrid/suite.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{ 2 | hybrid::{ 3 | dfa::DFA, 4 | regex::{self, Regex}, 5 | }, 6 | nfa::thompson, 7 | MatchKind, SyntaxConfig, 8 | }; 9 | use regex_syntax as syntax; 10 | 11 | use regex_test::{ 12 | bstr::{BString, ByteSlice}, 13 | CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests, 14 | SearchKind as TestSearchKind, TestResult, TestRunner, 15 | }; 16 | 17 | use crate::{suite, Result}; 18 | 19 | /// Tests the default configuration of the hybrid NFA/DFA. 20 | #[test] 21 | fn default() -> Result<()> { 22 | let builder = Regex::builder(); 23 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 24 | Ok(()) 25 | } 26 | 27 | /// Tests the hybrid NFA/DFA with NFA shrinking disabled. 28 | /// 29 | /// This is actually the typical configuration one wants for a lazy DFA. NFA 30 | /// shrinking is mostly only advantageous when building a full DFA since it 31 | /// can sharply decrease the amount of time determinization takes. But NFA 32 | /// shrinking is itself otherwise fairly expensive.
Since a lazy DFA has 33 | /// no compilation time (other than for building the NFA of course) before 34 | /// executing a search, it's usually worth it to forgo NFA shrinking. 35 | #[test] 36 | fn no_nfa_shrink() -> Result<()> { 37 | let mut builder = Regex::builder(); 38 | builder.thompson(thompson::Config::new().shrink(false)); 39 | TestRunner::new()? 40 | // Without NFA shrinking, this test blows the default cache capacity. 41 | .blacklist("expensive/regression-many-repeat-no-stack-overflow") 42 | .test_iter(suite()?.iter(), compiler(builder)) 43 | .assert(); 44 | Ok(()) 45 | } 46 | 47 | /// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled. 48 | #[test] 49 | fn starts_for_each_pattern() -> Result<()> { 50 | let mut builder = Regex::builder(); 51 | builder.dfa(DFA::config().starts_for_each_pattern(true)); 52 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 53 | Ok(()) 54 | } 55 | 56 | /// Tests the hybrid NFA/DFA when byte classes are disabled. 57 | /// 58 | /// N.B. Disabling byte classes doesn't avoid any indirection at search time. 59 | /// All it does is cause every byte value to be its own distinct equivalence 60 | /// class. 61 | #[test] 62 | fn no_byte_classes() -> Result<()> { 63 | let mut builder = Regex::builder(); 64 | builder.dfa(DFA::config().byte_classes(false)); 65 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 66 | Ok(()) 67 | } 68 | 69 | /// Tests that the hybrid NFA/DFA never clears its cache for any test with the 70 | /// default capacity. 71 | /// 72 | /// N.B. If a regex suite test is added that causes the cache to be cleared, 73 | /// then this should just skip that test. (Which can be done by calling the 74 | /// 'blacklist' method on 'TestRunner'.) 75 | #[test] 76 | fn no_cache_clearing() -> Result<()> { 77 | let mut builder = Regex::builder(); 78 | builder.dfa(DFA::config().minimum_cache_clear_count(Some(0))); 79 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 80 | Ok(()) 81 | } 82 | 83 | /// Tests the hybrid NFA/DFA when the minimum cache capacity is set. 84 | #[test] 85 | fn min_cache_capacity() -> Result<()> { 86 | let mut builder = Regex::builder(); 87 | builder 88 | .dfa(DFA::config().cache_capacity(0).skip_cache_capacity_check(true)); 89 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 90 | Ok(()) 91 | } 92 | 93 | fn compiler( 94 | mut builder: regex::Builder, 95 | ) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> { 96 | move |test, regexes| { 97 | let regexes = regexes 98 | .iter() 99 | .map(|r| r.to_str().map(|s| s.to_string())) 100 | .collect::<std::result::Result<Vec<String>, _>>()?; 101 | 102 | // Check if our regex contains things that aren't supported by DFAs. 103 | // That is, Unicode word boundaries when searching non-ASCII text.
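// (Such a search would report an error instead of a match, so the
// harness builds the NFA up front and skips the test entirely.)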
104 | let mut thompson = thompson::Builder::new(); 105 | thompson.syntax(config_syntax(test)).configure(config_thompson(test)); 106 | if let Ok(nfa) = thompson.build_many(&regexes) { 107 | let non_ascii = test.input().iter().any(|&b| !b.is_ascii()); 108 | if nfa.has_word_boundary_unicode() && non_ascii { 109 | return Ok(CompiledRegex::skip()); 110 | } 111 | } 112 | if !configure_regex_builder(test, &mut builder) { 113 | return Ok(CompiledRegex::skip()); 114 | } 115 | let re = builder.build_many(&regexes)?; 116 | let mut cache = re.create_cache(); 117 | Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> { 118 | run_test(&re, &mut cache, test) 119 | })) 120 | } 121 | } 122 | 123 | fn run_test( 124 | re: &Regex, 125 | cache: &mut regex::Cache, 126 | test: &RegexTest, 127 | ) -> Vec<TestResult> { 128 | let is_match = if re.is_match(cache, test.input()) { 129 | TestResult::matched() 130 | } else { 131 | TestResult::no_match() 132 | }; 133 | let is_match = is_match.name("is_match"); 134 | 135 | let find_matches = match test.search_kind() { 136 | TestSearchKind::Earliest => { 137 | let it = re 138 | .find_earliest_iter(cache, test.input()) 139 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 140 | .map(|m| Match { 141 | id: m.pattern().as_usize(), 142 | start: m.start(), 143 | end: m.end(), 144 | }); 145 | TestResult::matches(it).name("find_earliest_iter") 146 | } 147 | TestSearchKind::Leftmost => { 148 | let it = re 149 | .find_leftmost_iter(cache, test.input()) 150 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 151 | .map(|m| Match { 152 | id: m.pattern().as_usize(), 153 | start: m.start(), 154 | end: m.end(), 155 | }); 156 | TestResult::matches(it).name("find_leftmost_iter") 157 | } 158 | TestSearchKind::Overlapping => { 159 | let it = re 160 | .find_overlapping_iter(cache, test.input()) 161 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 162 | .map(|m| Match { 163 | id: m.pattern().as_usize(), 164 | start: m.start(), 165 | end: m.end(), 166 | }); 167 | TestResult::matches(it).name("find_overlapping_iter") 168 | } 169 | }; 170 | vec![is_match, find_matches] 171 | } 172 | 173 | /// Configures the given regex builder with all relevant settings on the given 174 | /// regex test. 175 | /// 176 | /// If the regex test has a setting that is unsupported, then this returns 177 | /// false (implying the test should be skipped). 178 | fn configure_regex_builder( 179 | test: &RegexTest, 180 | builder: &mut regex::Builder, 181 | ) -> bool { 182 | let match_kind = match test.match_kind() { 183 | TestMatchKind::All => MatchKind::All, 184 | TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst, 185 | TestMatchKind::LeftmostLongest => return false, 186 | }; 187 | 188 | let dense_config = DFA::config() 189 | .anchored(test.anchored()) 190 | .match_kind(match_kind) 191 | .unicode_word_boundary(true); 192 | let regex_config = Regex::config().utf8(test.utf8()); 193 | builder 194 | .configure(regex_config) 195 | .syntax(config_syntax(test)) 196 | .thompson(config_thompson(test)) 197 | .dfa(dense_config); 198 | true 199 | } 200 | 201 | /// Configuration of a Thompson NFA compiler from a regex test. 202 | fn config_thompson(test: &RegexTest) -> thompson::Config { 203 | thompson::Config::new().utf8(test.utf8()) 204 | } 205 | 206 | /// Configuration of the regex parser from a regex test.
207 | fn config_syntax(test: &RegexTest) -> SyntaxConfig { 208 | SyntaxConfig::new() 209 | .case_insensitive(test.case_insensitive()) 210 | .unicode(test.unicode()) 211 | .utf8(test.utf8()) 212 | } 213 | -------------------------------------------------------------------------------- /tests/nfa/mod.rs: -------------------------------------------------------------------------------- 1 | mod thompson; 2 | -------------------------------------------------------------------------------- /tests/nfa/thompson/mod.rs: -------------------------------------------------------------------------------- 1 | mod pikevm; 2 | -------------------------------------------------------------------------------- /tests/nfa/thompson/pikevm/api.rs: -------------------------------------------------------------------------------- 1 | /* 2 | use std::error::Error; 3 | 4 | use regex_automata::{ 5 | hybrid::{ 6 | dfa::{self, DFA}, 7 | regex::Regex, 8 | OverlappingState, 9 | }, 10 | nfa::thompson, 11 | HalfMatch, MatchError, MatchKind, MultiMatch, 12 | }; 13 | 14 | use crate::util::{BunkPrefilter, SubstringPrefilter}; 15 | 16 | // Tests that too many cache resets cause the lazy DFA to quit. 17 | #[test] 18 | fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> { 19 | // This is a carefully chosen regex. The idea is to pick one that requires 20 | // some decent number of states (hence the bounded repetition). But we 21 | // specifically choose to create a class with an ASCII letter and a 22 | // non-ASCII letter so that we can check that no new states are created 23 | // once the cache is full. Namely, if we fill up the cache on a haystack 24 | // of 'a's, then in order to match one 'β', a new state will need to be 25 | // created since a 'β' is encoded with multiple bytes. Since there's no 26 | // room for this state, the search should quit at the very first position. 27 | let pattern = r"[aβ]{100}"; 28 | let dfa = DFA::builder() 29 | .configure( 30 | // Configure it so that we have the minimum cache capacity 31 | // possible. And that if any resets occur, the search quits. 32 | DFA::config() 33 | .skip_cache_capacity_check(true) 34 | .cache_capacity(0) 35 | .minimum_cache_clear_count(Some(0)), 36 | ) 37 | .build(pattern)?; 38 | let mut cache = dfa.create_cache(); 39 | 40 | let haystack = "a".repeat(101).into_bytes(); 41 | let err = MatchError::GaveUp { offset: 25 }; 42 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone())); 43 | assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone())); 44 | assert_eq!( 45 | dfa.find_overlapping_fwd( 46 | &mut cache, 47 | &haystack, 48 | &mut OverlappingState::start() 49 | ), 50 | Err(err.clone()) 51 | ); 52 | 53 | let haystack = "β".repeat(101).into_bytes(); 54 | let err = MatchError::GaveUp { offset: 0 }; 55 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 56 | // no need to test that other find routines quit, since we did that above 57 | 58 | // OK, if we reset the cache, then we should be able to create more states 59 | // and make more progress with searching for betas. 60 | cache.reset(&dfa); 61 | let err = MatchError::GaveUp { offset: 26 }; 62 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 63 | 64 | // ... switching back to ASCII still makes progress since it just needs to 65 | // set transitions on existing states!
66 | let haystack = "a".repeat(101).into_bytes(); 67 | let err = MatchError::GaveUp { offset: 13 }; 68 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 69 | 70 | Ok(()) 71 | } 72 | 73 | // Tests that quit bytes in the forward direction work correctly. 74 | #[test] 75 | fn quit_fwd() -> Result<(), Box<dyn Error>> { 76 | let dfa = DFA::builder() 77 | .configure(DFA::config().quit(b'x', true)) 78 | .build("[[:word:]]+$")?; 79 | let mut cache = dfa.create_cache(); 80 | 81 | assert_eq!( 82 | dfa.find_earliest_fwd(&mut cache, b"abcxyz"), 83 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 84 | ); 85 | assert_eq!( 86 | dfa.find_leftmost_fwd(&mut cache, b"abcxyz"), 87 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 88 | ); 89 | assert_eq!( 90 | dfa.find_overlapping_fwd( 91 | &mut cache, 92 | b"abcxyz", 93 | &mut OverlappingState::start() 94 | ), 95 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 96 | ); 97 | 98 | Ok(()) 99 | } 100 | 101 | // Tests that quit bytes in the reverse direction work correctly. 102 | #[test] 103 | fn quit_rev() -> Result<(), Box<dyn Error>> { 104 | let dfa = DFA::builder() 105 | .configure(DFA::config().quit(b'x', true)) 106 | .thompson(thompson::Config::new().reverse(true)) 107 | .build("^[[:word:]]+")?; 108 | let mut cache = dfa.create_cache(); 109 | 110 | assert_eq!( 111 | dfa.find_earliest_rev(&mut cache, b"abcxyz"), 112 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 113 | ); 114 | assert_eq!( 115 | dfa.find_leftmost_rev(&mut cache, b"abcxyz"), 116 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 117 | ); 118 | 119 | Ok(()) 120 | } 121 | 122 | // Tests that if we heuristically enable Unicode word boundaries but then 123 | // instruct that a non-ASCII byte should NOT be a quit byte, then the builder 124 | // will panic. 125 | #[test] 126 | #[should_panic] 127 | fn quit_panics() { 128 | DFA::config().unicode_word_boundary(true).quit(b'\xFF', false); 129 | } 130 | 131 | // This tests an interesting case where even if the Unicode word boundary option 132 | // is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode 133 | // word boundaries to be enabled. 134 | #[test] 135 | fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> { 136 | let mut config = DFA::config(); 137 | for b in 0x80..=0xFF { 138 | config = config.quit(b, true); 139 | } 140 | let dfa = DFA::builder().configure(config).build(r"\b")?; 141 | let mut cache = dfa.create_cache(); 142 | let expected = HalfMatch::must(0, 1); 143 | assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected))); 144 | Ok(()) 145 | } 146 | 147 | // Tests that we can provide a prefilter to a Regex, and the search reports 148 | // correct results. 149 | #[test] 150 | fn prefilter_works() -> Result<(), Box<dyn Error>> { 151 | let mut re = Regex::new(r"a[0-9]+").unwrap(); 152 | re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); 153 | let mut cache = re.create_cache(); 154 | 155 | let text = b"foo abc foo a1a2a3 foo a123 bar aa456"; 156 | let matches: Vec<(usize, usize)> = re 157 | .find_leftmost_iter(&mut cache, text) 158 | .map(|m| (m.start(), m.end())) 159 | .collect(); 160 | assert_eq!( 161 | matches, 162 | vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] 163 | ); 164 | Ok(()) 165 | } 166 | 167 | // This test confirms that a prefilter is active by using a prefilter that 168 | // reports false negatives.
169 | #[test] 170 | fn prefilter_is_active() -> Result<(), Box<dyn Error>> { 171 | let text = b"za123"; 172 | let mut re = Regex::new(r"a[0-9]+").unwrap(); 173 | let mut cache = re.create_cache(); 174 | 175 | re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); 176 | assert_eq!( 177 | re.find_leftmost(&mut cache, b"za123"), 178 | Some(MultiMatch::must(0, 1, 5)) 179 | ); 180 | assert_eq!( 181 | re.find_leftmost(&mut cache, b"a123"), 182 | Some(MultiMatch::must(0, 0, 4)) 183 | ); 184 | re.set_prefilter(Some(Box::new(BunkPrefilter::new()))); 185 | assert_eq!(re.find_leftmost(&mut cache, b"za123"), None); 186 | // This checks that the prefilter is used when first starting the search, 187 | // instead of waiting until at least one transition has occurred. 188 | assert_eq!(re.find_leftmost(&mut cache, b"a123"), None); 189 | Ok(()) 190 | } 191 | */ 192 | -------------------------------------------------------------------------------- /tests/nfa/thompson/pikevm/mod.rs: -------------------------------------------------------------------------------- 1 | mod api; 2 | mod suite; 3 | -------------------------------------------------------------------------------- /tests/nfa/thompson/pikevm/suite.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{ 2 | nfa::thompson::{ 3 | self, 4 | pikevm::{self, PikeVM}, 5 | }, 6 | MatchKind, SyntaxConfig, 7 | }; 8 | use regex_syntax as syntax; 9 | 10 | use regex_test::{ 11 | bstr::{BString, ByteSlice}, 12 | CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests, 13 | SearchKind as TestSearchKind, TestResult, TestRunner, 14 | }; 15 | 16 | use crate::{suite, Result}; 17 | 18 | /// Tests the default configuration of the PikeVM. 19 | #[test] 20 | fn default() -> Result<()> { 21 | let builder = PikeVM::builder(); 22 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 23 | Ok(()) 24 | } 25 | 26 | fn compiler( 27 | mut builder: pikevm::Builder, 28 | ) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> { 29 | move |test, regexes| { 30 | let regexes = regexes 31 | .iter() 32 | .map(|r| r.to_str().map(|s| s.to_string())) 33 | .collect::<std::result::Result<Vec<String>, _>>()?; 34 | if !configure_pikevm_builder(test, &mut builder) { 35 | return Ok(CompiledRegex::skip()); 36 | } 37 | let re = builder.build_many(&regexes)?; 38 | let mut cache = re.create_cache(); 39 | Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> { 40 | run_test(&re, &mut cache, test) 41 | })) 42 | } 43 | } 44 | 45 | fn run_test( 46 | re: &PikeVM, 47 | cache: &mut pikevm::Cache, 48 | test: &RegexTest, 49 | ) -> Vec<TestResult> { 50 | // let is_match = if re.is_match(cache, test.input()) { 51 | // TestResult::matched() 52 | // } else { 53 | // TestResult::no_match() 54 | // }; 55 | // let is_match = is_match.name("is_match"); 56 | 57 | let find_matches = match test.search_kind() { 58 | TestSearchKind::Earliest => { 59 | TestResult::skip().name("find_earliest_iter") 60 | } 61 | TestSearchKind::Leftmost => { 62 | let it = re 63 | .find_leftmost_iter(cache, test.input()) 64 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 65 | .map(|m| Match { 66 | id: m.pattern().as_usize(), 67 | start: m.start(), 68 | end: m.end(), 69 | }); 70 | TestResult::matches(it).name("find_leftmost_iter") 71 | } 72 | TestSearchKind::Overlapping => { 73 | TestResult::skip().name("find_overlapping_iter") 74 | } 75 | }; 76 | // vec![is_match, find_matches] 77 | vec![find_matches] 78 | } 79 | 80 | /// Configures the given regex builder with all relevant settings on the given
/// regex test. 82 | /// 83 | /// If the regex test has a setting that is unsupported, then this returns 84 | /// false (implying the test should be skipped). 85 | fn configure_pikevm_builder( 86 | test: &RegexTest, 87 | builder: &mut pikevm::Builder, 88 | ) -> bool { 89 | let pikevm_config = 90 | PikeVM::config().anchored(test.anchored()).utf8(test.utf8()); 91 | builder 92 | .configure(pikevm_config) 93 | .syntax(config_syntax(test)) 94 | .thompson(config_thompson(test)); 95 | true 96 | } 97 | 98 | /// Configuration of a Thompson NFA compiler from a regex test. 99 | fn config_thompson(test: &RegexTest) -> thompson::Config { 100 | thompson::Config::new().utf8(test.utf8()) 101 | } 102 | 103 | /// Configuration of the regex parser from a regex test. 104 | fn config_syntax(test: &RegexTest) -> SyntaxConfig { 105 | SyntaxConfig::new() 106 | .case_insensitive(test.case_insensitive()) 107 | .unicode(test.unicode()) 108 | .utf8(test.utf8()) 109 | } 110 | -------------------------------------------------------------------------------- /tests/regression.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{ 2 | dfa::{dense, Automaton}, 3 | MatchError, 4 | }; 5 | 6 | // A regression test for checking that minimization correctly translates 7 | // whether a state is a match state or not. Previously, it was possible for 8 | // minimization to mark a non-matching state as matching. 9 | #[test] 10 | fn minimize_sets_correct_match_states() { 11 | let pattern = 12 | // This is a subset of the grapheme matching regex. I couldn't seem 13 | // to get a repro any smaller than this unfortunately. 14 | r"(?x) 15 | (?: 16 | \p{gcb=Prepend}* 17 | (?: 18 | (?: 19 | (?: 20 | \p{gcb=L}* 21 | (?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT}) 22 | \p{gcb=T}* 23 | ) 24 | | 25 | \p{gcb=L}+ 26 | | 27 | \p{gcb=T}+ 28 | ) 29 | | 30 | \p{Extended_Pictographic} 31 | (?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})* 32 | | 33 | [^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}] 34 | ) 35 | [\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]* 36 | ) 37 | "; 38 | 39 | let dfa = dense::Builder::new() 40 | .configure(dense::Config::new().anchored(true).minimize(true)) 41 | .build(pattern) 42 | .unwrap(); 43 | assert_eq!(Ok(None), dfa.find_leftmost_fwd(b"\xE2")); 44 | } 45 | -------------------------------------------------------------------------------- /tests/tests.rs: -------------------------------------------------------------------------------- 1 | #![allow(warnings)] 2 | 3 | use regex_test::RegexTests; 4 | 5 | mod dfa; 6 | mod hybrid; 7 | mod nfa; 8 | mod regression; 9 | mod util; 10 | 11 | type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>; 12 | 13 | fn suite() -> Result<RegexTests> { 14 | let mut tests = RegexTests::new(); 15 | macro_rules!
load { 16 | ($name:expr) => {{ 17 | const DATA: &[u8] = 18 | include_bytes!(concat!("data/", $name, ".toml")); 19 | tests.load_slice($name, DATA)?; 20 | }}; 21 | } 22 | 23 | load!("bytes"); 24 | load!("crazy"); 25 | load!("earliest"); 26 | load!("empty"); 27 | load!("expensive"); 28 | load!("flags"); 29 | load!("iter"); 30 | load!("misc"); 31 | load!("multiline"); 32 | load!("no-unicode"); 33 | load!("overlapping"); 34 | load!("regression"); 35 | load!("set"); 36 | load!("unicode"); 37 | load!("word-boundary"); 38 | load!("fowler/basic"); 39 | load!("fowler/nullsubexpr"); 40 | load!("fowler/repetition"); 41 | load!("fowler/repetition-expensive"); 42 | 43 | Ok(tests) 44 | } 45 | -------------------------------------------------------------------------------- /tests/util.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::util::prefilter::{self, Candidate, Prefilter}; 2 | 3 | #[derive(Clone, Debug)] 4 | pub struct SubstringPrefilter(bstr::Finder<'static>); 5 | 6 | impl SubstringPrefilter { 7 | pub fn new>(needle: B) -> SubstringPrefilter { 8 | SubstringPrefilter(bstr::Finder::new(needle.as_ref()).into_owned()) 9 | } 10 | } 11 | 12 | impl Prefilter for SubstringPrefilter { 13 | #[inline] 14 | fn next_candidate( 15 | &self, 16 | state: &mut prefilter::State, 17 | haystack: &[u8], 18 | at: usize, 19 | ) -> Candidate { 20 | self.0 21 | .find(&haystack[at..]) 22 | .map(|i| Candidate::PossibleStartOfMatch(at + i)) 23 | .unwrap_or(Candidate::None) 24 | } 25 | 26 | fn heap_bytes(&self) -> usize { 27 | self.0.needle().len() 28 | } 29 | } 30 | 31 | /// A prefilter that always returns `Candidate::None`, even if it's a false 32 | /// negative. This is useful for confirming that a prefilter is actually 33 | /// active by asserting an incorrect result. 34 | #[derive(Clone, Debug)] 35 | pub struct BunkPrefilter(()); 36 | 37 | impl BunkPrefilter { 38 | pub fn new() -> BunkPrefilter { 39 | BunkPrefilter(()) 40 | } 41 | } 42 | 43 | impl Prefilter for BunkPrefilter { 44 | #[inline] 45 | fn next_candidate( 46 | &self, 47 | _state: &mut prefilter::State, 48 | _haystack: &[u8], 49 | _at: usize, 50 | ) -> Candidate { 51 | Candidate::None 52 | } 53 | 54 | fn heap_bytes(&self) -> usize { 55 | 0 56 | } 57 | } 58 | --------------------------------------------------------------------------------