├── .github └── workflows │ └── ci.yml ├── .gitignore ├── COPYING ├── Cargo.toml ├── LICENSE-MIT ├── PLANS.md ├── README.md ├── TODO ├── UNLICENSE ├── bench ├── .gitignore ├── Cargo.toml ├── data │ ├── opensubtitles2018-en-huge-ascii.txt │ ├── opensubtitles2018-en-small-ascii.txt │ ├── opensubtitles2018-en-tiny-ascii.txt │ ├── opensubtitles2018-ru-huge-utf8.txt │ ├── opensubtitles2018-ru-small-utf8.txt │ ├── opensubtitles2018-ru-tiny-utf8.txt │ ├── opensubtitles2018-zh-huge-utf8.txt │ ├── opensubtitles2018-zh-small-utf8.txt │ ├── opensubtitles2018-zh-tiny-utf8.txt │ ├── sherlock-holmes-huge.txt │ ├── sherlock-holmes-small.txt │ └── sherlock-holmes-tiny.txt └── src │ ├── bench.rs │ ├── inputs.rs │ └── lib.rs ├── examples ├── Cargo.toml └── fst.rs ├── regex-cli ├── Cargo.toml └── src │ ├── app.rs │ ├── cmd │ ├── debug.rs │ ├── find.rs │ └── mod.rs │ ├── config.rs │ ├── escape.rs │ ├── main.rs │ └── util.rs ├── regex-test ├── COPYING ├── Cargo.toml ├── LICENSE-MIT ├── UNLICENSE └── src │ ├── escape.rs │ └── lib.rs ├── rustfmt.toml ├── scripts ├── fowler-to-toml └── generate-fowler-tests ├── src ├── dfa │ ├── accel.rs │ ├── automaton.rs │ ├── dense.rs │ ├── determinize.rs │ ├── error.rs │ ├── minimize.rs │ ├── mod.rs │ ├── regex.rs │ ├── search.rs │ ├── search_unsafe.rs │ ├── sparse.rs │ ├── special.rs │ └── transducer.rs ├── hybrid │ ├── dfa.rs │ ├── error.rs │ ├── id.rs │ ├── mod.rs │ ├── regex.rs │ └── search.rs ├── lib.rs ├── macros.rs ├── nfa │ ├── mod.rs │ └── thompson │ │ ├── compiler.rs │ │ ├── error.rs │ │ ├── map.rs │ │ ├── mod.rs │ │ ├── pikevm.rs │ │ └── range_trie.rs └── util │ ├── alphabet.rs │ ├── bytes.rs │ ├── determinize │ ├── mod.rs │ └── state.rs │ ├── id.rs │ ├── lazy.rs │ ├── matchtypes.rs │ ├── mod.rs │ ├── prefilter.rs │ ├── sparse_set.rs │ ├── start.rs │ └── syntax.rs └── tests ├── data ├── bytes.toml ├── crazy.toml ├── earliest.toml ├── empty.toml ├── expensive.toml ├── flags.toml ├── fowler │ ├── basic.toml │ ├── dat │ │ ├── README │ │ ├── basic.dat │ │ ├── nullsubexpr.dat │ │ ├── repetition-expensive.dat │ │ └── repetition.dat │ ├── nullsubexpr.toml │ ├── repetition-expensive.toml │ ├── repetition-long.toml │ └── repetition.toml ├── iter.toml ├── misc.toml ├── multiline.toml ├── no-unicode.toml ├── overlapping.toml ├── regression.toml ├── set.toml ├── unicode.toml └── word-boundary.toml ├── dfa ├── api.rs ├── mod.rs └── suite.rs ├── hybrid ├── api.rs ├── mod.rs └── suite.rs ├── nfa ├── mod.rs └── thompson │ ├── mod.rs │ └── pikevm │ ├── api.rs │ ├── mod.rs │ └── suite.rs ├── regression.rs ├── tests.rs └── util.rs /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - master 7 | schedule: 8 | - cron: '00 01 * * *' 9 | jobs: 10 | test: 11 | name: test 12 | env: 13 | # For some builds, we use cross to test on 32-bit and big-endian 14 | # systems. 15 | CARGO: cargo 16 | # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`. 
17 | TARGET: 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | matrix: 21 | build: 22 | - pinned 23 | - stable 24 | - stable-32 25 | - stable-mips 26 | - stable-thumb 27 | - beta 28 | - nightly 29 | - macos 30 | - win-msvc 31 | - win-gnu 32 | include: 33 | - build: pinned 34 | os: ubuntu-18.04 35 | rust: 1.41.1 36 | - build: stable 37 | os: ubuntu-18.04 38 | rust: stable 39 | - build: stable-32 40 | os: ubuntu-18.04 41 | rust: stable 42 | target: i686-unknown-linux-gnu 43 | - build: stable-mips 44 | os: ubuntu-18.04 45 | rust: stable 46 | target: mips64-unknown-linux-gnuabi64 47 | - build: stable-thumb 48 | os: ubuntu-18.04 49 | rust: stable 50 | target: thumbv7em-none-eabihf 51 | - build: beta 52 | os: ubuntu-18.04 53 | rust: beta 54 | - build: nightly 55 | os: ubuntu-18.04 56 | rust: nightly 57 | - build: macos 58 | os: macos-latest 59 | rust: stable 60 | - build: win-msvc 61 | os: windows-2019 62 | rust: stable 63 | - build: win-gnu 64 | os: windows-2019 65 | rust: stable-x86_64-gnu 66 | steps: 67 | - name: Checkout repository 68 | uses: actions/checkout@v1 69 | with: 70 | fetch-depth: 1 71 | 72 | - name: Install Rust 73 | uses: actions-rs/toolchain@v1 74 | with: 75 | toolchain: ${{ matrix.rust }} 76 | profile: minimal 77 | override: true 78 | 79 | - name: Use Cross 80 | if: matrix.target != '' 81 | run: | 82 | cargo install cross 83 | echo "CARGO=cross" >> $GITHUB_ENV 84 | echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV 85 | 86 | - name: Show command used for Cargo 87 | run: | 88 | echo "cargo command is: ${{ env.CARGO }}" 89 | echo "target flag is: ${{ env.TARGET }}" 90 | 91 | - name: Build 92 | if: matrix.build != 'stable-thumb' 93 | run: ${{ env.CARGO }} build --verbose ${{ env.TARGET }} 94 | 95 | - name: Build docs 96 | if: matrix.build != 'stable-thumb' 97 | run: ${{ env.CARGO }} doc --verbose ${{ env.TARGET }} 98 | 99 | # Our dev dependencies are increasing their MSRV more quickly than we want 100 | # to, so the following are only run on non-pinned targets. 101 | 102 | - name: Build examples 103 | if: matrix.build != 'pinned' && matrix.build != 'stable-thumb' 104 | run: ${{ env.CARGO }} build --manifest-path examples/Cargo.toml --examples 105 | 106 | - name: Run tests 107 | if: matrix.build != 'pinned' && matrix.build != 'stable-thumb' && matrix.build != 'stable-mips' 108 | run: ${{ env.CARGO }} test --verbose --features transducer ${{ env.TARGET }} 109 | 110 | # The mips test runner is quite sluggish, so don't run the full test 111 | # suite there. Unfortunate, but CI times balloon otherwise.
112 | - name: Run tests 113 | if: matrix.build == 'stable-mips' 114 | run: ${{ env.CARGO }} test --verbose --features transducer --lib ${{ env.TARGET }} 115 | 116 | - name: Build without default features 117 | if: matrix.build != 'pinned' 118 | run: ${{ env.CARGO }} build --verbose --no-default-features ${{ env.TARGET }} 119 | 120 | - name: Build docs without default features 121 | if: matrix.build != 'pinned' 122 | run: ${{ env.CARGO }} doc --verbose --lib --no-default-features ${{ env.TARGET }} 123 | 124 | - name: Run tests without default features 125 | if: matrix.build != 'pinned' && matrix.build != 'stable-thumb' 126 | run: ${{ env.CARGO }} test --verbose --lib --no-default-features ${{ env.TARGET }} 127 | 128 | - name: Compile debug tool 129 | if: matrix.build != 'pinned' && matrix.build != 'stable-thumb' 130 | run: ${{ env.CARGO }} build --verbose --manifest-path regex-cli/Cargo.toml ${{ env.TARGET }} 131 | 132 | - name: Test benchmarks 133 | if: matrix.build != 'pinned' && matrix.build != 'stable-thumb' 134 | run: ${{ env.CARGO }} bench --manifest-path bench/Cargo.toml --verbose ${{ env.TARGET }} -- --test 135 | 136 | rustfmt: 137 | name: rustfmt 138 | runs-on: ubuntu-18.04 139 | steps: 140 | - name: Checkout repository 141 | uses: actions/checkout@v1 142 | with: 143 | fetch-depth: 1 144 | - name: Install Rust 145 | uses: actions-rs/toolchain@v1 146 | with: 147 | toolchain: stable 148 | override: true 149 | profile: minimal 150 | components: rustfmt 151 | - name: Check formatting 152 | run: | 153 | cargo fmt --all -- --check 154 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /tmp 2 | /target 3 | /examples/target 4 | /regex-automata-debug/target 5 | /regex-cli/target 6 | /regex-test/target 7 | tags 8 | /Cargo.lock 9 | /examples/Cargo.lock 10 | BREADCRUMBS 11 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | This project is dual-licensed under the Unlicense and MIT licenses. 2 | 3 | You may use this code under the terms of either license. 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "regex-automata" 3 | version = "0.2.0" #:version 4 | authors = ["Andrew Gallant <jamslam@gmail.com>"] 5 | description = "Automata construction and matching using regular expressions." 6 | documentation = "https://docs.rs/regex-automata" 7 | homepage = "https://github.com/BurntSushi/regex-automata" 8 | repository = "https://github.com/BurntSushi/regex-automata" 9 | readme = "README.md" 10 | keywords = ["regex", "dfa", "automata", "automaton", "nfa"] 11 | license = "Unlicense/MIT" 12 | categories = ["text-processing"] 13 | exclude = [ 14 | "/.github", "/scripts/*", "/regex-cli", "/regex-test", 15 | ] 16 | autotests = false 17 | autoexamples = false 18 | edition = "2018" 19 | resolver = "2" 20 | 21 | [workspace] 22 | members = ["bench", "examples", "regex-cli", "regex-test"] 23 | 24 | [lib] 25 | bench = false 26 | 27 | [features] 28 | # WARNING: The features below were assembled quickly without much thought. 29 | # They might not work as you expect. The safest configuration is the default 30 | # configuration.
31 | default = ["std", "alloc", "syntax"] 32 | std = [] 33 | alloc = ["syntax"] 34 | transducer = ["fst"] 35 | logging = ["log"] 36 | syntax = ["regex-syntax"] 37 | 38 | # WARNING: The features below are in a very rough draft form, which is why 39 | # they are all commented out. I'm still working through the crate feature 40 | # design, planned for the regex-automata 0.3 release. 41 | 42 | # TODO: These features need to be fleshed out more, actually implemented and 43 | # then tested. Also, add 'alloc' and 'std' features to regex-syntax before 44 | # doing so. 45 | #default = ["std", "dfa", "syntax", "unicode", "regex-syntax/default"] 46 | #std = ["alloc", "memchr/std"] 47 | # TODO: Should this also imply regex-syntax/alloc? Will that turn into a no-op 48 | # if regex-syntax isn't enabled as a dependency? Do we need a separate 49 | # 'alloc_nosyntax' feature to enable alloc features without bringing in 50 | # regex-syntax? Sigh. 51 | #alloc = [] 52 | #logging = ["log"] 53 | #transducer = ["fst"] 54 | 55 | # When enabled, the 'dfa' sub-module will be available. Note that if 'dfa' is 56 | # enabled but 'alloc' is not, then only DFA deserialization and search will be 57 | # available. DFA construction requires the 'alloc' and 'syntax' features to be 58 | # enabled. 59 | #dfa = [] 60 | #syntax = ["regex-syntax"] 61 | 62 | ## Enables all Unicode features. This expands if new Unicode features are added. 63 | #unicode = [ 64 | # "unicode-age", 65 | # "unicode-bool", 66 | # "unicode-case", 67 | # "unicode-gencat", 68 | # "unicode-perl", 69 | # "unicode-script", 70 | # "unicode-segment", 71 | # "regex-syntax/unicode", 72 | #] 73 | ## Enables use of the `Age` property, e.g., `\p{Age:3.0}`. 74 | #unicode-age = ["regex-syntax/unicode-age"] 75 | ## Enables use of a smattering of boolean properties, e.g., `\p{Emoji}`. 76 | #unicode-bool = ["regex-syntax/unicode-bool"] 77 | ## Enables Unicode-aware case insensitive matching, e.g., `(?i)β`. 78 | #unicode-case = ["regex-syntax/unicode-case"] 79 | ## Enables Unicode general categories, e.g., `\p{Letter}` or `\pL`. 80 | #unicode-gencat = ["regex-syntax/unicode-gencat"] 81 | ## Enables Unicode-aware Perl classes corresponding to `\w`, `\s` and `\d`. 82 | #unicode-perl = ["regex-syntax/unicode-perl"] 83 | ## Enables Unicode scripts and script extensions, e.g., `\p{Greek}`. 84 | #unicode-script = ["regex-syntax/unicode-script"] 85 | ## Enables Unicode segmentation properties, e.g., `\p{gcb=Extend}`. 86 | #unicode-segment = ["regex-syntax/unicode-segment"] 87 | 88 | [dependencies] 89 | fst = { version = "0.4.5", optional = true } 90 | log = { version = "0.4.14", optional = true } 91 | memchr = { version = "2.4.0", default-features = false } 92 | regex-syntax = { version = "0.6.24", optional = true } 93 | 94 | [dev-dependencies] 95 | bstr = { version = "0.2.16", default-features = false, features = ["std"] } 96 | quickcheck = { version = "1.0.3", default-features = false } 97 | regex-syntax = "0.6.16" 98 | regex-test = { version = "*", path = "regex-test" } 99 | 100 | [[test]] 101 | path = "tests/tests.rs" 102 | name = "integration" 103 | 104 | [profile.dev] 105 | # Running tests takes too long in debug mode, so we forcefully always build 106 | # with optimizations. Unfortunate, but, ¯\_(ツ)_/¯. 107 | # 108 | # It's counter-intuitive that this needs to be set on dev *and* test, but 109 | # it's because the tests that take a long time to run are run as integration 110 | # tests in a separate crate. 
The test.opt-level setting won't apply there, so 111 | # we need to set the opt-level across the entire build. 112 | opt-level = 3 113 | debug = true 114 | 115 | [profile.test] 116 | opt-level = 3 117 | debug = true 118 | 119 | [profile.release] 120 | debug = true 121 | 122 | [profile.bench] 123 | debug = true 124 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Andrew Gallant 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /PLANS.md: -------------------------------------------------------------------------------- 1 | pattern_limit should not be defined inside nfa::thompson, but rather at the 2 | top-level. 3 | 4 | ----- 5 | 6 | Main problem right now is exemplified by the set60 and set70 failing tests. In 7 | particular, when finding the starting position while matching multiple regexes 8 | simultaneously, the reverse search is messed up. The reverse search doesn't 9 | depend on which regex matched in the forward direction, which means it won't 10 | always find the correct starting location. Unfortunately, the only way to 11 | fix this, as far as I can tell, is to add a group of start states for every 12 | regex in the DFA. Then once we do the reverse search, we need to choose the 13 | correct start state based on which regex matched in the forward direction. 14 | 15 | This is a nasty change. 16 | 17 | So it looks like this only applies when doing an overlapping search in reverse 18 | to find the start of a match. That means we should make this configurable 19 | but enable it by default for the reverse automata. It should be configurable 20 | so that folks can construct a regex that doesn't have the ability to do 21 | overlapping searches correctly. If an overlapping search is attempted with 22 | a reverse automaton that lacks starting states for each pattern, then the 23 | implementation should panic. 24 | 25 | BUT! It is also convenient to provide this option in general for folks that 26 | want a DFA that can match any pattern while also being able to match a specific 27 | pattern. 28 | 29 | Straw man: 30 | 31 | * Update dense::Config to have a `starts_for_each_pattern` option. It should 32 | be disabled by default.
33 | * In `RegexBuilder::build_many_with_size` tweak the reverse DFA configuration 34 | to have the aforementioned option enabled. 35 | * It would be interesting to add new APIs to `Regex` that support matching 36 | specific patterns, but I think this is a complication. If we did want to do 37 | this, then we should just add it to the `_at` variants and leave the rest of 38 | the API untouched. 39 | * Add a `pattern_id: Option<PatternID>` parameter to each of the five 40 | `*_at` methods on the `dfa::Automaton` trait. A value of `None` retains the 41 | existing behavior. A `Some` value means that the starting state for that 42 | specific pattern must be chosen, which in turn implies an anchored search. 43 | (This means `starts_for_each_pattern` has utility for single-pattern DFAs 44 | since it makes it possible to build a DFA that can do both unanchored and 45 | anchored searches.) 46 | * Thread this new parameter down into the various functions in `dfa::search` 47 | all the way down into `init_fwd` and `init_rev`. These functions will then 48 | pass it to `dfa.start_state_{forward,reverse}`. 49 | * This is where things get gruesome since we now need to completely re-work how 50 | start states are represented in dense and sparse DFAs _and_ it needs to be 51 | configurable. It looks like the `Start` type from `dfa::automaton` can 52 | basically remain unchanged, since it still represents one of the four 53 | possible starting states that will need to be applied for every pattern. 54 | * For `dfa::dense`, change `StartList` to `StartTable`. Currently, its only 55 | header is the state ID count, which is always 4. We'll want to change this 56 | to the stride and add a new header value that encodes the number of patterns. 57 | When the number of patterns is zero, then existing behavior is preserved and 58 | represents the case where `starts_for_each_pattern` is disabled (or in the 59 | case of an empty DFA). When non-zero, a table of starting state IDs is 60 | encoded with each row corresponding to the 4 starting states for each 61 | pattern. Before this table (even if it's empty), the 4 starting states for 62 | the entire DFA are encoded. 63 | * For `dfa::sparse`, do the same as above. They are essentially the same right 64 | now anyway, with the only difference being that sparse DFAs use `&[u8]` 65 | instead of `&[S]` (because sparse DFAs don't have any alignment 66 | requirements). 67 | * Modify `DFA::empty` to accept a `starts_for_each_pattern` bool that, when 68 | true, creates a start table with the header, the start states for the entire 69 | DFA and a row of start states for each pattern. When false, no rows are 70 | added. 71 | * Expose whether there are starting states for each pattern via a predicate 72 | on the DFA. 73 | * Modify the determinizer's `add_starts` method to basically do what it does, 74 | but also do it for each pattern when the DFA is configured for it. It should 75 | continue to reuse states as appropriate or not generate new states if they 76 | aren't needed. This will want to use the `NFA::start_pattern` method, which 77 | provides the starting NFA state ID for the given pattern. 78 | * Fix the dense->sparse conversion. At this point, this piece should be fairly 79 | straight-forward since the sparse representation of starting states is 80 | basically identical to the dense representation. 81 | 82 | At this point, I think the bug should resolve itself. 83 | 84 | ^^^^ DONE! IT WORKS!
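For posterity, here is a rough sketch of how the finished feature is meant to
be used. This is a plan-level sketch, not a committed API: the
`starts_for_each_pattern` knob and the `pattern_id` parameter are the straw-man
names from above, and the exact shape of the `*_at` methods (e.g., the
prefilter argument below) may still change.

    use regex_automata::{dfa::{dense, Automaton}, PatternID};

    fn example() -> Result<(), Box<dyn std::error::Error>> {
        // Build a multi-pattern DFA with a group of start states per
        // pattern (the straw-man config knob from above).
        let dfa = dense::Builder::new()
            .configure(dense::Config::new().starts_for_each_pattern(true))
            .build_many(&[r"[a-z]+", r"[0-9]+"])?;
        let haystack = b"abc 123";
        // Passing `Some(..)` for the pattern ID picks that pattern's start
        // state, which in turn implies an anchored search for it.
        let m = dfa
            .find_leftmost_fwd_at(
                None, // no prefilter scanner
                Some(PatternID::must(1)),
                haystack,
                4,
                haystack.len(),
            )?
            .expect("pattern 1 should match '123'");
        assert_eq!(1, m.pattern().as_usize());
        assert_eq!(7, m.offset()); // a half match: only the end offset
        Ok(())
    }

This is also the machinery that lets the overlapping reverse search pick the
start state for whichever pattern matched in the forward direction.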
85 | 86 | ----- 87 | 88 | 89 | Add top-level SyntaxConfig (or some such) that has all of the regex-syntax 90 | options forwarded, but with automata oriented docs. Then use this for all of 91 | the engines instead of having to repeat every option for every builder. 92 | 93 | ----- 94 | 95 | These produce different results. PCRE2 looks correct. Basically, we should be 96 | using the context around the `at` position correctly, which we aren't doing 97 | right now. Seems tricky to get right, particularly when confirming the match 98 | with a reverse DFA. 99 | 100 | Maybe our 'at' functions need to take a full range... Sigh. This is indeed what 101 | RE2 does. GAH. 102 | 103 | fn main() { 104 | let re = regex::Regex::new(r"(?-u)\b\sbar").unwrap(); 105 | let s = "foo bar baz"; 106 | println!("{:?}", re.find_at(s, 3).map(|m| m.as_str())); 107 | 108 | let re = pcre2::bytes::Regex::new(r"\b\sbar").unwrap(); 109 | let s = "foo bar baz"; 110 | println!("{:?}", re.find_at(s.as_bytes(), 3).unwrap()); 111 | } 112 | 113 | ^^^^ This is fixed now, but we still need to find a way to add test coverage 114 | for "context" searches. It'd be nice to do this automatically, but we'll 115 | probably just add a new 'context = [start, end]' option. 116 | 117 | ----- 118 | 119 | 120 | * Create regex-test crate, based on glob-test. Try to anticipate the needs for 121 | the full regex test suite. 122 | * See if we can clean up tests. 123 | * Provide a way to mark a test as expensive. 124 | * Provide a way to test is_match_at and find_at. 125 | * Test shortest_match_at too? Huge pain. Add tests for it. 126 | * Port ALL tests from the regex crate. Will probably need a way to mark a 127 | test as skipped. 128 | * Document tests better. 129 | * Find a way to remove byteorder dependency. 130 | * Reorganize crate API: 131 | * Have errors contain `Box` instead of `String`. 132 | * Make errors non-exhaustive. 133 | * Audit `StateID` trait for safety. 134 | * Brainstorm hard about `DFA` trait and the fact that DenseDFA and SparseDFA 135 | have inefficient implementations of some methods. Maybe use multiple 136 | traits? Answer: get rid of premultiply/classes knobs and just enable 137 | them by default. Should remove a huge amount of code. 138 | * Check whether `unsafe` is really needed to eliminate bounds checks. Use 139 | micro-benchmarks and bigger CLI workloads using `regex-automata-debug`. 140 | * Re-write module docs for `dfa` as they are no longer top-level. Keep most. 141 | * Retain any pertinent top-level crate docs, but don't rewrite yet. 142 | * Clean up builders if we can. e.g., Determinizer, minimizer, it's all a mess 143 | right now. 144 | * Clean up and add 'always_match' and 'never_match' constructors for every 145 | regex engine. 146 | * See about supporting ^, $, \A, \z, \b and \B in DFAs. Do the non-Unicode 147 | version of \b unfortunately. Carefully scrutinize how the regex crate's 148 | lazy DFA does it and try to make it comprehensible. Done! Except for the 149 | part about making it comprehensible. 150 | * Rethink prefilters? 151 | * Add `regex-automata-generate` CLI tool. This should just be a copy of 152 | the `ucd-generate dfa` and `ucd-generate regex` commands. 153 | 154 | Then build new public `nfa` sub-module. 155 | * For Unicode \b, generate \w DFA (forwards and reverse) and embed it into 156 | source for fast checking. That way, we don't need to ever do explicit UTF-8 157 | decoding anywhere. Yay. 158 | 159 | Then `lazy` sub-module. 160 | 161 | Then `onepass`. 162 | 163 | Then `jit`.
164 | 165 | ... and beyond? CRAZY. But it can be done! Build STRONG base layers. 166 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | * Consider refactoring the NFA representation such that it can be instantly 2 | loaded from a `&[u8]`, just like a sparse DFA. Main downside is that this 3 | could negatively impact using the NFA with deserialization costs. Before 4 | doing this, we should write PikeVM and backtracking implementations so that 5 | they can be benchmarked. 6 | * Add captures to NFA. 7 | * Once we're happy, re-organize the public API such that NFAs are exported 8 | and usable on their own. 9 | 10 | * Investigate why NFA shrinking seems to produce bigger DFAs after 11 | determinization, even though it makes determinization substantially 12 | faster. This might be because of its use of sparse NFA states, which have 13 | a lower constant overhead associated with them. 14 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <http://unlicense.org/> 25 | -------------------------------------------------------------------------------- /bench/.gitignore: -------------------------------------------------------------------------------- 1 | log 2 | /target 3 | -------------------------------------------------------------------------------- /bench/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "regex-automata-bench" 4 | version = "0.0.1" 5 | authors = ["Andrew Gallant <jamslam@gmail.com>"] 6 | description = "Criterion benchmark suite for regex-automata." 7 | homepage = "https://github.com/BurntSushi/regex-automata" 8 | repository = "https://github.com/BurntSushi/regex-automata" 9 | license = "Unlicense/MIT" 10 | workspace = ".." 11 | edition = "2018" 12 | 13 | [lib] 14 | bench = false 15 | 16 | [[bench]] 17 | name = "regex-automata" 18 | harness = false 19 | path = "src/bench.rs" 20 | 21 | [dependencies] 22 | criterion = "0.3.4" 23 | regex-automata = { version = "*", path = ".."
} 24 | -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-en-small-ascii.txt: -------------------------------------------------------------------------------- 1 | Presented by IM Pictures 2 | Produced by Shin Cine 3 | In association with MVP Venture Capital and Cinema Service 4 | Jeon Ji-hyun Cha Tae-hyun 5 | My Sassy Girl 6 | Exactly two years ago today, she and I buried a time capsule here. 7 | We promised to meet here two years later, but she hasn't come yet. 8 | I'm going to wait. 9 | Here we go. 10 | Please, don't move. 11 | One, two... 12 | Wait a minute. 13 | Hello? 14 | Oh, auntie. 15 | Sorry, I'm on my way. 16 | I'm really sorry. 17 | Yes, I'm coming. 18 | I'm having my photo taken. 19 | Bye. 20 | Are you ready? 21 | Here we go. 22 | One, two... 23 | My parents wanted a daughter, so they raised me like one. 24 | So I thought I was a girl until I was seven. 25 | I had to go to the women's public bath, too. 26 | The older I got, 27 | I thought my penis would get smaller and disappear. 28 | But it was the opposite. 29 | First Half 30 | He hasn't changed at all. 31 | No, I'm a real man now. 32 | Hey, asshole. 33 | Think clerical work in the army makes you a man? 34 | You irritate me! 35 | Give me a break, asshole. 36 | My job was tougher than you could imagine. 37 | Hey! 38 | I worked near the DMZ. 39 | Who are you kid -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-en-tiny-ascii.txt: -------------------------------------------------------------------------------- 1 | Presented by IM Pictures 2 | Produced by Shi -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-ru-small-utf8.txt: -------------------------------------------------------------------------------- 1 | Рэй МИЛЛАНД, Энтони КУИН, Дебра ПАЖЕТ в фильме БЕРЕГ РЕКИ 2 | в фильме также снимались: 3 | Гарри КЭРИ-мл., Чабби ДЖОНСОН, Байрон ФУЛДЖЕ, Том МакКи, Фрэнк ГЕРСТЛ сценарий Гарольда Джэкоба СМИТА и Джэймса ЛЕЙСЕСТЕРА по рассказу Гарольда Джэкоба СМИТА "Самая высокая гора" 4 | режиссер Аллан ДВАН 5 | - А вы выбрали жаркий денек, мистер. 6 | - Я всегда так делаю. 7 | - Полный бак? 8 | - Еще бы! 9 | А у вас мощная "тачка", как я погляжу. 10 | - Могу продать ее вам. 11 | - Нет, спасибо! 12 | - Собираетесь немного поохотиться? 13 | - Ну, я надеюсь на это. 14 | Вы знаете, не проиживает тут поблизости парень по имени Кэмеро -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-ru-tiny-utf8.txt: -------------------------------------------------------------------------------- 1 | Рэй МИЛЛАНД, Энтони КУ -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-zh-small-utf8.txt: -------------------------------------------------------------------------------- 1 | 我去拜托旅馆的人 2 | 出去喝就行了 3 | 我去拜托长井找工作 4 | 他说帮我问问他哥哥的公司 5 | 不知道会否成事 6 | 既然他肯答应,一定有结果的 7 | 真羡慕他至今还是优哉悠哉 8 | 叔叔,你老是偷听人家拉琴 9 | 小缝,你的颤音有进步了 10 | 我才不理你 11 | 叔叔你知道 12 | 爷爷找你来谈什么吗? 13 | 不知道 14 | 你的亲事 15 | 我去看看 16 | 走好 17 | 加油 18 | 你已经30岁了吧? 19 | 是的 20 | 身体健壮吧? 21 | 两三年来没有感冒 22 | 脑袋还算不笨吧? 23 | 是的 24 | 游手好闲太可惜了 25 | 他叫什么名字呢... 26 | 那个常去找你聊天的男人 27 | 我曾经见过他一两次 28 | 平冈吗? 29 | 那个人不算上乘人材... 30 | 听说帝大毕业后就去了外地 31 | 如今因为失败而回来 32 | 为什么? 33 | 想要为了温饱而工作吧 34 | 你在这里 35 | 我的梳子好像掉在这附近 36 | 你还是一样迷迷糊糊 37 | 坐吧,我陪你聊聊天 38 | 天气不错 39 | 去赏花如何? 
40 | 等你真的想去再说 41 | -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-zh-tiny-utf8.txt: -------------------------------------------------------------------------------- 1 | 你突然来信说最近要搬到这里 2 | -------------------------------------------------------------------------------- /bench/data/sherlock-holmes-small.txt: -------------------------------------------------------------------------------- 1 | Mr. Sherlock Holmes, who was usually very late in the mornings, save 2 | upon those not infrequent occasions when he was up all night, was seated 3 | at the breakfast table. I stood upon the hearth-rug and picked up the 4 | stick which our visitor had left behind him the night before. It was a 5 | fine, thick piece of wood, bulbous-headed, of the sort which is known as 6 | a "Penang lawyer." Just under the head was a broad silver band nearly 7 | an inch across. "To James Mortimer, M.R.C.S., from his friends of the 8 | C.C.H.," was engraved upon it, with the date "1884." It was just such a 9 | stick as the old-fashioned family practitioner used to carry--dignified, 10 | solid, and reassuring. 11 | -------------------------------------------------------------------------------- /bench/data/sherlock-holmes-tiny.txt: -------------------------------------------------------------------------------- 1 | Mr. Sherlock Holmes, who was usually very late in the mornings, save 2 | -------------------------------------------------------------------------------- /bench/src/bench.rs: -------------------------------------------------------------------------------- 1 | use criterion::{ 2 | criterion_group, criterion_main, Bencher, Criterion, Throughput, 3 | }; 4 | use regex_automata::dfa::{dense, regex}; 5 | use regex_automata::nfa::thompson; 6 | 7 | use crate::inputs::*; 8 | 9 | mod inputs; 10 | 11 | fn is_match(c: &mut Criterion) { 12 | let corpus = SHERLOCK_HUGE; 13 | define(c, "is-match", "sherlock-huge", corpus, move |b| { 14 | let re = regex::Builder::new().build(r"\p{Greek}").unwrap(); 15 | // let re = re.forward().to_sparse().unwrap(); 16 | b.iter(|| { 17 | assert!(!re.is_match(corpus)); 18 | }); 19 | }); 20 | 21 | // let corpus = OPEN_ZH_SMALL; 22 | let corpus = SHERLOCK_SMALL; 23 | define(c, "is-match", "sherlock-small", corpus, move |b| { 24 | let re = regex::Builder::new().build(r"\p{Greek}").unwrap(); 25 | // let re = re.forward().to_sparse().unwrap(); 26 | b.iter(|| { 27 | assert!(!re.is_match(corpus)); 28 | }); 29 | }); 30 | 31 | let corpus = SHERLOCK_TINY; 32 | define(c, "is-match", "sherlock-tiny", corpus, move |b| { 33 | let re = regex::Builder::new().build(r"\p{Greek}").unwrap(); 34 | b.iter(|| { 35 | assert!(!re.is_match(corpus)); 36 | }); 37 | }); 38 | 39 | let corpus = EMPTY; 40 | define(c, "is-match", "empty", corpus, move |b| { 41 | let re = regex::Builder::new().build(r"\p{Greek}").unwrap(); 42 | b.iter(|| { 43 | assert!(!re.is_match(corpus)); 44 | }); 45 | }); 46 | } 47 | 48 | // \w has 128,640 codepoints. 
49 | fn compile_unicode_word(c: &mut Criterion) { 50 | define_compile(c, "unicode-word", r"\w"); 51 | define_compile_reverse(c, "unicode-word", r"\w"); 52 | } 53 | 54 | // \p{Other_Math} has 1,362 codepoints 55 | fn compile_unicode_other_math(c: &mut Criterion) { 56 | define_compile(c, "unicode-other-math", r"\p{Other_Math}"); 57 | } 58 | 59 | // \p{Other_Uppercase} has 120 codepoints 60 | fn compile_unicode_other_uppercase(c: &mut Criterion) { 61 | define_compile( 62 | c, 63 | "unicode-other-uppercase", 64 | r"\p{any}*?\p{Other_Uppercase}", 65 | ); 66 | } 67 | 68 | fn compile_muammar(c: &mut Criterion) { 69 | define_compile( 70 | c, 71 | "muammar", 72 | r"\p{any}*?M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", 73 | ); 74 | } 75 | 76 | fn define_compile(c: &mut Criterion, group_name: &str, pattern: &'static str) { 77 | let group = format!("fwd-compile/{}", group_name); 78 | define(c, &group, "default", &[], move |b| { 79 | b.iter(|| { 80 | let result = dense::Builder::new() 81 | .configure(dense::Config::new().anchored(true)) 82 | .build(pattern); 83 | assert!(result.is_ok()); 84 | }); 85 | }); 86 | } 87 | 88 | fn define_compile_reverse( 89 | c: &mut Criterion, 90 | group_name: &str, 91 | pattern: &'static str, 92 | ) { 93 | let group = format!("rev-compile/{}", group_name); 94 | define(c, &group, "default", &[], move |b| { 95 | b.iter(|| { 96 | let result = dense::Builder::new() 97 | .configure(dense::Config::new().anchored(true)) 98 | .thompson(thompson::Config::new().reverse(true)) 99 | .build(pattern); 100 | assert!(result.is_ok()); 101 | }); 102 | }); 103 | } 104 | 105 | fn define( 106 | c: &mut Criterion, 107 | group_name: &str, 108 | bench_name: &str, 109 | corpus: &[u8], 110 | bench: impl FnMut(&mut Bencher) + 'static, 111 | ) { 112 | c.benchmark_group(group_name) 113 | .throughput(Throughput::Bytes(corpus.len() as u64)) 114 | .sample_size(15) 115 | .warm_up_time(std::time::Duration::from_millis(500)) 116 | .measurement_time(std::time::Duration::from_secs(2)) 117 | .bench_function(bench_name, bench); 118 | } 119 | 120 | criterion_group!(g1, is_match); 121 | criterion_group!(g2, compile_unicode_other_math); 122 | criterion_group!(g3, compile_unicode_other_uppercase); 123 | criterion_group!(g4, compile_muammar); 124 | criterion_group!(g5, compile_unicode_word); 125 | criterion_main!(g1, g2, g3, g4, g5); 126 | -------------------------------------------------------------------------------- /bench/src/inputs.rs: -------------------------------------------------------------------------------- 1 | pub const EMPTY: &'static [u8] = b""; 2 | 3 | pub const SHERLOCK_HUGE: &'static [u8] = 4 | include_bytes!("../data/sherlock-holmes-huge.txt"); 5 | pub const SHERLOCK_SMALL: &'static [u8] = 6 | include_bytes!("../data/sherlock-holmes-small.txt"); 7 | pub const SHERLOCK_TINY: &'static [u8] = 8 | include_bytes!("../data/sherlock-holmes-tiny.txt"); 9 | 10 | // pub const OPEN_ZH_SMALL: &'static [u8] = 11 | // include_bytes!("../data/opensubtitles2018-zh-small-utf8.txt"); 12 | -------------------------------------------------------------------------------- /bench/src/lib.rs: -------------------------------------------------------------------------------- 1 | // This is purposely empty. See src/bench.rs instead. We use src/bench.rs 2 | // to avoid including the same file in multiple build targets. 
3 | -------------------------------------------------------------------------------- /examples/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "regex-automata-examples" 4 | version = "0.0.0" #:version 5 | edition = "2018" 6 | 7 | [dev-dependencies] 8 | fst = "0.4.0" 9 | regex-automata = { version = "*", path = "..", features = ["transducer"] } 10 | 11 | [[example]] 12 | name = "fst" 13 | path = "fst.rs" 14 | -------------------------------------------------------------------------------- /examples/fst.rs: -------------------------------------------------------------------------------- 1 | // To run this example, use: 2 | // 3 | // cargo run --manifest-path examples/Cargo.toml --example fst 4 | 5 | use fst::{IntoStreamer, Set}; 6 | use regex_automata::dfa::dense; 7 | 8 | fn main() -> Result<(), Box<dyn std::error::Error>> { 9 | let set = Set::from_iter(&["FoO", "Foo", "fOO", "foo"])?; 10 | let pattern = r"(?i)foo"; 11 | let config = dense::Config::new().anchored(true); 12 | let dfa = dense::Builder::new().configure(config).build(pattern)?; 13 | 14 | let keys = set.search(&dfa).into_stream().into_strs()?; 15 | assert_eq!(keys, vec!["FoO", "Foo", "fOO", "foo"]); 16 | println!("{:?}", keys); 17 | Ok(()) 18 | } 19 | -------------------------------------------------------------------------------- /regex-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "regex-cli" 4 | version = "0.0.1" 5 | authors = ["Andrew Gallant <jamslam@gmail.com>"] 6 | description = """ 7 | A command line tool for debugging, benchmarking and generating regular 8 | expressions. 9 | """ 10 | documentation = "https://docs.rs/regex-cli" 11 | repository = "https://github.com/BurntSushi/regex-automata" 12 | keywords = ["regex", "cli", "debug", "nfa", "dfa"] 13 | license = "Unlicense/MIT" 14 | categories = ["text-processing"] 15 | autotests = false 16 | edition = "2018" 17 | 18 | [[bin]] 19 | name = "regex-cli" 20 | 21 | [dependencies] 22 | anyhow = "1.0.28" 23 | bstr = { version = "0.2.16", default-features = false, features = ["std"] } 24 | clap = { version = "2.33.0", default-features = false } 25 | memmap2 = "0.3.0" 26 | regex = "1.5.4" 27 | syntax = { package = "regex-syntax", version = "0.6.17" } 28 | tabwriter = "1.2.1" 29 | unicode-width = "0.1.7" 30 | 31 | [dependencies.automata] 32 | package = "regex-automata" 33 | path = ".."
34 | features = ["logging"] 35 | 36 | [dependencies.env_logger] 37 | version = "0.8.4" 38 | default-features = false 39 | features = ["atty", "humantime", "termcolor"] 40 | -------------------------------------------------------------------------------- /regex-cli/src/app.rs: -------------------------------------------------------------------------------- 1 | use crate::cmd; 2 | 3 | const TEMPLATE_ROOT: &'static str = "\ 4 | {bin} {version} 5 | {author} 6 | {about} 7 | USAGE: 8 | {usage} 9 | 10 | TIP: 11 | use -h for short docs and --help for long docs 12 | 13 | SUBCOMMANDS: 14 | {subcommands} 15 | 16 | OPTIONS: 17 | {unified}"; 18 | 19 | const TEMPLATE_SUBCOMMAND: &'static str = "\ 20 | USAGE: 21 | {usage} 22 | 23 | TIP: 24 | use -h for short docs and --help for long docs 25 | 26 | SUBCOMMANDS: 27 | {subcommands} 28 | 29 | OPTIONS: 30 | {unified}"; 31 | 32 | const TEMPLATE_LEAF: &'static str = "\ 33 | USAGE: 34 | {usage} 35 | 36 | TIP: 37 | use -h for short docs and --help for long docs 38 | 39 | ARGS: 40 | {positionals} 41 | 42 | OPTIONS: 43 | {unified}"; 44 | 45 | const ABOUT: &'static str = " 46 | regex-cli is a tool for interacting with regular expressions on the command 47 | line. It is useful as a debugging aide, an ad hoc benchmarking tool and as a 48 | way to conveniently pre-compile and embed regular expressions into Rust 49 | code. 50 | "; 51 | 52 | /// Convenience type alias for the Clap app type that we use. 53 | pub type App = clap::App<'static, 'static>; 54 | 55 | /// Convenience type alias for the Clap argument result type that we use. 56 | pub type Args = clap::ArgMatches<'static>; 57 | 58 | /// Convenience function for creating a new Clap sub-command. 59 | /// 60 | /// This should be used for sub-commands that contain other sub-commands. 61 | pub fn command(name: &'static str) -> App { 62 | clap::SubCommand::with_name(name) 63 | .author(clap::crate_authors!()) 64 | .version(clap::crate_version!()) 65 | .template(TEMPLATE_SUBCOMMAND) 66 | .setting(clap::AppSettings::UnifiedHelpMessage) 67 | } 68 | 69 | /// Convenience function for creating a new Clap sub-command. 70 | /// 71 | /// This should be used for sub-commands that do NOT contain other 72 | /// sub-commands. 73 | pub fn leaf(name: &'static str) -> App { 74 | clap::SubCommand::with_name(name) 75 | .author(clap::crate_authors!()) 76 | .version(clap::crate_version!()) 77 | .template(TEMPLATE_LEAF) 78 | .setting(clap::AppSettings::UnifiedHelpMessage) 79 | } 80 | 81 | /// Convenience function for defining a Clap positional argument with the 82 | /// given name. 83 | pub fn arg(name: &'static str) -> clap::Arg { 84 | clap::Arg::with_name(name) 85 | } 86 | 87 | /// Convenience function for defining a Clap argument with a long flag name 88 | /// that accepts a single value. 89 | pub fn flag(name: &'static str) -> clap::Arg { 90 | clap::Arg::with_name(name).long(name).takes_value(true) 91 | } 92 | 93 | /// Convenience function for defining a Clap argument with a long flag name 94 | /// that accepts no values. i.e., It is a boolean switch. 95 | pub fn switch(name: &'static str) -> clap::Arg { 96 | clap::Arg::with_name(name).long(name) 97 | } 98 | 99 | /// Build the main Clap application. 
100 | pub fn root() -> App { 101 | clap::App::new("regex-cli") 102 | .author(clap::crate_authors!()) 103 | .version(clap::crate_version!()) 104 | .about(ABOUT) 105 | .template(TEMPLATE_ROOT) 106 | .max_term_width(100) 107 | .setting(clap::AppSettings::UnifiedHelpMessage) 108 | .arg(switch("quiet").short("q").global(true).help("Show less output.")) 109 | .subcommand(cmd::debug::define()) 110 | .subcommand(cmd::find::define()) 111 | } 112 | -------------------------------------------------------------------------------- /regex-cli/src/cmd/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod debug; 2 | pub mod find; 3 | -------------------------------------------------------------------------------- /regex-cli/src/main.rs: -------------------------------------------------------------------------------- 1 | #![allow(warnings)] 2 | 3 | mod app; 4 | mod cmd; 5 | mod config; 6 | mod escape; 7 | mod util; 8 | 9 | fn main() -> anyhow::Result<()> { 10 | env_logger::init(); 11 | let args = app::root().get_matches(); 12 | util::run_subcommand(&args, app::root, |cmd, args| match cmd { 13 | "debug" => cmd::debug::run(args), 14 | "find" => cmd::find::run(args), 15 | _ => Err(util::UnrecognizedCommandError.into()), 16 | }) 17 | } 18 | -------------------------------------------------------------------------------- /regex-cli/src/util.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Write}; 2 | 3 | use unicode_width::UnicodeWidthStr; 4 | 5 | use crate::app::{App, Args}; 6 | 7 | /// An error that indicates that a sub-command was seen that was not 8 | /// recognized. 9 | /// 10 | /// This is a sentinel error that is always converted to a panic via 11 | /// run_subcommand. Namely, not handling a defined sub-command is a programmer 12 | /// error. 13 | #[derive(Debug)] 14 | pub struct UnrecognizedCommandError; 15 | 16 | impl std::error::Error for UnrecognizedCommandError {} 17 | 18 | impl std::fmt::Display for UnrecognizedCommandError { 19 | fn fmt(&self, _: &mut std::fmt::Formatter) -> std::fmt::Result { 20 | unreachable!() 21 | } 22 | } 23 | 24 | /// Choose the sub-command of 'args' to run with 'run'. If the sub-command 25 | /// wasn't recognized, then an error is returned. 26 | pub fn run_subcommand( 27 | args: &Args, 28 | app: impl FnOnce() -> App, 29 | run: impl FnOnce(&str, &Args) -> anyhow::Result<()>, 30 | ) -> anyhow::Result<()> { 31 | let (name, args) = args.subcommand(); 32 | if name.is_empty() || args.is_none() { 33 | app().print_help()?; 34 | writeln!(std::io::stdout(), "")?; 35 | return Ok(()); 36 | } 37 | let err = match run(name, args.unwrap()) { 38 | Ok(()) => return Ok(()), 39 | Err(err) => err, 40 | }; 41 | if err.is::<UnrecognizedCommandError>() { 42 | // The programmer should handle all defined sub-commands. 43 | unreachable!("unrecognized command: {}", name); 44 | } 45 | Err(err) 46 | } 47 | 48 | /// Time an arbitrary operation. 49 | pub fn timeit<T>(run: impl FnOnce() -> T) -> (T, std::time::Duration) { 50 | let start = std::time::Instant::now(); 51 | let t = run(); 52 | (t, start.elapsed()) 53 | } 54 | 55 | /// Conveniently time an operation that returns a result by packing the 56 | /// duration into the `Ok` variant. 57 | pub fn timeitr<T, E>( 58 | run: impl FnOnce() -> Result<T, E>, 59 | ) -> Result<(T, std::time::Duration), E> { 60 | let (result, time) = timeit(run); 61 | let t = result?; 62 | Ok((t, time)) 63 | } 64 | 65 | /// Print the given text with an ASCII art underline beneath it.
66 | /// 67 | /// If the given text is empty, then '<empty>' is printed. 68 | pub fn print_with_underline<W: Write>( 69 | mut wtr: W, 70 | text: &str, 71 | ) -> io::Result<()> { 72 | let toprint = if text.is_empty() { "<empty>" } else { text }; 73 | writeln!(wtr, "{}", toprint)?; 74 | writeln!(wtr, "{}", "-".repeat(toprint.width()))?; 75 | Ok(()) 76 | } 77 | 78 | #[derive(Debug)] 79 | pub struct Table { 80 | pairs: Vec<(&'static str, Box<dyn std::fmt::Debug>)>, 81 | } 82 | 83 | impl Table { 84 | pub fn empty() -> Table { 85 | Table { pairs: vec![] } 86 | } 87 | 88 | pub fn add<D: std::fmt::Debug + 'static>( 89 | &mut self, 90 | label: &'static str, 91 | value: D, 92 | ) { 93 | self.pairs.push((label, Box::new(value))); 94 | } 95 | 96 | pub fn print<W: Write>(&self, wtr: W) -> io::Result<()> { 97 | let mut wtr = tabwriter::TabWriter::new(wtr) 98 | .alignment(tabwriter::Alignment::Right); 99 | for (label, value) in self.pairs.iter() { 100 | writeln!(wtr, "{}:\t{:?}", label, value)?; 101 | } 102 | wtr.flush() 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /regex-test/COPYING: -------------------------------------------------------------------------------- 1 | This project is dual-licensed under the Unlicense and MIT licenses. 2 | 3 | You may use this code under the terms of either license. 4 | -------------------------------------------------------------------------------- /regex-test/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "regex-test" 4 | version = "0.0.0" #:version 5 | authors = ["Andrew Gallant <jamslam@gmail.com>"] 6 | description = """ 7 | Infrastructure for testing regexes. 8 | 9 | You probably don't want to use this crate unless you're working on a regex 10 | implementation. 11 | """ 12 | documentation = "https://docs.rs/regex-test" 13 | repository = "https://github.com/BurntSushi/regex-automata/tree/master/regex-test" 14 | readme = "README.md" 15 | keywords = ["regex", "regexp", "dfa", "automata", "test"] 16 | license = "Unlicense/MIT" 17 | edition = "2018" 18 | 19 | [lib] 20 | name = "regex_test" 21 | bench = false 22 | 23 | [dependencies] 24 | anyhow = "1.0.27" 25 | bstr = { version = "0.2.16", default-features = false, features = ["std", "serde1"] } 26 | serde = { version = "1.0.105", features = ["derive"] } 27 | toml = "0.5.6" 28 | -------------------------------------------------------------------------------- /regex-test/LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Andrew Gallant 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /regex-test/UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <http://unlicense.org/> 25 | -------------------------------------------------------------------------------- /regex-test/src/escape.rs: -------------------------------------------------------------------------------- 1 | #![allow(dead_code)] 2 | 3 | use std::ascii; 4 | use std::str; 5 | 6 | use bstr::{ByteSlice, ByteVec}; 7 | 8 | pub fn nice_raw_bytes(bytes: &[u8]) -> String { 9 | match str::from_utf8(bytes) { 10 | Ok(s) => s.to_string(), 11 | Err(_) => escape_bytes(bytes), 12 | } 13 | } 14 | 15 | pub fn escape_bytes(bytes: &[u8]) -> String { 16 | let escaped = bytes 17 | .iter() 18 | .flat_map(|&b| ascii::escape_default(b)) 19 | .collect::<Vec<u8>>(); 20 | String::from_utf8(escaped).unwrap() 21 | } 22 | 23 | pub fn hex_bytes(bytes: &[u8]) -> String { 24 | bytes.iter().map(|&b| format!(r"\x{:02X}", b)).collect() 25 | } 26 | 27 | pub fn escape_default(s: &str) -> String { 28 | s.chars().flat_map(|c| c.escape_default()).collect() 29 | } 30 | 31 | pub fn escape(bytes: &[u8]) -> String { 32 | let mut escaped = String::new(); 33 | for (s, e, ch) in bytes.char_indices() { 34 | if ch == '\u{FFFD}' { 35 | for b in bytes[s..e].bytes() { 36 | escape_byte(b, &mut escaped); 37 | } 38 | } else { 39 | escape_char(ch, &mut escaped); 40 | } 41 | } 42 | escaped 43 | } 44 | 45 | pub fn unescape<B: AsRef<str>>(s: B) -> Vec<u8> { 46 | #[derive(Clone, Copy, Eq, PartialEq)] 47 | enum State { 48 | /// The state after seeing a `\`. 49 | Escape, 50 | /// The state after seeing a `\x`. 51 | HexFirst, 52 | /// The state after seeing a `\x[0-9A-Fa-f]`. 53 | HexSecond(char), 54 | /// Default state.
55 | Literal, 56 | } 57 | 58 | let mut bytes = vec![]; 59 | let mut state = State::Literal; 60 | for c in s.as_ref().chars() { 61 | match state { 62 | State::Escape => match c { 63 | '\\' => { 64 | bytes.push(b'\\'); 65 | state = State::Literal; 66 | } 67 | 'n' => { 68 | bytes.push(b'\n'); 69 | state = State::Literal; 70 | } 71 | 'r' => { 72 | bytes.push(b'\r'); 73 | state = State::Literal; 74 | } 75 | 't' => { 76 | bytes.push(b'\t'); 77 | state = State::Literal; 78 | } 79 | 'x' => { 80 | state = State::HexFirst; 81 | } 82 | c => { 83 | bytes.push_char('\\'); 84 | bytes.push_char(c); 85 | state = State::Literal; 86 | } 87 | }, 88 | State::HexFirst => match c { 89 | '0'..='9' | 'A'..='F' | 'a'..='f' => { 90 | state = State::HexSecond(c); 91 | } 92 | c => { 93 | bytes.push_char('\\'); 94 | bytes.push_char('x'); 95 | bytes.push_char(c); 96 | state = State::Literal; 97 | } 98 | }, 99 | State::HexSecond(first) => match c { 100 | '0'..='9' | 'A'..='F' | 'a'..='f' => { 101 | let ordinal = format!("{}{}", first, c); 102 | let byte = u8::from_str_radix(&ordinal, 16).unwrap(); 103 | bytes.push_byte(byte); 104 | state = State::Literal; 105 | } 106 | c => { 107 | bytes.push_char('\\'); 108 | bytes.push_char('x'); 109 | bytes.push_char(first); 110 | bytes.push_char(c); 111 | state = State::Literal; 112 | } 113 | }, 114 | State::Literal => match c { 115 | '\\' => { 116 | state = State::Escape; 117 | } 118 | c => { 119 | bytes.push_char(c); 120 | } 121 | }, 122 | } 123 | } 124 | match state { 125 | State::Escape => bytes.push_char('\\'), 126 | State::HexFirst => bytes.push_str("\\x"), 127 | State::HexSecond(c) => { 128 | bytes.push_char('\\'); 129 | bytes.push_char('x'); 130 | bytes.push_char(c); 131 | } 132 | State::Literal => {} 133 | } 134 | bytes 135 | } 136 | 137 | /// Adds the given codepoint to the given string, escaping it if necessary. 138 | fn escape_char(cp: char, into: &mut String) { 139 | if cp.is_ascii() { 140 | escape_byte(cp as u8, into); 141 | } else { 142 | into.push(cp); 143 | } 144 | } 145 | 146 | /// Adds the given byte to the given string, escaping it if necessary. 
147 | fn escape_byte(byte: u8, into: &mut String) { 148 | match byte { 149 | 0x21..=0x5B | 0x5D..=0x7D => into.push(byte as char), 150 | b'\n' => into.push_str(r"\n"), 151 | b'\r' => into.push_str(r"\r"), 152 | b'\t' => into.push_str(r"\t"), 153 | b'\\' => into.push_str(r"\\"), 154 | _ => into.push_str(&format!(r"\x{:02X}", byte)), 155 | } 156 | } 157 | 158 | #[cfg(test)] 159 | mod tests { 160 | use super::{escape, unescape}; 161 | 162 | fn b(bytes: &'static [u8]) -> Vec<u8> { 163 | bytes.to_vec() 164 | } 165 | 166 | #[test] 167 | fn empty() { 168 | assert_eq!(b(b""), unescape(r"")); 169 | assert_eq!(r"", escape(b"")); 170 | } 171 | 172 | #[test] 173 | fn backslash() { 174 | assert_eq!(b(b"\\"), unescape(r"\\")); 175 | assert_eq!(r"\\", escape(b"\\")); 176 | } 177 | 178 | #[test] 179 | fn nul() { 180 | assert_eq!(b(b"\x00"), unescape(r"\x00")); 181 | assert_eq!(r"\x00", escape(b"\x00")); 182 | } 183 | 184 | #[test] 185 | fn nl() { 186 | assert_eq!(b(b"\n"), unescape(r"\n")); 187 | assert_eq!(r"\n", escape(b"\n")); 188 | } 189 | 190 | #[test] 191 | fn tab() { 192 | assert_eq!(b(b"\t"), unescape(r"\t")); 193 | assert_eq!(r"\t", escape(b"\t")); 194 | } 195 | 196 | #[test] 197 | fn carriage() { 198 | assert_eq!(b(b"\r"), unescape(r"\r")); 199 | assert_eq!(r"\r", escape(b"\r")); 200 | } 201 | 202 | #[test] 203 | fn nothing_simple() { 204 | assert_eq!(b(b"\\a"), unescape(r"\a")); 205 | assert_eq!(b(b"\\a"), unescape(r"\\a")); 206 | assert_eq!(r"\\a", escape(b"\\a")); 207 | } 208 | 209 | #[test] 210 | fn nothing_hex0() { 211 | assert_eq!(b(b"\\x"), unescape(r"\x")); 212 | assert_eq!(b(b"\\x"), unescape(r"\\x")); 213 | assert_eq!(r"\\x", escape(b"\\x")); 214 | } 215 | 216 | #[test] 217 | fn nothing_hex1() { 218 | assert_eq!(b(b"\\xz"), unescape(r"\xz")); 219 | assert_eq!(b(b"\\xz"), unescape(r"\\xz")); 220 | assert_eq!(r"\\xz", escape(b"\\xz")); 221 | } 222 | 223 | #[test] 224 | fn nothing_hex2() { 225 | assert_eq!(b(b"\\xzz"), unescape(r"\xzz")); 226 | assert_eq!(b(b"\\xzz"), unescape(r"\\xzz")); 227 | assert_eq!(r"\\xzz", escape(b"\\xzz")); 228 | } 229 | 230 | #[test] 231 | fn invalid_utf8() { 232 | assert_eq!(r"\xFF", escape(b"\xFF")); 233 | assert_eq!(r"a\xFFb", escape(b"a\xFFb")); 234 | } 235 | 236 | #[test] 237 | fn trailing_incomplete() { 238 | assert_eq!(b(b"\\xA"), unescape(r"\xA")); 239 | } 240 | } 241 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 79 2 | use_small_heuristics = "max" 3 | -------------------------------------------------------------------------------- /scripts/fowler-to-toml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import, division, print_function 4 | import argparse 5 | import os.path as path 6 | 7 | 8 | def read_tests(f): 9 | basename, _ = path.splitext(path.basename(f)) 10 | tests = [] 11 | prev_pattern = None 12 | 13 | for lineno, line in enumerate(open(f), 1): 14 | fields = list(filter(None, map(str.strip, line.split('\t')))) 15 | if not (4 <= len(fields) <= 5) \ 16 | or 'E' not in fields[0] or fields[0][0] == '#': 17 | continue 18 | 19 | terse_opts, pat, text, sgroups = fields[0:4] 20 | groups = [] # groups as integer ranges 21 | if sgroups == 'NOMATCH': 22 | groups = [] 23 | elif ',' in sgroups: 24 | noparen = map(lambda s: s.strip('()'), sgroups.split(')(')) 25 | for g in noparen: 26 | s, e = map(str.strip,
g.split(',')) 27 | if s == '?' and e == '?': 28 | groups.append([]) 29 | else: 30 | groups.append([int(s), int(e)]) 31 | else: 32 | # This skips tests that should result in an error. 33 | # There aren't many, so I think we can just capture those 34 | # manually. Possibly fix this in future. 35 | continue 36 | 37 | case_insensitive = False 38 | if text == "NULL": 39 | text = "" 40 | if pat == 'SAME': 41 | pat = prev_pattern 42 | if '$' in terse_opts: 43 | pat = pat.encode('utf-8').decode('unicode_escape') 44 | text = text.encode('utf-8').decode('unicode_escape') 45 | text = text.encode('unicode_escape').decode('utf-8') 46 | else: 47 | text = text.encode('unicode_escape').decode('utf-8') 48 | if 'i' in terse_opts: 49 | case_insensitive = True 50 | 51 | pat = pat.encode('unicode_escape').decode('utf-8') 52 | pat = pat.replace('\\\\', '\\') 53 | if len(groups) > 0: 54 | captures = '[' + str(groups) + ']' 55 | else: 56 | captures = '[]' 57 | test = { 58 | 'name': '"%s%d"' % (basename, lineno), 59 | 'regex': "'''%s'''" % pat, 60 | 'input': "'''%s'''" % text, 61 | 'captures': captures, 62 | 'match_limit': '1', 63 | 'unescape': 'true', 64 | } 65 | if case_insensitive: 66 | test['case_insensitive'] = 'true' 67 | tests.append(test) 68 | prev_pattern = pat 69 | return tests 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser( 74 | description='Generate match tests from an AT&T POSIX test file.', 75 | ) 76 | aa = parser.add_argument 77 | aa('outdir', help='Directory to write generated TOML files.') 78 | aa('datfile', help='A dat AT&T POSIX test file.', nargs='+') 79 | args = parser.parse_args() 80 | 81 | for datfile in args.datfile: 82 | tests = read_tests(datfile) 83 | filename = path.basename(datfile) 84 | name, _ = path.splitext(filename) 85 | toml_path = path.join(args.outdir, f'{name}.toml') 86 | 87 | with open(toml_path, 'w+') as out: 88 | print(''' 89 | # !!! DO NOT EDIT !!! 90 | # Automatically generated by scripts/fowler-to-toml. 91 | # Numbers in the test names correspond to the line number of the test from 92 | # the original dat file. 93 | '''.strip(), file=out) 94 | print(file=out) 95 | for t in tests: 96 | print('[[tests]]', file=out) 97 | for k, v in t.items(): 98 | print(f'{k} = {v}', file=out) 99 | print(file=out) 100 | -------------------------------------------------------------------------------- /scripts/generate-fowler-tests: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import, division, print_function 4 | import argparse 5 | import datetime 6 | import os.path as path 7 | 8 | 9 | def print_tests(tests): 10 | print('\n'.join([test_tostr(t) for t in tests])) 11 | 12 | 13 | def read_tests(f): 14 | basename, _ = path.splitext(path.basename(f)) 15 | tests = [] 16 | for lineno, line in enumerate(open(f), 1): 17 | fields = list(filter(None, map(str.strip, line.split('\t')))) 18 | if not (4 <= len(fields) <= 5) \ 19 | or 'E' not in fields[0] or fields[0][0] == '#': 20 | continue 21 | 22 | opts, pat, text, sgroups = fields[0:4] 23 | groups = [] # groups as integer ranges 24 | if sgroups == 'NOMATCH': 25 | groups = [None] 26 | elif ',' in sgroups: 27 | noparen = map(lambda s: s.strip('()'), sgroups.split(')(')) 28 | for g in noparen: 29 | s, e = map(str.strip, g.split(',')) 30 | if s == '?' and e == '?': 31 | groups.append(None) 32 | else: 33 | groups.append((int(s), int(e))) 34 | else: 35 | # This skips tests that should result in an error. 
36 | # There aren't many, so I think we can just capture those 37 | # manually. Possibly fix this in future. 38 | continue 39 | 40 | if pat == 'SAME': 41 | pat = tests[-1][1] 42 | if '$' in opts: 43 | pat = pat.encode('utf-8').decode('unicode_escape') 44 | text = text.encode('utf-8').decode('unicode_escape') 45 | if 'i' in opts: 46 | pat = '(?i)%s' % pat 47 | 48 | name = '%s_%d' % (basename, lineno) 49 | tests.append((name, pat, text, groups)) 50 | return tests 51 | 52 | 53 | def test_tostr(t): 54 | lineno, pat, text, groups = t 55 | options = map(group_tostr, groups) 56 | pat = pat.encode('unicode_escape').decode('utf-8') 57 | if text == 'NULL': 58 | text = '' 59 | else: 60 | text = text.encode('unicode_escape').decode('utf-8') 61 | return ' ("%s", "%s", b"%s", &[%s]),' \ 62 | % (lineno, pat, text, ', '.join(options)) 63 | 64 | 65 | def group_tostr(g): 66 | if g is None: 67 | return 'None' 68 | else: 69 | return 'Some((%d, %d))' % (g[0], g[1]) 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser( 74 | description='Generate match tests from an AT&T POSIX test file.') 75 | aa = parser.add_argument 76 | aa('files', nargs='+', 77 | help='A list of dat AT&T POSIX test files. See src/testdata') 78 | args = parser.parse_args() 79 | 80 | tests = [] 81 | for f in args.files: 82 | tests += read_tests(f) 83 | 84 | tpl = '''// !!! DO NOT EDIT !!! 85 | // Automatically generated by 'scripts/generate-fowler-tests' on {date}. 86 | ''' 87 | print(tpl.format(date=str(datetime.datetime.now()))) 88 | 89 | strty = "&'static str" 90 | capty = "&'static [Option<(usize, usize)>]" 91 | elty = "({str}, {str}, &'static [u8], {cap})".format(str=strty, cap=capty) 92 | print('pub const TESTS: &\'static [%s] = &[' % elty, end='') 93 | for f in args.files: 94 | print('') 95 | print(' // tests from %s' % path.basename(f)) 96 | print_tests(read_tests(f)) 97 | print('];') 98 | -------------------------------------------------------------------------------- /src/dfa/error.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | nfa, 3 | util::{ 4 | id::{PatternID, StateID}, 5 | start::Start, 6 | }, 7 | }; 8 | 9 | /// An error that occurred during the construction of a DFA. 10 | /// 11 | /// This error does not provide many introspection capabilities. There are 12 | /// generally only two things you can do with it: 13 | /// 14 | /// * Obtain a human readable message via its `std::fmt::Display` impl. 15 | /// * Access an underlying [`nfa::thompson::Error`] type from its `source` 16 | /// method via the `std::error::Error` trait. This error only occurs when using 17 | /// convenience routines for building a DFA directly from a pattern string. 18 | /// 19 | /// When the `std` feature is enabled, this implements the `std::error::Error` 20 | /// trait. 21 | #[derive(Clone, Debug)] 22 | pub struct Error { 23 | kind: ErrorKind, 24 | } 25 | 26 | /// The kind of error that occurred during the construction of a DFA. 27 | /// 28 | /// Note that this error is non-exhaustive. Adding new variants is not 29 | /// considered a breaking change. 30 | #[derive(Clone, Debug)] 31 | enum ErrorKind { 32 | /// An error that occurred while constructing an NFA as a precursor step 33 | /// before a DFA is compiled. 34 | NFA(nfa::thompson::Error), 35 | /// An error that occurred because an unsupported regex feature was used. 36 | /// The message string describes which unsupported feature was used.
37 | /// 38 | /// The primary regex feature that is unsupported by DFAs is the Unicode 39 | /// word boundary look-around assertion (`\b`). This can be worked around 40 | /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling the 41 | /// [`dense::Builder::allow_unicode_word_boundary`](dense/struct.Builder.html#method.allow_unicode_word_boundary) 42 | /// option when building a DFA. 43 | Unsupported(&'static str), 44 | /// An error that occurs if too many states are produced while building a 45 | /// DFA. 46 | TooManyStates, 47 | /// An error that occurs if too many start states are needed while building 48 | /// a DFA. 49 | /// 50 | /// This is a kind of oddball error that occurs when building a DFA with 51 | /// start states enabled for each pattern and enough patterns to cause 52 | /// the table of start states to overflow `usize`. 53 | TooManyStartStates, 54 | /// This is another oddball error that can occur if there are too many 55 | /// patterns spread out across too many match states. 56 | TooManyMatchPatternIDs, 57 | /// An error that occurs if the DFA got too big during determinization. 58 | DFAExceededSizeLimit { limit: usize }, 59 | /// An error that occurs if auxiliary storage (not the DFA) used during 60 | /// determinization got too big. 61 | DeterminizeExceededSizeLimit { limit: usize }, 62 | } 63 | 64 | impl Error { 65 | /// Return the kind of this error. 66 | fn kind(&self) -> &ErrorKind { 67 | &self.kind 68 | } 69 | 70 | pub(crate) fn nfa(err: nfa::thompson::Error) -> Error { 71 | Error { kind: ErrorKind::NFA(err) } 72 | } 73 | 74 | pub(crate) fn unsupported_dfa_word_boundary_unicode() -> Error { 75 | let msg = "cannot build DFAs for regexes with Unicode word \ 76 | boundaries; switch to ASCII word boundaries, or \ 77 | heuristically enable Unicode word boundaries or use a \ 78 | different regex engine"; 79 | Error { kind: ErrorKind::Unsupported(msg) } 80 | } 81 | 82 | pub(crate) fn too_many_states() -> Error { 83 | Error { kind: ErrorKind::TooManyStates } 84 | } 85 | 86 | pub(crate) fn too_many_start_states() -> Error { 87 | Error { kind: ErrorKind::TooManyStartStates } 88 | } 89 | 90 | pub(crate) fn too_many_match_pattern_ids() -> Error { 91 | Error { kind: ErrorKind::TooManyMatchPatternIDs } 92 | } 93 | 94 | pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> Error { 95 | Error { kind: ErrorKind::DFAExceededSizeLimit { limit } } 96 | } 97 | 98 | pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> Error { 99 | Error { kind: ErrorKind::DeterminizeExceededSizeLimit { limit } } 100 | } 101 | } 102 | 103 | #[cfg(feature = "std")] 104 | impl std::error::Error for Error { 105 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 106 | match self.kind() { 107 | ErrorKind::NFA(ref err) => Some(err), 108 | ErrorKind::Unsupported(_) => None, 109 | ErrorKind::TooManyStates => None, 110 | ErrorKind::TooManyStartStates => None, 111 | ErrorKind::TooManyMatchPatternIDs => None, 112 | ErrorKind::DFAExceededSizeLimit { .. } => None, 113 | ErrorKind::DeterminizeExceededSizeLimit { .. 
} => None, 114 | } 115 | } 116 | } 117 | 118 | impl core::fmt::Display for Error { 119 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 120 | match self.kind() { 121 | ErrorKind::NFA(_) => write!(f, "error building NFA"), 122 | ErrorKind::Unsupported(ref msg) => { 123 | write!(f, "unsupported regex feature for DFAs: {}", msg) 124 | } 125 | ErrorKind::TooManyStates => write!( 126 | f, 127 | "number of DFA states exceeds limit of {}", 128 | StateID::LIMIT, 129 | ), 130 | ErrorKind::TooManyStartStates => { 131 | let stride = Start::count(); 132 | // The start table has `stride` entries for starting states for 133 | // the entire DFA, and then `stride` entries for each pattern 134 | // if start states for each pattern are enabled (which is the 135 | // only way this error can occur). Thus, the total number of 136 | // patterns that can fit in the table is `stride` less than 137 | // what we can allocate. 138 | let limit = ((core::isize::MAX as usize) - stride) / stride; 139 | write!( 140 | f, 141 | "compiling DFA with start states exceeds pattern \ 142 | limit of {}", 143 | limit, 144 | ) 145 | } 146 | ErrorKind::TooManyMatchPatternIDs => write!( 147 | f, 148 | "compiling DFA with total patterns in all match states \ 149 | exceeds limit of {}", 150 | PatternID::LIMIT, 151 | ), 152 | ErrorKind::DFAExceededSizeLimit { limit } => write!( 153 | f, 154 | "DFA exceeded size limit of {:?} during determinization", 155 | limit, 156 | ), 157 | ErrorKind::DeterminizeExceededSizeLimit { limit } => { 158 | write!(f, "determinization exceeded size limit of {:?}", limit) 159 | } 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/dfa/transducer.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | dfa::{automaton::Automaton, dense, sparse}, 3 | util::id::StateID, 4 | }; 5 | 6 | impl<T: AsRef<[u32]>> fst::Automaton for dense::DFA<T> { 7 | type State = StateID; 8 | 9 | #[inline] 10 | fn start(&self) -> StateID { 11 | self.start_state_forward(None, &[], 0, 0) 12 | } 13 | 14 | #[inline] 15 | fn is_match(&self, state: &StateID) -> bool { 16 | self.is_match_state(*state) 17 | } 18 | 19 | #[inline] 20 | fn accept(&self, state: &StateID, byte: u8) -> StateID { 21 | if fst::Automaton::is_match(self, state) { 22 | return *state; 23 | } 24 | self.next_state(*state, byte) 25 | } 26 | 27 | #[inline] 28 | fn accept_eof(&self, state: &StateID) -> Option<StateID> { 29 | if fst::Automaton::is_match(self, state) { 30 | return Some(*state); 31 | } 32 | Some(self.next_eoi_state(*state)) 33 | } 34 | 35 | #[inline] 36 | fn can_match(&self, state: &StateID) -> bool { 37 | !self.is_dead_state(*state) 38 | } 39 | } 40 | 41 | impl<T: AsRef<[u8]>> fst::Automaton for sparse::DFA<T> { 42 | type State = StateID; 43 | 44 | #[inline] 45 | fn start(&self) -> StateID { 46 | self.start_state_forward(None, &[], 0, 0) 47 | } 48 | 49 | #[inline] 50 | fn is_match(&self, state: &StateID) -> bool { 51 | self.is_match_state(*state) 52 | } 53 | 54 | #[inline] 55 | fn accept(&self, state: &StateID, byte: u8) -> StateID { 56 | if fst::Automaton::is_match(self, state) { 57 | return *state; 58 | } 59 | self.next_state(*state, byte) 60 | } 61 | 62 | #[inline] 63 | fn accept_eof(&self, state: &StateID) -> Option<StateID> { 64 | if fst::Automaton::is_match(self, state) { 65 | return Some(*state); 66 | } 67 | Some(self.next_eoi_state(*state)) 68 | } 69 | 70 | #[inline] 71 | fn can_match(&self, state: &StateID) -> bool { 72 | !self.is_dead_state(*state) 73 | } 74 | }
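Taken together, these two impls let a DFA drive a search through an fst set: the fst machinery walks keys byte by byte while `can_match` prunes subtrees that lead to a dead state. Note the early returns in `accept` and `accept_eof`: once the DFA enters a match state it never leaves it, so every key extending a matching prefix is also accepted. A minimal usage sketch, modeled on the tests below (the key set and pattern here are made up for illustration, and this assumes the fst integration is compiled in):

```rust
use fst::{IntoStreamer, Set, Streamer};
use regex_automata::dfa::dense;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // fst sets must be built from keys in lexicographic order.
    let set = Set::from_iter(&["August", "July", "June"])?;

    // Anchoring the DFA forces keys to match from their first byte.
    let dfa = dense::Builder::new()
        .configure(dense::Config::new().anchored(true))
        .build("Ju[a-z]*")?;

    // Passing `&dfa` works because `fst::Automaton` is implemented for
    // references to automatons; keys stream back in sorted order.
    let mut stream = set.search(&dfa).into_stream();
    let mut matched = vec![];
    while let Some(key) = stream.next() {
        matched.push(key.to_vec());
    }
    assert_eq!(matched, vec![b"July".to_vec(), b"June".to_vec()]);
    Ok(())
}
```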
75 | 76 | #[cfg(test)] 77 | mod tests { 78 | use bstr::BString; 79 | use fst::{Automaton, IntoStreamer, Set, Streamer}; 80 | 81 | use crate::dfa::{dense, sparse}; 82 | 83 | fn search<A: Automaton, D: AsRef<[u8]>>( 84 | set: &Set<D>, 85 | aut: A, 86 | ) -> Vec<BString> { 87 | let mut stream = set.search(aut).into_stream(); 88 | 89 | let mut results = vec![]; 90 | while let Some(key) = stream.next() { 91 | results.push(BString::from(key)); 92 | } 93 | results 94 | } 95 | 96 | #[test] 97 | fn dense_anywhere() { 98 | let set = 99 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 100 | .unwrap(); 101 | let dfa = dense::DFA::new("ba.*").unwrap(); 102 | let got = search(&set, &dfa); 103 | assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]); 104 | } 105 | 106 | #[test] 107 | fn dense_anchored() { 108 | let set = 109 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 110 | .unwrap(); 111 | let dfa = dense::Builder::new() 112 | .configure(dense::Config::new().anchored(true)) 113 | .build("ba.*") 114 | .unwrap(); 115 | let got = search(&set, &dfa); 116 | assert_eq!(got, vec!["bar", "baz"]); 117 | } 118 | 119 | #[test] 120 | fn dense_assertions_start() { 121 | let set = 122 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 123 | .unwrap(); 124 | let dfa = dense::Builder::new().build("^ba.*").unwrap(); 125 | let got = search(&set, &dfa); 126 | assert_eq!(got, vec!["bar", "baz"]); 127 | } 128 | 129 | #[test] 130 | fn dense_assertions_end() { 131 | let set = 132 | Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"]) 133 | .unwrap(); 134 | let dfa = dense::Builder::new().build(".*x$").unwrap(); 135 | let got = search(&set, &dfa); 136 | assert_eq!(got, vec!["bax", "xbax"]); 137 | } 138 | 139 | #[test] 140 | fn dense_assertions_word() { 141 | let set = 142 | Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap(); 143 | let dfa = dense::Builder::new().build(r"(?-u)\bfoo\b").unwrap(); 144 | let got = search(&set, &dfa); 145 | assert_eq!(got, vec!["foo", "zzz foo zzz"]); 146 | } 147 | 148 | #[test] 149 | fn sparse_anywhere() { 150 | let set = 151 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 152 | .unwrap(); 153 | let dfa = sparse::DFA::new("ba.*").unwrap(); 154 | let got = search(&set, &dfa); 155 | assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]); 156 | } 157 | 158 | #[test] 159 | fn sparse_anchored() { 160 | let set = 161 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 162 | .unwrap(); 163 | let dfa = dense::Builder::new() 164 | .configure(dense::Config::new().anchored(true)) 165 | .build("ba.*") 166 | .unwrap() 167 | .to_sparse() 168 | .unwrap(); 169 | let got = search(&set, &dfa); 170 | assert_eq!(got, vec!["bar", "baz"]); 171 | } 172 | 173 | #[test] 174 | fn sparse_assertions_start() { 175 | let set = 176 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 177 | .unwrap(); 178 | let dfa = 179 | dense::Builder::new().build("^ba.*").unwrap().to_sparse().unwrap(); 180 | let got = search(&set, &dfa); 181 | assert_eq!(got, vec!["bar", "baz"]); 182 | } 183 | 184 | #[test] 185 | fn sparse_assertions_end() { 186 | let set = 187 | Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"]) 188 | .unwrap(); 189 | let dfa = 190 | dense::Builder::new().build(".*x$").unwrap().to_sparse().unwrap(); 191 | let got = search(&set, &dfa); 192 | assert_eq!(got, vec!["bax", "xbax"]); 193 | } 194 | 195 | #[test] 196 | fn sparse_assertions_word() { 197 | let set = 198 | Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap(); 199 | let
dfa = dense::Builder::new() 200 | .build(r"(?-u)\bfoo\b") 201 | .unwrap() 202 | .to_sparse() 203 | .unwrap(); 204 | let got = search(&set, &dfa); 205 | assert_eq!(got, vec!["foo", "zzz foo zzz"]); 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /src/hybrid/error.rs: -------------------------------------------------------------------------------- 1 | use crate::{hybrid::id::LazyStateIDError, nfa}; 2 | 3 | /// An error that occurs when initial construction of a lazy DFA fails. 4 | /// 5 | /// A build error can occur when insufficient cache capacity is configured or 6 | /// if something about the NFA is unsupported. (For example, if one attempts 7 | /// to build a lazy DFA without heuristic Unicode support but with an NFA that 8 | /// contains a Unicode word boundary.) 9 | /// 10 | /// When the `std` feature is enabled, this implements the `std::error::Error` 11 | /// trait. 12 | #[derive(Clone, Debug)] 13 | pub struct BuildError { 14 | kind: BuildErrorKind, 15 | } 16 | 17 | #[derive(Clone, Debug)] 18 | enum BuildErrorKind { 19 | NFA(nfa::thompson::Error), 20 | InsufficientCacheCapacity { minimum: usize, given: usize }, 21 | InsufficientStateIDCapacity { err: LazyStateIDError }, 22 | Unsupported(&'static str), 23 | } 24 | 25 | impl BuildError { 26 | fn kind(&self) -> &BuildErrorKind { 27 | &self.kind 28 | } 29 | 30 | pub(crate) fn nfa(err: nfa::thompson::Error) -> BuildError { 31 | BuildError { kind: BuildErrorKind::NFA(err) } 32 | } 33 | 34 | pub(crate) fn insufficient_cache_capacity( 35 | minimum: usize, 36 | given: usize, 37 | ) -> BuildError { 38 | BuildError { 39 | kind: BuildErrorKind::InsufficientCacheCapacity { minimum, given }, 40 | } 41 | } 42 | 43 | pub(crate) fn insufficient_state_id_capacity( 44 | err: LazyStateIDError, 45 | ) -> BuildError { 46 | BuildError { 47 | kind: BuildErrorKind::InsufficientStateIDCapacity { err }, 48 | } 49 | } 50 | 51 | pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError { 52 | let msg = "cannot build lazy DFAs for regexes with Unicode word \ 53 | boundaries; switch to ASCII word boundaries, or \ 54 | heuristically enable Unicode word boundaries or use a \ 55 | different regex engine"; 56 | BuildError { kind: BuildErrorKind::Unsupported(msg) } 57 | } 58 | } 59 | 60 | #[cfg(feature = "std")] 61 | impl std::error::Error for BuildError { 62 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 63 | match self.kind() { 64 | BuildErrorKind::NFA(ref err) => Some(err), 65 | BuildErrorKind::InsufficientCacheCapacity { .. } => None, 66 | // LazyStateIDError is an implementation detail, don't expose it. 67 | BuildErrorKind::InsufficientStateIDCapacity { .. } => None, 68 | BuildErrorKind::Unsupported(_) => None, 69 | } 70 | } 71 | } 72 | 73 | impl core::fmt::Display for BuildError { 74 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 75 | match self.kind() { 76 | BuildErrorKind::NFA(_) => write!(f, "error building NFA"), 77 | BuildErrorKind::InsufficientCacheCapacity { minimum, given } => { 78 | write!( 79 | f, 80 | "given cache capacity ({}) is smaller than \ 81 | minimum required ({})", 82 | given, minimum, 83 | ) 84 | } 85 | BuildErrorKind::InsufficientStateIDCapacity { ref err } => { 86 | err.fmt(f) 87 | } 88 | BuildErrorKind::Unsupported(ref msg) => { 89 | write!(f, "unsupported regex feature for DFAs: {}", msg) 90 | } 91 | } 92 | } 93 | } 94 | 95 | /// An error that occurs when cache usage has become inefficient. 
96 | /// 97 | /// One of the weaknesses of a lazy DFA is that it may need to clear its 98 | /// cache repeatedly if it's not big enough. If this happens too much, then it 99 | /// can slow searching down significantly. A mitigation to this is to use 100 | /// heuristics to detect whether the cache is being used efficiently or not. 101 | /// If not, then a lazy DFA can return a `CacheError`. 102 | /// 103 | /// The default configuration of a lazy DFA in this crate is 104 | /// set such that a `CacheError` will never occur. Instead, 105 | /// callers must opt into this behavior with settings like 106 | /// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count). 107 | /// 108 | /// When the `std` feature is enabled, this implements the `std::error::Error` 109 | /// trait. 110 | #[derive(Clone, Debug)] 111 | pub struct CacheError(()); 112 | 113 | impl CacheError { 114 | pub(crate) fn too_many_cache_clears() -> CacheError { 115 | CacheError(()) 116 | } 117 | } 118 | 119 | #[cfg(feature = "std")] 120 | impl std::error::Error for CacheError { 121 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 122 | None 123 | } 124 | } 125 | 126 | impl core::fmt::Display for CacheError { 127 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 128 | write!(f, "lazy DFA cache has been cleared too many times") 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/hybrid/mod.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | A module for building and searching with lazy deterministic finite automata 3 | (DFAs). 4 | 5 | Like other modules in this crate, lazy DFAs support a rich regex syntax with 6 | Unicode features. The key feature of a lazy DFA is that it builds itself 7 | incrementally during search, and never uses more than a configured capacity of 8 | memory. Thus, when searching with a lazy DFA, one must supply a mutable "cache" 9 | in which the actual DFA's transition table is stored. 10 | 11 | If you're looking for fully compiled DFAs, then please see the top-level 12 | [`dfa` module](crate::dfa). 13 | 14 | # Overview 15 | 16 | This section gives a brief overview of the primary types in this module: 17 | 18 | * A [`regex::Regex`] provides a way to search for matches of a regular 19 | expression using lazy DFAs. This includes iterating over matches with both the 20 | start and end positions of each match. 21 | * A [`dfa::DFA`] provides direct low level access to a lazy DFA. 22 | 23 | # Example: basic regex searching 24 | 25 | This example shows how to compile a regex using the default configuration 26 | and then use it to find matches in a byte string: 27 | 28 | ``` 29 | use regex_automata::{hybrid::regex::Regex, MultiMatch}; 30 | 31 | let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; 32 | let mut cache = re.create_cache(); 33 | 34 | let text = b"2018-12-24 2016-10-08"; 35 | let matches: Vec<MultiMatch> = 36 | re.find_leftmost_iter(&mut cache, text).collect(); 37 | assert_eq!(matches, vec![ 38 | MultiMatch::must(0, 0, 10), 39 | MultiMatch::must(0, 11, 21), 40 | ]); 41 | # Ok::<(), Box<dyn std::error::Error>>(()) 42 | ``` 43 | 44 | # Example: searching with regex sets 45 | 46 | The lazy DFAs in this module all fully support searching with multiple regexes 47 | simultaneously.
You can use this support with standard leftmost-first style 48 | searching to find non-overlapping matches: 49 | 50 | ``` 51 | use regex_automata::{hybrid::regex::Regex, MultiMatch}; 52 | 53 | let re = Regex::new_many(&[r"\w+", r"\S+"])?; 54 | let mut cache = re.create_cache(); 55 | 56 | let text = b"@foo bar"; 57 | let matches: Vec<MultiMatch> = 58 | re.find_leftmost_iter(&mut cache, text).collect(); 59 | assert_eq!(matches, vec![ 60 | MultiMatch::must(1, 0, 4), 61 | MultiMatch::must(0, 5, 8), 62 | ]); 63 | # Ok::<(), Box<dyn std::error::Error>>(()) 64 | ``` 65 | 66 | Or use overlapping style searches to find all possible occurrences: 67 | 68 | ``` 69 | use regex_automata::{hybrid::{dfa, regex::Regex}, MatchKind, MultiMatch}; 70 | 71 | // N.B. For overlapping searches, we need the underlying lazy DFA to report all 72 | // possible matches. 73 | let re = Regex::builder() 74 | .dfa(dfa::Config::new().match_kind(MatchKind::All)) 75 | .build_many(&[r"\w{3}", r"\S{3}"])?; 76 | let mut cache = re.create_cache(); 77 | 78 | let text = b"@foo bar"; 79 | let matches: Vec<MultiMatch> = 80 | re.find_overlapping_iter(&mut cache, text).collect(); 81 | assert_eq!(matches, vec![ 82 | MultiMatch::must(1, 0, 3), 83 | MultiMatch::must(0, 1, 4), 84 | MultiMatch::must(1, 1, 4), 85 | MultiMatch::must(0, 5, 8), 86 | MultiMatch::must(1, 5, 8), 87 | ]); 88 | # Ok::<(), Box<dyn std::error::Error>>(()) 89 | ``` 90 | 91 | # When should I use this? 92 | 93 | Generally speaking, if you can abide the use of mutable state during search, 94 | and you don't need things like capturing groups or Unicode word boundary 95 | support in non-ASCII text, then a lazy DFA is likely a robust choice with 96 | respect to both search speed and memory usage. Note however that its speed 97 | may be worse than a general purpose regex engine if you don't select a good 98 | [prefilter](crate::util::prefilter). 99 | 100 | If you know ahead of time that your pattern would result in a very large DFA 101 | if it was fully compiled, it may be better to use an NFA simulation instead 102 | of a lazy DFA. Either that, or increase the cache capacity of your lazy DFA 103 | to something that is big enough to hold the state machine (likely through 104 | experimentation). The issue here is that if the cache is too small, then it 105 | could wind up being reset too frequently and this might decrease searching 106 | speed significantly. 107 | 108 | # Differences with fully compiled DFAs 109 | 110 | A [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) and a 111 | [`dfa::regex::Regex`](crate::dfa::regex::Regex) both have the same capabilities 112 | (and similarly for their underlying DFAs), but they achieve them through 113 | different means. The main difference is that a hybrid or "lazy" regex builds 114 | its DFA lazily during search, whereas a fully compiled regex will build its 115 | DFA at construction time. While building a DFA at search time might sound like 116 | it's slow, it tends to work out because most bytes seen during a search will 117 | reuse pre-built parts of the DFA and thus can be almost as fast as a fully 118 | compiled DFA. The main downside is that searching requires mutable space to 119 | store the DFA, and, in the worst case, a search can result in a new state being 120 | created for each byte seen, which would make searching quite a bit slower. 121 | 122 | A fully compiled DFA never has to worry about searches being slower once 123 | it's built. (Aside from, say, the transition table being so large that it 124 | is subject to harsh CPU cache effects.)
However, of course, building a full 125 | DFA can be quite time consuming and memory hungry, particularly since it is 126 | so easy to build large DFAs when Unicode mode is enabled. 127 | 128 | A lazy DFA strikes a nice balance _in practice_, particularly in the 129 | presence of Unicode mode, by only building what is needed. It avoids the 130 | worst case exponential time complexity of DFA compilation by guaranteeing that 131 | it will build at most one new state per byte searched. While the worst 132 | case here can lead to a very high constant, it will never be exponential. 133 | 134 | # Syntax 135 | 136 | This module supports the same syntax as the `regex` crate, since they share the 137 | same parser. You can find an exhaustive list of supported syntax in the 138 | [documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax). 139 | 140 | There are two things that are not supported by the lazy DFAs in this module: 141 | 142 | * Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top 143 | of them) can only find the offsets of an entire match, but cannot resolve 144 | the offsets of each capturing group. This is because DFAs do not have the 145 | expressive power necessary. 146 | * Unicode word boundaries. These present particularly difficult challenges for 147 | DFA construction and would result in an explosion in the number of states. 148 | One can enable [`dfa::Config::unicode_word_boundary`] though, which provides 149 | heuristic support for Unicode word boundaries that only works on ASCII text. 150 | Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work 151 | on any input. 152 | 153 | There are no plans to lift either of these limitations. 154 | 155 | Note that these restrictions are identical to the restrictions on fully 156 | compiled DFAs. 157 | 158 | # Support for `alloc`-only 159 | 160 | This crate comes with `alloc` and `std` features that are enabled by default. 161 | One can disable the `std` feature and still use the full API of a lazy DFA. 162 | (You should use `std` when possible, since it permits providing implementations 163 | of the `std::error::Error` trait, and does enable some minor internal 164 | optimizations.) 165 | 166 | This module does require at least the `alloc` feature though. It is not 167 | available in any capacity without `alloc`. 168 | */ 169 | 170 | pub use self::{ 171 | error::{BuildError, CacheError}, 172 | id::{LazyStateID, OverlappingState}, 173 | }; 174 | 175 | pub mod dfa; 176 | mod error; 177 | mod id; 178 | pub mod regex; 179 | mod search; 180 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | This crate provides an "expert" API for executing regular expressions using 3 | finite automata. 4 | 5 | **WARNING**: This `0.2` release of `regex-automata` was published 6 | before it was ready in order to unblock work elsewhere that needed some 7 | of the new APIs in this release. At the time of writing, it is 8 | strongly preferred that you continue using the 9 | [`regex-automata 0.1`](https://docs.rs/regex-automata/0.1/regex_automata/) 10 | release. Since this release represents an unfinished state, please do not 11 | create issues for this release unless it's for a critical bug.
12 | */ 13 | 14 | #![allow(warnings)] 15 | // #![deny(missing_docs)] 16 | #![cfg_attr(not(feature = "std"), no_std)] 17 | 18 | #[cfg(not(any( 19 | target_pointer_width = "16", 20 | target_pointer_width = "32", 21 | target_pointer_width = "64" 22 | )))] 23 | compile_error!("regex-automata currently not supported on non-{16,32,64}"); 24 | 25 | #[cfg(feature = "alloc")] 26 | extern crate alloc; 27 | 28 | #[doc(inline)] 29 | pub use crate::util::id::PatternID; 30 | #[cfg(feature = "alloc")] 31 | pub use crate::util::syntax::SyntaxConfig; 32 | pub use crate::util::{ 33 | bytes::{DeserializeError, SerializeError}, 34 | matchtypes::{HalfMatch, Match, MatchError, MatchKind, MultiMatch}, 35 | }; 36 | 37 | #[macro_use] 38 | mod macros; 39 | 40 | pub mod dfa; 41 | #[cfg(feature = "alloc")] 42 | pub mod hybrid; 43 | #[doc(hidden)] 44 | #[cfg(feature = "alloc")] 45 | pub mod nfa; 46 | #[doc(hidden)] 47 | pub mod util; 48 | -------------------------------------------------------------------------------- /src/macros.rs: -------------------------------------------------------------------------------- 1 | /// A simple macro for defining bitfield accessors/mutators. 2 | #[cfg(feature = "alloc")] 3 | macro_rules! define_bool { 4 | ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { 5 | fn $is_fn_name(&self) -> bool { 6 | self.bools & (0b1 << $bit) > 0 7 | } 8 | 9 | fn $set_fn_name(&mut self, yes: bool) { 10 | if yes { 11 | self.bools |= 1 << $bit; 12 | } else { 13 | self.bools &= !(1 << $bit); 14 | } 15 | } 16 | }; 17 | } 18 | 19 | macro_rules! log { 20 | ($($tt:tt)*) => { 21 | #[cfg(feature = "logging")] 22 | { 23 | $($tt)* 24 | } 25 | } 26 | } 27 | 28 | macro_rules! trace { 29 | ($($tt:tt)*) => { log!(log::trace!($($tt)*)) } 30 | } 31 | -------------------------------------------------------------------------------- /src/nfa/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod thompson; 2 | -------------------------------------------------------------------------------- /src/nfa/thompson/error.rs: -------------------------------------------------------------------------------- 1 | use crate::util::id::{PatternID, StateID}; 2 | 3 | /// An error that can occur during the construction of a Thompson NFA. 4 | /// 5 | /// This error does not provide many introspection capabilities. There are 6 | /// generally only two things you can do with it: 7 | /// 8 | /// * Obtain a human readable message via its `std::fmt::Display` impl. 9 | /// * Access an underlying [`regex_syntax::Error`] type from its `source` 10 | /// method via the `std::error::Error` trait. This error only occurs when using 11 | /// convenience routines for building an NFA directly from a pattern string. 12 | /// 13 | /// Otherwise, errors typically occur when a limit has been breached. For 14 | /// example, if the total heap usage of the compiled NFA exceeds the limit 15 | /// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then 16 | /// building the NFA will fail. 17 | #[derive(Clone, Debug)] 18 | pub struct Error { 19 | kind: ErrorKind, 20 | } 21 | 22 | /// The kind of error that occurred during the construction of a Thompson NFA. 23 | #[derive(Clone, Debug)] 24 | enum ErrorKind { 25 | /// An error that occurred while parsing a regular expression. Note that 26 | /// this error may be printed over multiple lines, and is generally 27 | /// intended to be end user readable on its own.
28 | Syntax(regex_syntax::Error), 29 | /// An error that occurs if too many patterns were given to the NFA 30 | /// compiler. 31 | TooManyPatterns { 32 | /// The number of patterns given, which exceeds the limit. 33 | given: usize, 34 | /// The limit on the number of patterns. 35 | limit: usize, 36 | }, 37 | /// An error that occurs if too many states are produced while building an NFA. 38 | TooManyStates { 39 | /// The minimum number of states that are desired, which exceeds the 40 | /// limit. 41 | given: usize, 42 | /// The limit on the number of states. 43 | limit: usize, 44 | }, 45 | /// An error that occurs when NFA compilation exceeds a configured heap 46 | /// limit. 47 | ExceededSizeLimit { 48 | /// The configured limit, in bytes. 49 | limit: usize, 50 | }, 51 | /// An error that occurs when an invalid capture group index is added to 52 | /// the NFA. An "invalid" index can be one that is too big (e.g., results 53 | /// in an integer overflow) or one that is discontinuous from previous 54 | /// capture group indices added. 55 | InvalidCaptureIndex { 56 | /// The invalid index that was given. 57 | index: usize, 58 | }, 59 | /// An error that occurs when an NFA contains a Unicode word boundary, but 60 | /// where the crate was compiled without the necessary data for dealing 61 | /// with Unicode word boundaries. 62 | UnicodeWordUnavailable, 63 | } 64 | 65 | impl Error { 66 | fn kind(&self) -> &ErrorKind { 67 | &self.kind 68 | } 69 | 70 | pub(crate) fn syntax(err: regex_syntax::Error) -> Error { 71 | Error { kind: ErrorKind::Syntax(err) } 72 | } 73 | 74 | pub(crate) fn too_many_patterns(given: usize) -> Error { 75 | let limit = PatternID::LIMIT; 76 | Error { kind: ErrorKind::TooManyPatterns { given, limit } } 77 | } 78 | 79 | pub(crate) fn too_many_states(given: usize) -> Error { 80 | let limit = StateID::LIMIT; 81 | Error { kind: ErrorKind::TooManyStates { given, limit } } 82 | } 83 | 84 | pub(crate) fn exceeded_size_limit(limit: usize) -> Error { 85 | Error { kind: ErrorKind::ExceededSizeLimit { limit } } 86 | } 87 | 88 | pub(crate) fn invalid_capture_index(index: usize) -> Error { 89 | Error { kind: ErrorKind::InvalidCaptureIndex { index } } 90 | } 91 | 92 | pub(crate) fn unicode_word_unavailable() -> Error { 93 | Error { kind: ErrorKind::UnicodeWordUnavailable } 94 | } 95 | } 96 | 97 | #[cfg(feature = "std")] 98 | impl std::error::Error for Error { 99 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 100 | match self.kind() { 101 | ErrorKind::Syntax(ref err) => Some(err), 102 | ErrorKind::TooManyPatterns { .. } => None, 103 | ErrorKind::TooManyStates { .. } => None, 104 | ErrorKind::ExceededSizeLimit { .. } => None, 105 | ErrorKind::InvalidCaptureIndex { ..
} => None, 106 | ErrorKind::UnicodeWordUnavailable => None, 107 | } 108 | } 109 | } 110 | 111 | impl core::fmt::Display for Error { 112 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 113 | match self.kind() { 114 | ErrorKind::Syntax(_) => write!(f, "error parsing regex"), 115 | ErrorKind::TooManyPatterns { given, limit } => write!( 116 | f, 117 | "attempted to compile {} patterns, \ 118 | which exceeds the limit of {}", 119 | given, limit, 120 | ), 121 | ErrorKind::TooManyStates { given, limit } => write!( 122 | f, 123 | "attempted to compile {} NFA states, \ 124 | which exceeds the limit of {}", 125 | given, limit, 126 | ), 127 | ErrorKind::ExceededSizeLimit { limit } => write!( 128 | f, 129 | "heap usage during NFA compilation exceeded limit of {}", 130 | limit, 131 | ), 132 | ErrorKind::InvalidCaptureIndex { index } => write!( 133 | f, 134 | "capture group index {} is invalid (too big or discontinuous)", 135 | index, 136 | ), 137 | ErrorKind::UnicodeWordUnavailable => write!( 138 | f, 139 | "crate has been compiled without Unicode word boundary \ 140 | support, but the NFA contains Unicode word boundary \ 141 | assertions", 142 | ), 143 | } 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/util/lazy.rs: -------------------------------------------------------------------------------- 1 | use core::{ 2 | cell::Cell, 3 | ptr, 4 | sync::atomic::{AtomicPtr, Ordering}, 5 | }; 6 | 7 | use alloc::{boxed::Box, vec::Vec}; 8 | 9 | #[inline(always)] 10 | pub(crate) fn get_or_init<T>( 11 | location: &'static AtomicPtr<T>, 12 | init: impl FnOnce() -> T, 13 | ) -> &'static T { 14 | let mut ptr = location.load(Ordering::Acquire); 15 | if ptr.is_null() { 16 | let new_dfa = Box::new(init()); 17 | ptr = Box::into_raw(new_dfa); 18 | let result = location.compare_exchange( 19 | ptr::null_mut(), 20 | ptr, 21 | Ordering::AcqRel, 22 | Ordering::Acquire, 23 | ); 24 | if let Err(old) = result { 25 | let redundant = unsafe { Box::from_raw(ptr) }; 26 | drop(redundant); 27 | ptr = old; 28 | } 29 | } 30 | unsafe { &*ptr } 31 | } 32 | -------------------------------------------------------------------------------- /src/util/sparse_set.rs: -------------------------------------------------------------------------------- 1 | use alloc::{boxed::Box, vec, vec::Vec}; 2 | 3 | use crate::util::id::StateID; 4 | 5 | /// A pair of sparse sets. 6 | /// 7 | /// This is useful when one needs to compute NFA epsilon closures from a 8 | /// previous set of states derived from an epsilon closure. One set can be the 9 | /// starting states whereas the other set can be the destination states after 10 | /// following the transitions for a particular byte of input. 11 | /// 12 | /// There is no significance to 'set1' or 'set2'. They are both sparse sets of 13 | /// the same size. 14 | /// 15 | /// The members of this struct are exposed so that callers may borrow 'set1' 16 | /// and 'set2' individually without being forced to borrow both at the same 17 | /// time. 18 | #[derive(Clone, Debug)] 19 | pub(crate) struct SparseSets { 20 | pub(crate) set1: SparseSet, 21 | pub(crate) set2: SparseSet, 22 | } 23 | 24 | impl SparseSets { 25 | /// Create a new pair of sparse sets where each set has the given capacity. 26 | /// 27 | /// This panics if the capacity given is bigger than `StateID::LIMIT`.
28 | pub(crate) fn new(capacity: usize) -> SparseSets { 29 | SparseSets { 30 | set1: SparseSet::new(capacity), 31 | set2: SparseSet::new(capacity), 32 | } 33 | } 34 | 35 | /// Resizes these sparse sets to have the new capacity given. 36 | /// 37 | /// The sets are automatically cleared. 38 | /// 39 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. 40 | #[inline] 41 | pub(crate) fn resize(&mut self, new_capacity: usize) { 42 | self.set1.resize(new_capacity); 43 | self.set2.resize(new_capacity); 44 | } 45 | 46 | /// Clear both sparse sets. 47 | pub(crate) fn clear(&mut self) { 48 | self.set1.clear(); 49 | self.set2.clear(); 50 | } 51 | 52 | /// Swap set1 with set2. 53 | pub(crate) fn swap(&mut self) { 54 | core::mem::swap(&mut self.set1, &mut self.set2); 55 | } 56 | 57 | /// Returns the memory usage, in bytes, used by this pair of sparse sets. 58 | pub(crate) fn memory_usage(&self) -> usize { 59 | self.set1.memory_usage() + self.set2.memory_usage() 60 | } 61 | } 62 | 63 | /// A sparse set used for representing ordered NFA states. 64 | /// 65 | /// This supports constant time addition and membership testing. Clearing an 66 | /// entire set can also be done in constant time. Iteration yields elements 67 | /// in the order in which they were inserted. 68 | /// 69 | /// The data structure is based on: https://research.swtch.com/sparse 70 | /// Note though that we don't actually use uninitialized memory. We generally 71 | /// reuse sparse sets, so the initial allocation cost is bearable. However, its 72 | /// other properties listed above are extremely useful. 73 | #[derive(Clone)] 74 | pub(crate) struct SparseSet { 75 | /// The number of elements currently in this set. 76 | len: usize, 77 | /// Dense contains the ids in the order in which they were inserted. 78 | dense: Vec<StateID>, 79 | /// Sparse maps ids to their location in dense. 80 | /// 81 | /// A state ID is in the set if and only if 82 | /// sparse[id] < dense.len() && id == dense[sparse[id]]. 83 | sparse: Vec<StateID>, 84 | } 85 | 86 | impl SparseSet { 87 | /// Create a new sparse set with the given capacity. 88 | /// 89 | /// Sparse sets have a fixed size and they cannot grow. Attempting to 90 | /// insert more distinct elements than the total capacity of the set will 91 | /// result in a panic. 92 | /// 93 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. 94 | #[inline] 95 | pub(crate) fn new(capacity: usize) -> SparseSet { 96 | let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; 97 | set.resize(capacity); 98 | set 99 | } 100 | 101 | /// Resizes this sparse set to have the new capacity given. 102 | /// 103 | /// This set is automatically cleared. 104 | /// 105 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. 106 | #[inline] 107 | pub(crate) fn resize(&mut self, new_capacity: usize) { 108 | assert!( 109 | new_capacity <= StateID::LIMIT, 110 | "sparse set capacity cannot exceed {:?}", 111 | StateID::LIMIT 112 | ); 113 | self.clear(); 114 | self.dense.resize(new_capacity, StateID::ZERO); 115 | self.sparse.resize(new_capacity, StateID::ZERO); 116 | } 117 | 118 | /// Returns the capacity of this set. 119 | /// 120 | /// The capacity represents a fixed limit on the number of distinct 121 | /// elements that are allowed in this set. The capacity cannot be changed. 122 | #[inline] 123 | pub(crate) fn capacity(&self) -> usize { 124 | self.dense.len() 125 | } 126 | 127 | /// Returns the number of elements in this set.
128 | #[inline] 129 | pub(crate) fn len(&self) -> usize { 130 | self.len 131 | } 132 | 133 | /// Returns true if and only if this set is empty. 134 | #[inline] 135 | pub(crate) fn is_empty(&self) -> bool { 136 | self.len() == 0 137 | } 138 | 139 | /// Insert the state ID value into this set and return true if the given 140 | /// state ID was not previously in this set. 141 | /// 142 | /// This operation is idempotent. If the given value is already in this 143 | /// set, then this is a no-op. 144 | /// 145 | /// If more than `capacity` ids are inserted, then this panics. 146 | /// 147 | /// This is marked as inline(always) since the compiler won't inline it 148 | /// otherwise, and it's a fairly hot piece of code in DFA determinization. 149 | #[inline(always)] 150 | pub(crate) fn insert(&mut self, value: StateID) -> bool { 151 | if self.contains(value) { 152 | return false; 153 | } 154 | 155 | let i = self.len(); 156 | assert!( 157 | i < self.capacity(), 158 | "{:?} exceeds capacity of {:?} when inserting {:?}", 159 | i, 160 | self.capacity(), 161 | value, 162 | ); 163 | // OK since i < self.capacity() and self.capacity() is guaranteed to 164 | // be <= StateID::LIMIT. 165 | let id = StateID::new_unchecked(i); 166 | self.dense[id] = value; 167 | self.sparse[value] = id; 168 | self.len += 1; 169 | true 170 | } 171 | 172 | /// Returns true if and only if this set contains the given value. 173 | #[inline] 174 | pub(crate) fn contains(&self, value: StateID) -> bool { 175 | let i = self.sparse[value]; 176 | i.as_usize() < self.len() && self.dense[i] == value 177 | } 178 | 179 | /// Returns the ith inserted element from this set. 180 | /// 181 | /// Panics when i >= self.len(). 182 | #[inline] 183 | pub(crate) fn get(&self, i: usize) -> StateID { 184 | self.dense[i] 185 | } 186 | 187 | /// Clear this set such that it has no members. 188 | #[inline] 189 | pub(crate) fn clear(&mut self) { 190 | self.len = 0; 191 | } 192 | 193 | /// Returns the heap memory usage, in bytes, used by this sparse set. 194 | #[inline] 195 | pub(crate) fn memory_usage(&self) -> usize { 196 | 2 * self.dense.len() * StateID::SIZE 197 | } 198 | } 199 | 200 | impl core::fmt::Debug for SparseSet { 201 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 202 | let elements: Vec<StateID> = self.into_iter().collect(); 203 | f.debug_tuple("SparseSet").field(&elements).finish() 204 | } 205 | } 206 | 207 | /// An iterator over all elements in a sparse set. 208 | /// 209 | /// The lifetime `'a` refers to the lifetime of the set being iterated over. 210 | #[derive(Debug)] 211 | pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); 212 | 213 | impl<'a> IntoIterator for &'a SparseSet { 214 | type Item = StateID; 215 | type IntoIter = SparseSetIter<'a>; 216 | 217 | fn into_iter(self) -> Self::IntoIter { 218 | SparseSetIter(self.dense[..self.len()].iter()) 219 | } 220 | } 221 | 222 | impl<'a> Iterator for SparseSetIter<'a> { 223 | type Item = StateID; 224 | 225 | #[inline(always)] 226 | fn next(&mut self) -> Option<StateID> { 227 | self.0.next().map(|value| *value) 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /src/util/start.rs: -------------------------------------------------------------------------------- 1 | /// Represents the four possible starting configurations of a DFA search. 2 | /// 3 | /// The starting configuration is determined by inspecting the beginning of 4 | /// the haystack (up to 1 byte).
Ultimately, this along with a pattern ID (if 5 | /// specified) is what selects the start state to use in a DFA. 6 | /// 7 | /// A DFA that doesn't have starting states for each pattern will have a 8 | /// maximum of four DFA start states. If the DFA was compiled with start 9 | /// states for each pattern, then it will have a maximum of four DFA start 10 | /// states for searching for any pattern, and then another maximum of four DFA 11 | /// start states for executing an anchored search for each pattern. 12 | /// 13 | /// This ends up being represented as a table in the DFA (whether lazy or fully 14 | /// built) where the stride of that table is 4, and each entry is an index into 15 | /// the state transition table. Note though that multiple entries in the table 16 | /// might point to the same state if the states would otherwise be equivalent. 17 | /// (This is guaranteed by DFA minimization and may even be accomplished by 18 | /// normal determinization, since it attempts to reuse equivalent states too.) 19 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] 20 | pub(crate) enum Start { 21 | /// This occurs when the starting position is not any of the ones below. 22 | NonWordByte = 0, 23 | /// This occurs when the byte immediately preceding the start of the search 24 | /// is an ASCII word byte. 25 | WordByte = 1, 26 | /// This occurs when the starting position of the search corresponds to the 27 | /// beginning of the haystack. 28 | Text = 2, 29 | /// This occurs when the byte immediately preceding the start of the search 30 | /// is a line terminator. Specifically, `\n`. 31 | Line = 3, 32 | } 33 | 34 | impl Start { 35 | /// Return the starting state corresponding to the given integer. If no 36 | /// starting state exists for the given integer, then None is returned. 37 | pub(crate) fn from_usize(n: usize) -> Option<Start> { 38 | match n { 39 | 0 => Some(Start::NonWordByte), 40 | 1 => Some(Start::WordByte), 41 | 2 => Some(Start::Text), 42 | 3 => Some(Start::Line), 43 | _ => None, 44 | } 45 | } 46 | 47 | /// Returns the total number of starting state configurations. 48 | pub(crate) fn count() -> usize { 49 | 4 50 | } 51 | 52 | /// Returns the starting state configuration for the given search 53 | /// parameters. If the given offset range is not valid, then this panics. 54 | #[inline(always)] 55 | pub(crate) fn from_position_fwd( 56 | bytes: &[u8], 57 | start: usize, 58 | end: usize, 59 | ) -> Start { 60 | assert!( 61 | bytes.get(start..end).is_some(), 62 | "{}..{} is invalid", 63 | start, 64 | end 65 | ); 66 | if start == 0 { 67 | Start::Text 68 | } else if bytes[start - 1] == b'\n' { 69 | Start::Line 70 | } else if crate::util::is_word_byte(bytes[start - 1]) { 71 | Start::WordByte 72 | } else { 73 | Start::NonWordByte 74 | } 75 | } 76 | 77 | /// Returns the starting state configuration for a reverse search with the 78 | /// given search parameters. If the given offset range is not valid, then 79 | /// this panics. 80 | #[inline(always)] 81 | pub(crate) fn from_position_rev( 82 | bytes: &[u8], 83 | start: usize, 84 | end: usize, 85 | ) -> Start { 86 | assert!( 87 | bytes.get(start..end).is_some(), 88 | "{}..{} is invalid", 89 | start, 90 | end 91 | ); 92 | if end == bytes.len() { 93 | Start::Text 94 | } else if bytes[end] == b'\n' { 95 | Start::Line 96 | } else if crate::util::is_word_byte(bytes[end]) { 97 | Start::WordByte 98 | } else { 99 | Start::NonWordByte 100 | } 101 | } 102 | 103 | /// Return this starting configuration as an integer.
It is guaranteed to 104 | /// be less than `Start::count()`. 105 | #[inline(always)] 106 | pub(crate) fn as_usize(&self) -> usize { 107 | *self as usize 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /tests/data/bytes.toml: -------------------------------------------------------------------------------- 1 | # These are tests specifically crafted for regexes that can match arbitrary 2 | # bytes. In some cases, we also test the Unicode variant as well, just because 3 | # it's good sense to do so. But also, these tests aren't really about Unicode, 4 | # but whether matches are only reported at valid UTF-8 boundaries. For most 5 | # tests in this entire collection, utf8 = true. But for these tests, we use 6 | # utf8 = false. 7 | 8 | [[tests]] 9 | name = "word-boundary-ascii" 10 | regex = ' \b' 11 | input = " δ" 12 | matches = [] 13 | unicode = false 14 | utf8 = false 15 | 16 | [[tests]] 17 | name = "word-boundary-unicode" 18 | regex = ' \b' 19 | input = " δ" 20 | matches = [[0, 1]] 21 | unicode = true 22 | utf8 = false 23 | 24 | [[tests]] 25 | name = "word-boundary-ascii-not" 26 | regex = ' \B' 27 | input = " δ" 28 | matches = [[0, 1]] 29 | unicode = false 30 | utf8 = false 31 | 32 | [[tests]] 33 | name = "word-boundary-unicode-not" 34 | regex = ' \B' 35 | input = " δ" 36 | matches = [] 37 | unicode = true 38 | utf8 = false 39 | 40 | [[tests]] 41 | name = "perl-word-ascii" 42 | regex = '\w+' 43 | input = "aδ" 44 | matches = [[0, 1]] 45 | unicode = false 46 | utf8 = false 47 | 48 | [[tests]] 49 | name = "perl-word-unicode" 50 | regex = '\w+' 51 | input = "aδ" 52 | matches = [[0, 3]] 53 | unicode = true 54 | utf8 = false 55 | 56 | [[tests]] 57 | name = "perl-decimal-ascii" 58 | regex = '\d+' 59 | input = "1२३9" 60 | matches = [[0, 1], [7, 8]] 61 | unicode = false 62 | utf8 = false 63 | 64 | [[tests]] 65 | name = "perl-decimal-unicode" 66 | regex = '\d+' 67 | input = "1२३9" 68 | matches = [[0, 8]] 69 | unicode = true 70 | utf8 = false 71 | 72 | [[tests]] 73 | name = "perl-whitespace-ascii" 74 | regex = '\s+' 75 | input = " \u1680" 76 | matches = [[0, 1]] 77 | unicode = false 78 | utf8 = false 79 | 80 | [[tests]] 81 | name = "perl-whitespace-unicode" 82 | regex = '\s+' 83 | input = " \u1680" 84 | matches = [[0, 4]] 85 | unicode = true 86 | utf8 = false 87 | 88 | # The first `(.+)` matches two Unicode codepoints, but can't match the 5th 89 | # byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and 90 | # matches. 
91 | [[tests]] 92 | name = "mixed-dot" 93 | regex = '(.+)(?-u)(.+)' 94 | input = '\xCE\x93\xCE\x94\xFF' 95 | captures = [ 96 | [[0, 5], [0, 4], [4, 5]], 97 | ] 98 | unescape = true 99 | unicode = true 100 | utf8 = false 101 | 102 | [[tests]] 103 | name = "case-one-ascii" 104 | regex = 'a' 105 | input = "A" 106 | matches = [[0, 1]] 107 | case_insensitive = true 108 | unicode = false 109 | utf8 = false 110 | 111 | [[tests]] 112 | name = "case-one-unicode" 113 | regex = 'a' 114 | input = "A" 115 | matches = [[0, 1]] 116 | case_insensitive = true 117 | unicode = true 118 | utf8 = false 119 | 120 | [[tests]] 121 | name = "case-class-simple-ascii" 122 | regex = '[a-z]+' 123 | input = "AaAaA" 124 | matches = [[0, 5]] 125 | case_insensitive = true 126 | unicode = false 127 | utf8 = false 128 | 129 | [[tests]] 130 | name = "case-class-ascii" 131 | regex = '[a-z]+' 132 | input = "aA\u212AaA" 133 | matches = [[0, 2], [5, 7]] 134 | case_insensitive = true 135 | unicode = false 136 | utf8 = false 137 | 138 | [[tests]] 139 | name = "case-class-unicode" 140 | regex = '[a-z]+' 141 | input = "aA\u212AaA" 142 | matches = [[0, 7]] 143 | case_insensitive = true 144 | unicode = true 145 | utf8 = false 146 | 147 | [[tests]] 148 | name = "negate-ascii" 149 | regex = '[^a]' 150 | input = "δ" 151 | matches = [[0, 1], [1, 2]] 152 | unicode = false 153 | utf8 = false 154 | 155 | [[tests]] 156 | name = "negate-unicode" 157 | regex = '[^a]' 158 | input = "δ" 159 | matches = [[0, 2]] 160 | unicode = true 161 | utf8 = false 162 | 163 | # When utf8=true, this won't match, because the implicit '.*?' prefix is 164 | # Unicode aware and will refuse to match through invalid UTF-8 bytes. 165 | [[tests]] 166 | name = "dotstar-prefix-ascii" 167 | regex = 'a' 168 | input = '\xFFa' 169 | matches = [[1, 2]] 170 | unescape = true 171 | unicode = false 172 | utf8 = false 173 | 174 | [[tests]] 175 | name = "dotstar-prefix-unicode" 176 | regex = 'a' 177 | input = '\xFFa' 178 | matches = [[1, 2]] 179 | unescape = true 180 | unicode = true 181 | utf8 = false 182 | 183 | [[tests]] 184 | name = "null-bytes" 185 | regex = '(?P[^\x00]+)\x00' 186 | input = 'foo\x00' 187 | captures = [ 188 | [[0, 4], [0, 3]], 189 | ] 190 | unescape = true 191 | unicode = false 192 | utf8 = false 193 | 194 | [[tests]] 195 | name = "invalid-utf8-anchor-100" 196 | regex = '\xCC?^' 197 | input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4' 198 | matches = [[0, 0]] 199 | unescape = true 200 | unicode = false 201 | utf8 = false 202 | 203 | [[tests]] 204 | name = "invalid-utf8-anchor-200" 205 | regex = '^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$' 206 | input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4' 207 | matches = [[22, 22]] 208 | unescape = true 209 | unicode = false 210 | utf8 = false 211 | 212 | [[tests]] 213 | name = "invalid-utf8-anchor-300" 214 | regex = '^|ddp\xff\xffdddddlQd@\x80' 215 | input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4' 216 | matches = [[0, 0]] 217 | unescape = true 218 | unicode = false 219 | utf8 = false 220 | 221 | [[tests]] 222 | name = "word-boundary-ascii-100" 223 | regex = '\Bx\B' 224 | input = "áxβ" 225 | matches = [] 226 | unicode = false 227 | utf8 = false 228 | 229 | [[tests]] 230 | name = "word-boundary-ascii-200" 231 | regex = '\B' 232 | input = "0\U0007EF5E" 233 | matches = [[2, 2], [3, 3], [4, 4], [5, 5]] 234 | unicode = false 235 | utf8 = false 236 | 
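Many of the tests above set `unescape = true`, which means the `input` string is run through the `\x`-style unescaper shown in regex-test/src/escape.rs before the search, so that invalid UTF-8 can be written in plain TOML. A small sketch of that convention (the `regex_test::escape::unescape` import path is an assumption about how the crate exports it):

```rust
use regex_test::escape::unescape; // assumed export path

fn main() {
    // `\xFF` becomes the single (invalid UTF-8) byte 0xFF, so the
    // dotstar-prefix tests above really do search the bytes [0xFF, b'a'].
    assert_eq!(unescape(r"\xFFa"), vec![0xFF, b'a']);

    // An incomplete trailing escape is passed through literally, matching
    // the `trailing_incomplete` unit test for the unescaper.
    assert_eq!(unescape(r"\xA"), b"\\xA".to_vec());
}
```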
-------------------------------------------------------------------------------- /tests/data/crazy.toml: -------------------------------------------------------------------------------- 1 | # TODO: There are still a couple of manually written tests in crazy.rs. 2 | 3 | [[tests]] 4 | name = "ranges" 5 | regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b' 6 | input = "num: 255" 7 | matches = [[5, 8]] 8 | 9 | [[tests]] 10 | name = "ranges-not" 11 | regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b' 12 | input = "num: 256" 13 | matches = [] 14 | 15 | [[tests]] 16 | name = "float1" 17 | regex = '[-+]?[0-9]*\.?[0-9]+' 18 | input = "0.1" 19 | matches = [[0, 3]] 20 | 21 | [[tests]] 22 | name = "float2" 23 | regex = '[-+]?[0-9]*\.?[0-9]+' 24 | input = "0.1.2" 25 | matches = [[0, 3]] 26 | match_limit = 1 27 | 28 | [[tests]] 29 | name = "float3" 30 | regex = '[-+]?[0-9]*\.?[0-9]+' 31 | input = "a1.2" 32 | matches = [[1, 4]] 33 | 34 | [[tests]] 35 | name = "float4" 36 | regex = '[-+]?[0-9]*\.?[0-9]+' 37 | input = "1.a" 38 | matches = [[0, 1]] 39 | 40 | [[tests]] 41 | name = "float5" 42 | regex = '^[-+]?[0-9]*\.?[0-9]+$' 43 | input = "1.a" 44 | matches = [] 45 | 46 | [[tests]] 47 | name = "email" 48 | regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b' 49 | input = "mine is jam.slam@gmail.com " 50 | matches = [[8, 26]] 51 | 52 | [[tests]] 53 | name = "email-not" 54 | regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b' 55 | input = "mine is jam.slam@gmail " 56 | matches = [] 57 | 58 | [[tests]] 59 | name = "email-big" 60 | regex = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''' 61 | input = "mine is jam.slam@gmail.com " 62 | matches = [[8, 26]] 63 | 64 | [[tests]] 65 | name = "date1" 66 | regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$' 67 | input = "1900-01-01" 68 | matches = [[0, 10]] 69 | 70 | [[tests]] 71 | name = "date2" 72 | regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$' 73 | input = "1900-00-01" 74 | matches = [] 75 | 76 | [[tests]] 77 | name = "date3" 78 | regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$' 79 | input = "1900-13-01" 80 | matches = [] 81 | 82 | [[tests]] 83 | name = "start-end-empty" 84 | regex = '^$' 85 | input = "" 86 | matches = [[0, 0]] 87 | 88 | [[tests]] 89 | name = "start-end-empty-rev" 90 | regex = '$^' 91 | input = "" 92 | matches = [[0, 0]] 93 | 94 | [[tests]] 95 | name = "start-end-empty-many-1" 96 | regex = '^$^$^$' 97 | input = "" 98 | matches = [[0, 0]] 99 | 100 | [[tests]] 101 | name = "start-end-empty-many-2" 102 | regex = '^^^$$$' 103 | input = "" 104 | matches = [[0, 0]] 105 | 106 | [[tests]] 107 | name = "start-end-empty-rep" 108 | regex = '(?:^$)*' 109 | input = "a\nb\nc" 110 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] 111 | 112 | [[tests]] 113 | name = "start-end-empty-rep-rev" 114 | regex = '(?:$^)*' 115 | input = "a\nb\nc" 116 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] 117 | 118 | [[tests]] 119 | name = "neg-class-letter" 120 | regex = '[^ac]' 121 | input = "acx" 122 | matches = [[2, 3]] 123 | 124 | [[tests]] 125 | name = "neg-class-letter-comma" 126 | regex = '[^a,]' 127 | input = "a,x" 128 | matches = [[2, 3]] 129 | 130 | [[tests]] 131 | name = "neg-class-letter-space" 132 | regex = '[^a[:space:]]' 133 | input = "a x" 134 | matches = [[2, 3]] 135 | 136 | [[tests]] 137 | name = 
"neg-class-comma" 138 | regex = '[^,]' 139 | input = ",,x" 140 | matches = [[2, 3]] 141 | 142 | [[tests]] 143 | name = "neg-class-space" 144 | regex = '[^[:space:]]' 145 | input = " a" 146 | matches = [[1, 2]] 147 | 148 | [[tests]] 149 | name = "neg-class-space-comma" 150 | regex = '[^,[:space:]]' 151 | input = ", a" 152 | matches = [[2, 3]] 153 | 154 | [[tests]] 155 | name = "neg-class-comma-space" 156 | regex = '[^[:space:],]' 157 | input = " ,a" 158 | matches = [[2, 3]] 159 | 160 | [[tests]] 161 | name = "neg-class-ascii" 162 | regex = '[^[:alpha:]Z]' 163 | input = "A1" 164 | matches = [[1, 2]] 165 | 166 | [[tests]] 167 | name = "lazy-many-many" 168 | regex = '((?:.*)*?)=' 169 | input = "a=b" 170 | matches = [[0, 2]] 171 | 172 | [[tests]] 173 | name = "lazy-many-optional" 174 | regex = '((?:.?)*?)=' 175 | input = "a=b" 176 | matches = [[0, 2]] 177 | 178 | [[tests]] 179 | name = "lazy-one-many-many" 180 | regex = '((?:.*)+?)=' 181 | input = "a=b" 182 | matches = [[0, 2]] 183 | 184 | [[tests]] 185 | name = "lazy-one-many-optional" 186 | regex = '((?:.?)+?)=' 187 | input = "a=b" 188 | matches = [[0, 2]] 189 | 190 | [[tests]] 191 | name = "lazy-range-min-many" 192 | regex = '((?:.*){1,}?)=' 193 | input = "a=b" 194 | matches = [[0, 2]] 195 | 196 | [[tests]] 197 | name = "lazy-range-many" 198 | regex = '((?:.*){1,2}?)=' 199 | input = "a=b" 200 | matches = [[0, 2]] 201 | 202 | [[tests]] 203 | name = "greedy-many-many" 204 | regex = '((?:.*)*)=' 205 | input = "a=b" 206 | matches = [[0, 2]] 207 | 208 | [[tests]] 209 | name = "greedy-many-optional" 210 | regex = '((?:.?)*)=' 211 | input = "a=b" 212 | matches = [[0, 2]] 213 | 214 | [[tests]] 215 | name = "greedy-one-many-many" 216 | regex = '((?:.*)+)=' 217 | input = "a=b" 218 | matches = [[0, 2]] 219 | 220 | [[tests]] 221 | name = "greedy-one-many-optional" 222 | regex = '((?:.?)+)=' 223 | input = "a=b" 224 | matches = [[0, 2]] 225 | 226 | [[tests]] 227 | name = "greedy-range-min-many" 228 | regex = '((?:.*){1,})=' 229 | input = "a=b" 230 | matches = [[0, 2]] 231 | 232 | [[tests]] 233 | name = "greedy-range-many" 234 | regex = '((?:.*){1,2})=' 235 | input = "a=b" 236 | matches = [[0, 2]] 237 | 238 | [[tests]] 239 | name = "empty1" 240 | regex = '' 241 | input = "" 242 | matches = [[0, 0]] 243 | 244 | [[tests]] 245 | name = "empty2" 246 | regex = '' 247 | input = "abc" 248 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 249 | 250 | [[tests]] 251 | name = "empty3" 252 | regex = '()' 253 | input = "abc" 254 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 255 | 256 | [[tests]] 257 | name = "empty4" 258 | regex = '()*' 259 | input = "abc" 260 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 261 | 262 | [[tests]] 263 | name = "empty5" 264 | regex = '()+' 265 | input = "abc" 266 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 267 | 268 | [[tests]] 269 | name = "empty6" 270 | regex = '()?' 
271 | input = "abc" 272 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 273 | 274 | [[tests]] 275 | name = "empty7" 276 | regex = '()()' 277 | input = "abc" 278 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 279 | 280 | [[tests]] 281 | name = "empty8" 282 | regex = '()+|z' 283 | input = "abc" 284 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 285 | 286 | [[tests]] 287 | name = "empty9" 288 | regex = 'z|()+' 289 | input = "abc" 290 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 291 | 292 | [[tests]] 293 | name = "empty10" 294 | regex = '()+|b' 295 | input = "abc" 296 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 297 | 298 | [[tests]] 299 | name = "empty11" 300 | regex = 'b|()+' 301 | input = "abc" 302 | matches = [[0, 0], [1, 2], [3, 3]] 303 | -------------------------------------------------------------------------------- /tests/data/earliest.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "no-greedy-100" 3 | regex = 'a+' 4 | input = "aaa" 5 | matches = [[0, 1], [1, 2], [2, 3]] 6 | search_kind = "earliest" 7 | 8 | [[tests]] 9 | name = "no-greedy-200" 10 | regex = 'abc+' 11 | input = "zzzabccc" 12 | matches = [[3, 6]] 13 | search_kind = "earliest" 14 | 15 | [[tests]] 16 | name = "is-ungreedy" 17 | regex = 'a+?' 18 | input = "aaa" 19 | matches = [[0, 1], [1, 2], [2, 3]] 20 | search_kind = "earliest" 21 | 22 | [[tests]] 23 | name = "look-start-test" 24 | regex = '^(abc|a)' 25 | input = "abc" 26 | matches = [[0, 1]] 27 | search_kind = "earliest" 28 | 29 | [[tests]] 30 | name = "look-end-test" 31 | regex = '(abc|a)$' 32 | input = "abc" 33 | matches = [[0, 3]] 34 | search_kind = "earliest" 35 | 36 | [[tests]] 37 | name = "no-leftmost-first-100" 38 | regex = 'abc|a' 39 | input = "abc" 40 | matches = [[0, 1]] 41 | search_kind = "earliest" 42 | 43 | [[tests]] 44 | name = "no-leftmost-first-200" 45 | regex = 'aba|a' 46 | input = "aba" 47 | matches = [[0, 1], [2, 3]] 48 | search_kind = "earliest" 49 | -------------------------------------------------------------------------------- /tests/data/empty.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "100" 3 | regex = "|b" 4 | input = "abc" 5 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 6 | 7 | [[tests]] 8 | name = "110" 9 | regex = "b|" 10 | input = "abc" 11 | matches = [[0, 0], [1, 2], [3, 3]] 12 | 13 | [[tests]] 14 | name = "120" 15 | regex = "|z" 16 | input = "abc" 17 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 18 | 19 | [[tests]] 20 | name = "130" 21 | regex = "z|" 22 | input = "abc" 23 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 24 | 25 | [[tests]] 26 | name = "200" 27 | regex = "|" 28 | input = "abc" 29 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 30 | 31 | [[tests]] 32 | name = "210" 33 | regex = "||" 34 | input = "abc" 35 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 36 | 37 | [[tests]] 38 | name = "220" 39 | regex = "||b" 40 | input = "abc" 41 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 42 | 43 | [[tests]] 44 | name = "230" 45 | regex = "b||" 46 | input = "abc" 47 | matches = [[0, 0], [1, 2], [3, 3]] 48 | 49 | [[tests]] 50 | name = "240" 51 | regex = "||z" 52 | input = "abc" 53 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 54 | 55 | [[tests]] 56 | name = "300" 57 | regex = "(?:)|b" 58 | input = "abc" 59 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 60 | 61 | [[tests]] 62 | name = "310" 63 | regex = "b|(?:)" 64 | input = "abc" 65 | matches = [[0, 0], [1, 2], [3, 3]] 66 | 67 | [[tests]] 68 | name = "320" 69 | regex = "(?:|)" 70 | 
input = "abc" 71 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 72 | 73 | [[tests]] 74 | name = "330" 75 | regex = "(?:|)|z" 76 | input = "abc" 77 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 78 | 79 | [[tests]] 80 | name = "400" 81 | regex = "a(?:)|b" 82 | input = "abc" 83 | matches = [[0, 1], [1, 2]] 84 | 85 | [[tests]] 86 | name = "500" 87 | regex = "" 88 | input = "" 89 | matches = [[0, 0]] 90 | 91 | [[tests]] 92 | name = "510" 93 | regex = "" 94 | input = "a" 95 | matches = [[0, 0], [1, 1]] 96 | 97 | [[tests]] 98 | name = "520" 99 | regex = "" 100 | input = "abc" 101 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 102 | 103 | [[tests]] 104 | name = "600" 105 | regex = '(|a)*' 106 | input = "aaa" 107 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 108 | 109 | [[tests]] 110 | name = "610" 111 | regex = '(|a)+' 112 | input = "aaa" 113 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 114 | -------------------------------------------------------------------------------- /tests/data/expensive.toml: -------------------------------------------------------------------------------- 1 | # These represent tests that may be expensive to run on some regex engines. For 2 | # example, tests that build a full DFA ahead of time and minimize it can take a 3 | # horrendously long time on regexes that are large (or result in an explosion 4 | # in the number of states). We group these tests together so that such engines 5 | # can simply skip these tests. 6 | 7 | # See: https://github.com/rust-lang/regex/issues/98 8 | [[tests]] 9 | name = "regression-many-repeat-no-stack-overflow" 10 | regex = '^.{1,2500}' 11 | input = "a" 12 | matches = [[0, 1]] 13 | -------------------------------------------------------------------------------- /tests/data/flags.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "1" 3 | regex = "(?i)abc" 4 | input = "ABC" 5 | matches = [[0, 3]] 6 | 7 | [[tests]] 8 | name = "2" 9 | regex = "(?i)a(?-i)bc" 10 | input = "Abc" 11 | matches = [[0, 3]] 12 | 13 | [[tests]] 14 | name = "3" 15 | regex = "(?i)a(?-i)bc" 16 | input = "ABC" 17 | matches = [] 18 | 19 | [[tests]] 20 | name = "4" 21 | regex = "(?is)a." 22 | input = "A\n" 23 | matches = [[0, 2]] 24 | 25 | [[tests]] 26 | name = "5" 27 | regex = "(?is)a.(?-is)a." 28 | input = "A\nab" 29 | matches = [[0, 4]] 30 | 31 | [[tests]] 32 | name = "6" 33 | regex = "(?is)a.(?-is)a." 34 | input = "A\na\n" 35 | matches = [] 36 | 37 | [[tests]] 38 | name = "7" 39 | regex = "(?is)a.(?-is:a.)?" 40 | input = "A\na\n" 41 | matches = [[0, 2]] 42 | match_limit = 1 43 | 44 | [[tests]] 45 | name = "8" 46 | regex = "(?U)a+" 47 | input = "aa" 48 | matches = [[0, 1]] 49 | match_limit = 1 50 | 51 | [[tests]] 52 | name = "9" 53 | regex = "(?U)a+?" 54 | input = "aa" 55 | matches = [[0, 2]] 56 | 57 | [[tests]] 58 | name = "10" 59 | regex = "(?U)(?-U)a+" 60 | input = "aa" 61 | matches = [[0, 2]] 62 | 63 | [[tests]] 64 | name = "11" 65 | regex = '(?m)(?:^\d+$\n?)+' 66 | input = "123\n456\n789" 67 | matches = [[0, 11]] 68 | -------------------------------------------------------------------------------- /tests/data/fowler/dat/README: -------------------------------------------------------------------------------- 1 | Test data was taken from the Go distribution, which was in turn taken from the 2 | testregex test suite: 3 | 4 | http://www2.research.att.com/~astopen/testregex/testregex.html 5 | 6 | Unfortunately, the above link is now dead, but the test data lives on. 
7 | 8 | The LICENSE in this directory corresponds to the LICENSE that the data was 9 | originally released under. 10 | 11 | The tests themselves were modified for RE2/Go. A couple were modified further 12 | by me (Andrew Gallant), but only in repetition.dat, so that RE2/Go would pass 13 | them. (Yes, it seems like RE2/Go includes failing test cases.) This may or may 14 | not have been a bad idea, but I think being consistent with an established 15 | regex library is worth something. 16 | 17 | After some number of years, these tests were transformed into a TOML format 18 | using the fowler-to-toml script in the 'scripts' directory. To re-generate the 19 | TOML files, run the following from the root of this repository: 20 | 21 | ./scripts/fowler-to-toml tests/data/fowler tests/data/fowler/dat/*.dat 22 | 23 | which brings them into a sensible structured format in which other tests can 24 | be written. 25 | -------------------------------------------------------------------------------- /tests/data/fowler/dat/nullsubexpr.dat: -------------------------------------------------------------------------------- 1 | NOTE null subexpression matches : 2002-06-06 2 | 3 | E (a*)* a (0,1)(0,1) 4 | #E SAME x (0,0)(0,0) 5 | E SAME x (0,0)(?,?) RE2/Go 6 | E SAME aaaaaa (0,6)(0,6) 7 | E SAME aaaaaax (0,6)(0,6) 8 | E (a*)+ a (0,1)(0,1) 9 | E SAME x (0,0)(0,0) 10 | E SAME aaaaaa (0,6)(0,6) 11 | E SAME aaaaaax (0,6)(0,6) 12 | E (a+)* a (0,1)(0,1) 13 | E SAME x (0,0) 14 | E SAME aaaaaa (0,6)(0,6) 15 | E SAME aaaaaax (0,6)(0,6) 16 | E (a+)+ a (0,1)(0,1) 17 | E SAME x NOMATCH 18 | E SAME aaaaaa (0,6)(0,6) 19 | E SAME aaaaaax (0,6)(0,6) 20 | 21 | E ([a]*)* a (0,1)(0,1) 22 | #E SAME x (0,0)(0,0) 23 | E SAME x (0,0)(?,?) RE2/Go 24 | E SAME aaaaaa (0,6)(0,6) 25 | E SAME aaaaaax (0,6)(0,6) 26 | E ([a]*)+ a (0,1)(0,1) 27 | E SAME x (0,0)(0,0) 28 | E SAME aaaaaa (0,6)(0,6) 29 | E SAME aaaaaax (0,6)(0,6) 30 | E ([^b]*)* a (0,1)(0,1) 31 | #E SAME b (0,0)(0,0) 32 | E SAME b (0,0)(?,?) RE2/Go 33 | E SAME aaaaaa (0,6)(0,6) 34 | E SAME aaaaaab (0,6)(0,6) 35 | E ([ab]*)* a (0,1)(0,1) 36 | E SAME aaaaaa (0,6)(0,6) 37 | E SAME ababab (0,6)(0,6) 38 | E SAME bababa (0,6)(0,6) 39 | E SAME b (0,1)(0,1) 40 | E SAME bbbbbb (0,6)(0,6) 41 | E SAME aaaabcde (0,5)(0,5) 42 | E ([^a]*)* b (0,1)(0,1) 43 | E SAME bbbbbb (0,6)(0,6) 44 | #E SAME aaaaaa (0,0)(0,0) 45 | E SAME aaaaaa (0,0)(?,?) RE2/Go 46 | E ([^ab]*)* ccccxx (0,6)(0,6) 47 | #E SAME ababab (0,0)(0,0) 48 | E SAME ababab (0,0)(?,?) RE2/Go 49 | 50 | E ((z)+|a)* zabcde (0,2)(1,2) 51 | 52 | #{E a+? aaaaaa (0,1) no *? +? minimal match ops 53 | #E (a) aaa (0,1)(0,1) 54 | #E (a*?) aaa (0,0)(0,0) 55 | #E (a)*? aaa (0,0) 56 | #E (a*?)*?
aaa (0,0) 57 | #} 58 | 59 | B \(a*\)*\(x\) x (0,1)(0,0)(0,1) 60 | B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) 61 | B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) 62 | B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) 63 | B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) 64 | B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) 65 | B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) 66 | B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) 67 | 68 | #E (a*)*(x) x (0,1)(0,0)(0,1) 69 | E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go 70 | E (a*)*(x) ax (0,2)(0,1)(1,2) 71 | E (a*)*(x) axa (0,2)(0,1)(1,2) 72 | 73 | E (a*)+(x) x (0,1)(0,0)(0,1) 74 | E (a*)+(x) ax (0,2)(0,1)(1,2) 75 | E (a*)+(x) axa (0,2)(0,1)(1,2) 76 | 77 | E (a*){2}(x) x (0,1)(0,0)(0,1) 78 | E (a*){2}(x) ax (0,2)(1,1)(1,2) 79 | E (a*){2}(x) axa (0,2)(1,1)(1,2) 80 | -------------------------------------------------------------------------------- /tests/data/fowler/dat/repetition-expensive.dat: -------------------------------------------------------------------------------- 1 | NOTE implicit vs. explicit repetitions : 2009-02-02 2 | 3 | # Glenn Fowler 4 | # conforming matches (column 4) must match one of the following BREs 5 | # NOMATCH 6 | # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* 7 | # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* 8 | # i.e., each 3-tuple has two identical elements and one (?,?) 9 | 10 | NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 11 | 12 | :HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) 13 | :HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) 14 | :HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) 15 | :HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) 16 | :HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) 17 | :HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) 18 | :HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) 19 | :HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) 20 | :HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) 21 | #:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) 22 | :HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go 23 | #:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) 24 | :HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go 25 | #:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) 26 | :HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go 27 | #:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) 28 | :HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go 29 | #:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) 30 | :HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go 31 | #:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) 32 | :HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go 33 | #:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) 34 | :HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go 35 | #:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) 36 | :HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go 37 | :HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) 38 | 39 | # These test a fixed bug in my regex-tdfa that did not keep the expanded 40 | # form properly grouped, so right association did the wrong thing with 41 | # these ambiguous patterns (crafted just to test my code when I became 42 | # suspicious of my implementation). The first subexpression should use 43 | # "ab" then "a" then "bcd". 44 | 45 | # OS X / FreeBSD / NetBSD badly fail many of these, with impossible 46 | # results like (0,6)(4,5)(6,6). 
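# Note on the expectations below: under the leftmost-first match semantics # used by RE2/Go (and this crate), branch order inside an alternation matters. # On 'ababcd', (a|ab|c|bcd)* commits to 'a' at offset 0, no branch matches at # offset 1, and the overall match stops at (0,1). The reordered (ab|a|c|bcd)* # instead consumes 'ab', 'ab', 'c' and reaches (0,6).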
47 | 48 | :HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) 49 | :HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) 50 | :HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 51 | :HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 52 | :HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH 53 | :HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) 54 | :HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) 55 | :HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 56 | :HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 57 | :HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH 58 | :HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) 59 | :HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) 60 | 61 | # The above worked on Linux/GLIBC but the following often fail. 62 | # They also trip up OS X / FreeBSD / NetBSD: 63 | 64 | #:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) 65 | :HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 66 | #:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) 67 | :HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 68 | #:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 69 | :HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 70 | #:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 71 | :HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 72 | :HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH 73 | #:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) 74 | :HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 75 | #:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) 76 | :HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 77 | #:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 78 | :HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 79 | #:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 80 | :HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 81 | :HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH 82 | #:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) 83 | :HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 84 | #:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) 85 | :HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 86 | -------------------------------------------------------------------------------- /tests/data/fowler/dat/repetition.dat: -------------------------------------------------------------------------------- 1 | NOTE implicit vs. explicit repetitions : 2009-02-02 2 | 3 | # Glenn Fowler 4 | # conforming matches (column 4) must match one of the following BREs 5 | # NOMATCH 6 | # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* 7 | # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* 8 | # i.e., each 3-tuple has two identical elements and one (?,?) 9 | 10 | E ((..)|(.)) NULL NOMATCH 11 | E ((..)|(.))((..)|(.)) NULL NOMATCH 12 | E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH 13 | 14 | E ((..)|(.)){1} NULL NOMATCH 15 | E ((..)|(.)){2} NULL NOMATCH 16 | E ((..)|(.)){3} NULL NOMATCH 17 | 18 | E ((..)|(.))* NULL (0,0) 19 | 20 | E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) 21 | E ((..)|(.))((..)|(.)) a NOMATCH 22 | E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH 23 | 24 | E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) 25 | E ((..)|(.)){2} a NOMATCH 26 | E ((..)|(.)){3} a NOMATCH 27 | 28 | E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) 29 | 30 | E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) 31 | E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) 32 | E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH 33 | 34 | E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) 
35 | E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) 36 | E ((..)|(.)){3} aa NOMATCH 37 | 38 | E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) 39 | 40 | E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) 41 | E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) 42 | E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) 43 | 44 | E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) 45 | #E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) 46 | E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 47 | E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) 48 | 49 | #E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) 50 | E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 51 | 52 | E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) 53 | E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 54 | E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) 55 | 56 | E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) 57 | E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) 58 | #E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) 59 | E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go 60 | 61 | E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) 62 | 63 | E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) 64 | E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 65 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) 66 | 67 | E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) 68 | E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) 69 | #E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) 70 | E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 71 | 72 | #E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) 73 | E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 74 | 75 | E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) 76 | E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 77 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) 78 | 79 | E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) 80 | E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) 81 | E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) 82 | 83 | E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 84 | -------------------------------------------------------------------------------- /tests/data/fowler/nullsubexpr.toml: -------------------------------------------------------------------------------- 1 | # !!! DO NOT EDIT !!! 2 | # Automatically generated by scripts/fowler-to-toml. 3 | # Numbers in the test names correspond to the line number of the test from 4 | # the original dat file.
5 | 6 | [[tests]] 7 | name = "nullsubexpr3" 8 | regex = '''(a*)*''' 9 | input = '''a''' 10 | captures = [[[0, 1], [0, 1]]] 11 | match_limit = 1 12 | unescape = true 13 | 14 | [[tests]] 15 | name = "nullsubexpr5" 16 | regex = '''(a*)*''' 17 | input = '''x''' 18 | captures = [[[0, 0], []]] 19 | match_limit = 1 20 | unescape = true 21 | 22 | [[tests]] 23 | name = "nullsubexpr6" 24 | regex = '''(a*)*''' 25 | input = '''aaaaaa''' 26 | captures = [[[0, 6], [0, 6]]] 27 | match_limit = 1 28 | unescape = true 29 | 30 | [[tests]] 31 | name = "nullsubexpr7" 32 | regex = '''(a*)*''' 33 | input = '''aaaaaax''' 34 | captures = [[[0, 6], [0, 6]]] 35 | match_limit = 1 36 | unescape = true 37 | 38 | [[tests]] 39 | name = "nullsubexpr8" 40 | regex = '''(a*)+''' 41 | input = '''a''' 42 | captures = [[[0, 1], [0, 1]]] 43 | match_limit = 1 44 | unescape = true 45 | 46 | [[tests]] 47 | name = "nullsubexpr9" 48 | regex = '''(a*)+''' 49 | input = '''x''' 50 | captures = [[[0, 0], [0, 0]]] 51 | match_limit = 1 52 | unescape = true 53 | 54 | [[tests]] 55 | name = "nullsubexpr10" 56 | regex = '''(a*)+''' 57 | input = '''aaaaaa''' 58 | captures = [[[0, 6], [0, 6]]] 59 | match_limit = 1 60 | unescape = true 61 | 62 | [[tests]] 63 | name = "nullsubexpr11" 64 | regex = '''(a*)+''' 65 | input = '''aaaaaax''' 66 | captures = [[[0, 6], [0, 6]]] 67 | match_limit = 1 68 | unescape = true 69 | 70 | [[tests]] 71 | name = "nullsubexpr12" 72 | regex = '''(a+)*''' 73 | input = '''a''' 74 | captures = [[[0, 1], [0, 1]]] 75 | match_limit = 1 76 | unescape = true 77 | 78 | [[tests]] 79 | name = "nullsubexpr13" 80 | regex = '''(a+)*''' 81 | input = '''x''' 82 | captures = [[[0, 0]]] 83 | match_limit = 1 84 | unescape = true 85 | 86 | [[tests]] 87 | name = "nullsubexpr14" 88 | regex = '''(a+)*''' 89 | input = '''aaaaaa''' 90 | captures = [[[0, 6], [0, 6]]] 91 | match_limit = 1 92 | unescape = true 93 | 94 | [[tests]] 95 | name = "nullsubexpr15" 96 | regex = '''(a+)*''' 97 | input = '''aaaaaax''' 98 | captures = [[[0, 6], [0, 6]]] 99 | match_limit = 1 100 | unescape = true 101 | 102 | [[tests]] 103 | name = "nullsubexpr16" 104 | regex = '''(a+)+''' 105 | input = '''a''' 106 | captures = [[[0, 1], [0, 1]]] 107 | match_limit = 1 108 | unescape = true 109 | 110 | [[tests]] 111 | name = "nullsubexpr17" 112 | regex = '''(a+)+''' 113 | input = '''x''' 114 | captures = [] 115 | match_limit = 1 116 | unescape = true 117 | 118 | [[tests]] 119 | name = "nullsubexpr18" 120 | regex = '''(a+)+''' 121 | input = '''aaaaaa''' 122 | captures = [[[0, 6], [0, 6]]] 123 | match_limit = 1 124 | unescape = true 125 | 126 | [[tests]] 127 | name = "nullsubexpr19" 128 | regex = '''(a+)+''' 129 | input = '''aaaaaax''' 130 | captures = [[[0, 6], [0, 6]]] 131 | match_limit = 1 132 | unescape = true 133 | 134 | [[tests]] 135 | name = "nullsubexpr21" 136 | regex = '''([a]*)*''' 137 | input = '''a''' 138 | captures = [[[0, 1], [0, 1]]] 139 | match_limit = 1 140 | unescape = true 141 | 142 | [[tests]] 143 | name = "nullsubexpr23" 144 | regex = '''([a]*)*''' 145 | input = '''x''' 146 | captures = [[[0, 0], []]] 147 | match_limit = 1 148 | unescape = true 149 | 150 | [[tests]] 151 | name = "nullsubexpr24" 152 | regex = '''([a]*)*''' 153 | input = '''aaaaaa''' 154 | captures = [[[0, 6], [0, 6]]] 155 | match_limit = 1 156 | unescape = true 157 | 158 | [[tests]] 159 | name = "nullsubexpr25" 160 | regex = '''([a]*)*''' 161 | input = '''aaaaaax''' 162 | captures = [[[0, 6], [0, 6]]] 163 | match_limit = 1 164 | unescape = true 165 | 166 | [[tests]] 167 | name = 
"nullsubexpr26" 168 | regex = '''([a]*)+''' 169 | input = '''a''' 170 | captures = [[[0, 1], [0, 1]]] 171 | match_limit = 1 172 | unescape = true 173 | 174 | [[tests]] 175 | name = "nullsubexpr27" 176 | regex = '''([a]*)+''' 177 | input = '''x''' 178 | captures = [[[0, 0], [0, 0]]] 179 | match_limit = 1 180 | unescape = true 181 | 182 | [[tests]] 183 | name = "nullsubexpr28" 184 | regex = '''([a]*)+''' 185 | input = '''aaaaaa''' 186 | captures = [[[0, 6], [0, 6]]] 187 | match_limit = 1 188 | unescape = true 189 | 190 | [[tests]] 191 | name = "nullsubexpr29" 192 | regex = '''([a]*)+''' 193 | input = '''aaaaaax''' 194 | captures = [[[0, 6], [0, 6]]] 195 | match_limit = 1 196 | unescape = true 197 | 198 | [[tests]] 199 | name = "nullsubexpr30" 200 | regex = '''([^b]*)*''' 201 | input = '''a''' 202 | captures = [[[0, 1], [0, 1]]] 203 | match_limit = 1 204 | unescape = true 205 | 206 | [[tests]] 207 | name = "nullsubexpr32" 208 | regex = '''([^b]*)*''' 209 | input = '''b''' 210 | captures = [[[0, 0], []]] 211 | match_limit = 1 212 | unescape = true 213 | 214 | [[tests]] 215 | name = "nullsubexpr33" 216 | regex = '''([^b]*)*''' 217 | input = '''aaaaaa''' 218 | captures = [[[0, 6], [0, 6]]] 219 | match_limit = 1 220 | unescape = true 221 | 222 | [[tests]] 223 | name = "nullsubexpr34" 224 | regex = '''([^b]*)*''' 225 | input = '''aaaaaab''' 226 | captures = [[[0, 6], [0, 6]]] 227 | match_limit = 1 228 | unescape = true 229 | 230 | [[tests]] 231 | name = "nullsubexpr35" 232 | regex = '''([ab]*)*''' 233 | input = '''a''' 234 | captures = [[[0, 1], [0, 1]]] 235 | match_limit = 1 236 | unescape = true 237 | 238 | [[tests]] 239 | name = "nullsubexpr36" 240 | regex = '''([ab]*)*''' 241 | input = '''aaaaaa''' 242 | captures = [[[0, 6], [0, 6]]] 243 | match_limit = 1 244 | unescape = true 245 | 246 | [[tests]] 247 | name = "nullsubexpr37" 248 | regex = '''([ab]*)*''' 249 | input = '''ababab''' 250 | captures = [[[0, 6], [0, 6]]] 251 | match_limit = 1 252 | unescape = true 253 | 254 | [[tests]] 255 | name = "nullsubexpr38" 256 | regex = '''([ab]*)*''' 257 | input = '''bababa''' 258 | captures = [[[0, 6], [0, 6]]] 259 | match_limit = 1 260 | unescape = true 261 | 262 | [[tests]] 263 | name = "nullsubexpr39" 264 | regex = '''([ab]*)*''' 265 | input = '''b''' 266 | captures = [[[0, 1], [0, 1]]] 267 | match_limit = 1 268 | unescape = true 269 | 270 | [[tests]] 271 | name = "nullsubexpr40" 272 | regex = '''([ab]*)*''' 273 | input = '''bbbbbb''' 274 | captures = [[[0, 6], [0, 6]]] 275 | match_limit = 1 276 | unescape = true 277 | 278 | [[tests]] 279 | name = "nullsubexpr41" 280 | regex = '''([ab]*)*''' 281 | input = '''aaaabcde''' 282 | captures = [[[0, 5], [0, 5]]] 283 | match_limit = 1 284 | unescape = true 285 | 286 | [[tests]] 287 | name = "nullsubexpr42" 288 | regex = '''([^a]*)*''' 289 | input = '''b''' 290 | captures = [[[0, 1], [0, 1]]] 291 | match_limit = 1 292 | unescape = true 293 | 294 | [[tests]] 295 | name = "nullsubexpr43" 296 | regex = '''([^a]*)*''' 297 | input = '''bbbbbb''' 298 | captures = [[[0, 6], [0, 6]]] 299 | match_limit = 1 300 | unescape = true 301 | 302 | [[tests]] 303 | name = "nullsubexpr45" 304 | regex = '''([^a]*)*''' 305 | input = '''aaaaaa''' 306 | captures = [[[0, 0], []]] 307 | match_limit = 1 308 | unescape = true 309 | 310 | [[tests]] 311 | name = "nullsubexpr46" 312 | regex = '''([^ab]*)*''' 313 | input = '''ccccxx''' 314 | captures = [[[0, 6], [0, 6]]] 315 | match_limit = 1 316 | unescape = true 317 | 318 | [[tests]] 319 | name = "nullsubexpr48" 320 | regex = 
'''([^ab]*)*''' 321 | input = '''ababab''' 322 | captures = [[[0, 0], []]] 323 | match_limit = 1 324 | unescape = true 325 | 326 | [[tests]] 327 | name = "nullsubexpr50" 328 | regex = '''((z)+|a)*''' 329 | input = '''zabcde''' 330 | captures = [[[0, 2], [1, 2]]] 331 | match_limit = 1 332 | unescape = true 333 | 334 | [[tests]] 335 | name = "nullsubexpr69" 336 | regex = '''(a*)*(x)''' 337 | input = '''x''' 338 | captures = [[[0, 1], [], [0, 1]]] 339 | match_limit = 1 340 | unescape = true 341 | 342 | [[tests]] 343 | name = "nullsubexpr70" 344 | regex = '''(a*)*(x)''' 345 | input = '''ax''' 346 | captures = [[[0, 2], [0, 1], [1, 2]]] 347 | match_limit = 1 348 | unescape = true 349 | 350 | [[tests]] 351 | name = "nullsubexpr71" 352 | regex = '''(a*)*(x)''' 353 | input = '''axa''' 354 | captures = [[[0, 2], [0, 1], [1, 2]]] 355 | match_limit = 1 356 | unescape = true 357 | 358 | [[tests]] 359 | name = "nullsubexpr73" 360 | regex = '''(a*)+(x)''' 361 | input = '''x''' 362 | captures = [[[0, 1], [0, 0], [0, 1]]] 363 | match_limit = 1 364 | unescape = true 365 | 366 | [[tests]] 367 | name = "nullsubexpr74" 368 | regex = '''(a*)+(x)''' 369 | input = '''ax''' 370 | captures = [[[0, 2], [0, 1], [1, 2]]] 371 | match_limit = 1 372 | unescape = true 373 | 374 | [[tests]] 375 | name = "nullsubexpr75" 376 | regex = '''(a*)+(x)''' 377 | input = '''axa''' 378 | captures = [[[0, 2], [0, 1], [1, 2]]] 379 | match_limit = 1 380 | unescape = true 381 | 382 | [[tests]] 383 | name = "nullsubexpr77" 384 | regex = '''(a*){2}(x)''' 385 | input = '''x''' 386 | captures = [[[0, 1], [0, 0], [0, 1]]] 387 | match_limit = 1 388 | unescape = true 389 | 390 | [[tests]] 391 | name = "nullsubexpr78" 392 | regex = '''(a*){2}(x)''' 393 | input = '''ax''' 394 | captures = [[[0, 2], [1, 1], [1, 2]]] 395 | match_limit = 1 396 | unescape = true 397 | 398 | [[tests]] 399 | name = "nullsubexpr79" 400 | regex = '''(a*){2}(x)''' 401 | input = '''axa''' 402 | captures = [[[0, 2], [1, 1], [1, 2]]] 403 | match_limit = 1 404 | unescape = true 405 | 406 | -------------------------------------------------------------------------------- /tests/data/fowler/repetition-expensive.toml: -------------------------------------------------------------------------------- 1 | # !!! DO NOT EDIT !!! 2 | # Automatically generated by scripts/fowler-to-toml. 3 | # Numbers in the test names correspond to the line number of the test from 4 | # the original dat file.
5 | 6 | [[tests]] 7 | name = "repetition-expensive12" 8 | regex = '''X(.?){0,}Y''' 9 | input = '''X1234567Y''' 10 | captures = [[[0, 9], [7, 8]]] 11 | match_limit = 1 12 | unescape = true 13 | 14 | [[tests]] 15 | name = "repetition-expensive13" 16 | regex = '''X(.?){1,}Y''' 17 | input = '''X1234567Y''' 18 | captures = [[[0, 9], [7, 8]]] 19 | match_limit = 1 20 | unescape = true 21 | 22 | [[tests]] 23 | name = "repetition-expensive14" 24 | regex = '''X(.?){2,}Y''' 25 | input = '''X1234567Y''' 26 | captures = [[[0, 9], [7, 8]]] 27 | match_limit = 1 28 | unescape = true 29 | 30 | [[tests]] 31 | name = "repetition-expensive15" 32 | regex = '''X(.?){3,}Y''' 33 | input = '''X1234567Y''' 34 | captures = [[[0, 9], [7, 8]]] 35 | match_limit = 1 36 | unescape = true 37 | 38 | [[tests]] 39 | name = "repetition-expensive16" 40 | regex = '''X(.?){4,}Y''' 41 | input = '''X1234567Y''' 42 | captures = [[[0, 9], [7, 8]]] 43 | match_limit = 1 44 | unescape = true 45 | 46 | [[tests]] 47 | name = "repetition-expensive17" 48 | regex = '''X(.?){5,}Y''' 49 | input = '''X1234567Y''' 50 | captures = [[[0, 9], [7, 8]]] 51 | match_limit = 1 52 | unescape = true 53 | 54 | [[tests]] 55 | name = "repetition-expensive18" 56 | regex = '''X(.?){6,}Y''' 57 | input = '''X1234567Y''' 58 | captures = [[[0, 9], [7, 8]]] 59 | match_limit = 1 60 | unescape = true 61 | 62 | [[tests]] 63 | name = "repetition-expensive19" 64 | regex = '''X(.?){7,}Y''' 65 | input = '''X1234567Y''' 66 | captures = [[[0, 9], [7, 8]]] 67 | match_limit = 1 68 | unescape = true 69 | 70 | [[tests]] 71 | name = "repetition-expensive20" 72 | regex = '''X(.?){8,}Y''' 73 | input = '''X1234567Y''' 74 | captures = [[[0, 9], [8, 8]]] 75 | match_limit = 1 76 | unescape = true 77 | 78 | [[tests]] 79 | name = "repetition-expensive22" 80 | regex = '''X(.?){0,8}Y''' 81 | input = '''X1234567Y''' 82 | captures = [[[0, 9], [8, 8]]] 83 | match_limit = 1 84 | unescape = true 85 | 86 | [[tests]] 87 | name = "repetition-expensive24" 88 | regex = '''X(.?){1,8}Y''' 89 | input = '''X1234567Y''' 90 | captures = [[[0, 9], [8, 8]]] 91 | match_limit = 1 92 | unescape = true 93 | 94 | [[tests]] 95 | name = "repetition-expensive26" 96 | regex = '''X(.?){2,8}Y''' 97 | input = '''X1234567Y''' 98 | captures = [[[0, 9], [8, 8]]] 99 | match_limit = 1 100 | unescape = true 101 | 102 | [[tests]] 103 | name = "repetition-expensive28" 104 | regex = '''X(.?){3,8}Y''' 105 | input = '''X1234567Y''' 106 | captures = [[[0, 9], [8, 8]]] 107 | match_limit = 1 108 | unescape = true 109 | 110 | [[tests]] 111 | name = "repetition-expensive30" 112 | regex = '''X(.?){4,8}Y''' 113 | input = '''X1234567Y''' 114 | captures = [[[0, 9], [8, 8]]] 115 | match_limit = 1 116 | unescape = true 117 | 118 | [[tests]] 119 | name = "repetition-expensive32" 120 | regex = '''X(.?){5,8}Y''' 121 | input = '''X1234567Y''' 122 | captures = [[[0, 9], [8, 8]]] 123 | match_limit = 1 124 | unescape = true 125 | 126 | [[tests]] 127 | name = "repetition-expensive34" 128 | regex = '''X(.?){6,8}Y''' 129 | input = '''X1234567Y''' 130 | captures = [[[0, 9], [8, 8]]] 131 | match_limit = 1 132 | unescape = true 133 | 134 | [[tests]] 135 | name = "repetition-expensive36" 136 | regex = '''X(.?){7,8}Y''' 137 | input = '''X1234567Y''' 138 | captures = [[[0, 9], [8, 8]]] 139 | match_limit = 1 140 | unescape = true 141 | 142 | [[tests]] 143 | name = "repetition-expensive37" 144 | regex = '''X(.?){8,8}Y''' 145 | input = '''X1234567Y''' 146 | captures = [[[0, 9], [8, 8]]] 147 | match_limit = 1 148 | unescape = true 149 | 150 | [[tests]] 151 | 
name = "repetition-expensive48" 152 | regex = '''(a|ab|c|bcd){0,}(d*)''' 153 | input = '''ababcd''' 154 | captures = [[[0, 1], [0, 1], [1, 1]]] 155 | match_limit = 1 156 | unescape = true 157 | 158 | [[tests]] 159 | name = "repetition-expensive49" 160 | regex = '''(a|ab|c|bcd){1,}(d*)''' 161 | input = '''ababcd''' 162 | captures = [[[0, 1], [0, 1], [1, 1]]] 163 | match_limit = 1 164 | unescape = true 165 | 166 | [[tests]] 167 | name = "repetition-expensive50" 168 | regex = '''(a|ab|c|bcd){2,}(d*)''' 169 | input = '''ababcd''' 170 | captures = [[[0, 6], [3, 6], [6, 6]]] 171 | match_limit = 1 172 | unescape = true 173 | 174 | [[tests]] 175 | name = "repetition-expensive51" 176 | regex = '''(a|ab|c|bcd){3,}(d*)''' 177 | input = '''ababcd''' 178 | captures = [[[0, 6], [3, 6], [6, 6]]] 179 | match_limit = 1 180 | unescape = true 181 | 182 | [[tests]] 183 | name = "repetition-expensive52" 184 | regex = '''(a|ab|c|bcd){4,}(d*)''' 185 | input = '''ababcd''' 186 | captures = [] 187 | match_limit = 1 188 | unescape = true 189 | 190 | [[tests]] 191 | name = "repetition-expensive53" 192 | regex = '''(a|ab|c|bcd){0,10}(d*)''' 193 | input = '''ababcd''' 194 | captures = [[[0, 1], [0, 1], [1, 1]]] 195 | match_limit = 1 196 | unescape = true 197 | 198 | [[tests]] 199 | name = "repetition-expensive54" 200 | regex = '''(a|ab|c|bcd){1,10}(d*)''' 201 | input = '''ababcd''' 202 | captures = [[[0, 1], [0, 1], [1, 1]]] 203 | match_limit = 1 204 | unescape = true 205 | 206 | [[tests]] 207 | name = "repetition-expensive55" 208 | regex = '''(a|ab|c|bcd){2,10}(d*)''' 209 | input = '''ababcd''' 210 | captures = [[[0, 6], [3, 6], [6, 6]]] 211 | match_limit = 1 212 | unescape = true 213 | 214 | [[tests]] 215 | name = "repetition-expensive56" 216 | regex = '''(a|ab|c|bcd){3,10}(d*)''' 217 | input = '''ababcd''' 218 | captures = [[[0, 6], [3, 6], [6, 6]]] 219 | match_limit = 1 220 | unescape = true 221 | 222 | [[tests]] 223 | name = "repetition-expensive57" 224 | regex = '''(a|ab|c|bcd){4,10}(d*)''' 225 | input = '''ababcd''' 226 | captures = [] 227 | match_limit = 1 228 | unescape = true 229 | 230 | [[tests]] 231 | name = "repetition-expensive58" 232 | regex = '''(a|ab|c|bcd)*(d*)''' 233 | input = '''ababcd''' 234 | captures = [[[0, 1], [0, 1], [1, 1]]] 235 | match_limit = 1 236 | unescape = true 237 | 238 | [[tests]] 239 | name = "repetition-expensive59" 240 | regex = '''(a|ab|c|bcd)+(d*)''' 241 | input = '''ababcd''' 242 | captures = [[[0, 1], [0, 1], [1, 1]]] 243 | match_limit = 1 244 | unescape = true 245 | 246 | [[tests]] 247 | name = "repetition-expensive65" 248 | regex = '''(ab|a|c|bcd){0,}(d*)''' 249 | input = '''ababcd''' 250 | captures = [[[0, 6], [4, 5], [5, 6]]] 251 | match_limit = 1 252 | unescape = true 253 | 254 | [[tests]] 255 | name = "repetition-expensive67" 256 | regex = '''(ab|a|c|bcd){1,}(d*)''' 257 | input = '''ababcd''' 258 | captures = [[[0, 6], [4, 5], [5, 6]]] 259 | match_limit = 1 260 | unescape = true 261 | 262 | [[tests]] 263 | name = "repetition-expensive69" 264 | regex = '''(ab|a|c|bcd){2,}(d*)''' 265 | input = '''ababcd''' 266 | captures = [[[0, 6], [4, 5], [5, 6]]] 267 | match_limit = 1 268 | unescape = true 269 | 270 | [[tests]] 271 | name = "repetition-expensive71" 272 | regex = '''(ab|a|c|bcd){3,}(d*)''' 273 | input = '''ababcd''' 274 | captures = [[[0, 6], [4, 5], [5, 6]]] 275 | match_limit = 1 276 | unescape = true 277 | 278 | [[tests]] 279 | name = "repetition-expensive72" 280 | regex = '''(ab|a|c|bcd){4,}(d*)''' 281 | input = '''ababcd''' 282 | captures = [] 283 | match_limit = 1 
284 | unescape = true 285 | 286 | [[tests]] 287 | name = "repetition-expensive74" 288 | regex = '''(ab|a|c|bcd){0,10}(d*)''' 289 | input = '''ababcd''' 290 | captures = [[[0, 6], [4, 5], [5, 6]]] 291 | match_limit = 1 292 | unescape = true 293 | 294 | [[tests]] 295 | name = "repetition-expensive76" 296 | regex = '''(ab|a|c|bcd){1,10}(d*)''' 297 | input = '''ababcd''' 298 | captures = [[[0, 6], [4, 5], [5, 6]]] 299 | match_limit = 1 300 | unescape = true 301 | 302 | [[tests]] 303 | name = "repetition-expensive78" 304 | regex = '''(ab|a|c|bcd){2,10}(d*)''' 305 | input = '''ababcd''' 306 | captures = [[[0, 6], [4, 5], [5, 6]]] 307 | match_limit = 1 308 | unescape = true 309 | 310 | [[tests]] 311 | name = "repetition-expensive80" 312 | regex = '''(ab|a|c|bcd){3,10}(d*)''' 313 | input = '''ababcd''' 314 | captures = [[[0, 6], [4, 5], [5, 6]]] 315 | match_limit = 1 316 | unescape = true 317 | 318 | [[tests]] 319 | name = "repetition-expensive81" 320 | regex = '''(ab|a|c|bcd){4,10}(d*)''' 321 | input = '''ababcd''' 322 | captures = [] 323 | match_limit = 1 324 | unescape = true 325 | 326 | [[tests]] 327 | name = "repetition-expensive83" 328 | regex = '''(ab|a|c|bcd)*(d*)''' 329 | input = '''ababcd''' 330 | captures = [[[0, 6], [4, 5], [5, 6]]] 331 | match_limit = 1 332 | unescape = true 333 | 334 | [[tests]] 335 | name = "repetition-expensive85" 336 | regex = '''(ab|a|c|bcd)+(d*)''' 337 | input = '''ababcd''' 338 | captures = [[[0, 6], [4, 5], [5, 6]]] 339 | match_limit = 1 340 | unescape = true 341 | 342 | -------------------------------------------------------------------------------- /tests/data/fowler/repetition-long.toml: -------------------------------------------------------------------------------- 1 | # !!! DO NOT EDIT !!! 2 | # Automatically generated by scripts/fowler-to-toml. 3 | # Numbers in the test names correspond to the line number of the test from 4 | # the original dat file.
5 | 6 | [[tests]] 7 | name = "repetition-long12" 8 | regex = '''X(.?){0,}Y''' 9 | input = '''X1234567Y''' 10 | captures = [[[0, 9], [7, 8]]] 11 | match_limit = 1 12 | unescape = true 13 | 14 | [[tests]] 15 | name = "repetition-long13" 16 | regex = '''X(.?){1,}Y''' 17 | input = '''X1234567Y''' 18 | captures = [[[0, 9], [7, 8]]] 19 | match_limit = 1 20 | unescape = true 21 | 22 | [[tests]] 23 | name = "repetition-long14" 24 | regex = '''X(.?){2,}Y''' 25 | input = '''X1234567Y''' 26 | captures = [[[0, 9], [7, 8]]] 27 | match_limit = 1 28 | unescape = true 29 | 30 | [[tests]] 31 | name = "repetition-long15" 32 | regex = '''X(.?){3,}Y''' 33 | input = '''X1234567Y''' 34 | captures = [[[0, 9], [7, 8]]] 35 | match_limit = 1 36 | unescape = true 37 | 38 | [[tests]] 39 | name = "repetition-long16" 40 | regex = '''X(.?){4,}Y''' 41 | input = '''X1234567Y''' 42 | captures = [[[0, 9], [7, 8]]] 43 | match_limit = 1 44 | unescape = true 45 | 46 | [[tests]] 47 | name = "repetition-long17" 48 | regex = '''X(.?){5,}Y''' 49 | input = '''X1234567Y''' 50 | captures = [[[0, 9], [7, 8]]] 51 | match_limit = 1 52 | unescape = true 53 | 54 | [[tests]] 55 | name = "repetition-long18" 56 | regex = '''X(.?){6,}Y''' 57 | input = '''X1234567Y''' 58 | captures = [[[0, 9], [7, 8]]] 59 | match_limit = 1 60 | unescape = true 61 | 62 | [[tests]] 63 | name = "repetition-long19" 64 | regex = '''X(.?){7,}Y''' 65 | input = '''X1234567Y''' 66 | captures = [[[0, 9], [7, 8]]] 67 | match_limit = 1 68 | unescape = true 69 | 70 | [[tests]] 71 | name = "repetition-long20" 72 | regex = '''X(.?){8,}Y''' 73 | input = '''X1234567Y''' 74 | captures = [[[0, 9], [8, 8]]] 75 | match_limit = 1 76 | unescape = true 77 | 78 | [[tests]] 79 | name = "repetition-long22" 80 | regex = '''X(.?){0,8}Y''' 81 | input = '''X1234567Y''' 82 | captures = [[[0, 9], [8, 8]]] 83 | match_limit = 1 84 | unescape = true 85 | 86 | [[tests]] 87 | name = "repetition-long24" 88 | regex = '''X(.?){1,8}Y''' 89 | input = '''X1234567Y''' 90 | captures = [[[0, 9], [8, 8]]] 91 | match_limit = 1 92 | unescape = true 93 | 94 | [[tests]] 95 | name = "repetition-long26" 96 | regex = '''X(.?){2,8}Y''' 97 | input = '''X1234567Y''' 98 | captures = [[[0, 9], [8, 8]]] 99 | match_limit = 1 100 | unescape = true 101 | 102 | [[tests]] 103 | name = "repetition-long28" 104 | regex = '''X(.?){3,8}Y''' 105 | input = '''X1234567Y''' 106 | captures = [[[0, 9], [8, 8]]] 107 | match_limit = 1 108 | unescape = true 109 | 110 | [[tests]] 111 | name = "repetition-long30" 112 | regex = '''X(.?){4,8}Y''' 113 | input = '''X1234567Y''' 114 | captures = [[[0, 9], [8, 8]]] 115 | match_limit = 1 116 | unescape = true 117 | 118 | [[tests]] 119 | name = "repetition-long32" 120 | regex = '''X(.?){5,8}Y''' 121 | input = '''X1234567Y''' 122 | captures = [[[0, 9], [8, 8]]] 123 | match_limit = 1 124 | unescape = true 125 | 126 | [[tests]] 127 | name = "repetition-long34" 128 | regex = '''X(.?){6,8}Y''' 129 | input = '''X1234567Y''' 130 | captures = [[[0, 9], [8, 8]]] 131 | match_limit = 1 132 | unescape = true 133 | 134 | [[tests]] 135 | name = "repetition-long36" 136 | regex = '''X(.?){7,8}Y''' 137 | input = '''X1234567Y''' 138 | captures = [[[0, 9], [8, 8]]] 139 | match_limit = 1 140 | unescape = true 141 | 142 | [[tests]] 143 | name = "repetition-long37" 144 | regex = '''X(.?){8,8}Y''' 145 | input = '''X1234567Y''' 146 | captures = [[[0, 9], [8, 8]]] 147 | match_limit = 1 148 | unescape = true 149 | 150 | [[tests]] 151 | name = "repetition-long48" 152 | regex = '''(a|ab|c|bcd){0,}(d*)''' 153 | input = 
'''ababcd''' 154 | captures = [[[0, 1], [0, 1], [1, 1]]] 155 | match_limit = 1 156 | unescape = true 157 | 158 | [[tests]] 159 | name = "repetition-long49" 160 | regex = '''(a|ab|c|bcd){1,}(d*)''' 161 | input = '''ababcd''' 162 | captures = [[[0, 1], [0, 1], [1, 1]]] 163 | match_limit = 1 164 | unescape = true 165 | 166 | [[tests]] 167 | name = "repetition-long50" 168 | regex = '''(a|ab|c|bcd){2,}(d*)''' 169 | input = '''ababcd''' 170 | captures = [[[0, 6], [3, 6], [6, 6]]] 171 | match_limit = 1 172 | unescape = true 173 | 174 | [[tests]] 175 | name = "repetition-long51" 176 | regex = '''(a|ab|c|bcd){3,}(d*)''' 177 | input = '''ababcd''' 178 | captures = [[[0, 6], [3, 6], [6, 6]]] 179 | match_limit = 1 180 | unescape = true 181 | 182 | [[tests]] 183 | name = "repetition-long52" 184 | regex = '''(a|ab|c|bcd){4,}(d*)''' 185 | input = '''ababcd''' 186 | captures = [] 187 | match_limit = 1 188 | unescape = true 189 | 190 | [[tests]] 191 | name = "repetition-long53" 192 | regex = '''(a|ab|c|bcd){0,10}(d*)''' 193 | input = '''ababcd''' 194 | captures = [[[0, 1], [0, 1], [1, 1]]] 195 | match_limit = 1 196 | unescape = true 197 | 198 | [[tests]] 199 | name = "repetition-long54" 200 | regex = '''(a|ab|c|bcd){1,10}(d*)''' 201 | input = '''ababcd''' 202 | captures = [[[0, 1], [0, 1], [1, 1]]] 203 | match_limit = 1 204 | unescape = true 205 | 206 | [[tests]] 207 | name = "repetition-long55" 208 | regex = '''(a|ab|c|bcd){2,10}(d*)''' 209 | input = '''ababcd''' 210 | captures = [[[0, 6], [3, 6], [6, 6]]] 211 | match_limit = 1 212 | unescape = true 213 | 214 | [[tests]] 215 | name = "repetition-long56" 216 | regex = '''(a|ab|c|bcd){3,10}(d*)''' 217 | input = '''ababcd''' 218 | captures = [[[0, 6], [3, 6], [6, 6]]] 219 | match_limit = 1 220 | unescape = true 221 | 222 | [[tests]] 223 | name = "repetition-long57" 224 | regex = '''(a|ab|c|bcd){4,10}(d*)''' 225 | input = '''ababcd''' 226 | captures = [] 227 | match_limit = 1 228 | unescape = true 229 | 230 | [[tests]] 231 | name = "repetition-long58" 232 | regex = '''(a|ab|c|bcd)*(d*)''' 233 | input = '''ababcd''' 234 | captures = [[[0, 1], [0, 1], [1, 1]]] 235 | match_limit = 1 236 | unescape = true 237 | 238 | [[tests]] 239 | name = "repetition-long59" 240 | regex = '''(a|ab|c|bcd)+(d*)''' 241 | input = '''ababcd''' 242 | captures = [[[0, 1], [0, 1], [1, 1]]] 243 | match_limit = 1 244 | unescape = true 245 | 246 | [[tests]] 247 | name = "repetition-long65" 248 | regex = '''(ab|a|c|bcd){0,}(d*)''' 249 | input = '''ababcd''' 250 | captures = [[[0, 6], [4, 5], [5, 6]]] 251 | match_limit = 1 252 | unescape = true 253 | 254 | [[tests]] 255 | name = "repetition-long67" 256 | regex = '''(ab|a|c|bcd){1,}(d*)''' 257 | input = '''ababcd''' 258 | captures = [[[0, 6], [4, 5], [5, 6]]] 259 | match_limit = 1 260 | unescape = true 261 | 262 | [[tests]] 263 | name = "repetition-long69" 264 | regex = '''(ab|a|c|bcd){2,}(d*)''' 265 | input = '''ababcd''' 266 | captures = [[[0, 6], [4, 5], [5, 6]]] 267 | match_limit = 1 268 | unescape = true 269 | 270 | [[tests]] 271 | name = "repetition-long71" 272 | regex = '''(ab|a|c|bcd){3,}(d*)''' 273 | input = '''ababcd''' 274 | captures = [[[0, 6], [4, 5], [5, 6]]] 275 | match_limit = 1 276 | unescape = true 277 | 278 | [[tests]] 279 | name = "repetition-long72" 280 | regex = '''(ab|a|c|bcd){4,}(d*)''' 281 | input = '''ababcd''' 282 | captures = [] 283 | match_limit = 1 284 | unescape = true 285 | 286 | [[tests]] 287 | name = "repetition-long74" 288 | regex = '''(ab|a|c|bcd){0,10}(d*)''' 289 | input = '''ababcd''' 290 | captures = 
[[[0, 6], [4, 5], [5, 6]]] 291 | match_limit = 1 292 | unescape = true 293 | 294 | [[tests]] 295 | name = "repetition-long76" 296 | regex = '''(ab|a|c|bcd){1,10}(d*)''' 297 | input = '''ababcd''' 298 | captures = [[[0, 6], [4, 5], [5, 6]]] 299 | match_limit = 1 300 | unescape = true 301 | 302 | [[tests]] 303 | name = "repetition-long78" 304 | regex = '''(ab|a|c|bcd){2,10}(d*)''' 305 | input = '''ababcd''' 306 | captures = [[[0, 6], [4, 5], [5, 6]]] 307 | match_limit = 1 308 | unescape = true 309 | 310 | [[tests]] 311 | name = "repetition-long80" 312 | regex = '''(ab|a|c|bcd){3,10}(d*)''' 313 | input = '''ababcd''' 314 | captures = [[[0, 6], [4, 5], [5, 6]]] 315 | match_limit = 1 316 | unescape = true 317 | 318 | [[tests]] 319 | name = "repetition-long81" 320 | regex = '''(ab|a|c|bcd){4,10}(d*)''' 321 | input = '''ababcd''' 322 | captures = [] 323 | match_limit = 1 324 | unescape = true 325 | 326 | [[tests]] 327 | name = "repetition-long83" 328 | regex = '''(ab|a|c|bcd)*(d*)''' 329 | input = '''ababcd''' 330 | captures = [[[0, 6], [4, 5], [5, 6]]] 331 | match_limit = 1 332 | unescape = true 333 | 334 | [[tests]] 335 | name = "repetition-long85" 336 | regex = '''(ab|a|c|bcd)+(d*)''' 337 | input = '''ababcd''' 338 | captures = [[[0, 6], [4, 5], [5, 6]]] 339 | match_limit = 1 340 | unescape = true 341 | 342 | -------------------------------------------------------------------------------- /tests/data/iter.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "1" 3 | regex = "a" 4 | input = "aaa" 5 | matches = [[0, 1], [1, 2], [2, 3]] 6 | 7 | [[tests]] 8 | name = "2" 9 | regex = "a" 10 | input = "aba" 11 | matches = [[0, 1], [2, 3]] 12 | 13 | [[tests]] 14 | name = "empty1" 15 | regex = '' 16 | input = '' 17 | matches = [[0, 0]] 18 | 19 | [[tests]] 20 | name = "empty2" 21 | regex = '' 22 | input = 'abc' 23 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 24 | 25 | [[tests]] 26 | name = "empty3" 27 | regex = '()' 28 | input = 'abc' 29 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 30 | 31 | [[tests]] 32 | name = "empty4" 33 | regex = '()*' 34 | input = 'abc' 35 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 36 | 37 | [[tests]] 38 | name = "empty5" 39 | regex = '()+' 40 | input = 'abc' 41 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 42 | 43 | [[tests]] 44 | name = "empty6" 45 | regex = '()?' 46 | input = 'abc' 47 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 48 | 49 | [[tests]] 50 | name = "empty7" 51 | regex = '()()' 52 | input = 'abc' 53 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 54 | 55 | [[tests]] 56 | name = "empty8" 57 | regex = '()+|z' 58 | input = 'abc' 59 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 60 | 61 | [[tests]] 62 | name = "empty9" 63 | regex = 'z|()+' 64 | input = 'abc' 65 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 66 | 67 | [[tests]] 68 | name = "empty10" 69 | regex = '()+|b' 70 | input = 'abc' 71 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 72 | 73 | [[tests]] 74 | name = "empty11" 75 | regex = 'b|()+' 76 | input = 'abc' 77 | matches = [[0, 0], [1, 2], [3, 3]] 78 | 79 | [[tests]] 80 | name = "start1" 81 | regex = "^a" 82 | input = "a" 83 | matches = [[0, 1]] 84 | 85 | [[tests]] 86 | name = "start2" 87 | regex = "^a" 88 | input = "aa" 89 | matches = [[0, 1]] 90 | 91 | [[tests]] 92 | name = "anchored1" 93 | regex = "a" 94 | input = "a" 95 | matches = [[0, 1]] 96 | anchored = true 97 | 98 | # This test is pretty subtle. It demonstrates the crucial difference between 99 | # '^a' and 'a' compiled in 'anchored' mode. 
The former regex exclusively 100 | # matches at the start of a haystack and nowhere else. The latter regex has 101 | # no such restriction, but its automaton is constructed such that it lacks a 102 | # `.*?` prefix. So it can actually produce matches at multiple locations. 103 | # The anchored3 test drives this point home. 104 | [[tests]] 105 | name = "anchored2" 106 | regex = "a" 107 | input = "aa" 108 | matches = [[0, 1], [1, 2]] 109 | anchored = true 110 | 111 | # Unlike anchored2, this test stops matching anything after it sees `b` 112 | # since it lacks a `.*?` prefix. Because it is looking for 'a' but sees 'b', it 113 | # determines that there are no remaining matches. 114 | [[tests]] 115 | name = "anchored3" 116 | regex = "a" 117 | input = "aaba" 118 | matches = [[0, 1], [1, 2]] 119 | anchored = true 120 | -------------------------------------------------------------------------------- /tests/data/misc.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "ascii-literal" 3 | regex = "a" 4 | input = "a" 5 | matches = [[0, 1]] 6 | 7 | [[tests]] 8 | name = "ascii-literal-not" 9 | regex = "a" 10 | input = "z" 11 | matches = [] 12 | 13 | [[tests]] 14 | name = "ascii-literal-anchored" 15 | regex = "a" 16 | input = "a" 17 | matches = [[0, 1]] 18 | anchored = true 19 | 20 | [[tests]] 21 | name = "ascii-literal-anchored-not" 22 | regex = "a" 23 | input = "z" 24 | matches = [] 25 | anchored = true 26 | 27 | [[tests]] 28 | name = "anchor-start-end-line" 29 | regex = '(?m)^bar$' 30 | input = "foo\nbar\nbaz" 31 | matches = [[4, 7]] 32 | 33 | [[tests]] 34 | name = "prefix-literal-match" 35 | regex = '^abc' 36 | input = "abc" 37 | matches = [[0, 3]] 38 | 39 | [[tests]] 40 | name = "prefix-literal-match-ascii" 41 | regex = '^abc' 42 | input = "abc" 43 | matches = [[0, 3]] 44 | unicode = false 45 | utf8 = false 46 | 47 | [[tests]] 48 | name = "prefix-literal-no-match" 49 | regex = '^abc' 50 | input = "zabc" 51 | matches = [] 52 | 53 | [[tests]] 54 | name = "one-literal-edge" 55 | regex = 'abc' 56 | input = "xxxxxab" 57 | matches = [] 58 | 59 | [[tests]] 60 | name = "terminates" 61 | regex = 'a$' 62 | input = "a" 63 | matches = [[0, 1]] 64 | 65 | [[tests]] 66 | name = "suffix-100" 67 | regex = '.*abcd' 68 | input = "abcd" 69 | matches = [[0, 4]] 70 | 71 | [[tests]] 72 | name = "suffix-200" 73 | regex = '.*(?:abcd)+' 74 | input = "abcd" 75 | matches = [[0, 4]] 76 | 77 | [[tests]] 78 | name = "suffix-300" 79 | regex = '.*(?:abcd)+' 80 | input = "abcdabcd" 81 | matches = [[0, 8]] 82 | 83 | [[tests]] 84 | name = "suffix-400" 85 | regex = '.*(?:abcd)+' 86 | input = "abcdxabcd" 87 | matches = [[0, 9]] 88 | 89 | [[tests]] 90 | name = "suffix-500" 91 | regex = '.*x(?:abcd)+' 92 | input = "abcdxabcd" 93 | matches = [[0, 9]] 94 | 95 | [[tests]] 96 | name = "suffix-600" 97 | regex = '[^abcd]*x(?:abcd)+' 98 | input = "abcdxabcd" 99 | matches = [[4, 9]] 100 | -------------------------------------------------------------------------------- /tests/data/multiline.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "basic1" 3 | regex = '(?m)^[a-z]+$' 4 | input = "abc\ndef\nxyz" 5 | matches = [[0, 3], [4, 7], [8, 11]] 6 | 7 | [[tests]] 8 | name = "basic2" 9 | regex = '(?m)^$' 10 | input = "abc\ndef\nxyz" 11 | matches = [] 12 | 13 | [[tests]] 14 | name = "basic3" 15 | regex = '(?m)^' 16 | input = "abc\ndef\nxyz" 17 | matches = [[0, 0], [4, 4], [8, 8]] 18 | 19 | [[tests]] 20 | name = "basic4" 21 | regex
= '(?m)$' 22 | input = "abc\ndef\nxyz" 23 | matches = [[3, 3], [7, 7], [11, 11]] 24 | 25 | [[tests]] 26 | name = "basic5" 27 | regex = '(?m)^[a-z]' 28 | input = "abc\ndef\nxyz" 29 | matches = [[0, 1], [4, 5], [8, 9]] 30 | 31 | [[tests]] 32 | name = "basic6" 33 | regex = '(?m)[a-z]^' 34 | input = "abc\ndef\nxyz" 35 | matches = [] 36 | 37 | [[tests]] 38 | name = "basic7" 39 | regex = '(?m)[a-z]$' 40 | input = "abc\ndef\nxyz" 41 | matches = [[2, 3], [6, 7], [10, 11]] 42 | 43 | [[tests]] 44 | name = "basic8" 45 | regex = '(?m)$[a-z]' 46 | input = "abc\ndef\nxyz" 47 | matches = [] 48 | 49 | [[tests]] 50 | name = "basic9" 51 | regex = '(?m)^$' 52 | input = "" 53 | matches = [[0, 0]] 54 | 55 | [[tests]] 56 | name = "repeat1" 57 | regex = '(?m)(?:^$)*' 58 | input = "a\nb\nc" 59 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] 60 | 61 | [[tests]] 62 | name = "repeat1-no-multi" 63 | regex = '(?:^$)*' 64 | input = "a\nb\nc" 65 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] 66 | 67 | [[tests]] 68 | name = "repeat2" 69 | regex = '(?m)(?:^|a)+' 70 | input = "a\naaa\n" 71 | matches = [[0, 0], [2, 2], [3, 5], [6, 6]] 72 | 73 | [[tests]] 74 | name = "repeat100" 75 | regex = '(?m)(?:^|a)+' 76 | input = "a\naaa\n" 77 | matches = [[0, 0], [2, 2], [3, 5], [6, 6]] 78 | 79 | [[tests]] 80 | name = "repeat2-no-multi" 81 | regex = '(?:^|a)+' 82 | input = "a\naaa\n" 83 | matches = [[0, 0], [2, 5]] 84 | 85 | [[tests]] 86 | name = "repeat3" 87 | regex = '(?m)(?:^|a)*' 88 | input = "a\naaa\n" 89 | matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]] 90 | 91 | [[tests]] 92 | name = "repeat3-no-multi" 93 | regex = '(?:^|a)*' 94 | input = "a\naaa\n" 95 | matches = [[0, 0], [1, 1], [2, 5], [6, 6]] 96 | 97 | [[tests]] 98 | name = "repeat4" 99 | regex = '(?m)(?:^|a+)' 100 | input = "a\naaa\n" 101 | matches = [[0, 0], [2, 2], [3, 5], [6, 6]] 102 | 103 | [[tests]] 104 | name = "repeat4-no-multi" 105 | regex = '(?:^|a+)' 106 | input = "a\naaa\n" 107 | matches = [[0, 0], [2, 5]] 108 | 109 | [[tests]] 110 | name = "repeat5" 111 | regex = '(?m)(?:^|a*)' 112 | input = "a\naaa\n" 113 | matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]] 114 | 115 | [[tests]] 116 | name = "repeat5-no-multi" 117 | regex = '(?:^|a*)' 118 | input = "a\naaa\n" 119 | matches = [[0, 0], [1, 1], [2, 5], [6, 6]] 120 | 121 | [[tests]] 122 | name = "repeat6" 123 | regex = '(?m)(?:^[a-z])+' 124 | input = "abc\ndef\nxyz" 125 | matches = [[0, 1], [4, 5], [8, 9]] 126 | 127 | [[tests]] 128 | name = "repeat6-no-multi" 129 | regex = '(?:^[a-z])+' 130 | input = "abc\ndef\nxyz" 131 | matches = [[0, 1]] 132 | 133 | [[tests]] 134 | name = "repeat7" 135 | regex = '(?m)(?:^[a-z]{3}\n?)+' 136 | input = "abc\ndef\nxyz" 137 | matches = [[0, 11]] 138 | 139 | [[tests]] 140 | name = "repeat7-no-multi" 141 | regex = '(?:^[a-z]{3}\n?)+' 142 | input = "abc\ndef\nxyz" 143 | matches = [[0, 4]] 144 | 145 | [[tests]] 146 | name = "repeat8" 147 | regex = '(?m)(?:^[a-z]{3}\n?)*' 148 | input = "abc\ndef\nxyz" 149 | matches = [[0, 11]] 150 | 151 | [[tests]] 152 | name = "repeat8-no-multi" 153 | regex = '(?:^[a-z]{3}\n?)*' 154 | input = "abc\ndef\nxyz" 155 | matches = [[0, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11]] 156 | 157 | [[tests]] 158 | name = "repeat9" 159 | regex = '(?m)(?:\n?[a-z]{3}$)+' 160 | input = "abc\ndef\nxyz" 161 | matches = [[0, 11]] 162 | 163 | [[tests]] 164 | name = "repeat9-no-multi" 165 | regex = '(?:\n?[a-z]{3}$)+' 166 | input = "abc\ndef\nxyz" 167 | matches = [[7, 11]] 168 | 169 | [[tests]] 170 | name = "repeat10" 171 | 
regex = '(?m)(?:\n?[a-z]{3}$)*' 172 | input = "abc\ndef\nxyz" 173 | matches = [[0, 11]] 174 | 175 | [[tests]] 176 | name = "repeat10-no-multi" 177 | regex = '(?:\n?[a-z]{3}$)*' 178 | input = "abc\ndef\nxyz" 179 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 11]] 180 | 181 | [[tests]] 182 | name = "repeat11" 183 | regex = '(?m)^*' 184 | input = "\naa\n" 185 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] 186 | 187 | [[tests]] 188 | name = "repeat11-no-multi" 189 | regex = '^*' 190 | input = "\naa\n" 191 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] 192 | 193 | [[tests]] 194 | name = "repeat12" 195 | regex = '(?m)^+' 196 | input = "\naa\n" 197 | matches = [[0, 0], [1, 1], [4, 4]] 198 | 199 | [[tests]] 200 | name = "repeat12-no-multi" 201 | regex = '^+' 202 | input = "\naa\n" 203 | matches = [[0, 0]] 204 | 205 | [[tests]] 206 | name = "repeat13" 207 | regex = '(?m)$*' 208 | input = "\naa\n" 209 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] 210 | 211 | [[tests]] 212 | name = "repeat13-no-multi" 213 | regex = '$*' 214 | input = "\naa\n" 215 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] 216 | 217 | [[tests]] 218 | name = "repeat14" 219 | regex = '(?m)$+' 220 | input = "\naa\n" 221 | matches = [[0, 0], [3, 3], [4, 4]] 222 | 223 | [[tests]] 224 | name = "repeat14-no-multi" 225 | regex = '$+' 226 | input = "\naa\n" 227 | matches = [[4, 4]] 228 | 229 | [[tests]] 230 | name = "repeat15" 231 | regex = '(?m)(?:$\n)+' 232 | input = "\n\naaa\n\n" 233 | matches = [[0, 2], [5, 7]] 234 | 235 | [[tests]] 236 | name = "repeat15-no-multi" 237 | regex = '(?:$\n)+' 238 | input = "\n\naaa\n\n" 239 | matches = [] 240 | 241 | [[tests]] 242 | name = "repeat16" 243 | regex = '(?m)(?:$\n)*' 244 | input = "\n\naaa\n\n" 245 | matches = [[0, 2], [3, 3], [4, 4], [5, 7]] 246 | 247 | [[tests]] 248 | name = "repeat16-no-multi" 249 | regex = '(?:$\n)*' 250 | input = "\n\naaa\n\n" 251 | matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]] 252 | 253 | [[tests]] 254 | name = "repeat17" 255 | regex = '(?m)(?:$\n^)+' 256 | input = "\n\naaa\n\n" 257 | matches = [[0, 2], [5, 7]] 258 | 259 | [[tests]] 260 | name = "repeat17-no-multi" 261 | regex = '(?:$\n^)+' 262 | input = "\n\naaa\n\n" 263 | matches = [] 264 | 265 | [[tests]] 266 | name = "repeat18" 267 | regex = '(?m)(?:^|$)+' 268 | input = "\n\naaa\n\n" 269 | matches = [[0, 0], [1, 1], [2, 2], [5, 5], [6, 6], [7, 7]] 270 | 271 | [[tests]] 272 | name = "repeat18-no-multi" 273 | regex = '(?:^|$)+' 274 | input = "\n\naaa\n\n" 275 | matches = [[0, 0], [7, 7]] 276 | -------------------------------------------------------------------------------- /tests/data/no-unicode.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "invalid-utf8-literal1" 3 | regex = '\xFF' 4 | input = '\xFF' 5 | matches = [[0, 1]] 6 | unicode = false 7 | utf8 = false 8 | unescape = true 9 | 10 | 11 | [[tests]] 12 | name = "mixed" 13 | regex = '(.+)(?-u)(.+)' 14 | input = '\xCE\x93\xCE\x94\xFF' 15 | matches = [[0, 5]] 16 | utf8 = false 17 | unescape = true 18 | 19 | 20 | [[tests]] 21 | name = "case1" 22 | regex = "a" 23 | input = "A" 24 | matches = [[0, 1]] 25 | case_insensitive = true 26 | unicode = false 27 | 28 | [[tests]] 29 | name = "case2" 30 | regex = "[a-z]+" 31 | input = "AaAaA" 32 | matches = [[0, 5]] 33 | case_insensitive = true 34 | unicode = false 35 | 36 | [[tests]] 37 | name = "case3" 38 | regex = "[a-z]+" 39 | input = "aA\u212AaA" 40 | matches = [[0, 7]] 41 | 
case_insensitive = true 42 | 43 | [[tests]] 44 | name = "case4" 45 | regex = "[a-z]+" 46 | input = "aA\u212AaA" 47 | matches = [[0, 2], [5, 7]] 48 | case_insensitive = true 49 | unicode = false 50 | 51 | 52 | [[tests]] 53 | name = "negate1" 54 | regex = "[^a]" 55 | input = "δ" 56 | matches = [[0, 2]] 57 | 58 | [[tests]] 59 | name = "negate2" 60 | regex = "[^a]" 61 | input = "δ" 62 | matches = [[0, 1], [1, 2]] 63 | unicode = false 64 | utf8 = false 65 | 66 | 67 | [[tests]] 68 | name = "dotstar-prefix1" 69 | regex = "a" 70 | input = '\xFFa' 71 | matches = [[1, 2]] 72 | unicode = false 73 | utf8 = false 74 | unescape = true 75 | 76 | [[tests]] 77 | name = "dotstar-prefix2" 78 | regex = "a" 79 | input = '\xFFa' 80 | matches = [[1, 2]] 81 | utf8 = false 82 | unescape = true 83 | 84 | 85 | [[tests]] 86 | name = "null-bytes1" 87 | regex = '[^\x00]+\x00' 88 | input = 'foo\x00' 89 | matches = [[0, 4]] 90 | unicode = false 91 | utf8 = false 92 | unescape = true 93 | 94 | 95 | [[tests]] 96 | name = "word-ascii" 97 | regex = '\w+' 98 | input = "aδ" 99 | matches = [[0, 1]] 100 | unicode = false 101 | 102 | [[tests]] 103 | name = "word-unicode" 104 | regex = '\w+' 105 | input = "aδ" 106 | matches = [[0, 3]] 107 | 108 | [[tests]] 109 | name = "decimal-ascii" 110 | regex = '\d+' 111 | input = "1२३9" 112 | matches = [[0, 1], [7, 8]] 113 | unicode = false 114 | 115 | [[tests]] 116 | name = "decimal-unicode" 117 | regex = '\d+' 118 | input = "1२३9" 119 | matches = [[0, 8]] 120 | 121 | [[tests]] 122 | name = "space-ascii" 123 | regex = '\s+' 124 | input = " \u1680" 125 | matches = [[0, 1]] 126 | unicode = false 127 | 128 | [[tests]] 129 | name = "space-unicode" 130 | regex = '\s+' 131 | input = " \u1680" 132 | matches = [[0, 4]] 133 | 134 | 135 | [[tests]] 136 | # See: https://github.com/rust-lang/regex/issues/484 137 | name = "iter1-bytes" 138 | regex = '' 139 | input = "☃" 140 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 141 | utf8 = false 142 | 143 | [[tests]] 144 | # See: https://github.com/rust-lang/regex/issues/484 145 | name = "iter1-utf8" 146 | regex = '' 147 | input = "☃" 148 | matches = [[0, 0], [3, 3]] 149 | 150 | [[tests]] 151 | # See: https://github.com/rust-lang/regex/issues/484 152 | # Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8. 
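# (With 'unescape = true', the input below is the three bytes b, \xFF, r,
# so the empty regex matches at every byte offset from 0 through 3, on
# both sides of the invalid UTF-8 byte.)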
153 | name = "iter2-bytes" 154 | regex = '' 155 | input = 'b\xFFr' 156 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 157 | unescape = true 158 | utf8 = false 159 | -------------------------------------------------------------------------------- /tests/data/overlapping.toml: -------------------------------------------------------------------------------- 1 | [[tests]] 2 | name = "repetition-plus-leftmost-first-100" 3 | regex = 'a+' 4 | input = "aaa" 5 | matches = [[0, 1], [0, 2], [0, 3]] 6 | match_kind = "leftmost-first" 7 | search_kind = "overlapping" 8 | 9 | [[tests]] 10 | name = "repetition-plus-all-100" 11 | regex = 'a+' 12 | input = "aaa" 13 | matches = [[0, 1], [0, 2], [0, 3]] 14 | match_kind = "all" 15 | search_kind = "overlapping" 16 | 17 | [[tests]] 18 | name = "repetition-plus-leftmost-first-200" 19 | regex = '(abc)+' 20 | input = "zzabcabczzabc" 21 | matches = [[2, 5], [2, 8]] 22 | match_kind = "leftmost-first" 23 | search_kind = "overlapping" 24 | 25 | [[tests]] 26 | name = "repetition-plus-all-200" 27 | regex = '(abc)+' 28 | input = "zzabcabczzabc" 29 | matches = [[2, 5], [2, 8], [10, 13]] 30 | match_kind = "all" 31 | search_kind = "overlapping" 32 | 33 | [[tests]] 34 | name = "repetition-star-leftmost-first-100" 35 | regex = 'a*' 36 | input = "aaa" 37 | matches = [[0, 0], [0, 1], [0, 2], [0, 3]] 38 | match_kind = "leftmost-first" 39 | search_kind = "overlapping" 40 | 41 | [[tests]] 42 | name = "repetition-star-all-100" 43 | regex = 'a*' 44 | input = "aaa" 45 | matches = [[0, 0], [0, 1], [0, 2], [0, 3]] 46 | match_kind = "all" 47 | search_kind = "overlapping" 48 | 49 | [[tests]] 50 | name = "repetition-star-leftmost-first-200" 51 | regex = '(abc)*' 52 | input = "zzabcabczzabc" 53 | matches = [[0, 0]] 54 | match_kind = "leftmost-first" 55 | search_kind = "overlapping" 56 | 57 | [[tests]] 58 | name = "repetition-star-all-200" 59 | regex = '(abc)*' 60 | input = "zzabcabczzabc" 61 | matches = [ 62 | [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], 63 | [2, 5], 64 | [6, 6], [7, 7], 65 | [2, 8], 66 | [9, 9], [10, 10], [11, 11], [12, 12], 67 | [10, 13], 68 | ] 69 | match_kind = "all" 70 | search_kind = "overlapping" 71 | 72 | [[tests]] 73 | name = "start-end-rep-leftmost-first" 74 | regex = '(^$)*' 75 | input = "abc" 76 | matches = [[0, 0]] 77 | match_kind = "leftmost-first" 78 | search_kind = "overlapping" 79 | 80 | [[tests]] 81 | name = "start-end-rep-all" 82 | regex = '(^$)*' 83 | input = "abc" 84 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 85 | match_kind = "all" 86 | search_kind = "overlapping" 87 | 88 | [[tests]] 89 | name = "alt-leftmost-first-100" 90 | regex = 'abc|a' 91 | input = "zzabcazzaabc" 92 | matches = [[2, 3], [2, 5]] 93 | match_kind = "leftmost-first" 94 | search_kind = "overlapping" 95 | 96 | [[tests]] 97 | name = "alt-all-100" 98 | regex = 'abc|a' 99 | input = "zzabcazzaabc" 100 | matches = [[2, 3], [2, 5], [5, 6], [8, 9], [9, 10], [9, 12]] 101 | match_kind = "all" 102 | search_kind = "overlapping" 103 | 104 | [[tests]] 105 | name = "empty-000" 106 | regex = "" 107 | input = "abc" 108 | matches = [[0, 0], [1, 1], [2, 2], [3, 3]] 109 | match_kind = "all" 110 | search_kind = "overlapping" 111 | 112 | [[tests]] 113 | name = "empty-alt-000" 114 | regex = "|b" 115 | input = "abc" 116 | matches = [[0, 0], [1, 1], [1, 2], [3, 3]] 117 | match_kind = "all" 118 | search_kind = "overlapping" 119 | 120 | [[tests]] 121 | name = "empty-alt-010" 122 | regex = "b|" 123 | input = "abc" 124 | matches = [[0, 0], [1, 1], [1, 2], [3, 3]] 125 | match_kind = "all" 126 | search_kind = 
"overlapping" 127 | -------------------------------------------------------------------------------- /tests/dfa/api.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | 3 | use regex_automata::{ 4 | dfa::{dense, regex::Regex, Automaton, OverlappingState}, 5 | nfa::thompson, 6 | HalfMatch, MatchError, MatchKind, MultiMatch, 7 | }; 8 | 9 | use crate::util::{BunkPrefilter, SubstringPrefilter}; 10 | 11 | // Tests that quit bytes in the forward direction work correctly. 12 | #[test] 13 | fn quit_fwd() -> Result<(), Box> { 14 | let dfa = dense::Builder::new() 15 | .configure(dense::Config::new().quit(b'x', true)) 16 | .build("[[:word:]]+$")?; 17 | 18 | assert_eq!( 19 | dfa.find_earliest_fwd(b"abcxyz"), 20 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 21 | ); 22 | assert_eq!( 23 | dfa.find_leftmost_fwd(b"abcxyz"), 24 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 25 | ); 26 | assert_eq!( 27 | dfa.find_overlapping_fwd(b"abcxyz", &mut OverlappingState::start()), 28 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 29 | ); 30 | 31 | Ok(()) 32 | } 33 | 34 | // Tests that quit bytes in the reverse direction work correctly. 35 | #[test] 36 | fn quit_rev() -> Result<(), Box> { 37 | let dfa = dense::Builder::new() 38 | .configure(dense::Config::new().quit(b'x', true)) 39 | .thompson(thompson::Config::new().reverse(true)) 40 | .build("^[[:word:]]+")?; 41 | 42 | assert_eq!( 43 | dfa.find_earliest_rev(b"abcxyz"), 44 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 45 | ); 46 | assert_eq!( 47 | dfa.find_leftmost_rev(b"abcxyz"), 48 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 49 | ); 50 | 51 | Ok(()) 52 | } 53 | 54 | // Tests that if we heuristically enable Unicode word boundaries but then 55 | // instruct that a non-ASCII byte should NOT be a quit byte, then the builder 56 | // will panic. 57 | #[test] 58 | #[should_panic] 59 | fn quit_panics() { 60 | dense::Config::new().unicode_word_boundary(true).quit(b'\xFF', false); 61 | } 62 | 63 | // Tests that if we attempt an overlapping search using a regex without a 64 | // reverse DFA compiled with 'starts_for_each_pattern', then we get a panic. 65 | #[test] 66 | #[should_panic] 67 | fn incorrect_config_overlapping_search_panics() { 68 | let forward = dense::DFA::new(r"abca").unwrap(); 69 | let reverse = dense::Builder::new() 70 | .configure( 71 | dense::Config::new() 72 | .anchored(true) 73 | .match_kind(MatchKind::All) 74 | .starts_for_each_pattern(false), 75 | ) 76 | .thompson(thompson::Config::new().reverse(true)) 77 | .build(r"abca") 78 | .unwrap(); 79 | 80 | let re = Regex::builder().build_from_dfas(forward, reverse); 81 | let haystack = "bar abcabcabca abca foo".as_bytes(); 82 | re.find_overlapping(haystack, &mut OverlappingState::start()); 83 | } 84 | 85 | // This tests an intesting case where even if the Unicode word boundary option 86 | // is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode 87 | // word boundaries to be enabled. 88 | #[test] 89 | fn unicode_word_implicitly_works() -> Result<(), Box> { 90 | let mut config = dense::Config::new(); 91 | for b in 0x80..=0xFF { 92 | config = config.quit(b, true); 93 | } 94 | let dfa = dense::Builder::new().configure(config).build(r"\b")?; 95 | let expected = HalfMatch::must(0, 1); 96 | assert_eq!(dfa.find_leftmost_fwd(b" a"), Ok(Some(expected))); 97 | Ok(()) 98 | } 99 | 100 | // Tests that we can provide a prefilter to a Regex, and the search reports 101 | // correct results. 
102 | #[test] 103 | fn prefilter_works() -> Result<(), Box<dyn Error>> { 104 | let re = Regex::new(r"a[0-9]+") 105 | .unwrap() 106 | .with_prefilter(SubstringPrefilter::new("a")); 107 | let text = b"foo abc foo a1a2a3 foo a123 bar aa456"; 108 | let matches: Vec<(usize, usize)> = 109 | re.find_leftmost_iter(text).map(|m| (m.start(), m.end())).collect(); 110 | assert_eq!( 111 | matches, 112 | vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] 113 | ); 114 | Ok(()) 115 | } 116 | 117 | // This test confirms that a prefilter is active by using a prefilter that 118 | // reports false negatives. 119 | #[test] 120 | fn prefilter_is_active() -> Result<(), Box<dyn Error>> { 121 | let text = b"za123"; 122 | let re = Regex::new(r"a[0-9]+") 123 | .unwrap() 124 | .with_prefilter(SubstringPrefilter::new("a")); 125 | assert_eq!(re.find_leftmost(b"za123"), Some(MultiMatch::must(0, 1, 5))); 126 | assert_eq!(re.find_leftmost(b"a123"), Some(MultiMatch::must(0, 0, 4))); 127 | let re = re.with_prefilter(BunkPrefilter::new()); 128 | assert_eq!(re.find_leftmost(b"za123"), None); 129 | // This checks that the prefilter is used when first starting the search, 130 | // instead of waiting until at least one transition has occurred. 131 | assert_eq!(re.find_leftmost(b"a123"), None); 132 | Ok(()) 133 | } 134 | -------------------------------------------------------------------------------- /tests/dfa/mod.rs: -------------------------------------------------------------------------------- 1 | mod api; 2 | mod suite; 3 | -------------------------------------------------------------------------------- /tests/hybrid/api.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | 3 | use regex_automata::{ 4 | hybrid::{ 5 | dfa::{self, DFA}, 6 | regex::Regex, 7 | OverlappingState, 8 | }, 9 | nfa::thompson, 10 | HalfMatch, MatchError, MatchKind, MultiMatch, 11 | }; 12 | 13 | use crate::util::{BunkPrefilter, SubstringPrefilter}; 14 | 15 | // Tests that too many cache resets cause the lazy DFA to quit. 16 | // 17 | // We only test this on 64-bit because the test is gingerly crafted based on 18 | // implementation details of cache sizes. It's not a great test because of 19 | // that, but it does check some interesting properties around how positions are 20 | // reported when a search "gives up." 21 | #[test] 22 | #[cfg(target_pointer_width = "64")] 23 | fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> { 24 | // This is a carefully chosen regex. The idea is to pick one that requires 25 | // some decent number of states (hence the bounded repetition). But we 26 | // specifically choose to create a class with an ASCII letter and a 27 | // non-ASCII letter so that we can check that no new states are created 28 | // once the cache is full. Namely, if we fill up the cache on a haystack 29 | // of 'a's, then in order to match one 'β', a new state will need to be 30 | // created since a 'β' is encoded with multiple bytes. Since there's no 31 | // room for this state, the search should quit at the very first position. 32 | let pattern = r"[aβ]{100}"; 33 | let dfa = DFA::builder() 34 | .configure( 35 | // Configure it so that we have the minimum cache capacity 36 | // possible. And that if any resets occur, the search quits.
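// (A capacity of 0 is below the normal minimum, hence the explicit
// skip of the capacity check, and a clear count of Some(0) turns
// the first needed cache clearing into a 'GaveUp' error.)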
37 | DFA::config() 38 | .skip_cache_capacity_check(true) 39 | .cache_capacity(0) 40 | .minimum_cache_clear_count(Some(0)), 41 | ) 42 | .build(pattern)?; 43 | let mut cache = dfa.create_cache(); 44 | 45 | let haystack = "a".repeat(101).into_bytes(); 46 | let err = MatchError::GaveUp { offset: 25 }; 47 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone())); 48 | assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone())); 49 | assert_eq!( 50 | dfa.find_overlapping_fwd( 51 | &mut cache, 52 | &haystack, 53 | &mut OverlappingState::start() 54 | ), 55 | Err(err.clone()) 56 | ); 57 | 58 | let haystack = "β".repeat(101).into_bytes(); 59 | let err = MatchError::GaveUp { offset: 0 }; 60 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 61 | // no need to test that other find routines quit, since we did that above 62 | 63 | // OK, if we reset the cache, then we should be able to create more states 64 | // and make more progress with searching for betas. 65 | cache.reset(&dfa); 66 | let err = MatchError::GaveUp { offset: 26 }; 67 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 68 | 69 | // ... switching back to ASCII still makes progress since it just needs to 70 | // set transitions on existing states! 71 | let haystack = "a".repeat(101).into_bytes(); 72 | let err = MatchError::GaveUp { offset: 13 }; 73 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 74 | 75 | Ok(()) 76 | } 77 | 78 | // Tests that quit bytes in the forward direction work correctly. 79 | #[test] 80 | fn quit_fwd() -> Result<(), Box<dyn Error>> { 81 | let dfa = DFA::builder() 82 | .configure(DFA::config().quit(b'x', true)) 83 | .build("[[:word:]]+$")?; 84 | let mut cache = dfa.create_cache(); 85 | 86 | assert_eq!( 87 | dfa.find_earliest_fwd(&mut cache, b"abcxyz"), 88 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 89 | ); 90 | assert_eq!( 91 | dfa.find_leftmost_fwd(&mut cache, b"abcxyz"), 92 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 93 | ); 94 | assert_eq!( 95 | dfa.find_overlapping_fwd( 96 | &mut cache, 97 | b"abcxyz", 98 | &mut OverlappingState::start() 99 | ), 100 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 101 | ); 102 | 103 | Ok(()) 104 | } 105 | 106 | // Tests that quit bytes in the reverse direction work correctly. 107 | #[test] 108 | fn quit_rev() -> Result<(), Box<dyn Error>> { 109 | let dfa = DFA::builder() 110 | .configure(DFA::config().quit(b'x', true)) 111 | .thompson(thompson::Config::new().reverse(true)) 112 | .build("^[[:word:]]+")?; 113 | let mut cache = dfa.create_cache(); 114 | 115 | assert_eq!( 116 | dfa.find_earliest_rev(&mut cache, b"abcxyz"), 117 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 118 | ); 119 | assert_eq!( 120 | dfa.find_leftmost_rev(&mut cache, b"abcxyz"), 121 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 122 | ); 123 | 124 | Ok(()) 125 | } 126 | 127 | // Tests that if we heuristically enable Unicode word boundaries but then 128 | // instruct that a non-ASCII byte should NOT be a quit byte, then the builder 129 | // will panic. 130 | #[test] 131 | #[should_panic] 132 | fn quit_panics() { 133 | DFA::config().unicode_word_boundary(true).quit(b'\xFF', false); 134 | } 135 | 136 | // This tests an interesting case where even if the Unicode word boundary option 137 | // is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode 138 | // word boundaries to be enabled.
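// That is, once every non-ASCII byte is a quit byte, the DFA only ever
// needs \b to be correct on ASCII input, which it can guarantee, so the
// stricter Unicode semantics come along for free.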
139 | #[test] 140 | fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> { 141 | let mut config = DFA::config(); 142 | for b in 0x80..=0xFF { 143 | config = config.quit(b, true); 144 | } 145 | let dfa = DFA::builder().configure(config).build(r"\b")?; 146 | let mut cache = dfa.create_cache(); 147 | let expected = HalfMatch::must(0, 1); 148 | assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected))); 149 | Ok(()) 150 | } 151 | 152 | // Tests that we can provide a prefilter to a Regex, and the search reports 153 | // correct results. 154 | #[test] 155 | fn prefilter_works() -> Result<(), Box<dyn Error>> { 156 | let mut re = Regex::new(r"a[0-9]+").unwrap(); 157 | re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); 158 | let mut cache = re.create_cache(); 159 | 160 | let text = b"foo abc foo a1a2a3 foo a123 bar aa456"; 161 | let matches: Vec<(usize, usize)> = re 162 | .find_leftmost_iter(&mut cache, text) 163 | .map(|m| (m.start(), m.end())) 164 | .collect(); 165 | assert_eq!( 166 | matches, 167 | vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] 168 | ); 169 | Ok(()) 170 | } 171 | 172 | // This test confirms that a prefilter is active by using a prefilter that 173 | // reports false negatives. 174 | #[test] 175 | fn prefilter_is_active() -> Result<(), Box<dyn Error>> { 176 | let text = b"za123"; 177 | let mut re = Regex::new(r"a[0-9]+").unwrap(); 178 | let mut cache = re.create_cache(); 179 | 180 | re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); 181 | assert_eq!( 182 | re.find_leftmost(&mut cache, b"za123"), 183 | Some(MultiMatch::must(0, 1, 5)) 184 | ); 185 | assert_eq!( 186 | re.find_leftmost(&mut cache, b"a123"), 187 | Some(MultiMatch::must(0, 0, 4)) 188 | ); 189 | re.set_prefilter(Some(Box::new(BunkPrefilter::new()))); 190 | assert_eq!(re.find_leftmost(&mut cache, b"za123"), None); 191 | // This checks that the prefilter is used when first starting the search, 192 | // instead of waiting until at least one transition has occurred. 193 | assert_eq!(re.find_leftmost(&mut cache, b"a123"), None); 194 | Ok(()) 195 | } 196 | -------------------------------------------------------------------------------- /tests/hybrid/mod.rs: -------------------------------------------------------------------------------- 1 | mod api; 2 | mod suite; 3 | -------------------------------------------------------------------------------- /tests/hybrid/suite.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{ 2 | hybrid::{ 3 | dfa::DFA, 4 | regex::{self, Regex}, 5 | }, 6 | nfa::thompson, 7 | MatchKind, SyntaxConfig, 8 | }; 9 | use regex_syntax as syntax; 10 | 11 | use regex_test::{ 12 | bstr::{BString, ByteSlice}, 13 | CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests, 14 | SearchKind as TestSearchKind, TestResult, TestRunner, 15 | }; 16 | 17 | use crate::{suite, Result}; 18 | 19 | /// Tests the default configuration of the hybrid NFA/DFA. 20 | #[test] 21 | fn default() -> Result<()> { 22 | let builder = Regex::builder(); 23 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 24 | Ok(()) 25 | } 26 | 27 | /// Tests the hybrid NFA/DFA with NFA shrinking disabled. 28 | /// 29 | /// This is actually the typical configuration one wants for a lazy DFA. NFA 30 | /// shrinking is mostly only advantageous when building a full DFA since it 31 | /// can sharply decrease the amount of time determinization takes. But NFA 32 | /// shrinking is itself otherwise fairly expensive.
Since a lazy DFA has 33 | /// no compilation time (other than for building the NFA of course) before 34 | /// executing a search, it's usually worth it to forgo NFA shrinking. 35 | #[test] 36 | fn no_nfa_shrink() -> Result<()> { 37 | let mut builder = Regex::builder(); 38 | builder.thompson(thompson::Config::new().shrink(false)); 39 | TestRunner::new()? 40 | // Without NFA shrinking, this test blows the default cache capacity. 41 | .blacklist("expensive/regression-many-repeat-no-stack-overflow") 42 | .test_iter(suite()?.iter(), compiler(builder)) 43 | .assert(); 44 | Ok(()) 45 | } 46 | 47 | /// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled. 48 | #[test] 49 | fn starts_for_each_pattern() -> Result<()> { 50 | let mut builder = Regex::builder(); 51 | builder.dfa(DFA::config().starts_for_each_pattern(true)); 52 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 53 | Ok(()) 54 | } 55 | 56 | /// Tests the hybrid NFA/DFA when byte classes are disabled. 57 | /// 58 | /// N.B. Disabling byte classes doesn't avoid any indirection at search time. 59 | /// All it does is cause every byte value to be its own distinct equivalence 60 | /// class. 61 | #[test] 62 | fn no_byte_classes() -> Result<()> { 63 | let mut builder = Regex::builder(); 64 | builder.dfa(DFA::config().byte_classes(false)); 65 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 66 | Ok(()) 67 | } 68 | 69 | /// Tests that the hybrid NFA/DFA never clears its cache for any test with the 70 | /// default capacity. 71 | /// 72 | /// N.B. If a regex suite test is added that causes the cache to be cleared, 73 | /// then this should just skip that test. (Which can be done by calling the 74 | /// 'blacklist' method on 'TestRunner'.) 75 | #[test] 76 | fn no_cache_clearing() -> Result<()> { 77 | let mut builder = Regex::builder(); 78 | builder.dfa(DFA::config().minimum_cache_clear_count(Some(0))); 79 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 80 | Ok(()) 81 | } 82 | 83 | /// Tests the hybrid NFA/DFA when the minimum cache capacity is set. 84 | #[test] 85 | fn min_cache_capacity() -> Result<()> { 86 | let mut builder = Regex::builder(); 87 | builder 88 | .dfa(DFA::config().cache_capacity(0).skip_cache_capacity_check(true)); 89 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 90 | Ok(()) 91 | } 92 | 93 | fn compiler( 94 | mut builder: regex::Builder, 95 | ) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> { 96 | move |test, regexes| { 97 | let regexes = regexes 98 | .iter() 99 | .map(|r| r.to_str().map(|s| s.to_string())) 100 | .collect::<std::result::Result<Vec<String>, _>>()?; 101 | 102 | // Check if our regex contains things that aren't supported by DFAs. 103 | // That is, Unicode word boundaries when searching non-ASCII text.
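// (Such a search would report an error instead of a match, so the
// harness builds the NFA up front and skips the test entirely.)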
104 | let mut thompson = thompson::Builder::new(); 105 | thompson.syntax(config_syntax(test)).configure(config_thompson(test)); 106 | if let Ok(nfa) = thompson.build_many(&regexes) { 107 | let non_ascii = test.input().iter().any(|&b| !b.is_ascii()); 108 | if nfa.has_word_boundary_unicode() && non_ascii { 109 | return Ok(CompiledRegex::skip()); 110 | } 111 | } 112 | if !configure_regex_builder(test, &mut builder) { 113 | return Ok(CompiledRegex::skip()); 114 | } 115 | let re = builder.build_many(&regexes)?; 116 | let mut cache = re.create_cache(); 117 | Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> { 118 | run_test(&re, &mut cache, test) 119 | })) 120 | } 121 | } 122 | 123 | fn run_test( 124 | re: &Regex, 125 | cache: &mut regex::Cache, 126 | test: &RegexTest, 127 | ) -> Vec<TestResult> { 128 | let is_match = if re.is_match(cache, test.input()) { 129 | TestResult::matched() 130 | } else { 131 | TestResult::no_match() 132 | }; 133 | let is_match = is_match.name("is_match"); 134 | 135 | let find_matches = match test.search_kind() { 136 | TestSearchKind::Earliest => { 137 | let it = re 138 | .find_earliest_iter(cache, test.input()) 139 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 140 | .map(|m| Match { 141 | id: m.pattern().as_usize(), 142 | start: m.start(), 143 | end: m.end(), 144 | }); 145 | TestResult::matches(it).name("find_earliest_iter") 146 | } 147 | TestSearchKind::Leftmost => { 148 | let it = re 149 | .find_leftmost_iter(cache, test.input()) 150 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 151 | .map(|m| Match { 152 | id: m.pattern().as_usize(), 153 | start: m.start(), 154 | end: m.end(), 155 | }); 156 | TestResult::matches(it).name("find_leftmost_iter") 157 | } 158 | TestSearchKind::Overlapping => { 159 | let it = re 160 | .find_overlapping_iter(cache, test.input()) 161 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 162 | .map(|m| Match { 163 | id: m.pattern().as_usize(), 164 | start: m.start(), 165 | end: m.end(), 166 | }); 167 | TestResult::matches(it).name("find_overlapping_iter") 168 | } 169 | }; 170 | vec![is_match, find_matches] 171 | } 172 | 173 | /// Configures the given regex builder with all relevant settings on the given 174 | /// regex test. 175 | /// 176 | /// If the regex test has a setting that is unsupported, then this returns 177 | /// false (implying the test should be skipped). 178 | fn configure_regex_builder( 179 | test: &RegexTest, 180 | builder: &mut regex::Builder, 181 | ) -> bool { 182 | let match_kind = match test.match_kind() { 183 | TestMatchKind::All => MatchKind::All, 184 | TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst, 185 | TestMatchKind::LeftmostLongest => return false, 186 | }; 187 | 188 | let dense_config = DFA::config() 189 | .anchored(test.anchored()) 190 | .match_kind(match_kind) 191 | .unicode_word_boundary(true); 192 | let regex_config = Regex::config().utf8(test.utf8()); 193 | builder 194 | .configure(regex_config) 195 | .syntax(config_syntax(test)) 196 | .thompson(config_thompson(test)) 197 | .dfa(dense_config); 198 | true 199 | } 200 | 201 | /// Configuration of a Thompson NFA compiler from a regex test. 202 | fn config_thompson(test: &RegexTest) -> thompson::Config { 203 | thompson::Config::new().utf8(test.utf8()) 204 | } 205 | 206 | /// Configuration of the regex parser from a regex test.
207 | fn config_syntax(test: &RegexTest) -> SyntaxConfig { 208 | SyntaxConfig::new() 209 | .case_insensitive(test.case_insensitive()) 210 | .unicode(test.unicode()) 211 | .utf8(test.utf8()) 212 | } 213 | -------------------------------------------------------------------------------- /tests/nfa/mod.rs: -------------------------------------------------------------------------------- 1 | mod thompson; 2 | -------------------------------------------------------------------------------- /tests/nfa/thompson/mod.rs: -------------------------------------------------------------------------------- 1 | mod pikevm; 2 | -------------------------------------------------------------------------------- /tests/nfa/thompson/pikevm/api.rs: -------------------------------------------------------------------------------- 1 | /* 2 | use std::error::Error; 3 | 4 | use regex_automata::{ 5 | hybrid::{ 6 | dfa::{self, DFA}, 7 | regex::Regex, 8 | OverlappingState, 9 | }, 10 | nfa::thompson, 11 | HalfMatch, MatchError, MatchKind, MultiMatch, 12 | }; 13 | 14 | use crate::util::{BunkPrefilter, SubstringPrefilter}; 15 | 16 | // Tests that too many cache resets cause the lazy DFA to quit. 17 | #[test] 18 | fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> { 19 | // This is a carefully chosen regex. The idea is to pick one that requires 20 | // some decent number of states (hence the bounded repetition). But we 21 | // specifically choose to create a class with an ASCII letter and a 22 | // non-ASCII letter so that we can check that no new states are created 23 | // once the cache is full. Namely, if we fill up the cache on a haystack 24 | // of 'a's, then in order to match one 'β', a new state will need to be 25 | // created since a 'β' is encoded with multiple bytes. Since there's no 26 | // room for this state, the search should quit at the very first position. 27 | let pattern = r"[aβ]{100}"; 28 | let dfa = DFA::builder() 29 | .configure( 30 | // Configure it so that we have the minimum cache capacity 31 | // possible. And that if any resets occur, the search quits. 32 | DFA::config() 33 | .skip_cache_capacity_check(true) 34 | .cache_capacity(0) 35 | .minimum_cache_clear_count(Some(0)), 36 | ) 37 | .build(pattern)?; 38 | let mut cache = dfa.create_cache(); 39 | 40 | let haystack = "a".repeat(101).into_bytes(); 41 | let err = MatchError::GaveUp { offset: 25 }; 42 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone())); 43 | assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone())); 44 | assert_eq!( 45 | dfa.find_overlapping_fwd( 46 | &mut cache, 47 | &haystack, 48 | &mut OverlappingState::start() 49 | ), 50 | Err(err.clone()) 51 | ); 52 | 53 | let haystack = "β".repeat(101).into_bytes(); 54 | let err = MatchError::GaveUp { offset: 0 }; 55 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 56 | // no need to test that other find routines quit, since we did that above 57 | 58 | // OK, if we reset the cache, then we should be able to create more states 59 | // and make more progress with searching for betas. 60 | cache.reset(&dfa); 61 | let err = MatchError::GaveUp { offset: 26 }; 62 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 63 | 64 | // ... switching back to ASCII still makes progress since it just needs to 65 | // set transitions on existing states!
66 | let haystack = "a".repeat(101).into_bytes(); 67 | let err = MatchError::GaveUp { offset: 13 }; 68 | assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); 69 | 70 | Ok(()) 71 | } 72 | 73 | // Tests that quit bytes in the forward direction work correctly. 74 | #[test] 75 | fn quit_fwd() -> Result<(), Box<dyn Error>> { 76 | let dfa = DFA::builder() 77 | .configure(DFA::config().quit(b'x', true)) 78 | .build("[[:word:]]+$")?; 79 | let mut cache = dfa.create_cache(); 80 | 81 | assert_eq!( 82 | dfa.find_earliest_fwd(&mut cache, b"abcxyz"), 83 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 84 | ); 85 | assert_eq!( 86 | dfa.find_leftmost_fwd(&mut cache, b"abcxyz"), 87 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 88 | ); 89 | assert_eq!( 90 | dfa.find_overlapping_fwd( 91 | &mut cache, 92 | b"abcxyz", 93 | &mut OverlappingState::start() 94 | ), 95 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 96 | ); 97 | 98 | Ok(()) 99 | } 100 | 101 | // Tests that quit bytes in the reverse direction work correctly. 102 | #[test] 103 | fn quit_rev() -> Result<(), Box<dyn Error>> { 104 | let dfa = DFA::builder() 105 | .configure(DFA::config().quit(b'x', true)) 106 | .thompson(thompson::Config::new().reverse(true)) 107 | .build("^[[:word:]]+")?; 108 | let mut cache = dfa.create_cache(); 109 | 110 | assert_eq!( 111 | dfa.find_earliest_rev(&mut cache, b"abcxyz"), 112 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 113 | ); 114 | assert_eq!( 115 | dfa.find_leftmost_rev(&mut cache, b"abcxyz"), 116 | Err(MatchError::Quit { byte: b'x', offset: 3 }) 117 | ); 118 | 119 | Ok(()) 120 | } 121 | 122 | // Tests that if we heuristically enable Unicode word boundaries but then 123 | // instruct that a non-ASCII byte should NOT be a quit byte, then the builder 124 | // will panic. 125 | #[test] 126 | #[should_panic] 127 | fn quit_panics() { 128 | DFA::config().unicode_word_boundary(true).quit(b'\xFF', false); 129 | } 130 | 131 | // This tests an interesting case where even if the Unicode word boundary option 132 | // is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode 133 | // word boundaries to be enabled. 134 | #[test] 135 | fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> { 136 | let mut config = DFA::config(); 137 | for b in 0x80..=0xFF { 138 | config = config.quit(b, true); 139 | } 140 | let dfa = DFA::builder().configure(config).build(r"\b")?; 141 | let mut cache = dfa.create_cache(); 142 | let expected = HalfMatch::must(0, 1); 143 | assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected))); 144 | Ok(()) 145 | } 146 | 147 | // Tests that we can provide a prefilter to a Regex, and the search reports 148 | // correct results. 149 | #[test] 150 | fn prefilter_works() -> Result<(), Box<dyn Error>> { 151 | let mut re = Regex::new(r"a[0-9]+").unwrap(); 152 | re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); 153 | let mut cache = re.create_cache(); 154 | 155 | let text = b"foo abc foo a1a2a3 foo a123 bar aa456"; 156 | let matches: Vec<(usize, usize)> = re 157 | .find_leftmost_iter(&mut cache, text) 158 | .map(|m| (m.start(), m.end())) 159 | .collect(); 160 | assert_eq!( 161 | matches, 162 | vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] 163 | ); 164 | Ok(()) 165 | } 166 | 167 | // This test confirms that a prefilter is active by using a prefilter that 168 | // reports false negatives.
169 | #[test] 170 | fn prefilter_is_active() -> Result<(), Box<dyn Error>> { 171 | let text = b"za123"; 172 | let mut re = Regex::new(r"a[0-9]+").unwrap(); 173 | let mut cache = re.create_cache(); 174 | 175 | re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); 176 | assert_eq!( 177 | re.find_leftmost(&mut cache, b"za123"), 178 | Some(MultiMatch::must(0, 1, 5)) 179 | ); 180 | assert_eq!( 181 | re.find_leftmost(&mut cache, b"a123"), 182 | Some(MultiMatch::must(0, 0, 4)) 183 | ); 184 | re.set_prefilter(Some(Box::new(BunkPrefilter::new()))); 185 | assert_eq!(re.find_leftmost(&mut cache, b"za123"), None); 186 | // This checks that the prefilter is used when first starting the search, 187 | // instead of waiting until at least one transition has occurred. 188 | assert_eq!(re.find_leftmost(&mut cache, b"a123"), None); 189 | Ok(()) 190 | } 191 | */ 192 | -------------------------------------------------------------------------------- /tests/nfa/thompson/pikevm/mod.rs: -------------------------------------------------------------------------------- 1 | mod api; 2 | mod suite; 3 | -------------------------------------------------------------------------------- /tests/nfa/thompson/pikevm/suite.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{ 2 | nfa::thompson::{ 3 | self, 4 | pikevm::{self, PikeVM}, 5 | }, 6 | MatchKind, SyntaxConfig, 7 | }; 8 | use regex_syntax as syntax; 9 | 10 | use regex_test::{ 11 | bstr::{BString, ByteSlice}, 12 | CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests, 13 | SearchKind as TestSearchKind, TestResult, TestRunner, 14 | }; 15 | 16 | use crate::{suite, Result}; 17 | 18 | /// Tests the default configuration of the PikeVM. 19 | #[test] 20 | fn default() -> Result<()> { 21 | let builder = PikeVM::builder(); 22 | TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); 23 | Ok(()) 24 | } 25 | 26 | fn compiler( 27 | mut builder: pikevm::Builder, 28 | ) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> { 29 | move |test, regexes| { 30 | let regexes = regexes 31 | .iter() 32 | .map(|r| r.to_str().map(|s| s.to_string())) 33 | .collect::<std::result::Result<Vec<String>, _>>()?; 34 | if !configure_pikevm_builder(test, &mut builder) { 35 | return Ok(CompiledRegex::skip()); 36 | } 37 | let re = builder.build_many(&regexes)?; 38 | let mut cache = re.create_cache(); 39 | Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> { 40 | run_test(&re, &mut cache, test) 41 | })) 42 | } 43 | } 44 | 45 | fn run_test( 46 | re: &PikeVM, 47 | cache: &mut pikevm::Cache, 48 | test: &RegexTest, 49 | ) -> Vec<TestResult> { 50 | // let is_match = if re.is_match(cache, test.input()) { 51 | // TestResult::matched() 52 | // } else { 53 | // TestResult::no_match() 54 | // }; 55 | // let is_match = is_match.name("is_match"); 56 | 57 | let find_matches = match test.search_kind() { 58 | TestSearchKind::Earliest => { 59 | TestResult::skip().name("find_earliest_iter") 60 | } 61 | TestSearchKind::Leftmost => { 62 | let it = re 63 | .find_leftmost_iter(cache, test.input()) 64 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 65 | .map(|m| Match { 66 | id: m.pattern().as_usize(), 67 | start: m.start(), 68 | end: m.end(), 69 | }); 70 | TestResult::matches(it).name("find_leftmost_iter") 71 | } 72 | TestSearchKind::Overlapping => { 73 | TestResult::skip().name("find_overlapping_iter") 74 | } 75 | }; 76 | // vec![is_match, find_matches] 77 | vec![find_matches] 78 | } 79 | 80 | /// Configures the given regex builder with all relevant settings on the given
/// regex test. 82 | /// 83 | /// If the regex test has a setting that is unsupported, then this returns 84 | /// false (implying the test should be skipped). 85 | fn configure_pikevm_builder( 86 | test: &RegexTest, 87 | builder: &mut pikevm::Builder, 88 | ) -> bool { 89 | let pikevm_config = 90 | PikeVM::config().anchored(test.anchored()).utf8(test.utf8()); 91 | builder 92 | .configure(pikevm_config) 93 | .syntax(config_syntax(test)) 94 | .thompson(config_thompson(test)); 95 | true 96 | } 97 | 98 | /// Configuration of a Thompson NFA compiler from a regex test. 99 | fn config_thompson(test: &RegexTest) -> thompson::Config { 100 | thompson::Config::new().utf8(test.utf8()) 101 | } 102 | 103 | /// Configuration of the regex parser from a regex test. 104 | fn config_syntax(test: &RegexTest) -> SyntaxConfig { 105 | SyntaxConfig::new() 106 | .case_insensitive(test.case_insensitive()) 107 | .unicode(test.unicode()) 108 | .utf8(test.utf8()) 109 | } 110 | -------------------------------------------------------------------------------- /tests/regression.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{ 2 | dfa::{dense, Automaton}, 3 | MatchError, 4 | }; 5 | 6 | // A regression test for checking that minimization correctly translates 7 | // whether a state is a match state or not. Previously, it was possible for 8 | // minimization to mark a non-matching state as matching. 9 | #[test] 10 | fn minimize_sets_correct_match_states() { 11 | let pattern = 12 | // This is a subset of the grapheme matching regex. I couldn't seem 13 | // to get a repro any smaller than this unfortunately. 14 | r"(?x) 15 | (?: 16 | \p{gcb=Prepend}* 17 | (?: 18 | (?: 19 | (?: 20 | \p{gcb=L}* 21 | (?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT}) 22 | \p{gcb=T}* 23 | ) 24 | | 25 | \p{gcb=L}+ 26 | | 27 | \p{gcb=T}+ 28 | ) 29 | | 30 | \p{Extended_Pictographic} 31 | (?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})* 32 | | 33 | [^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}] 34 | ) 35 | [\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]* 36 | ) 37 | "; 38 | 39 | let dfa = dense::Builder::new() 40 | .configure(dense::Config::new().anchored(true).minimize(true)) 41 | .build(pattern) 42 | .unwrap(); 43 | assert_eq!(Ok(None), dfa.find_leftmost_fwd(b"\xE2")); 44 | } 45 | -------------------------------------------------------------------------------- /tests/tests.rs: -------------------------------------------------------------------------------- 1 | #![allow(warnings)] 2 | 3 | use regex_test::RegexTests; 4 | 5 | mod dfa; 6 | mod hybrid; 7 | mod nfa; 8 | mod regression; 9 | mod util; 10 | 11 | type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>; 12 | 13 | fn suite() -> Result<RegexTests> { 14 | let mut tests = RegexTests::new(); 15 | macro_rules!
load { 16 | ($name:expr) => {{ 17 | const DATA: &[u8] = 18 | include_bytes!(concat!("data/", $name, ".toml")); 19 | tests.load_slice($name, DATA)?; 20 | }}; 21 | } 22 | 23 | load!("bytes"); 24 | load!("crazy"); 25 | load!("earliest"); 26 | load!("empty"); 27 | load!("expensive"); 28 | load!("flags"); 29 | load!("iter"); 30 | load!("misc"); 31 | load!("multiline"); 32 | load!("no-unicode"); 33 | load!("overlapping"); 34 | load!("regression"); 35 | load!("set"); 36 | load!("unicode"); 37 | load!("word-boundary"); 38 | load!("fowler/basic"); 39 | load!("fowler/nullsubexpr"); 40 | load!("fowler/repetition"); 41 | load!("fowler/repetition-expensive"); 42 | 43 | Ok(tests) 44 | } 45 | -------------------------------------------------------------------------------- /tests/util.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::util::prefilter::{self, Candidate, Prefilter}; 2 | 3 | #[derive(Clone, Debug)] 4 | pub struct SubstringPrefilter(bstr::Finder<'static>); 5 | 6 | impl SubstringPrefilter { 7 | pub fn new>(needle: B) -> SubstringPrefilter { 8 | SubstringPrefilter(bstr::Finder::new(needle.as_ref()).into_owned()) 9 | } 10 | } 11 | 12 | impl Prefilter for SubstringPrefilter { 13 | #[inline] 14 | fn next_candidate( 15 | &self, 16 | state: &mut prefilter::State, 17 | haystack: &[u8], 18 | at: usize, 19 | ) -> Candidate { 20 | self.0 21 | .find(&haystack[at..]) 22 | .map(|i| Candidate::PossibleStartOfMatch(at + i)) 23 | .unwrap_or(Candidate::None) 24 | } 25 | 26 | fn heap_bytes(&self) -> usize { 27 | self.0.needle().len() 28 | } 29 | } 30 | 31 | /// A prefilter that always returns `Candidate::None`, even if it's a false 32 | /// negative. This is useful for confirming that a prefilter is actually 33 | /// active by asserting an incorrect result. 34 | #[derive(Clone, Debug)] 35 | pub struct BunkPrefilter(()); 36 | 37 | impl BunkPrefilter { 38 | pub fn new() -> BunkPrefilter { 39 | BunkPrefilter(()) 40 | } 41 | } 42 | 43 | impl Prefilter for BunkPrefilter { 44 | #[inline] 45 | fn next_candidate( 46 | &self, 47 | _state: &mut prefilter::State, 48 | _haystack: &[u8], 49 | _at: usize, 50 | ) -> Candidate { 51 | Candidate::None 52 | } 53 | 54 | fn heap_bytes(&self) -> usize { 55 | 0 56 | } 57 | } 58 | --------------------------------------------------------------------------------