├── .github ├── FUNDING.yml └── workflows │ └── ci.yml ├── .gitignore ├── .vim └── coc-settings.json ├── COPYING ├── Cargo.toml ├── DESIGN.md ├── LICENSE-MIT ├── README.md ├── UNLICENSE ├── aho-corasick-debug ├── Cargo.toml └── main.rs ├── benchmarks ├── definitions │ ├── build.toml │ ├── curated.toml │ ├── jetscii.toml │ ├── random │ │ ├── many.toml │ │ ├── memchr.toml │ │ └── misc.toml │ ├── regexcurated.toml │ ├── same.toml │ ├── sherlock.toml │ └── teddy.toml ├── engines.toml ├── engines │ ├── naive │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ └── main.rs │ ├── rust-aho-corasick │ │ ├── .gitignore │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ ├── README.md │ │ └── main.rs │ ├── rust-daachorse │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ └── main.rs │ ├── rust-jetscii │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ └── main.rs │ └── rust-old-aho-corasick │ │ ├── .gitignore │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ ├── README.md │ │ └── main.rs ├── haystacks │ ├── catalog.data.gov │ │ ├── README.md │ │ └── mental-health-4weeks.xml │ ├── opensubtitles │ │ ├── README.md │ │ ├── en-huge.txt │ │ ├── en-medium.txt │ │ ├── en-sampled.txt │ │ ├── en-small.txt │ │ ├── en-teeny.txt │ │ ├── en-tiny.txt │ │ ├── ru-huge.txt │ │ ├── ru-medium.txt │ │ ├── ru-sampled.txt │ │ ├── ru-small.txt │ │ ├── ru-teeny.txt │ │ ├── ru-tiny.txt │ │ ├── zh-huge.txt │ │ ├── zh-medium.txt │ │ ├── zh-sampled.txt │ │ ├── zh-small.txt │ │ ├── zh-teeny.txt │ │ └── zh-tiny.txt │ ├── random.txt │ ├── random10x.txt │ └── sherlock.txt ├── record │ ├── aarch64 │ │ ├── 2023-09-04.csv │ │ ├── 2023-09-07.csv │ │ ├── 2023-09-16.csv │ │ └── 2023-09-17.csv │ └── x86_64 │ │ ├── 2023-09-04.csv │ │ ├── 2023-09-07.csv │ │ ├── 2023-09-16.csv │ │ └── 2023-09-17.csv ├── regexes │ ├── dictionary │ │ └── english │ │ │ ├── length-10.txt │ │ │ ├── length-15.txt │ │ │ ├── sorted-by-length.txt │ │ │ └── sorted.txt │ ├── words-100 │ ├── words-15000 │ └── words-5000 └── shared │ ├── Cargo.lock │ ├── Cargo.toml │ └── lib.rs ├── fuzz ├── 
.gitignore ├── Cargo.toml └── fuzz-targets │ └── fuzz_find.rs ├── rustfmt.toml └── src ├── ahocorasick.rs ├── automaton.rs ├── dfa.rs ├── lib.rs ├── macros.rs ├── nfa ├── contiguous.rs ├── mod.rs └── noncontiguous.rs ├── packed ├── api.rs ├── ext.rs ├── mod.rs ├── pattern.rs ├── rabinkarp.rs ├── teddy │ ├── README.md │ ├── builder.rs │ ├── generic.rs │ └── mod.rs ├── tests.rs └── vector.rs ├── tests.rs ├── transducer.rs └── util ├── alphabet.rs ├── buffer.rs ├── byte_frequencies.rs ├── debug.rs ├── error.rs ├── int.rs ├── mod.rs ├── prefilter.rs ├── primitives.rs ├── remapper.rs ├── search.rs └── special.rs /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [BurntSushi] 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - master 7 | schedule: 8 | - cron: '00 01 * * *' 9 | 10 | # The section is needed to drop write-all permissions that are granted on 11 | # `schedule` event. By specifying any permission explicitly all others are set 12 | # to none. By using the principle of least privilege the damage a compromised 13 | # workflow can do (because of an injection or compromised third party tool or 14 | # action) is restricted. Currently the workflow doesn't need any additional 15 | # permission except for pulling the code. Adding labels to issues, commenting 16 | # on pull-requests, etc. 
may need additional permissions: 17 | # 18 | # Syntax for this section: 19 | # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#permissions 20 | # 21 | # Reference for how to assign permissions on a job-by-job basis: 22 | # https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs 23 | # 24 | # Reference for available permissions that we can enable if needed: 25 | # https://docs.github.com/en/actions/security-guides/automatic-token-authentication#permissions-for-the-github_token 26 | permissions: 27 | # to fetch code (actions/checkout) 28 | contents: read 29 | 30 | jobs: 31 | test: 32 | name: test 33 | env: 34 | # For some builds, we use cross to test on 32-bit and big-endian 35 | # systems. 36 | CARGO: cargo 37 | # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`. 38 | # Note that we only use cross on Linux, so setting a target on a 39 | # different OS will just use normal cargo. 40 | TARGET: 41 | # Bump this as appropriate. We pin to a version to make sure CI 42 | # continues to work as cross releases in the past have broken things 43 | # in subtle ways. 
44 | CROSS_VERSION: v0.2.5 45 | runs-on: ${{ matrix.os }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | include: 50 | - build: pinned 51 | os: ubuntu-latest 52 | rust: 1.60.0 53 | - build: stable 54 | os: ubuntu-latest 55 | rust: stable 56 | - build: stable-x86 57 | os: ubuntu-latest 58 | rust: stable 59 | target: i686-unknown-linux-gnu 60 | - build: stable-aarch64 61 | os: ubuntu-latest 62 | rust: stable 63 | target: aarch64-unknown-linux-gnu 64 | - build: stable-powerpc64 65 | os: ubuntu-latest 66 | rust: stable 67 | target: powerpc64-unknown-linux-gnu 68 | - build: stable-s390x 69 | os: ubuntu-latest 70 | rust: stable 71 | target: s390x-unknown-linux-gnu 72 | - build: beta 73 | os: ubuntu-latest 74 | rust: beta 75 | - build: nightly 76 | os: ubuntu-latest 77 | rust: nightly 78 | - build: macos 79 | os: macos-latest 80 | rust: stable 81 | - build: win-msvc 82 | os: windows-latest 83 | rust: stable 84 | - build: win-gnu 85 | os: windows-latest 86 | rust: stable-x86_64-gnu 87 | steps: 88 | - name: Checkout repository 89 | uses: actions/checkout@v3 90 | - name: Install Rust 91 | uses: dtolnay/rust-toolchain@master 92 | with: 93 | toolchain: ${{ matrix.rust }} 94 | - name: Install and configure Cross 95 | if: matrix.os == 'ubuntu-latest' && matrix.target != '' 96 | run: | 97 | # In the past, new releases of 'cross' have broken CI. So for now, we 98 | # pin it. We also use their pre-compiled binary releases because cross 99 | # has over 100 dependencies and takes a bit to compile. 100 | dir="$RUNNER_TEMP/cross-download" 101 | mkdir "$dir" 102 | echo "$dir" >> $GITHUB_PATH 103 | cd "$dir" 104 | curl -LO "https://github.com/cross-rs/cross/releases/download/$CROSS_VERSION/cross-x86_64-unknown-linux-musl.tar.gz" 105 | tar xf cross-x86_64-unknown-linux-musl.tar.gz 106 | 107 | # We used to install 'cross' from master, but it kept failing. So now 108 | # we build from a known-good version until 'cross' becomes more stable 109 | # or we find an alternative. 
Notably, between v0.2.1 and current 110 | # master (2022-06-14), the number of Cross's dependencies has doubled. 111 | echo "CARGO=cross" >> $GITHUB_ENV 112 | echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV 113 | - name: Show command used for Cargo 114 | run: | 115 | echo "cargo command is: ${{ env.CARGO }}" 116 | echo "target flag is: ${{ env.TARGET }}" 117 | - name: Show CPU info for debugging 118 | if: matrix.os == 'ubuntu-latest' 119 | run: lscpu 120 | # See: https://github.com/rust-lang/regex/blob/a2887636930156023172e4b376a6febad4e49120/.github/workflows/ci.yml#L145-L163 121 | - name: Pin memchr to 2.6.2 122 | if: matrix.build == 'pinned' 123 | run: cargo update -p memchr --precise 2.6.2 124 | - run: ${{ env.CARGO }} build --verbose $TARGET 125 | - run: ${{ env.CARGO }} doc --verbose $TARGET 126 | - run: ${{ env.CARGO }} test --verbose $TARGET 127 | - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features std,perf-literal $TARGET 128 | - run: ${{ env.CARGO }} test --lib --verbose --no-default-features $TARGET 129 | - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features std $TARGET 130 | - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features perf-literal $TARGET 131 | - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features std,perf-literal,logging $TARGET 132 | - if: matrix.build == 'nightly' 133 | run: ${{ env.CARGO }} build --manifest-path aho-corasick-debug/Cargo.toml $TARGET 134 | 135 | rustfmt: 136 | name: rustfmt 137 | runs-on: ubuntu-latest 138 | steps: 139 | - name: Checkout repository 140 | uses: actions/checkout@v3 141 | - name: Install Rust 142 | uses: dtolnay/rust-toolchain@master 143 | with: 144 | toolchain: stable 145 | components: rustfmt 146 | - name: Check formatting 147 | run: | 148 | cargo fmt --all -- --check 149 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .*.swp 2 | doc 3 | tags 4 | examples/ss10pusa.csv 5 | build 6 | target 7 | /Cargo.lock 8 | scratch* 9 | bench_large/huge 10 | BREADCRUMBS 11 | /tmp 12 | /aho-corasick-debug/Cargo.lock 13 | -------------------------------------------------------------------------------- /.vim/coc-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "rust-analyzer.linkedProjects": [ 3 | "aho-corasick-debug/Cargo.toml", 4 | "benchmarks/engines/rust-aho-corasick/Cargo.toml", 5 | "benchmarks/engines/rust-daachorse/Cargo.toml", 6 | "benchmarks/engines/rust-jetscii/Cargo.toml", 7 | "benchmarks/engines/naive/Cargo.toml", 8 | "benchmarks/shared/Cargo.toml", 9 | "fuzz/Cargo.toml", 10 | "Cargo.toml" 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | This project is dual-licensed under the Unlicense and MIT licenses. 2 | 3 | You may use this code under the terms of either license. 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "aho-corasick" 3 | version = "1.1.3" #:version 4 | authors = ["Andrew Gallant "] 5 | description = "Fast multiple substring searching." 
6 | homepage = "https://github.com/BurntSushi/aho-corasick" 7 | repository = "https://github.com/BurntSushi/aho-corasick" 8 | readme = "README.md" 9 | keywords = ["string", "search", "text", "pattern", "multi"] 10 | license = "Unlicense OR MIT" 11 | categories = ["text-processing"] 12 | autotests = false 13 | exclude = ["/aho-corasick-debug", "/benchmarks", "/tmp"] 14 | edition = "2021" 15 | rust-version = "1.60.0" 16 | 17 | [lib] 18 | name = "aho_corasick" 19 | 20 | [features] 21 | default = ["std", "perf-literal"] 22 | std = ["memchr?/std"] 23 | 24 | # Enables prefilter optimizations that depend on external crates. 25 | perf-literal = ["dep:memchr"] 26 | 27 | # Enable logging via the 'log' crate. This is useful for seeing messages about 28 | # internal decisions and metrics. For example, how the choice of the internal 29 | # Aho-Corasick implementation is used or the heap usage of an automaton. 30 | logging = ["dep:log"] 31 | 32 | # Provides a trait impl for fst::Automaton for nfa::noncontiguous::NFA, 33 | # nfa::contiguous::NFA and dfa::DFA. This is useful for searching an 34 | # FST with an Aho-Corasick automaton. Note that this does not apply 35 | # to the top-level 'AhoCorasick' type, as it does not implement the 36 | # aho_corasick::automaton::Automaton trait, and thus enabling this feature does 37 | # not cause it to implement fst::Automaton either. 38 | # 39 | # NOTE: Currently this feature is not available as `fst` is not at 1.0 yet, 40 | # and this would make `fst` a public dependency. If you absolutely need this, 41 | # you can copy the (very small) src/transducer.rs file to your tree. It 42 | # specifically does not use any private APIs and should work after replacing 43 | # 'crate::' with 'aho_corasick::'. 44 | # 45 | # NOTE: I think my current plan is to flip this around an add an optional 46 | # dependency on 'aho-corasick' to the 'fst' crate and move the trait impls 47 | # there. But I haven't gotten around to it yet. 
48 | # transducer = ["fst"] 49 | 50 | [dependencies] 51 | log = { version = "0.4.17", optional = true } 52 | memchr = { version = "2.4.0", default-features = false, optional = true } 53 | 54 | [dev-dependencies] 55 | doc-comment = "0.3.3" 56 | # fst = "0.4.5" 57 | 58 | [package.metadata.docs.rs] 59 | # We want to document all features. 60 | all-features = true 61 | # This opts into a nightly unstable option to show the features that need to be 62 | # enabled for public API items. To do that, we set 'docsrs', and when that's 63 | # enabled, we enable the 'doc_auto_cfg' feature. 64 | # 65 | # To test this locally, run: 66 | # 67 | # RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features 68 | rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"] 69 | 70 | [profile.release] 71 | debug = true 72 | 73 | [profile.bench] 74 | debug = true 75 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Andrew Gallant 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | aho-corasick 2 | ============ 3 | A library for finding occurrences of many patterns at once with SIMD 4 | acceleration in some cases. This library provides multiple pattern 5 | search principally through an implementation of the 6 | [Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), 7 | which builds a finite state machine for executing searches in linear time. 8 | Features include case insensitive matching, overlapping matches, fast searching 9 | via SIMD and optional full DFA construction and search & replace in streams. 10 | 11 | [![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions) 12 | [![crates.io](https://img.shields.io/crates/v/aho-corasick.svg)](https://crates.io/crates/aho-corasick) 13 | 14 | Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). 15 | 16 | 17 | ### Documentation 18 | 19 | https://docs.rs/aho-corasick 20 | 21 | 22 | ### Usage 23 | 24 | Run `cargo add aho-corasick` to automatically add this crate as a dependency 25 | in your `Cargo.toml` file. 26 | 27 | 28 | ### Example: basic searching 29 | 30 | This example shows how to search for occurrences of multiple patterns 31 | simultaneously. Each match includes the pattern that matched along with the 32 | byte offsets of the match. 
33 | 34 | ```rust 35 | use aho_corasick::{AhoCorasick, PatternID}; 36 | 37 | let patterns = &["apple", "maple", "Snapple"]; 38 | let haystack = "Nobody likes maple in their apple flavored Snapple."; 39 | 40 | let ac = AhoCorasick::new(patterns).unwrap(); 41 | let mut matches = vec![]; 42 | for mat in ac.find_iter(haystack) { 43 | matches.push((mat.pattern(), mat.start(), mat.end())); 44 | } 45 | assert_eq!(matches, vec![ 46 | (PatternID::must(1), 13, 18), 47 | (PatternID::must(0), 28, 33), 48 | (PatternID::must(2), 43, 50), 49 | ]); 50 | ``` 51 | 52 | 53 | ### Example: ASCII case insensitivity 54 | 55 | This is like the previous example, but matches `Snapple` case insensitively 56 | using `AhoCorasickBuilder`: 57 | 58 | ```rust 59 | use aho_corasick::{AhoCorasick, PatternID}; 60 | 61 | let patterns = &["apple", "maple", "snapple"]; 62 | let haystack = "Nobody likes maple in their apple flavored Snapple."; 63 | 64 | let ac = AhoCorasick::builder() 65 | .ascii_case_insensitive(true) 66 | .build(patterns) 67 | .unwrap(); 68 | let mut matches = vec![]; 69 | for mat in ac.find_iter(haystack) { 70 | matches.push((mat.pattern(), mat.start(), mat.end())); 71 | } 72 | assert_eq!(matches, vec![ 73 | (PatternID::must(1), 13, 18), 74 | (PatternID::must(0), 28, 33), 75 | (PatternID::must(2), 43, 50), 76 | ]); 77 | ``` 78 | 79 | 80 | ### Example: replacing matches in a stream 81 | 82 | This example shows how to execute a search and replace on a stream without 83 | loading the entire stream into memory first. 84 | 85 | ```rust,ignore 86 | use aho_corasick::AhoCorasick; 87 | 88 | let patterns = &["fox", "brown", "quick"]; 89 | let replace_with = &["sloth", "grey", "slow"]; 90 | 91 | // In a real example, these might be `std::fs::File`s instead. All you need to 92 | // do is supply a pair of `std::io::Read` and `std::io::Write` implementations. 
93 | let rdr = "The quick brown fox."; 94 | let mut wtr = vec![]; 95 | 96 | let ac = AhoCorasick::new(patterns).unwrap(); 97 | ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with) 98 | .expect("stream_replace_all failed"); 99 | assert_eq!(b"The slow grey sloth.".to_vec(), wtr); 100 | ``` 101 | 102 | 103 | ### Example: finding the leftmost first match 104 | 105 | In the textbook description of Aho-Corasick, its formulation is typically 106 | structured such that it reports all possible matches, even when they overlap 107 | with another. In many cases, overlapping matches may not be desired, such as 108 | the case of finding all successive non-overlapping matches like you might with 109 | a standard regular expression. 110 | 111 | Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do 112 | this doesn't always work in the expected way, since it will report matches as 113 | soon as they are seen. For example, consider matching the regex `Samwise|Sam` 114 | against the text `Samwise`. Most regex engines (that are Perl-like, or 115 | non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick 116 | algorithm modified for reporting non-overlapping matches will report `Sam`. 117 | 118 | A novel contribution of this library is the ability to change the match 119 | semantics of Aho-Corasick (without additional search time overhead) such that 120 | `Samwise` is reported instead. 
For example, here's the standard approach: 121 | 122 | ```rust 123 | use aho_corasick::AhoCorasick; 124 | 125 | let patterns = &["Samwise", "Sam"]; 126 | let haystack = "Samwise"; 127 | 128 | let ac = AhoCorasick::new(patterns).unwrap(); 129 | let mat = ac.find(haystack).expect("should have a match"); 130 | assert_eq!("Sam", &haystack[mat.start()..mat.end()]); 131 | ``` 132 | 133 | And now here's the leftmost-first version, which matches how a Perl-like 134 | regex will work: 135 | 136 | ```rust 137 | use aho_corasick::{AhoCorasick, MatchKind}; 138 | 139 | let patterns = &["Samwise", "Sam"]; 140 | let haystack = "Samwise"; 141 | 142 | let ac = AhoCorasick::builder() 143 | .match_kind(MatchKind::LeftmostFirst) 144 | .build(patterns) 145 | .unwrap(); 146 | let mat = ac.find(haystack).expect("should have a match"); 147 | assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); 148 | ``` 149 | 150 | In addition to leftmost-first semantics, this library also supports 151 | leftmost-longest semantics, which match the POSIX behavior of a regular 152 | expression alternation. See `MatchKind` in the docs for more details. 153 | 154 | 155 | ### Minimum Rust version policy 156 | 157 | This crate's minimum supported `rustc` version is `1.60.0`. 158 | 159 | The current policy is that the minimum Rust version required to use this crate 160 | can be increased in minor version updates. For example, if `crate 1.0` requires 161 | Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust 162 | 1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum 163 | version of Rust. 164 | 165 | In general, this crate will be conservative with respect to the minimum 166 | supported version of Rust. 167 | 168 | 169 | ### FFI bindings 170 | 171 | * [G-Research/ahocorasick_rs](https://github.com/G-Research/ahocorasick_rs/) 172 | is a Python wrapper for this library. 
173 | * [tmikus/ahocorasick_rs](https://github.com/tmikus/ahocorasick_rs) is a Go 174 | wrapper for this library. 175 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /aho-corasick-debug/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "aho-corasick-debug" 4 | version = "0.0.1" 5 | authors = ["Andrew Gallant "] 6 | description = "A simple command line tool for playing with Aho-Corasick." 
7 | license = "Unlicense/MIT" 8 | categories = ["text-processing"] 9 | autotests = false 10 | edition = "2018" 11 | 12 | [[bin]] 13 | name = "aho-corasick-debug" 14 | path = "main.rs" 15 | 16 | [dependencies] 17 | aho-corasick = { version = "*", path = "..", features = ["logging"] } 18 | anyhow = "1.0.68" 19 | memmap2 = "0.5.8" 20 | 21 | [dependencies.env_logger] 22 | version = "0.9.3" 23 | default-features = false 24 | # I am quite annoyed that 'auto-color' brings in a whole mess of dependencies, 25 | # so I am keeping it disabled. 26 | features = ["humantime"] 27 | 28 | [dependencies.clap] 29 | version = "2.34.0" 30 | default-features = false 31 | 32 | [profile.release] 33 | debug = true 34 | -------------------------------------------------------------------------------- /aho-corasick-debug/main.rs: -------------------------------------------------------------------------------- 1 | use std::{fs, path::PathBuf, time::Instant}; 2 | 3 | use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind, StartKind}; 4 | use memmap2::Mmap; 5 | 6 | fn main() -> anyhow::Result<()> { 7 | env_logger::try_init()?; 8 | 9 | let args = Args::parse()?; 10 | let ac = args.aho_corasick()?; 11 | let haystack = args.haystack()?; 12 | 13 | eprintln!("automaton heap usage: {} bytes", ac.memory_usage()); 14 | if args.no_search || args.debug { 15 | if args.debug { 16 | eprintln!("{:?}", ac); 17 | } 18 | return Ok(()); 19 | } 20 | 21 | let start = Instant::now(); 22 | let count = ac.find_iter(&haystack).count(); 23 | println!("match count: {}", count); 24 | 25 | let count_time = Instant::now().duration_since(start); 26 | eprintln!("count time: {:?}", count_time); 27 | Ok(()) 28 | } 29 | 30 | #[derive(Debug)] 31 | struct Args { 32 | dictionary: PathBuf, 33 | haystack: PathBuf, 34 | match_kind: MatchKind, 35 | start_kind: StartKind, 36 | kind: Option, 37 | ascii_casei: bool, 38 | dense_depth: usize, 39 | no_prefilter: bool, 40 | no_classes: bool, 41 | no_search: bool, 42 | debug: bool, 43 | } 
44 | 45 | impl Args { 46 | fn parse() -> anyhow::Result { 47 | use clap::{crate_authors, crate_version, App, Arg}; 48 | 49 | let parsed = App::new("Search using aho-corasick") 50 | .author(crate_authors!()) 51 | .version(crate_version!()) 52 | .max_term_width(100) 53 | .arg(Arg::with_name("dictionary").required(true)) 54 | .arg(Arg::with_name("haystack").required(true)) 55 | .arg( 56 | Arg::with_name("kind") 57 | .long("kind") 58 | .possible_values(&[ 59 | "auto", 60 | "noncontiguous", 61 | "contiguous", 62 | "dfa", 63 | ]) 64 | .default_value("auto"), 65 | ) 66 | .arg( 67 | Arg::with_name("match-kind") 68 | .long("match-kind") 69 | .possible_values(&[ 70 | "standard", 71 | "leftmost-first", 72 | "leftmost-longest", 73 | ]) 74 | .default_value("standard"), 75 | ) 76 | .arg( 77 | Arg::with_name("start-kind") 78 | .long("start-kind") 79 | .possible_values(&["both", "unanchored", "anchored"]) 80 | .default_value("unanchored"), 81 | ) 82 | .arg( 83 | Arg::with_name("ascii-case-insensitive") 84 | .long("ascii-case-insensitive") 85 | .short("i"), 86 | ) 87 | .arg( 88 | Arg::with_name("dense-depth") 89 | .long("dense-depth") 90 | .default_value("3"), 91 | ) 92 | .arg( 93 | Arg::with_name("no-prefilter").long("no-prefilter").short("f"), 94 | ) 95 | .arg(Arg::with_name("no-classes").long("no-classes").short("C")) 96 | .arg(Arg::with_name("no-search").long("no-search")) 97 | .arg(Arg::with_name("debug").long("debug")) 98 | .get_matches(); 99 | 100 | let dictionary = 101 | PathBuf::from(parsed.value_of_os("dictionary").unwrap()); 102 | let haystack = PathBuf::from(parsed.value_of_os("haystack").unwrap()); 103 | let match_kind = match parsed.value_of("match-kind").unwrap() { 104 | "standard" => MatchKind::Standard, 105 | "leftmost-first" => MatchKind::LeftmostFirst, 106 | "leftmost-longest" => MatchKind::LeftmostLongest, 107 | _ => unreachable!(), 108 | }; 109 | let start_kind = match parsed.value_of("start-kind").unwrap() { 110 | "both" => StartKind::Both, 111 | "unanchored" 
=> StartKind::Unanchored, 112 | "anchored" => StartKind::Anchored, 113 | _ => unreachable!(), 114 | }; 115 | let kind = match parsed.value_of("kind").unwrap() { 116 | "auto" => None, 117 | "noncontiguous" => Some(AhoCorasickKind::NoncontiguousNFA), 118 | "contiguous" => Some(AhoCorasickKind::ContiguousNFA), 119 | "dfa" => Some(AhoCorasickKind::DFA), 120 | _ => unreachable!(), 121 | }; 122 | let dense_depth = parsed.value_of("dense-depth").unwrap().parse()?; 123 | 124 | Ok(Args { 125 | dictionary, 126 | haystack, 127 | match_kind, 128 | start_kind, 129 | kind, 130 | dense_depth, 131 | ascii_casei: parsed.is_present("ascii-case-insensitive"), 132 | no_prefilter: parsed.is_present("no-prefilter"), 133 | no_classes: parsed.is_present("no-classes"), 134 | no_search: parsed.is_present("no-search"), 135 | debug: parsed.is_present("debug"), 136 | }) 137 | } 138 | 139 | fn aho_corasick(&self) -> anyhow::Result { 140 | let start = Instant::now(); 141 | let patterns = fs::read_to_string(&self.dictionary)?; 142 | let read_time = Instant::now().duration_since(start); 143 | eprintln!("pattern read time: {:?}", read_time); 144 | 145 | let start = Instant::now(); 146 | let ac = AhoCorasick::builder() 147 | .match_kind(self.match_kind) 148 | .start_kind(self.start_kind) 149 | .kind(self.kind) 150 | .ascii_case_insensitive(self.ascii_casei) 151 | .dense_depth(self.dense_depth) 152 | .prefilter(!self.no_prefilter) 153 | .byte_classes(!self.no_classes) 154 | .build(patterns.lines())?; 155 | let build_time = Instant::now().duration_since(start); 156 | eprintln!("automaton build time: {:?}", build_time); 157 | Ok(ac) 158 | } 159 | 160 | fn haystack(&self) -> anyhow::Result { 161 | // SAFETY: We only read from this content and generally assume the file 162 | // is not mutated while it is searched. 163 | Ok(unsafe { Mmap::map(&fs::File::open(&self.haystack)?)? 
}) 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /benchmarks/definitions/build.toml: -------------------------------------------------------------------------------- 1 | [[bench]] 2 | model = "compile" 3 | name = "empty" 4 | regex = [] 5 | haystack = "" 6 | count = 0 7 | engines = [ 8 | "rust/aho-corasick/default/standard", 9 | "rust/aho-corasick/default/leftmost-first", 10 | "rust/aho-corasick/default/leftmost-longest", 11 | "rust/old-aho-corasick/default/standard", 12 | "rust/old-aho-corasick/default/leftmost-first", 13 | "rust/old-aho-corasick/default/leftmost-longest", 14 | "naive/rust/memchr/memmem", 15 | ] 16 | 17 | [[bench]] 18 | model = "compile" 19 | name = "onebyte" 20 | regex = ["a"] 21 | haystack = "a" 22 | count = 1 23 | engines = [ 24 | "rust/aho-corasick/default/standard", 25 | "rust/aho-corasick/default/leftmost-first", 26 | "rust/aho-corasick/default/leftmost-longest", 27 | "rust/aho-corasick/packed/leftmost-first", 28 | "rust/old-aho-corasick/default/standard", 29 | "rust/old-aho-corasick/default/leftmost-first", 30 | "rust/old-aho-corasick/default/leftmost-longest", 31 | "rust/old-aho-corasick/packed/leftmost-first", 32 | "daachorse/bytewise/leftmost-first", 33 | "daachorse/bytewise/leftmost-longest", 34 | "naive/rust/memchr/memmem", 35 | ] 36 | 37 | [[bench]] 38 | model = "compile" 39 | name = "twobytes" 40 | regex = ["a", "b"] 41 | haystack = "ab" 42 | count = 2 43 | engines = [ 44 | "rust/aho-corasick/default/standard", 45 | "rust/aho-corasick/default/leftmost-first", 46 | "rust/aho-corasick/default/leftmost-longest", 47 | "rust/aho-corasick/packed/leftmost-first", 48 | "rust/old-aho-corasick/default/standard", 49 | "rust/old-aho-corasick/default/leftmost-first", 50 | "rust/old-aho-corasick/default/leftmost-longest", 51 | "rust/old-aho-corasick/packed/leftmost-first", 52 | "daachorse/bytewise/leftmost-first", 53 | "daachorse/bytewise/leftmost-longest", 54 | "naive/rust/memchr/memmem", 55 
| ] 56 | 57 | [[bench]] 58 | model = "compile" 59 | name = "many-short" 60 | regex = [ 61 | "ADL", "ADl", "AdL", "Adl", "BAK", "BAk", "BAK", "BaK", "Bak", 62 | "BaK", "HOL", "HOl", "HoL", "Hol", "IRE", "IRe", "IrE", "Ire", 63 | "JOH", "JOh", "JoH", "Joh", "SHE", "SHe", "ShE", "She", "WAT", 64 | "WAt", "WaT", "Wat", "aDL", "aDl", "adL", "adl", "bAK", "bAk", 65 | "bAK", "baK", "bak", "baK", "hOL", "hOl", "hoL", "hol", "iRE", 66 | "iRe", "irE", "ire", "jOH", "jOh", "joH", "joh", "sHE", "sHe", 67 | "shE", "she", "wAT", "wAt", "waT", "wat", "ſHE", "ſHe", "ſhE", 68 | "ſhe", 69 | ] 70 | haystack = "ſhe" 71 | count = 1 72 | engines = [ 73 | "rust/aho-corasick/default/standard", 74 | "rust/aho-corasick/default/leftmost-first", 75 | "rust/aho-corasick/default/leftmost-longest", 76 | "rust/aho-corasick/packed/leftmost-first", 77 | "rust/old-aho-corasick/default/standard", 78 | "rust/old-aho-corasick/default/leftmost-first", 79 | "rust/old-aho-corasick/default/leftmost-longest", 80 | "rust/old-aho-corasick/packed/leftmost-first", 81 | "daachorse/bytewise/leftmost-first", 82 | "daachorse/bytewise/leftmost-longest", 83 | "naive/rust/memchr/memmem", 84 | ] 85 | 86 | [[bench]] 87 | model = "compile" 88 | name = "words5000" 89 | regex = { path = "words-5000", per-line = "pattern" } 90 | haystack = "transfuse" 91 | count = 1 92 | engines = [ 93 | "rust/aho-corasick/default/standard", 94 | "rust/aho-corasick/default/leftmost-first", 95 | "rust/aho-corasick/default/leftmost-longest", 96 | "rust/old-aho-corasick/default/standard", 97 | "rust/old-aho-corasick/default/leftmost-first", 98 | "rust/old-aho-corasick/default/leftmost-longest", 99 | "daachorse/bytewise/leftmost-first", 100 | "daachorse/bytewise/leftmost-longest", 101 | "naive/rust/memchr/memmem", 102 | ] 103 | 104 | [[bench]] 105 | model = "compile" 106 | name = "words15000" 107 | regex = { path = "words-15000", per-line = "pattern" } 108 | haystack = "dovetailing" 109 | count = 1 110 | engines = [ 111 | 
"rust/aho-corasick/default/standard", 112 | "rust/aho-corasick/default/leftmost-first", 113 | "rust/aho-corasick/default/leftmost-longest", 114 | "rust/old-aho-corasick/default/standard", 115 | "rust/old-aho-corasick/default/leftmost-first", 116 | "rust/old-aho-corasick/default/leftmost-longest", 117 | "daachorse/bytewise/leftmost-first", 118 | "daachorse/bytewise/leftmost-longest", 119 | "naive/rust/memchr/memmem", 120 | ] 121 | -------------------------------------------------------------------------------- /benchmarks/definitions/curated.toml: -------------------------------------------------------------------------------- 1 | analysis = ''' 2 | This is a WIP for building out a curated set of Aho-Corasick benchmarks. 3 | The next step is not to actually add more benchmarks, but to hook up more 4 | Aho-Corasick libraries. There are a lot of them and it's a fair bit of work to 5 | do. 6 | ''' 7 | 8 | [[bench]] 9 | model = "count" 10 | name = "sherlock" 11 | regex = [ 12 | 'Sherlock Holmes', 13 | 'John Watson', 14 | 'Irene Adler', 15 | 'Inspector Lestrade', 16 | 'Professor Moriarty', 17 | ] 18 | haystack = { path = "opensubtitles/en-sampled.txt" } 19 | count = 714 20 | engines = [ 21 | "rust/aho-corasick/default/standard", 22 | "rust/aho-corasick/default/leftmost-first", 23 | "daachorse/bytewise/standard", 24 | "daachorse/bytewise/leftmost-first", 25 | "naive/rust/memchr/memmem", 26 | ] 27 | 28 | [[bench]] 29 | model = "count" 30 | name = "dictionary-15" 31 | regex = { path = "dictionary/english/length-15.txt", per-line = "pattern" } 32 | haystack = { path = "opensubtitles/en-sampled.txt" } 33 | count = 15 34 | engines = [ 35 | "rust/aho-corasick/default/standard", 36 | "rust/aho-corasick/default/overlapping", 37 | "rust/aho-corasick/default/leftmost-first", 38 | "daachorse/bytewise/standard", 39 | "daachorse/bytewise/overlapping", 40 | "daachorse/bytewise/leftmost-first", 41 | "naive/rust/memchr/memmem", 42 | ] 43 | analysis = ''' 44 | Looks for occurrences of
2,663 words of length at least 15. 45 | ''' 46 | -------------------------------------------------------------------------------- /benchmarks/definitions/jetscii.toml: -------------------------------------------------------------------------------- 1 | analysis = ''' 2 | These benchmarks were ported out of the jetscii crate, specifically from 3 | [Dr-Emann's PR]. There were some irregularities in the benchmark results, so 4 | I thought it might be interesting to include it here. 5 | 6 | We add "real" variants of each benchmark as well using a small XML data set on 7 | mental health. The original benchmarks search a haystack consisting entirely of 8 | `a` repeated, with the last byte corresponding to one of the needle bytes. This 9 | is useful for measuring pure throughput, but less good for approximating real 10 | world performance. In this case, for at least `xml-delim3` and `xml-delim5`, it 11 | seems like an XML haystack would be better suited. 12 | 13 | [Dr-Emann's PR]: https://github.com/shepmaster/jetscii/pull/57 14 | ''' 15 | 16 | [[bench]] 17 | model = "count" 18 | name = "space-repeateda" 19 | regex = [' '] 20 | haystack = { contents = "a", repeat = 5_242_880, append = " " } 21 | count = 1 22 | engines = [ 23 | "daachorse/bytewise/leftmost-first", 24 | "rust/aho-corasick/dfa/leftmost-first", 25 | "rust/aho-corasick/packed/leftmost-first", 26 | "rust/old-aho-corasick/packed/leftmost-first", 27 | "rust/jetscii/ascii-chars/prebuilt", 28 | ] 29 | 30 | [[bench]] 31 | model = "count" 32 | name = "xmldelim3-repeateda" 33 | regex = ['<', '>', '&'] 34 | haystack = { contents = "a", repeat = 5_242_880, append = "&" } 35 | count = 1 36 | engines = [ 37 | "daachorse/bytewise/leftmost-first", 38 | "rust/aho-corasick/dfa/leftmost-first", 39 | "rust/aho-corasick/packed/leftmost-first", 40 | "rust/old-aho-corasick/packed/leftmost-first", 41 | "rust/jetscii/ascii-chars/prebuilt", 42 | ] 43 | 44 | [[bench]] 45 | model = "count" 46 | name = "xmldelim5-repeateda" 47 | regex = 
['<', '>', '&', "'", '"'] 48 | haystack = { contents = "a", repeat = 5_242_880, append = '"' } 49 | count = 1 50 | engines = [ 51 | "daachorse/bytewise/leftmost-first", 52 | "rust/aho-corasick/dfa/leftmost-first", 53 | "rust/aho-corasick/packed/leftmost-first", 54 | "rust/old-aho-corasick/packed/leftmost-first", 55 | "rust/jetscii/ascii-chars/prebuilt", 56 | ] 57 | 58 | [[bench]] 59 | model = "count" 60 | name = "big16-repeateda" 61 | regex = [ 62 | 'A', 'B', 'C', 'D', 63 | 'E', 'F', 'G', 'H', 64 | 'I', 'J', 'K', 'L', 65 | 'M', 'N', 'O', 'P', 66 | ] 67 | haystack = { contents = "a", repeat = 5_242_880, append = "P" } 68 | count = 1 69 | engines = [ 70 | "daachorse/bytewise/leftmost-first", 71 | "rust/aho-corasick/dfa/leftmost-first", 72 | "rust/aho-corasick/packed/leftmost-first", 73 | "rust/old-aho-corasick/packed/leftmost-first", 74 | "rust/jetscii/ascii-chars/prebuilt", 75 | ] 76 | 77 | [[bench]] 78 | model = "count" 79 | name = "big16earlyshort-repeateda" 80 | regex = [ 81 | 'A', 'B', 'C', 'D', 82 | 'E', 'F', 'G', 'H', 83 | 'I', 'J', 'K', 'L', 84 | 'M', 'N', 'O', 'P', 85 | ] 86 | haystack = { contents = "Pa" } 87 | count = 1 88 | engines = [ 89 | "daachorse/bytewise/leftmost-first", 90 | "rust/aho-corasick/dfa/leftmost-first", 91 | "rust/aho-corasick/packed/leftmost-first", 92 | "rust/old-aho-corasick/packed/leftmost-first", 93 | "rust/jetscii/ascii-chars/prebuilt", 94 | ] 95 | 96 | [[bench]] 97 | model = "count" 98 | name = "big16earlylong-repeateda" 99 | regex = [ 100 | 'A', 'B', 'C', 'D', 101 | 'E', 'F', 'G', 'H', 102 | 'I', 'J', 'K', 'L', 103 | 'M', 'N', 'O', 'P', 104 | ] 105 | haystack = { contents = "a", repeat = 14, append = "P" } 106 | count = 1 107 | engines = [ 108 | "daachorse/bytewise/leftmost-first", 109 | "rust/aho-corasick/dfa/leftmost-first", 110 | "rust/aho-corasick/packed/leftmost-first", 111 | "rust/old-aho-corasick/packed/leftmost-first", 112 | "rust/jetscii/ascii-chars/prebuilt", 113 | ] 114 | 115 | [[bench]] 116 | model = "count" 117 | 
name = "space-mentalhealth" 118 | regex = [' '] 119 | haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" } 120 | count = 1_181_201 121 | engines = [ 122 | "daachorse/bytewise/leftmost-first", 123 | "rust/aho-corasick/dfa/leftmost-first", 124 | "rust/aho-corasick/packed/leftmost-first", 125 | "rust/old-aho-corasick/packed/leftmost-first", 126 | "rust/jetscii/ascii-chars/prebuilt", 127 | ] 128 | 129 | [[bench]] 130 | model = "count" 131 | name = "xmldelim3-mentalhealth" 132 | regex = ['<', '>', '&'] 133 | haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" } 134 | count = 604_714 135 | engines = [ 136 | "daachorse/bytewise/leftmost-first", 137 | "rust/aho-corasick/dfa/leftmost-first", 138 | "rust/aho-corasick/packed/leftmost-first", 139 | "rust/old-aho-corasick/packed/leftmost-first", 140 | "rust/jetscii/ascii-chars/prebuilt", 141 | ] 142 | 143 | [[bench]] 144 | model = "count" 145 | name = "xmldelim5-mentalhealth" 146 | regex = ['<', '>', '&', "'", '"'] 147 | haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" } 148 | count = 688_252 149 | engines = [ 150 | "daachorse/bytewise/leftmost-first", 151 | "rust/aho-corasick/dfa/leftmost-first", 152 | "rust/aho-corasick/packed/leftmost-first", 153 | "rust/old-aho-corasick/packed/leftmost-first", 154 | "rust/jetscii/ascii-chars/prebuilt", 155 | ] 156 | 157 | [[bench]] 158 | model = "count" 159 | name = "big16-mentalhealth" 160 | regex = [ 161 | 'A', 'B', 'C', 'D', 162 | 'E', 'F', 'G', 'H', 163 | 'I', 'J', 'K', 'L', 164 | 'M', 'N', 'O', 'P', 165 | ] 166 | haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" } 167 | count = 176_447 168 | engines = [ 169 | "daachorse/bytewise/leftmost-first", 170 | "rust/aho-corasick/dfa/leftmost-first", 171 | "rust/aho-corasick/packed/leftmost-first", 172 | "rust/old-aho-corasick/packed/leftmost-first", 173 | "rust/jetscii/ascii-chars/prebuilt", 174 | ] 175 | -------------------------------------------------------------------------------- 
/benchmarks/definitions/random/many.toml: -------------------------------------------------------------------------------- 1 | analysis = ''' 2 | Miscellaneous benchmarks on a large random haystack with a large pattern set. 3 | ''' 4 | 5 | [[bench]] 6 | model = "count" 7 | name = "words100" 8 | regex = { path = "words-100", per-line = "pattern" } 9 | haystack = { path = "random10x.txt" } 10 | count = 0 11 | engines = [ 12 | "rust/aho-corasick/default/standard", 13 | "rust/aho-corasick/default/leftmost-first", 14 | "rust/aho-corasick/default/leftmost-longest", 15 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 16 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 17 | "rust/aho-corasick/dfa/leftmost-first", 18 | "rust/old-aho-corasick/default/standard", 19 | "rust/old-aho-corasick/default/leftmost-first", 20 | "rust/old-aho-corasick/default/leftmost-longest", 21 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 22 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 23 | "rust/old-aho-corasick/dfa/leftmost-first", 24 | "daachorse/bytewise/leftmost-first", 25 | "daachorse/bytewise/leftmost-longest", 26 | "naive/rust/memchr/memmem", 27 | "naive/rust/std", 28 | ] 29 | 30 | [[bench]] 31 | model = "count" 32 | name = "words5000" 33 | regex = { path = "words-5000", per-line = "pattern" } 34 | haystack = { path = "random10x.txt" } 35 | count = 0 36 | engines = [ 37 | "rust/aho-corasick/default/standard", 38 | "rust/aho-corasick/default/leftmost-first", 39 | "rust/aho-corasick/default/leftmost-longest", 40 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 41 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 42 | "rust/aho-corasick/dfa/leftmost-first", 43 | "rust/old-aho-corasick/default/standard", 44 | "rust/old-aho-corasick/default/leftmost-first", 45 | "rust/old-aho-corasick/default/leftmost-longest", 46 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 47 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 48 | 
"rust/old-aho-corasick/dfa/leftmost-first", 49 | "daachorse/bytewise/leftmost-first", 50 | "daachorse/bytewise/leftmost-longest", 51 | "naive/rust/memchr/memmem", 52 | "naive/rust/std", 53 | ] 54 | -------------------------------------------------------------------------------- /benchmarks/definitions/random/memchr.toml: -------------------------------------------------------------------------------- 1 | analysis = ''' 2 | These benchmarks test the prefix byte optimization, and the impact that 3 | match-vs-non-match has. 4 | 5 | More specifically, Aho-Corasick will use highly optimized vectorized routines 6 | (on some targets) if it determines that all matches start with 1, 2 or 3 7 | distinct bytes. (Perhaps not in all cases. Even if there are 1-3 common bytes 8 | in the prefix, it could still use Teddy.) 9 | 10 | For match-vs-non-match, we keep the match counts fixed across the different 11 | prefix optimizations as a way to control what we measure. 12 | ''' 13 | 14 | [[bench]] 15 | model = "count" 16 | name = "onebyte-match" 17 | regex = ["a"] 18 | haystack = { path = "random.txt" } 19 | count = 352 20 | engines = [ 21 | "rust/aho-corasick/default/standard", 22 | "rust/aho-corasick/default/leftmost-first", 23 | "rust/aho-corasick/default/leftmost-longest", 24 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 25 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 26 | "rust/aho-corasick/dfa/leftmost-first", 27 | "rust/aho-corasick/packed/leftmost-first", 28 | "rust/old-aho-corasick/default/standard", 29 | "rust/old-aho-corasick/default/leftmost-first", 30 | "rust/old-aho-corasick/default/leftmost-longest", 31 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 32 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 33 | "rust/old-aho-corasick/dfa/leftmost-first", 34 | "rust/old-aho-corasick/packed/leftmost-first", 35 | "daachorse/bytewise/leftmost-first", 36 | "daachorse/bytewise/leftmost-longest", 37 | "naive/rust/memchr/memmem", 38 | 
"naive/rust/std", 39 | ] 40 | 41 | [[bench]] 42 | model = "count" 43 | name = "onebyte-nomatch" 44 | regex = ["\u0000"] 45 | haystack = { path = "random.txt" } 46 | count = 0 47 | engines = [ 48 | "rust/aho-corasick/default/standard", 49 | "rust/aho-corasick/default/leftmost-first", 50 | "rust/aho-corasick/default/leftmost-longest", 51 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 52 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 53 | "rust/aho-corasick/dfa/leftmost-first", 54 | "rust/aho-corasick/packed/leftmost-first", 55 | "rust/old-aho-corasick/default/standard", 56 | "rust/old-aho-corasick/default/leftmost-first", 57 | "rust/old-aho-corasick/default/leftmost-longest", 58 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 59 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 60 | "rust/old-aho-corasick/dfa/leftmost-first", 61 | "rust/old-aho-corasick/packed/leftmost-first", 62 | "daachorse/bytewise/leftmost-first", 63 | "daachorse/bytewise/leftmost-longest", 64 | "naive/rust/memchr/memmem", 65 | "naive/rust/std", 66 | ] 67 | 68 | [[bench]] 69 | model = "count" 70 | name = "twobytes-match" 71 | regex = ["a", "\u0000"] 72 | haystack = { path = "random.txt" } 73 | count = 352 74 | engines = [ 75 | "rust/aho-corasick/default/standard", 76 | "rust/aho-corasick/default/leftmost-first", 77 | "rust/aho-corasick/default/leftmost-longest", 78 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 79 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 80 | "rust/aho-corasick/dfa/leftmost-first", 81 | "rust/aho-corasick/packed/leftmost-first", 82 | "rust/old-aho-corasick/default/standard", 83 | "rust/old-aho-corasick/default/leftmost-first", 84 | "rust/old-aho-corasick/default/leftmost-longest", 85 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 86 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 87 | "rust/old-aho-corasick/dfa/leftmost-first", 88 | "rust/old-aho-corasick/packed/leftmost-first", 89 | 
"daachorse/bytewise/leftmost-first", 90 | "daachorse/bytewise/leftmost-longest", 91 | "naive/rust/memchr/memmem", 92 | "naive/rust/std", 93 | ] 94 | 95 | [[bench]] 96 | model = "count" 97 | name = "twobytes-nomatch" 98 | regex = ["\u0000", "\u0001"] 99 | haystack = { path = "random.txt" } 100 | count = 0 101 | engines = [ 102 | "rust/aho-corasick/default/standard", 103 | "rust/aho-corasick/default/leftmost-first", 104 | "rust/aho-corasick/default/leftmost-longest", 105 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 106 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 107 | "rust/aho-corasick/dfa/leftmost-first", 108 | "rust/aho-corasick/packed/leftmost-first", 109 | "rust/old-aho-corasick/default/standard", 110 | "rust/old-aho-corasick/default/leftmost-first", 111 | "rust/old-aho-corasick/default/leftmost-longest", 112 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 113 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 114 | "rust/old-aho-corasick/dfa/leftmost-first", 115 | "rust/old-aho-corasick/packed/leftmost-first", 116 | "daachorse/bytewise/leftmost-first", 117 | "daachorse/bytewise/leftmost-longest", 118 | "naive/rust/memchr/memmem", 119 | "naive/rust/std", 120 | ] 121 | 122 | [[bench]] 123 | model = "count" 124 | name = "threebytes-match" 125 | regex = ["a", "\u0000", "\u0001"] 126 | haystack = { path = "random.txt" } 127 | count = 352 128 | engines = [ 129 | "rust/aho-corasick/default/standard", 130 | "rust/aho-corasick/default/leftmost-first", 131 | "rust/aho-corasick/default/leftmost-longest", 132 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 133 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 134 | "rust/aho-corasick/dfa/leftmost-first", 135 | "rust/aho-corasick/packed/leftmost-first", 136 | "rust/old-aho-corasick/default/standard", 137 | "rust/old-aho-corasick/default/leftmost-first", 138 | "rust/old-aho-corasick/default/leftmost-longest", 139 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 140 
| "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 141 | "rust/old-aho-corasick/dfa/leftmost-first", 142 | "rust/old-aho-corasick/packed/leftmost-first", 143 | "daachorse/bytewise/leftmost-first", 144 | "daachorse/bytewise/leftmost-longest", 145 | "naive/rust/memchr/memmem", 146 | "naive/rust/std", 147 | ] 148 | 149 | [[bench]] 150 | model = "count" 151 | name = "threebytes-nomatch" 152 | regex = ["\u0000", "\u0001", "\u0002"] 153 | haystack = { path = "random.txt" } 154 | count = 0 155 | engines = [ 156 | "rust/aho-corasick/default/standard", 157 | "rust/aho-corasick/default/leftmost-first", 158 | "rust/aho-corasick/default/leftmost-longest", 159 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 160 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 161 | "rust/aho-corasick/dfa/leftmost-first", 162 | "rust/aho-corasick/packed/leftmost-first", 163 | "rust/old-aho-corasick/default/standard", 164 | "rust/old-aho-corasick/default/leftmost-first", 165 | "rust/old-aho-corasick/default/leftmost-longest", 166 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 167 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 168 | "rust/old-aho-corasick/dfa/leftmost-first", 169 | "rust/old-aho-corasick/packed/leftmost-first", 170 | "daachorse/bytewise/leftmost-first", 171 | "daachorse/bytewise/leftmost-longest", 172 | "naive/rust/memchr/memmem", 173 | "naive/rust/std", 174 | ] 175 | 176 | [[bench]] 177 | model = "count" 178 | name = "fourbytes-match" 179 | regex = ["a", "\u0000", "\u0001", "\u0002"] 180 | haystack = { path = "random.txt" } 181 | count = 352 182 | engines = [ 183 | "rust/aho-corasick/default/standard", 184 | "rust/aho-corasick/default/leftmost-first", 185 | "rust/aho-corasick/default/leftmost-longest", 186 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 187 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 188 | "rust/aho-corasick/dfa/leftmost-first", 189 | "rust/aho-corasick/packed/leftmost-first", 190 | 
"rust/old-aho-corasick/default/standard", 191 | "rust/old-aho-corasick/default/leftmost-first", 192 | "rust/old-aho-corasick/default/leftmost-longest", 193 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 194 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 195 | "rust/old-aho-corasick/dfa/leftmost-first", 196 | "rust/old-aho-corasick/packed/leftmost-first", 197 | "daachorse/bytewise/leftmost-first", 198 | "daachorse/bytewise/leftmost-longest", 199 | "naive/rust/memchr/memmem", 200 | "naive/rust/std", 201 | ] 202 | 203 | [[bench]] 204 | model = "count" 205 | name = "fourbytes-nomatch" 206 | regex = ["\u0000", "\u0001", "\u0002", "\u0003"] 207 | haystack = { path = "random.txt" } 208 | count = 0 209 | engines = [ 210 | "rust/aho-corasick/default/standard", 211 | "rust/aho-corasick/default/leftmost-first", 212 | "rust/aho-corasick/default/leftmost-longest", 213 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 214 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 215 | "rust/aho-corasick/dfa/leftmost-first", 216 | "rust/aho-corasick/packed/leftmost-first", 217 | "rust/old-aho-corasick/default/standard", 218 | "rust/old-aho-corasick/default/leftmost-first", 219 | "rust/old-aho-corasick/default/leftmost-longest", 220 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 221 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 222 | "rust/old-aho-corasick/dfa/leftmost-first", 223 | "rust/old-aho-corasick/packed/leftmost-first", 224 | "daachorse/bytewise/leftmost-first", 225 | "daachorse/bytewise/leftmost-longest", 226 | "naive/rust/memchr/memmem", 227 | "naive/rust/std", 228 | ] 229 | 230 | [[bench]] 231 | model = "count" 232 | name = "fivebytes-match" 233 | regex = ["a", "\u0000", "\u0001", "\u0002", "\u0003"] 234 | haystack = { path = "random.txt" } 235 | count = 352 236 | engines = [ 237 | "rust/aho-corasick/default/standard", 238 | "rust/aho-corasick/default/leftmost-first", 239 | "rust/aho-corasick/default/leftmost-longest", 
240 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 241 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 242 | "rust/aho-corasick/dfa/leftmost-first", 243 | "rust/aho-corasick/packed/leftmost-first", 244 | "rust/old-aho-corasick/default/standard", 245 | "rust/old-aho-corasick/default/leftmost-first", 246 | "rust/old-aho-corasick/default/leftmost-longest", 247 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 248 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 249 | "rust/old-aho-corasick/dfa/leftmost-first", 250 | "rust/old-aho-corasick/packed/leftmost-first", 251 | "daachorse/bytewise/leftmost-first", 252 | "daachorse/bytewise/leftmost-longest", 253 | "naive/rust/memchr/memmem", 254 | "naive/rust/std", 255 | ] 256 | 257 | [[bench]] 258 | model = "count" 259 | name = "fivebytes-nomatch" 260 | regex = ["\u0000", "\u0001", "\u0002", "\u0003", "\u0004"] 261 | haystack = { path = "random.txt" } 262 | count = 0 263 | engines = [ 264 | "rust/aho-corasick/default/standard", 265 | "rust/aho-corasick/default/leftmost-first", 266 | "rust/aho-corasick/default/leftmost-longest", 267 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 268 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 269 | "rust/aho-corasick/dfa/leftmost-first", 270 | "rust/aho-corasick/packed/leftmost-first", 271 | "rust/old-aho-corasick/default/standard", 272 | "rust/old-aho-corasick/default/leftmost-first", 273 | "rust/old-aho-corasick/default/leftmost-longest", 274 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 275 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 276 | "rust/old-aho-corasick/dfa/leftmost-first", 277 | "rust/old-aho-corasick/packed/leftmost-first", 278 | "daachorse/bytewise/leftmost-first", 279 | "daachorse/bytewise/leftmost-longest", 280 | "naive/rust/memchr/memmem", 281 | "naive/rust/std", 282 | ] 283 | -------------------------------------------------------------------------------- /benchmarks/definitions/random/misc.toml: 
-------------------------------------------------------------------------------- 1 | analysis = ''' 2 | Miscellaneous benchmarks on a random haystack. 3 | ''' 4 | 5 | [[bench]] 6 | model = "count" 7 | name = "ten-one-prefix" 8 | regex = [ 9 | "zacdef", "zbcdef", "zccdef", "zdcdef", "zecdef", "zfcdef", 10 | "zgcdef", "zhcdef", "zicdef", "zjcdef", 11 | ] 12 | haystack = { path = "random.txt" } 13 | count = 0 14 | engines = [ 15 | "rust/aho-corasick/default/standard", 16 | "rust/aho-corasick/default/leftmost-first", 17 | "rust/aho-corasick/default/leftmost-longest", 18 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 19 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 20 | "rust/aho-corasick/dfa/leftmost-first", 21 | "rust/aho-corasick/packed/leftmost-first", 22 | "rust/old-aho-corasick/default/standard", 23 | "rust/old-aho-corasick/default/leftmost-first", 24 | "rust/old-aho-corasick/default/leftmost-longest", 25 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 26 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 27 | "rust/old-aho-corasick/dfa/leftmost-first", 28 | "rust/old-aho-corasick/packed/leftmost-first", 29 | "daachorse/bytewise/leftmost-first", 30 | "daachorse/bytewise/leftmost-longest", 31 | "naive/rust/memchr/memmem", 32 | "naive/rust/std", 33 | ] 34 | 35 | [[bench]] 36 | model = "count" 37 | name = "ten-diff-prefix" 38 | regex = [ 39 | "abcdef", "bcdefg", "cdefgh", "defghi", "efghij", "fghijk", 40 | "ghijkl", "hijklm", "ijklmn", "jklmno", 41 | ] 42 | haystack = { path = "random.txt" } 43 | count = 0 44 | engines = [ 45 | "rust/aho-corasick/default/standard", 46 | "rust/aho-corasick/default/leftmost-first", 47 | "rust/aho-corasick/default/leftmost-longest", 48 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 49 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 50 | "rust/aho-corasick/dfa/leftmost-first", 51 | "rust/aho-corasick/packed/leftmost-first", 52 | "rust/old-aho-corasick/default/standard", 53 | 
"rust/old-aho-corasick/default/leftmost-first", 54 | "rust/old-aho-corasick/default/leftmost-longest", 55 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 56 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 57 | "rust/old-aho-corasick/dfa/leftmost-first", 58 | "rust/old-aho-corasick/packed/leftmost-first", 59 | "daachorse/bytewise/leftmost-first", 60 | "daachorse/bytewise/leftmost-longest", 61 | "naive/rust/memchr/memmem", 62 | "naive/rust/std", 63 | ] 64 | -------------------------------------------------------------------------------- /benchmarks/definitions/regexcurated.toml: -------------------------------------------------------------------------------- 1 | analysis = ''' 2 | These benchmarks come from [rebar's curated benchmark set]. 3 | 4 | We don't copy all of the benchmarks from there. Just the ones where the 5 | `aho-corasick` crate is likely relevant. For example, for the regex 6 | `(?i)Sherlock Holmes`, a small set of prefix literals is extracted that results 7 | in a Teddy searcher being used. So we specifically benchmark the literals that 8 | are extracted (at time of writing). 
9 | 10 | [rebar's curated benchmark set]: https://github.com/BurntSushi/rebar/tree/e6100636137496c97273efcb5f5d869278e2e95d/benchmarks/definitions/curated 11 | ''' 12 | 13 | [[bench]] 14 | model = "count" 15 | name = "sherlock-en" 16 | regex = ['Sherlock Holmes'] 17 | haystack = { path = "opensubtitles/en-sampled.txt" } 18 | count = 513 19 | engines = [ 20 | "rust/aho-corasick/default/leftmost-first", 21 | "rust/aho-corasick/dfa/leftmost-first", 22 | "rust/aho-corasick/packed/leftmost-first", 23 | "rust/old-aho-corasick/default/leftmost-first", 24 | "rust/old-aho-corasick/dfa/leftmost-first", 25 | "rust/old-aho-corasick/packed/leftmost-first", 26 | "daachorse/bytewise/leftmost-first", 27 | "naive/rust/memchr/memmem", 28 | ] 29 | 30 | [[bench]] 31 | model = "count" 32 | name = "sherlock-casei-en" 33 | regex = [ 34 | "SHER", "SHEr", "SHeR", "SHer", "ShER", "ShEr", "SheR", "Sher", 35 | "sHER", "sHEr", "sHeR", "sHer", "shER", "shEr", "sheR", "sher", 36 | "ſHE" , "ſHe" , "ſhE" , "ſhe" , 37 | ] 38 | haystack = { path = "opensubtitles/en-sampled.txt" } 39 | count = 540 # original regex is 522 40 | engines = [ 41 | "rust/aho-corasick/default/leftmost-first", 42 | "rust/aho-corasick/dfa/leftmost-first", 43 | "rust/aho-corasick/packed/leftmost-first", 44 | "rust/old-aho-corasick/default/leftmost-first", 45 | "rust/old-aho-corasick/dfa/leftmost-first", 46 | "rust/old-aho-corasick/packed/leftmost-first", 47 | "daachorse/bytewise/leftmost-first", 48 | "naive/rust/memchr/memmem", 49 | ] 50 | 51 | [[bench]] 52 | model = "count" 53 | name = "sherlock-ru" 54 | regex = ['Шерлок Холмс'] 55 | haystack = { path = "opensubtitles/ru-sampled.txt" } 56 | count = 724 57 | engines = [ 58 | "rust/aho-corasick/default/leftmost-first", 59 | "rust/aho-corasick/dfa/leftmost-first", 60 | "rust/aho-corasick/packed/leftmost-first", 61 | "rust/old-aho-corasick/default/leftmost-first", 62 | "rust/old-aho-corasick/dfa/leftmost-first", 63 | "rust/old-aho-corasick/packed/leftmost-first", 64 | 
"daachorse/bytewise/leftmost-first", 65 | "naive/rust/memchr/memmem", 66 | ] 67 | 68 | [[bench]] 69 | model = "count" 70 | name = "sherlock-casei-ru" 71 | regex = [ 72 | 'ШЕ\xd0', 'ШЕ\xd1', 73 | 'Ше\xd0', 'Ше\xd1', 74 | 'шЕ\xd0', 'шЕ\xd1', 75 | 'ше\xd0', 'ше\xd1', 76 | ] 77 | haystack = { path = "opensubtitles/ru-sampled.txt" } 78 | count = 1608 # original regex is 746 79 | engines = [ 80 | "rust/aho-corasick/default/leftmost-first", 81 | "rust/aho-corasick/dfa/leftmost-first", 82 | "rust/aho-corasick/packed/leftmost-first", 83 | "rust/old-aho-corasick/default/leftmost-first", 84 | "rust/old-aho-corasick/dfa/leftmost-first", 85 | "rust/old-aho-corasick/packed/leftmost-first", 86 | "daachorse/bytewise/leftmost-first", 87 | "naive/rust/memchr/memmem", 88 | ] 89 | 90 | [[bench]] 91 | model = "count" 92 | name = "sherlock-zh" 93 | regex = ['夏洛克·福尔摩斯'] 94 | haystack = { path = "opensubtitles/zh-sampled.txt" } 95 | count = 30 96 | engines = [ 97 | "rust/aho-corasick/default/leftmost-first", 98 | "rust/aho-corasick/dfa/leftmost-first", 99 | "rust/aho-corasick/packed/leftmost-first", 100 | "rust/old-aho-corasick/default/leftmost-first", 101 | "rust/old-aho-corasick/dfa/leftmost-first", 102 | "rust/old-aho-corasick/packed/leftmost-first", 103 | "daachorse/bytewise/leftmost-first", 104 | "naive/rust/memchr/memmem", 105 | ] 106 | 107 | [[bench]] 108 | model = "count" 109 | name = "alt-sherlock-en" 110 | regex = [ 111 | 'Sherlock Holmes', 112 | 'John Watson', 113 | 'Irene Adler', 114 | 'Inspector Lestrade', 115 | 'Professor Moriarty', 116 | ] 117 | haystack = { path = "opensubtitles/en-sampled.txt" } 118 | count = 714 119 | engines = [ 120 | "rust/aho-corasick/default/leftmost-first", 121 | "rust/aho-corasick/dfa/leftmost-first", 122 | "rust/aho-corasick/packed/leftmost-first", 123 | "rust/old-aho-corasick/default/leftmost-first", 124 | "rust/old-aho-corasick/dfa/leftmost-first", 125 | "rust/old-aho-corasick/packed/leftmost-first", 126 | "daachorse/bytewise/leftmost-first", 
127 | "naive/rust/memchr/memmem", 128 | ] 129 | 130 | [[bench]] 131 | model = "count" 132 | name = "alt-sherlock-casei-en" 133 | regex = [ 134 | 'SHE', 'SHe', 'ShE', 'She', 'sHE', 'sHe', 'shE', 'she', 'ſH', 'ſh', 135 | 'JOH', 'JOh', 'JoH', 'Joh', 'jOH', 'jOh', 'joH', 'joh', 136 | 'IRE', 'IRe', 'IrE', 'Ire', 'iRE', 'iRe', 'irE', 'ire', 137 | 'INS', 'INs', 'IN\xc5', 'InS', 'Ins', 'In\xc5', 138 | 'iNS', 'iNs', 'iN\xc5', 'inS', 'ins', 'in\xc5', 139 | 'PRO', 'PRo', 'PrO', 'Pro', 'pRO', 'pRo', 'prO', 'pro', 140 | ] 141 | haystack = { path = "opensubtitles/en-sampled.txt" } 142 | count = 2456 # original regex is 725 143 | engines = [ 144 | "rust/aho-corasick/default/leftmost-first", 145 | "rust/aho-corasick/dfa/leftmost-first", 146 | "rust/aho-corasick/packed/leftmost-first", 147 | "rust/old-aho-corasick/default/leftmost-first", 148 | "rust/old-aho-corasick/dfa/leftmost-first", 149 | "rust/old-aho-corasick/packed/leftmost-first", 150 | "daachorse/bytewise/leftmost-first", 151 | "naive/rust/memchr/memmem", 152 | ] 153 | 154 | [[bench]] 155 | model = "count" 156 | name = "alt-sherlock-ru" 157 | regex = [ 158 | "Шерлок Холмс", 159 | "Джон Уотсон", 160 | "Ирен Адлер", 161 | "инспектор Лестрейд", 162 | "профессор Мориарти", 163 | ] 164 | haystack = { path = "opensubtitles/ru-sampled.txt" } 165 | count = 899 166 | engines = [ 167 | "rust/aho-corasick/default/leftmost-first", 168 | "rust/aho-corasick/dfa/leftmost-first", 169 | "rust/aho-corasick/packed/leftmost-first", 170 | "rust/old-aho-corasick/default/leftmost-first", 171 | "rust/old-aho-corasick/dfa/leftmost-first", 172 | "rust/old-aho-corasick/packed/leftmost-first", 173 | "daachorse/bytewise/leftmost-first", 174 | "naive/rust/memchr/memmem", 175 | ] 176 | 177 | [[bench]] 178 | model = "count" 179 | name = "alt-sherlock-casei-ru" 180 | regex = [ 181 | 'ШЕ', 'Ше', 'шЕ', 'ше', 182 | 'ДЖ', 'Дж', 'дЖ', 'дж', 'ᲁ\xd0', 183 | 'ИР', 'Ир', 'иР', 'ир', 184 | 'ИН', 'Ин', 'иН', 'ин', 185 | 'ПР', 'Пр', 'пР', 'пр', 186 | ] 187 | 
haystack = { path = "opensubtitles/ru-sampled.txt" } 188 | count = 11_400 # original regex is 971 189 | engines = [ 190 | "rust/aho-corasick/default/leftmost-first", 191 | "rust/aho-corasick/dfa/leftmost-first", 192 | "rust/aho-corasick/packed/leftmost-first", 193 | "rust/old-aho-corasick/default/leftmost-first", 194 | "rust/old-aho-corasick/dfa/leftmost-first", 195 | "rust/old-aho-corasick/packed/leftmost-first", 196 | "daachorse/bytewise/leftmost-first", 197 | "naive/rust/memchr/memmem", 198 | ] 199 | 200 | [[bench]] 201 | model = "count" 202 | name = "alt-sherlock-zh" 203 | regex = [ 204 | "夏洛克·福尔摩斯", 205 | "约翰华生", 206 | "阿德勒", 207 | "雷斯垂德", 208 | "莫里亚蒂教授", 209 | ] 210 | haystack = { path = "opensubtitles/zh-sampled.txt" } 211 | count = 207 212 | engines = [ 213 | "rust/aho-corasick/default/leftmost-first", 214 | "rust/aho-corasick/dfa/leftmost-first", 215 | "rust/aho-corasick/packed/leftmost-first", 216 | "rust/old-aho-corasick/default/leftmost-first", 217 | "rust/old-aho-corasick/dfa/leftmost-first", 218 | "rust/old-aho-corasick/packed/leftmost-first", 219 | "daachorse/bytewise/leftmost-first", 220 | "naive/rust/memchr/memmem", 221 | ] 222 | 223 | [[bench]] 224 | model = "count" 225 | name = "dictionary-15" 226 | regex = { path = "dictionary/english/length-15.txt", per-line = "pattern" } 227 | haystack = { path = "opensubtitles/en-medium.txt" } 228 | count = 1 229 | engines = [ 230 | "rust/aho-corasick/default/leftmost-first", 231 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 232 | "rust/aho-corasick/dfa/leftmost-first", 233 | "rust/old-aho-corasick/default/leftmost-first", 234 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 235 | "rust/old-aho-corasick/dfa/leftmost-first", 236 | "daachorse/bytewise/leftmost-first", 237 | "naive/rust/memchr/memmem", 238 | ] 239 | -------------------------------------------------------------------------------- /benchmarks/engines.toml: 
-------------------------------------------------------------------------------- 1 | # Engines for the aho-corasick crate. We don't cover literally every possible 2 | # configuration, but we try to cover everything broadly. 3 | 4 | [[engine]] 5 | name = "rust/aho-corasick/default/standard" 6 | cwd = "./engines/rust-aho-corasick" 7 | [engine.version] 8 | bin = "./target/release/main" 9 | args = ["--version"] 10 | [engine.run] 11 | bin = "./target/release/main" 12 | args = ["default/standard"] 13 | [[engine.build]] 14 | bin = "cargo" 15 | args = ["build", "--release"] 16 | [[engine.clean]] 17 | bin = "cargo" 18 | args = ["clean"] 19 | 20 | [[engine]] 21 | name = "rust/aho-corasick/default/overlapping" 22 | cwd = "./engines/rust-aho-corasick" 23 | [engine.version] 24 | bin = "./target/release/main" 25 | args = ["--version"] 26 | [engine.run] 27 | bin = "./target/release/main" 28 | args = ["default/overlapping"] 29 | [[engine.build]] 30 | bin = "cargo" 31 | args = ["build", "--release"] 32 | [[engine.clean]] 33 | bin = "cargo" 34 | args = ["clean"] 35 | 36 | [[engine]] 37 | name = "rust/aho-corasick/default/leftmost-first" 38 | cwd = "./engines/rust-aho-corasick" 39 | [engine.version] 40 | bin = "./target/release/main" 41 | args = ["--version"] 42 | [engine.run] 43 | bin = "./target/release/main" 44 | args = ["default/leftmost-first"] 45 | [[engine.build]] 46 | bin = "cargo" 47 | args = ["build", "--release"] 48 | [[engine.clean]] 49 | bin = "cargo" 50 | args = ["clean"] 51 | 52 | [[engine]] 53 | name = "rust/aho-corasick/default/leftmost-longest" 54 | cwd = "./engines/rust-aho-corasick" 55 | [engine.version] 56 | bin = "./target/release/main" 57 | args = ["--version"] 58 | [engine.run] 59 | bin = "./target/release/main" 60 | args = ["default/leftmost-longest"] 61 | [[engine.build]] 62 | bin = "cargo" 63 | args = ["build", "--release"] 64 | [[engine.clean]] 65 | bin = "cargo" 66 | args = ["clean"] 67 | 68 | [[engine]] 69 | name = 
"rust/aho-corasick/nfa-noncontiguous/leftmost-first" 70 | cwd = "./engines/rust-aho-corasick" 71 | [engine.version] 72 | bin = "./target/release/main" 73 | args = ["--version"] 74 | [engine.run] 75 | bin = "./target/release/main" 76 | args = ["nfa-noncontiguous/leftmost-first"] 77 | [[engine.build]] 78 | bin = "cargo" 79 | args = ["build", "--release"] 80 | [[engine.clean]] 81 | bin = "cargo" 82 | args = ["clean"] 83 | 84 | [[engine]] 85 | name = "rust/aho-corasick/nfa-contiguous/leftmost-first" 86 | cwd = "./engines/rust-aho-corasick" 87 | [engine.version] 88 | bin = "./target/release/main" 89 | args = ["--version"] 90 | [engine.run] 91 | bin = "./target/release/main" 92 | args = ["nfa-contiguous/leftmost-first"] 93 | [[engine.build]] 94 | bin = "cargo" 95 | args = ["build", "--release"] 96 | [[engine.clean]] 97 | bin = "cargo" 98 | args = ["clean"] 99 | 100 | [[engine]] 101 | name = "rust/aho-corasick/dfa/leftmost-first" 102 | cwd = "./engines/rust-aho-corasick" 103 | [engine.version] 104 | bin = "./target/release/main" 105 | args = ["--version"] 106 | [engine.run] 107 | bin = "./target/release/main" 108 | args = ["dfa/leftmost-first"] 109 | [[engine.build]] 110 | bin = "cargo" 111 | args = ["build", "--release"] 112 | [[engine.clean]] 113 | bin = "cargo" 114 | args = ["clean"] 115 | 116 | [[engine]] 117 | name = "rust/aho-corasick/packed/leftmost-first" 118 | cwd = "./engines/rust-aho-corasick" 119 | [engine.version] 120 | bin = "./target/release/main" 121 | args = ["--version"] 122 | [engine.run] 123 | bin = "./target/release/main" 124 | args = ["packed/leftmost-first"] 125 | [[engine.build]] 126 | bin = "cargo" 127 | args = ["build", "--release"] 128 | [[engine.clean]] 129 | bin = "cargo" 130 | args = ["clean"] 131 | 132 | # Engines for aho-corasick, but pinned to 1.0.5. Essentially a way of 133 | # benchmarking the older version before some internal refactoring. 
134 | 135 | [[engine]] 136 | name = "rust/old-aho-corasick/default/standard" 137 | cwd = "./engines/rust-old-aho-corasick" 138 | [engine.version] 139 | bin = "./target/release/main" 140 | args = ["--version"] 141 | [engine.run] 142 | bin = "./target/release/main" 143 | args = ["default/standard"] 144 | [[engine.build]] 145 | bin = "cargo" 146 | args = ["build", "--release"] 147 | [[engine.clean]] 148 | bin = "cargo" 149 | args = ["clean"] 150 | 151 | [[engine]] 152 | name = "rust/old-aho-corasick/default/leftmost-first" 153 | cwd = "./engines/rust-old-aho-corasick" 154 | [engine.version] 155 | bin = "./target/release/main" 156 | args = ["--version"] 157 | [engine.run] 158 | bin = "./target/release/main" 159 | args = ["default/leftmost-first"] 160 | [[engine.build]] 161 | bin = "cargo" 162 | args = ["build", "--release"] 163 | [[engine.clean]] 164 | bin = "cargo" 165 | args = ["clean"] 166 | 167 | [[engine]] 168 | name = "rust/old-aho-corasick/default/leftmost-longest" 169 | cwd = "./engines/rust-old-aho-corasick" 170 | [engine.version] 171 | bin = "./target/release/main" 172 | args = ["--version"] 173 | [engine.run] 174 | bin = "./target/release/main" 175 | args = ["default/leftmost-longest"] 176 | [[engine.build]] 177 | bin = "cargo" 178 | args = ["build", "--release"] 179 | [[engine.clean]] 180 | bin = "cargo" 181 | args = ["clean"] 182 | 183 | [[engine]] 184 | name = "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first" 185 | cwd = "./engines/rust-old-aho-corasick" 186 | [engine.version] 187 | bin = "./target/release/main" 188 | args = ["--version"] 189 | [engine.run] 190 | bin = "./target/release/main" 191 | args = ["nfa-noncontiguous/leftmost-first"] 192 | [[engine.build]] 193 | bin = "cargo" 194 | args = ["build", "--release"] 195 | [[engine.clean]] 196 | bin = "cargo" 197 | args = ["clean"] 198 | 199 | [[engine]] 200 | name = "rust/old-aho-corasick/nfa-contiguous/leftmost-first" 201 | cwd = "./engines/rust-old-aho-corasick" 202 | [engine.version] 203 | 
bin = "./target/release/main" 204 | args = ["--version"] 205 | [engine.run] 206 | bin = "./target/release/main" 207 | args = ["nfa-contiguous/leftmost-first"] 208 | [[engine.build]] 209 | bin = "cargo" 210 | args = ["build", "--release"] 211 | [[engine.clean]] 212 | bin = "cargo" 213 | args = ["clean"] 214 | 215 | [[engine]] 216 | name = "rust/old-aho-corasick/dfa/leftmost-first" 217 | cwd = "./engines/rust-old-aho-corasick" 218 | [engine.version] 219 | bin = "./target/release/main" 220 | args = ["--version"] 221 | [engine.run] 222 | bin = "./target/release/main" 223 | args = ["dfa/leftmost-first"] 224 | [[engine.build]] 225 | bin = "cargo" 226 | args = ["build", "--release"] 227 | [[engine.clean]] 228 | bin = "cargo" 229 | args = ["clean"] 230 | 231 | [[engine]] 232 | name = "rust/old-aho-corasick/packed/leftmost-first" 233 | cwd = "./engines/rust-old-aho-corasick" 234 | [engine.version] 235 | bin = "./target/release/main" 236 | args = ["--version"] 237 | [engine.run] 238 | bin = "./target/release/main" 239 | args = ["packed/leftmost-first"] 240 | [[engine.build]] 241 | bin = "cargo" 242 | args = ["build", "--release"] 243 | [[engine.clean]] 244 | bin = "cargo" 245 | args = ["clean"] 246 | 247 | # Aho-Corasick engines from daachorse. AFAIK, this is the only Rust library 248 | # that is anywhere near competitive with the aho-corasick crate. 249 | # 250 | # It uses the "double array trie" technique for implementing the Aho-Corasick 251 | # algorithm with a few additional tricks that were published as part of 252 | # creating this library. 
253 | # 254 | # Ref: https://github.com/daac-tools/daachorse 255 | 256 | [[engine]] 257 | name = "daachorse/bytewise/standard" 258 | cwd = "./engines/rust-daachorse" 259 | [engine.version] 260 | bin = "./target/release/main" 261 | args = ["--version"] 262 | [engine.run] 263 | bin = "./target/release/main" 264 | args = ["bytewise/standard"] 265 | [[engine.build]] 266 | bin = "cargo" 267 | args = ["build", "--release"] 268 | [[engine.clean]] 269 | bin = "cargo" 270 | args = ["clean"] 271 | 272 | [[engine]] 273 | name = "daachorse/bytewise/overlapping" 274 | cwd = "./engines/rust-daachorse" 275 | [engine.version] 276 | bin = "./target/release/main" 277 | args = ["--version"] 278 | [engine.run] 279 | bin = "./target/release/main" 280 | args = ["bytewise/overlapping"] 281 | [[engine.build]] 282 | bin = "cargo" 283 | args = ["build", "--release"] 284 | [[engine.clean]] 285 | bin = "cargo" 286 | args = ["clean"] 287 | 288 | [[engine]] 289 | name = "daachorse/bytewise/leftmost-first" 290 | cwd = "./engines/rust-daachorse" 291 | [engine.version] 292 | bin = "./target/release/main" 293 | args = ["--version"] 294 | [engine.run] 295 | bin = "./target/release/main" 296 | args = ["bytewise/leftmost-first"] 297 | [[engine.build]] 298 | bin = "cargo" 299 | args = ["build", "--release"] 300 | [[engine.clean]] 301 | bin = "cargo" 302 | args = ["clean"] 303 | 304 | [[engine]] 305 | name = "daachorse/bytewise/leftmost-longest" 306 | cwd = "./engines/rust-daachorse" 307 | [engine.version] 308 | bin = "./target/release/main" 309 | args = ["--version"] 310 | [engine.run] 311 | bin = "./target/release/main" 312 | args = ["bytewise/leftmost-longest"] 313 | [[engine.build]] 314 | bin = "cargo" 315 | args = ["build", "--release"] 316 | [[engine.clean]] 317 | bin = "cargo" 318 | args = ["clean"] 319 | 320 | # Engines based on the `jetscii` Rust crate. 
This is somewhat more appropriately 321 | # compared with routines in `memchr`, but there is some overlap in use cases 322 | # with Teddy's packed searcher for multiple single-byte needles. 323 | 324 | [[engine]] 325 | name = "rust/jetscii/ascii-chars/prebuilt" 326 | cwd = "./engines/rust-jetscii" 327 | [engine.version] 328 | bin = "./target/release/main" 329 | args = ["--version"] 330 | [engine.run] 331 | bin = "./target/release/main" 332 | args = ["ascii-chars-prebuilt"] 333 | [[engine.build]] 334 | bin = "cargo" 335 | args = ["build", "--release"] 336 | [[engine.clean]] 337 | bin = "cargo" 338 | args = ["clean"] 339 | 340 | [[engine]] 341 | name = "rust/jetscii/ascii-chars/oneshot" 342 | cwd = "./engines/rust-jetscii" 343 | [engine.version] 344 | bin = "./target/release/main" 345 | args = ["--version"] 346 | [engine.run] 347 | bin = "./target/release/main" 348 | args = ["ascii-chars-oneshot"] 349 | [[engine.build]] 350 | bin = "cargo" 351 | args = ["build", "--release"] 352 | [[engine.clean]] 353 | bin = "cargo" 354 | args = ["clean"] 355 | 356 | # Naive engines. Useful for comparisons and to determine the crossover point 357 | # where a multi-substring algorithm is beneficial over multi single-substring 358 | # algorithms. We include both the `memchr` crate and `std`. 
359 | 360 | [[engine]] 361 | name = "naive/rust/std" 362 | cwd = "./engines/naive" 363 | [engine.version] 364 | bin = "./target/release/main" 365 | args = ["--version"] 366 | [engine.run] 367 | bin = "./target/release/main" 368 | args = ["rust/std"] 369 | [[engine.build]] 370 | bin = "cargo" 371 | args = ["build", "--release"] 372 | [[engine.clean]] 373 | bin = "cargo" 374 | args = ["clean"] 375 | 376 | [[engine]] 377 | name = "naive/rust/memchr/memmem" 378 | cwd = "./engines/naive" 379 | [engine.version] 380 | bin = "./target/release/main" 381 | args = ["--version"] 382 | [engine.run] 383 | bin = "./target/release/main" 384 | args = ["rust/memchr/memmem"] 385 | [[engine.build]] 386 | bin = "cargo" 387 | args = ["build", "--release"] 388 | [[engine.clean]] 389 | bin = "cargo" 390 | args = ["clean"] 391 | -------------------------------------------------------------------------------- /benchmarks/engines/naive/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "anyhow" 7 | version = "1.0.75" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" 10 | 11 | [[package]] 12 | name = "bstr" 13 | version = "1.6.2" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" 16 | dependencies = [ 17 | "memchr", 18 | "serde", 19 | ] 20 | 21 | [[package]] 22 | name = "lexopt" 23 | version = "0.3.0" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | checksum = "baff4b617f7df3d896f97fe922b64817f6cd9a756bb81d40f8883f2f66dcb401" 26 | 27 | [[package]] 28 | name = "main" 29 | version = "0.1.0" 30 | dependencies = [ 31 | "anyhow", 32 | "lexopt", 33 | "memchr", 34 | "shared", 35 | ] 36 | 37 | [[package]] 38 | name = "memchr" 39 | version = "2.6.3" 40 | source = "registry+https://github.com/rust-lang/crates.io-index" 41 | checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" 42 | 43 | [[package]] 44 | name = "proc-macro2" 45 | version = "1.0.66" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" 48 | dependencies = [ 49 | "unicode-ident", 50 | ] 51 | 52 | [[package]] 53 | name = "quote" 54 | version = "1.0.33" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" 57 | dependencies = [ 58 | "proc-macro2", 59 | ] 60 | 61 | [[package]] 62 | name = "serde" 63 | version = "1.0.188" 64 | source = "registry+https://github.com/rust-lang/crates.io-index" 65 | checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" 66 | dependencies = [ 67 | "serde_derive", 68 | ] 69 | 70 | [[package]] 71 | name = "serde_derive" 72 | version = "1.0.188" 73 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" 75 | dependencies = [ 76 | "proc-macro2", 77 | "quote", 78 | "syn", 79 | ] 80 | 81 | [[package]] 82 | name = "shared" 83 | version = "0.1.0" 84 | dependencies = [ 85 | "anyhow", 86 | "bstr", 87 | ] 88 | 89 | [[package]] 90 | name = "syn" 91 | version = "2.0.31" 92 | source = "registry+https://github.com/rust-lang/crates.io-index" 93 | checksum = "718fa2415bcb8d8bd775917a1bf12a7931b6dfa890753378538118181e0cb398" 94 | dependencies = [ 95 | "proc-macro2", 96 | "quote", 97 | "unicode-ident", 98 | ] 99 | 100 | [[package]] 101 | name = "unicode-ident" 102 | version = "1.0.11" 103 | source = "registry+https://github.com/rust-lang/crates.io-index" 104 | checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" 105 | -------------------------------------------------------------------------------- /benchmarks/engines/naive/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "main" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [[bin]] 7 | name = "main" 8 | path = "main.rs" 9 | 10 | [dependencies] 11 | anyhow = "1.0.69" 12 | lexopt = "0.3.0" 13 | memchr = "2.6.3" 14 | 15 | [dependencies.shared] 16 | path = "../../shared" 17 | 18 | [profile.release] 19 | debug = true 20 | codegen-units = 1 21 | lto = "fat" 22 | -------------------------------------------------------------------------------- /benchmarks/engines/naive/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use { 4 | anyhow::Context, 5 | lexopt::{Arg, ValueExt}, 6 | memchr::memmem, 7 | }; 8 | 9 | use shared::{Benchmark, Sample}; 10 | 11 | fn main() -> anyhow::Result<()> { 12 | let mut p = lexopt::Parser::from_env(); 13 | let (mut engine, mut quiet) = (String::new(), false); 14 | while let Some(arg) = p.next()? 
{ 15 | match arg { 16 | Arg::Short('h') | Arg::Long("help") => { 17 | anyhow::bail!("main [--version | --quiet] ") 18 | } 19 | Arg::Short('q') | Arg::Long("quiet") => { 20 | quiet = true; 21 | } 22 | Arg::Long("version") => { 23 | writeln!(std::io::stdout(), "{}", env!("CARGO_PKG_VERSION"))?; 24 | return Ok(()); 25 | } 26 | Arg::Value(v) => { 27 | anyhow::ensure!( 28 | engine.is_empty(), 29 | "only one engine string allowed" 30 | ); 31 | engine = v.string().context("")?; 32 | anyhow::ensure!( 33 | !engine.is_empty(), 34 | "engine string cannot be empty" 35 | ); 36 | } 37 | _ => return Err(arg.unexpected().into()), 38 | } 39 | } 40 | 41 | let b = Benchmark::from_stdin() 42 | .context("failed to read KLV data from ")?; 43 | let samples = match (b.model.as_str(), engine.as_str()) { 44 | ("compile", "rust/memchr/memmem") => model_compile_memmem(&b)?, 45 | ("count", "rust/memchr/memmem") => model_count_memmem(&b)?, 46 | ("count", "rust/std") => model_count_std(&b)?, 47 | _ => anyhow::bail!( 48 | "unsupported model/engine pair, model={} engine={}", 49 | b.model, 50 | engine 51 | ), 52 | }; 53 | if !quiet { 54 | let mut stdout = std::io::stdout().lock(); 55 | for s in samples.iter() { 56 | writeln!(stdout, "{},{}", s.duration.as_nanos(), s.count)?; 57 | } 58 | } 59 | Ok(()) 60 | } 61 | 62 | /// Implements the "compile a matcher" model for naive multi-substring search 63 | /// with the `memchr` crate's `memmem` implementation. 64 | fn model_compile_memmem(b: &Benchmark) -> anyhow::Result> { 65 | let haystack = &*b.haystack; 66 | shared::run_and_count( 67 | b, 68 | |finders: Vec>| { 69 | let mut count = 0; 70 | for f in finders.iter() { 71 | count += f.find_iter(haystack).count(); 72 | } 73 | Ok(count) 74 | }, 75 | || compile_memmem(b), 76 | ) 77 | } 78 | 79 | /// Implements a naive multi-substring algorithm using the `memchr` crate's 80 | /// `memmem` implementation. 
81 | fn model_count_memmem(b: &Benchmark) -> anyhow::Result> { 82 | let haystack = &*b.haystack; 83 | let finders = compile_memmem(b)?; 84 | shared::run(b, || { 85 | let mut count = 0; 86 | for f in finders.iter() { 87 | count += f.find_iter(haystack).count(); 88 | } 89 | Ok(count) 90 | }) 91 | } 92 | 93 | /// Implements a naive multi-substring algorithm using std's single substring 94 | /// search implementation. This returns an error if the haystack or any of 95 | /// the needles are invalid UTF-8. 96 | fn model_count_std(b: &Benchmark) -> anyhow::Result> { 97 | let Ok(haystack) = std::str::from_utf8(&b.haystack) else { 98 | anyhow::bail!("haystack is not valid UTF-8") 99 | }; 100 | let mut needles = vec![]; 101 | for needle in b.needles.iter() { 102 | let Ok(needle) = std::str::from_utf8(needle) else { 103 | anyhow::bail!("one of the needles is not valid UTF-8") 104 | }; 105 | needles.push(needle); 106 | } 107 | shared::run(b, || { 108 | let mut count = 0; 109 | for needle in needles.iter() { 110 | count += haystack.matches(needle).count(); 111 | } 112 | Ok(count) 113 | }) 114 | } 115 | 116 | /// Compiles a naive multi-substring matcher by building a single substring 117 | /// matcher for each needle. 
118 | fn compile_memmem( 119 | b: &Benchmark, 120 | ) -> anyhow::Result>> { 121 | anyhow::ensure!( 122 | !b.case_insensitive, 123 | "naive multi-substring search doesn't support case insensitive mode", 124 | ); 125 | let mut finders = vec![]; 126 | for needle in b.needles.iter() { 127 | finders.push(memmem::Finder::new(needle).into_owned()); 128 | } 129 | Ok(finders) 130 | } 131 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-aho-corasick/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-aho-corasick/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.0.5" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "aho-corasick" 16 | version = "1.1.3" 17 | dependencies = [ 18 | "log", 19 | "memchr", 20 | ] 21 | 22 | [[package]] 23 | name = "anyhow" 24 | version = "1.0.69" 25 | source = "registry+https://github.com/rust-lang/crates.io-index" 26 | checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" 27 | 28 | [[package]] 29 | name = "atty" 30 | version = "0.2.14" 31 | source = "registry+https://github.com/rust-lang/crates.io-index" 32 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 33 | dependencies = [ 34 | "hermit-abi", 35 | "libc", 36 | "winapi", 37 | ] 38 | 39 | [[package]] 40 | name = "bstr" 41 | version = "1.6.2" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = 
"4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" 44 | dependencies = [ 45 | "memchr", 46 | "serde", 47 | ] 48 | 49 | [[package]] 50 | name = "env_logger" 51 | version = "0.9.3" 52 | source = "registry+https://github.com/rust-lang/crates.io-index" 53 | checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" 54 | dependencies = [ 55 | "atty", 56 | "humantime", 57 | "log", 58 | "regex", 59 | "termcolor", 60 | ] 61 | 62 | [[package]] 63 | name = "hermit-abi" 64 | version = "0.1.19" 65 | source = "registry+https://github.com/rust-lang/crates.io-index" 66 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 67 | dependencies = [ 68 | "libc", 69 | ] 70 | 71 | [[package]] 72 | name = "humantime" 73 | version = "2.1.0" 74 | source = "registry+https://github.com/rust-lang/crates.io-index" 75 | checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" 76 | 77 | [[package]] 78 | name = "lexopt" 79 | version = "0.3.0" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | checksum = "baff4b617f7df3d896f97fe922b64817f6cd9a756bb81d40f8883f2f66dcb401" 82 | 83 | [[package]] 84 | name = "libc" 85 | version = "0.2.148" 86 | source = "registry+https://github.com/rust-lang/crates.io-index" 87 | checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" 88 | 89 | [[package]] 90 | name = "log" 91 | version = "0.4.20" 92 | source = "registry+https://github.com/rust-lang/crates.io-index" 93 | checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" 94 | 95 | [[package]] 96 | name = "main" 97 | version = "1.0.5" 98 | dependencies = [ 99 | "aho-corasick 1.1.3", 100 | "anyhow", 101 | "env_logger", 102 | "lexopt", 103 | "shared", 104 | ] 105 | 106 | [[package]] 107 | name = "memchr" 108 | version = "2.6.3" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = 
"8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" 111 | 112 | [[package]] 113 | name = "regex" 114 | version = "1.9.5" 115 | source = "registry+https://github.com/rust-lang/crates.io-index" 116 | checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" 117 | dependencies = [ 118 | "aho-corasick 1.0.5", 119 | "memchr", 120 | "regex-automata", 121 | "regex-syntax", 122 | ] 123 | 124 | [[package]] 125 | name = "regex-automata" 126 | version = "0.3.8" 127 | source = "registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" 129 | dependencies = [ 130 | "aho-corasick 1.0.5", 131 | "memchr", 132 | "regex-syntax", 133 | ] 134 | 135 | [[package]] 136 | name = "regex-syntax" 137 | version = "0.7.5" 138 | source = "registry+https://github.com/rust-lang/crates.io-index" 139 | checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" 140 | 141 | [[package]] 142 | name = "serde" 143 | version = "1.0.152" 144 | source = "registry+https://github.com/rust-lang/crates.io-index" 145 | checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" 146 | 147 | [[package]] 148 | name = "shared" 149 | version = "0.1.0" 150 | dependencies = [ 151 | "anyhow", 152 | "bstr", 153 | ] 154 | 155 | [[package]] 156 | name = "termcolor" 157 | version = "1.2.0" 158 | source = "registry+https://github.com/rust-lang/crates.io-index" 159 | checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" 160 | dependencies = [ 161 | "winapi-util", 162 | ] 163 | 164 | [[package]] 165 | name = "winapi" 166 | version = "0.3.9" 167 | source = "registry+https://github.com/rust-lang/crates.io-index" 168 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 169 | dependencies = [ 170 | "winapi-i686-pc-windows-gnu", 171 | "winapi-x86_64-pc-windows-gnu", 172 | ] 173 | 174 | [[package]] 175 | name = 
"winapi-i686-pc-windows-gnu" 176 | version = "0.4.0" 177 | source = "registry+https://github.com/rust-lang/crates.io-index" 178 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 179 | 180 | [[package]] 181 | name = "winapi-util" 182 | version = "0.1.5" 183 | source = "registry+https://github.com/rust-lang/crates.io-index" 184 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 185 | dependencies = [ 186 | "winapi", 187 | ] 188 | 189 | [[package]] 190 | name = "winapi-x86_64-pc-windows-gnu" 191 | version = "0.4.0" 192 | source = "registry+https://github.com/rust-lang/crates.io-index" 193 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 194 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-aho-corasick/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "main" 3 | version = "1.0.5" 4 | edition = "2021" 5 | 6 | [[bin]] 7 | name = "main" 8 | path = "main.rs" 9 | 10 | [dependencies] 11 | aho-corasick = { version = "*", path = "../../../", features = ["logging"] } 12 | anyhow = "1.0.69" 13 | # Using an older version here because I am really not a fan of the dependency 14 | # tree explosion that has happened in 0.10. 15 | env_logger = "0.9.3" 16 | lexopt = "0.3.0" 17 | 18 | [dependencies.shared] 19 | path = "../../shared" 20 | 21 | [profile.release] 22 | debug = true 23 | codegen-units = 1 24 | lto = "fat" 25 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-aho-corasick/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a Rust runner program for benchmarking the 2 | [`aho-corasick` crate][rust-aho-corasick]. 
The `aho-corasick` crate 3 | principally implements the [Aho-Corasick algorithm][aho-corasick], although 4 | it has other algorithms for multiple substring search, such as [Teddy], which 5 | was ported from the Hyperscan project. 6 | 7 | The `aho-corasick` crate is used by [Rust's `regex` crate][rust-regex] to 8 | implement fast prefilters that permit finding candidates very quickly and only 9 | needing to use the regex engine to confirm the match. The Teddy algorithm is 10 | particularly excellent here. (Sometimes `aho-corasick` is used as the regex 11 | engine itself, for example, when the regex is just an alternation of literals.) 12 | 13 | Since the `aho-corasick` crate only supports searching for literal strings, this 14 | engine should only be used for regex patterns that are literals. This is up to 15 | the author of the benchmark definition, as this runner program will always 16 | treat regex patterns as literals. 17 | 18 | This also means that this runner program cannot support all benchmark models. 19 | Only the `compile`, `count`, `count-spans` and `grep` models are supported. 20 | 21 | Finally, this runner program supports measuring two different Aho-Corasick 22 | implementations: `nfa` and `dfa`. The former follows failure transitions at 23 | search time and is thus usually slower, where as the latter builds a full 24 | transition table by pre-computing all failure transitions. The latter tends 25 | to be faster at search time, but can use orders (plural) of magnitude more 26 | memory. In both the `nfa` and `dfa` engines, prefilters inside of Aho-Corasick 27 | are disabled. 
28 | 29 | [rust-aho-corasick]: https://github.com/BurntSushi/aho-corasick 30 | [aho-corasick]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm 31 | [Teddy]: https://github.com/BurntSushi/aho-corasick/tree/4e7fa3b85dd3a3ce882896f1d4ee22b1f271f0b4/src/packed/teddy 32 | [rust-regex]: https://github.com/rust-lang/regex 33 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-aho-corasick/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use { 4 | aho_corasick::{ 5 | AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, MatchKind, 6 | }, 7 | anyhow::Context, 8 | lexopt::{Arg, ValueExt}, 9 | }; 10 | 11 | use shared::{Benchmark, Sample}; 12 | 13 | fn main() -> anyhow::Result<()> { 14 | env_logger::try_init()?; 15 | 16 | let mut p = lexopt::Parser::from_env(); 17 | let (mut engine, mut quiet) = (String::new(), false); 18 | while let Some(arg) = p.next()? { 19 | match arg { 20 | Arg::Short('h') | Arg::Long("help") => { 21 | anyhow::bail!("main [--version | --quiet] ") 22 | } 23 | Arg::Short('q') | Arg::Long("quiet") => { 24 | quiet = true; 25 | } 26 | Arg::Long("version") => { 27 | writeln!(std::io::stdout(), "{}", env!("CARGO_PKG_VERSION"))?; 28 | return Ok(()); 29 | } 30 | Arg::Value(v) => { 31 | anyhow::ensure!( 32 | engine.is_empty(), 33 | "only one engine string allowed" 34 | ); 35 | engine = v.string().context("")?; 36 | anyhow::ensure!( 37 | !engine.is_empty(), 38 | "engine string cannot be empty" 39 | ); 40 | } 41 | _ => return Err(arg.unexpected().into()), 42 | } 43 | } 44 | 45 | let b = Benchmark::from_stdin() 46 | .context("failed to read KLV data from ")?; 47 | let samples = match (b.model.as_str(), engine.as_str()) { 48 | // These first 7 configurations are meant to test the default settings 49 | // on each of {compile, count} x {standard, leftmost-{first,longest}}. 
50 | // We don't also test each of them with {nfa/(non-)?contiguous, dfa} 51 | // because it would just get ridiculous. 52 | ("compile", "default/standard") => { 53 | model_compile_ac(&b, || Ok(builder_ac(&b)?.build(&b.needles)?))? 54 | } 55 | ("compile", "default/leftmost-first") => model_compile_ac(&b, || { 56 | Ok(builder_ac(&b)? 57 | .match_kind(MatchKind::LeftmostFirst) 58 | .build(&b.needles)?) 59 | })?, 60 | ("compile", "default/leftmost-longest") => { 61 | model_compile_ac(&b, || { 62 | Ok(builder_ac(&b)? 63 | .match_kind(MatchKind::LeftmostLongest) 64 | .build(&b.needles)?) 65 | })? 66 | } 67 | ("count", "default/standard") => { 68 | let ac = builder_ac(&b)?.build(&b.needles)?; 69 | model_count_ac(&b, &ac)? 70 | } 71 | ("count", "default/overlapping") => { 72 | let ac = builder_ac(&b)?.build(&b.needles)?; 73 | model_count_ac_overlapping(&b, &ac)? 74 | } 75 | ("count", "default/leftmost-first") => { 76 | let ac = builder_ac(&b)? 77 | .match_kind(MatchKind::LeftmostFirst) 78 | .build(&b.needles)?; 79 | model_count_ac(&b, &ac)? 80 | } 81 | ("count", "default/leftmost-longest") => { 82 | let ac = builder_ac(&b)? 83 | .match_kind(MatchKind::LeftmostLongest) 84 | .build(&b.needles)?; 85 | model_count_ac(&b, &ac)? 86 | } 87 | 88 | // OK, now we start testing the specific Aho-Corasick automatons, but 89 | // we just focus on leftmost-first because that's the case we tend to 90 | // be more interested in optimizing in practice. There's also likely 91 | // to not be much of a perf difference between leftmost-first and 92 | // leftmost-longest. 93 | // 94 | // We also specifically disable prefilters so that we know we're always 95 | // measuring the actual automaton. (The 'default' engines above might 96 | // use a prefilter!) 97 | ("count", "nfa-noncontiguous/leftmost-first") => { 98 | let ac = builder_ac(&b)? 
99 | .prefilter(false) 100 | .kind(Some(AhoCorasickKind::NoncontiguousNFA)) 101 | .match_kind(MatchKind::LeftmostFirst) 102 | .build(&b.needles)?; 103 | model_count_ac(&b, &ac)? 104 | } 105 | ("count", "nfa-contiguous/leftmost-first") => { 106 | let ac = builder_ac(&b)? 107 | .prefilter(false) 108 | .kind(Some(AhoCorasickKind::ContiguousNFA)) 109 | .match_kind(MatchKind::LeftmostFirst) 110 | .build(&b.needles)?; 111 | model_count_ac(&b, &ac)? 112 | } 113 | ("count", "dfa/leftmost-first") => { 114 | let ac = builder_ac(&b)? 115 | .prefilter(false) 116 | .kind(Some(AhoCorasickKind::DFA)) 117 | .match_kind(MatchKind::LeftmostFirst) 118 | .build(&b.needles)?; 119 | model_count_ac(&b, &ac)? 120 | } 121 | 122 | // And now the packed substring routines. We include a 'compile' 123 | // model here as well because it's nice to know how long, specifically, 124 | // the packed searcher take to build in isolation. 125 | ("compile", "packed/leftmost-first") => { 126 | model_compile_packed(&b, || { 127 | let searcher = aho_corasick::packed::Config::new() 128 | .match_kind(aho_corasick::packed::MatchKind::LeftmostFirst) 129 | .heuristic_pattern_limits(false) 130 | .builder() 131 | .extend(&b.needles) 132 | .build() 133 | .ok_or_else(|| { 134 | anyhow::anyhow!("could not build packed searcher") 135 | })?; 136 | Ok(searcher) 137 | })? 138 | } 139 | ("count", "packed/leftmost-first") => { 140 | let searcher = aho_corasick::packed::Config::new() 141 | .match_kind(aho_corasick::packed::MatchKind::LeftmostFirst) 142 | .heuristic_pattern_limits(false) 143 | .builder() 144 | .extend(&b.needles) 145 | .build() 146 | .ok_or_else(|| { 147 | anyhow::anyhow!("could not build packed searcher") 148 | })?; 149 | model_count_packed(&b, &searcher)? 
150 | } 151 | _ => anyhow::bail!( 152 | "unsupported model/engine pair, model={} engine={}", 153 | b.model, 154 | engine 155 | ), 156 | }; 157 | if !quiet { 158 | let mut stdout = std::io::stdout().lock(); 159 | for s in samples.iter() { 160 | writeln!(stdout, "{},{}", s.duration.as_nanos(), s.count)?; 161 | } 162 | } 163 | Ok(()) 164 | } 165 | 166 | /// Implements the "compile a matcher" model for `AhoCorasick`. 167 | fn model_compile_ac( 168 | b: &Benchmark, 169 | compile: impl FnMut() -> anyhow::Result, 170 | ) -> anyhow::Result> { 171 | let haystack = &*b.haystack; 172 | shared::run_and_count( 173 | b, 174 | |re: AhoCorasick| Ok(re.find_iter(haystack).count()), 175 | compile, 176 | ) 177 | } 178 | 179 | /// Implements the "compile a matcher" model for packed substring search. 180 | fn model_compile_packed( 181 | b: &Benchmark, 182 | compile: impl FnMut() -> anyhow::Result, 183 | ) -> anyhow::Result> { 184 | let haystack = &*b.haystack; 185 | shared::run_and_count( 186 | b, 187 | |re: aho_corasick::packed::Searcher| { 188 | Ok(re.find_iter(haystack).count()) 189 | }, 190 | compile, 191 | ) 192 | } 193 | 194 | /// Implements the "count all matches" model for `AhoCorasick`. 195 | fn model_count_ac( 196 | b: &Benchmark, 197 | ac: &AhoCorasick, 198 | ) -> anyhow::Result> { 199 | let haystack = &*b.haystack; 200 | shared::run(b, || Ok(ac.find_iter(haystack).count())) 201 | } 202 | 203 | /// Implements the "count all overlapping matches" model for `AhoCorasick`. 204 | fn model_count_ac_overlapping( 205 | b: &Benchmark, 206 | ac: &AhoCorasick, 207 | ) -> anyhow::Result> { 208 | let haystack = &*b.haystack; 209 | shared::run(b, || Ok(ac.find_overlapping_iter(haystack).count())) 210 | } 211 | 212 | /// Implements the "count all matches" model for packed substring search. 
213 | fn model_count_packed( 214 | b: &Benchmark, 215 | searcher: &aho_corasick::packed::Searcher, 216 | ) -> anyhow::Result> { 217 | anyhow::ensure!( 218 | !b.case_insensitive, 219 | "rust/aho-corasick/packed engines are incompatible \ 220 | with 'case-insensitive = true'" 221 | ); 222 | 223 | let haystack = &*b.haystack; 224 | shared::run(b, || Ok(searcher.find_iter(haystack).count())) 225 | } 226 | 227 | /// Returns a default builder with as many settings as possible applied from 228 | /// the benchmark definition. If the settings from the definition are not 229 | /// supported, then this returns an error. 230 | fn builder_ac(b: &Benchmark) -> anyhow::Result { 231 | anyhow::ensure!( 232 | !(b.unicode && b.case_insensitive), 233 | "rust/aho-corasick engines are incompatible with 'unicode = true' and \ 234 | 'case-insensitive = true'" 235 | ); 236 | let mut builder = AhoCorasick::builder(); 237 | builder.ascii_case_insensitive(b.case_insensitive); 238 | Ok(builder) 239 | } 240 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-daachorse/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "anyhow" 7 | version = "1.0.75" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" 10 | 11 | [[package]] 12 | name = "bstr" 13 | version = "1.6.2" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" 16 | dependencies = [ 17 | "memchr", 18 | "serde", 19 | ] 20 | 21 | [[package]] 22 | name = "daachorse" 23 | version = "1.0.0" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | checksum = "63b7ef7a4be509357f4804d0a22e830daddb48f19fd604e4ad32ddce04a94c36" 26 | 27 | [[package]] 28 | name = "lexopt" 29 | version = "0.3.0" 30 | source = "registry+https://github.com/rust-lang/crates.io-index" 31 | checksum = "baff4b617f7df3d896f97fe922b64817f6cd9a756bb81d40f8883f2f66dcb401" 32 | 33 | [[package]] 34 | name = "main" 35 | version = "1.0.0" 36 | dependencies = [ 37 | "anyhow", 38 | "daachorse", 39 | "lexopt", 40 | "shared", 41 | ] 42 | 43 | [[package]] 44 | name = "memchr" 45 | version = "2.6.3" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" 48 | 49 | [[package]] 50 | name = "proc-macro2" 51 | version = "1.0.66" 52 | source = "registry+https://github.com/rust-lang/crates.io-index" 53 | checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" 54 | dependencies = [ 55 | "unicode-ident", 56 | ] 57 | 58 | [[package]] 59 | name = "quote" 60 | version = "1.0.33" 61 | source = "registry+https://github.com/rust-lang/crates.io-index" 62 | checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" 63 | dependencies = [ 64 | "proc-macro2", 65 | ] 66 | 67 | [[package]] 68 | name = "serde" 69 | version = "1.0.188" 70 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 71 | checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" 72 | dependencies = [ 73 | "serde_derive", 74 | ] 75 | 76 | [[package]] 77 | name = "serde_derive" 78 | version = "1.0.188" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" 81 | dependencies = [ 82 | "proc-macro2", 83 | "quote", 84 | "syn", 85 | ] 86 | 87 | [[package]] 88 | name = "shared" 89 | version = "0.1.0" 90 | dependencies = [ 91 | "anyhow", 92 | "bstr", 93 | ] 94 | 95 | [[package]] 96 | name = "syn" 97 | version = "2.0.31" 98 | source = "registry+https://github.com/rust-lang/crates.io-index" 99 | checksum = "718fa2415bcb8d8bd775917a1bf12a7931b6dfa890753378538118181e0cb398" 100 | dependencies = [ 101 | "proc-macro2", 102 | "quote", 103 | "unicode-ident", 104 | ] 105 | 106 | [[package]] 107 | name = "unicode-ident" 108 | version = "1.0.11" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" 111 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-daachorse/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "main" 3 | version = "1.0.0" 4 | edition = "2021" 5 | 6 | [[bin]] 7 | name = "main" 8 | path = "main.rs" 9 | 10 | [dependencies] 11 | anyhow = "1.0.69" 12 | daachorse = "=1.0.0" 13 | lexopt = "0.3.0" 14 | 15 | [dependencies.shared] 16 | path = "../../shared" 17 | 18 | [profile.release] 19 | debug = true 20 | codegen-units = 1 21 | lto = "fat" 22 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-daachorse/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use { 4 | 
anyhow::Context, 5 | daachorse::{ 6 | bytewise::{DoubleArrayAhoCorasick, DoubleArrayAhoCorasickBuilder}, 7 | MatchKind, 8 | }, 9 | lexopt::{Arg, ValueExt}, 10 | }; 11 | 12 | use shared::{Benchmark, Sample}; 13 | 14 | fn main() -> anyhow::Result<()> { 15 | let mut p = lexopt::Parser::from_env(); 16 | let (mut engine, mut quiet) = (String::new(), false); 17 | while let Some(arg) = p.next()? { 18 | match arg { 19 | Arg::Short('h') | Arg::Long("help") => { 20 | anyhow::bail!("main [--version | --quiet] ") 21 | } 22 | Arg::Short('q') | Arg::Long("quiet") => { 23 | quiet = true; 24 | } 25 | Arg::Long("version") => { 26 | writeln!(std::io::stdout(), "{}", env!("CARGO_PKG_VERSION"))?; 27 | return Ok(()); 28 | } 29 | Arg::Value(v) => { 30 | anyhow::ensure!( 31 | engine.is_empty(), 32 | "only one engine string allowed" 33 | ); 34 | engine = v.string().context("")?; 35 | anyhow::ensure!( 36 | !engine.is_empty(), 37 | "engine string cannot be empty" 38 | ); 39 | } 40 | _ => return Err(arg.unexpected().into()), 41 | } 42 | } 43 | 44 | let b = Benchmark::from_stdin() 45 | .context("failed to read KLV data from ")?; 46 | let samples = match (b.model.as_str(), engine.as_str()) { 47 | ("compile", "bytewise/standard") => { 48 | model_compile_bytewise_standard(&b)? 49 | } 50 | ("compile", "bytewise/leftmost-first") => { 51 | model_compile_bytewise_leftmost(&b, MatchKind::LeftmostFirst)? 52 | } 53 | ("compile", "bytewise/leftmost-longest") => { 54 | model_compile_bytewise_leftmost(&b, MatchKind::LeftmostLongest)? 55 | } 56 | ("count", "bytewise/standard") => model_count_bytewise_standard(&b)?, 57 | ("count", "bytewise/overlapping") => { 58 | model_count_bytewise_overlapping(&b)? 59 | } 60 | ("count", "bytewise/leftmost-first") => { 61 | model_count_bytewise_leftmost(&b, MatchKind::LeftmostFirst)? 62 | } 63 | ("count", "bytewise/leftmost-longest") => { 64 | model_count_bytewise_leftmost(&b, MatchKind::LeftmostLongest)? 
65 | } 66 | _ => anyhow::bail!( 67 | "unsupported model/engine pair, model={} engine={}", 68 | b.model, 69 | engine 70 | ), 71 | }; 72 | if !quiet { 73 | let mut stdout = std::io::stdout().lock(); 74 | for s in samples.iter() { 75 | writeln!(stdout, "{},{}", s.duration.as_nanos(), s.count)?; 76 | } 77 | } 78 | Ok(()) 79 | } 80 | 81 | /// Implements the "compile a matcher" model for a bytewise daachorse automaton 82 | /// using "standard" (i.e., what's found in a textbook description of 83 | /// Aho-Corasick for a non-overlapping search) match semantics. 84 | fn model_compile_bytewise_standard( 85 | b: &Benchmark, 86 | ) -> anyhow::Result> { 87 | let haystack = &*b.haystack; 88 | shared::run_and_count( 89 | b, 90 | |ac: daachorse::DoubleArrayAhoCorasick| { 91 | Ok(ac.find_iter(haystack).count()) 92 | }, 93 | || compile_bytewise(b, MatchKind::Standard), 94 | ) 95 | } 96 | 97 | /// Implements the "compile a matcher" model for a bytewise daachorse automaton 98 | /// using the given match semantics. The match semantics must be either 99 | /// leftmost-first or leftmost-longest. 100 | fn model_compile_bytewise_leftmost( 101 | b: &Benchmark, 102 | kind: MatchKind, 103 | ) -> anyhow::Result> { 104 | let haystack = &*b.haystack; 105 | shared::run_and_count( 106 | b, 107 | |ac: daachorse::DoubleArrayAhoCorasick| { 108 | Ok(ac.leftmost_find_iter(haystack).count()) 109 | }, 110 | || compile_bytewise(b, kind), 111 | ) 112 | } 113 | 114 | /// Implements a multi-substring algorithm using daachorse's bytewise 115 | /// Aho-Corasick automaton. This uses "standard" match semantics. 116 | fn model_count_bytewise_standard( 117 | b: &Benchmark, 118 | ) -> anyhow::Result> { 119 | let haystack = &*b.haystack; 120 | let ac = compile_bytewise(b, MatchKind::Standard)?; 121 | shared::run(b, || Ok(ac.find_iter(haystack).count())) 122 | } 123 | 124 | /// Implements a multi-substring algorithm using daachorse's bytewise 125 | /// Aho-Corasick automaton. 
This uses "standard" match semantics and finds all 126 | /// overlapping matches. 127 | fn model_count_bytewise_overlapping( 128 | b: &Benchmark, 129 | ) -> anyhow::Result> { 130 | let haystack = &*b.haystack; 131 | let ac = compile_bytewise(b, MatchKind::Standard)?; 132 | shared::run(b, || Ok(ac.find_overlapping_iter(haystack).count())) 133 | } 134 | 135 | /// Implements a multi-substring algorithm using daachorse's bytewise 136 | /// Aho-Corasick automaton. This requires leftmost-first or leftmost-longest 137 | /// match semantics. 138 | fn model_count_bytewise_leftmost( 139 | b: &Benchmark, 140 | kind: MatchKind, 141 | ) -> anyhow::Result> { 142 | let haystack = &*b.haystack; 143 | let ac = compile_bytewise(b, kind)?; 144 | shared::run(b, || Ok(ac.leftmost_find_iter(haystack).count())) 145 | } 146 | 147 | /// Compiles a bytewise daachorse double-array Aho-Corasick automaton from 148 | /// the benchmark's needles using the given match semantics. 149 | fn compile_bytewise( 150 | b: &Benchmark, 151 | kind: MatchKind, 152 | ) -> anyhow::Result> { 153 | anyhow::ensure!( 154 | !b.case_insensitive, 155 | "daachorse doesn't support case insensitive mode", 156 | ); 157 | let result = DoubleArrayAhoCorasickBuilder::new() 158 | .match_kind(kind) 159 | .build(&b.needles); 160 | let ac = match result { 161 | Ok(ac) => ac, 162 | Err(err) => anyhow::bail!("daachorse build failed: {}", err), 163 | }; 164 | Ok(ac) 165 | } 166 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-jetscii/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing.
3 | version = 3 4 | 5 | [[package]] 6 | name = "anyhow" 7 | version = "1.0.72" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854" 10 | 11 | [[package]] 12 | name = "bstr" 13 | version = "1.6.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "6798148dccfbff0fae41c7574d2fa8f1ef3492fba0face179de5d8d447d67b05" 16 | dependencies = [ 17 | "memchr", 18 | "serde", 19 | ] 20 | 21 | [[package]] 22 | name = "jetscii" 23 | version = "0.5.3" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | checksum = "47f142fe24a9c9944451e8349de0a56af5f3e7226dc46f3ed4d4ecc0b85af75e" 26 | 27 | [[package]] 28 | name = "main" 29 | version = "0.5.3" 30 | dependencies = [ 31 | "anyhow", 32 | "jetscii", 33 | "shared", 34 | ] 35 | 36 | [[package]] 37 | name = "memchr" 38 | version = "2.5.0" 39 | source = "registry+https://github.com/rust-lang/crates.io-index" 40 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 41 | 42 | [[package]] 43 | name = "serde" 44 | version = "1.0.178" 45 | source = "registry+https://github.com/rust-lang/crates.io-index" 46 | checksum = "60363bdd39a7be0266a520dab25fdc9241d2f987b08a01e01f0ec6d06a981348" 47 | 48 | [[package]] 49 | name = "shared" 50 | version = "0.1.0" 51 | dependencies = [ 52 | "anyhow", 53 | "bstr", 54 | ] 55 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-jetscii/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "main" 4 | version = "0.5.3" 5 | edition = "2021" 6 | 7 | [workspace] 8 | 9 | [dependencies] 10 | anyhow = "1.0.72" 11 | jetscii = "=0.5.3" 12 | 13 | [dependencies.shared] 14 | path = "../../shared" 15 | 16 | [[bin]] 17 | name = "main" 18 | path = "main.rs" 19 | 20 | [profile.release] 21 | debug = true 22 | 
codegen-units = 1 23 | lto = "fat" 24 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-jetscii/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use shared::{Benchmark, Sample}; 4 | 5 | fn main() -> anyhow::Result<()> { 6 | let Some(arg) = std::env::args_os().nth(1) else { 7 | anyhow::bail!("Usage: runner ( | --version)") 8 | }; 9 | let Ok(arg) = arg.into_string() else { 10 | anyhow::bail!("argument given is not valid UTF-8") 11 | }; 12 | if arg == "--version" { 13 | writeln!(std::io::stdout(), env!("CARGO_PKG_VERSION"))?; 14 | return Ok(()); 15 | } 16 | let engine = arg; 17 | let b = Benchmark::from_stdin()?; 18 | let samples = match (&*engine, &*b.model) { 19 | ("ascii-chars-prebuilt", "count") => memmem_prebuilt_count(&b)?, 20 | ("ascii-chars-oneshot", "count") => memmem_oneshot_count(&b)?, 21 | (engine, model) => { 22 | anyhow::bail!("unrecognized engine '{engine}' and model '{model}'") 23 | } 24 | }; 25 | let mut stdout = std::io::stdout().lock(); 26 | for s in samples.iter() { 27 | writeln!(stdout, "{},{}", s.duration.as_nanos(), s.count)?; 28 | } 29 | Ok(()) 30 | } 31 | 32 | fn memmem_prebuilt_count(b: &Benchmark) -> anyhow::Result> { 33 | let Ok(haystack) = std::str::from_utf8(&b.haystack) else { 34 | anyhow::bail!("jetscii ASCII search requires valid UTF-8 haystack") 35 | }; 36 | let (needles, len) = needle_array(b)?; 37 | let fallback = jetscii_fallback(b)?; 38 | let finder = jetscii::AsciiChars::new(needles, len, fallback); 39 | shared::run(b, || { 40 | let mut haystack = haystack; 41 | let mut count = 0; 42 | while let Some(i) = finder.find(haystack) { 43 | count += 1; 44 | haystack = &haystack[i + 1..]; 45 | } 46 | Ok(count) 47 | }) 48 | } 49 | 50 | fn memmem_oneshot_count(b: &Benchmark) -> anyhow::Result> { 51 | let Ok(haystack) = std::str::from_utf8(&b.haystack) else { 52 | anyhow::bail!("jetscii ASCII search requires 
valid UTF-8 haystack") 53 | }; 54 | let (needles, len) = needle_array(b)?; 55 | let fallback = jetscii_fallback(b)?; 56 | shared::run(b, || { 57 | let finder = jetscii::AsciiChars::new(needles, len, &fallback); 58 | let mut haystack = haystack; 59 | let mut count = 0; 60 | while let Some(i) = finder.find(haystack) { 61 | count += 1; 62 | haystack = &haystack[i + 1..]; 63 | } 64 | Ok(count) 65 | }) 66 | } 67 | 68 | /// Converts the needles from the given benchmark into a fixed size 16-element 69 | /// array along with the number of actual needles in the array (which may be 70 | /// less than 16). 71 | /// 72 | /// If any needle is more than one byte or there are too many needles to fit 73 | /// into a 16-element array, then this returns an error. This also returns an 74 | /// error if any of the bytes are not ASCII. 75 | fn needle_array(b: &Benchmark) -> anyhow::Result<([u8; 16], i32)> { 76 | let mut array = [0u8; 16]; 77 | let needles = b.needle_bytes()?; 78 | let Ok(len) = i32::try_from(needles.len()) else { 79 | anyhow::bail!("needle length {} could not fit into i32", needles.len()) 80 | }; 81 | anyhow::ensure!( 82 | needles.len() <= 16, 83 | "jetscii only supports at most 16 single byte needles, \ 84 | but found {} needles", 85 | needles.len(), 86 | ); 87 | for (i, byte) in needles.into_iter().enumerate() { 88 | array[i] = byte; 89 | } 90 | Ok((array, len)) 91 | } 92 | 93 | /// Create a fallback predicate for jetscii's up-to-16-bytes search. 94 | fn jetscii_fallback(b: &Benchmark) -> anyhow::Result bool> { 95 | let mut set = vec![false; 256]; 96 | for byte in b.needle_bytes()? 
{ 97 | set[usize::from(byte)] = true; 98 | } 99 | Ok(move |byte| set[usize::from(byte)]) 100 | } 101 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-old-aho-corasick/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-old-aho-corasick/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.0.5" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anyhow" 16 | version = "1.0.69" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" 19 | 20 | [[package]] 21 | name = "bstr" 22 | version = "1.6.2" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" 25 | dependencies = [ 26 | "memchr", 27 | "serde", 28 | ] 29 | 30 | [[package]] 31 | name = "lexopt" 32 | version = "0.3.0" 33 | source = "registry+https://github.com/rust-lang/crates.io-index" 34 | checksum = "baff4b617f7df3d896f97fe922b64817f6cd9a756bb81d40f8883f2f66dcb401" 35 | 36 | [[package]] 37 | name = "main" 38 | version = "1.0.5" 39 | dependencies = [ 40 | "aho-corasick", 41 | "anyhow", 42 | "lexopt", 43 | "shared", 44 | ] 45 | 46 | [[package]] 47 | name = "memchr" 48 | version = "2.6.3" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = 
"8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" 51 | 52 | [[package]] 53 | name = "serde" 54 | version = "1.0.152" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" 57 | 58 | [[package]] 59 | name = "shared" 60 | version = "0.1.0" 61 | dependencies = [ 62 | "anyhow", 63 | "bstr", 64 | ] 65 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-old-aho-corasick/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "main" 3 | version = "1.0.5" 4 | edition = "2021" 5 | 6 | [[bin]] 7 | name = "main" 8 | path = "main.rs" 9 | 10 | [dependencies] 11 | anyhow = "1.0.69" 12 | lexopt = "0.3.0" 13 | aho-corasick = "=1.0.5" 14 | 15 | [dependencies.shared] 16 | path = "../../shared" 17 | 18 | [profile.release] 19 | debug = true 20 | codegen-units = 1 21 | lto = "fat" 22 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-old-aho-corasick/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a Rust runner program for benchmarking the 2 | [`aho-corasick` crate][rust-aho-corasick]. The `aho-corasick` crate 3 | principally implements the [Aho-Corasick algorithm][aho-corasick], although 4 | it has other algorithms for multiple substring search, such as [Teddy], which 5 | was ported from the Hyperscan project. 6 | 7 | The `aho-corasick` crate is used by [Rust's `regex` crate][rust-regex] to 8 | implement fast prefilters that permit finding candidates very quickly and only 9 | needing to use the regex engine to confirm the match. The Teddy algorithm is 10 | particularly excellent here. (Sometimes `aho-corasick` is used as the regex 11 | engine itself, for example, when the regex is just an alternation of literals.) 
12 | 13 | Since the `aho-corasick` crate only supports searching for literal strings, this 14 | engine should only be used for regex patterns that are literals. This is up to 15 | the author of the benchmark definition, as this runner program will always 16 | treat regex patterns as literals. 17 | 18 | This also means that this runner program cannot support all benchmark models. 19 | Only the `compile`, `count`, `count-spans` and `grep` models are supported. 20 | 21 | Finally, this runner program supports measuring two different Aho-Corasick 22 | implementations: `nfa` and `dfa`. The former follows failure transitions at 23 | search time and is thus usually slower, whereas the latter builds a full 24 | transition table by pre-computing all failure transitions. The latter tends 25 | to be faster at search time, but can use orders (plural) of magnitude more 26 | memory. In both the `nfa` and `dfa` engines, prefilters inside of Aho-Corasick 27 | are disabled. 28 | 29 | [rust-aho-corasick]: https://github.com/BurntSushi/aho-corasick 30 | [aho-corasick]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm 31 | [Teddy]: https://github.com/BurntSushi/aho-corasick/tree/4e7fa3b85dd3a3ce882896f1d4ee22b1f271f0b4/src/packed/teddy 32 | [rust-regex]: https://github.com/rust-lang/regex 33 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-old-aho-corasick/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use { 4 | aho_corasick::{ 5 | AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, MatchKind, 6 | }, 7 | anyhow::Context, 8 | lexopt::{Arg, ValueExt}, 9 | }; 10 | 11 | use shared::{Benchmark, Sample}; 12 | 13 | fn main() -> anyhow::Result<()> { 14 | let mut p = lexopt::Parser::from_env(); 15 | let (mut engine, mut quiet) = (String::new(), false); 16 | while let Some(arg) = p.next()?
{ 17 | match arg { 18 | Arg::Short('h') | Arg::Long("help") => { 19 | anyhow::bail!("main [--version | --quiet] ") 20 | } 21 | Arg::Short('q') | Arg::Long("quiet") => { 22 | quiet = true; 23 | } 24 | Arg::Long("version") => { 25 | writeln!(std::io::stdout(), "{}", env!("CARGO_PKG_VERSION"))?; 26 | return Ok(()); 27 | } 28 | Arg::Value(v) => { 29 | anyhow::ensure!( 30 | engine.is_empty(), 31 | "only one engine string allowed" 32 | ); 33 | engine = v.string().context("")?; 34 | anyhow::ensure!( 35 | !engine.is_empty(), 36 | "engine string cannot be empty" 37 | ); 38 | } 39 | _ => return Err(arg.unexpected().into()), 40 | } 41 | } 42 | 43 | let b = Benchmark::from_stdin() 44 | .context("failed to read KLV data from ")?; 45 | let samples = match (b.model.as_str(), engine.as_str()) { 46 | // These first 6 configurations are meant to test the default settings 47 | // on each of {compile, count} x {standard, leftmost-{first,longest}}. 48 | // We don't also test each of them with {nfa/(non-)?contiguous, dfa} 49 | // because it would just get ridiculous. 50 | ("compile", "default/standard") => { 51 | model_compile_ac(&b, || Ok(builder_ac(&b)?.build(&b.needles)?))? 52 | } 53 | ("compile", "default/leftmost-first") => model_compile_ac(&b, || { 54 | Ok(builder_ac(&b)? 55 | .match_kind(MatchKind::LeftmostFirst) 56 | .build(&b.needles)?) 57 | })?, 58 | ("compile", "default/leftmost-longest") => { 59 | model_compile_ac(&b, || { 60 | Ok(builder_ac(&b)? 61 | .match_kind(MatchKind::LeftmostLongest) 62 | .build(&b.needles)?) 63 | })? 64 | } 65 | ("count", "default/standard") => { 66 | let ac = builder_ac(&b)?.build(&b.needles)?; 67 | model_count_ac(&b, &ac)? 68 | } 69 | ("count", "default/leftmost-first") => { 70 | let ac = builder_ac(&b)? 71 | .match_kind(MatchKind::LeftmostFirst) 72 | .build(&b.needles)?; 73 | model_count_ac(&b, &ac)? 74 | } 75 | ("count", "default/leftmost-longest") => { 76 | let ac = builder_ac(&b)? 
77 | .match_kind(MatchKind::LeftmostLongest) 78 | .build(&b.needles)?; 79 | model_count_ac(&b, &ac)? 80 | } 81 | 82 | // OK, now we start testing the specific Aho-Corasick automatons, but 83 | // we just focus on leftmost-first because that's the case we tend to 84 | // be more interested in optimizing in practice. There's also likely 85 | // to not be much of a perf difference between leftmost-first and 86 | // leftmost-longest. 87 | // 88 | // We also specifically disable prefilters so that we know we're always 89 | // measuring the actual automaton. (The 'default' engines above might 90 | // use a prefilter!) 91 | ("count", "nfa-noncontiguous/leftmost-first") => { 92 | let ac = builder_ac(&b)? 93 | .prefilter(false) 94 | .kind(Some(AhoCorasickKind::NoncontiguousNFA)) 95 | .match_kind(MatchKind::LeftmostFirst) 96 | .build(&b.needles)?; 97 | model_count_ac(&b, &ac)? 98 | } 99 | ("count", "nfa-contiguous/leftmost-first") => { 100 | let ac = builder_ac(&b)? 101 | .prefilter(false) 102 | .kind(Some(AhoCorasickKind::ContiguousNFA)) 103 | .match_kind(MatchKind::LeftmostFirst) 104 | .build(&b.needles)?; 105 | model_count_ac(&b, &ac)? 106 | } 107 | ("count", "dfa/leftmost-first") => { 108 | let ac = builder_ac(&b)? 109 | .prefilter(false) 110 | .kind(Some(AhoCorasickKind::DFA)) 111 | .match_kind(MatchKind::LeftmostFirst) 112 | .build(&b.needles)?; 113 | model_count_ac(&b, &ac)? 114 | } 115 | 116 | // And now the packed substring routines. We include a 'compile' 117 | // model here as well because it's nice to know how long, specifically, 118 | // the packed searcher take to build in isolation. 119 | ("compile", "packed/leftmost-first") => { 120 | model_compile_packed(&b, || { 121 | let searcher = aho_corasick::packed::Config::new() 122 | .match_kind(aho_corasick::packed::MatchKind::LeftmostFirst) 123 | .builder() 124 | .extend(&b.needles) 125 | .build() 126 | .ok_or_else(|| { 127 | anyhow::anyhow!("could not build packed searcher") 128 | })?; 129 | Ok(searcher) 130 | })? 
131 | } 132 | ("count", "packed/leftmost-first") => { 133 | let searcher = aho_corasick::packed::Config::new() 134 | .match_kind(aho_corasick::packed::MatchKind::LeftmostFirst) 135 | .builder() 136 | .extend(&b.needles) 137 | .build() 138 | .ok_or_else(|| { 139 | anyhow::anyhow!("could not build packed searcher") 140 | })?; 141 | model_count_packed(&b, &searcher)? 142 | } 143 | _ => anyhow::bail!( 144 | "unsupported model/engine pair, model={} engine={}", 145 | b.model, 146 | engine 147 | ), 148 | }; 149 | if !quiet { 150 | let mut stdout = std::io::stdout().lock(); 151 | for s in samples.iter() { 152 | writeln!(stdout, "{},{}", s.duration.as_nanos(), s.count)?; 153 | } 154 | } 155 | Ok(()) 156 | } 157 | 158 | /// Implements the "compile a matcher" model for `AhoCorasick`. 159 | fn model_compile_ac( 160 | b: &Benchmark, 161 | compile: impl FnMut() -> anyhow::Result, 162 | ) -> anyhow::Result> { 163 | let haystack = &*b.haystack; 164 | shared::run_and_count( 165 | b, 166 | |re: AhoCorasick| Ok(re.find_iter(haystack).count()), 167 | compile, 168 | ) 169 | } 170 | 171 | /// Implements the "compile a matcher" model for packed substring search. 172 | fn model_compile_packed( 173 | b: &Benchmark, 174 | compile: impl FnMut() -> anyhow::Result, 175 | ) -> anyhow::Result> { 176 | let haystack = &*b.haystack; 177 | shared::run_and_count( 178 | b, 179 | |re: aho_corasick::packed::Searcher| { 180 | Ok(re.find_iter(haystack).count()) 181 | }, 182 | compile, 183 | ) 184 | } 185 | 186 | /// Implements the "count all matches" model for `AhoCorasick`. 187 | fn model_count_ac( 188 | b: &Benchmark, 189 | ac: &AhoCorasick, 190 | ) -> anyhow::Result> { 191 | let haystack = &*b.haystack; 192 | shared::run(b, || Ok(ac.find_iter(haystack).count())) 193 | } 194 | 195 | /// Implements the "count all matches" model for packed substring search. 
196 | fn model_count_packed( 197 | b: &Benchmark, 198 | searcher: &aho_corasick::packed::Searcher, 199 | ) -> anyhow::Result> { 200 | anyhow::ensure!( 201 | !b.case_insensitive, 202 | "rust/aho-corasick/packed engines are incompatible \ 203 | with 'case-insensitive = true'" 204 | ); 205 | 206 | let haystack = &*b.haystack; 207 | shared::run(b, || Ok(searcher.find_iter(haystack).count())) 208 | } 209 | 210 | /// Returns a default builder with as many settings as possible applied from 211 | /// the benchmark definition. If the settings from the definition are not 212 | /// supported, then this returns an error. 213 | fn builder_ac(b: &Benchmark) -> anyhow::Result { 214 | anyhow::ensure!( 215 | !(b.unicode && b.case_insensitive), 216 | "rust/aho-corasick engines are incompatible with 'unicode = true' and \ 217 | 'case-insensitive = true'" 218 | ); 219 | let mut builder = AhoCorasick::builder(); 220 | builder.ascii_case_insensitive(b.case_insensitive); 221 | Ok(builder) 222 | } 223 | -------------------------------------------------------------------------------- /benchmarks/haystacks/catalog.data.gov/README.md: -------------------------------------------------------------------------------- 1 | Source: https://catalog.data.gov/dataset/ 2 | 3 | Mental health: 4 | https://catalog.data.gov/dataset/mental-health-care-in-the-last-4-weeks 5 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/README.md: -------------------------------------------------------------------------------- 1 | These were downloaded and derived from the Open Subtitles data set: 2 | https://opus.nlpl.eu/OpenSubtitles-v2018.php 3 | 4 | The specific way in which they were modified has been lost to time, but it's 5 | likely they were just a simple truncation based on target file sizes for 6 | various benchmarks. 7 | 8 | The main reason why we have them is that it gives us a way to test similar 9 | inputs on non-ASCII text. 
Normally this wouldn't matter for a substring search 10 | implementation, but because of the heuristics used to pick a priori determined 11 | "rare bytes" to base a prefilter on, it's possible for this heuristic to do 12 | more poorly on non-ASCII text than one might expect. 13 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/en-small.txt: -------------------------------------------------------------------------------- 1 | Now you can tell 'em. 2 | What for are you mixing in? 3 | Maybe I don't like to see kids get hurt. 4 | Break any bones, son? 5 | He's got a knife behind his collar! 6 | - There's a stirrup. 7 | You want a lift? 8 | - No. 9 | - Why not? 10 | - I'm beholden to you, mister. 11 | Couldn't we just leave it that way? 12 | - Morning. 13 | - Morning. 14 | - Put him up? 15 | - For how long? 16 | - I wouldn't know. 17 | - It'll be two bits for oats. 18 | - Ain't I seen you before? 19 | - Depends on where you've been. 20 | - I follow the railroad, mostly. 21 | - Could be you've seen me. 22 | - It'll be four bits if he stays the night. 23 | - Fair enough. 24 | Morning. 25 | Did a man ride in today - tall, sort of heavyset? 26 | - You mean him, Mr Renner? 27 | - Not him. 28 | This one had a scar. 29 | Along his cheek? 30 | No, sir. 31 | I don't see no man with a scar. 32 | I guess maybe I can have some apple pie and coffee. 33 | I guess you could have eggs with bacon if you wanted eggs with bacon. 34 | - Hello, Charlie. 35 | - Hello, Grant. 36 | It's good to see you, Charlie. 37 | It's awful good to see you. 38 | It's good to see you too. 39 | Doc you're beginning to sound like Sherlock Holmes. 40 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/en-teeny.txt: -------------------------------------------------------------------------------- 1 | Sound like Sherlock Holmes. 
2 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/en-tiny.txt: -------------------------------------------------------------------------------- 1 | I saw you before but I didn't think you were this young 2 | Doc you're beginning to sound like Sherlock Holmes. 3 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/ru-small.txt: -------------------------------------------------------------------------------- 1 | -Две недели не даешь мне прохода. 2 | Вот и действуй, чем ты рискуешь? 3 | Я думал, что сделаю тебя счастливой. 4 | Тоже мне счастье. 5 | Муж не дает ни гроша, и у любовника ума не хватает подумать о деньгах. 6 | - Хорошенькое счастье. 7 | - Извини, я думал, ты любишь меня. 8 | Ну люблю, люблю тебя, но и не хочу, чтобы все началось как в прошлый раз. 9 | Ты не права. 10 | У меня для тебя сюрприз. 11 | Шлихтовальная машина, ты о ней давно мечтала. 12 | -Для костей? 13 | - Нет, настоящая. 14 | Хочешь, приходи за ней вечером. 15 | Я тебе не девочка. 16 | Была бы ты девочкой, я бы тебе ее не купил. 17 | Я люблю тебя 18 | Митч МакКафи, летающий Шерлок Холмс. 19 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/ru-teeny.txt: -------------------------------------------------------------------------------- 1 | летающий Шерлок Холмс. 2 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/ru-tiny.txt: -------------------------------------------------------------------------------- 1 | Это - одно из самых поразительных недавних открытий науки. 2 | Митч МакКафи, летающий Шерлок Холмс. 
3 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/zh-small.txt: -------------------------------------------------------------------------------- 1 | 魯哇克香貓咖啡 世界上最稀有的飲品 Kopi luwak. 2 | the rarest beverage in the world. 3 | 嘗一小口 Take a whiff. 4 | 來 Go ahead. 5 | 寇爾先生 董事會已準備好聽你的提案 Uh, mr. 6 | cole, the board is ready to hear your proposal. 7 | 等一下下 Hold on just a second. 8 | 來 繼續 Go ahead. 9 | go on. 10 | 怎樣 Well? 11 | 真不錯 Really good. 12 | 真不錯 Really good. 13 | 寇爾先生? 14 | Mr. 15 | cole. 16 | sir? 17 | 吉姆 你知道庸俗是什麼嗎 Do you know what a philistine is, jim? 18 | 先生 我叫理查德 Sir, it's richard. 19 | 沒錯 費爾 出動你的如簧巧舌吧 That's right, phil. 20 | give them the spiel. 21 | 謝謝 主席先生 主管們 Thank you, mr. 22 | chairman, fellow supervisors. 23 | 我們寇爾集團財務的管理不善 We at the cole group feel the decline of the winwood hospital... 24 | 直接造成了溫伍德醫院的衰敗 ...is a direct result of significant fiscal mismanagement. 25 | 請原諒 我們醫院... 26 | I beg your pardon, this hospital... 27 | 日常開支近2倍 overhead costs are nearly double. 28 | 帽子不错 汤姆 夏洛克·福尔摩斯 29 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/zh-teeny.txt: -------------------------------------------------------------------------------- 1 | 汤姆 夏洛克·福尔摩斯 2 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/zh-tiny.txt: -------------------------------------------------------------------------------- 1 | 谁是早餐界的冠军? 
2 | 你突然来信说最近要搬到这里 3 | 帽子不错 汤姆 夏洛克·福尔摩斯 4 | -------------------------------------------------------------------------------- /benchmarks/regexes/words-100: -------------------------------------------------------------------------------- 1 | stampeding 2 | commendable 3 | adrenaline 4 | exobiology 5 | indifference 6 | avuncular 7 | prevailed 8 | foreparts 9 | legalistically 10 | intermarries 11 | desideratum 12 | evaluating 13 | lavishing 14 | attractable 15 | philippics 16 | antiabortionist 17 | lascivious 18 | breathable 19 | histogram 20 | rattlings 21 | interdict 22 | summarized 23 | relieving 24 | congresspeople 25 | fitfulness 26 | percolation 27 | upperclasswoman 28 | epistemic 29 | Chantilly 30 | stonemasons 31 | nonferrous 32 | emulsions 33 | charitably 34 | barracudas 35 | integrity 36 | knockdowns 37 | roadworks 38 | abortionists 39 | Salvadoran 40 | chanceries 41 | misinform 42 | caretaker 43 | extricated 44 | mandolins 45 | steeliest 46 | transpiration 47 | weirdness 48 | audiologists 49 | baronetcies 50 | performing 51 | publishing 52 | suspending 53 | dermatological 54 | contemplate 55 | spiritless 56 | nightwatchman 57 | paradisaical 58 | implicating 59 | timpanists 60 | Leavenworth 61 | amorality 62 | strangulated 63 | cellophane 64 | waterboard 65 | astrophysicists 66 | aerospace 67 | passphrase 68 | engendered 69 | spotlighting 70 | misapplication 71 | barterers 72 | poetesses 73 | dollhouse 74 | laparoscopic 75 | Dubrovnik 76 | rerecords 77 | shielding 78 | orthographically 79 | thicknesses 80 | Bendictus 81 | congealed 82 | cooperative 83 | encompass 84 | grouching 85 | shipowners 86 | jealously 87 | generational 88 | antecedents 89 | persecutes 90 | exemplified 91 | admirable 92 | squeakiest 93 | absconding 94 | extirpated 95 | exoskeletons 96 | earthworms 97 | chaotically 98 | shipbuilder 99 | equidistantly 100 | overprint 101 | -------------------------------------------------------------------------------- /benchmarks/shared/Cargo.lock: 
-------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "anyhow" 7 | version = "1.0.72" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854" 10 | 11 | [[package]] 12 | name = "bstr" 13 | version = "1.6.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "6798148dccfbff0fae41c7574d2fa8f1ef3492fba0face179de5d8d447d67b05" 16 | dependencies = [ 17 | "memchr", 18 | "serde", 19 | ] 20 | 21 | [[package]] 22 | name = "memchr" 23 | version = "2.5.0" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 26 | 27 | [[package]] 28 | name = "serde" 29 | version = "1.0.177" 30 | source = "registry+https://github.com/rust-lang/crates.io-index" 31 | checksum = "63ba2516aa6bf82e0b19ca8b50019d52df58455d3cf9bdaf6315225fdd0c560a" 32 | 33 | [[package]] 34 | name = "shared" 35 | version = "0.1.0" 36 | dependencies = [ 37 | "anyhow", 38 | "bstr", 39 | ] 40 | -------------------------------------------------------------------------------- /benchmarks/shared/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "shared" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [workspace] 7 | 8 | [dependencies] 9 | anyhow = "1.0.69" 10 | bstr = { version = "1.6.0", default-features = false, features = ["std"] } 11 | 12 | [lib] 13 | name = "shared" 14 | path = "lib.rs" 15 | -------------------------------------------------------------------------------- /benchmarks/shared/lib.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | io::Read, 3 | time::{Duration, Instant}, 4 | }; 5 | 6 | use { 
7 | anyhow::Context, 8 | bstr::{ByteSlice, ByteVec}, 9 | }; 10 | 11 | /// A single benchmark configuration read from a sequence of KLV items on 12 | /// stdin. 13 | #[derive(Clone, Debug, Default)] 14 | pub struct Benchmark { 15 | pub name: String, 16 | pub model: String, 17 | pub needles: Vec>, 18 | pub haystack: Vec, 19 | pub case_insensitive: bool, 20 | pub unicode: bool, 21 | pub max_iters: u64, 22 | pub max_warmup_iters: u64, 23 | pub max_time: Duration, 24 | pub max_warmup_time: Duration, 25 | } 26 | 27 | impl Benchmark { 28 | /// Read the KLV benchmark configuration from stdin. 29 | pub fn from_stdin() -> anyhow::Result { 30 | let mut raw = vec![]; 31 | std::io::stdin().read_to_end(&mut raw)?; 32 | Benchmark::read(&raw) 33 | } 34 | 35 | /// Return single byte needles from this benchmark definition. If any 36 | /// needle is more than one byte, then this returns an error. 37 | pub fn needle_bytes(&self) -> anyhow::Result> { 38 | let mut needles = vec![]; 39 | for needle in self.needles.iter() { 40 | anyhow::ensure!( 41 | needle.len() == 1, 42 | "needle must have length 1 (in bytes) but it has length {}", 43 | needle.len(), 44 | ); 45 | needles.push(needle[0]); 46 | } 47 | Ok(needles) 48 | } 49 | 50 | fn read(mut raw: &[u8]) -> anyhow::Result { 51 | let mut config = Benchmark::default(); 52 | while !raw.is_empty() { 53 | let (klv, nread) = OneKLV::read(raw)?; 54 | raw = &raw[nread..]; 55 | config.set(klv)?; 56 | } 57 | Ok(config) 58 | } 59 | 60 | fn set(&mut self, klv: OneKLV) -> anyhow::Result<()> { 61 | let parse_duration = |v: &str| -> anyhow::Result { 62 | Ok(Duration::from_nanos(v.parse()?)) 63 | }; 64 | let OneKLV { key, value } = klv; 65 | match &*key { 66 | "name" => self.name = value.to_str()?.to_string(), 67 | "model" => self.model = value.to_str()?.to_string(), 68 | "pattern" => { 69 | self.needles.push(Vec::unescape_bytes(value.to_str()?)) 70 | } 71 | "haystack" => self.haystack = value.to_vec(), 72 | "case-insensitive" => { 73 | 
self.case_insensitive = value.to_str()?.parse()? 74 | } 75 | "unicode" => self.unicode = value.to_str()?.parse()?, 76 | "max-iters" => self.max_iters = value.to_str()?.parse()?, 77 | "max-warmup-iters" => { 78 | self.max_warmup_iters = value.to_str()?.parse()? 79 | } 80 | "max-time" => self.max_time = parse_duration(value.to_str()?)?, 81 | "max-warmup-time" => { 82 | self.max_warmup_time = parse_duration(value.to_str()?)? 83 | } 84 | _ => {} 85 | } 86 | Ok(()) 87 | } 88 | } 89 | 90 | #[derive(Clone, Debug)] 91 | struct OneKLV { 92 | key: String, 93 | value: Vec, 94 | } 95 | 96 | impl OneKLV { 97 | fn read(bytes: &[u8]) -> anyhow::Result<(OneKLV, usize)> { 98 | let mut nread = 0; 99 | let (key, bytes) = match bytes.split_once_str(":") { 100 | Some(x) => x, 101 | None => anyhow::bail!( 102 | "failed to find first ':' in key-length-value item \ 103 | where the next (at most) 80 bytes are: {:?}", 104 | bytes[..std::cmp::min(80, bytes.len())].as_bstr(), 105 | ), 106 | }; 107 | nread += key.len() + 1; // +1 for ':' 108 | let key = key 109 | .to_str() 110 | .with_context(|| { 111 | format!("key {:?} is not valid UTF-8", key.as_bstr()) 112 | })? 
113 | .to_string(); 114 | 115 | let (len, bytes) = match bytes.split_once_str(":") { 116 | Some(x) => x, 117 | None => anyhow::bail!( 118 | "failed to find second ':' in key-length-value item \ 119 | for key '{}'", 120 | key, 121 | ), 122 | }; 123 | nread += len.len() + 1; // +1 for ':' 124 | let len = len.to_str().with_context(|| { 125 | format!("length for key '{}' is not valid UTF-8", key) 126 | })?; 127 | let len = len.parse::().with_context(|| { 128 | format!( 129 | "length '{}' for key '{}' is not a valid integer", 130 | len, key, 131 | ) 132 | })?; 133 | 134 | anyhow::ensure!( 135 | bytes.len() >= len, 136 | "got length of {} for key '{}', but only {} bytes remain", 137 | len, 138 | key, 139 | bytes.len(), 140 | ); 141 | let value = bytes[..len].into(); 142 | let bytes = &bytes[len..]; 143 | nread += len; 144 | 145 | anyhow::ensure!( 146 | bytes.len() >= 1, 147 | "expected trailing '\\n' after value, but got EOF", 148 | ); 149 | anyhow::ensure!( 150 | bytes[0] == b'\n', 151 | "expected '\\n' after value, but got {:?}", 152 | bytes[0..1].as_bstr(), 153 | ); 154 | nread += 1; 155 | 156 | let klv = OneKLV { key, value }; 157 | Ok((klv, nread)) 158 | } 159 | } 160 | 161 | /// A sample computed from a single benchmark iteration. 162 | #[derive(Clone, Debug)] 163 | pub struct Sample { 164 | /// The duration of the iteration. 165 | pub duration: Duration, 166 | /// The count reported by the benchmark. This is used by the harness to 167 | /// verify that the result is correct. 168 | /// 169 | /// All benchmark models except for regex-redux use this. For regex-redux, 170 | /// it is always zero. 171 | pub count: u64, 172 | } 173 | 174 | /// Run the given `bench` function repeatedly until either the maximum 175 | /// time or number of iterations has been reached and return the set of 176 | /// samples. 
177 | pub fn run( 178 | b: &Benchmark, 179 | bench: impl FnMut() -> anyhow::Result, 180 | ) -> anyhow::Result> { 181 | run_and_count(b, |count| Ok(count), bench) 182 | } 183 | 184 | /// Run the given `bench` function repeatedly until either the maximum 185 | /// time or number of iterations has been reached and return the set of 186 | /// samples. The count for each sample is determined by running `count` on 187 | /// the result of `bench`. The execution time of `count` is specifically 188 | /// not included in the sample's duration. 189 | /// 190 | /// N.B. This variant only exists for the 'compile' model. We want to only 191 | /// measure compile time, but still do extra work that we specifically 192 | /// don't measure to produce a count to ensure the compile regex behaves as 193 | /// expected. 194 | pub fn run_and_count( 195 | b: &Benchmark, 196 | mut count: impl FnMut(T) -> anyhow::Result, 197 | mut bench: impl FnMut() -> anyhow::Result, 198 | ) -> anyhow::Result> { 199 | let warmup_start = Instant::now(); 200 | for _ in 0..b.max_warmup_iters { 201 | let result = bench(); 202 | // We still compute the count in case there was a problem doing so, 203 | // even though we don't do anything with the count. 204 | let _count = count(result?)?; 205 | if warmup_start.elapsed() >= b.max_warmup_time { 206 | break; 207 | } 208 | } 209 | 210 | let mut samples = vec![]; 211 | let run_start = Instant::now(); 212 | for _ in 0..b.max_iters { 213 | let bench_start = Instant::now(); 214 | let result = bench(); 215 | let duration = bench_start.elapsed(); 216 | // Should be fine since it's unreasonable for a match count to 217 | // exceed u64::MAX. 
218 | let count = u64::try_from(count(result?)?).unwrap(); 219 | samples.push(Sample { duration, count }); 220 | if run_start.elapsed() >= b.max_time { 221 | break; 222 | } 223 | } 224 | Ok(samples) 225 | } 226 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | /Cargo.lock 2 | /artifacts 3 | /corpus 4 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "aho-corasick-fuzz" 4 | version = "0.0.0" 5 | authors = ["Automatically generated"] 6 | edition = "2021" 7 | 8 | # Prevent this from interfering with workspaces 9 | [workspace] 10 | members = ["."] 11 | 12 | [dependencies] 13 | aho-corasick = { path = ".." } 14 | libfuzzer-sys = { version = "0.4", features = ["arbitrary-derive"] } 15 | 16 | [[bin]] 17 | name = "fuzz-find" 18 | path = "fuzz-targets/fuzz_find.rs" 19 | test = false 20 | doc = false 21 | 22 | [package.metadata] 23 | cargo-fuzz = true 24 | -------------------------------------------------------------------------------- /fuzz/fuzz-targets/fuzz_find.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use libfuzzer_sys::{arbitrary, fuzz_target}; 4 | 5 | use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind}; 6 | 7 | #[derive(arbitrary::Arbitrary, Debug, Clone)] 8 | enum Operation { 9 | Find(String), 10 | ReplaceAll(String, Vec), 11 | } 12 | 13 | #[derive(arbitrary::Arbitrary, Debug, Clone)] 14 | struct Inputs { 15 | patterns: Vec, 16 | kind: u8, 17 | match_kind: u8, 18 | ascii_case_insensitive: bool, 19 | dense_depth: Option, 20 | prefilter: bool, 21 | operation: Operation, 22 | byte_classes: bool, 23 | } 24 | 25 | fuzz_target!(|input: Inputs| { 26 | let mut acb = AhoCorasick::builder(); 27 | 
acb.ascii_case_insensitive(input.ascii_case_insensitive) 28 | .prefilter(input.prefilter) 29 | .byte_classes(input.byte_classes); 30 | match input.kind % 5 { 31 | 0 => &mut acb, 32 | 1 => acb.kind(None), 33 | 2 => acb.kind(Some(AhoCorasickKind::NoncontiguousNFA)), 34 | 3 => acb.kind(Some(AhoCorasickKind::ContiguousNFA)), 35 | 4 => acb.kind(Some(AhoCorasickKind::DFA)), 36 | _ => unreachable!(), 37 | }; 38 | match input.match_kind % 4 { 39 | 0 => &mut acb, 40 | 1 => acb.match_kind(MatchKind::Standard), 41 | 2 => acb.match_kind(MatchKind::LeftmostFirst), 42 | 3 => acb.match_kind(MatchKind::LeftmostLongest), 43 | _ => unreachable!(), 44 | }; 45 | if let Some(dense_depth) = input.dense_depth { 46 | acb.dense_depth(dense_depth); 47 | } 48 | 49 | let num_patterns = input.patterns.len(); 50 | let ac = acb.build(input.patterns).unwrap(); 51 | match input.operation { 52 | Operation::Find(haystack) => { 53 | ac.find(&haystack); 54 | } 55 | Operation::ReplaceAll(haystack, substitutions) => { 56 | if substitutions.len() == num_patterns { 57 | ac.replace_all(&haystack, &substitutions); 58 | } 59 | } 60 | } 61 | }); 62 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 79 2 | use_small_heuristics = "max" 3 | -------------------------------------------------------------------------------- /src/macros.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_macros)] 2 | 3 | macro_rules! log { 4 | ($($tt:tt)*) => { 5 | #[cfg(feature = "logging")] 6 | { 7 | $($tt)* 8 | } 9 | } 10 | } 11 | 12 | macro_rules! debug { 13 | ($($tt:tt)*) => { log!(log::debug!($($tt)*)) } 14 | } 15 | 16 | macro_rules! 
trace { 17 | ($($tt:tt)*) => { log!(log::trace!($($tt)*)) } 18 | } 19 | -------------------------------------------------------------------------------- /src/nfa/mod.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Provides direct access to NFA implementations of Aho-Corasick. 3 | 4 | The principle characteristic of an NFA in this crate is that it may 5 | transition through multiple states per byte of haystack. In Aho-Corasick 6 | parlance, NFAs follow failure transitions during a search. In contrast, 7 | a [`DFA`](crate::dfa::DFA) pre-computes all failure transitions during 8 | compilation at the expense of a much bigger memory footprint. 9 | 10 | Currently, there are two NFA implementations provided: noncontiguous and 11 | contiguous. The names reflect their internal representation, and consequently, 12 | the trade offs associated with them: 13 | 14 | * A [`noncontiguous::NFA`] uses a separate allocation for every NFA state to 15 | represent its transitions in a sparse format. This is ideal for building an 16 | NFA, since it cheaply permits different states to have a different number of 17 | transitions. A noncontiguous NFA is where the main Aho-Corasick construction 18 | algorithm is implemented. All other Aho-Corasick implementations are built by 19 | first constructing a noncontiguous NFA. 20 | * A [`contiguous::NFA`] is uses a single allocation to represent all states, 21 | while still encoding most states as sparse states but permitting states near 22 | the starting state to have a dense representation. The dense representation 23 | uses more memory, but permits computing transitions during a search more 24 | quickly. By only making the most active states dense (the states near the 25 | starting state), a contiguous NFA better balances memory usage with search 26 | speed. 
The single contiguous allocation also uses less overhead per state and 27 | enables compression tricks where most states only use 8 bytes of heap memory. 28 | 29 | When given the choice between these two, you almost always want to pick a 30 | contiguous NFA. It takes only a little longer to build, but both its memory 31 | usage and search speed are typically much better than a noncontiguous NFA. A 32 | noncontiguous NFA is useful when prioritizing build times, or when there are 33 | so many patterns that a contiguous NFA could not be built. (Currently, because 34 | of both memory and search speed improvements, a contiguous NFA has a smaller 35 | internal limit on the total number of NFA states it can represent. But you 36 | would likely need to have hundreds of thousands or even millions of patterns 37 | before you hit this limit.) 38 | */ 39 | pub mod contiguous; 40 | pub mod noncontiguous; 41 | -------------------------------------------------------------------------------- /src/packed/ext.rs: -------------------------------------------------------------------------------- 1 | /// A trait for adding some helper routines to pointers. 2 | pub(crate) trait Pointer { 3 | /// Returns the distance, in units of `T`, between `self` and `origin`. 4 | /// 5 | /// # Safety 6 | /// 7 | /// Same as `ptr::offset_from` in addition to `self >= origin`. 8 | unsafe fn distance(self, origin: Self) -> usize; 9 | 10 | /// Casts this pointer to `usize`. 11 | /// 12 | /// Callers should not convert the `usize` back to a pointer if at all 13 | /// possible. (And if you believe it's necessary, open an issue to discuss 14 | /// why. Otherwise, it has the potential to violate pointer provenance.) 15 | /// The purpose of this function is just to be able to do arithmetic, i.e., 16 | /// computing offsets or alignments. 
17 | fn as_usize(self) -> usize; 18 | } 19 | 20 | impl Pointer for *const T { 21 | unsafe fn distance(self, origin: *const T) -> usize { 22 | // TODO: Replace with `ptr::sub_ptr` once stabilized. 23 | usize::try_from(self.offset_from(origin)).unwrap_unchecked() 24 | } 25 | 26 | fn as_usize(self) -> usize { 27 | self as usize 28 | } 29 | } 30 | 31 | impl Pointer for *mut T { 32 | unsafe fn distance(self, origin: *mut T) -> usize { 33 | (self as *const T).distance(origin as *const T) 34 | } 35 | 36 | fn as_usize(self) -> usize { 37 | (self as *const T).as_usize() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/packed/mod.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Provides packed multiple substring search, principally for a small number of 3 | patterns. 4 | 5 | This sub-module provides vectorized routines for quickly finding 6 | matches of a small number of patterns. In general, users of this crate 7 | shouldn't need to interface with this module directly, as the primary 8 | [`AhoCorasick`](crate::AhoCorasick) searcher will use these routines 9 | automatically as a prefilter when applicable. However, in some cases, callers 10 | may want to bypass the Aho-Corasick machinery entirely and use this vectorized 11 | searcher directly. 12 | 13 | # Overview 14 | 15 | The primary types in this sub-module are: 16 | 17 | * [`Searcher`] executes the actual search algorithm to report matches in a 18 | haystack. 19 | * [`Builder`] accumulates patterns incrementally and can construct a 20 | `Searcher`. 21 | * [`Config`] permits tuning the searcher, and itself will produce a `Builder` 22 | (which can then be used to build a `Searcher`). Currently, the only tuneable 23 | knob are the match semantics, but this may be expanded in the future. 24 | 25 | # Examples 26 | 27 | This example shows how to create a searcher from an iterator of patterns. 
28 | By default, leftmost-first match semantics are used. (See the top-level 29 | [`MatchKind`] type for more details about match semantics, which apply 30 | similarly to packed substring search.) 31 | 32 | ``` 33 | use aho_corasick::{packed::{MatchKind, Searcher}, PatternID}; 34 | 35 | # fn example() -> Option<()> { 36 | let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 37 | let matches: Vec = searcher 38 | .find_iter("foobar") 39 | .map(|mat| mat.pattern()) 40 | .collect(); 41 | assert_eq!(vec![PatternID::ZERO], matches); 42 | # Some(()) } 43 | # if cfg!(all(feature = "std", any( 44 | # target_arch = "x86_64", target_arch = "aarch64", 45 | # ))) { 46 | # example().unwrap() 47 | # } else { 48 | # assert!(example().is_none()); 49 | # } 50 | ``` 51 | 52 | This example shows how to use [`Config`] to change the match semantics to 53 | leftmost-longest: 54 | 55 | ``` 56 | use aho_corasick::{packed::{Config, MatchKind}, PatternID}; 57 | 58 | # fn example() -> Option<()> { 59 | let searcher = Config::new() 60 | .match_kind(MatchKind::LeftmostLongest) 61 | .builder() 62 | .add("foo") 63 | .add("foobar") 64 | .build()?; 65 | let matches: Vec = searcher 66 | .find_iter("foobar") 67 | .map(|mat| mat.pattern()) 68 | .collect(); 69 | assert_eq!(vec![PatternID::must(1)], matches); 70 | # Some(()) } 71 | # if cfg!(all(feature = "std", any( 72 | # target_arch = "x86_64", target_arch = "aarch64", 73 | # ))) { 74 | # example().unwrap() 75 | # } else { 76 | # assert!(example().is_none()); 77 | # } 78 | ``` 79 | 80 | # Packed substring searching 81 | 82 | Packed substring searching refers to the use of SIMD (Single Instruction, 83 | Multiple Data) to accelerate the detection of matches in a haystack. 
Unlike 84 | conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring 85 | search tend to do better with a small number of patterns, where as Aho-Corasick 86 | generally maintains reasonably consistent performance regardless of the number 87 | of patterns you give it. Because of this, the vectorized searcher in this 88 | sub-module cannot be used as a general purpose searcher, since building the 89 | searcher may fail even when given a small number of patterns. However, in 90 | exchange, when searching for a small number of patterns, searching can be quite 91 | a bit faster than Aho-Corasick (sometimes by an order of magnitude). 92 | 93 | The key take away here is that constructing a searcher from a list of patterns 94 | is a fallible operation with no clear rules for when it will fail. While the 95 | precise conditions under which building a searcher can fail is specifically an 96 | implementation detail, here are some common reasons: 97 | 98 | * Too many patterns were given. Typically, the limit is on the order of 100 or 99 | so, but this limit may fluctuate based on available CPU features. 100 | * The available packed algorithms require CPU features that aren't available. 101 | For example, currently, this crate only provides packed algorithms for 102 | `x86_64` and `aarch64`. Therefore, constructing a packed searcher on any 103 | other target will always fail. 104 | * Zero patterns were given, or one of the patterns given was empty. Packed 105 | searchers require at least one pattern and that all patterns are non-empty. 106 | * Something else about the nature of the patterns (typically based on 107 | heuristics) suggests that a packed searcher would perform very poorly, so 108 | no searcher is built. 
109 | */ 110 | 111 | pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher}; 112 | 113 | mod api; 114 | mod ext; 115 | mod pattern; 116 | mod rabinkarp; 117 | mod teddy; 118 | #[cfg(all(feature = "std", test))] 119 | mod tests; 120 | mod vector; 121 | -------------------------------------------------------------------------------- /src/packed/rabinkarp.rs: -------------------------------------------------------------------------------- 1 | use alloc::{sync::Arc, vec, vec::Vec}; 2 | 3 | use crate::{packed::pattern::Patterns, util::search::Match, PatternID}; 4 | 5 | /// The type of the rolling hash used in the Rabin-Karp algorithm. 6 | type Hash = usize; 7 | 8 | /// The number of buckets to store our patterns in. We don't want this to be 9 | /// too big in order to avoid wasting memory, but we don't want it to be too 10 | /// small either to avoid spending too much time confirming literals. 11 | /// 12 | /// The number of buckets MUST be a power of two. Otherwise, determining the 13 | /// bucket from a hash will slow down the code considerably. Using a power 14 | /// of two means `hash % NUM_BUCKETS` can compile down to a simple `and` 15 | /// instruction. 16 | const NUM_BUCKETS: usize = 64; 17 | 18 | /// An implementation of the Rabin-Karp algorithm. The main idea of this 19 | /// algorithm is to maintain a rolling hash as it moves through the input, and 20 | /// then check whether that hash corresponds to the same hash for any of the 21 | /// patterns we're looking for. 22 | /// 23 | /// A draw back of naively scaling Rabin-Karp to multiple patterns is that 24 | /// it requires all of the patterns to be the same length, which in turn 25 | /// corresponds to the number of bytes to hash. We adapt this to work for 26 | /// multiple patterns of varying size by fixing the number of bytes to hash 27 | /// to be the length of the smallest pattern. We also split the patterns into 28 | /// several buckets to hopefully make the confirmation step faster. 
29 | /// 30 | /// Wikipedia has a decent explanation, if a bit heavy on the theory: 31 | /// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm 32 | /// 33 | /// But ESMAJ provides something a bit more concrete: 34 | /// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html 35 | #[derive(Clone, Debug)] 36 | pub(crate) struct RabinKarp { 37 | /// The patterns we're searching for. 38 | patterns: Arc, 39 | /// The order of patterns in each bucket is significant. Namely, they are 40 | /// arranged such that the first one to match is the correct match. This 41 | /// may not necessarily correspond to the order provided by the caller. 42 | /// For example, if leftmost-longest semantics are used, then the patterns 43 | /// are sorted by their length in descending order. If leftmost-first 44 | /// semantics are used, then the patterns are sorted by their pattern ID 45 | /// in ascending order (which corresponds to the caller's order). 46 | buckets: Vec>, 47 | /// The length of the hashing window. Generally, this corresponds to the 48 | /// length of the smallest pattern. 49 | hash_len: usize, 50 | /// The factor to subtract out of a hash before updating it with a new 51 | /// byte. 52 | hash_2pow: usize, 53 | } 54 | 55 | impl RabinKarp { 56 | /// Compile a new Rabin-Karp matcher from the patterns given. 57 | /// 58 | /// This panics if any of the patterns in the collection are empty, or if 59 | /// the collection is itself empty. 
60 | pub(crate) fn new(patterns: &Arc) -> RabinKarp { 61 | assert!(patterns.len() >= 1); 62 | let hash_len = patterns.minimum_len(); 63 | assert!(hash_len >= 1); 64 | 65 | let mut hash_2pow = 1usize; 66 | for _ in 1..hash_len { 67 | hash_2pow = hash_2pow.wrapping_shl(1); 68 | } 69 | 70 | let mut rk = RabinKarp { 71 | patterns: Arc::clone(patterns), 72 | buckets: vec![vec![]; NUM_BUCKETS], 73 | hash_len, 74 | hash_2pow, 75 | }; 76 | for (id, pat) in patterns.iter() { 77 | let hash = rk.hash(&pat.bytes()[..rk.hash_len]); 78 | let bucket = hash % NUM_BUCKETS; 79 | rk.buckets[bucket].push((hash, id)); 80 | } 81 | rk 82 | } 83 | 84 | /// Return the first matching pattern in the given haystack, begining the 85 | /// search at `at`. 86 | pub(crate) fn find_at( 87 | &self, 88 | haystack: &[u8], 89 | mut at: usize, 90 | ) -> Option { 91 | assert_eq!(NUM_BUCKETS, self.buckets.len()); 92 | 93 | if at + self.hash_len > haystack.len() { 94 | return None; 95 | } 96 | let mut hash = self.hash(&haystack[at..at + self.hash_len]); 97 | loop { 98 | let bucket = &self.buckets[hash % NUM_BUCKETS]; 99 | for &(phash, pid) in bucket { 100 | if phash == hash { 101 | if let Some(c) = self.verify(pid, haystack, at) { 102 | return Some(c); 103 | } 104 | } 105 | } 106 | if at + self.hash_len >= haystack.len() { 107 | return None; 108 | } 109 | hash = self.update_hash( 110 | hash, 111 | haystack[at], 112 | haystack[at + self.hash_len], 113 | ); 114 | at += 1; 115 | } 116 | } 117 | 118 | /// Returns the approximate total amount of heap used by this searcher, in 119 | /// units of bytes. 120 | pub(crate) fn memory_usage(&self) -> usize { 121 | self.buckets.len() * core::mem::size_of::>() 122 | + self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>() 123 | } 124 | 125 | /// Verify whether the pattern with the given id matches at 126 | /// `haystack[at..]`. 127 | /// 128 | /// We tag this function as `cold` because it helps improve codegen. 
129 | /// Intuitively, it would seem like inlining it would be better. However, 130 | /// the only time this is called and a match is not found is when there 131 | /// there is a hash collision, or when a prefix of a pattern matches but 132 | /// the entire pattern doesn't match. This is hopefully fairly rare, and 133 | /// if it does occur a lot, it's going to be slow no matter what we do. 134 | #[cold] 135 | fn verify( 136 | &self, 137 | id: PatternID, 138 | haystack: &[u8], 139 | at: usize, 140 | ) -> Option { 141 | let pat = self.patterns.get(id); 142 | if pat.is_prefix(&haystack[at..]) { 143 | Some(Match::new(id, at..at + pat.len())) 144 | } else { 145 | None 146 | } 147 | } 148 | 149 | /// Hash the given bytes. 150 | fn hash(&self, bytes: &[u8]) -> Hash { 151 | assert_eq!(self.hash_len, bytes.len()); 152 | 153 | let mut hash = 0usize; 154 | for &b in bytes { 155 | hash = hash.wrapping_shl(1).wrapping_add(b as usize); 156 | } 157 | hash 158 | } 159 | 160 | /// Update the hash given based on removing `old_byte` at the beginning 161 | /// of some byte string, and appending `new_byte` to the end of that same 162 | /// byte string. 163 | fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash { 164 | prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow)) 165 | .wrapping_shl(1) 166 | .wrapping_add(new_byte as usize) 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/packed/teddy/mod.rs: -------------------------------------------------------------------------------- 1 | // Regrettable, but Teddy stuff just isn't used on all targets. And for some 2 | // targets, like aarch64, only "slim" Teddy is used and so "fat" Teddy gets a 3 | // bunch of dead-code warnings. Just not worth trying to squash them. Blech. 
4 | #![allow(dead_code)] 5 | 6 | pub(crate) use self::builder::{Builder, Searcher}; 7 | 8 | mod builder; 9 | mod generic; 10 | -------------------------------------------------------------------------------- /src/transducer.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Provides implementations of `fst::Automaton` for Aho-Corasick automata. 3 | 4 | This works by providing two wrapper types, [`Anchored`] and [`Unanchored`]. 5 | The former executes an anchored search on an FST while the latter executes 6 | an unanchored search. Building these wrappers is fallible and will fail if 7 | the underlying Aho-Corasick automaton does not support the type of search it 8 | represents. 9 | */ 10 | 11 | use crate::{ 12 | automaton::{Automaton, StateID}, 13 | Anchored as AcAnchored, Input, MatchError, 14 | }; 15 | 16 | /// Represents an unanchored Aho-Corasick search of a finite state transducer. 17 | /// 18 | /// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the 19 | /// underlying automaton does not support unanchored searches. 20 | /// 21 | /// # Example 22 | /// 23 | /// This shows how to build an FST of keys and then run an unanchored search on 24 | /// those keys using an Aho-Corasick automaton. 25 | /// 26 | /// ``` 27 | /// use aho_corasick::{nfa::contiguous::NFA, transducer::Unanchored}; 28 | /// use fst::{Automaton, IntoStreamer, Set, Streamer}; 29 | /// 30 | /// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap(); 31 | /// let nfa = NFA::new(&["bcd", "x"]).unwrap(); 32 | /// // NFAs always support both unanchored and anchored searches. 
33 | /// let searcher = Unanchored::new(&nfa).unwrap(); 34 | /// 35 | /// let mut stream = set.search(searcher).into_stream(); 36 | /// let mut results = vec![]; 37 | /// while let Some(key) = stream.next() { 38 | /// results.push(std::str::from_utf8(key).unwrap().to_string()); 39 | /// } 40 | /// assert_eq!(vec!["abcd", "bcd", "xyz"], results); 41 | /// ``` 42 | #[derive(Clone, Debug)] 43 | pub struct Unanchored(A); 44 | 45 | impl Unanchored { 46 | /// Create a new `Unanchored` implementation of the `fst::Automaton` trait. 47 | /// 48 | /// If the given Aho-Corasick automaton does not support unanchored 49 | /// searches, then this returns an error. 50 | pub fn new(aut: A) -> Result, MatchError> { 51 | let input = Input::new("").anchored(AcAnchored::No); 52 | let _ = aut.start_state(&input)?; 53 | Ok(Unanchored(aut)) 54 | } 55 | 56 | /// Returns a borrow to the underlying automaton. 57 | pub fn as_ref(&self) -> &A { 58 | &self.0 59 | } 60 | 61 | /// Unwrap this value and return the inner automaton. 62 | pub fn into_inner(self) -> A { 63 | self.0 64 | } 65 | } 66 | 67 | impl fst::Automaton for Unanchored { 68 | type State = StateID; 69 | 70 | #[inline] 71 | fn start(&self) -> StateID { 72 | let input = Input::new("").anchored(AcAnchored::No); 73 | self.0.start_state(&input).expect("support for unanchored searches") 74 | } 75 | 76 | #[inline] 77 | fn is_match(&self, state: &StateID) -> bool { 78 | self.0.is_match(*state) 79 | } 80 | 81 | #[inline] 82 | fn accept(&self, state: &StateID, byte: u8) -> StateID { 83 | if fst::Automaton::is_match(self, state) { 84 | return *state; 85 | } 86 | self.0.next_state(AcAnchored::No, *state, byte) 87 | } 88 | 89 | #[inline] 90 | fn can_match(&self, state: &StateID) -> bool { 91 | !self.0.is_dead(*state) 92 | } 93 | } 94 | 95 | /// Represents an anchored Aho-Corasick search of a finite state transducer. 
96 | /// 97 | /// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the 98 | /// underlying automaton does not support unanchored searches. 99 | /// 100 | /// # Example 101 | /// 102 | /// This shows how to build an FST of keys and then run an anchored search on 103 | /// those keys using an Aho-Corasick automaton. 104 | /// 105 | /// ``` 106 | /// use aho_corasick::{nfa::contiguous::NFA, transducer::Anchored}; 107 | /// use fst::{Automaton, IntoStreamer, Set, Streamer}; 108 | /// 109 | /// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap(); 110 | /// let nfa = NFA::new(&["bcd", "x"]).unwrap(); 111 | /// // NFAs always support both unanchored and anchored searches. 112 | /// let searcher = Anchored::new(&nfa).unwrap(); 113 | /// 114 | /// let mut stream = set.search(searcher).into_stream(); 115 | /// let mut results = vec![]; 116 | /// while let Some(key) = stream.next() { 117 | /// results.push(std::str::from_utf8(key).unwrap().to_string()); 118 | /// } 119 | /// assert_eq!(vec!["bcd", "xyz"], results); 120 | /// ``` 121 | /// 122 | /// This is like the example above, except we use an Aho-Corasick DFA, which 123 | /// requires explicitly configuring it to support anchored searches. (NFAs 124 | /// unconditionally support both unanchored and anchored searches.) 125 | /// 126 | /// ``` 127 | /// use aho_corasick::{dfa::DFA, transducer::Anchored, StartKind}; 128 | /// use fst::{Automaton, IntoStreamer, Set, Streamer}; 129 | /// 130 | /// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap(); 131 | /// let dfa = DFA::builder() 132 | /// .start_kind(StartKind::Anchored) 133 | /// .build(&["bcd", "x"]) 134 | /// .unwrap(); 135 | /// // We've explicitly configured our DFA to support anchored searches. 
136 | /// let searcher = Anchored::new(&dfa).unwrap(); 137 | /// 138 | /// let mut stream = set.search(searcher).into_stream(); 139 | /// let mut results = vec![]; 140 | /// while let Some(key) = stream.next() { 141 | /// results.push(std::str::from_utf8(key).unwrap().to_string()); 142 | /// } 143 | /// assert_eq!(vec!["bcd", "xyz"], results); 144 | /// ``` 145 | #[derive(Clone, Debug)] 146 | pub struct Anchored(A); 147 | 148 | impl Anchored { 149 | /// Create a new `Anchored` implementation of the `fst::Automaton` trait. 150 | /// 151 | /// If the given Aho-Corasick automaton does not support anchored searches, 152 | /// then this returns an error. 153 | pub fn new(aut: A) -> Result, MatchError> { 154 | let input = Input::new("").anchored(AcAnchored::Yes); 155 | let _ = aut.start_state(&input)?; 156 | Ok(Anchored(aut)) 157 | } 158 | 159 | /// Returns a borrow to the underlying automaton. 160 | pub fn as_ref(&self) -> &A { 161 | &self.0 162 | } 163 | 164 | /// Unwrap this value and return the inner automaton. 
165 | pub fn into_inner(self) -> A { 166 | self.0 167 | } 168 | } 169 | 170 | impl fst::Automaton for Anchored { 171 | type State = StateID; 172 | 173 | #[inline] 174 | fn start(&self) -> StateID { 175 | let input = Input::new("").anchored(AcAnchored::Yes); 176 | self.0.start_state(&input).expect("support for unanchored searches") 177 | } 178 | 179 | #[inline] 180 | fn is_match(&self, state: &StateID) -> bool { 181 | self.0.is_match(*state) 182 | } 183 | 184 | #[inline] 185 | fn accept(&self, state: &StateID, byte: u8) -> StateID { 186 | if fst::Automaton::is_match(self, state) { 187 | return *state; 188 | } 189 | self.0.next_state(AcAnchored::Yes, *state, byte) 190 | } 191 | 192 | #[inline] 193 | fn can_match(&self, state: &StateID) -> bool { 194 | !self.0.is_dead(*state) 195 | } 196 | } 197 | 198 | #[cfg(test)] 199 | mod tests { 200 | use alloc::{string::String, vec, vec::Vec}; 201 | 202 | use fst::{Automaton, IntoStreamer, Set, Streamer}; 203 | 204 | use crate::{ 205 | dfa::DFA, 206 | nfa::{contiguous, noncontiguous}, 207 | StartKind, 208 | }; 209 | 210 | use super::*; 211 | 212 | fn search>( 213 | set: &Set, 214 | aut: A, 215 | ) -> Vec { 216 | let mut stream = set.search(aut).into_stream(); 217 | let mut results = vec![]; 218 | while let Some(key) = stream.next() { 219 | results.push(String::from(core::str::from_utf8(key).unwrap())); 220 | } 221 | results 222 | } 223 | 224 | #[test] 225 | fn unanchored() { 226 | let set = 227 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 228 | .unwrap(); 229 | let patterns = vec!["baz", "bax"]; 230 | let expected = vec!["baz", "xbax"]; 231 | 232 | let aut = Unanchored(noncontiguous::NFA::new(&patterns).unwrap()); 233 | let got = search(&set, &aut); 234 | assert_eq!(got, expected); 235 | 236 | let aut = Unanchored(contiguous::NFA::new(&patterns).unwrap()); 237 | let got = search(&set, &aut); 238 | assert_eq!(got, expected); 239 | 240 | let aut = Unanchored(DFA::new(&patterns).unwrap()); 241 | let got = 
search(&set, &aut); 242 | assert_eq!(got, expected); 243 | } 244 | 245 | #[test] 246 | fn anchored() { 247 | let set = 248 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 249 | .unwrap(); 250 | let patterns = vec!["baz", "bax"]; 251 | let expected = vec!["baz"]; 252 | 253 | let aut = Anchored(noncontiguous::NFA::new(&patterns).unwrap()); 254 | let got = search(&set, &aut); 255 | assert_eq!(got, expected); 256 | 257 | let aut = Anchored(contiguous::NFA::new(&patterns).unwrap()); 258 | let got = search(&set, &aut); 259 | assert_eq!(got, expected); 260 | 261 | let aut = Anchored( 262 | DFA::builder() 263 | .start_kind(StartKind::Anchored) 264 | .build(&patterns) 265 | .unwrap(), 266 | ); 267 | let got = search(&set, &aut); 268 | assert_eq!(got, expected); 269 | } 270 | } 271 | -------------------------------------------------------------------------------- /src/util/buffer.rs: -------------------------------------------------------------------------------- 1 | use alloc::{vec, vec::Vec}; 2 | 3 | /// The default buffer capacity that we use for the stream buffer. 4 | const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB 5 | 6 | /// A fairly simple roll buffer for supporting stream searches. 7 | /// 8 | /// This buffer acts as a temporary place to store a fixed amount of data when 9 | /// reading from a stream. Its central purpose is to allow "rolling" some 10 | /// suffix of the data to the beginning of the buffer before refilling it with 11 | /// more data from the stream. For example, let's say we are trying to match 12 | /// "foobar" on a stream. When we report the match, we'd like to not only 13 | /// report the correct offsets at which the match occurs, but also the matching 14 | /// bytes themselves. So let's say our stream is a file with the following 15 | /// contents: `test test foobar test test`. Now assume that we happen to read 16 | /// the aforementioned file in two chunks: `test test foo` and `bar test test`. 
/// Naively, it would not be possible to report a single contiguous `foobar`
/// match, but this roll buffer allows us to do that. Namely, after the second
/// read, the contents of the buffer should be `st foobar test test`, where the
/// search should ultimately resume immediately after `foo`. (The prefix `st `
/// is included because the roll buffer saves N bytes at the end of the buffer,
/// where N is the maximum possible length of a match.)
///
/// A lot of the logic for dealing with this is unfortunately split out between
/// this roll buffer and the `StreamChunkIter`.
///
/// Note also that this buffer is not actually required to just report matches.
/// Because a `Match` is just some offsets. But it *is* required for supporting
/// things like `try_stream_replace_all` because that needs some mechanism for
/// knowing which bytes in the stream correspond to a match and which don't. So
/// when a match occurs across two `read` calls, *something* needs to retain
/// the bytes from the previous `read` call because you don't know before the
/// second read call whether a match exists or not.
#[derive(Debug)]
pub(crate) struct Buffer {
    /// The raw buffer contents. This has a fixed size and never increases.
    buf: Vec<u8>,
    /// The minimum size of the buffer, which is equivalent to the maximum
    /// possible length of a match. This corresponds to the amount that we
    /// roll.
    min: usize,
    /// The end of the contents of this buffer.
    end: usize,
}

impl Buffer {
    /// Create a new buffer for stream searching. The minimum buffer length
    /// given should be the size of the maximum possible match length.
    pub(crate) fn new(min_buffer_len: usize) -> Buffer {
        let min = core::cmp::max(1, min_buffer_len);
        // The minimum buffer amount is also the amount that we roll our
        // buffer in order to support incremental searching. To this end,
        // our actual capacity needs to be at least 1 byte bigger than our
        // minimum amount, otherwise we won't have any overlap. In actuality,
        // we want our buffer to be a bit bigger than that for performance
        // reasons, so we set a lower bound of `8 * min`.
        //
        // TODO: It would be good to find a way to test the streaming
        // implementation with the minimal buffer size. For now, we just
        // uncomment out the next line and comment out the subsequent line.
        // let capacity = 1 + min;
        let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
        Buffer { buf: vec![0; capacity], min, end: 0 }
    }

    /// Return the contents of this buffer.
    #[inline]
    pub(crate) fn buffer(&self) -> &[u8] {
        &self.buf[..self.end]
    }

    /// Return the minimum size of the buffer. The only way a buffer may be
    /// smaller than this is if the stream itself contains less than the
    /// minimum buffer amount.
    #[inline]
    pub(crate) fn min_buffer_len(&self) -> usize {
        self.min
    }

    /// Return all free capacity in this buffer.
    fn free_buffer(&mut self) -> &mut [u8] {
        &mut self.buf[self.end..]
    }

    /// Refill the contents of this buffer by reading as much as possible into
    /// this buffer's free capacity. If no more bytes could be read, then this
    /// returns false. Otherwise, this reads until it has filled the buffer
    /// past the minimum amount.
89 | pub(crate) fn fill( 90 | &mut self, 91 | mut rdr: R, 92 | ) -> std::io::Result { 93 | let mut readany = false; 94 | loop { 95 | let readlen = rdr.read(self.free_buffer())?; 96 | if readlen == 0 { 97 | return Ok(readany); 98 | } 99 | readany = true; 100 | self.end += readlen; 101 | if self.buffer().len() >= self.min { 102 | return Ok(true); 103 | } 104 | } 105 | } 106 | 107 | /// Roll the contents of the buffer so that the suffix of this buffer is 108 | /// moved to the front and all other contents are dropped. The size of the 109 | /// suffix corresponds precisely to the minimum buffer length. 110 | /// 111 | /// This should only be called when the entire contents of this buffer have 112 | /// been searched. 113 | pub(crate) fn roll(&mut self) { 114 | let roll_start = self 115 | .end 116 | .checked_sub(self.min) 117 | .expect("buffer capacity should be bigger than minimum amount"); 118 | let roll_end = roll_start + self.min; 119 | 120 | assert!(roll_end <= self.end); 121 | self.buf.copy_within(roll_start..roll_end, 0); 122 | self.end = self.min; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/util/byte_frequencies.rs: -------------------------------------------------------------------------------- 1 | pub const BYTE_FREQUENCIES: [u8; 256] = [ 2 | 55, // '\x00' 3 | 52, // '\x01' 4 | 51, // '\x02' 5 | 50, // '\x03' 6 | 49, // '\x04' 7 | 48, // '\x05' 8 | 47, // '\x06' 9 | 46, // '\x07' 10 | 45, // '\x08' 11 | 103, // '\t' 12 | 242, // '\n' 13 | 66, // '\x0b' 14 | 67, // '\x0c' 15 | 229, // '\r' 16 | 44, // '\x0e' 17 | 43, // '\x0f' 18 | 42, // '\x10' 19 | 41, // '\x11' 20 | 40, // '\x12' 21 | 39, // '\x13' 22 | 38, // '\x14' 23 | 37, // '\x15' 24 | 36, // '\x16' 25 | 35, // '\x17' 26 | 34, // '\x18' 27 | 33, // '\x19' 28 | 56, // '\x1a' 29 | 32, // '\x1b' 30 | 31, // '\x1c' 31 | 30, // '\x1d' 32 | 29, // '\x1e' 33 | 28, // '\x1f' 34 | 255, // ' ' 35 | 148, // '!' 
36 | 164, // '"' 37 | 149, // '#' 38 | 136, // '$' 39 | 160, // '%' 40 | 155, // '&' 41 | 173, // "'" 42 | 221, // '(' 43 | 222, // ')' 44 | 134, // '*' 45 | 122, // '+' 46 | 232, // ',' 47 | 202, // '-' 48 | 215, // '.' 49 | 224, // '/' 50 | 208, // '0' 51 | 220, // '1' 52 | 204, // '2' 53 | 187, // '3' 54 | 183, // '4' 55 | 179, // '5' 56 | 177, // '6' 57 | 168, // '7' 58 | 178, // '8' 59 | 200, // '9' 60 | 226, // ':' 61 | 195, // ';' 62 | 154, // '<' 63 | 184, // '=' 64 | 174, // '>' 65 | 126, // '?' 66 | 120, // '@' 67 | 191, // 'A' 68 | 157, // 'B' 69 | 194, // 'C' 70 | 170, // 'D' 71 | 189, // 'E' 72 | 162, // 'F' 73 | 161, // 'G' 74 | 150, // 'H' 75 | 193, // 'I' 76 | 142, // 'J' 77 | 137, // 'K' 78 | 171, // 'L' 79 | 176, // 'M' 80 | 185, // 'N' 81 | 167, // 'O' 82 | 186, // 'P' 83 | 112, // 'Q' 84 | 175, // 'R' 85 | 192, // 'S' 86 | 188, // 'T' 87 | 156, // 'U' 88 | 140, // 'V' 89 | 143, // 'W' 90 | 123, // 'X' 91 | 133, // 'Y' 92 | 128, // 'Z' 93 | 147, // '[' 94 | 138, // '\\' 95 | 146, // ']' 96 | 114, // '^' 97 | 223, // '_' 98 | 151, // '`' 99 | 249, // 'a' 100 | 216, // 'b' 101 | 238, // 'c' 102 | 236, // 'd' 103 | 253, // 'e' 104 | 227, // 'f' 105 | 218, // 'g' 106 | 230, // 'h' 107 | 247, // 'i' 108 | 135, // 'j' 109 | 180, // 'k' 110 | 241, // 'l' 111 | 233, // 'm' 112 | 246, // 'n' 113 | 244, // 'o' 114 | 231, // 'p' 115 | 139, // 'q' 116 | 245, // 'r' 117 | 243, // 's' 118 | 251, // 't' 119 | 235, // 'u' 120 | 201, // 'v' 121 | 196, // 'w' 122 | 240, // 'x' 123 | 214, // 'y' 124 | 152, // 'z' 125 | 182, // '{' 126 | 205, // '|' 127 | 181, // '}' 128 | 127, // '~' 129 | 27, // '\x7f' 130 | 212, // '\x80' 131 | 211, // '\x81' 132 | 210, // '\x82' 133 | 213, // '\x83' 134 | 228, // '\x84' 135 | 197, // '\x85' 136 | 169, // '\x86' 137 | 159, // '\x87' 138 | 131, // '\x88' 139 | 172, // '\x89' 140 | 105, // '\x8a' 141 | 80, // '\x8b' 142 | 98, // '\x8c' 143 | 96, // '\x8d' 144 | 97, // '\x8e' 145 | 81, // '\x8f' 146 | 207, // '\x90' 147 | 145, // 
'\x91' 148 | 116, // '\x92' 149 | 115, // '\x93' 150 | 144, // '\x94' 151 | 130, // '\x95' 152 | 153, // '\x96' 153 | 121, // '\x97' 154 | 107, // '\x98' 155 | 132, // '\x99' 156 | 109, // '\x9a' 157 | 110, // '\x9b' 158 | 124, // '\x9c' 159 | 111, // '\x9d' 160 | 82, // '\x9e' 161 | 108, // '\x9f' 162 | 118, // '\xa0' 163 | 141, // '¡' 164 | 113, // '¢' 165 | 129, // '£' 166 | 119, // '¤' 167 | 125, // '¥' 168 | 165, // '¦' 169 | 117, // '§' 170 | 92, // '¨' 171 | 106, // '©' 172 | 83, // 'ª' 173 | 72, // '«' 174 | 99, // '¬' 175 | 93, // '\xad' 176 | 65, // '®' 177 | 79, // '¯' 178 | 166, // '°' 179 | 237, // '±' 180 | 163, // '²' 181 | 199, // '³' 182 | 190, // '´' 183 | 225, // 'µ' 184 | 209, // '¶' 185 | 203, // '·' 186 | 198, // '¸' 187 | 217, // '¹' 188 | 219, // 'º' 189 | 206, // '»' 190 | 234, // '¼' 191 | 248, // '½' 192 | 158, // '¾' 193 | 239, // '¿' 194 | 255, // 'À' 195 | 255, // 'Á' 196 | 255, // 'Â' 197 | 255, // 'Ã' 198 | 255, // 'Ä' 199 | 255, // 'Å' 200 | 255, // 'Æ' 201 | 255, // 'Ç' 202 | 255, // 'È' 203 | 255, // 'É' 204 | 255, // 'Ê' 205 | 255, // 'Ë' 206 | 255, // 'Ì' 207 | 255, // 'Í' 208 | 255, // 'Î' 209 | 255, // 'Ï' 210 | 255, // 'Ð' 211 | 255, // 'Ñ' 212 | 255, // 'Ò' 213 | 255, // 'Ó' 214 | 255, // 'Ô' 215 | 255, // 'Õ' 216 | 255, // 'Ö' 217 | 255, // '×' 218 | 255, // 'Ø' 219 | 255, // 'Ù' 220 | 255, // 'Ú' 221 | 255, // 'Û' 222 | 255, // 'Ü' 223 | 255, // 'Ý' 224 | 255, // 'Þ' 225 | 255, // 'ß' 226 | 255, // 'à' 227 | 255, // 'á' 228 | 255, // 'â' 229 | 255, // 'ã' 230 | 255, // 'ä' 231 | 255, // 'å' 232 | 255, // 'æ' 233 | 255, // 'ç' 234 | 255, // 'è' 235 | 255, // 'é' 236 | 255, // 'ê' 237 | 255, // 'ë' 238 | 255, // 'ì' 239 | 255, // 'í' 240 | 255, // 'î' 241 | 255, // 'ï' 242 | 255, // 'ð' 243 | 255, // 'ñ' 244 | 255, // 'ò' 245 | 255, // 'ó' 246 | 255, // 'ô' 247 | 255, // 'õ' 248 | 255, // 'ö' 249 | 255, // '÷' 250 | 255, // 'ø' 251 | 255, // 'ù' 252 | 255, // 'ú' 253 | 255, // 'û' 254 | 255, // 'ü' 255 | 255, // 'ý' 256 | 
255, // 'þ' 257 | 255, // 'ÿ' 258 | ]; 259 | -------------------------------------------------------------------------------- /src/util/debug.rs: -------------------------------------------------------------------------------- 1 | /// A type that wraps a single byte with a convenient fmt::Debug impl that 2 | /// escapes the byte. 3 | pub(crate) struct DebugByte(pub(crate) u8); 4 | 5 | impl core::fmt::Debug for DebugByte { 6 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 7 | // Special case ASCII space. It's too hard to read otherwise, so 8 | // put quotes around it. I sometimes wonder whether just '\x20' would 9 | // be better... 10 | if self.0 == b' ' { 11 | return write!(f, "' '"); 12 | } 13 | // 10 bytes is enough to cover any output from ascii::escape_default. 14 | let mut bytes = [0u8; 10]; 15 | let mut len = 0; 16 | for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { 17 | // capitalize \xab to \xAB 18 | if i >= 2 && b'a' <= b && b <= b'f' { 19 | b -= 32; 20 | } 21 | bytes[len] = b; 22 | len += 1; 23 | } 24 | write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/util/error.rs: -------------------------------------------------------------------------------- 1 | use crate::util::{ 2 | primitives::{PatternID, SmallIndex}, 3 | search::MatchKind, 4 | }; 5 | 6 | /// An error that occurred during the construction of an Aho-Corasick 7 | /// automaton. 8 | /// 9 | /// Build errors occur when some kind of limit has been exceeded, either in the 10 | /// number of states, the number of patterns of the length of a pattern. These 11 | /// limits aren't part of the public API, but they should generally be large 12 | /// enough to handle most use cases. 13 | /// 14 | /// When the `std` feature is enabled, this implements the `std::error::Error` 15 | /// trait. 
16 | #[derive(Clone, Debug)] 17 | pub struct BuildError { 18 | kind: ErrorKind, 19 | } 20 | 21 | /// The kind of error that occurred. 22 | #[derive(Clone, Debug)] 23 | enum ErrorKind { 24 | /// An error that occurs when allocating a new state would result in an 25 | /// identifier that exceeds the capacity of a `StateID`. 26 | StateIDOverflow { 27 | /// The maximum possible id. 28 | max: u64, 29 | /// The maximum ID requested. 30 | requested_max: u64, 31 | }, 32 | /// An error that occurs when adding a pattern to an Aho-Corasick 33 | /// automaton would result in an identifier that exceeds the capacity of a 34 | /// `PatternID`. 35 | PatternIDOverflow { 36 | /// The maximum possible id. 37 | max: u64, 38 | /// The maximum ID requested. 39 | requested_max: u64, 40 | }, 41 | /// Occurs when a pattern string is given to the Aho-Corasick constructor 42 | /// that is too long. 43 | PatternTooLong { 44 | /// The ID of the pattern that was too long. 45 | pattern: PatternID, 46 | /// The length that was too long. 
47 | len: usize, 48 | }, 49 | } 50 | 51 | impl BuildError { 52 | pub(crate) fn state_id_overflow( 53 | max: u64, 54 | requested_max: u64, 55 | ) -> BuildError { 56 | BuildError { kind: ErrorKind::StateIDOverflow { max, requested_max } } 57 | } 58 | 59 | pub(crate) fn pattern_id_overflow( 60 | max: u64, 61 | requested_max: u64, 62 | ) -> BuildError { 63 | BuildError { 64 | kind: ErrorKind::PatternIDOverflow { max, requested_max }, 65 | } 66 | } 67 | 68 | pub(crate) fn pattern_too_long( 69 | pattern: PatternID, 70 | len: usize, 71 | ) -> BuildError { 72 | BuildError { kind: ErrorKind::PatternTooLong { pattern, len } } 73 | } 74 | } 75 | 76 | #[cfg(feature = "std")] 77 | impl std::error::Error for BuildError {} 78 | 79 | impl core::fmt::Display for BuildError { 80 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 81 | match self.kind { 82 | ErrorKind::StateIDOverflow { max, requested_max } => { 83 | write!( 84 | f, 85 | "state identifier overflow: failed to create state ID \ 86 | from {}, which exceeds the max of {}", 87 | requested_max, max, 88 | ) 89 | } 90 | ErrorKind::PatternIDOverflow { max, requested_max } => { 91 | write!( 92 | f, 93 | "pattern identifier overflow: failed to create pattern ID \ 94 | from {}, which exceeds the max of {}", 95 | requested_max, max, 96 | ) 97 | } 98 | ErrorKind::PatternTooLong { pattern, len } => { 99 | write!( 100 | f, 101 | "pattern {} with length {} exceeds \ 102 | the maximum pattern length of {}", 103 | pattern.as_usize(), 104 | len, 105 | SmallIndex::MAX.as_usize(), 106 | ) 107 | } 108 | } 109 | } 110 | } 111 | 112 | /// An error that occurred during an Aho-Corasick search. 113 | /// 114 | /// An error that occurs during a search is limited to some kind of 115 | /// misconfiguration that resulted in an illegal call. Stated differently, 116 | /// whether an error occurs is not dependent on the specific bytes in the 117 | /// haystack. 
118 | /// 119 | /// Examples of misconfiguration: 120 | /// 121 | /// * Executing a stream or overlapping search on a searcher that was built was 122 | /// something other than [`MatchKind::Standard`](crate::MatchKind::Standard) 123 | /// semantics. 124 | /// * Requested an anchored or an unanchored search on a searcher that doesn't 125 | /// support unanchored or anchored searches, respectively. 126 | /// 127 | /// When the `std` feature is enabled, this implements the `std::error::Error` 128 | /// trait. 129 | #[derive(Clone, Debug, Eq, PartialEq)] 130 | pub struct MatchError(alloc::boxed::Box); 131 | 132 | impl MatchError { 133 | /// Create a new error value with the given kind. 134 | /// 135 | /// This is a more verbose version of the kind-specific constructors, e.g., 136 | /// `MatchError::unsupported_stream`. 137 | pub fn new(kind: MatchErrorKind) -> MatchError { 138 | MatchError(alloc::boxed::Box::new(kind)) 139 | } 140 | 141 | /// Returns a reference to the underlying error kind. 142 | pub fn kind(&self) -> &MatchErrorKind { 143 | &self.0 144 | } 145 | 146 | /// Create a new "invalid anchored search" error. This occurs when the 147 | /// caller requests an anchored search but where anchored searches aren't 148 | /// supported. 149 | /// 150 | /// This is the same as calling `MatchError::new` with a 151 | /// [`MatchErrorKind::InvalidInputAnchored`] kind. 152 | pub fn invalid_input_anchored() -> MatchError { 153 | MatchError::new(MatchErrorKind::InvalidInputAnchored) 154 | } 155 | 156 | /// Create a new "invalid unanchored search" error. This occurs when the 157 | /// caller requests an unanchored search but where unanchored searches 158 | /// aren't supported. 159 | /// 160 | /// This is the same as calling `MatchError::new` with a 161 | /// [`MatchErrorKind::InvalidInputUnanchored`] kind. 
162 | pub fn invalid_input_unanchored() -> MatchError { 163 | MatchError::new(MatchErrorKind::InvalidInputUnanchored) 164 | } 165 | 166 | /// Create a new "unsupported stream search" error. This occurs when the 167 | /// caller requests a stream search while using an Aho-Corasick automaton 168 | /// with a match kind other than [`MatchKind::Standard`]. 169 | /// 170 | /// The match kind given should be the match kind of the automaton. It 171 | /// should never be `MatchKind::Standard`. 172 | pub fn unsupported_stream(got: MatchKind) -> MatchError { 173 | MatchError::new(MatchErrorKind::UnsupportedStream { got }) 174 | } 175 | 176 | /// Create a new "unsupported overlapping search" error. This occurs when 177 | /// the caller requests an overlapping search while using an Aho-Corasick 178 | /// automaton with a match kind other than [`MatchKind::Standard`]. 179 | /// 180 | /// The match kind given should be the match kind of the automaton. It 181 | /// should never be `MatchKind::Standard`. 182 | pub fn unsupported_overlapping(got: MatchKind) -> MatchError { 183 | MatchError::new(MatchErrorKind::UnsupportedOverlapping { got }) 184 | } 185 | 186 | /// Create a new "unsupported empty pattern" error. This occurs when the 187 | /// caller requests a search for which matching an automaton that contains 188 | /// an empty pattern string is not supported. 189 | pub fn unsupported_empty() -> MatchError { 190 | MatchError::new(MatchErrorKind::UnsupportedEmpty) 191 | } 192 | } 193 | 194 | /// The underlying kind of a [`MatchError`]. 195 | /// 196 | /// This is a **non-exhaustive** enum. That means new variants may be added in 197 | /// a semver-compatible release. 198 | #[non_exhaustive] 199 | #[derive(Clone, Debug, Eq, PartialEq)] 200 | pub enum MatchErrorKind { 201 | /// An error indicating that an anchored search was requested, but from a 202 | /// searcher that was built without anchored support. 
203 | InvalidInputAnchored, 204 | /// An error indicating that an unanchored search was requested, but from a 205 | /// searcher that was built without unanchored support. 206 | InvalidInputUnanchored, 207 | /// An error indicating that a stream search was attempted on an 208 | /// Aho-Corasick automaton with an unsupported `MatchKind`. 209 | UnsupportedStream { 210 | /// The match semantics for the automaton that was used. 211 | got: MatchKind, 212 | }, 213 | /// An error indicating that an overlapping search was attempted on an 214 | /// Aho-Corasick automaton with an unsupported `MatchKind`. 215 | UnsupportedOverlapping { 216 | /// The match semantics for the automaton that was used. 217 | got: MatchKind, 218 | }, 219 | /// An error indicating that the operation requested doesn't support 220 | /// automatons that contain an empty pattern string. 221 | UnsupportedEmpty, 222 | } 223 | 224 | #[cfg(feature = "std")] 225 | impl std::error::Error for MatchError {} 226 | 227 | impl core::fmt::Display for MatchError { 228 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 229 | match *self.kind() { 230 | MatchErrorKind::InvalidInputAnchored => { 231 | write!(f, "anchored searches are not supported or enabled") 232 | } 233 | MatchErrorKind::InvalidInputUnanchored => { 234 | write!(f, "unanchored searches are not supported or enabled") 235 | } 236 | MatchErrorKind::UnsupportedStream { got } => { 237 | write!( 238 | f, 239 | "match kind {:?} does not support stream searching", 240 | got, 241 | ) 242 | } 243 | MatchErrorKind::UnsupportedOverlapping { got } => { 244 | write!( 245 | f, 246 | "match kind {:?} does not support overlapping searches", 247 | got, 248 | ) 249 | } 250 | MatchErrorKind::UnsupportedEmpty => { 251 | write!( 252 | f, 253 | "matching with an empty pattern string is not \ 254 | supported for this operation", 255 | ) 256 | } 257 | } 258 | } 259 | } 260 | -------------------------------------------------------------------------------- 
/*!
This module provides several integer oriented traits for converting between
both fixed size integers and integers whose size varies based on the target
(like `usize`).

The main design principle for this module is to centralize all uses of `as`.
The thinking here is that `as` makes it very easy to perform accidental lossy
conversions, and if we centralize all its uses here under more descriptive
higher level operations, its use and correctness becomes easier to audit.

This was copied mostly wholesale from `regex-automata`.

NOTE: for simplicity, we don't take target pointer width into account here for
`usize` conversions. Since we currently only panic in debug mode, skipping the
check when it can be proven it isn't needed at compile time doesn't really
matter. Now, if we wind up wanting to do as many checks as possible in release
mode, then we would want to skip those when we know the conversions are always
non-lossy.
*/

// We define a little more than what we need, but I'd rather just have
// everything via a consistent and uniform API then have holes.
#![allow(dead_code)]

/// Conversions from `u8`. A `u8` always fits in a `usize`, so no check is
/// needed.
pub(crate) trait U8 {
    fn as_usize(self) -> usize;
}

impl U8 for u8 {
    #[inline]
    fn as_usize(self) -> usize {
        usize::from(self)
    }
}

/// Conversions from `u16`: infallible widening to `usize` and extraction of
/// the low/high bytes.
pub(crate) trait U16 {
    fn as_usize(self) -> usize;
    fn low_u8(self) -> u8;
    fn high_u8(self) -> u8;
}

impl U16 for u16 {
    #[inline]
    fn as_usize(self) -> usize {
        usize::from(self)
    }

    #[inline]
    fn low_u8(self) -> u8 {
        // Truncation to the low byte is intentional.
        self as u8
    }

    #[inline]
    fn high_u8(self) -> u8 {
        (self >> 8) as u8
    }
}

/// Conversions from `u32`. `as_usize` is checked in debug mode since a
/// `usize` may be 16 bits on some targets.
pub(crate) trait U32 {
    fn as_usize(self) -> usize;
    fn low_u8(self) -> u8;
    fn low_u16(self) -> u16;
    fn high_u16(self) -> u16;
}

impl U32 for u32 {
    #[inline]
    fn as_usize(self) -> usize {
        #[cfg(debug_assertions)]
        {
            usize::try_from(self).expect("u32 overflowed usize")
        }
        #[cfg(not(debug_assertions))]
        {
            self as usize
        }
    }

    #[inline]
    fn low_u8(self) -> u8 {
        self as u8
    }

    #[inline]
    fn low_u16(self) -> u16 {
        self as u16
    }

    #[inline]
    fn high_u16(self) -> u16 {
        (self >> 16) as u16
    }
}

/// Conversions from `u64`. `as_usize` is checked in debug mode since a
/// `usize` is commonly 32 bits (or less).
pub(crate) trait U64 {
    fn as_usize(self) -> usize;
    fn low_u8(self) -> u8;
    fn low_u16(self) -> u16;
    fn low_u32(self) -> u32;
    fn high_u32(self) -> u32;
}

impl U64 for u64 {
    #[inline]
    fn as_usize(self) -> usize {
        #[cfg(debug_assertions)]
        {
            usize::try_from(self).expect("u64 overflowed usize")
        }
        #[cfg(not(debug_assertions))]
        {
            self as usize
        }
    }

    #[inline]
    fn low_u8(self) -> u8 {
        self as u8
    }

    #[inline]
    fn low_u16(self) -> u16 {
        self as u16
    }

    #[inline]
    fn low_u32(self) -> u32 {
        self as u32
    }

    #[inline]
    fn high_u32(self) -> u32 {
        (self >> 32) as u32
    }
}

/// Conversions from `i8`. `as_usize` panics (in debug mode) on negative
/// values; `to_bits`/`from_bits` reinterpret the raw two's complement bits.
pub(crate) trait I8 {
    fn as_usize(self) -> usize;
    fn to_bits(self) -> u8;
    fn from_bits(n: u8) -> i8;
}

impl I8 for i8 {
    #[inline]
    fn as_usize(self) -> usize {
        #[cfg(debug_assertions)]
        {
            usize::try_from(self).expect("i8 overflowed usize")
        }
        #[cfg(not(debug_assertions))]
        {
            self as usize
        }
    }

    #[inline]
    fn to_bits(self) -> u8 {
        self as u8
    }

    #[inline]
    fn from_bits(n: u8) -> i8 {
        n as i8
    }
}

/// Conversions from `i32`. Same conventions as [`I8`].
pub(crate) trait I32 {
    fn as_usize(self) -> usize;
    fn to_bits(self) -> u32;
    fn from_bits(n: u32) -> i32;
}

impl I32 for i32 {
    #[inline]
    fn as_usize(self) -> usize {
        #[cfg(debug_assertions)]
        {
            usize::try_from(self).expect("i32 overflowed usize")
        }
        #[cfg(not(debug_assertions))]
        {
            self as usize
        }
    }

    #[inline]
    fn to_bits(self) -> u32 {
        self as u32
    }

    #[inline]
    fn from_bits(n: u32) -> i32 {
        n as i32
    }
}

/// Conversions from `i64`. Same conventions as [`I8`].
pub(crate) trait I64 {
    fn as_usize(self) -> usize;
    fn to_bits(self) -> u64;
    fn from_bits(n: u64) -> i64;
}

impl I64 for i64 {
    #[inline]
    fn as_usize(self) -> usize {
        #[cfg(debug_assertions)]
        {
            usize::try_from(self).expect("i64 overflowed usize")
        }
        #[cfg(not(debug_assertions))]
        {
            self as usize
        }
    }

    #[inline]
    fn to_bits(self) -> u64 {
        self as u64
    }

    #[inline]
    fn from_bits(n: u64) -> i64 {
        n as i64
    }
}

/// Narrowing conversions from `usize` to fixed width unsigned integers.
/// Checked in debug mode, silently truncating in release mode.
pub(crate) trait Usize {
    fn as_u8(self) -> u8;
    fn as_u16(self) -> u16;
    fn as_u32(self) -> u32;
    fn as_u64(self) -> u64;
}

impl Usize for usize {
    #[inline]
    fn as_u8(self) -> u8 {
        #[cfg(debug_assertions)]
        {
            u8::try_from(self).expect("usize overflowed u8")
        }
        #[cfg(not(debug_assertions))]
        {
            self as u8
        }
    }

    #[inline]
    fn as_u16(self) -> u16 {
        #[cfg(debug_assertions)]
        {
            u16::try_from(self).expect("usize overflowed u16")
        }
        #[cfg(not(debug_assertions))]
        {
            self as u16
        }
    }

    #[inline]
    fn as_u32(self) -> u32 {
        #[cfg(debug_assertions)]
        {
            u32::try_from(self).expect("usize overflowed u32")
        }
        #[cfg(not(debug_assertions))]
        {
            self as u32
        }
    }

    #[inline]
    fn as_u64(self) -> u64 {
        #[cfg(debug_assertions)]
        {
            u64::try_from(self).expect("usize overflowed u64")
        }
        #[cfg(not(debug_assertions))]
        {
            self as u64
        }
    }
}

// Pointers aren't integers, but we convert pointers to integers to perform
// offset arithmetic in some places. (And no, we don't convert the integers
// back to pointers.) So add 'as_usize' conversions here too for completeness.
//
// These 'as' casts are actually okay because they're always non-lossy. But the
// idea here is to just try and remove as much 'as' as possible, particularly
// in this crate where we are being really paranoid about offsets and making
// sure we don't panic on inputs that might be untrusted. This way, the 'as'
// casts become easier to audit if they're all in one place, even when some of
// them are actually okay 100% of the time.
269 | 270 | pub(crate) trait Pointer { 271 | fn as_usize(self) -> usize; 272 | } 273 | 274 | impl Pointer for *const T { 275 | fn as_usize(self) -> usize { 276 | self as usize 277 | } 278 | } 279 | -------------------------------------------------------------------------------- /src/util/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod alphabet; 2 | #[cfg(feature = "std")] 3 | pub(crate) mod buffer; 4 | pub(crate) mod byte_frequencies; 5 | pub(crate) mod debug; 6 | pub(crate) mod error; 7 | pub(crate) mod int; 8 | pub(crate) mod prefilter; 9 | pub(crate) mod primitives; 10 | pub(crate) mod remapper; 11 | pub(crate) mod search; 12 | pub(crate) mod special; 13 | -------------------------------------------------------------------------------- /src/util/remapper.rs: -------------------------------------------------------------------------------- 1 | use alloc::vec::Vec; 2 | 3 | use crate::{nfa::noncontiguous, util::primitives::StateID}; 4 | 5 | /// Remappable is a tightly coupled abstraction that facilitates remapping 6 | /// state identifiers in DFAs. 7 | /// 8 | /// The main idea behind remapping state IDs is that DFAs often need to check 9 | /// if a certain state is a "special" state of some kind (like a match state) 10 | /// during a search. Since this is extremely perf critical code, we want this 11 | /// check to be as fast as possible. Partitioning state IDs into, for example, 12 | /// into "non-match" and "match" states means one can tell if a state is a 13 | /// match state via a simple comparison of the state ID. 14 | /// 15 | /// The issue is that during the DFA construction process, it's not 16 | /// particularly easy to partition the states. Instead, the simplest thing is 17 | /// to often just do a pass over all of the states and shuffle them into their 18 | /// desired partitionings. To do that, we need a mechanism for swapping states. 19 | /// Hence, this abstraction. 
20 | /// 21 | /// Normally, for such little code, I would just duplicate it. But this is a 22 | /// key optimization and the implementation is a bit subtle. So the abstraction 23 | /// is basically a ham-fisted attempt at DRY. The only place we use this is in 24 | /// the dense and one-pass DFAs. 25 | /// 26 | /// See also src/dfa/special.rs for a more detailed explanation of how dense 27 | /// DFAs are partitioned. 28 | pub(crate) trait Remappable: core::fmt::Debug { 29 | /// Return the total number of states. 30 | fn state_len(&self) -> usize; 31 | 32 | /// Swap the states pointed to by the given IDs. The underlying finite 33 | /// state machine should be mutated such that all of the transitions in 34 | /// `id1` are now in the memory region where the transitions for `id2` 35 | /// were, and all of the transitions in `id2` are now in the memory region 36 | /// where the transitions for `id1` were. 37 | /// 38 | /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`. 39 | /// 40 | /// It is expected that, after calling this, the underlying state machine 41 | /// will be left in an inconsistent state, since any other transitions 42 | /// pointing to, e.g., `id1` need to be updated to point to `id2`, since 43 | /// that's where `id1` moved to. 44 | /// 45 | /// In order to "fix" the underlying inconsistent state, a `Remapper` 46 | /// should be used to guarantee that `remap` is called at the appropriate 47 | /// time. 48 | fn swap_states(&mut self, id1: StateID, id2: StateID); 49 | 50 | /// This must remap every single state ID in the underlying value according 51 | /// to the function given. For example, in a DFA, this should remap every 52 | /// transition and every starting state ID. 53 | fn remap(&mut self, map: impl Fn(StateID) -> StateID); 54 | } 55 | 56 | /// Remapper is an abstraction the manages the remapping of state IDs in a 57 | /// finite state machine. 
This is useful when one wants to shuffle states into 58 | /// different positions in the machine. 59 | /// 60 | /// One of the key complexities this manages is the ability to correctly move 61 | /// one state multiple times. 62 | /// 63 | /// Once shuffling is complete, `remap` must be called, which will rewrite 64 | /// all pertinent transitions to updated state IDs. Neglecting to call `remap` 65 | /// will almost certainly result in a corrupt machine. 66 | #[derive(Debug)] 67 | pub(crate) struct Remapper { 68 | /// A map from the index of a state to its pre-multiplied identifier. 69 | /// 70 | /// When a state is swapped with another, then their corresponding 71 | /// locations in this map are also swapped. Thus, its new position will 72 | /// still point to its old pre-multiplied StateID. 73 | /// 74 | /// While there is a bit more to it, this then allows us to rewrite the 75 | /// state IDs in a DFA's transition table in a single pass. This is done 76 | /// by iterating over every ID in this map, then iterating over each 77 | /// transition for the state at that ID and re-mapping the transition from 78 | /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position 79 | /// in this map where `old_id` *started*, and set it to where it ended up 80 | /// after all swaps have been completed. 81 | map: Vec, 82 | /// A way to map indices to state IDs (and back). 83 | idx: IndexMapper, 84 | } 85 | 86 | impl Remapper { 87 | /// Create a new remapper from the given remappable implementation. The 88 | /// remapper can then be used to swap states. The remappable value given 89 | /// here must the same one given to `swap` and `remap`. 90 | /// 91 | /// The given stride should be the stride of the transition table expressed 92 | /// as a power of 2. This stride is used to map between state IDs and state 93 | /// indices. If state IDs and state indices are equivalent, then provide 94 | /// a `stride2` of `0`, which acts as an identity. 
95 | pub(crate) fn new(r: &impl Remappable, stride2: usize) -> Remapper { 96 | let idx = IndexMapper { stride2 }; 97 | let map = (0..r.state_len()).map(|i| idx.to_state_id(i)).collect(); 98 | Remapper { map, idx } 99 | } 100 | 101 | /// Swap two states. Once this is called, callers must follow through to 102 | /// call `remap`, or else it's possible for the underlying remappable 103 | /// value to be in a corrupt state. 104 | pub(crate) fn swap( 105 | &mut self, 106 | r: &mut impl Remappable, 107 | id1: StateID, 108 | id2: StateID, 109 | ) { 110 | if id1 == id2 { 111 | return; 112 | } 113 | r.swap_states(id1, id2); 114 | self.map.swap(self.idx.to_index(id1), self.idx.to_index(id2)); 115 | } 116 | 117 | /// Complete the remapping process by rewriting all state IDs in the 118 | /// remappable value according to the swaps performed. 119 | pub(crate) fn remap(mut self, r: &mut impl Remappable) { 120 | // Update the map to account for states that have been swapped 121 | // multiple times. For example, if (A, C) and (C, G) are swapped, then 122 | // transitions previously pointing to A should now point to G. But if 123 | // we don't update our map, they will erroneously be set to C. All we 124 | // do is follow the swaps in our map until we see our original state 125 | // ID. 126 | // 127 | // The intuition here is to think about how changes are made to the 128 | // map: only through pairwise swaps. That means that starting at any 129 | // given state, it is always possible to find the loop back to that 130 | // state by following the swaps represented in the map (which might be 131 | // 0 swaps). 132 | // 133 | // We are also careful to clone the map before starting in order to 134 | // freeze it. We use the frozen map to find our loops, since we need to 135 | // update our map as well. Without freezing it, our updates could break 136 | // the loops referenced above and produce incorrect results. 
137 | let oldmap = self.map.clone(); 138 | for i in 0..r.state_len() { 139 | let cur_id = self.idx.to_state_id(i); 140 | let mut new_id = oldmap[i]; 141 | if cur_id == new_id { 142 | continue; 143 | } 144 | loop { 145 | let id = oldmap[self.idx.to_index(new_id)]; 146 | if cur_id == id { 147 | self.map[i] = new_id; 148 | break; 149 | } 150 | new_id = id; 151 | } 152 | } 153 | r.remap(|sid| self.map[self.idx.to_index(sid)]); 154 | } 155 | } 156 | 157 | /// A simple type for mapping between state indices and state IDs. 158 | /// 159 | /// The reason why this exists is because state IDs are "premultiplied" in a 160 | /// DFA. That is, in order to get to the transitions for a particular state, 161 | /// one need only use the state ID as-is, instead of having to multiply it by 162 | /// transition table's stride. 163 | /// 164 | /// The downside of this is that it's inconvenient to map between state IDs 165 | /// using a dense map, e.g., Vec. That's because state IDs look like 166 | /// `0`, `stride`, `2*stride`, `3*stride`, etc., instead of `0`, `1`, `2`, `3`, 167 | /// etc. 168 | /// 169 | /// Since our state IDs are premultiplied, we can convert back-and-forth 170 | /// between IDs and indices by simply unmultiplying the IDs and multiplying the 171 | /// indices. 172 | /// 173 | /// Note that for a sparse NFA, state IDs and indices are equivalent. In this 174 | /// case, we set the stride of the index mapped to be `0`, which acts as an 175 | /// identity. 176 | #[derive(Debug)] 177 | struct IndexMapper { 178 | /// The power of 2 corresponding to the stride of the corresponding 179 | /// transition table. 'id >> stride2' de-multiplies an ID while 'index << 180 | /// stride2' pre-multiplies an index to an ID. 181 | stride2: usize, 182 | } 183 | 184 | impl IndexMapper { 185 | /// Convert a state ID to a state index. 186 | fn to_index(&self, id: StateID) -> usize { 187 | id.as_usize() >> self.stride2 188 | } 189 | 190 | /// Convert a state index to a state ID. 
191 | fn to_state_id(&self, index: usize) -> StateID { 192 | // CORRECTNESS: If the given index is not valid, then it is not 193 | // required for this to panic or return a valid state ID. We'll "just" 194 | // wind up with panics or silent logic errors at some other point. But 195 | // this is OK because if Remappable::state_len is correct and so is 196 | // 'to_index', then all inputs to 'to_state_id' should be valid indices 197 | // and thus transform into valid state IDs. 198 | StateID::new_unchecked(index << self.stride2) 199 | } 200 | } 201 | 202 | impl Remappable for noncontiguous::NFA { 203 | fn state_len(&self) -> usize { 204 | noncontiguous::NFA::states(self).len() 205 | } 206 | 207 | fn swap_states(&mut self, id1: StateID, id2: StateID) { 208 | noncontiguous::NFA::swap_states(self, id1, id2) 209 | } 210 | 211 | fn remap(&mut self, map: impl Fn(StateID) -> StateID) { 212 | noncontiguous::NFA::remap(self, map) 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /src/util/special.rs: -------------------------------------------------------------------------------- 1 | use crate::util::primitives::StateID; 2 | 3 | /// A collection of sentinel state IDs for Aho-Corasick automata. 4 | /// 5 | /// This specifically enables the technique by which we determine which states 6 | /// are dead, matches or start states. Namely, by arranging states in a 7 | /// particular order, we can determine the type of a state simply by looking at 8 | /// its ID. 9 | #[derive(Clone, Debug)] 10 | pub(crate) struct Special { 11 | /// The maximum ID of all the "special" states. This corresponds either to 12 | /// start_anchored_id when a prefilter is active and max_match_id when a 13 | /// prefilter is not active. The idea here is that if there is no prefilter, 14 | /// then there is no point in treating start states as special. 15 | pub(crate) max_special_id: StateID, 16 | /// The maximum ID of all the match states. 
Any state ID bigger than this 17 | /// is guaranteed to be a non-match ID. 18 | /// 19 | /// It is possible and legal for max_match_id to be equal to 20 | /// start_anchored_id, which occurs precisely in the case where the empty 21 | /// string is a pattern that was added to the underlying automaton. 22 | pub(crate) max_match_id: StateID, 23 | /// The state ID of the start state used for unanchored searches. 24 | pub(crate) start_unanchored_id: StateID, 25 | /// The state ID of the start state used for anchored searches. This is 26 | /// always start_unanchored_id+1. 27 | pub(crate) start_anchored_id: StateID, 28 | } 29 | 30 | impl Special { 31 | /// Create a new set of "special" state IDs with all IDs initialized to 32 | /// zero. The general idea here is that they will be updated and set to 33 | /// correct values later. 34 | pub(crate) fn zero() -> Special { 35 | Special { 36 | max_special_id: StateID::ZERO, 37 | max_match_id: StateID::ZERO, 38 | start_unanchored_id: StateID::ZERO, 39 | start_anchored_id: StateID::ZERO, 40 | } 41 | } 42 | } 43 | --------------------------------------------------------------------------------