├── .github ├── FUNDING.yml └── workflows │ └── ci.yml ├── .gitignore ├── .vim └── coc-settings.json ├── COPYING ├── Cargo.toml ├── DESIGN.md ├── LICENSE-MIT ├── README.md ├── UNLICENSE ├── aho-corasick-debug ├── Cargo.toml └── main.rs ├── benchmarks ├── definitions │ ├── build.toml │ ├── curated.toml │ ├── jetscii.toml │ ├── random │ │ ├── many.toml │ │ ├── memchr.toml │ │ └── misc.toml │ ├── regexcurated.toml │ ├── same.toml │ ├── sherlock.toml │ └── teddy.toml ├── engines.toml ├── engines │ ├── naive │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ └── main.rs │ ├── rust-aho-corasick │ │ ├── .gitignore │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ ├── README.md │ │ └── main.rs │ ├── rust-daachorse │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ └── main.rs │ ├── rust-jetscii │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ └── main.rs │ └── rust-old-aho-corasick │ │ ├── .gitignore │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ ├── README.md │ │ └── main.rs ├── haystacks │ ├── catalog.data.gov │ │ ├── README.md │ │ └── mental-health-4weeks.xml │ ├── opensubtitles │ │ ├── README.md │ │ ├── en-huge.txt │ │ ├── en-medium.txt │ │ ├── en-sampled.txt │ │ ├── en-small.txt │ │ ├── en-teeny.txt │ │ ├── en-tiny.txt │ │ ├── ru-huge.txt │ │ ├── ru-medium.txt │ │ ├── ru-sampled.txt │ │ ├── ru-small.txt │ │ ├── ru-teeny.txt │ │ ├── ru-tiny.txt │ │ ├── zh-huge.txt │ │ ├── zh-medium.txt │ │ ├── zh-sampled.txt │ │ ├── zh-small.txt │ │ ├── zh-teeny.txt │ │ └── zh-tiny.txt │ ├── random.txt │ ├── random10x.txt │ └── sherlock.txt ├── record │ ├── aarch64 │ │ ├── 2023-09-04.csv │ │ ├── 2023-09-07.csv │ │ ├── 2023-09-16.csv │ │ └── 2023-09-17.csv │ └── x86_64 │ │ ├── 2023-09-04.csv │ │ ├── 2023-09-07.csv │ │ ├── 2023-09-16.csv │ │ └── 2023-09-17.csv ├── regexes │ ├── dictionary │ │ └── english │ │ │ ├── length-10.txt │ │ │ ├── length-15.txt │ │ │ ├── sorted-by-length.txt │ │ │ └── sorted.txt │ ├── words-100 │ ├── words-15000 │ └── words-5000 └── shared │ ├── Cargo.lock │ ├── Cargo.toml │ └── lib.rs ├── fuzz ├── 
.gitignore ├── Cargo.toml └── fuzz-targets │ └── fuzz_find.rs ├── rustfmt.toml └── src ├── ahocorasick.rs ├── automaton.rs ├── dfa.rs ├── lib.rs ├── macros.rs ├── nfa ├── contiguous.rs ├── mod.rs └── noncontiguous.rs ├── packed ├── api.rs ├── ext.rs ├── mod.rs ├── pattern.rs ├── rabinkarp.rs ├── teddy │ ├── README.md │ ├── builder.rs │ ├── generic.rs │ └── mod.rs ├── tests.rs └── vector.rs ├── tests.rs ├── transducer.rs └── util ├── alphabet.rs ├── buffer.rs ├── byte_frequencies.rs ├── debug.rs ├── error.rs ├── int.rs ├── mod.rs ├── prefilter.rs ├── primitives.rs ├── remapper.rs ├── search.rs └── special.rs /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [BurntSushi] 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - master 7 | schedule: 8 | - cron: '00 01 * * *' 9 | 10 | # The section is needed to drop write-all permissions that are granted on 11 | # `schedule` event. By specifying any permission explicitly all others are set 12 | # to none. By using the principle of least privilege the damage a compromised 13 | # workflow can do (because of an injection or compromised third party tool or 14 | # action) is restricted. Currently the workflow doesn't need any additional 15 | # permission except for pulling the code. Adding labels to issues, commenting 16 | # on pull-requests, etc. 
may need additional permissions: 17 | # 18 | # Syntax for this section: 19 | # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#permissions 20 | # 21 | # Reference for how to assign permissions on a job-by-job basis: 22 | # https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs 23 | # 24 | # Reference for available permissions that we can enable if needed: 25 | # https://docs.github.com/en/actions/security-guides/automatic-token-authentication#permissions-for-the-github_token 26 | permissions: 27 | # to fetch code (actions/checkout) 28 | contents: read 29 | 30 | jobs: 31 | test: 32 | name: test 33 | env: 34 | # For some builds, we use cross to test on 32-bit and big-endian 35 | # systems. 36 | CARGO: cargo 37 | # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`. 38 | # Note that we only use cross on Linux, so setting a target on a 39 | # different OS will just use normal cargo. 40 | TARGET: 41 | # Bump this as appropriate. We pin to a version to make sure CI 42 | # continues to work as cross releases in the past have broken things 43 | # in subtle ways. 
44 | CROSS_VERSION: v0.2.5 45 | runs-on: ${{ matrix.os }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | include: 50 | - build: pinned 51 | os: ubuntu-latest 52 | rust: 1.60.0 53 | - build: stable 54 | os: ubuntu-latest 55 | rust: stable 56 | - build: stable-x86 57 | os: ubuntu-latest 58 | rust: stable 59 | target: i686-unknown-linux-gnu 60 | - build: stable-aarch64 61 | os: ubuntu-latest 62 | rust: stable 63 | target: aarch64-unknown-linux-gnu 64 | - build: stable-powerpc64 65 | os: ubuntu-latest 66 | rust: stable 67 | target: powerpc64-unknown-linux-gnu 68 | - build: stable-s390x 69 | os: ubuntu-latest 70 | rust: stable 71 | target: s390x-unknown-linux-gnu 72 | - build: beta 73 | os: ubuntu-latest 74 | rust: beta 75 | - build: nightly 76 | os: ubuntu-latest 77 | rust: nightly 78 | - build: macos 79 | os: macos-latest 80 | rust: stable 81 | - build: win-msvc 82 | os: windows-latest 83 | rust: stable 84 | - build: win-gnu 85 | os: windows-latest 86 | rust: stable-x86_64-gnu 87 | steps: 88 | - name: Checkout repository 89 | uses: actions/checkout@v3 90 | - name: Install Rust 91 | uses: dtolnay/rust-toolchain@master 92 | with: 93 | toolchain: ${{ matrix.rust }} 94 | - name: Install and configure Cross 95 | if: matrix.os == 'ubuntu-latest' && matrix.target != '' 96 | run: | 97 | # In the past, new releases of 'cross' have broken CI. So for now, we 98 | # pin it. We also use their pre-compiled binary releases because cross 99 | # has over 100 dependencies and takes a bit to compile. 100 | dir="$RUNNER_TEMP/cross-download" 101 | mkdir "$dir" 102 | echo "$dir" >> $GITHUB_PATH 103 | cd "$dir" 104 | curl -LO "https://github.com/cross-rs/cross/releases/download/$CROSS_VERSION/cross-x86_64-unknown-linux-musl.tar.gz" 105 | tar xf cross-x86_64-unknown-linux-musl.tar.gz 106 | 107 | # We used to install 'cross' from master, but it kept failing. So now 108 | # we build from a known-good version until 'cross' becomes more stable 109 | # or we find an alternative. 
Notably, between v0.2.1 and current 110 | # master (2022-06-14), the number of Cross's dependencies has doubled. 111 | echo "CARGO=cross" >> $GITHUB_ENV 112 | echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV 113 | - name: Show command used for Cargo 114 | run: | 115 | echo "cargo command is: ${{ env.CARGO }}" 116 | echo "target flag is: ${{ env.TARGET }}" 117 | - name: Show CPU info for debugging 118 | if: matrix.os == 'ubuntu-latest' 119 | run: lscpu 120 | # See: https://github.com/rust-lang/regex/blob/a2887636930156023172e4b376a6febad4e49120/.github/workflows/ci.yml#L145-L163 121 | - name: Pin memchr to 2.6.2 122 | if: matrix.build == 'pinned' 123 | run: cargo update -p memchr --precise 2.6.2 124 | - run: ${{ env.CARGO }} build --verbose $TARGET 125 | - run: ${{ env.CARGO }} doc --verbose $TARGET 126 | - run: ${{ env.CARGO }} test --verbose $TARGET 127 | - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features std,perf-literal $TARGET 128 | - run: ${{ env.CARGO }} test --lib --verbose --no-default-features $TARGET 129 | - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features std $TARGET 130 | - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features perf-literal $TARGET 131 | - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features std,perf-literal,logging $TARGET 132 | - if: matrix.build == 'nightly' 133 | run: ${{ env.CARGO }} build --manifest-path aho-corasick-debug/Cargo.toml $TARGET 134 | 135 | rustfmt: 136 | name: rustfmt 137 | runs-on: ubuntu-latest 138 | steps: 139 | - name: Checkout repository 140 | uses: actions/checkout@v3 141 | - name: Install Rust 142 | uses: dtolnay/rust-toolchain@master 143 | with: 144 | toolchain: stable 145 | components: rustfmt 146 | - name: Check formatting 147 | run: | 148 | cargo fmt --all -- --check 149 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .*.swp 2 | doc 3 | tags 4 | examples/ss10pusa.csv 5 | build 6 | target 7 | /Cargo.lock 8 | scratch* 9 | bench_large/huge 10 | BREADCRUMBS 11 | /tmp 12 | /aho-corasick-debug/Cargo.lock 13 | -------------------------------------------------------------------------------- /.vim/coc-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "rust-analyzer.linkedProjects": [ 3 | "aho-corasick-debug/Cargo.toml", 4 | "benchmarks/engines/rust-aho-corasick/Cargo.toml", 5 | "benchmarks/engines/rust-daachorse/Cargo.toml", 6 | "benchmarks/engines/rust-jetscii/Cargo.toml", 7 | "benchmarks/engines/naive/Cargo.toml", 8 | "benchmarks/shared/Cargo.toml", 9 | "fuzz/Cargo.toml", 10 | "Cargo.toml" 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | This project is dual-licensed under the Unlicense and MIT licenses. 2 | 3 | You may use this code under the terms of either license. 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "aho-corasick" 3 | version = "1.1.3" #:version 4 | authors = ["Andrew Gallant "] 5 | description = "Fast multiple substring searching." 
6 | homepage = "https://github.com/BurntSushi/aho-corasick" 7 | repository = "https://github.com/BurntSushi/aho-corasick" 8 | readme = "README.md" 9 | keywords = ["string", "search", "text", "pattern", "multi"] 10 | license = "Unlicense OR MIT" 11 | categories = ["text-processing"] 12 | autotests = false 13 | exclude = ["/aho-corasick-debug", "/benchmarks", "/tmp"] 14 | edition = "2021" 15 | rust-version = "1.60.0" 16 | 17 | [lib] 18 | name = "aho_corasick" 19 | 20 | [features] 21 | default = ["std", "perf-literal"] 22 | std = ["memchr?/std"] 23 | 24 | # Enables prefilter optimizations that depend on external crates. 25 | perf-literal = ["dep:memchr"] 26 | 27 | # Enable logging via the 'log' crate. This is useful for seeing messages about 28 | # internal decisions and metrics. For example, how the choice of the internal 29 | # Aho-Corasick implementation is used or the heap usage of an automaton. 30 | logging = ["dep:log"] 31 | 32 | # Provides a trait impl for fst::Automaton for nfa::noncontiguous::NFA, 33 | # nfa::contiguous::NFA and dfa::DFA. This is useful for searching an 34 | # FST with an Aho-Corasick automaton. Note that this does not apply 35 | # to the top-level 'AhoCorasick' type, as it does not implement the 36 | # aho_corasick::automaton::Automaton trait, and thus enabling this feature does 37 | # not cause it to implement fst::Automaton either. 38 | # 39 | # NOTE: Currently this feature is not available as `fst` is not at 1.0 yet, 40 | # and this would make `fst` a public dependency. If you absolutely need this, 41 | # you can copy the (very small) src/transducer.rs file to your tree. It 42 | # specifically does not use any private APIs and should work after replacing 43 | # 'crate::' with 'aho_corasick::'. 44 | # 45 | # NOTE: I think my current plan is to flip this around an add an optional 46 | # dependency on 'aho-corasick' to the 'fst' crate and move the trait impls 47 | # there. But I haven't gotten around to it yet. 
48 | # transducer = ["fst"] 49 | 50 | [dependencies] 51 | log = { version = "0.4.17", optional = true } 52 | memchr = { version = "2.4.0", default-features = false, optional = true } 53 | 54 | [dev-dependencies] 55 | doc-comment = "0.3.3" 56 | # fst = "0.4.5" 57 | 58 | [package.metadata.docs.rs] 59 | # We want to document all features. 60 | all-features = true 61 | # This opts into a nightly unstable option to show the features that need to be 62 | # enabled for public API items. To do that, we set 'docsrs', and when that's 63 | # enabled, we enable the 'doc_auto_cfg' feature. 64 | # 65 | # To test this locally, run: 66 | # 67 | # RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features 68 | rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"] 69 | 70 | [profile.release] 71 | debug = true 72 | 73 | [profile.bench] 74 | debug = true 75 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Andrew Gallant 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | aho-corasick 2 | ============ 3 | A library for finding occurrences of many patterns at once with SIMD 4 | acceleration in some cases. This library provides multiple pattern 5 | search principally through an implementation of the 6 | [Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), 7 | which builds a finite state machine for executing searches in linear time. 8 | Features include case insensitive matching, overlapping matches, fast searching 9 | via SIMD and optional full DFA construction and search & replace in streams. 10 | 11 | [![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions) 12 | [![crates.io](https://img.shields.io/crates/v/aho-corasick.svg)](https://crates.io/crates/aho-corasick) 13 | 14 | Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). 15 | 16 | 17 | ### Documentation 18 | 19 | https://docs.rs/aho-corasick 20 | 21 | 22 | ### Usage 23 | 24 | Run `cargo add aho-corasick` to automatically add this crate as a dependency 25 | in your `Cargo.toml` file. 26 | 27 | 28 | ### Example: basic searching 29 | 30 | This example shows how to search for occurrences of multiple patterns 31 | simultaneously. Each match includes the pattern that matched along with the 32 | byte offsets of the match. 
33 | 34 | ```rust 35 | use aho_corasick::{AhoCorasick, PatternID}; 36 | 37 | let patterns = &["apple", "maple", "Snapple"]; 38 | let haystack = "Nobody likes maple in their apple flavored Snapple."; 39 | 40 | let ac = AhoCorasick::new(patterns).unwrap(); 41 | let mut matches = vec![]; 42 | for mat in ac.find_iter(haystack) { 43 | matches.push((mat.pattern(), mat.start(), mat.end())); 44 | } 45 | assert_eq!(matches, vec![ 46 | (PatternID::must(1), 13, 18), 47 | (PatternID::must(0), 28, 33), 48 | (PatternID::must(2), 43, 50), 49 | ]); 50 | ``` 51 | 52 | 53 | ### Example: ASCII case insensitivity 54 | 55 | This is like the previous example, but matches `Snapple` case insensitively 56 | using `AhoCorasickBuilder`: 57 | 58 | ```rust 59 | use aho_corasick::{AhoCorasick, PatternID}; 60 | 61 | let patterns = &["apple", "maple", "snapple"]; 62 | let haystack = "Nobody likes maple in their apple flavored Snapple."; 63 | 64 | let ac = AhoCorasick::builder() 65 | .ascii_case_insensitive(true) 66 | .build(patterns) 67 | .unwrap(); 68 | let mut matches = vec![]; 69 | for mat in ac.find_iter(haystack) { 70 | matches.push((mat.pattern(), mat.start(), mat.end())); 71 | } 72 | assert_eq!(matches, vec![ 73 | (PatternID::must(1), 13, 18), 74 | (PatternID::must(0), 28, 33), 75 | (PatternID::must(2), 43, 50), 76 | ]); 77 | ``` 78 | 79 | 80 | ### Example: replacing matches in a stream 81 | 82 | This example shows how to execute a search and replace on a stream without 83 | loading the entire stream into memory first. 84 | 85 | ```rust,ignore 86 | use aho_corasick::AhoCorasick; 87 | 88 | let patterns = &["fox", "brown", "quick"]; 89 | let replace_with = &["sloth", "grey", "slow"]; 90 | 91 | // In a real example, these might be `std::fs::File`s instead. All you need to 92 | // do is supply a pair of `std::io::Read` and `std::io::Write` implementations. 
93 | let rdr = "The quick brown fox."; 94 | let mut wtr = vec![]; 95 | 96 | let ac = AhoCorasick::new(patterns).unwrap(); 97 | ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with) 98 | .expect("stream_replace_all failed"); 99 | assert_eq!(b"The slow grey sloth.".to_vec(), wtr); 100 | ``` 101 | 102 | 103 | ### Example: finding the leftmost first match 104 | 105 | In the textbook description of Aho-Corasick, its formulation is typically 106 | structured such that it reports all possible matches, even when they overlap 107 | with another. In many cases, overlapping matches may not be desired, such as 108 | the case of finding all successive non-overlapping matches like you might with 109 | a standard regular expression. 110 | 111 | Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do 112 | this doesn't always work in the expected way, since it will report matches as 113 | soon as they are seen. For example, consider matching the regex `Samwise|Sam` 114 | against the text `Samwise`. Most regex engines (that are Perl-like, or 115 | non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick 116 | algorithm modified for reporting non-overlapping matches will report `Sam`. 117 | 118 | A novel contribution of this library is the ability to change the match 119 | semantics of Aho-Corasick (without additional search time overhead) such that 120 | `Samwise` is reported instead. 
For example, here's the standard approach: 121 | 122 | ```rust 123 | use aho_corasick::AhoCorasick; 124 | 125 | let patterns = &["Samwise", "Sam"]; 126 | let haystack = "Samwise"; 127 | 128 | let ac = AhoCorasick::new(patterns).unwrap(); 129 | let mat = ac.find(haystack).expect("should have a match"); 130 | assert_eq!("Sam", &haystack[mat.start()..mat.end()]); 131 | ``` 132 | 133 | And now here's the leftmost-first version, which matches how a Perl-like 134 | regex will work: 135 | 136 | ```rust 137 | use aho_corasick::{AhoCorasick, MatchKind}; 138 | 139 | let patterns = &["Samwise", "Sam"]; 140 | let haystack = "Samwise"; 141 | 142 | let ac = AhoCorasick::builder() 143 | .match_kind(MatchKind::LeftmostFirst) 144 | .build(patterns) 145 | .unwrap(); 146 | let mat = ac.find(haystack).expect("should have a match"); 147 | assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); 148 | ``` 149 | 150 | In addition to leftmost-first semantics, this library also supports 151 | leftmost-longest semantics, which match the POSIX behavior of a regular 152 | expression alternation. See `MatchKind` in the docs for more details. 153 | 154 | 155 | ### Minimum Rust version policy 156 | 157 | This crate's minimum supported `rustc` version is `1.60.0`. 158 | 159 | The current policy is that the minimum Rust version required to use this crate 160 | can be increased in minor version updates. For example, if `crate 1.0` requires 161 | Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust 162 | 1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum 163 | version of Rust. 164 | 165 | In general, this crate will be conservative with respect to the minimum 166 | supported version of Rust. 167 | 168 | 169 | ### FFI bindings 170 | 171 | * [G-Research/ahocorasick_rs](https://github.com/G-Research/ahocorasick_rs/) 172 | is a Python wrapper for this library. 
173 | * [tmikus/ahocorasick_rs](https://github.com/tmikus/ahocorasick_rs) is a Go 174 | wrapper for this library. 175 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /aho-corasick-debug/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "aho-corasick-debug" 4 | version = "0.0.1" 5 | authors = ["Andrew Gallant "] 6 | description = "A simple command line tool for playing with Aho-Corasick." 
7 | license = "Unlicense/MIT" 8 | categories = ["text-processing"] 9 | autotests = false 10 | edition = "2018" 11 | 12 | [[bin]] 13 | name = "aho-corasick-debug" 14 | path = "main.rs" 15 | 16 | [dependencies] 17 | aho-corasick = { version = "*", path = "..", features = ["logging"] } 18 | anyhow = "1.0.68" 19 | memmap2 = "0.5.8" 20 | 21 | [dependencies.env_logger] 22 | version = "0.9.3" 23 | default-features = false 24 | # I am quite annoyed that 'auto-color' brings in a whole mess of dependencies, 25 | # so I am keeping it disabled. 26 | features = ["humantime"] 27 | 28 | [dependencies.clap] 29 | version = "2.34.0" 30 | default-features = false 31 | 32 | [profile.release] 33 | debug = true 34 | -------------------------------------------------------------------------------- /aho-corasick-debug/main.rs: -------------------------------------------------------------------------------- 1 | use std::{fs, path::PathBuf, time::Instant}; 2 | 3 | use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind, StartKind}; 4 | use memmap2::Mmap; 5 | 6 | fn main() -> anyhow::Result<()> { 7 | env_logger::try_init()?; 8 | 9 | let args = Args::parse()?; 10 | let ac = args.aho_corasick()?; 11 | let haystack = args.haystack()?; 12 | 13 | eprintln!("automaton heap usage: {} bytes", ac.memory_usage()); 14 | if args.no_search || args.debug { 15 | if args.debug { 16 | eprintln!("{:?}", ac); 17 | } 18 | return Ok(()); 19 | } 20 | 21 | let start = Instant::now(); 22 | let count = ac.find_iter(&haystack).count(); 23 | println!("match count: {}", count); 24 | 25 | let count_time = Instant::now().duration_since(start); 26 | eprintln!("count time: {:?}", count_time); 27 | Ok(()) 28 | } 29 | 30 | #[derive(Debug)] 31 | struct Args { 32 | dictionary: PathBuf, 33 | haystack: PathBuf, 34 | match_kind: MatchKind, 35 | start_kind: StartKind, 36 | kind: Option, 37 | ascii_casei: bool, 38 | dense_depth: usize, 39 | no_prefilter: bool, 40 | no_classes: bool, 41 | no_search: bool, 42 | debug: bool, 43 | } 
44 | 45 | impl Args { 46 | fn parse() -> anyhow::Result { 47 | use clap::{crate_authors, crate_version, App, Arg}; 48 | 49 | let parsed = App::new("Search using aho-corasick") 50 | .author(crate_authors!()) 51 | .version(crate_version!()) 52 | .max_term_width(100) 53 | .arg(Arg::with_name("dictionary").required(true)) 54 | .arg(Arg::with_name("haystack").required(true)) 55 | .arg( 56 | Arg::with_name("kind") 57 | .long("kind") 58 | .possible_values(&[ 59 | "auto", 60 | "noncontiguous", 61 | "contiguous", 62 | "dfa", 63 | ]) 64 | .default_value("auto"), 65 | ) 66 | .arg( 67 | Arg::with_name("match-kind") 68 | .long("match-kind") 69 | .possible_values(&[ 70 | "standard", 71 | "leftmost-first", 72 | "leftmost-longest", 73 | ]) 74 | .default_value("standard"), 75 | ) 76 | .arg( 77 | Arg::with_name("start-kind") 78 | .long("start-kind") 79 | .possible_values(&["both", "unanchored", "anchored"]) 80 | .default_value("unanchored"), 81 | ) 82 | .arg( 83 | Arg::with_name("ascii-case-insensitive") 84 | .long("ascii-case-insensitive") 85 | .short("i"), 86 | ) 87 | .arg( 88 | Arg::with_name("dense-depth") 89 | .long("dense-depth") 90 | .default_value("3"), 91 | ) 92 | .arg( 93 | Arg::with_name("no-prefilter").long("no-prefilter").short("f"), 94 | ) 95 | .arg(Arg::with_name("no-classes").long("no-classes").short("C")) 96 | .arg(Arg::with_name("no-search").long("no-search")) 97 | .arg(Arg::with_name("debug").long("debug")) 98 | .get_matches(); 99 | 100 | let dictionary = 101 | PathBuf::from(parsed.value_of_os("dictionary").unwrap()); 102 | let haystack = PathBuf::from(parsed.value_of_os("haystack").unwrap()); 103 | let match_kind = match parsed.value_of("match-kind").unwrap() { 104 | "standard" => MatchKind::Standard, 105 | "leftmost-first" => MatchKind::LeftmostFirst, 106 | "leftmost-longest" => MatchKind::LeftmostLongest, 107 | _ => unreachable!(), 108 | }; 109 | let start_kind = match parsed.value_of("start-kind").unwrap() { 110 | "both" => StartKind::Both, 111 | "unanchored" 
=> StartKind::Unanchored, 112 | "anchored" => StartKind::Anchored, 113 | _ => unreachable!(), 114 | }; 115 | let kind = match parsed.value_of("kind").unwrap() { 116 | "auto" => None, 117 | "noncontiguous" => Some(AhoCorasickKind::NoncontiguousNFA), 118 | "contiguous" => Some(AhoCorasickKind::ContiguousNFA), 119 | "dfa" => Some(AhoCorasickKind::DFA), 120 | _ => unreachable!(), 121 | }; 122 | let dense_depth = parsed.value_of("dense-depth").unwrap().parse()?; 123 | 124 | Ok(Args { 125 | dictionary, 126 | haystack, 127 | match_kind, 128 | start_kind, 129 | kind, 130 | dense_depth, 131 | ascii_casei: parsed.is_present("ascii-case-insensitive"), 132 | no_prefilter: parsed.is_present("no-prefilter"), 133 | no_classes: parsed.is_present("no-classes"), 134 | no_search: parsed.is_present("no-search"), 135 | debug: parsed.is_present("debug"), 136 | }) 137 | } 138 | 139 | fn aho_corasick(&self) -> anyhow::Result { 140 | let start = Instant::now(); 141 | let patterns = fs::read_to_string(&self.dictionary)?; 142 | let read_time = Instant::now().duration_since(start); 143 | eprintln!("pattern read time: {:?}", read_time); 144 | 145 | let start = Instant::now(); 146 | let ac = AhoCorasick::builder() 147 | .match_kind(self.match_kind) 148 | .start_kind(self.start_kind) 149 | .kind(self.kind) 150 | .ascii_case_insensitive(self.ascii_casei) 151 | .dense_depth(self.dense_depth) 152 | .prefilter(!self.no_prefilter) 153 | .byte_classes(!self.no_classes) 154 | .build(patterns.lines())?; 155 | let build_time = Instant::now().duration_since(start); 156 | eprintln!("automaton build time: {:?}", build_time); 157 | Ok(ac) 158 | } 159 | 160 | fn haystack(&self) -> anyhow::Result { 161 | // SAFETY: We only read from this content and generally assume the file 162 | // is not mutated while it is searched. 163 | Ok(unsafe { Mmap::map(&fs::File::open(&self.haystack)?)? 
}) 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /benchmarks/definitions/build.toml: -------------------------------------------------------------------------------- 1 | [[bench]] 2 | model = "compile" 3 | name = "empty" 4 | regex = [] 5 | haystack = "" 6 | count = 0 7 | engines = [ 8 | "rust/aho-corasick/default/standard", 9 | "rust/aho-corasick/default/leftmost-first", 10 | "rust/aho-corasick/default/leftmost-longest", 11 | "rust/old-aho-corasick/default/standard", 12 | "rust/old-aho-corasick/default/leftmost-first", 13 | "rust/old-aho-corasick/default/leftmost-longest", 14 | "naive/rust/memchr/memmem", 15 | ] 16 | 17 | [[bench]] 18 | model = "compile" 19 | name = "onebyte" 20 | regex = ["a"] 21 | haystack = "a" 22 | count = 1 23 | engines = [ 24 | "rust/aho-corasick/default/standard", 25 | "rust/aho-corasick/default/leftmost-first", 26 | "rust/aho-corasick/default/leftmost-longest", 27 | "rust/aho-corasick/packed/leftmost-first", 28 | "rust/old-aho-corasick/default/standard", 29 | "rust/old-aho-corasick/default/leftmost-first", 30 | "rust/old-aho-corasick/default/leftmost-longest", 31 | "rust/old-aho-corasick/packed/leftmost-first", 32 | "daachorse/bytewise/leftmost-first", 33 | "daachorse/bytewise/leftmost-longest", 34 | "naive/rust/memchr/memmem", 35 | ] 36 | 37 | [[bench]] 38 | model = "compile" 39 | name = "twobytes" 40 | regex = ["a", "b"] 41 | haystack = "ab" 42 | count = 2 43 | engines = [ 44 | "rust/aho-corasick/default/standard", 45 | "rust/aho-corasick/default/leftmost-first", 46 | "rust/aho-corasick/default/leftmost-longest", 47 | "rust/aho-corasick/packed/leftmost-first", 48 | "rust/old-aho-corasick/default/standard", 49 | "rust/old-aho-corasick/default/leftmost-first", 50 | "rust/old-aho-corasick/default/leftmost-longest", 51 | "rust/old-aho-corasick/packed/leftmost-first", 52 | "daachorse/bytewise/leftmost-first", 53 | "daachorse/bytewise/leftmost-longest", 54 | "naive/rust/memchr/memmem", 55 
| ] 56 | 57 | [[bench]] 58 | model = "compile" 59 | name = "many-short" 60 | regex = [ 61 | "ADL", "ADl", "AdL", "Adl", "BAK", "BAk", "BAK", "BaK", "Bak", 62 | "BaK", "HOL", "HOl", "HoL", "Hol", "IRE", "IRe", "IrE", "Ire", 63 | "JOH", "JOh", "JoH", "Joh", "SHE", "SHe", "ShE", "She", "WAT", 64 | "WAt", "WaT", "Wat", "aDL", "aDl", "adL", "adl", "bAK", "bAk", 65 | "bAK", "baK", "bak", "baK", "hOL", "hOl", "hoL", "hol", "iRE", 66 | "iRe", "irE", "ire", "jOH", "jOh", "joH", "joh", "sHE", "sHe", 67 | "shE", "she", "wAT", "wAt", "waT", "wat", "ſHE", "ſHe", "ſhE", 68 | "ſhe", 69 | ] 70 | haystack = "ſhe" 71 | count = 1 72 | engines = [ 73 | "rust/aho-corasick/default/standard", 74 | "rust/aho-corasick/default/leftmost-first", 75 | "rust/aho-corasick/default/leftmost-longest", 76 | "rust/aho-corasick/packed/leftmost-first", 77 | "rust/old-aho-corasick/default/standard", 78 | "rust/old-aho-corasick/default/leftmost-first", 79 | "rust/old-aho-corasick/default/leftmost-longest", 80 | "rust/old-aho-corasick/packed/leftmost-first", 81 | "daachorse/bytewise/leftmost-first", 82 | "daachorse/bytewise/leftmost-longest", 83 | "naive/rust/memchr/memmem", 84 | ] 85 | 86 | [[bench]] 87 | model = "compile" 88 | name = "words5000" 89 | regex = { path = "words-5000", per-line = "pattern" } 90 | haystack = "transfuse" 91 | count = 1 92 | engines = [ 93 | "rust/aho-corasick/default/standard", 94 | "rust/aho-corasick/default/leftmost-first", 95 | "rust/aho-corasick/default/leftmost-longest", 96 | "rust/old-aho-corasick/default/standard", 97 | "rust/old-aho-corasick/default/leftmost-first", 98 | "rust/old-aho-corasick/default/leftmost-longest", 99 | "daachorse/bytewise/leftmost-first", 100 | "daachorse/bytewise/leftmost-longest", 101 | "naive/rust/memchr/memmem", 102 | ] 103 | 104 | [[bench]] 105 | model = "compile" 106 | name = "words15000" 107 | regex = { path = "words-15000", per-line = "pattern" } 108 | haystack = "dovetailing" 109 | count = 1 110 | engines = [ 111 | 
"rust/aho-corasick/default/standard", 112 | "rust/aho-corasick/default/leftmost-first", 113 | "rust/aho-corasick/default/leftmost-longest", 114 | "rust/old-aho-corasick/default/standard", 115 | "rust/old-aho-corasick/default/leftmost-first", 116 | "rust/old-aho-corasick/default/leftmost-longest", 117 | "daachorse/bytewise/leftmost-first", 118 | "daachorse/bytewise/leftmost-longest", 119 | "naive/rust/memchr/memmem", 120 | ] 121 | -------------------------------------------------------------------------------- /benchmarks/definitions/curated.toml: -------------------------------------------------------------------------------- 1 | analysis = ''' 2 | This is a WIP for building out a curated set of Aho-Corasick benchmarks. 3 | The next step is not to actually add more benchmarks, but to hook up more 4 | Aho-Corasick libraries. There are a lot of them and it's a fair bit of work to 5 | do. 6 | ''' 7 | 8 | [[bench]] 9 | model = "count" 10 | name = "sherlock" 11 | regex = [ 12 | 'Sherlock Holmes', 13 | 'John Watson', 14 | 'Irene Adler', 15 | 'Inspector Lestrade', 16 | 'Professor Moriarty', 17 | ] 18 | haystack = { path = "opensubtitles/en-sampled.txt" } 19 | count = 714 20 | engines = [ 21 | "rust/aho-corasick/default/standard", 22 | "rust/aho-corasick/default/leftmost-first", 23 | "daachorse/bytewise/standard", 24 | "daachorse/bytewise/leftmost-first", 25 | "naive/rust/memchr/memmem", 26 | ] 27 | 28 | [[bench]] 29 | model = "count" 30 | name = "dictionary-15" 31 | regex = { path = "dictionary/english/length-15.txt", per-line = "pattern" } 32 | haystack = { path = "opensubtitles/en-sampled.txt" } 33 | count = 15 34 | engines = [ 35 | "rust/aho-corasick/default/standard", 36 | "rust/aho-corasick/default/overlapping", 37 | "rust/aho-corasick/default/leftmost-first", 38 | "daachorse/bytewise/standard", 39 | "daachorse/bytewise/overlapping", 40 | "daachorse/bytewise/leftmost-first", 41 | "naive/rust/memchr/memmem", 42 | ] 43 | analysis = ''' 44 | Looks for occurrences of
2,663 words of length at least 15. 45 | ''' 46 | -------------------------------------------------------------------------------- /benchmarks/definitions/jetscii.toml: -------------------------------------------------------------------------------- 1 | analysis = ''' 2 | These benchmarks were ported out of the jetscii crate, specifically from 3 | [Dr-Emann's PR]. There were some irregularities in the benchmark results, so 4 | I thought it might be interesting to include it here. 5 | 6 | We add "real" variants of each benchmark as well using a small XML data set on 7 | mental health. The original benchmarks search a haystack consisting entirely of 8 | `a` repeated, with the last byte corresponding to one of the needle bytes. This 9 | is useful for measuring pure throughput, but less good for approximating real 10 | world performance. In this case, for at least `xml-delim3` and `xml-delim5`, it 11 | seems like an XML haystack would be better suited. 12 | 13 | [Dr-Emann's PR]: https://github.com/shepmaster/jetscii/pull/57 14 | ''' 15 | 16 | [[bench]] 17 | model = "count" 18 | name = "space-repeateda" 19 | regex = [' '] 20 | haystack = { contents = "a", repeat = 5_242_880, append = " " } 21 | count = 1 22 | engines = [ 23 | "daachorse/bytewise/leftmost-first", 24 | "rust/aho-corasick/dfa/leftmost-first", 25 | "rust/aho-corasick/packed/leftmost-first", 26 | "rust/old-aho-corasick/packed/leftmost-first", 27 | "rust/jetscii/ascii-chars/prebuilt", 28 | ] 29 | 30 | [[bench]] 31 | model = "count" 32 | name = "xmldelim3-repeateda" 33 | regex = ['<', '>', '&'] 34 | haystack = { contents = "a", repeat = 5_242_880, append = "&" } 35 | count = 1 36 | engines = [ 37 | "daachorse/bytewise/leftmost-first", 38 | "rust/aho-corasick/dfa/leftmost-first", 39 | "rust/aho-corasick/packed/leftmost-first", 40 | "rust/old-aho-corasick/packed/leftmost-first", 41 | "rust/jetscii/ascii-chars/prebuilt", 42 | ] 43 | 44 | [[bench]] 45 | model = "count" 46 | name = "xmldelim5-repeateda" 47 | regex = 
['<', '>', '&', "'", '"'] 48 | haystack = { contents = "a", repeat = 5_242_880, append = '"' } 49 | count = 1 50 | engines = [ 51 | "daachorse/bytewise/leftmost-first", 52 | "rust/aho-corasick/dfa/leftmost-first", 53 | "rust/aho-corasick/packed/leftmost-first", 54 | "rust/old-aho-corasick/packed/leftmost-first", 55 | "rust/jetscii/ascii-chars/prebuilt", 56 | ] 57 | 58 | [[bench]] 59 | model = "count" 60 | name = "big16-repeateda" 61 | regex = [ 62 | 'A', 'B', 'C', 'D', 63 | 'E', 'F', 'G', 'H', 64 | 'I', 'J', 'K', 'L', 65 | 'M', 'N', 'O', 'P', 66 | ] 67 | haystack = { contents = "a", repeat = 5_242_880, append = "P" } 68 | count = 1 69 | engines = [ 70 | "daachorse/bytewise/leftmost-first", 71 | "rust/aho-corasick/dfa/leftmost-first", 72 | "rust/aho-corasick/packed/leftmost-first", 73 | "rust/old-aho-corasick/packed/leftmost-first", 74 | "rust/jetscii/ascii-chars/prebuilt", 75 | ] 76 | 77 | [[bench]] 78 | model = "count" 79 | name = "big16earlyshort-repeateda" 80 | regex = [ 81 | 'A', 'B', 'C', 'D', 82 | 'E', 'F', 'G', 'H', 83 | 'I', 'J', 'K', 'L', 84 | 'M', 'N', 'O', 'P', 85 | ] 86 | haystack = { contents = "Pa" } 87 | count = 1 88 | engines = [ 89 | "daachorse/bytewise/leftmost-first", 90 | "rust/aho-corasick/dfa/leftmost-first", 91 | "rust/aho-corasick/packed/leftmost-first", 92 | "rust/old-aho-corasick/packed/leftmost-first", 93 | "rust/jetscii/ascii-chars/prebuilt", 94 | ] 95 | 96 | [[bench]] 97 | model = "count" 98 | name = "big16earlylong-repeateda" 99 | regex = [ 100 | 'A', 'B', 'C', 'D', 101 | 'E', 'F', 'G', 'H', 102 | 'I', 'J', 'K', 'L', 103 | 'M', 'N', 'O', 'P', 104 | ] 105 | haystack = { contents = "a", repeat = 14, append = "P" } 106 | count = 1 107 | engines = [ 108 | "daachorse/bytewise/leftmost-first", 109 | "rust/aho-corasick/dfa/leftmost-first", 110 | "rust/aho-corasick/packed/leftmost-first", 111 | "rust/old-aho-corasick/packed/leftmost-first", 112 | "rust/jetscii/ascii-chars/prebuilt", 113 | ] 114 | 115 | [[bench]] 116 | model = "count" 117 | 
name = "space-mentalhealth" 118 | regex = [' '] 119 | haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" } 120 | count = 1_181_201 121 | engines = [ 122 | "daachorse/bytewise/leftmost-first", 123 | "rust/aho-corasick/dfa/leftmost-first", 124 | "rust/aho-corasick/packed/leftmost-first", 125 | "rust/old-aho-corasick/packed/leftmost-first", 126 | "rust/jetscii/ascii-chars/prebuilt", 127 | ] 128 | 129 | [[bench]] 130 | model = "count" 131 | name = "xmldelim3-mentalhealth" 132 | regex = ['<', '>', '&'] 133 | haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" } 134 | count = 604_714 135 | engines = [ 136 | "daachorse/bytewise/leftmost-first", 137 | "rust/aho-corasick/dfa/leftmost-first", 138 | "rust/aho-corasick/packed/leftmost-first", 139 | "rust/old-aho-corasick/packed/leftmost-first", 140 | "rust/jetscii/ascii-chars/prebuilt", 141 | ] 142 | 143 | [[bench]] 144 | model = "count" 145 | name = "xmldelim5-mentalhealth" 146 | regex = ['<', '>', '&', "'", '"'] 147 | haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" } 148 | count = 688_252 149 | engines = [ 150 | "daachorse/bytewise/leftmost-first", 151 | "rust/aho-corasick/dfa/leftmost-first", 152 | "rust/aho-corasick/packed/leftmost-first", 153 | "rust/old-aho-corasick/packed/leftmost-first", 154 | "rust/jetscii/ascii-chars/prebuilt", 155 | ] 156 | 157 | [[bench]] 158 | model = "count" 159 | name = "big16-mentalhealth" 160 | regex = [ 161 | 'A', 'B', 'C', 'D', 162 | 'E', 'F', 'G', 'H', 163 | 'I', 'J', 'K', 'L', 164 | 'M', 'N', 'O', 'P', 165 | ] 166 | haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" } 167 | count = 176_447 168 | engines = [ 169 | "daachorse/bytewise/leftmost-first", 170 | "rust/aho-corasick/dfa/leftmost-first", 171 | "rust/aho-corasick/packed/leftmost-first", 172 | "rust/old-aho-corasick/packed/leftmost-first", 173 | "rust/jetscii/ascii-chars/prebuilt", 174 | ] 175 | -------------------------------------------------------------------------------- 
/benchmarks/definitions/random/many.toml: -------------------------------------------------------------------------------- 1 | analysis = ''' 2 | Miscellaneous benchmarks on a large random haystack with a large pattern set. 3 | ''' 4 | 5 | [[bench]] 6 | model = "count" 7 | name = "words100" 8 | regex = { path = "words-100", per-line = "pattern" } 9 | haystack = { path = "random10x.txt" } 10 | count = 0 11 | engines = [ 12 | "rust/aho-corasick/default/standard", 13 | "rust/aho-corasick/default/leftmost-first", 14 | "rust/aho-corasick/default/leftmost-longest", 15 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 16 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 17 | "rust/aho-corasick/dfa/leftmost-first", 18 | "rust/old-aho-corasick/default/standard", 19 | "rust/old-aho-corasick/default/leftmost-first", 20 | "rust/old-aho-corasick/default/leftmost-longest", 21 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 22 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 23 | "rust/old-aho-corasick/dfa/leftmost-first", 24 | "daachorse/bytewise/leftmost-first", 25 | "daachorse/bytewise/leftmost-longest", 26 | "naive/rust/memchr/memmem", 27 | "naive/rust/std", 28 | ] 29 | 30 | [[bench]] 31 | model = "count" 32 | name = "words5000" 33 | regex = { path = "words-5000", per-line = "pattern" } 34 | haystack = { path = "random10x.txt" } 35 | count = 0 36 | engines = [ 37 | "rust/aho-corasick/default/standard", 38 | "rust/aho-corasick/default/leftmost-first", 39 | "rust/aho-corasick/default/leftmost-longest", 40 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 41 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 42 | "rust/aho-corasick/dfa/leftmost-first", 43 | "rust/old-aho-corasick/default/standard", 44 | "rust/old-aho-corasick/default/leftmost-first", 45 | "rust/old-aho-corasick/default/leftmost-longest", 46 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 47 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 48 | 
"rust/old-aho-corasick/dfa/leftmost-first", 49 | "daachorse/bytewise/leftmost-first", 50 | "daachorse/bytewise/leftmost-longest", 51 | "naive/rust/memchr/memmem", 52 | "naive/rust/std", 53 | ] 54 | -------------------------------------------------------------------------------- /benchmarks/definitions/random/memchr.toml: -------------------------------------------------------------------------------- 1 | analysis = ''' 2 | These benchmarks test the prefix byte optimization, and the impact that 3 | match-vs-non-match has. 4 | 5 | More specifically, Aho-Corasick will use highly optimized vectorized routines 6 | (on some targets) if it determines that all matches start with 1, 2 or 3 7 | distinct bytes. (Perhaps not in all cases. Even if there are 1-3 common bytes 8 | in the prefix, it could still use Teddy.) 9 | 10 | For match-vs-non-match, we keep the match counts fixed across the different 11 | prefix optimizations as a way to control what we measure. 12 | ''' 13 | 14 | [[bench]] 15 | model = "count" 16 | name = "onebyte-match" 17 | regex = ["a"] 18 | haystack = { path = "random.txt" } 19 | count = 352 20 | engines = [ 21 | "rust/aho-corasick/default/standard", 22 | "rust/aho-corasick/default/leftmost-first", 23 | "rust/aho-corasick/default/leftmost-longest", 24 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 25 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 26 | "rust/aho-corasick/dfa/leftmost-first", 27 | "rust/aho-corasick/packed/leftmost-first", 28 | "rust/old-aho-corasick/default/standard", 29 | "rust/old-aho-corasick/default/leftmost-first", 30 | "rust/old-aho-corasick/default/leftmost-longest", 31 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 32 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 33 | "rust/old-aho-corasick/dfa/leftmost-first", 34 | "rust/old-aho-corasick/packed/leftmost-first", 35 | "daachorse/bytewise/leftmost-first", 36 | "daachorse/bytewise/leftmost-longest", 37 | "naive/rust/memchr/memmem", 38 | 
"naive/rust/std", 39 | ] 40 | 41 | [[bench]] 42 | model = "count" 43 | name = "onebyte-nomatch" 44 | regex = ["\u0000"] 45 | haystack = { path = "random.txt" } 46 | count = 0 47 | engines = [ 48 | "rust/aho-corasick/default/standard", 49 | "rust/aho-corasick/default/leftmost-first", 50 | "rust/aho-corasick/default/leftmost-longest", 51 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 52 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 53 | "rust/aho-corasick/dfa/leftmost-first", 54 | "rust/aho-corasick/packed/leftmost-first", 55 | "rust/old-aho-corasick/default/standard", 56 | "rust/old-aho-corasick/default/leftmost-first", 57 | "rust/old-aho-corasick/default/leftmost-longest", 58 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 59 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 60 | "rust/old-aho-corasick/dfa/leftmost-first", 61 | "rust/old-aho-corasick/packed/leftmost-first", 62 | "daachorse/bytewise/leftmost-first", 63 | "daachorse/bytewise/leftmost-longest", 64 | "naive/rust/memchr/memmem", 65 | "naive/rust/std", 66 | ] 67 | 68 | [[bench]] 69 | model = "count" 70 | name = "twobytes-match" 71 | regex = ["a", "\u0000"] 72 | haystack = { path = "random.txt" } 73 | count = 352 74 | engines = [ 75 | "rust/aho-corasick/default/standard", 76 | "rust/aho-corasick/default/leftmost-first", 77 | "rust/aho-corasick/default/leftmost-longest", 78 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 79 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 80 | "rust/aho-corasick/dfa/leftmost-first", 81 | "rust/aho-corasick/packed/leftmost-first", 82 | "rust/old-aho-corasick/default/standard", 83 | "rust/old-aho-corasick/default/leftmost-first", 84 | "rust/old-aho-corasick/default/leftmost-longest", 85 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 86 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 87 | "rust/old-aho-corasick/dfa/leftmost-first", 88 | "rust/old-aho-corasick/packed/leftmost-first", 89 | 
"daachorse/bytewise/leftmost-first", 90 | "daachorse/bytewise/leftmost-longest", 91 | "naive/rust/memchr/memmem", 92 | "naive/rust/std", 93 | ] 94 | 95 | [[bench]] 96 | model = "count" 97 | name = "twobytes-nomatch" 98 | regex = ["\u0000", "\u0001"] 99 | haystack = { path = "random.txt" } 100 | count = 0 101 | engines = [ 102 | "rust/aho-corasick/default/standard", 103 | "rust/aho-corasick/default/leftmost-first", 104 | "rust/aho-corasick/default/leftmost-longest", 105 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 106 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 107 | "rust/aho-corasick/dfa/leftmost-first", 108 | "rust/aho-corasick/packed/leftmost-first", 109 | "rust/old-aho-corasick/default/standard", 110 | "rust/old-aho-corasick/default/leftmost-first", 111 | "rust/old-aho-corasick/default/leftmost-longest", 112 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 113 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 114 | "rust/old-aho-corasick/dfa/leftmost-first", 115 | "rust/old-aho-corasick/packed/leftmost-first", 116 | "daachorse/bytewise/leftmost-first", 117 | "daachorse/bytewise/leftmost-longest", 118 | "naive/rust/memchr/memmem", 119 | "naive/rust/std", 120 | ] 121 | 122 | [[bench]] 123 | model = "count" 124 | name = "threebytes-match" 125 | regex = ["a", "\u0000", "\u0001"] 126 | haystack = { path = "random.txt" } 127 | count = 352 128 | engines = [ 129 | "rust/aho-corasick/default/standard", 130 | "rust/aho-corasick/default/leftmost-first", 131 | "rust/aho-corasick/default/leftmost-longest", 132 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 133 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 134 | "rust/aho-corasick/dfa/leftmost-first", 135 | "rust/aho-corasick/packed/leftmost-first", 136 | "rust/old-aho-corasick/default/standard", 137 | "rust/old-aho-corasick/default/leftmost-first", 138 | "rust/old-aho-corasick/default/leftmost-longest", 139 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 140 
| "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 141 | "rust/old-aho-corasick/dfa/leftmost-first", 142 | "rust/old-aho-corasick/packed/leftmost-first", 143 | "daachorse/bytewise/leftmost-first", 144 | "daachorse/bytewise/leftmost-longest", 145 | "naive/rust/memchr/memmem", 146 | "naive/rust/std", 147 | ] 148 | 149 | [[bench]] 150 | model = "count" 151 | name = "threebytes-nomatch" 152 | regex = ["\u0000", "\u0001", "\u0002"] 153 | haystack = { path = "random.txt" } 154 | count = 0 155 | engines = [ 156 | "rust/aho-corasick/default/standard", 157 | "rust/aho-corasick/default/leftmost-first", 158 | "rust/aho-corasick/default/leftmost-longest", 159 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 160 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 161 | "rust/aho-corasick/dfa/leftmost-first", 162 | "rust/aho-corasick/packed/leftmost-first", 163 | "rust/old-aho-corasick/default/standard", 164 | "rust/old-aho-corasick/default/leftmost-first", 165 | "rust/old-aho-corasick/default/leftmost-longest", 166 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 167 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 168 | "rust/old-aho-corasick/dfa/leftmost-first", 169 | "rust/old-aho-corasick/packed/leftmost-first", 170 | "daachorse/bytewise/leftmost-first", 171 | "daachorse/bytewise/leftmost-longest", 172 | "naive/rust/memchr/memmem", 173 | "naive/rust/std", 174 | ] 175 | 176 | [[bench]] 177 | model = "count" 178 | name = "fourbytes-match" 179 | regex = ["a", "\u0000", "\u0001", "\u0002"] 180 | haystack = { path = "random.txt" } 181 | count = 352 182 | engines = [ 183 | "rust/aho-corasick/default/standard", 184 | "rust/aho-corasick/default/leftmost-first", 185 | "rust/aho-corasick/default/leftmost-longest", 186 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 187 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 188 | "rust/aho-corasick/dfa/leftmost-first", 189 | "rust/aho-corasick/packed/leftmost-first", 190 | 
"rust/old-aho-corasick/default/standard", 191 | "rust/old-aho-corasick/default/leftmost-first", 192 | "rust/old-aho-corasick/default/leftmost-longest", 193 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 194 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 195 | "rust/old-aho-corasick/dfa/leftmost-first", 196 | "rust/old-aho-corasick/packed/leftmost-first", 197 | "daachorse/bytewise/leftmost-first", 198 | "daachorse/bytewise/leftmost-longest", 199 | "naive/rust/memchr/memmem", 200 | "naive/rust/std", 201 | ] 202 | 203 | [[bench]] 204 | model = "count" 205 | name = "fourbytes-nomatch" 206 | regex = ["\u0000", "\u0001", "\u0002", "\u0003"] 207 | haystack = { path = "random.txt" } 208 | count = 0 209 | engines = [ 210 | "rust/aho-corasick/default/standard", 211 | "rust/aho-corasick/default/leftmost-first", 212 | "rust/aho-corasick/default/leftmost-longest", 213 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 214 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 215 | "rust/aho-corasick/dfa/leftmost-first", 216 | "rust/aho-corasick/packed/leftmost-first", 217 | "rust/old-aho-corasick/default/standard", 218 | "rust/old-aho-corasick/default/leftmost-first", 219 | "rust/old-aho-corasick/default/leftmost-longest", 220 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 221 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 222 | "rust/old-aho-corasick/dfa/leftmost-first", 223 | "rust/old-aho-corasick/packed/leftmost-first", 224 | "daachorse/bytewise/leftmost-first", 225 | "daachorse/bytewise/leftmost-longest", 226 | "naive/rust/memchr/memmem", 227 | "naive/rust/std", 228 | ] 229 | 230 | [[bench]] 231 | model = "count" 232 | name = "fivebytes-match" 233 | regex = ["a", "\u0000", "\u0001", "\u0002", "\u0003"] 234 | haystack = { path = "random.txt" } 235 | count = 352 236 | engines = [ 237 | "rust/aho-corasick/default/standard", 238 | "rust/aho-corasick/default/leftmost-first", 239 | "rust/aho-corasick/default/leftmost-longest", 
240 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 241 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 242 | "rust/aho-corasick/dfa/leftmost-first", 243 | "rust/aho-corasick/packed/leftmost-first", 244 | "rust/old-aho-corasick/default/standard", 245 | "rust/old-aho-corasick/default/leftmost-first", 246 | "rust/old-aho-corasick/default/leftmost-longest", 247 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 248 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 249 | "rust/old-aho-corasick/dfa/leftmost-first", 250 | "rust/old-aho-corasick/packed/leftmost-first", 251 | "daachorse/bytewise/leftmost-first", 252 | "daachorse/bytewise/leftmost-longest", 253 | "naive/rust/memchr/memmem", 254 | "naive/rust/std", 255 | ] 256 | 257 | [[bench]] 258 | model = "count" 259 | name = "fivebytes-nomatch" 260 | regex = ["\u0000", "\u0001", "\u0002", "\u0003", "\u0004"] 261 | haystack = { path = "random.txt" } 262 | count = 0 263 | engines = [ 264 | "rust/aho-corasick/default/standard", 265 | "rust/aho-corasick/default/leftmost-first", 266 | "rust/aho-corasick/default/leftmost-longest", 267 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 268 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 269 | "rust/aho-corasick/dfa/leftmost-first", 270 | "rust/aho-corasick/packed/leftmost-first", 271 | "rust/old-aho-corasick/default/standard", 272 | "rust/old-aho-corasick/default/leftmost-first", 273 | "rust/old-aho-corasick/default/leftmost-longest", 274 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 275 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 276 | "rust/old-aho-corasick/dfa/leftmost-first", 277 | "rust/old-aho-corasick/packed/leftmost-first", 278 | "daachorse/bytewise/leftmost-first", 279 | "daachorse/bytewise/leftmost-longest", 280 | "naive/rust/memchr/memmem", 281 | "naive/rust/std", 282 | ] 283 | -------------------------------------------------------------------------------- /benchmarks/definitions/random/misc.toml: 
-------------------------------------------------------------------------------- 1 | analysis = ''' 2 | Miscellaneous benchmarks on a random haystack. 3 | ''' 4 | 5 | [[bench]] 6 | model = "count" 7 | name = "ten-one-prefix" 8 | regex = [ 9 | "zacdef", "zbcdef", "zccdef", "zdcdef", "zecdef", "zfcdef", 10 | "zgcdef", "zhcdef", "zicdef", "zjcdef", 11 | ] 12 | haystack = { path = "random.txt" } 13 | count = 0 14 | engines = [ 15 | "rust/aho-corasick/default/standard", 16 | "rust/aho-corasick/default/leftmost-first", 17 | "rust/aho-corasick/default/leftmost-longest", 18 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 19 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 20 | "rust/aho-corasick/dfa/leftmost-first", 21 | "rust/aho-corasick/packed/leftmost-first", 22 | "rust/old-aho-corasick/default/standard", 23 | "rust/old-aho-corasick/default/leftmost-first", 24 | "rust/old-aho-corasick/default/leftmost-longest", 25 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 26 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 27 | "rust/old-aho-corasick/dfa/leftmost-first", 28 | "rust/old-aho-corasick/packed/leftmost-first", 29 | "daachorse/bytewise/leftmost-first", 30 | "daachorse/bytewise/leftmost-longest", 31 | "naive/rust/memchr/memmem", 32 | "naive/rust/std", 33 | ] 34 | 35 | [[bench]] 36 | model = "count" 37 | name = "ten-diff-prefix" 38 | regex = [ 39 | "abcdef", "bcdefg", "cdefgh", "defghi", "efghij", "fghijk", 40 | "ghijkl", "hijklm", "ijklmn", "jklmno", 41 | ] 42 | haystack = { path = "random.txt" } 43 | count = 0 44 | engines = [ 45 | "rust/aho-corasick/default/standard", 46 | "rust/aho-corasick/default/leftmost-first", 47 | "rust/aho-corasick/default/leftmost-longest", 48 | "rust/aho-corasick/nfa-noncontiguous/leftmost-first", 49 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 50 | "rust/aho-corasick/dfa/leftmost-first", 51 | "rust/aho-corasick/packed/leftmost-first", 52 | "rust/old-aho-corasick/default/standard", 53 | 
"rust/old-aho-corasick/default/leftmost-first", 54 | "rust/old-aho-corasick/default/leftmost-longest", 55 | "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first", 56 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 57 | "rust/old-aho-corasick/dfa/leftmost-first", 58 | "rust/old-aho-corasick/packed/leftmost-first", 59 | "daachorse/bytewise/leftmost-first", 60 | "daachorse/bytewise/leftmost-longest", 61 | "naive/rust/memchr/memmem", 62 | "naive/rust/std", 63 | ] 64 | -------------------------------------------------------------------------------- /benchmarks/definitions/regexcurated.toml: -------------------------------------------------------------------------------- 1 | analysis = ''' 2 | These benchmarks come from [rebar's curated benchmark set]. 3 | 4 | We don't copy all of the benchmarks from there. Just the ones where the 5 | `aho-corasick` crate is likely relevant. For example, for the regex 6 | `(?i)Sherlock Holmes`, a small set of prefix literals is extracted that results 7 | in a Teddy searcher being used. So we specifically benchmark the literals that 8 | are extracted (at time of writing). 
9 | 10 | [rebar's curated benchmark set]: https://github.com/BurntSushi/rebar/tree/e6100636137496c97273efcb5f5d869278e2e95d/benchmarks/definitions/curated 11 | ''' 12 | 13 | [[bench]] 14 | model = "count" 15 | name = "sherlock-en" 16 | regex = ['Sherlock Holmes'] 17 | haystack = { path = "opensubtitles/en-sampled.txt" } 18 | count = 513 19 | engines = [ 20 | "rust/aho-corasick/default/leftmost-first", 21 | "rust/aho-corasick/dfa/leftmost-first", 22 | "rust/aho-corasick/packed/leftmost-first", 23 | "rust/old-aho-corasick/default/leftmost-first", 24 | "rust/old-aho-corasick/dfa/leftmost-first", 25 | "rust/old-aho-corasick/packed/leftmost-first", 26 | "daachorse/bytewise/leftmost-first", 27 | "naive/rust/memchr/memmem", 28 | ] 29 | 30 | [[bench]] 31 | model = "count" 32 | name = "sherlock-casei-en" 33 | regex = [ 34 | "SHER", "SHEr", "SHeR", "SHer", "ShER", "ShEr", "SheR", "Sher", 35 | "sHER", "sHEr", "sHeR", "sHer", "shER", "shEr", "sheR", "sher", 36 | "ſHE" , "ſHe" , "ſhE" , "ſhe" , 37 | ] 38 | haystack = { path = "opensubtitles/en-sampled.txt" } 39 | count = 540 # original regex is 522 40 | engines = [ 41 | "rust/aho-corasick/default/leftmost-first", 42 | "rust/aho-corasick/dfa/leftmost-first", 43 | "rust/aho-corasick/packed/leftmost-first", 44 | "rust/old-aho-corasick/default/leftmost-first", 45 | "rust/old-aho-corasick/dfa/leftmost-first", 46 | "rust/old-aho-corasick/packed/leftmost-first", 47 | "daachorse/bytewise/leftmost-first", 48 | "naive/rust/memchr/memmem", 49 | ] 50 | 51 | [[bench]] 52 | model = "count" 53 | name = "sherlock-ru" 54 | regex = ['Шерлок Холмс'] 55 | haystack = { path = "opensubtitles/ru-sampled.txt" } 56 | count = 724 57 | engines = [ 58 | "rust/aho-corasick/default/leftmost-first", 59 | "rust/aho-corasick/dfa/leftmost-first", 60 | "rust/aho-corasick/packed/leftmost-first", 61 | "rust/old-aho-corasick/default/leftmost-first", 62 | "rust/old-aho-corasick/dfa/leftmost-first", 63 | "rust/old-aho-corasick/packed/leftmost-first", 64 | 
"daachorse/bytewise/leftmost-first", 65 | "naive/rust/memchr/memmem", 66 | ] 67 | 68 | [[bench]] 69 | model = "count" 70 | name = "sherlock-casei-ru" 71 | regex = [ 72 | 'ШЕ\xd0', 'ШЕ\xd1', 73 | 'Ше\xd0', 'Ше\xd1', 74 | 'шЕ\xd0', 'шЕ\xd1', 75 | 'ше\xd0', 'ше\xd1', 76 | ] 77 | haystack = { path = "opensubtitles/ru-sampled.txt" } 78 | count = 1608 # original regex is 746 79 | engines = [ 80 | "rust/aho-corasick/default/leftmost-first", 81 | "rust/aho-corasick/dfa/leftmost-first", 82 | "rust/aho-corasick/packed/leftmost-first", 83 | "rust/old-aho-corasick/default/leftmost-first", 84 | "rust/old-aho-corasick/dfa/leftmost-first", 85 | "rust/old-aho-corasick/packed/leftmost-first", 86 | "daachorse/bytewise/leftmost-first", 87 | "naive/rust/memchr/memmem", 88 | ] 89 | 90 | [[bench]] 91 | model = "count" 92 | name = "sherlock-zh" 93 | regex = ['夏洛克·福尔摩斯'] 94 | haystack = { path = "opensubtitles/zh-sampled.txt" } 95 | count = 30 96 | engines = [ 97 | "rust/aho-corasick/default/leftmost-first", 98 | "rust/aho-corasick/dfa/leftmost-first", 99 | "rust/aho-corasick/packed/leftmost-first", 100 | "rust/old-aho-corasick/default/leftmost-first", 101 | "rust/old-aho-corasick/dfa/leftmost-first", 102 | "rust/old-aho-corasick/packed/leftmost-first", 103 | "daachorse/bytewise/leftmost-first", 104 | "naive/rust/memchr/memmem", 105 | ] 106 | 107 | [[bench]] 108 | model = "count" 109 | name = "alt-sherlock-en" 110 | regex = [ 111 | 'Sherlock Holmes', 112 | 'John Watson', 113 | 'Irene Adler', 114 | 'Inspector Lestrade', 115 | 'Professor Moriarty', 116 | ] 117 | haystack = { path = "opensubtitles/en-sampled.txt" } 118 | count = 714 119 | engines = [ 120 | "rust/aho-corasick/default/leftmost-first", 121 | "rust/aho-corasick/dfa/leftmost-first", 122 | "rust/aho-corasick/packed/leftmost-first", 123 | "rust/old-aho-corasick/default/leftmost-first", 124 | "rust/old-aho-corasick/dfa/leftmost-first", 125 | "rust/old-aho-corasick/packed/leftmost-first", 126 | "daachorse/bytewise/leftmost-first", 
127 | "naive/rust/memchr/memmem", 128 | ] 129 | 130 | [[bench]] 131 | model = "count" 132 | name = "alt-sherlock-casei-en" 133 | regex = [ 134 | 'SHE', 'SHe', 'ShE', 'She', 'sHE', 'sHe', 'shE', 'she', 'ſH', 'ſh', 135 | 'JOH', 'JOh', 'JoH', 'Joh', 'jOH', 'jOh', 'joH', 'joh', 136 | 'IRE', 'IRe', 'IrE', 'Ire', 'iRE', 'iRe', 'irE', 'ire', 137 | 'INS', 'INs', 'IN\xc5', 'InS', 'Ins', 'In\xc5', 138 | 'iNS', 'iNs', 'iN\xc5', 'inS', 'ins', 'in\xc5', 139 | 'PRO', 'PRo', 'PrO', 'Pro', 'pRO', 'pRo', 'prO', 'pro', 140 | ] 141 | haystack = { path = "opensubtitles/en-sampled.txt" } 142 | count = 2456 # original regex is 725 143 | engines = [ 144 | "rust/aho-corasick/default/leftmost-first", 145 | "rust/aho-corasick/dfa/leftmost-first", 146 | "rust/aho-corasick/packed/leftmost-first", 147 | "rust/old-aho-corasick/default/leftmost-first", 148 | "rust/old-aho-corasick/dfa/leftmost-first", 149 | "rust/old-aho-corasick/packed/leftmost-first", 150 | "daachorse/bytewise/leftmost-first", 151 | "naive/rust/memchr/memmem", 152 | ] 153 | 154 | [[bench]] 155 | model = "count" 156 | name = "alt-sherlock-ru" 157 | regex = [ 158 | "Шерлок Холмс", 159 | "Джон Уотсон", 160 | "Ирен Адлер", 161 | "инспектор Лестрейд", 162 | "профессор Мориарти", 163 | ] 164 | haystack = { path = "opensubtitles/ru-sampled.txt" } 165 | count = 899 166 | engines = [ 167 | "rust/aho-corasick/default/leftmost-first", 168 | "rust/aho-corasick/dfa/leftmost-first", 169 | "rust/aho-corasick/packed/leftmost-first", 170 | "rust/old-aho-corasick/default/leftmost-first", 171 | "rust/old-aho-corasick/dfa/leftmost-first", 172 | "rust/old-aho-corasick/packed/leftmost-first", 173 | "daachorse/bytewise/leftmost-first", 174 | "naive/rust/memchr/memmem", 175 | ] 176 | 177 | [[bench]] 178 | model = "count" 179 | name = "alt-sherlock-casei-ru" 180 | regex = [ 181 | 'ШЕ', 'Ше', 'шЕ', 'ше', 182 | 'ДЖ', 'Дж', 'дЖ', 'дж', 'ᲁ\xd0', 183 | 'ИР', 'Ир', 'иР', 'ир', 184 | 'ИН', 'Ин', 'иН', 'ин', 185 | 'ПР', 'Пр', 'пР', 'пр', 186 | ] 187 | 
haystack = { path = "opensubtitles/ru-sampled.txt" } 188 | count = 11_400 # original regex is 971 189 | engines = [ 190 | "rust/aho-corasick/default/leftmost-first", 191 | "rust/aho-corasick/dfa/leftmost-first", 192 | "rust/aho-corasick/packed/leftmost-first", 193 | "rust/old-aho-corasick/default/leftmost-first", 194 | "rust/old-aho-corasick/dfa/leftmost-first", 195 | "rust/old-aho-corasick/packed/leftmost-first", 196 | "daachorse/bytewise/leftmost-first", 197 | "naive/rust/memchr/memmem", 198 | ] 199 | 200 | [[bench]] 201 | model = "count" 202 | name = "alt-sherlock-zh" 203 | regex = [ 204 | "夏洛克·福尔摩斯", 205 | "约翰华生", 206 | "阿德勒", 207 | "雷斯垂德", 208 | "莫里亚蒂教授", 209 | ] 210 | haystack = { path = "opensubtitles/zh-sampled.txt" } 211 | count = 207 212 | engines = [ 213 | "rust/aho-corasick/default/leftmost-first", 214 | "rust/aho-corasick/dfa/leftmost-first", 215 | "rust/aho-corasick/packed/leftmost-first", 216 | "rust/old-aho-corasick/default/leftmost-first", 217 | "rust/old-aho-corasick/dfa/leftmost-first", 218 | "rust/old-aho-corasick/packed/leftmost-first", 219 | "daachorse/bytewise/leftmost-first", 220 | "naive/rust/memchr/memmem", 221 | ] 222 | 223 | [[bench]] 224 | model = "count" 225 | name = "dictionary-15" 226 | regex = { path = "dictionary/english/length-15.txt", per-line = "pattern" } 227 | haystack = { path = "opensubtitles/en-medium.txt" } 228 | count = 1 229 | engines = [ 230 | "rust/aho-corasick/default/leftmost-first", 231 | "rust/aho-corasick/nfa-contiguous/leftmost-first", 232 | "rust/aho-corasick/dfa/leftmost-first", 233 | "rust/old-aho-corasick/default/leftmost-first", 234 | "rust/old-aho-corasick/nfa-contiguous/leftmost-first", 235 | "rust/old-aho-corasick/dfa/leftmost-first", 236 | "daachorse/bytewise/leftmost-first", 237 | "naive/rust/memchr/memmem", 238 | ] 239 | -------------------------------------------------------------------------------- /benchmarks/engines.toml: 
-------------------------------------------------------------------------------- 1 | # Engines for the aho-corasick crate. We don't cover literally every possible 2 | # configuration, but we try to cover everything broadly. 3 | 4 | [[engine]] 5 | name = "rust/aho-corasick/default/standard" 6 | cwd = "./engines/rust-aho-corasick" 7 | [engine.version] 8 | bin = "./target/release/main" 9 | args = ["--version"] 10 | [engine.run] 11 | bin = "./target/release/main" 12 | args = ["default/standard"] 13 | [[engine.build]] 14 | bin = "cargo" 15 | args = ["build", "--release"] 16 | [[engine.clean]] 17 | bin = "cargo" 18 | args = ["clean"] 19 | 20 | [[engine]] 21 | name = "rust/aho-corasick/default/overlapping" 22 | cwd = "./engines/rust-aho-corasick" 23 | [engine.version] 24 | bin = "./target/release/main" 25 | args = ["--version"] 26 | [engine.run] 27 | bin = "./target/release/main" 28 | args = ["default/overlapping"] 29 | [[engine.build]] 30 | bin = "cargo" 31 | args = ["build", "--release"] 32 | [[engine.clean]] 33 | bin = "cargo" 34 | args = ["clean"] 35 | 36 | [[engine]] 37 | name = "rust/aho-corasick/default/leftmost-first" 38 | cwd = "./engines/rust-aho-corasick" 39 | [engine.version] 40 | bin = "./target/release/main" 41 | args = ["--version"] 42 | [engine.run] 43 | bin = "./target/release/main" 44 | args = ["default/leftmost-first"] 45 | [[engine.build]] 46 | bin = "cargo" 47 | args = ["build", "--release"] 48 | [[engine.clean]] 49 | bin = "cargo" 50 | args = ["clean"] 51 | 52 | [[engine]] 53 | name = "rust/aho-corasick/default/leftmost-longest" 54 | cwd = "./engines/rust-aho-corasick" 55 | [engine.version] 56 | bin = "./target/release/main" 57 | args = ["--version"] 58 | [engine.run] 59 | bin = "./target/release/main" 60 | args = ["default/leftmost-longest"] 61 | [[engine.build]] 62 | bin = "cargo" 63 | args = ["build", "--release"] 64 | [[engine.clean]] 65 | bin = "cargo" 66 | args = ["clean"] 67 | 68 | [[engine]] 69 | name = 
"rust/aho-corasick/nfa-noncontiguous/leftmost-first" 70 | cwd = "./engines/rust-aho-corasick" 71 | [engine.version] 72 | bin = "./target/release/main" 73 | args = ["--version"] 74 | [engine.run] 75 | bin = "./target/release/main" 76 | args = ["nfa-noncontiguous/leftmost-first"] 77 | [[engine.build]] 78 | bin = "cargo" 79 | args = ["build", "--release"] 80 | [[engine.clean]] 81 | bin = "cargo" 82 | args = ["clean"] 83 | 84 | [[engine]] 85 | name = "rust/aho-corasick/nfa-contiguous/leftmost-first" 86 | cwd = "./engines/rust-aho-corasick" 87 | [engine.version] 88 | bin = "./target/release/main" 89 | args = ["--version"] 90 | [engine.run] 91 | bin = "./target/release/main" 92 | args = ["nfa-contiguous/leftmost-first"] 93 | [[engine.build]] 94 | bin = "cargo" 95 | args = ["build", "--release"] 96 | [[engine.clean]] 97 | bin = "cargo" 98 | args = ["clean"] 99 | 100 | [[engine]] 101 | name = "rust/aho-corasick/dfa/leftmost-first" 102 | cwd = "./engines/rust-aho-corasick" 103 | [engine.version] 104 | bin = "./target/release/main" 105 | args = ["--version"] 106 | [engine.run] 107 | bin = "./target/release/main" 108 | args = ["dfa/leftmost-first"] 109 | [[engine.build]] 110 | bin = "cargo" 111 | args = ["build", "--release"] 112 | [[engine.clean]] 113 | bin = "cargo" 114 | args = ["clean"] 115 | 116 | [[engine]] 117 | name = "rust/aho-corasick/packed/leftmost-first" 118 | cwd = "./engines/rust-aho-corasick" 119 | [engine.version] 120 | bin = "./target/release/main" 121 | args = ["--version"] 122 | [engine.run] 123 | bin = "./target/release/main" 124 | args = ["packed/leftmost-first"] 125 | [[engine.build]] 126 | bin = "cargo" 127 | args = ["build", "--release"] 128 | [[engine.clean]] 129 | bin = "cargo" 130 | args = ["clean"] 131 | 132 | # Engines for aho-corasick, but pinned to 1.0.5. Essentially a way of 133 | # benchmarking the older version before some internal refactoring. 
134 | 135 | [[engine]] 136 | name = "rust/old-aho-corasick/default/standard" 137 | cwd = "./engines/rust-old-aho-corasick" 138 | [engine.version] 139 | bin = "./target/release/main" 140 | args = ["--version"] 141 | [engine.run] 142 | bin = "./target/release/main" 143 | args = ["default/standard"] 144 | [[engine.build]] 145 | bin = "cargo" 146 | args = ["build", "--release"] 147 | [[engine.clean]] 148 | bin = "cargo" 149 | args = ["clean"] 150 | 151 | [[engine]] 152 | name = "rust/old-aho-corasick/default/leftmost-first" 153 | cwd = "./engines/rust-old-aho-corasick" 154 | [engine.version] 155 | bin = "./target/release/main" 156 | args = ["--version"] 157 | [engine.run] 158 | bin = "./target/release/main" 159 | args = ["default/leftmost-first"] 160 | [[engine.build]] 161 | bin = "cargo" 162 | args = ["build", "--release"] 163 | [[engine.clean]] 164 | bin = "cargo" 165 | args = ["clean"] 166 | 167 | [[engine]] 168 | name = "rust/old-aho-corasick/default/leftmost-longest" 169 | cwd = "./engines/rust-old-aho-corasick" 170 | [engine.version] 171 | bin = "./target/release/main" 172 | args = ["--version"] 173 | [engine.run] 174 | bin = "./target/release/main" 175 | args = ["default/leftmost-longest"] 176 | [[engine.build]] 177 | bin = "cargo" 178 | args = ["build", "--release"] 179 | [[engine.clean]] 180 | bin = "cargo" 181 | args = ["clean"] 182 | 183 | [[engine]] 184 | name = "rust/old-aho-corasick/nfa-noncontiguous/leftmost-first" 185 | cwd = "./engines/rust-old-aho-corasick" 186 | [engine.version] 187 | bin = "./target/release/main" 188 | args = ["--version"] 189 | [engine.run] 190 | bin = "./target/release/main" 191 | args = ["nfa-noncontiguous/leftmost-first"] 192 | [[engine.build]] 193 | bin = "cargo" 194 | args = ["build", "--release"] 195 | [[engine.clean]] 196 | bin = "cargo" 197 | args = ["clean"] 198 | 199 | [[engine]] 200 | name = "rust/old-aho-corasick/nfa-contiguous/leftmost-first" 201 | cwd = "./engines/rust-old-aho-corasick" 202 | [engine.version] 203 | 
bin = "./target/release/main" 204 | args = ["--version"] 205 | [engine.run] 206 | bin = "./target/release/main" 207 | args = ["nfa-contiguous/leftmost-first"] 208 | [[engine.build]] 209 | bin = "cargo" 210 | args = ["build", "--release"] 211 | [[engine.clean]] 212 | bin = "cargo" 213 | args = ["clean"] 214 | 215 | [[engine]] 216 | name = "rust/old-aho-corasick/dfa/leftmost-first" 217 | cwd = "./engines/rust-old-aho-corasick" 218 | [engine.version] 219 | bin = "./target/release/main" 220 | args = ["--version"] 221 | [engine.run] 222 | bin = "./target/release/main" 223 | args = ["dfa/leftmost-first"] 224 | [[engine.build]] 225 | bin = "cargo" 226 | args = ["build", "--release"] 227 | [[engine.clean]] 228 | bin = "cargo" 229 | args = ["clean"] 230 | 231 | [[engine]] 232 | name = "rust/old-aho-corasick/packed/leftmost-first" 233 | cwd = "./engines/rust-old-aho-corasick" 234 | [engine.version] 235 | bin = "./target/release/main" 236 | args = ["--version"] 237 | [engine.run] 238 | bin = "./target/release/main" 239 | args = ["packed/leftmost-first"] 240 | [[engine.build]] 241 | bin = "cargo" 242 | args = ["build", "--release"] 243 | [[engine.clean]] 244 | bin = "cargo" 245 | args = ["clean"] 246 | 247 | # Aho-Corasick engines from daachorse. AFAIK, this is the only Rust library 248 | # that is anywhere near competitive with the aho-corasick crate. 249 | # 250 | # It uses the "double array trie" technique for implementing the Aho-Corasick 251 | # algorithm with a few additional tricks that were published as part of 252 | # creating this library. 
253 | # 254 | # Ref: https://github.com/daac-tools/daachorse 255 | 256 | [[engine]] 257 | name = "daachorse/bytewise/standard" 258 | cwd = "./engines/rust-daachorse" 259 | [engine.version] 260 | bin = "./target/release/main" 261 | args = ["--version"] 262 | [engine.run] 263 | bin = "./target/release/main" 264 | args = ["bytewise/standard"] 265 | [[engine.build]] 266 | bin = "cargo" 267 | args = ["build", "--release"] 268 | [[engine.clean]] 269 | bin = "cargo" 270 | args = ["clean"] 271 | 272 | [[engine]] 273 | name = "daachorse/bytewise/overlapping" 274 | cwd = "./engines/rust-daachorse" 275 | [engine.version] 276 | bin = "./target/release/main" 277 | args = ["--version"] 278 | [engine.run] 279 | bin = "./target/release/main" 280 | args = ["bytewise/overlapping"] 281 | [[engine.build]] 282 | bin = "cargo" 283 | args = ["build", "--release"] 284 | [[engine.clean]] 285 | bin = "cargo" 286 | args = ["clean"] 287 | 288 | [[engine]] 289 | name = "daachorse/bytewise/leftmost-first" 290 | cwd = "./engines/rust-daachorse" 291 | [engine.version] 292 | bin = "./target/release/main" 293 | args = ["--version"] 294 | [engine.run] 295 | bin = "./target/release/main" 296 | args = ["bytewise/leftmost-first"] 297 | [[engine.build]] 298 | bin = "cargo" 299 | args = ["build", "--release"] 300 | [[engine.clean]] 301 | bin = "cargo" 302 | args = ["clean"] 303 | 304 | [[engine]] 305 | name = "daachorse/bytewise/leftmost-longest" 306 | cwd = "./engines/rust-daachorse" 307 | [engine.version] 308 | bin = "./target/release/main" 309 | args = ["--version"] 310 | [engine.run] 311 | bin = "./target/release/main" 312 | args = ["bytewise/leftmost-longest"] 313 | [[engine.build]] 314 | bin = "cargo" 315 | args = ["build", "--release"] 316 | [[engine.clean]] 317 | bin = "cargo" 318 | args = ["clean"] 319 | 320 | # Engines based on the `jetscii` Rust crate. 
This is somewhat more appropriately 321 | # compared with routines in `memchr`, but there is some overlap in use cases 322 | # with Teddy's packed searcher for multiple single-byte needles. 323 | 324 | [[engine]] 325 | name = "rust/jetscii/ascii-chars/prebuilt" 326 | cwd = "./engines/rust-jetscii" 327 | [engine.version] 328 | bin = "./target/release/main" 329 | args = ["--version"] 330 | [engine.run] 331 | bin = "./target/release/main" 332 | args = ["ascii-chars-prebuilt"] 333 | [[engine.build]] 334 | bin = "cargo" 335 | args = ["build", "--release"] 336 | [[engine.clean]] 337 | bin = "cargo" 338 | args = ["clean"] 339 | 340 | [[engine]] 341 | name = "rust/jetscii/ascii-chars/oneshot" 342 | cwd = "./engines/rust-jetscii" 343 | [engine.version] 344 | bin = "./target/release/main" 345 | args = ["--version"] 346 | [engine.run] 347 | bin = "./target/release/main" 348 | args = ["ascii-chars-oneshot"] 349 | [[engine.build]] 350 | bin = "cargo" 351 | args = ["build", "--release"] 352 | [[engine.clean]] 353 | bin = "cargo" 354 | args = ["clean"] 355 | 356 | # Naive engines. Useful for comparisons and to determine the crossover point 357 | # where a multi-substring algorithm is beneficial over multi single-substring 358 | # algorithms. We include both the `memchr` crate and `std`. 
359 | 360 | [[engine]] 361 | name = "naive/rust/std" 362 | cwd = "./engines/naive" 363 | [engine.version] 364 | bin = "./target/release/main" 365 | args = ["--version"] 366 | [engine.run] 367 | bin = "./target/release/main" 368 | args = ["rust/std"] 369 | [[engine.build]] 370 | bin = "cargo" 371 | args = ["build", "--release"] 372 | [[engine.clean]] 373 | bin = "cargo" 374 | args = ["clean"] 375 | 376 | [[engine]] 377 | name = "naive/rust/memchr/memmem" 378 | cwd = "./engines/naive" 379 | [engine.version] 380 | bin = "./target/release/main" 381 | args = ["--version"] 382 | [engine.run] 383 | bin = "./target/release/main" 384 | args = ["rust/memchr/memmem"] 385 | [[engine.build]] 386 | bin = "cargo" 387 | args = ["build", "--release"] 388 | [[engine.clean]] 389 | bin = "cargo" 390 | args = ["clean"] 391 | -------------------------------------------------------------------------------- /benchmarks/engines/naive/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "anyhow" 7 | version = "1.0.75" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" 10 | 11 | [[package]] 12 | name = "bstr" 13 | version = "1.6.2" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" 16 | dependencies = [ 17 | "memchr", 18 | "serde", 19 | ] 20 | 21 | [[package]] 22 | name = "lexopt" 23 | version = "0.3.0" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | checksum = "baff4b617f7df3d896f97fe922b64817f6cd9a756bb81d40f8883f2f66dcb401" 26 | 27 | [[package]] 28 | name = "main" 29 | version = "0.1.0" 30 | dependencies = [ 31 | "anyhow", 32 | "lexopt", 33 | "memchr", 34 | "shared", 35 | ] 36 | 37 | [[package]] 38 | name = "memchr" 39 | version = "2.6.3" 40 | source = "registry+https://github.com/rust-lang/crates.io-index" 41 | checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" 42 | 43 | [[package]] 44 | name = "proc-macro2" 45 | version = "1.0.66" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" 48 | dependencies = [ 49 | "unicode-ident", 50 | ] 51 | 52 | [[package]] 53 | name = "quote" 54 | version = "1.0.33" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" 57 | dependencies = [ 58 | "proc-macro2", 59 | ] 60 | 61 | [[package]] 62 | name = "serde" 63 | version = "1.0.188" 64 | source = "registry+https://github.com/rust-lang/crates.io-index" 65 | checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" 66 | dependencies = [ 67 | "serde_derive", 68 | ] 69 | 70 | [[package]] 71 | name = "serde_derive" 72 | version = "1.0.188" 73 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" 75 | dependencies = [ 76 | "proc-macro2", 77 | "quote", 78 | "syn", 79 | ] 80 | 81 | [[package]] 82 | name = "shared" 83 | version = "0.1.0" 84 | dependencies = [ 85 | "anyhow", 86 | "bstr", 87 | ] 88 | 89 | [[package]] 90 | name = "syn" 91 | version = "2.0.31" 92 | source = "registry+https://github.com/rust-lang/crates.io-index" 93 | checksum = "718fa2415bcb8d8bd775917a1bf12a7931b6dfa890753378538118181e0cb398" 94 | dependencies = [ 95 | "proc-macro2", 96 | "quote", 97 | "unicode-ident", 98 | ] 99 | 100 | [[package]] 101 | name = "unicode-ident" 102 | version = "1.0.11" 103 | source = "registry+https://github.com/rust-lang/crates.io-index" 104 | checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" 105 | -------------------------------------------------------------------------------- /benchmarks/engines/naive/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "main" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [[bin]] 7 | name = "main" 8 | path = "main.rs" 9 | 10 | [dependencies] 11 | anyhow = "1.0.69" 12 | lexopt = "0.3.0" 13 | memchr = "2.6.3" 14 | 15 | [dependencies.shared] 16 | path = "../../shared" 17 | 18 | [profile.release] 19 | debug = true 20 | codegen-units = 1 21 | lto = "fat" 22 | -------------------------------------------------------------------------------- /benchmarks/engines/naive/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use { 4 | anyhow::Context, 5 | lexopt::{Arg, ValueExt}, 6 | memchr::memmem, 7 | }; 8 | 9 | use shared::{Benchmark, Sample}; 10 | 11 | fn main() -> anyhow::Result<()> { 12 | let mut p = lexopt::Parser::from_env(); 13 | let (mut engine, mut quiet) = (String::new(), false); 14 | while let Some(arg) = p.next()? 
{ 15 | match arg { 16 | Arg::Short('h') | Arg::Long("help") => { 17 | anyhow::bail!("main [--version | --quiet] ") 18 | } 19 | Arg::Short('q') | Arg::Long("quiet") => { 20 | quiet = true; 21 | } 22 | Arg::Long("version") => { 23 | writeln!(std::io::stdout(), "{}", env!("CARGO_PKG_VERSION"))?; 24 | return Ok(()); 25 | } 26 | Arg::Value(v) => { 27 | anyhow::ensure!( 28 | engine.is_empty(), 29 | "only one engine string allowed" 30 | ); 31 | engine = v.string().context("")?; 32 | anyhow::ensure!( 33 | !engine.is_empty(), 34 | "engine string cannot be empty" 35 | ); 36 | } 37 | _ => return Err(arg.unexpected().into()), 38 | } 39 | } 40 | 41 | let b = Benchmark::from_stdin() 42 | .context("failed to read KLV data from ")?; 43 | let samples = match (b.model.as_str(), engine.as_str()) { 44 | ("compile", "rust/memchr/memmem") => model_compile_memmem(&b)?, 45 | ("count", "rust/memchr/memmem") => model_count_memmem(&b)?, 46 | ("count", "rust/std") => model_count_std(&b)?, 47 | _ => anyhow::bail!( 48 | "unsupported model/engine pair, model={} engine={}", 49 | b.model, 50 | engine 51 | ), 52 | }; 53 | if !quiet { 54 | let mut stdout = std::io::stdout().lock(); 55 | for s in samples.iter() { 56 | writeln!(stdout, "{},{}", s.duration.as_nanos(), s.count)?; 57 | } 58 | } 59 | Ok(()) 60 | } 61 | 62 | /// Implements the "compile a matcher" model for naive multi-substring search 63 | /// with the `memchr` crate's `memmem` implementation. 64 | fn model_compile_memmem(b: &Benchmark) -> anyhow::Result> { 65 | let haystack = &*b.haystack; 66 | shared::run_and_count( 67 | b, 68 | |finders: Vec>| { 69 | let mut count = 0; 70 | for f in finders.iter() { 71 | count += f.find_iter(haystack).count(); 72 | } 73 | Ok(count) 74 | }, 75 | || compile_memmem(b), 76 | ) 77 | } 78 | 79 | /// Implements a naive multi-substring algorithm using the `memchr` crate's 80 | /// `memmem` implementation. 
81 | fn model_count_memmem(b: &Benchmark) -> anyhow::Result> { 82 | let haystack = &*b.haystack; 83 | let finders = compile_memmem(b)?; 84 | shared::run(b, || { 85 | let mut count = 0; 86 | for f in finders.iter() { 87 | count += f.find_iter(haystack).count(); 88 | } 89 | Ok(count) 90 | }) 91 | } 92 | 93 | /// Implements a naive multi-substring algorithm using std's single substring 94 | /// search implementation. This returns an error if the haystack or any of 95 | /// the needles are invalid UTF-8. 96 | fn model_count_std(b: &Benchmark) -> anyhow::Result> { 97 | let Ok(haystack) = std::str::from_utf8(&b.haystack) else { 98 | anyhow::bail!("haystack is not valid UTF-8") 99 | }; 100 | let mut needles = vec![]; 101 | for needle in b.needles.iter() { 102 | let Ok(needle) = std::str::from_utf8(needle) else { 103 | anyhow::bail!("one of the needles is not valid UTF-8") 104 | }; 105 | needles.push(needle); 106 | } 107 | shared::run(b, || { 108 | let mut count = 0; 109 | for needle in needles.iter() { 110 | count += haystack.matches(needle).count(); 111 | } 112 | Ok(count) 113 | }) 114 | } 115 | 116 | /// Compiles a naive multi-substring matcher by building a single substring 117 | /// matcher for each needle. 
118 | fn compile_memmem( 119 | b: &Benchmark, 120 | ) -> anyhow::Result>> { 121 | anyhow::ensure!( 122 | !b.case_insensitive, 123 | "naive multi-substring search doesn't support case insensitive mode", 124 | ); 125 | let mut finders = vec![]; 126 | for needle in b.needles.iter() { 127 | finders.push(memmem::Finder::new(needle).into_owned()); 128 | } 129 | Ok(finders) 130 | } 131 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-aho-corasick/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-aho-corasick/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.0.5" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "aho-corasick" 16 | version = "1.1.3" 17 | dependencies = [ 18 | "log", 19 | "memchr", 20 | ] 21 | 22 | [[package]] 23 | name = "anyhow" 24 | version = "1.0.69" 25 | source = "registry+https://github.com/rust-lang/crates.io-index" 26 | checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" 27 | 28 | [[package]] 29 | name = "atty" 30 | version = "0.2.14" 31 | source = "registry+https://github.com/rust-lang/crates.io-index" 32 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 33 | dependencies = [ 34 | "hermit-abi", 35 | "libc", 36 | "winapi", 37 | ] 38 | 39 | [[package]] 40 | name = "bstr" 41 | version = "1.6.2" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = 
"4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" 44 | dependencies = [ 45 | "memchr", 46 | "serde", 47 | ] 48 | 49 | [[package]] 50 | name = "env_logger" 51 | version = "0.9.3" 52 | source = "registry+https://github.com/rust-lang/crates.io-index" 53 | checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" 54 | dependencies = [ 55 | "atty", 56 | "humantime", 57 | "log", 58 | "regex", 59 | "termcolor", 60 | ] 61 | 62 | [[package]] 63 | name = "hermit-abi" 64 | version = "0.1.19" 65 | source = "registry+https://github.com/rust-lang/crates.io-index" 66 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 67 | dependencies = [ 68 | "libc", 69 | ] 70 | 71 | [[package]] 72 | name = "humantime" 73 | version = "2.1.0" 74 | source = "registry+https://github.com/rust-lang/crates.io-index" 75 | checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" 76 | 77 | [[package]] 78 | name = "lexopt" 79 | version = "0.3.0" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | checksum = "baff4b617f7df3d896f97fe922b64817f6cd9a756bb81d40f8883f2f66dcb401" 82 | 83 | [[package]] 84 | name = "libc" 85 | version = "0.2.148" 86 | source = "registry+https://github.com/rust-lang/crates.io-index" 87 | checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" 88 | 89 | [[package]] 90 | name = "log" 91 | version = "0.4.20" 92 | source = "registry+https://github.com/rust-lang/crates.io-index" 93 | checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" 94 | 95 | [[package]] 96 | name = "main" 97 | version = "1.0.5" 98 | dependencies = [ 99 | "aho-corasick 1.1.3", 100 | "anyhow", 101 | "env_logger", 102 | "lexopt", 103 | "shared", 104 | ] 105 | 106 | [[package]] 107 | name = "memchr" 108 | version = "2.6.3" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = 
"8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" 111 | 112 | [[package]] 113 | name = "regex" 114 | version = "1.9.5" 115 | source = "registry+https://github.com/rust-lang/crates.io-index" 116 | checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" 117 | dependencies = [ 118 | "aho-corasick 1.0.5", 119 | "memchr", 120 | "regex-automata", 121 | "regex-syntax", 122 | ] 123 | 124 | [[package]] 125 | name = "regex-automata" 126 | version = "0.3.8" 127 | source = "registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" 129 | dependencies = [ 130 | "aho-corasick 1.0.5", 131 | "memchr", 132 | "regex-syntax", 133 | ] 134 | 135 | [[package]] 136 | name = "regex-syntax" 137 | version = "0.7.5" 138 | source = "registry+https://github.com/rust-lang/crates.io-index" 139 | checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" 140 | 141 | [[package]] 142 | name = "serde" 143 | version = "1.0.152" 144 | source = "registry+https://github.com/rust-lang/crates.io-index" 145 | checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" 146 | 147 | [[package]] 148 | name = "shared" 149 | version = "0.1.0" 150 | dependencies = [ 151 | "anyhow", 152 | "bstr", 153 | ] 154 | 155 | [[package]] 156 | name = "termcolor" 157 | version = "1.2.0" 158 | source = "registry+https://github.com/rust-lang/crates.io-index" 159 | checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" 160 | dependencies = [ 161 | "winapi-util", 162 | ] 163 | 164 | [[package]] 165 | name = "winapi" 166 | version = "0.3.9" 167 | source = "registry+https://github.com/rust-lang/crates.io-index" 168 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 169 | dependencies = [ 170 | "winapi-i686-pc-windows-gnu", 171 | "winapi-x86_64-pc-windows-gnu", 172 | ] 173 | 174 | [[package]] 175 | name = 
"winapi-i686-pc-windows-gnu" 176 | version = "0.4.0" 177 | source = "registry+https://github.com/rust-lang/crates.io-index" 178 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 179 | 180 | [[package]] 181 | name = "winapi-util" 182 | version = "0.1.5" 183 | source = "registry+https://github.com/rust-lang/crates.io-index" 184 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 185 | dependencies = [ 186 | "winapi", 187 | ] 188 | 189 | [[package]] 190 | name = "winapi-x86_64-pc-windows-gnu" 191 | version = "0.4.0" 192 | source = "registry+https://github.com/rust-lang/crates.io-index" 193 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 194 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-aho-corasick/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "main" 3 | version = "1.0.5" 4 | edition = "2021" 5 | 6 | [[bin]] 7 | name = "main" 8 | path = "main.rs" 9 | 10 | [dependencies] 11 | aho-corasick = { version = "*", path = "../../../", features = ["logging"] } 12 | anyhow = "1.0.69" 13 | # Using an older version here because I am really not a fan of the dependency 14 | # tree explosion that has happened in 0.10. 15 | env_logger = "0.9.3" 16 | lexopt = "0.3.0" 17 | 18 | [dependencies.shared] 19 | path = "../../shared" 20 | 21 | [profile.release] 22 | debug = true 23 | codegen-units = 1 24 | lto = "fat" 25 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-aho-corasick/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a Rust runner program for benchmarking the 2 | [`aho-corasick` crate][rust-aho-corasick]. 
The `aho-corasick` crate 3 | principally implements the [Aho-Corasick algorithm][aho-corasick], although 4 | it has other algorithms for multiple substring search, such as [Teddy], which 5 | was ported from the Hyperscan project. 6 | 7 | The `aho-corasick` crate is used by [Rust's `regex` crate][rust-regex] to 8 | implement fast prefilters that permit finding candidates very quickly and only 9 | needing to use the regex engine to confirm the match. The Teddy algorithm is 10 | particularly excellent here. (Sometimes `aho-corasick` is used as the regex 11 | engine itself, for example, when the regex is just an alternation of literals.) 12 | 13 | Since the `aho-corasick` crate only supports searching for literal strings, this 14 | engine should only be used for regex patterns that are literals. This is up to 15 | the author of the benchmark definition, as this runner program will always 16 | treat regex patterns as literals. 17 | 18 | This also means that this runner program cannot support all benchmark models. 19 | Only the `compile`, `count`, `count-spans` and `grep` models are supported. 20 | 21 | Finally, this runner program supports measuring two different Aho-Corasick 22 | implementations: `nfa` and `dfa`. The former follows failure transitions at 23 | search time and is thus usually slower, where as the latter builds a full 24 | transition table by pre-computing all failure transitions. The latter tends 25 | to be faster at search time, but can use orders (plural) of magnitude more 26 | memory. In both the `nfa` and `dfa` engines, prefilters inside of Aho-Corasick 27 | are disabled. 
28 | 29 | [rust-aho-corasick]: https://github.com/BurntSushi/aho-corasick 30 | [aho-corasick]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm 31 | [Teddy]: https://github.com/BurntSushi/aho-corasick/tree/4e7fa3b85dd3a3ce882896f1d4ee22b1f271f0b4/src/packed/teddy 32 | [rust-regex]: https://github.com/rust-lang/regex 33 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-aho-corasick/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use { 4 | aho_corasick::{ 5 | AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, MatchKind, 6 | }, 7 | anyhow::Context, 8 | lexopt::{Arg, ValueExt}, 9 | }; 10 | 11 | use shared::{Benchmark, Sample}; 12 | 13 | fn main() -> anyhow::Result<()> { 14 | env_logger::try_init()?; 15 | 16 | let mut p = lexopt::Parser::from_env(); 17 | let (mut engine, mut quiet) = (String::new(), false); 18 | while let Some(arg) = p.next()? { 19 | match arg { 20 | Arg::Short('h') | Arg::Long("help") => { 21 | anyhow::bail!("main [--version | --quiet] ") 22 | } 23 | Arg::Short('q') | Arg::Long("quiet") => { 24 | quiet = true; 25 | } 26 | Arg::Long("version") => { 27 | writeln!(std::io::stdout(), "{}", env!("CARGO_PKG_VERSION"))?; 28 | return Ok(()); 29 | } 30 | Arg::Value(v) => { 31 | anyhow::ensure!( 32 | engine.is_empty(), 33 | "only one engine string allowed" 34 | ); 35 | engine = v.string().context("")?; 36 | anyhow::ensure!( 37 | !engine.is_empty(), 38 | "engine string cannot be empty" 39 | ); 40 | } 41 | _ => return Err(arg.unexpected().into()), 42 | } 43 | } 44 | 45 | let b = Benchmark::from_stdin() 46 | .context("failed to read KLV data from ")?; 47 | let samples = match (b.model.as_str(), engine.as_str()) { 48 | // These first 7 configurations are meant to test the default settings 49 | // on each of {compile, count} x {standard, leftmost-{first,longest}}. 
50 | // We don't also test each of them with {nfa/(non-)?contiguous, dfa} 51 | // because it would just get ridiculous. 52 | ("compile", "default/standard") => { 53 | model_compile_ac(&b, || Ok(builder_ac(&b)?.build(&b.needles)?))? 54 | } 55 | ("compile", "default/leftmost-first") => model_compile_ac(&b, || { 56 | Ok(builder_ac(&b)? 57 | .match_kind(MatchKind::LeftmostFirst) 58 | .build(&b.needles)?) 59 | })?, 60 | ("compile", "default/leftmost-longest") => { 61 | model_compile_ac(&b, || { 62 | Ok(builder_ac(&b)? 63 | .match_kind(MatchKind::LeftmostLongest) 64 | .build(&b.needles)?) 65 | })? 66 | } 67 | ("count", "default/standard") => { 68 | let ac = builder_ac(&b)?.build(&b.needles)?; 69 | model_count_ac(&b, &ac)? 70 | } 71 | ("count", "default/overlapping") => { 72 | let ac = builder_ac(&b)?.build(&b.needles)?; 73 | model_count_ac_overlapping(&b, &ac)? 74 | } 75 | ("count", "default/leftmost-first") => { 76 | let ac = builder_ac(&b)? 77 | .match_kind(MatchKind::LeftmostFirst) 78 | .build(&b.needles)?; 79 | model_count_ac(&b, &ac)? 80 | } 81 | ("count", "default/leftmost-longest") => { 82 | let ac = builder_ac(&b)? 83 | .match_kind(MatchKind::LeftmostLongest) 84 | .build(&b.needles)?; 85 | model_count_ac(&b, &ac)? 86 | } 87 | 88 | // OK, now we start testing the specific Aho-Corasick automatons, but 89 | // we just focus on leftmost-first because that's the case we tend to 90 | // be more interested in optimizing in practice. There's also likely 91 | // to not be much of a perf difference between leftmost-first and 92 | // leftmost-longest. 93 | // 94 | // We also specifically disable prefilters so that we know we're always 95 | // measuring the actual automaton. (The 'default' engines above might 96 | // use a prefilter!) 97 | ("count", "nfa-noncontiguous/leftmost-first") => { 98 | let ac = builder_ac(&b)? 
99 | .prefilter(false) 100 | .kind(Some(AhoCorasickKind::NoncontiguousNFA)) 101 | .match_kind(MatchKind::LeftmostFirst) 102 | .build(&b.needles)?; 103 | model_count_ac(&b, &ac)? 104 | } 105 | ("count", "nfa-contiguous/leftmost-first") => { 106 | let ac = builder_ac(&b)? 107 | .prefilter(false) 108 | .kind(Some(AhoCorasickKind::ContiguousNFA)) 109 | .match_kind(MatchKind::LeftmostFirst) 110 | .build(&b.needles)?; 111 | model_count_ac(&b, &ac)? 112 | } 113 | ("count", "dfa/leftmost-first") => { 114 | let ac = builder_ac(&b)? 115 | .prefilter(false) 116 | .kind(Some(AhoCorasickKind::DFA)) 117 | .match_kind(MatchKind::LeftmostFirst) 118 | .build(&b.needles)?; 119 | model_count_ac(&b, &ac)? 120 | } 121 | 122 | // And now the packed substring routines. We include a 'compile' 123 | // model here as well because it's nice to know how long, specifically, 124 | // the packed searcher take to build in isolation. 125 | ("compile", "packed/leftmost-first") => { 126 | model_compile_packed(&b, || { 127 | let searcher = aho_corasick::packed::Config::new() 128 | .match_kind(aho_corasick::packed::MatchKind::LeftmostFirst) 129 | .heuristic_pattern_limits(false) 130 | .builder() 131 | .extend(&b.needles) 132 | .build() 133 | .ok_or_else(|| { 134 | anyhow::anyhow!("could not build packed searcher") 135 | })?; 136 | Ok(searcher) 137 | })? 138 | } 139 | ("count", "packed/leftmost-first") => { 140 | let searcher = aho_corasick::packed::Config::new() 141 | .match_kind(aho_corasick::packed::MatchKind::LeftmostFirst) 142 | .heuristic_pattern_limits(false) 143 | .builder() 144 | .extend(&b.needles) 145 | .build() 146 | .ok_or_else(|| { 147 | anyhow::anyhow!("could not build packed searcher") 148 | })?; 149 | model_count_packed(&b, &searcher)? 
150 | } 151 | _ => anyhow::bail!( 152 | "unsupported model/engine pair, model={} engine={}", 153 | b.model, 154 | engine 155 | ), 156 | }; 157 | if !quiet { 158 | let mut stdout = std::io::stdout().lock(); 159 | for s in samples.iter() { 160 | writeln!(stdout, "{},{}", s.duration.as_nanos(), s.count)?; 161 | } 162 | } 163 | Ok(()) 164 | } 165 | 166 | /// Implements the "compile a matcher" model for `AhoCorasick`. 167 | fn model_compile_ac( 168 | b: &Benchmark, 169 | compile: impl FnMut() -> anyhow::Result, 170 | ) -> anyhow::Result> { 171 | let haystack = &*b.haystack; 172 | shared::run_and_count( 173 | b, 174 | |re: AhoCorasick| Ok(re.find_iter(haystack).count()), 175 | compile, 176 | ) 177 | } 178 | 179 | /// Implements the "compile a matcher" model for packed substring search. 180 | fn model_compile_packed( 181 | b: &Benchmark, 182 | compile: impl FnMut() -> anyhow::Result, 183 | ) -> anyhow::Result> { 184 | let haystack = &*b.haystack; 185 | shared::run_and_count( 186 | b, 187 | |re: aho_corasick::packed::Searcher| { 188 | Ok(re.find_iter(haystack).count()) 189 | }, 190 | compile, 191 | ) 192 | } 193 | 194 | /// Implements the "count all matches" model for `AhoCorasick`. 195 | fn model_count_ac( 196 | b: &Benchmark, 197 | ac: &AhoCorasick, 198 | ) -> anyhow::Result> { 199 | let haystack = &*b.haystack; 200 | shared::run(b, || Ok(ac.find_iter(haystack).count())) 201 | } 202 | 203 | /// Implements the "count all overlapping matches" model for `AhoCorasick`. 204 | fn model_count_ac_overlapping( 205 | b: &Benchmark, 206 | ac: &AhoCorasick, 207 | ) -> anyhow::Result> { 208 | let haystack = &*b.haystack; 209 | shared::run(b, || Ok(ac.find_overlapping_iter(haystack).count())) 210 | } 211 | 212 | /// Implements the "count all matches" model for packed substring search. 
213 | fn model_count_packed( 214 | b: &Benchmark, 215 | searcher: &aho_corasick::packed::Searcher, 216 | ) -> anyhow::Result> { 217 | anyhow::ensure!( 218 | !b.case_insensitive, 219 | "rust/aho-corasick/packed engines are incompatible \ 220 | with 'case-insensitive = true'" 221 | ); 222 | 223 | let haystack = &*b.haystack; 224 | shared::run(b, || Ok(searcher.find_iter(haystack).count())) 225 | } 226 | 227 | /// Returns a default builder with as many settings as possible applied from 228 | /// the benchmark definition. If the settings from the definition are not 229 | /// supported, then this returns an error. 230 | fn builder_ac(b: &Benchmark) -> anyhow::Result { 231 | anyhow::ensure!( 232 | !(b.unicode && b.case_insensitive), 233 | "rust/aho-corasick engines are incompatible with 'unicode = true' and \ 234 | 'case-insensitive = true'" 235 | ); 236 | let mut builder = AhoCorasick::builder(); 237 | builder.ascii_case_insensitive(b.case_insensitive); 238 | Ok(builder) 239 | } 240 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-daachorse/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "anyhow" 7 | version = "1.0.75" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" 10 | 11 | [[package]] 12 | name = "bstr" 13 | version = "1.6.2" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" 16 | dependencies = [ 17 | "memchr", 18 | "serde", 19 | ] 20 | 21 | [[package]] 22 | name = "daachorse" 23 | version = "1.0.0" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | checksum = "63b7ef7a4be509357f4804d0a22e830daddb48f19fd604e4ad32ddce04a94c36" 26 | 27 | [[package]] 28 | name = "lexopt" 29 | version = "0.3.0" 30 | source = "registry+https://github.com/rust-lang/crates.io-index" 31 | checksum = "baff4b617f7df3d896f97fe922b64817f6cd9a756bb81d40f8883f2f66dcb401" 32 | 33 | [[package]] 34 | name = "main" 35 | version = "1.0.0" 36 | dependencies = [ 37 | "anyhow", 38 | "daachorse", 39 | "lexopt", 40 | "shared", 41 | ] 42 | 43 | [[package]] 44 | name = "memchr" 45 | version = "2.6.3" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" 48 | 49 | [[package]] 50 | name = "proc-macro2" 51 | version = "1.0.66" 52 | source = "registry+https://github.com/rust-lang/crates.io-index" 53 | checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" 54 | dependencies = [ 55 | "unicode-ident", 56 | ] 57 | 58 | [[package]] 59 | name = "quote" 60 | version = "1.0.33" 61 | source = "registry+https://github.com/rust-lang/crates.io-index" 62 | checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" 63 | dependencies = [ 64 | "proc-macro2", 65 | ] 66 | 67 | [[package]] 68 | name = "serde" 69 | version = "1.0.188" 70 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 71 | checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" 72 | dependencies = [ 73 | "serde_derive", 74 | ] 75 | 76 | [[package]] 77 | name = "serde_derive" 78 | version = "1.0.188" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" 81 | dependencies = [ 82 | "proc-macro2", 83 | "quote", 84 | "syn", 85 | ] 86 | 87 | [[package]] 88 | name = "shared" 89 | version = "0.1.0" 90 | dependencies = [ 91 | "anyhow", 92 | "bstr", 93 | ] 94 | 95 | [[package]] 96 | name = "syn" 97 | version = "2.0.31" 98 | source = "registry+https://github.com/rust-lang/crates.io-index" 99 | checksum = "718fa2415bcb8d8bd775917a1bf12a7931b6dfa890753378538118181e0cb398" 100 | dependencies = [ 101 | "proc-macro2", 102 | "quote", 103 | "unicode-ident", 104 | ] 105 | 106 | [[package]] 107 | name = "unicode-ident" 108 | version = "1.0.11" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" 111 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-daachorse/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "main" 3 | version = "1.0.0" 4 | edition = "2021" 5 | 6 | [[bin]] 7 | name = "main" 8 | path = "main.rs" 9 | 10 | [dependencies] 11 | anyhow = "1.0.69" 12 | daachorse = "=1.0.0" 13 | lexopt = "0.3.0" 14 | 15 | [dependencies.shared] 16 | path = "../../shared" 17 | 18 | [profile.release] 19 | debug = true 20 | codegen-units = 1 21 | lto = "fat" 22 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-daachorse/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use { 4 | 
anyhow::Context, 5 | daachorse::{ 6 | bytewise::{DoubleArrayAhoCorasick, DoubleArrayAhoCorasickBuilder}, 7 | MatchKind, 8 | }, 9 | lexopt::{Arg, ValueExt}, 10 | }; 11 | 12 | use shared::{Benchmark, Sample}; 13 | 14 | fn main() -> anyhow::Result<()> { 15 | let mut p = lexopt::Parser::from_env(); 16 | let (mut engine, mut quiet) = (String::new(), false); 17 | while let Some(arg) = p.next()? { 18 | match arg { 19 | Arg::Short('h') | Arg::Long("help") => { 20 | anyhow::bail!("main [--version | --quiet] ") 21 | } 22 | Arg::Short('q') | Arg::Long("quiet") => { 23 | quiet = true; 24 | } 25 | Arg::Long("version") => { 26 | writeln!(std::io::stdout(), "{}", env!("CARGO_PKG_VERSION"))?; 27 | return Ok(()); 28 | } 29 | Arg::Value(v) => { 30 | anyhow::ensure!( 31 | engine.is_empty(), 32 | "only one engine string allowed" 33 | ); 34 | engine = v.string().context("")?; 35 | anyhow::ensure!( 36 | !engine.is_empty(), 37 | "engine string cannot be empty" 38 | ); 39 | } 40 | _ => return Err(arg.unexpected().into()), 41 | } 42 | } 43 | 44 | let b = Benchmark::from_stdin() 45 | .context("failed to read KLV data from ")?; 46 | let samples = match (b.model.as_str(), engine.as_str()) { 47 | ("compile", "bytewise/standard") => { 48 | model_compile_bytewise_standard(&b)? 49 | } 50 | ("compile", "bytewise/leftmost-first") => { 51 | model_compile_bytewise_leftmost(&b, MatchKind::LeftmostFirst)? 52 | } 53 | ("compile", "bytewise/leftmost-longest") => { 54 | model_compile_bytewise_leftmost(&b, MatchKind::LeftmostLongest)? 55 | } 56 | ("count", "bytewise/standard") => model_count_bytewise_standard(&b)?, 57 | ("count", "bytewise/overlapping") => { 58 | model_count_bytewise_overlapping(&b)? 59 | } 60 | ("count", "bytewise/leftmost-first") => { 61 | model_count_bytewise_leftmost(&b, MatchKind::LeftmostFirst)? 62 | } 63 | ("count", "bytewise/leftmost-longest") => { 64 | model_count_bytewise_leftmost(&b, MatchKind::LeftmostLongest)? 
65 | } 66 | _ => anyhow::bail!( 67 | "unsupported model/engine pair, model={} engine={}", 68 | b.model, 69 | engine 70 | ), 71 | }; 72 | if !quiet { 73 | let mut stdout = std::io::stdout().lock(); 74 | for s in samples.iter() { 75 | writeln!(stdout, "{},{}", s.duration.as_nanos(), s.count)?; 76 | } 77 | } 78 | Ok(()) 79 | } 80 | 81 | /// Implements the "compile a matcher" model for a bytewise daachorse automaton 82 | /// using "standard" (i.e., what's found in a textbook description of 83 | /// Aho-Corasick for a non-overlapping search) match semantics. 84 | fn model_compile_bytewise_standard( 85 | b: &Benchmark, 86 | ) -> anyhow::Result> { 87 | let haystack = &*b.haystack; 88 | shared::run_and_count( 89 | b, 90 | |ac: daachorse::DoubleArrayAhoCorasick| { 91 | Ok(ac.find_iter(haystack).count()) 92 | }, 93 | || compile_bytewise(b, MatchKind::Standard), 94 | ) 95 | } 96 | 97 | /// Implements the "compile a matcher" model for a bytewise daachorse automaton 98 | /// using the given match semantics. The match semantics must be either 99 | /// leftmost-first or leftmost-longest. 100 | fn model_compile_bytewise_leftmost( 101 | b: &Benchmark, 102 | kind: MatchKind, 103 | ) -> anyhow::Result> { 104 | let haystack = &*b.haystack; 105 | shared::run_and_count( 106 | b, 107 | |ac: daachorse::DoubleArrayAhoCorasick| { 108 | Ok(ac.leftmost_find_iter(haystack).count()) 109 | }, 110 | || compile_bytewise(b, kind), 111 | ) 112 | } 113 | 114 | /// Implements a multi-substring algorithm using daachorse's bytewise 115 | /// Aho-Corasick automaton. This uses "standard" match semantics. 116 | fn model_count_bytewise_standard( 117 | b: &Benchmark, 118 | ) -> anyhow::Result> { 119 | let haystack = &*b.haystack; 120 | let ac = compile_bytewise(b, MatchKind::Standard)?; 121 | shared::run(b, || Ok(ac.find_iter(haystack).count())) 122 | } 123 | 124 | /// Implements a multi-substring algorithm using daachorse's bytewise 125 | /// Aho-Corasick automaton. 
This uses "standard" match semantics and finds all 126 | /// overlapping matches. 127 | fn model_count_bytewise_overlapping( 128 | b: &Benchmark, 129 | ) -> anyhow::Result> { 130 | let haystack = &*b.haystack; 131 | let ac = compile_bytewise(b, MatchKind::Standard)?; 132 | shared::run(b, || Ok(ac.find_overlapping_iter(haystack).count())) 133 | } 134 | 135 | /// Implements a multi-substring algorithm using daachorse's bytewise 136 | /// Aho-Corasick automaton. This requires leftmost-first or leftmost-longest 137 | /// match semantics. 138 | fn model_count_bytewise_leftmost( 139 | b: &Benchmark, 140 | kind: MatchKind, 141 | ) -> anyhow::Result> { 142 | let haystack = &*b.haystack; 143 | let ac = compile_bytewise(b, kind)?; 144 | shared::run(b, || Ok(ac.leftmost_find_iter(haystack).count())) 145 | } 146 | 147 | /// Compiles a bytewise daachorse double-array Aho-Corasick automaton from 148 | /// the benchmark's needles using the given match semantics. 149 | fn compile_bytewise( 150 | b: &Benchmark, 151 | kind: MatchKind, 152 | ) -> anyhow::Result> { 153 | anyhow::ensure!( 154 | !b.case_insensitive, 155 | "daachorse doesn't support case insensitive mode", 156 | ); 157 | let result = DoubleArrayAhoCorasickBuilder::new() 158 | .match_kind(kind) 159 | .build(&b.needles); 160 | let ac = match result { 161 | Ok(ac) => ac, 162 | Err(err) => anyhow::bail!("daachorse build failed: {}", err), 163 | }; 164 | Ok(ac) 165 | } 166 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-jetscii/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing.
3 | version = 3 4 | 5 | [[package]] 6 | name = "anyhow" 7 | version = "1.0.72" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854" 10 | 11 | [[package]] 12 | name = "bstr" 13 | version = "1.6.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "6798148dccfbff0fae41c7574d2fa8f1ef3492fba0face179de5d8d447d67b05" 16 | dependencies = [ 17 | "memchr", 18 | "serde", 19 | ] 20 | 21 | [[package]] 22 | name = "jetscii" 23 | version = "0.5.3" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | checksum = "47f142fe24a9c9944451e8349de0a56af5f3e7226dc46f3ed4d4ecc0b85af75e" 26 | 27 | [[package]] 28 | name = "main" 29 | version = "0.5.3" 30 | dependencies = [ 31 | "anyhow", 32 | "jetscii", 33 | "shared", 34 | ] 35 | 36 | [[package]] 37 | name = "memchr" 38 | version = "2.5.0" 39 | source = "registry+https://github.com/rust-lang/crates.io-index" 40 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 41 | 42 | [[package]] 43 | name = "serde" 44 | version = "1.0.178" 45 | source = "registry+https://github.com/rust-lang/crates.io-index" 46 | checksum = "60363bdd39a7be0266a520dab25fdc9241d2f987b08a01e01f0ec6d06a981348" 47 | 48 | [[package]] 49 | name = "shared" 50 | version = "0.1.0" 51 | dependencies = [ 52 | "anyhow", 53 | "bstr", 54 | ] 55 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-jetscii/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "main" 4 | version = "0.5.3" 5 | edition = "2021" 6 | 7 | [workspace] 8 | 9 | [dependencies] 10 | anyhow = "1.0.72" 11 | jetscii = "=0.5.3" 12 | 13 | [dependencies.shared] 14 | path = "../../shared" 15 | 16 | [[bin]] 17 | name = "main" 18 | path = "main.rs" 19 | 20 | [profile.release] 21 | debug = true 22 | 
codegen-units = 1 23 | lto = "fat" 24 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-jetscii/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use shared::{Benchmark, Sample}; 4 | 5 | fn main() -> anyhow::Result<()> { 6 | let Some(arg) = std::env::args_os().nth(1) else { 7 | anyhow::bail!("Usage: runner ( | --version)") 8 | }; 9 | let Ok(arg) = arg.into_string() else { 10 | anyhow::bail!("argument given is not valid UTF-8") 11 | }; 12 | if arg == "--version" { 13 | writeln!(std::io::stdout(), env!("CARGO_PKG_VERSION"))?; 14 | return Ok(()); 15 | } 16 | let engine = arg; 17 | let b = Benchmark::from_stdin()?; 18 | let samples = match (&*engine, &*b.model) { 19 | ("ascii-chars-prebuilt", "count") => memmem_prebuilt_count(&b)?, 20 | ("ascii-chars-oneshot", "count") => memmem_oneshot_count(&b)?, 21 | (engine, model) => { 22 | anyhow::bail!("unrecognized engine '{engine}' and model '{model}'") 23 | } 24 | }; 25 | let mut stdout = std::io::stdout().lock(); 26 | for s in samples.iter() { 27 | writeln!(stdout, "{},{}", s.duration.as_nanos(), s.count)?; 28 | } 29 | Ok(()) 30 | } 31 | 32 | fn memmem_prebuilt_count(b: &Benchmark) -> anyhow::Result> { 33 | let Ok(haystack) = std::str::from_utf8(&b.haystack) else { 34 | anyhow::bail!("jetscii ASCII search requires valid UTF-8 haystack") 35 | }; 36 | let (needles, len) = needle_array(b)?; 37 | let fallback = jetscii_fallback(b)?; 38 | let finder = jetscii::AsciiChars::new(needles, len, fallback); 39 | shared::run(b, || { 40 | let mut haystack = haystack; 41 | let mut count = 0; 42 | while let Some(i) = finder.find(haystack) { 43 | count += 1; 44 | haystack = &haystack[i + 1..]; 45 | } 46 | Ok(count) 47 | }) 48 | } 49 | 50 | fn memmem_oneshot_count(b: &Benchmark) -> anyhow::Result> { 51 | let Ok(haystack) = std::str::from_utf8(&b.haystack) else { 52 | anyhow::bail!("jetscii ASCII search requires 
valid UTF-8 haystack") 53 | }; 54 | let (needles, len) = needle_array(b)?; 55 | let fallback = jetscii_fallback(b)?; 56 | shared::run(b, || { 57 | let finder = jetscii::AsciiChars::new(needles, len, &fallback); 58 | let mut haystack = haystack; 59 | let mut count = 0; 60 | while let Some(i) = finder.find(haystack) { 61 | count += 1; 62 | haystack = &haystack[i + 1..]; 63 | } 64 | Ok(count) 65 | }) 66 | } 67 | 68 | /// Converts the needles from the given benchmark into a fixed size 16-element 69 | /// array along with the number of actual needles in the array (which may be 70 | /// less than 16). 71 | /// 72 | /// If any needle is more than one byte or there are too many needles to fit 73 | /// into a 16-element array, then this returns an error. This also returns an 74 | /// error if any of the bytes are not ASCII. 75 | fn needle_array(b: &Benchmark) -> anyhow::Result<([u8; 16], i32)> { 76 | let mut array = [0u8; 16]; 77 | let needles = b.needle_bytes()?; 78 | let Ok(len) = i32::try_from(needles.len()) else { 79 | anyhow::bail!("needle length {} could not fit into i32", needles.len()) 80 | }; 81 | anyhow::ensure!( 82 | needles.len() <= 16, 83 | "jetscii only supports at most 16 single byte needles, \ 84 | but found {} needles", 85 | needles.len(), 86 | ); 87 | for (i, byte) in needles.into_iter().enumerate() { 88 | array[i] = byte; 89 | } 90 | Ok((array, len)) 91 | } 92 | 93 | /// Create a fallback predicate for jetscii's up-to-16-bytes search. 94 | fn jetscii_fallback(b: &Benchmark) -> anyhow::Result bool> { 95 | let mut set = vec![false; 256]; 96 | for byte in b.needle_bytes()? 
{ 97 | set[usize::from(byte)] = true; 98 | } 99 | Ok(move |byte| set[usize::from(byte)]) 100 | } 101 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-old-aho-corasick/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-old-aho-corasick/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.0.5" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anyhow" 16 | version = "1.0.69" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" 19 | 20 | [[package]] 21 | name = "bstr" 22 | version = "1.6.2" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" 25 | dependencies = [ 26 | "memchr", 27 | "serde", 28 | ] 29 | 30 | [[package]] 31 | name = "lexopt" 32 | version = "0.3.0" 33 | source = "registry+https://github.com/rust-lang/crates.io-index" 34 | checksum = "baff4b617f7df3d896f97fe922b64817f6cd9a756bb81d40f8883f2f66dcb401" 35 | 36 | [[package]] 37 | name = "main" 38 | version = "1.0.5" 39 | dependencies = [ 40 | "aho-corasick", 41 | "anyhow", 42 | "lexopt", 43 | "shared", 44 | ] 45 | 46 | [[package]] 47 | name = "memchr" 48 | version = "2.6.3" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = 
"8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" 51 | 52 | [[package]] 53 | name = "serde" 54 | version = "1.0.152" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" 57 | 58 | [[package]] 59 | name = "shared" 60 | version = "0.1.0" 61 | dependencies = [ 62 | "anyhow", 63 | "bstr", 64 | ] 65 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-old-aho-corasick/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "main" 3 | version = "1.0.5" 4 | edition = "2021" 5 | 6 | [[bin]] 7 | name = "main" 8 | path = "main.rs" 9 | 10 | [dependencies] 11 | anyhow = "1.0.69" 12 | lexopt = "0.3.0" 13 | aho-corasick = "=1.0.5" 14 | 15 | [dependencies.shared] 16 | path = "../../shared" 17 | 18 | [profile.release] 19 | debug = true 20 | codegen-units = 1 21 | lto = "fat" 22 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-old-aho-corasick/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a Rust runner program for benchmarking the 2 | [`aho-corasick` crate][rust-aho-corasick]. The `aho-corasick` crate 3 | principally implements the [Aho-Corasick algorithm][aho-corasick], although 4 | it has other algorithms for multiple substring search, such as [Teddy], which 5 | was ported from the Hyperscan project. 6 | 7 | The `aho-corasick` crate is used by [Rust's `regex` crate][rust-regex] to 8 | implement fast prefilters that permit finding candidates very quickly and only 9 | needing to use the regex engine to confirm the match. The Teddy algorithm is 10 | particularly excellent here. (Sometimes `aho-corasick` is used as the regex 11 | engine itself, for example, when the regex is just an alternation of literals.) 
12 | 13 | Since the `aho-corasick` crate only supports searching for literal strings, this 14 | engine should only be used for regex patterns that are literals. This is up to 15 | the author of the benchmark definition, as this runner program will always 16 | treat regex patterns as literals. 17 | 18 | This also means that this runner program cannot support all benchmark models. 19 | Only the `compile`, `count`, `count-spans` and `grep` models are supported. 20 | 21 | Finally, this runner program supports measuring two different Aho-Corasick 22 | implementations: `nfa` and `dfa`. The former follows failure transitions at 23 | search time and is thus usually slower, whereas the latter builds a full 24 | transition table by pre-computing all failure transitions. The latter tends 25 | to be faster at search time, but can use orders (plural) of magnitude more 26 | memory. In both the `nfa` and `dfa` engines, prefilters inside of Aho-Corasick 27 | are disabled. 28 | 29 | [rust-aho-corasick]: https://github.com/BurntSushi/aho-corasick 30 | [aho-corasick]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm 31 | [Teddy]: https://github.com/BurntSushi/aho-corasick/tree/4e7fa3b85dd3a3ce882896f1d4ee22b1f271f0b4/src/packed/teddy 32 | [rust-regex]: https://github.com/rust-lang/regex 33 | -------------------------------------------------------------------------------- /benchmarks/engines/rust-old-aho-corasick/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use { 4 | aho_corasick::{ 5 | AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, MatchKind, 6 | }, 7 | anyhow::Context, 8 | lexopt::{Arg, ValueExt}, 9 | }; 10 | 11 | use shared::{Benchmark, Sample}; 12 | 13 | fn main() -> anyhow::Result<()> { 14 | let mut p = lexopt::Parser::from_env(); 15 | let (mut engine, mut quiet) = (String::new(), false); 16 | while let Some(arg) = p.next()?
{ 17 | match arg { 18 | Arg::Short('h') | Arg::Long("help") => { 19 | anyhow::bail!("main [--version | --quiet] ") 20 | } 21 | Arg::Short('q') | Arg::Long("quiet") => { 22 | quiet = true; 23 | } 24 | Arg::Long("version") => { 25 | writeln!(std::io::stdout(), "{}", env!("CARGO_PKG_VERSION"))?; 26 | return Ok(()); 27 | } 28 | Arg::Value(v) => { 29 | anyhow::ensure!( 30 | engine.is_empty(), 31 | "only one engine string allowed" 32 | ); 33 | engine = v.string().context("")?; 34 | anyhow::ensure!( 35 | !engine.is_empty(), 36 | "engine string cannot be empty" 37 | ); 38 | } 39 | _ => return Err(arg.unexpected().into()), 40 | } 41 | } 42 | 43 | let b = Benchmark::from_stdin() 44 | .context("failed to read KLV data from ")?; 45 | let samples = match (b.model.as_str(), engine.as_str()) { 46 | // These first 6 configurations are meant to test the default settings 47 | // on each of {compile, count} x {standard, leftmost-{first,longest}}. 48 | // We don't also test each of them with {nfa/(non-)?contiguous, dfa} 49 | // because it would just get ridiculous. 50 | ("compile", "default/standard") => { 51 | model_compile_ac(&b, || Ok(builder_ac(&b)?.build(&b.needles)?))? 52 | } 53 | ("compile", "default/leftmost-first") => model_compile_ac(&b, || { 54 | Ok(builder_ac(&b)? 55 | .match_kind(MatchKind::LeftmostFirst) 56 | .build(&b.needles)?) 57 | })?, 58 | ("compile", "default/leftmost-longest") => { 59 | model_compile_ac(&b, || { 60 | Ok(builder_ac(&b)? 61 | .match_kind(MatchKind::LeftmostLongest) 62 | .build(&b.needles)?) 63 | })? 64 | } 65 | ("count", "default/standard") => { 66 | let ac = builder_ac(&b)?.build(&b.needles)?; 67 | model_count_ac(&b, &ac)? 68 | } 69 | ("count", "default/leftmost-first") => { 70 | let ac = builder_ac(&b)? 71 | .match_kind(MatchKind::LeftmostFirst) 72 | .build(&b.needles)?; 73 | model_count_ac(&b, &ac)? 74 | } 75 | ("count", "default/leftmost-longest") => { 76 | let ac = builder_ac(&b)? 
77 | .match_kind(MatchKind::LeftmostLongest) 78 | .build(&b.needles)?; 79 | model_count_ac(&b, &ac)? 80 | } 81 | 82 | // OK, now we start testing the specific Aho-Corasick automatons, but 83 | // we just focus on leftmost-first because that's the case we tend to 84 | // be more interested in optimizing in practice. There's also likely 85 | // to not be much of a perf difference between leftmost-first and 86 | // leftmost-longest. 87 | // 88 | // We also specifically disable prefilters so that we know we're always 89 | // measuring the actual automaton. (The 'default' engines above might 90 | // use a prefilter!) 91 | ("count", "nfa-noncontiguous/leftmost-first") => { 92 | let ac = builder_ac(&b)? 93 | .prefilter(false) 94 | .kind(Some(AhoCorasickKind::NoncontiguousNFA)) 95 | .match_kind(MatchKind::LeftmostFirst) 96 | .build(&b.needles)?; 97 | model_count_ac(&b, &ac)? 98 | } 99 | ("count", "nfa-contiguous/leftmost-first") => { 100 | let ac = builder_ac(&b)? 101 | .prefilter(false) 102 | .kind(Some(AhoCorasickKind::ContiguousNFA)) 103 | .match_kind(MatchKind::LeftmostFirst) 104 | .build(&b.needles)?; 105 | model_count_ac(&b, &ac)? 106 | } 107 | ("count", "dfa/leftmost-first") => { 108 | let ac = builder_ac(&b)? 109 | .prefilter(false) 110 | .kind(Some(AhoCorasickKind::DFA)) 111 | .match_kind(MatchKind::LeftmostFirst) 112 | .build(&b.needles)?; 113 | model_count_ac(&b, &ac)? 114 | } 115 | 116 | // And now the packed substring routines. We include a 'compile' 117 | // model here as well because it's nice to know how long, specifically, 118 | // the packed searcher take to build in isolation. 119 | ("compile", "packed/leftmost-first") => { 120 | model_compile_packed(&b, || { 121 | let searcher = aho_corasick::packed::Config::new() 122 | .match_kind(aho_corasick::packed::MatchKind::LeftmostFirst) 123 | .builder() 124 | .extend(&b.needles) 125 | .build() 126 | .ok_or_else(|| { 127 | anyhow::anyhow!("could not build packed searcher") 128 | })?; 129 | Ok(searcher) 130 | })? 
131 | } 132 | ("count", "packed/leftmost-first") => { 133 | let searcher = aho_corasick::packed::Config::new() 134 | .match_kind(aho_corasick::packed::MatchKind::LeftmostFirst) 135 | .builder() 136 | .extend(&b.needles) 137 | .build() 138 | .ok_or_else(|| { 139 | anyhow::anyhow!("could not build packed searcher") 140 | })?; 141 | model_count_packed(&b, &searcher)? 142 | } 143 | _ => anyhow::bail!( 144 | "unsupported model/engine pair, model={} engine={}", 145 | b.model, 146 | engine 147 | ), 148 | }; 149 | if !quiet { 150 | let mut stdout = std::io::stdout().lock(); 151 | for s in samples.iter() { 152 | writeln!(stdout, "{},{}", s.duration.as_nanos(), s.count)?; 153 | } 154 | } 155 | Ok(()) 156 | } 157 | 158 | /// Implements the "compile a matcher" model for `AhoCorasick`. 159 | fn model_compile_ac( 160 | b: &Benchmark, 161 | compile: impl FnMut() -> anyhow::Result, 162 | ) -> anyhow::Result> { 163 | let haystack = &*b.haystack; 164 | shared::run_and_count( 165 | b, 166 | |re: AhoCorasick| Ok(re.find_iter(haystack).count()), 167 | compile, 168 | ) 169 | } 170 | 171 | /// Implements the "compile a matcher" model for packed substring search. 172 | fn model_compile_packed( 173 | b: &Benchmark, 174 | compile: impl FnMut() -> anyhow::Result, 175 | ) -> anyhow::Result> { 176 | let haystack = &*b.haystack; 177 | shared::run_and_count( 178 | b, 179 | |re: aho_corasick::packed::Searcher| { 180 | Ok(re.find_iter(haystack).count()) 181 | }, 182 | compile, 183 | ) 184 | } 185 | 186 | /// Implements the "count all matches" model for `AhoCorasick`. 187 | fn model_count_ac( 188 | b: &Benchmark, 189 | ac: &AhoCorasick, 190 | ) -> anyhow::Result> { 191 | let haystack = &*b.haystack; 192 | shared::run(b, || Ok(ac.find_iter(haystack).count())) 193 | } 194 | 195 | /// Implements the "count all matches" model for packed substring search. 
196 | fn model_count_packed( 197 | b: &Benchmark, 198 | searcher: &aho_corasick::packed::Searcher, 199 | ) -> anyhow::Result> { 200 | anyhow::ensure!( 201 | !b.case_insensitive, 202 | "rust/aho-corasick/packed engines are incompatible \ 203 | with 'case-insensitive = true'" 204 | ); 205 | 206 | let haystack = &*b.haystack; 207 | shared::run(b, || Ok(searcher.find_iter(haystack).count())) 208 | } 209 | 210 | /// Returns a default builder with as many settings as possible applied from 211 | /// the benchmark definition. If the settings from the definition are not 212 | /// supported, then this returns an error. 213 | fn builder_ac(b: &Benchmark) -> anyhow::Result { 214 | anyhow::ensure!( 215 | !(b.unicode && b.case_insensitive), 216 | "rust/aho-corasick engines are incompatible with 'unicode = true' and \ 217 | 'case-insensitive = true'" 218 | ); 219 | let mut builder = AhoCorasick::builder(); 220 | builder.ascii_case_insensitive(b.case_insensitive); 221 | Ok(builder) 222 | } 223 | -------------------------------------------------------------------------------- /benchmarks/haystacks/catalog.data.gov/README.md: -------------------------------------------------------------------------------- 1 | Source: https://catalog.data.gov/dataset/ 2 | 3 | Mental health: 4 | https://catalog.data.gov/dataset/mental-health-care-in-the-last-4-weeks 5 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/README.md: -------------------------------------------------------------------------------- 1 | These were downloaded and derived from the Open Subtitles data set: 2 | https://opus.nlpl.eu/OpenSubtitles-v2018.php 3 | 4 | The specific way in which they were modified has been lost to time, but it's 5 | likely they were just a simple truncation based on target file sizes for 6 | various benchmarks. 7 | 8 | The main reason why we have them is that it gives us a way to test similar 9 | inputs on non-ASCII text. 
Normally this wouldn't matter for a substring search 10 | implementation, but because of the heuristics used to pick a priori determined 11 | "rare bytes" to base a prefilter on, it's possible for this heuristic to do 12 | more poorly on non-ASCII text than one might expect. 13 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/en-small.txt: -------------------------------------------------------------------------------- 1 | Now you can tell 'em. 2 | What for are you mixing in? 3 | Maybe I don't like to see kids get hurt. 4 | Break any bones, son? 5 | He's got a knife behind his collar! 6 | - There's a stirrup. 7 | You want a lift? 8 | - No. 9 | - Why not? 10 | - I'm beholden to you, mister. 11 | Couldn't we just leave it that way? 12 | - Morning. 13 | - Morning. 14 | - Put him up? 15 | - For how long? 16 | - I wouldn't know. 17 | - It'll be two bits for oats. 18 | - Ain't I seen you before? 19 | - Depends on where you've been. 20 | - I follow the railroad, mostly. 21 | - Could be you've seen me. 22 | - It'll be four bits if he stays the night. 23 | - Fair enough. 24 | Morning. 25 | Did a man ride in today - tall, sort of heavyset? 26 | - You mean him, Mr Renner? 27 | - Not him. 28 | This one had a scar. 29 | Along his cheek? 30 | No, sir. 31 | I don't see no man with a scar. 32 | I guess maybe I can have some apple pie and coffee. 33 | I guess you could have eggs with bacon if you wanted eggs with bacon. 34 | - Hello, Charlie. 35 | - Hello, Grant. 36 | It's good to see you, Charlie. 37 | It's awful good to see you. 38 | It's good to see you too. 39 | Doc you're beginning to sound like Sherlock Holmes. 40 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/en-teeny.txt: -------------------------------------------------------------------------------- 1 | Sound like Sherlock Holmes. 
2 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/en-tiny.txt: -------------------------------------------------------------------------------- 1 | I saw you before but I didn't think you were this young 2 | Doc you're beginning to sound like Sherlock Holmes. 3 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/ru-small.txt: -------------------------------------------------------------------------------- 1 | -Две недели не даешь мне прохода. 2 | Вот и действуй, чем ты рискуешь? 3 | Я думал, что сделаю тебя счастливой. 4 | Тоже мне счастье. 5 | Муж не дает ни гроша, и у любовника ума не хватает подумать о деньгах. 6 | - Хорошенькое счастье. 7 | - Извини, я думал, ты любишь меня. 8 | Ну люблю, люблю тебя, но и не хочу, чтобы все началось как в прошлый раз. 9 | Ты не права. 10 | У меня для тебя сюрприз. 11 | Шлихтовальная машина, ты о ней давно мечтала. 12 | -Для костей? 13 | - Нет, настоящая. 14 | Хочешь, приходи за ней вечером. 15 | Я тебе не девочка. 16 | Была бы ты девочкой, я бы тебе ее не купил. 17 | Я люблю тебя 18 | Митч МакКафи, летающий Шерлок Холмс. 19 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/ru-teeny.txt: -------------------------------------------------------------------------------- 1 | летающий Шерлок Холмс. 2 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/ru-tiny.txt: -------------------------------------------------------------------------------- 1 | Это - одно из самых поразительных недавних открытий науки. 2 | Митч МакКафи, летающий Шерлок Холмс. 
3 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/zh-small.txt: -------------------------------------------------------------------------------- 1 | 魯哇克香貓咖啡 世界上最稀有的飲品 Kopi luwak. 2 | the rarest beverage in the world. 3 | 嘗一小口 Take a whiff. 4 | 來 Go ahead. 5 | 寇爾先生 董事會已準備好聽你的提案 Uh, mr. 6 | cole, the board is ready to hear your proposal. 7 | 等一下下 Hold on just a second. 8 | 來 繼續 Go ahead. 9 | go on. 10 | 怎樣 Well? 11 | 真不錯 Really good. 12 | 真不錯 Really good. 13 | 寇爾先生? 14 | Mr. 15 | cole. 16 | sir? 17 | 吉姆 你知道庸俗是什麼嗎 Do you know what a philistine is, jim? 18 | 先生 我叫理查德 Sir, it's richard. 19 | 沒錯 費爾 出動你的如簧巧舌吧 That's right, phil. 20 | give them the spiel. 21 | 謝謝 主席先生 主管們 Thank you, mr. 22 | chairman, fellow supervisors. 23 | 我們寇爾集團財務的管理不善 We at the cole group feel the decline of the winwood hospital... 24 | 直接造成了溫伍德醫院的衰敗 ...is a direct result of significant fiscal mismanagement. 25 | 請原諒 我們醫院... 26 | I beg your pardon, this hospital... 27 | 日常開支近2倍 overhead costs are nearly double. 28 | 帽子不错 汤姆 夏洛克·福尔摩斯 29 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/zh-teeny.txt: -------------------------------------------------------------------------------- 1 | 汤姆 夏洛克·福尔摩斯 2 | -------------------------------------------------------------------------------- /benchmarks/haystacks/opensubtitles/zh-tiny.txt: -------------------------------------------------------------------------------- 1 | 谁是早餐界的冠军? 
2 | 你突然来信说最近要搬到这里 3 | 帽子不错 汤姆 夏洛克·福尔摩斯 4 | -------------------------------------------------------------------------------- /benchmarks/regexes/words-100: -------------------------------------------------------------------------------- 1 | stampeding 2 | commendable 3 | adrenaline 4 | exobiology 5 | indifference 6 | avuncular 7 | prevailed 8 | foreparts 9 | legalistically 10 | intermarries 11 | desideratum 12 | evaluating 13 | lavishing 14 | attractable 15 | philippics 16 | antiabortionist 17 | lascivious 18 | breathable 19 | histogram 20 | rattlings 21 | interdict 22 | summarized 23 | relieving 24 | congresspeople 25 | fitfulness 26 | percolation 27 | upperclasswoman 28 | epistemic 29 | Chantilly 30 | stonemasons 31 | nonferrous 32 | emulsions 33 | charitably 34 | barracudas 35 | integrity 36 | knockdowns 37 | roadworks 38 | abortionists 39 | Salvadoran 40 | chanceries 41 | misinform 42 | caretaker 43 | extricated 44 | mandolins 45 | steeliest 46 | transpiration 47 | weirdness 48 | audiologists 49 | baronetcies 50 | performing 51 | publishing 52 | suspending 53 | dermatological 54 | contemplate 55 | spiritless 56 | nightwatchman 57 | paradisaical 58 | implicating 59 | timpanists 60 | Leavenworth 61 | amorality 62 | strangulated 63 | cellophane 64 | waterboard 65 | astrophysicists 66 | aerospace 67 | passphrase 68 | engendered 69 | spotlighting 70 | misapplication 71 | barterers 72 | poetesses 73 | dollhouse 74 | laparoscopic 75 | Dubrovnik 76 | rerecords 77 | shielding 78 | orthographically 79 | thicknesses 80 | Bendictus 81 | congealed 82 | cooperative 83 | encompass 84 | grouching 85 | shipowners 86 | jealously 87 | generational 88 | antecedents 89 | persecutes 90 | exemplified 91 | admirable 92 | squeakiest 93 | absconding 94 | extirpated 95 | exoskeletons 96 | earthworms 97 | chaotically 98 | shipbuilder 99 | equidistantly 100 | overprint 101 | -------------------------------------------------------------------------------- /benchmarks/shared/Cargo.lock: 
-------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "anyhow" 7 | version = "1.0.72" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854" 10 | 11 | [[package]] 12 | name = "bstr" 13 | version = "1.6.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "6798148dccfbff0fae41c7574d2fa8f1ef3492fba0face179de5d8d447d67b05" 16 | dependencies = [ 17 | "memchr", 18 | "serde", 19 | ] 20 | 21 | [[package]] 22 | name = "memchr" 23 | version = "2.5.0" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 26 | 27 | [[package]] 28 | name = "serde" 29 | version = "1.0.177" 30 | source = "registry+https://github.com/rust-lang/crates.io-index" 31 | checksum = "63ba2516aa6bf82e0b19ca8b50019d52df58455d3cf9bdaf6315225fdd0c560a" 32 | 33 | [[package]] 34 | name = "shared" 35 | version = "0.1.0" 36 | dependencies = [ 37 | "anyhow", 38 | "bstr", 39 | ] 40 | -------------------------------------------------------------------------------- /benchmarks/shared/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "shared" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [workspace] 7 | 8 | [dependencies] 9 | anyhow = "1.0.69" 10 | bstr = { version = "1.6.0", default-features = false, features = ["std"] } 11 | 12 | [lib] 13 | name = "shared" 14 | path = "lib.rs" 15 | -------------------------------------------------------------------------------- /benchmarks/shared/lib.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | io::Read, 3 | time::{Duration, Instant}, 4 | }; 5 | 6 | use { 
7 | anyhow::Context, 8 | bstr::{ByteSlice, ByteVec}, 9 | }; 10 | 11 | /// A single benchmark configuration read from a sequence of KLV items on 12 | /// stdin. 13 | #[derive(Clone, Debug, Default)] 14 | pub struct Benchmark { 15 | pub name: String, 16 | pub model: String, 17 | pub needles: Vec>, 18 | pub haystack: Vec, 19 | pub case_insensitive: bool, 20 | pub unicode: bool, 21 | pub max_iters: u64, 22 | pub max_warmup_iters: u64, 23 | pub max_time: Duration, 24 | pub max_warmup_time: Duration, 25 | } 26 | 27 | impl Benchmark { 28 | /// Read the KLV benchmark configuration from stdin. 29 | pub fn from_stdin() -> anyhow::Result { 30 | let mut raw = vec![]; 31 | std::io::stdin().read_to_end(&mut raw)?; 32 | Benchmark::read(&raw) 33 | } 34 | 35 | /// Return single byte needles from this benchmark definition. If any 36 | /// needle is more than one byte, then this returns an error. 37 | pub fn needle_bytes(&self) -> anyhow::Result> { 38 | let mut needles = vec![]; 39 | for needle in self.needles.iter() { 40 | anyhow::ensure!( 41 | needle.len() == 1, 42 | "needle must have length 1 (in bytes) but it has length {}", 43 | needle.len(), 44 | ); 45 | needles.push(needle[0]); 46 | } 47 | Ok(needles) 48 | } 49 | 50 | fn read(mut raw: &[u8]) -> anyhow::Result { 51 | let mut config = Benchmark::default(); 52 | while !raw.is_empty() { 53 | let (klv, nread) = OneKLV::read(raw)?; 54 | raw = &raw[nread..]; 55 | config.set(klv)?; 56 | } 57 | Ok(config) 58 | } 59 | 60 | fn set(&mut self, klv: OneKLV) -> anyhow::Result<()> { 61 | let parse_duration = |v: &str| -> anyhow::Result { 62 | Ok(Duration::from_nanos(v.parse()?)) 63 | }; 64 | let OneKLV { key, value } = klv; 65 | match &*key { 66 | "name" => self.name = value.to_str()?.to_string(), 67 | "model" => self.model = value.to_str()?.to_string(), 68 | "pattern" => { 69 | self.needles.push(Vec::unescape_bytes(value.to_str()?)) 70 | } 71 | "haystack" => self.haystack = value.to_vec(), 72 | "case-insensitive" => { 73 | 
self.case_insensitive = value.to_str()?.parse()? 74 | } 75 | "unicode" => self.unicode = value.to_str()?.parse()?, 76 | "max-iters" => self.max_iters = value.to_str()?.parse()?, 77 | "max-warmup-iters" => { 78 | self.max_warmup_iters = value.to_str()?.parse()? 79 | } 80 | "max-time" => self.max_time = parse_duration(value.to_str()?)?, 81 | "max-warmup-time" => { 82 | self.max_warmup_time = parse_duration(value.to_str()?)? 83 | } 84 | _ => {} 85 | } 86 | Ok(()) 87 | } 88 | } 89 | 90 | #[derive(Clone, Debug)] 91 | struct OneKLV { 92 | key: String, 93 | value: Vec, 94 | } 95 | 96 | impl OneKLV { 97 | fn read(bytes: &[u8]) -> anyhow::Result<(OneKLV, usize)> { 98 | let mut nread = 0; 99 | let (key, bytes) = match bytes.split_once_str(":") { 100 | Some(x) => x, 101 | None => anyhow::bail!( 102 | "failed to find first ':' in key-length-value item \ 103 | where the next (at most) 80 bytes are: {:?}", 104 | bytes[..std::cmp::min(80, bytes.len())].as_bstr(), 105 | ), 106 | }; 107 | nread += key.len() + 1; // +1 for ':' 108 | let key = key 109 | .to_str() 110 | .with_context(|| { 111 | format!("key {:?} is not valid UTF-8", key.as_bstr()) 112 | })? 
113 | .to_string(); 114 | 115 | let (len, bytes) = match bytes.split_once_str(":") { 116 | Some(x) => x, 117 | None => anyhow::bail!( 118 | "failed to find second ':' in key-length-value item \ 119 | for key '{}'", 120 | key, 121 | ), 122 | }; 123 | nread += len.len() + 1; // +1 for ':' 124 | let len = len.to_str().with_context(|| { 125 | format!("length for key '{}' is not valid UTF-8", key) 126 | })?; 127 | let len = len.parse::().with_context(|| { 128 | format!( 129 | "length '{}' for key '{}' is not a valid integer", 130 | len, key, 131 | ) 132 | })?; 133 | 134 | anyhow::ensure!( 135 | bytes.len() >= len, 136 | "got length of {} for key '{}', but only {} bytes remain", 137 | len, 138 | key, 139 | bytes.len(), 140 | ); 141 | let value = bytes[..len].into(); 142 | let bytes = &bytes[len..]; 143 | nread += len; 144 | 145 | anyhow::ensure!( 146 | bytes.len() >= 1, 147 | "expected trailing '\\n' after value, but got EOF", 148 | ); 149 | anyhow::ensure!( 150 | bytes[0] == b'\n', 151 | "expected '\\n' after value, but got {:?}", 152 | bytes[0..1].as_bstr(), 153 | ); 154 | nread += 1; 155 | 156 | let klv = OneKLV { key, value }; 157 | Ok((klv, nread)) 158 | } 159 | } 160 | 161 | /// A sample computed from a single benchmark iteration. 162 | #[derive(Clone, Debug)] 163 | pub struct Sample { 164 | /// The duration of the iteration. 165 | pub duration: Duration, 166 | /// The count reported by the benchmark. This is used by the harness to 167 | /// verify that the result is correct. 168 | /// 169 | /// All benchmark models except for regex-redux use this. For regex-redux, 170 | /// it is always zero. 171 | pub count: u64, 172 | } 173 | 174 | /// Run the given `bench` function repeatedly until either the maximum 175 | /// time or number of iterations has been reached and return the set of 176 | /// samples. 
177 | pub fn run( 178 | b: &Benchmark, 179 | bench: impl FnMut() -> anyhow::Result, 180 | ) -> anyhow::Result> { 181 | run_and_count(b, |count| Ok(count), bench) 182 | } 183 | 184 | /// Run the given `bench` function repeatedly until either the maximum 185 | /// time or number of iterations has been reached and return the set of 186 | /// samples. The count for each sample is determined by running `count` on 187 | /// the result of `bench`. The execution time of `count` is specifically 188 | /// not included in the sample's duration. 189 | /// 190 | /// N.B. This variant only exists for the 'compile' model. We want to only 191 | /// measure compile time, but still do extra work that we specifically 192 | /// don't measure to produce a count to ensure the compile regex behaves as 193 | /// expected. 194 | pub fn run_and_count( 195 | b: &Benchmark, 196 | mut count: impl FnMut(T) -> anyhow::Result, 197 | mut bench: impl FnMut() -> anyhow::Result, 198 | ) -> anyhow::Result> { 199 | let warmup_start = Instant::now(); 200 | for _ in 0..b.max_warmup_iters { 201 | let result = bench(); 202 | // We still compute the count in case there was a problem doing so, 203 | // even though we don't do anything with the count. 204 | let _count = count(result?)?; 205 | if warmup_start.elapsed() >= b.max_warmup_time { 206 | break; 207 | } 208 | } 209 | 210 | let mut samples = vec![]; 211 | let run_start = Instant::now(); 212 | for _ in 0..b.max_iters { 213 | let bench_start = Instant::now(); 214 | let result = bench(); 215 | let duration = bench_start.elapsed(); 216 | // Should be fine since it's unreasonable for a match count to 217 | // exceed u64::MAX. 
218 | let count = u64::try_from(count(result?)?).unwrap(); 219 | samples.push(Sample { duration, count }); 220 | if run_start.elapsed() >= b.max_time { 221 | break; 222 | } 223 | } 224 | Ok(samples) 225 | } 226 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | /Cargo.lock 2 | /artifacts 3 | /corpus 4 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | publish = false 3 | name = "aho-corasick-fuzz" 4 | version = "0.0.0" 5 | authors = ["Automatically generated"] 6 | edition = "2021" 7 | 8 | # Prevent this from interfering with workspaces 9 | [workspace] 10 | members = ["."] 11 | 12 | [dependencies] 13 | aho-corasick = { path = ".." } 14 | libfuzzer-sys = { version = "0.4", features = ["arbitrary-derive"] } 15 | 16 | [[bin]] 17 | name = "fuzz-find" 18 | path = "fuzz-targets/fuzz_find.rs" 19 | test = false 20 | doc = false 21 | 22 | [package.metadata] 23 | cargo-fuzz = true 24 | -------------------------------------------------------------------------------- /fuzz/fuzz-targets/fuzz_find.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use libfuzzer_sys::{arbitrary, fuzz_target}; 4 | 5 | use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind}; 6 | 7 | #[derive(arbitrary::Arbitrary, Debug, Clone)] 8 | enum Operation { 9 | Find(String), 10 | ReplaceAll(String, Vec), 11 | } 12 | 13 | #[derive(arbitrary::Arbitrary, Debug, Clone)] 14 | struct Inputs { 15 | patterns: Vec, 16 | kind: u8, 17 | match_kind: u8, 18 | ascii_case_insensitive: bool, 19 | dense_depth: Option, 20 | prefilter: bool, 21 | operation: Operation, 22 | byte_classes: bool, 23 | } 24 | 25 | fuzz_target!(|input: Inputs| { 26 | let mut acb = AhoCorasick::builder(); 27 | 
acb.ascii_case_insensitive(input.ascii_case_insensitive) 28 | .prefilter(input.prefilter) 29 | .byte_classes(input.byte_classes); 30 | match input.kind % 5 { 31 | 0 => &mut acb, 32 | 1 => acb.kind(None), 33 | 2 => acb.kind(Some(AhoCorasickKind::NoncontiguousNFA)), 34 | 3 => acb.kind(Some(AhoCorasickKind::ContiguousNFA)), 35 | 4 => acb.kind(Some(AhoCorasickKind::DFA)), 36 | _ => unreachable!(), 37 | }; 38 | match input.match_kind % 4 { 39 | 0 => &mut acb, 40 | 1 => acb.match_kind(MatchKind::Standard), 41 | 2 => acb.match_kind(MatchKind::LeftmostFirst), 42 | 3 => acb.match_kind(MatchKind::LeftmostLongest), 43 | _ => unreachable!(), 44 | }; 45 | if let Some(dense_depth) = input.dense_depth { 46 | acb.dense_depth(dense_depth); 47 | } 48 | 49 | let num_patterns = input.patterns.len(); 50 | let ac = acb.build(input.patterns).unwrap(); 51 | match input.operation { 52 | Operation::Find(haystack) => { 53 | ac.find(&haystack); 54 | } 55 | Operation::ReplaceAll(haystack, substitutions) => { 56 | if substitutions.len() == num_patterns { 57 | ac.replace_all(&haystack, &substitutions); 58 | } 59 | } 60 | } 61 | }); 62 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 79 2 | use_small_heuristics = "max" 3 | -------------------------------------------------------------------------------- /src/macros.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_macros)] 2 | 3 | macro_rules! log { 4 | ($($tt:tt)*) => { 5 | #[cfg(feature = "logging")] 6 | { 7 | $($tt)* 8 | } 9 | } 10 | } 11 | 12 | macro_rules! debug { 13 | ($($tt:tt)*) => { log!(log::debug!($($tt)*)) } 14 | } 15 | 16 | macro_rules! 
trace { 17 | ($($tt:tt)*) => { log!(log::trace!($($tt)*)) } 18 | } 19 | -------------------------------------------------------------------------------- /src/nfa/mod.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Provides direct access to NFA implementations of Aho-Corasick. 3 | 4 | The principle characteristic of an NFA in this crate is that it may 5 | transition through multiple states per byte of haystack. In Aho-Corasick 6 | parlance, NFAs follow failure transitions during a search. In contrast, 7 | a [`DFA`](crate::dfa::DFA) pre-computes all failure transitions during 8 | compilation at the expense of a much bigger memory footprint. 9 | 10 | Currently, there are two NFA implementations provided: noncontiguous and 11 | contiguous. The names reflect their internal representation, and consequently, 12 | the trade offs associated with them: 13 | 14 | * A [`noncontiguous::NFA`] uses a separate allocation for every NFA state to 15 | represent its transitions in a sparse format. This is ideal for building an 16 | NFA, since it cheaply permits different states to have a different number of 17 | transitions. A noncontiguous NFA is where the main Aho-Corasick construction 18 | algorithm is implemented. All other Aho-Corasick implementations are built by 19 | first constructing a noncontiguous NFA. 20 | * A [`contiguous::NFA`] is uses a single allocation to represent all states, 21 | while still encoding most states as sparse states but permitting states near 22 | the starting state to have a dense representation. The dense representation 23 | uses more memory, but permits computing transitions during a search more 24 | quickly. By only making the most active states dense (the states near the 25 | starting state), a contiguous NFA better balances memory usage with search 26 | speed. 
The single contiguous allocation also uses less overhead per state and 27 | enables compression tricks where most states only use 8 bytes of heap memory. 28 | 29 | When given the choice between these two, you almost always want to pick a 30 | contiguous NFA. It takes only a little longer to build, but both its memory 31 | usage and search speed are typically much better than a noncontiguous NFA. A 32 | noncontiguous NFA is useful when prioritizing build times, or when there are 33 | so many patterns that a contiguous NFA could not be built. (Currently, because 34 | of both memory and search speed improvements, a contiguous NFA has a smaller 35 | internal limit on the total number of NFA states it can represent. But you 36 | would likely need to have hundreds of thousands or even millions of patterns 37 | before you hit this limit.) 38 | */ 39 | pub mod contiguous; 40 | pub mod noncontiguous; 41 | -------------------------------------------------------------------------------- /src/packed/ext.rs: -------------------------------------------------------------------------------- 1 | /// A trait for adding some helper routines to pointers. 2 | pub(crate) trait Pointer { 3 | /// Returns the distance, in units of `T`, between `self` and `origin`. 4 | /// 5 | /// # Safety 6 | /// 7 | /// Same as `ptr::offset_from` in addition to `self >= origin`. 8 | unsafe fn distance(self, origin: Self) -> usize; 9 | 10 | /// Casts this pointer to `usize`. 11 | /// 12 | /// Callers should not convert the `usize` back to a pointer if at all 13 | /// possible. (And if you believe it's necessary, open an issue to discuss 14 | /// why. Otherwise, it has the potential to violate pointer provenance.) 15 | /// The purpose of this function is just to be able to do arithmetic, i.e., 16 | /// computing offsets or alignments. 
17 | fn as_usize(self) -> usize; 18 | } 19 | 20 | impl Pointer for *const T { 21 | unsafe fn distance(self, origin: *const T) -> usize { 22 | // TODO: Replace with `ptr::sub_ptr` once stabilized. 23 | usize::try_from(self.offset_from(origin)).unwrap_unchecked() 24 | } 25 | 26 | fn as_usize(self) -> usize { 27 | self as usize 28 | } 29 | } 30 | 31 | impl Pointer for *mut T { 32 | unsafe fn distance(self, origin: *mut T) -> usize { 33 | (self as *const T).distance(origin as *const T) 34 | } 35 | 36 | fn as_usize(self) -> usize { 37 | (self as *const T).as_usize() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/packed/mod.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Provides packed multiple substring search, principally for a small number of 3 | patterns. 4 | 5 | This sub-module provides vectorized routines for quickly finding 6 | matches of a small number of patterns. In general, users of this crate 7 | shouldn't need to interface with this module directly, as the primary 8 | [`AhoCorasick`](crate::AhoCorasick) searcher will use these routines 9 | automatically as a prefilter when applicable. However, in some cases, callers 10 | may want to bypass the Aho-Corasick machinery entirely and use this vectorized 11 | searcher directly. 12 | 13 | # Overview 14 | 15 | The primary types in this sub-module are: 16 | 17 | * [`Searcher`] executes the actual search algorithm to report matches in a 18 | haystack. 19 | * [`Builder`] accumulates patterns incrementally and can construct a 20 | `Searcher`. 21 | * [`Config`] permits tuning the searcher, and itself will produce a `Builder` 22 | (which can then be used to build a `Searcher`). Currently, the only tuneable 23 | knob are the match semantics, but this may be expanded in the future. 24 | 25 | # Examples 26 | 27 | This example shows how to create a searcher from an iterator of patterns. 
28 | By default, leftmost-first match semantics are used. (See the top-level 29 | [`MatchKind`] type for more details about match semantics, which apply 30 | similarly to packed substring search.) 31 | 32 | ``` 33 | use aho_corasick::{packed::{MatchKind, Searcher}, PatternID}; 34 | 35 | # fn example() -> Option<()> { 36 | let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 37 | let matches: Vec = searcher 38 | .find_iter("foobar") 39 | .map(|mat| mat.pattern()) 40 | .collect(); 41 | assert_eq!(vec![PatternID::ZERO], matches); 42 | # Some(()) } 43 | # if cfg!(all(feature = "std", any( 44 | # target_arch = "x86_64", target_arch = "aarch64", 45 | # ))) { 46 | # example().unwrap() 47 | # } else { 48 | # assert!(example().is_none()); 49 | # } 50 | ``` 51 | 52 | This example shows how to use [`Config`] to change the match semantics to 53 | leftmost-longest: 54 | 55 | ``` 56 | use aho_corasick::{packed::{Config, MatchKind}, PatternID}; 57 | 58 | # fn example() -> Option<()> { 59 | let searcher = Config::new() 60 | .match_kind(MatchKind::LeftmostLongest) 61 | .builder() 62 | .add("foo") 63 | .add("foobar") 64 | .build()?; 65 | let matches: Vec = searcher 66 | .find_iter("foobar") 67 | .map(|mat| mat.pattern()) 68 | .collect(); 69 | assert_eq!(vec![PatternID::must(1)], matches); 70 | # Some(()) } 71 | # if cfg!(all(feature = "std", any( 72 | # target_arch = "x86_64", target_arch = "aarch64", 73 | # ))) { 74 | # example().unwrap() 75 | # } else { 76 | # assert!(example().is_none()); 77 | # } 78 | ``` 79 | 80 | # Packed substring searching 81 | 82 | Packed substring searching refers to the use of SIMD (Single Instruction, 83 | Multiple Data) to accelerate the detection of matches in a haystack. 
Unlike 84 | conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring 85 | search tend to do better with a small number of patterns, where as Aho-Corasick 86 | generally maintains reasonably consistent performance regardless of the number 87 | of patterns you give it. Because of this, the vectorized searcher in this 88 | sub-module cannot be used as a general purpose searcher, since building the 89 | searcher may fail even when given a small number of patterns. However, in 90 | exchange, when searching for a small number of patterns, searching can be quite 91 | a bit faster than Aho-Corasick (sometimes by an order of magnitude). 92 | 93 | The key take away here is that constructing a searcher from a list of patterns 94 | is a fallible operation with no clear rules for when it will fail. While the 95 | precise conditions under which building a searcher can fail is specifically an 96 | implementation detail, here are some common reasons: 97 | 98 | * Too many patterns were given. Typically, the limit is on the order of 100 or 99 | so, but this limit may fluctuate based on available CPU features. 100 | * The available packed algorithms require CPU features that aren't available. 101 | For example, currently, this crate only provides packed algorithms for 102 | `x86_64` and `aarch64`. Therefore, constructing a packed searcher on any 103 | other target will always fail. 104 | * Zero patterns were given, or one of the patterns given was empty. Packed 105 | searchers require at least one pattern and that all patterns are non-empty. 106 | * Something else about the nature of the patterns (typically based on 107 | heuristics) suggests that a packed searcher would perform very poorly, so 108 | no searcher is built. 
109 | */ 110 | 111 | pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher}; 112 | 113 | mod api; 114 | mod ext; 115 | mod pattern; 116 | mod rabinkarp; 117 | mod teddy; 118 | #[cfg(all(feature = "std", test))] 119 | mod tests; 120 | mod vector; 121 | -------------------------------------------------------------------------------- /src/packed/rabinkarp.rs: -------------------------------------------------------------------------------- 1 | use alloc::{sync::Arc, vec, vec::Vec}; 2 | 3 | use crate::{packed::pattern::Patterns, util::search::Match, PatternID}; 4 | 5 | /// The type of the rolling hash used in the Rabin-Karp algorithm. 6 | type Hash = usize; 7 | 8 | /// The number of buckets to store our patterns in. We don't want this to be 9 | /// too big in order to avoid wasting memory, but we don't want it to be too 10 | /// small either to avoid spending too much time confirming literals. 11 | /// 12 | /// The number of buckets MUST be a power of two. Otherwise, determining the 13 | /// bucket from a hash will slow down the code considerably. Using a power 14 | /// of two means `hash % NUM_BUCKETS` can compile down to a simple `and` 15 | /// instruction. 16 | const NUM_BUCKETS: usize = 64; 17 | 18 | /// An implementation of the Rabin-Karp algorithm. The main idea of this 19 | /// algorithm is to maintain a rolling hash as it moves through the input, and 20 | /// then check whether that hash corresponds to the same hash for any of the 21 | /// patterns we're looking for. 22 | /// 23 | /// A draw back of naively scaling Rabin-Karp to multiple patterns is that 24 | /// it requires all of the patterns to be the same length, which in turn 25 | /// corresponds to the number of bytes to hash. We adapt this to work for 26 | /// multiple patterns of varying size by fixing the number of bytes to hash 27 | /// to be the length of the smallest pattern. We also split the patterns into 28 | /// several buckets to hopefully make the confirmation step faster. 
29 | /// 30 | /// Wikipedia has a decent explanation, if a bit heavy on the theory: 31 | /// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm 32 | /// 33 | /// But ESMAJ provides something a bit more concrete: 34 | /// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html 35 | #[derive(Clone, Debug)] 36 | pub(crate) struct RabinKarp { 37 | /// The patterns we're searching for. 38 | patterns: Arc, 39 | /// The order of patterns in each bucket is significant. Namely, they are 40 | /// arranged such that the first one to match is the correct match. This 41 | /// may not necessarily correspond to the order provided by the caller. 42 | /// For example, if leftmost-longest semantics are used, then the patterns 43 | /// are sorted by their length in descending order. If leftmost-first 44 | /// semantics are used, then the patterns are sorted by their pattern ID 45 | /// in ascending order (which corresponds to the caller's order). 46 | buckets: Vec>, 47 | /// The length of the hashing window. Generally, this corresponds to the 48 | /// length of the smallest pattern. 49 | hash_len: usize, 50 | /// The factor to subtract out of a hash before updating it with a new 51 | /// byte. 52 | hash_2pow: usize, 53 | } 54 | 55 | impl RabinKarp { 56 | /// Compile a new Rabin-Karp matcher from the patterns given. 57 | /// 58 | /// This panics if any of the patterns in the collection are empty, or if 59 | /// the collection is itself empty. 
60 | pub(crate) fn new(patterns: &Arc) -> RabinKarp { 61 | assert!(patterns.len() >= 1); 62 | let hash_len = patterns.minimum_len(); 63 | assert!(hash_len >= 1); 64 | 65 | let mut hash_2pow = 1usize; 66 | for _ in 1..hash_len { 67 | hash_2pow = hash_2pow.wrapping_shl(1); 68 | } 69 | 70 | let mut rk = RabinKarp { 71 | patterns: Arc::clone(patterns), 72 | buckets: vec![vec![]; NUM_BUCKETS], 73 | hash_len, 74 | hash_2pow, 75 | }; 76 | for (id, pat) in patterns.iter() { 77 | let hash = rk.hash(&pat.bytes()[..rk.hash_len]); 78 | let bucket = hash % NUM_BUCKETS; 79 | rk.buckets[bucket].push((hash, id)); 80 | } 81 | rk 82 | } 83 | 84 | /// Return the first matching pattern in the given haystack, begining the 85 | /// search at `at`. 86 | pub(crate) fn find_at( 87 | &self, 88 | haystack: &[u8], 89 | mut at: usize, 90 | ) -> Option { 91 | assert_eq!(NUM_BUCKETS, self.buckets.len()); 92 | 93 | if at + self.hash_len > haystack.len() { 94 | return None; 95 | } 96 | let mut hash = self.hash(&haystack[at..at + self.hash_len]); 97 | loop { 98 | let bucket = &self.buckets[hash % NUM_BUCKETS]; 99 | for &(phash, pid) in bucket { 100 | if phash == hash { 101 | if let Some(c) = self.verify(pid, haystack, at) { 102 | return Some(c); 103 | } 104 | } 105 | } 106 | if at + self.hash_len >= haystack.len() { 107 | return None; 108 | } 109 | hash = self.update_hash( 110 | hash, 111 | haystack[at], 112 | haystack[at + self.hash_len], 113 | ); 114 | at += 1; 115 | } 116 | } 117 | 118 | /// Returns the approximate total amount of heap used by this searcher, in 119 | /// units of bytes. 120 | pub(crate) fn memory_usage(&self) -> usize { 121 | self.buckets.len() * core::mem::size_of::>() 122 | + self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>() 123 | } 124 | 125 | /// Verify whether the pattern with the given id matches at 126 | /// `haystack[at..]`. 127 | /// 128 | /// We tag this function as `cold` because it helps improve codegen. 
129 | /// Intuitively, it would seem like inlining it would be better. However, 130 | /// the only time this is called and a match is not found is when there 131 | /// there is a hash collision, or when a prefix of a pattern matches but 132 | /// the entire pattern doesn't match. This is hopefully fairly rare, and 133 | /// if it does occur a lot, it's going to be slow no matter what we do. 134 | #[cold] 135 | fn verify( 136 | &self, 137 | id: PatternID, 138 | haystack: &[u8], 139 | at: usize, 140 | ) -> Option { 141 | let pat = self.patterns.get(id); 142 | if pat.is_prefix(&haystack[at..]) { 143 | Some(Match::new(id, at..at + pat.len())) 144 | } else { 145 | None 146 | } 147 | } 148 | 149 | /// Hash the given bytes. 150 | fn hash(&self, bytes: &[u8]) -> Hash { 151 | assert_eq!(self.hash_len, bytes.len()); 152 | 153 | let mut hash = 0usize; 154 | for &b in bytes { 155 | hash = hash.wrapping_shl(1).wrapping_add(b as usize); 156 | } 157 | hash 158 | } 159 | 160 | /// Update the hash given based on removing `old_byte` at the beginning 161 | /// of some byte string, and appending `new_byte` to the end of that same 162 | /// byte string. 163 | fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash { 164 | prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow)) 165 | .wrapping_shl(1) 166 | .wrapping_add(new_byte as usize) 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/packed/teddy/mod.rs: -------------------------------------------------------------------------------- 1 | // Regrettable, but Teddy stuff just isn't used on all targets. And for some 2 | // targets, like aarch64, only "slim" Teddy is used and so "fat" Teddy gets a 3 | // bunch of dead-code warnings. Just not worth trying to squash them. Blech. 
4 | #![allow(dead_code)] 5 | 6 | pub(crate) use self::builder::{Builder, Searcher}; 7 | 8 | mod builder; 9 | mod generic; 10 | -------------------------------------------------------------------------------- /src/transducer.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Provides implementations of `fst::Automaton` for Aho-Corasick automata. 3 | 4 | This works by providing two wrapper types, [`Anchored`] and [`Unanchored`]. 5 | The former executes an anchored search on an FST while the latter executes 6 | an unanchored search. Building these wrappers is fallible and will fail if 7 | the underlying Aho-Corasick automaton does not support the type of search it 8 | represents. 9 | */ 10 | 11 | use crate::{ 12 | automaton::{Automaton, StateID}, 13 | Anchored as AcAnchored, Input, MatchError, 14 | }; 15 | 16 | /// Represents an unanchored Aho-Corasick search of a finite state transducer. 17 | /// 18 | /// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the 19 | /// underlying automaton does not support unanchored searches. 20 | /// 21 | /// # Example 22 | /// 23 | /// This shows how to build an FST of keys and then run an unanchored search on 24 | /// those keys using an Aho-Corasick automaton. 25 | /// 26 | /// ``` 27 | /// use aho_corasick::{nfa::contiguous::NFA, transducer::Unanchored}; 28 | /// use fst::{Automaton, IntoStreamer, Set, Streamer}; 29 | /// 30 | /// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap(); 31 | /// let nfa = NFA::new(&["bcd", "x"]).unwrap(); 32 | /// // NFAs always support both unanchored and anchored searches. 
33 | /// let searcher = Unanchored::new(&nfa).unwrap(); 34 | /// 35 | /// let mut stream = set.search(searcher).into_stream(); 36 | /// let mut results = vec![]; 37 | /// while let Some(key) = stream.next() { 38 | /// results.push(std::str::from_utf8(key).unwrap().to_string()); 39 | /// } 40 | /// assert_eq!(vec!["abcd", "bcd", "xyz"], results); 41 | /// ``` 42 | #[derive(Clone, Debug)] 43 | pub struct Unanchored(A); 44 | 45 | impl Unanchored { 46 | /// Create a new `Unanchored` implementation of the `fst::Automaton` trait. 47 | /// 48 | /// If the given Aho-Corasick automaton does not support unanchored 49 | /// searches, then this returns an error. 50 | pub fn new(aut: A) -> Result, MatchError> { 51 | let input = Input::new("").anchored(AcAnchored::No); 52 | let _ = aut.start_state(&input)?; 53 | Ok(Unanchored(aut)) 54 | } 55 | 56 | /// Returns a borrow to the underlying automaton. 57 | pub fn as_ref(&self) -> &A { 58 | &self.0 59 | } 60 | 61 | /// Unwrap this value and return the inner automaton. 62 | pub fn into_inner(self) -> A { 63 | self.0 64 | } 65 | } 66 | 67 | impl fst::Automaton for Unanchored { 68 | type State = StateID; 69 | 70 | #[inline] 71 | fn start(&self) -> StateID { 72 | let input = Input::new("").anchored(AcAnchored::No); 73 | self.0.start_state(&input).expect("support for unanchored searches") 74 | } 75 | 76 | #[inline] 77 | fn is_match(&self, state: &StateID) -> bool { 78 | self.0.is_match(*state) 79 | } 80 | 81 | #[inline] 82 | fn accept(&self, state: &StateID, byte: u8) -> StateID { 83 | if fst::Automaton::is_match(self, state) { 84 | return *state; 85 | } 86 | self.0.next_state(AcAnchored::No, *state, byte) 87 | } 88 | 89 | #[inline] 90 | fn can_match(&self, state: &StateID) -> bool { 91 | !self.0.is_dead(*state) 92 | } 93 | } 94 | 95 | /// Represents an anchored Aho-Corasick search of a finite state transducer. 
96 | /// 97 | /// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the 98 | /// underlying automaton does not support unanchored searches. 99 | /// 100 | /// # Example 101 | /// 102 | /// This shows how to build an FST of keys and then run an anchored search on 103 | /// those keys using an Aho-Corasick automaton. 104 | /// 105 | /// ``` 106 | /// use aho_corasick::{nfa::contiguous::NFA, transducer::Anchored}; 107 | /// use fst::{Automaton, IntoStreamer, Set, Streamer}; 108 | /// 109 | /// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap(); 110 | /// let nfa = NFA::new(&["bcd", "x"]).unwrap(); 111 | /// // NFAs always support both unanchored and anchored searches. 112 | /// let searcher = Anchored::new(&nfa).unwrap(); 113 | /// 114 | /// let mut stream = set.search(searcher).into_stream(); 115 | /// let mut results = vec![]; 116 | /// while let Some(key) = stream.next() { 117 | /// results.push(std::str::from_utf8(key).unwrap().to_string()); 118 | /// } 119 | /// assert_eq!(vec!["bcd", "xyz"], results); 120 | /// ``` 121 | /// 122 | /// This is like the example above, except we use an Aho-Corasick DFA, which 123 | /// requires explicitly configuring it to support anchored searches. (NFAs 124 | /// unconditionally support both unanchored and anchored searches.) 125 | /// 126 | /// ``` 127 | /// use aho_corasick::{dfa::DFA, transducer::Anchored, StartKind}; 128 | /// use fst::{Automaton, IntoStreamer, Set, Streamer}; 129 | /// 130 | /// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap(); 131 | /// let dfa = DFA::builder() 132 | /// .start_kind(StartKind::Anchored) 133 | /// .build(&["bcd", "x"]) 134 | /// .unwrap(); 135 | /// // We've explicitly configured our DFA to support anchored searches. 
136 | /// let searcher = Anchored::new(&dfa).unwrap(); 137 | /// 138 | /// let mut stream = set.search(searcher).into_stream(); 139 | /// let mut results = vec![]; 140 | /// while let Some(key) = stream.next() { 141 | /// results.push(std::str::from_utf8(key).unwrap().to_string()); 142 | /// } 143 | /// assert_eq!(vec!["bcd", "xyz"], results); 144 | /// ``` 145 | #[derive(Clone, Debug)] 146 | pub struct Anchored(A); 147 | 148 | impl Anchored { 149 | /// Create a new `Anchored` implementation of the `fst::Automaton` trait. 150 | /// 151 | /// If the given Aho-Corasick automaton does not support anchored searches, 152 | /// then this returns an error. 153 | pub fn new(aut: A) -> Result, MatchError> { 154 | let input = Input::new("").anchored(AcAnchored::Yes); 155 | let _ = aut.start_state(&input)?; 156 | Ok(Anchored(aut)) 157 | } 158 | 159 | /// Returns a borrow to the underlying automaton. 160 | pub fn as_ref(&self) -> &A { 161 | &self.0 162 | } 163 | 164 | /// Unwrap this value and return the inner automaton. 
165 | pub fn into_inner(self) -> A { 166 | self.0 167 | } 168 | } 169 | 170 | impl fst::Automaton for Anchored { 171 | type State = StateID; 172 | 173 | #[inline] 174 | fn start(&self) -> StateID { 175 | let input = Input::new("").anchored(AcAnchored::Yes); 176 | self.0.start_state(&input).expect("support for unanchored searches") 177 | } 178 | 179 | #[inline] 180 | fn is_match(&self, state: &StateID) -> bool { 181 | self.0.is_match(*state) 182 | } 183 | 184 | #[inline] 185 | fn accept(&self, state: &StateID, byte: u8) -> StateID { 186 | if fst::Automaton::is_match(self, state) { 187 | return *state; 188 | } 189 | self.0.next_state(AcAnchored::Yes, *state, byte) 190 | } 191 | 192 | #[inline] 193 | fn can_match(&self, state: &StateID) -> bool { 194 | !self.0.is_dead(*state) 195 | } 196 | } 197 | 198 | #[cfg(test)] 199 | mod tests { 200 | use alloc::{string::String, vec, vec::Vec}; 201 | 202 | use fst::{Automaton, IntoStreamer, Set, Streamer}; 203 | 204 | use crate::{ 205 | dfa::DFA, 206 | nfa::{contiguous, noncontiguous}, 207 | StartKind, 208 | }; 209 | 210 | use super::*; 211 | 212 | fn search>( 213 | set: &Set, 214 | aut: A, 215 | ) -> Vec { 216 | let mut stream = set.search(aut).into_stream(); 217 | let mut results = vec![]; 218 | while let Some(key) = stream.next() { 219 | results.push(String::from(core::str::from_utf8(key).unwrap())); 220 | } 221 | results 222 | } 223 | 224 | #[test] 225 | fn unanchored() { 226 | let set = 227 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 228 | .unwrap(); 229 | let patterns = vec!["baz", "bax"]; 230 | let expected = vec!["baz", "xbax"]; 231 | 232 | let aut = Unanchored(noncontiguous::NFA::new(&patterns).unwrap()); 233 | let got = search(&set, &aut); 234 | assert_eq!(got, expected); 235 | 236 | let aut = Unanchored(contiguous::NFA::new(&patterns).unwrap()); 237 | let got = search(&set, &aut); 238 | assert_eq!(got, expected); 239 | 240 | let aut = Unanchored(DFA::new(&patterns).unwrap()); 241 | let got = 
search(&set, &aut); 242 | assert_eq!(got, expected); 243 | } 244 | 245 | #[test] 246 | fn anchored() { 247 | let set = 248 | Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) 249 | .unwrap(); 250 | let patterns = vec!["baz", "bax"]; 251 | let expected = vec!["baz"]; 252 | 253 | let aut = Anchored(noncontiguous::NFA::new(&patterns).unwrap()); 254 | let got = search(&set, &aut); 255 | assert_eq!(got, expected); 256 | 257 | let aut = Anchored(contiguous::NFA::new(&patterns).unwrap()); 258 | let got = search(&set, &aut); 259 | assert_eq!(got, expected); 260 | 261 | let aut = Anchored( 262 | DFA::builder() 263 | .start_kind(StartKind::Anchored) 264 | .build(&patterns) 265 | .unwrap(), 266 | ); 267 | let got = search(&set, &aut); 268 | assert_eq!(got, expected); 269 | } 270 | } 271 | -------------------------------------------------------------------------------- /src/util/buffer.rs: -------------------------------------------------------------------------------- 1 | use alloc::{vec, vec::Vec}; 2 | 3 | /// The default buffer capacity that we use for the stream buffer. 4 | const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB 5 | 6 | /// A fairly simple roll buffer for supporting stream searches. 7 | /// 8 | /// This buffer acts as a temporary place to store a fixed amount of data when 9 | /// reading from a stream. Its central purpose is to allow "rolling" some 10 | /// suffix of the data to the beginning of the buffer before refilling it with 11 | /// more data from the stream. For example, let's say we are trying to match 12 | /// "foobar" on a stream. When we report the match, we'd like to not only 13 | /// report the correct offsets at which the match occurs, but also the matching 14 | /// bytes themselves. So let's say our stream is a file with the following 15 | /// contents: `test test foobar test test`. Now assume that we happen to read 16 | /// the aforementioned file in two chunks: `test test foo` and `bar test test`. 
/// Naively, it would not be possible to report a single contiguous `foobar`
/// match, but this roll buffer allows us to do that. Namely, after the second
/// read, the contents of the buffer should be `st foobar test test`, where the
/// search should ultimately resume immediately after `foo`. (The prefix `st `
/// is included because the roll buffer saves N bytes at the end of the buffer,
/// where N is the maximum possible length of a match.)
///
/// A lot of the logic for dealing with this is unfortunately split out between
/// this roll buffer and the `StreamChunkIter`.
///
/// Note also that this buffer is not actually required to just report matches.
/// Because a `Match` is just some offsets. But it *is* required for supporting
/// things like `try_stream_replace_all` because that needs some mechanism for
/// knowing which bytes in the stream correspond to a match and which don't. So
/// when a match occurs across two `read` calls, *something* needs to retain
/// the bytes from the previous `read` call because you don't know before the
/// second read call whether a match exists or not.
#[derive(Debug)]
pub(crate) struct Buffer {
    /// The raw buffer contents. This has a fixed size and never increases.
    buf: Vec<u8>,
    /// The minimum size of the buffer, which is equivalent to the maximum
    /// possible length of a match. This corresponds to the amount that we
    /// roll.
    min: usize,
    /// The end of the contents of this buffer.
    end: usize,
}

impl Buffer {
    /// Create a new buffer for stream searching. The minimum buffer length
    /// given should be the size of the maximum possible match length.
    pub(crate) fn new(min_buffer_len: usize) -> Buffer {
        let min = core::cmp::max(1, min_buffer_len);
        // The minimum buffer amount is also the amount that we roll our
        // buffer in order to support incremental searching. To this end,
        // our actual capacity needs to be at least 1 byte bigger than our
        // minimum amount, otherwise we won't have any overlap. In actuality,
        // we want our buffer to be a bit bigger than that for performance
        // reasons, so we set a lower bound of `8 * min`.
        //
        // TODO: It would be good to find a way to test the streaming
        // implementation with the minimal buffer size. For now, we just
        // uncomment out the next line and comment out the subsequent line.
        // let capacity = 1 + min;
        let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
        Buffer { buf: vec![0; capacity], min, end: 0 }
    }

    /// Return the contents of this buffer.
    #[inline]
    pub(crate) fn buffer(&self) -> &[u8] {
        &self.buf[..self.end]
    }

    /// Return the minimum size of the buffer. The only way a buffer may be
    /// smaller than this is if the stream itself contains less than the
    /// minimum buffer amount.
    #[inline]
    pub(crate) fn min_buffer_len(&self) -> usize {
        self.min
    }

    /// Return all free capacity in this buffer.
    fn free_buffer(&mut self) -> &mut [u8] {
        &mut self.buf[self.end..]
    }

    /// Refill the contents of this buffer by reading as much as possible into
    /// this buffer's free capacity. If no more bytes could be read, then this
    /// returns false. Otherwise, this reads until it has filled the buffer
    /// past the minimum amount.
89 | pub(crate) fn fill( 90 | &mut self, 91 | mut rdr: R, 92 | ) -> std::io::Result { 93 | let mut readany = false; 94 | loop { 95 | let readlen = rdr.read(self.free_buffer())?; 96 | if readlen == 0 { 97 | return Ok(readany); 98 | } 99 | readany = true; 100 | self.end += readlen; 101 | if self.buffer().len() >= self.min { 102 | return Ok(true); 103 | } 104 | } 105 | } 106 | 107 | /// Roll the contents of the buffer so that the suffix of this buffer is 108 | /// moved to the front and all other contents are dropped. The size of the 109 | /// suffix corresponds precisely to the minimum buffer length. 110 | /// 111 | /// This should only be called when the entire contents of this buffer have 112 | /// been searched. 113 | pub(crate) fn roll(&mut self) { 114 | let roll_start = self 115 | .end 116 | .checked_sub(self.min) 117 | .expect("buffer capacity should be bigger than minimum amount"); 118 | let roll_end = roll_start + self.min; 119 | 120 | assert!(roll_end <= self.end); 121 | self.buf.copy_within(roll_start..roll_end, 0); 122 | self.end = self.min; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/util/byte_frequencies.rs: -------------------------------------------------------------------------------- 1 | pub const BYTE_FREQUENCIES: [u8; 256] = [ 2 | 55, // '\x00' 3 | 52, // '\x01' 4 | 51, // '\x02' 5 | 50, // '\x03' 6 | 49, // '\x04' 7 | 48, // '\x05' 8 | 47, // '\x06' 9 | 46, // '\x07' 10 | 45, // '\x08' 11 | 103, // '\t' 12 | 242, // '\n' 13 | 66, // '\x0b' 14 | 67, // '\x0c' 15 | 229, // '\r' 16 | 44, // '\x0e' 17 | 43, // '\x0f' 18 | 42, // '\x10' 19 | 41, // '\x11' 20 | 40, // '\x12' 21 | 39, // '\x13' 22 | 38, // '\x14' 23 | 37, // '\x15' 24 | 36, // '\x16' 25 | 35, // '\x17' 26 | 34, // '\x18' 27 | 33, // '\x19' 28 | 56, // '\x1a' 29 | 32, // '\x1b' 30 | 31, // '\x1c' 31 | 30, // '\x1d' 32 | 29, // '\x1e' 33 | 28, // '\x1f' 34 | 255, // ' ' 35 | 148, // '!' 
36 | 164, // '"' 37 | 149, // '#' 38 | 136, // '$' 39 | 160, // '%' 40 | 155, // '&' 41 | 173, // "'" 42 | 221, // '(' 43 | 222, // ')' 44 | 134, // '*' 45 | 122, // '+' 46 | 232, // ',' 47 | 202, // '-' 48 | 215, // '.' 49 | 224, // '/' 50 | 208, // '0' 51 | 220, // '1' 52 | 204, // '2' 53 | 187, // '3' 54 | 183, // '4' 55 | 179, // '5' 56 | 177, // '6' 57 | 168, // '7' 58 | 178, // '8' 59 | 200, // '9' 60 | 226, // ':' 61 | 195, // ';' 62 | 154, // '<' 63 | 184, // '=' 64 | 174, // '>' 65 | 126, // '?' 66 | 120, // '@' 67 | 191, // 'A' 68 | 157, // 'B' 69 | 194, // 'C' 70 | 170, // 'D' 71 | 189, // 'E' 72 | 162, // 'F' 73 | 161, // 'G' 74 | 150, // 'H' 75 | 193, // 'I' 76 | 142, // 'J' 77 | 137, // 'K' 78 | 171, // 'L' 79 | 176, // 'M' 80 | 185, // 'N' 81 | 167, // 'O' 82 | 186, // 'P' 83 | 112, // 'Q' 84 | 175, // 'R' 85 | 192, // 'S' 86 | 188, // 'T' 87 | 156, // 'U' 88 | 140, // 'V' 89 | 143, // 'W' 90 | 123, // 'X' 91 | 133, // 'Y' 92 | 128, // 'Z' 93 | 147, // '[' 94 | 138, // '\\' 95 | 146, // ']' 96 | 114, // '^' 97 | 223, // '_' 98 | 151, // '`' 99 | 249, // 'a' 100 | 216, // 'b' 101 | 238, // 'c' 102 | 236, // 'd' 103 | 253, // 'e' 104 | 227, // 'f' 105 | 218, // 'g' 106 | 230, // 'h' 107 | 247, // 'i' 108 | 135, // 'j' 109 | 180, // 'k' 110 | 241, // 'l' 111 | 233, // 'm' 112 | 246, // 'n' 113 | 244, // 'o' 114 | 231, // 'p' 115 | 139, // 'q' 116 | 245, // 'r' 117 | 243, // 's' 118 | 251, // 't' 119 | 235, // 'u' 120 | 201, // 'v' 121 | 196, // 'w' 122 | 240, // 'x' 123 | 214, // 'y' 124 | 152, // 'z' 125 | 182, // '{' 126 | 205, // '|' 127 | 181, // '}' 128 | 127, // '~' 129 | 27, // '\x7f' 130 | 212, // '\x80' 131 | 211, // '\x81' 132 | 210, // '\x82' 133 | 213, // '\x83' 134 | 228, // '\x84' 135 | 197, // '\x85' 136 | 169, // '\x86' 137 | 159, // '\x87' 138 | 131, // '\x88' 139 | 172, // '\x89' 140 | 105, // '\x8a' 141 | 80, // '\x8b' 142 | 98, // '\x8c' 143 | 96, // '\x8d' 144 | 97, // '\x8e' 145 | 81, // '\x8f' 146 | 207, // '\x90' 147 | 145, // 
'\x91' 148 | 116, // '\x92' 149 | 115, // '\x93' 150 | 144, // '\x94' 151 | 130, // '\x95' 152 | 153, // '\x96' 153 | 121, // '\x97' 154 | 107, // '\x98' 155 | 132, // '\x99' 156 | 109, // '\x9a' 157 | 110, // '\x9b' 158 | 124, // '\x9c' 159 | 111, // '\x9d' 160 | 82, // '\x9e' 161 | 108, // '\x9f' 162 | 118, // '\xa0' 163 | 141, // '¡' 164 | 113, // '¢' 165 | 129, // '£' 166 | 119, // '¤' 167 | 125, // '¥' 168 | 165, // '¦' 169 | 117, // '§' 170 | 92, // '¨' 171 | 106, // '©' 172 | 83, // 'ª' 173 | 72, // '«' 174 | 99, // '¬' 175 | 93, // '\xad' 176 | 65, // '®' 177 | 79, // '¯' 178 | 166, // '°' 179 | 237, // '±' 180 | 163, // '²' 181 | 199, // '³' 182 | 190, // '´' 183 | 225, // 'µ' 184 | 209, // '¶' 185 | 203, // '·' 186 | 198, // '¸' 187 | 217, // '¹' 188 | 219, // 'º' 189 | 206, // '»' 190 | 234, // '¼' 191 | 248, // '½' 192 | 158, // '¾' 193 | 239, // '¿' 194 | 255, // 'À' 195 | 255, // 'Á' 196 | 255, // 'Â' 197 | 255, // 'Ã' 198 | 255, // 'Ä' 199 | 255, // 'Å' 200 | 255, // 'Æ' 201 | 255, // 'Ç' 202 | 255, // 'È' 203 | 255, // 'É' 204 | 255, // 'Ê' 205 | 255, // 'Ë' 206 | 255, // 'Ì' 207 | 255, // 'Í' 208 | 255, // 'Î' 209 | 255, // 'Ï' 210 | 255, // 'Ð' 211 | 255, // 'Ñ' 212 | 255, // 'Ò' 213 | 255, // 'Ó' 214 | 255, // 'Ô' 215 | 255, // 'Õ' 216 | 255, // 'Ö' 217 | 255, // '×' 218 | 255, // 'Ø' 219 | 255, // 'Ù' 220 | 255, // 'Ú' 221 | 255, // 'Û' 222 | 255, // 'Ü' 223 | 255, // 'Ý' 224 | 255, // 'Þ' 225 | 255, // 'ß' 226 | 255, // 'à' 227 | 255, // 'á' 228 | 255, // 'â' 229 | 255, // 'ã' 230 | 255, // 'ä' 231 | 255, // 'å' 232 | 255, // 'æ' 233 | 255, // 'ç' 234 | 255, // 'è' 235 | 255, // 'é' 236 | 255, // 'ê' 237 | 255, // 'ë' 238 | 255, // 'ì' 239 | 255, // 'í' 240 | 255, // 'î' 241 | 255, // 'ï' 242 | 255, // 'ð' 243 | 255, // 'ñ' 244 | 255, // 'ò' 245 | 255, // 'ó' 246 | 255, // 'ô' 247 | 255, // 'õ' 248 | 255, // 'ö' 249 | 255, // '÷' 250 | 255, // 'ø' 251 | 255, // 'ù' 252 | 255, // 'ú' 253 | 255, // 'û' 254 | 255, // 'ü' 255 | 255, // 'ý' 256 | 
255, // 'þ' 257 | 255, // 'ÿ' 258 | ]; 259 | -------------------------------------------------------------------------------- /src/util/debug.rs: -------------------------------------------------------------------------------- 1 | /// A type that wraps a single byte with a convenient fmt::Debug impl that 2 | /// escapes the byte. 3 | pub(crate) struct DebugByte(pub(crate) u8); 4 | 5 | impl core::fmt::Debug for DebugByte { 6 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 7 | // Special case ASCII space. It's too hard to read otherwise, so 8 | // put quotes around it. I sometimes wonder whether just '\x20' would 9 | // be better... 10 | if self.0 == b' ' { 11 | return write!(f, "' '"); 12 | } 13 | // 10 bytes is enough to cover any output from ascii::escape_default. 14 | let mut bytes = [0u8; 10]; 15 | let mut len = 0; 16 | for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { 17 | // capitalize \xab to \xAB 18 | if i >= 2 && b'a' <= b && b <= b'f' { 19 | b -= 32; 20 | } 21 | bytes[len] = b; 22 | len += 1; 23 | } 24 | write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/util/error.rs: -------------------------------------------------------------------------------- 1 | use crate::util::{ 2 | primitives::{PatternID, SmallIndex}, 3 | search::MatchKind, 4 | }; 5 | 6 | /// An error that occurred during the construction of an Aho-Corasick 7 | /// automaton. 8 | /// 9 | /// Build errors occur when some kind of limit has been exceeded, either in the 10 | /// number of states, the number of patterns of the length of a pattern. These 11 | /// limits aren't part of the public API, but they should generally be large 12 | /// enough to handle most use cases. 13 | /// 14 | /// When the `std` feature is enabled, this implements the `std::error::Error` 15 | /// trait. 
16 | #[derive(Clone, Debug)] 17 | pub struct BuildError { 18 | kind: ErrorKind, 19 | } 20 | 21 | /// The kind of error that occurred. 22 | #[derive(Clone, Debug)] 23 | enum ErrorKind { 24 | /// An error that occurs when allocating a new state would result in an 25 | /// identifier that exceeds the capacity of a `StateID`. 26 | StateIDOverflow { 27 | /// The maximum possible id. 28 | max: u64, 29 | /// The maximum ID requested. 30 | requested_max: u64, 31 | }, 32 | /// An error that occurs when adding a pattern to an Aho-Corasick 33 | /// automaton would result in an identifier that exceeds the capacity of a 34 | /// `PatternID`. 35 | PatternIDOverflow { 36 | /// The maximum possible id. 37 | max: u64, 38 | /// The maximum ID requested. 39 | requested_max: u64, 40 | }, 41 | /// Occurs when a pattern string is given to the Aho-Corasick constructor 42 | /// that is too long. 43 | PatternTooLong { 44 | /// The ID of the pattern that was too long. 45 | pattern: PatternID, 46 | /// The length that was too long. 
47 | len: usize, 48 | }, 49 | } 50 | 51 | impl BuildError { 52 | pub(crate) fn state_id_overflow( 53 | max: u64, 54 | requested_max: u64, 55 | ) -> BuildError { 56 | BuildError { kind: ErrorKind::StateIDOverflow { max, requested_max } } 57 | } 58 | 59 | pub(crate) fn pattern_id_overflow( 60 | max: u64, 61 | requested_max: u64, 62 | ) -> BuildError { 63 | BuildError { 64 | kind: ErrorKind::PatternIDOverflow { max, requested_max }, 65 | } 66 | } 67 | 68 | pub(crate) fn pattern_too_long( 69 | pattern: PatternID, 70 | len: usize, 71 | ) -> BuildError { 72 | BuildError { kind: ErrorKind::PatternTooLong { pattern, len } } 73 | } 74 | } 75 | 76 | #[cfg(feature = "std")] 77 | impl std::error::Error for BuildError {} 78 | 79 | impl core::fmt::Display for BuildError { 80 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 81 | match self.kind { 82 | ErrorKind::StateIDOverflow { max, requested_max } => { 83 | write!( 84 | f, 85 | "state identifier overflow: failed to create state ID \ 86 | from {}, which exceeds the max of {}", 87 | requested_max, max, 88 | ) 89 | } 90 | ErrorKind::PatternIDOverflow { max, requested_max } => { 91 | write!( 92 | f, 93 | "pattern identifier overflow: failed to create pattern ID \ 94 | from {}, which exceeds the max of {}", 95 | requested_max, max, 96 | ) 97 | } 98 | ErrorKind::PatternTooLong { pattern, len } => { 99 | write!( 100 | f, 101 | "pattern {} with length {} exceeds \ 102 | the maximum pattern length of {}", 103 | pattern.as_usize(), 104 | len, 105 | SmallIndex::MAX.as_usize(), 106 | ) 107 | } 108 | } 109 | } 110 | } 111 | 112 | /// An error that occurred during an Aho-Corasick search. 113 | /// 114 | /// An error that occurs during a search is limited to some kind of 115 | /// misconfiguration that resulted in an illegal call. Stated differently, 116 | /// whether an error occurs is not dependent on the specific bytes in the 117 | /// haystack. 
118 | /// 119 | /// Examples of misconfiguration: 120 | /// 121 | /// * Executing a stream or overlapping search on a searcher that was built was 122 | /// something other than [`MatchKind::Standard`](crate::MatchKind::Standard) 123 | /// semantics. 124 | /// * Requested an anchored or an unanchored search on a searcher that doesn't 125 | /// support unanchored or anchored searches, respectively. 126 | /// 127 | /// When the `std` feature is enabled, this implements the `std::error::Error` 128 | /// trait. 129 | #[derive(Clone, Debug, Eq, PartialEq)] 130 | pub struct MatchError(alloc::boxed::Box); 131 | 132 | impl MatchError { 133 | /// Create a new error value with the given kind. 134 | /// 135 | /// This is a more verbose version of the kind-specific constructors, e.g., 136 | /// `MatchError::unsupported_stream`. 137 | pub fn new(kind: MatchErrorKind) -> MatchError { 138 | MatchError(alloc::boxed::Box::new(kind)) 139 | } 140 | 141 | /// Returns a reference to the underlying error kind. 142 | pub fn kind(&self) -> &MatchErrorKind { 143 | &self.0 144 | } 145 | 146 | /// Create a new "invalid anchored search" error. This occurs when the 147 | /// caller requests an anchored search but where anchored searches aren't 148 | /// supported. 149 | /// 150 | /// This is the same as calling `MatchError::new` with a 151 | /// [`MatchErrorKind::InvalidInputAnchored`] kind. 152 | pub fn invalid_input_anchored() -> MatchError { 153 | MatchError::new(MatchErrorKind::InvalidInputAnchored) 154 | } 155 | 156 | /// Create a new "invalid unanchored search" error. This occurs when the 157 | /// caller requests an unanchored search but where unanchored searches 158 | /// aren't supported. 159 | /// 160 | /// This is the same as calling `MatchError::new` with a 161 | /// [`MatchErrorKind::InvalidInputUnanchored`] kind. 
162 | pub fn invalid_input_unanchored() -> MatchError { 163 | MatchError::new(MatchErrorKind::InvalidInputUnanchored) 164 | } 165 | 166 | /// Create a new "unsupported stream search" error. This occurs when the 167 | /// caller requests a stream search while using an Aho-Corasick automaton 168 | /// with a match kind other than [`MatchKind::Standard`]. 169 | /// 170 | /// The match kind given should be the match kind of the automaton. It 171 | /// should never be `MatchKind::Standard`. 172 | pub fn unsupported_stream(got: MatchKind) -> MatchError { 173 | MatchError::new(MatchErrorKind::UnsupportedStream { got }) 174 | } 175 | 176 | /// Create a new "unsupported overlapping search" error. This occurs when 177 | /// the caller requests an overlapping search while using an Aho-Corasick 178 | /// automaton with a match kind other than [`MatchKind::Standard`]. 179 | /// 180 | /// The match kind given should be the match kind of the automaton. It 181 | /// should never be `MatchKind::Standard`. 182 | pub fn unsupported_overlapping(got: MatchKind) -> MatchError { 183 | MatchError::new(MatchErrorKind::UnsupportedOverlapping { got }) 184 | } 185 | 186 | /// Create a new "unsupported empty pattern" error. This occurs when the 187 | /// caller requests a search for which matching an automaton that contains 188 | /// an empty pattern string is not supported. 189 | pub fn unsupported_empty() -> MatchError { 190 | MatchError::new(MatchErrorKind::UnsupportedEmpty) 191 | } 192 | } 193 | 194 | /// The underlying kind of a [`MatchError`]. 195 | /// 196 | /// This is a **non-exhaustive** enum. That means new variants may be added in 197 | /// a semver-compatible release. 198 | #[non_exhaustive] 199 | #[derive(Clone, Debug, Eq, PartialEq)] 200 | pub enum MatchErrorKind { 201 | /// An error indicating that an anchored search was requested, but from a 202 | /// searcher that was built without anchored support. 
203 | InvalidInputAnchored, 204 | /// An error indicating that an unanchored search was requested, but from a 205 | /// searcher that was built without unanchored support. 206 | InvalidInputUnanchored, 207 | /// An error indicating that a stream search was attempted on an 208 | /// Aho-Corasick automaton with an unsupported `MatchKind`. 209 | UnsupportedStream { 210 | /// The match semantics for the automaton that was used. 211 | got: MatchKind, 212 | }, 213 | /// An error indicating that an overlapping search was attempted on an 214 | /// Aho-Corasick automaton with an unsupported `MatchKind`. 215 | UnsupportedOverlapping { 216 | /// The match semantics for the automaton that was used. 217 | got: MatchKind, 218 | }, 219 | /// An error indicating that the operation requested doesn't support 220 | /// automatons that contain an empty pattern string. 221 | UnsupportedEmpty, 222 | } 223 | 224 | #[cfg(feature = "std")] 225 | impl std::error::Error for MatchError {} 226 | 227 | impl core::fmt::Display for MatchError { 228 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 229 | match *self.kind() { 230 | MatchErrorKind::InvalidInputAnchored => { 231 | write!(f, "anchored searches are not supported or enabled") 232 | } 233 | MatchErrorKind::InvalidInputUnanchored => { 234 | write!(f, "unanchored searches are not supported or enabled") 235 | } 236 | MatchErrorKind::UnsupportedStream { got } => { 237 | write!( 238 | f, 239 | "match kind {:?} does not support stream searching", 240 | got, 241 | ) 242 | } 243 | MatchErrorKind::UnsupportedOverlapping { got } => { 244 | write!( 245 | f, 246 | "match kind {:?} does not support overlapping searches", 247 | got, 248 | ) 249 | } 250 | MatchErrorKind::UnsupportedEmpty => { 251 | write!( 252 | f, 253 | "matching with an empty pattern string is not \ 254 | supported for this operation", 255 | ) 256 | } 257 | } 258 | } 259 | } 260 | -------------------------------------------------------------------------------- 
/*!
This module provides several integer oriented traits for converting between
both fixed size integers and integers whose size varies based on the target
(like `usize`).

The main design principle for this module is to centralize all uses of `as`.
The thinking here is that `as` makes it very easy to perform accidental lossy
conversions, and if we centralize all its uses here under more descriptive
higher level operations, its use and correctness becomes easier to audit.

This was copied mostly wholesale from `regex-automata`.

NOTE: for simplicity, we don't take target pointer width into account here for
`usize` conversions. Since we currently only panic in debug mode, skipping the
check when it can be proven it isn't needed at compile time doesn't really
matter. Now, if we wind up wanting to do as many checks as possible in release
mode, then we would want to skip those when we know the conversions are always
non-lossy.
*/

// We define a little more than what we need, but I'd rather just have
// everything via a consistent and uniform API then have holes.
#![allow(dead_code)]

/// Conversions from `u8`. A `u8` always fits in a `usize`, so no check is
/// needed.
pub(crate) trait U8 {
    fn as_usize(self) -> usize;
}

impl U8 for u8 {
    #[inline]
    fn as_usize(self) -> usize {
        usize::from(self)
    }
}

/// Conversions from `u16`: infallible widening to `usize` and extraction of
/// the low/high bytes.
pub(crate) trait U16 {
    fn as_usize(self) -> usize;
    fn low_u8(self) -> u8;
    fn high_u8(self) -> u8;
}

impl U16 for u16 {
    #[inline]
    fn as_usize(self) -> usize {
        usize::from(self)
    }

    #[inline]
    fn low_u8(self) -> u8 {
        // Truncation to the low byte is intentional.
        self as u8
    }

    #[inline]
    fn high_u8(self) -> u8 {
        (self >> 8) as u8
    }
}

/// Conversions from `u32`. `as_usize` is checked in debug mode since a
/// `usize` may be 16 bits on some targets.
pub(crate) trait U32 {
    fn as_usize(self) -> usize;
    fn low_u8(self) -> u8;
    fn low_u16(self) -> u16;
    fn high_u16(self) -> u16;
}

impl U32 for u32 {
    #[inline]
    fn as_usize(self) -> usize {
        #[cfg(debug_assertions)]
        {
            usize::try_from(self).expect("u32 overflowed usize")
        }
        #[cfg(not(debug_assertions))]
        {
            self as usize
        }
    }

    #[inline]
    fn low_u8(self) -> u8 {
        self as u8
    }

    #[inline]
    fn low_u16(self) -> u16 {
        self as u16
    }

    #[inline]
    fn high_u16(self) -> u16 {
        (self >> 16) as u16
    }
}

/// Conversions from `u64`. `as_usize` is checked in debug mode since a
/// `usize` is commonly 32 bits (or less).
pub(crate) trait U64 {
    fn as_usize(self) -> usize;
    fn low_u8(self) -> u8;
    fn low_u16(self) -> u16;
    fn low_u32(self) -> u32;
    fn high_u32(self) -> u32;
}

impl U64 for u64 {
    #[inline]
    fn as_usize(self) -> usize {
        #[cfg(debug_assertions)]
        {
            usize::try_from(self).expect("u64 overflowed usize")
        }
        #[cfg(not(debug_assertions))]
        {
            self as usize
        }
    }

    #[inline]
    fn low_u8(self) -> u8 {
        self as u8
    }

    #[inline]
    fn low_u16(self) -> u16 {
        self as u16
    }

    #[inline]
    fn low_u32(self) -> u32 {
        self as u32
    }

    #[inline]
    fn high_u32(self) -> u32 {
        (self >> 32) as u32
    }
}

/// Conversions from `i8`. `as_usize` panics (in debug mode) on negative
/// values; `to_bits`/`from_bits` reinterpret the raw two's complement bits.
pub(crate) trait I8 {
    fn as_usize(self) -> usize;
    fn to_bits(self) -> u8;
    fn from_bits(n: u8) -> i8;
}

impl I8 for i8 {
    #[inline]
    fn as_usize(self) -> usize {
        #[cfg(debug_assertions)]
        {
            usize::try_from(self).expect("i8 overflowed usize")
        }
        #[cfg(not(debug_assertions))]
        {
            self as usize
        }
    }

    #[inline]
    fn to_bits(self) -> u8 {
        self as u8
    }

    #[inline]
    fn from_bits(n: u8) -> i8 {
        n as i8
    }
}

/// Conversions from `i32`. Same conventions as [`I8`].
pub(crate) trait I32 {
    fn as_usize(self) -> usize;
    fn to_bits(self) -> u32;
    fn from_bits(n: u32) -> i32;
}

impl I32 for i32 {
    #[inline]
    fn as_usize(self) -> usize {
        #[cfg(debug_assertions)]
        {
            usize::try_from(self).expect("i32 overflowed usize")
        }
        #[cfg(not(debug_assertions))]
        {
            self as usize
        }
    }

    #[inline]
    fn to_bits(self) -> u32 {
        self as u32
    }

    #[inline]
    fn from_bits(n: u32) -> i32 {
        n as i32
    }
}

/// Conversions from `i64`. Same conventions as [`I8`].
pub(crate) trait I64 {
    fn as_usize(self) -> usize;
    fn to_bits(self) -> u64;
    fn from_bits(n: u64) -> i64;
}

impl I64 for i64 {
    #[inline]
    fn as_usize(self) -> usize {
        #[cfg(debug_assertions)]
        {
            usize::try_from(self).expect("i64 overflowed usize")
        }
        #[cfg(not(debug_assertions))]
        {
            self as usize
        }
    }

    #[inline]
    fn to_bits(self) -> u64 {
        self as u64
    }

    #[inline]
    fn from_bits(n: u64) -> i64 {
        n as i64
    }
}

/// Narrowing conversions from `usize` to fixed width unsigned integers.
/// Checked in debug mode, silently truncating in release mode.
pub(crate) trait Usize {
    fn as_u8(self) -> u8;
    fn as_u16(self) -> u16;
    fn as_u32(self) -> u32;
    fn as_u64(self) -> u64;
}

impl Usize for usize {
    #[inline]
    fn as_u8(self) -> u8 {
        #[cfg(debug_assertions)]
        {
            u8::try_from(self).expect("usize overflowed u8")
        }
        #[cfg(not(debug_assertions))]
        {
            self as u8
        }
    }

    #[inline]
    fn as_u16(self) -> u16 {
        #[cfg(debug_assertions)]
        {
            u16::try_from(self).expect("usize overflowed u16")
        }
        #[cfg(not(debug_assertions))]
        {
            self as u16
        }
    }

    #[inline]
    fn as_u32(self) -> u32 {
        #[cfg(debug_assertions)]
        {
            u32::try_from(self).expect("usize overflowed u32")
        }
        #[cfg(not(debug_assertions))]
        {
            self as u32
        }
    }

    #[inline]
    fn as_u64(self) -> u64 {
        #[cfg(debug_assertions)]
        {
            u64::try_from(self).expect("usize overflowed u64")
        }
        #[cfg(not(debug_assertions))]
        {
            self as u64
        }
    }
}

// Pointers aren't integers, but we convert pointers to integers to perform
// offset arithmetic in some places. (And no, we don't convert the integers
// back to pointers.) So add 'as_usize' conversions here too for completeness.
//
// These 'as' casts are actually okay because they're always non-lossy. But the
// idea here is to just try and remove as much 'as' as possible, particularly
// in this crate where we are being really paranoid about offsets and making
// sure we don't panic on inputs that might be untrusted. This way, the 'as'
// casts become easier to audit if they're all in one place, even when some of
// them are actually okay 100% of the time.
269 | 270 | pub(crate) trait Pointer { 271 | fn as_usize(self) -> usize; 272 | } 273 | 274 | impl Pointer for *const T { 275 | fn as_usize(self) -> usize { 276 | self as usize 277 | } 278 | } 279 | -------------------------------------------------------------------------------- /src/util/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod alphabet; 2 | #[cfg(feature = "std")] 3 | pub(crate) mod buffer; 4 | pub(crate) mod byte_frequencies; 5 | pub(crate) mod debug; 6 | pub(crate) mod error; 7 | pub(crate) mod int; 8 | pub(crate) mod prefilter; 9 | pub(crate) mod primitives; 10 | pub(crate) mod remapper; 11 | pub(crate) mod search; 12 | pub(crate) mod special; 13 | -------------------------------------------------------------------------------- /src/util/remapper.rs: -------------------------------------------------------------------------------- 1 | use alloc::vec::Vec; 2 | 3 | use crate::{nfa::noncontiguous, util::primitives::StateID}; 4 | 5 | /// Remappable is a tightly coupled abstraction that facilitates remapping 6 | /// state identifiers in DFAs. 7 | /// 8 | /// The main idea behind remapping state IDs is that DFAs often need to check 9 | /// if a certain state is a "special" state of some kind (like a match state) 10 | /// during a search. Since this is extremely perf critical code, we want this 11 | /// check to be as fast as possible. Partitioning state IDs into, for example, 12 | /// into "non-match" and "match" states means one can tell if a state is a 13 | /// match state via a simple comparison of the state ID. 14 | /// 15 | /// The issue is that during the DFA construction process, it's not 16 | /// particularly easy to partition the states. Instead, the simplest thing is 17 | /// to often just do a pass over all of the states and shuffle them into their 18 | /// desired partitionings. To do that, we need a mechanism for swapping states. 19 | /// Hence, this abstraction. 
20 | /// 21 | /// Normally, for such little code, I would just duplicate it. But this is a 22 | /// key optimization and the implementation is a bit subtle. So the abstraction 23 | /// is basically a ham-fisted attempt at DRY. The only place we use this is in 24 | /// the dense and one-pass DFAs. 25 | /// 26 | /// See also src/dfa/special.rs for a more detailed explanation of how dense 27 | /// DFAs are partitioned. 28 | pub(crate) trait Remappable: core::fmt::Debug { 29 | /// Return the total number of states. 30 | fn state_len(&self) -> usize; 31 | 32 | /// Swap the states pointed to by the given IDs. The underlying finite 33 | /// state machine should be mutated such that all of the transitions in 34 | /// `id1` are now in the memory region where the transitions for `id2` 35 | /// were, and all of the transitions in `id2` are now in the memory region 36 | /// where the transitions for `id1` were. 37 | /// 38 | /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`. 39 | /// 40 | /// It is expected that, after calling this, the underlying state machine 41 | /// will be left in an inconsistent state, since any other transitions 42 | /// pointing to, e.g., `id1` need to be updated to point to `id2`, since 43 | /// that's where `id1` moved to. 44 | /// 45 | /// In order to "fix" the underlying inconsistent state, a `Remapper` 46 | /// should be used to guarantee that `remap` is called at the appropriate 47 | /// time. 48 | fn swap_states(&mut self, id1: StateID, id2: StateID); 49 | 50 | /// This must remap every single state ID in the underlying value according 51 | /// to the function given. For example, in a DFA, this should remap every 52 | /// transition and every starting state ID. 53 | fn remap(&mut self, map: impl Fn(StateID) -> StateID); 54 | } 55 | 56 | /// Remapper is an abstraction the manages the remapping of state IDs in a 57 | /// finite state machine. 
This is useful when one wants to shuffle states into 58 | /// different positions in the machine. 59 | /// 60 | /// One of the key complexities this manages is the ability to correctly move 61 | /// one state multiple times. 62 | /// 63 | /// Once shuffling is complete, `remap` must be called, which will rewrite 64 | /// all pertinent transitions to updated state IDs. Neglecting to call `remap` 65 | /// will almost certainly result in a corrupt machine. 66 | #[derive(Debug)] 67 | pub(crate) struct Remapper { 68 | /// A map from the index of a state to its pre-multiplied identifier. 69 | /// 70 | /// When a state is swapped with another, then their corresponding 71 | /// locations in this map are also swapped. Thus, its new position will 72 | /// still point to its old pre-multiplied StateID. 73 | /// 74 | /// While there is a bit more to it, this then allows us to rewrite the 75 | /// state IDs in a DFA's transition table in a single pass. This is done 76 | /// by iterating over every ID in this map, then iterating over each 77 | /// transition for the state at that ID and re-mapping the transition from 78 | /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position 79 | /// in this map where `old_id` *started*, and set it to where it ended up 80 | /// after all swaps have been completed. 81 | map: Vec, 82 | /// A way to map indices to state IDs (and back). 83 | idx: IndexMapper, 84 | } 85 | 86 | impl Remapper { 87 | /// Create a new remapper from the given remappable implementation. The 88 | /// remapper can then be used to swap states. The remappable value given 89 | /// here must the same one given to `swap` and `remap`. 90 | /// 91 | /// The given stride should be the stride of the transition table expressed 92 | /// as a power of 2. This stride is used to map between state IDs and state 93 | /// indices. If state IDs and state indices are equivalent, then provide 94 | /// a `stride2` of `0`, which acts as an identity. 
95 | pub(crate) fn new(r: &impl Remappable, stride2: usize) -> Remapper { 96 | let idx = IndexMapper { stride2 }; 97 | let map = (0..r.state_len()).map(|i| idx.to_state_id(i)).collect(); 98 | Remapper { map, idx } 99 | } 100 | 101 | /// Swap two states. Once this is called, callers must follow through to 102 | /// call `remap`, or else it's possible for the underlying remappable 103 | /// value to be in a corrupt state. 104 | pub(crate) fn swap( 105 | &mut self, 106 | r: &mut impl Remappable, 107 | id1: StateID, 108 | id2: StateID, 109 | ) { 110 | if id1 == id2 { 111 | return; 112 | } 113 | r.swap_states(id1, id2); 114 | self.map.swap(self.idx.to_index(id1), self.idx.to_index(id2)); 115 | } 116 | 117 | /// Complete the remapping process by rewriting all state IDs in the 118 | /// remappable value according to the swaps performed. 119 | pub(crate) fn remap(mut self, r: &mut impl Remappable) { 120 | // Update the map to account for states that have been swapped 121 | // multiple times. For example, if (A, C) and (C, G) are swapped, then 122 | // transitions previously pointing to A should now point to G. But if 123 | // we don't update our map, they will erroneously be set to C. All we 124 | // do is follow the swaps in our map until we see our original state 125 | // ID. 126 | // 127 | // The intuition here is to think about how changes are made to the 128 | // map: only through pairwise swaps. That means that starting at any 129 | // given state, it is always possible to find the loop back to that 130 | // state by following the swaps represented in the map (which might be 131 | // 0 swaps). 132 | // 133 | // We are also careful to clone the map before starting in order to 134 | // freeze it. We use the frozen map to find our loops, since we need to 135 | // update our map as well. Without freezing it, our updates could break 136 | // the loops referenced above and produce incorrect results. 
137 | let oldmap = self.map.clone(); 138 | for i in 0..r.state_len() { 139 | let cur_id = self.idx.to_state_id(i); 140 | let mut new_id = oldmap[i]; 141 | if cur_id == new_id { 142 | continue; 143 | } 144 | loop { 145 | let id = oldmap[self.idx.to_index(new_id)]; 146 | if cur_id == id { 147 | self.map[i] = new_id; 148 | break; 149 | } 150 | new_id = id; 151 | } 152 | } 153 | r.remap(|sid| self.map[self.idx.to_index(sid)]); 154 | } 155 | } 156 | 157 | /// A simple type for mapping between state indices and state IDs. 158 | /// 159 | /// The reason why this exists is because state IDs are "premultiplied" in a 160 | /// DFA. That is, in order to get to the transitions for a particular state, 161 | /// one need only use the state ID as-is, instead of having to multiply it by 162 | /// transition table's stride. 163 | /// 164 | /// The downside of this is that it's inconvenient to map between state IDs 165 | /// using a dense map, e.g., Vec. That's because state IDs look like 166 | /// `0`, `stride`, `2*stride`, `3*stride`, etc., instead of `0`, `1`, `2`, `3`, 167 | /// etc. 168 | /// 169 | /// Since our state IDs are premultiplied, we can convert back-and-forth 170 | /// between IDs and indices by simply unmultiplying the IDs and multiplying the 171 | /// indices. 172 | /// 173 | /// Note that for a sparse NFA, state IDs and indices are equivalent. In this 174 | /// case, we set the stride of the index mapped to be `0`, which acts as an 175 | /// identity. 176 | #[derive(Debug)] 177 | struct IndexMapper { 178 | /// The power of 2 corresponding to the stride of the corresponding 179 | /// transition table. 'id >> stride2' de-multiplies an ID while 'index << 180 | /// stride2' pre-multiplies an index to an ID. 181 | stride2: usize, 182 | } 183 | 184 | impl IndexMapper { 185 | /// Convert a state ID to a state index. 186 | fn to_index(&self, id: StateID) -> usize { 187 | id.as_usize() >> self.stride2 188 | } 189 | 190 | /// Convert a state index to a state ID. 
191 | fn to_state_id(&self, index: usize) -> StateID { 192 | // CORRECTNESS: If the given index is not valid, then it is not 193 | // required for this to panic or return a valid state ID. We'll "just" 194 | // wind up with panics or silent logic errors at some other point. But 195 | // this is OK because if Remappable::state_len is correct and so is 196 | // 'to_index', then all inputs to 'to_state_id' should be valid indices 197 | // and thus transform into valid state IDs. 198 | StateID::new_unchecked(index << self.stride2) 199 | } 200 | } 201 | 202 | impl Remappable for noncontiguous::NFA { 203 | fn state_len(&self) -> usize { 204 | noncontiguous::NFA::states(self).len() 205 | } 206 | 207 | fn swap_states(&mut self, id1: StateID, id2: StateID) { 208 | noncontiguous::NFA::swap_states(self, id1, id2) 209 | } 210 | 211 | fn remap(&mut self, map: impl Fn(StateID) -> StateID) { 212 | noncontiguous::NFA::remap(self, map) 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /src/util/special.rs: -------------------------------------------------------------------------------- 1 | use crate::util::primitives::StateID; 2 | 3 | /// A collection of sentinel state IDs for Aho-Corasick automata. 4 | /// 5 | /// This specifically enables the technique by which we determine which states 6 | /// are dead, matches or start states. Namely, by arranging states in a 7 | /// particular order, we can determine the type of a state simply by looking at 8 | /// its ID. 9 | #[derive(Clone, Debug)] 10 | pub(crate) struct Special { 11 | /// The maximum ID of all the "special" states. This corresponds either to 12 | /// start_anchored_id when a prefilter is active and max_match_id when a 13 | /// prefilter is not active. The idea here is that if there is no prefilter, 14 | /// then there is no point in treating start states as special. 15 | pub(crate) max_special_id: StateID, 16 | /// The maximum ID of all the match states. 
Any state ID bigger than this 17 | /// is guaranteed to be a non-match ID. 18 | /// 19 | /// It is possible and legal for max_match_id to be equal to 20 | /// start_anchored_id, which occurs precisely in the case where the empty 21 | /// string is a pattern that was added to the underlying automaton. 22 | pub(crate) max_match_id: StateID, 23 | /// The state ID of the start state used for unanchored searches. 24 | pub(crate) start_unanchored_id: StateID, 25 | /// The state ID of the start state used for anchored searches. This is 26 | /// always start_unanchored_id+1. 27 | pub(crate) start_anchored_id: StateID, 28 | } 29 | 30 | impl Special { 31 | /// Create a new set of "special" state IDs with all IDs initialized to 32 | /// zero. The general idea here is that they will be updated and set to 33 | /// correct values later. 34 | pub(crate) fn zero() -> Special { 35 | Special { 36 | max_special_id: StateID::ZERO, 37 | max_match_id: StateID::ZERO, 38 | start_unanchored_id: StateID::ZERO, 39 | start_anchored_id: StateID::ZERO, 40 | } 41 | } 42 | } 43 | --------------------------------------------------------------------------------