├── .github ├── dependabot.yml └── workflows │ ├── ci-version.yml │ └── ci.yml ├── tests ├── data │ ├── utf8.txt │ └── character.txt ├── utf8.rs ├── character.rs └── common.rs ├── Cargo.toml ├── LICENSE ├── rustfmt.toml ├── benches ├── normal_text_search_lib │ └── mod.rs ├── full_text_search_lib │ └── mod.rs ├── full_text_search.rs └── normal_text_search.rs ├── README.md ├── src ├── lib.rs ├── byte.rs └── character.rs └── .gitignore /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" -------------------------------------------------------------------------------- /tests/data/utf8.txt: -------------------------------------------------------------------------------- 1 | 4 2 | EXAMPLE 3 | 2 4 | HERE IS A SIMPLE EXAMPLE 5 | 1 17 6 | HERE IS A SIMPLE EXAMPLE, WHICH CONTAINS MULTIPLE EXAMPLES. SIXLEE IS A WRONG WORD. EXAMPLEEXAMPLE 7 | 4 17 50 84 91 8 | oocoo 9 | 2 10 | coocoocoo 11 | 2 1 4 12 | coocoocoocoo 13 | 3 1 4 7 14 | E 15 | 1 16 | HERE IS A SIMPLE EXAMPLE 17 | 5 1 3 15 17 23 18 | 中文 19 | 1 20 | 這段話裡有多少個中文字和English words呢？中文萬歲！ 21 | 2 24 55 -------------------------------------------------------------------------------- /tests/data/character.txt: -------------------------------------------------------------------------------- 1 | 4 2 | EXAMPLE 3 | 2 4 | HERE IS A SIMPLE EXAMPLE 5 | 1 17 6 | HERE IS A SIMPLE EXAMPLE, WHICH CONTAINS MULTIPLE EXAMPLES. SIXLEE IS A WRONG WORD.
EXAMPLEEXAMPLE 7 | 4 17 50 84 91 8 | oocoo 9 | 2 10 | coocoocoo 11 | 2 1 4 12 | coocoocoocoo 13 | 3 1 4 7 14 | E 15 | 1 16 | HERE IS A SIMPLE EXAMPLE 17 | 5 1 3 15 17 23 18 | 中文 19 | 1 20 | 這段話裡有多少個中文字和English words呢？中文萬歲！ 21 | 2 8 27 -------------------------------------------------------------------------------- /tests/utf8.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use boyer_moore_magiclen::*; 4 | 5 | const INPUT_DATA_PATH: &str = r"tests/data/utf8.txt"; 6 | 7 | /// Checks `BMByte` against every pattern/text case listed in `tests/data/utf8.txt`. 8 | /// 9 | /// `answer` holds all match offsets read from the file; `answer_not_full` and 10 | /// `answer_not_full_rev` are the non-overlapping subsets derived by `common::data_input_from_file`. 11 | #[test] 12 | fn data_input_from_file() { 13 | common::data_input_from_file( 14 | INPUT_DATA_PATH, 15 | |text, pattern, answer, answer_not_full, answer_not_full_rev| { 16 | let bm = BMByte::from(pattern).unwrap(); 17 | 18 | assert_eq!(answer, bm.find_full_all_in(text)); 19 | // The backward full search is expected to report the same offsets in reverse order. 20 | assert_eq!( 21 | answer.iter().rev().copied().collect::<Vec<usize>>(), 22 | bm.rfind_full_all_in(text) 23 | ); 24 | assert_eq!(answer_not_full, bm.find_all_in(text)); 25 | assert_eq!(answer_not_full_rev, bm.rfind_all_in(text)); 26 | }, 27 | ); 28 | } 29 | -------------------------------------------------------------------------------- /tests/character.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "character")] 2 | 3 | mod common; 4 | 5 | use boyer_moore_magiclen::*; 6 | 7 | const INPUT_DATA_PATH: &str = r"tests/data/character.txt"; 8 | 9 | /// Checks `BMCharacter` against every pattern/text case listed in `tests/data/character.txt`. 10 | /// 11 | /// Pattern and text are converted to `Vec<char>` before the searcher is built and queried. 12 | #[test] 13 | fn data_input_from_file() { 14 | common::data_input_from_file( 15 | INPUT_DATA_PATH, 16 | |text, pattern, answer, answer_not_full, answer_not_full_rev| { 17 | let pattern = pattern.chars().collect::<Vec<char>>(); 18 | let text = text.chars().collect::<Vec<char>>(); 19 | 20 | let bm = BMCharacter::from(pattern).unwrap(); 21 | 22 | assert_eq!(answer, bm.find_full_all_in(&text)); 23 | // The backward full search is expected to report the same offsets in reverse order. 24 | assert_eq!( 25 | answer.iter().rev().copied().collect::<Vec<usize>>(), 26 | bm.rfind_full_all_in(&text) 27 | ); 28 | assert_eq!(answer_not_full, bm.find_all_in(&text)); 29 | assert_eq!(answer_not_full_rev, bm.rfind_all_in(&text)); 30 | }, 31 | ); 32 | } 33 |
-------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "boyer-moore-magiclen" 3 | version = "0.2.22" 4 | authors = ["Magic Len "] 5 | edition = "2021" 6 | rust-version = "1.68" 7 | repository = "https://github.com/magiclen/boyer-moore-magiclen" 8 | homepage = "https://magiclen.org/rust-boyer-moore-magiclen" 9 | keywords = ["boyer-moore-magiclen", "boyer-moore-horspool", "quick-search", "string", "search"] 10 | categories = ["no-std", "algorithms"] 11 | description = "Boyer-Moore-MagicLen, a fast string search algorithm implemented in Rust." 12 | license = "MIT" 13 | include = ["src/**/*", "Cargo.toml", "README.md", "LICENSE", "benches/full_text_search.rs", "benches/normal_text_search.rs"] 14 | 15 | [dependencies] 16 | debug-helper = "0.3" 17 | 18 | [dev-dependencies] 19 | scanner-rust = "2" 20 | bencher = "0.1.5" 21 | regex = "1.1.5" 22 | utf8-width = "0.1" 23 | 24 | [features] 25 | character = [] 26 | 27 | [[bench]] 28 | name = "full_text_search" 29 | harness = false 30 | 31 | [[bench]] 32 | name = "normal_text_search" 33 | harness = false 34 | 35 | [package.metadata.docs.rs] 36 | all-features = true 37 | rustdoc-args = ["--cfg", "docsrs"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 magiclen.org (Ron Li) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 |
furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/ci-version.yml: -------------------------------------------------------------------------------- 1 | name: CI-version 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | env: 9 | CARGO_TERM_COLOR: always 10 | 11 | jobs: 12 | tests: 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | os: 17 | - ubuntu-latest 18 | - macos-latest 19 | - windows-latest 20 | toolchain: 21 | - stable 22 | - nightly 23 | features: 24 | - 25 | - --features character 26 | name: Test ${{ matrix.toolchain }} on ${{ matrix.os }} (${{ matrix.features }}) 27 | runs-on: ${{ matrix.os }} 28 | steps: 29 | - uses: actions/checkout@v6 30 | - uses: actions-rust-lang/setup-rust-toolchain@v1 31 | with: 32 | toolchain: ${{ matrix.toolchain }} 33 | - run: cargo test --release ${{ matrix.features }} 34 | - run: cargo doc --release ${{ matrix.features }} 35 | 36 | MSRV: 37 | strategy: 38 | fail-fast: false 39 | matrix: 40 | os: 41 | - ubuntu-latest 42 | - macos-latest 43 | - windows-latest 44 | toolchain: 45 | - "1.68" 46 | features: 47 | - 48 | - --features character 49 | name: Test ${{ matrix.toolchain }} on ${{ matrix.os }} (${{ matrix.features }}) 50 | runs-on: ${{ matrix.os }} 51 | steps: 52 | - uses: 
actions/checkout@v6 53 | - uses: actions-rust-lang/setup-rust-toolchain@v1 54 | with: 55 | toolchain: ${{ matrix.toolchain }} 56 | - run: cargo test --release --lib --bins ${{ matrix.features }} -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | # array_width = 60 2 | # attr_fn_like_width = 70 3 | binop_separator = "Front" 4 | blank_lines_lower_bound = 0 5 | blank_lines_upper_bound = 1 6 | brace_style = "PreferSameLine" 7 | # chain_width = 60 8 | color = "Auto" 9 | # comment_width = 100 10 | condense_wildcard_suffixes = true 11 | control_brace_style = "AlwaysSameLine" 12 | empty_item_single_line = true 13 | enum_discrim_align_threshold = 80 14 | error_on_line_overflow = false 15 | error_on_unformatted = false 16 | # fn_call_width = 60 17 | fn_params_layout = "Tall" 18 | fn_single_line = false 19 | force_explicit_abi = true 20 | force_multiline_blocks = false 21 | format_code_in_doc_comments = true 22 | doc_comment_code_block_width = 80 23 | format_generated_files = true 24 | format_macro_matchers = true 25 | format_macro_bodies = true 26 | skip_macro_invocations = [] 27 | format_strings = true 28 | hard_tabs = false 29 | hex_literal_case = "Upper" 30 | imports_indent = "Block" 31 | imports_layout = "Mixed" 32 | indent_style = "Block" 33 | inline_attribute_width = 0 34 | match_arm_blocks = true 35 | match_arm_leading_pipes = "Never" 36 | match_block_trailing_comma = true 37 | max_width = 100 38 | merge_derives = true 39 | imports_granularity = "Crate" 40 | newline_style = "Unix" 41 | normalize_comments = false 42 | normalize_doc_attributes = true 43 | overflow_delimited_expr = true 44 | remove_nested_parens = true 45 | reorder_impl_items = true 46 | reorder_imports = true 47 | group_imports = "StdExternalCrate" 48 | reorder_modules = true 49 | short_array_element_width_threshold = 10 50 | # single_line_if_else_max_width = 50 51 | 
space_after_colon = true 52 | space_before_colon = false 53 | spaces_around_ranges = false 54 | struct_field_align_threshold = 80 55 | struct_lit_single_line = false 56 | # struct_lit_width = 18 57 | # struct_variant_width = 35 58 | tab_spaces = 4 59 | trailing_comma = "Vertical" 60 | trailing_semicolon = true 61 | type_punctuation_density = "Wide" 62 | use_field_init_shorthand = true 63 | use_small_heuristics = "Max" 64 | use_try_shorthand = true 65 | where_single_line = false 66 | wrap_comments = false -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | env: 6 | CARGO_TERM_COLOR: always 7 | 8 | jobs: 9 | rustfmt: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v6 13 | - uses: actions-rust-lang/setup-rust-toolchain@v1 14 | with: 15 | toolchain: nightly 16 | components: rustfmt 17 | - uses: actions-rust-lang/rustfmt@v1 18 | 19 | clippy: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v6 23 | - uses: actions-rust-lang/setup-rust-toolchain@v1 24 | with: 25 | components: clippy 26 | - run: cargo clippy --all-targets --all-features -- -D warnings 27 | 28 | tests: 29 | strategy: 30 | fail-fast: false 31 | matrix: 32 | os: 33 | - ubuntu-latest 34 | - macos-latest 35 | - windows-latest 36 | toolchain: 37 | - stable 38 | - nightly 39 | features: 40 | - 41 | - --features character 42 | name: Test ${{ matrix.toolchain }} on ${{ matrix.os }} (${{ matrix.features }}) 43 | runs-on: ${{ matrix.os }} 44 | steps: 45 | - uses: actions/checkout@v6 46 | - uses: actions-rust-lang/setup-rust-toolchain@v1 47 | with: 48 | toolchain: ${{ matrix.toolchain }} 49 | - run: cargo test ${{ matrix.features }} 50 | - run: cargo doc ${{ matrix.features }} 51 | 52 | MSRV: 53 | strategy: 54 | fail-fast: false 55 | matrix: 56 | os: 57 | - ubuntu-latest 58 | - 
macos-latest 59 | - windows-latest 60 | toolchain: 61 | - "1.68" 62 | features: 63 | - 64 | - --features character 65 | name: Test ${{ matrix.toolchain }} on ${{ matrix.os }} (${{ matrix.features }}) 66 | runs-on: ${{ matrix.os }} 67 | steps: 68 | - uses: actions/checkout@v6 69 | - uses: actions-rust-lang/setup-rust-toolchain@v1 70 | with: 71 | toolchain: ${{ matrix.toolchain }} 72 | - run: cargo test --lib --bins ${{ matrix.features }} -------------------------------------------------------------------------------- /tests/common.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | use scanner_rust::Scanner; 4 | 5 | #[allow(dead_code)] 6 | pub(crate) fn data_input_from_file< 7 | P: AsRef, 8 | F: Fn(&str, &str, Vec, Vec, Vec), 9 | >( 10 | p: P, 11 | f: F, 12 | ) { 13 | let mut sc = Scanner::scan_path(p).unwrap(); 14 | 15 | let pattern_count = sc.next_usize().unwrap().unwrap(); 16 | 17 | for _ in 0..pattern_count { 18 | sc.skip_whitespaces().unwrap(); 19 | 20 | let pattern = sc.next_line().unwrap().unwrap(); 21 | 22 | let text_count = sc.next_usize().unwrap().unwrap(); 23 | 24 | for _ in 0..text_count { 25 | sc.skip_whitespaces().unwrap(); 26 | 27 | let text = sc.next_line().unwrap().unwrap(); 28 | 29 | let answer_count = sc.next_usize().unwrap().unwrap(); 30 | 31 | let mut answer = Vec::with_capacity(answer_count); 32 | 33 | for _ in 0..answer_count { 34 | answer.push(sc.next_usize().unwrap().unwrap()); 35 | } 36 | 37 | let pattern_length = pattern.len(); 38 | 39 | let mut answer_not_full = Vec::with_capacity(answer.len()); 40 | 41 | let mut min_index = 0; 42 | 43 | for &index in answer.iter() { 44 | if index >= min_index { 45 | answer_not_full.push(index); 46 | 47 | min_index = index + pattern_length; 48 | } 49 | } 50 | 51 | let mut answer_not_full_rev = Vec::with_capacity(answer.len()); 52 | 53 | let mut max_index = text.len(); 54 | 55 | for &index in answer.iter().rev() { 56 | if index <= max_index 
{ 57 | answer_not_full_rev.push(index); 58 | 59 | if index > pattern_length { 60 | max_index = index - pattern_length; 61 | } else { 62 | break; 63 | } 64 | } 65 | } 66 | 67 | f(&text, &pattern, answer, answer_not_full, answer_not_full_rev) 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /benches/normal_text_search_lib/mod.rs: -------------------------------------------------------------------------------- 1 | use boyer_moore_magiclen::*; 2 | use regex::Regex; 3 | 4 | pub fn naive_search, P: AsRef>(text: S, pattern: P) -> Vec { 5 | let text = text.as_ref(); 6 | let pattern = pattern.as_ref(); 7 | 8 | let length = text.len(); 9 | let pattern_length = pattern.len(); 10 | 11 | let mut result = Vec::new(); 12 | 13 | let mut offset = 0; 14 | 15 | while offset < length { 16 | if let Some(index) = text[offset..].find(pattern) { 17 | let index = index + offset; 18 | 19 | offset = index + pattern_length; 20 | 21 | result.push(index); 22 | } else { 23 | break; 24 | } 25 | } 26 | 27 | result 28 | } 29 | 30 | pub fn regex_search, P: AsRef>(text: S, pattern: P) -> Vec { 31 | let text = text.as_ref(); 32 | let pattern = pattern.as_ref(); 33 | 34 | let regex = Regex::new(regex::escape(pattern).as_str()).unwrap(); 35 | 36 | let length = text.len(); 37 | let pattern_length = pattern.len(); 38 | 39 | let mut result = Vec::new(); 40 | 41 | let mut offset = 0; 42 | 43 | while offset < length { 44 | if let Some(m) = regex.find(&text[offset..]) { 45 | let index = m.start() + offset; 46 | 47 | offset = index + pattern_length; 48 | 49 | result.push(index); 50 | } else { 51 | break; 52 | } 53 | } 54 | 55 | result 56 | } 57 | 58 | pub fn bmb_search(text: TT, pattern: TP) -> Vec { 59 | let bad_char_shift_map = BMByteBadCharShiftMap::create_bad_char_shift_map(&pattern).unwrap(); 60 | 61 | boyer_moore_magiclen::byte::find(text, pattern, &bad_char_shift_map, 0) 62 | } 63 | 64 | #[cfg(feature = "character")] 65 | pub fn 
character_search_char<TT: BMCharacterSearchable, TP: BMCharacterSearchable>( 66 | text: TT, 67 | pattern: TP, 68 | ) -> Vec<usize> { 69 | let bad_char_shift_map = 70 | BMCharacterBadCharShiftMap::create_bad_char_shift_map(&pattern).unwrap(); 71 | 72 | boyer_moore_magiclen::character::find(text, pattern, &bad_char_shift_map, 0) 73 | } 74 | -------------------------------------------------------------------------------- /benches/full_text_search_lib/mod.rs: -------------------------------------------------------------------------------- 1 | use boyer_moore_magiclen::*; 2 | use regex::Regex; 3 | 4 | /// Baseline full-text search built on `str::find`. 5 | /// 6 | /// Returns the byte offset of every occurrence of `pattern` in `text`, overlapping ones included: 7 | /// after a hit, the scan resumes one character (the byte width of the pattern's first character) 8 | /// past the start of that hit. An empty `pattern` yields no matches instead of panicking. 9 | pub fn naive_search<S: AsRef<str>, P: AsRef<str>>(text: S, pattern: P) -> Vec<usize> { 10 | let text = text.as_ref(); 11 | let pattern = pattern.as_ref(); 12 | 13 | let mut result = Vec::new(); 14 | 15 | // Safe std replacement for the unchecked first-byte width lookup; `None` means an empty pattern. 16 | let Some(pattern_first_char_width) = pattern.chars().next().map(char::len_utf8) else { 17 | return result; 18 | }; 19 | 20 | let length = text.len(); 21 | 22 | let mut offset = 0; 23 | 24 | while offset < length { 25 | if let Some(index) = text[offset..].find(pattern) { 26 | let index = index + offset; 27 | 28 | offset = index + pattern_first_char_width; 29 | 30 | result.push(index); 31 | } else { 32 | break; 33 | } 34 | } 35 | 36 | result 37 | } 38 | 39 | /// Same contract as `naive_search`, but each lookup goes through a `Regex` compiled from the escaped `pattern`. 40 | pub fn regex_search<S: AsRef<str>, P: AsRef<str>>(text: S, pattern: P) -> Vec<usize> { 41 | let text = text.as_ref(); 42 | let pattern = pattern.as_ref(); 43 | 44 | let mut result = Vec::new(); 45 | 46 | let Some(pattern_first_char_width) = pattern.chars().next().map(char::len_utf8) else { 47 | return result; 48 | }; 49 | 50 | let regex = Regex::new(regex::escape(pattern).as_str()).unwrap(); 51 | 52 | let length = text.len(); 53 | 54 | let mut offset = 0; 55 | 56 | while offset < length { 57 | if let Some(m) = regex.find(&text[offset..]) { 58 | let index = m.start() + offset; 59 | 60 | offset = index + pattern_first_char_width; 61 | 62 | result.push(index); 63 | } else { 64 | break; 65 | } 66 | } 67 | 68 | result 69 | } 70 | 71 | /// Full-text search through the crate's byte-based `find_full`, using a shift map built from `pattern`. 72 | /// 73 | /// Panics if no shift map can be created for `pattern`. 74 | pub fn bmb_search<TT: BMByteSearchable, TP: BMByteSearchable>(text: TT, pattern: TP) -> Vec<usize> { 75 | let bad_char_shift_map = BMByteBadCharShiftMap::create_bad_char_shift_map(&pattern).unwrap(); 76 | 77 | boyer_moore_magiclen::byte::find_full(text, pattern, &bad_char_shift_map, 0) 78 | } 79 | 80 | /// Character-sequence counterpart of `bmb_search`; only built with the `character` feature. 81 | #[cfg(feature = "character")] 82 | pub fn character_search_char<TT: BMCharacterSearchable, TP: BMCharacterSearchable>( 83 | text: TT, 84 | pattern: TP, 85 | ) -> Vec<usize> { 86 | let bad_char_shift_map = 87 | BMCharacterBadCharShiftMap::create_bad_char_shift_map(&pattern).unwrap(); 88 | 89 | boyer_moore_magiclen::character::find_full(text, pattern, &bad_char_shift_map, 0) 90 | } 91 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Boyer-Moore-MagicLen 2 | ==================== 3 | 4 | [![CI](https://github.com/magiclen/boyer-moore-magiclen/actions/workflows/ci.yml/badge.svg)](https://github.com/magiclen/boyer-moore-magiclen/actions/workflows/ci.yml) 5 | 6 | This crate can be used to search substrings in a string or search any sub-sequences in any sequence by using boyer-moore-magiclen (which is sometimes faster than boyer-moore and boyer-moore-horspool). 7 | 8 | ## Usage 9 | 10 | For binary data and UTF-8 data, use the `BMByte` struct. For character sequences, use the `BMCharacter` struct (however it is much slower than `BMByte`). The `BMCharacter` struct needs the standard library support, and you have to enable the `character` feature to make it available. 11 | 12 | Every `BMXXX` has a `from` associated function to create the instance by a search pattern (the needle). 13 | 14 | For example, 15 | 16 | ```rust 17 | use boyer_moore_magiclen::BMByte; 18 | 19 | let bmb = BMByte::from("oocoo").unwrap(); 20 | ``` 21 | 22 | Now, we can search any binary data or UTF-8 data for the pattern `oocoo`. 23 | 24 | There are two search modes and two search directions. The first mode is called **full text search**, which finds the positions of the matched sub-sequences including the overlapping ones.
25 | 26 | ```rust 27 | use boyer_moore_magiclen::BMByte; 28 | 29 | let bmb = BMByte::from("oocoo").unwrap(); 30 | 31 | assert_eq!(vec![1, 4], bmb.find_full_in("coocoocoocoo", 2)); 32 | ``` 33 | 34 | The other mode is called **normal text search**, which finds the positions of the matched sub-sequences excluding the overlapping ones. 35 | 36 | ```rust 37 | use boyer_moore_magiclen::BMByte; 38 | 39 | let bmb = BMByte::from("oocoo").unwrap(); 40 | 41 | assert_eq!(vec![1, 7], bmb.find_in("coocoocoocoo", 2)); 42 | ``` 43 | 44 | The search direction can be from the head (searching forward, `find_xxx`) or from the tail (searching backward, `rfind_xxx`). 45 | 46 | ```rust 47 | use boyer_moore_magiclen::BMByte; 48 | 49 | let bmb = BMByte::from("oocoo").unwrap(); 50 | 51 | assert_eq!(vec![7, 1], bmb.rfind_in("coocoocoocoo", 2)); 52 | ``` 53 | 54 | To search all results at a time, use the `find_all_in`, `rfind_all_in`, `find_full_all_in` or `rfind_full_all_in` method. 55 | 56 | ```rust 57 | use boyer_moore_magiclen::BMByte; 58 | 59 | let bmb = BMByte::from("oocoo").unwrap(); 60 | 61 | assert_eq!(vec![7, 4, 1], bmb.rfind_full_all_in("coocoocoocoo")); 62 | ``` 63 | 64 | ## Benchmark 65 | 66 | ```bash 67 | cargo bench --bench full_text_search 68 | ``` 69 | 70 | or 71 | 72 | ```bash 73 | cargo bench --bench normal_text_search 74 | ``` 75 | 76 | ## Crates.io 77 | 78 | https://crates.io/crates/boyer-moore-magiclen 79 | 80 | ## Documentation 81 | 82 | https://docs.rs/boyer-moore-magiclen 83 | 84 | ## License 85 | 86 | [MIT](LICENSE) -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | # Boyer-Moore-MagicLen 3 | 4 | This crate can be used to search substrings in a string or search any sub-sequences in any sequence by using boyer-moore-magiclen (which is sometimes faster than boyer-moore and boyer-moore-horspool). 
5 | 6 | ## Usage 7 | 8 | For binary data and UTF-8 data, use the `BMByte` struct. For character sequences, use the `BMCharacter` struct (however it is much slower than `BMByte`). The `BMCharacter` struct needs the standard library support, and you have to enable the `character` feature to make it available. 9 | 10 | Every `BMXXX` has a `from` associated function to create the instance by a search pattern (the needle). 11 | 12 | For example, 13 | 14 | ```rust 15 | use boyer_moore_magiclen::BMByte; 16 | 17 | let bmb = BMByte::from("oocoo").unwrap(); 18 | ``` 19 | 20 | Now, we can search any binary data or UTF-8 data for the pattern `oocoo`. 21 | 22 | There are two search modes and two search directions. The first mode is called **full text search**, which finds the positions of the matched sub-sequences including the overlapping ones. 23 | 24 | ```rust 25 | use boyer_moore_magiclen::BMByte; 26 | 27 | let bmb = BMByte::from("oocoo").unwrap(); 28 | 29 | assert_eq!(vec![1, 4], bmb.find_full_in("coocoocoocoo", 2)); 30 | ``` 31 | 32 | The other mode is called **normal text search**, which finds the positions of the matched sub-sequences excluding the overlapping ones. 33 | 34 | ```rust 35 | use boyer_moore_magiclen::BMByte; 36 | 37 | let bmb = BMByte::from("oocoo").unwrap(); 38 | 39 | assert_eq!(vec![1, 7], bmb.find_in("coocoocoocoo", 2)); 40 | ``` 41 | 42 | The search direction can be from the head (searching forward, `find_xxx`) or from the tail (searching backward, `rfind_xxx`). 43 | 44 | ```rust 45 | use boyer_moore_magiclen::BMByte; 46 | 47 | let bmb = BMByte::from("oocoo").unwrap(); 48 | 49 | assert_eq!(vec![7, 1], bmb.rfind_in("coocoocoocoo", 2)); 50 | ``` 51 | 52 | To search all results at a time, use the `find_all_in`, `rfind_all_in`, `find_full_all_in` or `rfind_full_all_in` method. 
53 | 54 | ```rust 55 | use boyer_moore_magiclen::BMByte; 56 | 57 | let bmb = BMByte::from("oocoo").unwrap(); 58 | 59 | assert_eq!(vec![7, 4, 1], bmb.rfind_full_all_in("coocoocoocoo")); 60 | ``` 61 | */ 62 | 63 | #![cfg_attr(not(feature = "character"), no_std)] 64 | #![cfg_attr(docsrs, feature(doc_cfg))] 65 | 66 | #[macro_use] 67 | extern crate alloc; 68 | 69 | /// This module helps you search sub-sequences in any byte sequence, including self-synchronizing string encoding data such as UTF-8. 70 | pub mod byte; 71 | #[cfg(feature = "character")] 72 | /// This module helps you search character sub-sequences in any character sequence. 73 | pub mod character; 74 | 75 | pub use byte::{BMByte, BMByteBadCharShiftMap, BMByteBadCharShiftMapRev, BMByteSearchable}; 76 | #[cfg(feature = "character")] 77 | pub use character::{ 78 | BMCharacter, BMCharacterBadCharShiftMap, BMCharacterBadCharShiftMapRev, BMCharacterSearchable, 79 | }; 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Intellij+all ### 2 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 3 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 4 | 5 | # User-specific stuff 6 | .idea/**/workspace.xml 7 | .idea/**/tasks.xml 8 | .idea/**/usage.statistics.xml 9 | .idea/**/dictionaries 10 | .idea/**/shelf 11 | 12 | # AWS User-specific 13 | .idea/**/aws.xml 14 | 15 | # Generated files 16 | .idea/**/contentModel.xml 17 | 18 | # Sensitive or high-churn files 19 | .idea/**/dataSources/ 20 | .idea/**/dataSources.ids 21 | .idea/**/dataSources.local.xml 22 | .idea/**/sqlDataSources.xml 23 | .idea/**/dynamic.xml 24 | .idea/**/uiDesigner.xml 25 | .idea/**/dbnavigator.xml 26 | 27 | # Gradle 28 | .idea/**/gradle.xml 29 | .idea/**/libraries 30 | 31 | # Gradle and Maven with auto-import 32 | # When using 
Gradle or Maven with auto-import, you should exclude module files, 33 | # since they will be recreated, and may cause churn. Uncomment if using 34 | # auto-import. 35 | # .idea/artifacts 36 | # .idea/compiler.xml 37 | # .idea/jarRepositories.xml 38 | # .idea/modules.xml 39 | # .idea/*.iml 40 | # .idea/modules 41 | # *.iml 42 | # *.ipr 43 | 44 | # CMake 45 | cmake-build-*/ 46 | 47 | # Mongo Explorer plugin 48 | .idea/**/mongoSettings.xml 49 | 50 | # File-based project format 51 | *.iws 52 | 53 | # IntelliJ 54 | out/ 55 | 56 | # mpeltonen/sbt-idea plugin 57 | .idea_modules/ 58 | 59 | # JIRA plugin 60 | atlassian-ide-plugin.xml 61 | 62 | # Cursive Clojure plugin 63 | .idea/replstate.xml 64 | 65 | # SonarLint plugin 66 | .idea/sonarlint/ 67 | 68 | # Crashlytics plugin (for Android Studio and IntelliJ) 69 | com_crashlytics_export_strings.xml 70 | crashlytics.properties 71 | crashlytics-build.properties 72 | fabric.properties 73 | 74 | # Editor-based Rest Client 75 | .idea/httpRequests 76 | 77 | # Android studio 3.1+ serialized cache file 78 | .idea/caches/build_file_checksums.ser 79 | 80 | ### Intellij+all Patch ### 81 | # Ignore everything but code style settings and run configurations 82 | # that are supposed to be shared within teams. 
83 | 84 | .idea/* 85 | 86 | !.idea/codeStyles 87 | !.idea/runConfigurations 88 | 89 | ### Rust ### 90 | # Generated by Cargo 91 | # will have compiled files and executables 92 | debug/ 93 | target/ 94 | 95 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 96 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 97 | Cargo.lock 98 | 99 | # These are backup files generated by rustfmt 100 | **/*.rs.bk 101 | 102 | # MSVC Windows builds of rustc generate these, which store debugging information 103 | *.pdb 104 | 105 | ### Vim ### 106 | # Swap 107 | [._]*.s[a-v][a-z] 108 | !*.svg # comment out if you don't need vector files 109 | [._]*.sw[a-p] 110 | [._]s[a-rt-v][a-z] 111 | [._]ss[a-gi-z] 112 | [._]sw[a-p] 113 | 114 | # Session 115 | Session.vim 116 | Sessionx.vim 117 | 118 | # Temporary 119 | .netrwhist 120 | *~ 121 | # Auto-generated tag files 122 | tags 123 | # Persistent undo 124 | [._]*.un~ 125 | 126 | ### VisualStudioCode ### 127 | .vscode/* 128 | !.vscode/settings.json 129 | !.vscode/tasks.json 130 | !.vscode/launch.json 131 | !.vscode/extensions.json 132 | !.vscode/*.code-snippets 133 | 134 | # Local History for Visual Studio Code 135 | .history/ 136 | 137 | # Built Visual Studio Code Extensions 138 | *.vsix 139 | 140 | ### VisualStudioCode Patch ### 141 | # Ignore all local history of files 142 | .history 143 | .ionide -------------------------------------------------------------------------------- /benches/full_text_search.rs: -------------------------------------------------------------------------------- 1 | mod full_text_search_lib; 2 | 3 | use std::fs; 4 | 5 | use bencher::{benchmark_group, benchmark_main, Bencher}; 6 | use full_text_search_lib::*; 7 | 8 | #[cfg(windows)] 9 | const TXT_PATH: &str = r"benches\data\vgilante.txt"; 10 | 11 | #[cfg(not(windows))] 12 | const TXT_PATH: &str = r"benches/data/vgilante.txt"; 13 | 14 | const PATTERN_SHORT: &str = "the"; 15 | const 
PATTERN_SHORT_RESULT_COUNT: usize = 5034; 16 | 17 | const PATTERN_LONG: &str = "Half the screen showed a graphic representation of what the 18 | scanners had picked up the other side showed an analysis of the 19 | same data. The graphics showed an irregular shaped lump fade 20 | on, stay several frames, then fade out. At the time the lump 21 | reminded on screen the analysis showed size about a quarter that 22 | of the ship they had seen and mass as undetermined."; 23 | const PATTERN_LONG_RESULT_COUNT: usize = 1; 24 | 25 | const NOT_EXIST_PATTERN_SHORT: &str = "xyz"; 26 | const NOT_EXIST_PATTERN_LONG: &str = "xyzabcdefghijklmnopqrstuvwzyz xyzabcdefghijklmnopqrstuvwzyz 27 | xyzabcdefghijklmnopqrstuvwzyz xyzabcdefghijklmnopqrstuvwzyz 28 | xyzabcdefghijklmnopqrstuvwzyz xyzabcdefghijklmnopqrstuvwzyz 29 | xyzabcdefghijklmnopqrstuvwzyz xyzabcdefghijklmnopqrstuvwzyz 30 | xyzabcdefghijklmnopqrstuvwzyz xyzabcdefghijklmnopqrstuvwzyz"; 31 | 32 | fn short_naive(bencher: &mut Bencher) { 33 | let text = fs::read_to_string(TXT_PATH).unwrap(); 34 | 35 | bencher.iter(|| { 36 | let result = naive_search(&text, PATTERN_SHORT); 37 | 38 | assert_eq!(PATTERN_SHORT_RESULT_COUNT, result.len()); 39 | }); 40 | } 41 | 42 | fn short_regex(bencher: &mut Bencher) { 43 | let text = fs::read_to_string(TXT_PATH).unwrap(); 44 | 45 | bencher.iter(|| { 46 | let result = regex_search(&text, PATTERN_SHORT); 47 | 48 | assert_eq!(PATTERN_SHORT_RESULT_COUNT, result.len()); 49 | }); 50 | } 51 | 52 | fn short_bmb(bencher: &mut Bencher) { 53 | let text = fs::read_to_string(TXT_PATH).unwrap(); 54 | 55 | bencher.iter(|| { 56 | let result = bmb_search(text.as_str(), PATTERN_SHORT); 57 | 58 | assert_eq!(PATTERN_SHORT_RESULT_COUNT, result.len()); 59 | }); 60 | } 61 | 62 | #[cfg(feature = "character")] 63 | fn short_character(bencher: &mut Bencher) { 64 | let text = fs::read_to_string(TXT_PATH).unwrap(); 65 | 66 | let text: Vec = text.chars().collect(); 67 | let pattern: Vec = PATTERN_SHORT.chars().collect(); 68 | 
69 | bencher.iter(|| { 70 | let result = character_search_char(&text, &pattern); 71 | 72 | assert_eq!(PATTERN_SHORT_RESULT_COUNT, result.len()); 73 | }); 74 | } 75 | 76 | #[cfg(feature = "character")] 77 | benchmark_group!(short, short_naive, short_regex, short_bmb, short_character); 78 | 79 | #[cfg(not(feature = "character"))] 80 | benchmark_group!(short, short_naive, short_regex, short_bmb); 81 | 82 | fn long_naive(bencher: &mut Bencher) { 83 | let text = fs::read_to_string(TXT_PATH).unwrap(); 84 | 85 | bencher.iter(|| { 86 | let result = naive_search(&text, PATTERN_LONG); 87 | 88 | assert_eq!(PATTERN_LONG_RESULT_COUNT, result.len()); 89 | }); 90 | } 91 | 92 | fn long_regex(bencher: &mut Bencher) { 93 | let text = fs::read_to_string(TXT_PATH).unwrap(); 94 | 95 | bencher.iter(|| { 96 | let result = regex_search(&text, PATTERN_LONG); 97 | 98 | assert_eq!(PATTERN_LONG_RESULT_COUNT, result.len()); 99 | }); 100 | } 101 | 102 | fn long_bmb(bencher: &mut Bencher) { 103 | let text = fs::read_to_string(TXT_PATH).unwrap(); 104 | 105 | bencher.iter(|| { 106 | let result = bmb_search(text.as_str(), PATTERN_LONG); 107 | 108 | assert_eq!(PATTERN_LONG_RESULT_COUNT, result.len()); 109 | }); 110 | } 111 | 112 | #[cfg(feature = "character")] 113 | fn long_character(bencher: &mut Bencher) { 114 | let text = fs::read_to_string(TXT_PATH).unwrap(); 115 | 116 | let text: Vec = text.chars().collect(); 117 | let pattern: Vec = PATTERN_LONG.chars().collect(); 118 | 119 | bencher.iter(|| { 120 | let result = character_search_char(&text, &pattern); 121 | 122 | assert_eq!(PATTERN_LONG_RESULT_COUNT, result.len()); 123 | }); 124 | } 125 | 126 | #[cfg(feature = "character")] 127 | benchmark_group!(long, long_naive, long_regex, long_bmb, long_character); 128 | 129 | #[cfg(not(feature = "character"))] 130 | benchmark_group!(long, long_naive, long_regex, long_bmb); 131 | 132 | fn not_exist_short_naive(bencher: &mut Bencher) { 133 | let text = fs::read_to_string(TXT_PATH).unwrap(); 134 | 135 | 
bencher.iter(|| { 136 | let result = naive_search(&text, NOT_EXIST_PATTERN_SHORT); 137 | 138 | assert_eq!(0, result.len()); 139 | }); 140 | } 141 | 142 | fn not_exist_short_regex(bencher: &mut Bencher) { 143 | let text = fs::read_to_string(TXT_PATH).unwrap(); 144 | 145 | bencher.iter(|| { 146 | let result = regex_search(&text, NOT_EXIST_PATTERN_SHORT); 147 | 148 | assert_eq!(0, result.len()); 149 | }); 150 | } 151 | 152 | fn not_exist_short_bmb(bencher: &mut Bencher) { 153 | let text = fs::read_to_string(TXT_PATH).unwrap(); 154 | 155 | bencher.iter(|| { 156 | let result = bmb_search(text.as_str(), NOT_EXIST_PATTERN_SHORT); 157 | 158 | assert_eq!(0, result.len()); 159 | }); 160 | } 161 | 162 | #[cfg(feature = "character")] 163 | fn not_exist_short_character(bencher: &mut Bencher) { 164 | let text = fs::read_to_string(TXT_PATH).unwrap(); 165 | 166 | let text: Vec = text.chars().collect(); 167 | let pattern: Vec = NOT_EXIST_PATTERN_SHORT.chars().collect(); 168 | 169 | bencher.iter(|| { 170 | let result = character_search_char(&text, &pattern); 171 | 172 | assert_eq!(0, result.len()); 173 | }); 174 | } 175 | 176 | #[cfg(feature = "character")] 177 | benchmark_group!( 178 | not_exist_short, 179 | not_exist_short_naive, 180 | not_exist_short_regex, 181 | not_exist_short_bmb, 182 | not_exist_short_character 183 | ); 184 | 185 | #[cfg(not(feature = "character"))] 186 | benchmark_group!( 187 | not_exist_short, 188 | not_exist_short_naive, 189 | not_exist_short_regex, 190 | not_exist_short_bmb 191 | ); 192 | 193 | fn not_exist_long_naive(bencher: &mut Bencher) { 194 | let text = fs::read_to_string(TXT_PATH).unwrap(); 195 | 196 | bencher.iter(|| { 197 | let result = naive_search(&text, NOT_EXIST_PATTERN_LONG); 198 | 199 | assert_eq!(0, result.len()); 200 | }); 201 | } 202 | 203 | fn not_exist_long_regex(bencher: &mut Bencher) { 204 | let text = fs::read_to_string(TXT_PATH).unwrap(); 205 | 206 | bencher.iter(|| { 207 | let result = regex_search(&text, NOT_EXIST_PATTERN_LONG); 
208 | 209 | assert_eq!(0, result.len()); 210 | }); 211 | } 212 | 213 | fn not_exist_long_bmb(bencher: &mut Bencher) { 214 | let text = fs::read_to_string(TXT_PATH).unwrap(); 215 | 216 | bencher.iter(|| { 217 | let result = bmb_search(text.as_str(), NOT_EXIST_PATTERN_LONG); 218 | 219 | assert_eq!(0, result.len()); 220 | }); 221 | } 222 | 223 | #[cfg(feature = "character")] 224 | fn not_exist_long_character(bencher: &mut Bencher) { 225 | let text = fs::read_to_string(TXT_PATH).unwrap(); 226 | 227 | let text: Vec = text.chars().collect(); 228 | let pattern: Vec = NOT_EXIST_PATTERN_LONG.chars().collect(); 229 | 230 | bencher.iter(|| { 231 | let result = character_search_char(&text, &pattern); 232 | 233 | assert_eq!(0, result.len()); 234 | }); 235 | } 236 | 237 | #[cfg(feature = "character")] 238 | benchmark_group!( 239 | not_exist_long, 240 | not_exist_long_naive, 241 | not_exist_long_regex, 242 | not_exist_long_bmb, 243 | not_exist_long_character 244 | ); 245 | 246 | #[cfg(not(feature = "character"))] 247 | benchmark_group!(not_exist_long, not_exist_long_naive, not_exist_long_regex, not_exist_long_bmb); 248 | 249 | benchmark_main!(short, long, not_exist_short, not_exist_long); 250 | -------------------------------------------------------------------------------- /benches/normal_text_search.rs: -------------------------------------------------------------------------------- 1 | mod normal_text_search_lib; 2 | 3 | use std::fs; 4 | 5 | use bencher::{benchmark_group, benchmark_main, Bencher}; 6 | use normal_text_search_lib::*; 7 | 8 | #[cfg(windows)] 9 | const TXT_PATH: &str = r"benches\data\vgilante.txt"; 10 | 11 | #[cfg(not(windows))] 12 | const TXT_PATH: &str = r"benches/data/vgilante.txt"; 13 | 14 | const PATTERN_SHORT: &str = "the"; 15 | const PATTERN_SHORT_RESULT_COUNT: usize = 5034; 16 | 17 | const PATTERN_LONG: &str = "Half the screen showed a graphic representation of what the 18 | scanners had picked up the other side showed an analysis of the 19 | same data. 
The graphics showed an irregular shaped lump fade 20 | on, stay several frames, then fade out. At the time the lump 21 | reminded on screen the analysis showed size about a quarter that 22 | of the ship they had seen and mass as undetermined."; 23 | const PATTERN_LONG_RESULT_COUNT: usize = 1; 24 | 25 | const NOT_EXIST_PATTERN_SHORT: &str = "xyz"; 26 | const NOT_EXIST_PATTERN_LONG: &str = "xyzabcdefghijklmnopqrstuvwzyz xyzabcdefghijklmnopqrstuvwzyz 27 | xyzabcdefghijklmnopqrstuvwzyz xyzabcdefghijklmnopqrstuvwzyz 28 | xyzabcdefghijklmnopqrstuvwzyz xyzabcdefghijklmnopqrstuvwzyz 29 | xyzabcdefghijklmnopqrstuvwzyz xyzabcdefghijklmnopqrstuvwzyz 30 | xyzabcdefghijklmnopqrstuvwzyz xyzabcdefghijklmnopqrstuvwzyz"; 31 | 32 | fn short_naive(bencher: &mut Bencher) { 33 | let text = fs::read_to_string(TXT_PATH).unwrap(); 34 | 35 | bencher.iter(|| { 36 | let result = naive_search(&text, PATTERN_SHORT); 37 | 38 | assert_eq!(PATTERN_SHORT_RESULT_COUNT, result.len()); 39 | }); 40 | } 41 | 42 | fn short_regex(bencher: &mut Bencher) { 43 | let text = fs::read_to_string(TXT_PATH).unwrap(); 44 | 45 | bencher.iter(|| { 46 | let result = regex_search(&text, PATTERN_SHORT); 47 | 48 | assert_eq!(PATTERN_SHORT_RESULT_COUNT, result.len()); 49 | }); 50 | } 51 | 52 | fn short_bmb(bencher: &mut Bencher) { 53 | let text = fs::read_to_string(TXT_PATH).unwrap(); 54 | 55 | bencher.iter(|| { 56 | let result = bmb_search(text.as_str(), PATTERN_SHORT); 57 | 58 | assert_eq!(PATTERN_SHORT_RESULT_COUNT, result.len()); 59 | }); 60 | } 61 | 62 | #[cfg(feature = "character")] 63 | fn short_character(bencher: &mut Bencher) { 64 | let text = fs::read_to_string(TXT_PATH).unwrap(); 65 | 66 | let text: Vec = text.chars().collect(); 67 | let pattern: Vec = PATTERN_SHORT.chars().collect(); 68 | 69 | bencher.iter(|| { 70 | let result = character_search_char(&text, &pattern); 71 | 72 | assert_eq!(PATTERN_SHORT_RESULT_COUNT, result.len()); 73 | }); 74 | } 75 | 76 | #[cfg(feature = "character")] 77 | 
benchmark_group!(short, short_naive, short_regex, short_bmb, short_character); 78 | 79 | #[cfg(not(feature = "character"))] 80 | benchmark_group!(short, short_naive, short_regex, short_bmb); 81 | 82 | fn long_naive(bencher: &mut Bencher) { 83 | let text = fs::read_to_string(TXT_PATH).unwrap(); 84 | 85 | bencher.iter(|| { 86 | let result = naive_search(&text, PATTERN_LONG); 87 | 88 | assert_eq!(PATTERN_LONG_RESULT_COUNT, result.len()); 89 | }); 90 | } 91 | 92 | fn long_regex(bencher: &mut Bencher) { 93 | let text = fs::read_to_string(TXT_PATH).unwrap(); 94 | 95 | bencher.iter(|| { 96 | let result = regex_search(&text, PATTERN_LONG); 97 | 98 | assert_eq!(PATTERN_LONG_RESULT_COUNT, result.len()); 99 | }); 100 | } 101 | 102 | fn long_bmb(bencher: &mut Bencher) { 103 | let text = fs::read_to_string(TXT_PATH).unwrap(); 104 | 105 | bencher.iter(|| { 106 | let result = bmb_search(text.as_str(), PATTERN_LONG); 107 | 108 | assert_eq!(PATTERN_LONG_RESULT_COUNT, result.len()); 109 | }); 110 | } 111 | 112 | #[cfg(feature = "character")] 113 | fn long_character(bencher: &mut Bencher) { 114 | let text = fs::read_to_string(TXT_PATH).unwrap(); 115 | 116 | let text: Vec = text.chars().collect(); 117 | let pattern: Vec = PATTERN_LONG.chars().collect(); 118 | 119 | bencher.iter(|| { 120 | let result = character_search_char(&text, &pattern); 121 | 122 | assert_eq!(PATTERN_LONG_RESULT_COUNT, result.len()); 123 | }); 124 | } 125 | 126 | #[cfg(feature = "character")] 127 | benchmark_group!(long, long_naive, long_regex, long_bmb, long_character); 128 | 129 | #[cfg(not(feature = "character"))] 130 | benchmark_group!(long, long_naive, long_regex, long_bmb); 131 | 132 | fn not_exist_short_naive(bencher: &mut Bencher) { 133 | let text = fs::read_to_string(TXT_PATH).unwrap(); 134 | 135 | bencher.iter(|| { 136 | let result = naive_search(&text, NOT_EXIST_PATTERN_SHORT); 137 | 138 | assert_eq!(0, result.len()); 139 | }); 140 | } 141 | 142 | fn not_exist_short_regex(bencher: &mut Bencher) { 143 | 
let text = fs::read_to_string(TXT_PATH).unwrap(); 144 | 145 | bencher.iter(|| { 146 | let result = regex_search(&text, NOT_EXIST_PATTERN_SHORT); 147 | 148 | assert_eq!(0, result.len()); 149 | }); 150 | } 151 | 152 | fn not_exist_short_bmb(bencher: &mut Bencher) { 153 | let text = fs::read_to_string(TXT_PATH).unwrap(); 154 | 155 | bencher.iter(|| { 156 | let result = bmb_search(text.as_str(), NOT_EXIST_PATTERN_SHORT); 157 | 158 | assert_eq!(0, result.len()); 159 | }); 160 | } 161 | 162 | #[cfg(feature = "character")] 163 | fn not_exist_short_character(bencher: &mut Bencher) { 164 | let text = fs::read_to_string(TXT_PATH).unwrap(); 165 | 166 | let text: Vec = text.chars().collect(); 167 | let pattern: Vec = NOT_EXIST_PATTERN_SHORT.chars().collect(); 168 | 169 | bencher.iter(|| { 170 | let result = character_search_char(&text, &pattern); 171 | 172 | assert_eq!(0, result.len()); 173 | }); 174 | } 175 | 176 | #[cfg(feature = "character")] 177 | benchmark_group!( 178 | not_exist_short, 179 | not_exist_short_naive, 180 | not_exist_short_regex, 181 | not_exist_short_bmb, 182 | not_exist_short_character 183 | ); 184 | 185 | #[cfg(not(feature = "character"))] 186 | benchmark_group!( 187 | not_exist_short, 188 | not_exist_short_naive, 189 | not_exist_short_regex, 190 | not_exist_short_bmb 191 | ); 192 | 193 | fn not_exist_long_naive(bencher: &mut Bencher) { 194 | let text = fs::read_to_string(TXT_PATH).unwrap(); 195 | 196 | bencher.iter(|| { 197 | let result = naive_search(&text, NOT_EXIST_PATTERN_LONG); 198 | 199 | assert_eq!(0, result.len()); 200 | }); 201 | } 202 | 203 | fn not_exist_long_regex(bencher: &mut Bencher) { 204 | let text = fs::read_to_string(TXT_PATH).unwrap(); 205 | 206 | bencher.iter(|| { 207 | let result = regex_search(&text, NOT_EXIST_PATTERN_LONG); 208 | 209 | assert_eq!(0, result.len()); 210 | }); 211 | } 212 | 213 | fn not_exist_long_bmb(bencher: &mut Bencher) { 214 | let text = fs::read_to_string(TXT_PATH).unwrap(); 215 | 216 | bencher.iter(|| { 217 | 
let result = bmb_search(text.as_str(), NOT_EXIST_PATTERN_LONG); 218 | 219 | assert_eq!(0, result.len()); 220 | }); 221 | } 222 | 223 | #[cfg(feature = "character")] 224 | fn not_exist_long_character(bencher: &mut Bencher) { 225 | let text = fs::read_to_string(TXT_PATH).unwrap(); 226 | 227 | let text: Vec = text.chars().collect(); 228 | let pattern: Vec = NOT_EXIST_PATTERN_LONG.chars().collect(); 229 | 230 | bencher.iter(|| { 231 | let result = character_search_char(&text, &pattern); 232 | 233 | assert_eq!(0, result.len()); 234 | }); 235 | } 236 | 237 | #[cfg(feature = "character")] 238 | benchmark_group!( 239 | not_exist_long, 240 | not_exist_long_naive, 241 | not_exist_long_regex, 242 | not_exist_long_bmb, 243 | not_exist_long_character 244 | ); 245 | 246 | #[cfg(not(feature = "character"))] 247 | benchmark_group!(not_exist_long, not_exist_long_naive, not_exist_long_regex, not_exist_long_bmb); 248 | 249 | benchmark_main!(short, long, not_exist_short, not_exist_long); 250 | -------------------------------------------------------------------------------- /src/byte.rs: -------------------------------------------------------------------------------- 1 | use alloc::{ 2 | fmt::{self, Debug, Formatter}, 3 | string::String, 4 | vec::Vec, 5 | }; 6 | use core::{ops::Deref, slice::Iter}; 7 | 8 | // TODO Searchable 9 | 10 | #[allow(clippy::len_without_is_empty)] 11 | pub trait BMByteSearchable { 12 | fn len(&self) -> usize; 13 | 14 | fn value_at(&self, index: usize) -> u8; 15 | 16 | fn iter(&self) -> Iter<'_, u8>; 17 | } 18 | 19 | impl BMByteSearchable for String { 20 | #[inline] 21 | fn len(&self) -> usize { 22 | String::len(self) 23 | } 24 | 25 | #[inline] 26 | fn value_at(&self, index: usize) -> u8 { 27 | self.as_bytes()[index] 28 | } 29 | 30 | #[inline] 31 | fn iter(&self) -> Iter<'_, u8> { 32 | self.as_bytes().iter() 33 | } 34 | } 35 | 36 | impl BMByteSearchable for &str { 37 | #[inline] 38 | fn len(&self) -> usize { 39 | str::len(self) 40 | } 41 | 42 | #[inline] 43 | fn 
// TODO Searchable

/// A byte sequence that can be used as the text or the pattern of a byte search.
#[allow(clippy::len_without_is_empty)]
pub trait BMByteSearchable {
    /// Returns the number of bytes in the sequence.
    fn len(&self) -> usize;

    /// Returns the byte at `index`. The implementations in this file index
    /// the underlying slice directly, so an out-of-range index panics.
    fn value_at(&self, index: usize) -> u8;

    /// Returns an iterator over the bytes, from first to last.
    fn iter(&self) -> Iter<'_, u8>;
}

impl BMByteSearchable for String {
    #[inline]
    fn len(&self) -> usize {
        String::len(self)
    }

    #[inline]
    fn value_at(&self, index: usize) -> u8 {
        self.as_bytes()[index]
    }

    #[inline]
    fn iter(&self) -> Iter<'_, u8> {
        self.as_bytes().iter()
    }
}

impl BMByteSearchable for &str {
    #[inline]
    fn len(&self) -> usize {
        str::len(self)
    }

    #[inline]
    fn value_at(&self, index: usize) -> u8 {
        // `as_bytes` is a safe view of the same bytes; no raw-pointer cast is needed.
        self.as_bytes()[index]
    }

    #[inline]
    fn iter(&self) -> Iter<'_, u8> {
        self.as_bytes().iter()
    }
}

impl BMByteSearchable for dyn Deref<Target = [u8]> {
    #[inline]
    fn len(&self) -> usize {
        <[u8]>::len(self)
    }

    #[inline]
    fn value_at(&self, index: usize) -> u8 {
        self[index]
    }

    #[inline]
    fn iter(&self) -> Iter<'_, u8> {
        <[u8]>::iter(self)
    }
}

impl BMByteSearchable for Vec<u8> {
    #[inline]
    fn len(&self) -> usize {
        Vec::len(self)
    }

    #[inline]
    fn value_at(&self, index: usize) -> u8 {
        self[index]
    }

    #[inline]
    fn iter(&self) -> Iter<'_, u8> {
        self.as_slice().iter()
    }
}

/// A reference to a searchable sequence is searchable too; every method
/// forwards to the referenced value.
impl<T: BMByteSearchable> BMByteSearchable for &T {
    #[inline]
    fn len(&self) -> usize {
        T::len(*self)
    }

    #[inline]
    fn value_at(&self, index: usize) -> u8 {
        T::value_at(*self, index)
    }

    #[inline]
    fn iter(&self) -> Iter<'_, u8> {
        T::iter(*self)
    }
}

// TODO BadCharShiftMap

/// Shift table used by the head-to-tail searches: one entry per byte value.
pub struct BMByteBadCharShiftMap {
    t: [usize; 256],
}

impl Debug for BMByteBadCharShiftMap {
    #[inline]
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        // The table is formatted as a slice.
        f.debug_struct("BMByteBadCharShiftMap").field("t", &&self.t[..]).finish()
    }
}

impl Deref for BMByteBadCharShiftMap {
    type Target = [usize];

    #[inline]
    fn deref(&self) -> &[usize] {
        &self.t
    }
}

/// Shift table used by the tail-to-head searches: one entry per byte value.
pub struct BMByteBadCharShiftMapRev {
    t: [usize; 256],
}

impl Debug for BMByteBadCharShiftMapRev {
    #[inline]
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        // The table is formatted as a slice.
        f.debug_struct("BMByteBadCharShiftMapRev").field("t", &&self.t[..]).finish()
    }
}

impl Deref for BMByteBadCharShiftMapRev {
    type Target = [usize];

    #[inline]
    fn deref(&self) -> &[usize] {
        &self.t
    }
}

impl BMByteBadCharShiftMap {
    /// Builds the head-to-tail shift table for `pattern`, or returns `None`
    /// when the pattern is empty.
    ///
    /// Every entry starts at the pattern length. Then each byte of the pattern
    /// except the last one gets the distance from its position to the last
    /// position; a byte that occurs more than once keeps the value of its
    /// right-most occurrence because later writes overwrite earlier ones.
    pub fn create_bad_char_shift_map<T: BMByteSearchable>(
        pattern: T,
    ) -> Option<BMByteBadCharShiftMap> {
        let pattern_len = pattern.len();

        if pattern_len == 0 {
            return None;
        }

        let pattern_len_dec = pattern_len - 1;

        let mut bad_char_shift_map = [pattern_len; 256];

        for (i, &c) in pattern.iter().take(pattern_len_dec).enumerate() {
            bad_char_shift_map[usize::from(c)] = pattern_len_dec - i;
        }

        Some(BMByteBadCharShiftMap {
            t: bad_char_shift_map
        })
    }
}

impl BMByteBadCharShiftMapRev {
    /// Builds the tail-to-head shift table for `pattern`, or returns `None`
    /// when the pattern is empty.
    ///
    /// Every entry starts at the pattern length. Then each byte of the pattern
    /// except the first one gets its own index; the bytes are visited from the
    /// last index downwards, so a byte that occurs more than once keeps the
    /// index of its left-most occurrence (ignoring index 0).
    pub fn create_bad_char_shift_map<T: BMByteSearchable>(
        pattern: T,
    ) -> Option<BMByteBadCharShiftMapRev> {
        let pattern_len = pattern.len();

        if pattern_len == 0 {
            return None;
        }

        let pattern_len_dec = pattern_len - 1;

        let mut bad_char_shift_map = [pattern_len; 256];

        for (i, &c) in pattern.iter().enumerate().rev().take(pattern_len_dec) {
            bad_char_shift_map[usize::from(c)] = i;
        }

        Some(BMByteBadCharShiftMapRev {
            t: bad_char_shift_map
        })
    }
}

// TODO BM

/// Using Boyer-Moore-MagicLen to search byte sub-sequences in any byte sequence, including self-synchronizing string encoding data such as UTF-8.
#[derive(Debug)]
pub struct BMByte {
    bad_char_shift_map:     BMByteBadCharShiftMap,
    bad_char_shift_map_rev: BMByteBadCharShiftMapRev,
    pattern:                Vec<u8>,
}

impl BMByte {
    /// Create a `BMByte` instance from a pattern (the needle).
    ///
    /// Returns `None` when the pattern is empty.
    ///
    /// ```
    /// use boyer_moore_magiclen::BMByte;
    ///
    /// let bmb = BMByte::from("oocoo").unwrap();
    /// ```
    pub fn from<T: BMByteSearchable>(pattern: T) -> Option<BMByte> {
        let bad_char_shift_map = BMByteBadCharShiftMap::create_bad_char_shift_map(&pattern)?;
        let bad_char_shift_map_rev = BMByteBadCharShiftMapRev::create_bad_char_shift_map(&pattern)?;

        Some(BMByte {
            bad_char_shift_map,
            bad_char_shift_map_rev,
            pattern: pattern.iter().copied().collect(),
        })
    }
}

// TODO Find Full

impl BMByte {
    /// Find and return the positions of all matched sub-sequences in any text (the haystack).
    ///
    /// ```
    /// use boyer_moore_magiclen::BMByte;
    ///
    /// let bmb = BMByte::from("oocoo").unwrap();
    ///
    /// assert_eq!(vec![1, 4, 7], bmb.find_full_all_in("coocoocoocoo"));
    /// ```
    pub fn find_full_all_in<T: BMByteSearchable>(&self, text: T) -> Vec<usize> {
        find_full(text, &self.pattern, &self.bad_char_shift_map, 0)
    }

    /// Find and return the positions of matched sub-sequences in any text (the haystack). If the `limit` is set to `0`, all sub-sequences will be found.
    ///
    /// ```
    /// use boyer_moore_magiclen::BMByte;
    ///
    /// let bmb = BMByte::from("oocoo").unwrap();
    ///
    /// assert_eq!(vec![1, 4], bmb.find_full_in("coocoocoocoo", 2));
    /// ```
    pub fn find_full_in<T: BMByteSearchable>(&self, text: T, limit: usize) -> Vec<usize> {
        find_full(text, &self.pattern, &self.bad_char_shift_map, limit)
    }
}

impl BMByte {
    /// Find and return the positions of all matched sub-sequences in any text (the haystack) from its tail to its head.
    ///
    /// ```
    /// use boyer_moore_magiclen::BMByte;
    ///
    /// let bmb = BMByte::from("oocoo").unwrap();
    ///
    /// assert_eq!(vec![7, 4, 1], bmb.rfind_full_all_in("coocoocoocoo"));
    /// ```
    pub fn rfind_full_all_in<T: BMByteSearchable>(&self, text: T) -> Vec<usize> {
        rfind_full(text, &self.pattern, &self.bad_char_shift_map_rev, 0)
    }

    /// Find and return the positions of matched sub-sequences in any text (the haystack) from its tail to its head. If the `limit` is set to `0`, all sub-sequences will be found.
    ///
    /// ```
    /// use boyer_moore_magiclen::BMByte;
    ///
    /// let bmb = BMByte::from("oocoo").unwrap();
    ///
    /// assert_eq!(vec![7, 4], bmb.rfind_full_in("coocoocoocoo", 2));
    /// ```
    pub fn rfind_full_in<T: BMByteSearchable>(&self, text: T, limit: usize) -> Vec<usize> {
        rfind_full(text, &self.pattern, &self.bad_char_shift_map_rev, limit)
    }
}

/// Shared head-to-tail scan behind `find_full` and `find`.
///
/// `shift` is the start index of the current window. After a match the window
/// moves by the table-derived amount when `overlapping` is `true`, and by the
/// whole pattern length otherwise. After a mismatch it always moves by the
/// table-derived amount.
fn scan_forward<TT: BMByteSearchable, TP: BMByteSearchable>(
    text: TT,
    pattern: TP,
    bad_char_shift_map: &BMByteBadCharShiftMap,
    limit: usize,
    overlapping: bool,
) -> Vec<usize> {
    let text_len = text.len();
    let pattern_len = pattern.len();

    if text_len == 0 || pattern_len == 0 || text_len < pattern_len {
        return Vec::new();
    }

    let pattern_len_dec = pattern_len - 1;
    let last_pattern_char = pattern.value_at(pattern_len_dec);

    // Start index of the last window that still fits into the text.
    let end_index = text_len - pattern_len;

    let mut shift = 0;
    let mut result = Vec::new();

    loop {
        // The window is compared from its last byte back to its first one.
        let matched =
            pattern.iter().enumerate().rev().all(|(i, &pc)| text.value_at(shift + i) == pc);

        if matched {
            result.push(shift);

            // `limit == 0` never equals a non-empty length, so it means "no limit".
            if result.len() == limit {
                break;
            }
        }

        // There is no byte after the last window, so it cannot be used for a shift.
        if shift == end_index {
            break;
        }

        shift += if matched && !overlapping {
            pattern_len
        } else {
            // Candidate 1: table entry of the text byte under the window's last position.
            let by_last = bad_char_shift_map[usize::from(text.value_at(shift + pattern_len_dec))];

            // Candidate 2: derived from the text byte right after the window.
            let next = text.value_at(shift + pattern_len);
            let by_next = if next == last_pattern_char {
                1
            } else {
                bad_char_shift_map[usize::from(next)] + 1
            };

            by_last.max(by_next)
        };

        if shift > end_index {
            break;
        }
    }

    result
}

/// Shared tail-to-head scan behind `rfind_full` and `rfind`.
///
/// `shift` is the index of the *last* byte of the current window; reported
/// positions are window start indices. After a match the window moves by the
/// table-derived amount when `overlapping` is `true`, and by the whole pattern
/// length otherwise. After a mismatch it always moves by the table-derived
/// amount.
fn scan_backward<TT: BMByteSearchable, TP: BMByteSearchable>(
    text: TT,
    pattern: TP,
    bad_char_shift_map: &BMByteBadCharShiftMapRev,
    limit: usize,
    overlapping: bool,
) -> Vec<usize> {
    let text_len = text.len();
    let pattern_len = pattern.len();

    if text_len == 0 || pattern_len == 0 || text_len < pattern_len {
        return Vec::new();
    }

    let first_pattern_char = pattern.value_at(0);

    // Smallest value `shift` may take: the window then starts at index 0.
    let start_index = pattern_len - 1;

    let mut shift = text_len - 1;
    let mut result = Vec::new();

    loop {
        let window_start = shift - start_index;

        // The window is compared from its first byte to its last one.
        let matched =
            pattern.iter().enumerate().all(|(i, &pc)| text.value_at(window_start + i) == pc);

        if matched {
            result.push(window_start);

            // `limit == 0` never equals a non-empty length, so it means "no limit".
            if result.len() == limit {
                break;
            }
        }

        // There is no byte before the first window, so it cannot be used for a shift.
        if shift == start_index {
            break;
        }

        let step = if matched && !overlapping {
            pattern_len
        } else {
            // Candidate 1: table entry of the text byte under the window's first position.
            let by_first = bad_char_shift_map[usize::from(text.value_at(window_start))];

            // Candidate 2: derived from the text byte right before the window.
            let previous = text.value_at(window_start - 1);
            let by_previous = if previous == first_pattern_char {
                1
            } else {
                bad_char_shift_map[usize::from(previous)] + 1
            };

            by_first.max(by_previous)
        };

        // Guard against unsigned underflow before moving towards the head.
        if shift < step {
            break;
        }

        shift -= step;

        if shift < start_index {
            break;
        }
    }

    result
}

/// Searches `text` for `pattern` from head to tail and returns the start
/// positions of all matches, overlapping ones included, in ascending order.
///
/// At most `limit` positions are returned; `limit == 0` means no limit. An
/// empty vector is returned when the text or the pattern is empty or the
/// pattern is longer than the text. The methods of `BMByte` pass the table
/// built from the same pattern; this function does not check that.
pub fn find_full<TT: BMByteSearchable, TP: BMByteSearchable>(
    text: TT,
    pattern: TP,
    bad_char_shift_map: &BMByteBadCharShiftMap,
    limit: usize,
) -> Vec<usize> {
    scan_forward(text, pattern, bad_char_shift_map, limit, true)
}

/// Searches `text` for `pattern` from tail to head and returns the start
/// positions of all matches, overlapping ones included, in descending order.
///
/// At most `limit` positions are returned; `limit == 0` means no limit. An
/// empty vector is returned when the text or the pattern is empty or the
/// pattern is longer than the text. The methods of `BMByte` pass the table
/// built from the same pattern; this function does not check that.
pub fn rfind_full<TT: BMByteSearchable, TP: BMByteSearchable>(
    text: TT,
    pattern: TP,
    bad_char_shift_map: &BMByteBadCharShiftMapRev,
    limit: usize,
) -> Vec<usize> {
    scan_backward(text, pattern, bad_char_shift_map, limit, true)
}

// TODO Find

impl BMByte {
    /// Find and return the positions of all matched sub-sequences in any text (the haystack) but not including the overlap.
    ///
    /// ```
    /// use boyer_moore_magiclen::BMByte;
    ///
    /// let bmb = BMByte::from("oocoo").unwrap();
    ///
    /// assert_eq!(vec![1, 7], bmb.find_all_in("coocoocoocoo"));
    /// ```
    pub fn find_all_in<T: BMByteSearchable>(&self, text: T) -> Vec<usize> {
        find(text, &self.pattern, &self.bad_char_shift_map, 0)
    }

    /// Find and return the position of the first matched sub-sequence in any text (the haystack).
    ///
    /// ```
    /// use boyer_moore_magiclen::BMByte;
    ///
    /// let bmb = BMByte::from("oocoo").unwrap();
    ///
    /// assert_eq!(Some(1), bmb.find_first_in("coocoocoocoo"));
    /// ```
    pub fn find_first_in<T: BMByteSearchable>(&self, text: T) -> Option<usize> {
        find(text, &self.pattern, &self.bad_char_shift_map, 1).first().copied()
    }

    /// Find and return the positions of matched sub-sequences in any text (the haystack) but not including the overlap. If the `limit` is set to `0`, all sub-sequences will be found.
    ///
    /// ```
    /// use boyer_moore_magiclen::BMByte;
    ///
    /// let bmb = BMByte::from("oocoo").unwrap();
    ///
    /// assert_eq!(vec![1], bmb.find_in("coocoocoocoo", 1));
    /// ```
    pub fn find_in<T: BMByteSearchable>(&self, text: T, limit: usize) -> Vec<usize> {
        find(text, &self.pattern, &self.bad_char_shift_map, limit)
    }
}

impl BMByte {
    /// Find and return the positions of all matched sub-sequences in any text (the haystack) but not including the overlap from its tail to its head.
    ///
    /// ```
    /// use boyer_moore_magiclen::BMByte;
    ///
    /// let bmb = BMByte::from("oocoo").unwrap();
    ///
    /// assert_eq!(vec![7, 1], bmb.rfind_all_in("coocoocoocoo"));
    /// ```
    pub fn rfind_all_in<T: BMByteSearchable>(&self, text: T) -> Vec<usize> {
        rfind(text, &self.pattern, &self.bad_char_shift_map_rev, 0)
    }

    /// Find and return the position of the first matched sub-sequence in any text (the haystack) from its tail to its head.
    ///
    /// ```
    /// use boyer_moore_magiclen::BMByte;
    ///
    /// let bmb = BMByte::from("oocoo").unwrap();
    ///
    /// assert_eq!(Some(7), bmb.rfind_first_in("coocoocoocoo"));
    /// ```
    pub fn rfind_first_in<T: BMByteSearchable>(&self, text: T) -> Option<usize> {
        rfind(text, &self.pattern, &self.bad_char_shift_map_rev, 1).first().copied()
    }

    /// Find and return the positions of matched sub-sequences in any text (the haystack) but not including the overlap from its tail to its head. If the `limit` is set to `0`, all sub-sequences will be found.
    ///
    /// ```
    /// use boyer_moore_magiclen::BMByte;
    ///
    /// let bmb = BMByte::from("oocoo").unwrap();
    ///
    /// assert_eq!(vec![7], bmb.rfind_in("coocoocoocoo", 1));
    /// ```
    pub fn rfind_in<T: BMByteSearchable>(&self, text: T, limit: usize) -> Vec<usize> {
        rfind(text, &self.pattern, &self.bad_char_shift_map_rev, limit)
    }
}

/// Searches `text` for `pattern` from head to tail and returns the start
/// positions of the matches in ascending order; after a match the scan resumes
/// behind it, so reported matches never overlap.
///
/// At most `limit` positions are returned; `limit == 0` means no limit. An
/// empty vector is returned when the text or the pattern is empty or the
/// pattern is longer than the text. The methods of `BMByte` pass the table
/// built from the same pattern; this function does not check that.
pub fn find<TT: BMByteSearchable, TP: BMByteSearchable>(
    text: TT,
    pattern: TP,
    bad_char_shift_map: &BMByteBadCharShiftMap,
    limit: usize,
) -> Vec<usize> {
    scan_forward(text, pattern, bad_char_shift_map, limit, false)
}

/// Searches `text` for `pattern` from tail to head and returns the start
/// positions of the matches in descending order; after a match the scan
/// resumes in front of it, so reported matches never overlap.
///
/// At most `limit` positions are returned; `limit == 0` means no limit. An
/// empty vector is returned when the text or the pattern is empty or the
/// pattern is longer than the text. The methods of `BMByte` pass the table
/// built from the same pattern; this function does not check that.
pub fn rfind<TT: BMByteSearchable, TP: BMByteSearchable>(
    text: TT,
    pattern: TP,
    bad_char_shift_map: &BMByteBadCharShiftMapRev,
    limit: usize,
) -> Vec<usize> {
    scan_backward(text, pattern, bad_char_shift_map, limit, false)
}
// TODO Searchable

/// A character sequence that can be used as the text or the pattern of a character search.
#[allow(clippy::len_without_is_empty)]
pub trait BMCharacterSearchable {
    /// Returns the number of characters in the sequence.
    fn len(&self) -> usize;

    /// Returns the character at `index`. The implementations in this file
    /// index the underlying slice directly, so an out-of-range index panics.
    fn value_at(&self, index: usize) -> char;

    /// Returns an iterator over the characters, from first to last.
    fn iter(&self) -> Iter<'_, char>;
}

impl BMCharacterSearchable for dyn Deref<Target = [char]> {
    #[inline]
    fn len(&self) -> usize {
        <[char]>::len(self)
    }

    #[inline]
    fn value_at(&self, index: usize) -> char {
        self[index]
    }

    #[inline]
    fn iter(&self) -> Iter<'_, char> {
        <[char]>::iter(self)
    }
}

impl BMCharacterSearchable for Vec<char> {
    #[inline]
    fn len(&self) -> usize {
        Vec::len(self)
    }

    #[inline]
    fn value_at(&self, index: usize) -> char {
        self[index]
    }

    #[inline]
    fn iter(&self) -> Iter<'_, char> {
        self.as_slice().iter()
    }
}

/// A reference to a searchable sequence is searchable too; every method
/// forwards to the referenced value.
impl<T: BMCharacterSearchable> BMCharacterSearchable for &T {
    #[inline]
    fn len(&self) -> usize {
        T::len(*self)
    }

    #[inline]
    fn value_at(&self, index: usize) -> char {
        T::value_at(*self, index)
    }

    #[inline]
    fn iter(&self) -> Iter<'_, char> {
        T::iter(*self)
    }
}

// TODO BadCharShiftMap

/// Shift table used by the head-to-tail searches. Only characters of the
/// pattern have an entry.
#[derive(Debug)]
pub struct BMCharacterBadCharShiftMap {
    t: HashMap<char, usize>,
}

impl Deref for BMCharacterBadCharShiftMap {
    type Target = HashMap<char, usize>;

    #[inline]
    fn deref(&self) -> &HashMap<char, usize> {
        &self.t
    }
}

/// Shift table used by the tail-to-head searches. Only characters of the
/// pattern have an entry.
#[derive(Debug)]
pub struct BMCharacterBadCharShiftMapRev {
    t: HashMap<char, usize>,
}

impl Deref for BMCharacterBadCharShiftMapRev {
    type Target = HashMap<char, usize>;

    #[inline]
    fn deref(&self) -> &HashMap<char, usize> {
        &self.t
    }
}

impl BMCharacterBadCharShiftMap {
    /// Builds the head-to-tail shift table for `pattern`, or returns `None`
    /// when the pattern is empty.
    ///
    /// Each character of the pattern except the last one is mapped to the
    /// distance from its position to the last position; a character that
    /// occurs more than once keeps the value of its right-most occurrence
    /// because later inserts overwrite earlier ones.
    pub fn create_bad_char_shift_map<T: BMCharacterSearchable>(
        pattern: T,
    ) -> Option<BMCharacterBadCharShiftMap> {
        let pattern_len = pattern.len();

        if pattern_len == 0 {
            return None;
        }

        let pattern_len_dec = pattern_len - 1;

        let mut bad_char_shift_map: HashMap<char, usize> = HashMap::with_capacity(pattern_len_dec);

        for (i, c) in pattern.iter().copied().take(pattern_len_dec).enumerate() {
            bad_char_shift_map.insert(c, pattern_len_dec - i);
        }

        Some(BMCharacterBadCharShiftMap {
            t: bad_char_shift_map
        })
    }
}

impl BMCharacterBadCharShiftMapRev {
    /// Builds the tail-to-head shift table for `pattern`, or returns `None`
    /// when the pattern is empty.
    ///
    /// Each character of the pattern except the first one is mapped to its own
    /// index; the characters are visited from the last index downwards, so a
    /// character that occurs more than once keeps the index of its left-most
    /// occurrence (ignoring index 0).
    pub fn create_bad_char_shift_map<T: BMCharacterSearchable>(
        pattern: T,
    ) -> Option<BMCharacterBadCharShiftMapRev> {
        let pattern_len = pattern.len();

        if pattern_len == 0 {
            return None;
        }

        let pattern_len_dec = pattern_len - 1;

        let mut bad_char_shift_map: HashMap<char, usize> = HashMap::with_capacity(pattern_len_dec);

        for (i, c) in pattern.iter().copied().enumerate().rev().take(pattern_len_dec) {
            bad_char_shift_map.insert(c, i);
        }

        Some(BMCharacterBadCharShiftMapRev {
            t: bad_char_shift_map
        })
    }
}

// TODO BM

/// Using Boyer-Moore-MagicLen to search character sub-sequences in any character sequence.
#[derive(Debug)]
pub struct BMCharacter {
    bad_char_shift_map:     BMCharacterBadCharShiftMap,
    bad_char_shift_map_rev: BMCharacterBadCharShiftMapRev,
    pattern:                Vec<char>,
}

impl BMCharacter {
    /// Create a `BMCharacter` instance from a pattern (the search needle).
    ///
    /// Returns `None` when the pattern is empty.
    ///
    /// ```
    /// use boyer_moore_magiclen::BMCharacter;
    ///
    /// let bmc = BMCharacter::from(vec!['o', 'o', 'c', 'o', 'o']).unwrap();
    /// ```
    pub fn from<T: BMCharacterSearchable>(pattern: T) -> Option<BMCharacter> {
        let bad_char_shift_map = BMCharacterBadCharShiftMap::create_bad_char_shift_map(&pattern)?;
        let bad_char_shift_map_rev =
            BMCharacterBadCharShiftMapRev::create_bad_char_shift_map(&pattern)?;

        Some(BMCharacter {
            bad_char_shift_map,
            bad_char_shift_map_rev,
            pattern: pattern.iter().copied().collect(),
        })
    }
}
179 | /// 180 | /// ``` 181 | /// use boyer_moore_magiclen::BMCharacter; 182 | /// 183 | /// let bmc = BMCharacter::from(vec!['o', 'o', 'c', 'o', 'o']).unwrap(); 184 | /// 185 | /// assert_eq!( 186 | /// vec![1, 4, 7], 187 | /// bmc.find_full_all_in(vec![ 188 | /// 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o' 189 | /// ]) 190 | /// ); 191 | /// ``` 192 | pub fn find_full_all_in(&self, text: T) -> Vec { 193 | find_full(text, &self.pattern, &self.bad_char_shift_map, 0) 194 | } 195 | 196 | /// Find and return the positions of matched sub-sequences in any text (the haystack). If the `limit` is set to `0`, all sub-sequences will be found. 197 | /// 198 | /// ``` 199 | /// use boyer_moore_magiclen::BMCharacter; 200 | /// 201 | /// let bmc = BMCharacter::from(vec!['o', 'o', 'c', 'o', 'o']).unwrap(); 202 | /// 203 | /// assert_eq!( 204 | /// vec![1, 4], 205 | /// bmc.find_full_in( 206 | /// vec!['c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o'], 207 | /// 2 208 | /// ) 209 | /// ); 210 | /// ``` 211 | pub fn find_full_in(&self, text: T, limit: usize) -> Vec { 212 | find_full(text, &self.pattern, &self.bad_char_shift_map, limit) 213 | } 214 | } 215 | 216 | impl BMCharacter { 217 | /// Find and return the positions of all matched sub-sequences in any text (the haystack) from its tail to its head. 218 | /// 219 | /// ``` 220 | /// use boyer_moore_magiclen::BMCharacter; 221 | /// 222 | /// let bmc = BMCharacter::from(vec!['o', 'o', 'c', 'o', 'o']).unwrap(); 223 | /// 224 | /// assert_eq!( 225 | /// vec![7, 4, 1], 226 | /// bmc.rfind_full_all_in(vec![ 227 | /// 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o' 228 | /// ]) 229 | /// ); 230 | /// ``` 231 | pub fn rfind_full_all_in(&self, text: T) -> Vec { 232 | rfind_full(text, &self.pattern, &self.bad_char_shift_map_rev, 0) 233 | } 234 | 235 | /// Find and return the positions of matched sub-sequences in any text (the haystack) from its tail to its head. 
If the `limit` is set to `0`, all sub-sequences will be found. 236 | /// 237 | /// ``` 238 | /// use boyer_moore_magiclen::BMCharacter; 239 | /// 240 | /// let bmc = BMCharacter::from(vec!['o', 'o', 'c', 'o', 'o']).unwrap(); 241 | /// 242 | /// assert_eq!( 243 | /// vec![7, 4], 244 | /// bmc.rfind_full_in( 245 | /// vec!['c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o'], 246 | /// 2 247 | /// ) 248 | /// ); 249 | /// ``` 250 | pub fn rfind_full_in(&self, text: T, limit: usize) -> Vec { 251 | rfind_full(text, &self.pattern, &self.bad_char_shift_map_rev, limit) 252 | } 253 | } 254 | 255 | pub fn find_full( 256 | text: TT, 257 | pattern: TP, 258 | bad_char_shift_map: &BMCharacterBadCharShiftMap, 259 | limit: usize, 260 | ) -> Vec { 261 | let text_len = text.len(); 262 | let pattern_len = pattern.len(); 263 | 264 | if text_len == 0 || pattern_len == 0 || text_len < pattern_len { 265 | return vec![]; 266 | } 267 | 268 | let pattern_len_dec = pattern_len - 1; 269 | let pattern_len_inc = pattern_len + 1; 270 | 271 | let last_pattern_char = pattern.value_at(pattern_len_dec); 272 | 273 | let mut shift = 0; 274 | 275 | let end_index = text_len - pattern_len; 276 | 277 | let mut result = vec![]; 278 | 279 | 'outer: loop { 280 | for (i, pc) in pattern.iter().copied().enumerate().rev() { 281 | if text.value_at(shift + i) != pc { 282 | let p = shift + pattern_len; 283 | if p == text_len { 284 | break 'outer; 285 | } 286 | shift += bad_char_shift_map 287 | .get(&text.value_at(shift + pattern_len_dec)) 288 | .copied() 289 | .unwrap_or(pattern_len) 290 | .max({ 291 | let c = text.value_at(p); 292 | 293 | if c == last_pattern_char { 294 | 1 295 | } else { 296 | bad_char_shift_map.get(&c).map(|&c| c + 1).unwrap_or(pattern_len_inc) 297 | } 298 | }); 299 | if shift > end_index { 300 | break 'outer; 301 | } 302 | continue 'outer; 303 | } 304 | } 305 | result.push(shift); 306 | 307 | if shift == end_index { 308 | break; 309 | } 310 | 311 | if result.len() == limit { 312 | 
break; 313 | } 314 | 315 | shift += bad_char_shift_map 316 | .get(&text.value_at(shift + pattern_len_dec)) 317 | .copied() 318 | .unwrap_or(pattern_len) 319 | .max({ 320 | let c = text.value_at(shift + pattern_len); 321 | 322 | if c == last_pattern_char { 323 | 1 324 | } else { 325 | bad_char_shift_map.get(&c).map(|&c| c + 1).unwrap_or(pattern_len_inc) 326 | } 327 | }); 328 | if shift > end_index { 329 | break; 330 | } 331 | } 332 | 333 | result 334 | } 335 | 336 | pub fn rfind_full( 337 | text: TT, 338 | pattern: TP, 339 | bad_char_shift_map: &BMCharacterBadCharShiftMapRev, 340 | limit: usize, 341 | ) -> Vec { 342 | let text_len = text.len(); 343 | let pattern_len = pattern.len(); 344 | 345 | if text_len == 0 || pattern_len == 0 || text_len < pattern_len { 346 | return vec![]; 347 | } 348 | 349 | let pattern_len_dec = pattern_len - 1; 350 | let pattern_len_inc = pattern_len + 1; 351 | 352 | let first_pattern_char = pattern.value_at(0); 353 | 354 | let mut shift = text_len - 1; 355 | 356 | let start_index = pattern_len_dec; 357 | 358 | let mut result = vec![]; 359 | 360 | 'outer: loop { 361 | for (i, pc) in pattern.iter().copied().enumerate() { 362 | if text.value_at(shift - pattern_len_dec + i) != pc { 363 | if shift < pattern_len { 364 | break 'outer; 365 | } 366 | let s = bad_char_shift_map 367 | .get(&text.value_at(shift - pattern_len_dec)) 368 | .copied() 369 | .unwrap_or(pattern_len) 370 | .max({ 371 | let c = text.value_at(shift - pattern_len); 372 | 373 | if c == first_pattern_char { 374 | 1 375 | } else { 376 | bad_char_shift_map.get(&c).map(|&c| c + 1).unwrap_or(pattern_len_inc) 377 | } 378 | }); 379 | if shift < s { 380 | break 'outer; 381 | } 382 | shift -= s; 383 | if shift < start_index { 384 | break 'outer; 385 | } 386 | continue 'outer; 387 | } 388 | } 389 | result.push(shift - pattern_len_dec); 390 | 391 | if shift == start_index { 392 | break; 393 | } 394 | 395 | if result.len() == limit { 396 | break; 397 | } 398 | 399 | let s = 
bad_char_shift_map 400 | .get(&text.value_at(shift - pattern_len_dec)) 401 | .copied() 402 | .unwrap_or(pattern_len) 403 | .max({ 404 | let c = text.value_at(shift - pattern_len); 405 | 406 | if c == first_pattern_char { 407 | 1 408 | } else { 409 | bad_char_shift_map.get(&c).map(|&c| c + 1).unwrap_or(pattern_len_inc) 410 | } 411 | }); 412 | if shift < s { 413 | break; 414 | } 415 | shift -= s; 416 | if shift < start_index { 417 | break; 418 | } 419 | } 420 | 421 | result 422 | } 423 | 424 | // TODO Find 425 | 426 | impl BMCharacter { 427 | /// Find and return the positions of all matched sub-sequences in any text (the haystack) but not including the overlap. 428 | /// 429 | /// ``` 430 | /// use boyer_moore_magiclen::BMCharacter; 431 | /// 432 | /// let bmc = BMCharacter::from(vec!['o', 'o', 'c', 'o', 'o']).unwrap(); 433 | /// 434 | /// assert_eq!( 435 | /// vec![1, 7], 436 | /// bmc.find_all_in(vec![ 437 | /// 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o' 438 | /// ]) 439 | /// ); 440 | /// ``` 441 | pub fn find_all_in(&self, text: T) -> Vec { 442 | find(text, &self.pattern, &self.bad_char_shift_map, 0) 443 | } 444 | 445 | /// Find and return the position of the first matched sub-sequence in any text (the haystack). 446 | /// 447 | /// ``` 448 | /// use boyer_moore_magiclen::BMCharacter; 449 | /// 450 | /// let bmc = BMCharacter::from(vec!['o', 'o', 'c', 'o', 'o']).unwrap(); 451 | /// 452 | /// assert_eq!( 453 | /// Some(1), 454 | /// bmc.find_first_in(vec![ 455 | /// 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o' 456 | /// ]) 457 | /// ); 458 | /// ``` 459 | pub fn find_first_in(&self, text: T) -> Option { 460 | find(text, &self.pattern, &self.bad_char_shift_map, 1).first().copied() 461 | } 462 | 463 | /// Find and return the positions of matched sub-sequences in any text (the haystack) but not including the overlap. If the `limit` is set to `0`, all sub-sequences will be found. 
464 | /// 465 | /// ``` 466 | /// use boyer_moore_magiclen::BMCharacter; 467 | /// 468 | /// let bmc = BMCharacter::from(vec!['o', 'o', 'c', 'o', 'o']).unwrap(); 469 | /// 470 | /// assert_eq!( 471 | /// vec![1], 472 | /// bmc.find_in( 473 | /// vec!['c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o'], 474 | /// 1 475 | /// ) 476 | /// ); 477 | /// ``` 478 | pub fn find_in(&self, text: T, limit: usize) -> Vec { 479 | find(text, &self.pattern, &self.bad_char_shift_map, limit) 480 | } 481 | } 482 | 483 | impl BMCharacter { 484 | /// Find and return the positions of all matched sub-sequences in any text (the haystack) but not including the overlap from its tail to its head. 485 | /// 486 | /// ``` 487 | /// use boyer_moore_magiclen::BMCharacter; 488 | /// 489 | /// let bmc = BMCharacter::from(vec!['o', 'o', 'c', 'o', 'o']).unwrap(); 490 | /// 491 | /// assert_eq!( 492 | /// vec![7, 1], 493 | /// bmc.rfind_all_in(vec![ 494 | /// 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o' 495 | /// ]) 496 | /// ); 497 | /// ``` 498 | pub fn rfind_all_in(&self, text: T) -> Vec { 499 | rfind(text, &self.pattern, &self.bad_char_shift_map_rev, 0) 500 | } 501 | 502 | /// Find and return the position of the first matched sub-sequence in any text (the haystack) from its tail to its head. 503 | /// 504 | /// ``` 505 | /// use boyer_moore_magiclen::BMCharacter; 506 | /// 507 | /// let bmc = BMCharacter::from(vec!['o', 'o', 'c', 'o', 'o']).unwrap(); 508 | /// 509 | /// assert_eq!( 510 | /// Some(7), 511 | /// bmc.rfind_first_in(vec![ 512 | /// 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o' 513 | /// ]) 514 | /// ); 515 | /// ``` 516 | pub fn rfind_first_in(&self, text: T) -> Option { 517 | rfind(text, &self.pattern, &self.bad_char_shift_map_rev, 1).first().copied() 518 | } 519 | 520 | /// Find and return the positions of matched sub-sequences in any text (the haystack) but not including the overlap from its tail to its head. 
If the `limit` is set to `0`, all sub-sequences will be found. 521 | /// 522 | /// ``` 523 | /// use boyer_moore_magiclen::BMCharacter; 524 | /// 525 | /// let bmc = BMCharacter::from(vec!['o', 'o', 'c', 'o', 'o']).unwrap(); 526 | /// 527 | /// assert_eq!( 528 | /// vec![7], 529 | /// bmc.rfind_in( 530 | /// vec!['c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o', 'c', 'o', 'o'], 531 | /// 1 532 | /// ) 533 | /// ); 534 | /// ``` 535 | pub fn rfind_in(&self, text: T, limit: usize) -> Vec { 536 | rfind(text, &self.pattern, &self.bad_char_shift_map_rev, limit) 537 | } 538 | } 539 | 540 | pub fn find( 541 | text: TT, 542 | pattern: TP, 543 | bad_char_shift_map: &BMCharacterBadCharShiftMap, 544 | limit: usize, 545 | ) -> Vec { 546 | let text_len = text.len(); 547 | let pattern_len = pattern.len(); 548 | 549 | if text_len == 0 || pattern_len == 0 || text_len < pattern_len { 550 | return vec![]; 551 | } 552 | 553 | let pattern_len_dec = pattern_len - 1; 554 | let pattern_len_inc = pattern_len + 1; 555 | 556 | let last_pattern_char = pattern.value_at(pattern_len_dec); 557 | 558 | let mut shift = 0; 559 | 560 | let end_index = text_len - pattern_len; 561 | 562 | let mut result = vec![]; 563 | 564 | 'outer: loop { 565 | for (i, pc) in pattern.iter().copied().enumerate().rev() { 566 | if text.value_at(shift + i) != pc { 567 | let p = shift + pattern_len; 568 | if p == text_len { 569 | break 'outer; 570 | } 571 | shift += bad_char_shift_map 572 | .get(&text.value_at(shift + pattern_len_dec)) 573 | .copied() 574 | .unwrap_or(pattern_len) 575 | .max({ 576 | let c = text.value_at(p); 577 | 578 | if c == last_pattern_char { 579 | 1 580 | } else { 581 | bad_char_shift_map.get(&c).map(|&c| c + 1).unwrap_or(pattern_len_inc) 582 | } 583 | }); 584 | if shift > end_index { 585 | break 'outer; 586 | } 587 | continue 'outer; 588 | } 589 | } 590 | result.push(shift); 591 | 592 | if shift == end_index { 593 | break; 594 | } 595 | 596 | if result.len() == limit { 597 | break; 598 | } 599 | 600 | 
shift += pattern_len; 601 | if shift > end_index { 602 | break; 603 | } 604 | } 605 | 606 | result 607 | } 608 | 609 | pub fn rfind( 610 | text: TT, 611 | pattern: TP, 612 | bad_char_shift_map: &BMCharacterBadCharShiftMapRev, 613 | limit: usize, 614 | ) -> Vec { 615 | let text_len = text.len(); 616 | let pattern_len = pattern.len(); 617 | 618 | if text_len == 0 || pattern_len == 0 || text_len < pattern_len { 619 | return vec![]; 620 | } 621 | 622 | let pattern_len_dec = pattern_len - 1; 623 | let pattern_len_inc = pattern_len + 1; 624 | 625 | let first_pattern_char = pattern.value_at(0); 626 | 627 | let mut shift = text_len - 1; 628 | 629 | let start_index = pattern_len_dec; 630 | 631 | let mut result = vec![]; 632 | 633 | 'outer: loop { 634 | for (i, pc) in pattern.iter().copied().enumerate() { 635 | if text.value_at(shift - pattern_len_dec + i) != pc { 636 | if shift < pattern_len { 637 | break 'outer; 638 | } 639 | let s = bad_char_shift_map 640 | .get(&text.value_at(shift - pattern_len_dec)) 641 | .copied() 642 | .unwrap_or(pattern_len) 643 | .max({ 644 | let c = text.value_at(shift - pattern_len); 645 | 646 | if c == first_pattern_char { 647 | 1 648 | } else { 649 | bad_char_shift_map.get(&c).map(|&c| c + 1).unwrap_or(pattern_len_inc) 650 | } 651 | }); 652 | if shift < s { 653 | break 'outer; 654 | } 655 | shift -= s; 656 | if shift < start_index { 657 | break 'outer; 658 | } 659 | continue 'outer; 660 | } 661 | } 662 | result.push(shift - pattern_len_dec); 663 | 664 | if shift == start_index { 665 | break; 666 | } 667 | 668 | if result.len() == limit { 669 | break; 670 | } 671 | 672 | shift -= pattern_len; 673 | if shift < start_index { 674 | break; 675 | } 676 | } 677 | 678 | result 679 | } 680 | --------------------------------------------------------------------------------