├── .github
│   ├── FUNDING.yml
│   └── workflows
│       └── ci.yml
├── .gitignore
├── COPYING
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
├── bench
│   ├── .gitignore
│   ├── Cargo.toml
│   ├── data
│   │   ├── opensubtitles2018-en-huge-ascii.txt
│   │   ├── opensubtitles2018-en-small-ascii.txt
│   │   ├── opensubtitles2018-en-tiny-ascii.txt
│   │   ├── opensubtitles2018-ru-huge-utf8.txt
│   │   ├── opensubtitles2018-ru-small-utf8.txt
│   │   ├── opensubtitles2018-ru-tiny-utf8.txt
│   │   ├── opensubtitles2018-zh-huge-utf8.txt
│   │   ├── opensubtitles2018-zh-small-utf8.txt
│   │   ├── opensubtitles2018-zh-tiny-utf8.txt
│   │   ├── repeated-rare-huge
│   │   ├── repeated-rare-small
│   │   ├── sherlock-holmes-huge-ascii.txt
│   │   ├── sherlock-holmes-small-ascii.txt
│   │   └── sherlock-holmes-tiny-ascii.txt
│   └── src
│       ├── bench.rs
│       ├── inputs.rs
│       ├── lib.rs
│       └── search.rs
├── examples
│   ├── graphemes-std.rs
│   ├── graphemes.rs
│   ├── lines-std.rs
│   ├── lines.rs
│   ├── uppercase-std.rs
│   ├── uppercase.rs
│   ├── words-std.rs
│   └── words.rs
├── rustfmt.toml
├── scripts
│   ├── generate-unicode-data
│   └── regex
│       ├── grapheme.sh
│       ├── sentence.sh
│       └── word.sh
└── src
    ├── ascii.rs
    ├── bstr.rs
    ├── bstring.rs
    ├── byteset
    │   ├── mod.rs
    │   └── scalar.rs
    ├── escape_bytes.rs
    ├── ext_slice.rs
    ├── ext_vec.rs
    ├── impls.rs
    ├── io.rs
    ├── lib.rs
    ├── tests.rs
    ├── unicode
    │   ├── data
    │   │   ├── GraphemeBreakTest.txt
    │   │   ├── LICENSE-UNICODE
    │   │   ├── SentenceBreakTest.txt
    │   │   └── WordBreakTest.txt
    │   ├── fsm
    │   │   ├── grapheme_break_fwd.bigendian.dfa
    │   │   ├── grapheme_break_fwd.littleendian.dfa
    │   │   ├── grapheme_break_fwd.rs
    │   │   ├── grapheme_break_rev.bigendian.dfa
    │   │   ├── grapheme_break_rev.littleendian.dfa
    │   │   ├── grapheme_break_rev.rs
    │   │   ├── mod.rs
    │   │   ├── regional_indicator_rev.bigendian.dfa
    │   │   ├── regional_indicator_rev.littleendian.dfa
    │   │   ├── regional_indicator_rev.rs
    │   │   ├── sentence_break_fwd.bigendian.dfa
    │   │   ├── sentence_break_fwd.littleendian.dfa
    │   │   ├── sentence_break_fwd.rs
    │   │   ├── simple_word_fwd.bigendian.dfa
    │   │   ├── simple_word_fwd.littleendian.dfa
    │   │   ├── simple_word_fwd.rs
    │   │   ├── whitespace_anchored_fwd.bigendian.dfa
    │   │   ├── whitespace_anchored_fwd.littleendian.dfa
    │   │   ├── whitespace_anchored_fwd.rs
    │   │   ├── whitespace_anchored_rev.bigendian.dfa
    │   │   ├── whitespace_anchored_rev.littleendian.dfa
    │   │   ├── whitespace_anchored_rev.rs
    │   │   ├── word_break_fwd.bigendian.dfa
    │   │   ├── word_break_fwd.littleendian.dfa
    │   │   └── word_break_fwd.rs
    │   ├── grapheme.rs
    │   ├── mod.rs
    │   ├── sentence.rs
    │   ├── whitespace.rs
    │   └── word.rs
    └── utf8.rs
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [BurntSushi]
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: ci
2 | on:
3 |   pull_request:
4 |   push:
5 |     branches:
6 |     - master
7 |   schedule:
8 |   - cron: '00 01 * * *'
9 | 
10 | # This section is needed to drop the write-all permissions that are granted on
11 | # the `schedule` event. By specifying any permission explicitly, all others are
12 | # set to none. By using the principle of least privilege the damage a
13 | # compromised workflow can do (because of an injection or compromised third
14 | # party tool or action) is restricted. Currently the workflow doesn't need any
15 | # additional permission except for pulling the code. Adding labels to issues,
16 | # commenting on pull-requests, etc. may need additional permissions:
17 | #
18 | # Syntax for this section:
19 | # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#permissions
20 | #
21 | # Reference for how to assign permissions on a job-by-job basis:
22 | # https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
23 | #
24 | # Reference for available permissions that we can enable if needed:
25 | # https://docs.github.com/en/actions/security-guides/automatic-token-authentication#permissions-for-the-github_token
26 | permissions:
27 |   # to fetch code (actions/checkout)
28 |   contents: read
29 | 
30 | jobs:
31 |   test:
32 |     name: test
33 |     runs-on: ${{ matrix.os }}
34 |     strategy:
35 |       matrix:
36 |         include:
37 |         - build: pinned
38 |           os: ubuntu-latest
39 |           rust: 1.73.0
40 |         - build: stable
41 |           os: ubuntu-latest
42 |           rust: stable
43 |         - build: beta
44 |           os: ubuntu-latest
45 |           rust: beta
46 |         - build: nightly
47 |           os: ubuntu-latest
48 |           rust: nightly
49 |         - build: macos
50 |           os: macos-latest
51 |           rust: stable
52 |         - build: win-msvc
53 |           os: windows-latest
54 |           rust: stable
55 |         - build: win-gnu
56 |           os: windows-latest
57 |           rust: stable-x86_64-gnu
58 |     env:
59 |       RUSTFLAGS: -D warnings
60 |       RUST_BACKTRACE: 1
61 |     steps:
62 |     - name: Checkout repository
63 |       uses: actions/checkout@v3
64 |     - name: Install Rust
65 |       uses: dtolnay/rust-toolchain@master
66 |       with:
67 |         toolchain: ${{ matrix.rust }}
68 |     - run: cargo build --verbose
69 |     - run: cargo doc --verbose
70 |     # We run a few other builds, but only on one instance to avoid doing
71 |     # more work than we need to.
72 |     - if: matrix.build == 'stable'
73 |       run: cargo build --verbose --features serde
74 |     - if: matrix.build == 'stable'
75 |       run: cargo build --verbose --no-default-features
76 |     - if: matrix.build == 'stable'
77 |       run: cargo build --verbose --no-default-features --features serde,alloc
78 |     - if: matrix.build == 'stable'
79 |       run: cargo build --verbose --no-default-features --features serde
80 |     - if: matrix.build == 'stable'
81 |       run: cargo build --verbose --no-default-features --features alloc
82 |     # Our dev dependencies evolve more rapidly than we'd like, so only run
83 |     # tests when we aren't pinning the Rust version.
84 |     - if: matrix.build != 'pinned'
85 |       run: cargo test --verbose
86 |     # As with 'cargo build' above, run tests on a bunch of feature
87 |     # combinations, but just on 'stable' to avoid doing more work than we have
88 |     # to.
89 |     - if: matrix.build == 'stable'
90 |       run: cargo test --verbose --features serde
91 |     - if: matrix.build == 'stable'
92 |       run: cargo test --verbose --no-default-features
93 |     - if: matrix.build == 'stable'
94 |       run: cargo test --verbose --no-default-features --features serde,alloc
95 |     - if: matrix.build == 'stable'
96 |       run: cargo test --verbose --no-default-features --features serde
97 |     - if: matrix.build == 'stable'
98 |       run: cargo test --verbose --no-default-features --features alloc
99 |     - name: Run benchmarks as tests
100 |       if: matrix.build == 'stable'
101 |       working-directory: ./bench
102 |       run: cargo test --verbose --benches
103 | 
104 |   rustfmt:
105 |     name: rustfmt
106 |     runs-on: ubuntu-latest
107 |     steps:
108 |     - name: Checkout repository
109 |       uses: actions/checkout@v3
110 |     - name: Install Rust
111 |       uses: dtolnay/rust-toolchain@master
112 |       with:
113 |         toolchain: stable
114 |         components: rustfmt
115 |     - name: Check formatting
116 |       run: cargo fmt --check
117 | 
118 |   # miri:
119 |   #   name: miri
120 |   #   runs-on: ubuntu-latest
121 |   #   steps:
122 |   #   - name: Checkout repository
123 |   #     uses: actions/checkout@v3
124 |   #   - name: Install Rust
125 |   #     uses: dtolnay/rust-toolchain@miri
126 |   #   - run: cargo miri test --lib --verbose
127 |   #     env:
128 |   #       MIRIFLAGS: -Zmiri-strict-provenance
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .*.swp
2 | tags
3 | target
4 | /Cargo.lock
--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | This project is licensed under either of
2 | 
3 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
4 |    https://www.apache.org/licenses/LICENSE-2.0)
5 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
6 |    https://opensource.org/licenses/MIT)
7 | 
8 | at your option.
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "bstr"
3 | version = "1.12.0"  #:version
4 | authors = ["Andrew Gallant <jamslam@gmail.com>"]
5 | description = "A string type that is not required to be valid UTF-8."
6 | documentation = "https://docs.rs/bstr" 7 | homepage = "https://github.com/BurntSushi/bstr" 8 | repository = "https://github.com/BurntSushi/bstr" 9 | readme = "README.md" 10 | keywords = ["string", "str", "byte", "bytes", "text"] 11 | license = "MIT OR Apache-2.0" 12 | categories = ["text-processing", "encoding"] 13 | exclude = ["/.github", "/scripts", "/src/unicode/data"] 14 | edition = "2021" 15 | rust-version = "1.73" 16 | resolver = "2" 17 | 18 | [workspace] 19 | members = ["bench"] 20 | 21 | [lib] 22 | bench = false 23 | 24 | [features] 25 | default = ["std", "unicode"] 26 | std = ["alloc", "memchr/std", "serde?/std"] 27 | alloc = ["memchr/alloc", "serde?/alloc"] 28 | unicode = ["dep:regex-automata"] 29 | serde = ["dep:serde"] 30 | 31 | [dependencies] 32 | memchr = { version = "2.7.1", default-features = false } 33 | serde = { version = "1.0.85", default-features = false, optional = true } 34 | 35 | [dependencies.regex-automata] 36 | version = "0.4.1" 37 | default-features = false 38 | features = ["dfa-search"] 39 | optional = true 40 | 41 | [dev-dependencies] 42 | quickcheck = { version = "1", default-features = false } 43 | ucd-parse = "0.1.3" 44 | unicode-segmentation = "1.2.1" 45 | 46 | [package.metadata.docs.rs] 47 | # We want to document all features. 48 | all-features = true 49 | # Since this crate's feature setup is pretty complicated, it is worth opting 50 | # into a nightly unstable option to show the features that need to be enabled 51 | # for public API items. To do that, we set 'docsrs', and when that's enabled, 52 | # we enable the 'doc_auto_cfg' feature. 53 | # 54 | # To test this locally, run: 55 | # 56 | # RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features 57 | rustdoc-args = ["--cfg", "docsrs"] 58 | 59 | [profile.release] 60 | debug = true 61 | 62 | [[example]] 63 | name = "graphemes" 64 | required-features = ["std", "unicode"] 65 | 66 | [[example]] 67 | name = "lines" 68 | required-features = ["std"] 69 | 70 | [[example]] 71 | name = "uppercase" 72 | required-features = ["std", "unicode"] 73 | 74 | [[example]] 75 | name = "words" 76 | required-features = ["std", "unicode"] 77 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2018-2019 Andrew Gallant
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | bstr
2 | ====
3 | This crate provides extension traits for `&[u8]` and `Vec<u8>` that enable
4 | their use as byte strings, where byte strings are _conventionally_ UTF-8. This
5 | differs from the standard library's `String` and `str` types in that they are
6 | not required to be valid UTF-8, but may be fully or partially valid UTF-8.
7 | 
8 | [![Build status](https://github.com/BurntSushi/bstr/workflows/ci/badge.svg)](https://github.com/BurntSushi/bstr/actions)
9 | [![crates.io](https://img.shields.io/crates/v/bstr.svg)](https://crates.io/crates/bstr)
10 | 
11 | 
12 | ### Documentation
13 | 
14 | https://docs.rs/bstr
15 | 
16 | 
17 | ### When should I use byte strings?
18 | 
19 | See this part of the documentation for more details:
20 | <https://docs.rs/bstr/1.*/bstr/#when-should-i-use-byte-strings>.
21 | 
22 | The short story is that byte strings are useful when it is inconvenient or
23 | incorrect to require valid UTF-8.
24 | 
25 | 
26 | ### Usage
27 | 
28 | `cargo add bstr`
29 | 
30 | ### Examples
31 | 
32 | The following examples exhibit both the API features of byte strings and
33 | the I/O convenience functions provided for reading line-by-line quickly.
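
As a warm-up before the I/O examples, here is a minimal sketch of the
extension-trait API on its own (the byte string below is invented for
illustration):

```rust
use bstr::ByteSlice;

fn main() {
    // Not valid UTF-8 (because of \xFF), but still a fine byte string.
    let bytes = b"foo\xFFbar baz";
    assert!(bytes.contains_str("bar"));
    assert_eq!(bytes.split_str(" ").count(), 2);
    // Lossy conversion substitutes the Unicode replacement codepoint.
    assert_eq!(bytes.to_str_lossy(), "foo\u{FFFD}bar baz");
}
```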
34 | 
35 | This first example simply shows how to efficiently iterate over lines in stdin,
36 | and print out lines containing a particular substring:
37 | 
38 | ```rust
39 | use std::{error::Error, io::{self, Write}};
40 | use bstr::{ByteSlice, io::BufReadExt};
41 | 
42 | fn main() -> Result<(), Box<dyn Error>> {
43 |     let stdin = io::stdin();
44 |     let mut stdout = io::BufWriter::new(io::stdout());
45 | 
46 |     stdin.lock().for_byte_line_with_terminator(|line| {
47 |         if line.contains_str("Dimension") {
48 |             stdout.write_all(line)?;
49 |         }
50 |         Ok(true)
51 |     })?;
52 |     Ok(())
53 | }
54 | ```
55 | 
56 | This example shows how to count all of the words (Unicode-aware) in stdin,
57 | line-by-line:
58 | 
59 | ```rust
60 | use std::{error::Error, io};
61 | use bstr::{ByteSlice, io::BufReadExt};
62 | 
63 | fn main() -> Result<(), Box<dyn Error>> {
64 |     let stdin = io::stdin();
65 |     let mut words = 0;
66 |     stdin.lock().for_byte_line_with_terminator(|line| {
67 |         words += line.words().count();
68 |         Ok(true)
69 |     })?;
70 |     println!("{}", words);
71 |     Ok(())
72 | }
73 | ```
74 | 
75 | This example shows how to convert a stream on stdin to uppercase without
76 | performing UTF-8 validation _and_ amortizing allocation. On standard ASCII
77 | text, this is quite a bit faster than what you can (easily) do with standard
78 | library APIs. (N.B. Any invalid UTF-8 bytes are passed through unchanged.)
79 | 
80 | ```rust
81 | use std::{error::Error, io::{self, Write}};
82 | use bstr::{ByteSlice, io::BufReadExt};
83 | 
84 | fn main() -> Result<(), Box<dyn Error>> {
85 |     let stdin = io::stdin();
86 |     let mut stdout = io::BufWriter::new(io::stdout());
87 | 
88 |     let mut upper = vec![];
89 |     stdin.lock().for_byte_line_with_terminator(|line| {
90 |         upper.clear();
91 |         line.to_uppercase_into(&mut upper);
92 |         stdout.write_all(&upper)?;
93 |         Ok(true)
94 |     })?;
95 |     Ok(())
96 | }
97 | ```
98 | 
99 | This example shows how to extract the first 10 visual characters (as grapheme
100 | clusters) from each line, where invalid UTF-8 sequences are generally treated
101 | as a single character and are passed through correctly:
102 | 
103 | ```rust
104 | use std::{error::Error, io::{self, Write}};
105 | use bstr::{ByteSlice, io::BufReadExt};
106 | 
107 | fn main() -> Result<(), Box<dyn Error>> {
108 |     let stdin = io::stdin();
109 |     let mut stdout = io::BufWriter::new(io::stdout());
110 | 
111 |     stdin.lock().for_byte_line_with_terminator(|line| {
112 |         let end = line
113 |             .grapheme_indices()
114 |             .map(|(_, end, _)| end)
115 |             .take(10)
116 |             .last()
117 |             .unwrap_or(line.len());
118 |         stdout.write_all(line[..end].trim_end())?;
119 |         stdout.write_all(b"\n")?;
120 |         Ok(true)
121 |     })?;
122 |     Ok(())
123 | }
124 | ```
125 | 
126 | 
127 | ### Cargo features
128 | 
129 | This crate comes with a few features that control standard library, serde and
130 | Unicode support.
131 | 
132 | * `std` - **Enabled** by default. This provides APIs that require the standard
133 |   library, such as `Vec<u8>` and `PathBuf`. Enabling this feature also enables
134 |   the `alloc` feature.
135 | * `alloc` - **Enabled** by default. This provides APIs that require allocations
136 |   via the `alloc` crate, such as `Vec<u8>`.
137 | * `unicode` - **Enabled** by default. This provides APIs that require sizable
138 |   Unicode data compiled into the binary. This includes, but is not limited to,
139 |   grapheme/word/sentence segmenters. When this is disabled, basic support such
140 |   as UTF-8 decoding is still included. Note that currently, enabling this
141 |   feature also requires enabling the `std` feature. It is expected that this
142 |   limitation will be lifted at some point.
143 | * `serde` - Enables implementations of serde traits for `BStr`, and also
144 |   `BString` when `alloc` is enabled.
145 | 
146 | 
147 | ### Minimum Rust version policy
148 | 
149 | This crate's minimum supported `rustc` version (MSRV) is `1.73`.
150 | 
151 | In general, this crate will be conservative with respect to the minimum
152 | supported version of Rust. MSRV may be bumped in minor version releases.
153 | 
154 | 
155 | ### Future work
156 | 
157 | Since it is plausible that some of the types in this crate might end up in your
158 | public API (e.g., `BStr` and `BString`), we will commit to being very
159 | conservative with respect to new major version releases. It's difficult to say
160 | precisely how conservative, but unless there is a major issue with the `1.0`
161 | release, I wouldn't expect a `2.0` release to come out any sooner than some
162 | period of years.
163 | 
164 | A large part of the API surface area was taken from the standard library, so
165 | from an API design perspective, a good portion of this crate should be on solid
166 | ground. The main differences from the standard library are in how the various
167 | substring search routines work. The standard library provides generic
168 | infrastructure for supporting different types of searches with a single method,
169 | whereas this library prefers to define new methods for each type of search and
170 | drop the generic infrastructure.
171 | 
172 | Some _probable_ future considerations for APIs include, but are not limited to:
173 | 
174 | * Unicode normalization.
175 | * More sophisticated support for dealing with Unicode case, perhaps by
176 |   combining the use cases supported by [`caseless`](https://docs.rs/caseless)
177 |   and [`unicase`](https://docs.rs/unicase).
178 | 
179 | Here are some examples that are _probably_ out of scope for this crate:
180 | 
181 | * Regular expressions.
182 | * Unicode collation.
183 | 
184 | The exact scope isn't quite clear, but I expect we can iterate on it.
185 | 
186 | In general, as stated below, this crate brings lots of related APIs together
187 | into a single crate while simultaneously attempting to keep the total number of
188 | dependencies low. Indeed, every dependency of `bstr`, except for `memchr`, is
189 | optional.
190 | 
191 | 
192 | ### High level motivation
193 | 
194 | Strictly speaking, the `bstr` crate provides very little that can't already be
195 | achieved with the standard library `Vec<u8>`/`&[u8]` APIs and the ecosystem of
196 | library crates. For example:
197 | 
198 | * The standard library's
199 |   [`Utf8Error`](https://doc.rust-lang.org/std/str/struct.Utf8Error.html) can be
200 |   used for incremental lossy decoding of `&[u8]`.
201 | * The
202 |   [`unicode-segmentation`](https://unicode-rs.github.io/unicode-segmentation/unicode_segmentation/index.html)
203 |   crate can be used for iterating over graphemes (or words), but is only
204 |   implemented for `&str` types. One could use `Utf8Error` above to implement
205 |   grapheme iteration with the same semantics as what `bstr` provides (automatic
206 |   Unicode replacement codepoint substitution).
207 | * The [`twoway`](https://docs.rs/twoway) crate can be used for fast substring
208 |   searching on `&[u8]`.
209 | 
210 | So why create `bstr`? Part of the point of the `bstr` crate is to provide a
211 | uniform API of coupled components instead of relying on users to piece together
212 | loosely coupled components from the crate ecosystem.
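
As a brief sketch of what that uniform API looks like in practice (the byte
string contents here are invented for illustration; every method used is part
of `bstr`'s documented `ByteSlice` trait):

```rust
use bstr::ByteSlice;

fn main() {
    // Not valid UTF-8 (\xFF), but search, replace, split and trim all work.
    let haystack = b"foo\xFFbar, foo baz".to_vec();

    // Substring search and replace without any UTF-8 validation.
    let replaced = haystack.replace("foo", "quux");
    assert_eq!(replaced.as_bstr(), b"quux\xFFbar, quux baz".as_bstr());

    // Splitting and trimming compose through the same trait.
    let fields: Vec<&[u8]> =
        replaced.split_str(",").map(|f| f.trim()).collect();
    assert_eq!(fields, vec![&b"quux\xFFbar"[..], &b"quux baz"[..]]);
}
```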
For example, if you wanted
213 | to perform a search and replace in a `Vec<u8>`, then writing the code to do
214 | that with the `twoway` crate is not that difficult, but it's still additional
215 | glue code you have to write. This work adds up depending on what you're doing.
216 | Consider, for example, trimming and splitting, along with their different
217 | variants.
218 | 
219 | In other words, `bstr` is partially a way of pushing back against the
220 | micro-crate ecosystem that appears to be evolving. Namely, it is a goal of
221 | `bstr` to keep its dependency list lightweight. For example, `serde` is an
222 | optional dependency because there is no feasible alternative. In service of
223 | this philosophy, currently, the only required dependency of `bstr` is `memchr`.
224 | 
225 | 
226 | ### License
227 | 
228 | This project is licensed under either of
229 | 
230 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
231 |    https://www.apache.org/licenses/LICENSE-2.0)
232 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
233 |    https://opensource.org/licenses/MIT)
234 | 
235 | at your option.
236 | 
237 | The data in `src/unicode/data/` is licensed under the Unicode License Agreement
238 | ([LICENSE-UNICODE](https://www.unicode.org/copyright.html#License)), although
239 | this data is only used in tests.
--------------------------------------------------------------------------------
/bench/.gitignore:
--------------------------------------------------------------------------------
1 | log
--------------------------------------------------------------------------------
/bench/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | publish = false
3 | name = "bstr-bench"
4 | version = "0.0.1"
5 | authors = ["Andrew Gallant <jamslam@gmail.com>"]
6 | description = "Criterion benchmark suite for bstr."
7 | homepage = "https://github.com/BurntSushi/bstr"
8 | repository = "https://github.com/BurntSushi/bstr"
9 | license = "Unlicense OR MIT"
10 | edition = "2018"
11 | 
12 | [lib]
13 | bench = false
14 | 
15 | [[bench]]
16 | name = "bstr"
17 | harness = false
18 | path = "src/bench.rs"
19 | 
20 | [dependencies]
21 | criterion = "0.3.4"
22 | bstr = { version = "1.0.0", path = ".." }
23 | # For comparisons.
24 | unicode-segmentation = "1.2.1"
--------------------------------------------------------------------------------
/bench/data/opensubtitles2018-en-small-ascii.txt:
--------------------------------------------------------------------------------
1 | Presented by IM Pictures
2 | Produced by Shin Cine
3 | In association with MVP Venture Capital and Cinema Service
4 | Jeon Ji-hyun Cha Tae-hyun
5 | My Sassy Girl
6 | Exactly two years ago today, she and I buried a time capsule here.
7 | We promised to meet here two years later, but she hasn't come yet.
8 | I'm going to wait.
9 | Here we go.
10 | Please, don't move.
11 | One, two...
12 | Wait a minute.
13 | Hello?
14 | Oh, auntie.
15 | Sorry, I'm on my way.
16 | I'm really sorry.
17 | Yes, I'm coming.
18 | I'm having my photo taken.
19 | Bye.
20 | Are you ready?
21 | Here we go.
22 | One, two...
23 | My parents wanted a daughter, so they raised me like one.
24 | So I thought I was a girl until I was seven.
25 | I had to go to the women's public bath, too.
26 | The older I got,
27 | I thought my penis would get smaller and disappear.
28 | But it was the opposite.
29 | First Half
30 | He hasn't changed at all.
31 | No, I'm a real man now.
32 | Hey, asshole.
33 | Think clerical work in the army makes you a man? 34 | You irritate me! 35 | Give me a break, asshole. 36 | My job was tougher than you could imagine. 37 | Hey! 38 | I worked near the DMZ. 39 | Who are you kid -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-en-tiny-ascii.txt: -------------------------------------------------------------------------------- 1 | Presented by IM Pictures 2 | Produced by Shi -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-ru-small-utf8.txt: -------------------------------------------------------------------------------- 1 | Рэй МИЛЛАНД, Энтони КУИН, Дебра ПАЖЕТ в фильме БЕРЕГ РЕКИ 2 | в фильме также снимались: 3 | Гарри КЭРИ-мл., Чабби ДЖОНСОН, Байрон ФУЛДЖЕ, Том МакКи, Фрэнк ГЕРСТЛ сценарий Гарольда Джэкоба СМИТА и Джэймса ЛЕЙСЕСТЕРА по рассказу Гарольда Джэкоба СМИТА "Самая высокая гора" 4 | режиссер Аллан ДВАН 5 | - А вы выбрали жаркий денек, мистер. 6 | - Я всегда так делаю. 7 | - Полный бак? 8 | - Еще бы! 9 | А у вас мощная "тачка", как я погляжу. 10 | - Могу продать ее вам. 11 | - Нет, спасибо! 12 | - Собираетесь немного поохотиться? 13 | - Ну, я надеюсь на это. 14 | Вы знаете, не проиживает тут поблизости парень по имени Кэмеро -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-ru-tiny-utf8.txt: -------------------------------------------------------------------------------- 1 | Рэй МИЛЛАНД, Энтони КУ -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-zh-small-utf8.txt: -------------------------------------------------------------------------------- 1 | 我去拜托旅馆的人 2 | 出去喝就行了 3 | 我去拜托长井找工作 4 | 他说帮我问问他哥哥的公司 5 | 不知道会否成事 6 | 既然他肯答应,一定有结果的 7 | 真羡慕他至今还是优哉悠哉 8 | 叔叔,你老是偷听人家拉琴 9 | 小缝,你的颤音有进步了 10 | 我才不理你 11 | 叔叔你知道 12 | 爷爷找你来谈什么吗? 13 | 不知道 14 | 你的亲事 15 | 我去看看 16 | 走好 17 | 加油 18 | 你已经30岁了吧? 19 | 是的 20 | 身体健壮吧? 21 | 两三年来没有感冒 22 | 脑袋还算不笨吧? 23 | 是的 24 | 游手好闲太可惜了 25 | 他叫什么名字呢... 26 | 那个常去找你聊天的男人 27 | 我曾经见过他一两次 28 | 平冈吗? 29 | 那个人不算上乘人材... 30 | 听说帝大毕业后就去了外地 31 | 如今因为失败而回来 32 | 为什么? 33 | 想要为了温饱而工作吧 34 | 你在这里 35 | 我的梳子好像掉在这附近 36 | 你还是一样迷迷糊糊 37 | 坐吧,我陪你聊聊天 38 | 天气不错 39 | 去赏花如何? 
40 | 等你真的想去再说 41 | -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-zh-tiny-utf8.txt: -------------------------------------------------------------------------------- 1 | 你突然来信说最近要搬到这里 2 | -------------------------------------------------------------------------------- /bench/data/repeated-rare-small: -------------------------------------------------------------------------------- 1 | zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz 2 | -------------------------------------------------------------------------------- /bench/data/sherlock-holmes-small-ascii.txt: -------------------------------------------------------------------------------- 1 | Mr. Sherlock Holmes, who was usually very late in the mornings, save 2 | upon those not infrequent occasions when he was up all night, was seated 3 | at the breakfast table. I stood upon the hearth-rug and picked up the 4 | stick which our visitor had left behind him the night before. It was a 5 | fine, thick piece of wood, bulbous-headed, of the sort which is known as 6 | a "Penang lawyer." Just under the head was a broad silver band nearly 7 | an inch across. "To James Mortimer, M.R.C.S., from his friends of the 8 | C.C.H.," was engraved upon it, with the date "1884." It was just such a 9 | stick as the old-fashioned family practitioner used to carry--dignified, 10 | solid, and reassuring. 11 | -------------------------------------------------------------------------------- /bench/data/sherlock-holmes-tiny-ascii.txt: -------------------------------------------------------------------------------- 1 | Mr. Sherlock Holmes, who was usually very late in the mornings, save 2 | -------------------------------------------------------------------------------- /bench/src/bench.rs: -------------------------------------------------------------------------------- 1 | use bstr::{ByteSlice, B}; 2 | use criterion::{ 3 | criterion_group, criterion_main, Bencher, Criterion, Throughput, 4 | }; 5 | 6 | use crate::inputs::*; 7 | 8 | mod inputs; 9 | mod search; 10 | 11 | // All benchmark corpora up to and including "huge" inputs. 12 | // 13 | // "huge" inputs are about 500KB. "small" inputs are about 1KB. "tiny" inputs 14 | // are under 100 bytes. 
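//
// How these tables are consumed (an illustrative sketch, not part of the
// original suite): each benchmark function below loops over a table and
// registers one Criterion benchmark per (name, corpus) pair via the `define`
// helper at the bottom of this file, which also reports byte throughput. A
// hypothetical new benchmark would follow the same shape:
//
//     fn count_lines(c: &mut Criterion) {
//         for &(name, corpus) in CORPORA_HUGE {
//             define(c, "bstr/count_lines", name, corpus, move |b| {
//                 b.iter(|| {
//                     // `lines` comes from bstr's `ByteSlice` extension trait.
//                     assert!(corpus.lines().count() > 0);
//                 });
//             });
//         }
//     }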
15 | const CORPORA_HUGE: &'static [(&'static str, &'static [u8])] = &[
16 |     ("en-huge-ascii", SUBTITLE_EN_HUGE),
17 |     ("en-small-ascii", SUBTITLE_EN_SMALL),
18 |     ("en-tiny-ascii", SUBTITLE_EN_TINY),
19 |     ("ru-huge-utf8", SUBTITLE_RU_HUGE),
20 |     ("ru-small-utf8", SUBTITLE_RU_SMALL),
21 |     ("ru-tiny-utf8", SUBTITLE_RU_TINY),
22 |     ("zh-huge-utf8", SUBTITLE_ZH_HUGE),
23 |     ("zh-small-utf8", SUBTITLE_ZH_SMALL),
24 |     ("zh-tiny-utf8", SUBTITLE_ZH_TINY),
25 | ];
26 | 
27 | // All benchmark corpora up to and including "small" inputs. This does not
28 | // include huge inputs. This is useful for benchmarks that take longer, or if
29 | // it isn't useful to benchmark larger inputs.
30 | //
31 | // "huge" inputs are about 500KB. "small" inputs are about 1KB. "tiny" inputs
32 | // are under 100 bytes.
33 | const CORPORA_SMALL: &'static [(&'static str, &'static [u8])] = &[
34 |     ("en-small-ascii", SUBTITLE_EN_SMALL),
35 |     ("en-tiny-ascii", SUBTITLE_EN_TINY),
36 |     ("ru-small-utf8", SUBTITLE_RU_SMALL),
37 |     ("ru-tiny-utf8", SUBTITLE_RU_TINY),
38 |     ("zh-small-utf8", SUBTITLE_ZH_SMALL),
39 |     ("zh-tiny-utf8", SUBTITLE_ZH_TINY),
40 | ];
41 | 
42 | fn is_ascii(c: &mut Criterion) {
43 |     let corpus = SHERLOCK_HUGE;
44 |     define(c, "is_ascii", "huge-ascii", corpus, move |b| {
45 |         b.iter(|| {
46 |             assert!(corpus.is_ascii());
47 |         });
48 |     });
49 | 
50 |     let corpus = SHERLOCK_SMALL;
51 |     define(c, "is_ascii", "small-ascii", corpus, move |b| {
52 |         b.iter(|| {
53 |             assert!(corpus.is_ascii());
54 |         });
55 |     });
56 | 
57 |     let corpus = SHERLOCK_TINY;
58 |     define(c, "is_ascii", "tiny-ascii", corpus, move |b| {
59 |         b.iter(|| {
60 |             assert!(corpus.is_ascii());
61 |         });
62 |     });
63 | 
64 |     let corpus = EMPTY;
65 |     define(c, "is_ascii", "empty-ascii", corpus, move |b| {
66 |         b.iter(|| {
67 |             assert!(corpus.is_ascii());
68 |         });
69 |     });
70 | 
71 |     let corpus = "abcdefghijklm☃abcdefghijklmnopqrstuvwxyz".as_bytes();
72 |     define(c, "is_ascii", "tiny-non-ascii", corpus, move |b| {
73 |         b.iter(|| {
74 |             assert!(!corpus.is_ascii());
75 |         });
76 |     });
77 | }
78 | 
79 | fn to_str(c: &mut Criterion) {
80 |     // benchmark our impl
81 |     for &(name, corpus) in CORPORA_HUGE {
82 |         define(c, "bstr/to_str", name, corpus, move |b| {
83 |             b.iter(|| {
84 |                 assert!(corpus.to_str().is_ok());
85 |             });
86 |         });
87 |     }
88 |     // benchmark std's impl
89 |     for &(name, corpus) in CORPORA_HUGE {
90 |         define(c, "std/to_str", name, corpus, move |b| {
91 |             use std::str;
92 | 
93 |             b.iter(|| {
94 |                 assert!(str::from_utf8(corpus).is_ok());
95 |             });
96 |         });
97 |     }
98 | }
99 | 
100 | fn to_str_lossy_valid(c: &mut Criterion) {
101 |     // benchmark our impl
102 |     for &(name, corpus) in CORPORA_HUGE {
103 |         define(c, "bstr/to_str_lossy_valid", name, corpus, move |b| {
104 |             b.iter(|| {
105 |                 assert!(corpus.to_str_lossy().len() > 0);
106 |             });
107 |         });
108 |     }
109 |     // benchmark std's impl
110 |     for &(name, corpus) in CORPORA_HUGE {
111 |         define(c, "std/to_str_lossy_valid", name, corpus, move |b| {
112 |             b.iter(|| {
113 |                 assert!(String::from_utf8_lossy(corpus).len() > 0);
114 |             });
115 |         });
116 |     }
117 | }
118 | 
119 | fn trim(c: &mut Criterion) {
120 |     let corpus = "\u{2007}\t\n\u{200a}foo\tbar\t\t\t\t\n \t\u{2002}";
121 | 
122 |     // benchmark our impl
123 |     define(c, "bstr/trim", "tiny", corpus.as_bytes(), move |b| {
124 |         b.iter(|| {
125 |             assert_eq!("foo\tbar".as_bytes(), B(corpus).trim());
126 |         });
127 |     });
128 | 
129 |     // benchmark std's impl
130 |     define(c, "std/trim", "tiny", corpus.as_bytes(), move |b| {
131 |         b.iter(|| {
132 |             assert_eq!("foo\tbar",
corpus.trim()); 133 | }); 134 | }); 135 | } 136 | 137 | fn chars(c: &mut Criterion) { 138 | // benchmark our impl 139 | for &(name, corpus) in CORPORA_HUGE { 140 | define(c, "bstr/chars", name, corpus, move |b| { 141 | b.iter(|| { 142 | let mut count = 0; 143 | for ch in corpus.chars() { 144 | count += ch.len_utf8(); 145 | } 146 | assert!(count > 0); 147 | }); 148 | }); 149 | } 150 | // benchmark std's impl 151 | for &(name, corpus) in CORPORA_HUGE { 152 | define(c, "std/chars", name, corpus, move |b| { 153 | use std::str; 154 | 155 | let corpus = str::from_utf8(corpus).unwrap(); 156 | b.iter(|| { 157 | let mut count = 0; 158 | for ch in corpus.chars() { 159 | count += ch.len_utf8(); 160 | } 161 | assert!(count > 0); 162 | }); 163 | }); 164 | } 165 | } 166 | 167 | fn graphemes(c: &mut Criterion) { 168 | // benchmark our impl 169 | for &(name, corpus) in CORPORA_SMALL { 170 | define(c, "bstr/graphemes", name, corpus, move |b| { 171 | b.iter(|| { 172 | let mut count = 0; 173 | for g in corpus.graphemes() { 174 | count += g.len(); 175 | } 176 | assert!(count > 0); 177 | }); 178 | }); 179 | } 180 | // benchmark unicode-segmentation impl 181 | for &(name, corpus) in CORPORA_SMALL { 182 | define(c, "unicode-segmentation/graphemes", name, corpus, move |b| { 183 | use std::str; 184 | use unicode_segmentation::UnicodeSegmentation; 185 | 186 | let corpus = str::from_utf8(corpus).unwrap(); 187 | b.iter(|| { 188 | let mut count = 0; 189 | for g in corpus.graphemes(true) { 190 | count += g.len(); 191 | } 192 | assert!(count > 0); 193 | }); 194 | }); 195 | } 196 | } 197 | 198 | fn words(c: &mut Criterion) { 199 | // benchmark our impl 200 | for &(name, corpus) in CORPORA_SMALL { 201 | define(c, "bstr/words", name, corpus, move |b| { 202 | b.iter(|| { 203 | let mut count = 0; 204 | for g in corpus.words() { 205 | count += g.len(); 206 | } 207 | assert!(count > 0); 208 | }); 209 | }); 210 | } 211 | // benchmark unicode-segmentation impl 212 | for &(name, corpus) in CORPORA_SMALL { 213 | define(c, "unicode-segmentation/words", name, corpus, move |b| { 214 | use std::str; 215 | use unicode_segmentation::UnicodeSegmentation; 216 | 217 | let corpus = str::from_utf8(corpus).unwrap(); 218 | b.iter(|| { 219 | let mut count = 0; 220 | for g in corpus.unicode_words() { 221 | count += g.len(); 222 | } 223 | assert!(count > 0); 224 | }); 225 | }); 226 | } 227 | } 228 | 229 | fn sentences(c: &mut Criterion) { 230 | // benchmark our impl 231 | for &(name, corpus) in CORPORA_SMALL { 232 | define(c, "bstr/sentences", name, corpus, move |b| { 233 | b.iter(|| { 234 | let mut count = 0; 235 | for g in corpus.sentences() { 236 | count += g.len(); 237 | } 238 | assert!(count > 0); 239 | }); 240 | }); 241 | } 242 | } 243 | 244 | fn byte_lines(c: &mut Criterion) { 245 | use bstr::io::BufReadExt; 246 | 247 | let corpus = SUBTITLE_EN_HUGE; 248 | define(c, "bstr/for_byte_line", "ascii", corpus, move |b| { 249 | b.iter(|| { 250 | let mut corpus = corpus; 251 | let mut count = 0; 252 | corpus 253 | .for_byte_line(|line| { 254 | count += line.len(); 255 | Ok(true) 256 | }) 257 | .unwrap(); 258 | assert!(count > 0); 259 | }); 260 | }); 261 | } 262 | 263 | fn define( 264 | c: &mut Criterion, 265 | group_name: &str, 266 | bench_name: &str, 267 | corpus: &[u8], 268 | bench: impl FnMut(&mut Bencher<'_>) + 'static, 269 | ) { 270 | let mut group = c.benchmark_group(group_name); 271 | group.throughput(Throughput::Bytes(corpus.len() as u64)); 272 | group.bench_function(bench_name, bench); 273 | group.finish(); 274 | } 275 | 276 | 
criterion_group!(g1, is_ascii); 277 | criterion_group!(g2, to_str); 278 | criterion_group!(g3, to_str_lossy_valid); 279 | criterion_group!(g4, trim); 280 | criterion_group!(g5, chars); 281 | criterion_group!(g6, graphemes); 282 | criterion_group!(g7, words); 283 | criterion_group!(g8, sentences); 284 | criterion_group!(g9, byte_lines); 285 | criterion_group!(g10, search::find_iter); 286 | criterion_group!(g11, search::rfind_iter); 287 | criterion_group!(g12, search::find_char); 288 | criterion_group!(g13, search::find_byteset); 289 | criterion_group!(g14, search::find_not_byteset); 290 | criterion_main!(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14); 291 | -------------------------------------------------------------------------------- /bench/src/inputs.rs: -------------------------------------------------------------------------------- 1 | pub const EMPTY: &'static [u8] = b""; 2 | 3 | pub const SHERLOCK_HUGE: &'static [u8] = 4 | include_bytes!("../data/sherlock-holmes-huge-ascii.txt"); 5 | pub const SHERLOCK_SMALL: &'static [u8] = 6 | include_bytes!("../data/sherlock-holmes-small-ascii.txt"); 7 | pub const SHERLOCK_TINY: &'static [u8] = 8 | include_bytes!("../data/sherlock-holmes-tiny-ascii.txt"); 9 | 10 | pub const SUBTITLE_EN_HUGE: &'static [u8] = 11 | include_bytes!("../data/opensubtitles2018-en-huge-ascii.txt"); 12 | pub const SUBTITLE_EN_SMALL: &'static [u8] = 13 | include_bytes!("../data/opensubtitles2018-en-small-ascii.txt"); 14 | pub const SUBTITLE_EN_TINY: &'static [u8] = 15 | include_bytes!("../data/opensubtitles2018-en-tiny-ascii.txt"); 16 | 17 | pub const SUBTITLE_RU_HUGE: &'static [u8] = 18 | include_bytes!("../data/opensubtitles2018-ru-huge-utf8.txt"); 19 | pub const SUBTITLE_RU_SMALL: &'static [u8] = 20 | include_bytes!("../data/opensubtitles2018-ru-small-utf8.txt"); 21 | pub const SUBTITLE_RU_TINY: &'static [u8] = 22 | include_bytes!("../data/opensubtitles2018-ru-tiny-utf8.txt"); 23 | 24 | pub const SUBTITLE_ZH_HUGE: &'static [u8] = 25 | include_bytes!("../data/opensubtitles2018-zh-huge-utf8.txt"); 26 | pub const SUBTITLE_ZH_SMALL: &'static [u8] = 27 | include_bytes!("../data/opensubtitles2018-zh-small-utf8.txt"); 28 | pub const SUBTITLE_ZH_TINY: &'static [u8] = 29 | include_bytes!("../data/opensubtitles2018-zh-tiny-utf8.txt"); 30 | 31 | pub const REPEATED_RARE_HUGE: &'static [u8] = 32 | include_bytes!("../data/repeated-rare-huge"); 33 | pub const REPEATED_RARE_SMALL: &'static [u8] = 34 | include_bytes!("../data/repeated-rare-small"); 35 | -------------------------------------------------------------------------------- /bench/src/lib.rs: -------------------------------------------------------------------------------- 1 | // This is purposely empty. See src/bench.rs instead. We use src/bench.rs 2 | // to avoid including the same file in multiple build targets. 
3 | -------------------------------------------------------------------------------- /bench/src/search.rs: -------------------------------------------------------------------------------- 1 | use std::str; 2 | 3 | use bstr::ByteSlice; 4 | use criterion::Criterion; 5 | 6 | use crate::define; 7 | use crate::inputs::*; 8 | 9 | pub fn find_iter(c: &mut Criterion) { 10 | define_find_iter( 11 | c, 12 | "find/rare", 13 | "en-huge-ascii", 14 | SUBTITLE_EN_HUGE, 15 | "Sherlock Holmes", 16 | 1, 17 | ); 18 | define_find_iter( 19 | c, 20 | "find/verycommon1", 21 | "en-huge-ascii", 22 | SUBTITLE_EN_HUGE, 23 | " ", 24 | 76792, 25 | ); 26 | define_find_iter( 27 | c, 28 | "find/verycommon2", 29 | "en-huge-ascii", 30 | SUBTITLE_EN_HUGE, 31 | " ", 32 | 0, 33 | ); 34 | 35 | define_find_iter( 36 | c, 37 | "find/rare", 38 | "en-small-ascii", 39 | SUBTITLE_EN_SMALL, 40 | "IM Pictures", 41 | 1, 42 | ); 43 | define_find_iter( 44 | c, 45 | "find/verycommon1", 46 | "en-small-ascii", 47 | SUBTITLE_EN_SMALL, 48 | " ", 49 | 155, 50 | ); 51 | define_find_iter( 52 | c, 53 | "find/verycommon2", 54 | "en-small-ascii", 55 | SUBTITLE_EN_SMALL, 56 | " ", 57 | 0, 58 | ); 59 | 60 | define_find_iter( 61 | c, 62 | "find/verycommon1", 63 | "en-tiny-ascii", 64 | SUBTITLE_EN_TINY, 65 | " ", 66 | 5, 67 | ); 68 | define_find_iter( 69 | c, 70 | "find/verycommon2", 71 | "en-tiny-ascii", 72 | SUBTITLE_EN_TINY, 73 | " ", 74 | 0, 75 | ); 76 | 77 | define_find_iter( 78 | c, 79 | "find/pathological", 80 | "repeated-huge", 81 | REPEATED_RARE_HUGE, 82 | "abczdef", 83 | 0, 84 | ); 85 | define_find_iter( 86 | c, 87 | "find/pathological", 88 | "repeated-small", 89 | REPEATED_RARE_SMALL, 90 | "abczdef", 91 | 0, 92 | ); 93 | } 94 | 95 | pub fn rfind_iter(c: &mut Criterion) { 96 | define_rfind_iter( 97 | c, 98 | "rfind/rare", 99 | "en-huge-ascii", 100 | SUBTITLE_EN_HUGE, 101 | "Sherlock Holmes", 102 | 1, 103 | ); 104 | define_rfind_iter( 105 | c, 106 | "rfind/verycommon1", 107 | "en-huge-ascii", 108 | SUBTITLE_EN_HUGE, 109 | " ", 110 | 76792, 111 | ); 112 | define_rfind_iter( 113 | c, 114 | "rfind/verycommon2", 115 | "en-huge-ascii", 116 | SUBTITLE_EN_HUGE, 117 | " ", 118 | 0, 119 | ); 120 | 121 | define_rfind_iter( 122 | c, 123 | "rfind/rare", 124 | "en-small-ascii", 125 | SUBTITLE_EN_SMALL, 126 | "IM Pictures", 127 | 1, 128 | ); 129 | define_rfind_iter( 130 | c, 131 | "rfind/verycommon1", 132 | "en-small-ascii", 133 | SUBTITLE_EN_SMALL, 134 | " ", 135 | 155, 136 | ); 137 | define_rfind_iter( 138 | c, 139 | "rfind/verycommon2", 140 | "en-small-ascii", 141 | SUBTITLE_EN_SMALL, 142 | " ", 143 | 0, 144 | ); 145 | 146 | define_rfind_iter( 147 | c, 148 | "rfind/verycommon1", 149 | "en-tiny-ascii", 150 | SUBTITLE_EN_TINY, 151 | " ", 152 | 5, 153 | ); 154 | define_rfind_iter( 155 | c, 156 | "rfind/verycommon2", 157 | "en-tiny-ascii", 158 | SUBTITLE_EN_TINY, 159 | " ", 160 | 0, 161 | ); 162 | 163 | define_rfind_iter( 164 | c, 165 | "rfind/pathological", 166 | "repeated-huge", 167 | REPEATED_RARE_HUGE, 168 | "abczdef", 169 | 0, 170 | ); 171 | define_rfind_iter( 172 | c, 173 | "rfind/pathological", 174 | "repeated-small", 175 | REPEATED_RARE_SMALL, 176 | "abczdef", 177 | 0, 178 | ); 179 | } 180 | 181 | pub fn find_char(c: &mut Criterion) { 182 | let corpus = str::from_utf8(SUBTITLE_EN_HUGE).unwrap(); 183 | define( 184 | c, 185 | "bstr/find_char", 186 | "en-huge-ascii", 187 | corpus.as_bytes(), 188 | move |b| { 189 | let corpus = corpus.as_bytes(); 190 | b.iter(|| { 191 | assert_eq!(None, corpus.find_char('γ')); 192 | }); 193 | }, 194 | ); 195 | 196 | 
define(c, "std/find_char", "en-huge-ascii", corpus.as_bytes(), move |b| { 197 | b.iter(|| { 198 | assert_eq!(None, corpus.find('γ')); 199 | }); 200 | }); 201 | } 202 | 203 | pub fn find_byteset(c: &mut Criterion) { 204 | let corpus = SUBTITLE_EN_SMALL; 205 | define(c, "bstr/find_byteset/1", "en-small-ascii", corpus, move |b| { 206 | let corpus = corpus.as_bytes(); 207 | b.iter(|| { 208 | assert_eq!(None, corpus.find_byteset(b"\0")); 209 | }); 210 | }); 211 | define(c, "bstr/find_byteset/2", "en-small-ascii", corpus, move |b| { 212 | let corpus = corpus.as_bytes(); 213 | b.iter(|| { 214 | assert_eq!(None, corpus.find_byteset(b"\0\xff")); 215 | }); 216 | }); 217 | define(c, "bstr/find_byteset/3", "en-small-ascii", corpus, move |b| { 218 | let corpus = corpus.as_bytes(); 219 | b.iter(|| { 220 | assert_eq!(None, corpus.find_byteset(b"\0\xff\xee")); 221 | }); 222 | }); 223 | define(c, "bstr/find_byteset/4", "en-small-ascii", corpus, move |b| { 224 | let corpus = corpus.as_bytes(); 225 | b.iter(|| { 226 | assert_eq!(None, corpus.find_byteset(b"\0\xff\xee\xdd")); 227 | }); 228 | }); 229 | define(c, "bstr/find_byteset/10", "en-small-ascii", corpus, move |b| { 230 | let corpus = corpus.as_bytes(); 231 | b.iter(|| { 232 | assert_eq!(None, corpus.find_byteset(b"0123456789")); 233 | }); 234 | }); 235 | 236 | define(c, "bstr/rfind_byteset/1", "en-small-ascii", corpus, move |b| { 237 | let corpus = corpus.as_bytes(); 238 | b.iter(|| { 239 | assert_eq!(None, corpus.rfind_byteset(b"\0")); 240 | }); 241 | }); 242 | define(c, "bstr/rfind_byteset/2", "en-small-ascii", corpus, move |b| { 243 | let corpus = corpus.as_bytes(); 244 | b.iter(|| { 245 | assert_eq!(None, corpus.rfind_byteset(b"\0\xff")); 246 | }); 247 | }); 248 | define(c, "bstr/rfind_byteset/3", "en-small-ascii", corpus, move |b| { 249 | let corpus = corpus.as_bytes(); 250 | b.iter(|| { 251 | assert_eq!(None, corpus.rfind_byteset(b"\0\xff\xee")); 252 | }); 253 | }); 254 | define(c, "bstr/rfind_byteset/4", "en-small-ascii", corpus, move |b| { 255 | let corpus = corpus.as_bytes(); 256 | b.iter(|| { 257 | assert_eq!(None, corpus.rfind_byteset(b"\0\xff\xee\xdd")); 258 | }); 259 | }); 260 | define(c, "bstr/rfind_byteset/10", "en-small-ascii", corpus, move |b| { 261 | let corpus = corpus.as_bytes(); 262 | b.iter(|| { 263 | assert_eq!(None, corpus.rfind_byteset(b"0123456789")); 264 | }); 265 | }); 266 | } 267 | 268 | pub fn find_not_byteset(c: &mut Criterion) { 269 | let corpus = REPEATED_RARE_SMALL; 270 | define( 271 | c, 272 | "bstr/find_not_byteset/1", 273 | "repeated-rare-small", 274 | corpus, 275 | move |b| { 276 | let corpus = corpus.as_bytes(); 277 | b.iter(|| { 278 | assert_eq!(Some(1000), corpus.find_not_byteset(b"z")); 279 | }) 280 | }, 281 | ); 282 | define( 283 | c, 284 | "bstr/find_not_byteset/2", 285 | "repeated-rare-small", 286 | corpus, 287 | move |b| { 288 | let corpus = corpus.as_bytes(); 289 | b.iter(|| { 290 | assert_eq!(Some(1000), corpus.find_not_byteset(b"zy")); 291 | }); 292 | }, 293 | ); 294 | define( 295 | c, 296 | "bstr/find_not_byteset/3", 297 | "repeated-rare-small", 298 | corpus, 299 | move |b| { 300 | let corpus = corpus.as_bytes(); 301 | b.iter(|| { 302 | assert_eq!(Some(1000), corpus.find_not_byteset(b"zyx")); 303 | }); 304 | }, 305 | ); 306 | define( 307 | c, 308 | "bstr/find_not_byteset/4", 309 | "repeated-rare-small", 310 | corpus, 311 | move |b| { 312 | let corpus = corpus.as_bytes(); 313 | b.iter(|| { 314 | assert_eq!(Some(1000), corpus.find_not_byteset(b"zyxw")); 315 | }); 316 | }, 317 | ); 318 | define( 319 | c, 
320 | "bstr/find_not_byteset/10", 321 | "repeated-rare-small", 322 | corpus, 323 | move |b| { 324 | let corpus = corpus.as_bytes(); 325 | b.iter(|| { 326 | assert_eq!(Some(1000), corpus.find_not_byteset(b"zyxwv12345")); 327 | }); 328 | }, 329 | ); 330 | 331 | define( 332 | c, 333 | "bstr/rfind_not_byteset/1", 334 | "repeated-rare-small", 335 | corpus, 336 | move |b| { 337 | // This file ends in \n, breaking our benchmark.... TODO find a 338 | // better dataset... 339 | let corpus = &corpus.as_bytes()[..(corpus.len() - 1)]; 340 | b.iter(|| { 341 | assert_eq!(None, corpus.rfind_not_byteset(b"z")); 342 | }); 343 | }, 344 | ); 345 | define( 346 | c, 347 | "bstr/rfind_not_byteset/2", 348 | "repeated-rare-small", 349 | corpus, 350 | move |b| { 351 | let corpus = corpus.as_bytes(); 352 | b.iter(|| { 353 | assert_eq!(None, corpus.rfind_not_byteset(b"z\n")); 354 | }); 355 | }, 356 | ); 357 | define( 358 | c, 359 | "bstr/rfind_not_byteset/3", 360 | "repeated-rare-small", 361 | corpus, 362 | move |b| { 363 | let corpus = corpus.as_bytes(); 364 | b.iter(|| { 365 | assert_eq!(None, corpus.rfind_not_byteset(b"zy\n")); 366 | }); 367 | }, 368 | ); 369 | define( 370 | c, 371 | "bstr/rfind_not_byteset/4", 372 | "repeated-rare-small", 373 | corpus, 374 | move |b| { 375 | let corpus = corpus.as_bytes(); 376 | b.iter(|| { 377 | assert_eq!(None, corpus.rfind_not_byteset(b"zyx\n")); 378 | }); 379 | }, 380 | ); 381 | define( 382 | c, 383 | "bstr/rfind_not_byteset/10", 384 | "repeated-rare-small", 385 | corpus, 386 | move |b| { 387 | let corpus = corpus.as_bytes(); 388 | b.iter(|| { 389 | assert_eq!(None, corpus.rfind_not_byteset(b"zyxwv1234\n")); 390 | }); 391 | }, 392 | ); 393 | } 394 | 395 | fn define_find_iter( 396 | c: &mut Criterion, 397 | group_name: &str, 398 | bench_name: &str, 399 | corpus: &'static [u8], 400 | needle: &'static str, 401 | expected: usize, 402 | ) { 403 | let corpus = str::from_utf8(corpus).unwrap(); 404 | 405 | let name = format!("bstr/{}", group_name); 406 | define(c, &name, bench_name, corpus.as_bytes(), move |b| { 407 | let corpus = corpus.as_bytes(); 408 | b.iter(|| { 409 | assert_eq!(expected, corpus.find_iter(needle).count()); 410 | }); 411 | }); 412 | 413 | let name = format!("std/{}", group_name); 414 | define(c, &name, bench_name, corpus.as_bytes(), move |b| { 415 | b.iter(|| { 416 | assert_eq!(expected, corpus.matches(needle).count()); 417 | }); 418 | }); 419 | } 420 | 421 | fn define_rfind_iter( 422 | c: &mut Criterion, 423 | group_name: &str, 424 | bench_name: &str, 425 | corpus: &'static [u8], 426 | needle: &'static str, 427 | expected: usize, 428 | ) { 429 | let corpus = str::from_utf8(corpus).unwrap(); 430 | 431 | let name = format!("bstr/{}", group_name); 432 | define(c, &name, bench_name, corpus.as_bytes(), move |b| { 433 | let corpus = corpus.as_bytes(); 434 | b.iter(|| { 435 | assert_eq!(expected, corpus.rfind_iter(needle).count()); 436 | }); 437 | }); 438 | 439 | let name = format!("std/{}", group_name); 440 | define(c, &name, bench_name, corpus.as_bytes(), move |b| { 441 | b.iter(|| { 442 | assert_eq!(expected, corpus.rmatches(needle).count()); 443 | }); 444 | }); 445 | } 446 | -------------------------------------------------------------------------------- /examples/graphemes-std.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use std::io::{self, BufRead, Write}; 3 | 4 | use unicode_segmentation::UnicodeSegmentation; 5 | 6 | fn main() -> Result<(), Box> { 7 | let stdin = io::stdin(); 8 | let mut 
stdin = stdin.lock();
9 | let mut stdout = io::BufWriter::new(io::stdout());
10 |
11 | let mut line = String::new();
12 | while stdin.read_line(&mut line)? > 0 {
13 | let end = line
14 | .grapheme_indices(true)
15 | .map(|(start, g)| start + g.len())
16 | .take(10)
17 | .last()
18 | .unwrap_or(line.len());
19 | stdout.write_all(line[..end].trim_end().as_bytes())?;
20 | stdout.write_all(b"\n")?;
21 |
22 | line.clear();
23 | }
24 | Ok(())
25 | }
26 | -------------------------------------------------------------------------------- /examples/graphemes.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, Write};
3 |
4 | use bstr::{io::BufReadExt, ByteSlice};
5 |
6 | fn main() -> Result<(), Box<dyn Error>> {
7 | let stdin = io::stdin();
8 | let mut stdout = io::BufWriter::new(io::stdout());
9 |
10 | stdin.lock().for_byte_line_with_terminator(|line| {
11 | let end = line
12 | .grapheme_indices()
13 | .map(|(_, end, _)| end)
14 | .take(10)
15 | .last()
16 | .unwrap_or(line.len());
17 | stdout.write_all(line[..end].trim_end())?;
18 | stdout.write_all(b"\n")?;
19 | Ok(true)
20 | })?;
21 | Ok(())
22 | }
23 | -------------------------------------------------------------------------------- /examples/lines-std.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, BufRead, Write};
3 |
4 | fn main() -> Result<(), Box<dyn Error>> {
5 | let stdin = io::stdin();
6 | let mut stdin = stdin.lock();
7 | let mut stdout = io::BufWriter::new(io::stdout());
8 |
9 | let mut line = String::new();
10 | while stdin.read_line(&mut line)? > 0 {
11 | if line.contains("Dimension") {
12 | stdout.write_all(line.as_bytes())?;
13 | }
14 | line.clear();
15 | }
16 | Ok(())
17 | }
18 | -------------------------------------------------------------------------------- /examples/lines.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, Write};
3 |
4 | use bstr::{io::BufReadExt, ByteSlice};
5 |
6 | fn main() -> Result<(), Box<dyn Error>> {
7 | let stdin = io::stdin();
8 | let mut stdout = io::BufWriter::new(io::stdout());
9 |
10 | stdin.lock().for_byte_line_with_terminator(|line| {
11 | if line.contains_str("Dimension") {
12 | stdout.write_all(line)?;
13 | }
14 | Ok(true)
15 | })?;
16 | Ok(())
17 | }
18 | -------------------------------------------------------------------------------- /examples/uppercase-std.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, BufRead, Write};
3 |
4 | fn main() -> Result<(), Box<dyn Error>> {
5 | let stdin = io::stdin();
6 | let mut stdin = stdin.lock();
7 | let mut stdout = io::BufWriter::new(io::stdout());
8 |
9 | let mut line = String::new();
10 | while stdin.read_line(&mut line)?
> 0 {
11 | stdout.write_all(line.to_uppercase().as_bytes())?;
12 | line.clear();
13 | }
14 | Ok(())
15 | }
16 | -------------------------------------------------------------------------------- /examples/uppercase.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, Write};
3 |
4 | use bstr::{io::BufReadExt, ByteSlice};
5 |
6 | fn main() -> Result<(), Box<dyn Error>> {
7 | let stdin = io::stdin();
8 | let mut stdout = io::BufWriter::new(io::stdout());
9 |
10 | let mut upper = vec![];
11 | stdin.lock().for_byte_line_with_terminator(|line| {
12 | upper.clear();
13 | line.to_uppercase_into(&mut upper);
14 | stdout.write_all(&upper)?;
15 | Ok(true)
16 | })?;
17 | Ok(())
18 | }
19 | -------------------------------------------------------------------------------- /examples/words-std.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, BufRead};
3 |
4 | use unicode_segmentation::UnicodeSegmentation;
5 |
6 | fn main() -> Result<(), Box<dyn Error>> {
7 | let stdin = io::stdin();
8 | let mut stdin = stdin.lock();
9 |
10 | let mut words = 0;
11 | let mut line = String::new();
12 | while stdin.read_line(&mut line)? > 0 {
13 | words += line.unicode_words().count();
14 | line.clear();
15 | }
16 | println!("{}", words);
17 | Ok(())
18 | }
19 | -------------------------------------------------------------------------------- /examples/words.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io;
3 |
4 | use bstr::{io::BufReadExt, ByteSlice};
5 |
6 | fn main() -> Result<(), Box<dyn Error>> {
7 | let stdin = io::stdin();
8 | let mut words = 0;
9 | stdin.lock().for_byte_line_with_terminator(|line| {
10 | words += line.words().count();
11 | Ok(true)
12 | })?;
13 | println!("{}", words);
14 | Ok(())
15 | }
16 | -------------------------------------------------------------------------------- /rustfmt.toml: --------------------------------------------------------------------------------
1 | max_width = 79
2 | use_small_heuristics = "max"
3 | -------------------------------------------------------------------------------- /scripts/generate-unicode-data: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -e
4 | D="$(dirname "$0")"
5 |
6 | # Convenience function for checking that a command exists.
7 | requires() {
8 | cmd="$1"
9 | if ! command -v "$cmd" > /dev/null 2>&1; then
10 | echo "DEPENDENCY MISSING: $cmd must be installed" >&2
11 | exit 1
12 | fi
13 | }
14 |
15 | # Test if an array ($2) contains a particular element ($1).
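# For example (illustrative): `array_exists b a b c` exits 0 (found), while
# `array_exists z a b c` exits 1 (not found).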
16 | array_exists() {
17 | needle="$1"
18 | shift
19 |
20 | for el in "$@"; do
21 | if [ "$el" = "$needle" ]; then
22 | return 0
23 | fi
24 | done
25 | return 1
26 | }
27 |
28 | graphemes() {
29 | regex="$(sh "$D/regex/grapheme.sh")"
30 |
31 | echo "generating forward grapheme DFA"
32 | regex-cli generate serialize sparse dfa \
33 | --minimize \
34 | --start-kind anchored \
35 | --shrink \
36 | --rustfmt \
37 | --safe \
38 | GRAPHEME_BREAK_FWD \
39 | src/unicode/fsm/ \
40 | "$regex"
41 |
42 | echo "generating reverse grapheme DFA"
43 | regex-cli generate serialize sparse dfa \
44 | --minimize \
45 | --start-kind anchored \
46 | --reverse \
47 | --match-kind all \
48 | --no-captures \
49 | --shrink \
50 | --rustfmt \
51 | --safe \
52 | GRAPHEME_BREAK_REV \
53 | src/unicode/fsm/ \
54 | "$regex"
55 | }
56 |
57 | words() {
58 | regex="$(sh "$D/regex/word.sh")"
59 |
60 | echo "generating forward word DFA (this can take a while)"
61 | regex-cli generate serialize sparse dfa \
62 | --minimize \
63 | --start-kind anchored \
64 | --shrink \
65 | --rustfmt \
66 | --safe \
67 | WORD_BREAK_FWD \
68 | src/unicode/fsm/ \
69 | "$regex"
70 | }
71 |
72 | sentences() {
73 | regex="$(sh "$D/regex/sentence.sh")"
74 |
75 | echo "generating forward sentence DFA (this can take a while)"
76 | regex-cli generate serialize sparse dfa \
77 | --minimize \
78 | --start-kind anchored \
79 | --shrink \
80 | --rustfmt \
81 | --safe \
82 | SENTENCE_BREAK_FWD \
83 | src/unicode/fsm/ \
84 | "$regex"
85 | }
86 |
87 | regional_indicator() {
88 | # For finding all occurrences of region indicators. This is used to handle
89 | # regional indicators as a special case for the reverse grapheme iterator
90 | # and the reverse word iterator.
91 | echo "generating regional indicator DFA"
92 | regex-cli generate serialize dense dfa \
93 | --minimize \
94 | --start-kind anchored \
95 | --reverse \
96 | --no-captures \
97 | --shrink \
98 | --rustfmt \
99 | --safe \
100 | REGIONAL_INDICATOR_REV \
101 | src/unicode/fsm/ \
102 | "\p{gcb=Regional_Indicator}"
103 | }
104 |
105 | simple_word() {
106 | echo "generating forward simple word DFA"
107 | regex-cli generate serialize sparse dfa \
108 | --minimize \
109 | --start-kind anchored \
110 | --shrink \
111 | --rustfmt \
112 | --safe \
113 | SIMPLE_WORD_FWD \
114 | src/unicode/fsm/ \
115 | "\w"
116 | }
117 |
118 | whitespace() {
119 | echo "generating forward whitespace DFA"
120 | regex-cli generate serialize dense dfa \
121 | --minimize \
122 | --start-kind anchored \
123 | --shrink \
124 | --rustfmt \
125 | --safe \
126 | WHITESPACE_ANCHORED_FWD \
127 | src/unicode/fsm/ \
128 | "\s+"
129 |
130 | echo "generating reverse whitespace DFA"
131 | regex-cli generate serialize dense dfa \
132 | --minimize \
133 | --start-kind anchored \
134 | --reverse \
135 | --no-captures \
136 | --shrink \
137 | --rustfmt \
138 | --safe \
139 | WHITESPACE_ANCHORED_REV \
140 | src/unicode/fsm/ \
141 | "\s+"
142 | }
143 |
144 | main() {
145 | if array_exists "-h" "$@" || array_exists "--help" "$@"; then
146 | echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2
147 | exit
148 | fi
149 |
150 | commands="
151 | graphemes
152 | sentences
153 | words
154 | regional-indicator
155 | simple-word
156 | whitespace
157 | "
158 | if array_exists "--list-commands" "$@"; then
159 | for cmd in $commands; do
160 | echo "$cmd"
161 | done
162 | exit
163 | fi
164 |
165 | # regex-cli is used to compile regexes into DFAs.
166 | # To get regex-cli, run: 167 | # 168 | # cargo install --git https://github.com/rust-lang/regex regex-cli 169 | # 170 | # regex-cli will build DFAs, serialize them to big endian and little endian 171 | # files, and then generate the Rust code to deserialize them. 172 | requires regex-cli 173 | 174 | mkdir -p src/unicode/fsm/ 175 | 176 | cmds=$* 177 | if [ $# -eq 0 ] || array_exists "all" "$@"; then 178 | cmds=$commands 179 | fi 180 | for cmd in $cmds; do 181 | if array_exists "$cmd" $commands; then 182 | fun="$(echo "$cmd" | sed 's/-/_/g')" 183 | eval "$fun" 184 | else 185 | echo "unrecognized command: $cmd" >&2 186 | fi 187 | done 188 | } 189 | 190 | main "$@" 191 | -------------------------------------------------------------------------------- /scripts/regex/grapheme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # vim: indentexpr= nosmartindent autoindent 4 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 5 | 6 | # This regex was manually written, derived from the rules in UAX #29. 7 | # Particularly, from Table 1c, which lays out a regex for grapheme clusters. 8 | 9 | CR="\p{gcb=CR}" 10 | LF="\p{gcb=LF}" 11 | Control="\p{gcb=Control}" 12 | Prepend="\p{gcb=Prepend}" 13 | L="\p{gcb=L}" 14 | V="\p{gcb=V}" 15 | LV="\p{gcb=LV}" 16 | LVT="\p{gcb=LVT}" 17 | T="\p{gcb=T}" 18 | RI="\p{gcb=RI}" 19 | Extend="\p{gcb=Extend}" 20 | ZWJ="\p{gcb=ZWJ}" 21 | SpacingMark="\p{gcb=SpacingMark}" 22 | 23 | Any="\p{any}" 24 | ExtendPict="\p{Extended_Pictographic}" 25 | 26 | echo "(?x) 27 | $CR $LF 28 | | 29 | $Control 30 | | 31 | $Prepend* 32 | ( 33 | ( 34 | ($L* ($V+ | $LV $V* | $LVT) $T*) 35 | | 36 | $L+ 37 | | 38 | $T+ 39 | ) 40 | | 41 | $RI $RI 42 | | 43 | $ExtendPict ($Extend* $ZWJ $ExtendPict)* 44 | | 45 | [^$Control $CR $LF] 46 | ) 47 | [$Extend $ZWJ $SpacingMark]* 48 | | 49 | $Any 50 | " 51 | -------------------------------------------------------------------------------- /scripts/regex/sentence.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # vim: indentexpr= nosmartindent autoindent 4 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 5 | 6 | # This is a regex that I reverse engineered from the sentence boundary chain 7 | # rules in UAX #29. Unlike the grapheme regex, which is essentially provided 8 | # for us in UAX #29, no such sentence regex exists. 9 | # 10 | # I looked into how ICU achieves this, since UAX #29 hints that producing 11 | # finite state machines for grapheme/sentence/word/line breaking is possible, 12 | # but only easy to do for graphemes. ICU does this by implementing their own 13 | # DSL for describing the break algorithms in terms of the chaining rules 14 | # directly. You can see an example for sentences in 15 | # icu4c/source/data/brkitr/rules/sent.txt. ICU then builds a finite state 16 | # machine from those rules in a mostly standard way, but implements the 17 | # "chaining" aspect of the rules by connecting overlapping end and start 18 | # states. For example, given SB7: 19 | # 20 | # (Upper | Lower) ATerm x Upper 21 | # 22 | # Then the naive way to convert this into a regex would be something like 23 | # 24 | # [\p{sb=Upper}\p{sb=Lower}]\p{sb=ATerm}\p{sb=Upper} 25 | # 26 | # Unfortunately, this is incorrect. Why? Well, consider an example like so: 27 | # 28 | # U.S.A. 
29 | #
30 | # A correct implementation of the sentence breaking algorithm should not insert
31 | # any breaks here, exactly in accordance with repeatedly applying rule SB7 as
32 | # given above. Our regex fails to do this because it will first match `U.S`
33 | # without breaking them---which is correct---but will then start looking for
34 | # its next rule beginning with a full stop (in ATerm) and followed by an
35 | # uppercase letter (A). This will wind up triggering rule SB11 (without
36 | # matching `A`), which inserts a break.
37 | #
38 | # The reason why this happens is because our initial application of rule SB7
39 | # "consumes" the next uppercase letter (S), which we want to reuse as a prefix
40 | # in the next rule application. A natural way to express this would be with
41 | # look-around, although it's not clear that works in every case since you
42 | # ultimately might want to consume that ending uppercase letter. In any case,
43 | # we can't use look-around in our truly regular regexes, so we must fix this.
44 | # The approach we take is to explicitly repeat rules when a suffix of a rule
45 | # is a prefix of another rule. In the case of SB7, the end of the rule, an
46 | # uppercase letter, also happens to match the beginning of the rule. This can
47 | # in turn be repeated indefinitely. Thus, our actual translation to a regex is:
48 | #
49 | # [\p{sb=Upper}\p{sb=Lower}]\p{sb=ATerm}\p{sb=Upper}(\p{sb=ATerm}\p{sb=Upper})*
50 | #
51 | # It turns out that this is exactly what ICU does, but in their case, they do
52 | # it automatically. In our case, we connect the chaining rules manually. It's
53 | # tedious. With that said, we do not implement Unicode line breaking with this
54 | # approach, which is a far scarier beast. In that case, it would probably be
55 | # worth writing the code to do what ICU does.
56 | #
57 | # In the case of sentence breaks, there aren't *too* many overlaps of this
58 | # nature. We list them out exhaustively to make this clear, because it's
59 | # essentially impossible to easily observe this in the regex. (It took me a
60 | # full day to figure all of this out.) Rules marked with N/A mean that they
61 | # specify a break, and this strategy only really applies to stringing together
62 | # non-breaks.
63 | #
64 | # SB1 - N/A
65 | # SB2 - N/A
66 | # SB3 - None
67 | # SB4 - N/A
68 | # SB5 - None
69 | # SB6 - None
70 | # SB7 - End overlaps with beginning of SB7
71 | # SB8 - End overlaps with beginning of SB7
72 | # SB8a - End overlaps with beginning of SB6, SB8, SB8a, SB9, SB10, SB11
73 | # SB9 - None
74 | # SB10 - None
75 | # SB11 - None
76 | # SB998 - N/A
77 | #
78 | # SB8a is in particular quite tricky to get right without look-ahead, since it
79 | # allows ping-ponging between match rules SB8a and SB9-11, where SB9-11
80 | # otherwise indicate that a break has been found. In the regex below, we tackle
81 | # this by only permitting part of SB8a to match inside our core non-breaking
82 | # repetition. In particular, we only allow the parts of SB8a to match that
83 | # permit the non-breaking components to continue. If a part of SB8a matches
84 | # that guarantees a pop out to SB9-11 (like `STerm STerm`), then we let it
85 | # happen. This still isn't correct because an SContinue might be seen which
86 | # would allow moving back into SB998 and thus the non-breaking repetition, so
87 | # we handle that case as well.
88 | #
89 | # Finally, the last complication here is the sprinkling of $Ex* everywhere.
90 | # This essentially corresponds to the implementation of SB5 by following 91 | # UAX #29's recommendation in S6.2. Essentially, we use it avoid ever breaking 92 | # in the middle of a grapheme cluster. 93 | 94 | CR="\p{sb=CR}" 95 | LF="\p{sb=LF}" 96 | Sep="\p{sb=Sep}" 97 | Close="\p{sb=Close}" 98 | Sp="\p{sb=Sp}" 99 | STerm="\p{sb=STerm}" 100 | ATerm="\p{sb=ATerm}" 101 | SContinue="\p{sb=SContinue}" 102 | Numeric="\p{sb=Numeric}" 103 | Upper="\p{sb=Upper}" 104 | Lower="\p{sb=Lower}" 105 | OLetter="\p{sb=OLetter}" 106 | 107 | Ex="[\p{sb=Extend}\p{sb=Format}]" 108 | ParaSep="[$Sep $CR $LF]" 109 | SATerm="[$STerm $ATerm]" 110 | 111 | LetterSepTerm="[$OLetter $Upper $Lower $ParaSep $SATerm]" 112 | 113 | echo "(?x) 114 | ( 115 | # SB6 116 | $ATerm $Ex* 117 | $Numeric 118 | | 119 | # SB7 120 | [$Upper $Lower] $Ex* $ATerm $Ex* 121 | $Upper $Ex* 122 | # overlap with SB7 123 | ($ATerm $Ex* $Upper $Ex*)* 124 | | 125 | # SB8 126 | $ATerm $Ex* $Close* $Ex* $Sp* $Ex* 127 | ([^$LetterSepTerm] $Ex*)* $Lower $Ex* 128 | # overlap with SB7 129 | ($ATerm $Ex* $Upper $Ex*)* 130 | | 131 | # SB8a 132 | $SATerm $Ex* $Close* $Ex* $Sp* $Ex* 133 | ( 134 | $SContinue 135 | | 136 | $ATerm $Ex* 137 | # Permit repetition of SB8a 138 | (($Close $Ex*)* ($Sp $Ex*)* $SATerm)* 139 | # In order to continue non-breaking matching, we now must observe 140 | # a match with a rule that keeps us in SB6-8a. Otherwise, we've entered 141 | # one of SB9-11 and know that a break must follow. 142 | ( 143 | # overlap with SB6 144 | $Numeric 145 | | 146 | # overlap with SB8 147 | ($Close $Ex*)* ($Sp $Ex*)* 148 | ([^$LetterSepTerm] $Ex*)* $Lower $Ex* 149 | # overlap with SB7 150 | ($ATerm $Ex* $Upper $Ex*)* 151 | | 152 | # overlap with SB8a 153 | ($Close $Ex*)* ($Sp $Ex*)* $SContinue 154 | ) 155 | | 156 | $STerm $Ex* 157 | # Permit repetition of SB8a 158 | (($Close $Ex*)* ($Sp $Ex*)* $SATerm)* 159 | # As with ATerm above, in order to continue non-breaking matching, we 160 | # must now observe a match with a rule that keeps us out of SB9-11. 161 | # For STerm, the only such possibility is to see an SContinue. Anything 162 | # else will result in a break. 163 | ($Close $Ex*)* ($Sp $Ex*)* $SContinue 164 | ) 165 | | 166 | # SB998 167 | # The logic behind this catch-all is that if we get to this point and 168 | # see a Sep, CR, LF, STerm or ATerm, then it has to fall into one of 169 | # SB9, SB10 or SB11. In the cases of SB9-11, we always find a break since 170 | # SB11 acts as a catch-all to induce a break following a SATerm that isn't 171 | # handled by rules SB6-SB8a. 172 | [^$ParaSep $SATerm] 173 | )* 174 | # The following collapses rules SB3, SB4, part of SB8a, SB9, SB10 and SB11. 175 | ($SATerm $Ex* ($Close $Ex*)* ($Sp $Ex*)*)* ($CR $LF | $ParaSep)? 176 | " 177 | -------------------------------------------------------------------------------- /scripts/regex/word.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # vim: indentexpr= nosmartindent autoindent 4 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 5 | 6 | # See the comments in regex/sentence.sh for the general approach to how this 7 | # regex was written. 8 | # 9 | # Writing the regex for this was *hard*. It took me two days of hacking to get 10 | # this far, and that was after I had finished the sentence regex, so my brain 11 | # was fully cached on this. Unlike the sentence regex, the rules in the regex 12 | # below don't correspond as nicely to the rules in UAX #29. 
In particular, the 13 | # UAX #29 rules have a ton of overlap with each other, which requires crazy 14 | # stuff in the regex. I'm not even sure the regex below is 100% correct or even 15 | # minimal, however, I did compare this with the ICU word segmenter on a few 16 | # different corpora, and it produces identical results. (In addition to of 17 | # course passing the UCD tests.) 18 | # 19 | # In general, I consider this approach to be a failure. Firstly, this is 20 | # clearly a write-only regex. Secondly, building the minimized DFA for this is 21 | # incredibly slow. Thirdly, the DFA is itself very large (~240KB). Fourthly, 22 | # reversing this regex (for reverse word iteration) results in a >19MB DFA. 23 | # Yes. That's MB. Wat. And it took 5 minutes to build. 24 | # 25 | # I think we might consider changing our approach to this problem. The normal 26 | # path I've seen, I think, is to decode codepoints one at a time, and then 27 | # thread them through a state machine in the code itself. We could take this 28 | # approach, or possibly combine it with a DFA that tells us which Word_Break 29 | # value a codepoint has. I'd prefer the latter approach, but it requires adding 30 | # RegexSet support to regex-automata. Something that should definitely be done, 31 | # but is a fair amount of work. 32 | # 33 | # Gah. 34 | 35 | CR="\p{wb=CR}" 36 | LF="\p{wb=LF}" 37 | Newline="\p{wb=Newline}" 38 | ZWJ="\p{wb=ZWJ}" 39 | RI="\p{wb=Regional_Indicator}" 40 | Katakana="\p{wb=Katakana}" 41 | HebrewLet="\p{wb=HebrewLetter}" 42 | ALetter="\p{wb=ALetter}" 43 | SingleQuote="\p{wb=SingleQuote}" 44 | DoubleQuote="\p{wb=DoubleQuote}" 45 | MidNumLet="\p{wb=MidNumLet}" 46 | MidLetter="\p{wb=MidLetter}" 47 | MidNum="\p{wb=MidNum}" 48 | Numeric="\p{wb=Numeric}" 49 | ExtendNumLet="\p{wb=ExtendNumLet}" 50 | WSegSpace="\p{wb=WSegSpace}" 51 | 52 | Any="\p{any}" 53 | Ex="[\p{wb=Extend} \p{wb=Format} $ZWJ]" 54 | ExtendPict="\p{Extended_Pictographic}" 55 | AHLetter="[$ALetter $HebrewLet]" 56 | MidNumLetQ="[$MidNumLet $SingleQuote]" 57 | 58 | AHLetterRepeat="$AHLetter $Ex* ([$MidLetter $MidNumLetQ] $Ex* $AHLetter $Ex*)*" 59 | NumericRepeat="$Numeric $Ex* ([$MidNum $MidNumLetQ] $Ex* $Numeric $Ex*)*" 60 | 61 | echo "(?x) 62 | $CR $LF 63 | | 64 | [$Newline $CR $LF] 65 | | 66 | $WSegSpace $WSegSpace+ 67 | | 68 | ( 69 | ([^$Newline $CR $LF]? 
$Ex* $ZWJ $ExtendPict $Ex*)+
70 | |
71 | ($ExtendNumLet $Ex*)* $AHLetter $Ex*
72 | (
73 | (
74 | ($NumericRepeat | $ExtendNumLet $Ex*)*
75 | |
76 | [$MidLetter $MidNumLetQ] $Ex*
77 | )
78 | $AHLetter $Ex*
79 | )+
80 | ($NumericRepeat | $ExtendNumLet $Ex*)*
81 | |
82 | ($ExtendNumLet $Ex*)* $AHLetter $Ex* ($NumericRepeat | $ExtendNumLet $Ex*)+
83 | |
84 | ($ExtendNumLet $Ex*)* $Numeric $Ex*
85 | (
86 | (
87 | ($AHLetterRepeat | $ExtendNumLet $Ex*)*
88 | |
89 | [$MidNum $MidNumLetQ] $Ex*
90 | )
91 | $Numeric $Ex*
92 | )+
93 | ($AHLetterRepeat | $ExtendNumLet $Ex*)*
94 | |
95 | ($ExtendNumLet $Ex*)* $Numeric $Ex* ($AHLetterRepeat | $ExtendNumLet $Ex*)+
96 | |
97 | $Katakana $Ex*
98 | (($Katakana | $ExtendNumLet) $Ex*)+
99 | |
100 | $ExtendNumLet $Ex*
101 | (($ExtendNumLet | $AHLetter | $Numeric | $Katakana) $Ex*)+
102 | )+
103 | |
104 | $HebrewLet $Ex* $SingleQuote $Ex*
105 | |
106 | ($HebrewLet $Ex* $DoubleQuote $Ex*)+ $HebrewLet $Ex*
107 | |
108 | $RI $Ex* $RI $Ex*
109 | |
110 | $Any $Ex*
111 | "
112 | -------------------------------------------------------------------------------- /src/ascii.rs: --------------------------------------------------------------------------------
1 | // The following ~400 lines of code exist for exactly one purpose, which is
2 | // to optimize this code:
3 | //
4 | // byte_slice.iter().position(|&b| b > 0x7F).unwrap_or(byte_slice.len())
5 | //
6 | // Yes... Overengineered is a word that comes to mind, but this is effectively
7 | // a very similar problem to memchr, and virtually nobody has been able to
8 | // resist optimizing the crap out of that (except for perhaps the BSD and MUSL
9 | // folks). In particular, this routine makes a very common case (ASCII) very
10 | // fast, which seems worth it. We do stop short of adding AVX variants of the
11 | // code below in order to retain our sanity and also to avoid needing to deal
12 | // with runtime target feature detection. RESIST!
13 | //
14 | // In order to understand the SIMD version below, it would be good to read this
15 | // comment describing how my memchr routine works:
16 | // https://github.com/BurntSushi/rust-memchr/blob/b0a29f267f4a7fad8ffcc8fe8377a06498202883/src/x86/sse2.rs#L19-L106
17 | //
18 | // The primary difference with memchr is that for ASCII, we can do a bit less
19 | // work. In particular, we don't need to detect the presence of a specific
20 | // byte, but rather, whether any byte has its most significant bit set. That
21 | // means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to
22 | // _mm_movemask_epi8.
23 |
24 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
25 | const USIZE_BYTES: usize = core::mem::size_of::<usize>();
26 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
27 | const ALIGN_MASK: usize = core::mem::align_of::<usize>() - 1;
28 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
29 | const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES;
30 |
31 | // This is a mask where the most significant bit of each byte in the usize
32 | // is set. We test this bit to determine whether a character is ASCII or not.
33 | // Namely, a single byte is regarded as an ASCII codepoint if and only if its
34 | // most significant bit is not set.
35 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
36 | const ASCII_MASK_U64: u64 = 0x8080808080808080;
37 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
38 | const ASCII_MASK: usize = ASCII_MASK_U64 as usize;
39 |
40 | /// Returns the index of the first non-ASCII byte in the given slice.
41 | /// 42 | /// If slice only contains ASCII bytes, then the length of the slice is 43 | /// returned. 44 | pub fn first_non_ascii_byte(slice: &[u8]) -> usize { 45 | #[cfg(any(miri, not(target_arch = "x86_64")))] 46 | { 47 | first_non_ascii_byte_fallback(slice) 48 | } 49 | 50 | #[cfg(all(not(miri), target_arch = "x86_64"))] 51 | { 52 | first_non_ascii_byte_sse2(slice) 53 | } 54 | } 55 | 56 | #[cfg(any(test, miri, not(target_arch = "x86_64")))] 57 | fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize { 58 | let start_ptr = slice.as_ptr(); 59 | let end_ptr = slice[slice.len()..].as_ptr(); 60 | let mut ptr = start_ptr; 61 | 62 | unsafe { 63 | if slice.len() < USIZE_BYTES { 64 | return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); 65 | } 66 | 67 | let chunk = read_unaligned_usize(ptr); 68 | let mask = chunk & ASCII_MASK; 69 | if mask != 0 { 70 | return first_non_ascii_byte_mask(mask); 71 | } 72 | 73 | ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & ALIGN_MASK)); 74 | debug_assert!(ptr > start_ptr); 75 | debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr); 76 | if slice.len() >= FALLBACK_LOOP_SIZE { 77 | while ptr <= ptr_sub(end_ptr, FALLBACK_LOOP_SIZE) { 78 | debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); 79 | 80 | let a = *(ptr as *const usize); 81 | let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize); 82 | if (a | b) & ASCII_MASK != 0 { 83 | // What a kludge. We wrap the position finding code into 84 | // a non-inlineable function, which makes the codegen in 85 | // the tight loop above a bit better by avoiding a 86 | // couple extra movs. We pay for it by two additional 87 | // stores, but only in the case of finding a non-ASCII 88 | // byte. 89 | #[inline(never)] 90 | unsafe fn findpos( 91 | start_ptr: *const u8, 92 | ptr: *const u8, 93 | ) -> usize { 94 | let a = *(ptr as *const usize); 95 | let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize); 96 | 97 | let mut at = sub(ptr, start_ptr); 98 | let maska = a & ASCII_MASK; 99 | if maska != 0 { 100 | return at + first_non_ascii_byte_mask(maska); 101 | } 102 | 103 | at += USIZE_BYTES; 104 | let maskb = b & ASCII_MASK; 105 | debug_assert!(maskb != 0); 106 | return at + first_non_ascii_byte_mask(maskb); 107 | } 108 | return findpos(start_ptr, ptr); 109 | } 110 | ptr = ptr_add(ptr, FALLBACK_LOOP_SIZE); 111 | } 112 | } 113 | first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) 114 | } 115 | } 116 | 117 | #[cfg(all(not(miri), target_arch = "x86_64"))] 118 | fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize { 119 | use core::arch::x86_64::*; 120 | 121 | const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>(); 122 | const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; 123 | const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE; 124 | 125 | let start_ptr = slice.as_ptr(); 126 | let end_ptr = slice[slice.len()..].as_ptr(); 127 | let mut ptr = start_ptr; 128 | 129 | unsafe { 130 | if slice.len() < VECTOR_SIZE { 131 | return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); 132 | } 133 | 134 | let chunk = _mm_loadu_si128(ptr as *const __m128i); 135 | let mask = _mm_movemask_epi8(chunk); 136 | if mask != 0 { 137 | return mask.trailing_zeros() as usize; 138 | } 139 | 140 | ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); 141 | debug_assert!(ptr > start_ptr); 142 | debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr); 143 | if slice.len() >= VECTOR_LOOP_SIZE { 144 | while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) { 145 | debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); 146 | 147 | let a = _mm_load_si128(ptr as *const 
__m128i);
148 | let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
149 | let c =
150 | _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
151 | let d =
152 | _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
153 |
154 | let or1 = _mm_or_si128(a, b);
155 | let or2 = _mm_or_si128(c, d);
156 | let or3 = _mm_or_si128(or1, or2);
157 | if _mm_movemask_epi8(or3) != 0 {
158 | let mut at = sub(ptr, start_ptr);
159 | let mask = _mm_movemask_epi8(a);
160 | if mask != 0 {
161 | return at + mask.trailing_zeros() as usize;
162 | }
163 |
164 | at += VECTOR_SIZE;
165 | let mask = _mm_movemask_epi8(b);
166 | if mask != 0 {
167 | return at + mask.trailing_zeros() as usize;
168 | }
169 |
170 | at += VECTOR_SIZE;
171 | let mask = _mm_movemask_epi8(c);
172 | if mask != 0 {
173 | return at + mask.trailing_zeros() as usize;
174 | }
175 |
176 | at += VECTOR_SIZE;
177 | let mask = _mm_movemask_epi8(d);
178 | debug_assert!(mask != 0);
179 | return at + mask.trailing_zeros() as usize;
180 | }
181 | ptr = ptr_add(ptr, VECTOR_LOOP_SIZE);
182 | }
183 | }
184 | while ptr <= end_ptr.sub(VECTOR_SIZE) {
185 | debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);
186 |
187 | let chunk = _mm_loadu_si128(ptr as *const __m128i);
188 | let mask = _mm_movemask_epi8(chunk);
189 | if mask != 0 {
190 | return sub(ptr, start_ptr) + mask.trailing_zeros() as usize;
191 | }
192 | ptr = ptr.add(VECTOR_SIZE);
193 | }
194 | first_non_ascii_byte_slow(start_ptr, end_ptr, ptr)
195 | }
196 | }
197 |
198 | #[inline(always)]
199 | unsafe fn first_non_ascii_byte_slow(
200 | start_ptr: *const u8,
201 | end_ptr: *const u8,
202 | mut ptr: *const u8,
203 | ) -> usize {
204 | debug_assert!(start_ptr <= ptr);
205 | debug_assert!(ptr <= end_ptr);
206 |
207 | while ptr < end_ptr {
208 | if *ptr > 0x7F {
209 | return sub(ptr, start_ptr);
210 | }
211 | ptr = ptr.offset(1);
212 | }
213 | sub(end_ptr, start_ptr)
214 | }
215 |
216 | /// Compute the position of the first non-ASCII byte in the given mask.
217 | ///
218 | /// The mask should be computed by `chunk & ASCII_MASK`, where `chunk` is
219 | /// 8 contiguous bytes of the slice being checked where *at least* one of those
220 | /// bytes is not an ASCII byte.
221 | ///
222 | /// The position returned is always in the inclusive range [0, 7].
223 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
224 | fn first_non_ascii_byte_mask(mask: usize) -> usize {
225 | #[cfg(target_endian = "little")]
226 | {
227 | mask.trailing_zeros() as usize / 8
228 | }
229 | #[cfg(target_endian = "big")]
230 | {
231 | mask.leading_zeros() as usize / 8
232 | }
233 | }
234 |
235 | /// Increment the given pointer by the given amount.
236 | unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 {
237 | ptr.add(amt)
238 | }
239 |
240 | /// Decrement the given pointer by the given amount.
241 | unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 {
242 | ptr.sub(amt)
243 | }
244 |
245 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
246 | unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
247 | use core::ptr;
248 |
249 | let mut n: usize = 0;
250 | ptr::copy_nonoverlapping(ptr, &mut n as *mut _ as *mut u8, USIZE_BYTES);
251 | n
252 | }
253 |
254 | /// Subtract `b` from `a` and return the difference. `a` should be greater than
255 | /// or equal to `b`.
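/// (Note, for illustration: this is the same byte distance that the unsafe
/// `a.offset_from(b)` would compute for pointers into the same allocation;
/// doing it with plain integer casts avoids another `unsafe` block.)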
256 | fn sub(a: *const u8, b: *const u8) -> usize { 257 | debug_assert!(a >= b); 258 | (a as usize) - (b as usize) 259 | } 260 | 261 | #[cfg(test)] 262 | mod tests { 263 | use super::*; 264 | 265 | // Our testing approach here is to try and exhaustively test every case. 266 | // This includes the position at which a non-ASCII byte occurs in addition 267 | // to the alignment of the slice that we're searching. 268 | 269 | #[test] 270 | fn positive_fallback_forward() { 271 | for i in 0..517 { 272 | let s = "a".repeat(i); 273 | assert_eq!( 274 | i, 275 | first_non_ascii_byte_fallback(s.as_bytes()), 276 | "i: {:?}, len: {:?}, s: {:?}", 277 | i, 278 | s.len(), 279 | s 280 | ); 281 | } 282 | } 283 | 284 | #[test] 285 | #[cfg(target_arch = "x86_64")] 286 | #[cfg(not(miri))] 287 | fn positive_sse2_forward() { 288 | for i in 0..517 { 289 | let b = "a".repeat(i).into_bytes(); 290 | assert_eq!(b.len(), first_non_ascii_byte_sse2(&b)); 291 | } 292 | } 293 | 294 | #[test] 295 | #[cfg(not(miri))] 296 | fn negative_fallback_forward() { 297 | for i in 0..517 { 298 | for align in 0..65 { 299 | let mut s = "a".repeat(i); 300 | s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); 301 | let s = s.get(align..).unwrap_or(""); 302 | assert_eq!( 303 | i.saturating_sub(align), 304 | first_non_ascii_byte_fallback(s.as_bytes()), 305 | "i: {:?}, align: {:?}, len: {:?}, s: {:?}", 306 | i, 307 | align, 308 | s.len(), 309 | s 310 | ); 311 | } 312 | } 313 | } 314 | 315 | #[test] 316 | #[cfg(target_arch = "x86_64")] 317 | #[cfg(not(miri))] 318 | fn negative_sse2_forward() { 319 | for i in 0..517 { 320 | for align in 0..65 { 321 | let mut s = "a".repeat(i); 322 | s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); 323 | let s = s.get(align..).unwrap_or(""); 324 | assert_eq!( 325 | i.saturating_sub(align), 326 | first_non_ascii_byte_sse2(s.as_bytes()), 327 | "i: {:?}, align: {:?}, len: {:?}, s: {:?}", 328 | i, 329 | align, 330 | s.len(), 331 | s 332 | ); 333 | } 334 | } 335 | } 336 | } 337 | -------------------------------------------------------------------------------- /src/bstr.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "alloc")] 2 | use alloc::boxed::Box; 3 | 4 | /// A wrapper for `&[u8]` that provides convenient string oriented trait impls. 5 | /// 6 | /// If you need ownership or a growable byte string buffer, then use 7 | /// [`BString`](struct.BString.html). 8 | /// 9 | /// Using a `&BStr` is just like using a `&[u8]`, since `BStr` 10 | /// implements `Deref` to `[u8]`. So all methods available on `[u8]` 11 | /// are also available on `BStr`. 12 | /// 13 | /// # Representation 14 | /// 15 | /// A `&BStr` has the same representation as a `&str`. That is, a `&BStr` is 16 | /// a fat pointer which consists of a pointer to some bytes and a length. 17 | /// 18 | /// # Trait implementations 19 | /// 20 | /// The `BStr` type has a number of trait implementations, and in particular, 21 | /// defines equality and ordinal comparisons between `&BStr`, `&str` and 22 | /// `&[u8]` for convenience. 23 | /// 24 | /// The `Debug` implementation for `BStr` shows its bytes as a normal string. 25 | /// For invalid UTF-8, hex escape sequences are used. 26 | /// 27 | /// The `Display` implementation behaves as if `BStr` were first lossily 28 | /// converted to a `str`. Invalid UTF-8 bytes are substituted with the Unicode 29 | /// replacement codepoint, which looks like this: �. 
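///
/// For instance (an illustrative example, not from the original docs),
/// printing `BStr::new(b"abc\xFF")` with `{}` displays `abc` followed by the
/// replacement codepoint, while `{:?}` hex-escapes the invalid byte instead.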
30 | #[repr(transparent)]
31 | pub struct BStr {
32 | pub(crate) bytes: [u8],
33 | }
34 |
35 | impl BStr {
36 | /// Directly creates a `BStr` slice from anything that can be converted
37 | /// to a byte slice.
38 | ///
39 | /// This is very similar to the [`B`](crate::B) function, except this
40 | /// returns a `&BStr` instead of a `&[u8]`.
41 | ///
42 | /// This is a cost-free conversion.
43 | ///
44 | /// # Example
45 | ///
46 | /// You can create `BStr`s from byte arrays, byte slices or even string
47 | /// slices:
48 | ///
49 | /// ```
50 | /// use bstr::BStr;
51 | ///
52 | /// let a = BStr::new(b"abc");
53 | /// let b = BStr::new(&b"abc"[..]);
54 | /// let c = BStr::new("abc");
55 | ///
56 | /// assert_eq!(a, b);
57 | /// assert_eq!(a, c);
58 | /// ```
59 | #[inline]
60 | pub fn new<B: ?Sized + AsRef<[u8]>>(bytes: &B) -> &BStr {
61 | BStr::from_bytes(bytes.as_ref())
62 | }
63 |
64 | #[inline]
65 | pub(crate) fn new_mut<B: ?Sized + AsMut<[u8]>>(
66 | bytes: &mut B,
67 | ) -> &mut BStr {
68 | BStr::from_bytes_mut(bytes.as_mut())
69 | }
70 |
71 | #[inline]
72 | pub(crate) fn from_bytes(slice: &[u8]) -> &BStr {
73 | unsafe { &*(slice as *const [u8] as *const BStr) }
74 | }
75 |
76 | #[inline]
77 | pub(crate) fn from_bytes_mut(slice: &mut [u8]) -> &mut BStr {
78 | unsafe { &mut *(slice as *mut [u8] as *mut BStr) }
79 | }
80 |
81 | #[inline]
82 | #[cfg(feature = "alloc")]
83 | pub(crate) fn from_boxed_bytes(slice: Box<[u8]>) -> Box<BStr> {
84 | unsafe { Box::from_raw(Box::into_raw(slice) as _) }
85 | }
86 |
87 | #[inline]
88 | #[cfg(feature = "alloc")]
89 | pub(crate) fn into_boxed_bytes(slice: Box<BStr>) -> Box<[u8]> {
90 | unsafe { Box::from_raw(Box::into_raw(slice) as _) }
91 | }
92 |
93 | #[inline]
94 | pub(crate) fn as_bytes(&self) -> &[u8] {
95 | &self.bytes
96 | }
97 | }
98 | -------------------------------------------------------------------------------- /src/bstring.rs: --------------------------------------------------------------------------------
1 | use alloc::vec::Vec;
2 |
3 | use crate::bstr::BStr;
4 |
5 | /// A wrapper for `Vec<u8>` that provides convenient string oriented trait
6 | /// impls.
7 | ///
8 | /// A `BString` has ownership over its contents and corresponds to
9 | /// a growable or shrinkable buffer. Its borrowed counterpart is a
10 | /// [`BStr`](struct.BStr.html), called a byte string slice.
11 | ///
12 | /// Using a `BString` is just like using a `Vec<u8>`, since `BString`
13 | /// implements `Deref` to `Vec<u8>`. So all methods available on `Vec<u8>`
14 | /// are also available on `BString`.
15 | ///
16 | /// # Examples
17 | ///
18 | /// You can create a new `BString` from a `Vec<u8>` via a `From` impl:
19 | ///
20 | /// ```
21 | /// use bstr::BString;
22 | ///
23 | /// let s = BString::from("Hello, world!");
24 | /// ```
25 | ///
26 | /// # Deref
27 | ///
28 | /// The `BString` type implements `Deref` and `DerefMut`, where the target
29 | /// types are `&Vec<u8>` and `&mut Vec<u8>`, respectively. `Deref` permits all of the
30 | /// methods defined on `Vec<u8>` to be implicitly callable on any `BString`.
31 | ///
32 | /// For more information about how deref works, see the documentation for the
33 | /// [`std::ops::Deref`](https://doc.rust-lang.org/std/ops/trait.Deref.html)
34 | /// trait.
35 | ///
36 | /// # Representation
37 | ///
38 | /// A `BString` has the same representation as a `Vec<u8>` and a `String`.
39 | /// That is, it is made up of three word sized components: a pointer to a
40 | /// region of memory containing the bytes, a length and a capacity.
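///
/// (One way to see this, as an illustrative check rather than part of the
/// original docs: `core::mem::size_of::<BString>()` equals
/// `core::mem::size_of::<Vec<u8>>()`, i.e. three machine words.)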
41 | #[derive(Clone)]
42 | pub struct BString {
43 | bytes: Vec<u8>,
44 | }
45 |
46 | impl BString {
47 | /// Constructs a new `BString` from the given [`Vec<u8>`].
48 | ///
49 | /// # Examples
50 | ///
51 | /// ```
52 | /// use bstr::BString;
53 | ///
54 | /// let mut b = BString::new(Vec::with_capacity(10));
55 | /// ```
56 | ///
57 | /// This function is `const`:
58 | ///
59 | /// ```
60 | /// use bstr::BString;
61 | ///
62 | /// const B: BString = BString::new(vec![]);
63 | /// ```
64 | #[inline]
65 | pub const fn new(bytes: Vec<u8>) -> BString {
66 | BString { bytes }
67 | }
68 |
69 | #[inline]
70 | pub(crate) fn as_bytes(&self) -> &[u8] {
71 | &self.bytes
72 | }
73 |
74 | #[inline]
75 | pub(crate) fn as_bytes_mut(&mut self) -> &mut [u8] {
76 | &mut self.bytes
77 | }
78 |
79 | #[inline]
80 | pub(crate) fn as_bstr(&self) -> &BStr {
81 | BStr::new(&self.bytes)
82 | }
83 |
84 | #[inline]
85 | pub(crate) fn as_mut_bstr(&mut self) -> &mut BStr {
86 | BStr::new_mut(&mut self.bytes)
87 | }
88 |
89 | #[inline]
90 | pub(crate) fn as_vec(&self) -> &Vec<u8> {
91 | &self.bytes
92 | }
93 |
94 | #[inline]
95 | pub(crate) fn as_vec_mut(&mut self) -> &mut Vec<u8> {
96 | &mut self.bytes
97 | }
98 |
99 | #[inline]
100 | pub(crate) fn into_vec(self) -> Vec<u8> {
101 | self.bytes
102 | }
103 | }
104 | -------------------------------------------------------------------------------- /src/byteset/mod.rs: --------------------------------------------------------------------------------
1 | use memchr::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3};
2 |
3 | mod scalar;
4 |
5 | #[inline]
6 | fn build_table(byteset: &[u8]) -> [u8; 256] {
7 | let mut table = [0u8; 256];
8 | for &b in byteset {
9 | table[b as usize] = 1;
10 | }
11 | table
12 | }
13 |
14 | #[inline]
15 | pub(crate) fn find(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
16 | match byteset.len() {
17 | 0 => None,
18 | 1 => memchr(byteset[0], haystack),
19 | 2 => memchr2(byteset[0], byteset[1], haystack),
20 | 3 => memchr3(byteset[0], byteset[1], byteset[2], haystack),
21 | _ => {
22 | let table = build_table(byteset);
23 | scalar::forward_search_bytes(haystack, |b| table[b as usize] != 0)
24 | }
25 | }
26 | }
27 |
28 | #[inline]
29 | pub(crate) fn rfind(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
30 | match byteset.len() {
31 | 0 => None,
32 | 1 => memrchr(byteset[0], haystack),
33 | 2 => memrchr2(byteset[0], byteset[1], haystack),
34 | 3 => memrchr3(byteset[0], byteset[1], byteset[2], haystack),
35 | _ => {
36 | let table = build_table(byteset);
37 | scalar::reverse_search_bytes(haystack, |b| table[b as usize] != 0)
38 | }
39 | }
40 | }
41 |
42 | #[inline]
43 | pub(crate) fn find_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
44 | if haystack.is_empty() {
45 | return None;
46 | }
47 | match byteset.len() {
48 | 0 => Some(0),
49 | 1 => scalar::inv_memchr(byteset[0], haystack),
50 | 2 => scalar::forward_search_bytes(haystack, |b| {
51 | b != byteset[0] && b != byteset[1]
52 | }),
53 | 3 => scalar::forward_search_bytes(haystack, |b| {
54 | b != byteset[0] && b != byteset[1] && b != byteset[2]
55 | }),
56 | _ => {
57 | let table = build_table(byteset);
58 | scalar::forward_search_bytes(haystack, |b| table[b as usize] == 0)
59 | }
60 | }
61 | }
62 | #[inline]
63 | pub(crate) fn rfind_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
64 | if haystack.is_empty() {
65 | return None;
66 | }
67 | match byteset.len() {
68 | 0 => Some(haystack.len() - 1),
69 | 1 => scalar::inv_memrchr(byteset[0], haystack),
70 | 2 => scalar::reverse_search_bytes(haystack, |b| {
71 | b !=
byteset[0] && b != byteset[1]
72 | }),
73 | 3 => scalar::reverse_search_bytes(haystack, |b| {
74 | b != byteset[0] && b != byteset[1] && b != byteset[2]
75 | }),
76 | _ => {
77 | let table = build_table(byteset);
78 | scalar::reverse_search_bytes(haystack, |b| table[b as usize] == 0)
79 | }
80 | }
81 | }
82 |
83 | #[cfg(all(test, feature = "std", not(miri)))]
84 | mod tests {
85 | use alloc::vec::Vec;
86 |
87 | quickcheck::quickcheck! {
88 | fn qc_byteset_forward_matches_naive(
89 | haystack: Vec<u8>,
90 | needles: Vec<u8>
91 | ) -> bool {
92 | super::find(&haystack, &needles)
93 | == haystack.iter().position(|b| needles.contains(b))
94 | }
95 | fn qc_byteset_backwards_matches_naive(
96 | haystack: Vec<u8>,
97 | needles: Vec<u8>
98 | ) -> bool {
99 | super::rfind(&haystack, &needles)
100 | == haystack.iter().rposition(|b| needles.contains(b))
101 | }
102 | fn qc_byteset_forward_not_matches_naive(
103 | haystack: Vec<u8>,
104 | needles: Vec<u8>
105 | ) -> bool {
106 | super::find_not(&haystack, &needles)
107 | == haystack.iter().position(|b| !needles.contains(b))
108 | }
109 | fn qc_byteset_backwards_not_matches_naive(
110 | haystack: Vec<u8>,
111 | needles: Vec<u8>
112 | ) -> bool {
113 | super::rfind_not(&haystack, &needles)
114 | == haystack.iter().rposition(|b| !needles.contains(b))
115 | }
116 | }
117 | }
118 | -------------------------------------------------------------------------------- /src/byteset/scalar.rs: --------------------------------------------------------------------------------
1 | // This is adapted from `fallback.rs` from rust-memchr. It's modified to return
2 | // the 'inverse' query of memchr, e.g. finding the first byte not in the
3 | // provided set. This is simple for the 1-byte case.
4 |
5 | use core::{cmp, usize};
6 |
7 | const USIZE_BYTES: usize = core::mem::size_of::<usize>();
8 | const ALIGN_MASK: usize = core::mem::align_of::<usize>() - 1;
9 |
10 | // The number of bytes to loop at in one iteration of memchr/memrchr.
11 | const LOOP_SIZE: usize = 2 * USIZE_BYTES;
12 |
13 | /// Repeat the given byte into a word size number. That is, every 8 bits
14 | /// is equivalent to the given byte. For example, if `b` is `\x4E` or
15 | /// `01001110` in binary, then the returned value on a 32-bit system would be:
16 | /// `01001110_01001110_01001110_01001110`.
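///
/// (Why the arithmetic below works: `usize::MAX / 255` is the constant
/// `0x0101...01`, with a `1` in the low bit of every byte, so multiplying it
/// by `b` broadcasts `b` into each byte of the word.)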
17 | #[inline(always)]
18 | fn repeat_byte(b: u8) -> usize {
19 | (b as usize) * (usize::MAX / 255)
20 | }
21 |
22 | pub fn inv_memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
23 | let vn1 = repeat_byte(n1);
24 | let confirm = |byte| byte != n1;
25 | let loop_size = cmp::min(LOOP_SIZE, haystack.len());
26 | let start_ptr = haystack.as_ptr();
27 |
28 | unsafe {
29 | let end_ptr = haystack.as_ptr().add(haystack.len());
30 | let mut ptr = start_ptr;
31 |
32 | if haystack.len() < USIZE_BYTES {
33 | return forward_search(start_ptr, end_ptr, ptr, confirm);
34 | }
35 |
36 | let chunk = read_unaligned_usize(ptr);
37 | if (chunk ^ vn1) != 0 {
38 | return forward_search(start_ptr, end_ptr, ptr, confirm);
39 | }
40 |
41 | ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & ALIGN_MASK));
42 | debug_assert!(ptr > start_ptr);
43 | debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr);
44 | while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
45 | debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
46 |
47 | let a = *(ptr as *const usize);
48 | let b = *(ptr.add(USIZE_BYTES) as *const usize);
49 | let eqa = (a ^ vn1) != 0;
50 | let eqb = (b ^ vn1) != 0;
51 | if eqa || eqb {
52 | break;
53 | }
54 | ptr = ptr.add(LOOP_SIZE);
55 | }
56 | forward_search(start_ptr, end_ptr, ptr, confirm)
57 | }
58 | }
59 |
60 | /// Return the last index not matching the byte `x` in `text`.
61 | pub fn inv_memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
62 | let vn1 = repeat_byte(n1);
63 | let confirm = |byte| byte != n1;
64 | let loop_size = cmp::min(LOOP_SIZE, haystack.len());
65 | let start_ptr = haystack.as_ptr();
66 |
67 | unsafe {
68 | let end_ptr = haystack.as_ptr().add(haystack.len());
69 | let mut ptr = end_ptr;
70 |
71 | if haystack.len() < USIZE_BYTES {
72 | return reverse_search(start_ptr, end_ptr, ptr, confirm);
73 | }
74 |
75 | let chunk = read_unaligned_usize(ptr.sub(USIZE_BYTES));
76 | if (chunk ^ vn1) != 0 {
77 | return reverse_search(start_ptr, end_ptr, ptr, confirm);
78 | }
79 |
80 | ptr = ptr.sub(end_ptr as usize & ALIGN_MASK);
81 | debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
82 | while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
83 | debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
84 |
85 | let a = *(ptr.sub(2 * USIZE_BYTES) as *const usize);
86 | let b = *(ptr.sub(1 * USIZE_BYTES) as *const usize);
87 | let eqa = (a ^ vn1) != 0;
88 | let eqb = (b ^ vn1) != 0;
89 | if eqa || eqb {
90 | break;
91 | }
92 | ptr = ptr.sub(loop_size);
93 | }
94 | reverse_search(start_ptr, end_ptr, ptr, confirm)
95 | }
96 | }
97 |
98 | #[inline(always)]
99 | unsafe fn forward_search<F: Fn(u8) -> bool>(
100 | start_ptr: *const u8,
101 | end_ptr: *const u8,
102 | mut ptr: *const u8,
103 | confirm: F,
104 | ) -> Option<usize> {
105 | debug_assert!(start_ptr <= ptr);
106 | debug_assert!(ptr <= end_ptr);
107 |
108 | while ptr < end_ptr {
109 | if confirm(*ptr) {
110 | return Some(sub(ptr, start_ptr));
111 | }
112 | ptr = ptr.offset(1);
113 | }
114 | None
115 | }
116 |
117 | #[inline(always)]
118 | unsafe fn reverse_search<F: Fn(u8) -> bool>(
119 | start_ptr: *const u8,
120 | end_ptr: *const u8,
121 | mut ptr: *const u8,
122 | confirm: F,
123 | ) -> Option<usize> {
124 | debug_assert!(start_ptr <= ptr);
125 | debug_assert!(ptr <= end_ptr);
126 |
127 | while ptr > start_ptr {
128 | ptr = ptr.offset(-1);
129 | if confirm(*ptr) {
130 | return Some(sub(ptr, start_ptr));
131 | }
132 | }
133 | None
134 | }
135 |
136 | unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
137 | (ptr as *const usize).read_unaligned()
138 | }
139 |
140 | /// Subtract
`b` from `a` and return the difference. `a` should be greater than
141 | /// or equal to `b`.
142 | fn sub(a: *const u8, b: *const u8) -> usize {
143 | debug_assert!(a >= b);
144 | (a as usize) - (b as usize)
145 | }
146 |
147 | /// Safe wrapper around `forward_search`
148 | #[inline]
149 | pub(crate) fn forward_search_bytes<F: Fn(u8) -> bool>(
150 | s: &[u8],
151 | confirm: F,
152 | ) -> Option<usize> {
153 | unsafe {
154 | let start = s.as_ptr();
155 | let end = start.add(s.len());
156 | forward_search(start, end, start, confirm)
157 | }
158 | }
159 |
160 | /// Safe wrapper around `reverse_search`
161 | #[inline]
162 | pub(crate) fn reverse_search_bytes<F: Fn(u8) -> bool>(
163 | s: &[u8],
164 | confirm: F,
165 | ) -> Option<usize> {
166 | unsafe {
167 | let start = s.as_ptr();
168 | let end = start.add(s.len());
169 | reverse_search(start, end, end, confirm)
170 | }
171 | }
172 |
173 | #[cfg(all(test, feature = "std"))]
174 | mod tests {
175 | use alloc::{vec, vec::Vec};
176 |
177 | use super::{inv_memchr, inv_memrchr};
178 |
179 | // search string, search byte, inv_memchr result, inv_memrchr result.
180 | // these are expanded into a much larger set of tests in build_tests
181 | const TESTS: &[(&[u8], u8, usize, usize)] = &[
182 | (b"z", b'a', 0, 0),
183 | (b"zz", b'a', 0, 1),
184 | (b"aza", b'a', 1, 1),
185 | (b"zaz", b'a', 0, 2),
186 | (b"zza", b'a', 0, 1),
187 | (b"zaa", b'a', 0, 0),
188 | (b"zzz", b'a', 0, 2),
189 | ];
190 |
191 | type TestCase = (Vec<u8>, u8, Option<(usize, usize)>);
192 |
193 | fn build_tests() -> Vec<TestCase> {
194 | #[cfg(not(miri))]
195 | const MAX_PER: usize = 515;
196 | #[cfg(miri)]
197 | const MAX_PER: usize = 10;
198 |
199 | let mut result = vec![];
200 | for &(search, byte, fwd_pos, rev_pos) in TESTS {
201 | result.push((search.to_vec(), byte, Some((fwd_pos, rev_pos))));
202 | for i in 1..MAX_PER {
203 | // add a bunch of copies of the search byte to the end.
204 | let mut suffixed: Vec<u8> = search.into();
205 | suffixed.extend(std::iter::repeat(byte).take(i));
206 | result.push((suffixed, byte, Some((fwd_pos, rev_pos))));
207 |
208 | // add a bunch of copies of the search byte to the start.
209 | let mut prefixed: Vec<u8> =
210 | std::iter::repeat(byte).take(i).collect();
211 | prefixed.extend(search);
212 | result.push((
213 | prefixed,
214 | byte,
215 | Some((fwd_pos + i, rev_pos + i)),
216 | ));
217 |
218 | // add a bunch of copies of the search byte to both ends.
219 | let mut surrounded: Vec<u8> =
220 | std::iter::repeat(byte).take(i).collect();
221 | surrounded.extend(search);
222 | surrounded.extend(std::iter::repeat(byte).take(i));
223 | result.push((
224 | surrounded,
225 | byte,
226 | Some((fwd_pos + i, rev_pos + i)),
227 | ));
228 | }
229 | }
230 |
231 | // build non-matching tests for several sizes
232 | for i in 0..MAX_PER {
233 | result.push((
234 | std::iter::repeat(b'\0').take(i).collect(),
235 | b'\0',
236 | None,
237 | ));
238 | }
239 |
240 | result
241 | }
242 |
243 | #[test]
244 | fn test_inv_memchr() {
245 | use crate::{ByteSlice, B};
246 |
247 | #[cfg(not(miri))]
248 | const MAX_OFFSET: usize = 130;
249 | #[cfg(miri)]
250 | const MAX_OFFSET: usize = 13;
251 |
252 | for (search, byte, matching) in build_tests() {
253 | assert_eq!(
254 | inv_memchr(byte, &search),
255 | matching.map(|m| m.0),
256 | "inv_memchr when searching for {:?} in {:?}",
257 | byte as char,
258 | // better printing
259 | B(&search).as_bstr(),
260 | );
261 | assert_eq!(
262 | inv_memrchr(byte, &search),
263 | matching.map(|m| m.1),
264 | "inv_memrchr when searching for {:?} in {:?}",
265 | byte as char,
266 | // better printing
267 | B(&search).as_bstr(),
268 | );
269 | // Test a rather large number of offsets for potential alignment
270 | // issues.
271 | for offset in 1..MAX_OFFSET {
272 | if offset >= search.len() {
273 | break;
274 | }
275 | // If this would cause us to shift the results off the end,
276 | // skip it so that we don't have to recompute them.
277 | if let Some((f, r)) = matching {
278 | if offset > f || offset > r {
279 | break;
280 | }
281 | }
282 | let realigned = &search[offset..];
283 |
284 | let forward_pos = matching.map(|m| m.0 - offset);
285 | let reverse_pos = matching.map(|m| m.1 - offset);
286 |
287 | assert_eq!(
288 | inv_memchr(byte, &realigned),
289 | forward_pos,
290 | "inv_memchr when searching (realigned by {}) for {:?} in {:?}",
291 | offset,
292 | byte as char,
293 | realigned.as_bstr(),
294 | );
295 | assert_eq!(
296 | inv_memrchr(byte, &realigned),
297 | reverse_pos,
298 | "inv_memrchr when searching (realigned by {}) for {:?} in {:?}",
299 | offset,
300 | byte as char,
301 | realigned.as_bstr(),
302 | );
303 | }
304 | }
305 | }
306 | }
307 | -------------------------------------------------------------------------------- /src/escape_bytes.rs: --------------------------------------------------------------------------------
1 | /// An iterator of `char` values that represent an escaping of arbitrary bytes.
2 | ///
3 | /// The lifetime parameter `'a` refers to the lifetime of the bytes being
4 | /// escaped.
5 | ///
6 | /// This iterator is created by the
7 | /// [`ByteSlice::escape_bytes`](crate::ByteSlice::escape_bytes) method.
8 | #[derive(Clone, Debug)]
9 | pub struct EscapeBytes<'a> {
10 | remaining: &'a [u8],
11 | state: EscapeState,
12 | }
13 |
14 | impl<'a> EscapeBytes<'a> {
15 | pub(crate) fn new(bytes: &'a [u8]) -> EscapeBytes<'a> {
16 | EscapeBytes { remaining: bytes, state: EscapeState::Start }
17 | }
18 | }
19 |
20 | impl<'a> Iterator for EscapeBytes<'a> {
21 | type Item = char;
22 |
23 | #[inline]
24 | fn next(&mut self) -> Option<char> {
25 | use self::EscapeState::*;
26 |
27 | match self.state {
28 | Start => {
29 | let byte = match crate::decode_utf8(self.remaining) {
30 | (None, 0) => return None,
31 | // If we see invalid UTF-8 or ASCII, then we always just
32 | // peel one byte off. If it's printable ASCII, we'll pass
33 | // it through as-is below. Otherwise, below, it will get
34 | // escaped in some way.
35 |                     (None, _) | (Some(_), 1) => {
36 |                         let byte = self.remaining[0];
37 |                         self.remaining = &self.remaining[1..];
38 |                         byte
39 |                     }
40 |                     // For any valid UTF-8 that is not ASCII, we pass it
41 |                     // through as-is. We don't do any Unicode escaping.
42 |                     (Some(ch), size) => {
43 |                         self.remaining = &self.remaining[size..];
44 |                         return Some(ch);
45 |                     }
46 |                 };
47 |                 self.state = match byte {
48 |                     0x21..=0x5B | 0x5D..=0x7E => {
49 |                         return Some(char::from(byte))
50 |                     }
51 |                     b'\0' => SpecialEscape('0'),
52 |                     b'\n' => SpecialEscape('n'),
53 |                     b'\r' => SpecialEscape('r'),
54 |                     b'\t' => SpecialEscape('t'),
55 |                     b'\\' => SpecialEscape('\\'),
56 |                     _ => HexEscapeX(byte),
57 |                 };
58 |                 Some('\\')
59 |             }
60 |             SpecialEscape(ch) => {
61 |                 self.state = Start;
62 |                 Some(ch)
63 |             }
64 |             HexEscapeX(byte) => {
65 |                 self.state = HexEscapeHighNybble(byte);
66 |                 Some('x')
67 |             }
68 |             HexEscapeHighNybble(byte) => {
69 |                 self.state = HexEscapeLowNybble(byte);
70 |                 let nybble = byte >> 4;
71 |                 Some(hexdigit_to_char(nybble))
72 |             }
73 |             HexEscapeLowNybble(byte) => {
74 |                 self.state = Start;
75 |                 let nybble = byte & 0xF;
76 |                 Some(hexdigit_to_char(nybble))
77 |             }
78 |         }
79 |     }
80 | }
81 | 
82 | impl<'a> core::fmt::Display for EscapeBytes<'a> {
83 |     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
84 |         use core::fmt::Write;
85 |         for ch in self.clone() {
86 |             f.write_char(ch)?;
87 |         }
88 |         Ok(())
89 |     }
90 | }
91 | 
92 | /// The state used by the FSM in the escaping iterator.
93 | #[derive(Clone, Debug)]
94 | enum EscapeState {
95 |     /// Read and remove the next byte from 'remaining'. If 'remaining' is
96 |     /// empty, then return None. Otherwise, escape the byte according to the
97 |     /// following rules or emit it as-is.
98 |     ///
99 |     /// If it's \n, \r, \t, \\ or \0, then emit a '\' and set the current
100 |     /// state to 'SpecialEscape(n | r | t | \ | 0)'. Otherwise, if the 'byte'
101 |     /// is not in [\x21-\x5B\x5D-\x7E], then emit a '\' and set the state
102 |     /// to 'HexEscapeX(byte)'.
103 |     Start,
104 |     /// Emit the given codepoint as is. This assumes '\' has just been emitted.
105 |     /// Then set the state to 'Start'.
106 |     SpecialEscape(char),
107 |     /// Emit the 'x' part of a hex escape. This assumes '\' has just been
108 |     /// emitted. Then set the state to 'HexEscapeHighNybble(byte)'.
109 |     HexEscapeX(u8),
110 |     /// Emit the high nybble of the byte as a hexadecimal digit. This
111 |     /// assumes '\x' has just been emitted. Then set the state to
112 |     /// 'HexEscapeLowNybble(byte)'.
113 |     HexEscapeHighNybble(u8),
114 |     /// Emit the low nybble of the byte as a hexadecimal digit. This assumes
115 |     /// '\xZ' has just been emitted, where 'Z' is the high nybble of this byte.
116 |     /// Then set the state to 'Start'.
117 |     HexEscapeLowNybble(u8),
118 | }
119 | 
120 | /// An iterator of `u8` values that represent an unescaping of a sequence of
121 | /// codepoints.
122 | ///
123 | /// The type parameter `I` refers to the iterator of codepoints that is
124 | /// unescaped.
125 | ///
126 | /// Currently this iterator is not exposed in the crate API, and instead all
127 | /// we expose is a `ByteVec::unescape` method, which of course requires an
128 | /// alloc. That's the most convenient form of this, but in theory, we could
129 | /// expose this for core-only use cases too. I'm just not quite sure what the
130 | /// API should be.
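///
/// For example (mirroring the `unescape` tests below), feeding the `char`s
/// of `r"\x61\n\i"` through this iterator yields the bytes `b"a\n\\i"`:
/// recognized escapes are decoded, while invalid or incomplete escape
/// sequences unescape as themselves.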
131 | #[derive(Clone, Debug)]
132 | #[cfg(feature = "alloc")]
133 | pub(crate) struct UnescapeBytes<I> {
134 |     it: I,
135 |     state: UnescapeState,
136 | }
137 | 
138 | #[cfg(feature = "alloc")]
139 | impl<I: Iterator<Item = char>> UnescapeBytes<I> {
140 |     pub(crate) fn new<T: IntoIterator<IntoIter = I>>(
141 |         t: T,
142 |     ) -> UnescapeBytes<I> {
143 |         UnescapeBytes { it: t.into_iter(), state: UnescapeState::Start }
144 |     }
145 | }
146 | 
147 | #[cfg(feature = "alloc")]
148 | impl<I: Iterator<Item = char>> Iterator for UnescapeBytes<I> {
149 |     type Item = u8;
150 | 
151 |     fn next(&mut self) -> Option<u8> {
152 |         use self::UnescapeState::*;
153 | 
154 |         loop {
155 |             match self.state {
156 |                 Start => {
157 |                     let ch = self.it.next()?;
158 |                     match ch {
159 |                         '\\' => {
160 |                             self.state = Escape;
161 |                         }
162 |                         ch => {
163 |                             self.state = UnescapeState::bytes(&[], ch);
164 |                         }
165 |                     }
166 |                 }
167 |                 Bytes { buf, mut cur, len } => {
168 |                     let byte = buf[cur];
169 |                     cur += 1;
170 |                     if cur >= len {
171 |                         self.state = Start;
172 |                     } else {
173 |                         self.state = Bytes { buf, cur, len };
174 |                     }
175 |                     return Some(byte);
176 |                 }
177 |                 Escape => {
178 |                     let ch = match self.it.next() {
179 |                         Some(ch) => ch,
180 |                         None => {
181 |                             self.state = Start;
182 |                             // Incomplete escape sequences unescape as
183 |                             // themselves.
184 |                             return Some(b'\\');
185 |                         }
186 |                     };
187 |                     match ch {
188 |                         '0' => {
189 |                             self.state = Start;
190 |                             return Some(b'\x00');
191 |                         }
192 |                         '\\' => {
193 |                             self.state = Start;
194 |                             return Some(b'\\');
195 |                         }
196 |                         'r' => {
197 |                             self.state = Start;
198 |                             return Some(b'\r');
199 |                         }
200 |                         'n' => {
201 |                             self.state = Start;
202 |                             return Some(b'\n');
203 |                         }
204 |                         't' => {
205 |                             self.state = Start;
206 |                             return Some(b'\t');
207 |                         }
208 |                         'x' => {
209 |                             self.state = HexFirst;
210 |                         }
211 |                         ch => {
212 |                             // An invalid escape sequence unescapes as itself.
213 |                             self.state = UnescapeState::bytes(&[b'\\'], ch);
214 |                         }
215 |                     }
216 |                 }
217 |                 HexFirst => {
218 |                     let ch = match self.it.next() {
219 |                         Some(ch) => ch,
220 |                         None => {
221 |                             // An incomplete escape sequence unescapes as
222 |                             // itself.
223 |                             self.state = UnescapeState::bytes_raw(&[b'x']);
224 |                             return Some(b'\\');
225 |                         }
226 |                     };
227 |                     match ch {
228 |                         '0'..='9' | 'A'..='F' | 'a'..='f' => {
229 |                             self.state = HexSecond(ch);
230 |                         }
231 |                         ch => {
232 |                             // An invalid escape sequence unescapes as itself.
233 |                             self.state = UnescapeState::bytes(&[b'x'], ch);
234 |                             return Some(b'\\');
235 |                         }
236 |                     }
237 |                 }
238 |                 HexSecond(first) => {
239 |                     let second = match self.it.next() {
240 |                         Some(ch) => ch,
241 |                         None => {
242 |                             // An incomplete escape sequence unescapes as
243 |                             // itself.
244 |                             self.state = UnescapeState::bytes(&[b'x'], first);
245 |                             return Some(b'\\');
246 |                         }
247 |                     };
248 |                     match second {
249 |                         '0'..='9' | 'A'..='F' | 'a'..='f' => {
250 |                             self.state = Start;
251 |                             let hinybble = char_to_hexdigit(first);
252 |                             let lonybble = char_to_hexdigit(second);
253 |                             let byte = hinybble << 4 | lonybble;
254 |                             return Some(byte);
255 |                         }
256 |                         ch => {
257 |                             // An invalid escape sequence unescapes as itself.
258 |                             self.state =
259 |                                 UnescapeState::bytes2(&[b'x'], first, ch);
260 |                             return Some(b'\\');
261 |                         }
262 |                     }
263 |                 }
264 |             }
265 |         }
266 |     }
267 | }
268 | 
269 | /// The state used by the FSM in the unescaping iterator.
270 | #[derive(Clone, Debug)]
271 | #[cfg(feature = "alloc")]
272 | enum UnescapeState {
273 |     /// The start state. Look for an escape sequence, otherwise emit the next
274 |     /// codepoint as-is.
275 |     Start,
276 |     /// Emit the byte at `buf[cur]`.
277 |     ///
278 |     /// This state should never be created when `cur >= len`.
That is, when 279 | /// this state is visited, it is assumed that `cur < len`. 280 | Bytes { buf: [u8; 11], cur: usize, len: usize }, 281 | /// This state is entered after a `\` is seen. 282 | Escape, 283 | /// This state is entered after a `\x` is seen. 284 | HexFirst, 285 | /// This state is entered after a `\xN` is seen, where `N` is in 286 | /// `[0-9A-Fa-f]`. The given codepoint corresponds to `N`. 287 | HexSecond(char), 288 | } 289 | 290 | #[cfg(feature = "alloc")] 291 | impl UnescapeState { 292 | /// Create a new `Bytes` variant with the given slice. 293 | /// 294 | /// # Panics 295 | /// 296 | /// Panics if `bytes.len() > 11`. 297 | fn bytes_raw(bytes: &[u8]) -> UnescapeState { 298 | // This can be increased, you just need to make sure 'buf' in the 299 | // 'Bytes' state has enough room. 300 | assert!(bytes.len() <= 11, "no more than 11 bytes allowed"); 301 | let mut buf = [0; 11]; 302 | buf[..bytes.len()].copy_from_slice(bytes); 303 | UnescapeState::Bytes { buf, cur: 0, len: bytes.len() } 304 | } 305 | 306 | /// Create a new `Bytes` variant with the prefix byte slice, followed by 307 | /// the UTF-8 encoding of the given char. 308 | /// 309 | /// # Panics 310 | /// 311 | /// Panics if `prefix.len() > 3`. 312 | fn bytes(prefix: &[u8], ch: char) -> UnescapeState { 313 | // This can be increased, you just need to make sure 'buf' in the 314 | // 'Bytes' state has enough room. 315 | assert!(prefix.len() <= 3, "no more than 3 bytes allowed"); 316 | let mut buf = [0; 11]; 317 | buf[..prefix.len()].copy_from_slice(prefix); 318 | let chlen = ch.encode_utf8(&mut buf[prefix.len()..]).len(); 319 | UnescapeState::Bytes { buf, cur: 0, len: prefix.len() + chlen } 320 | } 321 | 322 | /// Create a new `Bytes` variant with the prefix byte slice, followed by 323 | /// the UTF-8 encoding of `ch1` and then `ch2`. 324 | /// 325 | /// # Panics 326 | /// 327 | /// Panics if `prefix.len() > 3`. 328 | fn bytes2(prefix: &[u8], ch1: char, ch2: char) -> UnescapeState { 329 | // This can be increased, you just need to make sure 'buf' in the 330 | // 'Bytes' state has enough room. 331 | assert!(prefix.len() <= 3, "no more than 3 bytes allowed"); 332 | let mut buf = [0; 11]; 333 | buf[..prefix.len()].copy_from_slice(prefix); 334 | let len1 = ch1.encode_utf8(&mut buf[prefix.len()..]).len(); 335 | let len2 = ch2.encode_utf8(&mut buf[prefix.len() + len1..]).len(); 336 | UnescapeState::Bytes { buf, cur: 0, len: prefix.len() + len1 + len2 } 337 | } 338 | } 339 | 340 | /// Convert the given codepoint to its corresponding hexadecimal digit. 341 | /// 342 | /// # Panics 343 | /// 344 | /// This panics if `ch` is not in `[0-9A-Fa-f]`. 345 | #[cfg(feature = "alloc")] 346 | fn char_to_hexdigit(ch: char) -> u8 { 347 | u8::try_from(ch.to_digit(16).unwrap()).unwrap() 348 | } 349 | 350 | /// Convert the given hexadecimal digit to its corresponding codepoint. 351 | /// 352 | /// # Panics 353 | /// 354 | /// This panics when `digit > 15`. 
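/// For example, `hexdigit_to_char(0)` yields `'0'` and `hexdigit_to_char(15)`
/// yields `'F'`; digits are uppercased, matching the `\x7F`-style escapes in
/// the tests below.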
355 | fn hexdigit_to_char(digit: u8) -> char {
356 |     char::from_digit(u32::from(digit), 16).unwrap().to_ascii_uppercase()
357 | }
358 | 
359 | #[cfg(all(test, feature = "std"))]
360 | mod tests {
361 |     use alloc::string::{String, ToString};
362 | 
363 |     use crate::BString;
364 | 
365 |     use super::*;
366 | 
367 |     #[allow(non_snake_case)]
368 |     fn B<B: AsRef<[u8]>>(bytes: B) -> BString {
369 |         BString::from(bytes.as_ref())
370 |     }
371 | 
372 |     fn e<B: AsRef<[u8]>>(bytes: B) -> String {
373 |         EscapeBytes::new(bytes.as_ref()).to_string()
374 |     }
375 | 
376 |     fn u(string: &str) -> BString {
377 |         UnescapeBytes::new(string.chars()).collect()
378 |     }
379 | 
380 |     #[test]
381 |     fn escape() {
382 |         assert_eq!(r"a", e(br"a"));
383 |         assert_eq!(r"\\x61", e(br"\x61"));
384 |         assert_eq!(r"a", e(b"\x61"));
385 |         assert_eq!(r"~", e(b"\x7E"));
386 |         assert_eq!(r"\x7F", e(b"\x7F"));
387 | 
388 |         assert_eq!(r"\n", e(b"\n"));
389 |         assert_eq!(r"\r", e(b"\r"));
390 |         assert_eq!(r"\t", e(b"\t"));
391 |         assert_eq!(r"\\", e(b"\\"));
392 |         assert_eq!(r"\0", e(b"\0"));
393 |         assert_eq!(r"\0", e(b"\x00"));
394 | 
395 |         assert_eq!(r"\x88", e(b"\x88"));
396 |         assert_eq!(r"\x8F", e(b"\x8F"));
397 |         assert_eq!(r"\xF8", e(b"\xF8"));
398 |         assert_eq!(r"\xFF", e(b"\xFF"));
399 | 
400 |         assert_eq!(r"\xE2", e(b"\xE2"));
401 |         assert_eq!(r"\xE2\x98", e(b"\xE2\x98"));
402 |         assert_eq!(r"☃", e(b"\xE2\x98\x83"));
403 | 
404 |         assert_eq!(r"\xF0", e(b"\xF0"));
405 |         assert_eq!(r"\xF0\x9F", e(b"\xF0\x9F"));
406 |         assert_eq!(r"\xF0\x9F\x92", e(b"\xF0\x9F\x92"));
407 |         assert_eq!(r"💩", e(b"\xF0\x9F\x92\xA9"));
408 |     }
409 | 
410 |     #[test]
411 |     fn unescape() {
412 |         assert_eq!(B(r"a"), u(r"a"));
413 |         assert_eq!(B(r"\x61"), u(r"\\x61"));
414 |         assert_eq!(B(r"a"), u(r"\x61"));
415 |         assert_eq!(B(r"~"), u(r"\x7E"));
416 |         assert_eq!(B(b"\x7F"), u(r"\x7F"));
417 | 
418 |         assert_eq!(B(b"\n"), u(r"\n"));
419 |         assert_eq!(B(b"\r"), u(r"\r"));
420 |         assert_eq!(B(b"\t"), u(r"\t"));
421 |         assert_eq!(B(b"\\"), u(r"\\"));
422 |         assert_eq!(B(b"\0"), u(r"\0"));
423 |         assert_eq!(B(b"\0"), u(r"\x00"));
424 | 
425 |         assert_eq!(B(b"\x88"), u(r"\x88"));
426 |         assert_eq!(B(b"\x8F"), u(r"\x8F"));
427 |         assert_eq!(B(b"\xF8"), u(r"\xF8"));
428 |         assert_eq!(B(b"\xFF"), u(r"\xFF"));
429 | 
430 |         assert_eq!(B(b"\xE2"), u(r"\xE2"));
431 |         assert_eq!(B(b"\xE2\x98"), u(r"\xE2\x98"));
432 |         assert_eq!(B("☃"), u(r"\xE2\x98\x83"));
433 | 
434 |         assert_eq!(B(b"\xF0"), u(r"\xf0"));
435 |         assert_eq!(B(b"\xF0\x9F"), u(r"\xf0\x9f"));
436 |         assert_eq!(B(b"\xF0\x9F\x92"), u(r"\xf0\x9f\x92"));
437 |         assert_eq!(B("💩"), u(r"\xf0\x9f\x92\xa9"));
438 |     }
439 | 
440 |     #[test]
441 |     fn unescape_weird() {
442 |         assert_eq!(B(b"\\"), u(r"\"));
443 |         assert_eq!(B(b"\\"), u(r"\\"));
444 |         assert_eq!(B(b"\\x"), u(r"\x"));
445 |         assert_eq!(B(b"\\xA"), u(r"\xA"));
446 | 
447 |         assert_eq!(B(b"\\xZ"), u(r"\xZ"));
448 |         assert_eq!(B(b"\\xZZ"), u(r"\xZZ"));
449 |         assert_eq!(B(b"\\i"), u(r"\i"));
450 |         assert_eq!(B(b"\\u"), u(r"\u"));
451 |         assert_eq!(B(b"\\u{2603}"), u(r"\u{2603}"));
452 |     }
453 | }
454 | 
--------------------------------------------------------------------------------
/src/io.rs:
--------------------------------------------------------------------------------
1 | /*!
2 | Utilities for working with I/O using byte strings.
3 | 
4 | This module currently only exports a single trait, `BufReadExt`, which
5 | provides facilities for conveniently and efficiently working with lines as
6 | byte strings.
7 | 
8 | More APIs may be added in the future.
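
For example, here's a quick sketch of counting lines without requiring valid
UTF-8 (the same pattern as the `BufReadExt::for_byte_line` example below):

```
use std::io;

use bstr::io::BufReadExt;

# fn example() -> Result<(), io::Error> {
let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");

let mut count = 0;
cursor.for_byte_line(|_line| {
    // Returning Ok(true) continues iteration; Ok(false) would stop it.
    count += 1;
    Ok(true)
})?;
assert_eq!(count, 3);
# Ok(()) }; example().unwrap()
```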
9 | */
10 | 
11 | use alloc::{vec, vec::Vec};
12 | 
13 | use std::io;
14 | 
15 | use crate::{ext_slice::ByteSlice, ext_vec::ByteVec};
16 | 
17 | /// An extension trait for
18 | /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
19 | /// which provides convenience APIs for dealing with byte strings.
20 | pub trait BufReadExt: io::BufRead {
21 |     /// Returns an iterator over the lines of this reader, where each line
22 |     /// is represented as a byte string.
23 |     ///
24 |     /// Each item yielded by this iterator is an `io::Result<Vec<u8>>`, where
25 |     /// an error is yielded if there was a problem reading from the underlying
26 |     /// reader.
27 |     ///
28 |     /// On success, the next line in the iterator is returned. The line does
29 |     /// *not* contain a trailing `\n` or `\r\n`.
30 |     ///
31 |     /// # Examples
32 |     ///
33 |     /// Basic usage:
34 |     ///
35 |     /// ```
36 |     /// use std::io;
37 |     ///
38 |     /// use bstr::io::BufReadExt;
39 |     ///
40 |     /// # fn example() -> Result<(), io::Error> {
41 |     /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
42 |     ///
43 |     /// let mut lines = vec![];
44 |     /// for result in cursor.byte_lines() {
45 |     ///     let line = result?;
46 |     ///     lines.push(line);
47 |     /// }
48 |     /// assert_eq!(lines.len(), 3);
49 |     /// assert_eq!(lines[0], "lorem".as_bytes());
50 |     /// assert_eq!(lines[1], "ipsum".as_bytes());
51 |     /// assert_eq!(lines[2], "dolor".as_bytes());
52 |     /// # Ok(()) }; example().unwrap()
53 |     /// ```
54 |     fn byte_lines(self) -> ByteLines<Self>
55 |     where
56 |         Self: Sized,
57 |     {
58 |         ByteLines { buf: self }
59 |     }
60 | 
61 |     /// Returns an iterator over byte-terminated records of this reader, where
62 |     /// each record is represented as a byte string.
63 |     ///
64 |     /// Each item yielded by this iterator is an `io::Result<Vec<u8>>`, where
65 |     /// an error is yielded if there was a problem reading from the underlying
66 |     /// reader.
67 |     ///
68 |     /// On success, the next record in the iterator is returned. The record
69 |     /// does *not* contain its trailing terminator.
70 |     ///
71 |     /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in
72 |     /// that it has no special handling for `\r`.
73 |     ///
74 |     /// # Examples
75 |     ///
76 |     /// Basic usage:
77 |     ///
78 |     /// ```
79 |     /// use std::io;
80 |     ///
81 |     /// use bstr::io::BufReadExt;
82 |     ///
83 |     /// # fn example() -> Result<(), io::Error> {
84 |     /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
85 |     ///
86 |     /// let mut records = vec![];
87 |     /// for result in cursor.byte_records(b'\x00') {
88 |     ///     let record = result?;
89 |     ///     records.push(record);
90 |     /// }
91 |     /// assert_eq!(records.len(), 3);
92 |     /// assert_eq!(records[0], "lorem".as_bytes());
93 |     /// assert_eq!(records[1], "ipsum".as_bytes());
94 |     /// assert_eq!(records[2], "dolor".as_bytes());
95 |     /// # Ok(()) }; example().unwrap()
96 |     /// ```
97 |     fn byte_records(self, terminator: u8) -> ByteRecords<Self>
98 |     where
99 |         Self: Sized,
100 |     {
101 |         ByteRecords { terminator, buf: self }
102 |     }
103 | 
104 |     /// Executes the given closure on each line in the underlying reader.
105 |     ///
106 |     /// If the closure returns an error (or if the underlying reader returns an
107 |     /// error), then iteration is stopped and the error is returned. If false
108 |     /// is returned, then iteration is stopped and no error is returned.
109 |     ///
110 |     /// The closure given is called on exactly the same values as yielded by
111 |     /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
112 |     /// iterator.
Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes.
113 |     ///
114 |     /// This routine is useful for iterating over lines as quickly as
115 |     /// possible. Namely, a single allocation is reused for each line.
116 |     ///
117 |     /// # Examples
118 |     ///
119 |     /// Basic usage:
120 |     ///
121 |     /// ```
122 |     /// use std::io;
123 |     ///
124 |     /// use bstr::io::BufReadExt;
125 |     ///
126 |     /// # fn example() -> Result<(), io::Error> {
127 |     /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
128 |     ///
129 |     /// let mut lines = vec![];
130 |     /// cursor.for_byte_line(|line| {
131 |     ///     lines.push(line.to_vec());
132 |     ///     Ok(true)
133 |     /// })?;
134 |     /// assert_eq!(lines.len(), 3);
135 |     /// assert_eq!(lines[0], "lorem".as_bytes());
136 |     /// assert_eq!(lines[1], "ipsum".as_bytes());
137 |     /// assert_eq!(lines[2], "dolor".as_bytes());
138 |     /// # Ok(()) }; example().unwrap()
139 |     /// ```
140 |     fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()>
141 |     where
142 |         Self: Sized,
143 |         F: FnMut(&[u8]) -> io::Result<bool>,
144 |     {
145 |         self.for_byte_line_with_terminator(|line| {
146 |             for_each_line(trim_line_slice(line))
147 |         })
148 |     }
149 | 
150 |     /// Executes the given closure on each byte-terminated record in the
151 |     /// underlying reader.
152 |     ///
153 |     /// If the closure returns an error (or if the underlying reader returns an
154 |     /// error), then iteration is stopped and the error is returned. If false
155 |     /// is returned, then iteration is stopped and no error is returned.
156 |     ///
157 |     /// The closure given is called on exactly the same values as yielded by
158 |     /// the [`byte_records`](trait.BufReadExt.html#method.byte_records)
159 |     /// iterator. Namely, records do _not_ contain a trailing terminator byte.
160 |     ///
161 |     /// This routine is useful for iterating over records as quickly as
162 |     /// possible. Namely, a single allocation is reused for each record.
163 |     ///
164 |     /// # Examples
165 |     ///
166 |     /// Basic usage:
167 |     ///
168 |     /// ```
169 |     /// use std::io;
170 |     ///
171 |     /// use bstr::io::BufReadExt;
172 |     ///
173 |     /// # fn example() -> Result<(), io::Error> {
174 |     /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
175 |     ///
176 |     /// let mut records = vec![];
177 |     /// cursor.for_byte_record(b'\x00', |record| {
178 |     ///     records.push(record.to_vec());
179 |     ///     Ok(true)
180 |     /// })?;
181 |     /// assert_eq!(records.len(), 3);
182 |     /// assert_eq!(records[0], "lorem".as_bytes());
183 |     /// assert_eq!(records[1], "ipsum".as_bytes());
184 |     /// assert_eq!(records[2], "dolor".as_bytes());
185 |     /// # Ok(()) }; example().unwrap()
186 |     /// ```
187 |     fn for_byte_record<F>(
188 |         &mut self,
189 |         terminator: u8,
190 |         mut for_each_record: F,
191 |     ) -> io::Result<()>
192 |     where
193 |         Self: Sized,
194 |         F: FnMut(&[u8]) -> io::Result<bool>,
195 |     {
196 |         self.for_byte_record_with_terminator(terminator, |chunk| {
197 |             for_each_record(trim_record_slice(chunk, terminator))
198 |         })
199 |     }
200 | 
201 |     /// Executes the given closure on each line in the underlying reader.
202 |     ///
203 |     /// If the closure returns an error (or if the underlying reader returns an
204 |     /// error), then iteration is stopped and the error is returned. If false
205 |     /// is returned, then iteration is stopped and no error is returned.
206 |     ///
207 |     /// Unlike
208 |     /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line),
209 |     /// the lines given to the closure *do* include the line terminator, if one
210 |     /// exists.
211 |     ///
212 |     /// This routine is useful for iterating over lines as quickly as
213 |     /// possible. Namely, a single allocation is reused for each line.
214 |     ///
215 |     /// This is identical to `for_byte_record_with_terminator` with a
216 |     /// terminator of `\n`.
217 |     ///
218 |     /// # Examples
219 |     ///
220 |     /// Basic usage:
221 |     ///
222 |     /// ```
223 |     /// use std::io;
224 |     ///
225 |     /// use bstr::io::BufReadExt;
226 |     ///
227 |     /// # fn example() -> Result<(), io::Error> {
228 |     /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
229 |     ///
230 |     /// let mut lines = vec![];
231 |     /// cursor.for_byte_line_with_terminator(|line| {
232 |     ///     lines.push(line.to_vec());
233 |     ///     Ok(true)
234 |     /// })?;
235 |     /// assert_eq!(lines.len(), 3);
236 |     /// assert_eq!(lines[0], "lorem\n".as_bytes());
237 |     /// assert_eq!(lines[1], "ipsum\r\n".as_bytes());
238 |     /// assert_eq!(lines[2], "dolor".as_bytes());
239 |     /// # Ok(()) }; example().unwrap()
240 |     /// ```
241 |     fn for_byte_line_with_terminator<F>(
242 |         &mut self,
243 |         for_each_line: F,
244 |     ) -> io::Result<()>
245 |     where
246 |         Self: Sized,
247 |         F: FnMut(&[u8]) -> io::Result<bool>,
248 |     {
249 |         self.for_byte_record_with_terminator(b'\n', for_each_line)
250 |     }
251 | 
252 |     /// Executes the given closure on each byte-terminated record in the
253 |     /// underlying reader.
254 |     ///
255 |     /// If the closure returns an error (or if the underlying reader returns an
256 |     /// error), then iteration is stopped and the error is returned. If false
257 |     /// is returned, then iteration is stopped and no error is returned.
258 |     ///
259 |     /// Unlike
260 |     /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record),
261 |     /// the lines given to the closure *do* include the record terminator, if
262 |     /// one exists.
263 |     ///
264 |     /// This routine is useful for iterating over records as quickly as
265 |     /// possible. Namely, a single allocation is reused for each record.
266 |     ///
267 |     /// # Examples
268 |     ///
269 |     /// Basic usage:
270 |     ///
271 |     /// ```
272 |     /// use std::io;
273 |     ///
274 |     /// use bstr::{io::BufReadExt, B};
275 |     ///
276 |     /// # fn example() -> Result<(), io::Error> {
277 |     /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
278 |     ///
279 |     /// let mut records = vec![];
280 |     /// cursor.for_byte_record_with_terminator(b'\x00', |record| {
281 |     ///     records.push(record.to_vec());
282 |     ///     Ok(true)
283 |     /// })?;
284 |     /// assert_eq!(records.len(), 3);
285 |     /// assert_eq!(records[0], B(b"lorem\x00"));
286 |     /// assert_eq!(records[1], B("ipsum\x00"));
287 |     /// assert_eq!(records[2], B("dolor"));
288 |     /// # Ok(()) }; example().unwrap()
289 |     /// ```
290 |     fn for_byte_record_with_terminator<F>(
291 |         &mut self,
292 |         terminator: u8,
293 |         mut for_each_record: F,
294 |     ) -> io::Result<()>
295 |     where
296 |         Self: Sized,
297 |         F: FnMut(&[u8]) -> io::Result<bool>,
298 |     {
299 |         let mut bytes = vec![];
300 |         let mut res = Ok(());
301 |         let mut consumed = 0;
302 |         'outer: loop {
303 |             // Lend out complete record slices from our buffer
304 |             {
305 |                 let mut buf = self.fill_buf()?;
306 |                 if buf.is_empty() {
307 |                     break;
308 |                 }
309 |                 while let Some(index) = buf.find_byte(terminator) {
310 |                     let (record, rest) = buf.split_at(index + 1);
311 |                     buf = rest;
312 |                     consumed += record.len();
313 |                     match for_each_record(record) {
314 |                         Ok(false) => break 'outer,
315 |                         Err(err) => {
316 |                             res = Err(err);
317 |                             break 'outer;
318 |                         }
319 |                         _ => (),
320 |                     }
321 |                 }
322 | 
323 |                 // Copy the final record fragment to our local buffer. This
324 |                 // saves read_until() from re-scanning a buffer we know
325 |                 // contains no remaining terminators.
326 |                 bytes.extend_from_slice(buf);
327 |                 consumed += buf.len();
328 |             }
329 | 
330 |             self.consume(consumed);
331 |             consumed = 0;
332 | 
333 |             // N.B. read_until uses a different version of memchr that may
334 |             // be slower than the memchr crate that bstr uses. However, this
335 |             // should only run for a fairly small number of records, assuming a
336 |             // decent buffer size.
337 |             self.read_until(terminator, &mut bytes)?;
338 |             if bytes.is_empty() || !for_each_record(&bytes)? {
339 |                 break;
340 |             }
341 |             bytes.clear();
342 |         }
343 |         self.consume(consumed);
344 |         res
345 |     }
346 | }
347 | 
348 | impl<B: io::BufRead> BufReadExt for B {}
349 | 
350 | /// An iterator over lines from an instance of
351 | /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
352 | ///
353 | /// This iterator is generally created by calling the
354 | /// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
355 | /// method on the
356 | /// [`BufReadExt`](trait.BufReadExt.html)
357 | /// trait.
358 | #[derive(Debug)]
359 | pub struct ByteLines<B> {
360 |     buf: B,
361 | }
362 | 
363 | /// An iterator over records from an instance of
364 | /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
365 | ///
366 | /// A byte record is any sequence of bytes terminated by a particular byte
367 | /// chosen by the caller. For example, NUL separated byte strings are said to
368 | /// be NUL-terminated byte records.
369 | ///
370 | /// This iterator is generally created by calling the
371 | /// [`byte_records`](trait.BufReadExt.html#method.byte_records)
372 | /// method on the
373 | /// [`BufReadExt`](trait.BufReadExt.html)
374 | /// trait.
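///
/// A small usage sketch, adapted from the `byte_records` example above:
///
/// ```
/// use std::io;
///
/// use bstr::io::BufReadExt;
///
/// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
/// let records: Vec<Vec<u8>> = cursor
///     .byte_records(b'\x00')
///     .collect::<Result<_, io::Error>>()
///     .unwrap();
/// assert_eq!(
///     records,
///     vec![b"lorem".to_vec(), b"ipsum".to_vec(), b"dolor".to_vec()],
/// );
/// ```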
375 | #[derive(Debug)]
376 | pub struct ByteRecords<B> {
377 |     buf: B,
378 |     terminator: u8,
379 | }
380 | 
381 | impl<B: io::BufRead> Iterator for ByteLines<B> {
382 |     type Item = io::Result<Vec<u8>>;
383 | 
384 |     fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
385 |         let mut bytes = vec![];
386 |         match self.buf.read_until(b'\n', &mut bytes) {
387 |             Err(e) => Some(Err(e)),
388 |             Ok(0) => None,
389 |             Ok(_) => {
390 |                 trim_line(&mut bytes);
391 |                 Some(Ok(bytes))
392 |             }
393 |         }
394 |     }
395 | }
396 | 
397 | impl<B: io::BufRead> Iterator for ByteRecords<B> {
398 |     type Item = io::Result<Vec<u8>>;
399 | 
400 |     fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
401 |         let mut bytes = vec![];
402 |         match self.buf.read_until(self.terminator, &mut bytes) {
403 |             Err(e) => Some(Err(e)),
404 |             Ok(0) => None,
405 |             Ok(_) => {
406 |                 trim_record(&mut bytes, self.terminator);
407 |                 Some(Ok(bytes))
408 |             }
409 |         }
410 |     }
411 | }
412 | 
413 | fn trim_line(line: &mut Vec<u8>) {
414 |     if line.last_byte() == Some(b'\n') {
415 |         line.pop_byte();
416 |         if line.last_byte() == Some(b'\r') {
417 |             line.pop_byte();
418 |         }
419 |     }
420 | }
421 | 
422 | fn trim_line_slice(mut line: &[u8]) -> &[u8] {
423 |     if line.last_byte() == Some(b'\n') {
424 |         line = &line[..line.len() - 1];
425 |         if line.last_byte() == Some(b'\r') {
426 |             line = &line[..line.len() - 1];
427 |         }
428 |     }
429 |     line
430 | }
431 | 
432 | fn trim_record(record: &mut Vec<u8>, terminator: u8) {
433 |     if record.last_byte() == Some(terminator) {
434 |         record.pop_byte();
435 |     }
436 | }
437 | 
438 | fn trim_record_slice(mut record: &[u8], terminator: u8) -> &[u8] {
439 |     if record.last_byte() == Some(terminator) {
440 |         record = &record[..record.len() - 1];
441 |     }
442 |     record
443 | }
444 | 
445 | #[cfg(all(test, feature = "std"))]
446 | mod tests {
447 |     use alloc::{vec, vec::Vec};
448 | 
449 |     use crate::bstring::BString;
450 | 
451 |     use super::BufReadExt;
452 | 
453 |     fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
454 |         let mut lines = vec![];
455 |         slice
456 |             .as_ref()
457 |             .for_byte_line(|line| {
458 |                 lines.push(BString::from(line.to_vec()));
459 |                 Ok(true)
460 |             })
461 |             .unwrap();
462 |         lines
463 |     }
464 | 
465 |     fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
466 |         let mut lines = vec![];
467 |         slice
468 |             .as_ref()
469 |             .for_byte_line_with_terminator(|line| {
470 |                 lines.push(BString::from(line.to_vec()));
471 |                 Ok(true)
472 |             })
473 |             .unwrap();
474 |         lines
475 |     }
476 | 
477 |     #[test]
478 |     fn lines_without_terminator() {
479 |         assert_eq!(collect_lines(""), Vec::<BString>::new());
480 | 
481 |         assert_eq!(collect_lines("\n"), vec![""]);
482 |         assert_eq!(collect_lines("\n\n"), vec!["", ""]);
483 |         assert_eq!(collect_lines("a\nb\n"), vec!["a", "b"]);
484 |         assert_eq!(collect_lines("a\nb"), vec!["a", "b"]);
485 |         assert_eq!(collect_lines("abc\nxyz\n"), vec!["abc", "xyz"]);
486 |         assert_eq!(collect_lines("abc\nxyz"), vec!["abc", "xyz"]);
487 | 
488 |         assert_eq!(collect_lines("\r\n"), vec![""]);
489 |         assert_eq!(collect_lines("\r\n\r\n"), vec!["", ""]);
490 |         assert_eq!(collect_lines("a\r\nb\r\n"), vec!["a", "b"]);
491 |         assert_eq!(collect_lines("a\r\nb"), vec!["a", "b"]);
492 |         assert_eq!(collect_lines("abc\r\nxyz\r\n"), vec!["abc", "xyz"]);
493 |         assert_eq!(collect_lines("abc\r\nxyz"), vec!["abc", "xyz"]);
494 | 
495 |         assert_eq!(collect_lines("abc\rxyz"), vec!["abc\rxyz"]);
496 |     }
497 | 
498 |     #[test]
499 |     fn lines_with_terminator() {
500 |         assert_eq!(collect_lines_term(""), Vec::<BString>::new());
501 | 
502 |         assert_eq!(collect_lines_term("\n"), vec!["\n"]);
503 |         assert_eq!(collect_lines_term("\n\n"), vec!["\n", "\n"]);
504 |         assert_eq!(collect_lines_term("a\nb\n"), vec!["a\n",
"b\n"]); 504 | assert_eq!(collect_lines_term("a\nb"), vec!["a\n", "b"]); 505 | assert_eq!(collect_lines_term("abc\nxyz\n"), vec!["abc\n", "xyz\n"]); 506 | assert_eq!(collect_lines_term("abc\nxyz"), vec!["abc\n", "xyz"]); 507 | 508 | assert_eq!(collect_lines_term("\r\n"), vec!["\r\n"]); 509 | assert_eq!(collect_lines_term("\r\n\r\n"), vec!["\r\n", "\r\n"]); 510 | assert_eq!(collect_lines_term("a\r\nb\r\n"), vec!["a\r\n", "b\r\n"]); 511 | assert_eq!(collect_lines_term("a\r\nb"), vec!["a\r\n", "b"]); 512 | assert_eq!( 513 | collect_lines_term("abc\r\nxyz\r\n"), 514 | vec!["abc\r\n", "xyz\r\n"] 515 | ); 516 | assert_eq!(collect_lines_term("abc\r\nxyz"), vec!["abc\r\n", "xyz"]); 517 | 518 | assert_eq!(collect_lines_term("abc\rxyz"), vec!["abc\rxyz"]); 519 | } 520 | } 521 | -------------------------------------------------------------------------------- /src/tests.rs: -------------------------------------------------------------------------------- 1 | /// A sequence of tests for checking whether lossy decoding uses the maximal 2 | /// subpart strategy correctly. Namely, if a sequence of otherwise invalid 3 | /// UTF-8 bytes is a valid prefix of a valid UTF-8 sequence, then the entire 4 | /// prefix is replaced by a single replacement codepoint. In all other cases, 5 | /// each invalid byte is replaced by a single replacement codepoint. 6 | /// 7 | /// The first element in each tuple is the expected result of lossy decoding, 8 | /// while the second element is the input given. 9 | pub(crate) const LOSSY_TESTS: &[(&str, &[u8])] = &[ 10 | ("a", b"a"), 11 | ("\u{FFFD}", b"\xFF"), 12 | ("\u{FFFD}\u{FFFD}", b"\xFF\xFF"), 13 | ("β\u{FFFD}", b"\xCE\xB2\xFF"), 14 | ("☃\u{FFFD}", b"\xE2\x98\x83\xFF"), 15 | ("𝝱\u{FFFD}", b"\xF0\x9D\x9D\xB1\xFF"), 16 | ("\u{FFFD}\u{FFFD}", b"\xCE\xF0"), 17 | ("\u{FFFD}\u{FFFD}", b"\xCE\xFF"), 18 | ("\u{FFFD}\u{FFFD}", b"\xE2\x98\xF0"), 19 | ("\u{FFFD}\u{FFFD}", b"\xE2\x98\xFF"), 20 | ("\u{FFFD}", b"\xF0\x9D\x9D"), 21 | ("\u{FFFD}\u{FFFD}", b"\xF0\x9D\x9D\xF0"), 22 | ("\u{FFFD}\u{FFFD}", b"\xF0\x9D\x9D\xFF"), 23 | ("\u{FFFD}", b"\xCE"), 24 | ("a\u{FFFD}", b"a\xCE"), 25 | ("\u{FFFD}", b"\xE2\x98"), 26 | ("a\u{FFFD}", b"a\xE2\x98"), 27 | ("\u{FFFD}", b"\xF0\x9D\x9C"), 28 | ("a\u{FFFD}", b"a\xF0\x9D\x9C"), 29 | ("a\u{FFFD}\u{FFFD}\u{FFFD}z", b"a\xED\xA0\x80z"), 30 | ("☃βツ\u{FFFD}", b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF"), 31 | ("a\u{FFFD}\u{FFFD}\u{FFFD}b", b"\x61\xF1\x80\x80\xE1\x80\xC2\x62"), 32 | ]; 33 | -------------------------------------------------------------------------------- /src/unicode/data/LICENSE-UNICODE: -------------------------------------------------------------------------------- 1 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 2 | See Terms of Use for definitions of Unicode Inc.'s 3 | Data Files and Software. 4 | 5 | NOTICE TO USER: Carefully read the following legal agreement. 6 | BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 7 | DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 8 | YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 9 | TERMS AND CONDITIONS OF THIS AGREEMENT. 10 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 11 | THE DATA FILES OR SOFTWARE. 12 | 13 | COPYRIGHT AND PERMISSION NOTICE 14 | 15 | Copyright © 1991-2019 Unicode, Inc. All rights reserved. 16 | Distributed under the Terms of Use in https://www.unicode.org/copyright.html. 
17 | 18 | Permission is hereby granted, free of charge, to any person obtaining 19 | a copy of the Unicode data files and any associated documentation 20 | (the "Data Files") or Unicode software and any associated documentation 21 | (the "Software") to deal in the Data Files or Software 22 | without restriction, including without limitation the rights to use, 23 | copy, modify, merge, publish, distribute, and/or sell copies of 24 | the Data Files or Software, and to permit persons to whom the Data Files 25 | or Software are furnished to do so, provided that either 26 | (a) this copyright and permission notice appear with all copies 27 | of the Data Files or Software, or 28 | (b) this copyright and permission notice appear in associated 29 | Documentation. 30 | 31 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 32 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 33 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 34 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. 35 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 36 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 37 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 38 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 39 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 40 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. 41 | 42 | Except as contained in this notice, the name of a copyright holder 43 | shall not be used in advertising or otherwise to promote the sale, 44 | use or other dealings in these Data Files or Software without prior 45 | written authorization of the copyright holder. 46 | -------------------------------------------------------------------------------- /src/unicode/fsm/grapheme_break_fwd.bigendian.dfa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa -------------------------------------------------------------------------------- /src/unicode/fsm/grapheme_break_fwd.littleendian.dfa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa -------------------------------------------------------------------------------- /src/unicode/fsm/grapheme_break_fwd.rs: -------------------------------------------------------------------------------- 1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: 2 | // 3 | // regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe GRAPHEME_BREAK_FWD src/unicode/fsm/ 4 | // 5 | // regex-cli 0.0.1 is available on crates.io. 
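// Both endian variants of the serialized DFA ship with the crate; the
// `target_endian` cfg below picks the matching flavor, and `DFA::from_bytes`
// validates the serialized bytes the first time the `Lazy` is dereferenced.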
6 | 
7 | use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
8 | 
9 | pub static GRAPHEME_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
10 |     #[cfg(target_endian = "big")]
11 |     static BYTES: &'static [u8] =
12 |         include_bytes!("grapheme_break_fwd.bigendian.dfa");
13 |     #[cfg(target_endian = "little")]
14 |     static BYTES: &'static [u8] =
15 |         include_bytes!("grapheme_break_fwd.littleendian.dfa");
16 |     let (dfa, _) =
17 |         DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
18 |     dfa
19 | });
20 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/grapheme_break_rev.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/grapheme_break_rev.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/grapheme_break_rev.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize sparse dfa --minimize --start-kind anchored --reverse --match-kind all --no-captures --shrink --rustfmt --safe GRAPHEME_BREAK_REV src/unicode/fsm/
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
8 | 
9 | pub static GRAPHEME_BREAK_REV: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
10 |     #[cfg(target_endian = "big")]
11 |     static BYTES: &'static [u8] =
12 |         include_bytes!("grapheme_break_rev.bigendian.dfa");
13 |     #[cfg(target_endian = "little")]
14 |     static BYTES: &'static [u8] =
15 |         include_bytes!("grapheme_break_rev.littleendian.dfa");
16 |     let (dfa, _) =
17 |         DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
18 |     dfa
19 | });
20 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod grapheme_break_fwd;
2 | pub mod grapheme_break_rev;
3 | pub mod regional_indicator_rev;
4 | pub mod sentence_break_fwd;
5 | pub mod simple_word_fwd;
6 | pub mod whitespace_anchored_fwd;
7 | pub mod whitespace_anchored_rev;
8 | pub mod word_break_fwd;
9 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/regional_indicator_rev.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/regional_indicator_rev.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/regional_indicator_rev.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/regional_indicator_rev.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/regional_indicator_rev.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize dense dfa --minimize --start-kind anchored --reverse --no-captures --shrink --rustfmt --safe REGIONAL_INDICATOR_REV src/unicode/fsm/ \p{gcb=Regional_Indicator}
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{
8 |     dfa::dense::DFA,
9 |     util::{lazy::Lazy, wire::AlignAs},
10 | };
11 | 
12 | pub static REGIONAL_INDICATOR_REV: Lazy<DFA<&'static [u32]>> =
13 |     Lazy::new(|| {
14 |         static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
15 |             _align: [],
16 |             #[cfg(target_endian = "big")]
17 |             bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"),
18 |             #[cfg(target_endian = "little")]
19 |             bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"),
20 |         };
21 |         let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
22 |             .expect("serialized DFA should be valid");
23 |         dfa
24 |     });
25 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/sentence_break_fwd.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/sentence_break_fwd.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/sentence_break_fwd.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe SENTENCE_BREAK_FWD src/unicode/fsm/
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
8 | 
9 | pub static SENTENCE_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
10 |     #[cfg(target_endian = "big")]
11 |     static BYTES: &'static [u8] =
12 |         include_bytes!("sentence_break_fwd.bigendian.dfa");
13 |     #[cfg(target_endian = "little")]
14 |     static BYTES: &'static [u8] =
15 |         include_bytes!("sentence_break_fwd.littleendian.dfa");
16 |     let (dfa, _) =
17 |         DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
18 |     dfa
19 | });
20 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/simple_word_fwd.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/simple_word_fwd.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/simple_word_fwd.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/simple_word_fwd.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/simple_word_fwd.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe SIMPLE_WORD_FWD src/unicode/fsm/ \w
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
8 | 
9 | pub static SIMPLE_WORD_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
10 |     #[cfg(target_endian = "big")]
11 |     static BYTES: &'static [u8] =
12 |         include_bytes!("simple_word_fwd.bigendian.dfa");
13 |     #[cfg(target_endian = "little")]
14 |     static BYTES: &'static [u8] =
15 |         include_bytes!("simple_word_fwd.littleendian.dfa");
16 |     let (dfa, _) =
17 |         DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
18 |     dfa
19 | });
20 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_fwd.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/whitespace_anchored_fwd.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_fwd.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/whitespace_anchored_fwd.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_fwd.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize dense dfa --minimize --start-kind anchored --shrink --rustfmt --safe WHITESPACE_ANCHORED_FWD src/unicode/fsm/ \s+
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{
8 |     dfa::dense::DFA,
9 |     util::{lazy::Lazy, wire::AlignAs},
10 | };
11 | 
12 | pub static WHITESPACE_ANCHORED_FWD: Lazy<DFA<&'static [u32]>> =
13 |     Lazy::new(|| {
14 |         static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
15 |             _align: [],
16 |             #[cfg(target_endian = "big")]
17 |             bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"),
18 |             #[cfg(target_endian = "little")]
19 |             bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"),
20 |         };
21 |         let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
22 |             .expect("serialized DFA should be valid");
23 |         dfa
24 |     });
25 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_rev.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize dense dfa --minimize --start-kind anchored --reverse --no-captures --shrink --rustfmt --safe WHITESPACE_ANCHORED_REV src/unicode/fsm/ \s+
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{
8 |     dfa::dense::DFA,
9 |     util::{lazy::Lazy, wire::AlignAs},
10 | };
11 | 
12 | pub static WHITESPACE_ANCHORED_REV: Lazy<DFA<&'static [u32]>> =
13 |     Lazy::new(|| {
14 |         static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
15 |             _align: [],
16 |             #[cfg(target_endian = "big")]
17 |             bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"),
18 |             #[cfg(target_endian = "little")]
19 |             bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"),
20 |         };
21 |         let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
22 |             .expect("serialized DFA should be valid");
23 |         dfa
24 |     });
25 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/word_break_fwd.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/word_break_fwd.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/word_break_fwd.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/word_break_fwd.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/word_break_fwd.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE.
IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe WORD_BREAK_FWD src/unicode/fsm/
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
8 | 
9 | pub static WORD_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
10 |     #[cfg(target_endian = "big")]
11 |     static BYTES: &'static [u8] =
12 |         include_bytes!("word_break_fwd.bigendian.dfa");
13 |     #[cfg(target_endian = "little")]
14 |     static BYTES: &'static [u8] =
15 |         include_bytes!("word_break_fwd.littleendian.dfa");
16 |     let (dfa, _) =
17 |         DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
18 |     dfa
19 | });
20 | 
--------------------------------------------------------------------------------
/src/unicode/grapheme.rs:
--------------------------------------------------------------------------------
1 | use regex_automata::{dfa::Automaton, Anchored, Input};
2 | 
3 | use crate::{
4 |     ext_slice::ByteSlice,
5 |     unicode::fsm::{
6 |         grapheme_break_fwd::GRAPHEME_BREAK_FWD,
7 |         grapheme_break_rev::GRAPHEME_BREAK_REV,
8 |         regional_indicator_rev::REGIONAL_INDICATOR_REV,
9 |     },
10 |     utf8,
11 | };
12 | 
13 | /// An iterator over grapheme clusters in a byte string.
14 | ///
15 | /// This iterator is typically constructed by
16 | /// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes).
17 | ///
18 | /// Unicode defines a grapheme cluster as an *approximation* to a single user
19 | /// visible character. A grapheme cluster, or just "grapheme," is made up of
20 | /// one or more codepoints. For end user oriented tasks, one should generally
21 | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
22 | /// always yields one codepoint at a time.
23 | ///
24 | /// Since graphemes are made up of one or more codepoints, this iterator yields
25 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
26 | /// are [substituted](index.html#handling-of-invalid-utf-8).
27 | ///
28 | /// This iterator can be used in reverse. When reversed, exactly the same
29 | /// set of grapheme clusters are yielded, but in reverse order.
30 | ///
31 | /// This iterator only yields *extended* grapheme clusters, in accordance with
32 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
33 | #[derive(Clone, Debug)]
34 | pub struct Graphemes<'a> {
35 |     bs: &'a [u8],
36 | }
37 | 
38 | impl<'a> Graphemes<'a> {
39 |     pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> {
40 |         Graphemes { bs }
41 |     }
42 | 
43 |     /// View the underlying data as a subslice of the original data.
44 |     ///
45 |     /// The slice returned has the same lifetime as the original slice, and so
46 |     /// the iterator can continue to be used while this exists.
47 | /// 48 | /// # Examples 49 | /// 50 | /// ``` 51 | /// use bstr::ByteSlice; 52 | /// 53 | /// let mut it = b"abc".graphemes(); 54 | /// 55 | /// assert_eq!(b"abc", it.as_bytes()); 56 | /// it.next(); 57 | /// assert_eq!(b"bc", it.as_bytes()); 58 | /// it.next(); 59 | /// it.next(); 60 | /// assert_eq!(b"", it.as_bytes()); 61 | /// ``` 62 | #[inline] 63 | pub fn as_bytes(&self) -> &'a [u8] { 64 | self.bs 65 | } 66 | } 67 | 68 | impl<'a> Iterator for Graphemes<'a> { 69 | type Item = &'a str; 70 | 71 | #[inline] 72 | fn next(&mut self) -> Option<&'a str> { 73 | let (grapheme, size) = decode_grapheme(self.bs); 74 | if size == 0 { 75 | return None; 76 | } 77 | self.bs = &self.bs[size..]; 78 | Some(grapheme) 79 | } 80 | } 81 | 82 | impl<'a> DoubleEndedIterator for Graphemes<'a> { 83 | #[inline] 84 | fn next_back(&mut self) -> Option<&'a str> { 85 | let (grapheme, size) = decode_last_grapheme(self.bs); 86 | if size == 0 { 87 | return None; 88 | } 89 | self.bs = &self.bs[..self.bs.len() - size]; 90 | Some(grapheme) 91 | } 92 | } 93 | 94 | /// An iterator over grapheme clusters in a byte string and their byte index 95 | /// positions. 96 | /// 97 | /// This iterator is typically constructed by 98 | /// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices). 99 | /// 100 | /// Unicode defines a grapheme cluster as an *approximation* to a single user 101 | /// visible character. A grapheme cluster, or just "grapheme," is made up of 102 | /// one or more codepoints. For end user oriented tasks, one should generally 103 | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which 104 | /// always yields one codepoint at a time. 105 | /// 106 | /// Since graphemes are made up of one or more codepoints, this iterator 107 | /// yields `&str` elements (along with their start and end byte offsets). 108 | /// When invalid UTF-8 is encountered, replacement codepoints are 109 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the 110 | /// indices yielded by this iterator may not correspond to the length of the 111 | /// grapheme cluster yielded with those indices. For example, when this 112 | /// iterator encounters `\xFF` in the byte string, then it will yield a pair 113 | /// of indices ranging over a single byte, but will provide an `&str` 114 | /// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when 115 | /// given only valid UTF-8, then all indices are in exact correspondence with 116 | /// their paired grapheme cluster. 117 | /// 118 | /// This iterator can be used in reverse. When reversed, exactly the same 119 | /// set of grapheme clusters are yielded, but in reverse order. 120 | /// 121 | /// This iterator only yields *extended* grapheme clusters, in accordance with 122 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). 123 | #[derive(Clone, Debug)] 124 | pub struct GraphemeIndices<'a> { 125 | bs: &'a [u8], 126 | forward_index: usize, 127 | reverse_index: usize, 128 | } 129 | 130 | impl<'a> GraphemeIndices<'a> { 131 | pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> { 132 | GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() } 133 | } 134 | 135 | /// View the underlying data as a subslice of the original data. 136 | /// 137 | /// The slice returned has the same lifetime as the original slice, and so 138 | /// the iterator can continue to be used while this exists. 
139 | /// 140 | /// # Examples 141 | /// 142 | /// ``` 143 | /// use bstr::ByteSlice; 144 | /// 145 | /// let mut it = b"abc".grapheme_indices(); 146 | /// 147 | /// assert_eq!(b"abc", it.as_bytes()); 148 | /// it.next(); 149 | /// assert_eq!(b"bc", it.as_bytes()); 150 | /// it.next(); 151 | /// it.next(); 152 | /// assert_eq!(b"", it.as_bytes()); 153 | /// ``` 154 | #[inline] 155 | pub fn as_bytes(&self) -> &'a [u8] { 156 | self.bs 157 | } 158 | } 159 | 160 | impl<'a> Iterator for GraphemeIndices<'a> { 161 | type Item = (usize, usize, &'a str); 162 | 163 | #[inline] 164 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { 165 | let index = self.forward_index; 166 | let (grapheme, size) = decode_grapheme(self.bs); 167 | if size == 0 { 168 | return None; 169 | } 170 | self.bs = &self.bs[size..]; 171 | self.forward_index += size; 172 | Some((index, index + size, grapheme)) 173 | } 174 | } 175 | 176 | impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { 177 | #[inline] 178 | fn next_back(&mut self) -> Option<(usize, usize, &'a str)> { 179 | let (grapheme, size) = decode_last_grapheme(self.bs); 180 | if size == 0 { 181 | return None; 182 | } 183 | self.bs = &self.bs[..self.bs.len() - size]; 184 | self.reverse_index -= size; 185 | Some((self.reverse_index, self.reverse_index + size, grapheme)) 186 | } 187 | } 188 | 189 | /// Decode a grapheme from the given byte string. 190 | /// 191 | /// This returns the resulting grapheme (which may be a Unicode replacement 192 | /// codepoint if invalid UTF-8 was found), along with the number of bytes 193 | /// decoded in the byte string. The number of bytes decoded may not be the 194 | /// same as the length of grapheme in the case where invalid UTF-8 is found. 195 | pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) { 196 | if bs.is_empty() { 197 | ("", 0) 198 | } else if bs.len() >= 2 199 | && bs[0].is_ascii() 200 | && bs[1].is_ascii() 201 | && !bs[0].is_ascii_whitespace() 202 | { 203 | // FIXME: It is somewhat sad that we have to special case this, but it 204 | // leads to a significant speed up in predominantly ASCII text. The 205 | // issue here is that the DFA has a bit of overhead, and running it for 206 | // every byte in mostly ASCII text results in a bit slowdown. We should 207 | // re-litigate this once regex-automata 0.3 is out, but it might be 208 | // hard to avoid the special case. A DFA is always going to at least 209 | // require some memory access. 210 | 211 | // Safe because all ASCII bytes are valid UTF-8. 212 | let grapheme = unsafe { bs[..1].to_str_unchecked() }; 213 | (grapheme, 1) 214 | } else if let Some(hm) = { 215 | let input = Input::new(bs).anchored(Anchored::Yes); 216 | GRAPHEME_BREAK_FWD.try_search_fwd(&input).unwrap() 217 | } { 218 | // Safe because a match can only occur for valid UTF-8. 219 | let grapheme = unsafe { bs[..hm.offset()].to_str_unchecked() }; 220 | (grapheme, grapheme.len()) 221 | } else { 222 | const INVALID: &str = "\u{FFFD}"; 223 | // No match on non-empty bytes implies we found invalid UTF-8. 224 | let (_, size) = utf8::decode_lossy(bs); 225 | (INVALID, size) 226 | } 227 | } 228 | 229 | fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) { 230 | if bs.is_empty() { 231 | ("", 0) 232 | } else if let Some(hm) = { 233 | let input = Input::new(bs).anchored(Anchored::Yes); 234 | GRAPHEME_BREAK_REV.try_search_rev(&input).unwrap() 235 | } { 236 | let start = adjust_rev_for_regional_indicator(bs, hm.offset()); 237 | // Safe because a match can only occur for valid UTF-8. 
238 | let grapheme = unsafe { bs[start..].to_str_unchecked() }; 239 | (grapheme, grapheme.len()) 240 | } else { 241 | const INVALID: &str = "\u{FFFD}"; 242 | // No match on non-empty bytes implies we found invalid UTF-8. 243 | let (_, size) = utf8::decode_last_lossy(bs); 244 | (INVALID, size) 245 | } 246 | } 247 | 248 | /// Return the correct offset for the next grapheme decoded at the end of the 249 | /// given byte string, where `i` is the initial guess. In particular, 250 | /// `&bs[i..]` represents the candidate grapheme. 251 | /// 252 | /// `i` is returned by this function in all cases except when `&bs[i..]` is 253 | /// a pair of regional indicator codepoints. In that case, if an odd number of 254 | /// additional regional indicator codepoints precedes `i`, then `i` is 255 | /// adjusted such that it points to only a single regional indicator. 256 | /// 257 | /// This "fixing" is necessary to handle the requirement that a break cannot 258 | /// occur between regional indicators where it would cause an odd number of 259 | /// regional indicators to exist before the break from the *start* of the 260 | /// string. A reverse regex cannot detect this case easily without look-around. 261 | fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize { 262 | // All regional indicators use a 4 byte encoding, and we only care about 263 | // the case where we found a pair of regional indicators. 264 | if bs.len() - i != 8 { 265 | return i; 266 | } 267 | // Count all contiguous occurrences of regional indicators. If there's an 268 | // even number of them, then we can accept the pair we found. Otherwise, 269 | // we can only take one of them. 270 | // 271 | // FIXME: This is quadratic in the worst case, e.g., a string of just 272 | // regional indicator codepoints. A fix probably requires refactoring this 273 | // code a bit such that we don't rescan regional indicators. 
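// Each reverse search below matches exactly one trailing regional indicator
// (a single 4-byte codepoint), so every iteration of the loop peels one
// indicator off the end of `bs` and bumps the count by one.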
274 | let mut count = 0; 275 | while let Some(hm) = { 276 | let input = Input::new(bs).anchored(Anchored::Yes); 277 | REGIONAL_INDICATOR_REV.try_search_rev(&input).unwrap() 278 | } { 279 | bs = &bs[..hm.offset()]; 280 | count += 1; 281 | } 282 | if count % 2 == 0 { 283 | i 284 | } else { 285 | i + 4 286 | } 287 | } 288 | 289 | #[cfg(all(test, feature = "std"))] 290 | mod tests { 291 | use alloc::{ 292 | string::{String, ToString}, 293 | vec, 294 | vec::Vec, 295 | }; 296 | 297 | #[cfg(not(miri))] 298 | use ucd_parse::GraphemeClusterBreakTest; 299 | 300 | use crate::tests::LOSSY_TESTS; 301 | 302 | use super::*; 303 | 304 | #[test] 305 | #[cfg(not(miri))] 306 | fn forward_ucd() { 307 | for (i, test) in ucdtests().into_iter().enumerate() { 308 | let given = test.grapheme_clusters.concat(); 309 | let got: Vec<String> = Graphemes::new(given.as_bytes()) 310 | .map(|cluster| cluster.to_string()) 311 | .collect(); 312 | assert_eq!( 313 | test.grapheme_clusters, 314 | got, 315 | "\ngrapheme forward break test {} failed:\n\ 316 | given: {:?}\n\ 317 | expected: {:?}\n\ 318 | got: {:?}\n", 319 | i, 320 | uniescape(&given), 321 | uniescape_vec(&test.grapheme_clusters), 322 | uniescape_vec(&got), 323 | ); 324 | } 325 | } 326 | 327 | #[test] 328 | #[cfg(not(miri))] 329 | fn reverse_ucd() { 330 | for (i, test) in ucdtests().into_iter().enumerate() { 331 | let given = test.grapheme_clusters.concat(); 332 | let mut got: Vec<String> = Graphemes::new(given.as_bytes()) 333 | .rev() 334 | .map(|cluster| cluster.to_string()) 335 | .collect(); 336 | got.reverse(); 337 | assert_eq!( 338 | test.grapheme_clusters, 339 | got, 340 | "\n\ngrapheme reverse break test {} failed:\n\ 341 | given: {:?}\n\ 342 | expected: {:?}\n\ 343 | got: {:?}\n", 344 | i, 345 | uniescape(&given), 346 | uniescape_vec(&test.grapheme_clusters), 347 | uniescape_vec(&got), 348 | ); 349 | } 350 | } 351 | 352 | #[test] 353 | fn forward_lossy() { 354 | for &(expected, input) in LOSSY_TESTS { 355 | let got = Graphemes::new(input.as_bytes()).collect::<String>(); 356 | assert_eq!(expected, got); 357 | } 358 | } 359 | 360 | #[test] 361 | fn reverse_lossy() { 362 | for &(expected, input) in LOSSY_TESTS { 363 | let expected: String = expected.chars().rev().collect(); 364 | let got = 365 | Graphemes::new(input.as_bytes()).rev().collect::<String>(); 366 | assert_eq!(expected, got); 367 | } 368 | } 369 | 370 | #[cfg(not(miri))] 371 | fn uniescape(s: &str) -> String { 372 | s.chars().flat_map(|c| c.escape_unicode()).collect::<String>() 373 | } 374 | 375 | #[cfg(not(miri))] 376 | fn uniescape_vec(strs: &[String]) -> Vec<String> { 377 | strs.iter().map(|s| uniescape(s)).collect() 378 | } 379 | 380 | /// Return all of the UCD for grapheme breaks. 
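/// The cases are parsed from the bundled `data/GraphemeBreakTest.txt`;
/// comment lines and tests involving surrogates are skipped.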
381 | #[cfg(not(miri))] 382 | fn ucdtests() -> Vec<GraphemeClusterBreakTest> { 383 | const TESTDATA: &str = include_str!("data/GraphemeBreakTest.txt"); 384 | 385 | let mut tests = vec![]; 386 | for mut line in TESTDATA.lines() { 387 | line = line.trim(); 388 | if line.starts_with("#") || line.contains("surrogate") { 389 | continue; 390 | } 391 | tests.push(line.parse().unwrap()); 392 | } 393 | tests 394 | } 395 | } 396 | -------------------------------------------------------------------------------- /src/unicode/mod.rs: -------------------------------------------------------------------------------- 1 | pub use self::{ 2 | grapheme::{decode_grapheme, GraphemeIndices, Graphemes}, 3 | sentence::{SentenceIndices, Sentences}, 4 | whitespace::{whitespace_len_fwd, whitespace_len_rev}, 5 | word::{WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks}, 6 | }; 7 | 8 | mod fsm; 9 | mod grapheme; 10 | mod sentence; 11 | mod whitespace; 12 | mod word; 13 | -------------------------------------------------------------------------------- /src/unicode/sentence.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{dfa::Automaton, Anchored, Input}; 2 | 3 | use crate::{ 4 | ext_slice::ByteSlice, 5 | unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8, 6 | }; 7 | 8 | /// An iterator over sentences in a byte string. 9 | /// 10 | /// This iterator is typically constructed by 11 | /// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences). 12 | /// 13 | /// Sentences typically include their trailing punctuation and whitespace. 14 | /// 15 | /// Since sentences are made up of one or more codepoints, this iterator yields 16 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints 17 | /// are [substituted](index.html#handling-of-invalid-utf-8). 18 | /// 19 | /// This iterator yields sentences in accordance with the default sentence boundary 20 | /// rules specified in 21 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries). 22 | #[derive(Clone, Debug)] 23 | pub struct Sentences<'a> { 24 | bs: &'a [u8], 25 | } 26 | 27 | impl<'a> Sentences<'a> { 28 | pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> { 29 | Sentences { bs } 30 | } 31 | 32 | /// View the underlying data as a subslice of the original data. 33 | /// 34 | /// The slice returned has the same lifetime as the original slice, and so 35 | /// the iterator can continue to be used while this exists. 36 | /// 37 | /// # Examples 38 | /// 39 | /// ``` 40 | /// use bstr::ByteSlice; 41 | /// 42 | /// let mut it = b"I want this. Not that. Right now.".sentences(); 43 | /// 44 | /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes()); 45 | /// it.next(); 46 | /// assert_eq!(b"Not that. Right now.", it.as_bytes()); 47 | /// it.next(); 48 | /// it.next(); 49 | /// assert_eq!(b"", it.as_bytes()); 50 | /// ``` 51 | #[inline] 52 | pub fn as_bytes(&self) -> &'a [u8] { 53 | self.bs 54 | } 55 | } 56 | 57 | impl<'a> Iterator for Sentences<'a> { 58 | type Item = &'a str; 59 | 60 | #[inline] 61 | fn next(&mut self) -> Option<&'a str> { 62 | let (sentence, size) = decode_sentence(self.bs); 63 | if size == 0 { 64 | return None; 65 | } 66 | self.bs = &self.bs[size..]; 67 | Some(sentence) 68 | } 69 | } 70 | 71 | /// An iterator over sentences in a byte string, along with their byte offsets. 72 | /// 73 | /// This iterator is typically constructed by 74 | /// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices). 
75 | /// 76 | /// Sentences typically include their trailing punctuation and whitespace. 77 | /// 78 | /// Since sentences are made up of one or more codepoints, this iterator 79 | /// yields `&str` elements (along with their start and end byte offsets). 80 | /// When invalid UTF-8 is encountered, replacement codepoints are 81 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the 82 | /// indices yielded by this iterator may not correspond to the length of the 83 | /// sentence yielded with those indices. For example, when this iterator 84 | /// encounters `\xFF` in the byte string, then it will yield a pair of indices 85 | /// ranging over a single byte, but will provide an `&str` equivalent to 86 | /// `"\u{FFFD}"`, which is three bytes in length. However, when given only 87 | /// valid UTF-8, then all indices are in exact correspondence with their paired 88 | /// sentence. 89 | /// 90 | /// This iterator yields sentences in accordance with the default sentence boundary 91 | /// rules specified in 92 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries). 93 | #[derive(Clone, Debug)] 94 | pub struct SentenceIndices<'a> { 95 | bs: &'a [u8], 96 | forward_index: usize, 97 | } 98 | 99 | impl<'a> SentenceIndices<'a> { 100 | pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> { 101 | SentenceIndices { bs, forward_index: 0 } 102 | } 103 | 104 | /// View the underlying data as a subslice of the original data. 105 | /// 106 | /// The slice returned has the same lifetime as the original slice, and so 107 | /// the iterator can continue to be used while this exists. 108 | /// 109 | /// # Examples 110 | /// 111 | /// ``` 112 | /// use bstr::ByteSlice; 113 | /// 114 | /// let mut it = b"I want this. Not that. Right now.".sentence_indices(); 115 | /// 116 | /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes()); 117 | /// it.next(); 118 | /// assert_eq!(b"Not that. Right now.", it.as_bytes()); 119 | /// it.next(); 120 | /// it.next(); 121 | /// assert_eq!(b"", it.as_bytes()); 122 | /// ``` 123 | #[inline] 124 | pub fn as_bytes(&self) -> &'a [u8] { 125 | self.bs 126 | } 127 | } 128 | 129 | impl<'a> Iterator for SentenceIndices<'a> { 130 | type Item = (usize, usize, &'a str); 131 | 132 | #[inline] 133 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { 134 | let index = self.forward_index; 135 | let (sentence, size) = decode_sentence(self.bs); 136 | if size == 0 { 137 | return None; 138 | } 139 | self.bs = &self.bs[size..]; 140 | self.forward_index += size; 141 | Some((index, index + size, sentence)) 142 | } 143 | } 144 | 145 | fn decode_sentence(bs: &[u8]) -> (&str, usize) { 146 | if bs.is_empty() { 147 | ("", 0) 148 | } else if let Some(hm) = { 149 | let input = Input::new(bs).anchored(Anchored::Yes); 150 | SENTENCE_BREAK_FWD.try_search_fwd(&input).unwrap() 151 | } { 152 | // Safe because a match can only occur for valid UTF-8. 153 | let sentence = unsafe { bs[..hm.offset()].to_str_unchecked() }; 154 | (sentence, sentence.len()) 155 | } else { 156 | const INVALID: &str = "\u{FFFD}"; 157 | // No match on non-empty bytes implies we found invalid UTF-8. 
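// `decode_lossy` reports the length of the invalid sequence (always at
// least one byte for non-empty input), so the iterator always makes
// progress.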
158 | let (_, size) = utf8::decode_lossy(bs); 159 | (INVALID, size) 160 | } 161 | } 162 | 163 | #[cfg(all(test, feature = "std"))] 164 | mod tests { 165 | use alloc::{vec, vec::Vec}; 166 | 167 | #[cfg(not(miri))] 168 | use ucd_parse::SentenceBreakTest; 169 | 170 | use crate::ext_slice::ByteSlice; 171 | 172 | #[test] 173 | #[cfg(not(miri))] 174 | fn forward_ucd() { 175 | for (i, test) in ucdtests().into_iter().enumerate() { 176 | let given = test.sentences.concat(); 177 | let got = sentences(given.as_bytes()); 178 | assert_eq!( 179 | test.sentences, 180 | got, 181 | "\n\nsentence forward break test {} failed:\n\ 182 | given: {:?}\n\ 183 | expected: {:?}\n\ 184 | got: {:?}\n", 185 | i, 186 | given, 187 | strs_to_bstrs(&test.sentences), 188 | strs_to_bstrs(&got), 189 | ); 190 | } 191 | } 192 | 193 | // Some additional tests that don't seem to be covered by the UCD tests. 194 | #[test] 195 | fn forward_additional() { 196 | assert_eq!(vec!["a.. ", "A"], sentences(b"a.. A")); 197 | assert_eq!(vec!["a.. a"], sentences(b"a.. a")); 198 | 199 | assert_eq!(vec!["a... ", "A"], sentences(b"a... A")); 200 | assert_eq!(vec!["a... a"], sentences(b"a... a")); 201 | 202 | assert_eq!(vec!["a...,..., a"], sentences(b"a...,..., a")); 203 | } 204 | 205 | fn sentences(bytes: &[u8]) -> Vec<&str> { 206 | bytes.sentences().collect() 207 | } 208 | 209 | #[cfg(not(miri))] 210 | fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> { 211 | strs.iter().map(|s| s.as_ref().as_bytes()).collect() 212 | } 213 | 214 | /// Return all of the UCD for sentence breaks. 215 | #[cfg(not(miri))] 216 | fn ucdtests() -> Vec<SentenceBreakTest> { 217 | const TESTDATA: &str = include_str!("data/SentenceBreakTest.txt"); 218 | 219 | let mut tests = vec![]; 220 | for mut line in TESTDATA.lines() { 221 | line = line.trim(); 222 | if line.starts_with("#") || line.contains("surrogate") { 223 | continue; 224 | } 225 | tests.push(line.parse().unwrap()); 226 | } 227 | tests 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /src/unicode/whitespace.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{dfa::Automaton, Anchored, Input}; 2 | 3 | use crate::unicode::fsm::{ 4 | whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD, 5 | whitespace_anchored_rev::WHITESPACE_ANCHORED_REV, 6 | }; 7 | 8 | /// Return the first position of a non-whitespace character. 9 | pub fn whitespace_len_fwd(slice: &[u8]) -> usize { 10 | let input = Input::new(slice).anchored(Anchored::Yes); 11 | WHITESPACE_ANCHORED_FWD 12 | .try_search_fwd(&input) 13 | .unwrap() 14 | .map_or(0, |hm| hm.offset()) 15 | } 16 | 17 | /// Return the last position of a non-whitespace character. 18 | pub fn whitespace_len_rev(slice: &[u8]) -> usize { 19 | let input = Input::new(slice).anchored(Anchored::Yes); 20 | WHITESPACE_ANCHORED_REV 21 | .try_search_rev(&input) 22 | .unwrap() 23 | .map_or(slice.len(), |hm| hm.offset()) 24 | } 25 | -------------------------------------------------------------------------------- /src/unicode/word.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{dfa::Automaton, Anchored, Input}; 2 | 3 | use crate::{ 4 | ext_slice::ByteSlice, 5 | unicode::fsm::{ 6 | simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD, 7 | }, 8 | utf8, 9 | }; 10 | 11 | /// An iterator over words in a byte string. 12 | /// 13 | /// This iterator is typically constructed by 14 | /// [`ByteSlice::words`](trait.ByteSlice.html#method.words). 
15 | /// 16 | /// This is similar to the [`WordsWithBreaks`](struct.WordsWithBreaks.html) 17 | /// iterator, except it only returns elements that contain a "word" character. 18 | /// A word character is defined by UTS #18 (Annex C) to be the combination 19 | /// of the `Alphabetic` and `Join_Control` properties, along with the 20 | /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories. 21 | /// 22 | /// Since words are made up of one or more codepoints, this iterator yields 23 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints 24 | /// are [substituted](index.html#handling-of-invalid-utf-8). 25 | /// 26 | /// This iterator yields words in accordance with the default word boundary 27 | /// rules specified in 28 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries). 29 | /// In particular, this may not be suitable for Japanese and Chinese scripts 30 | /// that do not use spaces between words. 31 | #[derive(Clone, Debug)] 32 | pub struct Words<'a>(WordsWithBreaks<'a>); 33 | 34 | impl<'a> Words<'a> { 35 | pub(crate) fn new(bs: &'a [u8]) -> Words<'a> { 36 | Words(WordsWithBreaks::new(bs)) 37 | } 38 | 39 | /// View the underlying data as a subslice of the original data. 40 | /// 41 | /// The slice returned has the same lifetime as the original slice, and so 42 | /// the iterator can continue to be used while this exists. 43 | /// 44 | /// # Examples 45 | /// 46 | /// ``` 47 | /// use bstr::ByteSlice; 48 | /// 49 | /// let mut it = b"foo bar baz".words(); 50 | /// 51 | /// assert_eq!(b"foo bar baz", it.as_bytes()); 52 | /// it.next(); 53 | /// it.next(); 54 | /// assert_eq!(b" baz", it.as_bytes()); 55 | /// it.next(); 56 | /// assert_eq!(b"", it.as_bytes()); 57 | /// ``` 58 | #[inline] 59 | pub fn as_bytes(&self) -> &'a [u8] { 60 | self.0.as_bytes() 61 | } 62 | } 63 | 64 | impl<'a> Iterator for Words<'a> { 65 | type Item = &'a str; 66 | 67 | #[inline] 68 | fn next(&mut self) -> Option<&'a str> { 69 | for word in self.0.by_ref() { 70 | let input = 71 | Input::new(word).anchored(Anchored::Yes).earliest(true); 72 | if SIMPLE_WORD_FWD.try_search_fwd(&input).unwrap().is_some() { 73 | return Some(word); 74 | } 75 | } 76 | None 77 | } 78 | } 79 | 80 | /// An iterator over words in a byte string and their byte index positions. 81 | /// 82 | /// This iterator is typically constructed by 83 | /// [`ByteSlice::word_indices`](trait.ByteSlice.html#method.word_indices). 84 | /// 85 | /// This is similar to the 86 | /// [`WordsWithBreakIndices`](struct.WordsWithBreakIndices.html) iterator, 87 | /// except it only returns elements that contain a "word" character. A 88 | /// word character is defined by UTS #18 (Annex C) to be the combination 89 | /// of the `Alphabetic` and `Join_Control` properties, along with the 90 | /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories. 91 | /// 92 | /// Since words are made up of one or more codepoints, this iterator 93 | /// yields `&str` elements (along with their start and end byte offsets). 94 | /// When invalid UTF-8 is encountered, replacement codepoints are 95 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the 96 | /// indices yielded by this iterator may not correspond to the length of the 97 | /// word yielded with those indices. 
For example, when this iterator encounters 98 | /// `\xFF` in the byte string, then it will yield a pair of indices ranging 99 | /// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`, 100 | /// which is three bytes in length. However, when given only valid UTF-8, then 101 | /// all indices are in exact correspondence with their paired word. 102 | /// 103 | /// This iterator yields words in accordance with the default word boundary 104 | /// rules specified in 105 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries). 106 | /// In particular, this may not be suitable for Japanese and Chinese scripts 107 | /// that do not use spaces between words. 108 | #[derive(Clone, Debug)] 109 | pub struct WordIndices<'a>(WordsWithBreakIndices<'a>); 110 | 111 | impl<'a> WordIndices<'a> { 112 | pub(crate) fn new(bs: &'a [u8]) -> WordIndices<'a> { 113 | WordIndices(WordsWithBreakIndices::new(bs)) 114 | } 115 | 116 | /// View the underlying data as a subslice of the original data. 117 | /// 118 | /// The slice returned has the same lifetime as the original slice, and so 119 | /// the iterator can continue to be used while this exists. 120 | /// 121 | /// # Examples 122 | /// 123 | /// ``` 124 | /// use bstr::ByteSlice; 125 | /// 126 | /// let mut it = b"foo bar baz".word_indices(); 127 | /// 128 | /// assert_eq!(b"foo bar baz", it.as_bytes()); 129 | /// it.next(); 130 | /// it.next(); 131 | /// assert_eq!(b" baz", it.as_bytes()); 132 | /// it.next(); 133 | /// it.next(); 134 | /// assert_eq!(b"", it.as_bytes()); 135 | /// ``` 136 | #[inline] 137 | pub fn as_bytes(&self) -> &'a [u8] { 138 | self.0.as_bytes() 139 | } 140 | } 141 | 142 | impl<'a> Iterator for WordIndices<'a> { 143 | type Item = (usize, usize, &'a str); 144 | 145 | #[inline] 146 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { 147 | for (start, end, word) in self.0.by_ref() { 148 | let input = 149 | Input::new(word).anchored(Anchored::Yes).earliest(true); 150 | if SIMPLE_WORD_FWD.try_search_fwd(&input).unwrap().is_some() { 151 | return Some((start, end, word)); 152 | } 153 | } 154 | None 155 | } 156 | } 157 | 158 | /// An iterator over all word breaks in a byte string. 159 | /// 160 | /// This iterator is typically constructed by 161 | /// [`ByteSlice::words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks). 162 | /// 163 | /// This iterator yields not only all words, but the content that comes between 164 | /// words. In particular, if all elements yielded by this iterator are 165 | /// concatenated, then the result is the original string (subject to Unicode 166 | /// replacement codepoint substitutions). 167 | /// 168 | /// Since words are made up of one or more codepoints, this iterator yields 169 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints 170 | /// are [substituted](index.html#handling-of-invalid-utf-8). 171 | /// 172 | /// This iterator yields words in accordance with the default word boundary 173 | /// rules specified in 174 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries). 175 | /// In particular, this may not be suitable for Japanese and Chinese scripts 176 | /// that do not use spaces between words. 
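///
/// # Examples
///
/// A short sketch of the round-trip property described above:
///
/// ```
/// use bstr::ByteSlice;
///
/// // The segments between words (here, the space) are yielded too, so
/// // concatenating the pieces reproduces the original input.
/// let pieces: Vec<&str> = b"foo bar".words_with_breaks().collect();
/// assert_eq!(vec!["foo", " ", "bar"], pieces);
/// assert_eq!("foo bar", pieces.concat());
/// ```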
177 | #[derive(Clone, Debug)] 178 | pub struct WordsWithBreaks<'a> { 179 | bs: &'a [u8], 180 | } 181 | 182 | impl<'a> WordsWithBreaks<'a> { 183 | pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreaks<'a> { 184 | WordsWithBreaks { bs } 185 | } 186 | 187 | /// View the underlying data as a subslice of the original data. 188 | /// 189 | /// The slice returned has the same lifetime as the original slice, and so 190 | /// the iterator can continue to be used while this exists. 191 | /// 192 | /// # Examples 193 | /// 194 | /// ``` 195 | /// use bstr::ByteSlice; 196 | /// 197 | /// let mut it = b"foo bar baz".words_with_breaks(); 198 | /// 199 | /// assert_eq!(b"foo bar baz", it.as_bytes()); 200 | /// it.next(); 201 | /// assert_eq!(b" bar baz", it.as_bytes()); 202 | /// it.next(); 203 | /// it.next(); 204 | /// assert_eq!(b" baz", it.as_bytes()); 205 | /// it.next(); 206 | /// it.next(); 207 | /// assert_eq!(b"", it.as_bytes()); 208 | /// ``` 209 | #[inline] 210 | pub fn as_bytes(&self) -> &'a [u8] { 211 | self.bs 212 | } 213 | } 214 | 215 | impl<'a> Iterator for WordsWithBreaks<'a> { 216 | type Item = &'a str; 217 | 218 | #[inline] 219 | fn next(&mut self) -> Option<&'a str> { 220 | let (word, size) = decode_word(self.bs); 221 | if size == 0 { 222 | return None; 223 | } 224 | self.bs = &self.bs[size..]; 225 | Some(word) 226 | } 227 | } 228 | 229 | /// An iterator over all word breaks in a byte string, along with their byte 230 | /// index positions. 231 | /// 232 | /// This iterator is typically constructed by 233 | /// [`ByteSlice::words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices). 234 | /// 235 | /// This iterator yields not only all words, but the content that comes between 236 | /// words. In particular, if all elements yielded by this iterator are 237 | /// concatenated, then the result is the original string (subject to Unicode 238 | /// replacement codepoint substitutions). 239 | /// 240 | /// Since words are made up of one or more codepoints, this iterator 241 | /// yields `&str` elements (along with their start and end byte offsets). 242 | /// When invalid UTF-8 is encountered, replacement codepoints are 243 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the 244 | /// indices yielded by this iterator may not correspond to the length of the 245 | /// word yielded with those indices. For example, when this iterator encounters 246 | /// `\xFF` in the byte string, then it will yield a pair of indices ranging 247 | /// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`, 248 | /// which is three bytes in length. However, when given only valid UTF-8, then 249 | /// all indices are in exact correspondence with their paired word. 250 | /// 251 | /// This iterator yields words in accordance with the default word boundary 252 | /// rules specified in 253 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries). 254 | /// In particular, this may not be suitable for Japanese and Chinese scripts 255 | /// that do not use spaces between words. 256 | #[derive(Clone, Debug)] 257 | pub struct WordsWithBreakIndices<'a> { 258 | bs: &'a [u8], 259 | forward_index: usize, 260 | } 261 | 262 | impl<'a> WordsWithBreakIndices<'a> { 263 | pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> { 264 | WordsWithBreakIndices { bs, forward_index: 0 } 265 | } 266 | 267 | /// View the underlying data as a subslice of the original data. 
268 | /// 269 | /// The slice returned has the same lifetime as the original slice, and so 270 | /// the iterator can continue to be used while this exists. 271 | /// 272 | /// # Examples 273 | /// 274 | /// ``` 275 | /// use bstr::ByteSlice; 276 | /// 277 | /// let mut it = b"foo bar baz".words_with_break_indices(); 278 | /// 279 | /// assert_eq!(b"foo bar baz", it.as_bytes()); 280 | /// it.next(); 281 | /// assert_eq!(b" bar baz", it.as_bytes()); 282 | /// it.next(); 283 | /// it.next(); 284 | /// assert_eq!(b" baz", it.as_bytes()); 285 | /// it.next(); 286 | /// it.next(); 287 | /// assert_eq!(b"", it.as_bytes()); 288 | /// ``` 289 | #[inline] 290 | pub fn as_bytes(&self) -> &'a [u8] { 291 | self.bs 292 | } 293 | } 294 | 295 | impl<'a> Iterator for WordsWithBreakIndices<'a> { 296 | type Item = (usize, usize, &'a str); 297 | 298 | #[inline] 299 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { 300 | let index = self.forward_index; 301 | let (word, size) = decode_word(self.bs); 302 | if size == 0 { 303 | return None; 304 | } 305 | self.bs = &self.bs[size..]; 306 | self.forward_index += size; 307 | Some((index, index + size, word)) 308 | } 309 | } 310 | 311 | fn decode_word(bs: &[u8]) -> (&str, usize) { 312 | if bs.is_empty() { 313 | ("", 0) 314 | } else if let Some(hm) = { 315 | let input = Input::new(bs).anchored(Anchored::Yes); 316 | WORD_BREAK_FWD.try_search_fwd(&input).unwrap() 317 | } { 318 | // Safe because a match can only occur for valid UTF-8. 319 | let word = unsafe { bs[..hm.offset()].to_str_unchecked() }; 320 | (word, word.len()) 321 | } else { 322 | const INVALID: &str = "\u{FFFD}"; 323 | // No match on non-empty bytes implies we found invalid UTF-8. 324 | let (_, size) = utf8::decode_lossy(bs); 325 | (INVALID, size) 326 | } 327 | } 328 | 329 | #[cfg(all(test, feature = "std"))] 330 | mod tests { 331 | use alloc::{vec, vec::Vec}; 332 | 333 | #[cfg(not(miri))] 334 | use ucd_parse::WordBreakTest; 335 | 336 | use crate::ext_slice::ByteSlice; 337 | 338 | #[test] 339 | #[cfg(not(miri))] 340 | fn forward_ucd() { 341 | for (i, test) in ucdtests().into_iter().enumerate() { 342 | let given = test.words.concat(); 343 | let got = words(given.as_bytes()); 344 | assert_eq!( 345 | test.words, 346 | got, 347 | "\n\nword forward break test {} failed:\n\ 348 | given: {:?}\n\ 349 | expected: {:?}\n\ 350 | got: {:?}\n", 351 | i, 352 | given, 353 | strs_to_bstrs(&test.words), 354 | strs_to_bstrs(&got), 355 | ); 356 | } 357 | } 358 | 359 | // Some additional tests that don't seem to be covered by the UCD tests. 360 | // 361 | // It's pretty amazing that the UCD tests miss these cases. I only found 362 | // them by running this crate's segmenter and ICU's segmenter on the same 363 | // text and comparing the output. 364 | #[test] 365 | fn forward_additional() { 366 | assert_eq!(vec!["a", ".", " ", "Y"], words(b"a. Y")); 367 | assert_eq!(vec!["r", ".", " ", "Yo"], words(b"r. Yo")); 368 | assert_eq!( 369 | vec!["whatsoever", ".", " ", "You", " ", "may"], 370 | words(b"whatsoever. 
You may") 371 | ); 372 | assert_eq!( 373 | vec!["21stcentury'syesterday"], 374 | words(b"21stcentury'syesterday") 375 | ); 376 | 377 | assert_eq!(vec!["Bonta_", "'", "s"], words(b"Bonta_'s")); 378 | assert_eq!(vec!["_vhat's"], words(b"_vhat's")); 379 | assert_eq!(vec!["__on'anima"], words(b"__on'anima")); 380 | assert_eq!(vec!["123_", "'", "4"], words(b"123_'4")); 381 | assert_eq!(vec!["_123'4"], words(b"_123'4")); 382 | assert_eq!(vec!["__12'345"], words(b"__12'345")); 383 | 384 | assert_eq!( 385 | vec!["tomorrowat4", ":", "00", ","], 386 | words(b"tomorrowat4:00,") 387 | ); 388 | assert_eq!(vec!["RS1", "'", "s"], words(b"RS1's")); 389 | assert_eq!(vec!["X38"], words(b"X38")); 390 | 391 | assert_eq!(vec!["4abc", ":", "00", ","], words(b"4abc:00,")); 392 | assert_eq!(vec!["12S", "'", "1"], words(b"12S'1")); 393 | assert_eq!(vec!["1XY"], words(b"1XY")); 394 | 395 | assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes())); 396 | 397 | // Tests that Vithkuqi works, which was introduced in Unicode 14. 398 | // This test fails prior to Unicode 14. 399 | assert_eq!( 400 | vec!["\u{10570}\u{10597}"], 401 | words("\u{10570}\u{10597}".as_bytes()) 402 | ); 403 | } 404 | 405 | fn words(bytes: &[u8]) -> Vec<&str> { 406 | bytes.words_with_breaks().collect() 407 | } 408 | 409 | #[cfg(not(miri))] 410 | fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> { 411 | strs.iter().map(|s| s.as_ref().as_bytes()).collect() 412 | } 413 | 414 | /// Return all of the UCD for word breaks. 415 | #[cfg(not(miri))] 416 | fn ucdtests() -> Vec<WordBreakTest> { 417 | const TESTDATA: &str = include_str!("data/WordBreakTest.txt"); 418 | 419 | let mut tests = vec![]; 420 | for mut line in TESTDATA.lines() { 421 | line = line.trim(); 422 | if line.starts_with("#") || line.contains("surrogate") { 423 | continue; 424 | } 425 | tests.push(line.parse().unwrap()); 426 | } 427 | tests 428 | } 429 | } 430 | --------------------------------------------------------------------------------
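The invalid-UTF-8 contract documented on `GraphemeIndices` above can be exercised end to end. A minimal sketch, assuming `bstr` is added as a dependency:

use bstr::ByteSlice;

fn main() {
    // `\xFF` can never start a valid UTF-8 sequence, so the iterator
    // reports a range covering just that one byte while yielding the
    // three-byte replacement codepoint U+FFFD.
    let mut it = b"a\xFF".grapheme_indices();
    assert_eq!(Some((0, 1, "a")), it.next());
    assert_eq!(Some((1, 2, "\u{FFFD}")), it.next());
    assert_eq!(None, it.next());
}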