├── .github
│   ├── FUNDING.yml
│   └── workflows
│       └── ci.yml
├── .gitignore
├── COPYING
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
├── bench
│   ├── .gitignore
│   ├── Cargo.toml
│   ├── data
│   │   ├── opensubtitles2018-en-huge-ascii.txt
│   │   ├── opensubtitles2018-en-small-ascii.txt
│   │   ├── opensubtitles2018-en-tiny-ascii.txt
│   │   ├── opensubtitles2018-ru-huge-utf8.txt
│   │   ├── opensubtitles2018-ru-small-utf8.txt
│   │   ├── opensubtitles2018-ru-tiny-utf8.txt
│   │   ├── opensubtitles2018-zh-huge-utf8.txt
│   │   ├── opensubtitles2018-zh-small-utf8.txt
│   │   ├── opensubtitles2018-zh-tiny-utf8.txt
│   │   ├── repeated-rare-huge
│   │   ├── repeated-rare-small
│   │   ├── sherlock-holmes-huge-ascii.txt
│   │   ├── sherlock-holmes-small-ascii.txt
│   │   └── sherlock-holmes-tiny-ascii.txt
│   └── src
│       ├── bench.rs
│       ├── inputs.rs
│       ├── lib.rs
│       └── search.rs
├── examples
│   ├── graphemes-std.rs
│   ├── graphemes.rs
│   ├── lines-std.rs
│   ├── lines.rs
│   ├── uppercase-std.rs
│   ├── uppercase.rs
│   ├── words-std.rs
│   └── words.rs
├── rustfmt.toml
├── scripts
│   ├── generate-unicode-data
│   └── regex
│       ├── grapheme.sh
│       ├── sentence.sh
│       └── word.sh
└── src
    ├── ascii.rs
    ├── bstr.rs
    ├── bstring.rs
    ├── byteset
    │   ├── mod.rs
    │   └── scalar.rs
    ├── escape_bytes.rs
    ├── ext_slice.rs
    ├── ext_vec.rs
    ├── impls.rs
    ├── io.rs
    ├── lib.rs
    ├── tests.rs
    ├── unicode
    │   ├── data
    │   │   ├── GraphemeBreakTest.txt
    │   │   ├── LICENSE-UNICODE
    │   │   ├── SentenceBreakTest.txt
    │   │   └── WordBreakTest.txt
    │   ├── fsm
    │   │   ├── grapheme_break_fwd.bigendian.dfa
    │   │   ├── grapheme_break_fwd.littleendian.dfa
    │   │   ├── grapheme_break_fwd.rs
    │   │   ├── grapheme_break_rev.bigendian.dfa
    │   │   ├── grapheme_break_rev.littleendian.dfa
    │   │   ├── grapheme_break_rev.rs
    │   │   ├── mod.rs
    │   │   ├── regional_indicator_rev.bigendian.dfa
    │   │   ├── regional_indicator_rev.littleendian.dfa
    │   │   ├── regional_indicator_rev.rs
    │   │   ├── sentence_break_fwd.bigendian.dfa
    │   │   ├── sentence_break_fwd.littleendian.dfa
    │   │   ├── sentence_break_fwd.rs
    │   │   ├── simple_word_fwd.bigendian.dfa
    │   │   ├── simple_word_fwd.littleendian.dfa
    │   │   ├── simple_word_fwd.rs
    │   │   ├── whitespace_anchored_fwd.bigendian.dfa
    │   │   ├── whitespace_anchored_fwd.littleendian.dfa
    │   │   ├── whitespace_anchored_fwd.rs
    │   │   ├── whitespace_anchored_rev.bigendian.dfa
    │   │   ├── whitespace_anchored_rev.littleendian.dfa
    │   │   ├── whitespace_anchored_rev.rs
    │   │   ├── word_break_fwd.bigendian.dfa
    │   │   ├── word_break_fwd.littleendian.dfa
    │   │   └── word_break_fwd.rs
    │   ├── grapheme.rs
    │   ├── mod.rs
    │   ├── sentence.rs
    │   ├── whitespace.rs
    │   └── word.rs
    └── utf8.rs
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [BurntSushi]
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: ci
2 | on:
3 |   pull_request:
4 |   push:
5 |     branches:
6 |     - master
7 |   schedule:
8 |   - cron: '00 01 * * *'
9 | 
10 | # This section is needed to drop the write-all permissions that are granted on
11 | # the `schedule` event. By specifying any permission explicitly, all others are
12 | # set to none. By using the principle of least privilege the damage a
13 | # compromised workflow can do (because of an injection or compromised third
14 | # party tool or action) is restricted. Currently the workflow doesn't need any
15 | # additional permission except for pulling the code. Adding labels to issues,
16 | # commenting on pull-requests, etc. may need additional permissions:
17 | #
18 | # Syntax for this section:
19 | # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#permissions
20 | #
21 | # Reference for how to assign permissions on a job-by-job basis:
22 | # https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
23 | #
24 | # Reference for available permissions that we can enable if needed:
25 | # https://docs.github.com/en/actions/security-guides/automatic-token-authentication#permissions-for-the-github_token
26 | permissions:
27 |   # to fetch code (actions/checkout)
28 |   contents: read
29 | 
30 | jobs:
31 |   test:
32 |     name: test
33 |     runs-on: ${{ matrix.os }}
34 |     strategy:
35 |       matrix:
36 |         include:
37 |         - build: pinned
38 |           os: ubuntu-latest
39 |           rust: 1.73.0
40 |         - build: stable
41 |           os: ubuntu-latest
42 |           rust: stable
43 |         - build: beta
44 |           os: ubuntu-latest
45 |           rust: beta
46 |         - build: nightly
47 |           os: ubuntu-latest
48 |           rust: nightly
49 |         - build: macos
50 |           os: macos-latest
51 |           rust: stable
52 |         - build: win-msvc
53 |           os: windows-latest
54 |           rust: stable
55 |         - build: win-gnu
56 |           os: windows-latest
57 |           rust: stable-x86_64-gnu
58 |     env:
59 |       RUSTFLAGS: -D warnings
60 |       RUST_BACKTRACE: 1
61 |     steps:
62 |     - name: Checkout repository
63 |       uses: actions/checkout@v3
64 |     - name: Install Rust
65 |       uses: dtolnay/rust-toolchain@master
66 |       with:
67 |         toolchain: ${{ matrix.rust }}
68 |     - run: cargo build --verbose
69 |     - run: cargo doc --verbose
70 |     # We run a few other builds, but only on one instance to avoid doing
71 |     # more work than we need to.
72 |     - if: matrix.build == 'stable'
73 |       run: cargo build --verbose --features serde
74 |     - if: matrix.build == 'stable'
75 |       run: cargo build --verbose --no-default-features
76 |     - if: matrix.build == 'stable'
77 |       run: cargo build --verbose --no-default-features --features serde,alloc
78 |     - if: matrix.build == 'stable'
79 |       run: cargo build --verbose --no-default-features --features serde
80 |     - if: matrix.build == 'stable'
81 |       run: cargo build --verbose --no-default-features --features alloc
82 |     # Our dev dependencies evolve more rapidly than we'd like, so only run
83 |     # tests when we aren't pinning the Rust version.
84 |     - if: matrix.build != 'pinned'
85 |       run: cargo test --verbose
86 |     # As with 'cargo build' above, run tests on a bunch of feature
87 |     # combinations, but just on 'stable' to avoid doing more work than we have
88 |     # to.
89 |     - if: matrix.build == 'stable'
90 |       run: cargo test --verbose --features serde
91 |     - if: matrix.build == 'stable'
92 |       run: cargo test --verbose --no-default-features
93 |     - if: matrix.build == 'stable'
94 |       run: cargo test --verbose --no-default-features --features serde,alloc
95 |     - if: matrix.build == 'stable'
96 |       run: cargo test --verbose --no-default-features --features serde
97 |     - if: matrix.build == 'stable'
98 |       run: cargo test --verbose --no-default-features --features alloc
99 |     - name: Run benchmarks as tests
100 |       if: matrix.build == 'stable'
101 |       working-directory: ./bench
102 |       run: cargo test --verbose --benches
103 | 
104 |   rustfmt:
105 |     name: rustfmt
106 |     runs-on: ubuntu-latest
107 |     steps:
108 |     - name: Checkout repository
109 |       uses: actions/checkout@v3
110 |     - name: Install Rust
111 |       uses: dtolnay/rust-toolchain@master
112 |       with:
113 |         toolchain: stable
114 |         components: rustfmt
115 |     - name: Check formatting
116 |       run: cargo fmt --check
117 | 
118 |   # miri:
119 |   #   name: miri
120 |   #   runs-on: ubuntu-latest
121 |   #   steps:
122 |   #   - name: Checkout repository
123 |   #     uses: actions/checkout@v3
124 |   #   - name: Install Rust
125 |   #     uses: dtolnay/rust-toolchain@miri
126 |   #   - run: cargo miri test --lib --verbose
127 |   #     env:
128 |   #       MIRIFLAGS: -Zmiri-strict-provenance
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .*.swp
2 | tags
3 | target
4 | /Cargo.lock
--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | This project is licensed under either of
2 | 
3 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
4 |    https://www.apache.org/licenses/LICENSE-2.0)
5 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
6 |    https://opensource.org/licenses/MIT)
7 | 
8 | at your option.
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "bstr"
3 | version = "1.12.0"  #:version
4 | authors = ["Andrew Gallant <jamslam@gmail.com>"]
5 | description = "A string type that is not required to be valid UTF-8."
6 | documentation = "https://docs.rs/bstr" 7 | homepage = "https://github.com/BurntSushi/bstr" 8 | repository = "https://github.com/BurntSushi/bstr" 9 | readme = "README.md" 10 | keywords = ["string", "str", "byte", "bytes", "text"] 11 | license = "MIT OR Apache-2.0" 12 | categories = ["text-processing", "encoding"] 13 | exclude = ["/.github", "/scripts", "/src/unicode/data"] 14 | edition = "2021" 15 | rust-version = "1.73" 16 | resolver = "2" 17 | 18 | [workspace] 19 | members = ["bench"] 20 | 21 | [lib] 22 | bench = false 23 | 24 | [features] 25 | default = ["std", "unicode"] 26 | std = ["alloc", "memchr/std", "serde?/std"] 27 | alloc = ["memchr/alloc", "serde?/alloc"] 28 | unicode = ["dep:regex-automata"] 29 | serde = ["dep:serde"] 30 | 31 | [dependencies] 32 | memchr = { version = "2.7.1", default-features = false } 33 | serde = { version = "1.0.85", default-features = false, optional = true } 34 | 35 | [dependencies.regex-automata] 36 | version = "0.4.1" 37 | default-features = false 38 | features = ["dfa-search"] 39 | optional = true 40 | 41 | [dev-dependencies] 42 | quickcheck = { version = "1", default-features = false } 43 | ucd-parse = "0.1.3" 44 | unicode-segmentation = "1.2.1" 45 | 46 | [package.metadata.docs.rs] 47 | # We want to document all features. 48 | all-features = true 49 | # Since this crate's feature setup is pretty complicated, it is worth opting 50 | # into a nightly unstable option to show the features that need to be enabled 51 | # for public API items. To do that, we set 'docsrs', and when that's enabled, 52 | # we enable the 'doc_auto_cfg' feature. 53 | # 54 | # To test this locally, run: 55 | # 56 | # RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features 57 | rustdoc-args = ["--cfg", "docsrs"] 58 | 59 | [profile.release] 60 | debug = true 61 | 62 | [[example]] 63 | name = "graphemes" 64 | required-features = ["std", "unicode"] 65 | 66 | [[example]] 67 | name = "lines" 68 | required-features = ["std"] 69 | 70 | [[example]] 71 | name = "uppercase" 72 | required-features = ["std", "unicode"] 73 | 74 | [[example]] 75 | name = "words" 76 | required-features = ["std", "unicode"] 77 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2018-2019 Andrew Gallant
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | bstr
2 | ====
3 | This crate provides extension traits for `&[u8]` and `Vec<u8>` that enable
4 | their use as byte strings, where byte strings are _conventionally_ UTF-8. This
5 | differs from the standard library's `String` and `str` types in that they are
6 | not required to be valid UTF-8, but may be fully or partially valid UTF-8.
7 | 
8 | [![Build status](https://github.com/BurntSushi/bstr/workflows/ci/badge.svg)](https://github.com/BurntSushi/bstr/actions)
9 | [![crates.io](https://img.shields.io/crates/v/bstr.svg)](https://crates.io/crates/bstr)
10 | 
11 | 
12 | ### Documentation
13 | 
14 | https://docs.rs/bstr
15 | 
16 | 
17 | ### When should I use byte strings?
18 | 
19 | See this part of the documentation for more details:
20 | <https://docs.rs/bstr/1.*/bstr/#when-should-i-use-byte-strings>.
21 | 
22 | The short story is that byte strings are useful when it is inconvenient or
23 | incorrect to require valid UTF-8.
24 | 
25 | 
26 | ### Usage
27 | 
28 | `cargo add bstr`
29 | 
30 | ### Examples
31 | 
32 | The following examples exhibit both the API features of byte strings and
33 | the I/O convenience functions provided for reading line-by-line quickly.
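
As a warm-up before the I/O examples, here is a minimal sketch of the
extension-trait API on its own (the byte string below is invented for
illustration):

```rust
use bstr::ByteSlice;

fn main() {
    // Not valid UTF-8 (because of \xFF), but still a fine byte string.
    let bytes = b"foo\xFFbar baz";
    assert!(bytes.contains_str("bar"));
    assert_eq!(bytes.split_str(" ").count(), 2);
    // Lossy conversion substitutes the Unicode replacement codepoint.
    assert_eq!(bytes.to_str_lossy(), "foo\u{FFFD}bar baz");
}
```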
34 | 
35 | This first example simply shows how to efficiently iterate over lines in stdin,
36 | and print out lines containing a particular substring:
37 | 
38 | ```rust
39 | use std::{error::Error, io::{self, Write}};
40 | use bstr::{ByteSlice, io::BufReadExt};
41 | 
42 | fn main() -> Result<(), Box<dyn Error>> {
43 |     let stdin = io::stdin();
44 |     let mut stdout = io::BufWriter::new(io::stdout());
45 | 
46 |     stdin.lock().for_byte_line_with_terminator(|line| {
47 |         if line.contains_str("Dimension") {
48 |             stdout.write_all(line)?;
49 |         }
50 |         Ok(true)
51 |     })?;
52 |     Ok(())
53 | }
54 | ```
55 | 
56 | This example shows how to count all of the words (Unicode-aware) in stdin,
57 | line-by-line:
58 | 
59 | ```rust
60 | use std::{error::Error, io};
61 | use bstr::{ByteSlice, io::BufReadExt};
62 | 
63 | fn main() -> Result<(), Box<dyn Error>> {
64 |     let stdin = io::stdin();
65 |     let mut words = 0;
66 |     stdin.lock().for_byte_line_with_terminator(|line| {
67 |         words += line.words().count();
68 |         Ok(true)
69 |     })?;
70 |     println!("{}", words);
71 |     Ok(())
72 | }
73 | ```
74 | 
75 | This example shows how to convert a stream on stdin to uppercase without
76 | performing UTF-8 validation _and_ amortizing allocation. On standard ASCII
77 | text, this is quite a bit faster than what you can (easily) do with standard
78 | library APIs. (N.B. Any invalid UTF-8 bytes are passed through unchanged.)
79 | 
80 | ```rust
81 | use std::{error::Error, io::{self, Write}};
82 | use bstr::{ByteSlice, io::BufReadExt};
83 | 
84 | fn main() -> Result<(), Box<dyn Error>> {
85 |     let stdin = io::stdin();
86 |     let mut stdout = io::BufWriter::new(io::stdout());
87 | 
88 |     let mut upper = vec![];
89 |     stdin.lock().for_byte_line_with_terminator(|line| {
90 |         upper.clear();
91 |         line.to_uppercase_into(&mut upper);
92 |         stdout.write_all(&upper)?;
93 |         Ok(true)
94 |     })?;
95 |     Ok(())
96 | }
97 | ```
98 | 
99 | This example shows how to extract the first 10 visual characters (as grapheme
100 | clusters) from each line, where invalid UTF-8 sequences are generally treated
101 | as a single character and are passed through correctly:
102 | 
103 | ```rust
104 | use std::{error::Error, io::{self, Write}};
105 | use bstr::{ByteSlice, io::BufReadExt};
106 | 
107 | fn main() -> Result<(), Box<dyn Error>> {
108 |     let stdin = io::stdin();
109 |     let mut stdout = io::BufWriter::new(io::stdout());
110 | 
111 |     stdin.lock().for_byte_line_with_terminator(|line| {
112 |         let end = line
113 |             .grapheme_indices()
114 |             .map(|(_, end, _)| end)
115 |             .take(10)
116 |             .last()
117 |             .unwrap_or(line.len());
118 |         stdout.write_all(line[..end].trim_end())?;
119 |         stdout.write_all(b"\n")?;
120 |         Ok(true)
121 |     })?;
122 |     Ok(())
123 | }
124 | ```
125 | 
126 | 
127 | ### Cargo features
128 | 
129 | This crate comes with a few features that control standard library, serde and
130 | Unicode support.
131 | 
132 | * `std` - **Enabled** by default. This provides APIs that require the standard
133 |   library, such as `Vec<u8>` and `PathBuf`. Enabling this feature also enables
134 |   the `alloc` feature.
135 | * `alloc` - **Enabled** by default. This provides APIs that require allocations
136 |   via the `alloc` crate, such as `Vec<u8>`.
137 | * `unicode` - **Enabled** by default. This provides APIs that require sizable
138 |   Unicode data compiled into the binary. This includes, but is not limited to,
139 |   grapheme/word/sentence segmenters. When this is disabled, basic support such
140 |   as UTF-8 decoding is still included. Note that currently, enabling this
141 |   feature also requires enabling the `std` feature. It is expected that this
142 |   limitation will be lifted at some point.
143 | * `serde` - Enables implementations of serde traits for `BStr`, and also
144 |   `BString` when `alloc` is enabled.
145 | 
146 | 
147 | ### Minimum Rust version policy
148 | 
149 | This crate's minimum supported `rustc` version (MSRV) is `1.73`.
150 | 
151 | In general, this crate will be conservative with respect to the minimum
152 | supported version of Rust. MSRV may be bumped in minor version releases.
153 | 
154 | 
155 | ### Future work
156 | 
157 | Since it is plausible that some of the types in this crate might end up in your
158 | public API (e.g., `BStr` and `BString`), we will commit to being very
159 | conservative with respect to new major version releases. It's difficult to say
160 | precisely how conservative, but unless there is a major issue with the `1.0`
161 | release, I wouldn't expect a `2.0` release to come out any sooner than some
162 | period of years.
163 | 
164 | A large part of the API surface area was taken from the standard library, so
165 | from an API design perspective, a good portion of this crate should be on solid
166 | ground. The main differences from the standard library are in how the various
167 | substring search routines work. The standard library provides generic
168 | infrastructure for supporting different types of searches with a single method,
169 | whereas this library prefers to define new methods for each type of search and
170 | drop the generic infrastructure.
171 | 
172 | Some _probable_ future considerations for APIs include, but are not limited to:
173 | 
174 | * Unicode normalization.
175 | * More sophisticated support for dealing with Unicode case, perhaps by
176 |   combining the use cases supported by [`caseless`](https://docs.rs/caseless)
177 |   and [`unicase`](https://docs.rs/unicase).
178 | 
179 | Here are some examples that are _probably_ out of scope for this crate:
180 | 
181 | * Regular expressions.
182 | * Unicode collation.
183 | 
184 | The exact scope isn't quite clear, but I expect we can iterate on it.
185 | 
186 | In general, as stated below, this crate brings lots of related APIs together
187 | into a single crate while simultaneously attempting to keep the total number of
188 | dependencies low. Indeed, every dependency of `bstr`, except for `memchr`, is
189 | optional.
190 | 
191 | 
192 | ### High level motivation
193 | 
194 | Strictly speaking, the `bstr` crate provides very little that can't already be
195 | achieved with the standard library `Vec<u8>`/`&[u8]` APIs and the ecosystem of
196 | library crates. For example:
197 | 
198 | * The standard library's
199 |   [`Utf8Error`](https://doc.rust-lang.org/std/str/struct.Utf8Error.html) can be
200 |   used for incremental lossy decoding of `&[u8]`.
201 | * The
202 |   [`unicode-segmentation`](https://unicode-rs.github.io/unicode-segmentation/unicode_segmentation/index.html)
203 |   crate can be used for iterating over graphemes (or words), but is only
204 |   implemented for `&str` types. One could use `Utf8Error` above to implement
205 |   grapheme iteration with the same semantics as what `bstr` provides (automatic
206 |   Unicode replacement codepoint substitution).
207 | * The [`twoway`](https://docs.rs/twoway) crate can be used for fast substring
208 |   searching on `&[u8]`.
209 | 
210 | So why create `bstr`? Part of the point of the `bstr` crate is to provide a
211 | uniform API of coupled components instead of relying on users to piece together
212 | loosely coupled components from the crate ecosystem.
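
As a brief sketch of what that uniform API looks like in practice (the byte
string contents here are invented for illustration; every method used is part
of `bstr`'s documented `ByteSlice` trait):

```rust
use bstr::ByteSlice;

fn main() {
    // Not valid UTF-8 (\xFF), but search, replace, split and trim all work.
    let haystack = b"foo\xFFbar, foo baz".to_vec();

    // Substring search and replace without any UTF-8 validation.
    let replaced = haystack.replace("foo", "quux");
    assert_eq!(replaced.as_bstr(), b"quux\xFFbar, quux baz".as_bstr());

    // Splitting and trimming compose through the same trait.
    let fields: Vec<&[u8]> =
        replaced.split_str(",").map(|f| f.trim()).collect();
    assert_eq!(fields, vec![&b"quux\xFFbar"[..], &b"quux baz"[..]]);
}
```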
For example, if you wanted
213 | to perform a search and replace in a `Vec<u8>`, then writing the code to do
214 | that with the `twoway` crate is not that difficult, but it's still additional
215 | glue code you have to write. This work adds up depending on what you're doing.
216 | Consider, for example, trimming and splitting, along with their different
217 | variants.
218 | 
219 | In other words, `bstr` is partially a way of pushing back against the
220 | micro-crate ecosystem that appears to be evolving. Namely, it is a goal of
221 | `bstr` to keep its dependency list lightweight. For example, `serde` is an
222 | optional dependency because there is no feasible alternative. In service of
223 | this philosophy, currently, the only required dependency of `bstr` is `memchr`.
224 | 
225 | 
226 | ### License
227 | 
228 | This project is licensed under either of
229 | 
230 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
231 |    https://www.apache.org/licenses/LICENSE-2.0)
232 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
233 |    https://opensource.org/licenses/MIT)
234 | 
235 | at your option.
236 | 
237 | The data in `src/unicode/data/` is licensed under the Unicode License Agreement
238 | ([LICENSE-UNICODE](https://www.unicode.org/copyright.html#License)), although
239 | this data is only used in tests.
--------------------------------------------------------------------------------
/bench/.gitignore:
--------------------------------------------------------------------------------
1 | log
--------------------------------------------------------------------------------
/bench/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | publish = false
3 | name = "bstr-bench"
4 | version = "0.0.1"
5 | authors = ["Andrew Gallant <jamslam@gmail.com>"]
6 | description = "Criterion benchmark suite for bstr."
7 | homepage = "https://github.com/BurntSushi/bstr"
8 | repository = "https://github.com/BurntSushi/bstr"
9 | license = "Unlicense OR MIT"
10 | edition = "2018"
11 | 
12 | [lib]
13 | bench = false
14 | 
15 | [[bench]]
16 | name = "bstr"
17 | harness = false
18 | path = "src/bench.rs"
19 | 
20 | [dependencies]
21 | criterion = "0.3.4"
22 | bstr = { version = "1.0.0", path = ".." }
23 | # For comparisons.
24 | unicode-segmentation = "1.2.1"
--------------------------------------------------------------------------------
/bench/data/opensubtitles2018-en-small-ascii.txt:
--------------------------------------------------------------------------------
1 | Presented by IM Pictures
2 | Produced by Shin Cine
3 | In association with MVP Venture Capital and Cinema Service
4 | Jeon Ji-hyun Cha Tae-hyun
5 | My Sassy Girl
6 | Exactly two years ago today, she and I buried a time capsule here.
7 | We promised to meet here two years later, but she hasn't come yet.
8 | I'm going to wait.
9 | Here we go.
10 | Please, don't move.
11 | One, two...
12 | Wait a minute.
13 | Hello?
14 | Oh, auntie.
15 | Sorry, I'm on my way.
16 | I'm really sorry.
17 | Yes, I'm coming.
18 | I'm having my photo taken.
19 | Bye.
20 | Are you ready?
21 | Here we go.
22 | One, two...
23 | My parents wanted a daughter, so they raised me like one.
24 | So I thought I was a girl until I was seven.
25 | I had to go to the women's public bath, too.
26 | The older I got,
27 | I thought my penis would get smaller and disappear.
28 | But it was the opposite.
29 | First Half
30 | He hasn't changed at all.
31 | No, I'm a real man now.
32 | Hey, asshole.
33 | Think clerical work in the army makes you a man? 34 | You irritate me! 35 | Give me a break, asshole. 36 | My job was tougher than you could imagine. 37 | Hey! 38 | I worked near the DMZ. 39 | Who are you kid -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-en-tiny-ascii.txt: -------------------------------------------------------------------------------- 1 | Presented by IM Pictures 2 | Produced by Shi -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-ru-small-utf8.txt: -------------------------------------------------------------------------------- 1 | Рэй МИЛЛАНД, Энтони КУИН, Дебра ПАЖЕТ в фильме БЕРЕГ РЕКИ 2 | в фильме также снимались: 3 | Гарри КЭРИ-мл., Чабби ДЖОНСОН, Байрон ФУЛДЖЕ, Том МакКи, Фрэнк ГЕРСТЛ сценарий Гарольда Джэкоба СМИТА и Джэймса ЛЕЙСЕСТЕРА по рассказу Гарольда Джэкоба СМИТА "Самая высокая гора" 4 | режиссер Аллан ДВАН 5 | - А вы выбрали жаркий денек, мистер. 6 | - Я всегда так делаю. 7 | - Полный бак? 8 | - Еще бы! 9 | А у вас мощная "тачка", как я погляжу. 10 | - Могу продать ее вам. 11 | - Нет, спасибо! 12 | - Собираетесь немного поохотиться? 13 | - Ну, я надеюсь на это. 14 | Вы знаете, не проиживает тут поблизости парень по имени Кэмеро -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-ru-tiny-utf8.txt: -------------------------------------------------------------------------------- 1 | Рэй МИЛЛАНД, Энтони КУ -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-zh-small-utf8.txt: -------------------------------------------------------------------------------- 1 | 我去拜托旅馆的人 2 | 出去喝就行了 3 | 我去拜托长井找工作 4 | 他说帮我问问他哥哥的公司 5 | 不知道会否成事 6 | 既然他肯答应,一定有结果的 7 | 真羡慕他至今还是优哉悠哉 8 | 叔叔,你老是偷听人家拉琴 9 | 小缝,你的颤音有进步了 10 | 我才不理你 11 | 叔叔你知道 12 | 爷爷找你来谈什么吗? 13 | 不知道 14 | 你的亲事 15 | 我去看看 16 | 走好 17 | 加油 18 | 你已经30岁了吧? 19 | 是的 20 | 身体健壮吧? 21 | 两三年来没有感冒 22 | 脑袋还算不笨吧? 23 | 是的 24 | 游手好闲太可惜了 25 | 他叫什么名字呢... 26 | 那个常去找你聊天的男人 27 | 我曾经见过他一两次 28 | 平冈吗? 29 | 那个人不算上乘人材... 30 | 听说帝大毕业后就去了外地 31 | 如今因为失败而回来 32 | 为什么? 33 | 想要为了温饱而工作吧 34 | 你在这里 35 | 我的梳子好像掉在这附近 36 | 你还是一样迷迷糊糊 37 | 坐吧,我陪你聊聊天 38 | 天气不错 39 | 去赏花如何? 
40 | 等你真的想去再说 41 | -------------------------------------------------------------------------------- /bench/data/opensubtitles2018-zh-tiny-utf8.txt: -------------------------------------------------------------------------------- 1 | 你突然来信说最近要搬到这里 2 | -------------------------------------------------------------------------------- /bench/data/repeated-rare-small: -------------------------------------------------------------------------------- 1 | zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz 2 | -------------------------------------------------------------------------------- /bench/data/sherlock-holmes-small-ascii.txt: -------------------------------------------------------------------------------- 1 | Mr. Sherlock Holmes, who was usually very late in the mornings, save 2 | upon those not infrequent occasions when he was up all night, was seated 3 | at the breakfast table. I stood upon the hearth-rug and picked up the 4 | stick which our visitor had left behind him the night before. It was a 5 | fine, thick piece of wood, bulbous-headed, of the sort which is known as 6 | a "Penang lawyer." Just under the head was a broad silver band nearly 7 | an inch across. "To James Mortimer, M.R.C.S., from his friends of the 8 | C.C.H.," was engraved upon it, with the date "1884." It was just such a 9 | stick as the old-fashioned family practitioner used to carry--dignified, 10 | solid, and reassuring. 11 | -------------------------------------------------------------------------------- /bench/data/sherlock-holmes-tiny-ascii.txt: -------------------------------------------------------------------------------- 1 | Mr. Sherlock Holmes, who was usually very late in the mornings, save 2 | -------------------------------------------------------------------------------- /bench/src/bench.rs: -------------------------------------------------------------------------------- 1 | use bstr::{ByteSlice, B}; 2 | use criterion::{ 3 | criterion_group, criterion_main, Bencher, Criterion, Throughput, 4 | }; 5 | 6 | use crate::inputs::*; 7 | 8 | mod inputs; 9 | mod search; 10 | 11 | // All benchmark corpora up to and including "huge" inputs. 12 | // 13 | // "huge" inputs are about 500KB. "small" inputs are about 1KB. "tiny" inputs 14 | // are under 100 bytes. 
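//
// How these tables are consumed (an illustrative sketch, not part of the
// original suite): each benchmark function below loops over a table and
// registers one Criterion benchmark per (name, corpus) pair via the `define`
// helper at the bottom of this file, which also reports byte throughput. A
// hypothetical new benchmark would follow the same shape:
//
//     fn count_lines(c: &mut Criterion) {
//         for &(name, corpus) in CORPORA_HUGE {
//             define(c, "bstr/count_lines", name, corpus, move |b| {
//                 b.iter(|| {
//                     // `lines` comes from bstr's `ByteSlice` extension trait.
//                     assert!(corpus.lines().count() > 0);
//                 });
//             });
//         }
//     }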
15 | const CORPORA_HUGE: &'static [(&'static str, &'static [u8])] = &[
16 |     ("en-huge-ascii", SUBTITLE_EN_HUGE),
17 |     ("en-small-ascii", SUBTITLE_EN_SMALL),
18 |     ("en-tiny-ascii", SUBTITLE_EN_TINY),
19 |     ("ru-huge-utf8", SUBTITLE_RU_HUGE),
20 |     ("ru-small-utf8", SUBTITLE_RU_SMALL),
21 |     ("ru-tiny-utf8", SUBTITLE_RU_TINY),
22 |     ("zh-huge-utf8", SUBTITLE_ZH_HUGE),
23 |     ("zh-small-utf8", SUBTITLE_ZH_SMALL),
24 |     ("zh-tiny-utf8", SUBTITLE_ZH_TINY),
25 | ];
26 | 
27 | // All benchmark corpora up to and including "small" inputs. This does not
28 | // include huge inputs. This is useful for benchmarks that take longer, or if
29 | // it isn't useful to benchmark larger inputs.
30 | //
31 | // "huge" inputs are about 500KB. "small" inputs are about 1KB. "tiny" inputs
32 | // are under 100 bytes.
33 | const CORPORA_SMALL: &'static [(&'static str, &'static [u8])] = &[
34 |     ("en-small-ascii", SUBTITLE_EN_SMALL),
35 |     ("en-tiny-ascii", SUBTITLE_EN_TINY),
36 |     ("ru-small-utf8", SUBTITLE_RU_SMALL),
37 |     ("ru-tiny-utf8", SUBTITLE_RU_TINY),
38 |     ("zh-small-utf8", SUBTITLE_ZH_SMALL),
39 |     ("zh-tiny-utf8", SUBTITLE_ZH_TINY),
40 | ];
41 | 
42 | fn is_ascii(c: &mut Criterion) {
43 |     let corpus = SHERLOCK_HUGE;
44 |     define(c, "is_ascii", "huge-ascii", corpus, move |b| {
45 |         b.iter(|| {
46 |             assert!(corpus.is_ascii());
47 |         });
48 |     });
49 | 
50 |     let corpus = SHERLOCK_SMALL;
51 |     define(c, "is_ascii", "small-ascii", corpus, move |b| {
52 |         b.iter(|| {
53 |             assert!(corpus.is_ascii());
54 |         });
55 |     });
56 | 
57 |     let corpus = SHERLOCK_TINY;
58 |     define(c, "is_ascii", "tiny-ascii", corpus, move |b| {
59 |         b.iter(|| {
60 |             assert!(corpus.is_ascii());
61 |         });
62 |     });
63 | 
64 |     let corpus = EMPTY;
65 |     define(c, "is_ascii", "empty-ascii", corpus, move |b| {
66 |         b.iter(|| {
67 |             assert!(corpus.is_ascii());
68 |         });
69 |     });
70 | 
71 |     let corpus = "abcdefghijklm☃abcdefghijklmnopqrstuvwxyz".as_bytes();
72 |     define(c, "is_ascii", "tiny-non-ascii", corpus, move |b| {
73 |         b.iter(|| {
74 |             assert!(!corpus.is_ascii());
75 |         });
76 |     });
77 | }
78 | 
79 | fn to_str(c: &mut Criterion) {
80 |     // benchmark our impl
81 |     for &(name, corpus) in CORPORA_HUGE {
82 |         define(c, "bstr/to_str", name, corpus, move |b| {
83 |             b.iter(|| {
84 |                 assert!(corpus.to_str().is_ok());
85 |             });
86 |         });
87 |     }
88 |     // benchmark std's impl
89 |     for &(name, corpus) in CORPORA_HUGE {
90 |         define(c, "std/to_str", name, corpus, move |b| {
91 |             use std::str;
92 | 
93 |             b.iter(|| {
94 |                 assert!(str::from_utf8(corpus).is_ok());
95 |             });
96 |         });
97 |     }
98 | }
99 | 
100 | fn to_str_lossy_valid(c: &mut Criterion) {
101 |     // benchmark our impl
102 |     for &(name, corpus) in CORPORA_HUGE {
103 |         define(c, "bstr/to_str_lossy_valid", name, corpus, move |b| {
104 |             b.iter(|| {
105 |                 assert!(corpus.to_str_lossy().len() > 0);
106 |             });
107 |         });
108 |     }
109 |     // benchmark std's impl
110 |     for &(name, corpus) in CORPORA_HUGE {
111 |         define(c, "std/to_str_lossy_valid", name, corpus, move |b| {
112 |             b.iter(|| {
113 |                 assert!(String::from_utf8_lossy(corpus).len() > 0);
114 |             });
115 |         });
116 |     }
117 | }
118 | 
119 | fn trim(c: &mut Criterion) {
120 |     let corpus = "\u{2007}\t\n\u{200a}foo\tbar\t\t\t\t\n \t\u{2002}";
121 | 
122 |     // benchmark our impl
123 |     define(c, "bstr/trim", "tiny", corpus.as_bytes(), move |b| {
124 |         b.iter(|| {
125 |             assert_eq!("foo\tbar".as_bytes(), B(corpus).trim());
126 |         });
127 |     });
128 | 
129 |     // benchmark std's impl
130 |     define(c, "std/trim", "tiny", corpus.as_bytes(), move |b| {
131 |         b.iter(|| {
132 |             assert_eq!("foo\tbar",
corpus.trim()); 133 | }); 134 | }); 135 | } 136 | 137 | fn chars(c: &mut Criterion) { 138 | // benchmark our impl 139 | for &(name, corpus) in CORPORA_HUGE { 140 | define(c, "bstr/chars", name, corpus, move |b| { 141 | b.iter(|| { 142 | let mut count = 0; 143 | for ch in corpus.chars() { 144 | count += ch.len_utf8(); 145 | } 146 | assert!(count > 0); 147 | }); 148 | }); 149 | } 150 | // benchmark std's impl 151 | for &(name, corpus) in CORPORA_HUGE { 152 | define(c, "std/chars", name, corpus, move |b| { 153 | use std::str; 154 | 155 | let corpus = str::from_utf8(corpus).unwrap(); 156 | b.iter(|| { 157 | let mut count = 0; 158 | for ch in corpus.chars() { 159 | count += ch.len_utf8(); 160 | } 161 | assert!(count > 0); 162 | }); 163 | }); 164 | } 165 | } 166 | 167 | fn graphemes(c: &mut Criterion) { 168 | // benchmark our impl 169 | for &(name, corpus) in CORPORA_SMALL { 170 | define(c, "bstr/graphemes", name, corpus, move |b| { 171 | b.iter(|| { 172 | let mut count = 0; 173 | for g in corpus.graphemes() { 174 | count += g.len(); 175 | } 176 | assert!(count > 0); 177 | }); 178 | }); 179 | } 180 | // benchmark unicode-segmentation impl 181 | for &(name, corpus) in CORPORA_SMALL { 182 | define(c, "unicode-segmentation/graphemes", name, corpus, move |b| { 183 | use std::str; 184 | use unicode_segmentation::UnicodeSegmentation; 185 | 186 | let corpus = str::from_utf8(corpus).unwrap(); 187 | b.iter(|| { 188 | let mut count = 0; 189 | for g in corpus.graphemes(true) { 190 | count += g.len(); 191 | } 192 | assert!(count > 0); 193 | }); 194 | }); 195 | } 196 | } 197 | 198 | fn words(c: &mut Criterion) { 199 | // benchmark our impl 200 | for &(name, corpus) in CORPORA_SMALL { 201 | define(c, "bstr/words", name, corpus, move |b| { 202 | b.iter(|| { 203 | let mut count = 0; 204 | for g in corpus.words() { 205 | count += g.len(); 206 | } 207 | assert!(count > 0); 208 | }); 209 | }); 210 | } 211 | // benchmark unicode-segmentation impl 212 | for &(name, corpus) in CORPORA_SMALL { 213 | define(c, "unicode-segmentation/words", name, corpus, move |b| { 214 | use std::str; 215 | use unicode_segmentation::UnicodeSegmentation; 216 | 217 | let corpus = str::from_utf8(corpus).unwrap(); 218 | b.iter(|| { 219 | let mut count = 0; 220 | for g in corpus.unicode_words() { 221 | count += g.len(); 222 | } 223 | assert!(count > 0); 224 | }); 225 | }); 226 | } 227 | } 228 | 229 | fn sentences(c: &mut Criterion) { 230 | // benchmark our impl 231 | for &(name, corpus) in CORPORA_SMALL { 232 | define(c, "bstr/sentences", name, corpus, move |b| { 233 | b.iter(|| { 234 | let mut count = 0; 235 | for g in corpus.sentences() { 236 | count += g.len(); 237 | } 238 | assert!(count > 0); 239 | }); 240 | }); 241 | } 242 | } 243 | 244 | fn byte_lines(c: &mut Criterion) { 245 | use bstr::io::BufReadExt; 246 | 247 | let corpus = SUBTITLE_EN_HUGE; 248 | define(c, "bstr/for_byte_line", "ascii", corpus, move |b| { 249 | b.iter(|| { 250 | let mut corpus = corpus; 251 | let mut count = 0; 252 | corpus 253 | .for_byte_line(|line| { 254 | count += line.len(); 255 | Ok(true) 256 | }) 257 | .unwrap(); 258 | assert!(count > 0); 259 | }); 260 | }); 261 | } 262 | 263 | fn define( 264 | c: &mut Criterion, 265 | group_name: &str, 266 | bench_name: &str, 267 | corpus: &[u8], 268 | bench: impl FnMut(&mut Bencher<'_>) + 'static, 269 | ) { 270 | let mut group = c.benchmark_group(group_name); 271 | group.throughput(Throughput::Bytes(corpus.len() as u64)); 272 | group.bench_function(bench_name, bench); 273 | group.finish(); 274 | } 275 | 276 | 
criterion_group!(g1, is_ascii); 277 | criterion_group!(g2, to_str); 278 | criterion_group!(g3, to_str_lossy_valid); 279 | criterion_group!(g4, trim); 280 | criterion_group!(g5, chars); 281 | criterion_group!(g6, graphemes); 282 | criterion_group!(g7, words); 283 | criterion_group!(g8, sentences); 284 | criterion_group!(g9, byte_lines); 285 | criterion_group!(g10, search::find_iter); 286 | criterion_group!(g11, search::rfind_iter); 287 | criterion_group!(g12, search::find_char); 288 | criterion_group!(g13, search::find_byteset); 289 | criterion_group!(g14, search::find_not_byteset); 290 | criterion_main!(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14); 291 | -------------------------------------------------------------------------------- /bench/src/inputs.rs: -------------------------------------------------------------------------------- 1 | pub const EMPTY: &'static [u8] = b""; 2 | 3 | pub const SHERLOCK_HUGE: &'static [u8] = 4 | include_bytes!("../data/sherlock-holmes-huge-ascii.txt"); 5 | pub const SHERLOCK_SMALL: &'static [u8] = 6 | include_bytes!("../data/sherlock-holmes-small-ascii.txt"); 7 | pub const SHERLOCK_TINY: &'static [u8] = 8 | include_bytes!("../data/sherlock-holmes-tiny-ascii.txt"); 9 | 10 | pub const SUBTITLE_EN_HUGE: &'static [u8] = 11 | include_bytes!("../data/opensubtitles2018-en-huge-ascii.txt"); 12 | pub const SUBTITLE_EN_SMALL: &'static [u8] = 13 | include_bytes!("../data/opensubtitles2018-en-small-ascii.txt"); 14 | pub const SUBTITLE_EN_TINY: &'static [u8] = 15 | include_bytes!("../data/opensubtitles2018-en-tiny-ascii.txt"); 16 | 17 | pub const SUBTITLE_RU_HUGE: &'static [u8] = 18 | include_bytes!("../data/opensubtitles2018-ru-huge-utf8.txt"); 19 | pub const SUBTITLE_RU_SMALL: &'static [u8] = 20 | include_bytes!("../data/opensubtitles2018-ru-small-utf8.txt"); 21 | pub const SUBTITLE_RU_TINY: &'static [u8] = 22 | include_bytes!("../data/opensubtitles2018-ru-tiny-utf8.txt"); 23 | 24 | pub const SUBTITLE_ZH_HUGE: &'static [u8] = 25 | include_bytes!("../data/opensubtitles2018-zh-huge-utf8.txt"); 26 | pub const SUBTITLE_ZH_SMALL: &'static [u8] = 27 | include_bytes!("../data/opensubtitles2018-zh-small-utf8.txt"); 28 | pub const SUBTITLE_ZH_TINY: &'static [u8] = 29 | include_bytes!("../data/opensubtitles2018-zh-tiny-utf8.txt"); 30 | 31 | pub const REPEATED_RARE_HUGE: &'static [u8] = 32 | include_bytes!("../data/repeated-rare-huge"); 33 | pub const REPEATED_RARE_SMALL: &'static [u8] = 34 | include_bytes!("../data/repeated-rare-small"); 35 | -------------------------------------------------------------------------------- /bench/src/lib.rs: -------------------------------------------------------------------------------- 1 | // This is purposely empty. See src/bench.rs instead. We use src/bench.rs 2 | // to avoid including the same file in multiple build targets. 
3 | -------------------------------------------------------------------------------- /bench/src/search.rs: -------------------------------------------------------------------------------- 1 | use std::str; 2 | 3 | use bstr::ByteSlice; 4 | use criterion::Criterion; 5 | 6 | use crate::define; 7 | use crate::inputs::*; 8 | 9 | pub fn find_iter(c: &mut Criterion) { 10 | define_find_iter( 11 | c, 12 | "find/rare", 13 | "en-huge-ascii", 14 | SUBTITLE_EN_HUGE, 15 | "Sherlock Holmes", 16 | 1, 17 | ); 18 | define_find_iter( 19 | c, 20 | "find/verycommon1", 21 | "en-huge-ascii", 22 | SUBTITLE_EN_HUGE, 23 | " ", 24 | 76792, 25 | ); 26 | define_find_iter( 27 | c, 28 | "find/verycommon2", 29 | "en-huge-ascii", 30 | SUBTITLE_EN_HUGE, 31 | " ", 32 | 0, 33 | ); 34 | 35 | define_find_iter( 36 | c, 37 | "find/rare", 38 | "en-small-ascii", 39 | SUBTITLE_EN_SMALL, 40 | "IM Pictures", 41 | 1, 42 | ); 43 | define_find_iter( 44 | c, 45 | "find/verycommon1", 46 | "en-small-ascii", 47 | SUBTITLE_EN_SMALL, 48 | " ", 49 | 155, 50 | ); 51 | define_find_iter( 52 | c, 53 | "find/verycommon2", 54 | "en-small-ascii", 55 | SUBTITLE_EN_SMALL, 56 | " ", 57 | 0, 58 | ); 59 | 60 | define_find_iter( 61 | c, 62 | "find/verycommon1", 63 | "en-tiny-ascii", 64 | SUBTITLE_EN_TINY, 65 | " ", 66 | 5, 67 | ); 68 | define_find_iter( 69 | c, 70 | "find/verycommon2", 71 | "en-tiny-ascii", 72 | SUBTITLE_EN_TINY, 73 | " ", 74 | 0, 75 | ); 76 | 77 | define_find_iter( 78 | c, 79 | "find/pathological", 80 | "repeated-huge", 81 | REPEATED_RARE_HUGE, 82 | "abczdef", 83 | 0, 84 | ); 85 | define_find_iter( 86 | c, 87 | "find/pathological", 88 | "repeated-small", 89 | REPEATED_RARE_SMALL, 90 | "abczdef", 91 | 0, 92 | ); 93 | } 94 | 95 | pub fn rfind_iter(c: &mut Criterion) { 96 | define_rfind_iter( 97 | c, 98 | "rfind/rare", 99 | "en-huge-ascii", 100 | SUBTITLE_EN_HUGE, 101 | "Sherlock Holmes", 102 | 1, 103 | ); 104 | define_rfind_iter( 105 | c, 106 | "rfind/verycommon1", 107 | "en-huge-ascii", 108 | SUBTITLE_EN_HUGE, 109 | " ", 110 | 76792, 111 | ); 112 | define_rfind_iter( 113 | c, 114 | "rfind/verycommon2", 115 | "en-huge-ascii", 116 | SUBTITLE_EN_HUGE, 117 | " ", 118 | 0, 119 | ); 120 | 121 | define_rfind_iter( 122 | c, 123 | "rfind/rare", 124 | "en-small-ascii", 125 | SUBTITLE_EN_SMALL, 126 | "IM Pictures", 127 | 1, 128 | ); 129 | define_rfind_iter( 130 | c, 131 | "rfind/verycommon1", 132 | "en-small-ascii", 133 | SUBTITLE_EN_SMALL, 134 | " ", 135 | 155, 136 | ); 137 | define_rfind_iter( 138 | c, 139 | "rfind/verycommon2", 140 | "en-small-ascii", 141 | SUBTITLE_EN_SMALL, 142 | " ", 143 | 0, 144 | ); 145 | 146 | define_rfind_iter( 147 | c, 148 | "rfind/verycommon1", 149 | "en-tiny-ascii", 150 | SUBTITLE_EN_TINY, 151 | " ", 152 | 5, 153 | ); 154 | define_rfind_iter( 155 | c, 156 | "rfind/verycommon2", 157 | "en-tiny-ascii", 158 | SUBTITLE_EN_TINY, 159 | " ", 160 | 0, 161 | ); 162 | 163 | define_rfind_iter( 164 | c, 165 | "rfind/pathological", 166 | "repeated-huge", 167 | REPEATED_RARE_HUGE, 168 | "abczdef", 169 | 0, 170 | ); 171 | define_rfind_iter( 172 | c, 173 | "rfind/pathological", 174 | "repeated-small", 175 | REPEATED_RARE_SMALL, 176 | "abczdef", 177 | 0, 178 | ); 179 | } 180 | 181 | pub fn find_char(c: &mut Criterion) { 182 | let corpus = str::from_utf8(SUBTITLE_EN_HUGE).unwrap(); 183 | define( 184 | c, 185 | "bstr/find_char", 186 | "en-huge-ascii", 187 | corpus.as_bytes(), 188 | move |b| { 189 | let corpus = corpus.as_bytes(); 190 | b.iter(|| { 191 | assert_eq!(None, corpus.find_char('γ')); 192 | }); 193 | }, 194 | ); 195 | 196 | 
define(c, "std/find_char", "en-huge-ascii", corpus.as_bytes(), move |b| { 197 | b.iter(|| { 198 | assert_eq!(None, corpus.find('γ')); 199 | }); 200 | }); 201 | } 202 | 203 | pub fn find_byteset(c: &mut Criterion) { 204 | let corpus = SUBTITLE_EN_SMALL; 205 | define(c, "bstr/find_byteset/1", "en-small-ascii", corpus, move |b| { 206 | let corpus = corpus.as_bytes(); 207 | b.iter(|| { 208 | assert_eq!(None, corpus.find_byteset(b"\0")); 209 | }); 210 | }); 211 | define(c, "bstr/find_byteset/2", "en-small-ascii", corpus, move |b| { 212 | let corpus = corpus.as_bytes(); 213 | b.iter(|| { 214 | assert_eq!(None, corpus.find_byteset(b"\0\xff")); 215 | }); 216 | }); 217 | define(c, "bstr/find_byteset/3", "en-small-ascii", corpus, move |b| { 218 | let corpus = corpus.as_bytes(); 219 | b.iter(|| { 220 | assert_eq!(None, corpus.find_byteset(b"\0\xff\xee")); 221 | }); 222 | }); 223 | define(c, "bstr/find_byteset/4", "en-small-ascii", corpus, move |b| { 224 | let corpus = corpus.as_bytes(); 225 | b.iter(|| { 226 | assert_eq!(None, corpus.find_byteset(b"\0\xff\xee\xdd")); 227 | }); 228 | }); 229 | define(c, "bstr/find_byteset/10", "en-small-ascii", corpus, move |b| { 230 | let corpus = corpus.as_bytes(); 231 | b.iter(|| { 232 | assert_eq!(None, corpus.find_byteset(b"0123456789")); 233 | }); 234 | }); 235 | 236 | define(c, "bstr/rfind_byteset/1", "en-small-ascii", corpus, move |b| { 237 | let corpus = corpus.as_bytes(); 238 | b.iter(|| { 239 | assert_eq!(None, corpus.rfind_byteset(b"\0")); 240 | }); 241 | }); 242 | define(c, "bstr/rfind_byteset/2", "en-small-ascii", corpus, move |b| { 243 | let corpus = corpus.as_bytes(); 244 | b.iter(|| { 245 | assert_eq!(None, corpus.rfind_byteset(b"\0\xff")); 246 | }); 247 | }); 248 | define(c, "bstr/rfind_byteset/3", "en-small-ascii", corpus, move |b| { 249 | let corpus = corpus.as_bytes(); 250 | b.iter(|| { 251 | assert_eq!(None, corpus.rfind_byteset(b"\0\xff\xee")); 252 | }); 253 | }); 254 | define(c, "bstr/rfind_byteset/4", "en-small-ascii", corpus, move |b| { 255 | let corpus = corpus.as_bytes(); 256 | b.iter(|| { 257 | assert_eq!(None, corpus.rfind_byteset(b"\0\xff\xee\xdd")); 258 | }); 259 | }); 260 | define(c, "bstr/rfind_byteset/10", "en-small-ascii", corpus, move |b| { 261 | let corpus = corpus.as_bytes(); 262 | b.iter(|| { 263 | assert_eq!(None, corpus.rfind_byteset(b"0123456789")); 264 | }); 265 | }); 266 | } 267 | 268 | pub fn find_not_byteset(c: &mut Criterion) { 269 | let corpus = REPEATED_RARE_SMALL; 270 | define( 271 | c, 272 | "bstr/find_not_byteset/1", 273 | "repeated-rare-small", 274 | corpus, 275 | move |b| { 276 | let corpus = corpus.as_bytes(); 277 | b.iter(|| { 278 | assert_eq!(Some(1000), corpus.find_not_byteset(b"z")); 279 | }) 280 | }, 281 | ); 282 | define( 283 | c, 284 | "bstr/find_not_byteset/2", 285 | "repeated-rare-small", 286 | corpus, 287 | move |b| { 288 | let corpus = corpus.as_bytes(); 289 | b.iter(|| { 290 | assert_eq!(Some(1000), corpus.find_not_byteset(b"zy")); 291 | }); 292 | }, 293 | ); 294 | define( 295 | c, 296 | "bstr/find_not_byteset/3", 297 | "repeated-rare-small", 298 | corpus, 299 | move |b| { 300 | let corpus = corpus.as_bytes(); 301 | b.iter(|| { 302 | assert_eq!(Some(1000), corpus.find_not_byteset(b"zyx")); 303 | }); 304 | }, 305 | ); 306 | define( 307 | c, 308 | "bstr/find_not_byteset/4", 309 | "repeated-rare-small", 310 | corpus, 311 | move |b| { 312 | let corpus = corpus.as_bytes(); 313 | b.iter(|| { 314 | assert_eq!(Some(1000), corpus.find_not_byteset(b"zyxw")); 315 | }); 316 | }, 317 | ); 318 | define( 319 | c, 
320 | "bstr/find_not_byteset/10", 321 | "repeated-rare-small", 322 | corpus, 323 | move |b| { 324 | let corpus = corpus.as_bytes(); 325 | b.iter(|| { 326 | assert_eq!(Some(1000), corpus.find_not_byteset(b"zyxwv12345")); 327 | }); 328 | }, 329 | ); 330 | 331 | define( 332 | c, 333 | "bstr/rfind_not_byteset/1", 334 | "repeated-rare-small", 335 | corpus, 336 | move |b| { 337 | // This file ends in \n, breaking our benchmark.... TODO find a 338 | // better dataset... 339 | let corpus = &corpus.as_bytes()[..(corpus.len() - 1)]; 340 | b.iter(|| { 341 | assert_eq!(None, corpus.rfind_not_byteset(b"z")); 342 | }); 343 | }, 344 | ); 345 | define( 346 | c, 347 | "bstr/rfind_not_byteset/2", 348 | "repeated-rare-small", 349 | corpus, 350 | move |b| { 351 | let corpus = corpus.as_bytes(); 352 | b.iter(|| { 353 | assert_eq!(None, corpus.rfind_not_byteset(b"z\n")); 354 | }); 355 | }, 356 | ); 357 | define( 358 | c, 359 | "bstr/rfind_not_byteset/3", 360 | "repeated-rare-small", 361 | corpus, 362 | move |b| { 363 | let corpus = corpus.as_bytes(); 364 | b.iter(|| { 365 | assert_eq!(None, corpus.rfind_not_byteset(b"zy\n")); 366 | }); 367 | }, 368 | ); 369 | define( 370 | c, 371 | "bstr/rfind_not_byteset/4", 372 | "repeated-rare-small", 373 | corpus, 374 | move |b| { 375 | let corpus = corpus.as_bytes(); 376 | b.iter(|| { 377 | assert_eq!(None, corpus.rfind_not_byteset(b"zyx\n")); 378 | }); 379 | }, 380 | ); 381 | define( 382 | c, 383 | "bstr/rfind_not_byteset/10", 384 | "repeated-rare-small", 385 | corpus, 386 | move |b| { 387 | let corpus = corpus.as_bytes(); 388 | b.iter(|| { 389 | assert_eq!(None, corpus.rfind_not_byteset(b"zyxwv1234\n")); 390 | }); 391 | }, 392 | ); 393 | } 394 | 395 | fn define_find_iter( 396 | c: &mut Criterion, 397 | group_name: &str, 398 | bench_name: &str, 399 | corpus: &'static [u8], 400 | needle: &'static str, 401 | expected: usize, 402 | ) { 403 | let corpus = str::from_utf8(corpus).unwrap(); 404 | 405 | let name = format!("bstr/{}", group_name); 406 | define(c, &name, bench_name, corpus.as_bytes(), move |b| { 407 | let corpus = corpus.as_bytes(); 408 | b.iter(|| { 409 | assert_eq!(expected, corpus.find_iter(needle).count()); 410 | }); 411 | }); 412 | 413 | let name = format!("std/{}", group_name); 414 | define(c, &name, bench_name, corpus.as_bytes(), move |b| { 415 | b.iter(|| { 416 | assert_eq!(expected, corpus.matches(needle).count()); 417 | }); 418 | }); 419 | } 420 | 421 | fn define_rfind_iter( 422 | c: &mut Criterion, 423 | group_name: &str, 424 | bench_name: &str, 425 | corpus: &'static [u8], 426 | needle: &'static str, 427 | expected: usize, 428 | ) { 429 | let corpus = str::from_utf8(corpus).unwrap(); 430 | 431 | let name = format!("bstr/{}", group_name); 432 | define(c, &name, bench_name, corpus.as_bytes(), move |b| { 433 | let corpus = corpus.as_bytes(); 434 | b.iter(|| { 435 | assert_eq!(expected, corpus.rfind_iter(needle).count()); 436 | }); 437 | }); 438 | 439 | let name = format!("std/{}", group_name); 440 | define(c, &name, bench_name, corpus.as_bytes(), move |b| { 441 | b.iter(|| { 442 | assert_eq!(expected, corpus.rmatches(needle).count()); 443 | }); 444 | }); 445 | } 446 | -------------------------------------------------------------------------------- /examples/graphemes-std.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use std::io::{self, BufRead, Write}; 3 | 4 | use unicode_segmentation::UnicodeSegmentation; 5 | 6 | fn main() -> Result<(), Box> { 7 | let stdin = io::stdin(); 8 | let mut 
stdin = stdin.lock();
9 | let mut stdout = io::BufWriter::new(io::stdout());
10 |
11 | let mut line = String::new();
12 | while stdin.read_line(&mut line)? > 0 {
13 | let end = line
14 | .grapheme_indices(true)
15 | .map(|(start, g)| start + g.len())
16 | .take(10)
17 | .last()
18 | .unwrap_or(line.len());
19 | stdout.write_all(line[..end].trim_end().as_bytes())?;
20 | stdout.write_all(b"\n")?;
21 |
22 | line.clear();
23 | }
24 | Ok(())
25 | }
26 | -------------------------------------------------------------------------------- /examples/graphemes.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, Write};
3 |
4 | use bstr::{io::BufReadExt, ByteSlice};
5 |
6 | fn main() -> Result<(), Box<dyn Error>> {
7 | let stdin = io::stdin();
8 | let mut stdout = io::BufWriter::new(io::stdout());
9 |
10 | stdin.lock().for_byte_line_with_terminator(|line| {
11 | let end = line
12 | .grapheme_indices()
13 | .map(|(_, end, _)| end)
14 | .take(10)
15 | .last()
16 | .unwrap_or(line.len());
17 | stdout.write_all(line[..end].trim_end())?;
18 | stdout.write_all(b"\n")?;
19 | Ok(true)
20 | })?;
21 | Ok(())
22 | }
23 | -------------------------------------------------------------------------------- /examples/lines-std.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, BufRead, Write};
3 |
4 | fn main() -> Result<(), Box<dyn Error>> {
5 | let stdin = io::stdin();
6 | let mut stdin = stdin.lock();
7 | let mut stdout = io::BufWriter::new(io::stdout());
8 |
9 | let mut line = String::new();
10 | while stdin.read_line(&mut line)? > 0 {
11 | if line.contains("Dimension") {
12 | stdout.write_all(line.as_bytes())?;
13 | }
14 | line.clear();
15 | }
16 | Ok(())
17 | }
18 | -------------------------------------------------------------------------------- /examples/lines.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, Write};
3 |
4 | use bstr::{io::BufReadExt, ByteSlice};
5 |
6 | fn main() -> Result<(), Box<dyn Error>> {
7 | let stdin = io::stdin();
8 | let mut stdout = io::BufWriter::new(io::stdout());
9 |
10 | stdin.lock().for_byte_line_with_terminator(|line| {
11 | if line.contains_str("Dimension") {
12 | stdout.write_all(line)?;
13 | }
14 | Ok(true)
15 | })?;
16 | Ok(())
17 | }
18 | -------------------------------------------------------------------------------- /examples/uppercase-std.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, BufRead, Write};
3 |
4 | fn main() -> Result<(), Box<dyn Error>> {
5 | let stdin = io::stdin();
6 | let mut stdin = stdin.lock();
7 | let mut stdout = io::BufWriter::new(io::stdout());
8 |
9 | let mut line = String::new();
10 | while stdin.read_line(&mut line)?
> 0 {
11 | stdout.write_all(line.to_uppercase().as_bytes())?;
12 | line.clear();
13 | }
14 | Ok(())
15 | }
16 | -------------------------------------------------------------------------------- /examples/uppercase.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, Write};
3 |
4 | use bstr::{io::BufReadExt, ByteSlice};
5 |
6 | fn main() -> Result<(), Box<dyn Error>> {
7 | let stdin = io::stdin();
8 | let mut stdout = io::BufWriter::new(io::stdout());
9 |
10 | let mut upper = vec![];
11 | stdin.lock().for_byte_line_with_terminator(|line| {
12 | upper.clear();
13 | line.to_uppercase_into(&mut upper);
14 | stdout.write_all(&upper)?;
15 | Ok(true)
16 | })?;
17 | Ok(())
18 | }
19 | -------------------------------------------------------------------------------- /examples/words-std.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io::{self, BufRead};
3 |
4 | use unicode_segmentation::UnicodeSegmentation;
5 |
6 | fn main() -> Result<(), Box<dyn Error>> {
7 | let stdin = io::stdin();
8 | let mut stdin = stdin.lock();
9 |
10 | let mut words = 0;
11 | let mut line = String::new();
12 | while stdin.read_line(&mut line)? > 0 {
13 | words += line.unicode_words().count();
14 | line.clear();
15 | }
16 | println!("{}", words);
17 | Ok(())
18 | }
19 | -------------------------------------------------------------------------------- /examples/words.rs: --------------------------------------------------------------------------------
1 | use std::error::Error;
2 | use std::io;
3 |
4 | use bstr::{io::BufReadExt, ByteSlice};
5 |
6 | fn main() -> Result<(), Box<dyn Error>> {
7 | let stdin = io::stdin();
8 | let mut words = 0;
9 | stdin.lock().for_byte_line_with_terminator(|line| {
10 | words += line.words().count();
11 | Ok(true)
12 | })?;
13 | println!("{}", words);
14 | Ok(())
15 | }
16 | -------------------------------------------------------------------------------- /rustfmt.toml: --------------------------------------------------------------------------------
1 | max_width = 79
2 | use_small_heuristics = "max"
3 | -------------------------------------------------------------------------------- /scripts/generate-unicode-data: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -e
4 | D="$(dirname "$0")"
5 |
6 | # Convenience function for checking that a command exists.
7 | requires() {
8 | cmd="$1"
9 | if ! command -v "$cmd" > /dev/null 2>&1; then
10 | echo "DEPENDENCY MISSING: $cmd must be installed" >&2
11 | exit 1
12 | fi
13 | }
14 |
15 | # Test if an array ($2) contains a particular element ($1).
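# For example (illustrative): `array_exists b a b c` exits 0 (found), while
# `array_exists z a b c` exits 1 (not found).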
16 | array_exists() {
17 | needle="$1"
18 | shift
19 |
20 | for el in "$@"; do
21 | if [ "$el" = "$needle" ]; then
22 | return 0
23 | fi
24 | done
25 | return 1
26 | }
27 |
28 | graphemes() {
29 | regex="$(sh "$D/regex/grapheme.sh")"
30 |
31 | echo "generating forward grapheme DFA"
32 | regex-cli generate serialize sparse dfa \
33 | --minimize \
34 | --start-kind anchored \
35 | --shrink \
36 | --rustfmt \
37 | --safe \
38 | GRAPHEME_BREAK_FWD \
39 | src/unicode/fsm/ \
40 | "$regex"
41 |
42 | echo "generating reverse grapheme DFA"
43 | regex-cli generate serialize sparse dfa \
44 | --minimize \
45 | --start-kind anchored \
46 | --reverse \
47 | --match-kind all \
48 | --no-captures \
49 | --shrink \
50 | --rustfmt \
51 | --safe \
52 | GRAPHEME_BREAK_REV \
53 | src/unicode/fsm/ \
54 | "$regex"
55 | }
56 |
57 | words() {
58 | regex="$(sh "$D/regex/word.sh")"
59 |
60 | echo "generating forward word DFA (this can take a while)"
61 | regex-cli generate serialize sparse dfa \
62 | --minimize \
63 | --start-kind anchored \
64 | --shrink \
65 | --rustfmt \
66 | --safe \
67 | WORD_BREAK_FWD \
68 | src/unicode/fsm/ \
69 | "$regex"
70 | }
71 |
72 | sentences() {
73 | regex="$(sh "$D/regex/sentence.sh")"
74 |
75 | echo "generating forward sentence DFA (this can take a while)"
76 | regex-cli generate serialize sparse dfa \
77 | --minimize \
78 | --start-kind anchored \
79 | --shrink \
80 | --rustfmt \
81 | --safe \
82 | SENTENCE_BREAK_FWD \
83 | src/unicode/fsm/ \
84 | "$regex"
85 | }
86 |
87 | regional_indicator() {
88 | # For finding all occurrences of region indicators. This is used to handle
89 | # regional indicators as a special case for the reverse grapheme iterator
90 | # and the reverse word iterator.
91 | echo "generating regional indicator DFA"
92 | regex-cli generate serialize dense dfa \
93 | --minimize \
94 | --start-kind anchored \
95 | --reverse \
96 | --no-captures \
97 | --shrink \
98 | --rustfmt \
99 | --safe \
100 | REGIONAL_INDICATOR_REV \
101 | src/unicode/fsm/ \
102 | "\p{gcb=Regional_Indicator}"
103 | }
104 |
105 | simple_word() {
106 | echo "generating forward simple word DFA"
107 | regex-cli generate serialize sparse dfa \
108 | --minimize \
109 | --start-kind anchored \
110 | --shrink \
111 | --rustfmt \
112 | --safe \
113 | SIMPLE_WORD_FWD \
114 | src/unicode/fsm/ \
115 | "\w"
116 | }
117 |
118 | whitespace() {
119 | echo "generating forward whitespace DFA"
120 | regex-cli generate serialize dense dfa \
121 | --minimize \
122 | --start-kind anchored \
123 | --shrink \
124 | --rustfmt \
125 | --safe \
126 | WHITESPACE_ANCHORED_FWD \
127 | src/unicode/fsm/ \
128 | "\s+"
129 |
130 | echo "generating reverse whitespace DFA"
131 | regex-cli generate serialize dense dfa \
132 | --minimize \
133 | --start-kind anchored \
134 | --reverse \
135 | --no-captures \
136 | --shrink \
137 | --rustfmt \
138 | --safe \
139 | WHITESPACE_ANCHORED_REV \
140 | src/unicode/fsm/ \
141 | "\s+"
142 | }
143 |
144 | main() {
145 | if array_exists "-h" "$@" || array_exists "--help" "$@"; then
146 | echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2
147 | exit
148 | fi
149 |
150 | commands="
151 | graphemes
152 | sentences
153 | words
154 | regional-indicator
155 | simple-word
156 | whitespace
157 | "
158 | if array_exists "--list-commands" "$@"; then
159 | for cmd in $commands; do
160 | echo "$cmd"
161 | done
162 | exit
163 | fi
164 |
165 | # regex-cli is used to compile regexes into DFAs.
166 | # To get regex-cli, run: 167 | # 168 | # cargo install --git https://github.com/rust-lang/regex regex-cli 169 | # 170 | # regex-cli will build DFAs, serialize them to big endian and little endian 171 | # files, and then generate the Rust code to deserialize them. 172 | requires regex-cli 173 | 174 | mkdir -p src/unicode/fsm/ 175 | 176 | cmds=$* 177 | if [ $# -eq 0 ] || array_exists "all" "$@"; then 178 | cmds=$commands 179 | fi 180 | for cmd in $cmds; do 181 | if array_exists "$cmd" $commands; then 182 | fun="$(echo "$cmd" | sed 's/-/_/g')" 183 | eval "$fun" 184 | else 185 | echo "unrecognized command: $cmd" >&2 186 | fi 187 | done 188 | } 189 | 190 | main "$@" 191 | -------------------------------------------------------------------------------- /scripts/regex/grapheme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # vim: indentexpr= nosmartindent autoindent 4 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 5 | 6 | # This regex was manually written, derived from the rules in UAX #29. 7 | # Particularly, from Table 1c, which lays out a regex for grapheme clusters. 8 | 9 | CR="\p{gcb=CR}" 10 | LF="\p{gcb=LF}" 11 | Control="\p{gcb=Control}" 12 | Prepend="\p{gcb=Prepend}" 13 | L="\p{gcb=L}" 14 | V="\p{gcb=V}" 15 | LV="\p{gcb=LV}" 16 | LVT="\p{gcb=LVT}" 17 | T="\p{gcb=T}" 18 | RI="\p{gcb=RI}" 19 | Extend="\p{gcb=Extend}" 20 | ZWJ="\p{gcb=ZWJ}" 21 | SpacingMark="\p{gcb=SpacingMark}" 22 | 23 | Any="\p{any}" 24 | ExtendPict="\p{Extended_Pictographic}" 25 | 26 | echo "(?x) 27 | $CR $LF 28 | | 29 | $Control 30 | | 31 | $Prepend* 32 | ( 33 | ( 34 | ($L* ($V+ | $LV $V* | $LVT) $T*) 35 | | 36 | $L+ 37 | | 38 | $T+ 39 | ) 40 | | 41 | $RI $RI 42 | | 43 | $ExtendPict ($Extend* $ZWJ $ExtendPict)* 44 | | 45 | [^$Control $CR $LF] 46 | ) 47 | [$Extend $ZWJ $SpacingMark]* 48 | | 49 | $Any 50 | " 51 | -------------------------------------------------------------------------------- /scripts/regex/sentence.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # vim: indentexpr= nosmartindent autoindent 4 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 5 | 6 | # This is a regex that I reverse engineered from the sentence boundary chain 7 | # rules in UAX #29. Unlike the grapheme regex, which is essentially provided 8 | # for us in UAX #29, no such sentence regex exists. 9 | # 10 | # I looked into how ICU achieves this, since UAX #29 hints that producing 11 | # finite state machines for grapheme/sentence/word/line breaking is possible, 12 | # but only easy to do for graphemes. ICU does this by implementing their own 13 | # DSL for describing the break algorithms in terms of the chaining rules 14 | # directly. You can see an example for sentences in 15 | # icu4c/source/data/brkitr/rules/sent.txt. ICU then builds a finite state 16 | # machine from those rules in a mostly standard way, but implements the 17 | # "chaining" aspect of the rules by connecting overlapping end and start 18 | # states. For example, given SB7: 19 | # 20 | # (Upper | Lower) ATerm x Upper 21 | # 22 | # Then the naive way to convert this into a regex would be something like 23 | # 24 | # [\p{sb=Upper}\p{sb=Lower}]\p{sb=ATerm}\p{sb=Upper} 25 | # 26 | # Unfortunately, this is incorrect. Why? Well, consider an example like so: 27 | # 28 | # U.S.A. 
29 | #
30 | # A correct implementation of the sentence breaking algorithm should not insert
31 | # any breaks here, exactly in accordance with repeatedly applying rule SB7 as
32 | # given above. Our regex fails to do this because it will first match `U.S`
33 | # without breaking them---which is correct---but will then start looking for
34 | # its next rule beginning with a full stop (in ATerm) and followed by an
35 | # uppercase letter (A). This will wind up triggering rule SB11 (without
36 | # matching `A`), which inserts a break.
37 | #
38 | # The reason why this happens is because our initial application of rule SB7
39 | # "consumes" the next uppercase letter (S), which we want to reuse as a prefix
40 | # in the next rule application. A natural way to express this would be with
41 | # look-around, although it's not clear that works in every case since you
42 | # ultimately might want to consume that ending uppercase letter. In any case,
43 | # we can't use look-around in our truly regular regexes, so we must fix this.
44 | # The approach we take is to explicitly repeat rules when a suffix of a rule
45 | # is a prefix of another rule. In the case of SB7, the end of the rule, an
46 | # uppercase letter, also happens to match the beginning of the rule. This can
47 | # in turn be repeated indefinitely. Thus, our actual translation to a regex is:
48 | #
49 | # [\p{sb=Upper}\p{sb=Lower}]\p{sb=ATerm}\p{sb=Upper}(\p{sb=ATerm}\p{sb=Upper})*
50 | #
51 | # It turns out that this is exactly what ICU does, but in their case, they do
52 | # it automatically. In our case, we connect the chaining rules manually. It's
53 | # tedious. With that said, we do not implement Unicode line breaking with this
54 | # approach, which is a far scarier beast. In that case, it would probably be
55 | # worth writing the code to do what ICU does.
56 | #
57 | # In the case of sentence breaks, there aren't *too* many overlaps of this
58 | # nature. We list them out exhaustively to make this clear, because it's
59 | # essentially impossible to easily observe this in the regex. (It took me a
60 | # full day to figure all of this out.) Rules marked with N/A mean that they
61 | # specify a break, and this strategy only really applies to stringing together
62 | # non-breaks.
63 | #
64 | # SB1 - N/A
65 | # SB2 - N/A
66 | # SB3 - None
67 | # SB4 - N/A
68 | # SB5 - None
69 | # SB6 - None
70 | # SB7 - End overlaps with beginning of SB7
71 | # SB8 - End overlaps with beginning of SB7
72 | # SB8a - End overlaps with beginning of SB6, SB8, SB8a, SB9, SB10, SB11
73 | # SB9 - None
74 | # SB10 - None
75 | # SB11 - None
76 | # SB998 - N/A
77 | #
78 | # SB8a is in particular quite tricky to get right without look-ahead, since it
79 | # allows ping-ponging between match rules SB8a and SB9-11, where SB9-11
80 | # otherwise indicate that a break has been found. In the regex below, we tackle
81 | # this by only permitting part of SB8a to match inside our core non-breaking
82 | # repetition. In particular, we only allow the parts of SB8a to match that
83 | # permit the non-breaking components to continue. If a part of SB8a matches
84 | # that guarantees a pop out to SB9-11 (like `STerm STerm`), then we let it
85 | # happen. This still isn't correct because an SContinue might be seen which
86 | # would allow moving back into SB998 and thus the non-breaking repetition, so
87 | # we handle that case as well.
88 | #
89 | # Finally, the last complication here is the sprinkling of $Ex* everywhere.
90 | # This essentially corresponds to the implementation of SB5 by following 91 | # UAX #29's recommendation in S6.2. Essentially, we use it avoid ever breaking 92 | # in the middle of a grapheme cluster. 93 | 94 | CR="\p{sb=CR}" 95 | LF="\p{sb=LF}" 96 | Sep="\p{sb=Sep}" 97 | Close="\p{sb=Close}" 98 | Sp="\p{sb=Sp}" 99 | STerm="\p{sb=STerm}" 100 | ATerm="\p{sb=ATerm}" 101 | SContinue="\p{sb=SContinue}" 102 | Numeric="\p{sb=Numeric}" 103 | Upper="\p{sb=Upper}" 104 | Lower="\p{sb=Lower}" 105 | OLetter="\p{sb=OLetter}" 106 | 107 | Ex="[\p{sb=Extend}\p{sb=Format}]" 108 | ParaSep="[$Sep $CR $LF]" 109 | SATerm="[$STerm $ATerm]" 110 | 111 | LetterSepTerm="[$OLetter $Upper $Lower $ParaSep $SATerm]" 112 | 113 | echo "(?x) 114 | ( 115 | # SB6 116 | $ATerm $Ex* 117 | $Numeric 118 | | 119 | # SB7 120 | [$Upper $Lower] $Ex* $ATerm $Ex* 121 | $Upper $Ex* 122 | # overlap with SB7 123 | ($ATerm $Ex* $Upper $Ex*)* 124 | | 125 | # SB8 126 | $ATerm $Ex* $Close* $Ex* $Sp* $Ex* 127 | ([^$LetterSepTerm] $Ex*)* $Lower $Ex* 128 | # overlap with SB7 129 | ($ATerm $Ex* $Upper $Ex*)* 130 | | 131 | # SB8a 132 | $SATerm $Ex* $Close* $Ex* $Sp* $Ex* 133 | ( 134 | $SContinue 135 | | 136 | $ATerm $Ex* 137 | # Permit repetition of SB8a 138 | (($Close $Ex*)* ($Sp $Ex*)* $SATerm)* 139 | # In order to continue non-breaking matching, we now must observe 140 | # a match with a rule that keeps us in SB6-8a. Otherwise, we've entered 141 | # one of SB9-11 and know that a break must follow. 142 | ( 143 | # overlap with SB6 144 | $Numeric 145 | | 146 | # overlap with SB8 147 | ($Close $Ex*)* ($Sp $Ex*)* 148 | ([^$LetterSepTerm] $Ex*)* $Lower $Ex* 149 | # overlap with SB7 150 | ($ATerm $Ex* $Upper $Ex*)* 151 | | 152 | # overlap with SB8a 153 | ($Close $Ex*)* ($Sp $Ex*)* $SContinue 154 | ) 155 | | 156 | $STerm $Ex* 157 | # Permit repetition of SB8a 158 | (($Close $Ex*)* ($Sp $Ex*)* $SATerm)* 159 | # As with ATerm above, in order to continue non-breaking matching, we 160 | # must now observe a match with a rule that keeps us out of SB9-11. 161 | # For STerm, the only such possibility is to see an SContinue. Anything 162 | # else will result in a break. 163 | ($Close $Ex*)* ($Sp $Ex*)* $SContinue 164 | ) 165 | | 166 | # SB998 167 | # The logic behind this catch-all is that if we get to this point and 168 | # see a Sep, CR, LF, STerm or ATerm, then it has to fall into one of 169 | # SB9, SB10 or SB11. In the cases of SB9-11, we always find a break since 170 | # SB11 acts as a catch-all to induce a break following a SATerm that isn't 171 | # handled by rules SB6-SB8a. 172 | [^$ParaSep $SATerm] 173 | )* 174 | # The following collapses rules SB3, SB4, part of SB8a, SB9, SB10 and SB11. 175 | ($SATerm $Ex* ($Close $Ex*)* ($Sp $Ex*)*)* ($CR $LF | $ParaSep)? 176 | " 177 | -------------------------------------------------------------------------------- /scripts/regex/word.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # vim: indentexpr= nosmartindent autoindent 4 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 5 | 6 | # See the comments in regex/sentence.sh for the general approach to how this 7 | # regex was written. 8 | # 9 | # Writing the regex for this was *hard*. It took me two days of hacking to get 10 | # this far, and that was after I had finished the sentence regex, so my brain 11 | # was fully cached on this. Unlike the sentence regex, the rules in the regex 12 | # below don't correspond as nicely to the rules in UAX #29. 
In particular, the 13 | # UAX #29 rules have a ton of overlap with each other, which requires crazy 14 | # stuff in the regex. I'm not even sure the regex below is 100% correct or even 15 | # minimal, however, I did compare this with the ICU word segmenter on a few 16 | # different corpora, and it produces identical results. (In addition to of 17 | # course passing the UCD tests.) 18 | # 19 | # In general, I consider this approach to be a failure. Firstly, this is 20 | # clearly a write-only regex. Secondly, building the minimized DFA for this is 21 | # incredibly slow. Thirdly, the DFA is itself very large (~240KB). Fourthly, 22 | # reversing this regex (for reverse word iteration) results in a >19MB DFA. 23 | # Yes. That's MB. Wat. And it took 5 minutes to build. 24 | # 25 | # I think we might consider changing our approach to this problem. The normal 26 | # path I've seen, I think, is to decode codepoints one at a time, and then 27 | # thread them through a state machine in the code itself. We could take this 28 | # approach, or possibly combine it with a DFA that tells us which Word_Break 29 | # value a codepoint has. I'd prefer the latter approach, but it requires adding 30 | # RegexSet support to regex-automata. Something that should definitely be done, 31 | # but is a fair amount of work. 32 | # 33 | # Gah. 34 | 35 | CR="\p{wb=CR}" 36 | LF="\p{wb=LF}" 37 | Newline="\p{wb=Newline}" 38 | ZWJ="\p{wb=ZWJ}" 39 | RI="\p{wb=Regional_Indicator}" 40 | Katakana="\p{wb=Katakana}" 41 | HebrewLet="\p{wb=HebrewLetter}" 42 | ALetter="\p{wb=ALetter}" 43 | SingleQuote="\p{wb=SingleQuote}" 44 | DoubleQuote="\p{wb=DoubleQuote}" 45 | MidNumLet="\p{wb=MidNumLet}" 46 | MidLetter="\p{wb=MidLetter}" 47 | MidNum="\p{wb=MidNum}" 48 | Numeric="\p{wb=Numeric}" 49 | ExtendNumLet="\p{wb=ExtendNumLet}" 50 | WSegSpace="\p{wb=WSegSpace}" 51 | 52 | Any="\p{any}" 53 | Ex="[\p{wb=Extend} \p{wb=Format} $ZWJ]" 54 | ExtendPict="\p{Extended_Pictographic}" 55 | AHLetter="[$ALetter $HebrewLet]" 56 | MidNumLetQ="[$MidNumLet $SingleQuote]" 57 | 58 | AHLetterRepeat="$AHLetter $Ex* ([$MidLetter $MidNumLetQ] $Ex* $AHLetter $Ex*)*" 59 | NumericRepeat="$Numeric $Ex* ([$MidNum $MidNumLetQ] $Ex* $Numeric $Ex*)*" 60 | 61 | echo "(?x) 62 | $CR $LF 63 | | 64 | [$Newline $CR $LF] 65 | | 66 | $WSegSpace $WSegSpace+ 67 | | 68 | ( 69 | ([^$Newline $CR $LF]? 
$Ex* $ZWJ $ExtendPict $Ex*)+
70 | |
71 | ($ExtendNumLet $Ex*)* $AHLetter $Ex*
72 | (
73 | (
74 | ($NumericRepeat | $ExtendNumLet $Ex*)*
75 | |
76 | [$MidLetter $MidNumLetQ] $Ex*
77 | )
78 | $AHLetter $Ex*
79 | )+
80 | ($NumericRepeat | $ExtendNumLet $Ex*)*
81 | |
82 | ($ExtendNumLet $Ex*)* $AHLetter $Ex* ($NumericRepeat | $ExtendNumLet $Ex*)+
83 | |
84 | ($ExtendNumLet $Ex*)* $Numeric $Ex*
85 | (
86 | (
87 | ($AHLetterRepeat | $ExtendNumLet $Ex*)*
88 | |
89 | [$MidNum $MidNumLetQ] $Ex*
90 | )
91 | $Numeric $Ex*
92 | )+
93 | ($AHLetterRepeat | $ExtendNumLet $Ex*)*
94 | |
95 | ($ExtendNumLet $Ex*)* $Numeric $Ex* ($AHLetterRepeat | $ExtendNumLet $Ex*)+
96 | |
97 | $Katakana $Ex*
98 | (($Katakana | $ExtendNumLet) $Ex*)+
99 | |
100 | $ExtendNumLet $Ex*
101 | (($ExtendNumLet | $AHLetter | $Numeric | $Katakana) $Ex*)+
102 | )+
103 | |
104 | $HebrewLet $Ex* $SingleQuote $Ex*
105 | |
106 | ($HebrewLet $Ex* $DoubleQuote $Ex*)+ $HebrewLet $Ex*
107 | |
108 | $RI $Ex* $RI $Ex*
109 | |
110 | $Any $Ex*
111 | "
112 | -------------------------------------------------------------------------------- /src/ascii.rs: --------------------------------------------------------------------------------
1 | // The following ~400 lines of code exist for exactly one purpose, which is
2 | // to optimize this code:
3 | //
4 | // byte_slice.iter().position(|&b| b > 0x7F).unwrap_or(byte_slice.len())
5 | //
6 | // Yes... Overengineered is a word that comes to mind, but this is effectively
7 | // a very similar problem to memchr, and virtually nobody has been able to
8 | // resist optimizing the crap out of that (except for perhaps the BSD and MUSL
9 | // folks). In particular, this routine makes a very common case (ASCII) very
10 | // fast, which seems worth it. We do stop short of adding AVX variants of the
11 | // code below in order to retain our sanity and also to avoid needing to deal
12 | // with runtime target feature detection. RESIST!
13 | //
14 | // In order to understand the SIMD version below, it would be good to read this
15 | // comment describing how my memchr routine works:
16 | // https://github.com/BurntSushi/rust-memchr/blob/b0a29f267f4a7fad8ffcc8fe8377a06498202883/src/x86/sse2.rs#L19-L106
17 | //
18 | // The primary difference with memchr is that for ASCII, we can do a bit less
19 | // work. In particular, we don't need to detect the presence of a specific
20 | // byte, but rather, whether any byte has its most significant bit set. That
21 | // means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to
22 | // _mm_movemask_epi8.
23 |
24 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
25 | const USIZE_BYTES: usize = core::mem::size_of::<usize>();
26 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
27 | const ALIGN_MASK: usize = core::mem::align_of::<usize>() - 1;
28 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
29 | const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES;
30 |
31 | // This is a mask where the most significant bit of each byte in the usize
32 | // is set. We test this bit to determine whether a character is ASCII or not.
33 | // Namely, a single byte is regarded as an ASCII codepoint if and only if its
34 | // most significant bit is not set.
35 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
36 | const ASCII_MASK_U64: u64 = 0x8080808080808080;
37 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
38 | const ASCII_MASK: usize = ASCII_MASK_U64 as usize;
39 |
40 | /// Returns the index of the first non-ASCII byte in the given slice.
41 | /// 42 | /// If slice only contains ASCII bytes, then the length of the slice is 43 | /// returned. 44 | pub fn first_non_ascii_byte(slice: &[u8]) -> usize { 45 | #[cfg(any(miri, not(target_arch = "x86_64")))] 46 | { 47 | first_non_ascii_byte_fallback(slice) 48 | } 49 | 50 | #[cfg(all(not(miri), target_arch = "x86_64"))] 51 | { 52 | first_non_ascii_byte_sse2(slice) 53 | } 54 | } 55 | 56 | #[cfg(any(test, miri, not(target_arch = "x86_64")))] 57 | fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize { 58 | let start_ptr = slice.as_ptr(); 59 | let end_ptr = slice[slice.len()..].as_ptr(); 60 | let mut ptr = start_ptr; 61 | 62 | unsafe { 63 | if slice.len() < USIZE_BYTES { 64 | return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); 65 | } 66 | 67 | let chunk = read_unaligned_usize(ptr); 68 | let mask = chunk & ASCII_MASK; 69 | if mask != 0 { 70 | return first_non_ascii_byte_mask(mask); 71 | } 72 | 73 | ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & ALIGN_MASK)); 74 | debug_assert!(ptr > start_ptr); 75 | debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr); 76 | if slice.len() >= FALLBACK_LOOP_SIZE { 77 | while ptr <= ptr_sub(end_ptr, FALLBACK_LOOP_SIZE) { 78 | debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); 79 | 80 | let a = *(ptr as *const usize); 81 | let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize); 82 | if (a | b) & ASCII_MASK != 0 { 83 | // What a kludge. We wrap the position finding code into 84 | // a non-inlineable function, which makes the codegen in 85 | // the tight loop above a bit better by avoiding a 86 | // couple extra movs. We pay for it by two additional 87 | // stores, but only in the case of finding a non-ASCII 88 | // byte. 89 | #[inline(never)] 90 | unsafe fn findpos( 91 | start_ptr: *const u8, 92 | ptr: *const u8, 93 | ) -> usize { 94 | let a = *(ptr as *const usize); 95 | let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize); 96 | 97 | let mut at = sub(ptr, start_ptr); 98 | let maska = a & ASCII_MASK; 99 | if maska != 0 { 100 | return at + first_non_ascii_byte_mask(maska); 101 | } 102 | 103 | at += USIZE_BYTES; 104 | let maskb = b & ASCII_MASK; 105 | debug_assert!(maskb != 0); 106 | return at + first_non_ascii_byte_mask(maskb); 107 | } 108 | return findpos(start_ptr, ptr); 109 | } 110 | ptr = ptr_add(ptr, FALLBACK_LOOP_SIZE); 111 | } 112 | } 113 | first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) 114 | } 115 | } 116 | 117 | #[cfg(all(not(miri), target_arch = "x86_64"))] 118 | fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize { 119 | use core::arch::x86_64::*; 120 | 121 | const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>(); 122 | const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; 123 | const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE; 124 | 125 | let start_ptr = slice.as_ptr(); 126 | let end_ptr = slice[slice.len()..].as_ptr(); 127 | let mut ptr = start_ptr; 128 | 129 | unsafe { 130 | if slice.len() < VECTOR_SIZE { 131 | return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); 132 | } 133 | 134 | let chunk = _mm_loadu_si128(ptr as *const __m128i); 135 | let mask = _mm_movemask_epi8(chunk); 136 | if mask != 0 { 137 | return mask.trailing_zeros() as usize; 138 | } 139 | 140 | ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); 141 | debug_assert!(ptr > start_ptr); 142 | debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr); 143 | if slice.len() >= VECTOR_LOOP_SIZE { 144 | while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) { 145 | debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); 146 | 147 | let a = _mm_load_si128(ptr as *const 
__m128i);
148 | let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
149 | let c =
150 | _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
151 | let d =
152 | _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
153 |
154 | let or1 = _mm_or_si128(a, b);
155 | let or2 = _mm_or_si128(c, d);
156 | let or3 = _mm_or_si128(or1, or2);
157 | if _mm_movemask_epi8(or3) != 0 {
158 | let mut at = sub(ptr, start_ptr);
159 | let mask = _mm_movemask_epi8(a);
160 | if mask != 0 {
161 | return at + mask.trailing_zeros() as usize;
162 | }
163 |
164 | at += VECTOR_SIZE;
165 | let mask = _mm_movemask_epi8(b);
166 | if mask != 0 {
167 | return at + mask.trailing_zeros() as usize;
168 | }
169 |
170 | at += VECTOR_SIZE;
171 | let mask = _mm_movemask_epi8(c);
172 | if mask != 0 {
173 | return at + mask.trailing_zeros() as usize;
174 | }
175 |
176 | at += VECTOR_SIZE;
177 | let mask = _mm_movemask_epi8(d);
178 | debug_assert!(mask != 0);
179 | return at + mask.trailing_zeros() as usize;
180 | }
181 | ptr = ptr_add(ptr, VECTOR_LOOP_SIZE);
182 | }
183 | }
184 | while ptr <= end_ptr.sub(VECTOR_SIZE) {
185 | debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);
186 |
187 | let chunk = _mm_loadu_si128(ptr as *const __m128i);
188 | let mask = _mm_movemask_epi8(chunk);
189 | if mask != 0 {
190 | return sub(ptr, start_ptr) + mask.trailing_zeros() as usize;
191 | }
192 | ptr = ptr.add(VECTOR_SIZE);
193 | }
194 | first_non_ascii_byte_slow(start_ptr, end_ptr, ptr)
195 | }
196 | }
197 |
198 | #[inline(always)]
199 | unsafe fn first_non_ascii_byte_slow(
200 | start_ptr: *const u8,
201 | end_ptr: *const u8,
202 | mut ptr: *const u8,
203 | ) -> usize {
204 | debug_assert!(start_ptr <= ptr);
205 | debug_assert!(ptr <= end_ptr);
206 |
207 | while ptr < end_ptr {
208 | if *ptr > 0x7F {
209 | return sub(ptr, start_ptr);
210 | }
211 | ptr = ptr.offset(1);
212 | }
213 | sub(end_ptr, start_ptr)
214 | }
215 |
216 | /// Compute the position of the first non-ASCII byte in the given mask.
217 | ///
218 | /// The mask should be computed by `chunk & ASCII_MASK`, where `chunk` is
219 | /// 8 contiguous bytes of the slice being checked where *at least* one of those
220 | /// bytes is not an ASCII byte.
221 | ///
222 | /// The position returned is always in the inclusive range [0, 7].
223 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
224 | fn first_non_ascii_byte_mask(mask: usize) -> usize {
225 | #[cfg(target_endian = "little")]
226 | {
227 | mask.trailing_zeros() as usize / 8
228 | }
229 | #[cfg(target_endian = "big")]
230 | {
231 | mask.leading_zeros() as usize / 8
232 | }
233 | }
234 |
235 | /// Increment the given pointer by the given amount.
236 | unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 {
237 | ptr.add(amt)
238 | }
239 |
240 | /// Decrement the given pointer by the given amount.
241 | unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 {
242 | ptr.sub(amt)
243 | }
244 |
245 | #[cfg(any(test, miri, not(target_arch = "x86_64")))]
246 | unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
247 | use core::ptr;
248 |
249 | let mut n: usize = 0;
250 | ptr::copy_nonoverlapping(ptr, &mut n as *mut _ as *mut u8, USIZE_BYTES);
251 | n
252 | }
253 |
254 | /// Subtract `b` from `a` and return the difference. `a` should be greater than
255 | /// or equal to `b`.
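/// (Note, for illustration: this is the same byte distance that the unsafe
/// `a.offset_from(b)` would compute for pointers into the same allocation;
/// doing it with plain integer casts avoids another `unsafe` block.)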
256 | fn sub(a: *const u8, b: *const u8) -> usize { 257 | debug_assert!(a >= b); 258 | (a as usize) - (b as usize) 259 | } 260 | 261 | #[cfg(test)] 262 | mod tests { 263 | use super::*; 264 | 265 | // Our testing approach here is to try and exhaustively test every case. 266 | // This includes the position at which a non-ASCII byte occurs in addition 267 | // to the alignment of the slice that we're searching. 268 | 269 | #[test] 270 | fn positive_fallback_forward() { 271 | for i in 0..517 { 272 | let s = "a".repeat(i); 273 | assert_eq!( 274 | i, 275 | first_non_ascii_byte_fallback(s.as_bytes()), 276 | "i: {:?}, len: {:?}, s: {:?}", 277 | i, 278 | s.len(), 279 | s 280 | ); 281 | } 282 | } 283 | 284 | #[test] 285 | #[cfg(target_arch = "x86_64")] 286 | #[cfg(not(miri))] 287 | fn positive_sse2_forward() { 288 | for i in 0..517 { 289 | let b = "a".repeat(i).into_bytes(); 290 | assert_eq!(b.len(), first_non_ascii_byte_sse2(&b)); 291 | } 292 | } 293 | 294 | #[test] 295 | #[cfg(not(miri))] 296 | fn negative_fallback_forward() { 297 | for i in 0..517 { 298 | for align in 0..65 { 299 | let mut s = "a".repeat(i); 300 | s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); 301 | let s = s.get(align..).unwrap_or(""); 302 | assert_eq!( 303 | i.saturating_sub(align), 304 | first_non_ascii_byte_fallback(s.as_bytes()), 305 | "i: {:?}, align: {:?}, len: {:?}, s: {:?}", 306 | i, 307 | align, 308 | s.len(), 309 | s 310 | ); 311 | } 312 | } 313 | } 314 | 315 | #[test] 316 | #[cfg(target_arch = "x86_64")] 317 | #[cfg(not(miri))] 318 | fn negative_sse2_forward() { 319 | for i in 0..517 { 320 | for align in 0..65 { 321 | let mut s = "a".repeat(i); 322 | s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); 323 | let s = s.get(align..).unwrap_or(""); 324 | assert_eq!( 325 | i.saturating_sub(align), 326 | first_non_ascii_byte_sse2(s.as_bytes()), 327 | "i: {:?}, align: {:?}, len: {:?}, s: {:?}", 328 | i, 329 | align, 330 | s.len(), 331 | s 332 | ); 333 | } 334 | } 335 | } 336 | } 337 | -------------------------------------------------------------------------------- /src/bstr.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "alloc")] 2 | use alloc::boxed::Box; 3 | 4 | /// A wrapper for `&[u8]` that provides convenient string oriented trait impls. 5 | /// 6 | /// If you need ownership or a growable byte string buffer, then use 7 | /// [`BString`](struct.BString.html). 8 | /// 9 | /// Using a `&BStr` is just like using a `&[u8]`, since `BStr` 10 | /// implements `Deref` to `[u8]`. So all methods available on `[u8]` 11 | /// are also available on `BStr`. 12 | /// 13 | /// # Representation 14 | /// 15 | /// A `&BStr` has the same representation as a `&str`. That is, a `&BStr` is 16 | /// a fat pointer which consists of a pointer to some bytes and a length. 17 | /// 18 | /// # Trait implementations 19 | /// 20 | /// The `BStr` type has a number of trait implementations, and in particular, 21 | /// defines equality and ordinal comparisons between `&BStr`, `&str` and 22 | /// `&[u8]` for convenience. 23 | /// 24 | /// The `Debug` implementation for `BStr` shows its bytes as a normal string. 25 | /// For invalid UTF-8, hex escape sequences are used. 26 | /// 27 | /// The `Display` implementation behaves as if `BStr` were first lossily 28 | /// converted to a `str`. Invalid UTF-8 bytes are substituted with the Unicode 29 | /// replacement codepoint, which looks like this: �. 
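///
/// For instance (an illustrative example, not from the original docs),
/// printing `BStr::new(b"abc\xFF")` with `{}` displays `abc` followed by the
/// replacement codepoint, while `{:?}` hex-escapes the invalid byte instead.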
30 | #[repr(transparent)]
31 | pub struct BStr {
32 | pub(crate) bytes: [u8],
33 | }
34 |
35 | impl BStr {
36 | /// Directly creates a `BStr` slice from anything that can be converted
37 | /// to a byte slice.
38 | ///
39 | /// This is very similar to the [`B`](crate::B) function, except this
40 | /// returns a `&BStr` instead of a `&[u8]`.
41 | ///
42 | /// This is a cost-free conversion.
43 | ///
44 | /// # Example
45 | ///
46 | /// You can create `BStr`s from byte arrays, byte slices or even string
47 | /// slices:
48 | ///
49 | /// ```
50 | /// use bstr::BStr;
51 | ///
52 | /// let a = BStr::new(b"abc");
53 | /// let b = BStr::new(&b"abc"[..]);
54 | /// let c = BStr::new("abc");
55 | ///
56 | /// assert_eq!(a, b);
57 | /// assert_eq!(a, c);
58 | /// ```
59 | #[inline]
60 | pub fn new<B: ?Sized + AsRef<[u8]>>(bytes: &B) -> &BStr {
61 | BStr::from_bytes(bytes.as_ref())
62 | }
63 |
64 | #[inline]
65 | pub(crate) fn new_mut<B: ?Sized + AsMut<[u8]>>(
66 | bytes: &mut B,
67 | ) -> &mut BStr {
68 | BStr::from_bytes_mut(bytes.as_mut())
69 | }
70 |
71 | #[inline]
72 | pub(crate) fn from_bytes(slice: &[u8]) -> &BStr {
73 | unsafe { &*(slice as *const [u8] as *const BStr) }
74 | }
75 |
76 | #[inline]
77 | pub(crate) fn from_bytes_mut(slice: &mut [u8]) -> &mut BStr {
78 | unsafe { &mut *(slice as *mut [u8] as *mut BStr) }
79 | }
80 |
81 | #[inline]
82 | #[cfg(feature = "alloc")]
83 | pub(crate) fn from_boxed_bytes(slice: Box<[u8]>) -> Box<BStr> {
84 | unsafe { Box::from_raw(Box::into_raw(slice) as _) }
85 | }
86 |
87 | #[inline]
88 | #[cfg(feature = "alloc")]
89 | pub(crate) fn into_boxed_bytes(slice: Box<BStr>) -> Box<[u8]> {
90 | unsafe { Box::from_raw(Box::into_raw(slice) as _) }
91 | }
92 |
93 | #[inline]
94 | pub(crate) fn as_bytes(&self) -> &[u8] {
95 | &self.bytes
96 | }
97 | }
98 | -------------------------------------------------------------------------------- /src/bstring.rs: --------------------------------------------------------------------------------
1 | use alloc::vec::Vec;
2 |
3 | use crate::bstr::BStr;
4 |
5 | /// A wrapper for `Vec<u8>` that provides convenient string oriented trait
6 | /// impls.
7 | ///
8 | /// A `BString` has ownership over its contents and corresponds to
9 | /// a growable or shrinkable buffer. Its borrowed counterpart is a
10 | /// [`BStr`](struct.BStr.html), called a byte string slice.
11 | ///
12 | /// Using a `BString` is just like using a `Vec<u8>`, since `BString`
13 | /// implements `Deref` to `Vec<u8>`. So all methods available on `Vec<u8>`
14 | /// are also available on `BString`.
15 | ///
16 | /// # Examples
17 | ///
18 | /// You can create a new `BString` from a `Vec<u8>` via a `From` impl:
19 | ///
20 | /// ```
21 | /// use bstr::BString;
22 | ///
23 | /// let s = BString::from("Hello, world!");
24 | /// ```
25 | ///
26 | /// # Deref
27 | ///
28 | /// The `BString` type implements `Deref` and `DerefMut`, where the target
29 | /// types are `&Vec<u8>` and `&mut Vec<u8>`, respectively. `Deref` permits all of the
30 | /// methods defined on `Vec<u8>` to be implicitly callable on any `BString`.
31 | ///
32 | /// For more information about how deref works, see the documentation for the
33 | /// [`std::ops::Deref`](https://doc.rust-lang.org/std/ops/trait.Deref.html)
34 | /// trait.
35 | ///
36 | /// # Representation
37 | ///
38 | /// A `BString` has the same representation as a `Vec<u8>` and a `String`.
39 | /// That is, it is made up of three word sized components: a pointer to a
40 | /// region of memory containing the bytes, a length and a capacity.
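///
/// (One way to see this, as an illustrative check rather than part of the
/// original docs: `core::mem::size_of::<BString>()` equals
/// `core::mem::size_of::<Vec<u8>>()`, i.e. three machine words.)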
41 | #[derive(Clone)]
42 | pub struct BString {
43 | bytes: Vec<u8>,
44 | }
45 |
46 | impl BString {
47 | /// Constructs a new `BString` from the given [`Vec<u8>`].
48 | ///
49 | /// # Examples
50 | ///
51 | /// ```
52 | /// use bstr::BString;
53 | ///
54 | /// let mut b = BString::new(Vec::with_capacity(10));
55 | /// ```
56 | ///
57 | /// This function is `const`:
58 | ///
59 | /// ```
60 | /// use bstr::BString;
61 | ///
62 | /// const B: BString = BString::new(vec![]);
63 | /// ```
64 | #[inline]
65 | pub const fn new(bytes: Vec<u8>) -> BString {
66 | BString { bytes }
67 | }
68 |
69 | #[inline]
70 | pub(crate) fn as_bytes(&self) -> &[u8] {
71 | &self.bytes
72 | }
73 |
74 | #[inline]
75 | pub(crate) fn as_bytes_mut(&mut self) -> &mut [u8] {
76 | &mut self.bytes
77 | }
78 |
79 | #[inline]
80 | pub(crate) fn as_bstr(&self) -> &BStr {
81 | BStr::new(&self.bytes)
82 | }
83 |
84 | #[inline]
85 | pub(crate) fn as_mut_bstr(&mut self) -> &mut BStr {
86 | BStr::new_mut(&mut self.bytes)
87 | }
88 |
89 | #[inline]
90 | pub(crate) fn as_vec(&self) -> &Vec<u8> {
91 | &self.bytes
92 | }
93 |
94 | #[inline]
95 | pub(crate) fn as_vec_mut(&mut self) -> &mut Vec<u8> {
96 | &mut self.bytes
97 | }
98 |
99 | #[inline]
100 | pub(crate) fn into_vec(self) -> Vec<u8> {
101 | self.bytes
102 | }
103 | }
104 | -------------------------------------------------------------------------------- /src/byteset/mod.rs: --------------------------------------------------------------------------------
1 | use memchr::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3};
2 |
3 | mod scalar;
4 |
5 | #[inline]
6 | fn build_table(byteset: &[u8]) -> [u8; 256] {
7 | let mut table = [0u8; 256];
8 | for &b in byteset {
9 | table[b as usize] = 1;
10 | }
11 | table
12 | }
13 |
14 | #[inline]
15 | pub(crate) fn find(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
16 | match byteset.len() {
17 | 0 => None,
18 | 1 => memchr(byteset[0], haystack),
19 | 2 => memchr2(byteset[0], byteset[1], haystack),
20 | 3 => memchr3(byteset[0], byteset[1], byteset[2], haystack),
21 | _ => {
22 | let table = build_table(byteset);
23 | scalar::forward_search_bytes(haystack, |b| table[b as usize] != 0)
24 | }
25 | }
26 | }
27 |
28 | #[inline]
29 | pub(crate) fn rfind(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
30 | match byteset.len() {
31 | 0 => None,
32 | 1 => memrchr(byteset[0], haystack),
33 | 2 => memrchr2(byteset[0], byteset[1], haystack),
34 | 3 => memrchr3(byteset[0], byteset[1], byteset[2], haystack),
35 | _ => {
36 | let table = build_table(byteset);
37 | scalar::reverse_search_bytes(haystack, |b| table[b as usize] != 0)
38 | }
39 | }
40 | }
41 |
42 | #[inline]
43 | pub(crate) fn find_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
44 | if haystack.is_empty() {
45 | return None;
46 | }
47 | match byteset.len() {
48 | 0 => Some(0),
49 | 1 => scalar::inv_memchr(byteset[0], haystack),
50 | 2 => scalar::forward_search_bytes(haystack, |b| {
51 | b != byteset[0] && b != byteset[1]
52 | }),
53 | 3 => scalar::forward_search_bytes(haystack, |b| {
54 | b != byteset[0] && b != byteset[1] && b != byteset[2]
55 | }),
56 | _ => {
57 | let table = build_table(byteset);
58 | scalar::forward_search_bytes(haystack, |b| table[b as usize] == 0)
59 | }
60 | }
61 | }
62 | #[inline]
63 | pub(crate) fn rfind_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
64 | if haystack.is_empty() {
65 | return None;
66 | }
67 | match byteset.len() {
68 | 0 => Some(haystack.len() - 1),
69 | 1 => scalar::inv_memrchr(byteset[0], haystack),
70 | 2 => scalar::reverse_search_bytes(haystack, |b| {
71 | b !=
byteset[0] && b != byteset[1]
72 | }),
73 | 3 => scalar::reverse_search_bytes(haystack, |b| {
74 | b != byteset[0] && b != byteset[1] && b != byteset[2]
75 | }),
76 | _ => {
77 | let table = build_table(byteset);
78 | scalar::reverse_search_bytes(haystack, |b| table[b as usize] == 0)
79 | }
80 | }
81 | }
82 |
83 | #[cfg(all(test, feature = "std", not(miri)))]
84 | mod tests {
85 | use alloc::vec::Vec;
86 |
87 | quickcheck::quickcheck! {
88 | fn qc_byteset_forward_matches_naive(
89 | haystack: Vec<u8>,
90 | needles: Vec<u8>
91 | ) -> bool {
92 | super::find(&haystack, &needles)
93 | == haystack.iter().position(|b| needles.contains(b))
94 | }
95 | fn qc_byteset_backwards_matches_naive(
96 | haystack: Vec<u8>,
97 | needles: Vec<u8>
98 | ) -> bool {
99 | super::rfind(&haystack, &needles)
100 | == haystack.iter().rposition(|b| needles.contains(b))
101 | }
102 | fn qc_byteset_forward_not_matches_naive(
103 | haystack: Vec<u8>,
104 | needles: Vec<u8>
105 | ) -> bool {
106 | super::find_not(&haystack, &needles)
107 | == haystack.iter().position(|b| !needles.contains(b))
108 | }
109 | fn qc_byteset_backwards_not_matches_naive(
110 | haystack: Vec<u8>,
111 | needles: Vec<u8>
112 | ) -> bool {
113 | super::rfind_not(&haystack, &needles)
114 | == haystack.iter().rposition(|b| !needles.contains(b))
115 | }
116 | }
117 | }
118 | -------------------------------------------------------------------------------- /src/byteset/scalar.rs: --------------------------------------------------------------------------------
1 | // This is adapted from `fallback.rs` from rust-memchr. It's modified to return
2 | // the 'inverse' query of memchr, e.g. finding the first byte not in the
3 | // provided set. This is simple for the 1-byte case.
4 |
5 | use core::{cmp, usize};
6 |
7 | const USIZE_BYTES: usize = core::mem::size_of::<usize>();
8 | const ALIGN_MASK: usize = core::mem::align_of::<usize>() - 1;
9 |
10 | // The number of bytes to loop at in one iteration of memchr/memrchr.
11 | const LOOP_SIZE: usize = 2 * USIZE_BYTES;
12 |
13 | /// Repeat the given byte into a word size number. That is, every 8 bits
14 | /// is equivalent to the given byte. For example, if `b` is `\x4E` or
15 | /// `01001110` in binary, then the returned value on a 32-bit system would be:
16 | /// `01001110_01001110_01001110_01001110`.
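///
/// (Why the arithmetic below works: `usize::MAX / 255` is the constant
/// `0x0101...01`, with a `1` in the low bit of every byte, so multiplying it
/// by `b` broadcasts `b` into each byte of the word.)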
17 | #[inline(always)]
18 | fn repeat_byte(b: u8) -> usize {
19 | (b as usize) * (usize::MAX / 255)
20 | }
21 |
22 | pub fn inv_memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
23 | let vn1 = repeat_byte(n1);
24 | let confirm = |byte| byte != n1;
25 | let loop_size = cmp::min(LOOP_SIZE, haystack.len());
26 | let start_ptr = haystack.as_ptr();
27 |
28 | unsafe {
29 | let end_ptr = haystack.as_ptr().add(haystack.len());
30 | let mut ptr = start_ptr;
31 |
32 | if haystack.len() < USIZE_BYTES {
33 | return forward_search(start_ptr, end_ptr, ptr, confirm);
34 | }
35 |
36 | let chunk = read_unaligned_usize(ptr);
37 | if (chunk ^ vn1) != 0 {
38 | return forward_search(start_ptr, end_ptr, ptr, confirm);
39 | }
40 |
41 | ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & ALIGN_MASK));
42 | debug_assert!(ptr > start_ptr);
43 | debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr);
44 | while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
45 | debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
46 |
47 | let a = *(ptr as *const usize);
48 | let b = *(ptr.add(USIZE_BYTES) as *const usize);
49 | let eqa = (a ^ vn1) != 0;
50 | let eqb = (b ^ vn1) != 0;
51 | if eqa || eqb {
52 | break;
53 | }
54 | ptr = ptr.add(LOOP_SIZE);
55 | }
56 | forward_search(start_ptr, end_ptr, ptr, confirm)
57 | }
58 | }
59 |
60 | /// Return the last index not matching the byte `x` in `text`.
61 | pub fn inv_memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
62 | let vn1 = repeat_byte(n1);
63 | let confirm = |byte| byte != n1;
64 | let loop_size = cmp::min(LOOP_SIZE, haystack.len());
65 | let start_ptr = haystack.as_ptr();
66 |
67 | unsafe {
68 | let end_ptr = haystack.as_ptr().add(haystack.len());
69 | let mut ptr = end_ptr;
70 |
71 | if haystack.len() < USIZE_BYTES {
72 | return reverse_search(start_ptr, end_ptr, ptr, confirm);
73 | }
74 |
75 | let chunk = read_unaligned_usize(ptr.sub(USIZE_BYTES));
76 | if (chunk ^ vn1) != 0 {
77 | return reverse_search(start_ptr, end_ptr, ptr, confirm);
78 | }
79 |
80 | ptr = ptr.sub(end_ptr as usize & ALIGN_MASK);
81 | debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
82 | while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
83 | debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
84 |
85 | let a = *(ptr.sub(2 * USIZE_BYTES) as *const usize);
86 | let b = *(ptr.sub(1 * USIZE_BYTES) as *const usize);
87 | let eqa = (a ^ vn1) != 0;
88 | let eqb = (b ^ vn1) != 0;
89 | if eqa || eqb {
90 | break;
91 | }
92 | ptr = ptr.sub(loop_size);
93 | }
94 | reverse_search(start_ptr, end_ptr, ptr, confirm)
95 | }
96 | }
97 |
98 | #[inline(always)]
99 | unsafe fn forward_search<F: Fn(u8) -> bool>(
100 | start_ptr: *const u8,
101 | end_ptr: *const u8,
102 | mut ptr: *const u8,
103 | confirm: F,
104 | ) -> Option<usize> {
105 | debug_assert!(start_ptr <= ptr);
106 | debug_assert!(ptr <= end_ptr);
107 |
108 | while ptr < end_ptr {
109 | if confirm(*ptr) {
110 | return Some(sub(ptr, start_ptr));
111 | }
112 | ptr = ptr.offset(1);
113 | }
114 | None
115 | }
116 |
117 | #[inline(always)]
118 | unsafe fn reverse_search<F: Fn(u8) -> bool>(
119 | start_ptr: *const u8,
120 | end_ptr: *const u8,
121 | mut ptr: *const u8,
122 | confirm: F,
123 | ) -> Option<usize> {
124 | debug_assert!(start_ptr <= ptr);
125 | debug_assert!(ptr <= end_ptr);
126 |
127 | while ptr > start_ptr {
128 | ptr = ptr.offset(-1);
129 | if confirm(*ptr) {
130 | return Some(sub(ptr, start_ptr));
131 | }
132 | }
133 | None
134 | }
135 |
136 | unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
137 | (ptr as *const usize).read_unaligned()
138 | }
139 |
140 | /// Subtract
`b` from `a` and return the difference. `a` should be greater than
141 | /// or equal to `b`.
142 | fn sub(a: *const u8, b: *const u8) -> usize {
143 | debug_assert!(a >= b);
144 | (a as usize) - (b as usize)
145 | }
146 |
147 | /// Safe wrapper around `forward_search`
148 | #[inline]
149 | pub(crate) fn forward_search_bytes<F: Fn(u8) -> bool>(
150 | s: &[u8],
151 | confirm: F,
152 | ) -> Option<usize> {
153 | unsafe {
154 | let start = s.as_ptr();
155 | let end = start.add(s.len());
156 | forward_search(start, end, start, confirm)
157 | }
158 | }
159 |
160 | /// Safe wrapper around `reverse_search`
161 | #[inline]
162 | pub(crate) fn reverse_search_bytes<F: Fn(u8) -> bool>(
163 | s: &[u8],
164 | confirm: F,
165 | ) -> Option<usize> {
166 | unsafe {
167 | let start = s.as_ptr();
168 | let end = start.add(s.len());
169 | reverse_search(start, end, end, confirm)
170 | }
171 | }
172 |
173 | #[cfg(all(test, feature = "std"))]
174 | mod tests {
175 | use alloc::{vec, vec::Vec};
176 |
177 | use super::{inv_memchr, inv_memrchr};
178 |
179 | // search string, search byte, inv_memchr result, inv_memrchr result.
180 | // these are expanded into a much larger set of tests in build_tests
181 | const TESTS: &[(&[u8], u8, usize, usize)] = &[
182 | (b"z", b'a', 0, 0),
183 | (b"zz", b'a', 0, 1),
184 | (b"aza", b'a', 1, 1),
185 | (b"zaz", b'a', 0, 2),
186 | (b"zza", b'a', 0, 1),
187 | (b"zaa", b'a', 0, 0),
188 | (b"zzz", b'a', 0, 2),
189 | ];
190 |
191 | type TestCase = (Vec<u8>, u8, Option<(usize, usize)>);
192 |
193 | fn build_tests() -> Vec<TestCase> {
194 | #[cfg(not(miri))]
195 | const MAX_PER: usize = 515;
196 | #[cfg(miri)]
197 | const MAX_PER: usize = 10;
198 |
199 | let mut result = vec![];
200 | for &(search, byte, fwd_pos, rev_pos) in TESTS {
201 | result.push((search.to_vec(), byte, Some((fwd_pos, rev_pos))));
202 | for i in 1..MAX_PER {
203 | // add a bunch of copies of the search byte to the end.
204 | let mut suffixed: Vec<u8> = search.into();
205 | suffixed.extend(std::iter::repeat(byte).take(i));
206 | result.push((suffixed, byte, Some((fwd_pos, rev_pos))));
207 |
208 | // add a bunch of copies of the search byte to the start.
209 | let mut prefixed: Vec<u8> =
210 | std::iter::repeat(byte).take(i).collect();
211 | prefixed.extend(search);
212 | result.push((
213 | prefixed,
214 | byte,
215 | Some((fwd_pos + i, rev_pos + i)),
216 | ));
217 |
218 | // add a bunch of copies of the search byte to both ends.
219 | let mut surrounded: Vec<u8> =
220 | std::iter::repeat(byte).take(i).collect();
221 | surrounded.extend(search);
222 | surrounded.extend(std::iter::repeat(byte).take(i));
223 | result.push((
224 | surrounded,
225 | byte,
226 | Some((fwd_pos + i, rev_pos + i)),
227 | ));
228 | }
229 | }
230 |
231 | // build non-matching tests for several sizes
232 | for i in 0..MAX_PER {
233 | result.push((
234 | std::iter::repeat(b'\0').take(i).collect(),
235 | b'\0',
236 | None,
237 | ));
238 | }
239 |
240 | result
241 | }
242 |
243 | #[test]
244 | fn test_inv_memchr() {
245 | use crate::{ByteSlice, B};
246 |
247 | #[cfg(not(miri))]
248 | const MAX_OFFSET: usize = 130;
249 | #[cfg(miri)]
250 | const MAX_OFFSET: usize = 13;
251 |
252 | for (search, byte, matching) in build_tests() {
253 | assert_eq!(
254 | inv_memchr(byte, &search),
255 | matching.map(|m| m.0),
256 | "inv_memchr when searching for {:?} in {:?}",
257 | byte as char,
258 | // better printing
259 | B(&search).as_bstr(),
260 | );
261 | assert_eq!(
262 | inv_memrchr(byte, &search),
263 | matching.map(|m| m.1),
264 | "inv_memrchr when searching for {:?} in {:?}",
265 | byte as char,
266 | // better printing
267 | B(&search).as_bstr(),
268 | );
269 | // Test a rather large number of offsets for potential alignment
270 | // issues.
271 | for offset in 1..MAX_OFFSET {
272 | if offset >= search.len() {
273 | break;
274 | }
275 | // If this would cause us to shift the results off the end,
276 | // skip it so that we don't have to recompute them.
277 | if let Some((f, r)) = matching {
278 | if offset > f || offset > r {
279 | break;
280 | }
281 | }
282 | let realigned = &search[offset..];
283 |
284 | let forward_pos = matching.map(|m| m.0 - offset);
285 | let reverse_pos = matching.map(|m| m.1 - offset);
286 |
287 | assert_eq!(
288 | inv_memchr(byte, &realigned),
289 | forward_pos,
290 | "inv_memchr when searching (realigned by {}) for {:?} in {:?}",
291 | offset,
292 | byte as char,
293 | realigned.as_bstr(),
294 | );
295 | assert_eq!(
296 | inv_memrchr(byte, &realigned),
297 | reverse_pos,
298 | "inv_memrchr when searching (realigned by {}) for {:?} in {:?}",
299 | offset,
300 | byte as char,
301 | realigned.as_bstr(),
302 | );
303 | }
304 | }
305 | }
306 | }
307 | -------------------------------------------------------------------------------- /src/escape_bytes.rs: --------------------------------------------------------------------------------
1 | /// An iterator of `char` values that represent an escaping of arbitrary bytes.
2 | ///
3 | /// The lifetime parameter `'a` refers to the lifetime of the bytes being
4 | /// escaped.
5 | ///
6 | /// This iterator is created by the
7 | /// [`ByteSlice::escape_bytes`](crate::ByteSlice::escape_bytes) method.
8 | #[derive(Clone, Debug)]
9 | pub struct EscapeBytes<'a> {
10 | remaining: &'a [u8],
11 | state: EscapeState,
12 | }
13 |
14 | impl<'a> EscapeBytes<'a> {
15 | pub(crate) fn new(bytes: &'a [u8]) -> EscapeBytes<'a> {
16 | EscapeBytes { remaining: bytes, state: EscapeState::Start }
17 | }
18 | }
19 |
20 | impl<'a> Iterator for EscapeBytes<'a> {
21 | type Item = char;
22 |
23 | #[inline]
24 | fn next(&mut self) -> Option<char> {
25 | use self::EscapeState::*;
26 |
27 | match self.state {
28 | Start => {
29 | let byte = match crate::decode_utf8(self.remaining) {
30 | (None, 0) => return None,
31 | // If we see invalid UTF-8 or ASCII, then we always just
32 | // peel one byte off. If it's printable ASCII, we'll pass
33 | // it through as-is below. Otherwise, below, it will get
34 | // escaped in some way.
35 |                     (None, _) | (Some(_), 1) => {
36 |                         let byte = self.remaining[0];
37 |                         self.remaining = &self.remaining[1..];
38 |                         byte
39 |                     }
40 |                     // For any valid UTF-8 that is not ASCII, we pass it
41 |                     // through as-is. We don't do any Unicode escaping.
42 |                     (Some(ch), size) => {
43 |                         self.remaining = &self.remaining[size..];
44 |                         return Some(ch);
45 |                     }
46 |                 };
47 |                 self.state = match byte {
48 |                     0x21..=0x5B | 0x5D..=0x7E => {
49 |                         return Some(char::from(byte))
50 |                     }
51 |                     b'\0' => SpecialEscape('0'),
52 |                     b'\n' => SpecialEscape('n'),
53 |                     b'\r' => SpecialEscape('r'),
54 |                     b'\t' => SpecialEscape('t'),
55 |                     b'\\' => SpecialEscape('\\'),
56 |                     _ => HexEscapeX(byte),
57 |                 };
58 |                 Some('\\')
59 |             }
60 |             SpecialEscape(ch) => {
61 |                 self.state = Start;
62 |                 Some(ch)
63 |             }
64 |             HexEscapeX(byte) => {
65 |                 self.state = HexEscapeHighNybble(byte);
66 |                 Some('x')
67 |             }
68 |             HexEscapeHighNybble(byte) => {
69 |                 self.state = HexEscapeLowNybble(byte);
70 |                 let nybble = byte >> 4;
71 |                 Some(hexdigit_to_char(nybble))
72 |             }
73 |             HexEscapeLowNybble(byte) => {
74 |                 self.state = Start;
75 |                 let nybble = byte & 0xF;
76 |                 Some(hexdigit_to_char(nybble))
77 |             }
78 |         }
79 |     }
80 | }
81 | 
82 | impl<'a> core::fmt::Display for EscapeBytes<'a> {
83 |     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
84 |         use core::fmt::Write;
85 |         for ch in self.clone() {
86 |             f.write_char(ch)?;
87 |         }
88 |         Ok(())
89 |     }
90 | }
91 | 
92 | /// The state used by the FSM in the escaping iterator.
93 | #[derive(Clone, Debug)]
94 | enum EscapeState {
95 |     /// Read and remove the next byte from 'remaining'. If 'remaining' is
96 |     /// empty, then return None. Otherwise, escape the byte according to the
97 |     /// following rules or emit it as-is.
98 |     ///
99 |     /// If it's \n, \r, \t, \\ or \0, then emit a '\' and set the current
100 |     /// state to 'SpecialEscape(n | r | t | \ | 0)'. Otherwise, if the 'byte'
101 |     /// is not in [\x21-\x5B\x5D-\x7E], then emit a '\' and set the state
102 |     /// to 'HexEscapeX(byte)'.
103 |     Start,
104 |     /// Emit the given codepoint as is. This assumes '\' has just been emitted.
105 |     /// Then set the state to 'Start'.
106 |     SpecialEscape(char),
107 |     /// Emit the 'x' part of a hex escape. This assumes '\' has just been
108 |     /// emitted. Then set the state to 'HexEscapeHighNybble(byte)'.
109 |     HexEscapeX(u8),
110 |     /// Emit the high nybble of the byte as a hexadecimal digit. This
111 |     /// assumes '\x' has just been emitted. Then set the state to
112 |     /// 'HexEscapeLowNybble(byte)'.
113 |     HexEscapeHighNybble(u8),
114 |     /// Emit the low nybble of the byte as a hexadecimal digit. This assumes
115 |     /// '\xZ' has just been emitted, where 'Z' is the high nybble of this byte.
116 |     /// Then set the state to 'Start'.
117 |     HexEscapeLowNybble(u8),
118 | }
119 | 
120 | /// An iterator of `u8` values that represent an unescaping of a sequence of
121 | /// codepoints.
122 | ///
123 | /// The type parameter `I` refers to the iterator of codepoints that is
124 | /// unescaped.
125 | ///
126 | /// Currently this iterator is not exposed in the crate API, and instead all
127 | /// we expose is a `ByteVec::unescape` method, which of course requires an
128 | /// alloc. That's the most convenient form of this, but in theory, we could
129 | /// expose this for core-only use cases too. I'm just not quite sure what the
130 | /// API should be.
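///
/// For example (mirroring the `unescape` tests below), feeding the `char`s
/// of `r"\x61\n\i"` through this iterator yields the bytes `b"a\n\\i"`:
/// recognized escapes are decoded, while invalid or incomplete escape
/// sequences unescape as themselves.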
131 | #[derive(Clone, Debug)]
132 | #[cfg(feature = "alloc")]
133 | pub(crate) struct UnescapeBytes<I> {
134 |     it: I,
135 |     state: UnescapeState,
136 | }
137 | 
138 | #[cfg(feature = "alloc")]
139 | impl<I: Iterator<Item = char>> UnescapeBytes<I> {
140 |     pub(crate) fn new<T: IntoIterator<IntoIter = I>>(
141 |         t: T,
142 |     ) -> UnescapeBytes<I> {
143 |         UnescapeBytes { it: t.into_iter(), state: UnescapeState::Start }
144 |     }
145 | }
146 | 
147 | #[cfg(feature = "alloc")]
148 | impl<I: Iterator<Item = char>> Iterator for UnescapeBytes<I> {
149 |     type Item = u8;
150 | 
151 |     fn next(&mut self) -> Option<u8> {
152 |         use self::UnescapeState::*;
153 | 
154 |         loop {
155 |             match self.state {
156 |                 Start => {
157 |                     let ch = self.it.next()?;
158 |                     match ch {
159 |                         '\\' => {
160 |                             self.state = Escape;
161 |                         }
162 |                         ch => {
163 |                             self.state = UnescapeState::bytes(&[], ch);
164 |                         }
165 |                     }
166 |                 }
167 |                 Bytes { buf, mut cur, len } => {
168 |                     let byte = buf[cur];
169 |                     cur += 1;
170 |                     if cur >= len {
171 |                         self.state = Start;
172 |                     } else {
173 |                         self.state = Bytes { buf, cur, len };
174 |                     }
175 |                     return Some(byte);
176 |                 }
177 |                 Escape => {
178 |                     let ch = match self.it.next() {
179 |                         Some(ch) => ch,
180 |                         None => {
181 |                             self.state = Start;
182 |                             // Incomplete escape sequences unescape as
183 |                             // themselves.
184 |                             return Some(b'\\');
185 |                         }
186 |                     };
187 |                     match ch {
188 |                         '0' => {
189 |                             self.state = Start;
190 |                             return Some(b'\x00');
191 |                         }
192 |                         '\\' => {
193 |                             self.state = Start;
194 |                             return Some(b'\\');
195 |                         }
196 |                         'r' => {
197 |                             self.state = Start;
198 |                             return Some(b'\r');
199 |                         }
200 |                         'n' => {
201 |                             self.state = Start;
202 |                             return Some(b'\n');
203 |                         }
204 |                         't' => {
205 |                             self.state = Start;
206 |                             return Some(b'\t');
207 |                         }
208 |                         'x' => {
209 |                             self.state = HexFirst;
210 |                         }
211 |                         ch => {
212 |                             // An invalid escape sequence unescapes as itself.
213 |                             self.state = UnescapeState::bytes(&[b'\\'], ch);
214 |                         }
215 |                     }
216 |                 }
217 |                 HexFirst => {
218 |                     let ch = match self.it.next() {
219 |                         Some(ch) => ch,
220 |                         None => {
221 |                             // An incomplete escape sequence unescapes as
222 |                             // itself.
223 |                             self.state = UnescapeState::bytes_raw(&[b'x']);
224 |                             return Some(b'\\');
225 |                         }
226 |                     };
227 |                     match ch {
228 |                         '0'..='9' | 'A'..='F' | 'a'..='f' => {
229 |                             self.state = HexSecond(ch);
230 |                         }
231 |                         ch => {
232 |                             // An invalid escape sequence unescapes as itself.
233 |                             self.state = UnescapeState::bytes(&[b'x'], ch);
234 |                             return Some(b'\\');
235 |                         }
236 |                     }
237 |                 }
238 |                 HexSecond(first) => {
239 |                     let second = match self.it.next() {
240 |                         Some(ch) => ch,
241 |                         None => {
242 |                             // An incomplete escape sequence unescapes as
243 |                             // itself.
244 |                             self.state = UnescapeState::bytes(&[b'x'], first);
245 |                             return Some(b'\\');
246 |                         }
247 |                     };
248 |                     match second {
249 |                         '0'..='9' | 'A'..='F' | 'a'..='f' => {
250 |                             self.state = Start;
251 |                             let hinybble = char_to_hexdigit(first);
252 |                             let lonybble = char_to_hexdigit(second);
253 |                             let byte = hinybble << 4 | lonybble;
254 |                             return Some(byte);
255 |                         }
256 |                         ch => {
257 |                             // An invalid escape sequence unescapes as itself.
258 |                             self.state =
259 |                                 UnescapeState::bytes2(&[b'x'], first, ch);
260 |                             return Some(b'\\');
261 |                         }
262 |                     }
263 |                 }
264 |             }
265 |         }
266 |     }
267 | }
268 | 
269 | /// The state used by the FSM in the unescaping iterator.
270 | #[derive(Clone, Debug)]
271 | #[cfg(feature = "alloc")]
272 | enum UnescapeState {
273 |     /// The start state. Look for an escape sequence, otherwise emit the next
274 |     /// codepoint as-is.
275 |     Start,
276 |     /// Emit the byte at `buf[cur]`.
277 |     ///
278 |     /// This state should never be created when `cur >= len`.
That is, when 279 | /// this state is visited, it is assumed that `cur < len`. 280 | Bytes { buf: [u8; 11], cur: usize, len: usize }, 281 | /// This state is entered after a `\` is seen. 282 | Escape, 283 | /// This state is entered after a `\x` is seen. 284 | HexFirst, 285 | /// This state is entered after a `\xN` is seen, where `N` is in 286 | /// `[0-9A-Fa-f]`. The given codepoint corresponds to `N`. 287 | HexSecond(char), 288 | } 289 | 290 | #[cfg(feature = "alloc")] 291 | impl UnescapeState { 292 | /// Create a new `Bytes` variant with the given slice. 293 | /// 294 | /// # Panics 295 | /// 296 | /// Panics if `bytes.len() > 11`. 297 | fn bytes_raw(bytes: &[u8]) -> UnescapeState { 298 | // This can be increased, you just need to make sure 'buf' in the 299 | // 'Bytes' state has enough room. 300 | assert!(bytes.len() <= 11, "no more than 11 bytes allowed"); 301 | let mut buf = [0; 11]; 302 | buf[..bytes.len()].copy_from_slice(bytes); 303 | UnescapeState::Bytes { buf, cur: 0, len: bytes.len() } 304 | } 305 | 306 | /// Create a new `Bytes` variant with the prefix byte slice, followed by 307 | /// the UTF-8 encoding of the given char. 308 | /// 309 | /// # Panics 310 | /// 311 | /// Panics if `prefix.len() > 3`. 312 | fn bytes(prefix: &[u8], ch: char) -> UnescapeState { 313 | // This can be increased, you just need to make sure 'buf' in the 314 | // 'Bytes' state has enough room. 315 | assert!(prefix.len() <= 3, "no more than 3 bytes allowed"); 316 | let mut buf = [0; 11]; 317 | buf[..prefix.len()].copy_from_slice(prefix); 318 | let chlen = ch.encode_utf8(&mut buf[prefix.len()..]).len(); 319 | UnescapeState::Bytes { buf, cur: 0, len: prefix.len() + chlen } 320 | } 321 | 322 | /// Create a new `Bytes` variant with the prefix byte slice, followed by 323 | /// the UTF-8 encoding of `ch1` and then `ch2`. 324 | /// 325 | /// # Panics 326 | /// 327 | /// Panics if `prefix.len() > 3`. 328 | fn bytes2(prefix: &[u8], ch1: char, ch2: char) -> UnescapeState { 329 | // This can be increased, you just need to make sure 'buf' in the 330 | // 'Bytes' state has enough room. 331 | assert!(prefix.len() <= 3, "no more than 3 bytes allowed"); 332 | let mut buf = [0; 11]; 333 | buf[..prefix.len()].copy_from_slice(prefix); 334 | let len1 = ch1.encode_utf8(&mut buf[prefix.len()..]).len(); 335 | let len2 = ch2.encode_utf8(&mut buf[prefix.len() + len1..]).len(); 336 | UnescapeState::Bytes { buf, cur: 0, len: prefix.len() + len1 + len2 } 337 | } 338 | } 339 | 340 | /// Convert the given codepoint to its corresponding hexadecimal digit. 341 | /// 342 | /// # Panics 343 | /// 344 | /// This panics if `ch` is not in `[0-9A-Fa-f]`. 345 | #[cfg(feature = "alloc")] 346 | fn char_to_hexdigit(ch: char) -> u8 { 347 | u8::try_from(ch.to_digit(16).unwrap()).unwrap() 348 | } 349 | 350 | /// Convert the given hexadecimal digit to its corresponding codepoint. 351 | /// 352 | /// # Panics 353 | /// 354 | /// This panics when `digit > 15`. 
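/// For example, `hexdigit_to_char(0)` yields `'0'` and `hexdigit_to_char(15)`
/// yields `'F'`; digits are uppercased, matching the `\x7F`-style escapes in
/// the tests below.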
355 | fn hexdigit_to_char(digit: u8) -> char {
356 |     char::from_digit(u32::from(digit), 16).unwrap().to_ascii_uppercase()
357 | }
358 | 
359 | #[cfg(all(test, feature = "std"))]
360 | mod tests {
361 |     use alloc::string::{String, ToString};
362 | 
363 |     use crate::BString;
364 | 
365 |     use super::*;
366 | 
367 |     #[allow(non_snake_case)]
368 |     fn B<B: AsRef<[u8]>>(bytes: B) -> BString {
369 |         BString::from(bytes.as_ref())
370 |     }
371 | 
372 |     fn e<B: AsRef<[u8]>>(bytes: B) -> String {
373 |         EscapeBytes::new(bytes.as_ref()).to_string()
374 |     }
375 | 
376 |     fn u(string: &str) -> BString {
377 |         UnescapeBytes::new(string.chars()).collect()
378 |     }
379 | 
380 |     #[test]
381 |     fn escape() {
382 |         assert_eq!(r"a", e(br"a"));
383 |         assert_eq!(r"\\x61", e(br"\x61"));
384 |         assert_eq!(r"a", e(b"\x61"));
385 |         assert_eq!(r"~", e(b"\x7E"));
386 |         assert_eq!(r"\x7F", e(b"\x7F"));
387 | 
388 |         assert_eq!(r"\n", e(b"\n"));
389 |         assert_eq!(r"\r", e(b"\r"));
390 |         assert_eq!(r"\t", e(b"\t"));
391 |         assert_eq!(r"\\", e(b"\\"));
392 |         assert_eq!(r"\0", e(b"\0"));
393 |         assert_eq!(r"\0", e(b"\x00"));
394 | 
395 |         assert_eq!(r"\x88", e(b"\x88"));
396 |         assert_eq!(r"\x8F", e(b"\x8F"));
397 |         assert_eq!(r"\xF8", e(b"\xF8"));
398 |         assert_eq!(r"\xFF", e(b"\xFF"));
399 | 
400 |         assert_eq!(r"\xE2", e(b"\xE2"));
401 |         assert_eq!(r"\xE2\x98", e(b"\xE2\x98"));
402 |         assert_eq!(r"☃", e(b"\xE2\x98\x83"));
403 | 
404 |         assert_eq!(r"\xF0", e(b"\xF0"));
405 |         assert_eq!(r"\xF0\x9F", e(b"\xF0\x9F"));
406 |         assert_eq!(r"\xF0\x9F\x92", e(b"\xF0\x9F\x92"));
407 |         assert_eq!(r"💩", e(b"\xF0\x9F\x92\xA9"));
408 |     }
409 | 
410 |     #[test]
411 |     fn unescape() {
412 |         assert_eq!(B(r"a"), u(r"a"));
413 |         assert_eq!(B(r"\x61"), u(r"\\x61"));
414 |         assert_eq!(B(r"a"), u(r"\x61"));
415 |         assert_eq!(B(r"~"), u(r"\x7E"));
416 |         assert_eq!(B(b"\x7F"), u(r"\x7F"));
417 | 
418 |         assert_eq!(B(b"\n"), u(r"\n"));
419 |         assert_eq!(B(b"\r"), u(r"\r"));
420 |         assert_eq!(B(b"\t"), u(r"\t"));
421 |         assert_eq!(B(b"\\"), u(r"\\"));
422 |         assert_eq!(B(b"\0"), u(r"\0"));
423 |         assert_eq!(B(b"\0"), u(r"\x00"));
424 | 
425 |         assert_eq!(B(b"\x88"), u(r"\x88"));
426 |         assert_eq!(B(b"\x8F"), u(r"\x8F"));
427 |         assert_eq!(B(b"\xF8"), u(r"\xF8"));
428 |         assert_eq!(B(b"\xFF"), u(r"\xFF"));
429 | 
430 |         assert_eq!(B(b"\xE2"), u(r"\xE2"));
431 |         assert_eq!(B(b"\xE2\x98"), u(r"\xE2\x98"));
432 |         assert_eq!(B("☃"), u(r"\xE2\x98\x83"));
433 | 
434 |         assert_eq!(B(b"\xF0"), u(r"\xf0"));
435 |         assert_eq!(B(b"\xF0\x9F"), u(r"\xf0\x9f"));
436 |         assert_eq!(B(b"\xF0\x9F\x92"), u(r"\xf0\x9f\x92"));
437 |         assert_eq!(B("💩"), u(r"\xf0\x9f\x92\xa9"));
438 |     }
439 | 
440 |     #[test]
441 |     fn unescape_weird() {
442 |         assert_eq!(B(b"\\"), u(r"\"));
443 |         assert_eq!(B(b"\\"), u(r"\\"));
444 |         assert_eq!(B(b"\\x"), u(r"\x"));
445 |         assert_eq!(B(b"\\xA"), u(r"\xA"));
446 | 
447 |         assert_eq!(B(b"\\xZ"), u(r"\xZ"));
448 |         assert_eq!(B(b"\\xZZ"), u(r"\xZZ"));
449 |         assert_eq!(B(b"\\i"), u(r"\i"));
450 |         assert_eq!(B(b"\\u"), u(r"\u"));
451 |         assert_eq!(B(b"\\u{2603}"), u(r"\u{2603}"));
452 |     }
453 | }
454 | 
--------------------------------------------------------------------------------
/src/io.rs:
--------------------------------------------------------------------------------
1 | /*!
2 | Utilities for working with I/O using byte strings.
3 | 
4 | This module currently only exports a single trait, `BufReadExt`, which
5 | provides facilities for conveniently and efficiently working with lines as
6 | byte strings.
7 | 
8 | More APIs may be added in the future.
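
For example, here's a quick sketch of counting lines without requiring valid
UTF-8 (the same pattern as the `BufReadExt::for_byte_line` example below):

```
use std::io;

use bstr::io::BufReadExt;

# fn example() -> Result<(), io::Error> {
let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");

let mut count = 0;
cursor.for_byte_line(|_line| {
    // Returning Ok(true) continues iteration; Ok(false) would stop it.
    count += 1;
    Ok(true)
})?;
assert_eq!(count, 3);
# Ok(()) }; example().unwrap()
```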
9 | */
10 | 
11 | use alloc::{vec, vec::Vec};
12 | 
13 | use std::io;
14 | 
15 | use crate::{ext_slice::ByteSlice, ext_vec::ByteVec};
16 | 
17 | /// An extension trait for
18 | /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
19 | /// which provides convenience APIs for dealing with byte strings.
20 | pub trait BufReadExt: io::BufRead {
21 |     /// Returns an iterator over the lines of this reader, where each line
22 |     /// is represented as a byte string.
23 |     ///
24 |     /// Each item yielded by this iterator is an `io::Result<Vec<u8>>`, where
25 |     /// an error is yielded if there was a problem reading from the underlying
26 |     /// reader.
27 |     ///
28 |     /// On success, the next line in the iterator is returned. The line does
29 |     /// *not* contain a trailing `\n` or `\r\n`.
30 |     ///
31 |     /// # Examples
32 |     ///
33 |     /// Basic usage:
34 |     ///
35 |     /// ```
36 |     /// use std::io;
37 |     ///
38 |     /// use bstr::io::BufReadExt;
39 |     ///
40 |     /// # fn example() -> Result<(), io::Error> {
41 |     /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
42 |     ///
43 |     /// let mut lines = vec![];
44 |     /// for result in cursor.byte_lines() {
45 |     ///     let line = result?;
46 |     ///     lines.push(line);
47 |     /// }
48 |     /// assert_eq!(lines.len(), 3);
49 |     /// assert_eq!(lines[0], "lorem".as_bytes());
50 |     /// assert_eq!(lines[1], "ipsum".as_bytes());
51 |     /// assert_eq!(lines[2], "dolor".as_bytes());
52 |     /// # Ok(()) }; example().unwrap()
53 |     /// ```
54 |     fn byte_lines(self) -> ByteLines<Self>
55 |     where
56 |         Self: Sized,
57 |     {
58 |         ByteLines { buf: self }
59 |     }
60 | 
61 |     /// Returns an iterator over byte-terminated records of this reader, where
62 |     /// each record is represented as a byte string.
63 |     ///
64 |     /// Each item yielded by this iterator is an `io::Result<Vec<u8>>`, where
65 |     /// an error is yielded if there was a problem reading from the underlying
66 |     /// reader.
67 |     ///
68 |     /// On success, the next record in the iterator is returned. The record
69 |     /// does *not* contain its trailing terminator.
70 |     ///
71 |     /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in
72 |     /// that it has no special handling for `\r`.
73 |     ///
74 |     /// # Examples
75 |     ///
76 |     /// Basic usage:
77 |     ///
78 |     /// ```
79 |     /// use std::io;
80 |     ///
81 |     /// use bstr::io::BufReadExt;
82 |     ///
83 |     /// # fn example() -> Result<(), io::Error> {
84 |     /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
85 |     ///
86 |     /// let mut records = vec![];
87 |     /// for result in cursor.byte_records(b'\x00') {
88 |     ///     let record = result?;
89 |     ///     records.push(record);
90 |     /// }
91 |     /// assert_eq!(records.len(), 3);
92 |     /// assert_eq!(records[0], "lorem".as_bytes());
93 |     /// assert_eq!(records[1], "ipsum".as_bytes());
94 |     /// assert_eq!(records[2], "dolor".as_bytes());
95 |     /// # Ok(()) }; example().unwrap()
96 |     /// ```
97 |     fn byte_records(self, terminator: u8) -> ByteRecords<Self>
98 |     where
99 |         Self: Sized,
100 |     {
101 |         ByteRecords { terminator, buf: self }
102 |     }
103 | 
104 |     /// Executes the given closure on each line in the underlying reader.
105 |     ///
106 |     /// If the closure returns an error (or if the underlying reader returns an
107 |     /// error), then iteration is stopped and the error is returned. If false
108 |     /// is returned, then iteration is stopped and no error is returned.
109 |     ///
110 |     /// The closure given is called on exactly the same values as yielded by
111 |     /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
112 |     /// iterator.
Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes.
113 |     ///
114 |     /// This routine is useful for iterating over lines as quickly as
115 |     /// possible. Namely, a single allocation is reused for each line.
116 |     ///
117 |     /// # Examples
118 |     ///
119 |     /// Basic usage:
120 |     ///
121 |     /// ```
122 |     /// use std::io;
123 |     ///
124 |     /// use bstr::io::BufReadExt;
125 |     ///
126 |     /// # fn example() -> Result<(), io::Error> {
127 |     /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
128 |     ///
129 |     /// let mut lines = vec![];
130 |     /// cursor.for_byte_line(|line| {
131 |     ///     lines.push(line.to_vec());
132 |     ///     Ok(true)
133 |     /// })?;
134 |     /// assert_eq!(lines.len(), 3);
135 |     /// assert_eq!(lines[0], "lorem".as_bytes());
136 |     /// assert_eq!(lines[1], "ipsum".as_bytes());
137 |     /// assert_eq!(lines[2], "dolor".as_bytes());
138 |     /// # Ok(()) }; example().unwrap()
139 |     /// ```
140 |     fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()>
141 |     where
142 |         Self: Sized,
143 |         F: FnMut(&[u8]) -> io::Result<bool>,
144 |     {
145 |         self.for_byte_line_with_terminator(|line| {
146 |             for_each_line(trim_line_slice(line))
147 |         })
148 |     }
149 | 
150 |     /// Executes the given closure on each byte-terminated record in the
151 |     /// underlying reader.
152 |     ///
153 |     /// If the closure returns an error (or if the underlying reader returns an
154 |     /// error), then iteration is stopped and the error is returned. If false
155 |     /// is returned, then iteration is stopped and no error is returned.
156 |     ///
157 |     /// The closure given is called on exactly the same values as yielded by
158 |     /// the [`byte_records`](trait.BufReadExt.html#method.byte_records)
159 |     /// iterator. Namely, records do _not_ contain a trailing terminator byte.
160 |     ///
161 |     /// This routine is useful for iterating over records as quickly as
162 |     /// possible. Namely, a single allocation is reused for each record.
163 |     ///
164 |     /// # Examples
165 |     ///
166 |     /// Basic usage:
167 |     ///
168 |     /// ```
169 |     /// use std::io;
170 |     ///
171 |     /// use bstr::io::BufReadExt;
172 |     ///
173 |     /// # fn example() -> Result<(), io::Error> {
174 |     /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
175 |     ///
176 |     /// let mut records = vec![];
177 |     /// cursor.for_byte_record(b'\x00', |record| {
178 |     ///     records.push(record.to_vec());
179 |     ///     Ok(true)
180 |     /// })?;
181 |     /// assert_eq!(records.len(), 3);
182 |     /// assert_eq!(records[0], "lorem".as_bytes());
183 |     /// assert_eq!(records[1], "ipsum".as_bytes());
184 |     /// assert_eq!(records[2], "dolor".as_bytes());
185 |     /// # Ok(()) }; example().unwrap()
186 |     /// ```
187 |     fn for_byte_record<F>(
188 |         &mut self,
189 |         terminator: u8,
190 |         mut for_each_record: F,
191 |     ) -> io::Result<()>
192 |     where
193 |         Self: Sized,
194 |         F: FnMut(&[u8]) -> io::Result<bool>,
195 |     {
196 |         self.for_byte_record_with_terminator(terminator, |chunk| {
197 |             for_each_record(trim_record_slice(chunk, terminator))
198 |         })
199 |     }
200 | 
201 |     /// Executes the given closure on each line in the underlying reader.
202 |     ///
203 |     /// If the closure returns an error (or if the underlying reader returns an
204 |     /// error), then iteration is stopped and the error is returned. If false
205 |     /// is returned, then iteration is stopped and no error is returned.
206 |     ///
207 |     /// Unlike
208 |     /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line),
209 |     /// the lines given to the closure *do* include the line terminator, if one
210 |     /// exists.
211 |     ///
212 |     /// This routine is useful for iterating over lines as quickly as
213 |     /// possible. Namely, a single allocation is reused for each line.
214 |     ///
215 |     /// This is identical to `for_byte_record_with_terminator` with a
216 |     /// terminator of `\n`.
217 |     ///
218 |     /// # Examples
219 |     ///
220 |     /// Basic usage:
221 |     ///
222 |     /// ```
223 |     /// use std::io;
224 |     ///
225 |     /// use bstr::io::BufReadExt;
226 |     ///
227 |     /// # fn example() -> Result<(), io::Error> {
228 |     /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
229 |     ///
230 |     /// let mut lines = vec![];
231 |     /// cursor.for_byte_line_with_terminator(|line| {
232 |     ///     lines.push(line.to_vec());
233 |     ///     Ok(true)
234 |     /// })?;
235 |     /// assert_eq!(lines.len(), 3);
236 |     /// assert_eq!(lines[0], "lorem\n".as_bytes());
237 |     /// assert_eq!(lines[1], "ipsum\r\n".as_bytes());
238 |     /// assert_eq!(lines[2], "dolor".as_bytes());
239 |     /// # Ok(()) }; example().unwrap()
240 |     /// ```
241 |     fn for_byte_line_with_terminator<F>(
242 |         &mut self,
243 |         for_each_line: F,
244 |     ) -> io::Result<()>
245 |     where
246 |         Self: Sized,
247 |         F: FnMut(&[u8]) -> io::Result<bool>,
248 |     {
249 |         self.for_byte_record_with_terminator(b'\n', for_each_line)
250 |     }
251 | 
252 |     /// Executes the given closure on each byte-terminated record in the
253 |     /// underlying reader.
254 |     ///
255 |     /// If the closure returns an error (or if the underlying reader returns an
256 |     /// error), then iteration is stopped and the error is returned. If false
257 |     /// is returned, then iteration is stopped and no error is returned.
258 |     ///
259 |     /// Unlike
260 |     /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record),
261 |     /// the lines given to the closure *do* include the record terminator, if
262 |     /// one exists.
263 |     ///
264 |     /// This routine is useful for iterating over records as quickly as
265 |     /// possible. Namely, a single allocation is reused for each record.
266 |     ///
267 |     /// # Examples
268 |     ///
269 |     /// Basic usage:
270 |     ///
271 |     /// ```
272 |     /// use std::io;
273 |     ///
274 |     /// use bstr::{io::BufReadExt, B};
275 |     ///
276 |     /// # fn example() -> Result<(), io::Error> {
277 |     /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
278 |     ///
279 |     /// let mut records = vec![];
280 |     /// cursor.for_byte_record_with_terminator(b'\x00', |record| {
281 |     ///     records.push(record.to_vec());
282 |     ///     Ok(true)
283 |     /// })?;
284 |     /// assert_eq!(records.len(), 3);
285 |     /// assert_eq!(records[0], B(b"lorem\x00"));
286 |     /// assert_eq!(records[1], B("ipsum\x00"));
287 |     /// assert_eq!(records[2], B("dolor"));
288 |     /// # Ok(()) }; example().unwrap()
289 |     /// ```
290 |     fn for_byte_record_with_terminator<F>(
291 |         &mut self,
292 |         terminator: u8,
293 |         mut for_each_record: F,
294 |     ) -> io::Result<()>
295 |     where
296 |         Self: Sized,
297 |         F: FnMut(&[u8]) -> io::Result<bool>,
298 |     {
299 |         let mut bytes = vec![];
300 |         let mut res = Ok(());
301 |         let mut consumed = 0;
302 |         'outer: loop {
303 |             // Lend out complete record slices from our buffer
304 |             {
305 |                 let mut buf = self.fill_buf()?;
306 |                 if buf.is_empty() {
307 |                     break;
308 |                 }
309 |                 while let Some(index) = buf.find_byte(terminator) {
310 |                     let (record, rest) = buf.split_at(index + 1);
311 |                     buf = rest;
312 |                     consumed += record.len();
313 |                     match for_each_record(record) {
314 |                         Ok(false) => break 'outer,
315 |                         Err(err) => {
316 |                             res = Err(err);
317 |                             break 'outer;
318 |                         }
319 |                         _ => (),
320 |                     }
321 |                 }
322 | 
323 |                 // Copy the final record fragment to our local buffer. This
324 |                 // saves read_until() from re-scanning a buffer we know
325 |                 // contains no remaining terminators.
326 |                 bytes.extend_from_slice(buf);
327 |                 consumed += buf.len();
328 |             }
329 | 
330 |             self.consume(consumed);
331 |             consumed = 0;
332 | 
333 |             // N.B. read_until uses a different version of memchr that may
334 |             // be slower than the memchr crate that bstr uses. However, this
335 |             // should only run for a fairly small number of records, assuming a
336 |             // decent buffer size.
337 |             self.read_until(terminator, &mut bytes)?;
338 |             if bytes.is_empty() || !for_each_record(&bytes)? {
339 |                 break;
340 |             }
341 |             bytes.clear();
342 |         }
343 |         self.consume(consumed);
344 |         res
345 |     }
346 | }
347 | 
348 | impl<B: io::BufRead> BufReadExt for B {}
349 | 
350 | /// An iterator over lines from an instance of
351 | /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
352 | ///
353 | /// This iterator is generally created by calling the
354 | /// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
355 | /// method on the
356 | /// [`BufReadExt`](trait.BufReadExt.html)
357 | /// trait.
358 | #[derive(Debug)]
359 | pub struct ByteLines<B> {
360 |     buf: B,
361 | }
362 | 
363 | /// An iterator over records from an instance of
364 | /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
365 | ///
366 | /// A byte record is any sequence of bytes terminated by a particular byte
367 | /// chosen by the caller. For example, NUL separated byte strings are said to
368 | /// be NUL-terminated byte records.
369 | ///
370 | /// This iterator is generally created by calling the
371 | /// [`byte_records`](trait.BufReadExt.html#method.byte_records)
372 | /// method on the
373 | /// [`BufReadExt`](trait.BufReadExt.html)
374 | /// trait.
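///
/// A small usage sketch, adapted from the `byte_records` example above:
///
/// ```
/// use std::io;
///
/// use bstr::io::BufReadExt;
///
/// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
/// let records: Vec<Vec<u8>> = cursor
///     .byte_records(b'\x00')
///     .collect::<Result<_, io::Error>>()
///     .unwrap();
/// assert_eq!(
///     records,
///     vec![b"lorem".to_vec(), b"ipsum".to_vec(), b"dolor".to_vec()],
/// );
/// ```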
375 | #[derive(Debug)]
376 | pub struct ByteRecords<B> {
377 |     buf: B,
378 |     terminator: u8,
379 | }
380 | 
381 | impl<B: io::BufRead> Iterator for ByteLines<B> {
382 |     type Item = io::Result<Vec<u8>>;
383 | 
384 |     fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
385 |         let mut bytes = vec![];
386 |         match self.buf.read_until(b'\n', &mut bytes) {
387 |             Err(e) => Some(Err(e)),
388 |             Ok(0) => None,
389 |             Ok(_) => {
390 |                 trim_line(&mut bytes);
391 |                 Some(Ok(bytes))
392 |             }
393 |         }
394 |     }
395 | }
396 | 
397 | impl<B: io::BufRead> Iterator for ByteRecords<B> {
398 |     type Item = io::Result<Vec<u8>>;
399 | 
400 |     fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
401 |         let mut bytes = vec![];
402 |         match self.buf.read_until(self.terminator, &mut bytes) {
403 |             Err(e) => Some(Err(e)),
404 |             Ok(0) => None,
405 |             Ok(_) => {
406 |                 trim_record(&mut bytes, self.terminator);
407 |                 Some(Ok(bytes))
408 |             }
409 |         }
410 |     }
411 | }
412 | 
413 | fn trim_line(line: &mut Vec<u8>) {
414 |     if line.last_byte() == Some(b'\n') {
415 |         line.pop_byte();
416 |         if line.last_byte() == Some(b'\r') {
417 |             line.pop_byte();
418 |         }
419 |     }
420 | }
421 | 
422 | fn trim_line_slice(mut line: &[u8]) -> &[u8] {
423 |     if line.last_byte() == Some(b'\n') {
424 |         line = &line[..line.len() - 1];
425 |         if line.last_byte() == Some(b'\r') {
426 |             line = &line[..line.len() - 1];
427 |         }
428 |     }
429 |     line
430 | }
431 | 
432 | fn trim_record(record: &mut Vec<u8>, terminator: u8) {
433 |     if record.last_byte() == Some(terminator) {
434 |         record.pop_byte();
435 |     }
436 | }
437 | 
438 | fn trim_record_slice(mut record: &[u8], terminator: u8) -> &[u8] {
439 |     if record.last_byte() == Some(terminator) {
440 |         record = &record[..record.len() - 1];
441 |     }
442 |     record
443 | }
444 | 
445 | #[cfg(all(test, feature = "std"))]
446 | mod tests {
447 |     use alloc::{vec, vec::Vec};
448 | 
449 |     use crate::bstring::BString;
450 | 
451 |     use super::BufReadExt;
452 | 
453 |     fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
454 |         let mut lines = vec![];
455 |         slice
456 |             .as_ref()
457 |             .for_byte_line(|line| {
458 |                 lines.push(BString::from(line.to_vec()));
459 |                 Ok(true)
460 |             })
461 |             .unwrap();
462 |         lines
463 |     }
464 | 
465 |     fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
466 |         let mut lines = vec![];
467 |         slice
468 |             .as_ref()
469 |             .for_byte_line_with_terminator(|line| {
470 |                 lines.push(BString::from(line.to_vec()));
471 |                 Ok(true)
472 |             })
473 |             .unwrap();
474 |         lines
475 |     }
476 | 
477 |     #[test]
478 |     fn lines_without_terminator() {
479 |         assert_eq!(collect_lines(""), Vec::<BString>::new());
480 | 
481 |         assert_eq!(collect_lines("\n"), vec![""]);
482 |         assert_eq!(collect_lines("\n\n"), vec!["", ""]);
483 |         assert_eq!(collect_lines("a\nb\n"), vec!["a", "b"]);
484 |         assert_eq!(collect_lines("a\nb"), vec!["a", "b"]);
485 |         assert_eq!(collect_lines("abc\nxyz\n"), vec!["abc", "xyz"]);
486 |         assert_eq!(collect_lines("abc\nxyz"), vec!["abc", "xyz"]);
487 | 
488 |         assert_eq!(collect_lines("\r\n"), vec![""]);
489 |         assert_eq!(collect_lines("\r\n\r\n"), vec!["", ""]);
490 |         assert_eq!(collect_lines("a\r\nb\r\n"), vec!["a", "b"]);
491 |         assert_eq!(collect_lines("a\r\nb"), vec!["a", "b"]);
492 |         assert_eq!(collect_lines("abc\r\nxyz\r\n"), vec!["abc", "xyz"]);
493 |         assert_eq!(collect_lines("abc\r\nxyz"), vec!["abc", "xyz"]);
494 | 
495 |         assert_eq!(collect_lines("abc\rxyz"), vec!["abc\rxyz"]);
496 |     }
497 | 
498 |     #[test]
499 |     fn lines_with_terminator() {
500 |         assert_eq!(collect_lines_term(""), Vec::<BString>::new());
501 | 
502 |         assert_eq!(collect_lines_term("\n"), vec!["\n"]);
503 |         assert_eq!(collect_lines_term("\n\n"), vec!["\n", "\n"]);
504 |         assert_eq!(collect_lines_term("a\nb\n"), vec!["a\n",
"b\n"]); 504 | assert_eq!(collect_lines_term("a\nb"), vec!["a\n", "b"]); 505 | assert_eq!(collect_lines_term("abc\nxyz\n"), vec!["abc\n", "xyz\n"]); 506 | assert_eq!(collect_lines_term("abc\nxyz"), vec!["abc\n", "xyz"]); 507 | 508 | assert_eq!(collect_lines_term("\r\n"), vec!["\r\n"]); 509 | assert_eq!(collect_lines_term("\r\n\r\n"), vec!["\r\n", "\r\n"]); 510 | assert_eq!(collect_lines_term("a\r\nb\r\n"), vec!["a\r\n", "b\r\n"]); 511 | assert_eq!(collect_lines_term("a\r\nb"), vec!["a\r\n", "b"]); 512 | assert_eq!( 513 | collect_lines_term("abc\r\nxyz\r\n"), 514 | vec!["abc\r\n", "xyz\r\n"] 515 | ); 516 | assert_eq!(collect_lines_term("abc\r\nxyz"), vec!["abc\r\n", "xyz"]); 517 | 518 | assert_eq!(collect_lines_term("abc\rxyz"), vec!["abc\rxyz"]); 519 | } 520 | } 521 | -------------------------------------------------------------------------------- /src/tests.rs: -------------------------------------------------------------------------------- 1 | /// A sequence of tests for checking whether lossy decoding uses the maximal 2 | /// subpart strategy correctly. Namely, if a sequence of otherwise invalid 3 | /// UTF-8 bytes is a valid prefix of a valid UTF-8 sequence, then the entire 4 | /// prefix is replaced by a single replacement codepoint. In all other cases, 5 | /// each invalid byte is replaced by a single replacement codepoint. 6 | /// 7 | /// The first element in each tuple is the expected result of lossy decoding, 8 | /// while the second element is the input given. 9 | pub(crate) const LOSSY_TESTS: &[(&str, &[u8])] = &[ 10 | ("a", b"a"), 11 | ("\u{FFFD}", b"\xFF"), 12 | ("\u{FFFD}\u{FFFD}", b"\xFF\xFF"), 13 | ("β\u{FFFD}", b"\xCE\xB2\xFF"), 14 | ("☃\u{FFFD}", b"\xE2\x98\x83\xFF"), 15 | ("𝝱\u{FFFD}", b"\xF0\x9D\x9D\xB1\xFF"), 16 | ("\u{FFFD}\u{FFFD}", b"\xCE\xF0"), 17 | ("\u{FFFD}\u{FFFD}", b"\xCE\xFF"), 18 | ("\u{FFFD}\u{FFFD}", b"\xE2\x98\xF0"), 19 | ("\u{FFFD}\u{FFFD}", b"\xE2\x98\xFF"), 20 | ("\u{FFFD}", b"\xF0\x9D\x9D"), 21 | ("\u{FFFD}\u{FFFD}", b"\xF0\x9D\x9D\xF0"), 22 | ("\u{FFFD}\u{FFFD}", b"\xF0\x9D\x9D\xFF"), 23 | ("\u{FFFD}", b"\xCE"), 24 | ("a\u{FFFD}", b"a\xCE"), 25 | ("\u{FFFD}", b"\xE2\x98"), 26 | ("a\u{FFFD}", b"a\xE2\x98"), 27 | ("\u{FFFD}", b"\xF0\x9D\x9C"), 28 | ("a\u{FFFD}", b"a\xF0\x9D\x9C"), 29 | ("a\u{FFFD}\u{FFFD}\u{FFFD}z", b"a\xED\xA0\x80z"), 30 | ("☃βツ\u{FFFD}", b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF"), 31 | ("a\u{FFFD}\u{FFFD}\u{FFFD}b", b"\x61\xF1\x80\x80\xE1\x80\xC2\x62"), 32 | ]; 33 | -------------------------------------------------------------------------------- /src/unicode/data/LICENSE-UNICODE: -------------------------------------------------------------------------------- 1 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 2 | See Terms of Use for definitions of Unicode Inc.'s 3 | Data Files and Software. 4 | 5 | NOTICE TO USER: Carefully read the following legal agreement. 6 | BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 7 | DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 8 | YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 9 | TERMS AND CONDITIONS OF THIS AGREEMENT. 10 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 11 | THE DATA FILES OR SOFTWARE. 12 | 13 | COPYRIGHT AND PERMISSION NOTICE 14 | 15 | Copyright © 1991-2019 Unicode, Inc. All rights reserved. 16 | Distributed under the Terms of Use in https://www.unicode.org/copyright.html. 
17 | 18 | Permission is hereby granted, free of charge, to any person obtaining 19 | a copy of the Unicode data files and any associated documentation 20 | (the "Data Files") or Unicode software and any associated documentation 21 | (the "Software") to deal in the Data Files or Software 22 | without restriction, including without limitation the rights to use, 23 | copy, modify, merge, publish, distribute, and/or sell copies of 24 | the Data Files or Software, and to permit persons to whom the Data Files 25 | or Software are furnished to do so, provided that either 26 | (a) this copyright and permission notice appear with all copies 27 | of the Data Files or Software, or 28 | (b) this copyright and permission notice appear in associated 29 | Documentation. 30 | 31 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 32 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 33 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 34 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. 35 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 36 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 37 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 38 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 39 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 40 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. 41 | 42 | Except as contained in this notice, the name of a copyright holder 43 | shall not be used in advertising or otherwise to promote the sale, 44 | use or other dealings in these Data Files or Software without prior 45 | written authorization of the copyright holder. 46 | -------------------------------------------------------------------------------- /src/unicode/fsm/grapheme_break_fwd.bigendian.dfa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa -------------------------------------------------------------------------------- /src/unicode/fsm/grapheme_break_fwd.littleendian.dfa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa -------------------------------------------------------------------------------- /src/unicode/fsm/grapheme_break_fwd.rs: -------------------------------------------------------------------------------- 1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: 2 | // 3 | // regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe GRAPHEME_BREAK_FWD src/unicode/fsm/ 4 | // 5 | // regex-cli 0.0.1 is available on crates.io. 
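// Both endian variants of the serialized DFA ship with the crate; the
// `target_endian` cfg below picks the matching flavor, and `DFA::from_bytes`
// validates the serialized bytes the first time the `Lazy` is dereferenced.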
6 | 
7 | use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
8 | 
9 | pub static GRAPHEME_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
10 |     #[cfg(target_endian = "big")]
11 |     static BYTES: &'static [u8] =
12 |         include_bytes!("grapheme_break_fwd.bigendian.dfa");
13 |     #[cfg(target_endian = "little")]
14 |     static BYTES: &'static [u8] =
15 |         include_bytes!("grapheme_break_fwd.littleendian.dfa");
16 |     let (dfa, _) =
17 |         DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
18 |     dfa
19 | });
20 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/grapheme_break_rev.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/grapheme_break_rev.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/grapheme_break_rev.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize sparse dfa --minimize --start-kind anchored --reverse --match-kind all --no-captures --shrink --rustfmt --safe GRAPHEME_BREAK_REV src/unicode/fsm/
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
8 | 
9 | pub static GRAPHEME_BREAK_REV: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
10 |     #[cfg(target_endian = "big")]
11 |     static BYTES: &'static [u8] =
12 |         include_bytes!("grapheme_break_rev.bigendian.dfa");
13 |     #[cfg(target_endian = "little")]
14 |     static BYTES: &'static [u8] =
15 |         include_bytes!("grapheme_break_rev.littleendian.dfa");
16 |     let (dfa, _) =
17 |         DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
18 |     dfa
19 | });
20 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod grapheme_break_fwd;
2 | pub mod grapheme_break_rev;
3 | pub mod regional_indicator_rev;
4 | pub mod sentence_break_fwd;
5 | pub mod simple_word_fwd;
6 | pub mod whitespace_anchored_fwd;
7 | pub mod whitespace_anchored_rev;
8 | pub mod word_break_fwd;
9 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/regional_indicator_rev.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/regional_indicator_rev.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/regional_indicator_rev.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/regional_indicator_rev.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/regional_indicator_rev.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize dense dfa --minimize --start-kind anchored --reverse --no-captures --shrink --rustfmt --safe REGIONAL_INDICATOR_REV src/unicode/fsm/ \p{gcb=Regional_Indicator}
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{
8 |     dfa::dense::DFA,
9 |     util::{lazy::Lazy, wire::AlignAs},
10 | };
11 | 
12 | pub static REGIONAL_INDICATOR_REV: Lazy<DFA<&'static [u32]>> =
13 |     Lazy::new(|| {
14 |         static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
15 |             _align: [],
16 |             #[cfg(target_endian = "big")]
17 |             bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"),
18 |             #[cfg(target_endian = "little")]
19 |             bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"),
20 |         };
21 |         let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
22 |             .expect("serialized DFA should be valid");
23 |         dfa
24 |     });
25 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/sentence_break_fwd.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/sentence_break_fwd.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/sentence_break_fwd.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe SENTENCE_BREAK_FWD src/unicode/fsm/
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
8 | 
9 | pub static SENTENCE_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
10 |     #[cfg(target_endian = "big")]
11 |     static BYTES: &'static [u8] =
12 |         include_bytes!("sentence_break_fwd.bigendian.dfa");
13 |     #[cfg(target_endian = "little")]
14 |     static BYTES: &'static [u8] =
15 |         include_bytes!("sentence_break_fwd.littleendian.dfa");
16 |     let (dfa, _) =
17 |         DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
18 |     dfa
19 | });
20 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/simple_word_fwd.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/simple_word_fwd.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/simple_word_fwd.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/simple_word_fwd.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/simple_word_fwd.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe SIMPLE_WORD_FWD src/unicode/fsm/ \w
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
8 | 
9 | pub static SIMPLE_WORD_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
10 |     #[cfg(target_endian = "big")]
11 |     static BYTES: &'static [u8] =
12 |         include_bytes!("simple_word_fwd.bigendian.dfa");
13 |     #[cfg(target_endian = "little")]
14 |     static BYTES: &'static [u8] =
15 |         include_bytes!("simple_word_fwd.littleendian.dfa");
16 |     let (dfa, _) =
17 |         DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
18 |     dfa
19 | });
20 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_fwd.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/whitespace_anchored_fwd.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_fwd.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/whitespace_anchored_fwd.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_fwd.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize dense dfa --minimize --start-kind anchored --shrink --rustfmt --safe WHITESPACE_ANCHORED_FWD src/unicode/fsm/ \s+
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{
8 |     dfa::dense::DFA,
9 |     util::{lazy::Lazy, wire::AlignAs},
10 | };
11 | 
12 | pub static WHITESPACE_ANCHORED_FWD: Lazy<DFA<&'static [u32]>> =
13 |     Lazy::new(|| {
14 |         static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
15 |             _align: [],
16 |             #[cfg(target_endian = "big")]
17 |             bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"),
18 |             #[cfg(target_endian = "little")]
19 |             bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"),
20 |         };
21 |         let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
22 |             .expect("serialized DFA should be valid");
23 |         dfa
24 |     });
25 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/whitespace_anchored_rev.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/whitespace_anchored_rev.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/whitespace_anchored_rev.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize dense dfa --minimize --start-kind anchored --reverse --no-captures --shrink --rustfmt --safe WHITESPACE_ANCHORED_REV src/unicode/fsm/ \s+
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{
8 |     dfa::dense::DFA,
9 |     util::{lazy::Lazy, wire::AlignAs},
10 | };
11 | 
12 | pub static WHITESPACE_ANCHORED_REV: Lazy<DFA<&'static [u32]>> =
13 |     Lazy::new(|| {
14 |         static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
15 |             _align: [],
16 |             #[cfg(target_endian = "big")]
17 |             bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"),
18 |             #[cfg(target_endian = "little")]
19 |             bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"),
20 |         };
21 |         let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
22 |             .expect("serialized DFA should be valid");
23 |         dfa
24 |     });
25 | 
--------------------------------------------------------------------------------
/src/unicode/fsm/word_break_fwd.bigendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/word_break_fwd.bigendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/word_break_fwd.littleendian.dfa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/bstr/8904072067fca9ca993f7234a2dacad1c362afa7/src/unicode/fsm/word_break_fwd.littleendian.dfa
--------------------------------------------------------------------------------
/src/unicode/fsm/word_break_fwd.rs:
--------------------------------------------------------------------------------
1 | // DO NOT EDIT THIS FILE.
IT WAS AUTOMATICALLY GENERATED BY:
2 | //
3 | //   regex-cli generate serialize sparse dfa --minimize --start-kind anchored --shrink --rustfmt --safe WORD_BREAK_FWD src/unicode/fsm/
4 | //
5 | // regex-cli 0.0.1 is available on crates.io.
6 | 
7 | use regex_automata::{dfa::sparse::DFA, util::lazy::Lazy};
8 | 
9 | pub static WORD_BREAK_FWD: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
10 |     #[cfg(target_endian = "big")]
11 |     static BYTES: &'static [u8] =
12 |         include_bytes!("word_break_fwd.bigendian.dfa");
13 |     #[cfg(target_endian = "little")]
14 |     static BYTES: &'static [u8] =
15 |         include_bytes!("word_break_fwd.littleendian.dfa");
16 |     let (dfa, _) =
17 |         DFA::from_bytes(BYTES).expect("serialized DFA should be valid");
18 |     dfa
19 | });
20 | 
--------------------------------------------------------------------------------
/src/unicode/grapheme.rs:
--------------------------------------------------------------------------------
1 | use regex_automata::{dfa::Automaton, Anchored, Input};
2 | 
3 | use crate::{
4 |     ext_slice::ByteSlice,
5 |     unicode::fsm::{
6 |         grapheme_break_fwd::GRAPHEME_BREAK_FWD,
7 |         grapheme_break_rev::GRAPHEME_BREAK_REV,
8 |         regional_indicator_rev::REGIONAL_INDICATOR_REV,
9 |     },
10 |     utf8,
11 | };
12 | 
13 | /// An iterator over grapheme clusters in a byte string.
14 | ///
15 | /// This iterator is typically constructed by
16 | /// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes).
17 | ///
18 | /// Unicode defines a grapheme cluster as an *approximation* to a single user
19 | /// visible character. A grapheme cluster, or just "grapheme," is made up of
20 | /// one or more codepoints. For end user oriented tasks, one should generally
21 | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
22 | /// always yields one codepoint at a time.
23 | ///
24 | /// Since graphemes are made up of one or more codepoints, this iterator yields
25 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
26 | /// are [substituted](index.html#handling-of-invalid-utf-8).
27 | ///
28 | /// This iterator can be used in reverse. When reversed, exactly the same
29 | /// set of grapheme clusters are yielded, but in reverse order.
30 | ///
31 | /// This iterator only yields *extended* grapheme clusters, in accordance with
32 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
33 | #[derive(Clone, Debug)]
34 | pub struct Graphemes<'a> {
35 |     bs: &'a [u8],
36 | }
37 | 
38 | impl<'a> Graphemes<'a> {
39 |     pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> {
40 |         Graphemes { bs }
41 |     }
42 | 
43 |     /// View the underlying data as a subslice of the original data.
44 |     ///
45 |     /// The slice returned has the same lifetime as the original slice, and so
46 |     /// the iterator can continue to be used while this exists.
47 | /// 48 | /// # Examples 49 | /// 50 | /// ``` 51 | /// use bstr::ByteSlice; 52 | /// 53 | /// let mut it = b"abc".graphemes(); 54 | /// 55 | /// assert_eq!(b"abc", it.as_bytes()); 56 | /// it.next(); 57 | /// assert_eq!(b"bc", it.as_bytes()); 58 | /// it.next(); 59 | /// it.next(); 60 | /// assert_eq!(b"", it.as_bytes()); 61 | /// ``` 62 | #[inline] 63 | pub fn as_bytes(&self) -> &'a [u8] { 64 | self.bs 65 | } 66 | } 67 | 68 | impl<'a> Iterator for Graphemes<'a> { 69 | type Item = &'a str; 70 | 71 | #[inline] 72 | fn next(&mut self) -> Option<&'a str> { 73 | let (grapheme, size) = decode_grapheme(self.bs); 74 | if size == 0 { 75 | return None; 76 | } 77 | self.bs = &self.bs[size..]; 78 | Some(grapheme) 79 | } 80 | } 81 | 82 | impl<'a> DoubleEndedIterator for Graphemes<'a> { 83 | #[inline] 84 | fn next_back(&mut self) -> Option<&'a str> { 85 | let (grapheme, size) = decode_last_grapheme(self.bs); 86 | if size == 0 { 87 | return None; 88 | } 89 | self.bs = &self.bs[..self.bs.len() - size]; 90 | Some(grapheme) 91 | } 92 | } 93 | 94 | /// An iterator over grapheme clusters in a byte string and their byte index 95 | /// positions. 96 | /// 97 | /// This iterator is typically constructed by 98 | /// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices). 99 | /// 100 | /// Unicode defines a grapheme cluster as an *approximation* to a single user 101 | /// visible character. A grapheme cluster, or just "grapheme," is made up of 102 | /// one or more codepoints. For end user oriented tasks, one should generally 103 | /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which 104 | /// always yields one codepoint at a time. 105 | /// 106 | /// Since graphemes are made up of one or more codepoints, this iterator 107 | /// yields `&str` elements (along with their start and end byte offsets). 108 | /// When invalid UTF-8 is encountered, replacement codepoints are 109 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the 110 | /// indices yielded by this iterator may not correspond to the length of the 111 | /// grapheme cluster yielded with those indices. For example, when this 112 | /// iterator encounters `\xFF` in the byte string, then it will yield a pair 113 | /// of indices ranging over a single byte, but will provide an `&str` 114 | /// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when 115 | /// given only valid UTF-8, then all indices are in exact correspondence with 116 | /// their paired grapheme cluster. 117 | /// 118 | /// This iterator can be used in reverse. When reversed, exactly the same 119 | /// set of grapheme clusters are yielded, but in reverse order. 120 | /// 121 | /// This iterator only yields *extended* grapheme clusters, in accordance with 122 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). 123 | #[derive(Clone, Debug)] 124 | pub struct GraphemeIndices<'a> { 125 | bs: &'a [u8], 126 | forward_index: usize, 127 | reverse_index: usize, 128 | } 129 | 130 | impl<'a> GraphemeIndices<'a> { 131 | pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> { 132 | GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() } 133 | } 134 | 135 | /// View the underlying data as a subslice of the original data. 136 | /// 137 | /// The slice returned has the same lifetime as the original slice, and so 138 | /// the iterator can continue to be used while this exists. 
139 | /// 140 | /// # Examples 141 | /// 142 | /// ``` 143 | /// use bstr::ByteSlice; 144 | /// 145 | /// let mut it = b"abc".grapheme_indices(); 146 | /// 147 | /// assert_eq!(b"abc", it.as_bytes()); 148 | /// it.next(); 149 | /// assert_eq!(b"bc", it.as_bytes()); 150 | /// it.next(); 151 | /// it.next(); 152 | /// assert_eq!(b"", it.as_bytes()); 153 | /// ``` 154 | #[inline] 155 | pub fn as_bytes(&self) -> &'a [u8] { 156 | self.bs 157 | } 158 | } 159 | 160 | impl<'a> Iterator for GraphemeIndices<'a> { 161 | type Item = (usize, usize, &'a str); 162 | 163 | #[inline] 164 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { 165 | let index = self.forward_index; 166 | let (grapheme, size) = decode_grapheme(self.bs); 167 | if size == 0 { 168 | return None; 169 | } 170 | self.bs = &self.bs[size..]; 171 | self.forward_index += size; 172 | Some((index, index + size, grapheme)) 173 | } 174 | } 175 | 176 | impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { 177 | #[inline] 178 | fn next_back(&mut self) -> Option<(usize, usize, &'a str)> { 179 | let (grapheme, size) = decode_last_grapheme(self.bs); 180 | if size == 0 { 181 | return None; 182 | } 183 | self.bs = &self.bs[..self.bs.len() - size]; 184 | self.reverse_index -= size; 185 | Some((self.reverse_index, self.reverse_index + size, grapheme)) 186 | } 187 | } 188 | 189 | /// Decode a grapheme from the given byte string. 190 | /// 191 | /// This returns the resulting grapheme (which may be a Unicode replacement 192 | /// codepoint if invalid UTF-8 was found), along with the number of bytes 193 | /// decoded in the byte string. The number of bytes decoded may not be the 194 | /// same as the length of grapheme in the case where invalid UTF-8 is found. 195 | pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) { 196 | if bs.is_empty() { 197 | ("", 0) 198 | } else if bs.len() >= 2 199 | && bs[0].is_ascii() 200 | && bs[1].is_ascii() 201 | && !bs[0].is_ascii_whitespace() 202 | { 203 | // FIXME: It is somewhat sad that we have to special case this, but it 204 | // leads to a significant speed up in predominantly ASCII text. The 205 | // issue here is that the DFA has a bit of overhead, and running it for 206 | // every byte in mostly ASCII text results in a bit slowdown. We should 207 | // re-litigate this once regex-automata 0.3 is out, but it might be 208 | // hard to avoid the special case. A DFA is always going to at least 209 | // require some memory access. 210 | 211 | // Safe because all ASCII bytes are valid UTF-8. 212 | let grapheme = unsafe { bs[..1].to_str_unchecked() }; 213 | (grapheme, 1) 214 | } else if let Some(hm) = { 215 | let input = Input::new(bs).anchored(Anchored::Yes); 216 | GRAPHEME_BREAK_FWD.try_search_fwd(&input).unwrap() 217 | } { 218 | // Safe because a match can only occur for valid UTF-8. 219 | let grapheme = unsafe { bs[..hm.offset()].to_str_unchecked() }; 220 | (grapheme, grapheme.len()) 221 | } else { 222 | const INVALID: &str = "\u{FFFD}"; 223 | // No match on non-empty bytes implies we found invalid UTF-8. 224 | let (_, size) = utf8::decode_lossy(bs); 225 | (INVALID, size) 226 | } 227 | } 228 | 229 | fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) { 230 | if bs.is_empty() { 231 | ("", 0) 232 | } else if let Some(hm) = { 233 | let input = Input::new(bs).anchored(Anchored::Yes); 234 | GRAPHEME_BREAK_REV.try_search_rev(&input).unwrap() 235 | } { 236 | let start = adjust_rev_for_regional_indicator(bs, hm.offset()); 237 | // Safe because a match can only occur for valid UTF-8. 
238 | let grapheme = unsafe { bs[start..].to_str_unchecked() }; 239 | (grapheme, grapheme.len()) 240 | } else { 241 | const INVALID: &str = "\u{FFFD}"; 242 | // No match on non-empty bytes implies we found invalid UTF-8. 243 | let (_, size) = utf8::decode_last_lossy(bs); 244 | (INVALID, size) 245 | } 246 | } 247 | 248 | /// Return the correct offset for the next grapheme decoded at the end of the 249 | /// given byte string, where `i` is the initial guess. In particular, 250 | /// `&bs[i..]` represents the candidate grapheme. 251 | /// 252 | /// `i` is returned by this function in all cases except when `&bs[i..]` is 253 | /// a pair of regional indicator codepoints. In that case, if an odd number of 254 | /// additional regional indicator codepoints precedes `i`, then `i` is 255 | /// adjusted such that it points to only a single regional indicator. 256 | /// 257 | /// This "fixing" is necessary to handle the requirement that a break cannot 258 | /// occur between regional indicators where it would cause an odd number of 259 | /// regional indicators to exist before the break from the *start* of the 260 | /// string. A reverse regex cannot detect this case easily without look-around. 261 | fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize { 262 | // All regional indicators use a 4 byte encoding, and we only care about 263 | // the case where we found a pair of regional indicators. 264 | if bs.len() - i != 8 { 265 | return i; 266 | } 267 | // Count all contiguous occurrences of regional indicators. If there's an 268 | // even number of them, then we can accept the pair we found. Otherwise, 269 | // we can only take one of them. 270 | // 271 | // FIXME: This is quadratic in the worst case, e.g., a string of just 272 | // regional indicator codepoints. A fix probably requires refactoring this 273 | // code a bit such that we don't rescan regional indicators. 
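// Each reverse search below matches exactly one trailing regional indicator
// (a single 4-byte codepoint), so every iteration of the loop peels one
// indicator off the end of `bs` and bumps the count by one.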
274 | let mut count = 0; 275 | while let Some(hm) = { 276 | let input = Input::new(bs).anchored(Anchored::Yes); 277 | REGIONAL_INDICATOR_REV.try_search_rev(&input).unwrap() 278 | } { 279 | bs = &bs[..hm.offset()]; 280 | count += 1; 281 | } 282 | if count % 2 == 0 { 283 | i 284 | } else { 285 | i + 4 286 | } 287 | } 288 | 289 | #[cfg(all(test, feature = "std"))] 290 | mod tests { 291 | use alloc::{ 292 | string::{String, ToString}, 293 | vec, 294 | vec::Vec, 295 | }; 296 | 297 | #[cfg(not(miri))] 298 | use ucd_parse::GraphemeClusterBreakTest; 299 | 300 | use crate::tests::LOSSY_TESTS; 301 | 302 | use super::*; 303 | 304 | #[test] 305 | #[cfg(not(miri))] 306 | fn forward_ucd() { 307 | for (i, test) in ucdtests().into_iter().enumerate() { 308 | let given = test.grapheme_clusters.concat(); 309 | let got: Vec<String> = Graphemes::new(given.as_bytes()) 310 | .map(|cluster| cluster.to_string()) 311 | .collect(); 312 | assert_eq!( 313 | test.grapheme_clusters, 314 | got, 315 | "\ngrapheme forward break test {} failed:\n\ 316 | given: {:?}\n\ 317 | expected: {:?}\n\ 318 | got: {:?}\n", 319 | i, 320 | uniescape(&given), 321 | uniescape_vec(&test.grapheme_clusters), 322 | uniescape_vec(&got), 323 | ); 324 | } 325 | } 326 | 327 | #[test] 328 | #[cfg(not(miri))] 329 | fn reverse_ucd() { 330 | for (i, test) in ucdtests().into_iter().enumerate() { 331 | let given = test.grapheme_clusters.concat(); 332 | let mut got: Vec<String> = Graphemes::new(given.as_bytes()) 333 | .rev() 334 | .map(|cluster| cluster.to_string()) 335 | .collect(); 336 | got.reverse(); 337 | assert_eq!( 338 | test.grapheme_clusters, 339 | got, 340 | "\n\ngrapheme reverse break test {} failed:\n\ 341 | given: {:?}\n\ 342 | expected: {:?}\n\ 343 | got: {:?}\n", 344 | i, 345 | uniescape(&given), 346 | uniescape_vec(&test.grapheme_clusters), 347 | uniescape_vec(&got), 348 | ); 349 | } 350 | } 351 | 352 | #[test] 353 | fn forward_lossy() { 354 | for &(expected, input) in LOSSY_TESTS { 355 | let got = Graphemes::new(input.as_bytes()).collect::<String>(); 356 | assert_eq!(expected, got); 357 | } 358 | } 359 | 360 | #[test] 361 | fn reverse_lossy() { 362 | for &(expected, input) in LOSSY_TESTS { 363 | let expected: String = expected.chars().rev().collect(); 364 | let got = 365 | Graphemes::new(input.as_bytes()).rev().collect::<String>(); 366 | assert_eq!(expected, got); 367 | } 368 | } 369 | 370 | #[cfg(not(miri))] 371 | fn uniescape(s: &str) -> String { 372 | s.chars().flat_map(|c| c.escape_unicode()).collect::<String>() 373 | } 374 | 375 | #[cfg(not(miri))] 376 | fn uniescape_vec(strs: &[String]) -> Vec<String> { 377 | strs.iter().map(|s| uniescape(s)).collect() 378 | } 379 | 380 | /// Return all of the UCD for grapheme breaks. 
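/// The cases are parsed from the bundled `data/GraphemeBreakTest.txt`;
/// comment lines and tests involving surrogates are skipped.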
381 | #[cfg(not(miri))] 382 | fn ucdtests() -> Vec<GraphemeClusterBreakTest> { 383 | const TESTDATA: &str = include_str!("data/GraphemeBreakTest.txt"); 384 | 385 | let mut tests = vec![]; 386 | for mut line in TESTDATA.lines() { 387 | line = line.trim(); 388 | if line.starts_with("#") || line.contains("surrogate") { 389 | continue; 390 | } 391 | tests.push(line.parse().unwrap()); 392 | } 393 | tests 394 | } 395 | } 396 | -------------------------------------------------------------------------------- /src/unicode/mod.rs: -------------------------------------------------------------------------------- 1 | pub use self::{ 2 | grapheme::{decode_grapheme, GraphemeIndices, Graphemes}, 3 | sentence::{SentenceIndices, Sentences}, 4 | whitespace::{whitespace_len_fwd, whitespace_len_rev}, 5 | word::{WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks}, 6 | }; 7 | 8 | mod fsm; 9 | mod grapheme; 10 | mod sentence; 11 | mod whitespace; 12 | mod word; 13 | -------------------------------------------------------------------------------- /src/unicode/sentence.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{dfa::Automaton, Anchored, Input}; 2 | 3 | use crate::{ 4 | ext_slice::ByteSlice, 5 | unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8, 6 | }; 7 | 8 | /// An iterator over sentences in a byte string. 9 | /// 10 | /// This iterator is typically constructed by 11 | /// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences). 12 | /// 13 | /// Sentences typically include their trailing punctuation and whitespace. 14 | /// 15 | /// Since sentences are made up of one or more codepoints, this iterator yields 16 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints 17 | /// are [substituted](index.html#handling-of-invalid-utf-8). 18 | /// 19 | /// This iterator yields sentences in accordance with the default sentence boundary 20 | /// rules specified in 21 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries). 22 | #[derive(Clone, Debug)] 23 | pub struct Sentences<'a> { 24 | bs: &'a [u8], 25 | } 26 | 27 | impl<'a> Sentences<'a> { 28 | pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> { 29 | Sentences { bs } 30 | } 31 | 32 | /// View the underlying data as a subslice of the original data. 33 | /// 34 | /// The slice returned has the same lifetime as the original slice, and so 35 | /// the iterator can continue to be used while this exists. 36 | /// 37 | /// # Examples 38 | /// 39 | /// ``` 40 | /// use bstr::ByteSlice; 41 | /// 42 | /// let mut it = b"I want this. Not that. Right now.".sentences(); 43 | /// 44 | /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes()); 45 | /// it.next(); 46 | /// assert_eq!(b"Not that. Right now.", it.as_bytes()); 47 | /// it.next(); 48 | /// it.next(); 49 | /// assert_eq!(b"", it.as_bytes()); 50 | /// ``` 51 | #[inline] 52 | pub fn as_bytes(&self) -> &'a [u8] { 53 | self.bs 54 | } 55 | } 56 | 57 | impl<'a> Iterator for Sentences<'a> { 58 | type Item = &'a str; 59 | 60 | #[inline] 61 | fn next(&mut self) -> Option<&'a str> { 62 | let (sentence, size) = decode_sentence(self.bs); 63 | if size == 0 { 64 | return None; 65 | } 66 | self.bs = &self.bs[size..]; 67 | Some(sentence) 68 | } 69 | } 70 | 71 | /// An iterator over sentences in a byte string, along with their byte offsets. 72 | /// 73 | /// This iterator is typically constructed by 74 | /// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices). 
75 | /// 76 | /// Sentences typically include their trailing punctuation and whitespace. 77 | /// 78 | /// Since sentences are made up of one or more codepoints, this iterator 79 | /// yields `&str` elements (along with their start and end byte offsets). 80 | /// When invalid UTF-8 is encountered, replacement codepoints are 81 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the 82 | /// indices yielded by this iterator may not correspond to the length of the 83 | /// sentence yielded with those indices. For example, when this iterator 84 | /// encounters `\xFF` in the byte string, then it will yield a pair of indices 85 | /// ranging over a single byte, but will provide an `&str` equivalent to 86 | /// `"\u{FFFD}"`, which is three bytes in length. However, when given only 87 | /// valid UTF-8, then all indices are in exact correspondence with their paired 88 | /// sentence. 89 | /// 90 | /// This iterator yields sentences in accordance with the default sentence boundary 91 | /// rules specified in 92 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries). 93 | #[derive(Clone, Debug)] 94 | pub struct SentenceIndices<'a> { 95 | bs: &'a [u8], 96 | forward_index: usize, 97 | } 98 | 99 | impl<'a> SentenceIndices<'a> { 100 | pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> { 101 | SentenceIndices { bs, forward_index: 0 } 102 | } 103 | 104 | /// View the underlying data as a subslice of the original data. 105 | /// 106 | /// The slice returned has the same lifetime as the original slice, and so 107 | /// the iterator can continue to be used while this exists. 108 | /// 109 | /// # Examples 110 | /// 111 | /// ``` 112 | /// use bstr::ByteSlice; 113 | /// 114 | /// let mut it = b"I want this. Not that. Right now.".sentence_indices(); 115 | /// 116 | /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes()); 117 | /// it.next(); 118 | /// assert_eq!(b"Not that. Right now.", it.as_bytes()); 119 | /// it.next(); 120 | /// it.next(); 121 | /// assert_eq!(b"", it.as_bytes()); 122 | /// ``` 123 | #[inline] 124 | pub fn as_bytes(&self) -> &'a [u8] { 125 | self.bs 126 | } 127 | } 128 | 129 | impl<'a> Iterator for SentenceIndices<'a> { 130 | type Item = (usize, usize, &'a str); 131 | 132 | #[inline] 133 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { 134 | let index = self.forward_index; 135 | let (sentence, size) = decode_sentence(self.bs); 136 | if size == 0 { 137 | return None; 138 | } 139 | self.bs = &self.bs[size..]; 140 | self.forward_index += size; 141 | Some((index, index + size, sentence)) 142 | } 143 | } 144 | 145 | fn decode_sentence(bs: &[u8]) -> (&str, usize) { 146 | if bs.is_empty() { 147 | ("", 0) 148 | } else if let Some(hm) = { 149 | let input = Input::new(bs).anchored(Anchored::Yes); 150 | SENTENCE_BREAK_FWD.try_search_fwd(&input).unwrap() 151 | } { 152 | // Safe because a match can only occur for valid UTF-8. 153 | let sentence = unsafe { bs[..hm.offset()].to_str_unchecked() }; 154 | (sentence, sentence.len()) 155 | } else { 156 | const INVALID: &str = "\u{FFFD}"; 157 | // No match on non-empty bytes implies we found invalid UTF-8. 
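// `decode_lossy` reports the length of the invalid sequence (always at
// least one byte for non-empty input), so the iterator always makes
// progress.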
158 | let (_, size) = utf8::decode_lossy(bs); 159 | (INVALID, size) 160 | } 161 | } 162 | 163 | #[cfg(all(test, feature = "std"))] 164 | mod tests { 165 | use alloc::{vec, vec::Vec}; 166 | 167 | #[cfg(not(miri))] 168 | use ucd_parse::SentenceBreakTest; 169 | 170 | use crate::ext_slice::ByteSlice; 171 | 172 | #[test] 173 | #[cfg(not(miri))] 174 | fn forward_ucd() { 175 | for (i, test) in ucdtests().into_iter().enumerate() { 176 | let given = test.sentences.concat(); 177 | let got = sentences(given.as_bytes()); 178 | assert_eq!( 179 | test.sentences, 180 | got, 181 | "\n\nsentence forward break test {} failed:\n\ 182 | given: {:?}\n\ 183 | expected: {:?}\n\ 184 | got: {:?}\n", 185 | i, 186 | given, 187 | strs_to_bstrs(&test.sentences), 188 | strs_to_bstrs(&got), 189 | ); 190 | } 191 | } 192 | 193 | // Some additional tests that don't seem to be covered by the UCD tests. 194 | #[test] 195 | fn forward_additional() { 196 | assert_eq!(vec!["a.. ", "A"], sentences(b"a.. A")); 197 | assert_eq!(vec!["a.. a"], sentences(b"a.. a")); 198 | 199 | assert_eq!(vec!["a... ", "A"], sentences(b"a... A")); 200 | assert_eq!(vec!["a... a"], sentences(b"a... a")); 201 | 202 | assert_eq!(vec!["a...,..., a"], sentences(b"a...,..., a")); 203 | } 204 | 205 | fn sentences(bytes: &[u8]) -> Vec<&str> { 206 | bytes.sentences().collect() 207 | } 208 | 209 | #[cfg(not(miri))] 210 | fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> { 211 | strs.iter().map(|s| s.as_ref().as_bytes()).collect() 212 | } 213 | 214 | /// Return all of the UCD for sentence breaks. 215 | #[cfg(not(miri))] 216 | fn ucdtests() -> Vec<SentenceBreakTest> { 217 | const TESTDATA: &str = include_str!("data/SentenceBreakTest.txt"); 218 | 219 | let mut tests = vec![]; 220 | for mut line in TESTDATA.lines() { 221 | line = line.trim(); 222 | if line.starts_with("#") || line.contains("surrogate") { 223 | continue; 224 | } 225 | tests.push(line.parse().unwrap()); 226 | } 227 | tests 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /src/unicode/whitespace.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{dfa::Automaton, Anchored, Input}; 2 | 3 | use crate::unicode::fsm::{ 4 | whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD, 5 | whitespace_anchored_rev::WHITESPACE_ANCHORED_REV, 6 | }; 7 | 8 | /// Return the first position of a non-whitespace character. 9 | pub fn whitespace_len_fwd(slice: &[u8]) -> usize { 10 | let input = Input::new(slice).anchored(Anchored::Yes); 11 | WHITESPACE_ANCHORED_FWD 12 | .try_search_fwd(&input) 13 | .unwrap() 14 | .map_or(0, |hm| hm.offset()) 15 | } 16 | 17 | /// Return the last position of a non-whitespace character. 18 | pub fn whitespace_len_rev(slice: &[u8]) -> usize { 19 | let input = Input::new(slice).anchored(Anchored::Yes); 20 | WHITESPACE_ANCHORED_REV 21 | .try_search_rev(&input) 22 | .unwrap() 23 | .map_or(slice.len(), |hm| hm.offset()) 24 | } 25 | -------------------------------------------------------------------------------- /src/unicode/word.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{dfa::Automaton, Anchored, Input}; 2 | 3 | use crate::{ 4 | ext_slice::ByteSlice, 5 | unicode::fsm::{ 6 | simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD, 7 | }, 8 | utf8, 9 | }; 10 | 11 | /// An iterator over words in a byte string. 12 | /// 13 | /// This iterator is typically constructed by 14 | /// [`ByteSlice::words`](trait.ByteSlice.html#method.words). 
15 | /// 16 | /// This is similar to the [`WordsWithBreaks`](struct.WordsWithBreaks.html) 17 | /// iterator, except it only returns elements that contain a "word" character. 18 | /// A word character is defined by UTS #18 (Annex C) to be the combination 19 | /// of the `Alphabetic` and `Join_Control` properties, along with the 20 | /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories. 21 | /// 22 | /// Since words are made up of one or more codepoints, this iterator yields 23 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints 24 | /// are [substituted](index.html#handling-of-invalid-utf-8). 25 | /// 26 | /// This iterator yields words in accordance with the default word boundary 27 | /// rules specified in 28 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries). 29 | /// In particular, this may not be suitable for Japanese and Chinese scripts 30 | /// that do not use spaces between words. 31 | #[derive(Clone, Debug)] 32 | pub struct Words<'a>(WordsWithBreaks<'a>); 33 | 34 | impl<'a> Words<'a> { 35 | pub(crate) fn new(bs: &'a [u8]) -> Words<'a> { 36 | Words(WordsWithBreaks::new(bs)) 37 | } 38 | 39 | /// View the underlying data as a subslice of the original data. 40 | /// 41 | /// The slice returned has the same lifetime as the original slice, and so 42 | /// the iterator can continue to be used while this exists. 43 | /// 44 | /// # Examples 45 | /// 46 | /// ``` 47 | /// use bstr::ByteSlice; 48 | /// 49 | /// let mut it = b"foo bar baz".words(); 50 | /// 51 | /// assert_eq!(b"foo bar baz", it.as_bytes()); 52 | /// it.next(); 53 | /// it.next(); 54 | /// assert_eq!(b" baz", it.as_bytes()); 55 | /// it.next(); 56 | /// assert_eq!(b"", it.as_bytes()); 57 | /// ``` 58 | #[inline] 59 | pub fn as_bytes(&self) -> &'a [u8] { 60 | self.0.as_bytes() 61 | } 62 | } 63 | 64 | impl<'a> Iterator for Words<'a> { 65 | type Item = &'a str; 66 | 67 | #[inline] 68 | fn next(&mut self) -> Option<&'a str> { 69 | for word in self.0.by_ref() { 70 | let input = 71 | Input::new(word).anchored(Anchored::Yes).earliest(true); 72 | if SIMPLE_WORD_FWD.try_search_fwd(&input).unwrap().is_some() { 73 | return Some(word); 74 | } 75 | } 76 | None 77 | } 78 | } 79 | 80 | /// An iterator over words in a byte string and their byte index positions. 81 | /// 82 | /// This iterator is typically constructed by 83 | /// [`ByteSlice::word_indices`](trait.ByteSlice.html#method.word_indices). 84 | /// 85 | /// This is similar to the 86 | /// [`WordsWithBreakIndices`](struct.WordsWithBreakIndices.html) iterator, 87 | /// except it only returns elements that contain a "word" character. A 88 | /// word character is defined by UTS #18 (Annex C) to be the combination 89 | /// of the `Alphabetic` and `Join_Control` properties, along with the 90 | /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories. 91 | /// 92 | /// Since words are made up of one or more codepoints, this iterator 93 | /// yields `&str` elements (along with their start and end byte offsets). 94 | /// When invalid UTF-8 is encountered, replacement codepoints are 95 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the 96 | /// indices yielded by this iterator may not correspond to the length of the 97 | /// word yielded with those indices. 
For example, when this iterator encounters 98 | /// `\xFF` in the byte string, then it will yield a pair of indices ranging 99 | /// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`, 100 | /// which is three bytes in length. However, when given only valid UTF-8, then 101 | /// all indices are in exact correspondence with their paired word. 102 | /// 103 | /// This iterator yields words in accordance with the default word boundary 104 | /// rules specified in 105 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries). 106 | /// In particular, this may not be suitable for Japanese and Chinese scripts 107 | /// that do not use spaces between words. 108 | #[derive(Clone, Debug)] 109 | pub struct WordIndices<'a>(WordsWithBreakIndices<'a>); 110 | 111 | impl<'a> WordIndices<'a> { 112 | pub(crate) fn new(bs: &'a [u8]) -> WordIndices<'a> { 113 | WordIndices(WordsWithBreakIndices::new(bs)) 114 | } 115 | 116 | /// View the underlying data as a subslice of the original data. 117 | /// 118 | /// The slice returned has the same lifetime as the original slice, and so 119 | /// the iterator can continue to be used while this exists. 120 | /// 121 | /// # Examples 122 | /// 123 | /// ``` 124 | /// use bstr::ByteSlice; 125 | /// 126 | /// let mut it = b"foo bar baz".word_indices(); 127 | /// 128 | /// assert_eq!(b"foo bar baz", it.as_bytes()); 129 | /// it.next(); 130 | /// it.next(); 131 | /// assert_eq!(b" baz", it.as_bytes()); 132 | /// it.next(); 133 | /// it.next(); 134 | /// assert_eq!(b"", it.as_bytes()); 135 | /// ``` 136 | #[inline] 137 | pub fn as_bytes(&self) -> &'a [u8] { 138 | self.0.as_bytes() 139 | } 140 | } 141 | 142 | impl<'a> Iterator for WordIndices<'a> { 143 | type Item = (usize, usize, &'a str); 144 | 145 | #[inline] 146 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { 147 | for (start, end, word) in self.0.by_ref() { 148 | let input = 149 | Input::new(word).anchored(Anchored::Yes).earliest(true); 150 | if SIMPLE_WORD_FWD.try_search_fwd(&input).unwrap().is_some() { 151 | return Some((start, end, word)); 152 | } 153 | } 154 | None 155 | } 156 | } 157 | 158 | /// An iterator over all word breaks in a byte string. 159 | /// 160 | /// This iterator is typically constructed by 161 | /// [`ByteSlice::words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks). 162 | /// 163 | /// This iterator yields not only all words, but the content that comes between 164 | /// words. In particular, if all elements yielded by this iterator are 165 | /// concatenated, then the result is the original string (subject to Unicode 166 | /// replacement codepoint substitutions). 167 | /// 168 | /// Since words are made up of one or more codepoints, this iterator yields 169 | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints 170 | /// are [substituted](index.html#handling-of-invalid-utf-8). 171 | /// 172 | /// This iterator yields words in accordance with the default word boundary 173 | /// rules specified in 174 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries). 175 | /// In particular, this may not be suitable for Japanese and Chinese scripts 176 | /// that do not use spaces between words. 
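///
/// # Examples
///
/// A short sketch of the round-trip property described above:
///
/// ```
/// use bstr::ByteSlice;
///
/// // The segments between words (here, the space) are yielded too, so
/// // concatenating the pieces reproduces the original input.
/// let pieces: Vec<&str> = b"foo bar".words_with_breaks().collect();
/// assert_eq!(vec!["foo", " ", "bar"], pieces);
/// assert_eq!("foo bar", pieces.concat());
/// ```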
177 | #[derive(Clone, Debug)] 178 | pub struct WordsWithBreaks<'a> { 179 | bs: &'a [u8], 180 | } 181 | 182 | impl<'a> WordsWithBreaks<'a> { 183 | pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreaks<'a> { 184 | WordsWithBreaks { bs } 185 | } 186 | 187 | /// View the underlying data as a subslice of the original data. 188 | /// 189 | /// The slice returned has the same lifetime as the original slice, and so 190 | /// the iterator can continue to be used while this exists. 191 | /// 192 | /// # Examples 193 | /// 194 | /// ``` 195 | /// use bstr::ByteSlice; 196 | /// 197 | /// let mut it = b"foo bar baz".words_with_breaks(); 198 | /// 199 | /// assert_eq!(b"foo bar baz", it.as_bytes()); 200 | /// it.next(); 201 | /// assert_eq!(b" bar baz", it.as_bytes()); 202 | /// it.next(); 203 | /// it.next(); 204 | /// assert_eq!(b" baz", it.as_bytes()); 205 | /// it.next(); 206 | /// it.next(); 207 | /// assert_eq!(b"", it.as_bytes()); 208 | /// ``` 209 | #[inline] 210 | pub fn as_bytes(&self) -> &'a [u8] { 211 | self.bs 212 | } 213 | } 214 | 215 | impl<'a> Iterator for WordsWithBreaks<'a> { 216 | type Item = &'a str; 217 | 218 | #[inline] 219 | fn next(&mut self) -> Option<&'a str> { 220 | let (word, size) = decode_word(self.bs); 221 | if size == 0 { 222 | return None; 223 | } 224 | self.bs = &self.bs[size..]; 225 | Some(word) 226 | } 227 | } 228 | 229 | /// An iterator over all word breaks in a byte string, along with their byte 230 | /// index positions. 231 | /// 232 | /// This iterator is typically constructed by 233 | /// [`ByteSlice::words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices). 234 | /// 235 | /// This iterator yields not only all words, but the content that comes between 236 | /// words. In particular, if all elements yielded by this iterator are 237 | /// concatenated, then the result is the original string (subject to Unicode 238 | /// replacement codepoint substitutions). 239 | /// 240 | /// Since words are made up of one or more codepoints, this iterator 241 | /// yields `&str` elements (along with their start and end byte offsets). 242 | /// When invalid UTF-8 is encountered, replacement codepoints are 243 | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the 244 | /// indices yielded by this iterator may not correspond to the length of the 245 | /// word yielded with those indices. For example, when this iterator encounters 246 | /// `\xFF` in the byte string, then it will yield a pair of indices ranging 247 | /// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`, 248 | /// which is three bytes in length. However, when given only valid UTF-8, then 249 | /// all indices are in exact correspondence with their paired word. 250 | /// 251 | /// This iterator yields words in accordance with the default word boundary 252 | /// rules specified in 253 | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries). 254 | /// In particular, this may not be suitable for Japanese and Chinese scripts 255 | /// that do not use spaces between words. 256 | #[derive(Clone, Debug)] 257 | pub struct WordsWithBreakIndices<'a> { 258 | bs: &'a [u8], 259 | forward_index: usize, 260 | } 261 | 262 | impl<'a> WordsWithBreakIndices<'a> { 263 | pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> { 264 | WordsWithBreakIndices { bs, forward_index: 0 } 265 | } 266 | 267 | /// View the underlying data as a subslice of the original data. 
268 | /// 269 | /// The slice returned has the same lifetime as the original slice, and so 270 | /// the iterator can continue to be used while this exists. 271 | /// 272 | /// # Examples 273 | /// 274 | /// ``` 275 | /// use bstr::ByteSlice; 276 | /// 277 | /// let mut it = b"foo bar baz".words_with_break_indices(); 278 | /// 279 | /// assert_eq!(b"foo bar baz", it.as_bytes()); 280 | /// it.next(); 281 | /// assert_eq!(b" bar baz", it.as_bytes()); 282 | /// it.next(); 283 | /// it.next(); 284 | /// assert_eq!(b" baz", it.as_bytes()); 285 | /// it.next(); 286 | /// it.next(); 287 | /// assert_eq!(b"", it.as_bytes()); 288 | /// ``` 289 | #[inline] 290 | pub fn as_bytes(&self) -> &'a [u8] { 291 | self.bs 292 | } 293 | } 294 | 295 | impl<'a> Iterator for WordsWithBreakIndices<'a> { 296 | type Item = (usize, usize, &'a str); 297 | 298 | #[inline] 299 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { 300 | let index = self.forward_index; 301 | let (word, size) = decode_word(self.bs); 302 | if size == 0 { 303 | return None; 304 | } 305 | self.bs = &self.bs[size..]; 306 | self.forward_index += size; 307 | Some((index, index + size, word)) 308 | } 309 | } 310 | 311 | fn decode_word(bs: &[u8]) -> (&str, usize) { 312 | if bs.is_empty() { 313 | ("", 0) 314 | } else if let Some(hm) = { 315 | let input = Input::new(bs).anchored(Anchored::Yes); 316 | WORD_BREAK_FWD.try_search_fwd(&input).unwrap() 317 | } { 318 | // Safe because a match can only occur for valid UTF-8. 319 | let word = unsafe { bs[..hm.offset()].to_str_unchecked() }; 320 | (word, word.len()) 321 | } else { 322 | const INVALID: &str = "\u{FFFD}"; 323 | // No match on non-empty bytes implies we found invalid UTF-8. 324 | let (_, size) = utf8::decode_lossy(bs); 325 | (INVALID, size) 326 | } 327 | } 328 | 329 | #[cfg(all(test, feature = "std"))] 330 | mod tests { 331 | use alloc::{vec, vec::Vec}; 332 | 333 | #[cfg(not(miri))] 334 | use ucd_parse::WordBreakTest; 335 | 336 | use crate::ext_slice::ByteSlice; 337 | 338 | #[test] 339 | #[cfg(not(miri))] 340 | fn forward_ucd() { 341 | for (i, test) in ucdtests().into_iter().enumerate() { 342 | let given = test.words.concat(); 343 | let got = words(given.as_bytes()); 344 | assert_eq!( 345 | test.words, 346 | got, 347 | "\n\nword forward break test {} failed:\n\ 348 | given: {:?}\n\ 349 | expected: {:?}\n\ 350 | got: {:?}\n", 351 | i, 352 | given, 353 | strs_to_bstrs(&test.words), 354 | strs_to_bstrs(&got), 355 | ); 356 | } 357 | } 358 | 359 | // Some additional tests that don't seem to be covered by the UCD tests. 360 | // 361 | // It's pretty amazing that the UCD tests miss these cases. I only found 362 | // them by running this crate's segmenter and ICU's segmenter on the same 363 | // text and comparing the output. 364 | #[test] 365 | fn forward_additional() { 366 | assert_eq!(vec!["a", ".", " ", "Y"], words(b"a. Y")); 367 | assert_eq!(vec!["r", ".", " ", "Yo"], words(b"r. Yo")); 368 | assert_eq!( 369 | vec!["whatsoever", ".", " ", "You", " ", "may"], 370 | words(b"whatsoever. 
You may") 371 | ); 372 | assert_eq!( 373 | vec!["21stcentury'syesterday"], 374 | words(b"21stcentury'syesterday") 375 | ); 376 | 377 | assert_eq!(vec!["Bonta_", "'", "s"], words(b"Bonta_'s")); 378 | assert_eq!(vec!["_vhat's"], words(b"_vhat's")); 379 | assert_eq!(vec!["__on'anima"], words(b"__on'anima")); 380 | assert_eq!(vec!["123_", "'", "4"], words(b"123_'4")); 381 | assert_eq!(vec!["_123'4"], words(b"_123'4")); 382 | assert_eq!(vec!["__12'345"], words(b"__12'345")); 383 | 384 | assert_eq!( 385 | vec!["tomorrowat4", ":", "00", ","], 386 | words(b"tomorrowat4:00,") 387 | ); 388 | assert_eq!(vec!["RS1", "'", "s"], words(b"RS1's")); 389 | assert_eq!(vec!["X38"], words(b"X38")); 390 | 391 | assert_eq!(vec!["4abc", ":", "00", ","], words(b"4abc:00,")); 392 | assert_eq!(vec!["12S", "'", "1"], words(b"12S'1")); 393 | assert_eq!(vec!["1XY"], words(b"1XY")); 394 | 395 | assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes())); 396 | 397 | // Tests that Vithkuqi works, which was introduced in Unicode 14. 398 | // This test fails prior to Unicode 14. 399 | assert_eq!( 400 | vec!["\u{10570}\u{10597}"], 401 | words("\u{10570}\u{10597}".as_bytes()) 402 | ); 403 | } 404 | 405 | fn words(bytes: &[u8]) -> Vec<&str> { 406 | bytes.words_with_breaks().collect() 407 | } 408 | 409 | #[cfg(not(miri))] 410 | fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> { 411 | strs.iter().map(|s| s.as_ref().as_bytes()).collect() 412 | } 413 | 414 | /// Return all of the UCD for word breaks. 415 | #[cfg(not(miri))] 416 | fn ucdtests() -> Vec<WordBreakTest> { 417 | const TESTDATA: &str = include_str!("data/WordBreakTest.txt"); 418 | 419 | let mut tests = vec![]; 420 | for mut line in TESTDATA.lines() { 421 | line = line.trim(); 422 | if line.starts_with("#") || line.contains("surrogate") { 423 | continue; 424 | } 425 | tests.push(line.parse().unwrap()); 426 | } 427 | tests 428 | } 429 | } 430 | --------------------------------------------------------------------------------
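The invalid-UTF-8 contract documented on `GraphemeIndices` above can be exercised end to end. A minimal sketch, assuming `bstr` is added as a dependency:

use bstr::ByteSlice;

fn main() {
    // `\xFF` can never start a valid UTF-8 sequence, so the iterator
    // reports a range covering just that one byte while yielding the
    // three-byte replacement codepoint U+FFFD.
    let mut it = b"a\xFF".grapheme_indices();
    assert_eq!(Some((0, 1, "a")), it.next());
    assert_eq!(Some((1, 2, "\u{FFFD}")), it.next());
    assert_eq!(None, it.next());
}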