├── .github
    └── workflows
    │   └── test.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── LineBreak.txt
├── README.md
├── gen-tables
    ├── .gitignore
    ├── Cargo.lock
    ├── Cargo.toml
    └── src
    │   └── main.rs
├── src
    ├── lib.rs
    └── shared.rs
└── tests
    ├── LineBreakTest.txt
    └── test_default.rs


/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: test
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   test:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |       - uses: actions/checkout@v2
10 |       - name: Install Rust
11 |         uses: actions-rs/toolchain@v1
12 |         with:
13 |           toolchain: stable
14 |           profile: minimal
15 |           override: true
16 | 
17 |       - name: Configure src/tables.rs cache
18 |         id: cache-tables
19 |         uses: actions/cache@v3
20 |         with:
21 |           path: src/tables.rs
22 |           key: ${{ hashFiles('LineBreak.txt', 'gen-tables/**') }}
23 | 
24 |       - name: Generates src/tables.rs
25 |         run: cargo run
26 |         working-directory: ./gen-tables
27 |         if: steps.cache-tables.outputs.cache-hit != 'true'
28 | 
29 |       - name: Run tests
30 |         run: cargo test
31 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /Cargo.lock
3 | /src/tables.rs
4 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "unicode-linebreak"
 3 | version = "0.1.5"
 4 | authors = ["Axel Forsman <axelsfor@gmail.com>"]
 5 | description = "Implementation of the Unicode Line Breaking Algorithm"
 6 | homepage = "https://github.com/axelf4/unicode-linebreak"
 7 | repository = "https://github.com/axelf4/unicode-linebreak"
 8 | readme = "README.md"
 9 | keywords = ["unicode", "text", "layout"]
10 | categories = ["internationalization"]
11 | license = "Apache-2.0"
12 | include = ["src/**/*", "LICENSE"]
13 | edition = "2021"
14 | rust-version = "1.56"
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # unicode-linebreak
 2 | 
 3 | Implementation of the Line Breaking Algorithm described in [Unicode Standard Annex #14][UAX14].
 4 | 
 5 | ![test](https://github.com/axelf4/unicode-linebreak/workflows/test/badge.svg)
 6 | [![Documentation](https://docs.rs/unicode-linebreak/badge.svg)](https://docs.rs/unicode-linebreak)
 7 | 
 8 | Given an input text, locates "line break opportunities", or positions appropriate for wrapping
 9 | lines when displaying text.
10 | 
11 | ## Example
12 | 
13 | ```rust
14 | use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}};
15 | 
16 | let text = "a b \nc";
17 | assert!(linebreaks(text).eq([
18 | 	(2, Allowed),   // May break after first space
19 | 	(5, Mandatory), // Must break after line feed
20 | 	(6, Mandatory)  // Must break at end of text, so that there always is at least one LB
21 | ]));
22 | ```
23 | 
24 | ## Development
25 | 
26 | After cloning the repository or modifying `LineBreak.txt` the tables
27 | have to be (re-)generated:
28 | 
29 | ```sh
30 | # Generate src/tables.rs
31 | (cd gen-tables && cargo run)
32 | # Run tests to make sure it was successful
33 | cargo test
34 | ```
35 | 
36 | [UAX14]: https://www.unicode.org/reports/tr14/
37 | 


--------------------------------------------------------------------------------
/gen-tables/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | 


--------------------------------------------------------------------------------
/gen-tables/Cargo.lock:
--------------------------------------------------------------------------------
  1 | # This file is automatically @generated by Cargo.
  2 | # It is not intended for manual editing.
  3 | version = 3
  4 | 
  5 | [[package]]
  6 | name = "ahash"
  7 | version = "0.8.3"
  8 | source = "registry+https://github.com/rust-lang/crates.io-index"
  9 | checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f"
 10 | dependencies = [
 11 |  "cfg-if",
 12 |  "once_cell",
 13 |  "version_check",
 14 | ]
 15 | 
 16 | [[package]]
 17 | name = "aho-corasick"
 18 | version = "1.0.2"
 19 | source = "registry+https://github.com/rust-lang/crates.io-index"
 20 | checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41"
 21 | dependencies = [
 22 |  "memchr",
 23 | ]
 24 | 
 25 | [[package]]
 26 | name = "allocator-api2"
 27 | version = "0.2.15"
 28 | source = "registry+https://github.com/rust-lang/crates.io-index"
 29 | checksum = "56fc6cf8dc8c4158eed8649f9b8b0ea1518eb62b544fe9490d66fa0b349eafe9"
 30 | 
 31 | [[package]]
 32 | name = "cfg-if"
 33 | version = "1.0.0"
 34 | source = "registry+https://github.com/rust-lang/crates.io-index"
 35 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 36 | 
 37 | [[package]]
 38 | name = "gen-tables"
 39 | version = "0.0.0"
 40 | dependencies = [
 41 |  "hashbrown",
 42 |  "regex",
 43 | ]
 44 | 
 45 | [[package]]
 46 | name = "hashbrown"
 47 | version = "0.14.0"
 48 | source = "registry+https://github.com/rust-lang/crates.io-index"
 49 | checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
 50 | dependencies = [
 51 |  "ahash",
 52 |  "allocator-api2",
 53 | ]
 54 | 
 55 | [[package]]
 56 | name = "memchr"
 57 | version = "2.5.0"
 58 | source = "registry+https://github.com/rust-lang/crates.io-index"
 59 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
 60 | 
 61 | [[package]]
 62 | name = "once_cell"
 63 | version = "1.18.0"
 64 | source = "registry+https://github.com/rust-lang/crates.io-index"
 65 | checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
 66 | 
 67 | [[package]]
 68 | name = "regex"
 69 | version = "1.9.1"
 70 | source = "registry+https://github.com/rust-lang/crates.io-index"
 71 | checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575"
 72 | dependencies = [
 73 |  "aho-corasick",
 74 |  "memchr",
 75 |  "regex-automata",
 76 |  "regex-syntax",
 77 | ]
 78 | 
 79 | [[package]]
 80 | name = "regex-automata"
 81 | version = "0.3.2"
 82 | source = "registry+https://github.com/rust-lang/crates.io-index"
 83 | checksum = "83d3daa6976cffb758ec878f108ba0e062a45b2d6ca3a2cca965338855476caf"
 84 | dependencies = [
 85 |  "aho-corasick",
 86 |  "memchr",
 87 |  "regex-syntax",
 88 | ]
 89 | 
 90 | [[package]]
 91 | name = "regex-syntax"
 92 | version = "0.7.3"
 93 | source = "registry+https://github.com/rust-lang/crates.io-index"
 94 | checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846"
 95 | 
 96 | [[package]]
 97 | name = "version_check"
 98 | version = "0.9.4"
 99 | source = "registry+https://github.com/rust-lang/crates.io-index"
100 | checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
101 | 


--------------------------------------------------------------------------------
/gen-tables/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "gen-tables"
 3 | version = "0.0.0"
 4 | edition = "2021"
 5 | publish = false
 6 | 
 7 | [dependencies]
 8 | regex = "1"
 9 | hashbrown = "0.14"
10 | 
11 | # Prevent this from interfering with workspaces
12 | [workspace]
13 | members = ["."]
14 | 


--------------------------------------------------------------------------------
/gen-tables/src/main.rs:
--------------------------------------------------------------------------------
  1 | /*!
  2 | Parses the rules into a state machine using a pair table. Each value in the table specifies the
  3 | next state and whether it's a forced/allowed break. To handle rules such as
  4 | 
  5 |     B SP* ÷ A
  6 | 
  7 | the extra state BSP is employed in the pair-table-friendly equivalent rules
  8 | 
  9 |     (B | BSP) ÷ A, Treat (B | BSP) SP as if it were BSP, Treat BSP as if it were SP
 10 | */
 11 | #![recursion_limit = "512"]
 12 | 
 13 | use hashbrown::{hash_map::Entry, HashMap};
 14 | use regex::Regex;
 15 | use std::borrow::Borrow;
 16 | use std::cmp::{max, min};
 17 | use std::fs::File;
 18 | use std::hash::{BuildHasher, Hash, Hasher};
 19 | use std::io::{BufRead, BufReader, BufWriter, Write};
 20 | use std::marker::PhantomData;
 21 | use std::ops::Range;
 22 | use std::str::FromStr;
 23 | use std::{error, iter};
 24 | 
 25 | include!("../../src/shared.rs");
 26 | 
 27 | impl FromStr for BreakClass {
 28 |     type Err = &'static str;
 29 | 
 30 |     fn from_str(s: &str) -> Result<Self, Self::Err> {
 31 |         Ok(match s {
 32 |             "BK" => BK,
 33 |             "CR" => CR,
 34 |             "LF" => LF,
 35 |             "CM" => CM,
 36 |             "NL" => NL,
 37 |             "SG" => SG,
 38 |             "WJ" => WJ,
 39 |             "ZW" => ZW,
 40 |             "GL" => GL,
 41 |             "SP" => SP,
 42 |             "ZWJ" => ZWJ,
 43 |             "B2" => B2,
 44 |             "BA" => BA,
 45 |             "BB" => BB,
 46 |             "HY" => HY,
 47 |             "CB" => CB,
 48 |             "CL" => CL,
 49 |             "CP" => CP,
 50 |             "EX" => EX,
 51 |             "IN" => IN,
 52 |             "NS" => NS,
 53 |             "OP" => OP,
 54 |             "QU" => QU,
 55 |             "IS" => IS,
 56 |             "NU" => NU,
 57 |             "PO" => PO,
 58 |             "PR" => PR,
 59 |             "SY" => SY,
 60 |             "AI" => AI,
 61 |             "AL" => AL,
 62 |             "CJ" => CJ,
 63 |             "EB" => EB,
 64 |             "EM" => EM,
 65 |             "H2" => H2,
 66 |             "H3" => H3,
 67 |             "HL" => HL,
 68 |             "ID" => ID,
 69 |             "JL" => JL,
 70 |             "JV" => JV,
 71 |             "JT" => JT,
 72 |             "RI" => RI,
 73 |             "SA" => SA,
 74 |             "XX" => XX,
 75 |             _ => return Err("Invalid break class"),
 76 |         })
 77 |     }
 78 | }
 79 | 
/// Number of line break classes; also the length of `BREAK_CLASS_TABLE`.
const NUM_CLASSES: usize = 43;
/// Abbreviated names of the line break classes, listed in the same order as
/// the match arms of `BreakClass::from_str`.
// NOTE(review): presumably this order also matches the `BreakClass`
// discriminants declared in shared.rs — confirm there.
static BREAK_CLASS_TABLE: [&str; NUM_CLASSES] = [
    "BK", "CR", "LF", "CM", "NL", "SG", "WJ", "ZW", "GL", "SP", "ZWJ", "B2", "BA", "BB", "HY",
    "CB", "CL", "CP", "EX", "IN", "NS", "OP", "QU", "IS", "NU", "PO", "PR", "SY", "AI", "AL", "CJ",
    "EB", "EM", "H2", "H3", "HL", "ID", "JL", "JV", "JT", "RI", "SA", "XX",
];
 86 | 
/// Additional pair-table states that exist besides the break classes.
///
/// The discriminants are consecutive `u8` values; the first one, `ZWSP`, is
/// `sot + 1`, so these states are numbered directly after `sot`.
// NOTE(review): the names look like "<class> followed by spaces" (and similar
// multi-character context) states, in the spirit of the `BSP` example in the
// file-level docs — confirm against the rules that reference them.
#[derive(Copy, Clone)]
#[repr(u8)]
#[allow(clippy::upper_case_acronyms)]
enum ExtraState {
    ZWSP = sot + 1,
    OPSP,
    QUSP,
    CLSP,
    CPSP,
    B2SP,
    HLHYBA,
    RIRI,
}
100 | 
101 | use ExtraState::*;
102 | 
/// The number of classes plus the eot state.
///
/// This is the number of columns in each pair-table row.
const NUM_CLASSES_EOT: usize = NUM_CLASSES + 1;
/// The number of rows (states) in the pair table.
// NOTE(review): the `+ 10` presumably covers eot, sot and the eight
// `ExtraState` variants — confirm against the definitions in shared.rs.
const NUM_STATES: usize = NUM_CLASSES + 10;
106 | 
/// Separate implementation to prevent infinite recursion.
///
/// Token-munching helper behind `rules2table!`. Every invocation has the shape
/// `(LEN pair_table parsed...) remaining tokens...`:
///
/// * `LEN` is the exclusive upper bound used when the next pattern is `ALL` or
///   `[^...]`. It is `NUM_STATES` at the start of a rule and is replaced by
///   `NUM_CLASSES_EOT` once an operand or operator has been consumed.
/// * `pair_table` is the identifier of the table being filled in.
/// * `parsed...` accumulates what has been recognised of the current rule so
///   far (index iterators, operator literals and marker identifiers).
///
/// Each rule-applying arm first expands the rules that follow the comma and
/// only then emits its own statements, so the generated code applies the rule
/// list from last to first.
#[doc(hidden)]
macro_rules! rules2table_impl {
    // Operators
    // The operator literal is appended to the parsed arguments and parsing continues.
    (($len:ident $($args:tt)*) '÷' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '÷') $($tt)+}};
    (($len:ident $($args:tt)*) '×' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '×') $($tt)+}};
    (($len:ident $($args:tt)*) '!' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '!') $($tt)+}};
    // Perform operator
    // Both operands are optional; a missing operand means "all rows" / "all columns".
    (($len:ident $pair_table:ident $($first:ident)? $operator:literal $($second:ident)?) $(, $($tt:tt)*)?) => {
        // Expand the remaining rules first (see the macro-level docs).
        $(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
        #[allow(unused)] let first = 0..NUM_STATES; // Default to ALL
        $(let first = $first;)?
        #[allow(unused)] let second = 0..NUM_CLASSES_EOT; // Default to ALL
        $(let second = $second;)?
        for i in first {
            for j in second.clone() {
                let cell = &mut $pair_table[i][j];
                match $operator {
                    // Mandatory break: sets both break bits.
                    '!' => *cell |= ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT,
                    // Allowed break: sets only the allowed bit.
                    '÷' => *cell |= ALLOWED_BREAK_BIT,
                    // No break: clears both break bits.
                    '×' => *cell &= !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT),
                    _ => unreachable!("Bad operator"),
                }
            }
        }
    };

    // Keyword phrases are folded into single marker identifiers
    // (`treat_x`, `treat`, `as_if_it_were_x_where_x_is`) for the arms below.
    (($len:ident $($args:tt)*) Treat X $($tt:tt)*) => {
        rules2table_impl! {(NUM_CLASSES_EOT $($args)* treat_x) $($tt)*}
    };
    (($len:ident $($args:tt)*) Treat $($tt:tt)*) => {
        rules2table_impl! {(NUM_STATES $($args)* treat) $($tt)*}
    };
    (($len:ident $($args:tt)*) * as if it were X where X = $($tt:tt)*) => {
        rules2table_impl! {(NUM_STATES $($args)* as_if_it_were_x_where_x_is) $($tt)*}
    };

    // "Treat X <second>* as if it were X where X = <X>":
    // for every row `i` in X and column `j` in <second>, the cell is overwritten with `i`.
    (($len:ident $pair_table:ident treat_x $second:ident as_if_it_were_x_where_x_is $X:ident) $(, $($tt:tt)*)?) => {
        $(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
        for i in $X {
            for j in $second.clone() {
                $pair_table[i][j] = i as u8;
            }
        }
    };
    // "Treat <first> <second> as if it were <cls>":
    // every cell at (row in <first>, column in <second>) is overwritten with `cls`.
    (($len:ident $pair_table:ident treat $first:ident $second:ident) as if it were $cls:ident $(, $($tt:tt)*)?) => {
        $(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
        let cls = $cls as u8;
        for i in $first {
            for j in $second.clone() {
                $pair_table[i][j] = cls;
            }
        }
    };
    // "Treat <first> as if it were <cls>":
    // column `cls` is copied into every column of <first> that is a valid column
    // index, then row `cls` is copied over every row of <first>.
    (($len:ident $pair_table:ident treat $first:ident) as if it were $cls:ident $(, $($tt:tt)*)?) => {
        $(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
        for j in $first.clone().filter(|&j| j < NUM_CLASSES_EOT) {
            for row in $pair_table.iter_mut() {
                row[j] = row[$cls as usize];
            }
        }
        for i in $first {
            $pair_table.copy_within($cls as usize..$cls as usize + 1, i);
        }
    };

    // Pattern arms: each one binds a local iterator named `indices` and appends
    // that identifier to the parsed arguments.
    // All classes pattern
    (($len:ident $($args:tt)*) ALL $($tt:tt)*) => {
        let indices = 0..$len;
        rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
    };
    // Single class pattern
    (($len:ident $($args:tt)*) $cls:ident $($tt:tt)*) => {
        let indices = iter::once($cls as usize);
        rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
    };
    // Parse (X | ...) patterns
    (($len:ident $($args:tt)*) ($($cls:ident)|+) $($tt:tt)*) => {
        let indices = [$($cls as usize),+].into_iter();
        rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
    };
    // Parse [^ ...] patterns
    // Everything in `0..LEN` except the listed classes.
    (($len:ident $($args:tt)*) [^$($cls:ident)+] $($tt:tt)*) => {
        let excluded = [$($cls as usize),+];
        let indices = (0..$len).filter(|i| !excluded.contains(i));
        rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
    };

    (($len:ident $pair_table:ident)) => {}; // Exit condition
}
197 | 
/// Builds the pair table described by a comma-separated list of rules.
///
/// The table has `NUM_STATES` rows of `NUM_CLASSES_EOT` cells. Every row
/// starts out as the identity mapping (the cell in column `j` holds `j`);
/// the rules are then applied on top of that by `rules2table_impl!`.
///
/// The rule syntax is a modified subset of the one in Unicode Standard Annex #14.
macro_rules! rules2table {
    ($($tt:tt)+) => {{
        // Identity row: column index == stored value.
        let mut identity_row = [0u8; NUM_CLASSES_EOT];
        identity_row
            .iter_mut()
            .zip(0u8..)
            .for_each(|(cell, class)| *cell = class);

        let mut pair_table = [identity_row; NUM_STATES];
        rules2table_impl! {(NUM_STATES pair_table) $($tt)+}
        pair_table
    }};
}
214 | 
/// Extension methods available on every iterator.
trait IteratorExt: Iterator {
    /// Reports whether every remaining element compares equal to the first one.
    ///
    /// An exhausted iterator yields `true`. The iterator is advanced: it is
    /// consumed up to and including the first element that differs (or fully,
    /// when all elements are equal).
    fn all_equal(&mut self) -> bool
    where
        Self::Item: PartialEq,
        Self: Sized,
    {
        match self.next() {
            // Nothing to compare: vacuously true.
            None => true,
            Some(head) => self.all(|item| item == head),
        }
    }
}

/// Blanket implementation so the extension applies to all iterators.
impl<I: Iterator> IteratorExt for I {}
227 | 
/// Returns the largest `n` such that the last `n` elements of `a` equal the
/// first `n` elements of `b`, or `0` if there is no such `n`.
///
/// Only lengths strictly below `min(a.len(), b.len())` are considered, so a
/// complete overlap of the shorter sequence is never reported.
fn overlap<T: PartialEq, I: IntoIterator>(a: &[T], b: I) -> usize
where
    I::Item: Borrow<T>,
    I::IntoIter: ExactSizeIterator + Clone,
{
    let head = b.into_iter();
    let upper = min(a.len(), head.len());

    // Try the longest candidate first so the first hit is the maximum.
    for n in (1..upper).rev() {
        let tail = &a[a.len() - n..];
        let matches = tail
            .iter()
            .zip(head.clone())
            .all(|(x, y)| x == y.borrow());
        if matches {
            return n;
        }
    }
    0
}
244 | 
/// Exclusive upper bound of the code point range (one past U+10FFFF).
const UNICODE_LIMIT: u32 = 0x110000;
// NOTE(review): presumably the exclusive end of the ASCII range; it is not
// referenced in the part of the file visible here — confirm its use.
const ASCII_LIMIT: u32 = 0x80;
/// Number of small data blocks in one BMP data block,
/// i.e. `1 << (BMP_SHIFT - SHIFT_3)`.
const SMALL_DATA_BLOCKS_PER_BMP_BLOCK: u32 = 1 << (BMP_SHIFT - SHIFT_3);
/// Number of code points per index-2 table entry.
const CP_PER_INDEX_2_ENTRY: u32 = 1 << SHIFT_2;
250 | 
/// One entry of the builder's index table, describing a single small data
/// block (the entry for code point `c` is at position `c >> SHIFT_3`).
#[derive(Clone, Copy, PartialEq, Debug)]
enum Index<T> {
    /// The whole block maps to the single `value`.
    AllSame { value: T },
    /// The block's values are stored individually in the builder's `data`
    /// vector, starting at offset `data_block`.
    Mixed { data_block: u32 },
}
256 | 
/// UCPTrie builder.
///
/// See: [ICU Code Point Tries]
///
/// [ICU Code Point Tries]: https://icu.unicode.org/design/struct/utrie
#[derive(Default)]
struct CpTrieBuilder<T> {
    /// Index-3 table.
    ///
    /// Holds one entry per small data block; the entry for code point `c`
    /// lives at position `c >> SHIFT_3`.
    index: Vec<Index<T>>,
    /// Backing storage for the blocks referenced by `Index::Mixed` entries.
    data: Vec<T>,
    /// Value given to index entries that are created when the index grows.
    initial_value: T,
}
269 | 
270 | impl<T: Copy + PartialEq + Eq + Hash> CpTrieBuilder<T> {
271 |     fn new(initial_value: T) -> Self {
272 |         Self {
273 |             index: Vec::with_capacity(UNICODE_LIMIT as usize >> SHIFT_3),
274 |             data: Vec::new(),
275 |             initial_value,
276 |         }
277 |     }
278 | 
279 |     fn set_range(&mut self, Range { mut start, end }: Range<u32>, value: T) {
280 |         if start >= end {
281 |             return; // Empty range
282 |         }
283 |         if end as usize > self.index.len() {
284 |             // Round up to CP_PER_INDEX_2_ENTRY boundary to simplify compaction
285 |             let c = (end + CP_PER_INDEX_2_ENTRY - 1) & !(CP_PER_INDEX_2_ENTRY - 1);
286 |             self.index.resize(
287 |                 c as usize >> SHIFT_3,
288 |                 Index::AllSame {
289 |                     value: self.initial_value,
290 |                 },
291 |             );
292 |         }
293 | 
294 |         // Set partial block at [start, next block boundary)
295 |         let block_start = start & !(SMALL_DATA_BLOCK_LENGTH - 1);
296 |         if start > block_start {
297 |             let block = self.data_block(start);
298 |             let block = &mut self.data[block as usize..][..SMALL_DATA_BLOCK_LENGTH as usize]
299 |                 [(start & (SMALL_DATA_BLOCK_LENGTH - 1)) as usize..];
300 |             if end < block_start + SMALL_DATA_BLOCK_LENGTH {
301 |                 block[..((end - start) & (SMALL_DATA_BLOCK_LENGTH - 1)) as usize].fill(value);
302 |                 return;
303 |             }
304 |             block.fill(value);
305 |             start = block_start + SMALL_DATA_BLOCK_LENGTH;
306 |         }
307 | 
308 |         // Fill all full blocks
309 |         while start < end & !(SMALL_DATA_BLOCK_LENGTH - 1) {
310 |             match &mut self.index[start as usize >> SHIFT_3] {
311 |                 Index::AllSame { value: prev_value } => *prev_value = value,
312 |                 Index::Mixed { data_block } => {
313 |                     self.data[*data_block as usize..][..SMALL_DATA_BLOCK_LENGTH as usize]
314 |                         .fill(value);
315 |                 }
316 |             }
317 |             start += SMALL_DATA_BLOCK_LENGTH;
318 |         }
319 | 
320 |         // Set partial block at [last block boundary..end)
321 |         let rest = end & (SMALL_DATA_BLOCK_LENGTH - 1);
322 |         if rest > 0 {
323 |             let block = self.data_block(start);
324 |             self.data[block as usize..][..rest as usize].fill(value);
325 |         }
326 |     }
327 | 
328 |     fn data_block(&mut self, c: u32) -> u32 {
329 |         let i = c as usize >> SHIFT_3;
330 |         if let Index::Mixed { data_block } = self.index[i] {
331 |             return data_block; // Already allocated
332 |         }
333 | 
334 |         let (block_len, small_blocks) = if i < (BMP_LIMIT << SHIFT_3) as usize {
335 |             let i_start = i & !(SMALL_DATA_BLOCKS_PER_BMP_BLOCK as usize - 1);
336 |             (
337 |                 BMP_DATA_BLOCK_LENGTH,
338 |                 i_start..i_start + SMALL_DATA_BLOCKS_PER_BMP_BLOCK as usize,
339 |             )
340 |         } else {
341 |             (SMALL_DATA_BLOCK_LENGTH, i..i + 1)
342 |         };
343 |         // Allocate a new data block
344 |         let new_block = self.data.len() as u32;
345 |         self.data
346 |             .extend(iter::repeat(self.initial_value).take(block_len as usize));
347 | 
348 |         for (k, i) in small_blocks.clone().enumerate() {
349 |             let prev_value = if let Index::AllSame { value } = self.index[i] {
350 |                 value
351 |             } else {
352 |                 unreachable!()
353 |             };
354 |             let block = new_block + k as u32 * SMALL_DATA_BLOCK_LENGTH;
355 |             self.data[block as usize..][..SMALL_DATA_BLOCK_LENGTH as usize].fill(prev_value);
356 |             self.index[i] = Index::Mixed { data_block: block };
357 |         }
358 |         new_block + SMALL_DATA_BLOCK_LENGTH * (i - small_blocks.start) as u32
359 |     }
360 | 
361 |     fn get(&self, c: u32) -> T {
362 |         match self.index[c as usize >> SHIFT_3] {
363 |             Index::AllSame { value } => value,
364 |             Index::Mixed { data_block } => {
365 |                 self.data[(data_block + (c & (SMALL_DATA_BLOCK_LENGTH - 1))) as usize]
366 |             }
367 |         }
368 |     }
369 | 
370 |     // Compact arrays by
371 |     //
372 |     // * removing blocks identical to earlier ones
373 |     // * overlapping each block as much as possible with the previously written one
374 | 
    /// Rebuilds `self.data` as a compacted array and repoints the index entries at it.
    ///
    /// ASCII values are written first, uncompacted. The remaining entries are then visited in
    /// order: in steps of `SMALL_DATA_BLOCKS_PER_BMP_BLOCK` with BMP-sized blocks below
    /// `BMP_LIMIT`, and one small block at a time above it. Each visited entry ends up as
    /// `Index::Mixed` pointing into the new data array, reusing an identical earlier block or
    /// overlapping the tail of the data written so far where possible.
    fn compact_data(&mut self) {
        let mut new_data = Vec::with_capacity(self.data.len());
        // Always store ASCII data linearly at start
        new_data.extend((0..ASCII_LIMIT).map(|i| self.get(i)));
        // Point the first entry of each ASCII BMP block at its linear position
        self.index
            .iter_mut()
            .take(ASCII_LIMIT as usize >> SHIFT_3)
            .step_by(SMALL_DATA_BLOCKS_PER_BMP_BLOCK as usize)
            .enumerate()
            .for_each(|(i, x)| {
                *x = Index::Mixed {
                    data_block: BMP_DATA_BLOCK_LENGTH * i as u32,
                }
            });

        let mut block_len = BMP_DATA_BLOCK_LENGTH;
        // Maps a value to the index entry of the first uniform block seen with that value
        let mut uniform_blocks = HashMap::new();
        let mut block_index = BlockIndex::new(self.data.len(), block_len as usize);
        let mut inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK as usize;
        let mut i = ASCII_LIMIT as usize >> SHIFT_3;
        while i < self.index.len() {
            if i == BMP_LIMIT as usize >> SHIFT_3 {
                // Past the BMP: switch to small blocks and re-index what was written so far
                // using the new block length
                block_len = SMALL_DATA_BLOCK_LENGTH;
                inc = 1;
                block_index.clear(block_len as usize);
                block_index.extend(&new_data);
            }

            let old_index = match self.index[i] {
                // Check if all of fast-range data block's blocks have all same or turn into mixed
                Index::AllSame { value }
                    if !self.index[i..][1..inc]
                        .iter()
                        .all(|x| matches!(x, Index::AllSame { value: v } if *v == value)) =>
                {
                    Index::Mixed {
                        data_block: self.data_block((i as u32) << SHIFT_3), // Turn into mixed block
                    }
                }
                // Check if really mixed
                x @ Index::Mixed { data_block } => {
                    let block = &self.data[data_block as usize..][..block_len as usize];
                    let all_same = block.iter().skip(1).all(|&x| x == block[0]);
                    if all_same {
                        Index::AllSame { value: block[0] }
                    } else {
                        x
                    }
                }
                x => x,
            };
            let new_index = match old_index {
                Index::AllSame { value } => {
                    // Is there another uniform block with the same value?
                    if let Some(j) = match uniform_blocks.entry(value) {
                        Entry::Occupied(entry) => Some(*entry.get()),
                        Entry::Vacant(entry) => {
                            entry.insert(i as u32);
                            None
                        }
                    } {
                        // Entry `j` was visited earlier and therefore already points into
                        // `new_data`
                        if let Index::Mixed { data_block } = self.index[j as usize] {
                            data_block
                        } else {
                            unreachable!()
                        }
                    } else if let Some(n) = block_index
                        .find_block(&new_data, iter::repeat(value).take(block_len as usize))
                    {
                        n
                    } else {
                        // Reuse a trailing run of `value` at the end of `new_data`, if any
                        let overlap = new_data
                            .iter()
                            .rev()
                            .take(block_len as usize - 1)
                            .take_while(|&&x| x == value)
                            .count();
                        let new_index = (new_data.len() - overlap) as u32;
                        new_data.extend(iter::repeat(value).take(block_len as usize - overlap));
                        block_index.extend(&new_data);
                        new_index
                    }
                }
                Index::Mixed { data_block } => {
                    let block = &self.data[data_block as usize..][..block_len as usize];
                    if let Some(n) = block_index.find_block(&new_data, block) {
                        n
                    } else {
                        // Append only the part of the block not already present at the end
                        let overlap = overlap(&new_data, block);
                        let new_index = (new_data.len() - overlap) as u32;
                        new_data.extend_from_slice(&block[overlap..]);
                        block_index.extend(&new_data);
                        new_index
                    }
                }
            };
            self.index[i] = Index::Mixed {
                data_block: new_index,
            };
            i += inc;
        }

        self.data = new_data;
    }
479 | 
    /// Builds the final 16-bit index array from the index-3 table.
    ///
    /// The result is laid out as: the condensed BMP fast index, the index-1 table, then the
    /// compacted index-3 and index-2 blocks. Every entry of `self.index` that is read here
    /// must already be `Index::Mixed` (otherwise `unreachable!()` is hit), which is the state
    /// `compact_data` leaves the entries it visits in.
    ///
    /// # Panics
    /// Hits `todo!()` if an index-3 block contains a data offset that does not fit in 16 bits.
    fn compact_index(&mut self) -> Vec<u16> {
        let fast_index_len = BMP_LIMIT as usize >> BMP_SHIFT;
        let index2_capacity =
            (self.index.len() - (BMP_LIMIT as usize >> SHIFT_3)) >> (SHIFT_2 - SHIFT_3);
        let index1_len =
            (index2_capacity + INDEX_2_BLOCK_LENGTH as usize - 1) >> (SHIFT_1 - SHIFT_2);
        let index1_end = fast_index_len + index1_len;
        let mut index16 = Vec::with_capacity(index1_end + index2_capacity);
        let mut block_index = BlockIndex::new(index16.capacity(), INDEX_3_BLOCK_LENGTH as usize);

        let (fast_index, small_index) = self.index.split_at(BMP_LIMIT as usize >> SHIFT_3);
        // Condense fast index table
        index16.extend(
            fast_index
                .iter()
                .step_by(SMALL_DATA_BLOCKS_PER_BMP_BLOCK as usize)
                .map(|x| {
                    if let Index::Mixed { data_block: i3 } = x {
                        *i3 as u16
                    } else {
                        unreachable!()
                    }
                }),
        );
        debug_assert_eq!(index16.len(), fast_index_len);
        block_index.extend(&index16);

        index16.extend(iter::repeat(0).take(index1_len)); // Reserve space for index-1 table
        block_index.skip(index1_len);

        // Compact the index-3 table and write uncompacted index-2 table
        let index2: Vec<_> = small_index
            .chunks_exact(INDEX_3_BLOCK_LENGTH as usize)
            .map(|block| {
                let block = block.iter().map(|x| {
                    if let Index::Mixed { data_block } = x {
                        *data_block
                    } else {
                        unreachable!()
                    }
                });
                // OR of all offsets in the block: exceeds 0xffff iff some offset needs more
                // than 16 bits
                let ored = block.clone().fold(0, |acc, i3| acc | i3);

                if ored <= 0xffff {
                    let block = block.map(|x| x as u16);
                    if let Some(n) = block_index.find_block(&index16, block.clone()) {
                        n as u16
                    } else {
                        // Only overlap with blocks written after the index-1 table
                        let overlap = overlap(&index16[index1_end..], block.clone());
                        let i3 = (index16.len() - overlap) as u16;
                        index16.extend(block.skip(overlap));
                        block_index.extend(&index16);
                        i3
                    }
                } else {
                    todo!() // Encode index-3 block with one or more data indices exceeding 16 bits
                }
            })
            .collect();

        // Compact the index-2 table and write the index-1 table
        debug_assert_eq!(
            INDEX_2_BLOCK_LENGTH, INDEX_3_BLOCK_LENGTH,
            "cannot reuse block index"
        );
        for (i, block) in index2.chunks(INDEX_2_BLOCK_LENGTH as usize).enumerate() {
            let i2 = if let Some(n) = block_index.find_block(&index16, block) {
                n as u16
            } else {
                let overlap = overlap(&index16[index1_end..], block);
                let i2 = (index16.len() - overlap) as u16;
                index16.extend(&block[overlap..]);
                block_index.extend(&index16);
                i2
            };

            // Fill in the slot reserved for this index-2 block in the index-1 table
            let i1 = fast_index_len + i;
            index16[i1] = i2;
        }

        index16
    }
562 | 
563 |     fn build(mut self) -> CpTrie<T> {
564 |         if self.index.len() < BMP_LIMIT as usize >> SHIFT_3 {
565 |             self.index.resize(
566 |                 BMP_LIMIT as usize >> SHIFT_3,
567 |                 Index::AllSame {
568 |                     value: self.initial_value,
569 |                 },
570 |             );
571 |         }
572 |         self.compact_data();
573 |         let high_start = {
574 |             let i = self
575 |                 .index
576 |                 .last()
577 |                 .filter(|&x| {
578 |                     if let Index::Mixed { data_block } = x {
579 |                         self.data[*data_block as usize..][..SMALL_DATA_BLOCK_LENGTH as usize]
580 |                             .iter()
581 |                             .all(|&x| x == self.initial_value)
582 |                     } else {
583 |                         false
584 |                     }
585 |                 })
586 |                 .map(|i| self.index.iter().rposition(|x| x != i).unwrap())
587 |                 .map_or(self.index.len(), |i| i + 1) as u32;
588 |             let c = ((i << SHIFT_3) + CP_PER_INDEX_2_ENTRY - 1) & !(CP_PER_INDEX_2_ENTRY - 1);
589 |             max(c, BMP_LIMIT)
590 |         };
591 |         self.index.truncate(high_start as usize >> SHIFT_3);
592 |         let index = self.compact_index();
593 | 
594 |         CpTrie {
595 |             high_start,
596 |             index,
597 |             data: self.data,
598 |         }
599 |     }
600 | }
601 | 
/// A payload paired with a precomputed 64-bit hash.
///
/// Hashing a `FixedHash` feeds only the stored hash (field 0) to the hasher; the payload
/// (field 1) does not contribute.
struct FixedHash<T>(u64, T);

impl<T> Hash for FixedHash<T> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        let Self(precomputed, _) = self;
        state.write_u64(*precomputed);
    }
}
609 | 
/// Hash-based lookup of fixed-length blocks inside a data slice owned by the caller.
///
/// Only offsets are stored; the slice itself is passed to each method that needs it.
struct BlockIndex<T> {
    /// Set of indexed blocks; each key holds the block's content hash and its start offset.
    set: hashbrown::HashMap<FixedHash<u32>, ()>,
    /// Length of the blocks being indexed.
    block_len: usize,
    /// Length of the data slice at the last `extend` (adjusted by `skip`).
    prev_end: usize,
    /// Ties the index to the element type without storing any elements.
    phantom: PhantomData<T>,
}
616 | 
impl<T: PartialEq + Hash> BlockIndex<T> {
    /// Creates an empty index for blocks of `block_len` elements, reserving room for
    /// `capacity` entries.
    fn new(capacity: usize, block_len: usize) -> Self {
        Self {
            set: hashbrown::HashMap::with_capacity(capacity),
            block_len,
            prev_end: 0,
            phantom: PhantomData,
        }
    }

    /// Forgets all indexed blocks and switches to blocks of `new_block_len` elements.
    fn clear(&mut self, new_block_len: usize) {
        self.set.clear();
        self.block_len = new_block_len;
        self.prev_end = 0;
    }

    /// Advances the bookkeeping so that the next `extend` starts its windows `n` elements
    /// after the previous end of the data, leaving windows that begin earlier unindexed.
    fn skip(&mut self, n: usize) {
        self.prev_end += n + self.block_len - 1;
    }

    /// Indexes the blocks of `data` that were not covered by earlier calls.
    ///
    /// `data` must be the same slice as before with new elements appended. A block is only
    /// inserted if no block with equal contents is already indexed, so lookups return the
    /// first occurrence.
    fn extend(&mut self, data: &[T]) {
        // First window that reaches past the previously indexed end
        let start = (self.prev_end + 1).saturating_sub(self.block_len);
        if data.len() <= start {
            return;
        }
        for (i, block) in data[start..].windows(self.block_len).enumerate() {
            let i = (start + i) as u32;

            // Hash of the block's contents
            let hash = {
                let mut s = self.set.hasher().build_hasher();
                block.iter().for_each(|x| x.hash(&mut s));
                s.finish()
            };
            // Hash of the key as `FixedHash::hash` would produce it (hashes the content hash)
            let hash2 = {
                let mut s = self.set.hasher().build_hasher();
                s.write_u64(hash);
                s.finish()
            };
            let is_match = |&FixedHash(_, j): &FixedHash<u32>| {
                data[j as usize..][..self.block_len].iter().eq(block)
            };
            self.set
                .raw_entry_mut()
                .from_hash(hash2, is_match)
                .or_insert(FixedHash(hash, i), ());
        }
        self.prev_end = data.len();
    }

    /// Returns the start offset in `data` of an indexed block equal to `block`, if any.
    ///
    /// `data` must be the slice the index was built from.
    fn find_block<I: IntoIterator>(&mut self, data: &[T], block: I) -> Option<u32>
    where
        I::Item: Borrow<T>,
        I::IntoIter: Clone,
    {
        let block = block.into_iter();
        // Must mirror the two-step hashing done in `extend`
        let hash = {
            let mut s = self.set.hasher().build_hasher();
            block.clone().for_each(|x| x.borrow().hash(&mut s));
            s.finish()
        };
        let hash2 = {
            let mut s = self.set.hasher().build_hasher();
            s.write_u64(hash);
            s.finish()
        };
        let is_match = |&FixedHash(_, j): &FixedHash<u32>| {
            data[j as usize..][..self.block_len]
                .iter()
                .zip(block.clone())
                .all(|(x, y)| x == y.borrow())
        };
        self.set
            .raw_entry()
            .from_hash(hash2, is_match)
            .map(|(&FixedHash(_, i), _)| i)
    }
}
694 | 
/// Compacted code point trie as produced by `CpTrieBuilder::build`.
struct CpTrie<T> {
    /// First code point not covered by the index; always at least `BMP_LIMIT`.
    high_start: u32,
    /// Compacted 16-bit index array produced by `compact_index`.
    index: Vec<u16>,
    /// Compacted data array produced by `compact_data`.
    data: Vec<T>,
}
700 | 
701 | fn main() -> Result<(), Box<dyn error::Error>> {
702 |     #[allow(clippy::assertions_on_constants)]
703 |     const _: () = debug_assert!(NUM_STATES <= 0x3F, "too many states");
704 | 
705 |     let pair_table = rules2table! {
706 |         // Non-tailorable Line Breaking Rules
707 |         // LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ,
708 |         // SA, SG, and XX into other line breaking classes depending on criteria outside the scope
709 |         // of this algorithm.
710 |         Treat (AI | SG | XX | SA) as if it were AL, Treat CJ as if it were NS,
711 |         // Start and end of text:
712 |         sot '×', // LB2 Never break at the start of text.
713 |         '!' eot, // LB3 Always break at the end of text.
714 |         // Mandatory breaks:
715 |         BK '!', // LB4 Always break after hard line breaks.
716 |         // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
717 |         CR '×' LF, CR '!', LF '!', NL '!',
718 |         '×' (BK | CR | LF | NL), // LB6 Do not break before hard line breaks.
719 |         // Explicit breaks and non-breaks:
720 |         '×' SP, '×' ZW, // LB7 Do not break before spaces or zero width space.
721 |         // LB8 Break before any character following a zero-width space, even if one or more spaces
722 |         // intervene.
723 |         (ZW | ZWSP) '÷', Treat (ZW | ZWSP) SP as if it were ZWSP, Treat ZWSP as if it were SP,
724 |         // ZWJ '×', // XXX Handled explicitly // LB8a Do not break after a zero width joiner.
725 |         // Combining marks:
726 |         // LB9 Do not break a combining character sequence; treat it as if it has the line breaking
727 |         // class of the base character in all of the following rules. Treat ZWJ as if it were CM.
728 |         Treat X (CM | ZWJ)* as if it were X where X = [^BK CR LF NL SP ZW sot eot ZWSP OPSP QUSP CLSP CPSP B2SP],
729 |         Treat (CM | ZWJ) as if it were AL, // LB10 Treat any remaining combining mark or ZWJ as AL.
730 |         // Word joiner:
731 |         '×' WJ, WJ '×', // LB11 Do not break before or after Word joiner and related characters.
732 |         // Non-breaking characters:
733 |         GL '×', // LB12 Do not break after NBSP and related characters.
734 | 
735 |         // Tailorable Line Breaking Rules
736 |         // LB12a Do not break before NBSP and related characters, except after spaces and hyphens.
737 |         [^SP BA HY sot eot ZWSP OPSP QUSP CLSP CPSP B2SP] '×' GL,
738 |         // LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
739 |         '×' CL, '×' CP, '×' EX, '×' IS, '×' SY,
740 |         // LB14 Do not break after ‘[’, even after spaces.
741 |         (OP | OPSP) '×', Treat (OP | OPSP) SP as if it were OPSP, Treat ZWSP as if it were SP,
742 |         // LB15 Do not break within ‘”[’, even with intervening spaces.
743 |         (QU | QUSP) '×' OP, Treat (QU | QUSP) SP as if it were QUSP, Treat QUSP as if it were SP,
744 |         // LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
745 |         // intervening spaces.
746 |         (CL | CLSP | CP | CPSP) '×' NS,
747 |         Treat (CL | CLSP) SP as if it were CLSP, Treat CLSP as if it were SP,
748 |         Treat (CP | CPSP) SP as if it were CPSP, Treat CPSP as if it were SP,
749 |         // LB17 Do not break within ‘——’, even with intervening spaces.
750 |         (B2 | B2SP) '×' B2, Treat (B2 | B2SP) SP as if it were B2SP, Treat B2SP as if it were SP,
751 |         // Spaces:
752 |         SP '÷', // LB18 Break after spaces.
753 |         // Special case rules:
754 |         '×' QU, QU '×', // LB19 Do not break before or after quotation marks, such as ‘”’.
755 |         '÷' CB, CB '÷', // LB20 Break before and after unresolved CB.
756 |         // LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana,
757 |         // and other non-starters, or after acute accents.
758 |         '×' BA, '×' HY, '×' NS, BB '×',
759 |         // LB21a Don't break after Hebrew + Hyphen. // XXX Use a single state, HLHYBA, for HLHY and HLBA
760 |         HLHYBA '×', Treat HL (HY | BA) as if it were HLHYBA, Treat HLHYBA as if it were HY,
761 |         SY '×' HL, // LB21b Don’t break between Solidus and Hebrew letters.
762 |         '×' IN, // LB22 Do not break before ellipses.
763 |         // Numbers:
764 |         (AL | HL) '×' NU, NU '×' (AL | HL), // LB23 Do not break between digits and letters.
765 |         // LB23a Do not break between numeric prefixes and ideographs, or between ideographs and
766 |         // numeric postfixes.
767 |         PR '×' (ID | EB | EM), (ID | EB | EM) '×' PO,
768 |         // LB24 Do not break between numeric prefix/postfix and letters, or between letters and
769 |         // prefix/postfix.
770 |         (PR | PO) '×' (AL | HL), (AL | HL) '×' (PR | PO),
771 |         // LB25 Do not break between the following pairs of classes relevant to numbers:
772 |         CL '×' PO, CP '×' PO, CL '×' PR, CP '×' PR, NU '×' PO, NU '×' PR, PO '×' OP, PO '×' NU, PR '×' OP, PR '×' NU, HY '×' NU, IS '×' NU, NU '×' NU, SY '×' NU,
773 |         // Korean syllable blocks
774 |         // LB26 Do not break a Korean syllable.
775 |         JL '×' (JL | JV | H2 | H3), (JV | H2) '×' (JV | JT), (JT | H3) '×' JT,
776 |         // LB27 Treat a Korean Syllable Block the same as ID.
777 |         (JL | JV | JT | H2 | H3) '×' PO, PR '×' (JL | JV | JT | H2 | H3),
778 |         // Finally, join alphabetic letters into words and break everything else.
779 |         (AL | HL) '×' (AL | HL), // LB28 Do not break between alphabetics (“at”).
780 |         IS '×' (AL | HL), // LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
781 |         // LB30 Do not break between letters, numbers, or ordinary symbols and opening or closing
782 |         // parentheses.
783 |         (AL | HL | NU) '×' OP, CP '×' (AL | HL | NU),
784 |         // LB30a Break between two regional indicator symbols if and only if there are an even
785 |         // number of regional indicators preceding the position of the break.
786 |         RI '×' RI, Treat RI RI as if it were RIRI, Treat RIRI as if it were RI,
787 |         EB '×' EM, // LB30b Do not break between an emoji base and an emoji modifier.
788 |         '÷' ALL, ALL '÷', // LB31 Break everywhere else.
789 |     };
790 | 
791 |     // Synthesize all non-"safe" pairs from pair table
792 |     let unsafe_pairs = (0..NUM_CLASSES).flat_map(|j| {
793 |         (0..NUM_CLASSES).filter_map(move |i| {
794 |             // All states that could have resulted from break class "i"
795 |             let possible_states = pair_table
796 |                 .iter()
797 |                 .map(|row| (row[i] & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT)) as usize);
798 |             // Check if all state transitions due to "j" are the same
799 |             if possible_states.map(|s| pair_table[s][j]).all_equal() {
800 |                 None
801 |             } else {
802 |                 Some((i, j))
803 |             }
804 |         })
805 |     });
806 | 
807 |     let re = Regex::new(
808 |         r"(?x)^
809 | (?P<start>[[:xdigit:]]{4,}) # Unicode code point
810 | (?:\.{2}(?P<end>[[:xdigit:]]{4,}))? # End of range
811 | ;
812 | (?P<lb>\w{2,3}) # Line_Break property",
813 |     )?;
814 |     let prop_ranges = BufReader::new(File::open("../LineBreak.txt")?)
815 |         .lines()
816 |         .map(Result::unwrap)
817 |         .filter(|l| !(l.starts_with('#') || l.is_empty()))
818 |         .map(|l| {
819 |             let caps = re.captures(&l).unwrap();
820 |             let start = u32::from_str_radix(&caps["start"], 16).unwrap();
821 |             let end = caps
822 |                 .name("end")
823 |                 .map_or(start, |m| u32::from_str_radix(m.as_str(), 16).unwrap());
824 |             let lb: BreakClass = caps["lb"].parse().unwrap();
825 |             (start..end + 1, lb)
826 |         });
827 |     let trie = {
828 |         // All code points, assigned and unassigned, that are not listed explicitly are given the value "XX"
829 |         let mut builder = CpTrieBuilder::new(XX);
830 |         // The unassigned code points in the following blocks default to "ID"
831 |         builder.set_range(0x3400..0x4DBF + 1, ID);
832 |         builder.set_range(0x4E00..0x9FFF + 1, ID);
833 |         builder.set_range(0xF900..0xFAFF + 1, ID);
834 |         // All undesignated code points in Planes 2 and 3, whether inside or outside of allocated blocks, default to "ID"
835 |         builder.set_range(0x20000..0x2FFFD + 1, ID);
836 |         builder.set_range(0x30000..0x3FFFD + 1, ID);
837 |         // All unassigned code points in the following Plane 1 range, whether inside or outside of allocated blocks, also default to "ID"
838 |         builder.set_range(0x1F000..0x1FAFF + 1, ID);
839 |         builder.set_range(0x1FC00..0x1FFFD + 1, ID);
840 |         // The unassigned code points in the following block default to "PR"
841 |         builder.set_range(0x20A0..0x20CF + 1, PR);
842 | 
843 |         prop_ranges.for_each(|(range, lb)| builder.set_range(range, lb));
844 |         builder.build()
845 |     };
846 | 
847 |     let mut stream = BufWriter::new(File::create("../src/tables.rs")?);
848 |     writeln!(
849 |         stream,
850 |         "const BREAK_PROP_TRIE_HIGH_START: u32 = {};
851 | static BREAK_PROP_TRIE_INDEX: [u16; {}] = {:?};
852 | static BREAK_PROP_TRIE_DATA: [BreakClass; {}] = [",
853 |         trie.high_start,
854 |         trie.index.len(),
855 |         trie.index,
856 |         trie.data.len(),
857 |     )?;
858 |     trie.data
859 |         .into_iter()
860 |         .flat_map(|x| [BREAK_CLASS_TABLE[x as usize], ","])
861 |         .try_for_each(|s| write!(stream, "{}", s))?;
862 |     write!(
863 |         stream,
864 |         "];
865 | 
866 | static PAIR_TABLE: [[u8; {}]; {}] = [",
867 |         NUM_CLASSES_EOT, NUM_STATES
868 |     )?;
869 |     for row in &pair_table {
870 |         write!(stream, "[")?;
871 |         for x in row {
872 |             write!(stream, "{},", x)?;
873 |         }
874 |         write!(stream, "],")?;
875 |     }
876 |     writeln!(
877 |         stream,
878 |         r"];
879 | 
880 |         fn is_safe_pair(a: BreakClass, b: BreakClass) -> bool {{
881 |             !matches!((a, b), {})
882 |         }}",
883 |         unsafe_pairs
884 |             .map(|(i, j)| format!("({}, {})", BREAK_CLASS_TABLE[i], BREAK_CLASS_TABLE[j]))
885 |             .collect::<Vec<_>>()
886 |             .join("|")
887 |     )?;
888 | 
889 |     Ok(())
890 | }
891 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //! Implementation of the Line Breaking Algorithm described in [Unicode Standard Annex #14][UAX14].
  2 | //!
  3 | //! Given an input text, locates "line break opportunities", or positions appropriate for wrapping
  4 | //! lines when displaying text.
  5 | //!
  6 | //! # Example
  7 | //!
  8 | //! ```
  9 | //! use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}};
 10 | //!
 11 | //! let text = "a b \nc";
 12 | //! assert!(linebreaks(text).eq([
 13 | //!     (2, Allowed),   // May break after first space
 14 | //!     (5, Mandatory), // Must break after line feed
 15 | //!     (6, Mandatory)  // Must break at end of text, so that there always is at least one LB
 16 | //! ]));
 17 | //! ```
 18 | //!
 19 | //! [UAX14]: https://www.unicode.org/reports/tr14/
 20 | 
 21 | #![no_std]
 22 | #![deny(missing_docs, missing_debug_implementations)]
 23 | 
 24 | use core::iter::once;
 25 | 
 26 | /// The [Unicode version](https://www.unicode.org/versions/) conformed to.
 27 | pub const UNICODE_VERSION: (u8, u8, u8) = (15, 0, 0);
 28 | 
 29 | include!("shared.rs");
 30 | include!("tables.rs");
 31 | 
 32 | /// Returns the line break property of the specified code point.
 33 | ///
 34 | /// # Examples
 35 | ///
 36 | /// ```
 37 | /// use unicode_linebreak::{BreakClass, break_property};
 38 | /// assert_eq!(break_property(0x2CF3), BreakClass::Alphabetic);
 39 | /// ```
 40 | #[inline(always)]
 41 | pub fn break_property(codepoint: u32) -> BreakClass {
 42 |     const BMP_INDEX_LENGTH: u32 = BMP_LIMIT >> BMP_SHIFT;
 43 |     const OMITTED_BMP_INDEX_1_LENGTH: u32 = BMP_LIMIT >> SHIFT_1;
 44 | 
 45 |     let data_pos = if codepoint < BMP_LIMIT {
 46 |         let i = codepoint >> BMP_SHIFT;
 47 |         BREAK_PROP_TRIE_INDEX[i as usize] + (codepoint & (BMP_DATA_BLOCK_LENGTH - 1)) as u16
 48 |     } else if codepoint < BREAK_PROP_TRIE_HIGH_START {
 49 |         let i1 = codepoint >> SHIFT_1;
 50 |         let i2 = BREAK_PROP_TRIE_INDEX
 51 |             [(i1 + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH) as usize]
 52 |             + ((codepoint >> SHIFT_2) & (INDEX_2_BLOCK_LENGTH - 1)) as u16;
 53 |         let i3_block = BREAK_PROP_TRIE_INDEX[i2 as usize];
 54 |         let i3_pos = ((codepoint >> SHIFT_3) & (INDEX_3_BLOCK_LENGTH - 1)) as u16;
 55 | 
 56 |         debug_assert!(i3_block & 0x8000 == 0, "18-bit indices are unexpected");
 57 |         let data_block = BREAK_PROP_TRIE_INDEX[(i3_block + i3_pos) as usize];
 58 |         data_block + (codepoint & (SMALL_DATA_BLOCK_LENGTH - 1)) as u16
 59 |     } else {
 60 |         return XX;
 61 |     };
 62 |     BREAK_PROP_TRIE_DATA[data_pos as usize]
 63 | }
 64 | 
/// Break opportunity type.
///
/// Yielded by [`linebreaks`] alongside the byte index of each opportunity.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum BreakOpportunity {
    /// A line must break at this spot.
    Mandatory,
    /// A line is allowed to end at this spot.
    Allowed,
}
 73 | 
 74 | /// Iterates over every line break opportunity in `s`.
 75 | ///
 76 | /// Each item pairs the byte index of the character succeeding the break with the kind of
 77 | /// break found there. The end of the text, at index `s.len()`, is examined as well.
 78 | ///
 79 | /// The returned iterator is lazy and borrows `s`.
 80 | ///
 81 | /// The default Line Breaking Algorithm is used, with the tailoring that Complex-Context
 82 | /// Dependent (SA) characters get resolved to Ordinary Alphabetic and Symbol Characters (AL)
 83 | /// regardless of General_Category.
 84 | ///
 85 | /// # Examples
 86 | ///
 87 | /// ```
 88 | /// use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}};
 89 | /// assert!(linebreaks("Hello world!").eq(vec![(6, Allowed), (12, Mandatory)]));
 90 | /// ```
 91 | pub fn linebreaks(s: &str) -> impl Iterator<Item = (usize, BreakOpportunity)> + Clone + '_ {
 92 |     use BreakOpportunity::{Allowed, Mandatory};
 93 | 
 94 |     let classes = s
 95 |         .char_indices()
 96 |         .map(|(pos, ch)| (pos, break_property(ch as u32) as u8));
 97 |     // The end of text goes through the pair table too, as a pseudo-class at `s.len()`
 98 |     let classes = classes.chain(once((s.len(), eot)));
 99 | 
100 |     classes
101 |         .scan((sot, false), |(table_state, after_zwj), (pos, class)| {
102 |             let entry = PAIR_TABLE[*table_state as usize][class as usize];
103 |             let mandatory = entry & MANDATORY_BREAK_BIT != 0;
104 |             // ZWJ is handled outside the table to reduce its size: directly after a ZWJ an
105 |             // allowed break is only reported if it is also mandatory
106 |             let breaks = entry & ALLOWED_BREAK_BIT != 0 && (mandatory || !*after_zwj);
107 | 
108 |             *table_state = entry & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT);
109 |             *after_zwj = class == BreakClass::ZeroWidthJoiner as u8;
110 |             Some((pos, breaks, mandatory))
111 |         })
112 |         .filter(|&(_, breaks, _)| breaks)
113 |         .map(|(pos, _, mandatory)| (pos, if mandatory { Mandatory } else { Allowed }))
114 | }
115 | 
116 | /// Splits `s` at the last index from which line breaking can resume without prior context.
117 | ///
118 | /// The trivial index at `eot` is excluded; without a safe index the first half is empty.
119 | ///
120 | /// Layout code often only needs the nearest line break opportunity before the first character
121 | /// that would cause the line to become overfull. Finding it requires backward traversal, of
122 | /// which there are two approaches:
123 | ///
124 | /// * Cache breaks from forward traversals
125 | /// * Step backward, repeatedly using `split_at_safe` to find a safe pos to search forward from
126 | ///
127 | /// # Examples
128 | ///
129 | /// ```
130 | /// use unicode_linebreak::{linebreaks, split_at_safe};
131 | /// let s = "Not allowed to break within em dashes: — —";
132 | /// let (prev, safe) = split_at_safe(s);
133 | /// let n = prev.len();
134 | /// assert!(linebreaks(safe).eq(linebreaks(s).filter_map(|(i, x)| i.checked_sub(n).map(|i| (i, x)))));
135 | /// ```
136 | pub fn split_at_safe(s: &str) -> (&str, &str) {
137 |     let mut following: Option<BreakClass> = None; // class of the char after the current one
138 |     let mut rev_chars = s.char_indices().rev();
139 |     for (_, ch) in rev_chars.by_ref() {
140 |         let class = break_property(ch as u32);
141 |         if following.replace(class).map_or(false, |next| is_safe_pair(class, next)) {
142 |             break;
143 |         }
144 |     }
145 |     // Back up one more char: `linebreaks` cannot report a break at its very start (after sot)
146 |     s.split_at(rev_chars.next().map(|(idx, _)| idx).unwrap_or(0))
147 | }
148 | 
149 | #[cfg(test)]
150 | mod tests {
151 |     use super::*;
152 |     #[test]
153 |     fn it_works() {
154 |         use BreakClass::{CombiningMark, LineFeed, Surrogate, Unknown};
155 |         let expected = [LineFeed, Surrogate, CombiningMark, Unknown];
156 |         for (&cp, &class) in [0xA, 0xDB80, 0xe01ef, 0x10ffff].iter().zip(&expected) {
157 |             assert_eq!(break_property(cp), class);
158 |         }
159 |     }
160 | }
161 | 


--------------------------------------------------------------------------------
/src/shared.rs:
--------------------------------------------------------------------------------
  1 | /// Unicode line breaking class (each variant's doc starts with the class's short name).
  2 | #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
  3 | #[repr(u8)]
  4 | pub enum BreakClass {
  5 |     // Non-tailorable
  6 |     /// BK: Cause a line break (after)
  7 |     Mandatory,
  8 |     /// CR: Cause a line break (after), except between CR and LF
  9 |     CarriageReturn,
 10 |     /// LF: Cause a line break (after)
 11 |     LineFeed,
 12 |     /// CM: Prohibit a line break between the character and the preceding character
 13 |     CombiningMark,
 14 |     /// NL: Cause a line break (after)
 15 |     NextLine,
 16 |     /// SG: Do not occur in well-formed text
 17 |     Surrogate,
 18 |     /// WJ: Prohibit line breaks before and after
 19 |     WordJoiner,
 20 |     /// ZW: Provide a break opportunity
 21 |     ZeroWidthSpace,
 22 |     /// GL: Prohibit line breaks before and after
 23 |     NonBreakingGlue,
 24 |     /// SP: Enable indirect line breaks
 25 |     Space,
 26 |     /// ZWJ: Prohibit line breaks within joiner sequences
 27 |     ZeroWidthJoiner,
 28 |     // Break opportunities
 29 |     /// B2: Provide a line break opportunity before and after the character
 30 |     BeforeAndAfter,
 31 |     /// BA: Generally provide a line break opportunity after the character
 32 |     After,
 33 |     /// BB: Generally provide a line break opportunity before the character
 34 |     Before,
 35 |     /// HY: Provide a line break opportunity after the character, except in numeric context
 36 |     Hyphen,
 37 |     /// CB: Provide a line break opportunity contingent on additional information
 38 |     Contingent,
 39 |     // Characters prohibiting certain breaks
 40 |     /// CL: Prohibit line breaks before
 41 |     ClosePunctuation,
 42 |     /// CP: Prohibit line breaks before
 43 |     CloseParenthesis,
 44 |     /// EX: Prohibit line breaks before
 45 |     Exclamation,
 46 |     /// IN: Allow only indirect line breaks between pairs
 47 |     Inseparable,
 48 |     /// NS: Allow only indirect line breaks before
 49 |     NonStarter,
 50 |     /// OP: Prohibit line breaks after
 51 |     OpenPunctuation,
 52 |     /// QU: Act like they are both opening and closing
 53 |     Quotation,
 54 |     // Numeric context
 55 |     /// IS: Prevent breaks after any and before numeric
 56 |     InfixSeparator,
 57 |     /// NU: Form numeric expressions for line breaking purposes
 58 |     Numeric,
 59 |     /// PO: Do not break following a numeric expression
 60 |     Postfix,
 61 |     /// PR: Do not break in front of a numeric expression
 62 |     Prefix,
 63 |     /// SY: Prevent a break before, and allow a break after
 64 |     Symbol,
 65 |     // Other characters
 66 |     /// AI: Act like AL when the resolved EAW is N; otherwise, act as ID
 67 |     Ambiguous,
 68 |     /// AL: Are alphabetic characters or symbols that are used with alphabetic characters
 69 |     Alphabetic,
 70 |     /// CJ: Treat as NS or ID for strict or normal breaking.
 71 |     ConditionalJapaneseStarter,
 72 |     /// EB: Do not break from following Emoji Modifier
 73 |     EmojiBase,
 74 |     /// EM: Do not break from preceding Emoji Base
 75 |     EmojiModifier,
 76 |     /// H2: Form Korean syllable blocks
 77 |     HangulLvSyllable,
 78 |     /// H3: Form Korean syllable blocks
 79 |     HangulLvtSyllable,
 80 |     /// HL: Do not break around a following hyphen; otherwise act as Alphabetic
 81 |     HebrewLetter,
 82 |     /// ID: Break before or after, except in some numeric context
 83 |     Ideographic,
 84 |     /// JL: Form Korean syllable blocks
 85 |     HangulLJamo,
 86 |     /// JV: Form Korean syllable blocks
 87 |     HangulVJamo,
 88 |     /// JT: Form Korean syllable blocks
 89 |     HangulTJamo,
 90 |     /// RI: Keep pairs together. For pairs, break before and after other classes
 91 |     RegionalIndicator,
 92 |     /// SA: Provide a line break opportunity contingent on additional, language-specific context analysis
 93 |     ComplexContext,
 94 |     /// XX: Have as yet unknown line breaking behavior or unassigned code positions
 95 |     Unknown,
 96 | }
 97 | 
 98 | use BreakClass::{
 99 |     After as BA, Alphabetic as AL, Ambiguous as AI, Before as BB, BeforeAndAfter as B2,
100 |     CarriageReturn as CR, CloseParenthesis as CP, ClosePunctuation as CL, CombiningMark as CM,
101 |     ComplexContext as SA, ConditionalJapaneseStarter as CJ, Contingent as CB, EmojiBase as EB,
102 |     EmojiModifier as EM, Exclamation as EX, HangulLJamo as JL, HangulLvSyllable as H2,
103 |     HangulLvtSyllable as H3, HangulTJamo as JT, HangulVJamo as JV, HebrewLetter as HL,
104 |     Hyphen as HY, Ideographic as ID, InfixSeparator as IS, Inseparable as IN, LineFeed as LF,
105 |     Mandatory as BK, NextLine as NL, NonBreakingGlue as GL, NonStarter as NS, Numeric as NU,
106 |     OpenPunctuation as OP, Postfix as PO, Prefix as PR, Quotation as QU, RegionalIndicator as RI,
107 |     Space as SP, Surrogate as SG, Symbol as SY, Unknown as XX, WordJoiner as WJ,
108 |     ZeroWidthJoiner as ZWJ, ZeroWidthSpace as ZW,
109 | };
110 | 
111 | /// Ceiling for code points in the Basic Multilingual Plane (BMP).
112 | const BMP_LIMIT: u32 = 0x10000;
113 | 
114 | /// Shift size for getting index-3 table offset.
115 | const SHIFT_3: u32 = 4;
116 | /// Shift size for getting index-2 table offset.
117 | const SHIFT_2: u32 = 5 + SHIFT_3;
118 | /// Shift size for getting index-1 table offset.
119 | const SHIFT_1: u32 = 5 + SHIFT_2;
120 | /// Shift size for getting BMP block start.
121 | const BMP_SHIFT: u32 = 6;
122 | 
123 | const INDEX_2_BLOCK_LENGTH: u32 = 1 << (SHIFT_1 - SHIFT_2); // = 32
124 | const INDEX_3_BLOCK_LENGTH: u32 = 1 << (SHIFT_2 - SHIFT_3); // = 32
125 | const SMALL_DATA_BLOCK_LENGTH: u32 = 1 << SHIFT_3; // = 16
126 | const BMP_DATA_BLOCK_LENGTH: u32 = 1 << BMP_SHIFT; // = 64
127 | 
128 | const ALLOWED_BREAK_BIT: u8 = 0x80; // pair table entry flag: a break is allowed
129 | const MANDATORY_BREAK_BIT: u8 = 0x40; // pair table entry flag: the break is mandatory
130 | 
131 | #[allow(non_upper_case_globals)]
132 | const eot: u8 = 43; // end of text: first value past the `BreakClass` discriminants (0..=42)
133 | #[allow(non_upper_case_globals)]
134 | const sot: u8 = 44; // start of text: initial pair table state used by `linebreaks`
135 | 


--------------------------------------------------------------------------------
/tests/test_default.rs:
--------------------------------------------------------------------------------
 1 | //! Default Line_Break test.
 2 | 
 3 | use std::char;
 4 | use std::fs::File;
 5 | use std::io::{self, prelude::*, BufReader};
 6 | use std::iter::from_fn;
 7 | use std::u32;
 8 | use unicode_linebreak::*;
 9 | 
10 | const TEST_FILE: &str = "tests/LineBreakTest.txt";
11 | 
12 | /// Checks `linebreaks` against every applicable line of the line break test file.
13 | #[test]
14 | fn test_lb_default() -> io::Result<()> {
15 |     let reader = BufReader::new(File::open(TEST_FILE)?);
16 |     for row in reader.lines() {
17 |         let row = row.unwrap();
18 |         if row.starts_with('#') {
19 |             continue;
20 |         }
21 |         let (data, comment) = row.split_once("# ").expect("Missing comment");
22 | 
23 |         // Skip tests relying on some tailorable rules
24 |         if comment.contains("[30.22]") || comment.contains("[999.0]") {
25 |             continue;
26 |         }
27 | 
28 |         // After the leading marker, the data alternates hex code points and break markers
29 |         let mut tokens = data.split_whitespace();
30 |         tokens.next().unwrap(); // Skip first '×'
31 |         let mut offset = 0;
32 |         let pairs = from_fn(|| {
33 |             let hex = tokens.next()?;
34 |             let ch = u32::from_str_radix(hex, 16)
35 |                 .ok()
36 |                 .and_then(char::from_u32)
37 |                 .expect("Invalid codepoint");
38 |             offset += ch.len_utf8();
39 | 
40 |             // The marker after a code point tells whether a break follows it
41 |             let breaks_after = match tokens.next() {
42 |                 Some("÷") => true,
43 |                 Some("×") => false,
44 |                 _ => unreachable!(),
45 |             };
46 |             Some(((offset, breaks_after), ch))
47 |         });
48 |         let (spots, text): (Vec<_>, String) = pairs.unzip();
49 | 
50 |         let expected: Vec<_> = spots
51 |             .into_iter()
52 |             .filter(|&(_, breaks_after)| breaks_after)
53 |             .map(|(end, _)| end)
54 |             .collect();
55 |         let actual: Vec<_> = linebreaks(&text).map(|(at, _)| at).collect();
56 | 
57 |         assert_eq!(
58 |             actual, expected,
59 |             "String: ‘{}’, comment: {}",
60 |             text, comment
61 |         );
62 |     }
63 | 
64 |     Ok(())
65 | }
66 | 


--------------------------------------------------------------------------------