├── .github └── workflows │ └── test.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── LineBreak.txt ├── README.md ├── gen-tables ├── .gitignore ├── Cargo.lock ├── Cargo.toml └── src │ └── main.rs ├── src ├── lib.rs └── shared.rs └── tests ├── LineBreakTest.txt └── test_default.rs /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - name: Install Rust 11 | uses: actions-rs/toolchain@v1 12 | with: 13 | toolchain: stable 14 | profile: minimal 15 | override: true 16 | 17 | - name: Configure src/tables.rs cache 18 | id: cache-tables 19 | uses: actions/cache@v3 20 | with: 21 | path: src/tables.rs 22 | key: ${{ hashFiles('LineBreak.txt', 'gen-tables/**') }} 23 | 24 | - name: Generates src/tables.rs 25 | run: cargo run 26 | working-directory: ./gen-tables 27 | if: steps.cache-tables.outputs.cache-hit != 'true' 28 | 29 | - name: Run tests 30 | run: cargo test 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | /src/tables.rs 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "unicode-linebreak" 3 | version = "0.1.5" 4 | authors = ["Axel Forsman "] 5 | description = "Implementation of the Unicode Line Breaking Algorithm" 6 | homepage = "https://github.com/axelf4/unicode-linebreak" 7 | repository = "https://github.com/axelf4/unicode-linebreak" 8 | readme = "README.md" 9 | keywords = ["unicode", "text", "layout"] 10 | categories = ["internationalization"] 11 | license = "Apache-2.0" 12 | include = ["src/**/*", "LICENSE"] 13 | edition = "2021" 14 | rust-version = 
"1.56" 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # unicode-linebreak 2 | 3 | Implementation of the Line Breaking Algorithm described in [Unicode Standard Annex #14][UAX14]. 4 | 5 | ![test](https://github.com/axelf4/unicode-linebreak/workflows/test/badge.svg) 6 | [![Documentation](https://docs.rs/unicode-linebreak/badge.svg)](https://docs.rs/unicode-linebreak) 7 | 8 | Given an input text, locates "line break opportunities", or positions appropriate for wrapping 9 | lines when displaying text. 
10 | 11 | ## Example 12 | 13 | ```rust 14 | use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}}; 15 | 16 | let text = "a b \nc"; 17 | assert!(linebreaks(text).eq([ 18 | (2, Allowed), // May break after first space 19 | (5, Mandatory), // Must break after line feed 20 | (6, Mandatory) // Must break at end of text, so that there always is at least one LB 21 | ])); 22 | ``` 23 | 24 | ## Development 25 | 26 | After cloning the repository or modifying `LineBreak.txt` the tables 27 | have to be (re-)generated: 28 | 29 | ```sh 30 | # Generate src/tables.rs 31 | (cd gen-tables && cargo run) 32 | # Run tests to make sure it was successful 33 | cargo test 34 | ``` 35 | 36 | [UAX14]: https://www.unicode.org/reports/tr14/ 37 | -------------------------------------------------------------------------------- /gen-tables/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | -------------------------------------------------------------------------------- /gen-tables/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "ahash" 7 | version = "0.8.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" 10 | dependencies = [ 11 | "cfg-if", 12 | "once_cell", 13 | "version_check", 14 | ] 15 | 16 | [[package]] 17 | name = "aho-corasick" 18 | version = "1.0.2" 19 | source = "registry+https://github.com/rust-lang/crates.io-index" 20 | checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" 21 | dependencies = [ 22 | "memchr", 23 | ] 24 | 25 | [[package]] 26 | name = "allocator-api2" 27 | version = "0.2.15" 28 | source = "registry+https://github.com/rust-lang/crates.io-index" 29 | checksum = "56fc6cf8dc8c4158eed8649f9b8b0ea1518eb62b544fe9490d66fa0b349eafe9" 30 | 31 | [[package]] 32 | name = "cfg-if" 33 | version = "1.0.0" 34 | source = "registry+https://github.com/rust-lang/crates.io-index" 35 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 36 | 37 | [[package]] 38 | name = "gen-tables" 39 | version = "0.0.0" 40 | dependencies = [ 41 | "hashbrown", 42 | "regex", 43 | ] 44 | 45 | [[package]] 46 | name = "hashbrown" 47 | version = "0.14.0" 48 | source = "registry+https://github.com/rust-lang/crates.io-index" 49 | checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" 50 | dependencies = [ 51 | "ahash", 52 | "allocator-api2", 53 | ] 54 | 55 | [[package]] 56 | name = "memchr" 57 | version = "2.5.0" 58 | source = "registry+https://github.com/rust-lang/crates.io-index" 59 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 60 | 61 | [[package]] 62 | name = "once_cell" 63 | version = "1.18.0" 64 | source = "registry+https://github.com/rust-lang/crates.io-index" 65 | checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" 66 | 67 | [[package]] 68 | name = "regex" 69 | version = "1.9.1" 70 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 71 | checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575" 72 | dependencies = [ 73 | "aho-corasick", 74 | "memchr", 75 | "regex-automata", 76 | "regex-syntax", 77 | ] 78 | 79 | [[package]] 80 | name = "regex-automata" 81 | version = "0.3.2" 82 | source = "registry+https://github.com/rust-lang/crates.io-index" 83 | checksum = "83d3daa6976cffb758ec878f108ba0e062a45b2d6ca3a2cca965338855476caf" 84 | dependencies = [ 85 | "aho-corasick", 86 | "memchr", 87 | "regex-syntax", 88 | ] 89 | 90 | [[package]] 91 | name = "regex-syntax" 92 | version = "0.7.3" 93 | source = "registry+https://github.com/rust-lang/crates.io-index" 94 | checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846" 95 | 96 | [[package]] 97 | name = "version_check" 98 | version = "0.9.4" 99 | source = "registry+https://github.com/rust-lang/crates.io-index" 100 | checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" 101 | -------------------------------------------------------------------------------- /gen-tables/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "gen-tables" 3 | version = "0.0.0" 4 | edition = "2021" 5 | publish = false 6 | 7 | [dependencies] 8 | regex = "1" 9 | hashbrown = "0.14" 10 | 11 | # Prevent this from interfering with workspaces 12 | [workspace] 13 | members = ["."] 14 | -------------------------------------------------------------------------------- /gen-tables/src/main.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Parses the rules into a state machine using a pair table. Each value in the table specifies the 3 | next state and whether it's an forced/allowed break. 
To handles rules such as 4 | 5 | B SP* ÷ A 6 | 7 | the extra state BSP is employed in the pair table friendly equivalent rules 8 | 9 | (B | BSP) ÷ A, Treat (B | BSP) SP as if it were BSP, Treat BSP as if it were SP 10 | */ 11 | #![recursion_limit = "512"] 12 | 13 | use hashbrown::{hash_map::Entry, HashMap}; 14 | use regex::Regex; 15 | use std::borrow::Borrow; 16 | use std::cmp::{max, min}; 17 | use std::fs::File; 18 | use std::hash::{BuildHasher, Hash, Hasher}; 19 | use std::io::{BufRead, BufReader, BufWriter, Write}; 20 | use std::marker::PhantomData; 21 | use std::ops::Range; 22 | use std::str::FromStr; 23 | use std::{error, iter}; 24 | 25 | include!("../../src/shared.rs"); 26 | 27 | impl FromStr for BreakClass { 28 | type Err = &'static str; 29 | 30 | fn from_str(s: &str) -> Result { 31 | Ok(match s { 32 | "BK" => BK, 33 | "CR" => CR, 34 | "LF" => LF, 35 | "CM" => CM, 36 | "NL" => NL, 37 | "SG" => SG, 38 | "WJ" => WJ, 39 | "ZW" => ZW, 40 | "GL" => GL, 41 | "SP" => SP, 42 | "ZWJ" => ZWJ, 43 | "B2" => B2, 44 | "BA" => BA, 45 | "BB" => BB, 46 | "HY" => HY, 47 | "CB" => CB, 48 | "CL" => CL, 49 | "CP" => CP, 50 | "EX" => EX, 51 | "IN" => IN, 52 | "NS" => NS, 53 | "OP" => OP, 54 | "QU" => QU, 55 | "IS" => IS, 56 | "NU" => NU, 57 | "PO" => PO, 58 | "PR" => PR, 59 | "SY" => SY, 60 | "AI" => AI, 61 | "AL" => AL, 62 | "CJ" => CJ, 63 | "EB" => EB, 64 | "EM" => EM, 65 | "H2" => H2, 66 | "H3" => H3, 67 | "HL" => HL, 68 | "ID" => ID, 69 | "JL" => JL, 70 | "JV" => JV, 71 | "JT" => JT, 72 | "RI" => RI, 73 | "SA" => SA, 74 | "XX" => XX, 75 | _ => return Err("Invalid break class"), 76 | }) 77 | } 78 | } 79 | 80 | const NUM_CLASSES: usize = 43; 81 | static BREAK_CLASS_TABLE: [&str; NUM_CLASSES] = [ 82 | "BK", "CR", "LF", "CM", "NL", "SG", "WJ", "ZW", "GL", "SP", "ZWJ", "B2", "BA", "BB", "HY", 83 | "CB", "CL", "CP", "EX", "IN", "NS", "OP", "QU", "IS", "NU", "PO", "PR", "SY", "AI", "AL", "CJ", 84 | "EB", "EM", "H2", "H3", "HL", "ID", "JL", "JV", "JT", "RI", "SA", "XX", 85 | ]; 86 | 
/// Additional states of the pair-table state machine that are not break classes themselves.
///
/// The enum is `repr(u8)` and its discriminants continue from `sot + 1`, so the variant
/// order below fixes the numeric value of every state. `sot` is not defined in this part of
/// the file (presumably it comes from the included `shared.rs` — confirm there).
///
/// NOTE(review): the `*SP` names look like the "class followed by spaces" helper states
/// described in the module docs (`B SP* ÷ A` handled through an extra `BSP` state);
/// `HLHYBA` and `RIRI` presumably track multi-class contexts in the same way — verify
/// against the rule list before relying on this.
#[derive(Copy, Clone)]
#[repr(u8)]
#[allow(clippy::upper_case_acronyms)]
enum ExtraState {
    // First extra state; everything after it is numbered implicitly in declaration order.
    ZWSP = sot + 1,
    OPSP,
    QUSP,
    CLSP,
    CPSP,
    B2SP,
    HLHYBA,
    RIRI,
}
{(NUM_STATES $($args)* treat) $($tt)*} 139 | }; 140 | (($len:ident $($args:tt)*) * as if it were X where X = $($tt:tt)*) => { 141 | rules2table_impl! {(NUM_STATES $($args)* as_if_it_were_x_where_x_is) $($tt)*} 142 | }; 143 | 144 | (($len:ident $pair_table:ident treat_x $second:ident as_if_it_were_x_where_x_is $X:ident) $(, $($tt:tt)*)?) => { 145 | $(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})? 146 | for i in $X { 147 | for j in $second.clone() { 148 | $pair_table[i][j] = i as u8; 149 | } 150 | } 151 | }; 152 | (($len:ident $pair_table:ident treat $first:ident $second:ident) as if it were $cls:ident $(, $($tt:tt)*)?) => { 153 | $(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})? 154 | let cls = $cls as u8; 155 | for i in $first { 156 | for j in $second.clone() { 157 | $pair_table[i][j] = cls; 158 | } 159 | } 160 | }; 161 | (($len:ident $pair_table:ident treat $first:ident) as if it were $cls:ident $(, $($tt:tt)*)?) => { 162 | $(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})? 163 | for j in $first.clone().filter(|&j| j < NUM_CLASSES_EOT) { 164 | for row in $pair_table.iter_mut() { 165 | row[j] = row[$cls as usize]; 166 | } 167 | } 168 | for i in $first { 169 | $pair_table.copy_within($cls as usize..$cls as usize + 1, i); 170 | } 171 | }; 172 | 173 | // All classes pattern 174 | (($len:ident $($args:tt)*) ALL $($tt:tt)*) => { 175 | let indices = 0..$len; 176 | rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*} 177 | }; 178 | // Single class pattern 179 | (($len:ident $($args:tt)*) $cls:ident $($tt:tt)*) => { 180 | let indices = iter::once($cls as usize); 181 | rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*} 182 | }; 183 | // Parse (X | ...) patterns 184 | (($len:ident $($args:tt)*) ($($cls:ident)|+) $($tt:tt)*) => { 185 | let indices = [$($cls as usize),+].into_iter(); 186 | rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*} 187 | }; 188 | // Parse [^ ...] 
/// Extension methods available on every iterator.
trait IteratorExt: Iterator {
    /// Tests if all elements of the iterator are equal.
    ///
    /// An empty iterator counts as all-equal. The iterator is advanced: elements are
    /// consumed up to and including the first one that differs from the first element.
    fn all_equal(&mut self) -> bool
    where
        Self::Item: PartialEq,
        Self: Sized,
    {
        self.next().map_or(true, |first| self.all(|x| x == first))
    }
}

// Blanket impl so that `all_equal` can be called on any iterator.
impl<I: Iterator> IteratorExt for I {}

/// Returns the length of the longest suffix of `a` that equals a prefix of `b`.
///
/// Only lengths strictly smaller than both `a.len()` and `b.len()` are considered, so a
/// complete match of the shorter input is never reported. Returns `0` when no such
/// non-empty overlap exists.
fn overlap<T: PartialEq, I: IntoIterator>(a: &[T], b: I) -> usize
where
    I::Item: Borrow<T>,
    I::IntoIter: ExactSizeIterator + Clone,
{
    let b = b.into_iter();
    // Try the longest candidate first so the first hit is the maximal overlap.
    (1..min(a.len(), b.len()))
        .rev()
        .find(|&n| {
            a[a.len() - n..]
                .iter()
                .zip(b.clone())
                .all(|(x, y)| x == y.borrow())
        })
        .unwrap_or(0)
}
258 | /// 259 | /// See: [ICU Code Point Tries] 260 | /// 261 | /// [ICU Code Point Tries]: https://icu.unicode.org/design/struct/utrie 262 | #[derive(Default)] 263 | struct CpTrieBuilder { 264 | /// Index-3 table. 265 | index: Vec>, 266 | data: Vec, 267 | initial_value: T, 268 | } 269 | 270 | impl CpTrieBuilder { 271 | fn new(initial_value: T) -> Self { 272 | Self { 273 | index: Vec::with_capacity(UNICODE_LIMIT as usize >> SHIFT_3), 274 | data: Vec::new(), 275 | initial_value, 276 | } 277 | } 278 | 279 | fn set_range(&mut self, Range { mut start, end }: Range, value: T) { 280 | if start >= end { 281 | return; // Empty range 282 | } 283 | if end as usize > self.index.len() { 284 | // Round up to CP_PER_INDEX_2_ENTRY boundary to simplify compaction 285 | let c = (end + CP_PER_INDEX_2_ENTRY - 1) & !(CP_PER_INDEX_2_ENTRY - 1); 286 | self.index.resize( 287 | c as usize >> SHIFT_3, 288 | Index::AllSame { 289 | value: self.initial_value, 290 | }, 291 | ); 292 | } 293 | 294 | // Set partial block at [start, next block boundary) 295 | let block_start = start & !(SMALL_DATA_BLOCK_LENGTH - 1); 296 | if start > block_start { 297 | let block = self.data_block(start); 298 | let block = &mut self.data[block as usize..][..SMALL_DATA_BLOCK_LENGTH as usize] 299 | [(start & (SMALL_DATA_BLOCK_LENGTH - 1)) as usize..]; 300 | if end < block_start + SMALL_DATA_BLOCK_LENGTH { 301 | block[..((end - start) & (SMALL_DATA_BLOCK_LENGTH - 1)) as usize].fill(value); 302 | return; 303 | } 304 | block.fill(value); 305 | start = block_start + SMALL_DATA_BLOCK_LENGTH; 306 | } 307 | 308 | // Fill all full blocks 309 | while start < end & !(SMALL_DATA_BLOCK_LENGTH - 1) { 310 | match &mut self.index[start as usize >> SHIFT_3] { 311 | Index::AllSame { value: prev_value } => *prev_value = value, 312 | Index::Mixed { data_block } => { 313 | self.data[*data_block as usize..][..SMALL_DATA_BLOCK_LENGTH as usize] 314 | .fill(value); 315 | } 316 | } 317 | start += SMALL_DATA_BLOCK_LENGTH; 318 | } 319 | 320 | 
// Set partial block at [last block boundary..end) 321 | let rest = end & (SMALL_DATA_BLOCK_LENGTH - 1); 322 | if rest > 0 { 323 | let block = self.data_block(start); 324 | self.data[block as usize..][..rest as usize].fill(value); 325 | } 326 | } 327 | /// Returns the offset in `data` of the small data block holding `c`, allocating storage first if its index entry is still `AllSame`. 328 | fn data_block(&mut self, c: u32) -> u32 { 329 | let i = c as usize >> SHIFT_3; 330 | if let Index::Mixed { data_block } = self.index[i] { 331 | return data_block; // Already allocated 332 | } 333 | 334 | let (block_len, small_blocks) = if i < BMP_LIMIT as usize >> SHIFT_3 { // `i` is an index-3 position, so the BMP boundary is `BMP_LIMIT >> SHIFT_3`; BMP entries are allocated as one whole BMP data block 335 | let i_start = i & !(SMALL_DATA_BLOCKS_PER_BMP_BLOCK as usize - 1); 336 | ( 337 | BMP_DATA_BLOCK_LENGTH, 338 | i_start..i_start + SMALL_DATA_BLOCKS_PER_BMP_BLOCK as usize, 339 | ) 340 | } else { 341 | (SMALL_DATA_BLOCK_LENGTH, i..i + 1) // Above the BMP a single small block is enough 342 | }; 343 | // Allocate a new data block 344 | let new_block = self.data.len() as u32; 345 | self.data 346 | .extend(iter::repeat(self.initial_value).take(block_len as usize)); 347 | 348 | for (k, i) in small_blocks.clone().enumerate() { 349 | let prev_value = if let Index::AllSame { value } = self.index[i] { 350 | value 351 | } else { 352 | unreachable!() 353 | }; 354 | let block = new_block + k as u32 * SMALL_DATA_BLOCK_LENGTH; 355 | self.data[block as usize..][..SMALL_DATA_BLOCK_LENGTH as usize].fill(prev_value); 356 | self.index[i] = Index::Mixed { data_block: block }; 357 | } 358 | new_block + SMALL_DATA_BLOCK_LENGTH * (i - small_blocks.start) as u32 359 | } 360 | /// Returns the value currently stored for code point `c`. 361 | fn get(&self, c: u32) -> T { 362 | match self.index[c as usize >> SHIFT_3] { 363 | Index::AllSame { value } => value, 364 | Index::Mixed { data_block } => { 365 | self.data[(data_block + (c & (SMALL_DATA_BLOCK_LENGTH - 1))) as usize] 366 | } 367 | } 368 | } 369 | 370 | // Compact arrays by 371 | // 372 | // * removing blocks identical to earlier ones 373 | // * overlapping each block as much as possible with the previously written one 374 | 375 | fn compact_data(&mut self) { 376 | let mut new_data =
Vec::with_capacity(self.data.len()); 377 | // Always store ASCII data linearly at start 378 | new_data.extend((0..ASCII_LIMIT).map(|i| self.get(i))); 379 | self.index 380 | .iter_mut() 381 | .take(ASCII_LIMIT as usize >> SHIFT_3) 382 | .step_by(SMALL_DATA_BLOCKS_PER_BMP_BLOCK as usize) 383 | .enumerate() 384 | .for_each(|(i, x)| { 385 | *x = Index::Mixed { 386 | data_block: BMP_DATA_BLOCK_LENGTH * i as u32, 387 | } 388 | }); 389 | 390 | let mut block_len = BMP_DATA_BLOCK_LENGTH; 391 | let mut uniform_blocks = HashMap::new(); 392 | let mut block_index = BlockIndex::new(self.data.len(), block_len as usize); 393 | let mut inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK as usize; 394 | let mut i = ASCII_LIMIT as usize >> SHIFT_3; 395 | while i < self.index.len() { 396 | if i == BMP_LIMIT as usize >> SHIFT_3 { 397 | block_len = SMALL_DATA_BLOCK_LENGTH; 398 | inc = 1; 399 | block_index.clear(block_len as usize); 400 | block_index.extend(&new_data); 401 | } 402 | 403 | let old_index = match self.index[i] { 404 | // Check if all of fast-range data block's blocks have all same or turn into mixed 405 | Index::AllSame { value } 406 | if !self.index[i..][1..inc] 407 | .iter() 408 | .all(|x| matches!(x, Index::AllSame { value: v } if *v == value)) => 409 | { 410 | Index::Mixed { 411 | data_block: self.data_block((i as u32) << SHIFT_3), // Turn into mixed block 412 | } 413 | } 414 | // Check if really mixed 415 | x @ Index::Mixed { data_block } => { 416 | let block = &self.data[data_block as usize..][..block_len as usize]; 417 | let all_same = block.iter().skip(1).all(|&x| x == block[0]); 418 | if all_same { 419 | Index::AllSame { value: block[0] } 420 | } else { 421 | x 422 | } 423 | } 424 | x => x, 425 | }; 426 | let new_index = match old_index { 427 | Index::AllSame { value } => { 428 | // Is there another uniform block with the same value? 
429 | if let Some(j) = match uniform_blocks.entry(value) { 430 | Entry::Occupied(entry) => Some(*entry.get()), 431 | Entry::Vacant(entry) => { 432 | entry.insert(i as u32); 433 | None 434 | } 435 | } { 436 | if let Index::Mixed { data_block } = self.index[j as usize] { 437 | data_block 438 | } else { 439 | unreachable!() 440 | } 441 | } else if let Some(n) = block_index 442 | .find_block(&new_data, iter::repeat(value).take(block_len as usize)) 443 | { 444 | n 445 | } else { 446 | let overlap = new_data 447 | .iter() 448 | .rev() 449 | .take(block_len as usize - 1) 450 | .take_while(|&&x| x == value) 451 | .count(); 452 | let new_index = (new_data.len() - overlap) as u32; 453 | new_data.extend(iter::repeat(value).take(block_len as usize - overlap)); 454 | block_index.extend(&new_data); 455 | new_index 456 | } 457 | } 458 | Index::Mixed { data_block } => { 459 | let block = &self.data[data_block as usize..][..block_len as usize]; 460 | if let Some(n) = block_index.find_block(&new_data, block) { 461 | n 462 | } else { 463 | let overlap = overlap(&new_data, block); 464 | let new_index = (new_data.len() - overlap) as u32; 465 | new_data.extend_from_slice(&block[overlap..]); 466 | block_index.extend(&new_data); 467 | new_index 468 | } 469 | } 470 | }; 471 | self.index[i] = Index::Mixed { 472 | data_block: new_index, 473 | }; 474 | i += inc; 475 | } 476 | 477 | self.data = new_data; 478 | } 479 | 480 | fn compact_index(&mut self) -> Vec { 481 | let fast_index_len = BMP_LIMIT as usize >> BMP_SHIFT; 482 | let index2_capacity = 483 | (self.index.len() - (BMP_LIMIT as usize >> SHIFT_3)) >> (SHIFT_2 - SHIFT_3); 484 | let index1_len = 485 | (index2_capacity + INDEX_2_BLOCK_LENGTH as usize - 1) >> (SHIFT_1 - SHIFT_2); 486 | let index1_end = fast_index_len + index1_len; 487 | let mut index16 = Vec::with_capacity(index1_end + index2_capacity); 488 | let mut block_index = BlockIndex::new(index16.capacity(), INDEX_3_BLOCK_LENGTH as usize); 489 | 490 | let (fast_index, small_index) = 
self.index.split_at(BMP_LIMIT as usize >> SHIFT_3); 491 | // Condense fast index table 492 | index16.extend( 493 | fast_index 494 | .iter() 495 | .step_by(SMALL_DATA_BLOCKS_PER_BMP_BLOCK as usize) 496 | .map(|x| { 497 | if let Index::Mixed { data_block: i3 } = x { 498 | *i3 as u16 499 | } else { 500 | unreachable!() 501 | } 502 | }), 503 | ); 504 | debug_assert_eq!(index16.len(), fast_index_len); 505 | block_index.extend(&index16); 506 | 507 | index16.extend(iter::repeat(0).take(index1_len)); // Reserve space for index-1 table 508 | block_index.skip(index1_len); 509 | 510 | // Compact the index-3 table and write uncompacted index-2 table 511 | let index2: Vec<_> = small_index 512 | .chunks_exact(INDEX_3_BLOCK_LENGTH as usize) 513 | .map(|block| { 514 | let block = block.iter().map(|x| { 515 | if let Index::Mixed { data_block } = x { 516 | *data_block 517 | } else { 518 | unreachable!() 519 | } 520 | }); 521 | let ored = block.clone().fold(0, |acc, i3| acc | i3); 522 | 523 | if ored <= 0xffff { 524 | let block = block.map(|x| x as u16); 525 | if let Some(n) = block_index.find_block(&index16, block.clone()) { 526 | n as u16 527 | } else { 528 | let overlap = overlap(&index16[index1_end..], block.clone()); 529 | let i3 = (index16.len() - overlap) as u16; 530 | index16.extend(block.skip(overlap)); 531 | block_index.extend(&index16); 532 | i3 533 | } 534 | } else { 535 | todo!() // Encode index-3 block with one or more data indices exceeding 16 bits 536 | } 537 | }) 538 | .collect(); 539 | 540 | // Compact the index-2 table and write the index-1 table 541 | debug_assert_eq!( 542 | INDEX_2_BLOCK_LENGTH, INDEX_3_BLOCK_LENGTH, 543 | "cannot reuse block index" 544 | ); 545 | for (i, block) in index2.chunks(INDEX_2_BLOCK_LENGTH as usize).enumerate() { 546 | let i2 = if let Some(n) = block_index.find_block(&index16, block) { 547 | n as u16 548 | } else { 549 | let overlap = overlap(&index16[index1_end..], block); 550 | let i2 = (index16.len() - overlap) as u16; 551 | 
index16.extend(&block[overlap..]); 552 | block_index.extend(&index16); 553 | i2 554 | }; 555 | 556 | let i1 = fast_index_len + i; 557 | index16[i1] = i2; 558 | } 559 | 560 | index16 561 | } 562 | 563 | fn build(mut self) -> CpTrie { 564 | if self.index.len() < BMP_LIMIT as usize >> SHIFT_3 { 565 | self.index.resize( 566 | BMP_LIMIT as usize >> SHIFT_3, 567 | Index::AllSame { 568 | value: self.initial_value, 569 | }, 570 | ); 571 | } 572 | self.compact_data(); 573 | let high_start = { 574 | let i = self 575 | .index 576 | .last() 577 | .filter(|&x| { 578 | if let Index::Mixed { data_block } = x { 579 | self.data[*data_block as usize..][..SMALL_DATA_BLOCK_LENGTH as usize] 580 | .iter() 581 | .all(|&x| x == self.initial_value) 582 | } else { 583 | false 584 | } 585 | }) 586 | .map(|i| self.index.iter().rposition(|x| x != i).unwrap()) 587 | .map_or(self.index.len(), |i| i + 1) as u32; 588 | let c = ((i << SHIFT_3) + CP_PER_INDEX_2_ENTRY - 1) & !(CP_PER_INDEX_2_ENTRY - 1); 589 | max(c, BMP_LIMIT) 590 | }; 591 | self.index.truncate(high_start as usize >> SHIFT_3); 592 | let index = self.compact_index(); 593 | 594 | CpTrie { 595 | high_start, 596 | index, 597 | data: self.data, 598 | } 599 | } 600 | } 601 | 602 | struct FixedHash(u64, T); 603 | 604 | impl Hash for FixedHash { 605 | fn hash(&self, state: &mut H) { 606 | state.write_u64(self.0); 607 | } 608 | } 609 | 610 | struct BlockIndex { 611 | set: hashbrown::HashMap, ()>, 612 | block_len: usize, 613 | prev_end: usize, 614 | phantom: PhantomData, 615 | } 616 | 617 | impl BlockIndex { 618 | fn new(capacity: usize, block_len: usize) -> Self { 619 | Self { 620 | set: hashbrown::HashMap::with_capacity(capacity), 621 | block_len, 622 | prev_end: 0, 623 | phantom: PhantomData, 624 | } 625 | } 626 | 627 | fn clear(&mut self, new_block_len: usize) { 628 | self.set.clear(); 629 | self.block_len = new_block_len; 630 | self.prev_end = 0; 631 | } 632 | 633 | fn skip(&mut self, n: usize) { 634 | self.prev_end += n + self.block_len 
- 1; 635 | } 636 | 637 | fn extend(&mut self, data: &[T]) { 638 | let start = (self.prev_end + 1).saturating_sub(self.block_len); 639 | if data.len() <= start { 640 | return; 641 | } 642 | for (i, block) in data[start..].windows(self.block_len).enumerate() { 643 | let i = (start + i) as u32; 644 | 645 | let hash = { 646 | let mut s = self.set.hasher().build_hasher(); 647 | block.iter().for_each(|x| x.hash(&mut s)); 648 | s.finish() 649 | }; 650 | let hash2 = { 651 | let mut s = self.set.hasher().build_hasher(); 652 | s.write_u64(hash); 653 | s.finish() 654 | }; 655 | let is_match = |&FixedHash(_, j): &FixedHash| { 656 | data[j as usize..][..self.block_len].iter().eq(block) 657 | }; 658 | self.set 659 | .raw_entry_mut() 660 | .from_hash(hash2, is_match) 661 | .or_insert(FixedHash(hash, i), ()); 662 | } 663 | self.prev_end = data.len(); 664 | } 665 | 666 | fn find_block(&mut self, data: &[T], block: I) -> Option 667 | where 668 | I::Item: Borrow, 669 | I::IntoIter: Clone, 670 | { 671 | let block = block.into_iter(); 672 | let hash = { 673 | let mut s = self.set.hasher().build_hasher(); 674 | block.clone().for_each(|x| x.borrow().hash(&mut s)); 675 | s.finish() 676 | }; 677 | let hash2 = { 678 | let mut s = self.set.hasher().build_hasher(); 679 | s.write_u64(hash); 680 | s.finish() 681 | }; 682 | let is_match = |&FixedHash(_, j): &FixedHash| { 683 | data[j as usize..][..self.block_len] 684 | .iter() 685 | .zip(block.clone()) 686 | .all(|(x, y)| x == y.borrow()) 687 | }; 688 | self.set 689 | .raw_entry() 690 | .from_hash(hash2, is_match) 691 | .map(|(&FixedHash(_, i), _)| i) 692 | } 693 | } 694 | 695 | struct CpTrie { 696 | high_start: u32, 697 | index: Vec, 698 | data: Vec, 699 | } 700 | 701 | fn main() -> Result<(), Box> { 702 | #[allow(clippy::assertions_on_constants)] 703 | const _: () = debug_assert!(NUM_STATES <= 0x3F, "too many states"); 704 | 705 | let pair_table = rules2table! 
{ 706 | // Non-tailorable Line Breaking Rules 707 | // LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ, 708 | // SA, SG, and XX into other line breaking classes depending on criteria outside the scope 709 | // of this algorithm. 710 | Treat (AI | SG | XX | SA) as if it were AL, Treat CJ as if it were NS, 711 | // Start and end of text: 712 | sot '×', // LB2 Never break at the start of text. 713 | '!' eot, // LB3 Always break at the end of text. 714 | // Mandatory breaks: 715 | BK '!', // LB4 Always break after hard line breaks. 716 | // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks. 717 | CR '×' LF, CR '!', LF '!', NL '!', 718 | '×' (BK | CR | LF | NL), // LB6 Do not break before hard line breaks. 719 | // Explicit breaks and non-breaks: 720 | '×' SP, '×' ZW, // LB7 Do not break before spaces or zero width space. 721 | // LB8 Break before any character following a zero-width space, even if one or more spaces 722 | // intervene. 723 | (ZW | ZWSP) '÷', Treat (ZW | ZWSP) SP as if it were ZWSP, Treat ZWSP as if it were SP, 724 | // ZWJ '×', // XXX Handled explicitly // LB8a Do not break after a zero width joiner. 725 | // Combining marks: 726 | // LB9 Do not break a combining character sequence; treat it as if it has the line breaking 727 | // class of the base character in all of the following rules. Treat ZWJ as if it were CM. 728 | Treat X (CM | ZWJ)* as if it were X where X = [^BK CR LF NL SP ZW sot eot ZWSP OPSP QUSP CLSP CPSP B2SP], 729 | Treat (CM | ZWJ) as if it were AL, // LB10 Treat any remaining combining mark or ZWJ as AL. 730 | // Word joiner: 731 | '×' WJ, WJ '×', // LB11 Do not break before or after Word joiner and related characters. 732 | // Non-breaking characters: 733 | GL '×', // LB12 Do not break after NBSP and related characters. 734 | 735 | // Tailorable Line Breaking Rules 736 | // LB12a Do not break before NBSP and related characters, except after spaces and hyphens. 
737 | [^SP BA HY sot eot ZWSP OPSP QUSP CLSP CPSP B2SP] '×' GL, 738 | // LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. 739 | '×' CL, '×' CP, '×' EX, '×' IS, '×' SY, 740 | // LB14 Do not break after ‘[’, even after spaces. 741 | (OP | OPSP) '×', Treat (OP | OPSP) SP as if it were OPSP, Treat ZWSP as if it were SP, 742 | // LB15 Do not break within ‘”[’, even with intervening spaces. 743 | (QU | QUSP) '×' OP, Treat (QU | QUSP) SP as if it were QUSP, Treat QUSP as if it were SP, 744 | // LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with 745 | // intervening spaces. 746 | (CL | CLSP | CP | CPSP) '×' NS, 747 | Treat (CL | CLSP) SP as if it were CLSP, Treat CLSP as if it were SP, 748 | Treat (CP | CPSP) SP as if it were CPSP, Treat CPSP as if it were SP, 749 | // LB17 Do not break within ‘——’, even with intervening spaces. 750 | (B2 | B2SP) '×' B2, Treat (B2 | B2SP) SP as if it were B2SP, Treat B2SP as if it were SP, 751 | // Spaces: 752 | SP '÷', // LB18 Break after spaces. 753 | // Special case rules: 754 | '×' QU, QU '×', // LB19 Do not break before or after quotation marks, such as ‘”’. 755 | '÷' CB, CB '÷', // LB20 Break before and after unresolved CB. 756 | // LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana, 757 | // and other non-starters, or after acute accents. 758 | '×' BA, '×' HY, '×' NS, BB '×', 759 | // LB21a Don't break after Hebrew + Hyphen. // XXX Use a single state, HLHYBA, for HLHY and HLBA 760 | HLHYBA '×', Treat HL (HY | BA) as if it were HLHYBA, Treat HLHYBA as if it were HY, 761 | SY '×' HL, // LB21b Don’t break between Solidus and Hebrew letters. 762 | '×' IN, // LB22 Do not break before ellipses. 763 | // Numbers: 764 | (AL | HL) '×' NU, NU '×' (AL | HL), // LB23 Do not break between digits and letters. 765 | // LB23a Do not break between numeric prefixes and ideographs, or between ideographs and 766 | // numeric postfixes. 
767 | PR '×' (ID | EB | EM), (ID | EB | EM) '×' PO, 768 | // LB24 Do not break between numeric prefix/postfix and letters, or between letters and 769 | // prefix/postfix. 770 | (PR | PO) '×' (AL | HL), (AL | HL) '×' (PR | PO), 771 | // LB25 Do not break between the following pairs of classes relevant to numbers: 772 | CL '×' PO, CP '×' PO, CL '×' PR, CP '×' PR, NU '×' PO, NU '×' PR, PO '×' OP, PO '×' NU, PR '×' OP, PR '×' NU, HY '×' NU, IS '×' NU, NU '×' NU, SY '×' NU, 773 | // Korean syllable blocks 774 | // LB26 Do not break a Korean syllable. 775 | JL '×' (JL | JV | H2 | H3), (JV | H2) '×' (JV | JT), (JT | H3) '×' JT, 776 | // LB27 Treat a Korean Syllable Block the same as ID. 777 | (JL | JV | JT | H2 | H3) '×' PO, PR '×' (JL | JV | JT | H2 | H3), 778 | // Finally, join alphabetic letters into words and break everything else. 779 | (AL | HL) '×' (AL | HL), // LB28 Do not break between alphabetics (“at”). 780 | IS '×' (AL | HL), // LB29 Do not break between numeric punctuation and alphabetics (“e.g.”). 781 | // LB30 Do not break between letters, numbers, or ordinary symbols and opening or closing 782 | // parentheses. 783 | (AL | HL | NU) '×' OP, CP '×' (AL | HL | NU), 784 | // LB30a Break between two regional indicator symbols if and only if there are an even 785 | // number of regional indicators preceding the position of the break. 786 | RI '×' RI, Treat RI RI as if it were RIRI, Treat RIRI as if it were RI, 787 | EB '×' EM, // LB30b Do not break between an emoji base and an emoji modifier. 788 | '÷' ALL, ALL '÷', // LB31 Break everywhere else. 
789 | }; 790 | 791 | // Synthesize all non-"safe" pairs from pair table 792 | let unsafe_pairs = (0..NUM_CLASSES).flat_map(|j| { 793 | (0..NUM_CLASSES).filter_map(move |i| { 794 | // All states that could have resulted from break class "i" 795 | let possible_states = pair_table 796 | .iter() 797 | .map(|row| (row[i] & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT)) as usize); 798 | // Check if all state transitions due to "j" are the same 799 | if possible_states.map(|s| pair_table[s][j]).all_equal() { 800 | None 801 | } else { 802 | Some((i, j)) 803 | } 804 | }) 805 | }); 806 | 807 | let re = Regex::new( 808 | r"(?x)^ 809 | (?P[[:xdigit:]]{4,}) # Unicode code point 810 | (?:\.{2}(?P[[:xdigit:]]{4,}))? # End of range 811 | ; 812 | (?P\w{2,3}) # Line_Break property", 813 | )?; 814 | let prop_ranges = BufReader::new(File::open("../LineBreak.txt")?) 815 | .lines() 816 | .map(Result::unwrap) 817 | .filter(|l| !(l.starts_with('#') || l.is_empty())) 818 | .map(|l| { 819 | let caps = re.captures(&l).unwrap(); 820 | let start = u32::from_str_radix(&caps["start"], 16).unwrap(); 821 | let end = caps 822 | .name("end") 823 | .map_or(start, |m| u32::from_str_radix(m.as_str(), 16).unwrap()); 824 | let lb: BreakClass = caps["lb"].parse().unwrap(); 825 | (start..end + 1, lb) 826 | }); 827 | let trie = { 828 | // All code points, assigned and unassigned, that are not listed explicitly are given the value "XX" 829 | let mut builder = CpTrieBuilder::new(XX); 830 | // The unassigned code points in the following blocks default to "ID" 831 | builder.set_range(0x3400..0x4DBF + 1, ID); 832 | builder.set_range(0x4E00..0x9FFF + 1, ID); 833 | builder.set_range(0xF900..0xFAFF + 1, ID); 834 | // All undesignated code points in Planes 2 and 3, whether inside or outside of allocated blocks, default to "ID" 835 | builder.set_range(0x20000..0x2FFFD + 1, ID); 836 | builder.set_range(0x30000..0x3FFFD + 1, ID); 837 | // All unassigned code points in the following Plane 1 range, whether inside or outside 
of allocated blocks, also default to "ID" 838 | builder.set_range(0x1F000..0x1FAFF + 1, ID); 839 | builder.set_range(0x1FC00..0x1FFFD + 1, ID); 840 | // The unassigned code points in the following block default to "PR" 841 | builder.set_range(0x20A0..0x20CF + 1, PR); 842 | 843 | prop_ranges.for_each(|(range, lb)| builder.set_range(range, lb)); 844 | builder.build() 845 | }; 846 | 847 | let mut stream = BufWriter::new(File::create("../src/tables.rs")?); 848 | writeln!( 849 | stream, 850 | "const BREAK_PROP_TRIE_HIGH_START: u32 = {}; 851 | static BREAK_PROP_TRIE_INDEX: [u16; {}] = {:?}; 852 | static BREAK_PROP_TRIE_DATA: [BreakClass; {}] = [", 853 | trie.high_start, 854 | trie.index.len(), 855 | trie.index, 856 | trie.data.len(), 857 | )?; 858 | trie.data 859 | .into_iter() 860 | .flat_map(|x| [BREAK_CLASS_TABLE[x as usize], ","]) 861 | .try_for_each(|s| write!(stream, "{}", s))?; 862 | write!( 863 | stream, 864 | "]; 865 | 866 | static PAIR_TABLE: [[u8; {}]; {}] = [", 867 | NUM_CLASSES_EOT, NUM_STATES 868 | )?; 869 | for row in &pair_table { 870 | write!(stream, "[")?; 871 | for x in row { 872 | write!(stream, "{},", x)?; 873 | } 874 | write!(stream, "],")?; 875 | } 876 | writeln!( 877 | stream, 878 | r"]; 879 | 880 | fn is_safe_pair(a: BreakClass, b: BreakClass) -> bool {{ 881 | !matches!((a, b), {}) 882 | }}", 883 | unsafe_pairs 884 | .map(|(i, j)| format!("({}, {})", BREAK_CLASS_TABLE[i], BREAK_CLASS_TABLE[j])) 885 | .collect::>() 886 | .join("|") 887 | )?; 888 | stream.flush()?; // `BufWriter` ignores write errors when dropped, so flush explicitly to report a truncated tables.rs 889 | Ok(()) 890 | } 891 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Implementation of the Line Breaking Algorithm described in [Unicode Standard Annex #14][UAX14]. 2 | //! 3 | //! Given an input text, locates "line break opportunities", or positions appropriate for wrapping 4 | //! lines when displaying text. 5 | //! 6 | //! # Example 7 | //! 8 | //! ``` 9 | //!
use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}}; 10 | //! 11 | //! let text = "a b \nc"; 12 | //! assert!(linebreaks(text).eq([ 13 | //! (2, Allowed), // May break after first space 14 | //! (5, Mandatory), // Must break after line feed 15 | //! (6, Mandatory) // Must break at end of text, so that there always is at least one LB 16 | //! ])); 17 | //! ``` 18 | //! 19 | //! [UAX14]: https://www.unicode.org/reports/tr14/ 20 | 21 | #![no_std] 22 | #![deny(missing_docs, missing_debug_implementations)] 23 | 24 | use core::iter::once; 25 | 26 | /// The [Unicode version](https://www.unicode.org/versions/) conformed to. 27 | pub const UNICODE_VERSION: (u8, u8, u8) = (15, 0, 0); 28 | 29 | include!("shared.rs"); 30 | include!("tables.rs"); 31 | 32 | /// Returns the line break property of the specified code point. 33 | /// 34 | /// # Examples 35 | /// 36 | /// ``` 37 | /// use unicode_linebreak::{BreakClass, break_property}; 38 | /// assert_eq!(break_property(0x2CF3), BreakClass::Alphabetic); 39 | /// ``` 40 | #[inline(always)] 41 | pub fn break_property(codepoint: u32) -> BreakClass { 42 | const BMP_INDEX_LENGTH: u32 = BMP_LIMIT >> BMP_SHIFT; 43 | const OMITTED_BMP_INDEX_1_LENGTH: u32 = BMP_LIMIT >> SHIFT_1; 44 | 45 | let data_pos = if codepoint < BMP_LIMIT { 46 | let i = codepoint >> BMP_SHIFT; 47 | BREAK_PROP_TRIE_INDEX[i as usize] + (codepoint & (BMP_DATA_BLOCK_LENGTH - 1)) as u16 48 | } else if codepoint < BREAK_PROP_TRIE_HIGH_START { 49 | let i1 = codepoint >> SHIFT_1; 50 | let i2 = BREAK_PROP_TRIE_INDEX 51 | [(i1 + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH) as usize] 52 | + ((codepoint >> SHIFT_2) & (INDEX_2_BLOCK_LENGTH - 1)) as u16; 53 | let i3_block = BREAK_PROP_TRIE_INDEX[i2 as usize]; 54 | let i3_pos = ((codepoint >> SHIFT_3) & (INDEX_3_BLOCK_LENGTH - 1)) as u16; 55 | 56 | debug_assert!(i3_block & 0x8000 == 0, "18-bit indices are unexpected"); 57 | let data_block = BREAK_PROP_TRIE_INDEX[(i3_block + i3_pos) as usize]; 58 | data_block 
+ (codepoint & (SMALL_DATA_BLOCK_LENGTH - 1)) as u16 59 | } else { 60 | return XX; 61 | }; 62 | BREAK_PROP_TRIE_DATA[data_pos as usize] 63 | } 64 | 65 | /// Break opportunity type. 66 | #[derive(Copy, Clone, PartialEq, Eq, Debug)] 67 | pub enum BreakOpportunity { 68 | /// A line must break at this spot. 69 | Mandatory, 70 | /// A line is allowed to end at this spot. 71 | Allowed, 72 | } 73 | 74 | /// Returns an iterator over line break opportunities in the specified string. 75 | /// 76 | /// Break opportunities are given as tuples of the byte index of the character succeeding the break 77 | /// and the type. 78 | /// 79 | /// Uses the default Line Breaking Algorithm with the tailoring that Complex-Context Dependent 80 | /// (SA) characters get resolved to Ordinary Alphabetic and Symbol Characters (AL) regardless of 81 | /// General_Category. 82 | /// 83 | /// # Examples 84 | /// 85 | /// ``` 86 | /// use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}}; 87 | /// assert!(linebreaks("Hello world!").eq(vec![(6, Allowed), (12, Mandatory)])); 88 | /// ``` 89 | pub fn linebreaks(s: &str) -> impl Iterator + Clone + '_ { 90 | use BreakOpportunity::{Allowed, Mandatory}; 91 | 92 | s.char_indices() 93 | .map(|(i, c)| (i, break_property(c as u32) as u8)) 94 | .chain(once((s.len(), eot))) 95 | .scan((sot, false), |state, (i, cls)| { 96 | // ZWJ is handled outside the table to reduce its size 97 | let val = PAIR_TABLE[state.0 as usize][cls as usize]; 98 | let is_mandatory = val & MANDATORY_BREAK_BIT != 0; 99 | let is_break = val & ALLOWED_BREAK_BIT != 0 && (!state.1 || is_mandatory); 100 | *state = ( 101 | val & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT), 102 | cls == BreakClass::ZeroWidthJoiner as u8, 103 | ); 104 | 105 | Some((i, is_break, is_mandatory)) 106 | }) 107 | .filter_map(|(i, is_break, is_mandatory)| { 108 | if is_break { 109 | Some((i, if is_mandatory { Mandatory } else { Allowed })) 110 | } else { 111 | None 112 | } 113 | }) 114 | } 115 | 116 
| /// Divides the string at the last index where further breaks do not depend on prior context. 117 | /// 118 | /// The trivial index at `eot` is excluded. 119 | /// 120 | /// A common optimization is to determine only the nearest line break opportunity before the first 121 | /// character that would cause the line to become overfull, requiring backward traversal, of which 122 | /// there are two approaches: 123 | /// 124 | /// * Cache breaks from forward traversals 125 | /// * Step backward and with `split_at_safe` find a pos to safely search forward from, repeatedly 126 | /// 127 | /// # Examples 128 | /// 129 | /// ``` 130 | /// use unicode_linebreak::{linebreaks, split_at_safe}; 131 | /// let s = "Not allowed to break within em dashes: — —"; 132 | /// let (prev, safe) = split_at_safe(s); 133 | /// let n = prev.len(); 134 | /// assert!(linebreaks(safe).eq(linebreaks(s).filter_map(|(i, x)| i.checked_sub(n).map(|i| (i, x))))); 135 | /// ``` 136 | pub fn split_at_safe(s: &str) -> (&str, &str) { 137 | let mut chars = s.char_indices().rev().scan(None, |state, (i, c)| { 138 | let cls = break_property(c as u32); 139 | let is_safe_pair = state 140 | .replace(cls) 141 | .map_or(false, |prev| is_safe_pair(cls, prev)); // Reversed since iterating backwards 142 | Some((i, is_safe_pair)) 143 | }); 144 | chars.find(|&(_, is_safe_pair)| is_safe_pair); 145 | // Include preceding char for `linebreaks` to pick up break before match (disallowed after sot) 146 | s.split_at(chars.next().map_or(0, |(i, _)| i)) 147 | } 148 | 149 | #[cfg(test)] 150 | mod tests { 151 | use super::*; 152 | 153 | #[test] 154 | fn it_works() { 155 | assert_eq!(break_property(0xA), BreakClass::LineFeed); 156 | assert_eq!(break_property(0xDB80), BreakClass::Surrogate); 157 | assert_eq!(break_property(0xe01ef), BreakClass::CombiningMark); 158 | assert_eq!(break_property(0x10ffff), BreakClass::Unknown); 159 | } 160 | } 161 | -------------------------------------------------------------------------------- 
/src/shared.rs: -------------------------------------------------------------------------------- 1 | /// Unicode line breaking class. 2 | #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] 3 | #[repr(u8)] 4 | pub enum BreakClass { 5 | // Non-tailorable 6 | /// Cause a line break (after) 7 | Mandatory, 8 | /// Cause a line break (after), except between CR and LF 9 | CarriageReturn, 10 | /// Cause a line break (after) 11 | LineFeed, 12 | /// Prohibit a line break between the character and the preceding character 13 | CombiningMark, 14 | /// Cause a line break (after) 15 | NextLine, 16 | /// Do not occur in well-formed text 17 | Surrogate, 18 | /// Prohibit line breaks before and after 19 | WordJoiner, 20 | /// Provide a break opportunity 21 | ZeroWidthSpace, 22 | /// Prohibit line breaks before and after 23 | NonBreakingGlue, 24 | /// Enable indirect line breaks 25 | Space, 26 | /// Prohibit line breaks within joiner sequences 27 | ZeroWidthJoiner, 28 | // Break opportunities 29 | /// Provide a line break opportunity before and after the character 30 | BeforeAndAfter, 31 | /// Generally provide a line break opportunity after the character 32 | After, 33 | /// Generally provide a line break opportunity before the character 34 | Before, 35 | /// Provide a line break opportunity after the character, except in numeric context 36 | Hyphen, 37 | /// Provide a line break opportunity contingent on additional information 38 | Contingent, 39 | // Characters prohibiting certain breaks 40 | /// Prohibit line breaks before 41 | ClosePunctuation, 42 | /// Prohibit line breaks before 43 | CloseParenthesis, 44 | /// Prohibit line breaks before 45 | Exclamation, 46 | /// Allow only indirect line breaks between pairs 47 | Inseparable, 48 | /// Allow only indirect line breaks before 49 | NonStarter, 50 | /// Prohibit line breaks after 51 | OpenPunctuation, 52 | /// Act like they are both opening and closing 53 | Quotation, 54 | // Numeric context 55 | /// Prevent breaks after any and 
before numeric 56 | InfixSeparator, 57 | /// Form numeric expressions for line breaking purposes 58 | Numeric, 59 | /// Do not break following a numeric expression 60 | Postfix, 61 | /// Do not break in front of a numeric expression 62 | Prefix, 63 | /// Prevent a break before, and allow a break after 64 | Symbol, 65 | // Other characters 66 | /// Act like AL when the resolved EAW is N; otherwise, act as ID 67 | Ambiguous, 68 | /// Are alphabetic characters or symbols that are used with alphabetic characters 69 | Alphabetic, 70 | /// Treat as NS or ID for strict or normal breaking. 71 | ConditionalJapaneseStarter, 72 | /// Do not break from following Emoji Modifier 73 | EmojiBase, 74 | /// Do not break from preceding Emoji Base 75 | EmojiModifier, 76 | /// Form Korean syllable blocks 77 | HangulLvSyllable, 78 | /// Form Korean syllable blocks 79 | HangulLvtSyllable, 80 | /// Do not break around a following hyphen; otherwise act as Alphabetic 81 | HebrewLetter, 82 | /// Break before or after, except in some numeric context 83 | Ideographic, 84 | /// Form Korean syllable blocks 85 | HangulLJamo, 86 | /// Form Korean syllable blocks 87 | HangulVJamo, 88 | /// Form Korean syllable blocks 89 | HangulTJamo, 90 | /// Keep pairs together. 
For pairs, break before and after other classes 91 | RegionalIndicator, 92 | /// Provide a line break opportunity contingent on additional, language-specific context analysis 93 | ComplexContext, 94 | /// Have as yet unknown line breaking behavior or unassigned code positions 95 | Unknown, 96 | } 97 | 98 | use BreakClass::{ 99 | After as BA, Alphabetic as AL, Ambiguous as AI, Before as BB, BeforeAndAfter as B2, 100 | CarriageReturn as CR, CloseParenthesis as CP, ClosePunctuation as CL, CombiningMark as CM, 101 | ComplexContext as SA, ConditionalJapaneseStarter as CJ, Contingent as CB, EmojiBase as EB, 102 | EmojiModifier as EM, Exclamation as EX, HangulLJamo as JL, HangulLvSyllable as H2, 103 | HangulLvtSyllable as H3, HangulTJamo as JT, HangulVJamo as JV, HebrewLetter as HL, 104 | Hyphen as HY, Ideographic as ID, InfixSeparator as IS, Inseparable as IN, LineFeed as LF, 105 | Mandatory as BK, NextLine as NL, NonBreakingGlue as GL, NonStarter as NS, Numeric as NU, 106 | OpenPunctuation as OP, Postfix as PO, Prefix as PR, Quotation as QU, RegionalIndicator as RI, 107 | Space as SP, Surrogate as SG, Symbol as SY, Unknown as XX, WordJoiner as WJ, 108 | ZeroWidthJoiner as ZWJ, ZeroWidthSpace as ZW, 109 | }; 110 | 111 | /// Exclusive upper bound for code points in the Basic Multilingual Plane (BMP). 112 | const BMP_LIMIT: u32 = 0x10000; 113 | 114 | /// Shift size for getting index-3 table offset. 115 | const SHIFT_3: u32 = 4; 116 | /// Shift size for getting index-2 table offset. 117 | const SHIFT_2: u32 = 5 + SHIFT_3; 118 | /// Shift size for getting index-1 table offset. 119 | const SHIFT_1: u32 = 5 + SHIFT_2; 120 | /// Shift size for getting BMP block start.
121 | const BMP_SHIFT: u32 = 6; 122 | 123 | const INDEX_2_BLOCK_LENGTH: u32 = 1 << (SHIFT_1 - SHIFT_2); 124 | const INDEX_3_BLOCK_LENGTH: u32 = 1 << (SHIFT_2 - SHIFT_3); 125 | const SMALL_DATA_BLOCK_LENGTH: u32 = 1 << SHIFT_3; 126 | const BMP_DATA_BLOCK_LENGTH: u32 = 1 << BMP_SHIFT; 127 | 128 | const ALLOWED_BREAK_BIT: u8 = 0x80; 129 | const MANDATORY_BREAK_BIT: u8 = 0x40; 130 | 131 | #[allow(non_upper_case_globals)] 132 | const eot: u8 = 43; 133 | #[allow(non_upper_case_globals)] 134 | const sot: u8 = 44; 135 | -------------------------------------------------------------------------------- /tests/test_default.rs: -------------------------------------------------------------------------------- 1 | //! Default Line_Break test. 2 | 3 | use std::char; 4 | use std::fs::File; 5 | use std::io::{self, prelude::*, BufReader}; 6 | use std::iter::from_fn; 7 | use std::u32; 8 | use unicode_linebreak::*; 9 | 10 | const TEST_FILE: &str = "tests/LineBreakTest.txt"; 11 | 12 | #[test] 13 | fn test_lb_default() -> io::Result<()> { 14 | let file = File::open(TEST_FILE)?; 15 | for line in BufReader::new(file) 16 | .lines() 17 | .map(|l| l.unwrap()) 18 | .filter(|l| !l.starts_with('#')) 19 | { 20 | let (line, comment) = line.split_once("# ").expect("Missing comment"); 21 | 22 | // Skip tests relying on some tailorable rules 23 | if comment.contains("[30.22]") || comment.contains("[999.0]") { 24 | continue; 25 | } 26 | 27 | let mut items = line.split_whitespace(); 28 | items.next().unwrap(); // Skip first '×' 29 | let mut byte_idx = 0; 30 | let (spots, string): (Vec<_>, String) = from_fn(|| { 31 | if let Some(hex) = items.next() { 32 | let codepoint = u32::from_str_radix(hex, 16) 33 | .ok() 34 | .and_then(char::from_u32) 35 | .expect("Invalid codepoint"); 36 | byte_idx += codepoint.len_utf8(); 37 | 38 | let is_break = match items.next() { 39 | Some("÷") => true, 40 | Some("×") => false, 41 | _ => unreachable!(), 42 | }; 43 | 44 | Some(((byte_idx, is_break), codepoint)) 45 | } else { 
46 | None 47 | } 48 | }) 49 | .unzip(); 50 | 51 | let actual: Vec<_> = linebreaks(&string).map(|(i, _)| i).collect(); 52 | let expected: Vec<_> = spots 53 | .into_iter() 54 | .filter_map(|(i, is_break)| if is_break { Some(i) } else { None }) 55 | .collect(); 56 | 57 | assert_eq!( 58 | actual, expected, 59 | "String: ‘{}’, comment: {}", 60 | string, comment 61 | ); 62 | } 63 | 64 | Ok(()) 65 | } 66 | --------------------------------------------------------------------------------