├── .github ├── FUNDING.yml └── workflows │ └── ci.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benches ├── bench.rs └── tables │ ├── fst │ ├── general_category.fst │ ├── general_category.rs │ ├── jamo_short_name.fst │ ├── jamo_short_name.rs │ ├── mod.rs │ ├── names.fst │ └── names.rs │ ├── mod.rs │ ├── slice │ ├── general_categories.rs │ ├── general_category.rs │ ├── jamo_short_name.rs │ ├── mod.rs │ └── names.rs │ └── trie │ ├── general_categories.rs │ └── mod.rs ├── rustfmt.toml ├── scripts └── generate-unicode-tables ├── src ├── age.rs ├── app.rs ├── args.rs ├── bidi_class.rs ├── bidi_mirroring_glyph.rs ├── brk.rs ├── canonical_combining_class.rs ├── case_folding.rs ├── case_mapping.rs ├── error.rs ├── general_category.rs ├── jamo_short_name.rs ├── joining_type.rs ├── main.rs ├── names.rs ├── property_bool.rs ├── script.rs ├── util.rs └── writer.rs ├── ucd-parse ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md └── src │ ├── age.rs │ ├── arabic_shaping.rs │ ├── bidi_mirroring_glyph.rs │ ├── case_folding.rs │ ├── common.rs │ ├── core_properties.rs │ ├── derived_normalization_properties.rs │ ├── east_asian_width.rs │ ├── emoji_properties.rs │ ├── error.rs │ ├── extracted │ ├── derived_bidi_class.rs │ ├── derived_binary_properties.rs │ ├── derived_combining_class.rs │ ├── derived_decomposition_type.rs │ ├── derived_east_asian_width.rs │ ├── derived_general_category.rs │ ├── derived_joining_group.rs │ ├── derived_joining_type.rs │ ├── derived_line_break.rs │ ├── derived_name.rs │ ├── derived_numeric_type.rs │ ├── derived_numeric_values.rs │ └── mod.rs │ ├── grapheme_cluster_break.rs │ ├── jamo_short_name.rs │ ├── lib.rs │ ├── line_break.rs │ ├── name_aliases.rs │ ├── prop_list.rs │ ├── property_aliases.rs │ ├── property_value_aliases.rs │ ├── script_extensions.rs │ ├── scripts.rs │ ├── sentence_break.rs │ ├── special_casing.rs │ ├── unicode_data.rs │ └── word_break.rs ├── ucd-trie ├── Cargo.toml 
├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benches │ └── bench.rs └── src │ ├── general_category.rs │ ├── lib.rs │ └── owned.rs └── ucd-util ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── LICENSE-UNICODE ├── README.md └── src ├── hangul.rs ├── ideograph.rs ├── lib.rs ├── name.rs ├── property.rs └── unicode_tables ├── jamo_short_name.rs ├── mod.rs ├── property_names.rs └── property_values.rs /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [BurntSushi] 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - master 7 | schedule: 8 | - cron: '00 01 * * *' 9 | jobs: 10 | test: 11 | name: test 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | # The docs seem to suggest that we can have a matrix with just an 16 | # include directive, but it result in a "matrix must define at least 17 | # one vector" error in the CI system. 
18 | build: [pinned, stable, beta, nightly] 19 | include: 20 | - build: pinned 21 | os: ubuntu-latest 22 | rust: 1.70.0 23 | - build: stable 24 | os: ubuntu-latest 25 | rust: stable 26 | - build: beta 27 | os: ubuntu-latest 28 | rust: beta 29 | - build: nightly 30 | os: ubuntu-latest 31 | rust: nightly 32 | steps: 33 | - name: Checkout repository 34 | uses: actions/checkout@v3 35 | - name: Install Rust 36 | uses: dtolnay/rust-toolchain@master 37 | with: 38 | toolchain: ${{ matrix.rust }} 39 | - run: cargo build --all --verbose 40 | - run: cargo doc --all --verbose 41 | - run: cargo test --all --verbose 42 | - if: matrix.build == 'nightly' 43 | run: cargo bench --all --verbose --no-run 44 | 45 | rustfmt: 46 | name: rustfmt 47 | runs-on: ubuntu-latest 48 | steps: 49 | - name: Checkout repository 50 | uses: actions/checkout@v3 51 | - name: Install Rust 52 | uses: dtolnay/rust-toolchain@master 53 | with: 54 | toolchain: stable 55 | components: rustfmt 56 | - name: Check formatting 57 | run: | 58 | cargo fmt -- --check 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | tags 3 | target 4 | /x 5 | perf.data* 6 | /tmp 7 | /ucd-parse/Cargo.lock 8 | /ucd-trie/Cargo.lock 9 | /ucd-util/Cargo.lock 10 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "bitflags" 7 | version = "1.3.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 10 | 11 | [[package]] 12 | name = "clap" 13 | version = "2.34.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" 16 | dependencies = [ 17 | "bitflags", 18 | "strsim", 19 | "textwrap", 20 | "unicode-width", 21 | ] 22 | 23 | [[package]] 24 | name = "fst" 25 | version = "0.4.7" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a" 28 | 29 | [[package]] 30 | name = "once_cell" 31 | version = "1.18.0" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" 34 | 35 | [[package]] 36 | name = "regex-lite" 37 | version = "0.1.0" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "f96ede7f386ba6e910092e7ccdc04176cface62abebea07ed6b46d870ed95ca2" 40 | 41 | [[package]] 42 | name = "strsim" 43 | version = "0.8.0" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" 46 | 47 | [[package]] 48 | name = "textwrap" 49 | version = "0.11.0" 50 | source = "registry+https://github.com/rust-lang/crates.io-index" 51 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 52 | dependencies = [ 53 | "unicode-width", 54 | ] 55 | 56 | [[package]] 57 | name = "ucd-generate" 58 | version = "0.3.1" 59 | dependencies = [ 60 | "clap", 61 | "fst", 62 | "once_cell", 63 | "ucd-parse", 64 | "ucd-trie", 65 | "ucd-util", 66 | ] 67 | 68 | [[package]] 69 | name = "ucd-parse" 70 | version = "0.1.13" 71 | dependencies = [ 72 | 
"regex-lite", 73 | ] 74 | 75 | [[package]] 76 | name = "ucd-trie" 77 | version = "0.1.7" 78 | dependencies = [ 79 | "once_cell", 80 | ] 81 | 82 | [[package]] 83 | name = "ucd-util" 84 | version = "0.2.2" 85 | 86 | [[package]] 87 | name = "unicode-width" 88 | version = "0.1.10" 89 | source = "registry+https://github.com/rust-lang/crates.io-index" 90 | checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" 91 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ucd-generate" 3 | version = "0.3.1" #:version 4 | authors = ["Andrew Gallant "] 5 | description = """ 6 | A program for generating packed representations of the Unicode character 7 | database that can be efficiently searched. 8 | """ 9 | documentation = "https://github.com/BurntSushi/ucd-generate" 10 | homepage = "https://github.com/BurntSushi/ucd-generate" 11 | repository = "https://github.com/BurntSushi/ucd-generate" 12 | readme = "README.md" 13 | keywords = ["unicode", "generate", "character", "table", "fst"] 14 | license = "MIT OR Apache-2.0" 15 | categories = ["text-processing", "internationalization"] 16 | edition = "2021" 17 | rust-version = "1.70" 18 | 19 | [workspace] 20 | members = ["ucd-parse", "ucd-trie", "ucd-util"] 21 | 22 | [[bin]] 23 | bench = false 24 | path = "src/main.rs" 25 | name = "ucd-generate" 26 | 27 | [dependencies] 28 | fst = "0.4.0" 29 | ucd-parse = { version = "0.1.10", path = "ucd-parse" } 30 | ucd-trie = { version = "0.1.7", path = "ucd-trie" } 31 | ucd-util = { version = "0.2.2", path = "ucd-util" } 32 | 33 | [dependencies.clap] 34 | version = "2.34.0" 35 | default-features = false 36 | features = ["suggestions"] 37 | 38 | [dev-dependencies] 39 | once_cell = "1" 40 | 41 | [profile.release] 42 | debug = true 43 | -------------------------------------------------------------------------------- /LICENSE-MIT: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2014 The Rust Project Developers 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ucd-generate 2 | ============ 3 | A command line tool to generate Unicode tables in Rust source code. Tables 4 | can typically be generated in one of three formats: a sorted sequence of 5 | character ranges, a 6 | [finite state transducer](https://github.com/BurntSushi/fst) 7 | or a compressed trie. Full support for name canonicalization is also provided. 
8 | 9 | [![Build status](https://github.com/BurntSushi/ucd-generate/workflows/ci/badge.svg)](https://github.com/BurntSushi/ucd-generate/actions) 10 | [![crates.io](https://img.shields.io/crates/v/ucd-generate.svg)](https://crates.io/crates/ucd-generate) 11 | 12 | 13 | ### Installation 14 | 15 | Since this is mostly intended as a developer tool for use while writing Rust 16 | programs, the principal method of installation is from crates.io: 17 | 18 | ``` 19 | $ cargo install ucd-generate 20 | $ ucd-generate --help 21 | ``` 22 | 23 | 24 | ### Example 25 | 26 | This somewhat arbitrary example shows the output of generating tables for 27 | three properties, and representing them as normal Rust character literal 28 | ranges. 29 | 30 | To run the example, you need to download the Unicode Character Database (UCD): 31 | 32 | ``` 33 | $ mkdir /tmp/ucd-15.0.0 34 | $ cd /tmp/ucd-15.0.0 35 | $ curl -LO https://www.unicode.org/Public/zipped/15.0.0/UCD.zip 36 | $ unzip UCD.zip 37 | ``` 38 | 39 | Note that prior to version 13.0.0, the `emoji/emoji-data.txt` file was distributed 40 | separately from the UCD bundle. For these versions, you may need to download this 41 | file from https://unicode.org/Public/emoji in order to generate certain tables. 42 | 43 | Now tell `ucd-generate` what you want and point it to the directory created 44 | above: 45 | 46 | ``` 47 | $ ucd-generate property-bool /tmp/ucd-15.0.0 --include Hyphen,Dash,Quotation_Mark --chars 48 | ``` 49 | 50 | And the output, which is valid Rust source code: 51 | 52 | ```rust 53 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: 54 | // 55 | // ucd-generate property-bool /tmp/ucd-15.0.0 --include Hyphen,Dash,Quotation_Mark --chars 56 | // 57 | // Unicode version: 15.0.0. 58 | // 59 | // ucd-generate 0.2.10 is available on crates.io. 
60 | 61 | pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ 62 | ("Dash", DASH), ("Hyphen", HYPHEN), ("Quotation_Mark", QUOTATION_MARK), 63 | ]; 64 | 65 | pub const DASH: &'static [(char, char)] = &[ 66 | ('-', '-'), ('֊', '֊'), ('־', '־'), ('᐀', '᐀'), ('᠆', '᠆'), 67 | ('‐', '―'), ('⁓', '⁓'), ('⁻', '⁻'), ('₋', '₋'), 68 | ('−', '−'), ('⸗', '⸗'), ('⸚', '⸚'), ('⸺', '⸻'), 69 | ('⹀', '⹀'), ('\u{2e5d}', '\u{2e5d}'), ('〜', '〜'), ('〰', '〰'), 70 | ('゠', '゠'), ('︱', '︲'), ('﹘', '﹘'), ('﹣', '﹣'), 71 | ('-', '-'), ('𐺭', '𐺭'), 72 | ]; 73 | 74 | pub const HYPHEN: &'static [(char, char)] = &[ 75 | ('-', '-'), ('\u{ad}', '\u{ad}'), ('֊', '֊'), ('᠆', '᠆'), 76 | ('‐', '‑'), ('⸗', '⸗'), ('・', '・'), ('﹣', '﹣'), 77 | ('-', '-'), ('・', '・'), 78 | ]; 79 | 80 | pub const QUOTATION_MARK: &'static [(char, char)] = &[ 81 | ('"', '"'), ('\'', '\''), ('«', '«'), ('»', '»'), ('‘', '‟'), 82 | ('‹', '›'), ('⹂', '⹂'), ('「', '』'), ('〝', '〟'), 83 | ('﹁', '﹄'), ('"', '"'), (''', '''), ('「', '」'), 84 | ]; 85 | ``` 86 | 87 | ### DFA serialization 88 | 89 | Prior to `ucd-generate 0.3.0`, the sub-commands `dfa` and `regex` could be used 90 | to build fully compiled DFAs, serialize them to disk and generate Rust code for 91 | deserializing them. This functionality was removed in `0.3.0` and 92 | [moved to `regex-cli`](https://github.com/rust-lang/regex/tree/master/regex-cli#example-serialize-a-dfa). 93 | 94 | ### Contributing 95 | 96 | The `ucd-generate` tool doesn't have any specific design goals, other than to 97 | collect Unicode table generation tasks. If you need `ucd-generate` to do 98 | something and it's reasonably straight-forward to add, then just submitting a 99 | PR would be great. Otherwise, file an issue and we can discuss. 100 | 101 | ### Alternatives 102 | 103 | The primary alternative is [ICU4X](https://github.com/unicode-org/icu4x). If 104 | you have sophisticated Unicode requirements, it is almost certainly what you 105 | should be using. 
106 | 107 | It's beyond the scope of this README to do a full comparison between ICU4X 108 | and `ucd-generate`, but I think the shortest way to describe it is that 109 | `ucd-generate` is _simplistic_, with all the associated positive and negative 110 | connotations that come with that word. 111 | 112 | 113 | ### Future work 114 | 115 | This tool is by no means exhaustive. In fact, it's not even close to 116 | exhaustive, and it may never be. For the most part, the intent of this tool 117 | is to collect virtually any kind of Unicode generation task. In theory, this 118 | would ideally replace the hodgepodge collection of Python programs that are 119 | responsible for this task today in various Unicode crates. 120 | 121 | Here are some examples of future work that would be welcome: 122 | 123 | * More support for parsing things in the UCD. 124 | * More generation tasks based on things in the UCD. 125 | * More output formats, especially for reducing binary size. 126 | 127 | 128 | ### Sub-crates 129 | 130 | This repository is home to three sub-crates: 131 | 132 | * [`ucd-parse`](ucd-parse) - A crate for parsing UCD files into 133 | structured data. 134 | * [`ucd-trie`](ucd-trie) - Auxiliary type for handling the trie 135 | set table format emitted by `ucd-generate`. This crate has a `no_std` mode. 136 | * [`ucd-util`](ucd-util) - A purposely small crate for Unicode 137 | auxiliary functions. This includes things like symbol or character name 138 | canonicalization, ideograph name generation and helper functions for 139 | searching property name and value tables. 140 | 141 | 142 | ### License 143 | 144 | This project is licensed under either of 145 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or 146 | https://www.apache.org/licenses/LICENSE-2.0) 147 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or 148 | https://opensource.org/licenses/MIT) 149 | at your option. 
150 | -------------------------------------------------------------------------------- /benches/bench.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | use std::cmp::Ordering; 6 | 7 | use test::Bencher; 8 | 9 | mod tables; 10 | 11 | fn u32_key(cp: u32) -> [u8; 4] { 12 | cp.to_be_bytes() 13 | } 14 | 15 | #[bench] 16 | fn general_category_slice(b: &mut Bencher) { 17 | let slice = tables::slice::general_category::GENERAL_CATEGORY; 18 | let mut i = 0; 19 | b.iter(|| { 20 | let (query, _, value) = slice[i]; 21 | i = (i + 1) % slice.len(); 22 | 23 | let pos = slice.binary_search_by(|&(s, e, _)| { 24 | if s > query { 25 | Ordering::Greater 26 | } else if e < query { 27 | Ordering::Less 28 | } else { 29 | Ordering::Equal 30 | } 31 | }); 32 | let found = slice[pos.unwrap()]; 33 | assert_eq!(found.2, value); 34 | }); 35 | } 36 | 37 | #[bench] 38 | fn general_category_fst(b: &mut Bencher) { 39 | let slice = tables::slice::general_category::GENERAL_CATEGORY; 40 | let fst = &tables::fst::general_category::GENERAL_CATEGORY; 41 | 42 | let mut i = 0; 43 | b.iter(|| { 44 | let (query, _, value) = slice[i]; 45 | i = (i + 1) % slice.len(); 46 | 47 | let found = fst.get(u32_key(query)).unwrap() as u8; 48 | assert_eq!(found, value); 49 | }); 50 | } 51 | 52 | #[bench] 53 | fn lowercase_letter_slice(b: &mut Bencher) { 54 | let slice = tables::slice::general_categories::LOWERCASE_LETTER; 55 | let mut i = 0; 56 | b.iter(|| { 57 | let (query, _) = slice[i]; 58 | i = (i + 1) % slice.len(); 59 | 60 | let pos = slice.binary_search_by(|&(s, e)| { 61 | if s > query { 62 | Ordering::Greater 63 | } else if e < query { 64 | Ordering::Less 65 | } else { 66 | Ordering::Equal 67 | } 68 | }); 69 | assert!(pos.is_ok()); 70 | }); 71 | } 72 | 73 | #[bench] 74 | fn lowercase_letter_trie(b: &mut Bencher) { 75 | let slice = tables::slice::general_categories::LOWERCASE_LETTER; 76 | let trie = 
tables::trie::general_categories::LOWERCASE_LETTER; 77 | let mut i = 0; 78 | b.iter(|| { 79 | let (query, _) = slice[i]; 80 | i = (i + 1) % slice.len(); 81 | assert!(trie.contains_u32(query)); 82 | }); 83 | } 84 | 85 | #[bench] 86 | fn names_slice(b: &mut Bencher) { 87 | let slice = tables::slice::names::NAMES; 88 | let mut i = 0; 89 | b.iter(|| { 90 | let (name, cp) = slice[i]; 91 | i = (i + 1) % slice.len(); 92 | 93 | let found = slice[slice.binary_search_by_key(&name, |x| x.0).unwrap()]; 94 | assert_eq!(found.1, cp); 95 | }); 96 | } 97 | 98 | #[bench] 99 | fn names_fst(b: &mut Bencher) { 100 | let slice = tables::slice::names::NAMES; 101 | let fst = &tables::fst::names::NAMES; 102 | 103 | let mut i = 0; 104 | b.iter(|| { 105 | let (name, cp) = slice[i]; 106 | i = (i + 1) % slice.len(); 107 | 108 | let found = fst.get(name).unwrap() as u32; 109 | assert_eq!(found, cp); 110 | }); 111 | } 112 | 113 | #[bench] 114 | fn jamo_short_name_fst(b: &mut Bencher) { 115 | let slice = tables::slice::jamo_short_name::JAMO_SHORT_NAME; 116 | let fst = &tables::fst::jamo_short_name::JAMO_SHORT_NAME; 117 | let mut i = 0; 118 | let mut value = String::new(); 119 | b.iter(|| { 120 | let (cp, name) = slice[i]; 121 | i = (i + 1) % slice.len(); 122 | 123 | let mut found = fst.get(u32_key(cp)).unwrap(); 124 | value.clear(); 125 | while found != 0 { 126 | value.push((found & 0xFF) as u8 as char); 127 | found = found >> 8; 128 | } 129 | assert_eq!(value, name); 130 | }); 131 | } 132 | 133 | #[bench] 134 | fn jamo_short_name_slice(b: &mut Bencher) { 135 | let slice = tables::slice::jamo_short_name::JAMO_SHORT_NAME; 136 | let mut i = 0; 137 | b.iter(|| { 138 | let (cp, name) = slice[i]; 139 | i = (i + 1) % slice.len(); 140 | 141 | let found = slice[slice.binary_search_by_key(&cp, |x| x.0).unwrap()]; 142 | assert_eq!(found.1, name); 143 | }); 144 | } 145 | 146 | #[bench] 147 | fn jamo_short_name_slice_linear(b: &mut Bencher) { 148 | let slice = 
tables::slice::jamo_short_name::JAMO_SHORT_NAME; 149 | let mut i = 0; 150 | b.iter(|| { 151 | let (cp, name) = slice[i]; 152 | i = (i + 1) % slice.len(); 153 | 154 | let found = slice.iter().find(|p| p.0 == cp).unwrap(); 155 | assert_eq!(found.1, name); 156 | }); 157 | } 158 | -------------------------------------------------------------------------------- /benches/tables/fst/general_category.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BurntSushi/ucd-generate/e8fa937f0cac643669dcaf5edac2785b15cab917/benches/tables/fst/general_category.fst -------------------------------------------------------------------------------- /benches/tables/fst/general_category.rs: -------------------------------------------------------------------------------- 1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: 2 | // 3 | // ucd-generate general-category ucd-16.0.0 --exclude unassigned --enum --fst-dir benches/tables/fst 4 | // 5 | // Unicode version: 16.0.0. 6 | // 7 | // ucd-generate 0.2.15 is available on crates.io. 
8 | 9 | pub const GENERAL_CATEGORY_ENUM: &'static [&'static str] = &[ 10 | "Close_Punctuation", 11 | "Connector_Punctuation", 12 | "Control", 13 | "Currency_Symbol", 14 | "Dash_Punctuation", 15 | "Decimal_Number", 16 | "Enclosing_Mark", 17 | "Final_Punctuation", 18 | "Format", 19 | "Initial_Punctuation", 20 | "Letter_Number", 21 | "Line_Separator", 22 | "Lowercase_Letter", 23 | "Math_Symbol", 24 | "Modifier_Letter", 25 | "Modifier_Symbol", 26 | "Nonspacing_Mark", 27 | "Open_Punctuation", 28 | "Other_Letter", 29 | "Other_Number", 30 | "Other_Punctuation", 31 | "Other_Symbol", 32 | "Paragraph_Separator", 33 | "Private_Use", 34 | "Space_Separator", 35 | "Spacing_Mark", 36 | "Surrogate", 37 | "Titlecase_Letter", 38 | "Uppercase_Letter", 39 | ]; 40 | 41 | pub static GENERAL_CATEGORY: ::once_cell::sync::Lazy< 42 | ::fst::Map<&'static [u8]>, 43 | > = ::once_cell::sync::Lazy::new(|| { 44 | ::fst::Map::from( 45 | ::fst::raw::Fst::new(&include_bytes!("general_category.fst")[..]) 46 | .unwrap(), 47 | ) 48 | }); 49 | -------------------------------------------------------------------------------- /benches/tables/fst/jamo_short_name.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BurntSushi/ucd-generate/e8fa937f0cac643669dcaf5edac2785b15cab917/benches/tables/fst/jamo_short_name.fst -------------------------------------------------------------------------------- /benches/tables/fst/jamo_short_name.rs: -------------------------------------------------------------------------------- 1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: 2 | // 3 | // ucd-generate jamo-short-name ucd-16.0.0 --fst-dir benches/tables/fst 4 | // 5 | // Unicode version: 16.0.0. 6 | // 7 | // ucd-generate 0.2.15 is available on crates.io. 
8 | 9 | pub static JAMO_SHORT_NAME: ::once_cell::sync::Lazy< 10 | ::fst::Map<&'static [u8]>, 11 | > = ::once_cell::sync::Lazy::new(|| { 12 | ::fst::Map::from( 13 | ::fst::raw::Fst::new(&include_bytes!("jamo_short_name.fst")[..]) 14 | .unwrap(), 15 | ) 16 | }); 17 | -------------------------------------------------------------------------------- /benches/tables/fst/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod general_category; 2 | pub mod jamo_short_name; 3 | pub mod names; 4 | -------------------------------------------------------------------------------- /benches/tables/fst/names.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BurntSushi/ucd-generate/e8fa937f0cac643669dcaf5edac2785b15cab917/benches/tables/fst/names.fst -------------------------------------------------------------------------------- /benches/tables/fst/names.rs: -------------------------------------------------------------------------------- 1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: 2 | // 3 | // ucd-generate names ucd-16.0.0 --no-aliases --no-hangul --no-ideograph --fst-dir benches/tables/fst 4 | // 5 | // Unicode version: 16.0.0. 6 | // 7 | // ucd-generate 0.2.15 is available on crates.io. 
8 | 9 | pub static NAMES: ::once_cell::sync::Lazy<::fst::Map<&'static [u8]>> = 10 | ::once_cell::sync::Lazy::new(|| { 11 | ::fst::Map::from( 12 | ::fst::raw::Fst::new(&include_bytes!("names.fst")[..]).unwrap(), 13 | ) 14 | }); 15 | -------------------------------------------------------------------------------- /benches/tables/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(dead_code)] 2 | 3 | pub mod fst; 4 | pub mod slice; 5 | pub mod trie; 6 | -------------------------------------------------------------------------------- /benches/tables/slice/jamo_short_name.rs: -------------------------------------------------------------------------------- 1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: 2 | // 3 | // ucd-generate jamo-short-name ucd-16.0.0 4 | // 5 | // Unicode version: 16.0.0. 6 | // 7 | // ucd-generate 0.2.15 is available on crates.io. 8 | 9 | pub const JAMO_SHORT_NAME: &'static [(u32, &'static str)] = &[ 10 | (4352, "G"), 11 | (4353, "GG"), 12 | (4354, "N"), 13 | (4355, "D"), 14 | (4356, "DD"), 15 | (4357, "R"), 16 | (4358, "M"), 17 | (4359, "B"), 18 | (4360, "BB"), 19 | (4361, "S"), 20 | (4362, "SS"), 21 | (4363, ""), 22 | (4364, "J"), 23 | (4365, "JJ"), 24 | (4366, "C"), 25 | (4367, "K"), 26 | (4368, "T"), 27 | (4369, "P"), 28 | (4370, "H"), 29 | (4449, "A"), 30 | (4450, "AE"), 31 | (4451, "YA"), 32 | (4452, "YAE"), 33 | (4453, "EO"), 34 | (4454, "E"), 35 | (4455, "YEO"), 36 | (4456, "YE"), 37 | (4457, "O"), 38 | (4458, "WA"), 39 | (4459, "WAE"), 40 | (4460, "OE"), 41 | (4461, "YO"), 42 | (4462, "U"), 43 | (4463, "WEO"), 44 | (4464, "WE"), 45 | (4465, "WI"), 46 | (4466, "YU"), 47 | (4467, "EU"), 48 | (4468, "YI"), 49 | (4469, "I"), 50 | (4520, "G"), 51 | (4521, "GG"), 52 | (4522, "GS"), 53 | (4523, "N"), 54 | (4524, "NJ"), 55 | (4525, "NH"), 56 | (4526, "D"), 57 | (4527, "L"), 58 | (4528, "LG"), 59 | (4529, "LM"), 60 | (4530, "LB"), 61 | (4531, "LS"), 62 | (4532, "LT"), 63 | (4533, 
"LP"), 64 | (4534, "LH"), 65 | (4535, "M"), 66 | (4536, "B"), 67 | (4537, "BS"), 68 | (4538, "S"), 69 | (4539, "SS"), 70 | (4540, "NG"), 71 | (4541, "J"), 72 | (4542, "C"), 73 | (4543, "K"), 74 | (4544, "T"), 75 | (4545, "P"), 76 | (4546, "H"), 77 | ]; 78 | -------------------------------------------------------------------------------- /benches/tables/slice/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod general_categories; 2 | pub mod general_category; 3 | pub mod jamo_short_name; 4 | pub mod names; 5 | -------------------------------------------------------------------------------- /benches/tables/trie/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod general_categories; 2 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 79 2 | use_small_heuristics = "max" 3 | -------------------------------------------------------------------------------- /scripts/generate-unicode-tables: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # This script is responsible for generating some of the Unicode tables used 4 | # in this project. It's a little weird here since ucd-generate is itself 5 | # used to build the tables used by some of its dependencies. However, most 6 | # tables are generated only for use in tests and benchmarks. 
#
# Usage is simple, first download the Unicode data:
#
#   $ mkdir ucd
#   $ cd ucd
#   $ curl -LO https://www.unicode.org/Public/zipped/14.0.0/UCD.zip
#   $ unzip UCD.zip
#
# And then run this script from the root of this repository by pointing it at
# the data directory downloaded above:
#
#   $ ./scripts/generate-unicode-tables path/to/ucd

# Exactly one operand is required: the directory holding the unzipped UCD.
# The count is compared numerically and the usage line names the operand.
if [ "$#" -ne 1 ]; then
  echo "Usage: $(basename "$0") <path/to/ucd>" >&2
  exit 1
fi
ucddir="$1"

# FST-backed tables; ucd-generate writes both the .fst and .rs files into
# the directory given by --fst-dir.
echo "generating FSTs for benchmarks"
out="benches/tables/fst"
ucd-generate general-category \
  "$ucddir" --exclude unassigned --enum --fst-dir "$out"
ucd-generate jamo-short-name \
  "$ucddir" --fst-dir "$out"
ucd-generate names "$ucddir" \
  --no-aliases --no-hangul --no-ideograph --fst-dir "$out"

# Sorted-slice tables are printed to stdout, so redirect each into place.
echo "generating sorted slices for benchmarks"
out="benches/tables/slice"
ucd-generate general-category \
  "$ucddir" --exclude unassigned > "$out/general_categories.rs"
ucd-generate general-category \
  "$ucddir" --exclude unassigned --enum > "$out/general_category.rs"
ucd-generate jamo-short-name \
  "$ucddir" > "$out/jamo_short_name.rs"
ucd-generate names \
  "$ucddir" --no-aliases --no-hangul --no-ideograph > "$out/names.rs"

echo "generating tables for ucd-trie benchmarks"
out="benches/tables/trie"
ucd-generate general-category \
  "$ucddir" --exclude unassigned --trie-set > "$out/general_categories.rs"

echo "generating tables for ucd-trie tests"
out="ucd-trie/src"
ucd-generate general-category "$ucddir" > "$out/general_category.rs"

echo "generating tables for ucd-util tests"
out="ucd-util/src/unicode_tables"
ucd-generate property-names "$ucddir" > "$out/property_names.rs"
ucd-generate property-values "$ucddir" > "$out/property_values.rs"
ucd-generate jamo-short-name "$ucddir" > "$out/jamo_short_name.rs"

cargo +stable fmt 62 | -------------------------------------------------------------------------------- /src/age.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, BTreeSet}; 2 | 3 | use ucd_parse::{self, Age}; 4 | 5 | use crate::args::ArgMatches; 6 | use crate::error::Result; 7 | use crate::util::PropertyValues; 8 | 9 | pub fn command(args: ArgMatches<'_>) -> Result<()> { 10 | let dir = args.ucd_dir()?; 11 | let propvals = PropertyValues::from_ucd_dir(&dir)?; 12 | let ages: Vec = ucd_parse::parse(&dir)?; 13 | 14 | let mut by_age: BTreeMap> = BTreeMap::new(); 15 | for x in &ages { 16 | let agename = propvals.canonical("Age", &x.age)?; 17 | by_age 18 | .entry(agename) 19 | .or_insert(BTreeSet::new()) 20 | .extend(x.codepoints.into_iter().map(|c| c.value())); 21 | } 22 | 23 | let mut wtr = args.writer("age")?; 24 | wtr.names(by_age.keys())?; 25 | for (name, set) in by_age { 26 | wtr.ranges(&name, &set)?; 27 | } 28 | Ok(()) 29 | } 30 | -------------------------------------------------------------------------------- /src/args.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::OsStr; 2 | use std::ops; 3 | 4 | use clap; 5 | 6 | use crate::error::Result; 7 | use crate::util::Filter; 8 | use crate::writer::{Writer, WriterBuilder}; 9 | 10 | /// Wraps clap matches and provides convenient accessors to various parameters. 
11 | pub struct ArgMatches<'a>(&'a clap::ArgMatches<'a>); 12 | 13 | impl<'a> ops::Deref for ArgMatches<'a> { 14 | type Target = clap::ArgMatches<'a>; 15 | fn deref(&self) -> &clap::ArgMatches<'a> { 16 | &self.0 17 | } 18 | } 19 | 20 | impl<'a> ArgMatches<'a> { 21 | pub fn new(matches: &'a clap::ArgMatches<'a>) -> ArgMatches<'a> { 22 | ArgMatches(matches) 23 | } 24 | 25 | pub fn ucd_dir(&self) -> Result<&OsStr> { 26 | match self.value_of_os("ucd-dir") { 27 | Some(x) => Ok(x), 28 | None => err!("missing UCD directory"), 29 | } 30 | } 31 | 32 | pub fn writer(&self, name: &str) -> Result { 33 | let mut builder = WriterBuilder::new(name); 34 | builder 35 | .columns(79) 36 | .char_literals(self.is_present("chars")) 37 | .trie_set(self.is_present("trie-set")); 38 | // Some of the functionality of this crate works with a partial ucd 39 | // directory. 40 | match ucd_parse::ucd_directory_version(self.ucd_dir()?) { 41 | Ok((major, minor, patch)) => { 42 | builder.ucd_version(major, minor, patch) 43 | } 44 | Err(e) => return err!("Failed to determine UCD version: {}", e), 45 | }; 46 | match self.value_of_os("fst-dir") { 47 | None => Ok(builder.from_stdout()), 48 | Some(x) => builder.from_fst_dir(x), 49 | } 50 | } 51 | 52 | pub fn name(&self) -> &str { 53 | self.value_of("name").expect("the name of the table") 54 | } 55 | 56 | /// Create a new include/exclude filter command line arguments. 57 | /// 58 | /// The given canonicalization function is applied to each element in 59 | /// each of the include/exclude lists provided by the end user. 
60 | pub fn filter Result>( 61 | &self, 62 | mut canonicalize: F, 63 | ) -> Result { 64 | Filter::new( 65 | self.value_of_lossy("include").map(|s| s.to_string()), 66 | self.value_of_lossy("exclude").map(|s| s.to_string()), 67 | |name| canonicalize(name), 68 | ) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/bidi_class.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, BTreeSet}; 2 | 3 | use ucd_parse::{self, CoreProperty, UnicodeData}; 4 | 5 | use crate::args::ArgMatches; 6 | use crate::error::Result; 7 | use crate::util::{print_property_values, PropertyValues}; 8 | 9 | // Bidi Class (listing UnicodeData.txt, field 4: see UAX #44: 10 | // https://www.unicode.org/reports/tr44/) Unlike other properties, unassigned 11 | // code points in blocks reserved for right-to-left scripts are given either 12 | // types R or AL. 13 | // 14 | // The unassigned code points that default to AL are in the ranges: 15 | // [\u0600-\u07BF \u0860-\u086F \u08A0-\u08FF \uFB50-\uFDCF \uFDF0-\uFDFF 16 | // \uFE70-\uFEFF \U00010D00-\U00010D3F \U00010F30-\U00010F6F 17 | // \U0001EC70-\U0001ECBF \U0001ED00-\U0001ED4F \U0001EE00-\U0001EEFF] 18 | // 19 | // This includes code points in the Arabic, Syriac, and Thaana blocks, 20 | // among others. 21 | // 22 | // The unassigned code points that default to R are in the ranges: 23 | // [\u0590-\u05FF \u07C0-\u085F \u0870-\u089F \uFB1D-\uFB4F 24 | // \U00010800-\U00010CFF \U00010D40-\U00010F2F \U00010F70-\U00010FFF 25 | // \U0001E800-\U0001EC6F \U0001ECC0-\U0001ECFF \U0001ED50-\U0001EDFF 26 | // \U0001EF00-\U0001EFFF] 27 | // 28 | // This includes code points in the Hebrew, NKo, and Phoenician blocks, 29 | // among others. 30 | // 31 | // The unassigned code points that default to ET are in the range: 32 | // [\u20A0-\u20CF] 33 | // 34 | // This consists of code points in the Currency Symbols block. 
35 | // 36 | // The unassigned code points that default to BN have one of the following 37 | // properties: 38 | // Default_Ignorable_Code_Point 39 | // Noncharacter_Code_Point 40 | // 41 | // For all other cases: 42 | // 43 | // All code points not explicitly listed for Bidi_Class 44 | // have the value Left_To_Right (L). 45 | const DEFAULT_CLASS_ASSIGNMENTS: &[(u32, u32, &str)] = &[ 46 | (0x0600, 0x07BF, "AL"), 47 | (0x0860, 0x086F, "AL"), 48 | (0x08A0, 0x08FF, "AL"), 49 | (0xFB50, 0xFDCF, "AL"), 50 | (0xFDF0, 0xFDFF, "AL"), 51 | (0xFE70, 0xFEFF, "AL"), 52 | (0x00010D00, 0x00010D3F, "AL"), 53 | (0x00010F30, 0x00010F6F, "AL"), 54 | (0x0001EC70, 0x0001ECBF, "AL"), 55 | (0x0001ED00, 0x0001ED4F, "AL"), 56 | (0x0001EE00, 0x0001EEFF, "AL"), 57 | (0x0590, 0x05FF, "R"), 58 | (0x07C0, 0x085F, "R"), 59 | (0x0870, 0x089F, "R"), 60 | (0xFB1D, 0xFB4F, "R"), 61 | (0x00010800, 0x00010CFF, "R"), 62 | (0x00010D40, 0x00010F2F, "R"), 63 | (0x00010F70, 0x00010FFF, "R"), 64 | (0x0001E800, 0x0001EC6F, "R"), 65 | (0x0001ECC0, 0x0001ECFF, "R"), 66 | (0x0001ED50, 0x0001EDFF, "R"), 67 | (0x0001EF00, 0x0001EFFF, "R"), 68 | (0x20A0, 0x20CF, "ET"), 69 | ]; 70 | 71 | pub fn command(args: ArgMatches<'_>) -> Result<()> { 72 | let dir = args.ucd_dir()?; 73 | let propvals = PropertyValues::from_ucd_dir(&dir)?; 74 | let rows: Vec = ucd_parse::parse(&dir)?; 75 | let core_prop: Vec = ucd_parse::parse(&dir)?; 76 | let use_short_names = args.is_present("short-names"); 77 | let bidi_class_name = |short_name: &str| { 78 | if use_short_names { 79 | Ok(short_name.to_string()) 80 | } else { 81 | propvals.canonical("bc", short_name) 82 | } 83 | }; 84 | 85 | // If we were tasked with listing the available categories, then do that 86 | // and quit. 87 | if args.is_present("list-classes") { 88 | return print_property_values(&propvals, "Bidi_Class"); 89 | } 90 | 91 | // Collect each bidi class into an ordered set. 
92 | let mut by_type: BTreeMap> = BTreeMap::new(); 93 | let mut assigned = BTreeSet::new(); 94 | for row in rows { 95 | assigned.insert(row.codepoint.value()); 96 | let bc = bidi_class_name(&row.bidi_class)?; 97 | by_type 98 | .entry(bc) 99 | .or_insert(BTreeSet::new()) 100 | .insert(row.codepoint.value()); 101 | } 102 | 103 | // Process the codepoints that are not listed as per the notes in 104 | // DerivedBidiClass.txt (UCD 12.1). See comment on 105 | // DEFAULT_CLASS_ASSIGNMENTS for more detail. 106 | // 107 | // Collect the codepoints that may default to BN 108 | let mut maybe_boundary_neutral = BTreeSet::new(); 109 | for x in &core_prop { 110 | if &x.property == "Default_Ignorable_Code_Point" 111 | || &x.property == "Noncharacter_Code_Point" 112 | { 113 | maybe_boundary_neutral 114 | .extend(x.codepoints.into_iter().map(|c| c.value())); 115 | } 116 | } 117 | 118 | // Process unassigned codepoints 119 | let left_to_right_name = bidi_class_name("L")?; 120 | let boundary_neutral_name = bidi_class_name("BN")?; 121 | for cp in 0..=0x10FFFF { 122 | if assigned.contains(&cp) { 123 | continue; 124 | } 125 | // Check if this code point is in the default Bidi classes 126 | if let Some(class) = lookup_unassigned(cp, DEFAULT_CLASS_ASSIGNMENTS) { 127 | let name = bidi_class_name(class)?; 128 | by_type.get_mut(&name).unwrap().insert(cp); 129 | } else if maybe_boundary_neutral.contains(&cp) { 130 | by_type.get_mut(&boundary_neutral_name).unwrap().insert(cp); 131 | } else { 132 | // All others get assigned Left_To_Right 133 | by_type.get_mut(&left_to_right_name).unwrap().insert(cp); 134 | } 135 | } 136 | 137 | let mut wtr = args.writer("bidi_class")?; 138 | if args.is_present("enum") { 139 | wtr.ranges_to_enum(args.name(), &by_type)?; 140 | } else if args.is_present("rust-enum") { 141 | let variants = by_type.keys().map(String::as_str).collect::>(); 142 | wtr.ranges_to_rust_enum(args.name(), &variants, &by_type)?; 143 | } else if args.is_present("combined") { 144 | 
wtr.ranges_to_combined(args.name(), &by_type)?; 145 | } else { 146 | wtr.names(by_type.keys())?; 147 | for (name, set) in by_type { 148 | wtr.ranges(&name, &set)?; 149 | } 150 | } 151 | 152 | Ok(()) 153 | } 154 | 155 | /// Look up a code point in the unassigned default Bidi classes. 156 | fn lookup_unassigned<'a>( 157 | codepoint: u32, 158 | defaults: &[(u32, u32, &'a str)], 159 | ) -> Option<&'a str> { 160 | defaults 161 | .iter() 162 | .find(|&&(start, end, _)| start <= codepoint && codepoint <= end) 163 | .map(|&(_, _, bidi_class)| bidi_class) 164 | } 165 | -------------------------------------------------------------------------------- /src/bidi_mirroring_glyph.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | 3 | use ucd_parse::{self, BidiMirroring}; 4 | 5 | use crate::args::ArgMatches; 6 | use crate::error::Result; 7 | 8 | pub fn command(args: ArgMatches<'_>) -> Result<()> { 9 | let dir = args.ucd_dir()?; 10 | let rows: Vec = ucd_parse::parse(&dir)?; 11 | 12 | let table: BTreeMap<_, _> = rows 13 | .into_iter() 14 | .map(|mapping| { 15 | (mapping.codepoint.value(), mapping.bidi_mirroring_glyph.value()) 16 | }) 17 | .collect(); 18 | 19 | let mut wtr = args.writer("bidi_mirroring_glyph")?; 20 | if args.is_present("rust-match") { 21 | wtr.codepoint_to_codepoint_fn(args.name(), &table)?; 22 | } else { 23 | wtr.codepoint_to_codepoint(args.name(), &table)?; 24 | } 25 | 26 | Ok(()) 27 | } 28 | -------------------------------------------------------------------------------- /src/brk.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, BTreeSet}; 2 | 3 | use ucd_parse::{self, GraphemeClusterBreak, SentenceBreak, WordBreak}; 4 | 5 | use crate::args::ArgMatches; 6 | use crate::error::Result; 7 | 8 | pub fn grapheme_cluster(args: ArgMatches<'_>) -> Result<()> { 9 | let ucd_dir = args.ucd_dir()?; 10 | let vals: Vec = 
ucd_parse::parse(&ucd_dir)?; 11 | 12 | let mut byval: BTreeMap> = BTreeMap::new(); 13 | for x in &vals { 14 | byval 15 | .entry(x.value.clone()) 16 | .or_insert(BTreeSet::new()) 17 | .extend(x.codepoints.into_iter().map(|c| c.value())); 18 | } 19 | 20 | let mut wtr = args.writer("grapheme_cluster_break")?; 21 | if args.is_present("enum") { 22 | wtr.ranges_to_enum(args.name(), &byval)?; 23 | } else { 24 | wtr.names(byval.keys())?; 25 | for (val, set) in byval { 26 | wtr.ranges(&val, &set)?; 27 | } 28 | } 29 | Ok(()) 30 | } 31 | 32 | pub fn word(args: ArgMatches<'_>) -> Result<()> { 33 | let ucd_dir = args.ucd_dir()?; 34 | let vals: Vec = ucd_parse::parse(&ucd_dir)?; 35 | 36 | let mut byval: BTreeMap> = BTreeMap::new(); 37 | for x in &vals { 38 | byval 39 | .entry(x.value.clone()) 40 | .or_insert(BTreeSet::new()) 41 | .extend(x.codepoints.into_iter().map(|c| c.value())); 42 | } 43 | 44 | let mut wtr = args.writer("word_break")?; 45 | if args.is_present("enum") { 46 | wtr.ranges_to_enum(args.name(), &byval)?; 47 | } else { 48 | wtr.names(byval.keys())?; 49 | for (val, set) in byval { 50 | wtr.ranges(&val, &set)?; 51 | } 52 | } 53 | Ok(()) 54 | } 55 | 56 | pub fn sentence(args: ArgMatches<'_>) -> Result<()> { 57 | let ucd_dir = args.ucd_dir()?; 58 | let vals: Vec = ucd_parse::parse(&ucd_dir)?; 59 | 60 | let mut byval: BTreeMap> = BTreeMap::new(); 61 | for x in &vals { 62 | byval 63 | .entry(x.value.clone()) 64 | .or_insert(BTreeSet::new()) 65 | .extend(x.codepoints.into_iter().map(|c| c.value())); 66 | } 67 | 68 | let mut wtr = args.writer("sentence_break")?; 69 | if args.is_present("enum") { 70 | wtr.ranges_to_enum(args.name(), &byval)?; 71 | } else { 72 | wtr.names(byval.keys())?; 73 | for (val, set) in byval { 74 | wtr.ranges(&val, &set)?; 75 | } 76 | } 77 | Ok(()) 78 | } 79 | -------------------------------------------------------------------------------- /src/canonical_combining_class.rs: 
-------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, BTreeSet}; 2 | 3 | use ucd_parse::{self, UnicodeData}; 4 | 5 | use crate::args::ArgMatches; 6 | use crate::error::Result; 7 | use crate::util::{print_property_values, PropertyValues}; 8 | 9 | pub fn command(args: ArgMatches<'_>) -> Result<()> { 10 | let dir = args.ucd_dir()?; 11 | let propvals = PropertyValues::from_ucd_dir(&dir)?; 12 | let rows: Vec = ucd_parse::parse(&dir)?; 13 | let ccc_name = |ccc: u8| { 14 | propvals.canonical("canonicalcombiningclass", &ccc.to_string()) 15 | }; 16 | 17 | // If we were tasked with listing the available categories, then do that 18 | // and quit. 19 | if args.is_present("list-classes") { 20 | return print_property_values(&propvals, "Canonical_Combining_Class"); 21 | } 22 | 23 | // Collect each canonical combining class into an ordered set. 24 | let mut name_map: BTreeMap = BTreeMap::new(); 25 | let mut by_name: BTreeMap> = BTreeMap::new(); 26 | let mut assigned = BTreeSet::new(); 27 | for row in rows { 28 | assigned.insert(row.codepoint.value()); 29 | let ccc_value = row.canonical_combining_class; 30 | let ccc_name = ccc_name(ccc_value)?; 31 | name_map.entry(ccc_value as isize).or_insert_with(|| ccc_name.clone()); 32 | by_name 33 | .entry(ccc_name) 34 | .or_insert(BTreeSet::new()) 35 | .insert(row.codepoint.value()); 36 | } 37 | 38 | // Process the codepoints that are not listed as per the note in 39 | // DerivedCombiningClass.txt (UCD 13.0): 40 | // 41 | // - All code points not explicitly listed for Canonical_Combining_Class 42 | // have the value Not_Reordered (0). 
43 | let not_reordered_name = ccc_name(0)?; 44 | for cp in 0..=0x10FFFF { 45 | if !assigned.contains(&cp) { 46 | by_name.get_mut(¬_reordered_name).unwrap().insert(cp); 47 | } 48 | } 49 | 50 | let mut wtr = args.writer("canonical_combining_class")?; 51 | if args.is_present("enum") { 52 | wtr.ranges_to_enum(args.name(), &by_name)?; 53 | } else if args.is_present("rust-enum") { 54 | wtr.ranges_to_rust_enum_with_custom_discriminants( 55 | args.name(), 56 | &name_map, 57 | &by_name, 58 | )?; 59 | } else { 60 | wtr.names(by_name.keys())?; 61 | for (name, set) in by_name { 62 | wtr.ranges(&name, &set)?; 63 | } 64 | } 65 | 66 | Ok(()) 67 | } 68 | -------------------------------------------------------------------------------- /src/case_folding.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, BTreeSet}; 2 | 3 | use ucd_parse::{self, CaseFold, CaseStatus, Codepoint}; 4 | 5 | use crate::args::ArgMatches; 6 | use crate::error::Result; 7 | 8 | pub fn command(args: ArgMatches<'_>) -> Result<()> { 9 | let dir = args.ucd_dir()?; 10 | let case_folding: BTreeMap> = 11 | ucd_parse::parse_many_by_codepoint(dir)?; 12 | 13 | let compute_all_pairs = 14 | args.is_present("all-pairs") || args.is_present("circular"); 15 | let mut wtr = args.writer("case_folding_simple")?; 16 | let mut table = BTreeMap::new(); 17 | let mut table_all = BTreeMap::new(); 18 | for (&cp, case_folds) in &case_folding { 19 | let mapping_cp = match choose_fold(case_folds, false)? 
{ 20 | None => continue, 21 | Some(case_fold) => &case_fold.mapping, 22 | }; 23 | assert_eq!(mapping_cp.len(), 1); 24 | 25 | let (a, b) = (cp.value(), mapping_cp[0].value()); 26 | table.insert(a, b); 27 | if compute_all_pairs { 28 | table_all.entry(a).or_insert(BTreeSet::new()).insert(b); 29 | table_all.entry(b).or_insert(BTreeSet::new()).insert(a); 30 | } 31 | } 32 | if compute_all_pairs { 33 | let mut exhaustive = BTreeMap::new(); 34 | for (&k, vs) in &table_all { 35 | exhaustive.insert(k, BTreeSet::new()); 36 | for &v in vs { 37 | exhaustive.get_mut(&k).unwrap().insert(v); 38 | if let Some(vs2) = table_all.get(&v) { 39 | for &v2 in vs2 { 40 | exhaustive.get_mut(&k).unwrap().insert(v2); 41 | } 42 | } 43 | } 44 | exhaustive.get_mut(&k).unwrap().remove(&k); 45 | } 46 | table_all = exhaustive; 47 | } 48 | 49 | if args.is_present("circular") { 50 | let mut equiv = BTreeMap::new(); 51 | let mut seen = BTreeSet::new(); 52 | for (&k, vs) in &table_all { 53 | if vs.is_empty() || seen.contains(&k) { 54 | continue; 55 | } 56 | seen.insert(k); 57 | for &v in vs { 58 | seen.insert(v); 59 | } 60 | 61 | let mut cur = *vs.iter().last().unwrap(); 62 | for &v in Some(&k).into_iter().chain(vs.iter()) { 63 | assert!(!equiv.contains_key(&cur)); 64 | equiv.insert(cur, v); 65 | cur = v; 66 | } 67 | } 68 | wtr.codepoint_to_codepoint(args.name(), &equiv)?; 69 | } else if args.is_present("all-pairs") { 70 | let flat = args.is_present("flat-table"); 71 | wtr.multi_codepoint_to_codepoint(args.name(), &table_all, flat)?; 72 | } else { 73 | wtr.codepoint_to_codepoint(args.name(), &table)?; 74 | } 75 | Ok(()) 76 | } 77 | 78 | /// Given a sequence of case fold mappings, choose exactly one mapping based 79 | /// on the mapping's status. If `full` is true, then full case mappings are 80 | /// selected, otherwise simple case mappings are selected. If there are 81 | /// multiple valid choices, then an error is returned. 
82 | fn choose_fold( 83 | case_folds: &[CaseFold], 84 | full: bool, 85 | ) -> Result> { 86 | let mut choice = None; 87 | for case_fold in case_folds { 88 | if (full && case_fold.status == CaseStatus::Full) 89 | || (!full && case_fold.status == CaseStatus::Simple) 90 | || case_fold.status == CaseStatus::Common 91 | { 92 | if choice.is_some() { 93 | return err!("found multiple matches from: {:?}", case_folds); 94 | } 95 | choice = Some(case_fold); 96 | } 97 | } 98 | Ok(choice) 99 | } 100 | -------------------------------------------------------------------------------- /src/case_mapping.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | 3 | use ucd_parse::{SpecialCaseMapping, UcdFile, UnicodeData}; 4 | 5 | use crate::args::ArgMatches; 6 | use crate::error::Result; 7 | 8 | pub fn command(args: ArgMatches<'_>) -> Result<()> { 9 | let dir = args.ucd_dir()?; 10 | let mut lower_map: BTreeMap> = BTreeMap::new(); 11 | let mut upper_map: BTreeMap> = BTreeMap::new(); 12 | let mut title_map: BTreeMap> = BTreeMap::new(); 13 | let mut wtr = args.writer("case_mapping")?; 14 | for item in UnicodeData::from_dir(dir)? 
{ 15 | let item = item?; 16 | if let Some(lower) = item.simple_lowercase_mapping { 17 | lower_map.insert(item.codepoint.value(), vec![lower.value()]); 18 | } 19 | if let Some(upper) = item.simple_uppercase_mapping { 20 | upper_map.insert(item.codepoint.value(), vec![upper.value()]); 21 | } 22 | if let Some(title) = item.simple_titlecase_mapping { 23 | title_map.insert(item.codepoint.value(), vec![title.value()]); 24 | } 25 | } 26 | 27 | let includes = if let Some(what) = args.values_of("include") { 28 | what.clone().collect::>() 29 | } else { 30 | vec!["LOWER", "UPPER", "TITLE"] 31 | }; 32 | 33 | if args.is_present("simple") { 34 | let upper_map = 35 | upper_map.into_iter().map(|(k, v)| (k, v[0])).collect(); 36 | let lower_map = 37 | lower_map.into_iter().map(|(k, v)| (k, v[0])).collect(); 38 | let title_map = 39 | title_map.into_iter().map(|(k, v)| (k, v[0])).collect(); 40 | 41 | for name in includes { 42 | match name { 43 | "LOWER" => wtr.codepoint_to_codepoint("LOWER", &lower_map)?, 44 | "UPPER" => wtr.codepoint_to_codepoint("UPPER", &upper_map)?, 45 | "TITlE" => wtr.codepoint_to_codepoint("TITlE", &title_map)?, 46 | _ => (), 47 | } 48 | } 49 | } else { 50 | for special in SpecialCaseMapping::from_dir(&dir)? { 51 | let special = special?; 52 | if !special.conditions.is_empty() { 53 | // There should probably be an option to output these too, but 54 | // I'm not sure how they're typically used... 
55 | continue; 56 | } 57 | if !special.lowercase.is_empty() { 58 | lower_map.insert( 59 | special.codepoint.value(), 60 | special.lowercase.iter().map(|v| v.value()).collect(), 61 | ); 62 | } 63 | if !special.uppercase.is_empty() { 64 | upper_map.insert( 65 | special.codepoint.value(), 66 | special.uppercase.iter().map(|v| v.value()).collect(), 67 | ); 68 | } 69 | if !special.titlecase.is_empty() { 70 | title_map.insert( 71 | special.codepoint.value(), 72 | special.titlecase.iter().map(|v| v.value()).collect(), 73 | ); 74 | } 75 | } 76 | let flat = args.is_present("flat-table"); 77 | for name in includes { 78 | match name { 79 | "LOWER" => { 80 | wtr.codepoint_to_codepoints("LOWER", &lower_map, flat)? 81 | } 82 | "UPPER" => { 83 | wtr.codepoint_to_codepoints("UPPER", &upper_map, flat)? 84 | } 85 | "TITLE" => { 86 | wtr.codepoint_to_codepoints("TITLE", &title_map, flat)? 87 | } 88 | _ => (), 89 | } 90 | } 91 | } 92 | Ok(()) 93 | } 94 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use std::error; 2 | use std::fmt; 3 | use std::io; 4 | use std::result; 5 | 6 | use clap; 7 | use fst; 8 | use ucd_parse; 9 | use ucd_trie; 10 | 11 | pub type Result = result::Result; 12 | 13 | #[derive(Debug)] 14 | pub enum Error { 15 | Io(io::Error), 16 | Clap(clap::Error), 17 | Other(String), 18 | } 19 | 20 | impl Error { 21 | pub fn is_broken_pipe(&self) -> bool { 22 | match *self { 23 | Error::Io(ref e) if e.kind() == io::ErrorKind::BrokenPipe => true, 24 | _ => false, 25 | } 26 | } 27 | } 28 | 29 | impl error::Error for Error { 30 | fn source(&self) -> Option<&(dyn error::Error + 'static)> { 31 | match *self { 32 | Error::Io(ref err) => Some(err), 33 | Error::Clap(ref err) => Some(err), 34 | _ => None, 35 | } 36 | } 37 | } 38 | 39 | impl fmt::Display for Error { 40 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 41 | match *self { 42 | 
Error::Io(ref err) => err.fmt(f), 43 | Error::Clap(ref err) => err.fmt(f), 44 | Error::Other(ref msg) => write!(f, "{}", msg), 45 | } 46 | } 47 | } 48 | 49 | impl From for Error { 50 | fn from(err: io::Error) -> Error { 51 | Error::Io(err) 52 | } 53 | } 54 | 55 | impl From for Error { 56 | fn from(err: clap::Error) -> Error { 57 | Error::Clap(err) 58 | } 59 | } 60 | 61 | impl From for Error { 62 | fn from(err: fst::Error) -> Error { 63 | Error::Other(err.to_string()) 64 | } 65 | } 66 | 67 | impl From for Error { 68 | fn from(err: ucd_parse::Error) -> Error { 69 | Error::Other(err.to_string()) 70 | } 71 | } 72 | 73 | impl From for Error { 74 | fn from(err: ucd_trie::Error) -> Error { 75 | Error::Other(err.to_string()) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/general_category.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, BTreeSet}; 2 | 3 | use ucd_parse::{self, UnicodeData, UnicodeDataExpander}; 4 | 5 | use crate::args::ArgMatches; 6 | use crate::error::Result; 7 | use crate::util::{print_property_values, PropertyValues}; 8 | 9 | pub fn command(args: ArgMatches<'_>) -> Result<()> { 10 | let dir = args.ucd_dir()?; 11 | let propvals = PropertyValues::from_ucd_dir(&dir)?; 12 | let filter = args.filter(|name| propvals.canonical("gc", name))?; 13 | let unexpanded = ucd_parse::parse(&dir)?; 14 | 15 | // If we were tasked with listing the available categories, then do that 16 | // and quit. 17 | if args.is_present("list-categories") { 18 | return print_property_values(&propvals, "General_Category"); 19 | } 20 | 21 | let mut bycat = expand_into_categories(unexpanded, &propvals)?; 22 | 23 | // As another special case, collect all "related" groups of categories. 
24 | // But don't do this when printing an enumeration, because in an 25 | // enumeration each codepoint should belong to exactly one category, which 26 | // is not true if we include related categories. 27 | if !args.is_present("enum") && !args.is_present("rust-enum") { 28 | for (name, set) in related(&propvals, &bycat) { 29 | if filter.contains(&name) { 30 | bycat.insert(name, set); 31 | } 32 | } 33 | } 34 | // Finally, filter out any sets according to what the user asked for. 35 | let bycat = bycat 36 | .into_iter() 37 | .filter(|&(ref name, _)| filter.contains(name)) 38 | .collect(); 39 | 40 | let mut wtr = args.writer("general_category")?; 41 | if args.is_present("enum") { 42 | wtr.ranges_to_enum(args.name(), &bycat)?; 43 | } else if args.is_present("rust-enum") { 44 | let variants = bycat.keys().map(String::as_str).collect::>(); 45 | wtr.ranges_to_rust_enum(args.name(), &variants, &bycat)?; 46 | } else if args.is_present("combined") { 47 | wtr.ranges_to_combined(args.name(), &bycat)?; 48 | } else { 49 | wtr.names(bycat.keys().filter(|n| filter.contains(n)))?; 50 | for (name, set) in bycat { 51 | wtr.ranges(&name, &set)?; 52 | } 53 | } 54 | 55 | Ok(()) 56 | } 57 | 58 | /// Expand a list of UnicodeData rows and group by category. 59 | pub fn expand_into_categories( 60 | unexpanded: Vec, 61 | propvals: &PropertyValues, 62 | ) -> Result>> { 63 | // Expand all of our UnicodeData rows. This results in one big list of 64 | // all assigned codepoints. 65 | let rows: Vec<_> = UnicodeDataExpander::new(unexpanded).collect(); 66 | 67 | // Collect each general category into an ordered set. 
68 | let mut bycat: BTreeMap> = BTreeMap::new(); 69 | let mut assigned = BTreeSet::new(); 70 | for row in rows { 71 | assigned.insert(row.codepoint.value()); 72 | let gc = propvals.canonical("gc", &row.general_category)?.to_string(); 73 | bycat 74 | .entry(gc) 75 | .or_insert(BTreeSet::new()) 76 | .insert(row.codepoint.value()); 77 | } 78 | // As a special case, collect all unassigned codepoints. 79 | let unassigned_name = propvals.canonical("gc", "unassigned")?.to_string(); 80 | bycat.insert(unassigned_name.clone(), BTreeSet::new()); 81 | for cp in 0..=0x10FFFF { 82 | if !assigned.contains(&cp) { 83 | bycat.get_mut(&unassigned_name).unwrap().insert(cp); 84 | } 85 | } 86 | 87 | Ok(bycat) 88 | } 89 | 90 | /// Related returns a set of sets of codepoints corresponding to the "related" 91 | /// groups of categories defined by Table 12 in UAX#44 S5.7.1. 92 | /// 93 | /// The given `cats` should correspond to the normal set of general categories, 94 | /// keyed by canonical name. 95 | fn related( 96 | propvals: &PropertyValues, 97 | cats: &BTreeMap>, 98 | ) -> BTreeMap> { 99 | let mut sets = BTreeMap::new(); 100 | for (name, components) in related_categories(propvals) { 101 | let set = sets.entry(name).or_insert(BTreeSet::new()); 102 | for component in components { 103 | set.extend(cats[&component].iter().cloned()); 104 | } 105 | } 106 | sets 107 | } 108 | 109 | /// Return all groups of "related" general categories. 
110 | fn related_categories( 111 | propvals: &PropertyValues, 112 | ) -> Vec<(String, Vec)> { 113 | // canonicalize a gencat property value 114 | let c = |name: &str| -> String { 115 | propvals.canonical("gc", name).unwrap().to_string() 116 | }; 117 | vec![ 118 | (c("Cased_Letter"), vec![c("lu"), c("ll"), c("lt")]), 119 | (c("Letter"), vec![c("lu"), c("ll"), c("lt"), c("lm"), c("lo")]), 120 | (c("Mark"), vec![c("mn"), c("mc"), c("me")]), 121 | (c("Number"), vec![c("nd"), c("nl"), c("no")]), 122 | ( 123 | c("Punctuation"), 124 | vec![ 125 | c("pc"), 126 | c("pd"), 127 | c("ps"), 128 | c("pe"), 129 | c("pi"), 130 | c("pf"), 131 | c("po"), 132 | ], 133 | ), 134 | (c("Symbol"), vec![c("sm"), c("sc"), c("sk"), c("so")]), 135 | (c("Separator"), vec![c("zs"), c("zl"), c("zp")]), 136 | (c("Other"), vec![c("cc"), c("cf"), c("cs"), c("co"), c("cn")]), 137 | ] 138 | } 139 | -------------------------------------------------------------------------------- /src/jamo_short_name.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::BTreeMap, path::Path}; 2 | 3 | use ucd_parse::{self, JamoShortName}; 4 | 5 | use crate::args::ArgMatches; 6 | use crate::error::Result; 7 | 8 | pub fn command(args: ArgMatches<'_>) -> Result<()> { 9 | let dir = args.ucd_dir()?; 10 | let map = jamo_map(&Path::new(dir))?; 11 | let mut wtr = args.writer("jamo_short_name")?; 12 | wtr.codepoint_to_string(args.name(), &map)?; 13 | Ok(()) 14 | } 15 | 16 | fn jamo_map(dir: &Path) -> Result> { 17 | let jamo_map = ucd_parse::parse_by_codepoint::<_, JamoShortName>(dir)?; 18 | let mut map = BTreeMap::new(); 19 | for (cp, jamo) in jamo_map { 20 | map.insert(cp.value(), jamo.name); 21 | } 22 | Ok(map) 23 | } 24 | 25 | pub fn table(dir: &Path) -> Result> { 26 | Ok(jamo_map(dir)?.into_iter().collect()) 27 | } 28 | 29 | pub fn table_ref<'a>(table: &'a [(u32, String)]) -> Vec<(u32, &'a str)> { 30 | table.iter().map(|&(cp, ref name)| (cp, &**name)).collect() 31 | } 
32 | -------------------------------------------------------------------------------- /src/joining_type.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, BTreeSet}; 2 | 3 | use ucd_parse::{self, ArabicShaping}; 4 | 5 | use crate::args::ArgMatches; 6 | use crate::error::Result; 7 | use crate::general_category; 8 | use crate::util::PropertyValues; 9 | 10 | pub fn command(args: ArgMatches<'_>) -> Result<()> { 11 | let dir = args.ucd_dir()?; 12 | let propvals = PropertyValues::from_ucd_dir(&dir)?; 13 | let rows: Vec = ucd_parse::parse(&dir)?; 14 | let unexpanded_gc = ucd_parse::parse(&dir)?; 15 | let gc = 16 | general_category::expand_into_categories(unexpanded_gc, &propvals)?; 17 | 18 | // Collect each joining type into an ordered set. 19 | let mut by_type: BTreeMap> = BTreeMap::new(); 20 | let mut assigned = BTreeSet::new(); 21 | for row in rows { 22 | assigned.insert(row.codepoint.value()); 23 | let jt = 24 | propvals.canonical("jt", row.joining_type.as_str())?.to_string(); 25 | by_type 26 | .entry(jt) 27 | .or_insert(BTreeSet::new()) 28 | .insert(row.codepoint.value()); 29 | } 30 | // Process the codepoints that are not listed as per the note in 31 | // ArabicShaping.txt: 32 | // 33 | // Note: Code points that are not explicitly listed in this file are either 34 | // of joining type T or U: 35 | // 36 | // - Those that are not explicitly listed and that are of General Category 37 | // Mn, Me, or Cf have joining type T. 38 | // - All others not explicitly listed have joining type U. 
39 | let transparent_name = propvals.canonical("jt", "transparent")?; 40 | let non_joining_name = propvals.canonical("jt", "non_joining")?; 41 | let transparent_categories = ["Mn", "Me", "Cf"] 42 | .iter() 43 | .map(|cat| propvals.canonical("gc", cat).map(|name| &gc[&name])) 44 | .collect::>>()?; 45 | for cp in 0..=0x10FFFF { 46 | if assigned.contains(&cp) { 47 | continue; 48 | } 49 | // See if the code point is in any of the general categories that 50 | // map to the Transparent joining type. Otherwise add to the 51 | // Non_Joining type. 52 | if transparent_categories.iter().any(|cat| cat.contains(&cp)) { 53 | by_type.get_mut(&transparent_name).unwrap().insert(cp); 54 | } else { 55 | by_type.get_mut(&non_joining_name).unwrap().insert(cp); 56 | } 57 | } 58 | 59 | let mut wtr = args.writer("joining_type")?; 60 | if args.is_present("enum") { 61 | wtr.ranges_to_enum(args.name(), &by_type)?; 62 | } else if args.is_present("rust-enum") { 63 | let variants = by_type.keys().map(String::as_str).collect::>(); 64 | wtr.ranges_to_rust_enum(args.name(), &variants, &by_type)?; 65 | } else if args.is_present("combined") { 66 | wtr.ranges_to_combined(args.name(), &by_type)?; 67 | } else { 68 | wtr.names(by_type.keys())?; 69 | for (name, set) in by_type { 70 | wtr.ranges(&name, &set)?; 71 | } 72 | } 73 | 74 | Ok(()) 75 | } 76 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Write}; 2 | use std::process; 3 | 4 | use ucd_parse::{UcdFile, UnicodeData}; 5 | 6 | use crate::args::ArgMatches; 7 | use crate::error::Result; 8 | 9 | macro_rules! 
err { 10 | ($($tt:tt)*) => { 11 | Err(crate::error::Error::Other(format!($($tt)*))) 12 | } 13 | } 14 | 15 | mod app; 16 | mod args; 17 | mod error; 18 | mod util; 19 | mod writer; 20 | 21 | mod age; 22 | mod bidi_class; 23 | mod bidi_mirroring_glyph; 24 | mod brk; 25 | mod canonical_combining_class; 26 | mod case_folding; 27 | mod case_mapping; 28 | mod general_category; 29 | mod jamo_short_name; 30 | mod joining_type; 31 | mod names; 32 | mod property_bool; 33 | mod script; 34 | 35 | fn main() { 36 | if let Err(err) = run() { 37 | if err.is_broken_pipe() { 38 | process::exit(0); 39 | } 40 | eprintln!("{}", err); 41 | process::exit(1); 42 | } 43 | } 44 | 45 | fn run() -> Result<()> { 46 | let matches = app::app().get_matches(); 47 | match matches.subcommand() { 48 | ("bidi-class", Some(m)) => bidi_class::command(ArgMatches::new(m)), 49 | ("bidi-mirroring-glyph", Some(m)) => { 50 | bidi_mirroring_glyph::command(ArgMatches::new(m)) 51 | } 52 | ("canonical-combining-class", Some(m)) => { 53 | canonical_combining_class::command(ArgMatches::new(m)) 54 | } 55 | ("general-category", Some(m)) => { 56 | general_category::command(ArgMatches::new(m)) 57 | } 58 | ("script", Some(m)) => script::command_script(ArgMatches::new(m)), 59 | ("script-extension", Some(m)) => { 60 | script::command_script_extension(ArgMatches::new(m)) 61 | } 62 | ("property-bool", Some(m)) => { 63 | property_bool::command(ArgMatches::new(m)) 64 | } 65 | ("age", Some(m)) => age::command(ArgMatches::new(m)), 66 | ("perl-word", Some(m)) => { 67 | property_bool::command_perl_word(ArgMatches::new(m)) 68 | } 69 | ("jamo-short-name", Some(m)) => { 70 | jamo_short_name::command(ArgMatches::new(m)) 71 | } 72 | ("joining-type", Some(m)) => joining_type::command(ArgMatches::new(m)), 73 | ("names", Some(m)) => names::command(ArgMatches::new(m)), 74 | ("property-names", Some(m)) => cmd_property_names(ArgMatches::new(m)), 75 | ("property-values", Some(m)) => { 76 | cmd_property_values(ArgMatches::new(m)) 77 | } 78 | 
("case-folding-simple", Some(m)) => { 79 | case_folding::command(ArgMatches::new(m)) 80 | } 81 | ("case-mapping", Some(m)) => case_mapping::command(ArgMatches::new(m)), 82 | ("grapheme-cluster-break", Some(m)) => { 83 | brk::grapheme_cluster(ArgMatches::new(m)) 84 | } 85 | ("word-break", Some(m)) => brk::word(ArgMatches::new(m)), 86 | ("sentence-break", Some(m)) => brk::sentence(ArgMatches::new(m)), 87 | ("test-unicode-data", Some(m)) => { 88 | cmd_test_unicode_data(ArgMatches::new(m)) 89 | } 90 | ("", _) => { 91 | app::app().print_help()?; 92 | println!(""); 93 | Ok(()) 94 | } 95 | (unknown, _) => err!("unrecognized command: {}", unknown), 96 | } 97 | } 98 | 99 | fn cmd_property_names(args: ArgMatches<'_>) -> Result<()> { 100 | use crate::util::PropertyNames; 101 | use std::collections::BTreeMap; 102 | 103 | let dir = args.ucd_dir()?; 104 | let names = PropertyNames::from_ucd_dir(&dir)?; 105 | let filter = args.filter(|name| names.canonical(name))?; 106 | 107 | let mut actual_names = BTreeMap::new(); 108 | for (k, v) in &names.0 { 109 | if filter.contains(v) { 110 | actual_names.insert(k.to_string(), v.to_string()); 111 | } 112 | } 113 | let mut wtr = args.writer("property_names")?; 114 | wtr.string_to_string(args.name(), &actual_names)?; 115 | Ok(()) 116 | } 117 | 118 | fn cmd_property_values(args: ArgMatches<'_>) -> Result<()> { 119 | use crate::util::{PropertyNames, PropertyValues}; 120 | use std::collections::BTreeMap; 121 | 122 | let dir = args.ucd_dir()?; 123 | let values = PropertyValues::from_ucd_dir(&dir)?; 124 | let names = PropertyNames::from_ucd_dir(&dir)?; 125 | let filter = args.filter(|name| names.canonical(name))?; 126 | 127 | let mut actual_values = BTreeMap::new(); 128 | for (k, v) in &values.value { 129 | if filter.contains(k) { 130 | actual_values.insert(k.to_string(), v.clone()); 131 | } 132 | } 133 | let mut wtr = args.writer("property_values")?; 134 | wtr.string_to_string_to_string(args.name(), &actual_values)?; 135 | Ok(()) 136 | } 137 | 
138 | fn cmd_test_unicode_data(args: ArgMatches<'_>) -> Result<()> { 139 | let dir = args.ucd_dir()?; 140 | let mut stdout = io::stdout(); 141 | for result in UnicodeData::from_dir(dir)? { 142 | let x: UnicodeData = result?; 143 | writeln!(stdout, "{}", x)?; 144 | } 145 | Ok(()) 146 | } 147 | -------------------------------------------------------------------------------- /src/names.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::BTreeMap, path::Path}; 2 | 3 | use ucd_parse::{self, Codepoint, NameAlias, UnicodeData}; 4 | use ucd_util; 5 | 6 | use crate::args::ArgMatches; 7 | use crate::error::Result; 8 | 9 | pub fn command(args: ArgMatches<'_>) -> Result<()> { 10 | let dir = args.ucd_dir()?; 11 | let jamo_short_name_map = crate::jamo_short_name::table(Path::new(dir))?; 12 | let data = ucd_parse::parse_by_codepoint(&dir)?; 13 | let aliases = if args.is_present("no-aliases") { 14 | None 15 | } else { 16 | Some(ucd_parse::parse_many_by_codepoint(&dir)?) 17 | }; 18 | let mut names = names_to_codepoint( 19 | &data, 20 | &aliases, 21 | &crate::jamo_short_name::table_ref(&jamo_short_name_map), 22 | !args.is_present("no-ideograph"), 23 | !args.is_present("no-hangul"), 24 | ); 25 | if args.is_present("normalize") { 26 | names = names 27 | .into_iter() 28 | .map(|(mut name, tagged)| { 29 | ucd_util::character_name_normalize(&mut name); 30 | (name, tagged) 31 | }) 32 | .collect(); 33 | } 34 | 35 | let mut wtr = args.writer("names")?; 36 | if args.is_present("tagged") { 37 | let mut map = BTreeMap::new(); 38 | for (name, (tag, cp)) in names { 39 | map.insert(name, tag.with_codepoint(cp)); 40 | } 41 | wtr.string_to_u64(args.name(), &map)?; 42 | } else { 43 | let mut map = BTreeMap::new(); 44 | for (name, (_, cp)) in names { 45 | map.insert(name, cp); 46 | } 47 | wtr.string_to_codepoint(args.name(), &map)?; 48 | } 49 | Ok(()) 50 | } 51 | 52 | /// A tag indicating how the name of a codepoint was found. 
53 | /// 54 | /// When a name has both an algorithmically generated name and an 55 | /// explicit/alias name, then the algorithmically generated tag is preferred. 56 | #[derive(Debug)] 57 | enum NameTag { 58 | /// The name is listed explicitly in UnicodeData.txt. 59 | Explicit, 60 | /// The name was taken from NameAliases.txt. 61 | Alias, 62 | /// The name is an algorithmically generated Hangul syllable. 63 | Hangul, 64 | /// The name is an algorithmically generated ideograph. 65 | Ideograph, 66 | } 67 | 68 | impl NameTag { 69 | fn with_codepoint(&self, cp: u32) -> u64 { 70 | use self::NameTag::*; 71 | match *self { 72 | Explicit => (1 << 33) | (cp as u64), 73 | Alias => (1 << 34) | (cp as u64), 74 | Hangul => (1 << 35) | (cp as u64), 75 | Ideograph => (1 << 36) | (cp as u64), 76 | } 77 | } 78 | } 79 | 80 | /// Build one big map in memory from every possible name of a character to its 81 | /// corresponding codepoint. One codepoint may be pointed to by multiple names. 82 | /// 83 | /// The return value maps each name to its corresponding codepoint, along with 84 | /// a tag associated with how that mapping was generated. 85 | fn names_to_codepoint( 86 | data: &BTreeMap, 87 | aliases: &Option>>, 88 | jamo_short_name_table: &[(u32, &str)], 89 | ideograph: bool, 90 | hangul: bool, 91 | ) -> BTreeMap { 92 | // The order in which we write names is important, since there is some 93 | // overlap. 94 | // 95 | // Basically, if a character has a "canonical" name that is equivalent to 96 | // one of its aliases, then overwrite the alias with the canonical name. 97 | // The effect is that its tag will be Explicit rather than Alias. 98 | // 99 | // Additionally, write the algorithmically generated names after 100 | // everything, so that even if a algorithmically generated name matches 101 | // an Explicit/Alias name, its tag will indicate that it is generated. 
102 | let mut map = BTreeMap::new(); 103 | if let Some(ref alias_map) = *aliases { 104 | for (cp, aliases) in alias_map { 105 | for name_alias in aliases { 106 | let v = (NameTag::Alias, cp.value()); 107 | map.insert(name_alias.alias.clone(), v); 108 | } 109 | } 110 | } 111 | for (cp, datum) in data { 112 | let isnull = datum.name.is_empty() 113 | || (datum.name.starts_with('<') && datum.name.ends_with('>')); 114 | if !isnull { 115 | let v = (NameTag::Explicit, cp.value()); 116 | map.insert(datum.name.clone(), v); 117 | } 118 | } 119 | if ideograph { 120 | for &(start, end) in ucd_util::RANGE_IDEOGRAPH { 121 | for cp in start..end + 1 { 122 | let v = (NameTag::Ideograph, cp); 123 | map.insert(ucd_util::ideograph_name(cp).unwrap(), v); 124 | } 125 | } 126 | } 127 | if hangul { 128 | for &(start, end) in ucd_util::RANGE_HANGUL_SYLLABLE { 129 | for cp in start..end + 1 { 130 | let v = (NameTag::Hangul, cp); 131 | map.insert( 132 | ucd_util::hangul_name(jamo_short_name_table, cp).unwrap(), 133 | v, 134 | ); 135 | } 136 | } 137 | } 138 | map 139 | } 140 | -------------------------------------------------------------------------------- /src/property_bool.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, BTreeSet}; 2 | use std::path::Path; 3 | 4 | use ucd_parse::{ 5 | self, CoreProperty, EmojiProperty, Property, UcdFileByCodepoint, 6 | UnicodeData, UnicodeDataExpander, 7 | }; 8 | 9 | use crate::args::ArgMatches; 10 | use crate::error::Result; 11 | use crate::util::{PropertyNames, PropertyValues}; 12 | 13 | pub fn command(args: ArgMatches<'_>) -> Result<()> { 14 | let dir = args.ucd_dir()?; 15 | let by_name = parse_properties(&dir)?; 16 | let properties = PropertyNames::from_ucd_dir(&dir)?; 17 | let filter = args.filter(|name| properties.canonical(name))?; 18 | 19 | if args.is_present("list-properties") { 20 | for name in by_name.keys() { 21 | println!("{}", name); 22 | } 23 | return Ok(()); 24 | } 25 
| let mut wtr = args.writer("prop_list")?; 26 | wtr.names(by_name.keys().filter(|n| filter.contains(n)))?; 27 | for (name, set) in by_name { 28 | if filter.contains(&name) { 29 | wtr.ranges(&name, &set)?; 30 | } 31 | } 32 | Ok(()) 33 | } 34 | 35 | pub fn command_perl_word(args: ArgMatches<'_>) -> Result<()> { 36 | let dir = args.ucd_dir()?; 37 | let props = parse_properties(&dir)?; 38 | let gencats = parse_general_categories(&dir)?; 39 | 40 | let mut perlword = BTreeSet::new(); 41 | perlword.extend(&props["Alphabetic"]); 42 | perlword.extend(&props["Join_Control"]); 43 | perlword.extend(&gencats["Decimal_Number"]); 44 | perlword.extend(&gencats["Nonspacing_Mark"]); 45 | perlword.extend(&gencats["Enclosing_Mark"]); 46 | perlword.extend(&gencats["Spacing_Mark"]); 47 | perlword.extend(&gencats["Connector_Punctuation"]); 48 | 49 | let mut wtr = args.writer("perl_word")?; 50 | wtr.ranges(args.name(), &perlword)?; 51 | Ok(()) 52 | } 53 | 54 | fn parse_properties>( 55 | ucd_dir: P, 56 | ) -> Result>> { 57 | // TODO: PropList.txt and DerivedCoreProperties.txt cover the majority 58 | // of boolean properties, but UAX44 S5.3 Table 9 lists a smattering of 59 | // others that we should include here as well. (Some will need support in 60 | // ucd-parse, for example, the ones found in DerivedNormalizationProps.txt 61 | // while others, like Bidi_Mirrored, are derived from UnicodeData.txt. 62 | // Even still, others like Composition_Exclusion have their own file 63 | // (CompositionExclusions.txt). 
64 | 65 | let mut by_name: BTreeMap> = BTreeMap::new(); 66 | 67 | let prop_list: Vec = ucd_parse::parse(&ucd_dir)?; 68 | for x in &prop_list { 69 | by_name 70 | .entry(x.property.clone()) 71 | .or_insert(BTreeSet::new()) 72 | .extend(x.codepoints.into_iter().map(|c| c.value())); 73 | } 74 | 75 | let core_prop: Vec = ucd_parse::parse(&ucd_dir)?; 76 | for x in &core_prop { 77 | by_name 78 | .entry(x.property.clone()) 79 | .or_insert(BTreeSet::new()) 80 | .extend(x.codepoints.into_iter().map(|c| c.value())); 81 | } 82 | 83 | // Add Bidi_Mirrored 84 | let unicode_data: Vec = ucd_parse::parse(&ucd_dir)?; 85 | let bidi_mirrored = 86 | unicode_data.iter().fold(BTreeSet::new(), |mut set, x| { 87 | if x.bidi_mirrored { 88 | set.extend(x.codepoints().into_iter().map(|c| c.value())) 89 | } 90 | set 91 | }); 92 | by_name.insert("Bidi_Mirrored".to_string(), bidi_mirrored); 93 | 94 | // Since emoji-data.txt isn't parse of the normal UCD download, don't 95 | // die if it doesn't exist. But emit a helpful warning message. 96 | let emoji_prop: Vec = match ucd_parse::parse(&ucd_dir) { 97 | Ok(props) => props, 98 | Err(err) => match *err.kind() { 99 | ucd_parse::ErrorKind::Io(_) => { 100 | eprintln!( 101 | "{}. skipping emoji properties. \ 102 | emoji-data.txt is included in UCD 13.0.0 and newer, and \ 103 | can be downloaded from https://unicode.org/Public/emoji/ \ 104 | for older releases.", 105 | err, 106 | ); 107 | vec![] 108 | } 109 | _ => return Err(From::from(err)), 110 | }, 111 | }; 112 | for x in &emoji_prop { 113 | by_name 114 | .entry(x.property.clone()) 115 | .or_insert(BTreeSet::new()) 116 | .extend(x.codepoints.into_iter().map(|c| c.value())); 117 | } 118 | Ok(by_name) 119 | } 120 | 121 | fn parse_general_categories>( 122 | ucd_dir: P, 123 | ) -> Result>> { 124 | let propvals = PropertyValues::from_ucd_dir(&ucd_dir)?; 125 | let unexpanded = ucd_parse::parse(&ucd_dir)?; 126 | // Expand all of our UnicodeData rows. 
This results in one big list of 127 | // all assigned codepoints. 128 | let rows: Vec<_> = UnicodeDataExpander::new(unexpanded).collect(); 129 | 130 | // Collect each general category into an ordered set. 131 | let mut bycat: BTreeMap> = BTreeMap::new(); 132 | for row in rows { 133 | let gc = propvals.canonical("gc", &row.general_category)?.to_string(); 134 | bycat 135 | .entry(gc) 136 | .or_insert(BTreeSet::new()) 137 | .insert(row.codepoint.value()); 138 | } 139 | Ok(bycat) 140 | } 141 | -------------------------------------------------------------------------------- /src/script.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, BTreeSet}; 2 | 3 | use ucd_parse::{self, Script, ScriptExtension}; 4 | 5 | use crate::args::ArgMatches; 6 | use crate::error::Result; 7 | use crate::util::{print_property_values, PropertyValues}; 8 | 9 | pub fn command_script(args: ArgMatches<'_>) -> Result<()> { 10 | let dir = args.ucd_dir()?; 11 | let propvals = PropertyValues::from_ucd_dir(&dir)?; 12 | let filter = args.filter(|name| propvals.canonical("Script", name))?; 13 | 14 | if args.is_present("list-scripts") { 15 | return print_property_values(&propvals, "Script"); 16 | } 17 | 18 | let mut by_name: BTreeMap> = BTreeMap::new(); 19 | let scripts: Vec