├── .github
    ├── FUNDING.yml
    └── workflows
    │   └── ci.yml
├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
├── benches
    ├── bench.rs
    └── tables
    │   ├── fst
    │       ├── general_category.fst
    │       ├── general_category.rs
    │       ├── jamo_short_name.fst
    │       ├── jamo_short_name.rs
    │       ├── mod.rs
    │       ├── names.fst
    │       └── names.rs
    │   ├── mod.rs
    │   ├── slice
    │       ├── general_categories.rs
    │       ├── general_category.rs
    │       ├── jamo_short_name.rs
    │       ├── mod.rs
    │       └── names.rs
    │   └── trie
    │       ├── general_categories.rs
    │       └── mod.rs
├── rustfmt.toml
├── scripts
    └── generate-unicode-tables
├── src
    ├── age.rs
    ├── app.rs
    ├── args.rs
    ├── bidi_class.rs
    ├── bidi_mirroring_glyph.rs
    ├── brk.rs
    ├── canonical_combining_class.rs
    ├── case_folding.rs
    ├── case_mapping.rs
    ├── error.rs
    ├── general_category.rs
    ├── jamo_short_name.rs
    ├── joining_type.rs
    ├── main.rs
    ├── names.rs
    ├── property_bool.rs
    ├── script.rs
    ├── util.rs
    └── writer.rs
├── ucd-parse
    ├── Cargo.toml
    ├── LICENSE-APACHE
    ├── LICENSE-MIT
    ├── README.md
    └── src
    │   ├── age.rs
    │   ├── arabic_shaping.rs
    │   ├── bidi_mirroring_glyph.rs
    │   ├── case_folding.rs
    │   ├── common.rs
    │   ├── core_properties.rs
    │   ├── derived_normalization_properties.rs
    │   ├── east_asian_width.rs
    │   ├── emoji_properties.rs
    │   ├── error.rs
    │   ├── extracted
    │       ├── derived_bidi_class.rs
    │       ├── derived_binary_properties.rs
    │       ├── derived_combining_class.rs
    │       ├── derived_decomposition_type.rs
    │       ├── derived_east_asian_width.rs
    │       ├── derived_general_category.rs
    │       ├── derived_joining_group.rs
    │       ├── derived_joining_type.rs
    │       ├── derived_line_break.rs
    │       ├── derived_name.rs
    │       ├── derived_numeric_type.rs
    │       ├── derived_numeric_values.rs
    │       └── mod.rs
    │   ├── grapheme_cluster_break.rs
    │   ├── jamo_short_name.rs
    │   ├── lib.rs
    │   ├── line_break.rs
    │   ├── name_aliases.rs
    │   ├── prop_list.rs
    │   ├── property_aliases.rs
    │   ├── property_value_aliases.rs
    │   ├── script_extensions.rs
    │   ├── scripts.rs
    │   ├── sentence_break.rs
    │   ├── special_casing.rs
    │   ├── unicode_data.rs
    │   └── word_break.rs
├── ucd-trie
    ├── Cargo.toml
    ├── LICENSE-APACHE
    ├── LICENSE-MIT
    ├── README.md
    ├── benches
    │   └── bench.rs
    └── src
    │   ├── general_category.rs
    │   ├── lib.rs
    │   └── owned.rs
└── ucd-util
    ├── Cargo.toml
    ├── LICENSE-APACHE
    ├── LICENSE-MIT
    ├── LICENSE-UNICODE
    ├── README.md
    └── src
        ├── hangul.rs
        ├── ideograph.rs
        ├── lib.rs
        ├── name.rs
        ├── property.rs
        └── unicode_tables
            ├── jamo_short_name.rs
            ├── mod.rs
            ├── property_names.rs
            └── property_values.rs


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [BurntSushi]
2 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: ci
 2 | on:
 3 |   pull_request:
 4 |   push:
 5 |     branches:
 6 |     - master
 7 |   schedule:
 8 |     - cron: '00 01 * * *'
 9 | jobs:
10 |   test:
11 |     name: test
12 |     runs-on: ${{ matrix.os }}
13 |     strategy:
14 |       matrix:
15 |         # The docs seem to suggest that we can have a matrix with just an
16 |         # include directive, but it results in a "matrix must define at least
17 |         # one vector" error in the CI system.
18 |         build: [pinned, stable, beta, nightly]
19 |         include:
20 |         - build: pinned
21 |           os: ubuntu-latest
22 |           rust: 1.70.0
23 |         - build: stable
24 |           os: ubuntu-latest
25 |           rust: stable
26 |         - build: beta
27 |           os: ubuntu-latest
28 |           rust: beta
29 |         - build: nightly
30 |           os: ubuntu-latest
31 |           rust: nightly
32 |     steps:
33 |     - name: Checkout repository
34 |       uses: actions/checkout@v3
35 |     - name: Install Rust
36 |       uses: dtolnay/rust-toolchain@master
37 |       with:
38 |         toolchain: ${{ matrix.rust }}
39 |     - run: cargo build --all --verbose
40 |     - run: cargo doc --all --verbose
41 |     - run: cargo test --all --verbose
42 |     - if: matrix.build == 'nightly'
43 |       run: cargo bench --all --verbose --no-run
44 | 
45 |   rustfmt:
46 |     name: rustfmt
47 |     runs-on: ubuntu-latest
48 |     steps:
49 |     - name: Checkout repository
50 |       uses: actions/checkout@v3
51 |     - name: Install Rust
52 |       uses: dtolnay/rust-toolchain@master
53 |       with:
54 |         toolchain: stable
55 |         components: rustfmt
56 |     - name: Check formatting
57 |       run: |
58 |         cargo fmt -- --check
59 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .*.swp
 2 | tags
 3 | target
 4 | /x
 5 | perf.data*
 6 | /tmp
 7 | /ucd-parse/Cargo.lock
 8 | /ucd-trie/Cargo.lock
 9 | /ucd-util/Cargo.lock
10 | 


--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
 1 | # This file is automatically @generated by Cargo.
 2 | # It is not intended for manual editing.
 3 | version = 3
 4 | 
 5 | [[package]]
 6 | name = "bitflags"
 7 | version = "1.3.2"
 8 | source = "registry+https://github.com/rust-lang/crates.io-index"
 9 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
10 | 
11 | [[package]]
12 | name = "clap"
13 | version = "2.34.0"
14 | source = "registry+https://github.com/rust-lang/crates.io-index"
15 | checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
16 | dependencies = [
17 |  "bitflags",
18 |  "strsim",
19 |  "textwrap",
20 |  "unicode-width",
21 | ]
22 | 
23 | [[package]]
24 | name = "fst"
25 | version = "0.4.7"
26 | source = "registry+https://github.com/rust-lang/crates.io-index"
27 | checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a"
28 | 
29 | [[package]]
30 | name = "once_cell"
31 | version = "1.18.0"
32 | source = "registry+https://github.com/rust-lang/crates.io-index"
33 | checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
34 | 
35 | [[package]]
36 | name = "regex-lite"
37 | version = "0.1.0"
38 | source = "registry+https://github.com/rust-lang/crates.io-index"
39 | checksum = "f96ede7f386ba6e910092e7ccdc04176cface62abebea07ed6b46d870ed95ca2"
40 | 
41 | [[package]]
42 | name = "strsim"
43 | version = "0.8.0"
44 | source = "registry+https://github.com/rust-lang/crates.io-index"
45 | checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
46 | 
47 | [[package]]
48 | name = "textwrap"
49 | version = "0.11.0"
50 | source = "registry+https://github.com/rust-lang/crates.io-index"
51 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
52 | dependencies = [
53 |  "unicode-width",
54 | ]
55 | 
56 | [[package]]
57 | name = "ucd-generate"
58 | version = "0.3.1"
59 | dependencies = [
60 |  "clap",
61 |  "fst",
62 |  "once_cell",
63 |  "ucd-parse",
64 |  "ucd-trie",
65 |  "ucd-util",
66 | ]
67 | 
68 | [[package]]
69 | name = "ucd-parse"
70 | version = "0.1.13"
71 | dependencies = [
72 |  "regex-lite",
73 | ]
74 | 
75 | [[package]]
76 | name = "ucd-trie"
77 | version = "0.1.7"
78 | dependencies = [
79 |  "once_cell",
80 | ]
81 | 
82 | [[package]]
83 | name = "ucd-util"
84 | version = "0.2.2"
85 | 
86 | [[package]]
87 | name = "unicode-width"
88 | version = "0.1.10"
89 | source = "registry+https://github.com/rust-lang/crates.io-index"
90 | checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
91 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ucd-generate"
 3 | version = "0.3.1"  #:version
 4 | authors = ["Andrew Gallant <jamslam@gmail.com>"]
 5 | description = """
 6 | A program for generating packed representations of the Unicode character
 7 | database that can be efficiently searched.
 8 | """
 9 | documentation = "https://github.com/BurntSushi/ucd-generate"
10 | homepage = "https://github.com/BurntSushi/ucd-generate"
11 | repository = "https://github.com/BurntSushi/ucd-generate"
12 | readme = "README.md"
13 | keywords = ["unicode", "generate", "character", "table", "fst"]
14 | license = "MIT OR Apache-2.0"
15 | categories = ["text-processing", "internationalization"]
16 | edition = "2021"
17 | rust-version = "1.70"
18 | 
19 | [workspace]
20 | members = ["ucd-parse", "ucd-trie", "ucd-util"]
21 | 
22 | [[bin]]
23 | bench = false
24 | path = "src/main.rs"
25 | name = "ucd-generate"
26 | 
27 | [dependencies]
28 | fst = "0.4.0"
29 | ucd-parse = { version = "0.1.10", path = "ucd-parse" }
30 | ucd-trie = { version = "0.1.7", path = "ucd-trie" }
31 | ucd-util = { version = "0.2.2", path = "ucd-util"  }
32 | 
33 | [dependencies.clap]
34 | version = "2.34.0"
35 | default-features = false
36 | features = ["suggestions"]
37 | 
38 | [dev-dependencies]
39 | once_cell = "1"
40 | 
41 | [profile.release]
42 | debug = true
43 | 


--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014 The Rust Project Developers
 2 | 
 3 | Permission is hereby granted, free of charge, to any
 4 | person obtaining a copy of this software and associated
 5 | documentation files (the "Software"), to deal in the
 6 | Software without restriction, including without
 7 | limitation the rights to use, copy, modify, merge,
 8 | publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software
10 | is furnished to do so, subject to the following
11 | conditions:
12 | 
13 | The above copyright notice and this permission notice
14 | shall be included in all copies or substantial portions
15 | of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | DEALINGS IN THE SOFTWARE.
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ucd-generate
  2 | ============
  3 | A command line tool to generate Unicode tables in Rust source code. Tables
  4 | can typically be generated in one of three formats: a sorted sequence of
  5 | character ranges, a
  6 | [finite state transducer](https://github.com/BurntSushi/fst)
  7 | or a compressed trie. Full support for name canonicalization is also provided.
  8 | 
  9 | [![Build status](https://github.com/BurntSushi/ucd-generate/workflows/ci/badge.svg)](https://github.com/BurntSushi/ucd-generate/actions)
 10 | [![crates.io](https://img.shields.io/crates/v/ucd-generate.svg)](https://crates.io/crates/ucd-generate)
 11 | 
 12 | 
 13 | ### Installation
 14 | 
 15 | Since this is mostly intended as a developer tool for use while writing Rust
 16 | programs, the principal method of installation is from crates.io:
 17 | 
 18 | ```
 19 | $ cargo install ucd-generate
 20 | $ ucd-generate --help
 21 | ```
 22 | 
 23 | 
 24 | ### Example
 25 | 
 26 | This somewhat arbitrary example shows the output of generating tables for
 27 | three properties, and representing them as normal Rust character literal
 28 | ranges.
 29 | 
 30 | To run the example, you need to download the Unicode Character Database (UCD):
 31 | 
 32 | ```
 33 | $ mkdir /tmp/ucd-15.0.0
 34 | $ cd /tmp/ucd-15.0.0
 35 | $ curl -LO https://www.unicode.org/Public/zipped/15.0.0/UCD.zip
 36 | $ unzip UCD.zip
 37 | ```
 38 | 
 39 | Note that prior to version 13.0.0, `emoji/emoji-data.txt` file was distributed
 40 | separate from the UCD bundle. For these versions, you may need to download this
 41 | file from https://unicode.org/Public/emoji in order to generate certain tables.
 42 | 
 43 | Now tell `ucd-generate` what you want and point it to the directory created
 44 | above:
 45 | 
 46 | ```
 47 | $ ucd-generate property-bool /tmp/ucd-15.0.0 --include Hyphen,Dash,Quotation_Mark --chars
 48 | ```
 49 | 
 50 | And the output, which is valid Rust source code:
 51 | 
 52 | ```rust
 53 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
 54 | //
 55 | //   ucd-generate property-bool /tmp/ucd-15.0.0 --include Hyphen,Dash,Quotation_Mark --chars
 56 | //
 57 | // Unicode version: 15.0.0.
 58 | //
 59 | // ucd-generate 0.2.10 is available on crates.io.
 60 | 
 61 | pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
 62 |   ("Dash", DASH), ("Hyphen", HYPHEN), ("Quotation_Mark", QUOTATION_MARK),
 63 | ];
 64 | 
 65 | pub const DASH: &'static [(char, char)] = &[
 66 |   ('-', '-'), ('֊', '֊'), ('־', '־'), ('᐀', '᐀'), ('᠆', '᠆'),
 67 |   ('‐', '―'), ('⁓', '⁓'), ('⁻', '⁻'), ('₋', '₋'),
 68 |   ('−', '−'), ('⸗', '⸗'), ('⸚', '⸚'), ('⸺', '⸻'),
 69 |   ('⹀', '⹀'), ('\u{2e5d}', '\u{2e5d}'), ('〜', '〜'), ('〰', '〰'),
 70 |   ('゠', '゠'), ('︱', '︲'), ('﹘', '﹘'), ('﹣', '﹣'),
 71 |   ('－', '－'), ('𐺭', '𐺭'),
 72 | ];
 73 | 
 74 | pub const HYPHEN: &'static [(char, char)] = &[
 75 |   ('-', '-'), ('\u{ad}', '\u{ad}'), ('֊', '֊'), ('᠆', '᠆'),
 76 |   ('‐', '‑'), ('⸗', '⸗'), ('・', '・'), ('﹣', '﹣'),
 77 |   ('－', '－'), ('･', '･'),
 78 | ];
 79 | 
 80 | pub const QUOTATION_MARK: &'static [(char, char)] = &[
 81 |   ('"', '"'), ('\'', '\''), ('«', '«'), ('»', '»'), ('‘', '‟'),
 82 |   ('‹', '›'), ('⹂', '⹂'), ('「', '』'), ('〝', '〟'),
 83 |   ('﹁', '﹄'), ('＂', '＂'), ('＇', '＇'), ('｢', '｣'),
 84 | ];
 85 | ```
 86 | 
 87 | ### DFA serialization
 88 | 
 89 | Prior to `ucd-generate 0.3.0`, the sub-commands `dfa` and `regex` could be used
 90 | to build fully compiled DFAs, serialize them to disk and generate Rust code for
 91 | deserializing them. This functionality was removed in `0.3.0` and
 92 | [moved to `regex-cli`](https://github.com/rust-lang/regex/tree/master/regex-cli#example-serialize-a-dfa).
 93 | 
 94 | ### Contributing
 95 | 
 96 | The `ucd-generate` tool doesn't have any specific design goals, other than to
 97 | collect Unicode table generation tasks. If you need `ucd-generate` to do
 98 | something and it's reasonably straight-forward to add, then just submitting a
 99 | PR would be great. Otherwise, file an issue and we can discuss.
100 | 
101 | ### Alternatives
102 | 
103 | The primary alternative is [ICU4X](https://github.com/unicode-org/icu4x). If
104 | you have sophisticated Unicode requirements, it is almost certainly what you
105 | should be using.
106 | 
107 | It's beyond the scope of this README to do a full comparison between ICU4X
108 | and `ucd-generate`, but I think the shortest way to describe it is that
109 | `ucd-generate` is _simplistic_, with all the associated positive and negative
110 | connotations that come with that word.
111 | 
112 | 
113 | ### Future work
114 | 
115 | This tool is by no means exhaustive. In fact, it's not even close to
116 | exhaustive, and it may never be. For the most part, the intent of this tool
117 | is to collect virtually any kind of Unicode generation task. In theory, this
118 | would ideally replace the hodge podge collection of Python programs that is
119 | responsible for this task today in various Unicode crates.
120 | 
121 | Here are some examples of future work that would be welcome:
122 | 
123 | * More support for parsing things in the UCD.
124 | * More generation tasks based on things in the UCD.
125 | * More output formats, especially for reducing binary size.
126 | 
127 | 
128 | ### Sub-crates
129 | 
130 | This repository is home to three sub-crates:
131 | 
132 | * [`ucd-parse`](ucd-parse) - A crate for parsing UCD files into
133 |   structured data.
134 | * [`ucd-trie`](ucd-trie) - Auxiliary type for handling the trie
135 |   set table format emitted by `ucd-generate`. This crate has a `no_std` mode.
136 | * [`ucd-util`](ucd-util) - A purposely small crate for Unicode
137 |   auxiliary functions. This includes things like symbol or character name
138 |   canonicalization, ideograph name generation and helper functions for
139 |   searching property name and value tables.
140 | 
141 | 
142 | ### License
143 | 
144 | This project is licensed under either of
145 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
146 |    https://www.apache.org/licenses/LICENSE-2.0)
147 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
148 |    https://opensource.org/licenses/MIT)
149 | at your option.
150 | 


--------------------------------------------------------------------------------
/benches/bench.rs:
--------------------------------------------------------------------------------
  1 | #![feature(test)]
  2 | 
  3 | extern crate test;
  4 | 
  5 | use std::cmp::Ordering;
  6 | 
  7 | use test::Bencher;
  8 | 
  9 | mod tables;
 10 | 
 11 | fn u32_key(cp: u32) -> [u8; 4] {
 12 |     u32::to_be_bytes(cp) // most significant byte first
 13 | }
 14 | 
 15 | /// Binary-searches the slice table for the first field of each entry in turn.
 16 | #[bench]
 17 | fn general_category_slice(b: &mut Bencher) {
 18 |     let table = tables::slice::general_category::GENERAL_CATEGORY;
 19 |     let mut cursor = 0;
 20 |     b.iter(|| {
 21 |         let (needle, _, expected) = table[cursor];
 22 |         cursor = (cursor + 1) % table.len();
 23 |         let hit = table.binary_search_by(|&(lo, hi, _)| {
 24 |             if needle < lo {
 25 |                 Ordering::Greater
 26 |             } else if needle > hi {
 27 |                 Ordering::Less
 28 |             } else {
 29 |                 Ordering::Equal
 30 |             }
 31 |         });
 32 |         let (_, _, got) = table[hit.unwrap()];
 33 |         assert_eq!(got, expected);
 34 |     });
 35 | }
 36 | 
 37 | /// Looks up the first field of each slice-table entry in the FST map and
 38 | /// checks the stored value against that entry's third field.
 39 | #[bench]
 40 | fn general_category_fst(b: &mut Bencher) {
 41 |     let table = tables::slice::general_category::GENERAL_CATEGORY;
 42 |     let map = &tables::fst::general_category::GENERAL_CATEGORY;
 43 |     let mut cursor = 0;
 44 |     b.iter(|| {
 45 |         let (needle, _, expected) = table[cursor];
 46 |         cursor = (cursor + 1) % table.len();
 47 |         let raw = map.get(u32_key(needle)).unwrap();
 48 |         assert_eq!(raw as u8, expected);
 49 |     });
 50 | }
 51 | 
 52 | /// Binary-searches `LOWERCASE_LETTER` for the first field of each entry.
 53 | #[bench]
 54 | fn lowercase_letter_slice(b: &mut Bencher) {
 55 |     let table = tables::slice::general_categories::LOWERCASE_LETTER;
 56 |     let mut cursor = 0;
 57 |     b.iter(|| {
 58 |         let (needle, _) = table[cursor];
 59 |         cursor = (cursor + 1) % table.len();
 60 |         let pos = table.binary_search_by(|&(lo, hi)| {
 61 |             if needle < lo {
 62 |                 Ordering::Greater
 63 |             } else if needle > hi {
 64 |                 Ordering::Less
 65 |             } else {
 66 |                 Ordering::Equal
 67 |             }
 68 |         });
 69 |         assert!(pos.is_ok());
 70 |     });
 71 | }
 72 | 
 73 | #[bench]
 74 | fn lowercase_letter_trie(b: &mut Bencher) {
 75 |     let ranges = tables::slice::general_categories::LOWERCASE_LETTER;
 76 |     let trie = tables::trie::general_categories::LOWERCASE_LETTER;
 77 |     let mut cursor = 0;
 78 |     b.iter(|| {
 79 |         let query = ranges[cursor].0; // queries come from the slice table
 80 |         cursor = (cursor + 1) % ranges.len();
 81 |         assert!(trie.contains_u32(query));
 82 |     });
 83 | }
 84 | 
 85 | /// Binary-searches the `NAMES` slice table by name for each entry in turn.
 86 | #[bench]
 87 | fn names_slice(b: &mut Bencher) {
 88 |     let table = tables::slice::names::NAMES;
 89 |     let mut cursor = 0;
 90 |     b.iter(|| {
 91 |         let (needle, expected) = table[cursor];
 92 |         cursor = (cursor + 1) % table.len();
 93 |         let at = table.binary_search_by_key(&needle, |row| row.0).unwrap();
 94 |         assert_eq!(table[at].1, expected);
 95 |     });
 96 | }
 97 | 
 98 | /// Looks up each name from the slice table in the FST map and checks that
 99 | /// the stored value equals the second field of the same slice-table entry.
100 | #[bench]
101 | fn names_fst(b: &mut Bencher) {
102 |     let table = tables::slice::names::NAMES;
103 |     let map = &tables::fst::names::NAMES;
104 |     let mut cursor = 0;
105 |     b.iter(|| {
106 |         let (needle, expected) = table[cursor];
107 |         cursor = (cursor + 1) % table.len();
108 |         let raw = map.get(needle).unwrap();
109 |         assert_eq!(raw as u32, expected);
110 |     });
111 | }
112 | 
113 | #[bench]
114 | fn jamo_short_name_fst(b: &mut Bencher) {
115 |     let table = tables::slice::jamo_short_name::JAMO_SHORT_NAME;
116 |     let map = &tables::fst::jamo_short_name::JAMO_SHORT_NAME;
117 |     let mut cursor = 0;
118 |     let mut decoded = String::new();
119 |     b.iter(|| {
120 |         let (cp, expected) = table[cursor];
121 |         cursor = (cursor + 1) % table.len();
122 |         // Unpack the name from the value, one byte at a time, low byte first.
123 |         let mut packed = map.get(u32_key(cp)).unwrap();
124 |         decoded.clear();
125 |         while packed != 0 {
126 |             decoded.push((packed & 0xFF) as u8 as char);
127 |             packed >>= 8;
128 |         }
129 |         assert_eq!(decoded, expected);
130 |     });
131 | }
132 | 
133 | /// Binary-searches the `JAMO_SHORT_NAME` slice table by its `u32` key.
134 | #[bench]
135 | fn jamo_short_name_slice(b: &mut Bencher) {
136 |     let table = tables::slice::jamo_short_name::JAMO_SHORT_NAME;
137 |     let mut cursor = 0;
138 |     b.iter(|| {
139 |         let (needle, expected) = table[cursor];
140 |         cursor = (cursor + 1) % table.len();
141 |         let at = table.binary_search_by_key(&needle, |row| row.0).unwrap();
142 |         assert_eq!(table[at].1, expected);
143 |     });
144 | }
145 | 
146 | /// Finds each `JAMO_SHORT_NAME` entry by its `u32` key with a linear scan.
147 | #[bench]
148 | fn jamo_short_name_slice_linear(b: &mut Bencher) {
149 |     let table = tables::slice::jamo_short_name::JAMO_SHORT_NAME;
150 |     let mut cursor = 0;
151 |     b.iter(|| {
152 |         let (needle, expected) = table[cursor];
153 |         cursor = (cursor + 1) % table.len();
154 |         let row = table.iter().find(|row| row.0 == needle).unwrap();
155 |         assert_eq!(row.1, expected);
156 |     });
157 | }
158 | 


--------------------------------------------------------------------------------
/benches/tables/fst/general_category.fst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/ucd-generate/e8fa937f0cac643669dcaf5edac2785b15cab917/benches/tables/fst/general_category.fst


--------------------------------------------------------------------------------
/benches/tables/fst/general_category.rs:
--------------------------------------------------------------------------------
 1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
 2 | //
 3 | //   ucd-generate general-category ucd-16.0.0 --exclude unassigned --enum --fst-dir benches/tables/fst
 4 | //
 5 | // Unicode version: 16.0.0.
 6 | //
 7 | // ucd-generate 0.2.15 is available on crates.io.
 8 | 
 9 | pub const GENERAL_CATEGORY_ENUM: &'static [&'static str] = &[
10 |     "Close_Punctuation",
11 |     "Connector_Punctuation",
12 |     "Control",
13 |     "Currency_Symbol",
14 |     "Dash_Punctuation",
15 |     "Decimal_Number",
16 |     "Enclosing_Mark",
17 |     "Final_Punctuation",
18 |     "Format",
19 |     "Initial_Punctuation",
20 |     "Letter_Number",
21 |     "Line_Separator",
22 |     "Lowercase_Letter",
23 |     "Math_Symbol",
24 |     "Modifier_Letter",
25 |     "Modifier_Symbol",
26 |     "Nonspacing_Mark",
27 |     "Open_Punctuation",
28 |     "Other_Letter",
29 |     "Other_Number",
30 |     "Other_Punctuation",
31 |     "Other_Symbol",
32 |     "Paragraph_Separator",
33 |     "Private_Use",
34 |     "Space_Separator",
35 |     "Spacing_Mark",
36 |     "Surrogate",
37 |     "Titlecase_Letter",
38 |     "Uppercase_Letter",
39 | ];
40 | 
41 | pub static GENERAL_CATEGORY: ::once_cell::sync::Lazy<
42 |     ::fst::Map<&'static [u8]>,
43 | > = ::once_cell::sync::Lazy::new(|| {
44 |     ::fst::Map::from(
45 |         ::fst::raw::Fst::new(&include_bytes!("general_category.fst")[..])
46 |             .unwrap(),
47 |     )
48 | });
49 | 


--------------------------------------------------------------------------------
/benches/tables/fst/jamo_short_name.fst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/ucd-generate/e8fa937f0cac643669dcaf5edac2785b15cab917/benches/tables/fst/jamo_short_name.fst


--------------------------------------------------------------------------------
/benches/tables/fst/jamo_short_name.rs:
--------------------------------------------------------------------------------
 1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
 2 | //
 3 | //   ucd-generate jamo-short-name ucd-16.0.0 --fst-dir benches/tables/fst
 4 | //
 5 | // Unicode version: 16.0.0.
 6 | //
 7 | // ucd-generate 0.2.15 is available on crates.io.
 8 | 
 9 | pub static JAMO_SHORT_NAME: ::once_cell::sync::Lazy<
10 |     ::fst::Map<&'static [u8]>,
11 | > = ::once_cell::sync::Lazy::new(|| {
12 |     ::fst::Map::from(
13 |         ::fst::raw::Fst::new(&include_bytes!("jamo_short_name.fst")[..])
14 |             .unwrap(),
15 |     )
16 | });
17 | 


--------------------------------------------------------------------------------
/benches/tables/fst/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod general_category;
2 | pub mod jamo_short_name;
3 | pub mod names;
4 | 


--------------------------------------------------------------------------------
/benches/tables/fst/names.fst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BurntSushi/ucd-generate/e8fa937f0cac643669dcaf5edac2785b15cab917/benches/tables/fst/names.fst


--------------------------------------------------------------------------------
/benches/tables/fst/names.rs:
--------------------------------------------------------------------------------
 1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
 2 | //
 3 | //   ucd-generate names ucd-16.0.0 --no-aliases --no-hangul --no-ideograph --fst-dir benches/tables/fst
 4 | //
 5 | // Unicode version: 16.0.0.
 6 | //
 7 | // ucd-generate 0.2.15 is available on crates.io.
 8 | 
 9 | pub static NAMES: ::once_cell::sync::Lazy<::fst::Map<&'static [u8]>> =
10 |     ::once_cell::sync::Lazy::new(|| {
11 |         ::fst::Map::from(
12 |             ::fst::raw::Fst::new(&include_bytes!("names.fst")[..]).unwrap(),
13 |         )
14 |     });
15 | 


--------------------------------------------------------------------------------
/benches/tables/mod.rs:
--------------------------------------------------------------------------------
1 | #![allow(dead_code)]
2 | 
3 | pub mod fst;
4 | pub mod slice;
5 | pub mod trie;
6 | 


--------------------------------------------------------------------------------
/benches/tables/slice/jamo_short_name.rs:
--------------------------------------------------------------------------------
 1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
 2 | //
 3 | //   ucd-generate jamo-short-name ucd-16.0.0
 4 | //
 5 | // Unicode version: 16.0.0.
 6 | //
 7 | // ucd-generate 0.2.15 is available on crates.io.
 8 | 
 9 | pub const JAMO_SHORT_NAME: &'static [(u32, &'static str)] = &[
10 |     (4352, "G"),
11 |     (4353, "GG"),
12 |     (4354, "N"),
13 |     (4355, "D"),
14 |     (4356, "DD"),
15 |     (4357, "R"),
16 |     (4358, "M"),
17 |     (4359, "B"),
18 |     (4360, "BB"),
19 |     (4361, "S"),
20 |     (4362, "SS"),
21 |     (4363, ""),
22 |     (4364, "J"),
23 |     (4365, "JJ"),
24 |     (4366, "C"),
25 |     (4367, "K"),
26 |     (4368, "T"),
27 |     (4369, "P"),
28 |     (4370, "H"),
29 |     (4449, "A"),
30 |     (4450, "AE"),
31 |     (4451, "YA"),
32 |     (4452, "YAE"),
33 |     (4453, "EO"),
34 |     (4454, "E"),
35 |     (4455, "YEO"),
36 |     (4456, "YE"),
37 |     (4457, "O"),
38 |     (4458, "WA"),
39 |     (4459, "WAE"),
40 |     (4460, "OE"),
41 |     (4461, "YO"),
42 |     (4462, "U"),
43 |     (4463, "WEO"),
44 |     (4464, "WE"),
45 |     (4465, "WI"),
46 |     (4466, "YU"),
47 |     (4467, "EU"),
48 |     (4468, "YI"),
49 |     (4469, "I"),
50 |     (4520, "G"),
51 |     (4521, "GG"),
52 |     (4522, "GS"),
53 |     (4523, "N"),
54 |     (4524, "NJ"),
55 |     (4525, "NH"),
56 |     (4526, "D"),
57 |     (4527, "L"),
58 |     (4528, "LG"),
59 |     (4529, "LM"),
60 |     (4530, "LB"),
61 |     (4531, "LS"),
62 |     (4532, "LT"),
63 |     (4533, "LP"),
64 |     (4534, "LH"),
65 |     (4535, "M"),
66 |     (4536, "B"),
67 |     (4537, "BS"),
68 |     (4538, "S"),
69 |     (4539, "SS"),
70 |     (4540, "NG"),
71 |     (4541, "J"),
72 |     (4542, "C"),
73 |     (4543, "K"),
74 |     (4544, "T"),
75 |     (4545, "P"),
76 |     (4546, "H"),
77 | ];
78 | 


--------------------------------------------------------------------------------
/benches/tables/slice/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod general_categories;
2 | pub mod general_category;
3 | pub mod jamo_short_name;
4 | pub mod names;
5 | 


--------------------------------------------------------------------------------
/benches/tables/trie/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod general_categories;
2 | 


--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | max_width = 79
2 | use_small_heuristics = "max"
3 | 


--------------------------------------------------------------------------------
/scripts/generate-unicode-tables:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # This script is responsible for generating some of the Unicode tables used
 4 | # in this project. It's a little weird here since ucd-generate is itself
 5 | # used to build the tables used by some of its dependencies. However, most
 6 | # tables are generated only for use in tests and benchmarks.
 7 | #
 8 | # Usage is simple, first download the Unicode data:
 9 | #
10 | #   $ mkdir ucd
11 | #   $ cd ucd
12 | #   $ curl -LO https://www.unicode.org/Public/zipped/14.0.0/UCD.zip
13 | #   $ unzip UCD.zip
14 | #
15 | # And then run this script from the root of this repository by pointing it at
16 | # the data directory downloaded above:
17 | #
18 | #   $ ./scripts/generate-unicode-tables path/to/ucd
19 | 
20 | if [ $# != 1 ]; then
21 |     echo "Usage: $(basename "$0") <ucd-data-directory>" >&2
22 |     exit 1
23 | fi
24 | ucddir="$1"
25 | set -e  # stop at the first failing command instead of carrying on
26 | echo "generating FSTs for benchmarks"
27 | out="benches/tables/fst"
28 | ucd-generate general-category \
29 |     "$ucddir" --exclude unassigned --enum --fst-dir "$out"
30 | ucd-generate jamo-short-name \
31 |     "$ucddir" --fst-dir "$out"
32 | ucd-generate names "$ucddir" \
33 |     --no-aliases --no-hangul --no-ideograph --fst-dir "$out"
34 | 
35 | echo "generating sorted slices for benchmarks"
36 | out="benches/tables/slice"
37 | ucd-generate general-category \
38 |     "$ucddir" --exclude unassigned > "$out/general_categories.rs"
39 | ucd-generate general-category \
40 |     "$ucddir" --exclude unassigned --enum > "$out/general_category.rs"
41 | ucd-generate jamo-short-name \
42 |     "$ucddir" > "$out/jamo_short_name.rs"
43 | ucd-generate names \
44 |     "$ucddir" --no-aliases --no-hangul --no-ideograph > "$out/names.rs"
45 | 
46 | echo "generating tables for ucd-trie benchmarks"
47 | out="benches/tables/trie"
48 | ucd-generate general-category \
49 |     "$ucddir" --exclude unassigned --trie-set > "$out/general_categories.rs"
50 | 
51 | echo "generating tables for ucd-trie tests"
52 | out="ucd-trie/src"
53 | ucd-generate general-category "$ucddir" > "$out/general_category.rs"
54 | 
55 | echo "generating tables for ucd-util tests"
56 | out="ucd-util/src/unicode_tables"
57 | ucd-generate property-names "$ucddir" > "$out/property_names.rs"
58 | ucd-generate property-values "$ucddir" > "$out/property_values.rs"
59 | ucd-generate jamo-short-name "$ucddir" > "$out/jamo_short_name.rs"
60 | 
61 | cargo +stable fmt
62 | 


--------------------------------------------------------------------------------
/src/age.rs:
--------------------------------------------------------------------------------
 1 | use std::collections::{BTreeMap, BTreeSet};
 2 | 
 3 | use ucd_parse::{self, Age};
 4 | 
 5 | use crate::args::ArgMatches;
 6 | use crate::error::Result;
 7 | use crate::util::PropertyValues;
 8 | 
 9 | /// Groups codepoints by canonical `Age` name and writes each group.
10 | pub fn command(args: ArgMatches<'_>) -> Result<()> {
11 |     let ucd_dir = args.ucd_dir()?;
12 |     let propvals = PropertyValues::from_ucd_dir(&ucd_dir)?;
13 |     let records: Vec<Age> = ucd_parse::parse(&ucd_dir)?;
14 | 
15 |     let mut grouped: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
16 |     for record in &records {
17 |         let key = propvals.canonical("Age", &record.age)?;
18 |         let members = grouped.entry(key).or_default();
19 |         members.extend(record.codepoints.into_iter().map(|c| c.value()));
20 |     }
21 | 
22 |     // The list of names goes out first, then one `ranges` call per age.
23 |     let mut out = args.writer("age")?;
24 |     out.names(grouped.keys())?;
25 |     for (age, codepoints) in grouped {
26 |         out.ranges(&age, &codepoints)?;
27 |     }
28 |     Ok(())
29 | }
30 | 


--------------------------------------------------------------------------------
/src/args.rs:
--------------------------------------------------------------------------------
 1 | use std::ffi::OsStr;
 2 | use std::ops;
 3 | 
 4 | use clap;
 5 | 
 6 | use crate::error::Result;
 7 | use crate::util::Filter;
 8 | use crate::writer::{Writer, WriterBuilder};
 9 | 
10 | /// Borrowed clap matches, plus accessors for this program's parameters.
11 | pub struct ArgMatches<'a>(&'a clap::ArgMatches<'a>);
12 | 
13 | impl<'a> ops::Deref for ArgMatches<'a> {
14 |     type Target = clap::ArgMatches<'a>;
15 |     fn deref(&self) -> &Self::Target {
16 |         self.0 // already a reference to the wrapped matches
17 |     }
18 | }
19 | 
20 | impl<'a> ArgMatches<'a> {
21 |     /// Wraps the given clap matches.
22 |     pub fn new(matches: &'a clap::ArgMatches<'a>) -> ArgMatches<'a> {
23 |         ArgMatches(matches)
24 |     }
25 | 
26 |     /// Returns the `ucd-dir` argument, or an error if it is missing.
27 |     pub fn ucd_dir(&self) -> Result<&OsStr> {
28 |         if let Some(dir) = self.value_of_os("ucd-dir") {
29 |             return Ok(dir);
30 |         }
31 |         err!("missing UCD directory")
32 |     }
33 | 
34 |     /// Builds the writer for table `name` from the `chars`, `trie-set` and
35 |     /// `fst-dir` arguments. Fails if the UCD version cannot be determined.
36 |     pub fn writer(&self, name: &str) -> Result<Writer> {
37 |         let mut builder = WriterBuilder::new(name);
38 |         builder
39 |             .columns(79)
40 |             .char_literals(self.is_present("chars"))
41 |             .trie_set(self.is_present("trie-set"));
42 |         let version = ucd_parse::ucd_directory_version(self.ucd_dir()?);
43 |         let (major, minor, patch) = match version {
44 |             Ok(triple) => triple,
45 |             Err(e) => return err!("Failed to determine UCD version: {}", e),
46 |         };
47 |         builder.ucd_version(major, minor, patch);
48 |         if let Some(fst_dir) = self.value_of_os("fst-dir") {
49 |             builder.from_fst_dir(fst_dir)
50 |         } else {
51 |             Ok(builder.from_stdout())
52 |         }
53 |     }
54 | 
55 |     /// Returns the `name` argument, panicking if it is absent.
56 |     pub fn name(&self) -> &str {
57 |         self.value_of("name").expect("the name of the table")
58 |     }
59 | 
60 |     /// Builds the include/exclude filter from the command line arguments,
61 |     /// passing every user-supplied name through `canonicalize`.
62 |     pub fn filter<F>(&self, mut canonicalize: F) -> Result<Filter>
63 |     where
64 |         F: FnMut(&str) -> Result<String>,
65 |     {
66 |         let include = self.value_of_lossy("include").map(|s| s.to_string());
67 |         let exclude = self.value_of_lossy("exclude").map(|s| s.to_string());
68 |         Filter::new(include, exclude, |name| canonicalize(name))
69 |     }
70 | }
71 | 


--------------------------------------------------------------------------------
/src/bidi_class.rs:
--------------------------------------------------------------------------------
  1 | use std::collections::{BTreeMap, BTreeSet};
  2 | 
  3 | use ucd_parse::{self, CoreProperty, UnicodeData};
  4 | 
  5 | use crate::args::ArgMatches;
  6 | use crate::error::Result;
  7 | use crate::util::{print_property_values, PropertyValues};
  8 | 
  9 | // Bidi Class (listing UnicodeData.txt, field 4: see UAX #44:
 10 | // https://www.unicode.org/reports/tr44/) Unlike other properties, unassigned
 11 | // code points in blocks reserved for right-to-left scripts are given either
 12 | // types R or AL.
 13 | //
 14 | // The unassigned code points that default to AL are in the ranges:
 15 | //     [\u0600-\u07BF \u0860-\u086F \u08A0-\u08FF \uFB50-\uFDCF \uFDF0-\uFDFF
 16 | //     \uFE70-\uFEFF \U00010D00-\U00010D3F \U00010F30-\U00010F6F
 17 | //     \U0001EC70-\U0001ECBF \U0001ED00-\U0001ED4F \U0001EE00-\U0001EEFF]
 18 | //
 19 | //     This includes code points in the Arabic, Syriac, and Thaana blocks,
 20 | //     among others.
 21 | //
 22 | // The unassigned code points that default to R are in the ranges:
 23 | //     [\u0590-\u05FF \u07C0-\u085F \u0870-\u089F \uFB1D-\uFB4F
 24 | //     \U00010800-\U00010CFF \U00010D40-\U00010F2F \U00010F70-\U00010FFF
 25 | //     \U0001E800-\U0001EC6F \U0001ECC0-\U0001ECFF \U0001ED50-\U0001EDFF
 26 | //     \U0001EF00-\U0001EFFF]
 27 | //
 28 | //     This includes code points in the Hebrew, NKo, and Phoenician blocks,
 29 | //     among others.
 30 | //
 31 | // The unassigned code points that default to ET are in the range:
 32 | //     [\u20A0-\u20CF]
 33 | //
 34 | //     This consists of code points in the Currency Symbols block.
 35 | //
 36 | // The unassigned code points that default to BN have one of the following
 37 | // properties:
 38 | //     Default_Ignorable_Code_Point
 39 | //     Noncharacter_Code_Point
 40 | //
 41 | // All other code points not explicitly listed for Bidi_Class have the
 42 | // value Left_To_Right (L).
 43 | //
 44 | // Each entry below is (first, last, short class name), bounds inclusive.
 45 | const DEFAULT_CLASS_ASSIGNMENTS: &[(u32, u32, &str)] = &[
 46 |     (0x0600, 0x07BF, "AL"),
 47 |     (0x0860, 0x086F, "AL"),
 48 |     (0x08A0, 0x08FF, "AL"),
 49 |     (0xFB50, 0xFDCF, "AL"),
 50 |     (0xFDF0, 0xFDFF, "AL"),
 51 |     (0xFE70, 0xFEFF, "AL"),
 52 |     (0x00010D00, 0x00010D3F, "AL"),
 53 |     (0x00010F30, 0x00010F6F, "AL"),
 54 |     (0x0001EC70, 0x0001ECBF, "AL"),
 55 |     (0x0001ED00, 0x0001ED4F, "AL"),
 56 |     (0x0001EE00, 0x0001EEFF, "AL"),
 57 |     (0x0590, 0x05FF, "R"),
 58 |     (0x07C0, 0x085F, "R"),
 59 |     (0x0870, 0x089F, "R"),
 60 |     (0xFB1D, 0xFB4F, "R"),
 61 |     (0x00010800, 0x00010CFF, "R"),
 62 |     (0x00010D40, 0x00010F2F, "R"),
 63 |     (0x00010F70, 0x00010FFF, "R"),
 64 |     (0x0001E800, 0x0001EC6F, "R"),
 65 |     (0x0001ECC0, 0x0001ECFF, "R"),
 66 |     (0x0001ED50, 0x0001EDFF, "R"),
 67 |     (0x0001EF00, 0x0001EFFF, "R"),
 68 |     (0x20A0, 0x20CF, "ET"),
 69 | ];
 70 | 
 71 | /// Writes the Bidi_Class tables, covering every codepoint in
 72 | /// `0..=0x10FFFF`.
 73 | ///
 74 | /// Codepoints with a row in UnicodeData.txt keep the class listed there. Any
 75 | /// other codepoint takes its class from `DEFAULT_CLASS_ASSIGNMENTS`, else BN
 76 | /// if it is a default-ignorable or noncharacter codepoint, else L.
 77 | pub fn command(args: ArgMatches<'_>) -> Result<()> {
 78 |     let ucd_dir = args.ucd_dir()?;
 79 |     let propvals = PropertyValues::from_ucd_dir(&ucd_dir)?;
 80 |     let unicode_data: Vec<UnicodeData> = ucd_parse::parse(&ucd_dir)?;
 81 |     let core_props: Vec<CoreProperty> = ucd_parse::parse(&ucd_dir)?;
 82 |     let keep_short = args.is_present("short-names");
 83 |     // Turns a short class name into the name used to key the output.
 84 |     let class_name = |short: &str| {
 85 |         if !keep_short {
 86 |             return propvals.canonical("bc", short);
 87 |         }
 88 |         Ok(short.to_string())
 89 |     };
 90 | 
 91 |     // Listing the available classes needs nothing further.
 92 |     if args.is_present("list-classes") {
 93 |         return print_property_values(&propvals, "Bidi_Class");
 94 |     }
 95 | 
 96 |     // Group the explicitly listed codepoints by class, remembering each one
 97 |     // in `assigned`.
 98 |     let mut by_type: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
 99 |     let mut assigned = BTreeSet::new();
100 |     for row in unicode_data {
101 |         let cp = row.codepoint.value();
102 |         assigned.insert(cp);
103 |         let class = class_name(&row.bidi_class)?;
104 |         by_type.entry(class).or_default().insert(cp);
105 |     }
106 | 
107 |     // Codepoints with either of these properties default to BN unless
108 |     // DEFAULT_CLASS_ASSIGNMENTS says otherwise.
109 |     let maybe_bn: BTreeSet<u32> = core_props
110 |         .iter()
111 |         .filter(|p| {
112 |             &p.property == "Default_Ignorable_Code_Point"
113 |                 || &p.property == "Noncharacter_Code_Point"
114 |         })
115 |         .flat_map(|p| p.codepoints.into_iter().map(|c| c.value()))
116 |         .collect();
117 | 
118 |     // Classify everything that has no row of its own. The `unwrap` below
119 |     // panics if the chosen class had no explicitly listed codepoint.
120 |     let l_name = class_name("L")?;
121 |     let bn_name = class_name("BN")?;
122 |     for cp in (0..=0x10FFFF_u32).filter(|cp| !assigned.contains(cp)) {
123 |         let from_table;
124 |         let class = match lookup_unassigned(cp, DEFAULT_CLASS_ASSIGNMENTS) {
125 |             Some(short) => {
126 |                 // Keep the owned name alive for the borrow below.
127 |                 from_table = class_name(short)?;
128 |                 from_table.as_str()
129 |             }
130 |             None if maybe_bn.contains(&cp) => bn_name.as_str(),
131 |             None => l_name.as_str(),
132 |         };
133 |         by_type.get_mut(class).unwrap().insert(cp);
134 |     }
135 | 
136 |     // Emit in the shape requested on the command line.
137 |     let mut out = args.writer("bidi_class")?;
138 |     if args.is_present("enum") {
139 |         out.ranges_to_enum(args.name(), &by_type)?;
140 |     } else if args.is_present("rust-enum") {
141 |         let variants: Vec<&str> = by_type.keys().map(|k| k.as_str()).collect();
142 |         out.ranges_to_rust_enum(args.name(), &variants, &by_type)?;
143 |     } else if args.is_present("combined") {
144 |         out.ranges_to_combined(args.name(), &by_type)?;
145 |     } else {
146 |         out.names(by_type.keys())?;
147 |         for (class, codepoints) in by_type {
148 |             out.ranges(&class, &codepoints)?;
149 |         }
150 |     }
151 | 
152 |     Ok(())
153 | }
154 | 
155 | /// Returns the class of the first range in `defaults` that contains
156 | /// `codepoint` (both bounds inclusive), or `None` if no range does.
157 | fn lookup_unassigned<'a>(
158 |     codepoint: u32,
159 |     defaults: &[(u32, u32, &'a str)],
160 | ) -> Option<&'a str> {
161 |     let mut ranges = defaults.iter();
162 |     let hit = ranges.find(|r| (r.0..=r.1).contains(&codepoint));
163 |     hit.map(|r| r.2)
164 | }
165 | 


--------------------------------------------------------------------------------
/src/bidi_mirroring_glyph.rs:
--------------------------------------------------------------------------------
 1 | use std::collections::BTreeMap;
 2 | 
 3 | use ucd_parse::{self, BidiMirroring};
 4 | 
 5 | use crate::args::ArgMatches;
 6 | use crate::error::Result;
 7 | 
 8 | /// Writes the table mapping each codepoint to its Bidi mirroring glyph.
 9 | pub fn command(args: ArgMatches<'_>) -> Result<()> {
10 |     let ucd_dir = args.ucd_dir()?;
11 |     let mirrorings: Vec<BidiMirroring> = ucd_parse::parse(&ucd_dir)?;
12 | 
13 |     let mut table = BTreeMap::new();
14 |     for m in mirrorings {
15 |         // A later row for the same codepoint replaces an earlier one.
16 |         table.insert(m.codepoint.value(), m.bidi_mirroring_glyph.value());
17 |     }
18 | 
19 |     let mut out = args.writer("bidi_mirroring_glyph")?;
20 |     if !args.is_present("rust-match") {
21 |         out.codepoint_to_codepoint(args.name(), &table)?;
22 |     } else {
23 |         out.codepoint_to_codepoint_fn(args.name(), &table)?;
24 |     }
25 | 
26 |     Ok(())
27 | }
28 | 


--------------------------------------------------------------------------------
/src/brk.rs:
--------------------------------------------------------------------------------
 1 | use std::collections::{BTreeMap, BTreeSet};
 2 | 
 3 | use ucd_parse::{self, GraphemeClusterBreak, SentenceBreak, WordBreak};
 4 | 
 5 | use crate::args::ArgMatches;
 6 | use crate::error::Result;
 7 | 
 8 | /// Writes the Grapheme_Cluster_Break codepoint sets, grouped by value.
 9 | pub fn grapheme_cluster(args: ArgMatches<'_>) -> Result<()> {
10 |     let dir = args.ucd_dir()?;
11 |     let records: Vec<GraphemeClusterBreak> = ucd_parse::parse(&dir)?;
12 | 
13 |     // Collect the codepoints of each property value.
14 |     let mut grouped: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
15 |     for record in &records {
16 |         let members = grouped.entry(record.value.clone()).or_default();
17 |         members.extend(record.codepoints.into_iter().map(|c| c.value()));
18 |     }
19 | 
20 |     let mut out = args.writer("grapheme_cluster_break")?;
21 |     if args.is_present("enum") {
22 |         out.ranges_to_enum(args.name(), &grouped)?;
23 |     } else {
24 |         out.names(grouped.keys())?;
25 |         for (value, codepoints) in grouped {
26 |             out.ranges(&value, &codepoints)?;
27 |         }
28 |     }
29 |     Ok(())
30 | }
31 | 
32 | /// Writes the Word_Break codepoint sets, grouped by value.
33 | pub fn word(args: ArgMatches<'_>) -> Result<()> {
34 |     let dir = args.ucd_dir()?;
35 |     let records: Vec<WordBreak> = ucd_parse::parse(&dir)?;
36 | 
37 |     // Collect the codepoints of each property value.
38 |     let mut grouped: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
39 |     for record in &records {
40 |         let members = grouped.entry(record.value.clone()).or_default();
41 |         members.extend(record.codepoints.into_iter().map(|c| c.value()));
42 |     }
43 | 
44 |     let mut out = args.writer("word_break")?;
45 |     if args.is_present("enum") {
46 |         out.ranges_to_enum(args.name(), &grouped)?;
47 |     } else {
48 |         out.names(grouped.keys())?;
49 |         for (value, codepoints) in grouped {
50 |             out.ranges(&value, &codepoints)?;
51 |         }
52 |     }
53 |     Ok(())
54 | }
55 | 
56 | /// Writes the Sentence_Break codepoint sets, grouped by value.
57 | pub fn sentence(args: ArgMatches<'_>) -> Result<()> {
58 |     let dir = args.ucd_dir()?;
59 |     let records: Vec<SentenceBreak> = ucd_parse::parse(&dir)?;
60 | 
61 |     // Collect the codepoints of each property value.
62 |     let mut grouped: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
63 |     for record in &records {
64 |         let members = grouped.entry(record.value.clone()).or_default();
65 |         members.extend(record.codepoints.into_iter().map(|c| c.value()));
66 |     }
67 | 
68 |     let mut out = args.writer("sentence_break")?;
69 |     if args.is_present("enum") {
70 |         out.ranges_to_enum(args.name(), &grouped)?;
71 |     } else {
72 |         out.names(grouped.keys())?;
73 |         for (value, codepoints) in grouped {
74 |             out.ranges(&value, &codepoints)?;
75 |         }
76 |     }
77 |     Ok(())
78 | }
79 | 


--------------------------------------------------------------------------------
/src/canonical_combining_class.rs:
--------------------------------------------------------------------------------
 1 | use std::collections::{BTreeMap, BTreeSet};
 2 | 
 3 | use ucd_parse::{self, UnicodeData};
 4 | 
 5 | use crate::args::ArgMatches;
 6 | use crate::error::Result;
 7 | use crate::util::{print_property_values, PropertyValues};
 8 | 
 9 | /// Writes the Canonical_Combining_Class tables, covering every codepoint
10 | /// in `0..=0x10FFFF`.
11 | ///
12 | /// Codepoints without a row in UnicodeData.txt are given class 0.
13 | pub fn command(args: ArgMatches<'_>) -> Result<()> {
14 |     let ucd_dir = args.ucd_dir()?;
15 |     let propvals = PropertyValues::from_ucd_dir(&ucd_dir)?;
16 |     let unicode_data: Vec<UnicodeData> = ucd_parse::parse(&ucd_dir)?;
17 |     // Canonical name of a numeric combining class.
18 |     let name_of = |class: u8| {
19 |         propvals.canonical("canonicalcombiningclass", &class.to_string())
20 |     };
21 | 
22 |     // Listing the available classes needs nothing further.
23 |     if args.is_present("list-classes") {
24 |         return print_property_values(&propvals, "Canonical_Combining_Class");
25 |     }
26 | 
27 |     // `discriminants` maps each numeric class to its name; `by_name` holds
28 |     // the codepoints of each named class.
29 |     let mut discriminants: BTreeMap<isize, String> = BTreeMap::new();
30 |     let mut by_name: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
31 |     let mut assigned = BTreeSet::new();
32 |     for row in unicode_data {
33 |         let cp = row.codepoint.value();
34 |         assigned.insert(cp);
35 |         let class = row.canonical_combining_class;
36 |         let name = name_of(class)?;
37 |         discriminants.entry(class as isize).or_insert_with(|| name.clone());
38 |         by_name.entry(name).or_default().insert(cp);
39 |     }
40 | 
41 |     // Per the note in DerivedCombiningClass.txt (UCD 13.0), code points not
42 |     // explicitly listed have the value Not_Reordered (0).
43 |     let not_reordered = name_of(0)?;
44 |     for cp in (0..=0x10FFFF_u32).filter(|cp| !assigned.contains(cp)) {
45 |         by_name.get_mut(&not_reordered).unwrap().insert(cp);
46 |     }
47 | 
48 |     // Emit in the shape requested on the command line.
49 |     let mut out = args.writer("canonical_combining_class")?;
50 |     if args.is_present("enum") {
51 |         out.ranges_to_enum(args.name(), &by_name)?;
52 |     } else if args.is_present("rust-enum") {
53 |         let name = args.name();
54 |         out.ranges_to_rust_enum_with_custom_discriminants(
55 |             name,
56 |             &discriminants,
57 |             &by_name,
58 |         )?;
59 |     } else {
60 |         out.names(by_name.keys())?;
61 |         for (name, codepoints) in by_name {
62 |             out.ranges(&name, &codepoints)?;
63 |         }
64 |     }
65 | 
66 |     Ok(())
67 | }
68 | 


--------------------------------------------------------------------------------
/src/case_folding.rs:
--------------------------------------------------------------------------------
  1 | use std::collections::{BTreeMap, BTreeSet};
  2 | 
  3 | use ucd_parse::{self, CaseFold, CaseStatus, Codepoint};
  4 | 
  5 | use crate::args::ArgMatches;
  6 | use crate::error::Result;
  7 | 
  8 | /// Writes the simple case folding table. `all-pairs` instead maps each
  9 | /// codepoint to all its fold partners; `circular` chains them into cycles.
 10 | pub fn command(args: ArgMatches<'_>) -> Result<()> {
 11 |     let ucd_dir = args.ucd_dir()?;
 12 |     let folds_by_cp: BTreeMap<Codepoint, Vec<CaseFold>> =
 13 |         ucd_parse::parse_many_by_codepoint(ucd_dir)?;
 14 | 
 15 |     let circular = args.is_present("circular");
 16 |     let want_pairs = args.is_present("all-pairs") || circular;
 17 |     let mut out = args.writer("case_folding_simple")?;
 18 |     let mut simple = BTreeMap::new();
 19 |     let mut linked = BTreeMap::new();
 20 |     for (&cp, folds) in &folds_by_cp {
 21 |         let fold = match choose_fold(folds, false)? {
 22 |             Some(fold) => fold,
 23 |             None => continue,
 24 |         };
 25 |         assert_eq!(fold.mapping.len(), 1);
 26 |         let (from, to) = (cp.value(), fold.mapping[0].value());
 27 |         simple.insert(from, to);
 28 |         if want_pairs {
 29 |             linked.entry(from).or_insert_with(BTreeSet::new).insert(to);
 30 |             linked.entry(to).or_insert_with(BTreeSet::new).insert(from);
 31 |         }
 32 |     }
 33 |     if want_pairs {
 34 |         // Widen every set by one more hop, then drop the self-reference.
 35 |         let mut widened = BTreeMap::new();
 36 |         for (&k, vs) in &linked {
 37 |             let mut reachable = BTreeSet::new();
 38 |             for &v in vs {
 39 |                 reachable.insert(v);
 40 |                 if let Some(vs2) = linked.get(&v) {
 41 |                     reachable.extend(vs2.iter().cloned());
 42 |                 }
 43 |             }
 44 |             reachable.remove(&k);
 45 |             widened.insert(k, reachable);
 46 |         }
 47 |         linked = widened;
 48 |     }
 49 | 
 50 |     if circular {
 51 |         let mut cycle = BTreeMap::new();
 52 |         let mut seen = BTreeSet::new();
 53 |         for (&k, vs) in &linked {
 54 |             if vs.is_empty() || seen.contains(&k) {
 55 |                 continue;
 56 |             }
 57 |             seen.insert(k);
 58 |             seen.extend(vs.iter().cloned());
 59 | 
 60 |             // Link last -> k -> first -> ... -> last.
 61 |             let mut prev = *vs.iter().next_back().unwrap();
 62 |             for &next in std::iter::once(&k).chain(vs) {
 63 |                 assert!(!cycle.contains_key(&prev));
 64 |                 cycle.insert(prev, next);
 65 |                 prev = next;
 66 |             }
 67 |         }
 68 |         out.codepoint_to_codepoint(args.name(), &cycle)?;
 69 |     } else if args.is_present("all-pairs") {
 70 |         let flat = args.is_present("flat-table");
 71 |         out.multi_codepoint_to_codepoint(args.name(), &linked, flat)?;
 72 |     } else {
 73 |         out.codepoint_to_codepoint(args.name(), &simple)?;
 74 |     }
 75 |     Ok(())
 76 | }
 77 | 
 78 | /// Picks the one fold in `case_folds` that applies to the requested kind of
 79 | /// folding: full folds if `full` is true, simple folds otherwise, and common
 80 | /// folds either way. Returns `None` if no fold applies and an error if more
 81 | /// than one does.
 82 | fn choose_fold(
 83 |     case_folds: &[CaseFold],
 84 |     full: bool,
 85 | ) -> Result<Option<&CaseFold>> {
 86 |     // Besides the requested kind, `Common` folds are always eligible.
 87 |     let wanted = if full { CaseStatus::Full } else { CaseStatus::Simple };
 88 |     let mut eligible = case_folds
 89 |         .iter()
 90 |         .filter(|cf| cf.status == wanted || cf.status == CaseStatus::Common);
 91 | 
 92 |     let picked = eligible.next();
 93 |     // A second eligible fold makes the choice ambiguous, which is reported
 94 |     // as an error rather than resolved silently.
 95 |     if picked.is_some() && eligible.next().is_some() {
 96 |         return err!("found multiple matches from: {:?}", case_folds);
 97 |     }
 98 |     Ok(picked)
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/case_mapping.rs:
--------------------------------------------------------------------------------
 1 | use std::collections::BTreeMap;
 2 | 
 3 | use ucd_parse::{SpecialCaseMapping, UcdFile, UnicodeData};
 4 | 
 5 | use crate::args::ArgMatches;
 6 | use crate::error::Result;
 7 | 
 8 | /// Writes the lowercase, uppercase and titlecase mapping tables.
 9 | ///
10 | /// With `simple`, each codepoint maps to the single codepoint given by
11 | /// UnicodeData. Otherwise the unconditional SpecialCasing mappings replace
12 | /// the simple ones and each codepoint maps to a sequence. `include` picks
13 | /// which of `LOWER`, `UPPER` and `TITLE` are written (all three by default).
14 | pub fn command(args: ArgMatches<'_>) -> Result<()> {
15 |     let dir = args.ucd_dir()?;
16 |     let mut lower_map: BTreeMap<u32, Vec<u32>> = BTreeMap::new();
17 |     let mut upper_map: BTreeMap<u32, Vec<u32>> = BTreeMap::new();
18 |     let mut title_map: BTreeMap<u32, Vec<u32>> = BTreeMap::new();
19 |     let mut wtr = args.writer("case_mapping")?;
20 |     // Seed all three tables with the simple mappings.
21 |     for item in UnicodeData::from_dir(dir)? {
22 |         let item = item?;
23 |         let cp = item.codepoint.value();
24 |         if let Some(lower) = item.simple_lowercase_mapping {
25 |             lower_map.insert(cp, vec![lower.value()]);
26 |         }
27 |         if let Some(upper) = item.simple_uppercase_mapping {
28 |             upper_map.insert(cp, vec![upper.value()]);
29 |         }
30 |         if let Some(title) = item.simple_titlecase_mapping {
31 |             title_map.insert(cp, vec![title.value()]);
32 |         }
33 |     }
34 | 
35 |     let includes: Vec<&str> = match args.values_of("include") {
36 |         Some(what) => what.collect(),
37 |         None => vec!["LOWER", "UPPER", "TITLE"],
38 |     };
39 | 
40 |     if args.is_present("simple") {
41 |         // Every value holds exactly one codepoint here, so unwrap the Vecs.
42 |         let single = |m: BTreeMap<u32, Vec<u32>>| -> BTreeMap<u32, u32> {
43 |             m.into_iter().map(|(k, v)| (k, v[0])).collect()
44 |         };
45 |         let (lower, upper, title) =
46 |             (single(lower_map), single(upper_map), single(title_map));
47 |         for name in includes {
48 |             // Names other than LOWER, UPPER and TITLE are ignored.
49 |             let map = match name {
50 |                 "LOWER" => &lower,
51 |                 "UPPER" => &upper,
52 |                 "TITLE" => &title,
53 |                 _ => continue,
54 |             };
55 |             wtr.codepoint_to_codepoint(name, map)?;
56 |         }
57 |     } else {
58 |         // Unconditional special mappings replace the simple ones.
59 |         for special in SpecialCaseMapping::from_dir(&dir)? {
60 |             let special = special?;
61 |             if !special.conditions.is_empty() {
62 |                 // There should probably be an option to output these too, but
63 |                 // I'm not sure how they're typically used...
64 |                 continue;
65 |             }
66 |             let cp = special.codepoint.value();
67 |             if !special.lowercase.is_empty() {
68 |                 let seq = special.lowercase.iter().map(|v| v.value());
69 |                 lower_map.insert(cp, seq.collect());
70 |             }
71 |             if !special.uppercase.is_empty() {
72 |                 let seq = special.uppercase.iter().map(|v| v.value());
73 |                 upper_map.insert(cp, seq.collect());
74 |             }
75 |             if !special.titlecase.is_empty() {
76 |                 let seq = special.titlecase.iter().map(|v| v.value());
77 |                 title_map.insert(cp, seq.collect());
78 |             }
79 |         }
80 |         let flat = args.is_present("flat-table");
81 |         for name in includes {
82 |             // Names other than LOWER, UPPER and TITLE are ignored.
83 |             let map = match name {
84 |                 "LOWER" => &lower_map,
85 |                 "UPPER" => &upper_map,
86 |                 "TITLE" => &title_map,
87 |                 _ => continue,
88 |             };
89 |             wtr.codepoint_to_codepoints(name, map, flat)?;
90 |         }
91 |     }
92 |     Ok(())
93 | }
94 | 


--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
 1 | use std::error;
 2 | use std::fmt;
 3 | use std::io;
 4 | use std::result;
 5 | 
 6 | use clap;
 7 | use fst;
 8 | use ucd_parse;
 9 | use ucd_trie;
10 | 
11 | /// The result type used throughout this crate.
12 | pub type Result<T> = result::Result<T, Error>;
13 | 
14 | /// An error from I/O, from clap, or (`Other`) anything else, kept as text.
15 | #[derive(Debug)]
16 | pub enum Error {
17 |     Io(io::Error),
18 |     Clap(clap::Error),
19 |     Other(String),
20 | }
21 | 
22 | impl Error {
23 |     /// Returns true if this is an I/O error of kind `BrokenPipe`.
24 |     pub fn is_broken_pipe(&self) -> bool {
25 |         matches!(self, Error::Io(e) if e.kind() == io::ErrorKind::BrokenPipe)
26 |     }
27 | }
28 | 
29 | impl error::Error for Error {
30 |     fn source(&self) -> Option<&(dyn error::Error + 'static)> {
31 |         match self {
32 |             Error::Io(err) => Some(err),
33 |             Error::Clap(err) => Some(err),
34 |             Error::Other(_) => None, // carries only a message
35 |         }
36 |     }
37 | }
38 | 
39 | impl fmt::Display for Error {
40 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
41 |         match self {
42 |             Error::Io(err) => err.fmt(f),
43 |             Error::Clap(err) => err.fmt(f),
44 |             Error::Other(msg) => f.write_str(msg), // the message verbatim
45 |         }
46 |     }
47 | }
48 | 
49 | impl From<io::Error> for Error {
50 |     fn from(err: io::Error) -> Self {
51 |         Self::Io(err)
52 |     }
53 | }
54 | 
55 | impl From<clap::Error> for Error {
56 |     fn from(err: clap::Error) -> Self {
57 |         Self::Clap(err)
58 |     }
59 | }
60 | 
61 | impl From<fst::Error> for Error {
62 |     fn from(err: fst::Error) -> Self {
63 |         Self::Other(err.to_string()) // only the message is kept
64 |     }
65 | }
66 | 
67 | impl From<ucd_parse::Error> for Error {
68 |     fn from(err: ucd_parse::Error) -> Self {
69 |         Self::Other(err.to_string()) // only the message is kept
70 |     }
71 | }
72 | 
73 | impl From<ucd_trie::Error> for Error {
74 |     fn from(err: ucd_trie::Error) -> Self {
75 |         Self::Other(err.to_string()) // only the message is kept
76 |     }
77 | }
78 | 


--------------------------------------------------------------------------------
/src/general_category.rs:
--------------------------------------------------------------------------------
  1 | use std::collections::{BTreeMap, BTreeSet};
  2 | 
  3 | use ucd_parse::{self, UnicodeData, UnicodeDataExpander};
  4 | 
  5 | use crate::args::ArgMatches;
  6 | use crate::error::Result;
  7 | use crate::util::{print_property_values, PropertyValues};
  8 | 
  9 | /// Writes the General_Category tables selected by the include/exclude
 10 | /// filter, in the output shape requested on the command line.
 11 | pub fn command(args: ArgMatches<'_>) -> Result<()> {
 12 |     let ucd_dir = args.ucd_dir()?;
 13 |     let propvals = PropertyValues::from_ucd_dir(&ucd_dir)?;
 14 |     let filter = args.filter(|name| propvals.canonical("gc", name))?;
 15 |     let unexpanded: Vec<UnicodeData> = ucd_parse::parse(&ucd_dir)?;
 16 | 
 17 |     // Listing the available categories needs nothing further.
 18 |     if args.is_present("list-categories") {
 19 |         return print_property_values(&propvals, "General_Category");
 20 |     }
 21 | 
 22 |     let as_enum = args.is_present("enum");
 23 |     let as_rust_enum = args.is_present("rust-enum");
 24 |     let mut bycat = expand_into_categories(unexpanded, &propvals)?;
 25 | 
 26 |     // An enumeration needs each codepoint in exactly one category, so the
 27 |     // overlapping "related" groups are added for the other outputs only.
 28 |     if !(as_enum || as_rust_enum) {
 29 |         let groups = related(&propvals, &bycat);
 30 |         let wanted = groups.into_iter().filter(|(n, _)| filter.contains(n));
 31 |         bycat.extend(wanted);
 32 |     }
 33 | 
 34 |     // Keep only the sets the user asked for.
 35 |     let bycat: BTreeMap<String, BTreeSet<u32>> =
 36 |         bycat.into_iter().filter(|(n, _)| filter.contains(n)).collect();
 37 | 
 38 |     let mut out = args.writer("general_category")?;
 39 |     if as_enum {
 40 |         out.ranges_to_enum(args.name(), &bycat)?;
 41 |     } else if as_rust_enum {
 42 |         let variants: Vec<&str> = bycat.keys().map(|k| k.as_str()).collect();
 43 |         out.ranges_to_rust_enum(args.name(), &variants, &bycat)?;
 44 |     } else if args.is_present("combined") {
 45 |         out.ranges_to_combined(args.name(), &bycat)?;
 46 |     } else {
 47 |         // `bycat` was filtered above, so every remaining key is wanted and
 48 |         // the names need no second pass through the filter.
 49 |         out.names(bycat.keys())?;
 50 |         for (category, codepoints) in bycat {
 51 |             out.ranges(&category, &codepoints)?;
 52 |         }
 53 |     }
 54 | 
 55 |     Ok(())
 56 | }
 57 | 
 58 | /// Expands `unexpanded` and groups every codepoint in `0..=0x10FFFF` by the
 59 | /// canonical name of its general category.
 60 | ///
 61 | /// Codepoints without a row after expansion are collected under the
 62 | /// canonical name of `unassigned`. An error is returned if a category name
 63 | /// cannot be canonicalized.
 64 | pub fn expand_into_categories(
 65 |     unexpanded: Vec<UnicodeData>,
 66 |     propvals: &PropertyValues,
 67 | ) -> Result<BTreeMap<String, BTreeSet<u32>>> {
 68 |     let mut bycat: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
 69 |     let mut assigned = BTreeSet::new();
 70 | 
 71 |     // Expansion yields one row per assigned codepoint; file each row under
 72 |     // its category and remember the codepoint as assigned.
 73 |     for row in UnicodeDataExpander::new(unexpanded) {
 74 |         let cp = row.codepoint.value();
 75 |         assigned.insert(cp);
 76 |         let gc = propvals.canonical("gc", &row.general_category)?;
 77 |         bycat.entry(gc).or_default().insert(cp);
 78 |     }
 79 | 
 80 |     // Everything left over is unassigned. This replaces any set already
 81 |     // stored under that name.
 82 |     let unassigned_name = propvals.canonical("gc", "unassigned")?;
 83 |     let unassigned: BTreeSet<u32> =
 84 |         (0..=0x10FFFF_u32).filter(|cp| !assigned.contains(cp)).collect();
 85 |     bycat.insert(unassigned_name, unassigned);
 86 | 
 87 |     Ok(bycat)
 88 | }
 89 | 
 90 | /// Related returns a set of sets of codepoints corresponding to the "related"
 91 | /// groups of categories defined by Table 12 in UAX#44 S5.7.1.
 92 | ///
 93 | /// The given `cats` should correspond to the normal set of general categories,
 94 | /// keyed by canonical name.
 95 | fn related(
 96 |     propvals: &PropertyValues,
 97 |     cats: &BTreeMap<String, BTreeSet<u32>>,
 98 | ) -> BTreeMap<String, BTreeSet<u32>> {
 99 |     let mut sets = BTreeMap::new();
100 |     for (name, components) in related_categories(propvals) {
101 |         let set = sets.entry(name).or_insert(BTreeSet::new());
102 |         for component in components {
103 |             set.extend(cats[&component].iter().cloned());
104 |         }
105 |     }
106 |     sets
107 | }
108 | 
109 | /// Return all groups of "related" general categories.
110 | fn related_categories(
111 |     propvals: &PropertyValues,
112 | ) -> Vec<(String, Vec<String>)> {
113 |     // canonicalize a gencat property value
114 |     let c = |name: &str| -> String {
115 |         propvals.canonical("gc", name).unwrap().to_string()
116 |     };
117 |     vec![
118 |         (c("Cased_Letter"), vec![c("lu"), c("ll"), c("lt")]),
119 |         (c("Letter"), vec![c("lu"), c("ll"), c("lt"), c("lm"), c("lo")]),
120 |         (c("Mark"), vec![c("mn"), c("mc"), c("me")]),
121 |         (c("Number"), vec![c("nd"), c("nl"), c("no")]),
122 |         (
123 |             c("Punctuation"),
124 |             vec![
125 |                 c("pc"),
126 |                 c("pd"),
127 |                 c("ps"),
128 |                 c("pe"),
129 |                 c("pi"),
130 |                 c("pf"),
131 |                 c("po"),
132 |             ],
133 |         ),
134 |         (c("Symbol"), vec![c("sm"), c("sc"), c("sk"), c("so")]),
135 |         (c("Separator"), vec![c("zs"), c("zl"), c("zp")]),
136 |         (c("Other"), vec![c("cc"), c("cf"), c("cs"), c("co"), c("cn")]),
137 |     ]
138 | }
139 | 


--------------------------------------------------------------------------------
/src/jamo_short_name.rs:
--------------------------------------------------------------------------------
 1 | use std::{collections::BTreeMap, path::Path};
 2 | 
 3 | use ucd_parse::{self, JamoShortName};
 4 | 
 5 | use crate::args::ArgMatches;
 6 | use crate::error::Result;
 7 | 
 8 | pub fn command(args: ArgMatches<'_>) -> Result<()> {
 9 |     let dir = args.ucd_dir()?;
10 |     let map = jamo_map(&Path::new(dir))?;
11 |     let mut wtr = args.writer("jamo_short_name")?;
12 |     wtr.codepoint_to_string(args.name(), &map)?;
13 |     Ok(())
14 | }
15 | 
16 | fn jamo_map(dir: &Path) -> Result<BTreeMap<u32, String>> {
17 |     let jamo_map = ucd_parse::parse_by_codepoint::<_, JamoShortName>(dir)?;
18 |     let mut map = BTreeMap::new();
19 |     for (cp, jamo) in jamo_map {
20 |         map.insert(cp.value(), jamo.name);
21 |     }
22 |     Ok(map)
23 | }
24 | 
25 | pub fn table(dir: &Path) -> Result<Vec<(u32, String)>> {
26 |     Ok(jamo_map(dir)?.into_iter().collect())
27 | }
28 | 
/// Produce a borrowed view of a Jamo short name table, turning each owned
/// `String` into a `&str` tied to the lifetime of `table`.
///
/// The order and the codepoints of the entries are left untouched.
pub fn table_ref<'a>(table: &'a [(u32, String)]) -> Vec<(u32, &'a str)> {
    let mut borrowed = Vec::with_capacity(table.len());
    for (cp, name) in table {
        borrowed.push((*cp, name.as_str()));
    }
    borrowed
}
32 | 


--------------------------------------------------------------------------------
/src/joining_type.rs:
--------------------------------------------------------------------------------
 1 | use std::collections::{BTreeMap, BTreeSet};
 2 | 
 3 | use ucd_parse::{self, ArabicShaping};
 4 | 
 5 | use crate::args::ArgMatches;
 6 | use crate::error::Result;
 7 | use crate::general_category;
 8 | use crate::util::PropertyValues;
 9 | 
10 | pub fn command(args: ArgMatches<'_>) -> Result<()> {
11 |     let dir = args.ucd_dir()?;
12 |     let propvals = PropertyValues::from_ucd_dir(&dir)?;
13 |     let rows: Vec<ArabicShaping> = ucd_parse::parse(&dir)?;
14 |     let unexpanded_gc = ucd_parse::parse(&dir)?;
15 |     let gc =
16 |         general_category::expand_into_categories(unexpanded_gc, &propvals)?;
17 | 
18 |     // Collect each joining type into an ordered set.
19 |     let mut by_type: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
20 |     let mut assigned = BTreeSet::new();
21 |     for row in rows {
22 |         assigned.insert(row.codepoint.value());
23 |         let jt =
24 |             propvals.canonical("jt", row.joining_type.as_str())?.to_string();
25 |         by_type
26 |             .entry(jt)
27 |             .or_insert(BTreeSet::new())
28 |             .insert(row.codepoint.value());
29 |     }
30 |     // Process the codepoints that are not listed as per the note in
31 |     // ArabicShaping.txt:
32 |     //
33 |     // Note: Code points that are not explicitly listed in this file are either
34 |     // of joining type T or U:
35 |     //
36 |     // - Those that are not explicitly listed and that are of General Category
37 |     //   Mn, Me, or Cf have joining type T.
38 |     // - All others not explicitly listed have joining type U.
39 |     let transparent_name = propvals.canonical("jt", "transparent")?;
40 |     let non_joining_name = propvals.canonical("jt", "non_joining")?;
41 |     let transparent_categories = ["Mn", "Me", "Cf"]
42 |         .iter()
43 |         .map(|cat| propvals.canonical("gc", cat).map(|name| &gc[&name]))
44 |         .collect::<Result<Vec<_>>>()?;
45 |     for cp in 0..=0x10FFFF {
46 |         if assigned.contains(&cp) {
47 |             continue;
48 |         }
49 |         // See if the code point is in any of the general categories that
50 |         // map to the Transparent joining type. Otherwise add to the
51 |         // Non_Joining type.
52 |         if transparent_categories.iter().any(|cat| cat.contains(&cp)) {
53 |             by_type.get_mut(&transparent_name).unwrap().insert(cp);
54 |         } else {
55 |             by_type.get_mut(&non_joining_name).unwrap().insert(cp);
56 |         }
57 |     }
58 | 
59 |     let mut wtr = args.writer("joining_type")?;
60 |     if args.is_present("enum") {
61 |         wtr.ranges_to_enum(args.name(), &by_type)?;
62 |     } else if args.is_present("rust-enum") {
63 |         let variants = by_type.keys().map(String::as_str).collect::<Vec<_>>();
64 |         wtr.ranges_to_rust_enum(args.name(), &variants, &by_type)?;
65 |     } else if args.is_present("combined") {
66 |         wtr.ranges_to_combined(args.name(), &by_type)?;
67 |     } else {
68 |         wtr.names(by_type.keys())?;
69 |         for (name, set) in by_type {
70 |             wtr.ranges(&name, &set)?;
71 |         }
72 |     }
73 | 
74 |     Ok(())
75 | }
76 | 


--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
  1 | use std::io::{self, Write};
  2 | use std::process;
  3 | 
  4 | use ucd_parse::{UcdFile, UnicodeData};
  5 | 
  6 | use crate::args::ArgMatches;
  7 | use crate::error::Result;
  8 | 
  9 | macro_rules! err {
 10 |     ($($tt:tt)*) => {
 11 |         Err(crate::error::Error::Other(format!($($tt)*)))
 12 |     }
 13 | }
 14 | 
 15 | mod app;
 16 | mod args;
 17 | mod error;
 18 | mod util;
 19 | mod writer;
 20 | 
 21 | mod age;
 22 | mod bidi_class;
 23 | mod bidi_mirroring_glyph;
 24 | mod brk;
 25 | mod canonical_combining_class;
 26 | mod case_folding;
 27 | mod case_mapping;
 28 | mod general_category;
 29 | mod jamo_short_name;
 30 | mod joining_type;
 31 | mod names;
 32 | mod property_bool;
 33 | mod script;
 34 | 
 35 | fn main() {
 36 |     if let Err(err) = run() {
 37 |         if err.is_broken_pipe() {
 38 |             process::exit(0);
 39 |         }
 40 |         eprintln!("{}", err);
 41 |         process::exit(1);
 42 |     }
 43 | }
 44 | 
 45 | fn run() -> Result<()> {
 46 |     let matches = app::app().get_matches();
 47 |     match matches.subcommand() {
 48 |         ("bidi-class", Some(m)) => bidi_class::command(ArgMatches::new(m)),
 49 |         ("bidi-mirroring-glyph", Some(m)) => {
 50 |             bidi_mirroring_glyph::command(ArgMatches::new(m))
 51 |         }
 52 |         ("canonical-combining-class", Some(m)) => {
 53 |             canonical_combining_class::command(ArgMatches::new(m))
 54 |         }
 55 |         ("general-category", Some(m)) => {
 56 |             general_category::command(ArgMatches::new(m))
 57 |         }
 58 |         ("script", Some(m)) => script::command_script(ArgMatches::new(m)),
 59 |         ("script-extension", Some(m)) => {
 60 |             script::command_script_extension(ArgMatches::new(m))
 61 |         }
 62 |         ("property-bool", Some(m)) => {
 63 |             property_bool::command(ArgMatches::new(m))
 64 |         }
 65 |         ("age", Some(m)) => age::command(ArgMatches::new(m)),
 66 |         ("perl-word", Some(m)) => {
 67 |             property_bool::command_perl_word(ArgMatches::new(m))
 68 |         }
 69 |         ("jamo-short-name", Some(m)) => {
 70 |             jamo_short_name::command(ArgMatches::new(m))
 71 |         }
 72 |         ("joining-type", Some(m)) => joining_type::command(ArgMatches::new(m)),
 73 |         ("names", Some(m)) => names::command(ArgMatches::new(m)),
 74 |         ("property-names", Some(m)) => cmd_property_names(ArgMatches::new(m)),
 75 |         ("property-values", Some(m)) => {
 76 |             cmd_property_values(ArgMatches::new(m))
 77 |         }
 78 |         ("case-folding-simple", Some(m)) => {
 79 |             case_folding::command(ArgMatches::new(m))
 80 |         }
 81 |         ("case-mapping", Some(m)) => case_mapping::command(ArgMatches::new(m)),
 82 |         ("grapheme-cluster-break", Some(m)) => {
 83 |             brk::grapheme_cluster(ArgMatches::new(m))
 84 |         }
 85 |         ("word-break", Some(m)) => brk::word(ArgMatches::new(m)),
 86 |         ("sentence-break", Some(m)) => brk::sentence(ArgMatches::new(m)),
 87 |         ("test-unicode-data", Some(m)) => {
 88 |             cmd_test_unicode_data(ArgMatches::new(m))
 89 |         }
 90 |         ("", _) => {
 91 |             app::app().print_help()?;
 92 |             println!("");
 93 |             Ok(())
 94 |         }
 95 |         (unknown, _) => err!("unrecognized command: {}", unknown),
 96 |     }
 97 | }
 98 | 
 99 | fn cmd_property_names(args: ArgMatches<'_>) -> Result<()> {
100 |     use crate::util::PropertyNames;
101 |     use std::collections::BTreeMap;
102 | 
103 |     let dir = args.ucd_dir()?;
104 |     let names = PropertyNames::from_ucd_dir(&dir)?;
105 |     let filter = args.filter(|name| names.canonical(name))?;
106 | 
107 |     let mut actual_names = BTreeMap::new();
108 |     for (k, v) in &names.0 {
109 |         if filter.contains(v) {
110 |             actual_names.insert(k.to_string(), v.to_string());
111 |         }
112 |     }
113 |     let mut wtr = args.writer("property_names")?;
114 |     wtr.string_to_string(args.name(), &actual_names)?;
115 |     Ok(())
116 | }
117 | 
118 | fn cmd_property_values(args: ArgMatches<'_>) -> Result<()> {
119 |     use crate::util::{PropertyNames, PropertyValues};
120 |     use std::collections::BTreeMap;
121 | 
122 |     let dir = args.ucd_dir()?;
123 |     let values = PropertyValues::from_ucd_dir(&dir)?;
124 |     let names = PropertyNames::from_ucd_dir(&dir)?;
125 |     let filter = args.filter(|name| names.canonical(name))?;
126 | 
127 |     let mut actual_values = BTreeMap::new();
128 |     for (k, v) in &values.value {
129 |         if filter.contains(k) {
130 |             actual_values.insert(k.to_string(), v.clone());
131 |         }
132 |     }
133 |     let mut wtr = args.writer("property_values")?;
134 |     wtr.string_to_string_to_string(args.name(), &actual_values)?;
135 |     Ok(())
136 | }
137 | 
138 | fn cmd_test_unicode_data(args: ArgMatches<'_>) -> Result<()> {
139 |     let dir = args.ucd_dir()?;
140 |     let mut stdout = io::stdout();
141 |     for result in UnicodeData::from_dir(dir)? {
142 |         let x: UnicodeData = result?;
143 |         writeln!(stdout, "{}", x)?;
144 |     }
145 |     Ok(())
146 | }
147 | 


--------------------------------------------------------------------------------
/src/names.rs:
--------------------------------------------------------------------------------
  1 | use std::{collections::BTreeMap, path::Path};
  2 | 
  3 | use ucd_parse::{self, Codepoint, NameAlias, UnicodeData};
  4 | use ucd_util;
  5 | 
  6 | use crate::args::ArgMatches;
  7 | use crate::error::Result;
  8 | 
  9 | pub fn command(args: ArgMatches<'_>) -> Result<()> {
 10 |     let dir = args.ucd_dir()?;
 11 |     let jamo_short_name_map = crate::jamo_short_name::table(Path::new(dir))?;
 12 |     let data = ucd_parse::parse_by_codepoint(&dir)?;
 13 |     let aliases = if args.is_present("no-aliases") {
 14 |         None
 15 |     } else {
 16 |         Some(ucd_parse::parse_many_by_codepoint(&dir)?)
 17 |     };
 18 |     let mut names = names_to_codepoint(
 19 |         &data,
 20 |         &aliases,
 21 |         &crate::jamo_short_name::table_ref(&jamo_short_name_map),
 22 |         !args.is_present("no-ideograph"),
 23 |         !args.is_present("no-hangul"),
 24 |     );
 25 |     if args.is_present("normalize") {
 26 |         names = names
 27 |             .into_iter()
 28 |             .map(|(mut name, tagged)| {
 29 |                 ucd_util::character_name_normalize(&mut name);
 30 |                 (name, tagged)
 31 |             })
 32 |             .collect();
 33 |     }
 34 | 
 35 |     let mut wtr = args.writer("names")?;
 36 |     if args.is_present("tagged") {
 37 |         let mut map = BTreeMap::new();
 38 |         for (name, (tag, cp)) in names {
 39 |             map.insert(name, tag.with_codepoint(cp));
 40 |         }
 41 |         wtr.string_to_u64(args.name(), &map)?;
 42 |     } else {
 43 |         let mut map = BTreeMap::new();
 44 |         for (name, (_, cp)) in names {
 45 |             map.insert(name, cp);
 46 |         }
 47 |         wtr.string_to_codepoint(args.name(), &map)?;
 48 |     }
 49 |     Ok(())
 50 | }
 51 | 
 52 | /// A tag indicating how the name of a codepoint was found.
 53 | ///
 54 | /// When a name has both an algorithmically generated name and an
 55 | /// explicit/alias name, then the algorithmically generated tag is preferred.
 56 | #[derive(Debug)]
 57 | enum NameTag {
 58 |     /// The name is listed explicitly in UnicodeData.txt.
 59 |     Explicit,
 60 |     /// The name was taken from NameAliases.txt.
 61 |     Alias,
 62 |     /// The name is an algorithmically generated Hangul syllable.
 63 |     Hangul,
 64 |     /// The name is an algorithmically generated ideograph.
 65 |     Ideograph,
 66 | }
 67 | 
 68 | impl NameTag {
 69 |     fn with_codepoint(&self, cp: u32) -> u64 {
 70 |         use self::NameTag::*;
 71 |         match *self {
 72 |             Explicit => (1 << 33) | (cp as u64),
 73 |             Alias => (1 << 34) | (cp as u64),
 74 |             Hangul => (1 << 35) | (cp as u64),
 75 |             Ideograph => (1 << 36) | (cp as u64),
 76 |         }
 77 |     }
 78 | }
 79 | 
 80 | /// Build one big map in memory from every possible name of a character to its
 81 | /// corresponding codepoint. One codepoint may be pointed to by multiple names.
 82 | ///
 83 | /// The return value maps each name to its corresponding codepoint, along with
 84 | /// a tag associated with how that mapping was generated.
 85 | fn names_to_codepoint(
 86 |     data: &BTreeMap<Codepoint, UnicodeData>,
 87 |     aliases: &Option<BTreeMap<Codepoint, Vec<NameAlias>>>,
 88 |     jamo_short_name_table: &[(u32, &str)],
 89 |     ideograph: bool,
 90 |     hangul: bool,
 91 | ) -> BTreeMap<String, (NameTag, u32)> {
 92 |     // The order in which we write names is important, since there is some
 93 |     // overlap.
 94 |     //
 95 |     // Basically, if a character has a "canonical" name that is equivalent to
 96 |     // one of its aliases, then overwrite the alias with the canonical name.
 97 |     // The effect is that its tag will be Explicit rather than Alias.
 98 |     //
 99 |     // Additionally, write the algorithmically generated names after
100 |     // everything, so that even if a algorithmically generated name matches
101 |     // an Explicit/Alias name, its tag will indicate that it is generated.
102 |     let mut map = BTreeMap::new();
103 |     if let Some(ref alias_map) = *aliases {
104 |         for (cp, aliases) in alias_map {
105 |             for name_alias in aliases {
106 |                 let v = (NameTag::Alias, cp.value());
107 |                 map.insert(name_alias.alias.clone(), v);
108 |             }
109 |         }
110 |     }
111 |     for (cp, datum) in data {
112 |         let isnull = datum.name.is_empty()
113 |             || (datum.name.starts_with('<') && datum.name.ends_with('>'));
114 |         if !isnull {
115 |             let v = (NameTag::Explicit, cp.value());
116 |             map.insert(datum.name.clone(), v);
117 |         }
118 |     }
119 |     if ideograph {
120 |         for &(start, end) in ucd_util::RANGE_IDEOGRAPH {
121 |             for cp in start..end + 1 {
122 |                 let v = (NameTag::Ideograph, cp);
123 |                 map.insert(ucd_util::ideograph_name(cp).unwrap(), v);
124 |             }
125 |         }
126 |     }
127 |     if hangul {
128 |         for &(start, end) in ucd_util::RANGE_HANGUL_SYLLABLE {
129 |             for cp in start..end + 1 {
130 |                 let v = (NameTag::Hangul, cp);
131 |                 map.insert(
132 |                     ucd_util::hangul_name(jamo_short_name_table, cp).unwrap(),
133 |                     v,
134 |                 );
135 |             }
136 |         }
137 |     }
138 |     map
139 | }
140 | 


--------------------------------------------------------------------------------
/src/property_bool.rs:
--------------------------------------------------------------------------------
  1 | use std::collections::{BTreeMap, BTreeSet};
  2 | use std::path::Path;
  3 | 
  4 | use ucd_parse::{
  5 |     self, CoreProperty, EmojiProperty, Property, UcdFileByCodepoint,
  6 |     UnicodeData, UnicodeDataExpander,
  7 | };
  8 | 
  9 | use crate::args::ArgMatches;
 10 | use crate::error::Result;
 11 | use crate::util::{PropertyNames, PropertyValues};
 12 | 
 13 | pub fn command(args: ArgMatches<'_>) -> Result<()> {
 14 |     let dir = args.ucd_dir()?;
 15 |     let by_name = parse_properties(&dir)?;
 16 |     let properties = PropertyNames::from_ucd_dir(&dir)?;
 17 |     let filter = args.filter(|name| properties.canonical(name))?;
 18 | 
 19 |     if args.is_present("list-properties") {
 20 |         for name in by_name.keys() {
 21 |             println!("{}", name);
 22 |         }
 23 |         return Ok(());
 24 |     }
 25 |     let mut wtr = args.writer("prop_list")?;
 26 |     wtr.names(by_name.keys().filter(|n| filter.contains(n)))?;
 27 |     for (name, set) in by_name {
 28 |         if filter.contains(&name) {
 29 |             wtr.ranges(&name, &set)?;
 30 |         }
 31 |     }
 32 |     Ok(())
 33 | }
 34 | 
 35 | pub fn command_perl_word(args: ArgMatches<'_>) -> Result<()> {
 36 |     let dir = args.ucd_dir()?;
 37 |     let props = parse_properties(&dir)?;
 38 |     let gencats = parse_general_categories(&dir)?;
 39 | 
 40 |     let mut perlword = BTreeSet::new();
 41 |     perlword.extend(&props["Alphabetic"]);
 42 |     perlword.extend(&props["Join_Control"]);
 43 |     perlword.extend(&gencats["Decimal_Number"]);
 44 |     perlword.extend(&gencats["Nonspacing_Mark"]);
 45 |     perlword.extend(&gencats["Enclosing_Mark"]);
 46 |     perlword.extend(&gencats["Spacing_Mark"]);
 47 |     perlword.extend(&gencats["Connector_Punctuation"]);
 48 | 
 49 |     let mut wtr = args.writer("perl_word")?;
 50 |     wtr.ranges(args.name(), &perlword)?;
 51 |     Ok(())
 52 | }
 53 | 
 54 | fn parse_properties<P: AsRef<Path>>(
 55 |     ucd_dir: P,
 56 | ) -> Result<BTreeMap<String, BTreeSet<u32>>> {
 57 |     // TODO: PropList.txt and DerivedCoreProperties.txt cover the majority
 58 |     // of boolean properties, but UAX44 S5.3 Table 9 lists a smattering of
 59 |     // others that we should include here as well. (Some will need support in
 60 |     // ucd-parse, for example, the ones found in DerivedNormalizationProps.txt
 61 |     // while others, like Bidi_Mirrored, are derived from UnicodeData.txt.
 62 |     // Even still, others like Composition_Exclusion have their own file
 63 |     // (CompositionExclusions.txt).
 64 | 
 65 |     let mut by_name: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
 66 | 
 67 |     let prop_list: Vec<Property> = ucd_parse::parse(&ucd_dir)?;
 68 |     for x in &prop_list {
 69 |         by_name
 70 |             .entry(x.property.clone())
 71 |             .or_insert(BTreeSet::new())
 72 |             .extend(x.codepoints.into_iter().map(|c| c.value()));
 73 |     }
 74 | 
 75 |     let core_prop: Vec<CoreProperty> = ucd_parse::parse(&ucd_dir)?;
 76 |     for x in &core_prop {
 77 |         by_name
 78 |             .entry(x.property.clone())
 79 |             .or_insert(BTreeSet::new())
 80 |             .extend(x.codepoints.into_iter().map(|c| c.value()));
 81 |     }
 82 | 
 83 |     // Add Bidi_Mirrored
 84 |     let unicode_data: Vec<UnicodeData> = ucd_parse::parse(&ucd_dir)?;
 85 |     let bidi_mirrored =
 86 |         unicode_data.iter().fold(BTreeSet::new(), |mut set, x| {
 87 |             if x.bidi_mirrored {
 88 |                 set.extend(x.codepoints().into_iter().map(|c| c.value()))
 89 |             }
 90 |             set
 91 |         });
 92 |     by_name.insert("Bidi_Mirrored".to_string(), bidi_mirrored);
 93 | 
 94 |     // Since emoji-data.txt isn't parse of the normal UCD download, don't
 95 |     // die if it doesn't exist. But emit a helpful warning message.
 96 |     let emoji_prop: Vec<EmojiProperty> = match ucd_parse::parse(&ucd_dir) {
 97 |         Ok(props) => props,
 98 |         Err(err) => match *err.kind() {
 99 |             ucd_parse::ErrorKind::Io(_) => {
100 |                 eprintln!(
101 |                     "{}. skipping emoji properties. \
102 |                      emoji-data.txt is included in UCD 13.0.0 and newer, and \
103 |                      can be downloaded from https://unicode.org/Public/emoji/ \
104 |                      for older releases.",
105 |                     err,
106 |                 );
107 |                 vec![]
108 |             }
109 |             _ => return Err(From::from(err)),
110 |         },
111 |     };
112 |     for x in &emoji_prop {
113 |         by_name
114 |             .entry(x.property.clone())
115 |             .or_insert(BTreeSet::new())
116 |             .extend(x.codepoints.into_iter().map(|c| c.value()));
117 |     }
118 |     Ok(by_name)
119 | }
120 | 
121 | fn parse_general_categories<P: AsRef<Path>>(
122 |     ucd_dir: P,
123 | ) -> Result<BTreeMap<String, BTreeSet<u32>>> {
124 |     let propvals = PropertyValues::from_ucd_dir(&ucd_dir)?;
125 |     let unexpanded = ucd_parse::parse(&ucd_dir)?;
126 |     // Expand all of our UnicodeData rows. This results in one big list of
127 |     // all assigned codepoints.
128 |     let rows: Vec<_> = UnicodeDataExpander::new(unexpanded).collect();
129 | 
130 |     // Collect each general category into an ordered set.
131 |     let mut bycat: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
132 |     for row in rows {
133 |         let gc = propvals.canonical("gc", &row.general_category)?.to_string();
134 |         bycat
135 |             .entry(gc)
136 |             .or_insert(BTreeSet::new())
137 |             .insert(row.codepoint.value());
138 |     }
139 |     Ok(bycat)
140 | }
141 | 


--------------------------------------------------------------------------------
/src/script.rs:
--------------------------------------------------------------------------------
 1 | use std::collections::{BTreeMap, BTreeSet};
 2 | 
 3 | use ucd_parse::{self, Script, ScriptExtension};
 4 | 
 5 | use crate::args::ArgMatches;
 6 | use crate::error::Result;
 7 | use crate::util::{print_property_values, PropertyValues};
 8 | 
 9 | pub fn command_script(args: ArgMatches<'_>) -> Result<()> {
10 |     let dir = args.ucd_dir()?;
11 |     let propvals = PropertyValues::from_ucd_dir(&dir)?;
12 |     let filter = args.filter(|name| propvals.canonical("Script", name))?;
13 | 
14 |     if args.is_present("list-scripts") {
15 |         return print_property_values(&propvals, "Script");
16 |     }
17 | 
18 |     let mut by_name: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
19 |     let scripts: Vec<Script> = ucd_parse::parse(&dir)?;
20 |     for x in &scripts {
21 |         by_name
22 |             .entry(x.script.clone())
23 |             .or_insert(BTreeSet::new())
24 |             .extend(x.codepoints.into_iter().map(|c| c.value()));
25 |     }
26 | 
27 |     let mut wtr = args.writer("script")?;
28 |     if args.is_present("enum") {
29 |         wtr.ranges_to_enum(args.name(), &by_name)?;
30 |     } else if args.is_present("rust-enum") {
31 |         let mut variants = vec!["Unknown"];
32 |         variants.extend(by_name.keys().map(String::as_str));
33 |         wtr.ranges_to_rust_enum(args.name(), &variants, &by_name)?;
34 |     } else if args.is_present("combined") {
35 |         wtr.ranges_to_combined(args.name(), &by_name)?;
36 |     } else {
37 |         wtr.names(by_name.keys().filter(|n| filter.contains(n)))?;
38 |         for (name, set) in by_name {
39 |             if filter.contains(&name) {
40 |                 wtr.ranges(&name, &set)?;
41 |             }
42 |         }
43 |     }
44 | 
45 |     Ok(())
46 | }
47 | 
48 | pub fn command_script_extension(args: ArgMatches<'_>) -> Result<()> {
49 |     let dir = args.ucd_dir()?;
50 |     let propvals = PropertyValues::from_ucd_dir(&dir)?;
51 |     let filter = args.filter(|name| propvals.canonical("Script", name))?;
52 | 
53 |     if args.is_present("list-script-extensions") {
54 |         return print_property_values(&propvals, "Script");
55 |     }
56 | 
57 |     let mut by_name: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
58 |     let mut seen: BTreeSet<u32> = BTreeSet::new();
59 |     let exts: Vec<ScriptExtension> = ucd_parse::parse(&dir)?;
60 |     for x in &exts {
61 |         seen.extend(x.codepoints.into_iter().map(|c| c.value()));
62 |         for name in &x.scripts {
63 |             let name = propvals.canonical("Script", name)?;
64 |             by_name
65 |                 .entry(name)
66 |                 .or_insert(BTreeSet::new())
67 |                 .extend(x.codepoints.into_iter().map(|c| c.value()));
68 |         }
69 |     }
70 | 
71 |     // ScriptExtensions.txt does not list every codepoint. Omitted codepoints
72 |     // default to the set of scripts containing exactly one element: its
73 |     // corresponding Script value. c.f. UAX #24 S4.2.
74 |     let scripts: Vec<Script> = ucd_parse::parse(&dir)?;
75 |     for x in &scripts {
76 |         if !by_name.contains_key(&x.script) {
77 |             by_name.insert(x.script.clone(), BTreeSet::new());
78 |         }
79 |         for cp in x.codepoints.into_iter().map(|c| c.value()) {
80 |             if !seen.contains(&cp) {
81 |                 by_name.get_mut(&x.script).unwrap().insert(cp);
82 |             }
83 |         }
84 |     }
85 | 
86 |     let mut wtr = args.writer("script_extension")?;
87 |     wtr.names(by_name.keys().filter(|n| filter.contains(n)))?;
88 |     for (name, set) in by_name {
89 |         if filter.contains(&name) {
90 |             wtr.ranges(&name, &set)?;
91 |         }
92 |     }
93 |     Ok(())
94 | }
95 | 


--------------------------------------------------------------------------------
/ucd-parse/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ucd-parse"
 3 | version = "0.1.13"  #:version
 4 | authors = ["Andrew Gallant <jamslam@gmail.com>"]
 5 | description = """
 6 | A library for parsing data files in the Unicode character database.
 7 | """
 8 | documentation = "https://docs.rs/ucd-parse"
 9 | homepage = "https://github.com/BurntSushi/ucd-generate"
10 | repository = "https://github.com/BurntSushi/ucd-generate"
11 | readme = "README.md"
12 | keywords = ["unicode", "database", "character", "property"]
13 | license = "MIT OR Apache-2.0"
14 | edition = "2021"
15 | rust-version = "1.70"
16 | 
17 | [dependencies]
18 | regex-lite = "0.1.0"
19 | 


--------------------------------------------------------------------------------
/ucd-parse/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Andrew Gallant
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ucd-parse/README.md:
--------------------------------------------------------------------------------
 1 | ucd-parse
 2 | =========
 3 | A library for parsing Unicode Character Database (UCD) files into structured
 4 | data.
 5 | 
 6 | [![Build status](https://github.com/BurntSushi/ucd-generate/workflows/ci/badge.svg)](https://github.com/BurntSushi/ucd-generate/actions)
 7 | [![crates.io](https://img.shields.io/crates/v/ucd-parse.svg)](https://crates.io/crates/ucd-parse)
 8 | 
 9 | 
10 | ### Documentation
11 | 
12 | https://docs.rs/ucd-parse
13 | 
14 | 
15 | ### License
16 | 
17 | This project is licensed under either of
18 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
19 |    https://www.apache.org/licenses/LICENSE-2.0)
20 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
21 |    https://opensource.org/licenses/MIT)
22 | at your option.
23 | 


--------------------------------------------------------------------------------
/ucd-parse/src/age.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `DerivedAge.txt` file.
12 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
13 | pub struct Age {
14 |     /// The codepoint or codepoint range for this entry.
15 |     pub codepoints: Codepoints,
16 |     /// The age assigned to the codepoints in this entry.
17 |     pub age: String,
18 | }
19 | 
20 | impl UcdFile for Age {
21 |     fn relative_file_path() -> &'static Path {
22 |         Path::new("DerivedAge.txt")
23 |     }
24 | }
25 | 
26 | impl UcdFileByCodepoint for Age {
27 |     fn codepoints(&self) -> CodepointIter {
28 |         self.codepoints.into_iter()
29 |     }
30 | }
31 | 
32 | impl std::str::FromStr for Age {
33 |     type Err = Error;
34 | 
35 |     /// Parses one `DerivedAge.txt` row into its codepoints and age.
36 |     fn from_str(line: &str) -> Result<Age, Error> {
37 |         let (codepoints, age) = parse_codepoint_association(line)?;
38 |         Ok(Age { codepoints, age: age.to_string() })
39 |     }
40 | }
40 | 
41 | #[cfg(test)]
42 | mod tests {
43 |     use super::Age;
44 | 
45 |     #[test]
46 |     fn parse_single() {
47 |         let line = "2BD2          ; 10.0 #       GROUP MARK\n";
48 |         let row: Age = line.parse().unwrap();
49 |         assert_eq!(row.codepoints, 0x2BD2);
50 |         assert_eq!(row.age, "10.0");
51 |     }
52 | 
53 |     #[test]
54 |     fn parse_range() {
55 |         let line = "11D0B..11D36  ; 10.0 #  [44] MASARAM GONDI LETTER AU..MASARAM GONDI VOWEL SIGN VOCALIC R\n";
56 |         let row: Age = line.parse().unwrap();
57 |         assert_eq!(row.codepoints, (0x11D0B, 0x11D36));
58 |         assert_eq!(row.age, "10.0");
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/ucd-parse/src/arabic_shaping.rs:
--------------------------------------------------------------------------------
  1 | use std::path::Path;
  2 | 
  3 | use crate::{
  4 |     common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
  5 |     error::Error,
  6 | };
  7 | 
  8 | /// Represents a single row in the `ArabicShaping.txt` file.
  9 | ///
 10 | /// The field names were taken from the header of ArabicShaping.txt.
 11 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
 12 | pub struct ArabicShaping {
 13 |     /// The codepoint corresponding to this row.
 14 |     pub codepoint: Codepoint,
 15 |     /// A short schematic name for the codepoint.
 16 |     ///
 17 |     /// The schematic name is descriptive of the shape, based as consistently as
 18 |     /// possible on a name for the skeleton and then the diacritic marks applied
 19 |     /// to the skeleton, if any.  Note that this schematic name is considered a
 20 |     /// comment, and does not constitute a formal property value.
 21 |     pub schematic_name: String,
 22 |     /// The "joining type" of this codepoint.
 23 |     pub joining_type: JoiningType,
 24 |     /// The "joining group" of this codepoint.
 25 |     pub joining_group: String,
 26 | }
 27 | 
 28 | /// The Joining_Type field read from ArabicShaping.txt
 29 | #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 30 | pub enum JoiningType {
 31 |     RightJoining,
 32 |     LeftJoining,
 33 |     DualJoining,
 34 |     JoinCausing,
 35 |     NonJoining,
 36 |     Transparent,
 37 | }
 38 | 
 39 | impl JoiningType {
 40 |     /// Returns the single-letter code for this joining type.
 41 |     ///
 42 |     /// Every code is a string literal, so the result is `'static` and may
 43 |     /// outlive the `JoiningType` value it was obtained from.
 44 |     pub fn as_str(&self) -> &'static str {
 45 |         match *self {
 46 |             JoiningType::RightJoining => "R",
 47 |             JoiningType::LeftJoining => "L",
 48 |             JoiningType::DualJoining => "D",
 49 |             JoiningType::JoinCausing => "C",
 50 |             JoiningType::NonJoining => "U",
 51 |             JoiningType::Transparent => "T",
 52 |         }
 53 |     }
 54 | }
 51 | 
 52 | impl Default for JoiningType {
 53 |     fn default() -> JoiningType {
 54 |         JoiningType::NonJoining
 55 |     }
 56 | }
 57 | 
 58 | impl std::str::FromStr for JoiningType {
 59 |     type Err = Error;
 60 | 
 61 |     fn from_str(s: &str) -> Result<JoiningType, Error> {
 62 |         match s {
 63 |             "R" => Ok(JoiningType::RightJoining),
 64 |             "L" => Ok(JoiningType::LeftJoining),
 65 |             "D" => Ok(JoiningType::DualJoining),
 66 |             "C" => Ok(JoiningType::JoinCausing),
 67 |             "U" => Ok(JoiningType::NonJoining),
 68 |             "T" => Ok(JoiningType::Transparent),
 69 |             _ => err!(
 70 |                 "unrecognized joining type: '{}' \
 71 |                  (must be one of R, L, D, C, U or T)",
 72 |                 s
 73 |             ),
 74 |         }
 75 |     }
 76 | }
 77 | 
 78 | impl UcdFile for ArabicShaping {
 79 |     fn relative_file_path() -> &'static Path {
 80 |         Path::new("ArabicShaping.txt")
 81 |     }
 82 | }
 83 | 
 84 | impl UcdFileByCodepoint for ArabicShaping {
 85 |     fn codepoints(&self) -> CodepointIter {
 86 |         self.codepoint.into_iter()
 87 |     }
 88 | }
 89 | 
 90 | impl std::str::FromStr for ArabicShaping {
 91 |     type Err = Error;
 92 | 
 93 |     /// Parses one data row of `ArabicShaping.txt`.
 94 |     ///
 95 |     /// A row consists of four `;`-separated fields: codepoint, schematic
 96 |     /// name, joining type and joining group. Whitespace surrounding a field
 97 |     /// is not part of its value.
 98 |     fn from_str(line: &str) -> Result<ArabicShaping, Error> {
 99 |         let re_parts = regex!(
100 |             r"(?x)
101 |                 ^
102 |                 \s*(?P<codepoint>[A-F0-9]+)\s*;
103 |                 \s*(?P<name>[^;]+)\s*;
104 |                 \s*(?P<joining_type>[^;]+)\s*;
105 |                 \s*(?P<joining_group>[^;]+)
106 |                 $
107 |                 ",
108 |         );
109 |         let caps = match re_parts.captures(line.trim()) {
110 |             Some(caps) => caps,
111 |             None => return err!("invalid ArabicShaping line"),
112 |         };
113 | 
114 |         // `[^;]+` is greedy, so any whitespace in front of a `;` separator
115 |         // lands inside the capture instead of the following `\s*`. Trim it
116 |         // here; otherwise a field such as "U " fails to parse.
117 |         Ok(ArabicShaping {
118 |             codepoint: caps["codepoint"].parse()?,
119 |             schematic_name: caps["name"].trim().to_string(),
120 |             joining_type: caps["joining_type"].trim().parse()?,
121 |             joining_group: caps["joining_group"].trim().to_string(),
122 |         })
123 |     }
124 | }
117 | 
118 | #[cfg(test)]
119 | mod tests {
120 |     use crate::common::Codepoint;
121 | 
122 |     use super::{ArabicShaping, JoiningType};
123 | 
124 |     fn codepoint(n: u32) -> Codepoint {
125 |         Codepoint::from_u32(n).unwrap()
126 |     }
127 | 
128 |     fn s(string: &str) -> String {
129 |         string.to_string()
130 |     }
131 | 
132 |     #[test]
133 |     fn parse1() {
134 |         let line = "0600; ARABIC NUMBER SIGN; U; No_Joining_Group\n";
135 |         let data: ArabicShaping = line.parse().unwrap();
136 |         assert_eq!(
137 |             data,
138 |             ArabicShaping {
139 |                 codepoint: codepoint(0x0600),
140 |                 schematic_name: s("ARABIC NUMBER SIGN"),
141 |                 joining_type: JoiningType::NonJoining,
142 |                 joining_group: s("No_Joining_Group")
143 |             }
144 |         );
145 |     }
146 | 
147 |     #[test]
148 |     fn parse2() {
149 |         let line = "063D; FARSI YEH WITH INVERTED V ABOVE; D; FARSI YEH\n";
150 |         let data: ArabicShaping = line.parse().unwrap();
151 |         assert_eq!(
152 |             data,
153 |             ArabicShaping {
154 |                 codepoint: codepoint(0x063D),
155 |                 schematic_name: s("FARSI YEH WITH INVERTED V ABOVE"),
156 |                 joining_type: JoiningType::DualJoining,
157 |                 joining_group: s("FARSI YEH")
158 |             }
159 |         );
160 |     }
161 | 
162 |     #[test]
163 |     fn parse3() {
164 |         let line =
165 |             "10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA\n";
166 |         let data: ArabicShaping = line.parse().unwrap();
167 |         assert_eq!(
168 |             data,
169 |             ArabicShaping {
170 |                 codepoint: codepoint(0x10D23),
171 |                 schematic_name: s(
172 |                     "HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE"
173 |                 ),
174 |                 joining_type: JoiningType::DualJoining,
175 |                 joining_group: s("HANIFI ROHINGYA KINNA YA")
176 |             }
177 |         );
178 |     }
179 | }
180 | 


--------------------------------------------------------------------------------
/ucd-parse/src/bidi_mirroring_glyph.rs:
--------------------------------------------------------------------------------
  1 | use std::path::Path;
  2 | 
  3 | use crate::{
  4 |     common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
  5 |     error::Error,
  6 | };
  7 | 
  8 | /// Represents a single row in the `BidiMirroring.txt` file.
  9 | ///
 10 | /// The field names were taken from the header of BidiMirroring.txt.
 11 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
 12 | pub struct BidiMirroring {
 13 |     /// The codepoint corresponding to this row.
 14 |     pub codepoint: Codepoint,
 15 |     /// The codepoint that typically has a glyph that is the mirror image
 16 |     /// of `codepoint`.
 17 |     pub bidi_mirroring_glyph: Codepoint,
 18 | }
 19 | 
 20 | impl UcdFile for BidiMirroring {
 21 |     fn relative_file_path() -> &'static Path {
 22 |         Path::new("BidiMirroring.txt")
 23 |     }
 24 | }
 25 | 
 26 | impl UcdFileByCodepoint for BidiMirroring {
 27 |     fn codepoints(&self) -> CodepointIter {
 28 |         self.codepoint.into_iter()
 29 |     }
 30 | }
 31 | 
 32 | impl std::str::FromStr for BidiMirroring {
 33 |     type Err = Error;
 34 | 
 35 |     /// Parses one data row of `BidiMirroring.txt`.
 36 |     ///
 37 |     /// A row is `<codepoint>; <mirrored codepoint>`, optionally followed by
 38 |     /// a `#` comment, which is ignored.
 39 |     fn from_str(line: &str) -> Result<BidiMirroring, Error> {
 40 |         // The trailing comment is optional: a row is fully described by its
 41 |         // two codepoints, so a missing comment should not be a parse error.
 42 |         let re_parts = regex!(
 43 |             r"(?x)
 44 |                 ^
 45 |                 \s*(?P<codepoint>[A-F0-9]+)\s*;
 46 |                 \s*(?P<substitute_codepoint>[A-F0-9]+)
 47 |                 \s*
 48 |                 (?:\#.*)?
 49 |                 $
 50 |                 ",
 51 |         );
 52 |         let caps = match re_parts.captures(line.trim()) {
 53 |             Some(caps) => caps,
 54 |             None => return err!("invalid BidiMirroring line"),
 55 |         };
 56 | 
 57 |         Ok(BidiMirroring {
 58 |             codepoint: caps["codepoint"].parse()?,
 59 |             bidi_mirroring_glyph: caps["substitute_codepoint"].parse()?,
 60 |         })
 61 |     }
 62 | }
 57 | 
 58 | impl std::fmt::Display for BidiMirroring {
 59 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 60 |         write!(f, "{};", self.codepoint)?;
 61 |         write!(f, "{};", self.bidi_mirroring_glyph)?;
 62 |         Ok(())
 63 |     }
 64 | }
 65 | 
 66 | #[cfg(test)]
 67 | mod tests {
 68 |     use crate::common::Codepoint;
 69 | 
 70 |     use super::BidiMirroring;
 71 | 
 72 |     fn codepoint(n: u32) -> Codepoint {
 73 |         Codepoint::from_u32(n).unwrap()
 74 |     }
 75 | 
 76 |     #[test]
 77 |     fn parse() {
 78 |         let line = "0028; 0029 # LEFT PARENTHESIS\n";
 79 |         let data: BidiMirroring = line.parse().unwrap();
 80 |         assert_eq!(
 81 |             data,
 82 |             BidiMirroring {
 83 |                 codepoint: codepoint(0x0028),
 84 |                 bidi_mirroring_glyph: codepoint(0x0029),
 85 |             }
 86 |         );
 87 |     }
 88 | 
 89 |     #[test]
 90 |     fn parse_best_fit() {
 91 |         let line = "228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO\n";
 92 |         let data: BidiMirroring = line.parse().unwrap();
 93 |         assert_eq!(
 94 |             data,
 95 |             BidiMirroring {
 96 |                 codepoint: codepoint(0x228A),
 97 |                 bidi_mirroring_glyph: codepoint(0x228B),
 98 |             }
 99 |         );
100 |     }
101 | }
102 | 


--------------------------------------------------------------------------------
/ucd-parse/src/case_folding.rs:
--------------------------------------------------------------------------------
  1 | use std::path::Path;
  2 | 
  3 | use crate::{
  4 |     common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
  5 |     error::Error,
  6 | };
  7 | 
  8 | /// A single row in the `CaseFolding.txt` file.
  9 | ///
 10 | /// The contents of `CaseFolding.txt` are a convenience derived from both
 11 | /// `UnicodeData.txt` and `SpecialCasing.txt`.
 12 | ///
 13 | /// Note that a single codepoint may be mapped multiple times. In particular,
 14 | /// a single codepoint might have distinct `CaseStatus::Simple` and
 15 | /// `CaseStatus::Full` mappings.
 16 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
 17 | pub struct CaseFold {
 18 |     /// The codepoint that is being mapped.
 19 |     pub codepoint: Codepoint,
 20 |     /// The case status of this mapping.
 21 |     pub status: CaseStatus,
 22 |     /// The actual case mapping, which is more than one codepoint if this is
 23 |     /// a "full" mapping.
 24 |     pub mapping: Vec<Codepoint>,
 25 | }
 26 | 
 27 | impl UcdFile for CaseFold {
 28 |     fn relative_file_path() -> &'static Path {
 29 |         Path::new("CaseFolding.txt")
 30 |     }
 31 | }
 32 | 
 33 | impl UcdFileByCodepoint for CaseFold {
 34 |     fn codepoints(&self) -> CodepointIter {
 35 |         self.codepoint.into_iter()
 36 |     }
 37 | }
 38 | 
 39 | impl std::str::FromStr for CaseFold {
 40 |     type Err = Error;
 41 | 
 42 |     /// Parses one `CaseFolding.txt` row into its codepoint, status and
 43 |     /// mapping fields.
 44 |     fn from_str(line: &str) -> Result<CaseFold, Error> {
 45 |         let row_pattern = regex!(
 46 |             r"(?x)
 47 |                 ^
 48 |                 \s*(?P<codepoint>[^\s;]+)\s*;
 49 |                 \s*(?P<status>[^\s;]+)\s*;
 50 |                 \s*(?P<mapping>[^;]+)\s*;
 51 |                 ",
 52 |         );
 53 | 
 54 |         let fields = match row_pattern.captures(line.trim()) {
 55 |             None => return err!("invalid CaseFolding line: '{}'", line),
 56 |             Some(fields) => fields,
 57 |         };
 58 |         // The mapping field holds whitespace-separated codepoints; the
 59 |         // first one that fails to parse aborts the whole row.
 60 |         let mapping = fields["mapping"]
 61 |             .split_whitespace()
 62 |             .map(|cp| cp.parse())
 63 |             .collect::<Result<Vec<Codepoint>, _>>()?;
 64 |         Ok(CaseFold {
 65 |             codepoint: fields["codepoint"].parse()?,
 66 |             status: fields["status"].parse()?,
 67 |             mapping,
 68 |         })
 69 |     }
 70 | }
 67 | 
 68 | /// The status of a particular case mapping.
 69 | #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 70 | pub enum CaseStatus {
 71 |     /// Case mappings shared by both "simple" and "full" mappings.
 72 |     Common,
 73 |     /// A case mapping that changes the number of codepoints.
 74 |     Full,
 75 |     /// A case mapping that doesn't change the number of codepoints, when it
 76 |     /// differs from `Full`.
 77 |     Simple,
 78 |     /// Special cases (currently only for Turkic mappings) that are typically
 79 |     /// excluded by default. Special cases don't change the number of
 80 |     /// codepoints, but may change the encoding (e.g., UTF-8) length in bytes.
 81 |     Special,
 82 | }
 83 | 
 84 | impl Default for CaseStatus {
 85 |     fn default() -> CaseStatus {
 86 |         CaseStatus::Common
 87 |     }
 88 | }
 89 | 
 90 | impl CaseStatus {
 91 |     /// Returns true if and only if this status indicates a case mapping that
 92 |     /// won't change the number of codepoints.
 93 |     pub fn is_fixed(&self) -> bool {
 94 |         *self != CaseStatus::Full
 95 |     }
 96 | }
 97 | 
 98 | impl std::str::FromStr for CaseStatus {
 99 |     type Err = Error;
100 | 
101 |     fn from_str(s: &str) -> Result<CaseStatus, Error> {
102 |         match s {
103 |             "C" => Ok(CaseStatus::Common),
104 |             "F" => Ok(CaseStatus::Full),
105 |             "S" => Ok(CaseStatus::Simple),
106 |             "T" => Ok(CaseStatus::Special),
107 |             _ => err!(
108 |                 "unrecognized case status: '{}' \
109 |                  (must be one of C, F, S or T)",
110 |                 s
111 |             ),
112 |         }
113 |     }
114 | }
115 | 
116 | #[cfg(test)]
117 | mod tests {
118 |     use super::{CaseFold, CaseStatus};
119 | 
120 |     #[test]
121 |     fn parse_common() {
122 |         let line =
123 |             "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n";
124 |         let row: CaseFold = line.parse().unwrap();
125 |         assert_eq!(row.codepoint, 0x0150);
126 |         assert_eq!(row.status, CaseStatus::Common);
127 |         assert_eq!(row.mapping, vec![0x0151]);
128 |     }
129 | 
130 |     #[test]
131 |     fn parse_full() {
132 |         let line = "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n";
133 |         let row: CaseFold = line.parse().unwrap();
134 |         assert_eq!(row.codepoint, 0x03B0);
135 |         assert_eq!(row.status, CaseStatus::Full);
136 |         assert_eq!(row.mapping, vec![0x03C5, 0x0308, 0x0301]);
137 |     }
138 | 
139 |     #[test]
140 |     fn parse_simple() {
141 |         let line = "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n";
142 |         let row: CaseFold = line.parse().unwrap();
143 |         assert_eq!(row.codepoint, 0x1F8F);
144 |         assert_eq!(row.status, CaseStatus::Simple);
145 |         assert_eq!(row.mapping, vec![0x1F87]);
146 |     }
147 | 
148 |     #[test]
149 |     fn parse_special() {
150 |         let line = "0049; T; 0131; # LATIN CAPITAL LETTER I\n";
151 |         let row: CaseFold = line.parse().unwrap();
152 |         assert_eq!(row.codepoint, 0x0049);
153 |         assert_eq!(row.status, CaseStatus::Special);
154 |         assert_eq!(row.mapping, vec![0x0131]);
155 |     }
156 | }
157 | 


--------------------------------------------------------------------------------
/ucd-parse/src/core_properties.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `DerivedCoreProperties.txt` file.
12 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
13 | pub struct CoreProperty {
14 |     /// The codepoint or codepoint range for this entry.
15 |     pub codepoints: Codepoints,
16 |     /// The property name assigned to the codepoints in this entry.
17 |     pub property: String,
18 | }
19 | 
20 | impl UcdFile for CoreProperty {
21 |     fn relative_file_path() -> &'static Path {
22 |         Path::new("DerivedCoreProperties.txt")
23 |     }
24 | }
25 | 
26 | impl UcdFileByCodepoint for CoreProperty {
27 |     fn codepoints(&self) -> CodepointIter {
28 |         self.codepoints.into_iter()
29 |     }
30 | }
31 | 
32 | impl std::str::FromStr for CoreProperty {
33 |     type Err = Error;
34 | 
35 |     fn from_str(line: &str) -> Result<CoreProperty, Error> {
36 |         let (codepoints, property) = parse_codepoint_association(line)?;
37 |         Ok(CoreProperty { codepoints, property: property.to_string() })
38 |     }
39 | }
40 | 
41 | #[cfg(test)]
42 | mod tests {
43 |     use super::CoreProperty;
44 | 
45 |     #[test]
46 |     fn parse_single() {
47 |         let line =
48 |             "1163D         ; Case_Ignorable # Mn       MODI SIGN ANUSVARA\n";
49 |         let row: CoreProperty = line.parse().unwrap();
50 |         assert_eq!(row.codepoints, 0x1163D);
51 |         assert_eq!(row.property, "Case_Ignorable");
52 |     }
53 | 
54 |     #[test]
55 |     fn parse_range() {
56 |         let line = "11133..11134  ; Grapheme_Link # Mn   [2] CHAKMA VIRAMA..CHAKMA MAAYYAA\n";
57 |         let row: CoreProperty = line.parse().unwrap();
58 |         assert_eq!(row.codepoints, (0x11133, 0x11134));
59 |         assert_eq!(row.property, "Grapheme_Link");
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/ucd-parse/src/derived_normalization_properties.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `DerivedNormalizationProps.txt` file.
12 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
13 | pub struct DerivedNormalizationProperty {
14 |     /// The codepoint or codepoint range for this entry.
15 |     pub codepoints: Codepoints,
16 |     /// The property name assigned to the codepoints in this entry.
17 |     pub property: String,
18 | }
19 | 
20 | impl UcdFile for DerivedNormalizationProperty {
21 |     fn relative_file_path() -> &'static Path {
22 |         Path::new("DerivedNormalizationProps.txt")
23 |     }
24 | }
25 | 
26 | impl UcdFileByCodepoint for DerivedNormalizationProperty {
27 |     fn codepoints(&self) -> CodepointIter {
28 |         self.codepoints.into_iter()
29 |     }
30 | }
31 | 
32 | impl std::str::FromStr for DerivedNormalizationProperty {
33 |     type Err = Error;
34 | 
35 |     fn from_str(line: &str) -> Result<DerivedNormalizationProperty, Error> {
36 |         let (codepoints, property) = parse_codepoint_association(line)?;
37 |         Ok(DerivedNormalizationProperty {
38 |             codepoints,
39 |             property: property.to_string(),
40 |         })
41 |     }
42 | }
43 | 
44 | #[cfg(test)]
45 | mod tests {
46 |     use super::DerivedNormalizationProperty;
47 | 
48 |     #[test]
49 |     fn parse_single() {
50 |         let line =
51 |             "00A0          ; Changes_When_NFKC_Casefolded # Zs       NO-BREAK SPACE\n";
52 |         let row: DerivedNormalizationProperty = line.parse().unwrap();
53 |         assert_eq!(row.codepoints, 0xA0);
54 |         assert_eq!(row.property, "Changes_When_NFKC_Casefolded");
55 |     }
56 | 
57 |     #[test]
58 |     fn parse_range() {
59 |         let line = "0041..005A    ; Changes_When_NFKC_Casefolded # L&  [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z\n";
60 |         let row: DerivedNormalizationProperty = line.parse().unwrap();
61 |         assert_eq!(row.codepoints, (0x41, 0x5A));
62 |         assert_eq!(row.property, "Changes_When_NFKC_Casefolded");
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/ucd-parse/src/east_asian_width.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `EastAsianWidth.txt` file, describing the value of the
12 | /// `East_Asian_Width` property.
13 | ///
14 | /// Note: All code points, assigned or unassigned, that are not listed in
15 | /// EastAsianWidth.txt file are given the value "N".
16 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
17 | pub struct EastAsianWidth {
18 |     /// The codepoint or codepoint range for this entry.
19 |     pub codepoints: Codepoints,
20 |     /// One of "A", "F", "H", "N", "Na", "W".
21 |     pub width: String,
22 | }
23 | 
24 | impl UcdFile for EastAsianWidth {
25 |     fn relative_file_path() -> &'static Path {
26 |         Path::new("EastAsianWidth.txt")
27 |     }
28 | }
29 | 
30 | impl UcdFileByCodepoint for EastAsianWidth {
31 |     fn codepoints(&self) -> CodepointIter {
32 |         self.codepoints.into_iter()
33 |     }
34 | }
35 | 
36 | impl std::str::FromStr for EastAsianWidth {
37 |     type Err = Error;
38 | 
39 |     fn from_str(line: &str) -> Result<EastAsianWidth, Error> {
40 |         let (codepoints, width) = parse_codepoint_association(line)?;
41 |         Ok(EastAsianWidth { codepoints, width: width.to_string() })
42 |     }
43 | }
44 | 
45 | #[cfg(test)]
46 | mod tests {
47 |     use super::EastAsianWidth;
48 | 
49 |     #[test]
50 |     fn parse_single() {
51 |         let line = "27E7;Na          # Pe         MATHEMATICAL RIGHT WHITE SQUARE BRACKET\n";
52 |         let row: EastAsianWidth = line.parse().unwrap();
53 |         assert_eq!(row.codepoints, 0x27E7);
54 |         assert_eq!(row.width, "Na");
55 |     }
56 | 
57 |     #[test]
58 |     fn parse_range() {
59 |         let line = "1F57B..1F594;N   # So    [26] LEFT HAND TELEPHONE RECEIVER..REVERSED VICTORY HAND\n";
60 |         let row: EastAsianWidth = line.parse().unwrap();
61 |         assert_eq!(row.codepoints, (0x1F57B, 0x1F594));
62 |         assert_eq!(row.width, "N");
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/ucd-parse/src/emoji_properties.rs:
--------------------------------------------------------------------------------
 1 | use std::path::{Path, PathBuf};
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `emoji-data.txt` file.
12 | ///
13 | /// The `emoji-data.txt` file is the source of truth on several Emoji-related
14 | /// Unicode properties.
15 | ///
16 | /// Note that `emoji-data.txt` is not formally part of the Unicode Character
17 | /// Database. You can download the Emoji data files separately here:
18 | /// https://unicode.org/Public/emoji/
19 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
20 | pub struct EmojiProperty {
21 |     /// The codepoint or codepoint range for this entry.
22 |     pub codepoints: Codepoints,
23 |     /// The property name assigned to the codepoints in this entry.
24 |     pub property: String,
25 | }
26 | 
27 | impl UcdFile for EmojiProperty {
28 |     fn relative_file_path() -> &'static Path {
29 |         Path::new("emoji/emoji-data.txt")
30 |     }
31 | 
32 |     /// Picks the `emoji-data.txt` path to use under `ucd_dir`.
33 |     ///
34 |     /// The standard `emoji/` location wins when it exists; otherwise the
35 |     /// old top-level location is used if that exists. When neither exists,
36 |     /// the standard path is returned.
37 |     fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf {
38 |         let root = ucd_dir.as_ref();
39 |         // The standard location, but only on UCDs from 13.0.0 and up.
40 |         let standard = root.join(Self::relative_file_path());
41 |         if standard.exists() {
42 |             return standard;
43 |         }
44 |         // The old location, directly inside the UCD directory.
45 |         let legacy = root.join("emoji-data.txt");
46 |         if legacy.exists() {
47 |             return legacy;
48 |         }
49 |         // Neither file exists. The returned path might end up in an error
50 |         // message, so report the standard one.
51 |         standard
52 |     }
53 | }
52 | 
53 | impl UcdFileByCodepoint for EmojiProperty {
54 |     fn codepoints(&self) -> CodepointIter {
55 |         self.codepoints.into_iter()
56 |     }
57 | }
58 | 
59 | impl std::str::FromStr for EmojiProperty {
60 |     type Err = Error;
61 | 
62 |     fn from_str(line: &str) -> Result<EmojiProperty, Error> {
63 |         let (codepoints, property) = parse_codepoint_association(line)?;
64 |         Ok(EmojiProperty { codepoints, property: property.to_string() })
65 |     }
66 | }
67 | 
68 | #[cfg(test)]
69 | mod tests {
70 |     use super::EmojiProperty;
71 | 
72 |     #[test]
73 |     fn parse_single() {
74 |         let line = "24C2          ; Emoji                #  1.1  [1] (Ⓜ️)       circled M\n";
75 |         let row: EmojiProperty = line.parse().unwrap();
76 |         assert_eq!(row.codepoints, 0x24C2);
77 |         assert_eq!(row.property, "Emoji");
78 |     }
79 | 
80 |     #[test]
81 |     fn parse_range() {
82 |         let line = "1FA6E..1FFFD  ; Extended_Pictographic#   NA[1424] (🩮️..🿽️)   <reserved-1FA6E>..<reserved-1FFFD>\n";
83 |         let row: EmojiProperty = line.parse().unwrap();
84 |         assert_eq!(row.codepoints, (0x1FA6E, 0x1FFFD));
85 |         assert_eq!(row.property, "Extended_Pictographic");
86 |     }
87 | }
88 | 


--------------------------------------------------------------------------------
/ucd-parse/src/error.rs:
--------------------------------------------------------------------------------
 1 | use std::path::{Path, PathBuf};
 2 | 
 3 | /// Represents any kind of error that can occur while parsing the UCD.
 4 | #[derive(Debug)]
 5 | pub struct Error {
 6 |     pub(crate) kind: ErrorKind,
 7 |     pub(crate) line: Option<u64>,
 8 |     pub(crate) path: Option<PathBuf>,
 9 | }
10 | 
11 | /// The kind of error that occurred while parsing the UCD.
12 | #[derive(Debug)]
13 | pub enum ErrorKind {
14 |     /// An I/O error.
15 |     Io(std::io::Error),
16 |     /// A generic parse error.
17 |     Parse(String),
18 | }
19 | 
20 | impl Error {
21 |     /// Build a parse error carrying `msg`, with no line or path attached.
22 |     pub(crate) fn parse(msg: String) -> Error {
23 |         let kind = ErrorKind::Parse(msg);
24 |         Error { kind, line: None, path: None }
25 |     }
26 | 
27 |     /// Borrow the kind of this error.
28 |     pub fn kind(&self) -> &ErrorKind {
29 |         &self.kind
30 |     }
31 | 
32 |     /// The line number on which this error occurred, when one was recorded.
33 |     pub fn line(&self) -> Option<u64> {
34 |         self.line
35 |     }
36 | 
37 |     /// The file path this error is associated with, when one was recorded.
38 |     pub fn path(&self) -> Option<&Path> {
39 |         self.path.as_ref().map(PathBuf::as_path)
40 |     }
41 | 
42 |     /// Consume this error, yielding its underlying kind.
43 |     pub fn into_kind(self) -> ErrorKind {
44 |         self.kind
45 |     }
46 | 
47 |     /// Reports whether this error was caused by I/O.
48 |     ///
49 |     /// A `true` result guarantees that the underlying `ErrorKind` is
50 |     /// `ErrorKind::Io`.
51 |     pub fn is_io_error(&self) -> bool {
52 |         if let ErrorKind::Io(_) = self.kind {
53 |             true
54 |         } else {
55 |             false
56 |         }
57 |     }
58 | }
57 | 
58 | impl std::error::Error for Error {}
59 | 
60 | impl std::fmt::Display for Error {
61 |     /// Writes an optional location prefix (path and/or line number)
62 |     /// followed by the underlying I/O error or parse message.
63 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
64 |         match (self.path.as_ref(), self.line) {
65 |             (Some(path), Some(line)) => {
66 |                 write!(f, "{}:{}: ", path.display(), line)?
67 |             }
68 |             (Some(path), None) => write!(f, "{}: ", path.display())?,
69 |             (None, Some(line)) => write!(f, "error on line {}: ", line)?,
70 |             (None, None) => {}
71 |         }
72 |         match &self.kind {
73 |             ErrorKind::Io(err) => write!(f, "{}", err),
74 |             ErrorKind::Parse(msg) => write!(f, "{}", msg),
75 |         }
76 |     }
77 | }
77 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_bidi_class.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `extracted/DerivedBidiClass.txt` file.
12 | ///
13 | /// This file gives the derived values of the Bidi_Class property.
14 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
15 | pub struct DerivedBidiClass {
16 |     /// The codepoint or codepoint range for this entry.
17 |     pub codepoints: Codepoints,
18 |     /// The derived Bidi_Class of the codepoints in this entry.
19 |     pub bidi_class: String,
20 | }
21 | 
22 | impl UcdFile for DerivedBidiClass {
23 |     fn relative_file_path() -> &'static Path {
24 |         Path::new("extracted/DerivedBidiClass.txt")
25 |     }
26 | }
27 | 
28 | impl UcdFileByCodepoint for DerivedBidiClass {
29 |     fn codepoints(&self) -> CodepointIter {
30 |         self.codepoints.into_iter()
31 |     }
32 | }
33 | 
34 | impl std::str::FromStr for DerivedBidiClass {
35 |     type Err = Error;
36 | 
37 |     fn from_str(line: &str) -> Result<DerivedBidiClass, Error> {
38 |         let (codepoints, bidi_class) = parse_codepoint_association(line)?;
39 |         Ok(DerivedBidiClass { codepoints, bidi_class: bidi_class.to_string() })
40 |     }
41 | }
42 | 
43 | #[cfg(test)]
44 | mod tests {
45 |     use super::DerivedBidiClass;
46 | 
47 |     #[test]
48 |     fn parse_single() {
49 |         let line = "00B5          ; L # L&       MICRO SIGN\n";
50 |         let row: DerivedBidiClass = line.parse().unwrap();
51 |         assert_eq!(row.codepoints, 0x00B5);
52 |         assert_eq!(row.bidi_class, "L");
53 |     }
54 | 
55 |     #[test]
56 |     fn parse_range() {
57 |         let line = "0030..0039    ; EN # Nd  [10] DIGIT ZERO..DIGIT NINE\n";
58 |         let row: DerivedBidiClass = line.parse().unwrap();
59 |         assert_eq!(row.codepoints, (0x0030, 0x0039));
60 |         assert_eq!(row.bidi_class, "EN");
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_binary_properties.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `extracted/DerivedBinaryProperties.txt` file.
12 | ///
13 | /// This file indicates whether a codepoint has the Bidi_Mirrored property.
14 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
15 | pub struct DerivedBinaryProperties {
16 |     /// The codepoint or codepoint range for this entry.
17 |     pub codepoints: Codepoints,
18 |     /// The derived property of the codepoints in this entry. Currently,
19 |     /// this is always the string "Bidi_Mirrored".
20 |     pub property: String,
21 | }
22 | 
23 | impl UcdFile for DerivedBinaryProperties {
24 |     fn relative_file_path() -> &'static Path {
25 |         Path::new("extracted/DerivedBinaryProperties.txt")
26 |     }
27 | }
28 | 
29 | impl UcdFileByCodepoint for DerivedBinaryProperties {
30 |     fn codepoints(&self) -> CodepointIter {
31 |         self.codepoints.into_iter()
32 |     }
33 | }
34 | 
35 | impl std::str::FromStr for DerivedBinaryProperties {
36 |     type Err = Error;
37 | 
38 |     fn from_str(line: &str) -> Result<DerivedBinaryProperties, Error> {
39 |         let (codepoints, property) = parse_codepoint_association(line)?;
40 |         Ok(DerivedBinaryProperties {
41 |             codepoints,
42 |             property: property.to_string(),
43 |         })
44 |     }
45 | }
46 | 
47 | #[cfg(test)]
48 | mod tests {
49 |     use super::DerivedBinaryProperties;
50 | 
51 |     #[test]
52 |     fn parse_single() {
53 |         let line =
54 |             "0028          ; Bidi_Mirrored # Ps       LEFT PARENTHESIS\n";
55 |         let row: DerivedBinaryProperties = line.parse().unwrap();
56 |         assert_eq!(row.codepoints, 0x0028);
57 |         assert_eq!(row.property, "Bidi_Mirrored");
58 |     }
59 | 
60 |     #[test]
61 |     fn parse_range() {
62 |         let line =  "2A3C..2A3E    ; Bidi_Mirrored # Sm   [3] INTERIOR PRODUCT..Z NOTATION RELATIONAL COMPOSITION\n";
63 |         let row: DerivedBinaryProperties = line.parse().unwrap();
64 |         assert_eq!(row.codepoints, (0x2A3C, 0x2A3E));
65 |         assert_eq!(row.property, "Bidi_Mirrored");
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_combining_class.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `extracted/DerivedCombiningClass.txt` file.
12 | ///
13 | /// This file gives the derived values of the Canonical_Combining_Class
14 | /// property.
15 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
16 | pub struct DerivedCombiningClass {
17 |     /// The codepoint or codepoint range for this entry.
18 |     pub codepoints: Codepoints,
19 |     /// The derived Canonical_Combining_Class of the codepoints in this entry.
20 |     pub combining_class: String,
21 | }
22 | 
23 | impl UcdFile for DerivedCombiningClass {
24 |     fn relative_file_path() -> &'static Path {
25 |         Path::new("extracted/DerivedCombiningClass.txt")
26 |     }
27 | }
28 | 
29 | impl UcdFileByCodepoint for DerivedCombiningClass {
30 |     fn codepoints(&self) -> CodepointIter {
31 |         self.codepoints.into_iter()
32 |     }
33 | }
34 | 
35 | impl std::str::FromStr for DerivedCombiningClass {
36 |     type Err = Error;
37 | 
38 |     fn from_str(line: &str) -> Result<DerivedCombiningClass, Error> {
39 |         let (codepoints, combining_class) = parse_codepoint_association(line)?;
40 |         Ok(DerivedCombiningClass {
41 |             codepoints,
42 |             combining_class: combining_class.to_string(),
43 |         })
44 |     }
45 | }
46 | 
47 | #[cfg(test)]
48 | mod tests {
49 |     use super::DerivedCombiningClass;
50 | 
51 |     #[test]
52 |     fn parse_single() {
53 |         let line = "0020          ; 0 # Zs       SPACE\n";
54 |         let row: DerivedCombiningClass = line.parse().unwrap();
55 |         assert_eq!(row.codepoints, 0x0020);
56 |         assert_eq!(row.combining_class, "0");
57 |     }
58 | 
59 |     #[test]
60 |     fn parse_range() {
61 |         let line =  "1DD1..1DF5    ; 230 # Mn  [37] COMBINING UR ABOVE..COMBINING UP TACK ABOVE\n";
62 |         let row: DerivedCombiningClass = line.parse().unwrap();
63 |         assert_eq!(row.codepoints, (0x1DD1, 0x1DF5));
64 |         assert_eq!(row.combining_class, "230");
65 |     }
66 | }
67 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_decomposition_type.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `extracted/DerivedDecompositionType.txt` file.
12 | ///
13 | /// This file gives the derived values of the Decomposition_Type
14 | /// property.
15 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
16 | pub struct DerivedDecompositionType {
17 |     /// The codepoint or codepoint range for this entry.
18 |     pub codepoints: Codepoints,
19 |     /// The derived Decomposition_Type of the codepoints in this entry.
20 |     pub decomposition_type: String,
21 | }
22 | 
23 | impl UcdFile for DerivedDecompositionType {
24 |     fn relative_file_path() -> &'static Path {
25 |         Path::new("extracted/DerivedDecompositionType.txt")
26 |     }
27 | }
28 | 
29 | impl UcdFileByCodepoint for DerivedDecompositionType {
30 |     fn codepoints(&self) -> CodepointIter {
31 |         self.codepoints.into_iter()
32 |     }
33 | }
34 | 
35 | impl std::str::FromStr for DerivedDecompositionType {
36 |     type Err = Error;
37 | 
38 |     fn from_str(line: &str) -> Result<DerivedDecompositionType, Error> {
39 |         let (codepoints, decomposition_type) =
40 |             parse_codepoint_association(line)?;
41 |         Ok(DerivedDecompositionType {
42 |             codepoints,
43 |             decomposition_type: decomposition_type.to_string(),
44 |         })
45 |     }
46 | }
47 | 
48 | #[cfg(test)]
49 | mod tests {
50 |     use super::DerivedDecompositionType;
51 | 
52 |     #[test]
53 |     fn parse_single() {
54 |         let line = "00A0          ; Nobreak # Zs       NO-BREAK SPACE\n";
55 |         let row: DerivedDecompositionType = line.parse().unwrap();
56 |         assert_eq!(row.codepoints, 0x00A0);
57 |         assert_eq!(row.decomposition_type, "Nobreak");
58 |     }
59 | 
60 |     #[test]
61 |     fn parse_range() {
62 |         let line =  "3070..3071    ; Canonical # Lo   [2] HIRAGANA LETTER BA..HIRAGANA LETTER PA\n";
63 |         let row: DerivedDecompositionType = line.parse().unwrap();
64 |         assert_eq!(row.codepoints, (0x3070, 0x3071));
65 |         assert_eq!(row.decomposition_type, "Canonical");
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_east_asian_width.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `extracted/DerivedEastAsianWidth.txt` file.
12 | ///
13 | /// This file gives the derived values of the East_Asian_Width
14 | /// property.
15 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
16 | pub struct DerivedEastAsianWidth {
17 |     /// The codepoint or codepoint range for this entry.
18 |     pub codepoints: Codepoints,
19 |     /// The derived East_Asian_Width of the codepoints in this entry.
20 |     pub east_asian_width: String,
21 | }
22 | 
23 | impl UcdFile for DerivedEastAsianWidth {
24 |     fn relative_file_path() -> &'static Path {
25 |         Path::new("extracted/DerivedEastAsianWidth.txt")
26 |     }
27 | }
28 | 
29 | impl UcdFileByCodepoint for DerivedEastAsianWidth {
30 |     fn codepoints(&self) -> CodepointIter {
31 |         self.codepoints.into_iter()
32 |     }
33 | }
34 | 
35 | impl std::str::FromStr for DerivedEastAsianWidth {
36 |     type Err = Error;
37 | 
38 |     fn from_str(line: &str) -> Result<DerivedEastAsianWidth, Error> {
39 |         let (codepoints, east_asian_width) =
40 |             parse_codepoint_association(line)?;
41 |         Ok(DerivedEastAsianWidth {
42 |             codepoints,
43 |             east_asian_width: east_asian_width.to_string(),
44 |         })
45 |     }
46 | }
47 | 
48 | #[cfg(test)]
49 | mod tests {
50 |     use super::DerivedEastAsianWidth;
51 | 
52 |     #[test]
53 |     fn parse_single() {
54 |         let line = "00A0          ; N # Zs       NO-BREAK SPACE\n";
55 |         let row: DerivedEastAsianWidth = line.parse().unwrap();
56 |         assert_eq!(row.codepoints, 0x00A0);
57 |         assert_eq!(row.east_asian_width, "N");
58 |     }
59 | 
60 |     #[test]
61 |     fn parse_range() {
62 |         let line =  "FF10..FF19    ; F # Nd  [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE\n";
63 |         let row: DerivedEastAsianWidth = line.parse().unwrap();
64 |         assert_eq!(row.codepoints, (0xFF10, 0xFF19));
65 |         assert_eq!(row.east_asian_width, "F");
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_general_category.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `extracted/DerivedGeneralCategory.txt` file.
12 | ///
13 | /// This file gives the derived values of the General_Category property.
14 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
15 | pub struct DerivedGeneralCategory {
16 |     /// The codepoint or codepoint range for this entry.
17 |     pub codepoints: Codepoints,
18 |     /// The derived General_Category of the codepoints in this entry.
19 |     pub general_category: String,
20 | }
21 | 
22 | impl UcdFile for DerivedGeneralCategory {
23 |     fn relative_file_path() -> &'static Path {
24 |         Path::new("extracted/DerivedGeneralCategory.txt")
25 |     }
26 | }
27 | 
28 | impl UcdFileByCodepoint for DerivedGeneralCategory {
29 |     fn codepoints(&self) -> CodepointIter {
30 |         self.codepoints.into_iter()
31 |     }
32 | }
33 | 
34 | impl std::str::FromStr for DerivedGeneralCategory {
35 |     type Err = Error;
36 | 
37 |     fn from_str(line: &str) -> Result<DerivedGeneralCategory, Error> {
38 |         let (codepoints, general_category) =
39 |             parse_codepoint_association(line)?;
40 |         Ok(DerivedGeneralCategory {
41 |             codepoints,
42 |             general_category: general_category.to_string(),
43 |         })
44 |     }
45 | }
46 | 
47 | #[cfg(test)]
48 | mod tests {
49 |     use super::DerivedGeneralCategory;
50 | 
51 |     #[test]
52 |     fn parse_single() {
53 |         let line = "04D9          ; Ll #       CYRILLIC SMALL LETTER SCHWA\n";
54 |         let row: DerivedGeneralCategory = line.parse().unwrap();
55 |         assert_eq!(row.codepoints, 0x04D9);
56 |         assert_eq!(row.general_category, "Ll");
57 |     }
58 | 
59 |     #[test]
60 |     fn parse_range() {
61 |         let line =  "0660..0669    ; Nd #  [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE";
62 |         let row: DerivedGeneralCategory = line.parse().unwrap();
63 |         assert_eq!(row.codepoints, (0x0660, 0x0669));
64 |         assert_eq!(row.general_category, "Nd");
65 |     }
66 | }
67 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_joining_group.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `extracted/DerivedJoiningGroup.txt` file.
12 | ///
13 | /// This file gives the derived values of the Joining_Group property.
14 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
15 | pub struct DerivedJoiningGroup {
16 |     /// The codepoint or codepoint range for this entry.
17 |     pub codepoints: Codepoints,
18 |     /// The derived Joining_Group of the codepoints in this entry.
19 |     pub joining_group: String,
20 | }
21 | 
22 | impl UcdFile for DerivedJoiningGroup {
23 |     fn relative_file_path() -> &'static Path {
24 |         Path::new("extracted/DerivedJoiningGroup.txt")
25 |     }
26 | }
27 | 
28 | impl UcdFileByCodepoint for DerivedJoiningGroup {
29 |     fn codepoints(&self) -> CodepointIter {
30 |         self.codepoints.into_iter()
31 |     }
32 | }
33 | 
34 | impl std::str::FromStr for DerivedJoiningGroup {
35 |     type Err = Error;
36 | 
37 |     fn from_str(line: &str) -> Result<DerivedJoiningGroup, Error> {
38 |         let (codepoints, joining_group) = parse_codepoint_association(line)?;
39 |         Ok(DerivedJoiningGroup {
40 |             codepoints,
41 |             joining_group: joining_group.to_string(),
42 |         })
43 |     }
44 | }
45 | 
46 | #[cfg(test)]
47 | mod tests {
48 |     use super::DerivedJoiningGroup;
49 | 
50 |     #[test]
51 |     fn parse_single() {
52 |         let line = "0710          ; Alaph # Lo       SYRIAC LETTER ALAPH\n";
53 |         let row: DerivedJoiningGroup = line.parse().unwrap();
54 |         assert_eq!(row.codepoints, 0x0710);
55 |         assert_eq!(row.joining_group, "Alaph");
56 |     }
57 | 
58 |     #[test]
59 |     fn parse_range() {
60 |         let line =  "0633..0634    ; Seen # Lo   [2] ARABIC LETTER SEEN..ARABIC LETTER SHEEN\n";
61 |         let row: DerivedJoiningGroup = line.parse().unwrap();
62 |         assert_eq!(row.codepoints, (0x0633, 0x0634));
63 |         assert_eq!(row.joining_group, "Seen");
64 |     }
65 | }
66 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_joining_type.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `extracted/DerivedJoiningType.txt` file.
12 | ///
13 | /// This file gives the derived values of the Joining_Type property.
14 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
15 | pub struct DerivedJoiningType {
16 |     /// The codepoint or codepoint range for this entry.
17 |     pub codepoints: Codepoints,
18 |     /// The derived Joining_Type of the codepoints in this entry.
19 |     pub joining_type: String,
20 | }
21 | 
22 | impl UcdFile for DerivedJoiningType {
23 |     fn relative_file_path() -> &'static Path {
24 |         Path::new("extracted/DerivedJoiningType.txt")
25 |     }
26 | }
27 | 
28 | impl UcdFileByCodepoint for DerivedJoiningType {
29 |     fn codepoints(&self) -> CodepointIter {
30 |         self.codepoints.into_iter()
31 |     }
32 | }
33 | 
34 | impl std::str::FromStr for DerivedJoiningType {
35 |     type Err = Error;
36 | 
37 |     fn from_str(line: &str) -> Result<DerivedJoiningType, Error> {
38 |         let (codepoints, joining_type) = parse_codepoint_association(line)?;
39 |         Ok(DerivedJoiningType {
40 |             codepoints,
41 |             joining_type: joining_type.to_string(),
42 |         })
43 |     }
44 | }
45 | 
46 | #[cfg(test)]
47 | mod tests {
48 |     use super::DerivedJoiningType;
49 | 
50 |     #[test]
51 |     fn parse_single() {
52 |         let line = "0628          ; D # Lo       ARABIC LETTER BEH\n";
53 |         let row: DerivedJoiningType = line.parse().unwrap();
54 |         assert_eq!(row.codepoints, 0x0628);
55 |         assert_eq!(row.joining_type, "D");
56 |     }
57 | 
58 |     #[test]
59 |     fn parse_range() {
60 |         let line =  "1133B..1133C  ; T # Mn   [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA\n";
61 |         let row: DerivedJoiningType = line.parse().unwrap();
62 |         assert_eq!(row.codepoints, (0x1133B, 0x1133C));
63 |         assert_eq!(row.joining_type, "T");
64 |     }
65 | }
66 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_line_break.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `extracted/DerivedLineBreak.txt` file.
12 | ///
13 | /// This file gives the derived values of the Line_Break property.
14 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
15 | pub struct DerivedLineBreak {
16 |     /// The codepoint or codepoint range for this entry.
17 |     pub codepoints: Codepoints,
18 |     /// The derived Line_Break of the codepoints in this entry.
19 |     pub line_break: String,
20 | }
21 | 
22 | impl UcdFile for DerivedLineBreak {
23 |     fn relative_file_path() -> &'static Path {
24 |         Path::new("extracted/DerivedLineBreak.txt")
25 |     }
26 | }
27 | 
28 | impl UcdFileByCodepoint for DerivedLineBreak {
29 |     fn codepoints(&self) -> CodepointIter {
30 |         self.codepoints.into_iter()
31 |     }
32 | }
33 | 
34 | impl std::str::FromStr for DerivedLineBreak {
35 |     type Err = Error;
36 | 
37 |     fn from_str(line: &str) -> Result<DerivedLineBreak, Error> {
38 |         let (codepoints, line_break) = parse_codepoint_association(line)?;
39 |         Ok(DerivedLineBreak { codepoints, line_break: line_break.to_string() })
40 |     }
41 | }
42 | 
43 | #[cfg(test)]
44 | mod tests {
45 |     use super::DerivedLineBreak;
46 | 
47 |     #[test]
48 |     fn parse_single() {
49 |         let line = "0028          ; OP # Ps       LEFT PARENTHESIS\n";
50 |         let row: DerivedLineBreak = line.parse().unwrap();
51 |         assert_eq!(row.codepoints, 0x0028);
52 |         assert_eq!(row.line_break, "OP");
53 |     }
54 | 
55 |     #[test]
56 |     fn parse_range() {
57 |         let line = "0030..0039    ; NU # Nd  [10] DIGIT ZERO..DIGIT NINE\n";
58 |         let row: DerivedLineBreak = line.parse().unwrap();
59 |         assert_eq!(row.codepoints, (0x0030, 0x0039));
60 |         assert_eq!(row.line_break, "NU");
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_name.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `extracted/DerivedName.txt` file.
12 | ///
13 | /// This file gives the derived values of the Name property.
14 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
15 | pub struct DerivedName {
16 |     /// The codepoint or codepoint range for this entry.
17 |     pub codepoints: Codepoints,
18 |     /// The derived Name of the codepoints in this entry.
19 |     pub name: String,
20 | }
21 | 
22 | impl UcdFile for DerivedName {
23 |     fn relative_file_path() -> &'static Path {
24 |         Path::new("extracted/DerivedName.txt")
25 |     }
26 | }
27 | 
28 | impl UcdFileByCodepoint for DerivedName {
29 |     fn codepoints(&self) -> CodepointIter {
30 |         self.codepoints.into_iter()
31 |     }
32 | }
33 | 
34 | impl std::str::FromStr for DerivedName {
35 |     type Err = Error;
36 | 
37 |     fn from_str(line: &str) -> Result<DerivedName, Error> {
38 |         let (codepoints, name) = parse_codepoint_association(line)?;
39 |         Ok(DerivedName { codepoints, name: name.to_string() })
40 |     }
41 | }
42 | 
43 | #[cfg(test)]
44 | mod tests {
45 |     use super::DerivedName;
46 | 
47 |     #[test]
48 |     fn parse_single() {
49 |         let line = "0021          ; EXCLAMATION MARK\n";
50 |         let row: DerivedName = line.parse().unwrap();
51 |         assert_eq!(row.codepoints, 0x0021);
52 |         assert_eq!(row.name, "EXCLAMATION MARK");
53 |     }
54 | 
55 |     #[test]
56 |     fn parse_range() {
57 |         let line = "3400..4DBF    ; CJK UNIFIED IDEOGRAPH-*\n";
58 |         let row: DerivedName = line.parse().unwrap();
59 |         assert_eq!(row.codepoints, (0x3400, 0x4DBF));
60 |         assert_eq!(row.name, "CJK UNIFIED IDEOGRAPH-*");
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_numeric_type.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `extracted/DerivedNumericType.txt` file.
12 | ///
13 | /// This file gives the derived values of the Numeric_Type property.
14 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
15 | pub struct DerivedNumericType {
16 |     /// The codepoint or codepoint range for this entry.
17 |     pub codepoints: Codepoints,
18 |     /// The derived Numeric_Type of the codepoints in this entry.
19 |     pub numeric_type: String,
20 | }
21 | 
22 | impl UcdFile for DerivedNumericType {
23 |     fn relative_file_path() -> &'static Path {
24 |         Path::new("extracted/DerivedNumericType.txt")
25 |     }
26 | }
27 | 
28 | impl UcdFileByCodepoint for DerivedNumericType {
29 |     fn codepoints(&self) -> CodepointIter {
30 |         self.codepoints.into_iter()
31 |     }
32 | }
33 | 
34 | impl std::str::FromStr for DerivedNumericType {
35 |     type Err = Error;
36 | 
37 |     fn from_str(line: &str) -> Result<DerivedNumericType, Error> {
38 |         let (codepoints, numeric_type) = parse_codepoint_association(line)?;
39 |         Ok(DerivedNumericType {
40 |             codepoints,
41 |             numeric_type: numeric_type.to_string(),
42 |         })
43 |     }
44 | }
45 | 
46 | #[cfg(test)]
47 | mod tests {
48 |     use super::DerivedNumericType;
49 | 
50 |     #[test]
51 |     fn parse_single() {
52 |         let line =
53 |             "2189          ; Numeric # No       VULGAR FRACTION ZERO THIRDS\n";
54 |         let row: DerivedNumericType = line.parse().unwrap();
55 |         assert_eq!(row.codepoints, 0x2189);
56 |         assert_eq!(row.numeric_type, "Numeric");
57 |     }
58 | 
59 |     #[test]
60 |     fn parse_range() {
61 |         let line =  "00B2..00B3    ; Digit # No   [2] SUPERSCRIPT TWO..SUPERSCRIPT THREE\n";
62 |         let row: DerivedNumericType = line.parse().unwrap();
63 |         assert_eq!(row.codepoints, (0x00B2, 0x00B3));
64 |         assert_eq!(row.numeric_type, "Digit");
65 |     }
66 | }
67 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/derived_numeric_values.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{CodepointIter, Codepoints, UcdFile, UcdFileByCodepoint},
 5 |     error::Error,
 6 | };
 7 | 
 8 | /// A single row in the `extracted/DerivedNumericValues.txt` file.
 9 | ///
10 | /// This file gives the derived values of the Numeric_Value property.
11 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
12 | pub struct DerivedNumericValues {
13 |     /// The codepoint or codepoint range for this entry.
14 |     pub codepoints: Codepoints,
15 |     /// The approximate Numeric_Value of the codepoints in this entry,
16 |     /// as a decimal.
17 |     pub numeric_value_decimal: String,
18 |     /// The exact Numeric_Value of the codepoints in this entry, as
19 |     /// a fraction.
20 |     pub numeric_value_fraction: String,
21 | }
22 | 
23 | impl UcdFile for DerivedNumericValues {
24 |     fn relative_file_path() -> &'static Path {
25 |         Path::new("extracted/DerivedNumericValues.txt")
26 |     }
27 | }
28 | 
29 | impl UcdFileByCodepoint for DerivedNumericValues {
30 |     fn codepoints(&self) -> CodepointIter {
31 |         self.codepoints.into_iter()
32 |     }
33 | }
34 | 
35 | impl std::str::FromStr for DerivedNumericValues {
36 |     type Err = Error;
37 | 
38 |     /// Parses a row of the form `codepoints ; decimal ; ; fraction`, where
39 |     /// the third field must be empty. Unmatched lines yield a parse error.
40 |     fn from_str(line: &str) -> Result<DerivedNumericValues, Error> {
41 |         let re_parts = regex!(
42 |             r"(?x)
43 |                 ^
44 |                 \s*(?P<codepoints>[^\s;]+)\s*;
45 |                 \s*(?P<numeric_value_decimal>[^\s;]+)\s*;
46 |                 \s*;
47 |                 \s*(?P<numeric_value_fraction>[^\s;]+)\s*
48 |                 ",
49 |         );
50 | 
51 |         let caps = match re_parts.captures(line.trim()) {
52 |             Some(caps) => caps,
53 |             None => {
54 |                 return err!("invalid DerivedNumericValues line: '{}'", line);
55 |             }
56 |         };
57 |         Ok(DerivedNumericValues {
58 |             codepoints: caps["codepoints"].parse()?,
59 |             numeric_value_decimal: caps["numeric_value_decimal"].to_string(),
60 |             numeric_value_fraction: caps["numeric_value_fraction"]
61 |                 .to_string(),
62 |         })
63 |     }
64 | }
65 | 
66 | #[cfg(test)]
67 | mod tests {
68 |     use super::DerivedNumericValues;
69 | 
70 |     #[test]
71 |     fn parse_single() {
72 |         let line = "0030          ; 0.0 ; ; 0 # Nd       DIGIT ZERO\n";
73 |         let row: DerivedNumericValues = line.parse().unwrap();
74 |         assert_eq!(row.codepoints, 0x0030);
75 |         assert_eq!(row.numeric_value_decimal, "0.0");
76 |         assert_eq!(row.numeric_value_fraction, "0");
77 |     }
78 | 
79 |     #[test]
80 |     fn parse_range() {
81 |         let line =  "11FC9..11FCA  ; 0.0625 ; ; 1/16 # No   [2] TAMIL FRACTION ONE SIXTEENTH-1..TAMIL FRACTION ONE SIXTEENTH-2\n";
82 |         let row: DerivedNumericValues = line.parse().unwrap();
83 |         assert_eq!(row.codepoints, (0x11FC9, 0x11FCA));
84 |         assert_eq!(row.numeric_value_decimal, "0.0625");
85 |         assert_eq!(row.numeric_value_fraction, "1/16");
86 |     }
87 | }
88 | 


--------------------------------------------------------------------------------
/ucd-parse/src/extracted/mod.rs:
--------------------------------------------------------------------------------
 1 | /*!
 2 | Types for parsing files in the `extracted` subdirectory of the Unicode
 3 | Character Database download.
 4 | 
 5 | These are placed here, rather than at the top level, to help keep the number of
 6 | types in any given module manageable.
 7 | */
 8 | 
 9 | pub use self::{
10 |     derived_bidi_class::DerivedBidiClass,
11 |     derived_binary_properties::DerivedBinaryProperties,
12 |     derived_combining_class::DerivedCombiningClass,
13 |     derived_decomposition_type::DerivedDecompositionType,
14 |     derived_east_asian_width::DerivedEastAsianWidth,
15 |     derived_general_category::DerivedGeneralCategory,
16 |     derived_joining_group::DerivedJoiningGroup,
17 |     derived_joining_type::DerivedJoiningType,
18 |     derived_line_break::DerivedLineBreak, derived_name::DerivedName,
19 |     derived_numeric_type::DerivedNumericType,
20 |     derived_numeric_values::DerivedNumericValues,
21 | };
22 | 
23 | mod derived_bidi_class;
24 | mod derived_binary_properties;
25 | mod derived_combining_class;
26 | mod derived_decomposition_type;
27 | mod derived_east_asian_width;
28 | mod derived_general_category;
29 | mod derived_joining_group;
30 | mod derived_joining_type;
31 | mod derived_line_break;
32 | mod derived_name;
33 | mod derived_numeric_type;
34 | mod derived_numeric_values;
35 | 


--------------------------------------------------------------------------------
/ucd-parse/src/grapheme_cluster_break.rs:
--------------------------------------------------------------------------------
  1 | use std::path::Path;
  2 | 
  3 | use crate::{
  4 |     common::{
  5 |         parse_break_test, parse_codepoint_association, CodepointIter,
  6 |         Codepoints, UcdFile, UcdFileByCodepoint,
  7 |     },
  8 |     error::Error,
  9 | };
 10 | 
 11 | /// A single row in the `auxiliary/GraphemeBreakProperty.txt` file.
 12 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
 13 | pub struct GraphemeClusterBreak {
 14 |     /// The codepoint or codepoint range for this entry.
 15 |     pub codepoints: Codepoints,
 16 |     /// The property value assigned to the codepoints in this entry.
 17 |     pub value: String,
 18 | }
 19 | 
 20 | impl UcdFile for GraphemeClusterBreak {
 21 |     fn relative_file_path() -> &'static Path {
 22 |         Path::new("auxiliary/GraphemeBreakProperty.txt")
 23 |     }
 24 | }
 25 | 
 26 | impl UcdFileByCodepoint for GraphemeClusterBreak {
 27 |     fn codepoints(&self) -> CodepointIter {
 28 |         self.codepoints.into_iter()
 29 |     }
 30 | }
 31 | 
 32 | impl std::str::FromStr for GraphemeClusterBreak {
 33 |     type Err = Error;
 34 | 
 35 |     fn from_str(line: &str) -> Result<GraphemeClusterBreak, Error> {
 36 |         let (codepoints, value) = parse_codepoint_association(line)?;
 37 |         Ok(GraphemeClusterBreak { codepoints, value: value.to_string() })
 38 |     }
 39 | }
 40 | 
 41 | /// A single row in the `auxiliary/GraphemeBreakTest.txt` file.
 42 | ///
 43 | /// This file defines tests for the grapheme cluster break algorithm.
 44 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
 45 | pub struct GraphemeClusterBreakTest {
 46 |     /// Each string is a UTF-8 encoded group of codepoints that make up a
 47 |     /// single grapheme cluster.
 48 |     pub grapheme_clusters: Vec<String>,
 49 |     /// A human readable description of this test.
 50 |     pub comment: String,
 51 | }
 52 | 
 53 | impl UcdFile for GraphemeClusterBreakTest {
 54 |     fn relative_file_path() -> &'static Path {
 55 |         Path::new("auxiliary/GraphemeBreakTest.txt")
 56 |     }
 57 | }
 58 | 
 59 | impl std::str::FromStr for GraphemeClusterBreakTest {
 60 |     type Err = Error;
 61 |     /// Delegates to `parse_break_test`; its groups become the clusters.
 62 |     fn from_str(line: &str) -> Result<Self, Self::Err> {
 63 |         let (grapheme_clusters, comment) = parse_break_test(line)?;
 64 |         Ok(Self { grapheme_clusters, comment })
 65 |     }
 66 | }
 67 | 
 68 | #[cfg(test)]
 69 | mod tests {
 70 |     use super::{GraphemeClusterBreak, GraphemeClusterBreakTest};
 71 | 
 72 |     #[test]
 73 |     fn parse_single() {
 74 |         let line = "093B          ; SpacingMark # Mc       DEVANAGARI VOWEL SIGN OOE\n";
 75 |         let row: GraphemeClusterBreak = line.parse().unwrap();
 76 |         assert_eq!(row.codepoints, 0x093B);
 77 |         assert_eq!(row.value, "SpacingMark");
 78 |     }
 79 | 
 80 |     #[test]
 81 |     fn parse_range() {
 82 |         let line = "1F1E6..1F1FF  ; Regional_Indicator # So  [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z\n";
 83 |         let row: GraphemeClusterBreak = line.parse().unwrap();
 84 |         assert_eq!(row.codepoints, (0x1F1E6, 0x1F1FF));
 85 |         assert_eq!(row.value, "Regional_Indicator");
 86 |     }
 87 | 
 88 |     #[test]
 89 |     fn parse_test() {
 90 |         let line = "÷ 0061 × 1F3FF ÷ 1F476 × 200D × 1F6D1 ÷	#  ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]\n";
 91 | 
 92 |         let row: GraphemeClusterBreakTest = line.parse().unwrap();
 93 |         assert_eq!(
 94 |             row.grapheme_clusters,
 95 |             vec!["\u{0061}\u{1F3FF}", "\u{1F476}\u{200D}\u{1F6D1}",]
 96 |         );
 97 |         assert!(row.comment.starts_with("÷ [0.2] LATIN SMALL LETTER A"));
 98 |     }
 99 | }
100 | 


--------------------------------------------------------------------------------
/ucd-parse/src/jamo_short_name.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
 5 |     error::Error,
 6 | };
 7 | 
 8 | /// A single row in the `Jamo.txt` file.
 9 | ///
10 | /// The `Jamo.txt` file defines the `Jamo_Short_Name` property.
11 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
12 | pub struct JamoShortName {
13 |     /// The codepoint corresponding to this row.
14 |     pub codepoint: Codepoint,
15 |     /// The actual "Jamo Short Name." This string contains at most 3 bytes and
16 |     /// may be empty.
17 |     pub name: String,
18 | }
19 | 
20 | impl UcdFile for JamoShortName {
21 |     fn relative_file_path() -> &'static Path {
22 |         Path::new("Jamo.txt")
23 |     }
24 | }
25 | 
26 | impl UcdFileByCodepoint for JamoShortName {
27 |     fn codepoints(&self) -> CodepointIter {
28 |         self.codepoint.into_iter()
29 |     }
30 | }
31 | 
32 | impl std::str::FromStr for JamoShortName {
33 |     type Err = Error;
34 |     /// Parses a `codepoint; name` row; the name is allowed to be empty.
35 |     fn from_str(line: &str) -> Result<Self, Self::Err> {
36 |         let pattern = regex!(
37 |             r"(?x)
38 |                 ^
39 |                 (?P<codepoint>[A-Z0-9]+);
40 |                 \s*
41 |                 (?P<name>[A-Z]*)
42 |                 ",
43 |         );
44 |         // Surrounding whitespace (e.g. the line terminator) is ignored.
45 |         let Some(fields) = pattern.captures(line.trim()) else {
46 |             return err!("invalid Jamo_Short_name line");
47 |         };
48 |         // The codepoint text is handed to `Codepoint`'s own parser.
49 |         let codepoint: Codepoint = fields["codepoint"].parse()?;
50 |         // `name` is a mandatory group (it may match nothing), so this holds.
51 |         let name = fields["name"].to_string();
52 |         Ok(Self { codepoint, name })
53 |     }
54 | }
55 | 
56 | #[cfg(test)]
57 | mod tests {
58 |     use super::JamoShortName;
59 | 
60 |     #[test]
61 |     fn parse1() {
62 |         let line = "1164; YAE # HANGUL JUNGSEONG YAE\n";
63 |         let row: JamoShortName = line.parse().unwrap();
64 |         assert_eq!(row.codepoint, 0x1164);
65 |         assert_eq!(row.name, "YAE");
66 |     }
67 | 
68 |     #[test]
69 |     fn parse2() {
70 |         let line = "110B;     # HANGUL CHOSEONG IEUNG\n";
71 |         let row: JamoShortName = line.parse().unwrap();
72 |         assert_eq!(row.codepoint, 0x110B);
73 |         assert_eq!(row.name, "");
74 |     }
75 | }
76 | 


--------------------------------------------------------------------------------
/ucd-parse/src/lib.rs:
--------------------------------------------------------------------------------
 1 | /*!
 2 | A library for parsing the Unicode character database.
 3 | */
 4 | 
 5 | #![deny(missing_docs)]
 6 | 
 7 | pub use crate::{
 8 |     common::{
 9 |         parse, parse_by_codepoint, parse_many_by_codepoint,
10 |         ucd_directory_version, Codepoint, CodepointIter, CodepointRange,
11 |         Codepoints, UcdFile, UcdFileByCodepoint, UcdLineParser,
12 |     },
13 |     error::{Error, ErrorKind},
14 | };
15 | 
16 | pub use crate::{
17 |     age::Age,
18 |     arabic_shaping::ArabicShaping,
19 |     bidi_mirroring_glyph::BidiMirroring,
20 |     case_folding::{CaseFold, CaseStatus},
21 |     core_properties::CoreProperty,
22 |     derived_normalization_properties::DerivedNormalizationProperty,
23 |     east_asian_width::EastAsianWidth,
24 |     emoji_properties::EmojiProperty,
25 |     grapheme_cluster_break::{GraphemeClusterBreak, GraphemeClusterBreakTest},
26 |     jamo_short_name::JamoShortName,
27 |     line_break::LineBreakTest,
28 |     name_aliases::{NameAlias, NameAliasLabel},
29 |     prop_list::Property,
30 |     property_aliases::PropertyAlias,
31 |     property_value_aliases::PropertyValueAlias,
32 |     script_extensions::ScriptExtension,
33 |     scripts::Script,
34 |     sentence_break::{SentenceBreak, SentenceBreakTest},
35 |     special_casing::SpecialCaseMapping,
36 |     unicode_data::{
37 |         UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
38 |         UnicodeDataExpander, UnicodeDataNumeric,
39 |     },
40 |     word_break::{WordBreak, WordBreakTest},
41 | };
42 | 
43 | macro_rules! err {
44 |     ($($tt:tt)*) => {
45 |         Err(crate::error::Error::parse(format!($($tt)*)))
46 |     }
47 | }
48 | 
49 | macro_rules! regex {
50 |     ($re:literal $(,)?) => {{
51 |         // Every expansion site owns one cell: the pattern is compiled on
52 |         // first use and reused afterwards; an invalid literal panics then.
53 |         type Lazy = std::sync::OnceLock<regex_lite::Regex>;
54 |         static COMPILED: Lazy = Lazy::new();
55 |         COMPILED.get_or_init(|| regex_lite::Regex::new($re).unwrap())
56 |     }};
57 | }
58 | 
59 | pub mod extracted;
60 | 
61 | mod common;
62 | mod error;
63 | 
64 | mod age;
65 | mod arabic_shaping;
66 | mod bidi_mirroring_glyph;
67 | mod case_folding;
68 | mod core_properties;
69 | mod derived_normalization_properties;
70 | mod east_asian_width;
71 | mod emoji_properties;
72 | mod grapheme_cluster_break;
73 | mod jamo_short_name;
74 | mod line_break;
75 | mod name_aliases;
76 | mod prop_list;
77 | mod property_aliases;
78 | mod property_value_aliases;
79 | mod script_extensions;
80 | mod scripts;
81 | mod sentence_break;
82 | mod special_casing;
83 | mod unicode_data;
84 | mod word_break;
85 | 


--------------------------------------------------------------------------------
/ucd-parse/src/line_break.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{parse_break_test, UcdFile},
 5 |     error::Error,
 6 | };
 7 | 
 8 | /// A single row in the `auxiliary/LineBreakTest.txt` file.
 9 | ///
10 | /// This file defines tests for the line break algorithm.
11 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
12 | pub struct LineBreakTest {
13 |     /// Each string is a UTF-8 encoded group of codepoints that make up a
14 |     /// single line.
15 |     pub lines: Vec<String>,
16 |     /// A human readable description of this test.
17 |     pub comment: String,
18 | }
19 | 
20 | impl UcdFile for LineBreakTest {
21 |     fn relative_file_path() -> &'static Path {
22 |         Path::new("auxiliary/LineBreakTest.txt")
23 |     }
24 | }
25 | 
26 | impl std::str::FromStr for LineBreakTest {
27 |     type Err = Error;
28 |     /// Delegates to `parse_break_test`; its groups are stored as `lines`.
29 |     fn from_str(line: &str) -> Result<Self, Self::Err> {
30 |         let (lines, comment) = parse_break_test(line)?;
31 |         Ok(Self { lines, comment })
32 |     }
33 | }
34 | 
35 | #[cfg(test)]
36 | mod tests {
37 |     use super::LineBreakTest;
38 | 
39 |     #[test]
40 |     fn parse_test() {
41 |         let line = "× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷   #  × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [30.13] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]";
42 | 
43 |         let row: LineBreakTest = line.parse().unwrap();
44 |         assert_eq!(
45 |             row.lines,
46 |             vec!["\u{1F1F7}\u{1F1FA}", "\u{1F1F8}\u{1F1EA}",]
47 |         );
48 |         assert!(row.comment.ends_with("(RI) ÷ [0.3]"));
49 |     }
50 | }
51 | 


--------------------------------------------------------------------------------
/ucd-parse/src/name_aliases.rs:
--------------------------------------------------------------------------------
  1 | use std::path::Path;
  2 | 
  3 | use crate::{
  4 |     common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
  5 |     error::Error,
  6 | };
  7 | 
  8 | /// A single row in the `NameAliases.txt` file.
  9 | ///
 10 | /// Note that there are multiple rows for some codepoint. Each row provides a
 11 | /// new alias.
 12 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
 13 | pub struct NameAlias {
 14 |     /// The codepoint corresponding to this row.
 15 |     pub codepoint: Codepoint,
 16 |     /// The alias.
 17 |     pub alias: String,
 18 |     /// The label of this alias.
 19 |     pub label: NameAliasLabel,
 20 | }
 21 | 
 22 | impl UcdFile for NameAlias {
 23 |     fn relative_file_path() -> &'static Path {
 24 |         Path::new("NameAliases.txt")
 25 |     }
 26 | }
 27 | 
 28 | impl UcdFileByCodepoint for NameAlias {
 29 |     fn codepoints(&self) -> CodepointIter {
 30 |         self.codepoint.into_iter()
 31 |     }
 32 | }
 33 | 
 34 | impl std::str::FromStr for NameAlias {
 35 |     type Err = Error;
 36 |     /// Parses a `codepoint;alias;label` row into a `NameAlias`.
 37 |     fn from_str(line: &str) -> Result<Self, Self::Err> {
 38 |         let pattern = regex!(
 39 |             r"(?x)
 40 |                 ^
 41 |                 (?P<codepoint>[A-Z0-9]+);
 42 |                 \s*
 43 |                 (?P<alias>[^;]+);
 44 |                 \s*
 45 |                 (?P<label>\S+)
 46 |                 ",
 47 |         );
 48 |         // Surrounding whitespace (e.g. the line terminator) is ignored.
 49 |         let Some(fields) = pattern.captures(line.trim()) else {
 50 |             return err!("invalid NameAliases line");
 51 |         };
 52 |         // Fields are converted in row order, so a bad codepoint is
 53 |         // reported before a bad label.
 54 |         let codepoint: Codepoint = fields["codepoint"].parse()?;
 55 |         let alias = fields["alias"].to_string();
 56 |         let label: NameAliasLabel = fields["label"].parse()?;
 57 |         Ok(Self { codepoint, alias, label })
 58 |     }
 59 | }
 60 | 
 61 | /// The label of a name alias.
 62 | #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 63 | pub enum NameAliasLabel {
 64 |     /// Corrections for serious problems in a character name.
 65 |     Correction,
 66 |     /// ISO 6429 names for C0 and C1 control functions and other commonly
 67 |     /// occurring names for control codes.
 68 |     Control,
 69 |     /// A few widely used alternate names for format characters.
 70 |     Alternate,
 71 |     /// Several documented labels for C1 control code points which were
 72 |     /// never actually approved in any standard.
 73 |     Figment,
 74 |     /// Commonly occurring abbreviations (or acronyms) for control codes,
 75 |     /// format characters, spaces and variation selectors.
 76 |     Abbreviation,
 77 | }
 78 | 
 79 | impl Default for NameAliasLabel {
 80 |     fn default() -> Self {
 81 |         // An arbitrary pick; it lets `NameAlias` derive `Default`.
 82 |         Self::Correction
 83 |     }
 84 | }
 85 | 
 86 | impl std::str::FromStr for NameAliasLabel {
 87 |     type Err = Error;
 88 |     /// Maps a label string to its variant; matching is exact.
 89 |     fn from_str(s: &str) -> Result<Self, Self::Err> {
 90 |         Ok(match s {
 91 |             "correction" => Self::Correction,
 92 |             "control" => Self::Control,
 93 |             "alternate" => Self::Alternate,
 94 |             "figment" => Self::Figment,
 95 |             "abbreviation" => Self::Abbreviation,
 96 |             other => return err!("unknown name alias label: '{}'", other),
 97 |         })
 98 |     }
 99 | }
100 | 
101 | #[cfg(test)]
102 | mod tests {
103 |     use super::{NameAlias, NameAliasLabel};
104 | 
105 |     #[test]
106 |     fn parse1() {
107 |         let line = "0000;NULL;control\n";
108 |         let row: NameAlias = line.parse().unwrap();
109 |         assert_eq!(row.codepoint, 0x0);
110 |         assert_eq!(row.alias, "NULL");
111 |         assert_eq!(row.label, NameAliasLabel::Control);
112 |     }
113 | 
114 |     #[test]
115 |     fn parse2() {
116 |         let line = "000B;VERTICAL TABULATION;control\n";
117 |         let row: NameAlias = line.parse().unwrap();
118 |         assert_eq!(row.codepoint, 0xB);
119 |         assert_eq!(row.alias, "VERTICAL TABULATION");
120 |         assert_eq!(row.label, NameAliasLabel::Control);
121 |     }
122 | 
123 |     #[test]
124 |     fn parse3() {
125 |         let line = "0081;HIGH OCTET PRESET;figment\n";
126 |         let row: NameAlias = line.parse().unwrap();
127 |         assert_eq!(row.codepoint, 0x81);
128 |         assert_eq!(row.alias, "HIGH OCTET PRESET");
129 |         assert_eq!(row.label, NameAliasLabel::Figment);
130 |     }
131 | 
132 |     #[test]
133 |     fn parse4() {
134 |         let line = "E01EF;VS256;abbreviation\n";
135 |         let row: NameAlias = line.parse().unwrap();
136 |         assert_eq!(row.codepoint, 0xE01EF);
137 |         assert_eq!(row.alias, "VS256");
138 |         assert_eq!(row.label, NameAliasLabel::Abbreviation);
139 |     }
140 | }
141 | 


--------------------------------------------------------------------------------
/ucd-parse/src/prop_list.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `PropList.txt` file.
12 | ///
13 | /// The `PropList.txt` file is the source of truth on several Unicode
14 | /// properties.
15 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
16 | pub struct Property {
17 |     /// The codepoint or codepoint range for this entry.
18 |     pub codepoints: Codepoints,
19 |     /// The property name assigned to the codepoints in this entry.
20 |     pub property: String,
21 | }
22 | 
23 | impl UcdFile for Property {
24 |     fn relative_file_path() -> &'static Path {
25 |         Path::new("PropList.txt")
26 |     }
27 | }
28 | 
29 | impl UcdFileByCodepoint for Property {
30 |     fn codepoints(&self) -> CodepointIter {
31 |         self.codepoints.into_iter()
32 |     }
33 | }
34 | 
35 | impl std::str::FromStr for Property {
36 |     type Err = Error;
37 |     /// Stores the text paired with the codepoints as the property name.
38 |     fn from_str(line: &str) -> Result<Self, Self::Err> {
39 |         let (codepoints, name) = parse_codepoint_association(line)?;
40 |         Ok(Self { codepoints, property: name.to_string() })
41 |     }
42 | }
43 | 
44 | #[cfg(test)]
45 | mod tests {
46 |     use super::Property;
47 | 
48 |     #[test]
49 |     fn parse_single() {
50 |         let line =
51 |             "061C          ; Bidi_Control # Cf       ARABIC LETTER MARK\n";
52 |         let row: Property = line.parse().unwrap();
53 |         assert_eq!(row.codepoints, 0x061C);
54 |         assert_eq!(row.property, "Bidi_Control");
55 |     }
56 | 
57 |     #[test]
58 |     fn parse_range() {
59 |         let line = "0009..000D    ; White_Space # Cc   [5] <control-0009>..<control-000D>\n";
60 |         let row: Property = line.parse().unwrap();
61 |         assert_eq!(row.codepoints, (0x0009, 0x000D));
62 |         assert_eq!(row.property, "White_Space");
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/ucd-parse/src/property_aliases.rs:
--------------------------------------------------------------------------------
  1 | use std::path::Path;
  2 | 
  3 | use crate::{common::UcdFile, error::Error};
  4 | 
  5 | /// A single row in the `PropertyAliases.txt` file.
  6 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
  7 | pub struct PropertyAlias {
  8 |     /// An abbreviation for this property.
  9 |     pub abbreviation: String,
 10 |     /// The "long" name of this property.
 11 |     pub long: String,
 12 |     /// Additional aliases (if present).
 13 |     pub aliases: Vec<String>,
 14 | }
 15 | 
 16 | impl UcdFile for PropertyAlias {
 17 |     fn relative_file_path() -> &'static Path {
 18 |         Path::new("PropertyAliases.txt")
 19 |     }
 20 | }
 21 | 
 22 | impl std::str::FromStr for PropertyAlias {
 23 |     type Err = Error;
 24 |     /// Parses an `abbrev ; long [; alias]*` row into a `PropertyAlias`.
 25 |     fn from_str(line: &str) -> Result<Self, Self::Err> {
 26 |         let row_pattern = regex!(
 27 |             r"(?x)
 28 |                 ^
 29 |                 \s*(?P<abbrev>[^\s;]+)\s*;
 30 |                 \s*(?P<long>[^\s;]+)\s*
 31 |                 (?:;(?P<aliases>.*))?
 32 |                 ",
 33 |         );
 34 |         let alias_pattern = regex!(r"\s*(?P<alias>[^\s;]+)\s*;?\s*");
 35 | 
 36 |         let Some(fields) = row_pattern.captures(line.trim()) else {
 37 |             return err!("invalid PropertyAliases line: '{}'", line);
 38 |         };
 39 |         // The optional tail after the second `;` holds the extra aliases;
 40 |         // when the tail is absent the row simply has none.
 41 |         let aliases: Vec<String> = match fields.name("aliases") {
 42 |             None => Vec::new(),
 43 |             Some(tail) => alias_pattern
 44 |                 .captures_iter(tail.as_str())
 45 |                 .map(|found| found["alias"].to_string())
 46 |                 .collect(),
 47 |         };
 48 |         // `abbrev` and `long` are mandatory groups of the row pattern.
 49 |         let abbreviation = fields["abbrev"].to_string();
 50 |         let long = fields["long"].to_string();
 51 |         Ok(Self { abbreviation, long, aliases })
 52 |     }
 53 | }
 54 | 
 55 | #[cfg(test)]
 56 | mod tests {
 57 |     use super::PropertyAlias;
 58 | 
 59 |     #[test]
 60 |     fn parse1() {
 61 |         let line = "cjkAccountingNumeric     ; kAccountingNumeric\n";
 62 |         let row: PropertyAlias = line.parse().unwrap();
 63 |         assert_eq!(row.abbreviation, "cjkAccountingNumeric");
 64 |         assert_eq!(row.long, "kAccountingNumeric");
 65 |         assert!(row.aliases.is_empty());
 66 |     }
 67 | 
 68 |     #[test]
 69 |     fn parse2() {
 70 |         let line = "nv                       ; Numeric_Value\n";
 71 |         let row: PropertyAlias = line.parse().unwrap();
 72 |         assert_eq!(row.abbreviation, "nv");
 73 |         assert_eq!(row.long, "Numeric_Value");
 74 |         assert!(row.aliases.is_empty());
 75 |     }
 76 | 
 77 |     #[test]
 78 |     fn parse3() {
 79 |         let line =
 80 |             "scf                      ; Simple_Case_Folding         ; sfc\n";
 81 |         let row: PropertyAlias = line.parse().unwrap();
 82 |         assert_eq!(row.abbreviation, "scf");
 83 |         assert_eq!(row.long, "Simple_Case_Folding");
 84 |         assert_eq!(row.aliases, vec!["sfc"]);
 85 |     }
 86 | 
 87 |     #[test]
 88 |     fn parse4() {
 89 |         let line = "cjkRSUnicode             ; kRSUnicode                  ; Unicode_Radical_Stroke; URS\n";
 90 |         let row: PropertyAlias = line.parse().unwrap();
 91 |         assert_eq!(row.abbreviation, "cjkRSUnicode");
 92 |         assert_eq!(row.long, "kRSUnicode");
 93 |         assert_eq!(row.aliases, vec!["Unicode_Radical_Stroke", "URS"]);
 94 |     }
 95 | 
 96 |     #[test]
 97 |     fn parse5() {
 98 |         let line = "isc                      ; ISO_Comment";
 99 |         let row: PropertyAlias = line.parse().unwrap();
100 |         assert_eq!(row.abbreviation, "isc");
101 |         assert_eq!(row.long, "ISO_Comment");
102 |         assert!(row.aliases.is_empty());
103 |     }
104 | }
105 | 


--------------------------------------------------------------------------------
/ucd-parse/src/property_value_aliases.rs:
--------------------------------------------------------------------------------
  1 | use std::path::Path;
  2 | 
  3 | use crate::{common::UcdFile, error::Error};
  4 | 
  5 | /// A single row in the `PropertyValueAliases.txt` file.
  6 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
  7 | pub struct PropertyValueAlias {
  8 |     /// The property name for which this value alias applies.
  9 |     pub property: String,
 10 |     /// A numeric abbreviation for this property value, if present. (This is
 11 |     /// seemingly only present for the `ccc`/`Canonical_Combining_Class`
 12 |     /// property.)
 13 |     pub numeric: Option<u8>,
 14 |     /// An abbreviation for this property value.
 15 |     pub abbreviation: String,
 16 |     /// The "long" form of this property value.
 17 |     pub long: String,
 18 |     /// Additional value aliases (if present).
 19 |     pub aliases: Vec<String>,
 20 | }
 21 | 
 22 | impl UcdFile for PropertyValueAlias {
 23 |     fn relative_file_path() -> &'static Path {
 24 |         Path::new("PropertyValueAliases.txt")
 25 |     }
 26 | }
 27 | 
 28 | impl std::str::FromStr for PropertyValueAlias {
 29 |     type Err = Error;
 30 | 
 31 |     /// Parses one row of `PropertyValueAliases.txt`. Rows for `ccc` carry
 32 |     /// an extra numeric field, stored in `numeric`; all other rows leave
 33 |     /// it as `None` and may list additional aliases.
 34 |     fn from_str(line: &str) -> Result<PropertyValueAlias, Error> {
 35 |         let re_parts = regex!(
 36 |             r"(?x)
 37 |                 ^
 38 |                 \s*(?P<prop>[^\s;]+)\s*;
 39 |                 \s*(?P<abbrev>[^\s;]+)\s*;
 40 |                 \s*(?P<long>[^\s;]+)\s*
 41 |                 (?:;(?P<aliases>.*))?
 42 |                 ",
 43 |         );
 44 |         let re_parts_ccc = regex!(
 45 |             r"(?x)
 46 |                 ^
 47 |                 ccc;
 48 |                 \s*(?P<num_class>[0-9]+)\s*;
 49 |                 \s*(?P<abbrev>[^\s;]+)\s*;
 50 |                 \s*(?P<long>[^\s;]+)
 51 |                 ",
 52 |         );
 53 |         let re_aliases = regex!(r"\s*(?P<alias>[^\s;]+)\s*;?\s*");
 54 | 
 55 |         // Trim once so the `ccc` check and both patterns see the same text;
 56 |         // an indented `ccc` row would otherwise lose its numeric field.
 57 |         let line = line.trim();
 58 |         if line.starts_with("ccc;") {
 59 |             let Some(caps) = re_parts_ccc.captures(line) else {
 60 |                 return err!("invalid PropertyValueAliases (ccc) line");
 61 |             };
 62 |             let n = match caps["num_class"].parse() {
 63 |                 Ok(n) => n,
 64 |                 Err(err) => {
 65 |                     return err!(
 66 |                         "failed to parse ccc number '{}': {}",
 67 |                         &caps["num_class"],
 68 |                         err
 69 |                     )
 70 |                 }
 71 |             };
 72 |             return Ok(PropertyValueAlias {
 73 |                 property: "ccc".to_string(),
 74 |                 numeric: Some(n),
 75 |                 abbreviation: caps["abbrev"].to_string(),
 76 |                 long: caps["long"].to_string(),
 77 |                 aliases: vec![],
 78 |             });
 79 |         }
 80 | 
 81 |         let Some(caps) = re_parts.captures(line) else {
 82 |             return err!("invalid PropertyValueAliases line");
 83 |         };
 84 |         let mut aliases = vec![];
 85 |         if let Some(m) = caps.name("aliases") {
 86 |             for acaps in re_aliases.captures_iter(m.as_str()) {
 87 |                 let alias = &acaps["alias"];
 88 |                 // A token starting with `#` begins the trailing comment.
 89 |                 if alias.starts_with('#') {
 90 |                     break;
 91 |                 }
 92 |                 aliases.push(alias.to_string());
 93 |             }
 94 |         }
 95 |         Ok(PropertyValueAlias {
 96 |             property: caps["prop"].to_string(),
 97 |             numeric: None,
 98 |             abbreviation: caps["abbrev"].to_string(),
 99 |             long: caps["long"].to_string(),
100 |             aliases,
101 |         })
102 |     }
103 | }
104 | 
105 | #[cfg(test)]
106 | mod tests {
107 |     use super::PropertyValueAlias;
108 | 
109 |     #[test]
110 |     fn parse1() {
111 |         let line = "blk; Arabic_PF_A                      ; Arabic_Presentation_Forms_A      ; Arabic_Presentation_Forms-A\n";
112 |         let row: PropertyValueAlias = line.parse().unwrap();
113 |         assert_eq!(row.property, "blk");
114 |         assert_eq!(row.numeric, None);
115 |         assert_eq!(row.abbreviation, "Arabic_PF_A");
116 |         assert_eq!(row.long, "Arabic_Presentation_Forms_A");
117 |         assert_eq!(row.aliases, vec!["Arabic_Presentation_Forms-A"]);
118 |     }
119 | 
120 |     #[test]
121 |     fn parse2() {
122 |         let line = "AHex; N                               ; No                               ; F                                ; False\n";
123 |         let row: PropertyValueAlias = line.parse().unwrap();
124 |         assert_eq!(row.property, "AHex");
125 |         assert_eq!(row.numeric, None);
126 |         assert_eq!(row.abbreviation, "N");
127 |         assert_eq!(row.long, "No");
128 |         assert_eq!(row.aliases, vec!["F", "False"]);
129 |     }
130 | 
131 |     #[test]
132 |     fn parse3() {
133 |         let line = "age; 1.1                              ; V1_1\n";
134 |         let row: PropertyValueAlias = line.parse().unwrap();
135 |         assert_eq!(row.property, "age");
136 |         assert_eq!(row.numeric, None);
137 |         assert_eq!(row.abbreviation, "1.1");
138 |         assert_eq!(row.long, "V1_1");
139 |         assert!(row.aliases.is_empty());
140 |     }
141 | 
142 |     #[test]
143 |     fn parse4() {
144 |         let line = "ccc;   0; NR                         ; Not_Reordered\n";
145 |         let row: PropertyValueAlias = line.parse().unwrap();
146 |         assert_eq!(row.property, "ccc");
147 |         assert_eq!(row.numeric, Some(0));
148 |         assert_eq!(row.abbreviation, "NR");
149 |         assert_eq!(row.long, "Not_Reordered");
150 |         assert!(row.aliases.is_empty());
151 |     }
152 | 
153 |     #[test]
154 |     fn parse5() {
155 |         let line =
156 |             "ccc; 133; CCC133                     ; CCC133 # RESERVED\n";
157 |         let row: PropertyValueAlias = line.parse().unwrap();
158 |         assert_eq!(row.property, "ccc");
159 |         assert_eq!(row.numeric, Some(133));
160 |         assert_eq!(row.abbreviation, "CCC133");
161 |         assert_eq!(row.long, "CCC133");
162 |         assert!(row.aliases.is_empty());
163 |     }
164 | 
165 |     #[test]
166 |     fn parse6() {
167 |         let line = "gc ; P                                ; Punctuation                      ; punct                            # Pc | Pd | Pe | Pf | Pi | Po | Ps\n";
168 |         let row: PropertyValueAlias = line.parse().unwrap();
169 |         assert_eq!(row.property, "gc");
170 |         assert_eq!(row.numeric, None);
171 |         assert_eq!(row.abbreviation, "P");
172 |         assert_eq!(row.long, "Punctuation");
173 |         assert_eq!(row.aliases, vec!["punct"]);
174 |     }
175 | }
176 | 


--------------------------------------------------------------------------------
/ucd-parse/src/script_extensions.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `ScriptExtensions.txt` file.
12 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
13 | pub struct ScriptExtension {
14 |     /// The codepoint or codepoint range for this entry.
15 |     pub codepoints: Codepoints,
16 |     /// The script extension names assigned to the codepoints in this entry.
17 |     pub scripts: Vec<String>,
18 | }
19 | 
20 | impl UcdFile for ScriptExtension {
21 |     fn relative_file_path() -> &'static Path {
22 |         Path::new("ScriptExtensions.txt")
23 |     }
24 | }
25 | 
26 | impl UcdFileByCodepoint for ScriptExtension {
27 |     fn codepoints(&self) -> CodepointIter {
28 |         self.codepoints.into_iter()
29 |     }
30 | }
31 | 
32 | impl std::str::FromStr for ScriptExtension {
33 |     type Err = Error;
34 |     /// The text paired with the codepoints is split on whitespace; each
35 |     /// resulting piece becomes one entry of `scripts`.
36 |     fn from_str(line: &str) -> Result<Self, Self::Err> {
37 |         let (codepoints, names) = parse_codepoint_association(line)?;
38 |         let scripts: Vec<String> =
39 |             names.split_whitespace().map(String::from).collect();
40 |         Ok(Self { codepoints, scripts })
41 |     }
42 | }
43 | 
44 | #[cfg(test)]
45 | mod tests {
46 |     use super::ScriptExtension;
47 | 
48 |     #[test]
49 |     fn parse_single() {
50 |         let line = "060C          ; Arab Syrc Thaa # Po       ARABIC COMMA\n";
51 |         let row: ScriptExtension = line.parse().unwrap();
52 |         assert_eq!(row.codepoints, 0x060C);
53 |         assert_eq!(row.scripts, vec!["Arab", "Syrc", "Thaa"]);
54 |     }
55 | 
56 |     #[test]
57 |     fn parse_range() {
58 |         let line = "A836..A837    ; Deva Gujr Guru Kthi Mahj Modi Sind Takr Tirh # So   [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK\n";
59 |         let row: ScriptExtension = line.parse().unwrap();
60 |         assert_eq!(row.codepoints, (0xA836, 0xA837));
61 |         assert_eq!(
62 |             row.scripts,
63 |             vec![
64 |                 "Deva", "Gujr", "Guru", "Kthi", "Mahj", "Modi", "Sind",
65 |                 "Takr", "Tirh",
66 |             ]
67 |         );
68 |     }
69 | }
70 | 


--------------------------------------------------------------------------------
/ucd-parse/src/scripts.rs:
--------------------------------------------------------------------------------
 1 | use std::path::Path;
 2 | 
 3 | use crate::{
 4 |     common::{
 5 |         parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
 6 |         UcdFileByCodepoint,
 7 |     },
 8 |     error::Error,
 9 | };
10 | 
11 | /// A single row in the `Scripts.txt` file.
12 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
13 | pub struct Script {
14 |     /// The codepoint or codepoint range for this entry.
15 |     pub codepoints: Codepoints,
16 |     /// The script name assigned to the codepoints in this entry.
17 |     pub script: String,
18 | }
19 | 
20 | impl UcdFile for Script {
21 |     fn relative_file_path() -> &'static Path {
22 |         Path::new("Scripts.txt")
23 |     }
24 | }
25 | 
26 | impl UcdFileByCodepoint for Script {
27 |     fn codepoints(&self) -> CodepointIter {
28 |         self.codepoints.into_iter()
29 |     }
30 | }
31 | 
32 | impl std::str::FromStr for Script {
33 |     type Err = Error;
34 |     /// Stores the text paired with the codepoints as the script name.
35 |     fn from_str(line: &str) -> Result<Self, Self::Err> {
36 |         let (codepoints, name) = parse_codepoint_association(line)?;
37 |         Ok(Self { codepoints, script: name.to_string() })
38 |     }
39 | }
40 | 
41 | #[cfg(test)]
42 | mod tests {
43 |     use super::Script;
44 | 
45 |     #[test]
46 |     fn parse_single() {
47 |         let line = "10A7F         ; Old_South_Arabian # Po       OLD SOUTH ARABIAN NUMERIC INDICATOR\n";
48 |         let row: Script = line.parse().unwrap();
49 |         assert_eq!(row.codepoints, 0x10A7F);
50 |         assert_eq!(row.script, "Old_South_Arabian");
51 |     }
52 | 
53 |     #[test]
54 |     fn parse_range() {
55 |         let line = "1200..1248    ; Ethiopic # Lo  [73] ETHIOPIC SYLLABLE HA..ETHIOPIC SYLLABLE QWA\n";
56 |         let row: Script = line.parse().unwrap();
57 |         assert_eq!(row.codepoints, (0x1200, 0x1248));
58 |         assert_eq!(row.script, "Ethiopic");
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/ucd-parse/src/sentence_break.rs:
--------------------------------------------------------------------------------
  1 | use std::path::Path;
  2 | 
  3 | use crate::{
  4 |     common::{
  5 |         parse_break_test, parse_codepoint_association, CodepointIter,
  6 |         Codepoints, UcdFile, UcdFileByCodepoint,
  7 |     },
  8 |     error::Error,
  9 | };
 10 | 
 11 | /// A single row in the `auxiliary/SentenceBreakProperty.txt` file.
 12 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
 13 | pub struct SentenceBreak {
 14 |     /// The codepoint or codepoint range for this entry.
 15 |     pub codepoints: Codepoints,
 16 |     /// The property value assigned to the codepoints in this entry.
 17 |     pub value: String,
 18 | }
 19 | 
 20 | impl UcdFile for SentenceBreak {
 21 |     fn relative_file_path() -> &'static Path {
 22 |         Path::new("auxiliary/SentenceBreakProperty.txt")
 23 |     }
 24 | }
 25 | 
 26 | impl UcdFileByCodepoint for SentenceBreak {
 27 |     fn codepoints(&self) -> CodepointIter {
 28 |         self.codepoints.into_iter()
 29 |     }
 30 | }
 31 | 
 32 | impl std::str::FromStr for SentenceBreak {
 33 |     type Err = Error;
 34 | 
 35 |     fn from_str(line: &str) -> Result<SentenceBreak, Error> {
 36 |         let (codepoints, value) = parse_codepoint_association(line)?;
 37 |         Ok(SentenceBreak { codepoints, value: value.to_string() })
 38 |     }
 39 | }
 40 | 
 41 | /// A single row in the `auxiliary/SentenceBreakTest.txt` file.
 42 | ///
 43 | /// This file defines tests for the sentence break algorithm.
 44 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
 45 | pub struct SentenceBreakTest {
 46 |     /// Each string is a UTF-8 encoded group of codepoints that make up a
 47 |     /// single sentence.
 48 |     pub sentences: Vec<String>,
 49 |     /// A human readable description of this test.
 50 |     pub comment: String,
 51 | }
 52 | 
 53 | impl UcdFile for SentenceBreakTest {
 54 |     fn relative_file_path() -> &'static Path {
 55 |         Path::new("auxiliary/SentenceBreakTest.txt")
 56 |     }
 57 | }
 58 | 
 59 | impl std::str::FromStr for SentenceBreakTest {
 60 |     type Err = Error;
 61 | 
 62 |     fn from_str(line: &str) -> Result<SentenceBreakTest, Error> {
 63 |         let (groups, comment) = parse_break_test(line)?;
 64 |         Ok(SentenceBreakTest { sentences: groups, comment })
 65 |     }
 66 | }
 67 | 
 68 | #[cfg(test)]
 69 | mod tests {
 70 |     use super::{SentenceBreak, SentenceBreakTest};
 71 | 
 72 |     #[test]
 73 |     fn parse_single() {
 74 |         let line = "11445         ; Extend # Mc       NEWA SIGN VISARGA\n";
 75 |         let row: SentenceBreak = line.parse().unwrap();
 76 |         assert_eq!(row.codepoints, 0x11445);
 77 |         assert_eq!(row.value, "Extend");
 78 |     }
 79 | 
 80 |     #[test]
 81 |     fn parse_range() {
 82 |         let line = "FE31..FE32    ; SContinue # Pd   [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH\n";
 83 |         let row: SentenceBreak = line.parse().unwrap();
 84 |         assert_eq!(row.codepoints, (0xFE31, 0xFE32));
 85 |         assert_eq!(row.value, "SContinue");
 86 |     }
 87 | 
 88 |     #[test]
 89 |     fn parse_test() {
 90 |         let line = "÷ 2060 × 5B57 × 2060 × 002E × 2060 ÷ 5B57 × 2060 × 2060 ÷	#  ÷ [0.2] WORD JOINER (Format_FE) × [998.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [998.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]";
 91 | 
 92 |         let row: SentenceBreakTest = line.parse().unwrap();
 93 |         assert_eq!(
 94 |             row.sentences,
 95 |             vec![
 96 |                 "\u{2060}\u{5B57}\u{2060}\u{002E}\u{2060}",
 97 |                 "\u{5B57}\u{2060}\u{2060}",
 98 |             ]
 99 |         );
100 |         assert!(row.comment.contains("[5.0] WORD JOINER (Format_FE)"));
101 |     }
102 | }
103 | 


--------------------------------------------------------------------------------
/ucd-parse/src/special_casing.rs:
--------------------------------------------------------------------------------
  1 | use std::path::Path;
  2 | 
  3 | use crate::{
  4 |     common::{
  5 |         parse_codepoint_sequence, Codepoint, CodepointIter, UcdFile,
  6 |         UcdFileByCodepoint,
  7 |     },
  8 |     error::Error,
  9 | };
 10 | 
 11 | /// A single row in the `SpecialCasing.txt` file.
 12 | ///
 13 | /// Note that a single codepoint may be mapped multiple times. In particular,
 14 | /// a single codepoint might have mappings based on distinct language sensitive
 15 | /// conditions (e.g., `U+0307`).
 16 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
 17 | pub struct SpecialCaseMapping {
 18 |     /// The codepoint that is being mapped.
 19 |     pub codepoint: Codepoint,
 20 |     /// The lowercase mapping, which may be empty.
 21 |     pub lowercase: Vec<Codepoint>,
 22 |     /// The titlecase mapping, which may be empty.
 23 |     pub titlecase: Vec<Codepoint>,
 24 |     /// The uppercase mapping, which may be empty.
 25 |     pub uppercase: Vec<Codepoint>,
 26 |     /// A list of language specific conditions, see `SpecialCasing.txt` for
 27 |     /// more details.
 28 |     pub conditions: Vec<String>,
 29 | }
 30 | 
 31 | impl UcdFile for SpecialCaseMapping {
 32 |     fn relative_file_path() -> &'static Path {
 33 |         Path::new("SpecialCasing.txt")
 34 |     }
 35 | }
 36 | 
 37 | impl UcdFileByCodepoint for SpecialCaseMapping {
 38 |     fn codepoints(&self) -> CodepointIter {
 39 |         self.codepoint.into_iter()
 40 |     }
 41 | }
 42 | 
 43 | impl std::str::FromStr for SpecialCaseMapping {
 44 |     type Err = Error;
 45 | 
 46 |     /// Parse a single data line of `SpecialCasing.txt`.
 47 |     ///
 48 |     /// The line must hold four `;`-terminated fields (codepoint, lower,
 49 |     /// title, upper). An optional list of whitespace separated conditions
 50 |     /// may follow; it stops at the next `;` or `#`.
 51 |     fn from_str(line: &str) -> Result<SpecialCaseMapping, Error> {
 52 |         let re_parts = regex!(
 53 |             r"(?x)
 54 |                 ^
 55 |                 \s*(?P<codepoint>[^\s;]+)\s*;
 56 |                 \s*(?P<lower>[^;]+)\s*;
 57 |                 \s*(?P<title>[^;]+)\s*;
 58 |                 \s*(?P<upper>[^;]+)\s*;
 59 |                 \s*(?P<conditions>[^;\x23]+)?
 60 |                 ",
 61 |         );
 62 | 
 63 |         let caps = match re_parts.captures(line.trim()) {
 64 |             Some(caps) => caps,
 65 |             None => return err!("invalid SpecialCasing line: '{}'", line),
 66 |         };
 67 |         // `split_whitespace` ignores surrounding whitespace; no `trim` needed.
 68 |         let conditions: Vec<String> = caps
 69 |             .name("conditions")
 70 |             .map(|m| m.as_str().split_whitespace().map(String::from).collect())
 71 |             .unwrap_or_default();
 72 |         Ok(SpecialCaseMapping {
 73 |             codepoint: caps["codepoint"].parse()?,
 74 |             lowercase: parse_codepoint_sequence(&caps["lower"])?,
 75 |             titlecase: parse_codepoint_sequence(&caps["title"])?,
 76 |             uppercase: parse_codepoint_sequence(&caps["upper"])?,
 77 |             conditions,
 78 |         })
 79 |     }
 80 | }
 81 | 
 82 | #[cfg(test)]
 83 | mod tests {
 84 |     use super::SpecialCaseMapping;
 85 | 
 86 |     #[test]
 87 |     fn parse_no_conds() {
 88 |         let line = "1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA\n";
 89 |         let row: SpecialCaseMapping = line.parse().unwrap();
 90 |         assert_eq!(row.codepoint, 0x1F52);
 91 |         assert_eq!(row.lowercase, vec![0x1F52]);
 92 |         assert_eq!(row.titlecase, vec![0x03A5, 0x0313, 0x0300]);
 93 |         assert_eq!(row.uppercase, vec![0x03A5, 0x0313, 0x0300]);
 94 |         assert!(row.conditions.is_empty());
 95 |     }
 96 | 
 97 |     #[test]
 98 |     fn parse_conds() {
 99 |         let line = "0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE\n";
100 |         let row: SpecialCaseMapping = line.parse().unwrap();
101 |         assert_eq!(row.codepoint, 0x0307);
102 |         assert!(row.lowercase.is_empty());
103 |         assert_eq!(row.titlecase, vec![0x0307]);
104 |         assert_eq!(row.uppercase, vec![0x0307]);
105 |         assert_eq!(row.conditions, vec!["tr", "After_I"]);
106 |     }
107 | }
108 | 


--------------------------------------------------------------------------------
/ucd-parse/src/word_break.rs:
--------------------------------------------------------------------------------
  1 | use std::path::Path;
  2 | 
  3 | use crate::{
  4 |     common::{
  5 |         parse_break_test, parse_codepoint_association, CodepointIter,
  6 |         Codepoints, UcdFile, UcdFileByCodepoint,
  7 |     },
  8 |     error::Error,
  9 | };
 10 | 
 11 | /// A single row in the `auxiliary/WordBreakProperty.txt` file.
 12 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
 13 | pub struct WordBreak {
 14 |     /// The codepoint or codepoint range for this entry.
 15 |     pub codepoints: Codepoints,
 16 |     /// The property value assigned to the codepoints in this entry.
 17 |     pub value: String,
 18 | }
 19 | 
 20 | impl UcdFile for WordBreak {
 21 |     fn relative_file_path() -> &'static Path {
 22 |         Path::new("auxiliary/WordBreakProperty.txt")
 23 |     }
 24 | }
 25 | 
 26 | impl UcdFileByCodepoint for WordBreak {
 27 |     fn codepoints(&self) -> CodepointIter {
 28 |         self.codepoints.into_iter()
 29 |     }
 30 | }
 31 | 
 32 | impl std::str::FromStr for WordBreak {
 33 |     type Err = Error;
 34 | 
 35 |     fn from_str(line: &str) -> Result<WordBreak, Error> {
 36 |         let (codepoints, value) = parse_codepoint_association(line)?;
 37 |         Ok(WordBreak { codepoints, value: value.to_string() })
 38 |     }
 39 | }
 40 | 
 41 | /// A single row in the `auxiliary/WordBreakTest.txt` file.
 42 | ///
 43 | /// This file defines tests for the word break algorithm.
 44 | #[derive(Clone, Debug, Default, Eq, PartialEq)]
 45 | pub struct WordBreakTest {
 46 |     /// Each string is a UTF-8 encoded group of codepoints that make up a
 47 |     /// single word.
 48 |     pub words: Vec<String>,
 49 |     /// A human readable description of this test.
 50 |     pub comment: String,
 51 | }
 52 | 
 53 | impl UcdFile for WordBreakTest {
 54 |     fn relative_file_path() -> &'static Path {
 55 |         Path::new("auxiliary/WordBreakTest.txt")
 56 |     }
 57 | }
 58 | 
 59 | impl std::str::FromStr for WordBreakTest {
 60 |     type Err = Error;
 61 | 
 62 |     fn from_str(line: &str) -> Result<WordBreakTest, Error> {
 63 |         let (groups, comment) = parse_break_test(line)?;
 64 |         Ok(WordBreakTest { words: groups, comment })
 65 |     }
 66 | }
 67 | 
 68 | #[cfg(test)]
 69 | mod tests {
 70 |     use super::{WordBreak, WordBreakTest};
 71 | 
 72 |     #[test]
 73 |     fn parse_single() {
 74 |         let line = "0A83          ; Extend # Mc       GUJARATI SIGN VISARGA\n";
 75 |         let row: WordBreak = line.parse().unwrap();
 76 |         assert_eq!(row.codepoints, 0x0A83);
 77 |         assert_eq!(row.value, "Extend");
 78 |     }
 79 | 
 80 |     #[test]
 81 |     fn parse_range() {
 82 |         let line = "104A0..104A9  ; Numeric # Nd  [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE\n";
 83 |         let row: WordBreak = line.parse().unwrap();
 84 |         assert_eq!(row.codepoints, (0x104A0, 0x104A9));
 85 |         assert_eq!(row.value, "Numeric");
 86 |     }
 87 | 
 88 |     #[test]
 89 |     fn parse_test() {
 90 |         let line = "÷ 0031 ÷ 0027 × 0308 ÷ 0061 ÷ 0027 × 2060 ÷	#  ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (Single_Quote) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (Single_Quote) × [4.0] WORD JOINER (Format_FE) ÷ [0.3]";
 91 | 
 92 |         let row: WordBreakTest = line.parse().unwrap();
 93 |         assert_eq!(
 94 |             row.words,
 95 |             vec![
 96 |                 "\u{0031}",
 97 |                 "\u{0027}\u{0308}",
 98 |                 "\u{0061}",
 99 |                 "\u{0027}\u{2060}",
100 |             ]
101 |         );
102 |         assert!(row.comment.contains("[4.0] COMBINING DIAERESIS (Extend_FE)"));
103 |     }
104 | }
105 | 


--------------------------------------------------------------------------------
/ucd-trie/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ucd-trie"
 3 | version = "0.1.7"  #:version
 4 | authors = ["Andrew Gallant <jamslam@gmail.com>"]
 5 | description = """
 6 | A trie for storing Unicode codepoint sets and maps.
 7 | """
 8 | documentation = "https://docs.rs/ucd-trie"
 9 | homepage = "https://github.com/BurntSushi/ucd-generate"
10 | repository = "https://github.com/BurntSushi/ucd-generate"
11 | readme = "README.md"
12 | keywords = ["unicode", "database", "character", "codepoint", "trie"]
13 | license = "MIT OR Apache-2.0"
14 | edition = "2021"
15 | 
16 | [dev-dependencies]
17 | once_cell = "1"
18 | 
19 | [features]
20 | default = ["std"]
21 | std = []
22 | 


--------------------------------------------------------------------------------
/ucd-trie/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Andrew Gallant
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ucd-trie/README.md:
--------------------------------------------------------------------------------
 1 | ucd-trie
 2 | ========
 3 | A library that provides compressed trie sets specifically tailored toward
 4 | representing boolean Unicode character properties.
 5 | 
 6 | [![Build status](https://github.com/BurntSushi/ucd-generate/workflows/ci/badge.svg)](https://github.com/BurntSushi/ucd-generate/actions)
 7 | [![crates.io](https://img.shields.io/crates/v/ucd-trie.svg)](https://crates.io/crates/ucd-trie)
 8 | 
 9 | 
10 | ### Documentation
11 | 
12 | https://docs.rs/ucd-trie
13 | 
14 | 
15 | ### License
16 | 
17 | This project is licensed under either of
18 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
19 |    https://www.apache.org/licenses/LICENSE-2.0)
20 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
21 |    https://opensource.org/licenses/MIT)
22 | at your option.
23 | 


--------------------------------------------------------------------------------
/ucd-trie/benches/bench.rs:
--------------------------------------------------------------------------------
 1 | #![feature(test)]
 2 | 
 3 | extern crate test;
 4 | 
 5 | use once_cell::sync::Lazy;
 6 | use ucd_trie::TrieSetOwned;
 7 | 
 8 | #[bench]
 9 | fn bench_trie_set(b: &mut test::Bencher) {
10 |     const CHARS: &'static [char] = &['a', 'β', '☃', '😼'];
11 |     // const CHARS: &'static [char] = &['a'];
12 |     static SET: Lazy<TrieSetOwned> =
13 |         Lazy::new(|| TrieSetOwned::from_scalars(CHARS).unwrap());
14 | 
15 |     let set = Lazy::force(&SET);
16 |     let mut i = 0;
17 |     b.iter(|| {
18 |         let c = CHARS[i];
19 |         i = (i + 1) % CHARS.len();
20 | 
21 |         for _ in 0..10000 {
22 |             assert!(set.contains_char(c));
23 |         }
24 |     });
25 | }
26 | 


--------------------------------------------------------------------------------
/ucd-trie/src/lib.rs:
--------------------------------------------------------------------------------
  1 | /*!
  2 | The ucd-trie crate provides a compressed trie set specifically tailored for
  3 | Unicode codepoints. The principal use case for such a trie is to represent
  4 | properties defined by Unicode that correspond to sets of Unicode codepoints.
  5 | (These properties are formally called boolean properties or "single valued"
  6 | properties. See
  7 | [UTR#23 S3.3](https://www.unicode.org/reports/tr23/#PropertyTypeDefinitions)
  8 | for more details.)
  9 | 
 10 | This crate has two principal types: `TrieSetOwned` and `TrieSetSlice`,
 11 | corresponding to a similar split as there is between `Vec<T>` and `&[T]`.
 12 | `TrieSetOwned` is the only way to construct a trie from a set of Unicode
 13 | codepoints.
 14 | 
 15 | The intended use of this library is to embed a static instance of
 16 | `TrieSetSlice` into your source code, and then use its methods as defined in
 17 | this crate to test membership. (The `ucd-generate` tool can likely generate
 18 | this code for you.)
 19 | 
 20 | Finally, while this crate uses the standard library by default, it provides
 21 | `no_std` functionality by disabling the `std` feature. When `no_std` is
 22 | enabled, then `TrieSetOwned` is not provided. Instead, only `TrieSetSlice` is
 23 | provided, which means `no_std` crates can still embed tries into their code.
 24 | */
 25 | 
 26 | #![deny(missing_docs)]
 27 | #![cfg_attr(not(feature = "std"), no_std)]
 28 | 
 29 | use core::fmt;
 30 | 
 31 | #[cfg(feature = "std")]
 32 | pub use crate::owned::{Error, Result, TrieSetOwned};
 33 | 
 34 | #[cfg(test)]
 35 | #[allow(dead_code)]
 36 | mod general_category;
 37 | #[cfg(feature = "std")]
 38 | mod owned;
 39 | 
 40 | const CHUNK_SIZE: usize = 64;
 41 | 
 42 | /// A type alias for `TrieSetSlice<'static>`.
 43 | pub type TrieSet = TrieSetSlice<'static>;
 44 | 
 45 | /// A borrowed trie set.
 46 | #[derive(Clone, Copy)]
 47 | pub struct TrieSetSlice<'a> {
 48 |     /// first tree, one level
 49 |     #[doc(hidden)]
 50 |     pub tree1_level1: &'a [u64],
 51 |     /// second tree, first level
 52 |     #[doc(hidden)]
 53 |     pub tree2_level1: &'a [u8],
 54 |     /// second tree, second level
 55 |     #[doc(hidden)]
 56 |     pub tree2_level2: &'a [u64],
 57 |     /// third tree, first level
 58 |     #[doc(hidden)]
 59 |     pub tree3_level1: &'a [u8],
 60 |     /// third tree, second level
 61 |     #[doc(hidden)]
 62 |     pub tree3_level2: &'a [u8],
 63 |     /// third tree, third level
 64 |     #[doc(hidden)]
 65 |     pub tree3_level3: &'a [u64],
 66 | }
 67 | 
 68 | impl<'a> fmt::Debug for TrieSetSlice<'a> {
 69 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 70 |         write!(f, "TrieSetSlice(...)")
 71 |     }
 72 | }
 73 | 
 74 | impl<'a> TrieSetSlice<'a> {
 75 |     /// Returns true if and only if the given Unicode scalar value is in this
 76 |     /// set.
 77 |     pub fn contains_char(&self, c: char) -> bool {
 78 |         self.contains(c as usize)
 79 |     }
 80 | 
 81 |     /// Returns true if and only if the given codepoint is in this set.
 82 |     ///
 83 |     /// If the given value exceeds the codepoint range (i.e., it's greater
 84 |     /// than `0x10FFFF`), then this returns false.
 85 |     pub fn contains_u32(&self, cp: u32) -> bool {
 86 |         if cp > 0x10FFFF {
 87 |             return false;
 88 |         }
 89 |         self.contains(cp as usize)
 90 |     }
 91 | 
 92 |     #[inline(always)]
 93 |     fn contains(&self, cp: usize) -> bool {
 94 |         if cp < 0x800 {
 95 |             self.chunk_contains(cp, self.tree1_level1[cp >> 6])
 96 |         } else if cp < 0x10000 {
 97 |             let leaf = match self.tree2_level1.get((cp >> 6) - 0x20) {
 98 |                 None => return false,
 99 |                 Some(&leaf) => leaf,
100 |             };
101 |             self.chunk_contains(cp, self.tree2_level2[leaf as usize])
102 |         } else {
103 |             let child = match self.tree3_level1.get((cp >> 12) - 0x10) {
104 |                 None => return false,
105 |                 Some(&child) => child,
106 |             };
107 |             let i = ((child as usize) * CHUNK_SIZE) + ((cp >> 6) & 0b111111);
108 |             let leaf = self.tree3_level2[i];
109 |             self.chunk_contains(cp, self.tree3_level3[leaf as usize])
110 |         }
111 |     }
112 | 
113 |     #[inline(always)]
114 |     fn chunk_contains(&self, cp: usize, chunk: u64) -> bool {
115 |         ((chunk >> (cp & 0b111111)) & 1) == 1
116 |     }
117 | }
118 | 


--------------------------------------------------------------------------------
/ucd-util/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ucd-util"
 3 | version = "0.2.2"  #:version
 4 | authors = ["Andrew Gallant <jamslam@gmail.com>"]
 5 | description = """
 6 | A small utility library for working with the Unicode character database.
 7 | """
 8 | documentation = "https://docs.rs/ucd-util"
 9 | homepage = "https://github.com/BurntSushi/ucd-generate"
10 | repository = "https://github.com/BurntSushi/ucd-generate"
11 | readme = "README.md"
12 | keywords = ["unicode", "database", "character", "property"]
13 | license = "MIT OR Apache-2.0"
14 | edition = "2021"
15 | 


--------------------------------------------------------------------------------
/ucd-util/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Andrew Gallant
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ucd-util/LICENSE-UNICODE:
--------------------------------------------------------------------------------
 1 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
 2 | 
 3 | Unicode Data Files include all data files under the directories
 4 | http://www.unicode.org/Public/, http://www.unicode.org/reports/,
 5 | http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
 6 | http://www.unicode.org/utility/trac/browser/.
 7 | 
 8 | Unicode Data Files do not include PDF online code charts under the
 9 | directory http://www.unicode.org/Public/.
10 | 
11 | Software includes any source code published in the Unicode Standard
12 | or under the directories
13 | http://www.unicode.org/Public/, http://www.unicode.org/reports/,
14 | http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
15 | http://www.unicode.org/utility/trac/browser/.
16 | 
17 | NOTICE TO USER: Carefully read the following legal agreement.
18 | BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
19 | DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
20 | YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
21 | TERMS AND CONDITIONS OF THIS AGREEMENT.
22 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
23 | THE DATA FILES OR SOFTWARE.
24 | 
25 | COPYRIGHT AND PERMISSION NOTICE
26 | 
27 | Copyright © 1991-2018 Unicode, Inc. All rights reserved.
28 | Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
29 | 
30 | Permission is hereby granted, free of charge, to any person obtaining
31 | a copy of the Unicode data files and any associated documentation
32 | (the "Data Files") or Unicode software and any associated documentation
33 | (the "Software") to deal in the Data Files or Software
34 | without restriction, including without limitation the rights to use,
35 | copy, modify, merge, publish, distribute, and/or sell copies of
36 | the Data Files or Software, and to permit persons to whom the Data Files
37 | or Software are furnished to do so, provided that either
38 | (a) this copyright and permission notice appear with all copies
39 | of the Data Files or Software, or
40 | (b) this copyright and permission notice appear in associated
41 | Documentation.
42 | 
43 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
44 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
45 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
46 | NONINFRINGEMENT OF THIRD PARTY RIGHTS.
47 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
48 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
49 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
50 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
51 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
52 | PERFORMANCE OF THE DATA FILES OR SOFTWARE.
53 | 
54 | Except as contained in this notice, the name of a copyright holder
55 | shall not be used in advertising or otherwise to promote the sale,
56 | use or other dealings in these Data Files or Software without prior
57 | written authorization of the copyright holder.


--------------------------------------------------------------------------------
/ucd-util/README.md:
--------------------------------------------------------------------------------
 1 | ucd-util
 2 | ========
 3 | A library for small auxiliary Unicode functions. This includes things like
 4 | symbol or character name canonicalization, ideograph name generation and helper
 5 | functions for searching property name and value tables.
 6 | 
 7 | [![Build status](https://github.com/BurntSushi/ucd-generate/workflows/ci/badge.svg)](https://github.com/BurntSushi/ucd-generate/actions)
 8 | [![crates.io](https://img.shields.io/crates/v/ucd-util.svg)](https://crates.io/crates/ucd-util)
 9 | 
10 | 
11 | ### Documentation
12 | 
13 | https://docs.rs/ucd-util
14 | 
15 | 
16 | ### License
17 | 
18 | This project is licensed under either of
19 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
20 |    https://www.apache.org/licenses/LICENSE-2.0)
21 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
22 |    https://opensource.org/licenses/MIT)
23 | at your option.
24 | 
25 | The data in [`src/unicode_tables/`](src/unicode_tables) is licensed under the Unicode License
26 | Agreement ([LICENSE-UNICODE](LICENSE-UNICODE) or
27 | https://www.unicode.org/copyright.html#License).
28 | 


--------------------------------------------------------------------------------
/ucd-util/src/hangul.rs:
--------------------------------------------------------------------------------
  1 | // This implementation should correspond to the algorithms described in
  2 | // Unicode 3.12.
  3 | 
  4 | /// A set of ranges that corresponds to the set of all Hangul syllable
  5 | /// codepoints.
  6 | ///
  7 | /// These ranges are defined in Unicode 4.8 Table 4-13.
  8 | pub const RANGE_HANGUL_SYLLABLE: &'static [(u32, u32)] = &[(0xAC00, 0xD7A3)];
  9 | 
 10 | const S_BASE: u32 = 0xAC00;
 11 | const L_BASE: u32 = 0x1100;
 12 | const V_BASE: u32 = 0x1161;
 13 | const T_BASE: u32 = 0x11A7;
 14 | const T_COUNT: u32 = 28;
 15 | const N_COUNT: u32 = 588;
 16 | 
 17 | /// Build the character name of a precomposed Hangul syllable.
 18 | ///
 19 | /// Returns `None` when `cp` lies outside the inclusive range `AC00..D7A3`
 20 | /// of precomposed Hangul syllables.
 21 | ///
 22 | /// `table` maps a Jamo codepoint to its short name and must be sorted by
 23 | /// codepoint, since it is binary searched. This panics if `table` has no
 24 | /// entry for one of the Jamo in the decomposition. With `ucd-generate`,
 25 | /// the `jamo-short-name` sub-command generates such a table.
 26 | ///
 27 | /// This follows the algorithms described in Unicode 3.12 and Unicode 4.8.
 28 | pub fn hangul_name<'a>(
 29 |     table: &'a [(u32, &'a str)],
 30 |     cp: u32,
 31 | ) -> Option<String> {
 32 |     // Decompose first so that invalid input returns without allocating.
 33 |     let (lead, vowel, trail) = hangul_full_canonical_decomposition(cp)?;
 34 |     let mut name = String::from("HANGUL SYLLABLE ");
 35 |     name.push_str(jamo_short_name(table, lead));
 36 |     name.push_str(jamo_short_name(table, vowel));
 37 |     if let Some(trail) = trail {
 38 |         name.push_str(jamo_short_name(table, trail));
 39 |     }
 40 |     Some(name)
 41 | }
 42 | 
 43 | /// Decompose a precomposed Hangul syllable into its Jamo codepoints.
 44 | ///
 45 | /// The result is the `(L, V, T)` triple of the decomposition; `T`, the
 46 | /// trailing consonant, is `None` when the decomposition has none.
 47 | ///
 48 | /// Returns `None` when `cp` lies outside the inclusive range `AC00..D7A3`
 49 | /// of precomposed Hangul syllables.
 50 | ///
 51 | /// This follows the algorithms described in Unicode 3.12 and Unicode 4.8.
 52 | pub fn hangul_full_canonical_decomposition(
 53 |     cp: u32,
 54 | ) -> Option<(u32, u32, Option<u32>)> {
 55 |     if !(0xAC00..=0xD7A3).contains(&cp) {
 56 |         return None;
 57 |     }
 58 | 
 59 |     // Offset of the syllable from the start of the precomposed range.
 60 |     let offset = cp - S_BASE;
 61 |     let lead = L_BASE + offset / N_COUNT;
 62 |     let vowel = V_BASE + (offset % N_COUNT) / T_COUNT;
 63 |     // A trailing index of zero means there is no trailing consonant.
 64 |     let trail = match offset % T_COUNT {
 65 |         0 => None,
 66 |         t_index => Some(T_BASE + t_index),
 67 |     };
 68 |     Some((lead, vowel, trail))
 69 | }
 70 | 
 71 | type JamoShortName<'a> = &'a [(u32, &'a str)];
 72 | 
 73 | fn jamo_short_name<'a>(table: JamoShortName<'a>, cp: u32) -> &'a str {
 74 |     // `table` is searched by codepoint; a missing `cp` panics on `unwrap`.
 75 |     table[table.binary_search_by_key(&cp, |&(jamo, _)| jamo).unwrap()].1
 76 | }
 77 | 
 78 | #[cfg(test)]
 79 | mod tests {
 80 |     use crate::unicode_tables::jamo_short_name::JAMO_SHORT_NAME as TABLE;
 81 | 
 82 |     use super::{hangul_full_canonical_decomposition, hangul_name};
 83 | 
 84 |     /// U+D4DB decomposes into all three parts.
 85 |     #[test]
 86 |     fn canon_decomp() {
 87 |         let parts = hangul_full_canonical_decomposition(0xD4DB);
 88 |         assert_eq!(parts, Some((0x1111, 0x1171, Some(0x11B6))));
 89 |     }
 90 | 
 91 |     /// The name of U+D4DB is built from the short names of its parts.
 92 |     #[test]
 93 |     fn name() {
 94 |         let name = hangul_name(TABLE, 0xD4DB).unwrap();
 95 |         assert_eq!(name, "HANGUL SYLLABLE PWILH");
 96 |     }
 97 | 
 98 |     /// Every precomposed Hangul codepoint has a name.
 99 |     #[test]
100 |     fn all() {
101 |         for cp in 0xAC00..=0xD7A3 {
102 |             assert!(hangul_name(TABLE, cp).is_some());
103 |         }
104 |     }
105 | 
106 |     /// Codepoints outside of the Hangul syllable range have no name.
107 |     #[test]
108 |     fn invalid() {
109 |         assert_eq!(hangul_name(TABLE, 0), None);
110 |     }
111 | }
112 | 


--------------------------------------------------------------------------------
/ucd-util/src/ideograph.rs:
--------------------------------------------------------------------------------
 1 | /// A set of ranges that corresponds to the set of all ideograph codepoints.
 2 | ///
 3 | /// Every range is inclusive and listed exactly once. These ranges are
 4 | /// defined in Unicode 4.8 Table 4-13.
 5 | pub const RANGE_IDEOGRAPH: &'static [(u32, u32)] = &[
 6 |     (0x3400, 0x4DB5),
 7 |     (0x4E00, 0x9FD5),
 8 |     (0x20000, 0x2A6D6),
 9 |     (0x2A700, 0x2B734),
10 |     (0x2B740, 0x2B81D),
11 |     (0x2B820, 0x2CEA1),
12 |     (0x17000, 0x187EC),
13 |     (0xF900, 0xFA6D),
14 |     (0xFA70, 0xFAD9),
15 |     (0x2F800, 0x2FA1D),
16 | ];
17 | 
18 | /// Build the algorithmic character name for an ideograph codepoint.
19 | ///
20 | /// Only ideographic codepoints have such a name. They are precisely the
21 | /// codepoints in one of the following inclusive ranges:
22 | ///
23 | /// * `3400..4DB5`
24 | /// * `4E00..9FD5`
25 | /// * `20000..2A6D6`
26 | /// * `2A700..2B734`
27 | /// * `2B740..2B81D`
28 | /// * `2B820..2CEA1`
29 | /// * `17000..187EC`
30 | /// * `F900..FA6D`
31 | /// * `FA70..FAD9`
32 | /// * `2F800..2FA1D`
33 | ///
34 | /// For any other codepoint, `None` is returned.
35 | ///
36 | /// This implements the algorithm described in Unicode 4.8.
37 | pub fn ideograph_name(cp: u32) -> Option<String> {
38 |     // This match should be in sync with the `RANGE_IDEOGRAPH` constant.
39 |     let name = match cp {
40 |         0x17000..=0x187EC => format!("TANGUT IDEOGRAPH-{:04X}", cp),
41 |         0xF900..=0xFA6D | 0xFA70..=0xFAD9 | 0x2F800..=0x2FA1D => {
42 |             format!("CJK COMPATIBILITY IDEOGRAPH-{:04X}", cp)
43 |         }
44 |         0x3400..=0x4DB5
45 |         | 0x4E00..=0x9FD5
46 |         | 0x20000..=0x2A6D6
47 |         | 0x2A700..=0x2B734
48 |         | 0x2B740..=0x2B81D
49 |         | 0x2B820..=0x2CEA1 => {
50 |             format!("CJK UNIFIED IDEOGRAPH-{:04X}", cp)
51 |         }
52 |         _ => return None,
53 |     };
54 |     Some(name)
55 | }
56 | 
57 | #[cfg(test)]
58 | mod tests {
59 |     use super::ideograph_name;
60 | 
61 |     /// Each kind of ideograph gets its own name prefix.
62 |     #[test]
63 |     fn name() {
64 |         let cases = [
65 |             (0x4E00, "CJK UNIFIED IDEOGRAPH-4E00"),
66 |             (0x9FD5, "CJK UNIFIED IDEOGRAPH-9FD5"),
67 |             (0x17000, "TANGUT IDEOGRAPH-17000"),
68 |             (0xF900, "CJK COMPATIBILITY IDEOGRAPH-F900"),
69 |         ];
70 |         for &(cp, expected) in cases.iter() {
71 |             assert_eq!(ideograph_name(cp).unwrap(), expected);
72 |         }
73 |     }
74 | 
75 |     /// Codepoints that are not ideographs have no name.
76 |     #[test]
77 |     fn invalid() {
78 |         assert_eq!(ideograph_name(0), None);
79 |     }
80 | }
81 | 


--------------------------------------------------------------------------------
/ucd-util/src/lib.rs:
--------------------------------------------------------------------------------
 1 | /*!
 2 | The `ucd-util` crate contains a smattering of utility functions that implement
 3 | various algorithms specified by Unicode. There is no specific goal for
 4 | exhaustiveness. Instead, implementations should be added on an as-needed basis.
 5 | 
 6 | A *current* design constraint of this crate is that it should not bring in any
 7 | large Unicode tables. For example, to use the various property name and value
 8 | canonicalization functions, you'll need to supply your own table, which can
 9 | be generated using `ucd-generate`.
10 | */
11 | 
12 | #![deny(missing_docs)]
13 | #![allow(unknown_lints)]
14 | #![allow(ellipsis_inclusive_range_patterns)]
15 | 
16 | mod hangul;
17 | mod ideograph;
18 | mod name;
19 | mod property;
20 | mod unicode_tables;
21 | 
22 | pub use crate::hangul::{
23 |     hangul_full_canonical_decomposition, hangul_name, RANGE_HANGUL_SYLLABLE,
24 | };
25 | pub use crate::ideograph::{ideograph_name, RANGE_IDEOGRAPH};
26 | pub use crate::name::{character_name_normalize, symbolic_name_normalize};
27 | pub use crate::property::{
28 |     canonical_property_name, canonical_property_value, property_values,
29 |     PropertyTable, PropertyValueTable, PropertyValues,
30 | };
31 | 


--------------------------------------------------------------------------------
/ucd-util/src/name.rs:
--------------------------------------------------------------------------------
  1 | /// Apply UAX44-LM2 loose matching normalization to a character name.
  2 | ///
  3 | /// The string is rewritten in place and shrunk to the normalized length.
  4 | ///
  5 | /// See: https://unicode.org/reports/tr44/#UAX44-LM2
  6 | pub fn character_name_normalize(string: &mut String) {
  7 |     // SAFETY: `character_name_normalize_bytes` guarantees that the prefix
  8 |     // it returns is valid UTF-8, and the vector is truncated to that prefix.
  9 |     let raw = unsafe { string.as_mut_vec() };
 10 |     let normalized_len = character_name_normalize_bytes(raw).len();
 11 |     raw.truncate(normalized_len);
 12 | }
 13 | 
 14 | /// Normalize the given character name in place according to UAX44-LM2.
 15 | ///
 16 | /// The slice returned is the normalized prefix of `slice`. It is guaranteed
 17 | /// to be valid UTF-8 for all possible values of `slice`.
 18 | ///
 19 | /// See: https://unicode.org/reports/tr44/#UAX44-LM2
 20 | fn character_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
 21 |     // According to Unicode 4.8, character names consist only of Latin
 22 |     // capital letters A to Z, ASCII digits, ASCII space or ASCII hyphen.
 23 |     // Therefore, we can do very simplistic case folding and operate on the
 24 |     // raw bytes, since everything is ASCII. Note that we don't actually know
 25 |     // whether `slice` is all ASCII or not, so we drop all non-ASCII bytes.
 26 |     let mut next_write = 0;
 27 |     let mut prev_letter = false;
 28 |     // `next_write` never exceeds `i`, so unread input is never overwritten.
 29 |     for i in 0..slice.len() {
 30 |         // SAFETY ARGUMENT: To guarantee that the resulting slice is valid
 31 |         // UTF-8, we ensure that the slice contains only ASCII bytes. In
 32 |         // particular, we drop every non-ASCII byte from the normalized string.
 33 |         let b = slice[i];
 34 |         if b == b' ' {
 35 |             // Drop spaces.
 36 |         } else if b == b'_' {
 37 |             // Drop the underscore.
 38 |         } else if b == b'-' {
 39 |             let medial = prev_letter
 40 |                 && slice.get(i + 1).map_or(false, |b| b.is_ascii_alphabetic());
 41 |             let mut keep_hyphen = !medial;
 42 |             // We want to keep the hyphen only if it isn't medial. However,
 43 |             // there is one exception. We need to keep the hyphen in the
 44 |             // character (U+1180) named `HANGUL JUNGSEONG O-E`. So we check for
 45 |             // that here.
 46 |             let next_e =
 47 |                 slice.get(i + 1).map_or(false, |&b| b == b'E' || b == b'e');
 48 |             // More characters after the final E are fine, as long as they are
 49 |             // underscores and spaces.
 50 |             let rest_empty = i + 2 >= slice.len()
 51 |                 || slice[i + 2..].iter().all(|&b| b == b' ' || b == b'_');
 52 |             if !keep_hyphen && next_e && rest_empty {
 53 |                 keep_hyphen = slice[..next_write] == b"hanguljungseongo"[..];
 54 |             }
 55 |             if keep_hyphen {
 56 |                 slice[next_write] = b;
 57 |                 next_write += 1;
 58 |             }
 59 |         } else if b'A' <= b && b <= b'Z' {
 60 |             slice[next_write] = b + (b'a' - b'A');
 61 |             next_write += 1;
 62 |         } else if b <= 0x7F {
 63 |             slice[next_write] = b;
 64 |             next_write += 1;
 65 |         }
 66 |         // Remember whether the byte just read, kept or not, was a letter.
 67 |         prev_letter = b.is_ascii_alphabetic();
 68 |     }
 69 |     &mut slice[..next_write]
 70 | }
 71 | 
 72 | /// Apply UAX44-LM3 loose matching normalization to a symbolic name.
 73 | ///
 74 | /// The string is rewritten in place and shrunk to the normalized length.
 75 | ///
 76 | /// Property names and property value aliases are the usual examples of a
 77 | /// "symbolic name". Property string values are not symbolic names, so this
 78 | /// should not be applied to them.
 79 | ///
 80 | /// See: https://unicode.org/reports/tr44/#UAX44-LM3
 81 | pub fn symbolic_name_normalize(string: &mut String) {
 82 |     // SAFETY: `symbolic_name_normalize_bytes` guarantees that the prefix it
 83 |     // returns is valid UTF-8, and the vector is truncated to that prefix.
 84 |     let raw = unsafe { string.as_mut_vec() };
 85 |     let normalized_len = symbolic_name_normalize_bytes(raw).len();
 86 |     raw.truncate(normalized_len);
 87 | }
 88 | 
 89 | /// Apply UAX44-LM3 loose matching normalization to a symbolic name, given
 90 | /// as raw bytes, and return the normalized prefix of `slice`.
 91 | ///
 92 | /// Property names and property value aliases are the usual examples of a
 93 | /// "symbolic name". Property string values are not symbolic names, so this
 94 | /// should not be applied to them.
 95 | ///
 96 | /// The slice returned is guaranteed to be valid UTF-8 for all possible values
 97 | /// of `slice`.
 98 | ///
 99 | /// See: https://unicode.org/reports/tr44/#UAX44-LM3
100 | fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
101 |     // The standard does not appear to give property names/aliases a fixed
102 |     // structure (unlike character names), so we assume ASCII and drop every
103 |     // byte that is not ASCII.
104 |     //
105 |     // An "is" prefix (in any case) is ignored, but only when at least one
106 |     // more byte follows it.
107 |     let starts_with_is =
108 |         slice.len() > 2 && slice[..2].eq_ignore_ascii_case(b"is");
109 |     let start = if starts_with_is { 2 } else { 0 };
110 |     let mut next_write = 0;
111 |     for i in start..slice.len() {
112 |         // SAFETY ARGUMENT: To guarantee that the resulting slice is valid
113 |         // UTF-8, only ASCII bytes are ever written to the output. Every
114 |         // non-ASCII byte is dropped from the normalized string.
115 |         let b = slice[i];
116 |         match b {
117 |             // Separators are ignored entirely.
118 |             b' ' | b'_' | b'-' => {}
119 |             // ASCII upper case letters are folded to lower case.
120 |             b'A'..=b'Z' => {
121 |                 slice[next_write] = b.to_ascii_lowercase();
122 |                 next_write += 1;
123 |             }
124 |             // Any other ASCII byte is kept as is.
125 |             0x00..=0x7F => {
126 |                 slice[next_write] = b;
127 |                 next_write += 1;
128 |             }
129 |             // Non-ASCII bytes are dropped.
130 |             _ => {}
131 |         }
132 |     }
133 |     // Special case: ISO_Comment has an 'isc' abbreviation. Because an 'is'
134 |     // prefix is generally ignored, 'isc' would be reduced to 'c', which is
135 |     // actually an alias for the 'Other' general category. So when exactly
136 |     // 'c' remains after an 'is' prefix, 'isc' is restored.
137 |     if starts_with_is && next_write == 1 && slice[0] == b'c' {
138 |         slice[..3].copy_from_slice(b"isc");
139 |         next_write = 3;
140 |     }
141 |     &mut slice[..next_write]
142 | }
143 | 
144 | #[cfg(test)]
145 | mod tests {
146 |     use super::{
147 |         character_name_normalize, character_name_normalize_bytes,
148 |         symbolic_name_normalize, symbolic_name_normalize_bytes,
149 |     };
150 | 
151 |     /// Run `character_name_normalize` on an owned copy of `s`.
152 |     fn char_norm(s: &str) -> String {
153 |         let mut normalized = String::from(s);
154 |         character_name_normalize(&mut normalized);
155 |         normalized
156 |     }
157 | 
158 |     /// Run `symbolic_name_normalize` on an owned copy of `s`.
159 |     fn sym_norm(s: &str) -> String {
160 |         let mut normalized = String::from(s);
161 |         symbolic_name_normalize(&mut normalized);
162 |         normalized
163 |     }
164 | 
165 |     /// Spaces, underscores and medial hyphens are dropped, and case is folded.
166 |     #[test]
167 |     fn char_normalize() {
168 |         let cases = [
169 |             ("HANGUL JUNGSEONG O-E", "hanguljungseongo-e"),
170 |             ("HANGUL JUNGSEONG O-E _", "hanguljungseongo-e"),
171 |             ("zero-width space", "zerowidthspace"),
172 |             ("zerowidthspace", "zerowidthspace"),
173 |             ("ZERO WIDTH SPACE", "zerowidthspace"),
174 |             ("TIBETAN MARK TSA -PHRU", "tibetanmarktsa-phru"),
175 |             ("tibetan_letter_-a", "tibetanletter-a"),
176 |         ];
177 |         for &(input, expected) in cases.iter() {
178 |             assert_eq!(char_norm(input), expected);
179 |         }
180 |     }
181 | 
182 |     /// Separators and an `is` prefix are dropped, and case is folded.
183 |     #[test]
184 |     fn sym_normalize() {
185 |         let cases = [
186 |             ("Line_Break", "linebreak"),
187 |             ("Line-break", "linebreak"),
188 |             ("linebreak", "linebreak"),
189 |             ("BA", "ba"),
190 |             ("ba", "ba"),
191 |             ("Greek", "greek"),
192 |             ("isGreek", "greek"),
193 |             ("IS_Greek", "greek"),
194 |             ("isc", "isc"),
195 |             ("is c", "isc"),
196 |             ("is_c", "isc"),
197 |             ("IS", "is"),
198 |         ];
199 |         for &(input, expected) in cases.iter() {
200 |             assert_eq!(sym_norm(input), expected);
201 |         }
202 |     }
203 | 
204 |     /// Non-ASCII bytes are dropped from character names.
205 |     #[test]
206 |     fn valid_utf8_character() {
207 |         let mut bytes = b"abc\xFFxyz".to_vec();
208 |         assert_eq!(character_name_normalize_bytes(&mut bytes), b"abcxyz");
209 |     }
210 | 
211 |     /// Non-ASCII bytes are dropped from symbolic names.
212 |     #[test]
213 |     fn valid_utf8_symbolic() {
214 |         let mut bytes = b"abc\xFFxyz".to_vec();
215 |         assert_eq!(symbolic_name_normalize_bytes(&mut bytes), b"abcxyz");
216 |     }
217 | }
218 | 


--------------------------------------------------------------------------------
/ucd-util/src/property.rs:
--------------------------------------------------------------------------------
  1 | /// The type of a property name table.
  2 | ///
  3 | /// A property name table is a sequence of tuples sorted by their first
  4 | /// value, which is a normalized property name. The second value of each
  5 | /// tuple is the corresponding canonical property name.
  6 | pub type PropertyTable = &'static [(&'static str, &'static str)];
  7 | 
  8 | /// Look up the canonical name of a property from its normalized name.
  9 | ///
 10 | /// The normalized property name must have been normalized according to
 11 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`.
 12 | ///
 13 | /// `None` is returned when the table has no such property.
 14 | pub fn canonical_property_name(
 15 |     property_table: PropertyTable,
 16 |     normalized_property_name: &str,
 17 | ) -> Option<&'static str> {
 18 |     let found = property_table
 19 |         .binary_search_by(|&(name, _)| name.cmp(normalized_property_name));
 20 |     let (_, canonical) = property_table[found.ok()?];
 21 |     Some(canonical)
 22 | }
 23 | 
 24 | /// Type of a property value table.
 25 | ///
 26 | /// A property value table is a sequence of tuples sorted by property name,
 27 | /// as required by the binary search in `property_values`. Each property name
 28 | /// is paired with the mapping of its property values, itself a sequence of
 29 | /// tuples of a normalized property value and the corresponding canonical
 30 | /// property value.
 31 | ///
 32 | /// Note that a property value table only includes values for properties that
 33 | /// are catalogs, enumerations or binary properties. Properties that have
 34 | /// string values (such as case or decomposition mappings), numeric values
 35 | /// or are miscellaneous are not represented in this table.
 36 | pub type PropertyValueTable = &'static [(&'static str, PropertyValues)];
 37 | 
 38 | /// A mapping of property values for a specific property.
 39 | ///
 40 | /// Each tuple pairs a normalized property value with the corresponding
 41 | /// canonical property value. Tuples are sorted by normalized value, since
 42 | /// `canonical_property_value` looks values up by binary search.
 43 | pub type PropertyValues = &'static [(&'static str, &'static str)];
 44 | 
 45 | /// Look up the property values that a given property may take.
 46 | ///
 47 | /// The given property name must be in its canonical form, which can be
 48 | /// found using `canonical_property_name`.
 49 | ///
 50 | /// The values are returned as a mapping expressed as a sorted list of
 51 | /// tuples. The first element of each tuple is a normalized property value
 52 | /// while the second element of each tuple is the corresponding canonical
 53 | /// property value.
 54 | ///
 55 | /// `None` is returned when the table has no such property.
 56 | pub fn property_values(
 57 |     property_value_table: PropertyValueTable,
 58 |     canonical_property_name: &str,
 59 | ) -> Option<PropertyValues> {
 60 |     let found = property_value_table
 61 |         .binary_search_by(|&(name, _)| name.cmp(canonical_property_name));
 62 |     let (_, values) = property_value_table[found.ok()?];
 63 |     Some(values)
 64 | }
 65 | 
 66 | /// Find the canonical property value for the given normalized property
 67 | /// value.
 68 | ///
 69 | /// The given property values should correspond to the values for the property
 70 | /// under question, which can be found using `property_values`.
 71 | ///
 72 | /// If no such property value exists, then `None` is returned.
 73 | ///
 74 | /// The normalized property value must have been normalized according to
 75 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`.
 76 | pub fn canonical_property_value(
 77 |     property_values: PropertyValues,
 78 |     normalized_property_value: &str,
 79 | ) -> Option<&'static str> {
 80 |     // Both table types have the same shape, so the name lookup is reused.
 81 |     canonical_property_name(property_values, normalized_property_value)
 82 | }
 83 | 
 84 | #[cfg(test)]
 85 | mod tests {
 86 |     use crate::unicode_tables::property_names::PROPERTY_NAMES;
 87 |     use crate::unicode_tables::property_values::PROPERTY_VALUES;
 88 | 
 89 |     use super::{
 90 |         canonical_property_name, canonical_property_value, property_values,
 91 |     };
 92 | 
 93 |     /// An alias and the long name map to the same canonical property name.
 94 |     #[test]
 95 |     fn canonical_property_name_1() {
 96 |         let lookup = |name: &str| canonical_property_name(PROPERTY_NAMES, name);
 97 |         assert_eq!(lookup("gc"), Some("General_Category"));
 98 |         assert_eq!(lookup("generalcategory"), Some("General_Category"));
 99 |         // Names that were not normalized first are not found.
100 |         assert_eq!(lookup("g c"), None);
101 |     }
102 | 
103 |     /// `White_Space` maps every listed spelling of yes and no.
104 |     #[test]
105 |     fn property_values_1() {
106 |         let expected: &[(&str, &str)] = &[
107 |             ("f", "No"),
108 |             ("false", "No"),
109 |             ("n", "No"),
110 |             ("no", "No"),
111 |             ("t", "Yes"),
112 |             ("true", "Yes"),
113 |             ("y", "Yes"),
114 |             ("yes", "Yes"),
115 |         ];
116 |         let actual = property_values(PROPERTY_VALUES, "White_Space");
117 |         assert_eq!(actual, Some(expected));
118 |     }
119 | 
120 |     /// Values are matched exactly, so upper case input is not found.
121 |     #[test]
122 |     fn canonical_property_value_1() {
123 |         let values = property_values(PROPERTY_VALUES, "White_Space").unwrap();
124 |         let lookup = |value: &str| canonical_property_value(values, value);
125 |         assert_eq!(lookup("false"), Some("No"));
126 |         assert_eq!(lookup("t"), Some("Yes"));
127 |         assert_eq!(lookup("F"), None);
128 |     }
129 | }
130 | 


--------------------------------------------------------------------------------
/ucd-util/src/unicode_tables/jamo_short_name.rs:
--------------------------------------------------------------------------------
 1 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
 2 | //
 3 | //   ucd-generate jamo-short-name ucd-16.0.0
 4 | //
 5 | // Unicode version: 16.0.0.
 6 | //
 7 | // ucd-generate 0.2.15 is available on crates.io.
 8 | 
 9 | pub const JAMO_SHORT_NAME: &'static [(u32, &'static str)] = &[
10 |     (4352, "G"),
11 |     (4353, "GG"),
12 |     (4354, "N"),
13 |     (4355, "D"),
14 |     (4356, "DD"),
15 |     (4357, "R"),
16 |     (4358, "M"),
17 |     (4359, "B"),
18 |     (4360, "BB"),
19 |     (4361, "S"),
20 |     (4362, "SS"),
21 |     (4363, ""),
22 |     (4364, "J"),
23 |     (4365, "JJ"),
24 |     (4366, "C"),
25 |     (4367, "K"),
26 |     (4368, "T"),
27 |     (4369, "P"),
28 |     (4370, "H"),
29 |     (4449, "A"),
30 |     (4450, "AE"),
31 |     (4451, "YA"),
32 |     (4452, "YAE"),
33 |     (4453, "EO"),
34 |     (4454, "E"),
35 |     (4455, "YEO"),
36 |     (4456, "YE"),
37 |     (4457, "O"),
38 |     (4458, "WA"),
39 |     (4459, "WAE"),
40 |     (4460, "OE"),
41 |     (4461, "YO"),
42 |     (4462, "U"),
43 |     (4463, "WEO"),
44 |     (4464, "WE"),
45 |     (4465, "WI"),
46 |     (4466, "YU"),
47 |     (4467, "EU"),
48 |     (4468, "YI"),
49 |     (4469, "I"),
50 |     (4520, "G"),
51 |     (4521, "GG"),
52 |     (4522, "GS"),
53 |     (4523, "N"),
54 |     (4524, "NJ"),
55 |     (4525, "NH"),
56 |     (4526, "D"),
57 |     (4527, "L"),
58 |     (4528, "LG"),
59 |     (4529, "LM"),
60 |     (4530, "LB"),
61 |     (4531, "LS"),
62 |     (4532, "LT"),
63 |     (4533, "LP"),
64 |     (4534, "LH"),
65 |     (4535, "M"),
66 |     (4536, "B"),
67 |     (4537, "BS"),
68 |     (4538, "S"),
69 |     (4539, "SS"),
70 |     (4540, "NG"),
71 |     (4541, "J"),
72 |     (4542, "C"),
73 |     (4543, "K"),
74 |     (4544, "T"),
75 |     (4545, "P"),
76 |     (4546, "H"),
77 | ];
78 | 


--------------------------------------------------------------------------------
/ucd-util/src/unicode_tables/mod.rs:
--------------------------------------------------------------------------------
1 | #[cfg(test)]
2 | pub mod jamo_short_name;
3 | #[cfg(test)]
4 | pub mod property_names;
5 | #[cfg(test)]
6 | pub mod property_values;
7 | 


--------------------------------------------------------------------------------