├── .gitignore ├── examples ├── sample.txt ├── basic.rs └── process_file.rs ├── deny.toml ├── .github └── workflows │ └── rust.yml ├── tests ├── test.rs ├── test │ ├── multi-script.txt │ ├── multi-script.uroman-ref.txt │ └── multi-script.uroman-ref-perl.txt └── unit_tests.rs ├── Cargo.toml ├── NOTICE ├── src ├── utils.rs ├── rom_rule.rs ├── main.rs ├── edge.rs ├── lib.rs └── core.rs ├── data ├── Scripts.txt ├── romanization-table-arabic-block.txt ├── UnicodeDataPropsHangul.txt └── UnicodeDataOverwrite.txt ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /uroman -------------------------------------------------------------------------------- /examples/sample.txt: -------------------------------------------------------------------------------- 1 | H\uFF45llo, W\uFF4Frld! 2 | こんにちは、世界! 3 | 你好,世界! 4 | Привіт, світе! 5 | -------------------------------------------------------------------------------- /deny.toml: -------------------------------------------------------------------------------- 1 | [licenses] 2 | allow = [ 3 | "MIT", 4 | "Apache-2.0", 5 | "BSD-3-Clause", 6 | "ISC", 7 | "Unicode-3.0", 8 | "MPL-2.0", 9 | "CDLA-Permissive-2.0", 10 | "Unicode-DFS-2016", 11 | ] 12 | 13 | [bans] 14 | multiple-versions = "warn" 15 | wildcards = "warn" 16 | 17 | [advisories] 18 | 19 | [sources] -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build_and_test: 14 | name: Build and Test on ${{ matrix.os }} 15 | 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest, macos-latest, windows-latest] 19 | 20 | runs-on: ${{ matrix.os }} 21 | 22 | steps: 23 | - name: Checkout repository 24 | uses: 
actions/checkout@v4 25 | 26 | - name: Install Rust toolchain 27 | uses: dtolnay/rust-toolchain@stable 28 | 29 | - name: Build 30 | run: cargo build --verbose 31 | 32 | - name: Run tests 33 | run: cargo test --verbose 34 | -------------------------------------------------------------------------------- /tests/test.rs: -------------------------------------------------------------------------------- 1 | use uroman::{RomFormat, Uroman}; 2 | use std::{fs::File, io::BufReader}; 3 | 4 | fn assert_uroman_output(input_path: &str, expected_output: &str) { 5 | let expected_output_normalized = expected_output.replace("\r\n", "\n"); 6 | 7 | let uroman = Uroman::new(); 8 | let mut buf = vec![]; 9 | uroman.romanize_file( 10 | BufReader::new(File::open(input_path).unwrap()), 11 | &mut buf, 12 | None, 13 | RomFormat::Str, 14 | None, 15 | false, 16 | false, 17 | ).unwrap(); 18 | 19 | let actual_output_normalized = String::from_utf8(buf).unwrap().replace("\r\n", "\n"); 20 | 21 | assert_eq!(actual_output_normalized, expected_output_normalized); 22 | } 23 | 24 | #[test] 25 | fn test_multi_script_romanization() { 26 | assert_uroman_output( 27 | concat!(env!("CARGO_MANIFEST_DIR"), "/tests/test/multi-script.txt"), 28 | include_str!("test/multi-script.uroman-ref.txt"), 29 | ); 30 | } 31 | -------------------------------------------------------------------------------- /examples/basic.rs: -------------------------------------------------------------------------------- 1 | extern crate uroman; 2 | 3 | use uroman::{RomFormat, Uroman, rom_format}; 4 | 5 | fn main() { 6 | let uroman = Uroman::new(); 7 | 8 | let s = "こんにちは、ユーロマン!"; 9 | let lcode = None; 10 | 11 | // Str output 12 | let result = uroman 13 | .romanize_string::(s, lcode) 14 | .to_string(); 15 | 16 | let result_f = uroman.romanize_with_format( 17 | s, 18 | lcode, 19 | None, // `None` defaults to `RomFormat::Str`. 20 | // RomFormat::Str, 21 | ).to_string(); 22 | 23 | // This unwrap is safe because `RomFormat::Str` never fails. 
24 | // If you prefer to avoid `.unwrap()`, use `romanize_string`. 25 | assert_eq!(result, result_f.unwrap()); 26 | 27 | println!("{result}"); 28 | 29 | // Lattice output 30 | let result = uroman 31 | .romanize_string::(s, lcode) 32 | .to_string() 33 | .unwrap(); 34 | 35 | let result_f = uroman 36 | .romanize_with_format(s, lcode, Some(RomFormat::Lattice)) 37 | .to_string() 38 | .unwrap(); 39 | 40 | assert_eq!(result, result_f); 41 | 42 | println!("{result}"); 43 | } 44 | -------------------------------------------------------------------------------- /examples/process_file.rs: -------------------------------------------------------------------------------- 1 | extern crate uroman; 2 | 3 | use std::{ 4 | env, 5 | fs::File, 6 | io::{self, BufReader}, 7 | process, 8 | }; 9 | 10 | use uroman::{RomFormat, Uroman}; 11 | 12 | fn main() { 13 | let args: Vec = env::args().collect(); 14 | 15 | if args.len() < 2 { 16 | eprintln!("Usage: cargo run --example process_file -- "); 17 | eprintln!("\nFor example, try:"); 18 | eprintln!(" cargo run --example process_file -- examples/sample.txt"); 19 | process::exit(1); 20 | } 21 | 22 | let filepath = &args[1]; 23 | println!("Processing file: {filepath}\n"); 24 | 25 | let uroman = Uroman::new(); 26 | 27 | let file = match File::open(filepath) { 28 | Ok(file) => file, 29 | Err(e) => { 30 | eprintln!("Error: Failed to open file '{filepath}': {e}"); 31 | process::exit(1); 32 | } 33 | }; 34 | 35 | let reader = BufReader::new(file); 36 | 37 | if let Err(e) = uroman.romanize_file( 38 | reader, 39 | io::stdout().lock(), 40 | None, 41 | RomFormat::Str, 42 | None, // max_lines 43 | true, // decode_unicode 44 | false, // silent 45 | ) { 46 | eprintln!("{e}") 47 | }; 48 | } 49 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "uroman" 3 | authors = ["stellanomia "] 4 | version = "0.6.3" 5 | 
edition = "2024" 6 | repository = "https://github.com/stellanomia/uroman-rs" 7 | description = "A blazingly fast, self-contained Rust reimplementation of the uroman universal romanizer." 8 | readme = "README.md" 9 | license = "Apache-2.0" 10 | build = "build.rs" 11 | keywords = ["uroman", "romanization", "unicode", "nlp", "cli"] 12 | categories = ["command-line-utilities", "text-processing"] 13 | 14 | [[bin]] 15 | name = "uroman-rs" 16 | path = "src/main.rs" 17 | required-features = ["cli"] 18 | 19 | [dependencies] 20 | regex = "1.12.2" 21 | serde = { version = "1.0.228", features = ["derive"] } 22 | serde_json = "1.0.145" 23 | unicode_names2 = "2.0.0" 24 | unicode-properties = "0.1.4" 25 | unicode-normalization = "0.1.25" 26 | unicode-segmentation = "1.12.0" 27 | num-rational = { version = "0.4.2", features = ["serde"] } 28 | ordered-float = "5.1.0" 29 | phf = { version = "0.13.1", features = ["macros"] } 30 | thiserror = "2.0.17" 31 | 32 | clap = { version = "4.5.51", features = ["derive"], optional = true } 33 | rustyline = { version = "17.0.2", features = ["derive"], optional = true } 34 | dirs = { version = "6.0.0", optional = true } 35 | unicode-width = { version = "0.2.2", optional = true } 36 | rayon = "1.11.0" 37 | 38 | [dev-dependencies] 39 | predicates = "3.1.3" 40 | 41 | [[test]] 42 | name = "unit_test" 43 | path = "tests/unit_tests.rs" 44 | harness = true 45 | 46 | [features] 47 | default = ["cli"] 48 | cli = [ 49 | "dep:clap", 50 | "dep:rustyline", 51 | "dep:dirs", 52 | "dep:unicode-width", 53 | ] 54 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | uroman-rs 2 | Copyright 2025 fulm-o 3 | 4 | This product is a Rust implementation of the 'uroman' universal romanizer. 5 | It is licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | ========================================================================== 18 | 19 | This product contains code derived from the 'uroman' software, which is 20 | subject to the following license: 21 | 22 | Copyright (C) 2015-2024 Ulf Hermjakob, USC Information Sciences Institute 23 | 24 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 25 | 26 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 27 | 28 | Any publication of projects using uroman shall acknowledge its use: "This project uses the universal romanizer software 'uroman' written by Ulf Hermjakob, USC Information Sciences Institute (2015-2020)". 29 | Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. 30 | 31 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | //! Utility functions for parsing uroman data files. 2 | 3 | use regex::Regex; 4 | use std::sync::{LazyLock, OnceLock}; 5 | 6 | use crate::core::Value; 7 | 8 | static HAS_ESCAPE_RE: LazyLock = 9 | LazyLock::new(|| Regex::new(r"\\(x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8})").unwrap()); 10 | 11 | /// Captures the value associated with a `::slot` in a line. 12 | /// 13 | /// This function is a Rust port of the Python version's `slot_value_in_double_colon_del_list`. 14 | /// It locates the first `::slot` occurrence with a plain substring search and returns the trimmed text up to the next `::` (or to the end of the line); `None` is returned when the marker is absent. Because the match is a substring match, a slot name that is a prefix of another slot name (e.g. `s` and `s1`) also matches the longer marker. 15 | /// 16 | /// # Example 17 | /// `slot_value_in_double_colon_del_list("::s1 of course ::s2 ::cost 0.3", "cost")` returns `Some("0.3")`. 18 | pub fn slot_value_in_double_colon_del_list<'a>(line: &'a str, slot: &'a str) -> Option<&'a str> { 19 | let search_str = format!("::{slot}"); 20 | if let Some(start_index) = line.find(&search_str) { 21 | let remaining = &line[start_index + search_str.len()..]; 22 | if let Some(end_index) = remaining.find("::") { 23 | Some(remaining[..end_index].trim()) 24 | } else { 25 | Some(remaining.trim()) 26 | } 27 | } else { 28 | None 29 | } 30 | } 31 | 32 | /// Checks if a slot exists in the line, even if it has no value (an empty value still counts as present). 33 | pub fn has_value_in_double_colon_del_list(line: &str, slot: &str) -> bool { 34 | slot_value_in_double_colon_del_list(line, slot).is_some() 35 | } 36 | 37 | /// Removes matching quotes from the start and end of a string. 38 | /// 39 | /// Handles single quotes, double quotes, and curly double quotes; whitespace outside the quotes is dropped, and the input is returned unchanged when the opening and closing quotes do not pair up. 
40 | pub fn dequote_string(s: &str) -> &str { 41 | static DEQUOTE_RE: OnceLock = OnceLock::new(); 42 | let re = DEQUOTE_RE.get_or_init(|| Regex::new(r#"^\s*(['"“])(.*)(['"”])\s*$"#).unwrap()); 43 | 44 | if let Some(m) = re.captures(s) { 45 | let open_quote = m.get(1).map_or("", |m| m.as_str()); 46 | let content = m.get(2).map_or("", |m| m.as_str()); 47 | let close_quote = m.get(3).map_or("", |m| m.as_str()); 48 | 49 | if (open_quote == "'" && close_quote == "'") 50 | || (open_quote == "\"" && close_quote == "\"") 51 | || (open_quote == "“" && close_quote == "”") 52 | { 53 | return content; 54 | } 55 | } 56 | s 57 | } 58 | 59 | pub fn robust_str_to_num(s: &str) -> Option { 60 | if let Ok(i) = s.parse::() { 61 | Some(Value::Int(i)) 62 | } else if let Ok(f) = s.parse::() { 63 | Some(Value::Float(f)) 64 | } else { 65 | Some(Value::String(s.to_string())) 66 | } 67 | } 68 | 69 | pub fn decode_unicode_escapes(s: &str) -> String { 70 | if !HAS_ESCAPE_RE.is_match(s) { 71 | return s.to_string(); 72 | } 73 | 74 | let mut result = String::with_capacity(s.len()); 75 | let mut last_end = 0; 76 | 77 | for m in HAS_ESCAPE_RE.find_iter(s) { 78 | result.push_str(&s[last_end..m.start()]); 79 | 80 | let full_escape_sequence = m.as_str(); 81 | let hex_part = &full_escape_sequence[2..]; 82 | 83 | let codepoint = u32::from_str_radix(hex_part, 16).unwrap(); 84 | 85 | if codepoint > 0x80 { 86 | result.push(std::char::from_u32(codepoint).unwrap_or(std::char::REPLACEMENT_CHARACTER)); 87 | } else { 88 | result.push_str(full_escape_sequence); 89 | } 90 | 91 | last_end = m.end(); 92 | } 93 | 94 | result.push_str(&s[last_end..]); 95 | 96 | result 97 | } 98 | -------------------------------------------------------------------------------- /tests/test/multi-script.txt: -------------------------------------------------------------------------------- 1 | ::lcode deu Grüße aus Bordeaux 2 | ::lcode tur İstanbul, Türkiye'de yer alan şehir ve ülkenin 81 ilinden biri. 
3 | ::lcode eng ⠠⠺⠑⠀⠓⠕⠇⠙⠀⠘⠮⠀⠞⠗⠥⠹⠎⠀⠞⠕⠀⠆⠀⠎⠑⠇⠋⠤⠑⠧⠊⠙⠢⠞⠂⠀⠞⠀⠁⠇⠇⠀⠍⠑⠝⠀⠜⠑⠀⠉⠗⠂⠞⠫⠀⠑⠟⠥⠁⠇⠂⠀⠞⠀⠮⠽⠀⠜⠑⠀⠑⠝⠙⠪⠫⠀⠃⠽⠀⠸⠮⠀⠠⠉⠗⠑⠁⠞⠕⠗⠀⠾⠀⠉⠻⠞⠁⠔⠀⠥⠝⠁⠇⠊⠑⠝⠁⠃⠇⠑⠀⠠⠐⠗⠎⠂⠀⠞⠀⠁⠍⠰⠛⠀⠘⠮⠀⠜⠑⠀⠠⠇⠊⠋⠑⠂⠀⠠⠇⠊⠃⠻⠞⠽⠀⠯⠀⠮⠀⠏⠥⠗⠎⠥⠊⠞⠀⠷⠀⠠⠓⠁⠏⠏⠊⠰⠎⠲ 4 | ::lcode ell Το Λος Άντζελες (στα ισπανικά Los Angeles = Οι Άγγελοι) ή στην Αμερικανική αργκό L.A., ελ έι) είναι η δεύτερη μεγαλύτερη πόλη των Ηνωμένων Πολιτειών από άποψη πληθυσμού, καθώς και ένα από τα σημαντικότερα οικονομικά, πολιτιστικά επιστημονικά και ψυχαγωγικά κέντρα του κόσμου. 5 | ::lcode rus Герма́ния (нем. Deutschland), официальное название — Федерати́вная Респу́блика Герма́ния (нем. Bundesrepublik Deutschland), ФРГ (нем. BRD) — государство в Западной Европе. Площадь территории — 357 021 км². Численность населения по переписи 2011 года — более 80 миллионов человек. [2][6]. 6 | ::lcode ukr Володи́мир Олекса́ндрович Зеле́нський (нар. 25 січня 1978, Кривий Ріг) — український державний діяч, політик, шоумен, актор, комік, режисер, продюсер та сценарист, шостий Президент України з 20 травня 2019 року. 7 | ::lcode srp Сва људска бића рађају се слободна и једнака у достојанству и правима. Она су обдарена разумом и свешћу и треба једни према другима да поступају у духу братства. 8 | ::lcode ara كندا (بالإنجليزية: Canada) هي دولة في أمريكا الشمالية تتألف من 10 مقاطعات وثلاثة أقاليم. تقع في القسم الشمالي من القارة وتمتد من المحيط الأطلسي في الشرق إلى المحيط الهادئ في الغرب وتمتد شمالاً في المحيط المتجمد الشمالي. كندا هي البلد الثاني عالمياً من حيث المساحة الكلية. كما أن حدود كندا المشتركة مع الولايات المتحدة من الجنوب والشمال الغربي هي الأطول في العالم. 9 | ::lcode fas کالیفرنیا (به انگلیسی: California) ایالتی در غرب آمریکا بر کرانهٔ اقیانوس آرام است. مرکز آن ساکرامنتو و شهرهای مهم آن لس‌آنجلس، سن دیگو، سن خوزه و سان‌فرانسیسکو هستند.همچنین این ایالت پر جمعیت ترین ایالت امریکا است. 10 | ::lcode uig ئامېرىكا قوشما شتاتلىرى بولسا شىمالىي ئامېرىكاغا جايلاشقان بىر دۆلەت. ئۇنىڭ پايتەختى بولسا ۋاشىنگتون، ئەڭ چوڭ شەھىرى بولسا نيۇيورك شەھىرى. دۆلەت تىلى بولسا ئېنگلىزتىلى. 
ھازىرقى زۇڭتۇڭ باراك ئوباما. بۇ دۆلەت ئەسلىدە ئەنگىلىيەنىڭ مۇستەملىكىسى بولۇپ ۋاشىنگىتوننىڭ رەھپەرلىكىدە 1776 يىلى 7 ئاينىڭ 4 كۇنى مۇستەقىل بولغان، يەر مەيدانى 9 مىلىيون 826 مىڭ 630 كۋادىرات كلومېتىر، نوپۇسى 306 مىللىيون 142 مىڭ، بۇلارنىڭ ئاسساسلىق دىنى خرىستىئان دىنى. 11 | ::lcode amh ኢትዮጵያ ከዓለም ሶስቱ ትልቅ የአብርሃም ሀይማኖቶች ጋር ታሪካዊ ግንኙነት አላት። 12 | ::lcode hin कैलिफ़ोर्निया शब्द का पहला अर्थ था जो क्षेत्र जहाँ आज बाहा कैलिफ़ोर्निया प्रायद्वीप, नेवाडा, यूटा और एरिज़ोना, नया मेक्सिको, और वायोमिंग के कई विभाग स्थित हैं। 13 | ::lcode mar लंडन (इंग्लिश: London ) हे इंग्लंडचे व युनायटेड किंग्डमचे राजधानीचे व सर्वात मोठे शहर तसेच युरोपियन संघामधील सर्वात मोठे महानगर क्षेत्र आहे. 14 | ::lcode nep यसको उचाइ समुन्द्र सतहबाट ८,८४८ मीटर (२९,०२८ फीट) छ। यो नेपालको सोलुखुम्बु जिल्लाको खुम्जुङ्ग गा. वि. स. मा पर्छ । 15 | ::lcode tam தமிழ்நாடு (Tamil Nadu) இந்தியாவின் 29 மாநிலங்களில் ஒன்றாகும். தமிழ்நாடு, தமிழகம் என்றும் பரவலாக அழைக்கப்படுகிறது. 16 | ::lcode mal ഇന്ത്യയുടെ തെക്കുപടിഞ്ഞാറെ അറ്റത്തുള്ള സംസ്ഥാനമാണ് കേരളം. 17 | ::lcode ori ଓଡ଼ିଶା ଭାରତର ପୂର୍ବ ଉପକୂଳରେ ଥିବା ଏକ ପ୍ରଶାସନିକ ରାଜ୍ୟ । ଏହାର ଉତ୍ତର-ପୂର୍ବରେ ପଶ୍ଚିମବଙ୍ଗ, ଉତ୍ତରରେ ଝାଡ଼ଖଣ୍ଡ, ପଶ୍ଚିମ ଓ ଉତ୍ତର-ପଶ୍ଚିମରେ ଛତିଶଗଡ଼, ଦକ୍ଷିଣ ଓ ଦକ୍ଷିଣ-ପଶ୍ଚିମରେ ଆନ୍ଧ୍ରପ୍ରଦେଶ ଅବସ୍ଥିତ । ଏହା ଆୟତନ ହିସାବରେ ନବମ ଓ ଜନସଂଖ୍ୟା ହିସାବରେ ଏଗାରତମ ରାଜ୍ୟ । ଓଡ଼ିଆ ଭାଷା ରାଜ୍ୟର ସରକାରୀ ଭାଷା । ୨୦୦୧ ଜନଗଣନା ଅନୁସାରେ ରାଜ୍ୟର ପ୍ରାୟ ୩୩.୨ ନିୟୁତ ଲୋକ ଓଡ଼ିଆ ଭାଷା ବ୍ୟବହାର କରନ୍ତି । 18 | ::lcode zho 加拿大在一万四千年前即有原住民在此生活。 19 | ::lcode heb כֹּל עוֹד בַּלֵּבָב פְּנִימָה נֶפֶשׁ יְהוּדִי הוֹמִיָּה וּלְפַאֲתֵי מִזְרָח, קָדִימָה, עַיִן לְצִיּוֹן צוֹפִיָּה, עוֹד לֹא אָבְדָה תִּקְוָתֵנוּ, הַתִּקְוָה בַּת שְׁנוֹת אַלְפַּיִם לִהְיוֹת עַם חָפְשִׁי בְּאַרְצֵנוּ, אֶרֶץ צִיּוֹן וִירוּשָׁלַיִם. 20 | ::lcode yid דווקא איז אן העברעישער זשורנאל וואס באשרייבט די יידיש־שפראכיקע קולטור. עס איז דערשינען געווארן תמוז ה'תשס"ז (יולי 2006). 
21 | ::lcode hye Տալնոեի շրջան (ուկր.՝ Тальнівський район), շրջան Ուկրաինայի Չերկասիի մարզում։ Ստեղծվել է 1923 թվականին։ Վարչական կենտրոնը՝ Տալնոե։ Աշխարհագրությունը Շրջանի տարածքի մակերեսը կազմում է 917 կմ²։ Բնակչություն 22 | ::lcode tai มีประเทศอิสระ 2 ประเทศ คือ ซานมารีโนและนครรัฐวาติกัน เป็นดินแดนที่ล้อมรอบไปด้วยพื้นที่ของอิตาลี ในขณะที่เมืองกัมปีโอเนดีตาเลีย เป็นดินแดนส่วนแยกของอิตาลีที่ถูกล้อมรอบด้วยพื้นที่ประเทศสวิตเซอร์แลนด์ 23 | 북쪽에는 인도네시아와 동티모르, 파푸아 뉴기니, 북동쪽에는 솔로몬 제도와 바누아투, 누벨칼레도니, 그리고 남동쪽에는 뉴질랜드가 있다. 24 | ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು ಇಂದೆನ್ನ ಹೃದಯದಲಿ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗೀ... ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗಿ ಭವ ಭವದಿ ಭತಿಸಿಹೇ ಭವತಿ ದೂರ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ || ಬಾ ಇಲ್ಲಿ || 25 | ვეპხის ტყაოსანი შოთა რუსთაველი ღმერთსი შემვედრე, ნუთუ კვლა დამხსნას სოფლისა შრომასა, ცეცხლს, წყალსა და მიწასა, ჰაერთა თანა მრომასა; მომცნეს ფრთენი და აღვფრინდე, მივჰხვდე მას ჩემსა ნდომასა, დღისით და ღამით ვჰხედვიდე მზისა ელვათა კრთომაასა. 26 | ᚛ᚐᚅᚋ ᚋᚖᚂᚓᚌᚖᚋᚏᚔᚇ ᚋᚐᚉᚔ ᚍᚓᚉᚒᚋᚓᚅ᚜ 27 | ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬ 28 | 𓊪𓏏𓍯𓃭𓐝𓇌𓋴 29 | チェコスロバキア 30 | ལྷ་ས་གྲོང་ཁྱེར 31 | ᓵᓕ ᓴᕕᐊᕐᔪᒃ ᐃᒻᒥᓂᒃ ᓂᓪᓕᕈᑎᖃᓲᖑᕗᖅ ᑕᐃᑦᓱᒪᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ. ᐃᒻᒥᓂᓪᓗᑕᐅᖅ ᓂᓪᓕᕈᑎᖃᓱᖑᒻᒥᓱᓂ ᐅᓪᓗᒥᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ. 32 | ⴰⵎⴰⴳⵔⴰⴷ 1 ⴰⵔ ⴷ ⵜⵜⵍⴰⵍⴰⵏ ⵎⵉⴷⴷⵏ ⴳⴰⵏ ⵉⵍⴻⵍⵍⵉⵜⵏ ⵎⴳⴰⴷⴷⴰⵏ ⵖ ⵡⴰⴷⴷⵓⵔ ⴷ ⵉⵣⵔⴼⴰⵏ, ⵢⵉⵍⵉ ⴰⴽⵯ ⴷⴰⵔⵙⵏ ⵓⵏⵍⵍⵉ ⴷ ⵓⴼⵔⴰⴽ, ⵉⵍⵍⴰ ⴼⵍⵍⴰ ⵙⵏ ⴰⴷ ⵜⵜⵎⵢⴰⵡⴰⵙⵏ ⵏⴳⵔⴰⵜⵙⵏ ⵙ ⵜⴰⴳⵎⴰⵜ. 33 | -------------------------------------------------------------------------------- /tests/test/multi-script.uroman-ref.txt: -------------------------------------------------------------------------------- 1 | ::lcode deu Gruesse aus Bordeaux 2 | ::lcode tur Istanbul, Tuerkiye'de yer alan shehir ve uelkenin 81 ilinden biri. 3 | ::lcode eng We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness. 
4 | ::lcode ell To Los Andzeles (sta ispanika Los Angeles = Oi Angeloi) e sten Amerikanike arngo L.A., el ei) einai e deutere megalutere pole ton Enomenon Politeion apo apopse plethysmou, kathos kai ena apo ta semandikotera oikonomika, politistika epistemonika kai psychagogika kendra tou kosmou. 5 | ::lcode rus Germaniya (nem. Deutschland), ofitsialnoye nazvaniye — Federativnaya Respublika Germaniya (nem. Bundesrepublik Deutschland), FRG (nem. BRD) — gosudarstvo v Zapadnoy Yevrope. Ploshchad territorii — 357 021 km². Chislennost naseleniya po perepisi 2011 goda — boleye 80 millionov chelovek. [2][6]. 6 | ::lcode ukr Volodymyr Oleksandrovych Zelensky (nar. 25 sichnya 1978, Kryvy Rih) — ukrayinsky derzhavny diyach, polityk, shoumen, aktor, komik, rezhyser, prodyuser ta stsenaryst, shosty Prezydent Ukrayiny z 20 travnya 2019 roku. 7 | ::lcode srp Sva ljudska bitsha radjaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sveshtshu i treba jedni prema drugima da postupaju u duhu bratstva. 8 | ::lcode ara knda (balinjlyzya: Canada) hy dwla fy amryka alshmalya ttalf mn 10 mqat'at wthlatha aqalym. tq' fy alqsm alshmaly mn alqara wtmtd mn almhyt alatlsy fy alshrq ila almhyt alhadye fy alghrb wtmtd shmalan fy almhyt almtjmd alshmaly. knda hy albld althany 'almyan mn hyth almsaha alklya. kma an hdwd knda almshtrka m' alwlayat almthda mn aljnwb walshmal alghrby hy alatwl fy al'alm. 9 | ::lcode fas kalifrnia (be anglisi: California) ialti dr ghrb amrika br kraneye aqianvs aram ast. mrkz an sakramntv v shhrhai mhm an lsanjls, sn digv, sn khvze v sanfransiskv hstnd.hmchnin in ialt pr jmit trin ialt amrika ast. 10 | ::lcode uig amerika qoshma shtatliri bolsa shimaliy amerikagha jaylashqan bir doelet. uning paytexti bolsa washington, eng chong shehiri bolsa nyuyork shehiri. doelet tili bolsa engliztili. hazirqi zungtung barak obama. 
bu doelet eslide engiliening mustemlikisi bolup washingitonning rehperlikide 1776 yili 7 ayning 4 kuni musteqil bolghan, er meydani 9 miliyon 826 ming 630 kwadirat klometir, nopusi 306 milliyon 142 ming, bularning assasliq dini xristian dini. 11 | ::lcode amh iteyopheyaa kaaalame sosetu teleqe yaaberehaame hayemaanotoche gaare taarikaawi genenyunate alaate. 12 | ::lcode hin kailiforniyaa shabda kaa pahalaa artha thaa jo kssetra jahaam aaj baahaa kailiforniyaa praayadviip, nevaaddaa, yuuttaa aur erizonaa, nayaa meksiko, aur vaayomimg ke kaii vibhaag sthit haim. 13 | ::lcode mar lamddan (imglish: London ) he imglamddace va yunaayattedd kimgddamace raajadhaaniice va sarvaat motthe shahar tasec yuropiyan samghaamadhiil sarvaat motthe mahaanagar kssetra aahe. 14 | ::lcode nep yasako ucaai samundra satahabaatt 8,848 miittar (29,028 phiitt) cha. yo nepaalako solukhumbu jillaako khumjungga gaa. vi. sa. maa parcha . 15 | ::lcode tam tamilnaadu (Tamil Nadu) intiyaavin 29 maanilangkalil onraakum. tamilnaadu, tamilakam enrum paravalaaka alaikkappadukiratu. 16 | ::lcode mal intyayutte tekkupattinynyaarre arrrrattulllla samsthaanamaann keerallam. 17 | ::lcode ori oddishaa bhaaratara puurba upakuullare thibaa eka prashaasanika raajya . ehaara uttara-puurbare pashcimabangga, uttarare jhaaddakhanndda, pashcima o uttara-pashcimare chatishagadda, dakssinna o dakssinna-pashcimare aandhrapradesha abasthita . ehaa aayatana hisaabare nabama o janasamkhyaa hisaabare egaaratama raajya . oddiaa bhaassaa raajyara sarakaarii bhaassaa . 2001 janagannanaa anusaare raajyara praaya 33.2 niyuta loka oddiaa bhaassaa byabahaara karanti . 18 | ::lcode zho jianadazai14000nianqianjiyouyuanzhuminzaicishenghuo. 19 | ::lcode heb kol 'od balevav penimah nefesh yehudi homiyah ulefa'ate mizerach, qadimah, 'ayin letsiyon tsofiyah, 'od lo avedah tiqvatenu, hatiqvah bat shenot 'alepayim liheyot 'am chafeshiy be'aretsenu, erets tsiyon virushalayim. 
20 | ::lcode yid duuka yz an hebreysher zhurnal was bashreybt dy eydysh-shfrachyke kultur. es yz dershynen gewarn smuz h'sshs"z (yuly 2006). 21 | ::lcode hye Talnoei shrjan (ukr., Talnivsky raion), shrjan Ukrainayi Cherkasii marzum. Steghtsvel e 1923 tvakanin. Varchakan kentrone, Talnoe. Ashkharhagrutyune Shrjani taratski makerese kazmum e 917 km². Bnakchutyun 22 | ::lcode tai miiprathetitra 2 prathet khuee saanmaariinolaenkhanatwaatikan pendindaenthiilomroppaiduaiphueenthiikhongitaalii naiknathiimueangkampiionediitaalia pendindaensuanyaekkhongitaaliithiithuuklomropduaiphueenthiiprathetswitsoelaen 23 | bugjjogeneun indonesiawa dongtimoreu, papua nyugini, bugdongjjogeneun solromon jedowa banuatu, nubelkalredoni, geurigo namdongjjogeneun nyujilraendeuga issda. 24 | baa illi sambhavisu imdenna hrdayadali nityavuu avataripa satyaavataara mannnnaagi maravaagi migavaagi kagavaagii... mannnnaagi maravaagi migavaagi kagavaagi bhava bhavadi bhatisihee bhavati duura nityavuu avataripa satyaavataara || baa illi || 25 | vepxis tqaosani shota rustaveli ghmertsi shemvedre, nutu kvla damxsnas sophlisa shromasa, tsetsxls, tsqalsa da mitsasa, haerta tana mromasa; momtsnes phrteni da aghvphrinde, mivhxvde mas chemsa ndomasa, dghisit da ghamit vhxedvide mzisa elvata krtomaasa. 26 | anm moilegoimrid maki vekumen 27 | ic mag glas eotan ond hit ne hearmiath me. 28 | ptolmys 29 | chekosurobakia 30 | lha·sa·grong·khyer 31 | saali safiaryok imminik nillirotiqasoongofoq taitsomanitatsayaonirarsoni. imminillotaoq nillirotiqasongommisoni ollominitatsayaonirarsoni. 32 | amagrad 1 ar d ttlalan middn gan ilellitn mgaddan gh waddur d izrfan, yili ak darsn unlli d ufrak, illa flla sn ad ttmyawasn ngratsn s tagmat. 
33 | -------------------------------------------------------------------------------- /tests/test/multi-script.uroman-ref-perl.txt: -------------------------------------------------------------------------------- 1 | ::lcode deu Gruesse aus Bordeaux 2 | ::lcode tur Istanbul, Tuerkiye'de yer alan shehir ve uelkenin 81 ilinden biri. 3 | ::lcode eng We hold ⠘e truos to ; self-evid⠢t, t all men aee cr,te equal, t ey aee endoee by ⠸e Creator u cita⠔ unalienable ⠠⠐rs, t amg ⠘e aee Life, Libity ⠯ e pursuit a Happis. 4 | ::lcode ell To Los Andzeles (sta ispanika Los Angeles = Oi Angeloi) e sten Amerikanike arngo L.A., el ei) einai e deutere megalutere pole ton Enomenon Politeion apo apopse plethysmou, kathos kai ena apo ta semandikotera oikonomika, politistika epistemonika kai psychagogika kendra tou kosmou. 5 | ::lcode rus Germaniya (nem. Deutschland), ofitsialnoe nazvanie — Federativnaya Respublika Germaniya (nem. Bundesrepublik Deutschland), FRG (nem. BRD) — gosudarstvo v Zapadnoi Evrope. Ploshchad territorii — 357 021 km². Chislennost naseleniya po perepisi 2011 goda — bolee 80 millionov chelovek. [2][6]. 6 | ::lcode ukr Volodimir Oleksandrovich Zelensky (nar. 25 sichnya 1978, Krivy Rig) — ukrayinsky derzhavny diyach, politik, shoumen, aktor, komik, rezhiser, prodyuser ta stsenarist, shosty Prezident Ukrayini z 20 travnya 2019 roku. 7 | ::lcode srp Sva ljudska bitsha radjaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sveshtshu i treba jedni prema drugima da postupaju u dukhu bratstva. 8 | ::lcode ara knda (balinjlyzya: Canada) hy dwla fy amryka alshmalya ttalf mn 10 mqat'at wthlatha aqalym. tq' fy alqsm alshmaly mn alqara wtmtd mn almhyt alatlsy fy alshrq ila almhyt alhadye fy alghrb wtmtd shmalan fy almhyt almtjmd alshmaly. knda hy albld althany 'almyan mn hyth almsaha alklya. kma an hdwd knda almshtrka m' alwlayat almthda mn aljnwb walshmal alghrby hy alatwl fy al'alm. 
9 | ::lcode fas kalifrnia (bh anglisi: California) ialti dr ghrb amrika br kranh' aqianws aram ast. mrkz an sakramntw w shhrhai mhm an lsanjls, sn digw, sn khwzh w sanfransiskw hstnd.hmtchnin in ialt pr jm'it trin ialt amrika ast. 10 | ::lcode uig yeameraka qwshma shtatlara bwlsa shamalay yeamerakagha jaylashqan bar doelaet. yeunang paytaekhta bwlsa vashangtwn, yeaeng tchwng shaehara bwlsa nyuywrk shaehara. doelaet tala bwlsa yeenglaztala. hazarqa zungtung barak yewbama. bu doelaet yeaesladae yeaengalayaenang mustaemlakasa bwlup vashangatwnnang raehpaerlakadae 1776 yala 7 yeaynang 4 kuna mustaeqal bwlghan, yaer maeydana 9 malaywn 826 mang 630 kvadarat klwmetar, nwpusa 306 mallaywn 142 mang, bularnang yeassaslaq dana khrastayean dana. 11 | ::lcode amh iteyopheyaa kaaalame sosetu teleqe yaaberehaame hayemaanotoche gaare taarikaawi genenyunate alaate. 12 | ::lcode hin kailiforniyaa shabda kaa pahalaa artha thaa jo kssetra jahaam aaj baahaa kailiforniyaa praayadviip, nevaaddaa, yuuttaa aur erizonaa, nayaa meksiko, aur vaayomimga ke kaii vibhaag sthit haim. 13 | ::lcode mar lamddan (imglish: London ) he imglamddace va yunaayattedd kimgddamace raajadhaaniice va sarvaat motthe shahar tasec yuropiyan samghaamadhiil sarvaat motthe mahaanagar kssetra aahe. 14 | ::lcode nep yasako ucaai samundra satahabaatt 8,848 miittar (29,028 phiitt) cha. yo nepaalako solukhumbu jillaako khumjungga gaa. vi. sa. maa parcha . 15 | ::lcode tam tamilnaadu (Tamil Nadu) intiyaavin 29 maanilangkalil onraakum. tamilnaadu, tamilakam enrum paravalaaka alaikkappadukiratu. 16 | ::lcode mal intyayutte tekkupattinynyaarre arrrrattulllla samsthaanamaann keerallam. 17 | ::lcode ori oddishaa bhaaratara puurba upakuullare thibaa eka prashaasanika raajya . ehaara uttara-puurbare pashcimabangga, uttarare jhaaddakhanndda, pashcima o uttara-pashcimare chatishagadda, dakssinna o dakssinna-pashcimare aandhrapradesha abasthita . ehaa aayatana hisaabare nabama o janasamkhyaa hisaabare egaaratama raajya . 
oddiaa bhaassaa raajyara sarakaarii bhaassaa . 2001 janagannanaa anusaare raajyara praaya 33.2 niyuta loka oddiaa bhaassaa byabahaara karanti . 18 | ::lcode zho jianadazai14000nianqianjiyouyuanzhuminzaicishenghuo. 19 | ::lcode heb kol 'od balevav penimah nefesh yehudi homiyah ulefa'ate mizerach, qadimah, 'ayin letsiyon tsofiyah, 'od lo avedah tiqvatenu, hatiqvah bat shenot 'alepayim liheyot 'am chafeshiy be'aretsenu, erets tsiyon virushalayim. 20 | ::lcode yid dvvqa ayz an h'vr'ysh'r zshvrnal vvas vashryyvt dy yydysh-shfrakyq' qvltvr. 's ayz d'rshyn'n g'vvarn tmvz h'tshs"z (yvly 2006). 21 | ::lcode hye Talnoei shrjan (ukr., Talnivsky raion), shrjan Ukrainayi Cherkasii marzum. Steghtsvel e 1923 tvakanin. Varchakan kentrone, Talnoe. Ashkharhagrutyune Shrjani taratski makerese kazmum e 917 km². Bnakchutyun 22 | ::lcode tai miipraethsisra 2 praeths khuee saanmaariinoaelankhanathwaatikan peondinaednthiilomrobpaiduaiphueenthiikhongitaalii naikhnathiimeueengkampiionediitaaelia peondinaednswnaeykkhongitaaliithiithuuklomrobduaiphueenthiipraethsswitseraelnd 23 | bugjjogeneun indonesiawa dongtimoreu, papua nyugini, bugdongjjogeneun solromon jedowa banuatu, nubelkalredoni, geurigo namdongjjogeneun nyujilraendeuga issda. 24 | baa illi sambhavisu imdenna hrdayadali nityavuu avataripa satyaavataara mannnnaagi maravaagi migavaagi kagavaagii... mannnnaagi maravaagi migavaagi kagavaagi bhava bhavadi bhatisihee bhavati duura nityavuu avataripa satyaavataara || baa illi || 25 | vepxis tqaosani shota rustaveli ghmertsi shemvedre, nutu kvla damxsnas sophlisa shromasa, tsetsxls, tsqalsa da mitsasa, haerta tana mromasa; momtsnes phrteni da aghvphrinde, mivhxvde mas chemsa ndomasa, dghisit da ghamit vhxedvide mzisa elvata krtomaasa. 26 | anm moilegoimrid maki vekumen 27 | ic mag glas eotan ond hit ne hearmiath me. 28 | ptolmys 29 | chekosurobakia 30 | lha·sa·grong·khyer 31 | saali safiaryok imminik nillirotiqasoongofoq taitsomanitatsayaonirarsoni. 
imminillotaoq nillirotiqasongommisoni ollominitatsayaonirarsoni. 32 | amagrad 1 ar d ttlalan middn gan ilellitn mgaddan gh waddur d izrfan, yili ak darsn unlli d ufrak, illa flla sn ad ttmyawasn ngratsn s tagmat. 33 | -------------------------------------------------------------------------------- /data/Scripts.txt: -------------------------------------------------------------------------------- 1 | ::script-name Adlam 2 | ::script-name Aegean 3 | ::script-name Ahom 4 | ::script-name Anatolian Hieroglyph 5 | ::script-name Arabic ::direction right-to-left 6 | ::script-name Arabic-Indic 7 | ::script-name Armenian 8 | ::script-name Avestan 9 | ::script-name Balinese 10 | ::script-name Bamum 11 | ::script-name Bassa Vah 12 | ::script-name Batak 13 | ::script-name Bengali ::abugida-default-vowel a 14 | ::script-name Bhaiksuki 15 | ::script-name Bopomofo ::language Chinese 16 | ::script-name Brahmi ::abugida-default-vowel a 17 | ::script-name Braille 18 | ::script-name Buginese 19 | ::script-name Buhid 20 | ::script-name Canadian Syllabics 21 | ::script-name Carian 22 | ::script-name Caucasian Albanian 23 | ::script-name Chakma 24 | ::script-name Cham 25 | ::script-name Cherokee 26 | ::script-name Chorasmian 27 | ::script-name Coptic 28 | ::script-name Cuneiform 29 | ::script-name Cypro-Minoan 30 | ::script-name Cypriot 31 | ::script-name Cyrillic 32 | ::script-name CJK ::alt-script-name Chinese, Kanji ::language Chinese, Japanese, Korean, Mandarin 33 | ::script-name Deseret 34 | ::script-name Devanagari ::abugida-default-vowel a 35 | ::script-name Dives Akuru 36 | ::script-name Dogra 37 | ::script-name Duployan 38 | ::script-name Egyptian Hieroglyph ::alt-script-name Egyptian 39 | ::script-name Elbasan 40 | ::script-name Elymaic 41 | ::script-name Ethiopic 42 | ::script-name Extended Arabic-Indic 43 | ::script-name Georgian 44 | ::script-name Glagolitic 45 | ::script-name Gothic 46 | ::script-name Grantha 47 | ::script-name Greek 48 | ::script-name Greek Acrophonic 49 | 
::script-name Gujarati ::abugida-default-vowel a 50 | ::script-name Gunjala Gondi 51 | ::script-name Gurmukhi ::abugida-default-vowel a 52 | ::script-name Hangul ::language Korean 53 | ::script-name Hangzhou 54 | ::script-name Hanifi Rohingya 55 | ::script-name Hanunoo 56 | ::script-name Hatran 57 | ::script-name Hebrew ::direction right-to-left 58 | ::script-name Hiragana ::language Japanese 59 | ::script-name Indic Siyaq 60 | ::script-name Imperial Aramaic 61 | ::script-name Inscriptional Pahlavi 62 | ::script-name Inscriptional Parthian 63 | ::script-name Javanese 64 | ::script-name Kaithi 65 | ::script-name Kannada ::abugida-default-vowel a 66 | ::script-name Katakana ::language Japanese 67 | ::script-name Kawi 68 | ::script-name Kayah Li 69 | ::script-name Kharoshthi 70 | ::script-name Khitan Small Script 71 | ::script-name Khmer ::abugida-default-vowel a, o 72 | ::script-name Khojki 73 | ::script-name Khudawadi 74 | ::script-name Klingon 75 | ::script-name Lao 76 | ::script-name Lepcha 77 | ::script-name Latin 78 | ::script-name Limbu 79 | ::script-name Linear A 80 | ::script-name Linear B 81 | ::script-name Lisu 82 | ::script-name Lycian 83 | ::script-name Lydian 84 | ::script-name Mahajani 85 | ::script-name Makasar 86 | ::script-name Malayalam ::abugida-default-vowel a 87 | ::script-name Mandaic 88 | ::script-name Manichaean 89 | ::script-name Marchen 90 | ::script-name Masaram Gondi 91 | ::script-name Mayan 92 | ::script-name Medefaidrin 93 | ::script-name Meetei Mayek 94 | ::script-name Mende Kikakui 95 | ::script-name Meroitic Cursive 96 | ::script-name Meroitic Hieroglyphic 97 | ::script-name Miao 98 | ::script-name Modi ::abugida-default-vowel a 99 | ::script-name Mongolian 100 | ::script-name Mro 101 | ::script-name Multani 102 | ::script-name Myanmar ::alt-script-name Burmese ::abugida-default-vowel a 103 | ::script-name Nabataean 104 | ::script-name Nag Mundari 105 | ::script-name Nandinagari 106 | ::script-name New Tai Lue 107 | ::script-name Newa 
108 | ::script-name Nko ::direction right-to-left 109 | ::script-name North Indic 110 | ::script-name Nushu 111 | ::script-name Nyiakeng Puachue Hmong 112 | ::script-name Ogham 113 | ::script-name Ol Chiki 114 | ::script-name Old Hungarian 115 | ::script-name Old Italic 116 | ::script-name Old Permic 117 | ::script-name Old Persian 118 | ::script-name Old North Arabian 119 | ::script-name Old Sogdian 120 | ::script-name Old South Arabian 121 | ::script-name Old Turkic 122 | ::script-name Old Uyghur 123 | ::script-name Oriya ::alt-script-name Odia ::abugida-default-vowel a 124 | ::script-name Osage 125 | ::script-name Osmanya 126 | ::script-name Ottoman Siyaq 127 | ::script-name Pahawh Hmong 128 | ::script-name Palmyrene 129 | ::script-name Pau Cin Hau 130 | ::script-name Phags-Pa 131 | ::script-name Phaistos Disc 132 | ::script-name Phoenician 133 | ::script-name Psalter Pahlavi 134 | ::script-name Rejang 135 | ::script-name Rumi 136 | ::script-name Runic 137 | ::script-name Samaritan 138 | ::script-name Saurashtra 139 | ::script-name Sharada 140 | ::script-name Shavian 141 | ::script-name Siddham 142 | ::script-name SignWriting 143 | ::script-name Sinhala ::abugida-default-vowel a 144 | ::script-name Sogdian 145 | ::script-name Sora Sompeng 146 | ::script-name Soyombo 147 | ::script-name Sundanese ::abugida-default-vowel a 148 | ::script-name Syloti Nagri 149 | ::script-name Syriac 150 | ::script-name Tagalog 151 | ::script-name Tagbanwa 152 | ::script-name Tai Le 153 | ::script-name Tai Tham 154 | ::script-name Tai Viet 155 | ::script-name Takri 156 | ::script-name Tamil ::abugida-default-vowel a 157 | ::script-name Tangsa 158 | ::script-name Tangut 159 | ::script-name Telugu ::abugida-default-vowel a 160 | ::script-name Thaana ::direction right-to-left 161 | ::script-name Thai 162 | ::script-name Tibetan ::abugida-default-vowel a 163 | ::script-name Tifinagh 164 | ::script-name Tirhuta 165 | ::script-name Toto 166 | ::script-name Ugaritic 167 | ::script-name Vai 
168 | ::script-name Vedic 169 | ::script-name Vithkuqi 170 | ::script-name Wancho 171 | ::script-name Warang Citi 172 | ::script-name Yezidi 173 | ::script-name Yi 174 | ::script-name Zanabazar Square 175 | -------------------------------------------------------------------------------- /src/rom_rule.rs: -------------------------------------------------------------------------------- 1 | //! Defines the `RomRule` struct and related parsing logic. 2 | 3 | use std::collections::HashMap; 4 | 5 | use crate::{ 6 | core::UromanInner, utils::{ 7 | dequote_string, has_value_in_double_colon_del_list, slot_value_in_double_colon_del_list, 8 | } 9 | }; 10 | 11 | #[allow(unused)] // NOTE(review): blanket allow with no stated reason; confirm which fields are unread and narrow it 12 | /// Represents a single romanization rule parsed from the data files. 13 | #[derive(Debug, Clone)] 14 | pub(super) struct RomRule { 15 | pub s: String, // source text the rule matches (`s` slot, or the `u` code point in "u2r" files) 16 | pub t: Option, // target romanization for `s`, if any 17 | pub prov: String, // provenance label supplied by the caller 18 | pub lcodes: Vec, // entries of the `lcode` slot, split on `,` or `;` and trimmed 19 | pub use_only_at_start_of_word: bool, 20 | pub dont_use_at_start_of_word: bool, 21 | pub use_only_at_end_of_word: bool, 22 | pub dont_use_at_end_of_word: bool, 23 | pub use_only_for_whole_word: bool, 24 | pub n_restr: usize, // how many of these are set: non-empty `lcodes` plus the five word-position flags 25 | pub t_alts: Vec, // entries of the `t-alt` slot, split on `,` or `;` and dequoted 26 | pub num: Option, // `num` slot as parsed by `utils::robust_str_to_num` 27 | pub is_minus_sign: bool, 28 | pub is_plus_sign: bool, 29 | pub is_decimal_point: bool, 30 | pub fraction_connector: bool, 31 | pub percentage_marker: bool, 32 | pub int_frac_connector: bool, 33 | pub is_large_power: bool, 34 | pub t_at_end_of_syllable: Option, // dequoted `t-end-of-syllable` slot, if present 35 | } 36 | 37 | impl RomRule { 38 | /// Creates a simple `RomRule` with default values for most fields.
39 | pub fn new_simple(s: String, t: &str, provenance: &str) -> Self { 40 | Self { 41 | s, 42 | t: Some(t.to_string()), 43 | prov: provenance.to_string(), 44 | lcodes: Vec::new(), 45 | use_only_at_start_of_word: false, 46 | dont_use_at_start_of_word: false, 47 | use_only_at_end_of_word: false, 48 | dont_use_at_end_of_word: false, 49 | use_only_for_whole_word: false, 50 | n_restr: 0, 51 | t_alts: Vec::new(), 52 | num: None, 53 | is_minus_sign: false, 54 | is_plus_sign: false, 55 | is_decimal_point: false, 56 | fraction_connector: false, 57 | percentage_marker: false, 58 | int_frac_connector: false, 59 | is_large_power: false, 60 | t_at_end_of_syllable: None, 61 | } 62 | } 63 | 64 | /// Returns `true` when the rule has no contextual restrictions: `lcodes` is empty and none of the five word-position flags is set. 65 | /// This is crucial for the rule overwriting logic in `load_rom_file`. 66 | pub fn is_unconditional(&self) -> bool { 67 | self.lcodes.is_empty() 68 | && !self.use_only_at_start_of_word 69 | && !self.dont_use_at_start_of_word 70 | && !self.use_only_at_end_of_word 71 | && !self.dont_use_at_end_of_word 72 | && !self.use_only_for_whole_word 73 | } 74 | /// Parses one rule line into a `RomRule`. For `file_format == "u2r"` the source is the character whose hex code point is in the `u` slot and the target is the `r` slot; otherwise they are the dequoted `s` and `t` slots. Returns `None` if the source slot is missing or, for "u2r", is not a valid hex Unicode scalar value. `lcode` parts are trimmed; `t-alt` parts are only dequoted. 75 | pub fn from_line( 76 | line: &str, 77 | provenance: &str, 78 | file_format: &str, 79 | uroman: &mut UromanInner, 80 | ) -> Option { 81 | let (s, t) = if file_format == "u2r" { 82 | let u_str = slot_value_in_double_colon_del_list(line, "u")?; 83 | let cp = u32::from_str_radix(u_str, 16).ok()?; 84 | let s = std::char::from_u32(cp)?.to_string(); 85 | let t = slot_value_in_double_colon_del_list(line, "r") 86 | .map(|s_val| dequote_string(s_val).to_string()); 87 | (s, t) 88 | } else { 89 | let s = slot_value_in_double_colon_del_list(line, "s") 90 | .map(|s_val| dequote_string(s_val).to_string())?; 91 | let t = slot_value_in_double_colon_del_list(line, "t") 92 | .map(|s_val| dequote_string(s_val).to_string()); 93 | (s, t) 94 | }; 95 | 96 | // The stored target is whatever `second_rom_filter` returns for this source/target pair. 97 | 98 | let t =
uroman.second_rom_filter(&s, t.as_deref()); 99 | 100 | let lcodes: Vec = slot_value_in_double_colon_del_list(line, "lcode") 101 | .map(|s| { 102 | s.split([',', ';']) 103 | .map(|part| part.trim().to_string()) 104 | .collect() 105 | }) 106 | .unwrap_or_default(); 107 | 108 | let use_only_at_start_of_word = 109 | has_value_in_double_colon_del_list(line, "use-only-at-start-of-word"); 110 | let dont_use_at_start_of_word = 111 | has_value_in_double_colon_del_list(line, "dont-use-at-start-of-word"); 112 | let use_only_at_end_of_word = 113 | has_value_in_double_colon_del_list(line, "use-only-at-end-of-word"); 114 | let dont_use_at_end_of_word = 115 | has_value_in_double_colon_del_list(line, "dont-use-at-end-of-word"); 116 | let use_only_for_whole_word = 117 | has_value_in_double_colon_del_list(line, "use-only-for-whole-word"); 118 | 119 | let t_alts: Vec = slot_value_in_double_colon_del_list(line, "t-alt") 120 | .map(|s| { 121 | s.split([',', ';']) 122 | .map(|part| dequote_string(part).to_string()) 123 | .collect() 124 | }) 125 | .unwrap_or_default(); 126 | 127 | let num = slot_value_in_double_colon_del_list(line, "num") 128 | .and_then(crate::utils::robust_str_to_num); 129 | 130 | let is_minus_sign = has_value_in_double_colon_del_list(line, "is-minus-sign"); 131 | let is_plus_sign = has_value_in_double_colon_del_list(line, "is-plus-sign"); 132 | let is_decimal_point = has_value_in_double_colon_del_list(line, "is-decimal-point"); 133 | let fraction_connector = has_value_in_double_colon_del_list(line, "fraction-connector"); 134 | let percentage_marker = has_value_in_double_colon_del_list(line, "percentage-marker"); 135 | let int_frac_connector = has_value_in_double_colon_del_list(line, "int-frac-connector"); 136 | let is_large_power = has_value_in_double_colon_del_list(line, "is-large-power"); 137 | let t_at_end_of_syllable = slot_value_in_double_colon_del_list(line, "t-end-of-syllable") 138 | .map(|s_val| dequote_string(s_val).to_string()); 139 | 140 | // Calculate the
number of restrictions in a more declarative way 141 | let n_restr = [ 142 | !lcodes.is_empty(), 143 | use_only_at_start_of_word, 144 | dont_use_at_start_of_word, 145 | use_only_at_end_of_word, 146 | dont_use_at_end_of_word, 147 | use_only_for_whole_word, 148 | ] 149 | .iter() 150 | .filter(|&&is_restr| is_restr) 151 | .count(); 152 | 153 | Some(RomRule { 154 | s, 155 | t, 156 | prov: provenance.to_string(), 157 | lcodes, 158 | use_only_at_start_of_word, 159 | dont_use_at_start_of_word, 160 | use_only_at_end_of_word, 161 | dont_use_at_end_of_word, 162 | use_only_for_whole_word, 163 | n_restr, 164 | t_alts, 165 | num, 166 | is_minus_sign, 167 | is_plus_sign, 168 | is_decimal_point, 169 | fraction_connector, 170 | percentage_marker, 171 | int_frac_connector, 172 | is_large_power, 173 | t_at_end_of_syllable, 174 | }) 175 | } 176 | } 177 | 178 | /// A collection of romanization rules, typically grouped by the source string `s`. 179 | pub type RomRules = HashMap>; 180 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

uroman-rs

3 |

4 | A self-contained Rust reimplementation of the uroman universal romanizer. 5 |

6 |

7 | Crates.io 8 | CI 9 | License: Apache-2.0 10 |

11 |
12 | 13 | ## Overview 14 | 15 | `uroman-rs` is a complete rewrite of the original `uroman` (Universal Romanizer) in Rust. It provides high-speed, accurate romanization for a vast number of languages and writing systems, faithfully reproducing the behavior of the original implementation. 16 | 17 | As a reimplementation, it is designed to be a drop-in replacement that passes the original's comprehensive test suite. This means its romanization logic, including its strengths and limitations, is identical to the original. For effective use, we recommend reviewing the original authors' documentation on [Reversibility](https://github.com/isi-nlp/uroman?tab=readme-ov-file#reversibility) and [Known Limitations](https://github.com/isi-nlp/uroman?tab=readme-ov-file#limitations). 18 | 19 | In the same spirit of fidelity, this project respects the licensing of the original `uroman` software. `uroman-rs` is licensed under the Apache License 2.0, and includes the original's license as required. For full details, please refer to the [License section](#license). 20 | 21 | ## Features 22 | 23 | * **Performance**: Achieves approximately **27x the speed** of the standard Python version, making it ideal for large-scale data processing. (See [Benchmark](#benchmark)) 24 | * **Robustness**: Fixes several edge-case bugs present in the original implementation, ensuring safer processing of diverse inputs. (See [Bug Fixes](#bug-fixes)) 25 | * **Self-Contained**: A pure Rust implementation with no dependency on external runtimes. It compiles to a single, portable binary. 26 | * **High Fidelity**: Faithfully reproduces the behavior of the original `uroman` and passes its test suite. 27 | * **Rich Output Formats**: Supports multiple output formats, including simple strings (`str`) and structured JSON data (`edges`, `alts`, `lattice`). 28 | * **Versatile**: Can be used as a standalone Command-Line Interface (CLI) tool or as a library in your Rust applications. 
29 | 30 | ## Installation 31 | 32 | The `uroman-rs` project is available as a crate named `uroman`. You can use it both as a command-line tool and as a library in your Rust projects. 33 | 34 | ### As a Command-Line Tool 35 | 36 | To install the `uroman-rs` command-line tool, run the following: 37 | 38 | ```bash 39 | cargo install uroman 40 | ``` 41 | 42 | This will install the executable as `uroman-rs` on your system. 43 | 44 | ### As a Library 45 | 46 | Add the `uroman` crate to your project's `Cargo.toml`. 47 | For library usage, it's recommended to disable default features to avoid pulling in CLI-specific dependencies. 48 | 49 | ```bash 50 | cargo add uroman --no-default-features 51 | ``` 52 | 53 | ## Usage 54 | 55 | ### Command-Line Interface (CLI) 56 | 57 | `uroman-rs` can be used directly from your terminal. 58 | 59 | **Show sample conversions:** 60 | See examples of how various scripts are romanized. 61 | 62 | ```bash 63 | uroman-rs --sample 64 | ``` 65 | 66 | **View all options:** 67 | 68 | Display the help message for a full list of commands and flags. 69 | ```bash 70 | uroman-rs --help 71 | ``` 72 | 73 | **Use in REPL mode:** 74 | 75 | Run `uroman-rs` without any arguments to process input line by line. Press `Ctrl+D` to exit. 76 | 77 | ```bash 78 | $ uroman-rs 79 | >> こんにちは、世界! 80 | konnichiha, shijie! 81 | >> ᚺᚨᛚᛚᛟ ᚹᛟᚱᛚᛞ 82 | hallo world 83 | >> (Ctrl+D) 84 | ``` 85 | 86 | 87 | ### Library 88 | 89 | ```rust 90 | // Uroman::new() is infallible and does not return a `Result`. 91 | let uroman = Uroman::new(); 92 | 93 | let romanized_string/*: String*/ = uroman.romanize_string::<rom_format::Str>( 94 | "✨ユーロマン✨", 95 | Some("jpn"), 96 | ).to_string(); 97 | 98 | assert_eq!(romanized_string, "✨yuuroman✨"); 99 | println!("{romanized_string}"); 100 | ``` 101 | For more advanced examples, please see the `examples/` directory. 102 | 103 | ## Benchmark 104 | 105 | Performance was measured against the original Python implementation using [`hyperfine`](https://github.com/sharkdp/hyperfine).
106 | 107 | * **Test File**: `multi-script.txt` from the original `uroman` repository. 108 | * **Environment**: Intel Core i7-14700, WSL2 (Ubuntu 24.04) 109 | 110 | | Implementation | Mean Time (± σ) | Relative Performance | 111 | |-------------------------------|-----------------------|----------------------| 112 | | **`uroman-rs` (This project)**| **82.9 ms ± 2.4 ms** | **~27.7x faster** | 113 | | `uroman.py` (via `uv run`) | 2295 ms ± 20 ms | Baseline | 114 | 115 | ## Bug Fixes 116 | 117 | `uroman-rs` aims to be not only a faithful reimplementation but also a more robust and accurate one. It handles several edge cases that can cause the original `uroman.py` script to crash or produce incorrect output. 118 | 119 | ### Crash Prevention on Incomplete Patterns 120 | 121 | For example, the original script crashes with an unhandled exception on inputs with incomplete fractional patterns like `"百分之"` ("percent of..."). This occurs because the script expects a subsequent number but does not safely handle cases where one is not found, so it raises an `AttributeError` on a `NoneType` value. This issue has been reported to the original author (see [isi-nlp/uroman#16](https://github.com/isi-nlp/uroman/issues/16)). 122 | 123 | ```sh 124 | $ uv run uroman.py "百分之多少" 125 | Traceback (most recent call last): 126 | ... 127 | AttributeError: 'NoneType' object has no attribute 'value' 128 | ``` 129 | 130 | In contrast, `uroman-rs` handles this input safely and provides a reasonable fallback romanization, demonstrating its enhanced reliability: 131 | 132 | ```sh 133 | $ uroman-rs "百分之多少" 134 | baifenzhiduoshao 135 | ``` 136 | 137 | ### Correct Romanization of Tibetan Letter '-A' (U+0F60) 138 | 139 | In addition to improving stability, `uroman-rs` also corrects certain romanization errors found in the original implementation. A notable example is the handling of the Tibetan letter `འ` (U+0F60, TIBETAN LETTER -A).
140 | 141 | The original script incorrectly romanizes this character, which represents the vowel `a` with a preceding glottal stop `[ʔ]`, by omitting the vowel sound entirely. 142 | 143 | ```sh 144 | # Original uroman.py output omits the 'a' sound 145 | $ uv run uroman.py "འ" 146 | ' 147 | ``` 148 | 149 | `uroman-rs` provides the linguistically correct romanization, faithfully representing both the glottal stop (as an apostrophe) and the vowel sound. This ensures a higher quality and more accurate transliteration for Tibetan script. 150 | 151 | ```sh 152 | # uroman-rs provides the correct output 153 | $ uroman-rs "འ" 154 | 'a 155 | ``` 156 | 157 | ### More Precise Romanization by Distinguishing Tibetan Consonants 158 | 159 | `uroman-rs` provides a more precise romanization for certain Tibetan characters compared to the original script. The `uroman.py` implementation fails to distinguish between the glottal stop consonant `འ` ('a-chung) and the vowel carrier `ཨ` ('a-chen) when followed by the vowel `ེ` (`e`). 160 | 161 | The original script produces the same output for both `འེ` and `ཨེ`. 162 | 163 | ```sh 164 | # Original uroman.py output is identical for both characters 165 | $ uv run uroman.py "ཨེ" 166 | e 167 | $ uv run uroman.py "འེ" 168 | e 169 | ``` 170 | 171 | In contrast, `uroman-rs` correctly preserves the leading glottal stop of `འ`, maintaining the distinction between the two characters as intended by the script. 172 | 173 | ```sh 174 | # uroman-rs distinguishes the two characters 175 | $ uroman-rs "ཨེ" 176 | e 177 | $ uroman-rs "འེ" 178 | 'e 179 | ``` 180 | 181 | ## License 182 | 183 | This project is licensed under the Apache License, Version 2.0. 184 | 185 | ### Acknowledgements 186 | 187 | `uroman-rs` is a Rust implementation of the original `uroman` software by Ulf Hermjakob. As such, it is a derivative work and includes the original license notice in the `NOTICE` file. 
188 | 189 | Please be aware that any academic publication of projects using `uroman-rs` should acknowledge the use of the original `uroman` software as specified in its license. For details, please see the `NOTICE` file. 190 | -------------------------------------------------------------------------------- /data/romanization-table-arabic-block.txt: -------------------------------------------------------------------------------- 1 | ::s ، ::t , ::comment ARABIC COMMA 2 | ::s ؛ ::t ; ::comment ARABIC SEMICOLON 3 | ::s ؟ ::t ? ::comment ARABIC QUESTION MARK 4 | ::s ء ::t ' ::comment ARABIC LETTER HAMZA 5 | ::s آ ::t a ::comment ARABIC LETTER ALEF WITH MADDA ABOVE 6 | ::s أ ::t a ::comment ARABIC LETTER ALEF WITH HAMZA ABOVE 7 | ::s ؤ ::t w ::comment ARABIC LETTER WAW WITH HAMZA ABOVE 8 | ::s إ ::t i ::comment ARABIC LETTER ALEF WITH HAMZA BELOW 9 | ::s ئ ::t ye ::comment ARABIC LETTER YEH WITH HAMZA ABOVE 10 | ::s ا ::t a ::comment ARABIC LETTER ALEF 11 | ::s ب ::t b ::comment ARABIC LETTER BEH 12 | ::s ة ::t a ::comment ARABIC LETTER TEH MARBUTA 13 | ::s ت ::t t ::comment ARABIC LETTER TEH 14 | ::s ث ::t th ::comment ARABIC LETTER THEH 15 | ::s ج ::t j ::comment ARABIC LETTER JEEM 16 | ::s ح ::t h ::comment ARABIC LETTER HAH 17 | ::s خ ::t kh ::comment ARABIC LETTER KHAH 18 | ::s د ::t d ::comment ARABIC LETTER DAL 19 | ::s ذ ::t th ::comment ARABIC LETTER THAL 20 | ::s ر ::t r ::comment ARABIC LETTER REH 21 | ::s ز ::t z ::comment ARABIC LETTER ZAIN 22 | ::s س ::t s ::comment ARABIC LETTER SEEN 23 | ::s ش ::t sh ::comment ARABIC LETTER SHEEN 24 | ::s ص ::t s ::comment ARABIC LETTER SAD 25 | ::s ض ::t d ::comment ARABIC LETTER DAD 26 | ::s ط ::t t ::comment ARABIC LETTER TAH 27 | ::s ظ ::t z ::comment ARABIC LETTER ZAH 28 | ::s ع ::t ' ::comment ARABIC LETTER AIN 29 | ::s غ ::t gh ::comment ARABIC LETTER GHAIN 30 | ::s ـ ::t - ::comment ARABIC TATWEEL 31 | ::s ف ::t f ::comment ARABIC LETTER FEH 32 | ::s ق ::t q ::comment ARABIC LETTER QAF 33 | ::s ك ::t k ::comment 
ARABIC LETTER KAF 34 | ::s ل ::t l ::comment ARABIC LETTER LAM 35 | ::s م ::t m ::comment ARABIC LETTER MEEM 36 | ::s ن ::t n ::comment ARABIC LETTER NOON 37 | ::s ه ::t h ::comment ARABIC LETTER HEH 38 | ::s و ::t w ::comment ARABIC LETTER WAW 39 | ::s ى ::t a ::comment ARABIC LETTER ALEF MAKSURA 40 | ::s ي ::t y ::comment ARABIC LETTER YEH 41 | ::s َ ::t a ::comment ARABIC FATHA 42 | ::s ُ ::t u ::comment ARABIC DAMMA 43 | ::s ِ ::t i ::comment ARABIC KASRA 44 | ::s ْ ::t ::comment ARABIC SUKUN 45 | ::s ٔ ::t ' ::comment ARABIC HAMZA ABOVE 46 | ::s ٕ ::t ' ::comment ARABIC HAMZA BELOW 47 | ::s ٠ ::t 0 ::comment ARABIC-INDIC DIGIT ZERO 48 | ::s ١ ::t 1 ::comment ARABIC-INDIC DIGIT ONE 49 | ::s ٢ ::t 2 ::comment ARABIC-INDIC DIGIT TWO 50 | ::s ٣ ::t 3 ::comment ARABIC-INDIC DIGIT THREE 51 | ::s ٤ ::t 4 ::comment ARABIC-INDIC DIGIT FOUR 52 | ::s ٥ ::t 5 ::comment ARABIC-INDIC DIGIT FIVE 53 | ::s ٦ ::t 6 ::comment ARABIC-INDIC DIGIT SIX 54 | ::s ٧ ::t 7 ::comment ARABIC-INDIC DIGIT SEVEN 55 | ::s ٨ ::t 8 ::comment ARABIC-INDIC DIGIT EIGHT 56 | ::s ٩ ::t 9 ::comment ARABIC-INDIC DIGIT NINE 57 | ::s ٪ ::t % ::comment ARABIC PERCENT SIGN 58 | ::s ٫ ::t , ::comment ARABIC DECIMAL SEPARATOR 59 | ::s ٬ ::t , ::comment ARABIC THOUSANDS SEPARATOR 60 | ::s ٮ ::t b ::comment ARABIC LETTER DOTLESS BEH 61 | ::s ٯ ::t q ::comment ARABIC LETTER DOTLESS QAF 62 | ::s ٰ ::t a ::comment ARABIC LETTER SUPERSCRIPT ALEF 63 | ::s ٱ ::t a ::comment ARABIC LETTER ALEF WASLA 64 | ::s ٲ ::t a ::comment ARABIC LETTER ALEF WITH WAVY HAMZA ABOVE 65 | ::s ٳ ::t a ::comment ARABIC LETTER ALEF WITH WAVY HAMZA BELOW 66 | ::s ٷ ::t u ::comment ARABIC LETTER U WITH HAMZA ABOVE 67 | ::s ٹ ::t tt ::comment ARABIC LETTER TTEH 68 | ::s ٺ ::t tt ::comment ARABIC LETTER TTEHEH 69 | ::s ٻ ::t b ::comment ARABIC LETTER BEEH 70 | ::s ټ ::t t ::comment ARABIC LETTER TEH WITH RING 71 | ::s ٽ ::t t ::comment ARABIC LETTER TEH WITH THREE DOTS ABOVE DOWNWARDS 72 | ::s پ ::t p ::comment ARABIC LETTER PEH 73 | ::s ٿ 
::t t ::comment ARABIC LETTER TEHEH 74 | ::s ڀ ::t b ::comment ARABIC LETTER BEHEH 75 | ::s ځ ::t h ::comment ARABIC LETTER HAH WITH HAMZA ABOVE 76 | ::s ڂ ::t h ::comment ARABIC LETTER HAH WITH TWO DOTS VERTICAL ABOVE 77 | ::s ڃ ::t ny ::comment ARABIC LETTER NYEH 78 | ::s ڄ ::t dy ::comment ARABIC LETTER DYEH 79 | ::s څ ::t h ::comment ARABIC LETTER HAH WITH THREE DOTS ABOVE 80 | ::s چ ::t tch ::comment ARABIC LETTER TCHEH 81 | ::s ڇ ::t tch ::comment ARABIC LETTER TCHEHEH 82 | ::s ڈ ::t dd ::comment ARABIC LETTER DDAL 83 | ::s ډ ::t d ::comment ARABIC LETTER DAL WITH RING 84 | ::s ڊ ::t d ::comment ARABIC LETTER DAL WITH DOT BELOW 85 | ::s ڋ ::t d ::comment ARABIC LETTER DAL WITH DOT BELOW AND SMALL TAH 86 | ::s ڌ ::t d ::comment ARABIC LETTER DAHAL 87 | ::s ڍ ::t dd ::comment ARABIC LETTER DDAHAL 88 | ::s ڎ ::t d ::comment ARABIC LETTER DUL 89 | ::s ڏ ::t d ::comment ARABIC LETTER DAL WITH THREE DOTS ABOVE DOWNWARDS 90 | ::s ڐ ::t d ::comment ARABIC LETTER DAL WITH FOUR DOTS ABOVE 91 | ::s ڑ ::t rr ::comment ARABIC LETTER RREH 92 | ::s ڒ ::t r ::comment ARABIC LETTER REH WITH SMALL V 93 | ::s ړ ::t r ::comment ARABIC LETTER REH WITH RING 94 | ::s ڔ ::t r ::comment ARABIC LETTER REH WITH DOT BELOW 95 | ::s ڕ ::t r ::comment ARABIC LETTER REH WITH SMALL V BELOW 96 | ::s ږ ::t r ::comment ARABIC LETTER REH WITH DOT BELOW AND DOT ABOVE 97 | ::s ڗ ::t r ::comment ARABIC LETTER REH WITH TWO DOTS ABOVE 98 | ::s ژ ::t j ::comment ARABIC LETTER JEH 99 | ::s ڙ ::t r ::comment ARABIC LETTER REH WITH FOUR DOTS ABOVE 100 | ::s ښ ::t s ::comment ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE 101 | ::s ڛ ::t s ::comment ARABIC LETTER SEEN WITH THREE DOTS BELOW 102 | ::s ڜ ::t s ::comment ARABIC LETTER SEEN WITH THREE DOTS BELOW AND THREE DOTS ABOVE 103 | ::s ڝ ::t s ::comment ARABIC LETTER SAD WITH TWO DOTS BELOW 104 | ::s ڞ ::t s ::comment ARABIC LETTER SAD WITH THREE DOTS ABOVE 105 | ::s ڟ ::t t ::comment ARABIC LETTER TAH WITH THREE DOTS ABOVE 106 | ::s ڠ ::t n ::comment 
ARABIC LETTER AIN WITH THREE DOTS ABOVE 107 | ::s ڡ ::t f ::comment ARABIC LETTER DOTLESS FEH 108 | ::s ڢ ::t f ::comment ARABIC LETTER FEH WITH DOT MOVED BELOW 109 | ::s ڣ ::t f ::comment ARABIC LETTER FEH WITH DOT BELOW 110 | ::s ڤ ::t v ::comment ARABIC LETTER VEH 111 | ::s ڥ ::t f ::comment ARABIC LETTER FEH WITH THREE DOTS BELOW 112 | ::s ڦ ::t p ::comment ARABIC LETTER PEHEH 113 | ::s ڧ ::t q ::comment ARABIC LETTER QAF WITH DOT ABOVE 114 | ::s ڨ ::t q ::comment ARABIC LETTER QAF WITH THREE DOTS ABOVE 115 | ::s ک ::t k ::comment ARABIC LETTER KEHEH 116 | ::s ڪ ::t k ::comment ARABIC LETTER SWASH KAF 117 | ::s ګ ::t k ::comment ARABIC LETTER KAF WITH RING 118 | ::s ڬ ::t k ::comment ARABIC LETTER KAF WITH DOT ABOVE 119 | ::s ڭ ::t ng ::comment ARABIC LETTER NG 120 | ::s ڮ ::t k ::comment ARABIC LETTER KAF WITH THREE DOTS BELOW 121 | ::s گ ::t g ::comment ARABIC LETTER GAF 122 | ::s ڰ ::t g ::comment ARABIC LETTER GAF WITH RING 123 | ::s ڱ ::t ng ::comment ARABIC LETTER NGOEH 124 | ::s ڲ ::t g ::comment ARABIC LETTER GAF WITH TWO DOTS BELOW 125 | ::s ڳ ::t g ::comment ARABIC LETTER GUEH 126 | ::s ڴ ::t g ::comment ARABIC LETTER GAF WITH THREE DOTS ABOVE 127 | ::s ڵ ::t l ::comment ARABIC LETTER LAM WITH SMALL V 128 | ::s ڶ ::t l ::comment ARABIC LETTER LAM WITH DOT ABOVE 129 | ::s ڷ ::t l ::comment ARABIC LETTER LAM WITH THREE DOTS ABOVE 130 | ::s ڸ ::t l ::comment ARABIC LETTER LAM WITH THREE DOTS BELOW 131 | ::s ڹ ::t n ::comment ARABIC LETTER NOON WITH DOT BELOW 132 | ::s ں ::t n ::comment ARABIC LETTER NOON GHUNNA 133 | ::s ڻ ::t rn ::comment ARABIC LETTER RNOON 134 | ::s ڼ ::t n ::comment ARABIC LETTER NOON WITH RING 135 | ::s ڽ ::t n ::comment ARABIC LETTER NOON WITH THREE DOTS ABOVE 136 | ::s ھ ::t h ::comment ARABIC LETTER HEH DOACHASHMEE 137 | ::s ڿ ::t tch ::comment ARABIC LETTER TCHEH WITH DOT ABOVE 138 | ::s ۀ ::t h ::comment ARABIC LETTER HEH WITH YEH ABOVE 139 | ::s ہ ::t h ::comment ARABIC LETTER HEH GOAL 140 | ::s ۂ ::t h ::comment ARABIC LETTER 
HEH GOAL WITH HAMZA ABOVE 141 | ::s ۃ ::t a ::comment ARABIC LETTER TEH MARBUTA GOAL 142 | ::s ۄ ::t w ::comment ARABIC LETTER WAW WITH RING 143 | ::s ۅ ::t oe ::comment ARABIC LETTER KIRGHIZ OE 144 | ::s ۆ ::t oe ::comment ARABIC LETTER OE 145 | ::s ۇ ::t u ::comment ARABIC LETTER U 146 | ::s ۈ ::t yu ::comment ARABIC LETTER YU 147 | ::s ۉ ::t yu ::comment ARABIC LETTER KIRGHIZ YU 148 | ::s ۊ ::t w ::comment ARABIC LETTER WAW WITH TWO DOTS ABOVE 149 | ::s ۋ ::t v ::comment ARABIC LETTER VE 150 | ::s ی ::t y ::comment ARABIC LETTER FARSI YEH 151 | ::s ۍ ::t y ::comment ARABIC LETTER YEH WITH TAIL 152 | ::s ێ ::t y ::comment ARABIC LETTER YEH WITH SMALL V 153 | ::s ۏ ::t w ::comment ARABIC LETTER WAW WITH DOT ABOVE 154 | ::s ې ::t e ::comment ARABIC LETTER E 155 | ::s ۑ ::t y ::comment ARABIC LETTER YEH WITH THREE DOTS BELOW 156 | ::s ے ::t y ::comment ARABIC LETTER YEH BARREE 157 | ::s ۓ ::t y ::comment ARABIC LETTER YEH BARREE WITH HAMZA ABOVE 158 | ::s ۔ ::t . ::comment ARABIC FULL STOP 159 | ::s ە ::t ae ::comment ARABIC LETTER AE 160 | ::s ۮ ::t d ::comment ARABIC LETTER DAL WITH INVERTED V 161 | ::s ۯ ::t r ::comment ARABIC LETTER REH WITH INVERTED V 162 | ::s ۰ ::t 0 ::comment EXTENDED ARABIC-INDIC DIGIT ZERO 163 | ::s ۱ ::t 1 ::comment EXTENDED ARABIC-INDIC DIGIT ONE 164 | ::s ۲ ::t 2 ::comment EXTENDED ARABIC-INDIC DIGIT TWO 165 | ::s ۳ ::t 3 ::comment EXTENDED ARABIC-INDIC DIGIT THREE 166 | ::s ۴ ::t 4 ::comment EXTENDED ARABIC-INDIC DIGIT FOUR 167 | ::s ۵ ::t 5 ::comment EXTENDED ARABIC-INDIC DIGIT FIVE 168 | ::s ۶ ::t 6 ::comment EXTENDED ARABIC-INDIC DIGIT SIX 169 | ::s ۷ ::t 7 ::comment EXTENDED ARABIC-INDIC DIGIT SEVEN 170 | ::s ۸ ::t 8 ::comment EXTENDED ARABIC-INDIC DIGIT EIGHT 171 | ::s ۹ ::t 9 ::comment EXTENDED ARABIC-INDIC DIGIT NINE 172 | ::s ۺ ::t sh ::comment ARABIC LETTER SHEEN WITH DOT BELOW 173 | ::s ۻ ::t d ::comment ARABIC LETTER DAD WITH DOT BELOW 174 | ::s ۼ ::t gh ::comment ARABIC LETTER GHAIN WITH DOT BELOW 175 | ::s ۽ ::t & 
::comment ARABIC SIGN SINDHI AMPERSAND 176 | ::s ﷲ ::t allah ::comment ARABIC LIGATURE ALLAH ISOLATED FORM 177 | 178 | ::s ‌ ::t ::comment ZERO WIDTH NON-JOINER 179 | ::s ‍ ::t ::comment ZERO WIDTH JOINER 180 | -------------------------------------------------------------------------------- /data/UnicodeDataPropsHangul.txt: -------------------------------------------------------------------------------- 1 | ::script-name Hangul ::n-char 11265 ::char ㄱㄲㄳㄴㄵㄶㄷㄸㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅃㅄㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣㅥㅦㅧㅨㅩㅪㅫㅬㅭㅮㅯㅰㅱㅲㅳㅴㅵㅶㅷㅸㅹㅺㅻㅼㅽㅾㅿㆀㆁㆂㆃㆄㆅㆆㆇㆈㆉㆊㆋㆌㆍㆎ가각갂갃간갅갆갇갈갉갊갋갌갍갎갏감갑값갓갔강갖갗갘같갚갛개객갞갟갠갡갢갣갤갥갦갧갨갩갪갫갬갭갮갯갰갱갲갳갴갵갶갷갸갹갺갻갼갽갾갿걀걁걂걃걄걅걆걇걈걉걊걋걌걍걎걏걐걑걒걓걔걕걖걗걘걙걚걛걜걝걞걟걠걡걢걣걤걥걦걧걨걩걪걫걬걭걮걯거걱걲걳건걵걶걷걸걹걺걻걼걽걾걿검겁겂것겄겅겆겇겈겉겊겋게겍겎겏겐겑겒겓겔겕겖겗겘겙겚겛겜겝겞겟겠겡겢겣겤겥겦겧겨격겪겫견겭겮겯결겱겲겳겴겵겶겷겸겹겺겻겼경겾겿곀곁곂곃계곅곆곇곈곉곊곋곌곍곎곏곐곑곒곓곔곕곖곗곘곙곚곛곜곝곞곟고곡곢곣곤곥곦곧골곩곪곫곬곭곮곯곰곱곲곳곴공곶곷곸곹곺곻과곽곾곿관괁괂괃괄괅괆괇괈괉괊괋괌괍괎괏괐광괒괓괔괕괖괗괘괙괚괛괜괝괞괟괠괡괢괣괤괥괦괧괨괩괪괫괬괭괮괯괰괱괲괳괴괵괶괷괸괹괺괻괼괽괾괿굀굁굂굃굄굅굆굇굈굉굊굋굌굍굎굏교굑굒굓굔굕굖굗굘굙굚굛굜굝굞굟굠굡굢굣굤굥굦굧굨굩굪굫구국굮굯군굱굲굳굴굵굶굷굸굹굺굻굼굽굾굿궀궁궂궃궄궅궆궇궈궉궊궋권궍궎궏궐궑궒궓궔궕궖궗궘궙궚궛궜궝궞궟궠궡궢궣궤궥궦궧궨궩궪궫궬궭궮궯궰궱궲궳궴궵궶궷궸궹궺궻궼궽궾궿귀귁귂귃귄귅귆귇귈귉귊귋귌귍귎귏귐귑귒귓귔귕귖귗귘귙귚귛규귝귞귟균귡귢귣귤귥귦귧귨귩귪귫귬귭귮귯귰귱귲귳귴귵귶귷그극귺귻근귽귾귿글긁긂긃긄긅긆긇금급긊긋긌긍긎긏긐긑긒긓긔긕긖긗긘긙긚긛긜긝긞긟긠긡긢긣긤긥긦긧긨긩긪긫긬긭긮긯기긱긲긳긴긵긶긷길긹긺긻긼긽긾긿김깁깂깃깄깅깆깇깈깉깊깋까깍깎깏깐깑깒깓깔깕깖깗깘깙깚깛깜깝깞깟깠깡깢깣깤깥깦깧깨깩깪깫깬깭깮깯깰깱깲깳깴깵깶깷깸깹깺깻깼깽깾깿꺀꺁꺂꺃꺄꺅꺆꺇꺈꺉꺊꺋꺌꺍꺎꺏꺐꺑꺒꺓꺔꺕꺖꺗꺘꺙꺚꺛꺜꺝꺞꺟꺠꺡꺢꺣꺤꺥꺦꺧꺨꺩꺪꺫꺬꺭꺮꺯꺰꺱꺲꺳꺴꺵꺶꺷꺸꺹꺺꺻꺼꺽꺾꺿껀껁껂껃껄껅껆껇껈껉껊껋껌껍껎껏껐껑껒껓껔껕껖껗께껙껚껛껜껝껞껟껠껡껢껣껤껥껦껧껨껩껪껫껬껭껮껯껰껱껲껳껴껵껶껷껸껹껺껻껼껽껾껿꼀꼁꼂꼃꼄꼅꼆꼇꼈꼉꼊꼋꼌꼍꼎꼏꼐꼑꼒꼓꼔꼕꼖꼗꼘꼙꼚꼛꼜꼝꼞꼟꼠꼡꼢꼣꼤꼥꼦꼧꼨꼩꼪꼫꼬꼭꼮꼯꼰꼱꼲꼳꼴꼵꼶꼷꼸꼹꼺꼻꼼꼽꼾꼿꽀꽁꽂꽃꽄꽅꽆꽇꽈꽉꽊꽋꽌꽍꽎꽏꽐꽑꽒꽓꽔꽕꽖꽗꽘꽙꽚꽛꽜꽝꽞꽟꽠꽡꽢꽣꽤꽥꽦꽧꽨꽩꽪꽫꽬꽭꽮꽯꽰꽱꽲꽳꽴꽵꽶꽷꽸꽹꽺꽻꽼꽽꽾꽿꾀꾁꾂꾃꾄꾅꾆꾇꾈꾉꾊꾋꾌꾍꾎꾏꾐꾑꾒꾓꾔꾕꾖꾗꾘꾙꾚꾛꾜꾝꾞꾟꾠꾡꾢꾣꾤꾥꾦꾧꾨꾩꾪꾫꾬꾭꾮꾯꾰꾱꾲꾳꾴꾵꾶꾷꾸꾹꾺꾻꾼꾽꾾꾿꿀꿁꿂꿃꿄꿅꿆꿇꿈꿉꿊꿋꿌꿍꿎꿏꿐꿑꿒꿓꿔꿕꿖꿗꿘꿙꿚꿛꿜꿝꿞꿟꿠꿡꿢꿣꿤꿥꿦꿧꿨꿩꿪꿫꿬꿭꿮꿯꿰꿱꿲꿳꿴꿵꿶꿷꿸꿹꿺꿻꿼꿽꿾꿿뀀뀁뀂뀃뀄뀅뀆뀇뀈뀉뀊뀋뀌뀍뀎뀏뀐뀑뀒뀓뀔뀕뀖뀗뀘뀙뀚뀛뀜뀝뀞뀟뀠뀡뀢뀣뀤뀥뀦뀧뀨뀩뀪뀫뀬뀭뀮뀯뀰뀱뀲뀳뀴뀵뀶뀷뀸뀹뀺뀻뀼뀽뀾뀿끀끁끂끃끄끅끆끇끈끉끊끋끌끍끎끏끐끑끒끓끔끕끖끗끘끙끚끛끜끝끞끟끠끡끢끣끤끥끦끧끨끩끪끫끬끭끮끯끰끱끲끳끴끵끶끷끸끹끺끻끼끽끾끿낀낁낂낃낄낅낆낇낈낉낊낋낌낍낎낏낐낑낒낓낔낕낖낗나낙낚낛난낝낞낟날낡낢낣낤낥낦낧남납낪낫났낭낮낯낰낱낲낳내낵낶낷낸낹낺낻낼낽낾낿냀냁냂냃냄냅냆냇냈냉냊냋냌냍냎냏냐냑냒냓냔냕냖냗냘냙냚냛냜냝냞냟냠냡냢냣냤냥냦냧냨냩냪냫냬냭냮냯냰냱냲냳냴냵냶냷냸냹냺냻냼냽냾냿넀넁넂넃넄넅넆넇너넉넊넋넌넍넎넏널넑넒넓넔넕넖넗넘넙넚넛넜넝넞넟넠넡넢넣네넥넦넧넨넩넪넫넬넭넮넯넰넱넲넳넴넵넶넷넸넹넺넻넼넽넾넿녀녁녂녃년녅녆녇녈녉녊녋녌녍녎녏념녑녒녓녔녕녖녗녘녙녚녛녜녝녞녟녠녡녢녣녤녥녦녧녨녩녪녫녬녭녮녯녰녱녲녳녴녵녶녷노녹녺녻논녽녾녿놀놁놂놃놄놅놆놇놈놉놊놋놌농놎놏놐놑높놓놔놕놖놗놘놙놚놛놜놝놞놟놠놡놢놣놤놥놦놧놨놩놪놫놬
놭놮놯놰놱놲놳놴놵놶놷놸놹놺놻놼놽놾놿뇀뇁뇂뇃뇄뇅뇆뇇뇈뇉뇊뇋뇌뇍뇎뇏뇐뇑뇒뇓뇔뇕뇖뇗뇘뇙뇚뇛뇜뇝뇞뇟뇠뇡뇢뇣뇤뇥뇦뇧뇨뇩뇪뇫뇬뇭뇮뇯뇰뇱뇲뇳뇴뇵뇶뇷뇸뇹뇺뇻뇼뇽뇾뇿눀눁눂눃누눅눆눇눈눉눊눋눌눍눎눏눐눑눒눓눔눕눖눗눘눙눚눛눜눝눞눟눠눡눢눣눤눥눦눧눨눩눪눫눬눭눮눯눰눱눲눳눴눵눶눷눸눹눺눻눼눽눾눿뉀뉁뉂뉃뉄뉅뉆뉇뉈뉉뉊뉋뉌뉍뉎뉏뉐뉑뉒뉓뉔뉕뉖뉗뉘뉙뉚뉛뉜뉝뉞뉟뉠뉡뉢뉣뉤뉥뉦뉧뉨뉩뉪뉫뉬뉭뉮뉯뉰뉱뉲뉳뉴뉵뉶뉷뉸뉹뉺뉻뉼뉽뉾뉿늀늁늂늃늄늅늆늇늈늉늊늋늌늍늎늏느늑늒늓는늕늖늗늘늙늚늛늜늝늞늟늠늡늢늣늤능늦늧늨늩늪늫늬늭늮늯늰늱늲늳늴늵늶늷늸늹늺늻늼늽늾늿닀닁닂닃닄닅닆닇니닉닊닋닌닍닎닏닐닑닒닓닔닕닖닗님닙닚닛닜닝닞닟닠닡닢닣다닥닦닧단닩닪닫달닭닮닯닰닱닲닳담답닶닷닸당닺닻닼닽닾닿대댁댂댃댄댅댆댇댈댉댊댋댌댍댎댏댐댑댒댓댔댕댖댗댘댙댚댛댜댝댞댟댠댡댢댣댤댥댦댧댨댩댪댫댬댭댮댯댰댱댲댳댴댵댶댷댸댹댺댻댼댽댾댿덀덁덂덃덄덅덆덇덈덉덊덋덌덍덎덏덐덑덒덓더덕덖덗던덙덚덛덜덝덞덟덠덡덢덣덤덥덦덧덨덩덪덫덬덭덮덯데덱덲덳덴덵덶덷델덹덺덻덼덽덾덿뎀뎁뎂뎃뎄뎅뎆뎇뎈뎉뎊뎋뎌뎍뎎뎏뎐뎑뎒뎓뎔뎕뎖뎗뎘뎙뎚뎛뎜뎝뎞뎟뎠뎡뎢뎣뎤뎥뎦뎧뎨뎩뎪뎫뎬뎭뎮뎯뎰뎱뎲뎳뎴뎵뎶뎷뎸뎹뎺뎻뎼뎽뎾뎿돀돁돂돃도독돆돇돈돉돊돋돌돍돎돏돐돑돒돓돔돕돖돗돘동돚돛돜돝돞돟돠돡돢돣돤돥돦돧돨돩돪돫돬돭돮돯돰돱돲돳돴돵돶돷돸돹돺돻돼돽돾돿됀됁됂됃됄됅됆됇됈됉됊됋됌됍됎됏됐됑됒됓됔됕됖됗되됙됚됛된됝됞됟될됡됢됣됤됥됦됧됨됩됪됫됬됭됮됯됰됱됲됳됴됵됶됷됸됹됺됻됼됽됾됿둀둁둂둃둄둅둆둇둈둉둊둋둌둍둎둏두둑둒둓둔둕둖둗둘둙둚둛둜둝둞둟둠둡둢둣둤둥둦둧둨둩둪둫둬둭둮둯둰둱둲둳둴둵둶둷둸둹둺둻둼둽둾둿뒀뒁뒂뒃뒄뒅뒆뒇뒈뒉뒊뒋뒌뒍뒎뒏뒐뒑뒒뒓뒔뒕뒖뒗뒘뒙뒚뒛뒜뒝뒞뒟뒠뒡뒢뒣뒤뒥뒦뒧뒨뒩뒪뒫뒬뒭뒮뒯뒰뒱뒲뒳뒴뒵뒶뒷뒸뒹뒺뒻뒼뒽뒾뒿듀듁듂듃듄듅듆듇듈듉듊듋듌듍듎듏듐듑듒듓듔듕듖듗듘듙듚듛드득듞듟든듡듢듣들듥듦듧듨듩듪듫듬듭듮듯듰등듲듳듴듵듶듷듸듹듺듻듼듽듾듿딀딁딂딃딄딅딆딇딈딉딊딋딌딍딎딏딐딑딒딓디딕딖딗딘딙딚딛딜딝딞딟딠딡딢딣딤딥딦딧딨딩딪딫딬딭딮딯따딱딲딳딴딵딶딷딸딹딺딻딼딽딾딿땀땁땂땃땄땅땆땇땈땉땊땋때땍땎땏땐땑땒땓땔땕땖땗땘땙땚땛땜땝땞땟땠땡땢땣땤땥땦땧땨땩땪땫땬땭땮땯땰땱땲땳땴땵땶땷땸땹땺땻땼땽땾땿떀떁떂떃떄떅떆떇떈떉떊떋떌떍떎떏떐떑떒떓떔떕떖떗떘떙떚떛떜떝떞떟떠떡떢떣떤떥떦떧떨떩떪떫떬떭떮떯떰떱떲떳떴떵떶떷떸떹떺떻떼떽떾떿뗀뗁뗂뗃뗄뗅뗆뗇뗈뗉뗊뗋뗌뗍뗎뗏뗐뗑뗒뗓뗔뗕뗖뗗뗘뗙뗚뗛뗜뗝뗞뗟뗠뗡뗢뗣뗤뗥뗦뗧뗨뗩뗪뗫뗬뗭뗮뗯뗰뗱뗲뗳뗴뗵뗶뗷뗸뗹뗺뗻뗼뗽뗾뗿똀똁똂똃똄똅똆똇똈똉똊똋똌똍똎똏또똑똒똓똔똕똖똗똘똙똚똛똜똝똞똟똠똡똢똣똤똥똦똧똨똩똪똫똬똭똮똯똰똱똲똳똴똵똶똷똸똹똺똻똼똽똾똿뙀뙁뙂뙃뙄뙅뙆뙇뙈뙉뙊뙋뙌뙍뙎뙏뙐뙑뙒뙓뙔뙕뙖뙗뙘뙙뙚뙛뙜뙝뙞뙟뙠뙡뙢뙣뙤뙥뙦뙧뙨뙩뙪뙫뙬뙭뙮뙯뙰뙱뙲뙳뙴뙵뙶뙷뙸뙹뙺뙻뙼뙽뙾뙿뚀뚁뚂뚃뚄뚅뚆뚇뚈뚉뚊뚋뚌뚍뚎뚏뚐뚑뚒뚓뚔뚕뚖뚗뚘뚙뚚뚛뚜뚝뚞뚟뚠뚡뚢뚣뚤뚥뚦뚧뚨뚩뚪뚫뚬뚭뚮뚯뚰뚱뚲뚳뚴뚵뚶뚷뚸뚹뚺뚻뚼뚽뚾뚿뛀뛁뛂뛃뛄뛅뛆뛇뛈뛉뛊뛋뛌뛍뛎뛏뛐뛑뛒뛓뛔뛕뛖뛗뛘뛙뛚뛛뛜뛝뛞뛟뛠뛡뛢뛣뛤뛥뛦뛧뛨뛩뛪뛫뛬뛭뛮뛯뛰뛱뛲뛳뛴뛵뛶뛷뛸뛹뛺뛻뛼뛽뛾뛿뜀뜁뜂뜃뜄뜅뜆뜇뜈뜉뜊뜋뜌뜍뜎뜏뜐뜑뜒뜓뜔뜕뜖뜗뜘뜙뜚뜛뜜뜝뜞뜟뜠뜡뜢뜣뜤뜥뜦뜧뜨뜩뜪뜫뜬뜭뜮뜯뜰뜱뜲뜳뜴뜵뜶뜷뜸뜹뜺뜻뜼뜽뜾뜿띀띁띂띃띄띅띆띇띈띉띊띋띌띍띎띏띐띑띒띓띔띕띖띗띘띙띚띛띜띝띞띟띠띡띢띣띤띥띦띧띨띩띪띫띬띭띮띯띰띱띲띳띴띵띶띷띸띹띺띻라락띾띿란랁랂랃랄랅랆랇랈랉랊랋람랍랎랏랐랑랒랓랔랕랖랗래랙랚랛랜랝랞랟랠랡랢랣랤랥랦랧램랩랪랫랬랭랮랯랰랱랲랳랴략랶랷랸랹랺랻랼랽랾랿럀럁럂럃럄럅럆럇럈량럊럋럌럍럎럏럐럑럒럓럔럕럖럗럘럙럚럛럜럝럞럟럠럡럢럣럤럥럦럧럨럩럪럫러럭럮럯런럱럲럳럴럵럶럷럸럹럺럻럼럽럾럿렀렁렂렃렄렅렆렇레렉렊렋렌렍렎렏렐렑렒렓렔렕렖렗렘렙렚렛렜렝렞렟렠렡렢렣려력렦렧련렩렪렫렬렭렮렯렰렱렲렳렴렵렶렷렸령렺렻렼렽렾렿례롁롂롃롄롅롆롇롈롉롊롋롌롍롎롏롐롑롒롓롔롕롖롗롘롙롚롛로록롞롟론롡롢롣롤롥롦롧롨롩롪롫롬롭롮롯롰롱롲롳롴롵롶롷롸롹롺롻롼롽롾롿뢀뢁뢂뢃뢄뢅뢆뢇뢈뢉뢊뢋뢌뢍뢎뢏뢐뢑뢒뢓뢔뢕뢖뢗뢘뢙뢚뢛뢜뢝뢞뢟뢠뢡뢢뢣뢤뢥뢦뢧뢨뢩뢪뢫뢬뢭뢮뢯뢰뢱뢲뢳뢴뢵뢶뢷뢸뢹뢺뢻뢼뢽뢾뢿룀룁룂룃룄룅룆룇룈룉룊룋료룍룎룏룐룑룒룓룔룕룖룗룘룙룚룛룜룝룞룟룠룡룢룣룤룥룦룧루룩룪룫룬룭룮룯룰룱룲룳룴룵룶룷룸룹룺룻룼룽룾룿뤀뤁뤂뤃뤄뤅뤆뤇뤈뤉뤊뤋뤌뤍뤎뤏뤐뤑뤒뤓뤔뤕뤖뤗뤘뤙뤚뤛뤜뤝뤞뤟뤠뤡뤢뤣뤤뤥뤦뤧뤨뤩뤪뤫뤬뤭뤮뤯뤰뤱뤲뤳뤴뤵뤶뤷뤸뤹뤺뤻뤼뤽뤾뤿륀륁륂륃륄륅륆륇륈륉륊륋륌륍륎륏륐륑륒륓륔륕륖륗류륙륚륛륜륝륞륟률륡륢륣륤륥륦륧륨륩륪륫륬륭륮륯륰륱륲륳르륵륶륷른륹륺륻를
륽륾륿릀릁릂릃름릅릆릇릈릉릊릋릌릍릎릏릐릑릒릓릔릕릖릗릘릙릚릛릜릝릞릟릠릡릢릣릤릥릦릧릨릩릪릫리릭릮릯린릱릲릳릴릵릶릷릸릹릺릻림립릾릿맀링맂맃맄맅맆맇마막맊맋만맍많맏말맑맒맓맔맕맖맗맘맙맚맛맜망맞맟맠맡맢맣매맥맦맧맨맩맪맫맬맭맮맯맰맱맲맳맴맵맶맷맸맹맺맻맼맽맾맿먀먁먂먃먄먅먆먇먈먉먊먋먌먍먎먏먐먑먒먓먔먕먖먗먘먙먚먛먜먝먞먟먠먡먢먣먤먥먦먧먨먩먪먫먬먭먮먯먰먱먲먳먴먵먶먷머먹먺먻먼먽먾먿멀멁멂멃멄멅멆멇멈멉멊멋멌멍멎멏멐멑멒멓메멕멖멗멘멙멚멛멜멝멞멟멠멡멢멣멤멥멦멧멨멩멪멫멬멭멮멯며멱멲멳면멵멶멷멸멹멺멻멼멽멾멿몀몁몂몃몄명몆몇몈몉몊몋몌몍몎몏몐몑몒몓몔몕몖몗몘몙몚몛몜몝몞몟몠몡몢몣몤몥몦몧모목몪몫몬몭몮몯몰몱몲몳몴몵몶몷몸몹몺못몼몽몾몿뫀뫁뫂뫃뫄뫅뫆뫇뫈뫉뫊뫋뫌뫍뫎뫏뫐뫑뫒뫓뫔뫕뫖뫗뫘뫙뫚뫛뫜뫝뫞뫟뫠뫡뫢뫣뫤뫥뫦뫧뫨뫩뫪뫫뫬뫭뫮뫯뫰뫱뫲뫳뫴뫵뫶뫷뫸뫹뫺뫻뫼뫽뫾뫿묀묁묂묃묄묅묆묇묈묉묊묋묌묍묎묏묐묑묒묓묔묕묖묗묘묙묚묛묜묝묞묟묠묡묢묣묤묥묦묧묨묩묪묫묬묭묮묯묰묱묲묳무묵묶묷문묹묺묻물묽묾묿뭀뭁뭂뭃뭄뭅뭆뭇뭈뭉뭊뭋뭌뭍뭎뭏뭐뭑뭒뭓뭔뭕뭖뭗뭘뭙뭚뭛뭜뭝뭞뭟뭠뭡뭢뭣뭤뭥뭦뭧뭨뭩뭪뭫뭬뭭뭮뭯뭰뭱뭲뭳뭴뭵뭶뭷뭸뭹뭺뭻뭼뭽뭾뭿뮀뮁뮂뮃뮄뮅뮆뮇뮈뮉뮊뮋뮌뮍뮎뮏뮐뮑뮒뮓뮔뮕뮖뮗뮘뮙뮚뮛뮜뮝뮞뮟뮠뮡뮢뮣뮤뮥뮦뮧뮨뮩뮪뮫뮬뮭뮮뮯뮰뮱뮲뮳뮴뮵뮶뮷뮸뮹뮺뮻뮼뮽뮾뮿므믁믂믃믄믅믆믇믈믉믊믋믌믍믎믏믐믑믒믓믔믕믖믗믘믙믚믛믜믝믞믟믠믡믢믣믤믥믦믧믨믩믪믫믬믭믮믯믰믱믲믳믴믵믶믷미믹믺믻민믽믾믿밀밁밂밃밄밅밆밇밈밉밊밋밌밍밎및밐밑밒밓바박밖밗반밙밚받발밝밞밟밠밡밢밣밤밥밦밧밨방밪밫밬밭밮밯배백밲밳밴밵밶밷밸밹밺밻밼밽밾밿뱀뱁뱂뱃뱄뱅뱆뱇뱈뱉뱊뱋뱌뱍뱎뱏뱐뱑뱒뱓뱔뱕뱖뱗뱘뱙뱚뱛뱜뱝뱞뱟뱠뱡뱢뱣뱤뱥뱦뱧뱨뱩뱪뱫뱬뱭뱮뱯뱰뱱뱲뱳뱴뱵뱶뱷뱸뱹뱺뱻뱼뱽뱾뱿벀벁벂벃버벅벆벇번벉벊벋벌벍벎벏벐벑벒벓범법벖벗벘벙벚벛벜벝벞벟베벡벢벣벤벥벦벧벨벩벪벫벬벭벮벯벰벱벲벳벴벵벶벷벸벹벺벻벼벽벾벿변볁볂볃별볅볆볇볈볉볊볋볌볍볎볏볐병볒볓볔볕볖볗볘볙볚볛볜볝볞볟볠볡볢볣볤볥볦볧볨볩볪볫볬볭볮볯볰볱볲볳보복볶볷본볹볺볻볼볽볾볿봀봁봂봃봄봅봆봇봈봉봊봋봌봍봎봏봐봑봒봓봔봕봖봗봘봙봚봛봜봝봞봟봠봡봢봣봤봥봦봧봨봩봪봫봬봭봮봯봰봱봲봳봴봵봶봷봸봹봺봻봼봽봾봿뵀뵁뵂뵃뵄뵅뵆뵇뵈뵉뵊뵋뵌뵍뵎뵏뵐뵑뵒뵓뵔뵕뵖뵗뵘뵙뵚뵛뵜뵝뵞뵟뵠뵡뵢뵣뵤뵥뵦뵧뵨뵩뵪뵫뵬뵭뵮뵯뵰뵱뵲뵳뵴뵵뵶뵷뵸뵹뵺뵻뵼뵽뵾뵿부북붂붃분붅붆붇불붉붊붋붌붍붎붏붐붑붒붓붔붕붖붗붘붙붚붛붜붝붞붟붠붡붢붣붤붥붦붧붨붩붪붫붬붭붮붯붰붱붲붳붴붵붶붷붸붹붺붻붼붽붾붿뷀뷁뷂뷃뷄뷅뷆뷇뷈뷉뷊뷋뷌뷍뷎뷏뷐뷑뷒뷓뷔뷕뷖뷗뷘뷙뷚뷛뷜뷝뷞뷟뷠뷡뷢뷣뷤뷥뷦뷧뷨뷩뷪뷫뷬뷭뷮뷯뷰뷱뷲뷳뷴뷵뷶뷷뷸뷹뷺뷻뷼뷽뷾뷿븀븁븂븃븄븅븆븇븈븉븊븋브븍븎븏븐븑븒븓블븕븖븗븘븙븚븛븜븝븞븟븠븡븢븣븤븥븦븧븨븩븪븫븬븭븮븯븰븱븲븳븴븵븶븷븸븹븺븻븼븽븾븿빀빁빂빃비빅빆빇빈빉빊빋빌빍빎빏빐빑빒빓빔빕빖빗빘빙빚빛빜빝빞빟빠빡빢빣빤빥빦빧빨빩빪빫빬빭빮빯빰빱빲빳빴빵빶빷빸빹빺빻빼빽빾빿뺀뺁뺂뺃뺄뺅뺆뺇뺈뺉뺊뺋뺌뺍뺎뺏뺐뺑뺒뺓뺔뺕뺖뺗뺘뺙뺚뺛뺜뺝뺞뺟뺠뺡뺢뺣뺤뺥뺦뺧뺨뺩뺪뺫뺬뺭뺮뺯뺰뺱뺲뺳뺴뺵뺶뺷뺸뺹뺺뺻뺼뺽뺾뺿뻀뻁뻂뻃뻄뻅뻆뻇뻈뻉뻊뻋뻌뻍뻎뻏뻐뻑뻒뻓뻔뻕뻖뻗뻘뻙뻚뻛뻜뻝뻞뻟뻠뻡뻢뻣뻤뻥뻦뻧뻨뻩뻪뻫뻬뻭뻮뻯뻰뻱뻲뻳뻴뻵뻶뻷뻸뻹뻺뻻뻼뻽뻾뻿뼀뼁뼂뼃뼄뼅뼆뼇뼈뼉뼊뼋뼌뼍뼎뼏뼐뼑뼒뼓뼔뼕뼖뼗뼘뼙뼚뼛뼜뼝뼞뼟뼠뼡뼢뼣뼤뼥뼦뼧뼨뼩뼪뼫뼬뼭뼮뼯뼰뼱뼲뼳뼴뼵뼶뼷뼸뼹뼺뼻뼼뼽뼾뼿뽀뽁뽂뽃뽄뽅뽆뽇뽈뽉뽊뽋뽌뽍뽎뽏뽐뽑뽒뽓뽔뽕뽖뽗뽘뽙뽚뽛뽜뽝뽞뽟뽠뽡뽢뽣뽤뽥뽦뽧뽨뽩뽪뽫뽬뽭뽮뽯뽰뽱뽲뽳뽴뽵뽶뽷뽸뽹뽺뽻뽼뽽뽾뽿뾀뾁뾂뾃뾄뾅뾆뾇뾈뾉뾊뾋뾌뾍뾎뾏뾐뾑뾒뾓뾔뾕뾖뾗뾘뾙뾚뾛뾜뾝뾞뾟뾠뾡뾢뾣뾤뾥뾦뾧뾨뾩뾪뾫뾬뾭뾮뾯뾰뾱뾲뾳뾴뾵뾶뾷뾸뾹뾺뾻뾼뾽뾾뾿뿀뿁뿂뿃뿄뿅뿆뿇뿈뿉뿊뿋뿌뿍뿎뿏뿐뿑뿒뿓뿔뿕뿖뿗뿘뿙뿚뿛뿜뿝뿞뿟뿠뿡뿢뿣뿤뿥뿦뿧뿨뿩뿪뿫뿬뿭뿮뿯뿰뿱뿲뿳뿴뿵뿶뿷뿸뿹뿺뿻뿼뿽뿾뿿쀀쀁쀂쀃쀄쀅쀆쀇쀈쀉쀊쀋쀌쀍쀎쀏쀐쀑쀒쀓쀔쀕쀖쀗쀘쀙쀚쀛쀜쀝쀞쀟쀠쀡쀢쀣쀤쀥쀦쀧쀨쀩쀪쀫쀬쀭쀮쀯쀰쀱쀲쀳쀴쀵쀶쀷쀸쀹쀺쀻쀼쀽쀾쀿쁀쁁쁂쁃쁄쁅쁆쁇쁈쁉쁊쁋쁌쁍쁎쁏쁐쁑쁒쁓쁔쁕쁖쁗쁘쁙쁚쁛쁜쁝쁞쁟쁠쁡쁢쁣쁤쁥쁦쁧쁨쁩쁪쁫쁬쁭쁮쁯쁰쁱쁲쁳쁴쁵쁶쁷쁸쁹쁺쁻쁼쁽쁾쁿삀삁삂삃삄삅삆삇삈삉삊삋삌삍삎삏삐삑삒삓삔삕삖삗삘삙삚삛삜삝삞삟삠삡삢삣삤삥삦삧삨삩삪삫사삭삮삯산삱삲삳살삵삶삷삸삹삺삻삼삽삾삿샀상샂샃샄샅샆샇새색샊샋샌샍샎샏샐샑샒샓샔샕샖샗샘샙샚샛샜생샞샟샠샡샢샣샤샥샦샧샨샩샪샫샬샭샮샯샰샱샲샳샴샵샶샷샸샹샺샻샼샽샾샿섀섁섂섃섄섅섆섇섈섉섊섋섌섍섎섏섐섑섒섓섔섕섖섗섘섙섚섛서석섞섟선섡섢섣설섥섦섧섨섩섪섫섬섭섮섯섰성섲섳섴섵섶섷세섹섺섻센섽섾섿셀셁셂셃셄셅셆셇셈셉셊셋셌
셍셎셏셐셑셒셓셔셕셖셗션셙셚셛셜셝셞셟셠셡셢셣셤셥셦셧셨셩셪셫셬셭셮셯셰셱셲셳셴셵셶셷셸셹셺셻셼셽셾셿솀솁솂솃솄솅솆솇솈솉솊솋소속솎솏손솑솒솓솔솕솖솗솘솙솚솛솜솝솞솟솠송솢솣솤솥솦솧솨솩솪솫솬솭솮솯솰솱솲솳솴솵솶솷솸솹솺솻솼솽솾솿쇀쇁쇂쇃쇄쇅쇆쇇쇈쇉쇊쇋쇌쇍쇎쇏쇐쇑쇒쇓쇔쇕쇖쇗쇘쇙쇚쇛쇜쇝쇞쇟쇠쇡쇢쇣쇤쇥쇦쇧쇨쇩쇪쇫쇬쇭쇮쇯쇰쇱쇲쇳쇴쇵쇶쇷쇸쇹쇺쇻쇼쇽쇾쇿숀숁숂숃숄숅숆숇숈숉숊숋숌숍숎숏숐숑숒숓숔숕숖숗수숙숚숛순숝숞숟술숡숢숣숤숥숦숧숨숩숪숫숬숭숮숯숰숱숲숳숴숵숶숷숸숹숺숻숼숽숾숿쉀쉁쉂쉃쉄쉅쉆쉇쉈쉉쉊쉋쉌쉍쉎쉏쉐쉑쉒쉓쉔쉕쉖쉗쉘쉙쉚쉛쉜쉝쉞쉟쉠쉡쉢쉣쉤쉥쉦쉧쉨쉩쉪쉫쉬쉭쉮쉯쉰쉱쉲쉳쉴쉵쉶쉷쉸쉹쉺쉻쉼쉽쉾쉿슀슁슂슃슄슅슆슇슈슉슊슋슌슍슎슏슐슑슒슓슔슕슖슗슘슙슚슛슜슝슞슟슠슡슢슣스슥슦슧슨슩슪슫슬슭슮슯슰슱슲슳슴습슶슷슸승슺슻슼슽슾슿싀싁싂싃싄싅싆싇싈싉싊싋싌싍싎싏싐싑싒싓싔싕싖싗싘싙싚싛시식싞싟신싡싢싣실싥싦싧싨싩싪싫심십싮싯싰싱싲싳싴싵싶싷싸싹싺싻싼싽싾싿쌀쌁쌂쌃쌄쌅쌆쌇쌈쌉쌊쌋쌌쌍쌎쌏쌐쌑쌒쌓쌔쌕쌖쌗쌘쌙쌚쌛쌜쌝쌞쌟쌠쌡쌢쌣쌤쌥쌦쌧쌨쌩쌪쌫쌬쌭쌮쌯쌰쌱쌲쌳쌴쌵쌶쌷쌸쌹쌺쌻쌼쌽쌾쌿썀썁썂썃썄썅썆썇썈썉썊썋썌썍썎썏썐썑썒썓썔썕썖썗썘썙썚썛썜썝썞썟썠썡썢썣썤썥썦썧써썩썪썫썬썭썮썯썰썱썲썳썴썵썶썷썸썹썺썻썼썽썾썿쎀쎁쎂쎃쎄쎅쎆쎇쎈쎉쎊쎋쎌쎍쎎쎏쎐쎑쎒쎓쎔쎕쎖쎗쎘쎙쎚쎛쎜쎝쎞쎟쎠쎡쎢쎣쎤쎥쎦쎧쎨쎩쎪쎫쎬쎭쎮쎯쎰쎱쎲쎳쎴쎵쎶쎷쎸쎹쎺쎻쎼쎽쎾쎿쏀쏁쏂쏃쏄쏅쏆쏇쏈쏉쏊쏋쏌쏍쏎쏏쏐쏑쏒쏓쏔쏕쏖쏗쏘쏙쏚쏛쏜쏝쏞쏟쏠쏡쏢쏣쏤쏥쏦쏧쏨쏩쏪쏫쏬쏭쏮쏯쏰쏱쏲쏳쏴쏵쏶쏷쏸쏹쏺쏻쏼쏽쏾쏿쐀쐁쐂쐃쐄쐅쐆쐇쐈쐉쐊쐋쐌쐍쐎쐏쐐쐑쐒쐓쐔쐕쐖쐗쐘쐙쐚쐛쐜쐝쐞쐟쐠쐡쐢쐣쐤쐥쐦쐧쐨쐩쐪쐫쐬쐭쐮쐯쐰쐱쐲쐳쐴쐵쐶쐷쐸쐹쐺쐻쐼쐽쐾쐿쑀쑁쑂쑃쑄쑅쑆쑇쑈쑉쑊쑋쑌쑍쑎쑏쑐쑑쑒쑓쑔쑕쑖쑗쑘쑙쑚쑛쑜쑝쑞쑟쑠쑡쑢쑣쑤쑥쑦쑧쑨쑩쑪쑫쑬쑭쑮쑯쑰쑱쑲쑳쑴쑵쑶쑷쑸쑹쑺쑻쑼쑽쑾쑿쒀쒁쒂쒃쒄쒅쒆쒇쒈쒉쒊쒋쒌쒍쒎쒏쒐쒑쒒쒓쒔쒕쒖쒗쒘쒙쒚쒛쒜쒝쒞쒟쒠쒡쒢쒣쒤쒥쒦쒧쒨쒩쒪쒫쒬쒭쒮쒯쒰쒱쒲쒳쒴쒵쒶쒷쒸쒹쒺쒻쒼쒽쒾쒿쓀쓁쓂쓃쓄쓅쓆쓇쓈쓉쓊쓋쓌쓍쓎쓏쓐쓑쓒쓓쓔쓕쓖쓗쓘쓙쓚쓛쓜쓝쓞쓟쓠쓡쓢쓣쓤쓥쓦쓧쓨쓩쓪쓫쓬쓭쓮쓯쓰쓱쓲쓳쓴쓵쓶쓷쓸쓹쓺쓻쓼쓽쓾쓿씀씁씂씃씄씅씆씇씈씉씊씋씌씍씎씏씐씑씒씓씔씕씖씗씘씙씚씛씜씝씞씟씠씡씢씣씤씥씦씧씨씩씪씫씬씭씮씯씰씱씲씳씴씵씶씷씸씹씺씻씼씽씾씿앀앁앂앃아악앆앇안앉않앋알앍앎앏앐앑앒앓암압앖앗았앙앚앛앜앝앞앟애액앢앣앤앥앦앧앨앩앪앫앬앭앮앯앰앱앲앳앴앵앶앷앸앹앺앻야약앾앿얀얁얂얃얄얅얆얇얈얉얊얋얌얍얎얏얐양얒얓얔얕얖얗얘얙얚얛얜얝얞얟얠얡얢얣얤얥얦얧얨얩얪얫얬얭얮얯얰얱얲얳어억얶얷언얹얺얻얼얽얾얿엀엁엂엃엄업없엇었엉엊엋엌엍엎엏에엑엒엓엔엕엖엗엘엙엚엛엜엝엞엟엠엡엢엣엤엥엦엧엨엩엪엫여역엮엯연엱엲엳열엵엶엷엸엹엺엻염엽엾엿였영옂옃옄옅옆옇예옉옊옋옌옍옎옏옐옑옒옓옔옕옖옗옘옙옚옛옜옝옞옟옠옡옢옣오옥옦옧온옩옪옫올옭옮옯옰옱옲옳옴옵옶옷옸옹옺옻옼옽옾옿와왁왂왃완왅왆왇왈왉왊왋왌왍왎왏왐왑왒왓왔왕왖왗왘왙왚왛왜왝왞왟왠왡왢왣왤왥왦왧왨왩왪왫왬왭왮왯왰왱왲왳왴왵왶왷외왹왺왻왼왽왾왿욀욁욂욃욄욅욆욇욈욉욊욋욌욍욎욏욐욑욒욓요욕욖욗욘욙욚욛욜욝욞욟욠욡욢욣욤욥욦욧욨용욪욫욬욭욮욯우욱욲욳운욵욶욷울욹욺욻욼욽욾욿움웁웂웃웄웅웆웇웈웉웊웋워웍웎웏원웑웒웓월웕웖웗웘웙웚웛웜웝웞웟웠웡웢웣웤웥웦웧웨웩웪웫웬웭웮웯웰웱웲웳웴웵웶웷웸웹웺웻웼웽웾웿윀윁윂윃위윅윆윇윈윉윊윋윌윍윎윏윐윑윒윓윔윕윖윗윘윙윚윛윜윝윞윟유육윢윣윤윥윦윧율윩윪윫윬윭윮윯윰윱윲윳윴융윶윷윸윹윺윻으윽윾윿은읁읂읃을읅읆읇읈읉읊읋음읍읎읏읐응읒읓읔읕읖읗의읙읚읛읜읝읞읟읠읡읢읣읤읥읦읧읨읩읪읫읬읭읮읯읰읱읲읳이익읶읷인읹읺읻일읽읾읿잀잁잂잃임입잆잇있잉잊잋잌잍잎잏자작잒잓잔잕잖잗잘잙잚잛잜잝잞잟잠잡잢잣잤장잦잧잨잩잪잫재잭잮잯잰잱잲잳잴잵잶잷잸잹잺잻잼잽잾잿쟀쟁쟂쟃쟄쟅쟆쟇쟈쟉쟊쟋쟌쟍쟎쟏쟐쟑쟒쟓쟔쟕쟖쟗쟘쟙쟚쟛쟜쟝쟞쟟쟠쟡쟢쟣쟤쟥쟦쟧쟨쟩쟪쟫쟬쟭쟮쟯쟰쟱쟲쟳쟴쟵쟶쟷쟸쟹쟺쟻쟼쟽쟾쟿저적젂젃전젅젆젇절젉젊젋젌젍젎젏점접젒젓젔정젖젗젘젙젚젛제젝젞젟젠젡젢젣젤젥젦젧젨젩젪젫젬젭젮젯젰젱젲젳젴젵젶젷져젹젺젻젼젽젾젿졀졁졂졃졄졅졆졇졈졉졊졋졌졍졎졏졐졑졒졓졔졕졖졗졘졙졚졛졜졝졞졟졠졡졢졣졤졥졦졧졨졩졪졫졬졭졮졯조족졲졳존졵졶졷졸졹졺졻졼졽졾졿좀좁좂좃좄종좆좇좈좉좊좋좌좍좎좏좐좑좒좓좔좕좖좗좘좙좚좛좜좝좞좟좠좡좢좣좤좥좦좧좨좩좪좫좬좭좮좯좰좱좲좳좴좵좶좷좸좹좺좻좼좽좾좿죀죁죂죃죄죅죆죇죈죉죊죋죌죍죎죏죐죑죒죓죔죕죖죗죘죙죚죛죜죝죞죟죠죡죢죣죤죥죦죧죨죩죪죫죬죭죮죯죰죱죲죳죴죵죶죷죸죹죺죻주죽죾죿준줁줂줃줄줅줆줇줈줉줊줋줌줍줎줏줐중줒줓줔줕줖줗줘줙줚줛줜
줝줞줟줠줡줢줣줤줥줦줧줨줩줪줫줬줭줮줯줰줱줲줳줴줵줶줷줸줹줺줻줼줽줾줿쥀쥁쥂쥃쥄쥅쥆쥇쥈쥉쥊쥋쥌쥍쥎쥏쥐쥑쥒쥓쥔쥕쥖쥗쥘쥙쥚쥛쥜쥝쥞쥟쥠쥡쥢쥣쥤쥥쥦쥧쥨쥩쥪쥫쥬쥭쥮쥯쥰쥱쥲쥳쥴쥵쥶쥷쥸쥹쥺쥻쥼쥽쥾쥿즀즁즂즃즄즅즆즇즈즉즊즋즌즍즎즏즐즑즒즓즔즕즖즗즘즙즚즛즜증즞즟즠즡즢즣즤즥즦즧즨즩즪즫즬즭즮즯즰즱즲즳즴즵즶즷즸즹즺즻즼즽즾즿지직짂짃진짅짆짇질짉짊짋짌짍짎짏짐집짒짓짔징짖짗짘짙짚짛짜짝짞짟짠짡짢짣짤짥짦짧짨짩짪짫짬짭짮짯짰짱짲짳짴짵짶짷째짹짺짻짼짽짾짿쨀쨁쨂쨃쨄쨅쨆쨇쨈쨉쨊쨋쨌쨍쨎쨏쨐쨑쨒쨓쨔쨕쨖쨗쨘쨙쨚쨛쨜쨝쨞쨟쨠쨡쨢쨣쨤쨥쨦쨧쨨쨩쨪쨫쨬쨭쨮쨯쨰쨱쨲쨳쨴쨵쨶쨷쨸쨹쨺쨻쨼쨽쨾쨿쩀쩁쩂쩃쩄쩅쩆쩇쩈쩉쩊쩋쩌쩍쩎쩏쩐쩑쩒쩓쩔쩕쩖쩗쩘쩙쩚쩛쩜쩝쩞쩟쩠쩡쩢쩣쩤쩥쩦쩧쩨쩩쩪쩫쩬쩭쩮쩯쩰쩱쩲쩳쩴쩵쩶쩷쩸쩹쩺쩻쩼쩽쩾쩿쪀쪁쪂쪃쪄쪅쪆쪇쪈쪉쪊쪋쪌쪍쪎쪏쪐쪑쪒쪓쪔쪕쪖쪗쪘쪙쪚쪛쪜쪝쪞쪟쪠쪡쪢쪣쪤쪥쪦쪧쪨쪩쪪쪫쪬쪭쪮쪯쪰쪱쪲쪳쪴쪵쪶쪷쪸쪹쪺쪻쪼쪽쪾쪿쫀쫁쫂쫃쫄쫅쫆쫇쫈쫉쫊쫋쫌쫍쫎쫏쫐쫑쫒쫓쫔쫕쫖쫗쫘쫙쫚쫛쫜쫝쫞쫟쫠쫡쫢쫣쫤쫥쫦쫧쫨쫩쫪쫫쫬쫭쫮쫯쫰쫱쫲쫳쫴쫵쫶쫷쫸쫹쫺쫻쫼쫽쫾쫿쬀쬁쬂쬃쬄쬅쬆쬇쬈쬉쬊쬋쬌쬍쬎쬏쬐쬑쬒쬓쬔쬕쬖쬗쬘쬙쬚쬛쬜쬝쬞쬟쬠쬡쬢쬣쬤쬥쬦쬧쬨쬩쬪쬫쬬쬭쬮쬯쬰쬱쬲쬳쬴쬵쬶쬷쬸쬹쬺쬻쬼쬽쬾쬿쭀쭁쭂쭃쭄쭅쭆쭇쭈쭉쭊쭋쭌쭍쭎쭏쭐쭑쭒쭓쭔쭕쭖쭗쭘쭙쭚쭛쭜쭝쭞쭟쭠쭡쭢쭣쭤쭥쭦쭧쭨쭩쭪쭫쭬쭭쭮쭯쭰쭱쭲쭳쭴쭵쭶쭷쭸쭹쭺쭻쭼쭽쭾쭿쮀쮁쮂쮃쮄쮅쮆쮇쮈쮉쮊쮋쮌쮍쮎쮏쮐쮑쮒쮓쮔쮕쮖쮗쮘쮙쮚쮛쮜쮝쮞쮟쮠쮡쮢쮣쮤쮥쮦쮧쮨쮩쮪쮫쮬쮭쮮쮯쮰쮱쮲쮳쮴쮵쮶쮷쮸쮹쮺쮻쮼쮽쮾쮿쯀쯁쯂쯃쯄쯅쯆쯇쯈쯉쯊쯋쯌쯍쯎쯏쯐쯑쯒쯓쯔쯕쯖쯗쯘쯙쯚쯛쯜쯝쯞쯟쯠쯡쯢쯣쯤쯥쯦쯧쯨쯩쯪쯫쯬쯭쯮쯯쯰쯱쯲쯳쯴쯵쯶쯷쯸쯹쯺쯻쯼쯽쯾쯿찀찁찂찃찄찅찆찇찈찉찊찋찌찍찎찏찐찑찒찓찔찕찖찗찘찙찚찛찜찝찞찟찠찡찢찣찤찥찦찧차착찪찫찬찭찮찯찰찱찲찳찴찵찶찷참찹찺찻찼창찾찿챀챁챂챃채책챆챇챈챉챊챋챌챍챎챏챐챑챒챓챔챕챖챗챘챙챚챛챜챝챞챟챠챡챢챣챤챥챦챧챨챩챪챫챬챭챮챯챰챱챲챳챴챵챶챷챸챹챺챻챼챽챾챿첀첁첂첃첄첅첆첇첈첉첊첋첌첍첎첏첐첑첒첓첔첕첖첗처척첚첛천첝첞첟철첡첢첣첤첥첦첧첨첩첪첫첬청첮첯첰첱첲첳체첵첶첷첸첹첺첻첼첽첾첿쳀쳁쳂쳃쳄쳅쳆쳇쳈쳉쳊쳋쳌쳍쳎쳏쳐쳑쳒쳓쳔쳕쳖쳗쳘쳙쳚쳛쳜쳝쳞쳟쳠쳡쳢쳣쳤쳥쳦쳧쳨쳩쳪쳫쳬쳭쳮쳯쳰쳱쳲쳳쳴쳵쳶쳷쳸쳹쳺쳻쳼쳽쳾쳿촀촁촂촃촄촅촆촇초촉촊촋촌촍촎촏촐촑촒촓촔촕촖촗촘촙촚촛촜총촞촟촠촡촢촣촤촥촦촧촨촩촪촫촬촭촮촯촰촱촲촳촴촵촶촷촸촹촺촻촼촽촾촿쵀쵁쵂쵃쵄쵅쵆쵇쵈쵉쵊쵋쵌쵍쵎쵏쵐쵑쵒쵓쵔쵕쵖쵗쵘쵙쵚쵛최쵝쵞쵟쵠쵡쵢쵣쵤쵥쵦쵧쵨쵩쵪쵫쵬쵭쵮쵯쵰쵱쵲쵳쵴쵵쵶쵷쵸쵹쵺쵻쵼쵽쵾쵿춀춁춂춃춄춅춆춇춈춉춊춋춌춍춎춏춐춑춒춓추축춖춗춘춙춚춛출춝춞춟춠춡춢춣춤춥춦춧춨충춪춫춬춭춮춯춰춱춲춳춴춵춶춷춸춹춺춻춼춽춾춿췀췁췂췃췄췅췆췇췈췉췊췋췌췍췎췏췐췑췒췓췔췕췖췗췘췙췚췛췜췝췞췟췠췡췢췣췤췥췦췧취췩췪췫췬췭췮췯췰췱췲췳췴췵췶췷췸췹췺췻췼췽췾췿츀츁츂츃츄츅츆츇츈츉츊츋츌츍츎츏츐츑츒츓츔츕츖츗츘츙츚츛츜츝츞츟츠측츢츣츤츥츦츧츨츩츪츫츬츭츮츯츰츱츲츳츴층츶츷츸츹츺츻츼츽츾츿칀칁칂칃칄칅칆칇칈칉칊칋칌칍칎칏칐칑칒칓칔칕칖칗치칙칚칛친칝칞칟칠칡칢칣칤칥칦칧침칩칪칫칬칭칮칯칰칱칲칳카칵칶칷칸칹칺칻칼칽칾칿캀캁캂캃캄캅캆캇캈캉캊캋캌캍캎캏캐캑캒캓캔캕캖캗캘캙캚캛캜캝캞캟캠캡캢캣캤캥캦캧캨캩캪캫캬캭캮캯캰캱캲캳캴캵캶캷캸캹캺캻캼캽캾캿컀컁컂컃컄컅컆컇컈컉컊컋컌컍컎컏컐컑컒컓컔컕컖컗컘컙컚컛컜컝컞컟컠컡컢컣커컥컦컧컨컩컪컫컬컭컮컯컰컱컲컳컴컵컶컷컸컹컺컻컼컽컾컿케켁켂켃켄켅켆켇켈켉켊켋켌켍켎켏켐켑켒켓켔켕켖켗켘켙켚켛켜켝켞켟켠켡켢켣켤켥켦켧켨켩켪켫켬켭켮켯켰켱켲켳켴켵켶켷켸켹켺켻켼켽켾켿콀콁콂콃콄콅콆콇콈콉콊콋콌콍콎콏콐콑콒콓코콕콖콗콘콙콚콛콜콝콞콟콠콡콢콣콤콥콦콧콨콩콪콫콬콭콮콯콰콱콲콳콴콵콶콷콸콹콺콻콼콽콾콿쾀쾁쾂쾃쾄쾅쾆쾇쾈쾉쾊쾋쾌쾍쾎쾏쾐쾑쾒쾓쾔쾕쾖쾗쾘쾙쾚쾛쾜쾝쾞쾟쾠쾡쾢쾣쾤쾥쾦쾧쾨쾩쾪쾫쾬쾭쾮쾯쾰쾱쾲쾳쾴쾵쾶쾷쾸쾹쾺쾻쾼쾽쾾쾿쿀쿁쿂쿃쿄쿅쿆쿇쿈쿉쿊쿋쿌쿍쿎쿏쿐쿑쿒쿓쿔쿕쿖쿗쿘쿙쿚쿛쿜쿝쿞쿟쿠쿡쿢쿣쿤쿥쿦쿧쿨쿩쿪쿫쿬쿭쿮쿯쿰쿱쿲쿳쿴쿵쿶쿷쿸쿹쿺쿻쿼쿽쿾쿿퀀퀁퀂퀃퀄퀅퀆퀇퀈퀉퀊퀋퀌퀍퀎퀏퀐퀑퀒퀓퀔퀕퀖퀗퀘퀙퀚퀛퀜퀝퀞퀟퀠퀡퀢퀣퀤퀥퀦퀧퀨퀩퀪퀫퀬퀭퀮퀯퀰퀱퀲퀳퀴퀵퀶퀷퀸퀹퀺퀻퀼퀽퀾퀿큀큁큂큃큄큅큆큇큈큉큊큋큌큍큎큏큐큑큒큓큔큕큖큗큘큙큚큛큜큝큞큟큠큡큢큣큤큥큦큧큨큩큪큫크큭큮큯큰큱큲큳클큵큶큷큸큹큺큻큼큽큾큿킀킁킂킃킄킅킆킇킈킉킊킋킌킍킎킏킐킑킒킓킔킕킖킗킘킙킚킛킜킝킞킟킠킡킢킣키킥킦킧킨킩킪킫킬킭킮킯킰킱킲킳킴킵킶킷킸킹킺킻킼킽킾킿타탁탂탃탄탅탆탇탈탉탊탋탌탍탎탏탐탑탒탓탔탕탖탗탘탙탚탛태택탞탟탠탡탢탣탤탥탦탧탨탩탪탫탬
탭탮탯탰탱탲탳탴탵탶탷탸탹탺탻탼탽탾탿턀턁턂턃턄턅턆턇턈턉턊턋턌턍턎턏턐턑턒턓턔턕턖턗턘턙턚턛턜턝턞턟턠턡턢턣턤턥턦턧턨턩턪턫턬턭턮턯터턱턲턳턴턵턶턷털턹턺턻턼턽턾턿텀텁텂텃텄텅텆텇텈텉텊텋테텍텎텏텐텑텒텓텔텕텖텗텘텙텚텛템텝텞텟텠텡텢텣텤텥텦텧텨텩텪텫텬텭텮텯텰텱텲텳텴텵텶텷텸텹텺텻텼텽텾텿톀톁톂톃톄톅톆톇톈톉톊톋톌톍톎톏톐톑톒톓톔톕톖톗톘톙톚톛톜톝톞톟토톡톢톣톤톥톦톧톨톩톪톫톬톭톮톯톰톱톲톳톴통톶톷톸톹톺톻톼톽톾톿퇀퇁퇂퇃퇄퇅퇆퇇퇈퇉퇊퇋퇌퇍퇎퇏퇐퇑퇒퇓퇔퇕퇖퇗퇘퇙퇚퇛퇜퇝퇞퇟퇠퇡퇢퇣퇤퇥퇦퇧퇨퇩퇪퇫퇬퇭퇮퇯퇰퇱퇲퇳퇴퇵퇶퇷퇸퇹퇺퇻퇼퇽퇾퇿툀툁툂툃툄툅툆툇툈툉툊툋툌툍툎툏툐툑툒툓툔툕툖툗툘툙툚툛툜툝툞툟툠툡툢툣툤툥툦툧툨툩툪툫투툭툮툯툰툱툲툳툴툵툶툷툸툹툺툻툼툽툾툿퉀퉁퉂퉃퉄퉅퉆퉇퉈퉉퉊퉋퉌퉍퉎퉏퉐퉑퉒퉓퉔퉕퉖퉗퉘퉙퉚퉛퉜퉝퉞퉟퉠퉡퉢퉣퉤퉥퉦퉧퉨퉩퉪퉫퉬퉭퉮퉯퉰퉱퉲퉳퉴퉵퉶퉷퉸퉹퉺퉻퉼퉽퉾퉿튀튁튂튃튄튅튆튇튈튉튊튋튌튍튎튏튐튑튒튓튔튕튖튗튘튙튚튛튜튝튞튟튠튡튢튣튤튥튦튧튨튩튪튫튬튭튮튯튰튱튲튳튴튵튶튷트특튺튻튼튽튾튿틀틁틂틃틄틅틆틇틈틉틊틋틌틍틎틏틐틑틒틓틔틕틖틗틘틙틚틛틜틝틞틟틠틡틢틣틤틥틦틧틨틩틪틫틬틭틮틯티틱틲틳틴틵틶틷틸틹틺틻틼틽틾틿팀팁팂팃팄팅팆팇팈팉팊팋파팍팎팏판팑팒팓팔팕팖팗팘팙팚팛팜팝팞팟팠팡팢팣팤팥팦팧패팩팪팫팬팭팮팯팰팱팲팳팴팵팶팷팸팹팺팻팼팽팾팿퍀퍁퍂퍃퍄퍅퍆퍇퍈퍉퍊퍋퍌퍍퍎퍏퍐퍑퍒퍓퍔퍕퍖퍗퍘퍙퍚퍛퍜퍝퍞퍟퍠퍡퍢퍣퍤퍥퍦퍧퍨퍩퍪퍫퍬퍭퍮퍯퍰퍱퍲퍳퍴퍵퍶퍷퍸퍹퍺퍻퍼퍽퍾퍿펀펁펂펃펄펅펆펇펈펉펊펋펌펍펎펏펐펑펒펓펔펕펖펗페펙펚펛펜펝펞펟펠펡펢펣펤펥펦펧펨펩펪펫펬펭펮펯펰펱펲펳펴펵펶펷편펹펺펻펼펽펾펿폀폁폂폃폄폅폆폇폈평폊폋폌폍폎폏폐폑폒폓폔폕폖폗폘폙폚폛폜폝폞폟폠폡폢폣폤폥폦폧폨폩폪폫포폭폮폯폰폱폲폳폴폵폶폷폸폹폺폻폼폽폾폿퐀퐁퐂퐃퐄퐅퐆퐇퐈퐉퐊퐋퐌퐍퐎퐏퐐퐑퐒퐓퐔퐕퐖퐗퐘퐙퐚퐛퐜퐝퐞퐟퐠퐡퐢퐣퐤퐥퐦퐧퐨퐩퐪퐫퐬퐭퐮퐯퐰퐱퐲퐳퐴퐵퐶퐷퐸퐹퐺퐻퐼퐽퐾퐿푀푁푂푃푄푅푆푇푈푉푊푋푌푍푎푏푐푑푒푓푔푕푖푗푘푙푚푛표푝푞푟푠푡푢푣푤푥푦푧푨푩푪푫푬푭푮푯푰푱푲푳푴푵푶푷푸푹푺푻푼푽푾푿풀풁풂풃풄풅풆풇품풉풊풋풌풍풎풏풐풑풒풓풔풕풖풗풘풙풚풛풜풝풞풟풠풡풢풣풤풥풦풧풨풩풪풫풬풭풮풯풰풱풲풳풴풵풶풷풸풹풺풻풼풽풾풿퓀퓁퓂퓃퓄퓅퓆퓇퓈퓉퓊퓋퓌퓍퓎퓏퓐퓑퓒퓓퓔퓕퓖퓗퓘퓙퓚퓛퓜퓝퓞퓟퓠퓡퓢퓣퓤퓥퓦퓧퓨퓩퓪퓫퓬퓭퓮퓯퓰퓱퓲퓳퓴퓵퓶퓷퓸퓹퓺퓻퓼퓽퓾퓿픀픁픂픃프픅픆픇픈픉픊픋플픍픎픏픐픑픒픓픔픕픖픗픘픙픚픛픜픝픞픟픠픡픢픣픤픥픦픧픨픩픪픫픬픭픮픯픰픱픲픳픴픵픶픷픸픹픺픻피픽픾픿핀핁핂핃필핅핆핇핈핉핊핋핌핍핎핏핐핑핒핓핔핕핖핗하학핚핛한핝핞핟할핡핢핣핤핥핦핧함합핪핫핬항핮핯핰핱핲핳해핵핶핷핸핹핺핻핼핽핾핿햀햁햂햃햄햅햆햇했행햊햋햌햍햎햏햐햑햒햓햔햕햖햗햘햙햚햛햜햝햞햟햠햡햢햣햤향햦햧햨햩햪햫햬햭햮햯햰햱햲햳햴햵햶햷햸햹햺햻햼햽햾햿헀헁헂헃헄헅헆헇허헉헊헋헌헍헎헏헐헑헒헓헔헕헖헗험헙헚헛헜헝헞헟헠헡헢헣헤헥헦헧헨헩헪헫헬헭헮헯헰헱헲헳헴헵헶헷헸헹헺헻헼헽헾헿혀혁혂혃현혅혆혇혈혉혊혋혌혍혎혏혐협혒혓혔형혖혗혘혙혚혛혜혝혞혟혠혡혢혣혤혥혦혧혨혩혪혫혬혭혮혯혰혱혲혳혴혵혶혷호혹혺혻혼혽혾혿홀홁홂홃홄홅홆홇홈홉홊홋홌홍홎홏홐홑홒홓화확홖홗환홙홚홛활홝홞홟홠홡홢홣홤홥홦홧홨황홪홫홬홭홮홯홰홱홲홳홴홵홶홷홸홹홺홻홼홽홾홿횀횁횂횃횄횅횆횇횈횉횊횋회획횎횏횐횑횒횓횔횕횖횗횘횙횚횛횜횝횞횟횠횡횢횣횤횥횦횧효횩횪횫횬횭횮횯횰횱횲횳횴횵횶횷횸횹횺횻횼횽횾횿훀훁훂훃후훅훆훇훈훉훊훋훌훍훎훏훐훑훒훓훔훕훖훗훘훙훚훛훜훝훞훟훠훡훢훣훤훥훦훧훨훩훪훫훬훭훮훯훰훱훲훳훴훵훶훷훸훹훺훻훼훽훾훿휀휁휂휃휄휅휆휇휈휉휊휋휌휍휎휏휐휑휒휓휔휕휖휗휘휙휚휛휜휝휞휟휠휡휢휣휤휥휦휧휨휩휪휫휬휭휮휯휰휱휲휳휴휵휶휷휸휹휺휻휼휽휾휿흀흁흂흃흄흅흆흇흈흉흊흋흌흍흎흏흐흑흒흓흔흕흖흗흘흙흚흛흜흝흞흟흠흡흢흣흤흥흦흧흨흩흪흫희흭흮흯흰흱흲흳흴흵흶흷흸흹흺흻흼흽흾흿힀힁힂힃힄힅힆힇히힉힊힋힌힍힎힏힐힑힒힓힔힕힖힗힘힙힚힛힜힝힞힟힠힡힢힣 2 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | //! Command-line interface for uroman-rs. 
2 | 3 | use clap::{Parser, ValueEnum}; 4 | use rustyline::DefaultEditor; 5 | use rustyline::error::ReadlineError; 6 | use std::io::{self, BufRead, BufReader, BufWriter, IsTerminal, Write}; 7 | use std::path::PathBuf; 8 | use std::{fs, time}; 9 | use thiserror::Error; 10 | use unicode_width::UnicodeWidthStr; 11 | use uroman::{RomFormat, RomanizationError, Uroman, rom_format}; 12 | 13 | #[derive(ValueEnum, Clone, Copy, Debug, Default)] 14 | enum CliRomFormat { 15 | #[default] 16 | Str, 17 | Edges, 18 | Alts, 19 | Lattice, 20 | } 21 | 22 | impl From for RomFormat { 23 | fn from(cli_format: CliRomFormat) -> Self { 24 | match cli_format { 25 | CliRomFormat::Str => RomFormat::Str, 26 | CliRomFormat::Edges => RomFormat::Edges, 27 | CliRomFormat::Alts => RomFormat::Alts, 28 | CliRomFormat::Lattice => RomFormat::Lattice, 29 | } 30 | } 31 | } 32 | 33 | #[derive(Error, Debug)] 34 | enum UromanError { 35 | #[error("Failed to open input file '{path}': {source}")] 36 | InputFileOpen { path: PathBuf, source: io::Error }, 37 | 38 | #[error("Failed to create output file '{path}': {source}")] 39 | OutputFileCreate { path: PathBuf, source: io::Error }, 40 | 41 | #[error(transparent)] 42 | Io(#[from] io::Error), 43 | 44 | #[error("REPL error: {0}")] 45 | Repl(#[from] ReadlineError), 46 | 47 | #[error("Romanization failed: {0}")] 48 | Romanization(#[from] RomanizationError), 49 | } 50 | 51 | #[derive(Parser, Debug)] 52 | #[command(author, version)] 53 | struct Cli { 54 | /// Direct text input to be romanized. 55 | #[arg(value_name = "DIRECT_INPUT")] 56 | direct_input: Vec, 57 | 58 | /// Input file path (default: stdin). 59 | #[arg(short, long, value_name = "FILE")] 60 | input_filename: Option, 61 | 62 | /// Output file path (default: stdout). 63 | #[arg(short, long, value_name = "FILE")] 64 | output_filename: Option, 65 | 66 | /// [ISO 639-3 language code](https://www.loc.gov/standards/iso639-2/php/code_list.php) (e.g., 'eng'). 
67 | #[arg(short = 'l', long)] 68 | lcode: Option, 69 | 70 | /// Output format of romanization. 'edges' provides offsets. 71 | #[arg(short = 'f', long, value_enum, default_value_t = CliRomFormat::default())] 72 | rom_format: CliRomFormat, 73 | 74 | /// Limit uroman to the first n lines of a file. 75 | #[arg(long)] 76 | max_lines: Option, 77 | 78 | /// Decodes Unicode escape notation, e.g., \\u03B4 to δ. 79 | #[arg(short = 'd', long, action = clap::ArgAction::SetTrue)] 80 | decode_unicode: bool, 81 | 82 | /// Enable parallel file processing. 83 | #[arg(short = 'p', long = "use-parallel", action = clap::ArgAction::SetTrue)] 84 | use_parallel: bool, 85 | 86 | /// Run and display a few samples. 87 | #[arg(long, action = clap::ArgAction::SetTrue)] 88 | sample: bool, 89 | 90 | /// Suppress progress indicators. 91 | #[arg(long, action = clap::ArgAction::SetTrue)] 92 | silent: bool, 93 | } 94 | 95 | fn main() { 96 | if let Err(err) = run() { 97 | if let UromanError::Io(e) = &err 98 | && e.kind() == io::ErrorKind::BrokenPipe 99 | { 100 | return; 101 | } 102 | 103 | eprintln!("Error: {err}"); 104 | std::process::exit(1); 105 | } 106 | } 107 | 108 | fn run() -> Result<(), UromanError> { 109 | let cli = Cli::parse(); 110 | let uroman = Uroman::new(); 111 | 112 | if cli.direct_input.is_empty() 113 | && cli.input_filename.is_none() 114 | && !cli.sample 115 | && std::io::stdin().is_terminal() 116 | { 117 | run_repl(&uroman, &cli)?; 118 | return Ok(()); 119 | } 120 | 121 | if cli.sample 122 | && cli.direct_input.is_empty() 123 | && cli.input_filename.is_none() 124 | && cli.output_filename.is_none() 125 | && !cli.silent 126 | { 127 | show_samples(&uroman)?; 128 | return Ok(()); 129 | } 130 | 131 | let mut writer = get_writer(&cli.output_filename)?; 132 | 133 | if !cli.direct_input.is_empty() { 134 | process_direct_input(&uroman, &cli, &mut writer)?; 135 | } 136 | 137 | if cli.input_filename.is_some() || cli.direct_input.is_empty() { 138 | process_stream(&uroman, &cli, &mut 
writer)?; 139 | } 140 | 141 | writer.flush()?; 142 | 143 | if cli.sample { 144 | println!( 145 | "Note: The --sample option was ignored because input was provided via other flags." 146 | ); 147 | } 148 | 149 | Ok(()) 150 | } 151 | 152 | fn process_direct_input( 153 | uroman: &Uroman, 154 | cli: &Cli, 155 | writer: &mut dyn Write, 156 | ) -> Result<(), UromanError> { 157 | let rom_format = Some(cli.rom_format.into()); 158 | let lcode = cli.lcode.as_deref(); 159 | for s in &cli.direct_input { 160 | let result = if !cli.decode_unicode { 161 | uroman.romanize_with_format(s, lcode, rom_format) 162 | } else { 163 | uroman.romanize_escaped_with_format(s, lcode, rom_format) 164 | }; 165 | writeln!(writer, "{}", result.to_string()?)?; 166 | } 167 | Ok(()) 168 | } 169 | 170 | fn process_stream(uroman: &Uroman, cli: &Cli, writer: &mut dyn Write) -> Result<(), UromanError> { 171 | let reader = get_reader(&cli.input_filename)?; 172 | 173 | if cli.use_parallel { 174 | uroman.romanize_file_parallel( 175 | reader, 176 | writer, 177 | cli.lcode.as_deref(), 178 | cli.rom_format.into(), 179 | cli.max_lines, 180 | cli.decode_unicode, 181 | cli.silent, 182 | )?; 183 | } else { 184 | uroman.romanize_file( 185 | reader, 186 | writer, 187 | cli.lcode.as_deref(), 188 | cli.rom_format.into(), 189 | cli.max_lines, 190 | cli.decode_unicode, 191 | cli.silent, 192 | )?; 193 | } 194 | Ok(()) 195 | } 196 | 197 | fn get_reader(path: &Option) -> Result, UromanError> { 198 | match path { 199 | Some(p) => { 200 | let file = fs::File::open(p).map_err(|e| UromanError::InputFileOpen { 201 | path: p.clone(), 202 | source: e, 203 | })?; 204 | Ok(Box::new(BufReader::new(file))) 205 | } 206 | None => Ok(Box::new(BufReader::new(io::stdin()))), 207 | } 208 | } 209 | 210 | fn get_writer(path: &Option) -> Result, UromanError> { 211 | match path { 212 | Some(p) => { 213 | let file = fs::File::create(p).map_err(|e| UromanError::OutputFileCreate { 214 | path: p.clone(), 215 | source: e, 216 | })?; 217 | 
Ok(Box::new(BufWriter::new(file))) 218 | } 219 | None => Ok(Box::new(BufWriter::new(io::stdout()))), 220 | } 221 | } 222 | 223 | fn run_repl(uroman: &Uroman, cli: &Cli) -> Result<(), UromanError> { 224 | let mut rl = DefaultEditor::new()?; 225 | 226 | let history_path = || -> Option { 227 | let mut path = dirs::cache_dir()?; 228 | path.push("uroman-rs"); 229 | std::fs::create_dir_all(&path).ok()?; 230 | path.push("history.txt"); 231 | Some(path) 232 | }; 233 | 234 | if let Some(path) = history_path() 235 | && rl.load_history(&path).is_err() 236 | {} 237 | 238 | let lcode = cli.lcode.as_deref(); 239 | 240 | loop { 241 | let readline = rl.readline(">> "); 242 | 243 | match readline { 244 | Ok(line) => { 245 | rl.add_history_entry(&line)?; 246 | 247 | if line.trim() == ":exit" || line.trim() == ":quit" { 248 | break; 249 | } 250 | 251 | if line.trim().is_empty() { 252 | continue; 253 | } 254 | 255 | match uroman 256 | .romanize_with_format(&line, lcode, Some(cli.rom_format.into())) 257 | .to_string() 258 | { 259 | Ok(output) => println!("{output}"), 260 | Err(e) => eprintln!("Error formatting output: {e}"), 261 | } 262 | } 263 | Err(ReadlineError::Interrupted) => { 264 | println!("Interrupted. 
To exit, press Ctrl-D or type :exit."); 265 | continue; 266 | } 267 | Err(ReadlineError::Eof) => { 268 | println!("Exiting."); 269 | break; 270 | } 271 | Err(err) => { 272 | eprintln!("REPL Error: {err}"); 273 | break; 274 | } 275 | } 276 | } 277 | 278 | if let Some(path) = history_path() 279 | && let Err(err) = rl.save_history(&path) 280 | { 281 | eprintln!("Warning: could not save history to {path:?}: {err}"); 282 | } 283 | 284 | Ok(()) 285 | } 286 | 287 | fn show_samples(uroman: &Uroman) -> Result<(), UromanError> { 288 | println!("Running sample conversions with uroman-rs:"); 289 | println!("---------------------------------------"); 290 | 291 | let samples = [ 292 | ("jpn", "一兆二千万四十二えん ほしい!"), 293 | ("amh", "ሰላም ልዑል!"), 294 | ("ara", "مرحبا بالعالم"), 295 | ("ell", "Καλημέρα, κόσμε."), 296 | ("heb", "שלום עולם"), 297 | ("hin", "नमस्ते दुनिया"), 298 | ("hye", "Բարև աշխարհ"), 299 | ("kor", "안녕하세요 세계"), 300 | ("rus", "Привет, мир! Как дела?"), 301 | ("tai", "สวัสดีชาวโลก"), 302 | ("ukr", "Привіт, світе!"), 303 | ("zho", "你好,世界!谢谢。"), 304 | ("", "¡Hola! 
¿Cómo estás?"), 305 | ("", "မင်္ဂလာပါ"), 306 | ("", "ལྷ་ས་གྲོང་ཁྱེར"), 307 | ("", "ສະບາຍດີ"), 308 | ("", "ᚑᚌᚐᚋ ᚛ᚅᚐᚋᚓ᚜"), 309 | ("", "ᐊᕐᕌᒍᒥ ᓄᑖᒥ ᖁᕕᐊᓱᒋᑦ"), 310 | ("", "გამარჯობა"), 311 | ("", "ಧನ್ಯವಾದಗಳು"), 312 | ("", "ⴰⵎⵢⴰ ⵉⵊⵊⴻⵏ ⵙⵉⵏ"), 313 | ("", "⠓⠑⠇⠇⠕ ⠺⠕⠗⠇⠙"), 314 | ("", "𓊪𓏏𓍯𓃭𓐝𓇌𓋴"), 315 | ("", "ᚺᚨᛚᛚᛟ ᚹᛟᚱᛚᛞ"), 316 | ("", "ꦧꦱꦗꦮ"), 317 | ("", "Tôi yêu tiếng Việt!"), 318 | ("", "✨ユーロマン✨(ウロマン)"), 319 | ]; 320 | 321 | let max_width = 29; 322 | let mut total_duration_ns: u128 = 0; 323 | 324 | for (lang_code, text) in samples.iter() { 325 | let start = time::Instant::now(); 326 | let romanized = uroman 327 | .romanize_string::(text, Some(lang_code)) 328 | .to_string(); 329 | let duration = start.elapsed(); 330 | total_duration_ns += duration.as_nanos(); 331 | 332 | let current_width = UnicodeWidthStr::width(*text); 333 | let padding = " ".repeat(max_width - current_width); 334 | if lang_code.is_empty() { 335 | println!(" {text}{padding} -> {romanized}"); 336 | } else { 337 | println!("[{lang_code}] {text}{padding} -> {romanized}"); 338 | } 339 | } 340 | 341 | println!("---------------------------------------"); 342 | 343 | let num_samples = samples.len() as u128; 344 | if num_samples > 0 { 345 | let avg_duration_ns = total_duration_ns / num_samples; 346 | let avg_duration_us = avg_duration_ns as f64 / 1_000.0; 347 | let avg_duration_ms = avg_duration_us / 1_000.0; 348 | 349 | println!( 350 | "Avg. processing time: {avg_duration_ms:.3} ms ({avg_duration_us:.1} μs) per sample" 351 | ); 352 | } 353 | 354 | Ok(()) 355 | } 356 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [2025] [Stellanomia] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /src/edge.rs: -------------------------------------------------------------------------------- 1 | use crate::core::{UromanInner, Value}; 2 | use num_rational::Ratio; 3 | use serde::Serialize; 4 | use std::hash::{Hash, Hasher}; 5 | 6 | #[derive(Debug, Clone, Serialize, PartialEq, PartialOrd)] 7 | pub struct EdgeData { 8 | pub start: usize, 9 | pub end: usize, 10 | pub txt: String, 11 | pub r#type: String, 12 | } 13 | 14 | #[derive(Debug, Clone, Default, PartialEq, PartialOrd, Serialize)] 15 | pub struct NumData { 16 | pub orig_txt: String, 17 | pub value: Option, 18 | pub fraction: Option>, 19 | pub num_base: Option, 20 | pub base_multiplier: Option, 21 | pub script: Option, 22 | pub is_large_power: bool, 23 | pub active: bool, 24 | pub value_s: Option, 25 | pub n_decimals: Option, 26 | } 27 | 28 | /// A dedicated struct for flexibly updating fields of a `NumData`. 29 | /// This mimics Python's keyword arguments, allowing partial updates. 
30 | #[derive(Default, Debug)] 31 | pub struct NumDataUpdates { 32 | pub value: Option, 33 | pub fraction: Option>, 34 | pub num_base: Option, 35 | pub base_multiplier: Option, 36 | pub r#type: Option, 37 | pub script: Option, 38 | pub is_large_power: Option, 39 | pub active: Option, 40 | pub n_decimals: Option, 41 | pub orig_txt: Option, 42 | pub value_s: Option, 43 | } 44 | 45 | /// A unified Edge type. 46 | #[derive(Debug, Clone, Serialize, PartialOrd)] 47 | pub enum Edge { 48 | Regular(EdgeData), 49 | Numeric { data: EdgeData, num_data: NumData }, 50 | } 51 | 52 | impl Hash for Edge { 53 | fn hash(&self, state: &mut H) { 54 | let d = self.get_data(); 55 | d.start.hash(state); 56 | d.end.hash(state); 57 | d.txt.hash(state); 58 | d.r#type.hash(state); 59 | } 60 | } 61 | 62 | impl PartialEq for Edge { 63 | fn eq(&self, other: &Self) -> bool { 64 | let d1 = self.get_data(); 65 | let d2 = other.get_data(); 66 | d1.start == d2.start && d1.end == d2.end && d1.txt == d2.txt && d1.r#type == d2.r#type 67 | } 68 | } 69 | 70 | impl Eq for Edge {} 71 | 72 | impl Edge { 73 | /// Creates a regular edge. 74 | pub fn new_regular(start: usize, end: usize, txt: String, r#type: String) -> Self { 75 | Edge::Regular(EdgeData { 76 | start, 77 | end, 78 | txt, 79 | r#type, 80 | }) 81 | } 82 | 83 | /// Creates an initial numeric edge from `uroman.num_props`. 
84 | pub(crate) fn new_numeric(start: usize, end: usize, char: char, uroman: &UromanInner) -> Option { 85 | let props_map = uroman.num_props.get(&char.to_string())?; 86 | 87 | let rom_text = props_map 88 | .get("rom") 89 | .and_then(|v| match v { 90 | Value::String(s) => Some(s.clone()), 91 | _ => None, 92 | }) 93 | .unwrap_or_else(|| char.to_string()); 94 | 95 | let value = props_map.get("value").and_then(|v| match v { 96 | Value::Int(i) => Some(*i as f64), 97 | Value::Float(f) => Some(*f), 98 | _ => None, 99 | }); 100 | 101 | let fraction = props_map.get("fraction").and_then(|v| match v { 102 | Value::String(s) => s 103 | .split_once('/') 104 | .and_then(|(num, den)| Some(Ratio::new(num.parse().ok()?, den.parse().ok()?))), 105 | Value::Array(arr) if arr.len() == 2 => { 106 | if let (Some(Value::Int(num)), Some(Value::Int(den))) = (arr.first(), arr.get(1)) { 107 | if *den != 0 { 108 | Some(Ratio::new(*num, *den)) 109 | } else { 110 | None 111 | } 112 | } else { 113 | None 114 | } 115 | }, 116 | _ => None, 117 | }); 118 | 119 | let r#type = props_map 120 | .get("type") 121 | .and_then(|v| match v { 122 | Value::String(s) => Some(s.clone()), 123 | _ => None, 124 | }) 125 | .unwrap_or_default(); 126 | 127 | let is_large_power = props_map 128 | .get("is-large-power") 129 | .is_some_and(|v| matches!(v, Value::Int(1))); 130 | 131 | let num_base = props_map.get("base").and_then(|v| match v { 132 | Value::Int(i) => Some(*i), 133 | _ => None, 134 | }); 135 | 136 | let base_multiplier = props_map.get("mult").and_then(|v| match v { 137 | Value::Int(i) => Some(*i as f64), 138 | Value::Float(f) => Some(*f), 139 | _ => None, 140 | }); 141 | 142 | let script = props_map.get("script").and_then(|v| match v { 143 | Value::String(s) => Some(s.clone()), 144 | _ => None, 145 | }); 146 | 147 | let edge = Edge::Numeric { 148 | data: EdgeData { 149 | start, 150 | end, 151 | txt: rom_text, 152 | r#type, 153 | }, 154 | num_data: NumData { 155 | orig_txt: char.to_string(), 156 | value, 157 
| fraction, 158 | num_base, 159 | base_multiplier, 160 | script, 161 | is_large_power, 162 | active: true, 163 | ..Default::default() 164 | }, 165 | }; 166 | Some(edge) 167 | } 168 | 169 | /// Creates a new combined numeric edge from multiple existing edges. 170 | /// 171 | /// # Arguments 172 | /// * `start` - The start position of the new edge. 173 | /// * `end` - The end position of the new edge. 174 | /// * `value` - The combined numeric value as an f64. 175 | /// * `e_type` - The type of the new edge (e.g., "D1", "G1", "G2"). 176 | /// * `script` - The script of the edge (optional). 177 | /// * `num_base` - The base of the new numeric edge (optional). 178 | /// * `n_decimals` - The number of decimal places (optional). 179 | /// * `orig_txt` - The original text (concatenation of orig_txt from previous edges). 180 | pub fn new_combined_numeric( 181 | start: usize, 182 | end: usize, 183 | value: f64, 184 | e_type: String, 185 | script: Option, 186 | num_base: Option, 187 | n_decimals: Option, 188 | orig_txt: String, // Accepts the combined original text. 189 | ) -> Self { 190 | let num_data = NumData { 191 | orig_txt, 192 | value: Some(value), 193 | num_base, 194 | script, 195 | is_large_power: false, 196 | active: true, 197 | n_decimals, 198 | ..Default::default() 199 | }; 200 | 201 | let mut edge = Edge::Numeric { 202 | data: EdgeData { 203 | start, 204 | end, 205 | txt: "".to_string(), 206 | r#type: e_type, 207 | }, 208 | num_data, 209 | }; 210 | 211 | edge.update(NumDataUpdates::default()); 212 | edge 213 | } 214 | 215 | /// Returns the original text (`orig_txt`) of the edge. 216 | /// For `Edge::Numeric`, it returns `orig_txt` from `NumData`. 217 | /// For `Edge::Regular`, it returns an empty string. 218 | pub fn orig_txt(&self) -> &str { 219 | match self { 220 | Edge::Numeric { num_data, .. } => &num_data.orig_txt, 221 | Edge::Regular(_) => "", // Regular edges don't have orig_txt, so return an empty string. 
222 | } 223 | } 224 | 225 | pub fn is_large_power(&self) -> bool { 226 | self.get_num_data().is_some_and(|d| d.is_large_power) 227 | } 228 | 229 | pub fn get_num_base(&self) -> Option { 230 | self.get_num_data().and_then(|d| d.num_base) 231 | } 232 | 233 | pub fn get_script(&self) -> Option { 234 | self.get_num_data().and_then(|d| d.script.clone()) 235 | } 236 | 237 | pub fn is_numeric(&self) -> bool { 238 | matches!(self, Edge::Numeric { .. }) 239 | } 240 | 241 | /// Updates the properties of a numeric edge and recalculates `txt` (the romanized representation) accordingly. 242 | pub fn update(&mut self, updates: NumDataUpdates) { 243 | if let Edge::Numeric { num_data, data } = self { 244 | // --- Update data from the `updates` struct --- 245 | if let Some(v) = updates.value { 246 | num_data.value = Some(v); 247 | } 248 | if let Some(v) = updates.fraction { 249 | num_data.fraction = Some(v); 250 | } 251 | if let Some(v) = updates.num_base { 252 | num_data.num_base = Some(v); 253 | } 254 | if let Some(v) = updates.base_multiplier { 255 | num_data.base_multiplier = Some(v); 256 | } 257 | if let Some(v) = updates.r#type { 258 | data.r#type = v; 259 | } // Update EdgeData's type 260 | if let Some(v) = updates.script { 261 | num_data.script = Some(v); 262 | } 263 | if let Some(v) = updates.is_large_power { 264 | num_data.is_large_power = v; 265 | } 266 | if let Some(v) = updates.active { 267 | num_data.active = v; 268 | } 269 | if let Some(v) = updates.n_decimals { 270 | num_data.n_decimals = Some(v); 271 | } 272 | if let Some(v) = updates.orig_txt { 273 | num_data.orig_txt = v; 274 | } 275 | if let Some(v) = updates.value_s { 276 | num_data.value_s = Some(v); 277 | } 278 | 279 | // --- Recalculate the display text (`txt`) after all updates --- 280 | self.recalculate_numeric_txt(); 281 | } 282 | } 283 | 284 | /// Helper function to recalculate the display text for a numeric edge based on its current data. 
285 | /// This should be called after any modification to `NumData`. 286 | fn recalculate_numeric_txt(&mut self) { 287 | if let Edge::Numeric { num_data, data } = self { 288 | // Determine the primary string for the value, prioritizing `value_s`. 289 | let value_s = if let Some(vs) = &num_data.value_s { 290 | vs.clone() 291 | } else if let Some(v) = num_data.value { 292 | if let Some(nd) = num_data.n_decimals { 293 | format!("{v:.nd$}") 294 | } else if v.fract() == 0.0 { 295 | (v as i64).to_string() 296 | } else { 297 | v.to_string() 298 | } 299 | } else { 300 | "".to_string() 301 | }; 302 | 303 | // Format the fraction part. 304 | let fraction_s = num_data 305 | .fraction 306 | .map(|f| format!("{}/{}", f.numer(), f.denom())) 307 | .unwrap_or_default(); 308 | 309 | let delimiter = if !value_s.is_empty() && !fraction_s.is_empty() { 310 | " " 311 | } else { 312 | "" 313 | }; 314 | 315 | let final_txt = format!("{value_s}{delimiter}{fraction_s}"); 316 | 317 | // Fallback to original text if the calculated text is empty. 318 | data.txt = if final_txt.is_empty() { 319 | num_data.orig_txt.clone() 320 | } else { 321 | final_txt 322 | }; 323 | } 324 | } 325 | 326 | // --- Accessors for common data --- 327 | pub fn get_data(&self) -> &EdgeData { 328 | match self { 329 | Edge::Regular(data) | Edge::Numeric { data, .. } => data, 330 | } 331 | } 332 | 333 | pub fn get_data_mut(&mut self) -> &mut EdgeData { 334 | match self { 335 | Edge::Regular(data) | Edge::Numeric { data, .. } => data, 336 | } 337 | } 338 | 339 | // --- Accessors for Numeric data (immutable/mutable) --- 340 | pub fn get_num_data(&self) -> Option<&NumData> { 341 | match self { 342 | Edge::Numeric { num_data, .. } => Some(num_data), 343 | _ => None, 344 | } 345 | } 346 | 347 | pub fn get_num_data_mut(&mut self) -> Option<&mut NumData> { 348 | match self { 349 | Edge::Numeric { num_data, .. 
} => Some(num_data), 350 | _ => None, 351 | } 352 | } 353 | 354 | pub fn start(&self) -> usize { 355 | self.get_data().start 356 | } 357 | pub fn end(&self) -> usize { 358 | self.get_data().end 359 | } 360 | pub fn txt(&self) -> &str { 361 | &self.get_data().txt 362 | } 363 | pub fn r#type(&self) -> &str { 364 | &self.get_data().r#type 365 | } 366 | 367 | pub fn is_active(&self) -> bool { 368 | self.get_num_data().is_none_or(|d| d.active) 369 | } 370 | 371 | pub fn set_active(&mut self, active: bool) { 372 | if let Some(d) = self.get_num_data_mut() { 373 | d.active = active; 374 | } 375 | } 376 | 377 | pub fn value(&self) -> Option { 378 | self.get_num_data().and_then(|d| d.value) 379 | } 380 | } 381 | -------------------------------------------------------------------------------- /tests/unit_tests.rs: -------------------------------------------------------------------------------- 1 | use uroman::{Uroman, rom_format}; 2 | 3 | #[track_caller] 4 | fn assert_romanizes_to_str(uroman: &Uroman, input: &str, lcode: Option<&str>, expected_str: &str) { 5 | let result = uroman.romanize_string::(input, lcode); 6 | 7 | assert_eq!(result.to_string(), expected_str); 8 | } 9 | 10 | #[track_caller] 11 | fn assert_romanizes_to_str_with_decode( 12 | uroman: &Uroman, 13 | input: &str, 14 | lcode: Option<&str>, 15 | expected_str: &str, 16 | ) { 17 | let result = uroman.romanize_escaped::(input, lcode); 18 | 19 | assert_eq!(result.to_string(), expected_str); 20 | } 21 | 22 | #[test] 23 | fn test_simple_romanization() { 24 | let uroman = Uroman::new(); 25 | 26 | // Test a simple ASCII character 27 | assert_romanizes_to_str(&uroman, "A", None, "A"); 28 | assert_romanizes_to_str(&uroman, "z", None, "z"); 29 | 30 | // Test a character with a direct mapping from romanization-auto-table.txt (e.g., Greek Beta) 31 | // This assumes 'β' maps to 'b' in the auto-generated table 32 | assert_romanizes_to_str(&uroman, "β", None, "b"); 33 | 34 | // Test a character with a direct mapping from 
UnicodeDataOverwrite.txt (e.g., Egyptian Hieroglyph) 35 | // This assumes '𓍧' maps to '600' in the overwrite table 36 | assert_romanizes_to_str(&uroman, "𓍧", None, "600"); 37 | } 38 | 39 | #[test] 40 | fn test_romanize_with_decode_unicode_escapes() { 41 | let uroman = Uroman::new(); 42 | 43 | assert_romanizes_to_str_with_decode(&uroman, "fran\\xE7ais", Some("fra"), "fransais"); 44 | 45 | // `Р` (U+0420), `у` (U+0443), `с` (U+0441), `к` (U+043A), `и` (U+0438), `й` (U+0439) 46 | let russian_escaped = "\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439"; 47 | assert_romanizes_to_str_with_decode(&uroman, russian_escaped, Some("rus"), "Russky"); 48 | 49 | // `你` (U+4F60), `好` (U+597D) 50 | assert_romanizes_to_str_with_decode(&uroman, "\\u4F60\\u597D", Some("zho"), "nihao"); 51 | 52 | // emoji: "😀" (U+1F600) 53 | assert_romanizes_to_str_with_decode(&uroman, "\\U0001F600", None, "😀"); 54 | 55 | assert_romanizes_to_str_with_decode(&uroman, "H\\x45LLO", None, "H\\x45LLO"); 56 | 57 | assert_romanizes_to_str(&uroman, "fran\\xE7ais", Some("fra"), "fran\\xE7ais"); 58 | } 59 | 60 | #[test] 61 | fn test_ascii_passthrough() { 62 | let uroman = Uroman::new(); 63 | 64 | assert_romanizes_to_str(&uroman, "Hello World!", None, "Hello World!"); 65 | assert_romanizes_to_str(&uroman, "12345", None, "12345"); 66 | assert_romanizes_to_str(&uroman, "!@#$%^&*()", None, "!@#$%^&*()"); 67 | } 68 | 69 | #[test] 70 | fn test_kanji_number() { 71 | let uroman = Uroman::new(); 72 | 73 | assert_romanizes_to_str(&uroman, "六万五百三", None, "60503"); 74 | assert_romanizes_to_str(&uroman, "二千万四十二", None, "20000042"); 75 | assert_romanizes_to_str(&uroman, "八億五万一千二", None, "800051002"); 76 | } 77 | 78 | #[test] 79 | fn test_fractions() { 80 | let uroman = Uroman::new(); 81 | 82 | assert_romanizes_to_str(&uroman, "½", None, "1/2"); 83 | assert_romanizes_to_str(&uroman, "¼", None, "1/4"); 84 | assert_romanizes_to_str(&uroman, "¾", None, "3/4"); 85 | assert_romanizes_to_str(&uroman, "23½", None, "23 1/2"); 86 | 
assert_romanizes_to_str(&uroman, "1¼", None, "1 1/4"); 87 | assert_romanizes_to_str(&uroman, "abc½", None, "abc1/2"); 88 | assert_romanizes_to_str(&uroman, "½¼", None, "1/2 1/4"); 89 | } 90 | 91 | #[test] 92 | fn test_chinese_fractions_and_percentages() { 93 | let uroman = Uroman::new(); 94 | 95 | assert_romanizes_to_str(&uroman, "百分之一", None, "1%"); 96 | assert_romanizes_to_str(&uroman, "百分之五", None, "5%"); 97 | 98 | assert_romanizes_to_str(&uroman, "十分之一", None, "1/10"); 99 | assert_romanizes_to_str(&uroman, "三分之二", None, "2/3"); 100 | 101 | assert_romanizes_to_str(&uroman, "零分之五", None, "0fenzhi5"); 102 | assert_romanizes_to_str( 103 | &uroman, 104 | "今年的增长率是百分之多少?一些分析师认为会更高。", 105 | None, 106 | "jinniandezengzhanglushibaifenzhiduoshao? 1xiefenxishirenweihuigenggao. " 107 | ); 108 | } 109 | 110 | #[test] 111 | fn test_tibetan_edge_cases() { 112 | let uroman = Uroman::new(); 113 | 114 | assert_romanizes_to_str(&uroman, "བཟང", None, "bzang"); 115 | assert_romanizes_to_str(&uroman, "འ", None, "'a"); 116 | assert_romanizes_to_str(&uroman, "ཉིན", None, "nyin"); 117 | assert_romanizes_to_str(&uroman, "འདིའི་", None, "'di'i·"); 118 | assert_romanizes_to_str(&uroman, "འདིའི་", None, "'di'i·"); 119 | assert_romanizes_to_str(&uroman, "འི", None, "i"); 120 | assert_romanizes_to_str(&uroman, "འཁྲིད", None, "'khrid"); 121 | assert_romanizes_to_str(&uroman, "ངའི་ཕའི་དཔེ་དེབ།", None, "nga'i·pha'i·dpe·deb,"); 122 | assert_romanizes_to_str(&uroman, "བསྒྲུབས", None, "bsgrubs"); 123 | assert_romanizes_to_str(&uroman, "ཨ་མདོ", None, "a·mdo"); 124 | assert_romanizes_to_str(&uroman, "འེ", None, "'e"); 125 | } 126 | 127 | #[test] 128 | fn test_robustness_and_complex_fallbacks() { 129 | let uroman = Uroman::new(); 130 | 131 | assert_romanizes_to_str(&uroman, "百分之", None, "baifenzhi"); 132 | assert_romanizes_to_str(&uroman, "零分之", None, "0fenzhi"); 133 | assert_romanizes_to_str(&uroman, "十分之泰", None, "10fenzhitai"); 134 | 135 | assert_romanizes_to_str(&uroman, "分之", None, "fenzhi"); 136 
| assert_romanizes_to_str(&uroman, "零分之½ไม่มี-๑๒๓%", None, "0fenzhi1/2maimii-123%"); 137 | assert_romanizes_to_str(&uroman, "測試一百分之", None, "ceshi100fenzhi"); 138 | assert_romanizes_to_str(&uroman, "100分之50", None, "50%"); 139 | } 140 | 141 | #[test] 142 | fn test_deu() { 143 | let uroman = Uroman::new(); 144 | 145 | assert_romanizes_to_str(&uroman, "Grüße", Some("deu"), "Gruesse"); 146 | assert_romanizes_to_str(&uroman, "Schön", Some("deu"), "Schoen"); 147 | assert_romanizes_to_str(&uroman, "Fußball", Some("deu"), "Fussball"); 148 | assert_romanizes_to_str( 149 | &uroman, 150 | "Grüße aus Bordeaux", 151 | Some("deu"), 152 | "Gruesse aus Bordeaux", 153 | ); 154 | } 155 | 156 | #[test] 157 | fn test_tur() { 158 | let uroman = Uroman::new(); 159 | 160 | assert_romanizes_to_str( 161 | &uroman, 162 | "İstanbul, Türkiye'de yer alan şehir ve ülkenin 81 ilinden biri.", 163 | Some("tur"), 164 | "Istanbul, Tuerkiye'de yer alan shehir ve uelkenin 81 ilinden biri.", 165 | ); 166 | } 167 | 168 | #[test] 169 | fn test_eng_braille() { 170 | let uroman = Uroman::new(); 171 | assert_romanizes_to_str( 172 | &uroman, 173 | "⠠⠺⠑⠀⠓⠕⠇⠙⠀⠘⠮⠀⠞⠗⠥⠹⠎⠀⠞⠕⠀⠆⠀⠎⠑⠇⠋⠤⠑⠧⠊⠙⠢⠞⠂⠀⠞⠀⠁⠇⠇⠀⠍⠑⠝⠀⠜⠑⠀⠉⠗⠂⠞⠫⠀⠑⠟⠥⠁⠇⠂⠀⠞⠀⠮⠽⠀⠜⠑⠀⠑⠝⠙⠪⠫⠀⠃⠽⠀⠸⠮⠀⠠⠉⠗⠑⠁⠞⠕⠗⠀⠾⠀⠉⠻⠞⠁⠔⠀⠥⠝⠁⠇⠊⠑⠝⠁⠃⠇⠑⠀⠠⠐⠗⠎⠂⠀⠞⠀⠁⠍⠰⠛⠀⠘⠮⠀⠜⠑⠀⠠⠇⠊⠋⠑⠂⠀⠠⠇⠊⠃⠻⠞⠽⠀⠯⠀⠮⠀⠏⠥⠗⠎⠥⠊⠞⠀⠷⠀⠠⠓⠁⠏⠏⠊⠰⠎⠲", 174 | Some("eng"), 175 | "We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.", 176 | ); 177 | } 178 | 179 | #[test] 180 | fn test_ell() { 181 | let uroman = Uroman::new(); 182 | assert_romanizes_to_str( 183 | &uroman, 184 | "Το Λος Άντζελες (στα ισπανικά Los Angeles = Οι Άγγελοι) ή στην Αμερικανική αργκό L.A., ελ έι) είναι η δεύτερη μεγαλύτερη πόλη των Ηνωμένων Πολιτειών από άποψη πληθυσμού, καθώς και ένα από τα σημαντικότερα οικονομικά, πολιτιστικά επιστημονικά και ψυχαγωγικά κέντρα του κόσμου.", 185 | Some("ell"), 186 | "To Los Andzeles 
(sta ispanika Los Angeles = Oi Angeloi) e sten Amerikanike arngo L.A., el ei) einai e deutere megalutere pole ton Enomenon Politeion apo apopse plethysmou, kathos kai ena apo ta semandikotera oikonomika, politistika epistemonika kai psychagogika kendra tou kosmou.", 187 | ); 188 | } 189 | 190 | #[test] 191 | fn test_rus() { 192 | let uroman = Uroman::new(); 193 | assert_romanizes_to_str( 194 | &uroman, 195 | "Герма́ния (нем. Deutschland), официальное название — Федерати́вная Респу́блика Герма́ния (нем. Bundesrepublik Deutschland), ФРГ (нем. BRD) — государство в Западной Европе. Площадь территории — 357 021 км². Численность населения по переписи 2011 года — более 80 миллионов человек. [2][6].", 196 | Some("rus"), 197 | "Germaniya (nem. Deutschland), ofitsialnoye nazvaniye — Federativnaya Respublika Germaniya (nem. Bundesrepublik Deutschland), FRG (nem. BRD) — gosudarstvo v Zapadnoy Yevrope. Ploshchad territorii — 357 021 km². Chislennost naseleniya po perepisi 2011 goda — boleye 80 millionov chelovek. [2][6].", 198 | ); 199 | } 200 | 201 | #[test] 202 | fn test_ukr() { 203 | let uroman = Uroman::new(); 204 | assert_romanizes_to_str( 205 | &uroman, 206 | "Володи́мир Олекса́ндрович Зеле́нський (нар. 25 січня 1978, Кривий Ріг) — український державний діяч, політик, шоумен, актор, комік, режисер, продюсер та сценарист, шостий Президент України з 20 травня 2019 року.", 207 | Some("ukr"), 208 | "Volodymyr Oleksandrovych Zelensky (nar. 25 sichnya 1978, Kryvy Rih) — ukrayinsky derzhavny diyach, polityk, shoumen, aktor, komik, rezhyser, prodyuser ta stsenaryst, shosty Prezydent Ukrayiny z 20 travnya 2019 roku.", 209 | ); 210 | } 211 | 212 | #[test] 213 | fn test_srp() { 214 | let uroman = Uroman::new(); 215 | assert_romanizes_to_str( 216 | &uroman, 217 | "Сва људска бића рађају се слободна и једнака у достојанству и правима. 
Она су обдарена разумом и свешћу и треба једни према другима да поступају у духу братства.", 218 | Some("srp"), 219 | "Sva ljudska bitsha radjaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sveshtshu i treba jedni prema drugima da postupaju u duhu bratstva.", 220 | ); 221 | } 222 | 223 | #[test] 224 | fn test_ara() { 225 | let uroman = Uroman::new(); 226 | assert_romanizes_to_str( 227 | &uroman, 228 | "كندا (بالإنجليزية: Canada) هي دولة في أمريكا الشمالية تتألف من 10 مقاطعات وثلاثة أقاليم. تقع في القسم الشمالي من القارة وتمتد من المحيط الأطلسي في الشرق إلى المحيط الهادئ في الغرب وتمتد شمالاً في المحيط المتجمد الشمالي. كندا هي البلد الثاني عالمياً من حيث المساحة الكلية. كما أن حدود كندا المشتركة مع الولايات المتحدة من الجنوب والشمال الغربي هي الأطول في العالم.", 229 | Some("ara"), 230 | "knda (balinjlyzya: Canada) hy dwla fy amryka alshmalya ttalf mn 10 mqat'at wthlatha aqalym. tq' fy alqsm alshmaly mn alqara wtmtd mn almhyt alatlsy fy alshrq ila almhyt alhadye fy alghrb wtmtd shmalan fy almhyt almtjmd alshmaly. knda hy albld althany 'almyan mn hyth almsaha alklya. kma an hdwd knda almshtrka m' alwlayat almthda mn aljnwb walshmal alghrby hy alatwl fy al'alm.", 231 | ); 232 | } 233 | 234 | #[test] 235 | fn test_fas() { 236 | let uroman = Uroman::new(); 237 | assert_romanizes_to_str( 238 | &uroman, 239 | "کالیفرنیا (به انگلیسی: California) ایالتی در غرب آمریکا بر کرانهٔ اقیانوس آرام است. مرکز آن ساکرامنتو و شهرهای مهم آن لس‌آنجلس، سن دیگو، سن خوزه و سان‌فرانسیسکو هستند.همچنین این ایالت پر جمعیت ترین ایالت امریکا است.", 240 | Some("fas"), 241 | "kalifrnia (be anglisi: California) ialti dr ghrb amrika br kraneye aqianvs aram ast. 
mrkz an sakramntv v shhrhai mhm an lsanjls, sn digv, sn khvze v sanfransiskv hstnd.hmchnin in ialt pr jmit trin ialt amrika ast.", 242 | ); 243 | } 244 | 245 | #[test] 246 | fn test_uig() { 247 | let uroman = Uroman::new(); 248 | assert_romanizes_to_str( 249 | &uroman, 250 | "ئامېرىكا قوشما شتاتلىرى بولسا شىمالىي ئامېرىكاغا جايلاشقان بىر دۆلەت. ئۇنىڭ پايتەختى بولسا ۋاشىנגتون، ئەڭ چوڭ شەھىرى بولسا نيۇيورك شەھىرى. دۆلەت تىلى بولسا ئېנגلىزتىلى. ھازىرقى زۇڭتۇڭ باراك ئوباما. بۇ دۆلەت ئەسلىدە ئەنگىلىيەنىڭ مۇستەملىكىسى بولۇپ ۋاشىנגىتوننىڭ رەھپەرلىكىدە 1776 يىلى 7 ئاينىڭ 4 كۇنى مۇستەقىل بولغان، يەر مەيدانى 9 مىلىيون 826 مىڭ 630 كۋادىرات كلومېتىر، نوپۇسى 306 مىللىيون 142 مىڭ، بۇلارنىڭ ئاسساسلىق دىنى خرىستىئان دىنى.", 251 | Some("uig"), 252 | "amerika qoshma shtatliri bolsa shimaliy amerikagha jaylashqan bir doelet. uning paytexti bolsa washington, eng chong shehiri bolsa nyuyork shehiri. doelet tili bolsa engliztili. hazirqi zungtung barak obama. bu doelet eslide engiliening mustemlikisi bolup washingitonning rehperlikide 1776 yili 7 ayning 4 kuni musteqil bolghan, er meydani 9 miliyon 826 ming 630 kwadirat klometir, nopusi 306 milliyon 142 ming, bularning assasliq dini xristian dini.", 253 | ); 254 | } 255 | 256 | #[test] 257 | fn test_amh() { 258 | let uroman = Uroman::new(); 259 | assert_romanizes_to_str( 260 | &uroman, 261 | "ኢትዮጵያ ከዓለም ሶስቱ ትልቅ የአብርሃም ሀይማኖቶች ጋር ታሪካዊ ግንኙነት አላት።", 262 | Some("amh"), 263 | "iteyopheyaa kaaalame sosetu teleqe yaaberehaame hayemaanotoche gaare taarikaawi genenyunate alaate.", 264 | ); 265 | } 266 | 267 | #[test] 268 | fn test_hin() { 269 | let uroman = Uroman::new(); 270 | assert_romanizes_to_str( 271 | &uroman, 272 | "कैलिफ़ोर्निया शब्द का पहला अर्थ था जो क्षेत्र जहाँ आज बाहा कैलिफ़ोर्निया प्रायद्वीप, नेवाडा, यूटा और एरिज़ोना, नया मेक्सिको, और वायोमिंग के कई विभाग स्थित हैं।", 273 | Some("hin"), 274 | "kailiforniyaa shabda kaa pahalaa artha thaa jo kssetra jahaam aaj baahaa kailiforniyaa praayadviip, nevaaddaa, yuuttaa aur erizonaa, nayaa 
meksiko, aur vaayomimg ke kaii vibhaag sthit haim.", 275 | ); 276 | } 277 | 278 | #[test] 279 | fn test_mar() { 280 | let uroman = Uroman::new(); 281 | assert_romanizes_to_str( 282 | &uroman, 283 | "लंडन (इंग्लिश: London ) हे इंग्लंडचे व युनायटेड किंग्डमचे राजधानीचे व सर्वात मोठे शहर तसेच युरोपियन संघामधील सर्वात मोठे महानगर क्षेत्र आहे.", 284 | Some("mar"), 285 | "lamddan (imglish: London ) he imglamddace va yunaayattedd kimgddamace raajadhaaniice va sarvaat motthe shahar tasec yuropiyan samghaamadhiil sarvaat motthe mahaanagar kssetra aahe.", 286 | ); 287 | } 288 | 289 | #[test] 290 | fn test_nep() { 291 | let uroman = Uroman::new(); 292 | assert_romanizes_to_str( 293 | &uroman, 294 | "यसको उचाइ समुन्द्र सतहबाट ८,८४८ मीटर (२९,०२८ फीट) छ। यो नेपालको सोलुखुम्बु जिल्लाको खुम्जुङ्ग गा. वि. स. मा पर्छ ।", 295 | Some("nep"), 296 | "yasako ucaai samundra satahabaatt 8,848 miittar (29,028 phiitt) cha. yo nepaalako solukhumbu jillaako khumjungga gaa. vi. sa. maa parcha .", 297 | ); 298 | } 299 | 300 | #[test] 301 | fn test_tam() { 302 | let uroman = Uroman::new(); 303 | assert_romanizes_to_str( 304 | &uroman, 305 | "தமிழ்நாடு (Tamil Nadu) இந்தியாவின் 29 மாநிலங்களில் ஒன்றாகும். தமிழ்நாடு, தமிழகம் என்றும் பரவலாக அழைக்கப்படுகிறது.", 306 | Some("tam"), 307 | "tamilnaadu (Tamil Nadu) intiyaavin 29 maanilangkalil onraakum. 
tamilnaadu, tamilakam enrum paravalaaka alaikkappadukiratu.", 308 | ); 309 | } 310 | 311 | #[test] 312 | fn test_mal() { 313 | let uroman = Uroman::new(); 314 | assert_romanizes_to_str( 315 | &uroman, 316 | "ഇന്ത്യയുടെ തെക്കുപടിഞ്ഞാറെ അറ്റത്തുള്ള സംസ്ഥാനമാണ് കേരളം.", 317 | Some("mal"), 318 | "intyayutte tekkupattinynyaarre arrrrattulllla samsthaanamaann keerallam.", 319 | ); 320 | } 321 | 322 | #[test] 323 | fn test_ori() { 324 | let uroman = Uroman::new(); 325 | assert_romanizes_to_str( 326 | &uroman, 327 | r###"ଓଡ଼ିଶା ଭାରତର ପୂର୍ବ ଉପକୂଳରେ ଥିବା ଏକ ପ୍ରଶାସନିକ ରାଜ୍ୟ । ଏହାର ଉତ୍ତର-ପୂର୍ବରେ ପଶ୍ଚିମବଙ୍ଗ, ଉତ୍ତରରେ ଝାଡ଼ଖଣ୍ଡ, ପଶ୍ଚିମ ଓ ଉତ୍ତର-ପଶ୍ଚିମରେ ଛତିଶଗଡ଼, ଦକ୍ଷିଣ ଓ ଦକ୍ଷିଣ-ପଶ୍ଚିମରେ ଆନ୍ଧ୍ରପ୍ରଦେଶ ଅବସ୍ଥିତ । ଏହା ଆୟତନ ହିସାବରେ ନବମ ଓ ଜନସଂଖ୍ୟା ହିସାବରେ ଏଗାରତମ ରାଜ୍ୟ । ଓଡ଼ିଆ ଭାଷା ରାଜ୍ୟର ସରକାରୀ ଭାଷା । ୨୦୦୧ ଜନଗଣନା ଅନୁସାରେ ରାଜ୍ୟର ପ୍ରାୟ ୩୩.୨ ନିୟୁତ ଲୋକ ଓଡ଼ିଆ ଭାଷା ବ୍ୟବହାର କରନ୍ତି । "###.trim(), 328 | Some("ori"), 329 | r###"oddishaa bhaaratara puurba upakuullare thibaa eka prashaasanika raajya . ehaara uttara-puurbare pashcimabangga, uttarare jhaaddakhanndda, pashcima o uttara-pashcimare chatishagadda, dakssinna o dakssinna-pashcimare aandhrapradesha abasthita . ehaa aayatana hisaabare nabama o janasamkhyaa hisaabare egaaratama raajya . oddiaa bhaassaa raajyara sarakaarii bhaassaa . 2001 janagannanaa anusaare raajyara praaya 33.2 niyuta loka oddiaa bhaassaa byabahaara karanti . "###.trim(), 330 | ); 331 | } 332 | 333 | #[test] 334 | fn test_zho() { 335 | let uroman = Uroman::new(); 336 | assert_romanizes_to_str( 337 | &uroman, 338 | "加拿大在一万四千年前即有原住民在此生活。", 339 | Some("zho"), 340 | "jianadazai14000nianqianjiyouyuanzhuminzaicishenghuo. 
", 341 | ); 342 | } 343 | 344 | #[test] 345 | fn test_heb() { 346 | let uroman = Uroman::new(); 347 | assert_romanizes_to_str( 348 | &uroman, 349 | "כֹּל עוֹד בַּלֵּבָב פְּנִימָה נֶפֶשׁ יְהוּדִי הוֹמִיָּה וּלְפַאֲתֵי מִזְרָח, קָדִימָה, עַיִן לְצִיּוֹן צוֹפִיָּה, עוֹד לֹא אָבְדָה תִּקְוָתֵנוּ, הַתִּקְוָה בַּת שְׁנוֹת אַלְפַּיִם לִהְיוֹת עַם חָפְשִׁי בְּאַרְצֵנוּ, אֶרֶץ צִיּוֹן וִירוּשָׁלַיִם.", 350 | Some("heb"), 351 | "kol 'od balevav penimah nefesh yehudi homiyah ulefa'ate mizerach, qadimah, 'ayin letsiyon tsofiyah, 'od lo avedah tiqvatenu, hatiqvah bat shenot 'alepayim liheyot 'am chafeshiy be'aretsenu, erets tsiyon virushalayim.", 352 | ); 353 | } 354 | 355 | #[test] 356 | fn test_yid() { 357 | let uroman = Uroman::new(); 358 | assert_romanizes_to_str( 359 | &uroman, 360 | "דווקא איז אן העברעישער זשורנאל וואס באשרייבט די יידיש־שפראכיקע קולטור. עס איז דערשינען געווארן תמוז ה'תשס\"ז (יולי 2006).", 361 | Some("yid"), 362 | "duuka yz an hebreysher zhurnal was bashreybt dy eydysh-shfrachyke kultur. es yz dershynen gewarn smuz h'sshs\"z (yuly 2006).", 363 | ); 364 | } 365 | 366 | #[test] 367 | fn test_hye() { 368 | let uroman = Uroman::new(); 369 | assert_romanizes_to_str( 370 | &uroman, 371 | "Տալնոեի շրջան (ուկր.՝ Тальнівський район), շրջան Ուկրաինայի Չերկասիի մարզում։ Ստեղծվել է 1923 թվականին։ Վարչական կենտրոնը՝ Տալնոե։ Աշխարհագրությունը Շրջանի տարածքի մակերեսը կազմում է 917 կմ²։ Բնակչություն", 372 | Some("hye"), 373 | "Talnoei shrjan (ukr., Talnivsky raion), shrjan Ukrainayi Cherkasii marzum. Steghtsvel e 1923 tvakanin. Varchakan kentrone, Talnoe. Ashkharhagrutyune Shrjani taratski makerese kazmum e 917 km². 
Bnakchutyun", 374 | ); 375 | } 376 | 377 | #[test] 378 | // #[ignore = "skill issue"] 379 | fn test_tai() { 380 | let uroman = Uroman::new(); 381 | assert_romanizes_to_str( 382 | &uroman, 383 | "มีประเทศอิสระ 2 ประเทศ คือ ซานมารีโนและนครรัฐวาติกัน เป็นดินแดนที่ล้อมรอบไปด้วยพื้นที่ของอิตาลี ในขณะที่เมืองกัมปีโอเนดีตาเลีย เป็นดินแดนส่วนแยกของอิตาลีที่ถูกล้อมรอบด้วยพื้นที่ประเทศสวิตเซอร์แลนด์", 384 | Some("tai"), 385 | "miiprathetitra 2 prathet khuee saanmaariinolaenkhanatwaatikan pendindaenthiilomroppaiduaiphueenthiikhongitaalii naiknathiimueangkampiionediitaalia pendindaensuanyaekkhongitaaliithiithuuklomropduaiphueenthiiprathetswitsoelaen", 386 | ); 387 | } 388 | 389 | #[test] 390 | fn test_generic_korean() { 391 | let uroman = Uroman::new(); 392 | assert_romanizes_to_str( 393 | &uroman, 394 | "북쪽에는 인도네시아와 동티모르, 파푸아 뉴기니, 북동쪽에는 솔로몬 제도와 바누아투, 누벨칼레도니, 그리고 남동쪽에는 뉴질랜드가 있다.", 395 | None, 396 | "bugjjogeneun indonesiawa dongtimoreu, papua nyugini, bugdongjjogeneun solromon jedowa banuatu, nubelkalredoni, geurigo namdongjjogeneun nyujilraendeuga issda.", 397 | ); 398 | } 399 | 400 | #[test] 401 | fn test_generic_kannada() { 402 | let uroman = Uroman::new(); 403 | assert_romanizes_to_str( 404 | &uroman, 405 | "ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು ಇಂದೆನ್ನ ಹೃದಯದಲಿ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗೀ... ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗಿ ಭವ ಭವದಿ ಭತಿಸಿಹೇ ಭವತಿ ದೂರ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ || ಬಾ ಇಲ್ಲಿ ||", 406 | None, 407 | "baa illi sambhavisu imdenna hrdayadali nityavuu avataripa satyaavataara mannnnaagi maravaagi migavaagi kagavaagii... 
mannnnaagi maravaagi migavaagi kagavaagi bhava bhavadi bhatisihee bhavati duura nityavuu avataripa satyaavataara || baa illi ||", 408 | ); 409 | } 410 | 411 | #[test] 412 | fn test_generic_georgian() { 413 | let uroman = Uroman::new(); 414 | assert_romanizes_to_str( 415 | &uroman, 416 | "ვეპხის ტყაოსანი შოთა რუსთაველი ღმერთსი შემვედრე, ნუთუ კვლა დამხსნას სოფლისა შრომასა, ცეცხლს, წყალსა და მიწასა, ჰაერთა თანა მრომასა; მომცנეს ფრთენი და აღვფრინდე, მივჰხვდე მას ჩემსა ნდომასა, დღისით და ღამით ვჰხედვიდე მზისა ელვათა კრთომაასა.", 417 | None, 418 | "vepxis tqaosani shota rustaveli ghmertsi shemvedre, nutu kvla damxsnas sophlisa shromasa, tsetsxls, tsqalsa da mitsasa, haerta tana mromasa; momtsnes phrteni da aghvphrinde, mivhxvde mas chemsa ndomasa, dghisit da ghamit vhxedvide mzisa elvata krtomaasa.", 419 | ); 420 | } 421 | 422 | #[test] 423 | fn test_generic_ogham() { 424 | let uroman = Uroman::new(); 425 | assert_romanizes_to_str( 426 | &uroman, 427 | "᚛ᚐᚅᚋ ᚋᚖᚂᚓᚌᚖᚋᚏᚔᚇ ᚋᚐᚉᚔ ᚍᚓᚉᚒᚋᚓᚅ᚜", 428 | None, 429 | "anm moilegoimrid maki vekumen", 430 | ); 431 | } 432 | 433 | #[test] 434 | fn test_generic_runic() { 435 | let uroman = Uroman::new(); 436 | assert_romanizes_to_str( 437 | &uroman, 438 | "ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬", 439 | None, 440 | "ic mag glas eotan ond hit ne hearmiath me.", 441 | ); 442 | } 443 | 444 | #[test] 445 | fn test_generic_egyptian_hieroglyphs() { 446 | let uroman = Uroman::new(); 447 | assert_romanizes_to_str(&uroman, "𓊪𓏏𓍯𓃭𓐝𓇌𓋴", None, "ptolmys"); 448 | } 449 | 450 | #[test] 451 | fn test_generic_japanese() { 452 | let uroman = Uroman::new(); 453 | assert_romanizes_to_str(&uroman, "チェコスロバキア", None, "chekosurobakia"); 454 | } 455 | 456 | #[test] 457 | fn test_generic_tibetan() { 458 | let uroman = Uroman::new(); 459 | assert_romanizes_to_str(&uroman, "ལྷ་ས་གྲོང་ཁྱེར", None, "lha·sa·grong·khyer"); 460 | } 461 | 462 | #[test] 463 | fn test_generic_inuktitut() { 464 | let uroman = Uroman::new(); 465 | assert_romanizes_to_str( 466 | &uroman, 467 | 
"ᓵᓕ ᓴᕕᐊᕐᔪᒃ ᐃᒻᒥᓂᒃ ᓂᓪᓕᕈᑎᖃᓲᖑᕗᖅ ᑕᐃᑦᓱᒪᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ. ᐃᒻᒥᓂᓪᓗᑕᐅᖅ ᓂᓪᓕᕈᑎᖃᓱᖑᒻᒥᓱᓂ ᐅᓪᓗᒥᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ.", 468 | None, 469 | "saali safiaryok imminik nillirotiqasoongofoq taitsomanitatsayaonirarsoni. imminillotaoq nillirotiqasongommisoni ollominitatsayaonirarsoni.", 470 | ); 471 | } 472 | 473 | #[test] 474 | fn test_generic_tifinagh() { 475 | let uroman = Uroman::new(); 476 | assert_romanizes_to_str( 477 | &uroman, 478 | "ⴰⵎⴰⴳⵔⴰⴷ 1 ⴰⵔ ⴷ ⵜⵜⵍⴰⵍⴰⵏ ⵎⵉⴷⴷⵏ ⴳⴰⵏ ⵉⵍⴻⵍⵍⵉⵜⵏ ⵎⴳⴰⴷⴷⴰⵏ ⵖ ⵡⴰⴷⴷⵓⵔ ⴷ ⵉⵣⵔⴼⴰⵏ, ⵢⵉⵍⵉ ⴰⴽⵯ ⴷⴰⵔⵙⵏ ⵓⵏⵍⵍⵉ ⴷ ⵓⴼⵔⴰⴽ, ⵉⵍⵍⴰ ⴼⵍⵍⴰ ⵙⵏ ⴰⴷ ⵜⵜⵎⵢⴰⵡⴰⵙⵏ ⵏⴳⵔⴰⵜⵙⵏ ⵙ ⵜⴰⴳⵎⴰⵜ.", 479 | None, 480 | "amagrad 1 ar d ttlalan middn gan ilellitn mgaddan gh waddur d izrfan, yili ak darsn unlli d ufrak, illa flla sn ad ttmyawasn ngratsn s tagmat.", 481 | ); 482 | } 483 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Main library for the uroman-rs project. 2 | //! 3 | //! This library provides the `Uroman` struct, which is the main entry point 4 | //! for romanizing strings. It loads romanization rules from data files and 5 | //! applies them to input text. 
6 | 7 | #![allow(clippy::too_many_arguments)] 8 | 9 | use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; 10 | use serde::Serialize; 11 | use std::any::TypeId; 12 | use std::fmt; 13 | use std::io::{self, BufRead, Write}; 14 | use std::marker::PhantomData; 15 | use std::sync::{Arc, LazyLock}; 16 | use thiserror::Error; 17 | 18 | pub use crate::edge::Edge; 19 | use crate::core::UromanInner; 20 | use crate::lattice::Lattice; 21 | use crate::utils::decode_unicode_escapes; 22 | 23 | mod core; 24 | mod decompositions; 25 | mod edge; 26 | mod lattice; 27 | mod rom_rule; 28 | mod utils; 29 | 30 | use rom_rule::RomRule; 31 | 32 | #[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] 33 | pub enum RomFormat { 34 | #[default] 35 | Str, 36 | Edges, 37 | Alts, 38 | Lattice, 39 | } 40 | 41 | pub(crate) use rom_format::RomFormatType; 42 | pub(crate) use rom_format::IsStrFormat; 43 | pub(crate) use rom_format::IsEdgeFormat; 44 | 45 | pub mod rom_format { 46 | use crate::RomanizationError; 47 | 48 | pub struct Str; 49 | pub struct Edges; 50 | pub struct Alts; 51 | pub struct Lattice; 52 | 53 | pub trait RomFormatType { 54 | type Output; 55 | } 56 | 57 | impl RomFormatType for Str { 58 | type Output = String; 59 | } 60 | 61 | impl RomFormatType for Edges { 62 | type Output = Result; 63 | } 64 | 65 | impl RomFormatType for Alts { 66 | type Output = Result; 67 | } 68 | 69 | impl RomFormatType for Lattice { 70 | type Output = Result; 71 | } 72 | 73 | pub trait IsStrFormat {} 74 | pub trait IsEdgeFormat {} 75 | 76 | impl IsStrFormat for Str {} 77 | impl IsEdgeFormat for Edges {} 78 | impl IsEdgeFormat for Alts {} 79 | impl IsEdgeFormat for Lattice {} 80 | } 81 | 82 | pub struct RomanizationOutput { 83 | pub(crate) result: RomanizationResult, 84 | _marker: PhantomData, 85 | } 86 | 87 | impl RomanizationOutput { 88 | pub fn to_string(self) -> F::Output 89 | where 90 | F::Output: From, 91 | { 92 | self.result.into() 93 | } 94 | } 95 | 96 | impl RomanizationOutput { 97 | pub fn 
as_str(&self) -> &str { 98 | match &self.result { 99 | RomanizationResult::Str(s) => s, 100 | RomanizationResult::Edges(_) => unreachable!(), 101 | } 102 | } 103 | } 104 | 105 | impl RomanizationOutput { 106 | pub fn to_edges(self) -> Vec { 107 | match self.result { 108 | RomanizationResult::Edges(edges) => edges, 109 | RomanizationResult::Str(_) => { 110 | panic!("Logic error: An edge-based format produced a string result.") 111 | } 112 | } 113 | } 114 | } 115 | 116 | impl RomanizationOutput { 117 | pub fn as_edges(&self) -> &[Edge] { 118 | match &self.result { 119 | RomanizationResult::Edges(edges) => edges, 120 | RomanizationResult::Str(_) => { 121 | panic!("Logic error: An edge-based format produced a string result.") 122 | } 123 | } 124 | } 125 | } 126 | 127 | impl<'a, F> IntoIterator for &'a RomanizationOutput 128 | where 129 | F: RomFormatType + IsEdgeFormat, 130 | { 131 | type Item = &'a Edge; 132 | type IntoIter = std::slice::Iter<'a, Edge>; 133 | 134 | fn into_iter(self) -> Self::IntoIter { 135 | self.as_edges().iter() 136 | } 137 | } 138 | 139 | impl IntoIterator for RomanizationOutput 140 | where 141 | F: RomFormatType + IsEdgeFormat, 142 | { 143 | type Item = Edge; 144 | type IntoIter = std::vec::IntoIter; 145 | 146 | fn into_iter(self) -> Self::IntoIter { 147 | self.to_edges().into_iter() 148 | } 149 | } 150 | 151 | impl From for String { 152 | fn from(res: RomanizationResult) -> Self { 153 | match res { 154 | RomanizationResult::Str(s) => s, 155 | _ => panic!("Expected RomanizationResult::Str, but got Edges"), 156 | } 157 | } 158 | } 159 | 160 | impl From for Result { 161 | fn from(res: RomanizationResult) -> Self { 162 | match res { 163 | RomanizationResult::Edges(edges) => Ok(serde_json::to_string_pretty(&edges)?), 164 | _ => Err(RomanizationError::InternalError( 165 | "Mismatched result".to_string(), 166 | )), 167 | } 168 | } 169 | } 170 | 171 | impl fmt::Display for RomanizationOutput 172 | where 173 | F::Output: From, 174 | F::Output: 
std::fmt::Display, 175 | { 176 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 177 | match self.result.to_string() { 178 | Ok(s) => write!(f, "{}", s), 179 | Err(e) => { 180 | write!(f, "Error: {:?}", e) 181 | } 182 | } 183 | } 184 | } 185 | 186 | #[derive(Debug, Serialize, Clone, PartialEq, Eq, PartialOrd)] 187 | #[serde(untagged)] 188 | pub enum RomanizationResult { 189 | Str(String), 190 | Edges(Vec), 191 | } 192 | 193 | impl RomanizationResult { 194 | pub fn to_string(&self) -> Result { 195 | match self { 196 | RomanizationResult::Str(s) => Ok(s.clone()), 197 | RomanizationResult::Edges(edges) => Ok(serde_json::to_string_pretty(edges)?), 198 | } 199 | } 200 | } 201 | 202 | impl fmt::Display for RomanizationResult { 203 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 204 | match self.to_string() { 205 | Ok(s) => write!(f, "{}", s), 206 | Err(e) => write!(f, "Error: {:?}", e), 207 | } 208 | } 209 | } 210 | 211 | #[derive(Error, Debug)] 212 | pub enum RomanizationError { 213 | #[error("Failed to serialize the result to JSON: {0}")] 214 | SerializationFailed(#[from] serde_json::Error), 215 | 216 | #[error(transparent)] 217 | Io(#[from] io::Error), 218 | 219 | #[error("Internal logic error: {0}")] 220 | InternalError(String), 221 | } 222 | 223 | static GLOBAL_UROMAN_INNER: LazyLock> = LazyLock::new(|| { 224 | Arc::new(UromanInner::new()) 225 | }); 226 | 227 | /// The main struct for romanization. 228 | /// 229 | /// It holds the romanization rules and provides methods to romanize strings. 230 | /// This corresponds to the `Uroman` class in the Python implementation. 231 | #[derive(Debug, Clone, Default)] 232 | pub struct Uroman { 233 | inner: Arc, 234 | } 235 | 236 | impl Uroman { 237 | pub fn new() -> Self { 238 | Self { 239 | inner: Arc::clone(&GLOBAL_UROMAN_INNER), 240 | } 241 | } 242 | 243 | /// Romanizes a given string. 
244 | /// 245 | /// # Arguments 246 | /// 247 | /// lcode: [ISO 639-3 language code](https://www.loc.gov/standards/iso639-2/php/code_list.php) 248 | /// (e.g., eng, jpn, hin, ara, zho) 249 | /// 250 | /// # Example 251 | /// ``` 252 | /// # use uroman::{Uroman, rom_format}; 253 | /// # let uroman = Uroman::new(); 254 | /// let lcode = None; 255 | /// let result = uroman.romanize_string::("ᚺᚨᛚᛚᛟ ᚹᛟᚱᛚᛞ", lcode); 256 | /// 257 | /// let str = result.to_string(); 258 | /// 259 | /// println!("{str}"); 260 | /// ``` 261 | pub fn romanize_string( 262 | &self, 263 | s: &str, 264 | lcode: Option<&str>, 265 | ) -> RomanizationOutput { 266 | let mut lat = Lattice::new(s, &self.inner, lcode); 267 | 268 | lat.pick_tibetan_vowel_edge(); 269 | lat.prep_braille(); 270 | lat.add_romanization(); 271 | lat.add_numbers(); 272 | lat.add_braille_numbers(); 273 | lat.add_rom_fall_back_singles(); 274 | 275 | let type_id = TypeId::of::(); 276 | 277 | let result = if type_id == TypeId::of::() { 278 | let best_edges = lat.best_rom_edge_path(0, s.chars().count(), false); 279 | RomanizationResult::Str( 280 | best_edges.iter().map(|edge| edge.txt()).collect::(), 281 | ) 282 | } else if type_id == TypeId::of::() { 283 | RomanizationResult::Edges( 284 | lat.best_rom_edge_path(0, s.chars().count(), false) 285 | ) 286 | } else if type_id == TypeId::of::() { 287 | let mut best_edges = lat.best_rom_edge_path(0, s.chars().count(), false); 288 | lat.add_alternatives(&mut best_edges); 289 | 290 | RomanizationResult::Edges(best_edges) 291 | } else if type_id == TypeId::of::() { 292 | let mut all_edges = lat.all_edges(0, s.chars().count()); 293 | lat.add_alternatives(&mut all_edges); 294 | 295 | RomanizationResult::Edges(all_edges) 296 | } else { 297 | unreachable!("Unknown RomFormatType provided"); 298 | }; 299 | 300 | RomanizationOutput { 301 | result, 302 | _marker: PhantomData, 303 | } 304 | } 305 | 306 | /// Decodes Unicode escape sequences before performing romanization. 
307 | /// 308 | /// # Arguments 309 | /// 310 | /// lcode: [ISO 639-3 language code](https://www.loc.gov/standards/iso639-2/php/code_list.php) 311 | /// (e.g., eng, jpn, hin, ara, zho) 312 | /// 313 | /// # Example 314 | /// ``` 315 | /// # use uroman::{Uroman, rom_format}; 316 | /// # let uroman = Uroman::new(); 317 | /// let lcode = None; 318 | /// let result = uroman.romanize_escaped::("ᚺᚨᛚᛚᛟ ᚹᛟᚱᛚᛞ", lcode); 319 | /// 320 | /// let str = result.to_string(); 321 | /// 322 | /// println!("{str}"); 323 | /// ``` 324 | pub fn romanize_escaped( 325 | &self, 326 | s: &str, 327 | lcode: Option<&str>, 328 | ) -> RomanizationOutput { 329 | let s = decode_unicode_escapes(s); 330 | self.romanize_string::(s.as_str(), lcode) 331 | } 332 | 333 | /// Decodes Unicode escape sequences and then romanizes the string using the specified `RomFormat`. 334 | /// 335 | /// # Arguments 336 | /// 337 | /// lcode: [ISO 639-3 language code](https://www.loc.gov/standards/iso639-2/php/code_list.php) 338 | /// (e.g., eng, jpn, hin, ara, zho) 339 | /// 340 | /// # Example 341 | /// ``` 342 | /// # use uroman::{Uroman, RomFormat}; 343 | /// # let uroman = Uroman::new(); 344 | /// let lcode = None; 345 | /// let result = uroman.romanize_with_format( 346 | /// "ᚺᚨᛚᛚᛟ ᚹᛟᚱᛚᛞ", 347 | /// lcode, 348 | /// Some(RomFormat::Edges), 349 | /// ); 350 | /// 351 | /// let str = result.to_string().unwrap(); 352 | /// 353 | /// println!("{str}"); 354 | /// ``` 355 | pub fn romanize_escaped_with_format( 356 | &self, 357 | s: &str, 358 | lcode: Option<&str>, 359 | rom_format: Option, 360 | ) -> RomanizationResult { 361 | let s = decode_unicode_escapes(s); 362 | self.romanize_with_format(&s, lcode, rom_format) 363 | } 364 | 365 | /// Romanizes a given string using `RomFormat`. 
366 | /// 367 | /// # Arguments 368 | /// 369 | /// lcode: [ISO 639-3 language code](https://www.loc.gov/standards/iso639-2/php/code_list.php) 370 | /// (e.g., eng, jpn, hin, ara, zho) 371 | /// 372 | /// # Example 373 | /// ``` 374 | /// # use uroman::Uroman; 375 | /// # let uroman = Uroman::new(); 376 | /// let lcode = None; 377 | /// let result = uroman.romanize_with_format( 378 | /// "ᚺᚨᛚᛚᛟ ᚹᛟᚱᛚᛞ", 379 | /// lcode, 380 | /// None, // `None` defaults to `RomFormat::Str`. 381 | /// // RomFormat::Str, 382 | /// ); 383 | /// 384 | /// let str = result.to_string().unwrap(); 385 | /// 386 | /// println!("{str}"); 387 | /// ``` 388 | pub fn romanize_with_format( 389 | &self, 390 | s: &str, 391 | lcode: Option<&str>, 392 | rom_format: Option, 393 | ) -> RomanizationResult { 394 | let rom_format = rom_format.unwrap_or(RomFormat::Str); 395 | 396 | match rom_format { 397 | RomFormat::Str => { 398 | let str = self.romanize_string::(s, lcode); 399 | RomanizationResult::Str(str.to_string()) 400 | } 401 | RomFormat::Edges => self.romanize_string::(s, lcode).result, 402 | RomFormat::Alts => self.romanize_string::(s, lcode).result, 403 | RomFormat::Lattice => self.romanize_string::(s, lcode).result, 404 | } 405 | } 406 | 407 | /// Romanizes a stream of text line by line and writes the output to another stream. 408 | /// 409 | /// This method efficiently processes large amounts of text by reading from a buffered 410 | /// reader and writing to a writer without loading the entire content into memory. 411 | /// 412 | /// # Arguments 413 | /// 414 | /// lcode: [ISO 639-3 language code](https://www.loc.gov/standards/iso639-2/php/code_list.php) 415 | /// (e.g., eng, jpn, hin, ara, zho) 416 | /// 417 | /// # Errors 418 | /// 419 | /// This function will return an `io::Error` if any I/O operation fails during 420 | /// reading from the `reader` or writing to the `writer`. 
421 | pub fn romanize_file( 422 | &self, 423 | mut reader: R, 424 | mut writer: W, 425 | lcode: Option<&str>, 426 | rom_format: RomFormat, 427 | max_lines: Option, 428 | decode_unicode: bool, 429 | silent: bool, 430 | ) -> Result<(), RomanizationError> { 431 | let mut line_number = 0; 432 | let mut non_utf8_chars_total = 0; 433 | let mut n_error_messages_output = 0; 434 | let max_n_error_messages = 10; 435 | 436 | let mut buffer = vec![]; 437 | let default_lcode = lcode; 438 | let lcode_directive = "::lcode "; 439 | 440 | while reader.read_until(b'\n', &mut buffer)? > 0 { 441 | line_number += 1; 442 | 443 | let original_len = buffer.len(); 444 | let line_str = String::from_utf8_lossy(&buffer); 445 | let replaced_len = line_str.len(); 446 | if replaced_len < original_len { 447 | non_utf8_chars_total += 1; 448 | if n_error_messages_output < max_n_error_messages { 449 | eprintln!( 450 | "Detected encoding error on line {line_number}: non-UTF-8 characters were replaced." 451 | ); 452 | n_error_messages_output += 1; 453 | } else if n_error_messages_output == max_n_error_messages { 454 | eprintln!("Too many encoding errors. 
No further errors reported."); 455 | n_error_messages_output += 1; 456 | } 457 | } 458 | let mut line_trimmed = &*line_str; 459 | 460 | if line_trimmed.ends_with('\n') { 461 | line_trimmed = &line_trimmed[..line_trimmed.len() - 1]; 462 | } 463 | if line_trimmed.ends_with('\r') { 464 | line_trimmed = &line_trimmed[..line_trimmed.len() - 1]; 465 | } 466 | 467 | if let Some(rest_of_line) = line_trimmed.strip_prefix(lcode_directive) { 468 | let parts: Vec<&str> = rest_of_line.splitn(2, char::is_whitespace).collect(); 469 | let (lcode, text_to_romanize) = 470 | (parts.first().cloned(), parts.get(1).cloned().unwrap_or("")); 471 | 472 | let result = if decode_unicode { 473 | self.romanize_escaped_with_format(text_to_romanize, lcode, Some(rom_format)) 474 | } else { 475 | self.romanize_with_format(text_to_romanize, lcode, Some(rom_format)) 476 | }; 477 | 478 | match rom_format { 479 | RomFormat::Str => { 480 | let prefix = format!("{}{}{} ", lcode_directive, lcode.unwrap_or(""), ""); 481 | let output = prefix + &result.to_string().unwrap(); 482 | writeln!(writer, "{output}")?; 483 | } 484 | _ => { 485 | let meta_edge = format!(r#"[0,0,"","lcode: {}"]"#, lcode.unwrap_or("")); 486 | let result_json = result.to_string().unwrap(); 487 | if let Some(stripped) = result_json.strip_prefix('[') { 488 | writeln!(writer, "[{meta_edge},{stripped}")?; 489 | } else { 490 | writeln!(writer, "{result_json}")?; 491 | } 492 | } 493 | } 494 | } else { 495 | let result = if decode_unicode { 496 | self.romanize_escaped_with_format(line_trimmed, default_lcode, Some(rom_format)) 497 | } else { 498 | self.romanize_with_format(line_trimmed, default_lcode, Some(rom_format)) 499 | }; 500 | let output = result 501 | .to_string() 502 | .expect("JSON serialization failed"); 503 | writeln!(writer, "{output}")?; 504 | } 505 | 506 | if let Some(max) = max_lines 507 | && line_number >= max 508 | { 509 | break; 510 | } 511 | buffer.clear(); 512 | } 513 | 514 | if !silent && line_number > 0 { 515 | 
eprintln!(); 516 | } 517 | if non_utf8_chars_total > 0 { 518 | eprintln!( 519 | "Total number of lines with non-UTF-8 characters: {non_utf8_chars_total}" 520 | ); 521 | } 522 | 523 | writer.flush()?; 524 | Ok(()) 525 | } 526 | 527 | /// Romanizes a stream of text line by line in parallel for maximum performance. 528 | /// 529 | /// This version reads the entire input into memory to process lines concurrently using 530 | /// multiple CPU cores. It is significantly faster than `romanize_file` but requires 531 | /// more memory. For very large files, consider using the sequential `romanize_file`. 532 | /// 533 | /// The output order is preserved. 534 | /// 535 | /// # Differences from `romanize_file` 536 | /// 537 | /// * Memory Usage: Loads the entire file into memory. May fail on files larger than RAM. 538 | /// * Error Reporting: Does not warn about invalid UTF-8 characters. 539 | /// 540 | /// # Arguments 541 | /// 542 | /// lcode: [ISO 639-3 language code](https://www.loc.gov/standards/iso-639-2/php/code_list.php) 543 | /// (e.g., eng, jpn, hin, ara, zho) 544 | /// 545 | /// # Errors 546 | /// 547 | /// This function will return an `io::Error` if any I/O operation fails during 548 | /// reading from the `reader` or writing to the `writer`. 549 | pub fn romanize_file_parallel( 550 | &self, 551 | reader: R, 552 | mut writer: W, 553 | lcode: Option<&str>, 554 | rom_format: RomFormat, 555 | max_lines: Option, 556 | decode_unicode: bool, 557 | silent: bool, 558 | ) -> Result<(), RomanizationError> { 559 | let mut lines: Vec = reader.lines().collect::>()?; 560 | 561 | if let Some(max) = max_lines { 562 | lines.truncate(max); 563 | } 564 | 565 | let line_count = lines.len(); 566 | let default_lcode = lcode; 567 | let lcode_directive = "::lcode "; 568 | // UTF-8 error handling is simplified as `lines()` replaces invalid sequences. 569 | // The original byte-level diff check is not replicated here. 
570 | 571 | let results: Vec = lines 572 | .par_iter() 573 | .map(|line| { 574 | if let Some(rest_of_line) = line.strip_prefix(lcode_directive) { 575 | let parts: Vec<&str> = rest_of_line.splitn(2, char::is_whitespace).collect(); 576 | let (lcode, text_to_romanize) = 577 | (parts.first().cloned(), parts.get(1).cloned().unwrap_or("")); 578 | 579 | let result = if decode_unicode { 580 | self.romanize_escaped_with_format(text_to_romanize, lcode, Some(rom_format)) 581 | } else { 582 | self.romanize_with_format(text_to_romanize, lcode, Some(rom_format)) 583 | }; 584 | 585 | let output = result.to_string().unwrap_or_default(); 586 | 587 | match rom_format { 588 | RomFormat::Str => { 589 | format!("{}{}{} {}", lcode_directive, lcode.unwrap_or(""), "", output) 590 | } 591 | _ => { 592 | let meta_edge = format!(r#"[0,0,"","lcode: {}"]"#, lcode.unwrap_or("")); 593 | if let Some(stripped) = output.strip_prefix('[') { 594 | format!("[{meta_edge},{stripped}") 595 | } else { 596 | output 597 | } 598 | } 599 | } 600 | } else { 601 | let result = if decode_unicode { 602 | self.romanize_escaped_with_format(line, default_lcode, Some(rom_format)) 603 | } else { 604 | self.romanize_with_format(line, default_lcode, Some(rom_format)) 605 | }; 606 | result.to_string().unwrap_or_default() 607 | } 608 | }) 609 | .collect(); 610 | 611 | for output in results { 612 | writeln!(writer, "{}", output)?; 613 | } 614 | 615 | if !silent && line_count > 0 { 616 | eprintln!(); 617 | } 618 | 619 | writer.flush()?; 620 | Ok(()) 621 | } 622 | } 623 | -------------------------------------------------------------------------------- /data/UnicodeDataOverwrite.txt: -------------------------------------------------------------------------------- 1 | ## UnicodeDataOverwrite.txt 2 | ::u 00A0 ::r " " ::comment no-break space 3 | ::u 01BF ::r w ::comment ƿ Latin Character Wynn (Old English) 4 | ::u 0294 ::r ' ::comment gottal stop 5 | ::u 0295 ::r ' ::comment ʕ voiced pharyngeal fricative 6 | ::u 0305 ::r "" 
::comment ̅ Combining overline 7 | ::u 0306 ::r "" ::comment ̆ Combining breve 8 | ::u 0307 ::r "" ::comment ̇ Combining dot above 9 | ::u 030A ::r "" ::comment ̊ Combining ring above 10 | ::u 030C ::r "" ::comment ̌ Combining caron 11 | ::u 0311 ::r "" ::comment ̑ Combining inverted breve 12 | ::u 031D ::r "" ::comment ̝ Combining down up below 13 | ::u 031E ::r "" ::comment ̞ Combining down tack below 14 | ::u 031F ::r "" ::comment ̟ Combining plus sign below 15 | ::u 0323 ::r "" ::comment ̣ Combining dot below 16 | ::u 0325 ::r "" ::comment ̥ Combining ring below 17 | ::u 0329 ::r "" ::comment ̩ Combining vertical line below 18 | ::u 032A ::r "" ::comment ̪ Combining bridge below 19 | ::u 032F ::r "" ::comment ̯ Combining inverted breve below 20 | ::u 0342 ::r "" ::comment ͂ Combining Greek perispomeni (circumflex accent) 21 | ::u 0343 ::r "" ::comment ̓ Combining Greek koronis 22 | ::u 0361 ::r "" ::comment Combining double inverted breve 23 | ::u 0384 ::r "" ::comment ΄ Greek tonos 24 | ::u 0482 ::r 1000· ::comment ҂ Cyrillic thousands sign 25 | ::u 0483 ::r "" ::comment ҃ Combining Cyrillic Titlo ::annotation titlo 26 | ::u 0484 ::r "" ::comment ҄ Combining Cyrillic Palatalization ::annotation palatalization 27 | ::u 055B ::r "" ::comment ՛ Armenian emphasis mark 28 | ::u 055F ::r "" ::comment ՟ Armenian abbreviation mark ::annotation abbreviation 29 | 30 | ::u 0901 ::r +m ::comment Devanagari sign candrabindu 31 | ::u 0902 ::r +m ::comment Devanagari sign anusvara 32 | ::u 0903 ::r +h ::comment Devanagari sign visarga 33 | ::u 093D ::r ' ::comment Devanagari sign avagraha 34 | ::u 0950 ::r om ::comment ॐ Devanagari om symbol 35 | ::u 0951 ::r "" ::comment ॑ Devanagari stress sign "udatta" 36 | ::u 0952 ::r "" ::comment ॒ Devanagari stress sign "anudatta" 37 | ::u 0981 ::r +n ::comment Bengali sign candrabindu ("chôndrôbindu") 38 | ::u 0982 ::r +ng ::comment Bengali sign anusvara ("ônushar") 39 | ::u 0983 ::r +h ::comment Bengali sign visarga ("bishôrgô") 40 
| ::u 099A ::r ch ::comment instead of Bengali C(A) 41 | ::u 099B ::r chh ::comment instead of Bengali CC(A) 42 | ::u 0A02 ::r +m ::comment Gurmukhi sign bindi 43 | ::u 0A70 ::r +m ::comment Gurmukhi tippi 44 | # ::u 0A72 ::r "" ::comment Gurmukhi addak 45 | ::u 0A72 ::r "" ::comment Gurmukhi iri 46 | ::u 0A73 ::r "" ::comment Gurmukhi ura 47 | ::u 0B01 ::r +m ::comment Oriya sign candrabindu 48 | ::u 0B03 ::r +h ::comment Oriya sign visarga 49 | ::u 0B5F ::r ya ::comment ୟ Oriya letter yya 50 | ::u 0B82 ::r +m ::comment Tamil sign anusvara (not to be used?) 51 | ::u 0B83 ::r +h ::comment Tamil sign visarga ("āytam") 52 | ::u 0B9F ::r t ::comment instead of Tamil TT(A) 53 | ::u 0BA3 ::r n ::comment instead of Tamil NN(A) 54 | ::u 0BA9 ::r n ::comment instead of Tamil NNN(A) 55 | ::u 0BB1 ::r r ::comment instead of Tamil RR(A) 56 | ::u 0BB3 ::r l ::comment instead of Tamil LL(A) 57 | ::u 0BB4 ::r l ::comment instead of Tamil LLL(A) 58 | ::u 0C03 ::r +h ::comment ః Telugu sign visarga 59 | ::u 0C83 ::r +h ::comment Kannada sign visarga 60 | ::u 0D02 ::r +m ::comment Malayalam sign anusvara 61 | ::u 0D03 ::r +h ::comment Malayalam sign visarga 62 | ::u 0D82 ::r +n ::comment Sinhala sign anusvaraya 63 | ::u 0DA4 ::r ny ::comment Sinhala ඤ 64 | ::u 0DA5 ::r gn ::comment Sinhala ඥ 65 | ::u 0DCA ::r "" ::comment Sinhala sign al-lakuna (virama = no vowel) 66 | ::u 0DCF ::r aa ::comment Sinhala ා 67 | ::u 0DD0 ::r ae ::comment Sinhala ැ 68 | ::u 0DD1 ::r ae ::comment Sinhala ෑ 69 | ::u 0DD2 ::r i ::comment Sinhala ි 70 | ::u 0DD3 ::r ii ::comment Sinhala ී 71 | ::u 0DD4 ::r u ::comment Sinhala ු 72 | ::u 0DD6 ::r uu ::comment Sinhala ූ 73 | ::u 0DD8 ::r r ::comment Sinhala ෘ 74 | ::u 0DD9 ::r e ::comment Sinhala ෙ 75 | ::u 0DDA ::r ee ::comment Sinhala ේ 76 | ::u 0DDB ::r ai ::comment Sinhala ෛ 77 | ::u 0DDC ::r o ::comment Sinhala ො 78 | ::u 0DDD ::r oo ::comment Sinhala ෝ 79 | ::u 0DDE ::r au ::comment Sinhala ෞ 80 | ::u 0DDF ::r aa ::comment Sinhala ා 81 | ::u 0DF2 ::r 
rr ::comment Sinhala ෲ 82 | 83 | ::u 0E02 ::r k ::comment Thai character KHO KHAI 84 | ::u 0E03 ::r k ::comment Thai character KHO KHUAT 85 | ::u 0E04 ::r k ::comment Thai character KHO KHWAI 86 | ::u 0E05 ::r k ::comment Thai character KHO KHON 87 | ::u 0E06 ::r k ::comment Thai character KHO RAKHANG 88 | ::u 0E10 ::r t ::comment Thai character THO THAN 89 | ::u 0E11 ::r t ::comment Thai character THO NANGMONTHO 90 | ::u 0E12 ::r t ::comment Thai character THO PHUTHAO 91 | ::u 0E16 ::r t ::comment Thai character THO THUNG 92 | ::u 0E17 ::r t ::comment Thai character THO THAHAN 93 | ::u 0E18 ::r t ::comment Thai character THO THONG 94 | ::u 0E1C ::r p ::comment Thai character PHO PHUNG 95 | ::u 0E1E ::r p ::comment Thai character PHO PHAN 96 | ::u 0E20 ::r p ::comment Thai character PHO SAMPHAO 97 | ::u 0E2D ::r o ::comment Thai character O ANG 98 | ::u 0E2F ::r ... ::comment ฯ Thai character PAIYANNOI (ellipsis, abbreviation) 99 | ::u 0E31 ::r a ::comment Thai character MAI HAN-AKAT 100 | ::u 0E3A ::r "" ::comment Thai character PHINTHU (Pali virama) 101 | ::u 0E40 ::r e ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA E 102 | ::u 0E41 ::r ae ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AE 103 | ::u 0E42 ::r o ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA O 104 | ::u 0E43 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMUAN 105 | ::u 0E44 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMALAI 106 | ::u 0E45 ::r "" ::comment Thai character LAKKHANGYAO vowel lengthener 107 | ::u 0E47 ::r o ::comment Thai character MAITAIKHU vowel shortener 108 | ::u 0E48 ::r "" ::tone-mark non-standard ::comment Thai tone mark MAI EK 109 | ::u 0E49 ::r "" ::tone-mark standard ::comment Thai tone mark MAI THO 110 | ::u 0E4A ::r "" ::tone-mark high 
::comment Thai tone mark MAI TRI 111 | ::u 0E4B ::r "" ::tone-mark rising ::comment Thai tone mark MAI CHATTAWA 112 | ::u 0E4C ::r "" ::comment Thai character THANTHAKHAT cancellation mark (cf. virama) 113 | ::u 0E4D ::r +m ::comment ํ Thai character NIKHAHIT final nasal (cf. anusvara) 114 | ::u 0ECC ::r "" ::comment ໌ Lao cancellation mark ::annotation cancellation 115 | ::u 0F0B ::r · ::comment ་ Tibetan mark intersyllabic tsheg 116 | ::u 0F0C ::r "" ::comment ༌ Tibetan mark delimiter tsheg bstar 117 | ::u 0F84 ::r "" ::comment ྄ Tibetan halanta 118 | ::u 1036 ::r +n ::comment Myanmar sign anusvara ("auk myit") 119 | ::u 1037 ::r "" ::tone-mark creaky ::comment Myanmar sign dot below 120 | ::u 1038 ::r "" ::tone-mark high ::comment Myanmar sign visarga 121 | 122 | ::u 16A0 ::r f ::comment ᚠ RUNIC LETTER FEHU FEOH FE F 123 | ::u 16A1 ::r v ::comment ᚡ RUNIC LETTER V 124 | ::u 16A2 ::r u ::comment ᚢ RUNIC LETTER URUZ UR U 125 | ::u 16A3 ::r y ::comment ᚣ RUNIC LETTER YR 126 | ::u 16A4 ::r y ::comment ᚤ RUNIC LETTER Y 127 | ::u 16A5 ::r w ::comment ᚥ RUNIC LETTER W 128 | ::u 16A6 ::r th ::comment ᚦ RUNIC LETTER THURISAZ THURS THORN 129 | ::u 16A7 ::r th ::comment ᚧ RUNIC LETTER ETH 130 | ::u 16A8 ::r a ::comment ᚨ RUNIC LETTER ANSUZ A 131 | ::u 16A9 ::r o ::comment ᚩ RUNIC LETTER OS O 132 | ::u 16AA ::r a ::comment ᚪ RUNIC LETTER AC A 133 | ::u 16AB ::r ae ::comment ᚫ RUNIC LETTER AESC 134 | ::u 16AC ::r o ::comment ᚬ RUNIC LETTER LONG-BRANCH-OSS O 135 | ::u 16AD ::r o ::comment ᚭ RUNIC LETTER SHORT-TWIG-OSS O 136 | ::u 16AE ::r o ::comment ᚮ RUNIC LETTER O 137 | ::u 16AF ::r oe ::comment ᚯ RUNIC LETTER OE 138 | ::u 16B0 ::r on ::comment ᚰ RUNIC LETTER ON 139 | ::u 16B1 ::r r ::comment ᚱ RUNIC LETTER RAIDO RAD REID R 140 | ::u 16B2 ::r k ::comment ᚲ RUNIC LETTER KAUNA 141 | ::u 16B3 ::r c ::comment ᚳ RUNIC LETTER CEN 142 | ::u 16B4 ::r k ::comment ᚴ RUNIC LETTER KAUN K 143 | ::u 16B5 ::r g ::comment ᚵ RUNIC LETTER G 144 | ::u 16B6 ::r ng ::comment ᚶ RUNIC LETTER ENG 
145 | ::u 16B7 ::r g ::comment ᚷ RUNIC LETTER GEBO GYFU G 146 | ::u 16B8 ::r g ::comment ᚸ RUNIC LETTER GAR 147 | ::u 16B9 ::r w ::comment ᚹ RUNIC LETTER WUNJO WYNN W 148 | ::u 16BA ::r h ::comment ᚺ RUNIC LETTER HAGLAZ H 149 | ::u 16BB ::r h ::comment ᚻ RUNIC LETTER HAEGL H 150 | ::u 16BC ::r h ::comment ᚼ RUNIC LETTER LONG-BRANCH-HAGALL H 151 | ::u 16BD ::r h ::comment ᚽ RUNIC LETTER SHORT-TWIG-HAGALL H 152 | ::u 16BE ::r n ::comment ᚾ RUNIC LETTER NAUDIZ NYD NAUD N 153 | ::u 16BF ::r n ::comment ᚿ RUNIC LETTER SHORT-TWIG-NAUD N 154 | ::u 16C0 ::r n ::comment ᛀ RUNIC LETTER DOTTED-N 155 | ::u 16C1 ::r i ::comment ᛁ RUNIC LETTER ISAZ IS ISS I 156 | ::u 16C2 ::r e ::comment ᛂ RUNIC LETTER E 157 | ::u 16C3 ::r j ::comment ᛃ RUNIC LETTER JERAN J 158 | ::u 16C4 ::r j ::comment ᛄ RUNIC LETTER GER 159 | ::u 16C5 ::r ae ::comment ᛅ RUNIC LETTER LONG-BRANCH-AR AE 160 | ::u 16C6 ::r a ::comment ᛆ RUNIC LETTER SHORT-TWIG-AR A 161 | ::u 16C7 ::r i ::comment ᛇ RUNIC LETTER IWAZ EOH 162 | ::u 16C8 ::r p ::comment ᛈ RUNIC LETTER PERTHO PEORTH P 163 | ::u 16C9 ::r z ::comment ᛉ RUNIC LETTER ALGIZ EOLHX 164 | ::u 16CA ::r s ::comment ᛊ RUNIC LETTER SOWILO S 165 | ::u 16CB ::r s ::comment ᛋ RUNIC LETTER SIGEL LONG-BRANCH-SOL S 166 | ::u 16CC ::r s ::comment ᛌ RUNIC LETTER SHORT-TWIG-SOL S 167 | ::u 16CD ::r c ::comment ᛍ RUNIC LETTER C 168 | ::u 16CE ::r z ::comment ᛎ RUNIC LETTER Z 169 | ::u 16CF ::r t ::comment ᛏ RUNIC LETTER TIWAZ TIR TYR T 170 | ::u 16D0 ::r t ::comment ᛐ RUNIC LETTER SHORT-TWIG-TYR T 171 | ::u 16D1 ::r d ::comment ᛑ RUNIC LETTER D 172 | ::u 16D2 ::r b ::comment ᛒ RUNIC LETTER BERKANAN BEORC BJARKAN B 173 | ::u 16D3 ::r b ::comment ᛓ RUNIC LETTER SHORT-TWIG-BJARKAN B 174 | ::u 16D4 ::r p ::comment ᛔ RUNIC LETTER DOTTED-P 175 | ::u 16D5 ::r p ::comment ᛕ RUNIC LETTER OPEN-P 176 | ::u 16D6 ::r e ::comment ᛖ RUNIC LETTER EHWAZ EH E 177 | ::u 16D7 ::r m ::comment ᛗ RUNIC LETTER MANNAZ MAN M 178 | ::u 16D8 ::r m ::comment ᛘ RUNIC LETTER LONG-BRANCH-MADR M 179 | ::u 
16D9 ::r m ::comment ᛙ RUNIC LETTER SHORT-TWIG-MADR M 180 | ::u 16DA ::r l ::comment ᛚ RUNIC LETTER LAUKAZ LAGU LOGR L 181 | ::u 16DB ::r l ::comment ᛛ RUNIC LETTER DOTTED-L 182 | ::u 16DC ::r ng ::comment ᛜ RUNIC LETTER INGWAZ 183 | ::u 16DD ::r ng ::comment ᛝ RUNIC LETTER ING 184 | ::u 16DE ::r d ::comment ᛞ RUNIC LETTER DAGAZ DAEG D 185 | ::u 16DF ::r o ::comment ᛟ RUNIC LETTER OTHALAN ETHEL O 186 | ::u 16E0 ::r ea ::comment ᛠ RUNIC LETTER EAR 187 | ::u 16E1 ::r io ::comment ᛡ RUNIC LETTER IOR 188 | ::u 16E2 ::r q ::comment ᛢ RUNIC LETTER CWEORTH 189 | ::u 16E3 ::r k ::comment ᛣ RUNIC LETTER CALC 190 | ::u 16E4 ::r k ::comment ᛤ RUNIC LETTER CEALC 191 | ::u 16E5 ::r st ::comment ᛥ RUNIC LETTER STAN 192 | ::u 16E6 ::r r ::comment ᛦ RUNIC LETTER LONG-BRANCH-YR 193 | ::u 16E7 ::r r ::comment ᛧ RUNIC LETTER SHORT-TWIG-YR 194 | ::u 16E8 ::r r ::comment ᛨ RUNIC LETTER ICELANDIC-YR 195 | ::u 16E9 ::r q ::comment ᛩ RUNIC LETTER Q 196 | ::u 16EA ::r x ::comment ᛪ RUNIC LETTER X 197 | 198 | ::u 17B9 ::r oe ::comment Khmer vowel sign y (short) 199 | ::u 17BA ::r oe ::comment Khmer vowel sign yy (long) 200 | ::u 17C6 ::r +m ::comment Khmer sign nikahit (cf. anusvara) 201 | ::u 17C7 ::r +h ::comment Khmer sign reahmuk (cf. visarga) 202 | ::u 17C8 ::r ' ::comment Khmer sign yuukaleapintu (short vowel and glottal stop) 203 | ::u 17C9 ::r "" ::comment Khmer sign muusikatoan: changes the second register to the first 204 | ::u 17CA ::r "" ::comment Khmer sign triisap: changes the first register to the second 205 | ::u 17CB ::r "" ::comment Khmer sign bantoc (vowel shortener) 206 | ::u 17D2 ::r "" ::comment Khmer sign coeng (foot/subscript, cf. virama = no vowel) 207 | ::u 17D5 ::r . 
::comment Khmer sign bariyoosan; period ending entire text or chapter 208 | 209 | ::u 180E ::r ' ::comment ᠎ Mongolian vowel separator 210 | 211 | ::u 1B80 ::r +ng ::comment ᮀ Sundanese sign panyecek 212 | ::u 1B81 ::r +r ::comment ᮁ Sundanese sign panglayar 213 | ::u 1B82 ::r +h ::comment ᮂ Sundanese sign pangwisad 214 | ::u 1BA1 ::r ya ::comment ᮡ Sundanese consonant sign pamingkal 215 | ::u 1BA2 ::r ra ::comment ᮢ Sundanese consonant sign panyakr 216 | ::u 1BA3 ::r la ::comment ᮣ Sundanese consonant sign panyiku 217 | ::u 1BA4 ::r i ::comment ᮤ Sundanese consonant sign panghulu 218 | ::u 1BA5 ::r u ::comment ᮥ Sundanese consonant sign panyuku 219 | ::u 1BA6 ::r e ::comment ᮦ Sundanese vowel sign panaelaeng 220 | ::u 1BA7 ::r o ::comment ᮧ Sundanese vowel sign panolong 221 | ::u 1BA8 ::r e ::comment ᮨ Sundanese vowel sign pamepet 222 | ::u 1BA9 ::r eu ::comment ᮩ Sundanese vowel sign paneuleung 223 | ::u 1BAA ::r "" ::comment ᮪ Sundanese sign pamaaeh or patén (no vowel/virama) 224 | 225 | ::u 1FBD ::r "" ::comment ᾽ Greek koronis 226 | ::u 1FFE ::r "" ::comment Greek dasia (rough breathing) 227 | 228 | ::u 2002 ::r " " ::comment en space 229 | ::u 2003 ::r " " ::comment em space 230 | ::u 2004 ::r " " ::comment three-per-em space 231 | ::u 2005 ::r " " ::comment four-per-em space 232 | ::u 2006 ::r " " ::comment six-per-em space 233 | ::u 2007 ::r " " ::comment figure space 234 | ::u 2008 ::r " " ::comment punctuation space 235 | ::u 2009 ::r " " ::comment thin space 236 | ::u 200A ::r " " ::comment hair space 237 | ::u 202F ::r " " ::comment narrow no-break space 238 | 239 | ::u 2D30 ::r a ::comment TIFINAGH LETTER YA ⴰ 240 | ::u 2D31 ::r b ::comment TIFINAGH LETTER YAB ⴱ 241 | ::u 2D32 ::r bh ::comment TIFINAGH LETTER YABH ⴲ 242 | ::u 2D33 ::r g ::comment TIFINAGH LETTER YAG ⴳ 243 | ::u 2D34 ::r ghh ::comment TIFINAGH LETTER YAGHH ⴴ 244 | ::u 2D35 ::r j ::comment TIFINAGH LETTER BERBER ACADEMY YAJ ⴵ 245 | ::u 2D36 ::r j ::comment TIFINAGH LETTER YAJ ⴶ 246 | ::u 
2D37 ::r d ::comment TIFINAGH LETTER YAD ⴷ 247 | ::u 2D38 ::r dh ::comment TIFINAGH LETTER YADH ⴸ 248 | ::u 2D39 ::r dd ::comment TIFINAGH LETTER YADD ⴹ 249 | ::u 2D3A ::r ddh ::comment TIFINAGH LETTER YADDH ⴺ 250 | ::u 2D3B ::r e ::comment TIFINAGH LETTER YEY ⴻ 251 | ::u 2D3C ::r f ::comment TIFINAGH LETTER YAF ⴼ 252 | ::u 2D3D ::r k ::comment TIFINAGH LETTER YAK ⴽ 253 | ::u 2D3E ::r k ::comment TIFINAGH LETTER TUAREG YAK ⴾ 254 | ::u 2D3F ::r khh ::comment TIFINAGH LETTER YAKHH ⴿ 255 | ::u 2D40 ::r h ::comment TIFINAGH LETTER YAH ⵀ 256 | ::u 2D41 ::r h ::comment TIFINAGH LETTER BERBER ACADEMY YAH ⵁ 257 | ::u 2D42 ::r h ::comment TIFINAGH LETTER TUAREG YAH ⵂ 258 | ::u 2D43 ::r hh ::comment TIFINAGH LETTER YAHH ⵃ 259 | ::u 2D44 ::r ' ::comment TIFINAGH LETTER YAA ⵄ 260 | ::u 2D45 ::r kh ::comment TIFINAGH LETTER YAKH ⵅ 261 | ::u 2D46 ::r kh ::comment TIFINAGH LETTER TUAREG YAKH ⵆ 262 | ::u 2D47 ::r q ::comment TIFINAGH LETTER YAQ ⵇ 263 | ::u 2D48 ::r q ::comment TIFINAGH LETTER TUAREG YAQ ⵈ 264 | ::u 2D49 ::r i ::comment TIFINAGH LETTER YI ⵉ 265 | ::u 2D4A ::r zh ::comment TIFINAGH LETTER YAZH ⵊ 266 | ::u 2D4B ::r zh ::comment TIFINAGH LETTER AHAGGAR YAZH ⵋ 267 | ::u 2D4C ::r zh ::comment TIFINAGH LETTER TUAREG YAZH ⵌ 268 | ::u 2D4D ::r l ::comment TIFINAGH LETTER YAL ⵍ 269 | ::u 2D4E ::r m ::comment TIFINAGH LETTER YAM ⵎ 270 | ::u 2D4F ::r n ::comment TIFINAGH LETTER YAN ⵏ 271 | ::u 2D50 ::r gn ::comment TIFINAGH LETTER TUAREG YAGN ⵐ 272 | ::u 2D51 ::r ng ::comment TIFINAGH LETTER TUAREG YANG ⵑ 273 | ::u 2D52 ::r p ::comment TIFINAGH LETTER YAP ⵒ 274 | ::u 2D53 ::r u ::comment TIFINAGH LETTER YU ⵓ 275 | ::u 2D54 ::r r ::comment TIFINAGH LETTER YAR ⵔ 276 | ::u 2D55 ::r rr ::comment TIFINAGH LETTER YARR ⵕ 277 | ::u 2D56 ::r gh ::comment TIFINAGH LETTER YAGH ⵖ 278 | ::u 2D57 ::r gh ::comment TIFINAGH LETTER TUAREG YAGH ⵗ 279 | ::u 2D58 ::r gh ::comment TIFINAGH LETTER AYER YAGH ⵘ 280 | ::u 2D59 ::r s ::comment TIFINAGH LETTER YAS ⵙ 281 | ::u 2D5A ::r ss ::comment 
TIFINAGH LETTER YASS ⵚ 282 | ::u 2D5B ::r sh ::comment TIFINAGH LETTER YASH ⵛ 283 | ::u 2D5C ::r t ::comment TIFINAGH LETTER YAT ⵜ 284 | ::u 2D5D ::r th ::comment TIFINAGH LETTER YATH ⵝ 285 | ::u 2D5E ::r ch ::comment TIFINAGH LETTER YACH ⵞ 286 | ::u 2D5F ::r tt ::comment TIFINAGH LETTER YATT ⵟ 287 | ::u 2D60 ::r v ::comment TIFINAGH LETTER YAV ⵠ 288 | ::u 2D61 ::r w ::comment TIFINAGH LETTER YAW ⵡ 289 | ::u 2D62 ::r y ::comment TIFINAGH LETTER YAY ⵢ 290 | ::u 2D63 ::r z ::comment TIFINAGH LETTER YAZ ⵣ 291 | ::u 2D64 ::r z ::comment TIFINAGH LETTER TAWELLEMET YAZ ⵤ 292 | ::u 2D65 ::r zz ::comment TIFINAGH LETTER YAZZ ⵥ 293 | ::u 2D66 ::r ye ::comment TIFINAGH LETTER YE ⵦ 294 | ::u 2D67 ::r yo ::comment TIFINAGH LETTER YO ⵧ 295 | ::u 2D6F ::r "" ::comment TIFINAGH MODIFIER LETTER LABIALIZATION MARK ⵯ 296 | ::u 2D70 ::r "" ::comment TIFINAGH SEPARATOR MARK ⵰ 297 | ::u 2D7F ::r "" ::comment TIFINAGH CONSONANT JOINER ⵿ 298 | 299 | ::u 3063 ::r tsu ::comment Hiragana letter small tsu 300 | ::u 30C3 ::r tsu ::comment Katakana letter small tsu 301 | 302 | ::u ABE3 ::r o ::comment ꯣ Meetei Mayek vowel sign onap 303 | ::u ABE7 ::r ou ::comment ꯧ Meetei Mayek vowel sign sounap 304 | 305 | ::u F008 ::r "" ::comment Yoruba diacritic in private use area 306 | ::u F00F ::r "" ::comment Yoruba diacritic in private use area 307 | ::u F023 ::r "" ::comment Yoruba diacritic in private use area 308 | ::u F025 ::r "" ::comment Yoruba diacritic in private use area 309 | 310 | ::u F8D0 ::r a ::name KLINGON LETTER A 311 | ::u F8D1 ::r b ::name KLINGON LETTER B 312 | ::u F8D2 ::r ch ::name KLINGON LETTER CH 313 | ::u F8D3 ::r D ::name KLINGON LETTER D 314 | ::u F8D4 ::r e ::name KLINGON LETTER E 315 | ::u F8D5 ::r gh ::name KLINGON LETTER GH 316 | ::u F8D6 ::r H ::name KLINGON LETTER H 317 | ::u F8D7 ::r I ::name KLINGON LETTER I 318 | ::u F8D8 ::r j ::name KLINGON LETTER J 319 | ::u F8D9 ::r l ::name KLINGON LETTER L 320 | ::u F8DA ::r m ::name KLINGON LETTER M 321 | ::u F8DB ::r n 
::name KLINGON LETTER N 322 | ::u F8DC ::r ng ::name KLINGON LETTER NG 323 | ::u F8DD ::r o ::name KLINGON LETTER O 324 | ::u F8DE ::r p ::name KLINGON LETTER P 325 | ::u F8DF ::r q ::name KLINGON LETTER Q 326 | ::u F8E0 ::r Q ::name KLINGON LETTER Q 327 | ::u F8E1 ::r r ::name KLINGON LETTER R 328 | ::u F8E2 ::r S ::name KLINGON LETTER S 329 | ::u F8E3 ::r t ::name KLINGON LETTER T 330 | ::u F8E4 ::r tlh ::name KLINGON LETTER TLH 331 | ::u F8E5 ::r u ::name KLINGON LETTER U 332 | ::u F8E6 ::r v ::name KLINGON LETTER V 333 | ::u F8E7 ::r w ::name KLINGON LETTER W 334 | ::u F8E8 ::r y ::name KLINGON LETTER Y 335 | ::u F8E9 ::r ' ::name KLINGON LETTER GLOTTAL STOP 336 | ::u F8F0 ::num 0 ::name KLINGON DIGIT ZERO 337 | ::u F8F1 ::num 1 ::name KLINGON DIGIT ONE 338 | ::u F8F2 ::num 2 ::name KLINGON DIGIT TWO 339 | ::u F8F3 ::num 3 ::name KLINGON DIGIT THREE 340 | ::u F8F4 ::num 4 ::name KLINGON DIGIT FOUR 341 | ::u F8F5 ::num 5 ::name KLINGON DIGIT FIVE 342 | ::u F8F6 ::num 6 ::name KLINGON DIGIT SIX 343 | ::u F8F7 ::num 7 ::name KLINGON DIGIT SEVEN 344 | ::u F8F8 ::num 8 ::name KLINGON DIGIT EIGHT 345 | ::u F8F9 ::num 9 ::name KLINGON DIGIT NINE 346 | ::u F8FD ::r , ::name KLINGON COMMA 347 | ::u F8FE ::r . 
::name KLINGON FULL STOP 348 | ::u F8FF ::name KLINGON MUMMIFICATION GLYPH 349 | ::u FEFF ::r "" ::comment Byte Order Mark (BOM); ZERO WIDTH NO-BREAK SPACE (deprecated) 350 | 351 | ::u 1163D ::r +m ::comment Modi sign anusvara 352 | ::u 1163E ::r +h ::comment Modi sign visarga 353 | 354 | ::u 13068 ::num 1000000 ::comment Egyptian Hieroglyph 355 | ::u 1308B ::r r ::comment Egyptian Hieroglyph ::pic mouth 356 | ::u 1309D ::r ' ::comment Egyptian Hieroglyph (ayn) ::pic forearm 357 | ::u 130A7 ::r d ::comment Egyptian Hieroglyph ::pic hand 358 | ::u 130AD ::num 10000 ::comment Egyptian Hieroglyph 359 | ::u 130AE ::num 20000 ::comment Egyptian Hieroglyph 360 | ::u 130AF ::num 30000 ::comment Egyptian Hieroglyph 361 | ::u 130B0 ::num 40000 ::comment Egyptian Hieroglyph 362 | ::u 130B1 ::num 50000 ::comment Egyptian Hieroglyph 363 | ::u 130B2 ::num 60000 ::comment Egyptian Hieroglyph 364 | ::u 130B3 ::num 70000 ::comment Egyptian Hieroglyph 365 | ::u 130B4 ::num 80000 ::comment Egyptian Hieroglyph 366 | ::u 130B5 ::num 90000 ::comment Egyptian Hieroglyph 367 | ::u 130B6 ::num 50000 ::comment Egyptian Hieroglyph 368 | ::u 130C0 ::r b ::comment Egyptian Hieroglyph ::pic foot 369 | ::u 130ED ::r l ::comment Egyptian Hieroglyph [also rw] ::pic lion recumbent 370 | ::u 13121 ::r h ::comment Egyptian Hieroglyph (f-underscore) ::pic aninal's belly and udder 371 | ::u 1313F ::r a ::comment Egyptian Hieroglyph (alef) ::pic vulture 372 | ::u 13153 ::r m ::comment Egyptian Hieroglyph ::pic owl 373 | ::u 13171 ::r w ::comment Egyptian Hieroglyph ::pic quail chick 374 | ::u 13187 ::r ::comment Egyptian Hieroglyph (determinative/son) H8 ::pic egg 375 | ::u 13190 ::num 100000 ::comment Egyptian Hieroglyph 376 | ::u 13191 ::r f ::comment Egyptian Hieroglyph ::pic horned viper 377 | ::u 13193 ::r d ::comment Egyptian Hieroglyph (J) ::pic cobra 378 | ::u 131BC ::num 1000 ::comment Egyptian Hieroglyph 379 | ::u 131BD ::num 2000 ::comment Egyptian Hieroglyph 380 | ::u 131BE ::num 3000 
::comment Egyptian Hieroglyph 381 | ::u 131BF ::num 4000 ::comment Egyptian Hieroglyph 382 | ::u 131C0 ::num 5000 ::comment Egyptian Hieroglyph 383 | ::u 131C1 ::num 6000 ::comment Egyptian Hieroglyph 384 | ::u 131C2 ::num 7000 ::comment Egyptian Hieroglyph 385 | ::u 131C3 ::num 8000 ::comment Egyptian Hieroglyph 386 | ::u 131C4 ::num 9000 ::comment Egyptian Hieroglyph 387 | ::u 131CB ::r i ::comment Egyptian Hieroglyph (yod) ::pic single reed 388 | ::u 131CC ::r y ::comment Egyptian Hieroglyph ::pic double reed 389 | ::u 1320E ::r q ::comment Egyptian Hieroglyph (qaf) ::pic sandy slope 390 | ::u 13209 ::comment Egyptian Hieroglyph ::pic desert hills 391 | ::u 13216 ::r n ::comment Egyptian Hieroglyph ::pic ripple of water 392 | ::u 13219 ::r sh ::comment Egyptian Hieroglyph (š) ::pic basin 393 | ::u 13254 ::r h ::comment Egyptian Hieroglyph ::pic reed shelter 394 | ::u 13283 ::r z ::comment Egyptian Hieroglyph [also S?] ::pic door bolt 395 | ::u 132AA ::r p ::comment Egyptian Hieroglyph ::pic stool 396 | ::u 132D4 ::r n ::comment Egyptian Hieroglyph ::pic red crown 397 | ::u 132F4 ::r s ::comment Egyptian Hieroglyph [also Z?] 
::pic folded cloth 398 | ::u 13319 ::comment Egyptian Hieroglyph ::pic throw stick 399 | ::u 13362 ::num 100 ::comment Egyptian Hieroglyph 400 | ::u 13363 ::num 200 ::comment Egyptian Hieroglyph 401 | ::u 13364 ::num 300 ::comment Egyptian Hieroglyph 402 | ::u 13365 ::num 400 ::comment Egyptian Hieroglyph 403 | ::u 13366 ::num 500 ::comment Egyptian Hieroglyph 404 | ::u 13367 ::num 600 ::comment Egyptian Hieroglyph 405 | ::u 13368 ::num 700 ::comment Egyptian Hieroglyph 406 | ::u 13369 ::num 800 ::comment Egyptian Hieroglyph 407 | ::u 1336A ::num 900 ::comment Egyptian Hieroglyph 408 | ::u 1336B ::num 500 ::comment Egyptian Hieroglyph 409 | ::u 1336F ::r o ::comment Egyptian Hieroglyph ::pic lasso 410 | ::u 1337F ::r t ::comment Egyptian Hieroglyph (ṯ) ::pic hobble 411 | ::u 13386 ::num 10 ::comment Egyptian Hieroglyph 412 | ::u 13387 ::num 20 ::comment Egyptian Hieroglyph 413 | ::u 13388 ::num 30 ::comment Egyptian Hieroglyph 414 | ::u 13389 ::num 40 ::comment Egyptian Hieroglyph 415 | ::u 1338A ::num 50 ::comment Egyptian Hieroglyph 416 | ::u 1338B ::num 60 ::comment Egyptian Hieroglyph 417 | ::u 1338C ::num 70 ::comment Egyptian Hieroglyph 418 | ::u 1338D ::num 80 ::comment Egyptian Hieroglyph 419 | ::u 1338E ::num 90 ::comment Egyptian Hieroglyph 420 | ::u 1338F ::num 20 ::comment Egyptian Hieroglyph 421 | ::u 13390 ::num 30 ::comment Egyptian Hieroglyph 422 | ::u 13391 ::num 40 ::comment Egyptian Hieroglyph 423 | ::u 13392 ::num 50 ::comment Egyptian Hieroglyph 424 | ::u 1339B ::r h ::comment Egyptian Hieroglyph ::pic twisted flax 425 | ::u 133A1 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle 426 | ::u 133A2 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle, variant 427 | ::u 133A4 ::r g ::comment Egyptian Hieroglyph ::pic bag 428 | ::u 133BC ::r g ::comment Egyptian Hieroglyph ::pic stand 429 | ::u 133CF ::r t ::comment Egyptian Hieroglyph ::pic loaf 430 | ::u 133ED ::r y ::comment Egyptian Hieroglyph ::pic two strokes 431 | ::u 133F2 
::r w ::comment Egyptian Hieroglyph ::pic quail chick, hieratic variant 432 | ::u 133FA ::num 1 ::comment Egyptian Hieroglyph 433 | ::u 133FB ::num 2 ::comment Egyptian Hieroglyph 434 | ::u 133FC ::num 3 ::comment Egyptian Hieroglyph 435 | ::u 133FD ::num 4 ::comment Egyptian Hieroglyph 436 | ::u 133FE ::num 5 ::comment Egyptian Hieroglyph 437 | ::u 133FF ::num 6 ::comment Egyptian Hieroglyph 438 | ::u 13400 ::num 7 ::comment Egyptian Hieroglyph 439 | ::u 13401 ::num 8 ::comment Egyptian Hieroglyph 440 | ::u 13402 ::num 9 ::comment Egyptian Hieroglyph 441 | ::u 13403 ::num 5 ::comment Egyptian Hieroglyph 442 | ::u 1340D ::r kh ::comment Egyptian Hieroglyph (ḫ, khah) ::pic placenta? 443 | ::u 1341D ::r m ::comment Egyptian Hieroglyph (also jm) 444 | -------------------------------------------------------------------------------- /src/core.rs: -------------------------------------------------------------------------------- 1 | use regex::Regex; 2 | use unicode_normalization::UnicodeNormalization; 3 | use unicode_properties::UnicodeGeneralCategory; 4 | use std::collections::{HashMap, HashSet}; 5 | use std::sync::LazyLock; 6 | use serde_json::Value as JsonValue; 7 | 8 | use crate::{RomRule, utils}; 9 | use crate::rom_rule::RomRules; 10 | use crate::utils::slot_value_in_double_colon_del_list; 11 | 12 | static KAYAH_RE: LazyLock = LazyLock::new(|| Regex::new(r"kayah\s+(\S+)\s*$").unwrap()); 13 | static MENDE_RE: LazyLock = LazyLock::new(|| Regex::new(r"m\d+\s+(\S+)\s*$").unwrap()); 14 | static SPACE_RE: LazyLock = LazyLock::new(|| Regex::new(r"\S\s+\S").unwrap()); 15 | static HANGUL_LEADS: &[&str] = &[ 16 | "g", "gg", "n", "d", "dd", "r", "m", "b", "bb", "s", "ss", "-", "j", "jj", "c", "k", "t", "p", "h" 17 | ]; 18 | static HANGUL_VOWELS: &[&str] = &[ 19 | "a", "ae", "ya", "yae", "eo", "e", "yeo", "ye", "o", "wa", "wai", "oe", "yo", "u", "weo", 20 | "we", "wi", "yu", "eu", "yi", "i" 21 | ]; 22 | static HANGUL_TAILS: &[&str] = &[ 23 | "-", "g", "gg", "gs", "n", "nj", 
"nh", "d", "l", "lg", "lm", "lb", "ls", "lt", "lp", 24 | "lh", "m", "b", "bs", "s", "ss", "ng", "j", "c", "k", "t", "p", "h" 25 | ]; 26 | 27 | /// Represents a value that can be an integer, float, or string. 28 | #[derive(Debug, Clone)] 29 | pub(crate) enum Value { 30 | Int(i64), 31 | Float(f64), 32 | String(String), 33 | Array(Vec), 34 | } 35 | 36 | /// Represents a script with its properties. 37 | #[allow(unused)] 38 | #[derive(Debug, Clone)] 39 | pub(crate) struct Script { 40 | pub script_name: String, 41 | pub direction: Option, 42 | pub abugida_default_vowels: Vec, 43 | pub alt_script_names: Vec, 44 | pub languages: Vec, 45 | pub abugida_rule_type: Option, 46 | } 47 | 48 | // #[derive(Default, Debug)] 49 | // struct NumPropDefaults { 50 | // pub value: Option, 51 | // pub num_base: Option, 52 | // pub is_large_power: Option, 53 | // } 54 | 55 | #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] 56 | pub(crate) enum AbugidaRuleType { 57 | A, 58 | AO, 59 | } 60 | 61 | #[derive(Debug, Default)] 62 | pub(crate) struct UromanInner { 63 | pub(crate) rom_rules: RomRules, 64 | pub(crate) scripts: HashMap, 65 | pub(crate) dict_bool: HashMap<(String, String), bool>, 66 | pub(crate) dict_str: HashMap<(String, String), String>, 67 | pub(crate) num_props: HashMap>, 68 | pub(crate) percentage_markers: HashSet, 69 | pub(crate) fraction_connectors: HashSet, 70 | pub(crate) plus_signs: HashSet, 71 | pub(crate) minus_signs: HashSet, 72 | } 73 | 74 | impl UromanInner { 75 | pub(crate) fn new() -> Self { 76 | let mut uroman = Self { 77 | rom_rules: HashMap::with_capacity(42980), 78 | scripts: HashMap::with_capacity(179), 79 | dict_bool: HashMap::with_capacity(44366), 80 | dict_str: HashMap::with_capacity(122770), 81 | num_props: HashMap::with_capacity(1599), 82 | percentage_markers: HashSet::with_capacity(1), 83 | fraction_connectors: HashSet::with_capacity(1), 84 | minus_signs: HashSet::with_capacity(2), 85 | plus_signs: HashSet::new(), 86 | }; 87 | 
uroman.load_resource_files(); 88 | uroman 89 | } 90 | 91 | /// Registers all prefixes of a string `s` for efficient lookup later. 92 | fn register_s_prefix(&mut self, s: &str) { 93 | let mut prefix = String::with_capacity(s.chars().count()); 94 | for c in s.chars() { 95 | prefix.push(c); 96 | self.dict_bool 97 | .insert(("s-prefix".to_string(), prefix.clone()), true); 98 | } 99 | } 100 | 101 | fn load_resource_files(&mut self) { 102 | self.load_rom_file( 103 | include_str!("../data/romanization-auto-table.txt"), 104 | "ud", 105 | "rom", 106 | ); 107 | self.load_rom_file( 108 | include_str!("../data/UnicodeDataOverwrite.txt"), 109 | "ow", 110 | "u2r", 111 | ); 112 | self.load_rom_file( 113 | include_str!("../data/romanization-table.txt"), 114 | "man", 115 | "rom", 116 | ); 117 | self.load_chinese_pinyin_file(include_str!("../data/Chinese_to_Pinyin.txt")); 118 | self.load_script_file(include_str!("../data/Scripts.txt")); 119 | self.load_unicode_data_props(include_str!("../data/UnicodeDataProps.txt")); 120 | self.load_unicode_data_props(include_str!("../data/UnicodeDataPropsCJK.txt")); 121 | self.load_unicode_data_props(include_str!("../data/UnicodeDataPropsHangul.txt")); 122 | self.load_num_props(include_str!("../data/NumProps.jsonl")); 123 | self.add_thai_cancellation_rules(); 124 | } 125 | 126 | /// Loads numerical properties from a JSONL file (e.g., NumProps.jsonl). 
127 | fn load_num_props(&mut self, file_content: &'static str) { 128 | for line in file_content.lines() { 129 | if line.starts_with('#') || line.trim().is_empty() { 130 | continue; 131 | } 132 | 133 | let json: JsonValue = serde_json::from_str(line).unwrap(); 134 | 135 | if let Some(obj) = json.as_object() 136 | && let Some(txt) = obj.get("txt").and_then(|v| v.as_str()) { 137 | let txt_key = txt.to_string(); 138 | 139 | for bool_key in ["is-large-power"] { 140 | if obj.get(bool_key).and_then(|v| v.as_bool()).unwrap_or(false) { 141 | self.dict_bool.insert((bool_key.to_string(), txt_key.clone()), true); 142 | } 143 | } 144 | 145 | let mut prop_map: HashMap = HashMap::new(); 146 | for (key, val) in obj { 147 | match val { 148 | JsonValue::Number(n) => { 149 | if let Some(i) = n.as_i64() { 150 | prop_map.insert(key.clone(), Value::Int(i)); 151 | } else if let Some(f) = n.as_f64() { 152 | prop_map.insert(key.clone(), Value::Float(f)); 153 | } 154 | } 155 | JsonValue::String(s) => { 156 | prop_map.insert(key.clone(), Value::String(s.clone())); 157 | } 158 | JsonValue::Bool(b) => { 159 | prop_map.insert(key.clone(), Value::Int(if *b { 1 } else { 0 })); 160 | } 161 | JsonValue::Array(arr) => { 162 | let mut values = Vec::new(); 163 | for item in arr { 164 | if let Some(i) = item.as_i64() { 165 | values.push(Value::Int(i)); 166 | } 167 | } 168 | prop_map.insert(key.clone(), Value::Array(values)); 169 | } 170 | _ => {} 171 | } 172 | } 173 | 174 | self.num_props.insert(txt_key, prop_map); 175 | } 176 | } 177 | } 178 | 179 | /// Loads Unicode data properties from a file (e.g., UnicodeDataProps.txt). 
180 | fn load_unicode_data_props(&mut self, file: &'static str) { 181 | for line in file.lines() { 182 | if line.starts_with('#') || line.trim().is_empty() { 183 | continue; 184 | } 185 | 186 | if let Some(script_name) = 187 | utils::slot_value_in_double_colon_del_list(line, "script-name") 188 | { 189 | if let Some(chars_str) = utils::slot_value_in_double_colon_del_list(line, "char") { 190 | for c in chars_str.chars() { 191 | self.dict_str.insert( 192 | ("script".to_string(), c.to_string()), 193 | script_name.to_string(), 194 | ); 195 | } 196 | } 197 | if let Some(vowel_sign_str) = 198 | utils::slot_value_in_double_colon_del_list(line, "vowel-sign") 199 | { 200 | for c in vowel_sign_str.chars() { 201 | self.dict_bool 202 | .insert(("is-vowel-sign".to_string(), c.to_string()), true); 203 | } 204 | } 205 | if let Some(medial_consonant_sign_str) = 206 | utils::slot_value_in_double_colon_del_list(line, "medial-consonant-sign") 207 | { 208 | for c in medial_consonant_sign_str.chars() { 209 | self.dict_bool.insert( 210 | ("is-medial-consonant-sign".to_string(), c.to_string()), 211 | true, 212 | ); 213 | } 214 | } 215 | if let Some(virama_str) = 216 | utils::slot_value_in_double_colon_del_list(line, "sign-virama") 217 | { 218 | for c in virama_str.chars() { 219 | self.dict_bool 220 | .insert(("is-virama".to_string(), c.to_string()), true); 221 | } 222 | } 223 | } 224 | } 225 | } 226 | 227 | /// Loads a script definition file (e.g., Scripts.txt). 
228 | fn load_script_file(&mut self, file: &'static str) { 229 | for line in file.lines() { 230 | if line.starts_with('#') || line.trim().is_empty() { 231 | continue; 232 | } 233 | 234 | if let Some(script_name) = 235 | utils::slot_value_in_double_colon_del_list(line, "script-name") 236 | { 237 | let lc_script_name = script_name.to_lowercase(); 238 | if self.scripts.contains_key(&lc_script_name) { 239 | // Handle duplicate script names (Python version warns and ignores) 240 | continue; 241 | } 242 | 243 | let direction = utils::slot_value_in_double_colon_del_list(line, "direction") 244 | .map(|s| s.to_string()); 245 | let abugida_default_vowel_s = 246 | utils::slot_value_in_double_colon_del_list(line, "abugida-default-vowel") 247 | .unwrap_or(""); 248 | let abugida_default_vowels = if abugida_default_vowel_s.is_empty() { 249 | vec![] 250 | } else { 251 | abugida_default_vowel_s 252 | .split([',', ';']) 253 | .map(|s| s.trim().to_string()) 254 | .collect() 255 | }; 256 | let alt_script_name_s = 257 | utils::slot_value_in_double_colon_del_list(line, "alt-script-name") 258 | .unwrap_or(""); 259 | let alt_script_names = if alt_script_name_s.is_empty() { 260 | vec![] 261 | } else { 262 | alt_script_name_s 263 | .split([',', ';']) 264 | .map(|s| s.trim().to_string()) 265 | .collect() 266 | }; 267 | let language_s = 268 | utils::slot_value_in_double_colon_del_list(line, "language").unwrap_or(""); 269 | let languages = if language_s.is_empty() { 270 | vec![] 271 | } else { 272 | language_s 273 | .split([',', ';']) 274 | .map(|s| s.trim().to_string()) 275 | .collect() 276 | }; 277 | 278 | let abugida_rule_type = if !abugida_default_vowels.is_empty() { 279 | let vowels_regex1 = abugida_default_vowels.join("|"); 280 | let vowels_regex2 = abugida_default_vowels 281 | .iter() 282 | .map(|v| format!("{v}+")) 283 | .collect::>() 284 | .join("|"); 285 | 286 | let abugida_rule_type = match (vowels_regex1.as_str(), vowels_regex2.as_str()) { 287 | ("a|o", "a+|o+") => 
AbugidaRuleType::AO, 288 | ("a", "a+") => AbugidaRuleType::A, 289 | _ => unreachable!() 290 | }; 291 | 292 | Some(abugida_rule_type) 293 | } else { 294 | None 295 | }; 296 | 297 | let new_script = Script { 298 | script_name: script_name.to_string(), 299 | direction, 300 | abugida_default_vowels, 301 | alt_script_names: alt_script_names.clone(), 302 | languages: languages.clone(), 303 | abugida_rule_type, 304 | }; 305 | 306 | self.scripts.insert(lc_script_name, new_script.clone()); 307 | 308 | for alt_script_name in alt_script_names { 309 | self.scripts 310 | .insert(alt_script_name.to_lowercase(), new_script.clone()); 311 | } 312 | } 313 | } 314 | } 315 | 316 | fn load_rom_file(&mut self, file: &'static str, provenance: &str, file_format: &str) { 317 | for line in file.lines() { 318 | if line.starts_with('#') || line.trim().is_empty() { 319 | continue; 320 | } 321 | 322 | if file_format == "u2r" { 323 | let u_str = match slot_value_in_double_colon_del_list(line, "u") { 324 | Some(s) => s, 325 | None => continue, 326 | }; 327 | 328 | let s = match u32::from_str_radix(u_str, 16) 329 | .ok() 330 | .and_then(std::char::from_u32) 331 | { 332 | Some(c) => c, 333 | None => continue, 334 | }; 335 | 336 | if let Some(tone_mark) = slot_value_in_double_colon_del_list(line, "tone-mark") { 337 | self.dict_str.insert( 338 | ("tone-mark".to_string(), s.to_string()), 339 | tone_mark.to_string(), 340 | ); 341 | } 342 | 343 | if let Some(syllable_info) = 344 | slot_value_in_double_colon_del_list(line, "syllable-info") 345 | { 346 | self.dict_str.insert( 347 | ("syllable-info".to_string(), s.to_string()), 348 | syllable_info.to_string(), 349 | ); 350 | } 351 | 352 | if let Some(syllable_info) = slot_value_in_double_colon_del_list(line, "pic") { 353 | self.dict_str.insert( 354 | ("pic".to_string(), s.to_string()), 355 | syllable_info.to_string(), 356 | ); 357 | } 358 | 359 | if let Some(syllable_info) = slot_value_in_double_colon_del_list(line, "name") { 360 | self.dict_str.insert( 
361 | ("name".to_string(), s.to_string()), 362 | syllable_info.to_string(), 363 | ); 364 | } 365 | 366 | if let Some(rule) = RomRule::from_line(line, provenance, file_format, self) { 367 | self.add_rom_rule(rule); 368 | } 369 | } else if let Some(rule) = RomRule::from_line(line, provenance, file_format, self) { 370 | self.add_rom_rule(rule); 371 | } 372 | } 373 | } 374 | 375 | fn add_rom_rule(&mut self, rule: RomRule) { 376 | if rule.is_minus_sign { 377 | self.minus_signs.insert(rule.s.clone()); 378 | } 379 | if rule.is_plus_sign { 380 | self.plus_signs.insert(rule.s.clone()); 381 | } 382 | if rule.fraction_connector { 383 | self.fraction_connectors.insert(rule.s.clone()); 384 | } 385 | if rule.percentage_marker { 386 | self.percentage_markers.insert(rule.s.clone()); 387 | } 388 | 389 | if rule.is_large_power { 390 | self.dict_bool 391 | .insert(("is-large-power".to_string(), rule.s.clone()), true); 392 | } 393 | 394 | self.register_s_prefix(&rule.s); 395 | 396 | let old_rules = self.rom_rules.entry(rule.s.clone()).or_default(); 397 | 398 | // Python: `and not (lcodes or ...)` 399 | let is_unconditional = rule.is_unconditional(); 400 | 401 | let should_overwrite = old_rules.len() == 1 && { 402 | let old_rule = &old_rules[0]; 403 | (old_rule.prov == "ud" || old_rule.prov == "ow") && is_unconditional 404 | }; 405 | 406 | // println!( 407 | // "LOAD: s='{}', prov='{}', is_uncond={}, should_ow={}", 408 | // rule.s, rule.prov, is_unconditional, should_overwrite 409 | // ); 410 | 411 | if should_overwrite { 412 | *old_rules = vec![rule]; 413 | } else { 414 | old_rules.push(rule); 415 | } 416 | } 417 | 418 | /// Loads and processes the Chinese to Pinyin mapping file. 
419 | fn load_chinese_pinyin_file(&mut self, file: &'static str) { 420 | for line in file.lines() { 421 | if line.starts_with('#') || line.trim().is_empty() { 422 | continue; 423 | } 424 | 425 | if let Some((chinese, pinyin_with_accent)) = line.split_once(char::is_whitespace) { 426 | // `de_accent_pinyin` logic: NFD decomposition to separate base chars and accents. 427 | let rom: String = pinyin_with_accent 428 | .nfd() 429 | .filter(|c| { 430 | !matches!( 431 | c.general_category_group(), 432 | unicode_properties::GeneralCategoryGroup::Mark 433 | ) 434 | }) 435 | .collect::() 436 | .replace('ü', "u"); 437 | 438 | let rule = RomRule::new_simple(chinese.to_string(), &rom, "rom pinyin"); 439 | self.rom_rules 440 | .entry(chinese.to_string()) 441 | .or_default() 442 | .push(rule); 443 | self.register_s_prefix(chinese); 444 | } 445 | } 446 | } 447 | 448 | // /// Retrieves the numerical properties for a given character. 449 | // /// 450 | // /// This method looks up the character in the `num_props` map. 451 | // /// If the character is not found, it returns default values. 452 | // fn get_num_props(&self, c: char) -> NumPropDefaults { 453 | // self.num_props 454 | // .get(&c) 455 | // .map_or_else(NumPropDefaults::default, |props| { 456 | // let value = props.get("value").and_then(|v| match v { 457 | // Value::Int(i) => Some(*i as f64), 458 | // Value::Float(f) => Some(*f), 459 | // _ => None, 460 | // }); 461 | // let num_base = props.get("base").and_then(|v| match v { 462 | // Value::Int(i) => Some(*i), 463 | // _ => None, 464 | // }); 465 | // let is_large_power = props.get("is-large-power").and_then(|v| match v { 466 | // Value::Int(1) => Some(true), 467 | // _ => None, 468 | // }); 469 | 470 | // NumPropDefaults { 471 | // value, 472 | // num_base, 473 | // is_large_power, 474 | // } 475 | // }) 476 | // } 477 | 478 | /// A helper to get a string value from `dict_str`, returning `""` if not found. 
479 | pub(crate) fn dict_str_get(&self, k1: &str, k2_char: char) -> &str { 480 | self.dict_str 481 | .get(&(k1.to_string(), k2_char.to_string())) 482 | .map(|s| s.as_str()) // Option<&String> -> Option<&str> 483 | .unwrap_or("") // None -> "" 484 | } 485 | 486 | /// A helper to get a boolean value from `dict_bool`, returning `false` if not found. 487 | /// This mimics the behavior of Python's `defaultdict(bool)`. 488 | pub(crate) fn dict_bool_get(&self, k1: &str, k2: &str) -> bool { 489 | self.dict_bool 490 | .get(&(k1.to_string(), k2.to_string())) 491 | .copied() 492 | .unwrap_or(false) 493 | } 494 | 495 | pub(crate) fn second_rom_filter(&self, c: &str, rom: Option<&str>) -> Option { 496 | if c.is_empty() { 497 | return rom.map(|s| s.to_string()); 498 | } 499 | 500 | let rom_str = match rom { 501 | Some(r) if r.contains(' ') => r, 502 | _ => return rom.map(|s| s.to_string()), 503 | }; 504 | 505 | let name = self.chr_name(c.chars().next().unwrap()); 506 | 507 | if name.contains("MYANMAR VOWEL SIGN KAYAH") 508 | && let Some(cap) = KAYAH_RE.captures(rom_str) 509 | { 510 | return Some(cap.get(1).unwrap().as_str().to_string()); 511 | } 512 | if name.contains("MENDE KIKAKUI SYLLABLE") 513 | && let Some(cap) = MENDE_RE.captures(rom_str) 514 | { 515 | return Some(cap.get(1).unwrap().as_str().to_string()); 516 | } 517 | if SPACE_RE.is_match(rom_str) { 518 | return Some(c.to_string()); 519 | } 520 | 521 | rom.map(|s| s.to_string()) 522 | } 523 | 524 | // /// Gets the numeric value of a character from the loaded `num_props` data. 525 | // /// This is the correct replacement for Python's `unicodedata.numeric()`. 526 | // /// 527 | // /// It looks up the character, then the "value" key, and converts the result to `f64`. 
528 | // fn get_numeric_value(&self, c: char) -> Option { 529 | // self.num_props 530 | // .get(&c) 531 | // .and_then(|props| props.get("value")) 532 | // .and_then(|val| match val { 533 | // Value::Int(i) => Some(*i as f64), 534 | // Value::Float(f) => Some(*f), 535 | // _ => None, 536 | // }) 537 | // } 538 | 539 | /// Checks if a character is a non-spacing mark. 540 | pub(crate) fn char_is_nonspacing_mark(&self, c: char) -> bool { 541 | use unicode_properties::UnicodeGeneralCategory; 542 | matches!( 543 | c.general_category(), 544 | unicode_properties::GeneralCategory::NonspacingMark 545 | ) 546 | } 547 | 548 | // /// Checks if a character is a format control character. 549 | // fn char_is_format_char(&self, c: char) -> bool { 550 | // use unicode_properties::UnicodeGeneralCategory; 551 | // matches!( 552 | // c.general_category(), 553 | // unicode_properties::GeneralCategory::Format 554 | // ) 555 | // } 556 | 557 | pub(crate) fn chr_name(&self, c: char) -> String { 558 | // Check for an overridden name in dict_str. 559 | if let Some(name) = self.dict_str.get(&("name".to_string(), c.to_string())) { 560 | return name.clone(); 561 | } 562 | unicode_names2::name(c) 563 | .map(|n| n.to_string()) 564 | .unwrap_or_default() 565 | } 566 | 567 | /// Converts a Korean Hangul character to its Latin alphabet representation. 568 | /// 569 | /// This is a special algorithmic romanization that decomposes a Hangul syllable 570 | /// into its constituent Jamo (lead, vowel, tail) and maps them to roman characters. 571 | /// The results are cached for performance. 
572 | pub(crate) fn unicode_hangul_romanization(&self, c: char) -> Option { 573 | let cp = c as u32; 574 | 575 | if !(0xAC00..=0xD7A3).contains(&cp) { 576 | return None; 577 | } 578 | 579 | let code = cp - 0xAC00; 580 | 581 | let lead_index = (code / (28 * 21)) as usize; 582 | let vowel_index = ((code / 28) % 21) as usize; 583 | let tail_index = (code % 28) as usize; 584 | 585 | let rom = format!( 586 | "{}{}{}", 587 | HANGUL_LEADS[lead_index], HANGUL_VOWELS[vowel_index], HANGUL_TAILS[tail_index] 588 | ); 589 | 590 | let rom = rom.replace('-', ""); 591 | 592 | Some(rom) 593 | } 594 | 595 | // fn unicode_hangul_romanization_str(&mut self, s: &str, pass_through_p: bool) -> String { 596 | // let mut result = String::new(); 597 | // for c in s.chars() { 598 | // if let Some(rom) = self.unicode_hangul_romanization(c) { 599 | // result.push_str(&rom); 600 | // } else if pass_through_p { 601 | // result.push(c); 602 | // } 603 | // } 604 | // result 605 | // } 606 | 607 | /// Returns the script name of a character. 608 | /// 609 | /// This is derived from `UnicodeDataProps*.txt` and stored in `dict_str`. 610 | /// Returns an empty string if not found. 611 | pub(crate) fn chr_script_name(&self, c: char) -> String { 612 | self.dict_str 613 | .get(&("script".to_string(), c.to_string())) 614 | .cloned() 615 | .unwrap_or_default() 616 | } 617 | 618 | /// Adds automatic cancellation rules for the Thai script. 619 | /// 620 | /// This method programmatically generates rules to handle the Thai character 621 | /// THANTHAKHAT (`\u0E4C`), which indicates that the preceding character(s) 622 | /// should not be pronounced (and thus not romanized). 
fn add_thai_cancellation_rules(&mut self) {
    /// Gives `seq` an empty romanization unless some rule for it already exists.
    fn cancel_unless_present(inner: &mut UromanInner, seq: String, provenance: &'static str) {
        let slot = inner.rom_rules.entry(seq.clone()).or_default();
        if slot.is_empty() {
            slot.push(RomRule::new_simple(seq.clone(), "", provenance));
            inner.register_s_prefix(&seq);
        }
    }

    let thanthakhat = '\u{0E4C}';

    // Any single code point U+0E01..U+0E4B followed by the cancellation mark.
    for letter in (0x0E01..0x0E4C).filter_map(std::char::from_u32) {
        cancel_unless_present(self, format!("{letter}{thanthakhat}"), "auto cancel letter");
    }

    // Consonant (U+0E01..U+0E2E) + vowel modifier + cancellation mark.
    let vowel_modifiers: Vec<char> = ['\u{0E31}', '\u{0E47}']
        .into_iter()
        .chain((0x0E33..=0x0E3B).filter_map(std::char::from_u32))
        .collect();

    for consonant in (0x0E01..0x0E2F).filter_map(std::char::from_u32) {
        for &modifier in &vowel_modifiers {
            cancel_unless_present(
                self,
                format!("{consonant}{modifier}{thanthakhat}"),
                "auto cancel syllable",
            );
        }
    }
}
}
--------------------------------------------------------------------------------