├── BRZO ├── .ocp-indent ├── .merlin ├── .gitignore ├── test ├── link_test.ml ├── examples.ml ├── perf.ml └── test_uucp.ml ├── pkg ├── META └── pkg.ml ├── _tags ├── doc └── index.mld ├── src ├── uucp__alpha.ml ├── uucp__case_nfkc.ml ├── uucp__white.ml ├── uucp__case_nfkc_simple.ml ├── uucp_version_data.ml ├── uucp__hangul.ml ├── uucp__gc.ml ├── uucp__block.ml ├── uucp__case_fold.ml ├── uucp__case_fold.mli ├── uucp__alpha.mli ├── uucp__case_nfkc.mli ├── uucp_hangul_base.ml ├── uucp__white.mli ├── uucp__age.ml ├── uucp__case_nfkc_simple.mli ├── uucp__script.ml ├── uucp__case_map.ml ├── uucp_name_base.ml ├── uucp__num.ml ├── uucp__age.mli ├── uucp__case.ml ├── uucp__hangul.mli ├── uucp__gen.ml ├── uucp__cjk.ml ├── uucp__name.ml ├── uucp__emoji.ml ├── uucp__case_map.mli ├── uucp_num_base.ml ├── uucp__gc.mli ├── uucp.mllib ├── uucp.ml ├── uucp__id.ml ├── uucp__name.mli ├── uucp_gc_base.ml ├── uucp__func.ml ├── uucp__gen.mli ├── uucp__emoji.mli ├── uucp__num.mli ├── uucp__cjk.mli ├── uucp_fmt.ml ├── uucp_tmap.ml ├── uucp_tmapbyte.ml ├── uucp__id.mli ├── uucp_cmap.ml ├── uucp__func.mli ├── uucp_tmapbool.ml ├── uucp__break.ml ├── uucp_rmap.ml ├── uucp_white_data.ml ├── uucp__script.mli ├── uucp_tmap5bytes.ml ├── uucp__case.mli ├── uucp_break_base.ml ├── uucp.mli ├── uucp_script_base.ml ├── uucp__block.mli ├── uucp__break.mli └── uucp_gen_data.ml ├── support ├── gen_alpha.ml ├── gen_white.ml ├── gen_case_fold.ml ├── gen_case.ml ├── gen_gc.ml ├── gen_gen.ml ├── gen_hangul.ml ├── gen_cjk.ml ├── gen_emoji.ml ├── gen_age.ml ├── gen_id.ml ├── gen_func.ml ├── gen_case_map.ml ├── gen_case_nfkc_simple.ml ├── gen_case_nfkc.ml ├── gen_script.ml ├── gen_block.ml ├── gen_num.ml ├── gen_break.ml ├── gen_name.ml ├── generate_data.ml └── gen.ml ├── LICENSE.md ├── DEVEL.md ├── README.md ├── opam ├── CHANGES.md └── B0.ml /BRZO: -------------------------------------------------------------------------------- 1 | (srcs-x pkg support test) -------------------------------------------------------------------------------- /.ocp-indent: -------------------------------------------------------------------------------- 1 | strict_with=always,match_clause=4,strict_else=never -------------------------------------------------------------------------------- /.merlin: -------------------------------------------------------------------------------- 1 | PKG uucd uunf cmdliner b0.kit 2 | S src 3 | S test 4 | S support 5 | B _b0/b/** 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _b0 2 | _build 3 | tmp 4 | *~ 5 | \.\#* 6 | \#*# 7 | support/ucd.xml 8 | *.native 9 | *.byte 10 | *.install 11 | -------------------------------------------------------------------------------- /test/link_test.ml: -------------------------------------------------------------------------------- 1 | (* 2 | Compile with: 3 | ocamlfind ocamlopt -package uucp -linkpkg -o link_test.native link_test.ml 4 | *) 5 | 6 | let () = ignore (Uucp.Age.age (Uchar.of_int 0x1F42B)) 7 | -------------------------------------------------------------------------------- /pkg/META: -------------------------------------------------------------------------------- 1 | description = "Unicode character properties" 2 | version = "%%VERSION_NUM%%" 3 | requires = "" 4 | archive(byte) = "uucp.cma" 5 | archive(native) = "uucp.cmxa" 6 | plugin(byte) = "uucp.cma" 7 | plugin(native) = "uucp.cmxs" 8 | -------------------------------------------------------------------------------- /_tags: -------------------------------------------------------------------------------- 1 | true : bin_annot, safe_string 2 | 3 | <_b0> : -traverse 4 | : include 5 | : no_alias_deps 6 | : include 7 | : package(uucd unix) 8 | 9 | : include 10 | : package(uucd) 11 | : package(uunf) 12 | : package(uunf cmdliner) -------------------------------------------------------------------------------- /doc/index.mld: -------------------------------------------------------------------------------- 1 | {0 Uucp {%html: %%VERSION%%%}} 2 | 3 | Uucp provides efficient access to a selection of character properties 4 | of the Unicode character database. 5 | 6 | Consult {{!page-unicode}this page} for a minimal Unicode introduction 7 | and OCaml Unicode tips. 8 | 9 | {1:library_uucp Library [uucp]} 10 | 11 | {!modules: 12 | Uucp 13 | } 14 | -------------------------------------------------------------------------------- /src/uucp__alpha.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let is_alphabetic u = 7 | Uucp_tmapbool.get Uucp_alpha_data.alphabetic_map (Uchar.to_int u) 8 | -------------------------------------------------------------------------------- /src/uucp__case_nfkc.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2013 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let fold u = 7 | Uucp_tmap.get Uucp_case_nfkc_data.nfkc_fold_map_map (Uchar.to_int u) 8 | -------------------------------------------------------------------------------- /src/uucp__white.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let is_white_space u = 7 | Uucp_tmapbool.get Uucp_white_data.white_space_map (Uchar.to_int u) 8 | -------------------------------------------------------------------------------- /src/uucp__case_nfkc_simple.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2013 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let fold u = 7 | Uucp_tmap.get 8 | Uucp_case_nfkc_simple_data.nfkc_simple_fold_map_map (Uchar.to_int u) 9 | -------------------------------------------------------------------------------- /src/uucp_version_data.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2025 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* WARNING do not edit. This file was automatically generated. *) 7 | 8 | let unicode_version = "17.0.0" 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/uucp__hangul.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2018 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | include Uucp_hangul_base 7 | 8 | let syllable_type u = 9 | Uucp_rmap.get Uucp_hangul_data.syllable_type_map (Uchar.to_int u) 10 | -------------------------------------------------------------------------------- /src/uucp__gc.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | include Uucp_gc_base 7 | 8 | let compare = compare 9 | let general_category u = 10 | Uucp_rmap.get Uucp_gc_data.general_category_map (Uchar.to_int u) 11 | -------------------------------------------------------------------------------- /support/gen_alpha.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_props ppf ucd = 7 | Gen.pp_prop_tmapbool_ucd ppf ucd Uucd.alphabetic "alphabetic"; 8 | () 9 | 10 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 11 | -------------------------------------------------------------------------------- /src/uucp__block.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | include Uucp_block_base 7 | 8 | let compare = compare 9 | let blocks = Uucp_block_data.block_list 10 | let block u = Uucp_rmap.get Uucp_block_data.block_map (Uchar.to_int u) 11 | -------------------------------------------------------------------------------- /src/uucp__case_fold.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2013 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let untagged_get m u = match Uucp_tmap.get m u with 7 | | [] -> `Self | us -> `Uchars us 8 | 9 | let fold u = untagged_get Uucp_case_fold_data.fold_map_map (Uchar.to_int u) 10 | -------------------------------------------------------------------------------- /support/gen_white.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_props ppf ucd = 7 | let prop = Gen.pp_prop_tmapbool_ucd ppf ucd in 8 | prop Uucd.white_space "white_space"; 9 | () 10 | 11 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 12 | -------------------------------------------------------------------------------- /support/gen_case_fold.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_props ppf ucd = 7 | let map prop pname = Gen_case_map.pp_map_untagged_prop prop pname ppf ucd in 8 | map Uucd.case_folding "fold_map"; 9 | () 10 | 11 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 12 | -------------------------------------------------------------------------------- /src/uucp__case_fold.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2013 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Case folding. *) 7 | 8 | (** {1:casefolding Case folding} *) 9 | 10 | val fold : Uchar.t -> [ `Self | `Uchars of Uchar.t list ] 11 | (** [fold u] is [u]'s 12 | {{:http://www.unicode.org/reports/tr44/#Case_Folding}Case_Folding} 13 | property. *) 14 | -------------------------------------------------------------------------------- /src/uucp__alpha.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Alphabetic property. *) 7 | 8 | (** {1:alphaprop Alphabetic property} *) 9 | 10 | val is_alphabetic : Uchar.t -> bool 11 | (** [is_alphabetic u] is [true] if [u] has the 12 | {{:http://www.unicode.org/reports/tr44/#Alphabetic}Alphabetic} 13 | property. *) 14 | -------------------------------------------------------------------------------- /src/uucp__case_nfkc.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2013 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** NFKC case folding. *) 7 | 8 | (** {1:nfkcfold NFKC Case folding} *) 9 | 10 | val fold : Uchar.t -> [ `Self | `Uchars of Uchar.t list ] 11 | (** [fold u] is [u]'s 12 | {{:http://www.unicode.org/reports/tr44/#NFKC_Casefold}NFKC_Casefold} 13 | property. *) 14 | -------------------------------------------------------------------------------- /src/uucp_hangul_base.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2018 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | type syllable_type = [ `L | `V | `T | `LV | `LVT | `NA ] 7 | 8 | let pp_syllable_type ppf v = Format.fprintf ppf "%s" begin match v with 9 | | `NA -> "NA" 10 | | `L -> "L" 11 | | `V -> "V" 12 | | `T -> "T" 13 | | `LV -> "LV" 14 | | `LVT -> "LVT" 15 | end 16 | -------------------------------------------------------------------------------- /src/uucp__white.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** White space property. *) 7 | 8 | (** {1:whiteprop White space property} *) 9 | 10 | val is_white_space : Uchar.t -> bool 11 | (** [is_white_space u] is [true] if [u] has the 12 | {{:http://www.unicode.org/reports/tr44/#White_Space}White_Space} 13 | property. *) 14 | -------------------------------------------------------------------------------- /src/uucp__age.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | type t = [ `Unassigned | `Version of int * int ] 7 | 8 | let compare = compare 9 | let pp ppf = function 10 | | `Version (maj,min) -> Format.fprintf ppf "%d.%d" maj min 11 | | `Unassigned -> Format.fprintf ppf "unassigned" 12 | 13 | let age u = Uucp_rmap.get Uucp_age_data.age_map (Uchar.to_int u) 14 | -------------------------------------------------------------------------------- /support/gen_case.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_props ppf ucd = 7 | let prop = Gen.pp_prop_tmapbool_ucd ppf ucd in 8 | prop Uucd.uppercase "upper"; 9 | prop Uucd.lowercase "lower"; 10 | prop Uucd.cased "cased"; 11 | prop Uucd.case_ignorable "case_ignorable"; 12 | () 13 | 14 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 15 | -------------------------------------------------------------------------------- /src/uucp__case_nfkc_simple.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2013 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** NFKC simple case folding. *) 7 | 8 | (** {1:nfkcfold NFKC simple case folding} *) 9 | 10 | val fold : Uchar.t -> [ `Self | `Uchars of Uchar.t list ] 11 | (** [fold u] is [u]'s 12 | {{:http://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold} 13 | NFKC_Simple_Casefold}property. *) 14 | -------------------------------------------------------------------------------- /src/uucp__script.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | include Uucp_script_base 7 | 8 | let compare = compare 9 | let script u = Uucp_rmap.get Uucp_script_data.script_map (Uchar.to_int u) 10 | let script_extensions u = 11 | match Uucp_rmap.get Uucp_script_data.script_extensions_map (Uchar.to_int u) 12 | with 13 | | [] -> [ script u ] 14 | | scripts -> scripts 15 | -------------------------------------------------------------------------------- /src/uucp__case_map.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2013 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let untagged_get m u = match Uucp_tmap.get m u with 7 | | [] -> `Self | us -> `Uchars us 8 | 9 | let to_upper u = untagged_get Uucp_case_map_data.upper_map_map (Uchar.to_int u) 10 | let to_lower u = untagged_get Uucp_case_map_data.lower_map_map (Uchar.to_int u) 11 | let to_title u = untagged_get Uucp_case_map_data.title_map_map (Uchar.to_int u) 12 | -------------------------------------------------------------------------------- /src/uucp_name_base.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | 7 | type alias_tag = 8 | [ `Abbreviation | `Alternate | `Control | `Correction | `Figment ] 9 | 10 | let pp_alias_tag ppf t = Format.fprintf ppf "%s" begin match t with 11 | | `Abbreviation -> "Abbreviation" 12 | | `Alternate -> "Alternate" 13 | | `Control -> "Control" 14 | | `Correction -> "Correction" 15 | | `Figment -> "Figment" 16 | end 17 | -------------------------------------------------------------------------------- /support/gen_gc.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_general_category ppf ucd = 7 | let size v = 0 in 8 | let pp_gc ppf v = Gen.pp ppf "`%a" Uucp_gc_base.pp v in 9 | Gen.pp_prop_rmap_ucd ~share:false ppf ucd 10 | Uucd.general_category "general_category" "Uucp_gc_base.t" 11 | pp_gc ~default:`Cn size 12 | 13 | let pp_props ppf ucd = 14 | pp_general_category ppf ucd; 15 | () 16 | 17 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 18 | -------------------------------------------------------------------------------- /src/uucp__num.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | include Uucp_num_base 7 | 8 | let is_ascii_hex_digit u = 9 | Uucp_tmapbool.get Uucp_num_data.ascii_hex_digit_map (Uchar.to_int u) 10 | 11 | let is_hex_digit u = 12 | Uucp_tmapbool.get Uucp_num_data.hex_digit_map (Uchar.to_int u) 13 | 14 | let numeric_type u = 15 | Uucp_rmap.get Uucp_num_data.numeric_type_map (Uchar.to_int u) 16 | 17 | let numeric_value u = 18 | Uucp_cmap.get Uucp_num_data.numeric_value_map (Uchar.to_int u) 19 | -------------------------------------------------------------------------------- /support/gen_gen.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_props ppf ucd = 7 | let prop = Gen.pp_prop_tmapbool_ucd ppf ucd in 8 | prop Uucd.default_ignorable_code_point "default_ignorable"; 9 | prop Uucd.deprecated "deprecated"; 10 | prop Uucd.logical_order_exception "logical_order_exception"; 11 | prop Uucd.noncharacter_code_point "non_character"; 12 | prop Uucd.variation_selector "variation_selector"; 13 | () 14 | 15 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 16 | -------------------------------------------------------------------------------- /support/gen_hangul.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2018 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_syllable_type ppf ucd = 7 | let size v = 0 in 8 | let pp_stype ppf t = Gen.pp ppf "`%a" Uucp_hangul_base.pp_syllable_type t in 9 | Gen.pp_prop_rmap_ucd ~share:false ppf ucd Uucd.hangul_syllable_type 10 | "syllable_type" "Uucp_hangul_base.syllable_type" pp_stype 11 | ~default:`NA size 12 | 13 | let pp_props ppf ucd = 14 | pp_syllable_type ppf ucd; 15 | () 16 | 17 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 18 | -------------------------------------------------------------------------------- /support/gen_cjk.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_props ppf ucd = 7 | let prop = Gen.pp_prop_tmapbool_ucd ppf ucd in 8 | prop Uucd.ideographic "ideographic"; 9 | prop Uucd.ids_unary_operator "ids_unary_operator"; 10 | prop Uucd.ids_binary_operator "ids_binary_operator"; 11 | prop Uucd.ids_trinary_operator "ids_trinary_operator"; 12 | prop Uucd.radical "radical"; 13 | prop Uucd.unified_ideograph "unified_ideograph"; 14 | () 15 | 16 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 17 | -------------------------------------------------------------------------------- /support/gen_emoji.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2020 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_props ppf ucd = 7 | let prop = Gen.pp_prop_tmapbool_ucd ppf ucd in 8 | prop Uucd.emoji "emoji"; 9 | prop Uucd.emoji_presentation "emoji_presentation"; 10 | prop Uucd.emoji_modifier "emoji_modifier"; 11 | prop Uucd.emoji_modifier_base "emoji_modifier_base"; 12 | prop Uucd.emoji_component "emoji_component"; 13 | prop Uucd.extended_pictographic "extended_pictographic"; 14 | () 15 | 16 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 17 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 The uucp programmers 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 | -------------------------------------------------------------------------------- /src/uucp__age.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Age property. *) 7 | 8 | (** {1:ageprop Age property} *) 9 | 10 | type t = [ `Unassigned | `Version of int * int ] 11 | (** The type for character age. *) 12 | 13 | val compare : t -> t -> int 14 | (** [compare a a'] is [Stdlib.compare a a'] *) 15 | 16 | val pp : Format.formatter -> t -> unit 17 | (** [pp ppf a] prints an unspecified representation of [a] on [ppf]. *) 18 | 19 | val age : Uchar.t -> t 20 | (** [age u] is [u]'s 21 | {{:http://www.unicode.org/reports/tr44/#Age}Age} property. *) 22 | -------------------------------------------------------------------------------- /support/gen_age.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_age ppf ucd = 7 | let size = function 8 | | `Unassigned -> 0 9 | | `Version _ -> 3 10 | in 11 | let pp_age ppf = function 12 | | `Unassigned -> Gen.pp ppf "`Unassigned" 13 | | `Version (maj, min) -> Gen.pp ppf "(`Version (%d,%d))" maj min 14 | in 15 | Gen.pp_prop_rmap_ucd ppf ucd 16 | Uucd.age "age" "[ `Unassigned | `Version of int * int ]" pp_age 17 | ~default:`Unassigned size 18 | 19 | let pp_props ppf ucd = 20 | pp_age ppf ucd; 21 | () 22 | 23 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 24 | -------------------------------------------------------------------------------- /support/gen_id.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_props ppf ucd = 7 | let prop = Gen.pp_prop_tmapbool_ucd ppf ucd in 8 | prop Uucd.id_start "id_start"; 9 | prop Uucd.id_continue "id_continue"; 10 | prop Uucd.xid_start "xid_start"; 11 | prop Uucd.xid_continue "xid_continue"; 12 | prop Uucd.id_compat_math_start "id_compat_math_start"; 13 | prop Uucd.id_compat_math_continue "id_compat_math_continue"; 14 | prop Uucd.pattern_syntax "pattern_syntax"; 15 | prop Uucd.pattern_white_space "pattern_white_space"; 16 | () 17 | 18 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 19 | -------------------------------------------------------------------------------- /src/uucp__case.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2013 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Case properties *) 7 | 8 | let is_upper u = Uucp_tmapbool.get Uucp_case_data.upper_map (Uchar.to_int u) 9 | let is_lower u = Uucp_tmapbool.get Uucp_case_data.lower_map (Uchar.to_int u) 10 | let is_cased u = Uucp_tmapbool.get Uucp_case_data.cased_map (Uchar.to_int u) 11 | let is_case_ignorable u = 12 | Uucp_tmapbool.get Uucp_case_data.case_ignorable_map (Uchar.to_int u) 13 | 14 | (* Case mappings *) 15 | 16 | module Map = Uucp__case_map 17 | module Fold = Uucp__case_fold 18 | module Nfkc_fold = Uucp__case_nfkc 19 | module Nfkc_simple_fold = Uucp__case_nfkc_simple 20 | -------------------------------------------------------------------------------- /src/uucp__hangul.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2018 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Hangul properties. *) 7 | 8 | (** {1:hangul_syllable_type Hangul syllable type property} *) 9 | 10 | type syllable_type = [ `L | `V | `T | `LV | `LVT | `NA ] 11 | (** The type for hangul syllable types. *) 12 | 13 | val pp_syllable_type : Format.formatter -> syllable_type -> unit 14 | (** [pp_syllable_type ppf s] prints an unspecified representation of 15 | [s] on [ppf]. *) 16 | 17 | val syllable_type : Uchar.t -> syllable_type 18 | (** [syllable_type u] is [u]'s 19 | {{:http://www.unicode.org/reports/tr44/#Hangul_Syllable_Type} 20 | Hangul_Syllable_type} property. *) 21 | -------------------------------------------------------------------------------- /src/uucp__gen.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* General properties *) 7 | 8 | let is_default_ignorable u = 9 | Uucp_tmapbool.get Uucp_gen_data.default_ignorable_map (Uchar.to_int u) 10 | 11 | let is_deprecated u = 12 | Uucp_tmapbool.get Uucp_gen_data.deprecated_map (Uchar.to_int u) 13 | 14 | let is_logical_order_exception u = 15 | Uucp_tmapbool.get Uucp_gen_data.logical_order_exception_map (Uchar.to_int u) 16 | 17 | let is_non_character u = 18 | Uucp_tmapbool.get Uucp_gen_data.non_character_map (Uchar.to_int u) 19 | 20 | let is_variation_selector u = 21 | Uucp_tmapbool.get Uucp_gen_data.variation_selector_map (Uchar.to_int u) 22 | -------------------------------------------------------------------------------- /support/gen_func.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_props ppf ucd = 7 | let prop = Gen.pp_prop_tmapbool_ucd ppf ucd in 8 | prop Uucd.dash "dash"; 9 | prop Uucd.diacritic "diacritic"; 10 | prop Uucd.extender "extender"; 11 | prop Uucd.grapheme_base "grapheme_base"; 12 | prop Uucd.grapheme_extend "grapheme_extend"; 13 | prop Uucd.math "math"; 14 | prop Uucd.quotation_mark "quotation_mark"; 15 | prop Uucd.soft_dotted "soft_dotted"; 16 | prop Uucd.terminal_punctuation "terminal_punctuation"; 17 | prop Uucd.regional_indicator "regional_indicator"; 18 | prop Uucd.join_control "join_control"; 19 | () 20 | 21 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 22 | -------------------------------------------------------------------------------- /src/uucp__cjk.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let is_ideographic u = 7 | Uucp_tmapbool.get Uucp_cjk_data.ideographic_map (Uchar.to_int u) 8 | 9 | let is_ids_unary_operator u = 10 | Uucp_tmapbool.get Uucp_cjk_data.ids_unary_operator_map (Uchar.to_int u) 11 | 12 | let is_ids_binary_operator u = 13 | Uucp_tmapbool.get Uucp_cjk_data.ids_binary_operator_map (Uchar.to_int u) 14 | 15 | let is_ids_trinary_operator u = 16 | Uucp_tmapbool.get Uucp_cjk_data.ids_trinary_operator_map (Uchar.to_int u) 17 | 18 | let is_radical u = 19 | Uucp_tmapbool.get Uucp_cjk_data.radical_map (Uchar.to_int u) 20 | 21 | let is_unified_ideograph u = 22 | Uucp_tmapbool.get Uucp_cjk_data.unified_ideograph_map (Uchar.to_int u) 23 | -------------------------------------------------------------------------------- /src/uucp__name.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | include Uucp_name_base 7 | 8 | let tok_len i = 9 | let rec loop size i = 10 | if String.unsafe_get Uucp_name_data.name_toks i = '\x00' then size else 11 | loop (size + 1) (i + 1) 12 | in 13 | loop 0 i 14 | 15 | let get_tok i = String.sub Uucp_name_data.name_toks i (tok_len i) 16 | 17 | let name u = 18 | let u = Uchar.to_int u in 19 | match Uucp_tmap5bytes.get_uint20_pair Uucp_name_data.name_map u with 20 | | 0, 0 -> "" 21 | | l, 0 -> get_tok l 22 | | 0, r -> Printf.sprintf "%s%04X" (get_tok r) u 23 | | l, r -> String.concat "" [get_tok l; get_tok r] 24 | 25 | let name_alias u = Uucp_cmap.get Uucp_name_data.name_alias_map (Uchar.to_int u) 26 | -------------------------------------------------------------------------------- /support/gen_case_map.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_map_untagged_prop prop pname ppf ucd = 7 | let size us = 3 * (List.length us) in 8 | let pp_uchars = Gen.pp_list Gen.pp_uchar in 9 | let prop cp = match Gen.ucd_get ucd cp prop with 10 | | `Self -> [] 11 | | `Cps [] -> assert false 12 | | `Cps us -> us 13 | in 14 | Gen.pp_prop_tmap ppf prop pname "Uchar.t list" pp_uchars ~default:[] size 15 | 16 | let pp_props ppf ucd = 17 | let map prop pname = pp_map_untagged_prop prop pname ppf ucd in 18 | map Uucd.uppercase_mapping "upper_map"; 19 | map Uucd.lowercase_mapping "lower_map"; 20 | map Uucd.titlecase_mapping "title_map"; 21 | () 22 | 23 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 24 | -------------------------------------------------------------------------------- /src/uucp__emoji.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2020 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let is_emoji u = 7 | Uucp_tmapbool.get Uucp_emoji_data.emoji_map (Uchar.to_int u) 8 | 9 | let is_emoji_presentation u = 10 | Uucp_tmapbool.get Uucp_emoji_data.emoji_presentation_map (Uchar.to_int u) 11 | 12 | let is_emoji_modifier u = 13 | Uucp_tmapbool.get Uucp_emoji_data.emoji_modifier_map (Uchar.to_int u) 14 | 15 | let is_emoji_modifier_base u = 16 | Uucp_tmapbool.get Uucp_emoji_data.emoji_modifier_base_map (Uchar.to_int u) 17 | 18 | let is_emoji_component u = 19 | Uucp_tmapbool.get Uucp_emoji_data.emoji_component_map (Uchar.to_int u) 20 | 21 | let is_extended_pictographic u = 22 | Uucp_tmapbool.get Uucp_emoji_data.extended_pictographic_map (Uchar.to_int u) 23 | -------------------------------------------------------------------------------- /src/uucp__case_map.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2013 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Case mappings. *) 7 | 8 | (** {1:casemaps Case mappings} *) 9 | 10 | val to_lower : Uchar.t -> [ `Self | `Uchars of Uchar.t list ] 11 | (** [to_lower u] is [u]'s 12 | {{:http://www.unicode.org/reports/tr44/#Lowercase_Mapping} 13 | Lowercase_Mapping} property. *) 14 | 15 | val to_upper : Uchar.t -> [ `Self | `Uchars of Uchar.t list ] 16 | (** [to_upper u] is [u]'s 17 | {{:http://www.unicode.org/reports/tr44/#Uppercase_Mapping} 18 | Uppercase_Mapping} property. *) 19 | 20 | val to_title : Uchar.t -> [ `Self | `Uchars of Uchar.t list ] 21 | (** [to_title u] is [u]'s 22 | {{:http://www.unicode.org/reports/tr44/#Titlecase_Mapping} 23 | Titlecase_Mapping} property. *) 24 | -------------------------------------------------------------------------------- /src/uucp_num_base.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | type numeric_type = [ `De | `Di | `None | `Nu ] 7 | type numeric_value = 8 | [ `NaN | `Nums of [`Frac of int * int | `Num of int64 ] list ] 9 | 10 | let pp_numeric_type ppf v = Format.fprintf ppf "%s" begin match v with 11 | | `De -> "De" 12 | | `Di -> "Di" 13 | | `None -> "None" 14 | | `Nu -> "Nu" 15 | end 16 | 17 | let pp_num ppf = function 18 | | `Frac (a, b) -> Format.fprintf ppf "Frac(%d,%d)" a b 19 | | `Num n -> Format.fprintf ppf "Num(%LdL)" n 20 | 21 | let pp_numeric_value ppf = function 22 | | `NaN -> Format.fprintf ppf "NaN" 23 | | `Nums nums -> 24 | let pp_sep ppf () = Format.fprintf ppf " " in 25 | Format.fprintf ppf "@[%a@]" (Format.pp_print_list ~pp_sep pp_num) nums 26 | -------------------------------------------------------------------------------- /src/uucp__gc.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** General category property. *) 7 | 8 | (** {1:gcprop General category property} *) 9 | 10 | type t = 11 | [ `Cc | `Cf | `Cn | `Co | `Cs | `Ll | `Lm | `Lo | `Lt | `Lu | `Mc 12 | | `Me | `Mn | `Nd | `Nl | `No | `Pc | `Pd | `Pe | `Pf | `Pi | `Po 13 | | `Ps | `Sc | `Sk | `Sm | `So | `Zl | `Zp | `Zs ] 14 | (** The type for general categories. *) 15 | 16 | val compare : t -> t -> int 17 | (** [compare c c'] is [Stdlib.compare s s']. *) 18 | 19 | val pp : Format.formatter -> t -> unit 20 | (** [pp ppf c] prints an unspecified representation of [c] on [ppf]. *) 21 | 22 | val general_category : Uchar.t -> t 23 | (** [general_category u] is [u]'s 24 | {{:http://www.unicode.org/reports/tr44/#General_Category} 25 | General_Category} property. *) 26 | -------------------------------------------------------------------------------- /src/uucp.mllib: -------------------------------------------------------------------------------- 1 | Uucp 2 | Uucp__age 3 | Uucp_age_data 4 | Uucp__alpha 5 | Uucp_alpha_data 6 | Uucp__block 7 | Uucp_block_base 8 | Uucp_block_data 9 | Uucp__break 10 | Uucp_break_base 11 | Uucp_break_data 12 | Uucp__case 13 | Uucp_case_data 14 | Uucp__case_fold 15 | Uucp_case_fold_data 16 | Uucp__case_map 17 | Uucp_case_map_data 18 | Uucp__case_nfkc 19 | Uucp_case_nfkc_data 20 | Uucp__case_nfkc_simple 21 | Uucp_case_nfkc_simple_data 22 | Uucp__cjk 23 | Uucp_cjk_data 24 | Uucp_cmap 25 | Uucp_fmt 26 | Uucp__emoji 27 | Uucp_emoji_data 28 | Uucp__func 29 | Uucp_func_data 30 | Uucp__gc 31 | Uucp_gc_base 32 | Uucp_gc_data 33 | Uucp__gen 34 | Uucp_gen_data 35 | Uucp_hangul_base 36 | Uucp__hangul 37 | Uucp_hangul_data 38 | Uucp__id 39 | Uucp_id_data 40 | Uucp__name 41 | Uucp_name_base 42 | Uucp_name_data 43 | Uucp__num 44 | Uucp_num_base 45 | Uucp_num_data 46 | Uucp_rmap 47 | Uucp__script 48 | Uucp_script_base 49 | Uucp_script_data 50 | Uucp_tmap 51 | Uucp_tmap5bytes 52 | Uucp_tmapbool 53 | Uucp_tmapbyte 54 | Uucp_version_data 55 | Uucp__white 56 | Uucp_white_data 57 | -------------------------------------------------------------------------------- /support/gen_case_nfkc_simple.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2023 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_map_prop prop pname ppf ucd = 7 | let pp_map_value ppf = function 8 | | `Self -> Format.fprintf ppf "`Self" 9 | | `Uchars us -> Format.fprintf ppf "@[<1>(`Uchars@ %a)@]" 10 | (Gen.pp_list Gen.pp_uchar) us 11 | in 12 | let size = function 13 | | `Self -> 0 14 | | `Uchars us -> 2 + 3 * (List.length us) 15 | in 16 | let prop cp = match Gen.ucd_get ucd cp prop with 17 | | `Self -> `Self 18 | | `Cps us -> `Uchars us 19 | in 20 | Gen.pp_prop_tmap ppf prop pname "[ `Self | `Uchars of Uchar.t list ]" 21 | pp_map_value ~default:`Self size 22 | 23 | let pp_props ppf ucd = 24 | pp_map_prop Uucd.nfkc_simple_casefold "nfkc_simple_fold_map" ppf ucd; 25 | () 26 | 27 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 28 | -------------------------------------------------------------------------------- /src/uucp.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Unicode version *) 7 | 8 | let unicode_version = Uucp_version_data.unicode_version 9 | 10 | (* Properties *) 11 | 12 | module Age = Uucp__age 13 | module Alpha = Uucp__alpha 14 | module Break = Uucp__break 15 | module Block = Uucp__block 16 | module Case = Uucp__case 17 | module Cjk = Uucp__cjk 18 | module Emoji = Uucp__emoji 19 | module Func = Uucp__func 20 | module Gc = Uucp__gc 21 | module Gen = Uucp__gen 22 | module Hangul = Uucp__hangul 23 | module Id = Uucp__id 24 | module Name = Uucp__name 25 | module Num = Uucp__num 26 | module Script = Uucp__script 27 | module White = Uucp__white 28 | 29 | (* Maps. Not part of the public API. *) 30 | 31 | module Cmap = Uucp_cmap 32 | module Rmap = Uucp_rmap 33 | module Tmap = Uucp_tmap 34 | module Tmapbool = Uucp_tmapbool 35 | module Tmapbyte = Uucp_tmapbyte 36 | -------------------------------------------------------------------------------- /pkg/pkg.ml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ocaml 2 | #use "topfind" 3 | #require "topkg" 4 | open Topkg 5 | 6 | let uunf = Conf.with_pkg "uunf" 7 | let cmdliner = Conf.with_pkg "cmdliner" 8 | 9 | let uucp_api = 10 | [ "Uucp"; 11 | "Uucp__age"; 12 | "Uucp__alpha"; 13 | "Uucp__block"; 14 | "Uucp__break"; 15 | "Uucp__case"; 16 | "Uucp__case_fold"; 17 | "Uucp__case_map"; 18 | "Uucp__case_nfkc"; 19 | "Uucp__case_nfkc_simple"; 20 | "Uucp__cjk"; 21 | "Uucp__emoji"; 22 | "Uucp__func"; 23 | "Uucp__gc"; 24 | "Uucp__gen"; 25 | "Uucp__hangul"; 26 | "Uucp__id"; 27 | "Uucp__name"; 28 | "Uucp__num"; 29 | "Uucp__script"; 30 | "Uucp__white"; ] 31 | 32 | let () = 33 | Pkg.describe "uucp" @@ fun c -> 34 | let uunf = Conf.value c uunf in 35 | let cmdliner = Conf.value c cmdliner in 36 | Ok [ Pkg.mllib ~api:uucp_api "src/uucp.mllib"; 37 | Pkg.bin ~cond:(uunf && cmdliner) "test/ucharinfo"; 38 | Pkg.doc "doc/index.mld" ~dst:"odoc-pages/index.mld"; 39 | Pkg.doc "doc/unicode.mld" ~dst:"odoc-pages/unicode.mld"; 40 | Pkg.doc "test/examples.ml"; ] 41 | -------------------------------------------------------------------------------- /src/uucp__id.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let is_id_start u = 7 | Uucp_tmapbool.get Uucp_id_data.id_start_map (Uchar.to_int u) 8 | 9 | let is_id_continue u = 10 | Uucp_tmapbool.get Uucp_id_data.id_continue_map (Uchar.to_int u) 11 | 12 | let is_xid_start u = 13 | Uucp_tmapbool.get Uucp_id_data.xid_start_map (Uchar.to_int u) 14 | 15 | let is_xid_continue u = 16 | Uucp_tmapbool.get Uucp_id_data.xid_continue_map (Uchar.to_int u) 17 | 18 | let is_id_compat_math_start u = 19 | Uucp_tmapbool.get Uucp_id_data.id_compat_math_start_map (Uchar.to_int u) 20 | 21 | let is_id_compat_math_continue u = 22 | Uucp_tmapbool.get Uucp_id_data.id_compat_math_continue_map (Uchar.to_int u) 23 | 24 | let is_pattern_syntax u = 25 | Uucp_tmapbool.get Uucp_id_data.pattern_syntax_map (Uchar.to_int u) 26 | 27 | let is_pattern_white_space u = 28 | Uucp_tmapbool.get Uucp_id_data.pattern_white_space_map (Uchar.to_int u) 29 | -------------------------------------------------------------------------------- /support/gen_case_nfkc.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_map_prop prop pname ppf ucd = 7 | let pp_map_value ppf = function 8 | | `Self -> Format.fprintf ppf "`Self" 9 | | `Uchars us -> Format.fprintf ppf "@[<1>(`Uchars@ %a)@]" 10 | (Gen.pp_list Gen.pp_uchar) us 11 | in 12 | let size = function 13 | | `Self -> 0 14 | | `Uchars us -> 2 + 3 * (List.length us) 15 | in 16 | let prop cp = match Gen.ucd_get ucd cp prop with 17 | | `Self -> `Self 18 | | `Cps us -> `Uchars us 19 | in 20 | Gen.pp_prop_tmap ppf prop pname "[ `Self | `Uchars of Uchar.t list ]" 21 | pp_map_value ~default:`Self size 22 | 23 | let pp_props ppf ucd = 24 | pp_map_prop Uucd.nfkc_casefold "nfkc_fold_map" ppf ucd; 25 | (* let prop = Gen.pp_prop_tmapbool_ucd ppf ucd in 26 | prop Uucd.changes_when_nfkc_casefolded "changes_when_casefolded"; *) 27 | () 28 | 29 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 30 | -------------------------------------------------------------------------------- /src/uucp__name.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Name and name alias properties. 7 | 8 | {b References.} 9 | {ul 10 | {- {{:http://unicode.org/faq/casemap_charprop.html#nameprop} 11 | The Unicode names FAQ}.} 12 | {- The Unicode consortium. 13 | {{:https://unicode.org/charts/nameslist/index.html} 14 | The Unicode names charts}}} *) 15 | 16 | (** {1:nameprop Names} *) 17 | 18 | val name : Uchar.t -> string 19 | (** [name u] is [u]'s 20 | {{:http://www.unicode.org/reports/tr44/#Name}Name} property. *) 21 | 22 | (** {1:namealiasprop Name aliases} *) 23 | 24 | type alias_tag = 25 | [ `Abbreviation | `Alternate | `Control | `Correction | `Figment ] 26 | 27 | val pp_alias_tag : Format.formatter -> alias_tag -> unit 28 | (** [pp_alias_tag t] prints an unspecified representation of [t] 29 | on [ppf]. *) 30 | 31 | val name_alias : Uchar.t -> (alias_tag * string) list 32 | (** [name_alias u] is [u]'s 33 | {{:http://www.unicode.org/reports/tr44/#Name_Alias}Name_Alias} 34 | property. *) 35 | -------------------------------------------------------------------------------- /support/gen_script.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_script_variant ppf s = Gen.pp ppf "`%a" Uucp_script_base.pp s 7 | 8 | let pp_script_prop ppf ucd = 9 | let size v = 0 in 10 | Gen.pp_prop_rmap_ucd ~share:false ppf ucd 11 | Uucd.script "script" "Uucp_script_base.t" pp_script_variant 12 | ~default:`Zzzz size 13 | 14 | let pp_script_extensions_prop ppf ucd = 15 | let size v = 3 * (List.length v) in 16 | let pp_script_list = Gen.pp_list pp_script_variant in 17 | let prop u = 18 | let script = Gen.ucd_get ucd u Uucd.script in 19 | let es = Gen.ucd_get ucd u Uucd.script_extensions in 20 | match es with 21 | | [] -> assert false 22 | | [script'] when script = script' -> [] 23 | | es -> es 24 | in 25 | Gen.pp_prop_rmap ppf 26 | prop "script_extensions" "Uucp_script_base.t list" pp_script_list 27 | ~default:[] size 28 | 29 | let pp_props ppf ucd = 30 | pp_script_prop ppf ucd; 31 | pp_script_extensions_prop ppf ucd; 32 | () 33 | 34 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 35 | -------------------------------------------------------------------------------- /src/uucp_gc_base.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | type t = 7 | [ `Cc 8 | | `Cf 9 | | `Cn 10 | | `Co 11 | | `Cs 12 | | `Ll 13 | | `Lm 14 | | `Lo 15 | | `Lt 16 | | `Lu 17 | | `Mc 18 | | `Me 19 | | `Mn 20 | | `Nd 21 | | `Nl 22 | | `No 23 | | `Pc 24 | | `Pd 25 | | `Pe 26 | | `Pf 27 | | `Pi 28 | | `Po 29 | | `Ps 30 | | `Sc 31 | | `Sk 32 | | `Sm 33 | | `So 34 | | `Zl 35 | | `Zp 36 | | `Zs ] 37 | 38 | let pp ppf c = Format.fprintf ppf "%s" begin match c with 39 | | `Cc -> "Cc" 40 | | `Cf -> "Cf" 41 | | `Cn -> "Cn" 42 | | `Co -> "Co" 43 | | `Cs -> "Cs" 44 | | `Ll -> "Ll" 45 | | `Lm -> "Lm" 46 | | `Lo -> "Lo" 47 | | `Lt -> "Lt" 48 | | `Lu -> "Lu" 49 | | `Mc -> "Mc" 50 | | `Me -> "Me" 51 | | `Mn -> "Mn" 52 | | `Nd -> "Nd" 53 | | `Nl -> "Nl" 54 | | `No -> "No" 55 | | `Pc -> "Pc" 56 | | `Pd -> "Pd" 57 | | `Pe -> "Pe" 58 | | `Pf -> "Pf" 59 | | `Pi -> "Pi" 60 | | `Po -> "Po" 61 | | `Ps -> "Ps" 62 | | `Sc -> "Sc" 63 | | `Sk -> "Sk" 64 | | `Sm -> "Sm" 65 | | `So -> "So" 66 | | `Zl -> "Zl" 67 | | `Zp -> "Zp" 68 | | `Zs -> "Zs" 69 | end 70 | -------------------------------------------------------------------------------- /src/uucp__func.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let is_dash u = 7 | Uucp_tmapbool.get Uucp_func_data.dash_map (Uchar.to_int u) 8 | 9 | let is_diacritic u = 10 | Uucp_tmapbool.get Uucp_func_data.diacritic_map (Uchar.to_int u) 11 | 12 | let is_extender u = 13 | Uucp_tmapbool.get Uucp_func_data.extender_map (Uchar.to_int u) 14 | 15 | let is_grapheme_base u = 16 | Uucp_tmapbool.get Uucp_func_data.grapheme_base_map (Uchar.to_int u) 17 | 18 | let is_grapheme_extend u = 19 | Uucp_tmapbool.get Uucp_func_data.grapheme_extend_map (Uchar.to_int u) 20 | 21 | let is_math u = 22 | Uucp_tmapbool.get Uucp_func_data.math_map (Uchar.to_int u) 23 | 24 | let is_quotation_mark u = 25 | Uucp_tmapbool.get Uucp_func_data.quotation_mark_map (Uchar.to_int u) 26 | 27 | let is_soft_dotted u = 28 | Uucp_tmapbool.get Uucp_func_data.soft_dotted_map (Uchar.to_int u) 29 | 30 | let is_terminal_punctuation u = 31 | Uucp_tmapbool.get Uucp_func_data.terminal_punctuation_map (Uchar.to_int u) 32 | 33 | let is_regional_indicator u = 34 | Uucp_tmapbool.get Uucp_func_data.regional_indicator_map (Uchar.to_int u) 35 | 36 | let is_join_control u = 37 | Uucp_tmapbool.get Uucp_func_data.join_control_map (Uchar.to_int u) 38 | -------------------------------------------------------------------------------- /DEVEL.md: -------------------------------------------------------------------------------- 1 | # New Unicode release 2 | 3 | The files `src/uucd_*_data.ml` contain generated data. These files need 4 | to be regenerated on new Unicode releases, as well as the `opam` file. 5 | 6 | In order to do so you need to install an updated version of the [uucd] 7 | OCaml package which is capable of reading the latest XML Unicode 8 | character database. 9 | 10 | You can then bump the Unicode release number at the top of the `B0.ml` 11 | file. Verify that everything is as expected with: 12 | 13 | b0 -- unicode-version 14 | 15 | You should then download a copy of the XML Unicode character database 16 | to the `support/ucd.xml` file which is ignored by git. If you have 17 | `curl` and `unzip` in your `PATH` you can simply issue: 18 | 19 | b0 -- download-ucdxml 20 | 21 | You can now proceed to generate the `src/uucd_*_data.ml` and update the opam 22 | file by issuing: 23 | 24 | b0 -- generate-data 25 | b0 -- .opam file > opam 26 | 27 | [uucd]: http://erratique.ch/software/uucd 28 | 29 | 30 | # Tests and performance 31 | 32 | To check Uucp against the data of the Unicode character database to 33 | ensure they agree make sure you have a copy of the XML Unicode 34 | character database in `support/ucd.xml` (use `b0 -- download-ucdxml` 35 | or specify it on the cli) and run: 36 | 37 | b0 test 38 | 39 | If you are tweaking the datastructures, the performance lookup of Uucp 40 | can be tested with: 41 | 42 | time $(b0 --path -- perf) 43 | 44 | See `b0 -- perf --help` for more options. 45 | -------------------------------------------------------------------------------- /src/uucp__gen.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** General properties. *) 7 | 8 | (** {1:genprops General properties} *) 9 | 10 | val is_default_ignorable : Uchar.t -> bool 11 | (** [is_default_ignorable u] is [true] if [u] has the 12 | {{:http://www.unicode.org/reports/tr44/#Default_Ignorable_Code_Point} 13 | Default_Ignorable_Code_Point} property. *) 14 | 15 | val is_deprecated : Uchar.t -> bool 16 | (** [is_deprecated u] is [true] if [u] has the 17 | {{:http://www.unicode.org/reports/tr44/#Deprecated} 18 | Deprecated} property. *) 19 | 20 | val is_logical_order_exception : Uchar.t -> bool 21 | (** [is_logical_order_exception u] is [true] if [u] has the 22 | {{:http://www.unicode.org/reports/tr44/#Logical_Order_Exception} 23 | Logical_Order_Exception} property. *) 24 | 25 | val is_non_character : Uchar.t -> bool 26 | (** [is_non_character u] is [true] if [u] has the 27 | {{:http://www.unicode.org/reports/tr44/#Noncharacter_Code_Point} 28 | Noncharacter_Code_Point} property. *) 29 | 30 | val is_variation_selector : Uchar.t -> bool 31 | (** [is_variation_selector u] is [true] if [u] has the 32 | {{:http://www.unicode.org/reports/tr44/#Variation_Selector} 33 | Variation_Selector} property. See the 34 | {{:http://www.unicode.org/faq/vs.html}Variation Sequences FAQ}. *) 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Uucp — Unicode character properties for OCaml 2 | ============================================= 3 | %%VERSION%% 4 | 5 | Uucp is an OCaml library providing efficient access to a selection of 6 | character properties of the [Unicode character database]. 7 | 8 | Uucp is distributed under the ISC license. It has no dependency. 9 | 10 | Home page: 11 | 12 | [Unicode character database]: http://www.unicode.org/reports/tr44/ 13 | 14 | ## Installation 15 | 16 | Uucp can be installed with `opam`: 17 | 18 | opam install uucp 19 | opam install uucp uunf cmdliner # For ucharinfo cli tool 20 | 21 | If you don't use `opam` consult the [`opam`](opam) file for build 22 | instructions. 23 | 24 | 25 | ## Documentation 26 | 27 | The documentation can be consulted [online] or via `odig doc uucp`. 28 | 29 | Uucp's documentation also has a [minimal Unicode introduction][intro] 30 | and some [Unicode OCaml tips][tips]. 31 | 32 | Questions are welcome but better asked on the [OCaml forum] than on 33 | the issue tracker. 34 | 35 | [online]: http://erratique.ch/software/uucp/doc/ 36 | [intro]: http://erratique.ch/software/uucp/doc/unicode.html#minimal 37 | [tips]: http://erratique.ch/software/uucp/doc/unicode.html#tips 38 | [OCaml forum]: https://discuss.ocaml.org/ 39 | 40 | 41 | ## Sample programs 42 | 43 | The [`ucharinfo`] tool allows to report character information on the 44 | command line. 45 | 46 | See also the [doc examples]. 47 | 48 | [`ucharinfo`]: test/ucharinfo.ml 49 | [doc examples]: test/examples.ml 50 | -------------------------------------------------------------------------------- /src/uucp__emoji.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2020 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Emoji properties. *) 7 | 8 | val is_emoji : Uchar.t -> bool 9 | (** [is_emoji u] is [true] if [u] has the 10 | {{:http://www.unicode.org/reports/tr44/#Emoji}Emoji} property. *) 11 | 12 | val is_emoji_presentation : Uchar.t -> bool 13 | (** [is_emoji_presentation u] is [true] if [u] has the 14 | {{:http://www.unicode.org/reports/tr44/#Emoji_Presentation} 15 | Emoji_Presentation} property. *) 16 | 17 | val is_emoji_modifier : Uchar.t -> bool 18 | (** [is_emoji_modifier u] is [true] if [u] has the 19 | {{:http://www.unicode.org/reports/tr44/#Emoji_Modifier} 20 | Emoji_Modifier} property. *) 21 | 22 | val is_emoji_modifier_base : Uchar.t -> bool 23 | (** [is_emoji_modifier u] is [true] if [u] has the 24 | {{:http://www.unicode.org/reports/tr44/#Emoji_Modifier_Base} 25 | Emoji_Modifier_Base} property. *) 26 | 27 | val is_emoji_component : Uchar.t -> bool 28 | (** [is_emoji_component u] is [true] if [u] has the 29 | {{:http://www.unicode.org/reports/tr44/#Emoji_Component} 30 | Emoji_Component} property. *) 31 | 32 | val is_extended_pictographic : Uchar.t -> bool 33 | (** [is_extended_pictographic u] is [true] if [u] has the 34 | {{:http://www.unicode.org/reports/tr44/#Extended_Pictographic} 35 | Extended_Pictographic} property. *) 36 | -------------------------------------------------------------------------------- /support/gen_block.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_block ppf b = Gen.pp ppf "`%a" Uucp_block_base.pp b 7 | let pp_block_option ppf = function 8 | | None -> Gen.pp ppf "None" 9 | | Some b -> Gen.pp ppf "@[<1>(Some@ %a)@]" pp_block b 10 | 11 | let block_prop ucd u = match Gen.ucd_get ucd u Uucd.block with 12 | | `High_Surrogates -> assert false 13 | | `Low_Surrogates -> assert false 14 | | `High_PU_Surrogates -> assert false 15 | | #Uucp_block_base.t as b -> b 16 | 17 | let pp_block_prop ppf ucd = 18 | let size v = 0 in 19 | let prop u = block_prop ucd u in 20 | Gen.pp_prop_rmap ~share:false ppf prop "block" "Uucp_block_base.t" pp_block 21 | ~default:`NB size 22 | 23 | let pp_blocks ppf ucd = 24 | let ranges = Gen.prop_find_ranges (block_prop ucd) in 25 | let not_nb (`R (_, _, b)) = b <> `NB in 26 | let ranges = List.find_all not_nb ranges in 27 | let pp_block ppf (`R (is,ie,b)) = 28 | Gen.pp ppf "@[<1>(%a,@,(%a,@,%a))@]" 29 | pp_block b Gen.pp_uchar is Gen.pp_uchar ie 30 | in 31 | Gen.pp ppf "@[let block_list : \ 32 | (Uucp_block_base.t * (Uchar.t * Uchar.t)) list =\ 33 | @\n %a@]@\n" 34 | (Gen.pp_list pp_block) ranges 35 | 36 | let pp_props ppf ucd = 37 | pp_block_prop ppf ucd; 38 | pp_blocks ppf ucd; 39 | () 40 | 41 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 42 | -------------------------------------------------------------------------------- /opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | name: "uucp" 3 | synopsis: "Unicode character properties for OCaml" 4 | description: """\ 5 | Uucp is an OCaml library providing efficient access to a selection of 6 | character properties of the [Unicode character database]. 7 | 8 | Uucp is distributed under the ISC license. It has no dependency. 9 | 10 | Home page: 11 | 12 | [Unicode character database]: http://www.unicode.org/reports/tr44/""" 13 | maintainer: "Daniel Bünzli " 14 | authors: "The uucp programmers" 15 | license: "ISC" 16 | tags: ["unicode" "text" "character" "org:erratique"] 17 | homepage: "https://erratique.ch/software/uucp" 18 | doc: "https://erratique.ch/software/uucp/doc/" 19 | bug-reports: "https://github.com/dbuenzli/uucp/issues" 20 | depends: [ 21 | "ocaml" {>= "4.14.0"} 22 | "ocamlfind" {build} 23 | "ocamlbuild" {build} 24 | "topkg" {build & >= "1.1.0"} 25 | "uucd" {with-test & dev & >= "17.0.0" & < "18.0.0"} 26 | "uunf" {with-test} 27 | ] 28 | depopts: ["uunf" "cmdliner"] 29 | conflicts: [ 30 | "uunf" {< "17.0.0" | >= "18.0.0"} 31 | "cmdliner" {< "1.1.0"} 32 | ] 33 | build: [ 34 | "ocaml" 35 | "pkg/pkg.ml" 36 | "build" 37 | "--dev-pkg" 38 | "%{dev}%" 39 | "--with-uunf" 40 | "%{uunf:installed}%" 41 | "--with-cmdliner" 42 | "%{cmdliner:installed}%" 43 | ] 44 | post-messages: 45 | "If the build fails with \"ocamlopt.opt got signal and exited\", issue 'ulimit -s unlimited' and retry." 46 | {failure & (arch = "ppc64" | arch = "arm64")} 47 | dev-repo: "git+https://erratique.ch/repos/uucp.git" 48 | x-maintenance-intent: ["(latest)"] 49 | -------------------------------------------------------------------------------- /src/uucp__num.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Numeric properties. *) 7 | 8 | (** {1:hexprop Hex digits} *) 9 | 10 | val is_ascii_hex_digit : Uchar.t -> bool 11 | (** [is_ascii_hex_digit u] is [true] if [u] has the 12 | {{:http://www.unicode.org/reports/tr44/#ASCII_Hex_Digit}ASCII_Hex_Digit} 13 | property. *) 14 | 15 | val is_hex_digit : Uchar.t -> bool 16 | (** [is_hex_digit u] is [true] if [u] has the 17 | {{:http://www.unicode.org/reports/tr44/#Hex_Digit}Hex_Digit} 18 | property. *) 19 | 20 | (** {1:numtypeprop Numeric type} *) 21 | 22 | type numeric_type = [ `De | `Di | `None | `Nu ] 23 | (** The type for numeric types. *) 24 | 25 | val pp_numeric_type : Format.formatter -> numeric_type -> unit 26 | (** [pp_numeric_type ppf n] prints an unspecified representation of 27 | [n] on [ppf]. *) 28 | 29 | val numeric_type : Uchar.t -> numeric_type 30 | (** [numeric_type u] is [u]'s 31 | {{:http://www.unicode.org/reports/tr44/#Numeric_Type} 32 | Numeric_Type} property. *) 33 | 34 | (** {1:numvalueprop Numeric value} *) 35 | 36 | type numeric_value = 37 | [ `NaN | `Nums of [`Frac of int * int | `Num of int64 ] list ] 38 | (** The type for numeric values. *) 39 | 40 | val pp_numeric_value : Format.formatter -> numeric_value -> unit 41 | (** [pp_numeric_value ppf n] prints an unspecified representation of 42 | [n] on [ppf]. *) 43 | 44 | val numeric_value : Uchar.t -> numeric_value 45 | (** [numeric_type u] is [u]'s 46 | {{:http://www.unicode.org/reports/tr44/#Numeric_Value} 47 | Numeric_Value} property. *) 48 | -------------------------------------------------------------------------------- /support/gen_num.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_numeric_type ppf ucd = 7 | let size v = 0 in 8 | let pp_ntype ppf t = Gen.pp ppf "`%a" Uucp_num_base.pp_numeric_type t in 9 | Gen.pp_prop_rmap_ucd ~share:false ppf ucd Uucd.numeric_type 10 | "numeric_type" "Uucp_num_base.numeric_type" pp_ntype 11 | ~default:`None size 12 | 13 | let pp_numeric_value ppf ucd = 14 | let num_size = function 15 | | `Frac _ -> 1 + 2 + 2 (* cons *) 16 | | `Num _ -> 1 + (64 / Sys.word_size) + 2 (* cons *) 17 | in 18 | let size = function 19 | | `NaN -> 0 20 | | `Nums nums -> List.fold_left (fun acc num -> acc + num_size num) 0 nums 21 | in 22 | let pp_numeric_value ppf = function 23 | | `NaN -> Format.fprintf ppf "`NaN" 24 | | `Nums nums -> 25 | let pp_num ppf = function 26 | | `Frac (a, b) -> Format.fprintf ppf "`Frac(%d,%d)" a b 27 | | `Num n -> Format.fprintf ppf "`Num(%LdL)" n 28 | in 29 | let pp_sep ppf () = Format.fprintf ppf ";@," in 30 | Format.fprintf ppf "`Nums[%a]" (Format.pp_print_list ~pp_sep pp_num) nums 31 | in 32 | let pp_nvalue ppf v = Gen.pp ppf "(%a)" pp_numeric_value v in 33 | Gen.pp_prop_cmap_ucd ppf ucd Uucd.numeric_value 34 | "numeric_value" "Uucp_num_base.numeric_value" pp_nvalue 35 | ~default:`NaN size 36 | 37 | let pp_props ppf ucd = 38 | let prop = Gen.pp_prop_tmapbool_ucd ppf ucd in 39 | prop Uucd.ascii_hex_digit "ascii_hex_digit"; 40 | prop Uucd.hex_digit "hex_digit"; 41 | pp_numeric_type ppf ucd; 42 | pp_numeric_value ppf ucd; 43 | () 44 | 45 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 46 | -------------------------------------------------------------------------------- /src/uucp__cjk.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** CJK properties. 7 | 8 | {b References.} 9 | {ul 10 | {- {{:http://www.unicode.org/faq/han_cjk.html} 11 | The Unicode Chinese and Japanese FAQ.}} 12 | {- {{:http://www.unicode.org/faq/korean.html} 13 | The Unicode Korean FAQ.}}} *) 14 | 15 | (** {1:cjkprops CJK properties} *) 16 | 17 | val is_ideographic : Uchar.t -> bool 18 | (** [is_ideographic u] is [true] if [u] has the 19 | {{:http://www.unicode.org/reports/tr44/#Ideographic}Ideographic} 20 | property. *) 21 | 22 | val is_ids_unary_operator : Uchar.t -> bool 23 | (** [is_ids_unary_operator u] is [true] if [u] has the 24 | {{:http://www.unicode.org/reports/tr44/#IDS_Unary_Operator} 25 | IDS_Binary_Operator} property. *) 26 | 27 | val is_ids_binary_operator : Uchar.t -> bool 28 | (** [is_ids_binary_operator u] is [true] if [u] has the 29 | {{:http://www.unicode.org/reports/tr44/#IDS_Binary_Operator} 30 | IDS_Binary_Operator} property. *) 31 | 32 | val is_ids_trinary_operator : Uchar.t -> bool 33 | (** [is_ids_trinary_operator u] is [true] if [u] has the 34 | {{:http://www.unicode.org/reports/tr44/#IDS_Trinary_Operator} 35 | IDS_Trinary_Operator} property. *) 36 | 37 | val is_radical : Uchar.t -> bool 38 | (** [is_radical u] is [true] if [u] has the 39 | {{:http://www.unicode.org/reports/tr44/#Radical}Radical} 40 | property. *) 41 | 42 | val is_unified_ideograph : Uchar.t -> bool 43 | (** [is_unified_ideograph u] is [true] if [u] has the 44 | {{:http://www.unicode.org/reports/tr44/#Unified_Ideograph} 45 | Unified_Ideograph} property. *) 46 | -------------------------------------------------------------------------------- /src/uucp_fmt.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2020 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pf = Format.fprintf 7 | let strf = Format.asprintf 8 | let string = Format.pp_print_string 9 | let string_X ppf s = 10 | Format.pp_open_vbox ppf 1; string ppf "\""; 11 | for i = 0 to String.length s - 1 do 12 | if i mod 16 = 0 && i > 0 then pf ppf "\\@\n"; 13 | pf ppf "\\x%02x" (Char.code s.[i]) 14 | done; 15 | string ppf "\""; Format.pp_close_box ppf () 16 | 17 | let string_XN ppf = function "" -> string ppf "snil" | x -> string_X ppf x 18 | let bool = Format.pp_print_bool 19 | let sp = Format.pp_print_space 20 | let semi ppf () = string ppf ";"; sp ppf () 21 | let int = Format.pp_print_int 22 | let iter i ?(sep = sp) pp ppf x = 23 | let fst = ref true in 24 | i (fun v -> (if !fst then fst := false else sep ppf ()); pp ppf v) x 25 | 26 | let as_array i pp ppf = pf ppf "@[<2>[|%a|]@]" (iter i ~sep:semi pp) 27 | let array pp = as_array Array.iter pp 28 | let array_N pp ppf = function [||] -> string ppf "nil" | x -> array pp ppf x 29 | 30 | module R = struct 31 | type _ record = 32 | | [] : unit record 33 | | (::) : 34 | (string * (Format.formatter -> 'a -> unit)) * 'b record -> 35 | ('a -> 'b) record 36 | end 37 | 38 | let record record ppf = 39 | let field name pp_v ppf v = pf ppf "@[<1>%s =@ %a@]" name pp_v v in 40 | let open R in (* 4.03 compat *) 41 | let rec go : type a. (unit -> unit) -> a R.record -> a = fun k -> function 42 | | [] -> pf ppf "@[<2>{ %a }@]" (fun _ -> k) () 43 | | [name, pp_v] -> 44 | fun v -> go (fun () -> k (); field name pp_v ppf v) [] 45 | | (name, pp_v) :: record -> 46 | fun v -> go (fun () -> k (); field name pp_v ppf v; semi ppf ()) record 47 | in 48 | go ignore record 49 | -------------------------------------------------------------------------------- /support/gen_break.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pp_line_break ppf ucd = 7 | Gen.pp_code_prop_tmapbyte_ucd ppf ucd 8 | Uucp_break_base.line_to_byte Uucd.line_break "line_break" ~default:`XX 9 | Uucp_break_base.pp_line 10 | 11 | let pp_grapheme_cluster_break ppf ucd = 12 | Gen.pp_code_prop_tmapbyte_ucd ppf ucd 13 | Uucp_break_base.grapheme_cluster_to_byte Uucd.grapheme_cluster_break 14 | "grapheme_cluster_break" ~default:`XX Uucp_break_base.pp_grapheme_cluster 15 | 16 | let pp_word_break ppf ucd = 17 | Gen.pp_code_prop_tmapbyte_ucd ppf ucd 18 | Uucp_break_base.word_to_byte Uucd.word_break 19 | "word_break" ~default:`XX Uucp_break_base.pp_word 20 | 21 | let pp_sentence_break ppf ucd = 22 | Gen.pp_code_prop_tmapbyte_ucd ppf ucd 23 | Uucp_break_base.sentence_to_byte Uucd.sentence_break 24 | "sentence_break" ~default:`XX Uucp_break_base.pp_sentence 25 | 26 | let pp_indic_conjunct_break ppf ucd = 27 | Gen.pp_code_prop_tmapbyte_ucd ppf ucd 28 | Uucp_break_base.indic_conjunct_break_to_byte Uucd.indic_conjunct_break 29 | "indic_conjunct_break" ~default:`None 30 | Uucp_break_base.pp_indic_conjunct_break 31 | 32 | let pp_east_asian_width ppf ucd = 33 | let size _ = 0 in 34 | let pp ppf w = Gen.pp ppf "`%a" Uucp_break_base.pp_east_asian_width w in 35 | Gen.pp_prop_rmap_ucd ~share:false ppf ucd 36 | Uucd.east_asian_width "east_asian_width" "Uucp_break_base.east_asian_width" 37 | pp ~default:`N size 38 | 39 | let pp_props ppf ucd = 40 | pp_line_break ppf ucd; 41 | pp_grapheme_cluster_break ppf ucd; 42 | pp_word_break ppf ucd; 43 | pp_sentence_break ppf ucd; 44 | pp_indic_conjunct_break ppf ucd; 45 | pp_east_asian_width ppf ucd; 46 | () 47 | 48 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 49 | -------------------------------------------------------------------------------- /src/uucp_tmap.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* uchar to arbitrary value trie maps *) 7 | 8 | type 'a t = 9 | { default : 'a; (* default value. *) 10 | l0 : 'a array array array } (* 0x1FFFFF as 0x1FF - 0xFF - 0xF. *) 11 | 12 | let nil = [||] 13 | let l0_shift = 12 14 | let l0_size = 0x10F + 1 15 | let l1_shift = 4 16 | let l1_mask = 0xFF 17 | let l1_size = 0xFF + 1 18 | let l2_mask = 0xF 19 | let l2_size = 0xF + 1 20 | 21 | let create default = { default; l0 = Array.make l0_size nil } 22 | 23 | let get m u = 24 | let l1 = Array.unsafe_get m.l0 (u lsr l0_shift) in 25 | if l1 == nil then m.default else 26 | let l2 = Array.unsafe_get l1 (u lsr l1_shift land l1_mask) in 27 | if l2 == nil then m.default else 28 | Array.unsafe_get l2 (u land l2_mask) 29 | 30 | let set m u v = 31 | if v = m.default then () else 32 | let i = u lsr l0_shift in 33 | if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size nil; 34 | let j = u lsr l1_shift land l1_mask in 35 | if m.l0.(i).(j) == nil then m.l0.(i).(j) <- Array.make l2_size m.default; 36 | m.l0.(i).(j).(u land l2_mask) <- v 37 | 38 | let word_size v_size m = match m.l0 with 39 | | [||] -> 3 + 1 + v_size m.default 40 | | l0 -> 41 | let size = ref (3 + v_size m.default + 1 + Array.length l0) in 42 | for i = 0 to Array.length l0 - 1 do match l0.(i) with 43 | | [||] -> () 44 | | l1 -> 45 | size := !size + (1 + Array.length l1); 46 | for j = 0 to Array.length l1 - 1 do match l1.(j) with 47 | | [||] -> () 48 | | l2 -> 49 | size := !size + (1 + Array.length l2); 50 | for k = 0 to Array.length l2 - 1 do 51 | size := !size + v_size l2.(k) 52 | done; 53 | done; 54 | done; 55 | !size 56 | 57 | let dump pp_v ppf m = 58 | let open Uucp_fmt in 59 | record ["default", pp_v; "l0", pp_v |> array_N |> array_N |> array] 60 | ppf m.default m.l0 61 | -------------------------------------------------------------------------------- /src/uucp_tmapbyte.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* uchar to byte trie maps *) 7 | 8 | type t = 9 | { default : int; (* default value. *) 10 | l0 : string array array } (* 0x1FFFFF as 0x1FF - 0xF - 0xFF *) 11 | 12 | let nil = [||] 13 | let snil = "" 14 | let l0_shift = 12 15 | let l0_size = 0x10F + 1 16 | let l1_shift = 8 17 | let l1_mask = 0xF 18 | let l1_size = 0xF + 1 19 | let l2_mask = 0xFF 20 | let l2_size = 0xFF + 1 21 | 22 | let create default = { default; l0 = Array.make l0_size nil } 23 | 24 | let get m u = 25 | let l1 = Array.unsafe_get m.l0 (u lsr l0_shift) in 26 | if l1 == nil then m.default else 27 | let l2 = Array.unsafe_get l1 (u lsr l1_shift land l1_mask) in 28 | if l2 == snil then m.default else 29 | Char.code (String.unsafe_get l2 (u land l2_mask)) 30 | 31 | let set m u byte = 32 | let l2_make m = Bytes.make l2_size (Char.chr m.default) in 33 | if byte = m.default then () else 34 | let i = u lsr l0_shift in 35 | if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size snil; 36 | let j = u lsr l1_shift land l1_mask in 37 | if m.l0.(i).(j) == snil then 38 | m.l0.(i).(j) <- (Bytes.unsafe_to_string (l2_make m)); 39 | let k = u land l2_mask in 40 | Bytes.set (Bytes.unsafe_of_string m.l0.(i).(j)) k (Char.unsafe_chr byte) 41 | 42 | let word_size m = match m.l0 with 43 | | [||] -> 3 + 1 44 | | l0 -> 45 | let size = ref (3 + 1 + Array.length l0) in 46 | for i = 0 to Array.length l0 - 1 do match l0.(i) with 47 | | [||] -> () 48 | | l1 -> 49 | size := !size + 1 + Array.length l1; 50 | for j = 0 to Array.length l1 - 1 do 51 | size := !size + 1 + ((String.length l1.(j) * 8) / Sys.word_size) 52 | done; 53 | done; 54 | !size 55 | 56 | let iter_blobs i m = Array.(iter (iter i)) m.l0 57 | 58 | let dump_pp pp_v ppf m = 59 | let open Uucp_fmt in 60 | record ["default", int; "l0", pp_v |> array_N |> array] ppf m.default m.l0 61 | 62 | let pp_v = Uucp_fmt.string_XN 63 | let dump = dump_pp pp_v 64 | -------------------------------------------------------------------------------- /src/uucp__id.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Identifier properties. 7 | 8 | {b References.} 9 | {ul 10 | {- Mark Davis. 11 | {e {{:http://www.unicode.org/reports/tr31/}UAX #31 12 | Unicode Identifier and Pattern Syntax}}. (latest version)}} *) 13 | 14 | (** {1:idprops Identifier properties} *) 15 | 16 | val is_id_start : Uchar.t -> bool 17 | (** [is_id_start u] is [true] if [u] has the 18 | {{:http://www.unicode.org/reports/tr44/#ID_Start}ID_Start} 19 | property. *) 20 | 21 | val is_id_continue : Uchar.t -> bool 22 | (** [is_id_continue u] is [true] if [u] has the 23 | {{:http://www.unicode.org/reports/tr44/#ID_Continue}ID_Continue} 24 | property. *) 25 | 26 | val is_xid_start : Uchar.t -> bool 27 | (** [is_xid_start u] is [true] if [u] has the 28 | {{:http://www.unicode.org/reports/tr44/#XID_Start}XID_Start} 29 | property. *) 30 | 31 | val is_xid_continue : Uchar.t -> bool 32 | (** [is_xid_continue u] is [true] if [u] has the 33 | {{:http://www.unicode.org/reports/tr44/#XID_Continue}XID_Continue} 34 | property. *) 35 | 36 | (** {1:mathprops Mathematical compatibility notation profile} *) 37 | 38 | val is_id_compat_math_start : Uchar.t -> bool 39 | (** [is_id_compat_math_start u] is [true] if [u] has then 40 | {{:https://www.unicode.org/reports/tr44/#ID_Compat_Math_Start} 41 | ID_Compat_Math_Start} property. *) 42 | 43 | val is_id_compat_math_continue : Uchar.t -> bool 44 | (** [is_id_compat_math_continue u] is [true] if [u] has then 45 | {{:https://www.unicode.org/reports/tr44/#ID_Compat_Math_Continue} 46 | ID_Compat_Math_Continue} property. *) 47 | 48 | (** {1:patprops Pattern syntax properties} *) 49 | 50 | val is_pattern_syntax : Uchar.t -> bool 51 | (** [is_pattern_syntax u] is [true] if [u] has the 52 | {{:http://www.unicode.org/reports/tr44/#Pattern_Syntax}Pattern_Syntax} 53 | property. *) 54 | 55 | val is_pattern_white_space : Uchar.t -> bool 56 | (** [is_pattern_white_space u] is [true] if [u] has the 57 | {{:http://www.unicode.org/reports/tr44/#Pattern_White_Space} 58 | Pattern_White_Space} property. *) 59 | -------------------------------------------------------------------------------- /src/uucp_cmap.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Binary tree uchar maps. *) 7 | 8 | type 'a tree = 9 | | Empty 10 | | C of int * 'a 11 | | Cn of 'a tree * 'a tree * int * 'a 12 | 13 | type 'a t = { default : 'a; tree : 'a tree } 14 | 15 | let get m cp = 16 | let rec loop cp = function 17 | | Cn (l, r, i, v) -> 18 | if cp < i then loop cp l else 19 | if cp > i then loop cp r else 20 | v 21 | | C (i, v) -> if cp = i then v else m.default 22 | | Empty -> m.default 23 | in 24 | loop cp m.tree 25 | 26 | let of_sorted_list default l = (* perfect balance. *) 27 | let rec loop len l = 28 | if len = 1 then match l with 29 | | `C (i, v) :: r -> C (i, v), r 30 | | _ -> assert false 31 | else 32 | let len_ll = len / 2 in 33 | let len_rl = len - len_ll in 34 | let ltree, rlist = loop len_ll l in 35 | match rlist with 36 | | [] -> ltree, [] 37 | | `C (i, v) :: r -> 38 | if len_rl = 1 then Cn (ltree, Empty, i, v), r else 39 | let rtree, rlist = loop (len_rl - 1) r in 40 | Cn (ltree, rtree, i, v), rlist 41 | in 42 | let keep acc (`C (_, v) as p) = if v <> default then p :: acc else acc in 43 | let l = List.rev (List.fold_left keep [] l) in 44 | let len = List.length l in 45 | let tree = if len = 0 then Empty else fst (loop len l) in 46 | { default; tree } 47 | 48 | let height m = 49 | let rec loop = function 50 | | Empty -> 0 51 | | C _ -> 1 52 | | Cn (l, r, _, _) -> 1 + max (loop l) (loop r) 53 | in 54 | loop m.tree 55 | 56 | let word_size v_size m = (* value sharing not taken into account. *) 57 | let rec loop = function 58 | | Empty -> 0 59 | | C (_, v) -> 3 + v_size v 60 | | Cn (l, r, _, v) -> 5 + loop l + loop r + v_size v 61 | in 62 | loop m.tree 63 | 64 | let rec dump pp_v ppf m = 65 | let open Uucp_fmt in 66 | let rec dump_tree ppf = function 67 | | Cn (l, r, i, v) -> 68 | pf ppf "@[<4>Cn(%a,@,%a,@,0x%04X,@,%a)@]" dump_tree l dump_tree r i pp_v v 69 | | C (i, v) -> 70 | pf ppf "@[<3>C(0x%04X,@,%a)@]" i pp_v v 71 | | Empty -> 72 | pf ppf "Empty" 73 | in 74 | record ["default", pp_v; "tree", dump_tree] ppf m.default m.tree 75 | -------------------------------------------------------------------------------- /src/uucp__func.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Function and graphics properties. *) 7 | 8 | (** {1:funcprops Function and graphics properties} *) 9 | 10 | val is_dash : Uchar.t -> bool 11 | (** [is_dash u] is [true] if [u] has the 12 | {{:http://www.unicode.org/reports/tr44/#Dash}Dash} 13 | property. *) 14 | 15 | val is_diacritic : Uchar.t -> bool 16 | (** [is_diacritic u] is [true] if [u] has the 17 | {{:http://www.unicode.org/reports/tr44/#Diacritic}Diacritic} 18 | property. *) 19 | 20 | val is_extender : Uchar.t -> bool 21 | (** [is_extender u] is [true] if [u] has the 22 | {{:http://www.unicode.org/reports/tr44/#Extender}Extender} 23 | property. *) 24 | 25 | val is_grapheme_base : Uchar.t -> bool 26 | (** [is_grapheme_base u] is [true] if [u] has the 27 | {{:http://www.unicode.org/reports/tr44/#Grapheme_Base}Grapheme_Base} 28 | property. *) 29 | 30 | val is_grapheme_extend : Uchar.t -> bool 31 | (** [is_grapheme_extend u] is [true] if [u] has the 32 | {{:http://www.unicode.org/reports/tr44/#Grapheme_Extend}Grapheme_Extend} 33 | property. *) 34 | 35 | val is_math : Uchar.t -> bool 36 | (** [is_math u] is [true] if [u] has the 37 | {{:http://www.unicode.org/reports/tr44/#Math}Math} 38 | property. *) 39 | 40 | val is_quotation_mark : Uchar.t -> bool 41 | (** [is_quotation_mark u] is [true] if [u] has the 42 | {{:http://www.unicode.org/reports/tr44/#Quotation_Mark}Quotation_Mark} 43 | property. *) 44 | 45 | val is_soft_dotted : Uchar.t -> bool 46 | (** [is_soft_dotted u] is [true] if [u] has the 47 | {{:http://www.unicode.org/reports/tr44/#Soft_Dotted}Soft_Dotted} 48 | property. *) 49 | 50 | val is_terminal_punctuation : Uchar.t -> bool 51 | (** [is_terminal_punctuation u] is [true] if [u] has the 52 | {{:http://www.unicode.org/reports/tr44/#Terminal_Punctuation} 53 | Terminal_Punctuation} property. *) 54 | 55 | val is_regional_indicator : Uchar.t -> bool 56 | (** [is_regional_indicator u] is [true] if [u] has the 57 | {{:http://www.unicode.org/reports/tr44/#Regional_Indicator} 58 | Regional_indicator} property. *) 59 | 60 | val is_join_control : Uchar.t -> bool 61 | (** [is_join_control u] is [true] if [u] has the 62 | {{:http://www.unicode.org/reports/tr44/#Join_Control}Join_Control} 63 | property. *) 64 | -------------------------------------------------------------------------------- /src/uucp_tmapbool.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* uchar to bool trie maps *) 7 | 8 | type t = 9 | { default : bool; (* default value. *) 10 | l0 : string array array } (* 0x1FFFFF as 0x1FF - 0xF - 0xFF *) 11 | 12 | let nil = [||] 13 | let snil = "" 14 | let l0_shift = 12 15 | let l0_size = 0x10F + 1 16 | let l1_shift = 8 17 | let l1_mask = 0xF 18 | let l1_size = 0xF + 1 19 | let l2_mask = 0xFF 20 | let l2_size = (0xFF + 1) / 8 21 | 22 | let create default = { default; l0 = Array.make l0_size nil } 23 | 24 | let get m u = 25 | let l1 = Array.unsafe_get m.l0 (u lsr l0_shift) in 26 | if l1 == nil then m.default else 27 | let l2 = Array.unsafe_get l1 (u lsr l1_shift land l1_mask) in 28 | if l2 == snil then m.default else 29 | let k = u land l2_mask in 30 | let byte_num = k lsr 3 (* / 8 *) in 31 | let bit_num = k land 7 (* mod 8 *) in 32 | let byte = Char.code (String.unsafe_get l2 byte_num) in 33 | byte land (1 lsl bit_num) > 0 34 | 35 | let set m u b = 36 | let l2_make m = Bytes.make l2_size (if m.default then '\xFF' else '\x00') in 37 | if b = m.default then () else 38 | let i = u lsr l0_shift in 39 | if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size snil; 40 | let j = u lsr l1_shift land l1_mask in 41 | if m.l0.(i).(j) == snil then 42 | m.l0.(i).(j) <- Bytes.unsafe_to_string (l2_make m); 43 | let k = u land l2_mask in 44 | let byte_num = k lsr 3 (* / 8 *) in 45 | let bit_num = k land 7 (* mod 8 *) in 46 | let byte = Char.code (String.get m.l0.(i).(j) byte_num) in 47 | let new_byte = 48 | if b then (Char.unsafe_chr (byte lor (1 lsl bit_num))) else 49 | (Char.unsafe_chr (byte land lnot (1 lsl bit_num))) 50 | in 51 | Bytes.set (Bytes.unsafe_of_string m.l0.(i).(j)) byte_num new_byte 52 | 53 | let word_size m = match m.l0 with 54 | | [||] -> 3 + 1 55 | | l0 -> 56 | let size = ref (3 + 1 + Array.length l0) in 57 | for i = 0 to Array.length l0 - 1 do match l0.(i) with 58 | | [||] -> () 59 | | l1 -> 60 | size := !size + 1 + Array.length l1; 61 | for j = 0 to Array.length l1 - 1 do 62 | size := !size + 1 + ((String.length l1.(j) * 8) / Sys.word_size) 63 | done; 64 | done; 65 | !size 66 | 67 | let iter_blobs i m = Array.(iter (iter i)) m.l0 68 | 69 | let dump_pp pp_v ppf m = 70 | let open Uucp_fmt in 71 | record ["default", bool; "l0", pp_v |> array_N |> array] ppf m.default m.l0 72 | 73 | let pp_v = Uucp_fmt.string_XN 74 | let dump = dump_pp pp_v 75 | -------------------------------------------------------------------------------- /src/uucp__break.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | include Uucp_break_base 7 | 8 | module Low = struct 9 | let line u = 10 | Uucp_tmapbyte.get Uucp_break_data.line_break_map (Uchar.to_int u) 11 | 12 | let line_max = line_max 13 | let line_of_int = line_of_byte 14 | 15 | let grapheme_cluster u = 16 | Uucp_tmapbyte.get Uucp_break_data.grapheme_cluster_break_map 17 | (Uchar.to_int u) 18 | 19 | let grapheme_cluster_max = grapheme_cluster_max 20 | let grapheme_cluster_of_int = grapheme_cluster_of_byte 21 | 22 | let word u = Uucp_tmapbyte.get Uucp_break_data.word_break_map (Uchar.to_int u) 23 | let word_max = word_max 24 | let word_of_int = word_of_byte 25 | 26 | let sentence u = 27 | Uucp_tmapbyte.get Uucp_break_data.sentence_break_map (Uchar.to_int u) 28 | 29 | let sentence_max = sentence_max 30 | let sentence_of_int = sentence_of_byte 31 | 32 | let indic_conjunct_break u = 33 | Uucp_tmapbyte.get Uucp_break_data.indic_conjunct_break_map (Uchar.to_int u) 34 | 35 | let indic_conjunct_break_max = indic_conjunct_break_max 36 | let indic_conjunct_break_of_int = indic_conjunct_break_of_byte 37 | end 38 | 39 | let line u = Array.unsafe_get Low.line_of_int (Low.line u) 40 | let grapheme_cluster u = Array.unsafe_get Low.grapheme_cluster_of_int 41 | (Low.grapheme_cluster u) 42 | 43 | let word u = Array.unsafe_get Low.word_of_int (Low.word u) 44 | let sentence u = Array.unsafe_get Low.sentence_of_int (Low.sentence u) 45 | let indic_conjunct_break u = 46 | Array.unsafe_get Low.indic_conjunct_break_of_int 47 | (Low.indic_conjunct_break u) 48 | 49 | let east_asian_width u = 50 | Uucp_rmap.get Uucp_break_data.east_asian_width_map (Uchar.to_int u) 51 | 52 | let tty_width_hint = 53 | let gc i = Uucp__gc.general_category (Uchar.unsafe_of_int i) in 54 | let eaw i = east_asian_width (Uchar.unsafe_of_int i) in 55 | fun u -> match Uchar.to_int u with 56 | (* U+0000 is actually safe to (non-)render. *) 57 | | 0 -> 0 58 | (* C0 or DELETE and C1 (general category Cc) is non-sensical. *) 59 | |u when u <= 0x001F || 0x007F <= u && u <= 0x009F -> -1 60 | (* Euro-centric fast path (blocks ASCII - Modifier Letters). 61 | Notably includes one Cf character, U+00AD (Soft hyphen). *) 62 | | u when u <= 0x02FF -> 1 63 | (* Non-spacing. *) 64 | | u when (let c = gc u in c = `Mn || c = `Me || c = `Cf) -> 0 65 | (* Wide east-asian; intersects non-spacing. *) 66 | | u when (let w = eaw u in w = `W || w = `F) -> 2 67 | (* or else. Notably includes Zl (U+2028) and Zp (U+2029). *) 68 | | _ -> 1 69 | -------------------------------------------------------------------------------- /src/uucp_rmap.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Binary tree uchar ranges maps. *) 7 | 8 | type 'a tree = 9 | | Empty 10 | | R of int * int * 'a 11 | | Rn of 'a tree * 'a tree * int * int * 'a 12 | 13 | type 'a t = { default : 'a; tree : 'a tree } 14 | 15 | let get m cp = 16 | let rec loop cp = function 17 | | Rn (l, r, is, ie, v) -> 18 | if cp < is then loop cp l else 19 | if cp > ie then loop cp r else 20 | v 21 | | R (is, ie, v) -> 22 | if cp < is then m.default else 23 | if cp > ie then m.default else 24 | v 25 | | Empty -> m.default 26 | in 27 | loop cp m.tree 28 | 29 | let of_sorted_list default l = (* perfect balance. *) 30 | let rec loop len l = 31 | if len = 1 then match l with 32 | | `R (is, ie, v) :: r -> R (is, ie, v), r 33 | | _ -> assert false 34 | else 35 | let len_ll = len / 2 in 36 | let len_rl = len - len_ll in 37 | let ltree, rlist = loop len_ll l in 38 | match rlist with 39 | | [] -> ltree, [] 40 | | `R (is, ie, v) :: r -> 41 | if len_rl = 1 then Rn (ltree, Empty, is, ie, v), r else 42 | let rtree, rlist = loop (len_rl - 1) r in 43 | Rn (ltree, rtree, is, ie, v), rlist 44 | in 45 | let keep acc (`R (_, _, v) as p) = if v <> default then p :: acc else acc in 46 | let l = List.rev (List.fold_left keep [] l) in 47 | let len = List.length l in 48 | let tree = if len = 0 then Empty else fst (loop len l) in 49 | { default; tree } 50 | 51 | let height m = 52 | let rec loop = function 53 | | Empty -> 0 54 | | R _ -> 1 55 | | Rn (l, r, _, _, _) -> 1 + max (loop l) (loop r) 56 | in 57 | loop m.tree 58 | 59 | let rec word_size v_size m = (* value sharing not taken into account. *) 60 | let rec loop = function 61 | | Empty -> 0 62 | | R (_, _, v) -> 4 + v_size v 63 | | Rn (l, r, _, _, v) -> 6 + loop l + loop r + v_size v 64 | in 65 | loop m.tree 66 | 67 | let iter_values f m = 68 | let rec loop f = function 69 | | Empty -> () 70 | | R (_, _, v) -> f v 71 | | Rn (l, r, _, _, v) -> f v; loop f l; loop f r 72 | in 73 | f m.default; loop f m.tree 74 | 75 | let rec dump pp_v ppf m = 76 | let open Uucp_fmt in 77 | let rec dump_tree ppf = function 78 | | Rn (l, r, is, ie, v) -> 79 | pf ppf "@[<4>Rn(%a,@,%a,@,0x%04X,@,0x%04X,@,%a)@]" 80 | dump_tree l dump_tree r is ie pp_v v 81 | | R (is, ie, v) -> 82 | pf ppf "@[<3>R(0x%04X,@,0x%04X,@,%a)@]" is ie pp_v v 83 | | Empty -> 84 | pf ppf "Empty" 85 | in 86 | record ["default", pp_v; "tree", dump_tree] ppf m.default m.tree 87 | -------------------------------------------------------------------------------- /test/examples.ml: -------------------------------------------------------------------------------- 1 | (* This code is in the public domain *) 2 | 3 | (* Case conversion on UTF-8 strings *) 4 | 5 | let cmap_utf_8 cmap s = 6 | let rec loop buf s i max = 7 | if i > max then Buffer.contents buf else 8 | let dec = String.get_utf_8_uchar s i in 9 | let u = Uchar.utf_decode_uchar dec in 10 | begin match cmap u with 11 | | `Self -> Buffer.add_utf_8_uchar buf u 12 | | `Uchars us -> List.iter (Buffer.add_utf_8_uchar buf) us 13 | end; 14 | loop buf s (i + Uchar.utf_decode_length dec) max 15 | in 16 | let buf = Buffer.create (String.length s * 2) in 17 | loop buf s 0 (String.length s - 1) 18 | 19 | let lowercase_utf_8 s = cmap_utf_8 Uucp.Case.Map.to_lower s 20 | let uppercase_utf_8 s = cmap_utf_8 Uucp.Case.Map.to_upper s 21 | 22 | (* Canonical caseless equality on UTF-8 strings *) 23 | 24 | let canonical_caseless_key s = 25 | let buf = Buffer.create (String.length s * 3) in 26 | let to_nfd_and_utf_8 = 27 | let n = Uunf.create `NFD in 28 | let rec add v = match Uunf.add n v with 29 | | `Await | `End -> () 30 | | `Uchar u -> Buffer.add_utf_8_uchar buf u; add `Await 31 | in 32 | add 33 | in 34 | let add = 35 | let n = Uunf.create `NFD in 36 | let rec add v = match Uunf.add n v with 37 | | `Await | `End -> () 38 | | `Uchar u -> 39 | begin match Uucp.Case.Fold.fold u with 40 | | `Self -> to_nfd_and_utf_8 (`Uchar u) 41 | | `Uchars us -> List.iter (fun u -> to_nfd_and_utf_8 (`Uchar u)) us 42 | end; 43 | add `Await 44 | in 45 | add 46 | in 47 | let rec loop buf s i max = 48 | if i > max then (add `End; to_nfd_and_utf_8 `End; Buffer.contents buf) else 49 | let dec = String.get_utf_8_uchar s i in 50 | add (`Uchar (Uchar.utf_decode_uchar dec)); 51 | loop buf s (i + Uchar.utf_decode_length dec) max 52 | in 53 | loop buf s 0 (String.length s - 1) 54 | 55 | let canonical_caseless_eq s0 s1 = 56 | canonical_caseless_key s0 = canonical_caseless_key s1 57 | 58 | (* Caseless equality for identifiers on UTF-8 strings. *) 59 | 60 | let id_caseless_key s = 61 | let rec add buf normalizer v = match Uunf.add normalizer v with 62 | | `Await | `End -> () 63 | | `Uchar u -> 64 | match Uucp.Case.Nfkc_fold.fold u with 65 | | `Self -> Buffer.add_utf_8_uchar buf u; add buf normalizer `Await 66 | | `Uchars us -> 67 | List.iter (Buffer.add_utf_8_uchar buf) us; add buf normalizer `Await 68 | in 69 | let rec loop buf s i max normalizer = 70 | if i > max then (add buf normalizer `End; Buffer.contents buf) else 71 | let dec = String.get_utf_8_uchar s i in 72 | add buf normalizer (`Uchar (Uchar.utf_decode_uchar dec)); 73 | loop buf s (i + Uchar.utf_decode_length dec) max normalizer 74 | in 75 | let buf = Buffer.create (String.length s * 3) in 76 | let normalizer = Uunf.create `NFD in 77 | loop buf s 0 (String.length s - 1) normalizer 78 | 79 | let id_caseless_eq s0 s1 = id_caseless_key s0 = id_caseless_key s1 80 | -------------------------------------------------------------------------------- /src/uucp_white_data.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2025 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* WARNING do not edit. This file was automatically generated. *) 7 | 8 | open Uucp_tmapbool 9 | let v000 = 10 | "\x00\x3e\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 11 | \x20\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 12 | let v001 = snil 13 | let v002 = 14 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 15 | \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 16 | let v003 = 17 | "\xff\x07\x00\x00\x00\x83\x00\x00\x00\x00\x00\x80\x00\x00\x00\x00\ 18 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 19 | let v004 = 20 | "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 21 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 22 | let white_space_map = 23 | { default = false; 24 | l0 = 25 | [|[|v000; v001; v001; v001; v001; v001; v001; v001; v001; v001; v001; 26 | v001; v001; v001; v001; v001|]; 27 | [|v001; v001; v001; v001; v001; v001; v002; v001; v001; v001; v001; 28 | v001; v001; v001; v001; v001|]; 29 | [|v003; v001; v001; v001; v001; v001; v001; v001; v001; v001; v001; 30 | v001; v001; v001; v001; v001|]; 31 | [|v004; v001; v001; v001; v001; v001; v001; v001; v001; v001; v001; 32 | v001; v001; v001; v001; v001|]; 33 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 34 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 35 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 36 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 37 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 38 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 39 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 40 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 41 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 42 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 43 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 44 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 45 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 46 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 47 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 48 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 49 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 50 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 51 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 52 | nil; nil|] } 53 | 54 | 55 | -------------------------------------------------------------------------------- /src/uucp__script.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Script and script extensions properties. 7 | 8 | {b References.} 9 | {ul 10 | {- Mark Davis, Ken Whistler. 11 | {{:http://www.unicode.org/reports/tr24/}{e Unicode script property}}. 12 | (latest version)} 13 | {- {{:http://www.unicode.org/charts/script/index.html}The Unicode script 14 | charts}.}} *) 15 | 16 | 17 | (** {1:scriptprop Script} *) 18 | 19 | type t = [ 20 | | `Adlm 21 | | `Aghb 22 | | `Ahom 23 | | `Arab 24 | | `Armi 25 | | `Armn 26 | | `Avst 27 | | `Bali 28 | | `Bamu 29 | | `Bass 30 | | `Batk 31 | | `Beng 32 | | `Berf 33 | | `Bhks 34 | | `Bopo 35 | | `Brah 36 | | `Brai 37 | | `Bugi 38 | | `Buhd 39 | | `Cakm 40 | | `Cans 41 | | `Cari 42 | | `Cham 43 | | `Cher 44 | | `Chrs 45 | | `Copt 46 | | `Cpmn 47 | | `Cprt 48 | | `Cyrl 49 | | `Deva 50 | | `Diak 51 | | `Dogr 52 | | `Dsrt 53 | | `Dupl 54 | | `Egyp 55 | | `Elba 56 | | `Elym 57 | | `Ethi 58 | | `Gara 59 | | `Geor 60 | | `Glag 61 | | `Gong 62 | | `Gonm 63 | | `Goth 64 | | `Gran 65 | | `Grek 66 | | `Gujr 67 | | `Gukh 68 | | `Guru 69 | | `Hang 70 | | `Hani 71 | | `Hano 72 | | `Hatr 73 | | `Hebr 74 | | `Hira 75 | | `Hluw 76 | | `Hmng 77 | | `Hmnp 78 | | `Hrkt 79 | | `Hung 80 | | `Ital 81 | | `Java 82 | | `Kali 83 | | `Kana 84 | | `Kawi 85 | | `Khar 86 | | `Khmr 87 | | `Khoj 88 | | `Knda 89 | | `Krai 90 | | `Kthi 91 | | `Kits 92 | | `Lana 93 | | `Laoo 94 | | `Latn 95 | | `Lepc 96 | | `Limb 97 | | `Lina 98 | | `Linb 99 | | `Lisu 100 | | `Lyci 101 | | `Lydi 102 | | `Mahj 103 | | `Maka 104 | | `Mand 105 | | `Mani 106 | | `Marc 107 | | `Medf 108 | | `Mend 109 | | `Merc 110 | | `Mero 111 | | `Mlym 112 | | `Modi 113 | | `Mong 114 | | `Mroo 115 | | `Mtei 116 | | `Mult 117 | | `Mymr 118 | | `Nagm 119 | | `Nand 120 | | `Narb 121 | | `Nbat 122 | | `Newa 123 | | `Nkoo 124 | | `Nshu 125 | | `Ogam 126 | | `Olck 127 | | `Onao 128 | | `Orkh 129 | | `Orya 130 | | `Osge 131 | | `Osma 132 | | `Ougr 133 | | `Palm 134 | | `Pauc 135 | | `Perm 136 | | `Phag 137 | | `Phli 138 | | `Phlp 139 | | `Phnx 140 | | `Plrd 141 | | `Prti 142 | | `Qaai 143 | | `Rjng 144 | | `Rohg 145 | | `Runr 146 | | `Samr 147 | | `Sarb 148 | | `Saur 149 | | `Sgnw 150 | | `Shaw 151 | | `Shrd 152 | | `Sidd 153 | | `Sidt 154 | | `Sind 155 | | `Sinh 156 | | `Sogd 157 | | `Sogo 158 | | `Sora 159 | | `Soyo 160 | | `Sund 161 | | `Sunu 162 | | `Sylo 163 | | `Syrc 164 | | `Tagb 165 | | `Takr 166 | | `Tale 167 | | `Talu 168 | | `Taml 169 | | `Tang 170 | | `Tavt 171 | | `Tayo 172 | | `Telu 173 | | `Tfng 174 | | `Tglg 175 | | `Thaa 176 | | `Thai 177 | | `Tibt 178 | | `Tirh 179 | | `Tnsa 180 | | `Todr 181 | | `Tols 182 | | `Toto 183 | | `Tutg 184 | | `Ugar 185 | | `Vaii 186 | | `Vith 187 | | `Wara 188 | | `Wcho 189 | | `Xpeo 190 | | `Xsux 191 | | `Yezi 192 | | `Yiii 193 | | `Zanb 194 | | `Zinh 195 | | `Zyyy 196 | | `Zzzz 197 | ] 198 | (** The type for scripts. *) 199 | 200 | val compare : t -> t -> int 201 | (** [compare s s'] is [Stdlib.compare s s']. *) 202 | 203 | val pp : Format.formatter -> t -> unit 204 | (** [pp ppf s] prints an unspecified representation of [s] on [ppf]. *) 205 | 206 | val script : Uchar.t -> t 207 | (** [script u] is [u]'s 208 | {{:http://www.unicode.org/reports/tr44/#Script}Script} property. *) 209 | 210 | val script_extensions : Uchar.t -> t list 211 | (** [script_extension u] is [u]'s 212 | {{:http://www.unicode.org/reports/tr44/#Script_Extensions} 213 | Script_Extensions} property. The list is never empty. *) 214 | -------------------------------------------------------------------------------- /src/uucp_tmap5bytes.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* uchar to 5 bytes trie maps. *) 7 | 8 | let str = Printf.sprintf 9 | let err_default l = str "default value length is %d, must be at least 5" l 10 | 11 | type t = 12 | { default : string; (* default value. *) 13 | l0 : string array array } (* 0x1FFFFF as 0x1FF - 0xF - 0xFF *) 14 | 15 | let nil = [||] 16 | let snil = "" 17 | let l0_shift = 12 18 | let l0_size = 0x10F + 1 19 | let l1_shift = 8 20 | let l1_mask = 0xF 21 | let l1_size = 0xF + 1 22 | let l2_mask = 0xFF 23 | let l2_size = (0xFF + 1) * 5 24 | 25 | let create default = 26 | let dlen = String.length default in 27 | if dlen >= 5 then { default; l0 = Array.make l0_size nil } else 28 | invalid_arg (err_default dlen) 29 | 30 | let word_size m = match m.l0 with 31 | | [||] -> 3 + 4 + 1 32 | | l0 -> 33 | let size = ref (3 + 4 + 1 + Array.length l0) in 34 | for i = 0 to Array.length l0 - 1 do match l0.(i) with 35 | | [||] -> () 36 | | l1 -> 37 | size := !size + 1 + Array.length l1; 38 | for j = 0 to Array.length l1 - 1 do 39 | size := !size + 1 + ((String.length l1.(j) * 8) / Sys.word_size) 40 | done; 41 | done; 42 | !size 43 | 44 | let iter_blobs i m = Array.(iter (iter i)) m.l0 45 | 46 | let dump_pp pp_v ppf m = 47 | let open Uucp_fmt in 48 | record ["default", string_X; "l0", pp_v |> array_N |> array] 49 | ppf m.default m.l0 50 | 51 | let pp_v = Uucp_fmt.string_XN 52 | let dump = dump_pp pp_v 53 | 54 | (* Five bytes as an uint20 pair *) 55 | 56 | let create_uint20_pair (d0, d1) = 57 | let default = Bytes.create 5 in 58 | Bytes.set default 0 (Char.unsafe_chr ((d0 land 0xFF))); 59 | Bytes.set default 1 (Char.unsafe_chr ((d0 lsr 8 land 0xFF))); 60 | Bytes.set default 2 (Char.unsafe_chr 61 | ((d0 lsr 12 land 0xF0) lor 62 | (d1 lsr 16 land 0x0F))); 63 | Bytes.set default 3 (Char.unsafe_chr ((d1 lsr 8 land 0xFF))); 64 | Bytes.set default 4 (Char.unsafe_chr ((d1 land 0xFF))); 65 | create (Bytes.unsafe_to_string default) 66 | 67 | let get_uint20_pair m u = 68 | let l1 = Array.unsafe_get m.l0 (u lsr l0_shift) in 69 | let s, k = 70 | if l1 == nil then m.default, 0 else 71 | let l2 = Array.unsafe_get l1 (u lsr l1_shift land l1_mask) in 72 | if l2 == snil then m.default, 0 else 73 | l2, (u land l2_mask) * 5 74 | in 75 | let i00 = Char.code (String.unsafe_get s (k )) in 76 | let i01 = Char.code (String.unsafe_get s (k + 1)) in 77 | let im = Char.code (String.unsafe_get s (k + 2)) in 78 | let i11 = Char.code (String.unsafe_get s (k + 3)) in 79 | let i10 = Char.code (String.unsafe_get s (k + 4)) in 80 | let i0 = ((im land 0xF0) lsl 12) lor (i01 lsl 8) lor i00 in 81 | let i1 = ((im land 0x0F) lsl 16) lor (i11 lsl 8) lor i10 in 82 | i0, i1 83 | 84 | let set_uint20_pair m u (i0, i1) = 85 | let l2_make m = 86 | let s = Bytes.create l2_size in 87 | for i = 0 to l2_size - 1 do Bytes.set s i (m.default.[i mod 4]) done; 88 | s 89 | in 90 | let d0 = 91 | ((Char.code m.default.[2] land 0xF0) lsl 12) lor 92 | (Char.code m.default.[1] lsl 8) 93 | lor (Char.code m.default.[0]) 94 | in 95 | let d1 = 96 | ((Char.code m.default.[2] land 0x0F) lsl 16) lor 97 | (Char.code m.default.[3] lsl 8) lor (Char.code m.default.[4]) 98 | in 99 | if d0 = i0 && d1 = i1 then () else 100 | let i = u lsr l0_shift in 101 | if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size snil; 102 | let j = u lsr l1_shift land l1_mask in 103 | if m.l0.(i).(j) == snil then 104 | m.l0.(i).(j) <- Bytes.unsafe_to_string (l2_make m); 105 | let k = (u land l2_mask) * 5 in 106 | let s = Bytes.unsafe_of_string (m.l0.(i).(j)) in 107 | Bytes.set s (k ) (Char.unsafe_chr ((i0 land 0xFF))); 108 | Bytes.set s (k + 1) (Char.unsafe_chr ((i0 lsr 8 land 0xFF))); 109 | Bytes.set s (k + 2) (Char.unsafe_chr 110 | ((i0 lsr 12 land 0xF0) lor 111 | (i1 lsr 16 land 0x0F))); 112 | Bytes.set s (k + 3) (Char.unsafe_chr ((i1 lsr 8 land 0xFF))); 113 | Bytes.set s (k + 4) (Char.unsafe_chr ((i1 land 0xFF))); 114 | () 115 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | v17.0.0 2025-09-11 Zagreb 2 | ------------------------- 3 | 4 | - Unicode 17.0.0 support. 5 | 6 | v16.0.0 2024-09-11 Zagreb 7 | ------------------------- 8 | 9 | - Unicode 16.0.0 support. 10 | - Fix larger than needed size constant in implementation of boolean 11 | trie maps. Thanks to John Tristan for the report (#26). 12 | 13 | v15.1.0 2023-09-15 Zagreb 14 | ------------------------- 15 | 16 | - Unicode 15.1.0 support. 17 | - Require OCaml 4.14.0. 18 | - Use module aliases for the property modules. Only pay for the 19 | modules you use (#2). 20 | - Use the standard library UTF decoders in the sample code and in 21 | `ucharinfo` (#23). 22 | - The `Num.numeric_value` had to be changed to accomodate for the 23 | data. It now returns either NaN or a list of numbers. This is due to 24 | the interpretation of U+5146 and U+79ED which is locale dependent 25 | and thus can represent multiple values. In all other cases you 26 | should get singelton lists so far. 27 | - Rename `Uucd.Cjk.ids_bin_op` to `Uucd.Cjk.ids_binary_operator`. 28 | - Rename `Uucd.Cjk.ids_tri_op` to `Uccd.Cjk.ids_trinary_operator`. 29 | - Add `Uucd.Cjk.ids_unary_operator`, support for the new `IDS_Unary_Operator` 30 | property. 31 | - Add `Uucd.Id.is_id_compat_math_{start,continue}`, support for the new 32 | `ID_Compat_Math_{Start,Continue}` properties. 33 | - Add `Uucd.Case.Nfkc_simple_fold.fold`, support for the new 34 | `NFKC_Simple_Casefold` property. 35 | - Add `Uucd.Break.indic_conjunct_break`, support for the new 36 | `Indic_Conjunct_Break` property. 37 | 38 | v15.0.0 2022-09-15 Zagreb 39 | ------------------------- 40 | 41 | - Unicode 15.0.0 support. 42 | - Tool `ucharinfo`: allow to query more than one character at a time. 43 | - Tool `ucharinfo`: tool allow to search characters by name. 44 | 45 | v14.0.0 2021-09-17 Zagreb 46 | ------------------------- 47 | 48 | - Unicode 14.0.0 support. 49 | - Tweak `Uucp.Break.tty_width_hint` (09d2186). Thanks to David Kaloper 50 | Meršinjak. 51 | 52 | v13.0.0 2020-03-10 La Forclaz (VS) 53 | ---------------------------------- 54 | 55 | - Unicode 13.0.0 support. Adds the `Emoji` module with the new emoji 56 | properties. 57 | - Reduce data size by improving structure sharing. Thanks to David Kaloper 58 | Meršinjak for the help. 59 | - Handle `Pervasives` deprecation. 60 | - Require OCaml >= 4.03.0 61 | 62 | v12.0.0 2019-03-07 La Forclaz (VS) 63 | ---------------------------------- 64 | 65 | - Unicode 12.0.0 support. 66 | 67 | v11.0.0 2018-06-06 Zürich 68 | ------------------------- 69 | 70 | - Unicode 11.0.0 support. 71 | - Add support for the Join_Control property (`Uucp.Func.is_join_control`) 72 | and the Hangul_Syllable_Type property (`Uucp.Hangul.syllable_type`). 73 | 74 | v10.0.1 2017-06-21 Cambridge (UK) 75 | --------------------------------- 76 | 77 | - Fix wrong build dependencies of ucharinfo. Thanks to Andreas Hauptmann 78 | for the report. 79 | 80 | v10.0.0 2017-06-20 Cambridge (UK) 81 | --------------------------------- 82 | 83 | - Unicode 10.0.0 support. 84 | - Add ucharinfo tool to query Unicode character information on the cli. 85 | - OCaml 4.05 compatibility (removal of `Uchar.dump`) 86 | 87 | v2.0.0 2016-11-23 Zagreb 88 | ------------------------ 89 | 90 | - Unicode 9.0.0 support. 91 | - OCaml standard library `Uchar.t` support. 92 | - Removes and substitutes `type Uucp.uchar = int` by the (abstract) 93 | `Uchar.t` type. `Uchar.{of,to}_int` allows to recover the previous 94 | representation. 95 | - Removes the `Uucp.Uchar` module, corresponding functionality can 96 | be found in `Uchar`. 97 | - Safe string support. 98 | - Build depend on topkg. 99 | - Relicense from BSD3 to ISC. 100 | 101 | v1.1.0 2015-11-20 Cambridge (UK) 102 | -------------------------------- 103 | 104 | - Add support for the East Asian width property (`Uucp.Break.east_asian_width`). 105 | - Add the non-normative, heuristic function `Uucp.Break.tty_width_hint`. 106 | Thanks to David Kaloper for the contribution. 107 | 108 | v1.0.0 2015-06-17 Cambridge (UK) 109 | -------------------------------- 110 | 111 | - Updated for Unicode 8.0.0 112 | Incompatible release, new variants cases are introduced, see commit 113 | adbb5efc036 for details. 114 | 115 | v0.9.1 2014-12-23 Cugy (VD) 116 | --------------------------- 117 | 118 | - Add access to the `Line_break`, `Grapheme_cluster_break`, `Word_break` and 119 | `Sentence_break` Unicode properties. See the `Uucp.Break` module. 120 | - Improvements and fixes to the minimal Unicode Introduction. 121 | 122 | 123 | v0.9.0 2014-06-28 Cambridge (UK) 124 | ------------------------------- 125 | 126 | First release. Part of the work was sponsored by OCaml Labs. 127 | -------------------------------------------------------------------------------- /support/gen_name.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* The name property being distinct for most scalar value it doesn't 7 | compress well using any of Uucp's maps, especially on 64-bit 8 | platforms, where a pointer costs 8 bytes. To give an idea having 9 | one pointer per scalar value already uses around 8 Mo of memory 10 | ((0x10_FFFF - 0x07FF) * 8 / (1024 * 1024)). 11 | 12 | For a more efficient encoding of names we cut the name property of 13 | a scalar value in two trying to share the prefix with the name of 14 | the two next scalar value. This results in two tokens. Tokens are 15 | stored uniquely in a large strings separated by \x00 bytes. To map 16 | a scalar value to a name we use a trie that maps scalars values to 17 | 5 bytes stored as string chunks (see uucp_tmap5bytes.ml). These 18 | five bytes encode two 20-bit unsigned integers which are the 19 | offsets of the two tokens making up the name in the string of 20 | tokens. 21 | 22 | The lookup procedure simply gets the two 20-bit integers from the 23 | trie map, looks up the corresponding tokens in the string of tokens 24 | concatenates them to form the name. We also keep the pattern naming 25 | mechanism inherited from the UCD XML for CJK names (see prop in 26 | pp_name below and Uucp_name.name). *) 27 | 28 | let split_in_two_tokens ~other ~other' n = 29 | let find_right_sp s = try String.rindex s ' ' with Not_found -> 0 in 30 | let find_prefix_len s0 s1 = 31 | let len_s0 = String.length s0 in 32 | let len_s1 = String.length s1 in 33 | let max_idx = if len_s0 < len_s1 then len_s0 - 1 else len_s1 -1 in 34 | let rec loop i = if i > max_idx || s0.[i] <> s1.[i] then i else loop (i + 1) 35 | in loop 0 36 | in 37 | let n_len = String.length n in 38 | let cut_len = find_prefix_len other n in 39 | let cut_len' = find_prefix_len other' n in 40 | let cut_len = max cut_len cut_len' in 41 | let cut_len = if cut_len < 4 then find_right_sp n else cut_len in 42 | let cut_len = 43 | (* This refinement makes slightly less tokens and space *) 44 | if cut_len <> 0 && n.[cut_len - 1] = ' ' then cut_len - 1 else cut_len 45 | in 46 | String.sub n 0 cut_len, String.sub n cut_len (n_len - cut_len) 47 | 48 | let has_final_sharp s = 49 | try ignore (String.index s '#' = String.length s - 1); true with 50 | | Not_found -> false 51 | 52 | let name_prop ucd = 53 | let tok_buffer = Buffer.create (1024 * 1024) in 54 | let tok_index = Hashtbl.create 30_000 in 55 | let tok_offset = ref 0 in 56 | let get_tok_idx tok = try Hashtbl.find tok_index tok with 57 | | Not_found -> 58 | let offset = !tok_offset in 59 | Hashtbl.add tok_index tok offset; 60 | Buffer.add_string tok_buffer tok; 61 | Buffer.add_char tok_buffer '\x00'; 62 | tok_offset := !tok_offset + String.length tok + 1; 63 | offset 64 | in 65 | let prop u = match Gen.ucd_get ucd u Uucd.name with 66 | | `Name n -> 67 | assert (not (has_final_sharp n)); 68 | let u = Uchar.unsafe_of_int u in 69 | let bound_succ u = 70 | if Uchar.equal u Uchar.max then Uchar.max else Uchar.succ u 71 | in 72 | let u' = bound_succ u in 73 | let u'' = bound_succ u' in 74 | let other = match Gen.ucd_get ucd (Uchar.to_int u') Uucd.name with 75 | | `Name n -> n | `Pattern n -> n 76 | in 77 | let other' = match Gen.ucd_get ucd (Uchar.to_int u'') Uucd.name with 78 | | `Name n -> n | `Pattern n -> n 79 | in 80 | let l, r = split_in_two_tokens ~other ~other' n in 81 | (* empty on the left is used for patterns so don't let this happen *) 82 | let l, r = if l = "" then r, "" else l, r in 83 | get_tok_idx l, get_tok_idx r 84 | | `Pattern n -> 85 | assert (has_final_sharp n); 86 | (* empty on the left and non-empty on right implies pattern *) 87 | get_tok_idx "", get_tok_idx (String.sub n 0 (String.length n - 1)) 88 | in 89 | ignore (get_tok_idx ""); (* assign idx 0 to "" *) 90 | let default = (0, 0) in 91 | Buffer.contents tok_buffer, 92 | Hashtbl.length tok_index, 93 | prop, Gen.prop_tmap5bytes_uint20_pair prop default 94 | 95 | let pp_name ppf ucd = 96 | Gen.log "* name property, 5 bytes trie map and token table@\n"; 97 | let toks, tok_count, prop, (m, get) = name_prop ucd in 98 | let size = Uucp_tmap5bytes.word_size m in 99 | Gen.log " size: %a" Gen.pp_size size; 100 | Gen.log " token count: %d tokens size: %a@\n" 101 | tok_count Gen.pp_size (String.length toks / (Sys.word_size / 8)); 102 | Gen.log " asserting"; Gen.assert_prop_map prop get; 103 | Gen.log ", generating@\n"; 104 | Gen.pp ppf "@[<2>let name_toks : string =@ %a@]@\n@\n" Uucp_fmt.string_X toks; 105 | Gen.pp_tmap5byte ppf "name" m; 106 | () 107 | 108 | let pp_name_alias ppf ucd = 109 | let size v = 110 | 3 * (List.length v) + 111 | List.fold_left (fun acc (_, n) -> 3 + String.length n) 0 v 112 | in 113 | let pp_tag ppf t = Gen.pp ppf "`%a" Uucp_name_base.pp_alias_tag t in 114 | let pp_alias ppf (t, n) = Gen.pp ppf "@[<1>(%a,@,%S)@]" pp_tag t n in 115 | let pp_alist = Gen.pp_list pp_alias in 116 | let prop u = 117 | let permute (n, t) = (t, n) in 118 | List.map permute (Gen.ucd_get ucd u Uucd.name_alias) 119 | in 120 | Gen.pp_prop_cmap ppf prop 121 | "name_alias" "(Uucp_name_base.alias_tag * string) list" 122 | pp_alist ~default:[] size 123 | 124 | let pp_props ppf ucd = 125 | pp_name ppf ucd; 126 | pp_name_alias ppf ucd; 127 | () 128 | 129 | let pp_mod ppf ucd = Gen.pp_mod pp_props ppf ucd 130 | -------------------------------------------------------------------------------- /B0.ml: -------------------------------------------------------------------------------- 1 | open B0_kit.V000 2 | open Result.Syntax 3 | 4 | let unicode_version = 17, 0, 0, None (* Adjust on new releases *) 5 | let next_major = B0_version.next_major unicode_version 6 | 7 | (* OCaml library names *) 8 | 9 | let b0_std = B0_ocaml.libname "b0.std" 10 | let unix = B0_ocaml.libname "unix" 11 | let uucd = B0_ocaml.libname "uucd" 12 | let uunf = B0_ocaml.libname "uunf" 13 | let cmdliner = B0_ocaml.libname "cmdliner" 14 | 15 | let uucp = B0_ocaml.libname "uucp" 16 | 17 | (* Libraries *) 18 | 19 | let uucp_lib = B0_ocaml.lib uucp ~srcs:[ `Dir ~/"src" ] 20 | 21 | (* Data generation. *) 22 | 23 | let generate_data = 24 | let doc = "uucp_*_data.ml files generator" in 25 | let srcs = 26 | [ `Dir ~/"support"; 27 | (* Well that was based on ocamlbuild loose' inclusion, 28 | maybe we could move all that to a single _base.ml module. *) 29 | `File ~/"src/uucp_block_base.ml"; 30 | `File ~/"src/uucp_break_base.ml"; 31 | `File ~/"src/uucp_gc_base.ml"; 32 | `File ~/"src/uucp_hangul_base.ml"; 33 | `File ~/"src/uucp_name_base.ml"; 34 | `File ~/"src/uucp_num_base.ml"; 35 | `File ~/"src/uucp_script_base.ml"; 36 | (* *) 37 | `File ~/"src/uucp_cmap.ml"; 38 | `File ~/"src/uucp_fmt.ml"; 39 | `File ~/"src/uucp_rmap.ml"; 40 | `File ~/"src/uucp_tmap.ml"; 41 | `File ~/"src/uucp_tmap5bytes.ml"; 42 | `File ~/"src/uucp_tmapbool.ml"; 43 | `File ~/"src/uucp_tmapbyte.ml"; ] 44 | in 45 | let requires = [ uucd; unix ] in 46 | let meta = 47 | B0_meta.empty 48 | |> B0_meta.(tag build) 49 | |> ~~ B0_unit.Action.cwd `Scope_dir 50 | in 51 | B0_ocaml.exe "generate-data" ~doc ~srcs ~requires ~meta 52 | 53 | (* Tools *) 54 | 55 | let ucharinfo = 56 | let doc = "The ucharinfo tool" in 57 | let srcs = [ `File ~/"test/ucharinfo.ml" ] in 58 | let requires = [ cmdliner; uunf; uucp ] in 59 | B0_ocaml.exe "ucharinfo" ~public:true ~doc ~srcs ~requires 60 | 61 | (* Tests *) 62 | 63 | let test ?doc ?(meta = B0_meta.empty) ?run:(r = false) ?(requires = []) src = 64 | let srcs = [ `File src ] in 65 | let requires = uucp :: requires in 66 | let meta = B0_meta.(meta |> tag test |> add run r) in 67 | let name = Fpath.basename ~strip_exts:true src in 68 | B0_ocaml.exe name ?doc ~srcs ~meta ~requires 69 | 70 | let test_uucp = 71 | let doc = "Test Uucp against the Unicode database." in 72 | let meta = B0_meta.empty |> ~~ B0_unit.Action.cwd `Scope_dir in 73 | test ~/"test/test_uucp.ml" ~requires:[uucd; b0_std] ~meta ~run:true ~doc 74 | 75 | let perf = test ~/"test/perf.ml" ~doc:"Test performance" 76 | let link_test = test ~/"test/link_test.ml" ~doc:"Link test" 77 | let examples = test ~/"test/examples.ml" ~requires:[uunf] ~doc:"Doc samples" 78 | 79 | (* Actions *) 80 | 81 | let uc_base = "http://www.unicode.org/Public" 82 | 83 | let show_version = 84 | B0_unit.of_action "unicode-version" ~doc:"Show supported unicode version" @@ 85 | fun _ _ ~args:_ -> 86 | Ok (Log.stdout (fun m -> m "%s" (B0_version.to_string unicode_version))) 87 | 88 | let download_ucdxml = 89 | let doc = "Download the ucdxml to support/ucd.xml" in 90 | B0_unit.of_action "download-ucdxml" ~doc @@ fun env _ ~args:_ -> 91 | let* unzip = B0_env.get_cmd env (Cmd.tool "unzip") in 92 | let version = B0_version.to_string unicode_version in 93 | let ucd_url = Fmt.str "%s/%s/ucdxml/ucd.all.grouped.zip" uc_base version in 94 | let ucd_file = B0_env.in_scope_dir env ~/"support/ucd.xml" in 95 | Result.join @@ Os.File.with_tmp_fd @@ fun tmpfile tmpfd -> 96 | (Log.stdout @@ fun m -> 97 | m "@[Downloading %s@,to %a@]" ucd_url Fpath.pp ucd_file); 98 | let* () = B0_action_kit.fetch_url env ucd_url tmpfile in 99 | let stdout = Os.Cmd.out_file ~force:true ~make_path:true ucd_file in 100 | Os.Cmd.run Cmd.(unzip % "-p" %% path tmpfile) ~stdout 101 | 102 | (* Packs *) 103 | 104 | let default = 105 | let unicode_version = B0_version.to_string unicode_version in 106 | let next_major = B0_version.to_string next_major in 107 | let meta = 108 | B0_meta.empty 109 | |> ~~ B0_meta.authors ["The uucp programmers"] 110 | |> ~~ B0_meta.maintainers ["Daniel Bünzli "] 111 | |> ~~ B0_meta.homepage "https://erratique.ch/software/uucp" 112 | |> ~~ B0_meta.online_doc "https://erratique.ch/software/uucp/doc/" 113 | |> ~~ B0_meta.licenses ["ISC"] 114 | |> ~~ B0_meta.repo "git+https://erratique.ch/repos/uucp.git" 115 | |> ~~ B0_meta.issues "https://github.com/dbuenzli/uucp/issues" 116 | |> ~~ B0_meta.description_tags 117 | ["unicode"; "text"; "character"; "org:erratique"] 118 | |> ~~ B0_opam.build 119 | {|[["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%" 120 | "--with-uunf" "%{uunf:installed}%" 121 | "--with-cmdliner" "%{cmdliner:installed}%" ]]|} 122 | |> B0_meta.tag B0_opam.tag 123 | |> ~~ B0_opam.depopts [ "uunf", ""; "cmdliner", ""] 124 | |> ~~ B0_opam.conflicts 125 | [ "uunf", Fmt.str {|< "%s" | >= "%s" |} unicode_version next_major; 126 | "cmdliner", {|< "1.1.0"|} ] 127 | |> ~~ B0_opam.depends 128 | [ "ocaml", {|>= "4.14.0"|}; 129 | "ocamlfind", {|build|}; 130 | "ocamlbuild", {|build|}; 131 | "topkg", {|build & >= "1.1.0"|}; 132 | "uucd", Fmt.str {|with-test dev & >= "%s" & < "%s"|} 133 | unicode_version next_major; 134 | "uunf", {|with-test|} ] 135 | |> ~~ B0_opam.file_addendum 136 | [ `Field ("post-messages", `L (true, [ 137 | `S "If the build fails with \"ocamlopt.opt got signal and \ 138 | exited\", issue 'ulimit -s unlimited' and retry."; 139 | `Raw {|{failure & (arch = "ppc64" | arch = "arm64")}|}]))] 140 | in 141 | B0_pack.make "default" ~doc:"uucd package" ~meta ~locked:true @@ 142 | B0_unit.list () 143 | -------------------------------------------------------------------------------- /src/uucp__case.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2013 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Case properties, mappings and foldings. 7 | 8 | These properties can implement Unicode's default case detection, 9 | case conversion and caseless equality over Unicode text, see the 10 | {{!Case.caseexamples}examples}. 11 | 12 | {b References.} 13 | {ul 14 | {- {{:http://unicode.org/faq/casemap_charprop.html#casemap} 15 | The Unicode case mapping FAQ.}} 16 | {- {{:http://www.unicode.org/charts/case/}The Unicode case mapping 17 | charts.}}} *) 18 | 19 | (** {1:caseprops Case properties} *) 20 | 21 | val is_lower : Uchar.t -> bool 22 | (** [is_lower u] is [true] iff [u] has the 23 | {{:http://www.unicode.org/reports/tr44/#Lowercase}Lowercase} derived 24 | property. *) 25 | 26 | val is_upper : Uchar.t -> bool 27 | (** [is_upper u] is [true] iff [u] has the 28 | {{:http://www.unicode.org/reports/tr44/#Uppercase}Uppercase} derived 29 | property. *) 30 | 31 | val is_cased : Uchar.t -> bool 32 | (** [is_cased u] is [true] iff [u] has the 33 | {{:http://www.unicode.org/reports/tr44/#Cased}Cased} derived property. *) 34 | 35 | val is_case_ignorable : Uchar.t -> bool 36 | (** [is_case_ignorable] is [true] iff [u] has the 37 | {{:http://www.unicode.org/reports/tr44/#Case_Ignorable}Case_Ignorable} 38 | derived property. *) 39 | 40 | (** {1:casemapfold Case mappings and foldings} 41 | 42 | These character mapping functions return [`Self] 43 | whenever a character maps to itself. *) 44 | 45 | module Map = Uucp__case_map 46 | module Fold = Uucp__case_fold 47 | module Nfkc_fold = Uucp__case_nfkc 48 | module Nfkc_simple_fold = Uucp__case_nfkc_simple 49 | 50 | (** {1:caseexamples Examples} 51 | 52 | All these examples replace invalid UTF-8 decodes by an {!Uchar.rep}. 53 | 54 | {2:caseconversion Default case conversion on UTF-8 strings} 55 | 56 | The value [casemap_utf_8 cmap s] is the UTF-8 encoded string 57 | resulting from applying the character map [cmap] to every 58 | character of the UTF-8 encoded string [s]. 59 | 60 | {[ 61 | let cmap_utf_8 cmap s = 62 | let rec loop buf s i max = 63 | if i > max then Buffer.contents buf else 64 | let dec = String.get_utf_8_uchar s i in 65 | let u = Uchar.utf_decode_uchar dec in 66 | begin match cmap u with 67 | | `Self -> Buffer.add_utf_8_uchar buf u 68 | | `Uchars us -> List.iter (Buffer.add_utf_8_uchar buf) us 69 | end; 70 | loop buf s (i + Uchar.utf_decode_length dec) max 71 | in 72 | let buf = Buffer.create (String.length s * 2) in 73 | loop buf s 0 (String.length s - 1) 74 | ]} 75 | 76 | Using the function [cmap_utf_8], Unicode's default case 77 | conversions can be implemented with: 78 | 79 | {[ 80 | let lowercase_utf_8 s = cmap_utf_8 Uucp.Case.Map.to_lower s 81 | let uppercase_utf_8 s = cmap_utf_8 Uucp.Case.Map.to_upper s 82 | ]} 83 | 84 | However strictly speaking [lowercase_utf_8] is not conformant 85 | as it doesn't handle the context sensitive mapping of capital 86 | sigma U+03A3 to final sigma U+03C2. 87 | 88 | Note that applying Unicode's default case algorithms to a normalized 89 | string does not preserve its normalization form. 90 | 91 | {2:caselesseq Default caseless matching (equality) on UTF-8 strings} 92 | 93 | These examples use {!Uunf} to normalize character sequences 94 | 95 | Unicode canonical caseless matching (D145) is defined by 96 | normalizing to NFD, applying the Case_Folding mapping, normalizing 97 | again to NFD and test the result for binary equality: 98 | 99 | {[ 100 | let canonical_caseless_key s = 101 | let buf = Buffer.create (String.length s * 3) in 102 | let to_nfd_and_utf_8 = 103 | let n = Uunf.create `NFD in 104 | let rec add v = match Uunf.add n v with 105 | | `Await | `End -> () 106 | | `Uchar u -> Buffer.add_utf_8_uchar buf u; add `Await 107 | in 108 | add 109 | in 110 | let add = 111 | let n = Uunf.create `NFD in 112 | let rec add v = match Uunf.add n v with 113 | | `Await | `End -> () 114 | | `Uchar u -> 115 | begin match Uucp.Case.Fold.fold u with 116 | | `Self -> to_nfd_and_utf_8 (`Uchar u) 117 | | `Uchars us -> List.iter (fun u -> to_nfd_and_utf_8 (`Uchar u)) us 118 | end; 119 | add `Await 120 | in 121 | add 122 | in 123 | let rec loop buf s i max = 124 | if i > max then (add `End; to_nfd_and_utf_8 `End; Buffer.contents buf) else 125 | let dec = String.get_utf_8_uchar s i in 126 | add (`Uchar (Uchar.utf_decode_uchar dec)); 127 | loop buf s (i + Uchar.utf_decode_length dec) max 128 | in 129 | loop buf s 0 (String.length s - 1) 130 | 131 | let canonical_caseless_eq s0 s1 = 132 | canonical_caseless_key s0 = canonical_caseless_key s1 133 | ]} 134 | 135 | Unicode's caseless matching for identifiers (D147, see also 136 | {{:http://www.unicode.org/reports/tr31/}UAX 31}) is defined 137 | by normalizing to NFD, applying the NFKC_Casefold mapping and test 138 | the result for binary equality: 139 | 140 | {[ 141 | let id_caseless_key s = 142 | let rec add buf normalizer v = match Uunf.add normalizer v with 143 | | `Await | `End -> () 144 | | `Uchar u -> 145 | match Uucp.Case.Nfkc_fold.fold u with 146 | | `Self -> Buffer.add_utf_8_uchar buf u; add buf normalizer `Await 147 | | `Uchars us -> 148 | List.iter (Buffer.add_utf_8_uchar buf) us; add buf normalizer `Await 149 | in 150 | let rec loop buf s i max normalizer = 151 | if i > max then (add buf normalizer `End; Buffer.contents buf) else 152 | let dec = String.get_utf_8_uchar s i in 153 | add buf normalizer (`Uchar (Uchar.utf_decode_uchar dec)); 154 | loop buf s (i + Uchar.utf_decode_length dec) max normalizer 155 | in 156 | let buf = Buffer.create (String.length s * 3) in 157 | let normalizer = Uunf.create `NFD in 158 | loop buf s 0 (String.length s - 1) normalizer 159 | 160 | let id_caseless_eq s0 s1 = id_caseless_key s0 = id_caseless_key s1 161 | ]} 162 | *) 163 | -------------------------------------------------------------------------------- /support/generate_data.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Extracts data from the Unicode Character Database *) 7 | 8 | let str = Format.sprintf 9 | let exec = Filename.basename Sys.executable_name 10 | 11 | let ucd_or_die inf = try 12 | let ic = if inf = "-" then stdin else open_in inf in 13 | let d = Uucd.decoder (`Channel ic) in 14 | match Uucd.decode d with 15 | | `Ok db -> db 16 | | `Error e -> 17 | let (l0, c0), (l1, c1) = Uucd.decoded_range d in 18 | Printf.eprintf "%s:%d.%d-%d.%d: %s\n%!" inf l0 c0 l1 c1 e; 19 | exit 1 20 | with Sys_error e -> Printf.eprintf "%s\n%!" e; exit 1 21 | 22 | let process 23 | inf use_default age alpha block break case case_map case_fold case_nfkc 24 | case_nfkc_simple cjk emoji func gc gen hangul id name num script version 25 | white 26 | = 27 | let ucd = (Gen.log "Loading Unicode character database.\n"; ucd_or_die inf)in 28 | let generate pp f ucd = match f with 29 | | `Default _ when not use_default -> () 30 | | `Default fn | `Set fn -> 31 | try 32 | let oc = if fn = "-" then stdout else open_out fn in 33 | try 34 | let ppf = Format.formatter_of_out_channel oc in 35 | pp ppf ucd; 36 | Format.pp_print_flush ppf (); 37 | close_out oc 38 | with Sys_error _ as e -> close_out oc; raise e 39 | with Sys_error e -> Printf.eprintf "%s\n%!" e; exit 1 40 | in 41 | Gen.log "Note: reported sizes do not take sharing into account.\n"; 42 | generate Gen_age.pp_mod age ucd; 43 | generate Gen_alpha.pp_mod alpha ucd; 44 | generate Gen_block.pp_mod block ucd; 45 | generate Gen_break.pp_mod break ucd; 46 | generate Gen_case.pp_mod case ucd; 47 | generate Gen_case_map.pp_mod case_map ucd; 48 | generate Gen_case_fold.pp_mod case_fold ucd; 49 | generate Gen_case_nfkc.pp_mod case_nfkc ucd; 50 | generate Gen_case_nfkc_simple.pp_mod case_nfkc_simple ucd; 51 | generate Gen_cjk.pp_mod cjk ucd; 52 | generate Gen_emoji.pp_mod emoji ucd; 53 | generate Gen_func.pp_mod func ucd; 54 | generate Gen_gc.pp_mod gc ucd; 55 | generate Gen_gen.pp_mod gen ucd; 56 | generate Gen_hangul.pp_mod hangul ucd; 57 | generate Gen_id.pp_mod id ucd; 58 | generate Gen_name.pp_mod name ucd; 59 | generate Gen_num.pp_mod num ucd; 60 | generate Gen_script.pp_mod script ucd; 61 | generate (Gen.pp_mod Gen.pp_version) version ucd; 62 | generate Gen_white.pp_mod white ucd; 63 | () 64 | 65 | let main () = 66 | let usage = str 67 | "Usage: %s [OPTION]... [DBFILE]\n\ 68 | \ Generates data modules from Unicode character database XML file.\n\ 69 | \ DBFILE defaults to support/ucd.xml\n\ 70 | \ If no option is specified all files are generated to \ 71 | src/uucd_*_data.ml files\n\ 72 | Options:" exec 73 | in 74 | let inf = ref None in 75 | let set_inf f = 76 | if !inf = None then inf := Some f else 77 | raise (Arg.Bad "only one Unicode character database file can be specified") 78 | in 79 | let use_default = ref true in 80 | let age = ref (`Default "src/uucp_age_data.ml") in 81 | let alpha = ref (`Default "src/uucp_alpha_data.ml") in 82 | let block = ref (`Default "src/uucp_block_data.ml") in 83 | let break = ref (`Default "src/uucp_break_data.ml") in 84 | let case = ref (`Default "src/uucp_case_data.ml") in 85 | let case_map = ref (`Default "src/uucp_case_map_data.ml") in 86 | let case_fold = ref (`Default "src/uucp_case_fold_data.ml") in 87 | let case_nfkc = ref (`Default "src/uucp_case_nfkc_data.ml") in 88 | let case_nfkc_simple = ref (`Default "src/uucp_case_nfkc_simple_data.ml") in 89 | let cjk = ref (`Default "src/uucp_cjk_data.ml") in 90 | let emoji = ref (`Default "src/uucp_emoji_data.ml") in 91 | let func = ref (`Default "src/uucp_func_data.ml") in 92 | let gc = ref (`Default "src/uucp_gc_data.ml") in 93 | let gen = ref (`Default "src/uucp_gen_data.ml") in 94 | let hangul = ref (`Default "src/uucp_hangul_data.ml") in 95 | let id = ref (`Default "src/uucp_id_data.ml") in 96 | let name = ref (`Default "src/uucp_name_data.ml") in 97 | let num = ref (`Default "src/uucp_num_data.ml") in 98 | let script = ref (`Default "src/uucp_script_data.ml") in 99 | let version = ref (`Default "src/uucp_version_data.ml") in 100 | let white = ref (`Default "src/uucp_white_data.ml") in 101 | let set r = Arg.String (fun s -> use_default := false; r := `Set s) in 102 | let options = [ 103 | "-age", set age, " Support for the age property"; 104 | "-alpha", set alpha, " Support for the alphabetic property"; 105 | "-block", set block, " Support for block properties"; 106 | "-break", set break, " Support for break properties"; 107 | "-case", set case, " Support for case properties"; 108 | "-case-map", set case_map, " Support for case mappings"; 109 | "-case-fold", set case_fold, " Support for case folding"; 110 | "-case-nfkc", set case_nfkc, " Support for NFKC case folding"; 111 | "-case-nfkc-simple", set case_nfkc_simple, 112 | " Support for NFKC simple case folding"; 113 | "-cjk", set cjk, " Support for CJK properties"; 114 | "-emoji", set emoji, " Support for emoji props"; 115 | "-func", set func, " Support for function and graph props"; 116 | "-gc", set gc, " Support for the general category property"; 117 | "-gen", set gen, " Support for general props"; 118 | "-hangul", set hangul, " Support for hangul props"; 119 | "-id", set id, " Support for id properties"; 120 | "-name", set name, " Support for name properties"; 121 | "-num", set num, " Support for numeric properties"; 122 | "-script", set script, " Support for script properties"; 123 | "-version", set version, " Support for the Unicode version"; 124 | "-white", set white, " Support for the white space property"; ] 125 | in 126 | Arg.parse (Arg.align options) set_inf usage; 127 | let inf = match !inf with None -> "support/ucd.xml" | Some inf -> inf in 128 | process inf !use_default !age !alpha !block !break !case !case_map !case_fold 129 | !case_nfkc !case_nfkc_simple !cjk !emoji !func !gc !gen !hangul !id !name 130 | !num !script !version !white 131 | 132 | let () = main () 133 | -------------------------------------------------------------------------------- /src/uucp_break_base.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Line break *) 7 | 8 | type line = 9 | [ `AI | `AK | `AL | `AP | `AS | `B2 | `BA | `BB | `BK | `CB | `CJ | `CL 10 | | `CM | `CP | `CR | `EX | `EB | `EM | `GL | `H2 | `H3 | `HH | `HL | `HY | `ID 11 | | `IN | `IS | `JL | `JT | `JV | `LF | `NL | `NS | `NU | `OP | `PO | `PR 12 | | `QU | `RI | `SA | `SG | `SP | `SY | `VF | `VI | `WJ | `XX | `ZW | `ZWJ ] 13 | 14 | let line_of_byte : line array = 15 | [| `AI; `AK; `AL; `AP; `AS; `B2; `BA; `BB; `BK; `CB; `CJ; `CL; 16 | `CM; `CP; `CR; `EX; `EB; `EM; `GL; `H2; `H3; `HH; `HL; `HY; `ID; 17 | `IN; `IS; `JL; `JT; `JV; `LF; `NL; `NS; `NU; `OP; `PO; `PR; 18 | `QU; `RI; `SA; `SG; `SP; `SY; `VF; `VI; `WJ; `XX; `ZW; `ZWJ |] 19 | 20 | let line_max = Array.length line_of_byte - 1 21 | 22 | let line_to_byte = function 23 | | `AI -> 0 | `AK -> 1 | `AL -> 2 | `AP -> 3 | `AS -> 4 | `B2 -> 5 24 | | `BA -> 6 | `BB -> 7 | `BK -> 8 | `CB -> 9 | `CJ -> 10 | `CL -> 11 25 | | `CM -> 12 | `CP -> 13 | `CR -> 14 | `EX -> 15 | `EB -> 16 | `EM -> 17 26 | | `GL -> 18 | `H2 -> 19 | `H3 -> 20 | `HH -> 21 | `HL -> 22 | `HY -> 23 27 | | `ID -> 24 | `IN -> 25 | `IS -> 26 | `JL -> 27 | `JT -> 28 | `JV -> 29 28 | | `LF -> 30 | `NL -> 31 | `NS -> 32 | `NU -> 33 | `OP -> 34 | `PO -> 35 29 | | `PR -> 36 | `QU -> 37 | `RI -> 38 | `SA -> 39 | `SG -> 40 | `SP -> 41 30 | | `SY -> 42 | `VF -> 43 | `VI -> 44 | `WJ -> 45 | `XX -> 46 | `ZW -> 47 31 | | `ZWJ -> 48 32 | 33 | let pp_line ppf v = Format.fprintf ppf "%s" begin match v with 34 | | `AI -> "AI" | `AK -> "AK" | `AL -> "AL" | `AP -> "AP" | `AS -> "AS" 35 | | `B2 -> "B2" | `BA -> "BA" | `BB -> "BB" | `BK -> "BK" | `CB -> "CB" 36 | | `CJ -> "CJ" | `CL -> "CL" | `CM -> "CM" | `CP -> "CP" | `CR -> "CR" 37 | | `EX -> "EX" | `EB -> "EB" | `EM -> "EM" | `GL -> "GL" | `H2 -> "H2" 38 | | `H3 -> "H3" | `HH -> "HH" | `HL -> "HL" | `HY -> "HY" | `ID -> "ID" 39 | | `IN -> "IN" | `IS -> "IS" | `JL -> "JL" | `JT -> "JT" | `JV -> "JV" 40 | | `LF -> "LF" | `NL -> "NL" | `NS -> "NS" | `NU -> "NU" | `OP -> "OP" 41 | | `PO -> "PO" | `PR -> "PR" | `QU -> "QU" | `RI -> "RI" | `SA -> "SA" 42 | | `SG -> "SG" | `SP -> "SP" | `SY -> "SY" | `VF -> "VF" | `VI -> "VI" 43 | | `WJ -> "WJ" | `XX -> "XX" | `ZW -> "ZW" | `ZWJ -> "ZWJ" 44 | end 45 | 46 | (* Grapheme cluster break *) 47 | 48 | type grapheme_cluster = 49 | [ `CN | `CR | `EX | `EB | `EBG | `EM | `GAZ | `L | `LF | `LV | `LVT 50 | | `PP | `RI | `SM | `T | `V | `XX | `ZWJ ] 51 | 52 | let grapheme_cluster_of_byte : grapheme_cluster array = 53 | [| `CN; `CR; `EX; `EB; `EBG; `EM; `GAZ; `L; `LF; `LV; `LVT; `PP; `RI; 54 | `SM; `T; `V; `XX; `ZWJ |] 55 | 56 | let grapheme_cluster_max = Array.length grapheme_cluster_of_byte - 1 57 | 58 | let grapheme_cluster_to_byte = function 59 | | `CN -> 0 | `CR -> 1 | `EX -> 2 | `EB -> 3 | `EBG -> 4 | `EM -> 5 60 | | `GAZ -> 6 | `L -> 7 | `LF -> 8 | `LV -> 9 | `LVT -> 10 | `PP -> 11 61 | | `RI -> 12 | `SM -> 13 | `T -> 14 | `V -> 15 | `XX -> 16 | `ZWJ -> 17 62 | 63 | let pp_grapheme_cluster ppf v = Format.fprintf ppf "%s" begin match v with 64 | | `CN -> "CN" | `CR -> "CR" | `EX -> "EX" | `EB -> "EB" | `EBG -> "EBG" 65 | | `EM -> "EM" | `GAZ -> "GAZ" | `L -> "L" | `LF -> "LF" | `LV -> "LV" 66 | | `LVT -> "LVT" | `PP -> "PP" | `RI -> "RI" | `SM -> "SM" | `T -> "T" 67 | | `V -> "V" | `XX -> "XX" | `ZWJ -> "ZWJ" 68 | end 69 | 70 | (* Word break *) 71 | 72 | type word = 73 | [ `CR | `DQ | `EX | `EB | `EBG | `EM | `Extend | `FO | `GAZ | `HL | `KA 74 | | `LE | `LF | `MB | `ML | `MN | `NL | `NU | `RI | `SQ | `WSegSpace | `XX 75 | | `ZWJ ] 76 | 77 | let word_of_byte : word array = 78 | [| `CR; `DQ; `EX; `EB; `EBG; `EM; `Extend; `FO; `GAZ; `HL; `KA; `LE; `LF; 79 | `MB; `ML; `MN; `NL; `NU; `RI; `SQ; `WSegSpace; `XX; `ZWJ |] 80 | 81 | let word_max = Array.length word_of_byte - 1 82 | 83 | let word_to_byte = function 84 | | `CR -> 0 | `DQ -> 1 | `EX -> 2 | `EB -> 3 | `EBG -> 4 | `EM -> 5 85 | | `Extend -> 6 | `FO -> 7 | `GAZ -> 8 | `HL -> 9 | `KA -> 10 | `LE -> 11 86 | | `LF -> 12 | `MB -> 13 | `ML -> 14 | `MN -> 15 | `NL -> 16 | `NU -> 17 87 | | `RI -> 18 | `SQ -> 19 | `WSegSpace -> 20 | `XX -> 21 | `ZWJ -> 22 88 | 89 | let pp_word ppf v = Format.fprintf ppf "%s" begin match v with 90 | | `CR -> "CR" | `DQ -> "DQ" | `EX -> "EX" | `EB -> "EB" | `EBG -> "EBG" 91 | | `EM -> "EM" | `Extend -> "Extend" | `FO -> "FO" | `GAZ -> "GAZ" 92 | | `HL -> "HL" | `KA -> "KA" | `LE -> "LE" | `LF -> "LF" | `MB -> "MB" 93 | | `ML -> "ML" | `MN -> "MN" | `NL -> "NL" | `NU -> "NU" | `RI -> "RI" 94 | | `SQ -> "SQ" | `WSegSpace -> "WSegSpace" | `XX -> "XX" | `ZWJ -> "ZWJ" 95 | end 96 | 97 | (* Sentence break *) 98 | 99 | type sentence = 100 | [ `AT | `CL | `CR | `EX | `FO | `LE | `LF | `LO | `NU | `SC | `SE | `SP 101 | | `ST | `UP | `XX ] 102 | 103 | let sentence_of_byte : sentence array = 104 | [| `AT; `CL; `CR; `EX; `FO; `LE; `LF; `LO; `NU; `SC; `SE; `SP; `ST; `UP; `XX|] 105 | 106 | let sentence_max = Array.length sentence_of_byte - 1 107 | 108 | let sentence_to_byte = function 109 | | `AT -> 0 | `CL -> 1 | `CR -> 2 | `EX -> 3 | `FO -> 4 | `LE -> 5 | `LF -> 6 110 | | `LO -> 7 | `NU -> 8 | `SC -> 9 | `SE -> 10 | `SP -> 11 | `ST -> 12 111 | | `UP -> 13 | `XX -> 14 112 | 113 | let pp_sentence ppf v = Format.fprintf ppf "%s" begin match v with 114 | | `AT -> "AT" | `CL -> "CL" | `CR -> "CR" | `EX -> "EX" | `FO -> "FO" 115 | | `LE -> "LE" | `LF -> "LF" | `LO -> "LO" | `NU -> "NU" | `SC -> "SC" 116 | | `SE -> "SE" | `SP -> "SP" | `ST -> "ST" | `UP -> "UP" | `XX -> "XX" 117 | end 118 | 119 | (* Indic conjunct break *) 120 | 121 | type indic_conjunct_break = 122 | [ `Consonant | `Extend | `Linker | `None ] 123 | 124 | let indic_conjunct_break_of_byte : indic_conjunct_break array = 125 | [| `Consonant; `Extend; `Linker; `None |] 126 | 127 | let indic_conjunct_break_max = Array.length indic_conjunct_break_of_byte - 1 128 | 129 | let indic_conjunct_break_to_byte = function 130 | | `Consonant -> 0 | `Extend -> 1 | `Linker -> 2 | `None -> 3 131 | 132 | let pp_indic_conjunct_break ppf v = Format.fprintf ppf "%s" begin match v with 133 | | `Consonant -> "Consonant" | `Extend -> "Extend" | `Linker -> "Linker" 134 | | `None -> "None" 135 | end 136 | 137 | (* East Asian width *) 138 | 139 | type east_asian_width = [ `A | `F | `H | `N | `Na | `W ] 140 | let pp_east_asian_width ppf v = Format.pp_print_string ppf begin match v with 141 | | `A -> "A" | `F -> "F" | `H -> "H" | `N -> "N" | `Na -> "Na" | `W -> "W" 142 | end 143 | -------------------------------------------------------------------------------- /src/uucp.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Unicode character properties. 7 | 8 | [Uucp] provides efficient access to a selection of character 9 | {{!props}properties} of the Unicode character database. 10 | 11 | Consult {{!page-unicode}this page} for a minimal Unicode 12 | introduction and OCaml Unicode tips. Individual modules have 13 | sample code related to the properties. 14 | 15 | {b References.} 16 | {ul 17 | {- {{:http://www.unicode.org/faq/}The Unicode FAQ.}} 18 | {- The Unicode Consortium. 19 | {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}. 20 | (latest version)} 21 | {- Mark Davis, Ken Whistler. 22 | {e {{:http://www.unicode.org/reports/tr44/}UAX #44 Unicode Character 23 | Database}}. (latest version)}} *) 24 | 25 | (** {1:props Properties} 26 | 27 | Consult information about the {{!distrib_omit}property distribution 28 | in modules and omissions}. *) 29 | 30 | val unicode_version : string 31 | (** [unicode_version] is the Unicode version supported by the library. *) 32 | 33 | module Age = Uucp__age 34 | module Alpha = Uucp__alpha 35 | module Block = Uucp__block 36 | module Break = Uucp__break 37 | module Case = Uucp__case 38 | module Cjk = Uucp__cjk 39 | module Emoji = Uucp__emoji 40 | module Func = Uucp__func 41 | module Gc = Uucp__gc 42 | module Gen = Uucp__gen 43 | module Hangul = Uucp__hangul 44 | module Id = Uucp__id 45 | module Name = Uucp__name 46 | module Num = Uucp__num 47 | module Script = Uucp__script 48 | module White = Uucp__white 49 | 50 | (** {1:distrib_omit Property module distribution and omissions} 51 | 52 | Properties are approximatively distributed in modules by scope of use 53 | like in this 54 | {{:http://www.unicode.org/reports/tr44/#Property_Index_Table}property 55 | index table}. However some subset of properties 56 | live in their own modules. 57 | 58 | Obsolete and 59 | {{:http://www.unicode.org/reports/tr44/#Deprecated_Property_Table} 60 | deprecated} properties are 61 | omitted. So are those related to normalization, shaping and 62 | bidirectionality. Here is the full list of omitted properties, 63 | if you think one of these property should be added get in touch 64 | with a rationale. 65 | {ul 66 | {- Case. 67 | {{:http://www.unicode.org/reports/tr44/#Simple_Lowercase_Mapping} 68 | Simple_Lowercase_Mapping}, 69 | {{:http://www.unicode.org/reports/tr44/#Simple_Uppercase_Mapping} 70 | Simple_Uppercase_Mapping}, 71 | {{:http://www.unicode.org/reports/tr44/#Simple_Titlecase_Mapping} 72 | Simple_Titlecase_Mapping}, 73 | {{:http://www.unicode.org/reports/tr44/#Simple_Case_Folding} 74 | Simple_Case_folding}, 75 | {{:http://www.unicode.org/reports/tr44/#CWL} 76 | Changes_When_Lowercased}, 77 | {{:http://www.unicode.org/reports/tr44/#CWU} 78 | Changes_When_Uppercased}, 79 | {{:http://www.unicode.org/reports/tr44/#CWT} 80 | Changes_When_Titlecased}, 81 | {{:http://www.unicode.org/reports/tr44/#CWCF} 82 | Changes_When_Casefolded}, 83 | {{:http://www.unicode.org/reports/tr44/#CWCM} 84 | Changes_When_Casemapped}, 85 | {{:https://www.unicode.org/reports/tr44/proposed.html#CWKCF} 86 | Changes_When_NFKC_Casefolded}.} 87 | {- Normalization. All properties under that section name in 88 | {{:http://www.unicode.org/reports/tr44/#Property_Index_Table} 89 | this table}.} 90 | {- Shaping and rendering. 91 | {{:http://www.unicode.org/reports/tr44/#Joining_Group}Joining_Group}, 92 | {{:http://www.unicode.org/reports/tr44/#Joining_Type}Joining_Type}, 93 | {{:http://www.unicode.org/reports/tr44/#Vertical_Orientation}Vertical_Orientation}, 94 | {{:http://www.unicode.org/reports/tr44/#Indic_Syllabic_Category} 95 | Indic_Syllabic_Category}, 96 | {{:http://www.unicode.org/reports/tr44/#Indic_Positional_Category} 97 | Indic_Positional_Category}, 98 | {{:http://www.unicode.org/reports/tr44/#Prepended_Concatenation_Mark} 99 | Prepended_Concatenation_Mark}} 100 | {- Bidirectional. All properties under that section name in 101 | {{:http://www.unicode.org/reports/tr44/#Property_Index_Table} 102 | this table}.} 103 | {- CJK. 104 | {{:http://www.unicode.org/reports/tr44/#Unicode_Radical_Stroke} 105 | Unicode_Radical_Stroke}, 106 | {{:http://www.unicode.org/reports/tr44/#Equivalent_Unified_Ideograph} 107 | Equivalent_Unified_Ideograph} and all the properties of the 108 | {{:http://www.unicode.org/reports/tr38/}Unicode HAN Database}.} 109 | {- Miscellaneous. 110 | {{:http://www.unicode.org/reports/tr44/#STerm}STerm}.} 111 | {- Contributory properties. All properties under that section in 112 | {{:http://www.unicode.org/reports/tr44/#Property_Index_Table}this 113 | table.}}} *) 114 | 115 | (**/**) 116 | 117 | (* Warning this is not part of the public API and subject 118 | to change without notice. *) 119 | 120 | module Cmap : sig 121 | type 'a tree = Empty | C of int * 'a | Cn of 'a tree * 'a tree * int * 'a 122 | type 'a t = { default : 'a; tree : 'a tree; } 123 | val get : 'a t -> int -> 'a 124 | val of_sorted_list : 'a -> [ `C of int * 'a ] list -> 'a t 125 | val height : 'a t -> int 126 | val word_size : ('a -> int) -> 'a t -> int 127 | val dump : 128 | (Format.formatter -> 'a -> unit) -> Format.formatter -> 'a t -> unit 129 | end 130 | 131 | module Rmap : sig 132 | type 'a tree = 133 | | Empty 134 | | R of int * int * 'a 135 | | Rn of 'a tree * 'a tree * int * int * 'a 136 | type 'a t = { default : 'a; tree : 'a tree; } 137 | val get : 'a t -> int -> 'a 138 | val of_sorted_list : 'a -> [ `R of int * int * 'a ] list -> 'a t 139 | val height : 'a t -> int 140 | val word_size : ('a -> int) -> 'a t -> int 141 | val dump : 142 | (Format.formatter -> 'a -> unit) -> Format.formatter -> 'a t -> unit 143 | end 144 | 145 | module Tmap : sig 146 | type 'a t = { default : 'a; l0 : 'a array array array; } 147 | val nil : 'a array 148 | val create : 'a -> 'a t 149 | val get : 'a t -> int -> 'a 150 | val set : 'a t -> int -> 'a -> unit 151 | val word_size : ('a -> int) -> 'a t -> int 152 | val dump : 153 | (Format.formatter -> 'a -> unit) -> Format.formatter -> 'a t -> unit 154 | end 155 | 156 | module Tmapbool : sig 157 | type t = { default : bool; l0 : string array array; } 158 | val nil : 'a array 159 | val snil : string 160 | val create : bool -> t 161 | val get : t -> int -> bool 162 | val set : t -> int -> bool -> unit 163 | val word_size : t -> int 164 | val dump : Format.formatter -> t -> unit 165 | end 166 | 167 | module Tmapbyte : sig 168 | type t = { default : int; l0 : string array array; } 169 | val nil : 'a array 170 | val snil : string 171 | val create : int -> t 172 | val get : t -> int -> int 173 | val set : t -> int -> int -> unit 174 | val word_size : t -> int 175 | val dump : Format.formatter -> t -> unit 176 | end 177 | 178 | (**/**) 179 | -------------------------------------------------------------------------------- /src/uucp_script_base.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | type t = [ 7 | | `Adlm 8 | | `Aghb 9 | | `Ahom 10 | | `Arab 11 | | `Armi 12 | | `Armn 13 | | `Avst 14 | | `Bali 15 | | `Bamu 16 | | `Bass 17 | | `Batk 18 | | `Beng 19 | | `Berf 20 | | `Bhks 21 | | `Bopo 22 | | `Brah 23 | | `Brai 24 | | `Bugi 25 | | `Buhd 26 | | `Cakm 27 | | `Cans 28 | | `Cari 29 | | `Cham 30 | | `Cher 31 | | `Chrs 32 | | `Copt 33 | | `Cpmn 34 | | `Cprt 35 | | `Cyrl 36 | | `Deva 37 | | `Diak 38 | | `Dogr 39 | | `Dsrt 40 | | `Dupl 41 | | `Egyp 42 | | `Elba 43 | | `Elym 44 | | `Ethi 45 | | `Gara 46 | | `Geor 47 | | `Glag 48 | | `Gong 49 | | `Gonm 50 | | `Goth 51 | | `Gran 52 | | `Grek 53 | | `Gujr 54 | | `Gukh 55 | | `Guru 56 | | `Hang 57 | | `Hani 58 | | `Hano 59 | | `Hatr 60 | | `Hebr 61 | | `Hira 62 | | `Hluw 63 | | `Hmng 64 | | `Hmnp 65 | | `Hrkt 66 | | `Hung 67 | | `Ital 68 | | `Java 69 | | `Kali 70 | | `Kana 71 | | `Kawi 72 | | `Khar 73 | | `Khmr 74 | | `Khoj 75 | | `Knda 76 | | `Krai 77 | | `Kthi 78 | | `Kits 79 | | `Lana 80 | | `Laoo 81 | | `Latn 82 | | `Lepc 83 | | `Limb 84 | | `Lina 85 | | `Linb 86 | | `Lisu 87 | | `Lyci 88 | | `Lydi 89 | | `Mahj 90 | | `Maka 91 | | `Mand 92 | | `Mani 93 | | `Marc 94 | | `Medf 95 | | `Mend 96 | | `Merc 97 | | `Mero 98 | | `Mlym 99 | | `Modi 100 | | `Mong 101 | | `Mroo 102 | | `Mtei 103 | | `Mult 104 | | `Mymr 105 | | `Nagm 106 | | `Nand 107 | | `Narb 108 | | `Nbat 109 | | `Newa 110 | | `Nkoo 111 | | `Nshu 112 | | `Ogam 113 | | `Olck 114 | | `Onao 115 | | `Orkh 116 | | `Orya 117 | | `Osge 118 | | `Osma 119 | | `Ougr 120 | | `Palm 121 | | `Pauc 122 | | `Perm 123 | | `Phag 124 | | `Phli 125 | | `Phlp 126 | | `Phnx 127 | | `Plrd 128 | | `Prti 129 | | `Qaai 130 | | `Rjng 131 | | `Rohg 132 | | `Runr 133 | | `Samr 134 | | `Sarb 135 | | `Saur 136 | | `Sgnw 137 | | `Shaw 138 | | `Shrd 139 | | `Sidd 140 | | `Sidt 141 | | `Sind 142 | | `Sinh 143 | | `Sogd 144 | | `Sogo 145 | | `Sora 146 | | `Soyo 147 | | `Sund 148 | | `Sunu 149 | | `Sylo 150 | | `Syrc 151 | | `Tagb 152 | | `Takr 153 | | `Tale 154 | | `Talu 155 | | `Taml 156 | | `Tang 157 | | `Tavt 158 | | `Tayo 159 | | `Telu 160 | | `Tfng 161 | | `Tglg 162 | | `Thaa 163 | | `Thai 164 | | `Tibt 165 | | `Tirh 166 | | `Tnsa 167 | | `Todr 168 | | `Tols 169 | | `Toto 170 | | `Tutg 171 | | `Ugar 172 | | `Vaii 173 | | `Vith 174 | | `Wara 175 | | `Wcho 176 | | `Xpeo 177 | | `Xsux 178 | | `Yezi 179 | | `Yiii 180 | | `Zanb 181 | | `Zinh 182 | | `Zyyy 183 | | `Zzzz 184 | ] 185 | 186 | let pp ppf s = Format.fprintf ppf "%s" begin match s with 187 | | `Adlm -> "Adlm" 188 | | `Aghb -> "Aghb" 189 | | `Ahom -> "Ahom" 190 | | `Arab -> "Arab" 191 | | `Armi -> "Armi" 192 | | `Armn -> "Armn" 193 | | `Avst -> "Avst" 194 | | `Bali -> "Bali" 195 | | `Bamu -> "Bamu" 196 | | `Bass -> "Bass" 197 | | `Batk -> "Batk" 198 | | `Beng -> "Beng" 199 | | `Berf -> "Berf" 200 | | `Bhks -> "Bhks" 201 | | `Bopo -> "Bopo" 202 | | `Brah -> "Brah" 203 | | `Brai -> "Brai" 204 | | `Bugi -> "Bugi" 205 | | `Buhd -> "Buhd" 206 | | `Cakm -> "Cakm" 207 | | `Cans -> "Cans" 208 | | `Cari -> "Cari" 209 | | `Cham -> "Cham" 210 | | `Cher -> "Cher" 211 | | `Chrs -> "Chrs" 212 | | `Copt -> "Copt" 213 | | `Cpmn -> "Cpmn" 214 | | `Cprt -> "Cprt" 215 | | `Cyrl -> "Cyrl" 216 | | `Deva -> "Deva" 217 | | `Diak -> "Diak" 218 | | `Dogr -> "Dogr" 219 | | `Dsrt -> "Dsrt" 220 | | `Dupl -> "Dupl" 221 | | `Egyp -> "Egyp" 222 | | `Elba -> "Elba" 223 | | `Elym -> "Elym" 224 | | `Ethi -> "Ethi" 225 | | `Gara -> "Gara" 226 | | `Geor -> "Geor" 227 | | `Glag -> "Glag" 228 | | `Gong -> "Gong" 229 | | `Gonm -> "Gonm" 230 | | `Goth -> "Goth" 231 | | `Gran -> "Gran" 232 | | `Grek -> "Grek" 233 | | `Gujr -> "Gujr" 234 | | `Gukh -> "Gukh" 235 | | `Guru -> "Guru" 236 | | `Hang -> "Hang" 237 | | `Hani -> "Hani" 238 | | `Hano -> "Hano" 239 | | `Hatr -> "Hatr" 240 | | `Hebr -> "Hebr" 241 | | `Hira -> "Hira" 242 | | `Hluw -> "Hluw" 243 | | `Hmng -> "Hmng" 244 | | `Hmnp -> "Hmnp" 245 | | `Hrkt -> "Hrkt" 246 | | `Hung -> "Hung" 247 | | `Ital -> "Ital" 248 | | `Java -> "Java" 249 | | `Kali -> "Kali" 250 | | `Kana -> "Kana" 251 | | `Kawi -> "Kawi" 252 | | `Khar -> "Khar" 253 | | `Khmr -> "Khmr" 254 | | `Khoj -> "Khoj" 255 | | `Knda -> "Knda" 256 | | `Krai -> "Krai" 257 | | `Kthi -> "Kthi" 258 | | `Kits -> "Kits" 259 | | `Lana -> "Lana" 260 | | `Laoo -> "Laoo" 261 | | `Latn -> "Latn" 262 | | `Lepc -> "Lepc" 263 | | `Limb -> "Limb" 264 | | `Lina -> "Lina" 265 | | `Linb -> "Linb" 266 | | `Lisu -> "Lisu" 267 | | `Lyci -> "Lyci" 268 | | `Lydi -> "Lydi" 269 | | `Mahj -> "Mahj" 270 | | `Maka -> "Maka" 271 | | `Mand -> "Mand" 272 | | `Mani -> "Mani" 273 | | `Marc -> "Marc" 274 | | `Medf -> "Medf" 275 | | `Mend -> "Mend" 276 | | `Merc -> "Merc" 277 | | `Mero -> "Mero" 278 | | `Mlym -> "Mlym" 279 | | `Modi -> "Modi" 280 | | `Mong -> "Mong" 281 | | `Mroo -> "Mroo" 282 | | `Mtei -> "Mtei" 283 | | `Mult -> "Mult" 284 | | `Mymr -> "Mymr" 285 | | `Nagm -> "Nagm" 286 | | `Nand -> "Nand" 287 | | `Narb -> "Narb" 288 | | `Nbat -> "Nbat" 289 | | `Newa -> "Newa" 290 | | `Nkoo -> "Nkoo" 291 | | `Nshu -> "Nshu" 292 | | `Ogam -> "Ogam" 293 | | `Olck -> "Olck" 294 | | `Onao -> "Onao" 295 | | `Orkh -> "Orkh" 296 | | `Orya -> "Orya" 297 | | `Osge -> "Osge" 298 | | `Osma -> "Osma" 299 | | `Ougr -> "Ougr" 300 | | `Palm -> "Palm" 301 | | `Pauc -> "Pauc" 302 | | `Perm -> "Perm" 303 | | `Phag -> "Phag" 304 | | `Phli -> "Phli" 305 | | `Phlp -> "Phlp" 306 | | `Phnx -> "Phnx" 307 | | `Plrd -> "Plrd" 308 | | `Prti -> "Prti" 309 | | `Qaai -> "Qaai" 310 | | `Rjng -> "Rjng" 311 | | `Rohg -> "Rohg" 312 | | `Runr -> "Runr" 313 | | `Samr -> "Samr" 314 | | `Sarb -> "Sarb" 315 | | `Saur -> "Saur" 316 | | `Sgnw -> "Sgnw" 317 | | `Shaw -> "Shaw" 318 | | `Shrd -> "Shrd" 319 | | `Sidd -> "Sidd" 320 | | `Sidt -> "Sidt" 321 | | `Sind -> "Sind" 322 | | `Sinh -> "Sinh" 323 | | `Sogd -> "Sogd" 324 | | `Sogo -> "Sogo" 325 | | `Sora -> "Sora" 326 | | `Soyo -> "Soyo" 327 | | `Sund -> "Sund" 328 | | `Sunu -> "Sunu" 329 | | `Sylo -> "Sylo" 330 | | `Syrc -> "Syrc" 331 | | `Tagb -> "Tagb" 332 | | `Takr -> "Takr" 333 | | `Tale -> "Tale" 334 | | `Talu -> "Talu" 335 | | `Taml -> "Taml" 336 | | `Tang -> "Tang" 337 | | `Tavt -> "Tavt" 338 | | `Tayo -> "Tayo" 339 | | `Telu -> "Telu" 340 | | `Tfng -> "Tfng" 341 | | `Tglg -> "Tglg" 342 | | `Thaa -> "Thaa" 343 | | `Thai -> "Thai" 344 | | `Tibt -> "Tibt" 345 | | `Tirh -> "Tirh" 346 | | `Tnsa -> "Tnsa" 347 | | `Todr -> "Todr" 348 | | `Tols -> "Tols" 349 | | `Toto -> "Toto" 350 | | `Tutg -> "Tutg" 351 | | `Ugar -> "Ugar" 352 | | `Vaii -> "Vaii" 353 | | `Vith -> "Vith" 354 | | `Wara -> "Wara" 355 | | `Wcho -> "Wcho" 356 | | `Xpeo -> "Xpeo" 357 | | `Xsux -> "Xsux" 358 | | `Yezi -> "Yezi" 359 | | `Yiii -> "Yiii" 360 | | `Zanb -> "Zanb" 361 | | `Zinh -> "Zinh" 362 | | `Zyyy -> "Zyyy" 363 | | `Zzzz -> "Zzzz" 364 | end 365 | -------------------------------------------------------------------------------- /test/perf.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Reapeatedly looks up properties for the whole character set. *) 7 | 8 | let str = Format.sprintf 9 | let exec = Filename.basename Sys.executable_name 10 | let log fmt = Format.eprintf (fmt ^^ "%!") 11 | 12 | let prop count mname fname prop = 13 | log "Lookup %s.%s for each uchar (%dx)@\n" mname fname count; 14 | for i = 1 to count do 15 | for u = 0 to 0xD7FF do ignore (prop (Uchar.unsafe_of_int u)) done; 16 | for u = 0xE000 to 0x10FFFF do ignore (prop (Uchar.unsafe_of_int u)) done; 17 | done; 18 | () 19 | 20 | let lookup_age count = 21 | let prop fname p = prop count "Uucp.Age" fname p in 22 | prop "age" Uucp.Age.age; 23 | () 24 | 25 | let lookup_alpha count = 26 | let prop fname p = prop count "Uucd.Alpha" fname p in 27 | prop "is_alphabetic" Uucp.Alpha.is_alphabetic; 28 | () 29 | 30 | let lookup_block count = 31 | let prop fname p = prop count "Uucd.Block" fname p in 32 | prop "block" Uucp.Block.block; 33 | () 34 | 35 | let lookup_break count = 36 | let prop fname p = prop count "Uucp.Break" fname p in 37 | prop "line" Uucp.Break.line; 38 | prop "grapheme_cluster" Uucp.Break.grapheme_cluster; 39 | prop "word" Uucp.Break.word; 40 | prop "sentence" Uucp.Break.sentence; 41 | prop "indic_conjunct_break" Uucp.Break.indic_conjunct_break; 42 | prop "east_asian_width" Uucp.Break.east_asian_width; 43 | () 44 | 45 | let lookup_case count = 46 | let prop fname p = prop count "Uucd.Case" fname p in 47 | prop "is_upper" Uucp.Case.is_upper; 48 | prop "is_lower" Uucp.Case.is_lower; 49 | prop "is_cased" Uucp.Case.is_cased; 50 | prop "is_case_ignorable" Uucp.Case.is_case_ignorable; 51 | prop "Map.to_upper" Uucp.Case.Map.to_upper; 52 | prop "Map.to_lower" Uucp.Case.Map.to_lower; 53 | prop "Map.to_title" Uucp.Case.Map.to_title; 54 | prop "Fold.fold" Uucp.Case.Fold.fold; 55 | prop "Fold.Nfkc.fold" Uucp.Case.Nfkc_fold.fold; 56 | () 57 | 58 | let lookup_cjk count = 59 | let prop fname p = prop count "Uucd.Cjk" fname p in 60 | prop "ideographic" Uucp.Cjk.is_ideographic; 61 | prop "ids_unary_operator" Uucp.Cjk.is_ids_unary_operator; 62 | prop "ids_binary_operator" Uucp.Cjk.is_ids_binary_operator; 63 | prop "ids_trinary_operator" Uucp.Cjk.is_ids_trinary_operator; 64 | prop "radical" Uucp.Cjk.is_radical; 65 | prop "unified_ideograph" Uucp.Cjk.is_unified_ideograph; 66 | () 67 | 68 | let lookup_emoji count = 69 | let prop fname p = prop count "Uucp.Emoji" fname p in 70 | prop "is_emoji" Uucp.Emoji.is_emoji; 71 | prop "is_emoji_presentation" Uucp.Emoji.is_emoji_presentation; 72 | prop "is_emoji_modifier" Uucp.Emoji.is_emoji_modifier; 73 | prop "is_emoji_modifier_base" Uucp.Emoji.is_emoji_modifier_base; 74 | prop "is_emoji_component" Uucp.Emoji.is_emoji_component; 75 | prop "is_extended_pictographic" Uucp.Emoji.is_emoji_component; 76 | () 77 | 78 | let lookup_func count = 79 | let prop fname p = prop count "Uucp.Func" fname p in 80 | prop "is_dash" Uucp.Func.is_dash; 81 | prop "is_diacritic" Uucp.Func.is_diacritic; 82 | prop "is_extender" Uucp.Func.is_extender; 83 | prop "is_grapheme_base" Uucp.Func.is_grapheme_base; 84 | prop "is_grapheme_extend" Uucp.Func.is_grapheme_extend; 85 | prop "is_math" Uucp.Func.is_math; 86 | prop "is_quotation_mark" Uucp.Func.is_quotation_mark; 87 | prop "is_soft_dotted" Uucp.Func.is_soft_dotted; 88 | prop "is_terminal_punctuation" Uucp.Func.is_terminal_punctuation; 89 | prop "is_regional_indicator" Uucp.Func.is_regional_indicator; 90 | () 91 | 92 | let lookup_gc count = 93 | let prop fname p = prop count "Uucp.Gc" fname p in 94 | prop "general_category" Uucp.Gc.general_category; 95 | () 96 | 97 | let lookup_gen count = 98 | let prop fname p = prop count "Uucp.Gen" fname p in 99 | prop "is_default_ignorable" Uucp.Gen.is_default_ignorable; 100 | prop "is_deprecated" Uucp.Gen.is_deprecated ; 101 | prop "is_logical_order_exception" Uucp.Gen.is_logical_order_exception; 102 | prop "is_non_character" Uucp.Gen.is_non_character; 103 | prop "is_variation_selector" Uucp.Gen.is_variation_selector; 104 | () 105 | 106 | let lookup_hangul count = 107 | let prop fname p = prop count "Uucp.Hangul" fname p in 108 | prop "syllable_type" Uucp.Hangul.syllable_type; 109 | () 110 | 111 | let lookup_id count = 112 | let prop fname p = prop count "Uucp.Id" fname p in 113 | prop "is_id_start" Uucp.Id.is_id_start; 114 | prop "is_id_continue" Uucp.Id.is_id_continue; 115 | prop "is_xid_start" Uucp.Id.is_xid_start; 116 | prop "is_xid_continue" Uucp.Id.is_xid_continue; 117 | prop "is_pattern_syntax" Uucp.Id.is_pattern_syntax; 118 | prop "is_pattern_white_space" Uucp.Id.is_pattern_white_space; 119 | () 120 | 121 | let lookup_name count = 122 | let prop fname p = prop count "Uucp.Name" fname p in 123 | prop "name" Uucp.Name.name; 124 | prop "name_alias" Uucp.Name.name_alias; 125 | () 126 | 127 | let lookup_num count = 128 | let prop fname p = prop count "Uucp.Num" fname p in 129 | prop "is_ascii_hex_digit" Uucp.Num.is_ascii_hex_digit; 130 | prop "is_hex_digit" Uucp.Num.is_hex_digit; 131 | prop "numeric_type" Uucp.Num.numeric_type; 132 | prop "numeric_value" Uucp.Num.numeric_value; 133 | () 134 | 135 | let lookup_script count = 136 | let prop fname p = prop count "Uucp.Script" fname p in 137 | prop "script" Uucp.Script.script; 138 | prop "script_extensions" Uucp.Script.script_extensions; 139 | () 140 | 141 | let lookup_white count = 142 | let prop fname p = prop count "Uucp.White" fname p in 143 | prop "is_white_space" Uucp.White.is_white_space; 144 | () 145 | 146 | let lookup count mods = 147 | let do_lookup m = mods = [] || List.mem m mods in 148 | if do_lookup `Age then lookup_age count; 149 | if do_lookup `Alpha then lookup_alpha count; 150 | if do_lookup `Block then lookup_block count; 151 | if do_lookup `Break then lookup_break count; 152 | if do_lookup `Case then lookup_case count; 153 | if do_lookup `Cjk then lookup_cjk count; 154 | if do_lookup `Emoji then lookup_emoji count; 155 | if do_lookup `Func then lookup_func count; 156 | if do_lookup `Gc then lookup_gc count; 157 | if do_lookup `Gen then lookup_gen count; 158 | if do_lookup `Hangul then lookup_hangul count; 159 | if do_lookup `Id then lookup_id count; 160 | if do_lookup `Name then lookup_name count; 161 | if do_lookup `Num then lookup_num count; 162 | if do_lookup `Script then lookup_script count; 163 | if do_lookup `White then lookup_white count; 164 | log "Done.@\n"; 165 | () 166 | 167 | let main () = 168 | let usage = str 169 | "Usage: %s [OPTION]...\n\ 170 | \ Tests lookup performance, without any option tests all properties.\n\ 171 | Options:" exec 172 | in 173 | let count = ref 10 in 174 | let mods = ref [] in 175 | let add v = Arg.Unit (fun () -> mods := v :: !mods) in 176 | let pos p = raise (Arg.Bad ("don't know what to to with " ^ p)) in 177 | let options = [ 178 | "-count", Arg.Set_int count, 179 | "N number of full character set traversals (default 10)"; 180 | "-age", add `Age, " test the Age module"; 181 | "-alpha", add `Alpha, " test the Alpha module"; 182 | "-block", add `Block, " test the Block module"; 183 | "-break", add `Break, " test the Break module"; 184 | "-case", add `Case, " test the Case module"; 185 | "-cjk", add `Cjk, " test the CJK module"; 186 | "-emoji", add `Emoji, " test the Emoji module"; 187 | "-func", add `Func, " test the Func module"; 188 | "-gc", add `Gc, " test the Gc module"; 189 | "-gen", add `Gen, " test the Gen module"; 190 | "-hangul", add `Hangul, " test the Hangul module"; 191 | "-id", add `Id, " test the Id module"; 192 | "-name", add `Name, " test the Name module"; 193 | "-num", add `Num, " test the Num module"; 194 | "-script", add `Script, " test the Script module"; 195 | "-white", add `White, " test the White module"; ] 196 | in 197 | Arg.parse (Arg.align options) pos usage; 198 | lookup !count !mods 199 | 200 | let () = main () 201 | -------------------------------------------------------------------------------- /src/uucp__block.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Block property and block ranges. 7 | 8 | {b References.} 9 | {ul 10 | {- {{:http://www.unicode.org/faq/blocks_ranges.html}The Unicode 11 | blocks and ranges FAQ}.}} *) 12 | 13 | (** {1:blockprop Blocks} *) 14 | 15 | type t = [ 16 | | `ASCII 17 | | `Adlam 18 | | `Aegean_Numbers 19 | | `Ahom 20 | | `Alchemical 21 | | `Alphabetic_PF 22 | | `Anatolian_Hieroglyphs 23 | | `Ancient_Greek_Music 24 | | `Ancient_Greek_Numbers 25 | | `Ancient_Symbols 26 | | `Arabic 27 | | `Arabic_Ext_A 28 | | `Arabic_Ext_B 29 | | `Arabic_Ext_C 30 | | `Arabic_Math 31 | | `Arabic_PF_A 32 | | `Arabic_PF_B 33 | | `Arabic_Sup 34 | | `Armenian 35 | | `Arrows 36 | | `Avestan 37 | | `Balinese 38 | | `Bamum 39 | | `Bamum_Sup 40 | | `Bassa_Vah 41 | | `Batak 42 | | `Bengali 43 | | `Beria_Erfe 44 | | `Bhaiksuki 45 | | `Block_Elements 46 | | `Bopomofo 47 | | `Bopomofo_Ext 48 | | `Box_Drawing 49 | | `Brahmi 50 | | `Braille 51 | | `Buginese 52 | | `Buhid 53 | | `Byzantine_Music 54 | | `CJK 55 | | `CJK_Compat 56 | | `CJK_Compat_Forms 57 | | `CJK_Compat_Ideographs 58 | | `CJK_Compat_Ideographs_Sup 59 | | `CJK_Ext_A 60 | | `CJK_Ext_B 61 | | `CJK_Ext_C 62 | | `CJK_Ext_D 63 | | `CJK_Ext_E 64 | | `CJK_Ext_F 65 | | `CJK_Ext_G 66 | | `CJK_Ext_H 67 | | `CJK_Ext_I 68 | | `CJK_Ext_J 69 | | `CJK_Radicals_Sup 70 | | `CJK_Strokes 71 | | `CJK_Symbols 72 | | `Carian 73 | | `Caucasian_Albanian 74 | | `Chakma 75 | | `Cham 76 | | `Cherokee 77 | | `Cherokee_Sup 78 | | `Chess_Symbols 79 | | `Chorasmian 80 | | `Compat_Jamo 81 | | `Control_Pictures 82 | | `Coptic 83 | | `Coptic_Epact_Numbers 84 | | `Counting_Rod 85 | | `Cuneiform 86 | | `Cuneiform_Numbers 87 | | `Currency_Symbols 88 | | `Cypriot_Syllabary 89 | | `Cypro_Minoan 90 | | `Cyrillic 91 | | `Cyrillic_Ext_A 92 | | `Cyrillic_Ext_B 93 | | `Cyrillic_Ext_C 94 | | `Cyrillic_Ext_D 95 | | `Cyrillic_Sup 96 | | `Deseret 97 | | `Devanagari 98 | | `Devanagari_Ext 99 | | `Devanagari_Ext_A 100 | | `Diacriticals 101 | | `Diacriticals_Ext 102 | | `Diacriticals_For_Symbols 103 | | `Diacriticals_Sup 104 | | `Dingbats 105 | | `Dives_Akuru 106 | | `Dogra 107 | | `Domino 108 | | `Duployan 109 | | `Early_Dynastic_Cuneiform 110 | | `Egyptian_Hieroglyph_Format_Controls 111 | | `Egyptian_Hieroglyphs 112 | | `Egyptian_Hieroglyphs_Ext_A 113 | | `Elbasan 114 | | `Elymaic 115 | | `Emoticons 116 | | `Enclosed_Alphanum 117 | | `Enclosed_Alphanum_Sup 118 | | `Enclosed_CJK 119 | | `Enclosed_Ideographic_Sup 120 | | `Ethiopic 121 | | `Ethiopic_Ext 122 | | `Ethiopic_Ext_A 123 | | `Ethiopic_Ext_B 124 | | `Ethiopic_Sup 125 | | `Garay 126 | | `Geometric_Shapes 127 | | `Geometric_Shapes_Ext 128 | | `Georgian 129 | | `Georgian_Ext 130 | | `Georgian_Sup 131 | | `Glagolitic 132 | | `Glagolitic_Sup 133 | | `Gothic 134 | | `Grantha 135 | | `Greek 136 | | `Greek_Ext 137 | | `Gujarati 138 | | `Gunjala_Gondi 139 | | `Gurmukhi 140 | | `Gurung_Khema 141 | | `Half_And_Full_Forms 142 | | `Half_Marks 143 | | `Hangul 144 | | `Hanifi_Rohingya 145 | | `Hanunoo 146 | | `Hatran 147 | | `Hebrew 148 | | `Hiragana 149 | | `IDC 150 | | `IPA_Ext 151 | | `Ideographic_Symbols 152 | | `Imperial_Aramaic 153 | | `Indic_Number_Forms 154 | | `Indic_Siyaq_Numbers 155 | | `Inscriptional_Pahlavi 156 | | `Inscriptional_Parthian 157 | | `Jamo 158 | | `Jamo_Ext_A 159 | | `Jamo_Ext_B 160 | | `Javanese 161 | | `Kaithi 162 | | `Kaktovik_Numerals 163 | | `Kana_Ext_A 164 | | `Kana_Ext_B 165 | | `Kawi 166 | | `Kana_Sup 167 | | `Kanbun 168 | | `Kangxi 169 | | `Kannada 170 | | `Katakana 171 | | `Katakana_Ext 172 | | `Kayah_Li 173 | | `Kharoshthi 174 | | `Khitan_Small_Script 175 | | `Khmer 176 | | `Khmer_Symbols 177 | | `Khojki 178 | | `Khudawadi 179 | | `Kirat_Rai 180 | | `Lao 181 | | `Latin_1_Sup 182 | | `Latin_Ext_A 183 | | `Latin_Ext_Additional 184 | | `Latin_Ext_B 185 | | `Latin_Ext_C 186 | | `Latin_Ext_D 187 | | `Latin_Ext_E 188 | | `Latin_Ext_F 189 | | `Latin_Ext_G 190 | | `Lepcha 191 | | `Letterlike_Symbols 192 | | `Limbu 193 | | `Linear_A 194 | | `Linear_B_Ideograms 195 | | `Linear_B_Syllabary 196 | | `Lisu 197 | | `Lisu_Sup 198 | | `Lycian 199 | | `Lydian 200 | | `Mahajani 201 | | `Mahjong 202 | | `Makasar 203 | | `Malayalam 204 | | `Mandaic 205 | | `Manichaean 206 | | `Marchen 207 | | `Masaram_Gondi 208 | | `Math_Alphanum 209 | | `Math_Operators 210 | | `Mayan_Numerals 211 | | `Medefaidrin 212 | | `Meetei_Mayek 213 | | `Meetei_Mayek_Ext 214 | | `Mende_Kikakui 215 | | `Meroitic_Cursive 216 | | `Meroitic_Hieroglyphs 217 | | `Miao 218 | | `Misc_Arrows 219 | | `Misc_Math_Symbols_A 220 | | `Misc_Math_Symbols_B 221 | | `Misc_Pictographs 222 | | `Misc_Symbols 223 | | `Misc_Symbols_Sup 224 | | `Misc_Technical 225 | | `Modi 226 | | `Modifier_Letters 227 | | `Modifier_Tone_Letters 228 | | `Mongolian 229 | | `Mongolian_Sup 230 | | `Mro 231 | | `Multani 232 | | `Music 233 | | `Myanmar 234 | | `Myanmar_Ext_A 235 | | `Myanmar_Ext_B 236 | | `Myanmar_Ext_C 237 | | `NB (** Non_block *) 238 | | `NKo 239 | | `Nabataean 240 | | `Nag_Mundari 241 | | `Nandinagari 242 | | `New_Tai_Lue 243 | | `Newa 244 | | `Number_Forms 245 | | `Nushu 246 | | `Nyiakeng_Puachue_Hmong 247 | | `OCR 248 | | `Ogham 249 | | `Ol_Chiki 250 | | `Ol_Onal 251 | | `Old_Hungarian 252 | | `Old_Italic 253 | | `Old_North_Arabian 254 | | `Old_Permic 255 | | `Old_Persian 256 | | `Old_Sogdian 257 | | `Old_South_Arabian 258 | | `Old_Turkic 259 | | `Old_Uyghur 260 | | `Oriya 261 | | `Ornamental_Dingbats 262 | | `Osage 263 | | `Osmanya 264 | | `Ottoman_Siyaq_Numbers 265 | | `PUA 266 | | `Pahawh_Hmong 267 | | `Palmyrene 268 | | `Pau_Cin_Hau 269 | | `Phags_Pa 270 | | `Phaistos 271 | | `Phoenician 272 | | `Phonetic_Ext 273 | | `Phonetic_Ext_Sup 274 | | `Playing_Cards 275 | | `Psalter_Pahlavi 276 | | `Punctuation 277 | | `Rejang 278 | | `Rumi 279 | | `Runic 280 | | `Samaritan 281 | | `Saurashtra 282 | | `Sharada 283 | | `Sharada_Sup 284 | | `Shavian 285 | | `Shorthand_Format_Controls 286 | | `Siddham 287 | | `Sidetic 288 | | `Sinhala 289 | | `Sinhala_Archaic_Numbers 290 | | `Small_Forms 291 | | `Small_Kana_Ext 292 | | `Sogdian 293 | | `Sora_Sompeng 294 | | `Soyombo 295 | | `Specials 296 | | `Sundanese 297 | | `Sundanese_Sup 298 | | `Sunuwar 299 | | `Sup_Arrows_A 300 | | `Sup_Arrows_B 301 | | `Sup_Arrows_C 302 | | `Sup_Math_Operators 303 | | `Sup_PUA_A 304 | | `Sup_PUA_B 305 | | `Sup_Punctuation 306 | | `Sup_Symbols_And_Pictographs 307 | | `Super_And_Sub 308 | | `Sutton_SignWriting 309 | | `Syloti_Nagri 310 | | `Symbols_And_Pictographs_Ext_A 311 | | `Symbols_For_Legacy_Computing 312 | | `Symbols_For_Legacy_Computing_Sup 313 | | `Syriac 314 | | `Syriac_Sup 315 | | `Tagalog 316 | | `Tagbanwa 317 | | `Tags 318 | | `Tai_Le 319 | | `Tai_Tham 320 | | `Tai_Viet 321 | | `Tai_Xuan_Jing 322 | | `Tai_Yo 323 | | `Takri 324 | | `Tamil 325 | | `Tamil_Sup 326 | | `Tangsa 327 | | `Tangut 328 | | `Tangut_Components 329 | | `Tangut_Components_Sup 330 | | `Tangut_Sup 331 | | `Telugu 332 | | `Thaana 333 | | `Thai 334 | | `Tibetan 335 | | `Tifinagh 336 | | `Tirhuta 337 | | `Todhri 338 | | `Tolong_Siki 339 | | `Toto 340 | | `Transport_And_Map 341 | | `Tulu_Tigalari 342 | | `UCAS 343 | | `UCAS_Ext 344 | | `UCAS_Ext_A 345 | | `Ugaritic 346 | | `VS 347 | | `VS_Sup 348 | | `Vai 349 | | `Vedic_Ext 350 | | `Vertical_Forms 351 | | `Vithkuqi 352 | | `Wancho 353 | | `Warang_Citi 354 | | `Yezidi 355 | | `Yi_Radicals 356 | | `Yi_Syllables 357 | | `Yijing 358 | | `Zanabazar_Square 359 | | `Znamenny_Music 360 | ] 361 | (** The type for blocks. The value [`NB] is for characters that are not 362 | yet assigned to a block. *) 363 | 364 | val compare : t -> t -> int 365 | (** [compare b b'] is [Stdlib.compare b b']. *) 366 | 367 | val pp : Format.formatter -> t -> unit 368 | (** [pp ppf b] prints an unspecified representation of [b] on [ppf]. *) 369 | 370 | val blocks : (t * (Uchar.t * Uchar.t)) list 371 | (** [blocks] is the list of blocks sorted by increasing range order. 372 | Each block appears exactly once in the list except 373 | [`NB] which is not part of this list as it is not a block. *) 374 | 375 | val block : Uchar.t -> t 376 | (** [block u] is [u]'s 377 | {{:http://www.unicode.org/reports/tr44/#Block}Block} property. *) 378 | -------------------------------------------------------------------------------- /support/gen.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let log fmt = Format.eprintf (fmt ^^ "%!") 7 | let pp = Format.fprintf 8 | let str = Format.asprintf 9 | let pp_size ppf s = 10 | let b = s * (Sys.word_size / 8) in 11 | if b < 1_048_576 then pp ppf "%.1f Ko" (float b /. 1024.) else 12 | if b < 1_073_741_824 then pp ppf "%.1f Mo" (float b /. (1024. ** 2.)) else 13 | pp ppf "%.1f Go" (float b /. (1024. ** 3.)) 14 | 15 | let pp_uchar ppf u = pp ppf "Uchar.unsafe_of_int 0x%04X" u 16 | 17 | let pp_list pp_v ppf vs = 18 | let rec loop = function 19 | | v :: vs -> pp ppf "@[%a@];@," pp_v v; loop vs 20 | | [] -> () 21 | in 22 | pp ppf "@[<1>["; loop vs; pp ppf "]@]" 23 | 24 | let uchar_iter_ints f = 25 | let rec loop u f = 26 | f (Uchar.to_int u); 27 | if Uchar.equal u Uchar.max then () else loop (Uchar.succ u) f 28 | in 29 | loop Uchar.min f 30 | 31 | (* Generate the Unicode version *) 32 | 33 | let pp_version ppf ucd = 34 | let version = match String.split_on_char ' ' ucd.Uucd.description with 35 | | [tok] -> tok 36 | | [_; tok] -> tok 37 | | _ -> ucd.Uucd.description 38 | in 39 | pp ppf "@[<2>let unicode_version = \"%s\"@]@\n@\n" version 40 | 41 | (* Property lookup *) 42 | 43 | let ucd_find ucd u p = Uucd.cp_prop ucd u p 44 | let ucd_get ucd u p = match Uucd.cp_prop ucd u p with 45 | | None -> invalid_arg (str "miss property for U+%04X in character database" u) 46 | | Some v -> v 47 | 48 | (* Generic map builders *) 49 | 50 | let assert_prop_map prop get = 51 | let assert_u u = 52 | if prop u = get u then () else 53 | failwith (str "map failure for U+%04X" u) 54 | in 55 | uchar_iter_ints assert_u 56 | 57 | let prop_map create set get prop default = 58 | let m = create default in 59 | let add_u u = set m u (prop u) in 60 | uchar_iter_ints add_u; m, (get m) 61 | 62 | (* Structure sharing *) 63 | 64 | let intern (type a) ?eqh iter pp_v ppf x = 65 | let module H = Hashtbl.Make (struct 66 | type t = a 67 | let equal, hash = match eqh with Some fg -> fg | _ -> (=), Hashtbl.hash 68 | end) in 69 | let t = H.create 23 and n = ref 0 in 70 | x |> iter (fun v -> if not (H.mem t v) then begin 71 | let name = str "v%03d" !n in 72 | H.add t v name; incr n; 73 | pp ppf "@[<2>let %s =@ %a@]@\n" name pp_v v 74 | end); 75 | (fun ppf v -> match H.find_opt t v with 76 | | Some name -> pp ppf "%s" name 77 | | None -> pp_v ppf v) 78 | 79 | (* Generate Uucp_cmap.t values *) 80 | 81 | let prop_cmap ~default prop = 82 | let m = ref [] in 83 | let add_u u = m := (`C (u, prop u)) :: !m in 84 | uchar_iter_ints add_u; Uucp_cmap.of_sorted_list default (List.rev !m) 85 | 86 | let pp_prop_cmap ppf prop pname ptype pp_prop ~default size_v = 87 | log "* %s property, binary tree character map@\n" pname; 88 | let m = prop_cmap ~default prop in 89 | let size = Uucp_cmap.word_size size_v m in 90 | let h = Uucp_cmap.height m in 91 | log " size (default %a): %a height: %d@\n" pp_prop default pp_size size h; 92 | log " asserting"; assert_prop_map prop (Uucp_cmap.get m); 93 | log ", generating@\n"; 94 | pp ppf "open Uucp_cmap@\n"; 95 | pp ppf "@[<2>let %s_map : %s t =@ %a@]@\n@\n" 96 | pname ptype (Uucp_cmap.dump pp_prop) m; 97 | () 98 | 99 | let pp_prop_cmap_ucd ppf ucd prop pname ptype pp_prop ~default size_v = 100 | let prop u = ucd_get ucd u prop in 101 | pp_prop_cmap ppf prop pname ptype pp_prop ~default size_v 102 | 103 | (* Generate Uucp_rmap.t value *) 104 | 105 | let prop_find_ranges prop = 106 | let u_max = Uchar.(to_int max) in 107 | let current = ref None in 108 | let start = ref 0 in 109 | let ranges = ref [] in 110 | let rec add_u u = 111 | let p = prop u in 112 | let add_range v max = 113 | ranges := (`R (!start, max, v)) :: !ranges; 114 | current := None; add_u u 115 | in 116 | match !current with 117 | | None -> current := Some p; start := u 118 | | Some v -> 119 | if v = p then (if u = u_max then add_range v u) else 120 | add_range v (u - 1) 121 | in 122 | uchar_iter_ints add_u; (List.rev !ranges) 123 | 124 | let pp_prop_rmap ?(share = true) ppf prop pname ptype pp_prop ~default size_v = 125 | log "* %s property, binary tree range code point map@\n" pname; 126 | let m = Uucp_rmap.of_sorted_list default (prop_find_ranges prop) in 127 | let size = Uucp_rmap.word_size size_v m in 128 | let h = Uucp_rmap.height m in 129 | log " size (default %a): %a height: %d@\n" pp_prop default pp_size size h; 130 | log " asserting"; assert_prop_map prop (Uucp_rmap.get m); 131 | log ", generating@\n"; 132 | pp ppf "open Uucp_rmap@\n"; 133 | let pp_prop = 134 | if share then intern Uucp_rmap.iter_values pp_prop ppf m else 135 | pp_prop 136 | in 137 | pp ppf "@[<2>let %s_map : %s t =@ %a@]@\n@\n" 138 | pname ptype (Uucp_rmap.dump pp_prop) m; 139 | () 140 | 141 | let pp_prop_rmap_ucd ?share ppf ucd prop pname ptype pp_prop ~default size_v = 142 | let prop u = ucd_get ucd u prop in 143 | pp_prop_rmap ?share ppf prop pname ptype pp_prop ~default size_v 144 | 145 | (* Generate Uucp_tmap.t values *) 146 | 147 | let prop_tmap prop default = 148 | prop_map 149 | Uucp_tmap.create 150 | Uucp_tmap.set 151 | Uucp_tmap.get 152 | prop default 153 | 154 | let pp_prop_tmap ppf prop pname ptype pp_prop ~default size_v = 155 | log "* %s property, trie map@\n" pname; 156 | let m, get = prop_tmap prop default in 157 | let t_size = Uucp_tmap.word_size size_v m in 158 | log " size (default %a): %a@\n" pp_prop default pp_size t_size; 159 | log " asserting"; assert_prop_map prop get; 160 | log ", generating@\n"; 161 | pp ppf "open Uucp_tmap@\n"; 162 | pp ppf "@[<2>let %s_map : %s t =@ %a@]@\n@\n" pname ptype (Uucp_tmap.dump pp_prop) m; 163 | () 164 | 165 | let pp_prop_tmap_ucd ppf ucd prop pname ptype pp_prop ~default size_v = 166 | let prop u = ucd_get ucd u prop in 167 | pp_prop_tmap ppf prop pname ptype pp_prop ~default size_v 168 | 169 | (* Generate Uucp_tmapbool.t value *) 170 | 171 | let prop_tmapbools prop = 172 | let tm = Uucp_tmapbool.create true in 173 | let fm = Uucp_tmapbool.create false in 174 | let add_u u = 175 | let b = prop u in 176 | Uucp_tmapbool.set tm u b; 177 | Uucp_tmapbool.set fm u b; 178 | in 179 | uchar_iter_ints add_u; tm, fm 180 | 181 | let assert_tmapbools prop tm fm = 182 | let assert_u u = 183 | let fail () = failwith (str "bool prop map failure for U+%04X" u) in 184 | let b = prop u in 185 | if b <> Uucp_tmapbool.get tm u then fail (); 186 | if b <> Uucp_tmapbool.get fm u then fail (); 187 | in 188 | uchar_iter_ints assert_u 189 | 190 | let pp_prop_tmapbool ppf prop pname = 191 | log "* %s property, boolean trie map@\n" pname; 192 | let tm, fm = prop_tmapbools prop in 193 | let tm_size, fm_size = 194 | Uucp_tmapbool.word_size tm, Uucp_tmapbool.word_size fm 195 | in 196 | let use_fm = tm_size > fm_size in 197 | log " size (default true): %a, size (default false): %a@\n" 198 | pp_size tm_size pp_size fm_size; 199 | log " using default %b map" (not use_fm); 200 | log ", asserting"; assert_tmapbools prop tm fm; 201 | log ", generating@\n"; 202 | let m = if use_fm then fm else tm in 203 | pp ppf "open Uucp_tmapbool@\n"; 204 | let pp_v = intern Uucp_tmapbool.iter_blobs Uucp_tmapbool.pp_v ppf m in 205 | pp ppf "@[<2>let %s_map =@ %a@]@\n@\n" pname (Uucp_tmapbool.dump_pp pp_v) m; 206 | () 207 | 208 | let pp_prop_tmapbool_ucd ppf ucd prop pname = 209 | let prop u = ucd_get ucd u prop in 210 | pp_prop_tmapbool ppf prop pname 211 | 212 | (* Genenerate Uucp_tmapbyte.t values *) 213 | 214 | let prop_tmapbyte prop default = 215 | prop_map 216 | Uucp_tmapbyte.create 217 | Uucp_tmapbyte.set 218 | Uucp_tmapbyte.get 219 | prop default 220 | 221 | let pp_prop_tmapbyte ppf prop pname ~default default_str = 222 | log "* %s property, trie byte map@\n" pname; 223 | let m, get = prop_tmapbyte prop default in 224 | let size = Uucp_tmapbyte.word_size m in 225 | log " size (default %s): %a@\n" default_str pp_size size; 226 | log " asserting"; assert_prop_map prop get; 227 | log ", generating@\n"; 228 | pp ppf "open Uucp_tmapbyte@\n"; 229 | let pp_v = intern Uucp_tmapbyte.iter_blobs Uucp_tmapbyte.pp_v ppf m in 230 | pp ppf "@[<2>let %s_map : t =@ %a@]@\n@\n" pname (Uucp_tmapbyte.dump_pp pp_v) m; 231 | () 232 | 233 | let pp_prop_tmapbyte_ucd ppf ucd prop pname ~default = 234 | let prop u = ucd_get ucd u prop in 235 | pp_prop_tmapbyte ppf prop pname ~default (str "%d" default) 236 | 237 | let pp_code_prop_tmapbyte_ucd ppf ucd code prop pname ~default pp_prop = 238 | let prop u = code (ucd_get ucd u prop) in 239 | pp_prop_tmapbyte ppf prop pname ~default:(code default) 240 | (str "`%a" pp_prop default) 241 | 242 | (* Generate Uucp_tmap5bytes.t values. *) 243 | 244 | let prop_tmap5bytes_uint20_pair prop default = 245 | prop_map 246 | Uucp_tmap5bytes.create_uint20_pair 247 | Uucp_tmap5bytes.set_uint20_pair 248 | Uucp_tmap5bytes.get_uint20_pair 249 | prop default 250 | 251 | let pp_tmap5byte ppf pname m = 252 | pp ppf "open Uucp_tmap5bytes@\n"; 253 | let pp_v = intern Uucp_tmap5bytes.iter_blobs Uucp_tmap5bytes.pp_v ppf m in 254 | pp ppf "@[<2>let %s_map : t =@ %a@]@\n@\n" pname 255 | (Uucp_tmap5bytes.dump_pp pp_v) m; 256 | () 257 | 258 | (* Generate a module *) 259 | 260 | let year = (Unix.gmtime (Unix.gettimeofday ())).Unix.tm_year + 1900 261 | 262 | let pp_mod pp_mod ppf m = 263 | pp ppf 264 | "\ 265 | (*--------------------------------------------------------------------------- 266 | Copyright (c) %d The uucp programmers. All rights reserved. 267 | SPDX-License-Identifier: ISC 268 | ---------------------------------------------------------------------------*) 269 | 270 | (* WARNING do not edit. This file was automatically generated. *) 271 | @\n@[%a@]@\n" year pp_mod m 272 | -------------------------------------------------------------------------------- /src/uucp__break.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Break properties. 7 | 8 | These properties are mainly for the Unicode text segmentation and line 9 | breaking algorithm. 10 | 11 | {b References.} 12 | {ul 13 | {- Mark Davis. 14 | {e {{:http://www.unicode.org/reports/tr29/}UAX #29 Unicode Text 15 | Segmentation}}. (latest version)} 16 | {- Andy Heninger. 17 | {e {{:http://www.unicode.org/reports/tr14/}UAX #14 Unicode Line Breaking 18 | Algorithm}}. (latest version)} 19 | {- Ken Lunde 小林劍. 20 | {e {{:http://www.unicode.org/reports/tr11/}UAX #11 East Asian width.} 21 | (latest version)}}} *) 22 | 23 | 24 | (** {1:line_break Line break} *) 25 | 26 | type line = 27 | [ `AI | `AK | `AL | `AP | `AS | `B2 | `BA | `BB | `BK | `CB | `CJ | `CL 28 | | `CM | `CP | `CR | `EX | `EB | `EM | `GL | `H2 | `H3 | `HH | `HL | `HY | `ID 29 | | `IN | `IS | `JL | `JT | `JV | `LF | `NL | `NS | `NU | `OP | `PO | `PR 30 | | `QU | `RI | `SA | `SG | `SP | `SY | `VF | `VI | `WJ | `XX | `ZW | `ZWJ ] 31 | (** The type for line breaks. *) 32 | 33 | val pp_line : Format.formatter -> line -> unit 34 | (** [pp_line ppf l] prints an unspecified representation of [l] on [ppf]. *) 35 | 36 | val line : Uchar.t -> line 37 | (** [line u] is [u]'s 38 | {{:http://www.unicode.org/reports/tr44/#Line_Break}line break} 39 | property. *) 40 | 41 | (** {1:grapheme_cluster_break Grapheme cluster break} *) 42 | 43 | type grapheme_cluster = 44 | [ `CN | `CR | `EX | `EB | `EBG | `EM | `GAZ | `L | `LF | `LV | `LVT | `PP 45 | | `RI | `SM | `T | `V | `XX | `ZWJ ] 46 | (** The type for grapheme cluster breaks. *) 47 | 48 | val pp_grapheme_cluster : Format.formatter -> grapheme_cluster -> unit 49 | (** [pp_grapheme_cluster ppf g] prints an unspecified representation of [g] 50 | on [ppf]. *) 51 | 52 | val grapheme_cluster : Uchar.t -> grapheme_cluster 53 | (** [grapheme_cluster u] is [u]'s 54 | {{:http://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break}grapheme 55 | cluster break} property. *) 56 | 57 | (** {1:word_break Word break} *) 58 | 59 | type word = 60 | [ `CR | `DQ | `EX | `EB | `EBG | `EM | `Extend | `FO | `GAZ | `HL | `KA 61 | | `LE | `LF | `MB | `ML | `MN | `NL | `NU | `RI | `SQ | `WSegSpace 62 | | `XX | `ZWJ ] 63 | (** The type for word breaks. *) 64 | 65 | val pp_word : Format.formatter -> word -> unit 66 | (** [pp_word ppf b] prints an unspecified representation of [b] on [ppf]. *) 67 | 68 | val word : Uchar.t -> word 69 | (** [word u] is [u]'s 70 | {{:http://www.unicode.org/reports/tr44/#Word_Break}word break} 71 | property. *) 72 | 73 | (** {1:sentence_break Sentence break} *) 74 | 75 | type sentence = 76 | [ `AT | `CL | `CR | `EX | `FO | `LE | `LF | `LO | `NU | `SC | `SE | `SP 77 | | `ST | `UP | `XX ] 78 | (** The type for sentence breaks. *) 79 | 80 | val pp_sentence : Format.formatter -> sentence -> unit 81 | (** [pp_sentence ppf b] prints an unspecified representation of [b] on [ppf]. *) 82 | 83 | val sentence : Uchar.t -> sentence 84 | (** [sentence u] is [u]'s 85 | {{:http://www.unicode.org/reports/tr44/#Sentence_Break}sentence break} 86 | property. *) 87 | 88 | (** {1:indic_conjunct_break Indic conjunct break} *) 89 | 90 | type indic_conjunct_break = [ `Consonant | `Extend | `Linker | `None ] 91 | (** The type for Indic Conjunct Break. *) 92 | 93 | val pp_indic_conjunct_break : Format.formatter -> indic_conjunct_break -> unit 94 | (** [pp_indic_conjunct_break ppf b] prints an unspecified representation of [b] 95 | on [ppf]. *) 96 | 97 | val indic_conjunct_break : Uchar.t -> indic_conjunct_break 98 | (** [indic_conjunct_break u] is [u]'s 99 | {{:http://www.unicode.org/reports/tr44/#Indic_Conjunct_Break} 100 | Indic conjunct break} property. *) 101 | 102 | (** {1:east_asian_width East Asian width} *) 103 | 104 | type east_asian_width = [ `A | `F | `H | `N | `Na | `W ] 105 | (** The type for East Asian widths. *) 106 | 107 | val pp_east_asian_width : Format.formatter -> east_asian_width -> unit 108 | (** [pp_east_asian_width ppf w] prints an unspecified representation of 109 | [w] on [ppf]. *) 110 | 111 | val east_asian_width : Uchar.t -> east_asian_width 112 | (** [east_asian_width u] is [u]'s 113 | {{:http://www.unicode.org/reports/tr44/#East_Asian_Width}East 114 | Asian width} property. *) 115 | 116 | (** {1:terminal_width Terminal width} *) 117 | 118 | val tty_width_hint : Uchar.t -> int 119 | (** [tty_width_hint u] approximates [u]'s column width as rendered by a 120 | typical character terminal. 121 | 122 | The current implementation of the function returns either [0], 123 | [1], [2] or [-1]. The value [-1] is only returned for scalar 124 | values for which the property is non-sensical; clients are 125 | expected to sanitize their inputs and not to use the function 126 | with these scalar values which are those in range U+0001-U+001F 127 | ({b C0} controls without U+0000) and U+007F-U+009F (DELETE and 128 | {b C1} controls). 129 | 130 | {b Note.} Converting a string to 131 | {{:http://unicode.org/glossary/#normalization_form_c}normalization 132 | form C} before folding this function over its scalar values 133 | will, in general, yield better approximations (e.g. on Hangul). 134 | 135 | {b Warning.} This is not a normative property and only a 136 | heuristic. If you find yourself using this function please read 137 | carefully the following lines. 138 | 139 | This function is the moral equivalent of POSIX 140 | {{:http://pubs.opengroup.org/onlinepubs/9699919799/functions/wcwidth.html} 141 | [wcwidth]}, in that its purpose is to help align text displayed by a 142 | character terminal. It mimics [wcwidth], as widely implemented, in yet 143 | another way: it is {e mostly wrong}. 144 | 145 | Computing column width is a surprisingly difficult task in general. Much 146 | of the software infrastructure still carries legacy assumptions about the 147 | nature of text harking back to the ASCII era. Different terminal emulators 148 | attempt to cope with general Unicode text in different ways, creating a 149 | fundamental problem: width of text fragments will vary across terminal 150 | emulators, with no way of getting feedback from the output layer back 151 | into the text-producing layer. 152 | 153 | For example: on a modern Linux system, a collection of terminals 154 | will disagree on some or all of U+00AD, U+0CBF, and 155 | U+2029. They will likewise disagree about unassigned 156 | characters (category {e Cn}), sometimes contradicting the 157 | system's [wcwidth] (e.g. U+0378, U+0530). Terminals using 158 | bare {{:http://cgit.freedesktop.org/xorg/lib/libXft}libxft} 159 | will display complex scripts differently from terminals using 160 | {{:http://www.freedesktop.org/wiki/Software/HarfBuzz}HarfBuzz}, 161 | and the rendering on OS X will be slightly different from both. 162 | 163 | [tty_width_hint] uses a simple and predictable width algorithm, based 164 | on Markus Kuhn's {{:https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c} 165 | portable [wcwidth]}: 166 | 167 | {ul 168 | {- Scalar values in the ranges U+0001-U+001F ({b C0} controls without 169 | U+0000) and U+007F-U+009F (DELETE and {b C1} controls) have undefined 170 | width ([-1]).} 171 | {- Characters with {{:http://www.unicode.org/reports/tr11/tr11-29.html} 172 | East Asian Width} {e Fullwidth} or {e Wide} have a width of [2].} 173 | {- Characters with 174 | {{:http://unicode.org/glossary/#general_category}General Category} 175 | {e Mn}, {e Me}, {e Cf} and U+0000 have a width of [0].} 176 | {- {e Most} other characters have a width of [1], including {e Cn}.}} 177 | 178 | This approach works well, in that it gives results generally consistent 179 | with a wide range of terminals, for 180 | {{:https://en.wikipedia.org/wiki/Alphabet}alphabetic} scripts, and for 181 | east Asian {{:https://en.wikipedia.org/wiki/Syllabary}syllabic} and 182 | {{:https://en.wikipedia.org/wiki/Logogram}logographic} scripts in 183 | non-decomposed form. Support varies for 184 | {{:https://en.wikipedia.org/wiki/Abjad}abjad} scripts in the presence of 185 | vowel marks, and it mostly breaks down on 186 | {{:https://en.wikipedia.org/wiki/Abugida}abugidas}. 187 | 188 | Moreover, non-text symbols like 189 | {{:http://unicode.org/emoji/charts/full-emoji-list.html}Emoji} 190 | or {{:http://unicode.org/charts/PDF/U4DC0.pdf}Yijing hexagrams} 191 | will be incorrectly classified as [1]-wide, but this in fact 192 | agrees with their rendering on many terminals. 193 | 194 | Clients should not over-rely on [tty_width_hint]. It provides a 195 | best-effort approximation which will sometimes fail in 196 | practice. *) 197 | 198 | 199 | (** {1:break_low Low level interface} *) 200 | 201 | (** Low level interface. 202 | 203 | This interface may be useful for table based implementers of 204 | segmenters. For each kind of break, property values are 205 | assigned integer values starting from [0]. An array 206 | allows to recover the high-level representation of the 207 | corresponding property value. *) 208 | module Low : sig 209 | 210 | (** {1:low Low level access to break properties} 211 | 212 | {b Warning.} Do not mutate these array. *) 213 | 214 | val line : Uchar.t -> int 215 | (** [line u] is an integer that can be used with {!line_of_int}. *) 216 | 217 | val line_max : int 218 | (** [line_max] is the maximal value returned by {!val-line}. *) 219 | 220 | val line_of_int : line array 221 | (** [line_of_int.(i)] is the line break property value corresponding 222 | to [i]. *) 223 | 224 | val grapheme_cluster : Uchar.t -> int 225 | (** [grapheme_cluster u] is an integer that can be used with 226 | {!grapheme_cluster_of_int}. *) 227 | 228 | val grapheme_cluster_max : int 229 | (** [grapheme_cluster_max] is the maximal value returned by 230 | {!val-grapheme_cluster}. *) 231 | 232 | val grapheme_cluster_of_int : grapheme_cluster array 233 | (** [grapheme_cluster_of_int.(i)] is the grapheme cluster break property 234 | value corresponding to [i]. *) 235 | 236 | val word : Uchar.t -> int 237 | (** [word u] is an integer that can be used with {!word_of_int}. *) 238 | 239 | val word_max : int 240 | (** [word_max] is the maximal value returned by {!val-word}. *) 241 | 242 | val word_of_int : word array 243 | (** [word_of_int.(i)] is the word break property value 244 | corresponding to [i]. *) 245 | 246 | val sentence : Uchar.t -> int 247 | (** [sentence u] is an integer that can be used with {!sentence_of_int}. *) 248 | 249 | val sentence_max : int 250 | (** [sentence_max] is the maximal value returned by {!val-sentence}. *) 251 | 252 | val sentence_of_int : sentence array 253 | (** [sentence_of_int.(i)] is the sentence break property value 254 | corresponding to [i]. *) 255 | 256 | val indic_conjunct_break : Uchar.t -> int 257 | (** [indic_conjunct_break u] is an integer that can be used with 258 | {!indic_conjunct_break_of_int}. *) 259 | 260 | val indic_conjunct_break_max : int 261 | (** [indic_conjunct_break_max] is the maximal value returned by 262 | {!val-indic_conjunct_break_of_int}. *) 263 | 264 | val indic_conjunct_break_of_int : indic_conjunct_break array 265 | (** [indic_conjunct_break.(i)] is the Indic conjunct break property 266 | value corresponding to [i]. *) 267 | end 268 | -------------------------------------------------------------------------------- /test/test_uucp.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | open B0_testing 7 | 8 | (* Tests the properties against the XML Unicode character database. *) 9 | 10 | let str = Format.asprintf 11 | let exec = Filename.basename Sys.executable_name 12 | 13 | let uchar_dump ppf u = Format.fprintf ppf "U+%04X" (Uchar.to_int u) 14 | 15 | (* UCD loading and access *) 16 | 17 | let load_ucd inf = 18 | try 19 | Test.log "Loading Unicode character database."; 20 | let inf = match inf with None -> "support/ucd.xml" | Some inf -> inf in 21 | let ic = if inf = "-" then stdin else open_in inf in 22 | let d = Uucd.decoder (`Channel ic) in 23 | match Uucd.decode d with 24 | | `Ok db -> db 25 | | `Error e -> 26 | let (l0, c0), (l1, c1) = Uucd.decoded_range d in 27 | Test.failstop "%s:%d.%d-%d.%d: %s" inf l0 c0 l1 c1 e; 28 | with Sys_error e -> Test.failstop "%s" e 29 | 30 | let ucd_get p ucd u = match Uucd.cp_prop ucd (Uchar.to_int u) p with 31 | | None -> invalid_arg (str "no property for %a" uchar_dump u) 32 | | Some v -> v 33 | 34 | (* Assert properties *) 35 | 36 | let prop ucd mname fname ucd_get prop = 37 | (* Not very B0_testy should be redone at some point. *) 38 | (Test.test (str "%s.%s" mname fname) @@ fun () -> 39 | let do_assert u = 40 | if ucd_get ucd u = prop u then () else 41 | failwith (str "assertion failure on %a" uchar_dump u) 42 | in 43 | for u = 0 to 0xD7FF do do_assert (Uchar.of_int u) done; 44 | for u = 0xE000 to 0x10FFFF do do_assert (Uchar.of_int u) done;) () 45 | 46 | (* Assert modules *) 47 | 48 | let assert_age ucd = 49 | let prop fname ucd_p p = prop ucd "Uucp.Age" fname (ucd_get ucd_p) p in 50 | prop "age" Uucd.age Uucp.Age.age; 51 | () 52 | 53 | let assert_alpha ucd = 54 | let prop fname ucd_p p = prop ucd "Uucd.Alpha" fname (ucd_get ucd_p) p in 55 | prop "is_alphabetic" Uucd.alphabetic Uucp.Alpha.is_alphabetic; 56 | () 57 | 58 | let assert_block ucd = 59 | let block_prop ucd u = match ucd_get Uucd.block ucd u with 60 | | `High_Surrogates -> assert false 61 | | `Low_Surrogates -> assert false 62 | | `High_PU_Surrogates -> assert false 63 | | #Uucp.Block.t as b -> 64 | try 65 | (* Test Uucp.Block.blocks at the same time *) 66 | let (is, ie) = List.assoc b Uucp.Block.blocks in 67 | if u < is || u > ie then assert false else 68 | b 69 | with Not_found -> assert (b = `NB); b 70 | in 71 | prop ucd "Uucd.Block" "block" block_prop Uucp.Block.block; 72 | () 73 | 74 | let assert_break ucd = 75 | let prop fname ucd_p p = prop ucd "Uucp.Break" fname (ucd_get ucd_p) p in 76 | prop "line" Uucd.line_break Uucp.Break.line; 77 | prop "grapheme_cluster" Uucd.grapheme_cluster_break 78 | Uucp.Break.grapheme_cluster; 79 | prop "word" Uucd.word_break Uucp.Break.word; 80 | prop "sentence" Uucd.sentence_break Uucp.Break.sentence; 81 | prop "indic_conjunct_break" 82 | Uucd.indic_conjunct_break Uucp.Break.indic_conjunct_break; 83 | () 84 | 85 | let assert_case ucd = 86 | let map fname ucd_p p = 87 | let assert_map ucd u = match ucd_get ucd_p ucd u with 88 | | `Self -> `Self 89 | | `Cps cps -> `Uchars (List.map Uchar.of_int cps) 90 | in 91 | prop ucd "Uucd.Case" fname assert_map p 92 | in 93 | let prop fname ucd_p p = prop ucd "Uucd.Case" fname (ucd_get ucd_p) p in 94 | prop "is_upper" Uucd.uppercase Uucp.Case.is_upper; 95 | prop "is_lower" Uucd.lowercase Uucp.Case.is_lower; 96 | prop "is_cased" Uucd.cased Uucp.Case.is_cased; 97 | prop "is_case_ignorable" Uucd.case_ignorable Uucp.Case.is_case_ignorable; 98 | map "Map.to_upper" Uucd.uppercase_mapping Uucp.Case.Map.to_upper; 99 | map "Map.to_lower" Uucd.lowercase_mapping Uucp.Case.Map.to_lower; 100 | map "Map.to_title" Uucd.titlecase_mapping Uucp.Case.Map.to_title; 101 | map "Fold.fold" Uucd.case_folding Uucp.Case.Fold.fold; 102 | map "Nfkc_fold.fold" Uucd.nfkc_casefold Uucp.Case.Nfkc_fold.fold; 103 | map "Nfkc_simple_fold.fold" Uucd.nfkc_simple_casefold 104 | Uucp.Case.Nfkc_simple_fold.fold; 105 | () 106 | 107 | let assert_cjk ucd = 108 | let prop fname ucd_p p = prop ucd "Uucd.Cjk" fname (ucd_get ucd_p) p in 109 | prop "ideographic" Uucd.ideographic Uucp.Cjk.is_ideographic; 110 | prop "ids_unary_operator" 111 | Uucd.ids_unary_operator Uucp.Cjk.is_ids_unary_operator; 112 | prop "ids_binary_operator" 113 | Uucd.ids_binary_operator Uucp.Cjk.is_ids_binary_operator; 114 | prop "ids_trinary_operator" 115 | Uucd.ids_trinary_operator Uucp.Cjk.is_ids_trinary_operator; 116 | prop "radical" Uucd.radical Uucp.Cjk.is_radical; 117 | prop "unified_ideograph" Uucd.unified_ideograph Uucp.Cjk.is_unified_ideograph; 118 | () 119 | 120 | let assert_emoji ucd = 121 | let prop fname ucd_p p = prop ucd "Uucd.Emoji" fname (ucd_get ucd_p) p in 122 | prop "is_emoji" Uucd.emoji Uucp.Emoji.is_emoji; 123 | prop "is_emoji_presentation" 124 | Uucd.emoji_presentation Uucp.Emoji.is_emoji_presentation; 125 | prop "is_emoji_modifier" Uucd.emoji_modifier Uucp.Emoji.is_emoji_modifier; 126 | prop "is_emoji_modifier_base" 127 | Uucd.emoji_modifier_base Uucp.Emoji.is_emoji_modifier_base; 128 | prop "is_emoji_component" Uucd.emoji_component Uucp.Emoji.is_emoji_component; 129 | prop "is_extended_pictographic" 130 | Uucd.extended_pictographic Uucp.Emoji.is_extended_pictographic; 131 | () 132 | 133 | let assert_func ucd = 134 | let prop fname ucd_p p = prop ucd "Uucd.Func" fname (ucd_get ucd_p) p in 135 | prop "is_dash" Uucd.dash Uucp.Func.is_dash; 136 | prop "is_diacritic" Uucd.diacritic Uucp.Func.is_diacritic; 137 | prop "is_extender" Uucd.extender Uucp.Func.is_extender; 138 | prop "is_grapheme_base" Uucd.grapheme_base Uucp.Func.is_grapheme_base; 139 | prop "is_grapheme_extend" Uucd.grapheme_extend Uucp.Func.is_grapheme_extend; 140 | prop "is_math" Uucd.math Uucp.Func.is_math; 141 | prop "is_quotation_mark" Uucd.quotation_mark Uucp.Func.is_quotation_mark; 142 | prop "is_soft_dotted" Uucd.soft_dotted Uucp.Func.is_soft_dotted; 143 | prop "is_terminal_punctuation" Uucd.terminal_punctuation 144 | Uucp.Func.is_terminal_punctuation; 145 | prop "is_regional_indicator" Uucd.regional_indicator 146 | Uucp.Func.is_regional_indicator; 147 | () 148 | 149 | let assert_gc ucd = 150 | let prop fname ucd_p p = prop ucd "Uucp.Gc" fname (ucd_get ucd_p) p in 151 | prop "general_category" Uucd.general_category Uucp.Gc.general_category; 152 | () 153 | 154 | let assert_gen ucd = 155 | let prop fname ucd_p p = prop ucd "Uucp.Gen" fname (ucd_get ucd_p) p in 156 | prop "is_default_ignorable" Uucd.default_ignorable_code_point 157 | Uucp.Gen.is_default_ignorable; 158 | prop "is_deprecated" Uucd.deprecated Uucp.Gen.is_deprecated ; 159 | prop "is_logical_order_exception" Uucd.logical_order_exception 160 | Uucp.Gen.is_logical_order_exception; 161 | prop "is_non_character" Uucd.noncharacter_code_point 162 | Uucp.Gen.is_non_character; 163 | prop "is_variation_selector" Uucd.variation_selector 164 | Uucp.Gen.is_variation_selector; 165 | () 166 | 167 | let assert_hangul ucd = 168 | let prop fname ucd_p p = prop ucd "Uucp.Hangul" fname (ucd_get ucd_p) p in 169 | prop "syllable_type" Uucd.hangul_syllable_type Uucp.Hangul.syllable_type; 170 | () 171 | 172 | let assert_id ucd = 173 | let prop fname ucd_p p = prop ucd "Uucp.Id" fname (ucd_get ucd_p) p in 174 | prop "is_id_start" Uucd.id_start Uucp.Id.is_id_start; 175 | prop "is_id_continue" Uucd.id_continue Uucp.Id.is_id_continue; 176 | prop "is_xid_start" Uucd.xid_start Uucp.Id.is_xid_start; 177 | prop "is_xid_continue" Uucd.xid_continue Uucp.Id.is_xid_continue; 178 | prop "is_id_compat_math_start" 179 | Uucd.id_compat_math_start Uucp.Id.is_id_compat_math_start; 180 | prop "is_id_compat_math_continue" 181 | Uucd.id_compat_math_continue Uucp.Id.is_id_compat_math_continue; 182 | prop "is_pattern_syntax" Uucd.pattern_syntax Uucp.Id.is_pattern_syntax; 183 | prop "is_pattern_white_space" Uucd.pattern_white_space 184 | Uucp.Id.is_pattern_white_space; 185 | () 186 | 187 | let assert_name ucd = 188 | let buf = Buffer.create 244 in 189 | let name_prop ucd u = match (ucd_get Uucd.name ucd u) with 190 | | `Name n -> n 191 | | `Pattern n -> 192 | Buffer.clear buf; 193 | for i = 0 to String.length n - 1 do 194 | if n.[i] = '#' 195 | then Buffer.add_string buf (str "%04X" (Uchar.to_int u)) 196 | else Buffer.add_char buf n.[i] 197 | done; 198 | Buffer.contents buf 199 | in 200 | prop ucd "Uucd.Name" "name" name_prop Uucp.Name.name; 201 | let alias_prop ucd u = 202 | let permute (n, t) = (t, n) in 203 | List.map permute (ucd_get Uucd.name_alias ucd u) 204 | in 205 | prop ucd "Uucd.Name" "name_alias" alias_prop Uucp.Name.name_alias; 206 | () 207 | 208 | let assert_num ucd = 209 | let prop fname ucd_p p = prop ucd "Uucp.Num" fname (ucd_get ucd_p) p in 210 | prop "is_ascii_hex_digit" Uucd.ascii_hex_digit Uucp.Num.is_ascii_hex_digit; 211 | prop "is_hex_digit" Uucd.hex_digit Uucp.Num.is_hex_digit; 212 | prop "numeric_type" Uucd.numeric_type Uucp.Num.numeric_type; 213 | prop "numeric_value" Uucd.numeric_value Uucp.Num.numeric_value; 214 | () 215 | 216 | let assert_script ucd = 217 | let prop fname ucd_p p = prop ucd "Uucp.Script" fname (ucd_get ucd_p) p in 218 | prop "script" Uucd.script Uucp.Script.script; 219 | prop "script_extensions" 220 | Uucd.script_extensions Uucp.Script.script_extensions; 221 | () 222 | 223 | let assert_white ucd = 224 | let prop fname ucd_p p = prop ucd "Uucd.White" fname (ucd_get ucd_p) p in 225 | prop "is_white_space" Uucd.white_space Uucp.White.is_white_space; 226 | () 227 | 228 | let test inf mods = 229 | Test.main @@ fun () -> 230 | let do_assert m = mods = [] || List.mem m mods in 231 | let ucd = load_ucd inf in 232 | if do_assert `Age then assert_age ucd; 233 | if do_assert `Alpha then assert_alpha ucd; 234 | if do_assert `Block then assert_block ucd; 235 | if do_assert `Break then assert_break ucd; 236 | if do_assert `Case then assert_case ucd; 237 | if do_assert `Cjk then assert_cjk ucd; 238 | if do_assert `Emoji then assert_emoji ucd; 239 | if do_assert `Func then assert_func ucd; 240 | if do_assert `Gc then assert_gc ucd; 241 | if do_assert `Gen then assert_gen ucd; 242 | if do_assert `Hangul then assert_hangul ucd; 243 | if do_assert `Id then assert_id ucd; 244 | if do_assert `Name then assert_name ucd; 245 | if do_assert `Num then assert_num ucd; 246 | if do_assert `Script then assert_script ucd; 247 | if do_assert `White then assert_white ucd; 248 | () 249 | 250 | let main () = 251 | let usage = str 252 | "Usage: %s [OPTION]... [DBFILE]\n\ 253 | \ Asserts Uucp's data against the Unicode character database DBFILE.\n\ 254 | \ DBFILE defaults to support/ucd.xml, without any option asserts all\n\ 255 | \ modules.\n\ 256 | Options:" exec 257 | in 258 | let inf = ref None in 259 | let set_inf f = 260 | if !inf = None then inf := Some f else 261 | raise (Arg.Bad "only one Unicode character database file can be specified") 262 | in 263 | let mods = ref [] in 264 | let add v = Arg.Unit (fun () -> mods := v :: !mods) in 265 | let options = [ 266 | "-age", add `Age, " assert the Age module"; 267 | "-alpha", add `Alpha, " assert the Alpha module"; 268 | "-block", add `Block, " assert the Block module"; 269 | "-break", add `Break, " assert the Break module"; 270 | "-case", add `Case, " assert the Case module"; 271 | "-cjk", add `Cjk, " assert the CJK module"; 272 | "-emoji", add `Emoji, " assert the Emoji module"; 273 | "-func", add `Func, " assert the Func module"; 274 | "-gc", add `Gc, " assert the Gc module"; 275 | "-gen", add `Gen, " assert the Gen module"; 276 | "-hangul", add `Hangul, " assert the Hangul module"; 277 | "-id", add `Id, " assert the Id module"; 278 | "-name", add `Name, " assert the Name module"; 279 | "-num", add `Num, " assert the Num module"; 280 | "-script", add `Script, " assert the Script module"; 281 | "-white", add `White, " assert the White module"; ] 282 | in 283 | Arg.parse (Arg.align options) set_inf usage; 284 | test !inf !mods 285 | 286 | let () = if !Sys.interactive then () else exit (main ()) 287 | -------------------------------------------------------------------------------- /src/uucp_gen_data.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2025 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* WARNING do not edit. This file was automatically generated. *) 7 | 8 | open Uucp_tmapbool 9 | let v000 = 10 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 11 | \x00\x00\x00\x00\x00\x20\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 12 | let v001 = snil 13 | let v002 = 14 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\ 15 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 16 | let v003 = 17 | "\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 18 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 19 | let v004 = 20 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\x01\x00\x00\x00\ 21 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 22 | let v005 = 23 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 24 | \x00\x00\x00\x00\x00\x00\x30\x00\x00\x00\x00\x00\x00\x00\x00\x00" 25 | let v006 = 26 | "\x00\xf8\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 27 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 28 | let v007 = 29 | "\x00\xf8\x00\x00\x00\x7c\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\ 30 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 31 | let v008 = 32 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00\ 33 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 34 | let v009 = 35 | "\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 36 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80" 37 | let v010 = 38 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 39 | \x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\x01" 40 | let v011 = 41 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 42 | \x00\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 43 | let v012 = 44 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf8\x07\ 45 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 46 | let v013 = 47 | "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\ 48 | \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" 49 | let default_ignorable_map = 50 | { default = false; 51 | l0 = 52 | [|[|v000; v001; v001; v002; v001; v001; v003; v001; v001; v001; v001; 53 | v001; v001; v001; v001; v001|]; 54 | [|v001; v004; v001; v001; v001; v001; v001; v005; v006; v001; v001; 55 | v001; v001; v001; v001; v001|]; 56 | [|v007; v001; v001; v001; v001; v001; v001; v001; v001; v001; v001; 57 | v001; v001; v001; v001; v001|]; 58 | [|v001; v008; v001; v001; v001; v001; v001; v001; v001; v001; v001; 59 | v001; v001; v001; v001; v001|]; 60 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 61 | [|v001; v001; v001; v001; v001; v001; v001; v001; v001; v001; v001; 62 | v001; v001; v001; v009; v010|]; 63 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 64 | [|v001; v001; v001; v001; v001; v001; v001; v001; v001; v001; v001; 65 | v001; v011; v001; v001; v001|]; 66 | nil; 67 | [|v001; v012; v001; v001; v001; v001; v001; v001; v001; v001; v001; 68 | v001; v001; v001; v001; v001|]; 69 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 70 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 71 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 72 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 73 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 74 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 75 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 76 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 77 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 78 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 79 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 80 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 81 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 82 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 83 | [|v013; v013; v013; v013; v013; v013; v013; v013; v013; v013; v013; 84 | v013; v013; v013; v013; v013|]; 85 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 86 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 87 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 88 | nil; nil; nil; nil; nil|] } 89 | 90 | open Uucp_tmapbool 91 | let v000 = snil 92 | let v001 = 93 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\ 94 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 95 | let v002 = 96 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\ 97 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 98 | let v003 = 99 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\x02\ 100 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 101 | let v004 = 102 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 103 | \x00\x00\x00\x00\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 104 | let v005 = 105 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfc\x00\x00\ 106 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 107 | let v006 = 108 | "\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 109 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 110 | let v007 = 111 | "\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 112 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 113 | let deprecated_map = 114 | { default = false; 115 | l0 = 116 | [|[|v000; v001; v000; v000; v000; v000; v002; v000; v000; v000; v000; 117 | v000; v000; v000; v000; v003|]; 118 | [|v000; v000; v000; v000; v000; v000; v000; v004; v000; v000; v000; 119 | v000; v000; v000; v000; v000|]; 120 | [|v005; v000; v000; v006; v000; v000; v000; v000; v000; v000; v000; 121 | v000; v000; v000; v000; v000|]; 122 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 123 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 124 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 125 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 126 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 127 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 128 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 129 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 130 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 131 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 132 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 133 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 134 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 135 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 136 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 137 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 138 | [|v007; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 139 | v000; v000; v000; v000; v000|]; 140 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 141 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 142 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 143 | nil; nil; nil; nil; nil|] } 144 | 145 | open Uucp_tmapbool 146 | let v000 = snil 147 | let v001 = 148 | "\x00\x00\x00\x00\x00\x00\x00\x00\x1f\x00\x00\x00\x00\x00\x00\x00\ 149 | \x00\x00\x00\x00\x00\x00\x00\x00\x1f\x00\x00\x00\x00\x00\x00\x00" 150 | let v002 = 151 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 152 | \x00\x00\x00\x00\x00\x00\xe0\x04\x00\x00\x00\x00\x00\x00\x00\x00" 153 | let v003 = 154 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 155 | \x00\x00\x00\x00\x00\x00\x60\x1a\x00\x00\x00\x00\x00\x00\x00\x00" 156 | let logical_order_exception_map = 157 | { default = false; 158 | l0 = 159 | [|[|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 160 | v000; v000; v000; v001; v000|]; 161 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v002; v000; 162 | v000; v000; v000; v000; v000|]; 163 | nil; nil; nil; nil; nil; nil; nil; nil; 164 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v003; 165 | v000; v000; v000; v000; v000|]; 166 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 167 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 168 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 169 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 170 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 171 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 172 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 173 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 174 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 175 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 176 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 177 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 178 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 179 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 180 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 181 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 182 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 183 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 184 | nil; nil; nil; nil; nil; nil; nil; nil; nil|] } 185 | 186 | open Uucp_tmapbool 187 | let v000 = snil 188 | let v001 = 189 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 190 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00" 191 | let v002 = 192 | "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 193 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc0" 194 | let non_character_map = 195 | { default = false; 196 | l0 = 197 | [|nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 198 | nil; 199 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 200 | v000; v000; v001; v000; v002|]; 201 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 202 | nil; 203 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 204 | v000; v000; v000; v000; v002|]; 205 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 206 | nil; 207 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 208 | v000; v000; v000; v000; v002|]; 209 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 210 | nil; 211 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 212 | v000; v000; v000; v000; v002|]; 213 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 214 | nil; 215 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 216 | v000; v000; v000; v000; v002|]; 217 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 218 | nil; 219 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 220 | v000; v000; v000; v000; v002|]; 221 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 222 | nil; 223 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 224 | v000; v000; v000; v000; v002|]; 225 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 226 | nil; 227 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 228 | v000; v000; v000; v000; v002|]; 229 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 230 | nil; 231 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 232 | v000; v000; v000; v000; v002|]; 233 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 234 | nil; 235 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 236 | v000; v000; v000; v000; v002|]; 237 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 238 | nil; 239 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 240 | v000; v000; v000; v000; v002|]; 241 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 242 | nil; 243 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 244 | v000; v000; v000; v000; v002|]; 245 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 246 | nil; 247 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 248 | v000; v000; v000; v000; v002|]; 249 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 250 | nil; 251 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 252 | v000; v000; v000; v000; v002|]; 253 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 254 | nil; 255 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 256 | v000; v000; v000; v000; v002|]; 257 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 258 | nil; 259 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 260 | v000; v000; v000; v000; v002|]; 261 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 262 | nil; 263 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 264 | v000; v000; v000; v000; v002|]|] } 265 | 266 | open Uucp_tmapbool 267 | let v000 = snil 268 | let v001 = 269 | "\x00\xb8\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 270 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 271 | let v002 = 272 | "\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ 273 | \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 274 | let v003 = 275 | "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\ 276 | \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00\x00" 277 | let variation_selector_map = 278 | { default = false; 279 | l0 = 280 | [|nil; 281 | [|v000; v000; v000; v000; v000; v000; v000; v000; v001; v000; v000; 282 | v000; v000; v000; v000; v000|]; 283 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 284 | [|v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; v000; 285 | v000; v000; v000; v002; v000|]; 286 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 287 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 288 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 289 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 290 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 291 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 292 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 293 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 294 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 295 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 296 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 297 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 298 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 299 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 300 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 301 | [|v000; v003; v000; v000; v000; v000; v000; v000; v000; v000; v000; 302 | v000; v000; v000; v000; v000|]; 303 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 304 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 305 | nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; nil; 306 | nil; nil; nil; nil; nil|] } 307 | 308 | 309 | --------------------------------------------------------------------------------