├── BRZO ├── src ├── uucd.mllib ├── uucd.mli └── uucd.ml ├── doc ├── api.odocl └── index.mld ├── .merlin ├── .gitignore ├── _tags ├── pkg ├── META └── pkg.ml ├── DEVEL.md ├── LICENSE.md ├── README.md ├── test ├── test_uucd.ml └── example.ml ├── opam ├── CHANGES.md └── B0.ml /BRZO: -------------------------------------------------------------------------------- 1 | (srcs-x pkg) -------------------------------------------------------------------------------- /src/uucd.mllib: -------------------------------------------------------------------------------- 1 | Uucd -------------------------------------------------------------------------------- /doc/api.odocl: -------------------------------------------------------------------------------- 1 | Uucd 2 | -------------------------------------------------------------------------------- /.merlin: -------------------------------------------------------------------------------- 1 | PKG b0.kit xmlm 2 | B _b0/** 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _b0 2 | _build 3 | tmp 4 | *.install 5 | test/ucd.xml -------------------------------------------------------------------------------- /_tags: -------------------------------------------------------------------------------- 1 | <**/*.{ml,mli}> : bin_annot, safe_string 2 | : include 3 | : include 4 | : package(xmlm) 5 | : package(xmlm) 6 | <_b0> : -traverse -------------------------------------------------------------------------------- /doc/index.mld: -------------------------------------------------------------------------------- 1 | {0 Uucd {%html: %%VERSION%%%}} 2 | 3 | Uucd is an OCaml module to decode the data of the Unicode character 4 | database from its XML representation. 
5 | 6 | {1:uucd Library [uucd]} 7 | 8 | {!modules: 9 | Uucd 10 | } 11 | -------------------------------------------------------------------------------- /pkg/META: -------------------------------------------------------------------------------- 1 | description = "Unicode character database decoder for OCaml" 2 | version = "%%VERSION_NUM%%" 3 | requires = "xmlm" 4 | archive(byte) = "uucd.cma" 5 | archive(native) = "uucd.cmxa" 6 | plugin(byte) = "uucd.cma" 7 | plugin(native) = "uucd.cmxs" 8 | exists_if = "uucd.cma uucd.cmxa" 9 | -------------------------------------------------------------------------------- /pkg/pkg.ml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ocaml 2 | #use "topfind" 3 | #require "topkg" 4 | open Topkg 5 | 6 | let distrib = 7 | (* Keep this version in sync with [unicode_version] in B0.ml (see DEVEL.md). FIXME OPAMv2, move this to an x-unicode-version field in the opam file. *) 8 | let watermarks = ("UNICODE_VERSION", `String "17.0.0") :: Pkg.watermarks in 9 | Pkg.distrib ~watermarks () 10 | 11 | let () = 12 | Pkg.describe "uucd" ~distrib @@ fun c -> 13 | Ok [ Pkg.mllib ~api:["Uucd"] "src/uucd.mllib"; ] 14 | -------------------------------------------------------------------------------- /DEVEL.md: -------------------------------------------------------------------------------- 1 | # New Unicode release 2 | 3 | Bump the Unicode release number at the top of the `B0.ml` file and in 4 | `pkg/pkg.ml`. Verify that everything is as expected with: 5 | 6 | b0 -- unicode-version 7 | 8 | Download the latest xml unicode database to the `test/ucd.xml` file 9 | which is ignored by git. If you have `curl` and `unzip` in your `PATH` 10 | you can simply issue: 11 | 12 | b0 -- download-ucdxml 13 | 14 | Then you should run 15 | 16 | b0 test 17 | 18 | this will likely fail with a parse error. 
Adjust the parser and 19 | datatypes with the help of: 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 The uucd programmers 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Uucd — Unicode character database decoder for OCaml 2 | =================================================== 3 | 4 | Uucd is an OCaml module to decode the data of the [Unicode character 5 | database][1] from its XML [representation][2]. It provides high-level 6 | (but not necessarily efficient) access to the data so that efficient 7 | representations can be extracted. 8 | 9 | Uucd is made of a single module, depends on [Xmlm][xmlm] and is distributed 10 | under the ISC license. 
11 | 12 | [1]: http://www.unicode.org/reports/tr44/ 13 | [2]: http://www.unicode.org/reports/tr42/ 14 | [xmlm]: http://erratique.ch/software/xmlm 15 | 16 | Home page: <https://erratique.ch/software/uucd> 17 | 18 | ## Installation 19 | 20 | Uucd can be installed with `opam`: 21 | 22 | opam install uucd 23 | 24 | If you don't use `opam` consult the [`opam`](opam) file for build 25 | instructions and a complete specification of the dependencies. 26 | 27 | ## Documentation 28 | 29 | The documentation and API reference can be consulted [online][doc] 30 | or via `odig doc uucd`. 31 | 32 | [doc]: http://erratique.ch/software/uucd/doc/ 33 | -------------------------------------------------------------------------------- /test/test_uucd.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2024 The uucd programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | open B0_std 7 | open B0_testing 8 | 9 | let test_decode = 10 | Test.test "Uucd.decode" @@ fun file -> 11 | Test.noraise ~__POS__ @@ fun () -> 12 | let cwd = Os.Dir.cwd () |> Result.error_to_failure in 13 | let inf = Fpath.(cwd // v "test/ucd.xml") in 14 | let inf = Fpath.to_string inf in 15 | try 16 | In_channel.with_open_bin inf @@ fun ic -> 17 | let d = Uucd.decoder (`Channel ic) in 18 | match Uucd.decode d with 19 | | `Ok db -> 20 | let props = Uucd.Cpmap.find 0x0020 db.repertoire in 21 | Test.(option T.any) (Uucd.find props Uucd.general_category) (Some `Zs) 22 | ~__POS__ 23 | | `Error e -> 24 | let (l0, c0), (l1, c1) = Uucd.decoded_range d in 25 | Test.failstop ~__POS__ "%s:%d.%d-%d.%d: %s\n%!" 
inf l0 c0 l1 c1 e 26 | with 27 | | Sys_error e -> Test.failstop "%s" e ~__POS__ 28 | 29 | let main () = Test.main @@ fun () -> Test.autorun () 30 | let () = if !Sys.interactive then () else exit (main ()) 31 | -------------------------------------------------------------------------------- /test/example.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2012 The uucd programmers. All rights reserved. 3 | SPDX-License-Identifier: CC0-1.0 4 | ---------------------------------------------------------------------------*) 5 | 6 | let ucd_or_die inf = 7 | try 8 | let ic = if inf = "-" then stdin else open_in inf in 9 | let d = Uucd.decoder (`Channel ic) in 10 | match Uucd.decode d with 11 | | `Ok db -> db 12 | | `Error e -> 13 | let (l0, c0), (l1, c1) = Uucd.decoded_range d in 14 | Printf.eprintf "%s:%d.%d-%d.%d: %s\n%!" inf l0 c0 l1 c1 e; 15 | exit 1 16 | with Sys_error e -> Printf.eprintf "%s\n%!" 
e; exit 1 17 | (* Reads back a database marshaled by [main]. The file is opened in binary mode, as Marshal requires, and is closed once decoding returns or raises. *) 18 | let ucd_from_marshaled : string -> Uucd.t = 19 | fun inf -> In_channel.with_open_bin inf Marshal.from_channel 20 | (* Decodes the XML database named on the command line (stdin if absent) and marshals it on stdout. *) 21 | let main () = 22 | let usage = "test [ucd.xml]" in 23 | let inf = ref None in 24 | let anon_fun file = match !inf with 25 | | Some _ -> raise (Arg.Bad ("Don't know what to do with " ^ file)) 26 | | None -> inf := Some file 27 | in 28 | Arg.parse [] anon_fun usage; 29 | let inf = Option.value ~default:"-" !inf in 30 | set_binary_mode_out stdout true; (* marshaled data is binary *) Marshal.to_channel stdout (ucd_or_die inf) [] 31 | 32 | let () = if !Sys.interactive then () else main () 33 | -------------------------------------------------------------------------------- /opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | name: "uucd" 3 | synopsis: "Unicode character database decoder for OCaml" 4 | description: """\ 5 | Uucd is an OCaml module to decode the data of the [Unicode character 6 | database][1] from its XML [representation][2]. It provides high-level 7 | (but not necessarily efficient) access to the data so that efficient 8 | representations can be extracted. 9 | 10 | Uucd is made of a single module, depends on [Xmlm][xmlm] and is distributed 11 | under the ISC license. 
12 | 13 | [1]: http://www.unicode.org/reports/tr44/ 14 | [2]: http://www.unicode.org/reports/tr42/ 15 | [xmlm]: http://erratique.ch/software/xmlm 16 | 17 | Home page: <https://erratique.ch/software/uucd>""" 18 | maintainer: "Daniel Bünzli " 19 | authors: "The uucd programmers" 20 | license: "ISC" 21 | tags: ["unicode" "database" "decoder" "org:erratique"] 22 | homepage: "https://erratique.ch/software/uucd" 23 | doc: "https://erratique.ch/software/uucd/doc/Uucd" 24 | bug-reports: "https://github.com/dbuenzli/uucd/issues" 25 | depends: [ 26 | "ocaml" {>= "4.08.0"} 27 | "ocamlfind" {build} 28 | "ocamlbuild" {build} 29 | "topkg" {build & >= "1.1.0"} 30 | "xmlm" 31 | ] 32 | build: ["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%"] 33 | dev-repo: "git+https://erratique.ch/repos/uucd.git" 34 | x-maintenance-intent: ["(latest)"] 35 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | v17.0.0 2025-09-11 Zagreb 2 | ------------------------- 3 | 4 | - Support for Unicode 17.0.0 5 | 6 | v16.0.0 2024-09-11 Zagreb 7 | ------------------------- 8 | 9 | - Support for Unicode 16.0.0 10 | 11 | v15.1.0 2023-09-15 Zagreb 12 | ------------------------- 13 | 14 | - Support for Unicode 15.1.0 15 | 16 | v15.0.0 2022-09-15 Zagreb 17 | ------------------------- 18 | 19 | - Support for Unicode 15.0.0 20 | 21 | v14.0.0 2021-09-17 Zagreb 22 | ------------------------- 23 | 24 | - Support for Unicode 14.0.0 25 | 26 | v13.0.0 2020-03-10 La Forclaz (VS) 27 | ---------------------------------- 28 | 29 | - Support for Unicode 13.0.0 30 | 31 | v12.0.0 2019-03-07 La Forclaz (VS) 32 | ---------------------------------- 33 | 34 | - Support for Unicode 12.0.0 35 | 36 | v11.0.0 2018-06-06 Lausanne 37 | --------------------------- 38 | 39 | - Support for Unicode 11.0.0 40 | 41 | v10.0.0 2017-06-20 Cambridge (UK) 42 | --------------------------------- 43 | 44 | - Support for Unicode 10.0.0 45 | 46 | v4.0.0 
2016-06-26 Cambridge (UK) 47 | -------------------------------- 48 | 49 | - Updated for Unicode 9.0.0 50 | - Build depend on topkg. 51 | - Relicensed from BSD3 to ISC. 52 | 53 | v3.0.0 2015-06-17 Cambridge (UK) 54 | -------------------------------- 55 | 56 | - Updated for Unicode 8.0.0 57 | 58 | v2.0.0 2014-06-16 Cambridge (UK) 59 | -------------------------------- 60 | 61 | - Updated for Unicode 7.0.0 62 | 63 | v1.0.0 2013-10-01 Lausanne 64 | -------------------------- 65 | 66 | - Updated for Unicode 6.3.0. 67 | - OPAM friendly workflow and drop OASIS support. 68 | 69 | v0.9.2 2013-01-04 La Forclaz (VS) 70 | --------------------------------- 71 | 72 | - Updated for Unicode 6.2.0. 73 | 74 | v0.9.1 2013-01-04 La Forclaz (VS) 75 | --------------------------------- 76 | 77 | - Fix Uucd.is_scalar_value always returning false. 78 | 79 | v0.9.0 2012-09-07 Lausanne 80 | -------------------------- 81 | 82 | First release. 83 | -------------------------------------------------------------------------------- /B0.ml: -------------------------------------------------------------------------------- 1 | open B0_kit.V000 2 | open Result.Syntax 3 | 4 | let unicode_version = 17, 0, 0, None (* Adjust on new releases *) 5 | 6 | (* OCaml library names *) 7 | 8 | let b0_std = B0_ocaml.libname "b0.std" 9 | let xmlm = B0_ocaml.libname "xmlm" 10 | let uucd = B0_ocaml.libname "uucd" 11 | 12 | (* Libraries *) 13 | 14 | let uucd_lib = 15 | let srcs = [ `Dir ~/"src" ] and requires = [ xmlm ] in 16 | B0_ocaml.lib uucd ~doc:"Uucd library" ~srcs ~requires 17 | 18 | (* Actions *) 19 | 20 | let uc_base = "http://www.unicode.org/Public" 21 | 22 | let download_ucdxml = 23 | let doc = "Download the Unicode character database to test/ucd.xml" in 24 | B0_unit.of_action "download-ucdxml" ~doc @@ fun env _ ~args:_ -> 25 | let* unzip = B0_env.get_cmd env (Cmd.arg "unzip") in 26 | let version = B0_version.to_string unicode_version in 27 | let ucd_url = Fmt.str "%s/%s/ucdxml/ucd.all.grouped.zip" uc_base 
version in 28 | let ucd_file = B0_env.in_scope_dir env ~/"test/ucd.xml" in 29 | Result.join @@ Os.File.with_tmp_fd @@ fun tmpfile tmpfd -> 30 | (Log.stdout @@ fun m -> 31 | m "@[Downloading %s@,to %a@]" ucd_url Fpath.pp ucd_file); 32 | let* () = B0_action_kit.fetch_url env ucd_url tmpfile in 33 | let stdout = Os.Cmd.out_file ~force:true ~make_path:true ucd_file in 34 | Os.Cmd.run Cmd.(unzip % "-p" %% path tmpfile) ~stdout 35 | 36 | let show_version = 37 | B0_unit.of_action "unicode-version" ~doc:"Show supported unicode version" @@ 38 | fun _ _ ~args:_ -> 39 | Ok (Log.stdout (fun m -> m "%s" (B0_version.to_string unicode_version))) 40 | 41 | (* Tests *) 42 | 43 | let test_uucd = 44 | let srcs = [ `File ~/"test/test_uucd.ml" ] in 45 | let meta = 46 | B0_meta.(empty |> tag test |> tag run |> ~~ B0_unit.Action.cwd `Scope_dir) 47 | in 48 | let requires = [uucd; b0_std] in 49 | B0_ocaml.exe "test_uucd" ~doc:"Test decoder" ~srcs ~requires ~meta 50 | 51 | let example = 52 | let srcs = [ `File ~/"test/example.ml" ] in 53 | let meta = B0_meta.(empty |> tag test) in 54 | B0_ocaml.exe "example" ~doc:"Sample code" ~srcs ~meta ~requires:[uucd] 55 | 56 | (* Packs *) 57 | 58 | let default = 59 | let meta = 60 | B0_meta.empty 61 | |> ~~ B0_meta.authors ["The uucd programmers"] 62 | |> ~~ B0_meta.maintainers ["Daniel Bünzli "] 63 | |> ~~ B0_meta.homepage "https://erratique.ch/software/uucd" 64 | |> ~~ B0_meta.online_doc "https://erratique.ch/software/uucd/doc/Uucd" 65 | |> ~~ B0_meta.licenses ["ISC"] 66 | |> ~~ B0_meta.repo "git+https://erratique.ch/repos/uucd.git" 67 | |> ~~ B0_meta.issues "https://github.com/dbuenzli/uucd/issues" 68 | |> ~~ B0_meta.description_tags 69 | ["unicode"; "database"; "decoder"; "org:erratique"] 70 | |> B0_meta.tag B0_opam.tag 71 | |> ~~ B0_opam.build 72 | {|[["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%"]]|} 73 | |> ~~ B0_opam.depends 74 | [ "ocaml", {|>= "4.08.0"|}; 75 | "ocamlfind", {|build|}; 76 | "ocamlbuild", {|build|}; 77 | "topkg", {|build & 
>= "1.1.0"|}; 78 | "xmlm", {||} ] 79 | in 80 | B0_pack.make "default" ~doc:"uucd package" ~meta ~locked:true @@ 81 | B0_unit.list () 82 | -------------------------------------------------------------------------------- /src/uucd.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2012 The uucd programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Unicode character database decoder. 7 | 8 | [Uucd] decodes the data of the 9 | {{:http://www.unicode.org/reports/tr44}Unicode character database} 10 | from its XML representation. It provides high-level (but not 11 | necessarily efficient) access to the data so that efficient 12 | representations can be extracted. 13 | 14 | [Uucd] decodes the representation described in the Annex #42 of 15 | Unicode %%UNICODE_VERSION%%. Subsequent versions may be decoded as 16 | long as no new cases are introduced in parsed enumerated 17 | properties. 18 | 19 | Consult the {{!basics}basics}. 20 | 21 | {b Note.} All strings returned by the module are UTF-8 encoded. 22 | 23 | {e Unicode version %%UNICODE_VERSION%%} 24 | 25 | {3 References} 26 | {ul 27 | {- The Unicode Consortium. 28 | {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}. 29 | (latest version)} 30 | {- Mark Davis, Ken Whistler. 31 | {e {{:http://www.unicode.org/reports/tr44/}UAX #44 Unicode Character 32 | Database}}. (latest version)} 33 | {- Eric Muller. 34 | {e {{:http://www.unicode.org/reports/tr42/}UAX #42 Unicode Character 35 | Database in XML}}. (latest version)}} *) 36 | 37 | (** {1:chars Code points} *) 38 | 39 | type cp = int 40 | (** The type for Unicode {{:http://unicode.org/glossary/#code_point}code 41 | points}, ranges from [0x0000] to [0x10_FFFF]. 
*) 42 | 43 | val is_cp : int -> bool 44 | (** [is_cp n] is [true] iff [n] a Unicode 45 | {{:http://unicode.org/glossary/#code_point}code 46 | point}. *) 47 | 48 | val is_scalar_value : int -> bool 49 | (** [is_scalar_value n] is [true] iff [n] is a Unicode 50 | {{:http://unicode.org/glossary/#Unicode_scalar_value}scalar value}. *) 51 | 52 | (** Code point maps. *) 53 | module Cpmap : Map.S with type key = cp 54 | 55 | (** {1:props Properties} 56 | 57 | Properties are referenced by their name and property values by 58 | their 59 | {{:http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt} 60 | abbreviated name}. To understand their semantics refer to the 61 | {{:http://www.unicode.org/versions/latest/}standard}. *) 62 | 63 | type props 64 | (** The type for sets of properties. *) 65 | 66 | type 'a prop 67 | (** The type for properties with property value of type ['a]. *) 68 | 69 | val find : props -> 'a prop -> 'a option 70 | (** [find ps p] is the value of property [p] in [ps], if any. *) 71 | 72 | val unknown_prop : string * string -> string prop 73 | (** [unknown_prop (ns, n)] is a property read from an XML attribute 74 | whose expanded name is [(ns, n)]. This can be used to access a 75 | property unknown to the module. *) 76 | 77 | (** {2:nonunihan Non Unihan properties} 78 | 79 | In alphabetical order. 
*) 80 | 81 | val age : [ `Version of int * int | `Unassigned ] prop 82 | val alphabetic : bool prop 83 | val ascii_hex_digit : bool prop 84 | val bidi_class : [ 85 | | `AL 86 | | `AN 87 | | `B 88 | | `BN 89 | | `CS 90 | | `EN 91 | | `ES 92 | | `ET 93 | | `FSI 94 | | `L 95 | | `LRE 96 | | `LRI 97 | | `LRO 98 | | `NSM 99 | | `ON 100 | | `PDF 101 | | `PDI 102 | | `R 103 | | `RLE 104 | | `RLI 105 | | `RLO 106 | | `S 107 | | `WS 108 | ] prop 109 | 110 | val bidi_control : bool prop 111 | val bidi_mirrored : bool prop 112 | val bidi_mirroring_glyph : cp option prop 113 | val bidi_paired_bracket : [ `Self | `Cp of cp ] prop 114 | val bidi_paired_bracket_type : [ `O | `C | `N ] prop 115 | val block : [ 116 | | `ASCII 117 | | `Adlam 118 | | `Aegean_Numbers 119 | | `Ahom 120 | | `Alchemical 121 | | `Alphabetic_PF 122 | | `Anatolian_Hieroglyphs 123 | | `Ancient_Greek_Music 124 | | `Ancient_Greek_Numbers 125 | | `Ancient_Symbols 126 | | `Arabic 127 | | `Arabic_Ext_A 128 | | `Arabic_Ext_B 129 | | `Arabic_Ext_C 130 | | `Arabic_Math 131 | | `Arabic_PF_A 132 | | `Arabic_PF_B 133 | | `Arabic_Sup 134 | | `Armenian 135 | | `Arrows 136 | | `Avestan 137 | | `Balinese 138 | | `Bamum 139 | | `Bamum_Sup 140 | | `Bassa_Vah 141 | | `Batak 142 | | `Bengali 143 | | `Beria_Erfe 144 | | `Bhaiksuki 145 | | `Block_Elements 146 | | `Bopomofo 147 | | `Bopomofo_Ext 148 | | `Box_Drawing 149 | | `Brahmi 150 | | `Braille 151 | | `Buginese 152 | | `Buhid 153 | | `Byzantine_Music 154 | | `CJK 155 | | `CJK_Compat 156 | | `CJK_Compat_Forms 157 | | `CJK_Compat_Ideographs 158 | | `CJK_Compat_Ideographs_Sup 159 | | `CJK_Ext_A 160 | | `CJK_Ext_B 161 | | `CJK_Ext_C 162 | | `CJK_Ext_D 163 | | `CJK_Ext_E 164 | | `CJK_Ext_F 165 | | `CJK_Ext_G 166 | | `CJK_Ext_H 167 | | `CJK_Ext_I 168 | | `CJK_Ext_J 169 | | `CJK_Radicals_Sup 170 | | `CJK_Strokes 171 | | `CJK_Symbols 172 | | `Carian 173 | | `Caucasian_Albanian 174 | | `Chakma 175 | | `Cham 176 | | `Cherokee 177 | | `Cherokee_Sup 178 | | `Chess_Symbols 179 | | 
`Chorasmian 180 | | `Compat_Jamo 181 | | `Control_Pictures 182 | | `Coptic 183 | | `Coptic_Epact_Numbers 184 | | `Counting_Rod 185 | | `Cuneiform 186 | | `Cuneiform_Numbers 187 | | `Currency_Symbols 188 | | `Cypriot_Syllabary 189 | | `Cypro_Minoan 190 | | `Cyrillic 191 | | `Cyrillic_Ext_A 192 | | `Cyrillic_Ext_B 193 | | `Cyrillic_Ext_C 194 | | `Cyrillic_Ext_D 195 | | `Cyrillic_Sup 196 | | `Deseret 197 | | `Devanagari 198 | | `Devanagari_Ext 199 | | `Devanagari_Ext_A 200 | | `Diacriticals 201 | | `Diacriticals_Ext 202 | | `Diacriticals_For_Symbols 203 | | `Diacriticals_Sup 204 | | `Dingbats 205 | | `Dives_Akuru 206 | | `Dogra 207 | | `Domino 208 | | `Duployan 209 | | `Early_Dynastic_Cuneiform 210 | | `Egyptian_Hieroglyph_Format_Controls 211 | | `Egyptian_Hieroglyphs 212 | | `Egyptian_Hieroglyphs_Ext_A 213 | | `Elbasan 214 | | `Elymaic 215 | | `Emoticons 216 | | `Enclosed_Alphanum 217 | | `Enclosed_Alphanum_Sup 218 | | `Enclosed_CJK 219 | | `Enclosed_Ideographic_Sup 220 | | `Ethiopic 221 | | `Ethiopic_Ext 222 | | `Ethiopic_Ext_A 223 | | `Ethiopic_Ext_B 224 | | `Ethiopic_Sup 225 | | `Garay 226 | | `Geometric_Shapes 227 | | `Geometric_Shapes_Ext 228 | | `Georgian 229 | | `Georgian_Ext 230 | | `Georgian_Sup 231 | | `Glagolitic 232 | | `Glagolitic_Sup 233 | | `Gothic 234 | | `Grantha 235 | | `Greek 236 | | `Greek_Ext 237 | | `Gujarati 238 | | `Gunjala_Gondi 239 | | `Gurmukhi 240 | | `Gurung_Khema 241 | | `Half_And_Full_Forms 242 | | `Half_Marks 243 | | `Hangul 244 | | `Hanifi_Rohingya 245 | | `Hanunoo 246 | | `Hatran 247 | | `Hebrew 248 | | `High_PU_Surrogates 249 | | `High_Surrogates 250 | | `Hiragana 251 | | `IDC 252 | | `IPA_Ext 253 | | `Ideographic_Symbols 254 | | `Imperial_Aramaic 255 | | `Indic_Number_Forms 256 | | `Indic_Siyaq_Numbers 257 | | `Inscriptional_Pahlavi 258 | | `Inscriptional_Parthian 259 | | `Jamo 260 | | `Jamo_Ext_A 261 | | `Jamo_Ext_B 262 | | `Javanese 263 | | `Kaithi 264 | | `Kaktovik_Numerals 265 | | `Kana_Ext_A 266 | | `Kana_Ext_B 267 | | 
`Kana_Sup 268 | | `Kanbun 269 | | `Kangxi 270 | | `Kannada 271 | | `Katakana 272 | | `Katakana_Ext 273 | | `Kawi 274 | | `Kayah_Li 275 | | `Kharoshthi 276 | | `Khitan_Small_Script 277 | | `Khmer 278 | | `Khmer_Symbols 279 | | `Khojki 280 | | `Khudawadi 281 | | `Kirat_Rai 282 | | `Lao 283 | | `Latin_1_Sup 284 | | `Latin_Ext_A 285 | | `Latin_Ext_Additional 286 | | `Latin_Ext_B 287 | | `Latin_Ext_C 288 | | `Latin_Ext_D 289 | | `Latin_Ext_E 290 | | `Latin_Ext_F 291 | | `Latin_Ext_G 292 | | `Lepcha 293 | | `Letterlike_Symbols 294 | | `Limbu 295 | | `Linear_A 296 | | `Linear_B_Ideograms 297 | | `Linear_B_Syllabary 298 | | `Lisu 299 | | `Lisu_Sup 300 | | `Low_Surrogates 301 | | `Lycian 302 | | `Lydian 303 | | `Mahajani 304 | | `Mahjong 305 | | `Makasar 306 | | `Malayalam 307 | | `Mandaic 308 | | `Manichaean 309 | | `Marchen 310 | | `Masaram_Gondi 311 | | `Math_Alphanum 312 | | `Math_Operators 313 | | `Mayan_Numerals 314 | | `Medefaidrin 315 | | `Meetei_Mayek 316 | | `Meetei_Mayek_Ext 317 | | `Mende_Kikakui 318 | | `Meroitic_Cursive 319 | | `Meroitic_Hieroglyphs 320 | | `Miao 321 | | `Misc_Arrows 322 | | `Misc_Math_Symbols_A 323 | | `Misc_Math_Symbols_B 324 | | `Misc_Pictographs 325 | | `Misc_Symbols 326 | | `Misc_Symbols_Sup 327 | | `Misc_Technical 328 | | `Modi 329 | | `Modifier_Letters 330 | | `Modifier_Tone_Letters 331 | | `Mongolian 332 | | `Mongolian_Sup 333 | | `Mro 334 | | `Multani 335 | | `Music 336 | | `Myanmar 337 | | `Myanmar_Ext_A 338 | | `Myanmar_Ext_B 339 | | `Myanmar_Ext_C 340 | | `NB 341 | | `NKo 342 | | `Nabataean 343 | | `Nag_Mundari 344 | | `Nandinagari 345 | | `New_Tai_Lue 346 | | `Newa 347 | | `Number_Forms 348 | | `Nushu 349 | | `Nyiakeng_Puachue_Hmong 350 | | `OCR 351 | | `Ogham 352 | | `Ol_Onal 353 | | `Ol_Chiki 354 | | `Old_Hungarian 355 | | `Old_Italic 356 | | `Old_North_Arabian 357 | | `Old_Permic 358 | | `Old_Persian 359 | | `Old_Sogdian 360 | | `Old_South_Arabian 361 | | `Old_Turkic 362 | | `Old_Uyghur 363 | | `Oriya 364 | | 
`Ornamental_Dingbats 365 | | `Osage 366 | | `Osmanya 367 | | `Ottoman_Siyaq_Numbers 368 | | `PUA 369 | | `Pahawh_Hmong 370 | | `Palmyrene 371 | | `Pau_Cin_Hau 372 | | `Phags_Pa 373 | | `Phaistos 374 | | `Phoenician 375 | | `Phonetic_Ext 376 | | `Phonetic_Ext_Sup 377 | | `Playing_Cards 378 | | `Psalter_Pahlavi 379 | | `Punctuation 380 | | `Rejang 381 | | `Rumi 382 | | `Runic 383 | | `Samaritan 384 | | `Saurashtra 385 | | `Sharada 386 | | `Sharada_Sup 387 | | `Shavian 388 | | `Shorthand_Format_Controls 389 | | `Siddham 390 | | `Sidetic 391 | | `Sinhala 392 | | `Sinhala_Archaic_Numbers 393 | | `Small_Forms 394 | | `Small_Kana_Ext 395 | | `Sogdian 396 | | `Sora_Sompeng 397 | | `Soyombo 398 | | `Specials 399 | | `Sundanese 400 | | `Sundanese_Sup 401 | | `Sunuwar 402 | | `Sup_Arrows_A 403 | | `Sup_Arrows_B 404 | | `Sup_Arrows_C 405 | | `Sup_Math_Operators 406 | | `Sup_PUA_A 407 | | `Sup_PUA_B 408 | | `Sup_Punctuation 409 | | `Sup_Symbols_And_Pictographs 410 | | `Super_And_Sub 411 | | `Sutton_SignWriting 412 | | `Syloti_Nagri 413 | | `Symbols_And_Pictographs_Ext_A 414 | | `Symbols_For_Legacy_Computing 415 | | `Symbols_For_Legacy_Computing_Sup 416 | | `Syriac 417 | | `Syriac_Sup 418 | | `Tagalog 419 | | `Tagbanwa 420 | | `Tags 421 | | `Tai_Le 422 | | `Tai_Tham 423 | | `Tai_Viet 424 | | `Tai_Xuan_Jing 425 | | `Tai_Yo 426 | | `Takri 427 | | `Tamil 428 | | `Tamil_Sup 429 | | `Tangsa 430 | | `Tangut 431 | | `Tangut_Components 432 | | `Tangut_Components_Sup 433 | | `Tangut_Sup 434 | | `Telugu 435 | | `Thaana 436 | | `Thai 437 | | `Tibetan 438 | | `Tifinagh 439 | | `Tirhuta 440 | | `Todhri 441 | | `Tolong_Siki 442 | | `Toto 443 | | `Transport_And_Map 444 | | `Tulu_Tigalari 445 | | `UCAS 446 | | `UCAS_Ext 447 | | `UCAS_Ext_A 448 | | `Ugaritic 449 | | `VS 450 | | `VS_Sup 451 | | `Vai 452 | | `Vedic_Ext 453 | | `Vertical_Forms 454 | | `Vithkuqi 455 | | `Wancho 456 | | `Warang_Citi 457 | | `Yezidi 458 | | `Yi_Radicals 459 | | `Yi_Syllables 460 | | `Yijing 461 | | `Zanabazar_Square 
462 | | `Znamenny_Music 463 | ] prop 464 | 465 | val canonical_combining_class : int prop 466 | val cased : bool prop 467 | val case_folding : [`Self | `Cps of cp list ] prop 468 | val case_ignorable : bool prop 469 | val changes_when_casefolded : bool prop 470 | val changes_when_casemapped : bool prop 471 | val changes_when_lowercased : bool prop 472 | val changes_when_nfkc_casefolded : bool prop 473 | val changes_when_titlecased : bool prop 474 | val changes_when_uppercased : bool prop 475 | val composition_exclusion : bool prop 476 | val dash : bool prop 477 | val decomposition_mapping : [`Self | `Cps of cp list ] prop 478 | val decomposition_type : [ 479 | | `Can 480 | | `Com 481 | | `Enc 482 | | `Fin 483 | | `Font 484 | | `Fra 485 | | `Init 486 | | `Iso 487 | | `Med 488 | | `Nar 489 | | `Nb 490 | | `Sml 491 | | `Sqr 492 | | `Sub 493 | | `Sup 494 | | `Vert 495 | | `Wide 496 | | `None 497 | ] prop 498 | 499 | val default_ignorable_code_point : bool prop 500 | val deprecated : bool prop 501 | val diacritic : bool prop 502 | val east_asian_width : [ `A | `F | `H | `N | `Na | `W ] prop 503 | val emoji : bool prop 504 | val emoji_presentation : bool prop 505 | val emoji_modifier : bool prop 506 | val emoji_modifier_base : bool prop 507 | val emoji_component : bool prop 508 | val equivalent_unified_ideograph : cp option prop 509 | val extended_pictographic : bool prop 510 | val extender : bool prop 511 | val full_composition_exclusion : bool prop 512 | val general_category : [ 513 | | `Lu 514 | | `Ll 515 | | `Lt 516 | | `Lm 517 | | `Lo 518 | | `Mn 519 | | `Mc 520 | | `Me 521 | | `Nd 522 | | `Nl 523 | | `No 524 | | `Pc 525 | | `Pd 526 | | `Ps 527 | | `Pe 528 | | `Pi 529 | | `Pf 530 | | `Po 531 | | `Sm 532 | | `Sc 533 | | `Sk 534 | | `So 535 | | `Zs 536 | | `Zl 537 | | `Zp 538 | | `Cc 539 | | `Cf 540 | | `Cs 541 | | `Co 542 | | `Cn 543 | ] prop 544 | 545 | val grapheme_base : bool prop 546 | val grapheme_cluster_break : [ 547 | | `CN 548 | | `CR 549 | | `EB 550 | | 
`EBG 551 | | `EM 552 | | `EX 553 | | `GAZ 554 | | `L 555 | | `LF 556 | | `LV 557 | | `LVT 558 | | `PP 559 | | `RI 560 | | `SM 561 | | `T 562 | | `V 563 | | `XX 564 | | `ZWJ 565 | ] prop 566 | 567 | val grapheme_extend : bool prop 568 | val hangul_syllable_type : [ `L | `LV | `LVT | `T | `V | `NA ] prop 569 | val hex_digit : bool prop 570 | val id_continue : bool prop 571 | val id_compat_math_continue : bool prop 572 | val id_compat_math_start : bool prop 573 | val id_start : bool prop 574 | val ideographic : bool prop 575 | val ids_binary_operator : bool prop 576 | val ids_trinary_operator : bool prop 577 | val ids_unary_operator : bool prop 578 | 579 | val indic_conjunct_break : [ 580 | | `Consonant 581 | | `Extend 582 | | `Linker 583 | | `None ] prop 584 | 585 | val indic_syllabic_category : [ 586 | | `Avagraha 587 | | `Bindu 588 | | `Brahmi_Joining_Number 589 | | `Cantillation_Mark 590 | | `Consonant 591 | | `Consonant_Dead 592 | | `Consonant_Final 593 | | `Consonant_Head_Letter 594 | | `Consonant_Initial_Postfixed 595 | | `Consonant_Killer 596 | | `Consonant_Medial 597 | | `Consonant_Placeholder 598 | | `Consonant_Preceding_Repha 599 | | `Consonant_Prefixed 600 | | `Consonant_Repha 601 | | `Consonant_Subjoined 602 | | `Consonant_Succeeding_Repha 603 | | `Consonant_With_Stacker 604 | | `Gemination_Mark 605 | | `Invisible_Stacker 606 | | `Joiner 607 | | `Modifying_Letter 608 | | `Non_Joiner 609 | | `Nukta 610 | | `Number 611 | | `Number_Joiner 612 | | `Other 613 | | `Pure_Killer 614 | | `Reordering_Killer 615 | | `Register_Shifter 616 | | `Syllable_Modifier 617 | | `Tone_Letter 618 | | `Tone_Mark 619 | | `Virama 620 | | `Visarga 621 | | `Vowel 622 | | `Vowel_Dependent 623 | | `Vowel_Independent 624 | ] prop 625 | 626 | val indic_matra_category : [ 627 | | `Right 628 | | `Left 629 | | `Visual_Order_Left 630 | | `Left_And_Right 631 | | `Top 632 | | `Bottom 633 | | `Top_And_Bottom 634 | | `Top_And_Right 635 | | `Top_And_Left 636 | | `Top_And_Left_And_Right 637 | | 
`Bottom_And_Right 638 | | `Top_And_Bottom_And_Right 639 | | `Overstruck 640 | | `Invisible 641 | | `NA 642 | ] prop 643 | 644 | val indic_positional_category : [ 645 | | `Bottom 646 | | `Bottom_And_Left 647 | | `Bottom_And_Right 648 | | `Invisible 649 | | `Left 650 | | `Left_And_Right 651 | | `NA 652 | | `Overstruck 653 | | `Right 654 | | `Top 655 | | `Top_And_Bottom 656 | | `Top_And_Bottom_And_Left 657 | | `Top_And_Bottom_And_Right 658 | | `Top_And_Left 659 | | `Top_And_Left_And_Right 660 | | `Top_And_Right 661 | | `Visual_Order_Left 662 | ] prop 663 | 664 | val jamo_short_name : string prop 665 | val join_control : bool prop 666 | val joining_group : [ 667 | | `African_Feh 668 | | `African_Noon 669 | | `African_Qaf 670 | | `Ain 671 | | `Alaph 672 | | `Alef 673 | | `Alef_Maqsurah 674 | | `Beh 675 | | `Beth 676 | | `Burushaski_Yeh_Barree 677 | | `Dal 678 | | `Dalath_Rish 679 | | `E 680 | | `Farsi_Yeh 681 | | `Fe 682 | | `Feh 683 | | `Final_Semkath 684 | | `Gaf 685 | | `Gamal 686 | | `Hah 687 | | `Hanifi_Rohingya_Kinna_Ya 688 | | `Hanifi_Rohingya_Pa 689 | | `Hamza_On_Heh_Goal 690 | | `He 691 | | `Heh 692 | | `Heh_Goal 693 | | `Heth 694 | | `Kaf 695 | | `Kaph 696 | | `Kashmiri_Yeh 697 | | `Khaph 698 | | `Knotted_Heh 699 | | `Lam 700 | | `Lamadh 701 | | `Malayalam_Bha 702 | | `Malayalam_Ja 703 | | `Malayalam_Lla 704 | | `Malayalam_Llla 705 | | `Malayalam_Nga 706 | | `Malayalam_Nna 707 | | `Malayalam_Nnna 708 | | `Malayalam_Nya 709 | | `Malayalam_Ra 710 | | `Malayalam_Ssa 711 | | `Malayalam_Tta 712 | | `Manichaean_Aleph 713 | | `Manichaean_Ayin 714 | | `Manichaean_Beth 715 | | `Manichaean_Daleth 716 | | `Manichaean_Dhamedh 717 | | `Manichaean_Five 718 | | `Manichaean_Gimel 719 | | `Manichaean_Heth 720 | | `Manichaean_Hundred 721 | | `Manichaean_Kaph 722 | | `Manichaean_Lamedh 723 | | `Manichaean_Mem 724 | | `Manichaean_Nun 725 | | `Manichaean_One 726 | | `Manichaean_Pe 727 | | `Manichaean_Qoph 728 | | `Manichaean_Resh 729 | | `Manichaean_Sadhe 730 | | 
`Manichaean_Samekh 731 | | `Manichaean_Taw 732 | | `Manichaean_Ten 733 | | `Manichaean_Teth 734 | | `Manichaean_Thamedh 735 | | `Manichaean_Twenty 736 | | `Manichaean_Waw 737 | | `Manichaean_Yodh 738 | | `Manichaean_Zayin 739 | | `Meem 740 | | `Mim 741 | | `No_Joining_Group 742 | | `Noon 743 | | `Nun 744 | | `Nya 745 | | `Pe 746 | | `Qaf 747 | | `Qaph 748 | | `Reh 749 | | `Reversed_Pe 750 | | `Rohingya_Yeh 751 | | `Sad 752 | | `Sadhe 753 | | `Seen 754 | | `Semkath 755 | | `Shin 756 | | `Straight_Waw 757 | | `Swash_Kaf 758 | | `Syriac_Waw 759 | | `Tah 760 | | `Taw 761 | | `Teh_Marbuta 762 | | `Teh_Marbuta_Goal 763 | | `Teth 764 | | `Thin_Noon 765 | | `Thin_Yeh 766 | | `Vertical_Tail 767 | | `Waw 768 | | `Yeh 769 | | `Yeh_Barree 770 | | `Yeh_With_Tail 771 | | `Yudh 772 | | `Yudh_He 773 | | `Zain 774 | | `Zhain 775 | | `BAA 776 | | `FA 777 | | `HAA 778 | | `HA_GOAL 779 | | `HA 780 | | `CAF 781 | | `KNOTTED_HA 782 | | `RA 783 | | `SWASH_CAF 784 | | `HAMZAH_ON_HA_GOAL 785 | | `TAA_MARBUTAH 786 | | `YA_BARREE 787 | | `YA 788 | | `ALEF_MAQSURAH 789 | ] prop 790 | 791 | val joining_type : [ `U | `C | `T | `D | `L | `R ] prop 792 | val line_break : [ 793 | | `AI 794 | | `AK 795 | | `AL 796 | | `AP 797 | | `AS 798 | | `B2 799 | | `BA 800 | | `BB 801 | | `BK 802 | | `CB 803 | | `CJ 804 | | `CL 805 | | `CM 806 | | `CP 807 | | `CR 808 | | `EX 809 | | `GL 810 | | `H2 811 | | `H3 812 | | `HH 813 | | `HL 814 | | `HY 815 | | `ID 816 | | `IN 817 | | `IS 818 | | `JL 819 | | `JT 820 | | `JV 821 | | `LF 822 | | `NL 823 | | `NS 824 | | `NU 825 | | `OP 826 | | `PO 827 | | `PR 828 | | `QU 829 | | `RI 830 | | `SA 831 | | `SG 832 | | `SP 833 | | `SY 834 | | `VF 835 | | `VI 836 | | `WJ 837 | | `XX 838 | | `ZW 839 | | `EB 840 | | `EM 841 | | `ZWJ 842 | ] prop 843 | 844 | val logical_order_exception : bool prop 845 | val lowercase : bool prop 846 | val lowercase_mapping : [`Self | `Cps of cp list ] prop 847 | val math : bool prop 848 | val name : [`Pattern of string | `Name of string ] prop 
849 | (** In the [`Pattern] case occurrences of the character ['#'] 850 | ([U+0023]) in the string must be replaced by the value of the code 851 | point as four to six uppercase hexadecimal digits (the minimal 852 | needed). E.g. the pattern ["CJK UNIFIED IDEOGRAPH-#"] associated 853 | to code point [U+3400] gives the name ["CJK UNIFIED IDEOGRAPH-3400"]. *) 854 | 855 | val modifier_combining_mark : bool prop 856 | 857 | val name_alias : 858 | (string * [`Abbreviation | `Alternate | `Control | `Correction | `Figment]) 859 | list prop 860 | 861 | val nfc_quick_check : [ `True | `False | `Maybe ] prop 862 | val nfd_quick_check : [ `True | `False | `Maybe ] prop 863 | val nfkc_quick_check : [ `True | `False | `Maybe ] prop 864 | val nfkc_casefold : [`Self | `Cps of cp list] prop 865 | val nfkc_simple_casefold : [ `Self | `Cps of cp list ] prop 866 | val nfkd_quick_check : [ `True | `False | `Maybe ] prop 867 | val noncharacter_code_point : bool prop 868 | val numeric_type : [ `None | `De | `Di | `Nu ] prop 869 | val numeric_value : 870 | [ `NaN | `Nums of [`Frac of int * int | `Num of int64 ] list] prop 871 | 872 | val other_alphabetic : bool prop 873 | val other_default_ignorable_code_point : bool prop 874 | val other_grapheme_extend : bool prop 875 | val other_id_continue : bool prop 876 | val other_id_start : bool prop 877 | val other_lowercase : bool prop 878 | val other_math : bool prop 879 | val other_uppercase : bool prop 880 | val pattern_syntax : bool prop 881 | val pattern_white_space : bool prop 882 | val prepended_concatenation_mark : bool prop 883 | val quotation_mark : bool prop 884 | val radical : bool prop 885 | val regional_indicator : bool prop 886 | 887 | type script = [ 888 | | `Adlm 889 | | `Aghb 890 | | `Ahom 891 | | `Arab 892 | | `Armi 893 | | `Armn 894 | | `Avst 895 | | `Bali 896 | | `Bamu 897 | | `Bass 898 | | `Batk 899 | | `Beng 900 | | `Berf 901 | | `Bhks 902 | | `Bopo 903 | | `Brah 904 | | `Brai 905 | | `Bugi 906 | | `Buhd 907 | | `Cakm 908 
| | `Cans 909 | | `Cari 910 | | `Cham 911 | | `Cher 912 | | `Chrs 913 | | `Copt 914 | | `Cpmn 915 | | `Cprt 916 | | `Cyrl 917 | | `Deva 918 | | `Diak 919 | | `Dogr 920 | | `Dsrt 921 | | `Dupl 922 | | `Egyp 923 | | `Elba 924 | | `Elym 925 | | `Ethi 926 | | `Gara 927 | | `Geor 928 | | `Glag 929 | | `Gong 930 | | `Gonm 931 | | `Goth 932 | | `Gran 933 | | `Grek 934 | | `Gujr 935 | | `Gukh 936 | | `Guru 937 | | `Hang 938 | | `Hani 939 | | `Hano 940 | | `Hatr 941 | | `Hebr 942 | | `Hira 943 | | `Hluw 944 | | `Hmng 945 | | `Hmnp 946 | | `Hrkt 947 | | `Hung 948 | | `Ital 949 | | `Java 950 | | `Kali 951 | | `Kana 952 | | `Kawi 953 | | `Khar 954 | | `Khmr 955 | | `Khoj 956 | | `Knda 957 | | `Krai 958 | | `Kthi 959 | | `Kits 960 | | `Lana 961 | | `Laoo 962 | | `Latn 963 | | `Lepc 964 | | `Limb 965 | | `Lina 966 | | `Linb 967 | | `Lisu 968 | | `Lyci 969 | | `Lydi 970 | | `Mahj 971 | | `Maka 972 | | `Mand 973 | | `Mani 974 | | `Marc 975 | | `Medf 976 | | `Mend 977 | | `Merc 978 | | `Mero 979 | | `Mlym 980 | | `Modi 981 | | `Mong 982 | | `Mroo 983 | | `Mtei 984 | | `Mult 985 | | `Mymr 986 | | `Nagm 987 | | `Nand 988 | | `Narb 989 | | `Nbat 990 | | `Newa 991 | | `Nkoo 992 | | `Nshu 993 | | `Ogam 994 | | `Olck 995 | | `Onao 996 | | `Orkh 997 | | `Orya 998 | | `Osge 999 | | `Osma 1000 | | `Ougr 1001 | | `Palm 1002 | | `Pauc 1003 | | `Perm 1004 | | `Phag 1005 | | `Phli 1006 | | `Phlp 1007 | | `Phnx 1008 | | `Plrd 1009 | | `Prti 1010 | | `Qaai 1011 | | `Rjng 1012 | | `Rohg 1013 | | `Runr 1014 | | `Samr 1015 | | `Sarb 1016 | | `Saur 1017 | | `Sgnw 1018 | | `Shaw 1019 | | `Shrd 1020 | | `Sidd 1021 | | `Sidt 1022 | | `Sind 1023 | | `Sinh 1024 | | `Sogd 1025 | | `Sogo 1026 | | `Sora 1027 | | `Soyo 1028 | | `Sund 1029 | | `Sunu 1030 | | `Sylo 1031 | | `Syrc 1032 | | `Tagb 1033 | | `Takr 1034 | | `Tale 1035 | | `Talu 1036 | | `Taml 1037 | | `Tang 1038 | | `Tavt 1039 | | `Tayo 1040 | | `Telu 1041 | | `Tfng 1042 | | `Tglg 1043 | | `Thaa 1044 | | `Thai 1045 | | `Tibt 1046 | | `Tirh 1047 | | 
`Tnsa 1048 | | `Todr 1049 | | `Tols 1050 | | `Toto 1051 | | `Tutg 1052 | | `Ugar 1053 | | `Vaii 1054 | | `Vith 1055 | | `Wara 1056 | | `Wcho 1057 | | `Xpeo 1058 | | `Xsux 1059 | | `Yezi 1060 | | `Yiii 1061 | | `Zanb 1062 | | `Zinh 1063 | | `Zyyy 1064 | | `Zzzz 1065 | ] 1066 | 1067 | val script : script prop 1068 | val script_extensions : script list prop 1069 | 1070 | val sentence_break : [ 1071 | | `AT 1072 | | `CL 1073 | | `CR 1074 | | `EX 1075 | | `FO 1076 | | `LE 1077 | | `LF 1078 | | `LO 1079 | | `NU 1080 | | `SC 1081 | | `SE 1082 | | `SP 1083 | | `ST 1084 | | `UP 1085 | | `XX 1086 | ] prop 1087 | 1088 | val simple_case_folding : [ `Self | `Cp of cp ] prop 1089 | val simple_lowercase_mapping : [ `Self | `Cp of cp ] prop 1090 | val simple_titlecase_mapping : [ `Self | `Cp of cp ] prop 1091 | val simple_uppercase_mapping : [ `Self | `Cp of cp ] prop 1092 | val soft_dotted : bool prop 1093 | val sterm : bool prop 1094 | val terminal_punctuation : bool prop 1095 | val titlecase_mapping : [`Self | `Cps of cp list ] prop 1096 | val uax_42_element : [ `Reserved | `Noncharacter | `Surrogate | `Char ] prop 1097 | (** Not normative, artefact of [Uucd]. Corresponds to the 1098 | {{:http://www.unicode.org/reports/tr42/#w1aac13b9b1}XML element name} 1099 | that describes the code point. 
*) 1100 | 1101 | val unicode_1_name : string prop 1102 | val unified_ideograph : bool prop 1103 | val uppercase : bool prop 1104 | val uppercase_mapping : [`Self | `Cps of cp list ] prop 1105 | val variation_selector : bool prop 1106 | val vertical_orientation : [ `U | `R | `Tu | `Tr ] prop 1107 | val white_space : bool prop 1108 | val word_break : [ 1109 | | `CR 1110 | | `DQ 1111 | | `EB 1112 | | `EBG 1113 | | `EM 1114 | | `EX 1115 | | `Extend 1116 | | `FO 1117 | | `GAZ 1118 | | `HL 1119 | | `KA 1120 | | `LE 1121 | | `LF 1122 | | `MB 1123 | | `ML 1124 | | `MN 1125 | | `NL 1126 | | `NU 1127 | | `RI 1128 | | `SQ 1129 | | `WSegSpace 1130 | | `XX 1131 | | `ZWJ 1132 | ] prop 1133 | 1134 | val xid_continue : bool prop 1135 | val xid_start : bool prop 1136 | 1137 | (** {2:unihan Unihan properties} 1138 | 1139 | In alphabetic order. For now unihan properties are always 1140 | represented as strings. *) 1141 | 1142 | val kAccountingNumeric : string prop 1143 | val kAlternateHanYu : string prop 1144 | val kAlternateJEF : string prop 1145 | val kAlternateKangXi : string prop 1146 | val kAlternateMorohashi : string prop 1147 | val kAlternateTotalStrokes : string prop 1148 | val kBigFive : string prop 1149 | val kCCCII : string prop 1150 | val kCNS1986 : string prop 1151 | val kCNS1992 : string prop 1152 | val kCangjie : string prop 1153 | val kCantonese : string prop 1154 | val kCheungBauer : string prop 1155 | val kCheungBauerIndex : string prop 1156 | val kCihaiT : string prop 1157 | val kCompatibilityVariant : string prop 1158 | val kCowles : string prop 1159 | val kDaeJaweon : string prop 1160 | val kDefinition : string prop 1161 | val kEACC : string prop 1162 | val kFanqie : string prop 1163 | val kFenn : string prop 1164 | val kFennIndex : string prop 1165 | val kFourCornerCode : string prop 1166 | val kFrequency : string prop 1167 | val kGB0 : string prop 1168 | val kGB1 : string prop 1169 | val kGB3 : string prop 1170 | val kGB5 : string prop 1171 | val kGB8 : string 
prop 1172 | val kGSR : string prop 1173 | val kGradeLevel : string prop 1174 | val kHDZRadBreak : string prop 1175 | val kHKGlyph : string prop 1176 | val kHKSCS : string prop 1177 | val kHanYu : string prop 1178 | val kHangul : string prop 1179 | val kHanyuPinlu : string prop 1180 | val kHanyuPinyin : string prop 1181 | val kIBMJapan : string prop 1182 | val kIICore : string prop 1183 | val kIRGDaeJaweon : string prop 1184 | val kIRGDaiKanwaZiten : string prop 1185 | val kIRGHanyuDaZidian : string prop 1186 | val kIRGKangXi : string prop 1187 | val kIRG_GSource : string prop 1188 | val kIRG_HSource : string prop 1189 | val kIRG_JSource : string prop 1190 | val kIRG_KPSource : string prop 1191 | val kIRG_KSource : string prop 1192 | val kIRG_MSource : string prop 1193 | val kIRG_SSource : string prop 1194 | val kIRG_TSource : string prop 1195 | val kIRG_USource : string prop 1196 | val kIRG_UKSource : string prop 1197 | val kIRG_VSource : string prop 1198 | val kJapanese : string prop 1199 | val kJapaneseKun : string prop 1200 | val kJapaneseOn : string prop 1201 | val kJHJ : string prop 1202 | val kJIS0213 : string prop 1203 | val kJinmeiyoKanji : string prop 1204 | val kJis0 : string prop 1205 | val kJis1 : string prop 1206 | val kJoyoKanji : string prop 1207 | val kKPS0 : string prop 1208 | val kKPS1 : string prop 1209 | val kKSC0 : string prop 1210 | val kKSC1 : string prop 1211 | val kKangXi : string prop 1212 | val kKarlgren : string prop 1213 | val kKorean : string prop 1214 | val kKoreanEducationHanja : string prop 1215 | val kKoreanName : string prop 1216 | val kLau : string prop 1217 | val kMainlandTelegraph : string prop 1218 | val kMandarin : string prop 1219 | val kMatthews : string prop 1220 | val kMeyerWempe : string prop 1221 | val kMojiJoho : string prop 1222 | val kMorohashi : string prop 1223 | val kNelson : string prop 1224 | val kNSHU_DubenSrc : string prop 1225 | val kNSHU_Reading : string prop 1226 | val kOtherNumeric : string prop 1227 | val 
kPhonetic : string prop 1228 | val kPrimaryNumeric : string prop 1229 | val kPseudoGB1 : string prop 1230 | val kRSAdobe_Japan1_6 : string prop 1231 | val kRSJapanese : string prop 1232 | val kRSKanWa : string prop 1233 | val kRSKangXi : string prop 1234 | val kRSKorean : string prop 1235 | val kRSMerged : string prop 1236 | val kRSUnicode : string prop 1237 | val kSBGY : string prop 1238 | val kSemanticVariant : string prop 1239 | val kSimplifiedVariant : string prop 1240 | val kSMSZD2003Index : string prop 1241 | val kSMSZD2003Readings : string prop 1242 | val kSpecializedSemanticVariant : string prop 1243 | val kSpoofingVariant : string prop 1244 | val kStrange : string prop 1245 | val kUnihanCore2020 : string prop 1246 | val kTGH : string prop 1247 | val kTGHZ2013 : string prop 1248 | val kTGT_MergedSrc : string prop 1249 | val kTGT_RSUnicode : string prop 1250 | val kTaiwanTelegraph : string prop 1251 | val kTang : string prop 1252 | val kTayNumeric : string prop 1253 | val kTotalStrokes : string prop 1254 | val kTraditionalVariant : string prop 1255 | val kVietnamese : string prop 1256 | val kVietnameseNumeric : string prop 1257 | val kWubi : string prop 1258 | val kXHC1983 : string prop 1259 | val kZhuang : string prop 1260 | val kXerox : string prop 1261 | val kZhuangNumeric : string prop 1262 | val kZVariant : string prop 1263 | 1264 | (** {1:unikemet Unikemet properties} *) 1265 | 1266 | val kEH_Cat : string prop 1267 | val kEH_Core : string prop 1268 | val kEH_Desc : string prop 1269 | val kEH_Func : string prop 1270 | val kEH_FVal : string prop 1271 | val kEH_UniK : string prop 1272 | val kEH_JSesh : string prop 1273 | val kEH_HG : string prop 1274 | val kEH_IFAO : string prop 1275 | val kEH_NoMirror : bool prop 1276 | val kEH_NoRotate : bool prop 1277 | val kEH_AltSeq : string prop 1278 | 1279 | (** {1:db Unicode character databases} *) 1280 | 1281 | type block = (cp * cp) * string 1282 | (** The type for blocks. Code point range, name of the block. 
*) 1283 | 1284 | type named_sequence = string * cp list 1285 | (** The type for named sequences. Sequence name, code point sequence. *) 1286 | 1287 | type standardized_variant = 1288 | cp list * string * [ `Isolate | `Initial | `Medial | `Final ] list 1289 | (** The type for standardized variants. Code point sequence, 1290 | description, when. *) 1291 | 1292 | type cjk_radical = string * cp * cp 1293 | (** The type for CJK radicals. Radical number, CJK radical character, 1294 | CJK unified ideograph. *) 1295 | 1296 | type do_not_emit = { instead_of : cp list; use : cp list; because : string; } 1297 | (** The type for do not emit character sequences. *) 1298 | 1299 | type t = 1300 | { description : string; 1301 | repertoire : props Cpmap.t; 1302 | blocks : block list; 1303 | named_sequences : named_sequence list; 1304 | provisional_named_sequences : named_sequence list; 1305 | standardized_variants : standardized_variant list; 1306 | cjk_radicals : cjk_radical list; 1307 | do_not_emit : do_not_emit list 1308 | } 1309 | (** The type for Unicode character databases. 1310 | 1311 | {b Note.} Absence of an optional top-level field in the database 1312 | is denoted by the neutral element of its type (empty string, empty 1313 | list, {!Cpmap.empty}). This means that the module doesn't 1314 | distinguish between absence of a field and presence of the field 1315 | with empty data (but incurs no problems in this context). *) 1316 | 1317 | val cp_prop : t -> cp -> 'a prop -> 'a option 1318 | (** [cp_prop ucd cp p] is the property [p] of the code point [cp] 1319 | in [ucd]'s repertoire, if [cp] is in the repertoire and the property 1320 | exists for [cp]. *) 1321 | 1322 | (** {1:decoder Decode} *) 1323 | 1324 | type src = [ `Channel of in_channel | `String of string ] 1325 | (** The type for input sources. *) 1326 | 1327 | type decoder 1328 | (** The type for Unicode character database decoders.
*) 1329 | 1330 | val decoder : [< src] -> decoder 1331 | (** [decoder src] is a decoder that inputs from [src]. *) 1332 | 1333 | val decode : decoder -> [`Ok of t | `Error of string ] 1334 | (** [decode d] decodes a database from [d] or returns an error. *) 1335 | 1336 | val decoded_range : decoder -> (int * int) * (int * int) 1337 | (** [decoded_range d] is the range of characters spanning the [`Error] 1338 | decoded by [d]. A pair of line and column numbers respectively one and 1339 | zero based. *) 1340 | 1341 | (** {1:basics Basics} 1342 | 1343 | The database and subsets of it for Unicode %%UNICODE_VERSION%% are 1344 | available 1345 | {{:http://www.unicode.org/Public/%%UNICODE_VERSION%%/ucdxml/}here}. 1346 | Databases with groups should be preferred, they maximize value 1347 | sharing and improve parsing performance. 1348 | 1349 | A database is decoded as follows: 1350 | {[ 1351 | let ucd_or_die inf = try 1352 | let ic = if inf = "-" then stdin else open_in inf in 1353 | let d = Uucd.decoder (`Channel ic) in 1354 | match Uucd.decode d with 1355 | | `Ok db -> db 1356 | | `Error e -> 1357 | let (l0, c0), (l1, c1) = Uucd.decoded_range d in 1358 | Printf.eprintf "%s:%d.%d-%d.%d: %s\n%!" inf l0 c0 l1 c1 e; 1359 | exit 1 1360 | with Sys_error e -> Printf.eprintf "%s\n%!" e; exit 1 1361 | 1362 | let ucd = ucd_or_die "/tmp/ucd.all.grouped.xml" 1363 | ]} 1364 | The convenience function {!cp_prop} can be used to query 1365 | the property of a given code point. For example the 1366 | {{!general_category}general category} of [U+1F42B] 1367 | is given by: 1368 | {[ 1369 | let u_1F42B_gc = Uucd.cp_prop ucd 0x1F42B Uucd.general_category 1370 | ]} 1371 | *) 1372 | -------------------------------------------------------------------------------- /src/uucd.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2012 The uucd programmers. 
All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let str = Printf.sprintf (* shorthand for Printf.sprintf *) 7 | let str_of_name (u,l) = str "{%s}%s" u l (* renders the pair (u, l) as {u}l *) 8 | let split_string s sep = (* pieces of s delimited by the character sep, in left to right order, empty pieces dropped *) 9 | let rec split accum j = (* scans s leftwards from index j, consing each piece onto accum *) 10 | let i = try (String.rindex_from s j sep) with Not_found -> -1 in (* -1: no sep at or before index j *) 11 | if (i = -1) then (* no sep left, the prefix s.[0..j] is the last piece *) 12 | let p = String.sub s 0 (j + 1) in 13 | if p <> "" then p :: accum else accum 14 | else 15 | let p = String.sub s (i + 1) (j - i) in (* the piece strictly between index i and index j + 1 *) 16 | let accum' = if p <> "" then p :: accum else accum in 17 | split accum' (i - 1) 18 | in 19 | split [] (String.length s - 1) 20 | 21 | (* Error messages *) 22 | 23 | let err s = failwith s (* raises Failure with message s *) 24 | let err_data = "character data not allowed here" 25 | let err_exp_el_end = "expected end of element" 26 | let err_exp_data = "expected character data" 27 | let err_wf = "document not well formed" 28 | let err_dup n = str "duplicate element (%s)" (str_of_name n) 29 | let err_miss_att n = str "missing attribute (%s)" n 30 | let err_att_val v = str "invalid attribute value (\"%s\")" v 31 | let err_invalid_cp v = str "invalid code point (\"%s\")" v 32 | let err_empty_cps = "empty code point sequence" 33 | let err_exp_ucd fnd = str "expected ucd element found %s" (str_of_name fnd) 34 | let err_invalid_cp_spec = str "invalid code point specification" (* no format arguments: str returns the literal unchanged *) 35 | let err_invalid_name_alias_spec = str "invalid name alias specification" (* same remark as for err_invalid_cp_spec *) 36 | 37 | (* Code points *) 38 | 39 | module Cp = struct 40 | type t = int 41 | let compare : int -> int -> int = compare (* compare constrained to the type int *) 42 | end 43 | 44 | type cp = Cp.t 45 | 46 | let is_cp i = 0x0000 <= i && i <= 0x10_FFFF (* true iff i lies in 0x0000..0x10FFFF *) 47 | let is_scalar_value i = (* true iff i lies in 0x0000..0xD7FF or in 0xE000..0x10FFFF *) 48 | (0x0000 <= i && i <= 0xD7FF) || (0xE000 <= i && i <= 0x10FFFF) 49 | 50 | let cp_of_string v = (* parses a code point value.
*) 51 | let is_hex c = (0x30 <= c && c <= 0x39) || (0x41 <= c && c <= 0x46) in (* accepts the byte codes of 0-9 and of uppercase A-F only *) 52 | let cp = ref 0 in (* value accumulated so far, most significant digit first *) 53 | for k = 0 to (String.length v) - 1 do 54 | let c = Char.code v.[k] in 55 | if not (is_hex c) then err (err_invalid_cp v) else 56 | cp := !cp * 16 + (if c <= 0x39 then c - 48 else c - 55) (* 48 is the code of the digit 0; subtracting 55 maps A (65) to 10 *) 57 | done; 58 | if is_cp !cp then !cp else err (err_invalid_cp v) (* fails via err above 0x10FFFF. NOTE(review): an empty v skips the loop and yields 0 - confirm callers never pass an empty string *) 59 | 60 | let cps_of_string ?(empty = false) v = (* parses a code point sequence value. *) 61 | if (v = "") then (if empty then [] else err err_empty_cps) else (* an empty v is an error unless empty is true, in which case the result is the empty list *) 62 | List.map cp_of_string (split_string v ' ') (* pieces are separated by the space character; split_string drops empty pieces *) 63 | 64 | module Cpmap = Map.Make (Cp) 65 | 66 | (* Properties *) 67 | 68 | type key = (* the type for property keys (names). *) 69 | | Age 70 | | Alphabetic 71 | | Ascii_hex_digit 72 | | Bidi_class 73 | | Bidi_control 74 | | Bidi_mirrored 75 | | Bidi_mirroring_glyph 76 | | Bidi_paired_bracket 77 | | Bidi_paired_bracket_type 78 | | Block 79 | | Canonical_combining_class 80 | | Cased 81 | | Case_folding 82 | | Case_ignorable 83 | | Changes_when_casefolded 84 | | Changes_when_casemapped 85 | | Changes_when_lowercased 86 | | Changes_when_nfkc_casefolded 87 | | Changes_when_titlecased 88 | | Changes_when_uppercased 89 | | Composition_exclusion 90 | | Dash 91 | | Decomposition_mapping 92 | | Decomposition_type 93 | | Default_ignorable_code_point 94 | | Deprecated 95 | | Diacritic 96 | | East_asian_width 97 | | Emoji 98 | | Emoji_presentation 99 | | Emoji_modifier 100 | | Emoji_modifier_base 101 | | Emoji_component 102 | | Equivalent_unified_ideograph 103 | | Extender 104 | | Extended_pictographic 105 | | Full_composition_exclusion 106 | | General_category 107 | | Grapheme_base 108 | | Grapheme_cluster_break 109 | | Grapheme_extend 110 | | Hangul_syllable_type 111 | | Hex_digit 112 | | Id_continue 113 | | Id_compat_math_continue 114 | | Id_compat_math_start 115 | | Id_start 116 | | Ideographic 117 | | Ids_binary_operator 118 | | Ids_trinary_operator 119 | | Ids_unary_operator 120 | | Indic_conjunct_break 121 | |
Indic_syllabic_category 122 | | Indic_matra_category 123 | | Indic_positional_category 124 | | Jamo_short_name 125 | | Join_control 126 | | Joining_group 127 | | Joining_type 128 | | Line_break 129 | | Logical_order_exception 130 | | Lowercase 131 | | Lowercase_mapping 132 | | Math 133 | | Modifier_combining_mark 134 | | Name 135 | | Name_alias 136 | | Nfc_quick_check 137 | | Nfd_quick_check 138 | | Nfkc_quick_check 139 | | Nfkc_casefold 140 | | Nfkc_simple_casefold 141 | | Nfkd_quick_check 142 | | Noncharacter_code_point 143 | | Numeric_type 144 | | Numeric_value 145 | | Other_alphabetic 146 | | Other_default_ignorable_code_point 147 | | Other_grapheme_extend 148 | | Other_id_continue 149 | | Other_id_start 150 | | Other_lowercase 151 | | Other_math 152 | | Other_uppercase 153 | | Pattern_syntax 154 | | Pattern_white_space 155 | | Prepended_concatenation_mark 156 | | Quotation_mark 157 | | Radical 158 | | Regional_indicator 159 | | Script 160 | | Script_extensions 161 | | Sentence_break 162 | | Simple_case_folding 163 | | Simple_lowercase_mapping 164 | | Simple_titlecase_mapping 165 | | Simple_uppercase_mapping 166 | | Soft_dotted 167 | | Sterm 168 | | Terminal_punctuation 169 | | Titlecase_mapping 170 | | UAX_42_element 171 | | Unicode_1_name 172 | | Unified_ideograph 173 | | Uppercase 174 | | Uppercase_mapping 175 | | Variation_selector 176 | | Vertical_orientation 177 | | White_space 178 | | Word_break 179 | | Xid_continue 180 | | Xid_start 181 | (* Unihan *) 182 | | KAccountingNumeric 183 | | KAlternateHanYu 184 | | KAlternateJEF 185 | | KAlternateKangXi 186 | | KAlternateMorohashi 187 | | KAlternateTotalStrokes 188 | | KBigFive 189 | | KCCCII 190 | | KCNS1986 191 | | KCNS1992 192 | | KCangjie 193 | | KCantonese 194 | | KCheungBauer 195 | | KCheungBauerIndex 196 | | KCihaiT 197 | | KCompatibilityVariant 198 | | KCowles 199 | | KDaeJaweon 200 | | KDefinition 201 | | KEACC 202 | | KFanqie 203 | | KFenn 204 | | KFennIndex 205 | | KFourCornerCode 206 | | 
KFrequency 207 | | KGB0 208 | | KGB1 209 | | KGB3 210 | | KGB5 211 | | KGB8 212 | | KGSR 213 | | KGradeLevel 214 | | KHDZRadBreak 215 | | KHKGlyph 216 | | KHKSCS 217 | | KHanYu 218 | | KHangul 219 | | KHanyuPinlu 220 | | KHanyuPinyin 221 | | KIBMJapan 222 | | KIICore 223 | | KIRGDaeJaweon 224 | | KIRGDaiKanwaZiten 225 | | KIRGHanyuDaZidian 226 | | KIRGKangXi 227 | | KIRG_GSource 228 | | KIRG_HSource 229 | | KIRG_JSource 230 | | KIRG_KPSource 231 | | KIRG_KSource 232 | | KIRG_MSource 233 | | KIRG_SSource 234 | | KIRG_TSource 235 | | KIRG_USource 236 | | KIRG_UKSource 237 | | KIRG_VSource 238 | | KJapanese 239 | | KJHJ 240 | | KJIS0213 241 | | KJapaneseKun 242 | | KJapaneseOn 243 | | KJinmeiyoKanji 244 | | KJis0 245 | | KJis1 246 | | KJoyoKanji 247 | | KKPS0 248 | | KKPS1 249 | | KKSC0 250 | | KKSC1 251 | | KKangXi 252 | | KKarlgren 253 | | KKorean 254 | | KKoreanEducationHanja 255 | | KKoreanName 256 | | KLau 257 | | KMainlandTelegraph 258 | | KMandarin 259 | | KMatthews 260 | | KMeyerWempe 261 | | KMojiJoho 262 | | KMorohashi 263 | | KNelson 264 | | KNSHU_DubenSrc 265 | | KNSHU_Reading 266 | | KOtherNumeric 267 | | KPhonetic 268 | | KPrimaryNumeric 269 | | KPseudoGB1 270 | | KRSAdobe_Japan1_6 271 | | KRSJapanese 272 | | KRSKanWa 273 | | KRSKangXi 274 | | KRSKorean 275 | | KRSMerged 276 | | KRSUnicode 277 | | KSBGY 278 | | KSemanticVariant 279 | | KSimplifiedVariant 280 | | KSMSZD2003Index 281 | | KSMSZD2003Readings 282 | | KSpecializedSemanticVariant 283 | | KSpoofingVariant 284 | | KStrange 285 | | KTGH 286 | | KTGHZ2013 287 | | KTGT_MergedSrc 288 | | KTGT_RSUnicode 289 | | KTaiwanTelegraph 290 | | KTang 291 | | KTayNumeric 292 | | KTotalStrokes 293 | | KTraditionalVariant 294 | | KUnihanCore2020 295 | | KVietnamese 296 | | KVietnameseNumeric 297 | | KWubi 298 | | KXHC1983 299 | | KXerox 300 | | KZhuang 301 | | KZhuangNumeric 302 | | KZVariant 303 | (* Unikemet *) 304 | | KEH_Cat 305 | | KEH_Core 306 | | KEH_Desc 307 | | KEH_Func 308 | | KEH_FVal 309 | | KEH_UniK 
310 | | KEH_JSesh 311 | | KEH_HG 312 | | KEH_IFAO 313 | | KEH_NoMirror 314 | | KEH_NoRotate 315 | | KEH_AltSeq 316 | | Other of (string * string) (* expanded XML name. *) 317 | 318 | type script = [ 319 | | `Adlm 320 | | `Aghb 321 | | `Ahom 322 | | `Arab 323 | | `Armi 324 | | `Armn 325 | | `Avst 326 | | `Bali 327 | | `Bamu 328 | | `Bass 329 | | `Batk 330 | | `Beng 331 | | `Berf 332 | | `Bhks 333 | | `Bopo 334 | | `Brah 335 | | `Brai 336 | | `Bugi 337 | | `Buhd 338 | | `Cakm 339 | | `Cans 340 | | `Cari 341 | | `Cham 342 | | `Cher 343 | | `Chrs 344 | | `Copt 345 | | `Cpmn 346 | | `Cprt 347 | | `Cyrl 348 | | `Deva 349 | | `Diak 350 | | `Dogr 351 | | `Dsrt 352 | | `Dupl 353 | | `Egyp 354 | | `Elba 355 | | `Elym 356 | | `Ethi 357 | | `Gara 358 | | `Geor 359 | | `Glag 360 | | `Gong 361 | | `Gonm 362 | | `Goth 363 | | `Gran 364 | | `Grek 365 | | `Gujr 366 | | `Gukh 367 | | `Guru 368 | | `Hang 369 | | `Hani 370 | | `Hano 371 | | `Hatr 372 | | `Hebr 373 | | `Hira 374 | | `Hluw 375 | | `Hmng 376 | | `Hmnp 377 | | `Hrkt 378 | | `Hung 379 | | `Ital 380 | | `Java 381 | | `Kali 382 | | `Kana 383 | | `Kawi 384 | | `Khar 385 | | `Khmr 386 | | `Khoj 387 | | `Knda 388 | | `Krai 389 | | `Kthi 390 | | `Kits 391 | | `Lana 392 | | `Laoo 393 | | `Latn 394 | | `Lepc 395 | | `Limb 396 | | `Lina 397 | | `Linb 398 | | `Lisu 399 | | `Lyci 400 | | `Lydi 401 | | `Mahj 402 | | `Maka 403 | | `Mand 404 | | `Mani 405 | | `Marc 406 | | `Medf 407 | | `Mend 408 | | `Merc 409 | | `Mero 410 | | `Mlym 411 | | `Modi 412 | | `Mong 413 | | `Mroo 414 | | `Mtei 415 | | `Mult 416 | | `Mymr 417 | | `Nagm 418 | | `Nand 419 | | `Narb 420 | | `Nbat 421 | | `Newa 422 | | `Nkoo 423 | | `Nshu 424 | | `Ogam 425 | | `Olck 426 | | `Onao 427 | | `Orkh 428 | | `Orya 429 | | `Osge 430 | | `Osma 431 | | `Ougr 432 | | `Palm 433 | | `Pauc 434 | | `Perm 435 | | `Phag 436 | | `Phli 437 | | `Phlp 438 | | `Phnx 439 | | `Plrd 440 | | `Prti 441 | | `Qaai 442 | | `Rjng 443 | | `Rohg 444 | | `Runr 445 | | `Samr 446 | | `Sarb 447 | | 
`Saur 448 | | `Sgnw 449 | | `Shaw 450 | | `Shrd 451 | | `Sidd 452 | | `Sidt 453 | | `Sind 454 | | `Sinh 455 | | `Sogd 456 | | `Sogo 457 | | `Sora 458 | | `Soyo 459 | | `Sund 460 | | `Sunu 461 | | `Sylo 462 | | `Syrc 463 | | `Tagb 464 | | `Takr 465 | | `Tale 466 | | `Talu 467 | | `Taml 468 | | `Tang 469 | | `Tavt 470 | | `Tayo 471 | | `Telu 472 | | `Tfng 473 | | `Tglg 474 | | `Thaa 475 | | `Thai 476 | | `Tibt 477 | | `Tirh 478 | | `Tnsa 479 | | `Todr 480 | | `Tols 481 | | `Toto 482 | | `Tutg 483 | | `Ugar 484 | | `Vaii 485 | | `Vith 486 | | `Wara 487 | | `Wcho 488 | | `Xpeo 489 | | `Xsux 490 | | `Yezi 491 | | `Yiii 492 | | `Zanb 493 | | `Zinh 494 | | `Zyyy 495 | | `Zzzz 496 | ] 497 | 498 | type block_prop = [ 499 | | `ASCII 500 | | `Adlam 501 | | `Aegean_Numbers 502 | | `Ahom 503 | | `Alchemical 504 | | `Alphabetic_PF 505 | | `Anatolian_Hieroglyphs 506 | | `Ancient_Greek_Music 507 | | `Ancient_Greek_Numbers 508 | | `Ancient_Symbols 509 | | `Arabic 510 | | `Arabic_Ext_A 511 | | `Arabic_Ext_B 512 | | `Arabic_Ext_C 513 | | `Arabic_Math 514 | | `Arabic_PF_A 515 | | `Arabic_PF_B 516 | | `Arabic_Sup 517 | | `Armenian 518 | | `Arrows 519 | | `Avestan 520 | | `Balinese 521 | | `Bamum 522 | | `Bamum_Sup 523 | | `Bassa_Vah 524 | | `Batak 525 | | `Bengali 526 | | `Beria_Erfe 527 | | `Bhaiksuki 528 | | `Block_Elements 529 | | `Bopomofo 530 | | `Bopomofo_Ext 531 | | `Box_Drawing 532 | | `Brahmi 533 | | `Braille 534 | | `Buginese 535 | | `Buhid 536 | | `Byzantine_Music 537 | | `CJK 538 | | `CJK_Compat 539 | | `CJK_Compat_Forms 540 | | `CJK_Compat_Ideographs 541 | | `CJK_Compat_Ideographs_Sup 542 | | `CJK_Ext_A 543 | | `CJK_Ext_B 544 | | `CJK_Ext_C 545 | | `CJK_Ext_D 546 | | `CJK_Ext_E 547 | | `CJK_Ext_F 548 | | `CJK_Ext_G 549 | | `CJK_Ext_H 550 | | `CJK_Ext_I 551 | | `CJK_Ext_J 552 | | `CJK_Radicals_Sup 553 | | `CJK_Strokes 554 | | `CJK_Symbols 555 | | `Carian 556 | | `Caucasian_Albanian 557 | | `Chakma 558 | | `Cham 559 | | `Cherokee 560 | | `Cherokee_Sup 561 | | `Chess_Symbols 
562 | | `Chorasmian 563 | | `Compat_Jamo 564 | | `Control_Pictures 565 | | `Coptic 566 | | `Coptic_Epact_Numbers 567 | | `Counting_Rod 568 | | `Cuneiform 569 | | `Cuneiform_Numbers 570 | | `Currency_Symbols 571 | | `Cypriot_Syllabary 572 | | `Cypro_Minoan 573 | | `Cyrillic 574 | | `Cyrillic_Ext_A 575 | | `Cyrillic_Ext_B 576 | | `Cyrillic_Ext_C 577 | | `Cyrillic_Ext_D 578 | | `Cyrillic_Sup 579 | | `Deseret 580 | | `Devanagari 581 | | `Devanagari_Ext 582 | | `Devanagari_Ext_A 583 | | `Diacriticals 584 | | `Diacriticals_Ext 585 | | `Diacriticals_For_Symbols 586 | | `Diacriticals_Sup 587 | | `Dingbats 588 | | `Dives_Akuru 589 | | `Dogra 590 | | `Domino 591 | | `Duployan 592 | | `Early_Dynastic_Cuneiform 593 | | `Egyptian_Hieroglyph_Format_Controls 594 | | `Egyptian_Hieroglyphs 595 | | `Egyptian_Hieroglyphs_Ext_A 596 | | `Elbasan 597 | | `Elymaic 598 | | `Emoticons 599 | | `Enclosed_Alphanum 600 | | `Enclosed_Alphanum_Sup 601 | | `Enclosed_CJK 602 | | `Enclosed_Ideographic_Sup 603 | | `Ethiopic 604 | | `Ethiopic_Ext 605 | | `Ethiopic_Ext_A 606 | | `Ethiopic_Ext_B 607 | | `Ethiopic_Sup 608 | | `Garay 609 | | `Geometric_Shapes 610 | | `Geometric_Shapes_Ext 611 | | `Georgian 612 | | `Georgian_Ext 613 | | `Georgian_Sup 614 | | `Glagolitic 615 | | `Glagolitic_Sup 616 | | `Gothic 617 | | `Grantha 618 | | `Greek 619 | | `Greek_Ext 620 | | `Gujarati 621 | | `Gunjala_Gondi 622 | | `Gurmukhi 623 | | `Gurung_Khema 624 | | `Half_And_Full_Forms 625 | | `Half_Marks 626 | | `Hangul 627 | | `Hanifi_Rohingya 628 | | `Hanunoo 629 | | `Hatran 630 | | `Hebrew 631 | | `High_PU_Surrogates 632 | | `High_Surrogates 633 | | `Hiragana 634 | | `IDC 635 | | `IPA_Ext 636 | | `Ideographic_Symbols 637 | | `Imperial_Aramaic 638 | | `Indic_Number_Forms 639 | | `Indic_Siyaq_Numbers 640 | | `Inscriptional_Pahlavi 641 | | `Inscriptional_Parthian 642 | | `Jamo 643 | | `Jamo_Ext_A 644 | | `Jamo_Ext_B 645 | | `Javanese 646 | | `Kaithi 647 | | `Kaktovik_Numerals 648 | | `Kana_Ext_A 649 | | `Kana_Ext_B 650 | | 
`Kana_Sup 651 | | `Kanbun 652 | | `Kangxi 653 | | `Kannada 654 | | `Katakana 655 | | `Katakana_Ext 656 | | `Kawi 657 | | `Kayah_Li 658 | | `Kharoshthi 659 | | `Khitan_Small_Script 660 | | `Khmer 661 | | `Khmer_Symbols 662 | | `Khojki 663 | | `Khudawadi 664 | | `Kirat_Rai 665 | | `Lao 666 | | `Latin_1_Sup 667 | | `Latin_Ext_A 668 | | `Latin_Ext_Additional 669 | | `Latin_Ext_B 670 | | `Latin_Ext_C 671 | | `Latin_Ext_D 672 | | `Latin_Ext_E 673 | | `Latin_Ext_F 674 | | `Latin_Ext_G 675 | | `Lepcha 676 | | `Letterlike_Symbols 677 | | `Limbu 678 | | `Linear_A 679 | | `Linear_B_Ideograms 680 | | `Linear_B_Syllabary 681 | | `Lisu 682 | | `Lisu_Sup 683 | | `Low_Surrogates 684 | | `Lycian 685 | | `Lydian 686 | | `Mahajani 687 | | `Mahjong 688 | | `Makasar 689 | | `Malayalam 690 | | `Mandaic 691 | | `Manichaean 692 | | `Marchen 693 | | `Masaram_Gondi 694 | | `Math_Alphanum 695 | | `Math_Operators 696 | | `Mayan_Numerals 697 | | `Medefaidrin 698 | | `Meetei_Mayek 699 | | `Meetei_Mayek_Ext 700 | | `Mende_Kikakui 701 | | `Meroitic_Cursive 702 | | `Meroitic_Hieroglyphs 703 | | `Miao 704 | | `Misc_Arrows 705 | | `Misc_Math_Symbols_A 706 | | `Misc_Math_Symbols_B 707 | | `Misc_Pictographs 708 | | `Misc_Symbols 709 | | `Misc_Symbols_Sup 710 | | `Misc_Technical 711 | | `Modi 712 | | `Modifier_Letters 713 | | `Modifier_Tone_Letters 714 | | `Mongolian 715 | | `Mongolian_Sup 716 | | `Mro 717 | | `Multani 718 | | `Music 719 | | `Myanmar 720 | | `Myanmar_Ext_A 721 | | `Myanmar_Ext_B 722 | | `Myanmar_Ext_C 723 | | `NB 724 | | `NKo 725 | | `Nabataean 726 | | `Nag_Mundari 727 | | `Nandinagari 728 | | `New_Tai_Lue 729 | | `Newa 730 | | `Number_Forms 731 | | `Nushu 732 | | `Nyiakeng_Puachue_Hmong 733 | | `OCR 734 | | `Ogham 735 | | `Ol_Onal 736 | | `Ol_Chiki 737 | | `Old_Hungarian 738 | | `Old_Italic 739 | | `Old_North_Arabian 740 | | `Old_Permic 741 | | `Old_Persian 742 | | `Old_Sogdian 743 | | `Old_South_Arabian 744 | | `Old_Turkic 745 | | `Old_Uyghur 746 | | `Oriya 747 | | 
`Ornamental_Dingbats 748 | | `Osage 749 | | `Osmanya 750 | | `Ottoman_Siyaq_Numbers 751 | | `PUA 752 | | `Pahawh_Hmong 753 | | `Palmyrene 754 | | `Pau_Cin_Hau 755 | | `Phags_Pa 756 | | `Phaistos 757 | | `Phoenician 758 | | `Phonetic_Ext 759 | | `Phonetic_Ext_Sup 760 | | `Playing_Cards 761 | | `Psalter_Pahlavi 762 | | `Punctuation 763 | | `Rejang 764 | | `Rumi 765 | | `Runic 766 | | `Samaritan 767 | | `Saurashtra 768 | | `Sharada 769 | | `Sharada_Sup 770 | | `Shavian 771 | | `Shorthand_Format_Controls 772 | | `Siddham 773 | | `Sidetic 774 | | `Sinhala 775 | | `Sinhala_Archaic_Numbers 776 | | `Small_Forms 777 | | `Small_Kana_Ext 778 | | `Sogdian 779 | | `Sora_Sompeng 780 | | `Soyombo 781 | | `Specials 782 | | `Sundanese 783 | | `Sundanese_Sup 784 | | `Sunuwar 785 | | `Sup_Arrows_A 786 | | `Sup_Arrows_B 787 | | `Sup_Arrows_C 788 | | `Sup_Math_Operators 789 | | `Sup_PUA_A 790 | | `Sup_PUA_B 791 | | `Sup_Punctuation 792 | | `Sup_Symbols_And_Pictographs 793 | | `Super_And_Sub 794 | | `Sutton_SignWriting 795 | | `Syloti_Nagri 796 | | `Symbols_And_Pictographs_Ext_A 797 | | `Symbols_For_Legacy_Computing 798 | | `Symbols_For_Legacy_Computing_Sup 799 | | `Syriac 800 | | `Syriac_Sup 801 | | `Tagalog 802 | | `Tagbanwa 803 | | `Tags 804 | | `Tai_Le 805 | | `Tai_Tham 806 | | `Tai_Viet 807 | | `Tai_Xuan_Jing 808 | | `Tai_Yo 809 | | `Takri 810 | | `Tamil 811 | | `Tamil_Sup 812 | | `Tangsa 813 | | `Tangut 814 | | `Tangut_Components 815 | | `Tangut_Components_Sup 816 | | `Tangut_Sup 817 | | `Telugu 818 | | `Thaana 819 | | `Thai 820 | | `Tibetan 821 | | `Tifinagh 822 | | `Tirhuta 823 | | `Todhri 824 | | `Tolong_Siki 825 | | `Toto 826 | | `Transport_And_Map 827 | | `Tulu_Tigalari 828 | | `UCAS 829 | | `UCAS_Ext 830 | | `UCAS_Ext_A 831 | | `Ugaritic 832 | | `VS 833 | | `VS_Sup 834 | | `Vai 835 | | `Vedic_Ext 836 | | `Vertical_Forms 837 | | `Vithkuqi 838 | | `Wancho 839 | | `Warang_Citi 840 | | `Yezidi 841 | | `Yi_Radicals 842 | | `Yi_Syllables 843 | | `Yijing 844 | | `Zanabazar_Square 
845 | | `Znamenny_Music 846 | ] 847 | 848 | type value = (* the type for property values. *) 849 | | Age_v of [ `Version of int * int | `Unassigned ] 850 | | Block_v of block_prop 851 | | Bidi_class_v of [ 852 | | `AL 853 | | `AN 854 | | `B 855 | | `BN 856 | | `CS 857 | | `EN 858 | | `ES 859 | | `ET 860 | | `L 861 | | `LRE 862 | | `LRO 863 | | `NSM 864 | | `ON 865 | | `PDF 866 | | `R 867 | | `RLE 868 | | `RLO 869 | | `S 870 | | `WS 871 | | `LRI 872 | | `RLI 873 | | `FSI 874 | | `PDI 875 | ] 876 | | Bidi_paired_bracket_type_v of [ `O | `C | `N ] 877 | | Bool_v of bool 878 | | Bool_maybe_v of [ `True | `False | `Maybe ] 879 | | Cp_v of cp 880 | | Cp_map_v of [ `Self | `Cp of cp ] 881 | | Cp_opt_v of cp option 882 | | Decomposition_type_v of [ 883 | | `Can 884 | | `Com 885 | | `Enc 886 | | `Fin 887 | | `Font 888 | | `Fra 889 | | `Init 890 | | `Iso 891 | | `Med 892 | | `Nar 893 | | `Nb 894 | | `Sml 895 | | `Sqr 896 | | `Sub 897 | | `Sup 898 | | `Vert 899 | | `Wide 900 | | `None 901 | ] 902 | | East_asian_width_v of [ `A | `F | `H | `N | `Na | `W ] 903 | | General_category_v of [ 904 | | `Lu 905 | | `Ll 906 | | `Lt 907 | | `Lm 908 | | `Lo 909 | | `Mn 910 | | `Mc 911 | | `Me 912 | | `Nd 913 | | `Nl 914 | | `No 915 | | `Pc 916 | | `Pd 917 | | `Ps 918 | | `Pe 919 | | `Pi 920 | | `Pf 921 | | `Po 922 | | `Sm 923 | | `Sc 924 | | `Sk 925 | | `So 926 | | `Zs 927 | | `Zl 928 | | `Zp 929 | | `Cc 930 | | `Cf 931 | | `Cs 932 | | `Co 933 | | `Cn 934 | ] 935 | | Grapheme_cluster_break_v of [ 936 | | `CN 937 | | `CR 938 | | `EB 939 | | `EBG 940 | | `EM 941 | | `EX 942 | | `GAZ 943 | | `L 944 | | `LF 945 | | `LV 946 | | `LVT 947 | | `PP 948 | | `RI 949 | | `SM 950 | | `T 951 | | `V 952 | | `XX 953 | | `ZWJ ] 954 | | Hangul_syllable_type_v of [ `L | `LV | `LVT | `T | `V | `NA ] 955 | | Int_v of int 956 | | Indic_conjunct_break_v of 957 | [ `Consonant 958 | | `Extend 959 | | `Linker 960 | | `None ] 961 | | Indic_syllabic_category_v of 962 | [ `Avagraha 963 | | `Bindu 964 | | 
`Brahmi_Joining_Number 965 | | `Cantillation_Mark 966 | | `Consonant 967 | | `Consonant_Dead 968 | | `Consonant_Final 969 | | `Consonant_Head_Letter 970 | | `Consonant_Initial_Postfixed 971 | | `Consonant_Killer 972 | | `Consonant_Medial 973 | | `Consonant_Placeholder 974 | | `Consonant_Preceding_Repha 975 | | `Consonant_Prefixed 976 | | `Consonant_Repha 977 | | `Consonant_Subjoined 978 | | `Consonant_Succeeding_Repha 979 | | `Consonant_With_Stacker 980 | | `Gemination_Mark 981 | | `Invisible_Stacker 982 | | `Joiner 983 | | `Modifying_Letter 984 | | `Non_Joiner 985 | | `Nukta 986 | | `Number 987 | | `Number_Joiner 988 | | `Other 989 | | `Pure_Killer 990 | | `Reordering_Killer 991 | | `Register_Shifter 992 | | `Syllable_Modifier 993 | | `Tone_Letter 994 | | `Tone_Mark 995 | | `Virama 996 | | `Visarga 997 | | `Vowel 998 | | `Vowel_Dependent 999 | | `Vowel_Independent ] 1000 | | Indic_matra_category_v of [ 1001 | | `Right 1002 | | `Left 1003 | | `Visual_Order_Left 1004 | | `Left_And_Right 1005 | | `Top 1006 | | `Bottom 1007 | | `Top_And_Bottom 1008 | | `Top_And_Right 1009 | | `Top_And_Left 1010 | | `Top_And_Left_And_Right 1011 | | `Bottom_And_Right 1012 | | `Top_And_Bottom_And_Right 1013 | | `Overstruck 1014 | | `Invisible 1015 | | `NA 1016 | ] 1017 | | Indic_positional_category_v of [ 1018 | | `Bottom 1019 | | `Bottom_And_Left 1020 | | `Bottom_And_Right 1021 | | `Invisible 1022 | | `Left 1023 | | `Left_And_Right 1024 | | `NA 1025 | | `Overstruck 1026 | | `Right 1027 | | `Top 1028 | | `Top_And_Bottom 1029 | | `Top_And_Bottom_And_Left 1030 | | `Top_And_Bottom_And_Right 1031 | | `Top_And_Left 1032 | | `Top_And_Left_And_Right 1033 | | `Top_And_Right 1034 | | `Visual_Order_Left 1035 | ] 1036 | | Joining_group_v of [ 1037 | | `African_Feh 1038 | | `African_Noon 1039 | | `African_Qaf 1040 | | `Ain 1041 | | `Alaph 1042 | | `Alef 1043 | | `Alef_Maqsurah 1044 | | `Beh 1045 | | `Beth 1046 | | `Burushaski_Yeh_Barree 1047 | | `Dal 1048 | | `Dalath_Rish 1049 | | `E 1050 | | 
`Farsi_Yeh 1051 | | `Fe 1052 | | `Feh 1053 | | `Final_Semkath 1054 | | `Gaf 1055 | | `Gamal 1056 | | `Hah 1057 | | `Hanifi_Rohingya_Kinna_Ya 1058 | | `Hanifi_Rohingya_Pa 1059 | | `Hamza_On_Heh_Goal 1060 | | `He 1061 | | `Heh 1062 | | `Heh_Goal 1063 | | `Heth 1064 | | `Kaf 1065 | | `Kaph 1066 | | `Kashmiri_Yeh 1067 | | `Khaph 1068 | | `Knotted_Heh 1069 | | `Lam 1070 | | `Lamadh 1071 | | `Malayalam_Bha 1072 | | `Malayalam_Ja 1073 | | `Malayalam_Lla 1074 | | `Malayalam_Llla 1075 | | `Malayalam_Nga 1076 | | `Malayalam_Nna 1077 | | `Malayalam_Nnna 1078 | | `Malayalam_Nya 1079 | | `Malayalam_Ra 1080 | | `Malayalam_Ssa 1081 | | `Malayalam_Tta 1082 | | `Manichaean_Aleph 1083 | | `Manichaean_Ayin 1084 | | `Manichaean_Beth 1085 | | `Manichaean_Daleth 1086 | | `Manichaean_Dhamedh 1087 | | `Manichaean_Five 1088 | | `Manichaean_Gimel 1089 | | `Manichaean_Heth 1090 | | `Manichaean_Hundred 1091 | | `Manichaean_Kaph 1092 | | `Manichaean_Lamedh 1093 | | `Manichaean_Mem 1094 | | `Manichaean_Nun 1095 | | `Manichaean_One 1096 | | `Manichaean_Pe 1097 | | `Manichaean_Qoph 1098 | | `Manichaean_Resh 1099 | | `Manichaean_Sadhe 1100 | | `Manichaean_Samekh 1101 | | `Manichaean_Taw 1102 | | `Manichaean_Ten 1103 | | `Manichaean_Teth 1104 | | `Manichaean_Thamedh 1105 | | `Manichaean_Twenty 1106 | | `Manichaean_Waw 1107 | | `Manichaean_Yodh 1108 | | `Manichaean_Zayin 1109 | | `Meem 1110 | | `Mim 1111 | | `No_Joining_Group 1112 | | `Noon 1113 | | `Nun 1114 | | `Nya 1115 | | `Pe 1116 | | `Qaf 1117 | | `Qaph 1118 | | `Reh 1119 | | `Reversed_Pe 1120 | | `Rohingya_Yeh 1121 | | `Sad 1122 | | `Sadhe 1123 | | `Seen 1124 | | `Semkath 1125 | | `Shin 1126 | | `Straight_Waw 1127 | | `Swash_Kaf 1128 | | `Syriac_Waw 1129 | | `Tah 1130 | | `Taw 1131 | | `Teh_Marbuta 1132 | | `Teh_Marbuta_Goal 1133 | | `Teth 1134 | | `Thin_Noon 1135 | | `Thin_Yeh 1136 | | `Vertical_Tail 1137 | | `Waw 1138 | | `Yeh 1139 | | `Yeh_Barree 1140 | | `Yeh_With_Tail 1141 | | `Yudh 1142 | | `Yudh_He 1143 | | `Zain 1144 | | `Zhain 1145 | 
| `BAA 1146 | | `FA 1147 | | `HAA 1148 | | `HA_GOAL 1149 | | `HA 1150 | | `CAF 1151 | | `KNOTTED_HA 1152 | | `RA 1153 | | `SWASH_CAF 1154 | | `HAMZAH_ON_HA_GOAL 1155 | | `TAA_MARBUTAH 1156 | | `YA_BARREE 1157 | | `YA 1158 | | `ALEF_MAQSURAH ] 1159 | | Joining_type_v of [ `U | `C | `T | `D | `L | `R ] 1160 | | Line_break_v of [ 1161 | | `AI 1162 | | `AK 1163 | | `AL 1164 | | `AP 1165 | | `AS 1166 | | `B2 1167 | | `BA 1168 | | `BB 1169 | | `BK 1170 | | `CB 1171 | | `CJ 1172 | | `CL 1173 | | `CM 1174 | | `CP 1175 | | `CR 1176 | | `EB 1177 | | `EM 1178 | | `EX 1179 | | `GL 1180 | | `H2 1181 | | `H3 1182 | | `HH 1183 | | `HL 1184 | | `HY 1185 | | `ID 1186 | | `IN 1187 | | `IS 1188 | | `JL 1189 | | `JT 1190 | | `JV 1191 | | `LF 1192 | | `NL 1193 | | `NS 1194 | | `NU 1195 | | `OP 1196 | | `PO 1197 | | `PR 1198 | | `QU 1199 | | `RI 1200 | | `SA 1201 | | `SG 1202 | | `SP 1203 | | `SY 1204 | | `VF 1205 | | `VI 1206 | | `WJ 1207 | | `XX 1208 | | `ZW 1209 | | `ZWJ 1210 | ] 1211 | | Name_v of [`Pattern of string | `Name of string ] 1212 | | Name_alias_v of 1213 | (string * [`Abbreviation | `Alternate | `Control | `Correction | `Figment]) 1214 | list 1215 | | Numeric_type_v of [ `None | `De | `Di | `Nu ] 1216 | | Numeric_value_v of 1217 | [ `NaN | `Nums of [`Frac of int * int | `Num of int64 ] list] 1218 | | Script_v of script 1219 | | Script_extensions_v of script list 1220 | | Sentence_break_v of [ 1221 | | `AT 1222 | | `CL 1223 | | `CR 1224 | | `EX 1225 | | `FO 1226 | | `LE 1227 | | `LF 1228 | | `LO 1229 | | `NU 1230 | | `SC 1231 | | `SE 1232 | | `SP 1233 | | `ST 1234 | | `UP 1235 | | `XX 1236 | ] 1237 | | Cps_v of cp list 1238 | | Cps_map_v of [ `Self | `Cps of cp list ] 1239 | | String_v of string 1240 | | UAX_42_element_v of [ `Reserved | `Noncharacter | `Surrogate | `Char ] 1241 | | Vertical_orientation_v of [ `U | `R | `Tu | `Tr ] 1242 | | Word_break_v of [ 1243 | | `CR 1244 | | `DQ 1245 | | `EB 1246 | | `EBG 1247 | | `EM 1248 | | `EX 1249 | | `Extend 1250 | | `FO 1251 | 
| `GAZ 1252 | | `HL 1253 | | `KA 1254 | | `LE 1255 | | `LF 1256 | | `MB 1257 | | `ML 1258 | | `MN 1259 | | `NL 1260 | | `NU 1261 | | `RI 1262 | | `SQ 1263 | | `WSegSpace 1264 | | `XX 1265 | | `ZWJ 1266 | ] 1267 | 1268 | (* property value projection *) 1269 | 1270 | let o_age = function Age_v v -> v | _ -> assert false 1271 | let o_bidi_class = function Bidi_class_v v -> v | _ -> assert false 1272 | let o_bidi_paired_bracket_type = 1273 | function Bidi_paired_bracket_type_v v -> v | _ -> assert false 1274 | 1275 | let o_block = function Block_v v -> v | _ -> assert false 1276 | let o_bool = function Bool_v v -> v | _ -> assert false 1277 | let o_bool_maybe = function Bool_maybe_v v -> v | _ -> assert false 1278 | let o_cp = function Cp_v v -> v | _ -> assert false 1279 | let o_cp_map = function Cp_map_v v -> v | _ -> assert false 1280 | let o_cp_opt = function Cp_opt_v v -> v | _ -> assert false 1281 | let o_decomposition_type = 1282 | function Decomposition_type_v v -> v | _ -> assert false 1283 | 1284 | let o_east_asian_width = function East_asian_width_v v -> v | _ -> assert false 1285 | let o_general_category = function General_category_v v -> v | _ -> assert false 1286 | let o_grapheme_cluster_break = 1287 | function Grapheme_cluster_break_v v -> v | _ -> assert false 1288 | 1289 | let o_hangul_syllable_type = 1290 | function Hangul_syllable_type_v v -> v | _ -> assert false 1291 | 1292 | let o_int = function Int_v v -> v | _ -> assert false 1293 | 1294 | let o_indic_conjunct_break = 1295 | function Indic_conjunct_break_v v -> v | _ -> assert false 1296 | 1297 | let o_indic_syllabic_category = 1298 | function Indic_syllabic_category_v v -> v | _ -> assert false 1299 | 1300 | let o_indic_matra_category = 1301 | function Indic_matra_category_v v -> v | _ -> assert false 1302 | 1303 | let o_indic_positional_category = 1304 | function Indic_positional_category_v v -> v | _ -> assert false 1305 | 1306 | let o_joining_group = function Joining_group_v v -> v | _ -> 
assert false 1307 | let o_joining_type = function Joining_type_v v -> v | _ -> assert false 1308 | let o_line_break = function Line_break_v v -> v | _ -> assert false 1309 | let o_name = function Name_v v -> v | _ -> assert false 1310 | let o_name_alias = function Name_alias_v v -> v | _ -> assert false 1311 | let o_numeric_type = function Numeric_type_v v -> v | _ -> assert false 1312 | let o_numeric_value = function Numeric_value_v v -> v | _ -> assert false 1313 | let o_script = function Script_v v -> v | _ -> assert false 1314 | let o_script_extensions = 1315 | function Script_extensions_v v -> v | _ -> assert false 1316 | 1317 | let o_sentence_break = function Sentence_break_v v -> v | _ -> assert false 1318 | let o_cps = function Cps_v v -> v | _ -> assert false 1319 | let o_cps_map = function Cps_map_v v -> v | _ -> assert false 1320 | let o_string = function String_v v -> v | _ -> assert false 1321 | let o_uax_42_element = function UAX_42_element_v v -> v | _ -> assert false 1322 | let o_vertical_orientation = 1323 | function Vertical_orientation_v v -> v | _ -> assert false 1324 | let o_word_break = function Word_break_v v -> v | _ -> assert false 1325 | 1326 | (* property value injection *) 1327 | 1328 | let i_age v = Age_v begin match v with 1329 | | "unassigned" -> `Unassigned 1330 | | v -> 1331 | try match List.map int_of_string (split_string v '.') with 1332 | | [v1; v2;] -> `Version (v1, v2) 1333 | | _ -> failwith "" 1334 | with Failure _ -> err (err_att_val v) 1335 | end 1336 | 1337 | let i_bidi_class v = Bidi_class_v begin match v with 1338 | | "AL" -> `AL 1339 | | "AN" -> `AN 1340 | | "B" -> `B 1341 | | "BN" -> `BN 1342 | | "CS" -> `CS 1343 | | "EN" -> `EN 1344 | | "ES" -> `ES 1345 | | "ET" -> `ET 1346 | | "L" -> `L 1347 | | "LRE" -> `LRE 1348 | | "LRO" -> `LRO 1349 | | "NSM" -> `NSM 1350 | | "ON" -> `ON 1351 | | "PDF" -> `PDF 1352 | | "R" -> `R 1353 | | "RLE" -> `RLE 1354 | | "RLO" -> `RLO 1355 | | "S" -> `S 1356 | | "WS" -> `WS 1357 | | "LRI" -> 
`LRI 1358 | | "RLI" -> `RLI 1359 | | "FSI" -> `FSI 1360 | | "PDI" -> `PDI 1361 | | v -> err (err_att_val v) 1362 | end 1363 | 1364 | let i_bidi_paired_bracket_type v = Bidi_paired_bracket_type_v begin match v with 1365 | | "o" -> `O 1366 | | "c" -> `C 1367 | | "n" -> `N 1368 | | v -> err (err_att_val v) 1369 | end 1370 | (* i_block injects a block attribute string into Block_v: each recognised string maps to the variant of the same name, any other string is reported with err (err_att_val v). *) 1371 | let i_block v = Block_v begin match v with 1372 | | "ASCII" -> `ASCII 1373 | | "Adlam" -> `Adlam 1374 | | "Aegean_Numbers" -> `Aegean_Numbers 1375 | | "Ahom" -> `Ahom 1376 | | "Alchemical" -> `Alchemical 1377 | | "Alphabetic_PF" -> `Alphabetic_PF 1378 | | "Anatolian_Hieroglyphs" -> `Anatolian_Hieroglyphs 1379 | | "Ancient_Greek_Music" -> `Ancient_Greek_Music 1380 | | "Ancient_Greek_Numbers" -> `Ancient_Greek_Numbers 1381 | | "Ancient_Symbols" -> `Ancient_Symbols 1382 | | "Arabic" -> `Arabic 1383 | | "Arabic_Ext_A" -> `Arabic_Ext_A 1384 | | "Arabic_Ext_B" -> `Arabic_Ext_B 1385 | | "Arabic_Ext_C" -> `Arabic_Ext_C 1386 | | "Arabic_Math" -> `Arabic_Math 1387 | | "Arabic_PF_A" -> `Arabic_PF_A 1388 | | "Arabic_PF_B" -> `Arabic_PF_B 1389 | | "Arabic_Sup" -> `Arabic_Sup 1390 | | "Armenian" -> `Armenian 1391 | | "Arrows" -> `Arrows 1392 | | "Avestan" -> `Avestan 1393 | | "Balinese" -> `Balinese 1394 | | "Bamum" -> `Bamum 1395 | | "Bamum_Sup" -> `Bamum_Sup 1396 | | "Bassa_Vah" -> `Bassa_Vah 1397 | | "Batak" -> `Batak 1398 | | "Bengali" -> `Bengali 1399 | | "Beria_Erfe" -> `Beria_Erfe 1400 | | "Bhaiksuki" -> `Bhaiksuki 1401 | | "Block_Elements" -> `Block_Elements 1402 | | "Bopomofo" -> `Bopomofo 1403 | | "Bopomofo_Ext" -> `Bopomofo_Ext 1404 | | "Box_Drawing" -> `Box_Drawing 1405 | | "Brahmi" -> `Brahmi 1406 | | "Braille" -> `Braille 1407 | | "Buginese" -> `Buginese 1408 | | "Buhid" -> `Buhid 1409 | | "Byzantine_Music" -> `Byzantine_Music 1410 | | "CJK" -> `CJK 1411 | | "CJK_Compat" -> `CJK_Compat 1412 | | "CJK_Compat_Forms" -> `CJK_Compat_Forms 1413 | | "CJK_Compat_Ideographs" -> `CJK_Compat_Ideographs 1414 | | "CJK_Compat_Ideographs_Sup" -> 
`CJK_Compat_Ideographs_Sup 1415 | | "CJK_Ext_A" -> `CJK_Ext_A 1416 | | "CJK_Ext_B" -> `CJK_Ext_B 1417 | | "CJK_Ext_C" -> `CJK_Ext_C 1418 | | "CJK_Ext_D" -> `CJK_Ext_D 1419 | | "CJK_Ext_E" -> `CJK_Ext_E 1420 | | "CJK_Ext_F" -> `CJK_Ext_F 1421 | | "CJK_Ext_G" -> `CJK_Ext_G 1422 | | "CJK_Ext_H" -> `CJK_Ext_H 1423 | | "CJK_Ext_I" -> `CJK_Ext_I 1424 | | "CJK_Ext_J" -> `CJK_Ext_J 1425 | | "CJK_Radicals_Sup" -> `CJK_Radicals_Sup 1426 | | "CJK_Strokes" -> `CJK_Strokes 1427 | | "CJK_Symbols" -> `CJK_Symbols 1428 | | "Carian" -> `Carian 1429 | | "Caucasian_Albanian" -> `Caucasian_Albanian 1430 | | "Chakma" -> `Chakma 1431 | | "Cham" -> `Cham 1432 | | "Cherokee" -> `Cherokee 1433 | | "Cherokee_Sup" -> `Cherokee_Sup 1434 | | "Chess_Symbols" -> `Chess_Symbols 1435 | | "Chorasmian" -> `Chorasmian 1436 | | "Compat_Jamo" -> `Compat_Jamo 1437 | | "Control_Pictures" -> `Control_Pictures 1438 | | "Coptic" -> `Coptic 1439 | | "Coptic_Epact_Numbers" -> `Coptic_Epact_Numbers 1440 | | "Counting_Rod" -> `Counting_Rod 1441 | | "Cuneiform" -> `Cuneiform 1442 | | "Cuneiform_Numbers" -> `Cuneiform_Numbers 1443 | | "Currency_Symbols" -> `Currency_Symbols 1444 | | "Cypriot_Syllabary" -> `Cypriot_Syllabary 1445 | | "Cypro_Minoan" -> `Cypro_Minoan 1446 | | "Cyrillic" -> `Cyrillic 1447 | | "Cyrillic_Ext_A" -> `Cyrillic_Ext_A 1448 | | "Cyrillic_Ext_B" -> `Cyrillic_Ext_B 1449 | | "Cyrillic_Ext_C" -> `Cyrillic_Ext_C 1450 | | "Cyrillic_Ext_D" -> `Cyrillic_Ext_D 1451 | | "Cyrillic_Sup" -> `Cyrillic_Sup 1452 | | "Deseret" -> `Deseret 1453 | | "Devanagari" -> `Devanagari 1454 | | "Devanagari_Ext" -> `Devanagari_Ext 1455 | | "Devanagari_Ext_A" -> `Devanagari_Ext_A 1456 | | "Diacriticals" -> `Diacriticals 1457 | | "Diacriticals_Ext" -> `Diacriticals_Ext 1458 | | "Diacriticals_For_Symbols" -> `Diacriticals_For_Symbols 1459 | | "Diacriticals_Sup" -> `Diacriticals_Sup 1460 | | "Dingbats" -> `Dingbats 1461 | | "Dives_Akuru" -> `Dives_Akuru 1462 | | "Dogra" -> `Dogra 1463 | | "Domino" -> `Domino 1464 | | 
"Duployan" -> `Duployan 1465 | | "Early_Dynastic_Cuneiform" -> `Early_Dynastic_Cuneiform 1466 | | "Egyptian_Hieroglyph_Format_Controls" -> `Egyptian_Hieroglyph_Format_Controls 1467 | | "Egyptian_Hieroglyphs" -> `Egyptian_Hieroglyphs 1468 | | "Egyptian_Hieroglyphs_Ext_A" -> `Egyptian_Hieroglyphs_Ext_A 1469 | | "Elbasan" -> `Elbasan 1470 | | "Elymaic" -> `Elymaic 1471 | | "Emoticons" -> `Emoticons 1472 | | "Enclosed_Alphanum" -> `Enclosed_Alphanum 1473 | | "Enclosed_Alphanum_Sup" -> `Enclosed_Alphanum_Sup 1474 | | "Enclosed_CJK" -> `Enclosed_CJK 1475 | | "Enclosed_Ideographic_Sup" -> `Enclosed_Ideographic_Sup 1476 | | "Ethiopic" -> `Ethiopic 1477 | | "Ethiopic_Ext" -> `Ethiopic_Ext 1478 | | "Ethiopic_Ext_A" -> `Ethiopic_Ext_A 1479 | | "Ethiopic_Ext_B" -> `Ethiopic_Ext_B 1480 | | "Ethiopic_Sup" -> `Ethiopic_Sup 1481 | | "Garay" -> `Garay 1482 | | "Geometric_Shapes" -> `Geometric_Shapes 1483 | | "Geometric_Shapes_Ext" -> `Geometric_Shapes_Ext 1484 | | "Georgian" -> `Georgian 1485 | | "Georgian_Ext" -> `Georgian_Ext 1486 | | "Georgian_Sup" -> `Georgian_Sup 1487 | | "Glagolitic" -> `Glagolitic 1488 | | "Glagolitic_Sup" -> `Glagolitic_Sup 1489 | | "Gothic" -> `Gothic 1490 | | "Grantha" -> `Grantha 1491 | | "Greek" -> `Greek 1492 | | "Greek_Ext" -> `Greek_Ext 1493 | | "Gujarati" -> `Gujarati 1494 | | "Gunjala_Gondi" -> `Gunjala_Gondi 1495 | | "Gurmukhi" -> `Gurmukhi 1496 | | "Gurung_Khema" -> `Gurung_Khema 1497 | | "Half_And_Full_Forms" -> `Half_And_Full_Forms 1498 | | "Half_Marks" -> `Half_Marks 1499 | | "Hangul" -> `Hangul 1500 | | "Hanifi_Rohingya" -> `Hanifi_Rohingya 1501 | | "Hanunoo" -> `Hanunoo 1502 | | "Hatran" -> `Hatran 1503 | | "Hebrew" -> `Hebrew 1504 | | "High_PU_Surrogates" -> `High_PU_Surrogates 1505 | | "High_Surrogates" -> `High_Surrogates 1506 | | "Hiragana" -> `Hiragana 1507 | | "IDC" -> `IDC 1508 | | "IPA_Ext" -> `IPA_Ext 1509 | | "Ideographic_Symbols" -> `Ideographic_Symbols 1510 | | "Imperial_Aramaic" -> `Imperial_Aramaic 1511 | | "Indic_Number_Forms" 
-> `Indic_Number_Forms 1512 | | "Indic_Siyaq_Numbers" -> `Indic_Siyaq_Numbers 1513 | | "Inscriptional_Pahlavi" -> `Inscriptional_Pahlavi 1514 | | "Inscriptional_Parthian" -> `Inscriptional_Parthian 1515 | | "Jamo" -> `Jamo 1516 | | "Jamo_Ext_A" -> `Jamo_Ext_A 1517 | | "Jamo_Ext_B" -> `Jamo_Ext_B 1518 | | "Javanese" -> `Javanese 1519 | | "Kaithi" -> `Kaithi 1520 | | "Kaktovik_Numerals" -> `Kaktovik_Numerals 1521 | | "Kana_Ext_A" -> `Kana_Ext_A 1522 | | "Kana_Ext_B" -> `Kana_Ext_B 1523 | | "Kawi" -> `Kawi 1524 | | "Kana_Sup" -> `Kana_Sup 1525 | | "Kanbun" -> `Kanbun 1526 | | "Kangxi" -> `Kangxi 1527 | | "Kannada" -> `Kannada 1528 | | "Katakana" -> `Katakana 1529 | | "Katakana_Ext" -> `Katakana_Ext 1530 | | "Kayah_Li" -> `Kayah_Li 1531 | | "Kharoshthi" -> `Kharoshthi 1532 | | "Khitan_Small_Script" -> `Khitan_Small_Script 1533 | | "Khmer" -> `Khmer 1534 | | "Khmer_Symbols" -> `Khmer_Symbols 1535 | | "Khojki" -> `Khojki 1536 | | "Khudawadi" -> `Khudawadi 1537 | | "Kirat_Rai" -> `Kirat_Rai 1538 | | "Lao" -> `Lao 1539 | | "Latin_1_Sup" -> `Latin_1_Sup 1540 | | "Latin_Ext_A" -> `Latin_Ext_A 1541 | | "Latin_Ext_Additional" -> `Latin_Ext_Additional 1542 | | "Latin_Ext_B" -> `Latin_Ext_B 1543 | | "Latin_Ext_C" -> `Latin_Ext_C 1544 | | "Latin_Ext_D" -> `Latin_Ext_D 1545 | | "Latin_Ext_E" -> `Latin_Ext_E 1546 | | "Latin_Ext_F" -> `Latin_Ext_F 1547 | | "Latin_Ext_G" -> `Latin_Ext_G 1548 | | "Lepcha" -> `Lepcha 1549 | | "Letterlike_Symbols" -> `Letterlike_Symbols 1550 | | "Limbu" -> `Limbu 1551 | | "Linear_A" -> `Linear_A 1552 | | "Linear_B_Ideograms" -> `Linear_B_Ideograms 1553 | | "Linear_B_Syllabary" -> `Linear_B_Syllabary 1554 | | "Lisu" -> `Lisu 1555 | | "Lisu_Sup" -> `Lisu_Sup 1556 | | "Low_Surrogates" -> `Low_Surrogates 1557 | | "Lycian" -> `Lycian 1558 | | "Lydian" -> `Lydian 1559 | | "Mahajani" -> `Mahajani 1560 | | "Mahjong" -> `Mahjong 1561 | | "Makasar" -> `Makasar 1562 | | "Malayalam" -> `Malayalam 1563 | | "Mandaic" -> `Mandaic 1564 | | "Manichaean" -> `Manichaean 
1565 | | "Marchen" -> `Marchen 1566 | | "Masaram_Gondi" -> `Masaram_Gondi 1567 | | "Math_Alphanum" -> `Math_Alphanum 1568 | | "Math_Operators" -> `Math_Operators 1569 | | "Mayan_Numerals" -> `Mayan_Numerals 1570 | | "Medefaidrin" -> `Medefaidrin 1571 | | "Meetei_Mayek" -> `Meetei_Mayek 1572 | | "Meetei_Mayek_Ext" -> `Meetei_Mayek_Ext 1573 | | "Mende_Kikakui" -> `Mende_Kikakui 1574 | | "Meroitic_Cursive" -> `Meroitic_Cursive 1575 | | "Meroitic_Hieroglyphs" -> `Meroitic_Hieroglyphs 1576 | | "Miao" -> `Miao 1577 | | "Misc_Arrows" -> `Misc_Arrows 1578 | | "Misc_Math_Symbols_A" -> `Misc_Math_Symbols_A 1579 | | "Misc_Math_Symbols_B" -> `Misc_Math_Symbols_B 1580 | | "Misc_Pictographs" -> `Misc_Pictographs 1581 | | "Misc_Symbols" -> `Misc_Symbols 1582 | | "Misc_Symbols_Sup" -> `Misc_Symbols_Sup 1583 | | "Misc_Technical" -> `Misc_Technical 1584 | | "Modi" -> `Modi 1585 | | "Modifier_Letters" -> `Modifier_Letters 1586 | | "Modifier_Tone_Letters" -> `Modifier_Tone_Letters 1587 | | "Mongolian" -> `Mongolian 1588 | | "Mongolian_Sup" -> `Mongolian_Sup 1589 | | "Mro" -> `Mro 1590 | | "Multani" -> `Multani 1591 | | "Music" -> `Music 1592 | | "Myanmar" -> `Myanmar 1593 | | "Myanmar_Ext_A" -> `Myanmar_Ext_A 1594 | | "Myanmar_Ext_B" -> `Myanmar_Ext_B 1595 | | "Myanmar_Ext_C" -> `Myanmar_Ext_C 1596 | | "NB" -> `NB 1597 | | "NKo" -> `NKo 1598 | | "Nabataean" -> `Nabataean 1599 | | "Nag_Mundari" -> `Nag_Mundari 1600 | | "Nandinagari" -> `Nandinagari 1601 | | "New_Tai_Lue" -> `New_Tai_Lue 1602 | | "Newa" -> `Newa 1603 | | "Number_Forms" -> `Number_Forms 1604 | | "Nushu" -> `Nushu 1605 | | "Nyiakeng_Puachue_Hmong" -> `Nyiakeng_Puachue_Hmong 1606 | | "OCR" -> `OCR 1607 | | "Ogham" -> `Ogham 1608 | | "Ol_Chiki" -> `Ol_Chiki 1609 | | "Ol_Onal" -> `Ol_Onal 1610 | | "Old_Hungarian" -> `Old_Hungarian 1611 | | "Old_Italic" -> `Old_Italic 1612 | | "Old_North_Arabian" -> `Old_North_Arabian 1613 | | "Old_Permic" -> `Old_Permic 1614 | | "Old_Persian" -> `Old_Persian 1615 | | "Old_Sogdian" -> 
`Old_Sogdian 1616 | | "Old_South_Arabian" -> `Old_South_Arabian 1617 | | "Old_Turkic" -> `Old_Turkic 1618 | | "Old_Uyghur" -> `Old_Uyghur 1619 | | "Oriya" -> `Oriya 1620 | | "Ornamental_Dingbats" -> `Ornamental_Dingbats 1621 | | "Osage" -> `Osage 1622 | | "Osmanya" -> `Osmanya 1623 | | "Ottoman_Siyaq_Numbers" -> `Ottoman_Siyaq_Numbers 1624 | | "PUA" -> `PUA 1625 | | "Pahawh_Hmong" -> `Pahawh_Hmong 1626 | | "Palmyrene" -> `Palmyrene 1627 | | "Pau_Cin_Hau" -> `Pau_Cin_Hau 1628 | | "Phags_Pa" -> `Phags_Pa 1629 | | "Phaistos" -> `Phaistos 1630 | | "Phoenician" -> `Phoenician 1631 | | "Phonetic_Ext" -> `Phonetic_Ext 1632 | | "Phonetic_Ext_Sup" -> `Phonetic_Ext_Sup 1633 | | "Playing_Cards" -> `Playing_Cards 1634 | | "Psalter_Pahlavi" -> `Psalter_Pahlavi 1635 | | "Punctuation" -> `Punctuation 1636 | | "Rejang" -> `Rejang 1637 | | "Rumi" -> `Rumi 1638 | | "Runic" -> `Runic 1639 | | "Samaritan" -> `Samaritan 1640 | | "Saurashtra" -> `Saurashtra 1641 | | "Sharada" -> `Sharada 1642 | | "Sharada_Sup" -> `Sharada_Sup 1643 | | "Shavian" -> `Shavian 1644 | | "Shorthand_Format_Controls" -> `Shorthand_Format_Controls 1645 | | "Siddham" -> `Siddham 1646 | | "Sidetic" -> `Sidetic 1647 | | "Sinhala" -> `Sinhala 1648 | | "Sinhala_Archaic_Numbers" -> `Sinhala_Archaic_Numbers 1649 | | "Small_Forms" -> `Small_Forms 1650 | | "Small_Kana_Ext" -> `Small_Kana_Ext 1651 | | "Sogdian" -> `Sogdian 1652 | | "Sora_Sompeng" -> `Sora_Sompeng 1653 | | "Soyombo" -> `Soyombo 1654 | | "Specials" -> `Specials 1655 | | "Sundanese" -> `Sundanese 1656 | | "Sundanese_Sup" -> `Sundanese_Sup 1657 | | "Sunuwar" -> `Sunuwar 1658 | | "Sup_Arrows_A" -> `Sup_Arrows_A 1659 | | "Sup_Arrows_B" -> `Sup_Arrows_B 1660 | | "Sup_Arrows_C" -> `Sup_Arrows_C 1661 | | "Sup_Math_Operators" -> `Sup_Math_Operators 1662 | | "Sup_PUA_A" -> `Sup_PUA_A 1663 | | "Sup_PUA_B" -> `Sup_PUA_B 1664 | | "Sup_Punctuation" -> `Sup_Punctuation 1665 | | "Sup_Symbols_And_Pictographs" -> `Sup_Symbols_And_Pictographs 1666 | | "Super_And_Sub" -> 
`Super_And_Sub 1667 | | "Sutton_SignWriting" -> `Sutton_SignWriting 1668 | | "Syloti_Nagri" -> `Syloti_Nagri 1669 | | "Symbols_And_Pictographs_Ext_A" -> `Symbols_And_Pictographs_Ext_A 1670 | | "Symbols_For_Legacy_Computing" -> `Symbols_For_Legacy_Computing 1671 | | "Symbols_For_Legacy_Computing_Sup" -> `Symbols_For_Legacy_Computing_Sup 1672 | | "Syriac" -> `Syriac 1673 | | "Syriac_Sup" -> `Syriac_Sup 1674 | | "Tagalog" -> `Tagalog 1675 | | "Tagbanwa" -> `Tagbanwa 1676 | | "Tags" -> `Tags 1677 | | "Tai_Le" -> `Tai_Le 1678 | | "Tai_Tham" -> `Tai_Tham 1679 | | "Tai_Viet" -> `Tai_Viet 1680 | | "Tai_Xuan_Jing" -> `Tai_Xuan_Jing 1681 | | "Tai_Yo" -> `Tai_Yo 1682 | | "Takri" -> `Takri 1683 | | "Tamil" -> `Tamil 1684 | | "Tamil_Sup" -> `Tamil_Sup 1685 | | "Tangsa" -> `Tangsa 1686 | | "Tangut" -> `Tangut 1687 | | "Tangut_Components" -> `Tangut_Components 1688 | | "Tangut_Components_Sup" -> `Tangut_Components_Sup 1689 | | "Tangut_Sup" -> `Tangut_Sup 1690 | | "Telugu" -> `Telugu 1691 | | "Thaana" -> `Thaana 1692 | | "Thai" -> `Thai 1693 | | "Tibetan" -> `Tibetan 1694 | | "Tifinagh" -> `Tifinagh 1695 | | "Tirhuta" -> `Tirhuta 1696 | | "Todhri" -> `Todhri 1697 | | "Tolong_Siki" -> `Tolong_Siki 1698 | | "Toto" -> `Toto 1699 | | "Transport_And_Map" -> `Transport_And_Map 1700 | | "Tulu_Tigalari" -> `Tulu_Tigalari 1701 | | "UCAS" -> `UCAS 1702 | | "UCAS_Ext" -> `UCAS_Ext 1703 | | "UCAS_Ext_A" -> `UCAS_Ext_A 1704 | | "Ugaritic" -> `Ugaritic 1705 | | "VS" -> `VS 1706 | | "VS_Sup" -> `VS_Sup 1707 | | "Vai" -> `Vai 1708 | | "Vedic_Ext" -> `Vedic_Ext 1709 | | "Vertical_Forms" -> `Vertical_Forms 1710 | | "Vithkuqi" -> `Vithkuqi 1711 | | "Wancho" -> `Wancho 1712 | | "Warang_Citi" -> `Warang_Citi 1713 | | "Yezidi" -> `Yezidi 1714 | | "Yi_Radicals" -> `Yi_Radicals 1715 | | "Yi_Syllables" -> `Yi_Syllables 1716 | | "Yijing" -> `Yijing 1717 | | "Zanabazar_Square" -> `Zanabazar_Square 1718 | | "Znamenny_Music" -> `Znamenny_Music 1719 | | v -> err (err_att_val v) 1720 | end 1721 | 1722 | let 
i_bool v = Bool_v begin match v with 1723 | | "Y" -> true | "N" -> false 1724 | | v -> err (err_att_val v) 1725 | end 1726 | 1727 | let i_bool_maybe v = Bool_maybe_v begin match v with 1728 | | "Y" -> `True | "N" -> `False | "M" -> `Maybe 1729 | | v -> err (err_att_val v) 1730 | end 1731 | 1732 | let i_cp v = Cp_v (cp_of_string v) 1733 | let i_cp_map v = 1734 | if v = "#" then Cp_map_v `Self else Cp_map_v (`Cp (cp_of_string v)) 1735 | 1736 | let i_cp_opt v = 1737 | if v = "" then Cp_opt_v None else Cp_opt_v (Some (cp_of_string v)) 1738 | 1739 | let i_cps ?empty v = Cps_v (cps_of_string ?empty v) 1740 | let i_cps_map ?empty v = 1741 | if v = "#" then Cps_map_v `Self else Cps_map_v (`Cps (cps_of_string ?empty v)) 1742 | 1743 | let i_decomposition_type v = Decomposition_type_v begin match v with 1744 | | "can" -> `Can 1745 | | "com" -> `Com 1746 | | "enc" -> `Enc 1747 | | "fin" -> `Fin 1748 | | "font" -> `Font 1749 | | "fra" -> `Fra 1750 | | "init" -> `Init 1751 | | "iso" -> `Iso 1752 | | "med" -> `Med 1753 | | "nar" -> `Nar 1754 | | "nb" -> `Nb 1755 | | "sml" -> `Sml 1756 | | "sqr" -> `Sqr 1757 | | "sub" -> `Sub 1758 | | "sup" -> `Sup 1759 | | "vert" -> `Vert 1760 | | "wide" -> `Wide 1761 | | "none" -> `None 1762 | | v -> err (err_att_val v) 1763 | end 1764 | 1765 | let i_east_asian_width v = East_asian_width_v begin match v with 1766 | | "A" -> `A 1767 | | "F" -> `F 1768 | | "H" -> `H 1769 | | "N" -> `N 1770 | | "Na" -> `Na 1771 | | "W" -> `W 1772 | | v -> err (err_att_val v) 1773 | end 1774 | 1775 | let i_general_category v = General_category_v begin match v with 1776 | | "Lu" -> `Lu 1777 | | "Ll" -> `Ll 1778 | | "Lt" -> `Lt 1779 | | "Lm" -> `Lm 1780 | | "Lo" -> `Lo 1781 | | "Mn" -> `Mn 1782 | | "Mc" -> `Mc 1783 | | "Me" -> `Me 1784 | | "Nd" -> `Nd 1785 | | "Nl" -> `Nl 1786 | | "No" -> `No 1787 | | "Pc" -> `Pc 1788 | | "Pd" -> `Pd 1789 | | "Ps" -> `Ps 1790 | | "Pe" -> `Pe 1791 | | "Pi" -> `Pi 1792 | | "Pf" -> `Pf 1793 | | "Po" -> `Po 1794 | | "Sm" -> `Sm 1795 | | 
"Sc" ->`Sc 1796 | | "Sk" -> `Sk 1797 | | "So" -> `So 1798 | | "Zs" -> `Zs 1799 | | "Zl" -> `Zl 1800 | | "Zp" -> `Zp 1801 | | "Cc" -> `Cc 1802 | | "Cf" -> `Cf 1803 | | "Cs" -> `Cs 1804 | | "Co" -> `Co 1805 | | "Cn" -> `Cn 1806 | | v -> err (err_att_val v) 1807 | end 1808 | 1809 | let i_grapheme_cluster_break v = Grapheme_cluster_break_v begin match v with 1810 | | "CN" -> `CN 1811 | | "CR" -> `CR 1812 | | "EB" -> `EB 1813 | | "EBG" -> `EBG 1814 | | "EM" -> `EM 1815 | | "EX" -> `EX 1816 | | "GAZ" -> `GAZ 1817 | | "L" -> `L 1818 | | "LF" -> `LF 1819 | | "LV" -> `LV 1820 | | "LVT" -> `LVT 1821 | | "PP" -> `PP 1822 | | "RI" -> `RI 1823 | | "SM" -> `SM 1824 | | "T" -> `T 1825 | | "V" -> `V 1826 | | "XX" -> `XX 1827 | | "ZWJ" -> `ZWJ 1828 | | v -> err (err_att_val v) 1829 | end 1830 | 1831 | let i_hangul_syllable_type v = Hangul_syllable_type_v begin match v with 1832 | | "L" -> `L 1833 | | "LV" -> `LV 1834 | | "LVT" -> `LVT 1835 | | "T" -> `T 1836 | | "V" -> `V 1837 | | "NA" -> `NA 1838 | | v -> err (err_att_val v) 1839 | end 1840 | 1841 | let i_int v = try Int_v (int_of_string v) with Failure _ -> err (err_att_val v) 1842 | let i_indic_conjunct_break v = Indic_conjunct_break_v begin match v with 1843 | | "Consonant" -> `Consonant 1844 | | "Extend" -> `Extend 1845 | | "Linker" -> `Linker 1846 | | "None" -> `None 1847 | | v -> err (err_att_val v) 1848 | end 1849 | 1850 | let i_indic_syllabic_category v = Indic_syllabic_category_v begin match v with 1851 | | "Avagraha" -> `Avagraha 1852 | | "Bindu" -> `Bindu 1853 | | "Brahmi_Joining_Number" -> `Brahmi_Joining_Number 1854 | | "Cantillation_Mark" -> `Cantillation_Mark 1855 | | "Consonant" -> `Consonant 1856 | | "Consonant_Dead" -> `Consonant_Dead 1857 | | "Consonant_Final" -> `Consonant_Final 1858 | | "Consonant_Head_Letter" -> `Consonant_Head_Letter 1859 | | "Consonant_Initial_Postfixed" -> `Consonant_Initial_Postfixed 1860 | | "Consonant_Killer" -> `Consonant_Killer 1861 | | "Consonant_Medial" -> `Consonant_Medial 1862 | | 
"Consonant_Placeholder" -> `Consonant_Placeholder 1863 | | "Consonant_Preceding_Repha" -> `Consonant_Preceding_Repha 1864 | | "Consonant_Prefixed" -> `Consonant_Prefixed 1865 | | "Consonant_Repha" -> `Consonant_Repha 1866 | | "Consonant_Subjoined" -> `Consonant_Subjoined 1867 | | "Consonant_Succeeding_Repha" -> `Consonant_Succeeding_Repha 1868 | | "Consonant_With_Stacker" -> `Consonant_With_Stacker 1869 | | "Gemination_Mark" -> `Gemination_Mark 1870 | | "Invisible_Stacker" -> `Invisible_Stacker 1871 | | "Joiner" -> `Joiner 1872 | | "Modifying_Letter" -> `Modifying_Letter 1873 | | "Non_Joiner" -> `Non_Joiner 1874 | | "Nukta" -> `Nukta 1875 | | "Number" -> `Number 1876 | | "Number_Joiner" -> `Number_Joiner 1877 | | "Other" -> `Other 1878 | | "Pure_Killer" -> `Pure_Killer 1879 | | "Reordering_Killer" -> `Reordering_Killer 1880 | | "Register_Shifter" -> `Register_Shifter 1881 | | "Syllable_Modifier" -> `Syllable_Modifier 1882 | | "Tone_Letter" -> `Tone_Letter 1883 | | "Tone_Mark" -> `Tone_Mark 1884 | | "Virama" -> `Virama 1885 | | "Visarga" -> `Visarga 1886 | | "Vowel" -> `Vowel 1887 | | "Vowel_Dependent" -> `Vowel_Dependent 1888 | | "Vowel_Independent" -> `Vowel_Independent 1889 | | v -> err (err_att_val v) 1890 | end 1891 | 1892 | let i_indic_matra_category v = Indic_matra_category_v begin match v with 1893 | | "Right" -> `Right 1894 | | "Left" -> `Left 1895 | | "Visual_Order_Left" -> `Visual_Order_Left 1896 | | "Left_And_Right" -> `Left_And_Right 1897 | | "Top" -> `Top 1898 | | "Bottom" -> `Bottom 1899 | | "Top_And_Bottom" -> `Top_And_Bottom 1900 | | "Top_And_Right" -> `Top_And_Right 1901 | | "Top_And_Left" -> `Top_And_Left 1902 | | "Top_And_Left_And_Right" -> `Top_And_Left_And_Right 1903 | | "Bottom_And_Right" -> `Bottom_And_Right 1904 | | "Top_And_Bottom_And_Right" -> `Top_And_Bottom_And_Right 1905 | | "Overstruck" -> `Overstruck 1906 | | "Invisible" -> `Invisible 1907 | | "NA" -> `NA 1908 | | v -> err (err_att_val v) 1909 | end 1910 | 1911 | let 
i_indic_positional_category v = Indic_positional_category_v
  (* Decodes the "InPC" attribute value [v] (see [add_prop]). Every
     accepted string maps to the variant of the same name; anything else
     is handed to [err (err_att_val v)]. *)
  begin match v with
  | "Bottom" -> `Bottom
  | "Bottom_And_Left" -> `Bottom_And_Left
  | "Bottom_And_Right" -> `Bottom_And_Right
  | "Invisible" -> `Invisible
  | "Left" -> `Left
  | "Left_And_Right" -> `Left_And_Right
  | "NA" -> `NA
  | "Overstruck" -> `Overstruck
  | "Right" -> `Right
  | "Top" -> `Top
  | "Top_And_Bottom" -> `Top_And_Bottom
  | "Top_And_Bottom_And_Left" -> `Top_And_Bottom_And_Left
  | "Top_And_Bottom_And_Right" -> `Top_And_Bottom_And_Right
  | "Top_And_Left" -> `Top_And_Left
  | "Top_And_Left_And_Right" -> `Top_And_Left_And_Right
  | "Top_And_Right" -> `Top_And_Right
  | "Visual_Order_Left" -> `Visual_Order_Left
  | v -> err (err_att_val v)
  end

(* [i_joining_group v] wraps the variant spelled by [v] in
   [Joining_group_v]; every accepted string maps to the variant of the same
   name and unknown strings go to [err]. Registered for the "jg" attribute
   in [add_prop]. *)
let i_joining_group v = Joining_group_v begin match v with
| "African_Feh" -> `African_Feh
| "African_Noon" -> `African_Noon
| "African_Qaf" -> `African_Qaf
| "Ain" -> `Ain
| "Alaph" -> `Alaph
| "Alef" -> `Alef
| "Alef_Maqsurah" -> `Alef_Maqsurah
| "Beh" -> `Beh
| "Beth" -> `Beth
| "Burushaski_Yeh_Barree" -> `Burushaski_Yeh_Barree
| "Dal" -> `Dal
| "Dalath_Rish" -> `Dalath_Rish
| "E" -> `E
| "Farsi_Yeh" -> `Farsi_Yeh
| "Fe" -> `Fe
| "Feh" -> `Feh
| "Final_Semkath" -> `Final_Semkath
| "Gaf" -> `Gaf
| "Gamal" -> `Gamal
| "Hah" -> `Hah
| "Hanifi_Rohingya_Kinna_Ya" -> `Hanifi_Rohingya_Kinna_Ya
| "Hanifi_Rohingya_Pa" -> `Hanifi_Rohingya_Pa
| "Hamza_On_Heh_Goal" -> `Hamza_On_Heh_Goal
| "He" -> `He
| "Heh" -> `Heh
| "Heh_Goal" -> `Heh_Goal
| "Heth" -> `Heth
| "Kaf" -> `Kaf
| "Kaph" -> `Kaph
| "Kashmiri_Yeh" -> `Kashmiri_Yeh
| "Khaph" -> `Khaph
| "Knotted_Heh" -> `Knotted_Heh
| "Lam" -> `Lam
| "Lamadh"
(* Middle arms of [i_joining_group]: each string maps to the variant of
   the same name. *)
-> `Lamadh
| "Malayalam_Bha" -> `Malayalam_Bha
| "Malayalam_Ja" -> `Malayalam_Ja
| "Malayalam_Lla" -> `Malayalam_Lla
| "Malayalam_Llla" -> `Malayalam_Llla
| "Malayalam_Nna" -> `Malayalam_Nna
| "Malayalam_Nnna" -> `Malayalam_Nnna
| "Malayalam_Nya" -> `Malayalam_Nya
| "Malayalam_Ra" -> `Malayalam_Ra
| "Malayalam_Ssa" -> `Malayalam_Ssa
| "Malayalam_Tta" -> `Malayalam_Tta
| "Malayalam_Nga" -> `Malayalam_Nga
| "Manichaean_Aleph" -> `Manichaean_Aleph
| "Manichaean_Ayin" -> `Manichaean_Ayin
| "Manichaean_Beth" -> `Manichaean_Beth
| "Manichaean_Daleth" -> `Manichaean_Daleth
| "Manichaean_Dhamedh" -> `Manichaean_Dhamedh
| "Manichaean_Five" -> `Manichaean_Five
| "Manichaean_Gimel" -> `Manichaean_Gimel
| "Manichaean_Heth" -> `Manichaean_Heth
| "Manichaean_Hundred" -> `Manichaean_Hundred
| "Manichaean_Kaph" -> `Manichaean_Kaph
| "Manichaean_Lamedh" -> `Manichaean_Lamedh
| "Manichaean_Mem" -> `Manichaean_Mem
| "Manichaean_Nun" -> `Manichaean_Nun
| "Manichaean_One" -> `Manichaean_One
| "Manichaean_Pe" -> `Manichaean_Pe
| "Manichaean_Qoph" -> `Manichaean_Qoph
| "Manichaean_Resh" -> `Manichaean_Resh
| "Manichaean_Sadhe" -> `Manichaean_Sadhe
| "Manichaean_Samekh" -> `Manichaean_Samekh
| "Manichaean_Taw" -> `Manichaean_Taw
| "Manichaean_Ten" -> `Manichaean_Ten
| "Manichaean_Teth" -> `Manichaean_Teth
| "Manichaean_Thamedh" -> `Manichaean_Thamedh
| "Manichaean_Twenty" -> `Manichaean_Twenty
| "Manichaean_Waw" -> `Manichaean_Waw
| "Manichaean_Yodh" -> `Manichaean_Yodh
| "Manichaean_Zayin" -> `Manichaean_Zayin
| "Meem" -> `Meem
| "Mim" -> `Mim
| "No_Joining_Group" -> `No_Joining_Group
| "Noon" -> `Noon
| "Nun" -> `Nun
| "Nya" -> `Nya
| "Pe" -> `Pe
| "Qaf" -> `Qaf
|
(* Final arms of [i_joining_group]. The mixed-case strings map to the
   variant of the same name; the upper-case spellings at the end map to
   the upper-case variants. *)
"Qaph" -> `Qaph
| "Reh" -> `Reh
| "Reversed_Pe" -> `Reversed_Pe
| "Rohingya_Yeh" -> `Rohingya_Yeh
| "Sad" -> `Sad
| "Sadhe" -> `Sadhe
| "Seen" -> `Seen
| "Semkath" -> `Semkath
| "Shin" -> `Shin
| "Straight_Waw" -> `Straight_Waw
| "Swash_Kaf" -> `Swash_Kaf
| "Syriac_Waw" -> `Syriac_Waw
| "Tah" -> `Tah
| "Taw" -> `Taw
| "Teh_Marbuta" -> `Teh_Marbuta
| "Teh_Marbuta_Goal" -> `Teh_Marbuta_Goal
| "Teth" -> `Teth
| "Thin_Noon" -> `Thin_Noon
| "Thin_Yeh" -> `Thin_Yeh
| "Vertical_Tail" -> `Vertical_Tail
| "Waw" -> `Waw
| "Yeh" -> `Yeh
| "Yeh_Barree" -> `Yeh_Barree
| "Yeh_With_Tail" -> `Yeh_With_Tail
| "Yudh" -> `Yudh
| "Yudh_He" -> `Yudh_He
| "Zain" -> `Zain
| "Zhain" -> `Zhain
| "BAA" -> `BAA
| "FA" -> `FA
| "HAA" -> `HAA
| "HA_GOAL" -> `HA_GOAL
| "HA" -> `HA
| "CAF" -> `CAF
| "KNOTTED_HA" -> `KNOTTED_HA
| "RA" -> `RA
| "SWASH_CAF" -> `SWASH_CAF
| "HAMZAH_ON_HA_GOAL" -> `HAMZAH_ON_HA_GOAL
| "TAA_MARBUTAH" -> `TAA_MARBUTAH
| "YA_BARREE" -> `YA_BARREE
| "YA" -> `YA
(* Accept the value with and without a trailing space: the spelling with
   the space alone would reject a plain "ALEF_MAQSURAH". *)
| "ALEF_MAQSURAH" | "ALEF_MAQSURAH " -> `ALEF_MAQSURAH
| v -> err (err_att_val v)
end

(* [i_joining_type v] wraps the variant spelled by [v] in [Joining_type_v];
   unknown strings go to [err (err_att_val v)]. Registered for the "jt"
   attribute in [add_prop]. *)
let i_joining_type v = Joining_type_v begin match v with
| "U" -> `U
| "C" -> `C
| "T" -> `T
| "D" -> `D
| "L" -> `L
| "R" -> `R
| v -> err (err_att_val v)
end

(* [i_line_break v] wraps the variant spelled by [v] in [Line_break_v];
   unknown strings go to [err (err_att_val v)]. Registered for the "lb"
   attribute in [add_prop]. *)
let i_line_break v = Line_break_v begin match v with
| "AI" -> `AI
| "AK" -> `AK
| "AL" -> `AL
| "AP" -> `AP
| "AS" -> `AS
| "B2" -> `B2
| "BA" -> `BA
| "BB" -> `BB
| "BK" -> `BK
| "CB" -> `CB
| "CJ" -> `CJ
| "CL" -> `CL
| "CM" -> `CM
| "CP" -> `CP
| "CR" -> `CR
| "EB" -> `EB
| "EM" -> `EM
(* Remaining arms of [i_line_break]. *)
| "EX" -> `EX
| "GL" -> `GL
| "H2" -> `H2
| "H3" -> `H3
| "HH" -> `HH
| "HL" -> `HL
| "HY" -> `HY
| "ID" -> `ID
| "IN" -> `IN
| "IS" -> `IS
| "JL" -> `JL
| "JT" -> `JT
| "JV" -> `JV
| "LF" -> `LF
| "NL" -> `NL
| "NS" -> `NS
| "NU" -> `NU
| "OP" -> `OP
| "PO" -> `PO
| "PR" -> `PR
| "QU" -> `QU
| "RI" -> `RI
| "SA" -> `SA
| "SG" -> `SG
| "SP" -> `SP
| "SY" -> `SY
| "VF" -> `VF
| "VI" -> `VI
| "WJ" -> `WJ
| "XX" -> `XX
| "ZW" -> `ZW
| "ZWJ" -> `ZWJ
| v -> err (err_att_val v)
end

(* [i_name v] is [Name_v (`Pattern v)] when [v] contains a '#' character
   and [Name_v (`Name v)] otherwise. *)
let i_name v = Name_v (if String.contains v '#' then `Pattern v else `Name v)

(* Maps a name alias type string to its variant. Unlike the neighbouring
   decoders the result is the bare variant, not a wrapped value. *)
let i_name_alias_type = function
| "abbreviation" -> `Abbreviation
| "alternate" -> `Alternate
| "control" -> `Control
| "correction" -> `Correction
| "figment" -> `Figment
| v -> err (err_att_val v)

(* [i_numeric_type v] wraps the variant spelled by [v] in [Numeric_type_v];
   unknown strings go to [err]. Registered for the "nt" attribute in
   [add_prop]. *)
let i_numeric_type v = Numeric_type_v begin match v with
| "None" -> `None
| "De" -> `De
| "Di" -> `Di
| "Nu" -> `Nu
| v -> err (err_att_val v)
end

(* [i_numeric_value v] decodes the "nv" attribute. After trimming, "NaN"
   yields [`NaN]; otherwise the string is split on ' ' and each piece is
   decoded by [base] into the list carried by [`Nums]. Any [Failure]
   raised while parsing is turned into [err (err_att_val v)]. *)
let i_numeric_value v = Numeric_value_v begin try match String.trim v with
| "NaN" -> `NaN
| s ->
    (* "num/denom" becomes [`Frac] of two [int]s, a lone number becomes
       [`Num] of an [Int64]; any other shape fails. *)
    let base s = match split_string (String.trim s) '/' with
    | [num; denom] -> `Frac (int_of_string num, int_of_string denom)
    | [num] -> `Num (Int64.of_string num)
    | _ -> failwith ""
    in
    `Nums (List.map base (split_string s ' '))
with Failure _ -> err (err_att_val v)
end

(* [i_script v] wraps the variant spelled by the four-letter script code
   [v] in [Script_v]; every accepted string maps to the variant of the same
   name and unknown strings go to [err]. Registered for the "sc" attribute
   in [add_prop]. *)
let i_script v = Script_v begin match v with
| "Adlm" -> `Adlm
| "Aghb" -> `Aghb
| "Ahom" -> `Ahom
| "Arab" -> `Arab
| "Armi" -> `Armi
| "Armn" -> `Armn
| "Avst" -> `Avst
| "Bali" -> `Bali
| "Bamu" -> `Bamu
|
(* Arms of [i_script], "Bass" through "Mend": each four-letter code maps
   to the variant of the same name. *)
"Bass" -> `Bass
| "Batk" -> `Batk
| "Beng" -> `Beng
| "Berf" -> `Berf
| "Bhks" -> `Bhks
| "Bopo" -> `Bopo
| "Brah" -> `Brah
| "Brai" -> `Brai
| "Bugi" -> `Bugi
| "Buhd" -> `Buhd
| "Cakm" -> `Cakm
| "Cans" -> `Cans
| "Cari" -> `Cari
| "Cham" -> `Cham
| "Cher" -> `Cher
| "Chrs" -> `Chrs
| "Copt" -> `Copt
| "Cpmn" -> `Cpmn
| "Cprt" -> `Cprt
| "Cyrl" -> `Cyrl
| "Deva" -> `Deva
| "Diak" -> `Diak
| "Dogr" -> `Dogr
| "Dsrt" -> `Dsrt
| "Dupl" -> `Dupl
| "Egyp" -> `Egyp
| "Elba" -> `Elba
| "Elym" -> `Elym
| "Ethi" -> `Ethi
| "Gara" -> `Gara
| "Geor" -> `Geor
| "Glag" -> `Glag
| "Gong" -> `Gong
| "Gonm" -> `Gonm
| "Goth" -> `Goth
| "Gran" -> `Gran
| "Grek" -> `Grek
| "Gujr" -> `Gujr
| "Gukh" -> `Gukh
| "Guru" -> `Guru
| "Hang" -> `Hang
| "Hani" -> `Hani
| "Hano" -> `Hano
| "Hatr" -> `Hatr
| "Hebr" -> `Hebr
| "Hira" -> `Hira
| "Hluw" -> `Hluw
| "Hmng" -> `Hmng
| "Hmnp" -> `Hmnp
| "Hrkt" -> `Hrkt
| "Hung" -> `Hung
| "Ital" -> `Ital
| "Java" -> `Java
| "Kali" -> `Kali
| "Kana" -> `Kana
| "Kawi" -> `Kawi
| "Khar" -> `Khar
| "Khmr" -> `Khmr
| "Khoj" -> `Khoj
| "Knda" -> `Knda
| "Krai" -> `Krai
| "Kthi" -> `Kthi
| "Kits" -> `Kits
| "Lana" -> `Lana
| "Laoo" -> `Laoo
| "Latn" -> `Latn
| "Lepc" -> `Lepc
| "Limb" -> `Limb
| "Lina" -> `Lina
| "Linb" -> `Linb
| "Lisu" -> `Lisu
| "Lyci" -> `Lyci
| "Lydi" -> `Lydi
| "Mahj" -> `Mahj
| "Maka" -> `Maka
| "Mand" -> `Mand
| "Mani" -> `Mani
| "Marc" -> `Marc
| "Medf" -> `Medf
| "Mend" -> `Mend
|
(* Arms of [i_script], "Merc" through "Wcho": each four-letter code maps
   to the variant of the same name. *)
"Merc" -> `Merc
| "Mero" -> `Mero
| "Mlym" -> `Mlym
| "Modi" -> `Modi
| "Mong" -> `Mong
| "Mroo" -> `Mroo
| "Mtei" -> `Mtei
| "Mult" -> `Mult
| "Mymr" -> `Mymr
| "Nagm" -> `Nagm
| "Nand" -> `Nand
| "Narb" -> `Narb
| "Nbat" -> `Nbat
| "Newa" -> `Newa
| "Nkoo" -> `Nkoo
| "Nshu" -> `Nshu
| "Ogam" -> `Ogam
| "Olck" -> `Olck
| "Onao" -> `Onao
| "Orkh" -> `Orkh
| "Orya" -> `Orya
| "Osge" -> `Osge
| "Osma" -> `Osma
| "Ougr" -> `Ougr
| "Palm" -> `Palm
| "Pauc" -> `Pauc
| "Perm" -> `Perm
| "Phag" -> `Phag
| "Phli" -> `Phli
| "Phlp" -> `Phlp
| "Phnx" -> `Phnx
| "Plrd" -> `Plrd
| "Prti" -> `Prti
| "Qaai" -> `Qaai
| "Rjng" -> `Rjng
| "Rohg" -> `Rohg
| "Runr" -> `Runr
| "Samr" -> `Samr
| "Sarb" -> `Sarb
| "Saur" -> `Saur
| "Sgnw" -> `Sgnw
| "Shaw" -> `Shaw
| "Shrd" -> `Shrd
| "Sidd" -> `Sidd
| "Sidt" -> `Sidt
| "Sind" -> `Sind
| "Sinh" -> `Sinh
| "Sogd" -> `Sogd
| "Sogo" -> `Sogo
| "Sora" -> `Sora
| "Soyo" -> `Soyo
| "Sund" -> `Sund
| "Sunu" -> `Sunu
| "Sylo" -> `Sylo
| "Syrc" -> `Syrc
| "Tagb" -> `Tagb
| "Takr" -> `Takr
| "Tale" -> `Tale
| "Talu" -> `Talu
| "Taml" -> `Taml
| "Tang" -> `Tang
| "Tavt" -> `Tavt
| "Tayo" -> `Tayo
| "Telu" -> `Telu
| "Tfng" -> `Tfng
| "Tglg" -> `Tglg
| "Thaa" -> `Thaa
| "Thai" -> `Thai
| "Tibt" -> `Tibt
| "Tirh" -> `Tirh
| "Tnsa" -> `Tnsa
| "Todr" -> `Todr
| "Tols" -> `Tols
| "Toto" -> `Toto
| "Tutg" -> `Tutg
| "Ugar" -> `Ugar
| "Vaii" -> `Vaii
| "Vith" -> `Vith
| "Wara" -> `Wara
| "Wcho" -> `Wcho
|
(* Final arms of [i_script]; unknown codes go to [err (err_att_val v)]. *)
"Xpeo" -> `Xpeo
| "Xsux" -> `Xsux
| "Yezi" -> `Yezi
| "Yiii" -> `Yiii
| "Zanb" -> `Zanb
| "Zinh" -> `Zinh
| "Zyyy" -> `Zyyy
| "Zzzz" -> `Zzzz
| v -> err (err_att_val v)
end

(* [i_script_seq v] splits [v] on ' ', decodes every piece with [i_script],
   unwraps each result with [o_script] and wraps the resulting list in
   [Script_extensions_v]. Registered for the "scx" attribute in
   [add_prop]. *)
let i_script_seq v =
  let script v = o_script (i_script v) in
  Script_extensions_v (List.map script (split_string v ' '))

(* [i_sentence_break v] wraps the variant spelled by [v] in
   [Sentence_break_v]; unknown strings go to [err]. Registered for the "SB"
   attribute in [add_prop]. *)
let i_sentence_break v = Sentence_break_v begin match v with
| "AT" -> `AT
| "CL" -> `CL
| "CR" -> `CR
| "EX" -> `EX
| "FO" -> `FO
| "LE" -> `LE
| "LF" -> `LF
| "LO" -> `LO
| "NU" -> `NU
| "SC" -> `SC
| "SE" -> `SE
| "SP" -> `SP
| "ST" -> `ST
| "UP" -> `UP
| "XX" -> `XX
| v -> err (err_att_val v)
end

(* [i_string v] wraps [v] unchanged in [String_v]. *)
let i_string v = String_v v

(* [i_uax_42_element v] maps the lower-case element name [v] to its
   capitalised variant wrapped in [UAX_42_element_v]; unknown strings go to
   [err]. *)
let i_uax_42_element v = UAX_42_element_v begin match v with
| "reserved" -> `Reserved
| "noncharacter" -> `Noncharacter
| "surrogate" -> `Surrogate
| "char" -> `Char
| s -> err (err_att_val s)
end

(* [i_vertical_orientation v] wraps the variant spelled by [v] in
   [Vertical_orientation_v]; unknown strings go to [err]. Registered for
   the "vo" attribute in [add_prop]. *)
let i_vertical_orientation v = Vertical_orientation_v begin match v with
| "U" -> `U
| "R" -> `R
| "Tu" -> `Tu
| "Tr" -> `Tr
| s -> err (err_att_val s)
end

(* [i_word_break v] wraps the variant spelled by [v] in [Word_break_v];
   unknown strings go to [err]. Registered for the "WB" attribute in
   [add_prop]. *)
let i_word_break v = Word_break_v begin match v with
| "CR" -> `CR
| "DQ" -> `DQ
| "EB" -> `EB
| "EBG" -> `EBG
| "EM" -> `EM
| "EX" -> `EX
| "Extend" -> `Extend
| "FO" -> `FO
| "GAZ" -> `GAZ
| "HL" -> `HL
| "KA" -> `KA
| "LE" -> `LE
| "LF" -> `LF
| "MB" -> `MB
| "ML" -> `ML
| "MN" -> `MN
| "NL" -> `NL
| "NU" -> `NU
| "RI" -> `RI
| "SQ" -> `SQ
| "WSegSpace" -> `WSegSpace
| "XX" -> `XX
| "ZWJ" -> `ZWJ
| v -> err (err_att_val v)
end

(* Ordered key module; it is the argument given to [Map.Make] to build
   [Pmap]. *)
module Pkey = struct type t = key let
compare : key -> key -> int = compare end
(* Maps indexed by property keys. *)
module Pmap = Map.Make (Pkey)
(* A property set: one decoded [value] per property [key]. *)
type props = value Pmap.t
type 'a prop = key * (value -> 'a) (* property key and value projection. *)

(* [find props (k, o)] is [Some (o v)] when [props] binds [k] to [v] and
   [None] when [Not_found] is raised while looking up or projecting. *)
let find props (k, o) = try Some (o (Pmap.find k props)) with Not_found -> None
(* [unknown_prop name] is the property keyed by [Other name], projected
   with [o_string]. *)
let unknown_prop name = (Other name), o_string


(* Non-Unihan, non-Unikemet properties. Each binding pairs a property key
   with the projection that extracts its typed value. *)

let uax_42_element = UAX_42_element, o_uax_42_element (* artefact of Uucd *)

let age = Age, o_age
let alphabetic = Alphabetic, o_bool
let ascii_hex_digit = Ascii_hex_digit, o_bool
let bidi_class = Bidi_class, o_bidi_class
let bidi_control = Bidi_control, o_bool
let bidi_mirrored = Bidi_mirrored, o_bool
let bidi_mirroring_glyph = Bidi_mirroring_glyph, o_cp_opt
let bidi_paired_bracket = Bidi_paired_bracket, o_cp_map
let bidi_paired_bracket_type =
  Bidi_paired_bracket_type, o_bidi_paired_bracket_type

let block = Block, o_block
let canonical_combining_class = Canonical_combining_class, o_int
let cased = Cased, o_bool
let case_folding = Case_folding, o_cps_map
let case_ignorable = Case_ignorable, o_bool
let changes_when_casefolded = Changes_when_casefolded, o_bool
let changes_when_casemapped = Changes_when_casemapped, o_bool
let changes_when_lowercased = Changes_when_lowercased, o_bool
let changes_when_nfkc_casefolded = Changes_when_nfkc_casefolded, o_bool
let changes_when_titlecased = Changes_when_titlecased, o_bool
let changes_when_uppercased = Changes_when_uppercased, o_bool
let composition_exclusion = Composition_exclusion, o_bool
let dash = Dash, o_bool
let decomposition_mapping = Decomposition_mapping, o_cps_map
let decomposition_type = Decomposition_type, o_decomposition_type
let default_ignorable_code_point = Default_ignorable_code_point, o_bool
let
deprecated = Deprecated, o_bool
(* Property accessors, continued: each binding pairs a property key with
   the projection that extracts its typed value. *)
let diacritic = Diacritic, o_bool
let east_asian_width = East_asian_width, o_east_asian_width
let emoji = Emoji, o_bool
let emoji_presentation = Emoji_presentation, o_bool
let emoji_modifier = Emoji_modifier, o_bool
let emoji_modifier_base = Emoji_modifier_base, o_bool
let emoji_component = Emoji_component, o_bool
let equivalent_unified_ideograph = Equivalent_unified_ideograph, o_cp_opt
let extended_pictographic = Extended_pictographic, o_bool
let extender = Extender, o_bool
let full_composition_exclusion = Full_composition_exclusion, o_bool
let general_category = General_category, o_general_category
let grapheme_base = Grapheme_base, o_bool
let grapheme_cluster_break = Grapheme_cluster_break, o_grapheme_cluster_break
let grapheme_extend = Grapheme_extend, o_bool
let hangul_syllable_type = Hangul_syllable_type, o_hangul_syllable_type
let hex_digit = Hex_digit, o_bool
let id_continue = Id_continue, o_bool
let id_compat_math_continue = Id_compat_math_continue, o_bool
let id_compat_math_start = Id_compat_math_start, o_bool
let id_start = Id_start, o_bool
let ideographic = Ideographic, o_bool
let ids_binary_operator = Ids_binary_operator, o_bool
let ids_trinary_operator = Ids_trinary_operator, o_bool
let ids_unary_operator = Ids_unary_operator, o_bool
let indic_conjunct_break = Indic_conjunct_break, o_indic_conjunct_break
let indic_syllabic_category = Indic_syllabic_category, o_indic_syllabic_category
let indic_matra_category = Indic_matra_category, o_indic_matra_category
let indic_positional_category =
  Indic_positional_category, o_indic_positional_category
let jamo_short_name = Jamo_short_name, o_string
let join_control = Join_control, o_bool
let joining_group = Joining_group, o_joining_group
let
joining_type = Joining_type, o_joining_type
(* Property accessors, continued: each binding pairs a property key with
   the projection that extracts its typed value. *)
let line_break = Line_break, o_line_break
let logical_order_exception = Logical_order_exception, o_bool
let lowercase = Lowercase, o_bool
let lowercase_mapping = Lowercase_mapping, o_cps_map
let math = Math, o_bool
let modifier_combining_mark = Modifier_combining_mark, o_bool
let name = Name, o_name
let name_alias = Name_alias, o_name_alias
let nfc_quick_check = Nfc_quick_check, o_bool_maybe
let nfd_quick_check = Nfd_quick_check, o_bool_maybe
let nfkc_quick_check = Nfkc_quick_check, o_bool_maybe
let nfkc_casefold = Nfkc_casefold, o_cps_map
let nfkc_simple_casefold = Nfkc_simple_casefold, o_cps_map
let nfkd_quick_check = Nfkd_quick_check, o_bool_maybe
let noncharacter_code_point = Noncharacter_code_point, o_bool
let numeric_type = Numeric_type, o_numeric_type
let numeric_value = Numeric_value, o_numeric_value
let other_alphabetic = Other_alphabetic, o_bool
let other_default_ignorable_code_point =
  Other_default_ignorable_code_point, o_bool

let other_grapheme_extend = Other_grapheme_extend, o_bool
let other_id_continue = Other_id_continue, o_bool
let other_id_start = Other_id_start, o_bool
let other_lowercase = Other_lowercase, o_bool
let other_math = Other_math, o_bool
let other_uppercase = Other_uppercase, o_bool
let pattern_syntax = Pattern_syntax, o_bool
let pattern_white_space = Pattern_white_space, o_bool
let prepended_concatenation_mark = Prepended_concatenation_mark, o_bool
let quotation_mark = Quotation_mark, o_bool
let radical = Radical, o_bool
let regional_indicator = Regional_indicator, o_bool
let script = Script, o_script
let script_extensions = Script_extensions, o_script_extensions
let sentence_break = Sentence_break, o_sentence_break
let
simple_case_folding = Simple_case_folding, o_cp_map
(* Property accessors, continued: each binding pairs a property key with
   the projection that extracts its typed value. *)
let simple_lowercase_mapping = Simple_lowercase_mapping, o_cp_map
let simple_titlecase_mapping = Simple_titlecase_mapping, o_cp_map
let simple_uppercase_mapping = Simple_uppercase_mapping, o_cp_map
let soft_dotted = Soft_dotted, o_bool
let sterm = Sterm, o_bool
let terminal_punctuation = Terminal_punctuation, o_bool
let titlecase_mapping = Titlecase_mapping, o_cps_map
let unicode_1_name = Unicode_1_name, o_string
let unified_ideograph = Unified_ideograph, o_bool
let uppercase = Uppercase, o_bool
let uppercase_mapping = Uppercase_mapping, o_cps_map
let variation_selector = Variation_selector, o_bool
let vertical_orientation = Vertical_orientation, o_vertical_orientation
let white_space = White_space, o_bool
let word_break = Word_break, o_word_break
let xid_continue = Xid_continue, o_bool
let xid_start = Xid_start, o_bool

(* Unihan properties. All of them are projected with [o_string]. *)

let kAccountingNumeric = KAccountingNumeric, o_string
let kAlternateHanYu = KAlternateHanYu, o_string
let kAlternateJEF = KAlternateJEF, o_string
let kAlternateKangXi = KAlternateKangXi, o_string
let kAlternateMorohashi = KAlternateMorohashi, o_string
let kAlternateTotalStrokes = KAlternateTotalStrokes, o_string
let kBigFive = KBigFive, o_string
let kCCCII = KCCCII, o_string
let kCNS1986 = KCNS1986, o_string
let kCNS1992 = KCNS1992, o_string
let kCangjie = KCangjie, o_string
let kCantonese = KCantonese, o_string
let kCheungBauer = KCheungBauer, o_string
let kCheungBauerIndex = KCheungBauerIndex, o_string
let kCihaiT = KCihaiT, o_string
let kCompatibilityVariant = KCompatibilityVariant, o_string
let kCowles = KCowles, o_string
let kDaeJaweon = KDaeJaweon, o_string
let kDefinition = KDefinition, o_string
(* Unihan property accessors, continued: each pairs a key with
   [o_string]. *)
let kEACC = KEACC, o_string
let kFanqie = KFanqie, o_string
let kFenn = KFenn, o_string
let kFennIndex = KFennIndex, o_string
let kFourCornerCode = KFourCornerCode, o_string
let kFrequency = KFrequency, o_string
let kGB0 = KGB0, o_string
let kGB1 = KGB1, o_string
let kGB3 = KGB3, o_string
let kGB5 = KGB5, o_string
let kGB8 = KGB8, o_string
let kGSR = KGSR, o_string
let kGradeLevel = KGradeLevel, o_string
let kHDZRadBreak = KHDZRadBreak, o_string
let kHKGlyph = KHKGlyph, o_string
let kHKSCS = KHKSCS, o_string
let kHanYu = KHanYu, o_string
let kHangul = KHangul, o_string
let kHanyuPinlu = KHanyuPinlu, o_string
let kHanyuPinyin = KHanyuPinyin, o_string
let kIBMJapan = KIBMJapan, o_string
let kIICore = KIICore, o_string
let kIRGDaeJaweon = KIRGDaeJaweon, o_string
let kIRGDaiKanwaZiten = KIRGDaiKanwaZiten, o_string
let kIRGHanyuDaZidian = KIRGHanyuDaZidian, o_string
let kIRGKangXi = KIRGKangXi, o_string
let kIRG_GSource = KIRG_GSource, o_string
let kIRG_HSource = KIRG_HSource, o_string
let kIRG_JSource = KIRG_JSource, o_string
let kIRG_KPSource = KIRG_KPSource, o_string
let kIRG_KSource = KIRG_KSource, o_string
let kIRG_MSource = KIRG_MSource, o_string
let kIRG_SSource = KIRG_SSource, o_string
let kIRG_TSource = KIRG_TSource, o_string
let kIRG_USource = KIRG_USource, o_string
let kIRG_UKSource = KIRG_UKSource, o_string
let kIRG_VSource = KIRG_VSource, o_string
let kJHJ = KJHJ, o_string
let kJIS0213 = KJIS0213, o_string
let kJapanese = KJapanese, o_string
let kJapaneseKun = KJapaneseKun, o_string
let kJapaneseOn = KJapaneseOn, o_string
let kJinmeiyoKanji = KJinmeiyoKanji, o_string
let kJis0 = KJis0, o_string
let kJis1 = KJis1, o_string
let kJoyoKanji =
  KJoyoKanji, o_string
(* Unihan property accessors, continued: each pairs a key with
   [o_string]. *)
let kKPS0 = KKPS0, o_string
let kKPS1 = KKPS1, o_string
let kKSC0 = KKSC0, o_string
let kKSC1 = KKSC1, o_string
let kKangXi = KKangXi, o_string
let kKarlgren = KKarlgren, o_string
let kKorean = KKorean, o_string
let kKoreanEducationHanja = KKoreanEducationHanja, o_string
let kKoreanName = KKoreanName, o_string
let kLau = KLau, o_string
let kMainlandTelegraph = KMainlandTelegraph, o_string
let kMandarin = KMandarin, o_string
let kMatthews = KMatthews, o_string
let kMeyerWempe = KMeyerWempe, o_string
let kMojiJoho = KMojiJoho, o_string
let kMorohashi = KMorohashi, o_string
let kNelson = KNelson, o_string
let kNSHU_DubenSrc = KNSHU_DubenSrc, o_string
let kNSHU_Reading = KNSHU_Reading, o_string
let kOtherNumeric = KOtherNumeric, o_string
let kPhonetic = KPhonetic, o_string
let kPrimaryNumeric = KPrimaryNumeric, o_string
let kPseudoGB1 = KPseudoGB1, o_string
let kRSAdobe_Japan1_6 = KRSAdobe_Japan1_6, o_string
let kRSJapanese = KRSJapanese, o_string
let kRSKanWa = KRSKanWa, o_string
let kRSKangXi = KRSKangXi, o_string
let kRSKorean = KRSKorean, o_string
let kRSMerged = KRSMerged, o_string
let kRSUnicode = KRSUnicode, o_string
let kSBGY = KSBGY, o_string
let kSemanticVariant = KSemanticVariant, o_string
let kSimplifiedVariant = KSimplifiedVariant, o_string
let kSMSZD2003Index = KSMSZD2003Index, o_string
let kSMSZD2003Readings = KSMSZD2003Readings, o_string
let kSpecializedSemanticVariant = KSpecializedSemanticVariant, o_string
let kSpoofingVariant = KSpoofingVariant, o_string
let kStrange = KStrange, o_string
let kTGH = KTGH, o_string
let kTGHZ2013 = KTGHZ2013, o_string
let kTGT_MergedSrc = KTGT_MergedSrc, o_string
let kTGT_RSUnicode = KTGT_RSUnicode, o_string
(* Unihan property accessors, continued: each pairs a key with
   [o_string]. *)
let kTaiwanTelegraph = KTaiwanTelegraph, o_string
let kTang = KTang, o_string
let kTayNumeric = KTayNumeric, o_string
let kTotalStrokes = KTotalStrokes, o_string
let kTraditionalVariant = KTraditionalVariant, o_string
let kUnihanCore2020 = KUnihanCore2020, o_string
let kVietnamese = KVietnamese, o_string
let kVietnameseNumeric = KVietnameseNumeric, o_string
let kWubi = KWubi, o_string
let kXHC1983 = KXHC1983, o_string
let kXerox = KXerox, o_string
let kZhuang = KZhuang, o_string
let kZhuangNumeric = KZhuangNumeric, o_string
let kZVariant = KZVariant, o_string

(* Unikemet properties. All are projected with [o_string] except
   [kEH_NoMirror] and [kEH_NoRotate], which use [o_bool]. *)

let kEH_Cat = KEH_Cat, o_string
let kEH_Core = KEH_Core, o_string
let kEH_Desc = KEH_Desc, o_string
let kEH_Func = KEH_Func, o_string
let kEH_FVal = KEH_FVal, o_string
let kEH_UniK = KEH_UniK, o_string
let kEH_JSesh = KEH_JSesh, o_string
let kEH_HG = KEH_HG, o_string
let kEH_IFAO = KEH_IFAO, o_string
let kEH_NoMirror = KEH_NoMirror, o_bool
let kEH_NoRotate = KEH_NoRotate, o_bool
let kEH_AltSeq = KEH_AltSeq, o_string


(* Unicode Character Databases *)

(* A pair of code points together with a string; presumably the block's
   range and its name -- confirm against the interface documentation. *)
type block = (cp * cp) * string
(* A string together with a list of code points. *)
type named_sequence = string * cp list
(* A code point list, a string and a list of shaping positions. *)
type standardized_variant =
  cp list * string * [ `Isolate | `Initial | `Medial | `Final ] list

type cjk_radical = string * cp * cp
type do_not_emit = { instead_of : cp list; use : cp list; because : string; }

(* A decoded database: a description string, the per code point property
   sets ([repertoire]) and the auxiliary lists declared above. *)
type t =
  { description : string;
    repertoire : props Cpmap.t;
    blocks : block list;
    named_sequences : named_sequence list;
    provisional_named_sequences : named_sequence list;
    standardized_variants : standardized_variant list;
    cjk_radicals : cjk_radical list;
    do_not_emit : do_not_emit list; }

(* [cp_props db cp] is the property set bound to [cp] in [db.repertoire],
   or [None] when [cp] has no binding. *)
let cp_props db cp
=
  try Some (Cpmap.find cp db.repertoire) with Not_found -> None

(* [cp_prop db cp p] looks up the property set of [cp] in [db.repertoire]
   and projects property [p] from it with [find]; the result is [None]
   when [cp] has no binding. *)
let cp_prop db cp p = try find (Cpmap.find cp db.repertoire) p
with Not_found -> None

(* Decode *)

(* Xml names: expanded names pairing the UCD namespace with a local
   element name. *)

let ns_ucd = "http://www.unicode.org/ns/2003/ucd/1.0"
let n_block = (ns_ucd, "block")
let n_blocks = (ns_ucd, "blocks")
let n_char = (ns_ucd, "char")
let n_cjk_radical = (ns_ucd, "cjk-radical")
let n_cjk_radicals = (ns_ucd, "cjk-radicals")
let n_do_not_emit = (ns_ucd, "do-not-emit")
let n_description = (ns_ucd, "description")
let n_group = (ns_ucd, "group")
let n_instead = (ns_ucd, "instead")
let n_name_alias = (ns_ucd, "name-alias")
let n_named_sequence = (ns_ucd, "named-sequence")
let n_named_sequences = (ns_ucd, "named-sequences")
let n_noncharacter = (ns_ucd, "noncharacter")
let n_provisional_named_sequences = (ns_ucd, "provisional-named-sequences")
let n_repertoire = (ns_ucd, "repertoire")
let n_reserved = (ns_ucd, "reserved")
let n_standardized_variant = (ns_ucd, "standardized-variant")
let n_standardized_variants = (ns_ucd, "standardized-variants")
let n_surrogate = (ns_ucd, "surrogate")
let n_ucd = (ns_ucd, "ucd")

(* Attribute parsing *)

(* Head of [add_prop]: it fills a hash table that maps an XML attribute
   name to a (property key, value decoder) pair. The definition continues
   past this span. *)
let add_prop : value Pmap.t -> Xmlm.attribute -> value Pmap.t =
  let h = Hashtbl.create 500 in
  let map = Hashtbl.add h in
  map "AHex" (Ascii_hex_digit, i_bool);
  map "Alpha" (Alphabetic, i_bool);
  map "Bidi_C" (Bidi_control, i_bool);
  map "Bidi_M" (Bidi_mirrored, i_bool);
  map "Cased" (Cased, i_bool);
  map "CI" (Case_ignorable, i_bool);
  map "CE" (Composition_exclusion, i_bool);
  map "CWCF" (Changes_when_casefolded, i_bool);
  map "CWCM" (Changes_when_casemapped, i_bool);
  map "CWL" (Changes_when_lowercased, i_bool);
  map "CWKCF"
(Changes_when_nfkc_casefolded, i_bool); 2737 | map "CWT" (Changes_when_titlecased, i_bool); 2738 | map "CWU" (Changes_when_uppercased, i_bool); 2739 | map "Comp_Ex" (Full_composition_exclusion, i_bool); 2740 | map "DI" (Default_ignorable_code_point, i_bool); 2741 | map "Dash" (Dash, i_bool); 2742 | map "Dep" (Deprecated, i_bool); 2743 | map "Dia" (Diacritic, i_bool); 2744 | map "EqUIdeo" (Equivalent_unified_ideograph, i_cp_opt); 2745 | map "Ext" (Extender, i_bool); 2746 | map "GCB" (Grapheme_cluster_break, i_grapheme_cluster_break); 2747 | map "Gr_Base" (Grapheme_base, i_bool); 2748 | map "Gr_Ext" (Grapheme_extend, i_bool); 2749 | map "Hex" (Hex_digit, i_bool); 2750 | map "ID_Compat_Math_Continue" (Id_compat_math_continue, i_bool); 2751 | map "ID_Compat_Math_Start" (Id_compat_math_start, i_bool); 2752 | map "IDC" (Id_continue, i_bool); 2753 | map "IDS" (Id_start, i_bool); 2754 | map "IDSB" (Ids_binary_operator, i_bool); 2755 | map "IDST" (Ids_trinary_operator, i_bool); 2756 | map "IDSU" (Ids_unary_operator, i_bool); 2757 | map "Ideo" (Ideographic, i_bool); 2758 | map "InCB" (Indic_conjunct_break, i_indic_conjunct_break); 2759 | map "InSC" (Indic_syllabic_category, i_indic_syllabic_category); 2760 | map "InMC" (Indic_matra_category, i_indic_matra_category); 2761 | map "InPC" (Indic_positional_category, i_indic_positional_category); 2762 | map "JSN" (Jamo_short_name, i_string); 2763 | map "Join_C" (Join_control, i_bool); 2764 | map "LOE" (Logical_order_exception, i_bool); 2765 | map "Lower" (Lowercase, i_bool); 2766 | map "Math" (Math, i_bool); 2767 | map "MCM" (Modifier_combining_mark, i_bool); 2768 | map "NChar" (Noncharacter_code_point, i_bool); 2769 | map "NFC_QC" (Nfc_quick_check, i_bool_maybe); 2770 | map "NFD_QC" (Nfd_quick_check, i_bool_maybe); 2771 | map "NFKC_QC" (Nfkc_quick_check, i_bool_maybe); 2772 | map "NFKC_CF" (Nfkc_casefold, i_cps_map ~empty:true); 2773 | map "NFKC_SCF" (Nfkc_simple_casefold, i_cps_map ~empty:true); 2774 | map "NFKD_QC" 
(Nfkd_quick_check, i_bool_maybe); 2775 | map "OAlpha" (Other_alphabetic, i_bool); 2776 | map "ODI" (Other_default_ignorable_code_point, i_bool); 2777 | map "OGr_Ext" (Other_grapheme_extend, i_bool); 2778 | map "OIDC" (Other_id_continue, i_bool); 2779 | map "OIDS" (Other_id_start, i_bool); 2780 | map "OLower" (Other_lowercase, i_bool); 2781 | map "OMath" (Other_math, i_bool); 2782 | map "OUpper" (Other_uppercase, i_bool); 2783 | map "Pat_Syn" (Pattern_syntax, i_bool); 2784 | map "Pat_WS" (Pattern_white_space, i_bool); 2785 | map "PCM" (Prepended_concatenation_mark, i_bool); 2786 | map "QMark" (Quotation_mark, i_bool); 2787 | map "Radical" (Radical, i_bool); 2788 | map "RI" (Regional_indicator, i_bool); 2789 | map "SB" (Sentence_break, i_sentence_break); 2790 | map "SD" (Soft_dotted, i_bool); 2791 | map "STerm" (Sterm, i_bool); 2792 | map "Term" (Terminal_punctuation, i_bool); 2793 | map "UIdeo" (Unified_ideograph, i_bool); 2794 | map "Upper" (Uppercase, i_bool); 2795 | map "VS" (Variation_selector, i_bool); 2796 | map "vo" (Vertical_orientation, i_vertical_orientation); 2797 | map "WB" (Word_break, i_word_break); 2798 | map "WSpace" (White_space, i_bool); 2799 | map "XIDC" (Xid_continue, i_bool); 2800 | map "XIDS" (Xid_start, i_bool); 2801 | map "age" (Age, i_age); 2802 | map "bc" (Bidi_class, i_bidi_class); 2803 | map "blk" (Block, i_block); 2804 | map "bmg" (Bidi_mirroring_glyph, i_cp_opt); 2805 | map "bpb" (Bidi_paired_bracket, i_cp_map); 2806 | map "bpt" (Bidi_paired_bracket_type, i_bidi_paired_bracket_type); 2807 | map "ccc" (Canonical_combining_class, i_int); 2808 | map "cf" (Case_folding, i_cps_map ~empty:false); 2809 | map "dm" (Decomposition_mapping, (i_cps_map ~empty:true)); 2810 | map "dt" (Decomposition_type, i_decomposition_type); 2811 | map "ea" (East_asian_width, i_east_asian_width); 2812 | map "Emoji" (Emoji, i_bool); 2813 | map "EPres" (Emoji_presentation, i_bool); 2814 | map "EMod" (Emoji_modifier, i_bool); 2815 | map "EBase" (Emoji_modifier_base, 
i_bool); 2816 | map "EComp" (Emoji_component, i_bool); 2817 | map "ExtPict" (Extended_pictographic, i_bool); 2818 | map "gc" (General_category, i_general_category); 2819 | map "hst" (Hangul_syllable_type, i_hangul_syllable_type); 2820 | map "jg" (Joining_group, i_joining_group); 2821 | map "jt" (Joining_type, i_joining_type); 2822 | map "lb" (Line_break, i_line_break); 2823 | map "lc" (Lowercase_mapping, i_cps_map ~empty:false); 2824 | map "na" (Name, i_name); 2825 | map "na1" (Unicode_1_name, i_string); 2826 | map "nt" (Numeric_type, i_numeric_type); 2827 | map "nv" (Numeric_value, i_numeric_value); 2828 | map "sc" (Script, i_script); 2829 | map "scf" (Simple_case_folding, i_cp_map); 2830 | map "scx" (Script_extensions, i_script_seq); 2831 | map "slc" (Simple_lowercase_mapping, i_cp_map); 2832 | map "stc" (Simple_titlecase_mapping, i_cp_map); 2833 | map "suc" (Simple_uppercase_mapping, i_cp_map); 2834 | map "tc" (Titlecase_mapping, i_cps_map ~empty:false); 2835 | map "uax_42_element" (UAX_42_element, i_uax_42_element); (* artefact *) 2836 | map "uc" (Uppercase_mapping, i_cps_map ~empty:false); 2837 | map "kAccountingNumeric" (KAccountingNumeric, i_string); 2838 | map "kAlternateHanYu" (KAlternateHanYu, i_string); 2839 | map "kAlternateJEF" (KAlternateJEF, i_string); 2840 | map "kAlternateKangXi" (KAlternateKangXi, i_string); 2841 | map "kAlternateMorohashi" (KAlternateMorohashi, i_string); 2842 | map "kBigFive" (KBigFive, i_string); 2843 | map "kCCCII" (KCCCII, i_string); 2844 | map "kCNS1986" (KCNS1986, i_string); 2845 | map "kCNS1992" (KCNS1992, i_string); 2846 | map "kCangjie" (KCangjie, i_string); 2847 | map "kCantonese" (KCantonese, i_string); 2848 | map "kCheungBauer" (KCheungBauer, i_string); 2849 | map "kCheungBauerIndex" (KCheungBauerIndex, i_string); 2850 | map "kCihaiT" (KCihaiT, i_string); 2851 | map "kCompatibilityVariant" (KCompatibilityVariant, i_string); 2852 | map "kCowles" (KCowles, i_string); 2853 | map "kDaeJaweon" (KDaeJaweon, i_string); 2854 
| map "kDefinition" (KDefinition, i_string); 2855 | map "kEACC" (KEACC, i_string); 2856 | map "kFanqie" (KFanqie, i_string); 2857 | map "kFenn" (KFenn, i_string); 2858 | map "kFennIndex" (KFennIndex, i_string); 2859 | map "kFourCornerCode" (KFourCornerCode, i_string); 2860 | map "kFrequency" (KFrequency, i_string); 2861 | map "kGB0" (KGB0, i_string); 2862 | map "kGB1" (KGB1, i_string); 2863 | map "kGB3" (KGB3, i_string); 2864 | map "kGB5" (KGB5, i_string); 2865 | map "kGB8" (KGB8, i_string); 2866 | map "kGSR" (KGSR, i_string); 2867 | map "kGradeLevel" (KGradeLevel, i_string); 2868 | map "kHDZRadBreak" (KHDZRadBreak, i_string); 2869 | map "kHKGlyph" (KHKGlyph, i_string); 2870 | map "kHKSCS" (KHKSCS, i_string); 2871 | map "kHanYu" (KHanYu, i_string); 2872 | map "kHangul" (KHangul, i_string); 2873 | map "kHanyuPinlu" (KHanyuPinlu, i_string); 2874 | map "kHanyuPinyin" (KHanyuPinyin, i_string); 2875 | map "kIBMJapan" (KIBMJapan, i_string); 2876 | map "kIICore" (KIICore, i_string); 2877 | map "kIRGDaeJaweon" (KIRGDaeJaweon, i_string); 2878 | map "kIRGDaiKanwaZiten" (KIRGDaiKanwaZiten, i_string); 2879 | map "kIRGHanyuDaZidian" (KIRGHanyuDaZidian, i_string); 2880 | map "kIRGKangXi" (KIRGKangXi, i_string); 2881 | map "kIRG_GSource" (KIRG_GSource, i_string); 2882 | map "kIRG_HSource" (KIRG_HSource, i_string); 2883 | map "kIRG_JSource" (KIRG_JSource, i_string); 2884 | map "kIRG_KPSource" (KIRG_KPSource, i_string); 2885 | map "kIRG_KSource" (KIRG_KSource, i_string); 2886 | map "kIRG_MSource" (KIRG_MSource, i_string); 2887 | map "kIRG_SSource" (KIRG_SSource, i_string); 2888 | map "kIRG_TSource" (KIRG_TSource, i_string); 2889 | map "kIRG_USource" (KIRG_USource, i_string); 2890 | map "kIRG_UKSource" (KIRG_UKSource, i_string); 2891 | map "kIRG_VSource" (KIRG_VSource, i_string); 2892 | map "kJapanese" (KJapanese, i_string); 2893 | map "kJHJ" (KJHJ, i_string); 2894 | map "kJIS0213" (KJIS0213, i_string); 2895 | map "kJapaneseKun" (KJapaneseKun, i_string); 2896 | map "kJapaneseOn" 
(KJapaneseOn, i_string); 2897 | map "kJinmeiyoKanji" (KJinmeiyoKanji, i_string); 2898 | map "kJis0" (KJis0, i_string); 2899 | map "kJis1" (KJis1, i_string); 2900 | map "kJoyoKanji" (KJoyoKanji, i_string); 2901 | map "kKPS0" (KKPS0, i_string); 2902 | map "kKPS1" (KKPS1, i_string); 2903 | map "kKSC0" (KKSC0, i_string); 2904 | map "kKSC1" (KKSC1, i_string); 2905 | map "kKangXi" (KKangXi, i_string); 2906 | map "kKarlgren" (KKarlgren, i_string); 2907 | map "kKorean" (KKorean, i_string); 2908 | map "kKoreanEducationHanja" (KKoreanEducationHanja, i_string); 2909 | map "kKoreanName" (KKoreanName, i_string); 2910 | map "kLau" (KLau, i_string); 2911 | map "kMainlandTelegraph" (KMainlandTelegraph, i_string); 2912 | map "kMandarin" (KMandarin, i_string); 2913 | map "kMatthews" (KMatthews, i_string); 2914 | map "kMeyerWempe" (KMeyerWempe, i_string); 2915 | map "kMorohashi" (KMorohashi, i_string); 2916 | map "kNelson" (KNelson, i_string); 2917 | map "kNSHU_DubenSrc" (KNSHU_DubenSrc, i_string); 2918 | map "kNSHU_Reading" (KNSHU_Reading, i_string); 2919 | map "kOtherNumeric" (KOtherNumeric, i_string); 2920 | map "kPhonetic" (KPhonetic, i_string); 2921 | map "kPrimaryNumeric" (KPrimaryNumeric, i_string); 2922 | map "kPseudoGB1" (KPseudoGB1, i_string); 2923 | map "kRSAdobe_Japan1_6" (KRSAdobe_Japan1_6, i_string); 2924 | map "kRSJapanese" (KRSJapanese, i_string); 2925 | map "kRSKanWa" (KRSKanWa, i_string); 2926 | map "kRSKangXi" (KRSKangXi, i_string); 2927 | map "kRSKorean" (KRSKorean, i_string); 2928 | map "kRSMerged" (KRSMerged, i_string); 2929 | map "kRSUnicode" (KRSUnicode, i_string); 2930 | map "kSBGY" (KSBGY, i_string); 2931 | map "kSemanticVariant" (KSemanticVariant, i_string); 2932 | map "kSimplifiedVariant" (KSimplifiedVariant, i_string); 2933 | map "kSMSZD2003Index" (KSMSZD2003Index, i_string); 2934 | map "kSMSZD2003Readings" (KSMSZD2003Readings, i_string); 2935 | map "kSpecializedSemanticVariant" (KSpecializedSemanticVariant, i_string); 2936 | map "kSpoofingVariant" 
(KSpoofingVariant, i_string);
  map "kTGH" (KTGH, i_string);
  map "kTGHZ2013" (KTGHZ2013, i_string);
  map "kTGT_MergedSrc" (KTGT_MergedSrc, i_string);
  map "kTGT_RSUnicode" (KTGT_RSUnicode, i_string);
  map "kTaiwanTelegraph" (KTaiwanTelegraph, i_string);
  map "kTang" (KTang, i_string);
  map "kTayNumeric" (KTayNumeric, i_string);
  map "kTotalStrokes" (KTotalStrokes, i_string);
  map "kTraditionalVariant" (KTraditionalVariant, i_string);
  map "kVietnamese" (KVietnamese, i_string);
  map "kVietnameseNumeric" (KVietnameseNumeric, i_string);
  map "kWubi" (KWubi, i_string);
  map "kXHC1983" (KXHC1983, i_string);
  map "kXerox" (KXerox, i_string);
  map "kZhuang" (KZhuang, i_string);
  map "kZhuangNumeric" (KZhuangNumeric, i_string);
  map "kZVariant" (KZVariant, i_string);
  map "kEH_Cat" (KEH_Cat, i_string);
  map "kEH_Core" (KEH_Core, i_string);
  map "kEH_Desc" (KEH_Desc, i_string);
  map "kEH_Func" (KEH_Func, i_string);
  map "kEH_FVal" (KEH_FVal, i_string);
  map "kEH_UniK" (KEH_UniK, i_string);
  map "kEH_JSesh" (KEH_JSesh, i_string);
  map "kEH_HG" (KEH_HG, i_string);
  map "kEH_IFAO" (KEH_IFAO, i_string);
  map "kEH_NoMirror" (KEH_NoMirror, i_bool);
  map "kEH_NoRotate" (KEH_NoRotate, i_bool);
  map "kEH_AltSeq" (KEH_AltSeq, i_string);
  (* An attribute in the empty namespace is looked up in [h] and stored
     under the key found there, converted by the associated function.
     When the lookup raises [Not_found], or the attribute is namespaced,
     it is stored under [Other n] after conversion with [i_string]. *)
  fun m (n, v) ->
    try match n with
    | ("", p) ->
        let k, conv = Hashtbl.find h p in
        Pmap.add k (conv v) m
    | _ -> raise Not_found
    with Not_found -> Pmap.add (Other n) (i_string v) m

(* [attv n atts] is the value bound to the attribute named [n] in the
   empty namespace. The first matching attribute wins; if there is none
   the result is that of [err_miss_att n]. *)
let attv n atts =
  try List.assoc ("", n) atts with Not_found -> err_miss_att n

(* Discards signals up to and including the [`El_end] that closes an
   element whose [`El_start] has already been input. *)
let skip_el d =
  let rec drain nesting = match Xmlm.input d with
  | `El_start _ -> drain (nesting + 1)
  | `El_end -> if nesting <> 0 then drain (nesting - 1)
  | `Data _ | `Dtd _ -> drain nesting
  in
  drain 0

(* [p_seq n p_atts d] reads the children of the current element up to
   and including its [`El_end]. Each child named [n] must be empty and is
   converted from its attributes with [p_atts]; results are returned in
   document order. Children with another name are skipped and character
   data is an error. *)
let p_seq n p_atts d =
  let rec collect items = match Xmlm.input d with
  | `El_start (tag, atts) when tag = n ->
      begin match Xmlm.input d with
      | `El_end -> let item = p_atts atts in collect (item :: items)
      | _ -> err err_exp_el_end
      end
  | `El_start _ -> skip_el d; collect items
  | `El_end -> List.rev items
  | `Data _ -> err err_data
  | _ -> assert false
  in
  collect []

(* Content of the description element: its character data, or [""] when
   the element closes immediately. The closing [`El_end] is consumed. *)
let p_description d = match Xmlm.input d with
| `El_end -> ""
| `Data text ->
    begin match Xmlm.input d with
    | `El_end -> text
    | _ -> err err_exp_el_end
    end
| _ -> err err_exp_data

(* Reads the name alias children of a code point element and returns
   them as [(alias, type)] pairs in document order. The stream is
   inspected with [Xmlm.peek] so that the [`El_end] of the enclosing
   element is left for the caller to input. *)
let p_name_aliases d =
  let rec scan depth acc = match Xmlm.peek d with
  | `El_start (n, atts) when n = n_name_alias ->
      ignore (Xmlm.input d);
      (* Later occurrences of an attribute override earlier ones. *)
      let note (alias, kind) = function
      | ("", "alias"), v -> (v, kind)
      | ("", "type"), v -> (alias, Some (i_name_alias_type v))
      | _ -> (alias, kind)
      in
      begin match List.fold_left note ("", None) atts with
      | _, None -> err err_invalid_name_alias_spec
      | alias, Some kind -> scan (depth + 1) ((alias, kind) :: acc)
      end
  | `El_start _ -> ignore (Xmlm.input d); skip_el d; scan depth acc
  | `El_end ->
      if depth = 0 then List.rev acc else begin
        ignore (Xmlm.input d);
        scan (depth - 1) acc
      end
  | `Data _ -> err err_data
  | _ -> assert false
  in
  scan 0 []

(* [p_cp d rep atts g_props] adds to [rep] the code point, or the range
   of code points, described by [atts]. The attributes [cp], [first-cp]
   and [last-cp] select the code points; every other attribute is added
   with [add_prop] on top of [g_props]. The name aliases read from [d]
   are bound to [Name_alias]. *)
let p_cp d rep atts g_props =
  let step (single, first, last, props) ((name, v) as att) = match name with
  | ("", "cp") -> (Some (cp_of_string v), first, last, props)
  | ("", "first-cp") -> (single, Some (cp_of_string v), last, props)
  | ("", "last-cp") -> (single, first, Some (cp_of_string v), props)
  | _ -> (single, first, last, add_prop props att)
  in
  let single, first, last, props =
    List.fold_left step (None, None, None, g_props) atts
  in
  let aliases = p_name_aliases d in
  let props = Pmap.add Name_alias (Name_alias_v aliases) props in
  match single, first, last with
  | Some cp, _, _ -> Cpmap.add cp props rep
  | None, Some first, Some last ->
      (* Every code point of the inclusive range gets the same map. *)
      let rec fill cp rep =
        if cp > last then rep else fill (cp + 1) (Cpmap.add cp props rep)
      in
      fill first rep
  | None, _, _ -> err err_invalid_cp_spec

(* Reads the children of the repertoire element, up to and including its
   [`El_end], into a code point map. Attributes of a group element are
   used as the initial properties of the code point elements it
   contains. Elements with an unknown name are skipped. *)
let p_repertoire d =
  (* The element kind is recorded as a fake attribute for uniformity. *)
  let eatt t = ("", "uax_42_element"), t in
  let cp_kind n =
    if n = n_reserved then Some "reserved" else
    if n = n_noncharacter then Some "noncharacter" else
    if n = n_surrogate then Some "surrogate" else
    if n = n_char then Some "char" else
    None
  in
  let rec walk depth rep g_props = match Xmlm.input d with
  | `El_start (n, atts) ->
      begin match cp_kind n with
      | Some kind ->
          (* [p_cp] leaves the element's [`El_end] in the stream, hence
             the deeper level. *)
          let rep = p_cp d rep (eatt kind :: atts) g_props in
          walk (depth + 1) rep g_props
      | None when n = n_group ->
          let props = List.fold_left add_prop Pmap.empty atts in
          let rep = walk 0 rep props in
          walk depth rep Pmap.empty (* empty: no group hierarchy *)
      | None -> skip_el d; walk depth rep g_props (* skip foreign *)
      end
  | `El_end -> if depth = 0 then rep else walk (depth - 1) rep g_props
  | `Data _ -> err err_data
  | _ -> assert false
  in
  walk 0 Cpmap.empty Pmap.empty

(* Blocks as [((first-cp, last-cp), name)] values. *)
let p_blocks d =
  let block_of atts =
    let cp key = cp_of_string (attv key atts) in
    (cp "first-cp", cp "last-cp"), attv "name" atts
  in
  p_seq n_block block_of d

(* Named sequences as [(name, cps)] pairs. *)
let p_named_sequences d =
  let named_of atts = (attv "name" atts, cps_of_string (attv "cps" atts)) in
  p_seq n_named_sequence named_of d

(* Standardized variants as [(cps, desc, when)] triples, where [when] is
   the list of contexts obtained by splitting the attribute on spaces. *)
let p_standardized_variants d =
  let context = function
  | "isolate" -> `Isolate
  | "initial" -> `Initial
  | "medial" -> `Medial
  | "final" -> `Final
  | other -> err (err_att_val other)
  in
  let contexts v = List.map context (split_string v ' ') in
  let variant_of atts =
    cps_of_string (attv "cps" atts),
    attv "desc" atts,
    contexts (attv "when" atts)
  in
  p_seq n_standardized_variant variant_of d

(* CJK radicals as [(number, radical, ideograph)] triples. *)
let p_cjk_radicals d =
  let radical_of atts =
    let cp key = cp_of_string (attv key atts) in
    attv "number" atts, cp "radical", cp "ideograph"
  in
  p_seq n_cjk_radical radical_of d

(* Do not emit entries, one record per [instead] element. *)
let p_do_not_emit d =
  let entry_of atts =
    let cps key = cps_of_string (attv key atts) in
    let instead_of = cps "of" in
    let use = cps "use" in
    let because = attv "because" atts in
    { instead_of = instead_of; use = use; because = because }
  in
  p_seq n_instead entry_of d

(* Reads the children of the root element, whose [`El_start] was already
   input, then checks that the input ends there. Each known section may
   occur at most once; sections that do not occur are empty in the
   result. Elements with an unknown name are skipped. *)
let p_ucd d =
  let description = ref None in
  let repertoire = ref None in
  let blocks = ref None in
  let named_sequences = ref None in
  let provisional_named_sequences = ref None in
  let standardized_variants = ref None in
  let cjk_radicals = ref None in
  let do_not_emit = ref None in
  (* Parses a section into its slot, a second occurrence is an error. *)
  let once n slot parse = match !slot with
  | Some _ -> err (err_dup n)
  | None -> slot := Some (parse d)
  in
  let rec children () = match Xmlm.peek d with
  | `El_end -> ()
  | _ ->
      begin match Xmlm.input d with
      | `El_start (n, _) when n = n_description ->
          once n description p_description
      | `El_start (n, _) when n = n_repertoire ->
          once n repertoire p_repertoire
      | `El_start (n, _) when n = n_blocks ->
          once n blocks p_blocks
      | `El_start (n, _) when n = n_named_sequences ->
          once n named_sequences p_named_sequences
      | `El_start (n, _) when n = n_provisional_named_sequences ->
          once n provisional_named_sequences p_named_sequences
      | `El_start (n, _) when n = n_standardized_variants ->
          once n standardized_variants p_standardized_variants
      | `El_start (n, _) when n = n_cjk_radicals ->
          once n cjk_radicals p_cjk_radicals
      | `El_start (n, _) when n = n_do_not_emit ->
          once n do_not_emit p_do_not_emit
      | `El_start _ -> skip_el d (* foreign markup *)
      | `Data _ -> err err_data
      | _ -> assert false
      end;
      children ()
  in
  children ();
  ignore (Xmlm.input d); (* the root's `El_end *)
  if not (Xmlm.eoi d) then err err_wf;
  let or_default fallback = function Some v -> v | None -> fallback in
  { description = or_default "" !description;
    repertoire = or_default Cpmap.empty !repertoire;
    blocks = or_default [] !blocks;
    named_sequences = or_default [] !named_sequences;
    provisional_named_sequences = or_default [] !provisional_named_sequences;
    standardized_variants = or_default [] !standardized_variants;
    cjk_radicals = or_default [] !cjk_radicals;
    do_not_emit = or_default [] !do_not_emit; }

type src = [ `Channel of in_channel | `String of string ]
type decoder = Xmlm.input

(* [decoder src] is an Xmlm input on [src] created with [~strip:true].
   Strings are read from position [0]. *)
let decoder src =
  let source = match src with
  | `Channel ic -> `Channel ic
  | `String s -> `String (0, s)
  in
  Xmlm.make_input ~strip:true source

(* Both ends of the range are the decoder's current position. *)
let decoded_range d =
  let pos = Xmlm.pos d in
  pos, pos

(* [decode d] inputs the first signal (the `Dtd), expects the start of
   the root element and decodes its content with [p_ucd]. [Failure] and
   [Xmlm.Error] exceptions are turned into [`Error] with a message. *)
let decode d =
  try
    ignore (Xmlm.input d); (* `Dtd *)
    match Xmlm.input d with
    | `El_start (n, _) ->
        if n = n_ucd then `Ok (p_ucd d) else err (err_exp_ucd n)
    | _ -> assert false
  with
  | Xmlm.Error (_, e) -> `Error (Xmlm.error_message e)
  | Failure msg -> `Error msg
--------------------------------------------------------------------------------