├── BRZO ├── .ocp-indent ├── src ├── uunf.mllib ├── uunf_string.mli ├── uunf_string.ml ├── uunf_fmt.ml ├── uunf_tmap.ml ├── uunf_tmapbyte.ml ├── uunf_tmapbool.ml ├── uunf.mli └── uunf.ml ├── .merlin ├── .gitignore ├── doc └── index.mld ├── _tags ├── pkg ├── META └── pkg.ml ├── LICENSE.md ├── test ├── examples.ml ├── unftrip.ml └── test_uunf.ml ├── README.md ├── DEVEL.md ├── opam ├── support ├── generate_data.ml ├── gen.ml └── gen_norm.ml ├── CHANGES.md └── B0.ml /BRZO: -------------------------------------------------------------------------------- 1 | (srcs-x pkg support test) -------------------------------------------------------------------------------- /.ocp-indent: -------------------------------------------------------------------------------- 1 | strict_with=always,match_clause=4,strict_else=never -------------------------------------------------------------------------------- /src/uunf.mllib: -------------------------------------------------------------------------------- 1 | Uunf_fmt 2 | Uunf_tmap 3 | Uunf_tmapbool 4 | Uunf_tmapbyte 5 | Uunf_data 6 | Uunf 7 | Uunf_string -------------------------------------------------------------------------------- /.merlin: -------------------------------------------------------------------------------- 1 | PKG uucd cmdliner uutf b0.kit 2 | S src 3 | S data 4 | S test 5 | S support 6 | B _b0/b/** 7 | B _build/** 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _b0 2 | _build 3 | support/ucd.xml 4 | test/NormalizationTest.txt 5 | tmp 6 | *.native 7 | *.byte 8 | *.install 9 | *~ 10 | \.\#* 11 | \#*# 12 | -------------------------------------------------------------------------------- /doc/index.mld: -------------------------------------------------------------------------------- 1 | {0 Uunf {%html: %%VERSION%%%}} 2 | 3 | Uunf normalizes Unicode text. See {!Uunf} for more details. 4 | 5 | {1:uunf Library [uunf]} 6 | 7 | {!modules: 8 | Uunf 9 | Uunf_string 10 | } 11 | -------------------------------------------------------------------------------- /_tags: -------------------------------------------------------------------------------- 1 | true : bin_annot, safe_string 2 | 3 | <_b0> : -traverse 4 | 5 | : include 6 | 7 | : include 8 | : package(uucd unix) 9 | 10 | : include 11 | : package(uutf), package(cmdliner) 12 | -------------------------------------------------------------------------------- /pkg/META: -------------------------------------------------------------------------------- 1 | description = "Unicode text normalization for OCaml" 2 | version = "%%VERSION_NUM%%" 3 | requires = "" 4 | archive(byte) = "uunf.cma" 5 | archive(native) = "uunf.cmxa" 6 | plugin(byte) = "uunf.cma" 7 | plugin(native) = "uunf.cmxs" 8 | exists_if = "uunf.cma uunf.cmxa" 9 | 10 | package "string" ( 11 | description = "The uunf.string library (deprecated)" 12 | version = "%%VERSION_NUM%%" 13 | requires = "uunf" 14 | exports = "uunf" 15 | warning = "Deprecated, use the uunf library." 16 | ) 17 | -------------------------------------------------------------------------------- /pkg/pkg.ml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ocaml 2 | #use "topfind" 3 | #require "topkg" 4 | open Topkg 5 | 6 | let uutf = Conf.with_pkg "uutf" 7 | let cmdliner = Conf.with_pkg "cmdliner" 8 | 9 | let () = 10 | let opams = 11 | [Pkg.opam_file "opam" ~lint_deps_excluding:(Some ["b0"; "xmlm"; "uucd"])] 12 | in 13 | Pkg.describe "uunf" ~opams @@ fun c -> 14 | let uutf = Conf.value c uutf in 15 | let cmdliner = Conf.value c cmdliner in 16 | Ok [ Pkg.mllib ~api:["Uunf"; "Uunf_string"] "src/uunf.mllib"; 17 | Pkg.bin ~cond:(uutf && cmdliner) "test/unftrip"; 18 | Pkg.test "test/test" ~args:(Cmd.v "test/NormalizationTest.txt"); 19 | Pkg.test "test/examples"; 20 | Pkg.doc "doc/index.mld" ~dst:"odoc-pages/index.mld"; 21 | Pkg.doc "test/examples.ml"; ] 22 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 The uunf programmers 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 | -------------------------------------------------------------------------------- /test/examples.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2012 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: CC0-1.0 4 | ---------------------------------------------------------------------------*) 5 | 6 | let utf_8_normalize nf s = 7 | let rec add buf normalizer v = match Uunf.add normalizer v with 8 | | `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf normalizer `Await 9 | | `Await | `End -> () 10 | in 11 | let rec loop buf s i max normalizer = 12 | if i > max then (add buf normalizer `End; Buffer.contents buf) else 13 | let dec = String.get_utf_8_uchar s i in 14 | add buf normalizer (`Uchar (Uchar.utf_decode_uchar dec)); 15 | loop buf s (i + Uchar.utf_decode_length dec) max normalizer 16 | in 17 | let buf = Buffer.create (String.length s * 3) in 18 | let normalizer = Uunf.create nf in 19 | loop buf s 0 (String.length s - 1) normalizer 20 | -------------------------------------------------------------------------------- /src/uunf_string.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2015 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Unicode text normalization on UTF OCaml strings. 7 | 8 | {!Uunf} functions acting directly on UTF encoded OCaml strings. 9 | 10 | {b Warning.} All these function silently replace malformed encoded Unicode 11 | data by a {!Stdlib.Uchar.rep} character. *) 12 | 13 | (** {1:norm Normalize} *) 14 | 15 | val normalize_utf_8 : Uunf.form -> string -> string 16 | (** [normalize_utf_8 nf s] is the UTF-8 encoded string [s] in normal 17 | form [nf]. *) 18 | 19 | val normalize_utf_16be : Uunf.form -> string -> string 20 | (** [normalize_utf_16be nf s] is the UTF-16BE encoded string [s] in 21 | normal form [nf]. *) 22 | 23 | val normalize_utf_16le : Uunf.form -> string -> string 24 | (** [normalize_utf_16le nf s] is the UTF-16LE encoded string [s] in 25 | normal form [nf]. *) 26 | -------------------------------------------------------------------------------- /src/uunf_string.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2015 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let normalize_utf_x get_utf add_utf nf s = 7 | let rec add add_utf buf normalizer v = match Uunf.add normalizer v with 8 | | `Uchar u -> add_utf buf u; add add_utf buf normalizer `Await 9 | | `Await | `End -> () 10 | in 11 | let rec loop get_utf add_utf buf s i max normalizer = 12 | if i > max then (add add_utf buf normalizer `End; Buffer.contents buf) else 13 | let dec = get_utf s i in 14 | let u = Uchar.utf_decode_uchar dec in 15 | add add_utf buf normalizer (`Uchar u); 16 | loop get_utf add_utf buf s (i + Uchar.utf_decode_length dec) max normalizer 17 | in 18 | let b = Buffer.create (String.length s * 3) in 19 | let normalizer = Uunf.create nf in 20 | loop get_utf add_utf b s 0 (String.length s - 1) normalizer 21 | 22 | let normalize_utf_8 nf s = 23 | normalize_utf_x String.get_utf_8_uchar Buffer.add_utf_8_uchar nf s 24 | 25 | let normalize_utf_16be nf s = 26 | normalize_utf_x String.get_utf_16be_uchar Buffer.add_utf_16be_uchar nf s 27 | 28 | let normalize_utf_16le nf s = 29 | normalize_utf_x String.get_utf_16le_uchar Buffer.add_utf_16le_uchar nf s 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Uunf — Unicode text normalization for OCaml 2 | =========================================== 3 | %%VERSION%% 4 | 5 | Uunf is an OCaml library for normalizing Unicode text. It supports all 6 | Unicode [normalization forms]. The library is independent from any IO 7 | mechanism or Unicode text data structure and it can process text 8 | without a complete in-memory representation. 9 | 10 | Uunf is distributed under the ISC license. It has no dependency. 11 | 12 | [normalization forms]: http://www.unicode.org/reports/tr15/ 13 | 14 | Homepage: 15 | 16 | 17 | ## Installation 18 | 19 | Uunf can be installed with `opam`: 20 | 21 | opam install uunf 22 | opam install uunf cmdliner uutf # For the unftrip tool 23 | 24 | If you don't use `opam` consult the [`opam`](opam) file for build 25 | instructions. 26 | 27 | 28 | ## Documentation 29 | 30 | The documentation can be consulted [online] or via `odig doc uunf`. 31 | 32 | Questions are welcome but better asked on the [OCaml forum] than on 33 | the issue tracker. 34 | 35 | [online]: http://erratique.ch/software/uunf/doc/ 36 | [OCaml forum]: https://discuss.ocaml.org/ 37 | 38 | 39 | ## Sample programs 40 | 41 | The [`unftrip`] tool normalises text provided on standard input. 42 | 43 | See also the [doc examples]. 44 | 45 | [`unftrip`]: test/unftrip.ml 46 | [doc examples]: test/examples.ml 47 | -------------------------------------------------------------------------------- /DEVEL.md: -------------------------------------------------------------------------------- 1 | # New Unicode release 2 | 3 | The file `src/uunf_data.ml` contains generated data. This file needs 4 | to be regenerated on new Unicode releases, as well as the `opam` file. 5 | 6 | In order to do so you need to install an updated version of the [uucd] 7 | OCaml package which is capable of reading the latest XML Unicode 8 | character database. 9 | 10 | You can then bump the Unicode release number at the top of the `B0.ml` 11 | file. Verify that everything is as expected with: 12 | 13 | b0 -- unicode-version 14 | 15 | You should then download a copy of the XML Unicode character database 16 | to the `support/ucd.xml` file which is ignored by git. If you have 17 | `curl` and `unzip` in your `PATH` you can simply issue: 18 | 19 | b0 -- download-ucdxml 20 | 21 | You can now proceed to generate the `src/uunf_data.ml` and update the opam file 22 | file by issuing: 23 | 24 | b0 -- generate-data 25 | b0 -- .opam file > opam 26 | 27 | [uucd]: http://erratique.ch/software/uucd 28 | 29 | # Reference tests 30 | 31 | To test the package on the reference normalization tests of you must 32 | download a copy of the tests to the `test/NormalizationTest.txt` file 33 | which is ignored by git. 34 | 35 | If you have `curl` in your `PATH` you can simply issue: 36 | 37 | b0 -- download-tests 38 | 39 | this downloads the tests for the unicode version mentioned in `B0.ml`. 40 | 41 | You can then check them with: 42 | 43 | b0 test 44 | 45 | -------------------------------------------------------------------------------- /opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | name: "uunf" 3 | synopsis: "Unicode text normalization for OCaml" 4 | description: """\ 5 | Uunf is an OCaml library for normalizing Unicode text. It supports all 6 | Unicode [normalization forms]. The library is independent from any IO 7 | mechanism or Unicode text data structure and it can process text 8 | without a complete in-memory representation. 9 | 10 | Uunf is distributed under the ISC license. It has no dependency. 11 | 12 | [normalization forms]: http://www.unicode.org/reports/tr15/ 13 | 14 | Homepage: """ 15 | maintainer: "Daniel Bünzli " 16 | authors: "The uunf programmers" 17 | license: "ISC" 18 | tags: ["unicode" "text" "normalization" "org:erratique"] 19 | homepage: "https://erratique.ch/software/uunf" 20 | doc: "https://erratique.ch/software/uunf/doc/Uunf" 21 | bug-reports: "https://github.com/dbuenzli/uunf/issues" 22 | depends: [ 23 | "ocaml" {>= "4.14.0"} 24 | "ocamlfind" {build} 25 | "ocamlbuild" {build} 26 | "topkg" {build & >= "1.1.0"} 27 | "uucd" {dev & >= "17.0.0" & < "18.0.0"} 28 | ] 29 | depopts: ["uutf" "cmdliner"] 30 | conflicts: [ 31 | "uutf" {< "1.0.0"} 32 | "cmdliner" {< "1.1.0"} 33 | ] 34 | build: [ 35 | "ocaml" 36 | "pkg/pkg.ml" 37 | "build" 38 | "--dev-pkg" 39 | "%{dev}%" 40 | "--with-uutf" 41 | "%{uutf:installed}%" 42 | "--with-cmdliner" 43 | "%{cmdliner:installed}%" 44 | ] 45 | post-messages: 46 | "If the build fails with \"ocamlopt.opt got signal and exited\", issue 'ulimit -s unlimited' and retry." 47 | {failure & (arch = "ppc64" | arch = "arm64")} 48 | dev-repo: "git+https://erratique.ch/repos/uunf.git" 49 | x-maintenance-intent: ["(latest)"] 50 | -------------------------------------------------------------------------------- /src/uunf_fmt.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2020 The uucp programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let pf = Format.fprintf 7 | let strf = Format.asprintf 8 | let string = Format.pp_print_string 9 | let string_X ppf s = 10 | Format.pp_open_vbox ppf 1; string ppf "\""; 11 | for i = 0 to String.length s - 1 do 12 | if i mod 16 = 0 && i > 0 then pf ppf "\\@\n"; 13 | pf ppf "\\x%02x" (Char.code s.[i]) 14 | done; 15 | string ppf "\""; Format.pp_close_box ppf () 16 | 17 | let string_XN ppf = function "" -> string ppf "snil" | x -> string_X ppf x 18 | let bool = Format.pp_print_bool 19 | let sp = Format.pp_print_space 20 | let semi ppf () = string ppf ";"; sp ppf () 21 | let int = Format.pp_print_int 22 | let iter i ?(sep = sp) pp ppf x = 23 | let fst = ref true in 24 | i (fun v -> (if !fst then fst := false else sep ppf ()); pp ppf v) x 25 | 26 | let as_array i pp ppf = pf ppf "@[<2>[|%a|]@]" (iter i ~sep:semi pp) 27 | let array pp = as_array Array.iter pp 28 | let array_N pp ppf = function [||] -> string ppf "nil" | x -> array pp ppf x 29 | 30 | module R = struct 31 | type _ record = 32 | | [] : unit record 33 | | (::) : 34 | (string * (Format.formatter -> 'a -> unit)) * 'b record -> 35 | ('a -> 'b) record 36 | end 37 | 38 | let record record ppf = 39 | let field name pp_v ppf v = pf ppf "@[<1>%s =@ %a@]" name pp_v v in 40 | let open R in (* 4.03 compat *) 41 | let rec go : type a. (unit -> unit) -> a R.record -> a = fun k -> function 42 | | [] -> pf ppf "@[<2>{ %a }@]" (fun _ -> k) () 43 | | [name, pp_v] -> 44 | fun v -> go (fun () -> k (); field name pp_v ppf v) [] 45 | | (name, pp_v) :: record -> 46 | fun v -> go (fun () -> k (); field name pp_v ppf v; semi ppf ()) record 47 | in 48 | go ignore record 49 | -------------------------------------------------------------------------------- /src/uunf_tmap.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2012 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Trie character maps *) 7 | 8 | type 'a t = 9 | { default : 'a; (* default value. *) 10 | l0 : 'a array array array } (* 0x1FFFFF as 0x1FF - 0xFF - 0xF. *) 11 | 12 | let nil = [||] 13 | let l0_shift = 12 14 | let l0_size = 272 (* 0x10F + 1 *) 15 | let l1_shift = 4 16 | let l1_mask = 0xFF 17 | let l1_size = 256 (* 0xFF + 1 *) 18 | let l2_mask = 0xF 19 | let l2_size = 16 (* 0xF + 1 *) 20 | let get m u = 21 | let l1 = Array.unsafe_get m.l0 (u lsr l0_shift) in 22 | if l1 == nil then m.default else 23 | let l2 = Array.unsafe_get l1 (u lsr l1_shift land l1_mask) in 24 | if l2 == nil then m.default else 25 | Array.unsafe_get l2 (u land l2_mask) 26 | 27 | let create default = { default; l0 = Array.make l0_size nil } 28 | let set m u v = 29 | if v = m.default then () else 30 | let i = u lsr l0_shift in 31 | if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size nil; 32 | let j = u lsr l1_shift land l1_mask in 33 | if m.l0.(i).(j) == nil then m.l0.(i).(j) <- Array.make l2_size m.default; 34 | m.l0.(i).(j).(u land l2_mask) <- v 35 | 36 | let size v_size m = match m.l0 with 37 | | [||] -> 3 + 1 + v_size m.default 38 | | l0 -> 39 | let size = ref (3 + v_size m.default + 1 + Array.length l0) in 40 | for i = 0 to Array.length l0 - 1 do match l0.(i) with 41 | | [||] -> () 42 | | l1 -> 43 | size := !size + (1 + Array.length l1); 44 | for j = 0 to Array.length l1 - 1 do match l1.(j) with 45 | | [||] -> () 46 | | l2 -> 47 | size := !size + (1 + Array.length l2); 48 | for k = 0 to Array.length l2 - 1 do 49 | size := !size + v_size l2.(k) 50 | done; 51 | done; 52 | done; 53 | !size 54 | 55 | let dump pp_v ppf m = 56 | let open Uunf_fmt in 57 | record ["default", pp_v; "l0", pp_v |> array_N |> array_N |> array] 58 | ppf m.default m.l0 59 | -------------------------------------------------------------------------------- /src/uunf_tmapbyte.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2012 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Trie character byte maps *) 7 | 8 | type t = 9 | { default : int; (* default value. *) 10 | l0 : string array array } (* 0x1FFFFF as 0x1FF - 0xF - 0xFF *) 11 | 12 | let nil = [||] 13 | let snil = "" 14 | let l0_shift = 12 15 | let l0_size = 272 (* 0x10F + 1 *) 16 | let l1_shift = 8 17 | let l1_mask = 0xF 18 | let l1_size = 16 (* 0xF + 1 *) 19 | let l2_mask = 0xFF 20 | let l2_size = 256 (* 0xFF + 1 *) 21 | let get m u = 22 | let l1 = Array.get m.l0 (u lsr l0_shift) in 23 | if l1 == nil then m.default else 24 | let l2 = Array.unsafe_get l1 (u lsr l1_shift land l1_mask) in 25 | if l2 == snil then m.default else 26 | Char.code (String.unsafe_get l2 (u land l2_mask)) 27 | 28 | let create default = { default; l0 = Array.make l0_size nil } 29 | let set m u byte = 30 | let l2_make m = Bytes.make l2_size (Char.chr m.default) in 31 | if byte = m.default then () else 32 | let i = u lsr l0_shift in 33 | if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size snil; 34 | let j = u lsr l1_shift land l1_mask in 35 | if m.l0.(i).(j) == snil then 36 | m.l0.(i).(j) <- Bytes.unsafe_to_string (l2_make m); 37 | let k = u land l2_mask in 38 | Bytes.set (Bytes.unsafe_of_string m.l0.(i).(j)) k (Char.unsafe_chr byte) 39 | 40 | let size m = match m.l0 with 41 | | [||] -> 3 + 1 42 | | l0 -> 43 | let size = ref (3 + 1 + Array.length l0) in 44 | for i = 0 to Array.length l0 - 1 do match l0.(i) with 45 | | [||] -> () 46 | | l1 -> 47 | size := !size + 1 + Array.length l1; 48 | for j = 0 to Array.length l1 - 1 do 49 | size := !size + 1 + ((String.length l1.(j) * 8) / Sys.word_size) 50 | done; 51 | done; 52 | !size 53 | 54 | let iter_blobs i m = Array.(iter (iter i)) m.l0 55 | 56 | let dump_pp pp_v ppf m = 57 | let open Uunf_fmt in 58 | record ["default", int; "l0", pp_v |> array_N |> array] 59 | ppf m.default m.l0 60 | 61 | let pp_v = Uunf_fmt.string_XN 62 | let dump = dump_pp pp_v 63 | -------------------------------------------------------------------------------- /support/generate_data.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2015 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Extracts data from the Unicode Character Database *) 7 | 8 | let str = Format.sprintf 9 | let exec = Filename.basename Sys.executable_name 10 | 11 | let ucd_or_die inf = try 12 | let ic = if inf = "-" then stdin else open_in inf in 13 | let d = Uucd.decoder (`Channel ic) in 14 | match Uucd.decode d with 15 | | `Ok db -> db 16 | | `Error e -> 17 | let (l0, c0), (l1, c1) = Uucd.decoded_range d in 18 | Printf.eprintf "%s:%d.%d-%d.%d: %s\n%!" inf l0 c0 l1 c1 e; 19 | exit 1 20 | with Sys_error e -> Printf.eprintf "%s\n%!" e; exit 1 21 | 22 | let process inf outf = 23 | let ucd = (Gen.log "Loading Unicode character database.\n"; ucd_or_die inf) in 24 | let generate pp outf ucd = 25 | try 26 | let oc = if outf = "-" then stdout else open_out outf in 27 | try 28 | let ppf = Format.formatter_of_out_channel oc in 29 | pp ppf ucd; 30 | Format.pp_print_flush ppf (); 31 | close_out oc 32 | with Sys_error _ as e -> close_out oc; raise e 33 | with Sys_error e -> Printf.eprintf "%s\n%!" e; exit 1 34 | in 35 | Gen.log "Note: reported sizes do not take sharing into account.\n"; 36 | generate Gen_norm.pp_mod outf ucd; 37 | () 38 | 39 | let main () = 40 | let usage = str 41 | "Usage: %s [OPTION]... [DBFILE]\n\ 42 | \ Generates data modules from an Unicode character database XML file.\n\ 43 | \ DBFILE defaults to support/ucd.xml\n\ 44 | Options:" exec 45 | in 46 | let inf = ref None in 47 | let set_inf f = 48 | if !inf = None then inf := Some f else 49 | raise (Arg.Bad "only one Unicode character database file can be specified") 50 | in 51 | let outf = ref None in 52 | let set r = Arg.String (fun s -> r := Some s) in 53 | let options = [ 54 | "-o", set outf, " output file, defaults to src/uunf_data.ml"; 55 | ] 56 | in 57 | Arg.parse (Arg.align options) set_inf usage; 58 | let inf = match !inf with None -> "support/ucd.xml" | Some inf -> inf in 59 | let outf = match !outf with None -> "src/uunf_data.ml" | Some outf -> outf in 60 | process inf outf 61 | 62 | let () = main () 63 | -------------------------------------------------------------------------------- /src/uunf_tmapbool.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2012 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Trie character boolean maps *) 7 | 8 | type t = 9 | { default : bool; (* default value. *) 10 | l0 : string array array } (* 0x1FFFFF as 0x1FF - 0xF - 0xFF *) 11 | 12 | let nil = [||] 13 | let snil = "" 14 | let l0_shift = 12 15 | let l0_size = 272 (* 0x10F + 1 *) 16 | let l1_shift = 8 17 | let l1_mask = 0xF 18 | let l1_size = 16 (* 0xF + 1 *) 19 | let l2_mask = 0xFF 20 | let l2_size = 32 (* 0xFF + 1 / 8 *) 21 | let get m u = 22 | let l1 = Array.unsafe_get m.l0 (u lsr l0_shift) in 23 | if l1 == nil then m.default else 24 | let l2 = Array.unsafe_get l1 (u lsr l1_shift land l1_mask) in 25 | if l2 == snil then m.default else 26 | let k = u land l2_mask in 27 | let byte_num = k lsr 3 (* / 8 *) in 28 | let bit_num = k land 7 (* mod 8 *) in 29 | let byte = Char.code (String.unsafe_get l2 byte_num) in 30 | byte land (1 lsl bit_num) > 0 31 | 32 | let create default = { default; l0 = Array.make l0_size nil } 33 | let set m u b = 34 | let l2_make m = Bytes.make l2_size (if m.default then '\xFF' else '\x00') in 35 | if b = m.default then () else 36 | let i = u lsr l0_shift in 37 | if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size snil; 38 | let j = u lsr l1_shift land l1_mask in 39 | if m.l0.(i).(j) == snil then 40 | m.l0.(i).(j) <- Bytes.unsafe_to_string (l2_make m); 41 | let k = u land l2_mask in 42 | let byte_num = k lsr 3 (* / 8 *) in 43 | let bit_num = k land 7 (* mod 8 *) in 44 | let byte = Char.code m.l0.(i).(j).[byte_num] in 45 | let new_byte = 46 | if b then (Char.unsafe_chr (byte lor (1 lsl bit_num))) else 47 | (Char.unsafe_chr (byte land lnot (1 lsl bit_num))) 48 | in 49 | Bytes.set (Bytes.unsafe_of_string m.l0.(i).(j)) byte_num new_byte 50 | 51 | let size m = match m.l0 with 52 | | [||] -> 3 + 1 53 | | l0 -> 54 | let size = ref (3 + 1 + Array.length l0) in 55 | for i = 0 to Array.length l0 - 1 do match l0.(i) with 56 | | [||] -> () 57 | | l1 -> 58 | size := !size + 1 + Array.length l1; 59 | for j = 0 to Array.length l1 - 1 do 60 | size := !size + 1 + ((String.length l1.(j) * 8) / Sys.word_size) 61 | done; 62 | done; 63 | !size 64 | 65 | let iter_blobs i m = Array.(iter (iter i)) m.l0 66 | 67 | let dump_pp pp_v ppf m = 68 | let open Uunf_fmt in 69 | record ["default", bool; "l0", pp_v |> array_N |> array] 70 | ppf m.default m.l0 71 | 72 | let pp_v = Uunf_fmt.string_XN 73 | let dump = dump_pp pp_v 74 | -------------------------------------------------------------------------------- /support/gen.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright 2012 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Extracts normalization data from the Unicode Character Database *) 7 | 8 | let log fmt = Printf.eprintf (fmt ^^ "%!") 9 | let pp = Format.fprintf 10 | let str = Printf.sprintf 11 | let str_of_size s = 12 | let b = s * (Sys.word_size / 8) in 13 | if b < 1_048_576 then str "%.1f Ko" (float b /. 1024.) else 14 | if b < 1_073_741_824 then str "%.1f Mo" (float b /. 1024. ** 2.) else 15 | str "%.1f Go" (float b /. 1024. ** 3.) 16 | 17 | (* Characters *) 18 | 19 | let is_hangul_syllabe u = 0xAC00 <= u && u <= 0xD7A3 20 | 21 | let iter_uchar_ints f = 22 | let rec loop u = 23 | let i = Uchar.to_int u in 24 | if Uchar.equal u Uchar.max then f i else 25 | (f i; loop (Uchar.succ u)) 26 | in 27 | loop Uchar.min 28 | 29 | (* Compact maps from characters to booleans. *) 30 | 31 | let bool_prop_maps prop = 32 | let tm = Uunf_tmapbool.create true in 33 | let fm = Uunf_tmapbool.create false in 34 | let add_uchar u = 35 | let b = prop u in 36 | Uunf_tmapbool.set tm u b; 37 | Uunf_tmapbool.set fm u b; 38 | in 39 | iter_uchar_ints add_uchar; tm, fm 40 | 41 | let assert_bool_prop_maps prop tm fm = 42 | let assert_uchar u = 43 | let fail () = failwith (str "bool prop map failure for U+%04X" u) in 44 | let b = prop u in 45 | if b <> Uunf_tmapbool.get tm u then fail (); 46 | if b <> Uunf_tmapbool.get fm u then fail (); 47 | in 48 | iter_uchar_ints assert_uchar 49 | 50 | (* Compact maps from characters to bytes. *) 51 | 52 | let byte_prop_map ~default prop = 53 | let m = Uunf_tmapbyte.create default in 54 | let add_uchar u = Uunf_tmapbyte.set m u (prop u) in 55 | iter_uchar_ints add_uchar; m 56 | 57 | let assert_byte_prop_map prop m = 58 | let assert_uchar u = 59 | if (prop u) = Uunf_tmapbyte.get m u then () else 60 | failwith (str "byte prop map failure for U+%04X" u) 61 | in 62 | iter_uchar_ints assert_uchar 63 | 64 | (* Compact maps from characters to arbitrary values. *) 65 | 66 | let prop_map ~default prop = 67 | let m = Uunf_tmap.create default in 68 | let add_uchar u = Uunf_tmap.set m u (prop u) in 69 | iter_uchar_ints add_uchar; m 70 | 71 | let assert_prop_map prop m = 72 | let assert_uchar u = 73 | if (prop u) = Uunf_tmap.get m u then () else 74 | failwith (str "prop map failure for U+%04X" u) 75 | in 76 | iter_uchar_ints assert_uchar 77 | 78 | let ucd_get ucd u p pstr = match Uucd.cp_prop ucd u p with 79 | | None -> invalid_arg (str "no %s property for U+%04X" pstr u) 80 | | Some v -> v 81 | 82 | (* Generate a module *) 83 | 84 | let year = (Unix.gmtime (Unix.gettimeofday ())).Unix.tm_year + 1900 85 | 86 | let pp_mod pp_mod ppf m = 87 | pp ppf 88 | "\ 89 | (*--------------------------------------------------------------------------- 90 | Copyright (c) %d The uunf programmers. All rights reserved. 91 | SPDX-License-Identifier: ISC 92 | ---------------------------------------------------------------------------*) 93 | 94 | (* WARNING do not edit. This file was automatically generated. *) 95 | @\n@[%a@]@\n" year pp_mod m 96 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | v17.0.0 2025-09-11 Zagreb 2 | ------------------------- 3 | 4 | - Unicode 17.0.0 support. 5 | 6 | v16.0.0 2024-09-10 Zagreb 7 | ------------------------- 8 | 9 | - Unicode 16.0.0 support. 10 | 11 | v15.1.0 2022-09-15 Zagreb 12 | ------------------------- 13 | 14 | - Unicode 15.1.0 support. 15 | - Requires OCaml 4.14.0. 16 | - The `Uunf_string` module was rewritten to use the standard library 17 | UTF decoders and was moved to the `uunf` library. The `uunf.string` 18 | library is deprecated, it warns on usage and simply requires `uunf`. 19 | - The sample code was rewritten to use the standard library UTF 20 | decoders. 21 | 22 | v15.0.0 2022-09-15 Zagreb 23 | ------------------------- 24 | 25 | - Unicode 15.0.0 support. 26 | 27 | v14.0.0 2021-09-17 Zagreb 28 | ------------------------- 29 | 30 | - Unicode 14.0.0 support. 31 | 32 | v13.0.0 2020-03-11 La Forclaz (VS) 33 | --------------------------------- 34 | 35 | - Unicode 13.0.0 support. 36 | - Require OCaml >= 4.03.0. 37 | 38 | v12.0.0 2019-03-08 La Forclaz (VS) 39 | ---------------------------------- 40 | 41 | - Unicode 12.0.0 support. 42 | 43 | v11.0.0 2018-06-06 Lausanne 44 | --------------------------- 45 | 46 | - Unicode 11.0.0 support. 47 | - Fix bug when normalizer flushes at the end of stream: a spurious 48 | `Await` was returned before the final `End`. 49 | 50 | v10.0.0 2017-06-20 Cambridge (UK) 51 | --------------------------------- 52 | 53 | - Unicode 10.0.0 support 54 | - Fix bug in canonical composition algorithm (used by NFC and NFKC forms). 55 | Thanks to Stephen Dolan for the report. 56 | - Fix regression of `Uucp.ccc` introduced by f4c0363 which went into 57 | v2.0.{0,1}. 58 | 59 | v2.0.1 2016-03-07 La Forclaz (VS) 60 | --------------------------------- 61 | 62 | - OCaml 4.05.0 compatibility (removal of `Uchar.dump`). 63 | 64 | v2.0.0 2016-11-23 Zagreb 65 | ------------------------ 66 | 67 | - Support for Unicode 9.0.0. 68 | - OCaml standard library `Uchar.t` support. 69 | - Removes and substitutes `type Uunf.uchar = int` by the (abstract) 70 | `Uchar.t` type. `Uchar.{of,to}_int` allows to recover the previous 71 | representation. 72 | - Removes `Uunf.is_scalar_value`. `Uchar.is_valid` can be used instead. 73 | - Safe string support. 74 | - Build depend on topkg. 75 | - Relicense from BSD3 to ISC. 76 | 77 | v1.0.0 2015-06-17 Cambridge (UK) 78 | -------------------------------- 79 | 80 | - Updated for Unicode 8.0.0 81 | - `topkg` support 82 | - `Uunf.add` now eventually returns `` `End`` whenever the latter was 83 | encoded and the character stream was entirely output. In most existing 84 | programs this will simply entail to add `` `End`` to the existing 85 | `` `Await`` case in pattern matches on the result of `Uunf.add`. 86 | - Adds the `Uunf_string` library that allows to directly normalize UTF-X 87 | OCaml encoded strings. This library depends on `Uutf`. 88 | - Rewrote the utility `unftrip` to use `Cmdliner` which is now 89 | an optional dependency of the package. The cli interface is 90 | incompatible with previous versions. Support for random 91 | Unicode scalar value generation was removed, use `utftrip` from 92 | the `Uutf` package for that. 93 | - Rewrote the module's data generation to essentially match what is done 94 | in `Uucp`. Much less ugly, no source file `sed`ding. 95 | 96 | v0.9.3 2014-06-16 Cambridge (UK) 97 | -------------------------------- 98 | 99 | - Updated for Unicode 7.0.0 100 | 101 | v0.9.2 2013-10-01 Lausanne 102 | -------------------------- 103 | 104 | - Updated for Unicode 6.3.0 105 | - OPAM friendly workflow and drop OASIS support. 106 | 107 | v0.9.1 2013-01-04 La Forclaz (VS) 108 | --------------------------------- 109 | 110 | - Updated for Unicode 6.2.0. 111 | - Fix Uunf.is_scalar_value always returning false. 112 | - Make the module completely safe for the client. 113 | - Change command line help of unftrip. 114 | 115 | v0.9.0 2012-09-07 Lausanne 116 | -------------------------- 117 | 118 | First release. 119 | -------------------------------------------------------------------------------- /B0.ml: -------------------------------------------------------------------------------- 1 | open B0_kit.V000 2 | open Result.Syntax 3 | 4 | let unicode_version = 17, 0, 0, None (* Adjust on new releases *) 5 | let next_major = B0_version.next_major unicode_version 6 | 7 | (* OCaml library names *) 8 | 9 | let b0_std = B0_ocaml.libname "b0.std" 10 | let unix = B0_ocaml.libname "unix" 11 | let uucd = B0_ocaml.libname "uucd" 12 | let uutf = B0_ocaml.libname "uutf" 13 | let cmdliner = B0_ocaml.libname "cmdliner" 14 | 15 | let uunf = B0_ocaml.libname "uunf" 16 | 17 | (* Libraries *) 18 | 19 | let uunf_lib = B0_ocaml.lib uunf ~doc:"The uunf library" ~srcs:[ `Dir ~/"src" ] 20 | let uunf_string_lib = 21 | B0_ocaml.deprecated_lib ~exports:[uunf] (B0_ocaml.libname "uunf.string") 22 | 23 | (* Data generation. *) 24 | 25 | let generate_data = 26 | let doc = "uunf_data.ml generator" in 27 | let srcs = 28 | [ `Dir ~/"support"; 29 | `File ~/"src/uunf_tmapbool.ml"; 30 | `File ~/"src/uunf_tmapbyte.ml"; 31 | `File ~/"src/uunf_tmap.ml"; 32 | `File ~/"src/uunf_fmt.ml" ] 33 | in 34 | let requires = [ uucd; unix ] in 35 | let meta = B0_meta.(empty |> tag build |> ~~ B0_unit.Action.cwd `Scope_dir) in 36 | B0_ocaml.exe "generate-data" ~doc ~srcs ~requires ~meta 37 | 38 | (* Tools *) 39 | 40 | let unftrip = 41 | let srcs = [ `File ~/"test/unftrip.ml" ] in 42 | let requires = [ cmdliner; uutf; uunf ] in 43 | B0_ocaml.exe "unftrip" ~public:true ~doc:"The unftrip tool" ~srcs ~requires 44 | 45 | (* Tests *) 46 | 47 | let test = 48 | let srcs = [ `File ~/"test/test_uunf.ml" ] in 49 | let meta = 50 | B0_meta.(empty |> tag test |> tag run |> ~~ B0_unit.Action.cwd `Scope_dir) 51 | in 52 | let requires = [b0_std; uunf; cmdliner] in 53 | B0_ocaml.exe "test_uunf" ~srcs ~meta ~requires ~doc:"Test normalization" 54 | 55 | let examples = 56 | let srcs = [ `File ~/"test/examples.ml" ] in 57 | let meta = B0_meta.empty |> B0_meta.(tag test) in 58 | B0_ocaml.exe "examples" ~srcs ~meta ~requires:[uunf] ~doc:"Doc samples" 59 | 60 | (* Actions *) 61 | 62 | let uc_base = "http://www.unicode.org/Public" 63 | 64 | let show_version = 65 | B0_unit.of_action "unicode-version" ~doc:"Show supported unicode version" @@ 66 | fun _ _ ~args:_ -> 67 | Ok (Log.stdout (fun m -> m "%s" (B0_version.to_string unicode_version))) 68 | 69 | let download_tests = 70 | let doc = "Download the UCD normalization tests" in 71 | B0_unit.of_action "download-tests" ~doc @@ fun env _ ~args:_ -> 72 | let version = B0_version.to_string unicode_version in 73 | let test_url = Fmt.str "%s/%s/ucd/NormalizationTest.txt" uc_base version in 74 | let test_file = B0_env.in_scope_dir env ~/"test/NormalizationTest.txt" in 75 | (Log.stdout @@ fun m -> 76 | m "@[Downloading %s@,to %a@]" test_url Fpath.pp test_file); 77 | B0_action_kit.fetch_url env test_url test_file 78 | 79 | let download_ucdxml = 80 | let doc = "Download the ucdxml to support/ucd.xml" in 81 | B0_unit.of_action "download-ucdxml" ~doc @@ fun env _ ~args:_ -> 82 | let* unzip = B0_env.get_cmd env (Cmd.tool "unzip") in 83 | let version = B0_version.to_string unicode_version in 84 | let ucd_url = Fmt.str "%s/%s/ucdxml/ucd.all.grouped.zip" uc_base version in 85 | let ucd_file = B0_env.in_scope_dir env ~/"support/ucd.xml" in 86 | Result.join @@ Os.File.with_tmp_fd @@ fun tmpfile tmpfd -> 87 | (Log.stdout @@ fun m -> 88 | m "@[Downloading %s@,to %a@]" ucd_url Fpath.pp ucd_file); 89 | let* () = B0_action_kit.fetch_url env ucd_url tmpfile in 90 | let stdout = Os.Cmd.out_file ~force:true ~make_path:true ucd_file in 91 | Os.Cmd.run Cmd.(unzip % "-p" %% path tmpfile) ~stdout 92 | 93 | (* Packs *) 94 | 95 | let default = 96 | let meta = 97 | B0_meta.empty 98 | |> ~~ B0_meta.authors ["The uunf programmers"] 99 | |> ~~ B0_meta.maintainers ["Daniel Bünzli "] 100 | |> ~~ B0_meta.homepage "https://erratique.ch/software/uunf" 101 | |> ~~ B0_meta.online_doc "https://erratique.ch/software/uunf/doc/Uunf" 102 | |> ~~ B0_meta.licenses ["ISC"] 103 | |> ~~ B0_meta.repo "git+https://erratique.ch/repos/uunf.git" 104 | |> ~~ B0_meta.issues "https://github.com/dbuenzli/uunf/issues" 105 | |> ~~ B0_meta.description_tags 106 | ["unicode"; "text"; "normalization"; "org:erratique"] 107 | |> ~~ B0_opam.build 108 | {|[["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%" 109 | "--with-uutf" "%{uutf:installed}%" 110 | "--with-cmdliner" "%{cmdliner:installed}%" ]]|} 111 | |> ~~ B0_opam.depopts [ "uutf", ""; "cmdliner", ""] 112 | |> ~~ B0_opam.conflicts 113 | [ "uutf", {|< "1.0.0"|}; 114 | "cmdliner", {|< "1.1.0"|}] 115 | |> ~~ B0_opam.depends 116 | [ "ocaml", {|>= "4.14.0"|}; 117 | "ocamlfind", {|build|}; 118 | "ocamlbuild", {|build|}; 119 | "topkg", {|build & >= "1.1.0"|}; 120 | "uucd", 121 | Fmt.str {|dev & >= "%s" & < "%s"|} 122 | (B0_version.to_string unicode_version) 123 | (B0_version.to_string next_major) 124 | ] 125 | |> ~~ B0_opam.file_addendum 126 | [ `Field ("post-messages", `L (true, [ 127 | `S "If the build fails with \"ocamlopt.opt got signal and \ 128 | exited\", issue 'ulimit -s unlimited' and retry."; 129 | `Raw {|{failure & (arch = "ppc64" | arch = "arm64")}|}])) 130 | ] 131 | |> B0_meta.tag B0_opam.tag 132 | in 133 | B0_pack.make "default" ~doc:"uunf package" ~meta ~locked:true @@ 134 | B0_unit.list () 135 | -------------------------------------------------------------------------------- /test/unftrip.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2012 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let strf = Printf.sprintf 7 | let pp = Format.fprintf 8 | let pp_pos ppf d = pp ppf "%d.%d:(%d,%06X) " 9 | (Uutf.decoder_line d) (Uutf.decoder_col d) (Uutf.decoder_count d) 10 | (Uutf.decoder_byte_count d) 11 | 12 | let pp_malformed ppf bs = 13 | let l = String.length bs in 14 | pp ppf "@[malformed bytes @[("; 15 | if l > 0 then pp ppf "%02X" (Char.code (bs.[0])); 16 | for i = 1 to l - 1 do pp ppf "@ %02X" (Char.code (bs.[i])) done; 17 | pp ppf ")@]@]" 18 | 19 | let pp_dump_uchar ppf u = Format.fprintf ppf "U+%04X" (Uchar.to_int u) 20 | 21 | let exec = Filename.basename Sys.executable_name 22 | let log f = Format.eprintf ("%s: " ^^ f ^^ "@?") exec 23 | 24 | let input_malformed = ref false 25 | let log_malformed inf d bs = 26 | input_malformed := true; 27 | log "%s:%a: %a@." inf pp_pos d pp_malformed bs 28 | 29 | (* Output *) 30 | 31 | let uchar_dump ppf = function 32 | | `End -> () | `Uchar u -> pp ppf "%a@\n" pp_dump_uchar u 33 | 34 | let uchar_encoder enc = 35 | let enc = match enc with `ISO_8859_1 | `US_ASCII -> `UTF_8 36 | | #Uutf.encoding as enc -> enc 37 | in 38 | let e = Uutf.encoder enc (`Channel stdout) in 39 | fun v -> ignore (Uutf.encode e v) 40 | 41 | let out_fun ascii oe = 42 | if ascii then uchar_dump Format.std_formatter else uchar_encoder oe 43 | 44 | (* Trip *) 45 | 46 | let u_rep = `Uchar Uutf.u_rep 47 | let id inf d first_dec out = (* no normalization. *) 48 | let rec loop d = function 49 | | `Uchar _ as v -> out v; loop d (Uutf.decode d) 50 | | `End as v -> out v 51 | | `Malformed bs -> log_malformed inf d bs; out u_rep; loop d (Uutf.decode d) 52 | | `Await -> assert false 53 | in 54 | if Uutf.decoder_removed_bom d then out (`Uchar Uutf.u_bom); 55 | loop d first_dec 56 | 57 | let normalize nf inf d first_dec out = (* normalize to nf. *) 58 | let n = Uunf.create nf in 59 | let rec add v = match Uunf.add n v with 60 | | `Uchar cp as u -> out u; add `Await 61 | | `Await | `End -> () 62 | in 63 | let rec loop d = function 64 | | `Uchar _ as v -> add v; loop d (Uutf.decode d) 65 | | `End as v -> add v; out `End 66 | | `Malformed bs -> log_malformed inf d bs; add u_rep; loop d (Uutf.decode d) 67 | | `Await -> assert false 68 | in 69 | if Uutf.decoder_removed_bom d then add (`Uchar Uutf.u_bom); 70 | loop d first_dec 71 | 72 | let trip nf inf enc ascii = 73 | try 74 | let ic = if inf = "-" then stdin else open_in inf in 75 | let d = Uutf.decoder ?encoding:enc (`Channel ic) in 76 | let first_dec = Uutf.decode d in (* guess encoding if needed. *) 77 | let out = out_fun ascii (Uutf.decoder_encoding d) in 78 | begin match nf with 79 | | None -> id inf d first_dec out 80 | | Some nf -> normalize nf inf d first_dec out 81 | end; 82 | if inf <> "-" then close_in ic; 83 | flush stdout; 84 | with Sys_error e -> log "%s@." e; exit 1 85 | 86 | (* Version *) 87 | 88 | let unicode_version () = Format.printf "%s@." Uunf.unicode_version 89 | 90 | (* Cmd *) 91 | 92 | let do_cmd cmd nf inf enc ascii = match cmd with 93 | | `Unicode_version -> unicode_version () 94 | | `Trip -> trip nf inf enc ascii 95 | 96 | (* Cmdline interface *) 97 | 98 | open Cmdliner 99 | 100 | let cmd = 101 | let doc = "Output supported Unicode version." in 102 | let unicode_version = `Unicode_version, Arg.info ["unicode-version"] ~doc in 103 | Arg.(value & vflag `Trip [unicode_version]) 104 | 105 | let nf_doc = "NORMALIZATION" 106 | let nf = 107 | let docs = nf_doc in 108 | let doc = "Normalization Form D (NFD), canonical decomposition." in 109 | let nfd = Some `NFD, Arg.info ["nfd"] ~doc ~docs in 110 | let doc = "Normalization Form C (NFC), canonical decomposition followed by \ 111 | canonical composition." in 112 | let nfc = Some `NFC, Arg.info ["nfc"] ~doc ~docs in 113 | let doc = "Normalization form KD (NFKD), compatibility decomposition." in 114 | let nfkd = Some `NFKD, Arg.info ["nfkd"] ~doc ~docs in 115 | let doc = "Normalization form KC (NFKC), compatibility decomposition \ 116 | followed by canonical composition." in 117 | let nfkc = Some `NFKC, Arg.info ["nfkc"] ~doc ~docs in 118 | Arg.(value & vflag None [nfd; nfc; nfkd; nfkc]) 119 | 120 | let file = 121 | let doc = "The input file. Reads from stdin if unspecified." in 122 | Arg.(value & pos 0 string "-" & info [] ~doc ~docv:"FILE") 123 | 124 | let enc = 125 | let enc = [ "UTF-8", `UTF_8; "UTF-16", `UTF_16; "UTF-16LE", `UTF_16LE; 126 | "UTF-16BE", `UTF_16BE; "ASCII", `US_ASCII; "latin1", `ISO_8859_1 ] 127 | in 128 | let doc = strf "Input encoding, must %s. If unspecified the encoding is \ 129 | guessed. The output encoding is the same as the input \ 130 | encoding except for ASCII and latin1 where UTF-8 is output." 131 | (Arg.doc_alts_enum enc) 132 | in 133 | Arg.(value & opt (some (enum enc)) None & info [ "e"; "encoding" ] ~doc) 134 | 135 | let ascii = 136 | let doc = "Output the input text as newline (U+000A) separated Unicode 137 | scalar values written in the US-ASCII charset." 138 | in 139 | Arg.(value & flag & info ["a"; "ascii"] ~doc) 140 | 141 | let cmd = 142 | let doc = "normalize Unicode text" in 143 | let man = [ 144 | `S "DESCRIPTION"; 145 | `P "$(tname) inputs Unicode text from stdin and rewrites it to stdout 146 | according to a specified Unicode normalization form (see UAX 15)."; 147 | `P "If no normalization form is specified the character stream is left 148 | intact."; 149 | `P "Invalid byte sequences in the input are reported on stderr and 150 | replaced by the Unicode replacement character (U+FFFD) in the output."; 151 | `S nf_doc; 152 | `S "OPTIONS"; 153 | `S "EXIT STATUS"; 154 | `P "$(tname) exits with one of the following values:"; 155 | `I ("0", "no error occurred"); 156 | `I ("1", "a command line parsing error occurred"); 157 | `I ("2", "the input text was malformed"); 158 | `S "BUGS"; 159 | `P "This program is distributed with the Uunf OCaml library. 160 | See http://erratique.ch/software/uunf for contact information." ] 161 | in 162 | Cmd.v (Cmd.info "unftrip" ~version:"%%VERSION%%" ~doc ~man) 163 | Term.(const do_cmd $ cmd $ nf $ file $ enc $ ascii) 164 | 165 | let main () = match Cmd.eval cmd with 166 | | 0 -> if !input_malformed then exit 2 else exit 0 167 | | c when c = Cmd.Exit.cli_error -> exit 1 168 | | c -> exit c 169 | 170 | let () = if !Sys.interactive then () else main () 171 | -------------------------------------------------------------------------------- /support/gen_norm.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2015 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let strf = Printf.sprintf 7 | let pp = Format.fprintf 8 | let str = Format.asprintf 9 | 10 | (* Structure sharing *) 11 | 12 | let intern (type a) ?eqh iter pp_v ppf x = 13 | let module H = Hashtbl.Make (struct 14 | type t = a 15 | let equal, hash = match eqh with Some fg -> fg | _ -> (=), Hashtbl.hash 16 | end) in 17 | let t = H.create 23 and n = ref 0 in 18 | x |> iter (fun v -> if not (H.mem t v) then begin 19 | let name = str "v%03d" !n in 20 | H.add t v name; incr n; 21 | pp ppf "@[<2>let %s =@ %a@]@\n" name pp_v v 22 | end); 23 | (fun ppf v -> match H.find_opt t v with 24 | | Some name -> pp ppf "%s" name 25 | | None -> pp_v ppf v) 26 | 27 | (* Normalization properties. *) 28 | 29 | let pp_boundary nf ucd ppf nf_quick_check = 30 | Gen.log "%s boundary property as character boolean trie map" nf; 31 | let prop_str = strf "%s_quick_check" nf in 32 | let prop u = match Gen.ucd_get ucd u nf_quick_check prop_str with 33 | | `Maybe | `False -> false 34 | | `True -> (Gen.ucd_get ucd u Uucd.canonical_combining_class "ccc") = 0 35 | in 36 | let tm, fm = Gen.bool_prop_maps prop in 37 | let tm_size, fm_size = Uunf_tmapbool.size tm, Uunf_tmapbool.size fm in 38 | let use_fm = tm_size > fm_size in 39 | Gen.log ", asserting data.\n"; Gen.assert_bool_prop_maps prop tm fm; 40 | Gen.log " boolean trie map (default true) size: %s\n" 41 | (Gen.str_of_size tm_size); 42 | Gen.log " boolean trie map (default false) size: %s\n" 43 | (Gen.str_of_size fm_size); 44 | Gen.log " Using map with default %b.\n\n" (not use_fm); 45 | let m = if use_fm then fm else tm in 46 | let pp_v = intern Uunf_tmapbool.iter_blobs Uunf_tmapbool.pp_v ppf m in 47 | pp ppf "@[<2>let %s_boundary_map =@ %a@]@\n@\n" 48 | nf (Uunf_tmapbool.dump_pp pp_v) m; 49 | () 50 | 51 | let pp_ccc ppf ucd = 52 | Gen.log "ccc property as character byte trie map"; 53 | let prop u = Gen.ucd_get ucd u Uucd.canonical_combining_class "ccc" in 54 | let m = Gen.byte_prop_map ~default:0 prop in 55 | let t_size = Uunf_tmapbyte.size m in 56 | Gen.log ", asserting data.\n"; Gen.assert_byte_prop_map prop m; 57 | Gen.log " trie map size: %s\n\n" (Gen.str_of_size t_size); 58 | let pp_v = intern Uunf_tmapbyte.iter_blobs Uunf_tmapbyte.pp_v ppf m in 59 | pp ppf "@[<2>let ccc_map =@ %a@]@\n@\n" 60 | (Uunf_tmapbyte.dump_pp pp_v) m; 61 | () 62 | 63 | let pp_decomp ppf ucd = 64 | Gen.log "decomposition mapping as trie map"; 65 | let default = Uunf_tmap.nil in 66 | let prop u = 67 | match Gen.ucd_get ucd u Uucd.decomposition_mapping "decomposition mapping" 68 | with 69 | | `Self -> default 70 | | `Cps cps -> 71 | let t = Gen.ucd_get ucd u Uucd.decomposition_type "decomposition_type"in 72 | if Gen.is_hangul_syllabe u then begin 73 | if t <> `Can then invalid_arg (strf "hangul not canon decomp %X" u); 74 | default 75 | end else begin 76 | let d = Array.of_list cps in 77 | let compat = t <> `Can in 78 | if compat then d.(0) <- (1 lsl 24) lor d.(0); 79 | d 80 | end 81 | in 82 | let m = Gen.prop_map ~default prop in 83 | let size_v = function [||] -> 0 | a -> 1 + Array.length a in 84 | let t_size = Uunf_tmap.size size_v m in 85 | let pp_decomp ppf = function 86 | | [||] -> pp ppf "nil" 87 | | a -> 88 | pp ppf "[|@,"; 89 | for i = 0 to Array.length a - 1 do pp ppf "@,0x%X;@," a.(i) done; 90 | pp ppf "@,|]" 91 | in 92 | Gen.log ", asserting data.\n"; Gen.assert_prop_map prop m; 93 | Gen.log " trie map size: %s\n\n" (Gen.str_of_size t_size); 94 | pp ppf "@[<2>let decomp_map =@ %a@]@\n@\n" (Uunf_tmap.dump pp_decomp) m; 95 | () 96 | 97 | module Cpmap = Uucd.Cpmap 98 | 99 | let pp_compose ppf ucd = 100 | Gen.log "composition to primary composites as trie map"; 101 | let m = ref Cpmap.empty in 102 | let add_map cp1 cp2 c = 103 | let l = try Cpmap.find cp1 !m with Not_found -> [] in 104 | m := Cpmap.add cp1 ((cp2, c) :: l) !m 105 | in 106 | let add u = 107 | match Gen.ucd_get ucd u Uucd.decomposition_mapping "decomposition_mapping" 108 | with 109 | | `Self -> () 110 | | `Cps cps -> 111 | let fce = "full_decomposition_exclusion" in 112 | if Gen.ucd_get ucd u Uucd.full_composition_exclusion fce then () else 113 | let t = Gen.ucd_get ucd u Uucd.decomposition_type "decomposition_type"in 114 | if t <> `Can then () else 115 | if Gen.is_hangul_syllabe u then () else 116 | match cps with 117 | | [cp1; cp2] -> add_map cp1 cp2 u 118 | | _ -> invalid_arg (strf "cannot handle composition for %X" u); 119 | in 120 | Gen.iter_uchar_ints add; 121 | let default = Uunf_tmap.nil in 122 | let max_comps = ref 0 in 123 | let prop u = 124 | try 125 | let comps = List.sort compare (Cpmap.find u !m) in 126 | let len = List.length comps in 127 | let a = Array.make (len * 2) 0 in 128 | let set i (cp2, c) = a.(2 * i) <- cp2; a.(2 * i + 1) <- c in 129 | List.iteri set comps; 130 | max_comps := max !max_comps len; 131 | a 132 | with Not_found -> Uunf_tmap.nil 133 | in 134 | let m = Gen.prop_map ~default prop in 135 | let size_v = function [||] -> 0 | a -> 1 + Array.length a in 136 | let t_size = Uunf_tmap.size size_v m in 137 | let pp_d ppf = function 138 | | [||] -> pp ppf "nil" 139 | | a -> 140 | pp ppf "[|@,"; 141 | for i = 0 to Array.length a - 1 do pp ppf "@,0x%X;@," a.(i) done; 142 | pp ppf "@,|]" 143 | in 144 | Gen.log ", asserting data.\n"; Gen.assert_prop_map prop m; 145 | Gen.log " trie map size: %s\n" (Gen.str_of_size t_size); 146 | Gen.log " max num. of possible composition for a base char: %d\n\n" 147 | !max_comps; 148 | pp ppf "@[<2>let compose_map =@ %a@]@\n@\n" (Uunf_tmap.dump pp_d) m; 149 | () 150 | 151 | let pp_version ppf ucd = 152 | let version = match String.split_on_char ' ' ucd.Uucd.description with 153 | | [tok] -> tok 154 | | [_; tok] -> tok 155 | | _ -> ucd.Uucd.description 156 | in 157 | pp ppf "@[<2>let unicode_version = \"%s\"@]@\n@\n" version 158 | 159 | let pp_norms ppf ucd = 160 | pp_version ppf ucd; 161 | pp ppf "open Uunf_tmapbool;;@\n@\n"; 162 | pp_boundary "nfc" ucd ppf Uucd.nfc_quick_check; 163 | pp_boundary "nfd" ucd ppf Uucd.nfd_quick_check; 164 | pp_boundary "nfkc" ucd ppf Uucd.nfkc_quick_check; 165 | pp_boundary "nfkd" ucd ppf Uucd.nfkd_quick_check; 166 | pp ppf "open Uunf_tmapbyte;;@\n@\n"; 167 | pp_ccc ppf ucd; 168 | pp ppf "open Uunf_tmap;;@\n@\n"; 169 | pp_decomp ppf ucd; 170 | pp_compose ppf ucd; 171 | () 172 | 173 | let pp_mod ppf ucd = Gen.pp_mod pp_norms ppf ucd 174 | -------------------------------------------------------------------------------- /src/uunf.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2012 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Unicode text normalization. 7 | 8 | [Uunf] normalizes Unicode text. It supports all Unicode 9 | normalization forms. The module is independent from any IO 10 | mechanism or Unicode text data structure and it can process text 11 | without a complete in-memory representation of the data. 12 | 13 | The supported Unicode version is determined by the {!unicode_version} 14 | value. 15 | 16 | Consult the {{!basics}basics}, {{!limits}limitations} and 17 | {{!examples}examples} of use. 18 | 19 | {3 References} 20 | {ul 21 | {- The Unicode Consortium. 22 | {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}. 23 | (latest version)} 24 | {- Mark Davis. 25 | {e {{:http://www.unicode.org/reports/tr15/}UAX #15 Unicode Normalization 26 | Forms}}. (latest version)} 27 | {- The Unicode Consortium. 28 | {e {{:http://www.unicode.org/charts/normalization/}Normalization charts}. 29 | }}} *) 30 | 31 | (** {1 Normalize} *) 32 | 33 | type form = [ `NFD | `NFC | `NFKD | `NFKC ] 34 | (** The type for normalization forms. 35 | {ul 36 | {- [`NFD] {{:http://www.unicode.org/glossary/#normalization_form_d} 37 | normalization form D}, canonical decomposition.} 38 | {- [`NFC] {{:http://www.unicode.org/glossary/#normalization_form_c} 39 | normalization form C}, canonical decomposition followed by 40 | canonical composition 41 | ({{:http://www.w3.org/TR/charmod-norm/}recommended} for the www).} 42 | {- [`NFKD] {{:http://www.unicode.org/glossary/#normalization_form_kd} 43 | normalization form KD}, compatibility decomposition.} 44 | {- [`NFKC] {{:http://www.unicode.org/glossary/#normalization_form_kc} 45 | normalization form KC}, compatibility decomposition, 46 | followed by canonical composition.}} *) 47 | 48 | type t 49 | (** The type for Unicode text normalizers. *) 50 | 51 | type ret = [ `Uchar of Uchar.t | `End | `Await ] 52 | (** The type for normalizer results. See {!add}. *) 53 | 54 | val create : [< form ] -> t 55 | (** [create nf] is an Unicode text normalizer for the normal form [nf]. *) 56 | 57 | val form : t -> form 58 | (** [form n] is the normalization form of [n]. *) 59 | 60 | val add : t -> [ `Uchar of Uchar.t | `Await | `End ] -> ret 61 | (** [add n v] is: 62 | {ul 63 | {- [`Uchar u] if [u] is the next character in the normalized 64 | sequence. The client must then call [add] with [`Await] 65 | until [`Await] is returned.} 66 | {- [`Await] when the normalizer is ready to add a new 67 | [`Uchar] or [`End].}} 68 | 69 | For [v] use [`Uchar u] to add a new character to the sequence 70 | to normalize and [`End] to signal the end of sequence. After 71 | adding one of these two values, always call [add] with [`Await] 72 | until [`Await] is returned. 73 | 74 | {b Raises.} [Invalid_argument] if [`Uchar ] or [`End] is 75 | added directly after an [`Uchar] was returned by the normalizer 76 | or if an [`Uchar] is added after [`End] was added. *) 77 | 78 | val reset : t -> unit 79 | (** [reset n] resets the normalizer to a state equivalent to the 80 | state of [Uunf.create (Uunf.form n)]. *) 81 | 82 | val copy : t -> t 83 | (** [copy n] is a copy of [n] in its current state. Subsequent 84 | {!add}s on [n] do not affect the copy. *) 85 | 86 | val pp_ret : Format.formatter -> ret -> unit 87 | (** [pp_ret ppf v] prints an unspecified representation of [v] on [ppf]. *) 88 | 89 | (** {1:props Normalization properties} 90 | 91 | These properties are used internally to implement the normalizers. 92 | They are not needed to use the module but are exposed as they may 93 | be useful to implement other algorithms. *) 94 | 95 | val unicode_version : string 96 | (** [unicode_version] is the Unicode version supported by the module. *) 97 | 98 | val ccc : Uchar.t -> int 99 | (** [ccc u] is [u]'s 100 | {{:http://www.unicode.org/glossary/#combining_class}canonical combining 101 | class} value. *) 102 | 103 | val decomp : Uchar.t -> int array 104 | (** [decomp u] is [u]'s 105 | {{:http://www.unicode.org/glossary/#decomposition_mapping}decomposition 106 | mapping}. If the empty array is returned, [u] decomposes to itself. 107 | 108 | The first number in the array contains additional information, it 109 | cannot be used as an {!Uchar.t}. Use {!d_uchar} on the number to get the 110 | actual character and {!d_compatibility} to find out if this is 111 | a compatibility decomposition. All other characters of the array 112 | are guaranteed to be convertible using {!Uchar.of_int}. 113 | 114 | {b Warning.} Do {b not} mutate the array. *) 115 | 116 | val d_uchar : int -> Uchar.t 117 | (** See {!decomp}. *) 118 | 119 | val d_compatibility : int -> bool 120 | (** See {!decomp}. *) 121 | 122 | val composite : Uchar.t -> Uchar.t -> Uchar.t option 123 | (** [composite u1 u2] is the 124 | {{:http://www.unicode.org/glossary/#primary_composite}primary composite} 125 | canonically equivalent to the sequence [], if any. *) 126 | 127 | (** {1:limits Limitations} 128 | 129 | An [Uunf] normalizer consumes only a small bounded amount of 130 | memory on ordinary, {e meaningful} text. However on legal but {e 131 | degenerate} text like a 132 | {{:http://www.unicode.org/glossary/#starter}starter} followed by 133 | 10'000 combining 134 | {{:http://www.unicode.org/glossary/#nonspacing_mark}non-spacing 135 | marks} it will have to bufferize all the marks (a workaround is 136 | to first convert your input to 137 | {{:http://www.unicode.org/reports/tr15/#Stream_Safe_Text_Format}stream-safe 138 | text format}). *) 139 | 140 | (** {1:basics Basics} 141 | 142 | A normalizer is a stateful filter that inputs a sequence of 143 | characters and outputs an equivalent sequence in the requested 144 | normal form. 145 | 146 | The function {!create} returns a new normalizer for a given normal 147 | form: 148 | {[ 149 | let nfd = Uunf.create `NFD 150 | ]} 151 | To add characters to the sequence to normalize, call {!add} on 152 | [nfd] with [`Uchar _]. To end the sequence, call {!add} on [nfd] 153 | with [`End]. The normalized sequence of characters is returned, 154 | character by character, by the successive calls to {!add}. 155 | 156 | The client and the normalizer must wait on each other to limit 157 | internal buffering: each time the client adds to the sequence by 158 | calling {!add} with [`Uchar] or [`End] it must continue to call 159 | {!add} with [`Await] until the normalizer returns [`Await]. In 160 | practice this leads to the following kind of control flow: 161 | {[ 162 | let rec add acc v = match Uunf.add nfd v with 163 | | `Uchar u -> add (u :: acc) `Await 164 | | `Await | `End -> acc 165 | ]} 166 | For example to normalize the character [U+00E9] (é) with [nfd] to a list 167 | of characters we can write: 168 | {[ 169 | let e_acute = Uchar.of_int 0x00E9 170 | let e_acute_nfd = List.rev (add (add [] (`Uchar e_acute)) `End) 171 | ]} 172 | The next section has more examples. 173 | *) 174 | 175 | (** {1:examples Examples} 176 | 177 | {2:utf8 UTF-8 normalization} 178 | 179 | [utf_8_normalize nf s] is the UTF-8 encoded normal form [nf] of 180 | the UTF-8 encoded string [s]. 181 | {[ 182 | let utf_8_normalize nf s = 183 | let rec add buf normalizer v = match Uunf.add normalizer v with 184 | | `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf normalizer `Await 185 | | `Await | `End -> () 186 | in 187 | let rec loop buf s i max normalizer = 188 | if i > max then (add buf normalizer `End; Buffer.contents buf) else 189 | let dec = String.get_utf_8_uchar s i in 190 | add buf normalizer (`Uchar (Uchar.utf_decode_uchar dec)); 191 | loop buf s (i + Uchar.utf_decode_length dec) max normalizer 192 | in 193 | let buf = Buffer.create (String.length s * 3) in 194 | let normalizer = Uunf.create nf in 195 | loop buf s 0 (String.length s - 1) normalizer 196 | ]} 197 | 198 | Note that this functionality is available directly through 199 | {!Uunf_string.normalize_utf_8} 200 | *) 201 | -------------------------------------------------------------------------------- /src/uunf.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2012 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | type ret = [ `Uchar of Uchar.t | `End | `Await ] 7 | 8 | let pp_ret ppf v = match (v :> ret) with 9 | | `Uchar u -> Format.fprintf ppf "`Uchar U+%04X" (Uchar.to_int u) 10 | | `End -> Format.fprintf ppf "`End" 11 | | `Await -> Format.fprintf ppf "`Await" 12 | 13 | let err_exp_await add = 14 | invalid_arg (Format.asprintf "can't add %a, expected `Await" pp_ret add) 15 | 16 | let err_ended add = 17 | invalid_arg (Format.asprintf "can't add %a, `End already added" pp_ret add) 18 | 19 | (* The normalization process is implemented as described in UAX #15 20 | section 9.1 for normalizing the concatenation of normalized 21 | strings. We detect ranges of characters in the input sequence 22 | enclosed between two characters for which NFX_quick_check=YES *and* 23 | ccc = 0 (6.1.0 wrongly claims that quick_check=YES implies ccc = 0, 24 | we therefore call this property nfx_boundary). Only these ranges 25 | (including the left boundary) need to be bufferized to perform the 26 | normalization process. *) 27 | 28 | (* Characters *) 29 | 30 | let ux_none = max_int (* no char, outside unicode range. *) 31 | let u_dumb = (* placeholder, overwritten. *) 32 | `Uchar (Uchar.of_int 0x0000) 33 | 34 | (* Normalization properties. *) 35 | 36 | let unicode_version = Uunf_data.unicode_version 37 | 38 | let nfc_boundary u = Uunf_tmapbool.get Uunf_data.nfc_boundary_map u 39 | let nfd_boundary u = Uunf_tmapbool.get Uunf_data.nfd_boundary_map u 40 | let nfkc_boundary u = Uunf_tmapbool.get Uunf_data.nfkc_boundary_map u 41 | let nfkd_boundary u = Uunf_tmapbool.get Uunf_data.nfkd_boundary_map u 42 | let _ccc u = Uunf_tmapbyte.get Uunf_data.ccc_map u 43 | let ccc u = _ccc (Uchar.to_int u) 44 | let decomp_prop u = Uunf_tmap.get Uunf_data.decomp_map u 45 | let compose_prop u = Uunf_tmap.get Uunf_data.compose_map u 46 | 47 | module H = struct (* Hangul arithmetic constants. *) 48 | let sbase = 0xAC00 49 | let lbase = 0x1100 50 | let vbase = 0x1161 51 | let tbase = 0x11A7 52 | let scount = 11172 53 | let lcount = 19 54 | let vcount = 21 55 | let tcount = 28 56 | let ncount = 588 57 | let scount = 11172 58 | end 59 | 60 | let decomp u = 61 | let u = Uchar.to_int u in 62 | if u < 0xAC00 || 0xD7A3 < u then decomp_prop u else 63 | begin (* LV or LVT hangul composite *) 64 | let sindex = u - H.sbase in 65 | let l = H.lbase + (sindex / H.ncount) in 66 | let v = H.vbase + (sindex mod H.ncount) / H.tcount in 67 | let t = H.tbase + (sindex mod H.tcount) in 68 | if t = H.tbase then [|l; v|] else [|l; v; t|] 69 | end 70 | 71 | (* N.B. to help stream-safe text implementers we *could* use the bits 72 | 25-27 of [(decomp u).(0)] to indicate the number of initial non 73 | starters in the NFKD decomposition of [u] and bits and 28-30 to 74 | indicate the non starter count increment. *) 75 | 76 | let d_compatibility i = i land (1 lsl 24) > 0 77 | let _d_uchar i = i land 0x1FFFFF 78 | let d_uchar i = Uchar.unsafe_of_int (_d_uchar i) 79 | 80 | let _composite u1 u2 = 81 | if 0x1100 <= u1 && u1 <= 0x1112 then 82 | begin 83 | if u2 < 0x1161 || 0x1175 < u2 then ux_none else 84 | let l = u1 - H.lbase in (* LV hangul composite *) 85 | let v = u2 - H.vbase in 86 | H.sbase + l * H.ncount + v * H.tcount 87 | end 88 | else if 0xAC00 <= u1 && u1 <= 0xD788 && (u1 - 0x0AC00) mod H.tcount = 0 then 89 | begin 90 | if u2 < 0x11A8 || u2 > 0x11C3 then ux_none else 91 | (u1 + u2 - H.tbase) (* LVT hangul composite *) 92 | end 93 | else match compose_prop u1 with 94 | | [||] -> ux_none 95 | | a (* [u2; c; u2'; c'; ...] sorted *) -> 96 | let len = Array.length a / 2 in 97 | let i = ref 0 in 98 | try 99 | while (!i < len) do 100 | if a.(!i * 2) = u2 then raise Exit else incr i; 101 | done; 102 | ux_none 103 | with Exit -> (a.(!i * 2 + 1)) 104 | 105 | let composite u1 u2 = 106 | let u = _composite (Uchar.to_int u1) (Uchar.to_int u2) in 107 | if u = ux_none then None else Some (Uchar.unsafe_of_int u) 108 | 109 | (* Normalize *) 110 | 111 | type form = [ `NFC | `NFD | `NFKC | `NFKD ] 112 | type state = (* normalizer state. *) 113 | | Start (* no cp seen yet. *) 114 | | Boundary (* cp with boundary = true found in n.uc, no accumulation yet. *) 115 | | Acc (* accumulate until next cp with boundary = true. *) 116 | | Flush (* next cp with boundary = true found, flush previous data. *) 117 | | End (* end of normalization sequence. *) 118 | 119 | type t = 120 | { form : form; (* normalization form. *) 121 | compat : bool; (* true if compatibility decomposition needed. *) 122 | compose : bool; (* true if composition needed. *) 123 | boundary : int -> bool; (* nfx_boundary. *) 124 | mutable state : state; (* normalizer state. *) 125 | mutable uc : [`Uchar of Uchar.t]; (* last cp with boundary = true. *) 126 | mutable acc : int array; (* code point accumulator. *) 127 | mutable first : int; (* index of first code point in acc. *) 128 | mutable last : int; (* index of last code point in acc. *) 129 | mutable is_end : bool;} (* [true] if `End was seen. *) 130 | 131 | let create_acc () = Array.make 35 ux_none 132 | let create form = 133 | let boundary, compat, compose = match form with 134 | | `NFC -> nfc_boundary, false, true 135 | | `NFD -> nfd_boundary, false, false 136 | | `NFKC -> nfkc_boundary, true, true 137 | | `NFKD -> nfkd_boundary, true, false 138 | in 139 | { form = (form :> form); compat; compose; boundary; state = Start; 140 | uc = u_dumb; acc = create_acc (); first = 0; last = -1; is_end = false} 141 | 142 | let get_u n = let `Uchar u = n.uc in Uchar.to_int u 143 | let acc_empty n = n.first > n.last 144 | let form n = n.form 145 | let copy n = { n with acc = Array.copy n.acc } 146 | let reset n = 147 | n.state <- Start; n.uc <- u_dumb; n.acc <- create_acc (); 148 | n.first <- 0; n.last <- -1; n.is_end <- false 149 | 150 | let grow_acc n = 151 | let len = Array.length n.acc in 152 | let acc' = Array.make (2 * len) ux_none in 153 | Array.blit n.acc 0 acc' 0 len; n.acc <- acc' 154 | 155 | let ordered_add n u = (* canonical ordering algorithm via insertion sort. *) 156 | n.last <- n.last + 1; if n.last = Array.length n.acc then grow_acc n; 157 | let c = _ccc u in 158 | if c = 0 then n.acc.(n.last) <- u else 159 | begin 160 | let i = ref (n.last - 1) in 161 | while (!i >= 0 && _ccc (n.acc.(!i)) > c) do 162 | n.acc.(!i + 1) <- n.acc.(!i); decr i; (* shift right. *) 163 | done; 164 | n.acc.(!i + 1) <- u 165 | end 166 | 167 | let rec add n u = 168 | if 0xAC00 <= u && u <= 0xD7A3 then 169 | begin (* LV or LVT hangul composite, copied from decomp to avoid alloc. *) 170 | let sindex = u - H.sbase in 171 | let l = H.lbase + (sindex / H.ncount) in 172 | let v = H.vbase + (sindex mod H.ncount) / H.tcount in 173 | let t = H.tbase + (sindex mod H.tcount) in 174 | if t = H.tbase then (ordered_add n l; ordered_add n v) else 175 | (ordered_add n l; ordered_add n v; ordered_add n t) 176 | end 177 | else 178 | begin match decomp_prop u with 179 | | [||] -> ordered_add n u 180 | | d -> 181 | if d_compatibility d.(0) && not n.compat then ordered_add n u else 182 | begin 183 | add n (_d_uchar d.(0)); 184 | for i = 1 to Array.length d - 1 do add n d.(i) done 185 | end 186 | end 187 | 188 | let compose n = (* canonical composition algorithm. *) 189 | let rec loop ~last_starter ~prev_ccc i = 190 | if i > n.last then () else 191 | let ccc_i = _ccc n.acc.(i) in 192 | let u_comp = _composite n.acc.(last_starter) n.acc.(i) in 193 | match (u_comp = ux_none || (ccc_i = 0 && last_starter <> i - 1)) with 194 | | true -> 195 | let last_starter = if ccc_i = 0 then i else last_starter in 196 | loop ~last_starter ~prev_ccc:ccc_i (i + 1) 197 | | false -> 198 | match prev_ccc <> 0 && prev_ccc >= ccc_i with 199 | | true -> loop ~last_starter ~prev_ccc:ccc_i (i + 1) 200 | | false -> 201 | n.acc.(last_starter) <- u_comp; 202 | Array.blit n.acc (i + 1) n.acc i (n.last - i); 203 | n.last <- n.last - 1; 204 | let prev_ccc = _ccc n.acc.(last_starter) in 205 | loop ~last_starter ~prev_ccc (last_starter + 1) 206 | in 207 | let last_starter = n.first in 208 | let prev_ccc = _ccc n.acc.(last_starter) in 209 | loop ~last_starter ~prev_ccc (last_starter + 1) 210 | 211 | let flush_next n = 212 | let ret = `Uchar (Uchar.unsafe_of_int n.acc.(n.first)) in 213 | if n.first = n.last then (n.first <- 0; n.last <- -1) else 214 | (n.first <- n.first + 1); 215 | ret 216 | 217 | let flush_start n = if n.compose then compose n; flush_next n 218 | let add n = function 219 | | `Uchar u as uc -> 220 | let u = Uchar.to_int u in 221 | begin match n.state with 222 | | Boundary -> 223 | if n.boundary u 224 | then (let prev = n.uc in n.uc <- uc; (prev :> ret)) 225 | else (n.state <- Acc; add n (get_u n); add n u; `Await) 226 | | Acc -> 227 | if n.boundary u 228 | then (n.state <- Flush; n.uc <- uc; flush_start n) 229 | else (add n u; `Await) 230 | | Start -> 231 | if n.boundary u 232 | then (n.state <- Boundary; n.uc <- uc; `Await) 233 | else (n.state <- Acc; add n u; `Await) 234 | | Flush -> err_exp_await uc 235 | | End -> err_ended uc 236 | end 237 | | `Await -> 238 | begin match n.state with 239 | | Flush -> 240 | if not (acc_empty n) then flush_next n else 241 | if n.is_end then (n.state <- End; `End) else 242 | (n.state <- Boundary; `Await) 243 | | Start | Boundary | Acc -> `Await 244 | | End -> `End 245 | end 246 | | `End -> 247 | n.is_end <- true; 248 | begin match n.state with 249 | | Boundary -> n.state <- End; (n.uc :> ret) 250 | | Acc -> n.state <- Flush; flush_start n 251 | | Start -> n.state <- End; `End 252 | | Flush -> err_exp_await `End 253 | | End -> err_ended `End 254 | end 255 | -------------------------------------------------------------------------------- /test/test_uunf.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2012 The uunf programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | open B0_testing 7 | open B0_std 8 | open Result.Syntax 9 | 10 | (* Uunf tests, including Unicode's Normalization Conformance tests *) 11 | 12 | let uchar_dump ppf u = Format.fprintf ppf "U+%04X" (Uchar.to_int u) 13 | 14 | (* Conformance data decoding and tests *) 15 | 16 | type conformance_test = int list array * string (* columns + comment. *) 17 | module Uset = Set.Make (Uchar) (* not a diet set, but will do here. *) 18 | 19 | let uchar_of_string v = (* parses a scalar value. *) 20 | let is_hex c = (0x30 <= c && c <= 0x39) || (0x41 <= c && c <= 0x46) in 21 | let cp = ref 0 in 22 | for k = 0 to (String.length v) - 1 do 23 | let c = Char.code v.[k] in 24 | if not (is_hex c) then failwith "" else 25 | cp := !cp * 16 + (if c <= 0x39 then c - 48 else c - 55) 26 | done; 27 | Uchar.of_int !cp 28 | 29 | let uchars_of_string v = List.map uchar_of_string (String.split_on_char ' ' v) 30 | 31 | let decode_conformance_data test_data_file = 32 | Test.log "Reading test data from %a" (Fmt.code' Fpath.pp) test_data_file; 33 | let split_string sep s = 34 | List.filter (fun s -> s <> "") (String.split_on_char sep s) 35 | in 36 | let rec loop tests collect_decomps decomps = function 37 | | [] -> List.rev tests, decomps 38 | | l :: ls -> 39 | try match split_string '#' l with 40 | | "@Part1 " :: _ -> loop tests true decomps ls 41 | | "@Part2 " :: _ -> loop tests false decomps ls 42 | | p :: _ :: _ when p.[0] = '@' -> loop tests collect_decomps decomps ls 43 | | [] | _ :: [] -> loop tests collect_decomps decomps ls 44 | | test :: comment :: _ -> 45 | begin match split_string ';' test with 46 | | c1 :: c2 :: c3 :: c4 :: c5 :: _ -> 47 | let test = [| uchars_of_string c1; uchars_of_string c2; 48 | uchars_of_string c3; uchars_of_string c4; 49 | uchars_of_string c5; |], comment 50 | in 51 | let decomps = 52 | if not collect_decomps then decomps else 53 | match (fst test).(0) with [ uchar ] -> 54 | Uset.add uchar decomps 55 | | _ -> failwith "" 56 | in 57 | loop (test :: tests) collect_decomps decomps ls 58 | | _ -> failwith "" 59 | end 60 | with Failure _ -> 61 | Test.log "Unable to parse line:\n`%s'\n" l; 62 | loop tests collect_decomps decomps ls 63 | in 64 | let* s = Os.File.read test_data_file in 65 | Ok (loop [] false Uset.empty (String.split_on_char '\n' s)) 66 | 67 | 68 | let conformance = Test.Arg.make () 69 | 70 | let test_conformance_normalizations = 71 | Test.test' conformance "conformance normalization invariants" @@ 72 | fun (tests, _) -> 73 | let nc, nfc = Array.init 5 (fun _ -> Uunf.create `NFC), Array.make 5 [] in 74 | let nd, nfd = Array.init 5 (fun _ -> Uunf.create `NFD), Array.make 5 [] in 75 | let nkc, nfkc = Array.init 5 (fun _ -> Uunf.create `NFKC), Array.make 5 [] in 76 | let nkd, nfkd = Array.init 5 (fun _ -> Uunf.create `NFKD), Array.make 5 [] in 77 | let rec add n acc v = match Uunf.add n v with 78 | | `Uchar u -> add n (u :: acc) `Await 79 | | `Await | `End -> acc 80 | in 81 | let parallel_add i v = 82 | nfc.(i) <- add nc.(i) nfc.(i) v; 83 | nfd.(i) <- add nd.(i) nfd.(i) v; 84 | nfkc.(i) <- add nkc.(i) nfkc.(i) v; 85 | nfkd.(i) <- add nkd.(i) nfkd.(i) v 86 | in 87 | let test (cs, comment) = 88 | for i = 0 to 4 do 89 | Uunf.reset nc.(i); nfc.(i) <- []; 90 | Uunf.reset nd.(i); nfd.(i) <- []; 91 | Uunf.reset nkc.(i); nfkc.(i) <- []; 92 | Uunf.reset nkd.(i); nfkd.(i) <- []; 93 | List.iter (fun u -> parallel_add i (`Uchar u)) cs.(i); 94 | parallel_add i `End; 95 | nfc.(i) <- List.rev nfc.(i); 96 | nfd.(i) <- List.rev nfd.(i); 97 | nfkc.(i) <- List.rev nfkc.(i); 98 | nfkd.(i) <- List.rev nfkd.(i); 99 | done; 100 | if cs.(1) <> nfc.(0) then Test.fail "NFC: c2 <> toNFC(c1) for%s" comment; 101 | if cs.(1) <> nfc.(1) then Test.fail "NFC: c2 <> toNFC(c2) for%s" comment; 102 | if cs.(1) <> nfc.(2) then Test.fail "NFC: c2 <> toNFC(c3) for%s" comment; 103 | if cs.(3) <> nfc.(3) then Test.fail "NFC: c4 <> toNFC(c4) for%s" comment; 104 | if cs.(3) <> nfc.(4) then Test.fail "NFC: c4 <> toNFC(c5) for%s" comment; 105 | if cs.(2) <> nfd.(0) then Test.fail "NFD: c3 <> toNFD(c1) for%s" comment; 106 | if cs.(2) <> nfd.(1) then Test.fail "NFD: c3 <> toNFD(c2) for%s" comment; 107 | if cs.(2) <> nfd.(2) then Test.fail "NFD: c3 <> toNFD(c3) for%s" comment; 108 | if cs.(4) <> nfd.(3) then Test.fail "NFD: c5 <> toNFD(c4) for%s" comment; 109 | if cs.(4) <> nfd.(4) then Test.fail "NFD: c5 <> toNFD(c5) for%s" comment; 110 | if cs.(3) <> nfkc.(0) then Test.fail "NFKC: c4 <> toNFKC(c1) for%s" comment; 111 | if cs.(3) <> nfkc.(1) then Test.fail "NFKC: c4 <> toNFKC(c2) for%s" comment; 112 | if cs.(3) <> nfkc.(2) then Test.fail "NFKC: c4 <> toNFKC(c3) for%s" comment; 113 | if cs.(3) <> nfkc.(3) then Test.fail "NFKC: c4 <> toNFKC(c4) for%s" comment; 114 | if cs.(3) <> nfkc.(4) then Test.fail "NFKC: c4 <> toNFKC(c5) for%s" comment; 115 | if cs.(4) <> nfkd.(0) then Test.fail "NFKD: c5 <> toNFKD(c1) for%s" comment; 116 | if cs.(4) <> nfkd.(1) then Test.fail "NFKD: c5 <> toNFKD(c2) for%s" comment; 117 | if cs.(4) <> nfkd.(2) then Test.fail "NFKD: c5 <> toNFKD(c3) for%s" comment; 118 | if cs.(4) <> nfkd.(3) then Test.fail "NFKD: c5 <> toNFKD(c4) for%s" comment; 119 | if cs.(4) <> nfkd.(4) then Test.fail "NFKD: c5 <> toNFKD(c5) for%s" comment; 120 | in 121 | List.iter test tests 122 | 123 | let test_conformance_non_decomposables = 124 | Test.test' conformance "conformance of non-decomposable characters" @@ 125 | fun (_, decomps) -> 126 | let nc = Uunf.create `NFC in 127 | let nd = Uunf.create `NFD in 128 | let nkc = Uunf.create `NFKC in 129 | let nkd = Uunf.create `NFKD in 130 | let norm n u = 131 | let rec add acc v = match Uunf.add n v with 132 | | `Uchar u -> add (u :: acc) `Await 133 | | `Await | `End -> acc 134 | in 135 | List.rev (add (add [] (`Uchar u)) `End) 136 | in 137 | let check u = 138 | if Uset.mem u decomps then () else 139 | begin 140 | let ul = [u] in 141 | Uunf.reset nc; Uunf.reset nd; Uunf.reset nkc; Uunf.reset nkd; 142 | if norm nc u <> ul then 143 | Test.fail "NFC: %a <> toNFC(%a)" uchar_dump u uchar_dump u; 144 | if norm nd u <> ul then 145 | Test.fail "NFD: %a <> toNFD(%a)" uchar_dump u uchar_dump u; 146 | if norm nkc u <> ul then 147 | Test.fail "NFKC: %a <> toNFKC(%a)" uchar_dump u uchar_dump u; 148 | if norm nkd u <> ul then 149 | Test.fail "NFKD: %a <> toNFKD(%a)" uchar_dump u uchar_dump u; 150 | end 151 | in 152 | (* For each unicode scalar value *) 153 | let rec loop u = 154 | if Uchar.equal Uchar.max u then check u else 155 | (check u; loop (Uchar.succ u)) 156 | in 157 | loop Uchar.min 158 | 159 | let test_ccc = 160 | Test.test "Uunf.ccc" @@ fun () -> 161 | Test.int (Uunf.ccc (Uchar.of_int 0x0020)) 0 ~__POS__; 162 | Test.int (Uunf.ccc (Uchar.of_int 0x0301)) 230 ~__POS__; 163 | () 164 | 165 | let various_norm_tests ?__POS__ test = 166 | Test.block ?__POS__ @@ fun () -> 167 | test [0x1E69] `NFD [0x0073; 0x0323; 0x0307] ; 168 | test [0x1E69] `NFC [0x1E69]; 169 | test [0x1E0B; 0x0323] `NFD [0x0064; 0x0323; 0x0307]; 170 | test [0x1E0B; 0x0323] `NFC [0x1E0D; 0x0307]; 171 | test [0xFB01] `NFD [0xFB01]; 172 | test [0xFB01] `NFC [0xFB01]; 173 | test [0xFB01] `NFKD [0x0066; 0x0069]; 174 | test [0xFB01] `NFKC [0x0066; 0x0069]; 175 | test [0x0032; 0x2075] `NFD [0x0032; 0x2075]; 176 | test [0x0032; 0x2075] `NFC [0x0032; 0x2075]; 177 | test [0x0032; 0x2075] `NFKD [0x0032; 0x0035]; 178 | test [0x0032; 0x2075] `NFKC [0x0032; 0x0035]; 179 | test [0x1E9B; 0x0323] `NFD [0x017F; 0x0323; 0x307]; 180 | test [0x1E9B; 0x0323] `NFC [0x1E9B; 0x0323; ]; 181 | test [0x1E9B; 0x0323] `NFKD [0x0073; 0x0323; 0x0307]; 182 | test [0x1E9B; 0x0323] `NFKC [0x1E69]; 183 | test [0x0041; 0x007A; 0x0335; 0x0327; 0x0324; 0x0301; 0x0041] `NFC 184 | [0x0041; 0x017A; 0x0335; 0x0327; 0x0324; 0x0041]; 185 | (* found by crowbar *) 186 | test [0x01C6; 0x032D] `NFKC [0x0064; 0x017E; 0x032D]; 187 | test [0xFF80; 0x1FD3; 0xFF9E; 0x1FD3;] `NFKC [0x30BF; 0x0390; 0x3099; 0x0390]; 188 | (* found again by crowbar *) 189 | test [0xC100; 0x20D2; 0x11C1; 0x11C1] `NFC [0xC100; 0x20D2; 0x11C1; 0x11C1]; 190 | () 191 | 192 | let test_specific = 193 | Test.test "specific normalizations" @@ fun () -> 194 | let test src nf dst = 195 | let n = Uunf.create nf in 196 | let rec add acc v = match Uunf.add n v with 197 | | `Uchar u -> add (u :: acc) `Await 198 | | `Await | `End -> acc 199 | in 200 | let add_uchar acc u = add acc (`Uchar (Uchar.of_int u)) in 201 | let nseq = List.rev (add (List.fold_left add_uchar [] src) `End) in 202 | let dst = List.map Uchar.of_int dst in 203 | if nseq <> dst then Test.fail "" 204 | in 205 | various_norm_tests test ~__POS__ 206 | 207 | let test_uunf_string = 208 | Test.test "Uunf_string" @@ fun () -> 209 | let test enc normalize = 210 | let b = Buffer.create 42 in 211 | let enc us = 212 | let rec loop = function 213 | | u :: us -> enc b (Uchar.of_int u); loop us 214 | | [] -> Buffer.contents b 215 | in 216 | Buffer.reset b; loop us 217 | in 218 | let test src nf dst = assert ((normalize nf (enc src)) = (enc dst)) in 219 | various_norm_tests test ~__POS__ 220 | in 221 | test Buffer.add_utf_8_uchar Uunf_string.normalize_utf_8; 222 | test Buffer.add_utf_16be_uchar Uunf_string.normalize_utf_16be; 223 | test Buffer.add_utf_16le_uchar Uunf_string.normalize_utf_16le; 224 | () 225 | 226 | let test_flushing_end_seq = 227 | Test.test "flushing end of stream" @@ fun () -> 228 | let n = Uunf.create `NFKC in 229 | let uchar u = `Uchar (Uchar.of_int u) in 230 | if Uunf.add n (uchar 0x2105) <> `Await then Test.fail ""; 231 | if Uunf.add n `Await <> `Await then Test.fail ""; 232 | if Uunf.add n `End <> (uchar 0x0063) then Test.fail ""; 233 | if Uunf.add n `Await <> (uchar 0x002F) then Test.fail ""; 234 | if Uunf.add n `Await <> (uchar 0x006F) then Test.fail ""; 235 | if Uunf.add n `Await <> `End then Test.fail ""; 236 | () 237 | 238 | let main () = 239 | let test_data_file = 240 | let default = Fpath.v "test/NormalizationTest.txt" in 241 | let doc = "Unicode normalization conformance test file." in 242 | Cmdliner.Arg.(value & pos 0 B0_std_cli.filepath default & info [] ~doc) 243 | in 244 | Test.main' test_data_file @@ fun test_data_file -> 245 | match decode_conformance_data test_data_file with 246 | | Error e -> Test.failstop "%s" e 247 | | Ok data -> 248 | let args = Test.Arg.[value conformance data] in 249 | Test.autorun ~args () 250 | 251 | let () = if !Sys.interactive then () else exit (main ()) 252 | --------------------------------------------------------------------------------