├── BRZO
├── .ocp-indent
├── src
├── uunf.mllib
├── uunf_string.mli
├── uunf_string.ml
├── uunf_fmt.ml
├── uunf_tmap.ml
├── uunf_tmapbyte.ml
├── uunf_tmapbool.ml
├── uunf.mli
└── uunf.ml
├── .merlin
├── .gitignore
├── doc
└── index.mld
├── _tags
├── pkg
├── META
└── pkg.ml
├── LICENSE.md
├── test
├── examples.ml
├── unftrip.ml
└── test_uunf.ml
├── README.md
├── DEVEL.md
├── opam
├── support
├── generate_data.ml
├── gen.ml
└── gen_norm.ml
├── CHANGES.md
└── B0.ml
/BRZO:
--------------------------------------------------------------------------------
1 | (srcs-x pkg support test)
--------------------------------------------------------------------------------
/.ocp-indent:
--------------------------------------------------------------------------------
1 | strict_with=always,match_clause=4,strict_else=never
--------------------------------------------------------------------------------
/src/uunf.mllib:
--------------------------------------------------------------------------------
1 | Uunf_fmt
2 | Uunf_tmap
3 | Uunf_tmapbool
4 | Uunf_tmapbyte
5 | Uunf_data
6 | Uunf
7 | Uunf_string
--------------------------------------------------------------------------------
/.merlin:
--------------------------------------------------------------------------------
1 | PKG uucd cmdliner uutf b0.kit
2 | S src
3 | S data
4 | S test
5 | S support
6 | B _b0/b/**
7 | B _build/**
8 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | _b0
2 | _build
3 | support/ucd.xml
4 | test/NormalizationTest.txt
5 | tmp
6 | *.native
7 | *.byte
8 | *.install
9 | *~
10 | \.\#*
11 | \#*#
12 |
--------------------------------------------------------------------------------
/doc/index.mld:
--------------------------------------------------------------------------------
1 | {0 Uunf {%html: %%VERSION%%%}}
2 |
3 | Uunf normalizes Unicode text. See {!Uunf} for more details.
4 |
5 | {1:uunf Library [uunf]}
6 |
7 | {!modules:
8 | Uunf
9 | Uunf_string
10 | }
11 |
--------------------------------------------------------------------------------
/_tags:
--------------------------------------------------------------------------------
1 | true : bin_annot, safe_string
2 |
3 | <_b0> : -traverse
4 |
5 | : include
6 |
7 | : include
8 | : package(uucd unix)
9 |
10 | : include
11 | : package(uutf), package(cmdliner)
12 |
--------------------------------------------------------------------------------
/pkg/META:
--------------------------------------------------------------------------------
1 | description = "Unicode text normalization for OCaml"
2 | version = "%%VERSION_NUM%%"
3 | requires = ""
4 | archive(byte) = "uunf.cma"
5 | archive(native) = "uunf.cmxa"
6 | plugin(byte) = "uunf.cma"
7 | plugin(native) = "uunf.cmxs"
8 | exists_if = "uunf.cma uunf.cmxa"
9 |
10 | package "string" (
11 | description = "The uunf.string library (deprecated)"
12 | version = "%%VERSION_NUM%%"
13 | requires = "uunf"
14 | exports = "uunf"
15 | warning = "Deprecated, use the uunf library."
16 | )
17 |
--------------------------------------------------------------------------------
/pkg/pkg.ml:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ocaml
2 | #use "topfind"
3 | #require "topkg"
4 | open Topkg
5 |
6 | let uutf = Conf.with_pkg "uutf"
7 | let cmdliner = Conf.with_pkg "cmdliner"
8 |
9 | let () =
10 | let opams =
11 | [Pkg.opam_file "opam" ~lint_deps_excluding:(Some ["b0"; "xmlm"; "uucd"])]
12 | in
13 | Pkg.describe "uunf" ~opams @@ fun c ->
14 | let uutf = Conf.value c uutf in
15 | let cmdliner = Conf.value c cmdliner in
16 | Ok [ Pkg.mllib ~api:["Uunf"; "Uunf_string"] "src/uunf.mllib";
17 | Pkg.bin ~cond:(uutf && cmdliner) "test/unftrip";
18 | Pkg.test "test/test" ~args:(Cmd.v "test/NormalizationTest.txt");
19 | Pkg.test "test/examples";
20 | Pkg.doc "doc/index.mld" ~dst:"odoc-pages/index.mld";
21 | Pkg.doc "test/examples.ml"; ]
22 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012 The uunf programmers
2 |
3 | Permission to use, copy, modify, and/or distribute this software for any
4 | purpose with or without fee is hereby granted, provided that the above
5 | copyright notice and this permission notice appear in all copies.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14 |
--------------------------------------------------------------------------------
/test/examples.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2012 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: CC0-1.0
4 | ---------------------------------------------------------------------------*)
5 |
6 | let utf_8_normalize nf s =
7 | let rec add buf normalizer v = match Uunf.add normalizer v with
8 | | `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf normalizer `Await
9 | | `Await | `End -> ()
10 | in
11 | let rec loop buf s i max normalizer =
12 | if i > max then (add buf normalizer `End; Buffer.contents buf) else
13 | let dec = String.get_utf_8_uchar s i in
14 | add buf normalizer (`Uchar (Uchar.utf_decode_uchar dec));
15 | loop buf s (i + Uchar.utf_decode_length dec) max normalizer
16 | in
17 | let buf = Buffer.create (String.length s * 3) in
18 | let normalizer = Uunf.create nf in
19 | loop buf s 0 (String.length s - 1) normalizer
20 |
--------------------------------------------------------------------------------
/src/uunf_string.mli:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2015 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | (** Unicode text normalization on UTF OCaml strings.
7 |
8 | {!Uunf} functions acting directly on UTF encoded OCaml strings.
9 |
10 | {b Warning.} All these function silently replace malformed encoded Unicode
11 | data by a {!Stdlib.Uchar.rep} character. *)
12 |
13 | (** {1:norm Normalize} *)
14 |
15 | val normalize_utf_8 : Uunf.form -> string -> string
16 | (** [normalize_utf_8 nf s] is the UTF-8 encoded string [s] in normal
17 | form [nf]. *)
18 |
19 | val normalize_utf_16be : Uunf.form -> string -> string
20 | (** [normalize_utf_16be nf s] is the UTF-16BE encoded string [s] in
21 | normal form [nf]. *)
22 |
23 | val normalize_utf_16le : Uunf.form -> string -> string
24 | (** [normalize_utf_16le nf s] is the UTF-16LE encoded string [s] in
25 | normal form [nf]. *)
26 |
--------------------------------------------------------------------------------
/src/uunf_string.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2015 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | let normalize_utf_x get_utf add_utf nf s =
7 | let rec add add_utf buf normalizer v = match Uunf.add normalizer v with
8 | | `Uchar u -> add_utf buf u; add add_utf buf normalizer `Await
9 | | `Await | `End -> ()
10 | in
11 | let rec loop get_utf add_utf buf s i max normalizer =
12 | if i > max then (add add_utf buf normalizer `End; Buffer.contents buf) else
13 | let dec = get_utf s i in
14 | let u = Uchar.utf_decode_uchar dec in
15 | add add_utf buf normalizer (`Uchar u);
16 | loop get_utf add_utf buf s (i + Uchar.utf_decode_length dec) max normalizer
17 | in
18 | let b = Buffer.create (String.length s * 3) in
19 | let normalizer = Uunf.create nf in
20 | loop get_utf add_utf b s 0 (String.length s - 1) normalizer
21 |
22 | let normalize_utf_8 nf s =
23 | normalize_utf_x String.get_utf_8_uchar Buffer.add_utf_8_uchar nf s
24 |
25 | let normalize_utf_16be nf s =
26 | normalize_utf_x String.get_utf_16be_uchar Buffer.add_utf_16be_uchar nf s
27 |
28 | let normalize_utf_16le nf s =
29 | normalize_utf_x String.get_utf_16le_uchar Buffer.add_utf_16le_uchar nf s
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Uunf — Unicode text normalization for OCaml
2 | ===========================================
3 | %%VERSION%%
4 |
5 | Uunf is an OCaml library for normalizing Unicode text. It supports all
6 | Unicode [normalization forms]. The library is independent from any IO
7 | mechanism or Unicode text data structure and it can process text
8 | without a complete in-memory representation.
9 |
10 | Uunf is distributed under the ISC license. It has no dependency.
11 |
12 | [normalization forms]: http://www.unicode.org/reports/tr15/
13 |
14 | Homepage:
15 |
16 |
17 | ## Installation
18 |
19 | Uunf can be installed with `opam`:
20 |
21 | opam install uunf
22 | opam install uunf cmdliner uutf # For the unftrip tool
23 |
24 | If you don't use `opam` consult the [`opam`](opam) file for build
25 | instructions.
26 |
27 |
28 | ## Documentation
29 |
30 | The documentation can be consulted [online] or via `odig doc uunf`.
31 |
32 | Questions are welcome but better asked on the [OCaml forum] than on
33 | the issue tracker.
34 |
35 | [online]: http://erratique.ch/software/uunf/doc/
36 | [OCaml forum]: https://discuss.ocaml.org/
37 |
38 |
39 | ## Sample programs
40 |
41 | The [`unftrip`] tool normalises text provided on standard input.
42 |
43 | See also the [doc examples].
44 |
45 | [`unftrip`]: test/unftrip.ml
46 | [doc examples]: test/examples.ml
47 |
--------------------------------------------------------------------------------
/DEVEL.md:
--------------------------------------------------------------------------------
1 | # New Unicode release
2 |
3 | The file `src/uunf_data.ml` contains generated data. This file needs
4 | to be regenerated on new Unicode releases, as well as the `opam` file.
5 |
6 | In order to do so you need to install an updated version of the [uucd]
7 | OCaml package which is capable of reading the latest XML Unicode
8 | character database.
9 |
10 | You can then bump the Unicode release number at the top of the `B0.ml`
11 | file. Verify that everything is as expected with:
12 |
13 | b0 -- unicode-version
14 |
15 | You should then download a copy of the XML Unicode character database
16 | to the `support/ucd.xml` file which is ignored by git. If you have
17 | `curl` and `unzip` in your `PATH` you can simply issue:
18 |
19 | b0 -- download-ucdxml
20 |
21 | You can now proceed to generate the `src/uunf_data.ml` and update the opam file
22 | file by issuing:
23 |
24 | b0 -- generate-data
25 | b0 -- .opam file > opam
26 |
27 | [uucd]: http://erratique.ch/software/uucd
28 |
29 | # Reference tests
30 |
31 | To test the package on the reference normalization tests of you must
32 | download a copy of the tests to the `test/NormalizationTest.txt` file
33 | which is ignored by git.
34 |
35 | If you have `curl` in your `PATH` you can simply issue:
36 |
37 | b0 -- download-tests
38 |
39 | this downloads the tests for the unicode version mentioned in `B0.ml`.
40 |
41 | You can then check them with:
42 |
43 | b0 test
44 |
45 |
--------------------------------------------------------------------------------
/opam:
--------------------------------------------------------------------------------
1 | opam-version: "2.0"
2 | name: "uunf"
3 | synopsis: "Unicode text normalization for OCaml"
4 | description: """\
5 | Uunf is an OCaml library for normalizing Unicode text. It supports all
6 | Unicode [normalization forms]. The library is independent from any IO
7 | mechanism or Unicode text data structure and it can process text
8 | without a complete in-memory representation.
9 |
10 | Uunf is distributed under the ISC license. It has no dependency.
11 |
12 | [normalization forms]: http://www.unicode.org/reports/tr15/
13 |
14 | Homepage: """
15 | maintainer: "Daniel Bünzli "
16 | authors: "The uunf programmers"
17 | license: "ISC"
18 | tags: ["unicode" "text" "normalization" "org:erratique"]
19 | homepage: "https://erratique.ch/software/uunf"
20 | doc: "https://erratique.ch/software/uunf/doc/Uunf"
21 | bug-reports: "https://github.com/dbuenzli/uunf/issues"
22 | depends: [
23 | "ocaml" {>= "4.14.0"}
24 | "ocamlfind" {build}
25 | "ocamlbuild" {build}
26 | "topkg" {build & >= "1.1.0"}
27 | "uucd" {dev & >= "17.0.0" & < "18.0.0"}
28 | ]
29 | depopts: ["uutf" "cmdliner"]
30 | conflicts: [
31 | "uutf" {< "1.0.0"}
32 | "cmdliner" {< "1.1.0"}
33 | ]
34 | build: [
35 | "ocaml"
36 | "pkg/pkg.ml"
37 | "build"
38 | "--dev-pkg"
39 | "%{dev}%"
40 | "--with-uutf"
41 | "%{uutf:installed}%"
42 | "--with-cmdliner"
43 | "%{cmdliner:installed}%"
44 | ]
45 | post-messages:
46 | "If the build fails with \"ocamlopt.opt got signal and exited\", issue 'ulimit -s unlimited' and retry."
47 | {failure & (arch = "ppc64" | arch = "arm64")}
48 | dev-repo: "git+https://erratique.ch/repos/uunf.git"
49 | x-maintenance-intent: ["(latest)"]
50 |
--------------------------------------------------------------------------------
/src/uunf_fmt.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2020 The uucp programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | let pf = Format.fprintf
7 | let strf = Format.asprintf
8 | let string = Format.pp_print_string
9 | let string_X ppf s =
10 | Format.pp_open_vbox ppf 1; string ppf "\"";
11 | for i = 0 to String.length s - 1 do
12 | if i mod 16 = 0 && i > 0 then pf ppf "\\@\n";
13 | pf ppf "\\x%02x" (Char.code s.[i])
14 | done;
15 | string ppf "\""; Format.pp_close_box ppf ()
16 |
17 | let string_XN ppf = function "" -> string ppf "snil" | x -> string_X ppf x
18 | let bool = Format.pp_print_bool
19 | let sp = Format.pp_print_space
20 | let semi ppf () = string ppf ";"; sp ppf ()
21 | let int = Format.pp_print_int
22 | let iter i ?(sep = sp) pp ppf x =
23 | let fst = ref true in
24 | i (fun v -> (if !fst then fst := false else sep ppf ()); pp ppf v) x
25 |
26 | let as_array i pp ppf = pf ppf "@[<2>[|%a|]@]" (iter i ~sep:semi pp)
27 | let array pp = as_array Array.iter pp
28 | let array_N pp ppf = function [||] -> string ppf "nil" | x -> array pp ppf x
29 |
30 | module R = struct
31 | type _ record =
32 | | [] : unit record
33 | | (::) :
34 | (string * (Format.formatter -> 'a -> unit)) * 'b record ->
35 | ('a -> 'b) record
36 | end
37 |
38 | let record record ppf =
39 | let field name pp_v ppf v = pf ppf "@[<1>%s =@ %a@]" name pp_v v in
40 | let open R in (* 4.03 compat *)
41 | let rec go : type a. (unit -> unit) -> a R.record -> a = fun k -> function
42 | | [] -> pf ppf "@[<2>{ %a }@]" (fun _ -> k) ()
43 | | [name, pp_v] ->
44 | fun v -> go (fun () -> k (); field name pp_v ppf v) []
45 | | (name, pp_v) :: record ->
46 | fun v -> go (fun () -> k (); field name pp_v ppf v; semi ppf ()) record
47 | in
48 | go ignore record
49 |
--------------------------------------------------------------------------------
/src/uunf_tmap.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2012 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | (* Trie character maps *)
7 |
8 | type 'a t =
9 | { default : 'a; (* default value. *)
10 | l0 : 'a array array array } (* 0x1FFFFF as 0x1FF - 0xFF - 0xF. *)
11 |
12 | let nil = [||]
13 | let l0_shift = 12
14 | let l0_size = 272 (* 0x10F + 1 *)
15 | let l1_shift = 4
16 | let l1_mask = 0xFF
17 | let l1_size = 256 (* 0xFF + 1 *)
18 | let l2_mask = 0xF
19 | let l2_size = 16 (* 0xF + 1 *)
20 | let get m u =
21 | let l1 = Array.unsafe_get m.l0 (u lsr l0_shift) in
22 | if l1 == nil then m.default else
23 | let l2 = Array.unsafe_get l1 (u lsr l1_shift land l1_mask) in
24 | if l2 == nil then m.default else
25 | Array.unsafe_get l2 (u land l2_mask)
26 |
27 | let create default = { default; l0 = Array.make l0_size nil }
28 | let set m u v =
29 | if v = m.default then () else
30 | let i = u lsr l0_shift in
31 | if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size nil;
32 | let j = u lsr l1_shift land l1_mask in
33 | if m.l0.(i).(j) == nil then m.l0.(i).(j) <- Array.make l2_size m.default;
34 | m.l0.(i).(j).(u land l2_mask) <- v
35 |
36 | let size v_size m = match m.l0 with
37 | | [||] -> 3 + 1 + v_size m.default
38 | | l0 ->
39 | let size = ref (3 + v_size m.default + 1 + Array.length l0) in
40 | for i = 0 to Array.length l0 - 1 do match l0.(i) with
41 | | [||] -> ()
42 | | l1 ->
43 | size := !size + (1 + Array.length l1);
44 | for j = 0 to Array.length l1 - 1 do match l1.(j) with
45 | | [||] -> ()
46 | | l2 ->
47 | size := !size + (1 + Array.length l2);
48 | for k = 0 to Array.length l2 - 1 do
49 | size := !size + v_size l2.(k)
50 | done;
51 | done;
52 | done;
53 | !size
54 |
55 | let dump pp_v ppf m =
56 | let open Uunf_fmt in
57 | record ["default", pp_v; "l0", pp_v |> array_N |> array_N |> array]
58 | ppf m.default m.l0
59 |
--------------------------------------------------------------------------------
/src/uunf_tmapbyte.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2012 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | (* Trie character byte maps *)
7 |
8 | type t =
9 | { default : int; (* default value. *)
10 | l0 : string array array } (* 0x1FFFFF as 0x1FF - 0xF - 0xFF *)
11 |
12 | let nil = [||]
13 | let snil = ""
14 | let l0_shift = 12
15 | let l0_size = 272 (* 0x10F + 1 *)
16 | let l1_shift = 8
17 | let l1_mask = 0xF
18 | let l1_size = 16 (* 0xF + 1 *)
19 | let l2_mask = 0xFF
20 | let l2_size = 256 (* 0xFF + 1 *)
21 | let get m u =
22 | let l1 = Array.get m.l0 (u lsr l0_shift) in
23 | if l1 == nil then m.default else
24 | let l2 = Array.unsafe_get l1 (u lsr l1_shift land l1_mask) in
25 | if l2 == snil then m.default else
26 | Char.code (String.unsafe_get l2 (u land l2_mask))
27 |
28 | let create default = { default; l0 = Array.make l0_size nil }
29 | let set m u byte =
30 | let l2_make m = Bytes.make l2_size (Char.chr m.default) in
31 | if byte = m.default then () else
32 | let i = u lsr l0_shift in
33 | if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size snil;
34 | let j = u lsr l1_shift land l1_mask in
35 | if m.l0.(i).(j) == snil then
36 | m.l0.(i).(j) <- Bytes.unsafe_to_string (l2_make m);
37 | let k = u land l2_mask in
38 | Bytes.set (Bytes.unsafe_of_string m.l0.(i).(j)) k (Char.unsafe_chr byte)
39 |
40 | let size m = match m.l0 with
41 | | [||] -> 3 + 1
42 | | l0 ->
43 | let size = ref (3 + 1 + Array.length l0) in
44 | for i = 0 to Array.length l0 - 1 do match l0.(i) with
45 | | [||] -> ()
46 | | l1 ->
47 | size := !size + 1 + Array.length l1;
48 | for j = 0 to Array.length l1 - 1 do
49 | size := !size + 1 + ((String.length l1.(j) * 8) / Sys.word_size)
50 | done;
51 | done;
52 | !size
53 |
54 | let iter_blobs i m = Array.(iter (iter i)) m.l0
55 |
56 | let dump_pp pp_v ppf m =
57 | let open Uunf_fmt in
58 | record ["default", int; "l0", pp_v |> array_N |> array]
59 | ppf m.default m.l0
60 |
61 | let pp_v = Uunf_fmt.string_XN
62 | let dump = dump_pp pp_v
63 |
--------------------------------------------------------------------------------
/support/generate_data.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2015 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | (* Extracts data from the Unicode Character Database *)
7 |
8 | let str = Format.sprintf
9 | let exec = Filename.basename Sys.executable_name
10 |
11 | let ucd_or_die inf = try
12 | let ic = if inf = "-" then stdin else open_in inf in
13 | let d = Uucd.decoder (`Channel ic) in
14 | match Uucd.decode d with
15 | | `Ok db -> db
16 | | `Error e ->
17 | let (l0, c0), (l1, c1) = Uucd.decoded_range d in
18 | Printf.eprintf "%s:%d.%d-%d.%d: %s\n%!" inf l0 c0 l1 c1 e;
19 | exit 1
20 | with Sys_error e -> Printf.eprintf "%s\n%!" e; exit 1
21 |
22 | let process inf outf =
23 | let ucd = (Gen.log "Loading Unicode character database.\n"; ucd_or_die inf) in
24 | let generate pp outf ucd =
25 | try
26 | let oc = if outf = "-" then stdout else open_out outf in
27 | try
28 | let ppf = Format.formatter_of_out_channel oc in
29 | pp ppf ucd;
30 | Format.pp_print_flush ppf ();
31 | close_out oc
32 | with Sys_error _ as e -> close_out oc; raise e
33 | with Sys_error e -> Printf.eprintf "%s\n%!" e; exit 1
34 | in
35 | Gen.log "Note: reported sizes do not take sharing into account.\n";
36 | generate Gen_norm.pp_mod outf ucd;
37 | ()
38 |
39 | let main () =
40 | let usage = str
41 | "Usage: %s [OPTION]... [DBFILE]\n\
42 | \ Generates data modules from an Unicode character database XML file.\n\
43 | \ DBFILE defaults to support/ucd.xml\n\
44 | Options:" exec
45 | in
46 | let inf = ref None in
47 | let set_inf f =
48 | if !inf = None then inf := Some f else
49 | raise (Arg.Bad "only one Unicode character database file can be specified")
50 | in
51 | let outf = ref None in
52 | let set r = Arg.String (fun s -> r := Some s) in
53 | let options = [
54 | "-o", set outf, " output file, defaults to src/uunf_data.ml";
55 | ]
56 | in
57 | Arg.parse (Arg.align options) set_inf usage;
58 | let inf = match !inf with None -> "support/ucd.xml" | Some inf -> inf in
59 | let outf = match !outf with None -> "src/uunf_data.ml" | Some outf -> outf in
60 | process inf outf
61 |
62 | let () = main ()
63 |
--------------------------------------------------------------------------------
/src/uunf_tmapbool.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2012 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | (* Trie character boolean maps *)
7 |
8 | type t =
9 | { default : bool; (* default value. *)
10 | l0 : string array array } (* 0x1FFFFF as 0x1FF - 0xF - 0xFF *)
11 |
12 | let nil = [||]
13 | let snil = ""
14 | let l0_shift = 12
15 | let l0_size = 272 (* 0x10F + 1 *)
16 | let l1_shift = 8
17 | let l1_mask = 0xF
18 | let l1_size = 16 (* 0xF + 1 *)
19 | let l2_mask = 0xFF
20 | let l2_size = 32 (* 0xFF + 1 / 8 *)
21 | let get m u =
22 | let l1 = Array.unsafe_get m.l0 (u lsr l0_shift) in
23 | if l1 == nil then m.default else
24 | let l2 = Array.unsafe_get l1 (u lsr l1_shift land l1_mask) in
25 | if l2 == snil then m.default else
26 | let k = u land l2_mask in
27 | let byte_num = k lsr 3 (* / 8 *) in
28 | let bit_num = k land 7 (* mod 8 *) in
29 | let byte = Char.code (String.unsafe_get l2 byte_num) in
30 | byte land (1 lsl bit_num) > 0
31 |
32 | let create default = { default; l0 = Array.make l0_size nil }
33 | let set m u b =
34 | let l2_make m = Bytes.make l2_size (if m.default then '\xFF' else '\x00') in
35 | if b = m.default then () else
36 | let i = u lsr l0_shift in
37 | if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size snil;
38 | let j = u lsr l1_shift land l1_mask in
39 | if m.l0.(i).(j) == snil then
40 | m.l0.(i).(j) <- Bytes.unsafe_to_string (l2_make m);
41 | let k = u land l2_mask in
42 | let byte_num = k lsr 3 (* / 8 *) in
43 | let bit_num = k land 7 (* mod 8 *) in
44 | let byte = Char.code m.l0.(i).(j).[byte_num] in
45 | let new_byte =
46 | if b then (Char.unsafe_chr (byte lor (1 lsl bit_num))) else
47 | (Char.unsafe_chr (byte land lnot (1 lsl bit_num)))
48 | in
49 | Bytes.set (Bytes.unsafe_of_string m.l0.(i).(j)) byte_num new_byte
50 |
51 | let size m = match m.l0 with
52 | | [||] -> 3 + 1
53 | | l0 ->
54 | let size = ref (3 + 1 + Array.length l0) in
55 | for i = 0 to Array.length l0 - 1 do match l0.(i) with
56 | | [||] -> ()
57 | | l1 ->
58 | size := !size + 1 + Array.length l1;
59 | for j = 0 to Array.length l1 - 1 do
60 | size := !size + 1 + ((String.length l1.(j) * 8) / Sys.word_size)
61 | done;
62 | done;
63 | !size
64 |
65 | let iter_blobs i m = Array.(iter (iter i)) m.l0
66 |
67 | let dump_pp pp_v ppf m =
68 | let open Uunf_fmt in
69 | record ["default", bool; "l0", pp_v |> array_N |> array]
70 | ppf m.default m.l0
71 |
72 | let pp_v = Uunf_fmt.string_XN
73 | let dump = dump_pp pp_v
74 |
--------------------------------------------------------------------------------
/support/gen.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright 2012 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | (* Extracts normalization data from the Unicode Character Database *)
7 |
8 | let log fmt = Printf.eprintf (fmt ^^ "%!")
9 | let pp = Format.fprintf
10 | let str = Printf.sprintf
11 | let str_of_size s =
12 | let b = s * (Sys.word_size / 8) in
13 | if b < 1_048_576 then str "%.1f Ko" (float b /. 1024.) else
14 | if b < 1_073_741_824 then str "%.1f Mo" (float b /. 1024. ** 2.) else
15 | str "%.1f Go" (float b /. 1024. ** 3.)
16 |
17 | (* Characters *)
18 |
19 | let is_hangul_syllabe u = 0xAC00 <= u && u <= 0xD7A3
20 |
21 | let iter_uchar_ints f =
22 | let rec loop u =
23 | let i = Uchar.to_int u in
24 | if Uchar.equal u Uchar.max then f i else
25 | (f i; loop (Uchar.succ u))
26 | in
27 | loop Uchar.min
28 |
29 | (* Compact maps from characters to booleans. *)
30 |
31 | let bool_prop_maps prop =
32 | let tm = Uunf_tmapbool.create true in
33 | let fm = Uunf_tmapbool.create false in
34 | let add_uchar u =
35 | let b = prop u in
36 | Uunf_tmapbool.set tm u b;
37 | Uunf_tmapbool.set fm u b;
38 | in
39 | iter_uchar_ints add_uchar; tm, fm
40 |
41 | let assert_bool_prop_maps prop tm fm =
42 | let assert_uchar u =
43 | let fail () = failwith (str "bool prop map failure for U+%04X" u) in
44 | let b = prop u in
45 | if b <> Uunf_tmapbool.get tm u then fail ();
46 | if b <> Uunf_tmapbool.get fm u then fail ();
47 | in
48 | iter_uchar_ints assert_uchar
49 |
50 | (* Compact maps from characters to bytes. *)
51 |
52 | let byte_prop_map ~default prop =
53 | let m = Uunf_tmapbyte.create default in
54 | let add_uchar u = Uunf_tmapbyte.set m u (prop u) in
55 | iter_uchar_ints add_uchar; m
56 |
57 | let assert_byte_prop_map prop m =
58 | let assert_uchar u =
59 | if (prop u) = Uunf_tmapbyte.get m u then () else
60 | failwith (str "byte prop map failure for U+%04X" u)
61 | in
62 | iter_uchar_ints assert_uchar
63 |
64 | (* Compact maps from characters to arbitrary values. *)
65 |
66 | let prop_map ~default prop =
67 | let m = Uunf_tmap.create default in
68 | let add_uchar u = Uunf_tmap.set m u (prop u) in
69 | iter_uchar_ints add_uchar; m
70 |
71 | let assert_prop_map prop m =
72 | let assert_uchar u =
73 | if (prop u) = Uunf_tmap.get m u then () else
74 | failwith (str "prop map failure for U+%04X" u)
75 | in
76 | iter_uchar_ints assert_uchar
77 |
78 | let ucd_get ucd u p pstr = match Uucd.cp_prop ucd u p with
79 | | None -> invalid_arg (str "no %s property for U+%04X" pstr u)
80 | | Some v -> v
81 |
82 | (* Generate a module *)
83 |
84 | let year = (Unix.gmtime (Unix.gettimeofday ())).Unix.tm_year + 1900
85 |
86 | let pp_mod pp_mod ppf m =
87 | pp ppf
88 | "\
89 | (*---------------------------------------------------------------------------
90 | Copyright (c) %d The uunf programmers. All rights reserved.
91 | SPDX-License-Identifier: ISC
92 | ---------------------------------------------------------------------------*)
93 |
94 | (* WARNING do not edit. This file was automatically generated. *)
95 | @\n@[%a@]@\n" year pp_mod m
96 |
--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
1 | v17.0.0 2025-09-11 Zagreb
2 | -------------------------
3 |
4 | - Unicode 17.0.0 support.
5 |
6 | v16.0.0 2024-09-10 Zagreb
7 | -------------------------
8 |
9 | - Unicode 16.0.0 support.
10 |
11 | v15.1.0 2022-09-15 Zagreb
12 | -------------------------
13 |
14 | - Unicode 15.1.0 support.
15 | - Requires OCaml 4.14.0.
16 | - The `Uunf_string` module was rewritten to use the standard library
17 | UTF decoders and was moved to the `uunf` library. The `uunf.string`
18 | library is deprecated, it warns on usage and simply requires `uunf`.
19 | - The sample code was rewritten to use the standard library UTF
20 | decoders.
21 |
22 | v15.0.0 2022-09-15 Zagreb
23 | -------------------------
24 |
25 | - Unicode 15.0.0 support.
26 |
27 | v14.0.0 2021-09-17 Zagreb
28 | -------------------------
29 |
30 | - Unicode 14.0.0 support.
31 |
32 | v13.0.0 2020-03-11 La Forclaz (VS)
33 | ---------------------------------
34 |
35 | - Unicode 13.0.0 support.
36 | - Require OCaml >= 4.03.0.
37 |
38 | v12.0.0 2019-03-08 La Forclaz (VS)
39 | ----------------------------------
40 |
41 | - Unicode 12.0.0 support.
42 |
43 | v11.0.0 2018-06-06 Lausanne
44 | ---------------------------
45 |
46 | - Unicode 11.0.0 support.
47 | - Fix bug when normalizer flushes at the end of stream: a spurious
48 | `Await` was returned before the final `End`.
49 |
50 | v10.0.0 2017-06-20 Cambridge (UK)
51 | ---------------------------------
52 |
53 | - Unicode 10.0.0 support
54 | - Fix bug in canonical composition algorithm (used by NFC and NFKC forms).
55 | Thanks to Stephen Dolan for the report.
56 | - Fix regression of `Uucp.ccc` introduced by f4c0363 which went into
57 | v2.0.{0,1}.
58 |
59 | v2.0.1 2016-03-07 La Forclaz (VS)
60 | ---------------------------------
61 |
62 | - OCaml 4.05.0 compatibility (removal of `Uchar.dump`).
63 |
64 | v2.0.0 2016-11-23 Zagreb
65 | ------------------------
66 |
67 | - Support for Unicode 9.0.0.
68 | - OCaml standard library `Uchar.t` support.
69 | - Removes and substitutes `type Uunf.uchar = int` by the (abstract)
70 | `Uchar.t` type. `Uchar.{of,to}_int` allows to recover the previous
71 | representation.
72 | - Removes `Uunf.is_scalar_value`. `Uchar.is_valid` can be used instead.
73 | - Safe string support.
74 | - Build depend on topkg.
75 | - Relicense from BSD3 to ISC.
76 |
77 | v1.0.0 2015-06-17 Cambridge (UK)
78 | --------------------------------
79 |
80 | - Updated for Unicode 8.0.0
81 | - `topkg` support
82 | - `Uunf.add` now eventually returns `` `End`` whenever the latter was
83 | encoded and the character stream was entirely output. In most existing
84 | programs this will simply entail to add `` `End`` to the existing
85 | `` `Await`` case in pattern matches on the result of `Uunf.add`.
86 | - Adds the `Uunf_string` library that allows to directly normalize UTF-X
87 | OCaml encoded strings. This library depends on `Uutf`.
88 | - Rewrote the utility `unftrip` to use `Cmdliner` which is now
89 | an optional dependency of the package. The cli interface is
90 | incompatible with previous versions. Support for random
91 | Unicode scalar value generation was removed, use `utftrip` from
92 | the `Uutf` package for that.
93 | - Rewrote the module's data generation to essentially match what is done
94 | in `Uucp`. Much less ugly, no source file `sed`ding.
95 |
96 | v0.9.3 2014-06-16 Cambridge (UK)
97 | --------------------------------
98 |
99 | - Updated for Unicode 7.0.0
100 |
101 | v0.9.2 2013-10-01 Lausanne
102 | --------------------------
103 |
104 | - Updated for Unicode 6.3.0
105 | - OPAM friendly workflow and drop OASIS support.
106 |
107 | v0.9.1 2013-01-04 La Forclaz (VS)
108 | ---------------------------------
109 |
110 | - Updated for Unicode 6.2.0.
111 | - Fix Uunf.is_scalar_value always returning false.
112 | - Make the module completely safe for the client.
113 | - Change command line help of unftrip.
114 |
115 | v0.9.0 2012-09-07 Lausanne
116 | --------------------------
117 |
118 | First release.
119 |
--------------------------------------------------------------------------------
/B0.ml:
--------------------------------------------------------------------------------
1 | open B0_kit.V000
2 | open Result.Syntax
3 |
4 | let unicode_version = 17, 0, 0, None (* Adjust on new releases *)
5 | let next_major = B0_version.next_major unicode_version
6 |
7 | (* OCaml library names *)
8 |
9 | let b0_std = B0_ocaml.libname "b0.std"
10 | let unix = B0_ocaml.libname "unix"
11 | let uucd = B0_ocaml.libname "uucd"
12 | let uutf = B0_ocaml.libname "uutf"
13 | let cmdliner = B0_ocaml.libname "cmdliner"
14 |
15 | let uunf = B0_ocaml.libname "uunf"
16 |
17 | (* Libraries *)
18 |
19 | let uunf_lib = B0_ocaml.lib uunf ~doc:"The uunf library" ~srcs:[ `Dir ~/"src" ]
20 | let uunf_string_lib =
21 | B0_ocaml.deprecated_lib ~exports:[uunf] (B0_ocaml.libname "uunf.string")
22 |
23 | (* Data generation. *)
24 |
25 | let generate_data =
26 | let doc = "uunf_data.ml generator" in
27 | let srcs =
28 | [ `Dir ~/"support";
29 | `File ~/"src/uunf_tmapbool.ml";
30 | `File ~/"src/uunf_tmapbyte.ml";
31 | `File ~/"src/uunf_tmap.ml";
32 | `File ~/"src/uunf_fmt.ml" ]
33 | in
34 | let requires = [ uucd; unix ] in
35 | let meta = B0_meta.(empty |> tag build |> ~~ B0_unit.Action.cwd `Scope_dir) in
36 | B0_ocaml.exe "generate-data" ~doc ~srcs ~requires ~meta
37 |
38 | (* Tools *)
39 |
40 | let unftrip =
41 | let srcs = [ `File ~/"test/unftrip.ml" ] in
42 | let requires = [ cmdliner; uutf; uunf ] in
43 | B0_ocaml.exe "unftrip" ~public:true ~doc:"The unftrip tool" ~srcs ~requires
44 |
45 | (* Tests *)
46 |
47 | let test =
48 | let srcs = [ `File ~/"test/test_uunf.ml" ] in
49 | let meta =
50 | B0_meta.(empty |> tag test |> tag run |> ~~ B0_unit.Action.cwd `Scope_dir)
51 | in
52 | let requires = [b0_std; uunf; cmdliner] in
53 | B0_ocaml.exe "test_uunf" ~srcs ~meta ~requires ~doc:"Test normalization"
54 |
55 | let examples =
56 | let srcs = [ `File ~/"test/examples.ml" ] in
57 | let meta = B0_meta.empty |> B0_meta.(tag test) in
58 | B0_ocaml.exe "examples" ~srcs ~meta ~requires:[uunf] ~doc:"Doc samples"
59 |
60 | (* Actions *)
61 |
62 | let uc_base = "http://www.unicode.org/Public"
63 |
64 | let show_version =
65 | B0_unit.of_action "unicode-version" ~doc:"Show supported unicode version" @@
66 | fun _ _ ~args:_ ->
67 | Ok (Log.stdout (fun m -> m "%s" (B0_version.to_string unicode_version)))
68 |
69 | let download_tests =
70 | let doc = "Download the UCD normalization tests" in
71 | B0_unit.of_action "download-tests" ~doc @@ fun env _ ~args:_ ->
72 | let version = B0_version.to_string unicode_version in
73 | let test_url = Fmt.str "%s/%s/ucd/NormalizationTest.txt" uc_base version in
74 | let test_file = B0_env.in_scope_dir env ~/"test/NormalizationTest.txt" in
75 | (Log.stdout @@ fun m ->
76 | m "@[Downloading %s@,to %a@]" test_url Fpath.pp test_file);
77 | B0_action_kit.fetch_url env test_url test_file
78 |
79 | let download_ucdxml =
80 | let doc = "Download the ucdxml to support/ucd.xml" in
81 | B0_unit.of_action "download-ucdxml" ~doc @@ fun env _ ~args:_ ->
82 | let* unzip = B0_env.get_cmd env (Cmd.tool "unzip") in
83 | let version = B0_version.to_string unicode_version in
84 | let ucd_url = Fmt.str "%s/%s/ucdxml/ucd.all.grouped.zip" uc_base version in
85 | let ucd_file = B0_env.in_scope_dir env ~/"support/ucd.xml" in
86 | Result.join @@ Os.File.with_tmp_fd @@ fun tmpfile tmpfd ->
87 | (Log.stdout @@ fun m ->
88 | m "@[Downloading %s@,to %a@]" ucd_url Fpath.pp ucd_file);
89 | let* () = B0_action_kit.fetch_url env ucd_url tmpfile in
90 | let stdout = Os.Cmd.out_file ~force:true ~make_path:true ucd_file in
91 | Os.Cmd.run Cmd.(unzip % "-p" %% path tmpfile) ~stdout
92 |
93 | (* Packs *)
94 |
95 | let default =
96 | let meta =
97 | B0_meta.empty
98 | |> ~~ B0_meta.authors ["The uunf programmers"]
99 | |> ~~ B0_meta.maintainers ["Daniel Bünzli "]
100 | |> ~~ B0_meta.homepage "https://erratique.ch/software/uunf"
101 | |> ~~ B0_meta.online_doc "https://erratique.ch/software/uunf/doc/Uunf"
102 | |> ~~ B0_meta.licenses ["ISC"]
103 | |> ~~ B0_meta.repo "git+https://erratique.ch/repos/uunf.git"
104 | |> ~~ B0_meta.issues "https://github.com/dbuenzli/uunf/issues"
105 | |> ~~ B0_meta.description_tags
106 | ["unicode"; "text"; "normalization"; "org:erratique"]
107 | |> ~~ B0_opam.build
108 | {|[["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%"
109 | "--with-uutf" "%{uutf:installed}%"
110 | "--with-cmdliner" "%{cmdliner:installed}%" ]]|}
111 | |> ~~ B0_opam.depopts [ "uutf", ""; "cmdliner", ""]
112 | |> ~~ B0_opam.conflicts
113 | [ "uutf", {|< "1.0.0"|};
114 | "cmdliner", {|< "1.1.0"|}]
115 | |> ~~ B0_opam.depends
116 | [ "ocaml", {|>= "4.14.0"|};
117 | "ocamlfind", {|build|};
118 | "ocamlbuild", {|build|};
119 | "topkg", {|build & >= "1.1.0"|};
120 | "uucd",
121 | Fmt.str {|dev & >= "%s" & < "%s"|}
122 | (B0_version.to_string unicode_version)
123 | (B0_version.to_string next_major)
124 | ]
125 | |> ~~ B0_opam.file_addendum
126 | [ `Field ("post-messages", `L (true, [
127 | `S "If the build fails with \"ocamlopt.opt got signal and \
128 | exited\", issue 'ulimit -s unlimited' and retry.";
129 | `Raw {|{failure & (arch = "ppc64" | arch = "arm64")}|}]))
130 | ]
131 | |> B0_meta.tag B0_opam.tag
132 | in
133 | B0_pack.make "default" ~doc:"uunf package" ~meta ~locked:true @@
134 | B0_unit.list ()
135 |
--------------------------------------------------------------------------------
/test/unftrip.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2012 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | let strf = Printf.sprintf
7 | let pp = Format.fprintf
8 | let pp_pos ppf d = pp ppf "%d.%d:(%d,%06X) "
9 | (Uutf.decoder_line d) (Uutf.decoder_col d) (Uutf.decoder_count d)
10 | (Uutf.decoder_byte_count d)
11 |
12 | let pp_malformed ppf bs =
13 | let l = String.length bs in
14 | pp ppf "@[malformed bytes @[(";
15 | if l > 0 then pp ppf "%02X" (Char.code (bs.[0]));
16 | for i = 1 to l - 1 do pp ppf "@ %02X" (Char.code (bs.[i])) done;
17 | pp ppf ")@]@]"
18 |
19 | let pp_dump_uchar ppf u = Format.fprintf ppf "U+%04X" (Uchar.to_int u)
20 |
21 | let exec = Filename.basename Sys.executable_name
22 | let log f = Format.eprintf ("%s: " ^^ f ^^ "@?") exec
23 |
24 | let input_malformed = ref false
25 | let log_malformed inf d bs =
26 | input_malformed := true;
27 | log "%s:%a: %a@." inf pp_pos d pp_malformed bs
28 |
29 | (* Output *)
30 |
31 | let uchar_dump ppf = function
32 | | `End -> () | `Uchar u -> pp ppf "%a@\n" pp_dump_uchar u
33 |
34 | let uchar_encoder enc =
35 | let enc = match enc with `ISO_8859_1 | `US_ASCII -> `UTF_8
36 | | #Uutf.encoding as enc -> enc
37 | in
38 | let e = Uutf.encoder enc (`Channel stdout) in
39 | fun v -> ignore (Uutf.encode e v)
40 |
41 | let out_fun ascii oe =
42 | if ascii then uchar_dump Format.std_formatter else uchar_encoder oe
43 |
44 | (* Trip *)
45 |
46 | let u_rep = `Uchar Uutf.u_rep
47 | let id inf d first_dec out = (* no normalization. *)
48 | let rec loop d = function
49 | | `Uchar _ as v -> out v; loop d (Uutf.decode d)
50 | | `End as v -> out v
51 | | `Malformed bs -> log_malformed inf d bs; out u_rep; loop d (Uutf.decode d)
52 | | `Await -> assert false
53 | in
54 | if Uutf.decoder_removed_bom d then out (`Uchar Uutf.u_bom);
55 | loop d first_dec
56 |
57 | let normalize nf inf d first_dec out = (* normalize to nf. *)
58 | let n = Uunf.create nf in
59 | let rec add v = match Uunf.add n v with
60 | | `Uchar cp as u -> out u; add `Await
61 | | `Await | `End -> ()
62 | in
63 | let rec loop d = function
64 | | `Uchar _ as v -> add v; loop d (Uutf.decode d)
65 | | `End as v -> add v; out `End
66 | | `Malformed bs -> log_malformed inf d bs; add u_rep; loop d (Uutf.decode d)
67 | | `Await -> assert false
68 | in
69 | if Uutf.decoder_removed_bom d then add (`Uchar Uutf.u_bom);
70 | loop d first_dec
71 |
72 | let trip nf inf enc ascii =
73 | try
74 | let ic = if inf = "-" then stdin else open_in inf in
75 | let d = Uutf.decoder ?encoding:enc (`Channel ic) in
76 | let first_dec = Uutf.decode d in (* guess encoding if needed. *)
77 | let out = out_fun ascii (Uutf.decoder_encoding d) in
78 | begin match nf with
79 | | None -> id inf d first_dec out
80 | | Some nf -> normalize nf inf d first_dec out
81 | end;
82 | if inf <> "-" then close_in ic;
83 | flush stdout;
84 | with Sys_error e -> log "%s@." e; exit 1
85 |
86 | (* Version *)
87 |
88 | let unicode_version () = Format.printf "%s@." Uunf.unicode_version
89 |
90 | (* Cmd *)
91 |
92 | let do_cmd cmd nf inf enc ascii = match cmd with
93 | | `Unicode_version -> unicode_version ()
94 | | `Trip -> trip nf inf enc ascii
95 |
96 | (* Cmdline interface *)
97 |
98 | open Cmdliner
99 |
100 | let cmd =
101 | let doc = "Output supported Unicode version." in
102 | let unicode_version = `Unicode_version, Arg.info ["unicode-version"] ~doc in
103 | Arg.(value & vflag `Trip [unicode_version])
104 |
105 | let nf_doc = "NORMALIZATION"
106 | let nf =
107 | let docs = nf_doc in
108 | let doc = "Normalization Form D (NFD), canonical decomposition." in
109 | let nfd = Some `NFD, Arg.info ["nfd"] ~doc ~docs in
110 | let doc = "Normalization Form C (NFC), canonical decomposition followed by \
111 | canonical composition." in
112 | let nfc = Some `NFC, Arg.info ["nfc"] ~doc ~docs in
113 | let doc = "Normalization form KD (NFKD), compatibility decomposition." in
114 | let nfkd = Some `NFKD, Arg.info ["nfkd"] ~doc ~docs in
115 | let doc = "Normalization form KC (NFKC), compatibility decomposition \
116 | followed by canonical composition." in
117 | let nfkc = Some `NFKC, Arg.info ["nfkc"] ~doc ~docs in
118 | Arg.(value & vflag None [nfd; nfc; nfkd; nfkc])
119 |
120 | let file =
121 | let doc = "The input file. Reads from stdin if unspecified." in
122 | Arg.(value & pos 0 string "-" & info [] ~doc ~docv:"FILE")
123 |
124 | let enc =
125 | let enc = [ "UTF-8", `UTF_8; "UTF-16", `UTF_16; "UTF-16LE", `UTF_16LE;
126 | "UTF-16BE", `UTF_16BE; "ASCII", `US_ASCII; "latin1", `ISO_8859_1 ]
127 | in
128 | let doc = strf "Input encoding, must %s. If unspecified the encoding is \
129 | guessed. The output encoding is the same as the input \
130 | encoding except for ASCII and latin1 where UTF-8 is output."
131 | (Arg.doc_alts_enum enc)
132 | in
133 | Arg.(value & opt (some (enum enc)) None & info [ "e"; "encoding" ] ~doc)
134 |
135 | let ascii =
136 | let doc = "Output the input text as newline (U+000A) separated Unicode
137 | scalar values written in the US-ASCII charset."
138 | in
139 | Arg.(value & flag & info ["a"; "ascii"] ~doc)
140 |
141 | let cmd =
142 | let doc = "normalize Unicode text" in
143 | let man = [
144 | `S "DESCRIPTION";
145 | `P "$(tname) inputs Unicode text from stdin and rewrites it to stdout
146 | according to a specified Unicode normalization form (see UAX 15).";
147 | `P "If no normalization form is specified the character stream is left
148 | intact.";
149 | `P "Invalid byte sequences in the input are reported on stderr and
150 | replaced by the Unicode replacement character (U+FFFD) in the output.";
151 | `S nf_doc;
152 | `S "OPTIONS";
153 | `S "EXIT STATUS";
154 | `P "$(tname) exits with one of the following values:";
155 | `I ("0", "no error occurred");
156 | `I ("1", "a command line parsing error occurred");
157 | `I ("2", "the input text was malformed");
158 | `S "BUGS";
159 | `P "This program is distributed with the Uunf OCaml library.
160 | See http://erratique.ch/software/uunf for contact information." ]
161 | in
162 | Cmd.v (Cmd.info "unftrip" ~version:"%%VERSION%%" ~doc ~man)
163 | Term.(const do_cmd $ cmd $ nf $ file $ enc $ ascii)
164 |
165 | let main () = match Cmd.eval cmd with
166 | | 0 -> if !input_malformed then exit 2 else exit 0
167 | | c when c = Cmd.Exit.cli_error -> exit 1
168 | | c -> exit c
169 |
170 | let () = if !Sys.interactive then () else main ()
171 |
--------------------------------------------------------------------------------
/support/gen_norm.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2015 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | let strf = Printf.sprintf
7 | let pp = Format.fprintf
8 | let str = Format.asprintf
9 |
10 | (* Structure sharing *)
11 |
12 | let intern (type a) ?eqh iter pp_v ppf x =
13 | let module H = Hashtbl.Make (struct
14 | type t = a
15 | let equal, hash = match eqh with Some fg -> fg | _ -> (=), Hashtbl.hash
16 | end) in
17 | let t = H.create 23 and n = ref 0 in
18 | x |> iter (fun v -> if not (H.mem t v) then begin
19 | let name = str "v%03d" !n in
20 | H.add t v name; incr n;
21 | pp ppf "@[<2>let %s =@ %a@]@\n" name pp_v v
22 | end);
23 | (fun ppf v -> match H.find_opt t v with
24 | | Some name -> pp ppf "%s" name
25 | | None -> pp_v ppf v)
26 |
27 | (* Normalization properties. *)
28 |
29 | let pp_boundary nf ucd ppf nf_quick_check =
30 | Gen.log "%s boundary property as character boolean trie map" nf;
31 | let prop_str = strf "%s_quick_check" nf in
32 | let prop u = match Gen.ucd_get ucd u nf_quick_check prop_str with
33 | | `Maybe | `False -> false
34 | | `True -> (Gen.ucd_get ucd u Uucd.canonical_combining_class "ccc") = 0
35 | in
36 | let tm, fm = Gen.bool_prop_maps prop in
37 | let tm_size, fm_size = Uunf_tmapbool.size tm, Uunf_tmapbool.size fm in
38 | let use_fm = tm_size > fm_size in
39 | Gen.log ", asserting data.\n"; Gen.assert_bool_prop_maps prop tm fm;
40 | Gen.log " boolean trie map (default true) size: %s\n"
41 | (Gen.str_of_size tm_size);
42 | Gen.log " boolean trie map (default false) size: %s\n"
43 | (Gen.str_of_size fm_size);
44 | Gen.log " Using map with default %b.\n\n" (not use_fm);
45 | let m = if use_fm then fm else tm in
46 | let pp_v = intern Uunf_tmapbool.iter_blobs Uunf_tmapbool.pp_v ppf m in
47 | pp ppf "@[<2>let %s_boundary_map =@ %a@]@\n@\n"
48 | nf (Uunf_tmapbool.dump_pp pp_v) m;
49 | ()
50 |
51 | let pp_ccc ppf ucd =
52 | Gen.log "ccc property as character byte trie map";
53 | let prop u = Gen.ucd_get ucd u Uucd.canonical_combining_class "ccc" in
54 | let m = Gen.byte_prop_map ~default:0 prop in
55 | let t_size = Uunf_tmapbyte.size m in
56 | Gen.log ", asserting data.\n"; Gen.assert_byte_prop_map prop m;
57 | Gen.log " trie map size: %s\n\n" (Gen.str_of_size t_size);
58 | let pp_v = intern Uunf_tmapbyte.iter_blobs Uunf_tmapbyte.pp_v ppf m in
59 | pp ppf "@[<2>let ccc_map =@ %a@]@\n@\n"
60 | (Uunf_tmapbyte.dump_pp pp_v) m;
61 | ()
62 |
63 | let pp_decomp ppf ucd =
64 | Gen.log "decomposition mapping as trie map";
65 | let default = Uunf_tmap.nil in
66 | let prop u =
67 | match Gen.ucd_get ucd u Uucd.decomposition_mapping "decomposition mapping"
68 | with
69 | | `Self -> default
70 | | `Cps cps ->
71 | let t = Gen.ucd_get ucd u Uucd.decomposition_type "decomposition_type"in
72 | if Gen.is_hangul_syllabe u then begin
73 | if t <> `Can then invalid_arg (strf "hangul not canon decomp %X" u);
74 | default
75 | end else begin
76 | let d = Array.of_list cps in
77 | let compat = t <> `Can in
78 | if compat then d.(0) <- (1 lsl 24) lor d.(0);
79 | d
80 | end
81 | in
82 | let m = Gen.prop_map ~default prop in
83 | let size_v = function [||] -> 0 | a -> 1 + Array.length a in
84 | let t_size = Uunf_tmap.size size_v m in
85 | let pp_decomp ppf = function
86 | | [||] -> pp ppf "nil"
87 | | a ->
88 | pp ppf "[|@,";
89 | for i = 0 to Array.length a - 1 do pp ppf "@,0x%X;@," a.(i) done;
90 | pp ppf "@,|]"
91 | in
92 | Gen.log ", asserting data.\n"; Gen.assert_prop_map prop m;
93 | Gen.log " trie map size: %s\n\n" (Gen.str_of_size t_size);
94 | pp ppf "@[<2>let decomp_map =@ %a@]@\n@\n" (Uunf_tmap.dump pp_decomp) m;
95 | ()
96 |
97 | module Cpmap = Uucd.Cpmap
98 |
99 | let pp_compose ppf ucd =
100 | Gen.log "composition to primary composites as trie map";
101 | let m = ref Cpmap.empty in
102 | let add_map cp1 cp2 c =
103 | let l = try Cpmap.find cp1 !m with Not_found -> [] in
104 | m := Cpmap.add cp1 ((cp2, c) :: l) !m
105 | in
106 | let add u =
107 | match Gen.ucd_get ucd u Uucd.decomposition_mapping "decomposition_mapping"
108 | with
109 | | `Self -> ()
110 | | `Cps cps ->
111 | let fce = "full_decomposition_exclusion" in
112 | if Gen.ucd_get ucd u Uucd.full_composition_exclusion fce then () else
113 | let t = Gen.ucd_get ucd u Uucd.decomposition_type "decomposition_type"in
114 | if t <> `Can then () else
115 | if Gen.is_hangul_syllabe u then () else
116 | match cps with
117 | | [cp1; cp2] -> add_map cp1 cp2 u
118 | | _ -> invalid_arg (strf "cannot handle composition for %X" u);
119 | in
120 | Gen.iter_uchar_ints add;
121 | let default = Uunf_tmap.nil in
122 | let max_comps = ref 0 in
123 | let prop u =
124 | try
125 | let comps = List.sort compare (Cpmap.find u !m) in
126 | let len = List.length comps in
127 | let a = Array.make (len * 2) 0 in
128 | let set i (cp2, c) = a.(2 * i) <- cp2; a.(2 * i + 1) <- c in
129 | List.iteri set comps;
130 | max_comps := max !max_comps len;
131 | a
132 | with Not_found -> Uunf_tmap.nil
133 | in
134 | let m = Gen.prop_map ~default prop in
135 | let size_v = function [||] -> 0 | a -> 1 + Array.length a in
136 | let t_size = Uunf_tmap.size size_v m in
137 | let pp_d ppf = function
138 | | [||] -> pp ppf "nil"
139 | | a ->
140 | pp ppf "[|@,";
141 | for i = 0 to Array.length a - 1 do pp ppf "@,0x%X;@," a.(i) done;
142 | pp ppf "@,|]"
143 | in
144 | Gen.log ", asserting data.\n"; Gen.assert_prop_map prop m;
145 | Gen.log " trie map size: %s\n" (Gen.str_of_size t_size);
146 | Gen.log " max num. of possible composition for a base char: %d\n\n"
147 | !max_comps;
148 | pp ppf "@[<2>let compose_map =@ %a@]@\n@\n" (Uunf_tmap.dump pp_d) m;
149 | ()
150 |
151 | let pp_version ppf ucd =
152 | let version = match String.split_on_char ' ' ucd.Uucd.description with
153 | | [tok] -> tok
154 | | [_; tok] -> tok
155 | | _ -> ucd.Uucd.description
156 | in
157 | pp ppf "@[<2>let unicode_version = \"%s\"@]@\n@\n" version
158 |
159 | let pp_norms ppf ucd =
160 | pp_version ppf ucd;
161 | pp ppf "open Uunf_tmapbool;;@\n@\n";
162 | pp_boundary "nfc" ucd ppf Uucd.nfc_quick_check;
163 | pp_boundary "nfd" ucd ppf Uucd.nfd_quick_check;
164 | pp_boundary "nfkc" ucd ppf Uucd.nfkc_quick_check;
165 | pp_boundary "nfkd" ucd ppf Uucd.nfkd_quick_check;
166 | pp ppf "open Uunf_tmapbyte;;@\n@\n";
167 | pp_ccc ppf ucd;
168 | pp ppf "open Uunf_tmap;;@\n@\n";
169 | pp_decomp ppf ucd;
170 | pp_compose ppf ucd;
171 | ()
172 |
173 | let pp_mod ppf ucd = Gen.pp_mod pp_norms ppf ucd
174 |
--------------------------------------------------------------------------------
/src/uunf.mli:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2012 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | (** Unicode text normalization.
7 |
8 | [Uunf] normalizes Unicode text. It supports all Unicode
9 | normalization forms. The module is independent from any IO
10 | mechanism or Unicode text data structure and it can process text
11 | without a complete in-memory representation of the data.
12 |
13 | The supported Unicode version is determined by the {!unicode_version}
14 | value.
15 |
16 | Consult the {{!basics}basics}, {{!limits}limitations} and
17 | {{!examples}examples} of use.
18 |
19 | {3 References}
20 | {ul
21 | {- The Unicode Consortium.
22 | {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}.
23 | (latest version)}
24 | {- Mark Davis.
25 | {e {{:http://www.unicode.org/reports/tr15/}UAX #15 Unicode Normalization
26 | Forms}}. (latest version)}
27 | {- The Unicode Consortium.
28 | {e {{:http://www.unicode.org/charts/normalization/}Normalization charts}.
29 | }}} *)
30 |
31 | (** {1 Normalize} *)
32 |
33 | type form = [ `NFD | `NFC | `NFKD | `NFKC ]
34 | (** The type for normalization forms.
35 | {ul
36 | {- [`NFD] {{:http://www.unicode.org/glossary/#normalization_form_d}
37 | normalization form D}, canonical decomposition.}
38 | {- [`NFC] {{:http://www.unicode.org/glossary/#normalization_form_c}
39 | normalization form C}, canonical decomposition followed by
40 | canonical composition
41 | ({{:http://www.w3.org/TR/charmod-norm/}recommended} for the www).}
42 | {- [`NFKD] {{:http://www.unicode.org/glossary/#normalization_form_kd}
43 | normalization form KD}, compatibility decomposition.}
44 | {- [`NFKC] {{:http://www.unicode.org/glossary/#normalization_form_kc}
45 | normalization form KC}, compatibility decomposition,
46 | followed by canonical composition.}} *)
47 |
48 | type t
49 | (** The type for Unicode text normalizers. *)
50 |
51 | type ret = [ `Uchar of Uchar.t | `End | `Await ]
52 | (** The type for normalizer results. See {!add}. *)
53 |
54 | val create : [< form ] -> t
55 | (** [create nf] is an Unicode text normalizer for the normal form [nf]. *)
56 |
57 | val form : t -> form
58 | (** [form n] is the normalization form of [n]. *)
59 |
60 | val add : t -> [ `Uchar of Uchar.t | `Await | `End ] -> ret
61 | (** [add n v] is:
62 | {ul
63 | {- [`Uchar u] if [u] is the next character in the normalized
64 | sequence. The client must then call [add] with [`Await]
65 | until [`Await] is returned.}
66 | {- [`Await] when the normalizer is ready to add a new
67 | [`Uchar] or [`End].}}
68 |
69 | For [v] use [`Uchar u] to add a new character to the sequence
70 | to normalize and [`End] to signal the end of sequence. After
71 | adding one of these two values, always call [add] with [`Await]
72 | until [`Await] is returned.
73 |
74 | {b Raises.} [Invalid_argument] if [`Uchar ] or [`End] is
75 | added directly after an [`Uchar] was returned by the normalizer
76 | or if an [`Uchar] is added after [`End] was added. *)
77 |
78 | val reset : t -> unit
79 | (** [reset n] resets the normalizer to a state equivalent to the
80 | state of [Uunf.create (Uunf.form n)]. *)
81 |
82 | val copy : t -> t
83 | (** [copy n] is a copy of [n] in its current state. Subsequent
84 | {!add}s on [n] do not affect the copy. *)
85 |
86 | val pp_ret : Format.formatter -> ret -> unit
87 | (** [pp_ret ppf v] prints an unspecified representation of [v] on [ppf]. *)
88 |
89 | (** {1:props Normalization properties}
90 |
91 | These properties are used internally to implement the normalizers.
92 | They are not needed to use the module but are exposed as they may
93 | be useful to implement other algorithms. *)
94 |
95 | val unicode_version : string
96 | (** [unicode_version] is the Unicode version supported by the module. *)
97 |
98 | val ccc : Uchar.t -> int
99 | (** [ccc u] is [u]'s
100 | {{:http://www.unicode.org/glossary/#combining_class}canonical combining
101 | class} value. *)
102 |
103 | val decomp : Uchar.t -> int array
104 | (** [decomp u] is [u]'s
105 | {{:http://www.unicode.org/glossary/#decomposition_mapping}decomposition
106 | mapping}. If the empty array is returned, [u] decomposes to itself.
107 |
108 | The first number in the array contains additional information, it
109 | cannot be used as an {!Uchar.t}. Use {!d_uchar} on the number to get the
110 | actual character and {!d_compatibility} to find out if this is
111 | a compatibility decomposition. All other characters of the array
112 | are guaranteed to be convertible using {!Uchar.of_int}.
113 |
114 | {b Warning.} Do {b not} mutate the array. *)
115 |
116 | val d_uchar : int -> Uchar.t
117 | (** See {!decomp}. *)
118 |
119 | val d_compatibility : int -> bool
120 | (** See {!decomp}. *)
121 |
122 | val composite : Uchar.t -> Uchar.t -> Uchar.t option
123 | (** [composite u1 u2] is the
124 | {{:http://www.unicode.org/glossary/#primary_composite}primary composite}
125 | canonically equivalent to the sequence [], if any. *)
126 |
127 | (** {1:limits Limitations}
128 |
129 | An [Uunf] normalizer consumes only a small bounded amount of
130 | memory on ordinary, {e meaningful} text. However on legal but {e
131 | degenerate} text like a
132 | {{:http://www.unicode.org/glossary/#starter}starter} followed by
133 | 10'000 combining
134 | {{:http://www.unicode.org/glossary/#nonspacing_mark}non-spacing
135 | marks} it will have to bufferize all the marks (a workaround is
136 | to first convert your input to
137 | {{:http://www.unicode.org/reports/tr15/#Stream_Safe_Text_Format}stream-safe
138 | text format}). *)
139 |
140 | (** {1:basics Basics}
141 |
142 | A normalizer is a stateful filter that inputs a sequence of
143 | characters and outputs an equivalent sequence in the requested
144 | normal form.
145 |
146 | The function {!create} returns a new normalizer for a given normal
147 | form:
148 | {[
149 | let nfd = Uunf.create `NFD
150 | ]}
151 | To add characters to the sequence to normalize, call {!add} on
152 | [nfd] with [`Uchar _]. To end the sequence, call {!add} on [nfd]
153 | with [`End]. The normalized sequence of characters is returned,
154 | character by character, by the successive calls to {!add}.
155 |
156 | The client and the normalizer must wait on each other to limit
157 | internal buffering: each time the client adds to the sequence by
158 | calling {!add} with [`Uchar] or [`End] it must continue to call
159 | {!add} with [`Await] until the normalizer returns [`Await]. In
160 | practice this leads to the following kind of control flow:
161 | {[
162 | let rec add acc v = match Uunf.add nfd v with
163 | | `Uchar u -> add (u :: acc) `Await
164 | | `Await | `End -> acc
165 | ]}
166 | For example to normalize the character [U+00E9] (é) with [nfd] to a list
167 | of characters we can write:
168 | {[
169 | let e_acute = Uchar.of_int 0x00E9
170 | let e_acute_nfd = List.rev (add (add [] (`Uchar e_acute)) `End)
171 | ]}
172 | The next section has more examples.
173 | *)
174 |
175 | (** {1:examples Examples}
176 |
177 | {2:utf8 UTF-8 normalization}
178 |
179 | [utf_8_normalize nf s] is the UTF-8 encoded normal form [nf] of
180 | the UTF-8 encoded string [s].
181 | {[
182 | let utf_8_normalize nf s =
183 | let rec add buf normalizer v = match Uunf.add normalizer v with
184 | | `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf normalizer `Await
185 | | `Await | `End -> ()
186 | in
187 | let rec loop buf s i max normalizer =
188 | if i > max then (add buf normalizer `End; Buffer.contents buf) else
189 | let dec = String.get_utf_8_uchar s i in
190 | add buf normalizer (`Uchar (Uchar.utf_decode_uchar dec));
191 | loop buf s (i + Uchar.utf_decode_length dec) max normalizer
192 | in
193 | let buf = Buffer.create (String.length s * 3) in
194 | let normalizer = Uunf.create nf in
195 | loop buf s 0 (String.length s - 1) normalizer
196 | ]}
197 |
198 | Note that this functionality is available directly through
199 | {!Uunf_string.normalize_utf_8}
200 | *)
201 |
--------------------------------------------------------------------------------
/src/uunf.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2012 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | type ret = [ `Uchar of Uchar.t | `End | `Await ]
7 |
8 | let pp_ret ppf v = match (v :> ret) with
9 | | `Uchar u -> Format.fprintf ppf "`Uchar U+%04X" (Uchar.to_int u)
10 | | `End -> Format.fprintf ppf "`End"
11 | | `Await -> Format.fprintf ppf "`Await"
12 |
13 | let err_exp_await add =
14 | invalid_arg (Format.asprintf "can't add %a, expected `Await" pp_ret add)
15 |
16 | let err_ended add =
17 | invalid_arg (Format.asprintf "can't add %a, `End already added" pp_ret add)
18 |
19 | (* The normalization process is implemented as described in UAX #15
20 | section 9.1 for normalizing the concatenation of normalized
21 | strings. We detect ranges of characters in the input sequence
22 | enclosed between two characters for which NFX_quick_check=YES *and*
23 | ccc = 0 (6.1.0 wrongly claims that quick_check=YES implies ccc = 0,
24 | we therefore call this property nfx_boundary). Only these ranges
25 | (including the left boundary) need to be bufferized to perform the
26 | normalization process. *)
27 |
28 | (* Characters *)
29 |
30 | let ux_none = max_int (* no char, outside unicode range. *)
31 | let u_dumb = (* placeholder, overwritten. *)
32 | `Uchar (Uchar.of_int 0x0000)
33 |
34 | (* Normalization properties. *)
35 |
36 | let unicode_version = Uunf_data.unicode_version
37 |
38 | let nfc_boundary u = Uunf_tmapbool.get Uunf_data.nfc_boundary_map u
39 | let nfd_boundary u = Uunf_tmapbool.get Uunf_data.nfd_boundary_map u
40 | let nfkc_boundary u = Uunf_tmapbool.get Uunf_data.nfkc_boundary_map u
41 | let nfkd_boundary u = Uunf_tmapbool.get Uunf_data.nfkd_boundary_map u
42 | let _ccc u = Uunf_tmapbyte.get Uunf_data.ccc_map u
43 | let ccc u = _ccc (Uchar.to_int u)
44 | let decomp_prop u = Uunf_tmap.get Uunf_data.decomp_map u
45 | let compose_prop u = Uunf_tmap.get Uunf_data.compose_map u
46 |
47 | module H = struct (* Hangul arithmetic constants. *)
48 | let sbase = 0xAC00
49 | let lbase = 0x1100
50 | let vbase = 0x1161
51 | let tbase = 0x11A7
52 | let scount = 11172
53 | let lcount = 19
54 | let vcount = 21
55 | let tcount = 28
56 | let ncount = 588
57 | let scount = 11172
58 | end
59 |
60 | let decomp u =
61 | let u = Uchar.to_int u in
62 | if u < 0xAC00 || 0xD7A3 < u then decomp_prop u else
63 | begin (* LV or LVT hangul composite *)
64 | let sindex = u - H.sbase in
65 | let l = H.lbase + (sindex / H.ncount) in
66 | let v = H.vbase + (sindex mod H.ncount) / H.tcount in
67 | let t = H.tbase + (sindex mod H.tcount) in
68 | if t = H.tbase then [|l; v|] else [|l; v; t|]
69 | end
70 |
71 | (* N.B. to help stream-safe text implementers we *could* use the bits
72 | 25-27 of [(decomp u).(0)] to indicate the number of initial non
73 | starters in the NFKD decomposition of [u] and bits and 28-30 to
74 | indicate the non starter count increment. *)
75 |
76 | let d_compatibility i = i land (1 lsl 24) > 0
77 | let _d_uchar i = i land 0x1FFFFF
78 | let d_uchar i = Uchar.unsafe_of_int (_d_uchar i)
79 |
80 | let _composite u1 u2 =
81 | if 0x1100 <= u1 && u1 <= 0x1112 then
82 | begin
83 | if u2 < 0x1161 || 0x1175 < u2 then ux_none else
84 | let l = u1 - H.lbase in (* LV hangul composite *)
85 | let v = u2 - H.vbase in
86 | H.sbase + l * H.ncount + v * H.tcount
87 | end
88 | else if 0xAC00 <= u1 && u1 <= 0xD788 && (u1 - 0x0AC00) mod H.tcount = 0 then
89 | begin
90 | if u2 < 0x11A8 || u2 > 0x11C3 then ux_none else
91 | (u1 + u2 - H.tbase) (* LVT hangul composite *)
92 | end
93 | else match compose_prop u1 with
94 | | [||] -> ux_none
95 | | a (* [u2; c; u2'; c'; ...] sorted *) ->
96 | let len = Array.length a / 2 in
97 | let i = ref 0 in
98 | try
99 | while (!i < len) do
100 | if a.(!i * 2) = u2 then raise Exit else incr i;
101 | done;
102 | ux_none
103 | with Exit -> (a.(!i * 2 + 1))
104 |
105 | let composite u1 u2 =
106 | let u = _composite (Uchar.to_int u1) (Uchar.to_int u2) in
107 | if u = ux_none then None else Some (Uchar.unsafe_of_int u)
108 |
109 | (* Normalize *)
110 |
111 | type form = [ `NFC | `NFD | `NFKC | `NFKD ]
112 | type state = (* normalizer state. *)
113 | | Start (* no cp seen yet. *)
114 | | Boundary (* cp with boundary = true found in n.uc, no accumulation yet. *)
115 | | Acc (* accumulate until next cp with boundary = true. *)
116 | | Flush (* next cp with boundary = true found, flush previous data. *)
117 | | End (* end of normalization sequence. *)
118 |
119 | type t =
120 | { form : form; (* normalization form. *)
121 | compat : bool; (* true if compatibility decomposition needed. *)
122 | compose : bool; (* true if composition needed. *)
123 | boundary : int -> bool; (* nfx_boundary. *)
124 | mutable state : state; (* normalizer state. *)
125 | mutable uc : [`Uchar of Uchar.t]; (* last cp with boundary = true. *)
126 | mutable acc : int array; (* code point accumulator. *)
127 | mutable first : int; (* index of first code point in acc. *)
128 | mutable last : int; (* index of last code point in acc. *)
129 | mutable is_end : bool;} (* [true] if `End was seen. *)
130 |
131 | let create_acc () = Array.make 35 ux_none
132 | let create form =
133 | let boundary, compat, compose = match form with
134 | | `NFC -> nfc_boundary, false, true
135 | | `NFD -> nfd_boundary, false, false
136 | | `NFKC -> nfkc_boundary, true, true
137 | | `NFKD -> nfkd_boundary, true, false
138 | in
139 | { form = (form :> form); compat; compose; boundary; state = Start;
140 | uc = u_dumb; acc = create_acc (); first = 0; last = -1; is_end = false}
141 |
142 | let get_u n = let `Uchar u = n.uc in Uchar.to_int u
143 | let acc_empty n = n.first > n.last
144 | let form n = n.form
145 | let copy n = { n with acc = Array.copy n.acc }
146 | let reset n =
147 | n.state <- Start; n.uc <- u_dumb; n.acc <- create_acc ();
148 | n.first <- 0; n.last <- -1; n.is_end <- false
149 |
150 | let grow_acc n =
151 | let len = Array.length n.acc in
152 | let acc' = Array.make (2 * len) ux_none in
153 | Array.blit n.acc 0 acc' 0 len; n.acc <- acc'
154 |
155 | let ordered_add n u = (* canonical ordering algorithm via insertion sort. *)
156 | n.last <- n.last + 1; if n.last = Array.length n.acc then grow_acc n;
157 | let c = _ccc u in
158 | if c = 0 then n.acc.(n.last) <- u else
159 | begin
160 | let i = ref (n.last - 1) in
161 | while (!i >= 0 && _ccc (n.acc.(!i)) > c) do
162 | n.acc.(!i + 1) <- n.acc.(!i); decr i; (* shift right. *)
163 | done;
164 | n.acc.(!i + 1) <- u
165 | end
166 |
167 | let rec add n u =
168 | if 0xAC00 <= u && u <= 0xD7A3 then
169 | begin (* LV or LVT hangul composite, copied from decomp to avoid alloc. *)
170 | let sindex = u - H.sbase in
171 | let l = H.lbase + (sindex / H.ncount) in
172 | let v = H.vbase + (sindex mod H.ncount) / H.tcount in
173 | let t = H.tbase + (sindex mod H.tcount) in
174 | if t = H.tbase then (ordered_add n l; ordered_add n v) else
175 | (ordered_add n l; ordered_add n v; ordered_add n t)
176 | end
177 | else
178 | begin match decomp_prop u with
179 | | [||] -> ordered_add n u
180 | | d ->
181 | if d_compatibility d.(0) && not n.compat then ordered_add n u else
182 | begin
183 | add n (_d_uchar d.(0));
184 | for i = 1 to Array.length d - 1 do add n d.(i) done
185 | end
186 | end
187 |
188 | let compose n = (* canonical composition algorithm. *)
189 | let rec loop ~last_starter ~prev_ccc i =
190 | if i > n.last then () else
191 | let ccc_i = _ccc n.acc.(i) in
192 | let u_comp = _composite n.acc.(last_starter) n.acc.(i) in
193 | match (u_comp = ux_none || (ccc_i = 0 && last_starter <> i - 1)) with
194 | | true ->
195 | let last_starter = if ccc_i = 0 then i else last_starter in
196 | loop ~last_starter ~prev_ccc:ccc_i (i + 1)
197 | | false ->
198 | match prev_ccc <> 0 && prev_ccc >= ccc_i with
199 | | true -> loop ~last_starter ~prev_ccc:ccc_i (i + 1)
200 | | false ->
201 | n.acc.(last_starter) <- u_comp;
202 | Array.blit n.acc (i + 1) n.acc i (n.last - i);
203 | n.last <- n.last - 1;
204 | let prev_ccc = _ccc n.acc.(last_starter) in
205 | loop ~last_starter ~prev_ccc (last_starter + 1)
206 | in
207 | let last_starter = n.first in
208 | let prev_ccc = _ccc n.acc.(last_starter) in
209 | loop ~last_starter ~prev_ccc (last_starter + 1)
210 |
211 | let flush_next n =
212 | let ret = `Uchar (Uchar.unsafe_of_int n.acc.(n.first)) in
213 | if n.first = n.last then (n.first <- 0; n.last <- -1) else
214 | (n.first <- n.first + 1);
215 | ret
216 |
217 | let flush_start n = if n.compose then compose n; flush_next n
218 | let add n = function
219 | | `Uchar u as uc ->
220 | let u = Uchar.to_int u in
221 | begin match n.state with
222 | | Boundary ->
223 | if n.boundary u
224 | then (let prev = n.uc in n.uc <- uc; (prev :> ret))
225 | else (n.state <- Acc; add n (get_u n); add n u; `Await)
226 | | Acc ->
227 | if n.boundary u
228 | then (n.state <- Flush; n.uc <- uc; flush_start n)
229 | else (add n u; `Await)
230 | | Start ->
231 | if n.boundary u
232 | then (n.state <- Boundary; n.uc <- uc; `Await)
233 | else (n.state <- Acc; add n u; `Await)
234 | | Flush -> err_exp_await uc
235 | | End -> err_ended uc
236 | end
237 | | `Await ->
238 | begin match n.state with
239 | | Flush ->
240 | if not (acc_empty n) then flush_next n else
241 | if n.is_end then (n.state <- End; `End) else
242 | (n.state <- Boundary; `Await)
243 | | Start | Boundary | Acc -> `Await
244 | | End -> `End
245 | end
246 | | `End ->
247 | n.is_end <- true;
248 | begin match n.state with
249 | | Boundary -> n.state <- End; (n.uc :> ret)
250 | | Acc -> n.state <- Flush; flush_start n
251 | | Start -> n.state <- End; `End
252 | | Flush -> err_exp_await `End
253 | | End -> err_ended `End
254 | end
255 |
--------------------------------------------------------------------------------
/test/test_uunf.ml:
--------------------------------------------------------------------------------
1 | (*---------------------------------------------------------------------------
2 | Copyright (c) 2012 The uunf programmers. All rights reserved.
3 | SPDX-License-Identifier: ISC
4 | ---------------------------------------------------------------------------*)
5 |
6 | open B0_testing
7 | open B0_std
8 | open Result.Syntax
9 |
10 | (* Uunf tests, including Unicode's Normalization Conformance tests *)
11 |
12 | let uchar_dump ppf u = Format.fprintf ppf "U+%04X" (Uchar.to_int u)
13 |
14 | (* Conformance data decoding and tests *)
15 |
16 | type conformance_test = int list array * string (* columns + comment. *)
17 | module Uset = Set.Make (Uchar) (* not a diet set, but will do here. *)
18 |
19 | let uchar_of_string v = (* parses a scalar value. *)
20 | let is_hex c = (0x30 <= c && c <= 0x39) || (0x41 <= c && c <= 0x46) in
21 | let cp = ref 0 in
22 | for k = 0 to (String.length v) - 1 do
23 | let c = Char.code v.[k] in
24 | if not (is_hex c) then failwith "" else
25 | cp := !cp * 16 + (if c <= 0x39 then c - 48 else c - 55)
26 | done;
27 | Uchar.of_int !cp
28 |
29 | let uchars_of_string v = List.map uchar_of_string (String.split_on_char ' ' v)
30 |
31 | let decode_conformance_data test_data_file =
32 | Test.log "Reading test data from %a" (Fmt.code' Fpath.pp) test_data_file;
33 | let split_string sep s =
34 | List.filter (fun s -> s <> "") (String.split_on_char sep s)
35 | in
36 | let rec loop tests collect_decomps decomps = function
37 | | [] -> List.rev tests, decomps
38 | | l :: ls ->
39 | try match split_string '#' l with
40 | | "@Part1 " :: _ -> loop tests true decomps ls
41 | | "@Part2 " :: _ -> loop tests false decomps ls
42 | | p :: _ :: _ when p.[0] = '@' -> loop tests collect_decomps decomps ls
43 | | [] | _ :: [] -> loop tests collect_decomps decomps ls
44 | | test :: comment :: _ ->
45 | begin match split_string ';' test with
46 | | c1 :: c2 :: c3 :: c4 :: c5 :: _ ->
47 | let test = [| uchars_of_string c1; uchars_of_string c2;
48 | uchars_of_string c3; uchars_of_string c4;
49 | uchars_of_string c5; |], comment
50 | in
51 | let decomps =
52 | if not collect_decomps then decomps else
53 | match (fst test).(0) with [ uchar ] ->
54 | Uset.add uchar decomps
55 | | _ -> failwith ""
56 | in
57 | loop (test :: tests) collect_decomps decomps ls
58 | | _ -> failwith ""
59 | end
60 | with Failure _ ->
61 | Test.log "Unable to parse line:\n`%s'\n" l;
62 | loop tests collect_decomps decomps ls
63 | in
64 | let* s = Os.File.read test_data_file in
65 | Ok (loop [] false Uset.empty (String.split_on_char '\n' s))
66 |
67 |
68 | let conformance = Test.Arg.make ()
69 |
70 | let test_conformance_normalizations =
71 | Test.test' conformance "conformance normalization invariants" @@
72 | fun (tests, _) ->
73 | let nc, nfc = Array.init 5 (fun _ -> Uunf.create `NFC), Array.make 5 [] in
74 | let nd, nfd = Array.init 5 (fun _ -> Uunf.create `NFD), Array.make 5 [] in
75 | let nkc, nfkc = Array.init 5 (fun _ -> Uunf.create `NFKC), Array.make 5 [] in
76 | let nkd, nfkd = Array.init 5 (fun _ -> Uunf.create `NFKD), Array.make 5 [] in
77 | let rec add n acc v = match Uunf.add n v with
78 | | `Uchar u -> add n (u :: acc) `Await
79 | | `Await | `End -> acc
80 | in
81 | let parallel_add i v =
82 | nfc.(i) <- add nc.(i) nfc.(i) v;
83 | nfd.(i) <- add nd.(i) nfd.(i) v;
84 | nfkc.(i) <- add nkc.(i) nfkc.(i) v;
85 | nfkd.(i) <- add nkd.(i) nfkd.(i) v
86 | in
87 | let test (cs, comment) =
88 | for i = 0 to 4 do
89 | Uunf.reset nc.(i); nfc.(i) <- [];
90 | Uunf.reset nd.(i); nfd.(i) <- [];
91 | Uunf.reset nkc.(i); nfkc.(i) <- [];
92 | Uunf.reset nkd.(i); nfkd.(i) <- [];
93 | List.iter (fun u -> parallel_add i (`Uchar u)) cs.(i);
94 | parallel_add i `End;
95 | nfc.(i) <- List.rev nfc.(i);
96 | nfd.(i) <- List.rev nfd.(i);
97 | nfkc.(i) <- List.rev nfkc.(i);
98 | nfkd.(i) <- List.rev nfkd.(i);
99 | done;
100 | if cs.(1) <> nfc.(0) then Test.fail "NFC: c2 <> toNFC(c1) for%s" comment;
101 | if cs.(1) <> nfc.(1) then Test.fail "NFC: c2 <> toNFC(c2) for%s" comment;
102 | if cs.(1) <> nfc.(2) then Test.fail "NFC: c2 <> toNFC(c3) for%s" comment;
103 | if cs.(3) <> nfc.(3) then Test.fail "NFC: c4 <> toNFC(c4) for%s" comment;
104 | if cs.(3) <> nfc.(4) then Test.fail "NFC: c4 <> toNFC(c5) for%s" comment;
105 | if cs.(2) <> nfd.(0) then Test.fail "NFD: c3 <> toNFD(c1) for%s" comment;
106 | if cs.(2) <> nfd.(1) then Test.fail "NFD: c3 <> toNFD(c2) for%s" comment;
107 | if cs.(2) <> nfd.(2) then Test.fail "NFD: c3 <> toNFD(c3) for%s" comment;
108 | if cs.(4) <> nfd.(3) then Test.fail "NFD: c5 <> toNFD(c4) for%s" comment;
109 | if cs.(4) <> nfd.(4) then Test.fail "NFD: c5 <> toNFD(c5) for%s" comment;
110 | if cs.(3) <> nfkc.(0) then Test.fail "NFKC: c4 <> toNFKC(c1) for%s" comment;
111 | if cs.(3) <> nfkc.(1) then Test.fail "NFKC: c4 <> toNFKC(c2) for%s" comment;
112 | if cs.(3) <> nfkc.(2) then Test.fail "NFKC: c4 <> toNFKC(c3) for%s" comment;
113 | if cs.(3) <> nfkc.(3) then Test.fail "NFKC: c4 <> toNFKC(c4) for%s" comment;
114 | if cs.(3) <> nfkc.(4) then Test.fail "NFKC: c4 <> toNFKC(c5) for%s" comment;
115 | if cs.(4) <> nfkd.(0) then Test.fail "NFKD: c5 <> toNFKD(c1) for%s" comment;
116 | if cs.(4) <> nfkd.(1) then Test.fail "NFKD: c5 <> toNFKD(c2) for%s" comment;
117 | if cs.(4) <> nfkd.(2) then Test.fail "NFKD: c5 <> toNFKD(c3) for%s" comment;
118 | if cs.(4) <> nfkd.(3) then Test.fail "NFKD: c5 <> toNFKD(c4) for%s" comment;
119 | if cs.(4) <> nfkd.(4) then Test.fail "NFKD: c5 <> toNFKD(c5) for%s" comment;
120 | in
121 | List.iter test tests
122 |
123 | let test_conformance_non_decomposables =
124 | Test.test' conformance "conformance of non-decomposable characters" @@
125 | fun (_, decomps) ->
126 | let nc = Uunf.create `NFC in
127 | let nd = Uunf.create `NFD in
128 | let nkc = Uunf.create `NFKC in
129 | let nkd = Uunf.create `NFKD in
130 | let norm n u =
131 | let rec add acc v = match Uunf.add n v with
132 | | `Uchar u -> add (u :: acc) `Await
133 | | `Await | `End -> acc
134 | in
135 | List.rev (add (add [] (`Uchar u)) `End)
136 | in
137 | let check u =
138 | if Uset.mem u decomps then () else
139 | begin
140 | let ul = [u] in
141 | Uunf.reset nc; Uunf.reset nd; Uunf.reset nkc; Uunf.reset nkd;
142 | if norm nc u <> ul then
143 | Test.fail "NFC: %a <> toNFC(%a)" uchar_dump u uchar_dump u;
144 | if norm nd u <> ul then
145 | Test.fail "NFD: %a <> toNFD(%a)" uchar_dump u uchar_dump u;
146 | if norm nkc u <> ul then
147 | Test.fail "NFKC: %a <> toNFKC(%a)" uchar_dump u uchar_dump u;
148 | if norm nkd u <> ul then
149 | Test.fail "NFKD: %a <> toNFKD(%a)" uchar_dump u uchar_dump u;
150 | end
151 | in
152 | (* For each unicode scalar value *)
153 | let rec loop u =
154 | if Uchar.equal Uchar.max u then check u else
155 | (check u; loop (Uchar.succ u))
156 | in
157 | loop Uchar.min
158 |
159 | let test_ccc =
160 | Test.test "Uunf.ccc" @@ fun () ->
161 | Test.int (Uunf.ccc (Uchar.of_int 0x0020)) 0 ~__POS__;
162 | Test.int (Uunf.ccc (Uchar.of_int 0x0301)) 230 ~__POS__;
163 | ()
164 |
165 | let various_norm_tests ?__POS__ test =
166 | Test.block ?__POS__ @@ fun () ->
167 | test [0x1E69] `NFD [0x0073; 0x0323; 0x0307] ;
168 | test [0x1E69] `NFC [0x1E69];
169 | test [0x1E0B; 0x0323] `NFD [0x0064; 0x0323; 0x0307];
170 | test [0x1E0B; 0x0323] `NFC [0x1E0D; 0x0307];
171 | test [0xFB01] `NFD [0xFB01];
172 | test [0xFB01] `NFC [0xFB01];
173 | test [0xFB01] `NFKD [0x0066; 0x0069];
174 | test [0xFB01] `NFKC [0x0066; 0x0069];
175 | test [0x0032; 0x2075] `NFD [0x0032; 0x2075];
176 | test [0x0032; 0x2075] `NFC [0x0032; 0x2075];
177 | test [0x0032; 0x2075] `NFKD [0x0032; 0x0035];
178 | test [0x0032; 0x2075] `NFKC [0x0032; 0x0035];
179 | test [0x1E9B; 0x0323] `NFD [0x017F; 0x0323; 0x307];
180 | test [0x1E9B; 0x0323] `NFC [0x1E9B; 0x0323; ];
181 | test [0x1E9B; 0x0323] `NFKD [0x0073; 0x0323; 0x0307];
182 | test [0x1E9B; 0x0323] `NFKC [0x1E69];
183 | test [0x0041; 0x007A; 0x0335; 0x0327; 0x0324; 0x0301; 0x0041] `NFC
184 | [0x0041; 0x017A; 0x0335; 0x0327; 0x0324; 0x0041];
185 | (* found by crowbar *)
186 | test [0x01C6; 0x032D] `NFKC [0x0064; 0x017E; 0x032D];
187 | test [0xFF80; 0x1FD3; 0xFF9E; 0x1FD3;] `NFKC [0x30BF; 0x0390; 0x3099; 0x0390];
188 | (* found again by crowbar *)
189 | test [0xC100; 0x20D2; 0x11C1; 0x11C1] `NFC [0xC100; 0x20D2; 0x11C1; 0x11C1];
190 | ()
191 |
192 | let test_specific =
193 | Test.test "specific normalizations" @@ fun () ->
194 | let test src nf dst =
195 | let n = Uunf.create nf in
196 | let rec add acc v = match Uunf.add n v with
197 | | `Uchar u -> add (u :: acc) `Await
198 | | `Await | `End -> acc
199 | in
200 | let add_uchar acc u = add acc (`Uchar (Uchar.of_int u)) in
201 | let nseq = List.rev (add (List.fold_left add_uchar [] src) `End) in
202 | let dst = List.map Uchar.of_int dst in
203 | if nseq <> dst then Test.fail ""
204 | in
205 | various_norm_tests test ~__POS__
206 |
207 | let test_uunf_string =
208 | Test.test "Uunf_string" @@ fun () ->
209 | let test enc normalize =
210 | let b = Buffer.create 42 in
211 | let enc us =
212 | let rec loop = function
213 | | u :: us -> enc b (Uchar.of_int u); loop us
214 | | [] -> Buffer.contents b
215 | in
216 | Buffer.reset b; loop us
217 | in
218 | let test src nf dst = assert ((normalize nf (enc src)) = (enc dst)) in
219 | various_norm_tests test ~__POS__
220 | in
221 | test Buffer.add_utf_8_uchar Uunf_string.normalize_utf_8;
222 | test Buffer.add_utf_16be_uchar Uunf_string.normalize_utf_16be;
223 | test Buffer.add_utf_16le_uchar Uunf_string.normalize_utf_16le;
224 | ()
225 |
226 | let test_flushing_end_seq =
227 | Test.test "flushing end of stream" @@ fun () ->
228 | let n = Uunf.create `NFKC in
229 | let uchar u = `Uchar (Uchar.of_int u) in
230 | if Uunf.add n (uchar 0x2105) <> `Await then Test.fail "";
231 | if Uunf.add n `Await <> `Await then Test.fail "";
232 | if Uunf.add n `End <> (uchar 0x0063) then Test.fail "";
233 | if Uunf.add n `Await <> (uchar 0x002F) then Test.fail "";
234 | if Uunf.add n `Await <> (uchar 0x006F) then Test.fail "";
235 | if Uunf.add n `Await <> `End then Test.fail "";
236 | ()
237 |
238 | let main () =
239 | let test_data_file =
240 | let default = Fpath.v "test/NormalizationTest.txt" in
241 | let doc = "Unicode normalization conformance test file." in
242 | Cmdliner.Arg.(value & pos 0 B0_std_cli.filepath default & info [] ~doc)
243 | in
244 | Test.main' test_data_file @@ fun test_data_file ->
245 | match decode_conformance_data test_data_file with
246 | | Error e -> Test.failstop "%s" e
247 | | Ok data ->
248 | let args = Test.Arg.[value conformance data] in
249 | Test.autorun ~args ()
250 |
251 | let () = if !Sys.interactive then () else exit (main ())
252 |
--------------------------------------------------------------------------------