├── .gitignore ├── .merlin ├── .ocp-indent ├── CHANGES.md ├── LICENSE.md ├── README.md ├── _tags ├── doc ├── api.odocl └── dev.odocl ├── opam ├── pkg ├── META └── pkg.ml ├── src ├── utext.ml ├── utext.mli ├── utext.mllib └── utext_top_init.ml └── test └── test.ml /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | tmp 3 | *~ 4 | \.\#* 5 | \#*# 6 | *.install 7 | *.native 8 | *.byte -------------------------------------------------------------------------------- /.merlin: -------------------------------------------------------------------------------- 1 | PKG bytes pvec uucp uunf uuseg uutf 2 | S src 3 | S test 4 | B _build/** 5 | -------------------------------------------------------------------------------- /.ocp-indent: -------------------------------------------------------------------------------- 1 | strict_with=always,match_clause=4,strict_else=never 2 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | vX.Y.Z YYYY-MM-DD Location 2 | -------------------------- 3 | 4 | First release. 5 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The utext programmers 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | utext — Unicode text for OCaml 2 | ------------------------------------------------------------------------------- 3 | %%VERSION%% 4 | 5 | utext is an OCaml type for storing and processing Unicode text. 6 | 7 | utext is distributed under the ISC license and depends on [pvec][pvec] 8 | 9 | [pvec]: http://erratique.ch/software/pvec 10 | 11 | Homepage: http://erratique.ch/software/utext 12 | 13 | ## Installation 14 | 15 | utext can be installed with `opam`: 16 | 17 | opam install utext 18 | 19 | If you don't use `opam` consult the [`opam`](opam) file for build 20 | instructions. 21 | 22 | ## Documentation 23 | 24 | The documentation and API reference is generated from the source 25 | interfaces. It can be consulted [online][doc] or via `odig doc 26 | utext`. 27 | 28 | [doc]: http://erratique.ch/software/utext/doc 29 | 30 | ## Sample programs 31 | 32 | If you installed utext with `opam` sample programs are located in 33 | the directory `opam var utext:doc`. 34 | 35 | In the distribution sample programs and tests are located in the 36 | [`test`](test) directory. They can be built and run 37 | with: 38 | 39 | topkg build --tests true && topkg test 40 | -------------------------------------------------------------------------------- /_tags: -------------------------------------------------------------------------------- 1 | true : bin_annot, safe_string, package(bytes pvec uucp uunf uuseg uuseg.string uutf) 2 | : include 3 | : include -------------------------------------------------------------------------------- /doc/api.odocl: -------------------------------------------------------------------------------- 1 | Utext -------------------------------------------------------------------------------- /doc/dev.odocl: -------------------------------------------------------------------------------- 1 | Utext -------------------------------------------------------------------------------- /opam: -------------------------------------------------------------------------------- 1 | opam-version: "1.2" 2 | maintainer: "Daniel Bünzli " 3 | authors: ["The utext programmers"] 4 | homepage: "http://erratique.ch/software/utext" 5 | doc: "http://erratique.ch/software/utext/doc" 6 | license: "ISC" 7 | dev-repo: "http://erratique.ch/repos/utext.git" 8 | bug-reports: "https://github.com/dbuenzli/utext/issues" 9 | tags: [] 10 | available: [ ocaml-version >= "4.01.0"] 11 | depends: 12 | [ 13 | "ocamlfind" {build} 14 | "ocamlbuild" {build} 15 | "topkg" {build & >= "0.9.0"} 16 | "pvec" 17 | "uutf" 18 | "uucp" 19 | "uunf" 20 | "uuseg" 21 | ] 22 | depopts: [] 23 | build: 24 | [[ 25 | "ocaml" "pkg/pkg.ml" "build" 26 | "--dev-pkg" "%{dev}%" 27 | ]] 28 | -------------------------------------------------------------------------------- /pkg/META: -------------------------------------------------------------------------------- 1 | description = "Unicode text datastructure for OCaml" 2 | version = "%%VERSION_NUM%%" 3 | requires = "" 4 | archive(byte) = "utext.cma" 5 | archive(native) = "utext.cmxa" 6 | plugin(byte) = "utext.cma" 7 | plugin(native) = "utext.cmxs" 8 | -------------------------------------------------------------------------------- /pkg/pkg.ml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ocaml 2 | #use "topfind" 3 | #require "topkg" 4 | open Topkg 5 | 6 | let () = 7 | Pkg.describe "utext" @@ fun c -> 8 | Ok [ Pkg.mllib "src/utext.mllib"; 9 | Pkg.lib "src/utext_top_init.ml"; 10 | Pkg.test "test/test"; ] 11 | -------------------------------------------------------------------------------- /src/utext.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2018 The utext programmers. All rights reserved. 3 | Distributed under the ISC license, see terms at the end of the file. 4 | %%NAME%% %%VERSION%% 5 | ---------------------------------------------------------------------------*) 6 | 7 | let _strf = Printf.sprintf 8 | let unicode_version = Uucp.unicode_version 9 | 10 | type t = Uchar.t Pvec.t 11 | 12 | let empty = Pvec.empty 13 | let v = Pvec.v 14 | let init = Pvec.init 15 | let of_uchar = Pvec.singleton 16 | let _of_ascii s = init ~len:(String.length s) (fun i -> Uchar.of_char s.[i]) 17 | 18 | (* Predicates and comparison. *) 19 | 20 | let is_empty = Pvec.is_empty 21 | let equal t0 t1 = Pvec.equal ~eq:Uchar.equal t0 t1 22 | let compare t0 t1 = Pvec.compare ~cmp:Uchar.compare t0 t1 23 | 24 | (* Normalization *) 25 | 26 | type normalization = [ `NFC | `NFD | `NFKC | `NFKD ] 27 | 28 | let normalized nf t = 29 | (* XXX would be interesting to do a direct reimplementation on pvec. 30 | and benchmark the diff (but on which corpus ?) also how would the 31 | case algorithms react. Also consider first checking using 32 | [is_normalized]. *) 33 | let rec add n acc v = match Uunf.add n v with 34 | | `Uchar u -> add n (Pvec.add_last acc u) `Await 35 | | `Await | `End -> acc 36 | in 37 | let add_uchar n acc u = add n acc (`Uchar u) in 38 | let n = Uunf.create nf in 39 | add n (Pvec.fold_left (add_uchar n) empty t) `End 40 | 41 | let is_normalized nf t = 42 | (* FIXME this is ridiculous, do a direct reimplementation. *) 43 | equal t (normalized nf t) 44 | 45 | (* Case mapping *) 46 | 47 | let append_uchar_map map acc u = match map u with 48 | | `Self -> Pvec.add_last acc u 49 | | `Uchars l -> List.fold_left Pvec.add_last acc l 50 | 51 | let lowercased t = 52 | Pvec.fold_left (append_uchar_map Uucp.Case.Map.to_lower) empty t 53 | 54 | let uppercased t = 55 | Pvec.fold_left (append_uchar_map Uucp.Case.Map.to_upper) empty t 56 | 57 | let _map_first_if_cased map t = match Pvec.first_el t with 58 | | None -> t 59 | | Some first -> 60 | match Uucp.Case.is_cased first with 61 | | false -> t 62 | | true -> 63 | match map first with 64 | | `Self -> t 65 | | `Uchars us -> 66 | let first_map = Pvec.of_list us in 67 | Pvec.(first_map ++ rem_first t) 68 | 69 | let capitalized t = _map_first_if_cased Uucp.Case.Map.to_title t 70 | let uncapitalized t = _map_first_if_cased Uucp.Case.Map.to_lower t 71 | 72 | (* Case folding *) 73 | 74 | let casefolded t = 75 | Pvec.fold_left (append_uchar_map Uucp.Case.Fold.fold) empty t 76 | 77 | let uchar_app acc = function 78 | | `Uchar u -> Pvec.add_last acc u 79 | | `End -> acc 80 | | `Await -> acc 81 | 82 | let rec nf_app n app acc v = match Uunf.add n v with 83 | | `Uchar u as v -> nf_app n app (app acc v) `Await 84 | | `End -> app acc `End 85 | | `Await -> acc 86 | 87 | let map_app map app acc = function 88 | | `Uchar u as v -> 89 | begin match map u with 90 | | `Self -> app acc v 91 | | `Uchars l -> List.fold_left (fun acc u -> app acc (`Uchar u)) acc l 92 | end 93 | | `End -> app acc `End 94 | | `Await -> acc 95 | 96 | let fold_with_append append t = 97 | let add_uchar acc u = append acc (`Uchar u) in 98 | append (Pvec.fold_left add_uchar empty t) `End 99 | 100 | let canonical_caseless_key t = 101 | (* XXX TUS mentions optims that could eschew the first NFD. *) 102 | (* This is [normalized `NFD @@ casefolded @@ normalized `NFD t] *) 103 | let append = 104 | nf_app (Uunf.create `NFD) @@ map_app Uucp.Case.Fold.fold @@ 105 | nf_app (Uunf.create `NFD) @@ uchar_app 106 | in 107 | fold_with_append append t 108 | 109 | let compatibility_caseless_key t = 110 | (* This is [normalized `NFKD @@ casefold @@ normalized `NFKD @@ casefold @@ 111 | normalized NFD t] *) 112 | let append = 113 | nf_app (Uunf.create `NFD) @@ map_app Uucp.Case.Fold.fold @@ 114 | nf_app (Uunf.create `NFKD) @@ map_app Uucp.Case.Fold.fold @@ 115 | nf_app (Uunf.create `NFKD) uchar_app 116 | in 117 | fold_with_append append t 118 | 119 | (* Identifiers *) 120 | 121 | let is_identifier t = match Pvec.first_el t with 122 | | None -> false 123 | | Some u -> 124 | (* N.B. is_xid_continue => is_xid_start *) 125 | Uucp.Id.is_xid_start u && Pvec.for_all Uucp.Id.is_xid_continue t 126 | 127 | let identifier_caseless_key t = 128 | (* This is [normalize `NFC @@ nfkc_casefold @@ normalize NFD t] *) 129 | let append = 130 | nf_app (Uunf.create `NFD) @@ map_app Uucp.Case.Nfkc_fold.fold @@ 131 | nf_app (Uunf.create `NFC) @@ uchar_app 132 | in 133 | fold_with_append append t 134 | 135 | (* Breaking lines and paragraphs *) 136 | 137 | type newline = [ `ASCII | `NLF | `Readline ] 138 | 139 | let lines ?(drop_empty = false) ?(newline = `Readline) t = 140 | (* XXX range vs spans *) 141 | let add_line ls first last = 142 | let line = Pvec.range ~first ~last t in 143 | if drop_empty && Pvec.is_empty line then ls else Pvec.(add_last ls line) 144 | in 145 | let rec lines (ls, first as acc) i u = match Uchar.to_int u with 146 | | 0x000D (* CR *) -> 147 | let first' = match Pvec.el t (i + 1) with 148 | | Some u when Uchar.to_int u = 0x000A (* LF *) -> i + 2 149 | | None | Some _ -> i + 1 150 | in 151 | add_line ls first (i - 1), first' 152 | | 0x000A (* LF *) -> 153 | if first > i (* last i was CR *) then acc else 154 | add_line ls first (i - 1), i + 1 155 | | 0x0085 (* NEL *) when newline <> `ASCII -> 156 | add_line ls first (i - 1), i + 1 157 | | (0x000C (* FF *) | 0x2028 (* LS *) 158 | | 0x2029 (* PS *)) when newline = `Readline -> 159 | add_line ls first (i - 1), i + 1 160 | | _ -> acc 161 | in 162 | let ls, first = Pvec.foldi_left lines (empty, 0) t in 163 | add_line ls first (Pvec.length t - 1) 164 | 165 | let paragraphs ?(drop_empty = false) t = 166 | let fst_half_break u = match Uchar.to_int u with 167 | | 0x000D | 0x000A | 0x0085 | 0x2028 | 0x2029 -> true | _ -> false 168 | in 169 | let snd_half_break bstart k = match Pvec.el t k with 170 | | None -> None 171 | | Some u -> 172 | begin match Uchar.to_int u with 173 | | 0x2029 (* PS *) -> Some (k, k) 174 | | 0x000A (* LF *) | 0x0085 (* NEL *) | 0x2028 (* LS *) -> Some (bstart, k) 175 | | 0x000D (* CR *) -> 176 | begin match Pvec.el t (k + 1) with 177 | | None -> Some (bstart, k) 178 | | Some u -> 179 | if Uchar.to_int u = 0x000A (* LF *) then Some (bstart, k + 1) else 180 | Some (bstart, k) 181 | end 182 | | _ -> None 183 | end 184 | in 185 | let rec find_break start = match Pvec.left_find ~start fst_half_break t with 186 | | None -> None 187 | | Some (i, u) -> 188 | let break = match Uchar.to_int u with 189 | | 0x2029 (* PS *) -> Some (i, i) 190 | | 0x000A (* LF *) 191 | | 0x0085 (* NEL *) | 0x2028 (* LS *) -> snd_half_break i (i + 1) 192 | | 0x000D (* CR *) -> 193 | begin match Pvec.el t (i + 1) with 194 | | None -> None 195 | | Some u -> 196 | if Uchar.to_int u = 0x000A (* LF *) then snd_half_break i (i + 2) 197 | else snd_half_break i (i + 1) 198 | end 199 | | _ -> assert false 200 | in 201 | match break with None -> find_break (i + 2) | Some _ as b -> b 202 | in 203 | let add_para ps first last = 204 | let para = Pvec.range ~first ~last t in 205 | if drop_empty && Pvec.is_empty para then ps else Pvec.(add_last ps para) 206 | in 207 | let rec loop ps first start = match find_break start with 208 | | None -> add_para ps first (Pvec.length t - 1) 209 | | Some (bfirst, blast) -> 210 | loop (add_para ps first (bfirst - 1)) (blast + 1) (blast + 1) 211 | in 212 | loop empty 0 0 213 | 214 | (* Segmentation *) 215 | 216 | type boundary = [ `Grapheme_cluster | `Line_break | `Sentence | `Word ] 217 | 218 | let segments b t = 219 | let rec add s (segs, seg as acc) v = match Uuseg.add s v with 220 | | `Uchar u -> add s (segs, (Pvec.add_last seg u)) `Await 221 | | `Boundary -> 222 | let segs = if is_empty seg then segs else Pvec.add_last segs seg in 223 | add s (segs, empty) `Await 224 | | `Await | `End -> acc 225 | in 226 | let add_uchar s acc u = add s acc (`Uchar u) in 227 | let s = Uuseg.create b in 228 | fst (add s (Pvec.fold_left (add_uchar s) (empty, empty) t) `End) 229 | 230 | let segment_count b t = 231 | let rec add s count v = match Uuseg.add s v with 232 | | `Uchar _ -> add s count `Await 233 | | `Boundary -> add s (count + 1) `Await 234 | | `Await | `End -> count 235 | in 236 | let add_uchar s count u = add s count (`Uchar u) in 237 | let s = Uuseg.create b in 238 | match add s (Pvec.fold_left (add_uchar s) 0 t) `End with 239 | | 0 -> 0 240 | | n -> n - 1 241 | 242 | type pos = int 243 | 244 | let boundaries b t = 245 | let rec add s is i v = match Uuseg.add s v with 246 | | `Uchar _ -> add s is i `Await 247 | | `Boundary -> add s (Pvec.add_last is i) i `Await 248 | | `Await | `End -> is 249 | in 250 | let add_uchar s is i u = add s is i (`Uchar u) in 251 | let s = Uuseg.create b in 252 | add s (Pvec.foldi_left (add_uchar s) empty t) (Pvec.length t) `End 253 | 254 | let boundaries_mandatory b t = 255 | let rec add s is i v = match Uuseg.add s v with 256 | | `Uchar _ -> add s is i `Await 257 | | `Boundary -> add s (Pvec.add_last is (i, Uuseg.mandatory s)) i `Await 258 | | `Await | `End -> is 259 | in 260 | let add_uchar s is i u = add s is i (`Uchar u) in 261 | let s = Uuseg.create b in 262 | add s (Pvec.foldi_left (add_uchar s) empty t) (Pvec.length t) `End 263 | 264 | (* Escaping and unescaping *) 265 | 266 | let bslash = Uchar.of_char '\\' 267 | let esc char = Pvec.of_list [bslash; Uchar.of_char char] 268 | let esc_bell = esc 'b' 269 | let esc_tab = esc 't' 270 | let esc_lf = esc 'n' 271 | let esc_cr = esc 'r' 272 | let esc_quote = esc '"' 273 | let esc_bslash = esc '\\' 274 | let esc_uchar u = _of_ascii (_strf "\\u{%04X}" (Uchar.to_int u)) 275 | let escaped t = 276 | let escape_uchar (te, first as acc) i u = 277 | let esc u = 278 | Pvec.(concat_list [te; range ~first ~last:(i - 1) t; u]), i + 1 279 | in 280 | match Uchar.to_int u with 281 | | 0x0008 (* '\b' *) -> esc esc_bell 282 | | 0x0009 (* '\t' *) -> esc esc_tab 283 | | 0x000A (* '\n' *) -> esc esc_lf 284 | | 0x000D (* '\r' *) -> esc esc_cr 285 | | 0x0022 (* '\"' *) -> esc esc_quote 286 | | 0x005C (* '\\' *) -> esc esc_bslash 287 | | _ when Uucp.Gc.general_category u = `Cc -> esc (esc_uchar u) 288 | | _ -> acc 289 | in 290 | let te, first = Pvec.foldi_left escape_uchar (empty, 0) t in 291 | Pvec.(te ++ drop_left first t) 292 | 293 | let is_hex_digit u = Uchar.is_char u && match Uchar.to_char u with 294 | | '0' .. '9' | 'A' .. 'F' | 'a' .. 'f' -> true 295 | | _ -> false 296 | 297 | let int_of_hex_num t = 298 | let add_digit n u = n * 16 + match Uchar.to_int u with 299 | | i when i <= 0x0039 -> i - 0x0030 300 | | i when i <= 0x0046 -> i - 0x0041 + 10 301 | | i when i <= 0x0066 -> i - 0x0061 + 10 302 | | _ -> assert false 303 | in 304 | Pvec.fold_left add_digit 0 t 305 | 306 | let unescaped t = 307 | let not_bslash u = not @@ Uchar.equal u bslash in 308 | let rec loop i acc t = 309 | let l, r = Pvec.span_left not_bslash t in 310 | if Pvec.is_empty r then Ok Pvec.(acc ++ l) else 311 | let i = i + Pvec.length l in 312 | let err () = Error i in 313 | let unesc u i skip r = 314 | loop (i + skip) Pvec.(add_last (acc ++ l) u) (Pvec.drop_left skip r) 315 | in 316 | match Pvec.el r 1 with 317 | | None -> err () 318 | | Some u -> 319 | match Uchar.to_int u with 320 | | 0x0062 (* 'b' *) -> unesc (Uchar.of_int 0x0008) i 2 r 321 | | 0x0074 (* 't' *) -> unesc (Uchar.of_int 0x0009) i 2 r 322 | | 0x006E (* 'n' *) -> unesc (Uchar.of_int 0x000A) i 2 r 323 | | 0x0072 (* 'r' *) -> unesc (Uchar.of_int 0x000D) i 2 r 324 | | 0x0022 (* '"' *) -> unesc (Uchar.of_int 0x0022) i 2 r 325 | | 0x005C (* '\\' *) -> unesc (Uchar.of_int 0x005C) i 2 r 326 | | 0x0075 (* 'u' *) -> 327 | begin match Pvec.el r 2 with 328 | | None -> err () 329 | | Some u when Uchar.to_int u <> 0x007B (* '{' *) -> err () 330 | | Some _ -> 331 | let hs, r = Pvec.(span_left is_hex_digit (drop_left 3 r)) in 332 | begin match Pvec.first_el r with 333 | | None -> err () 334 | | Some u -> 335 | if Uchar.to_int u <> 0x007D (* '}' *) then err () else 336 | if Pvec.(length hs = 0 || length hs > 6) then err () else 337 | let u = int_of_hex_num hs in 338 | if not (Uchar.is_valid u) then err () else 339 | unesc (Uchar.of_int u) (i + 4 + Pvec.length hs) 1 r 340 | end 341 | end 342 | | _ -> err () 343 | in 344 | loop 0 empty t 345 | 346 | (* Decoding and encoding *) 347 | 348 | let encoding_guess = Uutf.String.encoding_guess 349 | 350 | (* XXX implement codecs directly. *) 351 | 352 | let str text = 353 | let add_uchar acc pos = function 354 | | `Uchar u -> Pvec.add_last acc u 355 | | `Malformed bytes -> invalid_arg (_strf "%d: invalid bytes %S" pos bytes) 356 | in 357 | Uutf.String.fold_utf_8 add_uchar empty text 358 | 359 | let strf fmt = Format.kasprintf (fun s -> str s) fmt 360 | 361 | (* Best-effort decoding *) 362 | 363 | let of_utf_x fold ?(first = 0) ?last s = 364 | let add_uchar acc pos = function 365 | | `Uchar u -> Pvec.add_last acc u 366 | | `Malformed bytes -> Pvec.add_last acc Uutf.u_rep 367 | in 368 | let last = match last with None -> String.length s - 1 | Some l -> l in 369 | let len = last - first + 1 in 370 | fold ?pos:(Some first) ?len:(Some len) add_uchar empty s 371 | 372 | let of_utf_8 = of_utf_x Uutf.String.fold_utf_8 373 | let of_utf_16le = of_utf_x Uutf.String.fold_utf_16le 374 | let of_utf_16be = of_utf_x Uutf.String.fold_utf_16be 375 | 376 | (* Decoding with error handling *) 377 | 378 | type decode = (t, t * int * int option) result 379 | 380 | exception Dec_error of (t * int * int option) 381 | 382 | let try_of_utf_x fold ?(first = 0) ?last s = 383 | let last = match last with None -> String.length s - 1 | Some l -> l in 384 | let add_uchar acc idx = function 385 | | `Uchar u -> Pvec.add_last acc u 386 | | `Malformed bytes -> 387 | let restart = idx + String.length bytes in 388 | let restart = if restart > last then None else Some restart in 389 | raise (Dec_error (acc, idx, restart)) 390 | in 391 | let len = last - first + 1 in 392 | try Ok (fold ?pos:(Some first) ?len:(Some len) add_uchar empty s) with 393 | | Dec_error err -> Error err 394 | 395 | let try_of_utf_8 = try_of_utf_x Uutf.String.fold_utf_8 396 | let try_of_utf_16le = try_of_utf_x Uutf.String.fold_utf_16le 397 | let try_of_utf_16le = try_of_utf_x Uutf.String.fold_utf_16be 398 | 399 | (* Encoding *) 400 | 401 | let buffer_add_utf_8 b t = Pvec.iter_left (Uutf.Buffer.add_utf_8 b) t 402 | let buffer_add_utf_16le b t = Pvec.iter_left (Uutf.Buffer.add_utf_16le b) t 403 | let buffer_add_utf_16be b t = Pvec.iter_left (Uutf.Buffer.add_utf_16be b) t 404 | 405 | let to_utf_x buffer_add_utf_x t = 406 | let b = Buffer.create (Pvec.length t * 2) in 407 | buffer_add_utf_x b t; 408 | Buffer.contents b 409 | 410 | let to_utf_8 t = to_utf_x buffer_add_utf_8 t 411 | let to_utf_16le t = to_utf_x buffer_add_utf_16le t 412 | let to_utf_16be t = to_utf_x buffer_add_utf_16be t 413 | 414 | (* Pretty-printing 415 | 416 | XXX avoid the detour via string *) 417 | 418 | let pp ppf t = Uuseg_string.pp_utf_8 ppf (to_utf_8 t) 419 | let pp_text ppf t = Uuseg_string.pp_utf_8_text ppf (to_utf_8 t) 420 | let pp_lines ppf t = Uuseg_string.pp_utf_8_lines ppf (to_utf_8 t) 421 | let pp_uchars ppf t = 422 | let sep = Format.pp_print_space in 423 | let pp_uchar ppf u = Format.fprintf ppf "U+%04X" (Uchar.to_int u) in 424 | Pvec.pp ~sep pp_uchar ppf t 425 | 426 | let pp_toplevel ppf t = 427 | Format.fprintf ppf "(Utext.str \"%a\")" pp (escaped t) 428 | 429 | let pp_toplevel_pvec ppf ts = 430 | let pp_t ppf t = Format.fprintf ppf "Utext.str \"%a\"" pp (escaped t) in 431 | let sep ppf () = Format.fprintf ppf ";@ " in 432 | Format.fprintf ppf "@[<1>Pvec.of_list @[<1>[%a]@]@]" Pvec.(pp ~sep pp_t) ts 433 | 434 | (*--------------------------------------------------------------------------- 435 | Copyright (c) 2018 The utext programmers 436 | 437 | Permission to use, copy, modify, and/or distribute this software for any 438 | purpose with or without fee is hereby granted, provided that the above 439 | copyright notice and this permission notice appear in all copies. 440 | 441 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 442 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 443 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 444 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 445 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 446 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 447 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 448 | ---------------------------------------------------------------------------*) 449 | -------------------------------------------------------------------------------- /src/utext.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2018 The utext programmers. All rights reserved. 3 | Distributed under the ISC license, see terms at the end of the file. 4 | %%NAME%% %%VERSION%% 5 | ---------------------------------------------------------------------------*) 6 | 7 | (** Unicode text for OCaml. 8 | 9 | [Utext] provides a type for processing Unicode text. 10 | 11 | See also {!Uucp} and {!Pvec} and consult a 12 | {{!Uucp.uminimal}minimal unicode introduction}. 13 | TODO make a minimal Utext specific minimal intro. 14 | 15 | {e %%VERSION%% — Unicode version %%UNICODE_VERSION%% — 16 | {{:%%PKG_HOMEPAGE%% }homepage}} 17 | 18 | {3 References} 19 | {ul 20 | {- {{:http://www.unicode.org/faq/}The Unicode FAQ.}} 21 | {- The Unicode Consortium. 22 | {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}. 23 | (latest version)}} *) 24 | 25 | (** {1 Utext} *) 26 | 27 | val unicode_version : string 28 | (** [unicode_version] is the Unicode version supported by [Utext]. *) 29 | 30 | type t = Uchar.t Pvec.t 31 | (** The type for Unicode text, a persistent vector of Unicode 32 | characters. *) 33 | 34 | val empty : t 35 | (** [empty] is {!Pvec.empty}, the empty Unicode text. *) 36 | 37 | val v : len:int -> Uchar.t -> t 38 | (** [v ~len u] is {!Pvec.v}[ ~len u]. *) 39 | 40 | val init : len:int -> (int -> Uchar.t) -> t 41 | (** [init ~len f] is {!Pvec.init}[ ~len f]. *) 42 | 43 | val of_uchar : Uchar.t -> t 44 | (** [of_uchar u] is {!Pvec.singleton}[ u]. *) 45 | 46 | val str : string -> t 47 | (** [str s] is Unicode text from the {e valid} UTF-8 encoded bytes [s]. 48 | 49 | @raise Invalid_argument if [text] is invalid UTF-8, use 50 | {!of_utf_8} and {!try_of_utf_8} to deal with untrusted input. *) 51 | 52 | val strf : ('a, Format.formatter, unit, t) Pervasives.format4 -> 'a 53 | (** [strf fmt ...] is [Format.kasprintf (fun s -> str s) fmt ...)]. *) 54 | 55 | (** {1:preds Predicates and comparison} 56 | 57 | See also [Pvec]'s {{!Pvec.preds}predicates and comparisons}. *) 58 | 59 | val is_empty : t -> bool 60 | (** [is_empty t] is [true] if [t] is empty, this is equal to 61 | {!Pvec.is_empty.} *) 62 | 63 | val equal : t -> t -> bool 64 | (** [equal t0 t1] is [true] if the elements in each vector are 65 | equal. {b Warning.} The test is {e textually} meaningless unless 66 | [t0] and [t1] are known to be in a particular form, see e.g. 67 | {!canonical_caseless_key} or {{!nf}normal forms}. 68 | 69 | {b FIXME.} Should we provide a fool-proof equality that 70 | always compares in [`NFD] or [`NFC] ? Problem is that 71 | since we are using raw Pvec.t we cannot cache. *) 72 | 73 | val compare : t -> t -> int 74 | (** [compare t0 t1] is the per element lexicographical order between 75 | [t0] and [t1]. {b Warning.} The comparison is {e textually} 76 | meaningless. *) 77 | 78 | (** {1:cmap Case mapping and folding} 79 | 80 | For more information about case see the 81 | {{:http://unicode.org/faq/casemap_charprop.html#casemap}Unicode 82 | case mapping FAQ} and the 83 | {{:http://www.unicode.org/charts/case/}case mapping charts}. Note 84 | that these algorithms are insensitive to language and context and 85 | may produce sub-par results for some users. *) 86 | 87 | val lowercased : t -> t 88 | (** [lowercase t] is [t] lowercased according to Unicode's default case 89 | conversion. *) 90 | 91 | val uppercased : t -> t 92 | (** [uppercase t] is [t] uppercased according to Unicode's default case 93 | conversion. *) 94 | 95 | val capitalized : t -> t 96 | (** [capitalized t] is [t] capitalized: if the first character of [t] 97 | is {{!Uucp.Case.is_cased}cased} it is mapped to its 98 | {{!Uucp.Case.Map.to_title}title case mapping}; otherwise [t] is 99 | left unchanged. *) 100 | 101 | val uncapitalized : t -> t 102 | (** [uncapitalized t] is [t] uncapitalized: if the first character of 103 | [t] is {{!Uucp.Case.is_cased}cased} it is mapped to its 104 | {{!Uucp.Case.Map.to_lower}lowercase case mapping}; otherwise [t] 105 | is left unchanged. *) 106 | 107 | (** {2:caseless Case insensitive equality} 108 | 109 | Testing the equality of two Unicode texts in a case insensitive 110 | manner requires a fair amount of data massaging that includes 111 | {{!nf}normalization} and {{!casefold}case folding}. These results 112 | should be cached if many comparisons have to be made on the same 113 | text. The following functions return keys for a given text that 114 | can be used to test equality against other keys. {b Do not} test 115 | keys generated by different functions, the comparison would be 116 | meaningless. See also {!identifier_caseless_key}. *) 117 | 118 | val casefolded : t -> t 119 | (** [casefold t] is [t] casefolded according to Unicode's default 120 | casefold. This can be used to implement various forms of caseless 121 | equalities. [equal (casefolded t0) (casefolded t1)] determines 122 | default case equality 123 | ({{:http://www.unicode.org/versions/latest/ch03.pdf#G34145}TUS 124 | D144}) of [t0] and [t1]. {b Warning.} In general this notion is 125 | not good enough use one of the following functions. *) 126 | 127 | val canonical_caseless_key : t -> t 128 | (** [canonical_caseless_key t] is a key such that 129 | [equal (canonical_caseless_key t0) (canonical_caseless_key t1)] 130 | determines canonical caseless 131 | equality ({{:http://www.unicode.org/versions/latest/ch03.pdf#G34145}TUS 132 | D145}) of [t0] and [t1]. *) 133 | 134 | val compatibility_caseless_key : t -> t 135 | (** [compatability_caseless_key t] is a key such that 136 | [equal (compatibility_caseless_key t0) (compatibility_caseless_key t1)] 137 | determines compatibility caseless 138 | equality ({{:http://www.unicode.org/versions/latest/ch03.pdf#G34145}TUS 139 | D146}) of [t0] and [t1]. *) 140 | 141 | (** {1:ids Unicode identifiers} 142 | 143 | For more information see {{:http://unicode.org/reports/tr31/}UAX 31 144 | Unicode Identifier and Pattern Syntax}. *) 145 | 146 | val is_identifier : t -> bool 147 | (** [is_identifier t] is [true] iff [t] is a 148 | {{:http://unicode.org/reports/tr31/#Default_Identifier_Syntax}Default 149 | Unicode identifier}, more precisely this is 150 | {{:http://unicode.org/reports/tr31/#R1}UAX31-R1}. *) 151 | 152 | val identifier_caseless_key : t -> t 153 | (** [identifier_caseless_key t] is a key such that 154 | [equal (identifier_caseless_key t0) (identifier_caseless_key t1)] 155 | determines identifier caseless 156 | equality ({{:http://www.unicode.org/versions/latest/ch03.pdf##G34145}TUS 157 | D147}) of [t0] and [t1]. *) 158 | 159 | (** {1:line_cut Breaking lines and paragraphs} 160 | 161 | These functions break text like a simple [readline] function 162 | would. If you are looking for line breaks to layout text, see 163 | {{!seg}line break segmentation}. *) 164 | 165 | type newline = [ `ASCII | `NLF | `Readline ] 166 | (** The type for specifying newlines. 167 | {ul 168 | {- [`ASCII] newlines occur after a CR (U+000D), LF (U+000A) or 169 | CRLF ([]).} 170 | {- [`NLF] newlines occur after the 171 | {{:http://www.unicode.org/versions/Unicode10.0.0/ch05.pdf#G27861} 172 | {e Unicode newline function}}, this 173 | is [`ASCII] along with NEL (Ub+0085).} 174 | {- [`Readline] newlines are determined as for a 175 | {{:http://www.unicode.org/versions/Unicode10.0.0/ch05.pdf#G21160} 176 | {e Unicode readline function} (R4)}, 177 | this is [`NLF] along with FF (U+000C), LS (U+2028) or 178 | PS (U+2029).}} *) 179 | 180 | val lines : ?drop_empty:bool -> ?newline:newline -> t -> t Pvec.t 181 | (** [lines ~drop_empty ~newline t] breaks [t] into subtexts separated 182 | by newlines determined according to [newline] (defaults to 183 | [`Readline]). Separators are not part of the result and lost. If 184 | [drop_empty] is [true] (defaults to [false]) drops lines that are 185 | empty. *) 186 | 187 | val paragraphs : ?drop_empty:bool -> t -> t Pvec.t 188 | (** [paragraphs ~newline t] breaks [t] into subtexts separated either 189 | by two consecutive newlines (determined as {{!newline}[`NLF]} or 190 | LS (U+2028)) or a single PS (U+2029). Separators are not part of 191 | the result and lost. If [drop_empty] is [true] (defaults to 192 | [false]) drops paragraphs that are empty. *) 193 | 194 | (** {1:nf Normalization} 195 | 196 | For more information on normalization consult a short 197 | {{!Uucp.equivalence}introduction}, the 198 | {{:http://www.unicode.org/reports/tr15/}UAX #15 Unicode 199 | Normalization Forms} and 200 | {{:http://www.unicode.org/charts/normalization/} normalization 201 | charts}. *) 202 | 203 | type normalization = [`NFD | `NFC | `NFKD | `NFKC ] 204 | (** The type for normalization forms. 205 | {ul 206 | {- [`NFD] {{:http://www.unicode.org/glossary/#normalization_form_d} 207 | normalization form D}, canonical decomposition.} 208 | {- [`NFC] {{:http://www.unicode.org/glossary/#normalization_form_c} 209 | normalization form C}, canonical decomposition followed by 210 | canonical composition.} 211 | {- [`NFKD] {{:http://www.unicode.org/glossary/#normalization_form_kd} 212 | normalization form KD}, compatibility decomposition.} 213 | {- [`NFKC] {{:http://www.unicode.org/glossary/#normalization_form_kc} 214 | normalization form KC}, compatibility decomposition, 215 | followed by canonical composition.}} *) 216 | 217 | val normalized : normalization -> t -> t 218 | (** [normalized nf t] is [t] normalized to [nf]. *) 219 | 220 | val is_normalized : normalization -> t -> bool 221 | (** [is_normalized nf t] is [true] iff [t] is in normalization form [nf]. *) 222 | 223 | (** {1:seg Segmentation} 224 | 225 | For more information consult the 226 | {{:http://www.unicode.org/reports/tr29/}UAX #29 Unicode Text 227 | Segmentation}, the {{:http://www.unicode.org/reports/tr14/}UAX #14 228 | Unicode Line Breaking Algorithm} and the web based 229 | {{:http://unicode.org/cldr/utility/breaks.jsp}ICU break utility}. *) 230 | 231 | type boundary = [ `Grapheme_cluster | `Word | `Sentence | `Line_break ] 232 | (** The type for boundaries. 233 | {ul 234 | {- [`Grapheme_cluster] determines 235 | {{:http://www.unicode.org/glossary/#extended_grapheme_cluster} 236 | extended grapheme clusters} boundaries according to UAX 29 237 | (corresponds, for most scripts, to user-perceived characters).} 238 | {- [`Word] determines word boundaries according to UAX 29.} 239 | {- [`Sentence] determines sentence boundaries according to UAX 29.} 240 | {- [`Line_break] determines {{!mandatory}mandatory} line breaks and 241 | line break opportunities according to UAX 14.}} *) 242 | 243 | val segments : boundary -> t -> t Pvec.t 244 | (** [segments b t] is are the segments of text [t] delimited by two 245 | boundaries of type [b]. *) 246 | 247 | val segment_count : boundary -> t -> int 248 | (** [segment_count b t] is [Pvec.length (segments b t)]. *) 249 | 250 | (** {2:boundary_pos Boundary positions} *) 251 | 252 | type pos = int 253 | (** The type for positions. The positions of a vector [v] of length [l] 254 | range over \[[0];[l]\]. They are the slits before each element and after 255 | the last one. They are labelled from left to right by increasing number. 256 | The [i]th index is between positions [i] and [i+1]. 257 | {v 258 | positions 0 1 2 3 4 l-1 l 259 | +---+---+---+---+ +-----+ 260 | indices | 0 | 1 | 2 | 3 | ... | l-1 | 261 | +---+---+---+---+ +-----+ 262 | v} *) 263 | 264 | val boundaries : boundary -> t -> pos Pvec.t 265 | (** [boundaries b t] are the positions of boundaries [b] in 266 | [t]. *) 267 | 268 | val boundaries_mandatory : boundary -> t -> (pos * bool) Pvec.t 269 | (** [boundaries_mandatory] is like {!boundaries} but returns 270 | the mandatory status of a boundary if the kind of boundary 271 | sports that notion (or always [true] if not). *) 272 | 273 | (** {1:esc Escaping and unescaping} *) 274 | 275 | val escaped : t -> t 276 | (** [escaped t] is [t] except characters whose general category is 277 | [Control], U+0022 or U+005C which are escaped according to OCaml's 278 | lexical conventions for strings with: 279 | {ul 280 | {- Any U+0008 (['\b']) escaped to the sequence 281 | (["\\b"])} 282 | {- Any U+0009 (['\t']) escaped to the sequence 283 | (["\\t"])} 284 | {- Any U+000A (['\n']) escaped to the sequence 285 | ["\\n"]} 286 | {- Any U+000D (['\r']) escaped to the sequence 287 | (["\\r"])} 288 | {- Any U+0022 (['\"']) escaped to the sequence 289 | (["\\\""])} 290 | {- Any U+005C (['\\']) escaped to the sequence 291 | (["\\\\"])} 292 | {- Any other character is escaped by an {e hexadecimal} ["\u{H+}"] escape 293 | with [H] a capital hexadecimal number.}} 294 | 295 | {b Note.} As far as OCaml is concerned [\u{H+}] escapes are only 296 | supported from 4.06 on. *) 297 | 298 | val unescaped : t -> (t, int) result 299 | (** [unescaped s] unescapes what {!escaped} did and any other valid 300 | [\u{H+}] escape. The, at most six, hexadecimal digits [H] of Unicode 301 | hex escapes can be upper, lower, or mixed case. Any truncated or 302 | undefined by {!escaped} escape makes the function return 303 | an [Error idx] with [idx] the start index of the offending escape. 304 | 305 | The invariant [unescape (escape t) = Ok t] holds. *) 306 | 307 | (** {1:codec Decoding and encoding} *) 308 | 309 | val encoding_guess : string -> [ `UTF_8 | `UTF_16BE | `UTF_16LE ] * bool 310 | (** [encoding_guess s] is the encoding guessed for [s] coupled with 311 | [true] iff there's an initial 312 | {{:http://unicode.org/glossary/#byte_order_mark}BOM}. *) 313 | 314 | (** {2:dec Best-effort decoding} 315 | 316 | {b Warning.} The following are a best-effort decodes in which any UTF-X 317 | decoding error is replaced by at least one replacement character 318 | {!Uchar.u_rep}. *) 319 | 320 | val of_utf_8 : ?first:int -> ?last:int -> string -> t 321 | (** [of_utf_8 ~first ~last s] is the Unicode text that results of 322 | best-effort UTF-8 decoding the bytes of [s] that exist in the 323 | range \[[first];[last]\]. [first] defaults to [0] and [last] to 324 | [length s - 1]. *) 325 | 326 | val of_utf_16le : ?first:int -> ?last:int -> string -> t 327 | (** [of_utf_16le ~first ~last s] is like {!of_utf_8} but decodes 328 | UTF-16LE. *) 329 | 330 | val of_utf_16be : ?first:int -> ?last:int -> string -> t 331 | (** [of_utf_16be ~first ~last s] is like {!of_utf_8} but decodes 332 | UTF-16BE. *) 333 | 334 | (** {2:decerr Decoding with error handling} *) 335 | 336 | type decode = (t, t * int * int option) result 337 | (** The type for decode result. This is: 338 | {ul 339 | {- [Ok t] if no decoding error occured.} 340 | {- [Error (t, err, restart)] if a decoding error occured. [t] is 341 | the text decoded until the error, [err] the byte index where 342 | the decode error occured and [restart] a valid byte index where 343 | a new best-effort decode could be restarted (if any).}} *) 344 | 345 | val try_of_utf_8 : ?first:int -> ?last:int -> string -> decode 346 | (** [try_of_utf_8] is like {!of_utf_8} except in case of error 347 | [Error _] is returned as described in {!decode_result}. *) 348 | 349 | val try_of_utf_16le : ?first:int -> ?last:int -> string -> decode 350 | (** [try_of_utf_16be] is like {!try_of_utf_8} but decodes UTF-16BE. *) 351 | 352 | val try_of_utf_16le : ?first:int -> ?last:int -> string -> decode 353 | (** [try_of_utf_16be] is like {!try_of_utf_8} but decodes UTF-16BE. *) 354 | 355 | (** {2:enc Encoding} 356 | 357 | {b Warning.} All these functions raise [Invalid_argument] if the 358 | result cannot fit in the limits of {!Sys.max_string_length}. *) 359 | 360 | val to_utf_8 : t -> string 361 | (** [to_utf_8 t] is the UTF-8 encoding of [t]. *) 362 | 363 | val to_utf_16le : t -> string 364 | (** [to_utf_16le t] is the UTF-16LE encoding of [t]. *) 365 | 366 | val to_utf_16be : t -> string 367 | (** [to_utf_16be t] is the UTF-16BE encoding of [t]. *) 368 | 369 | val buffer_add_utf_8 : Buffer.t -> t -> unit 370 | (** [buffer_add_utf_8 b t] adds the UTF-8 encoding of [t] to [b]. *) 371 | 372 | val buffer_add_utf_16le : Buffer.t -> t -> unit 373 | (** [buffer_add_utf_16le b t] adds the UTF-16LE encoding of [t] to [b]. *) 374 | 375 | val buffer_add_utf_16be : Buffer.t -> t -> unit 376 | (** [buffer_add_utf_16be b t] adds the UTF-16BE encoding of [t] to [b]. *) 377 | 378 | (** {1:pretty Pretty-printing} *) 379 | 380 | val pp : Format.formatter -> t -> unit 381 | (** [pp ppf t] prints the UTF-8 encoding of [t] instructing the [ppf] 382 | to use a length of [1] for each grapheme cluster of [t]. *) 383 | 384 | val pp_text : Format.formatter -> t -> unit 385 | (** [pp_text ppf t] is like {!pp} except each line break is hinted 386 | to the formatter, see {!Uuseg_string.pp_utf_8_text} for details. *) 387 | 388 | val pp_lines : Format.formatter -> t -> unit 389 | (** [pp_lines ppf t] is like {!pp} except only {e mandatory} line breaks 390 | are hinted to the formatter, see {!Uuseg_string.pp_utf_8_lines} for 391 | details. *) 392 | 393 | val pp_uchars : Format.formatter -> t -> unit 394 | (** [dump_uchars ppf t] formats [t] as a sequence of OCaml {!Uchar.t} value 395 | using only US-ASCII encoded characters according to the Unicode 396 | {{:http://www.unicode.org/versions/Unicode10.0.0/appA.pdf#G7083} 397 | notational convention} for code points. *) 398 | 399 | val pp_toplevel : Format.formatter -> t -> unit 400 | (** [pp_toplevel ppf t] formats [t] using {!escaped} and {!pp} in a manner 401 | suitable for the toplevel to represent a [Utext.t] value. 402 | 403 | {b Warning.} Before OCaml 4.06 the result might not be cut and pastable 404 | as [\u{H+}] escapes are not supported. *) 405 | 406 | val pp_toplevel_pvec : Format.formatter -> t Pvec.t -> unit 407 | (** [pp_toplevel_pvec ppf ts] formats [ts] using {!pp_toplevel}. *) 408 | 409 | (*--------------------------------------------------------------------------- 410 | Copyright (c) 2018 The utext programmers 411 | 412 | Permission to use, copy, modify, and/or distribute this software for any 413 | purpose with or without fee is hereby granted, provided that the above 414 | copyright notice and this permission notice appear in all copies. 415 | 416 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 417 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 418 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 419 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 420 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 421 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 422 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 423 | ---------------------------------------------------------------------------*) 424 | -------------------------------------------------------------------------------- /src/utext.mllib: -------------------------------------------------------------------------------- 1 | Utext 2 | -------------------------------------------------------------------------------- /src/utext_top_init.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2018 The pvec programmers. All rights reserved. 3 | Distributed under the ISC license, see terms at the end of the file. 4 | %%NAME%% %%VERSION%% 5 | ---------------------------------------------------------------------------*) 6 | 7 | #install_printer Utext.pp_toplevel ;; 8 | #install_printer Utext.pp_toplevel_pvec ;; 9 | 10 | (*--------------------------------------------------------------------------- 11 | Copyright (c) 2018 The pvec programmers 12 | 13 | Permission to use, copy, modify, and/or distribute this software for any 14 | purpose with or without fee is hereby granted, provided that the above 15 | copyright notice and this permission notice appear in all copies. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 18 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 19 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 20 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 21 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 22 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 23 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 24 | ---------------------------------------------------------------------------*) 25 | -------------------------------------------------------------------------------- /test/test.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2018 The utext programmers. All rights reserved. 3 | Distributed under the ISC license, see terms at the end of the file. 4 | %%NAME%% %%VERSION%% 5 | ---------------------------------------------------------------------------*) 6 | 7 | let strf = Printf.sprintf 8 | let log f = Format.printf (f ^^ "@.") 9 | 10 | let u = Utext.str 11 | let eq = Utext.equal 12 | let assert_f f t0 t1 = assert (Utext.equal (f (u t0)) (u t1)) 13 | let assert_invalid_arg f x = try f x; assert false with Invalid_argument _ -> () 14 | 15 | let test_base () = 16 | log "Test Utext.{empty,is_empty,str,strf}"; 17 | assert (Pvec.length Utext.empty = 0); 18 | assert Utext.(is_empty empty); 19 | assert (Utext.(not @@ is_empty @@ str "bla")); 20 | assert_invalid_arg Utext.str "\xFF"; 21 | assert (eq (Utext.strf "%d" 1) (u "1")); 22 | () 23 | 24 | let test_case () = 25 | log "Test Utext.{lowercased,uppercased,capitalized,uncapitalized}"; 26 | assert_f Utext.lowercased "abc" "abc"; 27 | assert_f Utext.lowercased "aBc" "abc"; 28 | assert_f Utext.uppercased "abc" "ABC"; 29 | assert_f Utext.uppercased "aBc" "ABC"; 30 | assert_f Utext.capitalized "abc" "Abc"; 31 | assert_f Utext.capitalized "Abc" "Abc"; 32 | assert_f Utext.uncapitalized "abc" "abc"; 33 | assert_f Utext.uncapitalized "Abc" "abc"; 34 | assert_f Utext.uppercased "dz" "DZ"; 35 | assert_f Utext.capitalized "dz" "Dz"; 36 | () 37 | 38 | let test_case_insensitive () = 39 | log "Test case insensitive equalities"; 40 | let assert_eq key t0 t1 = assert (eq (key (u t0)) (key (u t1))) in 41 | let assert_neq key t0 t1 = assert (not @@ eq (key (u t0)) (key (u t1))) in 42 | assert_eq Utext.casefolded "Ὀδυσσεύς" "ὈΔΥΣΣΕΎΣ"; 43 | assert_neq Utext.casefolded "Å" (*U+00C5*) "Å" (**); 44 | assert_eq Utext.canonical_caseless_key "Å" "Å"; 45 | assert_eq Utext.compatibility_caseless_key "Å" "Å"; 46 | assert_neq Utext.casefolded "Ⅻ" (*U+216B*) "ⅹⅰⅰ" (**); 47 | assert_neq Utext.canonical_caseless_key "Ⅻ" "ⅹⅰⅰ"; 48 | assert_eq Utext.compatibility_caseless_key "Ⅻ" "ⅹⅰⅰ"; 49 | assert_neq Utext.casefolded "℅" (*U+2105*) "C/O"; 50 | assert_neq Utext.canonical_caseless_key "℅" (*U+2105*) "C/O"; 51 | assert_eq Utext.compatibility_caseless_key "℅" (*U+2105*) "C/O"; 52 | assert_neq Utext.casefolded "㎒" "MHZ"; 53 | assert_neq Utext.canonical_caseless_key "㎒" "MHZ"; 54 | assert_eq Utext.compatibility_caseless_key "㎒" "MHZ"; 55 | () 56 | 57 | let test_identifiers () = 58 | log "Test Utext.is_identifier"; 59 | assert (not @@ Utext.is_identifier (u "32x")); 60 | assert (not @@ Utext.is_identifier (u "\xCC\x81Ela")); 61 | assert (Utext.is_identifier (u "Éla")); 62 | assert (not @@ Utext.is_identifier (u "_Éla")); 63 | assert (not @@ Utext.is_identifier (u "\xF0\x9F\x90\xAB")); 64 | assert (not @@ Utext.is_identifier (u "a\xF0\x9F\x90\xAB")); 65 | () 66 | 67 | let test_lines () = 68 | log "Test Utext.lines"; 69 | let assert_ls ?drop_empty ?newline src segs = 70 | assert (Pvec.equal ~eq 71 | (Utext.lines ?drop_empty ?newline (u src)) 72 | (Pvec.of_list (List.map u segs))) 73 | in 74 | let drop_empty = true in 75 | assert_ls "" [""]; 76 | assert_ls ~drop_empty "" []; 77 | assert_ls "\n" ["";""]; 78 | assert_ls ~drop_empty "\n" []; 79 | assert_ls "fst\r\nsnd\ntrd" ["fst";"snd";"trd"]; 80 | assert_ls "fst\r\nsnd\ntrd\n\r" ["fst";"snd";"trd";"";""]; 81 | assert_ls ~drop_empty "fst\r\nsnd\ntrd\n\r" ["fst";"snd";"trd"]; 82 | assert_ls "fst\r\nsnd\ntrd\r\n" ["fst";"snd";"trd";""]; 83 | assert_ls ~drop_empty "fst\r\nsnd\ntrd\r\n" ["fst";"snd";"trd"]; 84 | assert_ls "\n\rabc" [""; ""; "abc"]; 85 | assert_ls ~drop_empty "\n\rabc" ["abc"]; 86 | assert_ls "\r\nabc" [""; "abc"]; 87 | assert_ls ~drop_empty "\n\rabc" ["abc"]; 88 | () 89 | 90 | let test_paragraphs () = 91 | log "Test Utext.paragraphs"; 92 | let assert_ps ?drop_empty src segs = 93 | assert (Pvec.equal ~eq 94 | (Utext.paragraphs ?drop_empty (u src)) 95 | (Pvec.of_list (List.map u segs))) 96 | in 97 | let drop_empty = true in 98 | assert_ps "" [""]; 99 | assert_ps ~drop_empty "" []; 100 | assert_ps "\n\n" ["";""]; 101 | assert_ps "\n" ["\n"]; 102 | assert_ps "\r\n" ["\r\n"]; 103 | assert_ps "\np1\n" ["\np1\n"]; 104 | assert_ps "\np1\n\xE2\x80\xA9bla" ["\np1\n"; "bla"]; 105 | assert_ps "\np1\n\xE2\x80\xA9p2" ["\np1\n"; "p2"]; 106 | assert_ps "\np1\r\n\xE2\x80\xA9p2" ["\np1\r\n"; "p2"]; 107 | assert_ps "p1\r\n\r\np2" ["p1"; "p2"]; 108 | assert_ps "p1\r\n\np2" ["p1"; "p2"]; 109 | assert_ps "p1\n\np2\n\np3\n\n" ["p1"; "p2"; "p3"; ""]; 110 | assert_ps ~drop_empty "p1\n\np2\n\np3\n\n" ["p1"; "p2"; "p3"]; 111 | () 112 | 113 | let test_normalization () = 114 | log "Test Utext.{normalized,is_normalized}"; 115 | (* These tests were taken from Uunf. *) 116 | let test src nf dst = 117 | let src = Pvec.of_list (List.map Uchar.of_int src) in 118 | let dst = Pvec.of_list (List.map Uchar.of_int dst) in 119 | assert (eq (Utext.normalized nf src) dst); 120 | assert (Utext.is_normalized nf dst); 121 | in 122 | test [0x1E69] `NFD [0x0073; 0x0323; 0x0307]; 123 | test [0x1E69] `NFC [0x1E69]; 124 | test [0x1E0B; 0x0323] `NFD [0x0064; 0x0323; 0x0307]; 125 | test [0x1E0B; 0x0323] `NFC [0x1E0D; 0x0307]; 126 | test [0xFB01] `NFD [0xFB01]; 127 | test [0xFB01] `NFC [0xFB01]; 128 | test [0xFB01] `NFKD [0x0066; 0x0069]; 129 | test [0xFB01] `NFKC [0x0066; 0x0069]; 130 | test [0x0032; 0x2075] `NFD [0x0032; 0x2075]; 131 | test [0x0032; 0x2075] `NFC [0x0032; 0x2075]; 132 | test [0x0032; 0x2075] `NFKD [0x0032; 0x0035]; 133 | test [0x0032; 0x2075] `NFKC [0x0032; 0x0035]; 134 | test [0x1E9B; 0x0323] `NFD [0x017F; 0x0323; 0x307]; 135 | test [0x1E9B; 0x0323] `NFC [0x1E9B; 0x0323; ]; 136 | test [0x1E9B; 0x0323] `NFKD [0x0073; 0x0323; 0x0307]; 137 | test [0x1E9B; 0x0323] `NFKC [0x1E69]; 138 | test [0x0041; 0x007A; 0x0335; 0x0327; 0x0324; 0x0301; 0x0041] `NFC 139 | [0x0041; 0x017A; 0x0335; 0x0327; 0x0324; 0x0041]; 140 | test [0x01C6; 0x032D] `NFKC [0x0064; 0x017E; 0x032D]; 141 | test [0xFF80; 0x1FD3; 0xFF9E; 0x1FD3;] `NFKC [0x30BF; 0x0390; 0x3099; 0x0390]; 142 | test [0xC100; 0x20D2; 0x11C1; 0x11C1] `NFC [0xC100; 0x20D2; 0x11C1; 0x11C1]; 143 | () 144 | 145 | let test_segmentation () = 146 | log "Text Utext.{segments,segment_count,boundaries,boundaries_mandatory}"; 147 | let assert_vec ~eq v els = assert (Pvec.equal ~eq v (Pvec.of_list els)) in 148 | let assert_segs v els = assert_vec ~eq v (List.map u els) in 149 | let abla = u "Åbla" in 150 | let heyho = u "hey ho let's go" in 151 | assert_segs (Utext.segments `Grapheme_cluster Utext.empty) []; 152 | assert (Utext.segment_count `Grapheme_cluster Utext.empty = 0); 153 | assert_segs (Utext.segments `Word @@ u "") []; 154 | assert (Pvec.length abla = 5); 155 | assert_segs (Utext.segments `Grapheme_cluster abla) ["Å"; "b"; "l"; "a"]; 156 | assert (Utext.segment_count `Grapheme_cluster abla = 4); 157 | assert_segs (Utext.segments `Word heyho) 158 | ["hey"; " "; "ho"; " "; "let's"; " "; "go"]; 159 | assert_segs (Utext.segments `Word heyho) 160 | ["hey"; " "; "ho"; " "; "let's"; " "; "go"]; 161 | assert_vec ~eq:(=) (Utext.boundaries `Grapheme_cluster abla) [0;2;3;4;5]; 162 | assert_vec ~eq:(=) (Utext.boundaries_mandatory `Line_break (u "hey ho\nno")) 163 | [4, false; 7, true; 9, true]; 164 | () 165 | 166 | let test_escapes () = 167 | log "Test Utext.{escape,unescape}"; 168 | let unescaped_ok u = match Utext.unescaped u with 169 | | Ok u -> u | Error _ -> assert false 170 | in 171 | let trip unesc esc = 172 | assert_f Utext.escaped unesc esc; 173 | assert_f unescaped_ok esc unesc; 174 | () 175 | in 176 | trip "\xC2\x98\n" "\\u{0098}\\n"; 177 | trip "\xC2\x98\n" "\\u{0098}\\n"; 178 | trip "a\x00\bla" "a\\u{0000}\\bla"; 179 | trip "\b\t\n\r\"\\" "\\b\\t\\n\\r\\\"\\\\"; 180 | assert_f unescaped_ok "\\u{1F42B}a" "\xF0\x9F\x90\xABa"; 181 | assert (Utext.unescaped (u "a\nbc\\") = Error 4); 182 | assert (Utext.unescaped (u "a\\nbc\\") = Error 5); 183 | assert (Utext.unescaped (u "a\\n\\u{0}bc\\u") = Error 11); 184 | assert (Utext.unescaped (u "a\\n\\u{0}bc\\u{") = Error 11); 185 | assert (Utext.unescaped (u "a\\n\\u{0}bc\\u{}") = Error 11); 186 | assert (Utext.unescaped (u "ab\\{}}") = Error 2); 187 | assert (Utext.unescaped (u "ab\\u{}}") = Error 2); 188 | () 189 | 190 | let test_codec () = 191 | log "Test codec"; 192 | let trip ?(first = 0) ?last s = 193 | let last = match last with None -> String.length s - 1 | Some l -> l in 194 | assert (Utext.(to_utf_8 @@ of_utf_8 ~first ~last s) = 195 | String.sub s first (last - first + 1)) 196 | in 197 | trip ~first:4 ~last:6 "hop hap mop"; 198 | trip "métallique"; 199 | trip "矢量"; 200 | () 201 | 202 | 203 | let test () = 204 | try 205 | Printexc.record_backtrace true; 206 | test_base (); 207 | test_case (); 208 | test_case_insensitive (); 209 | test_identifiers (); 210 | test_lines (); 211 | test_paragraphs (); 212 | test_normalization (); 213 | test_segmentation (); 214 | test_escapes (); 215 | test_codec (); 216 | with 217 | | e -> 218 | let bt = Printexc.get_raw_backtrace () in 219 | log "%s\n%s\n[FAIL] A test failed!" 220 | (Printexc.to_string e) (Printexc.raw_backtrace_to_string bt); 221 | exit 1 222 | 223 | let () = test () 224 | 225 | (*--------------------------------------------------------------------------- 226 | Copyright (c) 2018 The utext programmers 227 | 228 | Permission to use, copy, modify, and/or distribute this software for any 229 | purpose with or without fee is hereby granted, provided that the above 230 | copyright notice and this permission notice appear in all copies. 231 | 232 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 233 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 234 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 235 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 236 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 237 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 238 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 239 | ---------------------------------------------------------------------------*) 240 | --------------------------------------------------------------------------------