├── .gitignore
├── .merlin
├── .ocp-indent
├── CHANGES.md
├── LICENSE.md
├── README.md
├── _tags
├── doc
    ├── api.odocl
    └── dev.odocl
├── opam
├── pkg
    ├── META
    └── pkg.ml
├── src
    ├── utext.ml
    ├── utext.mli
    ├── utext.mllib
    └── utext_top_init.ml
└── test
    └── test.ml


/.gitignore:
--------------------------------------------------------------------------------
1 | _build
2 | tmp
3 | *~
4 | \.\#*
5 | \#*#
6 | *.install
7 | *.native
8 | *.byte


--------------------------------------------------------------------------------
/.merlin:
--------------------------------------------------------------------------------
1 | PKG bytes pvec uucp uunf uuseg uutf
2 | S src
3 | S test
4 | B _build/**
5 | 


--------------------------------------------------------------------------------
/.ocp-indent:
--------------------------------------------------------------------------------
1 | strict_with=always,match_clause=4,strict_else=never
2 | 


--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
1 | vX.Y.Z YYYY-MM-DD Location
2 | --------------------------
3 | 
4 | First release. 
5 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2018 The utext programmers
 2 | 
 3 | Permission to use, copy, modify, and/or distribute this software for any
 4 | purpose with or without fee is hereby granted, provided that the above
 5 | copyright notice and this permission notice appear in all copies.
 6 | 
 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | utext — Unicode text for OCaml
 2 | -------------------------------------------------------------------------------
 3 | %%VERSION%%
 4 | 
 5 | utext is an OCaml type for storing and processing Unicode text.
 6 | 
 7 | utext is distributed under the ISC license and depends on [pvec][pvec]
 8 | 
 9 | [pvec]: http://erratique.ch/software/pvec 
10 | 
11 | Homepage: http://erratique.ch/software/utext  
12 | 
13 | ## Installation
14 | 
15 | utext can be installed with `opam`:
16 | 
17 |     opam install utext
18 | 
19 | If you don't use `opam` consult the [`opam`](opam) file for build
20 | instructions.
21 | 
22 | ## Documentation
23 | 
24 | The documentation and API reference is generated from the source
25 | interfaces. It can be consulted [online][doc] or via `odig doc
26 | utext`.
27 | 
28 | [doc]: http://erratique.ch/software/utext/doc
29 | 
30 | ## Sample programs
31 | 
32 | If you installed utext with `opam` sample programs are located in
33 | the directory `opam var utext:doc`.
34 | 
35 | In the distribution sample programs and tests are located in the
36 | [`test`](test) directory. They can be built and run
37 | with:
38 | 
39 |     topkg build --tests true && topkg test 
40 | 


--------------------------------------------------------------------------------
/_tags:
--------------------------------------------------------------------------------
1 | true : bin_annot, safe_string, package(bytes pvec uucp uunf uuseg uuseg.string uutf)
2 | <src> : include
3 | <test> : include


--------------------------------------------------------------------------------
/doc/api.odocl:
--------------------------------------------------------------------------------
1 | Utext


--------------------------------------------------------------------------------
/doc/dev.odocl:
--------------------------------------------------------------------------------
1 | Utext


--------------------------------------------------------------------------------
/opam:
--------------------------------------------------------------------------------
 1 | opam-version: "1.2"
 2 | maintainer: "Daniel Bünzli <daniel.buenzl i@erratique.ch>"
 3 | authors: ["The utext programmers"]
 4 | homepage: "http://erratique.ch/software/utext"
 5 | doc: "http://erratique.ch/software/utext/doc"
 6 | license: "ISC"
 7 | dev-repo: "http://erratique.ch/repos/utext.git"
 8 | bug-reports: "https://github.com/dbuenzli/utext/issues"
 9 | tags: []
10 | available: [ ocaml-version >= "4.01.0"]
11 | depends:
12 | [
13 |   "ocamlfind" {build}
14 |   "ocamlbuild" {build}
15 |   "topkg" {build & >= "0.9.0"}
16 |   "pvec"
17 |   "uutf"
18 |   "uucp"
19 |   "uunf"
20 |   "uuseg"
21 | ]
22 | depopts: []
23 | build:
24 | [[
25 |   "ocaml" "pkg/pkg.ml" "build"
26 |           "--dev-pkg" "%{dev}%"
27 | ]]
28 | 


--------------------------------------------------------------------------------
/pkg/META:
--------------------------------------------------------------------------------
1 | description = "Unicode text datastructure for OCaml"
2 | version = "%%VERSION_NUM%%"
3 | requires = ""
4 | archive(byte) = "utext.cma"
5 | archive(native) = "utext.cmxa"
6 | plugin(byte) = "utext.cma"
7 | plugin(native) = "utext.cmxs"
8 | 


--------------------------------------------------------------------------------
/pkg/pkg.ml:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ocaml
 2 | #use "topfind"
 3 | #require "topkg"
 4 | open Topkg
 5 | 
 6 | let () =
 7 |   Pkg.describe "utext" @@ fun c ->
 8 |   Ok [ Pkg.mllib "src/utext.mllib";
 9 |        Pkg.lib "src/utext_top_init.ml";
10 |        Pkg.test "test/test"; ]
11 | 


--------------------------------------------------------------------------------
/src/utext.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2018 The utext programmers. All rights reserved.
  3 |    Distributed under the ISC license, see terms at the end of the file.
  4 |    %%NAME%% %%VERSION%%
  5 |   ---------------------------------------------------------------------------*)
  6 | 
  7 | let _strf = Printf.sprintf
  8 | let unicode_version = Uucp.unicode_version
  9 | 
 10 | type t = Uchar.t Pvec.t
 11 | 
 12 | let empty = Pvec.empty
 13 | let v = Pvec.v
 14 | let init = Pvec.init
 15 | let of_uchar = Pvec.singleton
 16 | let _of_ascii s = init ~len:(String.length s) (fun i -> Uchar.of_char s.[i])
 17 | 
 18 | (* Predicates and comparison. *)
 19 | 
 20 | let is_empty = Pvec.is_empty
 21 | let equal t0 t1 = Pvec.equal ~eq:Uchar.equal t0 t1
 22 | let compare t0 t1 = Pvec.compare ~cmp:Uchar.compare t0 t1
 23 | 
 24 | (* Normalization *)
 25 | 
 26 | type normalization = [ `NFC | `NFD | `NFKC | `NFKD ]
 27 | 
 28 | let normalized nf t =
 29 |   (* XXX would be interesting to do a direct reimplementation on pvec.
 30 |      and benchmark the diff (but on which corpus ?) also how would the
 31 |      case algorithms react. Also consider first checking using
 32 |      [is_normalized]. *)
 33 |   let rec add n acc v = match Uunf.add n v with
 34 |   | `Uchar u -> add n (Pvec.add_last acc u) `Await
 35 |   | `Await | `End -> acc
 36 |   in
 37 |   let add_uchar n acc u = add n acc (`Uchar u) in
 38 |   let n = Uunf.create nf in
 39 |   add n (Pvec.fold_left (add_uchar n) empty t) `End
 40 | 
 41 | let is_normalized nf t =
 42 |   (* FIXME this is ridiculous, do a direct reimplementation. *)
 43 |   equal t (normalized nf t)
 44 | 
 45 | (* Case mapping *)
 46 | 
 47 | let append_uchar_map map acc u = match map u with
 48 | | `Self -> Pvec.add_last acc u
 49 | | `Uchars l -> List.fold_left Pvec.add_last acc l
 50 | 
 51 | let lowercased t =
 52 |   Pvec.fold_left (append_uchar_map Uucp.Case.Map.to_lower) empty t
 53 | 
 54 | let uppercased t =
 55 |   Pvec.fold_left (append_uchar_map Uucp.Case.Map.to_upper) empty t
 56 | 
 57 | let _map_first_if_cased map t = match Pvec.first_el t with
 58 | | None -> t
 59 | | Some first ->
 60 |     match Uucp.Case.is_cased first with
 61 |     | false -> t
 62 |     | true ->
 63 |         match map first with
 64 |         | `Self -> t
 65 |         | `Uchars us ->
 66 |             let first_map = Pvec.of_list us in
 67 |             Pvec.(first_map ++ rem_first t)
 68 | 
 69 | let capitalized t = _map_first_if_cased Uucp.Case.Map.to_title t
 70 | let uncapitalized t = _map_first_if_cased Uucp.Case.Map.to_lower t
 71 | 
 72 | (* Case folding *)
 73 | 
 74 | let casefolded t =
 75 |   Pvec.fold_left (append_uchar_map Uucp.Case.Fold.fold) empty t
 76 | 
 77 | let uchar_app acc = function
 78 | | `Uchar u -> Pvec.add_last acc u
 79 | | `End -> acc
 80 | | `Await -> acc
 81 | 
 82 | let rec nf_app n app acc v = match Uunf.add n v with
 83 | | `Uchar u as v -> nf_app n app (app acc v) `Await
 84 | | `End -> app acc `End
 85 | | `Await -> acc
 86 | 
 87 | let map_app map app acc = function
 88 | | `Uchar u as v ->
 89 |     begin match map u with
 90 |     | `Self -> app acc v
 91 |     | `Uchars l -> List.fold_left (fun acc u -> app acc (`Uchar u)) acc l
 92 |     end
 93 | | `End -> app acc `End
 94 | | `Await -> acc
 95 | 
 96 | let fold_with_append append t =
 97 |   let add_uchar acc u = append acc (`Uchar u) in
 98 |   append (Pvec.fold_left add_uchar empty t) `End
 99 | 
100 | let canonical_caseless_key t =
101 |   (* XXX TUS mentions optims that could eschew the first NFD. *)
102 |   (* This is [normalized `NFD @@ casefolded @@ normalized `NFD t] *)
103 |   let append =
104 |     nf_app (Uunf.create `NFD) @@ map_app Uucp.Case.Fold.fold @@
105 |     nf_app (Uunf.create `NFD) @@ uchar_app
106 |   in
107 |   fold_with_append append t
108 | 
109 | let compatibility_caseless_key t =
110 |   (* This is [normalized `NFKD @@ casefold @@ normalized `NFKD @@ casefold @@
111 |               normalized NFD t] *)
112 |   let append =
113 |     nf_app (Uunf.create `NFD) @@ map_app Uucp.Case.Fold.fold @@
114 |     nf_app (Uunf.create `NFKD) @@ map_app Uucp.Case.Fold.fold @@
115 |     nf_app (Uunf.create `NFKD) uchar_app
116 |   in
117 |   fold_with_append append t
118 | 
119 | (* Identifiers *)
120 | 
121 | let is_identifier t = match Pvec.first_el t with
122 | | None -> false
123 | | Some u ->
124 |     (* N.B. is_xid_continue => is_xid_start *)
125 |     Uucp.Id.is_xid_start u && Pvec.for_all Uucp.Id.is_xid_continue t
126 | 
127 | let identifier_caseless_key t =
128 |   (* This is [normalize `NFC @@ nfkc_casefold @@ normalize NFD t] *)
129 |   let append =
130 |     nf_app (Uunf.create `NFD) @@ map_app Uucp.Case.Nfkc_fold.fold @@
131 |     nf_app (Uunf.create `NFC) @@ uchar_app
132 |   in
133 |   fold_with_append append t
134 | 
135 | (* Breaking lines and paragraphs *)
136 | 
137 | type newline = [ `ASCII | `NLF | `Readline ]
138 | 
139 | let lines ?(drop_empty = false)  ?(newline = `Readline) t =
140 |   (* XXX range vs spans *)
141 |   let add_line ls first last =
142 |     let line = Pvec.range ~first ~last t in
143 |     if drop_empty && Pvec.is_empty line then ls else Pvec.(add_last ls line)
144 |   in
145 |   let rec lines (ls, first as acc) i u = match Uchar.to_int u with
146 |   | 0x000D (* CR *) ->
147 |       let first' = match Pvec.el t (i + 1) with
148 |       | Some u when Uchar.to_int u = 0x000A (* LF *) -> i + 2
149 |       | None | Some _ -> i + 1
150 |       in
151 |       add_line ls first (i - 1), first'
152 |   | 0x000A (* LF *) ->
153 |       if first > i (* last i was CR *) then acc else
154 |       add_line ls first (i - 1), i + 1
155 |   | 0x0085 (* NEL *) when newline <> `ASCII ->
156 |       add_line ls first (i - 1), i + 1
157 |   | (0x000C (* FF *) | 0x2028 (* LS *)
158 |     | 0x2029 (* PS *)) when newline = `Readline ->
159 |       add_line ls first (i - 1), i + 1
160 |   | _ -> acc
161 |   in
162 |   let ls, first = Pvec.foldi_left lines (empty, 0) t in
163 |   add_line ls first (Pvec.length t - 1)
164 | 
165 | let paragraphs ?(drop_empty = false) t =
166 |   let fst_half_break u = match Uchar.to_int u with
167 |   | 0x000D | 0x000A | 0x0085 | 0x2028 | 0x2029 -> true | _ -> false
168 |   in
169 |   let snd_half_break bstart k = match Pvec.el t k with
170 |   | None -> None
171 |   | Some u ->
172 |       begin match Uchar.to_int u with
173 |       | 0x2029 (* PS *) -> Some (k, k)
174 |       | 0x000A (* LF *) | 0x0085 (* NEL *) | 0x2028 (* LS *) -> Some (bstart, k)
175 |       | 0x000D (* CR *) ->
176 |           begin match Pvec.el t (k + 1) with
177 |           | None -> Some (bstart, k)
178 |           | Some u ->
179 |               if Uchar.to_int u = 0x000A (* LF *) then Some (bstart, k + 1) else
180 |               Some (bstart, k)
181 |           end
182 |       | _ -> None
183 |       end
184 |   in
185 |   let rec find_break start = match Pvec.left_find ~start fst_half_break t with
186 |   | None -> None
187 |   | Some (i, u) ->
188 |       let break = match Uchar.to_int u with
189 |       | 0x2029 (* PS *) -> Some (i, i)
190 |       | 0x000A (* LF *)
191 |       | 0x0085 (* NEL *) | 0x2028 (* LS *) -> snd_half_break i (i + 1)
192 |       | 0x000D (* CR *) ->
193 |           begin match Pvec.el t (i + 1) with
194 |           | None -> None
195 |           | Some u ->
196 |             if Uchar.to_int u = 0x000A (* LF *) then snd_half_break i (i + 2)
197 |             else snd_half_break i (i + 1)
198 |           end
199 |       | _ -> assert false
200 |       in
201 |       match break with None -> find_break (i + 2) | Some _ as b -> b
202 |   in
203 |   let add_para ps first last =
204 |     let para = Pvec.range ~first ~last t in
205 |     if drop_empty && Pvec.is_empty para then ps else Pvec.(add_last ps para)
206 |   in
207 |   let rec loop ps first start = match find_break start with
208 |   | None -> add_para ps first (Pvec.length t - 1)
209 |   | Some (bfirst, blast) ->
210 |       loop (add_para ps first (bfirst - 1)) (blast + 1) (blast + 1)
211 |   in
212 |   loop empty 0 0
213 | 
214 | (* Segmentation *)
215 | 
216 | type boundary = [ `Grapheme_cluster | `Line_break | `Sentence | `Word ]
217 | 
218 | let segments b t =
219 |   let rec add s (segs, seg as acc) v = match Uuseg.add s v with
220 |   | `Uchar u -> add s (segs, (Pvec.add_last seg u)) `Await
221 |   | `Boundary ->
222 |       let segs = if is_empty seg then segs else Pvec.add_last segs seg in
223 |       add s (segs, empty) `Await
224 |   | `Await | `End -> acc
225 |   in
226 |   let add_uchar s acc u = add s acc (`Uchar u) in
227 |   let s = Uuseg.create b in
228 |   fst (add s (Pvec.fold_left (add_uchar s) (empty, empty) t) `End)
229 | 
230 | let segment_count b t =
231 |   let rec add s count v = match Uuseg.add s v with
232 |   | `Uchar _ -> add s count `Await
233 |   | `Boundary -> add s (count + 1) `Await
234 |   | `Await | `End -> count
235 |   in
236 |   let add_uchar s count u = add s count (`Uchar u) in
237 |   let s = Uuseg.create b in
238 |   match add s (Pvec.fold_left (add_uchar s) 0 t) `End with
239 |   | 0 -> 0
240 |   | n -> n - 1
241 | 
242 | type pos = int
243 | 
244 | let boundaries b t =
245 |   let rec add s is i v = match Uuseg.add s v with
246 |   | `Uchar _ -> add s is i `Await
247 |   | `Boundary -> add s (Pvec.add_last is i) i `Await
248 |   | `Await | `End -> is
249 |   in
250 |   let add_uchar s is i u = add s is i (`Uchar u) in
251 |   let s = Uuseg.create b in
252 |   add s (Pvec.foldi_left (add_uchar s) empty t) (Pvec.length t) `End
253 | 
254 | let boundaries_mandatory b t =
255 |   let rec add s is i v = match Uuseg.add s v with
256 |   | `Uchar _ -> add s is i `Await
257 |   | `Boundary -> add s (Pvec.add_last is (i, Uuseg.mandatory s)) i `Await
258 |   | `Await | `End -> is
259 |   in
260 |   let add_uchar s is i u = add s is i (`Uchar u) in
261 |   let s = Uuseg.create b in
262 |   add s (Pvec.foldi_left (add_uchar s) empty t) (Pvec.length t) `End
263 | 
264 | (* Escaping and unescaping *)
265 | 
266 | let bslash = Uchar.of_char '\\'
267 | let esc char = Pvec.of_list [bslash; Uchar.of_char char]
268 | let esc_bell = esc 'b'
269 | let esc_tab = esc 't'
270 | let esc_lf = esc 'n'
271 | let esc_cr = esc 'r'
272 | let esc_quote = esc '"'
273 | let esc_bslash = esc '\\'
274 | let esc_uchar u = _of_ascii (_strf "\\u{%04X}" (Uchar.to_int u))
275 | let escaped t =
276 |   let escape_uchar (te, first as acc) i u =
277 |     let esc u =
278 |       Pvec.(concat_list [te; range ~first ~last:(i - 1) t; u]), i + 1
279 |     in
280 |     match Uchar.to_int u with
281 |     | 0x0008 (* '\b' *) -> esc esc_bell
282 |     | 0x0009 (* '\t' *) -> esc esc_tab
283 |     | 0x000A (* '\n' *) -> esc esc_lf
284 |     | 0x000D (* '\r' *) -> esc esc_cr
285 |     | 0x0022 (* '\"' *) -> esc esc_quote
286 |     | 0x005C (* '\\' *) -> esc esc_bslash
287 |     | _ when Uucp.Gc.general_category u = `Cc -> esc (esc_uchar u)
288 |     | _ -> acc
289 |   in
290 |   let te, first = Pvec.foldi_left escape_uchar (empty, 0) t in
291 |   Pvec.(te ++ drop_left first t)
292 | 
293 | let is_hex_digit u = Uchar.is_char u && match Uchar.to_char u with
294 | | '0' .. '9' | 'A' .. 'F' | 'a' .. 'f' -> true
295 | | _ -> false
296 | 
297 | let int_of_hex_num t =
298 |   let add_digit n u = n * 16 + match Uchar.to_int u with
299 |   | i when i <= 0x0039 -> i - 0x0030
300 |   | i when i <= 0x0046 -> i - 0x0041 + 10
301 |   | i when i <= 0x0066 -> i - 0x0061 + 10
302 |   | _ -> assert false
303 |   in
304 |   Pvec.fold_left add_digit 0 t
305 | 
306 | let unescaped t =
307 |   let not_bslash u = not @@ Uchar.equal u bslash in
308 |   let rec loop i acc t =
309 |     let l, r = Pvec.span_left not_bslash t in
310 |     if Pvec.is_empty r then Ok Pvec.(acc ++ l) else
311 |     let i = i + Pvec.length l in
312 |     let err () = Error i in
313 |     let unesc u i skip r =
314 |       loop (i + skip) Pvec.(add_last (acc ++ l) u) (Pvec.drop_left skip r)
315 |     in
316 |     match Pvec.el r 1 with
317 |     | None -> err ()
318 |     | Some u ->
319 |         match Uchar.to_int u with
320 |         | 0x0062 (* 'b' *) -> unesc (Uchar.of_int 0x0008) i 2 r
321 |         | 0x0074 (* 't' *) -> unesc (Uchar.of_int 0x0009) i 2 r
322 |         | 0x006E (* 'n' *) -> unesc (Uchar.of_int 0x000A) i 2 r
323 |         | 0x0072 (* 'r' *) -> unesc (Uchar.of_int 0x000D) i 2 r
324 |         | 0x0022 (* '"' *) -> unesc (Uchar.of_int 0x0022) i 2 r
325 |         | 0x005C (* '\\' *) -> unesc (Uchar.of_int 0x005C) i 2 r
326 |         | 0x0075 (* 'u' *) ->
327 |             begin match Pvec.el r 2 with
328 |             | None -> err ()
329 |             | Some u when Uchar.to_int u <> 0x007B (* '{' *) -> err ()
330 |             | Some _ ->
331 |                 let hs, r = Pvec.(span_left is_hex_digit (drop_left 3 r)) in
332 |                 begin match Pvec.first_el r with
333 |                 | None -> err ()
334 |                 | Some u ->
335 |                     if Uchar.to_int u <>  0x007D (* '}' *) then err () else
336 |                     if Pvec.(length hs = 0 || length hs > 6) then err () else
337 |                     let u = int_of_hex_num hs in
338 |                     if not (Uchar.is_valid u) then err () else
339 |                     unesc (Uchar.of_int u) (i + 4 + Pvec.length hs) 1 r
340 |                 end
341 |             end
342 |         | _ -> err ()
343 |   in
344 |   loop 0 empty t
345 | 
346 | (* Decoding and encoding *)
347 | 
348 | let encoding_guess = Uutf.String.encoding_guess
349 | 
350 | (* XXX implement codecs directly. *)
351 | 
352 | let str text =
353 |   let add_uchar acc pos = function
354 |   | `Uchar u -> Pvec.add_last acc u
355 |   | `Malformed bytes -> invalid_arg (_strf "%d: invalid bytes %S" pos bytes)
356 |   in
357 |   Uutf.String.fold_utf_8 add_uchar empty text
358 | 
359 | let strf fmt = Format.kasprintf (fun s -> str s) fmt
360 | 
361 | (* Best-effort decoding *)
362 | 
363 | let of_utf_x fold ?(first = 0) ?last s =
364 |   let add_uchar acc pos = function
365 |   | `Uchar u -> Pvec.add_last acc u
366 |   | `Malformed bytes -> Pvec.add_last acc Uutf.u_rep
367 |   in
368 |   let last = match last with None -> String.length s - 1 | Some l -> l in
369 |   let len = last - first + 1 in
370 |   fold ?pos:(Some first) ?len:(Some len) add_uchar empty s
371 | 
372 | let of_utf_8 = of_utf_x Uutf.String.fold_utf_8
373 | let of_utf_16le = of_utf_x Uutf.String.fold_utf_16le
374 | let of_utf_16be = of_utf_x Uutf.String.fold_utf_16be
375 | 
376 | (* Decoding with error handling *)
377 | 
378 | type decode = (t, t * int * int option) result
379 | 
380 | exception Dec_error of (t * int * int option)
381 | 
382 | let try_of_utf_x fold ?(first = 0) ?last s =
383 |   let last = match last with None -> String.length s - 1 | Some l -> l in
384 |   let add_uchar acc idx = function
385 |   | `Uchar u -> Pvec.add_last acc u
386 |   | `Malformed bytes ->
387 |       let restart = idx + String.length bytes in
388 |       let restart = if restart > last then None else Some restart in
389 |       raise (Dec_error (acc, idx, restart))
390 |   in
391 |   let len = last - first + 1 in
392 |   try Ok (fold ?pos:(Some first) ?len:(Some len) add_uchar empty s) with
393 |   | Dec_error err -> Error err
394 | 
395 | let try_of_utf_8 = try_of_utf_x Uutf.String.fold_utf_8
396 | let try_of_utf_16le = try_of_utf_x Uutf.String.fold_utf_16le
397 | let try_of_utf_16le = try_of_utf_x Uutf.String.fold_utf_16be
398 | 
399 | (* Encoding *)
400 | 
401 | let buffer_add_utf_8 b t = Pvec.iter_left (Uutf.Buffer.add_utf_8 b) t
402 | let buffer_add_utf_16le b t = Pvec.iter_left (Uutf.Buffer.add_utf_16le b) t
403 | let buffer_add_utf_16be b t = Pvec.iter_left (Uutf.Buffer.add_utf_16be b) t
404 | 
405 | let to_utf_x buffer_add_utf_x t =
406 |   let b = Buffer.create (Pvec.length t * 2) in
407 |   buffer_add_utf_x b t;
408 |   Buffer.contents b
409 | 
410 | let to_utf_8 t = to_utf_x buffer_add_utf_8 t
411 | let to_utf_16le t = to_utf_x buffer_add_utf_16le t
412 | let to_utf_16be t = to_utf_x buffer_add_utf_16be t
413 | 
414 | (* Pretty-printing
415 | 
416 |    XXX avoid the detour via string *)
417 | 
418 | let pp ppf t = Uuseg_string.pp_utf_8 ppf (to_utf_8 t)
419 | let pp_text ppf t = Uuseg_string.pp_utf_8_text ppf (to_utf_8 t)
420 | let pp_lines ppf t = Uuseg_string.pp_utf_8_lines ppf (to_utf_8 t)
421 | let pp_uchars ppf t =
422 |   let sep = Format.pp_print_space in
423 |   let pp_uchar ppf u = Format.fprintf ppf "U+%04X" (Uchar.to_int u) in
424 |   Pvec.pp ~sep pp_uchar ppf t
425 | 
426 | let pp_toplevel ppf t =
427 |   Format.fprintf ppf "(Utext.str \"%a\")" pp (escaped t)
428 | 
429 | let pp_toplevel_pvec ppf ts =
430 |   let pp_t ppf t = Format.fprintf ppf "Utext.str \"%a\"" pp (escaped t) in
431 |   let sep ppf () = Format.fprintf ppf ";@ " in
432 |   Format.fprintf ppf "@[<1>Pvec.of_list @[<1>[%a]@]@]" Pvec.(pp ~sep pp_t) ts
433 | 
434 | (*---------------------------------------------------------------------------
435 |    Copyright (c) 2018 The utext programmers
436 | 
437 |    Permission to use, copy, modify, and/or distribute this software for any
438 |    purpose with or without fee is hereby granted, provided that the above
439 |    copyright notice and this permission notice appear in all copies.
440 | 
441 |    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
442 |    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
443 |    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
444 |    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
445 |    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
446 |    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
447 |    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
448 |   ---------------------------------------------------------------------------*)
449 | 


--------------------------------------------------------------------------------
/src/utext.mli:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2018 The utext programmers. All rights reserved.
  3 |    Distributed under the ISC license, see terms at the end of the file.
  4 |    %%NAME%% %%VERSION%%
  5 |   ---------------------------------------------------------------------------*)
  6 | 
  7 | (** Unicode text for OCaml.
  8 | 
  9 |     [Utext] provides a type for processing Unicode text.
 10 | 
 11 |     See also {!Uucp} and {!Pvec} and consult a
 12 |     {{!Uucp.uminimal}minimal unicode introduction}.
 13 |     TODO make a minimal Utext specific minimal intro.
 14 | 
 15 |     {e %%VERSION%% — Unicode version %%UNICODE_VERSION%% —
 16 |        {{:%%PKG_HOMEPAGE%% }homepage}}
 17 | 
 18 |     {3 References}
 19 |     {ul
 20 |     {- {{:http://www.unicode.org/faq/}The Unicode FAQ.}}
 21 |     {- The Unicode Consortium.
 22 |     {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}.
 23 |     (latest version)}} *)
 24 | 
 25 | (** {1 Utext} *)
 26 | 
 27 | val unicode_version : string
 28 | (** [unicode_version] is the Unicode version supported by [Utext]. *)
 29 | 
 30 | type t = Uchar.t Pvec.t
 31 | (** The type for Unicode text, a persistent vector of Unicode
 32 |     characters. *)
 33 | 
 34 | val empty : t
 35 | (** [empty] is {!Pvec.empty}, the empty Unicode text. *)
 36 | 
 37 | val v : len:int -> Uchar.t -> t
 38 | (** [v ~len u] is {!Pvec.v}[ ~len u]. *)
 39 | 
 40 | val init : len:int -> (int -> Uchar.t) -> t
 41 | (** [init ~len f] is {!Pvec.init}[ ~len f]. *)
 42 | 
 43 | val of_uchar : Uchar.t -> t
 44 | (** [of_uchar u] is {!Pvec.singleton}[ u]. *)
 45 | 
 46 | val str : string -> t
 47 | (** [str s] is Unicode text from the {e valid} UTF-8 encoded bytes [s].
 48 | 
 49 |     @raise Invalid_argument if [text] is invalid UTF-8, use
 50 |     {!of_utf_8} and {!try_of_utf_8} to deal with untrusted input. *)
 51 | 
 52 | val strf : ('a, Format.formatter, unit, t) Pervasives.format4 -> 'a
 53 | (** [strf fmt ...] is [Format.kasprintf (fun s -> str s) fmt ...)]. *)
 54 | 
 55 | (** {1:preds Predicates and comparison}
 56 | 
 57 |     See also [Pvec]'s {{!Pvec.preds}predicates and comparisons}. *)
 58 | 
 59 | val is_empty : t -> bool
 60 | (** [is_empty t] is [true] if [t] is empty, this is equal to
 61 |     {!Pvec.is_empty.} *)
 62 | 
 63 | val equal : t -> t -> bool
 64 | (** [equal t0 t1] is [true] if the elements in each vector are
 65 |     equal. {b Warning.} The test is {e textually} meaningless unless
 66 |     [t0] and [t1] are known to be in a particular form, see e.g.
 67 |     {!canonical_caseless_key} or {{!nf}normal forms}.
 68 | 
 69 |     {b FIXME.} Should we provide a fool-proof equality that
 70 |     always compares in [`NFD] or [`NFC] ? Problem is that
 71 |     since we are using raw Pvec.t we cannot cache. *)
 72 | 
 73 | val compare : t -> t -> int
 74 | (** [compare t0 t1] is the per element lexicographical order between
 75 |     [t0] and [t1]. {b Warning.} The comparison is {e textually}
 76 |     meaningless. *)
 77 | 
 78 | (** {1:cmap Case mapping and folding}
 79 | 
 80 |     For more information about case see the
 81 |     {{:http://unicode.org/faq/casemap_charprop.html#casemap}Unicode
 82 |     case mapping FAQ} and the
 83 |     {{:http://www.unicode.org/charts/case/}case mapping charts}. Note
 84 |     that these algorithms are insensitive to language and context and
 85 |     may produce sub-par results for some users. *)
 86 | 
 87 | val lowercased : t -> t
 88 | (** [lowercase t] is [t] lowercased according to Unicode's default case
 89 |     conversion. *)
 90 | 
 91 | val uppercased : t -> t
 92 | (** [uppercase t] is [t] uppercased according to Unicode's default case
 93 |     conversion. *)
 94 | 
 95 | val capitalized : t -> t
 96 | (** [capitalized t] is [t] capitalized: if the first character of [t]
 97 |     is {{!Uucp.Case.is_cased}cased} it is mapped to its
 98 |     {{!Uucp.Case.Map.to_title}title case mapping}; otherwise [t] is
 99 |     left unchanged. *)
100 | 
101 | val uncapitalized : t -> t
102 | (** [uncapitalized t] is [t] uncapitalized: if the first character of
103 |     [t] is {{!Uucp.Case.is_cased}cased} it is mapped to its
104 |     {{!Uucp.Case.Map.to_lower}lowercase case mapping}; otherwise [t]
105 |     is left unchanged. *)
106 | 
107 | (** {2:caseless Case insensitive equality}
108 | 
109 |     Testing the equality of two Unicode texts in a case insensitive
110 |     manner requires a fair amount of data massaging that includes
111 |     {{!nf}normalization} and {{!casefold}case folding}. These results
112 |     should be cached if many comparisons have to be made on the same
113 |     text. The following functions return keys for a given text that
114 |     can be used to test equality against other keys. {b Do not} test
115 |     keys generated by different functions, the comparison would be
116 |     meaningless. See also {!identifier_caseless_key}. *)
117 | 
118 | val casefolded : t -> t
119 | (** [casefold t] is [t] casefolded according to Unicode's default
120 |     casefold. This can be used to implement various forms of caseless
121 |     equalities. [equal (casefolded t0) (casefolded t1)] determines
122 |     default case equality
123 |     ({{:http://www.unicode.org/versions/latest/ch03.pdf#G34145}TUS
124 |     D144}) of [t0] and [t1]. {b Warning.} In general this notion is
125 |     not good enough use one of the following functions. *)
126 | 
127 | val canonical_caseless_key : t -> t
128 | (** [canonical_caseless_key t] is a key such that
129 |     [equal (canonical_caseless_key t0) (canonical_caseless_key t1)]
130 |     determines canonical caseless
131 |     equality ({{:http://www.unicode.org/versions/latest/ch03.pdf#G34145}TUS
132 |     D145}) of [t0] and [t1]. *)
133 | 
134 | val compatibility_caseless_key : t -> t
135 | (** [compatability_caseless_key t] is a key such that
136 |     [equal (compatibility_caseless_key t0) (compatibility_caseless_key t1)]
137 |     determines compatibility caseless
138 |     equality ({{:http://www.unicode.org/versions/latest/ch03.pdf#G34145}TUS
139 |     D146}) of [t0] and [t1]. *)
140 | 
141 | (** {1:ids Unicode identifiers}
142 | 
143 |     For more information see {{:http://unicode.org/reports/tr31/}UAX 31
144 |     Unicode Identifier and Pattern Syntax}. *)
145 | 
146 | val is_identifier : t -> bool
147 | (** [is_identifier t] is [true] iff [t] is a
148 |     {{:http://unicode.org/reports/tr31/#Default_Identifier_Syntax}Default
149 |     Unicode identifier}, more precisely this is
150 |     {{:http://unicode.org/reports/tr31/#R1}UAX31-R1}. *)
151 | 
152 | val identifier_caseless_key : t -> t
153 | (** [identifier_caseless_key t] is a key such that
154 |     [equal (identifier_caseless_key t0) (identifier_caseless_key t1)]
155 |     determines identifier caseless
156 |     equality ({{:http://www.unicode.org/versions/latest/ch03.pdf##G34145}TUS
157 |     D147}) of [t0] and [t1]. *)
158 | 
159 | (** {1:line_cut Breaking lines and paragraphs}
160 | 
161 |     These functions break text like a simple [readline] function
162 |     would. If you are looking for line breaks to layout text, see
163 |     {{!seg}line break segmentation}. *)
164 | 
165 | type newline = [ `ASCII | `NLF | `Readline ]
166 | (** The type for specifying newlines.
167 |     {ul
168 |     {- [`ASCII] newlines occur after a CR (U+000D), LF (U+000A) or
169 |        CRLF ([<U+000D, U+000A>]).}
170 |     {- [`NLF] newlines occur after the
171 |        {{:http://www.unicode.org/versions/Unicode10.0.0/ch05.pdf#G27861}
172 |        {e Unicode newline function}}, this
173 |        is [`ASCII] along with NEL (Ub+0085).}
174 |     {- [`Readline] newlines are determined as for a
175 |        {{:http://www.unicode.org/versions/Unicode10.0.0/ch05.pdf#G21160}
176 |        {e Unicode readline function} (R4)},
177 |        this is [`NLF] along with FF (U+000C), LS (U+2028) or
178 |        PS (U+2029).}} *)
179 | 
180 | val lines : ?drop_empty:bool -> ?newline:newline -> t -> t Pvec.t
181 | (** [lines ~drop_empty ~newline t] breaks [t] into subtexts separated
182 |     by newlines determined according to [newline] (defaults to
183 |     [`Readline]). Separators are not part of the result and lost. If
184 |     [drop_empty] is [true] (defaults to [false]) drops lines that are
185 |     empty. *)
186 | 
187 | val paragraphs : ?drop_empty:bool -> t -> t Pvec.t
188 | (** [paragraphs ~newline t] breaks [t] into subtexts separated either
189 |     by two consecutive newlines (determined as {{!newline}[`NLF]} or
190 |     LS (U+2028)) or a single PS (U+2029). Separators are not part of
191 |     the result and lost. If [drop_empty] is [true] (defaults to
192 |     [false]) drops paragraphs that are empty. *)
193 | 
194 | (** {1:nf Normalization}
195 | 
196 |     For more information on normalization consult a short
197 |     {{!Uucp.equivalence}introduction}, the
198 |     {{:http://www.unicode.org/reports/tr15/}UAX #15 Unicode
199 |     Normalization Forms} and
200 |     {{:http://www.unicode.org/charts/normalization/} normalization
201 |     charts}. *)
202 | 
203 | type normalization = [`NFD | `NFC | `NFKD | `NFKC ]
204 | (** The type for normalization forms.
205 |     {ul
206 |     {- [`NFD] {{:http://www.unicode.org/glossary/#normalization_form_d}
207 |        normalization form D}, canonical decomposition.}
208 |     {- [`NFC] {{:http://www.unicode.org/glossary/#normalization_form_c}
209 |        normalization form C}, canonical decomposition followed by
210 |        canonical composition.}
211 |     {- [`NFKD] {{:http://www.unicode.org/glossary/#normalization_form_kd}
212 |        normalization form KD}, compatibility decomposition.}
213 |     {- [`NFKC] {{:http://www.unicode.org/glossary/#normalization_form_kc}
214 |        normalization form KC}, compatibility decomposition,
215 |        followed by canonical composition.}} *)
216 | 
217 | val normalized : normalization -> t -> t
218 | (** [normalized nf t] is [t] normalized to [nf]. *)
219 | 
220 | val is_normalized : normalization -> t -> bool
221 | (** [is_normalized nf t] is [true] iff [t] is in normalization form [nf]. *)
222 | 
223 | (** {1:seg Segmentation}
224 | 
225 |     For more information consult the
226 |     {{:http://www.unicode.org/reports/tr29/}UAX #29 Unicode Text
227 |     Segmentation}, the {{:http://www.unicode.org/reports/tr14/}UAX #14
228 |     Unicode Line Breaking Algorithm} and the web based
229 |     {{:http://unicode.org/cldr/utility/breaks.jsp}ICU break utility}. *)
230 | 
231 | type boundary = [ `Grapheme_cluster | `Word | `Sentence | `Line_break ]
232 | (** The type for boundaries.
233 |     {ul
234 |     {- [`Grapheme_cluster] determines
235 |     {{:http://www.unicode.org/glossary/#extended_grapheme_cluster}
236 |     extended grapheme clusters} boundaries according to UAX 29
237 |     (corresponds, for most scripts, to user-perceived characters).}
238 |     {- [`Word] determines word boundaries according to UAX 29.}
239 |     {- [`Sentence] determines sentence boundaries according to UAX 29.}
240 |     {- [`Line_break] determines {{!mandatory}mandatory} line breaks and
241 |        line break opportunities according to UAX 14.}} *)
242 | 
243 | val segments : boundary -> t -> t Pvec.t
244 | (** [segments b t] is are the segments of text [t] delimited by two
245 |     boundaries of type [b]. *)
246 | 
247 | val segment_count : boundary -> t -> int
248 | (** [segment_count b t] is [Pvec.length (segments b t)]. *)
249 | 
250 | (** {2:boundary_pos Boundary positions} *)
251 | 
252 | type pos = int
253 | (** The type for positions. The positions of a vector [v] of length [l]
254 |     range over \[[0];[l]\]. They are the slits before each element and after
255 |     the last one. They are labelled from left to right by increasing number.
256 |     The [i]th index is between positions [i] and [i+1].
257 | {v
258 | positions  0   1   2   3   4    l-1    l
259 |            +---+---+---+---+     +-----+
260 |   indices  | 0 | 1 | 2 | 3 | ... | l-1 |
261 |            +---+---+---+---+     +-----+
262 | v}  *)
263 | 
264 | val boundaries : boundary -> t -> pos Pvec.t
265 | (** [boundaries b t] are the positions of boundaries [b] in
266 |     [t]. *)
267 | 
268 | val boundaries_mandatory : boundary -> t -> (pos * bool) Pvec.t
269 | (** [boundaries_mandatory] is like {!boundaries} but returns
270 |     the mandatory status of a boundary if the kind of boundary
271 |     sports that notion (or always [true] if not). *)
272 | 
273 | (** {1:esc Escaping and unescaping} *)
274 | 
275 | val escaped : t -> t
276 | (** [escaped t] is [t] except characters whose general category is
277 |     [Control], U+0022 or U+005C which are escaped according to OCaml's
278 |     lexical conventions for strings with:
279 |     {ul
280 |     {- Any U+0008 (['\b']) escaped to the sequence <U+005C, U+0062>
281 |        (["\\b"])}
282 |     {- Any U+0009 (['\t']) escaped to the sequence <U+005C, U+0074>
283 |        (["\\t"])}
284 |     {- Any U+000A (['\n']) escaped to the sequence <U+005C, U+006E>
285 |        ["\\n"]}
286 |     {- Any U+000D (['\r']) escaped to the sequence <U+005C, U+0072>
287 |        (["\\r"])}
288 |     {- Any U+0022 (['\"']) escaped to the sequence <U+005C, U+0022>
289 |        (["\\\""])}
290 |     {- Any U+005C (['\\']) escaped to the sequence <U+005C, U+005C>
291 |        (["\\\\"])}
292 |     {- Any other character is escaped by an {e hexadecimal} ["\u{H+}"] escape
293 |        with [H] a capital hexadecimal number.}}
294 | 
295 |     {b Note.} As far as OCaml is concerned [\u{H+}] escapes are only
296 |     supported from 4.06 on. *)
297 | 
298 | val unescaped : t -> (t, int) result
299 | (** [unescaped s] unescapes what {!escaped} did and any other valid
300 |     [\u{H+}] escape. The, at most six, hexadecimal digits [H] of Unicode
301 |     hex escapes can be upper, lower, or mixed case. Any truncated or
302 |     undefined by {!escaped} escape makes the function return
303 |     an [Error idx] with [idx] the start index of the offending escape.
304 | 
305 |     The invariant [unescape (escape t) = Ok t] holds. *)
306 | 
307 | (** {1:codec Decoding and encoding} *)
308 | 
309 | val encoding_guess : string -> [ `UTF_8 | `UTF_16BE | `UTF_16LE ] * bool
310 | (** [encoding_guess s] is the encoding guessed for [s] coupled with
311 |     [true] iff there's an initial
312 |     {{:http://unicode.org/glossary/#byte_order_mark}BOM}. *)
313 | 
314 | (** {2:dec Best-effort decoding}
315 | 
316 |     {b Warning.} The following are a best-effort decodes in which any UTF-X
317 |     decoding error is replaced by at least one replacement character
318 |     {!Uchar.u_rep}. *)
319 | 
320 | val of_utf_8 : ?first:int -> ?last:int -> string -> t
321 | (** [of_utf_8 ~first ~last s] is the Unicode text that results of
322 |     best-effort UTF-8 decoding the bytes of [s] that exist in the
323 |     range \[[first];[last]\]. [first] defaults to [0] and [last] to
324 |     [length s - 1]. *)
325 | 
326 | val of_utf_16le : ?first:int -> ?last:int -> string -> t
327 | (** [of_utf_16le ~first ~last s] is like {!of_utf_8} but decodes
328 |     UTF-16LE. *)
329 | 
330 | val of_utf_16be : ?first:int -> ?last:int -> string -> t
331 | (** [of_utf_16be ~first ~last s] is like {!of_utf_8} but decodes
332 |     UTF-16BE. *)
333 | 
334 | (** {2:decerr Decoding with error handling} *)
335 | 
336 | type decode = (t, t * int * int option) result
337 | (** The type for decode result. This is:
338 |     {ul
339 |     {- [Ok t] if no decoding error occured.}
340 |     {- [Error (t, err, restart)] if a decoding error occured.  [t] is
341 |        the text decoded until the error, [err] the byte index where
342 |        the decode error occured and [restart] a valid byte index where
343 |        a new best-effort decode could be restarted (if any).}} *)
344 | 
345 | val try_of_utf_8 : ?first:int -> ?last:int -> string -> decode
346 | (** [try_of_utf_8] is like {!of_utf_8} except in case of error
347 |     [Error _] is returned as described in {!decode_result}. *)
348 | 
349 | val try_of_utf_16le : ?first:int -> ?last:int -> string -> decode
350 | (** [try_of_utf_16be] is like {!try_of_utf_8} but decodes UTF-16BE. *)
351 | 
352 | val try_of_utf_16le : ?first:int -> ?last:int -> string -> decode
353 | (** [try_of_utf_16be] is like {!try_of_utf_8} but decodes UTF-16BE. *)
354 | 
355 | (** {2:enc Encoding}
356 | 
357 |     {b Warning.} All these functions raise [Invalid_argument] if the
358 |     result cannot fit in the limits of {!Sys.max_string_length}. *)
359 | 
360 | val to_utf_8 : t -> string
361 | (** [to_utf_8 t] is the UTF-8 encoding of [t]. *)
362 | 
363 | val to_utf_16le : t -> string
364 | (** [to_utf_16le t] is the UTF-16LE encoding of [t]. *)
365 | 
366 | val to_utf_16be : t -> string
367 | (** [to_utf_16be t] is the UTF-16BE encoding of [t]. *)
368 | 
369 | val buffer_add_utf_8 : Buffer.t -> t -> unit
370 | (** [buffer_add_utf_8 b t] adds the UTF-8 encoding of [t] to [b]. *)
371 | 
372 | val buffer_add_utf_16le : Buffer.t -> t -> unit
373 | (** [buffer_add_utf_16le b t] adds the UTF-16LE encoding of [t] to [b]. *)
374 | 
375 | val buffer_add_utf_16be : Buffer.t -> t -> unit
376 | (** [buffer_add_utf_16be b t] adds the UTF-16BE encoding of [t] to [b]. *)
377 | 
378 | (** {1:pretty Pretty-printing} *)
379 | 
380 | val pp : Format.formatter -> t -> unit
381 | (** [pp ppf t] prints the UTF-8 encoding of [t] instructing the [ppf]
382 |     to use a length of [1] for each grapheme cluster of [t]. *)
383 | 
384 | val pp_text : Format.formatter -> t -> unit
385 | (** [pp_text ppf t] is like {!pp} except each line break is hinted
386 |     to the formatter, see {!Uuseg_string.pp_utf_8_text} for details. *)
387 | 
388 | val pp_lines : Format.formatter -> t -> unit
389 | (** [pp_lines ppf t] is like {!pp} except only {e mandatory} line breaks
390 |     are hinted to the formatter, see {!Uuseg_string.pp_utf_8_lines} for
391 |     details. *)
392 | 
393 | val pp_uchars : Format.formatter -> t -> unit
394 | (** [dump_uchars ppf t] formats [t] as a sequence of OCaml {!Uchar.t} value
395 |     using only US-ASCII encoded characters according to the Unicode
396 |     {{:http://www.unicode.org/versions/Unicode10.0.0/appA.pdf#G7083}
397 |     notational convention} for code points. *)
398 | 
399 | val pp_toplevel : Format.formatter -> t -> unit
400 | (** [pp_toplevel ppf t] formats [t] using {!escaped} and {!pp} in a manner
401 |     suitable for the toplevel to represent a [Utext.t] value.
402 | 
403 |     {b Warning.} Before OCaml 4.06 the result might not be cut and pastable
404 |     as [\u{H+}] escapes are not supported. *)
405 | 
406 | val pp_toplevel_pvec : Format.formatter -> t Pvec.t -> unit
407 | (** [pp_toplevel_pvec ppf ts] formats [ts] using {!pp_toplevel}. *)
408 | 
409 | (*---------------------------------------------------------------------------
410 |    Copyright (c) 2018 The utext programmers
411 | 
412 |    Permission to use, copy, modify, and/or distribute this software for any
413 |    purpose with or without fee is hereby granted, provided that the above
414 |    copyright notice and this permission notice appear in all copies.
415 | 
416 |    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
417 |    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
418 |    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
419 |    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
420 |    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
421 |    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
422 |    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
423 |   ---------------------------------------------------------------------------*)
424 | 


--------------------------------------------------------------------------------
/src/utext.mllib:
--------------------------------------------------------------------------------
1 | Utext
2 | 


--------------------------------------------------------------------------------
/src/utext_top_init.ml:
--------------------------------------------------------------------------------
 1 | (*---------------------------------------------------------------------------
 2 |    Copyright (c) 2018 The pvec programmers. All rights reserved.
 3 |    Distributed under the ISC license, see terms at the end of the file.
 4 |    %%NAME%% %%VERSION%%
 5 |   ---------------------------------------------------------------------------*)
 6 | 
 7 | #install_printer Utext.pp_toplevel ;;
 8 | #install_printer Utext.pp_toplevel_pvec ;;
 9 | 
10 | (*---------------------------------------------------------------------------
11 |    Copyright (c) 2018 The pvec programmers
12 | 
13 |    Permission to use, copy, modify, and/or distribute this software for any
14 |    purpose with or without fee is hereby granted, provided that the above
15 |    copyright notice and this permission notice appear in all copies.
16 | 
17 |    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 |    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 |    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20 |    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 |    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22 |    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23 |    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 |   ---------------------------------------------------------------------------*)
25 | 


--------------------------------------------------------------------------------
/test/test.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2018 The utext programmers. All rights reserved.
  3 |    Distributed under the ISC license, see terms at the end of the file.
  4 |    %%NAME%% %%VERSION%%
  5 |   ---------------------------------------------------------------------------*)
  6 | 
  7 | let strf = Printf.sprintf
  8 | let log f = Format.printf (f ^^ "@.")
  9 | 
 10 | let u = Utext.str
 11 | let eq = Utext.equal
 12 | let assert_f f t0 t1 = assert (Utext.equal (f (u t0)) (u t1))
 13 | let assert_invalid_arg f x = try f x; assert false with Invalid_argument _ -> ()
 14 | 
 15 | let test_base () =
 16 |   log "Test Utext.{empty,is_empty,str,strf}";
 17 |   assert (Pvec.length Utext.empty = 0);
 18 |   assert Utext.(is_empty empty);
 19 |   assert (Utext.(not @@ is_empty @@ str "bla"));
 20 |   assert_invalid_arg Utext.str "\xFF";
 21 |   assert (eq (Utext.strf "%d" 1) (u "1"));
 22 |   ()
 23 | 
 24 | let test_case () =
 25 |   log "Test Utext.{lowercased,uppercased,capitalized,uncapitalized}";
 26 |   assert_f Utext.lowercased "abc" "abc";
 27 |   assert_f Utext.lowercased "aBc" "abc";
 28 |   assert_f Utext.uppercased "abc" "ABC";
 29 |   assert_f Utext.uppercased "aBc" "ABC";
 30 |   assert_f Utext.capitalized "abc" "Abc";
 31 |   assert_f Utext.capitalized "Abc" "Abc";
 32 |   assert_f Utext.uncapitalized "abc" "abc";
 33 |   assert_f Utext.uncapitalized "Abc" "abc";
 34 |   assert_f Utext.uppercased "ǳ" "Ǳ";
 35 |   assert_f Utext.capitalized "ǳ" "ǲ";
 36 |   ()
 37 | 
 38 | let test_case_insensitive () =
 39 |   log "Test case insensitive equalities";
 40 |   let assert_eq key t0 t1 = assert (eq (key (u t0)) (key (u t1))) in
 41 |   let assert_neq key t0 t1 = assert (not @@ eq (key (u t0)) (key (u t1))) in
 42 |   assert_eq Utext.casefolded "Ὀδυσσεύς" "ὈΔΥΣΣΕΎΣ";
 43 |   assert_neq Utext.casefolded "Å" (*U+00C5*) "Å" (*<U+0041,U+030A>*);
 44 |   assert_eq Utext.canonical_caseless_key "Å" "Å";
 45 |   assert_eq Utext.compatibility_caseless_key "Å" "Å";
 46 |   assert_neq Utext.casefolded "Ⅻ" (*U+216B*) "ⅹⅰⅰ" (*<U+2179,U+2171,U+2171>*);
 47 |   assert_neq Utext.canonical_caseless_key "Ⅻ" "ⅹⅰⅰ";
 48 |   assert_eq Utext.compatibility_caseless_key "Ⅻ" "ⅹⅰⅰ";
 49 |   assert_neq Utext.casefolded "℅" (*U+2105*) "C/O";
 50 |   assert_neq Utext.canonical_caseless_key "℅" (*U+2105*) "C/O";
 51 |   assert_eq Utext.compatibility_caseless_key "℅" (*U+2105*) "C/O";
 52 |   assert_neq Utext.casefolded "㎒" "MHZ";
 53 |   assert_neq Utext.canonical_caseless_key "㎒" "MHZ";
 54 |   assert_eq Utext.compatibility_caseless_key "㎒" "MHZ";
 55 |   ()
 56 | 
 57 | let test_identifiers () =
 58 |   log "Test Utext.is_identifier";
 59 |   assert (not @@ Utext.is_identifier (u "32x"));
 60 |   assert (not @@ Utext.is_identifier (u "\xCC\x81Ela"));
 61 |   assert (Utext.is_identifier (u "Éla"));
 62 |   assert (not @@ Utext.is_identifier (u "_Éla"));
 63 |   assert (not @@ Utext.is_identifier (u "\xF0\x9F\x90\xAB"));
 64 |   assert (not @@ Utext.is_identifier (u "a\xF0\x9F\x90\xAB"));
 65 |   ()
 66 | 
 67 | let test_lines () =
 68 |   log "Test Utext.lines";
 69 |   let assert_ls ?drop_empty ?newline src segs =
 70 |     assert (Pvec.equal ~eq
 71 |               (Utext.lines ?drop_empty ?newline (u src))
 72 |               (Pvec.of_list (List.map u segs)))
 73 |   in
 74 |   let drop_empty = true in
 75 |   assert_ls "" [""];
 76 |   assert_ls ~drop_empty "" [];
 77 |   assert_ls "\n" ["";""];
 78 |   assert_ls ~drop_empty "\n" [];
 79 |   assert_ls "fst\r\nsnd\ntrd" ["fst";"snd";"trd"];
 80 |   assert_ls "fst\r\nsnd\ntrd\n\r" ["fst";"snd";"trd";"";""];
 81 |   assert_ls ~drop_empty "fst\r\nsnd\ntrd\n\r" ["fst";"snd";"trd"];
 82 |   assert_ls "fst\r\nsnd\ntrd\r\n" ["fst";"snd";"trd";""];
 83 |   assert_ls ~drop_empty "fst\r\nsnd\ntrd\r\n" ["fst";"snd";"trd"];
 84 |   assert_ls "\n\rabc" [""; ""; "abc"];
 85 |   assert_ls ~drop_empty "\n\rabc" ["abc"];
 86 |   assert_ls "\r\nabc" [""; "abc"];
 87 |   assert_ls ~drop_empty "\n\rabc" ["abc"];
 88 |   ()
 89 | 
 90 | let test_paragraphs () =
 91 |   log "Test Utext.paragraphs";
 92 |   let assert_ps ?drop_empty src segs =
 93 |     assert (Pvec.equal ~eq
 94 |               (Utext.paragraphs ?drop_empty (u src))
 95 |               (Pvec.of_list (List.map u segs)))
 96 |   in
 97 |   let drop_empty = true in
 98 |   assert_ps "" [""];
 99 |   assert_ps ~drop_empty "" [];
100 |   assert_ps "\n\n" ["";""];
101 |   assert_ps "\n" ["\n"];
102 |   assert_ps "\r\n" ["\r\n"];
103 |   assert_ps "\np1\n" ["\np1\n"];
104 |   assert_ps "\np1\n\xE2\x80\xA9bla" ["\np1\n"; "bla"];
105 |   assert_ps "\np1\n\xE2\x80\xA9p2" ["\np1\n"; "p2"];
106 |   assert_ps "\np1\r\n\xE2\x80\xA9p2" ["\np1\r\n"; "p2"];
107 |   assert_ps "p1\r\n\r\np2" ["p1"; "p2"];
108 |   assert_ps "p1\r\n\np2" ["p1"; "p2"];
109 |   assert_ps "p1\n\np2\n\np3\n\n" ["p1"; "p2"; "p3"; ""];
110 |   assert_ps ~drop_empty "p1\n\np2\n\np3\n\n" ["p1"; "p2"; "p3"];
111 |   ()
112 | 
113 | let test_normalization () =
114 |   log "Test Utext.{normalized,is_normalized}";
115 |   (* These tests were taken from Uunf. *)
116 |   let test src nf dst =
117 |     let src = Pvec.of_list (List.map Uchar.of_int src) in
118 |     let dst = Pvec.of_list (List.map Uchar.of_int dst) in
119 |     assert (eq (Utext.normalized nf src) dst);
120 |     assert (Utext.is_normalized nf dst);
121 |   in
122 |   test [0x1E69] `NFD [0x0073; 0x0323; 0x0307];
123 |   test [0x1E69] `NFC [0x1E69];
124 |   test [0x1E0B; 0x0323] `NFD [0x0064; 0x0323; 0x0307];
125 |   test [0x1E0B; 0x0323] `NFC [0x1E0D; 0x0307];
126 |   test [0xFB01] `NFD [0xFB01];
127 |   test [0xFB01] `NFC [0xFB01];
128 |   test [0xFB01] `NFKD [0x0066; 0x0069];
129 |   test [0xFB01] `NFKC [0x0066; 0x0069];
130 |   test [0x0032; 0x2075] `NFD [0x0032; 0x2075];
131 |   test [0x0032; 0x2075] `NFC [0x0032; 0x2075];
132 |   test [0x0032; 0x2075] `NFKD [0x0032; 0x0035];
133 |   test [0x0032; 0x2075] `NFKC [0x0032; 0x0035];
134 |   test [0x1E9B; 0x0323] `NFD [0x017F; 0x0323; 0x307];
135 |   test [0x1E9B; 0x0323] `NFC [0x1E9B; 0x0323; ];
136 |   test [0x1E9B; 0x0323] `NFKD [0x0073; 0x0323; 0x0307];
137 |   test [0x1E9B; 0x0323] `NFKC [0x1E69];
138 |   test [0x0041; 0x007A; 0x0335; 0x0327; 0x0324; 0x0301; 0x0041] `NFC
139 |        [0x0041; 0x017A; 0x0335; 0x0327; 0x0324; 0x0041];
140 |   test [0x01C6; 0x032D] `NFKC [0x0064; 0x017E; 0x032D];
141 |   test [0xFF80; 0x1FD3; 0xFF9E; 0x1FD3;] `NFKC [0x30BF; 0x0390; 0x3099; 0x0390];
142 |   test [0xC100; 0x20D2; 0x11C1; 0x11C1] `NFC [0xC100; 0x20D2; 0x11C1; 0x11C1];
143 |   ()
144 | 
145 | let test_segmentation () =
146 |   log "Text Utext.{segments,segment_count,boundaries,boundaries_mandatory}";
147 |   let assert_vec ~eq v els = assert (Pvec.equal ~eq v (Pvec.of_list els)) in
148 |   let assert_segs v els = assert_vec ~eq v (List.map u els) in
149 |   let abla = u "Åbla" in
150 |   let heyho = u "hey ho let's go" in
151 |   assert_segs (Utext.segments `Grapheme_cluster Utext.empty) [];
152 |   assert (Utext.segment_count `Grapheme_cluster Utext.empty = 0);
153 |   assert_segs (Utext.segments `Word @@ u "") [];
154 |   assert (Pvec.length abla = 5);
155 |   assert_segs (Utext.segments `Grapheme_cluster abla) ["Å"; "b"; "l"; "a"];
156 |   assert (Utext.segment_count `Grapheme_cluster abla = 4);
157 |   assert_segs (Utext.segments `Word heyho)
158 |     ["hey"; " "; "ho"; " "; "let's"; " "; "go"];
159 |   assert_segs (Utext.segments `Word heyho)
160 |     ["hey"; " "; "ho"; " "; "let's"; " "; "go"];
161 |   assert_vec ~eq:(=) (Utext.boundaries `Grapheme_cluster abla) [0;2;3;4;5];
162 |   assert_vec ~eq:(=) (Utext.boundaries_mandatory `Line_break (u "hey ho\nno"))
163 |     [4, false; 7, true; 9, true];
164 |   ()
165 | 
166 | let test_escapes () =
167 |   log "Test Utext.{escape,unescape}";
168 |   let unescaped_ok u = match Utext.unescaped u with
169 |   | Ok u -> u | Error _ -> assert false
170 |   in
171 |   let trip unesc esc =
172 |     assert_f Utext.escaped unesc esc;
173 |     assert_f unescaped_ok esc unesc;
174 |     ()
175 |   in
176 |   trip "\xC2\x98\n" "\\u{0098}\\n";
177 |   trip "\xC2\x98\n" "\\u{0098}\\n";
178 |   trip "a\x00\bla" "a\\u{0000}\\bla";
179 |   trip "\b\t\n\r\"\\" "\\b\\t\\n\\r\\\"\\\\";
180 |   assert_f unescaped_ok "\\u{1F42B}a" "\xF0\x9F\x90\xABa";
181 |   assert (Utext.unescaped (u "a\nbc\\") = Error 4);
182 |   assert (Utext.unescaped (u "a\\nbc\\") = Error 5);
183 |   assert (Utext.unescaped (u "a\\n\\u{0}bc\\u") = Error 11);
184 |   assert (Utext.unescaped (u "a\\n\\u{0}bc\\u{") = Error 11);
185 |   assert (Utext.unescaped (u "a\\n\\u{0}bc\\u{}") = Error 11);
186 |   assert (Utext.unescaped (u "ab\\{}}") = Error 2);
187 |   assert (Utext.unescaped (u "ab\\u{}}") = Error 2);
188 |   ()
189 | 
190 | let test_codec () =
191 |   log "Test codec";
192 |   let trip ?(first = 0) ?last s =
193 |     let last = match last with None -> String.length s - 1 | Some l -> l in
194 |     assert (Utext.(to_utf_8 @@ of_utf_8 ~first ~last s) =
195 |             String.sub s first (last - first + 1))
196 |   in
197 |   trip ~first:4 ~last:6 "hop hap mop";
198 |   trip "métallique";
199 |   trip "矢量";
200 |   ()
201 | 
202 | 
203 | let test () =
204 |   try
205 |     Printexc.record_backtrace true;
206 |     test_base ();
207 |     test_case ();
208 |     test_case_insensitive ();
209 |     test_identifiers ();
210 |     test_lines ();
211 |     test_paragraphs ();
212 |     test_normalization ();
213 |     test_segmentation ();
214 |     test_escapes ();
215 |     test_codec ();
216 |   with
217 |   | e ->
218 |       let bt = Printexc.get_raw_backtrace () in
219 |       log "%s\n%s\n[FAIL] A test failed!"
220 |         (Printexc.to_string e) (Printexc.raw_backtrace_to_string bt);
221 |       exit 1
222 | 
223 | let () = test ()
224 | 
225 | (*---------------------------------------------------------------------------
226 |    Copyright (c) 2018 The utext programmers
227 | 
228 |    Permission to use, copy, modify, and/or distribute this software for any
229 |    purpose with or without fee is hereby granted, provided that the above
230 |    copyright notice and this permission notice appear in all copies.
231 | 
232 |    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
233 |    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
234 |    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
235 |    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
236 |    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
237 |    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
238 |    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
239 |   ---------------------------------------------------------------------------*)
240 | 


--------------------------------------------------------------------------------